目标地址:xxxx
技术选型:python
软件包管理工具:pipenv
编辑器:jupyter
分析目标地址:
gplId表示项目ID,可变参数
结果收集方式:
数据库
代码实现
导入相关模块
from urllib.parse import urlencode
from bs4 import BeautifulSoup
import pandas as pd
import requests
import os,sys
# 网页提取函数
def get_one_page(i):
    """Fetch one listing page and return its HTML text.

    Args:
        i: Page index supplied by the caller.
            NOTE(review): ``i`` is not used when building the query string;
            the parameter names and values below are placeholders. Confirm
            which query parameter should carry the page index.

    Returns:
        The response body decoded as UTF-8 when the server answers with
        HTTP 200; ``None`` for any other status or when the request fails.
    """
    headers = {
        'User-Agent': 'Mozilla/5.0 (Windows NT 6.1; WOW64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/66.0.3359.181 Safari/537.36'
    }
    paras = {
        'a': 'x',
        'b': 'x',
        'c': 'x',
        'd': 'x',
        'e': 'x'
    }
    url = 'xx?' + urlencode(paras)
    try:
        # A timeout keeps one stalled connection from hanging the whole crawl.
        response = requests.get(url, headers=headers, timeout=30)
    except requests.RequestException:
        # Referenced through the already-imported ``requests`` module; the
        # bare name ``RequestException`` was never imported in this file.
        print('爬取失败')
        return None
    if response.status_code != 200:
        return None
    # Force UTF-8 instead of relying on the encoding guessed from headers.
    response.encoding = 'utf-8'
    return response.text
# Parse the page with Beautiful Soup, then extract the table
def parse_one_page(html):
    """Extract the first ``.ttable`` element of a page as a DataFrame.

    Args:
        html: HTML text of the page, or ``None``/empty when the fetch failed.

    Returns:
        A DataFrame with the renamed columns and without the ``option``,
        ``image_count`` and ``image`` columns. An empty DataFrame is returned
        when ``html`` is empty or the page has no ``.ttable`` element, so
        callers can test ``df.empty`` uniformly.

    Raises:
        KeyError: If one of the columns to be dropped is not present.
    """
    from io import StringIO

    if not html:
        return pd.DataFrame()

    # Name the parser explicitly so the result does not depend on which
    # optional parser libraries happen to be installed.
    soup = BeautifulSoup(html, 'html.parser')
    matches = soup.select('.ttable')
    if not matches:
        return pd.DataFrame()

    # read_html expects a file-like object; passing literal HTML text is
    # deprecated in recent pandas releases. header=1 takes the second row
    # of the table as the column names.
    tbl = pd.read_html(StringIO(matches[0].prettify()), header=1)[0]

    # NOTE(review): the 'xx' keys look like redacted placeholders for the
    # real column headers. As written, duplicate keys collapse and only the
    # last mapping survives -- restore the actual header text before use.
    tbl.rename(columns={'序号': 'serial_number',
                        'xx': 'option',
                        'xx': 'fanwei',
                        'xx': 'company_name',
                        'xx': 'shigong_date',
                        'xx': 'order_no',
                        'xx': 'miaomu_name',
                        'xx': 'type',
                        'xx': 'spec',
                        'xx': 'count',
                        'xx': 'dead_count',
                        'xx': 'zhongzhi_midu',
                        'xx': 'mianji',
                        'xx': 'unit',
                        'xx': 'danjia',
                        'xx': 'xiaoji',
                        'xx': 'zhongzhididian',
                        'xx': 'chuhuonongchang',
                        'xx': 'remark',
                        'xx': 'image_count',
                        'xx': 'image'
                        }, inplace=True)
    # These columns have no counterpart in the target table.
    del tbl['option']
    del tbl['image_count']
    del tbl['image']
    return tbl
# 创建表结构
import pymysql
# 创建表结构
def generate_mysql():
    """Create the ``miaomu`` table in ``miao_mu_data`` if it does not exist.

    Opens a PyMySQL connection, runs a single ``CREATE TABLE IF NOT EXISTS``
    statement and closes the connection again, also when the statement fails.
    """
    conn = pymysql.connect(
        host='xxxx',
        user='root',
        password='xxxx',
        port=3307,
        charset='utf8',
        db='miao_mu_data')
    sql = ('CREATE TABLE IF NOT EXISTS miaomu ('
           'serial_number INT(20) NOT NULL AUTO_INCREMENT,'
           'fanwei varchar(50) ,'
           'company_name VARCHAR(50) ,'
           'shigong_date varchar(50),'
           'order_no varchar(50),'
           'miaomu_name varchar(50),'
           'type varchar(50),'
           'spec varchar(50),'
           'count varchar(50),'
           'dead_count varchar(50),'
           'zhongzhi_midu varchar(50),'
           'mianji varchar(50),'
           'unit varchar(50),'
           'danjia varchar(50),'
           'xiaoji varchar(50),'
           'zhongzhididian varchar(50),'
           'chuhuonongchang varchar(50),'
           'remark varchar(50),'
           'PRIMARY KEY (serial_number))')
    try:
        # The cursor context manager closes the cursor on exit.
        with conn.cursor() as cursor:
            cursor.execute(sql)
    finally:
        # Release the connection even if the statement raised.
        conn.close()
# 存储到数据库
from sqlalchemy import create_engine
# 存储到数据库
def write_to_sql(tbl, db = 'miao_mu_data'):
    """Append a DataFrame to the ``miaomu`` table.

    Args:
        tbl: DataFrame whose rows are appended; its index is not written.
        db: Name of the database inserted into the connection URL.

    Errors raised while writing are printed rather than propagated, so one
    failed page does not stop the caller.
    """
    engine = create_engine('mysql+pymysql://root:密码@ip:3307/{0}?charset=utf8'.format(db))
    try:
        tbl.to_sql('miaomu', con=engine, if_exists='append', index=False)
    except Exception as e:
        # Best-effort write: report the problem and carry on.
        print(e)
    finally:
        # A new engine (with its own connection pool) is built on every
        # call; dispose of it so pooled connections do not pile up.
        engine.dispose()
# 主函数
import time
# 主函数
def main(page):
    """Crawl pages ``1`` to ``page - 1`` and store every non-empty table.

    Args:
        page: Exclusive upper bound of the page indices to fetch.

    Prints each page index while running and the total elapsed time at the
    end.
    """
    start_time = time.time()
    for i in range(1, page):
        print(i)
        # Request the current index; passing ``page`` here would ask for the
        # same value on every iteration.
        html = get_one_page(i)
        # get_one_page returns None on a failed or non-200 request.
        if html is not None:
            df = parse_one_page(html)
            if not df.empty:
                write_to_sql(df)
        # Pause between requests to avoid hammering the server.
        time.sleep(3)
    endtime = time.time() - start_time
    print('程序运行了%.2f秒' %endtime)
# Script entry point: start the crawl only when the file is run directly.
if __name__ == "__main__":
    main(100000)
# 生成表结构
# generate_mysql()