首先进入中彩网http://www.zhcw.com/
点击 双色球
链接。如下图:
再次点击 往期回顾
链接。如下图:
而后将鼠标移入表格处,右击选择查看框架源码
。如下图:
查看其框架源码可知,其获取的url为:http://kaijiang.zhcw.com/zhcw/html/ssq/list_1.html
好了,那接下来,笔者就使用requests去请求url,而后使用BeautifulSoup去解析html。前提是已安装了BeautifulSoup库哦!完整代码如下:
import requests
from bs4 import BeautifulSoup
import time
def getHTML(url):
    """Fetch *url* with an HTTP GET and return the decoded response body.

    The target site serves GB2312/GBK pages, so the body is decoded with
    the charset requests detects from the content (``apparent_encoding``),
    dropping any undecodable bytes.

    Returns None on any request failure (network error, timeout, ...),
    matching the original best-effort behavior where callers treat a
    missing page as "unavailable".
    """
    try:
        # A timeout keeps the crawler from hanging forever on a dead host.
        res = requests.get(url, timeout=10)
        return res.content.decode(res.apparent_encoding, 'ignore')
    except requests.RequestException:
        # Narrowed from a bare `except Exception: pass` that silently
        # swallowed every error; the None return is now explicit.
        return None
def getPages(html):
    """Return the total number of result pages, read from the pager text."""
    document = BeautifulSoup(html, 'html.parser')
    # The second <p> element is the pager; its <strong> child holds the
    # total page count as text.
    pager = document.find_all('p')[1]
    return int(pager.find('strong').get_text())
def parseData(html):
    """Parse one result-list page into a list of per-draw records.

    Each record is a list of 9 strings taken positionally from the row's
    <td> cells: columns 0-3 verbatim (newlines in column 2 flattened to
    spaces), column 4 split on the newline into two values, column 5
    verbatim, and the two link hrefs from column 6.
    NOTE(review): column meanings are inferred from position only —
    confirm against the live page layout.
    """
    soup = BeautifulSoup(html, 'html.parser')
    rows = []
    # Skip the two header rows and the trailing pager row.
    for tr in soup.findAll('tr')[2:-1]:
        tds = tr.findAll('td')
        # Column 4 holds two values separated by a newline; spaces are
        # stripped before splitting.
        parts = tds[4].get_text().replace(" ", "").split("\n")
        links = tds[6].findAll('a')
        rows.append([
            tds[0].get_text(),
            tds[1].get_text(),
            str(tds[2].get_text()).replace("\n", " "),
            tds[3].get_text(),
            parts[0],
            parts[1],
            tds[5].get_text(),
            links[0]['href'],
            links[1]['href'],
        ])
    return rows
def getUrl(n):
    """Return the address of the n-th history-list page."""
    prefix = 'http://kaijiang.zhcw.com/zhcw/html/ssq/list_'
    suffix = '.html'
    return prefix + str(n) + suffix
def main():
    """Crawl the double-color-ball draw history and save the rows to a file."""
    url = 'http://kaijiang.zhcw.com/zhcw/html/ssq/list_1.html'
    html = getHTML(url)
    # Total number of pages, read from the pager of page 1.
    pageNo = getPages(html)
    # Python 3 open() takes a str path directly — the original encoded the
    # name to bytes (u'...'.encode('UTF-8')), which is unnecessary and
    # produces a mojibake filename on some platforms.
    fileName = u'双色球.txt'
    # Open the file ONCE, outside the loop: the original reopened it with
    # mode 'w' for every page, so each page overwrote the previous one and
    # only the last page's data survived.  encoding='utf-8' makes the
    # Chinese content portable across platforms.
    with open(fileName, 'w', encoding='utf-8') as fp:
        # Only page 1 for now, as in the original; switch to
        # range(1, pageNo + 1) to crawl every page.
        for page in range(1, 2):
            print("正在爬取第%d页数据..." % page)
            html = getHTML(getUrl(page))
            if html is None:
                # getHTML returns None on a failed fetch — skip the page
                # instead of crashing inside parseData.
                continue
            for item in parseData(html):
                fp.write('%s\n' % (item,))
            print("第%d页数据爬取完成!" % page)
            # Be polite to the server between pages.
            time.sleep(3)
    print("爬取完成!")
if __name__ == '__main__':
    main()
格式后续再处理吧!