采集和解析糗事百科网页上的内容。
该网页分页显示数据,URL 格式是
https://www.qiushibaike.com/8hr/page/2/
其中最后的数字表示页码。
需求是爬取糗事百科网站前20页的内容,包括每个帖子里的用户姓名、段子内容(包括正文文字和图片)、点赞数和评论数,并将结果保存到 JSON 文件中。
一、单线程实现
使用单线程依次获取网页内容,步骤依次是:
构建网址——>访问网页并获取源代码——>解析源代码——>转成 JSON 格式——>存储到本地文件。
import json
import requests
from lxml import etree
# Browser-like request headers so the site does not reject us as a bot.
headers = {"User-Agent": "Mozilla/5.0 (Windows NT 10.0; WOW64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/52.0.2743.116 Safari/537.36", "Accept-Language": 'zh-CN,zh;q=0.8'}
# Open the output file once in append mode. An explicit UTF-8 encoding is
# required: parse_html() writes JSON with ensure_ascii=False, so Chinese text
# would crash or be corrupted under a non-UTF-8 platform default encoding.
local_file = open("duanzi.json", "a", encoding="utf-8")
def parse_html(html):
    """Parse one qiushibaike page and append each post to ``local_file``.

    For every post found, writes one JSON line containing the username,
    title, image URLs, like count and comment count.

    Args:
        html: raw HTML source of one listing page.
    """
    text = etree.HTML(html)
    # Bug fix: the original predicate used ``recommend-article(@id, ...)``,
    # which is not an XPath function, so the query matched nothing.
    # ``contains()`` is the intended standard XPath string function.
    node_list = text.xpath("//div[contains(@id, 'qiushi_tag')]")
    for node in node_list:
        try:
            # Bug fix: ``xpath()`` returns a *list* of elements; the original
            # chained ``.xpath``/``.text`` directly on those lists, which
            # raises AttributeError for every post, so the bare ``except``
            # silently dropped all data. Index [0] at each step instead.
            li = node.xpath("./li")[0]
            outer_div = li.xpath("./div")[0]
            inner_div = outer_div.xpath("./div")[0]
            stats_div = inner_div.xpath("./div")[0]
            username = inner_div.xpath(".//span")[0].text
            # All image src attributes within the post (may be empty).
            image = li.xpath(".//@src")
            title = outer_div.xpath("./a")[0].text
            like = stats_div.xpath(".//span")[0].text
            # span[3] presumably holds the comment count in this layout —
            # TODO confirm against the live page structure.
            comments = stats_div.xpath(".//span")[3].text
            items = {
                "username": username,
                "title": title,
                "image": image,
                "zan": like,
                "comments": comments
            }
            local_file.write(json.dumps(items, ensure_ascii=False) + "\n")
        except (IndexError, AttributeError):
            # Best-effort: skip posts (e.g. ads) that don't match the
            # expected layout instead of aborting the whole page.
            continue
def main():
    """Fetch and parse the first 20 listing pages of qiushibaike."""
    for page in range(1, 21):
        # Bug fix: the site is served over HTTPS (the original "http://"
        # relied on a redirect at best).
        url = f"https://www.qiushibaike.com/8hr/page/{page}/"
        # A timeout prevents the whole crawl from hanging forever on one
        # stalled connection.
        html = requests.get(url, headers=headers, timeout=10).text
        parse_html(html)


if __name__ == "__main__":
    main()