# Copyright notice (版权声明): Copyright © https://cloud.tencent.com/developer/article/1477120
#author: Zheng
#time: 2018/7/13 20:17
# 爬取果壳问答
import re
import requests
import json
# Listing URL of Guokr's highlighted questions; ``{}`` receives the page number.
temp_url = 'https://www.guokr.com/ask/highlight/?page={}'
# Browser-like User-Agent sent with every request.
headers = {'User-Agent': 'Mozilla/5.0 (Macintosh; Intel Mac OS X 10_14_0) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/67.0.3396.99 Safari/537.36'}

# One question heading: group 1 is the link target, group 2 the title text.
# The groups are non-greedy so that several headings sharing a single line
# are matched separately instead of being swallowed into one match.
QUESTION_LINK_RE = re.compile(
    r'<h2><a target="_blank" href="(.*?)">(.*?)</a></h2>'
)


def extract_questions(html):
    """Return a ``{link: title}`` dict for every question heading in *html*.

    A heading is an ``<h2>`` wrapping a single ``<a target="_blank">`` link.
    If the same link occurs more than once, the last title wins. Text with
    no matching heading yields an empty dict.
    """
    return {link: title for link, title in QUESTION_LINK_RE.findall(html)}


def main(first_page=1, last_page=100, output_path='info.txt'):
    """Fetch the listing pages and append each page's questions to a file.

    For every page from *first_page* to *last_page* (inclusive) the page is
    downloaded, its question links are extracted, and the resulting dict is
    appended to *output_path* as indented JSON followed by a blank line.
    A page that cannot be fetched is reported on stdout and skipped.
    """
    for page in range(first_page, last_page + 1):
        url = temp_url.format(page)
        try:
            # ``headers`` must be passed by keyword: the second positional
            # parameter of requests.get is ``params`` (the query string).
            resp = requests.get(url, headers=headers, timeout=10)
            resp.raise_for_status()
        except requests.RequestException as exc:
            print('page {} skipped: {}'.format(page, exc))
            continue

        info = extract_questions(resp.content.decode('utf-8'))

        # Explicit UTF-8: the titles are written unescaped (ensure_ascii=False),
        # so the platform default encoding must not be relied upon.
        with open(output_path, 'a', encoding='utf-8') as f:
            json.dump(info, f, ensure_ascii=False, indent=2)
            f.write('\n\n')

    print('爬取完成...')


if __name__ == '__main__':
    main()