# 版权声明:本文为博主原创文章,遵循 CC 4.0 BY-SA 版权协议,转载请附上原文出处链接和本声明。
# 本文链接:https://blog.csdn.net/weixin_40313634/article/details/90141841
from urllib.parse import urlencode
import requests
import json
import os
from pyquery import PyQuery as pq
# Endpoint of the m.weibo.cn container API; get_page appends the encoded
# query string directly after the trailing '?'.
base_url = 'https://m.weibo.cn/api/container/getIndex?'

# HTTP headers passed to requests.get in get_page: a desktop Chrome
# User-Agent string plus an x-requested-with marker.
headers = {
    'User-Agent': (
        'Mozilla/5.0 (Windows NT 10.0; Win64; x64) '
        'AppleWebKit/537.36 (KHTML, like Gecko) '
        'Chrome/74.0.3729.131 Safari/537.36'
    ),
    'x-requested-with': 'XMLHttpRequest',
}
# Fetch a single page
def get_page(page):
    """Fetch one page of the hard-coded user's container listing as JSON.

    Args:
        page: Page number sent as the ``page`` query parameter.

    Returns:
        The decoded JSON body when the server answers with HTTP 200.
        ``None`` for any other status code, when the request itself fails
        (connection error, timeout, ...), or when the body is not valid JSON.
    """
    parms = {
        'uid': '2830678474',
        'luicode': '10000011',
        'lfid': '1076032830678474',
        'type': 'uid',
        'value': '2830678474',
        'containerid': '1076032830678474',
        'page': page,
    }
    url = base_url + urlencode(parms)
    try:
        # Without a timeout a stalled connection would block the crawl forever.
        response = requests.get(url, headers=headers, timeout=10)
        if response.status_code == 200:
            return response.json()
    except (requests.RequestException, ValueError) as e:
        # RequestException is the base class of ConnectionError and Timeout;
        # ValueError covers a body that cannot be decoded as JSON.
        print('Error', e.args)
    return None
# Parse the page content
def parse_page(json):
    """Extract the plain text of every post found in one page of API data.

    Args:
        json: Decoded response dict as returned by ``get_page``, or a falsy
            value when the page could not be fetched. The name shadows the
            ``json`` module inside this function; it is kept unchanged so
            existing keyword callers keep working.

    Returns:
        A list with one string per card that carries an ``mblog`` entry with
        non-empty ``text``: the HTML converted to plain text by PyQuery,
        followed by a newline and a tab as separator. The list is empty when
        there is nothing to parse.
    """
    if not json:
        return []
    # A response may lack 'data' or 'cards' (or hold None there); treat that
    # as an empty page instead of failing on a chained .get().
    cards = (json.get('data') or {}).get('cards') or []
    weibo = []
    for item in cards:
        mblog = item.get('mblog')
        if mblog is None:
            # Cards without an 'mblog' entry carry no post.
            continue
        html = mblog.get('text')
        if not html:
            # Nothing to extract from a post without text.
            continue
        weibo.append(pq(html).text() + '\n\t')
    return weibo
# Get the total number of Weibo pages
def sum_page():
    """Work out how many pages to crawl from the metadata of page 1.

    Returns:
        ``int(total / 10) + 1``, where ``total`` is read from
        ``data -> cardlistInfo -> total`` of the first page. Falls back to
        100 when page 1 could not be fetched or does not carry a usable
        numeric total.
    """
    fallback = 100
    first_page = get_page(1)
    if not first_page:
        return fallback
    # Any level of the nested lookup may be missing; fall back rather than
    # raise on a chained .get().
    card_info = (first_page.get('data') or {}).get('cardlistInfo') or {}
    try:
        total_item = int(card_info.get('total'))
    except (TypeError, ValueError):
        return fallback
    # presumably 10 posts per page (inferred from the divisor) - TODO confirm;
    # the +1 accounts for a partially filled last page.
    return int(total_item / 10) + 1
if __name__ == '__main__':
    # Crawl every page: append the post texts to weibo.txt and dump the raw
    # JSON of each page to ./tmp/<page>.json.
    page_count = sum_page()
    # Create the dump directory up front so open() cannot fail because it is
    # missing.
    dump_dir = os.path.join(os.getcwd(), 'tmp')
    os.makedirs(dump_dir, exist_ok=True)
    # sum_page treats page 1 as the first page, so iterate 1..page_count
    # instead of starting at 0.
    for page in range(1, page_count + 1):
        data = get_page(page)
        weibo = parse_page(data)
        # Append the parsed post texts to the txt file
        with open('weibo.txt', 'a', encoding='utf-8') as f:
            f.writelines(weibo)
        # Save the raw data of each page
        dump_path = os.path.join(dump_dir, str(page) + '.json')
        with open(dump_path, 'w', encoding='utf-8') as f:
            f.write(json.dumps(data, indent=2, ensure_ascii=False))