版权声明:本文为博主原创文章,遵循 CC 4.0 BY-SA 版权协议,转载请附上原文出处链接和本声明。
本文链接:https://blog.csdn.net/weixin_40313634/article/details/103099547
'''
Scrape book information from Dangdang's five-star book ranking.
'''
Max_Page = 3 # scrape the first three pages of the ranking
import requests
from pyquery import PyQuery as pq
import json
def requests_dangdang(url, headers=None, retry=3, timeout=10):
    """Fetch ``url`` with an HTTP GET and return the response body as text.

    The request is attempted up to ``retry`` times. An attempt succeeds only
    when the server answers with status 200; any other status, or a
    ``requests.RequestException`` (which is printed), moves on to the next
    attempt.

    Args:
        url: Address to fetch.
        headers: Optional mapping of HTTP headers passed to ``requests.get``.
        retry: Maximum number of attempts.
        timeout: Seconds passed to ``requests.get`` as its ``timeout``.
            Without it a stalled server would block the call indefinitely
            and the retry loop would never get a chance to run.

    Returns:
        The decoded response text, or ``None`` if no attempt returned 200.
    """
    for _ in range(retry):
        try:
            response = requests.get(url=url, headers=headers, timeout=timeout)
        except requests.RequestException as e:
            print(f'Requests Error:\n\t{url}\n\t{e.args}')
            continue
        if response.status_code == 200:
            # Decode with the encoding detected from the body rather than
            # the one guessed from the HTTP headers.
            response.encoding = response.apparent_encoding
            return response.text
    return None
def parse_html(text):
    """Yield one dict per book entry found in a ranking page.

    Every ``li`` inside the ``ul`` whose class attribute is exactly
    ``"bang_list clearfix bang_list_mode"`` is treated as one book. The
    yielded dict uses these keys:

    - '排名' (rank): text of the ``list_num*`` element, trailing '.' removed
    - '书名' (title): text of the link inside ``.name``
    - '推荐指数' (recommendation index): text of ``.star .tuijian``
    - '作者' (author): ``title`` attribute of the ``.publisher_info`` link
    - '五星评分次数' (five-star rating count): text of ``.biaosheng span``
    - '价格' (price): text of ``.price_n`` inside the class-less ``p`` of ``.price``
    - '图书链接' (book link): ``href`` attribute of the link inside ``.pic``

    Args:
        text: HTML source of a ranking page.
    """
    entries = pq(text)('ul[class="bang_list clearfix bang_list_mode"] li')
    for entry in entries.items():
        rank = entry('[class^="list_num"]').text().rstrip('.')
        title = entry('.name a').text()
        recommendation = entry('.star .tuijian').text()
        author = entry('.publisher_info a').attr.title
        five_star_count = entry('.biaosheng span').text()
        price = entry('.price p:not([class]) .price_n').text()
        link = entry('.pic a').attr.href
        yield {
            '排名': rank,
            '书名': title,
            '推荐指数': recommendation,
            '作者': author,
            '五星评分次数': five_star_count,
            '价格': price,
            '图书链接': link,
        }
def save_book_info(book, path='book.json'):
    """Append one book record to a file as pretty-printed JSON.

    The record is serialized with a 4-space indent and non-ASCII characters
    kept as-is, then appended to ``path`` (UTF-8). Nothing is written
    between records, so after several calls the file holds JSON objects
    back to back rather than a single JSON document.

    Args:
        book: JSON-serializable record to store.
        path: Destination file; defaults to ``'book.json'`` in the current
            working directory.
    """
    json_str = json.dumps(book, ensure_ascii=False, indent=4)
    with open(path, 'a', encoding='utf-8') as f:
        f.write(json_str)
def run(max_page=Max_Page):
    """Scrape ranking pages 1 through ``max_page`` and save every book found.

    Each page is downloaded with ``requests_dangdang``; a page whose
    download returns ``None`` is skipped. Every record yielded by
    ``parse_html`` for a page is passed to ``save_book_info``.

    Args:
        max_page: Number of the last ranking page to fetch (inclusive).
    """
    for page in range(1, max_page + 1):
        url = f'http://bang.dangdang.com/books/fivestars/01.00.00.00.00.00-recent30-0-0-1-{page}'
        text = requests_dangdang(url)
        if text is None:
            # Download failed on every attempt; move on to the next page.
            continue
        for book in parse_html(text):
            save_book_info(book)
# Start the scraper with its default page count when run as a script.
if __name__ == '__main__':
    run()
执行结果:
{
"排名": "1",
"书名": "谜案鉴赏",
"推荐指数": "100%推荐",
"作者": "[美]莉比・菲舍尔・赫尔曼 著,汪德均 /刘建洲/马遇乐 译",
"五星评分次数": "17669次",
"价格": "¥35.80",
"图书链接": "http://product.dangdang.com/28470981.html"
}{
"排名": "2",
"书名": "朝圣者",
"推荐指数": "100%推荐",
"作者": "【澳】泰瑞・海耶斯 译尤传莉著;酷威文化 出品",
"五星评分次数": "19538次",
"价格": "¥45.20",
"图书链接": "http://product.dangdang.com/25141508.html"
}