# Copyright notice (版权声明): Copyright © https://cloud.tencent.com/developer/article/1477113
# v2
import requests
import json
from lxml import etree
# Scrape the Douban movie Top 250 listing page by page and append one JSON
# object per movie (title, link, text of the last star-block span) to
# doubanmovie.txt.

# URL template for one listing page; the placeholder receives the value of
# the `start` query parameter.
temp_url = 'https://movie.douban.com/top250?start={}'
# Browser-like User-Agent sent with every request.
headers = {
    'User-Agent': 'Mozilla/5.0 (Macintosh; Intel Mac OS X 10_13_5) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/67.0.3396.99 Safari/537.36'}
i = 0
item = {}
while i < 250:
    url = temp_url.format(i)
    # A timeout keeps a stalled connection from hanging the script forever.
    resp = requests.get(url, headers=headers, timeout=10)
    html_str = resp.content.decode()
    html = etree.HTML(html_str)
    # Group the page into one <li> element per movie.
    li_list = html.xpath('//ol[@class="grid_view"]/li')
    if not li_list:
        # Nothing was found on this page, so stop paging.
        break
    # Open the output once per page rather than once per movie. The explicit
    # UTF-8 encoding is needed because ensure_ascii=False writes non-ASCII
    # characters verbatim, which the platform default codec may not support.
    with open('doubanmovie.txt', 'a', encoding='utf-8') as f:
        for li in li_list:
            # The leading "." anchors each query at the current <li>. A bare
            # "//" would search the whole document and return the values of
            # every movie on the page, not just this one.
            title_list = li.xpath('.//div/div[@class="hd"]/a/span[1]/text()')
            href_list = li.xpath('.//div/div[@class="hd"]/a/@href')
            pingjia_list = li.xpath(
                './/div[@class="bd"]/div[@class="star"]/span[last()]/text()')
            # A fresh dict per movie; an entry missing any of the three
            # fields raises IndexError instead of shifting later values.
            item = {
                'title': title_list[0],
                'href': href_list[0],
                'pj': pingjia_list[0],
            }
            json.dump(item, f, indent=2, ensure_ascii=False)
            f.write('\n')
    if len(li_list) < 25:
        # A short page is the last one; its entries were already written.
        break
    i += 25
import requests
import json
from lxml import etree
# Scrape the Douban movie Top 250 listing page by page and append one JSON
# object per movie (link and whitespace-stripped title) to doubanmovie.txt.

# URL template for one listing page; the placeholder receives the value of
# the `start` query parameter.
temp_url = 'https://movie.douban.com/top250?start={}'
# Browser-like User-Agent sent with every request.
headers = {
    'User-Agent': 'Mozilla/5.0 (Macintosh; Intel Mac OS X 10_13_5) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/67.0.3396.99 Safari/537.36'}
# Start at offset 0 so the first page is included; starting at 25 skipped it.
i = 0
while True:
    url = temp_url.format(i)
    # A timeout keeps a stalled connection from hanging the script forever.
    resp = requests.get(url, headers=headers, timeout=10)
    html_str = resp.content.decode()
    html = etree.HTML(html_str)
    href_list = html.xpath("//div[@class='info']/div/a/@href")
    # span[1] keeps only the first <span> of each link. Selecting every
    # <span> yields several texts per movie, so indexing that list by the
    # link position picked up text belonging to other spans or movies.
    # NOTE(review): pairing below assumes every link has a non-empty first
    # <span>; confirm against the live markup.
    title_list = html.xpath("//div[@class='info']/div/a/span[1]/text()")
    if href_list:
        # Open the output once per page rather than once per movie. The
        # explicit UTF-8 encoding is needed because ensure_ascii=False writes
        # non-ASCII characters verbatim.
        with open('doubanmovie.txt', 'a', encoding='utf-8') as f:
            # zip pairs each link with its own title, avoiding the repeated
            # href_list.index() scan, which is also wrong for duplicate links.
            for href, title in zip(href_list, title_list):
                item = {}
                item["href"] = href
                # Remove all whitespace (including \xa0) from the title.
                item['title'] = ''.join(title.split())
                json.dump(item, f, indent=2, ensure_ascii=False)
                f.write('\n')
    i += 25
    # Count movies (links), not spans: a page with fewer than 25 is the last.
    if len(href_list) < 25:
        break
print('爬取完成。。。')