环境
Python 3.6 + requests + lxml
Lxml资料
分析
网址:https://movie.douban.com/cinema/nowplaying/chengdu/
按 F12 分析网页结构,通过 lxml 解析 requests 获取到的页面信息,筛选出自己需要的电影相关信息。
源码
import os

import requests
from lxml import etree
def parse1(url):
    """Scrape a now-playing listing page and save every movie to disk.

    The page at ``url`` is fetched and parsed as HTML. For each ``<li>``
    matched under the ``nowplaying`` container, the movie metadata is read
    from its ``data-*`` attributes, the plot summary is fetched from the
    movie's detail page via ``parse2``, and the result is written to
    ``movies/<name>/<name>.txt``. The poster image is then saved next to it
    via ``download``.

    Args:
        url: Address of the now-playing listing page.

    Raises:
        KeyError: If a matched ``<li>`` lacks one of the expected ``data-*``
            attributes.
    """
    # A timeout keeps the script from hanging forever on a stalled connection.
    response = requests.get(url, timeout=10).content
    tree = etree.HTML(response)
    datas = tree.xpath('//div[@id="nowplaying"]/div[2]/ul/li')
    for data in datas:
        all_attrib = data.attrib
        name = all_attrib['data-title']
        score = all_attrib['data-score']
        director = all_attrib['data-director']
        actors = all_attrib['data-actors']
        duration = all_attrib['data-duration']

        # XPath queries return lists; keep them as lists so that a missing
        # poster or link does not raise IndexError for the whole run.
        poster_urls = data.xpath('ul/li[@class="poster"]/a/img/@src')
        info_urls = data.xpath('ul/li[@class="poster"]/a/@href')
        info = parse2(info_urls[0]) if info_urls else ''

        movie_dir = os.path.join('movies', name)
        os.makedirs(movie_dir, exist_ok=True)

        all_info = (
            '电影:' + name + '\n'
            + '评分:' + score + '\n'
            + '时长:' + duration + '\n'
            + '导演:' + director + '\n'
            + '主演:' + actors + '\n'
            + '剧情:' + info + '\n'
        )
        # Explicit UTF-8 so the Chinese text is written correctly regardless
        # of the platform's default encoding.
        text_path = os.path.join(movie_dir, name + '.txt')
        with open(text_path, 'w', encoding='utf-8') as ft:
            # U+22EF (midline ellipsis) is normalised to three ASCII dots.
            ft.write(all_info.replace('\u22ef', '...'))

        if poster_urls:
            download(name, poster_urls[0])
def parse2(url):
    """Fetch a movie detail page and return its plot summary.

    The summary is the first text node found at ``div/span`` inside a
    ``<div class="related-info">`` element, stripped of surrounding
    whitespace. If several such elements exist, the last one that has text
    wins.

    Args:
        url: Address of the movie detail page.

    Returns:
        The stripped summary text, or an empty string when the page has no
        matching element or the element has no text.
    """
    # A timeout keeps the script from hanging forever on a stalled connection.
    response = requests.get(url, timeout=10).content
    tree = etree.HTML(response)
    info = ''
    for block in tree.xpath('//div[@class="related-info"]'):
        texts = block.xpath('div/span/text()')
        # Skip blocks without text instead of raising IndexError.
        if texts:
            info = texts[0].strip()
    return info
def download(name, url):
    """Download a poster image to ``movies/<name>/<name>.jpg``.

    Args:
        name: Movie title; used for both the directory and the file name.
        url: Address of the image to download.
    """
    # A timeout keeps the script from hanging forever on a stalled connection.
    content = requests.get(url, timeout=10).content
    movie_dir = os.path.join('movies', name)
    # Create the target directory here so the function also works when the
    # caller has not prepared it beforehand.
    os.makedirs(movie_dir, exist_ok=True)
    with open(os.path.join(movie_dir, name + '.jpg'), 'wb') as fp:
        fp.write(content)
if __name__ == '__main__':
    # Entry point: scrape the now-playing listing for Chengdu.
    nowplaying_url = 'https://movie.douban.com/cinema/nowplaying/chengdu/'
    parse1(nowplaying_url)
领取专属 10元无门槛券
私享最新 技术干货