"""
TODO 显示影片基本信息
TODO 访问豆瓣电影Top250(https://movie.douban.com/top250?start=0),
TODO 获取每部电影的中文片名、排名、评分及其对应的链接,按照“排名-中文片名-评分-链接”的格式显示在屏幕上。
"""
import requests
from bs4 import BeautifulSoup
def get_html(url):
    """Fetch *url* and return the response body as text.

    A browser-like User-Agent is sent because douban rejects the default
    requests UA. A timeout is set so a stalled connection cannot hang the
    crawl forever, and HTTP errors fail fast instead of returning an
    error page that the parser would silently mis-read.
    """
    header = {'User-Agent': 'Mozilla/5.0'}
    response = requests.get(url=url, headers=header, timeout=10)
    response.raise_for_status()
    return response.text
# Listing-page parser
def parser_html(response):
    """Parse one Top250 listing page and print every movie in the
    assignment's required format "排名-中文片名-评分-链接".

    Returns the parsed rows as a list of (rank, title, score, link)
    tuples so callers can reuse the data instead of re-parsing stdout.
    """
    soup = BeautifulSoup(response, 'html.parser')
    movies = []
    for li in soup.select('ol.grid_view li'):
        # Rank
        ranking = li.select('em')[0].string
        # Chinese title: target span.title explicitly instead of relying
        # on the title being the first <span> inside the item.
        title = li.select('span.title')[0].string
        # Score
        score = li.select('span.rating_num')[0].string
        # Link to the detail page
        href = li.select('a[href]')[0].get('href')
        # Plain '-' separators with no extra spaces, as the spec asks;
        # the original comma-style print inserted spaces around each '-'.
        print(f'{ranking}-{title}-{score}-{href}')
        movies.append((ranking, title, score, href))
    return movies
# Script entry point
if __name__ == '__main__':
    # Top250 is paginated 25 movies per page via the "start" parameter,
    # so ten requests cover the full chart.
    base_url = 'https://movie.douban.com/top250?start={}&filter='
    for start in range(0, 250, 25):
        parser_html(get_html(base_url.format(start)))
"""
TODO 访问豆瓣电影Top250(https://movie.douban.com/top250?start=0),
TODO 在问题1的基础上,获取每部电影的导演、编剧、主演、类型、上映时间、片长、评分人数以及剧情简介等信息,
TODO 并将获取到的信息以CSV格式保存至本地文件中。
"""
import requests
from bs4 import BeautifulSoup
import csv
def get_html(url):
    """Fetch *url* and return the response body as text.

    Sends a browser-like User-Agent (douban rejects the default requests
    UA), bounds the request with a timeout so a dead connection cannot
    hang the crawl, and raises on HTTP errors instead of handing an
    error page to the parser.
    """
    header = {'User-Agent': 'Mozilla/5.0'}
    response = requests.get(url=url, headers=header, timeout=10)
    response.raise_for_status()
    return response.text
def parser_html(response):
    """Parse a Top250 listing page and, for every movie on it, fetch the
    movie's own detail page to collect director, writers, cast, genres,
    release date, runtime, rating count and plot summary.

    Returns a list of dicts (one per movie) ready for save_to_csv.
    Note: this issues one extra HTTP request per movie, so a full run
    makes ~250 detail requests.
    """
    # Hoisted: the same header is reused for every detail request.
    header = {'User-Agent': 'Mozilla/5.0'}
    soup = BeautifulSoup(response, 'html.parser')
    movies_info = []
    for li1 in soup.select('ol.grid_view li'):
        # Link to the movie's detail page.
        href = li1.select('a[href]')[0].get('href')
        inner_soup = BeautifulSoup(
            requests.get(url=href, headers=header, timeout=10).text,
            'html.parser')
        # Director
        director_span = inner_soup.find('span', string='导演')
        director = director_span.find_next('span').find('a').text if director_span else ''
        print(director)
        # Writers
        writer_span = inner_soup.find('span', string='编剧')
        writers = writer_span.find_next('span').find_all('a') if writer_span else []
        writer = ', '.join([w.text for w in writers])
        print(writer)
        # Cast
        actors_span = inner_soup.find('span', string='主演')
        actors = ', '.join([a.text for a in actors_span.find_next('span').find_all('a')] if actors_span else [])
        print(actors)
        # Genres: scan the whole detail page with find_all(). The original
        # used genres_span.find_all_next(), which starts AFTER the matched
        # span and therefore silently dropped the first genre of every movie.
        genres = ', '.join(g.text for g in inner_soup.find_all('span', property='v:genre'))
        print(genres)
        # Release date (first listed region only — TODO confirm whether all
        # regions are wanted)
        release_date_span = inner_soup.find('span', property='v:initialReleaseDate')
        release_date = release_date_span.text if release_date_span else ''
        print(release_date)
        # Runtime
        runtime_span = inner_soup.find('span', property='v:runtime')
        runtime = runtime_span.text if runtime_span else ''
        print(runtime)
        # Rating count comes from the listing item, not the detail page.
        rating_count = li1.select('div.star span')[-1].string.replace('人评价', '').strip()
        print(rating_count)
        # Plot summary; flatten newlines so it stays on one CSV row.
        summary_span = inner_soup.find('span', property='v:summary')
        summary = summary_span.text.strip() if summary_span else ''
        summary = summary.replace('\n', ' ').replace('\r', '')
        print(summary)
        print("------------------------------------------------------------------------")
        movies_info.append({
            '导演': director,
            '编剧': writer,
            '主演': actors,
            '类型': genres,
            '上映时间': release_date,
            '片长': runtime,
            '评分人数': rating_count,
            '剧情简介': summary,
        })
    return movies_info
def save_to_csv(movies_info, filename):
    """Write the scraped movie dicts to *filename* as UTF-8 CSV.

    Column order is taken from the keys of the first row. An empty
    input writes nothing and only reports the fact.
    """
    if not movies_info:
        print("movies_info is empty.")
        return
    fieldnames = list(movies_info[0].keys())
    with open(filename, mode='w', encoding='utf-8', newline='') as file:
        csv_writer = csv.DictWriter(file, fieldnames=fieldnames)
        csv_writer.writeheader()
        csv_writer.writerows(movies_info)
    print("爬取完毕并保存为 CSV 文件")
if __name__ == '__main__':
    base_url = 'https://movie.douban.com/top250?start={}&filter='
    all_movies_info = []
    # Ten listing pages of 25 movies each cover the full Top250.
    for start in range(0, 250, 25):
        page_url = base_url.format(start)
        try:
            page_info = parser_html(get_html(page_url))
        except KeyboardInterrupt:
            break
        except Exception as e:
            print(f"Error on page {page_url}: {e}")
            continue
        if page_info:
            all_movies_info.extend(page_info)
        else:
            print(f"Page {page_url} returned no movie info.")
    # Persist everything that was collected, even on a partial run.
    save_to_csv(all_movies_info, 'douban_top250_info.csv')
# 原创声明:本文系作者授权腾讯云开发者社区发表,未经许可,不得转载。
# 如有侵权,请联系 cloudcommunity@tencent.com 删除。