"""
TODO 显示影片基本信息
TODO 访问豆瓣电影Top250(https://movie.douban.com/top250?start=0),
TODO 获取每部电影的中文片名、排名、评分及其对应的链接,按照“排名-中文片名-评分-链接”的格式显示在屏幕上。
"""
import requests
from lxml import etree
def get_html(url, timeout=10):
    """Download a page and return its body as text.

    Args:
        url: Address of the page to fetch.
        timeout: Seconds to wait for the server before giving up.

    Returns:
        The response body decoded to ``str``.

    Raises:
        requests.RequestException: If the request times out, the connection
            fails, or the server answers with a 4xx/5xx status.
    """
    # A browser-like User-Agent is sent with every request.
    header = {'User-Agent': 'Mozilla/5.0'}
    # Without an explicit timeout, requests waits indefinitely on a stalled
    # connection, which would hang the whole crawl.
    response = requests.get(url=url, headers=header, timeout=timeout)
    # Fail loudly on an error status instead of handing an error page to the
    # parser, where it would silently yield no results.
    response.raise_for_status()
    return response.text
def parser_html(response):
    """Print rank, title, score and link for each movie on one list page.

    Args:
        response: HTML text of a Top250 list page.

    Every ``<li>`` under ``<ol class="grid_view">`` is treated as one movie
    and printed on its own line as ``rank - title - score - link``.
    """
    # Turn the raw HTML text into an lxml element tree.
    tree = etree.HTML(response)
    # XPath queries, relative to one <li>, listed in output order:
    # rank, title, score, link.
    field_paths = (
        './/em/text()',
        './/span[@class="title"]/text()',
        './/span[@class="rating_num"]/text()',
        './/a/@href',
    )
    for entry in tree.xpath('//ol[@class="grid_view"]/li'):
        # Only the first match of each query is used; an entry lacking any
        # of the four nodes raises IndexError.
        fields = [entry.xpath(path)[0] for path in field_paths]
        print(*fields, sep=' - ')
if __name__ == '__main__':
    base_url = 'https://movie.douban.com/top250?start={}&filter='
    # The "start" offset advances in steps of 25: 0, 25, ..., 225.
    for offset in range(0, 250, 25):
        # Fetch one list page and print the movies found on it.
        parser_html(get_html(base_url.format(offset)))
"""
TODO 访问豆瓣电影Top250(https://movie.douban.com/top250?start=0),
TODO 在问题1的基础上,获取每部电影的导演、编剧、主演、类型、上映时间、片长、评分人数以及剧情简介等信息,
TODO 并将获取到的信息以CSV格式保存至本地文件中。
"""
import requests
from bs4 import BeautifulSoup
import csv
# Crawl the Douban Top250 list pages, open every movie's detail page, print
# the extracted fields and store them in movie_info.csv.
url_base = "https://movie.douban.com/top250?start={}&filter="
headers = {"user-agent": "Mozilla/5.0"}

# Seconds to wait for any single HTTP response; without a timeout a stalled
# connection would block the crawl forever.
REQUEST_TIMEOUT = 10


def _fetch_soup(url):
    """Download ``url`` and return it parsed as a BeautifulSoup tree.

    Raises:
        requests.RequestException: On timeout, connection failure or a
            4xx/5xx answer, so an error page is never parsed as movie data.
    """
    resp = requests.get(url=url, headers=headers, timeout=REQUEST_TIMEOUT)
    resp.raise_for_status()
    return BeautifulSoup(resp.content, "lxml")


def _credit_names(soup, label):
    """Return the names listed after a credit label such as '导演', or None.

    The row is located by the text of its ``<span class="pl">`` label rather
    than by position, so a page that lacks one credit row does not shift the
    other fields.
    NOTE(review): assumes each label span is followed by a
    ``<span class="attrs">`` holding the names - confirm against a live page.
    """
    for label_span in soup.find_all("span", class_="pl"):
        # Tolerate a trailing ASCII or full-width colon in the label text.
        if label_span.get_text(strip=True).rstrip(":：") != label:
            continue
        names = label_span.find_next("span", class_="attrs")
        if names is not None:
            return names.get_text().strip()
    return None


def _joined_property(soup, prop):
    """Join the text of every ``<span property=prop>`` with ' / ', or None.

    A movie can carry several genre or release-date spans; all are kept.
    """
    values = [span.get_text(strip=True)
              for span in soup.find_all("span", property=prop)]
    return " / ".join(values) if values else None


def _first_property(soup, prop):
    """Return the text of the first ``<span property=prop>``, or None."""
    span = soup.find("span", property=prop)
    return span.text if span is not None else None


def _extract_movie_info(soup_detail):
    """Build one CSV row (dict keyed by column name) from a detail page.

    Any field that cannot be found on the page is stored as None.
    """
    summary_span = soup_detail.find("span", property="v:summary")
    summary = None
    if summary_span is not None:
        # Flatten the synopsis so each value stays on one line.
        summary = summary_span.text.strip().replace('\n', ' ').replace('\r', '')
    return {
        '导演': _credit_names(soup_detail, '导演'),
        '编剧': _credit_names(soup_detail, '编剧'),
        '主演': _credit_names(soup_detail, '主演'),
        '类型': _joined_property(soup_detail, "v:genre"),
        '上映时间': _joined_property(soup_detail, "v:initialReleaseDate"),
        '片长': _first_property(soup_detail, "v:runtime"),
        '评分人数': _first_property(soup_detail, "v:votes"),
        '剧情简介': summary,
    }


with open('movie_info.csv', 'w', newline='', encoding='utf-8') as csvfile:
    fieldnames = ['导演', '编剧', '主演', '类型', '上映时间', '片长', '评分人数', '剧情简介']
    writer = csv.DictWriter(csvfile, fieldnames=fieldnames)
    writer.writeheader()
    # Walk the list pages; the "start" offset advances in steps of 25.
    for i in range(0, 250, 25):
        list_soup = _fetch_soup(url_base.format(i))
        for item in list_soup.find_all("div", class_="item"):
            link = item.find("a")
            href = link.get('href') if link is not None else None
            if not href:
                # No detail link in this entry: nothing to fetch.
                continue
            movie_info = _extract_movie_info(_fetch_soup(href))
            # Echo the fields in column order, then a separator line.
            for field in fieldnames:
                print(movie_info[field])
            print("-------------------------------------------------------------------")
            # Append the row to the CSV file.
            writer.writerow(movie_info)
print("数据已保存到 movie_info.csv")
# 原创声明:本文系作者授权腾讯云开发者社区发表,未经许可,不得转载。
# 如有侵权,请联系 cloudcommunity@tencent.com 删除。