大家好,我是北山啦。
@[toc]
本文利用Python爬取豆瓣电影Top250,收集'排名','电影名称','导演','上映年份','制作国家','类型','评分','评价分数','短评'等字段,再用Python对这些数据进行分析和可视化,探索其中的规律。
#https://beishan.blog.csdn.net/article/details/112735850
第一页:https://movie.douban.com/top250
第二页:https://movie.douban.com/top250?start=25&filter=
第三页:https://movie.douban.com/top250?start=50&filter=
观察可知,我们只需要修改start参数即可
headers中有很多字段,这些字段都有可能会被对方服务器拿过来进行判断是否为爬虫
其中最常见的做法是通过headers中的User-Agent字段来识别请求是否来自浏览器
在这里我们只需要添加请求头即可
这里我使用的是xpath
# -*- coding: utf-8 -*-
# @Author: Kun
import requests
from lxml import etree
import pandas as pd
# Column labels of the exported spreadsheet, in the same order as the
# values placed in each scraped row.
columns = [
    '排名', '电影名称', '导演', '上映年份', '制作国家',
    '类型', '评分', '评价分数', '短评',
]

# Browser-like headers sent with every page request.
headers = {
    'User-Agent': 'Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/89.0.4343.0 Safari/537.36',
    'Referer': 'https://movie.douban.com/top250',
}

# Accumulates one row (a list of field values) per scraped movie; despite
# the name it is a plain list, not a DataFrame.
df = []
def _split_info(text):
    """Split the "year / country / genre" line of one list entry.

    Non-breaking spaces are removed and the text is split on '/'. The
    first part is returned as the release date; country and genre are
    taken from the END of the list. Entries that list several release
    dates produce more than three parts, and indexing them positionally
    (parts[1], parts[2]) puts a date such as '1978(中国大陆)' into the
    genre column.

    Returns:
        tuple: (dates, areas, genres), each stripped of surrounding
        whitespace.

    Raises:
        IndexError: if the line has fewer than three '/'-separated parts.
    """
    parts = [part.strip() for part in text.strip().replace('\xa0', '').split('/')]
    if len(parts) < 3:
        raise IndexError('expected at least 3 "/"-separated fields, got %d' % len(parts))
    return parts[0], parts[-2], parts[-1]


def get_data(html):
    """Parse one listing page, append its rows to ``df`` and rewrite the sheet.

    For every ``<li>`` of the ranked list, the rank, title, director text,
    release date, country, genre, rating, rating count and (optional)
    one-line quote are extracted and appended to the module-level ``df``
    list. The accumulated rows are then written to 'Top250.xlsx' using the
    module-level ``columns`` as header.

    Args:
        html: HTML source of one Top250 listing page.
    """
    tree = etree.HTML(html)
    for li in tree.xpath('//*[@id="content"]/div/div[1]/ol/li'):
        ranks = li.xpath('div/div[1]/em/text()')
        titles = li.xpath('div/div[2]/div[1]/a/span[1]/text()')
        info_lines = li.xpath('div/div[2]/div[2]/p[1]/text()')
        # First text line holds director and cast separated by three
        # non-breaking spaces; only the leading (director) part is kept.
        director = info_lines[0].strip().replace("\xa0\xa0\xa0", "\t").split("\t")[0]
        dates, areas, genres = _split_info(info_lines[1])
        ratings = li.xpath('.//div[@class="star"]/span[2]/text()')[0]
        # Drop the trailing three-character suffix after the rating count.
        scores = li.xpath('.//div[@class="star"]/span[4]/text()')[0][:-3]
        quotes = li.xpath('.//p[@class="quote"]/span/text()')
        if not ranks or not titles:
            # No rank or title found: nothing to record for this item.
            continue
        quote = quotes[0] if quotes else None
        df.append([ranks[0], titles[0], director, dates, areas, genres,
                   ratings, scores, quote])
    # Rewrite the spreadsheet with everything collected so far.
    d = pd.DataFrame(df, columns=columns)
    d.to_excel('Top250.xlsx', index=False)
# The list is paged 25 entries at a time, so the valid offsets are
# start=0, 25, ..., 225. Stopping at 250 (exclusive) avoids requesting an
# eleventh page that holds no entries.
for i in range(0, 250, 25):
    url = "https://movie.douban.com/top250?start={}&filter=".format(str(i))
    # A timeout keeps a stalled connection from hanging the whole run.
    res = requests.get(url, headers=headers, timeout=10)
    html = res.text
    get_data(html)
结果如下:
# -*- coding: utf-8 -*-
"""
Created on Tue Feb 2 15:19:29 2021
@author: 北山啦
"""
import pandas as pd
import time
import requests
from lxml import etree
from queue import Queue
from threading import Thread, Lock
class Movie:
    """Multi-threaded scraper for the Douban Top250 listing pages.

    Page URLs are placed on a queue, worker threads drain the queue, and
    every parsed page is appended to ``self.df`` and flushed to
    'douban.xlsx'.
    """

    def __init__(self):
        # Rows collected so far; one list of field values per movie.
        self.df = []
        self.headers = {'User-Agent': 'Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/89.0.4343.0 Safari/537.36',
                        'Referer': 'https://movie.douban.com/top250'}
        self.columns = ['排名', '电影名称', '导演', '上映年份', '制作国家', '类型', '评分', '评价分数', '短评']
        # Serialises updates of self.df and writes of the spreadsheet.
        self.lock = Lock()
        # Pending page URLs shared by all worker threads.
        self.url_list = Queue()

    def get_url(self):
        """Queue the ten listing-page URLs (start=0, 25, ..., 225)."""
        url = 'https://movie.douban.com/top250?start={}&filter='
        for i in range(0, 250, 25):
            self.url_list.put(url.format(str(i)))

    def get_html(self):
        """Worker loop: fetch and parse queued pages until the queue is empty."""
        from queue import Empty

        while True:
            # A separate empty() check followed by a blocking get() can hang
            # forever when another worker takes the last URL in between;
            # get_nowait() makes "take one or stop" a single step.
            try:
                url = self.url_list.get_nowait()
            except Empty:
                break
            # A timeout keeps one stalled connection from blocking join().
            resp = requests.get(url, headers=self.headers, timeout=10)
            self.xpath_parse(resp.text)

    @staticmethod
    def _split_info(text):
        """Split the "year / country / genre" line of one list entry.

        Country and genre are taken from the end of the '/'-separated
        parts because entries with several release dates have more than
        three parts; positional indexing would put a date such as
        '1978(中国大陆)' into the genre column.

        Returns:
            tuple: (dates, areas, genres), each stripped of whitespace.

        Raises:
            IndexError: if the line has fewer than three parts.
        """
        parts = [part.strip() for part in text.strip().replace('\xa0', '').split('/')]
        if len(parts) < 3:
            raise IndexError('expected at least 3 "/"-separated fields, got %d' % len(parts))
        return parts[0], parts[-2], parts[-1]

    def xpath_parse(self, html):
        """Parse one listing page, record its rows and rewrite 'douban.xlsx'.

        Args:
            html: HTML source of one Top250 listing page.
        """
        tree = etree.HTML(html)
        rows = []
        for li in tree.xpath('//*[@id="content"]/div/div[1]/ol/li'):
            ranks = li.xpath('div/div[1]/em/text()')
            titles = li.xpath('div/div[2]/div[1]/a/span[1]/text()')
            info_lines = li.xpath('div/div[2]/div[2]/p[1]/text()')
            # First text line holds director and cast separated by three
            # non-breaking spaces; only the leading (director) part is kept.
            director = info_lines[0].strip().replace("\xa0\xa0\xa0", "\t").split("\t")[0]
            dates, areas, genres = self._split_info(info_lines[1])
            ratings = li.xpath('.//div[@class="star"]/span[2]/text()')[0]
            # Drop the trailing three-character suffix after the rating count.
            scores = li.xpath('.//div[@class="star"]/span[4]/text()')[0][:-3]
            quotes = li.xpath('.//p[@class="quote"]/span/text()')
            if not ranks or not titles:
                continue
            quote = quotes[0] if quotes else None
            rows.append([ranks[0], titles[0], director, dates, areas, genres,
                         ratings, scores, quote])
        # Several workers call this method at once; without the lock they
        # would rebuild and rewrite the same spreadsheet file concurrently.
        with self.lock:
            self.df.extend(rows)
            d = pd.DataFrame(self.df, columns=self.columns)
            d.to_excel('douban.xlsx', index=False)

    def main(self):
        """Scrape all pages with five worker threads and print elapsed seconds."""
        start_time = time.time()
        self.get_url()
        th_list = [Thread(target=self.get_html) for _ in range(5)]
        for th in th_list:
            th.start()
        for th in th_list:
            th.join()
        end_time = time.time()
        print(end_time - start_time)


if __name__ == '__main__':
    spider = Movie()
    spider.main()
获取数据后,就可以对自己感兴趣的内容进行分析了
# Load the spreadsheet that get_data() writes ('Top250.xlsx'). Note that this
# rebinds `df`, which the scraper script used as its plain list of rows.
df = pd.read_excel("Top250.xlsx",index_col=False)
# Preview the first rows; as a bare expression the result is only shown in
# an interactive session / notebook.
df.head()
from pyecharts import options as opts
from pyecharts.charts import Bar
from pyecharts.faker import Faker
# Bar chart titled "Top250年份分布" with one series "影片数量", zoomable via a
# data-zoom control, rendered to 1.html.
# NOTE(review): x1 and y1 are not defined anywhere in the visible source;
# presumably the distinct release years and the film count per year —
# they must be computed from df before this runs.
# NOTE(review): c1 is bound to the return value of .render(), presumably the
# output path rather than the chart object — confirm against pyecharts docs.
c1 = (
Bar()
.add_xaxis(x1)
.add_yaxis("影片数量", y1)
.set_global_opts(
title_opts=opts.TitleOpts(title="Top250年份分布"),
datazoom_opts=opts.DataZoomOpts(),
)
.render("1.html")
)
# `plt` is used below but is not imported anywhere in this file; without
# this import the first plt.figure() call fails with NameError.
import matplotlib.pyplot as plt

# Histogram of the rating column, split into 8 bins.
plt.figure(figsize=(10,6))
plt.hist(list(df["评分"]),bins=8,facecolor="blue", edgecolor="black", alpha=0.7)
plt.show()

# Scatter of rating against the DataFrame row index.
# NOTE(review): presumably the row order follows the rank, so the x axis
# stands in for rank — confirm the spreadsheet is written in rank order.
plt.figure(figsize=(10,5), dpi=100)
plt.scatter(df.index,df['评分'])
plt.show()
总的来说,排名越靠前,评价人数越多,并且分数也越高。
# Horizontal bar chart (axes swapped by reversal_axis) titled "电影评论Top10":
# film titles on one axis, the '评价分数' column on the other, value labels
# placed to the right of each bar, rendered to 2.html.
# NOTE(review): df1 is not defined anywhere in the visible source; presumably
# the ten rows of df with the largest '评价分数' — it must be built first.
# NOTE(review): c2 is bound to the return value of .render(), presumably the
# output path rather than the chart object — confirm against pyecharts docs.
c2 = (
Bar()
.add_xaxis(df1["电影名称"].to_list())
.add_yaxis("评论数", df1["评价分数"].to_list(),color=Faker.rand_color())
.reversal_axis()
.set_series_opts(label_opts=opts.LabelOpts(position="right"))
.set_global_opts(title_opts=opts.TitleOpts(title="电影评论Top10"))
.render("2.html")
)
让我们来看看人气最高的有哪些影片,你又看过几部呢?
可以看到这些导演很🐂呀
from collections import Counter

# Every cell of the '类型' column holds whitespace-separated genre names.
# Glue all cells together with spaces and split on whitespace to obtain one
# flat list of genre tokens (kept under the original name `colors`).
colors = ' '.join(df['类型']).split()

# Tally the tokens into a plain dict: genre -> number of occurrences.
c = dict(Counter(colors))

# Bare expression: displays the tally in an interactive session / notebook.
c
发现有个错误值
# Remove the stray release-date token that leaked into the genre tally.
# Passing a default makes this a no-op (d becomes None) instead of raising
# KeyError when the scraped data does not contain that bad value.
d = c.pop('1978(中国大陆)', None)
删除即可
方法二:del d[key]
d = {'a':1,'b':2,'c':3}
# Delete the element stored under the given key.
del d['a']
print(d)
# Deleting a key that does not exist raises KeyError. Catch it so the
# demonstration shows the error without aborting the rest of the script.
try:
    del d['m']
except KeyError as err:
    print('KeyError:', err)
clear:一次性删除所有字典元素
d = {'a':1,'b':2,'c':3}
print(d)
# 删除所有元素,允许d为{}
d.clear()
print(d)
统计展示
在这里插入图片描述
可视化展示
# WordCloud is used below but is not imported anywhere in this file; only
# Bar is imported from pyecharts.charts, so the call fails with NameError.
from pyecharts.charts import WordCloud

# Word cloud with a custom font family, rendered to
# wordcloud_custom_font_style.html.
# NOTE(review): `words` is not defined anywhere in the visible source;
# presumably the (genre, count) pairs of the tally above, e.g.
# list(c.items()) — confirm and define it before this statement.
# NOTE(review): this rebinds `c` (the genre tally) to the return value of
# .render(), so the tally is no longer reachable under that name afterwards.
c = (
    WordCloud()
    .add(
        "",
        words,
        word_size_range=[20, 100],
        textstyle_opts=opts.TextStyleOpts(font_family="cursive"),
    )
    .set_global_opts(title_opts=opts.TitleOpts(title="WordCloud-自定义文字样式"))
    .render("wordcloud_custom_font_style.html")
)
## https://blog.csdn.net/qq_45176548/article/details/112735850
就可以清楚的看到,Top250的电影的类别
原创声明:本文系作者授权腾讯云开发者社区发表,未经许可,不得转载。
如有侵权,请联系 cloudcommunity@tencent.com 删除。
原创声明:本文系作者授权腾讯云开发者社区发表,未经许可,不得转载。
如有侵权,请联系 cloudcommunity@tencent.com 删除。