上面是Scrapy的架构图,下面简单介绍一下各个组件
打开命令行,开启第一个Scrapy项目的实践
scrapy startproject douban
项目创建完成后可以看到在工程创建的位置有了douban文件夹,打开以后包含了上述的组件,可以使用Spyder、PyCharm等IDE打开项目。
cd douban
scrapy genspider example example.com
进入douban文件夹,并创建spider。上述命令中的example替换为spider的名字doubanmovie,example.com替换为douban.com,输入上述命令之后可以看到多了一个spider的py文件。
import scrapy
class DoubanItem(scrapy.Item):
    """Container for one movie scraped from the douban.com Top 250 list."""
    # define the fields for your item here like:
    # name = scrapy.Field()
    # Movie title
    title = scrapy.Field()
    # Douban rating score
    star = scrapy.Field()
    # Starring / cast line (field name kept as 'Staring' for compatibility
    # with the spider that populates it)
    Staring = scrapy.Field()
    # Rank within the Top 250
    rank = scrapy.Field()
    # One-line description / quote
    quote = scrapy.Field()
    # Detail-page URL on douban
    url = scrapy.Field()
    # Poster image URL. The spider assigns movie['image_url'], which raised
    # "KeyError: DoubanItem does not support field: image_url" before this
    # field was declared.
    image_url = scrapy.Field()
import scrapy
from douban.items import DoubanItem
class DoubanmovieSpider(scrapy.Spider):
    """Crawl the douban.com movie Top 250 and yield one DoubanItem per movie."""
    name = 'doubanmovie'
    start_urls = ['https://movie.douban.com/top250']

    def parse(self, response):
        """Parse one listing page: yield items, then follow the next-page link."""
        for entry in response.css('.item'):
            movie = DoubanItem()
            movie['rank'] = entry.css('.pic em::text').extract_first()
            movie['title'] = entry.css('.hd span.title::text').extract_first()
            movie['star'] = entry.css('.star span.rating_num::text').extract_first()
            # First text node of ".bd p" is the director/cast line.
            movie['Staring'] = entry.css('.bd p::text').extract_first()
            movie['quote'] = entry.css('.quote span.inq::text').extract_first()
            movie['url'] = entry.css('.pic a::attr("href")').extract_first()
            movie['image_url'] = entry.css('.pic img::attr("src")').extract_first()
            yield movie

        # Follow the "next page" link if present. response.urljoin resolves
        # both relative ("?start=25&filter=") and absolute links correctly;
        # the previous `start_urls[0] + next_url` concatenation only worked
        # for relative links against the bare listing URL.
        next_url = response.css('span.next a::attr("href")').extract_first()
        if next_url is not None:
            yield scrapy.Request(url=response.urljoin(next_url), callback=self.parse)
class DoubanPipeline(object):
    """Persist scraped Douban movie items into a local MongoDB collection."""

    def __init__(self) -> None:
        # Connect to the local MongoDB server.
        # NOTE(review): requires `from pymongo import MongoClient` at module
        # top — confirm it is imported there.
        self.client = MongoClient(host='localhost', port=27017)
        # If authentication is enabled, log in first:
        # db_auth = self.client.admin
        # db_auth.authenticate('root', 'root')
        # Database and collection that receive the scraped items.
        self.col = self.client['douban_movie']
        self.top250 = self.col.top250
        # Optionally wipe data saved by previous runs:
        # self.top250.delete_many({})

    def process_item(self, item, spider):
        # Store the item as a plain dict and pass it along the pipeline.
        self.top250.insert_one(dict(item))
        return item

    def open_spider(self, spider):
        pass

    def close_spider(self, spider):
        # Release the MongoDB connection once the spider finishes.
        self.client.close()
scrapy crawl doubanmovie
scrapy crawl doubanmovie -o top250.json -s FEED_EXPORT_ENCODING=UTF-8
回复“电影”,获得排行榜信息与源代码