Implementation steps:

1. Create a project: `scrapy startproject firstdemo`
2. Create a CrawlSpider-based spider: `scrapy genspider -t crawl xxx www.xxx.com`
3. Edit the spider file:
   - Import the distributed spider class: `from scrapy_redis.spiders import RedisCrawlSpider`
   - Comment out `start_urls` and `allowed_domains`
   - Add `redis_key = 'sun'`, the name of the shared scheduler queue
   - Change the spider's base class to `RedisCrawlSpider`
4. Configure `settings.py`:
   - Use the pipeline shipped with scrapy_redis: `ITEM_PIPELINES = {'scrapy_redis.pipelines.RedisPipeline': 400}`
   - Add a dedup container class that uses a Redis `set` to store request fingerprints, so request dedup is persistent: `DUPEFILTER_CLASS = 'scrapy_redis.dupefilter.RFPDupeFilter'`
   - Use the scheduler shipped with the scrapy_redis component: `SCHEDULER = 'scrapy_redis.scheduler.Scheduler'`
   - Decide whether the scheduler persists, i.e. whether the request queue and fingerprint set in Redis are kept when the crawl ends (in plain terms: if the crawler machine goes down, whether a restart resumes the crawl or starts from zero): `SCHEDULER_PERSIST = True`
   - Point at the Redis server: `REDIS_HOST = '127.0.0.1'` and `REDIS_PORT = 6379`
5. Edit the Redis configuration (`redis.conf`, or `redis.windows.conf` on Windows):
   - Comment out `bind 127.0.0.1` so other machines can connect
   - Change `protected-mode yes` to `protected-mode no`
6. Start the Redis server with that config, then a client: `redis-server redis.windows.conf`, then `redis-cli`
7. Run the spider: `scrapy runspider xxx.py`
8. Push a start URL into the queue named by `redis_key`: `lpush xxx www.xx.com`
9. Check the results, which land in the Redis list `proName:items`: `lrange xx:items 0 -1` lists them and `llen xx:items` counts them (a redis-py version of these checks is sketched after this list)
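As a quick check that everything is wired together, the seeding and inspection steps above can also be done from Python with redis-py instead of redis-cli. A minimal sketch, assuming a local Redis instance and the sun2 spider shown below (its `redis_key` is `'sun'`, and by default scrapy_redis's RedisPipeline stores items under `<spider_name>:items`, here `sun2:items`):

```python
from redis import Redis

conn = Redis(host='127.0.0.1', port=6379)

# Seed the shared request queue named by redis_key; any idle spider
# instance connected to this Redis will pick the URL up and start crawling.
conn.lpush('sun', 'http://www.xxx.com')

# RedisPipeline stores scraped items under <spider_name>:items by default.
print(conn.llen('sun2:items'))           # how many items so far
print(conn.lrange('sun2:items', 0, -1))  # the serialized items themselves
```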
Code implementation
sun2.py
```python
from scrapy.linkextractors import LinkExtractor
from scrapy.spiders import CrawlSpider, Rule
from scrapy_redis.spiders import RedisCrawlSpider
from sun2Pro.items import Sun2ProItem


class Sun2Spider(RedisCrawlSpider):
    name = 'sun2'
    # start_urls and allowed_domains are commented out; the start URL is
    # pushed into the shared Redis queue named by redis_key instead
    redis_key = 'sun'

    rules = (
        Rule(LinkExtractor(allow=r'id=2&page=\d+'), callback='parse_item', follow=True),
    )

    def parse_item(self, response):
        li_list = response.xpath('/html/body/div[2]/div[3]/ul[2]/li')
        for li in li_list:
            new_num = li.xpath('./span[1]/text()').extract_first()
            new_title = li.xpath('./span[3]/a/text()').extract_first()
            item = Sun2ProItem()
            item['title'] = new_title
            item['new_num'] = new_num
            yield item
```
items.py
```python
import scrapy


class Sun2ProItem(scrapy.Item):
    title = scrapy.Field()
    new_num = scrapy.Field()
```
settings.py
```python
DOWNLOAD_DELAY = 3

# Use the pipeline shipped with scrapy_redis
ITEM_PIPELINES = {
    'scrapy_redis.pipelines.RedisPipeline': 400
}

# Scheduler configuration
# Add a dedup container class that uses a Redis set to store request
# fingerprints, so request dedup is persistent
DUPEFILTER_CLASS = 'scrapy_redis.dupefilter.RFPDupeFilter'
# Use the scheduler shipped with the scrapy_redis component
SCHEDULER = 'scrapy_redis.scheduler.Scheduler'
# Whether the scheduler persists, i.e. whether the request queue and
# fingerprint set in Redis are kept when the crawl ends (in plain terms:
# if the crawler machine goes down, whether a restart resumes the crawl
# or starts from zero)
SCHEDULER_PERSIST = True

# Point at the Redis server
REDIS_HOST = '127.0.0.1'
REDIS_PORT = 6379
```
Incremental crawling: every detail URL that has been crawled is recorded in Redis's `set` data structure under the key `urls`, so a URL already in the set can be skipped on later runs; inspect the set with `smembers urls`.
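Why a set? Redis's `sadd` reports whether the member was actually added, and that return value is the entire dedup trick the case below relies on. A minimal sketch, assuming a local Redis server (the example URL is made up):

```python
from redis import Redis

conn = Redis(host='127.0.0.1', port=6379)

# sadd returns 1 when the member is new and 0 when it already exists,
# so the return value doubles as a "have we crawled this before?" flag.
print(conn.sadd('urls', 'http://example.com/detail/1'))  # 1 -> first sighting
print(conn.sadd('urls', 'http://example.com/detail/1'))  # 0 -> already recorded
print(conn.smembers('urls'))                             # every URL seen so far
```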
Case demonstration
movie.py
```python
import scrapy
from scrapy.linkextractors import LinkExtractor
from scrapy.spiders import CrawlSpider, Rule
from redis import Redis
from moviePro.items import MovieproItem


class MovieSpider(CrawlSpider):
    name = 'movie'
    start_urls = ['http://www.male37.live/index.php/vod/type/id/2/page/2.html']

    rules = (
        # Rule(LinkExtractor(allow=r'/id/3/page/\d+/\.html'), callback='parse_item', follow=True),
        Rule(LinkExtractor(allow=r'id/\d+/page/\d+\.html'), callback='parse_item', follow=True),
    )

    # One shared Redis connection for recording which detail URLs were crawled
    conn = Redis(host='127.0.0.1', port=6379)

    def parse_item(self, response):
        li_list = response.xpath('/html/body/div[1]/div/div[1]/div/div/div[2]/ul/li')
        for li in li_list:
            detail_url = 'http://www.male37.live' + li.xpath('./div/a/@href').extract_first()
            # sadd returns 1 if the URL is new to the set, 0 if it is already there
            ex = self.conn.sadd('urls', detail_url)
            if ex == 1:
                print('This URL has not been crawled yet; scraping it now!')
                yield scrapy.Request(url=detail_url, callback=self.parse_detail)
            else:
                print('No update yet; no new data to crawl!')

    def parse_detail(self, response):
        item = MovieproItem()
        item['name'] = response.xpath(
            '/html/body/div[1]/div/div[1]/div[1]/div/div/div/div[2]/h1/text()').extract_first()
        print(item['name'], '--------------')
        # Grab all text nodes inside the span and join them; without //text()
        # extract_first() would return the span's raw HTML
        item['desc'] = response.xpath(
            '/html/body/div[1]/div/div[1]/div[1]/div/div/div/div[2]/p[5]/span[2]//text()').extract()
        item['desc'] = ''.join(item['desc'])
        yield item
```
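Running `scrapy crawl movie` a second time without clearing Redis should hit the else branch for every detail URL: the `urls` set survives between runs, and that persistence is exactly what makes the crawl incremental.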
items.py
```python
import scrapy


class MovieproItem(scrapy.Item):
    name = scrapy.Field()
    desc = scrapy.Field()
```
pipelines.py
```python
import json


class MovieproPipeline:
    conn = None

    def open_spider(self, spider):
        # Reuse the Redis connection created on the spider class
        self.conn = spider.conn

    def process_item(self, item, spider):
        dic = {
            'name': item['name'],
            'desc': item['desc']
        }
        print(dic)
        # redis-py cannot lpush a dict directly, so serialize it first
        self.conn.lpush('movieData', json.dumps(dic))
        return item
```
settings.py
```python
REDIS_HOST = '127.0.0.1'
REDIS_PORT = 6379

ITEM_PIPELINES = {
    'moviePro.pipelines.MovieproPipeline': 300,
}
```
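To read the stored results back out, the `movieData` list can be walked with `lrange` and each entry decoded. A minimal sketch, assuming the pipeline above serialized items with `json.dumps`:

```python
import json
from redis import Redis

conn = Redis(host='127.0.0.1', port=6379)

# Walk the whole movieData list and decode each serialized item.
for raw in conn.lrange('movieData', 0, -1):
    print(json.loads(raw))
```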