# Setup (run in a shell, not Python): git clone https://github.com/rolando/scrapy-redis.git
import scrapy
from sunsite.items import SunsiteItem
class SunproSpider(scrapy.Spider):
    """Basic (non-distributed) spider scraping complaint rows from the
    Sunshine hotline listing page.

    Yields one ``SunsiteItem`` per ``<li>`` row with ``title`` and ``status``.
    """
    name = 'sunpro'
    # allowed_domains = ['www.xxx.com']
    start_urls = ['http://wz.sun0769.com/political/index/politicsNewest?id=1&page=1']

    def parse(self, response):
        """Parse one listing page and yield an item per complaint row.

        Fix: the original chained ``extract_first().split('\\n ...')[1]``,
        which raises AttributeError when the span is missing and IndexError
        when the page's whitespace layout changes. Use a '' default plus
        strip() to peel the surrounding newline/indent padding robustly.
        """
        li_list = response.xpath("/html/body/div[2]/div[3]/ul[2]//li")
        for li in li_list:
            item = SunsiteItem()
            item['title'] = li.xpath("./span[3]/a/text()").extract_first()
            # Default to '' so strip() below never hits None.
            status = li.xpath("./span[2]/text()").extract_first(default='')
            item['status'] = status.strip()
            yield item
import scrapy
from scrapy.linkextractors import LinkExtractor
from scrapy.spiders import CrawlSpider, Rule
from sunsite.items import SunsiteItem
from scrapy_redis.spiders import RedisCrawlSpider
class SunprocrawlSpider(RedisCrawlSpider):
    """Distributed CrawlSpider variant of SunproSpider (scrapy-redis).

    Start URLs are pushed to the Redis list named by ``redis_key``;
    pagination links matching ``id=1&page=\\d+`` are followed automatically.
    """
    name = 'Sunprocrawl'
    # allowed_domains = ['www.xxx.com']
    # start_urls = ['http://wz.sun0769.com/political/index/politicsNewest?id=1&page=1']
    # Redis list the scheduler pops seed URLs from (lpush sunurl <url>).
    redis_key = 'sunurl'
    rules = (
        Rule(LinkExtractor(allow=r'id=1&page=\d+'), callback='parse_item', follow=True),
    )

    def parse_item(self, response):
        """Parse one listing page and yield an item per complaint row.

        Fix (mirrors SunproSpider.parse): avoid AttributeError/IndexError
        from the brittle ``extract_first().split(...)`` chain by defaulting
        to '' and stripping the whitespace padding.
        """
        li_list = response.xpath("/html/body/div[2]/div[3]/ul[2]//li")
        for li in li_list:
            item = SunsiteItem()
            item['title'] = li.xpath("./span[3]/a/text()").extract_first()
            # Default to '' so strip() below never hits None.
            status = li.xpath("./span[2]/text()").extract_first(default='')
            item['status'] = status.strip()
            yield item
import scrapy
class SunsiteItem(scrapy.Item):
    """Item for one complaint record scraped from the Sunshine hotline listing."""
    # Complaint title text taken from the row's anchor.
    title = scrapy.Field()
    # Status text taken from the row's status span.
    status = scrapy.Field()
# -*- coding: utf-8 -*-
# This spider is a basic spider modified for distributed crawling.
import scrapy
from JDspider.items import JdspiderItem
import json
# ----1 Import the distributed spider class
from scrapy_redis.spiders import RedisSpider
class JdproSpider(RedisSpider):  # ----2 Inherit from RedisSpider
    """Distributed spider crawling JD book categories, book listings and prices.

    Flow: category tree page -> parse (big/small categories) ->
    parse_book_link (book cards per small category) ->
    parse_price (JSON price endpoint) -> yield JdspiderItem.
    """
    name = 'JDpro'
    # start_urls = ['https://book.jd.com/booksort.html']
    # ----4 Redis list the scheduler pops seed URLs from.
    redis_key = 'tranurl'

    # ----5 Allow allowed_domains to be passed at runtime, e.g.
    #       scrapy crawl JDpro -a domain=jd.com,p.3.cn
    def __init__(self, *args, **kwargs):
        domain = kwargs.pop('domain', '')
        # filter(None, ...) drops empty strings from a trailing/leading comma.
        self.allowed_domains = list(filter(None, domain.split(',')))
        super(JdproSpider, self).__init__(*args, **kwargs)

    def parse(self, response):
        """Parse the category tree: big categories (dt/a) and their small
        sub-categories (sibling dd/em/a), then request each small-category
        listing page."""
        big_node_list = response.xpath("//div[@class='mc']//dt/a")
        for big_node in big_node_list:
            # Big-category name and absolute URL.
            big_category = big_node.xpath("./text()").extract_first()
            big_category_link = response.urljoin(big_node.xpath("./@href").extract_first())
            # Small categories live in the following-sibling <dd> of the <dt>
            # containing this anchor.
            small_node_list = big_node.xpath("../following-sibling::dd[1]/em/a")
            # TODO(review): the [:1] slice is a leftover "first item only" test
            # limit from development; remove it to crawl every small category.
            for small_node in small_node_list[:1]:
                temp = {}
                temp['big_category'] = big_category
                temp['big_category_link'] = big_category_link
                temp['small_category'] = small_node.xpath("./text()").extract_first()
                temp['small_category_link'] = response.urljoin(
                    small_node.xpath("./@href").extract_first())
                # Carry the partially-built record to the next callback via meta.
                yield scrapy.Request(
                    url=temp['small_category_link'],
                    callback=self.parse_book_link,
                    meta={'data': temp}
                )

    def parse_book_link(self, response):
        """Parse a small-category listing page: one item per book card, then
        request the price JSON for each book."""
        temp = response.meta['data']
        book_list = response.xpath("//*[@id='J_goodsList']/ul/li/div")
        for book in book_list:
            item = JdspiderItem()
            item['big_category'] = temp['big_category']
            item['big_category_link'] = temp['big_category_link']
            item['small_category'] = temp['small_category']
            item['small_category_link'] = temp['small_category_link']
            # Two xpath alternatives cover the two card layouts JD serves.
            item['bookname'] = book.xpath(
                './div[3]/a/em/text()|./div/div[2]/div[2]/div[3]/a/em/text()').extract_first()
            item['author'] = book.xpath(
                './div[4]/span[1]/a/text()|./div/div[2]/div[2]/div[4]/span[1]/span[1]/a/text()').extract_first()
            item['link'] = response.urljoin(book.xpath(
                './div[1]/a/@href|./div/div[2]/div[2]/div[1]/a/@href').extract_first())
            # SKU id is needed to build the price URL.
            skuid = book.xpath('.//@data-sku').extract_first()
            if not skuid:
                # Fix: the original concatenated skuid unconditionally, raising
                # TypeError when data-sku is absent. Yield the item without a
                # price instead of crashing the whole page.
                yield item
                continue
            pri_url = 'https://p.3.cn/prices/mgets?skuIds=J_' + skuid
            yield scrapy.Request(url=pri_url, callback=self.parse_price,
                                 meta={'meta_1': item})

    def parse_price(self, response):
        """Parse the price JSON endpoint and complete the carried item."""
        item = response.meta['meta_1']
        dict_data = json.loads(response.body)
        item['price'] = dict_data[0]['p']
        yield item
# Github: https://github.com/xbhog/scrapyRedis
# Thanks: if this helped you, a star on the repo would be appreciated!