Create a new project
scrapy startproject tutorial
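This generates a project skeleton roughly like the following (the exact file list varies slightly across Scrapy versions):

tutorial/
    scrapy.cfg            # deploy configuration
    tutorial/             # the project's Python package
        __init__.py
        items.py
        middlewares.py
        pipelines.py
        settings.py
        spiders/
            __init__.py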
Enter the tutorial directory and create quotes_spider.py under the spiders directory:
# -*- coding: utf-8 -*-
import scrapy

from ..items import QuotesItem


class QuotesSpider(scrapy.Spider):
    name = "quotes"
    # Must be allowed_domains (a list), not allowed_domain, or Scrapy ignores it.
    allowed_domains = ["toscrape.com"]

    def start_requests(self):
        # Crawl page 1 only; widen the range to fetch more pages.
        for i in range(1, 2):
            url = "http://quotes.toscrape.com/page/" + str(i) + "/"
            yield scrapy.Request(url=url, callback=self.parse)

    def parse(self, response):
        for quote in response.css('div.quote'):
            # Create a fresh item per quote; reusing one instance would
            # overwrite fields of items already yielded.
            item = QuotesItem()
            item['text'] = quote.css('span.text::text').get()
            item['author'] = quote.css('small.author::text').get()
            item['tags'] = quote.css('div.tags a.tag::text').getall()
            yield item
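As an aside, when the start pages are a fixed list, a shorter equivalent sketch uses the start_urls class attribute instead of overriding start_requests; Scrapy then builds the initial requests and routes every response to parse automatically:

class QuotesSpider(scrapy.Spider):
    name = "quotes"
    allowed_domains = ["toscrape.com"]
    start_urls = ["http://quotes.toscrape.com/page/1/"]
    # parse(self, response) stays exactly as in the version above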
Open items.py; the code is as follows:
# -*- coding: utf-8 -*-
# Define here the models for your scraped items
#
# See documentation in:
# https://doc.scrapy.org/en/latest/topics/items.html
import scrapy


class TutorialItem(scrapy.Item):
    # define the fields for your item here like:
    # name = scrapy.Field()
    pass


class QuotesItem(scrapy.Item):
    text = scrapy.Field()
    author = scrapy.Field()
    tags = scrapy.Field()
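Item subclasses behave like dictionaries with a fixed set of keys, which is exactly how the spider and pipelines use them; a quick illustration (values here are hypothetical):

item = QuotesItem()
item['text'] = '“An example quote”'
print(item['text'])    # “An example quote”
print(dict(item))      # {'text': '“An example quote”'}
# item['other'] = 1    # would raise KeyError: only declared fields are allowed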
Open pipelines.py and set up the data cleaning:
# -*- coding: utf-8 -*-
# Define your item pipelines here
#
# Don't forget to add your pipeline to the ITEM_PIPELINES setting
# See: https://doc.scrapy.org/en/latest/topics/item-pipeline.html
class TutorialPipeline(object):
    def process_item(self, item, spider):
        return item


class QuotesPipeline(object):
    def process_item(self, item, spider):
        # The site wraps each quote's text in curly quotation marks (“…”);
        # slice them off. The author name needs no such trimming.
        if item.get('text'):
            item['text'] = item['text'][1:-1]
        if item.get('author'):
            item['author'] = item['author'].strip()
        return item
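A quick sketch of what the [1:-1] slice does to a scraped value (sample string assumed):

raw = '“The world as we have created it is a process of our thinking.”'
print(raw[1:-1])  # The world as we have created it is a process of our thinking.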
Open settings.py and enable the pipelines; the numbers (0-1000) set the order in which they run, lower first. FEED_EXPORT_ENCODING keeps exported files as readable UTF-8 instead of ASCII escapes.
ITEM_PIPELINES = {
    'tutorial.pipelines.TutorialPipeline': 300,
    'tutorial.pipelines.QuotesPipeline': 500,
}
FEED_EXPORT_ENCODING = 'utf-8'
Go to the command line and run the spider; -o exports the scraped items to a JSON Lines file:
scrapy crawl quotes -o quotes.jl
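Each line of quotes.jl should then look roughly like this (depending on the page crawled):

{"text": "The world as we have created it is a process of our thinking. It cannot be changed without changing our thinking.", "author": "Albert Einstein", "tags": ["change", "deep-thoughts", "thinking", "world"]}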
If you want to write the items out yourself instead of using -o, a custom pipeline can do it. This one appends each item to logs.json as one JSON object per line (note the shutdown hook must be named close_spider, not spider_closed, or Scrapy never calls it):

import codecs
import json


class JsonPipeline(object):
    def __init__(self):
        # codecs.open guarantees the file is written as UTF-8.
        self.file = codecs.open('logs.json', 'w', encoding='utf-8')

    def process_item(self, item, spider):
        # ensure_ascii=False keeps non-ASCII text readable in the output.
        line = json.dumps(dict(item), ensure_ascii=False) + "\n"
        self.file.write(line)
        return item

    def close_spider(self, spider):
        # Called automatically when the spider finishes.
        self.file.close()
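As the comment at the top of pipelines.py says, the pipeline only runs once it is registered in ITEM_PIPELINES; assuming it lives in tutorial/pipelines.py, the entry would be:

ITEM_PIPELINES = {
    'tutorial.pipelines.TutorialPipeline': 300,
    'tutorial.pipelines.QuotesPipeline': 500,
    'tutorial.pipelines.JsonPipeline': 800,
}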