创建项目（在终端中依次执行以下命令）:
scrapy startproject jianshu
scrapy genspider -t crawl jianshu_spider "jianshu.com"
jianshu_spider.py
# -*- coding: utf-8 -*-
import scrapy
from scrapy.linkextractors import LinkExtractor
from scrapy.spiders import CrawlSpider, Rule
from jianshu.items import JianshuItem
class JianshuSpiderSpider(CrawlSpider):
    """Crawl jianshu.com and yield one JianshuItem per article detail page."""
    name = 'jianshu_spider'
    allowed_domains = ['jianshu.com']
    start_urls = ['http://jianshu.com/']
    # Article detail URLs look like /p/<12-char id>.  NOTE(review): the
    # original pattern was `[0-9a-z][12]`, which matches one character
    # followed by a literal "1" or "2" -- almost certainly a mangled `{12}`
    # quantifier from the tutorial this is based on; restored here.
    rules = (
        Rule(LinkExtractor(allow=r'.*/p/[0-9a-z]{12}.*'),
             callback='parse_detail', follow=True),
    )

    def parse_detail(self, response):
        """Extract article fields from a detail page and yield a JianshuItem.

        :param response: Scrapy response for an article page (rendered HTML).
        """
        title = response.xpath("//h1[@class='title']/text()").get()
        avatar = response.xpath("//a[@class='avatar']/img/@src").get()
        author = response.xpath("//span[@class='name']/a/text()").get()
        # .get() returns None when the node is missing; guard with `or ""`
        # so .replace() cannot raise AttributeError on a layout change.
        pub_time = (response.xpath(
            "//span[@class='publish-time']/text()").get() or "").replace("*", "")
        # The article id is the last path segment, after stripping any
        # query string from the URL.
        url = response.url
        article_id = url.split("?")[0].split("/")[-1]
        # Keep the content node's full HTML (tags included), not bare text.
        content = response.xpath("//div[@class='show-content']").get()
        word_count = response.xpath("//span[@class='wordage']/text()").get()
        comment_count = response.xpath(
            "//span[@class='comments-count']/text()").get()
        read_count = response.xpath("//span[@class='views-count']/text()").get()
        like_count = response.xpath("//span[@class='likes-count']/text()").get()
        # Collections ("专题") the article belongs to, comma-joined.
        subjects = ",".join(response.xpath(
            "//div[@class='include-collection']/a/div/text()").getall())
        item = JianshuItem(
            title=title,
            avatar=avatar,
            pub_time=pub_time,
            author=author,
            origin_url=response.url,
            content=content,
            article_id=article_id,
            subjects=subjects,
            word_count=word_count,
            comment_count=comment_count,
            like_count=like_count,
            read_count=read_count
        )
        yield item
items.py
import scrapy
class JianshuItem(scrapy.Item):
    """Container for one scraped jianshu.com article."""
    # Identity / location
    article_id = scrapy.Field()
    origin_url = scrapy.Field()
    # Content
    title = scrapy.Field()
    content = scrapy.Field()
    subjects = scrapy.Field()
    # Author
    author = scrapy.Field()
    avatar = scrapy.Field()
    pub_time = scrapy.Field()
    # Statistics
    read_count = scrapy.Field()
    like_count = scrapy.Field()
    word_count = scrapy.Field()
    comment_count = scrapy.Field()
pipelines.py
# -*- coding: utf-8 -*-
# import pymysql
#
# class JianshuPipeline(object):
# def __init__(self):
# dbparams = {
# 'host': '127.0.0.1',
# 'port': 3306,
# 'user': 'root',
# 'password': '123456',
# 'database': 'jianshu',
# 'charset': 'utf8'
# }
# self.conn = pymysql.connect(**dbparams)
# self.cursor = self.conn.cursor()
# self._sql = None
#
# def process_item(self, item, spider):
# self.cursor.execute(self.sql, (item['title'], item['content'],
# item['author'], item['avatar'], item['pub_time'], item['article_id'],
# item['origin_url'],item['like_count'],item['word_count'],item['subjects'],item['comment_count'],item['read_count']))
# self.conn.commit()
# return item
#
# @property
# def sql(self):
# if not self._sql:
# self._sql = """
# insert into article(id,title,content,author,avatar,pub_time,
# article_id,origin_url,like_count,word_count,subjects,comment_count,read_count) values(null,%s,%s,%s,%s,%s,%s,%s,%s,%s,%s,%s,%s)
# """
# return self._sql
# return self._sql
# 采用twisted异步保存到mysql
import pymysql
from twisted.enterprise import adbapi
from pymysql import cursors
class JianshuTwistedPipeline(object):
    """Persist scraped items to MySQL asynchronously via Twisted's adbapi pool."""

    def __init__(self):
        dbparams = {
            'host': '127.0.0.1',
            'port': 3306,
            'user': 'root',
            'password': '123456',
            'database': 'jianshu',
            'charset': 'utf8',
            'cursorclass': cursors.DictCursor
        }
        # adbapi runs each interaction on a thread-pool connection, so
        # inserts do not block the crawler's reactor loop.
        self.dbpool = adbapi.ConnectionPool("pymysql", **dbparams)
        self._sql = None

    @property
    def sql(self):
        """Lazily build and cache the INSERT statement."""
        if not self._sql:
            self._sql = """
            insert into article(id,title,content,author,avatar,pub_time,
            article_id,origin_url,like_count,word_count,subjects,comment_count,read_count) values(null,%s,%s,%s,%s,%s,%s,%s,%s,%s,%s,%s,%s)
            """
        # The original had a second, unreachable `return self._sql` after
        # this one; removed.
        return self._sql

    def process_item(self, item, spider):
        defer = self.dbpool.runInteraction(self.insert_item, item)
        defer.addErrback(self.handle_error, item, spider)
        # BUG FIX: the original returned None, so any later pipeline stage
        # would have received None instead of the item.
        return item

    def insert_item(self, cursor, item):
        # Runs inside the adbapi interaction; commit is handled by the pool.
        cursor.execute(self.sql, (item['title'], item['content'],
            item['author'], item['avatar'], item['pub_time'], item['article_id'],
            item['origin_url'], item['like_count'], item['word_count'],
            item['subjects'], item['comment_count'], item['read_count']))

    def handle_error(self, error, item, spider):
        # Log instead of silently swallowing -- the original `pass` made
        # every database failure invisible.
        spider.logger.error("MySQL insert failed for %s: %s",
                            item.get('article_id'), error)
middlewares.py
# -*- coding: utf-8 -*-
from selenium import webdriver
import time
from scrapy.http.response.html import HtmlResponse
class SeleniumDownloadMiddleware(object):
    """Fetch pages through a real Chrome browser so JS-loaded content is rendered."""

    def __init__(self):
        self.driver = webdriver.Chrome()

    def process_request(self, request, spider):
        """Load the URL in Chrome, expand "show more" sections, and return the
        rendered HTML as an HtmlResponse (Scrapy then skips its own downloader).
        """
        self.driver.get(request.url)
        time.sleep(1)
        # Keep clicking the "show-more" button until it disappears.
        # find_element_* raises when the element is absent, which ends the
        # loop -- the original's `if not showmore: break` was unreachable
        # dead code, since find_element either returns a truthy element or
        # raises.  Narrowed the bare `except:` to `except Exception` so
        # KeyboardInterrupt/SystemExit are not swallowed.
        try:
            while True:
                showmore = self.driver.find_element_by_class_name('show-more')
                showmore.click()
                time.sleep(0.5)
        except Exception:
            pass
        source = self.driver.page_source
        return HtmlResponse(url=self.driver.current_url, body=source,
                            request=request, encoding='utf-8')
settings.py
# Ignore robots.txt (jianshu disallows generic crawlers; tutorial scraper).
ROBOTSTXT_OBEY = False
# One second between requests, to stay polite to the server.
DOWNLOAD_DELAY = 1
# Browser-like default headers so requests are not trivially fingerprinted.
DEFAULT_REQUEST_HEADERS = {
'Accept': 'text/html,application/xhtml+xml,application/xml;q=0.9,*/*;q=0.8',
'Accept-Language': 'en',
'User-Agent': 'Mozilla/5.0 (Windows NT 6.1; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/64.0.3282.140 Safari/537.36',
}
# Route all downloads through the Selenium middleware defined above.
DOWNLOADER_MIDDLEWARES = {
'jianshu.middlewares.SeleniumDownloadMiddleware': 543,
}
ITEM_PIPELINES = {
# 'jianshu.pipelines.JianshuPipeline': 300,
# Async (Twisted) MySQL pipeline.
'jianshu.pipelines.JianshuTwistedPipeline': 1,
}
start.py
# Convenience launcher: equivalent to running `scrapy crawl jianshu_spider`
# from the shell.
from scrapy import cmdline

cmdline.execute(["scrapy", "crawl", "jianshu_spider"])