Python 系列文章 —— crawlerdb

原创

玩转编程

发布于 2022-01-15 23:01:31

1780

发布于 2022-01-15 23:01:31

文章被收录于专栏：玩转编程

crawlerdb

import mysql.connector
import pymysql

from pyspider.result import ResultWorker


class crawlerdb:
    """Minimal MySQL helper that stores crawled articles in the ``info`` table.

    A connection and a cursor are opened when the object is created and are
    kept on the instance as ``conn`` and ``cursor``.
    """

    # Class-level placeholders; every instance overwrites them in __init__.
    conn = None
    cursor = None

    def __init__(self, host="127.0.0.1", user="root", password="12345678",
                 database="crawler", charset="utf8mb4"):
        """Open the MySQL connection and create a cursor.

        The defaults reproduce the previously hard-coded connection settings,
        so ``crawlerdb()`` keeps working unchanged.

        Args:
            host: MySQL server host.
            user: MySQL user name.
            password: Password for ``user``.
            database: Schema that contains the ``info`` table.
            charset: Connection character set; ``utf8mb4`` so that the
                Chinese article text can be encoded.

        Raises:
            pymysql.Error: If the connection cannot be established.
        """
        # Keyword arguments: PyMySQL 1.0 removed positional connect() args.
        self.conn = pymysql.connect(
            host=host,
            user=user,
            password=password,
            database=database,
            charset=charset,
        )
        self.cursor = self.conn.cursor()

    def insert(self, _result):
        """Insert one crawled article into the ``info`` table.

        Args:
            _result: Mapping read with ``.get()`` for the keys ``title``,
                ``body``, ``editorial`` and ``ctime``. Missing keys are
                passed to the driver as ``None``.

        Returns:
            True when the row was written and committed, False when the
            database reported an error (the transaction is rolled back).
        """
        # Placeholders let the driver quote every value, including the
        # scraped ``editorial`` and ``ctime`` fields, so quotes in the page
        # text can neither break the statement nor inject SQL.
        sql = (
            "insert into info(title,body,editorial,ctime) "
            "VALUES(%s,%s,%s,%s)"
        )
        params = (
            _result.get('title'),
            _result.get('body'),
            _result.get('editorial'),
            _result.get('ctime'),
        )

        try:
            self.cursor.execute(sql, params)
            self.conn.commit()
        except pymysql.Error:
            # The connection is PyMySQL, so its own error hierarchy is the
            # one that can actually be raised here.
            try:
                self.conn.rollback()
            except pymysql.Error:
                # The connection itself may be gone; nothing left to undo.
                pass
            print('插入失败')
            return False
        return True

pyspider

import re

from pyspider.libs.base_handler import *


class Handler(BaseHandler):
    """pyspider handler that crawls chinashina.com news and stores each article.

    Flow: ``on_start`` -> ``index_page`` -> ``list_page`` -> ``detail_page``;
    every non-empty result is then handed to ``on_result`` for storage.
    """

    crawl_config = {
    }

    @every(minutes=5 * 60)
    def on_start(self):
        """Schedule the news index page (decorated to run every 300 minutes)."""
        self.crawl('http://www.chinashina.com/rexinwen/', callback=self.index_page)

    @config(age=10 * 24 * 60 * 60)
    def index_page(self, response):
        """Queue every paginated list page linked from the index page."""
        for each in response.doc('a[href^="http"]').items():
            # Raw string: "\d" and "\." are regex escapes, not string escapes.
            if re.match(r".*list_32_\d+\.html", each.attr.href, re.U):
                self.crawl(each.attr.href, callback=self.list_page)
        # Also crawl the first list page explicitly. The URL must not carry
        # leading whitespace, so the stray space before "http" was removed.
        self.crawl("http://www.chinashina.com/rexinwen/list_32_1.html", callback=self.list_page)

    @config(age=10 * 24 * 60 * 60)
    def list_page(self, response):
        """Queue every article detail link found on a list page."""
        for each in response.doc('a[href^="http"]').items():
            if re.match(r".*plus/view.php\?aid=\d+", each.attr.href, re.U):
                self.crawl(each.attr.href, callback=self.detail_page)

    @config(priority=2)
    def detail_page(self, response):
        """Extract title, publish time, source and body from an article page.

        Returns:
            dict with the keys ``title``, ``ctime``, ``editorial`` and ``body``.

        Raises:
            IndexError: If the metadata line has fewer than three segments.
        """
        other = response.doc('html > body > .clearfix > .main_lt > div > .news_about > p').text()
        # The metadata line is split on runs of five non-breaking spaces.
        # NOTE(review): assumes the first segment holds the source and the
        # third one the time - confirm against the live page layout.
        source = other.split('\xa0\xa0\xa0\xa0\xa0')
        ctime = source[2].replace('时间：', '')
        # Keep only the text after the last full-width colon.
        editorial = source[0].split("：")[-1].strip()
        return {
            "title": response.doc('.news_title').text(),
            "ctime": ctime,
            "editorial": editorial,
            "body": response.doc('html > body > .clearfix > .main_lt > div > .news_txt').text()
        }

    def on_result(self, result):
        """Persist a crawled article; empty results are ignored.

        ``crawlerdb`` must be available in this module's namespace.
        """
        if not result:
            return
        db = crawlerdb()
        try:
            db.insert(result)
        finally:
            # A new connection is opened per result, so release it here
            # instead of leaving it to garbage collection.
            db.cursor.close()
            db.conn.close()

原创声明：本文系作者授权腾讯云开发者社区发表，未经许可，不得转载。

如有侵权，请联系 cloudcommunity@tencent.com 删除。

python

原创声明：本文系作者授权腾讯云开发者社区发表，未经许可，不得转载。

如有侵权，请联系 cloudcommunity@tencent.com 删除。

python

登录后参与评论

0 条评论

热度

Python 系列文章 —— crawlerdb

Python 系列文章 —— crawlerdb

社区

活动

资源

关于

腾讯云开发者

热门产品

热门推荐

更多推荐