#!/usr/bin/env python
# -*- coding: utf-8 -*-
'''
@Date : 2019/08/20 09:44
@Author :wuyupku
@File : shigeSpider.py
'''
from utils import MySpider, MongoBase
from datetime import date
from lxml import etree
import sys
class shigeSpider(object):
    '''
    Crawl a paginated listing and store one record per listed entry.

    ``download`` fetches a page with ``self.spider.get`` and hands the markup
    to ``parse``; ``parse`` writes one record per entry through
    ``self.db.add_new_row`` and asks ``download`` for the page named by the
    last pager link.  Pagination is followed by a loop inside ``download``
    rather than by mutual recursion, so the call depth does not grow with
    the number of pages.
    '''

    # URL of the page most recently fetched; base for resolving pager links.
    _page_url = None
    # True while download() is running its fetch loop.
    _crawling = False
    # URL requested by parse() while the fetch loop is running.
    _pending = None

    def __init__(self):
        self.db = MongoBase()
        self.spider = MySpider()

    def download(self, url):
        '''
        Fetch ``url`` and then every page that ``parse`` asks for in turn.

        When called while a crawl is already running on this instance (that
        is, from ``parse``), the URL is only queued for the running loop.
        The crawl ends when a page yields no data, when ``parse`` requests no
        further page, or when the requested page was already fetched during
        this crawl.

        :param url: absolute URL of the page to fetch.
        '''
        if self._crawling:
            # Re-entrant call from parse(): let the outer loop fetch it, which
            # keeps the stack flat however many pages there are.
            self._pending = url
            return

        self._crawling = True
        visited = set()
        try:
            while True:
                visited.add(url)
                self._pending = None
                self._page_url = url
                # Host part of the URL; kept as a public attribute as before.
                self.domain = url.split('/')[2]
                data = self.spider.get(url)
                if data:
                    self.parse(data)
                url = self._pending
                # Stop on "no next page" or on a pager link that loops back.
                if url is None or url in visited:
                    break
        finally:
            self._crawling = False
            self._pending = None

    def parse(self, data):
        '''
        Store every entry found in ``data`` and request the next page.

        Each ``div.sons`` row under ``div.left`` becomes one record in the
        ``shigeSpider`` collection with the keys ``title``, ``dynasty``,
        ``author``, ``content``, ``tag`` and ``createTime``; fields missing
        from the markup are stored as empty strings.  If the page has pager
        links, the last one is resolved against the current page URL and
        passed to ``download``.

        :param data: HTML markup of one listing page.
        '''
        try:
            from urllib.parse import urljoin  # Python 3
        except ImportError:
            from urlparse import urljoin  # Python 2

        response = etree.HTML(data)
        for row in response.xpath('//div[@class="left"]/div[@class="sons"]'):
            # Evaluate each XPath once and reuse the node list.
            titles = row.xpath('div[@class="cont"]/p/a/b/text()')
            source = row.xpath('div[@class="cont"]/p[@class="source"]//text()')
            body = row.xpath('div[@class="cont"]/div[@class="contson"]//text()')
            tags = row.xpath('div[@class="tag"]/a/text()')

            title = titles[0] if titles else ''
            # First and last text node of the "source" paragraph.
            dynasty = source[0] if source else ''
            author = source[-1] if source else ''
            content = ''.join(body).replace(' ', '').replace('\n', '')
            tag = ','.join(tags)

            self.db.add_new_row('shigeSpider', {
                'title': title,
                'dynasty': dynasty,
                'author': author,
                'content': content,
                'tag': tag,
                'createTime': str(date.today()),
            })
            # Unicode template: a byte-string template cannot format a
            # non-ASCII unicode title on Python 2.
            print(u'Title: {}'.format(title))

        pager_links = response.xpath('//div[@class="pages"]/a/@href')
        if pager_links:
            # Fall back to the bare host when no page URL has been recorded.
            base = self._page_url or 'http://' + self.domain
            self.download(urljoin(base, pager_links[-1]))
if __name__ == '__main__':
    # Lift the interpreter's recursion ceiling far above its default
    # before the crawl starts.
    sys.setrecursionlimit(100000)
    # Listing page the crawl starts from.
    start_page = 'http://so.gushiwen.org/type.aspx'
    shigeSpider().download(start_page)