首页
学习
活动
专区
工具
TVP
发布
精选内容/技术社群/优惠产品,尽在小程序
立即前往

利用Python采集起点中文网小说,并解决字体反爬的问题

个人比较喜欢看小说,于是想利用Python爬取小说网站——起点中文网。在编写爬虫、定位页面数据的过程中遇到了字体反爬(页面上的数字通过自定义字体文件进行了混淆),咨询了身边的前端大神之后找到了解决办法,下面说一下具体方法。

当前页面接口返回的html源码

当前页面接口返回的html源码

第一步:获取当前页面的字体文件链接,可以通过正则获取

from pyquery import PyQuery as pq

start_url = 'https://www.qidian.com/finish?action=hidden&orderId=&style=1&pageSize=20&siteid=1&pubflag=0&hiddenField=2&page=1'

# Fetch the HTML of the current listing page.
response = requests.get(start_url).text

# The regex below needs the text that holds the font link. Take it from the
# <style> element matched by the same selector get_index uses, so that
# `fonturl` is actually defined in this snippet.
fonturl = pq(response)('p.update > span > style').text()

# Capture the first single-quoted string that follows "woff ... url" and
# precedes "truetype": that is the font file link.
url = re.search('woff.*?url.*?\'(.+?)\'.*?truetype',fonturl) .group(1)

第二步:通过fontTools模块获取当前字体映射关系

def get_font(url, timeout=10):
    """Download a font file and return its best character map.

    Args:
        url: Address of the font file to download.
        timeout: Seconds to wait for the server, passed to ``requests.get``.

    Returns:
        The result of ``TTFont.getBestCmap()``. Callers in this file index it
        with an integer code and expect a glyph name such as ``'one'``.

    Raises:
        requests.HTTPError: If the server answers with an error status.
    """
    response = requests.get(url, timeout=timeout)
    # Fail loudly on 4xx/5xx instead of feeding an error page to the parser.
    response.raise_for_status()

    font = TTFont(BytesIO(response.content))
    try:
        return font.getBestCmap()
    finally:
        # Close the font even when cmap extraction raises.
        font.close()

第三步:通过当前映射关系,可以把被混淆的数据转换成对应的英文名称(如 one、two),然后创建一个dict,把英文名称再转换为数字

def get_encode(cmap, values):
    """Translate a run of obfuscated character codes into a digit string.

    Args:
        cmap: Mapping from integer code to glyph name (``'zero'`` ...
            ``'nine'`` or ``'period'``).
        values: ``;``-separated entries. The first two characters of each
            entry are dropped (presumably the ``&#`` of an HTML entity) and
            the rest is parsed as a decimal integer.

    Returns:
        The decoded text, e.g. ``'12.5'``. Empty entries are ignored, so a
        trailing ``;`` or an empty ``values`` string no longer raises.

    Raises:
        KeyError: If a code is missing from ``cmap`` or its glyph name is not
            a digit or ``'period'``.
        ValueError: If a non-empty entry is not a valid integer after the
            two-character prefix is removed.
    """
    WORD_MAP = {'zero':'0','one':'1','two':'2','three':'3','four':'4','five':'5','six':'6','seven':'7','eight':'8','nine':'9','period':'.'}

    glyph_names = (
        cmap[int(entry[2:])]
        for entry in values.split(';')
        if entry  # a trailing ';' leaves an empty tail that int() cannot parse
    )
    return ''.join(WORD_MAP[name] for name in glyph_names)

第四步:然后就是通过pyquery进行数据提取

def get_index(start_url):
    """Scrape one listing page and store every book found on it.

    Downloads ``start_url``, recovers the numbers hidden behind the page's
    custom font, and passes one dict per ``.all-img-list li`` element to
    ``mongo``.

    Args:
        start_url: URL of the listing page to fetch.

    Raises:
        AttributeError: If no font link is found in the style text
            (``re.search`` returns ``None``).
        IndexError: If the page has more book entries than obfuscated numbers.
    """
    # Fetch the HTML of the current page.
    response = requests.get(start_url).text
    doc = pq(response)

    # Class name carried by the spans that hold the obfuscated numbers.
    classattr = doc('p.update > span > span').attr('class')
    # The format string needs a %s placeholder. Without one the % operator
    # raises TypeError. Match the whole span and capture its contents.
    pattern = '<span class="%s">(.*?)</span>' % classattr

    # Every obfuscated number on the page, in document order.
    numberlist = re.findall(pattern, response)

    # Text of the <style> element that contains the font file link.
    fonturl = doc('p.update > span > style').text()

    # Pull the font file link out of the style text.
    url = re.search('woff.*?url.*?\'(.+?)\'.*?truetype',fonturl) .group(1)
    cmap = get_font(url)

    books = doc('.all-img-list li').items()
    for i, book in enumerate(books):
        item = {}
        item['img'] = 'http:' + book('.book-img-box a img').attr('src')
        item['bookname'] = book('.book-mid-info h4 a').text()
        item['author'] = book('.name').text()
        item['classes'] = book('p.author > a:nth-child(4)').text()
        item['content'] = book('.intro').text()
        # Drop the final character of the captured text before decoding
        # (presumably the closing ';' of the last entity).
        item['number'] = get_encode(cmap,numberlist[i][:-1])
        # Hand the record to storage, as the complete listing does.
        mongo(item)

第五步:将数据存入mongodb

# Create a MongoDB client for the host 127.0.0.1.
client = pymongo.MongoClient('127.0.0.1')

# Select the "qidian" database on that client.
db = client.qidian

# Handle to the "finish" collection; mongo() inserts into it.
p = db.finish

def mongo(item):
    """Insert one scraped record into the collection bound to ``p``.

    Args:
        item: Dict describing a single book.
    """
    # Collection.insert() was deprecated in PyMongo 3 and removed in
    # PyMongo 4. insert_one() is the supported single-document call.
    p.insert_one(item)

附当前Python爬虫文件源码

#coding=utf-8

import requests,json,time,re

from requests.exceptions import RequestException

from pyquery import PyQuery as pq

from fontTools.ttLib import TTFont

from io import BytesIO

import pymongo

# Create a MongoDB client for the host 127.0.0.1.
client = pymongo.MongoClient('127.0.0.1')

# Select the "qidian" database on that client.
db = client.qidian

# Handle to the "finish" collection; mongo() inserts into it.
p = db.finish

# Listing URL with the page number left off; main() appends it.
start_url = 'https://www.qidian.com/finish?action=hidden&orderId=&style=1&pageSize=20&siteid=1&pubflag=0&hiddenField=2&page='

def get_font(url, timeout=10):
    """Download a font file and return its best character map.

    Args:
        url: Address of the font file to download.
        timeout: Seconds to wait for the server, passed to ``requests.get``.

    Returns:
        The result of ``TTFont.getBestCmap()``. Callers in this file index it
        with an integer code and expect a glyph name such as ``'one'``.

    Raises:
        requests.HTTPError: If the server answers with an error status.
    """
    response = requests.get(url, timeout=timeout)
    # Fail loudly on 4xx/5xx instead of feeding an error page to the parser.
    response.raise_for_status()

    font = TTFont(BytesIO(response.content))
    try:
        return font.getBestCmap()
    finally:
        # Close the font even when cmap extraction raises.
        font.close()

def get_encode(cmap, values):
    """Translate a run of obfuscated character codes into a digit string.

    Args:
        cmap: Mapping from integer code to glyph name (``'zero'`` ...
            ``'nine'`` or ``'period'``).
        values: ``;``-separated entries. The first two characters of each
            entry are dropped (presumably the ``&#`` of an HTML entity) and
            the rest is parsed as a decimal integer.

    Returns:
        The decoded text, e.g. ``'12.5'``. Empty entries are ignored, so a
        trailing ``;`` or an empty ``values`` string no longer raises.

    Raises:
        KeyError: If a code is missing from ``cmap`` or its glyph name is not
            a digit or ``'period'``.
        ValueError: If a non-empty entry is not a valid integer after the
            two-character prefix is removed.
    """
    WORD_MAP = {'zero':'0','one':'1','two':'2','three':'3','four':'4','five':'5','six':'6','seven':'7','eight':'8','nine':'9','period':'.'}

    glyph_names = (
        cmap[int(entry[2:])]
        for entry in values.split(';')
        if entry  # a trailing ';' leaves an empty tail that int() cannot parse
    )
    return ''.join(WORD_MAP[name] for name in glyph_names)

def get_index(start_url):
    """Scrape one listing page and store every book found on it.

    Downloads ``start_url``, recovers the numbers hidden behind the page's
    custom font, and passes one dict per ``.all-img-list li`` element to
    ``mongo``.

    Args:
        start_url: URL of the listing page to fetch.

    Raises:
        AttributeError: If no font link is found in the style text
            (``re.search`` returns ``None``).
        IndexError: If the page has more book entries than obfuscated numbers.
    """
    # Fetch the HTML of the current page.
    response = requests.get(start_url).text
    doc = pq(response)

    # Class name carried by the spans that hold the obfuscated numbers.
    classattr = doc('p.update > span > span').attr('class')
    # The format string needs a %s placeholder. Without one the % operator
    # raises TypeError. Match the whole span and capture its contents.
    pattern = '<span class="%s">(.*?)</span>' % classattr

    # Every obfuscated number on the page, in document order.
    numberlist = re.findall(pattern, response)

    # Text of the <style> element that contains the font file link.
    fonturl = doc('p.update > span > style').text()

    # Pull the font file link out of the style text.
    url = re.search('woff.*?url.*?\'(.+?)\'.*?truetype',fonturl) .group(1)
    cmap = get_font(url)

    books = doc('.all-img-list li').items()
    for i, book in enumerate(books):
        item = {}
        item['img'] = 'http:' + book('.book-img-box a img').attr('src')
        item['bookname'] = book('.book-mid-info h4 a').text()
        item['author'] = book('.name').text()
        item['classes'] = book('p.author > a:nth-child(4)').text()
        item['content'] = book('.intro').text()
        # Drop the final character of the captured text before decoding
        # (presumably the closing ';' of the last entity).
        item['number'] = get_encode(cmap,numberlist[i][:-1])
        mongo(item)

def mongo(item):
    """Insert one scraped record into the collection bound to ``p``.

    Args:
        item: Dict describing a single book.
    """
    # Collection.insert() was deprecated in PyMongo 3 and removed in
    # PyMongo 4. insert_one() is the supported single-document call.
    p.insert_one(item)

def main(first_page=1, last_page=999):
    """Crawl a range of listing pages, one ``get_index`` call per page.

    Args:
        first_page: Number of the first page to fetch.
        last_page: Number of the last page to fetch (inclusive). The defaults
            reproduce the original fixed range of pages 1 through 999.
    """
    for page in range(first_page, last_page + 1):
        # start_url ends in "page=", so appending the number completes it.
        get_index(start_url + str(page))


# Start the crawl only when this file is run as a script.
if __name__ == '__main__':
    main()

针对月票榜月票数字体反爬修改

def get_index(start_url):
    """Scrape one ranking page whose ticket counts use an obfuscating font.

    Downloads ``start_url``, reads the font-family name from the page's style
    text, fetches the matching ``.woff`` file, decodes the obfuscated numbers
    and passes one dict per ``.book-img-text li`` element to ``mongo``.

    Args:
        start_url: URL of the ranking page to fetch.

    Raises:
        AttributeError: If the style text holds no ``font-family`` entry
            (``re.search`` returns ``None``).
        IndexError: If the page has more book entries than obfuscated numbers.
    """
    # Silence urllib3's InsecureRequestWarning.
    # NOTE(review): the request below uses default certificate verification,
    # so this only matters if verification is turned off. Also confirm that
    # urllib3 is imported by the surrounding module.
    urllib3.disable_warnings(urllib3.exceptions.InsecureRequestWarning)

    # Fetch the HTML of the current page.
    response = requests.get(start_url).text
    doc = pq(response)

    # Text of the <style> element that declares the obfuscating font.
    fonturl = doc('div.total > p > span > style').text()

    # The font-family name is also the font's file name on the font host.
    addr = re.search('font-family: (.+?); src', fonturl).group(1)
    url = 'https://qidian.gtimg.com/qd_anti_spider/{addr}.woff'.format(addr=addr)
    cmap = get_font(url)
    print(cmap)

    # The format string needs a %s placeholder. Without one the % operator
    # raises TypeError. Match the span whose class is the font name and
    # capture its contents.
    pattern = '<span class="%s">(.*?)</span>' % addr

    # Every obfuscated number on the page, in document order.
    numberlist = re.findall(pattern, response)
    print('numberlist: ', numberlist)

    books = doc('.book-img-text li').items()
    i = 0
    print('i: ', i)
    for i, book in enumerate(books):
        item = {}
        item['img'] = 'http:' + book('.book-img-box a img').attr('src')
        item['bookname'] = book('.book-mid-info h4 a').text()
        item['author'] = book('.name').text()
        item['classes'] = book('p.author > a:nth-child(4)').text()
        item['content'] = book('.intro').text()
        # Drop the final character of the captured text before decoding
        # (presumably the closing ';' of the last entity).
        item['number'] = get_encode(cmap, numberlist[i][:-1])
        item['font_url'] = url
        mongo(item)

  • 发表于:
  • 原文链接https://kuaibao.qq.com/s/20200509A0FKL300?refer=cp_1026
  • 腾讯「腾讯云开发者社区」是腾讯内容开放平台帐号(企鹅号)传播渠道之一,根据《腾讯内容开放平台服务协议》转载发布内容。
  • 如有侵权,请联系 cloudcommunity@tencent.com 删除。

扫码

添加站长 进交流群

领取专属 10元无门槛券

私享最新 技术干货

扫码加入开发者社群
领券