Multi-threaded crawling of the novel site Quanshuwang (全书网)
1、It can crawl novels from every category on the site.
2、It grabs each novel's cover, author, introduction and the text of every chapter; in other words, the whole structure of the site gets scraped.
3、Originally I planned to download everything, but it turned out I was too naive: a single book's text alone takes 3 to 4 MB of database space, and only after crawling 300-odd books did it dawn on me that the whole site holds at least 100,000+ titles. You can do the math on how much space that needs; my little server couldn't carry it. So I changed course and first crawled only each novel's cover, author, title, introduction and link; with the chapter contents omitted I ended up with just over 153,000 books.
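A rough back-of-the-envelope estimate (assuming about 3.5 MB per book, per the figure above): 150,000 books × 3.5 MB is roughly 500 GB of chapter text alone, far beyond what a small server can hold.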
---------------------
This post has already been published on my CSDN blog; original address:
#!/usr/bin/python
# -*- coding: UTF-8 -*-
import requests
import pymysql
from bs4 import BeautifulSoup
import _thread
import time
import threading

# Fetch a book's detail page, grab its introduction, and update the book record
def getIntroduce(novel_href, id):
    header = {
        'Host': 'www.quanshuwang.com',
        'Upgrade-Insecure-Requests': '1',
        'Connection': 'keep-alive',
        'Accept-Encoding': 'gzip, deflate',
        'Accept-Language': 'zh-CN,zh;q=0.9',
        'Cache-Control': 'max-age=0',
        'Accept': 'text/html,application/xhtml+xml,application/xml;q=0.9,image/webp,image/apng,*/*;q=0.8',
        'User-Agent': 'Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/69.0.3497.100 Safari/537.36'}
    time.sleep(0.2)
    novellist = requests.get(novel_href, headers=header, timeout=20)
    novellist.encoding = 'gbk'
    soup = BeautifulSoup(novellist.text, 'lxml')
    res = soup.select("#waa")
    if len(res) > 0:
        # Book introduction
        introduce = soup.select("#waa")[0].get_text()
        # Link to the "start reading" page, i.e. the chapter list
        chapterHref = soup.select(".reader")[0].get("href")
        print(introduce)
        sql = "UPDATE novel_info SET novel_introduce='%s' WHERE novel_href='%s'" % (introduce, novel_href)
        # Hand the chapter crawling to a separate thread; the UPDATE above is executed there
        te = threading.Thread(target=getChapterList, args=(chapterHref, id, sql))
        te.start()
        # getChapterList(chapterHref, id, sql)

# Fetch the chapter list and every chapter's content, then write them to the database
def getChapterList(h, id, sql):
    db = pymysql.connect("localhost", "root", "123456", "wx_app")
    db.ping(True)
    time.sleep(0.2)
    novellist = requests.get(h, timeout=20)
    novellist.encoding = 'gbk'
    soup = BeautifulSoup(novellist.text, 'lxml')
    list = soup.select(".dirconone > li")
    i = 1
    print("开始输入-> 书ID:%d " % id)
    # Run the pending introduction UPDATE first
    insertNovelInfo(sql, db)
    for chapter in list:
        contHref = chapter.select("a")[0].get("href")
        # Chapter title
        contTitle = chapter.select("a")[0].get_text()
        # content = getContents(contHref)
        res1 = requests.get(contHref, timeout=20)
        res1.encoding = 'gbk'
        soup = BeautifulSoup(res1.text, 'lxml')
        tx = soup.select(".mainContenr")
        if len(tx) > 0:
            # Chapter text, with the site's style5();/style6(); script residue stripped
            content = soup.select(".mainContenr")[0].get_text().lstrip('style5();').rstrip('style6();')
        else:
            content = h
        print("章节:%s" % (contTitle))
        sql1 = "INSERT INTO `novel_chapter`(novel_id,chapter_id,chapter_name) VALUES(%d,%d,'%s')" % (id, i, contTitle)
        sql2 = "INSERT INTO `novel_chapter_info`(chapter_id,chapter_name,chapter_content,novel_id) VALUES(%d,'%s','%s',%d)" % (i, contTitle, content, id)
        insertNovelInfo(sql1, db)
        insertNovelInfo(sql2, db)
        # Increment after both inserts so chapter_id matches across the two tables
        i = i + 1
    print("文件%s输入完成" % id)
    db.commit()
    db.close()

# Fetch a single chapter page and return its text (unused helper; the loop above inlines this)
def getContents(h):
    res = requests.get(h, timeout=20)
    res.encoding = 'gbk'
    soup = BeautifulSoup(res.text, 'lxml')
    tx = soup.select(".mainContenr")
    if len(tx) > 0:
        content = soup.select(".mainContenr")[0].get_text().lstrip('style5();').rstrip('style6();')
    else:
        content = h
    return content

# Execute one SQL statement on the given connection, rolling back on failure
def insertNovelInfo(sql, db):
    cursor = db.cursor()
    try:
        cursor.execute(sql)
    except:
        # Roll back the failed statement and log it
        db.rollback()
        print("mysql错误:", sql)
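
# Note: the SQL above is assembled with % string formatting, so any title or introduction
# containing a single quote breaks the statement (which is why the rollback branch fires).
# A minimal sketch of a safer variant, not in the original script, that lets pymysql bind
# the parameters itself:
def insertNovelInfoSafe(sql, params, db):
    cursor = db.cursor()
    try:
        # e.g. insertNovelInfoSafe("UPDATE novel_info SET novel_introduce=%s WHERE novel_href=%s",
        #                          (introduce, novel_href), db)
        cursor.execute(sql, params)
    except:
        db.rollback()
        print("mysql错误:", sql)
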
# getIntroduce('http://www.quanshuwang.com/book_135083.html')
def test(i):
    print(i)

# Worker: crawl the books at row offsets [count, num) of novel_info
def init(count, num):
    # count = 0
    while count < num:
        sql = "select a.novel_href,a.novel_id from novel_info a inner join (select novel_id from novel_info GROUP BY novel_id limit %d,1) b on a.novel_id=b.novel_id " % count
        # Open the database connection
        db = pymysql.connect("localhost", "root", "123456", "wx_app")
        db.ping(True)
        # Get a cursor for the query
        cursor = db.cursor()
        try:
            # Execute the SQL statement
            cursor.execute(sql)
            # Fetch all matching rows
            results = cursor.fetchall()
            # Close the database connection
            db.close()
        except:
            print("Error: unable to fetch data")
        for row in results:
            getIntroduce(row[0], row[1])
            print(row[0], row[1])
        count = count + 1

# Another test helper (shadows the test() above); just prints the value a few times
def test(res):
    i = 0
    while i < 10:  # assumed bound; the original value is unclear
        print(res)
        i = i + 1

try:
    threads = []
    # Spawn the worker threads in a loop
    for i in range(0, 100):
        # How many books this thread handles (here: a single offset per thread)
        j = i + 1
        t = threading.Thread(target=init, args=(i, j))
        i = j
        threads.append(t)
        t.start()
    # Wait for every thread to finish
    for t in threads:
        t.join()
    print("end")
except:
    print("Error: 无法启动线程")