Multi-threaded crawling of the novel site Quanshuwang (全书网)
1、It can crawl novels from every category on the site.
2、It grabs each novel's cover, author, introduction and the text of every chapter; in other words, the whole structure of the site gets scraped.
3、Originally I planned to download everything, but it turned out I was too naive: a single book's text alone takes 3 to 4 MB of database space, and only after crawling 300-odd books did it dawn on me that the whole site holds at least 100,000+ titles. You can do the math on how much space that needs; my little server couldn't carry it. So I changed course and first crawled only each novel's cover, author, title, introduction and link; with the chapter contents omitted I ended up with just over 153,000 books.
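A rough back-of-the-envelope estimate (assuming about 3.5 MB per book, per the figure above): 150,000 books × 3.5 MB is roughly 500 GB of chapter text alone, far beyond what a small server can hold.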
---------------------
This post has already been published on my CSDN blog; original address:
#!/usr/bin/python
# -*- coding: UTF-8 -*-
import requests
import pymysql
from bs4 import BeautifulSoup
import _thread
import time
import threading

# Fetch a book's detail page, grab its introduction, and update the book record
def getIntroduce(novel_href, id):
    header = {
        'Host': 'www.quanshuwang.com',
        'Upgrade-Insecure-Requests': '1',
        'Connection': 'keep-alive',
        'Accept-Encoding': 'gzip, deflate',
        'Accept-Language': 'zh-CN,zh;q=0.9',
        'Cache-Control': 'max-age=0',
        'Accept': 'text/html,application/xhtml+xml,application/xml;q=0.9,image/webp,image/apng,*/*;q=0.8',
        'User-Agent': 'Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/69.0.3497.100 Safari/537.36'}
    time.sleep(0.2)
    novellist = requests.get(novel_href, headers=header, timeout=20)
    novellist.encoding = 'gbk'
    soup = BeautifulSoup(novellist.text, 'lxml')
    res = soup.select("#waa")
    if len(res) > 0:
        # Book introduction
        introduce = soup.select("#waa")[0].get_text()
        # Link to the "start reading" page, i.e. the chapter list
        chapterHref = soup.select(".reader")[0].get("href")
        print(introduce)
        sql = "UPDATE novel_info SET novel_introduce='%s' WHERE novel_href='%s'" % (introduce, novel_href)
        # Hand the chapter crawling to a separate thread; the UPDATE above is executed there
        te = threading.Thread(target=getChapterList, args=(chapterHref, id, sql))
        te.start()
        # getChapterList(chapterHref, id, sql)

# Fetch the chapter list and every chapter's content, then write them to the database
def getChapterList(h, id, sql):
    db = pymysql.connect("localhost", "root", "123456", "wx_app")
    db.ping(True)
    time.sleep(0.2)
    novellist = requests.get(h, timeout=20)
    novellist.encoding = 'gbk'
    soup = BeautifulSoup(novellist.text, 'lxml')
    list = soup.select(".dirconone > li")
    i = 1
    print("开始输入-> 书ID:%d " % id)
    # Run the pending introduction UPDATE first
    insertNovelInfo(sql, db)
    for chapter in list:
        contHref = chapter.select("a")[0].get("href")
        # Chapter title
        contTitle = chapter.select("a")[0].get_text()
        # content = getContents(contHref)
        res1 = requests.get(contHref, timeout=20)
        res1.encoding = 'gbk'
        soup = BeautifulSoup(res1.text, 'lxml')
        tx = soup.select(".mainContenr")
        if len(tx) > 0:
            # Chapter text, with the site's style5();/style6(); script residue stripped
            content = soup.select(".mainContenr")[0].get_text().lstrip('style5();').rstrip('style6();')
        else:
            content = h
        print("章节:%s" % (contTitle))
        sql1 = "INSERT INTO `novel_chapter`(novel_id,chapter_id,chapter_name) VALUES(%d,%d,'%s')" % (id, i, contTitle)
        sql2 = "INSERT INTO `novel_chapter_info`(chapter_id,chapter_name,chapter_content,novel_id) VALUES(%d,'%s','%s',%d)" % (i, contTitle, content, id)
        insertNovelInfo(sql1, db)
        insertNovelInfo(sql2, db)
        # Increment after both inserts so chapter_id matches across the two tables
        i = i + 1
    print("文件%s输入完成" % id)
    db.commit()
    db.close()

# Fetch a single chapter page and return its text (unused helper; the loop above inlines this)
def getContents(h):
    res = requests.get(h, timeout=20)
    res.encoding = 'gbk'
    soup = BeautifulSoup(res.text, 'lxml')
    tx = soup.select(".mainContenr")
    if len(tx) > 0:
        content = soup.select(".mainContenr")[0].get_text().lstrip('style5();').rstrip('style6();')
    else:
        content = h
    return content

# Execute one SQL statement on the given connection, rolling back on failure
def insertNovelInfo(sql, db):
    cursor = db.cursor()
    try:
        cursor.execute(sql)
    except:
        # Roll back the failed statement and log it
        db.rollback()
        print("mysql错误:", sql)
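
# Note: the SQL above is assembled with % string formatting, so any title or introduction
# containing a single quote breaks the statement (which is why the rollback branch fires).
# A minimal sketch of a safer variant, not in the original script, that lets pymysql bind
# the parameters itself:
def insertNovelInfoSafe(sql, params, db):
    cursor = db.cursor()
    try:
        # e.g. insertNovelInfoSafe("UPDATE novel_info SET novel_introduce=%s WHERE novel_href=%s",
        #                          (introduce, novel_href), db)
        cursor.execute(sql, params)
    except:
        db.rollback()
        print("mysql错误:", sql)
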
# getIntroduce('http://www.quanshuwang.com/book_135083.html')
def test(i):
    print(i)

# Worker: crawl the books at row offsets [count, num) of novel_info
def init(count, num):
    # count = 0
    while count < num:
        sql = "select a.novel_href,a.novel_id from novel_info a inner join (select novel_id from novel_info GROUP BY novel_id limit %d,1) b on a.novel_id=b.novel_id " % count
        # Open the database connection
        db = pymysql.connect("localhost", "root", "123456", "wx_app")
        db.ping(True)
        # Get a cursor for the query
        cursor = db.cursor()
        try:
            # Execute the SQL statement
            cursor.execute(sql)
            # Fetch all matching rows
            results = cursor.fetchall()
            # Close the database connection
            db.close()
        except:
            print("Error: unable to fetch data")
        for row in results:
            getIntroduce(row[0], row[1])
            print(row[0], row[1])
        count = count + 1

# Another test helper (shadows the test() above); just prints the value a few times
def test(res):
    i = 0
    while i < 10:  # assumed bound; the original value is unclear
        print(res)
        i = i + 1

try:
    threads = []
    # Spawn the worker threads in a loop
    for i in range(0, 100):
        # How many books this thread handles (here: a single offset per thread)
        j = i + 1
        t = threading.Thread(target=init, args=(i, j))
        i = j
        threads.append(t)
        t.start()
    # Wait for every thread to finish
    for t in threads:
        t.join()
    print("end")
except:
    print("Error: 无法启动线程")