pycharm + chromedriver + bs4 + re + threading + queue: a multithreaded novel crawler with simulated login and search
Libraries are normally installed with pip install xxx at the cmd prompt; the author uses the PyCharm interpreter instead, where you can search for a library by name and install it from the IDE, which is generally what I'd recommend for writing Python crawlers. As for chromedriver, its version has to match your installed Chrome (the mapping tables are easy to find online); download it, put it wherever you like, and remember to point the path in the code below at that location.
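If you want to confirm the chromedriver/Chrome pairing before running the full script, a quick smoke test like the following is enough. This is just a sketch using the same selenium 3-era API as the code below; the path is whatever location you installed chromedriver to:

from selenium import webdriver
from selenium.webdriver.chrome.options import Options

opts = Options()
opts.add_argument('--headless')      # no visible browser window
opts.add_argument('--disable-gpu')

# adjust to wherever you put chromedriver
driver = webdriver.Chrome(executable_path=r'E:\py\chromedriver\chromedriver.exe',
                          chrome_options=opts)
driver.get('http://www.biquge.tv/')
print(driver.title)                  # a printed title means the pairing works
driver.quit()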
① First, the script performs a simulated search on http://www.biquge.tv/ with the novel title you typed in. If several candidates match, it prints a selection table (if the search returns exactly one novel, this selection step is skipped and the script moves straight on to the next step).
pattern1 = re.compile(r'<td class="odd"><a href="(.*?)">(.*?)</a>', re.S)
contents1 = re.findall(pattern1, driver.page_source)
pattern2 = re.compile(r'<td class="odd">(.*?)</td>', re.S)
contents2 = re.findall(pattern2, driver.page_source)
if len(contents2) and len(contents1):
    URLlist = []
    namelist = []
    authorlist = []
    for content in contents1:
        URLlist.append(content[0])
        namelist.append(content[1])
    # every other "odd" cell holds an author name; toggle a flag to pick them out
    flag = False
    for content in contents2:
        if flag:
            authorlist.append(content)
            flag = False
        else:
            flag = True
    print('Search results from the novel site:')
    print('\t' + 'No.' + '\t\t' + 'Title' + '\t\t' + 'Author' + '\t')
    num = 1
    for name, author in zip(namelist, authorlist):
        print('\t' + str(num) + '\t\t' + name + '\t\t' + author + '\t')
        num += 1
    step = int(input('Select the novel you want by entering its number: '))
    want_url = str(URLlist[step - 1])
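As an aside, the flag toggle above simply keeps every second element of contents2 (the "odd" cells alternate title, author, title, author, ...), so a slice does the same job in one line:

# contents2 alternates title cell, author cell, title cell, ...
authorlist = contents2[1::2]   # every second element, starting from index 1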
② Next, crawl the URL of every chapter of the chosen novel and push them into the queue one by one, starting from the novel page URL obtained in step ①.
driver.get(want_url)
page_source = driver.page_source
pattern2 = re.compile(r'<dd><a href="(.*?)">(.*?)</a></dd>', re.S)
All_html = re.findall(pattern2, page_source)
# skip the first nine <dd> links (the page's "latest chapters" block repeats them)
for ones in All_html[9:]:
    part_url = ones[0]
    title = ones[1]
    print(title + '+' + base_url + part_url)
    q.put(title + '+' + base_url + part_url)
driver.close()
driver.quit()
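One caveat: each queue item joins the title and the URL with a bare '+', which breaks if a chapter title itself contains a '+'. Storing a tuple sidesteps the problem entirely; a sketch of that variant (both the put side and the get side in the worker would need the change):

q.put((title, base_url + part_url))   # store a (title, url) tuple instead

# ... and in the worker thread:
title, href = q.get()                 # unpack directly, no string splitting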
③ The script first prints every chapter title together with its URL, which lets you check that it really found the novel you wanted, and then asks how many crawler threads to start. The best number depends mostly on your machine's CPU; 20 to 40 is usually plenty (a sketch after the next block shows how to derive a default from the CPU count instead of guessing).
Here is the code that spawns the threads and, at the end, shuts them down:
threadnum = int(input('Enter the number of crawler threads to start: '))
start_time = time.time()
for i in range(1, threadnum + 1, 1):
    threadList.append('Spider_Thread-' + str(i))
queueLock = threading.Lock()  # lock so the threads don't race on the shared queue
threads = []
threadID = 1
# create the threads
for tName in threadList:
    thread = myThread(threadID, tName, q)
    thread.start()
    threads.append(thread)
    threadID += 1
# wait for the queue to drain
while not q.empty():
    pass
# tell the threads it's time to exit
exitFlag = 1
# wait for every thread to finish
for t in threads:
    t.join()
    print(t.name + ' exited cleanly')
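If you'd rather derive a default thread count than guess one, os.cpu_count() gives a starting point. This is only a sketch; the x4 multiplier is an assumption on my part (these threads spend most of their time waiting on the network, so they can outnumber the cores):

import os

# I/O-bound threads can comfortably outnumber cores; x4 is an arbitrary guess
default_threads = min(40, (os.cpu_count() or 4) * 4)
raw = input('Enter thread count (blank for {}): '.format(default_threads))
threadnum = int(raw) if raw.strip() else default_threads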
④ Once the thread count is chosen and the threads have started, the chapters are scraped into the folder you specified.
Beyond that, the most important piece is the threading.Thread subclass below; as a rule you put the main work inside the overridden run(self) method.
class myThread(threading.Thread):  # subclass threading.Thread
    def __init__(self, threadID, name, counter):
        threading.Thread.__init__(self)
        self.threadID = threadID
        self.name = name
        self.counter = counter

    def run(self):
        # put the work inside run(); it executes once the thread is started
        print(self.name + ' started')
        while not exitFlag:
            queueLock.acquire()  # acquire the lock
            if not q.empty():
                item = q.get()
                queueLock.release()  # release the lock
                title = item.split('+')[0]
                href = item.split('+')[1]
                get_content(title, href)
            else:
                print('queue is empty, nothing left to fetch')
                queueLock.release()  # release the lock
Also remember to create the thread lock so the threads don't clash; define it once, then use it inside run() as shown above:
queueLock = threading.Lock()  # lock so the threads don't race on the shared queue
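Two refinements worth knowing, neither of which is in the original code, so treat this as a sketch: a with-block releases the lock even if the body raises, and queue.Queue is itself thread-safe, so its blocking get/task_done/join API can replace both the manual lock and the exitFlag busy-wait. A possible rewrite of the worker loop:

import queue  # for queue.Empty

def run(self):
    while True:
        try:
            # Queue.get is already thread-safe, so no explicit lock is needed
            item = q.get(timeout=2)  # blocks up to 2s, then we assume we're done
        except queue.Empty:
            break
        try:
            title, href = item.split('+', 1)  # split only on the first '+'
            get_content(title, href)
        finally:
            q.task_done()  # lets q.join() in the main thread unblock

# in the main thread, q.join() then replaces the while-not-empty busy loop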
That's everything. The complete code follows (you only need to change the chromedriver path and the folder where the novel is saved, and it will run):
import queue
import threading
from selenium import webdriver
from selenium.webdriver.chrome.options import Options
import re
from bs4 import BeautifulSoup
import time

# Multithreaded crawler for biquge novels, with manual search for the novel you want
# @author Himit_ZH
# qq:372347736

exitFlag = 0
q = queue.Queue()
chrome_options = Options()
chrome_options.add_argument('--headless')
chrome_options.add_argument('--disable-gpu')
# path to the chromedriver executable
driver_path = r'E:\py\chromedriver\chromedriver.exe'
base_url = r'http://www.biquge.tv'
# folder where the downloaded chapters are saved
txt_path = r'E://py//小说//'
# total number of chapters in the novel
Sum_Chapters = 0.0
# title of the novel to search for
novel_name = str()


class scrapy_biquge():

    def get_url(self):
        driver = webdriver.Chrome(executable_path=driver_path, chrome_options=chrome_options)
        driver.get('http://www.biquge.tv/')
        driver.find_element_by_id('wd').send_keys(novel_name)
        driver.find_element_by_id('sss').click()
        # switch the window handle to the newly opened results page
        handles = driver.window_handles
        driver.switch_to.window(handles[1])
        if '出现错误!' in driver.page_source:  # error text shown by the site
            driver.close()
            driver.quit()
            print('Bad input, please try again')
            return False
        current_url = driver.current_url
        if 'search.php?' in current_url:
            pattern1 = re.compile(r'<td class="odd"><a href="(.*?)">(.*?)</a>', re.S)
            contents1 = re.findall(pattern1, driver.page_source)
            pattern2 = re.compile(r'<td class="odd">(.*?)</td>', re.S)
            contents2 = re.findall(pattern2, driver.page_source)
            if len(contents2) and len(contents1):
                URLlist = []
                namelist = []
                authorlist = []
                for content in contents1:
                    URLlist.append(content[0])
                    namelist.append(content[1])
                # every other "odd" cell holds an author name; toggle a flag to pick them out
                flag = False
                for content in contents2:
                    if flag:
                        authorlist.append(content)
                        flag = False
                    else:
                        flag = True
                print('Search results from the novel site:')
                print('\t' + 'No.' + '\t\t' + 'Title' + '\t\t' + 'Author' + '\t')
                num = 1
                for name, author in zip(namelist, authorlist):
                    print('\t' + str(num) + '\t\t' + name + '\t\t' + author + '\t')
                    num += 1
                step = int(input('Select the novel you want by entering its number: '))
                want_url = str(URLlist[step - 1])
                driver.get(want_url)
                page_source = driver.page_source
                pattern2 = re.compile(r'<dd><a href="(.*?)">(.*?)</a></dd>', re.S)
                All_html = re.findall(pattern2, page_source)
                # skip the first nine <dd> links (the page's "latest chapters" block repeats them)
                for ones in All_html[9:]:
                    part_url = ones[0]
                    title = ones[1]
                    print(title + '+' + base_url + part_url)
                    q.put(title + '+' + base_url + part_url)
                driver.close()
                driver.quit()
                return True
        if '抱歉,搜索没有结果^_^' in driver.page_source:  # "no results" text shown by the site
            driver.close()
            driver.quit()
            print('Sorry, the search returned no results, please try again')
            return False


class myThread(threading.Thread):  # subclass threading.Thread

    def __init__(self, threadID, name, counter):
        threading.Thread.__init__(self)
        self.threadID = threadID
        self.name = name
        self.counter = counter

    def run(self):
        # put the work inside run(); it executes once the thread is started
        print(self.name + ' started')
        while not exitFlag:
            queueLock.acquire()  # acquire the lock
            if not q.empty():
                item = q.get()
                queueLock.release()  # release the lock
                title = item.split('+')[0]
                href = item.split('+')[1]
                get_content(title, href)
            else:
                print('queue is empty, nothing left to fetch')
                queueLock.release()  # release the lock


def get_content(title, href):
    driver = webdriver.Chrome(executable_path=driver_path, chrome_options=chrome_options)
    driver.get(href)
    bs4 = BeautifulSoup(driver.page_source, 'lxml')
    title = bs4.h1.get_text()  # chapter heading
    filename = txt_path + title.split()[0] + '.txt'  # first token of the heading as the filename
    content = bs4.find('div', id='content')
    content = content.get_text()
    with open(filename, 'w', encoding='utf-8') as f:
        f.write('\r' + title + '\r\n')
        f.write(content)
    print('[' + title + '] downloaded, {:.2f}% of all chapters done'.format(
        (1.0 - q.qsize() / Sum_Chapters) * 100))
    driver.close()
    driver.quit()


if __name__ == '__main__':
    # start the threads only after every URL is in the queue
    while True:
        try:
            novel_name = input('Enter the title of the novel to search for: ')
            if scrapy_biquge().get_url():
                break
        except KeyError:
            pass  # on a failed lookup, just ask again
    Sum_Chapters = q.qsize()
    threadList = []
    threadnum = int(input('Enter the number of crawler threads to start: '))
    start_time = time.time()
    for i in range(1, threadnum + 1, 1):
        threadList.append('Spider_Thread-' + str(i))
    queueLock = threading.Lock()  # lock so the threads don't race on the shared queue
    threads = []
    threadID = 1
    # create the threads
    for tName in threadList:
        thread = myThread(threadID, tName, q)
        thread.start()
        threads.append(thread)
        threadID += 1
    # wait for the queue to drain
    while not q.empty():
        pass
    # tell the threads it's time to exit
    exitFlag = 1
    # wait for every thread to finish
    for t in threads:
        t.join()
        print(t.name + ' exited cleanly')
    end_time = time.time()
    print('Scraping took ' + str(round(end_time - start_time, 2)) + ' seconds')