pycharm + chromedriver + bs4 + re + threading + queue: a multithreaded novel crawler with simulated login and search
Libraries are normally installed with pip install xxx at the cmd prompt; the author uses the PyCharm interpreter instead, where you can search for a library by name and install it from the IDE, which is generally what I'd recommend for writing Python crawlers. As for chromedriver, its version has to match your installed Chrome (the mapping tables are easy to find online); download it, put it wherever you like, and remember to point the path in the code below at that location.
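If you want to confirm the chromedriver/Chrome pairing before running the full script, a quick smoke test like the following is enough. This is just a sketch using the same selenium 3-era API as the code below; the path is whatever location you installed chromedriver to:

from selenium import webdriver
from selenium.webdriver.chrome.options import Options

opts = Options()
opts.add_argument('--headless')      # no visible browser window
opts.add_argument('--disable-gpu')

# adjust to wherever you put chromedriver
driver = webdriver.Chrome(executable_path=r'E:\py\chromedriver\chromedriver.exe',
                          chrome_options=opts)
driver.get('http://www.biquge.tv/')
print(driver.title)                  # a printed title means the pairing works
driver.quit()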
① First, the script performs a simulated search on http://www.biquge.tv/ with the novel title you typed in. If several candidates match, it prints a selection table (if the search returns exactly one novel, this selection step is skipped and the script moves straight on to the next step).
pattern1 = re.compile(r'<td class="odd"><a href="(.*?)">(.*?)</a>', re.S)
contents1 = re.findall(pattern1, driver.page_source)
pattern2 = re.compile(r'<td class="odd">(.*?)</td>', re.S)
contents2 = re.findall(pattern2, driver.page_source)
if len(contents2) and len(contents1):
    URLlist = []
    namelist = []
    authorlist = []
    for content in contents1:
        URLlist.append(content[0])
        namelist.append(content[1])
    # every other "odd" cell holds an author name; toggle a flag to pick them out
    flag = False
    for content in contents2:
        if flag:
            authorlist.append(content)
            flag = False
        else:
            flag = True
    print('Search results from the novel site:')
    print('\t' + 'No.' + '\t\t' + 'Title' + '\t\t' + 'Author' + '\t')
    num = 1
    for name, author in zip(namelist, authorlist):
        print('\t' + str(num) + '\t\t' + name + '\t\t' + author + '\t')
        num += 1
    step = int(input('Select the novel you want by entering its number: '))
    want_url = str(URLlist[step - 1])
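As an aside, the flag toggle above simply keeps every second element of contents2 (the "odd" cells alternate title, author, title, author, ...), so a slice does the same job in one line:

# contents2 alternates title cell, author cell, title cell, ...
authorlist = contents2[1::2]   # every second element, starting from index 1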
② Next, crawl the URL of every chapter of the chosen novel and push them into the queue one by one, starting from the novel page URL obtained in step ①.
driver.get(want_url)
page_source = driver.page_source
pattern2 = re.compile(r'<dd><a href="(.*?)">(.*?)</a></dd>', re.S)
All_html = re.findall(pattern2, page_source)
# skip the first nine <dd> links (the page's "latest chapters" block repeats them)
for ones in All_html[9:]:
    part_url = ones[0]
    title = ones[1]
    print(title + '+' + base_url + part_url)
    q.put(title + '+' + base_url + part_url)
driver.close()
driver.quit()
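One caveat: each queue item joins the title and the URL with a bare '+', which breaks if a chapter title itself contains a '+'. Storing a tuple sidesteps the problem entirely; a sketch of that variant (both the put side and the get side in the worker would need the change):

q.put((title, base_url + part_url))   # store a (title, url) tuple instead

# ... and in the worker thread:
title, href = q.get()                 # unpack directly, no string splitting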
③ The script first prints every chapter title together with its URL, which lets you check that it really found the novel you wanted, and then asks how many crawler threads to start. The best number depends mostly on your machine's CPU; 20 to 40 is usually plenty (a sketch after the next block shows how to derive a default from the CPU count instead of guessing).
Here is the code that spawns the threads and, at the end, shuts them down:
threadnum = int(input('Enter the number of crawler threads to start: '))
start_time = time.time()
for i in range(1, threadnum + 1, 1):
    threadList.append('Spider_Thread-' + str(i))
queueLock = threading.Lock()  # lock so the threads don't race on the shared queue
threads = []
threadID = 1
# create the threads
for tName in threadList:
    thread = myThread(threadID, tName, q)
    thread.start()
    threads.append(thread)
    threadID += 1
# wait for the queue to drain
while not q.empty():
    pass
# tell the threads it's time to exit
exitFlag = 1
# wait for every thread to finish
for t in threads:
    t.join()
    print(t.name + ' exited cleanly')
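If you'd rather derive a default thread count than guess one, os.cpu_count() gives a starting point. This is only a sketch; the x4 multiplier is an assumption on my part (these threads spend most of their time waiting on the network, so they can outnumber the cores):

import os

# I/O-bound threads can comfortably outnumber cores; x4 is an arbitrary guess
default_threads = min(40, (os.cpu_count() or 4) * 4)
raw = input('Enter thread count (blank for {}): '.format(default_threads))
threadnum = int(raw) if raw.strip() else default_threads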
④ Once the thread count is chosen and the threads have started, the chapters are scraped into the folder you specified.
Beyond that, the most important piece is the threading.Thread subclass below; as a rule you put the main work inside the overridden run(self) method.
class myThread(threading.Thread):  # subclass threading.Thread
    def __init__(self, threadID, name, counter):
        threading.Thread.__init__(self)
        self.threadID = threadID
        self.name = name
        self.counter = counter

    def run(self):
        # put the work inside run(); it executes once the thread is started
        print(self.name + ' started')
        while not exitFlag:
            queueLock.acquire()  # acquire the lock
            if not q.empty():
                item = q.get()
                queueLock.release()  # release the lock
                title = item.split('+')[0]
                href = item.split('+')[1]
                get_content(title, href)
            else:
                print('queue is empty, nothing left to fetch')
                queueLock.release()  # release the lock
Also remember to create the thread lock so the threads don't clash; define it once, then use it inside run() as shown above:
queueLock = threading.Lock()  # lock so the threads don't race on the shared queue
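Two refinements worth knowing, neither of which is in the original code, so treat this as a sketch: a with-block releases the lock even if the body raises, and queue.Queue is itself thread-safe, so its blocking get/task_done/join API can replace both the manual lock and the exitFlag busy-wait. A possible rewrite of the worker loop:

import queue  # for queue.Empty

def run(self):
    while True:
        try:
            # Queue.get is already thread-safe, so no explicit lock is needed
            item = q.get(timeout=2)  # blocks up to 2s, then we assume we're done
        except queue.Empty:
            break
        try:
            title, href = item.split('+', 1)  # split only on the first '+'
            get_content(title, href)
        finally:
            q.task_done()  # lets q.join() in the main thread unblock

# in the main thread, q.join() then replaces the while-not-empty busy loop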
That's everything. The complete code follows (you only need to change the chromedriver path and the folder where the novel is saved, and it will run):
import queue
import threading
from selenium import webdriver
from selenium.webdriver.chrome.options import Options
import re
from bs4 import BeautifulSoup
import time

# Multithreaded crawler for biquge novels, with manual search for the novel you want
# @author Himit_ZH
# qq:372347736

exitFlag = 0
q = queue.Queue()
chrome_options = Options()
chrome_options.add_argument('--headless')
chrome_options.add_argument('--disable-gpu')
# path to the chromedriver executable
driver_path = r'E:\py\chromedriver\chromedriver.exe'
base_url = r'http://www.biquge.tv'
# folder where the downloaded chapters are saved
txt_path = r'E://py//小说//'
# total number of chapters in the novel
Sum_Chapters = 0.0
# title of the novel to search for
novel_name = str()


class scrapy_biquge():

    def get_url(self):
        driver = webdriver.Chrome(executable_path=driver_path, chrome_options=chrome_options)
        driver.get('http://www.biquge.tv/')
        driver.find_element_by_id('wd').send_keys(novel_name)
        driver.find_element_by_id('sss').click()
        # switch the window handle to the newly opened results page
        handles = driver.window_handles
        driver.switch_to.window(handles[1])
        if '出现错误!' in driver.page_source:  # error text shown by the site
            driver.close()
            driver.quit()
            print('Bad input, please try again')
            return False
        current_url = driver.current_url
        if 'search.php?' in current_url:
            pattern1 = re.compile(r'<td class="odd"><a href="(.*?)">(.*?)</a>', re.S)
            contents1 = re.findall(pattern1, driver.page_source)
            pattern2 = re.compile(r'<td class="odd">(.*?)</td>', re.S)
            contents2 = re.findall(pattern2, driver.page_source)
            if len(contents2) and len(contents1):
                URLlist = []
                namelist = []
                authorlist = []
                for content in contents1:
                    URLlist.append(content[0])
                    namelist.append(content[1])
                # every other "odd" cell holds an author name; toggle a flag to pick them out
                flag = False
                for content in contents2:
                    if flag:
                        authorlist.append(content)
                        flag = False
                    else:
                        flag = True
                print('Search results from the novel site:')
                print('\t' + 'No.' + '\t\t' + 'Title' + '\t\t' + 'Author' + '\t')
                num = 1
                for name, author in zip(namelist, authorlist):
                    print('\t' + str(num) + '\t\t' + name + '\t\t' + author + '\t')
                    num += 1
                step = int(input('Select the novel you want by entering its number: '))
                want_url = str(URLlist[step - 1])
                driver.get(want_url)
                page_source = driver.page_source
                pattern2 = re.compile(r'<dd><a href="(.*?)">(.*?)</a></dd>', re.S)
                All_html = re.findall(pattern2, page_source)
                # skip the first nine <dd> links (the page's "latest chapters" block repeats them)
                for ones in All_html[9:]:
                    part_url = ones[0]
                    title = ones[1]
                    print(title + '+' + base_url + part_url)
                    q.put(title + '+' + base_url + part_url)
                driver.close()
                driver.quit()
                return True
        if '抱歉,搜索没有结果^_^' in driver.page_source:  # "no results" text shown by the site
            driver.close()
            driver.quit()
            print('Sorry, the search returned no results, please try again')
            return False


class myThread(threading.Thread):  # subclass threading.Thread

    def __init__(self, threadID, name, counter):
        threading.Thread.__init__(self)
        self.threadID = threadID
        self.name = name
        self.counter = counter

    def run(self):
        # put the work inside run(); it executes once the thread is started
        print(self.name + ' started')
        while not exitFlag:
            queueLock.acquire()  # acquire the lock
            if not q.empty():
                item = q.get()
                queueLock.release()  # release the lock
                title = item.split('+')[0]
                href = item.split('+')[1]
                get_content(title, href)
            else:
                print('queue is empty, nothing left to fetch')
                queueLock.release()  # release the lock


def get_content(title, href):
    driver = webdriver.Chrome(executable_path=driver_path, chrome_options=chrome_options)
    driver.get(href)
    bs4 = BeautifulSoup(driver.page_source, 'lxml')
    title = bs4.h1.get_text()  # chapter heading
    filename = txt_path + title.split()[0] + '.txt'  # first token of the heading as the filename
    content = bs4.find('div', id='content')
    content = content.get_text()
    with open(filename, 'w', encoding='utf-8') as f:
        f.write('\r' + title + '\r\n')
        f.write(content)
    print('[' + title + '] downloaded, {:.2f}% of all chapters done'.format(
        (1.0 - q.qsize() / Sum_Chapters) * 100))
    driver.close()
    driver.quit()


if __name__ == '__main__':
    # start the threads only after every URL is in the queue
    while True:
        try:
            novel_name = input('Enter the title of the novel to search for: ')
            if scrapy_biquge().get_url():
                break
        except KeyError:
            pass  # on a failed lookup, just ask again
    Sum_Chapters = q.qsize()
    threadList = []
    threadnum = int(input('Enter the number of crawler threads to start: '))
    start_time = time.time()
    for i in range(1, threadnum + 1, 1):
        threadList.append('Spider_Thread-' + str(i))
    queueLock = threading.Lock()  # lock so the threads don't race on the shared queue
    threads = []
    threadID = 1
    # create the threads
    for tName in threadList:
        thread = myThread(threadID, tName, q)
        thread.start()
        threads.append(thread)
        threadID += 1
    # wait for the queue to drain
    while not q.empty():
        pass
    # tell the threads it's time to exit
    exitFlag = 1
    # wait for every thread to finish
    for t in threads:
        t.join()
        print(t.name + ' exited cleanly')
    end_time = time.time()
    print('Scraping took ' + str(round(end_time - start_time, 2)) + ' seconds')