用Python实现笔趣阁小说爬取,以后看小说再也不用愁啦!
GUI使用经典的tkinter完成。
界面初始化代码如下:
# Build the main window of the crawler GUI.
window = tk.Tk()
window.title('笔趣阁爬虫')
width = 300
height = 200
# Query the screen size so the window can be centred on the screen.
screenwidth = window.winfo_screenwidth()
screenheight = window.winfo_screenheight()
alignstr = '%dx%d+%d+%d' % (width, height, (screenwidth - width) // 2, (screenheight - height) // 2)
window.geometry(alignstr)
# Resizing: True allows it, False forbids it (fixed width, free height).
window.resizable(width=False, height=True)
# Row 0: prompt label, URL entry box and the start button.
L1 = tk.Label(window, text='输入网址:')
L1.grid(row=0, column=0)
E1 = tk.Entry(window)
E1.grid(row=0, column=1)
# The callback must actually call thread_it; `lambda: thread_it` only
# evaluated the name, so clicking the button did nothing.
B1 = tk.Button(window, text='开始爬取', command=lambda: thread_it())
B1.grid(row=0, column=2)
# Row 1: text area used as a progress log, spanning all three columns.
T1 = tk.Text(window, width=40, height=10)
T1.grid(row=1, columnspan=3)
window.mainloop()
实现的过程中遇到了界面卡死的问题:爬取任务直接在主线程中执行时,会阻塞tkinter的事件循环。
我用多线程解决了这个问题:
将爬取单独放到一个线程中执行,
这样就不会影响主线程(界面)的运行。
def thread_it():
    """Start ``main`` on a background daemon thread.

    Running the crawl off the Tk main thread keeps the window responsive
    while pages are being downloaded.
    """
    # daemon=True replaces the deprecated Thread.setDaemon(); a daemon
    # thread does not keep the process alive after the main thread exits.
    t = threading.Thread(target=main, daemon=True)
    t.start()
完整代码如下:
import requests
import time
from lxml import etree
import tkinter as tk
import threading
# Chapter page URLs; filled by find_url() and iterated by main().
url_list = []
def get_tag(response, tag):
    """Parse ``response`` as HTML and return the result of XPath ``tag``.

    Args:
        response: HTML source text of a page.
        tag: XPath expression to evaluate against the parsed document.

    Returns:
        Whatever ``xpath`` yields for the expression.
    """
    return etree.HTML(response).xpath(tag)
def parse_url(url, timeout=10):
    """Download ``url`` and return its body decoded as GBK text.

    Args:
        url: Address of the page to fetch.
        timeout: Seconds to wait for the server before giving up. Without
            a timeout ``requests`` waits indefinitely, so one unresponsive
            server would stall the whole crawl.

    Returns:
        The response body as ``str``.

    Raises:
        requests.exceptions.RequestException: If the request fails or
            times out.
    """
    response = requests.get(url, timeout=timeout)
    # Force the codec used by ``response.text`` instead of relying on the
    # encoding guessed from the response.
    response.encoding = 'gbk'
    return response.text
def find_url(response):
    """Rebuild ``url_list`` with the chapter URLs found on an index page.

    Args:
        response: HTML source of the book's index page.

    The chapter links are resolved against the address currently typed in
    the entry box ``E1``.
    """
    from urllib.parse import urljoin

    base_url = E1.get()
    hrefs = get_tag(response, '//*[@id="list"]/dl/dd/a/@href')
    # Replace the contents in place rather than appending: otherwise a
    # second crawl would download the previous book's chapters again.
    # urljoin also copes with root-relative or absolute hrefs, which plain
    # string concatenation turned into invalid URLs.
    url_list[:] = [urljoin(base_url, href) for href in hrefs]
def find_content(url):
    """Download one chapter page and append its text to the book file.

    Progress messages are written to the text widget ``T1``; the chapter is
    appended to ``'<title>.txt'`` where ``title`` is the module-level book
    title set by ``main``.

    Args:
        url: Address of the chapter page.

    Raises:
        IndexError: If the chapter heading XPath matches nothing.
    """
    response = parse_url(url)
    chapter = get_tag(response, '//*[@id="box_con"]/div[2]/h1/text()')[0]
    content = get_tag(response, '//*[@id="content"]/text()')
    T1.insert("end", '正在爬取:' + chapter)
    # Append mode: every chapter of the book goes into the same file. The
    # ``with`` block closes the file, so no explicit close() is needed.
    with open('{}.txt'.format(title), 'at', encoding='utf-8') as j:
        j.write(chapter)
        # Skip text nodes that consist only of a CRLF line break.
        j.writelines(i for i in content if i != '\r\n')
    T1.insert("end", chapter + ':保存完毕')
    # Pause before clearing the log (presumably to keep the message
    # readable and to space out requests).
    time.sleep(2)
    # Text indices are strings of the form 'line.column'.
    T1.delete('1.0', tk.END)
def main():
    """Crawl every chapter of the book whose index URL is in ``E1``.

    Stores the book title in the module-level ``title``, lets ``find_url``
    gather the chapter links, then saves each chapter via ``find_content``.
    """
    global title
    T1.insert('end', '开始爬取,请耐心等候')
    index_html = parse_url(E1.get())
    # The first text node of the heading inside #info is used as the title.
    title = get_tag(index_html, '//*[@id="info"]/h1/text()')[0]
    find_url(index_html)
    for chapter_url in url_list:
        find_content(chapter_url)
def thread_it():
    """Start ``main`` on a background daemon thread.

    Running the crawl off the Tk main thread keeps the window responsive
    while pages are being downloaded.
    """
    # daemon=True replaces the deprecated Thread.setDaemon(); a daemon
    # thread does not keep the process alive after the main thread exits.
    t = threading.Thread(target=main, daemon=True)
    t.start()
if __name__ == '__main__':
    # Build the main window of the crawler GUI.
    window = tk.Tk()
    window.title('笔趣阁爬虫')
    width = 300
    height = 200
    # Query the screen size so the window can be centred on the screen.
    screenwidth = window.winfo_screenwidth()
    screenheight = window.winfo_screenheight()
    alignstr = '%dx%d+%d+%d' % (width, height, (screenwidth - width) // 2, (screenheight - height) // 2)
    # One geometry call sets both size and position; the former second
    # call with '300x200' repeated the same size and was redundant.
    window.geometry(alignstr)
    # Resizing: True allows it, False forbids it (fixed width, free height).
    window.resizable(width=False, height=True)
    # Row 0: prompt label, URL entry box and the start button.
    L1 = tk.Label(window, text='输入网址:')
    L1.grid(row=0, column=0)
    E1 = tk.Entry(window)
    E1.grid(row=0, column=1)
    # thread_it takes no arguments, so it can be the callback directly.
    B1 = tk.Button(window, text='开始爬取', command=thread_it)
    B1.grid(row=0, column=2)
    # Row 1: text area used as a progress log, spanning all three columns.
    T1 = tk.Text(window, width=40, height=10)
    T1.grid(row=1, columnspan=3)
    window.mainloop()
效果图如下:
会用Python来爬取小说,以后看书就不用愁了!
遇到什么问题都可以过来找我的哟,私聊我就好啦!
领取专属 10元无门槛券
私享最新 技术干货