# -*- coding: utf-8 -*-
from selenium import webdriver
import time, re,requests,os,time,random,traceback
import urllib.request,threading
from bs4 import BeautifulSoup
import html.parser
from tkinter import *
from tkinter import ttk
import tkinter.messagebox
def getHtml(questionId,page):
    """Scrape one listing page of a Zhihu question and save the long answers.

    Opens ``https://www.zhihu.com/question/<questionId>/answers/updated?page=<page>``
    in headless Chrome, scrolls through the page, collects the text of every
    ``RichContent-inner`` element longer than 300 characters and writes them to
    ``<questionId>/page_<page>.txt`` (UTF-8), separated by a marker line.

    Args:
        questionId: Question identifier; used in the URL and as the output
            folder name. Coerced to ``str`` so numeric ids are accepted.
        page: Page number of the answer listing.

    Returns:
        The BeautifulSoup tree parsed from the page source captured after
        scrolling.

    Raises:
        Whatever Selenium or the file write raises; the browser is shut down
        in every case before the exception propagates.
    """
    questionId = str(questionId)

    chrome_options = webdriver.ChromeOptions()
    # Run maximized; per the original author, element lookup fails without it.
    chrome_options.add_argument('--start-maximized')
    # Hide the "controlled by automated test software" infobar.
    chrome_options.add_argument('--disable-infobars')
    # Incognito (private) mode.
    chrome_options.add_argument('--incognito')
    # No visible browser window.
    chrome_options.add_argument('--headless')

    # NOTE(review): `executable_path=` and `find_elements_by_class_name` are
    # Selenium 3 era APIs — confirm the installed Selenium version still
    # provides them before upgrading.
    driver = webdriver.Chrome(executable_path = "chromedriver",options=chrome_options)
    try:
        driver.get("https://www.zhihu.com/question/"+questionId+"/answers/updated?page="+str(page))

        # Simulate a user scrolling down the page.
        def execute_times(times):
            for i in range(times):
                print('第'+str(i)+'次点击')
                driver.execute_script("window.scrollTo(0, "+str(1000 * i)+");")
                time.sleep(3)
            # NOTE(review): the original nesting of this final jump to the
            # bottom could not be recovered from the source; it is placed
            # after the stepped scroll — confirm.
            driver.execute_script("window.scrollTo(0, document.body.scrollHeight);")

        execute_times(12)

        # Parse the raw HTML of the page as it stands after scrolling.
        result_soup = BeautifulSoup(driver.page_source, 'html.parser')

        answers = driver.find_elements_by_class_name("RichContent-inner")
        pieces = ["start\n"]
        for answer in answers:
            # Read the element text once; each access is a WebDriver call.
            answer_text = answer.text
            if len(answer_text) > 300:
                pieces.append(answer_text + "\n-----------我是分隔符------\n")

        # The target folder must already exist.
        with open(questionId +"/page_"+str(page)+".txt", 'w',encoding="utf-8") as zhpage:
            zhpage.write("".join(pieces))
        print("爬取回答页面成功!!!")
        return result_soup
    finally:
        # Always release the browser, even when scraping or writing fails;
        # otherwise every failed page leaves a Chrome process behind.
        driver.quit()
def readTxt(path):
    """Return the full contents of the UTF-8 text file at *path*.

    Args:
        path: Path of the file to read.

    Returns:
        The file contents as a single ``str``.

    Raises:
        OSError: If the file cannot be opened.
        UnicodeDecodeError: If the file is not valid UTF-8.
    """
    # The context manager closes the handle even when read() raises.
    with open(path, 'r', encoding='utf-8') as f:
        return f.read()
def main(questionId,startPage,endPage):
    """Scrape pages ``startPage`` up to (but not including) ``endPage``.

    Ensures the output folder named after ``questionId`` exists, then fetches
    each page in turn with a random 5-7 second pause after every successful
    fetch. A failure on one page prints its traceback and the loop moves on
    to the next page.
    """
    mkdir([questionId])
    page_no = startPage
    while page_no < endPage:
        try:
            getHtml(questionId, page_no)
            pause_seconds = random.choice(range(5, 8))
            time.sleep(pause_seconds)
        except Exception:
            # Best effort: report the failure and keep going.
            traceback.print_exc()
        page_no += 1
def mkdir(paths):
    """Create every directory in *paths* that does not exist yet.

    Args:
        paths: Iterable of directory paths. Missing parent directories are
            created as well.

    Paths that already exist (of any kind) are left untouched.
    """
    for path in paths:
        if os.path.exists(path):
            continue
        # exist_ok avoids a crash if the directory appears between the check
        # above and this call; makedirs also handles nested paths.
        os.makedirs(path, exist_ok=True)
def getanswer():
    """Button callback: read the form fields and start scraping.

    Reads the question id and the start/end pages from the module-level
    ``var_id``, ``var_start`` and ``var_end`` variables, validates them and
    calls ``main``. Invalid input is reported in an error dialog instead of
    being passed on.
    """
    # Values typed into an Entry are read back as text, while main() needs a
    # string id (URL concatenation) and integer page bounds (range()).
    questionId = str(var_id.get()).strip()
    if not questionId:
        tkinter.messagebox.showerror('输入错误', '请输入问题标识')
        return

    try:
        start = int(var_start.get())
        end = int(var_end.get())
    except (TypeError, ValueError):
        tkinter.messagebox.showerror('输入错误', '开始页和结束页必须是整数')
        return

    main(questionId,start,end)
if __name__ == '__main__':
    # Hard-coded batch run: pages 101-199 of question 308829198.
    main(str(308829198), 101, 200)

    # Build the input form (question id, start page, end page, submit button).
    tk = Tk()
    tk.title('获取知乎问题所有答案')
    tk.geometry('600x150')
    frame = Frame(tk)  # created but never laid out or referenced in the visible code

    Label(
        tk,
        text='问题标识:(例:https://www.zhihu.com/question/324405640/answer/720532471中的324405640 )',
        width=200,
        anchor=W,
        justify=LEFT,
    ).place(x=10, y=10)
    var_id = Variable()
    question_id = Entry(tk, textvariable=var_id, width=30)
    question_id.place(x=10, y=40)

    # place() returns None, so the Entry widgets below are not bound to a
    # name; they are reached through their textvariables instead.
    Label(tk, text='开始页:').place(x=230, y=40)
    var_start = Variable()
    Entry(tk, textvariable=var_start, width=10).place(x=290, y=40)
    var_start.set(1)

    Label(tk, text='结束页:').place(x=360, y=40)
    var_end = Variable()
    Entry(tk, textvariable=var_end, width=10).place(x=420, y=40)
    var_end.set(10)

    Button(tk, text="获取答案", command=getanswer).place(x=200, y=80)

    # NOTE(review): the event loop is left disabled, as in the original, so
    # the script exits right after building the widgets. Re-enable the line
    # below to make the form usable.
    # tk.mainloop()