Zhihu Crawler: Collecting All Answers to a Given Question

Author: 十四君
Published 2020-03-03 15:50:56
From the column: Urlteam

Zhihu Answer Extraction Program

Just run the main program html.py. The book statistics are saved in read2.txt; the lines that would save the raw page source are left commented out.
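The scripts below assume a particular working-directory layout: a data/ folder for saved page sources, read2.txt for the accumulated book statistics, and vis.txt for the list of pages already processed. A one-time setup sketch, inferred from the code rather than stated explicitly by the author:

import os

# Layout inferred from html.py / tiqu.py below:
# ./data/      saved page sources, one <question-id>.txt per question
# ./read2.txt  accumulated statistics, one "title<TAB>count" line per book
# ./vis.txt    names of pages that have already been extracted
os.makedirs('./data', exist_ok=True)
for f in ('./read2.txt', './vis.txt'):
    if not os.path.exists(f):
        open(f, 'w', encoding='utf-8').close()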

from selenium import webdriver
from selenium.webdriver import ChromeOptions
import re
import time
import tiqu


# Save a page's HTML source to ./data/<name>.txt
def write2(html, name):
    path = "./data/" + name + '.txt'
    with open(path, 'w', encoding='utf-8') as file:
        file.write(html)


def socket_get():
    option = ChromeOptions()
    # Hide the "Chrome is being controlled by automated software" infobar
    option.add_experimental_option('excludeSwitches', ['enable-automation'])
    # Optional proxy
    # option.add_argument('--proxy-server=http://127.0.0.1:9090')
    # Adjust the chromedriver path to your own environment
    socket = webdriver.Chrome('C:/Users/31114/PycharmProjects/chromedriver.exe', options=option)
    return socket


def sele(url, name, dict):
    socket = socket_get()
    socket.get(url)
    tingzhi = "写回答</button>"        # the "write an answer" button
    tingzhi2 = "写第一个回答</button>"  # the "write the first answer" button
    # Brute-force: keep scrolling the page down
    last_html = ""
    for i in range(1, 1000):
        js = "var q=document.documentElement.scrollTop=1000000"  # jump to the bottom via JS
        socket.execute_script(js)
        html = socket.page_source  # grab the current page source
        # write2(html, name)  # save to name.txt
        strd = re.findall(tingzhi, html)    # check for the stop marker
        strd2 = re.findall(tingzhi2, html)  # check for the stop marker
        if len(strd) + len(strd2) >= 3:     # enough buttons found: all answers are loaded
            last_html = html
            break
        time.sleep(1)
    # write2(last_html, name)  # save to name.txt
    # Extraction happens right here; to save the raw page source instead,
    # uncomment the line above and comment out the call below
    if last_html != "":
        tiqu.find(last_html, name + '.txt', dict)
    print(url + " has been extracted")
    time.sleep(3)


if __name__ == "__main__":
    url2 = "https://www.zhihu.com/question/345473425"
    # url = "https://www.zhihu.com/question/374501668"
    old_vis = tiqu.get_old_vis()  # names of pages already extracted
    dict = tiqu.get_old_books()   # previously extracted statistics
    html_name = re.findall('([0-9]{5,11})', url2)  # the question ID in the URL
    print(html_name[0])
    html_name = html_name[0]
    if html_name + '.txt' + '\n' not in old_vis:
        sele(url=url2, name=html_name, dict=dict)
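
The scroll loop above stops once the page contains at least three "写回答" / "写第一个回答" buttons, which in Zhihu's layout at the time meant the answer list had been fully loaded. A more layout-independent stop condition, for reference, is to compare the document height between scrolls; a minimal sketch (the helper scroll_to_end is my own, not part of the original program):

import time


def scroll_to_end(driver, pause=1, max_rounds=1000):
    # Keep scrolling until the document height stops growing,
    # i.e. no new answers are being lazy-loaded any more.
    last_height = driver.execute_script("return document.body.scrollHeight")
    for _ in range(max_rounds):
        driver.execute_script("window.scrollTo(0, document.body.scrollHeight);")
        time.sleep(pause)
        new_height = driver.execute_script("return document.body.scrollHeight")
        if new_height == last_height:
            break
        last_height = new_height
    return driver.page_source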
 

Save the following code as tiqu.py:

import re
import os


# Read a saved page source and extract book titles from it
def read(html, dict, old_vis):
    # Skip files that have already been processed
    if html + '\n' in old_vis:
        return
    # Read the whole file into one string
    with open('./data/' + html, 'r', encoding='utf-8') as file:
        strr = file.read()
    find(strr, html, dict)


def find(strr, html, dict):
    books = re.findall('《(.*?)》', strr)  # book titles are wrapped in 《 》
    if books != []:
        tongji(books, dict)
        write_vis(html)  # mark this page as processed


# Tally the number of occurrences of each book title
def tongji(books, dict):
    for book in books:
        book = re.sub('<.*?>', "", book)  # strip leftover HTML tags
        if book not in dict.keys():
            dict[book] = 1
        else:
            dict[book] += 1
    # Sort by count, descending
    aps = sorted(dict.items(), key=lambda d: d[1], reverse=True)
    # Write the result to read2.txt
    write(aps)


def write(aps):
    print(aps)
    lend = len(aps)
    print("Currently " + str(lend) + " books")
    with open('./read2.txt', 'w', encoding='utf-8') as file:
        for i in aps:
            file.write(i[0] + "\t" + str(i[1]) + '\n')


# List all saved page sources under ./data
def all_html():
    list = os.listdir('./data')
    return list


# Load the previously extracted statistics
def get_old_books():
    old_dic_books = {}
    if not os.path.exists('./read2.txt'):
        return old_dic_books  # first run: nothing extracted yet
    with open('./read2.txt', 'r', encoding='utf-8') as file:
        list = file.readlines()
    # Turn each "title<TAB>count" line into a dict entry
    for book in list:
        try:
            old_dic_books[book.split('\t')[0]] = int(book.split('\t')[1].replace('\n', ''))
        except Exception as e:
            print(e)
    return old_dic_books


def write_vis(html):
    with open('./vis.txt', 'a', encoding='utf-8') as file:
        if html != "":
            file.write(html + '\n')
        else:
            file.write('')  # appending nothing still creates vis.txt if it is missing


# Load the names of pages already extracted
def get_old_vis():
    write_vis("")  # make sure vis.txt exists
    with open('./vis.txt', 'r', encoding='utf-8') as file:
        list = file.readlines()
    return list


if __name__ == "__main__":
    old_vis = get_old_vis()  # names of pages already extracted
    dict = get_old_books()   # previously extracted statistics
    list = all_html()        # all saved page sources under ./data
    for html in list:
        read(html, dict, old_vis)  # extract from each saved page
 

Run it to extract the parts of the answers that are book titles.
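
To make the extraction concrete: a title is whatever sits between the Chinese title marks 《 》, with embedded HTML tags stripped before counting. A small self-contained example (the sample text is invented for illustration):

import re

sample = "推荐《三体》和《<em>活着</em>》,《三体》值得一读。"
books = re.findall('《(.*?)》', sample)
books = [re.sub('<.*?>', '', b) for b in books]  # strip leftover HTML tags
print(books)  # ['三体', '活着', '三体'] -> read2.txt would record 三体\t2 and 活着\t1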

Author: 某天飞

Original article. When reposting, please credit: reposted from URl-team.

Permalink: Zhihu Crawler: Collecting All Answers to a Given Question

This article is part of the Tencent Cloud Self-Media Sharing Program and was shared from the author's personal site/blog. Originally published 2020-02-29. For infringement concerns, contact cloudcommunity@tencent.com for removal.
