Python爬虫，微信公众号话题标签内容采集打印PDF输出

二爷

发布于 2020-09-23 10:19:34

7230

文章被收录于专栏：二爷记

微信公众号内容采集比较怪异，其 post 参数需要花费时间去搞定。这里采集的是话题标签的内容，同时应用了 pdfkit 打印输出内容。

这里实现应用了两个版本，第一个是直接网页访问，其真实地址即post网址也存在比较多的参数，没有尝试过，获取到的内容仅有部分，比较不理想。第二个版本是采用了无头浏览器直接访问，获取到网页源码，进行解析，得到想要的内容。

本渣渣现在比较懒，代码都是拿以前的，现成的，复制，改改，直接使用的！

版本一：

#微信公众号内容获取打印pdf
#by 微信：huguo00289
#https://mp.weixin.qq.com/mp/homepage?__biz=MzA4NjQ3MDk4OA==&hid=5&sn=573b1b806f9ebf63171a56ee2936b883&devicetype=android-29&version=27001239&lang=zh_CN&nettype=WIFI&a=&session_us=gh_7d55ab2d943f&wx_header=1&fontScale=100&from=timeline&isappinstalled=0&scene=1&subscene=2&clicktime=1594602258&enterid=1594602258&ascene=14
# -*- coding: UTF-8 -*-
import requests
from fake_useragent import UserAgent
import os,re
import pdfkit


# Location of the wkhtmltopdf executable handed to pdfkit.
_WKHTMLTOPDF_EXE = r'D:\wkhtmltox-0.12.5-1.mxe-cross-win64\wkhtmltox\bin\wkhtmltopdf.exe'

# Shared pdfkit configuration object, used by the PDF-printing code in this script.
confg = pdfkit.configuration(wkhtmltopdf=_WKHTMLTOPDF_EXE)

class Du():
    """Scrape a WeChat official-account listing page over plain HTTP.

    The listing page is downloaded with ``requests``, article links are pulled
    out of the page source with regular expressions, and every article is
    rendered to a PDF file through ``pdfkit``.
    """

    # Characters that are replaced with "_" before a title is used as a file name.
    _UNSAFE_FILENAME_CHARS = r"[\/\\\:\*\?\"\<\>\|]"

    def __init__(self,furl):
        """Store the listing URL and build request headers with a random User-Agent.

        Args:
            furl: URL of the listing page whose articles should be collected.
        """
        ua=UserAgent()
        self.headers={
            "User-Agent": ua.random,
                      }
        self.url=furl

    @staticmethod
    def _first_match(pattern,text,what):
        """Return the first captured group of *pattern* in *text*.

        Raises:
            ValueError: if the pattern does not occur, naming *what* was being
                looked for so a layout change is easy to diagnose.
        """
        found=re.search(pattern,text,re.S)
        if found is None:
            raise ValueError(f"could not locate {what} in the page source")
        return found.group(1)

    def get_urls(self):
        """Fetch the listing page and return the article links found in it.

        Returns:
            A list of unique link strings, in the order they appear in the page.

        Raises:
            ValueError: if the ``var data={...}`` section is not present.
        """
        response=requests.get(self.url,headers=self.headers,timeout=8)
        html=response.content.decode('utf-8')
        req=self._first_match(r'var data={(.+?)if',html,'the "var data" section')
        links=re.findall(r',"link":"(.+?)",',req,re.S)

        # dict.fromkeys removes duplicates while keeping first-seen order, so the
        # articles are processed in page order instead of arbitrary set order.
        urls=list(dict.fromkeys(links))
        print(len(urls))

        return urls

    def get_content(self,url,category):
        """Download one article, save it as a PDF and return its HTML fragment.

        Args:
            url: Address of the article page.
            category: Directory that receives the generated PDF.

        Returns:
            ``<h1>title</h1>`` followed by the article body markup.

        Raises:
            ValueError: if the article wrapper, title or body cannot be found.
        """
        response = requests.get(url, headers=self.headers, timeout=8)
        print(response.status_code)
        html = response.content.decode('utf-8')
        req = self._first_match(
            r'<div id="img-content" class="rich_media_wrp">(.+?)var first_sceen__time',
            html,'the article wrapper')

        # Title: trimmed, then made safe for use as a file name.
        h1=self._first_match(
            r'<h2 class="rich_media_title" id="activity-name">(.+?)</h2>',
            req,'the article title')
        h1=h1.strip()
        h1 = re.sub(self._UNSAFE_FILENAME_CHARS, "_", h1)
        print(h1)

        # Body: everything between the content div and the following script tag.
        detail = self._first_match(
            r'<div class="rich_media_content " id="js_content" style="visibility: hidden;">(.+?)<script nonce=".+?" type="text/javascript">',
            req,'the article body')

        data = f'<h1>{h1}</h1>\n{detail}'

        self.dypdf(h1,data,category)

        return data

    def dypdf(self,h1,data,category):
        """Wrap *data* in a minimal UTF-8 HTML document and print it to PDF.

        Args:
            h1: Base name (without extension) of the PDF file.
            data: HTML fragment placed inside ``<body>``.
            category: Directory the PDF is written into.
        """
        datas = f'<html><head><meta charset="UTF-8"></head><body>{data}</body></html>'
        print("开始打印内容！")
        pdfkit.from_string(datas, f'{category}/{h1}.pdf', configuration=confg)
        print("打印保存成功！")



if __name__=='__main__':
    # Crawl every article linked from the listing page, save each one as its
    # own PDF, then save one merged PDF named after the category.
    furl="https://mp.weixin.qq.com/mp/homepage?__biz=MzA4NjQ3MDk4OA==&hid=5&sn=573b1b806f9ebf63171a56ee2936b883&devicetype=android-29&version=27001239&lang=zh_CN&nettype=WIFI&a=&session_us=gh_7d55ab2d943f&wx_header=1&fontScale=100&from=timeline&isappinstalled=0&scene=1&subscene=2&clicktime=1594602258&enterid=1594602258&ascene=14"
    category="潘通色卡（电子版）"
    os.makedirs(f'{category}/',exist_ok=True)
    spider=Du(furl)
    urls=spider.get_urls()
    pages=[]
    for url in urls:
        print(f">> 正在爬取链接：{url} ..")
        try:
            data=spider.get_content(url,category)
        except Exception as e:
            # Best effort: report the failure and skip this article, so that a
            # stale (or still undefined) `data` is never added to the merged output.
            print(f"爬取错误，错误代码为:{e}")
            continue

        pages.append(data)

    # Each fragment is preceded by a newline, matching the layout the merged
    # document had when it was built by repeated concatenation.
    datas=''.join(f'\n{page}' for page in pages)

    spider.dypdf(category,datas,category)

版本二：

#微信公众号内容获取打印pdf
#by 微信：huguo00289
#https://mp.weixin.qq.com/mp/homepage?__biz=MzA4NjQ3MDk4OA==&hid=5&sn=573b1b806f9ebf63171a56ee2936b883&devicetype=android-29&version=27001239&lang=zh_CN&nettype=WIFI&a=&session_us=gh_7d55ab2d943f&wx_header=1&fontScale=100&from=timeline&isappinstalled=0&scene=1&subscene=2&clicktime=1594602258&enterid=1594602258&ascene=14
# -*- coding: UTF-8 -*-
import requests
from selenium import webdriver
import os,re,time
import pdfkit
from bs4 import BeautifulSoup



# Location of the wkhtmltopdf executable handed to pdfkit.
_WKHTMLTOPDF_EXE = r'D:\wkhtmltox-0.12.5-1.mxe-cross-win64\wkhtmltox\bin\wkhtmltopdf.exe'

# Shared pdfkit configuration object, used by the PDF-printing code in this script.
confg = pdfkit.configuration(wkhtmltopdf=_WKHTMLTOPDF_EXE)

class wx():
    """Scrape a WeChat official-account listing page through a Chrome browser.

    Pages are loaded with Selenium so that the rendered source is available,
    article bodies are extracted with BeautifulSoup, and every article is
    rendered to a PDF file through ``pdfkit``.
    """

    # Characters that are replaced with "_" before a title is used as a file name.
    _UNSAFE_FILENAME_CHARS = r"[\/\\\:\*\?\"\<\>\|]"

    def __init__(self,furl):
        """Store the listing URL and start a Chrome session.

        Args:
            furl: URL of the listing page whose articles should be collected.
        """
        self.url = furl
        self.chrome_driver = r'C:\Users\Administrator\Desktop\chromedriver_win32\chromedriver.exe'  # location of the chromedriver executable
        try:
            self.browser = webdriver.Chrome(executable_path=self.chrome_driver)
        except TypeError:
            # Newer Selenium releases no longer accept the executable_path
            # keyword; the driver location is passed through a Service object.
            from selenium.webdriver.chrome.service import Service
            self.browser = webdriver.Chrome(service=Service(self.chrome_driver))

    def get_urls(self):
        """Open the listing page and return the article links found in it.

        Returns:
            A list with the ``href`` of every matching anchor, in page order.
        """
        # find_elements(By.XPATH, ...) replaces the find_elements_by_xpath
        # shortcut, which newer Selenium releases no longer provide.
        from selenium.webdriver.common.by import By

        self.browser.get(self.url)
        anchors=self.browser.find_elements(
            By.XPATH,"//div[@class='article_list']/a[@class='list_item js_post']")
        # Anchors without an href yield None and cannot be visited, so skip them.
        urls=[link for link in (a.get_attribute('href') for a in anchors) if link]

        print(len(urls))

        return urls

    def get_content(self,url,category):
        """Load one article, save it as a PDF and return its HTML fragment.

        Args:
            url: Address of the article page.
            category: Directory that receives the generated PDF.

        Returns:
            ``<h1>title</h1>`` followed by the article body markup.

        Raises:
            ValueError: if the title or the article body cannot be found.
        """
        self.browser.get(url)
        time.sleep(5)
        # Read the page source from the driver's page_source attribute.
        pageSource = self.browser.page_source
        soup=BeautifulSoup(pageSource,'lxml')

        # Title: trimmed, then made safe for use as a file name.
        found=re.search(r'<h2 class="rich_media_title" id="activity-name">(.+?)</h2>',pageSource,re.S)
        if found is None:
            raise ValueError("could not locate the article title in the page source")
        h1=found.group(1).strip()
        h1 = re.sub(self._UNSAFE_FILENAME_CHARS, "_", h1)
        print(h1)

        # Body: fail loudly instead of printing the literal text "None" to PDF.
        detail =soup.find('div',class_="rich_media_content")
        if detail is None:
            raise ValueError("could not locate the article body in the page source")
        detail=str(detail)
        # Banner markup that is stripped from the article body when present.
        del_text="""<section class="" style="margin-top: -1px; max-width: 100%; font-family: 微软雅黑; white-space: normal; min-height: 40px; visibility: visible; height: 40px; line-height: 40px; border-radius: 10px; text-align: center; box-shadow: rgb(190, 190, 190) 0px 3px 5px; color: rgb(255, 255, 255); box-sizing: border-box !important; word-wrap: break-word !important; background-image: none; background-attachment: scroll; background-color: rgb(245, 143, 198); background-position: 0% 0%; background-repeat: repeat;"><strong class="" style="max-width: 100%; box-sizing: border-box !important; word-wrap: break-word !important;"><span style="max-width: 100%; font-size: 14px; box-sizing: border-box !important; word-wrap: break-word !important;">↑ 点击上方<span style="max-width: 100%; box-sizing: border-box !important; word-wrap: break-word !important;">“染整百科”</span>关注我们</span></strong></section>"""

        detail=detail.replace(del_text,'')

        data = f'<h1>{h1}</h1>\n{detail}'

        self.dypdf(h1,data,category)

        return data

    def dypdf(self,h1,data,category):
        """Wrap *data* in a minimal UTF-8 HTML document and print it to PDF.

        Args:
            h1: Base name (without extension) of the PDF file.
            data: HTML fragment placed inside ``<body>``.
            category: Directory the PDF is written into.
        """
        datas = f'<html><head><meta charset="UTF-8"></head><body>{data}</body></html>'
        print("开始打印内容！")
        pdfkit.from_string(datas, f'{category}/{h1}.pdf', configuration=confg)
        print("打印保存成功！")

    def quit(self):
        """Close the browser session."""
        self.browser.quit()


if __name__=='__main__':
    # Crawl every article linked from the listing page, save each one as its
    # own PDF, then save one merged PDF named after the category.
    furl="https://mp.weixin.qq.com/mp/homepage?__biz=MzA4NjQ3MDk4OA==&hid=5&sn=573b1b806f9ebf63171a56ee2936b883&devicetype=android-29&version=27001239&lang=zh_CN&nettype=WIFI&a=&session_us=gh_7d55ab2d943f&wx_header=1&fontScale=100&from=timeline&isappinstalled=0&scene=1&subscene=2&clicktime=1594602258&enterid=1594602258&ascene=14"
    category="潘通色卡（电子版）"
    os.makedirs(f'{category}/',exist_ok=True)
    spider=wx(furl)
    pages=[]
    try:
        urls=spider.get_urls()
        for url in urls:
            print(f">> 正在爬取链接：{url} ..")
            try:
                data=spider.get_content(url,category)
            except Exception as e:
                # Best effort: report the failure and skip this article, so that a
                # stale (or still undefined) `data` is never added to the merged output.
                print(f"爬取错误，错误代码为:{e}")
                continue

            pages.append(data)
    finally:
        # Close the browser even when collecting the links fails.
        spider.quit()

    # Each fragment is preceded by a newline, matching the layout the merged
    # document had when it was built by repeated concatenation.
    datas=''.join(f'\n{page}' for page in pages)

    spider.dypdf(category,datas,category)

本文参与腾讯云自媒体同步曝光计划，分享自微信公众号。

原始发表：2020-09-12，如有侵权请联系 cloudcommunity@tencent.com 删除

http

本文分享自 Python与SEO学习微信公众号，前往查看

如有侵权，请联系 cloudcommunity@tencent.com 删除。

本文参与腾讯云自媒体同步曝光计划，欢迎热爱写作的你一起参与！

http

登录后参与评论

0 条评论

热度

Python爬虫，微信公众号话题标签内容采集打印PDF输出

Python爬虫，微信公众号话题标签内容采集打印PDF输出

社区

活动

圈层

关于

腾讯云开发者

热门产品

热门推荐

更多推荐