
20.multi_ Scraping the Total Read Count with Coroutines

hankleo
Published 2020-09-17 (originally published 2019-05-06 on the author's personal blog)
From the column: Hank's Blog
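
The script below first walks the author's Jianshu list pages synchronously with requests + lxml to collect every article link. It then puts the deduplicated links on an asyncio.Queue and spawns ten aiohttp worker coroutines that download each article, pull its views_count out of the page, and append it to a shared list, while a reporter coroutine prints the running total every few seconds.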
Code language: python
# Scrape the blog's total read count with asyncio and aiohttp
# (hint: first collect the link to each article)
# https://www.jianshu.com/u/130f76596b02

import re
import asyncio
import aiohttp
import requests
import ssl
from lxml import etree
from asyncio.queues import Queue
from aiosocksy import Socks5Auth
from aiosocksy.connector import ProxyConnector, ProxyClientRequest


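# Shared state: a task queue of article URLs and a plain list that
# collects the per-article read counts.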
class Common:
    task_queue = Queue()
    result_queue = Queue()
    result_queue_1 = []


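# Perform a single GET, optionally through a SOCKS proxy, and return
# the response body and status code.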
async def session_get(session, url, socks):
    auth = Socks5Auth(login='...', password='...')
    headers = {'User-Agent': 'Mozilla/4.0 (compatible; MSIE 5.5; Windows NT)'}
    timeout = aiohttp.ClientTimeout(total=20)
    response = await session.get(
        url,
        proxy=socks,
        proxy_auth=auth,
        timeout=timeout,
        headers=headers,
        ssl=ssl.SSLContext()
    )
    return await response.text(), response.status


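# Download a page; if the response is the short "window.kk" anti-scraping
# stub, follow the real URL embedded in it.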
async def download(url):
    connector = ProxyConnector()
    socks = None
    async with aiohttp.ClientSession(
            connector=connector,
            request_class=ProxyClientRequest
    ) as session:
        ret, status = await session_get(session, url, socks)
        if 'window.kk' in ret and len(ret) < 1000:
            url = ret.split("window.kk='")[1].split("'")[0]
            ret, status = await session_get(session, url, socks)
        return ret, status


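# Pull the integer after "views_count": out of the page source.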
async def parse_html(content):
    read_num_pattern = re.compile(r'"views_count":\d+')
    read_num = int(read_num_pattern.findall(content)[0].split(':')[-1])
    return read_num


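# Synchronously walk the author's 20 list pages with requests + lxml
# and collect every article link.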
def get_all_article_links():
    links_list = []
    for i in range(1, 21):
        url = 'https://www.jianshu.com/u/130f76596b02?order_by=shared_at&page={}'.format(
            i)
        header = {
            'Accept': 'text/html,application/xhtml+xml,application/xml;q=0.9,image/webp,*/*;q=0.8',
            'User-Agent': 'Mozilla/5.0 (Windows NT 10.0; WOW64) AppleWebKit/537.36 '
            '(KHTML, like Gecko) Chrome/58.0.3029.110 Safari/537.36'}
        response = requests.get(url,
                                headers=header,
                                timeout=5
                                )
        tree = etree.HTML(response.text)
        article_links = tree.xpath(
            '//div[@class="content"]/a[@class="title"]/@href')
        for item in article_links:
            article_link = 'https://www.jianshu.com' + item
            links_list.append(article_link)
            print(article_link)
    return links_list

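# Worker coroutine: drain the task queue, downloading and parsing each
# URL with up to 3 retries, and record each read count.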
async def down_and_parse_task(queue):
    while True:
        try:
            url = queue.get_nowait()
        except asyncio.QueueEmpty:
            return
        error = None
        for retry_cnt in range(3):
            try:
                html, status = await download(url)
                if status != 200:
                    html, status = await download(url)
                read_num = await parse_html(html)
                print(read_num)
                # await Common.result_queue.put(read_num)
                Common.result_queue_1.append(read_num)
                break
            except Exception as e:
                error = e
                await asyncio.sleep(0.2)
                continue
        else:
            raise error

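# Reporter coroutine: periodically print the collected counts and their
# running total.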
async def count_sum():
    while True:
        try:
            print(Common.result_queue_1)
            print('Total read count =', sum(Common.result_queue_1))
            await asyncio.sleep(3)
        except Exception:
            pass

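# Fill the task queue with deduplicated links, then spawn ten workers
# and a single reporter.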
async def main():
    all_links = get_all_article_links()
    for item in set(all_links):
        await Common.task_queue.put(item)
    for _ in range(10):
        loop.create_task(down_and_parse_task(Common.task_queue))
    loop.create_task(count_sum())

if __name__ == '__main__':
    loop = asyncio.get_event_loop()
    loop.create_task(main())
    loop.run_forever()
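
If you only need the core fetch-and-sum pattern, here is a minimal sketch that skips the SOCKS proxy machinery and the window.kk redirect handling above. It assumes Python 3.7+ and aiohttp 3.x, that the pages still embed a "views_count" field, and that no proxy is required; the function names fetch_views/main and the URL in __main__ are placeholders of mine, not part of the original script.

Code language: python

import asyncio
import re

import aiohttp

# Same pattern as the full script, with a capture group for the number.
VIEWS_RE = re.compile(r'"views_count":(\d+)')


async def fetch_views(session, url):
    # Download one article page and extract its read count; 0 if the field is missing.
    async with session.get(url, timeout=aiohttp.ClientTimeout(total=20)) as response:
        text = await response.text()
    match = VIEWS_RE.search(text)
    return int(match.group(1)) if match else 0


async def main(urls):
    headers = {'User-Agent': 'Mozilla/5.0'}
    async with aiohttp.ClientSession(headers=headers) as session:
        counts = await asyncio.gather(*(fetch_views(session, u) for u in urls))
    print('Total read count =', sum(counts))


if __name__ == '__main__':
    # Placeholder URL; in practice pass the list returned by get_all_article_links().
    asyncio.run(main(['https://www.jianshu.com/p/xxxxxxxx']))

asyncio.gather is the simpler fit when the URL list is small and every request can run at once; the queue-plus-workers pattern in the full script pays off when you want to cap concurrency at a fixed number of workers and retry each task independently.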