
Scraping Autohome / Chouti Hot List / Jandan

Author: 超蛋lhy
Published 2018-08-31 16:37:12
From the column: Pythonista

Autohome news:

import os

import requests
from bs4 import BeautifulSoup

# Send the request as a browser would
r1 = requests.get(url='https://www.autohome.com.cn/news/')
# print(r1)  # <class 'requests.models.Response'>
r1.encoding = 'gbk'  # the page is gbk-encoded; fixes garbled text
# print(r1.text)     # response body as text
# print(r1.content)  # response body as bytes

# Parse the response body with bs4
soup = BeautifulSoup(r1.text, 'html.parser')
# print(soup)

# Locate the article container, then its <li> items
container = soup.find(name='div', attrs={'id': 'auto-channel-lazyload-article'})
li_list = container.find_all(name='li')
# print(type(li_list))  # <class 'bs4.element.ResultSet'>

os.makedirs('imgs', exist_ok=True)
for tag in li_list:
    # Keep only the <li> items that carry an <h3> title
    title = tag.find(name='h3')
    if not title:
        continue
    # Summary
    summary = tag.find(name='p')
    # Article URL: `a` is a Tag whose attributes behave like a dict
    a = tag.find(name='a')
    url = "https:" + a.attrs.get("href")
    # Image URL
    img = tag.find(name='img')
    img_url = "https:" + img.get("src")
    print('Title:', title.text)
    print('Summary:', summary.text)
    print('URL:', url)
    print('Image URL:', img_url)
    print('-' * 100)

    # Save the image (must happen inside the loop,
    # otherwise only the last image is written)
    r2 = requests.get(url=img_url)
    file_name = img_url.rsplit('/', maxsplit=1)[1]
    file_path = os.path.join('imgs', file_name)
    with open(file_path, 'wb') as f:
        f.write(r2.content)
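The file-naming step above can be sketched in isolation: the last path segment of the image URL becomes the local file name. The URL below is a made-up example, not a real Autohome address.

```python
import os

# rsplit('/', maxsplit=1) splits once from the right:
# ['https://img.autohome.com.cn/news/2018', 'cover.jpg']
img_url = 'https://img.autohome.com.cn/news/2018/cover.jpg'
file_name = img_url.rsplit('/', maxsplit=1)[1]
file_path = os.path.join('imgs', file_name)
print(file_path)
```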

Chouti hot-list news:

import os

import requests
from bs4 import BeautifulSoup

# Without a User-Agent header, chouti rejects the request with a 403
r1 = requests.get(
    url='https://dig.chouti.com/',
    headers={
        'user-agent': 'Mozilla/5.0 (Macintosh; Intel Mac OS X 10_13_6) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/68.0.3440.106 Safari/537.36'
    }
)

soup = BeautifulSoup(r1.text, 'html.parser')
# Content area
container = soup.find(name='div', attrs={'class': "content-list"})
div_list = container.find_all(name='div')
os.makedirs("img2", exist_ok=True)
for tag in div_list:
    title = tag.find(name="a", attrs={'class': "show-content color-chag"})
    if not title:
        continue
    summary = tag.find(name="div", attrs={"class": "area-summary"})
    if not summary:
        continue
    a = tag.find(name="a")
    url = "https:" + a.attrs.get("href")
    img = tag.find("img")
    # The real image URL sits in the lazy-load attribute 'original';
    # it may be None, so check before using it
    img_url = img.get("original")
    if not img_url:
        continue
    img_url = "https:" + img_url
    print(img_url)

    # Download the thumbnail
    r2 = requests.get(
        url=img_url
    )
    file_name = img_url.rsplit("/", maxsplit=1)[1]
    file_path = os.path.join("img2", file_name)
    with open(file_path, 'wb') as f:
        f.write(r2.content)
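The `original` attribute lookup above can be shown offline. Lazy-loaded `<img>` tags keep the real image URL in a non-standard attribute (here `original`) while `src` holds a placeholder; falling back to `src` also covers tags rendered without lazy loading. The HTML fragment below is an illustrative stand-in for chouti's markup.

```python
from bs4 import BeautifulSoup

html = '<img src="placeholder.gif" original="//img3.chouti.com/pic.jpg">'
img = BeautifulSoup(html, 'html.parser').find('img')
# Prefer the lazy-load attribute, fall back to the normal src
img_url = img.get('original') or img.get('src')
print(img_url)
```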

Logging in to GitHub

import requests
from bs4 import BeautifulSoup

# Fetch the GitHub login page and keep its cookies
r1 = requests.get(
    url="https://github.com/login"
)
r1_cookie_dict = r1.cookies.get_dict()
# print(r1.text)

# Parse the page with bs4
s1 = BeautifulSoup(r1.text, "html.parser")

# Extract the CSRF token from the login form
token = s1.find(name="input", attrs={"name": "authenticity_token"}).get("value")
print(token)
r2 = requests.post(
    url="https://github.com/session",
    data={
        "commit": "Sign in",
        "utf8": "✓",
        "authenticity_token": token,
        "login": "",      # fill in your username
        "password": ""    # fill in your password
    },
    cookies=r1_cookie_dict
)
print(r2.text)
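The manual cookie plumbing above (`r1.cookies.get_dict()` passed back via `cookies=`) can be replaced by `requests.Session()`, which persists cookies across requests automatically. A minimal sketch, assuming the same form fields as above; the helper name and its parameters are illustrative, not from the original post.

```python
import requests
from bs4 import BeautifulSoup

def github_login(username, password):
    # A Session resends any cookies collected by earlier requests
    session = requests.Session()
    r1 = session.get("https://github.com/login")
    token = BeautifulSoup(r1.text, "html.parser").find(
        name="input", attrs={"name": "authenticity_token"}
    ).get("value")
    # No cookies= argument needed: the session carries them along
    return session.post(
        "https://github.com/session",
        data={
            "commit": "Sign in",
            "utf8": "✓",
            "authenticity_token": token,
            "login": username,
            "password": password,
        },
    )
```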
This article is part of the Tencent Cloud self-media sharing program, shared from the author's personal site/blog. Originally published 2018-08-30.

