汽车之家新闻:
import requests
from bs4 import BeautifulSoup
import os

# Fetch the Autohome news listing page.
r1 = requests.get(url='https://www.autohome.com.cn/news/')
r1.encoding = 'gbk'  # the page is GBK-encoded; set encoding to avoid mojibake

# Parse the response body with bs4.
soup = BeautifulSoup(r1.text, 'html.parser')

# Article list container, then every <li> inside it.
container = soup.find(name='div', attrs={'id': 'auto-channel-lazyload-article'})
li_list = container.find_all(name='li')

# Ensure the output directory exists up front — the original crashed with
# FileNotFoundError on open() when 'imgs' was missing.
os.makedirs('imgs', exist_ok=True)

for tag in li_list:
    # Only <li> entries that carry an <h3> title are real articles;
    # skip separators/ads.
    title = tag.find(name='h3')
    if not title:
        continue
    # Article summary paragraph.
    summary = tag.find(name='p')
    # Article URL — href is protocol-relative ("//www.autohome..."), so
    # prepend the scheme.
    a = tag.find(name='a')
    url = "https:" + a.attrs.get("href")
    # Thumbnail URL; guard against a missing <img> or src attribute, which
    # previously raised TypeError on the string concatenation.
    img = tag.find(name='img')
    if not img or not img.get("src"):
        continue
    img_url = "https:" + img.get("src")
    print('标题:', title.text)
    print('简介:', summary.text)
    print('url:', url)
    print('图片地址:', img_url)
    print('-' * 100)
    # Download the thumbnail and save it under imgs/<basename>.
    r2 = requests.get(url=img_url)
    file_name = img_url.rsplit('/', maxsplit=1)[1]
    file_path = os.path.join('imgs', file_name)
    with open(file_path, 'wb') as f:
        f.write(r2.content)
抽屉新热榜新闻:
import requests
import os
from bs4 import BeautifulSoup

# Chouti returns 403 without a browser-like User-Agent header.
r1 = requests.get(
    url='https://dig.chouti.com/',
    headers={
        'user-agent': 'Mozilla/5.0 (Macintosh; Intel Mac OS X 10_13_6) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/68.0.3440.106 Safari/537.36'
    }
)
soup = BeautifulSoup(r1.text, 'html.parser')

# Content area holding the hot-list entries.
container = soup.find(name='div', attrs={'class': "content-list"})
div_list = container.find_all(name='div')

# Create the thumbnail directory up front — the original crashed with
# FileNotFoundError on open() when 'img2' was missing.
os.makedirs("img2", exist_ok=True)

for tag in div_list:
    # Only divs with both a title link and a summary are real entries.
    title = tag.find(name="a", attrs={'class': "show-content color-chag"})
    if not title:
        continue
    summary = tag.find(name="div", attrs={"class": "area-summary"})
    if not summary:
        continue
    # Entry URL — href is protocol-relative, so prepend the scheme.
    a = tag.find(name="a")
    url = "https:" + a.attrs.get("href")
    # Thumbnail source lives in the "original" attribute (lazy loading);
    # it may be absent, so check before building the URL.
    img = tag.find("img")
    img_url = img.get("original")
    if not img_url:
        continue
    img_url = "https:" + img_url
    print(img_url)
    # Download the thumbnail and save it under img2/<basename>.
    r2 = requests.get(url=img_url)
    file_name = img_url.rsplit("/", maxsplit=1)[1]
    file_path = os.path.join("img2", file_name)
    with open(file_path, 'wb') as f:
        f.write(r2.content)
登录 GitHub:
import requests
from bs4 import BeautifulSoup  # hoisted: was imported mid-script

# Step 1: GET the login page to obtain the session cookies and the
# CSRF token GitHub requires for the POST.
r1 = requests.get(
    url="https://github.com/login"
)
r1_cookie_dict = r1.cookies.get_dict()

# Step 2: parse the hidden authenticity_token out of the login form.
s1 = BeautifulSoup(r1.text, "html.parser")
token_input = s1.find(name="input", attrs={"name": "authenticity_token"})
if token_input is None:
    # Fail loudly instead of an opaque AttributeError — the page layout
    # may have changed, or the request was blocked.
    raise RuntimeError("authenticity_token input not found on login page")
token = token_input.get("value")
print(token)

# Step 3: POST the credentials plus the CSRF token, replaying the
# cookies from step 1. NOTE: fill in "login"/"password" before running.
r2 = requests.post(
    url="https://github.com/session",
    data={
        "commit": "Sign in",
        "utf8": "✓",
        "authenticity_token": token,
        "login": "",
        "password": ""
    },
    cookies=r1_cookie_dict
)
print(r2.text)