Python requests bs4 csv
import requests
import bs4
import unicodecsv as csv
# Scrape title, rating and release year for every film on the Douban Top 250
# list and save the rows to top_250.csv.

# Browser-like User-Agent header sent with every request.
header = {
    "User-Agent": "Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) "
                  "Chrome/90.0.4430.93 Safari/537.36 "
}

# Accumulates one [title, rating, year] row per film.
result = []

# Build each page URL: the list is paginated through the `start` query
# parameter, 25 films per page, 10 pages in total.
for start in range(0, 250, 25):
    html = requests.get(
        'https://movie.douban.com/top250?start=' + str(start),
        headers=header,
        timeout=10,  # do not hang forever on a stalled connection
    )
    # Fail loudly on an HTTP error instead of silently parsing an error page
    # and ending up with an empty or partial CSV.
    html.raise_for_status()
    # Set the encoding before the first access to .text so that both the
    # debug print and the parser see the same decoded body.
    html.encoding = 'utf-8'
    # Printing the response object itself only shows "<Response [200]>";
    # use .text or .content to get the body.
    print(html.text)

    soup = bs4.BeautifulSoup(html.text, 'html.parser')
    for item in soup.find_all('div', 'info'):
        title = item.div.span.string
        # The third child node of the first <p> inside div.bd is the text
        # line whose first four non-blank characters are taken as the year.
        yearline = item.find('div', 'bd').p.contents[2].string
        year = yearline.replace('\n', '').replace(' ', '')[0:4]
        rating = item.find('span', {'class': 'rating_num'}).get_text()
        result.append([title, rating, year])

print(result)

# The file is opened in binary mode for the unicodecsv writer; the `with`
# block closes it, so no explicit close() is needed.
with open('top_250.csv', 'wb') as f:
    w = csv.writer(f)
    w.writerows(result)
完整代码
# Import requests, bs4 and the CSV writer (unicodecsv)
import requests
import bs4
import unicodecsv as csv
# Scrape title, rating and release year for every film on the Douban Top 250
# list and save the rows to top_250.csv.

# Accumulates one [title, rating, year] row per film.
result = []

# Browser-like User-Agent header sent with every request.
header = {
    "User-Agent": "Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) "
                  "Chrome/90.0.4430.93 Safari/537.36 "
}

# Build each page URL: the list is paginated through the `start` query
# parameter, 25 films per page, 10 pages in total. Iterating over the offsets
# directly replaces the separate manual counter and the unused loop index.
for start in range(0, 250, 25):
    html = requests.get(
        'https://movie.douban.com/top250?start=' + str(start),
        headers=header,
        timeout=10,  # do not hang forever on a stalled connection
    )
    # Fail loudly on an HTTP error instead of silently parsing an error page
    # and ending up with an empty or partial CSV.
    html.raise_for_status()
    # Set the encoding before the first access to .text so that both the
    # debug print and the parser see the same decoded body.
    html.encoding = 'utf-8'
    # Printing the response object itself only shows "<Response [200]>";
    # use .text or .content to get the body.
    print(html.text)

    soup = bs4.BeautifulSoup(html.text, 'html.parser')
    for item in soup.find_all('div', 'info'):
        title = item.div.span.string
        # The third child node of the first <p> inside div.bd is the text
        # line whose first four non-blank characters are taken as the year.
        yearline = item.find('div', 'bd').p.contents[2].string
        year = yearline.replace('\n', '').replace(' ', '')[0:4]
        rating = item.find('span', {'class': 'rating_num'}).get_text()
        result.append([title, rating, year])

print(result)

# The file is opened in binary mode for the unicodecsv writer; the `with`
# block closes it, so no explicit close() is needed.
with open('top_250.csv', 'wb') as f:
    w = csv.writer(f)
    w.writerows(result)
付费内容
版权属于:Cyril
本文链接:https://cloud.tencent.com/developer/article/1858304
转载时须注明出处及本声明