While scraping the Golden Pin Design Award winners with Python, I found that no matter how I changed the request headers, the JSON response never changed, yet opening the page by hand did load different JSON data. Then it occurred to me to use a session, and the data came right out!
Website:
http://www.goldenpin.org.tw
The Golden Pin Design Award, run by the Taiwan Design Center, has a 35-year history and is Taiwan's oldest, most authoritative, and best-known professional design competition. In 2014 it opened entry to the global Chinese-speaking market for the first time (adding mainland China, Hong Kong, Macao, Singapore, and Malaysia); thousands of companies have competed and entries have accumulated into the tens of thousands. The media have dubbed it "the top design award of the global Chinese-speaking market" and "the Golden Horse Award of the design world".
Target URL:
http://www.goldenpin.org.tw/金點設計獎/?y=2019
Captured traffic (screenshots omitted):
The AJAX request that loads the paginated data.
The JSON response that carries the HTML page data.
The request headers were identical for every capture; no matter which year I browsed, nothing differed except the page number used for pagination.
Changing the headers, filling in the Referer, even reproducing the complete header set never produced different data.
The answer was a requests session: first visit the year's award page, then pull the HTML through the JSON endpoint.
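The essence of the fix, as a minimal sketch (the postID and nonce values are the ones captured from the browser, as used in the steps below; the full version follows):

import requests

s = requests.Session()
headers = {'User-Agent': 'Mozilla/5.0'}  # any browser-like UA will do here
# 1) Visit the year page first; the Session stores whatever cookies the server sets
s.get('http://www.goldenpin.org.tw/金點設計獎/?y=2019', headers=headers, timeout=10)
# 2) Call the AJAX endpoint from the SAME session; the reply now matches that year
data = {'action': 'presscore_template_ajax', 'postID': '15317', 'paged': '1',
        'targetPage': 1, 'nonce': '1f3d287a9a', 'contentType': 'portfolio',
        'pageData[type]': 'page', 'pageData[template]': 'portfolio',
        'pageData[layout]': 'masonry', 'sender': 'more'}
r = s.post('http://www.goldenpin.org.tw/ft-admin/admin-ajax.php', data=data, headers=headers, timeout=10)
print(r.json()['html'][:200])  # year-specific markup at last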
How to use a requests session
While working with the requests library I ran into the question of how to set cookies on a session so that they apply to every request. A major feature of the requests Session object is that it manages cookies for you automatically: when you log in to a page, it recognizes the Set-Cookie headers in the response and keeps those cookies alive for all the requests that follow.
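You can see that automatic handling in a few lines (a minimal sketch; httpbin.org is just a neutral echo service, unrelated to the target site):

import requests

s = requests.Session()
# httpbin's /cookies/set endpoint answers with a Set-Cookie header, which the Session stores
s.get('https://httpbin.org/cookies/set?demo=1', timeout=10)
# the next request through the same Session sends the stored cookie back automatically
r = s.get('https://httpbin.org/cookies', timeout=10)
print(r.json())  # {'cookies': {'demo': '1'}}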
There are two ways to add cookies:
The first is to write the cookies as a dict, then convert the dict to a cookiejar:
s = requests.Session()  # start a Session
cookie_dict = {'49BAC005-7D5B-4231-8CEA-16939BEACD67': 'cktest001',  # cookie values taken from the Chrome browser
               'JSESSIONID': 'F4FFF69B8XXXXXXC8DCB4C061C0',
               'JSESSIONIDSSO': '9D49C76FD6XXXXXF294242B44A'}
s.cookies = requests.utils.cookiejar_from_dict(cookie_dict, cookiejar=None, overwrite=True)  # convert the dict to a cookiejar and hand it to the Session
# Note: this replaces whatever cookies the Session already had
The second is to append cookies:
s = requests.Session()  # start a Session
jar = requests.cookies.RequestsCookieJar()  # create a CookieJar object
jar.set('49BAC005-7D5B-4231-8CEA-1XXXXBEACD67', 'cktXXXX001')  # add cookie values to the jar
jar.set('JSESSIONID', 'F4FFF69B8CXXXX80F0C8DCB4C061C0')
jar.set('JSESSIONIDSSO', '9D49C7XXXX448FDF5B0F294242B44A')
s.cookies.update(jar)  # merge the cookies into the Session
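Unlike the first method, update() merges the new values on top of any cookies the Session already holds instead of replacing them. Either way, you can inspect the result:

print(requests.utils.dict_from_cookiejar(s.cookies))  # every cookie the Session will now send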
Now let's put this to work on the Golden Pin Design Award scraper.
Step 1: build the list of URLs to scrape:
def get_url():
    urls = []
    categorys = ["金點設計獎", "金點概念設計獎", "金點新秀設計獎"]
    years = ["2019", "2018", "2017", "2016", "2015"]
    for category in categorys:
        cate_gory = urllib.parse.quote(category)  # percent-encode the Chinese category name for use in the URL
        for year in years:
            url = f"http://www.goldenpin.org.tw/{cate_gory}/?y={year}"
            print(url)
            urls.append(url)
    print(len(urls))
    return urls
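This yields 3 categories × 5 years = 15 URLs, one per award/year combination.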
Step 2: randomized request headers:
def ua():
    ua = UserAgent()
    headers = {"User-Agent": ua.random}  # a freshly randomized User-Agent on every call
    return headers
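One caveat: fake_useragent downloads its list of browser strings over the network the first time it is used, so UserAgent() itself can occasionally fail; if that happens, falling back to a fixed User-Agent string should work just as well here, since this site does not appear to discriminate on headers anyway.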
Step 3: visit the year's landing page with requests to obtain the session:
def get_session(furl):
    s = requests.session()
    s.mount('http://', HTTPAdapter(max_retries=3))  # retry failed connections up to 3 times
    s.mount('https://', HTTPAdapter(max_retries=3))
    try:
        fresponse = s.get(furl, headers=ua(), timeout=10)
        print(fresponse.status_code)
    except requests.exceptions.RequestException as e:
        print(f'Failed to open the link, error: {e}')
        with open('furl_spider.txt', 'a+', encoding='utf-8') as f:
            f.write(f'{furl} - failed to open the link, error: {e}\n')
    time.sleep(2)
    return s
Step 4: fetch the JSON data:
def get_req(s, i):
    url = "http://www.goldenpin.org.tw/ft-admin/admin-ajax.php"
    data = {
        'action': 'presscore_template_ajax',
        'postID': '15317',
        'paged': '1',
        'targetPage': i,
        'term': '',
        'orderby': '',
        'order': '',
        'nonce': '1f3d287a9a',  # nonce captured from the browser session; it can expire
        'contentType': 'portfolio',
        'pageData[type]': 'page',
        'pageData[template]': 'portfolio',
        'pageData[layout]': 'masonry',
        'sender': 'more',
    }
    response = s.post(url, data=data, headers=ua(), timeout=10)
    print(response.status_code)
    if response.status_code == 200:
        req = response.content.decode('utf-8')
        html = json.loads(req)['html']  # the rendered entry markup sits in the JSON 'html' field
        h3s = re.findall(r'<h3 class="entry-title"><a target="_blank" href="(.+?)" title="(.+?)" rel', html, re.S)
        print(len(h3s))
        for h3 in h3s:
            title = h3[1]
            href = h3[0]
            item = title, href
            print(item)
            get_content(href)
            time.sleep(2)
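Note that the regular expression is pinned to the theme's exact attribute order (target="_blank", then href, then title, then rel), so a template change would make it silently match nothing; the printed len(h3s) doubles as a sanity check for exactly that.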
Fetching the data for each winning entry:
def get_content(url):
    # url = "http://www.goldenpin.org.tw/project/ps63/"
    response = requests.get(url, headers=ua(), timeout=10).content.decode('utf-8')
    time.sleep(2)
    html = etree.HTML(response)
    category = html.xpath('//ol[@class="breadcrumbs text-small"]/a[1]/text()')[0]  # award category, from the breadcrumb
    print(category)
    year = html.xpath('//ol[@class="breadcrumbs text-small"]/a[2]/text()')[0]  # award year
    print(year)
    subclassification = html.xpath('//ol[@class="breadcrumbs text-small"]/text()')[0]  # sub-category (bare text node in the breadcrumb)
    print(subclassification)
    h1 = html.xpath('//h1[@class="entry-title"]/text()')[0]
    h1 = re.sub(r'[\|\/\<\>\:\*\?\'\\"]', "_", h1)  # replace characters that are illegal in file names
    print(h1)  # the entry title
    path = f"{year}/{category}/{subclassification}/"
    os.makedirs(path, exist_ok=True)  # create the output directory
    descriptions = html.xpath('//div[@class="wpb_wrapper"]//text()')
    description = ''.join(descriptions)
    texts = '%s%s%s' % (h1, '\n', description)
    print(description)
    imgs = html.xpath('//div[@class="fancy-media-wrap layzr-bg"]/img[@class="lazy-load preload-me"]/@data-src')  # the real image URLs live in data-src because of lazy loading
    down(h1, imgs, path, texts)
The download module:
def down(h1, imgs, path, texts):
    try:
        print(f'>>> Saving {h1}.txt ..')
        with open(f'{path}/{h1}.txt', 'w', encoding='utf-8') as f:
            f.write(texts)
        print(f'>>> Saved {h1}.txt!')
    except Exception as e:
        print(f'Failed to save the detail text, error: {e}')
        with open(f'{path}/text_spider.txt', 'a+', encoding='utf-8') as f:
            f.write(f'{h1},{texts} - failed to save the detail text, error: {e}\n')
    i = 1
    for img_url in imgs:
        suffix = os.path.splitext(img_url)[1]  # keep the original file extension
        img_name = '%s%s%d%s' % (h1, '_', i, suffix)
        print(f'>>> Downloading {img_name} ..')
        s = requests.Session()
        s.mount('http://', HTTPAdapter(max_retries=3))
        s.mount('https://', HTTPAdapter(max_retries=3))
        try:
            r = s.get(img_url, timeout=20, headers=ua())
            with open(f'{path}/{img_name}', 'wb') as f:
                f.write(r.content)
            print(f'>>> Finished downloading {img_name}!')
        except requests.exceptions.RequestException as e:
            print(f'Failed to download the image, error: {e}')
            with open(f'{path}/img_spider.txt', 'a+', encoding='utf-8') as f:
                f.write(f'{img_url},{img_name},{path} - failed to download the image, error: {e}\n')
        time.sleep(1)
        i += 1
Finally, create a main() function to run the spider:
def main():
    urls = get_url()
    for furl in urls:
        print(f'>>> Scraping {furl} ...')
        try:
            s = get_session(furl)
            for i in range(5, 6):  # page range to fetch; range(5, 6) grabs only page 5, widen it to walk more pages
                try:
                    get_req(s, i)
                except Exception as e:
                    print(f'Failed to fetch page data, error: {e}')
                    with open('json_spider.txt', 'a+', encoding='utf-8') as f:
                        f.write(f'{furl},{i} - failed to fetch page data, error: {e}\n')
        except Exception as e:
            print(f'Failed to fetch page data, error: {e}')
            with open('spider.txt', 'a+', encoding='utf-8') as f:
                f.write(f'{furl} - failed to fetch page data, error: {e}\n')
        time.sleep(1)
    time.sleep(5)
Run results: (screenshot omitted)
The complete source code:
# -*- coding: utf-8 -*-
# Golden Pin Design Award scraper
# 2020-01-03  by WeChat: huguo00289
import requests
from fake_useragent import UserAgent
import json, re, os, time
from lxml import etree
from requests.adapters import HTTPAdapter  # HTTPAdapter adds per-protocol retry support
import urllib.parse
def ua():
    ua = UserAgent()
    headers = {"User-Agent": ua.random}  # a freshly randomized User-Agent on every call
    return headers
def get_session(furl):
    s = requests.session()
    s.mount('http://', HTTPAdapter(max_retries=3))  # retry failed connections up to 3 times
    s.mount('https://', HTTPAdapter(max_retries=3))
    try:
        fresponse = s.get(furl, headers=ua(), timeout=10)
        print(fresponse.status_code)
    except requests.exceptions.RequestException as e:
        print(f'Failed to open the link, error: {e}')
        with open('furl_spider.txt', 'a+', encoding='utf-8') as f:
            f.write(f'{furl} - failed to open the link, error: {e}\n')
    time.sleep(2)
    return s
def get_req(s, i):
    url = "http://www.goldenpin.org.tw/ft-admin/admin-ajax.php"
    data = {
        'action': 'presscore_template_ajax',
        'postID': '15317',
        'paged': '1',
        'targetPage': i,
        'term': '',
        'orderby': '',
        'order': '',
        'nonce': '1f3d287a9a',  # nonce captured from the browser session; it can expire
        'contentType': 'portfolio',
        'pageData[type]': 'page',
        'pageData[template]': 'portfolio',
        'pageData[layout]': 'masonry',
        'sender': 'more',
    }
    response = s.post(url, data=data, headers=ua(), timeout=10)
    print(response.status_code)
    if response.status_code == 200:
        req = response.content.decode('utf-8')
        html = json.loads(req)['html']  # the rendered entry markup sits in the JSON 'html' field
        h3s = re.findall(r'<h3 class="entry-title"><a target="_blank" href="(.+?)" title="(.+?)" rel', html, re.S)
        print(len(h3s))
        for h3 in h3s:
            title = h3[1]
            href = h3[0]
            item = title, href
            print(item)
            get_content(href)
            time.sleep(2)
def get_content(url):
    # url = "http://www.goldenpin.org.tw/project/ps63/"
    response = requests.get(url, headers=ua(), timeout=10).content.decode('utf-8')
    time.sleep(2)
    html = etree.HTML(response)
    category = html.xpath('//ol[@class="breadcrumbs text-small"]/a[1]/text()')[0]  # award category, from the breadcrumb
    print(category)
    year = html.xpath('//ol[@class="breadcrumbs text-small"]/a[2]/text()')[0]  # award year
    print(year)
    subclassification = html.xpath('//ol[@class="breadcrumbs text-small"]/text()')[0]  # sub-category (bare text node in the breadcrumb)
    print(subclassification)
    h1 = html.xpath('//h1[@class="entry-title"]/text()')[0]
    h1 = re.sub(r'[\|\/\<\>\:\*\?\'\\"]', "_", h1)  # replace characters that are illegal in file names
    print(h1)  # the entry title
    path = f"{year}/{category}/{subclassification}/"
    os.makedirs(path, exist_ok=True)  # create the output directory
    descriptions = html.xpath('//div[@class="wpb_wrapper"]//text()')
    description = ''.join(descriptions)
    texts = '%s%s%s' % (h1, '\n', description)
    print(description)
    imgs = html.xpath('//div[@class="fancy-media-wrap layzr-bg"]/img[@class="lazy-load preload-me"]/@data-src')  # the real image URLs live in data-src because of lazy loading
    down(h1, imgs, path, texts)
def down(h1, imgs, path, texts):
    try:
        print(f'>>> Saving {h1}.txt ..')
        with open(f'{path}/{h1}.txt', 'w', encoding='utf-8') as f:
            f.write(texts)
        print(f'>>> Saved {h1}.txt!')
    except Exception as e:
        print(f'Failed to save the detail text, error: {e}')
        with open(f'{path}/text_spider.txt', 'a+', encoding='utf-8') as f:
            f.write(f'{h1},{texts} - failed to save the detail text, error: {e}\n')
    i = 1
    for img_url in imgs:
        suffix = os.path.splitext(img_url)[1]  # keep the original file extension
        img_name = '%s%s%d%s' % (h1, '_', i, suffix)
        print(f'>>> Downloading {img_name} ..')
        s = requests.Session()
        s.mount('http://', HTTPAdapter(max_retries=3))
        s.mount('https://', HTTPAdapter(max_retries=3))
        try:
            r = s.get(img_url, timeout=20, headers=ua())
            with open(f'{path}/{img_name}', 'wb') as f:
                f.write(r.content)
            print(f'>>> Finished downloading {img_name}!')
        except requests.exceptions.RequestException as e:
            print(f'Failed to download the image, error: {e}')
            with open(f'{path}/img_spider.txt', 'a+', encoding='utf-8') as f:
                f.write(f'{img_url},{img_name},{path} - failed to download the image, error: {e}\n')
        time.sleep(1)
        i += 1
def get_url():
    urls = []
    categorys = ["金點設計獎", "金點概念設計獎", "金點新秀設計獎"]
    years = ["2019", "2018", "2017", "2016", "2015"]
    for category in categorys:
        cate_gory = urllib.parse.quote(category)  # percent-encode the Chinese category name for use in the URL
        for year in years:
            url = f"http://www.goldenpin.org.tw/{cate_gory}/?y={year}"
            print(url)
            urls.append(url)
    print(len(urls))
    return urls
def main():
    urls = get_url()
    for furl in urls:
        print(f'>>> Scraping {furl} ...')
        try:
            s = get_session(furl)
            for i in range(5, 6):  # page range to fetch; range(5, 6) grabs only page 5, widen it to walk more pages
                try:
                    get_req(s, i)
                except Exception as e:
                    print(f'Failed to fetch page data, error: {e}')
                    with open('json_spider.txt', 'a+', encoding='utf-8') as f:
                        f.write(f'{furl},{i} - failed to fetch page data, error: {e}\n')
        except Exception as e:
            print(f'Failed to fetch page data, error: {e}')
            with open('spider.txt', 'a+', encoding='utf-8') as f:
                f.write(f'{furl} - failed to fetch page data, error: {e}\n')
        time.sleep(1)
    time.sleep(5)
if __name__=="__main__":
main()