现在很多网站都采用异步加载的方式获取数据,返回的大部分都是json数据。如果不清楚数据的传递过程,理不清请求参数的头绪,又想要获取数据,那就比较难搞了,尤其是对于本渣渣级选手而言。
目标网址
https://www.keyshot.com/gallery/
需求
获取图片信息,需高清大图
经过简单浏览器抓包调试,可以获取到一些信息!
不想努力了,想了两个笨方法,好在数据量不大!
枚举法获取图片地址,爬取图片
1.枚举获取图片地址
代码示例
# Probe candidate image URLs by enumerating four-digit, zero-padded ids.
for i in range(10000):
    # Zero-pad to four digits: 3 -> "0003", 42 -> "0042", 1234 -> "1234".
    i = f"{i:04d}"
    print(i)
    url = f"https://www.keyshot.com/wp-content/uploads/2016/06/keyshot-gallery-{i}.jpg"
    # A requests Response is truthy only when its status code is below 400.
    # The timeout keeps one stalled connection from blocking the whole scan.
    if requests.get(url, headers=self.random_headers, timeout=8):
        print("存在图片!")
图片链接:
https://www.keyshot.com/wp-content/uploads/2016/06/keyshot-gallery-0003.jpg
可以看到id与图片链接是存在对应关系的,所以对id进行迭代,同时用if判断请求是否成功,以确认图片是否存在!
2.图片下载
代码示例
def save_img(self, img_url, img_name, path):
    """Download ``img_url`` and store it as ``{path}/{img_name}.jpg``.

    The target directory is created when it does not exist. The request
    uses an 8-second timeout.

    Raises:
        requests.HTTPError: if the server answers with a 4xx/5xx status,
            so an error page is never written to disk as an image.
    """
    os.makedirs(f'{path}/', exist_ok=True)
    print("开始下载图片!")
    print(f">>> 开始保存 {img_name} 图片")
    r = requests.get(img_url, headers=self.random_headers, timeout=8)
    # Fail loudly on an error status instead of saving its body as a .jpg.
    r.raise_for_status()
    with open(f'{path}/{img_name}.jpg', 'wb') as f:
        f.write(r.content)
    print(f">>> 保存 {img_name} 图片成功")
这里需要注意的是 timeout=8 参数一定要加上,尤其是请求国外网站的时候,不然容易卡死!
完整代码
# -*- coding: UTF-8 -*-
#微信:huguo00289
import requests
import random,os
class Httprequest:
    """Base class that supplies request headers with a randomly chosen User-Agent."""

    # Pool of browser User-Agent strings that random_headers draws from.
    ua_list = [
        'Mozilla/5.0 (Windows NT 6.1; WOW64) AppleWebKit/535.1 (KHTML, like Gecko) Chrome/14.0.835.163 Safari/535.1',
        'Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/73.0.3683.103 Safari/537.36Chrome 17.0',
        'Mozilla/5.0 (Macintosh; Intel Mac OS X 10_7_0) AppleWebKit/535.11 (KHTML, like Gecko) Chrome/17.0.963.56 Safari/535.11',
        'Mozilla/5.0 (Windows NT 6.1; WOW64; rv:6.0) Gecko/20100101 Firefox/6.0Firefox 4.0.1',
        'Mozilla/5.0 (Macintosh; Intel Mac OS X 10.6; rv:2.0.1) Gecko/20100101 Firefox/4.0.1',
        'Mozilla/5.0 (Macintosh; U; Intel Mac OS X 10_6_8; en-us) AppleWebKit/534.50 (KHTML, like Gecko) Version/5.1 Safari/534.50',
        'Mozilla/5.0 (Windows; U; Windows NT 6.1; en-us) AppleWebKit/534.50 (KHTML, like Gecko) Version/5.1 Safari/534.50',
        'Opera/9.80 (Windows NT 6.1; U; en) Presto/2.8.131 Version/11.11',
    ]

    @property  # exposes the method as a read-only attribute
    def random_headers(self):
        """Return a headers dict whose ``User-Agent`` is picked at random from ``ua_list``."""
        chosen_agent = random.choice(self.ua_list)
        return {'User-Agent': chosen_agent}
class Get_imgs(Httprequest):
    """Brute-force scraper that probes four-digit KeyShot gallery image ids."""

    def __init__(self):
        # Directory that downloaded images are written to.
        self.path = "key"

    def getimgs(self):
        """Probe ids 0000-9999 and download every image that exists.

        Each candidate URL is requested with an 8-second timeout. A network
        or HTTP error on one id is reported and the scan continues with the
        next id instead of aborting the whole run.
        """
        for num in range(10000):
            # Zero-pad to four digits: 3 -> "0003", 1234 -> "1234".
            img_id = f"{num:04d}"
            print(img_id)
            url = f"https://www.keyshot.com/wp-content/uploads/2016/06/keyshot-gallery-{img_id}.jpg"
            try:
                # stream=True defers the body download, so the probe only
                # needs the status; save_img fetches the image itself.
                probe = requests.get(url, headers=self.random_headers, timeout=8, stream=True)
                exists = probe.ok
                probe.close()
                if exists:
                    print("存在图片!")
                    self.save_img(url, img_id, self.path)
            except requests.RequestException as exc:
                print(f">>> 请求 {url} 失败: {exc}")

    # Download a single image.
    def save_img(self, img_url, img_name, path):
        """Download ``img_url`` and store it as ``{path}/{img_name}.jpg``.

        The target directory is created when it does not exist.

        Raises:
            requests.HTTPError: if the server answers with a 4xx/5xx status,
                so an error page is never written to disk as an image.
        """
        os.makedirs(f'{path}/', exist_ok=True)
        print("开始下载图片!")
        print(f">>> 开始保存 {img_name} 图片")
        r = requests.get(img_url, headers=self.random_headers, timeout=8)
        r.raise_for_status()
        with open(f'{path}/{img_name}.jpg', 'wb') as f:
            f.write(r.content)
        print(f">>> 保存 {img_name} 图片成功")
# Script entry point: run the enumeration scraper when executed directly.
if __name__ == '__main__':
    Get_imgs().getimgs()
手动获取json数据包,爬取图片
1.正则获取图片地址
代码示例
# Capture every href value, then keep only links hosted on www.keyshot.com
# (this drops placeholders such as "javascript:void(0);").
zeimg = r'href="(.+?)"'
imgs = re.findall(zeimg, str(datas), re.S)
img_urls = [img for img in imgs if "www.keyshot.com" in img]
2.多线程下载图片,这里使用了线程池技术
代码示例
def main():
    """Download every gallery image concurrently with a thread pool.

    URLs come from ``get_imgs()`` and each one is handed to ``save_img``.
    Any failure is reported together with the underlying error, and the
    pool's worker threads are always released.
    """
    img_urls = get_imgs()
    try:
        # With no argument the pool size defaults to the number of CPU cores.
        pool = ThreadPool()
        try:
            pool.map(save_img, img_urls)
        finally:
            # Release the workers even when a download raised.
            pool.close()
            pool.join()
        print("采集所有图片完成!")
    except Exception as exc:
        # Not a bare except: KeyboardInterrupt/SystemExit still propagate.
        print(f"Error: image collection failed: {exc}")
完整代码
#keyshot图片采集
# -*- coding: UTF-8 -*-
#微信:huguo00289
import requests,re,os,random
from multiprocessing.dummy import Pool as ThreadPool
# Response payload copied by hand from the browser's network panel; used by
# get_imgs() when no other payload is supplied.
_GALLERY_RESPONSE = """
/*
* 提示:该行代码过长,系统自动注释不进行高亮。一键复制会移除系统注释
* data: "<li id="eg-2-post-id-333312" data-skin="keyshot-gallery" class="filterall filter-entertainment eg-keyshot-gallery-wrapper eg-post-id-333312 eg-newli" data-date="1352027697" data-title="dmitrij-le">↵ <div class="esg-media-cover-wrapper">↵<div class="esg-entry-media"><img src="https://www.keyshot.com/wp-content/plugins/essential-grid/public/assets/images/300x200transparent.png" data-lazythumb="https://www.keyshot.com/wp-content/uploads/2016/06/keyshot-gallery-0095-700x1121-25x25.jpg" data-no-lazy="1" data-lazysrc="https://www.keyshot.com/wp-content/uploads/2016/06/keyshot-gallery-0095-700x1121.jpg" alt="" width="700" height="1121"></div>↵↵ <div class="esg-entry-cover esg-transition" data-delay="0" data-duration="deafult" data-clickable="on" data-transition="esg-fade">↵↵<a class="eg-invisiblebutton esgbox" href="https://www.keyshot.com/wp-content/uploads/2016/06/keyshot-gallery-0095.jpg" data-width="923" data-height="1478" ></a> <div class="esg-overlay esg-transition eg-keyshot-gallery-container" data-delay="0" data-duration="default" data-transition="esg-fade"></div>↵↵ <div class="esg-center eg-post-333312 eg-keyshot-gallery-element-14-a esg-transition" data-delay="0" data-duration="default" data-transition="esg-slideup"><a class="eg-keyshot-gallery-element-14 eg-post-333312" href="javascript:void(0);" target="_self"></a></div>↵ <div class="esg-center eg-keyshot-gallery-element-8 esg-none esg-clear" style="height: 5px; visibility: hidden;"></div>↵ <div class="esg-center eg-post-333312 eg-keyshot-gallery-element-3 esg-transition" data-delay="0.1" data-duration="default" data-transition="esg-flipup">Dmitrij Leppée</div>↵ <div class="esg-center eg-keyshot-gallery-element-9 esg-none esg-clear" style="height: 5px; visibility: hidden;"></div>↵ <div class="esg-center eg-keyshot-gallery-element-11 esg-none esg-clear" style="height: 5px; visibility: hidden;"></div>↵ </div>↵ </div>↵↵</li>↵<li id="eg-2-post-id-333248" data-skin="keyshot-gallery" class="filterall 
filter-jewelry eg-keyshot-gallery-wrapper eg-post-id-333248 eg-newli" data-date="1351522438" data-title="tiho-ramov">↵ <div class="esg-media-cover-wrapper">↵<div class="esg-entry-media"><img src="https://www.keyshot.com/wp-content/plugins/essential-grid/public/assets/images/300x200transparent.png" data-lazythumb="https://www.keyshot.com/wp-content/uploads/2016/06/keyshot-gallery-0073-700x321-25x25.png" data-no-lazy="1" data-lazysrc="https://www.keyshot.com/wp-content/uploads/2016/06/keyshot-gallery-0073-700x321.png" alt="" width="700" height="321"></div>↵↵ <div class="esg-entry-cover esg-transition" data-delay="0" data-duration="deafult" data-clickable="on" data-transition="esg-fade">↵↵<a class="eg-invisiblebutton esgbox" href="https://www.keyshot.com/wp-content/uploads/2016/06/keyshot-gallery-0073.png" data-width="1000" data-height="458" ></a> <div class="esg-overlay esg-transition eg-keyshot-gallery-container" data-delay="0" data-duration="default" data-transition="esg-fade"></div>↵↵ <div class="esg-center eg-post-333248 eg-keyshot-gallery-element-14-a esg-transition" data-delay="0" data-duration="default" data-transition="esg-slideup"><a class="eg-keyshot-gallery-element-14 eg-post-333248" href="javascript:void(0);" target="_self"></a></div>↵ <div class="esg-center eg-keyshot-gallery-element-8 esg-none esg-clear" style="height: 5px; visibility: hidden;"></div>↵ <div class="esg-center eg-post-333248 eg-keyshot-gallery-element-3 esg-transition" data-delay="0.1" data-duration="default" data-transition="esg-flipup">Tiho Ramovic</div>↵ <div class="esg-center eg-keyshot-gallery-element-9 esg-none esg-clear" style="height: 5px; visibility: hidden;"></div>↵ <div class="esg-center eg-keyshot-gallery-element-11 esg-none esg-clear" style="height: 5px; visibility: hidden;"></div>↵ </div>↵ </div>↵↵</li>↵<li id="eg-2-post-id-333308" data-skin="keyshot-gallery" class="filterall filter-entertainment eg-keyshot-gallery-wrapper eg-post-id-333308 eg-newli" data-date="1349780210" 
data-title="vitaly-bul">↵ <div class="esg-media-cover-wrapper">↵<div class="esg-entry-media"><img src="https://www.keyshot.com/wp-content/plugins/essential-grid/public/assets/images/300x200transparent.png" data-lazythumb="https://www.keyshot.com/wp-content/uploads/2012/10/keyshot-gallery-0113-700x1020-25x25.jpg" data-no-lazy="1" data-lazysrc="https://www.keyshot.com/wp-content/uploads/2012/10/keyshot-gallery-0113-700x1020.jpg" alt="" width="700" height="1020"></div>↵↵ <div class="esg-entry-cover esg-transition" data-delay="0" data-duration="deafult" data-clickable="on" data-transition="esg-fade">↵↵<a class="eg-invisiblebutton esgbox" href="https://www.keyshot.com/wp-content/uploads/2012/10/keyshot-gallery-0113.jpg" data-width="961" data-height="1400" ></a> <div class="esg-overlay esg-transition eg-keyshot-gallery-container" data-delay="0" data-duration="default" data-transition="esg-fade"></div>↵↵ <div class="esg-center eg-post-333308 eg-keyshot-gallery-element-14-a esg-transition" data-delay="0" data-duration="default" data-transition="esg-slideup"><a class="eg-keyshot-gallery-element-14 eg-post-333308" href="javascript:void(0);" target="_self"></a></div>↵ <div class="esg-center eg-keyshot-gallery-element-8 esg-none esg-clear" style="height: 5px; visibility: hidden;"></div>↵ <div class="esg-center eg-post-333308 eg-keyshot-gallery-element-3 esg-transition" data-delay="0.1" data-duration="default" data-transition="esg-flipup">Vitaly Bulgarov</div>↵ <div class="esg-center eg-keyshot-gallery-element-9 esg-none esg-clear" style="height: 5px; visibility: hidden;"></div>↵ <div class="esg-center eg-keyshot-gallery-element-11 esg-none esg-clear" style="height: 5px; visibility: hidden;"></div>↵ </div>↵ </div>↵↵</li>↵<li id="eg-2-post-id-333310" data-skin="keyshot-gallery" class="filterall filter-entertainment eg-keyshot-gallery-wrapper eg-post-id-333310 eg-newli" data-date="1345460494" data-title="maarten-ve">↵ <div class="esg-media-cover-wrapper">↵<div 
class="esg-entry-media"><img src="https://www.keyshot.com/wp-content/plugins/essential-grid/public/assets/images/300x200transparent.png" data-lazythumb="https://www.keyshot.com/wp-content/uploads/2016/06/keyshot-gallery-0094-700x1017-25x25.jpg" data-no-lazy="1" data-lazysrc="https://www.keyshot.com/wp-content/uploads/2016/06/keyshot-gallery-0094-700x1017.jpg" alt="" width="700" height="1017"></div>↵↵ <div class="esg-entry-cover esg-transition" data-delay="0" data-duration="deafult" data-clickable="on" data-transition="esg-fade">↵↵<a class="eg-invisiblebutton esgbox" href="https://www.keyshot.com/wp-content/uploads/2016/06/keyshot-gallery-0094.jpg" data-width="1321" data-height="1920" ></a> <div class="esg-overlay esg-transition eg-keyshot-gallery-container" data-delay="0" data-duration="default" data-transition="esg-fade"></div>↵↵ <div class="esg-center eg-post-333310 eg-keyshot-gallery-element-14-a esg-transition" data-delay="0" data-duration="default" data-transition="esg-slideup"><a class="eg-keyshot-gallery-element-14 eg-post-333310" href="javascript:void(0);" target="_self"></a></div>↵ <div class="esg-center eg-keyshot-gallery-element-8 esg-none esg-clear" style="height: 5px; visibility: hidden;"></div>↵ <div class="esg-center eg-post-333310 eg-keyshot-gallery-element-3 esg-transition" data-delay="0.1" data-duration="default" data-transition="esg-flipup">Maarten Verhoeven</div>↵ <div class="esg-center eg-keyshot-gallery-element-9 esg-none esg-clear" style="height: 5px; visibility: hidden;"></div>↵ <div class="esg-center eg-keyshot-gallery-element-11 esg-none esg-clear" style="height: 5px; visibility: hidden;"></div>↵ </div>↵ </div>↵↵</li>↵<li id="eg-2-post-id-333207" data-skin="keyshot-gallery" class="filterall filter-engineering eg-keyshot-gallery-wrapper eg-post-id-333207 eg-newli" data-date="1334153155" data-title="philippe-v">↵ <div class="esg-media-cover-wrapper">↵<div class="esg-entry-media"><img 
src="https://www.keyshot.com/wp-content/plugins/essential-grid/public/assets/images/300x200transparent.png" data-lazythumb="https://www.keyshot.com/wp-content/uploads/2016/06/keyshot-gallery-0054-700x394-25x25.jpg" data-no-lazy="1" data-lazysrc="https://www.keyshot.com/wp-content/uploads/2016/06/keyshot-gallery-0054-700x394.jpg" alt="" width="700" height="394"></div>↵↵ <div class="esg-entry-cover esg-transition" data-delay="0" data-duration="deafult" data-clickable="on" data-transition="esg-fade">↵↵<a class="eg-invisiblebutton esgbox" href="https://www.keyshot.com/wp-content/uploads/2016/06/keyshot-gallery-0054.jpg" data-width="1280" data-height="720" ></a> <div class="esg-overlay esg-transition eg-keyshot-gallery-container" data-delay="0" data-duration="default" data-transition="esg-fade"></div>↵↵ <div class="esg-center eg-post-333207 eg-keyshot-gallery-element-14-a esg-transition" data-delay="0" data-duration="default" data-transition="esg-slideup"><a class="eg-keyshot-gallery-element-14 eg-post-333207" href="javascript:void(0);" target="_self"></a></div>↵ <div class="esg-center eg-keyshot-gallery-element-8 esg-none esg-clear" style="height: 5px; visibility: hidden;"></div>↵ <div class="esg-center eg-post-333207 eg-keyshot-gallery-element-3 esg-transition" data-delay="0.1" data-duration="default" data-transition="esg-flipup">Philippe Vanagt</div>↵ <div class="esg-center eg-keyshot-gallery-element-9 esg-none esg-clear" style="height: 5px; visibility: hidden;"></div>↵ <div class="esg-center eg-keyshot-gallery-element-11 esg-none esg-clear" style="height: 5px; visibility: hidden;"></div>↵ </div>↵ </div>↵↵</li>↵"
*/
message: ""
success: true
"""


def get_imgs(datas=None):
    """Extract full-size image URLs from a gallery response payload.

    Args:
        datas: Raw response text to scan. When omitted, the payload embedded
            in this file is used, so a bare ``get_imgs()`` call behaves as
            before.

    Returns:
        A list of every ``href`` value that contains ``www.keyshot.com``, in
        document order. Placeholder links such as ``javascript:void(0);``
        are dropped.
    """
    if datas is None:
        datas = _GALLERY_RESPONSE
    zeimg = r'href="(.+?)"'
    # re.S lets "." span line breaks inside the pasted payload.
    imgs = re.findall(zeimg, str(datas), re.S)
    img_urls = [img for img in imgs if "www.keyshot.com" in img]
    print(len(img_urls))
    return img_urls
#下载图片
def save_img(img_url):
    """Download one image into the ``key`` directory.

    The file is named after the last path segment of ``img_url``. The
    request uses a random User-Agent and an 8-second timeout. A network
    error or an HTTP error status is reported and the function returns
    without writing a file, so one bad URL does not abort a batch that
    maps this function over many URLs.

    Args:
        img_url: Absolute URL of the image to fetch.
    """
    path = "key"
    ua_list = [
        'Mozilla/5.0 (Windows NT 6.1; WOW64) AppleWebKit/535.1 (KHTML, like Gecko) Chrome/14.0.835.163 Safari/535.1',
        'Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/73.0.3683.103 Safari/537.36Chrome 17.0',
        'Mozilla/5.0 (Macintosh; Intel Mac OS X 10_7_0) AppleWebKit/535.11 (KHTML, like Gecko) Chrome/17.0.963.56 Safari/535.11',
        'Mozilla/5.0 (Windows NT 6.1; WOW64; rv:6.0) Gecko/20100101 Firefox/6.0Firefox 4.0.1',
        'Mozilla/5.0 (Macintosh; Intel Mac OS X 10.6; rv:2.0.1) Gecko/20100101 Firefox/4.0.1',
        'Mozilla/5.0 (Macintosh; U; Intel Mac OS X 10_6_8; en-us) AppleWebKit/534.50 (KHTML, like Gecko) Version/5.1 Safari/534.50',
        'Mozilla/5.0 (Windows; U; Windows NT 6.1; en-us) AppleWebKit/534.50 (KHTML, like Gecko) Version/5.1 Safari/534.50',
        'Opera/9.80 (Windows NT 6.1; U; en) Presto/2.8.131 Version/11.11',
    ]
    os.makedirs(f'{path}/', exist_ok=True)
    img_name = img_url.split('/')[-1]
    print("开始下载图片!")
    print(f">>> 开始保存 {img_name} 图片")
    try:
        r = requests.get(img_url, headers={'User-Agent': random.choice(ua_list)}, timeout=8)
        # Treat 4xx/5xx as a failure instead of saving the error body as an image.
        r.raise_for_status()
    except requests.RequestException as exc:
        print(f">>> 保存 {img_name} 图片失败: {exc}")
        return
    with open(f'{path}/{img_name}', 'wb') as f:
        f.write(r.content)
    print(f">>> 保存 {img_name} 图片成功")
def main():
    """Download every gallery image concurrently with a thread pool.

    URLs come from ``get_imgs()`` and each one is handed to ``save_img``.
    Any failure is reported together with the underlying error, and the
    pool's worker threads are always released.
    """
    img_urls = get_imgs()
    try:
        # With no argument the pool size defaults to the number of CPU cores.
        pool = ThreadPool()
        try:
            pool.map(save_img, img_urls)
        finally:
            # Release the workers even when a download raised.
            pool.close()
            pool.join()
        print("采集所有图片完成!")
    except Exception as exc:
        # Not a bare except: KeyboardInterrupt/SystemExit still propagate.
        print(f"Error: image collection failed: {exc}")
# Run the downloader only when executed as a script, not when imported.
if __name__ == "__main__":
    main()
微信公众号:二爷记
不定时分享python源码及工具
本文分享自 Python与SEO学习 微信公众号,前往查看
如有侵权,请联系 cloudcommunity@tencent.com 删除。
本文参与 腾讯云自媒体同步曝光计划 ,欢迎热爱写作的你一起参与!