01
网站分析
随意百度搜索找的一个网站!
目标网址:
https://www.ivsky.com/bizhi/one_piece_t571/
很明显,这是一个列表页,一共有24个链接,对应24张图片,可惜列表页里只有缩略图!
所以,从列表页应该是获取不到大图的图片地址,考虑从内页获取!
继续观察列表页路径地址:
https://www.ivsky.com/bizhi/one_piece_t571/index_2.html
很明显的数字id型列表页,知晓总的列表页数目,就可以构造访问了!
需要注意的是,第1页的列表页就是入口页本身,从第2页起才是 index_N.html 的形式,构造访问路径时用一个判断区分处理即可!
02
代码编写
引用库
import requests,time
from lxml import etree
列表链接源码
列表链接xpath获取
hrefs=tree.xpath('//ul[@class="il"]/li/div[@class="il_img"]/a/@href')
详情页图片源码
详情页图片链接xpath获取
img_url=tree.xpath('//img[@id="imgis"]/@src')[0]
第一版参考:
# one_piece 天堂图片网 图片采集
# https://www.ivsky.com/bizhi/one_piece_t571/index_2.html
# 20211105 by 微信:huguo00289
# -*- coding: UTF-8 -*-
import requests,time
from lxml import etree
def get_imghrefs(url):
    """Fetch one list page and return the absolute URLs of its detail pages.

    The HTTP status code, the number of matched links and the raw
    (site-relative) hrefs are printed along the way.

    Args:
        url: Address of a list page, e.g.
            "https://www.ivsky.com/bizhi/one_piece_t571/index_2.html".

    Returns:
        A list of strings, one per matched link, each built by prefixing
        the raw href with "https://www.ivsky.com".
    """
    request_headers = {
        "user-agent": "Mozilla/5.0 (Windows NT 6.1; WOW64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/86.0.4240.198 Safari/537.36",
    }
    resp = requests.get(url=url, headers=request_headers, timeout=6)
    print(resp.status_code)

    # Parse the decoded page and pull the href of every thumbnail link.
    page = etree.HTML(resp.content.decode('utf-8'))
    raw_links = page.xpath('//ul[@class="il"]/li/div[@class="il_img"]/a/@href')
    print(len(raw_links))
    print(raw_links)

    # The hrefs are site-relative, so prepend the site root.
    return [f'https://www.ivsky.com{link}' for link in raw_links]
def get_img(url):
    """Fetch a detail page and locate the full-size image on it.

    The HTTP status code and the resolved (URL, file name) pair are
    printed along the way.

    Args:
        url: Address of a detail page, e.g.
            "https://www.ivsky.com/bizhi/one_piece_v36640/pic_599277.html".

    Returns:
        A tuple ``(img_url, imgname)``: the image URL with "https:"
        prepended to the page's ``src`` value, and the last path segment
        of that URL.

    Raises:
        IndexError: If the page has no ``<img id="imgis">`` element.
    """
    request_headers = {
        "user-agent": "Mozilla/5.0 (Windows NT 6.1; WOW64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/86.0.4240.198 Safari/537.36",
    }
    resp = requests.get(url=url, headers=request_headers, timeout=6)
    print(resp.status_code)

    page = etree.HTML(resp.content.decode('utf-8'))
    src = page.xpath('//img[@id="imgis"]/@src')[0]

    # The src value is prefixed with the scheme to form the download URL.
    full_url = f'https:{src}'
    file_name = full_url.rsplit('/', 1)[-1]
    print(full_url, file_name)
    return full_url, file_name
def down_img(img):
    """Download one image and save it in the current working directory.

    The file is written only when the server answers with a non-error
    status. On a 4xx/5xx response a failure message is printed and nothing
    is saved, so an error page is never stored under an image file name.

    Args:
        img: Indexable pair ``(img_url, imgname)``, e.g.
            ("https://img-pre.ivsky.com/img/bizhi/pre/201606/21/one_piece-005.jpg",
             "one_piece-005.jpg").

    Returns:
        None.
    """
    img_url = img[0]
    imgname = img[1]
    headers = {
        "user-agent": "Mozilla/5.0 (Windows NT 6.1; WOW64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/86.0.4240.198 Safari/537.36"}
    r = requests.get(url=img_url, headers=headers, timeout=6)

    # Response.ok is False for 4xx/5xx; the body is then an error page,
    # not image data, so skip the write and report the failure instead.
    if not r.ok:
        print(f">> 下载 {imgname} 图片失败! 状态码: {r.status_code}")
        return

    with open(imgname, 'wb') as f:
        f.write(r.content)
    print(f">> 下载 {imgname} 图片成功!")
def main():
    """Download every image linked from page 2 of the One Piece list."""
    list_url = "https://www.ivsky.com/bizhi/one_piece_t571/index_2.html"
    for detail_url in get_imghrefs(list_url):
        down_img(get_img(detail_url))
        # Pause between images to reduce load on the server.
        time.sleep(1)


if __name__ == '__main__':
    main()
第二版 类class 参考:
# one_piece 天堂图片网 图片采集
# https://www.ivsky.com/bizhi/one_piece_t571/index_2.html
# 20211105 by 微信:huguo00289
# -*- coding: UTF-8 -*-
import requests,time,os
from lxml import etree
import threading
class One():
    """Multi-threaded image downloader for one ivsky.com wallpaper category.

    List pages are visited one after another; for each list page one thread
    per detail link is started, and each thread resolves the full-size image
    and saves it under ``self.path``.
    """

    def __init__(self, url, pagenum):
        """Store the crawl settings and create the output directory.

        Args:
            url: Category entry URL. It is expected to end with "/": the
                output directory name is the second-to-last "/"-separated
                segment, and later pages are built as
                ``f'{url}index_{page}.html'``.
            pagenum: Number of list pages to crawl, starting from page 1.
        """
        self.headers = {
            "user-agent": "Mozilla/5.0 (Windows NT 6.1; WOW64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/86.0.4240.198 Safari/537.36"}
        self.url = url
        self.pagenum = pagenum
        pathname = url.split('/')[-2]
        self.path = f'{pathname}/'
        # exist_ok must be passed by keyword: the second positional
        # parameter of os.makedirs is ``mode``, so ``makedirs(path, True)``
        # created the directory with mode 1 and still failed on reruns.
        os.makedirs(self.path, exist_ok=True)

    def get_img_herfs(self):
        """Walk list pages 1..pagenum and process each one in turn."""
        for page in range(1, self.pagenum + 1):
            # Page 1 is the entry URL itself; later pages use index_N.html.
            if page == 1:
                url = self.url
            else:
                url = f'{self.url}index_{page}.html'
            self.get_imghrefs(page, url)

    def get_imghrefs(self, page, url):
        """Fetch one list page and hand its detail links to the threads.

        Args:
            page: 1-based list page number (used for logging and file names).
            url: Address of the list page.
        """
        print(f">> 正在爬取 第{page}页 链接:{url}")
        response = requests.get(url=url, headers=self.headers, timeout=6)
        html = response.content.decode('utf-8')
        tree = etree.HTML(html)
        hrefs = tree.xpath('//ul[@class="il"]/li/div[@class="il_img"]/a/@href')
        print(len(hrefs))
        print(hrefs)
        self.thread_get_imgs(page, hrefs)

    def thread_get_imgs(self, page, hrefs):
        """Start one thread per detail link and wait for all of them.

        Args:
            page: 1-based list page number passed through to ``get_img``.
            hrefs: Site-relative detail-page hrefs from the list page.
        """
        threadings = []
        for href in hrefs:
            href = f'https://www.ivsky.com{href}'
            t = threading.Thread(target=self.get_img, args=(href, page))
            threadings.append(t)
            t.start()
        # Block until every download of this page has finished.
        for x in threadings:
            x.join()
        print("多线程下载图片完成")

    def get_img(self, url, page):
        """Resolve the full-size image on a detail page and download it.

        Args:
            url: Absolute address of the detail page.
            page: List page number; prefixed to the saved file name.

        Raises:
            IndexError: If the page has no ``<img id="imgis">`` element.
        """
        response = requests.get(url=url, headers=self.headers, timeout=6)
        html = response.content.decode('utf-8')
        # Pause between the page fetch and the image download.
        time.sleep(1)
        tree = etree.HTML(html)
        img_url = tree.xpath('//img[@id="imgis"]/@src')[0]
        img_url = f'https:{img_url}'
        img_name = img_url.split('/')[-1]
        img_name = f'{page}_{img_name}'
        print(img_url, img_name)
        self.down_img(img_url, img_name)

    def down_img(self, img_url, img_name):
        """Download one image into ``self.path``.

        The file is written only when the server answers with a non-error
        status; on a 4xx/5xx response a failure message is printed and
        nothing is saved.

        Args:
            img_url: Absolute URL of the image.
            img_name: File name to save under ``self.path``.
        """
        r = requests.get(url=img_url, headers=self.headers, timeout=6)
        # Response.ok is False for 4xx/5xx; the body is then an error page,
        # not image data, so skip the write and report the failure instead.
        if not r.ok:
            print(f">> 下载 {img_name} 图片失败! 状态码: {r.status_code}")
            return
        with open(f'{self.path}{img_name}', 'wb') as f:
            f.write(r.content)
        print(f">> 下载 {img_name} 图片成功!")
def main():
    """Crawl the first 12 list pages of the One Piece wallpaper category."""
    spider = One("https://www.ivsky.com/bizhi/one_piece_t571/", 12)
    spider.get_img_herfs()


if __name__ == '__main__':
    main()
该网站没有反爬,适合新手学习参考,注意设置time.sleep,减轻服务器压力!