获取素材图无忧，Pixabay图库网Python多线程采集下载

二爷

发布于 2020-07-22 11:31:14

1.6K0

发布于 2020-07-22 11:31:14

文章被收录于专栏：二爷记

图片素材想必是不少人都在寻找的内容，随着版权意识的加深，可供免费使用的图片素材可不是那么好找的哦，不过还是有不少国外知名素材网站可供我们使用，而且国内访问也是比较快，同时支持中文，感谢网站制作及素材分享者们！

下面以大部分人都熟悉的图库网站 Pixabay 为例，使用 Python 多线程采集下载美女图片素材。

Pixabay是全球知名的图库网站及充满活力的创意社区，拥有上百万张免费正版高清照片素材，涵盖风景、人物、动态、静物等多种分类，你可以在任何地方使用Pixabay图库中的素材，无惧版权风险。

目标网址：

https://pixabay.com/zh/images/search/美女/

同样的，还有一个推荐网站，也是非常出名的，设计小哥哥强烈推荐的：

https://www.pexels.com/zh-tw/

大家可以自行练手尝试！

关键，免费，无需为版权问题所困扰！

抓取效果：

多线程运行效果：

几个关键点：

1.字符串utf-8编码

网址中的中文需要先按 UTF-8 编码、再进行 URL 百分号编码（percent-encoding）才能放进链接，这种转换比较常见，这里使用 urllib.parse.quote 完成转码。

import urllib.parse
# Percent-encode the keyword's UTF-8 bytes so it can be placed in a URL path.
category = urllib.parse.quote("美女")
print(category)

2.使用request.urlretrieve下载图片卡机的问题解决

下载文件时会出现 urllib.error.ContentTooShortError（Python 3 中该异常位于 urllib.error 模块），而重新下载又存在用时过长的问题，往往要尝试好几次、甚至十几次，偶尔还会陷入死循环，这种情况是非常不理想的。
为此，笔者利用socket模块，使得每次重新下载的时间变短，且避免陷入死循环，从而提高运行效率。

import socket
import urllib.error
import urllib.request

# Give up on any blocking socket operation after 30 s so that a stalled
# transfer raises socket.timeout instead of hanging.
socket.setdefaulttimeout(30)

# Number of retries allowed after the first attempt fails.
MAX_RETRIES = 5

# `url` and `image_name` must be defined by the surrounding code.
# Attempt 0 is the initial download and attempts 1..MAX_RETRIES are the
# retries, so the loop is bounded and cannot spin forever.
for attempt in range(MAX_RETRIES + 1):
    try:
        urllib.request.urlretrieve(url, image_name)
        break
    except (socket.timeout, urllib.error.ContentTooShortError):
        # A truncated transfer is retried exactly like a timeout.
        if attempt:
            err_info = 'Reloading for %d time'%attempt if attempt == 1 else 'Reloading for %d times'%attempt
            print(err_info)
else:
    # Reached only when no attempt hit `break`, i.e. every download failed.
    print("downloading picture failed!")

#来源：本文为CSDN博主「山阴少年」

3.Python zip() 函数的用法

zip() 函数用于将可迭代的对象作为参数，将对象中对应的元素打包成一个个元组。在 Python 2 中它直接返回由这些元组组成的列表；在 Python 3 中返回的是一个迭代器，需要用 list() 转换后才能看到列表内容。

>>> a = [1,2,3]
>>> b = [4,5,6]
>>> c = [4,5,6,7,8]
>>> zipped = list(zip(a,b))     # 打包为元组的列表
>>> zipped
[(1, 4), (2, 5), (3, 6)]
>>> list(zip(a,c))              # 元素个数与最短的列表一致
[(1, 4), (2, 5), (3, 6)]
>>> list(zip(*zipped))          # 与 zip 相反，*zipped 可理解为解压，返回二维矩阵式
[(1, 2, 3), (4, 5, 6)]

附单线程版本：

#https://pixabay.com 图片抓取

import requests
from fake_useragent import UserAgent
import urllib.parse
from urllib import request
from lxml import etree
import re,os,time

def parse_page(url):
    """Download the lazily-loaded images listed on one search-result page.

    The page at ``url`` is fetched, every ``<img>`` carrying a
    ``data-lazy-srcset`` attribute inside the search-results grid is
    collected, and the URL that precedes the `` 2x`` descriptor in that
    attribute is downloaded into the ``pixabay/`` directory. The file name
    is the image's ``alt`` text (commas and path separators replaced by
    ``_``) plus the extension taken from the image URL.

    Args:
        url: Address of the search-result page to scrape.

    A failed image download is reported on stdout and does not stop the
    remaining downloads; a failure to fetch the page itself propagates.
    """
    # Browser-like User-Agent for urllib-based downloads
    # (request.urlretrieve); the requests calls below use `headers`.
    opener = request.build_opener()
    opener.addheaders = [('User-Agent',
                          'Mozilla/5.0 (Windows NT 6.1; WOW64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/36.0.1941.0 Safari/537.36')]
    request.install_opener(opener)

    ua = UserAgent()
    # The HTTP header is spelled "User-Agent"; a "UserAgent" key would be
    # sent as an unrelated header and the random agent would be ignored.
    headers = {'User-Agent': ua.random}

    response = requests.get(url, headers=headers, timeout=10).content.decode('utf-8')
    time.sleep(2)
    print(response)
    html = etree.HTML(response)
    # Read both attributes from the same <img> node so that a node missing
    # one of them cannot shift the pairing of URLs and names.
    nodes = html.xpath('//div[@class="flex_grid credits search_results"]/div[@class="item"]/a/img[@data-lazy-srcset]')
    imgs = [node.get('data-lazy-srcset') for node in nodes]
    alts = [node.get('alt', '') for node in nodes]
    print(imgs)
    print(alts)

    # Without the target directory every open() below would fail.
    os.makedirs('pixabay', exist_ok=True)

    for img, alt in zip(imgs, alts):
        matches = re.findall('1x, (.*?) 2x', img)
        if not matches:
            # Unexpected srcset layout: skip this image instead of crashing.
            print(f'跳过图片，srcset 中未找到 2x 地址：{img}')
            continue
        img_url = matches[0]
        suffix = os.path.splitext(img_url)[1]  # file extension, e.g. ".jpg"
        # The alt text comes from the remote page: neutralise path
        # separators so it cannot point outside pixabay/.
        # NOTE(review): images sharing the same alt text overwrite each other.
        safe_alt = re.sub(r'[\\/]', '_', alt.replace(',', '_'))
        img_name = '%s%s' % (safe_alt, suffix)
        print(img_url, img_name)
        try:
            r = requests.get(img_url, headers=headers, timeout=10)
            # Do not save an HTTP error page as if it were an image.
            r.raise_for_status()
            time.sleep(1)
            with open('%s%s' % ('pixabay/', img_name), 'wb') as f:
                f.write(r.content)
                print("图片下载完成！")
        except Exception as e:
            # Best effort: report the failure and move on to the next image.
            print(f'下载图片失败，错误代码：{e}')

def main():
    """Scrape result pages 1 and 2 of the Pixabay search for "美女".

    Prints the percent-encoded keyword, then for each page prints a
    progress line and the page URL before handing the URL to parse_page().
    """
    keyword = urllib.parse.quote("美女")  # percent-encoded form for the URL path
    print(keyword)
    url_template = "https://pixabay.com/zh/images/search/%s/?pagi=%d"
    for page_no in (1, 2):
        print(">>> 正在抓取第%d页..." % page_no)
        page_url = url_template % (keyword, page_no)
        print(page_url)
        parse_page(page_url)


if __name__ == '__main__':
    main()

附多线程版本：

经典的生产者与消费者模式

注意：经过测试，当分页数小于 3 时，线程运行会出现问题，这与循环退出条件的控制有关。

#https://pixabay.com 图片抓取

import requests
from fake_useragent import UserAgent
import urllib.parse
from urllib import request
from lxml import etree
import re,os,time
import threading
from queue import Queue

#生产者模式
class Producer(threading.Thread):
    """Worker thread that turns search-page URLs into image download jobs.

    Page URLs are taken from ``page_queue``; for every usable image found on
    a page an ``(img_url, img_name)`` tuple is put on ``img_queue``.
    """

    def __init__(self,page_queue,img_queue,*args,**kwargs):
        super(Producer,self).__init__(*args,**kwargs)
        self.page_queue=page_queue
        self.img_queue=img_queue

    def run(self):
        """Parse pages until ``page_queue`` has no more URLs."""
        from queue import Empty  # local import: the file only imports Queue

        while True:
            # get_nowait() takes an item or raises Empty in a single step.
            # A separate empty() check followed by a blocking get() can hang
            # when another producer removes the last URL in between.
            try:
                url = self.page_queue.get_nowait()
            except Empty:
                break
            try:
                self.parse_page(url)
            except Exception as e:
                # Thread boundary: report the failed page and keep working
                # so one bad page does not silently end this worker.
                print(f'页面解析失败，错误代码：{e}')

    @staticmethod
    def _image_job(srcset, alt):
        """Build the ``(img_url, img_name)`` job for one image.

        Args:
            srcset: Value of the image's ``data-lazy-srcset`` attribute.
            alt: The image's ``alt`` text, used as the file-name stem.

        Returns:
            The tuple to enqueue, or ``None`` when ``srcset`` does not
            contain the ``1x, <url> 2x`` pattern.
        """
        matches = re.findall('1x, (.*?) 2x', srcset)
        if not matches:
            return None
        img_url = matches[0]
        suffix = os.path.splitext(img_url)[1]  # file extension, e.g. ".jpg"
        # The alt text comes from the remote page: neutralise path
        # separators so the name cannot point outside the output directory.
        # NOTE(review): images sharing the same alt text get the same name.
        safe_alt = re.sub(r'[\\/]', '_', alt.replace(',', '_'))
        return img_url, '%s%s' % (safe_alt, suffix)

    def parse_page(self,url):
        """Fetch one search page and enqueue a job for each image on it.

        Args:
            url: Address of the search-result page to scrape.
        """
        ua=UserAgent()
        # The HTTP header is spelled "User-Agent"; a "UserAgent" key would be
        # sent as an unrelated header and the random agent would be ignored.
        headers={'User-Agent':ua.random}
        response=requests.get(url,headers=headers,timeout=10).content.decode('utf-8')
        time.sleep(2)
        html=etree.HTML(response)
        # Read both attributes from the same <img> node so that a node
        # missing one of them cannot shift the pairing of URLs and names.
        nodes=html.xpath('//div[@class="flex_grid credits search_results"]/div[@class="item"]/a/img[@data-lazy-srcset]')
        imgs=[node.get('data-lazy-srcset') for node in nodes]
        alts=[node.get('alt','') for node in nodes]
        print(imgs)
        print(alts)
        for img,alt in zip(imgs,alts):
            job=self._image_job(img,alt)
            if job is None:
                # Unexpected srcset layout: skip this image, keep the rest.
                print(f'跳过图片，srcset 中未找到 2x 地址：{img}')
                continue
            self.img_queue.put(job)




#消费者模式
class Consumer(threading.Thread):
    """Worker thread that downloads the images queued on ``img_queue``.

    Each queue item is an ``(img_url, img_name)`` tuple; the image is saved
    as ``pixabay/<img_name>``.
    """

    # Seconds to wait on an empty img_queue before deciding whether to stop.
    # Longer than the 10 s request timeout plus 2 s sleep a producer spends
    # on a page, so a page still being parsed normally gets to deliver.
    IDLE_TIMEOUT = 15

    def __init__(self, page_queue, img_queue, *args, **kwargs):
        super(Consumer, self).__init__(*args, **kwargs)
        self.page_queue = page_queue
        self.img_queue = img_queue

    def run(self):
        """Download queued images until no more work can be expected."""
        from queue import Empty  # local import: the file only imports Queue

        ua = UserAgent()
        # The HTTP header is spelled "User-Agent"; a "UserAgent" key would be
        # sent as an unrelated header and the random agent would be ignored.
        headers = {'User-Agent': ua.random}
        # Without the target directory every open() below would fail.
        os.makedirs('pixabay', exist_ok=True)
        while True:
            # A bounded wait replaces "empty() check, then blocking get()":
            # that pattern quits while producers are still parsing their
            # first pages, and can block forever when another consumer
            # takes the last item between the check and the get().
            try:
                img_url, img_name = self.img_queue.get(timeout=self.IDLE_TIMEOUT)
            except Empty:
                if self.page_queue.empty():
                    # No image arrived for IDLE_TIMEOUT seconds and no page
                    # is left to parse.
                    # NOTE(review): this is a heuristic; an exact shutdown
                    # needs a "producers finished" signal from the caller.
                    break
                continue
            print(img_url, img_name)
            try:
                r = requests.get(img_url, headers=headers, timeout=10)
                # Do not save an HTTP error page as if it were an image.
                r.raise_for_status()
                time.sleep(2)
                with open('%s%s' % ('pixabay/', img_name), 'wb') as f:
                    f.write(r.content)
                    print("图片下载完成！")
            except Exception as e:
                # Best effort: report the failure and keep this worker alive.
                print(f'下载图片失败，错误代码：{e}')



def main():
    """Queue 50 search-page URLs and start the worker threads.

    Prints the percent-encoded keyword, fills the page queue with the URLs
    of result pages 1 to 50 (printing a progress line and the URL for
    each), then starts five Producer threads followed by five Consumer
    threads that share the page queue and the image queue.
    """
    keyword = urllib.parse.quote("美女")  # percent-encoded form for the URL path
    print(keyword)
    pages = Queue(100)
    images = Queue(1000)
    for page_no in range(1, 51):
        print(">>> 正在抓取第%d页..." % page_no)
        page_url = "https://pixabay.com/zh/images/search/%s/?pagi=%d" % (keyword, page_no)
        print(page_url)
        pages.put(page_url)

    # Producers are started first, then consumers, five of each.
    for worker_cls in (Producer, Consumer):
        for _ in range(5):
            worker = worker_cls(pages, images)
            worker.start()


if __name__ == '__main__':
    main()

本文参与腾讯云自媒体同步曝光计划，分享自微信公众号。

原始发表：2019-12-25，如有侵权请联系 cloudcommunity@tencent.com 删除

编程算法