# pycharm,python3.7,selenium库,request库,os文件库,re模块的正则表达式运用,自行下载chromedriver自测软件
# ①爬虫爬取可用代理IP ②用户输入搜索的关键字 ③进行数据爬取 ④爬取成功后下载到文件夹
from selenium import webdriver # 从selenium导入webdriver
from selenium.webdriver.chrome.options import Options
import requests
import re
import time
import shutil
import os
from bs4 import BeautifulSoup
# @author Himit_ZH
from lxml import etree
# Browser-like request headers: a real Chrome User-Agent so the scraped
# sites are less likely to reject the requests as an obvious bot.
headers = {
    'User-Agent': 'Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/75.0.3770.100 Safari/537.36'
}
# Scrape candidate proxy addresses from the free-proxy listing site.
def get_free_proxy():
    """Yield 'ip:port' strings scraped from the xicidaili free-proxy list.

    Relies on the site's table layout (table#ip_list, IP in td[2],
    port in td[3]); yields nothing if the page shape changes.

    Fixes over the original: `tr_eles.pop(0)` raised IndexError when the
    table was missing/empty, and `[0]` indexing raised on malformed rows.
    """
    url = 'https://www.xicidaili.com/nn/'
    response = requests.get(url, headers=headers)
    html_ele = etree.HTML(response.content)
    tr_eles = html_ele.xpath('//table[@id="ip_list"]//tr')
    # First row is the table header, not a proxy entry — slice it off
    # instead of pop(0), which crashes on an empty result.
    for tr_ele in tr_eles[1:]:
        ip_cells = tr_ele.xpath('./td[2]/text()')
        port_cells = tr_ele.xpath('./td[3]/text()')
        if not ip_cells or not port_cells:
            continue  # malformed row: skip rather than raise IndexError
        yield ip_cells[0] + ':' + port_cells[0]
# Check whether a proxy can complete a simple HTTP request quickly.
def validate_proxy(proxy_str):
    """Return True if the proxy ('ip:port') works within 2 seconds.

    Probes http://httpbin.org through the proxy; any network error,
    timeout, or non-200 status counts as unusable (False).

    Fixes over the original: the bare `except:` (which also hid real
    bugs like NameError) is narrowed to requests' network errors, and
    the implicit `None` on a non-200 response is an explicit False.
    """
    url = 'http://httpbin.org'
    proxy = {
        'http': 'http://' + proxy_str,
        'https': 'http://' + proxy_str,
    }
    try:
        response = requests.get(url, proxies=proxy, timeout=2)
    except requests.RequestException:
        return False
    return response.status_code == 200
# Download one scraped image URL and store it in the target folder.
def saveImg(pic_link, x, num):
    """Download the image at `pic_link` and save it as '<x>.png'.

    pic_link: image URL to fetch.
    x: file-name stem (search keyword + ordinal, built by the caller).
    num: ordinal used only for the progress message.

    Errors are reported via print, never raised — the caller loops over
    many URLs and a single bad link must not abort the batch.
    """
    path = r"E://py//检索图片//"  # download directory (created by get_picture)
    try:
        pp = requests.get(pic_link, headers=headers, allow_redirects=False, timeout=7)
        pth = path + x + ".png"  # image file name
        with open(pth, "wb") as f:
            # Stream the body in sensible chunks; iterating the Response
            # object directly (as the original did) yields 128-byte chunks.
            for chunk in pp.iter_content(chunk_size=8192):
                f.write(chunk)
        print("检索得到的第%d张已经下载好" % num)
    except (requests.RequestException, OSError):
        # Narrowed from a bare `except:` so programming errors are not hidden.
        print('爬取URL出错停止')
# Scroll the page down step by step so lazily-loaded results get fetched.
def scrollBy(index, driver):
    """Scroll `driver`'s window downward `index + 1` times.

    Each step pauses 2 seconds, then scrolls by an increasing offset
    (0, 1000, 2000, ... pixels) so more image results are loaded.
    """
    for step in range(index + 1):
        time.sleep(2)
        driver.execute_script("window.scrollBy(0,{})".format(1000 * step))
def get_picture(proxy, search_name):
    """Search Baidu Images for `search_name` through `proxy` and download hits.

    proxy: 'ip:port' string used as Chrome's --proxy-server.
    search_name: keyword typed into the Baidu search box.
    Side effects: recreates the download folder E://py//检索图片// and
    writes one .png per scraped "objURL" via saveImg.

    Fixes over the original: shutil.rmtree crashed on the very first run
    (folder not yet created), and the headless Chrome leaked whenever any
    step raised — the driver is now quit in a finally block.
    """
    driver_path = r'E:\py\chromedriver\chromedriver.exe'
    chrome_options = Options()
    chrome_options.add_argument('--headless')
    chrome_options.add_argument('--disable-gpu')
    chrome_options.add_argument('--proxy-server={0}'.format(proxy))
    driver = webdriver.Chrome(executable_path=driver_path, chrome_options=chrome_options)
    try:
        driver.get('https://www.baidu.com')
        driver.implicitly_wait(2)
        driver.find_element_by_id('kw').send_keys(search_name)
        driver.find_element_by_id('su').click()
        driver.implicitly_wait(2)
        driver.find_element_by_link_text(search_name + '_百度图片').click()
        driver.implicitly_wait(2)
        # The image-search result opens in a new tab: switch to the newest
        # window handle (the original hard-coded handles[1]).
        handles = driver.window_handles
        driver.switch_to.window(handles[-1])
        scrollBy(5, driver)
        # Baidu embeds the original image URLs as "objURL" in the page source.
        pic_url = re.findall('"objURL":"(.*?)",', driver.page_source, re.S)
        # Recreate the output folder; only remove it if it already exists
        # (rmtree on a missing path raised FileNotFoundError on first run).
        path = r"E://py//检索图片//"  # storage directory
        if os.path.exists(path):
            shutil.rmtree(path)
        os.makedirs(path)
        for idx, each in enumerate(pic_url, start=1):
            saveImg(each, search_name + str(idx), idx)
    finally:
        # Always close the browser, even if any step above raised.
        driver.quit()
# Entry point: find a working proxy, ask the user for a keyword, then crawl.
if __name__ == '__main__':
    start_time = time.time()
    print('正在寻找可用的代理IP,请稍后...')
    # Walk the scraped proxies until one passes validation, then use it.
    for candidate in get_free_proxy():
        if not validate_proxy(candidate):
            continue
        elapsed = round(time.time() - start_time, 2)
        print(f'寻找到可用IP:{candidate}耗时为{elapsed}秒')
        want_find = input('请输入想查找的照片名字(模糊检索):')
        print('开始进行网页爬取,请稍后...')
        get_picture(candidate, want_find)
        break