# pycharm,python3.7,selenium库,request库,os文件库,re模块的正则表达式运用,自行下载chromedriver自测软件
# ①爬虫爬取可用代理IP ②用户输入搜索的关键字 ③进行数据爬取 ④爬取成功后下载到文件夹
from selenium import webdriver # 从selenium导入webdriver
from selenium.webdriver.chrome.options import Options
import requests
import re
import time
import shutil
import os
from bs4 import BeautifulSoup
# @author Himit_ZH
from lxml import etree
# Browser-like request headers: a real Chrome User-Agent so the scraped
# sites are less likely to reject the requests as an obvious bot.
headers = {
    'User-Agent': 'Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/75.0.3770.100 Safari/537.36'
}
# Scrape candidate proxy addresses from the free-proxy listing site.
def get_free_proxy():
    """Yield 'ip:port' strings scraped from the xicidaili free-proxy list.

    Relies on the site's table layout (table#ip_list, IP in td[2],
    port in td[3]); yields nothing if the page shape changes.

    Fixes over the original: `tr_eles.pop(0)` raised IndexError when the
    table was missing/empty, and `[0]` indexing raised on malformed rows.
    """
    url = 'https://www.xicidaili.com/nn/'
    response = requests.get(url, headers=headers)
    html_ele = etree.HTML(response.content)
    tr_eles = html_ele.xpath('//table[@id="ip_list"]//tr')
    # First row is the table header, not a proxy entry — slice it off
    # instead of pop(0), which crashes on an empty result.
    for tr_ele in tr_eles[1:]:
        ip_cells = tr_ele.xpath('./td[2]/text()')
        port_cells = tr_ele.xpath('./td[3]/text()')
        if not ip_cells or not port_cells:
            continue  # malformed row: skip rather than raise IndexError
        yield ip_cells[0] + ':' + port_cells[0]
# Check whether a proxy can complete a simple HTTP request quickly.
def validate_proxy(proxy_str):
    """Return True if the proxy ('ip:port') works within 2 seconds.

    Probes http://httpbin.org through the proxy; any network error,
    timeout, or non-200 status counts as unusable (False).

    Fixes over the original: the bare `except:` (which also hid real
    bugs like NameError) is narrowed to requests' network errors, and
    the implicit `None` on a non-200 response is an explicit False.
    """
    url = 'http://httpbin.org'
    proxy = {
        'http': 'http://' + proxy_str,
        'https': 'http://' + proxy_str,
    }
    try:
        response = requests.get(url, proxies=proxy, timeout=2)
    except requests.RequestException:
        return False
    return response.status_code == 200
# Download one scraped image URL and store it in the target folder.
def saveImg(pic_link, x, num):
    """Download the image at `pic_link` and save it as '<x>.png'.

    pic_link: image URL to fetch.
    x: file-name stem (search keyword + ordinal, built by the caller).
    num: ordinal used only for the progress message.

    Errors are reported via print, never raised — the caller loops over
    many URLs and a single bad link must not abort the batch.
    """
    path = r"E://py//检索图片//"  # download directory (created by get_picture)
    try:
        pp = requests.get(pic_link, headers=headers, allow_redirects=False, timeout=7)
        pth = path + x + ".png"  # image file name
        with open(pth, "wb") as f:
            # Stream the body in sensible chunks; iterating the Response
            # object directly (as the original did) yields 128-byte chunks.
            for chunk in pp.iter_content(chunk_size=8192):
                f.write(chunk)
        print("检索得到的第%d张已经下载好" % num)
    except (requests.RequestException, OSError):
        # Narrowed from a bare `except:` so programming errors are not hidden.
        print('爬取URL出错停止')
# Scroll the page down step by step so lazily-loaded results get fetched.
def scrollBy(index, driver):
    """Scroll `driver`'s window downward `index + 1` times.

    Each step pauses 2 seconds, then scrolls by an increasing offset
    (0, 1000, 2000, ... pixels) so more image results are loaded.
    """
    for step in range(index + 1):
        time.sleep(2)
        driver.execute_script("window.scrollBy(0,{})".format(1000 * step))
def get_picture(proxy, search_name):
    """Search Baidu Images for `search_name` through `proxy` and download hits.

    proxy: 'ip:port' string used as Chrome's --proxy-server.
    search_name: keyword typed into the Baidu search box.
    Side effects: recreates the download folder E://py//检索图片// and
    writes one .png per scraped "objURL" via saveImg.

    Fixes over the original: shutil.rmtree crashed on the very first run
    (folder not yet created), and the headless Chrome leaked whenever any
    step raised — the driver is now quit in a finally block.
    """
    driver_path = r'E:\py\chromedriver\chromedriver.exe'
    chrome_options = Options()
    chrome_options.add_argument('--headless')
    chrome_options.add_argument('--disable-gpu')
    chrome_options.add_argument('--proxy-server={0}'.format(proxy))
    driver = webdriver.Chrome(executable_path=driver_path, chrome_options=chrome_options)
    try:
        driver.get('https://www.baidu.com')
        driver.implicitly_wait(2)
        driver.find_element_by_id('kw').send_keys(search_name)
        driver.find_element_by_id('su').click()
        driver.implicitly_wait(2)
        driver.find_element_by_link_text(search_name + '_百度图片').click()
        driver.implicitly_wait(2)
        # The image-search result opens in a new tab: switch to the newest
        # window handle (the original hard-coded handles[1]).
        handles = driver.window_handles
        driver.switch_to.window(handles[-1])
        scrollBy(5, driver)
        # Baidu embeds the original image URLs as "objURL" in the page source.
        pic_url = re.findall('"objURL":"(.*?)",', driver.page_source, re.S)
        # Recreate the output folder; only remove it if it already exists
        # (rmtree on a missing path raised FileNotFoundError on first run).
        path = r"E://py//检索图片//"  # storage directory
        if os.path.exists(path):
            shutil.rmtree(path)
        os.makedirs(path)
        for idx, each in enumerate(pic_url, start=1):
            saveImg(each, search_name + str(idx), idx)
    finally:
        # Always close the browser, even if any step above raised.
        driver.quit()
# Entry point: find a working proxy, ask the user for a keyword, then crawl.
if __name__ == '__main__':
    start_time = time.time()
    print('正在寻找可用的代理IP,请稍后...')
    # Walk the scraped proxies until one passes validation, then use it.
    for candidate in get_free_proxy():
        if not validate_proxy(candidate):
            continue
        elapsed = round(time.time() - start_time, 2)
        print(f'寻找到可用IP:{candidate}耗时为{elapsed}秒')
        want_find = input('请输入想查找的照片名字(模糊检索):')
        print('开始进行网页爬取,请稍后...')
        get_picture(candidate, want_find)
        break