#!/usr/bin/env python
# -*- coding: utf-8 -*-
# @Author: J.W
import requests
import re
import time
import random
from lxml import etree
from bs4 import BeautifulSoup
import csv
from get_c_proxies import *
import threading
from Queue import Queue
import sys
reload(sys)
sys.setdefaultencoding('utf-8')
# Pool of browser User-Agent strings; one is chosen at random per request
# so the scraper's traffic looks less uniform.
agents = [
"Mozilla/5.0 (Linux; U; Android 2.3.6; en-us; Nexus S Build/GRK39F) AppleWebKit/533.1 (KHTML, like Gecko) Version/4.0 Mobile Safari/533.1",
"Mozilla/5.0 (Windows; U; Windows NT 6.1; en-US) AppleWebKit/532.5 (KHTML, like Gecko) Chrome/4.0.249.0 Safari/532.5",
"Mozilla/5.0 (Windows; U; Windows NT 5.2; en-US) AppleWebKit/532.9 (KHTML, like Gecko) Chrome/5.0.310.0 Safari/532.9",
"Mozilla/5.0 (Windows; U; Windows NT 5.1; en-US) AppleWebKit/534.7 (KHTML, like Gecko) Chrome/7.0.514.0 Safari/534.7",
"Mozilla/5.0 (Windows; U; Windows NT 6.0; en-US) AppleWebKit/534.14 (KHTML, like Gecko) Chrome/9.0.601.0 Safari/534.14",
"Mozilla/5.0 (Windows; U; Windows NT 6.1; en-US) AppleWebKit/534.14 (KHTML, like Gecko) Chrome/10.0.601.0 Safari/534.14",
]
def get_socks_content(proxy_list):
url='https://www.socks-proxy.net/'
h=0
while True:
try:
#proxies ={'http': 'socks5://10.152.152.10:9100', 'https': 'socks5://10.152.152.10:9100'}
proxies =random.choice(proxy_list)
user_agent=random.choice(agents)
time.sleep(0.5)
header = {'User-Agent': user_agent,'Connection': 'keep-alive','Pragma': 'no-cache','Cache-Control': 'no-cache'}
requests.adapters.DEFAULT_RETRIES = 5
results = requests.get(url, headers=header,proxies=proxies,timeout=5)
results.encoding='utf-8'
if results.status_code==200:
print proxies
break
except Exception as e:
continue
tree = etree.HTML(results.text)
u_ips =tree.xpath('//*[@id="proxylisttable"]/tbody/tr[position()>0]/td[1]/text()')
ports=tree.xpath('//*[@id="proxylisttable"]/tbody/tr[position()>0]/td[2]/text()')
cuntries=tree.xpath('//*[@id="proxylisttable"]/tbody/tr[position()>0]/td[4]/text()')
u_type=tree.xpath('//*[@id="proxylisttable"]/tbody/tr[position()>0]/td[5]/text()')
with open(r'ipproxy2.csv','ab+') as csvfile:
writer = csv.writer(csvfile,dialect='excel')
for i in range(0,len(u_ips)):
ip=u_ips[i]
port=ports[i]
country=cuntries[i]
porxy_type=u_type[i].upper()
writer.writerow([porxy_type.lower(),ip,port,country, "0"])
csvfile.close()
print "********存%s个代理********"% i
print "write end https://www.socks-proxy.net/"
def get_http_content(url,proxy_list):
global UseProxiesList
h=0
while h
proxies =random.choice(proxy_list)
user_agent=random.choice(agents)
header = {'host':url,'User-Agent': user_agent,'Connection': 'keep-alive','Pragma': 'no-cache','Cache-Control': 'no-cache'}
requests.adapters.DEFAULT_RETRIES = 2
try:
results = requests.get(url, headers=header,proxies=proxies,timeout=3)
if results.status_code!=200:
continue
else:
print proxies
save_data(results)
break
except Exception as e:
h+=1
pass
continue
def save_data(results):
tree = etree.HTML(results.text)
url_ips =tree.xpath('//*[@id="proxylisttable"]/tbody/tr[position()>0]/td[1]/text()')
url_ports=tree.xpath('//*[@id="proxylisttable"]/tbody/tr[position()>0]/td[2]/text()')
url_cuntries=tree.xpath('//*[@id="proxylisttable"]/tbody/tr[position()>0]/td[4]/text()')
url_typies=tree.xpath('//*[@id="proxylisttable"]/tbody/tr[position()>0]/td[7]/text()')
url_type=[]
print "write data"
i=0
while i
if url_typies[i]=="yes":
url_type.append("HTTPS")
else:
url_type.append("HTTP")
i+=1
with open(r'ipproxy2.csv','ab+') as csvfile:
writer = csv.writer(csvfile,dialect='excel')
for i in range(0,len(url_ips)):
ip=url_ips[i]
port=url_ports[i]
country=url_cuntries[i]
porxy_type=url_type[i].upper()
writer.writerow([porxy_type.lower(),ip,port,country, "0"])
csvfile.close()
print "********存%s个代理********"% i
def runproxy(ip_tmp):
    """Validate one proxy row against google.com and persist it if it works.

    ip_tmp: a row of the form [type, ip, port, ...] where type is one of
    socks5/socks4/https/http (case-insensitive; anything else is treated as http).
    A proxy that answers HTTP 200 is appended to the global proxy_list and
    handed to save_data_proxy(); a dead one is simply dropped.

    Bugs fixed from the original:
    - the SOCKS5 branch referenced an undefined `header`;
    - the SOCKS4 branch tested `proxiess5` (the SOCKS5 dict) instead of its own;
    - the success path never left the `while True`, re-appending the same
      proxy forever;
    - save_data_proxy indexes [0]..[2] of its argument, i.e. it expects a
      single row, so it now receives ip_tmp rather than the whole list.
    """
    global proxy_list
    url = "https://www.google.com/"
    ptype = ip_tmp[0].upper()
    hostport = ip_tmp[1] + ":" + ip_tmp[2]
    # Build the requests proxies mapping for this proxy's protocol.
    if ptype == "SOCKS5":
        proxies = {'http': 'socks5://' + hostport, 'https': 'socks5://' + hostport}
    elif ptype == "SOCKS4":
        proxies = {'http': 'socks4://' + hostport, 'https': 'socks4://' + hostport}
    elif ptype == "HTTPS":
        proxies = {'https': 'https://' + hostport}
    else:
        proxies = {'http': 'http://' + hostport + "/"}
    user_agent = random.choice(agents)
    header = {'host': url,
              'User-Agent': user_agent,
              'Connection': 'keep-alive',
              'Pragma': 'no-cache',
              'Cache-Control': 'no-cache'}
    try:
        results = requests.get(url, headers=header, proxies=proxies, timeout=3)
        if results.status_code == 200:
            proxy_list.append(ip_tmp)
            save_data_proxy(ip_tmp)
    except Exception:
        pass  # unreachable proxy — discard silently, as the original did
def save_data_proxy(proxy_lis):
global g,h
while proxy_lis!=[]:
with open(r'ipproxy.csv','ab+') as csvfile:
writer = csv.writer(csvfile,dialect='excel')
writer.writerow(proxy_lis)
if proxy_lis[0]=='socks5' or proxy_lis[0]=='socks4':
f.write('%s %s %s\n' %(proxy_lis[0],proxy_lis[1],proxy_lis[2]))
g+=1
else:pass
csvfile.close()
f.close()
print '***已经有%s个代理save*** \n\r' % g
else:
h+=1
print h, " 代理被淘汰*** \n"
def GetUseProxies(proxiesl):
'''测试代理是否可用'''
global e,f
f+=1
print '***正在发送第%s个请求*** \n\r' % f
try:
requests.get('https://api.ipify.org?format=json',proxies = proxiesl ,timeout=2)
UseProxiesList.append(proxiesl)
except:
e+=1
print '***已经有%s个代理被淘汰*** \n\r' % e
pass
return UseProxiesList
class MyTHread(threading.Thread):
    """Worker that drains the global `queue` of candidate proxies through GetUseProxies.

    Bug fixed: the original defined only `runing`, which does not override
    Thread.run, so start() would have done nothing. The work now lives in
    run() (so start() works), with `runing` kept as a backward-compatible
    alias for the existing synchronous call in main().
    """
    def __init__(self):
        threading.Thread.__init__(self)

    def run(self):
        global queue
        while not queue.empty():
            proxy_t = queue.get()
            GetUseProxies(proxy_t)

    runing = run  # legacy name used by main()
class MyTHreading(threading.Thread):
def __init__(self):
threading.Thread.__init__(self)
def runings(self):
global queues
while not queues.empty():
proxy_t =queues.get()
print 'tset...%s'% proxy_t
runproxy(proxy_t)
def main():
global queue,UseProxiesList,proxy_list,e,f,g,queues,h
f=0
e=0
g=0
h=0
queues=Queue()
proxy_list=[]
UseProxiesList=[]
queue=Queue()
urls=['https://www.sslproxies.org/','https://free-proxy-list.net/anonymous-proxy.html','https://free-proxy-list.net/#list','https://free-proxy-list.net/uk-proxy.html','https://www.us-proxy.org/']
#以上有时不能用,在tor 环境中可以使用,可以考虑删除
print ">>>>>>crawl proxies
t=get_c_proxies()
proxies_c=t.run()
print "代理数量%s" % len(proxies_c)
for i in range(0,len(proxies_c)):
proxy=proxies_c[i]
queue.put(proxy)
for i in range(50):
c=MyTHread()
c.runing()
print '可用代理数量%s' % len(UseProxiesList)
print ">>>>>>>>>>>>>>>>>>>>>>>>>>"
print ">>>>>>GET IP:PROT
get_socks_content(UseProxiesList)
#以下可以考虑删除---------------------------------
i=0
while i
url=urls[i]
print ">>>>>%s
get_http_content(url,UseProxiesList)
i+=1
#-------------------------------------------------
print ">>>>>>>>>>>>>>>>>>>>>>>>>>"
with open('ipproxy2.csv','rb') as f1:
reader=csv.reader(f1)
for proxyrow in reader:
proxy_tmp=proxyrow[0:3]
queues.put(proxyrow)
f1.close()
for i in range(50):
d=MyTHreading()
d.runings()
print "-----tset end---------"
if __name__ == "__main__":
main()
# ---------------------------------------------------------------------------
# Note: `get_c_proxies` is a custom class written by the author; the expected
# CSV layout matches the rows written above. Contact the author for details.
# ---------------------------------------------------------------------------