#!/usr/bin/env python
# -*- coding: utf-8 -*-
# @Author: J.W
import requests
import re
import time
import random
from lxml import etree
from bs4 import BeautifulSoup
import csv
from get_c_proxies import *
import threading
from Queue import Queue
import sys
reload(sys)
sys.setdefaultencoding('utf-8')
# Pool of browser User-Agent strings; one is chosen at random per request
# so the scraper's traffic looks less uniform.
agents = [
"Mozilla/5.0 (Linux; U; Android 2.3.6; en-us; Nexus S Build/GRK39F) AppleWebKit/533.1 (KHTML, like Gecko) Version/4.0 Mobile Safari/533.1",
"Mozilla/5.0 (Windows; U; Windows NT 6.1; en-US) AppleWebKit/532.5 (KHTML, like Gecko) Chrome/4.0.249.0 Safari/532.5",
"Mozilla/5.0 (Windows; U; Windows NT 5.2; en-US) AppleWebKit/532.9 (KHTML, like Gecko) Chrome/5.0.310.0 Safari/532.9",
"Mozilla/5.0 (Windows; U; Windows NT 5.1; en-US) AppleWebKit/534.7 (KHTML, like Gecko) Chrome/7.0.514.0 Safari/534.7",
"Mozilla/5.0 (Windows; U; Windows NT 6.0; en-US) AppleWebKit/534.14 (KHTML, like Gecko) Chrome/9.0.601.0 Safari/534.14",
"Mozilla/5.0 (Windows; U; Windows NT 6.1; en-US) AppleWebKit/534.14 (KHTML, like Gecko) Chrome/10.0.601.0 Safari/534.14",
]
def get_socks_content(proxy_list):
url='https://www.socks-proxy.net/'
h=0
while True:
try:
#proxies ={'http': 'socks5://10.152.152.10:9100', 'https': 'socks5://10.152.152.10:9100'}
proxies =random.choice(proxy_list)
user_agent=random.choice(agents)
time.sleep(0.5)
header = {'User-Agent': user_agent,'Connection': 'keep-alive','Pragma': 'no-cache','Cache-Control': 'no-cache'}
requests.adapters.DEFAULT_RETRIES = 5
results = requests.get(url, headers=header,proxies=proxies,timeout=5)
results.encoding='utf-8'
if results.status_code==200:
print proxies
break
except Exception as e:
continue
tree = etree.HTML(results.text)
u_ips =tree.xpath('//*[@id="proxylisttable"]/tbody/tr[position()>0]/td[1]/text()')
ports=tree.xpath('//*[@id="proxylisttable"]/tbody/tr[position()>0]/td[2]/text()')
cuntries=tree.xpath('//*[@id="proxylisttable"]/tbody/tr[position()>0]/td[4]/text()')
u_type=tree.xpath('//*[@id="proxylisttable"]/tbody/tr[position()>0]/td[5]/text()')
with open(r'ipproxy2.csv','ab+') as csvfile:
writer = csv.writer(csvfile,dialect='excel')
for i in range(0,len(u_ips)):
ip=u_ips[i]
port=ports[i]
country=cuntries[i]
porxy_type=u_type[i].upper()
writer.writerow([porxy_type.lower(),ip,port,country, "0"])
csvfile.close()
print "********存%s个代理********"% i
print "write end https://www.socks-proxy.net/"
def get_http_content(url,proxy_list):
global UseProxiesList
h=0
while h
proxies =random.choice(proxy_list)
user_agent=random.choice(agents)
header = {'host':url,'User-Agent': user_agent,'Connection': 'keep-alive','Pragma': 'no-cache','Cache-Control': 'no-cache'}
requests.adapters.DEFAULT_RETRIES = 2
try:
results = requests.get(url, headers=header,proxies=proxies,timeout=3)
if results.status_code!=200:
continue
else:
print proxies
save_data(results)
break
except Exception as e:
h+=1
pass
continue
def save_data(results):
tree = etree.HTML(results.text)
url_ips =tree.xpath('//*[@id="proxylisttable"]/tbody/tr[position()>0]/td[1]/text()')
url_ports=tree.xpath('//*[@id="proxylisttable"]/tbody/tr[position()>0]/td[2]/text()')
url_cuntries=tree.xpath('//*[@id="proxylisttable"]/tbody/tr[position()>0]/td[4]/text()')
url_typies=tree.xpath('//*[@id="proxylisttable"]/tbody/tr[position()>0]/td[7]/text()')
url_type=[]
print "write data"
i=0
while i
if url_typies[i]=="yes":
url_type.append("HTTPS")
else:
url_type.append("HTTP")
i+=1
with open(r'ipproxy2.csv','ab+') as csvfile:
writer = csv.writer(csvfile,dialect='excel')
for i in range(0,len(url_ips)):
ip=url_ips[i]
port=url_ports[i]
country=url_cuntries[i]
porxy_type=url_type[i].upper()
writer.writerow([porxy_type.lower(),ip,port,country, "0"])
csvfile.close()
print "********存%s个代理********"% i
def runproxy(ip_tmp):
    """Validate one proxy row against google.com and persist it if it works.

    ip_tmp: a row of the form [type, ip, port, ...] where type is one of
    socks5/socks4/https/http (case-insensitive; anything else is treated as http).
    A proxy that answers HTTP 200 is appended to the global proxy_list and
    handed to save_data_proxy(); a dead one is simply dropped.

    Bugs fixed from the original:
    - the SOCKS5 branch referenced an undefined `header`;
    - the SOCKS4 branch tested `proxiess5` (the SOCKS5 dict) instead of its own;
    - the success path never left the `while True`, re-appending the same
      proxy forever;
    - save_data_proxy indexes [0]..[2] of its argument, i.e. it expects a
      single row, so it now receives ip_tmp rather than the whole list.
    """
    global proxy_list
    url = "https://www.google.com/"
    ptype = ip_tmp[0].upper()
    hostport = ip_tmp[1] + ":" + ip_tmp[2]
    # Build the requests proxies mapping for this proxy's protocol.
    if ptype == "SOCKS5":
        proxies = {'http': 'socks5://' + hostport, 'https': 'socks5://' + hostport}
    elif ptype == "SOCKS4":
        proxies = {'http': 'socks4://' + hostport, 'https': 'socks4://' + hostport}
    elif ptype == "HTTPS":
        proxies = {'https': 'https://' + hostport}
    else:
        proxies = {'http': 'http://' + hostport + "/"}
    user_agent = random.choice(agents)
    header = {'host': url,
              'User-Agent': user_agent,
              'Connection': 'keep-alive',
              'Pragma': 'no-cache',
              'Cache-Control': 'no-cache'}
    try:
        results = requests.get(url, headers=header, proxies=proxies, timeout=3)
        if results.status_code == 200:
            proxy_list.append(ip_tmp)
            save_data_proxy(ip_tmp)
    except Exception:
        pass  # unreachable proxy — discard silently, as the original did
def save_data_proxy(proxy_lis):
global g,h
while proxy_lis!=[]:
with open(r'ipproxy.csv','ab+') as csvfile:
writer = csv.writer(csvfile,dialect='excel')
writer.writerow(proxy_lis)
if proxy_lis[0]=='socks5' or proxy_lis[0]=='socks4':
f.write('%s %s %s\n' %(proxy_lis[0],proxy_lis[1],proxy_lis[2]))
g+=1
else:pass
csvfile.close()
f.close()
print '***已经有%s个代理save*** \n\r' % g
else:
h+=1
print h, " 代理被淘汰*** \n"
def GetUseProxies(proxiesl):
'''测试代理是否可用'''
global e,f
f+=1
print '***正在发送第%s个请求*** \n\r' % f
try:
requests.get('https://api.ipify.org?format=json',proxies = proxiesl ,timeout=2)
UseProxiesList.append(proxiesl)
except:
e+=1
print '***已经有%s个代理被淘汰*** \n\r' % e
pass
return UseProxiesList
class MyTHread(threading.Thread):
    """Worker that drains the global `queue` of candidate proxies through GetUseProxies.

    Bug fixed: the original defined only `runing`, which does not override
    Thread.run, so start() would have done nothing. The work now lives in
    run() (so start() works), with `runing` kept as a backward-compatible
    alias for the existing synchronous call in main().
    """
    def __init__(self):
        threading.Thread.__init__(self)

    def run(self):
        global queue
        while not queue.empty():
            proxy_t = queue.get()
            GetUseProxies(proxy_t)

    runing = run  # legacy name used by main()
class MyTHreading(threading.Thread):
def __init__(self):
threading.Thread.__init__(self)
def runings(self):
global queues
while not queues.empty():
proxy_t =queues.get()
print 'tset...%s'% proxy_t
runproxy(proxy_t)
def main():
global queue,UseProxiesList,proxy_list,e,f,g,queues,h
f=0
e=0
g=0
h=0
queues=Queue()
proxy_list=[]
UseProxiesList=[]
queue=Queue()
urls=['https://www.sslproxies.org/','https://free-proxy-list.net/anonymous-proxy.html','https://free-proxy-list.net/#list','https://free-proxy-list.net/uk-proxy.html','https://www.us-proxy.org/']
#以上有时不能用,在tor 环境中可以使用,可以考虑删除
print ">>>>>>crawl proxies
t=get_c_proxies()
proxies_c=t.run()
print "代理数量%s" % len(proxies_c)
for i in range(0,len(proxies_c)):
proxy=proxies_c[i]
queue.put(proxy)
for i in range(50):
c=MyTHread()
c.runing()
print '可用代理数量%s' % len(UseProxiesList)
print ">>>>>>>>>>>>>>>>>>>>>>>>>>"
print ">>>>>>GET IP:PROT
get_socks_content(UseProxiesList)
#以下可以考虑删除---------------------------------
i=0
while i
url=urls[i]
print ">>>>>%s
get_http_content(url,UseProxiesList)
i+=1
#-------------------------------------------------
print ">>>>>>>>>>>>>>>>>>>>>>>>>>"
with open('ipproxy2.csv','rb') as f1:
reader=csv.reader(f1)
for proxyrow in reader:
proxy_tmp=proxyrow[0:3]
queues.put(proxyrow)
f1.close()
for i in range(50):
d=MyTHreading()
d.runings()
print "-----tset end---------"
if __name__ == "__main__":
main()
# ---------------------------------------------------------------------------
# Note: `get_c_proxies` is a custom class written by the author; the expected
# CSV layout matches the rows written above. Contact the author for details.
# ---------------------------------------------------------------------------