I have recently been learning web crawling and am also interested in .onion sites, so I used the tools below to gather information on a list of known dark-web addresses, recording for each one whether it is up, its server type, and its page title. The setup requires the following four components:
1. tor
2. cow — cow converts Tor's SOCKS5 proxy into an HTTP proxy (see the sanity-check sketch after this list)
3. Vidalia — the Tor expert bundle has no graphical interface, so Vidalia is used alongside it
4. Python 2.7
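Before running the crawler, it helps to confirm that the tor→cow chain actually works. Here is a minimal sanity-check sketch; it assumes cow is listening on 127.0.0.1:7777 (the same port the crawler below uses) and fetches check.torproject.org, which reports whether a request arrived via Tor:

# -*- coding: utf-8 -*-
# Sanity check for the tor -> cow proxy chain (assumes cow listens
# on 127.0.0.1:7777 and forwards to Tor's SOCKS5 port).
import requests

proxies = {'http': 'http://127.0.0.1:7777',
           'https': 'http://127.0.0.1:7777'}
r = requests.get('https://check.torproject.org/', proxies=proxies, timeout=20)
print r.status_code
print 'Congratulations' in r.text  # True when the request exited via Tor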
The full code is as follows:
# -*- coding: utf-8 -*-
import csv
import time

import requests
from bs4 import BeautifulSoup
from stem import Signal
from stem.control import Controller


def renew_connection():
    # Ask Tor for a new identity (new circuit) over the control port.
    # 9151 is the control port used by Tor Browser/Vidalia; the expert
    # bundle defaults to 9051. Note that Tor rate-limits NEWNYM, so a
    # per-URL renewal may not always yield a fresh circuit.
    with Controller.from_port(port=9151) as controller:
        controller.authenticate(password='test1234')
        controller.signal(Signal.NEWNYM)


def get_html_url():
    # Read the onion addresses from the first column of the input CSV.
    html_url = []
    with open("HiddenServices.csv", "r") as inFile:
        reader = csv.reader(inFile)
        for line in reader:
            html_url.append(line[0])
    return html_url


def check_url(htmls):
    html_text = []
    # cow exposes an HTTP proxy on 127.0.0.1:7777 and forwards to Tor.
    proxies = {'http': 'http://127.0.0.1:7777',
               'https': 'http://127.0.0.1:7777'}
    s = requests.Session()
    for html in htmls:
        URL = "http://" + html
        renew_connection()
        try:
            r = s.get(URL, headers={'Connection': 'close'},
                      proxies=proxies, timeout=20)
            soup = BeautifulSoup(r.text, 'html5lib')
            # Guard against pages that have no <title> element.
            title = soup.title.string.strip() if soup.title and soup.title.string else ''
            print URL + " " + str(r.status_code) + " " + str(r.headers.get("server")) + " " + title
            html_text.append([URL, r.status_code, r.headers.get("server"), title])
            time.sleep(0.5)
        except Exception as e:
            print e
            continue
    # 'wb' is the correct file mode for Python 2's csv module.
    with open('onion.csv', 'wb') as csvfile:
        writer = csv.writer(csvfile, dialect='excel')
        writer.writerow(['url', 'status_code', 'server', 'title'])
        for line in html_text:
            # Encode the title so non-ASCII characters survive the write.
            writer.writerow([str(line[0]), int(line[1]), str(line[2]),
                             line[3].encode('utf-8')])


def main():
    print "Running tests..."
    htmls = get_html_url()
    check_url(htmls)
    print "test end..."


if __name__ == '__main__':
    main()
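As an aside, newer versions of requests (2.10 and later, with the pysocks extra installed) can talk to a SOCKS5 proxy directly, so cow can be skipped entirely. A sketch under those assumptions, using the expert bundle's default SOCKS port 9050 (Tor Browser uses 9150) and a placeholder onion address:

# -*- coding: utf-8 -*-
# Alternative without cow: point requests straight at Tor's SOCKS5 port.
# Requires `pip install requests[socks]`. The socks5h:// scheme resolves
# hostnames on the proxy side, which is required for .onion addresses.
import requests

proxies = {'http': 'socks5h://127.0.0.1:9050',
           'https': 'socks5h://127.0.0.1:9050'}
# exampleonionaddress.onion is a placeholder, not a real service.
r = requests.get('http://exampleonionaddress.onion/', proxies=proxies, timeout=20)
print r.status_code

With this approach the stem-based renew_connection() stays the same, since it talks to the control port rather than the SOCKS port.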