实验环境
python 版本:3.3.5
实验目的
学习研究,获取网站中特定 url
基本思路:
1)给定一个初始 URL(入口 URL)
2)下载初始页面,解析并抓取初始页面中特定的 URL,同时还可以对已分析数据进行保存
3)根据实际情况,决定是否对抓取的 URL 进行进一步过滤,筛选
4)循环,将过滤后的每个 URL 当作初始 url 再次抓取
这里需要对已抓取的 URL 做判断,以免重复抓取,即需要保存已抓取记录,便于后续判断。
python 脚本
#!/usr/bin/env python
# -*- coding:utf-8 -*-
from urllib.request import *
import gzip, re
from io import BytesIO
from html.parser import HTMLParser
# 爬虫类
class Reptile:
    """Download web pages and harvest/persist target URLs.

    Attributes:
        url_set: set of URLs already fetched (avoids re-crawling).
        filepath: file to which matching target URLs are appended.
        data: body (bytes) of the most recently downloaded page.
    """

    def __init__(self, filepath="d:/url.txt"):
        self.url_set = set()      # URLs of pages already downloaded
        self.filepath = filepath  # destination file for target URLs
        self.data = ""

    def get_page(self, url, headers):
        """Fetch *url*, record it, persist it if it is a target URL.

        Requests gzip encoding to cut network traffic and transparently
        decompresses when the server honoured it.  Download failures are
        deliberately swallowed (best-effort crawl).

        Returns the page body (bytes), or the previous body on failure.
        """
        request = Request(url, headers=headers)
        # Ask for a gzip-compressed response to reduce network traffic.
        request.add_header('Accept-encoding', 'gzip')
        try:
            response = urlopen(request)
            if response.code == 200:
                page = response.read()
                if response.info().get("Content-Encoding") == "gzip":
                    # Decompress the gzipped body in memory.
                    gzipper = gzip.GzipFile(fileobj=BytesIO(page))
                    self.data = gzipper.read()
                else:
                    print("gzip unused")
                    # BUG FIX: the original assigned the undefined name
                    # `page_data` here (NameError hidden by the broad
                    # except); the raw bytes are what is wanted.
                    self.data = page
        except Exception:
            # Best-effort crawl: skip pages that fail to download.
            pass
        self.url_set.add(url)
        # Persist the URL immediately if it matches the target pattern.
        with open(self.filepath, "a") as f:
            flag = re.findall(
                "http://bbs.51testing.com/treasure/treasure.php[?]trenum=[0-9][0-9][0-9][0-9][0-9]",
                url)
            if flag:
                f.write(url)
                f.write("\n")
        return self.data

    # 获取种子 URL
    def get_url_seed(self, url_set, include, exclusive):
        """Filter *url_set* down to seed URLs for the next crawl level.

        *include* keeps URLs on the current server; *exclusive* (when not
        "") further drops unwanted URLs.  NOTE: *url_set* is consumed
        (emptied) by this method, matching the original behaviour.
        """
        url_seed_set = set()  # URLs on the same server
        seed_set = set()      # URLs remaining after exclusion filtering
        # Drop URLs that do not belong to the current server.
        while len(url_set) != 0:
            url = url_set.pop()
            if re.findall(include, url):
                url_seed_set.add(url)
        # Optionally filter out unwanted URLs.
        if exclusive != "":
            while len(url_seed_set) != 0:
                url = url_seed_set.pop()
                if re.findall(exclusive, url) == []:
                    seed_set.add(url)
            return seed_set
        else:
            return url_seed_set

    # 筛选种子 url(过滤掉已经抓取过的 url)
    def filter_seed_url(self, url_set):
        """Return the seed URLs not yet crawled (set difference)."""
        result_set = url_set - self.url_set
        return result_set
# 解析器类
class MyHtmlParser(HTMLParser):
    """HTML parser that collects every href attribute value it sees."""

    def reset(self):
        # Reset the base parser first, then (re)initialise our own state.
        HTMLParser.reset(self)
        self.url_set = set()

    def handle_starttag(self, tag, attrs):
        # Record the value of each href attribute on this start tag.
        for name, value in attrs:
            if name == "href":
                self.url_set.add(value)
##############测试################
# 添加头域,伪装浏览器访问网站,防止一些网站拒绝爬虫访问
headers = {'User-Agent':"Mozilla/5.0 (Windows NT 6.1; WOW64; rv:33.0) Gecko/20100101
Firefox/33.0"}
# 构造解析器
parser = MyHtmlParser(strict = False)
# 下载网页
page_number = 1
print("program is downloading the frist url page")
reptile = Reptile()
page = reptile.get_page(init_url, headers)
print("processing the %dth url page" % page_number)
# 解析网页(获取 url)
parser.feed(str(page))
# 获取种子 url
exclusion = "mod=logincard.phparchivermod=viewthread[.]css[.]js[.]gif"
".jpgabout[.]phppanel[.]php[.]swfsearch[.]php"
exclusive = re.compile(exclusion)
include = re.compile(homepage)
url_seed_set = reptile.get_url_seed(parser.url_set, include, exclusive)
# 过滤种子 url
result_set = reptile.filter_seed_url(url_seed_set)
print("complete")
# 循环
if_continue = "yes"
while if_continue in ("yes", "", "YES", "Yes"):
if_continue = input("if continue another grab, input 'yes' or click Enter to continue,"
"anything else to exit: ")
if if_continue not in ("yes", "", "Yes", "YES"):
break
deep = input("the level you want reptile to try: ")
if deep.isdigit() == False:
print("value must be a number, try gain")
continue
else:
level = int(deep)
num = 0
i = 0
total_set = set()
while i < level:
print("*****************parsing url pages on the %dth level*****************" % (i+1))
for url in result_set:
print("program is processing the %dth url" % (page_number+1))
page = reptile.get_page(url, headers)
parser.feed(str(page))
url_seed_set = reptile.get_url_seed(parser.url_set, include, exclusive)
result_set_tmp = reptile.filter_seed_url(url_seed_set)
total_set = total_set ^ result_set_tmp
page_number = page_number + 1
result_set = total_set
if result_set:
i = i + 1
else:
break
print("complete")
结果:
声明:仅供学习研究使用,请勿用于其它非法用途
领取专属 10元无门槛券
私享最新 技术干货