利用 Python 编写简单网络爬虫实例 1

实验环境

python 版本:3.3.5

实验目的

学习研究,获取网站中特定 url

基本思路:

1)给定一个初始 URL(入口 URL)

2)下载初始页面,解析并抓取初始页面中特定的 URL,同时还可以对已分析数据进行保存

3)根据实际情况,决定是否对抓取的 URL 进行进一步过滤,筛选

4)循环,将过滤后的每个 URL 当作初始 url 再次抓取

这里需要对已抓取的 URL 做判断,以免重复抓取,即需要保存已抓取记录,便于后续判断。

python 脚本

#!/usr/bin/env python

# -*- coding:utf-8 -*-

from urllib.request import *

import gzip, re

from io import BytesIO

from html.parser import HTMLParser

# 爬虫类

class Reptile:
    """Download web pages and harvest URLs of interest.

    Attributes:
        url_set: set of URLs already fetched (used to avoid re-downloading).
        filepath: path of the text file where matching target URLs are appended.
        data: body of the most recently downloaded page (bytes).
    """

    # Target URLs that get persisted to ``filepath``.  Dots are escaped so
    # they match literally (the original pattern left them as wildcards).
    # NOTE(review): pattern is specific to bbs.51testing.com treasure pages.
    _TARGET_RE = re.compile(
        r"http://bbs\.51testing\.com/treasure/treasure\.php\?trenum=[0-9]{5}"
    )

    def __init__(self, filepath="d:/url.txt"):
        self.url_set = set()       # pages already downloaded
        self.filepath = filepath   # where matching target URLs are saved
        self.data = ""             # most recently downloaded page body

    def get_page(self, url, headers):
        """Fetch *url*, transparently decompressing gzip responses.

        The URL is recorded in ``self.url_set`` regardless of success, and
        appended to ``self.filepath`` when it matches the target pattern.
        Returns the page body, or the previous body if the request failed.
        """
        request = Request(url, headers=headers)
        # Ask the server for gzip so less data travels over the wire.
        request.add_header('Accept-encoding', 'gzip')
        try:
            response = urlopen(request)       # send the request
            if response.code == 200:          # success
                page = response.read()        # possibly-compressed body
                if response.info().get("Content-Encoding") == "gzip":
                    self.data = gzip.GzipFile(fileobj=BytesIO(page)).read()
                else:
                    # Server ignored the gzip request; use the raw body.
                    # BUG FIX: the original assigned the undefined name
                    # ``page_data`` here, raising NameError on this path.
                    print("gzip unused")
                    self.data = page
        except Exception:
            # Best-effort crawl: skip pages that fail to download, but
            # report which one instead of swallowing the error silently.
            print("failed to fetch %s" % url)

        self.url_set.add(url)
        # Persist the URL immediately if it is a target URL, so results
        # survive even if the crawl is interrupted later.
        if self._TARGET_RE.findall(url):
            with open(self.filepath, "a") as f:
                f.write(url)
                f.write("\n")
        return self.data

    def get_url_seed(self, url_set, include, exclusive):
        """Filter *url_set* down to seed URLs for the next crawl level.

        include: pattern a URL must match (same-server filter).
        exclusive: pattern a URL must NOT match, or "" to skip that step.
        Note: *url_set* is drained (emptied) — callers rely on this.
        """
        url_seed_set = set()   # URLs on the same server
        while url_set:
            url = url_set.pop()
            if re.findall(include, url):
                url_seed_set.add(url)
        # Optional second pass: drop unwanted URLs.
        if exclusive != "":
            return {u for u in url_seed_set if not re.findall(exclusive, u)}
        return url_seed_set

    def filter_seed_url(self, url_set):
        """Return the URLs in *url_set* that have not been downloaded yet."""
        return url_set - self.url_set

# 解析器类

class MyHtmlParser(HTMLParser):
    """HTML parser that collects every ``href`` attribute value it sees."""

    def reset(self):
        # The parent's reset must run first so the parser state is clean
        # before we (re)create our own accumulator.
        HTMLParser.reset(self)
        self.url_set = set()

    def handle_starttag(self, tag, attrs):
        # Any tag may carry an href; gather every href value found in the
        # attribute list into the accumulated set.
        self.url_set.update(value for key, value in attrs if key == "href")

##############  test driver  ##############

# Entry URL and same-server filter.
# NOTE(review): the original article referenced the undefined names
# ``init_url`` and ``homepage`` without ever defining them; plausible
# values are supplied here — adjust to the site you actually crawl.
homepage = "http://bbs.51testing.com"
init_url = homepage + "/forum.php"

# Browser-like headers so sites that reject obvious crawlers still answer.
headers = {'User-Agent': "Mozilla/5.0 (Windows NT 6.1; WOW64; rv:33.0) "
                         "Gecko/20100101 Firefox/33.0"}

# Build the parser.
# BUG FIX: the original passed strict=False, but the ``strict`` parameter
# was removed from HTMLParser in Python 3.5; the default behavior matches.
parser = MyHtmlParser()

# Download the entry page.
page_number = 1
print("program is downloading the first url page")
reptile = Reptile()
page = reptile.get_page(init_url, headers)
print("processing the %dth url page" % page_number)

# Parse the page to collect candidate URLs (accumulated in parser.url_set).
parser.feed(str(page))

# URLs to exclude from further crawling.
# NOTE(review): the '|' alternation separators were lost when the article
# was extracted; the pattern is reconstructed here — verify against intent.
exclusion = ("mod=login|card[.]php|archiver|mod=viewthread|[.]css|[.]js|"
             "[.]gif|[.]jpg|about[.]php|panel[.]php|[.]swf|search[.]php")
exclusive = re.compile(exclusion)
include = re.compile(homepage)
url_seed_set = reptile.get_url_seed(parser.url_set, include, exclusive)

# Drop anything already crawled.
result_set = reptile.filter_seed_url(url_seed_set)
print("complete")

# Interactive crawl loop: the user chooses how many levels deep to crawl.
if_continue = "yes"
while if_continue in ("yes", "", "YES", "Yes"):
    if_continue = input("if continue another grab, input 'yes' or click Enter "
                        "to continue, anything else to exit: ")
    if if_continue not in ("yes", "", "Yes", "YES"):
        break
    deep = input("the level you want reptile to try: ")
    if not deep.isdigit():
        print("value must be a number, try again")
        continue
    level = int(deep)
    i = 0
    total_set = set()
    while i < level:
        print("*****************parsing url pages on the %dth level"
              "*****************" % (i + 1))
        for url in result_set:
            print("program is processing the %dth url" % (page_number + 1))
            page = reptile.get_page(url, headers)
            parser.feed(str(page))
            url_seed_set = reptile.get_url_seed(parser.url_set, include,
                                                exclusive)
            result_set_tmp = reptile.filter_seed_url(url_seed_set)
            # BUG FIX: the original used '^' (symmetric difference), which
            # silently DROPS a URL discovered on more than one page within
            # the same level; union keeps every new URL exactly once.
            total_set |= result_set_tmp
            page_number += 1
        result_set = total_set
        if result_set:
            i += 1
        else:
            break  # no new URLs found — nothing left to crawl

print("complete")

结果:

声明:仅供学习研究使用,请勿用于其它非法用途

  • 发表于:
  • 原文链接:http://kuaibao.qq.com/s/20180107G0PXPC00?refer=cp_1026

扫码关注云+社区