文章/答案/技术大牛

发布

利用 Python 编写简单网络爬虫实例 3

文章来源：企鹅号 - 网络数据采集

实验环境

python 版本：3.3.5

实验目的

目标 url 随机分布在各子页面的文章中，我们要把它们找出来。

python 脚本

#!/usr/bin/env python

# -*- coding:utf-8 -*-

from urllib.request import *

import gzip, re

from io import BytesIO

from html.parser import HTMLParser

# 爬虫类

class Reptile:
    """Download web pages and pick matching URLs out of extracted link sets."""

    def __init__(self):
        # URLs that get_page() has already been asked to download.
        self.url_set = set()
        # Body of the most recent successful download ("" when there is none).
        self.data = ""

    def get_page(self, url, headers):
        """Download ``url`` and return the page body.

        The request asks for gzip transfer encoding to reduce traffic and
        transparently decompresses the body when the server honours it.

        Args:
            url: Address of the page to download.
            headers: Mapping of HTTP request headers to send.

        Returns:
            The page body as ``bytes`` on success, or ``""`` when the
            download failed (the failure is printed, not raised).
        """
        # Start from an empty result so a failed download cannot hand back
        # the body of a previously downloaded page.
        self.data = ""
        request = Request(url, headers=headers)
        # Ask for a gzip-compressed response to reduce network traffic.
        request.add_header('Accept-encoding', 'gzip')
        try:
            with urlopen(request) as response:
                if response.code == 200:
                    page = response.read()
                    if response.info().get("Content-Encoding") == "gzip":
                        with gzip.GzipFile(fileobj=BytesIO(page)) as gzipper:
                            self.data = gzipper.read()
                    else:
                        print("gzip unused")
                        # Server ignored the gzip request: use the raw body.
                        self.data = page
        except Exception as exc:
            # Best-effort crawl: report the failure and carry on with the
            # next URL instead of aborting the whole run.
            print("failed to download %s: %s" % (url, exc))
        self.url_set.add(url)
        return self.data

    @staticmethod
    def _take_matching(url_set, home, include):
        """Drain ``url_set`` and return ``home + url`` for every match."""
        matched = {home + url for url in url_set if re.search(include, url)}
        # Callers reuse the same set for the next page, so it must end up
        # empty exactly as the original pop-until-empty loop left it.
        url_set.clear()
        return matched

    def get_forum_url(self, url_set, home, include):
        """Return the absolute forum-board URLs found in ``url_set``.

        Board links are usually relative, e.g. ``forum-53-1.html``, so
        ``home`` is prepended to each URL that matches ``include``.

        Args:
            url_set: Set of extracted URLs; it is emptied by this call.
            home: Prefix prepended to every matching URL.
            include: Regular expression (string or compiled) to search for.

        Returns:
            A new set holding the prefixed matching URLs.
        """
        return self._take_matching(url_set, home, include)

    def get_title_url(self, url_set, home, include):
        """Return the absolute thread URLs found in ``url_set``.

        Thread links are usually relative, e.g. ``thread-1044711-1-1.html``,
        so ``home`` is prepended to each URL that matches ``include``.

        Args:
            url_set: Set of extracted URLs; it is emptied by this call.
            home: Prefix prepended to every matching URL.
            include: Regular expression (string or compiled) to search for.

        Returns:
            A new set holding the prefixed matching URLs.
        """
        return self._take_matching(url_set, home, include)

# 解析器类

class MyHtmlParser(HTMLParser):
    """HTML parser that collects the value of every ``href`` attribute.

    The collected values accumulate in ``self.url_set`` across ``feed()``
    calls until ``reset()`` is called or a caller empties the set.
    """

    def __init__(self, strict=False, **kwargs):
        # `strict` is accepted only so existing `MyHtmlParser(strict=False)`
        # calls keep working; HTMLParser dropped that argument in Python 3.5,
        # so it is deliberately not forwarded.
        HTMLParser.__init__(self, **kwargs)

    def reset(self):
        """Reset the parser state and start a fresh, empty ``url_set``."""
        # Reset the base class first, then (re)create our own state.
        HTMLParser.reset(self)
        self.url_set = set()

    def handle_starttag(self, tag, attrs):
        """Record the ``href`` values of a start tag, whatever the tag is."""
        for key, value in attrs:
            # A valueless attribute such as `<a href>` is reported with
            # value None; skip it so url_set only ever contains strings.
            if key == "href" and value is not None:
                self.url_set.add(value)

##############测试################

# Send a browser-like User-Agent; some sites refuse requests from obvious crawlers.
headers = {'User-Agent': "Mozilla/5.0 (Windows NT 6.1; WOW64; rv:33.0) "
                         "Gecko/20100101 Firefox/33.0"}

# NOTE(review): the original listing uses `home` and `init_url` without ever
# defining them. The values below are inferred from the target URL pattern
# used at the end of this script -- confirm them before running.
home = "http://bbs.51testing.com/"
init_url = home


def page_text(page):
    """Return a downloaded page as text suitable for the HTML parser.

    get_page() yields bytes on success and "" on failure. Calling str() on
    bytes would produce the "b'...'" repr (with escaped quotes) rather than
    the HTML itself, so bytes are decoded instead. Undecodable bytes are
    replaced rather than raising an error.
    """
    if isinstance(page, bytes):
        return page.decode("utf-8", errors="replace")
    return page


# Build the parser.
parser = MyHtmlParser()

# Download the start page.
page_number = 1
print("program is downloading the frist url page")
reptile = Reptile()
page = reptile.get_page(init_url, headers)
print("processing the %dth url page" % page_number)

# Parse the page (collect its URLs).
parser.feed(page_text(page))

# Collect the URLs of the forum categories of interest. The alternatives are
# kept in a list for readability and joined with "|" so that a URL matching
# any one of them is accepted.
forum_patterns = [
    "forum-122-[1-9]", "forum-243-[1-9]", "forum-40-[1-9]", "forum-63-[1-9]",
    "forum-42-[1-9]", "forum-53-[1-9]", "forum-275-[1-9]", "forum-140-[1-9]",
    "forum-138-[1-9]", "forum-139-[1-9]", "forum-141-[1-9]",
]
pattern = "|".join(forum_patterns)
include = re.compile(pattern)
forum_url_set = reptile.get_forum_url(parser.url_set, home, include)

# For every category page, collect the board (pagination) URLs it links to.
result_url_set = set()
forum_index = 1
for forum_url in forum_url_set:
    page = reptile.get_page(forum_url, headers)
    parser.feed(page_text(page))
    print("getting the board urls in the %dth forum page" % forum_index)
    tmp_url_set = reptile.get_forum_url(parser.url_set, home, include)
    forum_index = forum_index + 1
    # Union, not symmetric difference: a URL found on two pages must be kept.
    result_url_set |= tmp_url_set

# Collect the thread URLs listed on every board page. Thread links look like
# thread-1044711-1-1.html, so each numeric field may have several digits.
thread_include = re.compile("thread-[0-9]+-[0-9]+-[0-9]+[.]html")
title_url_set = set()
forum_index = 1
for forum_url in result_url_set:
    page = reptile.get_page(forum_url, headers)
    parser.feed(page_text(page))
    print("getting all title urls in the %dth forum board" % forum_index)
    tmp_url_set = reptile.get_title_url(parser.url_set, home, thread_include)
    forum_index = forum_index + 1
    title_url_set |= tmp_url_set

# Search every thread page for the target URL and save each hit.
target_include = re.compile(
    "http://bbs.51testing.com/treasure/treasure.php[?]trenum=[0-9]")
target_index = 1
title_index = 1
filepath = "d:/url.txt"
for title_url in title_url_set:
    print("processing the %dth title url" % title_index)
    page = reptile.get_page(title_url, headers)
    parser.feed(page_text(page))
    # Append, so hits from earlier threads (and earlier runs) are kept.
    with open(filepath, "a") as f:
        # Drain the parser's set so the next page starts from empty.
        while len(parser.url_set) > 0:
            url = parser.url_set.pop()
            if target_include.search(url):
                print("find target! saving the %dth target url in the %dth title page" %
                      (target_index, title_index))
                f.write("the %dth url: %s" % (target_index, url))
                target_index = target_index + 1
                f.write("\n")
    title_index = title_index + 1

print("complete")

结果：

声明：仅供学习研究使用，请勿用于其它非法用途。

发表于: 2018-01-09 23:00:16
原文链接：http://kuaibao.qq.com/s/20180109G0Z38R00?refer=cp_1026
腾讯「腾讯云开发者社区」是腾讯内容开放平台帐号（企鹅号）传播渠道之一，根据《腾讯内容开放平台服务协议》转载发布内容。
如有侵权，请联系 cloudcommunity@tencent.com 删除。

扫码

添加站长进交流群

领取专属 10元无门槛券

私享最新 技术干货

利用 Python 编写简单网络爬虫实例 3

相关快讯

扫码

社区

活动

圈层

关于

腾讯云开发者

热门产品

热门推荐

更多推荐