代码:
# -*- coding:utf-8 -*-
import io
import re
import urllib
import urllib2
# Helper that turns a fragment of post HTML into plain text.
class Tool:
    """Strip or rewrite HTML tags so a post body reads as plain text.

    NOTE(review): the tag portions of the patterns below were missing from
    the original text (they look like they were stripped as markup when the
    code was published).  They have been reconstructed from the comment that
    accompanies each pattern -- confirm against real page content.
    """

    # Remove <img> tags and runs of seven spaces.
    removeImg = re.compile(r'<img.*?>| {7}')
    # Remove hyperlink tags (the link text itself is kept).
    removeAddr = re.compile(r'<a.*?>|</a>')
    # Tags that end a visual line; replaced with "\n".
    replaceLine = re.compile(r'<tr>|<div>|</div>|</p>')
    # Table cells; replaced with "\t".
    replaceTD = re.compile(r'<td>')
    # Paragraph openers; replaced with "\n".
    replacePara = re.compile(r'<p.*?>')
    # Single or double <br>; replaced with one "\n".
    replaceBR = re.compile(r'<br><br>|<br>')
    # Any tag still left after the steps above; removed.
    removeExtraTag = re.compile(r'<.*?>')

    def replace(self, x):
        """Return ``x`` with HTML tags removed or converted to whitespace.

        The substitutions run in a fixed order: specific tags are handled
        first and the catch-all tag remover runs last, so that tags which
        should become a newline or tab are not deleted beforehand.

        :param x: HTML text of one post.
        :returns: the plain text, with leading/trailing whitespace stripped.
        """
        x = self.removeImg.sub("", x)
        x = self.removeAddr.sub("", x)
        x = self.replaceLine.sub("\n", x)
        x = self.replaceTD.sub("\t", x)
        x = self.replacePara.sub("\n", x)
        x = self.replaceBR.sub("\n", x)
        x = self.removeExtraTag.sub("", x)
        # strip() drops the surplus whitespace left at both ends.
        return x.strip()
# Baidu Tieba thread crawler.
class BDTB:
    """Download the pages of one Baidu Tieba thread and save the posts as text.

    NOTE(review): the three page-scraping patterns below had lost their HTML
    tag text in the original (one was left as an unterminated string).  They
    have been reconstructed for Tieba's thread markup and must be verified
    against a live page before relying on them.
    """

    # Thread title: text of the heading carrying the "core_title_txt" class.
    _TITLE_PATTERN = re.compile(
        r'<h[1-6][^>]*class="core_title_txt.*?>(.*?)</h[1-6]>', re.S)
    # Page count: second <span> inside the "l_reply_num" list item.
    _PAGE_NUM_PATTERN = re.compile(
        r'<li class="l_reply_num.*?</span>.*?<span.*?>(.*?)</span>', re.S)
    # Post body: contents of each <div id="post_content_...">.
    _CONTENT_PATTERN = re.compile(
        r'<div id="post_content_.*?>(.*?)</div>', re.S)

    def __init__(self, baseUrl, seeLZ, floorTag):
        """Store the crawl settings.

        :param baseUrl: thread URL without query string.
        :param seeLZ: value for the ``see_lz`` query parameter
            (1 = only the thread starter's posts, 0 = all posts).
        :param floorTag: ``'1'`` to write a separator line before each post.
        """
        # Base thread address.
        self.baseURL = baseUrl
        # Query-string fragment selecting "thread starter only" mode.
        self.seeLZ = '?see_lz=' + str(seeLZ)
        # Tag-stripping helper.
        self.tool = Tool()
        # Output file object; opened by setFileTitle().
        self.file = None
        # Number of the next post ("floor") to be written, starting at 1.
        self.floor = 1
        # File name used when the thread title cannot be extracted.
        self.defaultTitle = u"百度贴吧"
        # Whether to write a separator line before each post.
        self.floorTag = floorTag

    def getPage(self, pageNum):
        """Fetch one page of the thread.

        :param pageNum: 1-based page number.
        :returns: the page decoded as UTF-8 text, or ``None`` if the request
            failed with ``urllib2.URLError``.
        """
        url = self.baseURL + self.seeLZ + '&pn=' + str(pageNum)
        try:
            response = urllib2.urlopen(urllib2.Request(url))
            try:
                return response.read().decode('utf-8')
            finally:
                response.close()
        except urllib2.URLError as e:
            if hasattr(e, "reason"):
                print(u"连接百度贴吧失败,错误原因 %s" % (e.reason,))
            return None

    def getTitle(self, page):
        """Return the thread title found in ``page``, or ``None``."""
        if not page:
            return None
        result = self._TITLE_PATTERN.search(page)
        if result:
            return result.group(1).strip()
        return None

    def getPageNum(self, page):
        """Return the thread's page count (as text) found in ``page``, or ``None``."""
        if not page:
            return None
        result = self._PAGE_NUM_PATTERN.search(page)
        if result:
            return result.group(1).strip()
        return None

    def getContent(self, page):
        """Extract every post body from ``page``.

        :returns: list of UTF-8 encoded byte strings, one per post, each
            wrapped in a leading and trailing newline.
        """
        contents = []
        for item in self._CONTENT_PATTERN.findall(page):
            # Strip the tags, then pad with newlines so posts stay separated.
            content = "\n" + self.tool.replace(item) + "\n"
            contents.append(content.encode('utf-8'))
        return contents

    def setFileTitle(self, title):
        """Open ``<title>.txt`` for writing; fall back to the default title.

        The file is opened as UTF-8 text and stored in ``self.file``.
        """
        name = title if title is not None else self.defaultTitle
        self.file = io.open(name + ".txt", "w+", encoding="utf-8")

    def writeData(self, contents):
        """Write each post in ``contents`` to the output file.

        ``contents`` items may be UTF-8 byte strings (as returned by
        getContent) or text.  Advances ``self.floor`` once per post.
        """
        for item in contents:
            if str(self.floorTag) == '1':
                # Separator line between posts, prefixed by the floor number.
                floorLine = u"\n" + str(
                    self.floor) + u"-----------------------------------------------------------------------------------------\n"
                self.file.write(floorLine)
            if isinstance(item, bytes):
                # The file is opened in text mode, so decode before writing.
                item = item.decode('utf-8')
            self.file.write(item)
            self.floor += 1

    def start(self):
        """Crawl the whole thread and write it to a text file.

        Fetches the first page to learn the title and page count, then
        writes every page in order.  The output file is always closed.
        """
        # The first page carries the title and the page count.
        indexPage = self.getPage(1)
        pageNum = self.getPageNum(indexPage)
        if pageNum is None:
            # Checked before opening the file so a bad URL leaves no empty file.
            print("URL已失效,请重试")
            return
        title = self.getTitle(indexPage)
        try:
            self.setFileTitle(title)
            print("该帖子共有" + str(pageNum) + "页")
            for i in range(1, int(pageNum) + 1):
                print("正在写入第" + str(i) + "页数据")
                # Page 1 is already in hand; avoid downloading it twice.
                page = indexPage if i == 1 else self.getPage(i)
                if page is None:
                    # getPage already reported the failure; skip this page.
                    continue
                self.writeData(self.getContent(page))
        except IOError as e:
            # Writing (or opening) the output file failed.
            print("写入异常,原因" + str(e))
        finally:
            if self.file is not None:
                self.file.close()
                self.file = None
            print("写入任务完成")
def main():
    """Prompt for the thread id and options, then crawl the thread."""
    try:
        read_line = raw_input  # Python 2
    except NameError:
        read_line = input  # Python 3
    print(u"请输入帖子代号")
    # The prompt shows the URL prefix; the user types only the thread id.
    baseURL = 'http://tieba.baidu.com/p/' + str(read_line(u'http://tieba.baidu.com/p/')).strip()
    seeLZ = read_line("是否只获取楼主发言,是输入1,否输入0\n").strip()
    floorTag = read_line("是否写入楼层信息,是输入1,否输入0\n").strip()
    bdtb = BDTB(baseURL, seeLZ, floorTag)
    bdtb.start()


# Only prompt when run as a script, not when the module is imported.
if __name__ == "__main__":
    main()
效果图:
领取专属 10元无门槛券
私享最新 技术干货