Python爬虫教程，爬取小说网站

python学习教程

发布于 2020-03-02 10:05:04

1.4K0

发布于 2020-03-02 10:05:04

文章被收录于专栏：python学习教程

爬取网站：http://www.biqugecom.com/

爬取方式：整站爬取，就是把该站所有的小说都爬下来。

本次爬取涉及到的知识点有：

Xpath

类的定义及使用

requests库的使用

准备工作

安装requests库：

pip3 install requests

安装lxml库：

pip3 install lxml

分析网站：

观察各个分类页面的链接可以发现，它们只有 list/ 后面的分类编号（1～8）不同，因此这里使用Python自动生成了分类的链接：

# Build the first-page URL of each category listing (category ids 1 through 8).
typeLinks = []
for i in range(1, 9):
    typeLinks.append('http://www.biqugecom.com/list/%s-1.html' % (str(i)))
print(typeLinks)

结果为：

['http://www.biqugecom.com/list/1-1.html', 'http://www.biqugecom.com/list/2-1.html', 'http://www.biqugecom.com/list/3-1.html', 'http://www.biqugecom.com/list/4-1.html', 'http://www.biqugecom.com/list/5-1.html', 'http://www.biqugecom.com/list/6-1.html', 'http://www.biqugecom.com/list/7-1.html', 'http://www.biqugecom.com/list/8-1.html']

之后再根据每个分类的链接，爬取该分类下的小说，分析分类页面小说的Xpath：

通过Xpath 获取到了一本小说的链接。

再根据一本小说的链接爬取该小说的章节链接，首先获取章节的Xpath：

获取到了一章的链接，再根据一章的链接获取小说的文本内容，还是Xpath获取：

获取到小说的文本内容，下面就可以开始爬取内容了，这里先整理一下待爬取的内容的Xpath。

分类页面中每本小说链接的Xpath：

//*[@class='media-heading book-title']/a/@href

小说章节链接的Xpath：

//*//dd//a/@href

小说内容的Xpath：

//*[@id='content']/text()

话不多说，直接上代码：


import requests
import random
from lxml import etree
import os
 
# Set the number of connection retries for the requests library.
# NOTE(review): requests' HTTPAdapter appears to capture DEFAULT_RETRIES as a
# default argument value when the class is defined, so reassigning the module
# attribute here may not change the retry count of adapters created later --
# confirm against the installed requests version before relying on it.
requests.adapters.DEFAULT_RETRIES = 5
# Root URL of the site being scraped
HOST = 'http://www.biqugecom.com'
# Pool of browser User-Agent strings; one is picked at random for each request.
# Every entry must end with a comma: adjacent string literals without one are
# silently concatenated by Python into a single, malformed User-Agent.
user_agent = [
    "Mozilla/5.0 (Windows NT 6.1; WOW64) AppleWebKit/537.1 (KHTML, like Gecko) Chrome/22.0.1207.1 Safari/537.1",
    "Mozilla/5.0 (X11; CrOS i686 2268.111.0) AppleWebKit/536.11 (KHTML, like Gecko) Chrome/20.0.1132.57 Safari/536.11",
    "Mozilla/5.0 (Windows NT 6.1; WOW64) AppleWebKit/536.6 (KHTML, like Gecko) Chrome/20.0.1092.0 Safari/536.6",
    "Mozilla/5.0 (Windows NT 6.2) AppleWebKit/536.6 (KHTML, like Gecko) Chrome/20.0.1090.0 Safari/536.6",
    "Mozilla/5.0 (Windows NT 6.2; WOW64) AppleWebKit/537.1 (KHTML, like Gecko) Chrome/19.77.34.5 Safari/537.1",
    "Mozilla/5.0 (X11; Linux x86_64) AppleWebKit/536.5 (KHTML, like Gecko) Chrome/19.0.1084.9 Safari/536.5",
    "Mozilla/5.0 (Windows NT 6.0) AppleWebKit/536.5 (KHTML, like Gecko) Chrome/19.0.1084.36 Safari/536.5",
    "Mozilla/5.0 (Windows NT 6.1; WOW64) AppleWebKit/536.3 (KHTML, like Gecko) Chrome/19.0.1063.0 Safari/536.3",
    "Mozilla/5.0 (Windows NT 5.1) AppleWebKit/536.3 (KHTML, like Gecko) Chrome/19.0.1063.0 Safari/536.3",
    "Mozilla/5.0 (Macintosh; Intel Mac OS X 10_8_0) AppleWebKit/536.3 (KHTML, like Gecko) Chrome/19.0.1063.0 Safari/536.3",
    "Mozilla/5.0 (Windows NT 6.2) AppleWebKit/536.3 (KHTML, like Gecko) Chrome/19.0.1062.0 Safari/536.3",
    "Mozilla/5.0 (Windows NT 6.1; WOW64) AppleWebKit/536.3 (KHTML, like Gecko) Chrome/19.0.1062.0 Safari/536.3",
    "Mozilla/5.0 (Windows NT 6.2) AppleWebKit/536.3 (KHTML, like Gecko) Chrome/19.0.1061.1 Safari/536.3",
    "Mozilla/5.0 (Windows NT 6.1; WOW64) AppleWebKit/536.3 (KHTML, like Gecko) Chrome/19.0.1061.1 Safari/536.3",
    "Mozilla/5.0 (Windows NT 6.1) AppleWebKit/536.3 (KHTML, like Gecko) Chrome/19.0.1061.1 Safari/536.3",
    "Mozilla/5.0 (Windows NT 6.2) AppleWebKit/536.3 (KHTML, like Gecko) Chrome/19.0.1061.0 Safari/536.3",
    "Mozilla/5.0 (X11; Linux x86_64) AppleWebKit/535.24 (KHTML, like Gecko) Chrome/19.0.1055.1 Safari/535.24",
    "Mozilla/5.0 (Windows NT 6.2; WOW64) AppleWebKit/535.24 (KHTML, like Gecko) Chrome/19.0.1055.1 Safari/535.24",
]
 
 
# Scrape a single novel
class ScrapyOne(object):
    """Download the chapters of one novel and store each as a text file.

    ``rootLink`` is the URL of the novel's page, from which the book title
    and the chapter links are read.
    """

    # Seconds to wait on a request before giving up; without a timeout a
    # stalled connection would block the whole crawl indefinitely.
    TIMEOUT = 10

    # Characters replaced in scraped titles before they are used as file or
    # directory names: path separators, characters Windows rejects in file
    # names, and line breaks/tabs.
    _UNSAFE_CHARS = '\\/:*?"<>|\r\n\t'

    def __init__(self, rootLink):
        super(ScrapyOne, self).__init__()
        self.rootLink = rootLink

    @staticmethod
    def _safeName(name):
        """Return ``name`` reduced to a single safe path component."""
        cleaned = ''.join('_' if ch in ScrapyOne._UNSAFE_CHARS else ch for ch in name)
        # Trailing dots/spaces are dropped so that '.' and '..' can never be
        # used to address the current or parent directory.
        cleaned = cleaned.strip().rstrip('.')
        return cleaned or '_'

    # Collect the link of every chapter
    def scrapyLink(self):
        """Fetch the novel page and extract the book title and chapter URLs.

        Returns:
            ``{'bookname': str, 'links': list}`` on success; an empty list
            when the request or parsing fails or no chapter links are found.
        """
        try:
            # Use a randomly chosen User-Agent for the request
            header = {"User-Agent": random.choice(user_agent)}
            res = requests.get(self.rootLink, headers=header, timeout=self.TIMEOUT)
            res.encoding = 'gbk'
            # Parse the HTML
            data = etree.HTML(res.text)
            # Book title
            bookname = data.xpath("//*[@id='info']/h1/text()")[0]
            # The first 9 links are recommended chapters, so start from the 10th
            links = [HOST + link for link in data.xpath("//*//dd//a/@href")[9:]]
        except Exception as e:
            # Best effort: report the problem and let the caller skip this book
            print(e)
            return []
        if not links:
            return []
        return {
            'bookname': bookname,
            'links': links
        }

    # Scrape the content of one chapter
    def scrapyText(self, url):
        """Fetch one chapter page and extract its title and text lines.

        Args:
            url: URL of the chapter page.

        Returns:
            ``{'name': str, 'texts': list}`` on success; ``False`` when the
            request or parsing fails or the chapter has no text.
        """
        try:
            header = {"User-Agent": random.choice(user_agent)}
            res = requests.get(url, headers=header, timeout=self.TIMEOUT)
            res.encoding = 'gbk'
            data = etree.HTML(res.text)
            # Chapter title
            name = data.xpath("//*[@class='bookname']/h1/text()")[0]
            texts = []
            # Chapter text: drop line breaks and the 4-NBSP paragraph indent,
            # and skip fragments that end up empty
            for text in data.xpath("//*[@id='content']/text()"):
                text = text.replace('\r\n', '').replace('\xa0\xa0\xa0\xa0', '')
                if text:
                    texts.append(text)
        except Exception as e:
            print(e)
            return False
        if not texts:
            return False
        return {
            'name': name,
            'texts': texts
        }

    # Save one chapter
    def save(self, bookname, name, texts):
        """Write one chapter to ``./<bookname>/<name>.txt``.

        The title is written as the first line, followed by one line per
        entry of ``texts``. Characters that are unsafe in file names are
        replaced with ``_`` in the directory and file names only; the text
        written into the file is left untouched.

        Args:
            bookname: Book title, used as the directory name.
            name: Chapter title, used as the file name and first line.
            texts: Iterable of text lines of the chapter.

        Returns:
            ``True`` when the file was written, ``False`` on any error.
        """
        try:
            folder = os.path.join('.', self._safeName(bookname))
            # Create the book folder on first use; no error if it exists
            os.makedirs(folder, exist_ok=True)
            path = os.path.join(folder, self._safeName(name) + '.txt')
            with open(path, 'w', encoding='utf-8-sig') as f:
                f.write(name + '\n')
                f.writelines(text + '\n' for text in texts)
            return True
        except Exception as e:
            print(e)
            return False

    # Entry point
    def main(self):
        """Scrape the whole novel: fetch the chapter list, then save each chapter."""
        try:
            # Chapter information of the book
            bookInfo = self.scrapyLink()
            if not bookInfo:
                # scrapyLink() returns an empty list on failure; nothing to do
                print('未获取到章节信息', self.rootLink)
                return
            # Running index prefixed to file names so that the chapters sort
            # in reading order in a file manager
            i = 0
            for link in bookInfo['links']:
                # Content of one chapter
                info = self.scrapyText(link)
                if info:
                    if self.save(bookInfo['bookname'], str(i) + '-' + info['name'], info['texts']):
                        print('存储成功', info['name'])
                    else:
                        print('存储失败', info['name'])
                    i += 1
        except Exception as e:
            print(e)
 
 
# Collect the novel links listed on one category page
def scrapyRootLink(url, timeout=10):
    """Return the novel links found on the category page at ``url``.

    Args:
        url: URL of a category listing page.
        timeout: Seconds to wait for the server before giving up, so that a
            stalled connection cannot block the crawl indefinitely.

    Returns:
        A list of the non-empty ``href`` values of the book-title anchors;
        an empty list when none are found or the request/parsing fails.
    """
    try:
        header = {"User-Agent": random.choice(user_agent)}
        res = requests.get(url, headers=header, timeout=timeout)
        res.encoding = 'gbk'
        data = etree.HTML(res.text)
        links = [
            link
            for link in data.xpath("//*[@class='media-heading book-title']/a/@href")
            if link
        ]
    except Exception as e:
        # Best effort: report the problem and let the caller skip this category
        print(e)
        return []
    if links:
        print('分类已完毕 %s' % (url))
    return links
 
 
if __name__ == "__main__":
    # One listing URL per category (ids 1 through 8, first page of each)
    typeLinks = [
        'http://www.biqugecom.com/list/%s-1.html' % (str(category))
        for category in range(1, 9)
    ]
    for typeLink in typeLinks:
        # Every novel linked from the category page is scraped in turn
        for rootLink in scrapyRootLink(typeLink):
            ScrapyOne(rootLink).main()

程序运行结果：

分类已完毕 http://www.biqugecom.com/list/1-1.html
存储成功   第一章 小生来也
存储成功   第二章 过河拆桥
存储成功   第三章 仙子落难
存储成功   第四章 偷鸡的贼
存储成功   第五章 后会有期
存储成功   第六章 有人带路
存储成功   第七章 有辱斯文
存储成功   第八章 撒回野吧
存储成功   第九章 同道中人
存储成功   第十章 什么意思
存储成功   第十一章 令师何人
存储成功   第十二章 此道漫漫
存储成功   第十三章 诸位慢行
存储成功   第十四章 御风而行
存储成功   第十五章 怪物凶猛
存储成功   第十六章 也很吓人
存储成功   第十七章 做梦来着
存储成功   第十八章 很不简单
存储成功   第十九章 天刑符经
存储成功   第二十章 灵山有路
存储成功   第二十一章 故人寻来
存储成功   第二十二章 图的个啥
存储成功   第二十三章 灵山仙缘
......

查看爬取的小说：