Python 3 Web Scraping in Practice (2): E-book Titles, Authors, and Summaries

This post targets the allitebooks website, fetching e-book titles, authors, and summaries in bulk and saving the results locally as JSON and CSV files.

The code uses Python's requests module together with two parsing approaches, XPath and bs4, and exports the data locally as JSON and then CSV.

The workflow breaks into 4 steps: 1) send requests; 2) parse the data; 3) save the data; 4) convert the JSON to CSV.

Code: 1) using XPath; 2) using bs4.
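Before the full scripts, here is a minimal, self-contained sketch contrasting the two parsing styles, run against a made-up HTML snippet rather than the real allitebooks markup:

from lxml import etree
from bs4 import BeautifulSoup

html = '<article><h2 class="entry-title"><a>Sample Book</a></h2></article>'  # hypothetical markup

# XPath: navigate the tree by element paths and attribute filters
print(etree.HTML(html).xpath('//h2[@class="entry-title"]//text()')[0])    # Sample Book

# bs4: look elements up with CSS selectors
print(BeautifulSoup(html, 'lxml').select_one('.entry-title').get_text())  # Sample Book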

# 1. Using XPath:
import requests
from lxml import etree
import json
import csv
import time

class BookSpider(object):
    def __init__(self):
        self.base_url = 'http://www.allitebooks.com/page/{}'
        self.headers = {
            'User-Agent': 'Mozilla/5.0 (X11; Linux x86_64) AppleWebKit/537.36 (KHTML, like Gecko) Ubuntu Chromium/73.0.3683.86 Chrome/73.0.3683.86 Safari/537.36'}
        self.data_of_book_dict = []


    # 1. Build the list of page URLs:
    def get_url_list(self):
        url_list = []
        for i in range(1, 10):  # pages 1 through 9
            url = self.base_url.format(i)
            url_list.append(url)
        return url_list

    # 2. Send the request:
    def send_request(self, url):
        data = requests.get(url, headers=self.headers).content.decode()
        return data

    # 3. Parse the data:
    def parse_xpath_data(self, data):
        parse_data = etree.HTML(data)
        # 1. Extract all book entries on the page:
        book_list = parse_data.xpath('//div[@class="main-content-inner clearfix"]/article')
        # print(len(book_list))

        # 2. Extract each book's details:
        for book in book_list:
            book_dict = {}
            # 1. Book title:
            book_dict['book_name'] = book.xpath('.//h2[@class="entry-title"]//text()')[0]  # . : current node, // : any descendant; [0] unwraps the single text node
            #print(book_name)

            # 2. Cover image URL: grab the src attribute of the img element
            book_dict['book_img_url'] = book.xpath('./div[@class="entry-thumbnail hover-thumb"]/a/img/@src')[0]  # ./ makes the path relative to this article node
            #print(book_img_url)

            # 3. Book author:
            book_dict['book_author'] = book.xpath('.//h5[@class="entry-author"]/a/text()')[0]
            #print(book_author)

            # 4. Book summary:
            book_dict['book_info'] = book.xpath('.//div[@class="entry-summary"]/p/text()')[0]  # //text() would also work in place of /p/text()
            #print(book_info)
            self.data_of_book_dict.append(book_dict)


    # 4. Save the data:
    def save_data(self):
        with open('ebook_xpath.json', 'w') as f:  # context manager closes the file
            json.dump(self.data_of_book_dict, f)

    # 5. Run:
    def start(self):
        url_list = self.get_url_list()

        # Request each page in turn:
        for url in url_list:
            print(url)
            data = self.send_request(url)
            self.parse_xpath_data(data)

        self.save_data()

start = time.time()

BookSpider().start()

# Convert the JSON data to CSV: a list of dicts becomes header + rows
# 1. Open the JSON file for reading and create the CSV file:
json_fp = open('ebook_xpath.json', 'r')
csv_fp = open('ebook_xpath.csv', 'w', newline='')  # newline='' prevents blank lines between rows on Windows

# 2. Extract the header row and the table body:
data_list = json.load(json_fp)
sheet_title = data_list[0].keys()  # header: the first dict's keys


sheet_data = []
for data in data_list:
    sheet_data.append(data.values())
# print(sheet_data)

# 3. Create the CSV writer:
writer = csv.writer(csv_fp)

# 4. Write the header row:
writer.writerow(sheet_title)

# 5. Write the data rows:
writer.writerows(sheet_data)

# 6. Close both files:
json_fp.close()
csv_fp.close()


end = time.time()
print('cost time: ', end-start, 's')
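One caveat with the conversion above: writing data.values() rows assumes every dict keeps its keys in the same order. That holds here, since each book_dict is built identically, but csv.DictWriter makes the column mapping explicit. A minimal alternative sketch, reusing the file names from above:

import json
import csv

with open('ebook_xpath.json') as json_fp, open('ebook_xpath.csv', 'w', newline='') as csv_fp:
    data_list = json.load(json_fp)
    writer = csv.DictWriter(csv_fp, fieldnames=list(data_list[0].keys()))
    writer.writeheader()         # header row taken from fieldnames
    writer.writerows(data_list)  # each dict becomes one CSV row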

# 2. Using bs4:
import requests
from bs4 import BeautifulSoup
import json
import csv
import time

class BookSpider(object):
    def __init__(self):
        self.base_url = 'http://www.allitebooks.com/page/{}'
        self.headers = {
            'User-Agent': 'Mozilla/5.0 (X11; Linux x86_64) AppleWebKit/537.36 (KHTML, like Gecko) Ubuntu Chromium/73.0.3683.86 Chrome/73.0.3683.86 Safari/537.36'}
        self.data_of_book_dict = []


    # 1. Build the list of page URLs:
    def get_url_list(self):
        url_list = []
        for i in range(1, 10):  # pages 1 through 9
            url = self.base_url.format(i)
            url_list.append(url)
        return url_list

    # 2. Send the request:
    def send_request(self, url):
        data = requests.get(url, headers=self.headers).content.decode()
        return data

    # 3. Parse the data:
    def parse_bs4_data(self, data):
        bs4_data = BeautifulSoup(data, 'lxml')

        # 1. Extract all book entries on the page:
        book_list = bs4_data.select('article')
        #print(len(book_list))

        # 2. Extract each book's details:
        for book in book_list:
            book_dict = {}
            # 1. Book title:
            book_dict['book_name'] = book.select_one('.entry-title').get_text()


            # 2. Cover image URL: grab the src attribute of the img element
            book_dict['book_img_url'] = book.select_one('.attachment-post-thumbnail').get('src')


            # 3. Book author:
            book_dict['book_author'] = book.select_one('.entry-author').get_text()[3:]  # [3:] drops the leading 'By ' prefix


            # 4. Book summary:
            book_dict['book_info'] = book.select_one('.entry-summary').get_text()

            self.data_of_book_dict.append(book_dict)
            #print(book_dict)



    # 4. Save the data:
    def save_data(self):
        with open('ebook_bs4.json', 'w') as f:  # context manager closes the file
            json.dump(self.data_of_book_dict, f)

    # 5. Run:
    def start(self):
        url_list = self.get_url_list()

        # Request each page in turn:
        for url in url_list:
            print(url)
            data = self.send_request(url)
            self.parse_bs4_data(data)

        self.save_data()

start = time.time()

BookSpider().start()


# Convert the JSON data to CSV: a list of dicts becomes header + rows
# 1. Open the JSON file for reading and create the CSV file:
json_fp = open('ebook_bs4.json', 'r')
csv_fp = open('ebook_bs4.csv', 'w', newline='')  # newline='' prevents blank lines between rows on Windows

# 2. Extract the header row and the table body:
data_list = json.load(json_fp)
sheet_title = data_list[0].keys()  # header: the first dict's keys


sheet_data = []
for data in data_list:
    sheet_data.append(data.values())
# print(sheet_data)

# 3. Create the CSV writer:
writer = csv.writer(csv_fp)

# 4. Write the header row:
writer.writerow(sheet_title)

# 5. Write the data rows:
writer.writerows(sheet_data)

# 6. Close both files:
json_fp.close()
csv_fp.close()

end = time.time()
print('cost time: ', end-start, 's')
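Both versions assume every article node contains all four fields; if a selector misses, select_one returns None (or an XPath result list comes back empty) and the parse loop raises. A minimal defensive helper, as a sketch (safe_select is an illustrative name, not part of the original code):

def safe_select(book, selector, default=''):
    # Return the matched element's text, or a default if the selector misses.
    node = book.select_one(selector)
    return node.get_text() if node else default

# Example use inside parse_bs4_data:
# book_dict['book_name'] = safe_select(book, '.entry-title')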

Originally published on the WeChat public account MiningAlgorithms (gh_d0cc50d1ed34)

Original publication date: 2019-05-23
