导语:很久没有写爬虫了,几乎快要忘(废)了。在吾爱上看到一篇相关的帖子,于是直接“拿”过来参考,就有了这篇文章:简单写了个爬虫,并套上 GUI、打包成 exe,方便宝爸宝妈们参考使用!
本渣渣号里不知道有多少宝爸宝妈,如有需要请自取:人教版小初高教材下载,可将单本书籍下载成图片,并合成 PDF 电子书。不过电子书本身在线浏览即可,感觉又是搞了个没什么软用的玩意...
网址:https://jc.pep.com.cn/
为了随机更换协议头(User-Agent),写了一个类,并用 classmethod 装饰器修饰方法,这样无需实例化即可通过类名直接调用!
class Ua():
    """Pool of browser User-Agent strings used to vary the request header.

    ``Ua.ulist()`` is a classmethod, so it can be called on the class itself
    without creating an instance.
    """

    # Candidate User-Agent header values. Two entries previously carried a
    # stray label glued to the end ("Firefox 4.0.1", "Chrome 17.0"); those
    # suffixes are removed here.
    UA_LIST = (
        'Mozilla/5.0 (Windows NT 6.1; WOW64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/78.0.3904.108 Safari/537.36',
        'Mozilla/5.0 (Windows; U; Windows NT 6.1; en-us) AppleWebKit/534.50 (KHTML, like Gecko) Version/5.1 Safari/534.50',
        'Mozilla/5.0 (Windows NT 6.1; WOW64; rv:6.0) Gecko/20100101 Firefox/6.0',
        'Mozilla/5.0 (Macintosh; Intel Mac OS X 10_7_0) AppleWebKit/535.11 (KHTML, like Gecko) Chrome/17.0.963.56 Safari/535.11',
        'Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/73.0.3683.103 Safari/537.36',
        'Mozilla/5.0 (Windows NT 6.1; WOW64) AppleWebKit/535.1 (KHTML, like Gecko) Chrome/14.0.835.163 Safari/535.1',
    )

    def __init__(self):
        self.ua = None

    @classmethod
    def ulist(cls):
        """Return a randomly chosen User-Agent string.

        The chosen value is also stored on the class as ``ua``.
        """
        cls.ua = random.choice(cls.UA_LIST)
        return cls.ua
图片合成 PDF 电子书的功能也写成了一个类,用到的第三方库为 img2pdf 和 PIL,同样用 classmethod 装饰器修饰方法,可通过类名直接调用!
class Spdf():
    """Merge numbered page images into one PDF using PIL and img2pdf."""

    def __init__(self):
        pass

    @classmethod
    def get_pdf(cls, file, pagenum, output):
        """Combine ``1.jpg`` .. ``<pagenum>.jpg`` found in ``file`` into a PDF.

        Args:
            file: Directory that holds the page images; the PDF is written
                into the same directory.
            pagenum: Number of pages; images must be named ``1.jpg`` through
                ``<pagenum>.jpg``.
            output: Name of the PDF file without the ``.pdf`` extension.
        """
        print('\n开始合并图片成PDF...', end='')
        file_imgs = [os.path.join(file, f'{i}.jpg') for i in range(1, pagenum + 1)]
        # Without this re-save step img2pdf raises an error.
        for img_path in file_imgs:
            with open(img_path, 'rb') as data:
                # Convert RGBA (e.g. PNG data) to RGB to drop the alpha channel.
                # convert() must run while the file is still open because
                # Image.open() reads pixel data lazily.
                rgb_img = Image.open(data).convert('RGB')
            # Overwrite only after the read handle has been closed.
            rgb_img.save(img_path)
        # Build the PDF path with os.path.join, like the image paths, so it is
        # correct whether or not ``file`` ends with a path separator.
        pdf_path = os.path.join(file, f'{output}.pdf')
        with open(pdf_path, 'wb') as f:
            f.write(img2pdf.convert(file_imgs))
        print(f'保存到 {pdf_path}')
关键部分其实没什么难度:需要自行填入书籍 ID 及总页数;图片页码很有规律(从 1 开始连续编号),直接循环递增即可获取到所有电子书教材图片!
class Book():
    """Download all page images of one book from book.pep.com.cn and build a PDF.

    Args:
        id: Book ID as it appears in the book's URL.
        pagenum: Total number of pages (anything ``int()`` accepts).

    Attributes set by ``get_title()``:
        title: Book title, cleaned so it can be used as a file name.
        file: Output directory, ``'<title>/'``.
    """

    def __init__(self, id, pagenum):
        self.id = id
        self.pagenum = int(pagenum)
        self.title = None
        self.file = None

    @staticmethod
    def _safe_name(raw):
        """Return ``raw`` stripped, with characters unsafe in file names replaced by ``_``."""
        return re.sub(r'[\\/:*?"<>|\r\n\t]+', '_', raw.strip())

    def get_title(self):
        """Fetch the book's index page and derive ``self.title`` / ``self.file``.

        Raises:
            requests.HTTPError: If the index page returns an error status.
        """
        url = f'https://book.pep.com.cn/{self.id}/mobile/index.html'
        headers = {'User-Agent': Ua.ulist()}
        response = requests.get(url=url, headers=headers, timeout=6)
        print(response.status_code)
        # Stop early on 4xx/5xx instead of parsing an error page.
        response.raise_for_status()
        html = response.content.decode('utf-8')
        time.sleep(2)
        found = re.search(r'<title>(.+?)</title>', html, re.S)
        cleaned = self._safe_name(found.group(1)) if found else ''
        # The title doubles as directory and PDF name; fall back to the book
        # ID when the page has no usable <title>.
        self.title = cleaned or str(self.id)
        self.file = f'{self.title}/'
        print(f'已获取 ID:{self.id}书籍\n书籍标题:{self.title}')

    def get_book(self):
        """Download pages 1..pagenum one by one, then pause for six seconds."""
        for page in range(1, self.pagenum + 1):
            jpg_url = f'https://book.pep.com.cn/{self.id}/files/mobile/{page}.jpg'
            self.get_jpg(jpg_url, page)
        print(f"ID:{self.id}书籍下载成功!")
        time.sleep(6)

    def get_jpg(self, jpg_url, page):
        """Download one page image to ``<self.file><page>.jpg``.

        Raises:
            requests.HTTPError: If the image request returns an error status,
                so an error page is never saved as a ``.jpg``.
        """
        os.makedirs(self.file, exist_ok=True)
        headers = {'User-Agent': Ua.ulist()}
        response = requests.get(url=jpg_url, headers=headers, timeout=6)
        response.raise_for_status()
        with open(f'{self.file}{page}.jpg', 'wb') as f:
            f.write(response.content)
        print(f"{page}.jpg 图片下载成功!")
        # Pause between requests.
        time.sleep(2)

    def run(self):
        """Run the whole pipeline: title, page images, then PDF."""
        self.get_title()
        self.get_book()
        Spdf.get_pdf(self.file, self.pagenum, self.title)
# -*- coding: utf-8 -*-
#https://jc.pep.com.cn/
import requests
import random
import time
import re
import os
import img2pdf
from PIL import Image
class Ua():
    """Pool of browser User-Agent strings used to vary the request header.

    ``Ua.ulist()`` is a classmethod, so it can be called on the class itself
    without creating an instance.
    """

    # Candidate User-Agent header values. Two entries previously carried a
    # stray label glued to the end ("Firefox 4.0.1", "Chrome 17.0"); those
    # suffixes are removed here.
    UA_LIST = (
        'Mozilla/5.0 (Windows NT 6.1; WOW64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/78.0.3904.108 Safari/537.36',
        'Mozilla/5.0 (Windows; U; Windows NT 6.1; en-us) AppleWebKit/534.50 (KHTML, like Gecko) Version/5.1 Safari/534.50',
        'Mozilla/5.0 (Windows NT 6.1; WOW64; rv:6.0) Gecko/20100101 Firefox/6.0',
        'Mozilla/5.0 (Macintosh; Intel Mac OS X 10_7_0) AppleWebKit/535.11 (KHTML, like Gecko) Chrome/17.0.963.56 Safari/535.11',
        'Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/73.0.3683.103 Safari/537.36',
        'Mozilla/5.0 (Windows NT 6.1; WOW64) AppleWebKit/535.1 (KHTML, like Gecko) Chrome/14.0.835.163 Safari/535.1',
    )

    def __init__(self):
        self.ua = None

    @classmethod
    def ulist(cls):
        """Return a randomly chosen User-Agent string.

        The chosen value is also stored on the class as ``ua``.
        """
        cls.ua = random.choice(cls.UA_LIST)
        return cls.ua
class Spdf():
    """Merge numbered page images into one PDF using PIL and img2pdf."""

    def __init__(self):
        pass

    @classmethod
    def get_pdf(cls, file, pagenum, output):
        """Combine ``1.jpg`` .. ``<pagenum>.jpg`` found in ``file`` into a PDF.

        Args:
            file: Directory that holds the page images; the PDF is written
                into the same directory.
            pagenum: Number of pages; images must be named ``1.jpg`` through
                ``<pagenum>.jpg``.
            output: Name of the PDF file without the ``.pdf`` extension.
        """
        print('\n开始合并图片成PDF...', end='')
        file_imgs = [os.path.join(file, f'{i}.jpg') for i in range(1, pagenum + 1)]
        # Without this re-save step img2pdf raises an error.
        for img_path in file_imgs:
            with open(img_path, 'rb') as data:
                # Convert RGBA (e.g. PNG data) to RGB to drop the alpha channel.
                # convert() must run while the file is still open because
                # Image.open() reads pixel data lazily.
                rgb_img = Image.open(data).convert('RGB')
            # Overwrite only after the read handle has been closed.
            rgb_img.save(img_path)
        # Build the PDF path with os.path.join, like the image paths, so it is
        # correct whether or not ``file`` ends with a path separator.
        pdf_path = os.path.join(file, f'{output}.pdf')
        with open(pdf_path, 'wb') as f:
            f.write(img2pdf.convert(file_imgs))
        print(f'保存到 {pdf_path}')
class Book():
    """Download all page images of one book from book.pep.com.cn and build a PDF.

    Args:
        id: Book ID as it appears in the book's URL.
        pagenum: Total number of pages (anything ``int()`` accepts).

    Attributes set by ``get_title()``:
        title: Book title, cleaned so it can be used as a file name.
        file: Output directory, ``'<title>/'``.
    """

    def __init__(self, id, pagenum):
        self.id = id
        self.pagenum = int(pagenum)
        self.title = None
        self.file = None

    @staticmethod
    def _safe_name(raw):
        """Return ``raw`` stripped, with characters unsafe in file names replaced by ``_``."""
        return re.sub(r'[\\/:*?"<>|\r\n\t]+', '_', raw.strip())

    def get_title(self):
        """Fetch the book's index page and derive ``self.title`` / ``self.file``.

        Raises:
            requests.HTTPError: If the index page returns an error status.
        """
        url = f'https://book.pep.com.cn/{self.id}/mobile/index.html'
        headers = {'User-Agent': Ua.ulist()}
        response = requests.get(url=url, headers=headers, timeout=6)
        print(response.status_code)
        # Stop early on 4xx/5xx instead of parsing an error page.
        response.raise_for_status()
        html = response.content.decode('utf-8')
        time.sleep(2)
        found = re.search(r'<title>(.+?)</title>', html, re.S)
        cleaned = self._safe_name(found.group(1)) if found else ''
        # The title doubles as directory and PDF name; fall back to the book
        # ID when the page has no usable <title>.
        self.title = cleaned or str(self.id)
        self.file = f'{self.title}/'
        print(f'已获取 ID:{self.id}书籍\n书籍标题:{self.title}')

    def get_book(self):
        """Download pages 1..pagenum one by one, then pause for six seconds."""
        for page in range(1, self.pagenum + 1):
            jpg_url = f'https://book.pep.com.cn/{self.id}/files/mobile/{page}.jpg'
            self.get_jpg(jpg_url, page)
        print(f"ID:{self.id}书籍下载成功!")
        time.sleep(6)

    def get_jpg(self, jpg_url, page):
        """Download one page image to ``<self.file><page>.jpg``.

        Raises:
            requests.HTTPError: If the image request returns an error status,
                so an error page is never saved as a ``.jpg``.
        """
        os.makedirs(self.file, exist_ok=True)
        headers = {'User-Agent': Ua.ulist()}
        response = requests.get(url=jpg_url, headers=headers, timeout=6)
        response.raise_for_status()
        with open(f'{self.file}{page}.jpg', 'wb') as f:
            f.write(response.content)
        print(f"{page}.jpg 图片下载成功!")
        # Pause between requests.
        time.sleep(2)

    def run(self):
        """Run the whole pipeline: title, page images, then PDF."""
        self.get_title()
        self.get_book()
        Spdf.get_pdf(self.file, self.pagenum, self.title)
def main():
    """Download the sample book and build its PDF."""
    # Named book_id rather than id so the builtin id() is not shadowed.
    book_id = 1291001103221  # ID segment of the book's URL
    pagenum = 64  # total number of pages to download
    spider = Book(book_id, pagenum)
    spider.run()


if __name__ == '__main__':
    main()
以下为运行效果展示,下载得到的结果可供参考!
简单地写了个 GUI,打包环境为 Win7 64 位,在其他系统上可能存在不兼容的情况,仅供参考学习使用!
请不要用于其他用途!
附exe工具下载地址:
阿里云盘
人教版教材下载工具-by 微...号:Python与SEO https://www.aliyundrive.com/s/JJMahrowgDA 提取码: up18
百度云盘
链接:
https://pan.baidu.com/s/1u8VmLs7-qpGCk28ZjvjSJA?pwd=7847
提取码:
7847
·················END·················
本文分享自 Python与SEO学习 微信公众号,前往查看
如有侵权,请联系 cloudcommunity@tencent.com 删除。
本文参与 腾讯云自媒体同步曝光计划 ,欢迎热爱写作的你一起参与!