# 代码如下，认真读一下还是很容易看懂的。
import requests
import time
import multiprocessing
from lxml import etree
import re
import pymysql
import random
# Module-wide MySQL connection, opened when the module is loaded.
# autocommit is enabled on it.
sql = pymysql.connect(
    host='localhost',
    password='123456',
    user='root',
    db='cg',
    autocommit=True,
)

# Request headers sent with every HTTP call (desktop Chrome User-Agent).
headers = {
    'User-Agent': 'Mozilla/5.0 (Windows NT 10.0; WOW64) AppleWebKit/537.36 '
                  '(KHTML, like Gecko) Chrome/78.0.3904.70 Safari/537.36',
}

# Candidate proxy endpoints in "host:port" form.
ip = [
    '101.231.104.82:80',
    '218.17.46.17:808',
    '116.114.19.211:443',
    '39.137.69.6:8080',
    '118.25.10.200:8080',
    '110.167.30.50:8060',
    '39.137.107.98:80',
    '39.137.69.8:80',
    '221.180.170.104:8080',
    '39.137.69.10:80',
    '39.137.69.9:80',
]

# Proxy mapping handed to requests. One endpoint is picked at random here,
# once, when the module is loaded; it is registered for the 'http' scheme only.
t = {
    'http': random.choice(ip),
}
def save(name, content, daoyan, zhuyan, diqu, years, image, m3, type1):
    """Insert one scraped movie record into the ``movies`` table.

    Args:
        name: Movie title.
        content: Synopsis text.
        daoyan: Director.
        zhuyan: Leading actors.
        diqu: Region.
        years: Release year/date text.
        image: Poster image URL.
        m3: m3u8 stream URL.
        type1: Movie type/genre.

    The values are scraped from web pages, so they are bound as query
    parameters instead of being interpolated into the SQL text: a quote
    inside a title or synopsis can no longer break (or inject into) the
    statement. A value of ``None`` is stored as SQL NULL.
    """
    # Column order follows the original statement:
    # id (Null), name, image, m3, content, type1, daoyan, zhuyan, diqu, years, now()
    query = 'insert into movies values(Null,%s,%s,%s,%s,%s,%s,%s,%s,%s,now())'
    values = (name, image, m3, content, type1, daoyan, zhuyan, diqu, years)
    # The cursor is closed as soon as the insert has been issued.
    with sql.cursor() as cur:
        cur.execute(query, values)
    print('保存完成', name)
def get_page(url=None, page=None):
    """Scrape one listing page and pass every movie link on it to get_detail.

    Args:
        url: URL of the listing page. When omitted, the listing page that
            was previously hard-coded here (page 10) is used, so a bare
            ``get_page()`` call keeps working.
        page: Page number of ``url``; only used to label the error message.
            The script's entry point calls ``get_page(url, x)``, which this
            signature now accepts.

    Returns:
        None. Errors are reported on stdout instead of being raised.
    """
    if url is None:
        url = 'http://www.86cg.com/index.php?s=vod-type-id-1-mcid--lz--area--year--letter--order-addtime-picm-1-p-10.html'
    try:
        # One-second pause before each listing request
        # (presumably to go easy on the site).
        time.sleep(1)
        response = requests.get(url=url, headers=headers, proxies=t)
        tree = etree.HTML(response.text)
        for listing in tree.xpath('//div[@id="contents"]/ul'):
            hrefs = listing.xpath('./li/a/@href')  # relative detail-page links
            print(hrefs)
            for href in hrefs:
                # The hrefs are site-relative; prefix the host to get the detail URL.
                get_detail('http://www.86cg.com' + href)
    except Exception as e:
        # Worker boundary: never raise, but do not hide the failure either.
        label = page if page is not None else url
        print('listing page', label, 'failed:', e)
# NOTE(review): as written this call sits at module level, so it runs on
# import and before get_detail is defined further down; any NameError that
# causes is caught by get_page's own exception handler. Confirm whether this
# is a leftover debug call or a line that lost its indentation.
get_page()
# Detail-page scraping: read one movie page, then resolve its m3u8 address.
def get_detail(url):
    """Scrape one movie detail page and print the extracted record.

    The page's first play link is resolved to a player-page URL, which is
    passed to get_m3u8 so each movie gets its own stream address.

    Args:
        url: Absolute URL of the movie detail page.

    Returns:
        None. Any exception is caught and printed.
    """
    base = 'http://www.86cg.com'
    try:
        response = requests.get(url=url, headers=headers, proxies=t)
        page = etree.HTML(response.text)
        # More than one quality may be listed; always use the first link.
        play_links = page.xpath('.//div[@class="details-con2-body"]/div/ul/li[1]/a/@href')
        if not play_links:
            print('no play link found on', url)
            return
        play_url = base + play_links[0].strip()

        for block in page.xpath('.//div[@class="details-left fl"]'):
            images = block.xpath('.//div[@class="details-con1"]/a/img/@src')  # poster image
            titles = block.xpath('.//div[@class="art-title"]/h1/text()')  # movie title
            types = block.xpath('.//div[@class="synopsis"]/p[1]/a/text()')  # movie type
            # NOTE(review): "storng" is kept exactly as in the original XPath;
            # confirm whether the site really uses that tag or "strong" was meant.
            # If it matches nothing, the zip below yields no rows and nothing is printed.
            directors = block.xpath('.//div[@class="synopsis"]/p[1]/storng/a/text()')  # director
            actors = block.xpath('.//div[@class="synopsis"]/p[2]/storng/a/text()')  # leading actors
            regions = block.xpath('.//div[@class="synopsis"]/p[3]/a/text()')  # region
            release = block.xpath('.//div[@class="synopsis"]/p[3]/text()')  # release time
            synopses = block.xpath('.//div[@class="synopsis"]/p[4]/text()')  # synopsis

            # zip stops at the shortest list, so a record is only produced
            # when every field was found.
            rows = zip(images, titles, types, directors, actors, regions, release, synopses)
            for src, title, kind, director, cast, region, year, synopsis in rows:
                image = base + src
                name = str(title)
                type1 = str(kind)
                daoyan = str(director)
                zhuyan = str(cast)
                diqu = str(region)
                years = str(year)
                content = str(synopsis)
                m = get_m3u8(play_url)  # m3u8 address, or None on failure
                # NOTE(review): the original has the save(...) call commented
                # out and only prints the record; re-enable persistence here
                # if that was only for debugging.
                print(name, content, daoyan, zhuyan, diqu, years, image, m, type1)
    except Exception as e:
        print(e)


def _extract_m3u8(text):
    """Return the value following '{"url":' in a player page, or None if absent."""
    matches = re.findall('.*.{"url":.*', text)
    if not matches:
        return None
    line = matches[0]
    start = line.find(':')
    end = line.find(',')
    # Skip the colon and opening quote; drop the closing quote before the comma.
    return line[start + 2:end - 1]


# Extract the m3u8 address from the player page's script.
def get_m3u8(m3='http://www.86cg.com/online/79889-1-2.html'):
    """Fetch a player page and return the m3u8 address embedded in it.

    Args:
        m3: URL of the player page. Defaults to the page that was previously
            hard-coded here, so ``get_m3u8()`` with no argument still works.

    Returns:
        The extracted address as a string, or None when the request fails or
        the page contains no '{"url":' entry (the reason is printed).
    """
    try:
        response = requests.get(url=m3, headers=headers, proxies=t)
        address = _extract_m3u8(response.text)
        if address is None:
            print('no m3u8 url found in', m3)
        return address
    except Exception as e:
        print(e)
        return None
#
if __name__ == '__main__':
    # Entry point: scrape listing pages 1..1040 with a pool of 10 worker
    # processes, then report the elapsed time and the pages whose task failed.
    s_time = time.time()
    pool = multiprocessing.Pool(10)
    pending = []  # (page number, AsyncResult) for every submitted page
    for x in range(1, 1041):
        url = f'http://www.86cg.com/index.php?s=vod-type-id-1-mcid--lz--area--year--letter--order-addtime-picm-1-p-{x}.html'
        pending.append((x, pool.apply_async(get_page, (url, x))))
    pool.close()
    pool.join()

    # An exception raised in a worker is only re-raised by AsyncResult.get();
    # without this the error list could never be filled. All tasks are
    # finished after join(), so get() does not block.
    error = []
    for x, result in pending:
        try:
            result.get()
        except Exception:
            error.append(x)

    print('总共用时', time.time() - s_time, 's')
    print('出错页数', len(error))
    print(error)