记一次电影网站全站爬取

kirin

发布于 2020-05-09 15:55:14

1.6K0

发布于 2020-05-09 15:55:14

文章被收录于专栏：Kirin博客

代码如下，认真读一下还是很容易看懂的
import requests
import time
import multiprocessing
from lxml import etree
import re
import pymysql
import random
# Module-level MySQL connection used by save(). It is opened as soon as the
# module is executed/imported; autocommit=True means every executed statement
# is committed immediately.
# NOTE(review): host and credentials are hard-coded here — consider loading
# them from configuration or environment variables.
sql=pymysql.connect(host='localhost',password='123456',user='root',db='cg',autocommit=True)

# HTTP headers sent with every request made by this script (a desktop Chrome
# User-Agent string, built from two adjacent string literals).
headers={
    'User-Agent': 'Mozilla/5.0 (Windows NT 10.0; WOW64) AppleWebKit/537.36 '
    '(KHTML, like Gecko) Chrome/78.0.3904.70 Safari/537.36'
}
# Candidate proxy addresses in host:port form; one of them is picked below.
ip=[
    '101.231.104.82:80',
    '218.17.46.17:808',
    '116.114.19.211:443',
    '39.137.69.6:8080',
    '118.25.10.200:8080',
    '110.167.30.50:8060',
    '39.137.107.98:80',
    '39.137.69.8:80',
    '221.180.170.104:8080',
    '39.137.69.10:80',
    '39.137.69.9:80',
]
# Proxy mapping passed as `proxies=` to requests. random.choice runs once,
# when this statement executes, so the same proxy is reused for every request
# made afterwards in this process. Only the 'http' scheme is mapped.
t={
    'http':random.choice(ip)

}

def save(name,content,daoyan,zhuyan,diqu,years,image,m3,type1):
    """Insert one scraped movie as a new row of the ``movies`` table.

    The values are bound as query parameters instead of being formatted into
    the SQL text, so quotes or other special characters in scraped strings can
    neither break the statement nor inject SQL. They are bound in the same
    positional order the statement has always used:
    name, image, m3, content, type1, daoyan, zhuyan, diqu, years.
    The first column is sent as NULL and the last one as ``now()``.

    Args:
        name: Movie title.
        content: Synopsis text.
        daoyan: Director.
        zhuyan: Leading actor(s).
        diqu: Region.
        years: Release date text.
        image: Poster image URL.
        m3: m3u8 address (a ``None`` value is stored as SQL NULL).
        type1: Genre.

    Raises:
        Whatever the database driver raises when the insert fails; errors are
        not caught here.
    """
    query = 'insert into movies values(Null,%s,%s,%s,%s,%s,%s,%s,%s,%s,now())'
    params = (name, image, m3, content, type1, daoyan, zhuyan, diqu, years)
    # The context manager closes the cursor even when execute() raises.
    with sql.cursor() as cur:
        cur.execute(query, params)
    print('保存完成',name)

def get_page(url='http://www.86cg.com/index.php?s=vod-type-id-1-mcid--lz--area--year--letter--order-addtime-picm-1-p-10.html', page=None):
    """Fetch one listing page and process every movie detail link on it.

    Args:
        url: Listing page to fetch. Defaults to the page-10 URL that used to
            be hard-coded, so a bare ``get_page()`` call behaves as before.
        page: Optional page number; used only to label error output.

    Returns:
        None. Each detail URL found is handed to ``get_detail``.

    Failures (network errors, unexpected markup, errors raised while handling
    a detail link) are caught and printed rather than silently discarded, so
    the crawl stays best-effort but problems are visible.
    """
    try:
        # Small delay before each listing request.
        time.sleep(1)
        # A timeout keeps a dead proxy or server from hanging the worker forever.
        resp = requests.get(url=url, headers=headers, proxies=t, timeout=10)
        tree = etree.HTML(resp.text)
        for listing in tree.xpath('//div[@id="contents"]/ul'):
            hrefs = listing.xpath('./li/a/@href')  # site-relative detail-page links
            print(hrefs)
            for href in hrefs:
                get_detail('http://www.86cg.com' + href)
    except Exception as e:
        # Top-level boundary for one page: report and carry on.
        print(page if page is not None else url, e)
# NOTE(review): this call executes every time the module is run or imported.
# It also runs before get_detail is defined further down in the file, so any
# detail link found on the page makes get_page hit a NameError when it calls
# get_detail (the error is handled by get_page's own except clause).
# It looks like a leftover debugging call — confirm whether it is still needed.
get_page()
# Scrape one movie detail page and resolve its m3u8 address.
def get_detail(url):
    """Fetch a movie detail page and print the fields extracted from it.

    The page's first playback link is resolved to an m3u8 address through
    ``get_m3u8``. That playback URL is now actually passed on; previously it
    was computed and then ignored, so every movie was reported with the m3u8
    address of one fixed page.

    Args:
        url: Absolute URL of the detail page.

    Returns:
        None. One line is printed per extracted record, in the order
        name, content, daoyan, zhuyan, diqu, years, image, m3u8, type1.
        Any exception is caught and printed.
    """
    try:
        resp = requests.get(url=url, headers=headers, proxies=t, timeout=10)
        tree = etree.HTML(resp.text)
        play_links = tree.xpath('.//div[@class="details-con2-body"]/div/ul/li[1]/a/@href')
        if not play_links:
            print('no playback link found on', url)
            return
        # More than one link can match (e.g. two quality options); always
        # use the first one.
        play_url = 'http://www.86cg.com' + play_links[0].strip()

        rows = []
        for panel in tree.xpath('.//div[@class="details-left fl"]'):
            images = panel.xpath('.//div[@class="details-con1"]/a/img/@src')  # poster image
            names = panel.xpath('.//div[@class="art-title"]/h1/text()')  # title
            types = panel.xpath('.//div[@class="synopsis"]/p[1]/a/text()')  # genre
            # NOTE(review): "storng" in the next two expressions looks like a
            # typo for "strong"; if so these lists are always empty and no
            # record is ever produced — verify against the live markup.
            directors = panel.xpath('.//div[@class="synopsis"]/p[1]/storng/a/text()')  # director
            actors = panel.xpath('.//div[@class="synopsis"]/p[2]/storng/a/text()')  # leading actors
            regions = panel.xpath('.//div[@class="synopsis"]/p[3]/a/text()')  # region
            release = panel.xpath('.//div[@class="synopsis"]/p[3]/text()')  # release date
            intros = panel.xpath('.//div[@class="synopsis"]/p[4]/text()')  # synopsis
            # zip stops at the shortest list, so a record is only produced
            # when every field matched.
            rows.extend(zip(images, names, types, directors, actors,
                            regions, release, intros))

        if not rows:
            return
        # One request per detail page instead of one per record.
        m = get_m3u8(play_url)
        for image_path, name, type1, daoyan, zhuyan, diqu, years, content in rows:
            image = 'http://www.86cg.com' + image_path
            # Persisting is currently disabled; only the record is printed.
            # save(name,content,daoyan,zhuyan,diqu,years,image,m,type1)
            print(name,content,daoyan,zhuyan,diqu,years,image,m,type1)
    except Exception as e:
        print(e)


# Extract the m3u8 address from the inline script of a playback page.
def get_m3u8(url='http://www.86cg.com/online/79889-1-2.html'):
    """Fetch a playback page and return the m3u8 address embedded in it.

    Args:
        url: Playback page to fetch. Defaults to the page that used to be
            hard-coded, so existing zero-argument calls keep working.

    Returns:
        The text between the first ':' and the first ',' of the first line
        containing ``{"url":``, without the surrounding quote characters, or
        None when the page has no such line or any error occurs (the error is
        printed).
    """
    try:
        resp = requests.get(url=url, headers=headers, proxies=t, timeout=10)
        matches = re.findall('.*.{"url":.*', resp.text)
        if not matches:
            print('no m3u8 entry found on', url)
            return None
        line = matches[0]
        start = line.find(':')
        end = line.find(',')
        # Skip the ':' and the opening quote; stop before the closing quote.
        return line[start+2:end-1]
    except Exception as e:
        print(e)
#
if __name__ == '__main__':
    # Crawl listing pages 1..1040 with a pool of 10 worker processes and
    # report the elapsed time plus the pages whose task failed.
    s_time=time.time()
    pool=multiprocessing.Pool(10)
    error=[]

    def _record_failure(page):
        """Build an error callback that remembers which page failed."""
        def _on_error(exc):
            # Pool callbacks run in the parent process, so appending to the
            # parent's list works (a worker could not modify it).
            error.append(page)
            print(page, exc)
        return _on_error

    for x in range(1,1041):
        url=f'http://www.86cg.com/index.php?s=vod-type-id-1-mcid--lz--area--year--letter--order-addtime-picm-1-p-{x}.html'
        # Without error_callback an exception raised by a task is dropped
        # silently because the AsyncResult is never inspected. Only exceptions
        # that escape get_page reach the callback.
        pool.apply_async(get_page,(url,x),error_callback=_record_failure(x))
    pool.close()
    pool.join()
    print('总共用时',time.time()-s_time,'s')
    print('出错页数',len(error))
    print(error)

本文参与腾讯云自媒体同步曝光计划，分享自作者个人站点/博客。

原始发表：2020/04/15 ，如有侵权请联系 cloudcommunity@tencent.com 删除

本文分享自作者个人站点/博客前往查看

如有侵权，请联系 cloudcommunity@tencent.com 删除。

本文参与腾讯云自媒体同步曝光计划，欢迎热爱写作的你一起参与！

登录后参与评论

0 条评论

热度

记一次电影网站全站爬取

记一次电影网站全站爬取

社区

活动

资源

关于

腾讯云开发者

热门产品

热门推荐

更多推荐