首页
学习
活动
专区
工具
TVP
发布
精选内容/技术社群/优惠产品,尽在小程序
立即前往

手把手教你用Python从好看视频开始采集视频

源代码地址

https://github.com/kylin1020/haokan

运行环境

Python 3.6+

Windows/Linux

源代码

# coding: utf-8

import re

import os

import logging

from threading import Thread

import http.cookiejar

from json import loads

from urllib import request, parse, error

from bs4 import BeautifulSoup

from time import time

class Spider:
    """Download short videos listed on Baidu Haokan (sv.baidu.com).

    On construction the spider creates ``log`` and ``video`` directories next
    to this file, configures a logger and builds a cookie-aware URL opener.
    ``run`` then fetches video listings and stores each video as an ``.mp4``
    file in the ``video`` directory.
    """

    # Backslashes and double quotes left in attribute values by the
    # JSON-escaped markup returned from the listing endpoint.
    _ESCAPE_RESIDUE = re.compile(r'(\\)|(")')
    # Path separators, characters Windows forbids in file names, and control
    # characters; video titles are remote data and must not be trusted as paths.
    _UNSAFE_FILENAME_CHARS = re.compile(r'[<>:"/\\|?*\x00-\x1f]')
    # Number of bytes copied per read while streaming a video to disk.
    _CHUNK_SIZE = 64 * 1024

    def __init__(self):
        self.main_url = 'http://sv.baidu.com'
        self.tab_url = 'http://sv.baidu.com/videoui/list/tab'
        self.header = {
            'Referer': 'http://sv.baidu.com/',
            'User-Agent': 'Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/72.0.3626.81 Safari/537.36'
        }
        # abspath() keeps the directories next to the script even when
        # __file__ is a bare relative name.
        base_dir = os.path.dirname(os.path.abspath(__file__))
        self.log_dir = self.mkdir(os.path.join(base_dir, 'log'))
        self.video_dir = self.mkdir(os.path.join(base_dir, 'video'))
        # The logger is built first so build_opener() can report failures.
        self.logger = self.__build_logger()
        self.opener = self.build_opener()

    def mkdir(self, dir):
        """Create directory ``dir`` (and missing parents) and return ``dir``.

        Does nothing if the directory already exists.
        """
        os.makedirs(dir, exist_ok=True)
        return dir

    def __build_logger(self):
        """Return the module logger, writing to ``log/spider.log`` and the console.

        Handlers are attached only once: the logger is shared per module name,
        so attaching on every instantiation would duplicate each log line.
        """
        filename = os.path.join(self.log_dir, 'spider.log')
        logger = logging.getLogger(__name__)
        logger.setLevel(level=logging.INFO)
        if logger.handlers:
            return logger

        # Explicit UTF-8 so file names with non-ASCII titles can always be
        # written regardless of the platform's default encoding.
        handler = logging.FileHandler(filename, encoding='utf-8')
        handler.setLevel(level=logging.INFO)
        formatter = logging.Formatter('%(asctime)s - %(name)s - %(levelname)s - %(message)s')
        handler.setFormatter(formatter)
        logger.addHandler(handler)

        console = logging.StreamHandler()
        console.setLevel(level=logging.INFO)
        logger.addHandler(console)
        return logger

    def build_opener(self):
        """Build a URL opener that carries cookies stored in ``log/cookie.log``.

        If the cookie file exists and loads, it is reused. Otherwise the main
        page is requested once to obtain cookies, which are saved only when
        that request succeeds, so a failed attempt is retried on the next run
        instead of persisting an empty cookie file.
        """
        cookie_name = os.path.join(self.log_dir, 'cookie.log')
        cookie = http.cookiejar.MozillaCookieJar(cookie_name)
        opener = request.build_opener(request.HTTPCookieProcessor(cookie))

        if os.path.isfile(cookie_name):
            try:
                cookie.load(cookie_name, ignore_discard=True, ignore_expires=True)
            except (http.cookiejar.LoadError, OSError) as exc:
                self.logger.warning('Could not load %s, requesting new cookies: %s', cookie_name, exc)
            else:
                return opener

        req = request.Request(self.main_url, headers=self.header)
        try:
            # Only the Set-Cookie headers matter; the body is discarded.
            with opener.open(req):
                pass
        except Exception as exc:
            # Best effort, as before: the opener is still usable without cookies.
            self.logger.warning('Could not obtain cookies from %s: %s', self.main_url, exc)
            return opener

        cookie.save(filename=cookie_name, ignore_discard=True, ignore_expires=True)
        return opener

    def urlopen(self, url, data=None, is_redecode=False, is_format=True):
        """Fetch ``url`` through the cookie-aware opener.

        Args:
            url: Address to request.
            data: Optional mapping; when given it is form-encoded and sent as
                the request body.
            is_redecode: When true, the fetched document is serialised, passed
                through ``redecode`` and parsed again, and a ``BeautifulSoup``
                object is returned.
            is_format: When true, the response is parsed with the ``lxml``
                parser.

        Returns:
            A ``BeautifulSoup`` object, or, when both flags are false, the
            open HTTP response, which the caller must close.
        """
        if data is not None:
            data = parse.urlencode(data).encode(encoding='UTF-8')
        req = request.Request(url, data=data, headers=self.header)

        if not (is_format or is_redecode):
            return self.opener.open(req)

        # The body is consumed completely here, so the connection can be
        # released before returning.
        with self.opener.open(req) as raw:
            if is_format:
                page = BeautifulSoup(raw, 'lxml')
            else:
                page = raw.read().decode('utf-8', errors='replace')

        if is_redecode:
            # Turn backslash escapes into real characters, then parse again.
            page = BeautifulSoup(self.redecode(page), 'lxml')
        return page

    def get_index(self):
        """Return a ``{category name: link}`` dict read from the main page.

        Categories are the ``<li>`` elements carrying a ``tid`` attribute;
        entries without a link are skipped.
        """
        html = self.urlopen(self.main_url)
        url_list = {}
        for entry in html.find_all(name='li', attrs={'tid': True}):
            link = entry.find('a')
            if link is None or not link.get('href'):
                continue
            name = link.string
            if name is None:
                # .string is None when the link wraps nested markup.
                name = link.get_text(strip=True)
            url_list[name] = link['href']
        return url_list

    def redecode(self, content):
        """Convert backslash escapes such as ``\\uXXXX`` in ``content`` to characters.

        ``content`` may be text, UTF-8 bytes, or any object whose ``str()`` is
        the text to convert (for example a parsed document). Characters that
        are already non-ASCII are preserved: they are turned into escapes by
        ``backslashreplace`` (or kept as Latin-1 bytes) before decoding,
        instead of being mangled by a UTF-8 round trip.
        """
        if isinstance(content, bytes):
            content = content.decode('utf-8')
        return str(content).encode('latin-1', 'backslashreplace').decode('unicode_escape')

    def expand(self, item, page=1):
        """Fetch one listing for category ``item`` and return its videos.

        Args:
            item: Category identifier sent as the ``subTab`` parameter.
            page: Number used to build the JSONP callback name sent as ``cb``.

        Returns:
            A list of ``{'title': ..., 'url': ...}`` dicts; entries without a
            video address are skipped.
        """
        query = parse.urlencode({
            'source': 'wise-channel',
            'pd': '',
            'subTab': item,
            'direction': 'down',
            'refreshType': 1,
            # Raw value: urlencode() does the percent-encoding. Passing an
            # already-encoded string here would encode it twice.
            'ua': self.header.get('User-Agent', ''),
            'bt': '1549533482',
            'caller': 'bdwise',
            '_': int(time()),
            'cb': 'jsonp%d' % (page),
        })
        url = self.tab_url + '?' + query
        respone = self.urlopen(url=url, is_redecode=True)

        items = []
        for video in respone.find_all('div', attrs={'data-authorid': True}):
            raw_url = video.get('data-vsrc')
            if not raw_url:
                continue
            video_url = self._ESCAPE_RESIDUE.sub('', raw_url)
            title = self._ESCAPE_RESIDUE.sub('', video.get('data-title', ''))
            items.append({'title': title, 'url': video_url})
        return items

    @classmethod
    def _safe_filename(cls, title):
        """Return ``title`` as a single path component that is safe to create."""
        name = cls._UNSAFE_FILENAME_CHARS.sub('_', title).strip()
        return name or 'untitled'

    def _download(self, url, filename):
        """Stream ``url`` to ``filename`` via a temporary ``.part`` file."""
        part = filename + '.part'
        try:
            with self.urlopen(url, is_format=False) as resp, open(part, 'wb') as out:
                for chunk in iter(lambda: resp.read(self._CHUNK_SIZE), b''):
                    out.write(chunk)
            # Rename only after a complete download, so an interrupted
            # transfer never leaves a truncated .mp4 that later runs skip.
            os.replace(part, filename)
        finally:
            if os.path.exists(part):
                os.remove(part)

    def save_video(self, page=1, item='qiongying'):
        """Download every video of one listing into the video directory.

        Args:
            page: Passed through to ``expand``.
            item: Category identifier; defaults to ``'qiongying'``.

        Files that already exist are not downloaded again. A failed download
        is logged and skipped so the remaining videos are still fetched. The
        path of every available file is written to the log.
        """
        for entry in self.expand(item, page):
            name = self._safe_filename(entry['title'])
            filename = os.path.join(self.video_dir, name + '.mp4')
            if not os.path.isfile(filename):
                try:
                    self._download(entry['url'], filename)
                except (error.URLError, OSError, ValueError) as exc:
                    self.logger.error('Failed to download %s: %s', entry['url'], exc)
                    continue
            self.logger.info(filename)

    def run(self, total_page=1, item='qiongying'):
        """Collect videos for ``total_page`` consecutive listings of ``item``."""
        for page in range(1, total_page + 1):
            self.save_video(page, item)

# Example: collect ten listings of videos when executed as a script.
if __name__ == "__main__":
    Spider().run(total_page=10)

Python抓取好看视频简单示例

代码简单易懂,暂未加入多线程多进程采集,也未设置代理。

  • 发表于:
  • 原文链接https://kuaibao.qq.com/s/20190208A0CTQX00?refer=cp_1026
  • 腾讯「腾讯云开发者社区」是腾讯内容开放平台帐号(企鹅号)传播渠道之一,根据《腾讯内容开放平台服务协议》转载发布内容。
  • 如有侵权,请联系 cloudcommunity@tencent.com 删除。

扫码

添加站长 进交流群

领取专属 10元无门槛券

私享最新 技术干货

扫码加入开发者社群
领券