前往小程序,Get更优阅读体验!
立即前往
首页
学习
活动
专区
工具
TVP
发布
社区首页 >专栏 >爬虫案例

爬虫案例

作者头像
懿曲折扇情
发布2022-08-24 12:30:13
1.1K0
发布2022-08-24 12:30:13
举报
文章被收录于专栏:建帅技术分享

一、壁纸网站

代码语言:python
复制
# coding=utf-8
"""
    作者:gaojs
    功能:
    新增功能:
    日期:2022/3/25 19:35
"""
import os.path

import requests
import parsel


def sanitize_title(title):
    """Replace characters that are illegal in file names with '_'."""
    return ''.join('_' if ch in '\\/:*?"<>|\n' else ch for ch in title)


def get_address():
    """
    Crawl netbian.com 1920x1080 wallpaper list pages 2-10 and save
    every full-size image under photo/.
    :return: None
    """
    dirname = 'photo/'

    if not os.path.exists(dirname):
        os.mkdir(dirname)

    # headers are loop-invariant; build them once
    headers = {
        'User-Agent': 'Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/99.0.4844.74 Safari/537.36 Edg/99.0.1150.46'
    }

    for page in range(2, 11):
        print(f'=====================正在爬取第{page}页内容========================')
        url = f'http://www.netbian.com/1920x1080/index_{page}.htm'

        res = requests.get(url, headers=headers)
        # the site serves GBK; let requests auto-detect the charset
        res.encoding = res.apparent_encoding

        selector = parsel.Selector(res.text)

        url_lis = selector.css('.list li')
        for lis in url_lis:
            title = lis.css('b::text').get()
            # list entries without a <b> title are ads -> skip them
            if title:
                list_url = 'http://www.netbian.com' + lis.css('a::attr(href)').get()
                res1 = requests.get(list_url, headers=headers)
                selector1 = parsel.Selector(res1.text)
                img_url = selector1.css('.pic img::attr(src)').get()

                # sanitize the title so open() cannot fail on characters
                # like '/' that are illegal in file names
                img_content = requests.get(url=img_url).content
                with open(dirname + sanitize_title(title) + '.jpg', 'wb') as f:
                    f.write(img_content)
                    print(title, img_url)


if __name__ == '__main__':
    get_address()

二、彼岸壁纸

代码语言:python
复制
# coding=utf-8
"""
    作者:gaojs
    功能:
    新增功能:
    日期:2022/4/2 14:59
"""
import os.path
import re
import requests


# --- netbian.com home-page wallpaper scraper -------------------------------
# Fetches the landing page, extracts (detail-href, title) pairs, then pulls
# every image from each detail page and saves it under photo/.

if not os.path.exists('photo/'):
    os.mkdir('photo/')

url = 'http://www.netbian.com'
# detail pages look like: http://www.netbian.com/desk/26344-1920x1080.htm

headers = {
    'Host': 'www.netbian.com',
    'user-agent': 'Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/98.0.4758.102 Safari/537.36',
    'Upgrade-Insecure-Requests': '1',
    'Cookie': '__yjs_duid=1_4535c561a20964f1ade88776981a0f411648389371877; Hm_lvt_0f461eb489c245a31c209d36e41fcc0f=1648389374,1648986956; Hm_lpvt_0f461eb489c245a31c209d36e41fcc0f=1648986956'
}
rsp = requests.get(url, headers=headers)
rsp.encoding = rsp.apparent_encoding

# capture (href, title) from the anchor markup of the list page
url_list = re.findall('<a href="(.*?)"title="(.*?)" target="_blank"><img src=".*?" alt=".*?" />', rsp.text)

for detail_href, title in url_list:
    new_url = url + detail_href

    rsp1 = requests.get(new_url)
    rsp1.encoding = rsp1.apparent_encoding
    # (img-src, img-title) pairs on the detail page
    img_list = re.findall('<a href=".*?" target="_blank"><img src="(.*?)" alt="(.*?)" title=".*?"></a>', rsp1.text)

    for img_url, img_title in img_list:
        content_data = requests.get(img_url).content

        # strip characters that are illegal in file names so open() can't
        # fail on titles containing '/' etc.
        safe_title = re.sub(r'[\\/:*?"<>|\n]', '_', img_title)
        with open('photo/' + safe_title + '.jpg', 'wb') as f:
            f.write(content_data)
            print(f'***************正在爬取{title}中****************')

三、某手视频

代码语言:python
复制
# coding=utf-8
"""
    作者:gaojs
    功能:
    新增功能:
    日期:2022/4/15 20:13
"""
import json
import os.path
import pprint

import requests


def safe_title(title):
    """Strip characters that are illegal in file names from *title*."""
    import re
    return re.sub(r'[\\/:*?"<>|\n]', '_', title)


def get_page(pcursor):
    """
    Download every short video of one kuaishou author, following the
    pagination cursor.  The original recursed once per page, which can hit
    Python's recursion limit on long profiles; this version iterates.

    :param pcursor: pagination cursor; "" fetches the first page
    :return: None
    """
    path = 'video/'
    if not os.path.exists(path):
        os.mkdir(path)
    # target profile: 'https://www.kuaishou.com/profile/3xhv7zhkfr3rqag'

    url = 'https://www.kuaishou.com/graphql'
    headers = {
        'content-type': 'application/json',
        'Cookie': 'kpf=PC_WEB; kpn=KUAISHOU_VISION; clientid=3; did=web_72314bf978cb158dd7034b2370d2ae70',
        'Host': 'www.kuaishou.com',
        'Origin': 'https://www.kuaishou.com',
        'Referer': 'https://www.kuaishou.com/short-video/3x6v3xmcjsd5cki?authorId=3xhv7zhkfr3rqag&streamSource=profile&area=profilexxnull',
        'User-Agent': 'Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/98.0.4758.102 Safari/537.36',
    }

    while True:
        data = {
            "operationName": "visionProfilePhotoList",
            "query": "query visionProfilePhotoList($pcursor: String, $userId: String, $page: String, $webPageArea: String) {\n  visionProfilePhotoList(pcursor: $pcursor, userId: $userId, page: $page, webPageArea: $webPageArea) {\n    result\n    llsid\n    webPageArea\n    feeds {\n      type\n      author {\n        id\n        name\n        following\n        headerUrl\n        headerUrls {\n          cdn\n          url\n          __typename\n        }\n        __typename\n      }\n      tags {\n        type\n        name\n        __typename\n      }\n      photo {\n        id\n        duration\n        caption\n        likeCount\n        realLikeCount\n        coverUrl\n        coverUrls {\n          cdn\n          url\n          __typename\n        }\n        photoUrls {\n          cdn\n          url\n          __typename\n        }\n        photoUrl\n        liked\n        timestamp\n        expTag\n        animatedCoverUrl\n        stereoType\n        videoRatio\n        profileUserTopPhoto\n        __typename\n      }\n      canAddComment\n      currentPcursor\n      llsid\n      status\n      __typename\n    }\n    hostName\n    pcursor\n    __typename\n  }\n}\n",
            "variables": {"userId": "3xhv7zhkfr3rqag", "pcursor": pcursor, "page": "detail", "webPageArea": "profilexxnull"}
        }
        rsp = requests.post(url=url, json=data, headers=headers)

        json_data = rsp.json()
        feeds = json_data['data']['visionProfilePhotoList']['feeds']
        pcursor = json_data['data']['visionProfilePhotoList']['pcursor']

        for key in feeds:
            # video caption (used as the file name) and direct video url
            title = key['photo']['caption']
            new_url = key['photo']['photoUrl']
            content_data = requests.get(url=new_url).content
            # sanitize the caption: it often contains '/' and other
            # characters that are illegal in file names
            with open(f'video/{safe_title(title)}.mp4', mode='wb') as f:
                f.write(content_data)
                print(f'=======================正在下载标题为 {title} 的快手短视频==========================')

        # the server signals the last page with the sentinel "no_more"
        if pcursor == "no_more":
            break


if __name__ == '__main__':
    get_page("")

四、拉勾数据

代码语言:python
复制
# coding=utf-8
"""
    作者:gaojs
    功能:
    新增功能:
    日期:2022/4/3 17:58
"""
import csv
import json
import os.path
import pprint

import requests
import re


# --- lagou.com job-listing scraper -----------------------------------------
# Writes one CSV summary row per position plus one .txt file with the full
# job description.

if not os.path.exists('info/'):
    os.makedirs('info/')

f = open('info/招聘.csv', encoding='utf-8', mode='a', newline='')
csv_writer = csv.DictWriter(f, fieldnames=[
    '职位名字',
    '公司名字',
    '工作城市',
    '学历要求',
    '经验要求',
    '薪资要求',
    '公司地址',
    '详情页'
])
# write the header row once
csv_writer.writeheader()

for page in range(1, 11):
    url = f'https://www.lagou.com/wn/jobs?pn={page}&fromSearch=true&kd=python'
    headers = {
        'Host': 'www.lagou.com',
        'Referer': 'https://www.lagou.com/utrack/trackMid.html?f=https%3A%2F%2Fwww.lagou.com%2Fwn%2Fjobs%3Fpn%3D2%26fromSearch%3Dtrue%26kd%3Dpython&t=1648984113&_ti=1',
        'Cookie': 'user_trace_token=20211122110451-60eec88a-fbaf-47fd-9a53-188f3632144b; Hm_lvt_4233e74dff0ae5bd0a3d81c6ccf756e6=1637550277; _ga=GA1.2.1219095688.1637550277; LGUID=20211122110452-94ffa347-2c46-4c2d-8429-b83e30e86693; RECOMMEND_TIP=true; __lg_stoken__=9ec31e7a3301bab4f215bd5f80c8af0ab0dc2b8ce81af654fe848cf33ad7c4f33d0748020b30281d56a28a756342ce5d42e6c218bcfd56dbf764c51686741cbaf14de987ef24; JSESSIONID=ABAAABAABEIABCIA45B6C458598FF70789BDFD5A4574786; WEBTJ-ID=20220403173842-17feeca7ea0402-090b1b6ee61841-a3e3164-3686400-17feeca7ea15f1; sensorsdata2015session=%7B%7D; X_HTTP_TOKEN=1ca92d1d8ffe4ecb3114898461b10fa2c7054519c6; X_MIDDLE_TOKEN=3e27b9a5a69f9fa78d5d2fe99174c9c5; sensorsdata2015jssdkcross=%7B%22distinct_id%22%3A%229659966%22%2C%22%24device_id%22%3A%2217d459f2858540-02719bae0efae1-4343363-2073600-17d459f2859704%22%2C%22props%22%3A%7B%22%24latest_traffic_source_type%22%3A%22%E7%9B%B4%E6%8E%A5%E6%B5%81%E9%87%8F%22%2C%22%24latest_search_keyword%22%3A%22%E6%9C%AA%E5%8F%96%E5%88%B0%E5%80%BC_%E7%9B%B4%E6%8E%A5%E6%89%93%E5%BC%80%22%2C%22%24latest_referrer%22%3A%22%22%2C%22%24os%22%3A%22Windows%22%2C%22%24browser%22%3A%22Chrome%22%2C%22%24browser_version%22%3A%2298.0.4758.102%22%7D%2C%22first_id%22%3A%2217d459f2858540-02719bae0efae1-4343363-2073600-17d459f2859704%22%7D',
        'User-Agent': 'Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/98.0.4758.102 Safari/537.36'
    }

    rsp = requests.get(url=url, headers=headers)
    print(rsp.status_code)
    # the page embeds its data as JSON in a <script id="__NEXT_DATA__"> tag
    html_data = re.findall('<script id="__NEXT_DATA__" type="application/json">(.*?)</script>', rsp.text)[0]

    json_data = json.loads(html_data)
    result = json_data['props']['pageProps']['initData']['content']['positionResult']['result']
    for index in result:
        # job description with HTML line breaks stripped
        job_index = index['positionDetail'].replace('<br />', '').replace('<br>', '')
        href = f'https://www.lagou.com/wn/jobs{index["positionId"]}.html'
        dict1 = {
            '职位名字': index['positionName'],
            '公司名字': index['companyFullName'],
            '工作城市': index['city'],
            '学历要求': index['education'],
            '经验要求': index['workYear'],
            # BUG FIX: the original duplicated index['workYear'] here;
            # 'salary' is the lagou field for pay — TODO confirm key name
            '薪资要求': index['salary'],
            '公司地址': index['positionAddress'],
            '详情页': href
        }
        csv_writer.writerow(dict1)
        title = index['positionName'] + index['companyFullName']
        new_title = re.sub(r'[\/?:"<>|]', '', title)
        # distinct handle: the original reused `f`, shadowing the open CSV file
        with open('info/' + new_title + '.txt', 'w', encoding='utf-8') as detail_file:
            detail_file.write(job_index)
        print(dict1)

# close the CSV so buffered rows are flushed to disk
f.close()

五、王者荣耀英雄皮肤高清壁纸

代码语言:python
复制
# coding=utf-8
"""
    作者:gaojs
    功能:
    新增功能:
    日期:2022/4/2 13:05
"""

import requests
import os
import re


# --- 王者荣耀 hero-skin wallpaper scraper ----------------------------------
# Reads the hero list JSON, then downloads every skin image per hero.

url = 'https://pvp.qq.com/web201605/js/herolist.json'
headers = {
    'User-Agent': 'Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/99.0.4844.74 Safari/537.36 Edg/99.0.1150.55'
}
rsp = requests.get(url, headers=headers)
print(rsp.status_code)

# BUG FIX: the script writes into photo/ but never created the directory,
# so the first open() used to raise FileNotFoundError
if not os.path.exists('photo/'):
    os.mkdir('photo/')

for index in rsp.json():
    # hero display name and numeric id
    hero_name = index['cname']
    hero_id = index['ename']

    index_url = f'https://pvp.qq.com/web201605/herodetail/{hero_id}.shtml'
    rsp1 = requests.get(url=index_url, headers=headers)
    # detail pages are GBK-encoded; auto-detect the charset
    rsp1.encoding = rsp1.apparent_encoding

    title_list = re.findall('<ul class="pic-pf-list pic-pf-list3" data-imgname="(.*?)">', rsp1.text)[0]
    # skin names carry '&<digits>' suffixes; strip them, then split on '|'
    title_list = re.sub(r'&\d+', '', title_list).split('|')

    # skin images are numbered from 1 on the CDN
    for num, img_title in enumerate(title_list, start=1):
        img_url = f'https://game.gtimg.cn/images/yxzj/img201606/skin/hero-info/{hero_id}/{hero_id}-bigskin-{num}.jpg'

        img_data = requests.get(url=img_url, headers=headers).content
        with open('photo/' + img_title + '.jpg', 'wb') as f:
            print(f'=====================正在爬取{hero_name}的皮肤========================')
            f.write(img_data)

六、美图网站

代码语言:python
复制
# coding=utf-8
"""
    作者:gaojs
    功能:
    新增功能:
    日期:2022/3/26 12:17
"""
import os.path
from time import sleep

import requests
import re


# --- vmgirls.com photo-set scraper -----------------------------------------

dirname = 'photo/'
if not os.path.exists(dirname):
    os.mkdir(dirname)

url = 'https://www.vmgirls.com/17081.html'

headers = {
    'User-Agent': 'Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/99.0.4844.74 Safari/537.36 Edg/99.0.1150.46'
}
res = requests.get(url, headers=headers)
print(res.status_code)

# capture only the href group of each image anchor
url_list = re.findall('<a href="(.*?)" alt=".*?" title=".*?">', res.text)
print(url_list)

for urls in url_list:
    # BUG FIX: the URL basename already ends in '.jpeg'; the original
    # appended a second '.jpeg', saving files as 'x.jpeg.jpeg'
    name = os.path.splitext(urls.split('/')[-1])[0]
    # hrefs are protocol-relative ('//...') — prepend the scheme
    new_url = 'https:' + urls
    res_content = requests.get(url=new_url, headers=headers).content
    sleep(2)  # be polite to the server between downloads
    with open(dirname + name + '.jpeg', mode='wb') as f:
        f.write(res_content)

七、表情包

代码语言:python
复制
# coding=utf-8
"""
    作者:gaojs
    功能:
    新增功能:
    日期:2022/3/25 17:35
"""

import requests
import re


def safe_filename(name):
    """Replace characters that are illegal in Windows file names with '_'."""
    return re.sub(r'[\\/:*?;"<>|\n]', '_', name)


def download_photo(name, url):
    """
    Download one sticker and save it under photo/ as *name* plus the
    extension taken from the URL.

    :param name: sticker title (sanitized before use as a file name)
    :param url: direct image/gif URL
    """
    import os
    # BUG FIX: the script never creates photo/, so the first call used to
    # crash with FileNotFoundError
    os.makedirs('photo/', exist_ok=True)
    res = requests.get(url)
    print(res.status_code)
    suffix = url.split('.')[-1]
    with open('photo/' + safe_filename(name) + '.' + suffix, 'wb') as f:
        f.write(res.content)


"""
http://tva1.sinaimg.cn/large/6a2a7a61ly1gy5fd1pb7ij20iz0iz41l.jpg
http://tva1.sinaimg.cn/large/6a2a7a61ly1gy5fd3od4lg208w08wdvb.gif

https://www.fabiaoqing.com/bqb/lists/page/3.html
"""


def download_page(url):
    """Fetch one fabiaoqing.com list page and download every sticker on it."""
    pattern = '<img class="ui image lazy" data-original="(.*?)" src="/Public/lazyload/img/transparent.gif" title="(.*?)" alt="(.*?)" style="max-height:188;margin: 0 auto"/>'
    response = requests.get(url)
    matches = re.findall(pattern, response.text)
    print(matches)
    # each match is (image-src, title, alt); save under the title
    for entry in matches:
        print(entry)
        img_src, img_title = entry[0], entry[1]
        download_photo(img_title, img_src)


def download_all_page():
    """Walk list pages 1-49 (range(1, 50) excludes 50) and download each."""
    for page in range(1, 50):
        pages = f'https://www.fabiaoqing.com/biaoqing/lists/page/{page}.html'
        download_page(pages)


# guard the entry point so importing this module does not start crawling
if __name__ == '__main__':
    download_all_page()

八、酷狗music

代码语言:python
复制
# coding=utf-8
"""
    作者:gaojs
    功能:
    新增功能:
    日期:2022/4/8 12:59
"""
import os.path
import pprint
import re

import requests


# --- kugou.com chart scraper -----------------------------------------------
# Scrapes the rank page for (hash, album_id) pairs, resolves each to a
# playable URL via the wwwapi endpoint, and saves the mp3 under music/.

if not os.path.exists('music/'):
    os.mkdir('music/')
url = 'https://www.kugou.com/yy/html/rank.html'

headers = {
    'user-agent': 'Mozilla/5.0 (Windows NT 10.0; WOW64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/96.0.4664.93 Safari/537.36'
}

rsp = requests.get(url, headers=headers)
hash_list = re.findall('"Hash":"(.*?)"', rsp.text)
album_list = re.findall('"album_id":(.*?),', rsp.text)

for hash1, album_id in zip(hash_list, album_list):
    index_url = 'https://wwwapi.kugou.com/yy/index.php'
    data = {
        'r': 'play/getdata',
        'hash': hash1,
        'dfid': '34dlds4MjPyk0XgC5n0MobxL',
        'appid': '1014',
        'mid': 'fbcb28bbcbd1758696a1eb4363b645d6',
        'platid': '4',
        'album_id': album_id,
        '_': '1649395118742'
    }
    rsp1 = requests.get(url=index_url, params=data, headers=headers)
    info = rsp1.json()['data']
    audioname = info['audio_name']
    playurl = info['play_url']

    # restricted tracks come back with an empty play_url; skip instead of
    # crashing on requests.get('')
    if not playurl:
        continue

    music_content = requests.get(url=playurl, headers=headers).content
    # BUG FIX: the original wrote to 'music\\<name>.mp3' — on non-Windows
    # systems that creates a file literally named 'music\<name>.mp3' in the
    # working directory instead of saving into music/; also sanitize the
    # song name for file-system safety
    safe_name = re.sub(r'[\\/:*?"<>|\n]', '_', audioname)
    with open('music/' + safe_name + '.mp3', 'wb') as f:
        print(f'*************************正在爬取歌曲{audioname}中***********************')
        f.write(music_content)
本文参与 腾讯云自媒体同步曝光计划,分享自作者个人站点/博客。
如有侵权请联系 cloudcommunity@tencent.com 删除

本文分享自 作者个人站点/博客 前往查看

如有侵权,请联系 cloudcommunity@tencent.com 删除。

本文参与 腾讯云自媒体同步曝光计划  ,欢迎热爱写作的你一起参与!

评论
登录后参与评论
0 条评论
热度
最新
推荐阅读
目录
  • 一、壁纸网站
  • 二、彼岸壁纸
  • 三、某手视频
  • 四、拉勾数据
  • 五、王者荣耀英雄皮肤高清壁纸
  • 六、美图网站
  • 七、表情包
  • 八、酷狗music
领券
问题归档专栏文章快讯文章归档关键词归档开发者手册归档开发者手册 Section 归档