selenium自动化带你一键爬取LOL英雄皮肤壁纸

叶庭云

修改于 2021-01-23 13:14:00

6740

修改于 2021-01-23 13:14:00

文章被收录于专栏：Python进阶之路

文章目录

一、网页分析

进入LOL官网，鼠标悬停在游戏资料上，等出现窗口，选择资料库，点击进入。

进入了所有英雄的页面，随便选择一个英雄进行查看

通过浏览器的“检查”（开发者工具）可以发现一个 .js 文件，里面保存了所有英雄的有关信息。可以将其中的内容复制下来保存到本地（下文代码读取的文件名为 hreo_list.txt），再用 json 模块解析。

# -*- coding: UTF-8 -*-
"""
@File    ：read_json.py
@Author  ：叶庭云
@CSDN    ：https://yetingyun.blog.csdn.net/
"""
import json

# Load the hero list that was copied from the site's .js file into a local
# text file. The encoding is explicit so that decoding does not depend on the
# platform default; utf-8-sig accepts UTF-8 both with and without a BOM
# (some Windows editors prepend one, which json would otherwise reject).
with open('hreo_list.txt', encoding='utf-8-sig') as f:
    rep = json.load(f)
# Report how many heroes the file contains.
print(f"有多少个英雄：{len(rep['hero'])}")    # e.g. 有多少个英雄：152
# Each heroId is the value of the `id` query parameter of a hero detail page,
# e.g. https://lol.qq.com/data/info-defail.shtml?id=876
for item in rep['hero']:
    print(f"英雄ID：{item['heroId']}")

依次点击英雄的详情页分析

id 参数的值就是 .js 文件中 heroId 对应的值，因此可以通过该参数构造每个英雄详情页的 URL：
黑暗之女：https://lol.qq.com/data/info-defail.shtml?id=1
狂战士：https://lol.qq.com/data/info-defail.shtml?id=2
正义巨像：https://lol.qq.com/data/info-defail.shtml?id=3
含羞蓓蕾：https://lol.qq.com/data/info-defail.shtml?id=876

一些英雄的皮肤URL是规律的，比如安妮这样：

# big + id + 001.jpg  从001.jpg开始
https://game.gtimg.cn/images/lol/act/img/skin/big1001.jpg
https://game.gtimg.cn/images/lol/act/img/skin/big1002.jpg
https://game.gtimg.cn/images/lol/act/img/skin/big1003.jpg
https://game.gtimg.cn/images/lol/act/img/skin/big1004.jpg
https://game.gtimg.cn/images/lol/act/img/skin/big1005.jpg

但有些英雄的皮肤 URL 并不规律，比如派克，图片序号并不连续：

https://game.gtimg.cn/images/lol/act/img/skin/big555001.jpg   # 第一张
https://game.gtimg.cn/images/lol/act/img/skin/big555009.jpg	  # 第二张
https://game.gtimg.cn/images/lol/act/img/skin/big555016.jpg   # 第三张

这样的情况，构造URL来请求下载图片不方便，我们直接上 selenium 大法

二、selenium爬虫

# -*- coding: UTF-8 -*-
"""
@File    ：selenium_spider.py
@Author  ：叶庭云
@CSDN    ：https://yetingyun.blog.csdn.net/
"""
from selenium import webdriver
import json
import os
from time import sleep
import requests
from fake_useragent import UserAgent
import logging
import random

# Basic logging configuration: INFO level, "<time> - <level>: <message>" format.
logging.basicConfig(level=logging.INFO, format='%(asctime)s - %(levelname)s: %(message)s')
# Source of random User-Agent strings (read via `ua.random` when requesting images).
# NOTE(review): `verify_ssl` / `path` are version-specific fake_useragent arguments —
# confirm they are accepted by the installed release.
ua = UserAgent(verify_ssl=False, path='fake_useragent.json')
# Location of the chromedriver executable.
chrome_driver = r'D:\python\pycharm2020\chromedriver.exe'

options = webdriver.ChromeOptions()
# Uncomment to run headless (no visible browser window).
# options.add_argument("--headless")
# Hide the "Chrome is being controlled by automated test software" banner.
options.add_experimental_option('useAutomationExtension', False)
options.add_experimental_option("excludeSwitches", ['enable-automation'])
# Start the shared Chrome session used by every page visit in this script.
# NOTE(review): `executable_path` is a Selenium 3 style argument — confirm against
# the installed Selenium version before upgrading.
browser = webdriver.Chrome(options=options, executable_path=chrome_driver)
# Directory under which one sub-folder per hero is created. The hero name is
# appended directly to this value, so it must end with a path separator.
# A raw string cannot end with a backslash; escaped backslashes are used here
# instead of padding a raw string with a trailing space, which made every
# hero folder name start with a space.
path = 'D:\\python\\pycharm2020\\program\\爬虫\\LOL皮肤\\skin\\'


def create_urls():
    """Load the saved hero list and return what is needed to visit each hero page.

    Reads ``hreo_list.txt`` from the current working directory, parses it as
    JSON and prints the number of entries in its ``hero`` array.

    Returns:
        list[tuple]: one ``(heroId, name)`` pair per hero, in file order. The
        ``heroId`` is the value of the ``id`` query parameter of the detail
        page, e.g. https://lol.qq.com/data/info-defail.shtml?id=876

    Raises:
        OSError: if the file cannot be opened.
        json.JSONDecodeError: if the file content is not valid JSON.
        KeyError: if the ``hero`` array or a hero's ``heroId``/``name`` is missing.
    """
    # Explicit encoding keeps decoding independent of the platform default;
    # utf-8-sig accepts UTF-8 with or without a BOM, which json.load rejects
    # when the file is decoded as plain utf-8.
    with open('hreo_list.txt', encoding='utf-8-sig') as f:
        rep = json.load(f)
    heroes = rep['hero']
    print(f"有多少个英雄：{len(heroes)}")
    return [(item['heroId'], item['name']) for item in heroes]


# Characters that Windows does not allow in file names; stripped from skin names
# (e.g. the "/" in "K/DA") so the image can be saved.
_INVALID_FILENAME_CHARS = str.maketrans('', '', '\\/:*?"<>|')


def _safe_filename(text):
    """Return *text* with characters that are invalid in file names removed."""
    return text.translate(_INVALID_FILENAME_CHARS)


def scrape_skin(items):
    """Download every skin wallpaper of one hero into a folder named after the hero.

    Opens the hero's detail page in the shared ``browser``, collects the skin
    thumbnails from the ``skinNAV`` list, rewrites each thumbnail URL to its
    full-size counterpart and saves the image as ``<skin name>.jpg``.

    Args:
        items: a ``(heroId, name)`` sequence as produced by ``create_urls``.

    A failure on a single image is logged and skipped so that the remaining
    skins of the hero are still downloaded.
    """
    # Imported locally: these names are not part of the file's import section.
    from selenium.common.exceptions import WebDriverException
    from selenium.webdriver.common.by import By

    hero_id, name = items[0], items[1]
    # One random User-Agent per hero, reused for all of that hero's image requests.
    headers = {'User-Agent': ua.random}

    # exist_ok lets an interrupted run be restarted without FileExistsError.
    hero_dir = path + name
    os.makedirs(hero_dir, exist_ok=True)

    url = f'https://lol.qq.com/data/info-defail.shtml?id={hero_id}'
    print(url)
    browser.get(url)
    browser.maximize_window()
    browser.execute_script("scroll(0,1500)")  # scroll down the page
    sleep(2)  # fixed pause before the thumbnails are queried

    # find_elements(By.XPATH, ...) is available in Selenium 3 and 4, whereas
    # find_elements_by_xpath was removed in Selenium 4.3.
    thumbnails = browser.find_elements(By.XPATH, '//*[@id="skinNAV"]/li/a/img')
    for thumbnail in thumbnails:
        try:
            src = thumbnail.get_attribute('src')
            alt = thumbnail.get_attribute('alt')
            if not src or not alt:
                # Without a URL or a skin name there is nothing to save.
                continue
            # The page shows thumbnails (e.g. .../skin/small1000.jpg); the
            # full-size image lives at the same URL with "small" -> "big".
            response = requests.get(src.replace('small', 'big'),
                                    headers=headers, timeout=10)
            # Do not write an HTTP error page to disk as if it were an image.
            response.raise_for_status()
            skin_name = _safe_filename(alt) + '.jpg'
            with open(os.path.join(hero_dir, skin_name), 'wb') as f:
                f.write(response.content)
        except (WebDriverException, requests.RequestException, OSError) as exc:
            # Best effort: report the failure and carry on with the next skin.
            logging.warning('下载失败（%s）：%s', name, exc)
            continue
        logging.info(f'已为您成功下载：{skin_name}')


def main():
    """Scrape the skin wallpapers of every hero listed in the local hero file.

    Visits the heroes one after another, pausing a random 1-3 seconds between
    them, and always shuts the browser down at the end so that no Chrome or
    chromedriver process is left running, even when a page visit fails.
    Because the shared browser session is closed, this is meant to be called
    once per process.
    """
    try:
        # Each item is a (heroId, name) pair.
        for item in create_urls():
            scrape_skin(item)
            # Random delay between heroes to avoid hitting the site too quickly.
            sleep(random.randint(1, 3))
    finally:
        browser.quit()


if __name__ == '__main__':
    main()

运行效果如下：