前往小程序,Get更优阅读体验!
立即前往
发布
社区首页 >专栏 >批量爬取淘宝数据

批量爬取淘宝数据

原创
作者头像
码农GT038527
发布2024-11-15 12:51:29
发布2024-11-15 12:51:29
15000
代码可运行
举报
文章被收录于专栏:练手小项目
运行总次数:0
代码可运行

简介

此代码用于批量爬取淘宝 iPad 搜索结果页中的商品图片网址、商品名称、商品价格、购买人数、商品店铺、店铺地址等数据。

获取相关参数

token值:

按 F12 打开开发者工具,切换到“源代码”面板,在 eT = eE(em.token + "&" + eC + "&" + eS + "&" + ep.data) 这一行打上断点,然后在搜索结果页点击“下一页”触发请求;断点命中后查看 em.token 的值,它就是代码中的 em_token(该值与 cookie 中 _m_h5_tk 字段下划线之前的部分相同)。

cookie:

在开发者工具中切换到“网络”面板并刷新页面,用页面上的商品数据作为关键字搜索,定位到对应的数据包;点击进入该数据包后,即可在请求头中找到 cookie。

代码

代码语言:python
代码运行次数:0
复制
"""
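@Time: 2024.11.13 10:05
@Description:
    Batch-crawl the Taobao search-result pages for "ipad" and collect, for each
    item, the image URL, title, price, number of buyers, shop name and shop location.
Note: if the run fails with a KeyError on 'itemsArray', update the token and the cookie.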
"""

"""导入模块"""
import requests
import hashlib
import re
import json
import csv
import time


"""保存到csv文件"""
# Output file shared by the whole script: it is opened once at import time and
# GetContent() appends one row per scraped item through `csv_writer`.
f = open('TaobaoIpadData.csv', mode='w', encoding='utf-8', newline='')
csv_writer = csv.DictWriter(f, fieldnames=[
    '商品图片网址',
    '商品名称',
    '商品价格',
    '购买人数',
    '商品店铺',
    '店铺地址',
])
# Write the column names as the first row; without it the CSV has no header
# and the columns cannot be identified once the file leaves this script.
csv_writer.writeheader()


"""获取sign参数"""
def GetSign(eC, page, totalResults, sourceS, bc_offset, nt_offset, *,
            token="1853a595b6caa656f2a0a0861c87f09e", keyword="ipad"):
    """Build the request payload for one search page and compute its ``sign``.

    The signature is the MD5 hex digest of ``token & eC & appKey & data``,
    where ``data`` is the compact JSON payload returned alongside it.

    Args:
        eC: Timestamp folded into the signature (the caller passes the current
            time in milliseconds and sends the same value as ``t``).
        page: Page number placed in the ``page`` field.
        totalResults: Value for the ``totalResults`` field.
        sourceS: Value for the ``sourceS`` field.
        bc_offset: Value for the ``bcoffset`` field.
        nt_offset: Value for the ``ntoffset`` field.
        token: Signing token. The default is the value this script shipped
            with; it equals the part before the underscore of the
            ``_m_h5_tk`` cookie sent by GetContent, so both have to be
            refreshed together.
        keyword: Search query placed in the ``q`` field.

    Returns:
        tuple[str, str]: ``(sign, ep_data)`` - the hex signature and the JSON
        string that must be sent unchanged as the ``data`` query parameter.
    """
    app_key = "12574478"
    params = {
        "device": "HMA-AL00",
        "isBeta": "false",
        "grayHair": "false",
        "from": "nt_history",
        "brand": "HUAWEI",
        "info": "wifi",
        "index": "4",
        "rainbow": "",
        "schemaType": "auction",
        "elderHome": "false",
        "isEnterSrpSearch": "true",
        "newSearch": "false",
        "network": "wifi",
        "subtype": "",
        "hasPreposeFilter": "false",
        "prepositionVersion": "v2",
        "client_os": "Android",
        "gpsEnabled": "false",
        "searchDoorFrom": "srp",
        "debug_rerankNewOpenCard": "false",
        "homePageVersion": "v7",
        "searchElderHomeOpen": "false",
        "search_action": "initiative",
        "sugg": "_4_1",
        "sversion": "13.6",
        "style": "list",
        "ttid": "600000@taobao_pc_10.7.0",
        "needTabs": "true",
        "areaCode": "CN",
        "vm": "nw",
        "countryNum": "156",
        "m": "pc",
        "page": page,
        "n": 48,
        "q": keyword,
        "qSource": "url",
        "pageSource": "",
        "tab": "all",
        "pageSize": 48,
        "totalPage": 100,
        "totalResults": totalResults,
        "sourceS": sourceS,
        "sort": "_coefp",
        "bcoffset": bc_offset,
        "ntoffset": nt_offset,
        "filterTag": "",
        "service": "",
        "prop": "",
        "loc": "",
        "start_price": None,
        "end_price": None,
        "startPrice": None,
        "endPrice": None,
        "itemIds": None,
        "p4pIds": None,
        "p4pS": None,
        "categoryp": "",
        "myCNA": "X/9CH0MbWU8BASQOA3k5PVBd"
    }

    # Serialize without separator whitespace at both nesting levels. Stripping
    # every space from the finished string instead would also delete spaces
    # inside values (e.g. a multi-word keyword) and silently change the request.
    compact = {"separators": (",", ":")}
    info_data = {
        "appId": "34385",
        # The inner params object travels as a JSON *string* inside the payload.
        "params": json.dumps(params, **compact),
    }
    ep_data = json.dumps(info_data, **compact)

    string = f"{token}&{eC}&{app_key}&{ep_data}"
    sign = hashlib.md5(string.encode('utf-8')).hexdigest()
    return sign, ep_data


"""发送请求"""
def GetContent(page, totalResults, sourceS, bc_offset, nt_offset):
    """Fetch one search-result page, append its items to the CSV, return the next-page state.

    Args:
        page: Page number to request.
        totalResults: Paging value forwarded to GetSign for the request payload.
        sourceS: Paging value forwarded to GetSign for the request payload.
        bc_offset: Paging value forwarded to GetSign for the request payload.
        nt_offset: Paging value forwarded to GetSign for the request payload.

    Returns:
        tuple: ``(totalResults, sourceS, bc_offset, nt_offset)`` read from
        ``data.mainInfo`` of the response, to be passed to the next call.

    Raises:
        requests.RequestException: The HTTP request failed, timed out, or
            returned an error status.
        ValueError: The response body is not a ``mtopjsonpN(...)`` wrapper or
            its content is not valid JSON.
        KeyError: The response has no ``data.itemsArray`` (or lacks another
            expected field); refresh the token and the cookie.
    """
    # NOTE(security): this is a hard-coded login session cookie. Replace it with
    # your own and keep it out of version control. Its `_m_h5_tk` prefix must
    # match the token GetSign uses to sign the request.
    cookie = (
        'cna=oLVpH4QRRj0CATs4031lk0hs; tracknick=tb735478456; _hvn_lgc_=0; wk_cookie2=16cb34916fc6829f464d8b2c13ef73b7; wk_unb=UUphzWfZGRFIPmmKYA%3D%3D; thw=cn; t=bd7d9d70ef17437c32cb67d3e2e4542f; xlly_s=1; 3PcFlag=1731546377689; havana_lgc2_0=eyJoaWQiOjIyMDcwNTk2MzUxNjUsInNnIjoiMzM2MWNjNmU1NDhlMzRlNmE2YzRhMDI4MTU0N2I4OWMiLCJzaXRlIjowLCJ0b2tlbiI6IjFLVHFhMHVfWTAtLUo5d25tRnpaaTRnIn0; cookie3_bak=1ac0e58735da89325e109cea5d7336a7; cookie3_bak_exp=1731805584957; sn=; lgc=tb735478456; cancelledSubSites=empty; env_bak=FM%2BgmqK9Zvm2A1DNl%2FIQWnjH5AQ8%2BtieK%2FYhollQr2lz; dnk=tb735478456; mtop_partitioned_detect=1; _m_h5_tk=1853a595b6caa656f2a0a0861c87f09e_1731563704579; _m_h5_tk_enc=52c4ef72f7db30af8725119adc640888; cookie2=2adfc396990927a586cf2aa38b81f9ab; _tb_token_=e18b3571e4b1b; _samesite_flag_=true; sgcookie=E100Nc0OwXeI4bcFUPttDzBRkO7HHki9yuWVM3X6ZceMoY7%2BmRq38jisRPIRnj5Ra9eo1Ewhchd226qhMxQ%2BOZsRiTOwS%2FfzZqOsp5J6KjTiqzI%3D; havana_lgc_exp=1762660477612; unb=2207059635165; uc1=cookie16=UIHiLt3xCS3yM2h4eKHS9lpEOw%3D%3D&cookie15=WqG3DMC9VAQiUQ%3D%3D&pas=0&cookie14=UoYdXDzyt5YEAQ%3D%3D&cookie21=UtASsssme%2BBq&existShop=false; uc3=nk2=F5RCZs7fvotxxXg%3D&vt3=F8dD37yNyZOLGOP%2Bq%2FE%3D&lg2=VFC%2FuZ9ayeYq2g%3D%3D&id2=UUphzWfZGRFIPmmKYA%3D%3D; csg=9b34b3cd; cookie17=UUphzWfZGRFIPmmKYA%3D%3D; skt=f3c8978d4094b3c9; existShop=MTczMTU1NjQ3Nw%3D%3D; uc4=id4=0%40U2grFng8SqPcqRurzgIC3GmPUrGFUk%2Bi&nk4=0%40FY4JiMEjIcdAPx1iApdrpYuKjJryyg%3D%3D; _cc_=VT5L2FSpdA%3D%3D; _l_g_=Ug%3D%3D; sg=659; _nk_=tb735478456; cookie1=B0SuKW4BZ5%2Bczyn34ImbdgL6qCUsVzFzCxIssgtgRno%3D; sdkSilent=1731642877637; havana_sdkSilent=1731642877637; x5sec=7b22733b32223a2233356339313435343265313336663133222c22617365727665723b33223a22307c43507a6f31626b47454b447878703845476738794d6a41334d4455354e6a4d314d5459314f7a4d7767726274742f7a2f2f2f2f2f41513d3d227d; '
        'tfstk=fojjTVjHsjcXmL44RAeyNGBefOx_L1ZEGA9OKOnqBnKv6a6h1Iygon7W1_CWMmy0oFd1inLGuCR21h6GO8PUYkWcnhxYTWreRcn30hATbhhfHnDvXWPU44kJoS-OgqdZtugJKQpvBFpTF3pwhCK9kFB8FdpZMVnA686JZdkxWcK9e3pwdhdO6he5ebU4NL4XMtwrMKdRcWLGHQiCEg9vNqXvNXhdVaTX9lOSXcIWHTmi7snQ7BQHgeRhwuoXfTp1Ne7bOm11B9b9PGGY36BOigtDAPyDRaQP8E_7McOP0dTpXUMtXTtOkFI6vkhWEaBVWifjCcpc0M86tUwtjF-RYeQdGA2henpOsejatmAAB9jh8HZjtK_Rd3sPqDRBeMgsFem6FBy7FV0GgoztjgSDIqLvELSUF8GjSEpkFBy7FV0MkLvPY8wSGVf..; isg=BDAwZXPkIwaSkP7QcEyid2j9Af6CeRTDROjFIiqB_Ate5dCP0onkU4ZXPe2F9cyb'
    )
    headers = {
        'cookie': cookie,
        'user-agent': 'Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/132.0.0.0 Safari/537.36'
    }

    # Endpoint that serves the search-result data.
    url = 'https://h5api.m.taobao.com/h5/mtop.relationrecommend.wirelessrecommend.recommend/2.0/'

    # The same millisecond timestamp goes into the signature and the `t` parameter.
    eC = int(time.time() * 1000)
    sign, ep_data = GetSign(eC, page, totalResults, sourceS, bc_offset, nt_offset)

    # Query-string parameters of the request.
    query = {
        'jsv': '2.7.4',
        'appKey': '12574478',
        't': eC,
        'sign': sign,
        'api': 'mtop.relationrecommend.wirelessrecommend.recommend',
        'v': 2.0,
        'timeout': 10000,
        'type': 'jsonp',
        'dataType': 'jsonp',
        'callback': 'mtopjsonp8',
        'data': ep_data
    }

    # A timeout keeps a stalled connection from hanging the crawl forever.
    response = requests.get(url=url, params=query, headers=headers, timeout=10)
    response.raise_for_status()

    # The body is JSONP: mtopjsonpN({...}). Capture everything between the
    # opening parenthesis and the last closing one.
    wrapped = re.search(r'mtopjsonp\d+\((.*)\)', response.text, re.S)
    if wrapped is None:
        raise ValueError('response is not in the expected mtopjsonp(...) format')
    json_data = json.loads(wrapped.group(1))

    payload = json_data.get('data') or {}
    if 'itemsArray' not in payload:
        # Surface the server's own status so the failure explains itself.
        raise KeyError(
            f"itemsArray missing from the response (ret={json_data.get('ret')!r}); "
            "refresh the token and the cookie"
        )

    for item in payload['itemsArray']:
        # `procity` is treated as "<province> <city>"; only a two-part value
        # yields a city, anything else is recorded as unknown ('未知').
        location = item['procity'].split(' ')
        city = location[1] if len(location) == 2 else '未知'

        row = {
            '商品图片网址': item['pic_path'],
            # Drop the highlight markup wrapped around matched keywords.
            '商品名称': item['title'].replace('<span class=H>', '').replace('</span>', ''),
            '商品价格': f"¥{item['price']}",
            '购买人数': item['realSales'],
            '商品店铺': item['nick'],
            '店铺地址': city,
        }
        csv_writer.writerow(row)
        print(row)

    # Paging state the next request has to echo back.
    main_info = payload['mainInfo']
    return (
        main_info['totalResults'],
        main_info['sourceS'],
        main_info['bcoffset'],
        main_info['ntoffset'],
    )


"""第一页参数"""
# Paging state for the first request; each GetContent() call returns the
# values to use for the following page.
totalResults = 4800
sourceS = '0'
bc_offset = '""'
nt_offset = '""'


# Crawl pages 1 through 20, threading the paging state from one page to the next.
try:
    for page in range(1, 21):
        print(f'-----------------------------------------------正在采集第{page}页的数据内容-----------------------------------------------')
        totalResults, sourceS, bc_offset, nt_offset = GetContent(page, totalResults, sourceS, bc_offset, nt_offset)
finally:
    # Close the CSV even when a page fails, so rows already collected are
    # flushed to disk and the file handle is released.
    f.close()

爬取的数据

原创声明:本文系作者授权腾讯云开发者社区发表,未经许可,不得转载。

如有侵权,请联系 cloudcommunity@tencent.com 删除。

原创声明:本文系作者授权腾讯云开发者社区发表,未经许可,不得转载。

如有侵权,请联系 cloudcommunity@tencent.com 删除。

评论
登录后参与评论
0 条评论
热度
最新
推荐阅读
目录
  • 简介
  • 获取相关参数
  • 代码
  • 爬取的数据
领券
问题归档专栏文章快讯文章归档关键词归档开发者手册归档开发者手册 Section 归档