This script batch-scrapes Taobao's iPad search results pages and collects, for each listing, the product image URL, product name, price, number of buyers, shop name, and shop location.
Token value:
Press F12 to open the developer tools, go to the Sources panel, and set a breakpoint on the line eT = eE(em.token + "&" + eC + "&" + eS + "&" + ep.data).
Then click through to the next page of search results; when the breakpoint is hit you can read the token from em.token.
Cookie:
Open the Network panel, refresh the page, search the captured traffic for the search-results request, click into the matching packet, and copy the cookie from its request headers.
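As a shortcut, note that the token hard-coded in the script below ("1853a595b6caa656f2a0a0861c87f09e") is exactly the first segment of the _m_h5_tk cookie visible in the request headers, so it can also be pulled out of the cookie string programmatically. The helper below is a minimal sketch based on that observation; the name extract_token is my own.

def extract_token(cookie_str):
    # Assumes the sign token is the part of the _m_h5_tk cookie before the underscore,
    # e.g. "_m_h5_tk=1853a595b6caa656f2a0a0861c87f09e_1731563704579" -> "1853a595b6caa656f2a0a0861c87f09e".
    for part in cookie_str.split('; '):
        if part.startswith('_m_h5_tk='):
            return part.split('=', 1)[1].split('_')[0]
    return None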
"""
TODO @Time: 2024.11.13 10:05
TODO @Description:
TODO 批量爬取淘宝IPad搜索页的商品图片网址,商品名称,商品价格,购买人数,商品店铺,店铺地址等数据
TODO 若报KeyError: 'itemsArray'错误 则修改 token 和 cookie
"""
"""导入模块"""
import requests
import hashlib
import re
import json
import csv
import time
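# Note: requests is the only third-party dependency (install with: pip install requests);
# hashlib, re, json, csv and time are all part of the Python standard library.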
"""保存到csv文件"""
f = open('TaobaoIpadData.csv', mode='w', encoding='utf-8', newline='')
csv_writer = csv.DictWriter(f, fieldnames=[
'商品图片网址',
'商品名称',
'商品价格',
'购买人数',
'商品店铺',
'店铺地址',
])
"""获取sign参数"""
def GetSign(eC, page, totalResults, sourceS, bc_offset, nt_offset):
em_token = "1853a595b6caa656f2a0a0861c87f09e"
eS = "12574478"
params = {
"device": "HMA-AL00",
"isBeta": "false",
"grayHair": "false",
"from": "nt_history",
"brand": "HUAWEI",
"info": "wifi",
"index": "4",
"rainbow": "",
"schemaType": "auction",
"elderHome": "false",
"isEnterSrpSearch": "true",
"newSearch": "false",
"network": "wifi",
"subtype": "",
"hasPreposeFilter": "false",
"prepositionVersion": "v2",
"client_os": "Android",
"gpsEnabled": "false",
"searchDoorFrom": "srp",
"debug_rerankNewOpenCard": "false",
"homePageVersion": "v7",
"searchElderHomeOpen": "false",
"search_action": "initiative",
"sugg": "_4_1",
"sversion": "13.6",
"style": "list",
"ttid": "600000@taobao_pc_10.7.0",
"needTabs": "true",
"areaCode": "CN",
"vm": "nw",
"countryNum": "156",
"m": "pc",
"page": page,
"n": 48,
"q": "ipad",
"qSource": "url",
"pageSource": "",
"tab": "all",
"pageSize": 48,
"totalPage": 100,
"totalResults": totalResults,
"sourceS": sourceS,
"sort": "_coefp",
"bcoffset": bc_offset,
"ntoffset": nt_offset,
"filterTag": "",
"service": "",
"prop": "",
"loc": "",
"start_price": None,
"end_price": None,
"startPrice": None,
"endPrice": None,
"itemIds": None,
"p4pIds": None,
"p4pS": None,
"categoryp": "",
"myCNA": "X/9CH0MbWU8BASQOA3k5PVBd"
}
    info_data = {
        "appId": "34385",
        "params": json.dumps(params)
    }
    ep_data = json.dumps(info_data).replace(' ', '')
    string = em_token + "&" + str(eC) + "&" + eS + "&" + ep_data
    MD5 = hashlib.md5()
    MD5.update(string.encode('utf-8'))
    sign = MD5.hexdigest()
    return sign, ep_data
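# For reference: GetSign hashes a string of the form
#   "<_m_h5_tk token>&<millisecond timestamp>&12574478&<compact JSON payload>"
# and the same payload (ep_data) is sent verbatim as the `data` query parameter,
# presumably so the server can recompute and verify the sign.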
"""发送请求"""
def GetContent(page, totalResults, sourceS, bc_offset, nt_offset):
# TODO 定义请求头
headers = {
'cookie': 'cna=oLVpH4QRRj0CATs4031lk0hs; tracknick=tb735478456; _hvn_lgc_=0; wk_cookie2=16cb34916fc6829f464d8b2c13ef73b7; wk_unb=UUphzWfZGRFIPmmKYA%3D%3D; thw=cn; t=bd7d9d70ef17437c32cb67d3e2e4542f; xlly_s=1; 3PcFlag=1731546377689; havana_lgc2_0=eyJoaWQiOjIyMDcwNTk2MzUxNjUsInNnIjoiMzM2MWNjNmU1NDhlMzRlNmE2YzRhMDI4MTU0N2I4OWMiLCJzaXRlIjowLCJ0b2tlbiI6IjFLVHFhMHVfWTAtLUo5d25tRnpaaTRnIn0; cookie3_bak=1ac0e58735da89325e109cea5d7336a7; cookie3_bak_exp=1731805584957; sn=; lgc=tb735478456; cancelledSubSites=empty; env_bak=FM%2BgmqK9Zvm2A1DNl%2FIQWnjH5AQ8%2BtieK%2FYhollQr2lz; dnk=tb735478456; mtop_partitioned_detect=1; _m_h5_tk=1853a595b6caa656f2a0a0861c87f09e_1731563704579; _m_h5_tk_enc=52c4ef72f7db30af8725119adc640888; cookie2=2adfc396990927a586cf2aa38b81f9ab; _tb_token_=e18b3571e4b1b; _samesite_flag_=true; sgcookie=E100Nc0OwXeI4bcFUPttDzBRkO7HHki9yuWVM3X6ZceMoY7%2BmRq38jisRPIRnj5Ra9eo1Ewhchd226qhMxQ%2BOZsRiTOwS%2FfzZqOsp5J6KjTiqzI%3D; havana_lgc_exp=1762660477612; unb=2207059635165; uc1=cookie16=UIHiLt3xCS3yM2h4eKHS9lpEOw%3D%3D&cookie15=WqG3DMC9VAQiUQ%3D%3D&pas=0&cookie14=UoYdXDzyt5YEAQ%3D%3D&cookie21=UtASsssme%2BBq&existShop=false; uc3=nk2=F5RCZs7fvotxxXg%3D&vt3=F8dD37yNyZOLGOP%2Bq%2FE%3D&lg2=VFC%2FuZ9ayeYq2g%3D%3D&id2=UUphzWfZGRFIPmmKYA%3D%3D; csg=9b34b3cd; cookie17=UUphzWfZGRFIPmmKYA%3D%3D; skt=f3c8978d4094b3c9; existShop=MTczMTU1NjQ3Nw%3D%3D; uc4=id4=0%40U2grFng8SqPcqRurzgIC3GmPUrGFUk%2Bi&nk4=0%40FY4JiMEjIcdAPx1iApdrpYuKjJryyg%3D%3D; _cc_=VT5L2FSpdA%3D%3D; _l_g_=Ug%3D%3D; sg=659; _nk_=tb735478456; cookie1=B0SuKW4BZ5%2Bczyn34ImbdgL6qCUsVzFzCxIssgtgRno%3D; sdkSilent=1731642877637; havana_sdkSilent=1731642877637; x5sec=7b22733b32223a2233356339313435343265313336663133222c22617365727665723b33223a22307c43507a6f31626b47454b447878703845476738794d6a41334d4455354e6a4d314d5459314f7a4d7767726274742f7a2f2f2f2f2f41513d3d227d; tfstk=fojjTVjHsjcXmL44RAeyNGBefOx_L1ZEGA9OKOnqBnKv6a6h1Iygon7W1_CWMmy0oFd1inLGuCR21h6GO8PUYkWcnhxYTWreRcn30hATbhhfHnDvXWPU44kJoS-OgqdZtugJKQpvBFpTF3pwhCK9kFB8FdpZMVnA686JZdkxWcK9e3pwdhdO6he5ebU4NL4XMtwrMKdRcWLGHQiCEg9vNqXvNXhdVaTX9lOSXcIWHTmi7snQ7BQHgeRhwuoXfTp1Ne7bOm11B9b9PGGY36BOigtDAPyDRaQP8E_7McOP0dTpXUMtXTtOkFI6vkhWEaBVWifjCcpc0M86tUwtjF-RYeQdGA2henpOsejatmAAB9jh8HZjtK_Rd3sPqDRBeMgsFem6FBy7FV0GgoztjgSDIqLvELSUF8GjSEpkFBy7FV0MkLvPY8wSGVf..; isg=BDAwZXPkIwaSkP7QcEyid2j9Af6CeRTDROjFIiqB_Ate5dCP0onkU4ZXPe2F9cyb',
'user-agent': 'Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/132.0.0.0 Safari/537.36'
}
    # TODO URL of the data endpoint
    url = 'https://h5api.m.taobao.com/h5/mtop.relationrecommend.wirelessrecommend.recommend/2.0/'
    # TODO Get the millisecond timestamp and the signed payload: sign, ep_data
    eC = int(time.time() * 1000)
    sign, ep_data = GetSign(eC, page, totalResults, sourceS, bc_offset, nt_offset)
    """Query-string parameters"""
    data = {
'jsv': '2.7.4',
'appKey': '12574478',
't': eC,
'sign': sign,
'api': 'mtop.relationrecommend.wirelessrecommend.recommend',
'v': 2.0,
'timeout': 10000,
'type': 'jsonp',
'dataType': 'jsonp',
'callback': 'mtopjsonp8',
'data': ep_data
}
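    # The dict above is URL-encoded onto the query string, so the actual request is
    # roughly: GET {url}?jsv=2.7.4&appKey=12574478&t=<timestamp>&sign=<md5>&...&data=<ep_data>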
"""发送请求"""
response = requests.get(url=url, params=data, headers=headers)
info = response.text
"""解析数据"""
json_str = re.findall('mtopjsonp\d+\((.*)', info)[0][:-1]
json_data = json.loads(json_str)
itemsArray = json_data['data']['itemsArray']
for index in itemsArray:
city_info = index['procity'].split(' ')
if len(city_info) == 2:
pro = city_info[0]
city = city_info[1]
else:
pro = city_info[0]
city = '未知'
dit = {
'商品图片网址': index['pic_path'],
'商品名称': index['title'].replace('<span class=H>', '').replace('</span>', ''),
'商品价格': "¥" + index['price'],
'购买人数': index['realSales'],
'商品店铺': index['nick'],
'店铺地址': city,
}
csv_writer.writerow(dit)
print(dit)
"""获取下一页的内容"""
totalResults = json_data['data']['mainInfo']['totalResults']
sourceS = json_data['data']['mainInfo']['sourceS']
bc_offset = json_data['data']['mainInfo']['bcoffset']
nt_offset = json_data['data']['mainInfo']['ntoffset']
return totalResults, sourceS, bc_offset, nt_offset
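# Optional guard (a sketch, not part of the original script): the note at the top says
# a KeyError on 'itemsArray' means the token/cookie have expired, so a thin wrapper can
# surface that more clearly. The name get_content_or_exit is hypothetical.
def get_content_or_exit(page, totalResults, sourceS, bc_offset, nt_offset):
    try:
        return GetContent(page, totalResults, sourceS, bc_offset, nt_offset)
    except KeyError as e:
        raise SystemExit(f'Response is missing {e}: refresh the token and cookie, then rerun.')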
"""第一页参数"""
totalResults = 4800
sourceS = '0'
bc_offset = '""'
nt_offset = '""'
"""翻页爬取"""
for page in range(1, 21):
print(f'-----------------------------------------------正在采集第{page}页的数据内容-----------------------------------------------')
totalResults, sourceS, bc_offset, nt_offset = GetContent(page, totalResults, sourceS, bc_offset, nt_offset)