爬虫案例学习6

获取淘宝商品数据2024-12-18

参考学习:
大佬博客
视频教程

通过搜索发现,数据是通过发送请求过来的,不是静态存在源代码的

所以我们需要请求这个接口获取数据:比如标题,价格,图片等信息
https://h5api.m.taobao.com/h5/mtop.relationrecommend.wirelessrecommend.recommend/2.0/

但是我们直接发请求,携带上参数,无法获取到数据,会返回非法请求的字样。

因为有个参数sign是加密的,我们需要逆向

逆向参数获取sign

sign参数:看起来是由若干参数拼接后,经哈希(摘要)算法生成的32位小写十六进制字符串。

具体的需要查看对应的js

点击main.js

搜索 sign: 相关的代码,并进行分析

eT = eE(em.token + "&" + eC + "&" + eS + "&" + ep.data)这一行就是生成sign

点击断点,可以查看变量的值

不过这里推荐打断点的时机,先鼠标滚动到下面的页码处,再接着打断点,点击下一页,此时进入js源码的参数才是正确的。

如果不这样做,鼠标滚轮下滑也进入了断点,ep.data的值不是我们需要的,需要放行很多次。

切换到控制台,输出这些值,等一下在python代码中需要使用,这里先记录一下

bash 复制代码
eE(em.token + "&" + eC + "&" + eS + "&" + ep.data) # 返回值是sign的值
em.token
eC 
eS 
ep.data

获得的sign为 8a3593958c55ff4115e359745dc9a665,是由0-9、a-f组成的32位字符串,符合MD5摘要(十六进制)的特征

我们需要在代码中生成sign

构建字符串MD5加密

python 复制代码
# Build the string to sign: em.token + "&" + eC + "&" + eS + "&" + ep.data
# eC is the timestamp.
def getSign(eC):
    """First part of the sign helper: set up the inputs of the signed string.

    This snippet only defines the locals (token, ``eS`` and the request
    parameters); the hashing and the ``return`` are shown in a later snippet
    of the tutorial.

    Args:
        eC: timestamp used as the second component of the signed string.
    """
    em = 'cbee62bc9b064d508514dd6eb1c6cebd' # em holds the token (em.token in the page's JS)
    eS = '12574478'
    # signParam is the "params" field of ep.data (JSON null values written as None)
    signParam = {
	"device": "HMA-AL00",
	"isBeta": "false",
	"grayHair": "false",
	"from": "nt_history",
	"brand": "HUAWEI",
	"info": "wifi",
	"index": "4",
	"rainbow": "",
	"schemaType": "auction",
	"elderHome": "false",
	"isEnterSrpSearch": "true",
	"newSearch": "false",
	"network": "wifi",
	"subtype": "",
	"hasPreposeFilter": "false",
	"prepositionVersion": "v2",
	"client_os": "Android",
	"gpsEnabled": "false",
	"searchDoorFrom": "srp",
	"debug_rerankNewOpenCard": "false",
	"homePageVersion": "v7",
	"searchElderHomeOpen": "false",
	"search_action": "initiative",
	"sugg": "_4_1",
	"sversion": "13.6",
	"style": "list",
	"ttid": "600000@taobao_pc_10.7.0",
	"needTabs": "true",
	"areaCode": "CN",
	"vm": "nw",
	"countryNum": "156",
	"m": "pc",
	"page": 2,
	"n": 48,
	"q": "%E8%A3%A4%E5%AD%90",
	"qSource": "url",
	"pageSource": "",
	"tab": "all",
	"pageSize": "48",
	"totalPage": "100",
	"totalResults": "137306",
	"sourceS": "0",
	"sort": "_coefp",
	"bcoffset": "-13",
	"ntoffset": "13",
	"filterTag": "",
	"service": "",
	"prop": "",
	"loc": "",
	"start_price": None,
	"end_price": None,
	"startPrice": None,
	"endPrice": None,
	"categoryp": "",
	"ha3Kvpairs": None,
	"couponFilter": 0,
	"myCNA": "4PjnHzPgIA0CARsm5jekDfQ+"
}


json在线格式化

复制到python的函数的signParam字典中,将null值修改为None

接着继续完善getSign函数的MD5加密工作

import hashlib

python 复制代码
 n = json.dumps(signParam)  # serialize the params dict; embedded as a string in data below
    # print(json.dumps(json.dumps(signParam)))
    data = {
        "appId": "34385",
        "params": n
    }
    # print(data)
    # JSON text of data with every space removed; returned as the second value
    n_data = json.dumps(data).replace(" ", "")
    eC= "1734492057250" # hard-coded timestamp; this overrides the eC argument
    # String to hash: token & timestamp & eS & space-free JSON of data
    str = em + "&" + eC + "&" + eS + "&" + json.dumps(data).replace(" ","")
    # print(str)
    MD5 = hashlib.md5()
    MD5.update(str.encode("utf-8"))
    sign = MD5.hexdigest()  # hexadecimal MD5 digest used as the sign parameter
    return sign,n_data

调用函数,获取签名sign。上面的时间戳是我写死的静态值,可以删除,改为动态获取的当前时间戳,

等一下在完整源码中会修改为动态当前时间戳

python 复制代码
# Current Unix time in milliseconds, rendered as a decimal string; it is the
# eC component of the signed string.
date_time = "%d" % int(time.time() * 1000)
sign, n = getSign(eC=date_time)
# print(sign)
# Example output: f94586b665e0d865a20aa6d3acf708f3

有了sign,就可以发起请求,获取数据了,直接上完整源码

请求数据所在的api接口
https://h5api.m.taobao.com/h5/mtop.relationrecommend.wirelessrecommend.recommend/2.0/

完整源码

python 复制代码
# Runnable version
# Fetches Taobao search data: https://s.taobao.com/
# Searching (e.g. for keyboards) is intercepted by the login page, so a cookie is required
import csv
import time

import requests
from pprint import pprint
import hashlib
import json
import re
# API endpoint that returns the search-result data.
url = (
    "https://h5api.m.taobao.com/h5/"
    "mtop.relationrecommend.wirelessrecommend.recommend/2.0/"
)

# Request headers; replace the cookie placeholder with your own cookie string.
headers = {
    "User-Agent": (
        "Mozilla/5.0 (Windows NT 10.0; Win64; x64) "
        "AppleWebKit/537.36 (KHTML, like Gecko) "
        "Chrome/131.0.0.0 Safari/537.36"
    ),
    "Referer": "https://s.taobao.com/",
    "cookie": "自己的cookie",
}

# The string literal below is a no-op note kept from the original: it records
# the response returned when the request is rejected as illegal, and that the
# sign parameter changes on every request and therefore has to be computed.
"""
 mtopjsonp6({"api":"mtop.relationrecommend.wirelessrecommend.recommend","data":{},"ret":["FAIL_SYS_ILLEGAL_ACCESS::非法请求"]
 sign参数每次请求都会变化,导致请求不到数据(参数sign逆向)
"""
# Browser JS equivalent: eE(em.token + "&" + eC + "&" + eS + "&" + ep.data)
def getSign(eC, token='db1e1adce046132af55f1e37728ca39b', app_key='12574478'):
    """Build the request ``data`` payload and its MD5 ``sign``.

    The signed string is ``token & eC & app_key & data``, where ``data`` is
    the compact JSON text of ``{"appId": ..., "params": <JSON text of the
    search parameters>}``.

    Args:
        eC: request timestamp (the tutorial uses milliseconds since the
            epoch); converted with ``str()`` so an int is accepted too.
        token: value of ``em.token`` recorded from the page's JS. The default
            is the sample captured for this tutorial.
            NOTE(review): presumably session-bound - confirm and refresh it
            if requests start being rejected.
        app_key: the ``eS`` component of the signed string; the default
            equals the ``appKey`` query parameter sent with the request.

    Returns:
        A ``(sign, n_data)`` tuple: the hexadecimal MD5 digest of the signed
        string, and the JSON text to send as the ``data`` query parameter.
    """
    # "params" field of ep.data as captured in the browser (JSON null -> None).
    signParam = {
        "device": "HMA-AL00",
        "isBeta": "false",
        "grayHair": "false",
        "from": "nt_history",
        "brand": "HUAWEI",
        "info": "wifi",
        "index": "4",
        "rainbow": "",
        "schemaType": "auction",
        "elderHome": "false",
        "isEnterSrpSearch": "true",
        "newSearch": "false",
        "network": "wifi",
        "subtype": "",
        "hasPreposeFilter": "false",
        "prepositionVersion": "v2",
        "client_os": "Android",
        "gpsEnabled": "false",
        "searchDoorFrom": "srp",
        "debug_rerankNewOpenCard": "false",
        "homePageVersion": "v7",
        "searchElderHomeOpen": "false",
        "search_action": "initiative",
        "sugg": "_4_1",
        "sversion": "13.6",
        "style": "list",
        "ttid": "600000@taobao_pc_10.7.0",
        "needTabs": "true",
        "areaCode": "CN",
        "vm": "nw",
        "countryNum": "156",
        "m": "pc",
        "page": 1,
        "n": 48,
        "q": "%E8%A3%A4%E5%AD%90",
        "qSource": "url",
        "pageSource": "",
        "tab": "all",
        "pageSize": "48",
        "totalPage": "100",
        "totalResults": "5000",
        "sourceS": "48",
        "sort": "_coefp",
        "bcoffset": "-26",
        "ntoffset": "0",
        "filterTag": "",
        "service": "",
        "prop": "",
        "loc": "",
        "start_price": None,
        "end_price": None,
        "startPrice": None,
        "endPrice": None,
        "categoryp": "",
        "ha3Kvpairs": None,
        "couponFilter": 0,
        "myCNA": "4PjnHzPgIA0CARsm5jekDfQ+",
    }

    # Compact separators emit JSON without inter-token spaces. Unlike
    # stripping spaces from the finished text, this leaves spaces that occur
    # inside values untouched.
    compact = (",", ":")
    inner = json.dumps(signParam, separators=compact)
    n_data = json.dumps({"appId": "34385", "params": inner}, separators=compact)

    # The exact text that is sent as `data` must also be the text that is signed.
    to_sign = "&".join((token, str(eC), app_key, n_data))
    sign = hashlib.md5(to_sign.encode("utf-8")).hexdigest()
    return sign, n_data

def _extract_jsonp_payload(text):
    """Return the JSON text wrapped by a ``mtopjsonpN( ... )`` callback.

    Only the wrapper's own parentheses are dropped: the capture runs from the
    first ``(`` after the callback name to the last ``)`` in the text, so any
    ``)`` inside the JSON (e.g. in product titles) is preserved.

    Raises:
        ValueError: if the text is not a ``mtopjsonpN(...)`` response.
    """
    match = re.search(r'mtopjsonp\d+\((.*)\)', text, re.S)
    if match is None:
        raise ValueError("response is not a mtopjsonp callback: %r" % text[:200])
    return match.group(1)


# The same millisecond timestamp must be used for the signature and for `t`.
date_time = str(int(time.time() * 1000))
sign, n = getSign(eC=date_time)
print(sign)
params = {
    'jsv': '2.7.4',
    'appKey': '12574478',
    't': date_time,
    'sign': sign,
    'api': 'mtop.relationrecommend.wirelessrecommend.recommend',
    'v': '2.0',
    'timeout': '10000',
    'type': 'jsonp',
    'dataType': 'jsonp',
    'callback': 'mtopjsonp6',
    'data': n,
}
# A client-side timeout keeps the script from hanging on a stalled connection.
resp = requests.get(url, params=params, headers=headers, timeout=10)
resp.raise_for_status()
# print(resp.text)
html = resp.text

# Unwrap the JSONP callback and parse the JSON body.
info = _extract_jsonp_payload(html)
# pprint(info)
jsonData = json.loads(info)

# A rejected request (e.g. FAIL_SYS_ILLEGAL_ACCESS) carries no item list;
# fail with the server's `ret` message instead of an opaque KeyError.
items = (jsonData.get('data') or {}).get('itemsArray')
if items is None:
    raise RuntimeError("no itemsArray in response, ret=%r" % (jsonData.get('ret'),))

# Write one CSV row per item.
with open('taobao.csv', mode="w", newline='', encoding="utf-8") as f:
    writer = csv.writer(f)
    # Header row
    head = ['标题', '图片链接', '价格', '地区', '销量', '店铺']
    writer.writerow(head)
    for item in items:
        # Missing fields become empty cells so that one incomplete item does
        # not abort the run and leave a partial file.
        dit = {
            # Strip the keyword-highlight markup from the title.
            'title': item.get('title', '').replace('<span class=H>', '').replace('</span>', ''),
            'img': item.get('pic_path', ''),
            'price': item.get('price', ''),
            'procity': item.get('procity', ''),
            'realSales': item.get('realSales', ''),
            'shopName': item.get('nick', ''),
        }
        writer.writerow(dit.values())
        print(dit)


注:需要获取其他数据

修改源码几个参数

url所在浏览器位置

改Referer和cookie

重写getSign函数的em值,eS值,signParam值

data中的appId也要相应修改

修改真正数据接口的参数:params

最后运行代码,即可获取数据

相关推荐
西岸行者4 天前
学习笔记:SKILLS 能帮助更好的vibe coding
笔记·学习
悠哉悠哉愿意4 天前
【单片机学习笔记】串口、超声波、NE555的同时使用
笔记·单片机·学习
别催小唐敲代码4 天前
嵌入式学习路线
学习
毛小茛4 天前
计算机系统概论——校验码
学习
babe小鑫4 天前
大专经济信息管理专业学习数据分析的必要性
学习·数据挖掘·数据分析
winfreedoms4 天前
ROS2知识大白话
笔记·学习·ros2
在这habit之下4 天前
Linux Virtual Server(LVS)学习总结
linux·学习·lvs
我想我不够好。4 天前
2026.2.25监控学习
学习
im_AMBER4 天前
Leetcode 127 删除有序数组中的重复项 | 删除有序数组中的重复项 II
数据结构·学习·算法·leetcode
CodeJourney_J4 天前
从“Hello World“ 开始 C++
c语言·c++·学习