爬虫案例学习6

获取淘宝商品数据2024-12-18

参考学习:
大佬博客
视频教程

通过搜索发现,商品数据是通过接口请求动态返回的,并不是静态写在页面源代码里的

所以我们需要请求这个接口获取数据:比如标题,价格,图片等信息
https://h5api.m.taobao.com/h5/mtop.relationrecommend.wirelessrecommend.recommend/2.0/

但是我们直接发请求,携带上参数,无法获取到数据,会返回非法请求的字样。

因为有个参数sign是加密的,我们需要逆向

逆向参数获取sign

sign参数:貌似是一些参数经过哈希加密算法之后生成的32位小写加密参数。

具体的需要查看对应的js

点击main.js

搜索sign:相关的代码,进行分析

eT = eE(em.token + "&" + eC + "&" + eS + "&" + ep.data)这一行就是生成sign

点击断点,可以查看变量的值

不过这里推荐一个打断点的时机:先用鼠标滚动到下面的页码处,再打断点,然后点击下一页,此时进入js源码看到的参数才是正确的。

如果不这样做,鼠标滚轮下滑也进入了断点,ep.data的值不是我们需要的,需要放行很多次。

切换到控制台,输出这些值,等一下在python代码中需要使用,这里先记录一下

bash 复制代码
eE(em.token + "&" + eC + "&" + eS + "&" + ep.data) # 返回值是sign的值
em.token
eC 
eS 
ep.data

获得的sign为 8a3593958c55ff4115e359745dc9a665,是一个只包含0-9、a-f的32位字符串,即MD5哈希值的十六进制形式

我们需要在代码中生成sign

构建字符串MD5加密

python 复制代码
# Build the string to hash: em.token + "&" + eC + "&" + eS + "&" + ep.data
# eC is the timestamp.
def getSign(eC):
    """Set up the inputs for the ``sign`` computation (first half of the function).

    This snippet only defines the token, the ``eS`` value and the ``params``
    dictionary; the hashing step is added in the article's next snippet.

    Args:
        eC: Timestamp component of the string that gets hashed.
    """
    em = 'cbee62bc9b064d508514dd6eb1c6cebd' # em holds the token (em.token in the site's JS)
    eS = '12574478'
    # signParam is the ``params`` field of ep.data
    signParam = {
	"device": "HMA-AL00",
	"isBeta": "false",
	"grayHair": "false",
	"from": "nt_history",
	"brand": "HUAWEI",
	"info": "wifi",
	"index": "4",
	"rainbow": "",
	"schemaType": "auction",
	"elderHome": "false",
	"isEnterSrpSearch": "true",
	"newSearch": "false",
	"network": "wifi",
	"subtype": "",
	"hasPreposeFilter": "false",
	"prepositionVersion": "v2",
	"client_os": "Android",
	"gpsEnabled": "false",
	"searchDoorFrom": "srp",
	"debug_rerankNewOpenCard": "false",
	"homePageVersion": "v7",
	"searchElderHomeOpen": "false",
	"search_action": "initiative",
	"sugg": "_4_1",
	"sversion": "13.6",
	"style": "list",
	"ttid": "600000@taobao_pc_10.7.0",
	"needTabs": "true",
	"areaCode": "CN",
	"vm": "nw",
	"countryNum": "156",
	"m": "pc",
	"page": 2,
	"n": 48,
	"q": "%E8%A3%A4%E5%AD%90",
	"qSource": "url",
	"pageSource": "",
	"tab": "all",
	"pageSize": "48",
	"totalPage": "100",
	"totalResults": "137306",
	"sourceS": "0",
	"sort": "_coefp",
	"bcoffset": "-13",
	"ntoffset": "13",
	"filterTag": "",
	"service": "",
	"prop": "",
	"loc": "",
	# JSON null values from the captured payload are written as None in Python.
	"start_price": None,
	"end_price": None,
	"startPrice": None,
	"endPrice": None,
	"categoryp": "",
	"ha3Kvpairs": None,
	"couponFilter": 0,
	"myCNA": "4PjnHzPgIA0CARsm5jekDfQ+"
}


json在线格式化

复制到python的函数的signParam字典中,将null值修改为None

接着继续完善getSign函数的MD5加密工作

import hashlib

python 复制代码
 n = json.dumps(signParam)  # serialise params to a JSON string; it is nested inside data below
    # print(json.dumps(json.dumps(signParam)))
    data = {
        "appId": "34385",
        "params": n
    }
    # print(data)
    # Strip every space so the payload is compact; this text is both hashed and returned.
    n_data = json.dumps(data).replace(" ", "")
    eC= "1734492057250" # hard-coded timestamp; it overrides the eC argument in this snippet
    # String to hash: token & timestamp & eS & compact payload.
    # NOTE(review): the local name `str` shadows the built-in inside this function.
    str = em + "&" + eC + "&" + eS + "&" + json.dumps(data).replace(" ","")
    # print(str)
    MD5 = hashlib.md5()
    MD5.update(str.encode("utf-8"))
    sign = MD5.hexdigest()
    # Return the hex MD5 digest together with the compact payload it was computed from.
    return sign,n_data

调用函数,获取签名sign。上面的时间戳是我写死的静态值,可以删除,改为动态生成,

等一下在完整源码中会修改为动态当前时间戳

python 复制代码
# Current time as a millisecond timestamp string, passed to getSign as eC.
date_time = str(int(time.time() * 1000))
# getSign returns the signature and the compact JSON payload.
sign,n = getSign(eC = date_time)
# print(sign)
# Example value of sign: f94586b665e0d865a20aa6d3acf708f3

有了sign,就可以发起请求,获取数据了,直接上完整源码

请求数据所在的api接口
https://h5api.m.taobao.com/h5/mtop.relationrecommend.wirelessrecommend.recommend/2.0/

完整源码

python 复制代码
# 可以运行版
# 获取淘宝数据:https://s.taobao.com/
# 搜索键盘相关数据,会自动拦截登录页面(所以需要cookie)
import csv
import time

import requests
from pprint import pprint
import hashlib
import json
import re
# API endpoint the script requests to obtain the search-result data.
url = "https://h5api.m.taobao.com/h5/mtop.relationrecommend.wirelessrecommend.recommend/2.0/"
# HTTP headers sent with the request.
headers = {
    "User-Agent": "Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/131.0.0.0 Safari/537.36",
    "Referer": "https://s.taobao.com/",
    "cookie":"自己的cookie"  # placeholder: replace with your own cookie
}
# The bare string below is a note, not executed logic: it records the
# FAIL_SYS_ILLEGAL_ACCESS response returned for a bad request and remarks that
# the sign parameter changes on every request, so it has to be reverse-engineered.
"""
 mtopjsonp6({"api":"mtop.relationrecommend.wirelessrecommend.recommend","data":{},"ret":["FAIL_SYS_ILLEGAL_ACCESS::非法请求"]
 sign参数每次请求都会变化,导致请求不到数据(参数sign逆向)
"""
# eE(em.token + "&" + eC + "&" + eS + "&" + ep.data)
def getSign(eC, *, token='db1e1adce046132af55f1e37728ca39b', app_key='12574478'):
    """Return the request signature and the JSON payload it was computed from.

    The signature is the hex MD5 digest of
    ``token + "&" + eC + "&" + app_key + "&" + payload``, where ``payload`` is
    the compact JSON text of ``{"appId": ..., "params": <JSON string>}``.

    Args:
        eC: Timestamp string; the same value must be sent as the ``t`` query
            parameter of the request.
        token: Token value (``em.token`` in the site's JS). The default is the
            value captured for this example; replace it when adapting the script.
        app_key: The ``eS`` value from the site's JS; it matches the ``appKey``
            query parameter of the request.

    Returns:
        A ``(sign, payload)`` tuple: the 32-character lowercase hex digest and
        the exact payload string that was hashed (send it as ``data``).
    """
    # Search parameters captured from the browser (the ``params`` field of ep.data).
    # JSON null values are represented as None.
    signParam = {
        "device": "HMA-AL00",
        "isBeta": "false",
        "grayHair": "false",
        "from": "nt_history",
        "brand": "HUAWEI",
        "info": "wifi",
        "index": "4",
        "rainbow": "",
        "schemaType": "auction",
        "elderHome": "false",
        "isEnterSrpSearch": "true",
        "newSearch": "false",
        "network": "wifi",
        "subtype": "",
        "hasPreposeFilter": "false",
        "prepositionVersion": "v2",
        "client_os": "Android",
        "gpsEnabled": "false",
        "searchDoorFrom": "srp",
        "debug_rerankNewOpenCard": "false",
        "homePageVersion": "v7",
        "searchElderHomeOpen": "false",
        "search_action": "initiative",
        "sugg": "_4_1",
        "sversion": "13.6",
        "style": "list",
        "ttid": "600000@taobao_pc_10.7.0",
        "needTabs": "true",
        "areaCode": "CN",
        "vm": "nw",
        "countryNum": "156",
        "m": "pc",
        "page": 1,
        "n": 48,
        "q": "%E8%A3%A4%E5%AD%90",
        "qSource": "url",
        "pageSource": "",
        "tab": "all",
        "pageSize": "48",
        "totalPage": "100",
        "totalResults": "5000",
        "sourceS": "48",
        "sort": "_coefp",
        "bcoffset": "-26",
        "ntoffset": "0",
        "filterTag": "",
        "service": "",
        "prop": "",
        "loc": "",
        "start_price": None,
        "end_price": None,
        "startPrice": None,
        "endPrice": None,
        "categoryp": "",
        "ha3Kvpairs": None,
        "couponFilter": 0,
        "myCNA": "4PjnHzPgIA0CARsm5jekDfQ+",
    }

    # Compact separators drop only the whitespace json.dumps inserts between
    # tokens; stripping every space afterwards would also alter values that
    # legitimately contain spaces.
    compact = (",", ":")
    inner_params = json.dumps(signParam, separators=compact)
    n_data = json.dumps(
        {"appId": "34385", "params": inner_params},
        separators=compact,
    )

    # Serialise once and reuse: the hashed text and the returned payload must
    # be the same string.
    to_sign = "&".join((token, eC, app_key, n_data))
    sign = hashlib.md5(to_sign.encode("utf-8")).hexdigest()
    return sign, n_data

# The timestamp that goes into the signature is also sent as the `t` parameter.
date_time = str(int(time.time() * 1000))
sign, n = getSign(eC=date_time)
print(sign)

# Query-string parameters of the data API; `data` is the exact payload that was signed.
params = {
    'jsv': '2.7.4',
    'appKey': '12574478',
    't': date_time,
    'sign': sign,
    'api': 'mtop.relationrecommend.wirelessrecommend.recommend',
    'v': '2.0',
    'timeout': '10000',
    'type': 'jsonp',
    'dataType': 'jsonp',
    'callback': 'mtopjsonp6',
    'data': n,
}
# A timeout keeps the script from hanging indefinitely if the server never answers.
resp = requests.get(url, params=params, headers=headers, timeout=10)
html = resp.text

# The body is JSONP: mtopjsonpN({...}). Capture everything between the first
# "(" and the LAST ")" so parentheses inside the JSON (e.g. in titles) are
# preserved; re.S lets the payload span several lines.
jsonp_match = re.search(r'mtopjsonp\d+\((.*)\)', html, re.S)
if jsonp_match is None:
    raise RuntimeError(f"Response is not the expected JSONP: {html[:200]!r}")
info = jsonp_match.group(1)
jsonData = json.loads(info)

# A rejected request (e.g. a bad sign) comes back with an empty "data" object
# and the reason in "ret"; report that instead of failing with a bare KeyError.
items = jsonData.get('data', {}).get('itemsArray')
if items is None:
    raise RuntimeError(f"No itemsArray in response, ret={jsonData.get('ret')}")

# Write one CSV row per item.
with open('taobao.csv', mode="w", newline='', encoding="utf-8") as f:
    writer = csv.writer(f)
    # Header row
    head = ['标题', '图片链接', '价格', '地区', '销量', '店铺']
    writer.writerow(head)
    for item in items:
        dit = {
            # Remove the search-highlight markup from the title.
            'title': item['title'].replace('<span class=H>', '').replace('</span>', ''),
            'img': item['pic_path'],
            'price': item['price'],
            'procity': item['procity'],
            'realSales': item['realSales'],
            'shopName': item['nick'],
        }
        writer.writerow(dit.values())
        print(dit)


注:需要获取其他数据

修改源码几个参数

url所在浏览器位置

改Referer和cookie

重写getSign函数的em值,eS值,signParam值

data中的appid也改

修改真正数据接口的参数:params

最后运行代码,即可获取数据

相关推荐
虾球xz5 分钟前
游戏引擎学习第84天
学习·游戏引擎
m0_748240541 小时前
AutoSar架构学习笔记
笔记·学习·架构
数据小小爬虫2 小时前
如何使用Python爬虫获取微店商品详情:代码示例与实践指南
开发语言·爬虫·python
siy23333 小时前
[c语言日寄]结构体的使用及其拓展
c语言·开发语言·笔记·学习·算法
mit6.8244 小时前
What is Json?
c++·学习·json
weixin_SAG4 小时前
14天学习微服务-->第1天:微服务架构入门
学习·微服务·架构
ThisIsClark4 小时前
【gopher的java学习笔记】Java中Mapper与Entity的关系详解
java·笔记·学习
m0_548049704 小时前
SpringCloud学习笔记【尚硅谷2024版】
笔记·学习·spring cloud
羊小猪~~4 小时前
深度学习基础--LSTM学习笔记(李沐《动手学习深度学习》)
人工智能·rnn·深度学习·学习·机器学习·gru·lstm