一、引言
在电商数据分析领域,实时获取亚马逊商品详情页数据对于竞品分析、价格监控和市场调研至关重要。本文将详细介绍如何使用 Python 开发一个高效、稳定的亚马逊商品详情数据采集接口,帮助开发者快速获取商品标题、价格、库存、评论等核心信息。
二、准备工作
1. 申请亚马逊 SP-API 权限
首先需要在亚马逊平台注册并申请 API 权限:
- 注册亚马逊开发者账号
- 创建并获取 Client ID 和 Client Secret
- 申请必要的 API 权限(Products API)
- 配置并获取 Refresh Token
2. 安装必要的 Python 库
pip install requests python-jose cryptography python-dotenv
3. 项目结构设计
arduino
amazon_api_client/
├── config/
│ └── config.py
├── utils/
│ ├── auth.py
│ └── request_helper.py
├── services/
│ └── product_service.py
├── main.py
└── .env
三、认证机制实现
1. OAuth 2.0 认证流程
亚马逊 SP-API 使用 OAuth 2.0 进行认证,需要通过 Refresh Token 定期获取 Access Token:
python
# utils/auth.py
import os
import time
import requests
from jose import jwk, jwt
from jose.utils import base64url_decode
from dotenv import load_dotenv
load_dotenv()
class AmazonAuth:
    """Obtain and cache OAuth 2.0 access tokens for Amazon API calls.

    Credentials are read from the ``CLIENT_ID``, ``CLIENT_SECRET`` and
    ``REFRESH_TOKEN`` environment variables when the object is created.
    """

    # Endpoint that exchanges the refresh token for an access token.
    TOKEN_URL = "https://api.amazon.com/auth/o2/token"
    # Seconds shaved off the reported lifetime so a token is never used right
    # at the edge of its validity.
    EXPIRY_MARGIN_SECONDS = 60
    # Upper bound on the token request so a stalled connection cannot block
    # the caller indefinitely.
    REQUEST_TIMEOUT_SECONDS = 30

    def __init__(self):
        self.client_id = os.getenv("CLIENT_ID")
        self.client_secret = os.getenv("CLIENT_SECRET")
        self.refresh_token = os.getenv("REFRESH_TOKEN")
        self.access_token = None
        self.token_expiry = 0

    def get_access_token(self):
        """Return a usable access token, refreshing it when the cache is stale.

        Returns:
            The access token string.

        Raises:
            requests.exceptions.HTTPError: if the token endpoint answers with
                an error status.
            KeyError: if the response lacks ``access_token`` or ``expires_in``.
        """
        # ``token_expiry`` already has the safety margin subtracted, so it is
        # compared against the plain current time (no second margin).
        if self.access_token and self.token_expiry > time.time():
            return self.access_token

        payload = {
            "grant_type": "refresh_token",
            "client_id": self.client_id,
            "client_secret": self.client_secret,
            "refresh_token": self.refresh_token,
        }
        response = requests.post(
            self.TOKEN_URL,
            data=payload,
            timeout=self.REQUEST_TIMEOUT_SECONDS,
        )
        response.raise_for_status()
        token_data = response.json()

        self.access_token = token_data["access_token"]
        self.token_expiry = (
            time.time() + token_data["expires_in"] - self.EXPIRY_MARGIN_SECONDS
        )
        return self.access_token
2. 请求签名生成
python
# utils/request_helper.py
import hmac
import hashlib
import time
import os
from datetime import datetime
from auth import AmazonAuth
class RequestHelper:
    """Build AWS Signature Version 4 request headers for the selling-partner host."""

    def __init__(self):
        self.auth = AmazonAuth()
        self.seller_id = os.getenv("SELLER_ID")
        self.region = os.getenv("REGION", "us-east-1")
        self.host = "sellingpartnerapi-na.amazon.com"

    def generate_signature(self, string_to_sign, amz_date=None):
        """Return the hex HMAC-SHA256 signature for ``string_to_sign``.

        Args:
            string_to_sign: The SigV4 string-to-sign.
            amz_date: Timestamp (``YYYYMMDDTHHMMSSZ``) used for the request.
                Pass the same value that appears in the string-to-sign and
                credential scope; defaults to the current time.

        Raises:
            ValueError: if ``AWS_SECRET_KEY`` is not set in the environment.
        """
        if amz_date is None:
            amz_date = self.get_amz_date()
        secret_key = os.getenv("AWS_SECRET_KEY")
        if not secret_key:
            raise ValueError("AWS_SECRET_KEY environment variable is not set")

        # The signing key is derived from the 8-character date stamp
        # (YYYYMMDD), not from the full timestamp.
        key = ("AWS4" + secret_key).encode()
        for part in (amz_date[:8], self.region, "execute-api", "aws4_request"):
            key = hmac.new(key, part.encode(), hashlib.sha256).digest()
        return hmac.new(key, string_to_sign.encode(), hashlib.sha256).hexdigest()

    def get_amz_date(self):
        """Return the current UTC time formatted as ``YYYYMMDDTHHMMSSZ``."""
        return time.strftime("%Y%m%dT%H%M%SZ", time.gmtime())

    @staticmethod
    def _canonical_querystring(query_params):
        """Return the percent-encoded, sorted query string used for signing."""
        if not query_params:
            return ""
        from urllib.parse import quote

        # Only unreserved characters stay literal; everything else (including
        # commas in list-valued parameters) is percent-encoded so the signed
        # string matches the URL actually sent.
        pairs = sorted(
            (quote(str(key), safe="-_.~"), quote(str(value), safe="-_.~"))
            for key, value in query_params.items()
        )
        return "&".join(f"{key}={value}" for key, value in pairs)

    def generate_headers(self, method, path, query_params=None, payload=""):
        """Return the signed headers for one request.

        Args:
            method: HTTP method, e.g. ``"GET"``.
            path: Request path beginning with ``/``.
            query_params: Optional mapping of query parameters.
            payload: Request body as a string (empty for body-less requests).

        Returns:
            A dict with ``Content-Type``, ``X-Amz-Date``, ``Authorization``
            and ``X-Amz-Access-Token`` entries.
        """
        # A single timestamp is taken and reused everywhere so the header,
        # credential scope and signing key can never disagree.
        amz_date = self.get_amz_date()
        access_token = self.auth.get_access_token()

        canonical_headers = (
            f'host:{self.host}\n'
            f'x-amz-access-token:{access_token}\n'
            f'x-amz-date:{amz_date}\n'
        )
        signed_headers = 'host;x-amz-access-token;x-amz-date'
        canonical_querystring = self._canonical_querystring(query_params)
        payload_hash = hashlib.sha256(payload.encode()).hexdigest()
        canonical_request = (
            f"{method}\n{path}\n{canonical_querystring}\n"
            f"{canonical_headers}\n{signed_headers}\n{payload_hash}"
        )

        algorithm = 'AWS4-HMAC-SHA256'
        credential_scope = f"{amz_date[:8]}/{self.region}/execute-api/aws4_request"
        request_digest = hashlib.sha256(canonical_request.encode()).hexdigest()
        string_to_sign = f"{algorithm}\n{amz_date}\n{credential_scope}\n{request_digest}"
        signature = self.generate_signature(string_to_sign, amz_date)

        authorization_header = (
            f"{algorithm} Credential={os.getenv('AWS_ACCESS_KEY')}/{credential_scope}, "
            f"SignedHeaders={signed_headers}, Signature={signature}"
        )
        return {
            'Content-Type': 'application/json',
            'X-Amz-Date': amz_date,
            'Authorization': authorization_header,
            'X-Amz-Access-Token': access_token,
        }
四、商品详情 API 实现
1. 获取单个商品详情
python
# services/product_service.py
import requests
from utils.request_helper import RequestHelper
class ProductService:
    """Fetch product detail documents from the selling-partner endpoint."""

    # Upper bound (seconds) on each HTTP call so a stalled connection cannot
    # hang the caller forever.
    REQUEST_TIMEOUT_SECONDS = 30

    def __init__(self):
        self.request_helper = RequestHelper()
        self.endpoint = "https://sellingpartnerapi-na.amazon.com"

    def get_product_details(self, asin, marketplace_id="ATVPDKIKX0DER"):
        """Fetch the detail document for a single ASIN.

        Args:
            asin: The ASIN to look up.
            marketplace_id: Marketplace identifier sent as ``marketplaceIds``.

        Returns:
            The decoded JSON body of the response.

        Raises:
            requests.exceptions.HTTPError: on a non-success status code (the
                status and body are printed before re-raising).
            Exception: any other failure is printed and re-raised unchanged.
        """
        path = f"/products/2020-08-01/items/{asin}"
        query_params = {
            "marketplaceIds": marketplace_id,
            "includedData": "attributes,images,productTypes,identifiers,offers,prices",
        }
        headers = self.request_helper.generate_headers("GET", path, query_params)
        url = f"{self.endpoint}{path}"
        try:
            response = requests.get(
                url,
                headers=headers,
                params=query_params,
                timeout=self.REQUEST_TIMEOUT_SECONDS,
            )
            response.raise_for_status()
            return response.json()
        except requests.exceptions.HTTPError as e:
            print(f"HTTP Error: {e.response.status_code}")
            print(f"Response: {e.response.text}")
            raise
        except Exception as e:
            print(f"Error: {e}")
            raise
2. 批量获取商品详情
python
# services/product_service.py (续)
def batch_get_product_details(self, asins, marketplace_id="ATVPDKIKX0DER"):
"""批量获取商品详情"""
path = "/products/2020-08-01/items"
results = []
batch_size = 10 # 亚马逊限制每次最多10个ASIN
for i in range(0, len(asins), batch_size):
batch = asins[i:i+batch_size]
query_params = {
"marketplaceIds": marketplace_id,
"includedData": "attributes,images,productTypes,identifiers,offers,prices",
"asin": ",".join(batch)
}
headers = self.request_helper.generate_headers("GET", path, query_params)
url = f"{self.endpoint}{path}"
try:
response = requests.get(url, headers=headers, params=query_params)
response.raise_for_status()
results.extend(response.json().get("payload", []))
# 遵守速率限制
if len(asins) > batch_size:
import time
time.sleep(1) # 每秒请求一次
except requests.exceptions.HTTPError as e:
if e.response.status_code == 429:
print("Rate limit exceeded, sleeping for 60 seconds...")
time.sleep(60)
return self.batch_get_product_details(asins[i:], marketplace_id)
else:
print(f"HTTP Error: {e.response.status_code}")
print(f"Response: {e.response.text}")
raise
except Exception as e:
print(f"Error: {e}")
raise
return results
五、数据解析与处理
1. 解析商品数据
python
# services/product_service.py (续)
def parse_product_data(self, product_data):
    """Convert one raw product document into a flat summary dict.

    Args:
        product_data: Mapping for a single product; may be empty or ``None``.

    Returns:
        A dict of selected attributes plus parsed ``images``, ``prices`` and
        ``offers`` and an ``update_time`` ISO timestamp, or ``None`` when the
        input is empty or cannot be parsed.
    """
    # Imported locally because the surrounding module's import block does not
    # bring ``datetime`` into scope; without it every call fell into the
    # ``except`` below and returned ``None``.
    from datetime import datetime

    if not product_data:
        return None
    try:
        # ``or`` fallbacks tolerate sections that are present but null.
        item = product_data.get("attributes") or {}
        parsed_data = {
            "asin": product_data.get("asin"),
            "title": item.get("title"),
            "brand": item.get("brand"),
            "color": item.get("color"),
            "size": item.get("size"),
            "model": item.get("model"),
            "manufacturer": item.get("manufacturer"),
            "item_dimensions": item.get("itemDimensions"),
            "package_dimensions": item.get("packageDimensions"),
            "package_quantity": item.get("packageQuantity"),
            "release_date": item.get("releaseDate"),
            "is_adult_product": item.get("isAdultProduct"),
            "product_type": product_data.get("productType"),
            "identifiers": product_data.get("identifiers"),
            "images": self._parse_images(product_data.get("images") or []),
            "prices": self._parse_prices(product_data.get("prices") or {}),
            "offers": self._parse_offers(product_data.get("offers") or {}),
            "update_time": datetime.now().isoformat(),
        }
        return parsed_data
    except Exception as e:
        # Deliberate best-effort: a malformed document is reported and
        # skipped rather than aborting the caller's loop.
        print(f"Error parsing product data: {e}")
        return None
def _parse_images(self, images):
"""解析商品图片"""
parsed_images = []
for image_group in images:
for image in image_group.get("images", []):
parsed_images.append({
"url": image.get("url"),
"height": image.get("height"),
"width": image.get("width"),
"variant": image.get("variant")
})
return parsed_images
def _parse_prices(self, prices):
"""解析价格信息"""
return {
"list_price": self._parse_price_component(prices.get("listPrice")),
"price_range": {
"min": self._parse_price_component(prices.get("priceRange", {}).get("min")),
"max": self._parse_price_component(prices.get("priceRange", {}).get("max"))
},
"buying_price": self._parse_price_component(prices.get("buyingPrice")),
"regular_price": self._parse_price_component(prices.get("regularPrice")),
"points": prices.get("points")
}
def _parse_price_component(self, price_component):
"""解析价格组件"""
if not price_component:
return None
return {
"amount": price_component.get("amount"),
"currency_code": price_component.get("currencyCode")
}
def _parse_offers(self, offers):
"""解析商品销售信息"""
return {
"listing_id": offers.get("listingId"),
"quantity": offers.get("quantity"),
"condition": offers.get("condition"),
"fulfillment_channel": offers.get("fulfillmentChannel"),
"item_offers": [self._parse_item_offer(offer) for offer in offers.get("itemOffers", [])]
}
def _parse_item_offer(self, offer):
"""解析单个销售信息"""
return {
"seller_id": offer.get("sellerId"),
"price": self._parse_price_component(offer.get("price")),
"shipping": self._parse_price_component(offer.get("shipping")),
"is_buy_box_winner": offer.get("isBuyBoxWinner"),
"condition": offer.get("condition"),
"fulfillment_channel": offer.get("fulfillmentChannel")
}
六、主程序调用示例
python
# main.py
from services.product_service import ProductService
import json
import os
from dotenv import load_dotenv
load_dotenv()
def main():
    """Fetch one product and a small batch, print a summary and save both as JSON.

    Output files are written to the current working directory:
    ``<asin>_details.json`` and ``batch_products_details.json``.
    """
    # Set up the service
    product_service = ProductService()

    # Single product lookup
    asin = "B08L5TNJHG"  # sample ASIN
    print(f"获取ASIN为 {asin} 的商品详情...")
    product_data = product_service.get_product_details(asin)
    parsed_product = product_service.parse_product_data(product_data.get("payload", {}))
    if parsed_product:
        print(f"商品标题: {parsed_product['title']}")
        # The buying price is None when the document carries no such price,
        # so it is checked before being subscripted.
        buying_price = (parsed_product.get("prices") or {}).get("buying_price")
        if buying_price:
            print(f"当前价格: {buying_price['amount']} {buying_price['currency_code']}")
        else:
            print("当前价格: 暂无")
        # Save to a file
        with open(f"{asin}_details.json", "w", encoding="utf-8") as f:
            json.dump(parsed_product, f, ensure_ascii=False, indent=2)
        print(f"商品详情已保存到 {asin}_details.json")

    # Batch lookup
    asins = ["B08L5TNJHG", "B07Y27597C", "B083F98L8M"]  # sample ASIN list
    print("\n批量获取商品详情...")
    batch_products = product_service.batch_get_product_details(asins)
    parsed_products = []
    for product in batch_products:
        parsed = product_service.parse_product_data(product)
        if parsed:
            parsed_products.append(parsed)
            print(f"已获取: {parsed['title']}")

    # Save the batch result
    with open("batch_products_details.json", "w", encoding="utf-8") as f:
        json.dump(parsed_products, f, ensure_ascii=False, indent=2)
    print("批量商品详情已保存到 batch_products_details.json")


if __name__ == "__main__":
    main()
七、配置文件示例
python
# config/config.py
import os
from dotenv import load_dotenv
load_dotenv()
class Config:
    """Settings read from environment variables when the class is defined."""

    # --- Amazon API credentials ---
    CLIENT_ID = os.environ.get("CLIENT_ID")
    CLIENT_SECRET = os.environ.get("CLIENT_SECRET")
    REFRESH_TOKEN = os.environ.get("REFRESH_TOKEN")
    AWS_ACCESS_KEY = os.environ.get("AWS_ACCESS_KEY")
    AWS_SECRET_KEY = os.environ.get("AWS_SECRET_KEY")
    SELLER_ID = os.environ.get("SELLER_ID")
    REGION = os.environ.get("REGION", "us-east-1")

    # --- Application settings ---
    LOG_LEVEL = os.environ.get("LOG_LEVEL", "INFO")
    CACHE_DIR = os.environ.get("CACHE_DIR", "./cache")
    # Numeric settings are converted to int whether they come from the
    # environment (strings) or from the fallback value.
    RETRY_COUNT = int(os.environ.get("RETRY_COUNT", 3))
    RETRY_DELAY = int(os.environ.get("RETRY_DELAY", 5))
ini
# .env 文件示例
CLIENT_ID=amzn1.application-oa2-client.your_client_id_here
CLIENT_SECRET=your_client_secret_here
REFRESH_TOKEN=your_refresh_token_here
AWS_ACCESS_KEY=your_aws_access_key_here
AWS_SECRET_KEY=your_aws_secret_key_here
SELLER_ID=your_seller_id_here
REGION=us-east-1
LOG_LEVEL=INFO
CACHE_DIR=./cache
RETRY_COUNT=3
RETRY_DELAY=5
八、API 调用注意事项
1. 速率限制
亚马逊 SP-API 有严格的速率限制,不同 API 端点的限制不同:
- 商品详情 API:通常为每秒 10 次请求
- 超出限制会返回 429 状态码
- 建议实现指数退避重试机制
2. 错误处理
常见错误及处理方法:
- 401 Unauthorized:刷新 AccessToken 并重试
- 429 Too Many Requests:等待指定时间后重试
- 500 Internal Server Error:记录错误并稍后重试
3. 数据使用合规性
- 仅用于个人或商业分析目的
- 禁止爬取非公开数据
- 遵守亚马逊 API 使用条款
- 不要过度频繁请求
九、扩展与优化建议
- 添加缓存机制:使用 Redis 或文件系统缓存已获取的数据,减少 API 调用次数
- 实现异步请求:使用 asyncio 和 aiohttp 库实现异步请求,提高并发性能
- 数据存储:将数据存入数据库(如 MySQL、MongoDB)便于长期分析
- 监控与告警:添加请求成功率监控和异常告警机制
- 分布式部署:使用分布式架构扩展系统规模,应对大量请求
通过以上步骤,你可以构建一个完整、高效的亚马逊商品详情数据采集系统,满足实时监控和分析需求。在实际应用中,根据具体业务场景进行适当调整和优化,确保系统稳定可靠运行。