Python + 淘宝 API 开发：自动化采集商品数据的完整流程

在电商数据分析、竞品监控和市场调研等场景中，高效采集淘宝商品数据是关键环节。本文将详细介绍如何利用 Python 结合淘宝 API，构建一套自动化的商品数据采集系统，涵盖从 API 申请到数据存储的完整流程，并提供可直接运行的代码实现。

淘宝 API 基础

淘宝提供了标准化的 API 接口，相比网页爬虫具有稳定性高、合规性强、数据结构统一等优势。常用的商品数据相关 API 包括：

商品详情 API：获取商品基本信息、价格、库存等

商品搜索 API：按关键词、分类等条件搜索商品

店铺商品 API：获取指定店铺的所有商品

商品评价 API：获取商品的用户评价数据

使用淘宝 API 需遵守平台规范，注意调用频率限制和数据使用范围，避免违规操作导致账号封禁。

开发前的准备工作

账号注册
注册开发者账号
完成认证（个人或企业认证）
获取 App Key 和 App Secret
申请所需的 API 权限（如商品详情、商品搜索等）
开发环境搭建

推荐使用 Python 3.7+ 版本，需安装以下依赖库：

```bash
pip install top-api-sdk  # 淘宝API官方Python SDK
pip install pandas       # 数据处理
pip install pymysql      # 数据库连接
pip install python-dotenv # 环境变量管理
```

核心参数配置

创建.env文件存储敏感信息：

```ini
APP_KEY=你的AppKey
APP_SECRET=你的AppSecret
REDIRECT_URI=你的回调地址
MYSQL_HOST=localhost
MYSQL_PORT=3306
MYSQL_USER=root
MYSQL_PASSWORD=你的密码
MYSQL_DB=taobao_data
```

自动化采集系统设计

系统架构

本系统采用模块化设计，主要包含以下组件：

认证模块：处理 API 授权与令牌管理

采集模块：调用 API 获取商品数据

存储模块：将数据保存到数据库

调度模块：实现定时自动采集

日志模块：记录系统运行状态

数据采集流程

获取 API 访问令牌（AccessToken）
构造 API 请求参数（关键词、页码、排序方式等）
调用 API 并处理返回结果
数据清洗与格式转换
存储到数据库
实现增量采集与去重机制

完整代码实现

import logging
import os
import time
from datetime import datetime

import pandas as pd
import pymysql
from dotenv import load_dotenv
from pymysql.cursors import DictCursor

# NOTE(review): TbkItemGetRequest, TbkItemDetailGetRequest and appinfo are
# used below but their import lines are not part of this listing — add them
# according to the installed Taobao SDK's documentation.

# Load environment variables from the .env file.
load_dotenv()

# Configure logging: INFO and above go to taobao_collector.log.
logging.basicConfig(
    level=logging.INFO,
    format='%(asctime)s - %(name)s - %(levelname)s - %(message)s',
    filename='taobao_collector.log',
)

logger = logging.getLogger('taobao_collector')

class TaobaoAPICollector:

def __init__(self):
    """Initialise the Taobao API collector.

    Reads the API credentials and MySQL settings from environment
    variables, opens the database connection, creates the tables when they
    do not exist yet, and sets the paging / throttling defaults.

    :raises Exception: whatever ``_get_db_connection`` re-raises when the
        database connection cannot be opened.
    """
    self.app_key = os.getenv('APP_KEY')
    self.app_secret = os.getenv('APP_SECRET')
    self.redirect_uri = os.getenv('REDIRECT_URI')

    # Database connection settings.
    self.db_config = {
        'host': os.getenv('MYSQL_HOST'),
        # int(None) raises TypeError, so fall back to 3306 (the value used
        # in the sample .env) when MYSQL_PORT is unset or empty.
        'port': int(os.getenv('MYSQL_PORT') or 3306),
        'user': os.getenv('MYSQL_USER'),
        'password': os.getenv('MYSQL_PASSWORD'),
        'db': os.getenv('MYSQL_DB'),
        'charset': 'utf8mb4',
    }

    # Open the database connection and make sure the schema exists.
    self.db_conn = self._get_db_connection()
    self._create_tables()

    # API call settings.
    self.page_size = 20  # products per page
    self.max_pages = 5  # maximum number of pages to collect
    self.request_interval = 2  # pause between API requests, in seconds

def _get_db_connection(self):
    """Open a MySQL connection using the settings in ``self.db_config``.

    :return: a pymysql connection whose cursors are ``DictCursor`` instances
    :raises Exception: any error raised while connecting is logged and
        re-raised unchanged
    """
    try:
        cfg = self.db_config
        conn = pymysql.connect(
            host=cfg['host'],
            port=cfg['port'],
            user=cfg['user'],
            password=cfg['password'],
            # ``db`` is a deprecated alias in PyMySQL; ``database`` is the
            # supported keyword for the same value.
            database=cfg['db'],
            charset=cfg['charset'],
            cursorclass=DictCursor,
        )
        logger.info("数据库连接成功")
        return conn
    except Exception as e:
        logger.error(f"数据库连接失败: {e}")
        raise

def _create_tables(self):
    """Create the tables used by the collector if they do not exist.

    Three tables are created: ``products``, ``product_details`` and
    ``collection_tasks``. Any failure is logged and followed by a rollback;
    no exception propagates to the caller.
    """
    ddl_statements = (
        # Product table.
        '''
        CREATE TABLE IF NOT EXISTS products (
            id BIGINT PRIMARY KEY COMMENT ' 商品 ID',
            title VARCHAR (255) COMMENT ' 商品标题 ',
            cat_name VARCHAR (100) COMMENT ' 商品分类 ',
            price DECIMAL (10,2) COMMENT ' 商品价格 ',
            sales INT COMMENT ' 销量 ',
            stock INT COMMENT ' 库存 ',
            nick VARCHAR (100) COMMENT ' 卖家昵称 ',
            shop_title VARCHAR (100) COMMENT ' 店铺名称 ',
            pic_url VARCHAR (255) COMMENT ' 商品主图 URL',
            detail_url VARCHAR (255) COMMENT ' 商品详情页 URL',
            created_time DATETIME COMMENT ' 商品创建时间 ',
            collected_time DATETIME COMMENT ' 采集时间 ',
            updated_time DATETIME COMMENT ' 更新时间 '
        ) ENGINE=InnoDB DEFAULT CHARSET=utf8mb4 COMMENT=' 淘宝商品表 ';
        ''',
        # Product detail table. DESC is a reserved word in MySQL, so the
        # column name must be quoted with backticks or the statement fails
        # with a syntax error.
        '''
        CREATE TABLE IF NOT EXISTS product_details (
            product_id BIGINT PRIMARY KEY COMMENT ' 商品 ID',
            `desc` TEXT COMMENT ' 商品描述 ',
            props TEXT COMMENT ' 商品属性 ',
            sku TEXT COMMENT 'SKU 信息 ',
            collected_time DATETIME COMMENT ' 采集时间 ',
            updated_time DATETIME COMMENT ' 更新时间 '
        ) ENGINE=InnoDB DEFAULT CHARSET=utf8mb4 COMMENT=' 淘宝商品详情表 ';
        ''',
        # Collection task table.
        '''
        CREATE TABLE IF NOT EXISTS collection_tasks (
            id INT AUTO_INCREMENT PRIMARY KEY,
            keyword VARCHAR (100) COMMENT ' 搜索关键词 ',
            page INT COMMENT ' 采集页数 ',
            status ENUM ('pending', 'running', 'completed', 'failed') DEFAULT 'pending' COMMENT ' 任务状态 ',
            start_time DATETIME COMMENT ' 开始时间 ',
            end_time DATETIME COMMENT ' 结束时间 ',
            total_products INT COMMENT ' 采集商品总数 ',
            comment VARCHAR (255) COMMENT ' 备注 '
        ) ENGINE=InnoDB DEFAULT CHARSET=utf8mb4 COMMENT=' 采集任务表 ';
        ''',
    )
    try:
        with self.db_conn.cursor() as cursor:
            for ddl in ddl_statements:
                cursor.execute(ddl)
        self.db_conn.commit()
        logger.info("数据库表创建 / 检查完成")
    except Exception as e:
        logger.error(f"创建数据库表失败: {e}")
        self.db_conn.rollback()

def _search_products(self, keyword, page=1):
    """Search products by keyword through the API.

    :param keyword: search keyword
    :param page: page number to request
    :return: list of product items from the response; an empty list when
        the response carries no items or the call fails (the failure is
        logged)
    """
    try:
        request = TbkItemGetRequest()
        request.set_app_info(appinfo(self.app_key, self.app_secret))

        # Request parameters.
        request.keyword = keyword
        request.page_no = page
        request.page_size = self.page_size
        request.platform = 2  # 1: PC, 2: wireless
        request.sort = "sale_desc"  # sort by sales, descending

        # Call the API.
        response = request.getResponse()

        # Unwrap the nested payload. A response without "n_tbk_item" means
        # "no items on this page" and yields an empty list instead of a
        # KeyError that would be reported as an API failure.
        if "tbk_item_get_response" in response:
            payload = response["tbk_item_get_response"]
            if "results" in payload:
                results = payload["results"] or {}
                return results.get("n_tbk_item") or []
        return []
    except Exception as e:
        logger.error(f"搜索商品失败 (keyword: {keyword}, page: {page}): {e}")
        return []

def _get_product_detail(self, product_id):
    """Fetch the detail record of one product through the API.

    :param product_id: product ID, sent as ``num_iid``
    :return: the ``data`` member of the response, or ``None`` when the
        response has no such member or the call fails (the failure is
        logged)
    """
    try:
        request = TbkItemDetailGetRequest()
        request.set_app_info(appinfo(self.app_key, self.app_secret))
        request.num_iid = product_id

        # Call the API.
        response = request.getResponse()

        if "tbk_item_detail_get_response" in response:
            payload = response["tbk_item_detail_get_response"]
            if "data" in payload:
                return payload["data"]
        return None
    except Exception as e:
        logger.error(f"获取商品详情失败 (id: {product_id}): {e}")
        return None

def _save_products(self, products):
    """Insert new products and update the ones already stored.

    Every item must provide ``num_iid``, ``title``, ``zk_final_price`` and
    ``pict_url``; items that are not stored yet also need ``nick`` and
    ``item_url``. The whole batch is committed at once; on any error it is
    rolled back.

    :param products: list of product dicts
    :return: number of products inserted or updated, or 0 when the list is
        empty or the batch failed
    """
    if not products:
        return 0

    update_sql = '''
        UPDATE products SET
            title = %s, price = %s, sales = %s, stock = %s,
            pic_url = %s, updated_time = %s
        WHERE id = %s
    '''
    insert_sql = '''
        INSERT INTO products (
            id, title, cat_name, price, sales, stock, nick, shop_title,
            pic_url, detail_url, created_time, collected_time, updated_time
        ) VALUES (%s, %s, %s, %s, %s, %s, %s, %s, %s, %s, %s, %s, %s)
    '''

    try:
        with self.db_conn.cursor() as cursor:
            count = 0
            # One timestamp for the whole batch, so created_time,
            # collected_time and updated_time of a new row are identical.
            now = datetime.now()

            for item in products:
                product_id = item['num_iid']

                # Check whether the product is already stored.
                cursor.execute("SELECT id FROM products WHERE id = %s", (product_id,))
                exists = cursor.fetchone() is not None

                if exists:
                    # Update the existing row.
                    cursor.execute(update_sql, (
                        item['title'], item['zk_final_price'], item.get('sales', 0),
                        item.get('stock', 0), item['pict_url'], now, product_id,
                    ))
                else:
                    # Insert a new row.
                    cursor.execute(insert_sql, (
                        product_id, item['title'], item.get('cat_name', ''),
                        item['zk_final_price'], item.get('sales', 0), item.get('stock', 0),
                        item['nick'], item.get('shop_title', ''), item['pict_url'],
                        item['item_url'], now, now, now,
                    ))
                count += 1

        self.db_conn.commit()
        logger.info(f"成功保存 / 更新 {count} 个商品")
        return count
    except Exception as e:
        logger.error(f"保存商品失败: {e}")
        self.db_conn.rollback()
        return 0

def _save_product_details(self, product_id, detail):
    """Insert or update the detail row of one product.

    ``desc``, ``props`` and the extracted SKU list are stored as text:
    strings are stored unchanged, ``None`` as an empty string, and any
    other value as JSON so it can be parsed back later.

    :param product_id: product ID (primary key of ``product_details``)
    :param detail: detail dict; may contain ``desc``, ``props`` and
        ``sku`` (with a ``sku_map`` mapping)
    :return: ``True`` when the row was written and committed, ``False``
        when ``detail`` is empty or the write failed (rolled back)
    """
    import json

    if not detail:
        return False

    def to_text(value):
        # default=str is deliberate: it keeps the old "always storable"
        # behaviour for values JSON cannot encode natively.
        if value is None:
            return ""
        if isinstance(value, str):
            return value
        return json.dumps(value, ensure_ascii=False, default=str)

    try:
        with self.db_conn.cursor() as cursor:
            now = datetime.now()

            # Extract the SKU information.
            sku_info = []
            if "sku" in detail and "sku_map" in detail["sku"]:
                sku_info = [
                    {
                        "key": key,
                        "price": value.get("price", ""),
                        "stock": value.get("stock", 0),
                        "props": value.get("props", ""),
                    }
                    for key, value in detail["sku"]["sku_map"].items()
                ]

            desc_text = to_text(detail.get("desc", ""))
            props_text = to_text(detail.get("props", ""))
            sku_text = to_text(sku_info)

            # Check whether a detail row already exists.
            cursor.execute(
                "SELECT product_id FROM product_details WHERE product_id = %s",
                (product_id,),
            )
            exists = cursor.fetchone() is not None

            # DESC is a reserved word in MySQL; the column name has to be
            # quoted with backticks in both statements.
            if exists:
                # Update the existing row.
                sql = '''
                    UPDATE product_details SET
                        `desc` = %s, props = %s, sku = %s, updated_time = %s
                    WHERE product_id = %s
                '''
                cursor.execute(sql, (desc_text, props_text, sku_text, now, product_id))
            else:
                # Insert a new row.
                sql = '''
                    INSERT INTO product_details (
                        product_id, `desc`, props, sku, collected_time, updated_time
                    ) VALUES (%s, %s, %s, %s, %s, %s)
                '''
                cursor.execute(sql, (product_id, desc_text, props_text, sku_text, now, now))

        self.db_conn.commit()
        return True
    except Exception as e:
        logger.error(f"保存商品详情失败 (id: {product_id}): {e}")
        self.db_conn.rollback()
        return False

def collect_by_keyword(self, keyword, collect_details=True):
    """Collect product data for a search keyword, page by page.

    A task record is created first. Up to ``self.max_pages`` pages are
    then searched and saved, stopping early at the first empty page. The
    task record is finally marked ``completed`` or ``failed``.

    :param keyword: search keyword
    :param collect_details: whether to also fetch and store the detail
        record of every product found
    :return: total number of products saved; 0 when the task record could
        not be created
    """
    logger.info(f"开始按关键词采集: {keyword}")

    # Create the collection task record.
    task_id = self._create_task(keyword)
    if not task_id:
        logger.error("创建采集任务失败")
        return 0

    total_count = 0
    try:
        # Page-by-page collection.
        for page in range(1, self.max_pages + 1):
            logger.info(f"采集第 {page} 页，关键词: {keyword}")

            # Search products.
            products = self._search_products(keyword, page)
            if not products:
                logger.info(f"第 {page} 页没有获取到商品数据，停止采集")
                break

            # Save the product rows.
            total_count += self._save_products(products)

            # Collect the product details.
            if collect_details:
                for product in products:
                    product_id = product['num_iid']
                    logger.info(f"采集商品详情: {product_id}")

                    detail = self._get_product_detail(product_id)
                    if detail:
                        self._save_product_details(product_id, detail)

                    # Throttle between detail requests.
                    time.sleep(self.request_interval)

            # Throttle between search pages.
            time.sleep(self.request_interval)

        # Mark the task as finished.
        self._update_task(task_id, "completed", total_count)
        logger.info(f"关键词 {keyword} 采集完成，共采集 {total_count} 个商品")
        return total_count
    except Exception as e:
        logger.error(f"采集过程出错: {e}")
        self._update_task(task_id, "failed", total_count, str(e))
        return total_count

def _create_task(self, keyword):
    """Insert a collection task record with status ``running``.

    :param keyword: search keyword the task is collecting
    :return: the new row's auto-increment ID, or ``None`` when the insert
        failed (the failure is logged and rolled back)
    """
    sql = '''
        INSERT INTO collection_tasks (
            keyword, page, status, start_time
        ) VALUES (%s, %s, %s, %s)
    '''
    try:
        with self.db_conn.cursor() as cursor:
            cursor.execute(sql, (keyword, self.max_pages, 'running', datetime.now()))
            # Read the generated ID while the cursor is still open.
            task_id = cursor.lastrowid
        self.db_conn.commit()
        return task_id
    except Exception as e:
        logger.error(f"创建任务记录失败: {e}")
        self.db_conn.rollback()
        return None

def _update_task(self, task_id, status, total_products=0, comment=""):
    """Update status, end time, product count and comment of a task.

    Failures are logged and rolled back; no exception propagates.

    :param task_id: ID of the ``collection_tasks`` row
    :param status: new task status
    :param total_products: number of products collected so far
    :param comment: free-text note, e.g. an error message
    """
    sql = '''
        UPDATE collection_tasks SET
            status = %s, end_time = %s,
            total_products = %s, comment = %s
        WHERE id = %s
    '''
    # The comment column is VARCHAR (255); a longer error text could make
    # the status update itself fail, so it is truncated to fit.
    comment = (comment or "")[:255]
    try:
        with self.db_conn.cursor() as cursor:
            cursor.execute(sql, (status, datetime.now(), total_products, comment, task_id))
        self.db_conn.commit()
    except Exception as e:
        logger.error(f"更新任务状态失败: {e}")
        self.db_conn.rollback()

def export_to_excel (self, keyword, filename=None):

"""将采集的商品数据导出为 Excel"""

try:

with self.db_conn.cursor () as cursor:

搜索包含关键词的商品

sql = "SELECT * FROM products WHERE title LIKE %s LIMIT 1000"

cursor.execute(sql, (f'%{keyword}%',))

products = cursor.fetchall()

if not products:

logger.info(f"没有找到包含关键词 {keyword} 的商品数据")

return False

转换为 DataFrame

df = pd.DataFrame(products)

生成文件名

if not filename:

filename = f"taobao_products_{keyword}_{datetime.now().strftime