Python采集京东商品评论,京东API接口系列

Python采集京东商品评论指南

由于京东没有提供无需授权即可直接采集商品评论的公开接口,我们需要通过以下几种合法途径获取数据:

方法一:使用京东开放平台API(推荐)

京东开放平台提供了部分API接口,需要申请开发者权限:

|-------------------------------------------------------------|
| import requests |
| import json |
| |
| # 京东开放平台API示例(需替换为你的实际参数) |
def get_jd_comments(api_key, api_secret, sku_id, page=1,
                    access_token="your_access_token", page_size=10,
                    timeout=10):
    """Fetch one page of product comments through the JD Open Platform router.

    This is a template: replace the credentials with your own values. The
    request is signed by ``generate_jd_sign(params, api_secret)``, which is
    NOT defined in this listing -- implement it according to the signing
    rules in the JD Open Platform documentation before calling this function.

    Args:
        api_key: Application key, sent as ``app_key``.
        api_secret: Application secret, passed only to ``generate_jd_sign``.
        sku_id: SKU whose comments are requested.
        page: Page number to request.
        access_token: OAuth access token. The default is a placeholder and
            must be replaced with a token obtained through OAuth.
        page_size: Number of comments requested per page.
        timeout: Seconds to wait for the HTTP response before giving up.

    Returns:
        The decoded JSON body of the response.
    """
    from datetime import datetime, timedelta, timezone

    url = "https://api.jd.com/routerjson"

    # The timestamp must describe the moment of the call; a fixed literal
    # would make every signed request stale.
    # NOTE(review): format "yyyy-MM-dd HH:mm:ss" in GMT+8 follows JD's
    # system-parameter documentation -- confirm against the current docs.
    gmt_plus_8 = timezone(timedelta(hours=8))
    timestamp = datetime.now(gmt_plus_8).strftime("%Y-%m-%d %H:%M:%S")

    params = {
        "method": "jingdong.ware.productcomment.get",
        "app_key": api_key,
        "access_token": access_token,
        "timestamp": timestamp,
        "format": "json",
        "v": "2.0",
        "sign_method": "md5",
        "param_json": json.dumps({
            "skuId": sku_id,
            "page": page,
            "pageSize": page_size,
        }),
    }

    # The signature covers every parameter above, so it is computed last.
    params["sign"] = generate_jd_sign(params, api_secret)

    # A timeout keeps the caller from blocking forever on a stalled connection.
    response = requests.get(url, params=params, timeout=timeout)
    return response.json()

方法二:网页爬取(需遵守robots.txt)

|----------------------------------------------------------------------------------------------------------------------------------------|
| import requests |
| from bs4 import BeautifulSoup |
| import time |
| import random |
| |
def get_jd_comments_web(product_id, page=0, timeout=10):
    """Fetch one page of comments for a product from JD's web comment endpoint.

    The endpoint is asked for a JSONP response (``callback(<json>);``); the
    wrapper is removed before the payload is decoded.

    Args:
        product_id: Product ID as it appears in the item page URL.
        page: Zero-based page index.
        timeout: Seconds to wait for the HTTP response before giving up.

    Returns:
        The list stored under ``"comments"`` in the response, or an empty
        list if the request or the parsing fails (the error is printed).
    """
    # Local import: the import block of this listing does not include json,
    # so without it json.loads raised NameError and every call returned [].
    import json

    callback = 'fetchJSON_comment98'
    headers = {
        'User-Agent': 'Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/91.0.4472.124 Safari/537.36',
        'Referer': f'https://item.jd.com/{product_id}.html'
    }

    url = "https://club.jd.com/comment/productPageComments.action"
    params = {
        'callback': callback,
        'productId': product_id,
        'score': 0,  # 0 = all comments, 1-5 = the matching star rating
        'sortType': 5,  # sort order
        'page': page,
        'pageSize': 10,
        'isShadowSku': 0,
        'fold': 1
    }

    try:
        response = requests.get(url, headers=headers, params=params,
                                timeout=timeout)
        response.raise_for_status()
        data = json.loads(_unwrap_jsonp(response.text, callback))
        return data['comments']
    except (requests.RequestException, ValueError, KeyError, TypeError) as e:
        # Best effort: report the problem and hand back an empty page.
        print(f"Error fetching comments: {e}")
        return []


def _unwrap_jsonp(text, callback):
    """Return the JSON payload of ``callback(<json>);``.

    Text that does not start with the callback wrapper is returned stripped
    but otherwise untouched, so a plain-JSON response still parses. A missing
    trailing semicolon and surrounding whitespace are tolerated.
    """
    payload = text.strip()
    prefix = callback + '('
    if payload.startswith(prefix):
        payload = payload[len(prefix):].rstrip(';').rstrip()
        if payload.endswith(')'):
            payload = payload[:-1]
    return payload
| |
# Usage example: fetch the first page of comments and show the first five.
product_id = "100012014970"  # sample product ID
comments = get_jd_comments_web(product_id, page=0)
# No sleep here: the comments are already downloaded, so pausing between
# prints would not reduce load on the server. When fetching several pages,
# put a random delay (e.g. time.sleep(random.uniform(1, 3))) between the
# successive get_jd_comments_web() calls instead.
for comment in comments[:5]:
    print(f"用户: {comment['nickname']}")
    print(f"评分: {comment['score']}")
    print(f"内容: {comment['content']}\n")

方法三:使用第三方数据服务

有些第三方数据服务提供商(如八爪鱼、集搜客等)提供京东评论采集服务,可以通过他们的API获取数据。

注意事项

  1. 合规性

    • 遵守京东robots.txt规定
    • 不要高频请求,建议添加随机延迟
    • 仅用于个人学习研究,不可商用
  2. 反爬机制

    • 京东有反爬措施,可能需要处理验证码
    • 建议使用会话(session)保持连接
    • 可考虑使用selenium模拟浏览器行为
  3. 数据存储

    |----------------------------------------------------------|
    | import pandas as pd |
    | |
    def save_comments_to_csv(comments, filename):
        """Write the collected comments to a CSV file.

        Args:
            comments: Comment records accepted by ``pd.DataFrame`` (for
                example a list of dicts, one per comment).
            filename: Path of the CSV file to create or overwrite.
        """
        df = pd.DataFrame(comments)
        # utf_8_sig writes a byte-order mark, which spreadsheet tools such as
        # Excel typically need in order to detect UTF-8 (and show Chinese text).
        df.to_csv(filename, index=False, encoding='utf_8_sig')

  4. API限制

    • 网页版API通常只能获取前100页评论
    • 部分评论可能被折叠,需要调整请求参数才能获取(例如上文请求中的 fold 参数,其具体取值含义请以实际返回结果为准)

完整示例(带异常处理)

|----------------------------------------------------------------------------------------------------------------------------------------|
| import requests |
| import json |
| import time |
| import random |
| from tqdm import tqdm # 进度条 |
| |
class JDCommentScraper:
    """Collect product comments from JD's web comment endpoint, page by page.

    One ``requests.Session`` is reused for all requests made by an instance.
    """

    COMMENT_URL = "https://club.jd.com/comment/productPageComments.action"
    JSONP_CALLBACK = 'fetchJSON_comment98'

    def __init__(self, delay_range=(1, 3), timeout=10):
        """Create the HTTP session and the default request headers.

        Args:
            delay_range: ``(min, max)`` seconds; a random pause from this
                range is inserted between two consecutive page requests.
            timeout: Seconds to wait for each HTTP response.
        """
        self.session = requests.Session()
        self.headers = {
            'User-Agent': 'Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/91.0.4472.124 Safari/537.36',
            'Referer': 'https://item.jd.com/'
        }
        self.delay_range = delay_range
        self.timeout = timeout

    @staticmethod
    def _parse_jsonp(text, callback='fetchJSON_comment98'):
        """Decode a ``callback(<json>);`` response body into Python data.

        A missing trailing semicolon and surrounding whitespace are
        tolerated. Text that does not start with the callback wrapper is
        decoded as plain JSON.

        Raises:
            json.JSONDecodeError: If the payload is not valid JSON.
        """
        payload = text.strip()
        prefix = callback + '('
        if payload.startswith(prefix):
            payload = payload[len(prefix):].rstrip(';').rstrip()
            if payload.endswith(')'):
                payload = payload[:-1]
        return json.loads(payload)

    def get_comments(self, product_id, max_pages=5):
        """Collect comments from up to *max_pages* pages.

        Paging stops early when a page has no comments, a response cannot be
        parsed, or the server answers with a status other than 200.

        Args:
            product_id: Product ID as it appears in the item page URL.
            max_pages: Maximum number of pages to request, starting at page 0.

        Returns:
            A list of the comment records gathered so far; if an error
            interrupts the run, the partial list is still returned.
        """
        all_comments = []
        # Everything except the page index is the same for every request.
        base_params = {
            'callback': self.JSONP_CALLBACK,
            'productId': product_id,
            'score': 0,
            'sortType': 5,
            'pageSize': 10,
            'isShadowSku': 0,
            'fold': 1
        }

        try:
            for page in tqdm(range(max_pages), desc="采集进度"):
                if page:
                    # Polite delay between requests only; nothing to wait
                    # for before the first page or after the last one.
                    time.sleep(random.uniform(*self.delay_range))

                response = self.session.get(
                    self.COMMENT_URL,
                    headers=self.headers,
                    params=dict(base_params, page=page),
                    timeout=self.timeout,
                )
                if response.status_code != 200:
                    print(f"请求失败,状态码: {response.status_code}")
                    break

                try:
                    data = self._parse_jsonp(response.text, self.JSONP_CALLBACK)
                except json.JSONDecodeError:
                    print(f"解析失败: {response.text[:100]}...")
                    break

                comments = data.get('comments', [])
                if not comments:
                    break
                all_comments.extend(comments)
        except Exception as e:
            # Best-effort boundary: report the error and keep what was
            # collected instead of losing it.
            print(f"采集过程中出错: {e}")

        return all_comments
| |
def main():
    """Usage example: scrape a few pages of comments and save them as CSV."""
    scraper = JDCommentScraper()
    product_id = "100012014970"  # replace with a real product ID
    comments = scraper.get_comments(product_id, max_pages=3)

    print(f"共采集到 {len(comments)} 条评论")
    if comments:
        # pandas is only needed when there is something to save.
        import pandas as pd
        df = pd.DataFrame(comments)
        df.to_csv(f"jd_comments_{product_id}.csv", index=False, encoding='utf_8_sig')
        print("评论已保存到CSV文件")


if __name__ == "__main__":
    main()

请务必遵守京东的使用条款,合理控制请求频率,避免对京东服务器造成过大压力。对于大规模数据采集需求,建议联系京东商务合作部门获取官方数据接口。