Python采集京东商品评论指南
由于京东没有公开的官方API供直接采集商品评论,我们需要通过以下几种合法途径实现数据获取:
方法一:使用京东开放平台API(推荐)
京东开放平台提供了部分API接口,需要申请开发者权限:
|-------------------------------------------------------------|
| import requests
|
| import json
|
| |
| # 京东开放平台API示例(需替换为你的实际参数)
|
def get_jd_comments(api_key, api_secret, sku_id, page=1,
                    access_token="your_access_token", timeout=10):
    """Request one page of product comments from the JD Open Platform router.

    Args:
        api_key: Application key sent as ``app_key``.
        api_secret: Application secret, handed to ``generate_jd_sign`` to
            sign the request.
        sku_id: SKU whose comments are requested.
        page: Page number to request (defaults to 1).
        access_token: OAuth access token. The default is only a placeholder
            and must be replaced with a real token.
        timeout: Seconds to wait for the HTTP response before giving up.

    Returns:
        The decoded JSON body of the response.

    Raises:
        requests.RequestException: On connection failure or timeout.
        ValueError: If the response body is not valid JSON.
    """
    # Local import so this snippet stays runnable with its existing header.
    from datetime import datetime

    url = "https://api.jd.com/routerjson"
    params = {
        "method": "jingdong.ware.productcomment.get",
        "app_key": api_key,
        "access_token": access_token,  # must be obtained through OAuth
        # Generated per call; a frozen timestamp goes stale immediately.
        # NOTE(review): digit layout mirrors the original sample value —
        # confirm the exact format and time zone against the JD docs.
        "timestamp": datetime.now().strftime("%Y%m%d%H%M%S"),
        "format": "json",
        "v": "2.0",
        "sign_method": "md5",
        "param_json": json.dumps({
            "skuId": sku_id,
            "page": page,
            "pageSize": 10,
        }),
    }

    # generate_jd_sign is not provided here; it has to be implemented
    # according to the JD Open Platform signing rules.
    params["sign"] = generate_jd_sign(params, api_secret)

    # Without a timeout, requests may block indefinitely on a stalled server.
    response = requests.get(url, params=params, timeout=timeout)
    return response.json()
|
| |
| # 注意:签名生成算法需要参考京东开放平台文档实现
|
方法二:网页爬取(需遵守robots.txt)
|----------------------------------------------------------------------------------------------------------------------------------------|
| import requests
|
| from bs4 import BeautifulSoup
|
| import time
|
| import random
|
| |
def get_jd_comments_web(product_id, page=0, timeout=10):
    """Fetch one page of comments from JD's web comment endpoint.

    Args:
        product_id: Product ID used for both the query and the Referer header.
        page: Zero-based page index (defaults to 0).
        timeout: Seconds to wait for the HTTP response before giving up.

    Returns:
        The list stored under ``comments`` in the response, or an empty list
        when the request or the parsing fails (the error is printed).
    """
    # Local import: this snippet's import header does not bring in json, so
    # the original failed with NameError and always returned [].
    import json

    callback = 'fetchJSON_comment98'
    headers = {
        'User-Agent': 'Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/91.0.4472.124 Safari/537.36',
        'Referer': f'https://item.jd.com/{product_id}.html',
    }
    url = "https://club.jd.com/comment/productPageComments.action"
    params = {
        'callback': callback,
        'productId': product_id,
        'score': 0,       # 0 = all ratings, 1-5 = that star level
        'sortType': 5,    # sort order
        'page': page,
        'pageSize': 10,
        'isShadowSku': 0,
        'fold': 1,
    }

    try:
        response = requests.get(url, headers=headers, params=params,
                                timeout=timeout)
        body = response.text.strip()
        # Unwrap the JSONP envelope by locating its parentheses instead of
        # assuming the body always ends with exactly ");".
        prefix = callback + '('
        if body.startswith(prefix):
            body = body[len(prefix):body.rfind(')')]
        data = json.loads(body)
        return data['comments']
    except (requests.RequestException, ValueError, KeyError, TypeError) as e:
        # Best-effort contract: report the problem and hand back no comments.
        print(f"Error fetching comments: {e}")
        return []
|
| |
| # 使用示例
|
# Demo: fetch the first page for a sample product and show a few comments.
product_id = "100012014970"  # sample product ID
comments = get_jd_comments_web(product_id, page=0)

for comment in comments[:5]:  # only the first five comments are printed
    # .get() keeps the demo alive when a record lacks one of the fields.
    print(f"用户: {comment.get('nickname', '')}")
    print(f"评分: {comment.get('score', '')}")
    print(f"内容: {comment.get('content', '')}\n")

# Polite delay: pause once after the request so that any follow-up request
# is spaced out. Sleeping between prints would not throttle the server load.
time.sleep(random.uniform(1, 3))
|
方法三:使用第三方数据服务
有些第三方数据服务提供商(如八爪鱼、集搜客等)提供京东评论采集服务,可以通过他们的API获取数据。
注意事项
- 合规性:
- 遵守京东robots.txt规定
- 不要高频请求,建议添加随机延迟
- 仅用于个人学习研究,不可商用
- 反爬机制:
- 京东有反爬措施,可能需要处理验证码
- 建议使用会话(session)保持连接
- 可考虑使用selenium模拟浏览器行为
- 数据存储:
|----------------------------------------------------------|
|import pandas as pd
|
| |
def save_comments_to_csv(comments, filename, encoding='utf_8_sig'):
    """Write a collection of comment records to a CSV file.

    Args:
        comments: Records accepted by ``pandas.DataFrame`` (for example a
            list of dicts, one per comment).
        filename: Destination path of the CSV file.
        encoding: Text encoding of the output. The default ``utf_8_sig``
            prepends a UTF-8 byte-order mark to the file.
    """
    df = pd.DataFrame(comments)
    # index=False leaves the DataFrame's row index out of the file.
    df.to_csv(filename, index=False, encoding=encoding)
- API限制:
- 网页版API通常只能获取前100页评论
- 重要评论可能被折叠,需要额外参数获取
完整示例(带异常处理)
|----------------------------------------------------------------------------------------------------------------------------------------|
| import requests
|
| import json
|
| import time
|
| import random
|
| from tqdm import tqdm # 进度条
|
| |
class JDCommentScraper:
    """Collect product comments page by page from JD's web comment endpoint.

    A single ``requests.Session`` is reused for all requests made by one
    scraper instance.
    """

    COMMENT_URL = "https://club.jd.com/comment/productPageComments.action"
    CALLBACK = 'fetchJSON_comment98'

    def __init__(self, timeout=10):
        """Create the HTTP session and the default request headers.

        Args:
            timeout: Seconds to wait for each HTTP response.
        """
        self.session = requests.Session()
        self.timeout = timeout
        self.headers = {
            'User-Agent': 'Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/91.0.4472.124 Safari/537.36',
            'Referer': 'https://item.jd.com/',
        }

    @staticmethod
    def _parse_jsonp(text, callback='fetchJSON_comment98'):
        """Decode a JSONP (or plain JSON) response body.

        The ``callback(...)`` wrapper is removed by locating its parentheses,
        so a missing trailing semicolon or surrounding whitespace does not
        corrupt the payload.

        Raises:
            ValueError: If the wrapper is unterminated or the payload is not
                valid JSON.
        """
        body = text.strip()
        prefix = callback + '('
        if body.startswith(prefix):
            closing = body.rfind(')')
            if closing < len(prefix):
                raise ValueError("unterminated JSONP wrapper")
            body = body[len(prefix):closing]
        return json.loads(body)

    def get_comments(self, product_id, max_pages=5):
        """Fetch up to ``max_pages`` pages of comments for one product.

        Paging stops early on a non-200 status, an unparsable body, or a page
        without comments. Whatever was collected so far is always returned.

        Args:
            product_id: Product ID to query.
            max_pages: Maximum number of pages to request (defaults to 5).

        Returns:
            A list with the comment records of all pages fetched.
        """
        all_comments = []
        try:
            for page in tqdm(range(max_pages), desc="采集进度"):
                params = {
                    'callback': self.CALLBACK,
                    'productId': product_id,
                    'score': 0,
                    'sortType': 5,
                    'page': page,
                    'pageSize': 10,
                    'isShadowSku': 0,
                    'fold': 1,
                }
                # A timeout keeps one stalled request from hanging the run.
                response = self.session.get(
                    self.COMMENT_URL,
                    headers=self.headers,
                    params=params,
                    timeout=self.timeout,
                )
                if response.status_code != 200:
                    print(f"请求失败,状态码: {response.status_code}")
                    break

                try:
                    data = self._parse_jsonp(response.text, self.CALLBACK)
                except ValueError:
                    print(f"解析失败: {response.text[:100]}...")
                    break

                # Treat an unexpected (non-object) payload as "no comments".
                comments = data.get('comments', []) if isinstance(data, dict) else []
                if not comments:
                    break
                all_comments.extend(comments)

                time.sleep(random.uniform(1, 3))  # polite delay between pages
        except Exception as e:
            # Top-level boundary: report the failure but keep partial results.
            print(f"采集过程中出错: {e}")
        return all_comments
|
| |
| # 使用示例
|
if __name__ == "__main__":
    # Demo run: scrape a few pages for one product and export them to CSV.
    product_id = "100012014970"  # replace with a real product ID
    scraper = JDCommentScraper()
    comments = scraper.get_comments(product_id, max_pages=3)

    print(f"共采集到 {len(comments)} 条评论")

    if comments:
        # pandas is only needed when there is something to export.
        import pandas as pd

        output_path = f"jd_comments_{product_id}.csv"
        pd.DataFrame(comments).to_csv(
            output_path, index=False, encoding='utf_8_sig'
        )
        print("评论已保存到CSV文件")
|
请务必遵守京东的使用条款,合理控制请求频率,避免对京东服务器造成过大压力。对于大规模数据采集需求,建议联系京东商务合作部门获取官方数据接口。