1. Technical Feasibility Analysis
- API existence check
  - Xiaohongshu does not publish an official comments API; the endpoint has to be recovered through reverse engineering
  - Typical characteristics of the comment endpoint: see the Xiaohongshu Open Platform API documentation for reference
- Request parameter analysis
```python
# Typical request parameters
params = {
    "note_id": "64a1b2c3d4e5f6g7h8i9j0k1",  # note ID
    "sort": "newest",                        # sort order
    "page": 1,                               # page number
    "page_size": 20                          # comments per page
}
```
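As a quick sanity check, these parameters serialize into an ordinary query string; the endpoint path below is the one used by the implementation in the next section:

```python
from urllib.parse import urlencode

base_url = "https://www.xiaohongshu.com/api/sns/note/v1/comment_list"
query = urlencode({"note_id": "64a1b2c3d4e5f6g7h8i9j0k1", "sort": "newest",
                   "page": 1, "page_size": 20})
print(f"{base_url}?{query}")
# https://www.xiaohongshu.com/api/sns/note/v1/comment_list?note_id=64a1b2c3d4e5f6g7h8i9j0k1&sort=newest&page=1&page_size=20
```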
2. Complete Technical Implementation
```python
import csv
import time

import requests


class XiaohongshuCommentScraper:
    def __init__(self):
        self.session = requests.Session()
        self.headers = {
            "User-Agent": "Mozilla/5.0 (iPhone; CPU iPhone OS 16_4 like Mac OS X) AppleWebKit/605.1.15 (KHTML, like Gecko) Mobile/15E148",
            "X-Request-ID": "api.comment_list",
            "Content-Type": "application/json"
        }
        self.base_url = "https://www.xiaohongshu.com/api/sns/note/v1/comment_list"

    def get_comments(self, note_id, max_pages=5):
        comments = []
        for page in range(1, max_pages + 1):
            params = {
                "note_id": note_id,
                "sort": "newest",
                "page": page,
                "page_size": 20
            }
            try:
                # Pass the dict directly: requests encodes it itself,
                # so there is no need to pre-encode with urlencode()
                response = self.session.get(
                    self.base_url,
                    headers=self.headers,
                    params=params
                )
                data = response.json()
                if data.get("errcode") != 0:
                    raise Exception(f"API Error: {data.get('errmsg')}")
                page_comments = data.get("data", {}).get("comments", [])
                if not page_comments:
                    break
                comments.extend([{
                    "id": c["id"],
                    "user": c["user"]["nickname"],
                    "content": c["content"],
                    "like_count": c["like_count"],
                    "create_time": c["create_time"]
                } for c in page_comments])
                time.sleep(1.5)  # anti-crawl delay between pages
            except Exception as e:
                print(f"Error on page {page}: {str(e)}")
                break
        return comments


if __name__ == "__main__":
    scraper = XiaohongshuCommentScraper()
    note_id = "64a1b2c3d4e5f6g7h8i9j0k1"  # replace with a real note ID
    comments = scraper.get_comments(note_id, max_pages=3)
    print(f"Fetched {len(comments)} comments")

    # Write CSV through csv.writer so commas or line breaks inside a
    # comment body cannot corrupt the column layout
    with open("comments.csv", "w", encoding="utf-8", newline="") as f:
        writer = csv.writer(f)
        writer.writerow(["id", "user", "content", "like_count", "create_time"])
        for c in comments:
            writer.writerow([c["id"], c["user"], c["content"],
                             c["like_count"], c["create_time"]])
```
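For clarity, the parser above assumes a payload shaped roughly as follows. The field names are taken from the code itself; since the endpoint is reverse-engineered, the live response may differ:

```python
# Response structure get_comments() expects; values are placeholders
expected_shape = {
    "errcode": 0,
    "errmsg": "",
    "data": {
        "comments": [
            {
                "id": "comment-id",
                "user": {"nickname": "display-name"},
                "content": "comment text",
                "like_count": 0,
                "create_time": 1690000000,
            }
        ]
    },
}
```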
3. Advanced Optimizations
- Distributed scraping architecture

```python
from concurrent.futures import ThreadPoolExecutor

def distributed_scraping(note_ids):
    # Scrape several notes in parallel, two pages each; `scraper` is the
    # XiaohongshuCommentScraper instance created in the main script above
    with ThreadPoolExecutor(max_workers=5) as executor:
        futures = [executor.submit(scraper.get_comments, nid, 2)
                   for nid in note_ids]
        results = [f.result() for f in futures]
    return results
```
- Anti-crawling countermeasures

```python
import random
import time

class AntiCrawler:
    @staticmethod
    def get_proxy():
        # Return a usable proxy for requests (placeholder address)
        return {"http": "123.45.67.89:8080"}

    @staticmethod
    def random_delay():
        # Jitter the pause so request intervals are not a fixed fingerprint
        time.sleep(random.uniform(1, 3))
```
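A brief usage sketch combining the two helpers, assuming the definitions above are in scope. `Session.proxies` is standard requests behavior, while the proxy address and the second note ID are placeholders:

```python
# Route all traffic through the proxy pool, then fan out across notes.
# AntiCrawler.random_delay() can also replace the fixed time.sleep(1.5)
# inside get_comments for a less predictable cadence.
scraper = XiaohongshuCommentScraper()
scraper.session.proxies.update(AntiCrawler.get_proxy())  # session-wide proxy

note_ids = ["64a1b2c3d4e5f6g7h8i9j0k1", "64a1b2c3d4e5f6g7h8i9j0k2"]  # 2nd ID hypothetical
batches = distributed_scraping(note_ids)
print(f"Fetched {sum(len(b) for b in batches)} comments from {len(note_ids)} notes")
```

Note that parallel workers multiply the request rate, so keep `max_workers` small enough to stay within the QPS guideline in section 5.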
4. Data Storage Extension

```python
import sqlite3

class DataStorage:
    def __init__(self):
        self.conn = sqlite3.connect("comments.db")
        self._create_table()

    def _create_table(self):
        cursor = self.conn.cursor()
        cursor.execute('''
            CREATE TABLE IF NOT EXISTS comments (
                id TEXT PRIMARY KEY,
                note_id TEXT,
                user TEXT,
                content TEXT,
                like_count INTEGER,
                create_time INTEGER
            )
        ''')
        self.conn.commit()

    def save_comments(self, comments, note_id):
        cursor = self.conn.cursor()
        for c in comments:
            # INSERT OR REPLACE keeps repeated runs idempotent on the id key
            cursor.execute('''
                INSERT OR REPLACE INTO comments
                VALUES (?, ?, ?, ?, ?, ?)
            ''', (c["id"], note_id, c["user"], c["content"],
                  c["like_count"], c["create_time"]))
        self.conn.commit()
```
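A short end-to-end usage sketch, reusing the placeholder note ID from section 2 and reading the rows back to confirm the write:

```python
scraper = XiaohongshuCommentScraper()
storage = DataStorage()
note_id = "64a1b2c3d4e5f6g7h8i9j0k1"  # placeholder ID from the earlier example

comments = scraper.get_comments(note_id, max_pages=3)
storage.save_comments(comments, note_id)

# Confirm what landed in SQLite
cursor = storage.conn.cursor()
cursor.execute("SELECT COUNT(*) FROM comments WHERE note_id = ?", (note_id,))
print(f"{cursor.fetchone()[0]} comments stored for note {note_id}")
```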
5. Legal and Ethical Compliance
- Comply with China's Cybersecurity Law and Data Security Law
- Never collect private user information (phone numbers, addresses, and the like)
- Throttle request rates (recommended QPS ≤ 2; see the throttle sketch after this list)
- Credit the data source explicitly; commercial use is prohibited
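As a concrete way to honor the QPS cap, a minimal blocking throttle; `min_interval=0.5` corresponds to QPS 2:

```python
import time

class Throttle:
    """Minimal rate limiter: blocks so successive calls are at least
    min_interval seconds apart (0.5 s between requests ≈ QPS 2)."""

    def __init__(self, min_interval=0.5):
        self.min_interval = min_interval
        self._last = 0.0

    def wait(self):
        sleep_for = self._last + self.min_interval - time.monotonic()
        if sleep_for > 0:
            time.sleep(sleep_for)
        self._last = time.monotonic()

# Usage: call throttle.wait() immediately before each session.get(...)
```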
This approach scrapes comments by emulating mobile-client requests, using pagination and dynamic delays to stay under anti-crawling limits. For production deployments, pair it with a proxy IP pool and a retry mechanism to keep data collection stable and compliant.
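For the suggested retry mechanism, one standard option is mounting urllib3's `Retry` on the requests session; a minimal sketch:

```python
import requests
from requests.adapters import HTTPAdapter
from urllib3.util.retry import Retry

def make_retrying_session(total=3, backoff=1.0):
    # Retries connection errors and 429/5xx responses with exponential
    # backoff (backoff * 2**(attempt - 1) seconds between attempts)
    session = requests.Session()
    retry = Retry(
        total=total,
        backoff_factor=backoff,
        status_forcelist=[429, 500, 502, 503, 504],
        allowed_methods=["GET"],
    )
    adapter = HTTPAdapter(max_retries=retry)
    session.mount("https://", adapter)
    session.mount("http://", adapter)
    return session

# Drop-in replacement: scraper.session = make_retrying_session()
```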