Preface
The company's operations team needs keyword data from Xiaohongshu and Douyin for analysis, so a crawler it is. I had previously looked at calling the APIs directly, but that requires decrypting some of the request parameters, so I dropped that approach. Then I came across the idea of using a fingerprint (automation) browser to open the page and listen to the API traffic instead. I tried it, and it actually works. Without further ado, let's get started.
Step 1: import the required libraries

```python
import time
import requests
import re
import os
import json
from DrissionPage import ChromiumPage
from DrissionPage.common import Actions
from urllib.parse import quote
from openpyxl import Workbook
import threading
from typing import List, Dict, Optional
from datetime import datetime
```
Step 2: initialize the browser
If anything here is unclear, have a look at the DrissionPage documentation. Straight to the code:
```python
class DouyinCrawler:
    def __init__(self):
        self.browser = None
        self.stop_event = threading.Event()

    def init_browser(self):
        """Initialize the browser."""
        try:
            print("Initializing browser...")
            self.browser = ChromiumPage()
            if self.browser:  # sanity check: can we actually load a page?
                self.browser.get("https://www.baidu.com")
                time.sleep(2)
                print("Browser initialized")
                return True
            else:
                print("Failed to create the browser object")
                return False
        except Exception as e:
            print(f"Browser initialization failed: {e}")
            # Fallback 1: launch with explicit Chromium options
            try:
                print("Retrying with custom browser options...")
                from DrissionPage import ChromiumOptions
                co = ChromiumOptions()
                co.headless(False)
                co.set_argument('--no-sandbox')
                co.set_argument('--disable-dev-shm-usage')
                co.set_argument('--disable-gpu')
                co.set_argument('--remote-debugging-port=9222')
                self.browser = ChromiumPage(addr_or_opts=co)
                if self.browser:
                    self.browser.get("https://www.baidu.com")
                    time.sleep(2)
                    print("Initialized with custom options")
                    return True
            except Exception as e2:
                print(f"Custom options failed as well: {e2}")
            # Fallback 2: attach to a browser already listening on the debug port
            try:
                print("Trying to attach to an existing browser...")
                self.browser = ChromiumPage(addr_or_opts='127.0.0.1:9222')
                if self.browser:
                    print("Attached to existing browser")
                    return True
            except Exception as e3:
                print(f"Attaching to existing browser failed: {e3}")
            return False
```
Step 3: capture the browser's API traffic
This step matters. Use a packet-capture tool such as Fiddler to inspect the page's API calls (if you don't know how to configure Fiddler, plenty of other blog posts cover it), and find the endpoint that carries the data you want. Below is the one I captured with Fiddler.
First open the Douyin web version and enter a keyword.
For example, type Python in the search box.
After clicking the search button, look at the packets Fiddler captured; the endpoint below is the one that meets our needs.
The endpoint path:

```bash
/aweme/v1/web/search/item/
```
All the data sits under the `data` key of that response, in standard JSON. So the next step is to drive the browser to run the search automatically and pick the response up through the listener. For a keyword search, the URL to open is:

```bash
https://www.douyin.com/search/{encoded_keyword}?type=video
```

where `encoded_keyword` is the URL-encoded search keyword.
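For orientation, the part of the response we care about looks roughly like the sketch below. This is illustrative only: the field names mirror what the parsing code in step 6 reads, the values are placeholders, and the real payload carries many more fields.

```python
# Illustrative shape of one /aweme/v1/web/search/item/ response body (placeholder values).
response_data = {
    "data": [
        {
            "aweme_info": {
                "aweme_id": "7xxxxxxxxxxxxxxxxxx",
                "desc": "video caption",
                "create_time": 1700000000,
                "author": {"nickname": "...", "uid": "..."},
                "statistics": {"digg_count": 0, "comment_count": 0, "share_count": 0, "play_count": 0},
                "video": {"play_addr": {"url_list": ["..."]}, "cover": {"url_list": ["..."]}},
                "music": {"title": "..."},
                "text_extra": [{"type": 1, "hashtag_name": "..."}],
            }
        },
        # ...more result items
    ]
}
```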
The code that opens the search page:

```python
def search_keyword(self, keyword: str):
    """Open the search page for a keyword."""
    try:
        encoded_keyword = quote(keyword)
        search_url = f"https://www.douyin.com/search/{encoded_keyword}?type=video"
        print(f"Opening search page: {search_url}")
        self.browser.get(search_url)
        time.sleep(5)
        return True
    except Exception as e:
        print(f"Failed to open the search page: {e}")
        return False
```
Step 4: with the endpoint identified, listen for its responses to get the data
```python
def setup_search_listener(self):
    """Set up the listener on the search API."""
    if not self.browser:
        print("Browser not initialized; cannot set up the listener")
        return False
    try:
        # Clear any listener left over from a previous run.
        try:
            self.browser.listen.stop()
            print("Previous listener cleared")
        except Exception:
            print("No previous listener to clear")
        listeners = [
            '/aweme/v1/web/search/item/',
        ]
        for listener in listeners:
            try:
                self.browser.listen.start(listener)
                print(f"Listening on: {listener}")
                return True
            except Exception as e:
                print(f"Failed to listen on {listener}: {e}")
                continue
        print("Failed to set up any of the listeners")
        return False
    except Exception as e:
        print(f"Failed to set up the listener: {e}")
        return False
```
Step 5: there is a lot of data to fetch, but the page loads it dynamically, so we have to keep scrolling to trigger more requests. That means automating the browser's scroll (note that Douyin shows a captcha if you scroll too much; in my tests it appeared after roughly 30 scrolls).
Here is the scrolling and data-collection code:
```python
def scroll_and_collect_data(self, target_count: int) -> List[Dict]:
    """Scroll the page and collect data, with some extra error handling."""
    ac = Actions(self.browser)
    all_videos = []
    scroll_count = 0
    max_scrolls = 30
    consecutive_failures = 0
    max_consecutive_failures = 5
    print(f"Collecting data, target count: {target_count}")
    while len(all_videos) < target_count and scroll_count < max_scrolls:
        if self.stop_event.is_set():
            break
        try:
            ret = self.browser.listen.wait(timeout=10)
            if ret and ret.response and ret.response.body:
                response_data = ret.response.body
                consecutive_failures = 0
                if isinstance(response_data, dict):
                    # Expected structure: {"data": [{"aweme_info": {...}}, {...}]}
                    if 'data' in response_data and isinstance(response_data['data'], list):
                        added = 0
                        for item in response_data['data']:
                            if isinstance(item, dict) and 'aweme_info' in item:
                                all_videos.append(item['aweme_info'])
                                added += 1
                        print(f"Got {added} videos, total: {len(all_videos)}")
                    else:
                        print("No 'data' field found, or it has an unexpected format")
                else:
                    print(f"Unexpected response type: {type(response_data)}")
        except Exception as e:
            consecutive_failures += 1
            print(f"Timed out waiting for a packet or failed to parse it: {e}, "
                  f"consecutive failures: {consecutive_failures}")
            if consecutive_failures >= max_consecutive_failures:
                print("Too many consecutive failures, stopping collection")
                break
        try:
            ac.scroll(delta_y=1500)
            time.sleep(2)
            scroll_count += 1
            print(f"Scroll count: {scroll_count}")
        except Exception as e:
            print(f"Scroll failed: {e}")
            break
    return all_videos[:target_count]
```
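Putting steps 2 through 5 together, a minimal usage sketch (the full orchestration with retries comes in step 8) looks like this. Note that the listener is started before the search page is opened, otherwise the initial response could be missed:

```python
# Minimal sketch of the call order used by the full flow in step 8.
crawler = DouyinCrawler()
if crawler.init_browser() and crawler.setup_search_listener():
    crawler.search_keyword("Python")              # navigate only after the listener is running
    raw_videos = crawler.scroll_and_collect_data(50)
    print(f"Collected {len(raw_videos)} raw video entries")
```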
Step 6: now that we have the raw API data, there isn't much to add: whatever fields you want, pull them straight out of the response.
```python
def extract_video_info(self, video_data: Dict) -> Optional[Dict]:
    print("=== Parsing video data ===")
    # print(json.dumps(video_data, ensure_ascii=False, indent=2))
    if not video_data:
        return None
    try:
        video_info = {
            'aweme_id': video_data.get('aweme_id', ''),
            'desc': video_data.get('desc', ''),
            'create_time': video_data.get('create_time', 0),
            'author_name': '',
            'author_id': '',
            'digg_count': 0,
            'comment_count': 0,
            'share_count': 0,
            'play_count': 0,
            'video_url': '',
            'cover_url': '',
            'music_title': '',
            'hashtags': []
        }
        author = video_data.get('author', {})
        video_info['author_name'] = author.get('nickname', '')
        video_info['author_id'] = author.get('uid', '') or author.get('short_id', '')
        stats = video_data.get('statistics', {})
        video_info['digg_count'] = stats.get('digg_count', 0)
        video_info['comment_count'] = stats.get('comment_count', 0)
        video_info['share_count'] = stats.get('share_count', 0)
        video_info['play_count'] = stats.get('play_count', 0)
        video = video_data.get('video', {})
        play_addr = video.get('play_addr', {})
        url_list = play_addr.get('url_list', [])
        if url_list:
            video_info['video_url'] = url_list[0]
        cover = video.get('cover', {})
        url_list = cover.get('url_list', [])
        if url_list:
            video_info['cover_url'] = url_list[0]
        music = video_data.get('music', {})
        video_info['music_title'] = music.get('title', '')
        text_extra = video_data.get('text_extra', [])
        hashtags = []
        for extra in text_extra:
            if isinstance(extra, dict) and extra.get('type') == 1:
                hashtags.append(extra.get('hashtag_name', ''))
        video_info['hashtags'] = hashtags
        # Convert the Unix timestamp to a human-readable datetime string
        video_info['create_time'] = self.timestamp_to_datetime(video_info['create_time'])
        return video_info
    except Exception as e:
        print(f"Failed to extract video info: {e}")
        return None
```
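One thing to note: extract_video_info calls self.timestamp_to_datetime, which the post doesn't show. A minimal sketch of that helper, assuming it simply formats the Unix timestamp with the datetime module imported in step 1:

```python
def timestamp_to_datetime(self, timestamp) -> str:
    """Convert a Unix timestamp (seconds) into 'YYYY-MM-DD HH:MM:SS'.

    Minimal sketch: the original code does not show this helper, so this is
    just one reasonable implementation.
    """
    try:
        return datetime.fromtimestamp(int(timestamp)).strftime('%Y-%m-%d %H:%M:%S')
    except (ValueError, TypeError, OSError):
        # Fall back to the raw value if the timestamp is missing or malformed.
        return str(timestamp)
```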
Step 7: save the results to Excel

```python
def save_to_excel(self, videos: List[Dict], keyword: str, download_covers: bool = False) -> str:
    if not videos:
        print("No data to save")
        return ""
    try:
        wb = Workbook()
        ws = wb.active
        ws.title = "Douyin search results"
        headers = [
            "Video ID", "Author nickname", "Author ID", "Description",
            "Likes", "Comments", "Shares", "Plays",
            "Created at", "Video URL", "Cover URL", "Background music", "Hashtags"
        ]
        ws.append(headers)
        for video in videos:
            row_data = [
                video.get('aweme_id', ''),
                video.get('author_name', ''),
                video.get('author_id', ''),   # extract_video_info stores the ID under 'author_id'
                video.get('desc', ''),
                video.get('digg_count', 0),
                video.get('comment_count', 0),
                video.get('share_count', 0),
                video.get('play_count', 0),
                video.get('create_time', ''),  # already formatted as a datetime string
                video.get('video_url', ''),
                video.get('cover_url', ''),
                video.get('music_title', ''),
                ', '.join(video.get('hashtags', []))
            ]
            ws.append(row_data)
        filename = f"douyin_{keyword}_{len(videos)}_items.xlsx"
        wb.save(filename)
        print(f"Data saved to: {filename}")
        return filename
    except Exception as e:
        print(f"Failed to save the Excel file: {e}")
        return ""
```
Step 8: finally, wire everything together and run it

```python
def crawl_douyin_search(self, keyword: str, limit: int = 50, download_covers: bool = False) -> bool:
    try:
        print("=== Douyin keyword crawler starting ===")
        max_retries = 3
        for attempt in range(max_retries):
            print(f"Initializing browser (attempt {attempt + 1})")
            if self.init_browser():
                break
            elif attempt < max_retries - 1:
                print("Waiting 5 seconds before retrying...")
                time.sleep(5)
            else:
                print("Browser initialization failed, please check the ChromiumPage installation")
                return False
        if not self.setup_search_listener():
            print("Failed to set up the API listener, aborting")
            return False
        if not self.search_keyword(keyword):
            return False
        print("Collecting video data...")
        videos_data = self.scroll_and_collect_data(limit)
        if not videos_data:
            print("No data received from the API, the crawl failed")
            return False
        print("Processing video data...")
        processed_videos = []
        for video_raw in videos_data:
            video_info = self.extract_video_info(video_raw)
            if video_info:
                processed_videos.append(video_info)
        print(f"Processed {len(processed_videos)} videos")
        filename = self.save_to_excel(processed_videos, keyword, download_covers=False)
        if filename:
            print(f"=== Done! Collected {len(processed_videos)} records ===")
            return True
        return False
    except Exception as e:
        print(f"Error during the crawl: {e}")
        return False
    finally:
        if self.browser:
            try:
                self.browser.quit()
            except Exception:
                pass


def main():
    crawler = DouyinCrawler()
    keyword = input('Enter the keyword to search for: ')   # search keyword
    limit = int(input('Enter how many items to fetch: '))  # cast to int, otherwise the count comparison fails
    download_covers = False                                 # explicitly skip downloading covers
    success = crawler.crawl_douyin_search(
        keyword=keyword,
        limit=limit,
        download_covers=download_covers
    )
    if success:
        print("Crawl finished!")
    else:
        print("Crawl failed!")


if __name__ == "__main__":
    main()
```
To wrap up, here is the shape of the data we end up with.
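The original screenshot isn't reproduced here, but structurally each processed record (one Excel row) is a dict with the fields written by extract_video_info. The values below are placeholders:

```python
# One processed record as produced by extract_video_info (placeholder values).
{
    'aweme_id': '7xxxxxxxxxxxxxxxxxx',
    'desc': 'video caption',
    'create_time': '2024-01-01 12:00:00',
    'author_name': 'author nickname',
    'author_id': 'uid or short_id',
    'digg_count': 0,
    'comment_count': 0,
    'share_count': 0,
    'play_count': 0,
    'video_url': 'https://...',
    'cover_url': 'https://...',
    'music_title': 'track title',
    'hashtags': ['tag1', 'tag2'],
}
```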
