抖音视频下载工具
功能介绍
这是一个基于Python开发的抖音视频下载工具,可以方便地下载抖音平台上的视频内容。
主要特点
- 支持无水印视频下载
- 自动提取视频标题作为文件名
- 显示下载进度条
- 支持自动重试机制
- 支持调试模式
使用要求
- Python 3.10+
- Chrome浏览器
- 必要的Python包:
  - selenium
  - requests
  - tqdm
  - webdriver_manager(可选)
安装依赖
依赖列表(可保存为 requirements.txt):
```txt
# 核心依赖
requests>=2.31.0
beautifulsoup4==4.12.2
lxml==4.9.3
urllib3>=2.1.0  # URL处理和安全连接
selenium>=4.18.1
tqdm  # 下载进度条
```
也可以直接使用 pip 安装:
```bash
pip install selenium requests tqdm webdriver_manager
```
使用方法
- 直接运行脚本:
```bash
python 抖音视频下载工具.py
```
- 作为模块导入:
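```python
from 抖音视频下载工具 import DouyinDownloader, main

url = "你的抖音视频链接"

# 方式一:使用默认参数,main() 内部会自行创建下载器
main(url)

# 方式二:需要自定义参数时,直接使用 DouyinDownloader
downloader = DouyinDownloader(download_dir="downloads", max_retries=3, debug=True)
json_str = downloader.download_webpage(url)
video_info = downloader.extract_video_info(json_str)
if video_info and video_info["video_urls"]:
    downloader.download_video(video_info["video_urls"][0], video_info["desc"])
```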
参数说明
- download_dir: 下载目录,默认为"downloads"
- max_retries: 最大重试次数,默认为3
- debug: 是否开启调试模式,默认为False
注意事项
- 确保系统已安装Chrome浏览器
- 需要稳定的网络连接
- 部分视频可能因为权限设置无法下载
- 建议不要频繁下载,以免被限制
常见问题
- 如果出现ChromeDriver相关错误,请确保Chrome浏览器版本与ChromeDriver版本匹配
- 如果下载失败,可以尝试增加重试次数或开启调试模式查看详细错误信息
代码实现
python
import traceback
import requests
import re
import json
import os
import time
from selenium import webdriver
from selenium.webdriver.chrome.service import Service
from selenium.webdriver.chrome.options import Options
from selenium.webdriver.common.by import By
from selenium.webdriver.support.ui import WebDriverWait
from selenium.webdriver.support import expected_conditions as EC
from urllib.parse import unquote
import logging
import argparse
from concurrent.futures import ThreadPoolExecutor
from tqdm import tqdm
class DouyinDownloader:
    """Download Douyin videos by driving a headless Chrome session.

    The downloader opens the video page with Selenium, tries several
    strategies to pull video metadata out of the rendered page, and then
    streams the video file into ``download_dir`` with ``requests``.

    Each instance owns a Chrome process. Call :meth:`close` (or use the
    instance as a context manager) when finished; ``__del__`` is only a
    safety net.
    """

    # Salvages a flat JSON object mentioning "video" from text that is not
    # valid JSON as a whole (for example the body of a <script> tag).
    _VIDEO_JSON_PATTERN = re.compile(r'({[^{]*?"video"[^}]*?})')

    # Keys checked for a video address while walking the page data.
    _URL_FIELDS = ('playApi', 'playAddr', 'downloadAddr', 'video_url', 'nwm_video_url')

    _DEFAULT_TITLE = '未命名视频'

    # Upper bound on the cleaned title so that "<title>_<n>.mp4" stays within
    # common file-name length limits even for multi-byte characters.
    _MAX_TITLE_LENGTH = 80

    _DOWNLOAD_HEADERS = {
        'User-Agent': 'Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/91.0.4472.124 Safari/537.36',
        'Referer': 'https://www.douyin.com/'
    }

    # (connect, read) timeouts in seconds; without them a stalled server
    # would block the download forever.
    _REQUEST_TIMEOUT = (10, 30)

    _CHUNK_SIZE = 64 * 1024

    def __init__(self, download_dir="downloads", max_retries=3, debug=False):
        """Create the download directory and start the browser.

        Args:
            download_dir (str): Directory the videos are written to.
            max_retries (int): Number of retries after a failed download.
            debug (bool): When true, log at DEBUG level and dump intermediate
                data to files in the working directory.
        """
        self.download_dir = download_dir
        self.max_retries = max_retries
        self.debug = debug
        self.setup_logging()
        # Cheap step first: fail on an unusable directory before a browser
        # process has been spawned.
        os.makedirs(download_dir, exist_ok=True)
        self.setup_chrome()

    def __enter__(self):
        """Return the downloader itself for use in a ``with`` statement."""
        return self

    def __exit__(self, exc_type, exc_value, exc_traceback):
        """Quit the browser on leaving the ``with`` block."""
        self.close()

    def __del__(self):
        """Safety net: quit the browser if the owner never called close()."""
        self.close()

    def close(self):
        """Quit the browser. Safe to call more than once."""
        driver = getattr(self, 'driver', None)
        if driver is None:
            return
        self.driver = None
        try:
            driver.quit()
        except Exception:
            # Deliberate best effort: this also runs from __del__, possibly
            # during interpreter shutdown, where raising would only be noise.
            pass

    def setup_logging(self):
        """Configure root logging and create ``self.logger``.

        In debug mode messages are also written to ``douyin_downloader.log``.
        """
        handlers = [logging.StreamHandler()]
        level = logging.INFO
        if self.debug:
            level = logging.DEBUG
            # Explicit encoding: the log messages contain Chinese text.
            handlers.insert(0, logging.FileHandler('douyin_downloader.log', encoding='utf-8'))
        logging.basicConfig(
            level=level,
            format='%(asctime)s - %(levelname)s - %(message)s',
            handlers=handlers
        )
        self.logger = logging.getLogger(__name__)

    def setup_chrome(self):
        """Start headless Chrome and store the driver in ``self.driver``."""
        chrome_options = Options()
        for argument in (
            '--headless',  # run without a visible window
            '--disable-gpu',
            '--no-sandbox',
            '--disable-dev-shm-usage',
            '--disable-blink-features=AutomationControlled',
        ):
            chrome_options.add_argument(argument)
        chrome_options.add_experimental_option('excludeSwitches', ['enable-automation'])
        chrome_options.add_experimental_option('useAutomationExtension', False)
        self.driver = webdriver.Chrome(options=chrome_options)
        # Make navigator.webdriver read as undefined on every new document.
        self.driver.execute_cdp_cmd('Page.addScriptToEvaluateOnNewDocument', {
            'source': 'Object.defineProperty(navigator, "webdriver", {get: () => undefined})'
        })

    def _save_debug_file(self, content, filename, message=""):
        """Write ``content`` to ``filename`` when debug mode is on.

        Args:
            content: Text to write.
            filename: Destination path.
            message: Optional text printed after a successful write.

        Failures are printed, never raised.
        """
        if not self.debug:
            return
        try:
            with open(filename, 'w', encoding='utf-8') as f:
                f.write(content)
            if message:
                print(message)
        except Exception as e:
            print(f"保存调试文件 {filename} 失败: {str(e)}")

    def _dump_page_source(self):
        """Save the current page source to page_source.html in debug mode."""
        if not self.debug:
            return
        try:
            source = self.driver.page_source
        except Exception:
            print("保存页面源代码失败")
            return
        self._save_debug_file(
            source,
            'page_source.html',
            "页面源代码已保存到page_source.html(用于调试)"
        )

    def download_webpage(self, url):
        """Open ``url`` in the browser and extract the video data.

        Args:
            url: Video page link (short links are followed by the browser).

        Returns:
            A JSON string produced by the first extraction strategy that
            yields data, or None when the final URL carries no video id,
            every strategy fails, or the browser raises.
        """
        try:
            print("正在打开网页...")
            self.driver.get(url)
            print("等待页面加载...")
            time.sleep(1)
            current_url = self.driver.current_url
            print(f"最终URL: {current_url}")
            if not self._extract_video_id(current_url):
                print("无法提取视频ID")
                return None
            print("等待页面完全加载...")
            time.sleep(1)
            print("尝试提取页面数据...")
            extractors = (
                self._try_get_video_data_from_render_data,
                self._try_get_video_data_from_hydration,
                self._try_get_video_data_from_player,
                self._try_get_video_data_from_element,
            )
            for extractor in extractors:
                try:
                    data = extractor()
                except Exception as e:
                    if self.debug:
                        print(f"方法 {extractor.__name__} 失败: {str(e)}")
                    continue
                if data:
                    return data
            print("所有提取方法都失败了")
            self._dump_page_source()
            return None
        except Exception as e:
            print(f"下载网页时出错: {str(e)}")
            if self.debug:
                self._dump_page_source()
                print("详细错误信息:")
                print(traceback.format_exc())
            return None

    def _extract_video_json_fragment(self, text):
        """Return the first regex-matched fragment of ``text`` that is valid JSON.

        Returns None when no fragment parses.
        """
        for match in self._VIDEO_JSON_PATTERN.finditer(text):
            candidate = match.group(1)
            try:
                json.loads(candidate)
            except ValueError:
                continue
            return candidate
        return None

    def _try_get_video_data_from_render_data(self):
        """Try to read the page data from globals or script tags.

        Returns:
            A JSON string, or None when nothing usable was found.
        """
        script = """
        var renderData = null;
        try {
            // 方法1:直接从SSR_HYDRATED_DATA获取
            if (window.SSR_HYDRATED_DATA) {
                return JSON.stringify(window.SSR_HYDRATED_DATA);
            }
            // 方法2:从__NEXT_DATA__获取
            var nextDataElement = document.getElementById('__NEXT_DATA__');
            if (nextDataElement) {
                return nextDataElement.textContent;
            }
            // 方法3:从script标签中查找
            var scripts = document.getElementsByTagName('script');
            for (var i = 0; i < scripts.length; i++) {
                var content = scripts[i].textContent || '';
                if (content.includes('"video"') && content.includes('"play_addr"')) {
                    return content;
                }
            }
        } catch (e) {
            console.log('获取数据时出错:', e);
        }
        return null;
        """
        data = self.driver.execute_script(script)
        if not data:
            return None
        print("找到页面数据")
        try:
            self._save_debug_file(data, 'raw_page_data.txt', "已保存原始数据到raw_page_data.txt")
            try:
                json_data = json.loads(data)
            except (TypeError, ValueError):
                # Script-tag content is usually JavaScript, not JSON; try to
                # salvage an embedded JSON object instead.
                return self._extract_video_json_fragment(data)
            self._save_debug_file(
                json.dumps(json_data, ensure_ascii=False, indent=2),
                'parsed_data.json',
                "已保存解析后的数据到parsed_data.json"
            )
            return json.dumps(json_data)
        except Exception as e:
            if self.debug:
                print(f"处理页面数据时出错: {str(e)}")
                print("详细错误信息:")
                print(traceback.format_exc())
        return None

    def _try_get_video_data_from_hydration(self):
        """Try to read the page data from ``window.__HYDRA_DATA__``.

        Returns:
            A JSON string, or None when the global is absent.
        """
        script = """
        if (window.__HYDRA_DATA__) {
            return JSON.stringify(window.__HYDRA_DATA__);
        }
        return null;
        """
        data = self.driver.execute_script(script)
        if not data:
            return None
        print("从HYDRA_DATA中找到数据")
        return data

    def _try_get_video_data_from_player(self):
        """Try to build the video data from the <video>/<source> element.

        Returns:
            A JSON string with ``video_data.nwm_video_url`` (and ``desc`` when
            a title is available), or None when no element has a source.
        """
        script = """
        try {
            // 查找视频元素
            var videoElement = document.querySelector('video');
            if (videoElement && videoElement.src) {
                var videoData = {
                    video_data: {
                        nwm_video_url: videoElement.src
                    }
                };
                // 尝试获取视频标题
                var title = document.title || '';
                if (!title) {
                    var titleElement = document.querySelector('title, .video-title, .title, .desc, [data-e2e="video-desc"]');
                    if (titleElement) {
                        title = titleElement.textContent.trim();
                    }
                }
                if (title) {
                    videoData.desc = title;
                }
                return JSON.stringify(videoData);
            }
            // 如果没有找到视频元素,尝试从source标签获取
            var sourceElement = document.querySelector('source[src*="http"]');
            if (sourceElement && sourceElement.src) {
                var sourceData = {
                    video_data: {
                        nwm_video_url: sourceElement.src
                    },
                    desc: document.title || ''
                };
                return JSON.stringify(sourceData);
            }
        } catch (e) {
            console.log('获取视频数据时出错:', e);
        }
        return null;
        """
        data = self.driver.execute_script(script)
        if not data:
            return None
        print("从播放器中找到数据")
        return data

    def _try_get_video_data_from_element(self):
        """Wait up to 10 seconds for a <video> element and read its ``src``.

        Returns:
            A JSON string with the URL and page title, or None when the
            element never appears or has no source.
        """
        try:
            video_element = WebDriverWait(self.driver, 10).until(
                EC.presence_of_element_located((By.TAG_NAME, "video"))
            )
            video_url = video_element.get_attribute('src')
        except Exception as e:
            # Best effort: a timeout here just means this strategy failed.
            if self.debug:
                print(f"从视频元素获取数据失败: {str(e)}")
            return None
        if not video_url:
            return None
        print("从视频元素中找到数据")
        return json.dumps({
            'video_data': {
                'nwm_video_url': video_url
            },
            'desc': self._get_video_title()
        })

    def _get_video_title(self):
        """Return the page title, falling back to known description selectors.

        Returns:
            The first non-empty title found, or the default title.
        """
        try:
            title = self.driver.title
            if title:
                return title
            selectors = (
                'title',
                '.video-title',
                '.desc',
                '[data-e2e="video-desc"]',
                '.title',
            )
            for selector in selectors:
                try:
                    element = self.driver.find_element(By.CSS_SELECTOR, selector)
                    text = element.text.strip()
                except Exception:
                    # Selector not present on this page; try the next one.
                    continue
                if text:
                    return text
            # Last resort: pull <title> straight out of the page source.
            match = re.search(r'<title[^>]*>(.*?)</title>', self.driver.page_source)
            if match:
                return match.group(1)
        except Exception as e:
            if self.debug:
                print(f"获取视频标题时出错: {str(e)}")
        return self._DEFAULT_TITLE

    def extract_video_info(self, json_str):
        """Extract the description and video URL from a JSON string.

        Args:
            json_str: JSON text as returned by :meth:`download_webpage`. Text
                that is not valid JSON is searched for an embedded JSON object
                mentioning ``"video"``.

        Returns:
            A dict with ``desc``, ``create_time`` (current Unix time as a
            string) and ``video_urls`` (list with one URL), or None when no
            video URL can be found or the input cannot be parsed.
        """
        try:
            print("开始解析视频信息...")
            if not json_str:
                print("输入的JSON字符串为空")
                return None
            print(f"JSON字符串长度: {len(json_str)}")
            print("JSON字符串前100个字符:", json_str[:100])
            try:
                data = json.loads(json_str)
            except json.JSONDecodeError as e:
                print(f"JSON解析错误: {str(e)}")
                print("尝试修复JSON数据...")
                fragment = self._extract_video_json_fragment(json_str)
                if fragment is None:
                    # Nothing salvageable: stop here instead of continuing
                    # with an unbound ``data``.
                    print("无法从响应中提取视频信息")
                    return None
                data = json.loads(fragment)
            self._save_debug_file(
                json.dumps(data, ensure_ascii=False, indent=2),
                'debug_response.json',
                "已保存完整响应到debug_response.json"
            )
            video_data = {
                'desc': self._DEFAULT_TITLE,
                'create_time': str(int(time.time())),
                'video_urls': []
            }
            # Shape produced by the player/element strategies.
            if isinstance(data, dict):
                direct = data.get('video_data')
                if isinstance(direct, dict) and direct.get('nwm_video_url'):
                    video_data['desc'] = data.get('desc') or self._DEFAULT_TITLE
                    video_data['video_urls'].append(direct['nwm_video_url'])
                    return video_data
            # Any other shape: walk the whole structure.
            video_url = self._find_video_url(data)
            if video_url:
                video_data['video_urls'].append(video_url)
                desc = self._find_video_desc(data)
                if desc:
                    video_data['desc'] = desc
                return video_data
            print("无法从响应中提取视频信息")
            print("数据结构:", json.dumps(data, indent=2, ensure_ascii=False)[:500])
            return None
        except Exception as e:
            print(f"解析视频信息时出错: {str(e)}")
            print("详细错误信息:")
            print(traceback.format_exc())
            return None

    def _find_video_url(self, data):
        """Depth-first search of ``data`` for a video URL.

        At each dict the known URL keys are checked first: a string value
        must start with ``http``; a dict value contributes the first
        non-empty string of its ``url_list``.

        Returns:
            The URL string, or None when nothing matches.
        """
        if isinstance(data, dict):
            for field in self._URL_FIELDS:
                candidate = data.get(field)
                if isinstance(candidate, str) and candidate.startswith('http'):
                    return candidate
                if isinstance(candidate, dict):
                    urls = candidate.get('url_list')
                    if isinstance(urls, list):
                        for entry in urls:
                            if isinstance(entry, str) and entry:
                                return entry
            children = data.values()
        elif isinstance(data, list):
            children = data
        else:
            return None
        for child in children:
            found = self._find_video_url(child)
            if found:
                return found
        return None

    def _find_video_desc(self, data):
        """Depth-first search of ``data`` for a non-empty ``desc`` string.

        An empty ``desc`` does not stop the search; nested values are still
        examined.

        Returns:
            The description, or None when none is found.
        """
        if isinstance(data, dict):
            desc = data.get('desc')
            if isinstance(desc, str) and desc:
                return desc
            children = data.values()
        elif isinstance(data, list):
            children = data
        else:
            return None
        for child in children:
            found = self._find_video_desc(child)
            if found:
                return found
        return None

    def _clean_title(self, title):
        """Turn a raw title into a safe file name stem.

        Args:
            title (str): Raw title; anything that is not a string is treated
                as missing.

        Returns:
            str: The title without HTML tags, ``#hashtags``, the trailing
            "- 抖音" suffix, characters invalid in file names and whitespace,
            truncated to ``_MAX_TITLE_LENGTH`` characters. The default title
            is returned when nothing is left.
        """
        if not isinstance(title, str):
            return self._DEFAULT_TITLE
        title = re.sub(r'<[^>]+>', '', title)
        title = re.sub(r'#[^ ]+', '', title)
        title = re.sub(r' *- *抖音.*$', '', title)
        title = re.sub(r'[\\/*?:"<>|]', '', title)
        title = re.sub(r'\s+', '', title)
        title = title[:self._MAX_TITLE_LENGTH]
        return title or self._DEFAULT_TITLE

    def _unique_filepath(self, title):
        """Return a path in the download directory that does not exist yet.

        ``_1``, ``_2``, ... is appended to the stem until the name is free.
        """
        base_filepath = os.path.join(self.download_dir, f"{title}.mp4")
        stem, ext = os.path.splitext(base_filepath)
        filepath = base_filepath
        counter = 1
        while os.path.exists(filepath):
            filepath = f"{stem}_{counter}{ext}"
            counter += 1
        return filepath

    def _stream_to_file(self, video_url, filepath):
        """Stream ``video_url`` into ``filepath`` with a progress bar.

        Raises whatever ``requests`` or the file system raise.
        """
        with requests.get(
            video_url,
            headers=self._DOWNLOAD_HEADERS,
            stream=True,
            timeout=self._REQUEST_TIMEOUT,
        ) as response:
            response.raise_for_status()
            total_size = int(response.headers.get('content-length', 0))
            with open(filepath, 'wb') as f, tqdm(
                desc=os.path.basename(filepath),
                # Unknown length: let tqdm count bytes without a total.
                total=total_size or None,
                unit='iB',
                unit_scale=True,
                unit_divisor=1024,
            ) as pbar:
                for chunk in response.iter_content(chunk_size=self._CHUNK_SIZE):
                    pbar.update(f.write(chunk))

    def download_video(self, video_url, title, retry_count=0):
        """Download the video file, retrying on failure.

        Args:
            video_url (str): Video URL.
            title (str): Video title; it is cleaned and used as file name.
            retry_count (int): Number of retries already used.

        Returns:
            bool: True when the file was saved, False once the retries are
            exhausted.
        """
        title = self._clean_title(title)
        print(f"处理后的文件名: {title}")
        attempt = retry_count
        while True:
            filepath = self._unique_filepath(title)
            try:
                self._stream_to_file(video_url, filepath)
            except Exception as e:
                self.logger.error(f"下载视频时出错: {str(e)}")
                # Remove the truncated file so a retry does not leave it
                # behind and pick a "_1" name next to it.
                try:
                    os.remove(filepath)
                except OSError:
                    pass  # nothing was written, or it is already gone
                if attempt >= self.max_retries:
                    return False
                attempt += 1
                self.logger.info(f"正在进行第{attempt}次重试...")
                time.sleep(1)  # short pause before retrying
                continue
            self.logger.info(f"视频已保存到: {filepath}")
            return True

    def _extract_video_id(self, url):
        """Extract the numeric video id from ``url``.

        Returns:
            The id as a string, or None when no known pattern matches.
        """
        patterns = (
            r'/video/(\d+)',
            r'item_ids=(\d+)',
            r'aweme_id=(\d+)',
        )
        for pattern in patterns:
            match = re.search(pattern, url)
            if match:
                video_id = match.group(1)
                print(f"提取到视频ID: {video_id}")
                return video_id
        return None
def main(url):
    """Download the video behind ``url`` into the ``downloads`` directory.

    Progress and errors are printed. Exceptions are caught and reported
    rather than propagated.

    Args:
        url: Douyin video link.

    Returns:
        bool: True when the video was downloaded, False otherwise.
    """
    downloader = None
    try:
        downloader = DouyinDownloader(
            download_dir="downloads",
            max_retries=3,
            debug=False  # debug mode is off by default
        )
        print(f"正在处理链接: {url}")
        json_str = downloader.download_webpage(url)
        if not json_str:
            print("下载网页失败")
            return False
        video_info = downloader.extract_video_info(json_str)
        if not (video_info and video_info['video_urls']):
            print("无法获取视频下载地址")
            return False
        print(f"视频标题: {video_info['desc']}")
        video_url = video_info['video_urls'][0]
        success = downloader.download_video(video_url, video_info['desc'])
        print("下载成功!" if success else "下载失败!")
        return bool(success)
    except Exception as e:
        print(f"程序执行出错: {str(e)}")
        return False
    finally:
        # Dropping the last reference lets DouyinDownloader.__del__ quit the
        # browser; nothing to do when the constructor itself failed.
        if downloader is not None:
            del downloader
if __name__ == "__main__":
    # Take the video link from the command line; fall back to the built-in
    # sample link so running the script without arguments still works.
    parser = argparse.ArgumentParser(description="抖音视频下载工具")
    parser.add_argument(
        "url",
        nargs="?",
        default='https://v.douyin.com/Fw35vv97K4s/',
        help="抖音视频链接",
    )
    main(parser.parse_args().url)