faster_whisper,视频转文字,并生成字幕文件(附带exe)
使用说明:
--model:可选 tiny/base/small/medium/large(模型越大越准确,也越耗资源);当前脚本默认使用 medium。
模型路径(medium):C:\Users\XXX\.cache\huggingface\hub\models--Systran--faster-whisper-medium\snapshots\08e178d48790749d25932bbc082711ddcfdfbc4f
- 直接运行run.exe
- 选择视频文件
- 等待结果,结果保存在视频所在文件夹,文件名与视频文件名相同,格式为txt和srt
- 使用PotPlayer播放视频,自动读取同名的字幕文件
- 暂停的时候,可以复制当前字幕内容到剪贴板
python
# ============ CPU-only tuning: 16 threads, GPU disabled ============
# These variables are written before the heavy imports that follow so the
# native libraries loaded by those imports can see them.
import os

os.environ.update(
    {
        # Thread-pool sizes for OpenMP, MKL and Numba.
        "OMP_NUM_THREADS": "16",
        "MKL_NUM_THREADS": "16",
        "NUMBA_NUM_THREADS": "16",
        # Silence the Hugging Face hub symlink warning.
        "HF_HUB_DISABLE_SYMLINKS_WARNING": "1",
        # "-1" hides every CUDA device, so the GPU is never used.
        "CUDA_VISIBLE_DEVICES": "-1",
    }
)
# ===================================================================
from faster_whisper import WhisperModel
import opencc
import tkinter as tk
from tkinter import ttk, filedialog, messagebox
import subprocess
import json
import datetime
# Convert a time offset in seconds to an SRT timestamp.
def format_srt_time(sec):
    """Return *sec* formatted as an SRT timestamp ``HH:MM:SS,mmm``.

    Args:
        sec: Offset from the start of the media, in seconds (int or float).
            Negative values are clamped to zero.

    Returns:
        The timestamp string, e.g. ``"00:01:02,345"``.
    """
    # Round once to whole milliseconds. Taking the fractional part and
    # truncating it (the previous approach) loses a millisecond whenever the
    # float is stored slightly below the intended value (1.001 -> ",000").
    total_ms = max(0, int(round(sec * 1000)))
    total_s, ms = divmod(total_ms, 1000)
    total_m, s = divmod(total_s, 60)
    h, m = divmod(total_m, 60)
    return f"{h:02d}:{m:02d}:{s:02d},{ms:03d}"
# ===================== Model selection (optional drop-down dialog) =====================
def select_model(interactive=False):
    """Return the name of the Whisper model to load.

    Args:
        interactive: When False (the default) no window is shown and
            ``"medium"`` is returned. When True, a small always-on-top
            dialog with a read-only drop-down (small / medium / large,
            pre-selected on "small") is shown and the confirmed entry is
            returned.

    Returns:
        The model name as a string, or None when the interactive dialog is
        closed without pressing the confirm button.
    """
    if not interactive:
        return "medium"

    # Local imports keep the non-interactive path free of any GUI dependency.
    import tkinter as tk
    from tkinter import ttk

    # Available choices; extend or trim as needed.
    model_options = ["small", "medium", "large"]

    win = tk.Tk()
    win.title("选择模型")
    win.geometry("300x150")
    win.resizable(False, False)
    win.attributes('-topmost', True)  # keep the dialog above other windows

    tk.Label(win, text="请选择识别模型:", font=("微软雅黑", 12)).pack(pady=20)
    selected = tk.StringVar(master=win, value=model_options[0])
    combo = ttk.Combobox(
        win,
        textvariable=selected,
        values=model_options,
        state="readonly",
        font=("微软雅黑", 11),
    )
    combo.pack(pady=5)

    result = None

    def confirm():
        # Remember the current selection, then close the dialog so that
        # mainloop() below returns.
        nonlocal result
        result = selected.get()
        win.destroy()

    tk.Button(win, text="确认", command=confirm, width=10, font=("微软雅黑", 10)).pack(pady=10)
    win.mainloop()
    return result
# Pick the model; stop quietly when none was chosen.
model_name = select_model()
if not model_name:
    # `exit()` is a helper injected by the `site` module and may be missing
    # when the script runs as a frozen executable; raising SystemExit ends
    # the program the same way without relying on it.
    raise SystemExit
# ===================== Initialisation =====================
# Hidden Tk root window: created so the dialogs below have a parent, then
# withdrawn so that no empty window is shown.
root = tk.Tk()
root.withdraw()
# OpenCC converter using the 't2s' (Traditional -> Simplified Chinese) config.
cc = opencc.OpenCC('t2s')

# Ask the user for the video file.
video_path = filedialog.askopenfilename(
    title="选择视频文件",
    filetypes=[("视频格式", "*.mp4 *.mkv *.mov *.avi *.flv *.wmv"), ("所有文件", "*.*")],
)
if not video_path:
    # Dialog cancelled. `exit()` comes from the `site` module and may be
    # missing in a frozen executable, so raise SystemExit directly.
    raise SystemExit

# Output files: same folder and base name as the video, .txt and .srt.
video_dir = os.path.dirname(video_path)
video_name = os.path.splitext(os.path.basename(video_path))[0]
txt_file = os.path.join(video_dir, f"{video_name}.txt")
srt_file = os.path.join(video_dir, f"{video_name}.srt")
# Query the video duration with ffprobe.
def get_video_duration(video):
    """Return the duration of *video* in seconds as reported by ``ffprobe``.

    This is a best-effort helper: when ffprobe cannot be started, or its
    output does not contain a usable ``format.duration`` value, the
    placeholder ``100`` is returned instead of raising.

    Args:
        video: Path of the media file to probe.

    Returns:
        The duration as a float, or ``100`` when it cannot be determined.
    """
    try:
        res = subprocess.run(
            ["ffprobe", "-v", "error", "-show_entries", "format=duration", "-of", "json", video],
            stdout=subprocess.PIPE,
        )
        return float(json.loads(res.stdout)["format"]["duration"])
    except (OSError, ValueError, KeyError, TypeError):
        # OSError: ffprobe missing or not executable.
        # ValueError: invalid JSON or a non-numeric duration.
        # KeyError / TypeError: JSON without the expected structure, or a
        # bad argument type.
        # Unlike a bare `except:`, this lets KeyboardInterrupt and SystemExit
        # propagate.
        return 100
total_time = get_video_duration(video_path)

# ===================== Load the model (CPU only) =====================
print(f"\n加载模型:{model_name}")
# NOTE(review): the thread and worker counts are hard-coded to 16; confirm
# this matches the target machine. `num_workers` presumably only helps when
# transcribe() is called from several Python threads - verify against the
# faster-whisper documentation before changing it.
model = WhisperModel(
    model_size_or_path=model_name,
    device="cpu",
    compute_type="int8",
    cpu_threads=16,
    num_workers=16,
)

# ===================== Transcribe =====================
print(f"\n视频总时长:{total_time:.1f}秒,开始识别...\n")
segments, info = model.transcribe(video_path, language="zh", vad_filter=False)

# Write the SRT subtitle file and the plain-text transcript side by side.
with open(txt_file, "w", encoding="utf-8") as f_txt, \
        open(srt_file, "w", encoding="utf-8") as f_srt:
    idx = 1  # SRT cue numbers start at 1
    for seg in segments:
        # Convert Traditional Chinese output to Simplified.
        text = cc.convert(seg.text.strip())
        if not text:
            # A segment with no text would produce an SRT cue without any
            # subtitle line and a blank line in the transcript; skip it and
            # keep the cue numbering contiguous.
            continue
        start_str = format_srt_time(seg.start)
        end_str = format_srt_time(seg.end)
        print(f"[{start_str} → {end_str}] {text}")
        f_txt.write(text + "\n")
        # Standard SRT cue: index, time range, text, blank separator line.
        f_srt.write(f"{idx}\n")
        f_srt.write(f"{start_str} --> {end_str}\n")
        f_srt.write(f"{text}\n\n")
        idx += 1

# Completion notice.
messagebox.showinfo("完成", f"模型:{model_name}\n识别成功!\n已生成:\n1. 纯文本.txt\n2. SRT字幕(PotPlayer直接用)")