faster_whisper，视频转文字，并生成字幕文件

faster_whisper，视频转文字，并生成字幕文件（附带exe）

使用说明：

模型（--model）：可选 tiny/base/small/medium/large（越大越准、越耗资源）。当前脚本未解析命令行参数，模型在 select_model() 中固定为 medium。

模型路径(medium)：C:\Users\XXX\.cache\huggingface\hub\models--Systran--faster-whisper-medium\snapshots\08e178d48790749d25932bbc082711ddcfdfbc4f

直接运行run.exe
选择视频文件
等待结果，结果保存在视频所在文件夹，文件名与视频文件名相同，格式为txt和srt
使用PotPlayer播放视频，自动读取同名的字幕文件
暂停的时候，可以复制当前字幕内容到剪贴板

Python 代码：

# ============ Force 16-thread CPU execution and disable the GPU ============
# Kept ahead of the remaining imports so the values are already in place when
# those libraries are loaded.
import os

os.environ.update({
    # Thread-count settings for OpenMP, MKL and Numba.
    "OMP_NUM_THREADS": "16",
    "MKL_NUM_THREADS": "16",
    "NUMBA_NUM_THREADS": "16",
    # Silence the Hugging Face Hub warning about missing symlink support.
    "HF_HUB_DISABLE_SYMLINKS_WARNING": "1",
    # Hide every CUDA device so that nothing runs on the GPU.
    "CUDA_VISIBLE_DEVICES": "-1",
})
# ===========================================================================

from faster_whisper import WhisperModel
import opencc
import tkinter as tk
from tkinter import ttk, filedialog, messagebox
import subprocess
import json
import datetime

# Convert a time offset in seconds to an SRT timestamp
def format_srt_time(sec):
    """Format a time offset as an SRT timestamp ``HH:MM:SS,mmm``.

    Args:
        sec: Offset from the start of the media, in seconds (int or float).
            Negative values are clamped to zero because SRT has no notation
            for negative timestamps.

    Returns:
        The timestamp string, e.g. ``"01:01:01,500"``.
    """
    # Round once to whole milliseconds, then split with integer arithmetic.
    # Truncating the float fraction instead (int((sec - int(sec)) * 1000))
    # drops a millisecond for values such as 1.001, whose binary
    # representation sits just below the decimal value.
    total_ms = max(0, int(round(sec * 1000)))
    total_s, ms = divmod(total_ms, 1000)
    total_m, s = divmod(total_s, 60)
    h, m = divmod(total_m, 60)
    return f"{h:02d}:{m:02d}:{s:02d},{ms:03d}"

# ============ Model selection (drop-down dialog, currently disabled) ============
def select_model():
    """Return the name of the Whisper model to load.

    The interactive drop-down dialog is disabled (it is kept below, commented
    out, for reference), so this function always returns ``"medium"``.
    """
    # --- disabled dialog: start ---
    # win = tk.Tk()
    # win.title("选择模型")
    # win.geometry("300x150")
    # win.resizable(False, False)
    # win.attributes('-topmost', True)  # keep the window on top
    #
    # # Model options (add or remove entries as needed)
    # model_options = ["small", "medium", "large"]
    #
    # tk.Label(win, text="请选择识别模型：", font=("微软雅黑", 12)).pack(pady=20)
    # selected = tk.StringVar(value=model_options[0])
    # combo = ttk.Combobox(win, textvariable=selected, values=model_options, state="readonly", font=("微软雅黑", 11))
    # combo.pack(pady=5)
    #
    # result = None
    # def confirm():
    #     nonlocal result
    #     result = selected.get()
    #     win.destroy()
    #
    # tk.Button(win, text="确认", command=confirm, width=10, font=("微软雅黑", 10)).pack(pady=10)
    # win.mainloop()
    # return result
    # --- disabled dialog: end ---
    chosen_model = "medium"
    return chosen_model

# Pick the model
model_name = select_model()
if not model_name:
    # Nothing was chosen: stop quietly with exit status 0. SystemExit is
    # raised directly because the exit() helper is injected by the site module
    # for interactive use and is not guaranteed to exist in a frozen
    # executable.
    raise SystemExit

# ===================== Initialisation =====================
# Create the Tk root and hide it immediately, so only the dialogs are shown.
root = tk.Tk()
root.withdraw()
# OpenCC converter using the 't2s' (Traditional -> Simplified Chinese) config;
# it is applied to every recognised segment further down.
cc = opencc.OpenCC('t2s')

# Choose the video
video_path = filedialog.askopenfilename(
    title="选择视频文件",
    filetypes=[("视频格式", "*.mp4 *.mkv *.mov *.avi *.flv *.wmv"), ("所有文件", "*.*")]
)
if not video_path:
    # Dialog cancelled: stop quietly with exit status 0. SystemExit is raised
    # directly because the exit() helper is injected by the site module for
    # interactive use and is not guaranteed to exist in a frozen executable.
    raise SystemExit

# Output files: same folder and base name as the video, .txt and .srt extensions
video_dir = os.path.dirname(video_path)
video_name, _ = os.path.splitext(os.path.basename(video_path))
txt_file, srt_file = (
    os.path.join(video_dir, video_name + extension)
    for extension in (".txt", ".srt")
)

# Get the video duration
def get_video_duration(video):
    """Return the duration of *video* in seconds, as reported by ffprobe.

    Args:
        video: Path of the media file to probe.

    Returns:
        The ``format.duration`` value from ffprobe's JSON output, as a float.
        When it cannot be determined (ffprobe missing or not runnable, output
        that is not the expected JSON, no duration field) the placeholder
        value ``100`` is returned so the caller can carry on.
    """
    command = [
        "ffprobe", "-v", "error",
        "-show_entries", "format=duration",
        "-of", "json", video,
    ]
    try:
        res = subprocess.run(command, stdout=subprocess.PIPE)
        return float(json.loads(res.stdout)["format"]["duration"])
    except (OSError, ValueError, KeyError, TypeError):
        # Best effort only: OSError covers a missing/unrunnable ffprobe,
        # ValueError covers invalid JSON or a non-numeric duration, and
        # KeyError/TypeError cover output without the expected structure.
        # KeyboardInterrupt and SystemExit are deliberately not caught, so the
        # user can still abort the program here.
        return 100
# Duration is only used for the informational message printed below.
total_time = get_video_duration(video_path)

# ===================== Load the model (CPU only, 16 threads) =====================
print(f"\n加载模型：{model_name}")
model = WhisperModel(
    model_size_or_path=model_name,
    device="cpu",
    compute_type="int8",
    cpu_threads=16,
    # A single worker is enough: extra workers only help when transcribe() is
    # called concurrently from several Python threads, and they cost memory.
    # This script issues exactly one transcribe() call; the 16 cores are
    # already used through cpu_threads.
    num_workers=1,
)

# ===================== Start recognition =====================
print(f"\n视频总时长：{total_time:.1f}秒，开始识别...\n")
# `segments` is consumed by the write loop that follows; `info` is not used.
segments, info = model.transcribe(video_path, language="zh", vad_filter=False)

# Write the SRT subtitle file and the plain-text transcript side by side
with open(txt_file, "w", encoding="utf-8") as f_txt, \
     open(srt_file, "w", encoding="utf-8") as f_srt:
    # Cue numbers start at 1.
    for idx, seg in enumerate(segments, start=1):
        # Strip surrounding whitespace, then run the text through the OpenCC converter.
        text = cc.convert(seg.text.strip())
        start_str, end_str = format_srt_time(seg.start), format_srt_time(seg.end)

        # Echo each segment to the console as it is produced.
        print(f"[{start_str} → {end_str}] {text}")

        # Plain transcript: one segment per line.
        f_txt.write(text + "\n")
        # One SRT cue: index, time range, text, then a blank separator line.
        f_srt.write(f"{idx}\n{start_str} --> {end_str}\n{text}\n\n")

# Tell the user that both output files have been written
messagebox.showinfo(
    "完成",
    f"模型：{model_name}\n"
    "识别成功！\n"
    "已生成：\n"
    "1. 纯文本.txt\n"
    "2. SRT字幕（PotPlayer直接用）",
)