Whisper 语音转文字配置

Whisper CUDA (RTX 5060) 环境配置笔记

1. 环境安装指令

第一步：卸载旧版 Torch (确保无冲突)

Bash 命令：

pip uninstall torch torchvision torchaudio -y

第二步：安装支持 RTX 5060 (Blackwell 架构) 的 CUDA 12.8 版 PyTorch

Bash 命令：

pip install torch torchvision torchaudio --index-url https://download.pytorch.org/whl/cu128

第三步：安装核心组件

Bash 命令：

pip install faster-whisper whisper-ctranslate2

2. 实际使用指令

推荐转录命令 (生成带标点、无时间戳的连贯文本)：

Bash 命令：

whisper-ctranslate2 "输入文件.m4a" --model large-v3 --language zh --output_format txt --initial_prompt "以下是转录内容，请确保语句连贯并正确使用中文标点符号。"

示例（使用 whisper-ctranslate2）：

whisper-ctranslate2 "lesson 0415 32h.m4a" --model large-v3 --language zh --output_format txt --initial_prompt "以下是转录内容，请确保语句连贯并正确使用中文标点符号。"

对照示例（使用原版 openai-whisper 的 whisper 命令；上面的安装步骤不包含该包，需另行 pip install openai-whisper）：

whisper "lesson 0415 32h.m4a" --model large-v3 --language zh --output_format txt --initial_prompt "以下是转录内容，请确保语句连贯并正确使用中文标点符号。"

参数说明：

--model large-v3: 使用精度最高的模型。
--language zh: 强制识别为中文。
--output_format txt: 仅输出纯文本文件（不含时间戳）。
--initial_prompt: 通过引导语提示模型输出标点符号（属于引导而非强制，效果因音频而异）。

可选参数（未包含在上面的推荐命令中，可按需追加）：

--beam_size 1：将束搜索宽度设为1，与原版 whisper 默认值一致，减少重复幻觉。

--vad_filter True：启用语音活动检测，自动跳过静音片段，避免无语音段产生幻觉。

--condition_on_previous_text False：禁止用前一段输出作为下一段的上下文，防止错误内容向后传播。

--word_timestamps True：启用词级时间戳，改善分句断点的准确性。

3. 脚本

transcribe.py（Python 脚本）：

"""
快速语音转文字工具 (whisper-ctranslate2)
用法: python transcribe.py
"""

import os
import subprocess

# ── Configuration ──
# Value passed to whisper-ctranslate2 via --model.
MODEL = "large-v3"
# Value passed to whisper-ctranslate2 via --output_format.
OUTPUT_FORMAT = "txt"

# Flags that every language profile below passes unchanged.
_SHARED_ARGS = ("--beam_size", "1", "--vad_filter", "True")

# Text handed to --initial_prompt for the Chinese profile.
_ZH_INITIAL_PROMPT = "大家好，这是一段录音。我现在开始讲话了，请注意听。今天我们来讨论一下这个问题。如果内容中有一些English，比如app，或者数字10等，保持原词不需要翻译。"

# Menu key -> language profile: the --language code, the label shown in the
# menu, and the extra command-line arguments appended for that language.
LANG_OPTIONS = {
    "1": dict(
        code="zh",
        label="中文",
        extra_args=[*_SHARED_ARGS, "--initial_prompt", _ZH_INITIAL_PROMPT],
    ),
    "2": dict(
        code="en",
        label="English",
        extra_args=list(_SHARED_ARGS),
    ),
}

# Lower-case file extensions (with leading dot) that list_files() accepts.
AUDIO_EXTS = {
    "." + suffix
    for suffix in (
        "mp3", "m4a", "wav", "flac", "ogg",
        "wma", "aac", "mp4", "mkv", "webm",
    )
}


def list_files():
    """Return the sorted names of media files in the current directory.

    An entry qualifies when ``os.path.isfile`` accepts it and its extension,
    lower-cased, is a member of ``AUDIO_EXTS``.
    """
    matches = []
    for name in os.listdir("."):
        if not os.path.isfile(name):
            continue
        _, ext = os.path.splitext(name)
        if ext.lower() in AUDIO_EXTS:
            matches.append(name)
    matches.sort()
    return matches


def choose_language():
    """Print the language menu and return the profile the user picks.

    Returns:
        The matching ``LANG_OPTIONS`` entry, or ``None`` when the user enters
        ``0`` or an empty line, enters a key that is not in ``LANG_OPTIONS``,
        or interrupts the prompt (Ctrl+C / end of input).
    """
    rule = "─" * 50
    print(f"\n{rule}")
    print("  选择语言 / Select language")
    print(f"{rule}\n")
    for key, profile in LANG_OPTIONS.items():
        print(f"  [{key}]  {profile['label']}")
    print("\n  [0]  退出\n")

    try:
        answer = input("输入编号: ").strip()
    except (KeyboardInterrupt, EOFError):
        # Finish the interrupted prompt line before bailing out.
        print()
        return None

    if answer in ("0", ""):
        return None

    profile = LANG_OPTIONS.get(answer)
    if profile is None:
        print("编号无效。")
    return profile


def choose_file(files, lang):
    """Print a numbered file menu and return the file the user picks.

    Args:
        files: File names to offer; each is shown with its size in MB.
        lang: Language profile whose ``label`` is shown in the header.

    Returns:
        The selected entry of ``files``, or ``None`` when the user enters
        ``0`` or an empty line, enters something that is not a valid number
        in range, or interrupts the prompt (Ctrl+C / end of input).
    """
    rule = "─" * 50
    print(f"\n{rule}")
    print(f"  模型: {MODEL}  |  语言: {lang['label']}  |  格式: {OUTPUT_FORMAT}")
    print(f"{rule}\n")

    bytes_per_mb = 1024 * 1024
    for number, name in enumerate(files, start=1):
        megabytes = os.path.getsize(name) / bytes_per_mb
        print(f"  [{number}]  {name}  ({megabytes:.1f} MB)")

    print("\n  [0]  返回\n")

    try:
        answer = input("输入编号开始转录: ").strip()
    except (KeyboardInterrupt, EOFError):
        # Finish the interrupted prompt line before bailing out.
        print()
        return None

    if answer in ("0", ""):
        return None

    try:
        position = int(answer) - 1
    except ValueError:
        print("请输入数字。")
        return None

    # Menu numbers are 1-based; position is the 0-based list index.
    if not 0 <= position < len(files):
        print("编号无效。")
        return None

    return files[position]


def main():
    """Run the interactive transcription workflow.

    Prompts for a language, collects the media files in the current
    directory, prompts for one of them, and then runs ``whisper-ctranslate2``
    on the selection. Returns early without running anything when a prompt
    is cancelled or no media file is found.

    Problems while running the external command are reported with a short
    message instead of a traceback: a missing executable, an interruption
    with Ctrl+C, and a non-zero exit status.
    """
    lang = choose_language()
    if not lang:
        return

    files = list_files()
    if not files:
        print("当前目录没有找到音视频文件。")
        return

    selected = choose_file(files, lang)
    if not selected:
        return

    print(f"\n开始转录: {selected}\n")

    cmd = [
        "whisper-ctranslate2", selected,
        "--model", MODEL,
        "--language", lang["code"],
        "--task", "transcribe",
        "--output_format", OUTPUT_FORMAT,
        *lang["extra_args"],
    ]

    try:
        # List form (no shell), so file names with spaces need no quoting.
        result = subprocess.run(cmd)
    except FileNotFoundError:
        # Raised when the executable cannot be found on PATH.
        print("未找到 whisper-ctranslate2 命令，请先安装: pip install whisper-ctranslate2")
        return
    except KeyboardInterrupt:
        # Treat Ctrl+C during transcription like Ctrl+C at the prompts.
        print("\n转录已中断。")
        return

    if result.returncode != 0:
        print(f"\n转录失败（退出码 {result.returncode}）。")


# Start the interactive workflow only when this file is executed as a script,
# not when it is imported as a module.
if __name__ == "__main__":
    main()