python实现分离不同人声、wespeaker

文章目录

剪映等工具都支持背景音和人声分离。
那么如果想分离不同人声呢?例如想把郭靖和黄蓉的声音分开。

方案选型

1、本地运行、无需上传 # 上传的不但慢，而且大多有限制

2、最好是国内的，如果需要注册hugging face，国内经常连不上

所以方案为：

ffmpeg

demucs(Meta/Facebook开发的-人声分离专家)

wespeaker(WeNet 开源社区(wenet-e2e)-声纹识别专家)

代码流程：

python 复制代码

你可以把我们的程序想象成一个音频加工厂，流程如下：
1、原料入库 (FFmpeg)
输入：input.mp4 (杂乱的视频文件)
操作：提取音频流 -> 转为标准 WAV
产出：input_temp.wav (纯净的音频原料)
2、粗加工 - 分离人声 (Demucs)
输入：input_temp.wav
操作：AI 模型分析频谱 -> 剥离背景音乐
产出：input_vocals.wav (只有人声的干音)
3、精加工 - 声纹识别 (Wespeaker)
输入：input_vocals.wav
操作：
切片：检测哪里有人在说话 (VAD)。
特征提取：给每段话打上"声纹指纹"。
聚类：把相同的指纹归为一类（比如"说话人 A"、"说话人 B"）。
产出：speaker_1.wav, speaker_2.wav (按人分好的文件；聚类标签从 1 开始编号)

步骤

需要提前装好ffmpeg。

1、安装依赖

python 复制代码

pip install torch torchaudio demucs scipy numpy pydub
pip install s3prl
pip install openai-whisper 
pip install pydub

注：wespeaker国内镜像没有，先从git下载master包，然后再安装到本地即可。

2、代码

python 复制代码

import os
import torch
import torchaudio
import numpy as np
from scipy.cluster.hierarchy import fcluster, linkage
from scipy.spatial.distance import pdist
import warnings
import subprocess

# Suppress warnings whose message mentions "set_audio_backend".
warnings.filterwarnings("ignore", message=".*set_audio_backend.*")

# ================= Configuration =================
INPUT_FILE = "input.mp4"  # original media file to process (an MP4 in this example)
# Local path of the WeSpeaker model; its existence is checked before loading.
WESPEAKER_MODEL_PATH = "wespeaker_vox1_en_resnet34.pt"
OUTPUT_DIR = "output_speakers"  # directory that receives the per-speaker WAV files
# =================================================

# Startup banner; printed whenever this module is executed or imported,
# because the call sits at module level.
print("🚀 正在启动音频分离与声纹识别程序...")


# --- Preprocessing: convert the input to WAV with FFmpeg (invoked via subprocess) ---
def prepare_audio(input_file):
    """Convert any FFmpeg-readable media file to a WAV file.

    A file whose name already ends in ``.wav`` (case-insensitive) is returned
    unchanged. Anything else is decoded by the ``ffmpeg`` executable found on
    PATH into 16-bit PCM, 44.1 kHz, stereo, and written next to the input as
    ``<name>_temp.wav`` (an existing file of that name is overwritten).

    Args:
        input_file: Path of the source audio/video file.

    Returns:
        The path of a WAV file, or ``None`` when the input does not exist,
        FFmpeg cannot be started, or the conversion fails.
    """
    if not os.path.exists(input_file):
        print(f"❌ 找不到输入文件: {input_file}")
        return None

    # Already a WAV file: nothing to convert.
    if input_file.lower().endswith('.wav'):
        return input_file

    # splitext only strips an extension of the final path component, so a dot
    # in a directory name (e.g. "clips.v2/video") cannot truncate the path.
    output_wav = os.path.splitext(input_file)[0] + "_temp.wav"

    print(f"🔄 正在使用 FFmpeg 将 {input_file} 转换为 WAV...")

    # Relies on 'ffmpeg' being resolvable through the PATH environment variable.
    cmd = [
        'ffmpeg', '-y', '-i', input_file,
        '-vn',  # drop the video stream
        '-acodec', 'pcm_s16le',  # 16-bit PCM (WAV)
        '-ar', '44100',  # 44.1 kHz sample rate
        '-ac', '2',  # stereo
        output_wav
    ]

    try:
        subprocess.run(cmd, check=True, stdout=subprocess.PIPE, stderr=subprocess.PIPE)
    except OSError as e:
        # Raised when the executable itself cannot be started (not installed,
        # not on PATH, not executable). This is the case the hint below is for.
        print(f"❌ FFmpeg 转换失败: {e}")
        print("💡 请确保在终端输入 'ffmpeg' 能直接运行")
        return None
    except subprocess.CalledProcessError as e:
        # FFmpeg's console output is not guaranteed to be UTF-8 (e.g. on
        # Windows), so never let decoding the error text raise.
        print(f"❌ FFmpeg 转换失败: {e.stderr.decode(errors='replace')}")
        print("💡 请确保在终端输入 'ffmpeg' 能直接运行")
        return None

    print(f"✅ 转换成功: {output_wav}")
    return output_wav


# --- 步骤 1: 使用 Demucs 去除背景音乐 ---
def step1_separate_vocals(input_file):
    """Separate the vocal stem from an audio file with Demucs (``htdemucs``).

    The audio is converted to the sample rate and channel count the model
    declares, separated into stems, and the stem named ``'vocals'`` is written
    next to the input as ``<name>_vocals.wav``.

    Args:
        input_file: Path of an audio file readable by ``torchaudio.load``.

    Returns:
        Path of the written vocals file, or ``None`` if any step failed (the
        error and traceback are printed).
    """
    print("\n" + "=" * 30)
    print("🎬 [步骤 1] 正在使用 Demucs 去除背景音乐...")
    print("=" * 30)

    try:
        from demucs.pretrained import get_model
        from demucs.apply import apply_model
        from demucs.audio import convert_audio, save_audio

        # 1. Load the pretrained model.
        model = get_model('htdemucs')
        model.eval()

        # 2. Load the audio.
        waveform, sample_rate = torchaudio.load(input_file)
        print(f"📥 音频加载完成: {waveform.shape}, 采样率: {sample_rate}")

        # The input is not guaranteed to be 44.1 kHz stereo (a .wav input is
        # passed through without FFmpeg), so convert it to what the model
        # declares instead of assuming.
        waveform = convert_audio(
            waveform, sample_rate, model.samplerate, model.audio_channels
        )

        # apply_model expects [batch, channels, length]; torchaudio returns
        # [channels, length], so add a batch dimension in front.
        if waveform.dim() == 2:
            waveform = waveform.unsqueeze(0)

        print(f"🔄 调整后的音频形状: {waveform.shape}")

        # 3. Run the separation, on the GPU when one is available (same
        # device selection as the speaker-embedding step).
        print("🔄 正在分离音轨 (这可能需要几分钟)...")
        device = 'cuda' if torch.cuda.is_available() else 'cpu'
        out = apply_model(model, waveform, device=device, split=True, overlap=0.25)

        # out is [batch, sources, channels, length]; take the only batch item.
        sources = out[0]

        print(f"🎵 分离完成，共得到 {sources.shape[0]} 条音轨")

        # 4. Pick the vocal stem by name. The stem order is model-defined
        # (for htdemucs: drums, bass, other, vocals), so index 0 is NOT vocals.
        vocals = sources[model.sources.index('vocals')]

        # 5. Save the vocals at the rate the model actually produced.
        vocals_file = os.path.splitext(input_file)[0] + "_vocals.wav"
        save_audio(vocals, vocals_file, samplerate=model.samplerate)

        print(f"✅ 人声提取成功: {vocals_file}")
        return vocals_file

    except Exception as e:
        print(f"❌ 步骤 1 失败: {e}")
        import traceback
        traceback.print_exc()
        return None


# --- Helper for step 2: simple energy-based voice activity detection (VAD) ---
def simple_energy_vad(waveform, sample_rate, threshold_ratio=0.02, min_duration=0.5,
                      frame_duration=0.03):
    """Find speech-like regions with a frame-level amplitude threshold.

    The signal is mixed down to mono and cut into frames of ``frame_duration``
    seconds. A frame is active when its peak absolute amplitude exceeds
    ``threshold_ratio`` times the peak of the whole signal. Consecutive active
    frames form one segment, including a segment that is still open when the
    audio ends.

    Deciding per frame rather than per sample matters: a waveform passes
    through zero many times per second, so a per-sample threshold would end
    the segment at every zero crossing.

    Args:
        waveform: Tensor of shape [channels, samples] (a 1-D [samples] tensor
            is also accepted).
        sample_rate: Samples per second of ``waveform``.
        threshold_ratio: Activity threshold as a fraction of the global peak.
        min_duration: Segments of this many seconds or fewer are discarded.
        frame_duration: Frame length in seconds; values that round to less
            than one sample fall back to one sample per frame.

    Returns:
        List of ``(start_sample, end_sample)`` tuples of Python ints, end
        exclusive, in chronological order. Empty for empty or silent input.
    """
    if waveform.dim() > 1 and waveform.shape[0] > 1:
        mono = waveform.mean(dim=0)
    else:
        mono = waveform.reshape(-1)

    n_samples = mono.numel()
    if n_samples == 0:
        return []

    amplitude = mono.abs()
    peak = amplitude.max()
    if peak == 0:
        return []
    threshold = peak * threshold_ratio

    # Peak amplitude per frame; the tail is zero-padded to a whole frame.
    frame_len = max(1, int(round(sample_rate * frame_duration)))
    n_frames = -(-n_samples // frame_len)  # ceiling division
    padded_len = n_frames * frame_len
    if padded_len != n_samples:
        amplitude = torch.nn.functional.pad(amplitude, (0, padded_len - n_samples))
    frame_peak = amplitude.reshape(n_frames, frame_len).max(dim=1).values

    # Surround the activity flags with zeros so that runs touching either end
    # of the audio still produce a rising and a falling edge.
    active = (frame_peak > threshold).to(torch.int8)
    boundary = active.new_zeros(1)
    flags = torch.cat([boundary, active, boundary])
    edges = flags[1:] - flags[:-1]
    run_starts = torch.nonzero(edges == 1).flatten().tolist()
    run_ends = torch.nonzero(edges == -1).flatten().tolist()

    segments = []
    for first_frame, end_frame in zip(run_starts, run_ends):
        start_sample = first_frame * frame_len
        end_sample = min(end_frame * frame_len, n_samples)  # clip the padding
        if (end_sample - start_sample) / sample_rate > min_duration:
            segments.append((start_sample, end_sample))
    return segments


# --- Step 2: speaker embedding extraction and clustering ---
def _extract_segment_embedding(speaker_model, segment_wave, sample_rate):
    """Return one segment's speaker embedding as a 1-D float array, or None."""
    if hasattr(speaker_model, "extract_embedding_from_pcm"):
        # In-memory API: takes the samples and their sample rate.
        embedding = speaker_model.extract_embedding_from_pcm(segment_wave, sample_rate)
    else:
        # extract_embedding() takes an audio *file path*, so go through a
        # temporary WAV file when the in-memory API is unavailable.
        import tempfile
        with tempfile.TemporaryDirectory() as tmp_dir:
            tmp_wav = os.path.join(tmp_dir, "segment.wav")
            torchaudio.save(tmp_wav, segment_wave, sample_rate)
            embedding = speaker_model.extract_embedding(tmp_wav)

    if embedding is None:
        return None
    if torch.is_tensor(embedding):
        embedding = embedding.detach().cpu().numpy()
    return np.asarray(embedding, dtype=np.float64).reshape(-1)


def step2_diarize_speakers(vocals_file, output_dir, merge_segments=True):
    """Cluster the speech segments of a vocals file by speaker and save them.

    Pipeline: load and mix down to mono -> detect speech segments with
    ``simple_energy_vad`` -> extract one WeSpeaker embedding per segment ->
    average-linkage clustering on cosine distance (cut at 0.5) -> write one
    ``speaker_<label>.wav`` per cluster into ``output_dir``. Cluster labels
    come from ``scipy.cluster.hierarchy.fcluster`` and start at 1.

    Args:
        vocals_file: Path of the vocals-only audio file.
        output_dir: Directory for the per-speaker files (created if missing).
        merge_segments: When True (default) each output file contains all of
            that speaker's segments concatenated in chronological order; when
            False only the speaker's longest segment is written.

    Returns:
        List of written file paths; empty when nothing was written (model
        missing or failing to load, fewer than two segments or embeddings).
    """
    print("\n" + "=" * 30)
    print("🧠 [步骤 2] 正在识别说话人...")
    print("=" * 30)

    if not os.path.exists(WESPEAKER_MODEL_PATH):
        print(f"❌ 找不到模型文件: {WESPEAKER_MODEL_PATH}")
        return []

    waveform, sample_rate = torchaudio.load(vocals_file)
    if waveform.shape[0] > 1:
        waveform = torch.mean(waveform, dim=0, keepdim=True)

    print(f"⏳ 正在加载 Wespeaker 模型...")
    try:
        import wespeaker
        # A local model directory is loaded with load_model_local(); any other
        # value is handed to load_model() exactly as before.
        if os.path.isdir(WESPEAKER_MODEL_PATH) and hasattr(wespeaker, "load_model_local"):
            speaker_model = wespeaker.load_model_local(WESPEAKER_MODEL_PATH)
        else:
            speaker_model = wespeaker.load_model(WESPEAKER_MODEL_PATH)
        device = 'cuda' if torch.cuda.is_available() else 'cpu'
        speaker_model.set_device(device)
        print("✅ 模型加载成功")
    except Exception as e:
        print(f"❌ 模型加载失败: {e}")
        return []

    print("🔪 正在检测语音片段...")
    segments = simple_energy_vad(waveform, sample_rate)
    print(f"🔍 检测到 {len(segments)} 个有效语音片段")

    if len(segments) < 2:
        print("⚠️ 语音片段太少，无法聚类")
        return []

    embeddings = []
    valid_segments = []

    print("📝 正在提取声纹特征...")
    for start, end in segments:
        embedding = _extract_segment_embedding(
            speaker_model, waveform[:, start:end], sample_rate
        )
        if embedding is not None:
            embeddings.append(embedding)
            valid_segments.append((start, end))

    if len(embeddings) < 2:
        print("❌ 无法提取足够的声纹特征")
        return []

    embeddings = np.vstack(embeddings)

    print("📊 正在进行聚类分析...")
    distances = pdist(embeddings, metric='cosine')
    Z = linkage(distances, 'average')
    labels = fcluster(Z, t=0.5, criterion='distance')

    # Plain, sorted Python ints keep the log line readable and deterministic.
    unique_labels = sorted(set(labels.tolist()))
    print(f"✅ 识别出 {len(unique_labels)} 位说话人: {unique_labels}")

    os.makedirs(output_dir, exist_ok=True)

    saved_files = []
    for label in unique_labels:
        pieces = [
            waveform[:, valid_segments[idx][0]:valid_segments[idx][1]]
            for idx in np.flatnonzero(labels == label)
        ]
        if merge_segments:
            speaker_wave = torch.cat(pieces, dim=1)
        else:
            # max() keeps the first of equally long segments.
            speaker_wave = max(pieces, key=lambda piece: piece.shape[1])

        output_path = os.path.join(output_dir, f"speaker_{label}.wav")
        torchaudio.save(output_path, speaker_wave, sample_rate)
        print(f"   - 保存: {output_path} (时长: {speaker_wave.shape[1] / sample_rate:.2f}s)")
        saved_files.append(output_path)

    return saved_files


# ================= Main program =================
if __name__ == "__main__":
    # 1. Preprocess: MP4 -> WAV
    wav_file = prepare_audio(INPUT_FILE)

    if wav_file:
        # 2. Separate the vocals from the background
        vocals_path = step1_separate_vocals(wav_file)

        # 3. Cluster by speaker; report success only if files were written
        #    (each failure path has already printed its own message).
        if vocals_path:
            saved_files = step2_diarize_speakers(vocals_path, OUTPUT_DIR)
            if saved_files:
                print("\n🎉 全部处理完成！")

            # Optional cleanup of the intermediate WAV. Only safe when
            # wav_file is not INPUT_FILE itself (a .wav input is passed
            # through unchanged):
            # os.remove(wav_file)

输出结果：

python 复制代码

🚀 正在启动音频分离与声纹识别程序...
🔄 正在使用 FFmpeg 将 input.mp4 转换为 WAV...
✅ 转换成功: input_temp.wav

==============================
🎬 [步骤 1] 正在使用 Demucs 去除背景音乐...
==============================
Downloading: "https://dl.fbaipublicfiles.com/demucs/hybrid_transformer/955717e8-8726e21a.th" to C:\Users\user/.cache\torch\hub\checkpoints\955717e8-8726e21a.th
100%|██████████| 80.2M/80.2M [00:15<00:00, 5.29MB/s]
📥 音频加载完成: torch.Size([2, 13218240]), 采样率: 44100
🔄 调整后的音频形状: torch.Size([1, 2, 13218240])
🔄 正在分离音轨 (这可能需要几分钟)...
🎵 分离完成，共得到 4 条音轨
✅ 人声提取成功: input_temp_vocals.wav

==============================
🧠 [步骤 2] 正在识别说话人...
==============================
❌ 找不到模型文件: wespeaker_vox1_en_resnet34.pt

🎉 全部处理完成！

步骤 1（人声提取）果然成功了，效果还不错，就是有点慢。注意：上面的日志里，步骤 2 因为找不到 wespeaker 模型文件而没有真正执行，需要先把模型准备好再重新运行。

可用的几个模型

mdx_extra：效果最好，速度最慢（龟速）。
htdemucs_ft：效果不错，速度中等（推荐）。
htdemucs：效果一般，速度很快。

其他

如果遇到c++问题

安装 Visual Studio（含 C++ 构建工具）即可。

wespeaker下载安装步骤

1、下文git地址下载master分支zip包。

2、pycharm所在虚拟环境中，执行安装命令

python 复制代码

D:/PycharmProjects/transformer_demo/.venv/Scripts/python.exe -m pip install "安装包路径"

其他

文档

wespeaker官网git地址(这里下载的master分支，可以pip install成功)：
https://github.com/wenet-e2e/wespeaker