企业语音处理场景中,对话识别是非常关键的能力,尤其是在会议场景、客服通话、机器人交互中:
- 话语断断续续,如何断句? → VAD 句子切分
- 不知道有几个人说话? → 自动估计说话人数
- 谁说的? → 声纹聚类
- 说了什么? → ASR 转写
- 输出带人物身份的转写文本 ✔
今天带来一条完整的流水线技术演示👇
支持自动推断说话人数,完全不用手动指定!

输出结果:
```text
[00:00:03.110--00:00:04.620] 陌生人1:你今天中午吃的啥?
[00:00:04.990--00:00:07.510] 陌生人2:今天中午吃了汉堡包,味道挺不错的。
[00:00:08.380--00:00:09.930] 陌生人1:你今天晚上打算吃啥?
[00:00:10.270--00:00:11.260] 陌生人1:有什么想法吗?
[00:00:11.650--00:00:13.880] 陌生人2:我还没有想好,让小智帮我决定吧。
```
【如果你对人工智能的学习有兴趣可以看看我的其他博客,对新手很友好!!!】
【本猿定期无偿分享学习成果,欢迎关注一起学习!!!】
一.🧠 功能流程概览
任意音频文件(两人或多人说话)
│
1 ffmpeg 转 16k 单声道
│
2 可选:ClearVoice 降噪
│
3 RMS 能量 VAD → 句子级分段
│
4 每段音频切片 .wav
│
5 CAM++ 声纹向量(L2 归一化)
│
6 层次聚类 + silhouette 自动估计 K
│
7 FunASR 转写(含 VAD + 标点)
│
8 输出 transcript_cluster.txt
时间段 陌生人1:转写内容
二.🧱 环境依赖(推荐版本)
| 组件 | 推荐版本 |
|---|---|
| Python | 3.8 ~ 3.11 |
| numpy | >=1.23 |
| librosa | 0.10+ |
| scikit-learn | >=1.3.0 |
| soundfile | 最新 |
| ffmpeg | 必须在 PATH 中 |
| FunASR | ≥1.1.6 |
| ModelScope | ≥1.14 |
| ClearVoice(可选) | 从 GitHub 安装 |
推荐安装命令:
bash
pip install numpy librosa soundfile scikit-learn
pip install "funasr>=1.1.6" modelscope
pip install clearvoice
⚠️ 推荐使用 GPU(CUDA 环境),可大幅提升 ASR 速度;脚本默认使用 CPU,需要先设置环境变量 USE_CUDA=1 才会启用 GPU。
CPU 也可以运行,但延迟会明显更高。
三.📌 项目目录结构建议
bash
project/
├─ main_simple_cluster.py ← 本文脚本
├─ data/
│ └─ input.wav ← 你的音频文件
├─ output_simple/ ← 自动生成结果
└─ model/ ← 缓存模型
四.🔥 完整代码(已验证可运行)
python
# -*- coding: utf-8 -*-
"""
main_simple_cluster.py -- variant that estimates the number of speakers automatically.

Pipeline:
1. Convert the input to 16 kHz mono with ffmpeg.
2. (Optional) Denoise with ClearVoice.
3. Sentence-level VAD.
4. Cut one wav per sentence: segments/0000.wav, 0001.wav, ...
5. Extract a CAM++ speaker embedding for each sentence wav (L2-normalised).
6. Within [MIN_SPEAKERS, MAX_SPEAKERS], run AgglomerativeClustering with cosine
   distance and pick the cluster count K with the best silhouette score.
7. Run ASR on each sentence wav.
8. Write transcript_cluster.txt, one line per sentence:
   [00:00:03.110--00:00:04.620] 陌生人1:xxxx
"""
import os
import subprocess
from pathlib import Path
from typing import List, Dict, Tuple, Optional
import numpy as np
import soundfile as sf
from funasr import AutoModel
from sklearn.cluster import AgglomerativeClustering
from sklearn.metrics import silhouette_score
# =============== Global parameters ===============
# Every path below is resolved relative to the directory that contains this script.
BASE_DIR = Path(__file__).resolve().parent
DATA_DIR = BASE_DIR / "data"
INPUT_PATH = DATA_DIR / "input.wav"
OUTPUT_DIR = BASE_DIR / "output_simple"
SEGMENTS_DIR = OUTPUT_DIR / "segments"
# Import-time side effect: both output directories are created when the module loads.
OUTPUT_DIR.mkdir(parents=True, exist_ok=True)
SEGMENTS_DIR.mkdir(parents=True, exist_ok=True)
# Intermediate and final artefacts written by the pipeline.
INPUT_16K_MONO = OUTPUT_DIR / "input_16k_mono.wav"
ENHANCED_WAV = OUTPUT_DIR / "enhanced.wav"
TRANSCRIPT_CLUSTER = OUTPUT_DIR / "transcript_cluster.txt"
# Point the ModelScope cache environment variables at a local "model" directory.
# NOTE(review): these are set after `funasr` is imported at the top of the file;
# confirm the libraries read the variables lazily rather than at import time.
MS_CACHE_DIR = BASE_DIR / "model"
os.environ["MODELSCOPE_CACHE"] = str(MS_CACHE_DIR)
os.environ["MODELSCOPE_DATASETS_CACHE"] = str(MS_CACHE_DIR)
# Cached speaker-verification pipeline; assigned by get_sv_pipeline() on first use.
_SV_PIPELINE = None
# Search range for the number of speakers (K is estimated automatically within it).
MIN_SPEAKERS = 1
MAX_SPEAKERS = 4  # demo recordings rarely exceed 4 speakers; raise for real workloads
# =============== 工具函数 ===============
def run_cmd(cmd: List[str]):
    """Echo a command line, then execute it.

    Args:
        cmd: Program name followed by its arguments, as an argv list.

    Raises:
        subprocess.CalledProcessError: If the command exits with a non-zero status.
    """
    rendered = " ".join(cmd)
    print("[CMD] " + rendered)
    subprocess.run(cmd, check=True)
def ffmpeg_to_16k_mono(input_path: Path, output_path: Path):
    """Resample an audio file to 16 kHz mono via ffmpeg.

    The parent directory of ``output_path`` is created if needed and an
    existing output file is overwritten (ffmpeg ``-y``).

    Args:
        input_path: Source audio file.
        output_path: Destination file for the converted audio.
    """
    target_dir = output_path.parent
    target_dir.mkdir(parents=True, exist_ok=True)
    run_cmd(
        ["ffmpeg", "-y", "-i", str(input_path), "-ac", "1", "-ar", "16000", str(output_path)]
    )
    print(f"[OK] 转为 16k 单声道: {output_path}")
def optional_denoise_with_clearvoice(input_path: Path, output_path: Path) -> Path:
    """Denoise the audio with ClearVoice when it is available (best effort).

    Args:
        input_path: Audio file to enhance.
        output_path: Where the enhanced audio is written on success.

    Returns:
        ``output_path`` if enhancement succeeded; otherwise ``input_path``
        (ClearVoice is not installed, or it failed for any reason).
    """
    try:
        from clearvoice import ClearVoice
    except ImportError:
        print("[WARN] 无 clearvoice,跳过降噪")
        return input_path

    try:
        enhancer = ClearVoice(task="speech_enhancement", model_names=["FRCRN_SE_16K"])
        enhanced = enhancer(input_path=str(input_path), online_write=False)
        enhancer.write(enhanced, output_path=str(output_path))
        print(f"[OK] ClearVoice 降噪输出: {output_path}")
    except Exception as e:
        # Denoising is optional: fall back to the un-enhanced audio.
        print(f"[WARN] ClearVoice 失败: {e},使用未降噪音频")
        return input_path
    return output_path
def format_time_s(seconds: float) -> str:
    """Render a duration in seconds as ``HH:MM:SS.mmm``.

    The value is rounded to the nearest millisecond before being split
    into hours, minutes, seconds and milliseconds.
    """
    total_ms = int(round(seconds * 1000))
    hours, remainder_ms = divmod(total_ms, 3_600_000)
    minutes, remainder_ms = divmod(remainder_ms, 60_000)
    secs, millis = divmod(remainder_ms, 1000)
    return f"{hours:02d}:{minutes:02d}:{secs:02d}.{millis:03d}"
# =============== 简单 VAD ===============
def simple_vad_segments(
    wav_path: Path,
    frame_ms: float = 20.0,
    hop_ms: float = 10.0,
    min_speech: float = 0.8,
    min_silence: float = 0.25,
    energy_ratio: float = 0.6,
) -> List[Dict]:
    """Split a wav file into speech segments with an RMS-energy threshold.

    Frames whose RMS exceeds a threshold derived from the median non-zero RMS
    are treated as speech. Consecutive speech runs separated by less than
    ``min_silence`` seconds are merged, and merged runs shorter than
    ``min_speech`` seconds are dropped.

    Args:
        wav_path: Audio file to analyse (multi-channel input is averaged to mono).
        frame_ms: Analysis window length in milliseconds.
        hop_ms: Hop between windows in milliseconds.
        min_speech: Minimum duration (seconds) for a segment to be kept.
        min_silence: Gaps shorter than this (seconds) are bridged.
        energy_ratio: Multiplier applied to the median RMS to form the threshold.

    Returns:
        A list of ``{"start": float, "end": float}`` dicts in seconds; empty
        when every frame has zero energy.
    """
    import librosa

    print("[VAD] 简单能量 VAD 分段...")
    samples, rate = sf.read(str(wav_path))
    if samples.ndim > 1:
        samples = samples.mean(axis=1)

    # Window / hop sizes in samples, falling back to 20 ms / 10 ms when the
    # requested size rounds down to nothing.
    win = int(rate * frame_ms / 1000.0)
    if win <= 0:
        win = int(rate * 0.02)
    step = int(rate * hop_ms / 1000.0)
    if step <= 0:
        step = int(rate * 0.01)

    energy = librosa.feature.rms(y=samples, frame_length=win, hop_length=step)[0]
    frame_times = librosa.frames_to_time(np.arange(len(energy)), sr=rate, hop_length=step)

    positive = energy[energy > 0]
    if len(positive) == 0:
        print("[VAD] 全零,返回空")
        return []

    median = float(np.median(positive))
    # The threshold never drops below 30% of the median, whatever energy_ratio is.
    threshold = max(median * energy_ratio, median * 0.3)
    print(f"[VAD] RMS median={median:.6f}, threshold={threshold:.6f}")

    # Pass 1: turn the per-frame speech mask into raw (start, end) runs.
    raw_runs: List[Tuple[float, float]] = []
    run_start = None
    for t, is_speech in zip(frame_times, energy > threshold):
        if is_speech:
            if run_start is None:
                run_start = t
        elif run_start is not None:
            raw_runs.append((run_start, t))
            run_start = None
    if run_start is not None:
        # Audio ended while still inside speech: close the run at the file end.
        raw_runs.append((run_start, float(len(samples) / rate)))

    # Pass 2: bridge gaps shorter than min_silence.
    bridged: List[Tuple[float, float]] = []
    for start, end in raw_runs:
        if bridged and start - bridged[-1][1] < min_silence:
            bridged[-1] = (bridged[-1][0], end)
        else:
            bridged.append((start, end))

    # Pass 3: discard segments that are too short to be a sentence.
    finals: List[Dict] = [
        {"start": float(start), "end": float(end)}
        for start, end in bridged
        if end - start >= min_speech
    ]
    print(f"[VAD] 共 {len(finals)} 段")
    return finals
def cut_segments(wav_path: Path, segs: List[Dict]) -> List[Dict]:
    """Cut one wav file per segment into ``SEGMENTS_DIR`` using ffmpeg.

    Args:
        wav_path: Source audio file.
        segs: Segments with ``"start"`` and ``"end"`` in seconds.

    Returns:
        One dict per written clip with keys ``idx``, ``start``, ``end`` and
        ``wav_path``. Segments whose end is not after their start are skipped,
        but ``idx`` still reflects the position in ``segs``.
    """
    SEGMENTS_DIR.mkdir(parents=True, exist_ok=True)
    produced: List[Dict] = []
    for number, span in enumerate(segs):
        begin, finish = span["start"], span["end"]
        if finish <= begin:
            continue
        clip_path = SEGMENTS_DIR / f"{number:04d}.wav"
        run_cmd([
            "ffmpeg", "-y",
            "-i", str(wav_path),
            "-ss", f"{begin:.3f}",
            "-to", f"{finish:.3f}",
            "-acodec", "copy",
            str(clip_path),
        ])
        produced.append(
            {"idx": number, "start": begin, "end": finish, "wav_path": clip_path}
        )
        print(f"[CUT] [{number:04d}] {format_time_s(begin)}~{format_time_s(finish)} -> {clip_path}")
    return produced
# =============== 声纹提取 ===============
def get_sv_pipeline():
    """Return the speaker-verification pipeline, creating it on first call.

    The pipeline is stored in the module-level ``_SV_PIPELINE`` so that later
    calls reuse the same object.
    """
    global _SV_PIPELINE
    if _SV_PIPELINE is not None:
        return _SV_PIPELINE

    # Imported lazily so modelscope is only loaded when embeddings are needed.
    from modelscope.pipelines import pipeline

    print("[SV] 初始化声纹模型 speech_campplus_sv_zh-cn_16k-common ...")
    _SV_PIPELINE = pipeline(
        task="speaker-verification",
        model="damo/speech_campplus_sv_zh-cn_16k-common",
    )
    return _SV_PIPELINE
def extract_embeddings(
    timeline: List[Dict],
    min_dur: float = 0.8
) -> List[Optional[np.ndarray]]:
    """Compute an L2-normalised speaker embedding for each segment.

    Args:
        timeline: Segments with ``"start"``, ``"end"`` and ``"wav_path"``.
        min_dur: Segments shorter than this (seconds) get no embedding.

    Returns:
        A list aligned with ``timeline``. An entry is ``None`` when the segment
        was too short, the pipeline returned no embedding, or extraction
        failed (failures are logged, not raised).
    """
    sv_pipeline = get_sv_pipeline()
    vectors: List[Optional[np.ndarray]] = [None for _ in timeline]

    for pos, seg in enumerate(timeline):
        if seg["end"] - seg["start"] < min_dur:
            # Too short to embed; the caller fills these in from neighbours.
            continue
        wav_path = seg["wav_path"]
        try:
            result = sv_pipeline([str(wav_path)], output_emb=True)
            raw = result.get("embs", None)
            if raw is None:
                continue
            if isinstance(raw, (list, tuple)):
                if not raw:
                    continue
                raw = raw[0]
            vec = np.asarray(raw, dtype=np.float32)
            if vec.ndim >= 2:
                vec = vec[0]
            # Unit length, so clustering operates in cosine space.
            vectors[pos] = vec / (np.linalg.norm(vec) + 1e-8)
        except Exception as e:
            print(f"[SV][WARN] 提取失败 idx={pos}, file={wav_path}, err={e}")
    return vectors
# =============== 聚类 & 自动估计说话人数 ===============
def choose_k_by_silhouette(X: np.ndarray,
                           k_min: int,
                           k_max: int) -> Tuple[int, np.ndarray]:
    """Pick the cluster count in [k_min, k_max] with the best silhouette score.

    Each candidate K is clustered with average-linkage agglomerative
    clustering on cosine distance and scored with the cosine silhouette.

    Args:
        X: Embedding matrix with one row per sample.
        k_min: Smallest cluster count to consider (clamped to at least 1).
        k_max: Largest cluster count to consider.

    Returns:
        ``(best_k, labels)``. When no candidate can be scored (too few samples,
        an empty search range, or every attempt failing) the result is a
        single cluster: ``(1, zeros)``, so ``best_k`` always matches the number
        of clusters described by ``labels``.
    """
    n = X.shape[0]
    single_cluster = np.zeros(n, dtype=int)
    # Degenerate input: nothing to cluster.
    if n <= 1:
        return 1, single_cluster

    k_min = max(1, k_min)
    # The silhouette score is only defined for 2 <= k <= n - 1, so k == 1 and
    # k == n are never scored; trying k == n would only raise inside
    # silhouette_score.
    first_k = max(2, k_min)
    last_k = min(k_max, n - 1)

    best_k = 1
    best_score = -1.0
    best_labels = single_cluster

    if k_min == 1:
        # One cluster cannot be scored; it is only the fallback result.
        print("[K-SEARCH] k=1, skip silhouette, score=-1.0")

    for k in range(first_k, last_k + 1):
        print(f"[K-SEARCH] 尝试 k={k} ...")
        try:
            hac = AgglomerativeClustering(
                n_clusters=k,
                metric="cosine",
                linkage="average",
            )
            labels = hac.fit_predict(X)
            # Fewer than two distinct labels cannot be scored.
            if len(np.unique(labels)) < 2:
                print(f"[K-SEARCH] k={k}, 只有一个簇,跳过")
                continue
            score = silhouette_score(X, labels, metric="cosine")
            print(f"[K-SEARCH] k={k}, silhouette={score:.4f}")
        except Exception as e:
            # Best effort: a failing candidate is skipped, not fatal.
            print(f"[K-SEARCH][WARN] k={k} 失败: {e}")
            continue
        if score > best_score:
            best_score = score
            best_k = k
            best_labels = labels

    print(f"[K-SEARCH] 最优 K = {best_k}, best_silhouette = {best_score:.4f}")
    return best_k, best_labels
def cluster_speakers_global(
    embs: List[Optional[np.ndarray]],
    min_speakers: int = 1,
    max_speakers: int = 4,
) -> List[int]:
    """Assign a cluster id to every segment, estimating the speaker count.

    Only segments that have an embedding are clustered (the K search is
    delegated to ``choose_k_by_silhouette``). Segments without an embedding
    inherit the label of the nearest labelled segment before them, or after
    them if none precedes.

    Args:
        embs: Per-segment embeddings; ``None`` marks a missing embedding.
        min_speakers: Lower bound of the K search.
        max_speakers: Upper bound of the K search.

    Returns:
        One integer cluster id per entry of ``embs``; all zeros when no
        embedding is available at all.
    """
    total = len(embs)
    valid_positions = [pos for pos, vec in enumerate(embs) if vec is not None]
    if not valid_positions:
        # Nothing to cluster: put every segment in one cluster.
        return [0] * total

    X = np.stack([embs[pos] for pos in valid_positions], axis=0)
    n_valid = X.shape[0]
    print(f"[CLUSTER] 有有效 embedding 的句子数: n_valid={n_valid}")

    # Bound the search range by the number of clusterable segments.
    k_min = max(1, min_speakers)
    k_max = max(k_min, min(max_speakers, n_valid))
    best_k, base_labels = choose_k_by_silhouette(X, k_min, k_max)

    # Scatter the clustered labels back to their original positions.
    filled: List[Optional[int]] = [None] * total
    for pos, lab in zip(valid_positions, base_labels):
        filled[pos] = int(lab)

    # Forward fill: carry the most recent label onto unlabelled segments.
    carried = None
    for pos, lab in enumerate(filled):
        if lab is None:
            filled[pos] = carried
        else:
            carried = lab

    # Backward fill: covers unlabelled segments at the very beginning.
    upcoming = None
    for pos in range(total - 1, -1, -1):
        if filled[pos] is None:
            filled[pos] = upcoming
        else:
            upcoming = filled[pos]

    # Safety net: anything still unlabelled goes to cluster 0.
    return [0 if lab is None else int(lab) for lab in filled]
def merge_tiny_clusters_by_duration_and_center(
    cluster_ids: List[int],
    embs: List[Optional[np.ndarray]],
    timeline: List[Dict],
    min_cluster_dur: float = 1.0,
) -> List[int]:
    """Fold clusters with very little total speech into their nearest big cluster.

    Repeats until nothing changes:
      1. Sum the duration of every cluster.
      2. Compute each cluster's centre: the mean of its embeddings, rescaled
         to unit length.
      3. Take the first cluster (in ascending id order) whose total duration is
         below ``min_cluster_dur`` and relabel it as the cluster that has at
         least ``min_cluster_dur`` of speech and the highest dot-product
         similarity to it.

    Clusters without any embedding, or without an eligible target, are left
    untouched.

    Args:
        cluster_ids: Cluster id per segment (not modified).
        embs: Embedding per segment, ``None`` where missing.
        timeline: Segments with ``"start"`` and ``"end"`` in seconds.
        min_cluster_dur: Total duration (seconds) below which a cluster is "tiny".

    Returns:
        A new list of cluster ids after merging.
    """

    def _unit_centroids(current: List[int]) -> Dict[int, np.ndarray]:
        # Group the available embeddings by label, then average and normalise.
        grouped: Dict[int, List[np.ndarray]] = {}
        for lab, vec in zip(current, embs):
            if vec is not None:
                grouped.setdefault(lab, []).append(vec)
        centroids: Dict[int, np.ndarray] = {}
        for lab, vecs in grouped.items():
            mean_vec = np.stack(vecs, axis=0).mean(axis=0)
            centroids[lab] = mean_vec / (np.linalg.norm(mean_vec) + 1e-8)
        return centroids

    labels = list(cluster_ids)
    while True:
        ids = sorted(set(labels))

        # Total speech time per cluster (negative spans count as zero).
        totals: Dict[int, float] = dict.fromkeys(ids, 0.0)
        for lab, seg in zip(labels, timeline):
            totals[lab] += max(0.0, seg["end"] - seg["start"])

        centroids = _unit_centroids(labels)

        # Find the first tiny cluster that has somewhere to go.
        pending = None
        for cid in ids:
            if totals[cid] >= min_cluster_dur or cid not in centroids:
                continue
            best_target, best_sim = None, -1.0
            for other in ids:
                if other == cid or other not in centroids:
                    continue
                if totals[other] < min_cluster_dur:
                    continue
                sim = float(np.dot(centroids[cid], centroids[other]))
                if sim > best_sim:
                    best_sim, best_target = sim, other
            if best_target is not None:
                pending = (cid, best_target, best_sim)
                break

        if pending is None:
            return labels

        cid, best_target, best_sim = pending
        print(f"[POST-MERGE] merge tiny cluster {cid} -> {best_target}, "
              f"dur={totals[cid]:.2f}s, sim={best_sim:.3f}")
        # Relabel, then start over because durations and centres have changed.
        labels = [best_target if lab == cid else lab for lab in labels]
def build_speaker_name_map_in_order(cluster_ids: List[int]) -> Dict[int, str]:
    """Name clusters by order of first appearance.

    The first cluster id encountered becomes "陌生人1", the second distinct
    id "陌生人2", and so on.

    Args:
        cluster_ids: Cluster id per segment, in timeline order.

    Returns:
        Mapping from cluster id to display name.
    """
    # dict.fromkeys de-duplicates while preserving first-seen order.
    first_seen = dict.fromkeys(cluster_ids)
    return {
        cid: f"陌生人{rank}"
        for rank, cid in enumerate(first_seen, start=1)
    }
def calc_cluster_duration(cluster_ids: List[int], timeline: List[Dict]) -> Dict[int, float]:
    """Sum the speech duration (seconds) of each cluster.

    Used for reporting only; cluster ids are not renumbered. Segments whose
    end precedes their start contribute zero.

    Args:
        cluster_ids: Cluster id per segment.
        timeline: Segments with ``"start"`` and ``"end"`` in seconds.

    Returns:
        Mapping from cluster id to total duration, keyed in first-seen order.
    """
    totals: Dict[int, float] = {}
    for cid, seg in zip(cluster_ids, timeline):
        span = max(0.0, seg["end"] - seg["start"])
        if cid in totals:
            totals[cid] += span
        else:
            totals[cid] = 0.0 + span
    return totals
# =============== ASR ===============
def init_asr_model():
    """Build the FunASR model used for transcription.

    Uses ``paraformer-zh`` with the ``fsmn-vad`` VAD model and the ``ct-punc``
    punctuation model. The device is ``"cuda"`` only when the environment
    variable ``USE_CUDA`` equals ``"1"``; otherwise ``"cpu"``.
    """
    print("[ASR] 初始化 FunASR paraformer-zh ...")
    use_gpu = os.getenv("USE_CUDA", "0") == "1"
    return AutoModel(
        model="paraformer-zh",
        vad_model="fsmn-vad",
        punc_model="ct-punc",
        disable_update=True,  # skip the update check on every start
        device="cuda" if use_gpu else "cpu",
    )
def asr_segments(asr_model, timeline: List[Dict]) -> List[str]:
    """Transcribe every segment, returning one text per segment.

    Args:
        asr_model: Object exposing ``generate(input=..., batch_size_s=...)``.
        timeline: Segments carrying a ``"wav_path"`` entry.

    Returns:
        Texts aligned with ``timeline``. A segment yields ``""`` when the model
        returns nothing usable or raises (the error is logged, not raised).
    """

    def _transcribe(wav_path) -> str:
        try:
            output = asr_model.generate(input=str(wav_path), batch_size_s=60)
            if isinstance(output, list) and output:
                return output[0].get("text", "").strip()
            return ""
        except Exception as e:
            print(f"[ASR][WARN] 失败: {wav_path}, err={e}")
            return ""

    transcripts: List[str] = []
    for seg in timeline:
        wav_path = seg["wav_path"]
        print(f"[ASR] {wav_path}")
        transcripts.append(_transcribe(wav_path))
    return transcripts
# =============== 主流程 ===============
def main():
    """Run the full pipeline on ``INPUT_PATH`` and write the labelled transcript.

    Steps: resample, optional denoise, VAD, cut clips, speaker embeddings,
    clustering with automatic speaker-count estimation, tiny-cluster merge,
    ASR, and finally writing ``TRANSCRIPT_CLUSTER``.

    Raises:
        FileNotFoundError: If ``INPUT_PATH`` does not exist.
    """
    print("=== 简化版:句子级 + 声纹聚类 + ASR(自动估计说话人数) ===")
    print(f"[INFO] 输入: {INPUT_PATH}")
    if not INPUT_PATH.exists():
        raise FileNotFoundError(INPUT_PATH)

    # 1. Normalise to 16 kHz mono.
    ffmpeg_to_16k_mono(INPUT_PATH, INPUT_16K_MONO)

    # 2. Optional denoising (falls back to the resampled audio).
    working_wav = optional_denoise_with_clearvoice(INPUT_16K_MONO, ENHANCED_WAV)

    # 3. Sentence-level VAD.
    sentences = simple_vad_segments(working_wav)
    if not sentences:
        print("[ERROR] VAD 结果为空")
        return

    # 4. One wav clip per sentence.
    timeline = cut_segments(working_wav, sentences)
    if not timeline:
        print("[ERROR] 无可用片段")
        return

    # 5. Speaker embeddings.
    embs = extract_embeddings(timeline, min_dur=0.8)

    # 6. Cluster and estimate the number of speakers.
    cluster_ids = cluster_speakers_global(
        embs,
        min_speakers=MIN_SPEAKERS,
        max_speakers=MAX_SPEAKERS,
    )

    # 6.1 Merge clusters with too little total speech (noise clusters).
    cluster_ids = merge_tiny_clusters_by_duration_and_center(
        cluster_ids,
        embs,
        timeline,
        min_cluster_dur=1.0,  # tune per use case, e.g. 0.8 or 1.5
    )

    # 6.2 Report per-cluster durations (ids are not reordered).
    dur_map = calc_cluster_duration(cluster_ids, timeline)
    print(f"[CLUSTER] 原始簇 ID -> 总时长(秒): {dur_map}")
    print(f"[CLUSTER] 估计说话人数: {len(dur_map)}")

    # Display names follow the order in which clusters first appear.
    spk_name_map = build_speaker_name_map_in_order(cluster_ids)

    # 7. ASR.
    texts = asr_segments(init_asr_model(), timeline)

    # 8. Write the transcript, one line per segment.
    with open(TRANSCRIPT_CLUSTER, "w", encoding="utf-8") as f:
        for seg, cid, text in zip(timeline, cluster_ids, texts):
            name = spk_name_map.get(cid, f"陌生人{cid+1}")
            start_str = format_time_s(seg['start'])
            end_str = format_time_s(seg['end'])
            f.write(f"[{start_str}--{end_str}] {name}:{text}\n")
    print("[DONE] 输出:", TRANSCRIPT_CLUSTER)


if __name__ == "__main__":
    main()
五.🏁 总结
本文实现了一个完整的、可落地的企业级语音处理流水线:
✔ VAD 分句
✔ 声纹 embedding
✔ 自动估计说话人数
✔ 聚类 & 小簇合并
✔ ASR 转写 + 标点
✔ 输出带时间戳的角色对话文本
🤝 如果这篇内容对你有帮助
记得 点赞 + 收藏 + 评论,让更多人看到!