通用唤醒词识别模型 - Wav2Vec2

1 测试结果:

python 复制代码
# 前 6 个为合成的负样本,后 4 个为人声录制的正样本
# 测试结果表明:6 个负样本全部被正确拒绝,4 个正样本中有 2 个被识别;模型基本可用,但注册时所用的特征样本对识别效果影响较大

pool_sim: 0.9486, dtw_sim: 0.1332, similarity: 0.4594
False
pool_sim: 0.9341, dtw_sim: 0.1379, similarity: 0.4564
False
pool_sim: 0.9414, dtw_sim: 0.1686, similarity: 0.4777
False
pool_sim: 0.9615, dtw_sim: 0.2154, similarity: 0.5138
False
pool_sim: 0.9378, dtw_sim: 0.1925, similarity: 0.4906
False
pool_sim: 0.8796, dtw_sim: 0.1584, similarity: 0.4469
False
pool_sim: 0.9676, dtw_sim: 0.2625, similarity: 0.5446
False
pool_sim: 0.9780, dtw_sim: 1.0000, similarity: 0.9912
True
pool_sim: 0.9615, dtw_sim: 0.2212, similarity: 0.5173
False
pool_sim: 0.9862, dtw_sim: 0.5523, similarity: 0.7259
True

2 模型实现

python 复制代码
import json
import os

import librosa
import numpy as np
import torch
from fastdtw import fastdtw
from scipy.spatial.distance import euclidean
from torch import nn, Tensor
from transformers import Wav2Vec2Processor, Wav2Vec2Model


# 音频编码器
class AudioEncoder(nn.Module):
    """Turn raw audio into Wav2Vec2 frame features and a pooled vector.

    Args:
        path: Local directory or hub id of the pretrained checkpoint. When
            ``None`` the ``facebook/wav2vec2-large-960h-lv60-self`` checkpoint
            is loaded by name.
        sample_rate: Sampling rate used when loading audio files and passed
            to the processor.
        max_length: Maximum clip length in seconds; longer inputs are
            truncated by the processor.
    """

    def __init__(self,
                 path: str = None,
                 sample_rate: int = 16000,
                 max_length: int = 10):
        super().__init__()
        self.sample_rate = sample_rate
        self.max_length = max_length
        if path is None:  # load by hub id
            path = r"facebook/wav2vec2-large-960h-lv60-self"
        # Load the pretrained processor and model
        self.processor = Wav2Vec2Processor.from_pretrained(path)
        self.model = Wav2Vec2Model.from_pretrained(path)

    def forward(self,
                audios: list[str | np.ndarray | Tensor],
                return_type: str = "np") -> dict[str, np.ndarray | Tensor]:
        """Encode a batch of audio clips.

        Args:
            audios: Each item is a file path (loaded with librosa at
                ``self.sample_rate``), a waveform ndarray, or a waveform
                Tensor.
            return_type: ``"np"`` converts the outputs to ndarrays; any other
                value leaves them as Tensors.

        Returns:
            A dict with ``"last_hidden_state"`` (batch, seq_len, hidden) and
            ``"pooler_output"`` (batch, hidden), the mean over ``seq_len``.
        """
        processed_audios = []
        for audio in audios:
            if isinstance(audio, str):  # load the audio file
                waveform, _ = librosa.load(audio, sr=self.sample_rate)
            elif isinstance(audio, Tensor):
                # Hand the processor plain ndarrays only.
                waveform = audio.detach().cpu().numpy()
            else:
                waveform = audio
            processed_audios.append(waveform)
        # The module may have been moved with .to(device); inputs must follow.
        device = next(self.model.parameters()).device
        # Extract features
        with torch.no_grad():
            inputs = self.processor(
                processed_audios,  # decoded waveforms, not the raw arguments
                sampling_rate=self.sample_rate,
                return_tensors="pt",
                padding=True,
                max_length=self.sample_rate * self.max_length,
                truncation=True,
            )
            inputs = {key: value.to(device) for key, value in inputs.items()}
            outputs = self.model(**inputs)
            last_hidden_state = outputs["last_hidden_state"]  # (batch, seq_len, hidden)
            # Mean-pool the last layer over time as a global feature vector
            pooler_output = last_hidden_state.mean(dim=1)  # (batch, hidden)
            if return_type == "np":  # ndarray
                last_hidden_state = last_hidden_state.cpu().numpy()
                pooler_output = pooler_output.cpu().numpy()

        return {
            "last_hidden_state": last_hidden_state,
            "pooler_output": pooler_output,
        }


# 唤醒词模型
class WakeWordModel:
    def __init__(self, config: dict = None):
        self.root = os.path.dirname(os.path.abspath(__file__))  # 工作路径
        self.config_path = os.path.join(self.root, "config.json")
        self.config = config or self.load_config()  # 配置项
        self.device = torch.device(self.config["DEVICE"])
        # 音频编码器
        self.audio_encoder = AudioEncoder(
            self.config["AUDIO_ENCODER"],
            sample_rate=self.config["SAMPLE_RATE"],
            max_length=self.config["MAX_LENGTH"],
        ).to(self.device)
        self.audio_encoder.eval()  # 测试模式

    # 检测唤醒词
    def __call__(self,
                 name: str,
                 audio: str | np.ndarray) -> bool:
        info = self.config["WAKE_WORD_INFO"]
        if name not in info:
            raise ValueError(f"未注册的唤醒词:{name}")
        feature = np.load(info[name]["FEATURE"])
        pool_prototype, dtw_prototypes = feature["pool"], feature["dtw"]
        # 提取被检测样本的特征向量
        ret = self.extract_features([audio])
        pool_sim = self.cosine_similarity(ret["pooler_output"][0], pool_prototype)
        # 计算 DTW 距离
        dtw_feat = ret["last_hidden_state"][0]  # (seq_len, 1024)
        min_dtw_dist, prot_len = float("inf"), 0
        for dtw_prototype in dtw_prototypes:  # 逐样本计算对比
            # 使用欧氏距离作为点距离
            distance, _ = fastdtw(dtw_feat, dtw_prototype, dist=euclidean)
            if distance < min_dtw_dist:
                min_dtw_dist = distance
                prot_len = len(dtw_prototype)  # (seq_len,)
        # 计算平均帧距离
        avg_dist = min_dtw_dist / max(len(dtw_feat), prot_len)
        dtw_sim = 1.0 / (1.0 + avg_dist)
        # 加权相似度
        similarity = 0.4 * pool_sim + 0.6 * dtw_sim
        print("pool_sim: {:.4f}, dtw_sim: {:.4f}, similarity: {:.4f}".format(
            pool_sim, dtw_sim, similarity
        ))

        return similarity >= self.config["THRESHOLD"]

    # 更新唤醒词
    def update_wake_word(self,
                         name: str,
                         samples: list[str]):
        min_samples, max_samples = self.config["MIN_SAMPLES"], self.config["MAX_SAMPLES"]
        if len(samples) < min_samples:  # 限制样本数量
            raise ValueError(f"注册唤醒词至少需要 {min_samples} 个音频样本")
        samples = samples[:max_samples]
        # 提取样本的特征向量
        ret = self.extract_features(samples)
        # 计算特征向量的均值
        pool_prototype = np.mean(ret["pooler_output"], axis=0)  # (1024,)
        dtw_prototypes = ret["last_hidden_state"]  # (batch, seq_len, 1024)
        # 更新配置项
        feature_path = os.path.join(self.root, "feature", f"{name}.npz")
        self.config["WAKE_WORD_INFO"][name] = {
            "SAMPLE": samples,
            "FEATURE": feature_path,
        }
        with open(self.config_path, "w+", encoding="utf-8") as file:
            json.dump(self.config, file, ensure_ascii=False)
        # 保存特征文件
        np.savez(feature_path, pool=pool_prototype, dtw=dtw_prototypes)

    # 提取音频的特征向量
    def extract_features(self,
                         audios: list[str | np.ndarray],
                         top_db: float = 40.0) -> dict[str, np.ndarray]:
        processed_audios = []
        for audio in audios:
            if isinstance(audio, str):  # 加载音频文件
                waveform, _ = librosa.load(audio, sr=self.config["SAMPLE_RATE"])
            else:
                waveform = audio
            intervals = librosa.effects.split(waveform, top_db=top_db)
            waveform_trimmed = []  # 消除静音后的音频
            for start, end in intervals:
                waveform_trimmed.extend(waveform[start:end])
            processed_audios.append(np.array(waveform_trimmed))

        return self.audio_encoder(processed_audios)

    # 加载配置项
    def load_config(self) -> dict:
        if not os.path.exists(self.config_path):
            raise FileNotFoundError(f"配置文件不存在:{self.config_path}")
        with open(self.config_path, "r", encoding="utf-8") as f:
            config = json.load(f)
        # 校验
        required_keys = ["DEVICE", "AUDIO_ENCODER", "SAMPLE_RATE",
                         "MAX_LENGTH", "THRESHOLD", "MIN_SAMPLES",
                         "MAX_SAMPLES", "FEATURE_DIM", "WAKE_WORD_INFO"]
        for key in required_keys:
            if key not in config:
                raise ValueError(f"配置文件中缺少必要的配置项:{key}")

        return config

    # 计算两个向量的余弦相似度
    @staticmethod
    def cosine_similarity(vec1, vec2):
        vec1 = vec1 / np.linalg.norm(vec1)
        vec2 = vec2 / np.linalg.norm(vec2)

        return np.dot(vec1, vec2)

3 训练及验证:

python 复制代码
if __name__ == '__main__':
    # Demo: register one wake word from seven recordings, then run the
    # detector on ten test clips and print each decision.
    wake_word = "你好坤坤"
    sample_dir = r"D:\Project\Transformer\wake_word\sample\你好坤坤"
    test_prefix = r"D:\Project\Transformer\wake_word\audio\test"

    detector = WakeWordModel()

    # Registration samples are numbered 1.mp3 .. 7.mp3 inside sample_dir.
    sample_paths = []
    for idx in range(1, 8):
        sample_paths.append(sample_dir + "\\" + f"{idx}.mp3")
    detector.update_wake_word(wake_word, sample_paths)

    # Test clips are named test1.mp3 .. test10.mp3.
    for idx in range(1, 11):
        clip_path = f"{test_prefix}{idx}.mp3"
        matched = detector(wake_word, clip_path)
        print(matched)

4 训练结果:

同 1 测试结果

相关推荐
AI袋鼠帝41 分钟前
Codex终于进手机了!
人工智能
Lee川1 小时前
从零解剖一个 AI Agent Tool是如何实现的
前端·人工智能·后端
一个王同学2 小时前
从零到一 | CV转多模态大模型 | week09 | Minillava Refactor结合手搓和llava源码深入理解多模态大模型原理
人工智能·深度学习·机器学习·计算机视觉·改行学it
2601_957787582 小时前
全场景矩阵系统多端统一体验与跨端实时同步技术实践
大数据·人工智能·矩阵·多端统一·跨端同步
liudanzhengxi2 小时前
AI提示词极限赛:突破边界的艺术
人工智能
ZhengEnCi2 小时前
09-斯坦福CS336作业 📝
人工智能
MY_TEUCK2 小时前
【2026最新Python+AI学习基础】Python 入门笔记篇
笔记·python·学习
闭关修炼啊哈2 小时前
[IdeaLoop · 灵感回路] AI时代独立开发者·创业/副业灵感日报 · 2026-05-17
人工智能·远程工作·创业·副业
赢乐3 小时前
大模型学习笔记:检索增强生成(RAG)架构
人工智能·python·深度学习·机器学习·智能体·幻觉·检索增强生成(rag)
飞哥数智坊3 小时前
OPC 需要的不是一个个AI工具,而是一支数字团队
人工智能