Silero STT(语音识别)模型自定义微调

代码文件:

finetune_and_save.py

python 复制代码
import os
import torch
import torch.nn as nn
import pandas as pd
import numpy as np
import soundfile as sf
from torch.utils.data import DataLoader, Dataset

# Device handed to torch.hub.load when the model is loaded; pinned to CPU here.
DEVICE = torch.device("cpu")
class SileroFinetuneDataset(Dataset):
    """Dataset of (waveform, label-index sequence) pairs listed in a CSV index.

    The CSV has no header and two columns: audio file path and transcript.
    Rows with a missing value are dropped.
    """

    def __init__(self, csv_file, labels):
        """Load the CSV index and build the character-to-index table.

        If ``csv_file`` does not exist, a one-row demo index is written to
        that path first.

        Args:
            csv_file: Path of the header-less ``path,text`` CSV index.
            labels: Iterable of label symbols; each symbol is mapped to its
                position in the iteration order.
        """
        if not os.path.exists(csv_file):
            print(f"创建演示索引文件: {csv_file}")
            demo_rows = [["data/test1.wav", "welcome to the ai speech recognition demo"]]
            pd.DataFrame(demo_rows).to_csv(csv_file, index=False, header=False)

        index_frame = pd.read_csv(csv_file, header=None, names=['path', 'text'])
        self.df = index_frame.dropna()

        self.char_to_idx = {}
        for position, symbol in enumerate(labels):
            self.char_to_idx[symbol] = position

    def __len__(self):
        """Return the number of usable rows in the index."""
        return len(self.df)

    def __getitem__(self, idx):
        """Return ``(waveform, target)`` for the row at position ``idx``.

        The waveform is a float tensor read with ``soundfile``; audio with
        more than one dimension is averaged over axis 1. The sample rate
        reported by the reader is not used. If reading fails for any reason,
        the error is printed and a tensor of 16000 zeros is returned instead.
        The target holds the label index of every lower-cased transcript
        character that appears in the label table; other characters are
        skipped.
        """
        row = self.df.iloc[idx]
        path = row['path'].strip()
        text = str(row['text']).lower()

        try:
            speech, sample_rate = sf.read(path)
            if speech.ndim > 1:
                # Collapse multi-channel audio to a single channel.
                speech = np.mean(speech, axis=1)
            waveform = torch.from_numpy(speech).float()
        except Exception as e:
            print(f"读取 {path} 出错: {e}")
            waveform = torch.zeros(16000)

        indices = []
        for symbol in text:
            if symbol in self.char_to_idx:
                indices.append(self.char_to_idx[symbol])
        target = torch.tensor(indices, dtype=torch.long)
        return waveform, target

def collate_fn(batch):
    """Collate ``(waveform, target)`` samples into zero-padded batch tensors.

    Args:
        batch: Sequence of ``(waveform, target)`` tensor pairs.

    Returns:
        A tuple ``(waveforms_padded, targets_padded, input_lengths,
        target_lengths)``. The padded tensors are batch-first and padded
        with zeros; the length tensors hold each sample's original size
        along dimension 0 as ``torch.long``.
    """
    pad = torch.nn.utils.rnn.pad_sequence
    waveforms, targets = zip(*batch)

    def lengths_of(sequences):
        # Size of every sequence along its first dimension, before padding.
        return torch.tensor([seq.size(0) for seq in sequences], dtype=torch.long)

    input_lengths = lengths_of(waveforms)
    target_lengths = lengths_of(targets)
    return (
        pad(waveforms, batch_first=True),
        pad(targets, batch_first=True),
        input_lengths,
        target_lengths,
    )

def run_finetuning():
    """Fine-tune the pretrained Silero English STT model with CTC loss.

    Loads ``silero_stt`` through ``torch.hub.load`` onto ``DEVICE``, builds a
    ``SileroFinetuneDataset`` from ``metadata.csv``, trains all parameters
    with AdamW for a fixed number of epochs, prints the mean loss per epoch,
    and finally writes the model ``state_dict`` to
    ``silero_stt_finetuned.pt``.

    A batch whose loss is not connected to the autograd graph is reported
    with a warning and skipped; it is never "repaired" by forcing
    ``requires_grad`` on the model output, because that would let
    ``backward()`` succeed without any gradient reaching the parameters.
    """
    LANG = 'en'
    LR = 1e-6
    EPOCHS = 10
    SAVE_PATH = "silero_stt_finetuned.pt"
    print(f"📦 正在加载 Silero {LANG} 预训练模型 (CPU 模式)...")
    # The hub entry point also returns a decoder and utilities; only the
    # model is needed for training.
    model, _decoder, _utils = torch.hub.load(repo_or_dir='snakers4/silero-models',
                                             model='silero_stt',
                                             language=LANG,
                                             device=DEVICE)

    # Train mode and gradient tracking do not change between batches, so they
    # are enabled once up front rather than on every iteration.
    model.train()
    torch.set_grad_enabled(True)
    for param in model.parameters():
        param.requires_grad = True
    dataset = SileroFinetuneDataset('metadata.csv', model.labels)
    dataloader = DataLoader(dataset, batch_size=1, collate_fn=collate_fn, shuffle=True)
    optimizer = torch.optim.AdamW(model.parameters(), lr=LR)
    # NOTE(review): nn.CTCLoss expects log-probabilities; confirm that the
    # model output is already log-softmaxed.
    criterion = nn.CTCLoss(blank=0, reduction='mean', zero_infinity=True)

    print("🏗️ 开始微调任务...")
    for epoch in range(EPOCHS):
        epoch_loss = 0.0
        updated_batches = 0
        for waveforms, targets, _input_lengths, target_lengths in dataloader:
            optimizer.zero_grad()
            log_probs = model(waveforms)
            # CTCLoss takes time-major input, so swap the first two dimensions.
            log_probs_trans = log_probs.transpose(0, 1)
            # Every sample is given the full output length. With batch_size=1
            # there is no padding; a larger batch size would need per-sample
            # lengths derived from the unpadded input lengths.
            output_lengths = torch.full(size=(waveforms.size(0),),
                                        fill_value=log_probs_trans.size(0),
                                        dtype=torch.long)
            loss = criterion(log_probs_trans, targets, output_lengths, target_lengths)
            if not loss.requires_grad:
                # The model output is detached from the parameters, so a
                # backward pass could not update any weight.
                print("⚠️ 警告:当前 Batch 无法计算梯度,请检查模型是否被锁定。")
                continue
            loss.backward()      # compute gradients
            optimizer.step()     # update the weights
            epoch_loss += loss.item()
            updated_batches += 1

        # Average over the batches that actually contributed; report NaN
        # instead of dividing by zero when none did (e.g. empty dataset).
        mean_loss = epoch_loss / updated_batches if updated_batches else float("nan")
        print(f"🔹 轮次 {epoch+1}/{EPOCHS} | 平均误差: {mean_loss:.6f}")
        # Optional early stop: break out of the epoch loop once the mean loss
        # falls below a chosen threshold (e.g. 0.05) to limit overfitting.

    torch.save(model.state_dict(), SAVE_PATH)
    print(f"\n✅ 权重已生成并保存至: {SAVE_PATH}")

if __name__ == "__main__":
    # Create the "data" directory if it is missing, then start fine-tuning.
    os.makedirs("data", exist_ok=True)
    run_finetuning()

目录结构:

复制代码
-- finetune_and_save.py
-- metadata.csv
-- silero_stt_finetuned.pt
-- data
---- data/test1.wav
---- data/test2.wav
---- data/test3.wav
相关推荐
潜创微科技--高清音视频芯片方案开发1 天前
2026年HUB方案商选型指南:主流厂商核心优势与不同应用场景适配方案深度分析
音视频·硬件工程
月诸清酒1 天前
26-260410 AI 科技日报 (阿里开源视频模型HappyHorse登顶,马斯克疑似泄露Claude参数)
人工智能·开源·音视频
jedi-knight1 天前
AGI时代下的青年教师与学术民主化
人工智能·python·agi
迷藏4941 天前
**eBPF实战进阶:从零构建网络流量监控与过滤系统**在现代云原生架构中,**网络可观测性**和**安全隔离**已成为
java·网络·python·云原生·架构
迷藏4941 天前
**发散创新:基于Solid协议的Web3.0去中心化身份认证系统实战解析**在Web3.
java·python·web3·去中心化·区块链
weixin_156241575761 天前
基于YOLOv8深度学习花卉识别系统摄像头实时图片文件夹多图片等另有其他的识别系统可二开
大数据·人工智能·python·深度学习·yolo
AI_Claude_code1 天前
ZLibrary访问困境方案三:Web代理与轻量级转发服务的搭建与优化
爬虫·python·web安全·搜索引擎·网络安全·web3·httpx
小陈工1 天前
2026年4月7日技术资讯洞察:下一代数据库融合、AI基础设施竞赛与异步编程实战
开发语言·前端·数据库·人工智能·python
时空无限1 天前
ansible 由于不同主机 python 版本不同执行报错
python·ansible
ZhengEnCi1 天前
P2E-Python字典操作完全指南-从增删改查到遍历嵌套的Python编程利器
python