【Moonshine Onnx版本 语音识别】

## 安装环境

```bash
pip install onnxruntime numpy tokenizers librosa modelscope huggingface-hub
```

## 下载模型

huggingface

!huggingface-cli download UsefulSensors/moonshine --include 'onnx/base/*.onnx' --local-dir ./models/

下载tokenizer.json

!wget https://github.com/usefulsensors/moonshine/raw/main/moonshine/assets/tokenizer.json -P './models/onnx/base/'

modelscope

!modelscope download --model manyeyes/moonshine-base-en-onnx --local_dir ./models/

## 运行

Python 代码：
import os
import wave
import numpy as np
import tokenizers
import onnxruntime

class MoonshineOnnxModel:
    def __init__(self, models_dir):

        preprocess, encode, uncached_decode, cached_decode = [
            f"{models_dir}/{x}.onnx"
            for x in ["preprocess", "encode", "uncached_decode", "cached_decode"]
        ]
        self.preprocess = onnxruntime.InferenceSession(preprocess)
        self.encode = onnxruntime.InferenceSession(encode)
        self.uncached_decode = onnxruntime.InferenceSession(uncached_decode)
        self.cached_decode = onnxruntime.InferenceSession(cached_decode)
        self.tokenizer = tokenizers.Tokenizer.from_file(
            os.path.join(models_dir, "tokenizer.json")
        )
        print('Successfully Load Model.')

    def _generate(self, audio, max_len=None):
        "audio has to be a numpy array of shape [1, num_audio_samples]"
        if max_len is None:
            # max 6 tokens per second of audio
            max_len = int((audio.shape[-1] / 16_000) * 6)
        preprocessed = self.preprocess.run([], dict(args_0=audio))[0]
        seq_len = [preprocessed.shape[-2]]

        context = self.encode.run([], dict(args_0=preprocessed, args_1=seq_len))[0]
        inputs = [[1]]
        seq_len = [1]

        tokens = [1]
        logits, *cache = self.uncached_decode.run(
            [], dict(args_0=inputs, args_1=context, args_2=seq_len)
        )
        for i in range(max_len):
            next_token = logits.squeeze().argmax()
            tokens.extend([next_token])
            if next_token == 2:
                break

            seq_len[0] += 1
            inputs = [[next_token]]
            logits, *cache = self.cached_decode.run(
                [],
                dict(
                    args_0=inputs,
                    args_1=context,
                    args_2=seq_len,
                    **{f"args_{i+3}": x for i, x in enumerate(cache)},
                ),
            )
        return [tokens]

    def generate(self, audio_paths: list[str] | str, max_len=None):
        if isinstance(audio_paths, str):
            audio_paths = [audio_paths]

        audios = []
        for audio_path in audio_paths:
          with wave.open(audio_path) as f:
              params = f.getparams()
              assert (
                  params.nchannels == 1
                  and params.framerate == 16_000
                  and params.sampwidth == 2
              ), f"wave file should have 1 channel, 16KHz, and int16"
              audio = f.readframes(params.nframes)
          audio = np.frombuffer(audio, np.int16) / 32768.0
          audio = audio.astype(np.float32)[None, ...]
          audios.append(audio)

        audios = np.concatenate(audios, axis=0)
        tokens = self._generate(audios, max_len)
        texts = self.tokenizer.decode_batch(tokens)

        return texts


if __name__ == "__main__":
    # Demo run: load the base model from its local directory, transcribe one
    # sample recording and print the list of decoded strings.
    recognizer = MoonshineOnnxModel("models/onnx/base/")
    transcripts = recognizer.generate("beckett.wav")
    print(transcripts)
相关推荐
Blossom.11816 小时前
量子通信:从科幻走向现实的未来通信技术
人工智能·深度学习·目标检测·机器学习·计算机视觉·语音识别·量子计算
CV-杨帆2 天前
Paraformer分角色语音识别-中文-通用 FunASR
人工智能·语音识别
Blossom.1183 天前
人工智能在智能教育中的创新应用与未来趋势
java·人工智能·深度学习·目标检测·机器学习·计算机视觉·语音识别
漫游者Nova4 天前
麦克风和电脑内播放声音实时识别转文字软件FunASR整合包V5下载
语音识别·语音转文字·音频转录·实时语音识别·录音转文字
Blossom.1186 天前
人工智能在智能健康监测中的创新应用与未来趋势
java·人工智能·深度学习·机器学习·语音识别
kooboo china.8 天前
Tailwind CSS 实战,基于 Kooboo 构建 AI 对话框页面(四):语音识别输入功能
前端·css·人工智能·ui·html·交互·语音识别
木亦汐丫10 天前
【ASR】基于分块非自回归模型的流式端到端语音识别
语音识别·asr·端到端·流式·nar非自回归·分块注意力·mask-ctc
放羊郎10 天前
从零实现本地语音识别(FunASR)
人工智能·语音识别·asr·funasr·语音转文字
雾迟sec11 天前
机器学习中的 K-均值聚类算法及其优缺点
人工智能·深度学习·机器学习·语言模型·语音识别
limingade13 天前
手机打电话时由对方DTMF响应切换多级IVR语音菜单(话术脚本与实战)
android·智能手机·语音识别·蓝牙电话·多级ivr导航·手机个人400电话·手机电话实现ivr语音导航