## Environment Setup
```bash
pip install onnxruntime numpy tokenizers librosa modelscope huggingface-hub
```
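To confirm the runtime installed correctly, a quick check (the output below assumes a default CPU-only build):

```python
# Quick sanity check: a default CPU-only onnxruntime build should
# report at least "CPUExecutionProvider" here.
import onnxruntime

print(onnxruntime.__version__)
print(onnxruntime.get_available_providers())
```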
## Download the Model
From Hugging Face:

```bash
huggingface-cli download UsefulSensors/moonshine --include 'onnx/base/*.onnx' --local-dir ./models/
```
Download tokenizer.json (use the raw URL, not the GitHub blob page, or you will get HTML instead of the file):

```bash
wget https://raw.githubusercontent.com/usefulsensors/moonshine/main/moonshine/assets/tokenizer.json -P './models/onnx/base/'
```
From ModelScope:

```bash
modelscope download --model manyeyes/moonshine-base-en-onnx --local_dir ./models/
```
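Whichever route you use, the inference script below expects the four ONNX graphs and the tokenizer side by side in one directory. A quick check (the layout assumed here matches the Hugging Face download above; adjust `model_dir` if ModelScope places files differently):

```python
# Verify that all files the inference script needs are present.
import os

model_dir = "models/onnx/base"
expected = [
    "preprocess.onnx",
    "encode.onnx",
    "uncached_decode.onnx",
    "cached_decode.onnx",
    "tokenizer.json",
]
missing = [f for f in expected if not os.path.exists(os.path.join(model_dir, f))]
print("missing files:", missing or "none")
```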
## Run

```python
import os
import wave

import numpy as np
import tokenizers
import onnxruntime


class MoonshineOnnxModel:
    def __init__(self, models_dir):
        # Moonshine ships four ONNX graphs: an audio preprocessor, an encoder,
        # and two decoders (one for the first step, one that reuses the
        # key/value cache on subsequent steps).
        preprocess, encode, uncached_decode, cached_decode = [
            f"{models_dir}/{x}.onnx"
            for x in ["preprocess", "encode", "uncached_decode", "cached_decode"]
        ]
        self.preprocess = onnxruntime.InferenceSession(preprocess)
        self.encode = onnxruntime.InferenceSession(encode)
        self.uncached_decode = onnxruntime.InferenceSession(uncached_decode)
        self.cached_decode = onnxruntime.InferenceSession(cached_decode)
        self.tokenizer = tokenizers.Tokenizer.from_file(
            os.path.join(models_dir, "tokenizer.json")
        )
        print("Model loaded successfully.")
    def _generate(self, audio, max_len=None):
        "audio has to be a numpy array of shape [1, num_audio_samples]"
        if max_len is None:
            # max 6 tokens per second of audio
            max_len = int((audio.shape[-1] / 16_000) * 6)
        preprocessed = self.preprocess.run([], dict(args_0=audio))[0]
        seq_len = [preprocessed.shape[-2]]
        context = self.encode.run([], dict(args_0=preprocessed, args_1=seq_len))[0]
        inputs = [[1]]  # start-of-sequence token id
        seq_len = [1]
        tokens = [1]
        # First decode step: no key/value cache exists yet.
        logits, *cache = self.uncached_decode.run(
            [], dict(args_0=inputs, args_1=context, args_2=seq_len)
        )
        for i in range(max_len):
            # Greedy decoding: take the highest-scoring token.
            next_token = logits.squeeze().argmax()
            tokens.extend([next_token])
            if next_token == 2:  # end-of-sequence token id
                break
            seq_len[0] += 1
            inputs = [[next_token]]
            # Later steps feed the cached key/value tensors back in.
            logits, *cache = self.cached_decode.run(
                [],
                dict(
                    args_0=inputs,
                    args_1=context,
                    args_2=seq_len,
                    **{f"args_{i+3}": x for i, x in enumerate(cache)},
                ),
            )
        return [tokens]
    def generate(self, audio_paths: list[str] | str, max_len=None):
        if isinstance(audio_paths, str):
            audio_paths = [audio_paths]
        audios = []
        for audio_path in audio_paths:
            with wave.open(audio_path) as f:
                params = f.getparams()
                assert (
                    params.nchannels == 1
                    and params.framerate == 16_000
                    and params.sampwidth == 2
                ), "wave file should be 1-channel, 16 kHz, int16"
                audio = f.readframes(params.nframes)
                # Scale int16 PCM to float32 in [-1, 1).
                audio = np.frombuffer(audio, np.int16) / 32768.0
                audio = audio.astype(np.float32)[None, ...]
                audios.append(audio)
        audios = np.concatenate(audios, axis=0)
        tokens = self._generate(audios, max_len)
        texts = self.tokenizer.decode_batch(tokens)
        return texts
if __name__ == "__main__":
    model_dir = "models/onnx/base"
    client = MoonshineOnnxModel(model_dir)
    audio_path = "beckett.wav"
    text = client.generate(audio_path)
    print(text)
```
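`generate` returns a list with one transcription per input file. Note that the script only accepts mono 16 kHz int16 WAV input; for audio in other formats or sample rates, a small conversion step helps first. A minimal sketch using librosa (already in the pip install above); the file names are placeholders:

```python
# Convert arbitrary audio to the mono 16 kHz int16 WAV the script expects.
# "input.mp3" and "converted.wav" are placeholder file names.
import wave

import librosa
import numpy as np

samples, _ = librosa.load("input.mp3", sr=16_000, mono=True)  # float32 in [-1, 1]
pcm = (np.clip(samples, -1.0, 1.0) * 32767).astype(np.int16)

with wave.open("converted.wav", "wb") as f:
    f.setnchannels(1)       # mono
    f.setsampwidth(2)       # 16-bit samples
    f.setframerate(16_000)
    f.writeframes(pcm.tobytes())
```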