1 代码解析
先看完整代码:
python
from funasr import AutoModel
import numpy as np
import soundfile as sf
import torch
import librosa
def load_audio(
    path: str,
    sr: int = 16000,
    start: float = 0.0,
    duration: float | None = None,
) -> tuple[np.ndarray, int]:
    """Read a slice of an audio file as a mono signal at the requested rate.

    Args:
        path: Audio file to read.
        sr: Target sample rate of the returned signal.
        start: Offset of the slice from the beginning of the file, in seconds.
        duration: Length of the slice in seconds; ``None`` reads from
            ``start`` to the end of the file.

    Returns:
        ``(audio, sr)``: ``audio`` is a 1-D array read as float32, and ``sr``
        is the target sample rate that was passed in.
    """
    # The caller works in seconds, soundfile works in frames: convert using
    # the file's own sample rate, which may differ from the target ``sr``.
    info = sf.info(path)
    start_frame = int(start * info.samplerate)
    # soundfile expects an int frame count; a negative count means "read the
    # rest of the file", so use -1 (not None) when no duration is given.
    frames = -1 if duration is None else int(duration * info.samplerate)
    audio, orig_sr = sf.read(
        path,
        start=start_frame,
        frames=frames,
        dtype="float32",
        always_2d=False,
    )
    # A 2-D result means several channels; average over axis 1 to get mono.
    if audio.ndim > 1:
        audio = np.mean(audio, axis=1)
    # Resample only when the file's rate differs from the target rate.
    if orig_sr != sr:
        audio = librosa.resample(
            y=audio,
            orig_sr=orig_sr,
            target_sr=sr,
        )
    return audio, sr
def main():
    """Simulate streaming recognition of one audio file with a sliding window.

    Loads the model once, then walks through the file in steps of
    ``chunk_size`` seconds. At each step the most recent ``window_size``
    seconds (or less, near the start) are decoded, with the previous
    recognised text passed along as ``prev_text``. Every non-empty result is
    printed.
    """
    model_dir = "/data/H2413325/code_dir_V2/model/Fun-ASR-Nano-2512"
    model = AutoModel(
        model=model_dir,
        trust_remote_code=True,
        remote_code="./model.py",
        device="cuda",
        disable_update=True,
    )
    wav_path = "/data/H2413325/code_dir_V2/model/Fun-ASR-Nano-2512/example/zh.mp3"
    chunk_size = 0.72   # step between two decodes, in seconds
    window_size = 6.0   # maximum audio context per decode, in seconds
    duration = sf.info(wav_path).duration
    # Right edges of the successive windows. The stop value is extended by
    # one chunk so the tail of the file that is shorter than a chunk is still
    # covered; the last timestamp may therefore lie past the end of the file.
    timestamps = np.arange(chunk_size, duration + chunk_size, chunk_size)
    prev_text = ""
    for t in timestamps:
        start = max(0.0, t - window_size)
        end = t
        audio, _ = load_audio(
            wav_path,
            sr=16000,
            start=start,
            # Round to milliseconds to drop float noise from the subtraction.
            duration=round(end - start, 3),
        )
        result = model.inference(
            [torch.tensor(audio)],
            prev_text=prev_text,
        )
        text = result[0]["text"]
        if text:
            print(text)
            # NOTE(review): the flattened source does not show whether this
            # assignment belongs inside the ``if``; kept inside so an empty
            # result does not wipe the previous context. Confirm.
            prev_text = text
# Run the demo only when this file is executed as a script, not on import.
if __name__ == "__main__":
    main()
1.1 读取音频
代码逐句解释:
python
# Length of one audio chunk, in seconds.
# At a 16 kHz sample rate, 0.72 s = 0.72 * 16000 = 11520 samples.
chunk_size = 0.72
# Total length of the audio file in seconds (a float).
# Example: a file lasting 1 min 23 s gives duration = 83.0.
duration = sf.info(wav_path).duration
# np.arange(start, stop, step) builds an evenly spaced array that begins at
# start and ends before stop.
#
# Example with duration = 3.0 and chunk_size = 0.72:
#   timestamps = [0.72, 1.44, 2.16, 2.88, 3.60]
#
# Note that the last value, 3.60, lies past duration. The stop value is
# duration + chunk_size on purpose: when duration is not a multiple of
# chunk_size, stopping at duration would leave the final, shorter-than-a-chunk
# piece of audio unprocessed.
timestamps = np.arange(chunk_size, duration + chunk_size, chunk_size)
1.2 时间轴滑窗
python
# Continuing the example above: timestamps = [0.72, 1.44, 2.16, 2.88, 3.60]
window_size = 6.0  # sliding-window length, in seconds
for t in timestamps:
    # Use t as the right edge of the window and look back at most 6 seconds.
    # Windows produced on a longer file:
    #   [0, 0.72], [0, 1.44], ..., [0, 3.6], ..., [0.48, 6.48], ..., [4.8, 10.8]
    start = max(0.0, t - window_size)
    end = t
    # Cut this window out of the audio file. load_audio is explained line by
    # line in section 1.3; the returned audio is a numpy.ndarray.
    audio, _ = load_audio(
        wav_path,
        sr=16000,  # target sample rate: 16 kHz
        start=start,
        duration=round(end - start, 3),  # rounded to milliseconds
    )
1.3 load_audio函数
这个函数在干嘛?
从音频文件中按起始时间和时长截取一段 -> 转成单声道 -> 重采样 -> 返回给模型用
python
# Function signature
def load_audio(
    path: str,                      # path of the audio file
    sr: int = 16000,                # target sample rate (what the model is fed)
    start: float = 0.0,             # start time, in seconds
    duration: float | None = None,  # length to read in seconds; None = to the end
):
    # Read the file's metadata.
    info = sf.info(path)
    # Convert seconds into a frame (sample) index:
    # start_frame = start time in seconds * the file's original sample rate
    start_frame = int(start * info.samplerate)
    # Number of frames to read. soundfile expects an int here and treats a
    # negative count as "read the rest of the file", so use -1 rather than
    # None when no duration is given.
    frames = -1 if duration is None else int(duration * info.samplerate)
    # Actually read the audio data.
    audio, orig_sr = sf.read(
        path,
        start=start_frame,   # frame index to start reading from
        frames=frames,       # how many frames to read
        dtype="float32",     # sample format of the returned array
        always_2d=False,     # do not force a 2-D result; a 2-D result is handled below
    )
    # Convert to mono: a 2-D result is averaged over axis 1.
    if audio.ndim > 1:
        audio = np.mean(audio, axis=1)
    # Resample only when the file's rate differs from the target rate.
    if orig_sr != sr:
        audio = librosa.resample(
            y=audio,
            orig_sr=orig_sr,
            target_sr=sr,
        )
    # audio is returned as a one-dimensional (mono) numpy.ndarray, together
    # with the target sample rate.
    return audio, sr