python_检测音频人声片段
注意:本方法通过短时能量阈值(0.05)判断人声,对声源有一定要求;对于音量偏低或背景噪声较大的音频,识别结果可能不符合预期。
python
# 使用此指令前,请确保安装必要的Python库,例如使用以下命令安装:
# pip install librosa numpy
import librosa
import numpy as np
from typing import *
try:
from xbot.app.logging import trace as print
except:
from xbot import print
def detect_voice_segments(audio_path: str, *, threshold: float = 0.05, min_voice_dur: float = 0.1) -> List[List[int]]:
    """
    title: Detect voice segments in audio
    description: Detect the start and end times of all voice segments in an audio file. A short-time energy (RMS) threshold marks active frames, segments that are too short are dropped as noise, and the result is a list of time ranges in microseconds.
    inputs:
        - audio_path (file): Path to the audio file, eg: "audio.mp3"
    outputs:
        - voice_segments (list): List of voice segments, each element is [start microseconds, end microseconds], eg: "[[1000000, 3500000], [5000000, 8200000]]"
    """
    # NOTE(review): the docstring above looks like a title/description/inputs/
    # outputs metadata layout, presumably consumed by the host platform, so its
    # declared inputs are left as a single `audio_path`. Confirm before listing
    # the optional tuning parameters there.
    #
    # Optional keyword-only tuning parameters (defaults match the previously
    # hard-coded values):
    #   threshold     -- RMS energy level; frames at or below it count as silence.
    #   min_voice_dur -- minimum segment length in seconds; shorter segments
    #                    are discarded as noise.
    #
    # Raises Exception (with the original error as its cause) if anything goes
    # wrong while loading or analysing the audio.

    def _seconds_to_microseconds(seconds):
        """Convert seconds to a whole number of microseconds (1 s = 1000000 us)."""
        return int(round(float(seconds) * 1000000))

    try:
        # 1. Load the audio samples and their sample rate.
        y, sr = librosa.load(audio_path, sr=None)
        total_seconds = len(y) / sr

        # 2. Short-time energy (RMS) per frame, plus each frame's timestamp.
        frame_length = 2048
        hop_length = 512
        energy = librosa.feature.rms(y=y, frame_length=frame_length, hop_length=hop_length).flatten()
        times = librosa.frames_to_time(np.arange(len(energy)), sr=sr, hop_length=hop_length)

        # 3. A frame counts as voice when its energy exceeds the threshold.
        is_voice = energy > threshold

        # 4. Find runs of consecutive voiced frames. Padding the mask with
        #    False on both sides guarantees every run has a rising (+1) and a
        #    falling (-1) edge, including runs touching the first/last frame.
        padded = np.concatenate(([False], is_voice, [False])).astype(np.int8)
        edges = np.diff(padded)
        start_frames = np.flatnonzero(edges == 1)
        # A falling edge sits one position past the last voiced frame.
        end_frames = np.flatnonzero(edges == -1) - 1

        # 5. Convert frame indices to seconds. The end is extended by one hop
        #    to compensate for the last frame's time offset, then clamped so it
        #    never exceeds the total audio duration.
        start_times = times[start_frames]
        end_times = np.minimum(times[end_frames] + hop_length / sr, total_seconds)

        # 6. Keep only segments that are long enough, expressed in microseconds.
        return [
            [_seconds_to_microseconds(seg_start), _seconds_to_microseconds(seg_end)]
            for seg_start, seg_end in zip(start_times, end_times)
            if (seg_end - seg_start) >= min_voice_dur
        ]
    except Exception as e:
        # Same exception type and message as before; chain the cause so the
        # original traceback stays attached.
        raise Exception(f"处理音频文件时出错:{str(e)}") from e