音频数据增强:提升音频信号质量的多种技术

在音频处理和机器学习领域,音频数据增强是一种常用的技术,旨在通过对原始音频信号进行各种变换和处理,生成更多样化的训练数据。

这不仅可以提高模型的鲁棒性,还能改善其在真实世界应用中的表现。本文将介绍几种常用的音频数据增强技术,包括时间拉伸、音高变换、带通滤波、均衡器、冲激响应处理、添加回声与延迟、非线性模拟等。

1. 时间拉伸

时间拉伸是一种改变音频信号播放速度而不改变其音高的技术。通过随机选择一个拉伸因子(例如在 0.8 到 1.2 之间),我们可以使音频信号变得更快或更慢。这种处理可以帮助模型适应不同的说话速度或音乐节奏。(注:下方示例代码为便于演示,使用了固定的拉伸因子 1.5,实际增强时可在上述区间内随机取值。)

python
import numpy as np
import matplotlib.pyplot as plt
import librosa
import librosa.display
def time_stretch(audio_data, stretch_factor=1.5):
    """Time-stretch an audio signal without changing its pitch.

    Parameters:
        audio_data: 1-D numpy array of mono audio samples.
        stretch_factor: playback-rate factor; > 1 speeds the audio up
            (shorter output), < 1 slows it down. Defaults to 1.5, the
            value that was previously hard-coded. For data augmentation,
            pass a value drawn from e.g. [0.8, 1.2].

    Returns:
        numpy array with the stretched audio.
    """
    stretched_audio = librosa.effects.time_stretch(audio_data, rate=stretch_factor)
    return stretched_audio
def plot_signals_and_spectra(original_audio, enhanced_audio, sr):
    """Show a 2x2 figure: waveform and log-frequency spectrogram for the
    original (top row) and time-stretched (bottom row) signals."""
    panels = [
        (original_audio, 'Original Audio Signal', 'Original Audio Spectrum'),
        (enhanced_audio, 'Stretched Audio Signal', 'Stretched Audio Spectrum'),
    ]
    plt.figure(figsize=(12, 8))
    for row, (signal, wave_title, spec_title) in enumerate(panels):
        # Left column: time-domain waveform.
        plt.subplot(2, 2, 2 * row + 1)
        librosa.display.waveshow(signal, sr=sr, alpha=0.5)
        plt.title(wave_title)
        plt.xlabel('Time (s)')
        plt.ylabel('Amplitude')
        # Right column: STFT magnitude in dB, log frequency axis.
        plt.subplot(2, 2, 2 * row + 2)
        spec_db = librosa.amplitude_to_db(np.abs(librosa.stft(signal)), ref=np.max)
        librosa.display.specshow(spec_db, sr=sr, x_axis='time', y_axis='log', cmap='coolwarm')
        plt.title(spec_title)
        plt.colorbar(format='%+2.0f dB')
    plt.tight_layout()
    plt.show()
if __name__ == "__main__":
    # Demo: load a speech clip, stretch it, and visualize both versions.
    audio_path = '/Volumes/T9/DATA/构建数据集/SELE/near_voice/WenetSpeech_CN_16k_0005458.wav'
    # sr=None keeps the file's native sample rate (no resampling to 22050 Hz).
    original_audio, sr = librosa.load(audio_path, sr=None)
    stretched_audio = time_stretch(original_audio)
    plot_signals_and_spectra(original_audio, stretched_audio, sr)

2. 音高变换

音高变换是指在不改变音频信号播放速度的情况下,调整其音高。通过随机选择音高变换的步数(例如 -2、-1、1、2),我们可以模拟不同的音调变化。这对于音乐和语音信号的处理尤为重要。(注:下方示例代码为便于演示,使用了固定的 6 个半音。)

python
import librosa
import librosa.display
import matplotlib.pyplot as plt
import numpy as np
def pitch_shift(audio_data, sr, n_steps=6):
    """Shift the pitch of an audio signal without changing its duration.

    Parameters:
        audio_data: 1-D numpy array of mono audio samples.
        sr: sample rate of the audio in Hz.
        n_steps: number of semitones to shift (positive = up, negative =
            down). Defaults to 6, the value previously hard-coded; for
            augmentation, pass a value drawn from e.g. {-2, -1, 1, 2}.

    Returns:
        numpy array with the pitch-shifted audio.
    """
    shifted_audio = librosa.effects.pitch_shift(audio_data, sr=sr, n_steps=n_steps)
    return shifted_audio
def plot_signals_and_spectra(original_audio, enhanced_audio, sr):
    """Show a 2x2 figure: waveform and log-frequency spectrogram for the
    original (top row) and pitch-shifted (bottom row) signals."""
    panels = [
        (original_audio, 'Original Audio Signal', 'Original Audio Spectrum'),
        (enhanced_audio, 'Pitch Shifted Audio Signal', 'Pitch Shifted Audio Spectrum'),
    ]
    plt.figure(figsize=(12, 8))
    for row, (signal, wave_title, spec_title) in enumerate(panels):
        # Left column: time-domain waveform.
        plt.subplot(2, 2, 2 * row + 1)
        librosa.display.waveshow(signal, sr=sr, alpha=0.5)
        plt.title(wave_title)
        plt.xlabel('Time (s)')
        plt.ylabel('Amplitude')
        # Right column: STFT magnitude in dB, log frequency axis.
        plt.subplot(2, 2, 2 * row + 2)
        spec_db = librosa.amplitude_to_db(np.abs(librosa.stft(signal)), ref=np.max)
        librosa.display.specshow(spec_db, sr=sr, x_axis='time', y_axis='log', cmap='coolwarm')
        plt.title(spec_title)
        plt.colorbar(format='%+2.0f dB')
    plt.tight_layout()
    plt.show()
if __name__ == "__main__":
    # Demo: load a speech clip, shift its pitch, and visualize both versions.
    audio_path = '/Volumes/T9/DATA/构建数据集/SELE/near_voice/WenetSpeech_CN_16k_0005458.wav'
    # sr=None keeps the file's native sample rate (no resampling to 22050 Hz).
    original_audio, sr = librosa.load(audio_path, sr=None)
    shifted_audio = pitch_shift(original_audio, sr)
    plot_signals_and_spectra(original_audio, shifted_audio, sr)

3. 带通滤波

带通滤波是一种常用的信号处理技术,用于保留特定频率范围内的信号,同时抑制其他频率。通过随机选择低频和高频截止频率,我们可以模拟不同的环境和设备特性。

python
import numpy as np
import matplotlib.pyplot as plt
import librosa
import librosa.display
import random
from scipy.signal import butter, lfilter
def butter_bandpass(lowcut, highcut, fs, order=5):
    """Design a digital Butterworth band-pass filter.

    Parameters:
        lowcut: low cutoff frequency in Hz; must be > 0.
        highcut: high cutoff frequency in Hz; must be < fs / 2 (Nyquist).
        fs: sample rate in Hz.
        order: filter order per band edge (default 5).

    Returns:
        (b, a): numerator/denominator coefficient arrays for lfilter.

    Raises:
        ValueError: if the cutoffs fall outside (0, Nyquist) — scipy's
            butter() would otherwise produce an invalid/unstable design
            instead of failing loudly (matches the equalizer section's
            version of this helper).
    """
    nyq = 0.5 * fs
    if lowcut <= 0 or highcut >= nyq:
        raise ValueError("Lowcut and highcut must be in the range (0, Nyquist frequency).")
    normal_lowcut = lowcut / nyq
    normal_highcut = highcut / nyq
    b, a = butter(order, [normal_lowcut, normal_highcut], btype='band', analog=False)
    return b, a
def bandpass_filter_audio(audio_data, sr, lowcut, highcut):
    """Band-pass filter an audio signal, keeping roughly [lowcut, highcut] Hz.

    Parameters:
        audio_data: 1-D numpy array of audio samples.
        sr: sample rate in Hz.
        lowcut, highcut: band edges in Hz.

    Returns:
        The filtered signal (same length as the input).
    """
    coeff_b, coeff_a = butter_bandpass(lowcut, highcut, sr)
    return lfilter(coeff_b, coeff_a, audio_data)
def plot_signals_and_spectra(original_audio, filtered_audio, sr):
    """Show a 2x2 figure: waveform and log-frequency spectrogram for the
    original (top row) and band-pass-filtered (bottom row) signals."""
    panels = [
        (original_audio, 'Original Audio Signal', 'Original Audio Spectrum'),
        (filtered_audio, 'Filtered Audio Signal', 'Filtered Audio Spectrum'),
    ]
    plt.figure(figsize=(12, 8))
    for row, (signal, wave_title, spec_title) in enumerate(panels):
        # Left column: time-domain waveform.
        plt.subplot(2, 2, 2 * row + 1)
        librosa.display.waveshow(signal, sr=sr, alpha=0.5)
        plt.title(wave_title)
        plt.xlabel('Time (s)')
        plt.ylabel('Amplitude')
        # Right column: STFT magnitude in dB, log frequency axis.
        plt.subplot(2, 2, 2 * row + 2)
        spec_db = librosa.amplitude_to_db(np.abs(librosa.stft(signal)), ref=np.max)
        librosa.display.specshow(spec_db, sr=sr, x_axis='time', y_axis='log', cmap='coolwarm')
        plt.title(spec_title)
        plt.colorbar(format='%+2.0f dB')
    plt.tight_layout()
    plt.show()
if __name__ == "__main__":
    # Demo: keep only the 300-4000 Hz band (roughly telephone bandwidth).
    audio_path = '/Volumes/T9/DATA/构建数据集/SELE/near_voice/WenetSpeech_CN_16k_0005458.wav'
    # sr=None keeps the file's native sample rate (no resampling to 22050 Hz).
    original_audio, sr = librosa.load(audio_path, sr=None)
    filtered_audio = bandpass_filter_audio(original_audio, sr, lowcut=300, highcut=4000)
    plot_signals_and_spectra(original_audio, filtered_audio, sr)

4. 均衡器

均衡器用于调整音频信号中不同频段的增益。通过定义多个频段及其增益,我们可以增强或削弱特定频率范围的音频信号,从而改善音质。

定义了多个频段,每个频段都有一个低截止频率(lowcut)、高截止频率(highcut)和增益(gain)。

频段的划分应确保下界(lowcut)不太靠近 20 Hz,上界(highcut)不太靠近采样频率的一半(fs/2),以避免滤波器设计中的不稳定性。

python
import numpy as np
import matplotlib.pyplot as plt
import librosa
import librosa.display
from scipy.signal import butter, lfilter
import random
def butter_bandpass(lowcut, highcut, fs, order=5):
    """Design a digital Butterworth band-pass filter.

    Returns the (b, a) coefficient arrays for lfilter.

    Raises:
        ValueError: if the cutoffs fall outside (0, Nyquist), where the
            filter design would be invalid.
    """
    nyquist = 0.5 * fs
    if lowcut <= 0 or highcut >= nyquist:
        raise ValueError("Lowcut and highcut must be in the range (0, Nyquist frequency).")
    # Normalize the band edges to the [0, 1] range scipy expects.
    critical_freqs = [lowcut / nyquist, highcut / nyquist]
    return butter(order, critical_freqs, btype='band', analog=False)
def equalizer(audio_data, sr, bands=None):
    """Apply a simple multi-band equalizer by summing gain-weighted
    band-pass-filtered copies of the signal.

    Parameters:
        audio_data: 1-D numpy array of audio samples in [-1, 1].
        sr: sample rate in Hz.
        bands: optional iterable of (lowcut_hz, highcut_hz, gain) tuples.
            Defaults to a 6-band table whose top edge (7995 Hz) assumes
            16 kHz audio; pass your own table for other sample rates,
            keeping every highcut below sr / 2.

    Returns:
        The equalized signal, hard-clipped to [-1, 1].

    Raises:
        ValueError: if the input contains NaN/Inf, or a band edge is
            outside (0, Nyquist) (raised by butter_bandpass).
    """
    if bands is None:
        bands = [
            (60, 200, 0.8),
            (200, 500, 1.5),
            (500, 1000, 1.2),
            (1000, 2000, 1),
            (2000, 4000, 1.5),
            (4000, 7995, 0.5),
        ]
    # Validate before doing any filtering work.
    if np.any(np.isnan(audio_data)) or np.any(np.isinf(audio_data)):
        raise ValueError("Input audio data contains NaN or Inf values.")
    output = np.zeros_like(audio_data)
    for lowcut, highcut, gain in bands:
        b, a = butter_bandpass(lowcut, highcut, sr)
        filtered = lfilter(b, a, audio_data)
        output += filtered * gain
    output = np.clip(output, -1.0, 1.0)  # keep the sum within valid range
    return output
def plot_signals_and_spectra(original_audio, equalized_audio, sr):
    """Show a 2x2 figure: waveform and log-frequency spectrogram for the
    original (top row) and equalized (bottom row) signals."""
    panels = [
        (original_audio, 'Original Audio Signal', 'Original Audio Spectrum'),
        (equalized_audio, 'Equalized Audio Signal', 'Equalized Audio Spectrum'),
    ]
    plt.figure(figsize=(12, 8))
    for row, (signal, wave_title, spec_title) in enumerate(panels):
        # Left column: time-domain waveform.
        plt.subplot(2, 2, 2 * row + 1)
        librosa.display.waveshow(signal, sr=sr, alpha=0.5)
        plt.title(wave_title)
        plt.xlabel('Time (s)')
        plt.ylabel('Amplitude')
        # Right column: STFT magnitude in dB, log frequency axis.
        plt.subplot(2, 2, 2 * row + 2)
        spec_db = librosa.amplitude_to_db(np.abs(librosa.stft(signal)), ref=np.max)
        librosa.display.specshow(spec_db, sr=sr, x_axis='time', y_axis='log', cmap='coolwarm')
        plt.title(spec_title)
        plt.colorbar(format='%+2.0f dB')
    plt.tight_layout()
    plt.show()
if __name__ == "__main__":
    # Demo: equalize a 16 kHz speech clip and visualize the result.
    audio_path = '/Volumes/T9/DATA/构建数据集/SELE/near_voice/WenetSpeech_CN_16k_0005458.wav'
    # sr=None keeps the file's native sample rate (no resampling to 22050 Hz).
    original_audio, sr = librosa.load(audio_path, sr=None)
    equalized_audio = equalizer(original_audio, sr)
    plot_signals_and_spectra(original_audio, equalized_audio, sr)

5. 冲激响应处理与添加回声、延迟

通过冲激响应处理,我们可以模拟房间或设备的声学特性。结合归一化、填充和延迟处理,我们可以生成具有特定声学特性的音频信号。以下是相关函数的实现:

python
import numpy as np
import matplotlib.pyplot as plt
import librosa
import librosa.display
from scipy.signal import fftconvolve, correlate
def apply_impulse_response(audio, ir):
    """Convolve audio with an impulse response, trimming the tail so the
    output has the same length as the input.

    Multi-channel inputs (2-D arrays) are reduced to their first channel.
    """
    # Keep only the first channel of any multi-channel input.
    audio = audio[0] if audio.ndim >= 2 else audio
    ir = ir[0] if ir.ndim >= 2 else ir
    # FFT-based convolution; drop the reverberant tail beyond len(audio).
    wet = fftconvolve(audio, ir, mode='full')
    return wet[:len(audio)]
def normalize_audio(audio):
    """Peak-normalize audio so its maximum absolute value is 1.0.

    Returns the input unchanged when it is all zeros: the previous
    implementation divided by zero there, producing NaNs/warnings.
    """
    peak = np.max(np.abs(audio))
    if peak == 0:
        return audio
    return audio / peak
def pad_audio(original_audio, processed_audio, pad_samples):
    """Zero-pad both signals so the processed one is shifted later by
    pad_samples relative to the original.

    Positive pad_samples: zeros are appended to the original and prepended
    to the processed signal; negative values do the mirror operation.
    Returns the (original_padded, processed_padded) pair.
    """
    if pad_samples > 0:
        original_pad, processed_pad = (0, pad_samples), (pad_samples, 0)
    else:
        original_pad, processed_pad = (-pad_samples, 0), (0, -pad_samples)
    padded_original = np.pad(original_audio, original_pad, mode='constant')
    padded_processed = np.pad(processed_audio, processed_pad, mode='constant')
    return padded_original, padded_processed
def calculate_delay(original_audio, processed_audio):
    """Estimate the lag (in samples) of processed_audio relative to
    original_audio from the peak of their full cross-correlation.

    A positive result means the processed signal lags the original.
    """
    xcorr = correlate(processed_audio, original_audio)
    # The zero-lag position of the full correlation is len(processed) - 1.
    delay_samples = np.argmax(xcorr) - (len(processed_audio) - 1)
    return delay_samples
def add_reverb_with_delay(audio_data, rir_data, delay_samples, sr=16000):
    """Convolve audio with a room impulse response and align the result so
    its delay relative to the dry signal equals delay_samples.

    Parameters:
        audio_data: 1-D numpy array of dry audio samples.
        rir_data: room impulse response samples.
        delay_samples: desired total delay, in samples.
        sr: sample rate in Hz, used only to report the final delay in
            milliseconds (previously hard-coded to 16 kHz via `/ 16`).

    Returns:
        (original_padded, processed_padded): both signals zero-padded so
        the reverberant signal lags the dry one by delay_samples.
    """
    rir_data = normalize_audio(rir_data)
    processed_audio = apply_impulse_response(audio_data, rir_data)
    processed_audio = normalize_audio(processed_audio)

    # Measure the delay the convolution itself introduced, then pad so the
    # total delay matches the requested delay_samples.
    ori_delay = calculate_delay(audio_data, processed_audio)
    pad_samples = delay_samples - ori_delay
    original_audio_padded, processed_audio_padded = pad_audio(audio_data, processed_audio, pad_samples)
    final_delay = calculate_delay(original_audio_padded, processed_audio_padded)
    # samples -> ms: divide by samples-per-millisecond (sr / 1000).
    print(f"Final delay: {final_delay / (sr / 1000):.2f} ms")
    return original_audio_padded, processed_audio_padded
def plot_signals_and_spectra(original_audio, processed_audio, sr):
    """Show a 2x2 figure: waveform and log-frequency spectrogram for the
    dry (top row) and reverberant (bottom row) signals."""
    panels = [
        (original_audio, 'Original Audio Signal', 'Original Audio Spectrum'),
        (processed_audio, 'Processed Audio Signal with Reverb', 'Processed Audio Spectrum'),
    ]
    plt.figure(figsize=(12, 8))
    for row, (signal, wave_title, spec_title) in enumerate(panels):
        # Left column: time-domain waveform.
        plt.subplot(2, 2, 2 * row + 1)
        librosa.display.waveshow(signal, sr=sr, alpha=0.5)
        plt.title(wave_title)
        plt.xlabel('Time (s)')
        plt.ylabel('Amplitude')
        # Right column: STFT magnitude in dB, log frequency axis.
        plt.subplot(2, 2, 2 * row + 2)
        spec_db = librosa.amplitude_to_db(np.abs(librosa.stft(signal)), ref=np.max)
        librosa.display.specshow(spec_db, sr=sr, x_axis='time', y_axis='log', cmap='coolwarm')
        plt.title(spec_title)
        plt.colorbar(format='%+2.0f dB')
    plt.tight_layout()
    plt.show()
if __name__ == "__main__":
    # Demo: convolve a speech clip with a large-room impulse response and
    # align the reverberant output to a fixed delay.
    audio_path = '/Volumes/T9/DATA/构建数据集/SELE/near_voice/WenetSpeech_CN_16k_0005458.wav'
    rir_path = '/Volumes/T9/DATA/构建数据集/SELE/rir/large/SLR28_large_Room001-00001.wav'
    # sr=None keeps each file's native sample rate (no resampling).
    original_audio, sr = librosa.load(audio_path, sr=None)
    rir_data, _ = librosa.load(rir_path, sr=None)
    delay_samples = 8000  # target delay in samples (0.5 s at 16 kHz)
    original_audio,processed_audio = add_reverb_with_delay(original_audio, rir_data, delay_samples)
    plot_signals_and_spectra(original_audio, processed_audio, sr)

6. 非线性模拟

非线性模拟用于模拟扬声器或其他设备的非线性特性。通过应用硬剪辑和非线性变换,我们可以生成更真实的音频信号。

python
import numpy as np
import matplotlib.pyplot as plt
import librosa
import librosa.display
def simulate_nonlinearity(audio_data, clip_max=0.3, rng=None):
    """Simulate loudspeaker-style nonlinear distortion.

    Hard-clips the signal at +/-clip_max, then multiplies it by a
    randomized sigmoid-shaped gain curve derived from a second-order
    polynomial of the clipped signal.

    Parameters:
        audio_data: 1-D numpy array of audio samples.
        clip_max: hard-clipping threshold (previously hard-coded 0.3).
        rng: optional numpy Generator (e.g. np.random.default_rng(seed))
            for reproducible augmentation; when None, the global
            np.random state is used, as before.

    Returns:
        The distorted signal; its magnitude is bounded by
        clip_max * 2 * max(gamma) < 0.6 * clip_max.
    """
    uniform = rng.uniform if rng is not None else np.random.uniform
    clipped_signal = np.clip(audio_data, -clip_max, clip_max)
    # Second-order polynomial pre-shaping of the clipped signal.
    b = 1.5 * clipped_signal - 0.3 * clipped_signal ** 2
    gamma = uniform(0.15, 0.3)
    # Sigmoid steepness range depends on whether the shaped signal ever
    # goes positive, as in the original implementation.
    if np.any(b > 0):
        a = uniform(0.05, 0.45)
    else:
        a = uniform(0.1, 0.4)
    nonlinear_signal = gamma * (2 / (1 + np.exp(-a * b)))
    return nonlinear_signal * clipped_signal
def plot_signals_and_spectra(original_audio, processed_audio, sr):
    """Show a 2x2 figure: waveform and log-frequency spectrogram for the
    clean (top row) and nonlinearly distorted (bottom row) signals."""
    panels = [
        (original_audio, 'Original Audio Signal', 'Original Audio Spectrum'),
        (processed_audio, 'Processed Audio Signal with Nonlinearity', 'Processed Audio Spectrum'),
    ]
    plt.figure(figsize=(12, 8))
    for row, (signal, wave_title, spec_title) in enumerate(panels):
        # Left column: time-domain waveform.
        plt.subplot(2, 2, 2 * row + 1)
        librosa.display.waveshow(signal, sr=sr, alpha=0.5)
        plt.title(wave_title)
        plt.xlabel('Time (s)')
        plt.ylabel('Amplitude')
        # Right column: STFT magnitude in dB, log frequency axis.
        plt.subplot(2, 2, 2 * row + 2)
        spec_db = librosa.amplitude_to_db(np.abs(librosa.stft(signal)), ref=np.max)
        librosa.display.specshow(spec_db, sr=sr, x_axis='time', y_axis='log', cmap='coolwarm')
        plt.title(spec_title)
        plt.colorbar(format='%+2.0f dB')
    plt.tight_layout()
    plt.show()
if __name__ == "__main__":
    # Demo: apply simulated nonlinear distortion to a speech clip.
    audio_path = '/Volumes/T9/DATA/构建数据集/SELE/near_voice/WenetSpeech_CN_16k_0005458.wav'
    # sr=None keeps the file's native sample rate (no resampling to 22050 Hz).
    original_audio, sr = librosa.load(audio_path, sr=None)
    processed_audio = simulate_nonlinearity(original_audio)
    plot_signals_and_spectra(original_audio, processed_audio, sr)
相关推荐
酒酿小圆子~12 分钟前
NLP中常见的分词算法(BPE、WordPiece、Unigram、SentencePiece)
人工智能·算法·自然语言处理
新加坡内哥谈技术1 小时前
Virgo:增强慢思考推理能力的多模态大语言模型
人工智能·语言模型·自然语言处理
martian6651 小时前
深入详解人工智能计算机视觉之图像生成与增强:生成对抗网络(GAN)
人工智能·计算机视觉
qq_273900232 小时前
pytorch torch.isclose函数介绍
人工智能·pytorch·python
说私域2 小时前
阿里巴巴新零售模式下的创新实践:结合开源AI智能名片2+1链动模式S2B2C商城小程序的应用探索
人工智能·开源·零售
致Great2 小时前
《你的RAG出错了?快来Get这份改进秘籍》
人工智能·llm·nlp
这我可不懂2 小时前
2025低代码与人工智能AI新篇
人工智能·低代码
XianxinMao2 小时前
企业通过私有安全端点访问大型语言模型的益处
人工智能·安全·语言模型
itwangyang5202 小时前
AIDD-人工智能药物设计-可扩展!更快!更便宜!大规模基因组数据存储新结构
人工智能
生信与遗传解读2 小时前
XGBoost算法在自定义数据集中预测疾病风险
人工智能·python·算法·数据分析