在音频处理和机器学习领域,音频数据增强是一种常用的技术,旨在通过对原始音频信号进行各种变换和处理,生成更多样化的训练数据。
这不仅可以提高模型的鲁棒性,还能改善其在真实世界应用中的表现。本文将介绍几种常用的音频数据增强技术,包括时间拉伸、音高变换、带通滤波、均衡器、冲激响应处理、添加回声与延迟、非线性模拟等。
1. 时间拉伸
时间拉伸是一种改变音频信号播放速度而不改变其音高的技术。通过随机选择一个拉伸因子(例如在 0.8 到 1.2 之间),我们可以使音频信号变得更快或更慢。这种处理可以帮助模型适应不同的说话速度或音乐节奏。(注意:下文示例代码为便于演示,使用了固定拉伸因子 1.5;实际做数据增强时应按上述范围随机采样。)
python
import numpy as np
import matplotlib.pyplot as plt
import librosa
import librosa.display
def time_stretch(audio_data, stretch_factor=1.5):
    """Time-stretch an audio signal without changing its pitch.

    Parameters
    ----------
    audio_data : np.ndarray
        Mono audio samples.
    stretch_factor : float, optional
        Playback-rate factor: >1 speeds the signal up (shorter output),
        <1 slows it down. Defaults to 1.5 to preserve the original
        hard-coded behavior; for data augmentation a random value in
        e.g. [0.8, 1.2] is typical.

    Returns
    -------
    np.ndarray
        The time-stretched audio signal.
    """
    return librosa.effects.time_stretch(audio_data, rate=stretch_factor)
def plot_signals_and_spectra(original_audio, enhanced_audio, sr):
    """Plot waveform and log-frequency spectrogram (dB) for the original
    and the stretched signal in a 2x2 grid."""
    panels = [
        (original_audio, 'Original Audio Signal', 'Original Audio Spectrum'),
        (enhanced_audio, 'Stretched Audio Signal', 'Stretched Audio Spectrum'),
    ]
    plt.figure(figsize=(12, 8))
    for row, (signal, wave_title, spec_title) in enumerate(panels):
        # Left column: time-domain waveform.
        plt.subplot(2, 2, 2 * row + 1)
        librosa.display.waveshow(signal, sr=sr, alpha=0.5)
        plt.title(wave_title)
        plt.xlabel('Time (s)')
        plt.ylabel('Amplitude')
        # Right column: STFT magnitude in dB on a log frequency axis.
        plt.subplot(2, 2, 2 * row + 2)
        spec_db = librosa.amplitude_to_db(np.abs(librosa.stft(signal)), ref=np.max)
        librosa.display.specshow(spec_db, sr=sr, x_axis='time', y_axis='log', cmap='coolwarm')
        plt.title(spec_title)
        plt.colorbar(format='%+2.0f dB')
    plt.tight_layout()
    plt.show()
if __name__ == "__main__":
    # Demo: load a sample clip, time-stretch it, and compare the two signals.
    audio_path = '/Volumes/T9/DATA/构建数据集/SELE/near_voice/WenetSpeech_CN_16k_0005458.wav'
    signal, sample_rate = librosa.load(audio_path, sr=None)  # sr=None keeps the native rate
    plot_signals_and_spectra(signal, time_stretch(signal), sample_rate)
2. 音高变换
音高变换是指在不改变音频信号播放速度的情况下,调整其音高。通过随机选择音高变换的步数(例如 -2、-1、1、2),我们可以模拟不同的音调变化。这对于音乐和语音信号的处理尤为重要。(注意:下文示例代码为便于演示,使用了固定步数 6;实际做数据增强时应随机选取较小的步数。)
python
import librosa
import librosa.display
import matplotlib.pyplot as plt
import numpy as np
def pitch_shift(audio_data, sr, n_steps=6):
    """Shift the pitch of an audio signal without changing its duration.

    Parameters
    ----------
    audio_data : np.ndarray
        Mono audio samples.
    sr : int
        Sample rate of `audio_data` in Hz.
    n_steps : float, optional
        Number of semitones to shift (positive = up, negative = down).
        Defaults to 6 to preserve the original hard-coded behavior; for
        augmentation, small random steps such as -2..2 are typical.

    Returns
    -------
    np.ndarray
        The pitch-shifted audio signal.
    """
    return librosa.effects.pitch_shift(audio_data, sr=sr, n_steps=n_steps)
def plot_signals_and_spectra(original_audio, enhanced_audio, sr):
    """Plot waveform and log-frequency spectrogram (dB) for the original
    and the pitch-shifted signal in a 2x2 grid."""
    panels = [
        (original_audio, 'Original Audio Signal', 'Original Audio Spectrum'),
        (enhanced_audio, 'Pitch Shifted Audio Signal', 'Pitch Shifted Audio Spectrum'),
    ]
    plt.figure(figsize=(12, 8))
    for row, (signal, wave_title, spec_title) in enumerate(panels):
        # Left column: time-domain waveform.
        plt.subplot(2, 2, 2 * row + 1)
        librosa.display.waveshow(signal, sr=sr, alpha=0.5)
        plt.title(wave_title)
        plt.xlabel('Time (s)')
        plt.ylabel('Amplitude')
        # Right column: STFT magnitude in dB on a log frequency axis.
        plt.subplot(2, 2, 2 * row + 2)
        spec_db = librosa.amplitude_to_db(np.abs(librosa.stft(signal)), ref=np.max)
        librosa.display.specshow(spec_db, sr=sr, x_axis='time', y_axis='log', cmap='coolwarm')
        plt.title(spec_title)
        plt.colorbar(format='%+2.0f dB')
    plt.tight_layout()
    plt.show()
if __name__ == "__main__":
    # Demo: load a sample clip, pitch-shift it, and compare the two signals.
    audio_path = '/Volumes/T9/DATA/构建数据集/SELE/near_voice/WenetSpeech_CN_16k_0005458.wav'
    signal, sample_rate = librosa.load(audio_path, sr=None)  # sr=None keeps the native rate
    plot_signals_and_spectra(signal, pitch_shift(signal, sample_rate), sample_rate)
3. 带通滤波
带通滤波是一种常用的信号处理技术,用于保留特定频率范围内的信号,同时抑制其他频率。通过随机选择低频和高频截止频率,我们可以模拟不同的环境和设备特性。
python
import numpy as np
import matplotlib.pyplot as plt
import librosa
import librosa.display
import random
from scipy.signal import butter, lfilter
def butter_bandpass(lowcut, highcut, fs, order=5):
    """Design a digital Butterworth band-pass filter.

    Parameters
    ----------
    lowcut, highcut : float
        Pass-band edges in Hz; must satisfy 0 < lowcut < highcut < fs/2.
    fs : float
        Sampling rate in Hz.
    order : int, optional
        Filter order (default 5).

    Returns
    -------
    (b, a) : tuple of np.ndarray
        Transfer-function coefficients for use with scipy.signal.lfilter.

    Raises
    ------
    ValueError
        If the cutoffs fall outside (0, Nyquist) — such designs are
        invalid/unstable (same guard as the equalizer section's variant).
    """
    nyq = 0.5 * fs
    if not 0 < lowcut < highcut < nyq:
        raise ValueError("Lowcut and highcut must be in the range (0, Nyquist frequency).")
    # Normalize edges to [0, 1] of Nyquist as scipy.signal.butter expects.
    normal_lowcut = lowcut / nyq
    normal_highcut = highcut / nyq
    b, a = butter(order, [normal_lowcut, normal_highcut], btype='band', analog=False)
    return b, a
def bandpass_filter_audio(audio_data, sr, lowcut, highcut):
    """Band-pass filter `audio_data`, keeping only [lowcut, highcut] Hz."""
    coeffs = butter_bandpass(lowcut, highcut, sr)
    return lfilter(*coeffs, audio_data)
def plot_signals_and_spectra(original_audio, filtered_audio, sr):
    """Plot waveform and log-frequency spectrogram (dB) for the original
    and the band-pass-filtered signal in a 2x2 grid."""
    panels = [
        (original_audio, 'Original Audio Signal', 'Original Audio Spectrum'),
        (filtered_audio, 'Filtered Audio Signal', 'Filtered Audio Spectrum'),
    ]
    plt.figure(figsize=(12, 8))
    for row, (signal, wave_title, spec_title) in enumerate(panels):
        # Left column: time-domain waveform.
        plt.subplot(2, 2, 2 * row + 1)
        librosa.display.waveshow(signal, sr=sr, alpha=0.5)
        plt.title(wave_title)
        plt.xlabel('Time (s)')
        plt.ylabel('Amplitude')
        # Right column: STFT magnitude in dB on a log frequency axis.
        plt.subplot(2, 2, 2 * row + 2)
        spec_db = librosa.amplitude_to_db(np.abs(librosa.stft(signal)), ref=np.max)
        librosa.display.specshow(spec_db, sr=sr, x_axis='time', y_axis='log', cmap='coolwarm')
        plt.title(spec_title)
        plt.colorbar(format='%+2.0f dB')
    plt.tight_layout()
    plt.show()
if __name__ == "__main__":
    # Demo: band-pass a sample clip to the 300-4000 Hz speech band.
    audio_path = '/Volumes/T9/DATA/构建数据集/SELE/near_voice/WenetSpeech_CN_16k_0005458.wav'
    signal, sample_rate = librosa.load(audio_path, sr=None)  # sr=None keeps the native rate
    filtered = bandpass_filter_audio(signal, sample_rate, lowcut=300, highcut=4000)
    plot_signals_and_spectra(signal, filtered, sample_rate)
4. 均衡器
均衡器用于调整音频信号中不同频段的增益。通过定义多个频段及其增益,我们可以增强或削弱特定频率范围的音频信号,从而改善音质。
定义了多个频段,每个频段都有一个低截止频率(lowcut)、高截止频率(highcut)和增益(gain)。
频段的划分应确保下界(lowcut)不太靠近 20 Hz,上界(highcut)不太靠近采样频率的一半(fs/2),以避免滤波器设计中的不稳定性。
python
import numpy as np
import matplotlib.pyplot as plt
import librosa
import librosa.display
from scipy.signal import butter, lfilter
import random
def butter_bandpass(lowcut, highcut, fs, order=5):
    """Design Butterworth band-pass coefficients (b, a) for cutoffs in Hz.

    Raises ValueError when a cutoff falls outside (0, Nyquist), which
    would make the filter design unstable.
    """
    nyquist = 0.5 * fs
    if lowcut <= 0 or highcut >= nyquist:
        raise ValueError("Lowcut and highcut must be in the range (0, Nyquist frequency).")
    # Normalize the band edges to [0, 1] of Nyquist as scipy expects.
    band = [lowcut / nyquist, highcut / nyquist]
    return butter(order, band, btype='band', analog=False)
def equalizer(audio_data, sr, bands=None):
    """Apply a simple multi-band equalizer by summing gain-weighted
    band-pass-filtered copies of the input.

    Parameters
    ----------
    audio_data : np.ndarray
        Mono audio samples; must be free of NaN/Inf.
    sr : int
        Sample rate in Hz.
    bands : list of (lowcut, highcut, gain), optional
        Band plan in Hz with a linear gain per band. Defaults to the
        original six-band plan; edges should stay well inside
        (20 Hz, sr/2) so the filter design remains stable.

    Returns
    -------
    np.ndarray
        Equalized signal, clipped to [-1.0, 1.0].

    Raises
    ------
    ValueError
        If `audio_data` contains NaN or Inf values, or (via
        butter_bandpass) if a band edge is outside (0, Nyquist).
    """
    if bands is None:
        # Default plan: (lowcut_hz, highcut_hz, gain). Top edge sits just
        # below 8000 Hz, the Nyquist of the 16 kHz demo material.
        bands = [
            (60, 200, 0.8),
            (200, 500, 1.5),
            (500, 1000, 1.2),
            (1000, 2000, 1),
            (2000, 4000, 1.5),
            (4000, 7995, 0.5)
        ]
    if np.any(np.isnan(audio_data)) or np.any(np.isinf(audio_data)):
        raise ValueError("Input audio data contains NaN or Inf values.")
    output = np.zeros_like(audio_data)
    for lowcut, highcut, gain in bands:
        b, a = butter_bandpass(lowcut, highcut, sr)
        output += lfilter(b, a, audio_data) * gain
    # Limit the summed bands to a valid amplitude range.
    return np.clip(output, -1.0, 1.0)
def plot_signals_and_spectra(original_audio, equalized_audio, sr):
    """Plot waveform and log-frequency spectrogram (dB) for the original
    and the equalized signal in a 2x2 grid."""
    panels = [
        (original_audio, 'Original Audio Signal', 'Original Audio Spectrum'),
        (equalized_audio, 'Equalized Audio Signal', 'Equalized Audio Spectrum'),
    ]
    plt.figure(figsize=(12, 8))
    for row, (signal, wave_title, spec_title) in enumerate(panels):
        # Left column: time-domain waveform.
        plt.subplot(2, 2, 2 * row + 1)
        librosa.display.waveshow(signal, sr=sr, alpha=0.5)
        plt.title(wave_title)
        plt.xlabel('Time (s)')
        plt.ylabel('Amplitude')
        # Right column: STFT magnitude in dB on a log frequency axis.
        plt.subplot(2, 2, 2 * row + 2)
        spec_db = librosa.amplitude_to_db(np.abs(librosa.stft(signal)), ref=np.max)
        librosa.display.specshow(spec_db, sr=sr, x_axis='time', y_axis='log', cmap='coolwarm')
        plt.title(spec_title)
        plt.colorbar(format='%+2.0f dB')
    plt.tight_layout()
    plt.show()
if __name__ == "__main__":
    # Demo: run the six-band equalizer over a sample clip.
    audio_path = '/Volumes/T9/DATA/构建数据集/SELE/near_voice/WenetSpeech_CN_16k_0005458.wav'
    signal, sample_rate = librosa.load(audio_path, sr=None)  # sr=None keeps the native rate
    plot_signals_and_spectra(signal, equalizer(signal, sample_rate), sample_rate)
5. 冲激响应处理与添加回声、延迟
通过冲激响应处理,我们可以模拟房间或设备的声学特性。结合归一化、填充和延迟处理,我们可以生成具有特定声学特性的音频信号。以下是相关函数的实现:
python
import numpy as np
import matplotlib.pyplot as plt
import librosa
import librosa.display
from scipy.signal import fftconvolve, correlate
def apply_impulse_response(audio, ir):
    """Convolve `audio` with impulse response `ir`, trimming the result
    to the input length. Multi-channel inputs are reduced to channel 0."""
    audio = audio[0] if audio.ndim >= 2 else audio
    ir = ir[0] if ir.ndim >= 2 else ir
    # FFT-based convolution; keep only the first len(audio) samples so the
    # output length matches the input.
    full = fftconvolve(audio, ir, mode='full')
    return full[:len(audio)]
def normalize_audio(audio):
    """Scale `audio` so its peak absolute amplitude is 1.0.

    A silent (all-zero) input is returned unchanged instead of producing
    the NaN output a naive peak divide would (division by zero).
    """
    peak = np.max(np.abs(audio))
    if peak == 0:
        return audio
    return audio / peak
def pad_audio(original_audio, processed_audio, pad_samples):
    """Zero-pad both signals so the processed one is shifted `pad_samples`
    further right than the original (negative shifts the other way)."""
    if pad_samples > 0:
        # Delay the processed signal: leading zeros on it, trailing zeros
        # on the original so both grow by the same number of samples.
        padded_original = np.pad(original_audio, (0, pad_samples), mode='constant')
        padded_processed = np.pad(processed_audio, (pad_samples, 0), mode='constant')
    else:
        shift = -pad_samples
        padded_original = np.pad(original_audio, (shift, 0), mode='constant')
        padded_processed = np.pad(processed_audio, (0, shift), mode='constant')
    return padded_original, padded_processed
def calculate_delay(original_audio, processed_audio):
    """Estimate the lag (in samples) of `processed_audio` relative to
    `original_audio` from the peak of their full cross-correlation."""
    xcorr = correlate(processed_audio, original_audio)
    # In 'full' mode the zero-lag term sits at index len(processed) - 1.
    zero_lag_index = len(processed_audio) - 1
    return np.argmax(xcorr) - zero_lag_index
def add_reverb_with_delay(audio_data, rir_data, delay_samples, sr=16000):
    """Convolve audio with a room impulse response and align the result so
    the reverberant copy lags the dry signal by `delay_samples`.

    Parameters
    ----------
    audio_data : np.ndarray
        Dry mono audio.
    rir_data : np.ndarray
        Room impulse response (normalized internally).
    delay_samples : int
        Desired total lag, in samples, of the processed signal.
    sr : int, optional
        Sample rate in Hz, used only to report the final delay in
        milliseconds. Defaults to 16000, matching the original
        hard-coded `/ 16` samples-to-ms conversion.

    Returns
    -------
    (original_padded, processed_padded) : tuple of np.ndarray
        The zero-padded dry and reverberant signals.
    """
    rir_data = normalize_audio(rir_data)
    processed_audio = apply_impulse_response(audio_data, rir_data)
    processed_audio = normalize_audio(processed_audio)
    # Measure the delay the convolution itself introduced, then pad so
    # the total delay equals the requested delay_samples.
    inherent_delay = calculate_delay(audio_data, processed_audio)
    pad_samples = delay_samples - inherent_delay
    original_padded, processed_padded = pad_audio(audio_data, processed_audio, pad_samples)
    final_delay = calculate_delay(original_padded, processed_padded)
    print(f"Final delay: {final_delay / (sr / 1000):.2f} ms")
    return original_padded, processed_padded
def plot_signals_and_spectra(original_audio, processed_audio, sr):
    """Plot waveform and log-frequency spectrogram (dB) for the dry and
    reverberant signals in a 2x2 grid."""
    panels = [
        (original_audio, 'Original Audio Signal', 'Original Audio Spectrum'),
        (processed_audio, 'Processed Audio Signal with Reverb', 'Processed Audio Spectrum'),
    ]
    plt.figure(figsize=(12, 8))
    for row, (signal, wave_title, spec_title) in enumerate(panels):
        # Left column: time-domain waveform.
        plt.subplot(2, 2, 2 * row + 1)
        librosa.display.waveshow(signal, sr=sr, alpha=0.5)
        plt.title(wave_title)
        plt.xlabel('Time (s)')
        plt.ylabel('Amplitude')
        # Right column: STFT magnitude in dB on a log frequency axis.
        plt.subplot(2, 2, 2 * row + 2)
        spec_db = librosa.amplitude_to_db(np.abs(librosa.stft(signal)), ref=np.max)
        librosa.display.specshow(spec_db, sr=sr, x_axis='time', y_axis='log', cmap='coolwarm')
        plt.title(spec_title)
        plt.colorbar(format='%+2.0f dB')
    plt.tight_layout()
    plt.show()
if __name__ == "__main__":
    # Demo: convolve speech with a room impulse response at an 8000-sample lag.
    audio_path = '/Volumes/T9/DATA/构建数据集/SELE/near_voice/WenetSpeech_CN_16k_0005458.wav'
    rir_path = '/Volumes/T9/DATA/构建数据集/SELE/rir/large/SLR28_large_Room001-00001.wav'
    speech, sample_rate = librosa.load(audio_path, sr=None)  # sr=None keeps the native rate
    rir, _ = librosa.load(rir_path, sr=None)
    target_delay = 8000  # desired lag in samples
    dry, wet = add_reverb_with_delay(speech, rir, target_delay)
    plot_signals_and_spectra(dry, wet, sample_rate)
6. 非线性模拟
非线性模拟用于模拟扬声器或其他设备的非线性特性。通过应用硬剪辑和非线性变换,我们可以生成更真实的音频信号。
python
import numpy as np
import matplotlib.pyplot as plt
import librosa
import librosa.display
def simulate_nonlinearity(audio_data):
    """Simulate loudspeaker-style nonlinearity: hard clipping followed by
    a randomized sigmoid-shaped gain applied to the clipped signal."""
    clip_max = 0.3
    # Hard-clip the signal to +/- clip_max.
    clipped = np.clip(audio_data, -clip_max, clip_max)
    # Second-order polynomial shaping of the clipped signal.
    shaped = 1.5 * clipped - 0.3 * clipped ** 2
    # NOTE: the two uniform draws below must stay in this order so repeated
    # runs with a fixed RNG seed reproduce the same output.
    gamma = np.random.uniform(0.15, 0.3)
    # Sigmoid steepness range depends on whether any shaped sample is positive.
    if np.any(shaped > 0):
        steepness = np.random.uniform(0.05, 0.45)
    else:
        steepness = np.random.uniform(0.1, 0.4)
    gain = gamma * (2 / (1 + np.exp(-steepness * shaped)))
    return gain * clipped
def plot_signals_and_spectra(original_audio, processed_audio, sr):
    """Plot waveform and log-frequency spectrogram (dB) for the clean and
    nonlinearity-processed signals in a 2x2 grid."""
    panels = [
        (original_audio, 'Original Audio Signal', 'Original Audio Spectrum'),
        (processed_audio, 'Processed Audio Signal with Nonlinearity', 'Processed Audio Spectrum'),
    ]
    plt.figure(figsize=(12, 8))
    for row, (signal, wave_title, spec_title) in enumerate(panels):
        # Left column: time-domain waveform.
        plt.subplot(2, 2, 2 * row + 1)
        librosa.display.waveshow(signal, sr=sr, alpha=0.5)
        plt.title(wave_title)
        plt.xlabel('Time (s)')
        plt.ylabel('Amplitude')
        # Right column: STFT magnitude in dB on a log frequency axis.
        plt.subplot(2, 2, 2 * row + 2)
        spec_db = librosa.amplitude_to_db(np.abs(librosa.stft(signal)), ref=np.max)
        librosa.display.specshow(spec_db, sr=sr, x_axis='time', y_axis='log', cmap='coolwarm')
        plt.title(spec_title)
        plt.colorbar(format='%+2.0f dB')
    plt.tight_layout()
    plt.show()
if __name__ == "__main__":
    # Demo: apply the nonlinear distortion to a sample clip and compare.
    audio_path = '/Volumes/T9/DATA/构建数据集/SELE/near_voice/WenetSpeech_CN_16k_0005458.wav'
    signal, sample_rate = librosa.load(audio_path, sr=None)  # sr=None keeps the native rate
    plot_signals_and_spectra(signal, simulate_nonlinearity(signal), sample_rate)