一、GMM说话人识别核心原理
1.1 为什么选择GMM?
GMM是说话人识别领域的经典方法,其优势在于:
- 概率建模:对声学特征分布进行柔性的概率描述
- 通用逼近性:足够多高斯分量可逼近任意复杂分布
- 区分性:每个说话人的声音特征对应独特的GMM参数集
1.2 系统架构与流程
原始语音 → 预处理 → 特征提取 → GMM训练 → 模型库 → 识别决策
(其中:预处理阶段进行端点检测;特征提取阶段计算MFCC/PLP;GMM训练阶段使用EM算法)
二、完整MATLAB实现代码
matlab
%% GMM-based speaker identification system (main script)
clear; close all; clc;
%% 1. Parameter settings
fs = 16000; % sampling rate (Hz)
frame_length = 0.025; % frame length: 25 ms
frame_shift = 0.01; % frame shift: 10 ms
num_mfcc = 13; % number of cepstral coefficients kept per frame
num_filters = 26; % number of mel filterbank channels
num_gaussians = 8; % Gaussian components per speaker GMM
max_iterations = 100; % maximum EM iterations
speakers = {'speaker1', 'speaker2', 'speaker3', 'speaker4'};
num_speakers = length(speakers);
%% 2. Data preparation and preprocessing
% NOTE(review): an earlier comment claimed "10 training / 5 test utterances
% per speaker", but generate_simulated_speech returns one training signal
% and five concatenated test signals — confirm the intended protocol.
train_features = cell(num_speakers, 1);
test_features = cell(num_speakers, 1);
for spk = 1:num_speakers
fprintf('处理说话人 %d/%d: %s\n', spk, num_speakers, speakers{spk});
% Simulated data generation (a real application would read audio files);
% each speaker id shifts the synthetic formant frequencies.
[train_data, test_data] = generate_simulated_speech(spk, fs);
% MFCC extraction: rows are frames, columns are 3*num_mfcc coefficients
% (static + delta + delta-delta).
train_features{spk} = extract_mfcc(train_data, fs, frame_length, ...
frame_shift, num_mfcc, num_filters);
test_features{spk} = extract_mfcc(test_data, fs, frame_length, ...
frame_shift, num_mfcc, num_filters);
end
%% 3. GMM model training (one model per enrolled speaker)
gmm_models = cell(num_speakers, 1);
for spk = 1:num_speakers
fprintf('训练说话人 %d 的GMM模型...\n', spk);
features = train_features{spk};
[N, dim] = size(features); % N is currently unused; dim is stored with the model
% 3.1 Initialize GMM parameters via K-means clustering.
[init_means, init_covs, init_weights] = initialize_gmm(features, num_gaussians);
% 3.2 Refine the parameters with the EM algorithm.
[means, covs, weights, log_likelihood] = train_gmm_em(features, ...
init_means, init_covs, init_weights, max_iterations);
% Store the trained model for the scoring stage.
gmm_models{spk} = struct('means', means, 'covs', covs, ...
'weights', weights, 'dim', dim);
% Visualize the training run: convergence curve + fitted mixture.
figure('Position', [100, 100, 800, 400]);
subplot(1,2,1);
plot(log_likelihood, 'LineWidth', 2);
title(sprintf('说话人%d: 对数似然度收敛曲线', spk));
xlabel('迭代次数'); ylabel('对数似然度'); grid on;
subplot(1,2,2);
% NOTE(review): visualize_gmm is not defined in this file — confirm it
% exists on the MATLAB path.
visualize_gmm(features, means, covs, weights);
title(sprintf('说话人%d: GMM分布', spk));
end
%% 4. Recognition test
confusion_matrix = zeros(num_speakers, num_speakers);
decision_scores = cell(num_speakers, 1);
for true_spk = 1:num_speakers
fprintf('测试说话人 %d 的识别性能...\n', true_spk);
test_data = test_features{true_spk};
% NOTE(review): each row of test_data is one MFCC frame, not a whole
% utterance, so the "utt" loop below makes frame-level decisions —
% confirm whether utterance-level scoring was intended.
num_test_utts = size(test_data, 1);
scores = zeros(num_test_utts, num_speakers);
for utt = 1:num_test_utts
feature_vector = test_data(utt, :);
% Score this feature vector against every speaker's GMM.
for model_spk = 1:num_speakers
model = gmm_models{model_spk};
scores(utt, model_spk) = compute_gmm_likelihood(...
feature_vector, model.means, model.covs, model.weights);
end
end
% Decision rule: pick the model with the highest likelihood.
[max_scores, decisions] = max(scores, [], 2);
% Accumulate the confusion matrix (rows = true speaker, cols = decision).
for dec = 1:length(decisions)
confusion_matrix(true_spk, decisions(dec)) = ...
confusion_matrix(true_spk, decisions(dec)) + 1;
end
decision_scores{true_spk} = scores;
end
%% 5. Performance evaluation and visualization
% 5.1 Overall accuracy = correct decisions / all decisions.
accuracy = trace(confusion_matrix) / sum(confusion_matrix(:));
fprintf('\n系统总体识别准确率: %.2f%%\n', accuracy * 100);
% 5.2 Confusion matrix heat map with per-cell counts.
figure('Position', [100, 100, 600, 500]);
imagesc(confusion_matrix);
colorbar; colormap(jet);
title('说话人识别混淆矩阵');
xlabel('识别结果'); ylabel('真实说话人');
set(gca, 'XTick', 1:num_speakers, 'YTick', 1:num_speakers);
for i = 1:num_speakers
for j = 1:num_speakers
text(j, i, num2str(confusion_matrix(i,j)), ...
'HorizontalAlignment', 'center', 'Color', 'white');
end
end
% 5.3 ROC / DET curves from the raw score matrices.
figure('Position', [100, 100, 900, 400]);
plot_roc_det_curves(decision_scores, num_speakers);
% 5.4 Feature-space scatter of the first two MFCC dimensions.
figure('Position', [100, 100, 1000, 400]);
subplot(1,2,1);
for spk = 1:num_speakers
features = train_features{spk};
scatter(features(:,1), features(:,2), 20, 'filled', 'DisplayName', speakers{spk});
hold on;
end
title('MFCC特征前两维分布');
xlabel('MFCC系数1'); ylabel('MFCC系数2');
legend('Location', 'best'); grid on;
subplot(1,2,2);
% Decision boundary over the first two feature dimensions.
% NOTE(review): visualize_decision_boundary is not defined in this file —
% confirm it exists on the MATLAB path.
visualize_decision_boundary(gmm_models, train_features, num_speakers);
title('GMM决策边界(前两维)');
%% 核心函数定义
function mfccs = extract_mfcc(audio, fs, frame_len, frame_shift, num_coeff, num_filters)
% Compute MFCC features with first- and second-order deltas.
%
% Inputs:
%   audio       - input signal (vector)
%   fs          - sampling rate (Hz)
%   frame_len   - frame length in seconds
%   frame_shift - frame shift in seconds
%   num_coeff   - number of cepstral coefficients to keep
%   num_filters - number of mel filterbank channels
% Output:
%   mfccs - [num_frames x 3*num_coeff] matrix: static, delta, delta-delta
samples_per_frame = round(frame_len * fs);
hop = round(frame_shift * fs);
% Pre-emphasis boosts high frequencies: y[n] = x[n] - 0.97*x[n-1].
emphasized = filter([1, -0.97], 1, audio);
% Frame the signal with overlap, then apply a Hamming window per frame.
frame_mat = buffer(emphasized, samples_per_frame, samples_per_frame - hop, 'nodelay');
windowed = frame_mat .* hamming(samples_per_frame);
% Periodogram estimate of the per-frame power spectrum.
NFFT = 2^nextpow2(samples_per_frame);
power_spec = abs(fft(windowed, NFFT)).^2 / NFFT;
% Mel filterbank energies on the one-sided spectrum, with log compression
% (eps avoids log(0) for silent frames).
fbank = mel_filterbank(num_filters, NFFT, fs);
log_fbank = log(fbank * power_spec(1:NFFT/2+1, :) + eps);
% DCT decorrelates the log energies; keep the first num_coeff coefficients.
cepstra = dct(log_fbank);
cepstra = cepstra(1:num_coeff, :);
% Dynamic features via simple first differences (zero-padded at the start).
delta1 = [zeros(num_coeff, 1), diff(cepstra, 1, 2)];
delta2 = [zeros(num_coeff, 1), diff(delta1, 1, 2)];
% Stack static + dynamic features; transpose so rows are frames.
mfccs = [cepstra; delta1; delta2]';
end
function [means, covs, weights] = initialize_gmm(data, num_components)
% Initialize GMM parameters from a K-means clustering of the data.
%
% Inputs:
%   data           - [N x dim] feature matrix (rows are observations)
%   num_components - number of Gaussian components K
% Outputs:
%   means   - [K x dim] cluster centroids
%   covs    - [dim x dim x K] per-cluster covariances (regularized)
%   weights - [1 x K] cluster occupancy fractions (floored and normalized)
[N, dim] = size(data);
[idx, centers] = kmeans(data, num_components, 'MaxIter', 100);
means = centers;
weights = zeros(1, num_components);
covs = zeros(dim, dim, num_components);
% Fallback covariance for empty/singleton clusters: the global data
% covariance is scaled to the feature space, unlike a raw identity matrix.
fallback_cov = cov(data) + 1e-6 * eye(dim);
for k = 1:num_components
    members = data(idx == k, :);
    weights(k) = size(members, 1) / N;
    if size(members, 1) > 1
        covs(:,:,k) = cov(members) + 1e-6 * eye(dim); % diagonal loading
    else
        covs(:,:,k) = fallback_cov;
    end
end
% Guard against empty clusters: a zero weight would produce log(0) = -Inf
% in the EM E-step. Floor the weights and renormalize to sum to 1.
weights = max(weights, 1e-8);
weights = weights / sum(weights);
end
function [means, covs, weights, log_likelihood] = train_gmm_em(data, ...
    init_means, init_covs, init_weights, max_iter)
% Fit GMM parameters to data with the EM algorithm.
%
% Inputs:
%   data         - [N x dim] feature matrix (rows are observations)
%   init_means   - [K x dim] initial component means
%   init_covs    - [dim x dim x K] initial covariance matrices
%   init_weights - [1 x K] initial mixture weights
%   max_iter     - maximum number of EM iterations
% Outputs:
%   means, covs, weights - fitted parameters (same layouts as the inputs)
%   log_likelihood       - per-iteration total log-likelihood, truncated at
%                          the iteration where convergence was detected
[N, dim] = size(data);
K = size(init_means, 1);
means = init_means;
covs = init_covs;
weights = init_weights;
log_likelihood = zeros(max_iter, 1);
tolerance = 1e-6;
for iter = 1:max_iter
    % E-step: per-component log joint densities log(w_k) + log N(x | k).
    log_prob = zeros(N, K);
    for k = 1:K
        log_prob(:, k) = log(weights(k)) + ...
            log_mvnpdf(data, means(k,:), covs(:,:,k));
    end
    % Log-sum-exp over components for a numerically stable log-likelihood.
    max_log_prob = max(log_prob, [], 2);
    shifted = exp(log_prob - max_log_prob);
    row_sums = sum(shifted, 2);
    log_likelihood(iter) = sum(log(row_sums) + max_log_prob);
    % Posterior responsibilities gamma(n,k).
    gamma = shifted ./ row_sums;
    % M-step: update weights, means, covariances.
    % Floor Nk to avoid division by zero if a component loses all mass.
    Nk = max(sum(gamma, 1), eps);
    weights = Nk / N;
    for k = 1:K
        means(k,:) = sum(gamma(:,k) .* data, 1) / Nk(k);
        % Renamed from "diff" so the builtin diff() is not shadowed.
        centered = data - means(k,:);
        weighted = centered .* sqrt(gamma(:,k));
        % Diagonal loading keeps the covariance positive definite.
        covs(:,:,k) = (weighted' * weighted) / Nk(k) + ...
            1e-6 * eye(dim);
    end
    % Stop when the absolute log-likelihood change falls below tolerance.
    if iter > 1 && abs(log_likelihood(iter) - log_likelihood(iter-1)) < tolerance
        log_likelihood = log_likelihood(1:iter);
        break;
    end
end
end
function log_prob = log_mvnpdf(X, mu, Sigma)
% Log-density of each row of X under the multivariate normal N(mu, Sigma).
%
% Inputs:
%   X     - [N x dim] observations (rows)
%   mu    - [1 x dim] mean vector
%   Sigma - [dim x dim] covariance matrix
% Output:
%   log_prob - [N x 1] log-densities
dim = size(X, 2);
X_centered = X - mu;
% Cholesky factorization Sigma = R'*R. If Sigma is not numerically positive
% definite, escalate a diagonal jitter until chol succeeds — a single fixed
% 1e-6 bump (as before) can still fail for badly conditioned covariances.
[R, p] = chol(Sigma);
jitter = 1e-6;
while p > 0
    [R, p] = chol(Sigma + jitter * eye(dim));
    jitter = jitter * 10;
    if jitter > 1e6
        error('log_mvnpdf:notPSD', ...
            'Sigma could not be regularized to be positive definite.');
    end
end
% quadform(i) = (x_i-mu) * inv(Sigma) * (x_i-mu)'; log|Sigma| = 2*sum(log(diag(R))).
quadform = sum((X_centered / R).^2, 2);
log_prob = -0.5 * quadform - sum(log(diag(R))) - (dim/2) * log(2*pi);
end
function likelihood = compute_gmm_likelihood(x, means, covs, weights)
% Likelihood of a single observation under a GMM.
%
% Inputs:
%   x       - [1 x dim] feature vector
%   means   - [K x dim] component means
%   covs    - [dim x dim x K] component covariances
%   weights - [1 x K] mixture weights
% Output:
%   likelihood - sum_k w_k * N(x | mu_k, Sigma_k), evaluated with the
%                log-sum-exp trick for numerical stability
%
% NOTE(review): the result is returned in the linear domain, so it can
% underflow to 0 for high-dimensional features; callers that only rank
% scores could use the log-likelihood instead — confirm before changing.
num_components = size(means, 1);
component_logs = zeros(1, num_components);
for c = 1:num_components
    component_logs(c) = log(weights(c)) + ...
        log_mvnpdf(x, means(c,:), covs(:,:,c));
end
% Log-sum-exp: factor out the largest exponent before summing.
peak = max(component_logs);
likelihood = exp(peak) * sum(exp(component_logs - peak));
end
function plot_roc_det_curves(scores, num_speakers)
% Plot per-speaker ROC curves (left) and DET curves (right).
%
% Inputs:
%   scores       - cell array; scores{s} is a [num_trials x num_speakers]
%                  score matrix for trials whose true speaker is s
%   num_speakers - number of enrolled speakers
colors = lines(num_speakers);
% ROC: hit rate (1-FRR) against false-accept rate.
subplot(1,2,1);
for spk = 1:num_speakers
    [far, frr] = far_frr_sweep(spk);
    plot(far, 1-frr, 'Color', colors(spk,:), 'LineWidth', 2);
    hold on;
end
plot([0 1], [0 1], 'k--'); % chance diagonal
title('ROC曲线'); xlabel('错误接受率(FAR)'); ylabel('正确接受率(1-FRR)');
legend(arrayfun(@(x) sprintf('说话人%d', x), 1:num_speakers, 'UniformOutput', false));
grid on; axis equal; xlim([0 1]); ylim([0 1]);
% DET: FAR vs FRR on normal-deviate (probit) axes.
subplot(1,2,2);
for spk = 1:num_speakers
    [far, frr] = far_frr_sweep(spk);
    % eps avoids norminv(0) = -Inf at the sweep extremes.
    plot(norminv(far + eps), norminv(frr + eps), ...
        'Color', colors(spk,:), 'LineWidth', 2);
    hold on;
end
% Axis labels fixed: the plotted values are probit-transformed rates,
% not percentages as the old labels claimed.
title('DET曲线'); xlabel('norminv(FAR)'); ylabel('norminv(FRR)');
grid on;
    function [far, frr] = far_frr_sweep(spk)
    % Sweep 100 thresholds over this speaker's score range; return the
    % false-accept and false-reject rate at each threshold.
        spk_scores = scores{spk};
        target_scores = spk_scores(:, spk);
        nontarget_scores = spk_scores(:, setdiff(1:num_speakers, spk));
        thresholds = linspace(min(spk_scores(:)), max(spk_scores(:)), 100);
        far = sum(nontarget_scores(:) > thresholds, 1) / numel(nontarget_scores);
        frr = sum(target_scores(:) <= thresholds, 1) / numel(target_scores);
    end
end
function [train_data, test_data] = generate_simulated_speech(spk_id, fs)
% Generate synthetic "speech" for one speaker (stand-in for real audio files).
%
% Inputs:
%   spk_id - integer speaker index; shifts the formant frequencies so each
%            speaker has a distinct spectral envelope
%   fs     - sampling rate (Hz)
% Outputs:
%   train_data - [L x 1] training signal, deterministic for a given spk_id
%                (L = 3*fs + 1 samples)
%   test_data  - [5L x 1] five noisy test signals with random phase offsets,
%                concatenated one after another
duration = 3; % seconds per signal
t = 0:1/fs:duration;
% Speaker-dependent formant frequencies (Hz).
formants = [500 + spk_id*100, 1500 + spk_id*150, 2500 + spk_id*200];
% Synthesize a decaying sum of formant sinusoids (vowel-like sound).
train_data = zeros(1, length(t));
for f = formants
    train_data = train_data + sin(2*pi*f*t) .* exp(-0.5*t);
end
% Five test variants: random phase per formant plus additive noise.
test_data = zeros(5, length(t));
for i = 1:5
    for f = formants
        phase_shift = rand() * 2*pi;
        test_data(i,:) = test_data(i,:) + ...
            sin(2*pi*f*t + phase_shift) .* exp(-0.5*t);
    end
    test_data(i,:) = test_data(i,:) + 0.1*randn(1, length(t));
end
train_data = train_data(:);
% BUG FIX: test_data(:) flattened the 5xL matrix column-major, interleaving
% samples of the five test signals into meaningless audio. Transposing first
% concatenates the signals sequentially, with the same output size and
% orientation as before.
test_data = reshape(test_data.', [], 1);
end
三、系统优化与进阶方法
3.1 特征增强技术
matlab
% 1. 特征规整(CMVN)
features_norm = (features - mean(features)) ./ std(features);
% 2. RASTA滤波(抑制信道影响)
features_rasta = rastaplp(features, fs);
% 3. i-vector提取(降维+说话人因子)
[ivectors, projection_matrix] = extract_ivectors(features, total_variability_matrix);
3.2 GMM-UBM框架(更优方案)
matlab
% 1. 训练通用背景模型(UBM)
% 注:gmdistribution.fit 已被弃用,应使用 fitgmdist
ubm = fitgmdist(all_features, num_gaussians, ...
    'CovarianceType', 'diagonal', 'RegularizationValue', 1e-6);
% 2. 最大后验概率(MAP)自适应
for spk = 1:num_speakers
gmm_models{spk} = map_adapt(ubm, train_features{spk}, ...
'adapt_means', true, 'adapt_weights', false);
end
3.3 与深度学习的结合
matlab
% 1. DNN特征提取
dnn_features = extract_dnn_features(audio, pretrained_dnn);
% 2. GMM作为后端分类器
gmm_scores = compute_gmm_scores(dnn_features, gmm_models);
% 3. 分数融合
final_scores = 0.7 * dnn_scores + 0.3 * gmm_scores;
参考代码 基于高斯混合模型(GMM)的说话人识别系统 www.youwenfan.com/contentcso/96407.html
四、实际部署建议
- 实时性优化
  matlab
  % 使用并行计算加速GMM似然计算
  parfor k = 1:K
      likelihoods(k) = compute_component_likelihood(x, means(k,:), covs(:,:,k));
  end
- 模型压缩
  - 减少高斯分量数量
  - 使用对角协方差矩阵
  - 量化模型参数(16位定点数)
- 抗噪增强
- 集成维纳滤波或谱减法
- 使用噪声自适应GMM
- 多条件训练
此系统提供了一个完整的GMM说话人识别框架。如果您有特定需求(如实时识别、抗噪声优化、大规模说话人库等),我可以提供更专门的改进方案。