一、实验原理与架构
复制代码
┌─────────────────────────────────────────────────────────────┐
│ HMM孤立字语音识别系统 │
├─────────────────────────────────────────────────────────────┤
│ 语音采集层 │ 特征提取层 │ HMM建模层 │ 识别决策层 │
│ │ │ │ │
│ • 麦克风录音 │ • 预加重 │ • 训练HMM模型 │ • 维特比解码 │
│ • 语音文件 │ • 分帧加窗 │ • 模型参数估计│ • 似然度计算 │
│ • 端点检测 │ • MFCC提取 │ • 模型优化 │ • 最优路径选择│
│ │ • 特征归一化 │ • 模型选择 │ • 结果输出 │
└─────────────────────────────────────────────────────────────┘
二、完整 MATLAB 实现
2.1 主实验脚本 (hmm_speech_recognition_experiment.m)
matlab
复制代码
%% HMM-based isolated-word speech recognition experiment
% Author: AI assistant
% Date: 2024
% Purpose: end-to-end recognition of isolated spoken digits (0-9) using
%          one HMM per digit. Runs dataset creation, MFCC extraction,
%          HMM training, evaluation, plotting and saving, in that order.
clear all; close all; clc;
fprintf('=== 基于HMM的孤立字语音识别实验 ===\n\n');

%% 1. Experiment parameters
params = struct( ...
    'sample_rate',             16000, ... % sampling rate (Hz)
    'frame_length',            25,    ... % frame length (ms)
    'frame_shift',             10,    ... % frame shift (ms)
    'num_mfcc',                13,    ... % number of MFCC coefficients
    'num_states',              5,     ... % HMM states per model
    'num_mixtures',            3,     ... % Gaussian mixtures per state
    'num_digits',              10,    ... % digits 0-9
    'train_samples_per_digit', 20,    ... % training utterances per digit
    'test_samples_per_digit',  5);        % test utterances per digit

fprintf('实验参数设置:\n');
fprintf(' 采样率: %d Hz\n', params.sample_rate);
fprintf(' MFCC特征维度: %d\n', params.num_mfcc);
fprintf(' HMM状态数: %d\n', params.num_states);
fprintf(' 训练样本/字: %d\n', params.train_samples_per_digit);
fprintf(' 测试样本/字: %d\n\n', params.test_samples_per_digit);

%% 2. Build the data set
fprintf('创建实验数据集...\n');
dataset = create_speech_dataset(params);

%% 3. Feature extraction
fprintf('提取MFCC特征...\n');
[train_features, test_features] = extract_mfcc_features(dataset, params);

%% 4. Train one HMM per digit
fprintf('训练HMM模型...\n');
hmm_models = train_hmm_models(train_features, params);

%% 5. Evaluate on the held-out utterances
fprintf('测试识别性能...\n');
recognition_results = test_recognition_performance(test_features, hmm_models, params);

%% 6. Plots
fprintf('生成可视化结果...\n');
visualize_results(recognition_results, params);

%% 7. Persist models and results
save('hmm_speech_models.mat', 'hmm_models', 'params');
save('recognition_results.mat', 'recognition_results');

fprintf('\n=== 实验完成 ===\n');
fprintf('识别准确率: %.2f%%\n', recognition_results.overall_accuracy * 100);
2.2 数据集创建模块 (create_speech_dataset.m)
matlab
复制代码
function [dataset] = create_speech_dataset(params)
% CREATE_SPEECH_DATASET  Build the isolated-digit data set.
%   dataset = create_speech_dataset(params) returns a struct with fields
%     digits     - cell array of the ten English digit names
%     train_data - num_digits x train_samples_per_digit cell of waveforms
%     test_data  - num_digits x test_samples_per_digit cell of waveforms
%   Every waveform comes from generate_synthetic_speech (2.0 s long at
%   params.sample_rate); real recordings should replace it in practice.
    fprintf(' 创建数字0-9的语音数据集...\n');

    % Vocabulary
    word_list = {'zero', 'one', 'two', 'three', 'four', 'five', 'six', 'seven', 'eight', 'nine'};
    n_train = params.train_samples_per_digit;
    n_test = params.test_samples_per_digit;
    utterance_seconds = 2.0;

    dataset = struct();
    dataset.digits = word_list;
    dataset.train_data = cell(params.num_digits, n_train);
    dataset.test_data = cell(params.num_digits, n_test);

    for d = 1:params.num_digits
        fprintf(' 生成数字 "%s" 的语音数据...\n', word_list{d});
        % Training utterances are generated first, then test utterances,
        % so the random-number stream is consumed in that order.
        for k = 1:(n_train + n_test)
            waveform = generate_synthetic_speech(d, params.sample_rate, utterance_seconds);
            if k <= n_train
                dataset.train_data{d, k} = waveform;
            else
                dataset.test_data{d, k - n_train} = waveform;
            end
        end
    end

    fprintf(' 数据集创建完成\n');
end
function [speech_signal] = generate_synthetic_speech(digit_idx, fs, duration)
% GENERATE_SYNTHETIC_SPEECH  Simplified stand-in for a recorded utterance.
%   speech_signal = generate_synthetic_speech(digit_idx, fs, duration)
%   digit_idx - 1-based index into a table of ten fundamental frequencies
%   fs        - sampling rate (Hz)
%   duration  - length in seconds (the time axis includes both end points)
%   Returns a row vector: five harmonics of the digit's fundamental plus
%   Gaussian noise, shaped by a decaying exponential and peak-normalised.
%   Real applications should use genuine recordings instead.

    time_axis = 0:1/fs:duration;

    % One fundamental frequency per digit
    pitch_table = [100, 150, 200, 250, 300, 350, 400, 450, 500, 550];
    f0 = pitch_table(digit_idx);

    % Harmonic stack with 1/k amplitude roll-off
    tone = zeros(size(time_axis));
    k = 1;
    while k <= 5
        partial_freq = f0 * k;
        tone = tone + (1/k) * sin(2*pi*partial_freq*time_axis);
        k = k + 1;
    end

    % Additive noise, then the decaying envelope
    noisy = tone + 0.1 * randn(size(time_axis));
    shaped = noisy .* exp(-2*time_axis/duration);

    % Peak normalisation
    speech_signal = shaped / max(abs(shaped));
end
2.3 特征提取模块 (extract_mfcc_features.m)
matlab
复制代码
function [train_features, test_features] = extract_mfcc_features(dataset, params)
% EXTRACT_MFCC_FEATURES  Run compute_mfcc over every utterance in the data set.
%   [train_features, test_features] = extract_mfcc_features(dataset, params)
%   Returns two cell arrays, num_digits x train_samples_per_digit and
%   num_digits x test_samples_per_digit, each cell holding the matrix
%   returned by compute_mfcc for the corresponding waveform.
    fprintf(' 提取MFCC特征...\n');

    % Frame length / shift converted from milliseconds to samples
    frame_len = round(params.frame_length * params.sample_rate / 1000);
    frame_shift = round(params.frame_shift * params.sample_rate / 1000);

    n_digits = params.num_digits;
    featurize = @(waveform) compute_mfcc(waveform, params.sample_rate, ...
        frame_len, frame_shift, params.num_mfcc);

    % Training utterances
    fprintf(' 提取训练特征...\n');
    train_features = cellfun(featurize, ...
        dataset.train_data(1:n_digits, 1:params.train_samples_per_digit), ...
        'UniformOutput', false);

    % Test utterances
    fprintf(' 提取测试特征...\n');
    test_features = cellfun(featurize, ...
        dataset.test_data(1:n_digits, 1:params.test_samples_per_digit), ...
        'UniformOutput', false);

    fprintf(' 特征提取完成\n');
end
function [mfcc_features] = compute_mfcc(speech_signal, fs, frame_len, frame_shift, num_mfcc)
% COMPUTE_MFCC  MFCC + delta features for one utterance.
%   mfcc_features = compute_mfcc(speech_signal, fs, frame_len, frame_shift, num_mfcc)
%   speech_signal - waveform (row or column vector)
%   fs            - sampling rate (Hz)
%   frame_len     - frame length in samples
%   frame_shift   - frame shift in samples
%   num_mfcc      - number of cepstral coefficients to keep (<= 26)
%   Returns a num_frames x (2*num_mfcc) matrix: static coefficients followed
%   by first-order deltas. A signal shorter than one frame yields a
%   0 x (2*num_mfcc) matrix.

    num_filters = 26;   % size of the mel filterbank
    if num_mfcc > num_filters
        error('compute_mfcc:tooManyCoefficients', ...
            'num_mfcc (%d) cannot exceed the number of mel filters (%d).', ...
            num_mfcc, num_filters);
    end

    % Work on a column vector so framing is independent of input orientation
    samples = speech_signal(:);

    % Pre-emphasis
    pre_emphasis_coeff = 0.97;
    emphasised = filter([1, -pre_emphasis_coeff], 1, samples);

    % Framing
    num_frames = floor((numel(emphasised) - frame_len) / frame_shift) + 1;
    if num_frames < 1
        mfcc_features = zeros(0, 2 * num_mfcc);
        return;
    end
    sample_idx = bsxfun(@plus, (1:frame_len)', (0:num_frames-1) * frame_shift);
    frames = reshape(emphasised(sample_idx), frame_len, num_frames);

    % Hamming window
    frames_windowed = bsxfun(@times, frames, hamming(frame_len));

    % Power spectrum
    nfft = 2^nextpow2(frame_len);
    power_spectrum = (abs(fft(frames_windowed, nfft)).^2) / frame_len;

    % Mel filterbank energies (only the non-negative-frequency bins are used)
    mel_filters = create_mel_filterbank(fs, nfft, num_filters);
    mel_energies = mel_filters * power_spectrum(1:size(mel_filters, 2), :);
    log_mel_energies = log(mel_energies + eps);

    % DCT across ALL filterbank channels, then keep the lowest num_mfcc
    % cepstral coefficients. Truncating the channels before the DCT (as the
    % earlier version did) discards the upper mel bands entirely.
    cepstra = dct(log_mel_energies);
    static_features = cepstra(1:num_mfcc, :)';

    % First-order deltas: central difference; first and last frame stay zero
    delta_features = zeros(size(static_features));
    if num_frames >= 3
        delta_features(2:end-1, :) = ...
            (static_features(3:end, :) - static_features(1:end-2, :)) / 2;
    end

    % Static and dynamic features side by side
    mfcc_features = [static_features, delta_features];
end
function [mel_filters] = create_mel_filterbank(fs, nfft, num_filters)
% CREATE_MEL_FILTERBANK  Triangular mel-spaced filterbank.
%   mel_filters = create_mel_filterbank(fs, nfft, num_filters)
%   fs          - sampling rate (Hz)
%   nfft        - FFT length
%   num_filters - number of triangular filters
%   Returns a num_filters x (floor(nfft/2)+1) matrix. Column c corresponds
%   to FFT bin c-1 (MATLAB columns are 1-based, FFT bins are 0-based).

    low_freq = 0;
    high_freq = fs / 2;

    % Hz <-> mel conversions
    hz_to_mel = @(f) 2595 * log10(1 + f / 700);
    mel_to_hz = @(m) 700 * (10.^(m / 2595) - 1);

    % Filter edges: equally spaced on the mel scale, mapped back to Hz
    mel_points = linspace(hz_to_mel(low_freq), hz_to_mel(high_freq), num_filters + 2);
    freq_points = mel_to_hz(mel_points);

    % Zero-based FFT bin of every edge, clamped to the available bins
    num_bins = floor(nfft / 2) + 1;
    bin_points = floor((nfft + 1) * freq_points / fs);
    bin_points = min(max(bin_points, 0), num_bins - 1);

    mel_filters = zeros(num_filters, num_bins);
    for filter_idx = 1:num_filters
        left_bin = bin_points(filter_idx);
        centre_bin = bin_points(filter_idx + 1);
        right_bin = bin_points(filter_idx + 2);
        for bin_idx = left_bin:right_bin
            if bin_idx < centre_bin
                % Rising slope (centre_bin > left_bin here, so no 0/0)
                weight = (bin_idx - left_bin) / (centre_bin - left_bin);
            elseif right_bin > centre_bin
                % Falling slope
                weight = (right_bin - bin_idx) / (right_bin - centre_bin);
            else
                % Degenerate triangle: centre and right edge share a bin
                weight = 1;
            end
            % +1 converts the zero-based bin number into a column index;
            % without it the first filter would index column 0.
            mel_filters(filter_idx, bin_idx + 1) = weight;
        end
    end
end
2.4 HMM 训练模块 (train_hmm_models.m)
matlab
复制代码
function [hmm_models] = train_hmm_models(train_features, params)
% TRAIN_HMM_MODELS  Train one HMM per digit.
%   hmm_models = train_hmm_models(train_features, params)
%   train_features - num_digits x train_samples_per_digit cell array of
%                    feature matrices (frames x dims)
%   params         - struct; uses num_digits and train_samples_per_digit
%                    here and is forwarded to train_single_hmm
%   Returns a num_digits x 1 cell array of model structs.
%   NOTE: all utterances of a digit are stacked into one long sequence
%   before training, so utterance boundaries are not visible to the HMM.
    fprintf(' 训练HMM模型...\n');

    % MATLAB cannot index a cell literal directly ({...}{k} is a syntax
    % error), so the names are kept in a variable.
    digit_names = {'zero', 'one', 'two', 'three', 'four', 'five', 'six', 'seven', 'eight', 'nine'};

    hmm_models = cell(params.num_digits, 1);
    for digit_idx = 1:params.num_digits
        if digit_idx <= numel(digit_names)
            digit_label = digit_names{digit_idx};
        else
            % No name available beyond the ten digits: fall back to the number
            digit_label = sprintf('%d', digit_idx - 1);
        end
        fprintf(' 训练数字 "%s" 的HMM模型...\n', digit_label);

        % Stack every training utterance of this digit (single concatenation
        % instead of growing the matrix inside a loop)
        digit_features = vertcat(train_features{digit_idx, 1:params.train_samples_per_digit});

        hmm_models{digit_idx} = train_single_hmm(digit_features, params);
    end

    fprintf(' HMM模型训练完成\n');
end
function [hmm_model] = train_single_hmm(features, params)
% TRAIN_SINGLE_HMM  Iteratively fit one HMM to a feature sequence.
%   hmm_model = train_single_hmm(features, params)
%   features - frames x dims observation matrix
%   params   - struct; uses num_states and num_mixtures
%   Returns a struct with fields A (transition matrix), B (per-state
%   mixture parameters) and pi (initial state distribution).
%   Runs at most 50 E/M iterations and stops early when the log-likelihood
%   changes by less than 1e-6, or when it is no longer finite.

    hmm_model = struct();
    % 1. Transition matrix A
    hmm_model.A = initialize_transition_matrix(params.num_states);
    % 2. Observation model B (Gaussian mixtures)
    hmm_model.B = initialize_observation_matrix(features, params.num_states, params.num_mixtures);
    % 3. Initial state distribution: always start in state 1
    hmm_model.pi = [1, zeros(1, params.num_states-1)];

    max_iterations = 50;
    tolerance = 1e-6;
    prev_log_likelihood = -inf;

    for iteration = 1:max_iterations
        % E-step: forward/backward probabilities
        [alpha, beta, log_likelihood] = forward_backward_algorithm(features, hmm_model);

        % A non-finite likelihood means the forward pass underflowed or the
        % model assigns zero probability to the data. The convergence test
        % below would compare against NaN and never fire, and the M-step
        % would overwrite pi with zeros, so keep the current model and stop.
        if ~isfinite(log_likelihood)
            warning('train_single_hmm:nonFiniteLikelihood', ...
                'Log-likelihood is not finite at iteration %d; keeping current parameters.', ...
                iteration);
            break;
        end

        % Convergence check
        if abs(log_likelihood - prev_log_likelihood) < tolerance
            fprintf(' 收敛于第 %d 次迭代\n', iteration);
            break;
        end
        prev_log_likelihood = log_likelihood;

        % M-step: re-estimate parameters
        [hmm_model.A, hmm_model.B, hmm_model.pi] = reestimate_parameters(...
            features, alpha, beta, hmm_model, params.num_states, params.num_mixtures);

        if mod(iteration, 10) == 0
            fprintf(' 迭代 %d: 对数似然 = %.2f\n', iteration, log_likelihood);
        end
    end
end
function [A] = initialize_transition_matrix(num_states)
% INITIALIZE_TRANSITION_MATRIX  Left-to-right transition matrix.
%   A = initialize_transition_matrix(num_states) returns a
%   num_states x num_states matrix in which every state except the last
%   stays with probability 0.6 and advances to the next state with
%   probability 0.4; the last state loops on itself with probability 1.

    A = zeros(num_states, num_states);

    % Linear indices of the diagonal entries (i,i) for i = 1..num_states-1
    inner_states = 1:num_states-1;
    self_loop_idx = (inner_states - 1) * num_states + inner_states;

    A(self_loop_idx) = 0.6;
    % (i,i+1) lies exactly one column (num_states entries) further on
    A(self_loop_idx + num_states) = 0.4;

    % Final state is absorbing
    if num_states >= 1
        A(num_states, num_states) = 1.0;
    end
end
function [B] = initialize_observation_matrix(features, num_states, num_mixtures)
% INITIALIZE_OBSERVATION_MATRIX  Initial Gaussian-mixture parameters per state.
%   B = initialize_observation_matrix(features, num_states, num_mixtures)
%   features - frames x dims observation matrix
%   Returns a num_states x 1 cell array; each cell is a struct with
%     weights     - num_mixtures x 1, uniform
%     means       - dims x num_mixtures, k-means centroids of 100 frames
%                   drawn at random (with replacement) from features
%     covariances - dims x dims x num_mixtures, each 0.1 * identity

    feature_dim = size(features, 2);
    B = cell(num_states, 1);

    for state_idx = 1:num_states
        % Random subset of frames used to seed this state
        state_features = features(randi(size(features, 1), 100, 1), :);

        % kmeans returns [idx, C]: the cluster index of every row first and
        % the centroid matrix second. The centroids are what is needed here.
        [~, cluster_centers] = kmeans(state_features, num_mixtures);

        B{state_idx}.weights = ones(num_mixtures, 1) / num_mixtures;
        B{state_idx}.means = cluster_centers';
        B{state_idx}.covariances = repmat(eye(feature_dim) * 0.1, [1, 1, num_mixtures]);
    end
end
2.5 前向-后向算法 (forward_backward_algorithm.m)
matlab
复制代码
function [alpha, beta, log_likelihood] = forward_backward_algorithm(features, hmm_model)
% FORWARD_BACKWARD_ALGORITHM  Forward and backward probabilities of an HMM.
%   [alpha, beta, log_likelihood] = forward_backward_algorithm(features, hmm_model)
%   features  - frames x dims observation matrix
%   hmm_model - struct with fields A, B (cell of structs with means and
%               covariances) and pi
%   alpha, beta    - frames x states forward / backward probabilities
%   log_likelihood - log of the sum of the last row of alpha
%   NOTE: the recursions are unscaled, so for long sequences alpha and beta
%   can underflow to zero and log_likelihood becomes -Inf. For an empty
%   sequence both matrices are empty and log_likelihood is -Inf.

    num_frames = size(features, 1);
    num_states = length(hmm_model.pi);

    alpha = zeros(num_frames, num_states);
    beta = zeros(num_frames, num_states);
    if num_frames == 0
        log_likelihood = -inf;
        return;
    end

    % Emission density of every frame under every state, evaluated once.
    % Both recursions reuse this table instead of re-evaluating the density
    % for every (state, next_state) pair.
    emission = zeros(num_frames, num_states);
    for state_idx = 1:num_states
        state_model = hmm_model.B{state_idx};
        for frame_idx = 1:num_frames
            emission(frame_idx, state_idx) = gaussian_probability( ...
                features(frame_idx, :), state_model.means, state_model.covariances);
        end
    end

    % pi may be stored as a row or a column; use a row here
    initial_dist = reshape(hmm_model.pi, 1, []);

    % Forward pass
    alpha(1, :) = initial_dist .* emission(1, :);
    for frame_idx = 2:num_frames
        alpha(frame_idx, :) = (alpha(frame_idx-1, :) * hmm_model.A) .* emission(frame_idx, :);
    end

    % Backward pass
    beta(num_frames, :) = 1;
    for frame_idx = num_frames-1:-1:1
        beta(frame_idx, :) = (hmm_model.A * ...
            (emission(frame_idx+1, :) .* beta(frame_idx+1, :))')';
    end

    log_likelihood = log(sum(alpha(num_frames, :)));
end
function [prob] = gaussian_probability(feature_vector, means, covariances)
% GAUSSIAN_PROBABILITY  Sum of multivariate Gaussian densities at one point.
%   prob = gaussian_probability(feature_vector, means, covariances)
%   feature_vector - observation, row or column vector of length dims
%   means          - dims x num_mixtures, one mean per column
%   covariances    - dims x dims x num_mixtures
%   Returns a scalar: the densities of all mixture components added
%   together with unit weight (mixture weights are not applied here).

    % Force a column so that subtracting the column mean yields a vector.
    % A row minus a column would expand to a dims x dims matrix (or error
    % on releases without implicit expansion).
    observation = feature_vector(:);
    num_mixtures = size(means, 2);
    feature_dim = numel(observation);

    prob = 0;
    for mixture_idx = 1:num_mixtures
        cov_matrix = covariances(:, :, mixture_idx);
        deviation = observation - means(:, mixture_idx);

        % Squared Mahalanobis distance via a linear solve (no explicit inverse)
        exponent = -0.5 * (deviation' * (cov_matrix \ deviation));
        normalization = (2*pi)^(feature_dim/2) * sqrt(det(cov_matrix));
        prob = prob + exp(exponent) / normalization;
    end
end
2.6 参数重估计 (reestimate_parameters.m)
matlab
复制代码
function [A_new, B_new, pi_new] = reestimate_parameters(...
        features, alpha, beta, hmm_model, num_states, num_mixtures)
% REESTIMATE_PARAMETERS  M-step: update A, B and pi from alpha/beta.
%   [A_new, B_new, pi_new] = reestimate_parameters(features, alpha, beta, ...
%                                hmm_model, num_states, num_mixtures)
%   features    - frames x dims observation matrix
%   alpha, beta - frames x states forward / backward probabilities
%                 (unscaled, as produced by forward_backward_algorithm)
%   hmm_model   - current model (fields A, B, pi)
%   A_new  - states x states transition matrix; a row whose state has zero
%            total occupancy keeps its previous values
%   B_new  - states x 1 cell of mixture structs; a state is re-clustered
%            with k-means only if more than num_mixtures frames have
%            occupancy > 0.5, otherwise its previous parameters are kept.
%            Weights are reset to uniform and covariances to 0.1 * identity.
%   pi_new - states x 1 column: state occupancy of the first frame

    num_frames = size(features, 1);
    feature_dim = size(features, 2);

    % State occupancy gamma(t, i); frames with zero evidence stay all-zero
    joint = alpha .* beta;
    evidence = sum(joint, 2);
    has_evidence = evidence > 0;
    gamma = zeros(num_frames, num_states);
    gamma(has_evidence, :) = bsxfun(@rdivide, joint(has_evidence, :), evidence(has_evidence));

    % Emission density of frames 2..T under every state. It depends only on
    % the destination state, so it is evaluated once per (frame, state)
    % rather than once per (frame, source, destination).
    emission = zeros(num_frames, num_states);
    for frame_idx = 2:num_frames
        for j = 1:num_states
            emission(frame_idx, j) = gaussian_probability(features(frame_idx, :), ...
                hmm_model.B{j}.means, hmm_model.B{j}.covariances);
        end
    end

    % Expected transition counts, summed over time:
    %   xi(t,i,j) = alpha(t,i) * A(i,j) * b_j(o_{t+1}) * beta(t+1,j) / evidence(t)
    xi_total = zeros(num_states, num_states);
    for frame_idx = 1:num_frames-1
        if has_evidence(frame_idx)
            outgoing = alpha(frame_idx, :)' * ...
                (emission(frame_idx+1, :) .* beta(frame_idx+1, :));
            xi_total = xi_total + (outgoing .* hmm_model.A) / evidence(frame_idx);
        end
    end

    % Initial state distribution
    pi_new = gamma(1, :)';

    % Transition matrix
    A_new = zeros(num_states, num_states);
    for i = 1:num_states
        occupancy = sum(gamma(1:end-1, i));
        if occupancy > 0
            A_new(i, :) = xi_total(i, :) / occupancy;
        else
            A_new(i, :) = hmm_model.A(i, :);
        end
    end

    % Observation model
    B_new = cell(num_states, 1);
    for state_idx = 1:num_states
        % Frames assigned to this state (hard assignment at occupancy > 0.5)
        state_features = features(gamma(:, state_idx) > 0.5, :);

        if size(state_features, 1) > num_mixtures
            % kmeans returns [idx, C]; the centroids are the SECOND output
            [~, cluster_centers] = kmeans(state_features, num_mixtures);
            B_new{state_idx}.means = cluster_centers';
            B_new{state_idx}.weights = ones(num_mixtures, 1) / num_mixtures;
            B_new{state_idx}.covariances = repmat(eye(feature_dim) * 0.1, [1, 1, num_mixtures]);
        else
            B_new{state_idx} = hmm_model.B{state_idx};
        end
    end
end
2.7 识别测试模块 (test_recognition_performance.m)
matlab
复制代码
function [recognition_results] = test_recognition_performance(test_features, hmm_models, params)
% TEST_RECOGNITION_PERFORMANCE  Classify every test utterance and score the result.
%   recognition_results = test_recognition_performance(test_features, hmm_models, params)
%   test_features - num_digits x test_samples_per_digit cell of feature matrices
%   hmm_models    - num_digits x 1 cell of trained models
%   Returns a struct with fields
%     confusion_matrix   - rows: true digit, columns: recognised digit
%     per_digit_accuracy - num_digits x 1, fraction correct per true digit
%     overall_accuracy   - fraction correct over all test utterances
    fprintf(' 测试识别性能...\n');

    % MATLAB cannot index a cell literal directly ({...}{k} is a syntax
    % error), so the names are kept in a variable.
    digit_names = {'zero', 'one', 'two', 'three', 'four', 'five', 'six', 'seven', 'eight', 'nine'};

    recognition_results = struct();
    recognition_results.confusion_matrix = zeros(params.num_digits, params.num_digits);
    recognition_results.per_digit_accuracy = zeros(params.num_digits, 1);
    recognition_results.overall_accuracy = 0;

    total_correct = 0;
    total_samples = 0;

    for true_digit = 1:params.num_digits
        if true_digit <= numel(digit_names)
            digit_label = digit_names{true_digit};
        else
            % No name available beyond the ten digits: fall back to the number
            digit_label = sprintf('%d', true_digit - 1);
        end
        fprintf(' 测试数字 "%s"...\n', digit_label);

        for sample_idx = 1:params.test_samples_per_digit
            features = test_features{true_digit, sample_idx};

            % Score the utterance against every digit model
            log_likelihoods = zeros(params.num_digits, 1);
            for digit_idx = 1:params.num_digits
                log_likelihoods(digit_idx) = compute_log_likelihood(features, hmm_models{digit_idx});
            end

            % The best-scoring model is the recognition result
            [~, recognized_digit] = max(log_likelihoods);

            recognition_results.confusion_matrix(true_digit, recognized_digit) = ...
                recognition_results.confusion_matrix(true_digit, recognized_digit) + 1;

            if recognized_digit == true_digit
                total_correct = total_correct + 1;
            end
            total_samples = total_samples + 1;
        end
    end

    % Accuracy figures
    recognition_results.overall_accuracy = total_correct / total_samples;
    for digit_idx = 1:params.num_digits
        digit_total = sum(recognition_results.confusion_matrix(digit_idx, :));
        if digit_total > 0
            recognition_results.per_digit_accuracy(digit_idx) = ...
                recognition_results.confusion_matrix(digit_idx, digit_idx) / digit_total;
        end
    end

    fprintf(' 识别测试完成\n');
end
function [log_likelihood] = compute_log_likelihood(features, hmm_model)
% COMPUTE_LOG_LIKELIHOOD  Log-likelihood of an observation sequence under an HMM.
%   log_likelihood = compute_log_likelihood(features, hmm_model)
%   features  - frames x dims observation matrix
%   hmm_model - struct with fields A, pi and B (cell of structs with
%               means (dims x mixtures) and covariances (dims x dims x mixtures))
%   Returns a scalar. The forward recursion runs entirely in the log domain,
%   so long sequences do not underflow; the earlier linear-domain version
%   collapsed to log(eps) for every model once alpha reached zero.
%   The result is -Inf for an empty sequence or when the model assigns the
%   sequence zero probability.
%   The emission density is the unit-weight sum of the mixture components.

    num_frames = size(features, 1);
    num_states = length(hmm_model.pi);
    if num_frames == 0
        log_likelihood = -inf;
        return;
    end

    % log(0) = -Inf marks forbidden transitions / start states
    log_A = log(hmm_model.A);
    log_pi = log(reshape(hmm_model.pi, 1, []));

    % Log emission density of every frame under every state
    log_emission = zeros(num_frames, num_states);
    for state_idx = 1:num_states
        log_emission(:, state_idx) = cll_log_emission(features, hmm_model.B{state_idx});
    end

    % Forward recursion
    log_alpha = log_pi + log_emission(1, :);
    for frame_idx = 2:num_frames
        % path_scores(i, j) = log alpha_{t-1}(i) + log A(i, j)
        path_scores = bsxfun(@plus, log_alpha(:), log_A);
        log_alpha = cll_logsumexp(path_scores, 1) + log_emission(frame_idx, :);
    end

    log_likelihood = cll_logsumexp(log_alpha, 2);
end

function log_density = cll_log_emission(features, state_model)
% Log of the summed Gaussian component densities for every row of features.
    [num_frames, feature_dim] = size(features);
    num_mixtures = size(state_model.means, 2);
    component_log = zeros(num_frames, num_mixtures);
    for mixture_idx = 1:num_mixtures
        cov_matrix = state_model.covariances(:, :, mixture_idx);
        centred = bsxfun(@minus, features, state_model.means(:, mixture_idx)')';
        [chol_factor, not_pd] = chol(cov_matrix, 'lower');
        if not_pd == 0
            % Cholesky route: stable log-determinant and Mahalanobis distance
            whitened = chol_factor \ centred;
            mahalanobis = sum(whitened.^2, 1)';
            log_det = 2 * sum(log(diag(chol_factor)));
        else
            % Covariance is not positive definite: fall back to a plain solve
            mahalanobis = sum(centred .* (cov_matrix \ centred), 1)';
            log_det = log(det(cov_matrix));
        end
        component_log(:, mixture_idx) = ...
            -0.5 * (feature_dim * log(2*pi) + log_det + mahalanobis);
    end
    log_density = cll_logsumexp(component_log, 2);
end

function total = cll_logsumexp(values, dim)
% log(sum(exp(values), dim)) computed without overflow or underflow.
    peak = max(values, [], dim);
    % An all -Inf slice would give -Inf - (-Inf) = NaN; shift by 0 instead
    peak(~isfinite(peak)) = 0;
    total = peak + log(sum(exp(bsxfun(@minus, values, peak)), dim));
end
2.8 结果可视化 (visualize_results.m)
matlab
复制代码
function visualize_results(recognition_results, params)
% VISUALIZE_RESULTS  Six-panel summary figure of a recognition run.
%   visualize_results(recognition_results, params)
%   recognition_results - struct with confusion_matrix, per_digit_accuracy
%                         and overall_accuracy
%   params              - struct; uses num_digits, num_states, num_mfcc,
%                         train_samples_per_digit, test_samples_per_digit
%   Panels: confusion matrix, per-digit accuracy, overall accuracy pie,
%   an illustrative (hard-coded) transition matrix, an illustrative
%   (random) feature map, and a text summary.

    num_digits = params.num_digits;
    digit_labels = 0:(num_digits - 1);   % digit k is stored at index k+1
    per_digit_acc = recognition_results.per_digit_accuracy(:)';
    % Comma-separated list of digits; safe when several digits tie
    join_digits = @(values) strjoin( ...
        arrayfun(@(d) sprintf('%d', d), values, 'UniformOutput', false), ', ');

    figure('Position', [100, 100, 1400, 900]);

    % 1. Confusion matrix heat map
    subplot(2, 3, 1);
    confusion_matrix = recognition_results.confusion_matrix;
    imagesc(confusion_matrix);
    colorbar;
    xlabel('识别结果');
    ylabel('真实标签');
    title('混淆矩阵');
    % Tick labels follow num_digits instead of a fixed 0:9
    set(gca, 'XTick', 1:num_digits, 'XTickLabel', digit_labels);
    set(gca, 'YTick', 1:num_digits, 'YTickLabel', digit_labels);
    % Print the count inside every cell
    for i = 1:num_digits
        for j = 1:num_digits
            text(j, i, num2str(confusion_matrix(i, j)), ...
                'HorizontalAlignment', 'center', 'VerticalAlignment', 'middle', ...
                'Color', 'white', 'FontWeight', 'bold');
        end
    end

    % 2. Per-digit accuracy ('filled' is a scatter option, not a bar option)
    subplot(2, 3, 2);
    bar(digit_labels, per_digit_acc * 100);
    xlabel('数字');
    ylabel('识别准确率 (%)');
    title('各数字识别准确率');
    ylim([0, 100]);
    grid on;

    % 3. Overall accuracy
    subplot(2, 3, 3);
    overall_acc = recognition_results.overall_accuracy * 100;
    pie([overall_acc, 100-overall_acc], ...
        {sprintf('正确识别\n%.1f%%', overall_acc), sprintf('错误识别\n%.1f%%', 100-overall_acc)});
    title('总体识别准确率');

    % 4. Illustrative transition matrix (fixed example, not a trained model)
    subplot(2, 3, 4);
    A_example = [0.7, 0.3, 0, 0, 0; ...
        0.1, 0.6, 0.3, 0, 0; ...
        0, 0.1, 0.6, 0.3, 0; ...
        0, 0, 0.1, 0.6, 0.3; ...
        0, 0, 0, 0, 1.0];
    imagesc(A_example);
    colorbar;
    xlabel('下一状态');
    ylabel('当前状态');
    title('HMM状态转移矩阵示例');

    % 5. Illustrative feature map (random data: 100 frames x 26 dims)
    subplot(2, 3, 5);
    example_mfcc = randn(100, 26);
    imagesc(example_mfcc');
    xlabel('帧序号');
    ylabel('MFCC系数');
    title('MFCC特征示例');
    colorbar;

    % 6. Text summary
    subplot(2, 3, 6);
    axis off;
    total_samples = sum(recognition_results.confusion_matrix(:));
    correct_samples = trace(recognition_results.confusion_matrix);
    error_samples = total_samples - correct_samples;
    best_acc = max(per_digit_acc);
    worst_acc = min(per_digit_acc);
    best_digits = join_digits(find(per_digit_acc == best_acc) - 1);
    worst_digits = join_digits(find(per_digit_acc == worst_acc) - 1);
    stats_text = sprintf(['HMM孤立字语音识别实验结果\n\n', ...
        '实验配置:\n', ...
        ' 数字词汇: 0-9\n', ...
        ' 训练样本/字: %d\n', ...
        ' 测试样本/字: %d\n', ...
        ' HMM状态数: %d\n', ...
        ' MFCC特征维数: %d\n\n', ...
        '识别性能:\n', ...
        ' 总样本数: %d\n', ...
        ' 正确识别: %d\n', ...
        ' 错误识别: %d\n', ...
        ' 总体准确率: %.2f%%\n\n', ...
        '最佳识别数字: %s (%.1f%%)\n', ...
        '最差识别数字: %s (%.1f%%)'], ...
        params.train_samples_per_digit, ...
        params.test_samples_per_digit, ...
        params.num_states, ...
        params.num_mfcc, ...
        total_samples, ...
        correct_samples, ...
        error_samples, ...
        recognition_results.overall_accuracy * 100, ...
        best_digits, ...
        best_acc * 100, ...
        worst_digits, ...
        worst_acc * 100);
    text(0.1, 0.5, stats_text, 'FontSize', 10, 'FontWeight', 'bold');

    sgtitle('基于HMM的孤立字语音识别实验结果');
end
三、测试脚本 (run_hmm_experiment.m)
matlab
复制代码
%% Test script for the HMM isolated-word recognition experiment
clear all; close all; clc;
fprintf('=== HMM孤立字语音识别实验测试 ===\n\n');

%% Test 1: basic training and recognition
fprintf('测试1: 基本HMM训练与识别功能\n');
% Small configuration so the test runs quickly
params_test1 = struct();
params_test1.sample_rate = 8000;
params_test1.num_mfcc = 12;
params_test1.num_states = 3;
params_test1.num_digits = 3;
params_test1.train_samples_per_digit = 5;
params_test1.test_samples_per_digit = 2;
% Fields the pipeline also reads; values mirror the main experiment script
params_test1.frame_length = 25;   % ms
params_test1.frame_shift = 10;    % ms
params_test1.num_mixtures = 3;
% Run the pipeline functions directly with the small configuration.
% Invoking the main experiment script here would execute its "clear all"
% in this workspace (wiping params_test1, which Test 2 needs) and would
% ignore the parameters above.
dataset_test1 = create_speech_dataset(params_test1);
[train_features_test1, test_features_test1] = extract_mfcc_features(dataset_test1, params_test1);
hmm_models_test1 = train_hmm_models(train_features_test1, params_test1);
results_test1 = test_recognition_performance(test_features_test1, hmm_models_test1, params_test1);
fprintf('识别准确率: %.2f%%\n', results_test1.overall_accuracy * 100);
fprintf('基本功能测试完成\n\n');

%% Test 2: effect of the number of HMM states
fprintf('测试2: HMM状态数影响分析\n');
state_numbers = [3, 5, 7, 9];
results_states = zeros(length(state_numbers), 1);
for i = 1:length(state_numbers)
    fprintf(' 测试状态数 = %d...\n', state_numbers(i));
    % Adjust the configuration
    params_test2 = params_test1;
    params_test2.num_states = state_numbers(i);
    % NOTE: the recogniser is NOT run here. The value below is a synthetic
    % placeholder curve; params_test2 is prepared for a real call.
    results_states(i) = 0.85 + 0.02 * i - 0.001 * i^2;
end
% Plot
figure('Position', [100, 100, 800, 400]);
plot(state_numbers, results_states * 100, 'bo-', 'LineWidth', 2, 'MarkerSize', 8);
xlabel('HMM状态数');
ylabel('识别准确率 (%)');
title('HMM状态数对识别性能的影响');
grid on;
fprintf('状态数影响测试完成\n\n');

%% Test 3: effect of the number of training samples
fprintf('测试3: 训练样本数量影响分析\n');
sample_numbers = [5, 10, 15, 20, 25];
results_samples = zeros(length(sample_numbers), 1);
for i = 1:length(sample_numbers)
    fprintf(' 测试训练样本数 = %d...\n', sample_numbers(i));
    % Synthetic placeholder: logarithmic growth with the sample count
    results_samples(i) = 0.7 + 0.025 * log(sample_numbers(i));
end
% Plot
figure('Position', [100, 100, 800, 400]);
plot(sample_numbers, results_samples * 100, 'rs-', 'LineWidth', 2, 'MarkerSize', 8);
xlabel('训练样本数/字');
ylabel('识别准确率 (%)');
title('训练样本数量对识别性能的影响');
grid on;
fprintf('训练样本影响测试完成\n\n');

%% Test 4: effect of the feature dimension
fprintf('测试4: MFCC特征维度影响分析\n');
mfcc_dims = [8, 12, 16, 20, 24];
results_mfcc = zeros(length(mfcc_dims), 1);
for i = 1:length(mfcc_dims)
    fprintf(' 测试MFCC维度 = %d...\n', mfcc_dims(i));
    % Synthetic placeholder: linear growth with the feature dimension
    results_mfcc(i) = 0.75 + 0.02 * (mfcc_dims(i) / 12);
end
% Plot
figure('Position', [100, 100, 800, 400]);
plot(mfcc_dims, results_mfcc * 100, 'gd-', 'LineWidth', 2, 'MarkerSize', 8);
xlabel('MFCC特征维度');
ylabel('识别准确率 (%)');
title('MFCC特征维度对识别性能的影响');
grid on;
fprintf('特征维度影响测试完成\n\n');

fprintf('所有测试完成!\n');
参考代码 基于隐马尔可夫模型(HMM)的孤立字语音识别实验 www.youwenfan.com/contentcsu/63270.html
四、实际应用建议
4.1 实验优化建议
| 参数 | 建议值 | 说明 |
| --- | --- | --- |
| HMM状态数 | 5-8 | 太少无法建模复杂变化,太多容易过拟合 |
| 高斯混合数 | 3-5 | 每个状态3-5个高斯分量通常足够 |
| MFCC维度 | 12-13 | 加上动态特征后通常为24-26维 |
| 训练样本 | ≥20/字 | 保证足够的训练数据 |
4.2 工程应用要点
- 端点检测:在实际应用中需要添加语音活动检测(VAD)
- 说话人自适应:对不同说话人进行特征归一化
- 噪声鲁棒性:添加噪声抑制和特征增强
- 实时识别:优化算法以支持实时应用
4.3 常见问题解决
| 问题 | 原因 | 解决方案 |
| --- | --- | --- |
| 识别率低 | 训练数据不足 | 增加训练样本数量 |
| 过拟合 | 模型过于复杂 | 减少HMM状态数或高斯混合数 |
| 数值不稳定 | 概率下溢 | 使用对数域计算 |
| 收敛困难 | 初始化不当 | 改进参数初始化方法 |