一、工程概述
这个方案使用 STM32F407 + INMP441 数字麦克风,实现3个命令词的离线语音识别。包含完整的MFCC特征提取和DNN神经网络推理,无需联网,完全离线运行。
二、硬件连接(STM32F407)
引脚连接表
| INMP441 引脚 | STM32F407 引脚 | 功能 |
|---|---|---|
| L/R | GND | 左声道(接地) |
| VDD | 3.3V | 电源 |
| GND | GND | 地 |
| SD | PB15 | 数据(I2S2_SD) |
| SCK | PB13 | 时钟(I2S2_CK) |
| WS | PB12 | 字选择(I2S2_WS) |
三、STM32CubeMX 配置
1、时钟配置
c
HCLK = 168 MHz
APB1 = 42 MHz
APB2 = 84 MHz
2、外设配置
c
I2S2:
Mode = Master Receive
Standard = Philips
Data Format = 16 Bits Data on 32 Bits Frame(INMP441 每个声道字需要 32 个 SCK 周期,因此不能选择 16 位帧;此设置取 24 位采样的高 16 位)
Audio Frequency = 16kHz
Clock Polarity = LOW
I2S2 DMA:
DMA1 Stream 3
Priority = High
Mode = Circular
Data Width = Half Word
UART1: 115200 baud, 8N1(调试串口,对应代码中的 huart1)
UART2: 115200 baud, 8N1(连接 GSM 模块发送 AT 指令,对应代码中的 huart2)
GPIO:
LED1 = PD12
LED2 = PD13
四、代码实现
1、音频采集(I2S DMA)
c
// main.c
#include "main.h"
#include "audio.h"
#include "voice_recognition.h"
I2S_HandleTypeDef hi2s2;
DMA_HandleTypeDef hdma_spi2_rx;
UART_HandleTypeDef huart1;
// Two-row capture buffer; the callbacks below publish which row index the
// main loop should read next.
int16_t audio_buffer[2][AUDIO_BUFFER_SIZE] = {0};

// These two variables are written inside the HAL receive callbacks and polled
// by the main loop, so they are volatile: without it the compiler may cache
// the value in a register and the polling loop would never see an update.
// NOTE(review): if a header declares these with `extern`, that declaration
// must carry the same `volatile` qualifier.
volatile uint8_t current_buffer = 0;   // row index announced by the last callback
volatile uint32_t audio_ready = 0;     // set to 1 by a callback, cleared by the consumer

/**
 * HAL I2S receive "half complete" callback.
 *
 * Marks row 0 as the block to process and raises the ready flag.
 *
 * @param hi2s  I2S handle supplied by the HAL (unused).
 */
void HAL_I2S_RxHalfCpltCallback(I2S_HandleTypeDef *hi2s)
{
    (void)hi2s;
    current_buffer = 0;
    audio_ready = 1;
}

/**
 * HAL I2S receive "complete" callback.
 *
 * Marks row 1 as the block to process and raises the ready flag.
 *
 * @param hi2s  I2S handle supplied by the HAL (unused).
 */
void HAL_I2S_RxCpltCallback(I2S_HandleTypeDef *hi2s)
{
    (void)hi2s;
    current_buffer = 1;
    audio_ready = 1;
}
2、 MFCC 特征提取
c
// mfcc.c
#include "mfcc.h"
#include "arm_math.h"
// MFCC front-end configuration.
#define SAMPLE_RATE 16000       // samples per second
#define FRAME_LEN_MS 25         // analysis window length in milliseconds
#define FRAME_SHIFT_MS 10       // hop between windows in milliseconds
#define NUM_MFCC_COEFFS 13      // cepstral coefficients produced per frame
#define NUM_FILTERS 20          // mel filter bank size
#define FFT_SIZE 512            // FFT length; frames are zero-padded up to this

// Frame length in samples (25 ms at 16 kHz = 400). The code below sizes its
// window and loops with FRAME_LEN; it is guarded in case mfcc.h already
// provides a definition.
#ifndef FRAME_LEN
#define FRAME_LEN ((SAMPLE_RATE * FRAME_LEN_MS) / 1000)
#endif

// Hop size in samples (10 ms at 16 kHz = 160).
#ifndef FRAME_SHIFT
#define FRAME_SHIFT ((SAMPLE_RATE * FRAME_SHIFT_MS) / 1000)
#endif
// Window coefficients, one per frame sample; filled in by mfcc_init().
static float32_t hamming_window[FRAME_LEN];
// Mel filter weights indexed [filter][spectrum bin]; extract_mfcc() multiplies
// them with the magnitude spectrum. Presumably populated by init_mel_filters()
// (not shown here) - confirm.
static float32_t mel_filters[NUM_FILTERS][FFT_SIZE/2 + 1];
// DCT weights indexed [coefficient][filter]; presumably populated by
// init_dct_matrix() (not shown here) - confirm.
static float32_t dct_matrix[NUM_MFCC_COEFFS][NUM_FILTERS];
// Real-FFT instance, initialised for FFT_SIZE points in mfcc_init().
static arm_rfft_fast_instance_f32 rfft_instance;
void mfcc_init(void)
{
// 初始化汉明窗
for (int i = 0; i < FRAME_LEN; i++) {
hamming_window[i] = 0.54f - 0.46f * arm_cos_f32((2 * PI * i) / (FRAME_LEN - 1));
}
// 初始化Mel滤波器
init_mel_filters();
// 初始化DCT矩阵
init_dct_matrix();
// 初始化FFT
arm_rfft_fast_init_f32(&rfft_instance, FFT_SIZE);
}
/**
 * Compute one frame of MFCC features.
 *
 * Pipeline: pre-emphasis -> Hamming window -> zero padding -> real FFT ->
 * magnitude spectrum -> mel filter bank with log10 -> DCT.
 *
 * @param audio          Input samples; FRAME_LEN samples are read.
 * @param mfcc_features  Output; NUM_MFCC_COEFFS values are written.
 *
 * Note: the working buffers are automatic variables (about 5 KB in total with
 * FFT_SIZE = 512), so the calling context needs that much stack.
 */
void extract_mfcc(const int16_t *audio, float32_t *mfcc_features)
{
    /* The FFT consumes FFT_SIZE input samples, so the frame buffer must hold
     * the zero padding as well as the FRAME_LEN windowed samples. */
    float32_t frame[FFT_SIZE];
    float32_t fft_out[FFT_SIZE];
    float32_t mag_spectrum[FFT_SIZE/2 + 1];
    float32_t mel_energies[NUM_FILTERS];

    // 1. Pre-emphasis. The first sample has no predecessor inside the frame,
    //    so it is passed through unchanged instead of being left uninitialised.
    frame[0] = (float32_t)audio[0];
    for (int i = 1; i < FRAME_LEN; i++) {
        frame[i] = (float32_t)audio[i] - 0.97f * (float32_t)audio[i-1];
    }

    // 2. Windowing
    for (int i = 0; i < FRAME_LEN; i++) {
        frame[i] *= hamming_window[i];
    }

    // 3. Zero padding up to the FFT length
    for (int i = FRAME_LEN; i < FFT_SIZE; i++) {
        frame[i] = 0.0f;
    }

    // 4. FFT (arm_rfft_fast_f32 may modify its input buffer)
    arm_rfft_fast_f32(&rfft_instance, frame, fft_out, 0);

    // 5. Magnitude spectrum.
    //    arm_rfft_fast_f32 packs its output: fft_out[0] is the (real) DC bin
    //    and fft_out[1] is the (real) Nyquist bin; bins 1..N/2-1 follow as
    //    interleaved (re, im) pairs. Reading fft_out[2*(N/2)] would run past
    //    the end of the array.
    mag_spectrum[0] = fabsf(fft_out[0]);
    mag_spectrum[FFT_SIZE/2] = fabsf(fft_out[1]);
    for (int i = 1; i < FFT_SIZE/2; i++) {
        float32_t real = fft_out[2*i];
        float32_t imag = fft_out[2*i + 1];
        mag_spectrum[i] = sqrtf(real*real + imag*imag);
    }

    // 6. Mel filter bank, followed by log compression.
    //    The small offset keeps log10f away from log(0).
    for (int m = 0; m < NUM_FILTERS; m++) {
        mel_energies[m] = 0.0f;
        for (int k = 0; k <= FFT_SIZE/2; k++) {
            mel_energies[m] += mag_spectrum[k] * mel_filters[m][k];
        }
        mel_energies[m] = log10f(mel_energies[m] + 1e-6f);
    }

    // 7. DCT of the log mel energies gives the MFCCs
    for (int i = 0; i < NUM_MFCC_COEFFS; i++) {
        mfcc_features[i] = 0.0f;
        for (int m = 0; m < NUM_FILTERS; m++) {
            mfcc_features[i] += mel_energies[m] * dct_matrix[i][m];
        }
    }
}
3、 神经网络推理(CMSIS-NN)
c
// nn_inference.c
#include "nn_inference.h"
#include "arm_nnfunctions.h"
// Pre-trained model parameters (q7). The `{...}` initialisers are placeholders
// that have to be replaced with the exported weights and biases.
const q7_t weight_input_hidden[INPUT_SIZE * HIDDEN_SIZE] = {...};
const q7_t bias_hidden[HIDDEN_SIZE] = {...};
const q7_t weight_hidden_output[HIDDEN_SIZE * NUM_CLASSES] = {...};
const q7_t bias_output[NUM_CLASSES] = {...};
void nn_inference(const float32_t *mfcc, uint8_t *prediction)
{
q7_t input_q7[INPUT_SIZE];
q7_t hidden_q7[HIDDEN_SIZE];
q7_t output_q7[NUM_CLASSES];
// 1. MFCC特征量化到int8
for (int i = 0; i < INPUT_SIZE; i++) {
input_q7[i] = (q7_t)(mfcc[i] * 128.0f);
}
// 2. 全连接层1
arm_fully_connected_q7(
input_q7,
(q7_t *)weight_input_hidden,
INPUT_SIZE, HIDDEN_SIZE,
1, 7, // 输入偏移1,右移7位
bias_hidden,
hidden_q7,
(q15_t *)temp_buffer
);
// 3. ReLU激活
arm_relu_q7(hidden_q7, HIDDEN_SIZE);
// 4. 全连接层2
arm_fully_connected_q7(
hidden_q7,
(q7_t *)weight_hidden_output,
HIDDEN_SIZE, NUM_CLASSES,
1, 7,
bias_output,
output_q7,
(q15_t *)temp_buffer
);
// 5. 找最大值
int8_t max_val = output_q7[0];
uint8_t max_idx = 0;
for (uint8_t i = 1; i < NUM_CLASSES; i++) {
if (output_q7[i] > max_val) {
max_val = output_q7[i];
max_idx = i;
}
}
*prediction = max_idx;
}
4、主循环 - 语音识别
c
// main.c 继续
/**
 * Firmware entry point: initialise peripherals, start circular I2S capture
 * and run the detect -> collect -> recognise state machine forever.
 *
 * Each time a callback raises audio_ready, the announced row of audio_buffer
 * is analysed: its energy drives the voice-activity state machine and, while
 * voice is active, one MFCC frame is extracted from the start of the row.
 *
 * NOTE(review): one MFCC frame is taken per DMA block, so the spacing between
 * collected frames is one block, not the 10 ms hop used on the training side.
 */
int main(void)
{
    enum {
        MFCC_COEFFS_PER_FRAME = 13,     /* coefficients written by extract_mfcc() */
        MFCC_FRAMES_PER_COMMAND = 10,   /* frames collected before recognition */
        VOICE_TIMEOUT_MS = 2000         /* maximum utterance length */
    };

    HAL_Init();
    SystemClock_Config();
    MX_GPIO_Init();
    MX_DMA_Init();
    MX_I2S2_Init();
    MX_USART1_UART_Init();
    printf("Voice Recognition System Starting...\r\n");

    // Initialise the audio front end
    mfcc_init();
    audio_init();

    // Start circular capture over BOTH rows of audio_buffer. The callbacks
    // report row 0 at the half-transfer point and row 1 at the end, so the
    // transfer has to span 2 * AUDIO_BUFFER_SIZE samples; with only
    // AUDIO_BUFFER_SIZE, row 1 would never be written by the DMA.
    HAL_I2S_Receive_DMA(&hi2s2,
                        (uint16_t *)&audio_buffer[0][0],
                        2 * AUDIO_BUFFER_SIZE);

    // Recognition state machine
    uint8_t state = STATE_IDLE;
    uint32_t voice_start_time = 0;
    float32_t mfcc_buffer[MFCC_COEFFS_PER_FRAME * MFCC_FRAMES_PER_COMMAND];
    uint8_t frame_count = 0;

    while (1)
    {
        if (!audio_ready)
        {
            continue;
        }

        // Clear the flag BEFORE processing: a callback that fires while this
        // block is being handled then raises it again instead of being wiped
        // out by a late reset.
        audio_ready = 0;
        int16_t *current_audio = audio_buffer[current_buffer];

        // Energy-based voice activity detection over the completed row
        float32_t energy = compute_energy(current_audio, AUDIO_BUFFER_SIZE);

        switch (state)
        {
        case STATE_IDLE:
            if (energy > THRESHOLD_SILENCE)
            {
                state = STATE_VOICE_START;
                voice_start_time = HAL_GetTick();
                frame_count = 0;
                printf("Voice detected!\r\n");
            }
            break;

        case STATE_VOICE_START:
        case STATE_VOICE_ONGOING:
            if (frame_count < MFCC_FRAMES_PER_COMMAND)
            {
                extract_mfcc(current_audio,
                             &mfcc_buffer[frame_count * MFCC_COEFFS_PER_FRAME]);
                frame_count++;
            }
            if (energy < THRESHOLD_SILENCE ||
                (HAL_GetTick() - voice_start_time) > VOICE_TIMEOUT_MS)
            {
                // End of utterance: recognise only if a full set of frames
                // was collected, otherwise drop it.
                if (frame_count == MFCC_FRAMES_PER_COMMAND)
                {
                    uint8_t cmd = recognize_command(mfcc_buffer);
                    execute_command(cmd);
                }
                state = STATE_IDLE;
            }
            break;

        default:
            // Unknown state value: fall back to idle
            state = STATE_IDLE;
            break;
        }
    }
}
5、命令识别与执行
c
// command_handler.c
#include "command_handler.h"
/**
 * Classify a block of MFCC features into a command id.
 *
 * @param mfcc_features  Feature vector handed to nn_inference().
 * @return The class index chosen by nn_inference(), or CMD_UNKNOWN when
 *         get_confidence() is below CONFIDENCE_THRESHOLD.
 */
uint8_t recognize_command(const float32_t *mfcc_features)
{
    uint8_t best_class = 0;

    /* Run the network; it writes the winning class index. */
    nn_inference(mfcc_features, &best_class);

    /* Low-confidence results are reported as unknown. */
    return (get_confidence() < CONFIDENCE_THRESHOLD) ? CMD_UNKNOWN : best_class;
}
/**
 * Act on a recognised command and report it on huart1.
 *
 * LIGHT_ON / LIGHT_OFF drive pin 12 of GPIOD before the report is sent;
 * CALL sends the report first and then dials through gsm_call(). Any other
 * value is reported as UNKNOWN.
 *
 * @param cmd  Command id.
 */
void execute_command(uint8_t cmd)
{
    char msg[50];

    /* Perform the pin action (if any) and format the report line. */
    if (cmd == CMD_LIGHT_ON) {
        HAL_GPIO_WritePin(GPIOD, GPIO_PIN_12, GPIO_PIN_SET);
        snprintf(msg, sizeof(msg), "Command: LIGHT_ON\r\n");
    } else if (cmd == CMD_LIGHT_OFF) {
        HAL_GPIO_WritePin(GPIOD, GPIO_PIN_12, GPIO_PIN_RESET);
        snprintf(msg, sizeof(msg), "Command: LIGHT_OFF\r\n");
    } else if (cmd == CMD_CALL) {
        snprintf(msg, sizeof(msg), "Command: CALL\r\n");
    } else {
        /* CMD_UNKNOWN and every unlisted value */
        snprintf(msg, sizeof(msg), "Command: UNKNOWN\r\n");
    }

    HAL_UART_Transmit(&huart1, (uint8_t *)msg, strlen(msg), HAL_MAX_DELAY);

    /* Dial through the GSM module once the report has gone out. */
    if (cmd == CMD_CALL) {
        gsm_call("10086");
    }
}
/**
 * Dial a number by sending an "ATD<number>;" command on huart2.
 *
 * Nothing is sent when `number` is NULL or empty, or when the formatted
 * command does not fit in the local buffer; a truncated command would dial
 * a different number.
 *
 * @param number  NUL-terminated number to dial.
 */
void gsm_call(const char *number)
{
    char at_cmd[64];
    int len;

    if (number == NULL || number[0] == '\0') {
        return;
    }

    // Build the AT dial command
    len = snprintf(at_cmd, sizeof(at_cmd), "ATD%s;\r\n", number);
    if (len < 0 || (size_t)len >= sizeof(at_cmd)) {
        return;     // formatting error or truncation: do not dial
    }

    HAL_UART_Transmit(&huart2, (uint8_t *)at_cmd, (uint16_t)len, 1000);
}
五、模型训练(Python端)
python
# train_model.py
import numpy as np
import tensorflow as tf
from tensorflow import keras
import librosa
import os
def extract_features(audio_path):
    """Load an audio file and return its MFCC frames.

    The analysis parameters mirror the firmware front end so that the model is
    trained on features shaped like the ones it sees on the device: 16 kHz
    audio, 0.97 pre-emphasis, 400-sample (25 ms) Hamming window, 160-sample
    (10 ms) hop, 512-point FFT, 20 mel filters, 13 coefficients, no centre
    padding.

    NOTE(review): librosa still computes log-power (dB) mel energies and
    applies its own DCT normalisation, while the firmware takes log10 of the
    magnitude spectrum - confirm the scaling on both sides before deployment.

    Args:
        audio_path: Path of the audio file to load.

    Returns:
        Array of shape (n_frames, 13); time frames are along the first axis.
    """
    y, sr = librosa.load(audio_path, sr=16000)
    # Pre-emphasis with the same coefficient as the firmware
    y_pre = librosa.effects.preemphasis(y, coef=0.97)
    # MFCC features
    mfcc = librosa.feature.mfcc(
        y=y_pre,
        sr=sr,
        n_mfcc=13,
        n_fft=512,
        hop_length=160,
        win_length=400,      # 25 ms frame, zero-padded to n_fft
        window="hamming",
        center=False,        # the firmware does not pad frames around their centre
        n_mels=20,           # matches the firmware filter bank size
    )
    return mfcc.T  # transpose so that time frames come first
def create_model():
    """Build and compile the small fully connected classifier.

    Input is a flat vector of 130 values (10 frames x 13 MFCCs); the output is
    a softmax over 4 classes.

    Returns:
        The compiled keras model.
    """
    layer_stack = [
        keras.layers.Dense(64, activation='relu', input_shape=(130,)),  # 10 frames * 13
        keras.layers.Dropout(0.2),
        keras.layers.Dense(32, activation='relu'),
        keras.layers.Dense(4, activation='softmax'),  # 4 output classes
    ]
    net = keras.Sequential(layer_stack)
    net.compile(
        optimizer='adam',
        loss='sparse_categorical_crossentropy',
        metrics=['accuracy'],
    )
    return net
# Build the model that is going to be exported.
# NOTE(review): the training step is not part of this script as shown; call
# model.fit(...) on the 130-dimensional feature vectors here, otherwise the
# exported weights are just the random initial values.
model = create_model()

# Quantize the model to int8.
# NOTE(review): int8-only conversion normally needs a
# `converter.representative_dataset` generator built from real feature
# vectors for calibration - confirm against the TFLite converter docs.
converter = tf.lite.TFLiteConverter.from_keras_model(model)
converter.optimizations = [tf.lite.Optimize.DEFAULT]
converter.target_spec.supported_types = [tf.int8]
tflite_model = converter.convert()

# Save the flatbuffer as a C array, 12 bytes per line, plus its length so the
# C side does not have to hard-code the size.
with open('model.h', 'w') as f:
    f.write('const unsigned char model_data[] = {')
    for offset in range(0, len(tflite_model), 12):
        row = tflite_model[offset:offset + 12]
        f.write('\n ' + ''.join(f'0x{byte:02x}, ' for byte in row))
    f.write('\n};\n')
    f.write(f'const unsigned int model_data_len = {len(tflite_model)};\n')
六、性能优化
1、内存优化
c
// Fixed-size bump allocator for audio working buffers.
#define AUDIO_POOL_SIZE 4096
#define AUDIO_POOL_ALIGN 8u     // every returned pointer is aligned to this

// The union member forces 8-byte alignment of the pool start, so blocks can
// safely hold int16_t / float / 64-bit data.
static union {
    uint8_t bytes[AUDIO_POOL_SIZE];
    uint64_t force_align;
} memory_pool;
static uint32_t pool_index = 0;     // offset of the first free byte

/**
 * Reserve `size` bytes from the static pool.
 *
 * Memory is never returned to the pool. Each reservation is rounded up to
 * AUDIO_POOL_ALIGN bytes so that the next block stays aligned.
 *
 * @param size  Number of bytes requested.
 * @return Pointer to the reserved block, or NULL when the pool cannot
 *         satisfy the request.
 */
void* audio_malloc(size_t size)
{
    const size_t remaining = (size_t)AUDIO_POOL_SIZE - pool_index;
    size_t reserved;

    /* Compare against the remaining space rather than computing
     * pool_index + size, which can wrap around for huge sizes. */
    if (size > remaining) {
        return NULL;
    }

    /* size <= AUDIO_POOL_SIZE here, so the rounding cannot overflow. */
    reserved = (size + (AUDIO_POOL_ALIGN - 1u)) & ~(size_t)(AUDIO_POOL_ALIGN - 1u);
    if (reserved > remaining) {
        reserved = remaining;   /* last block simply takes the tail */
    }

    void *ptr = &memory_pool.bytes[pool_index];
    pool_index += (uint32_t)reserved;
    return ptr;
}
2、定点数优化
c
// Q-format fixed-point conversion
#define Q_mfcc 7 // Q7 scaling (7 fractional bits)
#define Q_weight 7
// Despite the name, the values are scaled by 2^Q_mfcc and stored in int16_t.
int16_t mfcc_q15[13];
for (int i = 0; i < 13; i++) {
    // Saturate before the cast: converting an out-of-range float to int16_t
    // is undefined behaviour.
    float scaled = mfcc_float[i] * (float)(1 << Q_mfcc);
    if (scaled > 32767.0f) {
        scaled = 32767.0f;
    } else if (scaled < -32768.0f) {
        scaled = -32768.0f;
    }
    mfcc_q15[i] = (int16_t)scaled;
}
参考代码 stm32语音识别 www.youwenfan.com/contentcsv/101747.html
七、快速开始指南
1、硬件准备
- STM32F407 Discovery 开发板
- INMP441 数字麦克风模块
- GSM模块(SIM800C/SIM900A,用于拨打电话)
- 杜邦线若干
2、软件准备
- STM32CubeIDE
- STM32CubeMX
- CMSIS-DSP库
- CMSIS-NN库
3、部署步骤
bash
1. 用CubeMX生成I2S+DMA+UART工程
2. 添加MFCC、神经网络代码
3. 用Python训练模型,导出参数
4. 将模型参数替换到nn_inference.c
5. 编译下载到STM32
6. 对着麦克风说"开灯"/"关灯"/"打电话"
八、调试技巧
串口打印特征
c
/**
 * Debug helper: print 13 MFCC values on one line with two decimals each.
 *
 * @param mfcc  Pointer to at least 13 coefficients.
 */
void print_mfcc(const float32_t *mfcc)
{
    const float32_t *const end = mfcc + 13;

    printf("MFCC: ");
    for (const float32_t *coeff = mfcc; coeff != end; ++coeff) {
        printf("%.2f ", *coeff);
    }
    printf("\r\n");
}
能量可视化
c
/**
 * Print a 50-column character strip of the signal level.
 *
 * The buffer is sampled at 50 evenly spaced positions; a column shows '#'
 * when the sample magnitude is at least 1/20 of the peak magnitude and ' '
 * otherwise. A NULL, empty or all-zero buffer prints an empty strip instead
 * of dividing by zero or reading outside the buffer.
 *
 * @param audio  Samples to visualise (may be NULL when len is 0).
 * @param len    Number of samples in `audio`.
 */
void plot_energy(const int16_t *audio, uint32_t len)
{
    enum { BAR_WIDTH = 50, LEVELS = 20 };
    char bar[BAR_WIDTH + 1];
    int peak = 0;

    for (int col = 0; col < BAR_WIDTH; col++) {
        bar[col] = ' ';
    }
    bar[BAR_WIDTH] = '\0';

    if (audio != NULL && len > 0) {
        /* Peak magnitude; int16_t promotes to int, so abs(-32768) is safe. */
        for (uint32_t i = 0; i < len; i++) {
            int mag = abs(audio[i]);
            if (mag > peak) {
                peak = mag;
            }
        }

        if (peak > 0) {
            for (uint32_t col = 0; col < BAR_WIDTH; col++) {
                /* 64-bit product so col * len cannot wrap for large buffers */
                uint32_t idx = (uint32_t)(((uint64_t)col * len) / BAR_WIDTH);
                int height = (abs(audio[idx]) * LEVELS) / peak;

                if (height > 0) {
                    bar[col] = '#';
                }
            }
        }
    }

    printf("Energy: |%s|\r\n", bar);
}
九、性能指标
| 指标 | 值 |
|---|---|
| 采样率 | 16kHz |
| 延迟 | < 200ms |
| 内存使用 | ~60KB |
| Flash使用 | ~200KB |
| 准确率 | > 90%(安静环境) |
| 功耗 | ~80mA |
十、扩展功能
1、多命令支持
c
// Extended command set. The existing values keep their numbers.
enum VOICE_COMMANDS {
    CMD_LIGHT_ON = 0,
    CMD_LIGHT_OFF,
    CMD_CALL,
    CMD_HANGUP,
    CMD_VOLUME_UP,
    CMD_VOLUME_DOWN,
    CMD_MUSIC_PLAY,
    CMD_MUSIC_STOP,
    NUM_COMMANDS,           // number of recognisable commands
    // Returned by recognize_command() on low confidence and handled by
    // execute_command(); kept outside the command range and within uint8_t.
    CMD_UNKNOWN = 0xFF
};
2、中文支持
python
# Chinese command words to record. The order follows the VOICE_COMMANDS enum
# (light on, light off, call, hang up, volume up, volume down, play music,
# stop), so a list index can be used directly as the class label.
commands_zh = ["开灯", "关灯", "打电话", "挂断", "增大音量", "减小音量", "播放音乐", "停止"]
3、噪声抑制
c
/**
 * Simple spectral subtraction, performed in place.
 *
 * Subtracts the noise estimate from each of the first FFT_SIZE/2 bins and
 * floors the result at 0.1.
 *
 * NOTE(review): only bins 0 .. FFT_SIZE/2 - 1 are processed; if the caller's
 * spectrum also carries a Nyquist bin, that bin is left untouched.
 *
 * @param spectrum       Spectrum to clean; overwritten.
 * @param noise_profile  Per-bin noise estimate.
 */
void spectral_subtraction(float32_t *spectrum, const float32_t *noise_profile)
{
    const int bin_count = FFT_SIZE/2;
    int bin = 0;

    while (bin < bin_count) {
        const float32_t residual = spectrum[bin] - noise_profile[bin];

        spectrum[bin] = fmaxf(residual, 0.1f);
        bin++;
    }
}