1、前面ESP32-S3-CAM:豆包语音识别文字后控制小车(二)------跑通demo代码
已经把python 代码跑通了
2、交给kimi转成可以在Arduino IDE跑的代码
这里要注意,提前告诉kimi背景信息,SD卡是板载卡槽,相关的GPIO口等,我是直接把之前跑通的代码也作为附件上传给了kimi。
代码在这里:ESP32-S3-CAM:SD卡开发
3、给kimi的提示词:

这里再强调下,板载SD卡槽对应的GPIO口是固定的,前面帖子里也说过。代码直接复制,不要走弯路。
最后kimi给的代码是可以用的,但刚开始它把分片做得比较小,后来我调整为一次读取64K的数据到内存。
kimi改过的代码自动补全了WiFi连接部分:WiFi需要单独连接,所以你要配置WiFi名称和密码,并把音频文件拷贝到SD卡根目录。
注意点:原来的python demo代码里有文件压缩动作,kimi转的代码也带上了压缩;其实可以不用压缩,去掉后直接分片发送即可。
运行代码即可,后来我是让kimi稍微调整增加了一些log输出,完整代码如下:
cpp
/*
* ESP32-S3 豆包ASR语音识别 - 流式读取版
* 从SD卡流式读取audio1.wav,边读边发,无需大内存
*
* 硬件: GOOUUU ESP32-S3-CAM v1.3
* SD卡引脚: CLK=39, CMD=38, D0=40 (1-bit模式)
*/
#include <WiFi.h>
#include <WebSocketsClient.h>
#include <SD_MMC.h>
#include <ArduinoJson.h>
// ============== Configuration ==============
// WiFi credentials: replace the placeholders before flashing.
const char* ssid = "你的WiFi名称";
const char* password = "你的WiFi密码";
// Doubao ASR service settings: replace appid/token with your own credentials.
const char* appid = "你的APP ID";
const char* token = "你的Access Token";
const char* cluster = "volcengine_input_common";
const char* ws_host = "openspeech.bytedance.com";
const int ws_port = 443;
const char* ws_path = "/api/v2/asr";
// SD card pins (passed to SD_MMC.setPins in initSD)
#define SD_CLK 39
#define SD_CMD 38
#define SD_D0 40
// Audio settings (sent to the server in the "audio" section of the config request)
const char* audio_filename = "/audio1.wav"; // file in the SD card root directory
const int audio_rate = 16000;
const int audio_bits = 16;
// NOTE(review): 2 channels at 16 kHz / 16 bit is 64000 bytes per second, which
// conflicts with the "about 2 seconds per chunk" remark on CHUNK_SIZE below.
// Confirm the real channel count of the WAV file.
const int audio_channel = 2;
// Protocol constants used when building headers (generateHeader) and when
// decoding server frames (parseResponse).
#define PROTOCOL_VERSION 0x01
#define DEFAULT_HEADER_SIZE 0x01
#define CLIENT_FULL_REQUEST 0x01
#define CLIENT_AUDIO_ONLY_REQUEST 0x02
#define SERVER_FULL_RESPONSE 0x09
#define SERVER_ACK 0x0B
#define SERVER_ERROR_RESPONSE 0x0F
#define NO_SEQUENCE 0x00
#define NEG_SEQUENCE 0x02
#define JSON_SERIALIZATION 0x01
#define NO_COMPRESSION 0x00
// ============== Global state ==============
WebSocketsClient webSocket;
bool ws_connected = false;       // set/cleared in webSocketEvent
bool asr_completed = false;      // set by parseResponse when a negative sequence arrives
String recognition_result = "";  // latest recognized text
File audio_file; // audio file handle (streamed, never loaded as a whole)
size_t audio_total_size = 0;     // size of the audio file in bytes
size_t audio_sent_size = 0;      // number of bytes already sent to the server
// Bytes read from the SD card and sent per chunk. The original note says
// "about 2 seconds of audio per chunk"; that holds for 16 kHz / 16-bit mono.
const size_t CHUNK_SIZE = 64000;
// ============== Function declarations ==============
bool initSD();
void listSDFiles();
bool checkAudioFile();
bool openAudioFile(); // opens the file for streaming; nothing is loaded into memory
void initWebSocket();
void webSocketEvent(WStype_t type, uint8_t * payload, size_t length);
void sendFullClientRequest();
void sendAudioChunk(); // reads one chunk from the SD card and sends it
void parseResponse(uint8_t* data, size_t len);
// NOTE(review): no definition of extractTextFromResult is visible in this sketch;
// it appears to be an unused leftover declaration.
String extractTextFromResult(JsonDocument& doc);
void generateHeader(uint8_t* header, uint8_t msg_type, uint8_t flags);
// ============== Protocol header construction ==============
/**
 * Writes the 4-byte protocol header into the start of a message buffer.
 *
 * @param header   destination buffer; the first 4 bytes are overwritten
 * @param msg_type message type, stored in the high nibble of byte 1
 * @param flags    message flags, stored in the low nibble of byte 1
 */
void generateHeader(uint8_t* header, uint8_t msg_type, uint8_t flags) {
  const uint8_t version_and_size = (PROTOCOL_VERSION << 4) | DEFAULT_HEADER_SIZE;
  const uint8_t type_and_flags = (msg_type << 4) | flags;
  const uint8_t serialization_and_compression = (JSON_SERIALIZATION << 4) | NO_COMPRESSION;

  uint8_t* out = header;
  *out++ = version_and_size;
  *out++ = type_and_flags;
  *out++ = serialization_and_compression;
  *out = 0x00;  // last header byte is always written as zero
}
// ============== Setup ==============
/**
 * Arduino entry point. Runs four self-checks in order (hardware info, SD card,
 * WiFi, audio file) and, if all pass, starts the WebSocket client. Any failed
 * check prints a message and parks the board in an endless delay loop.
 */
void setup() {
  // Parks the board forever after a fatal self-check failure.
  const auto haltForever = []() {
    while (1) delay(1000);
  };

  Serial.begin(115200);
  delay(1000);
  Serial.println("\n========================================");
  Serial.println("ESP32-S3 豆包ASR语音识别(流式版)");
  Serial.println("========================================");

  // [check 1/4] hardware information
  Serial.println("\n[自检1/4] 硬件信息:");
  Serial.print(" 芯片型号: ");
  Serial.println(ESP.getChipModel());
  Serial.print(" 堆内存: ");
  Serial.print(ESP.getFreeHeap() / 1024);
  Serial.println(" KB");

  // [check 2/4] SD card
  Serial.println("\n[自检2/4] SD卡检测:");
  const bool sd_ready = initSD();
  if (!sd_ready) {
    Serial.println(" ❌ SD卡初始化失败");
    haltForever();
  }
  Serial.println(" SD卡文件列表:");
  listSDFiles();
  Serial.print(" 检查目标文件 ");
  Serial.print(audio_filename);
  Serial.print(": ");
  const bool file_present = checkAudioFile();
  if (!file_present) {
    Serial.println("❌ 不存在");
    haltForever();
  }
  Serial.println("✓ 存在");

  // [check 3/4] WiFi connection: poll every 500 ms, at most 30 times
  Serial.println("\n[自检3/4] WiFi连接:");
  Serial.print(" 连接 ");
  Serial.print(ssid);
  Serial.print(" ...");
  WiFi.begin(ssid, password);
  for (int attempt = 0; WiFi.status() != WL_CONNECTED && attempt < 30; ++attempt) {
    delay(500);
    Serial.print(".");
  }
  if (WiFi.status() != WL_CONNECTED) {
    Serial.println(" ❌ 失败");
    haltForever();
  }
  Serial.println(" ✓ 已连接");
  Serial.print(" IP: ");
  Serial.println(WiFi.localIP());

  // [check 4/4] open the audio file for streaming
  Serial.println("\n[自检4/4] 打开音频文件:");
  const bool file_opened = openAudioFile();
  if (!file_opened) {
    Serial.println(" ❌ 打开失败");
    haltForever();
  }

  Serial.println("\n========================================");
  Serial.println("所有自检通过,启动ASR识别...");
  Serial.println("========================================\n");
  initWebSocket();
}
// ============== Main loop ==============
/**
 * Pumps the WebSocket client. Once recognition is flagged as complete, prints
 * the final result, closes the audio file if it is still open, and then idles
 * forever.
 */
void loop() {
  webSocket.loop();

  // Nothing else to do until parseResponse marks the session as finished.
  if (!asr_completed) {
    return;
  }

  Serial.println("\n========================================");
  Serial.println("ASR识别完成!");
  Serial.print("识别结果: ");
  Serial.println(recognition_result);
  Serial.println("========================================");

  if (audio_file) {
    audio_file.close();
  }
  for (;;) {
    delay(1000);
  }
}
// ============== SD卡初始化 ==============
bool initSD() {
Serial.println(" 初始化SD卡...");
SD_MMC.setPins(SD_CLK, SD_CMD, SD_D0);
if (!SD_MMC.begin("/sdcard", true)) {
Serial.println(" ❌ 失败");
return false;
}
uint8_t type = SD_MMC.cardType();
Serial.print(" 类型: ");
if (type == CARD_SD) Serial.println("SD");
else if (type == CARD_SDHC) Serial.println("SDHC");
else if (type == CARD_MMC) Serial.println("MMC");
else Serial.println("未知");
Serial.printf(" 容量: %llu MB\n", SD_MMC.cardSize() / 1048576);
return true;
}
// ============== 列出SD卡文件 ==============
void listSDFiles() {
File root = SD_MMC.open("/");
if (!root || !root.isDirectory()) return;
int count = 0;
File f = root.openNextFile();
while (f) {
if (!f.isDirectory()) {
Serial.printf(" [FILE] %-20s %6d KB\n", f.name(), f.size() / 1024);
count++;
}
f = root.openNextFile();
}
Serial.printf(" 共 %d 个文件\n", count);
}
// ============== Check that the audio file exists ==============
/**
 * @return true when the configured audio file exists on the SD card.
 */
bool checkAudioFile() {
  const bool found = SD_MMC.exists(audio_filename);
  return found;
}
// ============== Open the audio file for streaming ==============
/**
 * Opens the configured audio file for reading and resets the streaming
 * counters (audio_total_size, audio_sent_size). The file content is not
 * loaded; sendAudioChunk reads it piece by piece later.
 *
 * @return true when the file is open and non-empty; false when it cannot be
 *         opened or contains no data (an empty file would never produce an
 *         audio chunk, so the session could not finish).
 */
bool openAudioFile() {
  Serial.printf(" 打开: %s\n", audio_filename);
  audio_file = SD_MMC.open(audio_filename, FILE_READ);
  if (!audio_file) {
    Serial.println(" ❌ 无法打开");
    return false;
  }

  audio_total_size = audio_file.size();
  audio_sent_size = 0;
  if (audio_total_size == 0) {
    Serial.println(" ❌ 文件为空");
    audio_file.close();
    return false;
  }

  // size_t is unsigned; cast explicitly so the argument matches %u.
  Serial.printf(" 文件大小: %u bytes (%.1f KB)\n",
                (unsigned int)audio_total_size, audio_total_size / 1024.0);
  Serial.println(" ✓ 已打开(流式读取)");
  return true;
}
// ============== WebSocket初始化 ==============
void initWebSocket() {
webSocket.beginSSL(ws_host, ws_port, ws_path);
String auth = "Authorization: Bearer; ";
auth += token;
webSocket.setExtraHeaders(auth.c_str());
webSocket.onEvent(webSocketEvent);
webSocket.setReconnectInterval(5000);
}
// ============== WebSocket event handler ==============
/**
 * Callback registered with the WebSocket client.
 *
 * - DISCONNECTED: clears ws_connected.
 * - CONNECTED: sets ws_connected and sends the full client request (config).
 *   If audio was already streamed on an earlier connection, the stream is
 *   rewound first, because the config request starts a new session and the
 *   audio must then be sent again from the beginning of the file.
 * - BIN: hands the frame to parseResponse.
 * - Everything else is only logged.
 *
 * @param type    event type reported by the library
 * @param payload event payload (binary frame for WStype_BIN)
 * @param length  payload length in bytes
 */
void webSocketEvent(WStype_t type, uint8_t * payload, size_t length) {
  switch(type) {
    case WStype_DISCONNECTED:
      Serial.println("[WS] ❌ 断开连接");
      ws_connected = false;
      break;
    case WStype_CONNECTED:
      Serial.println("[WS] ✅ 已连接到豆包ASR服务器");
      Serial.print("[WS] 服务器: ");
      Serial.println(ws_host);
      ws_connected = true;
      // Reconnect case: restart streaming from byte 0 instead of resuming
      // mid-file. The file may already be closed if the last chunk had been
      // sent before the connection dropped, so reopen it when needed.
      if (audio_sent_size > 0) {
        if (!audio_file) {
          audio_file = SD_MMC.open(audio_filename, FILE_READ);
        }
        if (audio_file) {
          audio_file.seek(0);
        }
        audio_sent_size = 0;
      }
      sendFullClientRequest();
      break;
    case WStype_BIN:
      Serial.print("[WS] 📥 收到二进制数据, 长度: ");
      Serial.print(length);
      Serial.println(" bytes");
      parseResponse(payload, length);
      break;
    case WStype_ERROR:
      Serial.println("[WS] ❌ 错误");
      break;
    case WStype_PING:
      Serial.println("[WS] 🏓 Ping");
      break;
    case WStype_PONG:
      Serial.println("[WS] 🏓 Pong");
      break;
    default:
      Serial.print("[WS] 其他事件, 类型: ");
      Serial.println(type);
      break;
  }
}
// ============== Send the full client request ==============
/**
 * Builds the JSON configuration (app credentials, user id, request options and
 * audio format), frames it as
 *   [4-byte header][4-byte big-endian payload size][JSON payload]
 * and sends it as one binary WebSocket message.
 *
 * Returns early, after logging, when the message buffer cannot be allocated.
 * A failed send is logged instead of being reported as success.
 */
void sendFullClientRequest() {
  // The "\n" escapes below were physical line breaks inside the string
  // literals in the original text, which does not compile.
  Serial.println("\n[ASR] 📤 发送Full Client Request(配置)...");
  Serial.println("[ASR] 构建JSON配置...");

  JsonDocument doc;
  doc["app"]["appid"] = appid;
  doc["app"]["token"] = token;
  doc["app"]["cluster"] = cluster;
  doc["user"]["uid"] = "esp32_asr";
  doc["request"]["reqid"] = "esp32-" + String(millis());
  doc["request"]["nbest"] = 1;
  doc["request"]["workflow"] = "audio_in,resample,partition,vad,fe,decode,itn,nlu_punctuate";
  doc["request"]["show_utterances"] = false;
  doc["request"]["result_type"] = "full";
  doc["request"]["sequence"] = 1;
  doc["audio"]["format"] = "wav";
  doc["audio"]["rate"] = audio_rate;
  doc["audio"]["language"] = "zh-CN";
  doc["audio"]["bits"] = audio_bits;
  doc["audio"]["channel"] = audio_channel;
  doc["audio"]["codec"] = "raw";

  String json;
  serializeJson(doc, json);
  const size_t payload_len = json.length();

  Serial.print("[ASR] JSON配置大小: ");
  Serial.print(payload_len);
  Serial.println(" bytes");
  // NOTE(review): this dumps the access token to the serial console; consider
  // removing it outside of debugging.
  Serial.print("[ASR] JSON内容: ");
  Serial.println(json);

  const size_t msg_len = 4 + 4 + payload_len;
  Serial.print("[ASR] 总消息大小: ");
  Serial.print(msg_len);
  Serial.println(" bytes (header:4 + size:4 + payload)");

  uint8_t* msg = (uint8_t*)malloc(msg_len);
  if (!msg) {
    Serial.println("[ASR] ❌ 内存分配失败");
    return;
  }

  generateHeader(msg, CLIENT_FULL_REQUEST, NO_SEQUENCE);
  // Payload size, big-endian.
  msg[4] = (payload_len >> 24) & 0xFF;
  msg[5] = (payload_len >> 16) & 0xFF;
  msg[6] = (payload_len >> 8) & 0xFF;
  msg[7] = payload_len & 0xFF;
  memcpy(msg + 8, json.c_str(), payload_len);

  Serial.print("[ASR] Header: 0x");
  for (int i = 0; i < 4; i++) {
    if (msg[i] < 0x10) Serial.print("0");
    Serial.print(msg[i], HEX);
  }
  Serial.println();

  const bool sent = webSocket.sendBIN(msg, msg_len);
  free(msg);
  if (!sent) {
    Serial.println("[ASR] ❌ Full Client Request发送失败");
    return;
  }
  Serial.println("[ASR] ✅ Full Client Request已发送");
  Serial.println("[ASR] 等待服务器响应...\n");
}
// ============== Read one audio chunk from the SD card and send it ==============
/**
 * Reads up to CHUNK_SIZE bytes from the open audio file and sends them as one
 * audio-only message:
 *   [4-byte header][4-byte big-endian chunk size][audio bytes]
 * The last chunk is flagged with NEG_SEQUENCE, after which the file is closed.
 *
 * Does nothing when the file is not open or everything has been sent.
 * The audio is read straight into the message buffer, so only one chunk-sized
 * heap allocation is needed. On a short read or a failed send the chunk is not
 * counted as sent and the file position is moved back to audio_sent_size, so
 * the counters and the file stay in step.
 */
void sendAudioChunk() {
  if (!audio_file || audio_sent_size >= audio_total_size) {
    return;  // nothing (more) to send
  }

  const size_t remaining = audio_total_size - audio_sent_size;
  const size_t chunk = (remaining > CHUNK_SIZE) ? CHUNK_SIZE : remaining;
  const bool is_last = (audio_sent_size + chunk >= audio_total_size);

  static int chunk_count = 0;
  chunk_count++;

  // 64-bit intermediates keep the percentage correct for files larger than ~42 MB.
  const unsigned int pct_before =
      (unsigned int)((uint64_t)audio_sent_size * 100 / audio_total_size);
  const unsigned int pct_after =
      (unsigned int)((uint64_t)(audio_sent_size + chunk) * 100 / audio_total_size);

  // The "\n" escape was a physical line break inside the literal in the
  // original text, which does not compile.
  Serial.print("\n[ASR] 📤 发送音频片 #");
  Serial.print(chunk_count);
  Serial.print(" | 大小: ");
  Serial.print(chunk);
  Serial.print(" bytes | 进度: ");
  Serial.print(pct_before);
  Serial.print("% -> ");
  Serial.print(pct_after);
  Serial.print("%");
  if (is_last) Serial.print(" [最后一片]");
  Serial.println();

  // One buffer for the whole message; the audio lands directly after the 8-byte prefix.
  const size_t msg_len = 4 + 4 + chunk;
  uint8_t* msg = (uint8_t*)malloc(msg_len);
  if (!msg) {
    Serial.println("[ERR] ❌ 内存分配失败");
    return;
  }

  const size_t got = audio_file.read(msg + 8, chunk);
  if (got != chunk) {
    Serial.print("[ERR] ❌ 读取SD卡失败, 预期: ");
    Serial.print(chunk);
    Serial.print(", 实际: ");
    Serial.println(got);
    free(msg);
    audio_file.seek(audio_sent_size);  // undo the partial read
    return;
  }
  Serial.println("[ASR] ✅ 从SD卡读取成功");

  const uint8_t flags = is_last ? NEG_SEQUENCE : NO_SEQUENCE;
  generateHeader(msg, CLIENT_AUDIO_ONLY_REQUEST, flags);
  // Chunk size, big-endian.
  msg[4] = (chunk >> 24) & 0xFF;
  msg[5] = (chunk >> 16) & 0xFF;
  msg[6] = (chunk >> 8) & 0xFF;
  msg[7] = chunk & 0xFF;

  Serial.print("[ASR] Header: 0x");
  for (int i = 0; i < 4; i++) {
    if (msg[i] < 0x10) Serial.print("0");
    Serial.print(msg[i], HEX);
  }
  Serial.print(" | Flags: ");
  Serial.println(flags == NEG_SEQUENCE ? "NEG_SEQUENCE(最后一片)" : "NO_SEQUENCE");

  const bool sent = webSocket.sendBIN(msg, msg_len);
  free(msg);
  if (!sent) {
    Serial.println("[ERR] ❌ 音频片发送失败");
    audio_file.seek(audio_sent_size);  // keep the file position on the unsent chunk
    return;
  }
  Serial.println("[ASR] ✅ 音频片已发送");

  audio_sent_size += chunk;
  if (is_last) {
    Serial.println("\n[ASR] ✅ 所有音频发送完成");
    Serial.print("[ASR] 总共发送 ");
    Serial.print(chunk_count);
    Serial.println(" 片音频");
    audio_file.close();
  }
}
// ============== 解析响应 ==============
void parseResponse(uint8_t* data, size_t len) {
if (len < 8) return;
uint8_t msg_type = data[1] >> 4;
uint8_t serialization = data[2] >> 4;
uint8_t header_size = data[0] & 0x0F;
size_t payload_offset = header_size * 4;
if (len < payload_offset + 4) return;
int32_t payload_size = ((int32_t)data[payload_offset] << 24) |
((int32_t)data[payload_offset + 1] << 16) |
((int32_t)data[payload_offset + 2] << 8) |
((int32_t)data[payload_offset + 3]);
if (payload_size <= 0 || len < payload_offset + 4 + payload_size) return;
uint8_t* payload = data + payload_offset + 4;
if (serialization == JSON_SERIALIZATION) {
JsonDocument doc;
DeserializationError err = deserializeJson(doc, payload, payload_size);
if (!err) {
int code = doc["code"] | -1;
if (code == 1000) {
// 提取文本
JsonArray arr = doc["result"].as<JsonArray>();
if (arr.size() > 0) {
const char* txt = arr[0]["text"];
if (txt) {
recognition_result = String(txt);
Serial.printf("[识别] %s\n", txt);
}
}
// 检查是否完成
int seq = doc["sequence"] | 0;
if (seq < 0) {
asr_completed = true;
return;
}
// 继续发送下一片音频
sendAudioChunk();
} else {
const char* msg = doc["message"] | "Unknown";
Serial.printf("[ERR] %d: %s\n", code, msg);
}
}
}
}
bash
ESP-ROM:esp32s3-20210327
Build:Mar 27 2021
rst:0x1 (POWERON),boot:0x8 (SPI_FAST_FLASH_BOOT)
SPIWP:0xee
mode:DIO, clock div:1
load:0x3fce2820,len:0x10cc
load:0x403c8700,len:0xc2c
load:0x403cb700,len:0x30c0
entry 0x403c88b8
========================================
ESP32-S3 豆包ASR语音识别(流式版)
========================================
[自检1/4] 硬件信息:
芯片型号: ESP32-S3
堆内存: 312 KB
[自检2/4] SD卡检测:
初始化SD卡...
类型: SDHC
容量: 15200 MB
SD卡文件列表:
[FILE] audio1.wav 258 KB
[FILE] audio2.wav 303 KB
[FILE] audio3.wav 285 KB
[FILE] audio4.wav 472 KB
[FILE] audio5.wav 476 KB
[FILE] hello.wav 281 KB
共 6 个文件
检查目标文件 /audio1.wav: ✓ 存在
[自检3/4] WiFi连接:
连接 rm1234 ....................... ✓ 已连接
IP: 192.168.137.241
[自检4/4] 打开音频文件:
打开: /audio1.wav
文件大小: 265004 bytes (258.8 KB)
✓ 已打开(流式读取)
========================================
所有自检通过,启动ASR识别...
========================================
[WS] 已连接
[ASR] 发送配置...
[识别]
[识别]
[识别] 头
[ASR] 音频发送完成
[识别] 头抬高!
========================================
ASR识别完成!
识别结果: 头抬高!
========================================