ESP32-S3-CAM:豆包语音识别文字后控制小车(三)——SD卡本地音频识别转文字

1、前面ESP32-S3-CAM:豆包语音识别文字后控制小车(二)------跑通demo代码

已经把python 代码跑通了

2、交给kimi转成可以在Arduino IDE跑的代码

这里要注意,提前告诉kimi背景信息,SD卡是板载卡槽,相关的GPIO口等,我是直接把之前跑通的代码也作为附件上传给了kimi。

代码在这里:ESP32-S3-CAM:SD卡开发

3、给kimi的提示词:

这里再强调下,板载SD卡槽对应的GPIO口是固定的,前面帖子里也说过。代码直接复制,不要走弯路。

最后kimi给的代码是可以用的,但刚开始它的分片做得比较小,后来我调整为一次读取64000字节(约64K)的数据到内存。

kimi改过的代码是它自动补全的;WiFi需要单独连接,所以你要在代码里配置WiFi名称和密码,SD卡根目录里要提前拷贝好音频文件。

注意点:原来的python demo代码里有文件压缩动作,所以kimi转的代码中也带了压缩;其实可以不用压缩,去掉之后直接分片发送即可。

运行代码即可,后来我是让kimi稍微调整增加了一些log输出,完整代码如下:

cpp 复制代码
/*
 * ESP32-S3 豆包ASR语音识别 - 流式读取版
 * 从SD卡流式读取audio1.wav,边读边发,无需大内存
 * 
 * 硬件: GOOUUU ESP32-S3-CAM v1.3
 * SD卡引脚: CLK=39, CMD=38, D0=40 (1-bit模式)
 */

#include <WiFi.h>
#include <WebSocketsClient.h>
#include <SD_MMC.h>
#include <ArduinoJson.h>

// ============== Configuration ==============
// WiFi credentials used by setup(). The values below are placeholders and
// must be replaced before flashing.
const char* ssid = "你的WiFi名称";
const char* password = "你的WiFi密码";

// Doubao ASR settings. appid/token/cluster are placed in the JSON of the
// initial request (sendFullClientRequest); token is also sent in the
// Authorization header, and ws_host/ws_port/ws_path form the WebSocket
// endpoint (initWebSocket). appid and token are placeholders.
const char* appid = "你的APP ID";
const char* token = "你的Access Token";
const char* cluster = "volcengine_input_common";
const char* ws_host = "openspeech.bytedance.com";
const int ws_port = 443;
const char* ws_path = "/api/v2/asr";

// SD card pins handed to SD_MMC.setPins() in initSD().
#define SD_CLK  39
#define SD_CMD  38
#define SD_D0   40

// Audio settings: the file to stream and the format values reported to the
// server in the initial request.
// NOTE(review): these values are not read from the WAV header; confirm they
// match the actual file (in particular the channel count).
const char* audio_filename = "/audio1.wav";// file located in the SD card root directory
const int audio_rate = 16000;
const int audio_bits = 16;
const int audio_channel = 2;

// Protocol constants for the 4-byte binary frame header built by
// generateHeader().
// Byte 0: protocol version (high nibble) | header size (low nibble);
// parseResponse() multiplies the header size by 4 to get a byte offset.
#define PROTOCOL_VERSION 0x01
#define DEFAULT_HEADER_SIZE 0x01
// Byte 1, high nibble: message type. CLIENT_* are sent by this sketch;
// SERVER_* are, judging by their names, the types used in server frames.
#define CLIENT_FULL_REQUEST 0x01
#define CLIENT_AUDIO_ONLY_REQUEST 0x02
#define SERVER_FULL_RESPONSE 0x09
#define SERVER_ACK 0x0B
#define SERVER_ERROR_RESPONSE 0x0F
// Byte 1, low nibble: flags. NEG_SEQUENCE marks the last audio chunk
// (see sendAudioChunk()).
#define NO_SEQUENCE 0x00
#define NEG_SEQUENCE 0x02
// Byte 2: serialization method (high nibble) | compression (low nibble).
#define JSON_SERIALIZATION 0x01
#define NO_COMPRESSION 0x00

// ============== Global state ==============
WebSocketsClient webSocket;
bool ws_connected = false;           // updated in webSocketEvent()
bool asr_completed = false;          // set by parseResponse(), polled by loop()
String recognition_result = "";      // latest recognized text
File audio_file;  // audio file handle, read chunk by chunk while streaming
size_t audio_total_size = 0;         // file size in bytes, set by openAudioFile()
size_t audio_sent_size = 0;          // bytes already sent to the server
const size_t CHUNK_SIZE = 64000;  // bytes per chunk: ~2 s of 16 kHz/16-bit mono audio, ~1 s with the 2-channel setting above

// ============== Forward declarations ==============
bool initSD();
void listSDFiles();
bool checkAudioFile();
bool openAudioFile();  // opens the file for streaming; does not load it into memory
void initWebSocket();
void webSocketEvent(WStype_t type, uint8_t * payload, size_t length);
void sendFullClientRequest();
void sendAudioChunk();  // reads one chunk from the SD card and sends it
void parseResponse(uint8_t* data, size_t len);
// NOTE(review): extractTextFromResult is declared here but no definition or
// call is visible in this file; confirm whether the declaration is still needed.
String extractTextFromResult(JsonDocument& doc);
void generateHeader(uint8_t* header, uint8_t msg_type, uint8_t flags);

// ============== Protocol header ==============
// Writes the 4-byte frame header into header[0..3].
//   header   - destination buffer, at least 4 bytes
//   msg_type - message type, stored in the high nibble of byte 1
//   flags    - message flags, stored in the low nibble of byte 1
// Byte 0 carries version/header-size, byte 2 carries
// serialization/compression, byte 3 is always zero.
void generateHeader(uint8_t* header, uint8_t msg_type, uint8_t flags) {
    const uint8_t fields[4] = {
        static_cast<uint8_t>((PROTOCOL_VERSION << 4) | DEFAULT_HEADER_SIZE),
        static_cast<uint8_t>((msg_type << 4) | flags),
        static_cast<uint8_t>((JSON_SERIALIZATION << 4) | NO_COMPRESSION),
        0x00,
    };
    for (int i = 0; i < 4; ++i) {
        header[i] = fields[i];
    }
}

// ============== Setup ==============
// Runs once at boot. Performs four self-checks in order (hardware info,
// SD card + target file, WiFi, opening the audio file) and then configures
// the WebSocket client. Any failed check halts the sketch in an endless
// delay loop.
void setup() {
    // A failed self-check is fatal: park here forever instead of continuing.
    const auto haltForever = [] {
        for (;;) delay(1000);
    };

    Serial.begin(115200);
    delay(1000);

    Serial.println("\n========================================");
    Serial.println("ESP32-S3 豆包ASR语音识别(流式版)");
    Serial.println("========================================");

    // [Self-check 1/4] chip model and free heap
    Serial.println("\n[自检1/4] 硬件信息:");
    Serial.print("  芯片型号: ");
    Serial.println(ESP.getChipModel());
    Serial.print("  堆内存: ");
    Serial.print(ESP.getFreeHeap() / 1024);
    Serial.println(" KB");

    // [Self-check 2/4] SD card mount, directory listing, target file
    Serial.println("\n[自检2/4] SD卡检测:");
    const bool sdReady = initSD();
    if (!sdReady) {
        Serial.println("  ❌ SD卡初始化失败");
        haltForever();
    }
    Serial.println("  SD卡文件列表:");
    listSDFiles();
    Serial.print("  检查目标文件 ");
    Serial.print(audio_filename);
    Serial.print(": ");
    const bool fileFound = checkAudioFile();
    if (!fileFound) {
        Serial.println("❌ 不存在");
        haltForever();
    }
    Serial.println("✓ 存在");

    // [Self-check 3/4] WiFi: poll every 500 ms, at most 30 times
    Serial.println("\n[自检3/4] WiFi连接:");
    Serial.print("  连接 ");
    Serial.print(ssid);
    Serial.print(" ...");
    WiFi.begin(ssid, password);
    for (int attempt = 0; WiFi.status() != WL_CONNECTED && attempt < 30; ++attempt) {
        delay(500);
        Serial.print(".");
    }
    if (WiFi.status() != WL_CONNECTED) {
        Serial.println(" ❌ 失败");
        haltForever();
    }
    Serial.println(" ✓ 已连接");
    Serial.print("  IP: ");
    Serial.println(WiFi.localIP());

    // [Self-check 4/4] open the audio file for streaming
    Serial.println("\n[自检4/4] 打开音频文件:");
    const bool fileOpened = openAudioFile();
    if (!fileOpened) {
        Serial.println("  ❌ 打开失败");
        haltForever();
    }

    Serial.println("\n========================================");
    Serial.println("所有自检通过,启动ASR识别...");
    Serial.println("========================================\n");

    initWebSocket();
}

// ============== Main loop ==============
// Services the WebSocket client on every pass. Once parseResponse() has set
// asr_completed, prints the final result, closes the audio file if it is
// still open, and halts.
void loop() {
    webSocket.loop();

    if (!asr_completed) {
        return;
    }

    Serial.println("\n========================================");
    Serial.println("ASR识别完成!");
    Serial.print("识别结果: ");
    Serial.println(recognition_result);
    Serial.println("========================================");

    if (audio_file) {
        audio_file.close();
    }
    for (;;) {
        delay(1000);
    }
}

// ============== SD card initialization ==============
// Configures the SD_MMC pins, mounts the card at "/sdcard" (second argument
// true, as in the header comment's 1-bit mode) and prints card type and size.
// Returns false if mounting fails, true otherwise.
bool initSD() {
    Serial.println("  初始化SD卡...");
    SD_MMC.setPins(SD_CLK, SD_CMD, SD_D0);

    const bool mounted = SD_MMC.begin("/sdcard", true);
    if (!mounted) {
        Serial.println("  ❌ 失败");
        return false;
    }

    // Map the reported card type to its label; anything else is "unknown".
    const uint8_t cardKind = SD_MMC.cardType();
    const char* kindLabel = "未知";
    switch (cardKind) {
        case CARD_SD:
            kindLabel = "SD";
            break;
        case CARD_SDHC:
            kindLabel = "SDHC";
            break;
        case CARD_MMC:
            kindLabel = "MMC";
            break;
        default:
            break;
    }
    Serial.print("  类型: ");
    Serial.println(kindLabel);

    Serial.printf("  容量: %llu MB\n", SD_MMC.cardSize() / 1048576);
    return true;
}

// ============== 列出SD卡文件 ==============
void listSDFiles() {
    File root = SD_MMC.open("/");
    if (!root || !root.isDirectory()) return;
    int count = 0;
    File f = root.openNextFile();
    while (f) {
        if (!f.isDirectory()) {
            Serial.printf("    [FILE] %-20s %6d KB\n", f.name(), f.size() / 1024);
            count++;
        }
        f = root.openNextFile();
    }
    Serial.printf("    共 %d 个文件\n", count);
}

// ============== Check that the audio file exists ==============
// Returns true when audio_filename is present on the mounted SD card.
bool checkAudioFile() {
    const bool present = SD_MMC.exists(audio_filename);
    return present;
}

// ============== Open the audio file for streaming ==============
// Opens audio_filename read-only into the global audio_file handle, records
// its size in audio_total_size and resets audio_sent_size. The file content
// is not loaded here; sendAudioChunk() reads it piece by piece.
// Returns true on success, false if the file cannot be opened.
bool openAudioFile() {
    Serial.printf("  打开: %s\n", audio_filename);

    audio_file = SD_MMC.open(audio_filename, FILE_READ);
    if (audio_file) {
        audio_sent_size = 0;
        audio_total_size = audio_file.size();
        Serial.printf("  文件大小: %d bytes (%.1f KB)\n", audio_total_size, audio_total_size / 1024.0);
        Serial.println("  ✓ 已打开(流式读取)");
        return true;
    }

    Serial.println("  ❌ 无法打开");
    return false;
}

// ============== WebSocket初始化 ==============
void initWebSocket() {
    webSocket.beginSSL(ws_host, ws_port, ws_path);
    String auth = "Authorization: Bearer; ";
    auth += token;
    webSocket.setExtraHeaders(auth.c_str());
    webSocket.onEvent(webSocketEvent);
    webSocket.setReconnectInterval(5000);
}

// ============== WebSocket event handler ==============
// Callback registered with the WebSocket client.
//   type    - event kind reported by the client
//   payload - event data (used only for binary frames)
//   length  - payload length in bytes
// On connect it sends the initial configuration request; binary frames are
// handed to parseResponse(); every other event is only logged.
void webSocketEvent(WStype_t type, uint8_t * payload, size_t length) {
    if (type == WStype_CONNECTED) {
        Serial.println("[WS] ✅ 已连接到豆包ASR服务器");
        Serial.print("[WS] 服务器: ");
        Serial.println(ws_host);
        ws_connected = true;
        sendFullClientRequest();
    } else if (type == WStype_DISCONNECTED) {
        Serial.println("[WS] ❌ 断开连接");
        ws_connected = false;
    } else if (type == WStype_BIN) {
        Serial.print("[WS] 📥 收到二进制数据, 长度: ");
        Serial.print(length);
        Serial.println(" bytes");
        parseResponse(payload, length);
    } else if (type == WStype_ERROR) {
        Serial.println("[WS] ❌ 错误");
    } else if (type == WStype_PING) {
        Serial.println("[WS] 🏓 Ping");
    } else if (type == WStype_PONG) {
        Serial.println("[WS] 🏓 Pong");
    } else {
        Serial.print("[WS] 其他事件, 类型: ");
        Serial.println(type);
    }
}

// ============== 发送全量客户端请求 ==============
void sendFullClientRequest() {
    Serial.println("
[ASR] 📤 发送Full Client Request(配置)...");
    Serial.println("[ASR] 构建JSON配置...");

    JsonDocument doc;
    doc["app"]["appid"] = appid;
    doc["app"]["token"] = token;
    doc["app"]["cluster"] = cluster;
    doc["user"]["uid"] = "esp32_asr";
    doc["request"]["reqid"] = "esp32-" + String(millis());
    doc["request"]["nbest"] = 1;
    doc["request"]["workflow"] = "audio_in,resample,partition,vad,fe,decode,itn,nlu_punctuate";
    doc["request"]["show_utterances"] = false;
    doc["request"]["result_type"] = "full";
    doc["request"]["sequence"] = 1;
    doc["audio"]["format"] = "wav";
    doc["audio"]["rate"] = audio_rate;
    doc["audio"]["language"] = "zh-CN";
    doc["audio"]["bits"] = audio_bits;
    doc["audio"]["channel"] = audio_channel;
    doc["audio"]["codec"] = "raw";

    String json;
    serializeJson(doc, json);

    Serial.print("[ASR] JSON配置大小: ");
    Serial.print(json.length());
    Serial.println(" bytes");
    Serial.print("[ASR] JSON内容: ");
    Serial.println(json);

    size_t msg_len = 4 + 4 + json.length();
    Serial.print("[ASR] 总消息大小: ");
    Serial.print(msg_len);
    Serial.println(" bytes (header:4 + size:4 + payload)");

    uint8_t* msg = (uint8_t*)malloc(msg_len);
    if (!msg) {
        Serial.println("[ASR] ❌ 内存分配失败");
        return;
    }

    generateHeader(msg, CLIENT_FULL_REQUEST, NO_SEQUENCE);
    msg[4] = (json.length() >> 24) & 0xFF;
    msg[5] = (json.length() >> 16) & 0xFF;
    msg[6] = (json.length() >> 8) & 0xFF;
    msg[7] = json.length() & 0xFF;
    memcpy(msg + 8, json.c_str(), json.length());

    Serial.print("[ASR] Header: 0x");
    for (int i = 0; i < 4; i++) {
        if (msg[i] < 0x10) Serial.print("0");
        Serial.print(msg[i], HEX);
    }
    Serial.println();

    webSocket.sendBIN(msg, msg_len);
    Serial.println("[ASR] ✅ Full Client Request已发送");
    Serial.println("[ASR] 等待服务器响应...
");
    free(msg);
}

// ============== Read one audio chunk from the SD card and send it ==============
// Sends the next piece of the open audio file (at most CHUNK_SIZE bytes) as
//   4-byte header | 4-byte big-endian payload length | raw file bytes
// The final piece carries the NEG_SEQUENCE flag and the file is closed after
// it has been sent. Does nothing when the file is not open or everything
// has already been sent. Called from parseResponse() after each server
// reply, so exactly one chunk is in flight at a time.
//
// Fixes over the previous version:
//  - string literals that had been split across physical lines (which does
//    not compile) use "\n" again;
//  - the file is read straight into the outgoing frame, so only one
//    chunk-sized heap buffer is needed instead of two plus a memcpy;
//  - audio_sent_size only advances when sendBIN() reports success, and
//    after a failed read/send the file position is moved back so it stays
//    consistent with audio_sent_size;
//  - the progress percentage is computed in 64 bits so it cannot overflow.
void sendAudioChunk() {
    if (!audio_file || audio_sent_size >= audio_total_size) {
        return;  // nothing (left) to send
    }

    const size_t remaining = audio_total_size - audio_sent_size;
    const size_t chunk = (remaining > CHUNK_SIZE) ? CHUNK_SIZE : remaining;
    const bool is_last = (audio_sent_size + chunk >= audio_total_size);

    // Number of chunks successfully sent so far; only bumped on success.
    static int chunk_count = 0;
    const int chunk_no = chunk_count + 1;

    // audio_total_size is non-zero here because of the guard above.
    const unsigned pct_before = static_cast<unsigned>(
        static_cast<uint64_t>(audio_sent_size) * 100 / audio_total_size);
    const unsigned pct_after = static_cast<unsigned>(
        static_cast<uint64_t>(audio_sent_size + chunk) * 100 / audio_total_size);

    Serial.print("\n[ASR] 📤 发送音频片 #");
    Serial.print(chunk_no);
    Serial.print(" | 大小: ");
    Serial.print(chunk);
    Serial.print(" bytes | 进度: ");
    Serial.print(pct_before);
    Serial.print("% -> ");
    Serial.print(pct_after);
    Serial.print("%");
    if (is_last) Serial.print(" [最后一片]");
    Serial.println();

    // One buffer holds the whole frame; the audio is read directly into
    // its payload area (offset 8).
    const size_t msg_len = 4 + 4 + chunk;
    uint8_t* msg = static_cast<uint8_t*>(malloc(msg_len));
    if (!msg) {
        Serial.println("[ERR] ❌ 内存分配失败");
        return;
    }

    const size_t got = audio_file.read(msg + 8, chunk);
    if (got != chunk) {
        Serial.print("[ERR] ❌ 读取SD卡失败, 预期: ");
        Serial.print(chunk);
        Serial.print(", 实际: ");
        Serial.println(got);
        free(msg);
        audio_file.seek(audio_sent_size);  // undo the partial read
        return;
    }
    Serial.println("[ASR] ✅ 从SD卡读取成功");

    const uint8_t flags = is_last ? NEG_SEQUENCE : NO_SEQUENCE;
    generateHeader(msg, CLIENT_AUDIO_ONLY_REQUEST, flags);
    msg[4] = (chunk >> 24) & 0xFF;
    msg[5] = (chunk >> 16) & 0xFF;
    msg[6] = (chunk >> 8) & 0xFF;
    msg[7] = chunk & 0xFF;

    Serial.print("[ASR] Header: 0x");
    for (int i = 0; i < 4; i++) {
        if (msg[i] < 0x10) Serial.print("0");
        Serial.print(msg[i], HEX);
    }
    Serial.print(" | Flags: ");
    Serial.println(flags == NEG_SEQUENCE ? "NEG_SEQUENCE(最后一片)" : "NO_SEQUENCE");

    const bool sent = webSocket.sendBIN(msg, msg_len);
    free(msg);
    if (!sent) {
        Serial.println("[ERR] ❌ 音频片发送失败");
        audio_file.seek(audio_sent_size);  // keep file position == bytes sent
        return;
    }
    Serial.println("[ASR] ✅ 音频片已发送");

    chunk_count = chunk_no;
    audio_sent_size += chunk;

    if (is_last) {
        Serial.println("\n[ASR] ✅ 所有音频发送完成");
        Serial.print("[ASR] 总共发送 ");
        Serial.print(chunk_count);
        Serial.println(" 片音频");
        audio_file.close();
    }
}

// ============== 解析响应 ==============
void parseResponse(uint8_t* data, size_t len) {
    if (len < 8) return;

    uint8_t msg_type = data[1] >> 4;
    uint8_t serialization = data[2] >> 4;
    uint8_t header_size = data[0] & 0x0F;

    size_t payload_offset = header_size * 4;
    if (len < payload_offset + 4) return;

    int32_t payload_size = ((int32_t)data[payload_offset] << 24) |
                           ((int32_t)data[payload_offset + 1] << 16) |
                           ((int32_t)data[payload_offset + 2] << 8) |
                           ((int32_t)data[payload_offset + 3]);

    if (payload_size <= 0 || len < payload_offset + 4 + payload_size) return;

    uint8_t* payload = data + payload_offset + 4;

    if (serialization == JSON_SERIALIZATION) {
        JsonDocument doc;
        DeserializationError err = deserializeJson(doc, payload, payload_size);

        if (!err) {
            int code = doc["code"] | -1;

            if (code == 1000) {
                // 提取文本
                JsonArray arr = doc["result"].as<JsonArray>();
                if (arr.size() > 0) {
                    const char* txt = arr[0]["text"];
                    if (txt) {
                        recognition_result = String(txt);
                        Serial.printf("[识别] %s\n", txt);
                    }
                }

                // 检查是否完成
                int seq = doc["sequence"] | 0;
                if (seq < 0) {
                    asr_completed = true;
                    return;
                }

                // 继续发送下一片音频
                sendAudioChunk();

            } else {
                const char* msg = doc["message"] | "Unknown";
                Serial.printf("[ERR] %d: %s\n", code, msg);
            }
        }
    }
}
bash 复制代码
ESP-ROM:esp32s3-20210327
Build:Mar 27 2021
rst:0x1 (POWERON),boot:0x8 (SPI_FAST_FLASH_BOOT)
SPIWP:0xee
mode:DIO, clock div:1
load:0x3fce2820,len:0x10cc
load:0x403c8700,len:0xc2c
load:0x403cb700,len:0x30c0
entry 0x403c88b8

========================================
ESP32-S3 豆包ASR语音识别(流式版)
========================================

[自检1/4] 硬件信息:
  芯片型号: ESP32-S3
  堆内存: 312 KB

[自检2/4] SD卡检测:
  初始化SD卡...
  类型: SDHC
  容量: 15200 MB
  SD卡文件列表:
    [FILE] audio1.wav              258 KB
    [FILE] audio2.wav              303 KB
    [FILE] audio3.wav              285 KB
    [FILE] audio4.wav              472 KB
    [FILE] audio5.wav              476 KB
    [FILE] hello.wav               281 KB
    共 6 个文件
  检查目标文件 /audio1.wav: ✓ 存在

[自检3/4] WiFi连接:
  连接 rm1234 ....................... ✓ 已连接
  IP: 192.168.137.241

[自检4/4] 打开音频文件:
  打开: /audio1.wav
  文件大小: 265004 bytes (258.8 KB)
  ✓ 已打开(流式读取)

========================================
所有自检通过,启动ASR识别...
========================================

[WS] 已连接
[ASR] 发送配置...
[识别] 
[识别] 
[识别] 头
[ASR] 音频发送完成
[识别] 头抬高!

========================================
ASR识别完成!
识别结果: 头抬高!
========================================

ESP32-S3-CAM:豆包语音识别文字后控制小车(四)------增加mic拾音后通过豆包语音识别后转文字输出

相关推荐
春末的南方城市2 小时前
CVPR 2026 | 复旦开源首个端到端多模态矢量动画生成框架OmniLottie:UI动效革命,文本/图像一键转Lottie动画!
人工智能·深度学习·机器学习·计算机视觉·aigc
禹笑笑-AI食用指南2 小时前
AI 团队协作下的工作日志系统:痛点、场景与技术解决方案
人工智能
新缸中之脑2 小时前
用Gemma 4构建自托管OCR
人工智能·ocr
ai_xiaogui2 小时前
凌晨3点的重构局:从遗漏“用户中心”看AI客户端前后端分离架构的深水区
人工智能·aistarter·panelai·ai客户端架构设计·桌面端前后端分离·本地大模型api接入·独立开发者踩坑实录
不才小强2 小时前
CUDA编程与API详解
人工智能
探物 AI2 小时前
虾破苍穹(一):RTX 3060 养一只本地“呆呆”龙虾 [特殊字符]
人工智能·ai编程
俊哥V2 小时前
每日 AI 研究简报 · 2026-04-12
人工智能·ai
拥抱AGI2 小时前
Qwen3.5开源矩阵震撼发布!从0.8B到397B,不同规模模型性能、显存、速度深度对比与选型指南来了!
人工智能·学习·程序员·开源·大模型·大模型训练·qwen3.5
哈喽天空2 小时前
win10原生安装openclaw
人工智能