【ESP32-S3】对接豆包端到端实时语音

目录

背景

近期尝试给小车加一个智能对话的功能。满大街都是小智,所以想尝试一下其他方案。

当前遇到的问题

WebSocket 可以连接成功,StartConnection 事件也可以发出,但是随后发送 StartSession 失败,连接随即断开。串口日志如下:

22:24:39.407 -> [状态检查 70]

22:24:39.407 -> WiFi: 已连接

22:24:39.407 -> WebSocket: 断开

22:24:39.407 -> 豆包激活: 否

22:24:39.449 -> Connect ID: esp32_3CDC757254DC

22:24:39.449 -> Dialog ID:

22:24:39.449 -> [DouBao] 连接断开,尝试重连...

22:24:39.449 -> [DouBao] 初始化豆包语音...

22:24:39.449 -> [DouBao] Connect ID: esp32_3CDC757254DC

22:24:39.543 -> [DouBao] 设置头部信息

22:24:39.543 -> [DouBao] 连接豆包服务器: openspeech.bytedance.com:443

22:24:39.543 -> 麦克风已初始化,跳过

22:24:39.543 -> 扬声器已初始化,跳过

22:24:40.053 -> 测试麦克风...

22:24:40.053 -> 测试 1: 读取 512 字节,平均音量: 0

22:24:40.145 -> 测试 2: 读取 512 字节,平均音量: 0

22:24:40.227 -> 测试 3: 读取 512 字节,平均音量: 0

22:24:40.360 -> 测试扬声器...

22:24:40.423 -> 扬声器测试成功

22:24:41.063 -> [DouBao] WebSocket连接成功

22:24:41.189 -> [DouBao] 发送StartConnection...

22:24:41.189 -> [DouBao] StartConnection发送: 成功

22:24:41.371 -> [DouBao] 发送StartSession...

22:24:41.371 -> [DouBao] StartSession发送: 失败

22:24:41.405 -> [DouBao] StartSession发送失败,可能是消息太长或连接问题

22:24:41.405 -> [DouBao] WebSocket断开连接

代码

最新代码见链接:https://gitee.com/likexiang/like-code/blob/master/ESP32-S3-CAM/DouBaoVoic.ino

c++ 复制代码
// ===========================
// 豆包端到端实时语音大模型API适配 V2.0 - 修复版
// ===========================

#include <WiFi.h>
#include <WebSocketsClient.h>
#include <WiFiClientSecure.h>
#include <ArduinoJson.h>
#include <driver/i2s.h>

// ========== Doubao endpoint configuration ==========
// Host and port of the TLS WebSocket endpoint.
#define DOUBAO_WSS_HOST "openspeech.bytedance.com"
#define DOUBAO_WSS_PORT 443
// Application credentials and resource identifier.
// NOTE(review): credentials are hard-coded in the sketch; keep real values out of version control.
#define DOUBAO_APP_ID "9542649884"
#define DOUBAO_API_KEY "xxx"
#define DOUBAO_ACCESS_TOKEN "xx"
#define DOUBAO_RESOURCE_ID "volc.speech.dialog"

// ========== Audio frame parameters ==========
// One capture frame is DOUBAO_FRAME_MS milliseconds of 16 kHz, 16-bit, mono PCM.
#define DOUBAO_FRAME_MS 20
#define DOUBAO_SAMPLE_RATE 16000
#define DOUBAO_BITS_PER_SAMPLE 16
#define DOUBAO_CHANNELS 1
// Size of one frame in bytes; with the values above this evaluates to 640.
#define DOUBAO_FRAME_BYTES (DOUBAO_SAMPLE_RATE * DOUBAO_BITS_PER_SAMPLE / 8 * DOUBAO_CHANNELS * DOUBAO_FRAME_MS / 1000)

// ========== Audio device wiring ==========
// Microphone: I2S port 1 (bit clock, word select, data-in pins).
#define I2S_MIC_PORT I2S_NUM_1
#define I2S_MIC_BCLK_PIN 4
#define I2S_MIC_LRCLK_PIN 5
#define I2S_MIC_DATA_PIN 2

// Speaker: I2S port 0 (bit clock, word select, data-out pins).
#define I2S_SPEAKER_PORT I2S_NUM_0
#define I2S_SPEAKER_BCLK_PIN 12
#define I2S_SPEAKER_LRCLK_PIN 15
#define I2S_SPEAKER_DATA_PIN 16

// ========== WiFi configuration ==========
// NOTE(review): the WiFi SSID and password are hard-coded in the sketch.
const char *ssid = "ChinaNet-6x8c";
const char *password = "8zeymm8c";

// ========== Global state ==========
// WebSocket client used for the Doubao connection.
WebSocketsClient doubaoWs;
// Set to true once a "session_started" response carrying a dialog_id has been handled;
// cleared when the WebSocket disconnects.
bool isDoubaoActive = false;
// Master enable flag; checked by the capture task and by the reconnect logic in loop().
bool isDoubaoEnabled = true;

// Session management
// Connection identifier derived from the WiFi MAC address in initDoubao().
String doubaoConnectId = "";
// Dialog identifier taken from the server's "session_started" response.
String doubaoDialogId = "";

// Task handle
// Handle of the microphone capture task; NULL while no task has been created.
TaskHandle_t doubaoCaptureTask = NULL;

// Audio device state
// Track whether the corresponding I2S driver is currently installed.
bool isMicrophoneInitialized = false;
bool isSpeakerInitialized = false;

// ========== JSON构建函数 ==========
/**
 * Builds the JSON text of the "start_connection" message.
 *
 * The object contains, in this order: "type", "device_id" (the WiFi MAC
 * address), "client_type" and "version".
 *
 * @return The serialized JSON document.
 */
String buildStartConnectionJson() {
  DynamicJsonDocument message(512);

  message["type"] = "start_connection";
  message["device_id"] = WiFi.macAddress();
  message["client_type"] = "esp32";
  message["version"] = "1.0.0";

  String serialized;
  serializeJson(message, serialized);
  return serialized;
}

/**
 * Builds the JSON text of the "start_session" message.
 *
 * Layout of the produced object:
 *   type   : "start_session"
 *   asr    : { extra: { end_smooth_window_ms: 1500, enable_custom_vad: false } }
 *   dialog : { dialog_id: "", user_id: "" }
 *
 * @return The serialized JSON document.
 */
String buildStartSessionJson() {
  DynamicJsonDocument message(512);
  message["type"] = "start_session";

  // Speech-recognition options live under asr.extra.
  JsonObject asrExtra = message.createNestedObject("asr").createNestedObject("extra");
  asrExtra["end_smooth_window_ms"] = 1500;
  asrExtra["enable_custom_vad"] = false;

  // Dialog identifiers are sent empty.
  JsonObject dialogSection = message.createNestedObject("dialog");
  dialogSection["dialog_id"] = "";
  dialogSection["user_id"] = "";

  String serialized;
  serializeJson(message, serialized);
  return serialized;
}

// ========== WebSocket事件处理 ==========
void doubaoWsEvent(WStype_t type, uint8_t *payload, size_t length) {
  switch (type) {
    case WStype_DISCONNECTED:
      Serial.println("[DouBao] WebSocket断开连接");
      isDoubaoActive = false;
      break;
      
    case WStype_CONNECTED:
      {
        Serial.println("[DouBao] WebSocket连接成功");
        
        // 连接成功后发送初始化消息
        delay(100);
        
        // 发送StartConnection
        String startConn = buildStartConnectionJson();
        Serial.println("[DouBao] 发送StartConnection...");
        bool connSent = doubaoWs.sendTXT(startConn);
        Serial.printf("[DouBao] StartConnection发送: %s\n", connSent ? "成功" : "失败");
        
        delay(200);
        
        // 发送StartSession
        String startSess = buildStartSessionJson();
        Serial.println("[DouBao] 发送StartSession...");
        bool sessSent = doubaoWs.sendTXT(startSess);
        Serial.printf("[DouBao] StartSession发送: %s\n", sessSent ? "成功" : "失败");
        
        if (!sessSent) {
          Serial.println("[DouBao] StartSession发送失败,可能是消息太长或连接问题");
        }
      }
      break;
      
    case WStype_TEXT:
      {
        String message = String((char*)payload).substring(0, min(length, (size_t)500));
        Serial.printf("[DouBao] 收到消息: %s\n", message.c_str());
        
        // 解析JSON响应
        DynamicJsonDocument doc(1024);
        DeserializationError error = deserializeJson(doc, message);
        if (error) {
          Serial.printf("[DouBao] JSON解析失败: %s\n", error.c_str());
          return;
        }
        
        // 处理响应
        if (doc.containsKey("type")) {
          String responseType = doc["type"].as<String>();
          Serial.printf("[DouBao] 响应类型: %s\n", responseType.c_str());
          
          if (responseType == "connection_started") {
            Serial.println("[DouBao] 连接建立成功");
          } 
          else if (responseType == "session_started") {
            if (doc.containsKey("dialog_id")) {
              doubaoDialogId = doc["dialog_id"].as<String>();
              Serial.printf("[DouBao] 会话启动成功,dialog_id: %s\n", doubaoDialogId.c_str());
              isDoubaoActive = true;
              
              // 启动音频采集任务
              if (doubaoCaptureTask == NULL) {
                xTaskCreatePinnedToCore([](void* param) {
                  Serial.println("[DouBao] 音频采集任务启动");
                  
                  int16_t buffer[DOUBAO_FRAME_BYTES / 2];
                  size_t bytesRead;
                  uint32_t frameCount = 0;
                  
                  while (isDoubaoEnabled && isDoubaoActive) {
                    if (doubaoWs.isConnected()) {
                      esp_err_t ret = i2s_read(I2S_MIC_PORT, buffer, DOUBAO_FRAME_BYTES, &bytesRead, portMAX_DELAY);
                      
                      if (ret == ESP_OK && bytesRead == DOUBAO_FRAME_BYTES) {
                        // 简单的音频处理
                        for (int i = 0; i < DOUBAO_FRAME_BYTES / 2; i++) {
                          int32_t amplified = buffer[i] * 4;
                          buffer[i] = (int16_t)constrain(amplified, -32768, 32767);
                        }
                        
                        frameCount++;
                        if (frameCount % 50 == 0) {
                          Serial.printf("[DouBao] 已采集 %d 帧音频\n", frameCount);
                        }
                      }
                    }
                    delay(DOUBAO_FRAME_MS);
                  }
                  
                  Serial.println("[DouBao] 音频采集任务结束");
                  vTaskDelete(NULL);
                }, "DouBaoCapture", 8192, NULL, 1, &doubaoCaptureTask, 1);
              }
            }
          } 
          else if (responseType == "asr_response") {
            if (doc.containsKey("results")) {
              String text = doc["results"][0]["text"].as<String>();
              bool isInterim = doc["results"][0]["is_interim"].as<bool>();
              if (!isInterim && text.length() > 0) {
                Serial.printf("[DouBao] 识别结果: %s\n", text.c_str());
              }
            }
          }
          else if (responseType == "error") {
            if (doc.containsKey("error")) {
              String errorMsg = doc["error"].as<String>();
              Serial.printf("[DouBao] 错误: %s\n", errorMsg.c_str());
            }
          }
        }
      }
      break;
      
    case WStype_ERROR:
      Serial.println("[DouBao] WebSocket错误");
      break;
      
    default:
      break;
  }
}

// ========== 音频设备初始化 ==========
/**
 * Installs and configures the I2S driver for the microphone port.
 *
 * Does nothing (and returns ESP_OK) when the microphone is already marked as
 * initialized. On success the isMicrophoneInitialized flag is set.
 *
 * @return ESP_OK on success, otherwise the error code reported by
 *         i2s_driver_install() or i2s_set_pin().
 */
esp_err_t initMicrophone() {
  if (isMicrophoneInitialized) {
    Serial.println("麦克风已初始化,跳过");
    return ESP_OK;
  }

  Serial.println("初始化麦克风...");

  // Remove any driver that may still be installed on this port, then wait briefly.
  i2s_driver_uninstall(I2S_MIC_PORT);
  delay(100);

  // Master receiver, 16-bit samples at the capture sample rate, left channel only.
  i2s_config_t micConfig = {};
  micConfig.mode = (i2s_mode_t)(I2S_MODE_MASTER | I2S_MODE_RX);
  micConfig.sample_rate = DOUBAO_SAMPLE_RATE;
  micConfig.bits_per_sample = I2S_BITS_PER_SAMPLE_16BIT;
  micConfig.channel_format = I2S_CHANNEL_FMT_ONLY_LEFT;
  micConfig.communication_format = I2S_COMM_FORMAT_STAND_I2S;
  micConfig.intr_alloc_flags = ESP_INTR_FLAG_LEVEL1;
  micConfig.dma_buf_count = 4;
  micConfig.dma_buf_len = 256;
  micConfig.use_apll = false;
  micConfig.tx_desc_auto_clear = false;
  micConfig.fixed_mclk = 0;

  const esp_err_t installResult = i2s_driver_install(I2S_MIC_PORT, &micConfig, 0, NULL);
  if (installResult != ESP_OK) {
    Serial.printf("I2S麦克风驱动安装失败: %d\n", installResult);
    return installResult;
  }

  // Input-only wiring: the data-out pin is left untouched.
  i2s_pin_config_t micPins = {};
  micPins.bck_io_num = I2S_MIC_BCLK_PIN;
  micPins.ws_io_num = I2S_MIC_LRCLK_PIN;
  micPins.data_out_num = I2S_PIN_NO_CHANGE;
  micPins.data_in_num = I2S_MIC_DATA_PIN;

  const esp_err_t pinResult = i2s_set_pin(I2S_MIC_PORT, &micPins);
  if (pinResult != ESP_OK) {
    Serial.printf("I2S麦克风引脚配置失败: %d\n", pinResult);
    // Roll back the driver installation so a later retry starts clean.
    i2s_driver_uninstall(I2S_MIC_PORT);
    return pinResult;
  }

  isMicrophoneInitialized = true;
  Serial.println("麦克风初始化完成");
  return ESP_OK;
}

/**
 * Installs and configures the I2S driver for the speaker port.
 *
 * Does nothing (and returns ESP_OK) when the speaker is already marked as
 * initialized. On success the isSpeakerInitialized flag is set.
 *
 * @return ESP_OK on success, otherwise the error code reported by
 *         i2s_driver_install() or i2s_set_pin().
 */
esp_err_t initSpeaker() {
  if (isSpeakerInitialized) {
    Serial.println("扬声器已初始化,跳过");
    return ESP_OK;
  }

  Serial.println("初始化扬声器...");

  // Remove any driver that may still be installed on this port, then wait briefly.
  i2s_driver_uninstall(I2S_SPEAKER_PORT);
  delay(100);

  // Master transmitter, 16-bit samples at 24 kHz, left channel only.
  i2s_config_t speakerConfig = {};
  speakerConfig.mode = (i2s_mode_t)(I2S_MODE_MASTER | I2S_MODE_TX);
  speakerConfig.sample_rate = 24000;
  speakerConfig.bits_per_sample = I2S_BITS_PER_SAMPLE_16BIT;
  speakerConfig.channel_format = I2S_CHANNEL_FMT_ONLY_LEFT;
  speakerConfig.communication_format = I2S_COMM_FORMAT_STAND_I2S;
  speakerConfig.dma_buf_count = 4;
  speakerConfig.dma_buf_len = 256;
  speakerConfig.use_apll = false;
  speakerConfig.tx_desc_auto_clear = true;
  speakerConfig.fixed_mclk = 0;

  const esp_err_t installResult = i2s_driver_install(I2S_SPEAKER_PORT, &speakerConfig, 0, NULL);
  if (installResult != ESP_OK) {
    Serial.printf("I2S扬声器驱动安装失败: %d\n", installResult);
    return installResult;
  }

  // Output-only wiring: the data-in pin is left untouched.
  i2s_pin_config_t speakerPins = {};
  speakerPins.bck_io_num = I2S_SPEAKER_BCLK_PIN;
  speakerPins.ws_io_num = I2S_SPEAKER_LRCLK_PIN;
  speakerPins.data_out_num = I2S_SPEAKER_DATA_PIN;
  speakerPins.data_in_num = I2S_PIN_NO_CHANGE;

  const esp_err_t pinResult = i2s_set_pin(I2S_SPEAKER_PORT, &speakerPins);
  if (pinResult != ESP_OK) {
    Serial.printf("I2S扬声器引脚配置失败: %d\n", pinResult);
    // Roll back the driver installation so a later retry starts clean.
    i2s_driver_uninstall(I2S_SPEAKER_PORT);
    return pinResult;
  }

  isSpeakerInitialized = true;
  Serial.println("扬声器初始化完成");
  return ESP_OK;
}

// ========== 音频设备测试 ==========
void testMicrophone() {
  Serial.println("测试麦克风...");
  int16_t buffer[256];
  size_t bytesRead;
  
  for (int i = 0; i < 3; i++) {
    esp_err_t ret = i2s_read(I2S_MIC_PORT, buffer, sizeof(buffer), &bytesRead, 100);
    if (ret == ESP_OK && bytesRead > 0) {
      // 计算平均音量
      int32_t sum = 0;
      for (int j = 0; j < bytesRead / 2; j++) {
        sum += abs(buffer[j]);
      }
      int avgVolume = sum / (bytesRead / 2);
      Serial.printf("测试 %d: 读取 %d 字节,平均音量: %d\n", i+1, bytesRead, avgVolume);
    } else {
      Serial.printf("测试 %d 失败,错误码: %d\n", i+1, ret);
    }
    delay(100);
  }
}

void testSpeaker() {
  Serial.println("测试扬声器...");
  
  // 生成1kHz正弦波测试音(100ms)
  const int sampleCount = 2400;  // 24000Hz * 0.1s
  int16_t testTone[sampleCount];
  
  for (int i = 0; i < sampleCount; i++) {
    testTone[i] = sin(2 * 3.14159 * 1000 * i / 24000) * 5000;
  }
  
  size_t bytesWritten;
  esp_err_t err = i2s_write(I2S_SPEAKER_PORT, testTone, sizeof(testTone), &bytesWritten, portMAX_DELAY);
  
  if (err == ESP_OK && bytesWritten == sizeof(testTone)) {
    Serial.println("扬声器测试成功");
  } else {
    Serial.printf("扬声器测试失败,错误码: %d\n", err);
  }
}

// ========== 豆包初始化 ==========
void initDoubao() {
  Serial.println("[DouBao] 初始化豆包语音...");
  
  // 生成Connect ID
  doubaoConnectId = "esp32_" + WiFi.macAddress();
  doubaoConnectId.replace(":", "");
  Serial.printf("[DouBao] Connect ID: %s\n", doubaoConnectId.c_str());
  
  // 断开现有连接
  doubaoWs.disconnect();
  delay(100);
  
  // 重新初始化WebSocket
  doubaoWs.onEvent(doubaoWsEvent);
  doubaoWs.setReconnectInterval(5000);
  doubaoWs.enableHeartbeat(15000, 3000, 2);
  
  // 设置头部信息 - 确保格式正确
  String headers = 
    "X-Api-App-ID: " + String(DOUBAO_APP_ID) + "\r\n" +
    "X-Api-Access-Key: " + String(DOUBAO_ACCESS_TOKEN) + "\r\n" +
    "X-Api-Resource-Id: " + String(DOUBAO_RESOURCE_ID) + "\r\n" +
    "X-Api-App-Key: " + String(DOUBAO_API_KEY) + "\r\n" +
    "X-Api-Connect-Id: " + doubaoConnectId + "\r\n";
  
  Serial.println("[DouBao] 设置头部信息");
  doubaoWs.setExtraHeaders(headers.c_str());
  
  // 开始SSL连接
  Serial.printf("[DouBao] 连接豆包服务器: %s:%d\n", DOUBAO_WSS_HOST, DOUBAO_WSS_PORT);
  doubaoWs.beginSSL(DOUBAO_WSS_HOST, DOUBAO_WSS_PORT, "/api/v3/realtime/dialogue");
  
  // 初始化音频设备
  initMicrophone();
  initSpeaker();
  
  // 测试音频设备
  delay(500);
  testMicrophone();
  testSpeaker();
}

// ========== 清理函数 ==========
void cleanupAudio() {
  Serial.println("清理音频设备...");
  
  if (doubaoCaptureTask != NULL) {
    vTaskDelete(doubaoCaptureTask);
    doubaoCaptureTask = NULL;
  }
  
  if (isMicrophoneInitialized) {
    i2s_driver_uninstall(I2S_MIC_PORT);
    isMicrophoneInitialized = false;
    Serial.println("麦克风已卸载");
  }
  
  if (isSpeakerInitialized) {
    i2s_driver_uninstall(I2S_SPEAKER_PORT);
    isSpeakerInitialized = false;
    Serial.println("扬声器已卸载");
  }
}

// ========== 主程序 ==========
/**
 * Arduino entry point: opens the serial console, joins the WiFi network
 * (polling up to 30 times at 500 ms intervals; halts forever on failure) and
 * then initializes the Doubao voice stack.
 */
void setup() {
  Serial.begin(115200);
  Serial.println();
  Serial.println("=== 豆包语音精简版 V2.0 ===");

  delay(1000);

  // Bring up WiFi with modem sleep disabled.
  Serial.println("正在连接WiFi...");
  WiFi.begin(ssid, password);
  WiFi.setSleep(false);

  // Poll the link state, printing one dot per attempt.
  for (int attempts = 0; WiFi.status() != WL_CONNECTED && attempts < 30; attempts++) {
    delay(500);
    Serial.print(".");
  }

  if (WiFi.status() != WL_CONNECTED) {
    Serial.println("\nWiFi连接失败");
    // Nothing useful can run without a network: stay here forever.
    while (1) delay(1000);
  }

  Serial.println("\nWiFi连接成功");
  Serial.print("IP地址: ");
  Serial.println(WiFi.localIP());

  delay(2000);

  // Initialize the Doubao voice stack.
  initDoubao();

  Serial.println("=== 系统启动完成 ===");
  Serial.println("输入 'help' 查看可用命令");
}

/**
 * Arduino main loop: services the WebSocket client on every pass and, once
 * more than 10 s have elapsed since the previous report, prints a status
 * report. Inside that report the free heap is printed when more than 30 s have
 * elapsed since it was last printed, and a full re-initialization is triggered
 * when the assistant is enabled but the WebSocket is not connected.
 */
void loop() {
  static unsigned long lastStatusCheck = 0;
  static unsigned long lastMemCheck = 0;

  // Service the WebSocket client.
  doubaoWs.loop();

  const bool statusDue = (millis() - lastStatusCheck > 10000);
  if (statusDue) {
    lastStatusCheck = millis();

    const bool wsConnected = doubaoWs.isConnected();
    Serial.printf("\n[状态检查 %lu]\n", millis() / 1000);

    const char *wifiState = (WiFi.status() == WL_CONNECTED) ? "已连接" : "断开";
    Serial.printf("WiFi: %s\n", wifiState);

    const char *wsState = wsConnected ? "已连接" : "断开";
    Serial.printf("WebSocket: %s\n", wsState);

    const char *activeState = isDoubaoActive ? "是" : "否";
    Serial.printf("豆包激活: %s\n", activeState);

    Serial.printf("Connect ID: %s\n", doubaoConnectId.c_str());
    Serial.printf("Dialog ID: %s\n", doubaoDialogId.c_str());

    // Heap report, rate-limited separately from the status report.
    if (millis() - lastMemCheck > 30000) {
      lastMemCheck = millis();
      uint32_t freeHeap = ESP.getFreeHeap();
      Serial.printf("可用堆内存: %d bytes\n", freeHeap);
    }

    // Not connected while enabled: rebuild the connection from scratch.
    if (!wsConnected && isDoubaoEnabled) {
      Serial.println("[DouBao] 连接断开,尝试重连...");
      initDoubao();
    }
  }

  delay(10);
}

// ========== 串口命令处理 ==========
void serialEvent() {
  while (Serial.available()) {
    String command = Serial.readStringUntil('\n');
    command.trim();
    
    Serial.printf("> %s\n", command.c_str());
    
    if (command == "status") {
      Serial.println("=== 当前状态 ===");
      Serial.printf("WiFi: %s\n", WiFi.status() == WL_CONNECTED ? "已连接" : "断开");
      Serial.printf("WebSocket: %s\n", doubaoWs.isConnected() ? "已连接" : "断开");
      Serial.printf("豆包激活: %s\n", isDoubaoActive ? "是" : "否");
      Serial.printf("Connect ID: %s\n", doubaoConnectId.c_str());
      Serial.printf("Dialog ID: %s\n", doubaoDialogId.c_str());
      Serial.printf("麦克风: %s\n", isMicrophoneInitialized ? "已初始化" : "未初始化");
      Serial.printf("扬声器: %s\n", isSpeakerInitialized ? "已初始化" : "未初始化");
    }
    else if (command == "reconnect") {
      Serial.println("重新连接豆包...");
      cleanupAudio();
      initDoubao();
    }
    else if (command == "test_mic") {
      testMicrophone();
    }
    else if (command == "test_speaker") {
      testSpeaker();
    }
    else if (command == "start_session") {
      if (doubaoWs.isConnected()) {
        String startSess = buildStartSessionJson();
        Serial.println("手动发送StartSession...");
        bool sent = doubaoWs.sendTXT(startSess);
        Serial.printf("发送结果: %s\n", sent ? "成功" : "失败");
      } else {
        Serial.println("WebSocket未连接");
      }
    }
    else if (command == "cleanup") {
      cleanupAudio();
      Serial.println("音频设备已清理");
    }
    else if (command == "restart") {
      Serial.println("重启系统...");
      delay(1000);
      ESP.restart();
    }
    else if (command == "help") {
      Serial.println("=== 可用命令 ===");
      Serial.println("status        - 查看当前状态");
      Serial.println("reconnect     - 重新连接豆包");
      Serial.println("test_mic      - 测试麦克风");
      Serial.println("test_speaker  - 测试扬声器");
      Serial.println("start_session - 手动发送StartSession");
      Serial.println("cleanup       - 清理音频设备");
      Serial.println("restart       - 重启系统");
      Serial.println("help          - 显示帮助");
    }
    else {
      Serial.println("未知命令,输入 'help' 查看可用命令");
    }
  }
}

参考

豆包对接链接:https://www.volcengine.com/docs/6561/1594356?lang=zh#客户端事件

相关推荐
时光の尘5 个月前
ESP32入门开发·VScode空白项目搭建·点亮一颗LED灯
c语言·ide·vscode·freertos·led·esp32-s3·esp32-idf
rosemary5127 个月前
ESP32-S3 IDF V5.4.1 LVGL 9.2.0 fatfs
lvgl·esp32-s3·fatfs
jomoly7 个月前
【LC实战派】小智固件编译
乐鑫·esp32-s3·立创·小智·实战派
小_楠_天_问8 个月前
第二课:ESP32 使用 PWM 渐变控制——实现模拟呼吸灯或音调变化
c语言·嵌入式硬件·mcu·esp32·arduino·pwm·esp32-s3
深圳启明云端科技10 个月前
家电产品智能屏方案,ESP32系列助力智能升级,物联网通信交互应用
物联网·人机交互·芯片·乐鑫·esp32-s3·esp32-c3·智能屏
深圳启明云端科技10 个月前
乐鑫ESP32系列产品方案,智能屏无线交互控制应用,设备触控语音交互联动
物联网·乐鑫·esp32-s3·esp32-c3·智能屏·esp32-p4·无线组网
lsalp1 年前
OpenAI于2024年12月21日在GitHub上正式发布了实时嵌入式SDK。支持ESP32-S3
物联网·github·esp32-s3
蓝天居士1 年前
ESP32-S3模组上跑通ES8388(12)
esp32-s3·es8388·audio codec·esp-adf
启明云端wireless-tag1 年前
ESP32-S3设备智能化升级,物联网无线AI语音交互,让生活更加便捷和有趣
物联网·乐鑫·esp32-s3·无线方案·ai语音交互