When performing TTS over the WebSocket API, the base64-encoded audio returned in the result cannot be played directly.
Solution
- Decode the base64 string into binary data (at this point the content is raw PCM, not a playable file)
- Wrap the binary data from the previous step in a WAV container to produce a playable audio file
Java implementation example:
```java
import java.io.FileOutputStream;
import java.io.IOException;
import java.util.Base64;

public class PcmToWavConverter {

    public static void main(String[] args) throws IOException {
        // base64 audio returned by the streaming TTS API (truncated here)
        String base64Data = "BAACAAQAAgABAAIAAQACAAEAAQABAAIAAQABAAAAAQAAAAAAAAAAAAEAAAABAAAAAAAAAAEAAAAAAAAAAAABAAAAAAABAAAAAgAAAAAAAQAAAAEAAQAAAAEAAgABAAEAAAABAAIAAQABAAIAAQAAAAAAAAAAAAAA///+//7//v/9//7/AAAAAP/+//7+//7//v///wAA/wAA//8AAP///v/+AAD//wAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAQAAAAAAAQABAAEAAAAAAAAAAAD//wAAAQACAAIAAgACAAIAAwACAAIAAwADAAIAAgADAAMAAwACAAMABQADAAMAAwAEA....";
        // Base64 decode
        byte[] pcmBytes = Base64.getDecoder().decode(base64Data);
        // WAV file parameters
        int sampleRate = 24000; // sample rate
        int channels = 1;       // mono
        int bitDepth = 16;      // bits per sample
        String outputFilePath = "C:\\Users\\Administrator\\Desktop\\output1-3.wav";
        try (FileOutputStream fos = new FileOutputStream(outputFilePath)) {
            // write the WAV header
            writeWavHeader(fos, pcmBytes.length, sampleRate, channels, bitDepth);
            // write the PCM data
            fos.write(pcmBytes);
            System.out.println("WAV file generated: " + outputFilePath);
        } catch (IOException e) {
            e.printStackTrace();
        }
    }

    /**
     * Write the WAV file header.
     *
     * @param fos        file output stream
     * @param dataLength PCM data length in bytes
     * @param sampleRate sample rate
     * @param channels   number of channels
     * @param bitDepth   bits per sample
     * @throws IOException if writing fails
     */
    private static void writeWavHeader(FileOutputStream fos, int dataLength, int sampleRate, int channels, int bitDepth) throws IOException {
        // WAV header size (44 bytes)
        int headerSize = 44;
        // total file size (header + PCM data)
        int fileSize = headerSize + dataLength;
        // byte rate (bytes per second)
        int byteRate = sampleRate * channels * bitDepth / 8;
        // block align (bytes per sample frame)
        int blockAlign = channels * bitDepth / 8;
        // write the header fields
        fos.write("RIFF".getBytes()); // Chunk ID
        writeInt(fos, fileSize - 8);  // Chunk Size
        fos.write("WAVE".getBytes()); // Format
        fos.write("fmt ".getBytes()); // Subchunk1 ID
        writeInt(fos, 16);            // Subchunk1 Size (16 for PCM)
        writeShort(fos, 1);           // Audio Format (1 for PCM)
        writeShort(fos, channels);    // Number of Channels
        writeInt(fos, sampleRate);    // Sample Rate
        writeInt(fos, byteRate);      // Byte Rate
        writeShort(fos, blockAlign);  // Block Align
        writeShort(fos, bitDepth);    // Bits Per Sample
        fos.write("data".getBytes()); // Subchunk2 ID
        writeInt(fos, dataLength);    // Subchunk2 Size
    }

    /**
     * Write a 32-bit integer (little-endian).
     */
    private static void writeInt(FileOutputStream fos, int value) throws IOException {
        fos.write(value & 0xFF);
        fos.write((value >> 8) & 0xFF);
        fos.write((value >> 16) & 0xFF);
        fos.write((value >> 24) & 0xFF);
    }

    /**
     * Write a 16-bit short integer (little-endian).
     */
    private static void writeShort(FileOutputStream fos, int value) throws IOException {
        fos.write(value & 0xFF);
        fos.write((value >> 8) & 0xFF);
    }
}
```
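To sanity-check the generated header, the file can be read back with the standard javax.sound.sampled API. A minimal sketch, assuming the same output path as the example above:

```java
import javax.sound.sampled.AudioFileFormat;
import javax.sound.sampled.AudioSystem;
import java.io.File;

public class WavCheck {
    public static void main(String[] args) throws Exception {
        // parse the header of the generated file and print the audio format
        AudioFileFormat fileFormat = AudioSystem.getAudioFileFormat(
                new File("C:\\Users\\Administrator\\Desktop\\output1-3.wav"));
        // expected output: PCM_SIGNED 24000.0 Hz, 16 bit, mono, little-endian
        System.out.println(fileFormat.getFormat());
    }
}
```

If the header is malformed, getAudioFileFormat throws an UnsupportedAudioFileException instead of printing the format.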
Digging into the root cause
Source code
- Key code of the API endpoint (in the TTSServerExecutor class)
```python
@router.websocket('/paddlespeech/tts/streaming')
async def websocket_endpoint(websocket: WebSocket):
    """PaddleSpeech Online TTS Server api

    Args:
        websocket (WebSocket): the websocket instance
    """
    # 1. the interface waits to accept the websocket protocol header;
    #    only after we receive the header is the connection established with a specific thread
    await websocket.accept()

    # 2. once the websocket headers are accepted, get the online tts engine instance
    engine_pool = get_engine_pool()
    tts_engine = engine_pool['tts']
    connection_handler = None

    if tts_engine.engine_type == "online":
        from paddlespeech.server.engine.tts.online.python.tts_engine import PaddleTTSConnectionHandler
    elif tts_engine.engine_type == "online-onnx":
        from paddlespeech.server.engine.tts.online.onnx.tts_engine import PaddleTTSConnectionHandler
    else:
        logger.error("Online tts engine only support online or online-onnx.")
        sys.exit(-1)

    try:
        while True:
            # careful here, changed the source code from starlette.websockets
            assert websocket.application_state == WebSocketState.CONNECTED
            message = await websocket.receive()
            websocket._raise_on_disconnect(message)
            message = json.loads(message["text"])

            # .... irrelevant code omitted ....

            # speech synthesis request
            elif 'text' in message:
                text = message["text"]
                spk_id = message["spk_id"]
                # generate the wav data chunk by chunk
                wav_generator = connection_handler.run(
                    sentence=text, spk_id=spk_id)

                while True:
                    try:
                        tts_results = next(wav_generator)  # fetch the next synthesis result
                        resp = {"status": 1, "audio": tts_results}  # write the base64 string to the socket
                        await websocket.send_json(resp)
                    except StopIteration as e:
                        resp = {"status": 2, "audio": ''}
                        await websocket.send_json(resp)
                        logger.info(
                            "Complete the synthesis of the audio streams")
                        break
                    except Exception as e:
                        resp = {"status": -1, "audio": ''}
                        await websocket.send_json(resp)
                        break
            else:
                logger.error(
                    "Invalid request, please check if the request is correct.")
    except Exception as e:
        logger.error(e)
```
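Each frame the endpoint sends is therefore a JSON object of the form {"status": 1, "audio": "&lt;base64 PCM chunk&gt;"}, and a frame with status 2 marks the end of the stream. Below is a minimal, illustrative Java client that collects the chunks into one PCM buffer. The server address, the start/end handshake handled by the code omitted above (which is skipped here), and the naive string-based JSON parsing are all assumptions for the sketch; real code should use a proper JSON library:

```java
import java.io.ByteArrayOutputStream;
import java.net.URI;
import java.net.http.HttpClient;
import java.net.http.WebSocket;
import java.util.Base64;
import java.util.concurrent.CompletableFuture;
import java.util.concurrent.CompletionStage;

public class TtsStreamingClient {
    public static void main(String[] args) throws Exception {
        ByteArrayOutputStream pcm = new ByteArrayOutputStream();
        CompletableFuture<Void> done = new CompletableFuture<>();

        WebSocket.Listener listener = new WebSocket.Listener() {
            private final StringBuilder buf = new StringBuilder();

            @Override
            public CompletionStage<?> onText(WebSocket ws, CharSequence data, boolean last) {
                buf.append(data); // a message may arrive in fragments
                if (last) {
                    String msg = buf.toString();
                    buf.setLength(0);
                    if (msg.contains("\"status\": 2") || msg.contains("\"status\":2")) {
                        done.complete(null); // end of the audio stream
                    } else {
                        // naive extraction of the "audio" field (assumption: no escapes in base64)
                        int key = msg.indexOf("\"audio\":");
                        int start = msg.indexOf('"', key + 8) + 1;
                        int end = msg.indexOf('"', start);
                        pcm.writeBytes(Base64.getDecoder().decode(msg.substring(start, end)));
                    }
                }
                ws.request(1); // ask for the next frame
                return null;
            }
        };

        // hypothetical local server address for illustration
        WebSocket ws = HttpClient.newHttpClient().newWebSocketBuilder()
                .buildAsync(URI.create("ws://127.0.0.1:8092/paddlespeech/tts/streaming"), listener)
                .join();
        ws.sendText("{\"text\": \"你好\", \"spk_id\": 0}", true);
        done.join();
        ws.sendClose(WebSocket.NORMAL_CLOSURE, "done").join();
        // the accumulated bytes can now be passed to writeWavHeader from the first example
        System.out.println("received " + pcm.size() + " PCM bytes");
    }
}
```

Once done completes, wrapping the accumulated bytes with writeWavHeader yields the playable WAV file.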
```python
def run(
        self,
        sentence: str,
        spk_id: int=0, ):
    """ run include inference and postprocess.

    Args:
        sentence (str): text to be synthesized
        spk_id (int, optional): speaker id for multi-speaker speech synthesis. Defaults to 0.

    Returns:
        wav_base64: The base64 format of the synthesized audio.
    """
    wav_list = []

    for wav in self.infer(
            text=sentence,
            lang=self.config.lang,
            am=self.config.am,
            spk_id=spk_id, ):
        # wav type: <class 'numpy.ndarray'> float32, convert to pcm (base64)
        wav = float2pcm(wav)  # convert the float wav samples to raw PCM
        wav_bytes = wav.tobytes()  # to bytes
        wav_base64 = base64.b64encode(wav_bytes).decode('utf8')  # the base64 string is encoded from PCM data, not from a WAV file
        wav_list.append(wav)

        yield wav_base64

    wav_all = np.concatenate(wav_list, axis=0)
    duration = len(wav_all) / self.tts_engine.sample_rate

    logger.info(f"sentence: {sentence}")
    logger.info(f"The durations of audio is: {duration} s")
    logger.info(f"first response time: {self.first_response_time} s")
    logger.info(f"final response time: {self.final_response_time} s")
    logger.info(f"RTF: {self.final_response_time / duration}")
    logger.info(
        f"Other info: front time: {self.frontend_time} s, first am infer time: {self.first_am_infer} s, first voc infer time: {self.first_voc_infer} s,"
    )
```
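The WAV container is lost in float2pcm: the float32 samples are rescaled to 16-bit integers, and only those raw samples are base64-encoded. A rough Java equivalent of that conversion, as a sketch assuming float samples in [-1, 1] and 16-bit little-endian output (the clamping and scaling constant are typical choices, not necessarily the exact PaddleSpeech implementation):

```java
import java.nio.ByteBuffer;
import java.nio.ByteOrder;

public class Float2Pcm {
    /** Convert float samples in [-1, 1] to 16-bit little-endian PCM bytes. */
    static byte[] float2pcm(float[] samples) {
        ByteBuffer buf = ByteBuffer.allocate(samples.length * 2)
                                   .order(ByteOrder.LITTLE_ENDIAN);
        for (float s : samples) {
            float clamped = Math.max(-1f, Math.min(1f, s)); // guard against overflow
            buf.putShort((short) (clamped * 32767));        // scale to the int16 range
        }
        return buf.array();
    }

    public static void main(String[] args) {
        byte[] pcm = float2pcm(new float[] {0f, 0.5f, -0.5f, 1f});
        System.out.println(pcm.length + " bytes"); // 8 bytes: 4 samples * 2 bytes each
    }
}
```

This is exactly why the client has to prepend a WAV header: the bytes on the wire carry no sample rate, channel count, or bit depth of their own.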
Summary
The streaming TTS endpoint base64-encodes raw PCM samples rather than a complete WAV file, which is why the decoded bytes cannot be played until a WAV header is added on the client side.
Reference: https://github.com/PaddlePaddle/PaddleSpeech/issues/3106
Thanks to the community members who shared this knowledge.