When performing TTS over the WebSocket API, the base64-encoded audio returned in the result cannot be played directly.
Solution
- Decode the base64 string into binary data (at this point the content is raw PCM, not a playable file)
- Wrap the binary data from the previous step in a WAV container to produce a playable audio file
Java implementation example:
```java
import java.io.FileOutputStream;
import java.io.IOException;
import java.util.Base64;

public class PcmToWavConverter {

    public static void main(String[] args) throws IOException {
        // base64 audio returned by the streaming TTS API (truncated here)
        String base64Data = "BAACAAQAAgABAAIAAQACAAEAAQABAAIAAQABAAAAAQAAAAAAAAAAAAEAAAABAAAAAAAAAAEAAAAAAAAAAAABAAAAAAABAAAAAgAAAAAAAQAAAAEAAQAAAAEAAgABAAEAAAABAAIAAQABAAIAAQAAAAAAAAAAAAAA///+//7//v/9//7/AAAAAP/+//7+//7//v///wAA/wAA//8AAP///v/+AAD//wAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAQAAAAAAAQABAAEAAAAAAAAAAAD//wAAAQACAAIAAgACAAIAAwACAAIAAwADAAIAAgADAAMAAwACAAMABQADAAMAAwAEA....";
        // Base64 decode
        byte[] pcmBytes = Base64.getDecoder().decode(base64Data);
        // WAV file parameters
        int sampleRate = 24000; // sample rate
        int channels = 1;       // mono
        int bitDepth = 16;      // bits per sample
        String outputFilePath = "C:\\Users\\Administrator\\Desktop\\output1-3.wav";
        try (FileOutputStream fos = new FileOutputStream(outputFilePath)) {
            // write the WAV header
            writeWavHeader(fos, pcmBytes.length, sampleRate, channels, bitDepth);
            // write the PCM data
            fos.write(pcmBytes);
            System.out.println("WAV file generated: " + outputFilePath);
        } catch (IOException e) {
            e.printStackTrace();
        }
    }

    /**
     * Write the WAV file header.
     *
     * @param fos        file output stream
     * @param dataLength PCM data length in bytes
     * @param sampleRate sample rate
     * @param channels   number of channels
     * @param bitDepth   bits per sample
     * @throws IOException if writing fails
     */
    private static void writeWavHeader(FileOutputStream fos, int dataLength, int sampleRate, int channels, int bitDepth) throws IOException {
        // WAV header size (44 bytes)
        int headerSize = 44;
        // total file size (header + PCM data)
        int fileSize = headerSize + dataLength;
        // byte rate (bytes per second)
        int byteRate = sampleRate * channels * bitDepth / 8;
        // block align (bytes per sample frame)
        int blockAlign = channels * bitDepth / 8;
        // write the header fields
        fos.write("RIFF".getBytes()); // Chunk ID
        writeInt(fos, fileSize - 8);  // Chunk Size
        fos.write("WAVE".getBytes()); // Format
        fos.write("fmt ".getBytes()); // Subchunk1 ID
        writeInt(fos, 16);            // Subchunk1 Size (16 for PCM)
        writeShort(fos, 1);           // Audio Format (1 for PCM)
        writeShort(fos, channels);    // Number of Channels
        writeInt(fos, sampleRate);    // Sample Rate
        writeInt(fos, byteRate);      // Byte Rate
        writeShort(fos, blockAlign);  // Block Align
        writeShort(fos, bitDepth);    // Bits Per Sample
        fos.write("data".getBytes()); // Subchunk2 ID
        writeInt(fos, dataLength);    // Subchunk2 Size
    }

    /**
     * Write a 32-bit integer (little-endian).
     */
    private static void writeInt(FileOutputStream fos, int value) throws IOException {
        fos.write(value & 0xFF);
        fos.write((value >> 8) & 0xFF);
        fos.write((value >> 16) & 0xFF);
        fos.write((value >> 24) & 0xFF);
    }

    /**
     * Write a 16-bit short integer (little-endian).
     */
    private static void writeShort(FileOutputStream fos, int value) throws IOException {
        fos.write(value & 0xFF);
        fos.write((value >> 8) & 0xFF);
    }
}
```
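To sanity-check the generated header, the file can be read back with the standard javax.sound.sampled API. A minimal sketch, assuming the same output path as the example above:

```java
import javax.sound.sampled.AudioFileFormat;
import javax.sound.sampled.AudioSystem;
import java.io.File;

public class WavCheck {
    public static void main(String[] args) throws Exception {
        // parse the header of the generated file and print the audio format
        AudioFileFormat fileFormat = AudioSystem.getAudioFileFormat(
                new File("C:\\Users\\Administrator\\Desktop\\output1-3.wav"));
        // expected output: PCM_SIGNED 24000.0 Hz, 16 bit, mono, little-endian
        System.out.println(fileFormat.getFormat());
    }
}
```

If the header is malformed, getAudioFileFormat throws an UnsupportedAudioFileException instead of printing the format.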
Digging into the root cause
Source code
- Key code of the API endpoint (in the TTSServerExecutor class)
```python
@router.websocket('/paddlespeech/tts/streaming')
async def websocket_endpoint(websocket: WebSocket):
    """PaddleSpeech Online TTS Server api

    Args:
        websocket (WebSocket): the websocket instance
    """
    # 1. the interface waits to accept the websocket protocol header;
    #    only after we receive the header is the connection established with a specific thread
    await websocket.accept()

    # 2. once the websocket headers are accepted, get the online tts engine instance
    engine_pool = get_engine_pool()
    tts_engine = engine_pool['tts']
    connection_handler = None

    if tts_engine.engine_type == "online":
        from paddlespeech.server.engine.tts.online.python.tts_engine import PaddleTTSConnectionHandler
    elif tts_engine.engine_type == "online-onnx":
        from paddlespeech.server.engine.tts.online.onnx.tts_engine import PaddleTTSConnectionHandler
    else:
        logger.error("Online tts engine only support online or online-onnx.")
        sys.exit(-1)

    try:
        while True:
            # careful here, changed the source code from starlette.websockets
            assert websocket.application_state == WebSocketState.CONNECTED
            message = await websocket.receive()
            websocket._raise_on_disconnect(message)
            message = json.loads(message["text"])

            # .... irrelevant code omitted ....

            # speech synthesis request
            elif 'text' in message:
                text = message["text"]
                spk_id = message["spk_id"]
                # generate the wav data chunk by chunk
                wav_generator = connection_handler.run(
                    sentence=text, spk_id=spk_id)

                while True:
                    try:
                        tts_results = next(wav_generator)  # fetch the next synthesis result
                        resp = {"status": 1, "audio": tts_results}  # write the base64 string to the socket
                        await websocket.send_json(resp)
                    except StopIteration as e:
                        resp = {"status": 2, "audio": ''}
                        await websocket.send_json(resp)
                        logger.info(
                            "Complete the synthesis of the audio streams")
                        break
                    except Exception as e:
                        resp = {"status": -1, "audio": ''}
                        await websocket.send_json(resp)
                        break
            else:
                logger.error(
                    "Invalid request, please check if the request is correct.")
    except Exception as e:
        logger.error(e)
```
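Each frame the endpoint sends is therefore a JSON object of the form {"status": 1, "audio": "&lt;base64 PCM chunk&gt;"}, and a frame with status 2 marks the end of the stream. Below is a minimal, illustrative Java client that collects the chunks into one PCM buffer. The server address, the start/end handshake handled by the code omitted above (which is skipped here), and the naive string-based JSON parsing are all assumptions for the sketch; real code should use a proper JSON library:

```java
import java.io.ByteArrayOutputStream;
import java.net.URI;
import java.net.http.HttpClient;
import java.net.http.WebSocket;
import java.util.Base64;
import java.util.concurrent.CompletableFuture;
import java.util.concurrent.CompletionStage;

public class TtsStreamingClient {
    public static void main(String[] args) throws Exception {
        ByteArrayOutputStream pcm = new ByteArrayOutputStream();
        CompletableFuture<Void> done = new CompletableFuture<>();

        WebSocket.Listener listener = new WebSocket.Listener() {
            private final StringBuilder buf = new StringBuilder();

            @Override
            public CompletionStage<?> onText(WebSocket ws, CharSequence data, boolean last) {
                buf.append(data); // a message may arrive in fragments
                if (last) {
                    String msg = buf.toString();
                    buf.setLength(0);
                    if (msg.contains("\"status\": 2") || msg.contains("\"status\":2")) {
                        done.complete(null); // end of the audio stream
                    } else {
                        // naive extraction of the "audio" field (assumption: no escapes in base64)
                        int key = msg.indexOf("\"audio\":");
                        int start = msg.indexOf('"', key + 8) + 1;
                        int end = msg.indexOf('"', start);
                        pcm.writeBytes(Base64.getDecoder().decode(msg.substring(start, end)));
                    }
                }
                ws.request(1); // ask for the next frame
                return null;
            }
        };

        // hypothetical local server address for illustration
        WebSocket ws = HttpClient.newHttpClient().newWebSocketBuilder()
                .buildAsync(URI.create("ws://127.0.0.1:8092/paddlespeech/tts/streaming"), listener)
                .join();
        ws.sendText("{\"text\": \"你好\", \"spk_id\": 0}", true);
        done.join();
        ws.sendClose(WebSocket.NORMAL_CLOSURE, "done").join();
        // the accumulated bytes can now be passed to writeWavHeader from the first example
        System.out.println("received " + pcm.size() + " PCM bytes");
    }
}
```

Once done completes, wrapping the accumulated bytes with writeWavHeader yields the playable WAV file.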
```python
def run(
        self,
        sentence: str,
        spk_id: int=0, ):
    """ run include inference and postprocess.

    Args:
        sentence (str): text to be synthesized
        spk_id (int, optional): speaker id for multi-speaker speech synthesis. Defaults to 0.

    Returns:
        wav_base64: The base64 format of the synthesized audio.
    """
    wav_list = []

    for wav in self.infer(
            text=sentence,
            lang=self.config.lang,
            am=self.config.am,
            spk_id=spk_id, ):
        # wav type: <class 'numpy.ndarray'> float32, convert to pcm (base64)
        wav = float2pcm(wav)  # convert the float wav samples to raw PCM
        wav_bytes = wav.tobytes()  # to bytes
        wav_base64 = base64.b64encode(wav_bytes).decode('utf8')  # the base64 string is encoded from PCM data, not from a WAV file
        wav_list.append(wav)

        yield wav_base64

    wav_all = np.concatenate(wav_list, axis=0)
    duration = len(wav_all) / self.tts_engine.sample_rate

    logger.info(f"sentence: {sentence}")
    logger.info(f"The durations of audio is: {duration} s")
    logger.info(f"first response time: {self.first_response_time} s")
    logger.info(f"final response time: {self.final_response_time} s")
    logger.info(f"RTF: {self.final_response_time / duration}")
    logger.info(
        f"Other info: front time: {self.frontend_time} s, first am infer time: {self.first_am_infer} s, first voc infer time: {self.first_voc_infer} s,"
    )
```
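The WAV container is lost in float2pcm: the float32 samples are rescaled to 16-bit integers, and only those raw samples are base64-encoded. A rough Java equivalent of that conversion, as a sketch assuming float samples in [-1, 1] and 16-bit little-endian output (the clamping and scaling constant are typical choices, not necessarily the exact PaddleSpeech implementation):

```java
import java.nio.ByteBuffer;
import java.nio.ByteOrder;

public class Float2Pcm {
    /** Convert float samples in [-1, 1] to 16-bit little-endian PCM bytes. */
    static byte[] float2pcm(float[] samples) {
        ByteBuffer buf = ByteBuffer.allocate(samples.length * 2)
                                   .order(ByteOrder.LITTLE_ENDIAN);
        for (float s : samples) {
            float clamped = Math.max(-1f, Math.min(1f, s)); // guard against overflow
            buf.putShort((short) (clamped * 32767));        // scale to the int16 range
        }
        return buf.array();
    }

    public static void main(String[] args) {
        byte[] pcm = float2pcm(new float[] {0f, 0.5f, -0.5f, 1f});
        System.out.println(pcm.length + " bytes"); // 8 bytes: 4 samples * 2 bytes each
    }
}
```

This is exactly why the client has to prepend a WAV header: the bytes on the wire carry no sample rate, channel count, or bit depth of their own.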
Summary
The streaming TTS endpoint base64-encodes raw PCM samples rather than a complete WAV file, which is why the decoded bytes cannot be played until a WAV header is added on the client side.
Reference: https://github.com/PaddlePaddle/PaddleSpeech/issues/3106
Thanks to the community members who shared this knowledge.