Unity之使用火山引擎实现流式语音合成|优化版本

初始版本

Unity之使用火山引擎实现流式语音合成

优化版本

优化内容:

  1. 初始版本使用NativeWebSocket插件,无法读取握手响应头中的X-Tt-Logid;优化版本新增基于WebSocketSharp插件的实现来解决这个问题
  2. 提取两个插件实现的共性为抽象基类BaseTextToSpeech,可依据需要替换具体实现

示例

通信类

websocket通信基类

csharp 复制代码
using UnityEngine;
using System;
/// <summary>
/// Abstract base for text-to-speech clients. Subclasses implement the
/// connection and session operations with a concrete WebSocket plugin and
/// report progress by calling the protected <c>Invoke*</c> helpers, which
/// raise the public events declared here.
/// </summary>
public abstract class BaseTextToSpeech : MonoBehaviour
{
    /// <summary>Raised through <see cref="InvokeConnectWeb"/>; the implementations in this file call it when the WebSocket opens.</summary>
    public event Action OnConnectWeb;

    /// <summary>Raised through <see cref="InvokeConnectTTSSucess"/>; the implementations in this file call it on the ConnectionStarted event.</summary>
    public event Action OnConnectTTSSucess;

    /// <summary>Raised through <see cref="InvokeCreateSessionSucess"/>; the implementations in this file call it on the SessionStarted event.</summary>
    public event Action OnCreateSessionSucess;

    /// <summary>Raised through <see cref="InvokeSessionFinished"/>; the implementations in this file call it on the SessionFinished event.</summary>
    public event Action OnSessionFinished;

    /// <summary>Raised through <see cref="InvokeSentenceStart"/>; the implementations in this file call it on the TTSSentenceStart event.</summary>
    public event Action OnSentenceStart;

    /// <summary>Raised through <see cref="InvokeSentenceEnd"/>; the implementations in this file call it on the TTSSentenceEnd event.</summary>
    public event Action OnSentenceEnd;

    /// <summary>Raised through <see cref="InvokeReceiveAudio"/> with the audio bytes handed over by the subclass.</summary>
    public event Action<byte[]> OnReceiveAudio;

    /// <summary>Opens the WebSocket connection.</summary>
    public abstract void WebSocketConnect();

    /// <summary>Closes the WebSocket connection.</summary>
    public abstract void WebSocketDisconnect();

    /// <summary>Asks the service to start a TTS connection.</summary>
    public abstract void BeginConnectTTS();

    /// <summary>Asks the service to start a session.</summary>
    public abstract void BeginSession();

    /// <summary>Sends text to be synthesized within the current session.</summary>
    /// <param name="info">Text to synthesize.</param>
    public abstract void Session(string info);

    /// <summary>Asks the service to finish the current session.</summary>
    public abstract void EndSession();

    /// <summary>Asks the service to finish the TTS connection.</summary>
    public abstract void EndConnectTTS();

    /// <summary>Raises <see cref="OnConnectWeb"/> if anyone is subscribed.</summary>
    protected void InvokeConnectWeb() => OnConnectWeb?.Invoke();

    /// <summary>Raises <see cref="OnConnectTTSSucess"/> if anyone is subscribed.</summary>
    protected void InvokeConnectTTSSucess() => OnConnectTTSSucess?.Invoke();

    /// <summary>Raises <see cref="OnCreateSessionSucess"/> if anyone is subscribed.</summary>
    protected void InvokeCreateSessionSucess() => OnCreateSessionSucess?.Invoke();

    /// <summary>Raises <see cref="OnSessionFinished"/> if anyone is subscribed.</summary>
    protected void InvokeSessionFinished() => OnSessionFinished?.Invoke();

    /// <summary>Raises <see cref="OnSentenceStart"/> if anyone is subscribed.</summary>
    protected void InvokeSentenceStart() => OnSentenceStart?.Invoke();

    /// <summary>Raises <see cref="OnSentenceEnd"/> if anyone is subscribed.</summary>
    protected void InvokeSentenceEnd() => OnSentenceEnd?.Invoke();

    /// <summary>Raises <see cref="OnReceiveAudio"/> with <paramref name="data"/> if anyone is subscribed.</summary>
    /// <param name="data">Audio bytes to forward to subscribers.</param>
    protected void InvokeReceiveAudio(byte[] data) => OnReceiveAudio?.Invoke(data);
}

使用NativeWebSocket插件

csharp 复制代码
using UnityEngine;
using NativeWebSocket;
using System;
using System.Collections.Generic;
/// <summary>
/// Text-to-speech client built on the NativeWebSocket plugin.
/// Sends connection/session events and text to the bidirectional TTS endpoint
/// and forwards server events and audio payloads through the
/// <see cref="BaseTextToSpeech"/> events.
/// </summary>
public class TextToSpeech : BaseTextToSpeech
{
    // Endpoint the WebSocket connects to.
    string wss = "wss://openspeech.bytedance.com/api/v3/tts/bidirection";
    // Values sent as handshake headers (the first two are empty in this listing).
    string XApiAppId = "";
    string XApiAccessKey = "";
    string XApiResourceId = "seed-tts-2.0";
    [Header("发音人")]
    [SerializeField] string speaker = "zh_female_vv_uranus_bigtts";
    [Header("采样率")]
    [SerializeField] int sampleRate = 24000;

    WebSocket webSocket;
    // Id generated by the most recent BeginSession call; null before the first one.
    string currentSessionId = null;

    /// <summary>
    /// Pumps messages received by the socket so that the OnMessage handler runs.
    /// </summary>
    void Update()
    {
#if !UNITY_WEBGL || UNITY_EDITOR
        // Outside WebGL, NativeWebSocket only queues incoming messages; they are
        // delivered to OnMessage when DispatchMessageQueue is called.
        webSocket?.DispatchMessageQueue();
#endif
    }

    /// <summary>
    /// Creates the socket with the authentication headers, registers the
    /// callbacks and starts connecting. A fresh connect id is generated per call.
    /// </summary>
    public override async void WebSocketConnect()
    {
        var headers = new Dictionary<string, string>
        {
            { "X-Api-App-Key", XApiAppId },
            { "X-Api-Access-Key", XApiAccessKey },
            { "X-Api-Resource-Id", XApiResourceId },
            { "X-Api-Connect-Id", Guid.NewGuid().ToString() },
        };

        var socket = new WebSocket(wss, headers);
        socket.OnError += OnWebSocketError;
        socket.OnClose += OnWebSocketClose;
        socket.OnOpen += OnWebSocketOpen;
        socket.OnMessage += OnReceiveWebSocketMessage;
        webSocket = socket;

        try
        {
            await socket.Connect();
        }
        catch (Exception e)
        {
            // async void: an exception escaping here could not be observed by the caller.
            Debug.LogError("错误信息:" + e.Message);
        }
    }

    /// <summary>
    /// Unregisters the callbacks and closes the socket, if there is one.
    /// </summary>
    public override async void WebSocketDisconnect()
    {
        // Clear the field before awaiting so that a socket created by a
        // WebSocketConnect call made during the close is not wiped afterwards.
        var socket = webSocket;
        webSocket = null;
        if (socket == null)
        {
            return;
        }

        socket.OnError -= OnWebSocketError;
        socket.OnClose -= OnWebSocketClose;
        socket.OnOpen -= OnWebSocketOpen;
        socket.OnMessage -= OnReceiveWebSocketMessage;
        try
        {
            await socket.Close();
        }
        catch (Exception e)
        {
            Debug.LogError(e.Message);
        }
    }

    /// <summary>Sends the StartConnection event with an empty JSON payload.</summary>
    public override void BeginConnectTTS()
    {
        SendEvent(Speech.Protocols.EventType.StartConnection, EmptyJsonPayload(), false);
    }

    /// <summary>
    /// Generates a new session id and sends the StartSession event carrying the
    /// speaker and audio parameters.
    /// </summary>
    public override void BeginSession()
    {
        currentSessionId = Guid.NewGuid().ToString();
        var payload = new SessionPayload
        {
            user = new UserInfo { uid = Guid.NewGuid().ToString() },
            @event = (int)Speech.Protocols.EventType.StartSession,
            req_params = new ReqParams
            {
                speaker = speaker,
                text = "",
                audio_params = new AudioParams
                {
                    format = "pcm",
                    sample_rate = sampleRate
                },
            }
        };
        SendEvent(Speech.Protocols.EventType.StartSession, ToJsonBytes(payload), true);
    }

    /// <summary>Sends a TaskRequest event carrying the text to synthesize.</summary>
    /// <param name="info">Text to synthesize.</param>
    public override void Session(string info)
    {
        var taskRequest = new SessionPayload
        {
            user = new UserInfo { uid = Guid.NewGuid().ToString() },
            @event = (int)Speech.Protocols.EventType.TaskRequest,
            req_params = new ReqParams
            {
                text = info,
                audio_params = new AudioParams
                {
                    format = "pcm",
                    // Use the configured rate so it matches the one sent by BeginSession.
                    sample_rate = sampleRate
                },
                addtions = new Addtions
                {
                    disable_markdown_filter = true,
                },
            }
        };
        SendEvent(Speech.Protocols.EventType.TaskRequest, ToJsonBytes(taskRequest), true);
    }

    /// <summary>Sends the FinishSession event for the current session id.</summary>
    public override void EndSession()
    {
        SendEvent(Speech.Protocols.EventType.FinishSession, EmptyJsonPayload(), true);
    }

    /// <summary>Sends the FinishConnection event with an empty JSON payload.</summary>
    public override void EndConnectTTS()
    {
        SendEvent(Speech.Protocols.EventType.FinishConnection, EmptyJsonPayload(), false);
    }

    /// <summary>Returns the UTF-8 bytes of an empty JSON object.</summary>
    static byte[] EmptyJsonPayload()
    {
        return System.Text.Encoding.UTF8.GetBytes("{}");
    }

    /// <summary>Serializes <paramref name="payload"/> with JsonUtility and returns the UTF-8 bytes.</summary>
    static byte[] ToJsonBytes(SessionPayload payload)
    {
        return System.Text.Encoding.UTF8.GetBytes(JsonUtility.ToJson(payload));
    }

    /// <summary>
    /// Builds a FullClientRequest message for <paramref name="eventType"/> and sends it.
    /// </summary>
    /// <param name="eventType">Event carried by the message.</param>
    /// <param name="payload">Message payload bytes.</param>
    /// <param name="withSession">When true, the current session id is attached.</param>
    void SendEvent(Speech.Protocols.EventType eventType, byte[] payload, bool withSession)
    {
        var msg = new Speech.Protocols.Message
        {
            MsgType = Speech.Protocols.MsgType.FullClientRequest,
            MsgTypeFlag = Speech.Protocols.MsgTypeFlagBits.WithEvent,
            EventType = eventType,
            Payload = payload
        };
        if (withSession)
        {
            msg.SessionId = currentSessionId;
        }
        Send(msg.Marshal());
    }

    void OnWebSocketError(string info)
    {
        Debug.LogError("错误信息:" + info);
    }

    void OnWebSocketClose(WebSocketCloseCode closeCode)
    {
        Debug.Log("关闭:" + closeCode);
    }

    void OnWebSocketOpen()
    {
        InvokeConnectWeb();
    }

    /// <summary>
    /// Parses a received frame, forwards audio payloads, logs errors and raises
    /// the event that matches the server event type. Any exception is logged
    /// and swallowed.
    /// </summary>
    /// <param name="data">Raw bytes of the received message.</param>
    void OnReceiveWebSocketMessage(byte[] data)
    {
        try
        {
            var msg = Speech.Protocols.Message.FromBytes(data);
            var payload = msg.Payload;
            // Read the length null-safely: a missing payload must not abort event dispatch.
            int payloadLength = payload != null ? payload.Length : 0;
            Debug.Log($"接收语音合成消息:{msg.MsgType},{msg.EventType},{payloadLength}");

            // Audio frame: forward the bytes to subscribers.
            if (msg.MsgType == Speech.Protocols.MsgType.AudioOnlyServer)
            {
                Debug.Log("接收到音频消息");
                if (payloadLength > 0)
                {
                    Debug.Log("音频数据回调");
                    InvokeReceiveAudio(payload);
                }
            }

            if (msg.MsgType == Speech.Protocols.MsgType.Error)
            {
                Debug.LogError($"{msg.EventType}: {DescribeFailure(payload)}");
            }

            switch (msg.EventType)
            {
                case Speech.Protocols.EventType.ConnectionStarted:
                    // Connection established; StartSession may be sent now.
                    InvokeConnectTTSSucess();
                    break;
                case Speech.Protocols.EventType.SessionStarted:
                    // Session started; TaskRequest may be sent now.
                    InvokeCreateSessionSucess();
                    break;
                case Speech.Protocols.EventType.SessionFinished:
                    InvokeSessionFinished();
                    break;
                case Speech.Protocols.EventType.TTSSentenceStart:
                    InvokeSentenceStart();
                    break;
                case Speech.Protocols.EventType.TTSSentenceEnd:
                    InvokeSentenceEnd();
                    break;
                case Speech.Protocols.EventType.ConnectionFailed:
                case Speech.Protocols.EventType.SessionFailed:
                    Debug.LogError($"{msg.EventType}: {DescribeFailure(payload)}");
                    break;
            }
        }
        catch (Exception e)
        {
            Debug.LogError("接受消息:" + e.Message);
        }
    }

    /// <summary>Decodes a failure payload as UTF-8 text, or returns a fallback when it is empty.</summary>
    static string DescribeFailure(byte[] payload)
    {
        return payload != null && payload.Length > 0
            ? System.Text.Encoding.UTF8.GetString(payload)
            : "未知错误";
    }

    /// <summary>
    /// Sends <paramref name="bytes"/> if the socket exists and is open;
    /// otherwise logs a warning and drops the message.
    /// </summary>
    async void Send(byte[] bytes)
    {
        var socket = webSocket;
        if (socket == null || socket.State != WebSocketState.Open)
        {
            Debug.LogWarning("WebSocket未连接,消息未发送");
            return;
        }

        try
        {
            await socket.Send(bytes);
        }
        catch (Exception e)
        {
            // async void: an exception escaping here could not be observed by the caller.
            Debug.LogError("错误信息:" + e.Message);
        }
    }

    /// <summary>JSON payload used for the StartSession and TaskRequest events.</summary>
    [System.Serializable]
    public class SessionPayload
    {
        public UserInfo user;
        public int @event;
        public ReqParams req_params;
    }

    /// <summary>User block of the payload.</summary>
    [System.Serializable]
    public class UserInfo
    {
        public string uid;
    }

    /// <summary>Request parameters block of the payload.</summary>
    [System.Serializable]
    public class ReqParams
    {
        public string speaker;
        public string text;                // Optional; may be left unset or empty.
        public AudioParams audio_params;
        // NOTE(review): JsonUtility emits this field under the key "addtions".
        // Confirm against the service documentation whether the expected key is
        // "additions" (and whether it must be a JSON-encoded string).
        public Addtions addtions;
    }

    /// <summary>Audio format block of the payload.</summary>
    [System.Serializable]
    public class AudioParams
    {
        public string format;
        public int sample_rate;
    }

    /// <summary>Extra options block of the payload.</summary>
    [System.Serializable]
    public class Addtions
    {
        public bool disable_markdown_filter = true;
    }
}

使用WebSocketSharp插件

csharp 复制代码
using UnityEngine;
using System;
using WebSocketSharp;
/// <summary>
/// Text-to-speech client built on the websocket-sharp plugin.
/// Sends connection/session events and text to the bidirectional TTS endpoint,
/// logs the X-Tt-Logid handshake response header, and forwards server events
/// and audio payloads through the <see cref="BaseTextToSpeech"/> events.
/// </summary>
/// <remarks>
/// NOTE(review): websocket-sharp appears to raise OnMessage on a worker thread
/// rather than on Unity's main thread — confirm that subscribers of the base
/// class events are safe to run off the main thread.
/// </remarks>
public class TextToSpeechSharp : BaseTextToSpeech
{
    // Endpoint the WebSocket connects to.
    string wss = "wss://openspeech.bytedance.com/api/v3/tts/bidirection";
    // Values sent as handshake headers (the first two are empty in this listing).
    string XApiAppId = "";
    string XApiAccessKey = "";
    string XApiResourceId = "seed-tts-2.0";
    [Header("发音人")]
    [SerializeField] string speaker = "zh_female_vv_uranus_bigtts";
    [Header("采样率")]
    [SerializeField] int sampleRate = 24000;

    WebSocket webSocket;
    // Id generated by the most recent BeginSession call; null before the first one.
    string currentSessionId = null;

    /// <summary>
    /// Creates the socket, sets the authentication headers, registers the
    /// callbacks and connects. A fresh connect id is generated per call.
    /// </summary>
    public override void WebSocketConnect()
    {
        var socket = new WebSocket(wss);

        // Handshake request headers.
        socket.SetUserHeader("X-Api-App-Key", XApiAppId);
        socket.SetUserHeader("X-Api-Access-Key", XApiAccessKey);
        socket.SetUserHeader("X-Api-Resource-Id", XApiResourceId);
        socket.SetUserHeader("X-Api-Connect-Id", Guid.NewGuid().ToString());

        socket.OnError += OnWebSocketError;
        socket.OnClose += OnWebSocketClose;
        socket.OnOpen += OnWebSocketOpen;
        socket.OnMessage += OnReceiveWebSocketMessage;

        // Publish the field before connecting: OnWebSocketOpen reads it.
        webSocket = socket;
        socket.Connect();
    }

    /// <summary>
    /// Unregisters the callbacks and closes the socket, if there is one.
    /// </summary>
    public override void WebSocketDisconnect()
    {
        var socket = webSocket;
        webSocket = null;
        if (socket == null)
        {
            return;
        }

        socket.OnError -= OnWebSocketError;
        socket.OnClose -= OnWebSocketClose;
        socket.OnOpen -= OnWebSocketOpen;
        socket.OnMessage -= OnReceiveWebSocketMessage;
        try
        {
            socket.Close();
        }
        catch (Exception e)
        {
            Debug.LogError(e.Message);
        }
    }

    /// <summary>Sends the StartConnection event with an empty JSON payload.</summary>
    public override void BeginConnectTTS()
    {
        SendEvent(Speech.Protocols.EventType.StartConnection, EmptyJsonPayload(), false);
    }

    /// <summary>
    /// Generates a new session id and sends the StartSession event carrying the
    /// speaker and audio parameters.
    /// </summary>
    public override void BeginSession()
    {
        currentSessionId = Guid.NewGuid().ToString();
        var payload = new SessionPayload
        {
            user = new UserInfo { uid = Guid.NewGuid().ToString() },
            @event = (int)Speech.Protocols.EventType.StartSession,
            req_params = new ReqParams
            {
                speaker = speaker,
                text = "",
                audio_params = new AudioParams
                {
                    format = "pcm",
                    sample_rate = sampleRate
                },
            }
        };
        SendEvent(Speech.Protocols.EventType.StartSession, ToJsonBytes(payload), true);
    }

    /// <summary>Sends a TaskRequest event carrying the text to synthesize.</summary>
    /// <param name="info">Text to synthesize.</param>
    public override void Session(string info)
    {
        var taskRequest = new SessionPayload
        {
            user = new UserInfo { uid = Guid.NewGuid().ToString() },
            @event = (int)Speech.Protocols.EventType.TaskRequest,
            req_params = new ReqParams
            {
                text = info,
                audio_params = new AudioParams
                {
                    format = "pcm",
                    // Use the configured rate so it matches the one sent by BeginSession.
                    sample_rate = sampleRate
                },
                addtions = new Addtions
                {
                    disable_markdown_filter = true,
                },
            }
        };
        SendEvent(Speech.Protocols.EventType.TaskRequest, ToJsonBytes(taskRequest), true);
    }

    /// <summary>Sends the FinishSession event for the current session id.</summary>
    public override void EndSession()
    {
        SendEvent(Speech.Protocols.EventType.FinishSession, EmptyJsonPayload(), true);
    }

    /// <summary>Sends the FinishConnection event with an empty JSON payload.</summary>
    public override void EndConnectTTS()
    {
        SendEvent(Speech.Protocols.EventType.FinishConnection, EmptyJsonPayload(), false);
    }

    /// <summary>Returns the UTF-8 bytes of an empty JSON object.</summary>
    static byte[] EmptyJsonPayload()
    {
        return System.Text.Encoding.UTF8.GetBytes("{}");
    }

    /// <summary>Serializes <paramref name="payload"/> with JsonUtility and returns the UTF-8 bytes.</summary>
    static byte[] ToJsonBytes(SessionPayload payload)
    {
        return System.Text.Encoding.UTF8.GetBytes(JsonUtility.ToJson(payload));
    }

    /// <summary>
    /// Builds a FullClientRequest message for <paramref name="eventType"/> and sends it.
    /// </summary>
    /// <param name="eventType">Event carried by the message.</param>
    /// <param name="payload">Message payload bytes.</param>
    /// <param name="withSession">When true, the current session id is attached.</param>
    void SendEvent(Speech.Protocols.EventType eventType, byte[] payload, bool withSession)
    {
        var msg = new Speech.Protocols.Message
        {
            MsgType = Speech.Protocols.MsgType.FullClientRequest,
            MsgTypeFlag = Speech.Protocols.MsgTypeFlagBits.WithEvent,
            EventType = eventType,
            Payload = payload
        };
        if (withSession)
        {
            msg.SessionId = currentSessionId;
        }
        Send(msg.Marshal());
    }

    void OnWebSocketError(object sender, ErrorEventArgs args)
    {
        Debug.LogError("错误信息:" + args.Message);
    }

    void OnWebSocketClose(object sender, CloseEventArgs args)
    {
        Debug.Log("关闭:" + args.Code);
    }

    /// <summary>
    /// Logs the X-Tt-Logid handshake response header and raises the
    /// connected event.
    /// </summary>
    void OnWebSocketOpen(object sender, EventArgs args)
    {
        // Null-safe lookup: a missing socket or header collection must not
        // prevent InvokeConnectWeb from running.
        var headers = webSocket?.HandshakeResponseHeaders;
        Debug.Log(("X-Tt-Logid", headers?["X-Tt-Logid"]));
        InvokeConnectWeb();
    }

    /// <summary>
    /// Parses a received frame, forwards audio payloads, logs errors and raises
    /// the event that matches the server event type. Any exception is logged
    /// and swallowed.
    /// </summary>
    void OnReceiveWebSocketMessage(object sender, MessageEventArgs args)
    {
        try
        {
            var msg = Speech.Protocols.Message.FromBytes(args.RawData);
            var payload = msg.Payload;
            // Read the length null-safely: a missing payload must not abort event dispatch.
            int payloadLength = payload != null ? payload.Length : 0;
            Debug.Log($"接收语音合成消息:{msg.MsgType},{msg.EventType},{payloadLength}");

            // Audio frame: forward the bytes to subscribers.
            if (msg.MsgType == Speech.Protocols.MsgType.AudioOnlyServer)
            {
                Debug.Log("接收到音频消息");
                if (payloadLength > 0)
                {
                    Debug.Log("音频数据回调");
                    InvokeReceiveAudio(payload);
                }
            }

            if (msg.MsgType == Speech.Protocols.MsgType.Error)
            {
                Debug.LogError($"{msg.EventType}: {DescribeFailure(payload)}");
            }

            switch (msg.EventType)
            {
                case Speech.Protocols.EventType.ConnectionStarted:
                    // Connection established; StartSession may be sent now.
                    InvokeConnectTTSSucess();
                    break;
                case Speech.Protocols.EventType.SessionStarted:
                    // Session started; TaskRequest may be sent now.
                    InvokeCreateSessionSucess();
                    break;
                case Speech.Protocols.EventType.SessionFinished:
                    InvokeSessionFinished();
                    break;
                case Speech.Protocols.EventType.TTSSentenceStart:
                    InvokeSentenceStart();
                    break;
                case Speech.Protocols.EventType.TTSSentenceEnd:
                    InvokeSentenceEnd();
                    break;
                case Speech.Protocols.EventType.ConnectionFailed:
                case Speech.Protocols.EventType.SessionFailed:
                    Debug.LogError($"{msg.EventType}: {DescribeFailure(payload)}");
                    break;
            }
        }
        catch (Exception e)
        {
            Debug.LogError("接受消息:" + e.Message);
        }
    }

    /// <summary>Decodes a failure payload as UTF-8 text, or returns a fallback when it is empty.</summary>
    static string DescribeFailure(byte[] payload)
    {
        return payload != null && payload.Length > 0
            ? System.Text.Encoding.UTF8.GetString(payload)
            : "未知错误";
    }

    /// <summary>
    /// Sends <paramref name="bytes"/> if the socket exists and is open;
    /// otherwise logs a warning and drops the message.
    /// </summary>
    void Send(byte[] bytes)
    {
        var socket = webSocket;
        if (socket == null || socket.ReadyState != WebSocketState.Open)
        {
            Debug.LogWarning("WebSocket未连接,消息未发送");
            return;
        }

        try
        {
            socket.Send(bytes);
        }
        catch (Exception e)
        {
            // The state can change between the check above and the send.
            Debug.LogError("错误信息:" + e.Message);
        }
    }

    /// <summary>JSON payload used for the StartSession and TaskRequest events.</summary>
    [System.Serializable]
    public class SessionPayload
    {
        public UserInfo user;
        public int @event;
        public ReqParams req_params;
    }

    /// <summary>User block of the payload.</summary>
    [System.Serializable]
    public class UserInfo
    {
        public string uid;
    }

    /// <summary>Request parameters block of the payload.</summary>
    [System.Serializable]
    public class ReqParams
    {
        public string speaker;
        public string text;                // Optional; may be left unset or empty.
        public AudioParams audio_params;
        // NOTE(review): JsonUtility emits this field under the key "addtions".
        // Confirm against the service documentation whether the expected key is
        // "additions" (and whether it must be a JSON-encoded string).
        public Addtions addtions;
    }

    /// <summary>Audio format block of the payload.</summary>
    [System.Serializable]
    public class AudioParams
    {
        public string format;
        public int sample_rate;
    }

    /// <summary>Extra options block of the payload.</summary>
    [System.Serializable]
    public class Addtions
    {
        public bool disable_markdown_filter = true;
    }
}

语音合成流程类

连接TTS服务器成功后,不需要立即创建会话:会话建立后若数秒内没有发送文本,服务器将不再返回音频数据,因此改为在需要朗读文本时(调用ReadText)再按需创建会话。

csharp 复制代码
using System.Text;
using UnityEngine;

/// <summary>
/// Drives the speech-synthesis flow on top of a <see cref="BaseTextToSpeech"/>:
/// connects on Awake, creates a session on demand when text arrives, buffers
/// text until the session exists, feeds received audio to the stream player
/// and ends the session once requested.
/// </summary>
public class TextToSpeechOut : MonoBehaviour
{
    [Header("文字合成语音")]
    [SerializeField] BaseTextToSpeech textToSpeech;

    [Header("TTS音频流播放")]
    [SerializeField] TTSAudioStreamPlay tTSAudioStreamPlay;

    // True between the session-created and session-finished callbacks.
    bool hasSession = false;
    // True while a BeginSession request is outstanding.
    bool isCreateSession;
    // Text received while no session exists; flushed when the session is created.
    readonly StringBuilder sentenceBuffer = new StringBuilder();

    // True once audio has arrived in the current session.
    bool receivedAudioData = false;
    // True while an end-of-session request is waiting to be sent from Update.
    bool requestEndSession = false;
    // EndReadText was called while the session was still being created.
    bool endRequestedBeforeSession = false;
    // True once Awake has subscribed and started connecting.
    bool initialized = false;
    float endSessionTimer = 0;
    [Header("结束会话等待时间")]
    [SerializeField] float endSessionRequestTime = 2f;


    /// <summary>
    /// Creates the audio stream, subscribes to the TTS events and starts the
    /// WebSocket connection. Disables the component if a reference is missing.
    /// </summary>
    void Awake()
    {
        if (textToSpeech == null || tTSAudioStreamPlay == null)
        {
            Debug.LogError("TextToSpeechOut: textToSpeech / tTSAudioStreamPlay 未赋值");
            enabled = false;
            return;
        }

        tTSAudioStreamPlay.CreateAudioStream();
        textToSpeech.OnConnectWeb += OnConnectWeb;
        textToSpeech.OnConnectTTSSucess += OnConnectTTSSucess;
        textToSpeech.OnCreateSessionSucess += OnCreateSession;
        textToSpeech.OnSessionFinished += OnSessionFinish;
        textToSpeech.OnReceiveAudio += OnReceiveData;
        initialized = true;
        textToSpeech.WebSocketConnect();
    }

    /// <summary>
    /// Releases the audio stream, unsubscribes from the TTS events and
    /// disconnects. Does nothing if Awake did not complete its setup.
    /// </summary>
    void OnDestroy()
    {
        if (!initialized)
        {
            return;
        }

        tTSAudioStreamPlay.ReleaseAudioStream();
        textToSpeech.OnConnectWeb -= OnConnectWeb;
        textToSpeech.OnConnectTTSSucess -= OnConnectTTSSucess;
        textToSpeech.OnCreateSessionSucess -= OnCreateSession;
        textToSpeech.OnSessionFinished -= OnSessionFinish;
        textToSpeech.OnReceiveAudio -= OnReceiveData;
        textToSpeech.WebSocketDisconnect();
    }

    void OnConnectWeb()
    {
        Debug.Log("连接服务器成功,开始连接TTS");
        textToSpeech.BeginConnectTTS();
    }

    void OnConnectTTSSucess()
    {
        Debug.Log("连接TTS成功");
        // The session is deliberately not created here. Per the author's notes:
        // if no text is sent within 5 s the server returns no audio, and text
        // sent later gets none either; a session is valid for 10 s. It is
        // therefore created on demand in ReadText.
    }

    /// <summary>
    /// Marks the session as active, resets the per-session state, flushes the
    /// buffered text and re-arms an end request that arrived during creation.
    /// </summary>
    void OnCreateSession()
    {
        Debug.Log("建立会话成功");
        hasSession = true;
        isCreateSession = false;

        // Reset the per-session state. An end request made while the session
        // was being created is carried over instead of being dropped.
        receivedAudioData = false;
        requestEndSession = endRequestedBeforeSession;
        endRequestedBeforeSession = false;
        endSessionTimer = 0f;

        // Send the text buffered while there was no session.
        if (sentenceBuffer.Length > 0)
        {
            var content = sentenceBuffer.ToString();
            sentenceBuffer.Clear();
            Debug.Log("Send TaskRequest: " + content);
            textToSpeech.Session(content);
        }
    }

    void OnSessionFinish()
    {
        Debug.Log("会话结束");
        hasSession = false;
        isCreateSession = false;
        receivedAudioData = false;
        requestEndSession = false;
        endSessionTimer = 0;
    }

    void OnReceiveData(byte[] data)
    {
        receivedAudioData = true;
        tTSAudioStreamPlay.WriteAudioData(data);
    }

    /// <summary>
    /// Sends <paramref name="text"/> for synthesis. When no session exists the
    /// text is buffered and a session is requested (only once).
    /// </summary>
    /// <param name="text">Text to synthesize.</param>
    public void ReadText(string text)
    {
        if (!initialized)
        {
            Debug.LogWarning("TextToSpeechOut: 尚未初始化,文本未发送");
            return;
        }

        if (hasSession)
        {
            Debug.Log("Send TaskRequest: " + text);
            textToSpeech.Session(text);
            return;
        }

        // No session yet: keep the text until the session is created.
        sentenceBuffer.Append(text);
        // Avoid requesting a second session while one is being created.
        if (!isCreateSession)
        {
            isCreateSession = true;
            textToSpeech.BeginSession();
        }
    }

    /// <summary>
    /// Requests the end of the session. The actual EndSession message is sent
    /// from Update. If the session is still being created, the request is
    /// remembered and applied once the session exists.
    /// </summary>
    public void EndReadText()
    {
        if (hasSession)
        {
            endSessionTimer = 0;
            requestEndSession = true;
        }
        else if (isCreateSession)
        {
            endRequestedBeforeSession = true;
        }
    }

    /// <summary>
    /// While an end request is pending, sends EndSession as soon as audio has
    /// been received, or after <c>endSessionRequestTime</c> seconds without audio.
    /// </summary>
    void Update()
    {
        if (!requestEndSession)
        {
            return;
        }

        if (!receivedAudioData)
        {
            endSessionTimer += Time.deltaTime;
            if (endSessionTimer < endSessionRequestTime)
            {
                return;
            }
            endSessionTimer = 0;
        }

        requestEndSession = false;
        textToSpeech.EndSession();
    }
}