Unity之使用火山引擎实现流式语音合成

准备

安装NativeWebSocket插件

流式语音合成流程

1.连接TTS

2.建立会话

3.发送文本

4.关闭会话

5.结束TTS连接

步骤1(连接TTS)完成后,可重复执行步骤2、3、4,进行多次语音合成。

注意:

每次建立会话都需要生成一个新的会话ID(session id)。

一次会话,可多次发送文本。

请求文字转语音

参数设置XApiAppId,XApiAccessKey,XApiResourceId

csharp 复制代码
using UnityEngine;
using NativeWebSocket;
using System;
using System.Collections.Generic;
/// <summary>
/// 文字转语音
/// </summary>
public class TextToSpeech : MonoBehaviour
{
    // WebSocket endpoint handed to the WebSocket constructor in WebSocketConnect.
    string wss = "wss://openspeech.bytedance.com/api/v3/tts/bidirection";
    // Values sent as the X-Api-App-Key / X-Api-Access-Key / X-Api-Resource-Id
    // request headers. The first two are empty here and are expected to be filled in.
    string XApiAppId = "";
    string XApiAccessKey = "";
    string XApiResourceId = "seed-tts-2.0";
    // Speaker (voice) identifier placed in the StartSession payload.
    [Header("发音人")]
    [SerializeField] string speaker = "zh_female_vv_uranus_bigtts";
    // Sample rate placed in the StartSession audio parameters.
    [Header("采样率")]
    [SerializeField] int sampleRate = 24000;

    // Active socket; created in WebSocketConnect and cleared in WebSocketDisconnect.
    WebSocket webSocket;
    // Id generated by BeginSession and attached to every session-level frame.
    string currentSessionId = null;

    // Raised when the WebSocket itself has opened.
    public event Action OnConnectWeb;
    // Raised when a ConnectionStarted event is received.
    public event Action OnConnectTTSSucess;

    // Raised when a SessionStarted event is received.
    public event Action OnCreateSessionSucess;
    // Raised when a SessionFinished event is received.
    public event Action OnSessionFinished;

    // Raised on TTSSentenceStart / TTSSentenceEnd events.
    public event Action OnSentenceStart;
    public event Action OnSentenceEnd;

    // Raised with the payload of every non-empty audio-only server frame.
    public event Action<byte[]> OnReceiveAuido;

    /// <summary>
    /// Creates the WebSocket with the authentication headers, hooks up the
    /// socket callbacks and starts connecting.
    /// </summary>
    public async void WebSocketConnect()
    {
        // Request headers: credentials plus a freshly generated connect id.
        var requestHeaders = new Dictionary<string, string>
        {
            { "X-Api-App-Key", XApiAppId },
            { "X-Api-Access-Key", XApiAccessKey },
            { "X-Api-Resource-Id", XApiResourceId },
            { "X-Api-Connect-Id", Guid.NewGuid().ToString() },
        };

        // Register callbacks before connecting so no notification is missed.
        webSocket = new WebSocket(wss, requestHeaders);
        webSocket.OnOpen += OnWebSocketOpen;
        webSocket.OnMessage += OnReceiveWebSocketMessage;
        webSocket.OnError += OnWebSocketError;
        webSocket.OnClose += OnWebSocketClose;

        await webSocket.Connect();
    }

    /// <summary>
    /// Detaches the socket callbacks, closes the socket (logging any failure)
    /// and clears the socket reference. Does nothing when no socket exists.
    /// </summary>
    public async void WebSocketDisconnect()
    {
        if (webSocket == null)
        {
            return;
        }

        // Unhook first so the close itself does not raise our handlers.
        webSocket.OnOpen -= OnWebSocketOpen;
        webSocket.OnMessage -= OnReceiveWebSocketMessage;
        webSocket.OnError -= OnWebSocketError;
        webSocket.OnClose -= OnWebSocketClose;

        try
        {
            await webSocket.Close();
        }
        catch (Exception closeError)
        {
            Debug.LogError(closeError.Message);
        }

        webSocket = null;
    }

    /// <summary>
    /// Sends the StartConnection event with an empty JSON object as payload.
    /// </summary>
    public void BeginConnectTTS()
    {
        var startConnection = Speech.Protocols.Message.Create(
            Speech.Protocols.MsgType.FullClientRequest,
            Speech.Protocols.MsgTypeFlagBits.WithEvent);
        startConnection.EventType = Speech.Protocols.EventType.StartConnection;
        startConnection.Payload = System.Text.Encoding.UTF8.GetBytes("{}");

        Send(startConnection.Marshal());
    }

    /// <summary>
    /// Generates a new session id and sends the StartSession event carrying
    /// the speaker and the PCM audio parameters.
    /// </summary>
    public void BeginSession()
    {
        currentSessionId = Guid.NewGuid().ToString();

        // Build the JSON body step by step.
        var audio = new AudioParams();
        audio.format = "pcm";
        audio.sample_rate = sampleRate;

        var request = new ReqParams();
        request.speaker = speaker;
        request.text = "";
        request.audio_params = audio;

        var body = new SessionPayload();
        body.user = new UserInfo { uid = Guid.NewGuid().ToString() };
        body.@event = (int)Speech.Protocols.EventType.StartSession;
        body.req_params = request;

        byte[] bodyBytes = System.Text.Encoding.UTF8.GetBytes(JsonUtility.ToJson(body));

        // Wrap the body in a protocol frame addressed to the new session.
        var frame = Speech.Protocols.Message.Create(
            Speech.Protocols.MsgType.FullClientRequest,
            Speech.Protocols.MsgTypeFlagBits.WithEvent);
        frame.EventType = Speech.Protocols.EventType.StartSession;
        frame.SessionId = currentSessionId;
        frame.Payload = bodyBytes;

        Send(frame.Marshal());
    }

    /// <summary>
    /// Sends one piece of text to the current session as a TaskRequest event.
    /// May be called several times within the same session.
    /// </summary>
    /// <param name="info">Text to synthesize.</param>
    public void Session(string info)
    {
        // A TaskRequest without a session id cannot be routed; BeginSession must run first.
        if (string.IsNullOrEmpty(currentSessionId))
        {
            Debug.LogWarning("尚未建立会话,无法发送文本");
            return;
        }

        var taskRequest = new SessionPayload();
        taskRequest.req_params = new ReqParams();
        taskRequest.req_params.audio_params = new AudioParams
        {
            format = "pcm",
            // Use the configured rate so it always matches the StartSession request
            // (previously hard-coded to 24000, ignoring the inspector value).
            sample_rate = sampleRate
        };
        // NOTE(review): the field is spelled "addtions", so that is the JSON key emitted;
        // the service presumably expects "additions" — confirm against the API docs.
        taskRequest.req_params.addtions = new Addtions()
        {
            disable_markdown_filter = true,
        };
        taskRequest.req_params.text = info;
        taskRequest.user = new UserInfo() { uid = Guid.NewGuid().ToString() };
        taskRequest.@event = (int)Speech.Protocols.EventType.TaskRequest;

        string json = JsonUtility.ToJson(taskRequest);
        byte[] payloadBytes = System.Text.Encoding.UTF8.GetBytes(json);
        var msg = new Speech.Protocols.Message
        {
            MsgType = Speech.Protocols.MsgType.FullClientRequest,
            MsgTypeFlag = Speech.Protocols.MsgTypeFlagBits.WithEvent,
            EventType = Speech.Protocols.EventType.TaskRequest,
            SessionId = currentSessionId,
            Payload = payloadBytes
        };
        Send(msg.Marshal());
    }

    /// <summary>
    /// Sends the FinishSession event for the current session id with an
    /// empty JSON object as payload.
    /// </summary>
    public void EndSession()
    {
        var finishSession = Speech.Protocols.Message.Create(
            Speech.Protocols.MsgType.FullClientRequest,
            Speech.Protocols.MsgTypeFlagBits.WithEvent);
        finishSession.EventType = Speech.Protocols.EventType.FinishSession;
        finishSession.SessionId = currentSessionId;
        finishSession.Payload = System.Text.Encoding.UTF8.GetBytes("{}");

        Send(finishSession.Marshal());
    }

    /// <summary>
    /// Sends the FinishConnection event with an empty JSON object as payload.
    /// </summary>
    public void EndConnectTTS()
    {
        var finishConnection = Speech.Protocols.Message.Create(
            Speech.Protocols.MsgType.FullClientRequest,
            Speech.Protocols.MsgTypeFlagBits.WithEvent);
        finishConnection.EventType = Speech.Protocols.EventType.FinishConnection;
        finishConnection.Payload = System.Text.Encoding.UTF8.GetBytes("{}");

        Send(finishConnection.Marshal());
    }

    /// <summary>Logs an error reported by the socket.</summary>
    void OnWebSocketError(string info)
    {
        Debug.LogError($"错误信息:{info}");
    }

    /// <summary>Logs the close code when the socket closes.</summary>
    void OnWebSocketClose(WebSocketCloseCode closeCode)
    {
        Debug.Log($"关闭:{closeCode}");
    }

    /// <summary>Forwards the socket-open notification to OnConnectWeb subscribers.</summary>
    void OnWebSocketOpen()
    {
        Action subscribers = OnConnectWeb;
        if (subscribers != null)
        {
            subscribers();
        }
    }

    /// <summary>
    /// Decodes one binary frame from the server, forwards audio payloads and
    /// raises the event that matches the frame's event type. Any decoding
    /// failure is logged instead of propagating into the socket library.
    /// </summary>
    /// <param name="data">Raw frame bytes received from the socket.</param>
    void OnReceiveWebSocketMessage(byte[] data)
    {
        try
        {
            var msg = Speech.Protocols.Message.FromBytes(data);

            // Payload as text for diagnostics; falls back to a generic message when empty.
            string PayloadText() =>
                msg.Payload != null && msg.Payload.Length > 0
                    ? System.Text.Encoding.UTF8.GetString(msg.Payload)
                    : "未知错误";

            // Error frames match none of the event cases below, so without
            // this branch a server-side error would be dropped silently.
            if (msg.MsgType == Speech.Protocols.MsgType.Error)
            {
                Debug.LogError($"服务端错误({msg.ErrorCode}): {PayloadText()}");
                return;
            }

            // Forward audio data.
            if (msg.MsgType == Speech.Protocols.MsgType.AudioOnlyServer)
            {
                if (msg.Payload != null && msg.Payload.Length > 0)
                {
                    OnReceiveAuido?.Invoke(msg.Payload);
                }
            }

            switch (msg.EventType)
            {
                case Speech.Protocols.EventType.ConnectionStarted:
                    // Connection established; StartSession may be sent now.
                    OnConnectTTSSucess?.Invoke();
                    break;
                case Speech.Protocols.EventType.SessionStarted:
                    // Session started; TaskRequest may be sent now.
                    OnCreateSessionSucess?.Invoke();
                    break;
                case Speech.Protocols.EventType.SessionFinished:
                    OnSessionFinished?.Invoke();
                    break;
                case Speech.Protocols.EventType.TTSSentenceStart:
                    OnSentenceStart?.Invoke();
                    break;
                case Speech.Protocols.EventType.TTSSentenceEnd:
                    OnSentenceEnd?.Invoke();
                    break;
                case Speech.Protocols.EventType.ConnectionFailed:
                case Speech.Protocols.EventType.SessionFailed:
                    Debug.LogError($"{msg.EventType}: {PayloadText()}");
                    break;
            }
        }
        catch (Exception e)
        {
            Debug.LogError("接受消息:" + e.Message);
        }
    }

    /// <summary>
    /// Sends a marshalled frame if the socket exists and is open; otherwise
    /// logs a warning and drops the frame. Send failures are logged rather
    /// than escaping from this async void method.
    /// </summary>
    /// <param name="bytes">Frame bytes produced by Message.Marshal.</param>
    async void Send(byte[] bytes)
    {
        // Capture the field: it is null before WebSocketConnect and after
        // WebSocketDisconnect, and may change while the send is awaited.
        var socket = webSocket;
        if (socket == null || socket.State != WebSocketState.Open)
        {
            Debug.LogWarning("WebSocket未连接,消息未发送");
            return;
        }

        try
        {
            await socket.Send(bytes);
        }
        catch (Exception e)
        {
            Debug.LogError("发送消息:" + e.Message);
        }
    }

    // JSON body for StartSession / TaskRequest frames. It is serialized with
    // JsonUtility.ToJson, so the field names below are the JSON keys.
    [System.Serializable]
    public class SessionPayload
    {
        public UserInfo user;
        // Numeric value of the protocol EventType this body belongs to.
        public int @event;
        public ReqParams req_params;
    }

    // Identifies the requesting user; callers in this file fill uid with a new GUID.
    [System.Serializable]
    public class UserInfo
    {
        public string uid;
    }

    // Synthesis request parameters.
    [System.Serializable]
    public class ReqParams
    {
        public string speaker;
        public string text;                
        public AudioParams audio_params;
        // NOTE(review): spelled "addtions", which is the key that gets serialized;
        // presumably the service expects "additions" — confirm before renaming.
        public Addtions addtions;
    }

    // Output audio format and sample rate requested from the service.
    [System.Serializable]
    public class AudioParams
    {
        public string format;
        public int sample_rate;
    }

    // Extra request options.
    [System.Serializable]
    public class Addtions
    {
        public bool disable_markdown_filter = true;
    }
}

协议类

来源官方C#版本的Demo,移除不使用的部分

csharp 复制代码
using System;
using System.Buffers.Binary;
using System.IO;
using System.Text;

namespace Speech.Protocols
{
    /// <summary>
    /// Defines the event type which determines the event of the message.
    /// The value is written to the wire as a big-endian int32 when the
    /// <c>WithEvent</c> flag is set. Several members are aliases that share
    /// the same numeric value.
    /// </summary>
    public enum EventType : int
    {
        // Default event, applicable for scenarios not using events or not requiring event transmission,
        // or for scenarios using events, non-zero values can be used to validate event legitimacy
        None = 0,

        // 1 ~ 49 for upstream Connection events
        StartConnection = 1,
        StartTask = 1, // Alias of "StartConnection"
        FinishConnection = 2,
        FinishTask = 2, // Alias of "FinishConnection"

        // 50 ~ 99 for downstream Connection events
        // Connection established successfully
        ConnectionStarted = 50,
        TaskStarted = 50, // Alias of "ConnectionStarted"
        // Connection failed (possibly due to authentication failure)
        ConnectionFailed = 51,
        TaskFailed = 51, // Alias of "ConnectionFailed"
        // Connection ended
        ConnectionFinished = 52,
        TaskFinished = 52, // Alias of "ConnectionFinished"

        // 100 ~ 149 for upstream Session events
        StartSession = 100,
        CancelSession = 101,
        FinishSession = 102,

        // 150 ~ 199 for downstream Session events
        SessionStarted = 150,
        SessionCanceled = 151,
        SessionFinished = 152,
        SessionFailed = 153,

        // Usage events
        UsageResponse = 154,
        ChargeData = 154, // Alias of "UsageResponse"

        // 200 ~ 249 for upstream general events
        TaskRequest = 200,
        UpdateConfig = 201,

        // 250 ~ 299 for downstream general events
        AudioMuted = 250,

        // 300 ~ 349 for upstream TTS events
        SayHello = 300,

        // 350 ~ 399 for downstream TTS events
        TTSSentenceStart = 350,
        TTSSentenceEnd = 351,
        TTSResponse = 352,
        TTSEnded = 359,
        PodcastRoundStart = 360,
        PodcastRoundResponse = 361,
        PodcastRoundEnd = 362,

        // 450 ~ 499 for downstream ASR events
        ASRInfo = 450,
        ASRResponse = 451,
        ASREnded = 459,

        // 500 ~ 549 for upstream dialogue events
        // (Ground-Truth-Alignment) text for speech synthesis
        ChatTTSText = 500,

        // 550 ~ 599 for downstream dialogue events
        ChatResponse = 550,
        ChatEnded = 559,

        // 650 ~ 699 for downstream dialogue events
        // Events for source (original) language subtitle
        SourceSubtitleStart = 650,
        SourceSubtitleResponse = 651,
        SourceSubtitleEnd = 652,

        // Events for target (translation) language subtitle
        TranslationSubtitleStart = 653,
        TranslationSubtitleResponse = 654,
        TranslationSubtitleEnd = 655
    }

    /// <summary>
    /// Message type flags which determines how the message will be serialized with the protocol.
    /// Stored in the low 4 bits of the second header byte. The sequence values
    /// (0..0b11) are compared by equality in Marshal/Unmarshal, while
    /// <c>WithEvent</c> is tested with a bitwise AND.
    /// </summary>
    [Flags]
    public enum MsgTypeFlagBits : byte
    {
        NoSeq = 0,             // Non-terminal packet with no sequence
        PositiveSeq = 0b1,     // Non-terminal packet with sequence > 0
        LastNoSeq = 0b10,      // last packet with no sequence
        NegativeSeq = 0b11,    // last packet with sequence < 0
        WithEvent = 0b100      // Payload contains event number (int32)
    }

    /// <summary>
    /// Version bits defines the 4-bit version type.
    /// Stored in the high 4 bits of the first header byte.
    /// </summary>
    public enum VersionBits : byte
    {
        Version1 = 1,
        Version2 = 2,
        Version3 = 3,
        Version4 = 4
    }

    /// <summary>
    /// Header size bits defines the 4-bit header-size type.
    /// Stored in the low 4 bits of the first header byte; the header length
    /// in bytes is 4 times this value.
    /// </summary>
    public enum HeaderSizeBits : byte
    {
        HeaderSize4 = 1,
        HeaderSize8 = 2,
        HeaderSize12 = 3,
        HeaderSize16 = 4
    }

    /// <summary>
    /// Serialization bits defines the 4-bit serialization method type.
    /// Stored in the high 4 bits of the third header byte.
    /// </summary>
    public enum SerializationBits : byte
    {
        Raw = 0,
        JSON = 0b1,
        Thrift = 0b11,
        Custom = 0b1111
    }

    /// <summary>
    /// Compression bits defines the 4-bit compression method type.
    /// Stored in the low 4 bits of the third header byte.
    /// </summary>
    public enum CompressionBits : byte
    {
        None = 0,
        Gzip = 0b1,
        Custom = 0b1111
    }

    /// <summary>
    /// Message type which determines how the message will be serialized with the protocol.
    /// Stored in the high 4 bits of the second header byte.
    /// </summary>
    public enum MsgType : byte
    {
        Invalid = 0,
        FullClientRequest = 0b1,
        AudioOnlyClient = 0b10,
        FullServerResponse = 0b1001,
        AudioOnlyServer = 0b1011,
        FrontEndResultServer = 0b1100,
        Error = 0b1111,

        // Alias sharing the numeric value of AudioOnlyServer.
        ServerACK = AudioOnlyServer
    }

    /// <summary>
    /// Message structure for protocol communication
    ///   0                 1                 2                 3
    /// | 0 1 2 3 4 5 6 7 | 0 1 2 3 4 5 6 7 | 0 1 2 3 4 5 6 7 | 0 1 2 3 4 5 6 7 |
    /// +-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+
    /// |    Version      |   Header Size   |     Msg Type    |      Flags      |
    /// |   (4 bits)      |    (4 bits)     |     (4 bits)    |     (4 bits)    |
    /// +-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+
    /// | Serialization   |   Compression   |           Reserved                |
    /// |   (4 bits)      |    (4 bits)     |           (8 bits)                |
    /// +-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+
    /// |                                                                       |
    /// |                   Optional Header Extensions                          |
    /// |                     (if Header Size > 1)                              |
    /// |                                                                       |
    /// +-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+
    /// |                                                                       |
    /// |                           Payload                                     |
    /// |                      (variable length)                                |
    /// |                                                                       |
    /// +-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+
    /// </summary>
    public class Message
    {
        // Fixed header fields, each packed into 4 bits by Marshal.
        public VersionBits Version { get; set; }
        public HeaderSizeBits HeaderSize { get; set; }
        public MsgType MsgType { get; set; }
        public MsgTypeFlagBits MsgTypeFlag { get; set; }
        public SerializationBits Serialization { get; set; }
        public CompressionBits Compression { get; set; }

        // Optional fields; which ones appear on the wire depends on MsgType,
        // MsgTypeFlag and EventType (see Marshal / Unmarshal).
        public EventType EventType { get; set; }
        public string SessionId { get; set; }
        public string ConnectId { get; set; }
        public int Sequence { get; set; }
        public uint ErrorCode { get; set; }

        // Message body; written and read with a 4-byte big-endian length prefix.
        public byte[] Payload { get; set; }

        /// <summary>
        /// Initializes a message with the defaults: protocol version 1,
        /// a 4-byte header, JSON serialization, no compression and an empty payload.
        /// </summary>
        public Message()
        {
            // Body defaults.
            Payload = Array.Empty<byte>();
            Compression = CompressionBits.None;
            Serialization = SerializationBits.JSON;

            // Header defaults.
            HeaderSize = HeaderSizeBits.HeaderSize4;
            Version = VersionBits.Version1;
        }

        /// <summary>
        /// Builds a message with default values and the given type and flag.
        /// </summary>
        /// <param name="msgType">Message type to assign.</param>
        /// <param name="flag">Message type flag to assign.</param>
        /// <returns>The newly created message.</returns>
        public static Message Create(MsgType msgType, MsgTypeFlagBits flag)
        {
            var created = new Message();
            created.MsgType = msgType;
            created.MsgTypeFlag = flag;
            return created;
        }

        /// <summary>
        /// Parses a message from its wire representation.
        /// </summary>
        /// <param name="data">Frame bytes; must contain at least 4 bytes.</param>
        /// <returns>The decoded message.</returns>
        /// <exception cref="ArgumentException">
        /// <paramref name="data"/> is null or shorter than 4 bytes.
        /// </exception>
        public static Message FromBytes(byte[] data)
        {
            const int minimumLength = 4;
            if (data is null || data.Length < minimumLength)
            {
                throw new ArgumentException("Invalid data length", nameof(data));
            }

            var parsed = new Message();
            using (var input = new MemoryStream(data))
            {
                parsed.Unmarshal(input);
            }

            return parsed;
        }

        /// <summary>
        /// Serializes the message: three packed header bytes, zero padding up
        /// to the header size, the optional event/session id, the optional
        /// sequence or error code, then the length-prefixed payload.
        /// </summary>
        /// <returns>The wire representation of the message.</returns>
        public byte[] Marshal()
        {
            using (var output = new MemoryStream())
            {
                // Helpers that append a 4-byte big-endian integer.
                void PutInt32(int value)
                {
                    var scratch = new byte[4];
                    BinaryPrimitives.WriteInt32BigEndian(scratch, value);
                    output.Write(scratch, 0, scratch.Length);
                }

                void PutUInt32(uint value)
                {
                    var scratch = new byte[4];
                    BinaryPrimitives.WriteUInt32BigEndian(scratch, value);
                    output.Write(scratch, 0, scratch.Length);
                }

                // Each header byte packs two 4-bit fields.
                output.WriteByte((byte)((byte)Version << 4 | (byte)HeaderSize));
                output.WriteByte((byte)((byte)MsgType << 4 | (byte)MsgTypeFlag));
                output.WriteByte((byte)((byte)Serialization << 4 | (byte)Compression));

                // Pad the header with zeros up to 4 * HeaderSize bytes.
                int paddingLeft = 4 * (int)HeaderSize - 3;
                while (paddingLeft-- > 0)
                {
                    output.WriteByte(0);
                }

                // Event number and session id come first when the event flag is set.
                bool carriesEvent = (MsgTypeFlag & MsgTypeFlagBits.WithEvent) != 0;
                if (carriesEvent)
                {
                    PutInt32((int)EventType);
                    WriteSessionId(output);
                }

                // Error frames carry an error code; the other known types carry
                // a sequence number only when the flag equals one of the Seq values.
                bool carriesSequence =
                    MsgTypeFlag == MsgTypeFlagBits.PositiveSeq ||
                    MsgTypeFlag == MsgTypeFlagBits.NegativeSeq;
                bool isSequencedType =
                    MsgType == MsgType.FullClientRequest ||
                    MsgType == MsgType.FullServerResponse ||
                    MsgType == MsgType.FrontEndResultServer ||
                    MsgType == MsgType.AudioOnlyClient ||
                    MsgType == MsgType.AudioOnlyServer;

                if (MsgType == MsgType.Error)
                {
                    PutUInt32(ErrorCode);
                }
                else if (isSequencedType && carriesSequence)
                {
                    PutInt32(Sequence);
                }

                WritePayload(output);

                return output.ToArray();
            }
        }

        /// <summary>
        /// Appends the session id as a 4-byte big-endian length followed by
        /// its UTF-8 bytes. Writes nothing for connection-level events.
        /// </summary>
        private void WriteSessionId(MemoryStream stream)
        {
            // Connection events carry no session id.
            bool isConnectionEvent =
                EventType == EventType.StartConnection ||
                EventType == EventType.FinishConnection ||
                EventType == EventType.ConnectionStarted ||
                EventType == EventType.ConnectionFailed;
            if (isConnectionEvent)
            {
                return;
            }

            byte[] idBytes = string.IsNullOrEmpty(SessionId)
                ? Array.Empty<byte>()
                : Encoding.UTF8.GetBytes(SessionId!);

            var prefix = new byte[4];
            BinaryPrimitives.WriteUInt32BigEndian(prefix, (uint)idBytes.Length);
            stream.Write(prefix, 0, prefix.Length);

            // Writing zero bytes is a no-op, so an empty id emits only the prefix.
            stream.Write(idBytes, 0, idBytes.Length);
        }

        /// <summary>
        /// Appends the payload as a 4-byte big-endian length followed by the
        /// payload bytes. A null payload is written as length zero.
        /// </summary>
        private void WritePayload(MemoryStream stream)
        {
            byte[] body = Payload;
            if (body == null)
            {
                body = Array.Empty<byte>();
            }

            var prefix = new byte[4];
            BinaryPrimitives.WriteUInt32BigEndian(prefix, (uint)body.Length);
            stream.Write(prefix, 0, prefix.Length);

            // Writing zero bytes is a no-op, so an empty payload emits only the prefix.
            stream.Write(body, 0, body.Length);
        }

        /// <summary>
        /// Decodes the message from <paramref name="stream"/>: header, optional
        /// sequence or error code, optional event/session id/connect id, then
        /// the length-prefixed payload.
        /// </summary>
        /// <param name="stream">Stream positioned at the first header byte.</param>
        /// <exception cref="InvalidDataException">
        /// The frame is truncated, declares an invalid header size, has an
        /// unsupported message type, or has bytes left after the payload.
        /// </exception>
        private void Unmarshal(MemoryStream stream)
        {
            // Reads exactly `count` bytes. A short read from a MemoryStream means
            // the frame ended early; previously this was ignored and the missing
            // bytes were silently treated as zeros.
            byte[] ReadExactly(int count, string what)
            {
                var buffer = new byte[count];
                if (stream.Read(buffer, 0, count) != count)
                {
                    throw new InvalidDataException($"Truncated message while reading {what}");
                }
                return buffer;
            }

            // Each header byte packs two 4-bit fields.
            var header = ReadExactly(3, "header");
            Version = (VersionBits)(header[0] >> 4);
            HeaderSize = (HeaderSizeBits)(header[0] & 0x0F);
            MsgType = (MsgType)(header[1] >> 4);
            MsgTypeFlag = (MsgTypeFlagBits)(header[1] & 0x0F);
            Serialization = (SerializationBits)(header[2] >> 4);
            Compression = (CompressionBits)(header[2] & 0x0F);

            // Skip the rest of the header (4 * HeaderSize bytes in total).
            int paddingSize = 4 * (int)HeaderSize - 3;
            if (paddingSize < 0)
            {
                throw new InvalidDataException($"Invalid header size: {(int)HeaderSize}");
            }
            ReadExactly(paddingSize, "header padding");

            // Read fields in Go readers() order

            // First, read sequence or error code based on message type
            switch (MsgType)
            {
                case MsgType.FullClientRequest:
                case MsgType.FullServerResponse:
                case MsgType.FrontEndResultServer:
                case MsgType.AudioOnlyClient:
                case MsgType.AudioOnlyServer:
                    if (MsgTypeFlag == MsgTypeFlagBits.PositiveSeq || MsgTypeFlag == MsgTypeFlagBits.NegativeSeq)
                    {
                        Sequence = BinaryPrimitives.ReadInt32BigEndian(ReadExactly(4, "sequence"));
                    }
                    break;

                case MsgType.Error:
                    ErrorCode = BinaryPrimitives.ReadUInt32BigEndian(ReadExactly(4, "error code"));
                    break;

                default:
                    throw new InvalidDataException($"Unsupported message type: {MsgType}");
            }

            // Then, if WithEvent flag is set, read event, session ID, and connect ID
            if ((MsgTypeFlag & MsgTypeFlagBits.WithEvent) != 0)
            {
                EventType = (EventType)BinaryPrimitives.ReadInt32BigEndian(ReadExactly(4, "event type"));

                ReadSessionId(stream);
                ReadConnectId(stream);
            }

            // Read payload with length prefix
            ReadPayload(stream);

            // Verify no unexpected data remains
            if (stream.Position < stream.Length)
            {
                throw new InvalidDataException($"Unexpected data after message: {stream.Length - stream.Position} bytes remaining");
            }
        }

        /// <summary>
        /// Reads the length-prefixed UTF-8 session id into <see cref="SessionId"/>.
        /// Reads nothing for connection-level events.
        /// </summary>
        /// <exception cref="InvalidDataException">
        /// The length prefix is missing or larger than the remaining data.
        /// </exception>
        private void ReadSessionId(MemoryStream stream)
        {
            // Skip session ID for connection events
            switch (EventType)
            {
                case EventType.StartConnection:
                case EventType.FinishConnection:
                case EventType.ConnectionStarted:
                case EventType.ConnectionFailed:
                case EventType.ConnectionFinished:
                    return;
            }

            var lenBytes = new byte[4];
            if (stream.Read(lenBytes, 0, 4) != 4)
            {
                throw new InvalidDataException("Truncated message while reading session ID length");
            }
            uint sessionIdLength = BinaryPrimitives.ReadUInt32BigEndian(lenBytes);

            // Validate before allocating: the length comes from the network, and a
            // bogus value would cause a huge allocation or a negative int cast.
            if (sessionIdLength > stream.Length - stream.Position)
            {
                throw new InvalidDataException($"Session ID length {sessionIdLength} exceeds remaining data");
            }

            if (sessionIdLength > 0)
            {
                var sessionBytes = new byte[sessionIdLength];
                stream.Read(sessionBytes, 0, (int)sessionIdLength);
                SessionId = Encoding.UTF8.GetString(sessionBytes);
            }
        }

        /// <summary>
        /// Reads the length-prefixed UTF-8 connect id into <see cref="ConnectId"/>.
        /// Only ConnectionStarted, ConnectionFailed and ConnectionFinished carry one.
        /// </summary>
        /// <exception cref="InvalidDataException">
        /// The length prefix is missing or larger than the remaining data.
        /// </exception>
        private void ReadConnectId(MemoryStream stream)
        {
            // Only read connect ID for specific connection events
            switch (EventType)
            {
                case EventType.ConnectionStarted:
                case EventType.ConnectionFailed:
                case EventType.ConnectionFinished:
                    break;
                default:
                    return;
            }

            var lenBytes = new byte[4];
            if (stream.Read(lenBytes, 0, 4) != 4)
            {
                throw new InvalidDataException("Truncated message while reading connect ID length");
            }
            uint connectIdLength = BinaryPrimitives.ReadUInt32BigEndian(lenBytes);

            // Validate before allocating: the length comes from the network, and a
            // bogus value would cause a huge allocation or a negative int cast.
            if (connectIdLength > stream.Length - stream.Position)
            {
                throw new InvalidDataException($"Connect ID length {connectIdLength} exceeds remaining data");
            }

            if (connectIdLength > 0)
            {
                var connectBytes = new byte[connectIdLength];
                stream.Read(connectBytes, 0, (int)connectIdLength);
                ConnectId = Encoding.UTF8.GetString(connectBytes);
            }
        }

        /// <summary>
        /// Reads the length-prefixed payload into <see cref="Payload"/>;
        /// a zero length yields an empty array.
        /// </summary>
        /// <exception cref="InvalidDataException">
        /// The length prefix is missing or larger than the remaining data.
        /// </exception>
        private void ReadPayload(MemoryStream stream)
        {
            var lenBytes = new byte[4];
            if (stream.Read(lenBytes, 0, 4) != 4)
            {
                throw new InvalidDataException("Truncated message while reading payload length");
            }
            uint payloadLength = BinaryPrimitives.ReadUInt32BigEndian(lenBytes);

            // Validate before allocating: otherwise a truncated frame yields a
            // silently zero-padded payload, and a bogus length a huge allocation.
            if (payloadLength > stream.Length - stream.Position)
            {
                throw new InvalidDataException($"Payload length {payloadLength} exceeds remaining data");
            }

            if (payloadLength > 0)
            {
                Payload = new byte[payloadLength];
                stream.Read(Payload, 0, (int)payloadLength);
            }
            else
            {
                Payload = Array.Empty<byte>();
            }
        }
    }
}

TTS音频流式播放

csharp 复制代码
using System.Collections.Generic;
using UnityEngine;
/// <summary>
/// TTS音频流式播放
/// </summary>
public class TTSAudioStreamPlay : MonoBehaviour
{
    // Audio source that plays the streaming clip.
    [Header("播放器")]
    [SerializeField] AudioSource audioSource;

    AudioClip streamingClip;// the streaming clip being played
    bool stream = true;// whether the clip is created in streaming mode
    int LengthSamples => Mathf.FloorToInt(audioClipDuration * frequency);// number of samples per channel
    int channels = 1;// channel count: 1 = mono, 2 = stereo

    [Header("音频剪辑名称")]
    [SerializeField] string audioName = "TTS_Stream";// name given to the audio clip
    [Header("音频剪辑长度(秒)")]
    [SerializeField] float audioClipDuration = 1;
    [Header("采样率(HZ)")]
    [SerializeField] int frequency = 24000;

    List<float> audioClipBuffer = new List<float>();// samples waiting to be played
    readonly object bufferLock = new object();// guards audioClipBuffer
    byte? leftoverByte;// odd trailing byte carried over to the next chunk

    /// <summary>
    /// Creates the streaming clip fed by <see cref="OnPCMReaderCallback"/> and
    /// starts looping playback on the audio source. Calling it again replaces
    /// the existing clip.
    /// </summary>
    public void CreateAudioStream()
    {
        // Clips made with AudioClip.Create must be destroyed explicitly;
        // without this a second call leaked the previous clip.
        if (streamingClip)
        {
            audioSource.Stop();
            audioSource.clip = null;
            Destroy(streamingClip);
            streamingClip = null;
        }

        streamingClip = AudioClip.Create(
            audioName,
            LengthSamples,
            channels,
            frequency,
            stream,
            OnPCMReaderCallback);
        audioSource.clip = streamingClip;
        // Loop so the reader callback keeps being asked for new samples.
        audioSource.loop = true;
        audioSource.Play();
    }

    /// <summary>
    /// Stops playback, destroys the streaming clip if one exists, and discards
    /// all buffered samples together with any carried-over byte.
    /// </summary>
    public void ReleaseAudioStream()
    {
        if (streamingClip)
        {
            audioSource.Stop();
            Destroy(streamingClip);
            audioSource.clip = null;
            streamingClip = null;
        }

        lock (bufferLock)
        {
            audioClipBuffer.Clear();
        }

        leftoverByte = null;
    }

    /// <summary>
    /// Converts a chunk of 16-bit little-endian PCM bytes to float samples in
    /// [-1, 1) and appends them to the playback buffer. An odd trailing byte is
    /// kept and combined with the first byte of the next chunk.
    /// </summary>
    /// <param name="data">Raw PCM bytes; null or empty input is ignored.</param>
    public void WriteAudioData(byte[] data)
    {
        if (data == null || data.Length == 0)
        {
            return;
        }

        int carried = leftoverByte.HasValue ? 1 : 0;
        float[] samples = new float[(data.Length + carried) / 2];
        int written = 0;
        int offset = 0;

        // Complete the sample that was split across two chunks:
        // low byte from the previous chunk, high byte from this one.
        if (leftoverByte.HasValue)
        {
            samples[written++] = (short)(leftoverByte.Value | (data[0] << 8)) / 32768f;
            leftoverByte = null;
            offset = 1;
        }

        // Decode whole samples directly from the input, with no intermediate copy.
        for (; offset + 1 < data.Length; offset += 2)
        {
            samples[written++] = (short)(data[offset] | (data[offset + 1] << 8)) / 32768f;
        }

        // Odd byte left over: keep it for the next chunk.
        if (offset < data.Length)
        {
            leftoverByte = data[offset];
        }

        // Queue the samples for the audio callback.
        lock (bufferLock)
        {
            audioClipBuffer.AddRange(samples);
        }
    }

    // Callback invoked when Unity needs audio data.
    // The argument is a float[] whose length is chosen by the engine
    // (usually the size of its internal buffer).
    // The whole array must be filled with samples in the range [-1.0, 1.0].
    // For streaming audio (stream = true) the callback is invoked periodically;
    // for non-streaming audio it may be invoked once or several times during
    // Create, until lengthSamples has been filled.
    // Main-thread APIs must not be called from here.
    void OnPCMReaderCallback(float[] data)
    {
        lock (bufferLock)
        {
            // Serve as many buffered samples as the request can take.
            int available = Mathf.Min(audioClipBuffer.Count, data.Length);
            audioClipBuffer.CopyTo(0, data, 0, available);

            // Drop the samples that were just handed out.
            audioClipBuffer.RemoveRange(0, available);

            // Pad the remainder with silence.
            int cursor = available;
            while (cursor < data.Length)
            {
                data[cursor++] = 0f;
            }
        }
    }
}

流式语音合成并播放

结合请求文字转语音和TTS音频流式播放的功能

csharp 复制代码
using System.Text;
using UnityEngine;

/// <summary>
/// Combines <see cref="TextToSpeech"/> and <see cref="TTSAudioStreamPlay"/>:
/// text passed to <see cref="ReadText"/> is synthesized and the returned
/// audio is played as a stream. Text that arrives while no session is usable
/// is buffered and sent once a session has been established.
/// </summary>
public class TextToSpeechOut : MonoBehaviour
{
    [Header("文字合成语音")]
    [SerializeField] TextToSpeech textToSpeech;

    [Header("TTS音频流播放")]
    [SerializeField] TTSAudioStreamPlay tTSAudioStreamPlay;

    bool isTTSConnected;// the TTS connection handshake has completed
    bool hasSession = false;// a session currently exists
    bool isCreateSession;// a StartSession request is in flight
    bool isFinishingSession;// FinishSession was sent, SessionFinished not yet received
    bool finishAfterFlush;// end the session as soon as the buffered text has been sent
    StringBuilder sentenceBuffer;// text waiting for a usable session

    void Awake()
    {
        tTSAudioStreamPlay.CreateAudioStream();
        sentenceBuffer = new StringBuilder();
        textToSpeech.OnConnectWeb += OnConnectWeb;
        textToSpeech.OnConnectTTSSucess += OnConnectTTSSucess;
        textToSpeech.OnCreateSessionSucess += OnCreateSession;
        textToSpeech.OnSessionFinished += OnSessionFinish;
        textToSpeech.OnReceiveAuido += OnReceiveData;
        textToSpeech.WebSocketConnect();
    }

    void OnDestroy()
    {
        tTSAudioStreamPlay.ReleaseAudioStream();
        textToSpeech.OnConnectWeb -= OnConnectWeb;
        textToSpeech.OnConnectTTSSucess -= OnConnectTTSSucess;
        textToSpeech.OnCreateSessionSucess -= OnCreateSession;
        textToSpeech.OnSessionFinished -= OnSessionFinish;
        textToSpeech.OnReceiveAuido -= OnReceiveData;
        textToSpeech.WebSocketDisconnect();
    }

    void OnConnectWeb()
    {
        Debug.Log("连接服务器成功,开始连接TTS");
        textToSpeech.BeginConnectTTS();
    }

    void OnConnectTTSSucess()
    {
        Debug.Log("连接TTS成功,建立会话");
        isTTSConnected = true;
        TryBeginSession();
    }

    void OnCreateSession()
    {
        Debug.Log("建立会话成功");
        hasSession = true;
        isCreateSession = false;

        // Send everything that was buffered while the session was being created.
        if (sentenceBuffer.Length > 0)
        {
            var content = sentenceBuffer.ToString();
            sentenceBuffer.Clear();
            textToSpeech.Session(content);
        }

        // EndReadText was requested before the session existed: honor it now,
        // after the text, so FinishSession never overtakes the TaskRequest.
        if (finishAfterFlush)
        {
            finishAfterFlush = false;
            FinishCurrentSession();
        }
    }

    void OnSessionFinish()
    {
        Debug.Log("会话结束");
        hasSession = false;
        isCreateSession = false;
        isFinishingSession = false;

        // Text that arrived while the session was closing needs a new session.
        if (sentenceBuffer.Length > 0)
        {
            TryBeginSession();
        }
    }

    void OnReceiveData(byte[] data)
    {
        tTSAudioStreamPlay.WriteAudioData(data);
    }

    /// <summary>
    /// Queues text for synthesis. It is sent immediately when a session is
    /// active, otherwise buffered until one has been established.
    /// </summary>
    /// <param name="text">Text to read aloud; null or empty is ignored.</param>
    public void ReadText(string text)
    {
        if (string.IsNullOrEmpty(text))
        {
            return;
        }

        if (hasSession && !isFinishingSession)
        {
            textToSpeech.Session(text);
            return;
        }

        // No usable session yet (not connected, still being created, or closing).
        sentenceBuffer.Append(text);
        TryBeginSession();
    }

    /// <summary>
    /// Requests the end of the current session. If the session is still being
    /// created or text is still buffered, the request is deferred until that
    /// text has been sent.
    /// </summary>
    public void EndReadText()
    {
        if (hasSession)
        {
            if (!isFinishingSession)
            {
                FinishCurrentSession();
            }
        }
        else if (isCreateSession || sentenceBuffer.Length > 0)
        {
            finishAfterFlush = true;
        }
    }

    // Starts a session only when the connection is ready and no session exists
    // or is pending. Previously the "creating" flag could be set before the
    // connection was ready; the request was dropped and the flag then blocked
    // every later attempt.
    void TryBeginSession()
    {
        if (!isTTSConnected || hasSession || isCreateSession)
        {
            return;
        }

        isCreateSession = true;
        textToSpeech.BeginSession();
    }

    // Sends FinishSession and marks the session as closing so that text
    // arriving in the meantime is buffered instead of being lost.
    void FinishCurrentSession()
    {
        isFinishingSession = true;
        textToSpeech.EndSession();
    }
}
相关推荐
心前阳光3 小时前
Unity之使用火山引擎实现音频剪辑提问,流式语音回复
unity·音视频·火山引擎
心前阳光3 小时前
Unity之音频剪辑提问,流式语音回复使用示例
unity·游戏引擎·音视频
小拉达不是臭老鼠15 小时前
Unity学习_ScriptableObject
学习·unity
Thomas_YXQ16 小时前
Unity无GC读取图片与网格完整方案
大数据·人工智能·unity·微信·产品运营
jiayong2318 小时前
虚幻引擎 Unreal Engine 通俗指南
游戏引擎·虚幻
郝学胜-神的一滴20 小时前
中级OpenGL教程 008:精准控制高光光斑大小与强度
c++·unity·godot·three.js·图形学·opengl·unreal
avi91111 天前
Unity 商业插件之(五)课外2 - Zenject的一些小Tips(学习备忘)
unity·游戏开发·团结引擎
元气少女小圆丶1 天前
SenseGlove Nova 2+Unity开发笔记4
笔记·unity·游戏引擎
basketball6161 天前
Go 语言从入门到进阶:6. 一文彻底吃透结构体(Struct)
开发语言·unity·golang