Reference notes
Date: 2026-04-02
Volcengine Ark (火山方舟) integration guide
Feature
Press and hold the space bar to ask a question by voice;
release the space bar to hear the answer spoken back.
Approach
Asking
- Record the voice input
- Convert the speech to text (speech recognition)
- Send the text as a question (language model)
Answering
- Convert the reply text to speech (speech synthesis)
- Play the audio
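These steps chain together through callbacks: stop recording, recognize, ask, synthesize, play. Below is a condensed sketch of that flow, for orientation only; the class name is illustrative, it assumes the components implemented in the following sections are attached and referenced, and the full, state-tracked integration is the VoiceQuestioning class near the end of this document.
csharp
using System.Collections.Generic;
using UnityEngine;
// Condensed end-to-end flow; error handling is omitted on purpose.
public class VoiceFlowSketch : MonoBehaviour
{
    [SerializeField] RecordVoice recordVoice;
    [SerializeField] SpeechRecognition speechRecognition;
    [SerializeField] TextQuestions textQuestions;
    [SerializeField] TextToAudio textToAudio;
    [SerializeField] AudioPlay audioPlay;
    readonly List<Message> messages = new List<Message>();

    void Awake()
    {
        // when synthesis finishes, play the assembled clip
        textToAudio.OnGetAudio += clip => audioPlay.Play(clip, () => { });
    }

    // call after recordVoice.Begin() has been recording the question
    public void AskByVoice()
    {
        recordVoice.End();                               // stop recording; WavData is now filled
        speechRecognition.Request(recordVoice.WavData, (ok, question) =>
        {
            if (!ok) return;
            messages.Add(new Message { role = "user", content = question });
            textQuestions.Request(messages, (success, reply) =>
            {
                if (success) textToAudio.Request(reply.choices[0].message.content);
            });
        });
    }
}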
Recording the voice input
Use Microphone to record up to 60 seconds of audio.
Unity requires a fixed recording length up front, but the spoken input is usually shorter than that, so the unused tail wastes memory.
The TrimAudioClip method cuts the unused tail off.
The recording sample rate is set to 16000 Hz.
The result is an AudioClip; convert it to a WAV-format byte array so the next step can consume the audio data.
Note: make sure a microphone is available before recording.
csharp
using System;
using System.IO;
using UnityEngine;
/// <summary>
/// Voice recording
/// </summary>
public class RecordVoice : MonoBehaviour
{
bool loop = false;// if the recording exceeds length, whether to keep recording by wrapping around and overwriting from the start
[Header("Recording length (s)")]
[SerializeField] int length = 60;// clip length in seconds
[Header("Sample rate (Hz)")]
[SerializeField] int frequency = 16000;// recording sample rate
[Header("Save recorded audio")]
[SerializeField] bool saveRecordAudio = true;
float minLength = 0.1f;// minimum recording duration
bool isRecording = false;// currently recording
string recordDeviceName;// name of the recording device
AudioClip recordClip;// the recorded clip
public AudioClip RecordClip => recordClip;
public bool IsRecording => isRecording;
byte[] wavData;
public byte[] WavData => wavData;
public bool ExistMicrophoneDevice()
{
var devices = Microphone.devices;
return devices != null && devices.Length > 0;
}
public void Begin()
{
if (isRecording) return;
if (!ExistMicrophoneDevice())
{
Debug.LogWarning("No microphone device available");
return;
}
var devices = Microphone.devices;
recordDeviceName = devices[0];
recordClip = Microphone.Start(recordDeviceName, loop, length, frequency);
isRecording = true;
}
public void End()
{
if (isRecording == false) return;
// actual recorded length
int lastSamplePosition = Microphone.GetPosition(recordDeviceName);
Microphone.End(recordDeviceName);
isRecording = false;
wavData = null;// discard data left over from a previous recording
float actualDuration = (float)lastSamplePosition / recordClip.frequency;
actualDuration = Mathf.Clamp(actualDuration, 0, length);
if (actualDuration < minLength)
{
Debug.LogWarning("Recording too short");
return;
}
// drop the unused tail of the fixed-length clip
recordClip = TrimAudioClip(recordClip, actualDuration);
wavData = WavUtility.AudioClipToWav(recordClip);
if (saveRecordAudio)
File.WriteAllBytes(Guid.NewGuid() + ".wav", wavData);
}
private AudioClip TrimAudioClip(AudioClip source, float duration)
{
if (source == null || duration <= 0) return null;
int channels = source.channels;// channel count: mono = 1, stereo = 2
int sampleRate = source.frequency;// sample rate
// target sample count
int singleChannelSamples = Mathf.FloorToInt(duration * sampleRate);// samples per channel
int targetSamples = singleChannelSamples * channels;// samples across all channels
// source sample buffer
float[] sourceSamples = new float[source.samples * channels];
source.GetData(sourceSamples, 0);
// never read past the end of the source data
targetSamples = Mathf.Min(targetSamples, sourceSamples.Length);
if (targetSamples <= 0)
return null;
// copy the valid portion
float[] trimmedData = new float[targetSamples];
System.Array.Copy(sourceSamples, 0, trimmedData, 0, targetSamples);
// build the trimmed clip
AudioClip trimmedClip = AudioClip.Create(
"Record",
targetSamples / channels,
channels,
sampleRate,
false
);
trimmedClip.SetData(trimmedData, 0);
return trimmedClip;
}
}
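A minimal way to exercise RecordVoice on its own is sketched below; the class name and key binding are illustrative, and the component reference is assigned in the Inspector.
csharp
using UnityEngine;
// Sketch: hold Space to record, release to stop; logs how many WAV bytes were produced.
public class RecordVoiceDriver : MonoBehaviour
{
    [SerializeField] RecordVoice recordVoice;

    void Update()
    {
        if (Input.GetKeyDown(KeyCode.Space) && recordVoice.ExistMicrophoneDevice())
            recordVoice.Begin();
        if (Input.GetKeyUp(KeyCode.Space) && recordVoice.IsRecording)
        {
            recordVoice.End();                           // trims the clip and fills WavData
            if (recordVoice.WavData != null)
                Debug.Log("Recorded " + recordVoice.WavData.Length + " bytes of WAV data");
        }
    }
}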
Speech to text (speech recognition)
API used: the big-model flash file recognition API (大模型录音文件极速版识别API).
Convert the WAV byte array to a base64 string, upload it in the format the API requires, and read back the recognized text.
Set XApiKey to your own API key.
Note: the audio format must be WAV.
csharp
using System;
using System.Collections;
using UnityEngine;
using UnityEngine.Networking;
/// <summary>
/// Speech recognition
/// </summary>
public class SpeechRecognition : MonoBehaviour
{
string api = "https://openspeech.bytedance.com/api/v3/auc/bigmodel/recognize/flash";
string XApiKey = "";
string XApiResourceId = "volc.bigasr.auc_turbo";
string XApiRequestId => Guid.NewGuid().ToString();
string XApiSequence = "-1";
string modelName = "bigmodel";
public void Request(byte[] wavbytes, Action<bool, string> OnRespone)
{
RequesAudioUser user = new RequesAudioUser
{
uid = "speech-recognition"
};
RequestAudioData audio = new RequestAudioData
{
format = "wav",
data = Convert.ToBase64String(wavbytes)
};
RequestAudioModel request = new RequestAudioModel
{
model_name = modelName
};
RequestAudioToText requestAudioToText = new RequestAudioToText
{
user = user,
audio = audio,
request = request
};
var jsonData = JsonUtility.ToJson(requestAudioToText);
StartCoroutine(RequestAudioToText(jsonData, OnRespone));
}
IEnumerator RequestAudioToText(string jsonData, Action<bool, string> OnRespone)
{
using (UnityWebRequest request = new UnityWebRequest(api, UnityWebRequest.kHttpVerbPOST))
{
byte[] bodyRaw = System.Text.Encoding.UTF8.GetBytes(jsonData);
request.uploadHandler = new UploadHandlerRaw(bodyRaw);
request.downloadHandler = new DownloadHandlerBuffer();
request.SetRequestHeader("Content-Type", "application/json");
request.SetRequestHeader("X-Api-Key", XApiKey);
request.SetRequestHeader("X-Api-Resource-Id", XApiResourceId);
request.SetRequestHeader("X-Api-Request-Id", XApiRequestId);
request.SetRequestHeader("X-Api-Sequence", XApiSequence);
yield return request.SendWebRequest();
if (request.result != UnityWebRequest.Result.Success)
{
Debug.LogError("Request failed: " + request.error);
OnRespone?.Invoke(false, request.downloadHandler.text);
}
else
{
var text = request.downloadHandler.text;
var obj = JsonUtility.FromJson<ResponeAudioToText>(text);
if (obj != null && obj.result != null)
{
OnRespone?.Invoke(true, obj.result.text);
}
else
{
OnRespone?.Invoke(false, request.downloadHandler.text);
}
}
}
}
}
[System.Serializable]
public class RequestAudioToText
{
public RequesAudioUser user;
public RequestAudioData audio;
public RequestAudioModel request;
}
[System.Serializable]
public class RequesAudioUser
{
public string uid;// user identifier
}
[System.Serializable]
public class RequestAudioData
{
public string data;// base64-encoded audio content
public string format;// audio format, e.g. wav
}
[System.Serializable]
public class RequestAudioModel
{
public string model_name;// fixed value: bigmodel
}
[System.Serializable]
public class ResponeAudioToText
{
public ResponeAudioToTextResult result;
}
[System.Serializable]
public class ResponeAudioToTextResult
{
public string text;
}
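A minimal call site is sketched below; it assumes XApiKey has been filled in, that a RecordVoice component has already produced WavData, and the class name is illustrative.
csharp
using UnityEngine;
// Sketch: feed the recorded WAV bytes into SpeechRecognition and log the transcript.
public class RecognizeLastRecording : MonoBehaviour
{
    [SerializeField] RecordVoice recordVoice;
    [SerializeField] SpeechRecognition speechRecognition;

    public void Recognize()
    {
        if (recordVoice.WavData == null) return;
        speechRecognition.Request(recordVoice.WavData, (ok, text) =>
        {
            if (ok) Debug.Log("Recognized text: " + text);
            else Debug.LogError("Recognition failed, raw response: " + text);  // on failure the raw body is passed back
        });
    }
}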
Asking the question as text
Set apiKey and the chat model.
Append the recognized text to the message list with role set to user, send the conversation, and receive the reply text.
csharp
using System;
using System.Collections;
using System.Collections.Generic;
using UnityEngine;
using UnityEngine.Networking;
/// <summary>
/// Text questioning
/// </summary>
public class TextQuestions : MonoBehaviour
{
string baseUrl = "https://ark.cn-beijing.volces.com/api/v3/chat/completions";
string apiKey = "";
string model = "doubao-seed-2-0-code-preview-260215";
public void Request(List<Message> messages, Action<bool, ResponeChatMessage> onRespone)
{
RequestChat requestMessages = new RequestChat
{
model = model,
messages = messages
};
string jsonData = JsonUtility.ToJson(requestMessages);
StartCoroutine(PostChat(jsonData, onRespone));
}
IEnumerator PostChat(string jsonData, Action<bool, ResponeChatMessage> callback)
{
using (UnityWebRequest request = new UnityWebRequest(baseUrl, UnityWebRequest.kHttpVerbPOST))
{
byte[] bodyRaw = System.Text.Encoding.UTF8.GetBytes(jsonData);
request.uploadHandler = new UploadHandlerRaw(bodyRaw);
request.downloadHandler = new DownloadHandlerBuffer();
request.SetRequestHeader("Content-Type", "application/json");
request.SetRequestHeader("Authorization", "Bearer " + apiKey);
yield return request.SendWebRequest();
if (request.result == UnityWebRequest.Result.Success)
{
var text = request.downloadHandler.text;
var obj = JsonUtility.FromJson<ResponeChatMessage>(text);
if (obj != null)
{
callback?.Invoke(true, obj);
}
else
{
callback?.Invoke(false, null);
}
}
else
{
Debug.LogError("Request failed: " + request.error);
callback?.Invoke(false, null);
}
}
}
}
[Serializable]
public class RequestChat
{
public string model;
public List<Message> messages;
}
[Serializable]
public class Message
{
public string role;
public string content;
}
[Serializable]
public class ResponeChatMessage
{
public Choices[] choices;
}
[Serializable]
public class Choices
{
public string finish_reason;
public int index;
public Message message;
}
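Below is a sketch of driving TextQuestions with a running conversation. The system prompt is an optional extra not used elsewhere in this document; whether the endpoint accepts a system role should be checked against the Ark chat-completions documentation, and the class name is illustrative.
csharp
using System.Collections.Generic;
using UnityEngine;
// Sketch: keep a multi-turn history and append both the user question and the assistant reply.
public class ChatHistoryExample : MonoBehaviour
{
    [SerializeField] TextQuestions textQuestions;
    readonly List<Message> messages = new List<Message>
    {
        // assumption: the endpoint accepts a "system" role, as in OpenAI-style chat APIs
        new Message { role = "system", content = "You are a concise voice assistant." }
    };

    public void Ask(string question)
    {
        messages.Add(new Message { role = "user", content = question });
        textQuestions.Request(messages, (ok, reply) =>
        {
            if (!ok || reply.choices == null || reply.choices.Length == 0) return;
            Message answer = reply.choices[0].message;   // role is expected to be "assistant"
            messages.Add(answer);                        // keep it so the next turn has context
            Debug.Log("Assistant: " + answer.content);
        });
    }
}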
Converting the reply text to speech
Set XApiAppId, XApiAccessKey, and the speaker (voice).
Send the reply text, receive the audio data as streamed chunks, and convert the assembled data to an AudioClip.
csharp
using System;
using System.Collections;
using System.IO;
using System.Text;
using UnityEngine;
using UnityEngine.Networking;
/// <summary>
/// Speech synthesis
/// </summary>
public class TextToAudio : MonoBehaviour
{
[Header("Endpoint")]
string baseUrl = "https://openspeech.bytedance.com/api/v3/tts/unidirectional";
string XApiAppId = "";
string XApiAccessKey = "";
string XApiResourceId = "seed-tts-2.0";
[Header("Speaker")]
[SerializeField] string speaker = "zh_female_vv_uranus_bigtts";
[Header("Save synthesized audio")]
[SerializeField] bool save = true;
MemoryStream memoryStream;
public event Action<AudioClip> OnGetAudio;
public void Request(string text)
{
RequestAudio requestAudio = new RequestAudio();
AudioUser user = new AudioUser();
user.uid = Guid.NewGuid().ToString();
requestAudio.user = user;
RequestAudioParams requestAudioParams = new RequestAudioParams
{
speaker = speaker,
text = text
};
AudioParams audioParams = new AudioParams
{
format = "wav",
sample_rate = 44100
};
requestAudioParams.audio_params = audioParams;
requestAudio.req_params = requestAudioParams;
var message = JsonUtility.ToJson(requestAudio);
if (memoryStream != null)
{
memoryStream.Dispose();
memoryStream = null;
}
memoryStream = new MemoryStream();
StartCoroutine(PostTextToAudio(message, OnObjectReceive, OnCompleted));
}
void OnObjectReceive(string content)
{
try
{
var result = JsonUtility.FromJson<ResponeAudio>(content);
if (result != null && result.code == 0 && !string.IsNullOrEmpty(result.data))
{
byte[] chunk = Convert.FromBase64String(result.data);
memoryStream.Write(chunk, 0, chunk.Length);
}
}
catch (Exception e)
{
Debug.LogError(e.Message);
}
}
void OnCompleted()
{
byte[] audioData = null;
if (memoryStream != null)
{
audioData = memoryStream.ToArray();
memoryStream.Dispose();
memoryStream = null;
}
if (audioData != null && audioData.Length > 0)
{
OnGetAudio?.Invoke(WavUtility.WavToAudioClip(audioData, "tts"));
if (save)
File.WriteAllBytes(Guid.NewGuid() + ".wav", audioData);
}
}
IEnumerator PostTextToAudio(string jsonData, Action<string> OnObjectReceive, Action OnComplete)
{
using (UnityWebRequest request = new UnityWebRequest(baseUrl, UnityWebRequest.kHttpVerbPOST))
{
byte[] bodyRaw = System.Text.Encoding.UTF8.GetBytes(jsonData);
request.uploadHandler = new UploadHandlerRaw(bodyRaw);
var audioDownload = new TTSDownload();
request.downloadHandler = audioDownload;
audioDownload.OnComplete += OnComplete;
audioDownload.OnObjectReceived += OnObjectReceive;
request.SetRequestHeader("Content-Type", "application/json");
request.SetRequestHeader("X-Api-App-Id", XApiAppId);
request.SetRequestHeader("X-Api-Access-Key", XApiAccessKey);
request.SetRequestHeader("X-Api-Resource-Id", XApiResourceId);
yield return request.SendWebRequest();
if (request.result != UnityWebRequest.Result.Success)
Debug.LogError("Request failed: " + request.error);
audioDownload.OnComplete -= OnComplete;
audioDownload.OnObjectReceived -= OnObjectReceive;
}
}
void OnDestroy()
{
if (memoryStream != null)
{
memoryStream.Dispose();
memoryStream = null;
}
}
}
[Serializable]
public class RequestAudio
{
public AudioUser user;
public RequestAudioParams req_params;
}
[Serializable]
public class AudioUser
{
public string uid;
}
[Serializable]
public class RequestAudioParams
{
public string text;
public string speaker;
public AudioParams audio_params;
}
[Serializable]
public class AudioParams
{
public string format;
public int sample_rate;
}
[Serializable]
public class ResponeAudio
{
public int code;
public string message;
public string data;
}
public class TTSDownload : DownloadHandlerScript
{
private StringBuilder buffer = new StringBuilder();
public event Action<string> OnObjectReceived;
public event Action OnComplete;
public TTSDownload() : base(new byte[4096]) { }
protected override bool ReceiveData(byte[] data, int dataLength)
{
string chunk = Encoding.UTF8.GetString(data, 0, dataLength);
buffer.Append(chunk);
ProcessBuffer();
return true;
}
private void ProcessBuffer()
{
string content = buffer.ToString();
int splitStartIndex = -1;
int braceCount = 0;
int lastProcessed = 0;
for (int i = 0; i < content.Length; i++)
{
if (content[i] == '{')
{
if (braceCount == 0) splitStartIndex = i;
braceCount++;
}
else if (content[i] == '}')
{
braceCount--;
if (braceCount == 0 && splitStartIndex != -1)
{
string obj = content.Substring(splitStartIndex, i - splitStartIndex + 1);
OnObjectReceived?.Invoke(obj);
splitStartIndex = -1;
lastProcessed = i + 1;
}
}
}
if (lastProcessed > 0)
buffer.Remove(0, lastProcessed);
}
protected override void CompleteContent()
{
OnComplete?.Invoke();
}
}
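A minimal caller for TextToAudio is sketched below; it assumes XApiAppId and XApiAccessKey are filled in and the class name is illustrative. Playback here uses a plain AudioSource, while the AudioPlay helper in the next section adds an end-of-playback callback.
csharp
using UnityEngine;
// Sketch: request synthesis and play the clip once the streamed chunks have been assembled.
[RequireComponent(typeof(AudioSource))]
public class SpeakText : MonoBehaviour
{
    [SerializeField] TextToAudio textToAudio;
    AudioSource source;

    void Awake() { source = GetComponent<AudioSource>(); }
    void OnEnable() { textToAudio.OnGetAudio += OnClipReady; }
    void OnDisable() { textToAudio.OnGetAudio -= OnClipReady; }

    public void Speak(string text) { textToAudio.Request(text); }

    void OnClipReady(AudioClip clip)
    {
        source.clip = clip;
        source.Play();
    }
}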
Playing the audio
csharp
using System;
using System.Collections;
using UnityEngine;
/// <summary>
/// Audio playback
/// </summary>
[RequireComponent(typeof(AudioSource))]
public class AudioPlay : MonoBehaviour
{
AudioSource audioSource;
void Awake()
{
audioSource = GetComponent<AudioSource>();
}
public void Play(AudioClip clip, Action playend)
{
audioSource.clip = clip;
audioSource.Play();
StartCoroutine(PlayEnd(clip.length, playend));
}
IEnumerator PlayEnd(float duration, Action callback)
{
yield return new WaitForSeconds(duration);
callback?.Invoke();
}
}
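WaitForSeconds uses scaled time, so the end callback drifts if Time.timeScale changes and never adapts if playback is stopped early. A drop-in alternative for the class above that polls the AudioSource instead (a sketch, using the same audioSource field and a hypothetical coroutine name):
csharp
// Sketch: replace AudioPlay's fixed-duration wait with a poll of the AudioSource itself.
public void Play(AudioClip clip, Action playend)
{
    audioSource.clip = clip;
    audioSource.Play();
    StartCoroutine(WaitForPlaybackEnd(playend));
}

IEnumerator WaitForPlaybackEnd(Action callback)
{
    // isPlaying turns false when the clip finishes or is stopped
    yield return new WaitWhile(() => audioSource.isPlaying);
    callback?.Invoke();
}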
Audio conversion utility
Converts a WAV byte array to an AudioClip.
Converts an AudioClip to a WAV byte array.
csharp
using System;
using System.IO;
using UnityEngine;
public static class WavUtility
{
public static AudioClip WavToAudioClip(byte[] wavBytes, string clipName)
{
// parse the WAV header (assumes a canonical 44-byte PCM header)
int channels = wavBytes[22];// channel count
int sampleRate = BitConverter.ToInt32(wavBytes, 24);// sample rate
// start of the data chunk
int dataStartIndex = 44;// standard PCM WAV header size
int dataLength = wavBytes.Length - dataStartIndex;// payload length in bytes
// convert 16-bit PCM to float in [-1, 1]
float[] samples = new float[dataLength / 2];
for (int i = 0; i < samples.Length; i++)
{
short sample = BitConverter.ToInt16(wavBytes, dataStartIndex + i * 2);
samples[i] = sample / 32768f;// normalize to [-1, 1]
}
AudioClip clip = AudioClip.Create(clipName,
samples.Length / channels, channels, sampleRate, false);
clip.SetData(samples, 0);
return clip;
}
public static byte[] AudioClipToWav(AudioClip clip)
{
if (clip == null)
{
Debug.LogError("AudioClip is null!");
return null;
}
// audio parameters
int channels = clip.channels;// channel count
int sampleRate = clip.frequency;// sample rate
int samples = clip.samples; // samples per channel
// read the audio data (floats in the range -1.0 to 1.0)
float[] audioData = new float[samples * channels];
clip.GetData(audioData, 0);
// convert the floats to a 16-bit PCM byte array
byte[] pcmData = ConvertFloatTo16BitPCM(audioData);
// prepend the WAV file header
byte[] wavData = BuildWavHeader(pcmData, channels, sampleRate);
return wavData;
}
private static byte[] ConvertFloatTo16BitPCM(float[] floatSamples)
{
byte[] pcmBytes = new byte[floatSamples.Length * 2]; // 2 bytes per sample
int offset = 0;
foreach (float sample in floatSamples)
{
// map -1..1 to -32767..32767
short intSample = (short)(Mathf.Clamp(sample, -1f, 1f) * 32767f);
pcmBytes[offset++] = (byte)(intSample & 0xFF);
pcmBytes[offset++] = (byte)((intSample >> 8) & 0xFF);
}
return pcmBytes;
}
private static byte[] BuildWavHeader(byte[] pcmData, int channels, int sampleRate)
{
// format parameters
int bitsPerSample = 16;
int byteRate = sampleRate * channels * bitsPerSample / 8;
int blockAlign = channels * bitsPerSample / 8;
int dataSize = pcmData.Length;
int fileSize = 36 + dataSize; // RIFF chunk size = total file size - 8 (36 bytes of remaining header + data)
// write via MemoryStream
using (MemoryStream stream = new MemoryStream())
using (BinaryWriter writer = new BinaryWriter(stream))
{
// RIFF header
writer.Write(System.Text.Encoding.ASCII.GetBytes("RIFF"));
writer.Write(fileSize); // total file size - 8
writer.Write(System.Text.Encoding.ASCII.GetBytes("WAVE"));
// fmt sub-chunk
writer.Write(System.Text.Encoding.ASCII.GetBytes("fmt "));
writer.Write(16); // sub-chunk size (always 16 for PCM)
writer.Write((short)1); // audio format (1 = PCM)
writer.Write((short)channels); // channel count
writer.Write(sampleRate); // sample rate
writer.Write(byteRate); // byte rate
writer.Write((short)blockAlign); // block align
writer.Write((short)bitsPerSample); // bits per sample
// data sub-chunk
writer.Write(System.Text.Encoding.ASCII.GetBytes("data"));
writer.Write(dataSize); // data size
writer.Write(pcmData); // PCM payload
writer.Flush();
return stream.ToArray();
}
}
}
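WavToAudioClip assumes a canonical 44-byte header with 16-bit PCM, which matches both the recorder above and the WAV returned by the TTS request in this document. If WAV files from other sources need to be handled, a hedged sketch that walks the RIFF chunks to find the data payload (a static helper that could be added to WavUtility; the name is illustrative):
csharp
// Sketch: locate the "data" chunk instead of hard-coding offset 44.
// Returns the byte offset of the PCM payload and its length, or false if the chunk is not found.
public static bool TryFindDataChunk(byte[] wav, out int dataOffset, out int dataLength)
{
    dataOffset = 0;
    dataLength = 0;
    if (wav == null || wav.Length < 12) return false;
    int pos = 12;                                   // skip "RIFF" + chunk size + "WAVE"
    while (pos + 8 <= wav.Length)
    {
        string id = System.Text.Encoding.ASCII.GetString(wav, pos, 4);
        int size = BitConverter.ToInt32(wav, pos + 4);
        if (size < 0) return false;                 // corrupt header
        if (id == "data")
        {
            dataOffset = pos + 8;
            dataLength = Mathf.Min(size, wav.Length - dataOffset);
            return true;
        }
        pos += 8 + size + (size & 1);               // sub-chunks are padded to an even length
    }
    return false;
}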
Putting it all together
csharp
using System;
using System.Collections.Generic;
using UnityEngine;
/// <summary>
/// Voice questioning
/// </summary>
public class VoiceQuestioning : MonoBehaviour
{
[Header("Voice recording")]
[SerializeField] RecordVoice recordVoice;
[Header("Speech recognition")]
[SerializeField] SpeechRecognition speechRecognition;
[Header("Text questioning")]
[SerializeField] TextQuestions textQuestions;
List<Message> messages = new List<Message>();
[Header("Speech synthesis")]
[SerializeField] TextToAudio textToAudio;
[Header("Answer playback")]
[SerializeField] AudioPlay audioPlay;
VoiceQuestioningState currentState = VoiceQuestioningState.None;
public VoiceQuestioningState State => currentState;
public event Action OnPlayEnd;
public event Action OnError;
void Awake()
{
textToAudio.OnGetAudio += OnGetAnswerAudio;
}
public void RecordVoiceBegin()
{
currentState = VoiceQuestioningState.VoiceInput;
recordVoice.Begin();
}
public void RecordVoiceEnd()
{
if (currentState == VoiceQuestioningState.VoiceInput)
{
recordVoice.End();
if (recordVoice.WavData == null || recordVoice.WavData.Length == 0)
{
Debug.LogWarning("No usable recording, aborting the question");
OnError?.Invoke();
return;
}
RequestAudioToText(recordVoice.WavData);
}
}
public void ClearMessage()
{
messages.Clear();
}
void RequestAudioToText(byte[] wavBytes)
{
currentState = VoiceQuestioningState.VoiceToText;
speechRecognition.Request(wavBytes, ResponeAudioToText);
}
void ResponeAudioToText(bool sucess, string text)
{
if (sucess)
{
RequestTextQuestion(text);
}
else
{
Debug.LogError("Speech recognition failed");
OnError?.Invoke();
}
}
void RequestTextQuestion(string text)
{
currentState = VoiceQuestioningState.TextQuestion;
messages.Add(new Message()
{
role = "user",
content = text
});
textQuestions.Request(messages, ResponeTextQuestion);
}
void ResponeTextQuestion(bool sucess, ResponeChatMessage result)
{
if (sucess)
{
var choices = result.choices;
if (choices != null && choices.Length > 0)
{
var message = choices[choices.Length - 1].message;
messages.Add(message);// keep the assistant reply in the conversation history
RequestTextToAudio(message.content);
}
else
{
Debug.LogError("Reply contains no choices");
OnError?.Invoke();
}
}
else
{
Debug.LogError("Text question failed");
OnError?.Invoke();
}
}
void RequestTextToAudio(string text)
{
currentState = VoiceQuestioningState.TextToAudio;
textToAudio.Request(text);
}
void OnGetAnswerAudio(AudioClip audioClip)
{
currentState = VoiceQuestioningState.PlayAnswer;
if (audioClip)
audioPlay.Play(audioClip, OnPlayAnswerEnd);
else
{
Debug.LogError("Failed to convert the answer audio");
OnError?.Invoke();
}
}
void OnPlayAnswerEnd()
{
currentState = VoiceQuestioningState.PlayAnswerEnd;
OnPlayEnd?.Invoke();
}
void OnDestroy()
{
textToAudio.OnGetAudio -= OnGetAnswerAudio;
}
}
public enum VoiceQuestioningState
{
None,
/// <summary>
/// Voice input
/// </summary>
VoiceInput,
/// <summary>
/// Speech to text
/// </summary>
VoiceToText,
/// <summary>
/// Text question
/// </summary>
TextQuestion,
/// <summary>
/// Text to speech
/// </summary>
TextToAudio,
/// <summary>
/// Playing the answer
/// </summary>
PlayAnswer,
/// <summary>
/// Playback finished
/// </summary>
PlayAnswerEnd,
}
Example
csharp
using System;
using UnityEngine;
public class UseVoiceQuestion : MonoBehaviour
{
[SerializeField] VoiceQuestioning voiceQuestioning;
bool canRecordVoice = true;
float questionTimer = 0;
void Awake()
{
voiceQuestioning.OnPlayEnd += OnPlayEnd;
voiceQuestioning.OnError += OnError;
}
void OnPlayEnd()
{
canRecordVoice = true;
}
void OnError()
{
canRecordVoice = true;
}
void Update()
{
if (canRecordVoice)
{
if (Input.GetKeyDown(KeyCode.Space))
{
questionTimer = 0;
voiceQuestioning.RecordVoiceBegin();
}
if (Input.GetKeyUp(KeyCode.Space))
{
canRecordVoice = false;
voiceQuestioning.RecordVoiceEnd();
}
}
else
{
questionTimer += Time.deltaTime;
}
}
void OnDestroy()
{
voiceQuestioning.OnError -= OnError;
voiceQuestioning.OnPlayEnd -= OnPlayEnd;
}
void OnGUI()
{
GUI.skin.label.fontSize = 20;
GUILayout.Label("Hold the space bar to record your question; release it to send");
GUILayout.Label("Space-bar recording available: " +
(canRecordVoice ?
"<color=green>yes</color>" : "<color=red>no</color>"));
string state = String.Empty;
switch (voiceQuestioning.State)
{
case VoiceQuestioningState.None:
state = "Idle";
break;
case VoiceQuestioningState.VoiceInput:
state = "Voice input";
break;
case VoiceQuestioningState.VoiceToText:
state = "Speech to text";
break;
case VoiceQuestioningState.TextQuestion:
state = "Text question";
break;
case VoiceQuestioningState.TextToAudio:
state = "Text to speech";
break;
case VoiceQuestioningState.PlayAnswer:
state = "Playing answer";
break;
case VoiceQuestioningState.PlayAnswerEnd:
state = "Playback finished";
break;
}
GUILayout.Label("Voice questioning state: " + state);
if (canRecordVoice)
{
GUILayout.Label("Last question took (s): " + questionTimer);
}
else
{
GUILayout.Label("Question elapsed (s): " + questionTimer);
}
}
}