语音识别
使用前先在 SpeechRecognition 中填写 XApiKey 和 XApiResourceId。
调用 Request 方法,传入音频数据(AudioClip 或 WAV 字节数组),识别结果通过回调返回。
csharp
using System;
using System.Collections;
using UnityEngine;
using UnityEngine.Networking;
/// <summary>
/// Speech recognition: uploads audio to the recognition endpoint and reports the
/// transcribed text.
/// </summary>
/// <remarks>
/// Fill in <c>XApiKey</c> and <c>XApiResourceId</c> before use, then call one of the
/// <c>Request</c> overloads. The outcome is delivered through the callback as
/// (success, text): the recognised text on success, otherwise the raw response body or a
/// short description of the local failure.
/// </remarks>
public class SpeechRecognition : MonoBehaviour
{
    // Endpoint the JSON request is POSTed to.
    string api = "https://openspeech.bytedance.com/api/v3/auc/bigmodel/recognize/flash";
    // Sent as the X-Api-Key header; must be filled in before use.
    string XApiKey = "";
    // Sent as the X-Api-Resource-Id header; must be filled in before use.
    string XApiResourceId = "";
    // A fresh GUID is produced on every read, so each request carries its own id.
    string XApiRequestId => Guid.NewGuid().ToString();
    // Sent as the X-Api-Sequence header.
    string XApiSequence = "-1";
    // Sent as request.model_name in the JSON body.
    string modelName = "bigmodel";

    /// <summary>
    /// Converts the clip to 16-bit PCM WAV and submits it for recognition.
    /// </summary>
    /// <param name="audioClip">Clip to transcribe.</param>
    /// <param name="OnRespone">Invoked with (true, text) on success or (false, details) on failure.</param>
    public void Request(AudioClip audioClip, Action<bool, string> OnRespone)
    {
        // AudioClipToWav returns null for a null clip; the byte[] overload reports
        // that through the callback instead of throwing.
        byte[] wav = WavUtility.AudioClipToWav(audioClip);
        Request(wav, OnRespone);
    }

    /// <summary>
    /// Submits WAV bytes for recognition.
    /// </summary>
    /// <param name="wavbytes">Complete WAV file content.</param>
    /// <param name="OnRespone">Invoked with (true, text) on success or (false, details) on failure.</param>
    public void Request(byte[] wavbytes, Action<bool, string> OnRespone)
    {
        if (wavbytes == null || wavbytes.Length == 0)
        {
            Debug.LogError("Speech recognition skipped: audio data is null or empty.");
            OnRespone?.Invoke(false, "Audio data is null or empty.");
            return;
        }

        if (string.IsNullOrEmpty(XApiKey) || string.IsNullOrEmpty(XApiResourceId))
        {
            // Fail fast with a clear message rather than sending empty credential headers.
            Debug.LogError("Speech recognition skipped: XApiKey / XApiResourceId are not set.");
            OnRespone?.Invoke(false, "XApiKey / XApiResourceId are not set.");
            return;
        }

        RequestAudioToText payload = new RequestAudioToText
        {
            user = new RequesAudioUser { uid = "语音识别" },
            audio = new RequestAudioData
            {
                format = "wav",
                data = Convert.ToBase64String(wavbytes)
            },
            request = new RequestAudioModel { model_name = modelName }
        };

        string jsonData = JsonUtility.ToJson(payload);
        StartCoroutine(PostRecognitionRequest(jsonData, OnRespone));
    }

    /// <summary>
    /// Coroutine that POSTs the JSON body, waits for the response and reports the result.
    /// </summary>
    IEnumerator PostRecognitionRequest(string jsonData, Action<bool, string> OnRespone)
    {
        using (UnityWebRequest request = new UnityWebRequest(api, UnityWebRequest.kHttpVerbPOST))
        {
            byte[] bodyRaw = System.Text.Encoding.UTF8.GetBytes(jsonData);
            request.uploadHandler = new UploadHandlerRaw(bodyRaw);
            request.downloadHandler = new DownloadHandlerBuffer();
            request.SetRequestHeader("Content-Type", "application/json");
            request.SetRequestHeader("X-Api-Key", XApiKey);
            request.SetRequestHeader("X-Api-Resource-Id", XApiResourceId);
            request.SetRequestHeader("X-Api-Request-Id", XApiRequestId);
            request.SetRequestHeader("X-Api-Sequence", XApiSequence);

            yield return request.SendWebRequest();

            string responseText = request.downloadHandler.text;

            if (request.result != UnityWebRequest.Result.Success)
            {
                Debug.LogError("请求失败:" + request.error);
                OnRespone?.Invoke(false, responseText);
                yield break;
            }

            ResponeAudioToText parsed = null;
            try
            {
                parsed = JsonUtility.FromJson<ResponeAudioToText>(responseText);
            }
            catch (ArgumentException e)
            {
                // A body that is not valid JSON must still reach the caller as a
                // failure; an unhandled exception here would stop the coroutine
                // before the callback runs.
                Debug.LogError("Failed to parse recognition response: " + e.Message);
            }

            if (parsed != null && parsed.result != null)
            {
                OnRespone?.Invoke(true, parsed.result.text);
            }
            else
            {
                OnRespone?.Invoke(false, responseText);
            }
        }
    }
}
/// <summary>
/// Root object of the recognition request; SpeechRecognition serializes it with
/// JsonUtility.ToJson, so the field names are the JSON keys.
/// </summary>
[System.Serializable]
public class RequestAudioToText
{
public RequesAudioUser user;// Serialized under the "user" key
public RequestAudioData audio;// Audio payload (Base64 data plus format)
public RequestAudioModel request;// Model selection
}
/// <summary>
/// "user" section of the recognition request.
/// </summary>
[System.Serializable]
public class RequesAudioUser
{
public string uid;// Labelled "app key" by the original author; SpeechRecognition in this file sends a fixed label here
}
/// <summary>
/// "audio" section of the recognition request.
/// </summary>
[System.Serializable]
public class RequestAudioData
{
public string data;// Base64-encoded audio content
public string format;// Audio format (SpeechRecognition sends "wav")
}
/// <summary>
/// "request" section of the recognition request.
/// </summary>
[System.Serializable]
public class RequestAudioModel
{
public string model_name;// Model name (SpeechRecognition sends "bigmodel")
}
/// <summary>
/// Shape that the recognition response JSON is parsed into with JsonUtility.FromJson;
/// only the "result" key is mapped.
/// </summary>
[System.Serializable]
public class ResponeAudioToText
{
public ResponeAudioToTextResult result;// Mapped from the "result" key; SpeechRecognition treats a null value as a failed request
}
/// <summary>
/// "result" section of the recognition response.
/// </summary>
[System.Serializable]
public class ResponeAudioToTextResult
{
public string text;// Text that SpeechRecognition hands to the success callback
}
工具类:WavUtility(AudioClip 与 WAV 字节数组互相转换)
csharp
using System;
using System.IO;
using UnityEngine;
/// <summary>
/// Converts between Unity <see cref="AudioClip"/> objects and 16-bit PCM WAV byte arrays.
/// </summary>
public static class WavUtility
{
    private const int RiffHeaderSize = 12;      // "RIFF" + 4-byte size + "WAVE"
    private const int ChunkHeaderSize = 8;      // 4-byte chunk id + 4-byte chunk size
    private const int CanonicalHeaderSize = 44; // header length of a minimal PCM WAV file
    private const int MinFmtChunkSize = 16;     // size of the PCM "fmt " chunk body
    private const int PcmFormatTag = 1;
    private const int BitsPerSample = 16;
    private const int BytesPerSample = BitsPerSample / 8;

    /// <summary>
    /// Decodes a 16-bit PCM WAV file into an <see cref="AudioClip"/>.
    /// </summary>
    /// <param name="wavBytes">Complete WAV file content.</param>
    /// <param name="clipName">Name given to the created clip.</param>
    /// <returns>
    /// The decoded clip, or null (after logging an error) when the input is missing,
    /// too short, not 16-bit PCM, or contains no complete sample frame.
    /// </returns>
    public static AudioClip WavToAudioClip(byte[] wavBytes, string clipName)
    {
        if (wavBytes == null || wavBytes.Length < CanonicalHeaderSize)
        {
            Debug.LogError("WAV data is null or shorter than a WAV header!");
            return null;
        }

        int channels;
        int sampleRate;
        int dataStart;
        int dataLength;
        int fmtStart;
        int fmtLength;

        bool hasFmt = TryFindChunk(wavBytes, "fmt ", out fmtStart, out fmtLength)
                      && fmtLength >= MinFmtChunkSize;
        bool hasData = TryFindChunk(wavBytes, "data", out dataStart, out dataLength);

        if (hasFmt && hasData)
        {
            // Locate the real chunks: files may carry extra chunks (e.g. LIST) or an
            // extended fmt chunk, so the sample data does not always start at byte 44.
            int formatTag = BitConverter.ToUInt16(wavBytes, fmtStart);
            int bitsPerSample = BitConverter.ToUInt16(wavBytes, fmtStart + 14);
            if (formatTag != PcmFormatTag || bitsPerSample != BitsPerSample)
            {
                Debug.LogError("Unsupported WAV encoding: only 16-bit PCM is supported!");
                return null;
            }

            channels = BitConverter.ToUInt16(wavBytes, fmtStart + 2);
            sampleRate = BitConverter.ToInt32(wavBytes, fmtStart + 4);
        }
        else
        {
            // No RIFF/WAVE chunk structure found: fall back to the fixed canonical
            // layout (channels at 22, sample rate at 24, data at 44).
            channels = BitConverter.ToUInt16(wavBytes, 22);
            sampleRate = BitConverter.ToInt32(wavBytes, 24);
            dataStart = CanonicalHeaderSize;
            dataLength = wavBytes.Length - CanonicalHeaderSize;
        }

        if (channels <= 0 || sampleRate <= 0)
        {
            Debug.LogError("WAV header declares an invalid channel count or sample rate!");
            return null;
        }

        // Only whole frames (one sample per channel) are decoded.
        int frameCount = dataLength / BytesPerSample / channels;
        if (frameCount <= 0)
        {
            Debug.LogError("WAV data contains no audio samples!");
            return null;
        }

        // Convert 16-bit PCM to floats in [-1, 1).
        float[] samples = new float[frameCount * channels];
        for (int i = 0; i < samples.Length; i++)
        {
            short sample = BitConverter.ToInt16(wavBytes, dataStart + i * BytesPerSample);
            samples[i] = sample / 32768f;
        }

        AudioClip clip = AudioClip.Create(clipName, frameCount, channels, sampleRate, false);
        clip.SetData(samples, 0);
        return clip;
    }

    /// <summary>
    /// Encodes a clip as a 16-bit PCM WAV file.
    /// </summary>
    /// <param name="clip">Clip to encode.</param>
    /// <returns>WAV file bytes, or null (after logging an error) when the clip is null.</returns>
    public static byte[] AudioClipToWav(AudioClip clip)
    {
        if (clip == null)
        {
            Debug.LogError("AudioClip is null!");
            return null;
        }

        int channels = clip.channels;
        int sampleRate = clip.frequency;
        int samplesPerChannel = clip.samples;

        // Interleaved float samples.
        float[] audioData = new float[samplesPerChannel * channels];
        clip.GetData(audioData, 0);

        byte[] pcmData = ConvertFloatTo16BitPCM(audioData);
        return BuildWavHeader(pcmData, channels, sampleRate);
    }

    /// <summary>
    /// Searches the RIFF chunk list for the chunk with the given four-character id.
    /// </summary>
    /// <param name="bytes">WAV file content.</param>
    /// <param name="id">Four-character chunk id, e.g. "fmt " or "data".</param>
    /// <param name="start">Offset of the chunk body when found.</param>
    /// <param name="length">Chunk body length, clamped to the bytes actually present.</param>
    /// <returns>True when the buffer is RIFF/WAVE and contains the chunk.</returns>
    private static bool TryFindChunk(byte[] bytes, string id, out int start, out int length)
    {
        start = 0;
        length = 0;

        if (!HasTag(bytes, 0, "RIFF") || !HasTag(bytes, 8, "WAVE"))
        {
            return false;
        }

        int pos = RiffHeaderSize;
        while (pos + ChunkHeaderSize <= bytes.Length)
        {
            int bodyStart = pos + ChunkHeaderSize;
            int available = bytes.Length - bodyStart;
            uint declared = BitConverter.ToUInt32(bytes, pos + 4);
            // Clamp so a wrong or oversized declared size cannot run past the buffer.
            int size = declared > (uint)available ? available : (int)declared;

            if (HasTag(bytes, pos, id))
            {
                start = bodyStart;
                length = size;
                return true;
            }

            // Chunks are word-aligned: an odd-sized body is followed by one pad byte.
            pos = bodyStart + size + (size & 1);
        }

        return false;
    }

    /// <summary>
    /// Checks whether the ASCII tag is present at the given offset.
    /// </summary>
    private static bool HasTag(byte[] bytes, int offset, string tag)
    {
        if (offset < 0 || offset + tag.Length > bytes.Length)
        {
            return false;
        }

        for (int i = 0; i < tag.Length; i++)
        {
            if (bytes[offset + i] != (byte)tag[i])
            {
                return false;
            }
        }

        return true;
    }

    /// <summary>
    /// Converts float samples to little-endian 16-bit PCM bytes, clamping each sample
    /// to [-1, 1] first.
    /// </summary>
    private static byte[] ConvertFloatTo16BitPCM(float[] floatSamples)
    {
        byte[] pcmBytes = new byte[floatSamples.Length * BytesPerSample];
        int offset = 0;
        foreach (float sample in floatSamples)
        {
            // Map -1..1 to -32767..32767.
            short intSample = (short)(Mathf.Clamp(sample, -1f, 1f) * 32767f);
            pcmBytes[offset++] = (byte)(intSample & 0xFF);
            pcmBytes[offset++] = (byte)((intSample >> 8) & 0xFF);
        }
        return pcmBytes;
    }

    /// <summary>
    /// Builds a complete WAV file: a 44-byte PCM header followed by the sample data.
    /// </summary>
    private static byte[] BuildWavHeader(byte[] pcmData, int channels, int sampleRate)
    {
        int byteRate = sampleRate * channels * BitsPerSample / 8;
        int blockAlign = channels * BitsPerSample / 8;
        int dataSize = pcmData.Length;
        // RIFF size field = total file size minus the 8-byte "RIFF" + size prefix.
        int riffSize = CanonicalHeaderSize - 8 + dataSize;

        using (MemoryStream stream = new MemoryStream(CanonicalHeaderSize + dataSize))
        using (BinaryWriter writer = new BinaryWriter(stream))
        {
            // RIFF header
            writer.Write(System.Text.Encoding.ASCII.GetBytes("RIFF"));
            writer.Write(riffSize);
            writer.Write(System.Text.Encoding.ASCII.GetBytes("WAVE"));
            // fmt chunk
            writer.Write(System.Text.Encoding.ASCII.GetBytes("fmt "));
            writer.Write(MinFmtChunkSize);          // chunk size (16 for PCM)
            writer.Write((short)PcmFormatTag);      // audio format (1 = PCM)
            writer.Write((short)channels);
            writer.Write(sampleRate);
            writer.Write(byteRate);
            writer.Write((short)blockAlign);
            writer.Write((short)BitsPerSample);
            // data chunk
            writer.Write(System.Text.Encoding.ASCII.GetBytes("data"));
            writer.Write(dataSize);
            writer.Write(pcmData);
            writer.Flush();
            return stream.ToArray();
        }
    }
}