Getting the resources
- OpenAI official site
- Azure OpenAI
I went with the second option, getting the model resource from Azure. To do that, first register for Azure, add an Azure OpenAI resource, and deploy the GPT-4o Realtime model. After deployment you get an endpoint and a key, in a format like the following:
Endpoint
```bash
https://openaitest.azure.com/openai/realtime?api-version=2024-10-01-preview&deployment=gpt-4o-realtime-preview
```
API KEY
```bash
abcdefghijklmnopqrstuvwxyz123456789
```
Note that when connecting over WebSocket, the endpoint's https scheme is replaced with wss, as in the code below.
Backend implementation
Reference: the official OpenAI docs at https://platform.openai.com/docs/guides/realtime-model-capabilities
The OpenAI Realtime API cannot be called with plain HTTP requests; it only supports WebSocket or WebRTC. This article covers the WebSocket approach only.
The language used here is C#.
Example code for calling the API:
```csharp
using System;
using System.Collections.Generic;
using System.IO;
using System.Linq;
using System.Net.WebSockets;
using System.Text;
using System.Threading;
using System.Threading.Tasks;
using Microsoft.AspNetCore.Http;
using Newtonsoft.Json;
using Newtonsoft.Json.Linq;
namespace RealTime.Modules.Common.Api
{
public class RealTimeApi
{
public static List<RealTimeConnect> RealTimeConnectList = new List<RealTimeConnect>();
public record RealTimeConnect(string id, ClientWebSocket client, string conversationItemID, CancellationTokenSource cts);
/// <summary>
/// Create a connection
/// </summary>
/// <returns>Connection ID, used for subsequent questions on the same session</returns>
public async Task<string> CreatedConnect(string prompt = "") {
if (string.IsNullOrEmpty(prompt))
prompt = "Your answer can only be a translation of what I said";
string API_KEY = "YOUR API KEY";
string ENDPOINT = "wss://YOUR ENDPOINT/openai/realtime?api-version=2024-10-01-preview&deployment=gpt-4o-realtime-preview";
var ws = new ClientWebSocket();
ws.Options.SetRequestHeader("api-key", $"{API_KEY}");
var cts = new CancellationTokenSource();
await ws.ConnectAsync(new Uri(ENDPOINT), cts.Token);
string id = Guid.NewGuid().ToString("N");
await SendAsync(ws, cts, new
{
type = "session.update",
session = new
{
instructions = prompt
}
});
Console.WriteLine("提示词发送成功");
await SendAsync(ws, cts, new
{
type = "conversation.item.create",
item = new
{
type = "message",
role = "user",
content = new[]
{
new {
type = "input_text",
text = prompt
}
}
}
});
Console.WriteLine("初始化问题发送成功");
var conversationItemID = string.Empty;
var buffer = new byte[4096];
while (ws.State == WebSocketState.Open)
{
using (var ms = new MemoryStream())
{
WebSocketReceiveResult result;
do
{
result = await ws.ReceiveAsync(new ArraySegment<byte>(buffer), cts.Token);
if (result.MessageType == WebSocketMessageType.Close)
{
await ws.CloseAsync(WebSocketCloseStatus.NormalClosure, "", cts.Token);
break;
}
ms.Write(buffer, 0, result.Count);
}
while (!result.EndOfMessage);
if (result.MessageType == WebSocketMessageType.Close) break;
var fullMessageBytes = ms.ToArray();
if (result.MessageType == WebSocketMessageType.Text)
{
var jsonStr = Encoding.UTF8.GetString(fullMessageBytes);
var msg = JObject.Parse(jsonStr);
switch (msg["type"].Value<string>())
{
case "conversation.item.created":
conversationItemID = msg["item"]["id"].Value<string>();
string conversationContent = msg["item"]["content"].Children().First()["text"].Value<string>();
Console.WriteLine($"Got initial conversation item ID: {conversationItemID}, content: {conversationContent}");
goto loop_exit;
default:
break;
}
}
}
}
loop_exit:;
if (string.IsNullOrEmpty(conversationItemID))
throw new Exception("Failed to send the initial prompt");
RealTimeConnectList.Add(new RealTimeConnect(id, ws, conversationItemID, cts));
return id;
}
/// <summary>
/// Ask a question
/// </summary>
/// <param name="id">Connection ID created by CreatedConnect</param>
/// <param name="question">The question text, or the Base64 string of a WAV file</param>
/// <param name="isAudio">Whether the input is audio</param>
/// <returns>The answer</returns>
/// <exception cref="NullReferenceException"></exception>
public async Task<string> SendQuestion(string id, string question, bool isAudio = true)
{
var connect = RealTimeConnectList.FirstOrDefault(x => x.id == id);
if (connect == null)
throw new NullReferenceException("Connection does not exist");
var audioChunks = new List<byte>();
var response = string.Empty;
var ws = connect.client;
var cts = connect.cts;
var content = new List<dynamic>() { };
var input = new List<dynamic>
{
new
{
type = "item_reference",
id = string.IsNullOrEmpty(connect.conversationItemID)?"":connect.conversationItemID
},
new
{
type = "message",
role = "user",
content = content
}
};
if (isAudio)
{
content.Add(new
{
type = "input_audio",//当输入问题为音频Base64需要type为input_audio
audio = question
});
}
else
{
content.Add(new
{
type = "input_text",//当输入问题为真实问题时需要type为input_text
text = question
});
}
// Send the question
await SendAsync(ws, cts, new
{
type = "response.create",
response = new
{
conversation = "none",
metadata = new { topic = "translate" },
modalities = new[] { /*"audio", */"text" },
//instructions = question
input = input
}
});
// Receive loop
var buffer = new byte[4096]; // buffer size is adjustable
var conversationItemID = string.Empty;
while (ws.State == WebSocketState.Open)
{
using (var ms = new MemoryStream())
{
WebSocketReceiveResult result;
do
{
result = await ws.ReceiveAsync(new ArraySegment<byte>(buffer), cts.Token);
if (result.MessageType == WebSocketMessageType.Close)
{
await ws.CloseAsync(WebSocketCloseStatus.NormalClosure, "", cts.Token);
break;
}
ms.Write(buffer, 0, result.Count);
}
while (!result.EndOfMessage); // keep receiving until the message ends; payloads over 4096 bytes arrive in multiple frames
if (result.MessageType == WebSocketMessageType.Close) break;
var fullMessageBytes = ms.ToArray();
if (result.MessageType == WebSocketMessageType.Text)
{
var jsonStr = Encoding.UTF8.GetString(fullMessageBytes);
var msg = JObject.Parse(jsonStr);
switch (msg["type"].Value<string>())
{
case "response.done"://回答结束
{
Console.WriteLine($"提问结束:{question}");
goto loop_exit;
}
case "error":
Console.WriteLine($"错误: {msg["error"]}");
goto loop_exit;
case "response.audio_transcript.delta":
Console.WriteLine($"识别文本: {msg["delta"]}");
response += msg["delta"].Value<string>();
break;
case "response.text.delta":
Console.WriteLine($"识别文本: {msg["delta"]}");
response += msg["delta"].Value<string>();
break;
case "response.audio.delta"://当response.create的modalities参数指定audio时,会返回语音回答
var audioData = Convert.FromBase64String(msg["delta"].Value<string>());
Console.WriteLine($"收到音频数据: {audioData.Length}字节");
audioChunks.AddRange(audioData);
break;
}
}
}
}
loop_exit:;
// Save the audio file (audio is returned only when response.create's modalities includes "audio")
if (audioChunks.Count > 0)
{
var totalAudio = audioChunks.ToArray();
var header = CreateWavHeader(
sampleRate: 16000,
bitsPerSample: 16,
channels: 1,
dataSize: totalAudio.Length
);
File.WriteAllBytes(@"D:\test\output.wav", CombineBytes(header, totalAudio));
Console.WriteLine("音频文件已保存为 output.wav");
}
else
{
Console.WriteLine("未接收到音频数据");
}
Console.WriteLine("回答:" + response);
return response;
}
public async Task CloseConnect(string id)
{
var connect = RealTimeConnectList.FirstOrDefault(x => x.id == id);
if (connect == null)
throw new NullReferenceException("Connection does not exist");
await connect.client.CloseAsync(WebSocketCloseStatus.NormalClosure, "", connect.cts.Token);
RealTimeConnectList.Remove(connect); // drop the closed connection from the list
}
// Convert a file to a Base64 string
public static string ConvertFileToBase64(string filePath)
{
try
{
// Read all bytes of the file
byte[] fileBytes = File.ReadAllBytes(filePath);
// Convert the byte array to a Base64 string
string base64String = Convert.ToBase64String(fileBytes);
return base64String;
}
catch (Exception ex)
{
Console.WriteLine($"转换失败: {ex.Message}");
return null;
}
}
// Convert an uploaded IFormFile to a Base64 string
public static async Task<string> ConvertToBase64Async(IFormFile file)
{
if (file == null || file.Length == 0)
throw new ArgumentException("文件不能为空");
using (var memoryStream = new MemoryStream())
{
// 将文件内容复制到内存流
await file.CopyToAsync(memoryStream);
// 获取字节数组并转换为Base64
byte[] fileBytes = memoryStream.ToArray();
return Convert.ToBase64String(fileBytes);
}
}
// Send a JSON message over the WebSocket
public async Task SendAsync(ClientWebSocket ws, CancellationTokenSource cts, object obj)
{
var json = JsonConvert.SerializeObject(obj);
await ws.SendAsync(
Encoding.UTF8.GetBytes(json),
WebSocketMessageType.Text,
true,
cts.Token);
}
public static byte[] CombineBytes(params byte[][] arrays)
{
var output = new MemoryStream();
foreach (var arr in arrays)
{
output.Write(arr, 0, arr.Length);
}
return output.ToArray();
}
public static byte[] CreateWavHeader(int sampleRate, int bitsPerSample, int channels, int dataSize)
{
using (var ms = new MemoryStream())
using (var writer = new BinaryWriter(ms))
{
// RIFF header
writer.Write(Encoding.ASCII.GetBytes("RIFF"));
writer.Write(dataSize + 36); // total chunk size
writer.Write(Encoding.ASCII.GetBytes("WAVE"));
// fmt sub-chunk
writer.Write(Encoding.ASCII.GetBytes("fmt "));
writer.Write(16); // fmt chunk length
writer.Write((short)1); // PCM format
writer.Write((short)channels);
writer.Write(sampleRate);
writer.Write(sampleRate * channels * bitsPerSample / 8); // byte rate
writer.Write((short)(channels * bitsPerSample / 8)); // block align
writer.Write((short)bitsPerSample);
// data sub-chunk
writer.Write(Encoding.ASCII.GetBytes("data"));
writer.Write(dataSize);
return ms.ToArray();
}
}
}
}
```
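Before wiring this into a web API, a quick text-only smoke test of the class can look like this (a minimal sketch that constructs RealTimeApi directly rather than resolving it through DI; fill in API_KEY and ENDPOINT first):

```csharp
// Minimal smoke test for RealTimeApi: text in, text out.
using System;
using RealTime.Modules.Common.Api;

var api = new RealTimeApi();
string connectId = await api.CreatedConnect("Your answer can only be a translation of what I said");
string answer = await api.SendQuestion(connectId, "Hello, how are you?", isAudio: false);
Console.WriteLine(answer);
await api.CloseConnect(connectId);
```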
The Realtime API accepts speech as WAV files, but if the file's channel count and bitrate are off, recognition degrades. So if the input file's bitrate is above 256 kbps, lower it before sending.
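As a quick pre-flight check, you can read the relevant fields straight out of the WAV header before deciding whether to convert. A minimal sketch, assuming a canonical 44-byte PCM WAV header (the same layout CreateWavHeader above writes); the file path is a placeholder:

```csharp
// Minimal sketch: read channel count, sample rate and byte rate from a
// canonical 44-byte PCM WAV header to decide whether conversion is needed.
using System;
using System.IO;

static (short channels, int sampleRate, int byteRate, short bitsPerSample) ReadWavFormat(string path)
{
    using var reader = new BinaryReader(File.OpenRead(path));
    reader.BaseStream.Seek(22, SeekOrigin.Begin); // skip RIFF/WAVE/fmt preamble
    short channels = reader.ReadInt16();          // offset 22
    int sampleRate = reader.ReadInt32();          // offset 24
    int byteRate = reader.ReadInt32();            // offset 28, bytes per second
    reader.BaseStream.Seek(34, SeekOrigin.Begin);
    short bitsPerSample = reader.ReadInt16();     // offset 34
    return (channels, sampleRate, byteRate, bitsPerSample);
}

var (ch, sr, br, bits) = ReadWavFormat(@"UploadFile\Org\sample.wav"); // placeholder path
Console.WriteLine($"{ch} channel(s), {sr} Hz, {bits}-bit, ~{br * 8 / 1000} kbps");
// 16 kHz mono 16-bit PCM works out to exactly 256 kbps; anything above that,
// or more than one channel, is worth converting first.
```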
Processing audio with ffmpeg
Here ffmpeg is used to do the processing.
Only the C# approach is shown; Java and Python presumably have more convenient options.
First, go to the ffmpeg download page: https://ffmpeg.org/download.html
On the download page, pick the build for your platform, grab the latest release, and extract the archive.
If you later want to deploy to Linux/Docker, download the Linux build as well (skip this if you deploy on Windows IIS).
On Windows only the three executables in the bin folder are needed.
On Linux only two files are needed (ffmpeg and ffprobe).
Put these files into the project; here they live under ff/bin.
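One Linux/Docker gotcha worth handling up front: binaries copied into an image can lose their execute bit. A hedged sketch (assumes .NET 7+ for the UnixFileMode APIs) that restores it at startup:

```csharp
// Hedged sketch (assumes .NET 7+): ensure the bundled ffmpeg binaries are
// executable when running on Linux, e.g. inside a Docker container.
using System;
using System.IO;

if (OperatingSystem.IsLinux())
{
    foreach (var file in new[] { "ff/bin/ffmpeg", "ff/bin/ffprobe" })
    {
        if (File.Exists(file))
            File.SetUnixFileMode(file, File.GetUnixFileMode(file) | UnixFileMode.UserExecute);
    }
}
```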
Conversion code
```csharp
using Xabe.FFmpeg;
/// <summary>
/// Convert a WAV file's bitrate (to mono, 16 kHz, 16-bit PCM)
/// </summary>
/// <param name="filePath">Path of the input file</param>
/// <returns>Base64 of the converted file, plus the converted file's path</returns>
public static async Task<(string, string)> ConvertVideoAsync(string filePath)
{
// Set the directory that contains the ffmpeg executables
FFmpeg.SetExecutablesPath(@"ff/bin");
// Fallback: download ffmpeg automatically (not used here, too slow)
//await FFmpegDownloader.GetLatestVersion(FFmpegVersion.Official);
var mediaInfo = await FFmpeg.GetMediaInfo(filePath);
string saveDirectory = @"UploadFile\Convert";
if (!Directory.Exists(saveDirectory))
Directory.CreateDirectory(saveDirectory);
string outputFileName = $"{Path.GetFileName(filePath).Replace(Path.GetExtension(filePath), "")}_Convert{Path.GetExtension(filePath)}";
string outputFilePath = Path.Combine(saveDirectory, outputFileName);
var conversion = FFmpeg.Conversions.New()
.AddStream(mediaInfo.Streams)
.AddParameter("-ac 1")
.AddParameter("-ar 16000 -acodec pcm_s16le")
.AddParameter("-acodec pcm_s16le")
.SetOutput(outputFilePath);
await conversion.Start();
byte[] fileBytes = await File.ReadAllBytesAsync(outputFilePath);
return (Convert.ToBase64String(fileBytes), outputFilePath);
}
```
Usage example
```csharp
RealTimeApi realTimeApi; // injected via DI
/// <summary>
/// Ask a question (text or audio)
/// </summary>
/// <param name="formFile">Audio file</param>
/// <param name="question">Question text</param>
/// <param name="connectID">Connection ID; a new connection is created when empty</param>
/// <param name="prompt">System prompt</param>
/// <returns></returns>
public async Task<List<string>> SendAudioQuestionAsync(IFormFile formFile, string question, string connectID, string prompt)
{
if (string.IsNullOrEmpty(connectID))
connectID = await realTimeApi.CreatedConnect(prompt);
List<string> responses = new List<string>();
if (string.IsNullOrEmpty(question))
{
string filePath = await SaveToLocalAsync(formFile, @"UploadFile\Org");
(string base64, string convertFilePath) = await AudioProcessor.ConvertVideoAsync(filePath);
File.Delete(convertFilePath);
File.Delete(filePath);
if (base64 == null)
base64 = await RealTimeApi.ConvertToBase64Async(formFile);
string response = await realTimeApi.SendQuestion(connectID, base64, true);
responses.Add(response);
}
else
{
responses.Add(await realTimeApi.SendQuestion(connectID, question, false));
}
return responses;
}
// Save the uploaded file to local disk
public static async Task<string> SaveToLocalAsync(
IFormFile file,
string saveDirectory,
string? customFileName = null)
{
// Parameter validation
if (file == null || file.Length == 0)
throw new ArgumentException("File cannot be empty");
if (string.IsNullOrEmpty(saveDirectory))
throw new ArgumentException("Save directory cannot be empty");
// Create the directory if it does not exist
if (!Directory.Exists(saveDirectory))
Directory.CreateDirectory(saveDirectory);
Random random = new Random();
string fileName = customFileName ??
$"{Path.GetFileName(file.FileName).Replace(Path.GetExtension(file.FileName),"")}-{DateTime.Now.ToString("MMdd_HHmmss")}_{random.Next(100,999)}{Path.GetExtension(file.FileName)}";
string filePath = Path.Combine(saveDirectory, fileName);
using (var fileStream = new FileStream(filePath, FileMode.Create))
{
await file.CopyToAsync(fileStream);
}
return filePath;
}
```
After that, just call it from a controller.
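A minimal controller sketch is below. Hedged: the route and the { data: { connectId, responses } } response envelope are assumptions inferred from the Vue code in the next section (it posts to /api/RealTime/AudioTranslate and reads data.connectId and data.responses); the author's actual controller may differ.

```csharp
// Hedged sketch of the endpoint the frontend calls; route and response
// shape are inferred from the Vue code below, not the author's exact code.
using Microsoft.AspNetCore.Mvc;
using RealTime.Modules.Common.Api;

[ApiController]
[Route("api/[controller]")]
public class RealTimeController : ControllerBase
{
    private readonly RealTimeApi realTimeApi;
    public RealTimeController(RealTimeApi api) => realTimeApi = api;

    [HttpPost("AudioTranslate")]
    public async Task<IActionResult> AudioTranslate(
        [FromForm] IFormFile files,
        [FromForm] string? question,
        [FromForm] string? connectID,
        [FromForm] string? prompt)
    {
        // Create the session here so its ID can be returned to the client for reuse
        if (string.IsNullOrEmpty(connectID))
            connectID = await realTimeApi.CreatedConnect(prompt ?? "");
        // SendAudioQuestionAsync is the helper from the usage example above,
        // assumed here to live in (or be reachable from) the controller
        var responses = await SendAudioQuestionAsync(files, question ?? "", connectID, prompt ?? "");
        return Ok(new { data = new { connectId = connectID, responses } });
    }
}
```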
Frontend implementation
Vite + Vue
Install the recording package:
```bash
npm install recorder-js
```
index.vue
```javascript
<template>
<div>
<AudioRecorder />
</div>
</template>
<script>
import AudioRecorder from '../components/AudioRecorder.vue'
export default {
components: {
AudioRecorder
}
}
</script>
```
AudioRecorder.vue
```javascript
<template>
<div class="audio-recorder">
<div>
<p>Prompt:</p>
<textarea v-model="state.prompt" style="width: 306px; height: 213px;"></textarea>
</div>
<button
@click="toggleRecording"
:class="{ 'recording': state.isRecording }"
>
{{ state.isRecording ? 'Recording...' : 'Start recording' }}
</button>
<p v-if="state.recordingTime">Recorded: {{ state.formattedTime }}</p>
<div v-if="state.error" class="error-message">{{ state.error }}</div>
</div>
<div v-for="(item, index) in state.responseMessage" :key="index">
<p>{{ index + 1 }}: {{ item }}</p>
</div>
</div>
</template>
<script>
import { reactive, onBeforeUnmount } from 'vue'
import Recorder from 'recorder-js' // installed via npm
export default {
name: 'AudioRecorder',
setup() {
const state = reactive({
recorder: null,
audioContext: null,
mediaStream: null,
isRecording: false,
isProcessing: false,
error: null,
startTime: 0,
connectId: '',
responseMessage: [],
prompt: 'Your answer can only be a translation of what I said.',
})
// Initialize the audio devices
const initRecorder = async () => {
try {
// Clean up any previous instance
if (state.recorder) {
state.recorder.destroy()
state.audioContext.close()
}
// Create fresh instances
state.audioContext = new (window.AudioContext || window.webkitAudioContext)()
state.mediaStream = await navigator.mediaDevices.getUserMedia({ audio: true })
state.recorder = new Recorder(state.audioContext, {
numChannels: 1, // mono
})
await state.recorder.init(state.mediaStream)
} catch (err) {
handleError(err)
}
}
// Start recording
const startRecording = async () => {
try {
if (!state.recorder) {
await initRecorder()
}
await state.recorder.start()
state.isRecording = true
state.startTime = Date.now()
state.error = null
} catch (err) {
handleError(err)
}
}
// Stop recording (the critical part)
const stopRecording = async () => {
if (!state.isRecording) return
state.isRecording = false
try {
state.isProcessing = true
// Wait for the recording to stop and collect the data
const { blob, buffer } = await state.recorder.stop()
console.log('Got audio blob:', blob)
console.log('Audio buffer:', buffer)
// Download locally for testing
//Recorder.download(blob, 'recording')
// Upload
const formData = new FormData()
formData.append('files', blob, `recording_${Date.now()}.wav`)
formData.append('IsAudio', true)
formData.append('prompt', state.prompt)
formData.append('connectID', state.connectId) // reuse the session once we have a connection ID
await uploadChunk(formData)
} catch (err) {
handleError(err)
} finally {
// Resource cleanup
state.mediaStream?.getTracks().forEach(track => track.stop())
state.audioContext?.close()
state.recorder = null
state.isRecording = false
state.isProcessing = false
}
}
// Upload the recorded file
const uploadChunk = async (formData) => {
try {
const response = await fetch('http://localhost:12132/api/RealTime/AudioTranslate', {
method: 'POST',
body: formData
})
if (!response.ok) throw new Error(`Upload failed: ${response.status}`)
console.log(response);
var jsonRes = await response.json()
state.connectId = jsonRes.data.connectId;
const now = new Date();
var currentTime = now.toLocaleTimeString();
state.responseMessage.push(currentTime + ':' + jsonRes.data.responses[0]);
return jsonRes;
} catch (err) {
handleError(err)
}
}
// Error handling
const handleError = (error) => {
console.error('Recording error:', error)
state.error = error.message || 'Recording failed'
stopRecording()
}
// Toggle recording state
const toggleRecording = () => {
state.isRecording ? stopRecording() : startRecording()
}
// Clean up when the component unmounts
onBeforeUnmount(() => {
if (state.isRecording) stopRecording()
})
return {
state,
toggleRecording
}
}
}
</script>
<style scoped>
.audio-recorder {
max-width: 900px;
margin: 20px auto;
padding: 20px;
border: 1px solid #eee;
border-radius: 8px;
}
button {
padding: 10px 20px;
background: #42b983;
color: white;
border: none;
border-radius: 4px;
cursor: pointer;
transition: background 0.3s;
}
button:disabled {
background: #ccc;
cursor: not-allowed;
}
button.recording {
background: #ff4757;
animation: pulse 1s infinite;
}
@keyframes pulse {
0% { opacity: 1; }
50% { opacity: 0.5; }
100% { opacity: 1; }
}
.error-message {
color: #ff4757;
margin-top: 10px;
}
</style>
```
Results
Honestly, the model is still not very responsive: you have to speak word by word for it to recognize speech reliably.