微软Azure云目前也推出了OpenAI Realtime系列多模态大模型,由于之前使用过Azure云,因此就没有使用OpenAI而是用它提供的模型进行部署测试。
在多模态大模型出现前,我们都是使用语音转文字(STT)+大模型+文字转语音(TTS)的方式间接实现语音输入输出。它和Realtime模型相比主要存在下面几点问题:
- 延迟问题
语音转文字(STT)和大模型处理需要分步执行,每一步都会引入延迟。
- 错误累积风险
STT的识别错误会直接影响大模型的输入质量,错误可能被放大。
- 上下文连贯性
分步处理可能导致语音转文字后的文本丢失语调、停顿等非语言信息,影响大模型对上下文的理解。
- 实现复杂度
STT+大模型+TTS需要集成多个独立模块,调试和维护成本较高。
- 成本与资源占用
分步方案需部署多个模型,计算资源和API调用成本更高。
- 语音交互自然度
TTS生成的语音可能缺乏情感变化,与真人对话体验存在差距。
- 多语言支持
独立STT/TTS模块可能对不同语言的支持能力不均衡,需单独训练。
因此可以看出,Realtime这种多模态大模型确实可以优化和解决很多问题,使整体体验得到很大的提升。
Azure云目前提供了WebRTC和WebSocket两种API调用方式用于和模型进行语音通信。我以官方给的WebRTC方式demo为例,实现了一个语音助手,其中包括指令设置以及tools的function使用,演示了如何触发tools指定的function,以此实现RAG以及更多功能。
需要配置Azure云上部署的模型参数才能使用,在Azure云上部署完模型就能拿到,参数包括:
- SESSIONS_URL
- API_KEY
- DEPLOYMENT
WebRTC和服务器的事件交互是通过DataChannel实现的,其中比较关键的是session.update消息。我们的提示词是通过它的instructions参数设置给模型的,另外它的tools参数是用来配置相关function的。需要注意的是,tools目前还不支持MCP服务。
html
<!DOCTYPE html>
<html>
<head>
<meta charset="UTF-8">
<meta name="viewport" content="width=device-width, initial-scale=1.0">
<title>Azure OpenAI Realtime Session</title>
</head>
<body>
<h1>Azure OpenAI Realtime Session</h1>
<p>WARNING: Don't use this code sample in production with the API key hardcoded. Use a protected backend service to call the sessions API and generate the ephemeral key. Then return the ephemeral key to the client.</p>
<button onclick="StartSession()">点击开始聊天</button>
<!-- Log container for API messages -->
<div id="logContainer"></div>
<script>
// Make sure the WebRTC URL region matches the region of your Azure OpenAI resource.
// For example, if your Azure OpenAI resource is in the swedencentral region,
// the WebRTC URL should be https://swedencentral.realtimeapi-preview.ai.azure.com/v1/realtimertc.
// If your Azure OpenAI resource is in the eastus2 region, the WebRTC URL should be https://eastus2.realtimeapi-preview.ai.azure.com/v1/realtimertc.
const WEBRTC_URL= "https://eastus2.realtimeapi-preview.ai.azure.com/v1/realtimertc"
// The SESSIONS_URL includes the Azure OpenAI resource URL,
// deployment name, the /realtime/sessions path, and the API version.
// The Azure OpenAI resource region isn't part of the SESSIONS_URL.
const SESSIONS_URL="https://xxxxxxxxxx.openai.azure.com/openai/realtimeapi/sessions?api-version=2025-04-01-preview"
// The API key of the Azure OpenAI resource.
// SECURITY: never ship a real key in client-side code; mint ephemeral keys server-side.
const API_KEY = "xxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxx";
// The deployment name might not be the same as the model name.
const DEPLOYMENT = "xxxxxxxxxxxxxxx"
// Voice preset used for the model's spoken replies (sent in the sessions request body).
const VOICE = "sage"
/**
 * Creates a Realtime session via the sessions API, obtains a short-lived
 * ephemeral key, then hands it to init() to establish the WebRTC connection.
 * All failures (HTTP, missing key, connection setup) land in the catch below.
 */
async function StartSession() {
  try {
    // WARNING: Don't use this code sample in production
    // with the API key hardcoded.
    // Use a protected backend service to call the
    // sessions API and generate the ephemeral key.
    // Then return the ephemeral key to the client.
    const response = await fetch(SESSIONS_URL, {
      method: "POST",
      headers: {
        //"Authorization": `Bearer ${ACCESS_TOKEN}`,
        "api-key": API_KEY,
        "Content-Type": "application/json"
      },
      body: JSON.stringify({
        model: DEPLOYMENT,
        voice: VOICE
      })
    });
    if (!response.ok) {
      // Include the HTTP status so failures are diagnosable from the log.
      throw new Error(`API request failed: ${response.status} ${response.statusText}`);
    }
    const data = await response.json();
    const sessionId = data.id;
    const ephemeralKey = data.client_secret?.value;
    if (!ephemeralKey) {
      throw new Error("Sessions response did not contain client_secret.value");
    }
    // Do NOT log the ephemeral key itself: it is a live (if short-lived) credential.
    logMessage("Ephemeral Key Received: " + "***");
    logMessage("WebRTC Session Id = " + sessionId);
    // Await so that WebRTC setup errors are reported by this catch block
    // instead of becoming an unhandled promise rejection.
    await init(ephemeralKey);
  } catch (error) {
    console.error("Error fetching ephemeral key:", error);
    logMessage("Error fetching ephemeral key: " + error.message);
  }
}
// Demo implementation of the "createMeeting" tool: shows the arguments the
// model extracted (title, start time, duration) and reports success.
function createMeeting(args) {
  const { title, start_time, duration } = args;
  alert(`触发了我配置的工具函数 【createMeeting】!您预约的会议[${title}]在: ${start_time} 召开,持续时长:${duration}分钟`);
  return true;
}
/**
 * Establishes the WebRTC peer connection to the Realtime endpoint using the
 * ephemeral key, wires up remote audio playback, microphone capture, and the
 * "realtime-channel" DataChannel used for JSON event exchange.
 * @param {string} ephemeralKey - short-lived credential from the sessions API.
 */
async function init(ephemeralKey) {
  let peerConnection = new RTCPeerConnection();

  // Set up to play remote audio from the model.
  const audioElement = document.createElement('audio');
  audioElement.autoplay = true;
  document.body.appendChild(audioElement);
  peerConnection.ontrack = (event) => {
    audioElement.srcObject = event.streams[0];
  };

  // Capture the local microphone and send it to the model.
  const clientMedia = await navigator.mediaDevices.getUserMedia({ audio: true });
  const audioTrack = clientMedia.getAudioTracks()[0];
  peerConnection.addTrack(audioTrack);

  // Set up data channel for sending and receiving events.
  const dataChannel = peerConnection.createDataChannel('realtime-channel');
  dataChannel.addEventListener('open', () => {
    logMessage('Data channel is open');
    updateSession(dataChannel);
    responseCreate(dataChannel);
    //conversationCreate(dataChannel);
  });
  dataChannel.addEventListener('message', (event) => {
    const realtimeEvent = JSON.parse(event.data);
    console.log(realtimeEvent);
    logMessage("Received server event: " + JSON.stringify(realtimeEvent, null, 2));
    // NOTE(review): the server acknowledges a session.update with a
    // "session.updated" event; both spellings are accepted here so the
    // original check keeps working if the service ever emits it.
    if (realtimeEvent.type === "session.update" || realtimeEvent.type === "session.updated") {
      const instructions = realtimeEvent.session.instructions;
      logMessage("Instructions: " + instructions);
    } else if (realtimeEvent.type === "session.error") {
      logMessage("Error: " + realtimeEvent.error.message);
    } else if (realtimeEvent.type === "session.end") {
      logMessage("Session ended.");
    } else if (realtimeEvent.type === "response.output_item.done") {
      // A completed output item may be a function (tool) call requested by
      // the model; dispatch it to the matching local implementation.
      const item = realtimeEvent.item;
      if (item.type == "function_call") {
        try {
          const args = JSON.parse(item.arguments);
          if (item.name == "createMeeting") {
            let result = createMeeting(args);
            if (result) {
              // Tell the model the tool succeeded, then ask it to respond.
              conversationCreate(dataChannel, "到这边说明会议已经创建成功了!请进行相应回复");
              responseCreate(dataChannel);
            }
          }
        } catch(e) {
          logMessage("createMeeting error: " + e);
          return;
        }
      }
    }
  });
  dataChannel.addEventListener('close', () => {
    logMessage('Data channel is closed');
  });

  // Start the session using the Session Description Protocol (SDP).
  const offer = await peerConnection.createOffer();
  await peerConnection.setLocalDescription(offer);
  const sdpResponse = await fetch(`${WEBRTC_URL}?model=${DEPLOYMENT}`, {
    method: "POST",
    body: offer.sdp,
    headers: {
      Authorization: `Bearer ${ephemeralKey}`,
      "Content-Type": "application/sdp",
    },
  });
  if (!sdpResponse.ok) {
    // A non-2xx answer would otherwise surface as an opaque
    // setRemoteDescription failure; fail early with the status instead.
    throw new Error(`SDP exchange failed: ${sdpResponse.status}`);
  }
  const answer = { type: "answer", sdp: await sdpResponse.text() };
  await peerConnection.setRemoteDescription(answer);

  const button = document.createElement('button');
  button.innerText = 'Close Session';
  button.onclick = stopSession;
  document.body.appendChild(button);

  // Send a client event to update the session: instructions (system prompt),
  // server-side voice activity detection, and the tool (function) schemas.
  function updateSession(dataChannel) {
    const event = {
      type: "session.update",
      session: {
        instructions: "你是一名视频会议助理,请用亲切的语气为客户提供服务。你的开场招呼语是:我是您的会议助理小爱!请问有什么能够帮助您的么?\n##注意:\n#你目前只能提供创建会议的服务,并且需要客户提供开始时间、持续时长、会议名称。\n#在正式创建会议前,请再确认一下客户提供的信息是否修改,不需要则开始执行createMeeting函数。",
        "turn_detection": {
          "type": "server_vad",
          "threshold": 0.5,
          "prefix_padding_ms": 300,
          "silence_duration_ms": 200,
          "create_response": true
        },
        tools: [
          /*{
            "type": "mcp",
            "server_label": "dmcp",
            "server_url": "https://dmcp-server.deno.dev/sse",
            "require_approval": "never"
          },*/
          {
            "type": "function",
            "name": "queryKnowledgeBase",
            "description": "查询内部知识库,获取与用户问题相关的信息。",
            "parameters": {
              "type": "object",
              "properties": {
                "question": {
                  "type": "string",
                  "description": "用户的问题"
                }
              },
              "required": ["question"]
            }
          },
          {
            "type": "function",
            "name": "createMeeting",
            "description": "创建一场指定开始时间、会议时长以及名称的会议,用于后续开视频会议。",
            "parameters": {
              "type": "object",
              "properties": {
                "start_time": {
                  "type": "string",
                  "description": "会议的开始时间,格式为:(年-月-日 小时:分钟:00) 例:2025-09-09 15:33:00"
                },
                "duration": {
                  "type": "integer",
                  "description": "会议的时长,格式为:数字(分钟) 例:120"
                },
                "title": {
                  "type": "string",
                  "description": "会议名称,默认名称:我的会议"
                },
              },
              "required": ["start_time", "duration", "title"]
            }
          }
        ],
        tool_choice: "auto"
      }
    };
    dataChannel.send(JSON.stringify(event));
    logMessage("Sent client event: " + JSON.stringify(event, null, 2));
  }

  // Inject a user-role text message into the conversation (used to report
  // tool results back to the model).
  function conversationCreate(dataChannel, str) {
    const event = {
      type: "conversation.item.create",
      item: {
        type: "message",
        role: "user",
        content: [{ type: "input_text", text: str}],
      }
    };
    dataChannel.send(JSON.stringify(event));
    logMessage("Sent client event: " + JSON.stringify(event, null, 2));
  }

  // Ask the model to generate a response from the current conversation state.
  function responseCreate(dataChannel) {
    const event = {
      type: "response.create"
    };
    dataChannel.send(JSON.stringify(event));
    logMessage("Sent client event: " + JSON.stringify(event, null, 2));
  }

  // Tear down the session: release the microphone (so the capture indicator
  // turns off), close the data channel and the peer connection.
  function stopSession() {
    if (audioTrack) audioTrack.stop();
    if (dataChannel) dataChannel.close();
    if (peerConnection) peerConnection.close();
    peerConnection = null;
    logMessage("Session closed.");
  }
}
// Appends one paragraph per message to the log container and scrolls the
// page so the newest entry stays in view.
function logMessage(message) {
  const entry = document.createElement("p");
  entry.textContent = message;
  document.getElementById("logContainer").appendChild(entry);
  window.scrollTo({ top: document.body.scrollHeight, behavior: 'smooth' });
}
</script>
</body>
</html>