微软Azure云目前也推出了OpenAI Realtime系列多模态大模型,由于之前使用过Azure云,因此就没有使用OpenAI而是用它提供的模型进行部署测试。
在多模态大模型出现前,我们都是使用语音转文字(STT)+大模型+文字转语音(TTS)的方式间接实现语音输入输出。它和Realtime模型相比主要存在下面几点问题:
- 延迟问题
语音转文字(STT)和大模型处理需要分步执行,每一步都会引入延迟。
- 错误累积风险
STT的识别错误会直接影响大模型的输入质量,错误可能被放大。
- 上下文连贯性
分步处理可能导致语音转文字后的文本丢失语调、停顿等非语言信息,影响大模型对上下文的理解。
- 实现复杂度
STT+大模型+TTS需要集成多个独立模块,调试和维护成本较高。
- 成本与资源占用
分步方案需部署多个模型,计算资源和API调用成本更高。
- 语音交互自然度
TTS生成的语音可能缺乏情感变化,与真人对话体验存在差距。
- 多语言支持
独立STT/TTS模块可能对不同语言的支持能力不均衡,需单独训练。
因此可以看出,Realtime这种多模态大模型确实可以优化和解决很多问题,使整体体验得到很大的提升。
Azure云目前提供了WebRTC和WebSocket两种API调用方式用于和模型进行语音通信。我以官方给的WebRTC方式demo为例,实现了一个语音助手,其中包括指令设置以及tools的function使用,演示了如何触发tools指定的function,以此实现RAG以及更多功能。
需要配置Azure云上部署的模型参数才能使用,在Azure云上部署完模型就能拿到,参数包括:
- SESSIONS_URL
- API_KEY
- DEPLOYMENT
WebRTC和服务器的事件交互是通过DataChannel实现的,其中比较关键的是session.update消息。我们的提示词是通过它的instructions参数设置给模型的,另外它的tools参数是用来配置相关function的。需要注意的是,tools目前还不支持MCP服务。
html
<!DOCTYPE html>
<html>
<head>
<meta charset="UTF-8">
<meta name="viewport" content="width=device-width, initial-scale=1.0">
<title>Azure OpenAI Realtime Session</title>
</head>
<body>
<h1>Azure OpenAI Realtime Session</h1>
<p>WARNING: Don't use this code sample in production with the API key hardcoded. Use a protected backend service to call the sessions API and generate the ephemeral key. Then return the ephemeral key to the client.</p>
<button onclick="StartSession()">点击开始聊天</button>
<!-- Log container for API messages -->
<div id="logContainer"></div>
<script>
// Make sure the WebRTC URL region matches the region of your Azure OpenAI resource.
// For example, if your Azure OpenAI resource is in the swedencentral region,
// the WebRTC URL should be https://swedencentral.realtimeapi-preview.ai.azure.com/v1/realtimertc.
// If your Azure OpenAI resource is in the eastus2 region, the WebRTC URL should be https://eastus2.realtimeapi-preview.ai.azure.com/v1/realtimertc.
const WEBRTC_URL= "https://eastus2.realtimeapi-preview.ai.azure.com/v1/realtimertc"
// The SESSIONS_URL includes the Azure OpenAI resource URL,
// deployment name, the /realtime/sessions path, and the API version.
// The Azure OpenAI resource region isn't part of the SESSIONS_URL.
const SESSIONS_URL="https://xxxxxxxxxx.openai.azure.com/openai/realtimeapi/sessions?api-version=2025-04-01-preview"
// The API key of the Azure OpenAI resource.
// SECURITY: never ship a real key in client-side code; mint ephemeral keys server-side.
const API_KEY = "xxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxx";
// The deployment name might not be the same as the model name.
const DEPLOYMENT = "xxxxxxxxxxxxxxx"
// Voice preset used for the model's spoken replies (sent in the sessions request body).
const VOICE = "sage"
/**
 * Creates a Realtime session via the sessions API, obtains a short-lived
 * ephemeral key, then hands it to init() to establish the WebRTC connection.
 * All failures (HTTP, missing key, connection setup) land in the catch below.
 */
async function StartSession() {
  try {
    // WARNING: Don't use this code sample in production
    // with the API key hardcoded.
    // Use a protected backend service to call the
    // sessions API and generate the ephemeral key.
    // Then return the ephemeral key to the client.
    const response = await fetch(SESSIONS_URL, {
      method: "POST",
      headers: {
        //"Authorization": `Bearer ${ACCESS_TOKEN}`,
        "api-key": API_KEY,
        "Content-Type": "application/json"
      },
      body: JSON.stringify({
        model: DEPLOYMENT,
        voice: VOICE
      })
    });
    if (!response.ok) {
      // Include the HTTP status so failures are diagnosable from the log.
      throw new Error(`API request failed: ${response.status} ${response.statusText}`);
    }
    const data = await response.json();
    const sessionId = data.id;
    const ephemeralKey = data.client_secret?.value;
    if (!ephemeralKey) {
      throw new Error("Sessions response did not contain client_secret.value");
    }
    // Do NOT log the ephemeral key itself: it is a live (if short-lived) credential.
    logMessage("Ephemeral Key Received: " + "***");
    logMessage("WebRTC Session Id = " + sessionId);
    // Await so that WebRTC setup errors are reported by this catch block
    // instead of becoming an unhandled promise rejection.
    await init(ephemeralKey);
  } catch (error) {
    console.error("Error fetching ephemeral key:", error);
    logMessage("Error fetching ephemeral key: " + error.message);
  }
}
// Demo implementation of the "createMeeting" tool: shows the arguments the
// model extracted (title, start time, duration) and reports success.
function createMeeting(args) {
  const { title, start_time, duration } = args;
  alert(`触发了我配置的工具函数 【createMeeting】!您预约的会议[${title}]在: ${start_time} 召开,持续时长:${duration}分钟`);
  return true;
}
/**
 * Establishes the WebRTC peer connection to the Realtime endpoint using the
 * ephemeral key, wires up remote audio playback, microphone capture, and the
 * "realtime-channel" DataChannel used for JSON event exchange.
 * @param {string} ephemeralKey - short-lived credential from the sessions API.
 */
async function init(ephemeralKey) {
  let peerConnection = new RTCPeerConnection();

  // Set up to play remote audio from the model.
  const audioElement = document.createElement('audio');
  audioElement.autoplay = true;
  document.body.appendChild(audioElement);
  peerConnection.ontrack = (event) => {
    audioElement.srcObject = event.streams[0];
  };

  // Capture the local microphone and send it to the model.
  const clientMedia = await navigator.mediaDevices.getUserMedia({ audio: true });
  const audioTrack = clientMedia.getAudioTracks()[0];
  peerConnection.addTrack(audioTrack);

  // Set up data channel for sending and receiving events.
  const dataChannel = peerConnection.createDataChannel('realtime-channel');
  dataChannel.addEventListener('open', () => {
    logMessage('Data channel is open');
    updateSession(dataChannel);
    responseCreate(dataChannel);
    //conversationCreate(dataChannel);
  });
  dataChannel.addEventListener('message', (event) => {
    const realtimeEvent = JSON.parse(event.data);
    console.log(realtimeEvent);
    logMessage("Received server event: " + JSON.stringify(realtimeEvent, null, 2));
    // NOTE(review): the server acknowledges a session.update with a
    // "session.updated" event; both spellings are accepted here so the
    // original check keeps working if the service ever emits it.
    if (realtimeEvent.type === "session.update" || realtimeEvent.type === "session.updated") {
      const instructions = realtimeEvent.session.instructions;
      logMessage("Instructions: " + instructions);
    } else if (realtimeEvent.type === "session.error") {
      logMessage("Error: " + realtimeEvent.error.message);
    } else if (realtimeEvent.type === "session.end") {
      logMessage("Session ended.");
    } else if (realtimeEvent.type === "response.output_item.done") {
      // A completed output item may be a function (tool) call requested by
      // the model; dispatch it to the matching local implementation.
      const item = realtimeEvent.item;
      if (item.type == "function_call") {
        try {
          const args = JSON.parse(item.arguments);
          if (item.name == "createMeeting") {
            let result = createMeeting(args);
            if (result) {
              // Tell the model the tool succeeded, then ask it to respond.
              conversationCreate(dataChannel, "到这边说明会议已经创建成功了!请进行相应回复");
              responseCreate(dataChannel);
            }
          }
        } catch(e) {
          logMessage("createMeeting error: " + e);
          return;
        }
      }
    }
  });
  dataChannel.addEventListener('close', () => {
    logMessage('Data channel is closed');
  });

  // Start the session using the Session Description Protocol (SDP).
  const offer = await peerConnection.createOffer();
  await peerConnection.setLocalDescription(offer);
  const sdpResponse = await fetch(`${WEBRTC_URL}?model=${DEPLOYMENT}`, {
    method: "POST",
    body: offer.sdp,
    headers: {
      Authorization: `Bearer ${ephemeralKey}`,
      "Content-Type": "application/sdp",
    },
  });
  if (!sdpResponse.ok) {
    // A non-2xx answer would otherwise surface as an opaque
    // setRemoteDescription failure; fail early with the status instead.
    throw new Error(`SDP exchange failed: ${sdpResponse.status}`);
  }
  const answer = { type: "answer", sdp: await sdpResponse.text() };
  await peerConnection.setRemoteDescription(answer);

  const button = document.createElement('button');
  button.innerText = 'Close Session';
  button.onclick = stopSession;
  document.body.appendChild(button);

  // Send a client event to update the session: instructions (system prompt),
  // server-side voice activity detection, and the tool (function) schemas.
  function updateSession(dataChannel) {
    const event = {
      type: "session.update",
      session: {
        instructions: "你是一名视频会议助理,请用亲切的语气为客户提供服务。你的开场招呼语是:我是您的会议助理小爱!请问有什么能够帮助您的么?\n##注意:\n#你目前只能提供创建会议的服务,并且需要客户提供开始时间、持续时长、会议名称。\n#在正式创建会议前,请再确认一下客户提供的信息是否修改,不需要则开始执行createMeeting函数。",
        "turn_detection": {
          "type": "server_vad",
          "threshold": 0.5,
          "prefix_padding_ms": 300,
          "silence_duration_ms": 200,
          "create_response": true
        },
        tools: [
          /*{
            "type": "mcp",
            "server_label": "dmcp",
            "server_url": "https://dmcp-server.deno.dev/sse",
            "require_approval": "never"
          },*/
          {
            "type": "function",
            "name": "queryKnowledgeBase",
            "description": "查询内部知识库,获取与用户问题相关的信息。",
            "parameters": {
              "type": "object",
              "properties": {
                "question": {
                  "type": "string",
                  "description": "用户的问题"
                }
              },
              "required": ["question"]
            }
          },
          {
            "type": "function",
            "name": "createMeeting",
            "description": "创建一场指定开始时间、会议时长以及名称的会议,用于后续开视频会议。",
            "parameters": {
              "type": "object",
              "properties": {
                "start_time": {
                  "type": "string",
                  "description": "会议的开始时间,格式为:(年-月-日 小时:分钟:00) 例:2025-09-09 15:33:00"
                },
                "duration": {
                  "type": "integer",
                  "description": "会议的时长,格式为:数字(分钟) 例:120"
                },
                "title": {
                  "type": "string",
                  "description": "会议名称,默认名称:我的会议"
                },
              },
              "required": ["start_time", "duration", "title"]
            }
          }
        ],
        tool_choice: "auto"
      }
    };
    dataChannel.send(JSON.stringify(event));
    logMessage("Sent client event: " + JSON.stringify(event, null, 2));
  }

  // Inject a user-role text message into the conversation (used to report
  // tool results back to the model).
  function conversationCreate(dataChannel, str) {
    const event = {
      type: "conversation.item.create",
      item: {
        type: "message",
        role: "user",
        content: [{ type: "input_text", text: str}],
      }
    };
    dataChannel.send(JSON.stringify(event));
    logMessage("Sent client event: " + JSON.stringify(event, null, 2));
  }

  // Ask the model to generate a response from the current conversation state.
  function responseCreate(dataChannel) {
    const event = {
      type: "response.create"
    };
    dataChannel.send(JSON.stringify(event));
    logMessage("Sent client event: " + JSON.stringify(event, null, 2));
  }

  // Tear down the session: release the microphone (so the capture indicator
  // turns off), close the data channel and the peer connection.
  function stopSession() {
    if (audioTrack) audioTrack.stop();
    if (dataChannel) dataChannel.close();
    if (peerConnection) peerConnection.close();
    peerConnection = null;
    logMessage("Session closed.");
  }
}
// Appends one paragraph per message to the log container and scrolls the
// page so the newest entry stays in view.
function logMessage(message) {
  const entry = document.createElement("p");
  entry.textContent = message;
  document.getElementById("logContainer").appendChild(entry);
  window.scrollTo({ top: document.body.scrollHeight, behavior: 'smooth' });
}
</script>
</body>
</html>