Recently the company needed to build an AI chat website with a voice-input feature, hooked up to Baidu's real-time speech recognition service. This post is a record of how I implemented it.
Straight to the code.
Since the project uses React, hooks were the first thing that came to mind: a custom useBaiduAsr hook that exposes `text`, `start`, `send`, and `stop`. `text` is the recognized transcript; the rest are methods you can call as your scenario requires.
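At a glance, the hook's surface looks like this (`API_KEY` and `APP_ID` are placeholders for your Baidu credentials):

```jsx
// text:  the latest recognized transcript (a string)
// start: open the WebSocket and send the START frame
// send:  push a chunk of PCM audio data over the connection
// stop:  send the FINISH frame and close the connection
const { text, start, send, stop } = useBaiduAsr(API_KEY, APP_ID)
```

The full implementation: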
```jsx
import { useCallback, useEffect, useRef, useState } from 'react'
export const useBaiduAsr = (apiKey, appId) => {
const [text, setText] = useState('')
const [resultObjs, setResultObjs] = useState([])
const wsRef = useRef(null)
// Start speech recognition
const startStreamingRecognition = useCallback(async () => {
try {
const uuid = crypto.randomUUID()
const wsUrl = `wss://vop.baidu.com/realtime_asr?sn=${uuid}`
wsRef.current = new WebSocket(wsUrl)
wsRef.current.onmessage = (event) => {
try {
const data = JSON.parse(event.data)
if (data.type === "MID_TEXT" || data.type === "FIN_TEXT") {
const { result, start_time } = data
setResultObjs(pre => [...pre, { result, start_time }])
}
} catch (error) {
console.error('Error while handling message:', error)
}
}
return new Promise((resolve, reject) => {
wsRef.current.onopen = () => {
console.log('WebSocket connection established')
const startParams = {
type: 'START',
data: {
appid: appId,
appkey: apiKey,
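// dev_pid selects the recognition model; 1537 is Baidu's Mandarin general model (16 kHz)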
dev_pid: 1537,
format: "pcm",
sample: 16000,
cuid: 'react-app'
}
}
wsRef.current.send(JSON.stringify(startParams))
resolve(wsRef.current)
}
wsRef.current.onerror = (error) => {
console.error('WebSocket error:', error)
reject(error)
}
wsRef.current.onclose = () => {
console.log('WebSocket connection closed')
}
})
} catch (error) {
console.error('Failed to create WebSocket connection:', error)
throw error
}
}, [apiKey, appId])
// Send audio data
const sendAudioData = (audioData) => {
if (!wsRef.current) {
console.error('WebSocket is not connected')
return
}
if (wsRef.current.readyState === WebSocket.OPEN) {
wsRef.current.send(audioData)
} else {
// console.error('WebSocket not ready, current state:', wsRef.current.readyState)
}
}
// Stop speech recognition
const stopStreamingRecognition = useCallback(() => {
if (!wsRef.current) {
return
}
try {
const finishParams = {
type: 'FINISH'
}
wsRef.current.send(JSON.stringify(finishParams))
wsRef.current.close()
setText('')
setResultObjs([])
} catch (error) {
console.error('Error while closing WebSocket:', error)
}
}, [])
useEffect(() => {
const result = []
let tempGroup = []
resultObjs.forEach((item, index) => {
if (index === 0 || item.start_time === resultObjs[index - 1].start_time) {
tempGroup.push(item.result)
} else {
result.push(tempGroup)
tempGroup = [item.result]
}
})
if (tempGroup.length) {
result.push(tempGroup)
}
if (result.length) {
const text = result.map(item => item[item.length - 1]).join('')
setText(text)
}
}, [resultObjs])
return {
text,
start: startStreamingRecognition,
send: sendAudioData,
stop: stopStreamingRecognition,
}
}
```
A quick walkthrough of the code: Baidu's real-time recognition runs over a WebSocket connection. After the connection opens, you have to send a START frame yourself to kick off the task; the server then streams back recognition result objects. The content can repeat, because Baidu's strategy is to keep accumulating text as recognition progresses, so how do we consume the generated text?
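For reference, the frames this handler cares about look roughly like this (field names are taken from the code above; the real payload carries additional fields, so treat this as illustrative only):

```js
// Illustrative only: the same speech segment keeps the same start_time
// while its result text is refined frame by frame
const exampleFrames = [
  { type: 'MID_TEXT', result: '你', start_time: 1200 },
  { type: 'MID_TEXT', result: '你好', start_time: 1200 },
  { type: 'FIN_TEXT', result: '你好啊!', start_time: 1200 },
]
```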
My approach is to collect the returned objects in the resultObjs array. When the speech is recognized as one continuous segment, all of its results share the same start_time, so results with the same start_time can be grouped into a two-dimensional array: each inner array represents one segment, and each element inside it is an intermediate result produced during recognition, so we only need to take the last one. A nice side effect is that the final result effectively corrects the intermediate, real-time ones.
```js
[
['你', '你好', '你好啊', '你好啊!' ],
['我', '我可', '我可以', '我可以和你', '我可以和你一起吗?']
]
```
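The useEffect in the hook builds exactly this structure from resultObjs. As a standalone sketch of the same idea (flattenResults is a hypothetical helper, not part of the hook):

```js
// Group results that share a start_time, keep only the last (most refined)
// result of each group, then join the groups into the final transcript
const flattenResults = (resultObjs) => {
  const groups = []
  resultObjs.forEach((item, index) => {
    const prev = resultObjs[index - 1]
    if (index === 0 || item.start_time === prev.start_time) {
      // still the same segment: keep the refined result
      if (!groups.length) groups.push([])
      groups[groups.length - 1].push(item.result)
    } else {
      // a new segment starts
      groups.push([item.result])
    }
  })
  return groups.map((group) => group[group.length - 1]).join('')
}
```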
As the sketch shows, taking the last element of each group and joining them gives the final text. Now let's look at how a component consumes this hook.
```jsx
import { Button } from 'antd'
import React, { useEffect, useRef, useState } from 'react'
import { useBaiduAsr } from '@/hooks/useBaiduAsr'
import IconFont from '../IconFont'
import css from './index.module.scss'
const VoiceRecognition = ({ onStop, onTextChange }) => {
const [isRecording, setIsRecording] = useState(false)
const mediaStreamRef = useRef(null)
const audioProcessorRef = useRef(null)
const { text, start, send, stop } = useBaiduAsr(YOUR_API_KEY, YOUR_APP_ID)
const startRecording = async () => {
try {
const stream = await navigator.mediaDevices.getUserMedia({ audio: true })
const audioContext = new AudioContext({ sampleRate: 16000 })
const source = audioContext.createMediaStreamSource(stream)
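// NOTE: createScriptProcessor is deprecated in favor of AudioWorklet, but it is still widely supported and keeps this example simple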
const processor = audioContext.createScriptProcessor(4096, 1, 1)
source.connect(processor)
processor.connect(audioContext.destination)
// Initialize Baidu streaming recognition
await start()
processor.onaudioprocess = (e) => {
try {
const audioData = e.inputBuffer.getChannelData(0)
const pcmData = float32ToPCM(audioData)
send(pcmData)
} catch (e) {
console.log('send error', e)
stopRecording()
}
}
mediaStreamRef.current = stream
audioProcessorRef.current = processor
setIsRecording(true)
} catch (err) {
console.log(err)
}
}
const stopRecording = async () => {
onStop?.()
stop()
if (audioProcessorRef.current) {
audioProcessorRef.current.disconnect()
audioProcessorRef.current = null
}
if (mediaStreamRef.current) {
mediaStreamRef.current.getTracks().forEach((track) => track.stop())
mediaStreamRef.current = null
}
setIsRecording(false)
}
// Convert Float32 audio samples to Int16 PCM
const float32ToPCM = (float32Array) => {
const pcm16Array = new Int16Array(float32Array.length)
for (let i = 0; i < float32Array.length; i++) {
pcm16Array[i] = Math.max(-32768, Math.min(32767, float32Array[i] * 32768))
}
return pcm16Array.buffer
}
useEffect(() => {
return () => {
stopRecording()
}
}, [])
useEffect(() => {
onTextChange(text)
}, [text, onTextChange])
return (
<>
{isRecording ? (
<Button type="text" size="middle" className={css['voice-btn']} onClick={stopRecording}>
<IconFont className={css['search-voice-listen']} type="icon-mic-off" />
</Button>
) : (
<Button type="text" size="middle" className={css['voice-btn']} onClick={startRecording}>
<IconFont className={css['search-voice']} type="icon-mic-on" />
</Button>
)}
</>
)
}
export default VoiceRecognition
```
The key points here are the audio chunking, the Web Audio API calls, and the Float32-to-Int16 format conversion; I won't go through them line by line, the code speaks for itself. When consuming this component, note that the onStop callback you pass in is responsible for committing the recognized text into the input's value (in the example below, prompt accumulates the committed text while voiceText mirrors the live transcript).
```jsx
import React, { useState } from 'react'
import { Input } from 'antd'
// Adjust the import path to wherever VoiceRecognition lives in your project
import VoiceRecognition from '@/components/VoiceRecognition'

export default function Chat() {
const [prompt, setPrompt] = useState('')
const [voiceText, setVoiceText] = useState('')
const handleVoiceStop = () => {
setPrompt(pre => pre + voiceText)
setVoiceText('')
}
const handleText = (text) => {
setVoiceText(text)
}
return (
<Input
value={prompt + voiceText}
suffix={<VoiceRecognition onStop={handleVoiceStop} onTextChange={handleText} />}
/>
)
}
```
That's it for now. There are still requirements for WebSocket reconnection and resuming an interrupted speech session, so I'll update this post later.