This article walks through building a real-time speech transcription component on top of the native MediaRecorder API, step by step: capturing the audio stream, visualizing the waveform (with wavesurfer.js), processing the data in real time, and transcribing the speech. The complete implementation is at the end of the article.
Audio stream capture
Capturing the audio stream mainly involves two APIs: MediaDevices.getUserMedia() and the MediaRecorder API. The code snippet is as follows:
ts
// Ask for permission to use the microphone: https://developer.mozilla.org/zh-CN/docs/Web/API/MediaDevices/getUserMedia
const stream = await navigator.mediaDevices.getUserMedia({ audio: true })
// Create a MediaRecorder instance: https://developer.mozilla.org/zh-CN/docs/Web/API/MediaRecorder
mediaRecorder = new MediaRecorder(stream)
// Start recording; the timeslice (ms) controls how often dataavailable fires
mediaRecorder.start(RECORDING_TIME_SLICE)
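By default the browser picks the recording container and codec (commonly audio/webm with Opus in Chromium-based browsers), and whatever it picks is what ends up in the recorded chunks; wrapping them in a Blob labeled audio/wav later does not convert the data, so the recognition backend has to accept the actual recorded format. If you want more control over the format, you can probe support before constructing the recorder. A minimal sketch, assuming you prefer one of a few common formats (pickSupportedMimeType is an illustrative helper, not part of the component below):
ts
// Illustrative helper: return the first MIME type the browser can actually record
function pickSupportedMimeType(): string | undefined {
  const candidates = ['audio/webm;codecs=opus', 'audio/webm', 'audio/ogg;codecs=opus', 'audio/mp4']
  return candidates.find((type) => MediaRecorder.isTypeSupported(type))
}

const mimeType = pickSupportedMimeType()
// Only pass mimeType when one is supported; otherwise let the browser decide
mediaRecorder = new MediaRecorder(stream, mimeType ? { mimeType } : undefined)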
Real-time data processing
Real-time data processing mainly relies on MediaRecorder.ondataavailable. This event handler lets you capture the audio stream as it is being recorded, and it is the key API for real-time transcription.
ts
let audioChunks: Blob[] = []
// Listen for the dataavailable event and store the recorded chunks in audioChunks
mediaRecorder.ondataavailable = (event) => {
if (event.data.size > 0) {
audioChunks.push(event.data)
}
}
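When the recorder stops, the chunks collected above can be merged into a single Blob for playback or a final recognition pass. A minimal sketch, assuming you want the finished recording as one object:
ts
mediaRecorder.onstop = () => {
  // Merge all timeslice chunks into one Blob, keeping the recorder's actual MIME type
  const fullRecording = new Blob(audioChunks, { type: mediaRecorder!.mimeType })
  // For example, make it playable via an object URL
  const url = URL.createObjectURL(fullRecording)
  console.log('Recording finished:', url)
}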
Waveform visualization
The waveform visualization uses wavesurfer.js, which can be installed with npm i wavesurfer.js. The implementation is as follows:
ts
import WaveSurfer from "wavesurfer.js"
// Initialize wavesurfer (see the wavesurfer.js docs for the full list of options)
wavesurfer = WaveSurfer.create({
container: waveform.value, // the DOM container the waveform renders into
waveColor: '#006EFF', // waveform color
progressColor: 'transparent',
barWidth: 5,
height: 60,
barRadius: 3
})
// Update the waveform in real time
const audioBlob = new Blob(audioChunks, { type: mediaRecorder!.mimeType })
// Draw the waveform for the audio captured so far
wavesurfer?.loadBlob(audioBlob)
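Since loadBlob re-decodes the entire recording, calling it on every dataavailable tick gets more expensive as the recording grows. A simple mitigation is to throttle the redraws; a minimal sketch, assuming a 500 ms minimum interval (lastWaveformUpdate and updateWaveform are illustrative names, not part of the component):
ts
let lastWaveformUpdate = 0
const WAVEFORM_UPDATE_INTERVAL = 500 // minimum ms between redraws (illustrative value)

function updateWaveform() {
  const now = Date.now()
  if (now - lastWaveformUpdate < WAVEFORM_UPDATE_INTERVAL) return
  lastWaveformUpdate = now
  const audioBlob = new Blob(audioChunks, { type: mediaRecorder!.mimeType })
  wavesurfer?.loadBlob(audioBlob)
}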
Speech recognition and transcription
Once the audio chunks are being captured, they can be wrapped in a Blob and passed to a speech recognition API to transcribe speech in near real time. The implementation here polls an HTTP endpoint; in practice a WebSocket-based streaming approach usually works better (a sketch is included after the polling timer below).
ts
/**
* Handle speech recognition
* @param pollVoiceRecognition whether to run recognition in polling mode, defaults to false
*/
async function handleTranscribe(pollVoiceRecognition: boolean = false) {
let currentRecordingText = ''
try {
if (audioChunks.length === 0) return
const audioBlob = new Blob(audioChunks, { type: 'audio/wav' })
// Call the speech recognition API
const response = await AIServer.transcribeAudio(audioBlob)
const result = response.data?.text || ''
if (result) {
emit('transcribeResult', result)
currentRecordingText = result
}
} catch (error: any) {
console.error('Speech recognition failed:', error)
} finally {
if (!pollVoiceRecognition) {
recordingText.value = currentRecordingText
} else {
if (currentRecordingText && currentRecordingText === recordingText.value) {
// Same result as last time: start the silence timer; when it fires, recording ends automatically
transcribeTimer = window.setTimeout(() => {
finishRecording()
}, SILENCE_DURATION)
} else {
// Different result: update the current transcript and keep listening
recordingText.value = currentRecordingText || recordingPlaceholder.value
startTranscribeTimer()
}
}
}
}
Implementation of the polling method
ts
/** Start the real-time transcription timer */
function startTranscribeTimer() {
if (transcribeTimer) {
clearTimeout(transcribeTimer)
}
if (!isRecording.value) return
transcribeTimer = window.setTimeout(() => {
if (isRecording.value) {
handleTranscribe(true)
}
}, TRANSCRIBE_INTERVAL)
}
Complete code
The implementation is based on Vue 3 and is provided for reference only.
html
<template>
<div class="voice-input">
<div class="voice-input__container">
<div class="wave-form" style="width: 100%" ref="waveform"></div>
<img class="voice-input__bg" src="../../../assets/img/ai-assistant/矩形AS.png" alt="" />
<img class="voice-input__mic" :class="{ 'voice-input__mic--recording': isRecording }"
src="../../../assets/img/ai/麦克风.png" alt="麦克风" @click="toggleRecording" />
</div>
</div>
</template>
<script setup lang="ts">
import { AIServer } from "@/API"
import { ref, onBeforeUnmount, onMounted } from 'vue'
import WaveSurfer from "wavesurfer.js"
const emit = defineEmits(['switchToText', 'transcribeResult', 'recordingStatus'])
const isRecording = ref(false)
const recordingText = ref('Click to start recording')
const recordingPlaceholder = ref("I'm listening~")
const lastTranscribeTime = ref(0)
const waveform = ref<HTMLDivElement>()
let wavesurfer: WaveSurfer | null = null;
let mediaRecorder: MediaRecorder | null = null
let audioChunks: Blob[] = []
let transcribeTimer: number | null = null
const TRANSCRIBE_INTERVAL = 800 // interval (ms) between transcription polls
const SILENCE_DURATION = 2000 // silence window (ms): no new speech within this period means transcription is complete
const RECORDING_TIME_SLICE = 200 // timeslice (ms) passed to MediaRecorder.start(); controls how often dataavailable fires
/** Toggle the recording state */
function toggleRecording() {
if (isRecording.value) {
stopRecording()
} else {
startRecording()
}
}
/** Start recording */
async function startRecording() {
try {
// Ask for permission to use the microphone: https://developer.mozilla.org/zh-CN/docs/Web/API/MediaDevices/getUserMedia
const stream = await navigator.mediaDevices.getUserMedia({ audio: true })
// Create a MediaRecorder instance: https://developer.mozilla.org/zh-CN/docs/Web/API/MediaRecorder
mediaRecorder = new MediaRecorder(stream)
audioChunks = []
// Listen for the dataavailable event and store the recorded chunks in audioChunks
mediaRecorder.ondataavailable = (event) => {
if (event.data.size > 0) {
audioChunks.push(event.data)
// Update the waveform in real time
const audioBlob = new Blob(audioChunks, { type: mediaRecorder!.mimeType })
// Draw the waveform for the audio captured so far
wavesurfer?.loadBlob(audioBlob)
}
}
// Start recording; the timeslice (ms) controls how often dataavailable fires
mediaRecorder.start(RECORDING_TIME_SLICE)
isRecording.value = true
recordingText.value = recordingPlaceholder.value
lastTranscribeTime.value = Date.now()
emit('recordingStatus', 'recording')
// Start the real-time transcription timer
startTranscribeTimer()
} catch (error) {
console.error('Failed to start recording:', error)
recordingText.value = 'Failed to start recording, please check microphone permissions'
emit('recordingStatus', 'error')
}
}
/** Stop recording */
function stopRecording() {
if (!isRecording.value) return
if (mediaRecorder && mediaRecorder.state !== 'inactive') {
mediaRecorder.stop()
mediaRecorder.stream.getTracks().forEach(track => track.stop())
}
if (transcribeTimer) {
clearTimeout(transcribeTimer)
transcribeTimer = null
}
isRecording.value = false
recordingText.value = 'Click to start recording'
wavesurfer?.empty()
}
/** Finish recording */
function finishRecording() {
stopRecording()
emit('recordingStatus', 'finished')
}
/** Start the real-time transcription timer */
function startTranscribeTimer() {
if (transcribeTimer) {
clearTimeout(transcribeTimer)
}
if (!isRecording.value) return
transcribeTimer = window.setTimeout(() => {
if (isRecording.value) {
handleTranscribe(true)
}
}, TRANSCRIBE_INTERVAL)
}
/**
* Handle speech recognition
* @param pollVoiceRecognition whether to run recognition in polling mode, defaults to false
*/
async function handleTranscribe(pollVoiceRecognition: boolean = false) {
let currentRecordingText = ''
try {
if (audioChunks.length === 0) return
const audioBlob = new Blob(audioChunks, { type: 'audio/wav' })
// Call the speech recognition API
const response = await AIServer.transcribeAudio(audioBlob)
const result = response.data?.text || ''
if (result) {
emit('transcribeResult', result)
currentRecordingText = result
}
} catch (error: any) {
console.error('Speech recognition failed:', error)
} finally {
if (!pollVoiceRecognition) {
recordingText.value = currentRecordingText
} else {
if (currentRecordingText && currentRecordingText === recordingText.value) {
// Same result as last time: start the silence timer; when it fires, recording finishes automatically
transcribeTimer = window.setTimeout(() => {
finishRecording()
}, SILENCE_DURATION)
} else {
// Different result: update the current transcript and keep listening
recordingText.value = currentRecordingText || recordingPlaceholder.value
startTranscribeTimer()
}
}
}
}
onBeforeUnmount(() => {
wavesurfer?.destroy()
wavesurfer = null
if (isRecording.value) {
stopRecording()
}
})
onMounted(() => {
if (waveform.value) {
wavesurfer = WaveSurfer.create({
container: waveform.value, // the DOM container the waveform renders into
waveColor: '#006EFF', // waveform color
progressColor: 'transparent',
barWidth: 5,
height: 60,
barRadius: 3,
})
}
})
defineExpose({
startRecording,
stopRecording,
})
</script>
<style lang="scss" scoped>
.voice-input {
display: flex;
flex-direction: column;
align-items: center;
padding: 0;
height: 100%;
position: relative;
}
.voice-input__container {
position: relative;
width: calc(100% - 22px);
margin: 0 11px;
min-height: 68px;
display: flex;
align-items: center;
justify-content: center;
border-radius: 26px;
overflow: hidden;
}
.voice-input__bg {
position: absolute;
top: 0;
left: 0;
width: 100%;
height: 68px;
object-fit: contain;
object-position: center center;
z-index: 1;
border-radius: 26px;
}
.voice-input__mic {
width: 80px;
height: 80px;
object-fit: contain;
margin-bottom: 8px;
position: absolute;
top: -6px;
cursor: pointer;
transition: transform 0.3s;
z-index: 1;
&:hover {
transform: scale(1.05);
}
&--recording {
animation: pulse 1.5s ease-in-out infinite;
}
}
.wave-form {
position: absolute;
z-index: 2;
pointer-events: none;
padding: 10px;
box-sizing: border-box;
}
@keyframes pulse {
0% {
transform: scale(1);
}
50% {
transform: scale(1.1);
}
100% {
transform: scale(1);
}
}
</style>