This article walks through building a real-time speech transcription component on top of the native MediaRecorder API, step by step: capturing the audio stream, visualizing the waveform (with wavesurfer.js), processing the data in real time, and transcribing the speech. The complete implementation is at the end of the article.
Audio stream capture
Capturing the audio stream mainly involves two APIs: MediaDevices.getUserMedia() and the MediaRecorder API. The code snippet is as follows:
ts
// Ask for permission to use the microphone: https://developer.mozilla.org/zh-CN/docs/Web/API/MediaDevices/getUserMedia
const stream = await navigator.mediaDevices.getUserMedia({ audio: true })
// Create a MediaRecorder instance: https://developer.mozilla.org/zh-CN/docs/Web/API/MediaRecorder
mediaRecorder = new MediaRecorder(stream)
// Start recording; the timeslice (ms) controls how often dataavailable fires
mediaRecorder.start(RECORDING_TIME_SLICE)
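By default the browser picks the recording container and codec (commonly audio/webm with Opus in Chromium-based browsers), and whatever it picks is what ends up in the recorded chunks; wrapping them in a Blob labeled audio/wav later does not convert the data, so the recognition backend has to accept the actual recorded format. If you want more control over the format, you can probe support before constructing the recorder. A minimal sketch, assuming you prefer one of a few common formats (pickSupportedMimeType is an illustrative helper, not part of the component below):
ts
// Illustrative helper: return the first MIME type the browser can actually record
function pickSupportedMimeType(): string | undefined {
  const candidates = ['audio/webm;codecs=opus', 'audio/webm', 'audio/ogg;codecs=opus', 'audio/mp4']
  return candidates.find((type) => MediaRecorder.isTypeSupported(type))
}

const mimeType = pickSupportedMimeType()
// Only pass mimeType when one is supported; otherwise let the browser decide
mediaRecorder = new MediaRecorder(stream, mimeType ? { mimeType } : undefined)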
Real-time data processing
Real-time data processing mainly relies on MediaRecorder.ondataavailable. This event handler lets you capture the audio stream as it is being recorded, and it is the key API for real-time transcription.
ts
let audioChunks: Blob[] = []
// Listen for the dataavailable event and store the recorded chunks in audioChunks
mediaRecorder.ondataavailable = (event) => {
if (event.data.size > 0) {
audioChunks.push(event.data)
}
}
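When the recorder stops, the chunks collected above can be merged into a single Blob for playback or a final recognition pass. A minimal sketch, assuming you want the finished recording as one object:
ts
mediaRecorder.onstop = () => {
  // Merge all timeslice chunks into one Blob, keeping the recorder's actual MIME type
  const fullRecording = new Blob(audioChunks, { type: mediaRecorder!.mimeType })
  // For example, make it playable via an object URL
  const url = URL.createObjectURL(fullRecording)
  console.log('Recording finished:', url)
}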
Waveform visualization
The waveform visualization uses wavesurfer.js, which can be installed with npm i wavesurfer.js. The implementation is as follows:
ts
import WaveSurfer from "wavesurfer.js"
// Initialize wavesurfer (see the wavesurfer.js docs for the full list of options)
wavesurfer = WaveSurfer.create({
container: waveform.value, // the DOM container the waveform renders into
waveColor: '#006EFF', // waveform color
progressColor: 'transparent',
barWidth: 5,
height: 60,
barRadius: 3
})
// Update the waveform in real time
const audioBlob = new Blob(audioChunks, { type: mediaRecorder!.mimeType })
// Draw the waveform for the audio captured so far
wavesurfer?.loadBlob(audioBlob)
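Since loadBlob re-decodes the entire recording, calling it on every dataavailable tick gets more expensive as the recording grows. A simple mitigation is to throttle the redraws; a minimal sketch, assuming a 500 ms minimum interval (lastWaveformUpdate and updateWaveform are illustrative names, not part of the component):
ts
let lastWaveformUpdate = 0
const WAVEFORM_UPDATE_INTERVAL = 500 // minimum ms between redraws (illustrative value)

function updateWaveform() {
  const now = Date.now()
  if (now - lastWaveformUpdate < WAVEFORM_UPDATE_INTERVAL) return
  lastWaveformUpdate = now
  const audioBlob = new Blob(audioChunks, { type: mediaRecorder!.mimeType })
  wavesurfer?.loadBlob(audioBlob)
}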
Speech recognition and transcription
Once the audio chunks are being captured, they can be wrapped in a Blob and passed to a speech recognition API to transcribe speech in near real time. The implementation here polls an HTTP endpoint; in practice a WebSocket-based streaming approach usually works better (a sketch is included after the polling timer below).
ts
/**
* Handle speech recognition
* @param pollVoiceRecognition whether to run recognition in polling mode, defaults to false
*/
async function handleTranscribe(pollVoiceRecognition: boolean = false) {
let currentRecordingText = ''
try {
if (audioChunks.length === 0) return
const audioBlob = new Blob(audioChunks, { type: 'audio/wav' })
// Call the speech recognition API
const response = await AIServer.transcribeAudio(audioBlob)
const result = response.data?.text || ''
if (result) {
emit('transcribeResult', result)
currentRecordingText = result
}
} catch (error: any) {
console.error('Speech recognition failed:', error)
} finally {
if (!pollVoiceRecognition) {
recordingText.value = currentRecordingText
} else {
if (currentRecordingText && currentRecordingText === recordingText.value) {
// Same result as last time: start the silence timer; when it fires, recording ends automatically
transcribeTimer = window.setTimeout(() => {
finishRecording()
}, SILENCE_DURATION)
} else {
// Different result: update the current transcript and keep listening
recordingText.value = currentRecordingText || recordingPlaceholder.value
startTranscribeTimer()
}
}
}
}
Implementation of the polling method
ts
/** Start the real-time transcription timer */
function startTranscribeTimer() {
if (transcribeTimer) {
clearTimeout(transcribeTimer)
}
if (!isRecording.value) return
transcribeTimer = window.setTimeout(() => {
if (isRecording.value) {
handleTranscribe(true)
}
}, TRANSCRIBE_INTERVAL)
}
Complete code
The implementation is based on Vue 3 and is provided for reference only.
html
<template>
<div class="voice-input">
<div class="voice-input__container">
<div class="wave-form" style="width: 100%" ref="waveform"></div>
<img class="voice-input__bg" src="../../../assets/img/ai-assistant/矩形AS.png" alt="" />
<img class="voice-input__mic" :class="{ 'voice-input__mic--recording': isRecording }"
src="../../../assets/img/ai/麦克风.png" alt="麦克风" @click="toggleRecording" />
</div>
</div>
</template>
<script setup lang="ts">
import { AIServer } from "@/API"
import { ref, onBeforeUnmount, onMounted } from 'vue'
import WaveSurfer from "wavesurfer.js"
const emit = defineEmits(['switchToText', 'transcribeResult', 'recordingStatus'])
const isRecording = ref(false)
const recordingText = ref('Click to start recording')
const recordingPlaceholder = ref("I'm listening~")
const lastTranscribeTime = ref(0)
const waveform = ref<HTMLDivElement>()
let wavesurfer: WaveSurfer | null = null;
let mediaRecorder: MediaRecorder | null = null
let audioChunks: Blob[] = []
let transcribeTimer: number | null = null
const TRANSCRIBE_INTERVAL = 800 // interval (ms) between transcription polls
const SILENCE_DURATION = 2000 // silence window (ms): no new speech within this period means transcription is complete
const RECORDING_TIME_SLICE = 200 // timeslice (ms) passed to MediaRecorder.start(); controls how often dataavailable fires
/** Toggle the recording state */
function toggleRecording() {
if (isRecording.value) {
stopRecording()
} else {
startRecording()
}
}
/** Start recording */
async function startRecording() {
try {
// Ask for permission to use the microphone: https://developer.mozilla.org/zh-CN/docs/Web/API/MediaDevices/getUserMedia
const stream = await navigator.mediaDevices.getUserMedia({ audio: true })
// Create a MediaRecorder instance: https://developer.mozilla.org/zh-CN/docs/Web/API/MediaRecorder
mediaRecorder = new MediaRecorder(stream)
audioChunks = []
// Listen for the dataavailable event and store the recorded chunks in audioChunks
mediaRecorder.ondataavailable = (event) => {
if (event.data.size > 0) {
audioChunks.push(event.data)
// Update the waveform in real time
const audioBlob = new Blob(audioChunks, { type: mediaRecorder!.mimeType })
// Draw the waveform for the audio captured so far
wavesurfer?.loadBlob(audioBlob)
}
}
// Start recording; the timeslice (ms) controls how often dataavailable fires
mediaRecorder.start(RECORDING_TIME_SLICE)
isRecording.value = true
recordingText.value = recordingPlaceholder.value
lastTranscribeTime.value = Date.now()
emit('recordingStatus', 'recording')
// Start the real-time transcription timer
startTranscribeTimer()
} catch (error) {
console.error('Failed to start recording:', error)
recordingText.value = 'Failed to start recording, please check microphone permissions'
emit('recordingStatus', 'error')
}
}
/** Stop recording */
function stopRecording() {
if (!isRecording.value) return
if (mediaRecorder && mediaRecorder.state !== 'inactive') {
mediaRecorder.stop()
mediaRecorder.stream.getTracks().forEach(track => track.stop())
}
if (transcribeTimer) {
clearTimeout(transcribeTimer)
transcribeTimer = null
}
isRecording.value = false
recordingText.value = 'Click to start recording'
wavesurfer?.empty()
}
/** Finish recording */
function finishRecording() {
stopRecording()
emit('recordingStatus', 'finished')
}
/** Start the real-time transcription timer */
function startTranscribeTimer() {
if (transcribeTimer) {
clearTimeout(transcribeTimer)
}
if (!isRecording.value) return
transcribeTimer = window.setTimeout(() => {
if (isRecording.value) {
handleTranscribe(true)
}
}, TRANSCRIBE_INTERVAL)
}
/**
* Handle speech recognition
* @param pollVoiceRecognition whether to run recognition in polling mode, defaults to false
*/
async function handleTranscribe(pollVoiceRecognition: boolean = false) {
let currentRecordingText = ''
try {
if (audioChunks.length === 0) return
const audioBlob = new Blob(audioChunks, { type: 'audio/wav' })
// Call the speech recognition API
const response = await AIServer.transcribeAudio(audioBlob)
const result = response.data?.text || ''
if (result) {
emit('transcribeResult', result)
currentRecordingText = result
}
} catch (error: any) {
console.error('Speech recognition failed:', error)
} finally {
if (!pollVoiceRecognition) {
recordingText.value = currentRecordingText
} else {
if (currentRecordingText && currentRecordingText === recordingText.value) {
// Same result as last time: start the silence timer; when it fires, recording finishes automatically
transcribeTimer = window.setTimeout(() => {
finishRecording()
}, SILENCE_DURATION)
} else {
// Different result: update the current transcript and keep listening
recordingText.value = currentRecordingText || recordingPlaceholder.value
startTranscribeTimer()
}
}
}
}
onBeforeUnmount(() => {
wavesurfer?.destroy()
wavesurfer = null
if (isRecording.value) {
stopRecording()
}
})
onMounted(() => {
if (waveform.value) {
wavesurfer = WaveSurfer.create({
container: waveform.value, // the DOM container the waveform renders into
waveColor: '#006EFF', // waveform color
progressColor: 'transparent',
barWidth: 5,
height: 60,
barRadius: 3,
})
}
})
defineExpose({
startRecording,
stopRecording,
})
</script>
<style lang="scss" scoped>
.voice-input {
display: flex;
flex-direction: column;
align-items: center;
padding: 0;
height: 100%;
position: relative;
}
.voice-input__container {
position: relative;
width: calc(100% - 22px);
margin: 0 11px;
min-height: 68px;
display: flex;
align-items: center;
justify-content: center;
border-radius: 26px;
overflow: hidden;
}
.voice-input__bg {
position: absolute;
top: 0;
left: 0;
width: 100%;
height: 68px;
object-fit: contain;
object-position: center center;
z-index: 1;
border-radius: 26px;
}
.voice-input__mic {
width: 80px;
height: 80px;
object-fit: contain;
margin-bottom: 8px;
position: absolute;
top: -6px;
cursor: pointer;
transition: transform 0.3s;
z-index: 1;
&:hover {
transform: scale(1.05);
}
&--recording {
animation: pulse 1.5s ease-in-out infinite;
}
}
.wave-form {
position: absolute;
z-index: 2;
pointer-events: none;
padding: 10px;
box-sizing: border-box;
}
@keyframes pulse {
0% {
transform: scale(1);
}
50% {
transform: scale(1.1);
}
100% {
transform: scale(1);
}
}
</style>