Jetson 音频/语音处理:Whisper 语音识别与 TTS
1. Jetson 音频硬件
bash
# List the ALSA devices that are available.
arecord --list-devices  # capture (recording) devices
aplay --list-devices    # playback devices
# USB microphone (recommended): record 5 s of 16 kHz, mono, signed 16-bit
# audio from card 1 / device 0, then play the recording back.
arecord --device=plughw:1,0 --format=S16_LE --rate=16000 --channels=1 --duration=5 test.wav
aplay test.wav
# Install the audio tooling and the Python audio libraries.
sudo apt install -y alsa-utils pulseaudio portaudio19-dev
pip3 install pyaudio sounddevice
2. Whisper 语音识别
2.1 安装
bash
# Build whisper.cpp (the C++ port, which performs better).
git clone https://github.com/ggerganov/whisper.cpp.git
cd whisper.cpp
make -j$(nproc)
# Download the multilingual "base" model. The ".en" variants are English-only
# and cannot transcribe the Chinese (language="zh") audio used in this guide.
bash ./models/download-ggml-model.sh base
# Python option 1: the reference OpenAI implementation
# (note: this is NOT a Python binding for whisper.cpp).
pip3 install openai-whisper
# Python option 2 (recommended): faster-whisper.
pip3 install faster-whisper
2.2 实时语音识别
python
#!/usr/bin/env python3
"""whisper_realtime.py - 实时语音识别"""
import numpy as np
import pyaudio
import threading
import queue
from faster_whisper import WhisperModel
class RealtimeWhisper:
    """Chunked microphone speech recognition built on faster-whisper.

    ``start()`` launches two daemon threads: one reads float32 mono audio
    from a PyAudio input stream and queues it in chunks of at least
    ``chunk_duration`` seconds; the other transcribes every queued chunk and
    prints the recognized segments.  ``recognize_once()`` is a blocking
    alternative that records a single chunk and returns its text.
    """

    # Number of frames requested from PyAudio per read.
    FRAMES_PER_BUFFER = 1024

    def __init__(self, model_size="base", device="cuda", compute_type=None,
                 language="zh"):
        """Load the model and initialize the (idle) recognizer state.

        Args:
            model_size: Model size name or path handed to ``WhisperModel``.
            device: Inference device handed to ``WhisperModel``.
            compute_type: Numeric precision for inference.  ``None`` selects
                "float16" on CUDA (the previous hard-coded value) and lets
                the backend choose on any other device.
            language: Language code passed to every ``transcribe`` call.
        """
        if compute_type is None:
            compute_type = self._default_compute_type(device)
        self.model = WhisperModel(model_size, device=device, compute_type=compute_type)
        self.language = language
        self.audio_queue = queue.Queue()
        self.sample_rate = 16000
        self.chunk_duration = 3  # seconds of audio per recognition pass
        self.running = False
        self.record_thread = None
        self.transcribe_thread = None

    @staticmethod
    def _default_compute_type(device):
        """Return the compute type to use when the caller did not pick one."""
        # float16 is only requested for CUDA; elsewhere defer to the backend.
        return "float16" if str(device).startswith("cuda") else "default"

    def start(self):
        """Start the recording and transcription threads (no-op if running)."""
        if self.running:
            return
        self.running = True
        self.record_thread = threading.Thread(target=self._record_loop, daemon=True)
        self.record_thread.start()
        self.transcribe_thread = threading.Thread(target=self._transcribe_loop, daemon=True)
        self.transcribe_thread.start()

    def stop(self, timeout=2.0):
        """Ask both worker threads to finish and wait briefly for them.

        Args:
            timeout: Maximum seconds to wait for each worker.  Waiting gives
                the recorder a chance to close its audio stream before the
                caller exits; daemon threads are otherwise killed mid-flight.
        """
        self.running = False
        for worker in (self.record_thread, self.transcribe_thread):
            if worker is not None and worker.is_alive():
                worker.join(timeout)

    def recognize_once(self, duration=None):
        """Record one chunk from the microphone and return its transcript.

        Args:
            duration: Seconds to record; defaults to ``chunk_duration``.

        Returns:
            The non-empty segment texts joined by single spaces ("" when
            nothing was recognized).

        NOTE(review): this opens its own input stream; calling it while
        ``start()`` is active means two streams on the same device --
        confirm the device supports that.
        """
        seconds = self.chunk_duration if duration is None else duration
        wanted = int(self.sample_rate * seconds)
        samples = self._with_input_stream(lambda stream: self._capture(stream, wanted))
        return " ".join(text for _start, _end, text in self._transcribe(samples))

    def _with_input_stream(self, action):
        """Open the input stream, run ``action(stream)``, always release it."""
        audio = pyaudio.PyAudio()
        try:
            stream = audio.open(
                format=pyaudio.paFloat32,
                channels=1,
                rate=self.sample_rate,
                input=True,
                frames_per_buffer=self.FRAMES_PER_BUFFER,
            )
            try:
                return action(stream)
            finally:
                stream.stop_stream()
                stream.close()
        finally:
            audio.terminate()

    def _capture(self, stream, min_samples, keep_going=None):
        """Read whole buffers until at least ``min_samples`` are collected.

        Args:
            stream: Object whose ``read(frames, exception_on_overflow=...)``
                returns raw float32 bytes.
            min_samples: Minimum number of samples to gather (at least 1).
            keep_going: Optional zero-argument callable checked before each
                read; when it returns false the partial audio is discarded.

        Returns:
            A float32 array, or ``None`` if ``keep_going`` aborted the read.
        """
        min_samples = max(1, min_samples)
        frames = []
        buffered = 0
        while buffered < min_samples:
            if keep_going is not None and not keep_going():
                return None
            # Do not raise on input overflow: a late read should drop audio,
            # not kill the recording thread.
            raw = stream.read(self.FRAMES_PER_BUFFER, exception_on_overflow=False)
            frame = np.frombuffer(raw, dtype=np.float32)
            frames.append(frame)
            buffered += len(frame)
        # One concatenate per chunk instead of boxing every sample in a list.
        return np.concatenate(frames)

    def _record_loop(self):
        """Recorder thread body: queue audio chunks until ``running`` clears."""
        chunk_size = int(self.sample_rate * self.chunk_duration)

        def pump(stream):
            while self.running:
                chunk = self._capture(stream, chunk_size, lambda: self.running)
                if chunk is not None:
                    self.audio_queue.put(chunk)

        self._with_input_stream(pump)

    def _transcribe(self, audio):
        """Yield ``(start, end, text)`` for each non-empty segment of ``audio``."""
        segments, _info = self.model.transcribe(
            audio,
            beam_size=5,
            language=self.language,
            vad_filter=True,
        )
        for segment in segments:
            text = segment.text.strip()
            if text:
                yield segment.start, segment.end, text

    def _transcribe_loop(self):
        """Transcriber thread body: print the segments of each queued chunk."""
        while self.running:
            try:
                # The timeout lets the loop notice ``running`` being cleared.
                audio = self.audio_queue.get(timeout=0.5)
            except queue.Empty:
                continue
            for start, end, text in self._transcribe(audio):
                print(f"[{start:.1f}s-{end:.1f}s] {text}")
if __name__ == "__main__":
    import time

    recognizer = RealtimeWhisper(model_size="base", device="cuda")
    recognizer.start()
    try:
        # Sleep instead of spinning: `while True: pass` pins a CPU core and
        # competes with the worker threads for the GIL.
        while True:
            time.sleep(0.5)
    except KeyboardInterrupt:
        recognizer.stop()
2.3 Whisper TensorRT 加速
python
#!/usr/bin/env python3
"""whisper_trt.py - Whisper TensorRT 加速"""
import tensorrt as trt
import numpy as np
def convert_whisper_to_trt(whisper_model_path, trt_engine_path):
    """Load a Whisper model for FP16 GPU inference through faster-whisper.

    Despite its name, this function builds no TensorRT engine: it returns a
    faster-whisper ``WhisperModel`` (CTranslate2 backend) configured for
    CUDA with float16 compute.

    Args:
        whisper_model_path: Model size name (e.g. "base") or path to a
            CTranslate2-converted model.  Falls back to "base" when empty
            or ``None``.
        trt_engine_path: Unused; kept so existing callers keep working.

    Returns:
        The loaded ``WhisperModel``.
    """
    from faster_whisper import WhisperModel

    return WhisperModel(
        whisper_model_path or "base",
        device="cuda",
        compute_type="float16",  # FP16 inference
        cpu_threads=4,
    )
# 性能对比(Orin NX 16GB):
# ┌─────────────┬──────────┬──────────┐
# │ 模型 │ FP32 │ FP16 │
# ├─────────────┼──────────┼──────────┤
# │ tiny │ 15x │ 25x │
# │ base │ 8x │ 15x │
# │ small │ 3x │ 6x │
# │ medium │ 1x │ 2.5x │
# └─────────────┴──────────┴──────────┘
# 注:表中数值为实时倍率(>1x 表示快于实时)
3. TTS 语音合成
python
#!/usr/bin/env python3
"""tts_jetson.py - 语音合成"""
from TTS.api import TTS
class JetsonTTS:
    """Text-to-speech helper built on the ``TTS`` API object."""

    def __init__(self, model_name="tts_models/zh-CN/baker/tacotron2-DDC-GST",
                 device="cuda"):
        """Load the TTS model and move it to ``device``.

        Args:
            model_name: Model identifier handed to ``TTS``.
            device: Device the model is moved to (default "cuda", as before).
        """
        self.tts = TTS(model_name).to(device)

    def synthesize(self, text, output_path="output.wav"):
        """Synthesize ``text`` into the audio file ``output_path``."""
        self.tts.tts_to_file(text=text, file_path=output_path)
        print(f"语音已保存: {output_path}")

    def speak(self, text):
        """Synthesize ``text`` and play it with ``aplay``.

        The audio goes to a unique temporary file that is removed after
        playback, so concurrent processes no longer overwrite one shared
        ``/tmp/tts_output.wav``.  A non-zero ``aplay`` exit status is
        reported instead of being silently ignored.
        """
        import os
        import subprocess
        import tempfile

        fd, wav_path = tempfile.mkstemp(prefix="tts_", suffix=".wav")
        os.close(fd)  # only the path is needed; the TTS engine writes the file
        try:
            # Written directly rather than via synthesize(), which would
            # announce a "saved" path that is deleted right after playback.
            self.tts.tts_to_file(text=text, file_path=wav_path)
            result = subprocess.run(["aplay", wav_path])
            if result.returncode != 0:
                print(f"播放失败: aplay 退出码 {result.returncode}")
        finally:
            os.remove(wav_path)
if __name__ == "__main__":
    # Demo entry point: build the synthesizer and speak one greeting.
    JetsonTTS().speak("你好,我是 Jetson 语音助手")
4. 语音唤醒词检测
python
#!/usr/bin/env python3
"""wake_word.py - 语音唤醒词"""
import pvporcupine
import pyaudio
import struct
class WakeWordDetector:
    """Wake-word detection with Porcupine on a PyAudio input stream."""

    # Built-in keyword used when the caller supplies no keyword at all.
    DEFAULT_KEYWORD = "porcupine"
    # Sensitivity applied to every keyword unless the caller overrides it.
    DEFAULT_SENSITIVITY = 0.5

    def __init__(self, keyword_paths=None, sensitivities=None, access_key=None,
                 keywords=None):
        """Create the Porcupine engine and open the microphone stream.

        Args:
            keyword_paths: Paths to keyword files.  When neither this nor
                ``keywords`` is given, the built-in ``DEFAULT_KEYWORD`` is
                used, so ``WakeWordDetector()`` works without arguments.
            sensitivities: One sensitivity per keyword; defaults to
                ``DEFAULT_SENSITIVITY`` for each keyword.
            access_key: Picovoice access key.  Defaults to the
                ``PICOVOICE_ACCESS_KEY`` environment variable, then to the
                "YOUR_ACCESS_KEY" placeholder.
            keywords: Built-in keyword names, used when ``keyword_paths`` is
                not given.
        """
        import os

        if access_key is None:
            access_key = os.environ.get("PICOVOICE_ACCESS_KEY", "YOUR_ACCESS_KEY")
        if not keyword_paths and not keywords:
            keywords = [self.DEFAULT_KEYWORD]
        keyword_count = len(keyword_paths) if keyword_paths else len(keywords)

        self.pa = None
        self.stream = None
        self.porcupine = pvporcupine.create(
            access_key=access_key,
            keyword_paths=keyword_paths or None,
            keywords=None if keyword_paths else keywords,
            sensitivities=self._resolve_sensitivities(sensitivities, keyword_count),
        )
        try:
            self.pa = pyaudio.PyAudio()
            self.stream = self.pa.open(
                rate=self.porcupine.sample_rate,
                channels=1,
                format=pyaudio.paInt16,
                input=True,
                frames_per_buffer=self.porcupine.frame_length,
            )
        except Exception:
            # Do not leak the Porcupine handle when the audio device fails.
            self.cleanup()
            raise

    @staticmethod
    def _resolve_sensitivities(sensitivities, keyword_count):
        """Return one sensitivity per keyword, filling in the default."""
        if not sensitivities:
            return [WakeWordDetector.DEFAULT_SENSITIVITY] * keyword_count
        return list(sensitivities)

    def __enter__(self):
        return self

    def __exit__(self, exc_type, exc, tb):
        self.cleanup()
        return False

    def listen(self):
        """Block until a wake word is heard and return its keyword index."""
        print("等待唤醒词...")
        frame_length = self.porcupine.frame_length
        frame_format = "h" * frame_length  # signed 16-bit samples
        while True:
            # Do not raise on input overflow; a dropped frame only delays
            # detection, while an exception would end the listening loop.
            raw = self.stream.read(frame_length, exception_on_overflow=False)
            pcm = struct.unpack_from(frame_format, raw)
            keyword_index = self.porcupine.process(pcm)
            if keyword_index >= 0:
                print(f"唤醒词检测到!索引: {keyword_index}")
                return keyword_index

    def cleanup(self):
        """Release the stream, PyAudio and Porcupine; safe to call twice."""
        if self.stream is not None:
            self.stream.close()
            self.stream = None
        if self.pa is not None:
            self.pa.terminate()
            self.pa = None
        if self.porcupine is not None:
            self.porcupine.delete()
            self.porcupine = None
5. 完整语音助手
python
#!/usr/bin/env python3
"""voice_assistant.py - Jetson 语音助手"""
import threading
import queue
class VoiceAssistant:
    """Voice assistant loop: wake word -> speech recognition -> spoken reply.

    NOTE(review): ``WakeWordDetector``, ``RealtimeWhisper`` and ``JetsonTTS``
    are not imported in this script; they are assumed to be in scope.  The
    recognizer must also provide ``recognize_once()`` -- verify both.
    """

    def __init__(self, wake_detector=None, whisper=None, tts=None):
        """Wire up the components, building defaults for any not supplied.

        Args:
            wake_detector: Object with ``listen()`` and ``cleanup()``;
                defaults to ``WakeWordDetector()``.
            whisper: Object with ``recognize_once()``; defaults to
                ``RealtimeWhisper(model_size="base")``.
            tts: Object with ``speak(text)``; defaults to ``JetsonTTS()``.
        """
        self.wake_detector = WakeWordDetector() if wake_detector is None else wake_detector
        self.whisper = RealtimeWhisper(model_size="base") if whisper is None else whisper
        self.tts = JetsonTTS() if tts is None else tts
        self.command_queue = queue.Queue()

    def run(self):
        """Serve wake-word interactions forever.

        The wake-word detector is released when the loop is left through an
        exception (including Ctrl+C), so the audio device is not leaked.
        """
        print("语音助手已启动,等待唤醒词...")
        try:
            while True:
                self._handle_interaction()
        finally:
            self.wake_detector.cleanup()

    def _handle_interaction(self):
        """Wait for the wake word, recognize one command and speak the reply."""
        self.wake_detector.listen()
        print("已唤醒,请说话...")
        text = self.whisper.recognize_once()
        print(f"识别结果: {text}")
        response = self.process_command(text)
        print(f"回复: {response}")
        self.tts.speak(response)

    def process_command(self, text):
        """Map recognized text to a reply string.

        Args:
            text: Recognized speech; ``None`` or "" is treated as
                "not understood" instead of raising.

        Returns:
            The reply for the first keyword found in ``text``, or the
            fallback apology when no keyword matches.
        """
        text = (text or "").lower()
        if "天气" in text:
            return "今天天气晴朗,温度 25 度"
        if "时间" in text:
            from datetime import datetime
            return f"现在时间是 {datetime.now().strftime('%H:%M')}"
        if "拍照" in text:
            return "已拍照保存"
        return "抱歉,我没有听懂"
if __name__ == "__main__":
    assistant = VoiceAssistant()
    try:
        assistant.run()
    except KeyboardInterrupt:
        # Ctrl+C is the expected way to quit; exit without a traceback,
        # matching how whisper_realtime.py handles it.
        print("语音助手已退出")
总结
| 功能 | 方案 | 延迟 |
|---|---|---|
| 语音识别 | faster-whisper (base) | <1s |
| 语音合成 | TTS (tacotron2) | <2s |
| 唤醒词 | Porcupine | <100ms |
| 实时转写 | faster-whisper + VAD | <3s |
核心要点:
- faster-whisper:比 OpenAI Whisper 最高可快约 4 倍,支持 FP16
- GPU 加速:Whisper 和 TTS 都可以用 GPU 推理
- VAD 过滤:语音活动检测减少无效推理
- Porcupine:低功耗唤醒词检测,适合常开场景