vosk+树莓派 启用GPU加速语音识别asr简单测试

完整示例代码如下:
import pyaudio
import wave
import os
import time
import threading
import json
from vosk import Model, KaldiRecognizer
import edge_tts
import asyncio
import queue
import numpy as np
import pygame
from collections import OrderedDict
import tempfile
from webrtcvad import Vad


class FastASRDialogueSystem:
    """Offline Chinese voice dialogue system.

    Pipeline: PyAudio microphone capture -> WebRTC VAD utterance
    segmentation -> Vosk ASR -> user-supplied command handler ->
    edge-tts synthesis -> pygame playback.  Capture, command handling
    and TTS each run on their own thread; TTS coroutines execute on a
    dedicated asyncio event loop.
    """

    def __init__(self, model_path="/home/work/vosk-model-cn-0.15",
                 sample_rate=16000, chunk_size=8000,
                 language="zh-CN", voice="zh-CN-XiaoyiNeural",
                 cache_size=100, vad_mode=3,
                 speech_threshold=3, silence_threshold=8):
        """Load the model, open the microphone and start the TTS loop.

        Args:
            model_path: directory of the Vosk Chinese model.
            sample_rate: capture rate in Hz (Vosk and VAD both use it).
            chunk_size: frames per PyAudio read.
            language: BCP-47 tag kept for reference (not used directly).
            voice: edge-tts voice name.
            cache_size: max entries in the LRU synthesis cache.
            vad_mode: WebRTC VAD aggressiveness, 0 (lenient) .. 3 (strict).
            speech_threshold: consecutive speech frames to confirm start.
            silence_threshold: consecutive silence frames to confirm end.
        """
        # Basic parameters
        self.sample_rate = sample_rate
        self.chunk_size = chunk_size
        self.language = language
        self.voice = voice

        # VAD core parameters
        self.vad_mode = vad_mode
        self.vad = Vad(vad_mode)
        self.frame_duration_ms = 30  # WebRTC VAD accepts 10/20/30 ms frames
        # Bytes per VAD frame: samples * 2 bytes (16-bit mono PCM).
        self.vad_frame_size = int(sample_rate * self.frame_duration_ms / 1000) * 2
        self.speech_frames_required = speech_threshold
        self.silence_frames_allowed = silence_threshold

        # Rolling byte buffer of raw mic data awaiting VAD framing
        self.audio_buffer = b''
        self.min_speech_length = 200  # minimum utterance length (ms)

        # Initialize the recognition model
        print("正在加载Vosk中文模型...")
        self.model = Model(model_path)
        self.recognizer = KaldiRecognizer(self.model, sample_rate)
        self.recognizer.SetWords(True)

        # Open the microphone input stream
        self.p = pyaudio.PyAudio()
        self.stream = self.p.open(
            format=pyaudio.paInt16,
            channels=1,
            rate=sample_rate,
            input=True,
            frames_per_buffer=chunk_size,
            input_device_index=self._get_default_microphone()
        )

        # Message queue and run-state flags
        self.text_queue = queue.Queue()
        self.is_running = False
        self.speaking = False
        self.tts_task = None

        # Dedicated asyncio loop for edge-tts, driven by a daemon thread
        self.tts_loop = asyncio.new_event_loop()
        self.tts_thread = threading.Thread(target=self._run_tts_loop)
        self.tts_thread.daemon = True
        self.tts_thread.start()

        # Audio playback
        pygame.mixer.init(frequency=sample_rate, size=-16, channels=1)

        # Performance statistics
        self.asr_time = 0
        self.tts_time = 0
        self.asr_count = 0
        self.tts_count = 0
        self.vad_stats = {"detect": 0, "reject": 0, "miss": 0}

        # LRU cache for synthesized audio keyed by text
        self.tts_cache = OrderedDict()
        self.cache_size = cache_size
        self.temp_dir = tempfile.TemporaryDirectory()
        self._preload_tts_engine()

    def _get_default_microphone(self):
        """Return the default input device index, falling back to 0."""
        info = self.p.get_host_api_info_by_index(0)
        default_device = info.get('defaultInputDevice')
        # FIX: PortAudio reports -1 (not None) when the host API has no
        # default input device, so treat negative indices as "not found".
        if default_device is None or default_device < 0:
            print("警告:未找到默认麦克风,使用设备0")
            return 0
        return default_device

    def _preload_tts_engine(self):
        """Warm up edge-tts once so the first real reply is not slow."""
        async def preload():
            try:
                communicate = edge_tts.Communicate("预加载测试", self.voice)
                output = os.path.join(self.temp_dir.name, "preload.mp3")
                await communicate.save(output)
                if os.path.exists(output):
                    os.remove(output)
            except Exception as e:
                print(f"预加载失败: {e}")

        # Blocks __init__ until the warm-up finishes (errors are caught
        # inside the coroutine, so .result() cannot raise from them).
        asyncio.run_coroutine_threadsafe(preload(), self.tts_loop).result()

    def _run_tts_loop(self):
        """Run the TTS event loop forever on its daemon thread."""
        asyncio.set_event_loop(self.tts_loop)
        self.tts_loop.run_forever()

    def start_listening(self):
        """Start the background capture/VAD thread (idempotent)."""
        if self.is_running:
            return
        self.is_running = True
        self.listening_thread = threading.Thread(target=self._listening_loop)
        self.listening_thread.daemon = True
        self.listening_thread.start()
        print("语音监听已启动,可开始对话...")

    def stop_listening(self):
        """Stop all threads, release audio resources and print stats."""
        self.is_running = False
        if hasattr(self, 'listening_thread') and self.listening_thread.is_alive():
            self.listening_thread.join(timeout=1.0)

        self.tts_loop.call_soon_threadsafe(self.tts_loop.stop)
        if hasattr(self, 'tts_thread') and self.tts_thread.is_alive():
            self.tts_thread.join(timeout=1.0)

        # FIX: release the microphone stream and the PyAudio handle —
        # the original leaked both on shutdown.
        if hasattr(self, 'stream'):
            try:
                self.stream.stop_stream()
                self.stream.close()
            except Exception as e:
                print(f"关闭音频流失败: {e}")
        if hasattr(self, 'p'):
            self.p.terminate()

        pygame.mixer.quit()
        self.temp_dir.cleanup()
        self._print_timing_statistics()
        print("语音系统已停止")

    def _print_timing_statistics(self):
        """Print ASR/TTS timing averages and VAD accept/reject rates."""
        if self.asr_count > 0:
            avg = self.asr_time / self.asr_count
            print(f"ASR: {self.asr_count}次, 总耗时{self.asr_time:.2f}s, 平均{avg:.2f}s")
        if self.tts_count > 0:
            avg = self.tts_time / self.tts_count
            print(f"TTS: {self.tts_count}次, 总耗时{self.tts_time:.2f}s, 平均{avg:.2f}s")
        if self.vad_stats["detect"] > 0:
            reject_rate = self.vad_stats["reject"] / self.vad_stats["detect"] * 100
            miss_rate = self.vad_stats["miss"] / self.asr_count * 100 if self.asr_count > 0 else 0
            print(
                f"VAD: 检测{self.vad_stats['detect']}次, 拒绝{self.vad_stats['reject']}次({reject_rate:.2f}%), 漏话率{miss_rate:.2f}%")

    def _listening_loop(self):
        """Capture loop: read mic data, frame it for VAD, and submit
        complete utterances to the recognizer."""
        frames = []            # VAD frames of the utterance in progress
        speech_frames = 0      # consecutive speech frames before confirmation
        silence_frames = 0     # consecutive silence frames inside an utterance
        is_speaking = False
        last_speech_time = 0

        print(f"VAD模式{self.vad_mode}启动,语音阈值{self.speech_frames_required}帧,静音阈值{self.silence_frames_allowed}帧")

        while self.is_running:
            try:
                data = self.stream.read(self.chunk_size, exception_on_overflow=False)
                self.vad_stats["detect"] += 1
                self.audio_buffer += data

                # Slice the rolling buffer into fixed-size VAD frames.
                while len(self.audio_buffer) >= self.vad_frame_size:
                    vad_frame = self.audio_buffer[:self.vad_frame_size]
                    self.audio_buffer = self.audio_buffer[self.vad_frame_size:]

                    is_speech = self.vad.is_speech(vad_frame, self.sample_rate)
                    current_time = time.time()

                    if is_speech:
                        if not is_speaking:
                            speech_frames += 1
                            if speech_frames >= self.speech_frames_required:
                                is_speaking = True
                                frames = [vad_frame]
                                speech_frames = 0
                                silence_frames = 0
                                last_speech_time = current_time
                                print("【语音开始】")
                        else:
                            frames.append(vad_frame)
                            silence_frames = 0
                            last_speech_time = current_time
                    else:
                        if is_speaking:
                            frames.append(vad_frame)
                            silence_frames += 1

                            if silence_frames >= self.silence_frames_allowed:
                                speech_length_ms = len(frames) * self.frame_duration_ms
                                # FIX: leave the speaking state in BOTH
                                # branches — the original kept is_speaking
                                # True after discarding a short utterance,
                                # causing endless bogus discard cycles.
                                is_speaking = False
                                if speech_length_ms >= self.min_speech_length:
                                    self._process_audio_frames(b''.join(frames))
                                    print(f"【语音结束】长度{speech_length_ms}ms")
                                else:
                                    self.vad_stats["reject"] += 1
                                    print(f"【丢弃短语音】长度{speech_length_ms}ms")
                                frames = []
                        else:
                            # FIX: a silence frame breaks the run of speech
                            # frames — reset the confirmation counter so
                            # scattered noise frames cannot accumulate into
                            # a false speech start.
                            speech_frames = 0
                            if frames:
                                frames = []
                                self.vad_stats["reject"] += 1

                        # Safety net: force-submit an utterance that saw no
                        # new speech frame for over 3 seconds.
                        if is_speaking and (current_time - last_speech_time > 3.0):
                            speech_length_ms = len(frames) * self.frame_duration_ms
                            is_speaking = False  # FIX: same discard-state bug as above
                            if speech_length_ms >= self.min_speech_length:
                                self._process_audio_frames(b''.join(frames))
                                print(f"【超时提交】长度{speech_length_ms}ms")
                            else:
                                self.vad_stats["reject"] += 1
                            frames = []
            except Exception as e:
                print(f"录音错误: {e}")
                # Drop partial state so the next read starts clean.
                self.audio_buffer = b''
                frames = []
                is_speaking = False
                time.sleep(0.1)

    def _process_audio_frames(self, audio_data):
        """Run Vosk recognition on one complete utterance (PCM bytes)."""
        try:
            # Too little audio to recognize — count it as a miss.
            if len(audio_data) < self.chunk_size // 4:
                self.vad_stats["miss"] += 1
                return

            start_time = time.time()
            self.recognizer.AcceptWaveform(audio_data)
            result = self.recognizer.FinalResult()
            end_time = time.time()
            self.asr_time += end_time - start_time
            self.asr_count += 1

            self._process_recognition(result)
        except Exception as e:
            print(f"音频处理错误: {e}")
            self.vad_stats["miss"] += 1

    def _process_recognition(self, result):
        """Extract the text field from a Vosk JSON result and enqueue it."""
        try:
            # FIX: parse the JSON directly instead of the fragile
            # '"text":""' substring test (which breaks on whitespace
            # variations in the serialized result).
            parsed = json.loads(result)
            text = parsed.get("text", "").strip()
            if text:
                print(f"识别: {text}")
                self.text_queue.put(text)
        except Exception as e:
            print(f"识别解析错误: {e}")

    # Speech-synthesis helpers
    async def _generate_audio(self, text: str, voice: str) -> bytes:
        """Synthesize *text* with edge-tts, using the LRU cache."""
        if text in self.tts_cache:
            print("使用缓存的语音合成结果")
            return self.tts_cache[text]

        communicate = edge_tts.Communicate(text, voice)
        output_file = os.path.join(self.temp_dir.name, f"{hash(text)}.mp3")
        await communicate.save(output_file)

        with open(output_file, 'rb') as f:
            audio_data = f.read()
        # FIX: the bytes are cached in memory, so drop the file instead of
        # letting the temp dir accumulate one mp3 per distinct text.
        if os.path.exists(output_file):
            os.remove(output_file)

        self._update_cache(text, audio_data)
        return audio_data

    def _update_cache(self, text, audio_data):
        """Insert into the cache, evicting the oldest entry when full."""
        if len(self.tts_cache) >= self.cache_size:
            self.tts_cache.popitem(last=False)
        self.tts_cache[text] = audio_data

    def _play_audio(self, audio_data: bytes) -> None:
        """Play MP3 bytes through pygame, blocking until playback ends."""
        with tempfile.NamedTemporaryFile(suffix='.mp3', delete=False) as f:
            temp_file = f.name
            f.write(audio_data)

        try:
            pygame.mixer.music.load(temp_file)
            pygame.mixer.music.play()
            while pygame.mixer.music.get_busy():
                time.sleep(0.01)
        finally:
            if os.path.exists(temp_file):
                os.remove(temp_file)

    async def _tts_speech(self, text):
        """Synthesize and play *text*, tracking timing statistics."""
        if not text:
            return

        self.speaking = True
        print(f"正在语音回复: {text}")

        # FIX: reset self.speaking in a finally block — if synthesis or
        # playback raised, the original left speaking=True forever and
        # respond() would refuse all further replies.
        try:
            start_time = time.time()
            audio_data = await self._generate_audio(text, self.voice)
            end_time = time.time()
            tts_gen_time = end_time - start_time

            start_time = time.time()
            self._play_audio(audio_data)
            end_time = time.time()
            tts_play_time = end_time - start_time

            total_tts_time = tts_gen_time + tts_play_time
            self.tts_time += total_tts_time
            self.tts_count += 1

            print(f"语音合成耗时: {total_tts_time:.2f}秒 (生成: {tts_gen_time:.2f}秒, 播放: {tts_play_time:.2f}秒)")
        finally:
            self.speaking = False

    def respond(self, text):
        """Schedule *text* for speech on the TTS loop (non-blocking)."""
        if self.speaking:
            print("当前正在语音回复,稍后处理新文本...")
            return

        # Cancel any not-yet-finished previous request.
        if self.tts_task and not self.tts_task.done():
            self.tts_task.cancel()

        self.tts_task = asyncio.run_coroutine_threadsafe(
            self._tts_speech(text), self.tts_loop)

    def process_commands(self, command_handler):
        """Consume recognized text and speak the handler's reply.

        FIX: blocks on the queue with a timeout instead of busy-polling
        queue.empty() in an unconditional ``while True``; the loop now
        exits once the system stops and the queue drains.
        """
        while self.is_running or not self.text_queue.empty():
            try:
                text = self.text_queue.get(timeout=0.1)
            except queue.Empty:
                continue
            response = command_handler(text)
            if response:
                self.respond(response)

    def start(self, command_handler):
        """Start listening plus the command thread, then block until
        Ctrl-C, at which point everything is shut down cleanly."""
        self.start_listening()
        self.command_thread = threading.Thread(target=self.process_commands, args=(command_handler,))
        self.command_thread.daemon = True
        self.command_thread.start()
        print("语音对话系统已启动,输入'退出'可停止系统")

        try:
            while self.is_running:
                time.sleep(1)
        except KeyboardInterrupt:
            self.stop_listening()
            print("系统已停止")


def chinese_command_handler(text):
    """Echo handler: log the recognized text and return it with all
    ASCII spaces removed so TTS reads it back as one phrase."""
    cleaned = text.strip()
    print("识别内容:", cleaned)
    return "".join(cleaned.split(" "))


if __name__ == "__main__":
    system = FastASRDialogueSystem(
        vad_mode=2,
        speech_threshold=3,
        silence_threshold=8
    )
    system.start(chinese_command_handler)

加速后的效果截图: