import pyaudio
import wave
import os
import time
import threading
import json
from vosk import Model, KaldiRecognizer
import edge_tts
import asyncio
import queue
import numpy as np
import pygame
from collections import OrderedDict
import tempfile
from webrtcvad import Vad
class FastASRDialogueSystem:
    """Low-latency Chinese voice dialogue pipeline.

    Captures microphone audio with PyAudio, segments utterances with WebRTC
    VAD, recognizes them with a Vosk model, and speaks replies through
    edge-tts + pygame playback.  Recognized transcripts are pushed onto
    ``text_queue``; callers supply a ``command_handler`` (see :meth:`start`)
    mapping a transcript to the text to speak back.
    """

    def __init__(self, model_path="/home/work/vosk-model-cn-0.15",
                 sample_rate=16000, chunk_size=8000,
                 language="zh-CN", voice="zh-CN-XiaoyiNeural",
                 cache_size=100, vad_mode=3,
                 speech_threshold=3, silence_threshold=8):
        """Build the full pipeline and warm up the TTS engine.

        Args:
            model_path: directory of the Vosk Chinese model.
            sample_rate: microphone sample rate in Hz (PCM16 mono).
            chunk_size: frames per PyAudio read.
            language: BCP-47 tag (informational; not used by Vosk itself).
            voice: edge-tts voice name.
            cache_size: max entries in the synthesized-audio LRU cache.
            vad_mode: WebRTC VAD aggressiveness, 0 (lenient) .. 3 (strict).
            speech_threshold: consecutive speech frames to confirm speech start.
            silence_threshold: consecutive silent frames to confirm speech end.
        """
        # Basic audio parameters
        self.sample_rate = sample_rate
        self.chunk_size = chunk_size
        self.language = language
        self.voice = voice

        # VAD core parameters
        self.vad_mode = vad_mode
        self.vad = Vad(vad_mode)
        self.frame_duration_ms = 30  # WebRTC VAD accepts 10/20/30 ms frames
        # 16-bit mono audio -> 2 bytes per sample
        self.vad_frame_size = int(sample_rate * self.frame_duration_ms / 1000) * 2
        self.speech_frames_required = speech_threshold
        self.silence_frames_allowed = silence_threshold

        # Rolling byte buffer fed by the mic, consumed in fixed VAD frames
        self.audio_buffer = b''
        self.min_speech_length = 200  # minimum utterance length (ms)

        # Recognition model
        print("正在加载Vosk中文模型...")
        self.model = Model(model_path)
        self.recognizer = KaldiRecognizer(self.model, sample_rate)
        self.recognizer.SetWords(True)

        # Microphone input stream
        self.p = pyaudio.PyAudio()
        self.stream = self.p.open(
            format=pyaudio.paInt16,
            channels=1,
            rate=sample_rate,
            input=True,
            frames_per_buffer=chunk_size,
            input_device_index=self._get_default_microphone()
        )

        # Message queue and state flags
        self.text_queue = queue.Queue()
        self.is_running = False
        self.speaking = False
        self.tts_task = None

        # Dedicated asyncio loop for edge-tts, driven by a daemon thread
        self.tts_loop = asyncio.new_event_loop()
        self.tts_thread = threading.Thread(target=self._run_tts_loop)
        self.tts_thread.daemon = True
        self.tts_thread.start()

        # Playback.  NOTE(review): edge-tts emits MP3 at its own rate;
        # presumably pygame resamples on load — confirm on target platform.
        pygame.mixer.init(frequency=sample_rate, size=-16, channels=1)

        # Performance statistics
        self.asr_time = 0
        self.tts_time = 0
        self.asr_count = 0
        self.tts_count = 0
        self.vad_stats = {"detect": 0, "reject": 0, "miss": 0}

        # LRU cache of synthesized audio, keyed by the exact reply text
        self.tts_cache = OrderedDict()
        self.cache_size = cache_size
        self.temp_dir = tempfile.TemporaryDirectory()
        self._preload_tts_engine()

    def _get_default_microphone(self):
        """Return the default input device index, falling back to device 0.

        PortAudio reports a missing default input device as ``None`` or
        ``-1`` depending on platform; both cases fall back to device 0.
        """
        info = self.p.get_host_api_info_by_index(0)
        default_device = info.get('defaultInputDevice')
        # FIX: also treat PortAudio's -1 sentinel as "no default device"
        if default_device is None or int(default_device) < 0:
            print("警告:未找到默认麦克风,使用设备0")
            return 0
        return int(default_device)

    def _preload_tts_engine(self):
        """Warm up edge-tts once so the first real reply is not slowed down."""
        async def preload():
            try:
                communicate = edge_tts.Communicate("预加载测试", self.voice)
                output = os.path.join(self.temp_dir.name, "preload.mp3")
                await communicate.save(output)
                if os.path.exists(output):
                    os.remove(output)
            except Exception as e:
                # Best-effort: a failed warm-up only costs first-reply latency
                print(f"预加载失败: {e}")
        # Block until the warm-up finishes so __init__ returns a ready system
        asyncio.run_coroutine_threadsafe(preload(), self.tts_loop).result()

    def _run_tts_loop(self):
        """Run the dedicated TTS asyncio event loop (daemon-thread target)."""
        asyncio.set_event_loop(self.tts_loop)
        self.tts_loop.run_forever()

    def start_listening(self):
        """Start the background microphone/VAD/ASR thread (idempotent)."""
        if self.is_running:
            return
        self.is_running = True
        self.listening_thread = threading.Thread(target=self._listening_loop)
        self.listening_thread.daemon = True
        self.listening_thread.start()
        print("语音监听已启动,可开始对话...")

    def stop_listening(self):
        """Stop all threads, release audio devices, and print statistics."""
        self.is_running = False
        if hasattr(self, 'listening_thread') and self.listening_thread.is_alive():
            self.listening_thread.join(timeout=1.0)
        self.tts_loop.call_soon_threadsafe(self.tts_loop.stop)
        if hasattr(self, 'tts_thread') and self.tts_thread.is_alive():
            self.tts_thread.join(timeout=1.0)
        # FIX: the PyAudio stream and PortAudio instance were leaked before
        try:
            self.stream.stop_stream()
            self.stream.close()
        finally:
            self.p.terminate()
        pygame.mixer.quit()
        self.temp_dir.cleanup()
        self._print_timing_statistics()
        print("语音系统已停止")

    def _print_timing_statistics(self):
        """Print aggregate ASR / TTS / VAD timing and hit-rate statistics."""
        if self.asr_count > 0:
            avg = self.asr_time / self.asr_count
            print(f"ASR: {self.asr_count}次, 总耗时{self.asr_time:.2f}s, 平均{avg:.2f}s")
        if self.tts_count > 0:
            avg = self.tts_time / self.tts_count
            print(f"TTS: {self.tts_count}次, 总耗时{self.tts_time:.2f}s, 平均{avg:.2f}s")
        if self.vad_stats["detect"] > 0:
            reject_rate = self.vad_stats["reject"] / self.vad_stats["detect"] * 100
            miss_rate = self.vad_stats["miss"] / self.asr_count * 100 if self.asr_count > 0 else 0
            print(
                f"VAD: 检测{self.vad_stats['detect']}次, 拒绝{self.vad_stats['reject']}次({reject_rate:.2f}%), 漏话率{miss_rate:.2f}%")

    def _listening_loop(self):
        """Microphone loop: VAD-segment the stream and hand utterances to ASR.

        State machine, evaluated per 30 ms VAD frame:

        * candidate — consecutive speech frames are buffered in ``pending``
          until ``speech_frames_required`` is reached (so onset audio is kept);
        * speaking — frames accumulate until ``silence_frames_allowed``
          consecutive silent frames, then the utterance is submitted if it is
          at least ``min_speech_length`` ms long, otherwise discarded.
        """
        frames = []    # confirmed utterance frames
        pending = []   # FIX: onset frames seen before confirmation (were dropped)
        speech_frames = 0
        silence_frames = 0
        is_speaking = False
        last_speech_time = 0
        print(f"VAD模式{self.vad_mode}启动,语音阈值{self.speech_frames_required}帧,静音阈值{self.silence_frames_allowed}帧")
        while self.is_running:
            try:
                data = self.stream.read(self.chunk_size, exception_on_overflow=False)
                self.vad_stats["detect"] += 1
                self.audio_buffer += data
                while len(self.audio_buffer) >= self.vad_frame_size:
                    vad_frame = self.audio_buffer[:self.vad_frame_size]
                    self.audio_buffer = self.audio_buffer[self.vad_frame_size:]
                    is_speech = self.vad.is_speech(vad_frame, self.sample_rate)
                    current_time = time.time()
                    if is_speech:
                        if not is_speaking:
                            pending.append(vad_frame)
                            speech_frames += 1
                            if speech_frames >= self.speech_frames_required:
                                is_speaking = True
                                frames = pending  # keep the onset audio
                                pending = []
                                speech_frames = 0
                                silence_frames = 0
                                last_speech_time = current_time
                                print("【语音开始】")
                        else:
                            frames.append(vad_frame)
                            silence_frames = 0
                            last_speech_time = current_time
                    else:
                        if is_speaking:
                            frames.append(vad_frame)
                            silence_frames += 1
                            if silence_frames >= self.silence_frames_allowed:
                                speech_length_ms = len(frames) * self.frame_duration_ms
                                if speech_length_ms >= self.min_speech_length:
                                    self._process_audio_frames(b''.join(frames))
                                    frames = []
                                    is_speaking = False
                                    print(f"【语音结束】长度{speech_length_ms}ms")
                                else:
                                    frames = []
                                    # FIX: was left True, jamming the state machine
                                    is_speaking = False
                                    self.vad_stats["reject"] += 1
                                    print(f"【丢弃短语音】长度{speech_length_ms}ms")
                        elif pending:
                            # FIX: reset the onset counter when candidate speech
                            # is interrupted by silence before confirmation
                            pending = []
                            speech_frames = 0
                            self.vad_stats["reject"] += 1
                    # Safety net: force-submit if the end of speech was missed
                    if is_speaking and (current_time - last_speech_time > 3.0):
                        speech_length_ms = len(frames) * self.frame_duration_ms
                        if speech_length_ms >= self.min_speech_length:
                            self._process_audio_frames(b''.join(frames))
                            frames = []
                            is_speaking = False
                            print(f"【超时提交】长度{speech_length_ms}ms")
                        else:
                            frames = []
                            is_speaking = False  # FIX: was left True here too
                            self.vad_stats["reject"] += 1
            except Exception as e:
                print(f"录音错误: {e}")
                # Drop partial state so the next read starts clean
                self.audio_buffer = b''
                frames = []
                pending = []
                speech_frames = 0
                is_speaking = False
                time.sleep(0.1)

    def _process_audio_frames(self, audio_data):
        """Run Vosk recognition over one complete utterance (PCM16 bytes)."""
        try:
            # Guard against fragments too short to recognize reliably
            if len(audio_data) < self.chunk_size // 4:
                self.vad_stats["miss"] += 1
                return
            start_time = time.time()
            self.recognizer.AcceptWaveform(audio_data)
            # FinalResult flushes and resets the recognizer for the next turn
            result = self.recognizer.FinalResult()
            self.asr_time += time.time() - start_time
            self.asr_count += 1
            self._process_recognition(result)
        except Exception as e:
            print(f"音频处理错误: {e}")
            self.vad_stats["miss"] += 1

    def _process_recognition(self, result):
        """Parse a Vosk JSON result string and queue any non-empty transcript."""
        try:
            # FIX: parse the JSON instead of substring-matching '"text":""',
            # which never matches Vosk's actual '"text" : ""' spacing
            parsed = json.loads(result)
            text = parsed.get("text", "").strip()
            if text:
                print(f"识别: {text}")
                self.text_queue.put(text)
        except Exception as e:
            print(f"识别解析错误: {e}")

    async def _generate_audio(self, text: str, voice: str) -> bytes:
        """Synthesize ``text`` with edge-tts and return MP3 bytes (LRU-cached)."""
        if text in self.tts_cache:
            print("使用缓存的语音合成结果")
            # FIX: refresh recency so the cache is truly LRU, not FIFO
            self.tts_cache.move_to_end(text)
            return self.tts_cache[text]
        communicate = edge_tts.Communicate(text, voice)
        output_file = os.path.join(self.temp_dir.name, f"{hash(text)}.mp3")
        await communicate.save(output_file)
        with open(output_file, 'rb') as f:
            audio_data = f.read()
        self._update_cache(text, audio_data)
        return audio_data

    def _update_cache(self, text, audio_data):
        """Insert into the bounded cache, evicting the least-recently-used entry."""
        if len(self.tts_cache) >= self.cache_size:
            self.tts_cache.popitem(last=False)
        self.tts_cache[text] = audio_data

    def _play_audio(self, audio_data: bytes) -> None:
        """Play MP3 bytes synchronously via a temporary file."""
        with tempfile.NamedTemporaryFile(suffix='.mp3', delete=False) as f:
            temp_file = f.name
            f.write(audio_data)
        try:
            pygame.mixer.music.load(temp_file)
            pygame.mixer.music.play()
            while pygame.mixer.music.get_busy():
                time.sleep(0.01)
        finally:
            # FIX: release pygame's file handle so the unlink works on Windows
            try:
                pygame.mixer.music.unload()
            except AttributeError:
                pass  # pygame < 2.0 has no unload(); best-effort cleanup
            if os.path.exists(temp_file):
                os.remove(temp_file)

    async def _tts_speech(self, text):
        """Generate and play one spoken reply, updating TTS timing stats."""
        if not text:
            return
        self.speaking = True
        print(f"正在语音回复: {text}")
        try:
            start_time = time.time()
            audio_data = await self._generate_audio(text, self.voice)
            tts_gen_time = time.time() - start_time
            start_time = time.time()
            self._play_audio(audio_data)
            tts_play_time = time.time() - start_time
            total_tts_time = tts_gen_time + tts_play_time
            self.tts_time += total_tts_time
            self.tts_count += 1
            print(f"语音合成耗时: {total_tts_time:.2f}秒 (生成: {tts_gen_time:.2f}秒, 播放: {tts_play_time:.2f}秒)")
        finally:
            # FIX: clear the flag even if synthesis/playback raises; otherwise
            # respond() would refuse every future reply
            self.speaking = False

    def respond(self, text):
        """Schedule ``text`` to be spoken on the TTS loop.

        Dropped if a reply is already playing; an unfinished pending task
        is cancelled in favor of the new text.
        """
        if self.speaking:
            print("当前正在语音回复,稍后处理新文本...")
            return
        if self.tts_task and not self.tts_task.done():
            self.tts_task.cancel()
        self.tts_task = asyncio.run_coroutine_threadsafe(
            self._tts_speech(text), self.tts_loop)

    def process_commands(self, command_handler):
        """Consume recognized text and speak ``command_handler``'s responses.

        Runs until :meth:`stop_listening` clears ``is_running``.  The handler
        receives the transcript and returns the reply to speak (falsy = none).
        """
        while self.is_running:
            try:
                # FIX: blocking get instead of empty()+get busy-polling,
                # which is racy and burns CPU; also lets the thread exit
                text = self.text_queue.get(timeout=0.1)
            except queue.Empty:
                continue
            response = command_handler(text)
            if response:
                self.respond(response)

    def start(self, command_handler):
        """Start listening plus a command thread, then block until shutdown.

        Ctrl-C triggers a clean :meth:`stop_listening`.
        """
        self.start_listening()
        self.command_thread = threading.Thread(target=self.process_commands, args=(command_handler,))
        self.command_thread.daemon = True
        self.command_thread.start()
        print("语音对话系统已启动,输入'退出'可停止系统")
        try:
            while self.is_running:
                time.sleep(1)
        except KeyboardInterrupt:
            self.stop_listening()
            print("系统已停止")
def chinese_command_handler(text):
    """Trivial echo handler: return the recognized transcript as the reply.

    Strips surrounding whitespace, logs the transcript, and removes the
    internal spaces Vosk inserts between Chinese words.
    """
    transcript = text.strip()
    print("识别内容:", transcript)
    return transcript.replace(" ", "")
if __name__ == "__main__":
    # Demo entry point: speak back whatever was recognized.
    system = FastASRDialogueSystem(
        vad_mode=2,          # slightly less aggressive than the class default
        speech_threshold=3,  # 3 x 30 ms frames to confirm speech start
        silence_threshold=8  # 8 x 30 ms frames of silence to end an utterance
    )
    system.start(chinese_command_handler)
# 加速后的效果截图: (stray article text — "screenshot of the result after
# speed-up" — commented out so the module stays importable)
