本地可视化 AI 语音助手搭建教程
Ollama + Qwen2.5 + faster-whisper + Python GUI
官网:https://ollama.com(可直接访问 Ollama 官方网站下载)
可以直接下载Windows版本
提示: 本教程按实际搭建过程整理,包含环境准备、模型下载、麦克风调试、Whisper 问题修复、GUI 开发和后续优化方向。
项目概览
| 项目 | 内容 |
|---|---|
| 系统 | Windows |
| 主要模型 | qwen2.5:7b, faster-whisper small |
| 输入方式 | 文字输入 / 语音输入 |
| 界面 | customtkinter 白色桌面界面 |
| 运行方式 | 本地运行,Ollama 下载完成后可离线推理 |
1. 项目目标
本项目目标是搭建一个运行在 Windows 本地电脑上的可视化 AI 语音助手。
- 交互流程: 文字/语音输入 → faster-whisper 识别 → Ollama 本地模型推理 → 可视化界面显示回复。
- 当前重点: 完成"本地 AI 对话 + 语音输入 + 可视化界面",暂不涉及设备控制。
2. 硬件与软件环境
- 测试配置: CPU i5-12400F / 16GB 内存 / RTX 30 系列及以上显卡
- Python 版本: 建议使用 3.10 或 3.11(使用虚拟环境)。
核心依赖库
- Ollama(本地大模型运行框架)
- faster-whisper & CTranslate2(语音识别)
- sounddevice, scipy, numpy(音频处理)
- customtkinter(GUI 界面)
- requests(调用 Ollama API)
3. 环境搭建
创建虚拟环境
powershell
# 进入项目目录
cd D:\edge_ai_iot
# 创建虚拟环境
python -m venv .venv
# 激活虚拟环境
.\.venv\Scripts\activate
# 确认 Python 路径是否正确
python -c "import sys; print(sys.executable)"
# 提示:正确路径应为 D:\edge_ai_iot\.venv\Scripts\python.exe
安装 Python 依赖
powershell
# 升级 pip
python -m pip install --upgrade pip
# 安装基础库
python -m pip install sounddevice scipy numpy requests customtkinter
# 安装稳定版 faster-whisper 组合
python -m pip install faster-whisper==1.0.3 ctranslate2==4.4.0
# 修复 pkg_resources 缺失问题
python -m pip install --force-reinstall "setuptools<80" wheel
4. 模型部署 (Ollama)
- 下载模型: ollama pull qwen2.5:7b
- 查看本地模型: ollama list
- API 地址: Python 将通过 http://localhost:11434/api/chat 调用。
常见问题: 若下载报错 lookup registry.ollama.ai: no such host,请尝试:
- 刷新 DNS: ipconfig /flushdns
- 设置代理(根据实际端口修改,在 PowerShell 中执行):
  $env:HTTPS_PROXY="http://127.0.0.1:7890"
  ollama pull qwen2.5:7b
5. 硬件调试 (麦克风)
5.1 查看可用设备
创建 check_mic.py:
python
import sounddevice as sd

# Enumerate audio devices and print only those that can capture input,
# so the user can pick the right MIC_DEVICE_ID for later scripts.
print("=== 可用麦克风设备 ===")
for index, info in enumerate(sd.query_devices()):
    channels = info["max_input_channels"]
    if channels > 0:
        print(f"{index}: {info['name']},输入通道: {channels}")
5.2 录音测试
创建 record_test.py 确保能正常捕获声音(手动播放 test.wav 确认):
python
import sounddevice as sd
from scipy.io.wavfile import write

RECORD_SECONDS = 5
MIC_DEVICE_ID = 2  # set to the device index reported by check_mic.py

# Use the microphone's native sample rate to avoid resampling issues.
SAMPLE_RATE = int(sd.query_devices(MIC_DEVICE_ID, kind="input")["default_samplerate"])

frames = int(RECORD_SECONDS * SAMPLE_RATE)
audio = sd.rec(
    frames,
    samplerate=SAMPLE_RATE,
    channels=1,
    dtype="int16",
    device=MIC_DEVICE_ID,
)
sd.wait()  # block until the whole recording has been captured

write("test.wav", SAMPLE_RATE, audio)
print("录音完成")
6. 语音识别测试 (Whisper)
创建 whisper_test.py:
python
from faster_whisper import WhisperModel

# "small" on CPU with int8 quantization balances speed and accuracy.
model = WhisperModel("small", device="cpu", compute_type="int8")

segments, _ = model.transcribe("test.wav", language="zh")
pieces = [segment.text for segment in segments]
text = "".join(pieces)
print(f"识别结果: {text.strip()}")
7. 踩坑总结
| 问题 | 现象 | 解决方案 |
|---|---|---|
| 环境冲突 | 显示 Anaconda 路径 | 执行 conda deactivate 后再激活 .venv |
| Whisper 崩溃 | 退出码 -1073741819 |
固定安装 faster-whisper==1.0.3 和 ctranslate2==4.4.0 |
| VAD 报错 | DLL 初始化失败 | 在代码中设置 vad_filter=False |
| 语音幻觉 | 识别出"点赞订阅" | 增加音量检测过滤,改用手动触发录音 |
8. 启动与后续优化
启动流程
- 确保 Ollama 已启动。
- 激活环境: .\.venv\Scripts\activate
- 运行主程序: python local_ai_chat_gui_v3.py
优化方向
- 流式输出: 实现逐字显示的打字机效果。
- 语音播报: 集成 pyttsx3 实现语音回复。
- RAG 知识库: 让 AI 读取本地文档。
- 打包: 使用 PyInstaller 打包为独立 .exe。
附录:核心主程序代码
请将以下内容保存为 local_ai_chat_gui_v3.py 并运行。
点击展开核心代码
python
import time
import queue
import threading
import numpy as np
import requests
import sounddevice as sd
from scipy.io.wavfile import write
from faster_whisper import WhisperModel
import customtkinter as ctk
# =========================
# Basic configuration
# =========================
MIC_DEVICE_ID = 2  # microphone device index (see check_mic.py output)
AUDIO_FILE = "input.wav"  # temporary WAV file written by each recording
WHISPER_MODEL_NAME = "small"  # faster-whisper model size to load
OLLAMA_URL = "http://localhost:11434/api/chat"  # local Ollama chat endpoint
OLLAMA_MODEL = "qwen2.5:7b"  # model tag pulled earlier with `ollama pull`
# =========================
# UI theme
# =========================
ctk.set_appearance_mode("Light")
ctk.set_default_color_theme("blue")
# =========================
# AI 引擎
# =========================
class LocalAIEngine:
    """Backend engine: microphone capture, Whisper transcription, Ollama chat.

    Holds the conversation history (index 0 is always the system prompt) and
    the recording state shared between the audio callback and the UI thread.
    """

    # Traditional-to-simplified fixes for characters Whisper often emits.
    _CHAR_FIXES = str.maketrans("開關溫調", "开关温调")

    # Stock phrases Whisper hallucinates on silence/noise (video outro lines).
    _HALLUCINATION_PHRASES = (
        "请不吝点赞",
        "订阅",
        "转发",
        "打赏支持",
        "明镜与点点",
        "感谢观看",
        "字幕由",
        "谢谢观看",
    )

    def __init__(self):
        self.whisper_model = None
        self.messages = [
            {
                "role": "system",
                "content": (
                    "你是一个运行在用户本地电脑上的中文 AI 助手。"
                    "你要像 ChatGPT 一样自然、清楚、实用地回答用户。"
                    "回答尽量具体,不要空话。"
                )
            }
        ]
        self.stream = None          # active sounddevice.InputStream while recording
        self.audio_chunks = []      # float32 blocks collected by the callback
        self.sample_rate = None
        self.is_recording = False
        self.record_start_time = None

    def load_whisper(self):
        """Load the faster-whisper model (CPU, int8 quantization)."""
        self.whisper_model = WhisperModel(
            WHISPER_MODEL_NAME, device="cpu", compute_type="int8"
        )

    def start_recording(self):
        """Open an input stream and start buffering microphone audio."""
        if self.is_recording:
            return
        info = sd.query_devices(MIC_DEVICE_ID, kind="input")
        self.sample_rate = int(info["default_samplerate"])
        self.audio_chunks = []
        self.record_start_time = time.time()

        def _on_audio(indata, frames, time_info, status):
            if status:
                print("录音状态:", status)
            self.audio_chunks.append(indata.copy())

        self.stream = sd.InputStream(
            samplerate=self.sample_rate,
            device=MIC_DEVICE_ID,
            channels=1,
            dtype="float32",
            callback=_on_audio,
        )
        self.stream.start()
        self.is_recording = True

    def stop_recording(self):
        """Stop capture, validate the take and write it to AUDIO_FILE.

        Returns a stats dict (duration, volumes, sample rate), or None if no
        recording was in progress. Raises RuntimeError for empty, too-short
        or too-quiet takes.
        """
        if not self.is_recording:
            return None
        self.is_recording = False

        stream, self.stream = self.stream, None
        if stream is not None:
            stream.stop()
            stream.close()

        if not self.audio_chunks:
            raise RuntimeError("没有采集到音频数据")

        audio = np.concatenate(self.audio_chunks, axis=0)
        duration = time.time() - self.record_start_time
        max_volume = float(np.max(np.abs(audio)))
        rms_volume = float(np.sqrt(np.mean(audio ** 2)))

        # Reject takes that are too short or too quiet — they are the main
        # trigger for Whisper hallucinations.
        if duration < 0.6:
            raise RuntimeError("录音时间太短,请至少说 1 秒以上")
        if max_volume < 0.008 or rms_volume < 0.0015:
            raise RuntimeError(
                f"录音声音太小,可能没有录到你说话。max={max_volume:.5f}, rms={rms_volume:.5f}"
            )

        # Convert float32 in [-1, 1] to int16 PCM for the WAV file.
        pcm = (np.clip(audio, -1.0, 1.0) * 32767).astype(np.int16)
        write(AUDIO_FILE, self.sample_rate, pcm)
        return {
            "duration": duration,
            "max_volume": max_volume,
            "rms_volume": rms_volume,
            "sample_rate": self.sample_rate,
        }

    def speech_to_text(self) -> str:
        """Transcribe AUDIO_FILE and return normalized Chinese text."""
        if self.whisper_model is None:
            raise RuntimeError("Whisper 模型还没加载完成")
        segments, _info = self.whisper_model.transcribe(
            AUDIO_FILE,
            language="zh",
            task="transcribe",
            beam_size=5,
            vad_filter=False,  # VAD DLL crashes on this setup — keep disabled
            condition_on_previous_text=False,
            initial_prompt="以下是普通话日常对话,可能包含编程、物联网、学习、工作、生活问题。"
        )
        transcript = "".join(segment.text for segment in segments)
        return self.normalize_text(transcript)

    def normalize_text(self, text: str) -> str:
        """Trim whitespace and map a few traditional characters to simplified."""
        return text.strip().translate(self._CHAR_FIXES)

    def is_bad_transcription(self, text: str) -> bool:
        """Return True when the transcription looks like noise or a hallucination."""
        if not text:
            return True
        if any(phrase in text for phrase in self._HALLUCINATION_PHRASES):
            return True
        # A single character is too unreliable to act on.
        return len(text.strip()) <= 1

    def chat(self, user_text: str) -> str:
        """Append the user turn, query Ollama and return the assistant reply."""
        self.messages.append({"role": "user", "content": user_text})

        # Keep the system prompt plus the last 10 exchanges so the prompt
        # (and latency) does not grow without bound.
        if len(self.messages) > 21:
            self.messages = [self.messages[0]] + self.messages[-20:]

        payload = {
            "model": OLLAMA_MODEL,
            "messages": self.messages,
            "stream": False,
            "options": {"temperature": 0.7},
        }
        response = requests.post(OLLAMA_URL, json=payload, timeout=180)
        response.raise_for_status()
        answer = response.json()["message"]["content"].strip()

        self.messages.append({"role": "assistant", "content": answer})
        return answer

    def clear_history(self):
        """Drop every turn except the system prompt."""
        del self.messages[1:]
# =========================
# GUI
# =========================
class LocalAIChatGUI(ctk.CTk):
    """Main window: chat log, text input and push-to-talk voice controls.

    All blocking work (Whisper loading, recording post-processing,
    transcription, Ollama requests) runs in daemon threads; results are
    marshalled back to the Tk thread through ``msg_queue`` and applied by
    ``process_queue``, which re-schedules itself every 100 ms.

    Fix: the original literals at the "启动中"/"加载完成"/"正在录音" messages
    contained unescaped inner double quotes (e.g. 点击"🎙 开始录音"), which is
    a SyntaxError — they are now escaped with backslashes.
    """

    def __init__(self):
        super().__init__()
        self.title("本地可视化 AI 助手")
        self.geometry("1050x760")
        self.minsize(900, 650)
        self.configure(fg_color="#F5F7FB")
        self.engine = LocalAIEngine()
        self.msg_queue = queue.Queue()  # worker threads -> UI thread
        self.is_busy = False            # True while a chat/voice job runs
        self.voice_ready = False        # True once Whisper has loaded
        self.recording = False          # True while the microphone is open
        self.build_ui()
        self.bind_shortcuts()
        self.start_queue_loop()
        # Load Whisper in the background so the window appears immediately.
        threading.Thread(target=self.load_model_worker, daemon=True).start()

    def build_ui(self):
        """Create the header, chat area, input box and button panel."""
        self.grid_columnconfigure(0, weight=1)
        self.grid_rowconfigure(1, weight=1)
        # --- header ---
        header = ctk.CTkFrame(self, corner_radius=0, fg_color="#FFFFFF")
        header.grid(row=0, column=0, sticky="ew")
        header.grid_columnconfigure(0, weight=1)
        title = ctk.CTkLabel(
            header,
            text="本地可视化 AI 助手",
            font=ctk.CTkFont(size=26, weight="bold"),
            text_color="#111827"
        )
        title.grid(row=0, column=0, padx=24, pady=(18, 4), sticky="w")
        subtitle = ctk.CTkLabel(
            header,
            text="Ollama 本地大模型 + Whisper 语音输入,可文字聊天,也可语音聊天",
            font=ctk.CTkFont(size=14),
            text_color="#6B7280"
        )
        subtitle.grid(row=1, column=0, padx=24, pady=(0, 16), sticky="w")
        self.status_label = ctk.CTkLabel(
            header,
            text="状态:正在加载 Whisper 模型...",
            font=ctk.CTkFont(size=14),
            text_color="#2563EB"
        )
        self.status_label.grid(row=0, column=1, padx=24, pady=18, sticky="e")
        # --- chat area ---
        body = ctk.CTkFrame(
            self,
            corner_radius=18,
            fg_color="#FFFFFF",
            border_width=1,
            border_color="#E5E7EB"
        )
        body.grid(row=1, column=0, padx=18, pady=14, sticky="nsew")
        body.grid_columnconfigure(0, weight=1)
        body.grid_rowconfigure(0, weight=1)
        self.chat_box = ctk.CTkTextbox(
            body,
            font=ctk.CTkFont(family="Microsoft YaHei", size=15),
            wrap="word",
            corner_radius=14,
            fg_color="#F9FAFB",
            text_color="#111827",
            border_width=1,
            border_color="#E5E7EB"
        )
        self.chat_box.grid(row=0, column=0, padx=16, pady=16, sticky="nsew")
        # Read-only; append_chat re-enables it briefly to insert text.
        self.chat_box.configure(state="disabled")
        # --- input area ---
        bottom = ctk.CTkFrame(
            self,
            corner_radius=18,
            fg_color="#FFFFFF",
            border_width=1,
            border_color="#E5E7EB"
        )
        bottom.grid(row=2, column=0, padx=18, pady=(0, 18), sticky="ew")
        bottom.grid_columnconfigure(0, weight=1)
        self.input_box = ctk.CTkTextbox(
            bottom,
            height=95,
            font=ctk.CTkFont(family="Microsoft YaHei", size=15),
            corner_radius=14,
            wrap="word",
            fg_color="#F9FAFB",
            text_color="#111827",
            border_width=1,
            border_color="#E5E7EB"
        )
        self.input_box.grid(row=0, column=0, padx=(16, 8), pady=16, sticky="ew")
        button_panel = ctk.CTkFrame(bottom, fg_color="transparent")
        button_panel.grid(row=0, column=1, padx=(8, 16), pady=16, sticky="ns")
        self.send_btn = ctk.CTkButton(
            button_panel,
            text="发送",
            width=140,
            height=38,
            font=ctk.CTkFont(size=15, weight="bold"),
            fg_color="#2563EB",
            hover_color="#1D4ED8",
            text_color="#FFFFFF",
            command=self.send_text_message
        )
        self.send_btn.pack(pady=(0, 8))
        # Disabled until the Whisper model finishes loading.
        self.voice_btn = ctk.CTkButton(
            button_panel,
            text="🎙 开始录音",
            width=140,
            height=42,
            font=ctk.CTkFont(size=15, weight="bold"),
            fg_color="#16A34A",
            hover_color="#15803D",
            text_color="#FFFFFF",
            command=self.toggle_recording,
            state="disabled"
        )
        self.voice_btn.pack(pady=8)
        self.clear_btn = ctk.CTkButton(
            button_panel,
            text="清空对话",
            width=140,
            height=34,
            font=ctk.CTkFont(size=14),
            fg_color="#E5E7EB",
            hover_color="#D1D5DB",
            text_color="#374151",
            command=self.clear_chat
        )
        self.clear_btn.pack(pady=(8, 0))
        hint = ctk.CTkLabel(
            bottom,
            text="提示:Ctrl + Enter 发送文字,Ctrl + M 开始/结束录音",
            font=ctk.CTkFont(size=12),
            text_color="#6B7280"
        )
        hint.grid(row=1, column=0, columnspan=2, padx=18, pady=(0, 12), sticky="w")
        self.append_chat("系统", "本地 AI 助手启动中。可以先打字聊天,语音模型加载完成后点击\"🎙 开始录音\"。")

    def bind_shortcuts(self):
        """Keyboard shortcuts: Ctrl+Enter to send, Ctrl+M to toggle recording."""
        self.bind("<Control-Return>", lambda event: self.send_text_message())
        self.bind("<Control-m>", lambda event: self.toggle_recording())
        self.bind("<Control-M>", lambda event: self.toggle_recording())

    def append_chat(self, sender, message):
        """Append a timestamped message to the read-only chat log."""
        self.chat_box.configure(state="normal")
        now = time.strftime("%H:%M:%S")
        self.chat_box.insert("end", f"\n[{now}] {sender}\n")
        self.chat_box.insert("end", message + "\n")
        self.chat_box.insert("end", "─" * 90 + "\n")
        self.chat_box.see("end")
        self.chat_box.configure(state="disabled")

    def set_status(self, text, color="#2563EB"):
        """Update the status label in the header."""
        self.status_label.configure(text=f"状态:{text}", text_color=color)

    def set_busy(self, busy: bool):
        """Enable/disable buttons while a background job is running."""
        self.is_busy = busy
        if busy:
            self.send_btn.configure(state="disabled")
            self.clear_btn.configure(state="disabled")
        else:
            self.send_btn.configure(state="normal")
            self.clear_btn.configure(state="normal")
        # The voice button additionally requires the model to be loaded.
        if not self.voice_ready:
            self.voice_btn.configure(state="disabled")
        else:
            self.voice_btn.configure(state="normal")

    def load_model_worker(self):
        """Background thread: load Whisper, then report readiness via the queue."""
        try:
            self.msg_queue.put(("status", "正在加载 Whisper 模型...", "#2563EB"))
            self.engine.load_whisper()
            self.voice_ready = True
            self.msg_queue.put(("status", "就绪,可以文字或语音对话", "#16A34A"))
            self.msg_queue.put(("chat", "系统", "Whisper 模型加载完成。现在可以点击\"🎙 开始录音\",说完后再点击\"⏹ 结束录音\"。"))
            self.msg_queue.put(("voice_ready", True))
        except Exception as e:
            # Voice is optional — text chat keeps working without Whisper.
            self.voice_ready = False
            self.msg_queue.put(("status", "语音模型加载失败,但文字对话可用", "#F59E0B"))
            self.msg_queue.put(("chat", "系统", f"语音模型加载失败:{e}\n文字聊天仍然可用。"))

    def send_text_message(self):
        """Read the input box and dispatch the text to a chat worker thread."""
        if self.is_busy or self.recording:
            return
        user_text = self.input_box.get("1.0", "end").strip()
        if not user_text:
            return
        self.input_box.delete("1.0", "end")
        self.append_chat("你", user_text)
        self.set_busy(True)
        self.set_status("AI 正在思考...", "#2563EB")
        threading.Thread(
            target=self.chat_worker,
            args=(user_text,),
            daemon=True
        ).start()

    def toggle_recording(self):
        """Start recording on first press, stop-and-transcribe on second."""
        if not self.voice_ready:
            self.append_chat("系统", "Whisper 模型还没加载完成,请稍等。")
            return
        # Allow stopping an active recording even while marked busy.
        if self.is_busy and not self.recording:
            return
        if not self.recording:
            self.start_recording()
        else:
            self.stop_recording_and_send()

    def start_recording(self):
        """Open the microphone and switch the UI into recording mode."""
        try:
            self.engine.start_recording()
            self.recording = True
            self.voice_btn.configure(
                text="⏹ 结束录音",
                fg_color="#DC2626",
                hover_color="#B91C1C"
            )
            self.send_btn.configure(state="disabled")
            self.clear_btn.configure(state="disabled")
            self.set_status("正在录音,说完后请点击\"结束录音\"", "#DC2626")
            self.append_chat("系统", "已开始录音,请说话。说完后点击\"⏹ 结束录音\"。")
        except Exception as e:
            self.recording = False
            self.append_chat("系统", f"开始录音失败:{e}")
            self.set_status("录音失败", "#DC2626")

    def stop_recording_and_send(self):
        """Restore the button, then process the recording on a worker thread."""
        if not self.recording:
            return
        self.recording = False
        self.set_busy(True)
        self.voice_btn.configure(
            text="🎙 开始录音",
            fg_color="#16A34A",
            hover_color="#15803D"
        )
        self.set_status("正在处理录音...", "#2563EB")
        threading.Thread(target=self.voice_worker_after_stop, daemon=True).start()

    def voice_worker_after_stop(self):
        """Background thread: finalize the take, transcribe, then chat."""
        try:
            info = self.engine.stop_recording()
            self.msg_queue.put((
                "chat",
                "系统",
                f"录音结束,时长 {info['duration']:.1f} 秒,音量 max={info['max_volume']:.4f}, rms={info['rms_volume']:.4f}。开始识别..."
            ))
            self.msg_queue.put(("status", "正在识别语音...", "#2563EB"))
            user_text = self.engine.speech_to_text()
            # Drop hallucinated/unreliable transcriptions instead of sending them.
            if self.engine.is_bad_transcription(user_text):
                self.msg_queue.put(("chat", "系统", f"识别结果不可靠,已丢弃:{user_text}"))
                self.msg_queue.put(("status", "识别不可靠,请重新录音", "#F59E0B"))
                return
            self.msg_queue.put(("chat", "你", user_text))
            self.msg_queue.put(("status", "AI 正在思考...", "#2563EB"))
            answer = self.engine.chat(user_text)
            self.msg_queue.put(("chat", "AI", answer))
            self.msg_queue.put(("status", "就绪", "#16A34A"))
        except requests.exceptions.ConnectionError:
            self.msg_queue.put(("chat", "系统", "连接 Ollama 失败,请确认 Ollama 已经启动。"))
            self.msg_queue.put(("status", "Ollama 未连接", "#DC2626"))
        except Exception as e:
            self.msg_queue.put(("chat", "系统", f"语音处理失败:{e}"))
            self.msg_queue.put(("status", "语音处理失败", "#DC2626"))
        finally:
            self.msg_queue.put(("busy", False))

    def chat_worker(self, user_text):
        """Background thread: run one text chat round against Ollama."""
        try:
            answer = self.engine.chat(user_text)
            self.msg_queue.put(("chat", "AI", answer))
            self.msg_queue.put(("status", "就绪", "#16A34A"))
        except requests.exceptions.ConnectionError:
            self.msg_queue.put(("chat", "系统", "连接 Ollama 失败,请确认 Ollama 已经启动。"))
            self.msg_queue.put(("status", "Ollama 未连接", "#DC2626"))
        except Exception as e:
            self.msg_queue.put(("chat", "系统", f"AI 回复失败:{e}"))
            self.msg_queue.put(("status", "执行失败", "#DC2626"))
        finally:
            self.msg_queue.put(("busy", False))

    def clear_chat(self):
        """Wipe the chat log and reset the engine's conversation history."""
        if self.recording:
            return
        self.chat_box.configure(state="normal")
        self.chat_box.delete("1.0", "end")
        self.chat_box.configure(state="disabled")
        self.engine.clear_history()
        self.append_chat("系统", "对话已清空。")
        self.set_status("就绪", "#16A34A")

    def start_queue_loop(self):
        """Kick off the periodic queue-draining loop on the Tk thread."""
        self.after(100, self.process_queue)

    def process_queue(self):
        """Drain pending worker messages and apply them to the UI."""
        try:
            while True:
                item = self.msg_queue.get_nowait()
                msg_type = item[0]
                if msg_type == "chat":
                    self.append_chat(item[1], item[2])
                elif msg_type == "status":
                    self.set_status(item[1], item[2])
                elif msg_type == "voice_ready":
                    self.voice_ready = item[1]
                    self.voice_btn.configure(state="normal" if item[1] else "disabled")
                elif msg_type == "busy":
                    self.set_busy(item[1])
        except queue.Empty:
            pass
        self.after(100, self.process_queue)
# Launch the GUI only when run as a script, not on import.
if __name__ == "__main__":
    LocalAIChatGUI().mainloop()