一、代码概述
此 Python 代码实现了一个语音交互系统,主要功能为监听唤醒词,在唤醒后接收用户语音问题,利用百度语音识别将语音转换为文本,再调用 DeepSeek API 获取智能回复,最后使用文本转语音功能将回复朗读出来。
二、环境依赖
在运行此代码前,需要安装以下 Python 库:
baidu-aip:用于调用百度语音识别服务。
pyttsx3:实现文本转语音功能。
python-dotenv(导入名为 dotenv):用于加载环境变量。
pyaudio:用于录制音频。
requests:用于发送 HTTP 请求。
(wave、json、datetime 为 Python 标准库,无需额外安装。)
可以使用以下命令进行安装:
bash
pip install baidu-aip pyttsx3 python-dotenv pyaudio requests
三、环境变量配置
需要在项目根目录下创建一个 .env
文件,并添加以下环境变量:
plaintext
BAIDU_APP_ID=your_baidu_app_id
BAIDU_API_KEY=your_baidu_api_key
BAIDU_SECRET_KEY=your_baidu_secret_key
DEEPSEEK_API_KEY=your_deepseek_api_key
请将 your_baidu_app_id
、your_baidu_api_key
、your_baidu_secret_key
替换为你在百度语音平台申请的应用 ID、API 密钥和 Secret 密钥,将 your_deepseek_api_key
替换为你在 DeepSeek 平台申请的 API 密钥。
四、代码模块详细解释
- 导入必要的库
python
from aip import AipSpeech
import pyttsx3
import os
from dotenv import load_dotenv
import wave
import pyaudio
import requests
import json
from datetime import datetime
AipSpeech
:用于调用百度语音识别服务。pyttsx3
:实现文本转语音功能。os
:用于操作环境变量。load_dotenv
:从.env
文件中加载环境变量。wave
:用于处理音频文件。pyaudio
:用于录制音频。requests
:用于发送 HTTP 请求。json
:用于处理 JSON 数据。datetime
:虽然代码中未实际使用,但可用于后续添加时间相关功能。
- 加载环境变量并初始化客户端
python
# Load environment variables from the .env file in the working directory.
load_dotenv()
APP_ID = os.getenv("BAIDU_APP_ID")          # Baidu application ID
API_KEY = os.getenv("BAIDU_API_KEY")        # Baidu API key
SECRET_KEY = os.getenv("BAIDU_SECRET_KEY")  # Baidu secret key
# Initialize Baidu Speech client
# NOTE(review): os.getenv returns None for any missing variable; AipSpeech
# will then fail at request time — confirm the .env file is configured.
client = AipSpeech(APP_ID, API_KEY, SECRET_KEY)
load_dotenv()
:从.env
文件中加载环境变量。AipSpeech
:使用百度提供的应用 ID、API 密钥和 Secret 密钥初始化百度语音识别客户端。
- 初始化文本转语音引擎
python
# Initialize text-to-speech
engine = pyttsx3.init()

# Prefer an installed Chinese female voice (name contains "female" or
# "woman"); fall back to the engine's default voice when none matches.
voices = engine.getProperty('voices')
_chosen = next(
    (v for v in voices
     if 'chinese' in v.name.lower()
     and ('female' in v.name.lower() or 'woman' in v.name.lower())),
    None,
)
if _chosen is not None:
    engine.setProperty('voice', _chosen.id)
pyttsx3.init()
:初始化文本转语音引擎。- 通过遍历可用语音,选择支持中文且为女性的语音。
- 调用 DeepSeek API 获取智能回复
python
def call_deepseek_api(prompt):
    """Ask the DeepSeek chat API for a reply to *prompt*.

    Returns the reply text on success, or a fixed Chinese apology string on
    any failure (network error, non-2xx status, malformed response body).
    """
    url = "https://api.deepseek.com/v1/chat/completions"
    headers = {
        "Content-Type": "application/json",
        "Authorization": f"Bearer {os.getenv('DEEPSEEK_API_KEY')}",
    }
    data = {
        "model": "deepseek-chat",
        "messages": [{"role": "user", "content": prompt}],
        "temperature": 0.7,
        "max_tokens": 200,
    }
    try:
        # timeout added: without it a stalled connection would hang the
        # voice-assistant loop forever.
        response = requests.post(url, headers=headers, json=data, timeout=30)
        response.raise_for_status()
        result = response.json()
        return result['choices'][0]['message']['content'].strip()
    except Exception as e:
        print(f"Error calling DeepSeek API: {str(e)}")
        return "抱歉,我暂时无法回答这个问题"
call_deepseek_api
函数:接收用户的问题prompt
,向 DeepSeek API 发送请求,获取智能回复。requests.post
:发送 HTTP POST 请求。response.raise_for_status()
:检查请求是否成功。- 若请求失败,捕获异常并返回错误提示信息。
- 录制音频
python
def record_audio(filename, record_seconds=5):
    """Record *record_seconds* of audio from the microphone into a WAV file.

    The capture format (16-bit PCM, 16000 Hz, mono) matches what Baidu ASR
    expects for the `asr` call elsewhere in this file.
    """
    CHUNK = 1024
    FORMAT = pyaudio.paInt16
    CHANNELS = 1
    RATE = 16000

    p = pyaudio.PyAudio()
    try:
        stream = p.open(format=FORMAT,
                        channels=CHANNELS,
                        rate=RATE,
                        input=True,
                        frames_per_buffer=CHUNK)
        try:
            print("开始录音...")
            frames = []
            for _ in range(int(RATE / CHUNK * record_seconds)):
                frames.append(stream.read(CHUNK))
            print("录音结束")
        finally:
            # Release the stream even if a read fails mid-recording.
            stream.stop_stream()
            stream.close()
        sample_width = p.get_sample_size(FORMAT)
    finally:
        p.terminate()

    # Context manager guarantees the WAV header is flushed and file closed.
    with wave.open(filename, 'wb') as wf:
        wf.setnchannels(CHANNELS)
        wf.setsampwidth(sample_width)
        wf.setframerate(RATE)
        wf.writeframes(b''.join(frames))
record_audio
函数:用于录制音频,将录制的音频保存为指定文件名的.wav
文件。pyaudio.PyAudio()
:初始化 PyAudio 对象。p.open
:打开音频输入流。wave.open
:打开音频文件进行写入操作。
- 监听唤醒词
python
def listen_for_wake_word(wake_word="小绿同学"):
    """Record a short clip and return True if its transcript contains *wake_word*.

    Records 3 seconds, sends the audio to Baidu ASR (dev_pid 1537 = Mandarin),
    and checks the first transcript candidate for the wake word.
    """
    temp_file = "temp.wav"
    record_audio(temp_file, 3)
    try:
        with open(temp_file, 'rb') as f:
            audio_data = f.read()
    finally:
        # Don't leave the scratch recording behind on every loop iteration.
        try:
            os.remove(temp_file)
        except OSError:
            pass
    result = client.asr(audio_data, 'wav', 16000, {
        'dev_pid': 1537  # Mandarin
    })
    # Guard against a success code arriving with an empty/missing result list.
    if result.get('err_no') == 0 and result.get('result'):
        return wake_word in result['result'][0]
    return False
listen_for_wake_word
函数:录制 3 秒音频,使用百度语音识别将音频转换为文本,检查文本中是否包含唤醒词。client.asr
:调用百度语音识别服务。
- 获取回复
python
def get_response(prompt):
    """Delegate to the DeepSeek API and return its reply text."""
    return call_deepseek_api(prompt)
get_response
函数:调用call_deepseek_api
函数获取智能回复。
- 文本转语音
python
def speak(text):
    """Read *text* aloud via the module-level pyttsx3 engine (blocks until done)."""
    engine.say(text)
    engine.runAndWait()
speak
函数:使用pyttsx3
引擎将文本转换为语音并朗读出来。
- 主函数
python
def main():
    """Assistant loop: wait for the wake word, then answer one spoken question.

    Each pass: listen for the wake word, record a 5-second question, run
    Baidu ASR on it, fetch a DeepSeek reply, and speak the reply aloud.
    """
    print("小绿同学已启动,等待唤醒...")
    while True:
        if not listen_for_wake_word():
            continue
        print("唤醒成功!请说出你的问题...")
        speak("您好,需要什么帮助")
        temp_file = "question.wav"
        record_audio(temp_file, 5)
        with open(temp_file, 'rb') as f:
            audio_data = f.read()
        result = client.asr(audio_data, 'wav', 16000, {
            'dev_pid': 1537  # Mandarin
        })
        if result.get('err_no') != 0:
            print("抱歉,我没有听清楚")
            speak("抱歉,我没有听清楚")
            continue
        user_input = result['result'][0]
        print(f"你说: {user_input}")
        response = get_response(user_input)
        print(f"小绿同学: {response}")
        speak(response)


if __name__ == "__main__":
    main()
main
函数:程序的入口点,不断监听唤醒词,唤醒后录制用户问题,进行语音识别,调用 DeepSeek API 获取回复,并将回复朗读出来。
五、注意事项
- 确保
.env
文件中的环境变量配置正确。 - 若网络连接不稳定或 DeepSeek API 服务不可用,可能会导致获取回复失败。
- 录音设备需要正常工作,否则可能无法录制音频。
完整代码
python
from aip import AipSpeech
import pyttsx3
import os
from dotenv import load_dotenv
import wave
import pyaudio
import requests
import json
from datetime import datetime
# Load environment variables from the .env file in the working directory.
load_dotenv()
APP_ID = os.getenv("BAIDU_APP_ID")          # Baidu application ID
API_KEY = os.getenv("BAIDU_API_KEY")        # Baidu API key
SECRET_KEY = os.getenv("BAIDU_SECRET_KEY")  # Baidu secret key
# Initialize Baidu Speech client
# NOTE(review): os.getenv returns None for any missing variable; AipSpeech
# will then fail at request time — confirm the .env file is configured.
client = AipSpeech(APP_ID, API_KEY, SECRET_KEY)
# Initialize text-to-speech
engine = pyttsx3.init()

# Prefer an installed Chinese female voice (name contains "female" or
# "woman"); fall back to the engine's default voice when none matches.
voices = engine.getProperty('voices')
_chosen = next(
    (v for v in voices
     if 'chinese' in v.name.lower()
     and ('female' in v.name.lower() or 'woman' in v.name.lower())),
    None,
)
if _chosen is not None:
    engine.setProperty('voice', _chosen.id)
def call_deepseek_api(prompt):
    """Ask the DeepSeek chat API for a reply to *prompt*.

    Returns the reply text on success, or a fixed Chinese apology string on
    any failure (network error, non-2xx status, malformed response body).
    """
    url = "https://api.deepseek.com/v1/chat/completions"
    headers = {
        "Content-Type": "application/json",
        "Authorization": f"Bearer {os.getenv('DEEPSEEK_API_KEY')}",
    }
    data = {
        "model": "deepseek-chat",
        "messages": [{"role": "user", "content": prompt}],
        "temperature": 0.7,
        "max_tokens": 200,
    }
    try:
        # timeout added: without it a stalled connection would hang the
        # voice-assistant loop forever.
        response = requests.post(url, headers=headers, json=data, timeout=30)
        response.raise_for_status()
        result = response.json()
        return result['choices'][0]['message']['content'].strip()
    except Exception as e:
        print(f"Error calling DeepSeek API: {str(e)}")
        return "抱歉,我暂时无法回答这个问题"
def record_audio(filename, record_seconds=5):
    """Record *record_seconds* of audio from the microphone into a WAV file.

    The capture format (16-bit PCM, 16000 Hz, mono) matches what Baidu ASR
    expects for the `asr` call elsewhere in this file.
    """
    CHUNK = 1024
    FORMAT = pyaudio.paInt16
    CHANNELS = 1
    RATE = 16000

    p = pyaudio.PyAudio()
    try:
        stream = p.open(format=FORMAT,
                        channels=CHANNELS,
                        rate=RATE,
                        input=True,
                        frames_per_buffer=CHUNK)
        try:
            print("开始录音...")
            frames = []
            for _ in range(int(RATE / CHUNK * record_seconds)):
                frames.append(stream.read(CHUNK))
            print("录音结束")
        finally:
            # Release the stream even if a read fails mid-recording.
            stream.stop_stream()
            stream.close()
        sample_width = p.get_sample_size(FORMAT)
    finally:
        p.terminate()

    # Context manager guarantees the WAV header is flushed and file closed.
    with wave.open(filename, 'wb') as wf:
        wf.setnchannels(CHANNELS)
        wf.setsampwidth(sample_width)
        wf.setframerate(RATE)
        wf.writeframes(b''.join(frames))
def listen_for_wake_word(wake_word="小绿"):
    """Record a short clip and return True if its transcript contains *wake_word*.

    Records 3 seconds, sends the audio to Baidu ASR (dev_pid 1537 = Mandarin),
    and checks the first transcript candidate for the wake word.
    """
    temp_file = "temp.wav"
    record_audio(temp_file, 3)
    try:
        with open(temp_file, 'rb') as f:
            audio_data = f.read()
    finally:
        # Don't leave the scratch recording behind on every loop iteration.
        try:
            os.remove(temp_file)
        except OSError:
            pass
    result = client.asr(audio_data, 'wav', 16000, {
        'dev_pid': 1537  # Mandarin
    })
    # Guard against a success code arriving with an empty/missing result list.
    if result.get('err_no') == 0 and result.get('result'):
        return wake_word in result['result'][0]
    return False
def get_response(prompt):
    """Delegate to the DeepSeek API and return its reply text."""
    return call_deepseek_api(prompt)
def speak(text):
    """Read *text* aloud via the module-level pyttsx3 engine (blocks until done)."""
    engine.say(text)
    engine.runAndWait()
def main():
    """Assistant loop: wait for the wake word, then answer one spoken question.

    Each pass: listen for the wake word, record a 5-second question, run
    Baidu ASR on it, fetch a DeepSeek reply, and speak the reply aloud.
    """
    print("小绿已启动,等待唤醒...")
    while True:
        if not listen_for_wake_word():
            continue
        print("唤醒成功!请说出你的问题...")
        speak("您好,需要什么帮助")
        temp_file = "question.wav"
        record_audio(temp_file, 5)
        with open(temp_file, 'rb') as f:
            audio_data = f.read()
        result = client.asr(audio_data, 'wav', 16000, {
            'dev_pid': 1537  # Mandarin
        })
        if result.get('err_no') != 0:
            print("抱歉,我没有听清楚")
            speak("抱歉,我没有听清楚")
            continue
        user_input = result['result'][0]
        print(f"你说: {user_input}")
        response = get_response(user_input)
        print(f"小绿: {response}")
        speak(response)


if __name__ == "__main__":
    main()
以下为修改后的版本:改用百度语音合成(TTS)接口生成语音,并通过 pygame 播放,朗读效果比 pyttsx3 更自然。
python
from aip import AipSpeech
import os
from dotenv import load_dotenv
import wave
import pyaudio
import requests
import json
from datetime import datetime
import pygame # 用于播放音频
import tempfile
# Load environment variables from the .env file in the working directory.
load_dotenv()
APP_ID = os.getenv("BAIDU_APP_ID")          # Baidu application ID
API_KEY = os.getenv("BAIDU_API_KEY")        # Baidu API key
SECRET_KEY = os.getenv("BAIDU_SECRET_KEY")  # Baidu secret key
# Initialize Baidu Speech client
# NOTE(review): os.getenv returns None for any missing variable; AipSpeech
# will then fail at request time — confirm the .env file is configured.
client = AipSpeech(APP_ID, API_KEY, SECRET_KEY)
def call_deepseek_api(prompt):
    """Ask the DeepSeek chat API for a reply to *prompt*.

    Returns the reply text on success, or a fixed Chinese apology string on
    any failure (network error, non-2xx status, malformed response body).
    """
    url = "https://api.deepseek.com/v1/chat/completions"
    headers = {
        "Content-Type": "application/json",
        "Authorization": f"Bearer {os.getenv('DEEPSEEK_API_KEY')}",
    }
    data = {
        "model": "deepseek-chat",
        "messages": [{"role": "user", "content": prompt}],
        "temperature": 0.7,
        "max_tokens": 200,
    }
    try:
        # timeout added: without it a stalled connection would hang the
        # voice-assistant loop forever.
        response = requests.post(url, headers=headers, json=data, timeout=30)
        response.raise_for_status()
        result = response.json()
        return result['choices'][0]['message']['content'].strip()
    except Exception as e:
        print(f"Error calling DeepSeek API: {str(e)}")
        return "抱歉,我暂时无法回答这个问题"
def record_audio(filename, record_seconds=5):
    """Record *record_seconds* of audio from the microphone into a WAV file.

    The capture format (16-bit PCM, 16000 Hz, mono) matches what Baidu ASR
    expects for the `asr` call elsewhere in this file.
    """
    CHUNK = 1024
    FORMAT = pyaudio.paInt16
    CHANNELS = 1
    RATE = 16000

    p = pyaudio.PyAudio()
    try:
        stream = p.open(format=FORMAT,
                        channels=CHANNELS,
                        rate=RATE,
                        input=True,
                        frames_per_buffer=CHUNK)
        try:
            print("开始录音...")
            frames = []
            for _ in range(int(RATE / CHUNK * record_seconds)):
                frames.append(stream.read(CHUNK))
            print("录音结束")
        finally:
            # Release the stream even if a read fails mid-recording.
            stream.stop_stream()
            stream.close()
        sample_width = p.get_sample_size(FORMAT)
    finally:
        p.terminate()

    # Context manager guarantees the WAV header is flushed and file closed.
    with wave.open(filename, 'wb') as wf:
        wf.setnchannels(CHANNELS)
        wf.setsampwidth(sample_width)
        wf.setframerate(RATE)
        wf.writeframes(b''.join(frames))
def listen_for_wake_word(wake_word="小绿"):
    """Record a short clip and return True if its transcript contains *wake_word*.

    Records 3 seconds, sends the audio to Baidu ASR (dev_pid 1537 = Mandarin),
    and checks the first transcript candidate for the wake word.
    """
    temp_file = "temp.wav"
    record_audio(temp_file, 3)
    try:
        with open(temp_file, 'rb') as f:
            audio_data = f.read()
    finally:
        # Don't leave the scratch recording behind on every loop iteration.
        try:
            os.remove(temp_file)
        except OSError:
            pass
    result = client.asr(audio_data, 'wav', 16000, {
        'dev_pid': 1537  # Mandarin
    })
    # Guard against a success code arriving with an empty/missing result list.
    if result.get('err_no') == 0 and result.get('result'):
        return wake_word in result['result'][0]
    return False
def get_response(prompt):
    """Delegate to the DeepSeek API and return its reply text."""
    return call_deepseek_api(prompt)
def speak(text):
    """Synthesize *text* with Baidu TTS and play it synchronously via pygame.

    On failure Baidu's `synthesis` returns a dict describing the error, which
    is printed; on success it returns MP3 bytes, which are written to a
    temporary file, played to completion, then cleaned up.
    """
    result = client.synthesis(text, 'zh', 1, {
        'vol': 5,  # volume, 0-15 (5 = medium)
        'spd': 4,  # speed, 0-9 (5 = medium)
        'pit': 5,  # pitch, 0-9 (5 = medium)
        'per': 4   # voice: 0 female, 1 male, 3 "DuXiaoyao", 4 "DuYaya"
    })
    if isinstance(result, dict):
        # A dict means the API reported an error instead of audio bytes.
        print(f"语音合成失败: {result}")
        return

    with tempfile.NamedTemporaryFile(suffix=".mp3", delete=False) as f:
        f.write(result)
        temp_path = f.name
    try:
        pygame.mixer.init()
        try:
            pygame.mixer.music.load(temp_path)
            pygame.mixer.music.play()
            clock = pygame.time.Clock()
            # Poll until playback finishes; tick(10) keeps CPU usage low.
            while pygame.mixer.music.get_busy():
                clock.tick(10)
        finally:
            # Release the mixer even if playback raised, so the next call can
            # re-init cleanly and the file can be deleted (Windows locks it).
            pygame.mixer.music.stop()
            pygame.mixer.quit()
    finally:
        # Remove the scratch MP3 regardless of playback outcome (the original
        # leaked it on any playback exception).
        try:
            os.remove(temp_path)
        except OSError:
            pass
def main():
    """Assistant loop: wait for the wake word, then answer one spoken question.

    Each pass: listen for the wake word, record a 5-second question, run
    Baidu ASR on it, fetch a DeepSeek reply, and speak the reply aloud.
    """
    print("小绿已启动,等待唤醒...")
    while True:
        if not listen_for_wake_word():
            continue
        print("您好,需要什么帮助....")
        speak("您好,需要什么帮助")
        temp_file = "question.wav"
        record_audio(temp_file, 5)
        with open(temp_file, 'rb') as f:
            audio_data = f.read()
        result = client.asr(audio_data, 'wav', 16000, {
            'dev_pid': 1537  # Mandarin
        })
        if result.get('err_no') != 0:
            print("抱歉,我没有听清楚")
            speak("抱歉,我没有听清楚")
            continue
        user_input = result['result'][0]
        print(f"你说: {user_input}")
        response = get_response(user_input)
        print(f"小绿: {response}")
        speak(response)


if __name__ == "__main__":
    main()