Speech recognition, translation and speech synthesis.
python
# Speech recognition, translation, and speech synthesis
#pip install SpeechRecognition gtts googletrans==3.1.0a0 pyaudio sounddevice soundfile keyboard
import argparse
import logging
import os
import subprocess
import tempfile
import threading
import time
from datetime import datetime

import keyboard
import numpy as np
import pyaudio
import requests
import sounddevice as sd
import soundfile as sf
import speech_recognition as sr
from googletrans import Translator
from gtts import gTTS
class SpeechTranslationApp:
    """Capture speech, transcribe it, translate the text, and speak the result.

    One pass of the pipeline is: capture audio (microphone or a fixed-length
    ``sounddevice`` recording) -> Google speech recognition -> optional upload
    of the audio to ``ASR_API_URL`` -> optional transcript file -> translation
    -> text-to-speech playback. :meth:`run` repeats that pass until Esc is
    pressed or the process receives Ctrl+C.
    """

    def __init__(self, source_language='en-US', target_language='zh-CN', input_type='microphone', save_transcript=False):
        """
        Initialize the Speech Translation Application
        :param source_language: Language of input speech
        :param target_language: Language to translate to
        :param input_type: Input method (microphone or system_sound)
        :param save_transcript: Whether to save speech recognition results
        """
        # Configure logging (basicConfig is a no-op when the root logger
        # already has handlers, so repeated construction is harmless).
        logging.basicConfig(
            level=logging.INFO,
            format='%(asctime)s - %(levelname)s: %(message)s',
            datefmt='%Y-%m-%d %H:%M:%S'
        )
        self.logger = logging.getLogger(__name__)
        # Speech recognition setup
        self.recognizer = sr.Recognizer()
        self.translator = Translator()
        # Configuration
        self.source_language = source_language
        self.target_language = target_language
        self.input_type = input_type
        self.save_transcript = save_transcript
        # Audio parameters
        self.CHUNK = 1024
        self.FORMAT = pyaudio.paInt16
        self.CHANNELS = 1
        self.RATE = 44100
        self.RECORD_SECONDS = 5
        # API endpoint (replace with the actual API URL from the GitHub project)
        self.ASR_API_URL = "https://api.example.com/asr"  # Update with actual API endpoint
        # Upper bound (seconds) on the ASR upload so a stalled server cannot
        # freeze the recognition loop.
        self.ASR_API_TIMEOUT = 10

    def recognize_speech(self):
        """
        Recognize speech from either microphone or system sound
        :return: Dictionary with keys ``success`` (bool), ``text`` (recognized
            text or None) and ``error`` (message or None). Failures are
            reported through the dictionary, not raised.
        """
        result = {
            'success': False,
            'text': None,
            'error': None
        }
        try:
            audio = self._capture_audio()
            # Recognize speech
            text = self.recognizer.recognize_google(audio, language=self.source_language)
            # Send to ASR API (best effort; it logs its own failures)
            self.send_to_asr_api(audio)
            result['success'] = True
            result['text'] = text
            self.logger.info("Speech Recognition Result: %s", text)
            # Save transcript if enabled
            if self.save_transcript:
                self.save_speech_to_file(text)
        except sr.WaitTimeoutError:
            result['error'] = "Listening timed out. No speech detected."
            self.logger.warning(result['error'])
        except sr.UnknownValueError:
            result['error'] = "Could not understand the audio"
            self.logger.warning(result['error'])
        except sr.RequestError as e:
            result['error'] = f"Could not request results from Google Speech Recognition service; {e}"
            self.logger.error(result['error'])
        except Exception as e:
            # Boundary handler: an invalid input type, audio-device errors and
            # similar all end up here so the main loop keeps running.
            result['error'] = f"An unexpected error occurred: {e}"
            self.logger.error(result['error'])
        return result

    def _capture_audio(self):
        """Return audio captured from the source selected by ``input_type``.

        :raises ValueError: if ``input_type`` is neither ``microphone`` nor
            ``system_sound``
        """
        if self.input_type == "microphone":
            with sr.Microphone() as source:
                self.logger.info("Listening (Language: %s)...", self.source_language)
                return self.recognizer.listen(source, timeout=5, phrase_time_limit=5)
        if self.input_type == "system_sound":
            return self._record_system_sound()
        raise ValueError("Invalid input type. Choose 'microphone' or 'system_sound'")

    def _record_system_sound(self):
        """Record ``RECORD_SECONDS`` of audio with sounddevice and load it for recognition.

        ``sd.rec`` is called without a device argument, so it records from
        sounddevice's default input device; whether that device carries the
        system's output depends on the host audio configuration.
        """
        self.logger.info("Capturing system sound (Language: %s)...", self.source_language)
        recording = sd.rec(
            int(self.RECORD_SECONDS * self.RATE),
            samplerate=self.RATE,
            channels=self.CHANNELS
        )
        sd.wait()
        # A unique temp file avoids clobbering files in the working directory
        # and collisions between concurrent runs.
        fd, temp_audio_file = tempfile.mkstemp(suffix='.wav')
        os.close(fd)  # soundfile reopens the path itself
        try:
            sf.write(temp_audio_file, recording, self.RATE)
            # Convert to speech_recognition format
            with sr.AudioFile(temp_audio_file) as source:
                return self.recognizer.record(source)
        finally:
            # Clean up even when writing or reading the file fails
            os.remove(temp_audio_file)

    def send_to_asr_api(self, audio):
        """
        Send audio to ASR API for processing
        :param audio: Captured audio object exposing ``get_wav_data()``

        Best effort: every failure is logged and swallowed so that an
        unreachable endpoint never fails the recognition pass.
        """
        if not self.ASR_API_URL:
            # No endpoint configured: nothing to upload.
            return
        try:
            # Convert audio to a format suitable for API upload
            audio_data = audio.get_wav_data()
            # Prepare files for upload
            files = {'audio': ('speech.wav', audio_data, 'audio/wav')}
            # Without a timeout requests waits indefinitely on a stalled server.
            response = requests.post(self.ASR_API_URL, files=files, timeout=self.ASR_API_TIMEOUT)
            if response.status_code == 200:
                self.logger.info("Successfully sent audio to ASR API")
            else:
                self.logger.warning("ASR API request failed with status %s", response.status_code)
        except Exception as e:
            self.logger.error("Error sending audio to ASR API: %s", e)

    def save_speech_to_file(self, text):
        """
        Save recognized speech to a text file
        :param text: Recognized text

        The file is written to ``transcripts/speech_transcript_<timestamp>.txt``
        relative to the current working directory. Errors are logged, not raised.
        """
        try:
            # Create transcripts directory if it doesn't exist
            os.makedirs('transcripts', exist_ok=True)
            # Generate filename with timestamp
            timestamp = datetime.now().strftime("%Y%m%d_%H%M%S")
            filename = f"transcripts/speech_transcript_{timestamp}.txt"
            with open(filename, 'w', encoding='utf-8') as f:
                f.write(text)
            self.logger.info("Transcript saved to %s", filename)
        except Exception as e:
            self.logger.error("Error saving transcript: %s", e)

    def translate_text(self, text):
        """
        Translate text to target language
        :param text: Text to translate
        :return: Translated text, or None when ``text`` is empty or the
            translation fails
        """
        if not text:
            # Nothing to translate; skip the remote call.
            return None
        try:
            translation = self.translator.translate(text, dest=self.target_language)
            self.logger.info("Translation Result: %s", translation.text)
            return translation.text
        except Exception as e:
            self.logger.error("Translation error: %s", e)
            return None

    def speak_text(self, text):
        """
        Convert text to speech and play
        :param text: Text to convert to speech; empty text is ignored

        Errors are logged, not raised. The temporary MP3 is removed afterwards
        whether or not synthesis and playback succeeded.
        """
        if not text:
            return
        output_file = "translation_output.mp3"
        try:
            tts = gTTS(text=text, lang=self.target_language)
            tts.save(output_file)
            # Cross-platform audio playback
            if os.name == 'nt':  # Windows
                os.startfile(output_file)
                # startfile returns immediately; give the player time to open
                # the file before it is deleted below.
                time.sleep(2)
            elif os.name == 'posix':  # macOS and Linux
                # Argument list (no shell); blocks until mpg123 finishes.
                subprocess.run(["mpg123", output_file], check=False)
        except Exception as e:
            self.logger.error("Text-to-speech error: %s", e)
        finally:
            # Remove temporary file after playback
            try:
                if os.path.exists(output_file):
                    os.remove(output_file)
            except OSError as e:
                # e.g. the file is still held open by the player on Windows
                self.logger.warning("Could not remove %s: %s", output_file, e)

    def run(self):
        """
        Main application loop

        Repeats recognize -> translate -> speak until Esc is pressed or
        Ctrl+C is received.
        """
        self.logger.info("Speech Translation App Started")
        print("Press 'Esc' to exit the application")
        # Each pass blocks for several seconds, so polling the key state once
        # per pass would miss a normal key tap. A hotkey callback records the
        # press whenever it happens.
        stop_requested = threading.Event()
        hotkey = None
        try:
            hotkey = keyboard.add_hotkey('esc', stop_requested.set)
            while not stop_requested.is_set():
                # Recognize speech
                recognition_result = self.recognize_speech()
                if recognition_result['success']:
                    # Translate recognized text
                    translated_text = self.translate_text(recognition_result['text'])
                    if translated_text:
                        # Speak translated text
                        self.speak_text(translated_text)
                else:
                    # Log or handle unsuccessful recognition
                    if recognition_result['error']:
                        print(f"Recognition error: {recognition_result['error']}")
                # Pause to prevent excessive processing; returns early on Esc
                stop_requested.wait(2)
        except KeyboardInterrupt:
            self.logger.info("Application stopped by user")
        finally:
            if hotkey is not None:
                keyboard.remove_hotkey(hotkey)
            print("Speech Translation App Closed")
def main():
    """
    Command-line entry point.

    Reads the source/target language codes, the input method and the
    transcript flag from the command line, builds a SpeechTranslationApp
    from them and runs it.
    """
    cli = argparse.ArgumentParser(description='Speech Translation Application')
    # (flag, add_argument keyword arguments), registered in this order.
    option_specs = (
        ('--source_lang', {'default': 'en-US', 'help': 'Source language code'}),
        ('--target_lang', {'default': 'zh-CN', 'help': 'Target language code'}),
        ('--input', {
            'choices': ['microphone', 'system_sound'],
            'default': 'microphone',
            'help': 'Input method',
        }),
        ('--save_transcript', {'action': 'store_true', 'help': 'Save speech transcripts'}),
    )
    for flag, spec in option_specs:
        cli.add_argument(flag, **spec)
    options = cli.parse_args()

    SpeechTranslationApp(
        source_language=options.source_lang,
        target_language=options.target_lang,
        input_type=options.input,
        save_transcript=options.save_transcript,
    ).run()


# Run the CLI only when executed as a script, not when imported.
if __name__ == "__main__":
    main()