faster_whisper语音识别
检测可用设备:list_available_devices()函数
我这边使用的是带麦克风的 USB 摄像头,它在设备列表中的索引是 8,所以 DEV_index = 8
1 使用 pyaudio 打开音频设备
2 从音频设备读取数据,传递给 faster_whisper 识别
按键:r 开始录制,s 停止,q 退出
python
# from faster_whisper import WhisperModel
# model = WhisperModel("large-v3")
# audio_path= "mlk.flac"
# segments, info = model.transcribe(audio_path)
# for segment in segments:
# print("[%.2fs -> %.2fs] %s" % (segment.start, segment.end, segment.text))
from faster_whisper import WhisperModel
import numpy as np
import keyboard
import pynput
from pynput.keyboard import Controller, Listener,Key,KeyCode
import time
import pyaudio
import wave
def list_available_devices():
    """Print the index and name of every audio device that can capture input.

    A temporary ``pyaudio.PyAudio`` instance is created for the enumeration
    and is always terminated, even if querying a device raises.
    """
    print("Available input devices:")
    p = pyaudio.PyAudio()
    try:
        for i in range(p.get_device_count()):
            device_info = p.get_device_info_by_index(i)
            # Only devices with at least one input channel can record.
            if device_info['maxInputChannels'] > 0:
                print(f"{i}: {device_info['name']}")
    finally:
        # Release the PyAudio instance on every exit path.
        p.terminate()
# Print the available input devices when this module is loaded, so the
# correct device index can be read off the console.
list_available_devices()
# Sample output kept for reference:
# Available input devices:
# 5: USB Audio: #1 (hw:2,1)
# 6: USB Audio: #2 (hw:2,2)
# 8: aoni webcam A20: USB Audio (hw:3,0)
# 9: pulse
# 10: default
# Index of the input device to record from, as printed by
# list_available_devices(); 8 is the webcam microphone in the sample above.
DEV_index = 8 # Replace with your actual device index
class VoiceRecorder:
    """Record audio from a PyAudio input device and transcribe it with faster-whisper.

    The instance owns one ``pyaudio.PyAudio`` handle (``self.p``) and one
    ``WhisperModel("large-v3")`` (``self.model``). Call :meth:`close` when
    done to release the PyAudio handle.
    """

    def __init__(self, channels=1, rate=16000, format=pyaudio.paInt16):
        """Create the PyAudio handle, load the model and store capture settings.

        Args:
            channels: Number of input channels to capture.
            rate: Sample rate in Hz used when opening the input stream.
            format: PyAudio sample format used when opening the input stream.
                NOTE(review): ``transcribe_audio`` always decodes the bytes as
                int16, so other formats are presumably unsupported - confirm.
        """
        self.p = pyaudio.PyAudio()
        self.model = WhisperModel("large-v3")
        self.CHANNELS = channels
        self.RATE = rate
        self.FORMAT = format

    def record(self, seconds=5):
        """Record audio for the given number of seconds.

        The input stream is opened on the module-level ``DEV_index`` device.

        Args:
            seconds: Recording duration in seconds.

        Returns:
            The raw captured bytes, or ``None`` if opening or reading the
            stream failed (the error is printed).
        """
        CHUNK = 1024
        # Stays None when open() itself fails, so cleanup can tell whether
        # there is a stream to close.
        stream = None
        try:
            stream = self.p.open(format=self.FORMAT,
                                 channels=self.CHANNELS,
                                 rate=self.RATE,
                                 input=True,
                                 input_device_index=DEV_index,
                                 frames_per_buffer=CHUNK)
            print("开始录音...")
            # Number of CHUNK-sized reads that fit into the requested duration.
            chunk_count = int(self.RATE / CHUNK * seconds)
            frames = [stream.read(CHUNK) for _ in range(chunk_count)]
            print("录音结束.")
        except Exception as e:
            print(f"录音时发生错误:{e}")
            return None
        finally:
            # Only touch the stream if it was actually opened; otherwise the
            # cleanup would raise and hide the original error.
            if stream is not None:
                stream.stop_stream()
                stream.close()
        return b''.join(frames)

    def transcribe_audio(self, audio_data):
        """Convert recorded audio bytes to text.

        Args:
            audio_data: Raw int16 PCM bytes, interleaved by channel when more
                than one channel was captured.

        Returns:
            A list with the text of each recognised segment, or ``None`` if
            decoding or transcription failed (the error is printed).
        """
        try:
            audio_np = np.frombuffer(audio_data, dtype=np.int16)
            if self.CHANNELS > 1:
                # Down-mix interleaved multi-channel samples to mono.
                audio_np = audio_np.reshape((-1, self.CHANNELS)).mean(axis=1)
            # Scale int16 samples into the [-1.0, 1.0) float32 range.
            # NOTE(review): the model presumably expects 16 kHz audio here -
            # confirm before recording with a different rate.
            audio_normalized = np.float32(audio_np) / 32768.0
            segments, _ = self.model.transcribe(audio_normalized, language='zh', beam_size=5)
            # The segments are consumed inside the try block so that errors
            # raised while iterating them are caught as well.
            return [segment.text for segment in segments]
        except Exception as e:
            print(f"转录音频时发生错误:{e}")
            return None

    def close(self):
        """Terminate the PyAudio handle owned by this recorder."""
        self.p.terminate()
def main():
    """Create the recorder, start the keyboard listener and wait for it to stop.

    ``recorder`` and ``listener`` are stored as module globals because the
    ``on_press`` callback looks them up by name.
    """
    global recorder, listener
    recorder = VoiceRecorder()
    listener = Listener(on_press=on_press)
    listener.start()
    # Block the main thread until the listener is stopped.
    listener.join()
def on_press(key: "Key | KeyCode | None"):
    """Handle one key press delivered by the keyboard listener.

    * ``r`` records through the global ``recorder`` and prints every
      transcribed segment.
    * ``s`` only prints a message; it does not interrupt anything.
    * ``q`` stops the global ``listener`` and closes the recorder.

    Any other key, including special keys, is ignored.

    Args:
        key: The pressed key as passed in by the listener.
    """
    # Debug aid: shows which key type was delivered.
    print(type(key))
    # Special keys (e.g. shift, enter) carry no ``char`` attribute; reading it
    # directly would raise AttributeError inside the listener callback.
    char = getattr(key, "char", None)
    if char == 'r':
        print("开始录音...")
        audio_data = recorder.record()
        if audio_data is not None:
            transcripts = recorder.transcribe_audio(audio_data)
            # transcribe_audio returns None on failure; treat that as "no text".
            for text in transcripts or []:
                print(text)
        print("录音结束.")
    elif char == 's':
        print("停止录音.")
    elif char == 'q':
        print("退出程序.")
        listener.stop()
        recorder.close()
# Start the interactive recorder only when this file is run as a script.
if __name__ == "__main__":
    main()