ubuntu 下调用系统麦克风,以及faster-whisper-medium 处理音频转写文本

python 复制代码
import asyncio
import pyaudio
import wave
import gc
import io
import zhconv
import torch
import copy
from faster_whisper import WhisperModel


class MicroPhoneTransWords(object):
    def __init__(self):
        self.format = pyaudio.paInt16
        self.channels = 1
        self.rate = 44100  # 采样率
        self.chunck = 1024  # 每帧大小
        self.recode_seconds = 5 # 处理间隔
        self.p = pyaudio.PyAudio()
        self.stream = self.p.open(format=self.format,
                                  channels=self.channels,
                                  rate=self.rate,
                                  input=True,
                                  frames_per_buffer=self.chunck)
        self.model_size = "./modelscape/faster-whisper-medium"
        self.model = WhisperModel(self.model_size, device="cuda", num_workers=1, compute_type="float32")

    async def recode_listen(self):
        print("开始录音")
        frames = []
        try:
            for _ in range(0, int(self.rate / self.chunck * self.recode_seconds)):
                data = self.stream.read(self.chunck, exception_on_overflow=False)
                frames.append(data)
        except OSError as e:
            print(f"录音时发生错误:{e}")
            # 如果流出错,可以尝试重新打开
            self.reopen_stream()
            return  # 出错时返回,下一次循环录音
        print("录音完成")
        # 异步启动转写任务
        await self.recode_voices(copy.copy(frames))
        await asyncio.sleep(0.0001)

    def reopen_stream(self):
        print("重启音频流......")
        try:
            self.stream.stop_stream()
            self.stream.close()
        except Exception:
            pass
        try:
            self.p.terminate()
        except Exception:
            pass
        self.p = pyaudio.PyAudio()
        self.stream = self.p.open(format=self.format,
                                  channels=self.channels,
                                  rate=self.rate,
                                  input=True,
                                  frames_per_buffer=self.chunck)

    async def recode_voices(self, frames):
        """

        :param frames:
        :return:
        """
        print("处理音频格式")
        buffer = io.BytesIO()
        wf = wave.open(buffer, 'wb')
        wf.setnchannels(self.channels)
        wf.setsampwidth(self.p.get_sample_size(self.format))
        wf.setframerate(self.rate)
        wf.writeframes(b''.join(frames))
        wf.close()
        buffer.seek(0)
        await self.transAudioWords(buffer)

    async def listen(self):
        while True:
            print("Listening...")
            try:
                while True:
                    await self.recode_listen()
            except Exception as e:
                print("录音终止", e)
                gc.collect()
                torch.cuda.empty_cache()

    async def tranSampleChinese(self, word):
        locale = "zh-hans"
        return zhconv.convert(word, locale)

    async def transAudioWords(self, buffer):
        print("开始转写......")
        segments, info = self.model.transcribe(buffer, beam_size=5,
                                               condition_on_previous_text=False,
                                               vad_filter=True,
                                               vad_parameters=dict(min_silence_duration_ms=1000))
        print("Detected language '%s' with probability %f" % (info.language, info.language_probability))
        for segment in segments:
            text = await self.tranSampleChinese(segment.text)
            print("[%.2fs -> %.2fs] %s" % (segment.start, segment.end, text))
        # 释放空间 防止cuda 内存溢出
        gc.collect()
        torch.cuda.empty_cache()


if __name__ == '__main__':
    asyncio.run(MicroPhoneTransWords().listen())

requirements.txt

复制代码
torch==2.2.2
torchvision==0.17.2
torchaudio==2.2.2
faster-whisper
gradio
pybind11>=2.12
numpy<2
SpeechRecognition
PyAudio # sudo apt update && sudo apt install ffmpeg portaudio19-dev python3-pyaudio -y
whisper-live
zhconv==1.4.3
相关推荐
wypywyp15 小时前
8. ubuntu 虚拟机 linux 服务器 TCP/IP 概念辨析
linux·服务器·ubuntu
阿蒙Amon16 小时前
TypeScript学习-第10章:模块与命名空间
学习·ubuntu·typescript
No8g攻城狮17 小时前
【Linux】Windows11 安装 WSL2 并运行 Ubuntu 22.04 详细操作步骤
linux·运维·ubuntu
sweetone18 小时前
LINN莲CLASSIK桌面音响微修
经验分享·音视频
森G19 小时前
七、04ledc-sdk--------makefile有变化
linux·c语言·arm开发·c++·ubuntu
生活很暖很治愈1 天前
Linux——孤儿进程&进程调度&大O(1)调度
linux·服务器·ubuntu
晚霞的不甘1 天前
CANN 编译器深度解析:UB、L1 与 Global Memory 的协同调度机制
java·后端·spring·架构·音视频
lili-felicity1 天前
CANN加速Whisper语音识别推理:流式处理与实时转录优化
人工智能·whisper·语音识别
美狐美颜SDK开放平台1 天前
多终端适配下的人脸美型方案:美颜SDK工程开发实践分享
人工智能·音视频·美颜sdk·直播美颜sdk·视频美颜sdk
getapi1 天前
注塑件的费用构成
linux·服务器·ubuntu