在 Ubuntu 下调用系统麦克风录音,并使用 faster-whisper-medium 将音频转写为文本

Python 代码如下:
import asyncio
import pyaudio
import wave
import gc
import io
import zhconv
import torch
import copy
from faster_whisper import WhisperModel


class MicroPhoneTransWords(object):
    """Record microphone audio in fixed-length segments and transcribe each one.

    Audio is read from a PyAudio input stream, wrapped in an in-memory WAV
    container and handed to a faster-whisper model. Recognised text is
    converted to Simplified Chinese with zhconv and printed.

    All blocking work (reading from the audio stream, running the model) is
    pushed to the event loop's default executor so the coroutines here never
    stall the loop.
    """

    def __init__(self, model_size="./modelscape/faster-whisper-medium",
                 device="cuda", compute_type="float32"):
        """Open the default input device and load the Whisper model.

        :param model_size: model name or local directory passed to WhisperModel.
        :param device: device string passed to WhisperModel (e.g. "cuda", "cpu").
        :param compute_type: compute type string passed to WhisperModel.
        """
        self.format = pyaudio.paInt16
        self.channels = 1
        self.rate = 44100  # sampling rate in Hz
        self.chunck = 1024  # frames per buffer / per stream.read() call
        self.recode_seconds = 5  # length of one recorded segment, in seconds
        self.p = None
        self.stream = None
        self._open_stream()
        self.model_size = model_size
        self.model = WhisperModel(self.model_size, device=device, num_workers=1,
                                  compute_type=compute_type)

    def _open_stream(self):
        """Create a fresh PyAudio handle and open the input stream on it."""
        self.p = pyaudio.PyAudio()
        self.stream = self.p.open(format=self.format,
                                  channels=self.channels,
                                  rate=self.rate,
                                  input=True,
                                  frames_per_buffer=self.chunck)

    def _close_stream(self):
        """Best-effort teardown of the current stream and PyAudio handle.

        Every step is attempted even if an earlier one fails (the stream may
        already be in an error state); failures are reported, not raised.
        """
        for method_name in ("stop_stream", "close"):
            try:
                getattr(self.stream, method_name)()
            except Exception as e:
                print(f"关闭音频流时发生错误:{e}")
        try:
            self.p.terminate()
        except Exception as e:
            print(f"关闭音频流时发生错误:{e}")

    def _read_segment(self):
        """Blocking: read one segment's worth of chunks from the input stream.

        :return: list of raw byte chunks covering ``recode_seconds`` of audio.
        :raises OSError: propagated from the stream read.
        """
        total_chunks = int(self.rate / self.chunck * self.recode_seconds)
        return [self.stream.read(self.chunck, exception_on_overflow=False)
                for _ in range(total_chunks)]

    async def _record_segment(self):
        """Record one segment without blocking the event loop.

        :return: list of raw byte chunks, or None if the stream failed and was
            reopened (the caller should simply try again).
        """
        print("开始录音")
        loop = asyncio.get_running_loop()
        try:
            frames = await loop.run_in_executor(None, self._read_segment)
        except OSError as e:
            print(f"录音时发生错误:{e}")
            # The stream is likely unusable after an error; rebuild it.
            self.reopen_stream()
            return None
        print("录音完成")
        return frames

    async def recode_listen(self):
        """Record one segment, then transcribe it and wait for the result.

        Returns early (after reopening the stream) if recording fails.
        """
        frames = await self._record_segment()
        if frames is None:
            return
        await self.recode_voices(frames)

    def reopen_stream(self):
        """Tear down the current audio stream and open a new one."""
        print("重启音频流......")
        self._close_stream()
        self._open_stream()

    @staticmethod
    def _frames_to_wav(frames, channels, sample_width, rate):
        """Pack raw PCM chunks into an in-memory WAV file.

        :param frames: iterable of bytes objects holding raw PCM data.
        :param channels: number of audio channels.
        :param sample_width: sample width in bytes.
        :param rate: sampling rate in Hz.
        :return: io.BytesIO positioned at offset 0, containing the WAV data.
        """
        buffer = io.BytesIO()
        # The context manager finalises the WAV header even if a write fails.
        with wave.open(buffer, 'wb') as wf:
            wf.setnchannels(channels)
            wf.setsampwidth(sample_width)
            wf.setframerate(rate)
            wf.writeframes(b''.join(frames))
        buffer.seek(0)
        return buffer

    async def recode_voices(self, frames):
        """Wrap recorded chunks in a WAV container and transcribe them.

        :param frames: list of raw byte chunks read from the input stream.
        :return: None
        """
        print("处理音频格式")
        buffer = self._frames_to_wav(frames,
                                     self.channels,
                                     self.p.get_sample_size(self.format),
                                     self.rate)
        await self.transAudioWords(buffer)

    async def listen(self):
        """Record and transcribe forever.

        Recording of the next segment overlaps with transcription of the
        previous one, so the microphone is not left unread while the model
        runs. At most one transcription is in flight: before a new one is
        scheduled the previous one is awaited. If that previous transcription
        failed, the error is reported and the segment just recorded is dropped.
        """
        pending = None  # transcription task for the previously recorded segment
        while True:
            print("Listening...")
            try:
                while True:
                    frames = await self._record_segment()
                    if frames is None:
                        continue
                    if pending is not None:
                        previous, pending = pending, None
                        await previous
                    pending = asyncio.create_task(self.recode_voices(frames))
            except Exception as e:
                print("录音终止", e)
                gc.collect()
                torch.cuda.empty_cache()
                # Brief pause so a persistent failure does not become a busy loop.
                await asyncio.sleep(1)

    async def tranSampleChinese(self, word):
        """Convert ``word`` to Simplified Chinese (zhconv locale "zh-hans")."""
        locale = "zh-hans"
        return zhconv.convert(word, locale)

    def _transcribe_blocking(self, buffer):
        """Blocking: run the model on ``buffer`` and collect all segments.

        Per the faster-whisper documentation the returned ``segments`` is a
        generator and decoding only happens while it is iterated, so it is
        consumed here, inside the worker thread.

        :return: ``(info, [(start, end, text), ...])``
        """
        segments, info = self.model.transcribe(buffer, beam_size=5,
                                               condition_on_previous_text=False,
                                               vad_filter=True,
                                               vad_parameters=dict(min_silence_duration_ms=1000))
        return info, [(segment.start, segment.end, segment.text) for segment in segments]

    async def transAudioWords(self, buffer):
        """Transcribe a WAV buffer and print the recognised segments.

        The detected-language line and the segments are printed once decoding
        of the whole buffer has finished.

        :param buffer: file-like object containing WAV audio.
        """
        print("开始转写......")
        loop = asyncio.get_running_loop()
        try:
            info, results = await loop.run_in_executor(None, self._transcribe_blocking, buffer)
            print("Detected language '%s' with probability %f" % (info.language, info.language_probability))
            for start, end, raw_text in results:
                text = await self.tranSampleChinese(raw_text)
                print("[%.2fs -> %.2fs] %s" % (start, end, text))
        finally:
            # Free memory after every attempt (success or failure) to avoid
            # running out of CUDA memory.
            gc.collect()
            torch.cuda.empty_cache()


if __name__ == '__main__':
    # Script entry point: build the recorder, then run its endless
    # listen loop on a new event loop.
    recorder = MicroPhoneTransWords()
    asyncio.run(recorder.listen())

requirements.txt

依赖列表如下:
torch==2.2.2
torchvision==0.17.2
torchaudio==2.2.2
faster-whisper
gradio
pybind11>=2.12
numpy<2
SpeechRecognition
PyAudio  # 需先安装系统依赖:sudo apt update && sudo apt install -y ffmpeg portaudio19-dev python3-pyaudio
whisper-live
zhconv==1.4.3
相关推荐
ACP广源盛139246256731 小时前
GSV2221 显示转换芯片@ACP#赋能 RTX Spark 端侧 AI 设备,构建多屏全模态视觉交互新生态
大数据·人工智能·嵌入式硬件·gpt·spark·电脑·音视频
jinxindeep4 小时前
JoyAI-Echo:让五分钟叙事视频拥有可延续的角色记忆
音视频
行智科技6 小时前
ORB-SLAM3代码详解 - 第 01 篇 · 系统总览与三线程架构
linux·ubuntu·架构·自动驾驶
街灯L6 小时前
【Ubuntu】使用ffmpeg解析m3u8网页视频
ubuntu·ffmpeg·音视频
嵌入式学习和实践10 小时前
Ubuntu 系统 socat 详细介绍与使用教程 - 映射任意两种数据通道
linux·ubuntu·虚拟串口·数据映射·socat
VidDown10 小时前
VidDown 使用介绍:一个免费、本地化的在线工具集
javascript·编辑器·音视频·视频编解码·视频
VidDown11 小时前
VidDown 视频解析下载:免安装、无水印、免费使用
音视频
byte轻骑兵11 小时前
【LE Audio】CAP精讲[15]: 音频城堡的安保体系,全流程安全防护与权限管控
音视频·实时音视频·le audio·蓝牙音频·低功耗音频
YYRAN_ZZU12 小时前
Ubuntu22.04搭建QEMU嵌入式开发环境全攻略
linux·嵌入式硬件·ubuntu
huangdong_12 小时前
拼多多商品图片视频批量采集:整店自动分类与高清原图
前端·javascript·音视频