ubuntu 下调用系统麦克风,以及faster-whisper-medium 处理音频转写文本

python 复制代码
import asyncio
import pyaudio
import wave
import gc
import io
import zhconv
import torch
import copy
from faster_whisper import WhisperModel


class MicroPhoneTransWords(object):
    def __init__(self):
        self.format = pyaudio.paInt16
        self.channels = 1
        self.rate = 44100  # 采样率
        self.chunck = 1024  # 每帧大小
        self.recode_seconds = 5 # 处理间隔
        self.p = pyaudio.PyAudio()
        self.stream = self.p.open(format=self.format,
                                  channels=self.channels,
                                  rate=self.rate,
                                  input=True,
                                  frames_per_buffer=self.chunck)
        self.model_size = "./modelscape/faster-whisper-medium"
        self.model = WhisperModel(self.model_size, device="cuda", num_workers=1, compute_type="float32")

    async def recode_listen(self):
        print("开始录音")
        frames = []
        try:
            for _ in range(0, int(self.rate / self.chunck * self.recode_seconds)):
                data = self.stream.read(self.chunck, exception_on_overflow=False)
                frames.append(data)
        except OSError as e:
            print(f"录音时发生错误:{e}")
            # 如果流出错,可以尝试重新打开
            self.reopen_stream()
            return  # 出错时返回,下一次循环录音
        print("录音完成")
        # 异步启动转写任务
        await self.recode_voices(copy.copy(frames))
        await asyncio.sleep(0.0001)

    def reopen_stream(self):
        print("重启音频流......")
        try:
            self.stream.stop_stream()
            self.stream.close()
        except Exception:
            pass
        try:
            self.p.terminate()
        except Exception:
            pass
        self.p = pyaudio.PyAudio()
        self.stream = self.p.open(format=self.format,
                                  channels=self.channels,
                                  rate=self.rate,
                                  input=True,
                                  frames_per_buffer=self.chunck)

    async def recode_voices(self, frames):
        """

        :param frames:
        :return:
        """
        print("处理音频格式")
        buffer = io.BytesIO()
        wf = wave.open(buffer, 'wb')
        wf.setnchannels(self.channels)
        wf.setsampwidth(self.p.get_sample_size(self.format))
        wf.setframerate(self.rate)
        wf.writeframes(b''.join(frames))
        wf.close()
        buffer.seek(0)
        await self.transAudioWords(buffer)

    async def listen(self):
        while True:
            print("Listening...")
            try:
                while True:
                    await self.recode_listen()
            except Exception as e:
                print("录音终止", e)
                gc.collect()
                torch.cuda.empty_cache()

    async def tranSampleChinese(self, word):
        locale = "zh-hans"
        return zhconv.convert(word, locale)

    async def transAudioWords(self, buffer):
        print("开始转写......")
        segments, info = self.model.transcribe(buffer, beam_size=5,
                                               condition_on_previous_text=False,
                                               vad_filter=True,
                                               vad_parameters=dict(min_silence_duration_ms=1000))
        print("Detected language '%s' with probability %f" % (info.language, info.language_probability))
        for segment in segments:
            text = await self.tranSampleChinese(segment.text)
            print("[%.2fs -> %.2fs] %s" % (segment.start, segment.end, text))
        # 释放空间 防止cuda 内存溢出
        gc.collect()
        torch.cuda.empty_cache()


if __name__ == '__main__':
    asyncio.run(MicroPhoneTransWords().listen())

requirements.txt

复制代码
torch==2.2.2
torchvision==0.17.2
torchaudio==2.2.2
faster-whisper
gradio
pybind11>=2.12
numpy<2
SpeechRecognition
PyAudio # sudo apt update && sudo apt install ffmpeg portaudio19-dev python3-pyaudio -y
whisper-live
zhconv==1.4.3
相关推荐
#君君#2 小时前
解决 Ubuntu 20.04 虚拟机中 catkin_make 编译卡死问题
linux·运维·ubuntu
m0_526119403 小时前
h5的aliplayer-min.js 加密视频会走到debugger
开发语言·javascript·音视频
ue星空3 小时前
UE音频中间件wwise插件
学习·ue5·音视频
2401_896008195 小时前
ubuntu之开机自启frpc
linux·运维·ubuntu
陈奕昆6 小时前
3.2 HarmonyOS NEXT跨设备任务调度与协同实战:算力分配、音视频协同与智能家居联动
音视频·智能家居·harmonyos
趣浪吧7 小时前
【JSON-to-Video】设置背景视频片断
json·aigc·音视频·视频
昨日之日20067 小时前
SoloSpeech - 高质量语音处理模型,一键提取指定说话人音频并提升提取音频清晰度和质量 本地一键整合包下载
人工智能·音视频
黑风风7 小时前
MySQL 8 完整安装指南(Ubuntu 22.04)
mysql·ubuntu
科技小E9 小时前
嵌入式SDK技术EasyRTC音视频实时通话助力即时通信社交/教育等多场景创新应用
人工智能·音视频
梁汉强6669 小时前
B站缓存视频数据m4s转mp4
缓存·音视频