在 Ubuntu 下调用系统麦克风录音,并使用 faster-whisper-medium 将音频转写为文本

python 复制代码
import asyncio
import pyaudio
import wave
import gc
import io
import zhconv
import torch
import copy
from faster_whisper import WhisperModel


class MicroPhoneTransWords(object):
    """Record microphone audio in fixed-length windows and transcribe it.

    Audio is read from a PyAudio input stream (no device index is passed,
    so PyAudio chooses the device), wrapped into an in-memory WAV file and
    handed to a faster-whisper ``WhisperModel``.  Every recognised segment
    is converted with ``zhconv`` to the ``zh-hans`` locale and printed.

    NOTE: the coroutines of this class make blocking calls (stream reads,
    model inference).  Recording and transcription therefore run strictly
    one after the other; the stream is not read while a window is being
    transcribed, and input overflow during that time is ignored.
    """

    # Pause before ``listen`` retries after an unexpected failure, so that a
    # persistent fault does not turn into a busy loop.
    RETRY_DELAY_SECONDS = 1.0

    def __init__(self, *, model_size="./modelscape/faster-whisper-medium",
                 device="cuda", compute_type="float32",
                 rate=44100, chunk_size=1024, record_seconds=5):
        """Load the model and open the microphone stream.

        All parameters are keyword-only and default to the values that were
        previously hard-coded.

        :param model_size: model path or name passed to ``WhisperModel``.
        :param device: device string passed to ``WhisperModel``.
        :param compute_type: compute type passed to ``WhisperModel``.
        :param rate: sample rate in Hz.
        :param chunk_size: number of frames fetched per stream read.
        :param record_seconds: length of one recording window in seconds.
        """
        self.format = pyaudio.paInt16
        self.channels = 1
        self.rate = rate  # sample rate
        self.chunck = chunk_size  # frames per read
        self.recode_seconds = record_seconds  # processing interval
        self.model_size = model_size
        # Load the model first: if loading fails, no audio stream has been
        # opened yet, so nothing is leaked.
        self.model = WhisperModel(self.model_size, device=device,
                                  num_workers=1, compute_type=compute_type)
        self.p = pyaudio.PyAudio()
        try:
            self.stream = self._open_stream()
        except Exception:
            # Do not keep the PyAudio instance alive without a usable stream.
            self.p.terminate()
            raise

    def _open_stream(self):
        """Open and return an input stream using the configured settings."""
        return self.p.open(format=self.format,
                           channels=self.channels,
                           rate=self.rate,
                           input=True,
                           frames_per_buffer=self.chunck)

    def close(self):
        """Release the audio stream and the PyAudio instance (best effort).

        Errors raised while closing are reported but never propagated.
        Afterwards ``self.stream`` and ``self.p`` are ``None``; calling this
        method again is a no-op.
        """
        stream = getattr(self, "stream", None)
        if stream is not None:
            # Stop and close separately so that a failing stop (likely on a
            # stream that already errored) does not prevent the close.
            for action in (stream.stop_stream, stream.close):
                try:
                    action()
                except Exception as e:
                    print(f"关闭音频流失败:{e}")
        self.stream = None

        audio = getattr(self, "p", None)
        if audio is not None:
            try:
                audio.terminate()
            except Exception as e:
                print(f"释放音频设备失败:{e}")
        self.p = None

    async def recode_listen(self):
        """Record one window of audio and transcribe it.

        If reading from the stream raises ``OSError`` the stream is reopened
        and the window is dropped; the caller is expected to call again.
        Exceptions raised while reopening or transcribing propagate.
        """
        if self.stream is None:
            # An earlier reopen attempt failed (or close() was called);
            # try to get a working stream before recording.
            self.reopen_stream()
        print("开始录音")
        reads_per_window = int(self.rate / self.chunck * self.recode_seconds)
        frames = []
        try:
            for _ in range(reads_per_window):
                frames.append(
                    self.stream.read(self.chunck, exception_on_overflow=False))
        except OSError as e:
            print(f"录音时发生错误:{e}")
            # The stream is broken; reopen it and record again on the next call.
            self.reopen_stream()
            return
        print("录音完成")
        # Transcription is awaited inline, i.e. it finishes before this
        # coroutine returns and the next window is recorded.
        await self.recode_voices(frames)
        await asyncio.sleep(0.0001)

    def reopen_stream(self):
        """Discard the current stream and PyAudio instance and open new ones.

        Raises whatever ``pyaudio.PyAudio()`` or the stream open raises; in
        that case ``self.stream`` stays ``None``.
        """
        print("重启音频流......")
        self.close()
        self.p = pyaudio.PyAudio()
        self.stream = self._open_stream()

    @staticmethod
    def _encode_wav(frames, channels, sample_width, rate):
        """Pack raw PCM chunks into an in-memory WAV file.

        :param frames: iterable of ``bytes`` chunks of PCM data.
        :param channels: number of audio channels.
        :param sample_width: sample width in bytes.
        :param rate: sample rate in Hz.
        :return: ``io.BytesIO`` holding the WAV file, positioned at offset 0.
        """
        buffer = io.BytesIO()
        # The context manager finalises the WAV header even if writing fails;
        # it does not close ``buffer`` because wave did not open it itself.
        with wave.open(buffer, 'wb') as wf:
            wf.setnchannels(channels)
            wf.setsampwidth(sample_width)
            wf.setframerate(rate)
            wf.writeframes(b''.join(frames))
        buffer.seek(0)
        return buffer

    async def recode_voices(self, frames):
        """Convert recorded chunks to WAV and pass them on for transcription.

        :param frames: list of raw PCM ``bytes`` chunks read from the stream.
        :return: None
        """
        print("处理音频格式")
        buffer = self._encode_wav(frames,
                                  self.channels,
                                  self.p.get_sample_size(self.format),
                                  self.rate)
        await self.transAudioWords(buffer)

    @staticmethod
    def _release_memory():
        """Run the garbage collector and empty torch's CUDA cache."""
        gc.collect()
        torch.cuda.empty_cache()

    async def listen(self):
        """Record and transcribe in an endless loop.

        Any ``Exception`` from a recording/transcription cycle is printed,
        memory is released and, after a short pause, the loop starts over.
        The audio resources are released when the loop is left through
        cancellation or an interrupt.
        """
        try:
            while True:
                print("Listening...")
                try:
                    while True:
                        await self.recode_listen()
                except Exception as e:
                    print("录音终止", e)
                    self._release_memory()
                    await asyncio.sleep(self.RETRY_DELAY_SECONDS)
        finally:
            self.close()

    async def tranSampleChinese(self, word):
        """Return ``word`` converted by zhconv to the ``zh-hans`` locale."""
        locale = "zh-hans"
        return zhconv.convert(word, locale)

    async def transAudioWords(self, buffer):
        """Transcribe a WAV buffer and print each segment.

        :param buffer: file-like object containing WAV audio.
        :return: None
        """
        print("开始转写......")
        try:
            segments, info = self.model.transcribe(
                buffer,
                beam_size=5,
                condition_on_previous_text=False,
                vad_filter=True,
                vad_parameters=dict(min_silence_duration_ms=1000))
            print("Detected language '%s' with probability %f" % (info.language, info.language_probability))
            for segment in segments:
                text = await self.tranSampleChinese(segment.text)
                print("[%.2fs -> %.2fs] %s" % (segment.start, segment.end, text))
        finally:
            # Free memory to keep CUDA from running out of memory, also when
            # transcription fails part-way.
            self._release_memory()


if __name__ == '__main__':
    # Script entry point: build the transcriber, then drive its endless
    # listen loop on a fresh asyncio event loop.
    transcriber = MicroPhoneTransWords()
    listen_forever = transcriber.listen()
    asyncio.run(listen_forever)

requirements.txt

复制代码
torch==2.2.2
torchvision==0.17.2
torchaudio==2.2.2
faster-whisper
gradio
pybind11>=2.12
numpy<2
SpeechRecognition
PyAudio # sudo apt update && sudo apt install ffmpeg portaudio19-dev python3-pyaudio -y
whisper-live
zhconv==1.4.3
相关推荐
REDcker5 天前
WebCodecs VideoDecoder 的 hardwareAcceleration 使用
前端·音视频·实时音视频·直播·webcodecs·videodecoder
欧云服务器5 天前
怎么让脚本命令可以同时在centos、debian、ubuntu执行?
ubuntu·centos·debian
gihigo19985 天前
基于TCP协议实现视频采集与通信
网络协议·tcp/ip·音视频
智渊AI5 天前
Ubuntu 20.04/22.04 下通过 NVM 安装 Node.js 22(LTS 稳定版)
ubuntu·node.js·vim
山河君5 天前
四麦克风声源定位实战:基于 GCC-PHAT + 最小二乘法实现 DOA
算法·音视频·语音识别·信号处理·最小二乘法·tdoa
The️5 天前
Linux驱动开发之Read_Write函数
linux·运维·服务器·驱动开发·ubuntu·交互
音视频牛哥5 天前
Android平台RTMP/RTSP超低延迟直播播放器开发详解——基于SmartMediaKit深度实践
android·人工智能·计算机视觉·音视频·rtmp播放器·安卓rtmp播放器·rtmp直播播放器
再战300年5 天前
Samba在ubuntu上安装部署
linux·运维·ubuntu
qq_416276425 天前
通用音频表征的对比学习
学习·音视频
qwfys2005 天前
How to install golang 1.26.0 to Ubuntu 24.04
ubuntu·golang·install