忘了当初是怎么安装的了,这里先记录一下整理出来的类。不过这套识别容易出现幻觉,对齐也不是很准:比如使用 large-v3 倒是有一定的分句作用,但每句最后一个字分到的时间太短,这也说明对齐不准。
python
from chj.comm.pic import *
import json
import whisper
import whisperx
import gc
class Warp_whisper:
    """Transcribe audio with Whisper and align the segments with WhisperX.

    Both models are loaded once in ``__init__`` and reused for every file
    passed to :meth:`do_asr_algin`.
    """

    def __init__(self, language: str = "zh", device: str = "cuda",
                 compute_type: str = "float32", model: str = "large-v2"):
        """Load the Whisper ASR model and the WhisperX alignment model.

        Args:
            language: Language code given to Whisper and to
                ``whisperx.load_align_model``.
            device: Requested device; replaced by ``"cpu"`` when
                ``torch.cuda.is_available()`` is false.
            compute_type: Kept for interface compatibility; not used by this
                class.
            model: Whisper model name handed to ``whisper.load_model``.
        """
        # cuDNN is disabled for the whole process, not just this instance.
        torch.backends.cudnn.enabled = False
        if not torch.cuda.is_available():
            device = "cpu"

        dmodel = "XXXXX/models/torch/whisper"
        self.asr_model = whisper.load_model(model, device, download_root=dmodel)

        # Only the large-v2 + Chinese combination gets an OpenCC 't2s'
        # (Traditional -> Simplified) converter; presumably because that model
        # tends to emit Traditional characters -- TODO confirm.
        self.txt_converter = None
        if model == "large-v2" and language == "zh":
            from opencc import OpenCC
            self.txt_converter = OpenCC('t2s')

        # NOTE(review): the original computed the hint prompt
        # '以下是普通话的句子' for other Chinese models and then unconditionally
        # reset the prompt to None, so the hint was never used. That effective
        # behaviour is kept; confirm whether the hint should be re-enabled.
        self.prompt = None

        self.language = language
        self.device = device
        self.align_model, self.align_metadata = whisperx.load_align_model(
            language_code=language, device=device)

    def _convert_segments(self, segments):
        """Run each segment's ``text`` through ``self.txt_converter`` in place.

        Does nothing when no converter is configured.
        """
        if not self.txt_converter:
            return
        for seg in segments:
            seg['text'] = self.txt_converter.convert(seg['text'])

    @staticmethod
    def _write_segments(fjson, segments):
        """Write *segments* to *fjson* as indented, non-ASCII-escaped JSON."""
        # ensure_ascii=False emits raw (e.g. Chinese) characters, so the file
        # encoding must be pinned instead of relying on the locale default.
        with open(fjson, "w", encoding="utf-8") as fout:
            json.dump(segments, fout, indent=4, ensure_ascii=False)

    def do_asr_algin(self, fjson, fwav):
        """Transcribe *fwav*, align the result, and save it to *fjson*.

        Args:
            fjson: Path of the JSON file to write the aligned segments to.
            fwav: Path of the audio file to transcribe.
        """
        audio = whisper.load_audio(fwav)
        result = self.asr_model.transcribe(
            audio, language=self.language, initial_prompt=self.prompt)
        segments = result["segments"]

        # The original referenced an undefined local `converter` here, which
        # raised NameError whenever a converter was configured.
        self._convert_segments(segments)

        aligned = whisperx.align(
            segments, self.align_model, self.align_metadata, audio,
            self.device, return_char_alignments=False)
        self._write_segments(fjson, aligned["segments"])
def f2_invoke():
    """Run Whisper transcription + alignment over a directory of WAV files.

    Reads the base directory from ``sys.argv[1]``, processes every
    ``<basedir>/audio_feats/wav16k/*.wav`` and writes one JSON file per input
    into ``<basedir>/audio_feats/whisper_align``. Inputs whose JSON output
    already exists are skipped, so the run can be resumed.

    Exits with status 1 when the base directory argument is missing or the
    input directory does not exist.
    """
    print("Doing... whisper align")
    if len(sys.argv) < 2:
        # Previously this surfaced as a bare IndexError on sys.argv[1].
        print("usage: <script> <basedir>")
        sys.exit(1)
    basedir = sys.argv[1]

    din = f"{basedir}/audio_feats/wav16k"
    if not os.path.exists(din):
        print("no such dir", din)
        # sys.exit instead of the site-provided exit(), which is meant for
        # interactive use and is not guaranteed to exist.
        sys.exit(1)

    dout = f"{basedir}/audio_feats/whisper_align"
    # Default model settings are used; the original carried a commented-out
    # model="large-v3" alternative here.
    cls_asr = Warp_whisper()
    chj_file.mkdir(dout)

    # Sorted so the processing order is deterministic across runs.
    for fwav in tqdm(sorted(glob.glob(f"{din}/*.wav"))):
        # presumably the file's base name without extension -- verify against
        # chj_file.get_nm
        nm = chj_file.get_nm(fwav)
        fnm = f"{dout}/{nm}.json"
        if os.path.isfile(fnm):
            continue
        cls_asr.do_asr_algin(fnm, fwav)
    print("Finished whisper align")