之前看到了ai voice有些许兴趣便玩了玩,用花儿不哭大佬制作的rvc推理包做了几个模型,可能因为训练量和杂音的原因导致最后使用的效果不佳,后关注到花儿不哭大佬新的GPT-SoVITS项目。这个由于是文字转语音,训练出来的效果好上很多,但是是tts,没法用做变声器,而且推理时间相对rvc变声器过长。因此发现可以以语音转文字然后翻译再转为音频,以当做一个翻译器来用。
项目使用需要 GPT-SoVITS 的后端接口,因此选择了箱庭XTer的推理包,它提供了相应的接口可供调用。
模型训练:
翻译使用谷歌翻译,调用对应 API,参考以下帖子编写:
https://www.52pojie.cn/thread-1730501-1-1.html
备用api参考:
https://www.52pojie.cn/forum.php?mod=viewthread&tid=1903090&highlight=谷歌翻译
由于语音转文字使用的是python的vosk库需去网址下载对应的模型文件(感觉识别准确度不够高)
https://alphacephei.com/vosk/models
import sounddevice as sd
import vosk
import queue
import json
import os
import sys
import requests
import pyaudio
from requests.exceptions import JSONDecodeError
from bs4 import BeautifulSoup

# Queue carrying raw audio chunks from the sounddevice callback thread
# to the recognition loop.
q = queue.Queue()

# Sample rate fed to both the input stream and the Vosk recognizer.
samplerate = 16000

# Path to the downloaded Vosk model folder,
# e.g. "vosk-model-small-cn-0.3".  Adjust to your local path.
model_path = "E:/aivoice/vosk-model-cn-0.22"

# Fail fast with a clear message if the model folder is missing.
if not os.path.exists(model_path):
    print(f"模型路径 {model_path} 不存在,请检查路径是否正确。")
    sys.exit(1)

# Load the Vosk speech-recognition model once at startup.
model = vosk.Model(model_path)


def translate_text(text, source_lang='', target_lang='ja'):
    """Translate *text* using the findmyip.net translate API (interface 1).

    Returns the translated string on success, or a short Chinese error
    message on any failure (network error, bad JSON, non-200 API code).
    """
    # Interface 1: https://findmyip.net/api/translate.php
    url = "https://findmyip.net/api/translate.php"
    # Let requests build the query string so `text` is URL-encoded
    # (the original f-string URL broke on spaces, '&', '#', etc.).
    params = {"text": text, "source_lang": source_lang, "target_lang": target_lang}
    try:
        response = requests.get(url, params=params, timeout=10)
        data = response.json()
        print(data)
        if response.status_code == 200:
            if data['code'] == 200:
                translation = data['data']['translate_result']
                print("translation", translation)
                return translation
            elif data['code'] == 400:
                return "出错"
            else:
                return "内部接口错误,请联系开发者"
        else:
            return "内部接口错误,请联系开发者"
    except JSONDecodeError:
        return "出错"
    except requests.RequestException:
        return "出错"


def google_translate_text(text, source_lang='zh-CN', target_lang='ja'):
    """Translate *text* via Google's async translate endpoint (interface 2).

    Scrapes the ``tw-answ-target-text`` span out of the returned HTML.
    Returns the translated string, or None when it cannot be found.
    """
    # Interface 2: https://translate.google.com
    url = "https://www.google.com.hk/async/translate"
    payload = f"async=translate,sl:{source_lang},tl:{target_lang},st:{text},id:1672056488960,qc:true,ac:true,_id:tw-async-translate,_pms:s,_fmt:pc"
    payload_encoded = payload.encode('utf-8')
    # Browser-like headers captured from a real session; the endpoint
    # rejects requests without them.
    headers = {
        'sec-ch-ua': '"Not?A_Brand";v="8", "Chromium";v="108", "Google Chrome";v="108"',
        'DNT': '1',
        'sec-ch-ua-mobile': '?0',
        'User-Agent': 'Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/108.0.0.0 Safari/537.36',
        'sec-ch-ua-arch': '"x86"',
        'sec-ch-ua-full-version': '"108.0.5359.125"',
        'Content-Type': 'application/x-www-form-urlencoded;charset=UTF-8',
        'sec-ch-ua-platform-version': '"10.0.0"',
        'sec-ch-ua-full-version-list': '"Not?A_Brand";v="8.0.0.0", "Chromium";v="108.0.5359.125", "Google Chrome";v="108.0.5359.125"',
        'sec-ch-ua-bitness': '"64"',
        'sec-ch-ua-model': '',
        'sec-ch-ua-wow64': '?0',
        'sec-ch-ua-platform': '"Windows"',
        'Accept': '*/*',
        'X-Client-Data': 'CKW1yQEIhbbJAQiktskBCMS2yQEIqZ3KAQjb08oBCLD+ygEIlaHLAQjv8swBCN75zAEI5PrMAQjxgM0BCLKCzQEI7ILNAQjIhM0BCO+EzQEIt4XNAQ==',
        'Sec-Fetch-Site': 'same-origin',
        'Sec-Fetch-Mode': 'cors',
        'Sec-Fetch-Dest': 'empty',
        'host': 'www.google.com.hk',
        'Cookie': '1P_JAR=2022-12-26-12; NID=511=eVLI1bG9nhyOZtqU14JBHm5Be00epdxfR4XmfQeehYyIkzgpXi6dbpNY75ZMVyS7aOjoM2oZ5WdoR8eNq6wi1-e_J0NeoyI0dtsHW-_8Ik4PGrqvuGHdcvVC03zTOEK2TY1FZL85Wimo_ZPIE3hGIrmGPSiel6-rRRW9lD30UPs'
    }
    response = requests.post(url, headers=headers, data=payload_encoded, timeout=10)
    soup = BeautifulSoup(response.text, 'html.parser')
    target_text = soup.find('span', {'id': 'tw-answ-target-text'})
    if target_text:
        print(target_text.get_text())
        return target_text.get_text()
    return None


def callback(indata, frames, time, status):
    """sounddevice callback: push each raw audio chunk onto the queue."""
    if status:
        print(status, file=sys.stderr)
    q.put(bytes(indata))


def text_to_speech(text):
    """Send *text* to the local GPT-SoVITS TTS endpoint and play the
    streamed audio (16-bit, 32 kHz, mono) through pyaudio.
    """
    # URL-encode the text via `params` instead of f-string interpolation.
    params = {"character": "holi", "text": text, "stream": "true"}
    p = pyaudio.PyAudio()
    stream = p.open(format=p.get_format_from_width(2),
                    channels=1,
                    rate=32000,
                    output=True)
    # try/finally guarantees the stream and pyaudio are released even if
    # the HTTP request fails (the original leaked them on error).
    try:
        response = requests.get('http://127.0.0.1:5000/tts',
                                params=params, stream=True)
        for data in response.iter_content(chunk_size=1024):
            stream.write(data)
    finally:
        stream.stop_stream()
        stream.close()
        p.terminate()


# Open the microphone and run the recognize -> translate -> speak loop.
# `device` is the index of the input device — adjust it for your machine;
# list all devices with sd.query_devices().
with sd.RawInputStream(samplerate=samplerate, blocksize=8000, device=2,
                       dtype='int16', channels=1, callback=callback):
    print('#' * 80)
    print('按 Ctrl+C 结束')
    print('#' * 80)

    rec = vosk.KaldiRecognizer(model, samplerate)
    last_partial = ""
    while True:
        data = q.get()
        if rec.AcceptWaveform(data):
            result_json = json.loads(rec.Result())
            recognized_text = result_json['text']
            print(f"识别结果: {recognized_text}")
            last_partial = ""
            if recognized_text:
                # Interface 1:
                # translated_text = translate_text(recognized_text)
                # Interface 2 — the source language must match the Vosk
                # model's language; the second argument is the target.
                translated_text = google_translate_text(recognized_text, 'zh-CN', 'ja')
                # google_translate_text returns None on failure; skip TTS
                # then rather than speak the literal string "None".
                if translated_text:
                    text_to_speech(translated_text)
        else:
            partial_json = json.loads(rec.PartialResult())
            new_partial = partial_json['partial']
            # Print only the words added since the last partial result.
            new_words = new_partial[len(last_partial):].strip().split()
            if new_words:
                print(f"部分识别结果: {' '.join(new_words)}")
            last_partial = new_partial