写在前面:主要原理是提取视频中的音频,通过语音识别文字的方法完成字幕提取;因此,不支持没有声音的视频。
1.音频提取
def convert_mp4_to_mp3(input_file, output_file):
    """Extract the audio track of a video file and write it to an audio file.

    Args:
        input_file: Path of the source video (e.g. an MP4 file).
        output_file: Path of the audio file to create (e.g. an MP3 file).

    Raises:
        ValueError: If the video has no audio track.
    """
    video_clip = VideoFileClip(input_file)
    try:
        audio = video_clip.audio
        if audio is None:
            # A silent video has nothing to extract; fail with a clear
            # message instead of an AttributeError on None.
            raise ValueError(f"No audio track found in {input_file!r}")
        audio.write_audiofile(output_file)
    finally:
        # Always release the resources held by the clip.
        video_clip.close()
2.音频转文字
# Load the Whisper checkpoint directly onto the target device instead of
# loading it on the default device first and moving it afterwards.
model = whisper.load_model(WHISPER_MODEL_PATH, device=device)
def transcribe_audio(file_path):
    """Transcribe an audio file with the module-level Whisper model.

    Args:
        file_path: Path of the audio file to transcribe.

    Returns:
        Whatever ``model.transcribe`` returns for the file.
        NOTE(review): presumably the Whisper result dict whose "segments"
        entries carry "start", "end" and "text" - confirm against callers.
    """
    # `prompt` is deliberately not passed: transcribe() overwrites that
    # decoding option for every audio window, so a caller-supplied value is
    # silently discarded. `initial_prompt` is the supported way to seed context.
    return model.transcribe(
        file_path,
        language="zh",  # explicitly decode as Chinese
        task="transcribe",
        initial_prompt="这是一段视频教程,请加上标点符号",
        beam_size=5,
        word_timestamps=True,  # also compute word-level timestamps
        fp16=False,
    )
3.文字保存为字幕
def _srt_cue_text(text):
    """Return *text* with surrounding whitespace and blank lines removed."""
    return "\n".join(line.strip() for line in text.splitlines() if line.strip())


def save_transcription_to_srt(segments, file_path):
    """Save transcription segments to a bilingual SRT file.

    Each cue contains the original segment text followed by its translation
    (obtained from ``translate_text``).

    Args:
        segments: Iterable of mappings with "start", "end" and "text" keys.
        file_path: Path of the SRT file to write (UTF-8).
    """
    with open(file_path, 'w', encoding='utf-8') as f:
        for index, segment in enumerate(segments, start=1):
            # A blank line terminates an SRT cue, so neither the original
            # text nor the model's translation may contain one.
            original = _srt_cue_text(segment['text'])
            translated = _srt_cue_text(translate_text(segment['text']))
            body = "\n".join(part for part in (original, translated) if part)
            f.write(f"{index}\n")
            f.write(f"{seconds_to_hmsm(segment['start'])} --> {seconds_to_hmsm(segment['end'])}\n")
            f.write(f"{body}\n\n")
上述代码中的
translated_text = translate_text(segment['text'])
用于对每条字幕进行翻译,写入时原文在上、译文在下,因此保存的字幕为双语字幕(解放字幕组劳动力o(* ̄▽ ̄*)ブ)。翻译方法有很多,这里简单提供一个基于 Ollama 的翻译实现。
python
def translate_text(content):
    """Translate text to Simplified Chinese using an Ollama chat model.

    Args:
        content: Text to translate.

    Returns:
        The model's reply as a single string, assembled from the streamed
        response chunks.
    """
    stream = ollama.chat(
        model="llama3.2:1b",
        messages=[{"role": "user", "content": f"下列文字翻译为简体中文,只输出翻译结果:{content}"}],
        stream=True,
    )
    # Join the streamed fragments in one pass instead of repeated `+=`.
    return ''.join(chunk['message']['content'] for chunk in stream)
在上述功能的基础上,进阶两个功能。
4.更新字幕到视频内
def videoclip_with_subtitles(src_mp4, dst_mp4, srt_file):
    """Render the subtitles of an SRT file onto a video.

    Args:
        src_mp4: Path of the source video.
        dst_mp4: Path of the video file to write.
        srt_file: Path of the SRT file whose cues are overlaid.
    """
    # Parse first so a bad subtitle file fails before the video is opened.
    subtitles = parse_srt(srt_file)
    video = VideoFileClip(src_mp4)
    try:
        text_clips = []
        for text, start, duration in tqdm(subtitles):
            wrapped_text = insert_newlines(text, max_words=10)
            text_clip = (TextClip(wrapped_text, font='msyh.ttc', fontsize=30, color='red')
                         .set_position('top')
                         .set_duration(duration)
                         .set_start(start))
            text_clips.append(text_clip)
        final_video = CompositeVideoClip([video, *text_clips])
        final_video.write_videofile(dst_mp4, audio_codec='aac')
    finally:
        # Always release the resources held by the source clip.
        video.close()
其中字幕解析为:
def parse_srt(srt_file):
    """Parse an SRT file into ``(text, start_time, duration)`` tuples.

    Entries with fewer than three lines, or whose timing line does not
    contain ``-->``, are skipped.

    Args:
        srt_file: Path of the UTF-8 encoded SRT file.

    Returns:
        A list of ``(text, start_time, duration)`` tuples, where the times
        are the values produced by ``convert_srt_time_to_seconds``.
    """
    with open(srt_file, 'r', encoding='utf-8') as file:
        content = file.read()

    subtitles = []
    # Cues are separated by blank lines; also accept separator lines that
    # contain only whitespace.
    for entry in re.split(r'\n\s*\n', content.strip()):
        lines = entry.splitlines()
        if len(lines) < 3:
            continue
        start, arrow, end = lines[1].partition('-->')
        if not arrow:
            # Not a timing line: skip the malformed entry instead of crashing.
            continue
        # Drop the spaces that surround the "-->" separator.
        start_time = convert_srt_time_to_seconds(start.strip())
        duration = convert_srt_time_to_seconds(end.strip()) - start_time
        subtitles.append(('\n'.join(lines[2:]), start_time, duration))
    return subtitles
5.根据新的字幕,生成新的音频(方法很多,另开贴,更新后网址会贴在这里),即可以将外文音频转为中文音频。(代码还存在一些缺陷,修改后更新)
python
def text_to_speech(text, mp3_filename="output.mp3"):
    """Synthesize *text* with pyttsx3 and save the speech as an MP3 file.

    Args:
        text: Text to speak.
        mp3_filename: Path of the MP3 file to write.
    """
    import os
    import tempfile

    # Initialise the TTS engine and set the optional voice properties.
    engine = pyttsx3.init()
    engine.setProperty('rate', 150)  # speaking rate
    engine.setProperty('volume', 1)  # volume (0.0 to 1.0)

    # pyttsx3 produces a WAV file first. Keep it in a private temporary
    # directory so runs do not collide on a shared "1.wav" and no scratch
    # file is left behind in the working directory.
    with tempfile.TemporaryDirectory() as tmp_dir:
        wav_filename = os.path.join(tmp_dir, "speech.wav")
        engine.save_to_file(text, wav_filename)
        engine.runAndWait()
        audio = AudioSegment.from_wav(wav_filename)

    # Convert the loaded WAV audio to MP3 with pydub.
    audio.export(mp3_filename, format="mp3")
import os

from moviepy.editor import CompositeAudioClip

# Read the subtitle file.
subs = pysrt.open('data\\27364690929-1-192.srt')

# The per-cue MP3 files are written here, so make sure the directory exists.
os.makedirs("audio_temp", exist_ok=True)

# Build one positioned audio clip per subtitle cue.
audio_clips = []
for sub in tqdm(subs):
    # Speak only the last line of the cue text.
    text_zh = sub.text.split('\n')[-1]

    # Generate the speech file for this cue.
    audio_filename = f"audio_temp/audio_{sub.index}.mp3"
    text_to_speech(text_zh, audio_filename)

    # pysrt exposes `ordinal` in milliseconds; moviepy start times are seconds.
    start_time = sub.start.ordinal / 1000
    audio_clips.append(AudioFileClip(audio_filename).set_start(start_time))

# Overlay every clip at its own start time. concatenate_audioclips would play
# the clips back-to-back and ignore the start offsets set above.
# NOTE(review): speech longer than its subtitle slot will overlap the next cue.
final_audio = CompositeAudioClip(audio_clips)
将新的音频合并到视频内,即可完成音频替换
python
# Open the source video and drop its original audio track.
video = (
    VideoFileClip('data\\27364690929-1-192.mp4')
    .without_audio()
)

# Attach the newly generated audio track.
final_video = video.set_audio(final_audio)

# Encode and save the dubbed video.
final_video.write_videofile(
    'data\\27364690929-1-192-zh-audio.mp4',
    codec='libx264',
)
GitHub 上有一些非常棒的类似项目,我做完才看到,还没有深入研究,大家可以去多看看。