whisper:将指定格式的wav音频文件生成srt字幕;生成的字幕是繁体中文,需要用OpenCC转换为简体中文
开源项目地址:https://github.com/ggml-org/whisper.cpp
windows二进制包
在开源项目地址releases中下载
下载后将bin目录添加到Path环境变量中
Docker/Linux
# Image that builds whisper.cpp's whisper-cli from a local source zip on top of a JDK 8 base,
# bundles the ggml-base model, and smoke-tests the binary during the build.
FROM openjdk:8-jdk
# Install the build tools (compiler toolchain, cmake, unzip).
# The apt sources are first rewritten to the Aliyun mirror; the apt lists are removed afterwards.
RUN sed -i 's/deb.debian.org/mirrors.aliyun.com/g' /etc/apt/sources.list && \
sed -i 's/security.debian.org/mirrors.aliyun.com/g' /etc/apt/sources.list && \
apt-get update && apt-get install -y \
build-essential \
cmake \
unzip \
&& rm -rf /var/lib/apt/lists/*
# Install and build whisper.cpp here
# ========== Added: copy and build whisper.cpp ==========
# Create the whisper working directory
RUN mkdir -p /opt/whisper
# Copy the source zip from the build context into the container
COPY ./whisper/whisper.cpp-master.zip /tmp/
# Unpack the zip into the working directory, then delete the archive
RUN unzip /tmp/whisper.cpp-master.zip -d /opt/whisper/ && \
rm /tmp/whisper.cpp-master.zip
# Build whisper.cpp (Release configuration, only the whisper-cli target)
RUN cd /opt/whisper/whisper.cpp-master && \
mkdir -p build && \
cd build && \
cmake .. -DCMAKE_BUILD_TYPE=Release && \
make -j$(nproc) whisper-cli
# Create the models directory
RUN mkdir -p /opt/whisper/whisper.cpp-master/build/models
# Copy the model file into the models directory inside the container
COPY ./whisper/models/ggml-base.bin /opt/whisper/whisper.cpp-master/build/models/
# Put the directory holding the built binaries on PATH so `whisper-cli` can be called by name
ENV WHISPER_PATH=/opt/whisper/whisper.cpp-master/build/bin
ENV PATH="${WHISPER_PATH}:${PATH}"
# Smoke test: transcribe a sample wav as Chinese (-l zh) into /opt/whisper/testOutput.srt (-of + -osrt).
# Because this is a RUN step, a failing transcription fails the image build.
COPY ./whisper/test/test.wav /opt/whisper/
RUN whisper-cli -m /opt/whisper/whisper.cpp-master/build/models/ggml-base.bin -f /opt/whisper/test.wav -l zh -of /opt/whisper/testOutput -osrt
# Print the generated subtitles into the build log
RUN cat /opt/whisper/testOutput.srt
模型文件下载地址
https://huggingface.co/ggerganov/whisper.cpp/tree/main
需指定模型文件路径
-m /opt/whisper/whisper.cpp-master/build/models/ggml-base.bin
OpenCC:将繁体转换为简体
开源项目地址: https://github.com/BYVoid/OpenCC?tab=readme-ov-file
windows
windows二进制包下载地址在开源项目地址中有展示
下载后将bin目录添加到Path环境变量中
Docker/Linux
# Image that builds and installs OpenCC from a local source zip on top of a JDK 8 base,
# then runs a Traditional-to-Simplified conversion during the build as a smoke test.
FROM openjdk:8-jdk
# Install the build tools (compiler toolchain, cmake, unzip).
# The apt sources are first rewritten to the Aliyun mirror; the apt lists are removed afterwards.
RUN sed -i 's/deb.debian.org/mirrors.aliyun.com/g' /etc/apt/sources.list && \
sed -i 's/security.debian.org/mirrors.aliyun.com/g' /etc/apt/sources.list && \
apt-get update && apt-get install -y \
build-essential \
cmake \
unzip \
&& rm -rf /var/lib/apt/lists/*
# Install OpenCC
# Create the opencc working directory
RUN mkdir -p /opt/opencc
# Copy the source zip from the build context into the container
COPY ./opencc/OpenCC-master.zip /tmp/
# Unpack the zip into the working directory, then delete the archive
RUN unzip /tmp/OpenCC-master.zip -d /opt/opencc/ && \
rm /tmp/OpenCC-master.zip
# Build OpenCC, install it, and refresh the shared-library cache
RUN cd /opt/opencc/OpenCC-master && \
mkdir -p build && \
cd build && \
cmake .. && \
make -j$(nproc) && \
make install && \
ldconfig
# Add the build output directory to PATH
# NOTE(review): `make install` above also installs opencc system-wide - confirm that this
# build/bin directory actually exists and that the PATH entry is still needed.
ENV OPENCC_PATH=/opt/opencc/OpenCC-master/build/bin
ENV PATH="${OPENCC_PATH}:${PATH}"
# Smoke test: convert the Traditional Chinese subtitles to Simplified Chinese (t2s.json config)
# NOTE(review): the input /opt/whisper/testOutput.srt is not created anywhere in this snippet;
# presumably these steps are appended to the whisper.cpp Dockerfile that generates it - confirm.
RUN opencc -i /opt/whisper/testOutput.srt -o /opt/whisper/output_simplified.srt -c t2s.json
如果wav或其他音频文件的格式不符合要求,需要先用ffmpeg转换一下(下面的命令会转成16kHz、单声道的wav)
// Re-encode the input audio as a 16 kHz (-ar 16000), single-channel (-ac 1) WAV file
// written to tempDir + baseName + ".wav".
// The extra options "-acodec pcm_s16le -f s16le" were commented out in the original
// snippet and stay disabled here.
String[] ffmpegArguments = {
        ffmpegPath,
        "-i", audioFile.getAbsolutePath(),
        "-ar", "16000",
        "-ac", "1",
        tempDir + baseName + ".wav"
};
List<String> commandList = new ArrayList<>();
for (String argument : ffmpegArguments) {
    commandList.add(argument);
}
FfmpegUtil.exec(commandList);
/**
 * Runs an external command and blocks until it has finished.
 *
 * <p>The process's standard output and standard error are each drained on their own
 * thread and forwarded to the logger line by line, so the child cannot stall on a full
 * pipe buffer. After the process exits, the reader threads are given a short, bounded
 * time to flush whatever output is still buffered.
 *
 * <p>A non-zero exit code is logged as an error but does not throw; callers in this
 * project verify success by checking for the expected output file.
 *
 * @param commands the executable followed by its arguments, one list element per argument
 * @throws RuntimeException if the process cannot be started, or if the calling thread is
 *         interrupted while waiting (the interrupt flag is restored and the child process
 *         is destroyed before throwing)
 */
public static void exec(List<String> commands) {
    log.info("执行的FFmpeg命令: {}", String.join(" ", commands));
    Process process = null;
    try {
        process = new ProcessBuilder(commands).start();
        Thread stdoutPump = startStreamLogger(
                new BufferedReader(new InputStreamReader(process.getInputStream())), "Output: ");
        Thread stderrPump = startStreamLogger(
                new BufferedReader(new InputStreamReader(process.getErrorStream())), "Error: ");

        // Blocks the current thread until the command has finished.
        int exitCode = process.waitFor();

        // Bounded wait: lets the readers finish the tail of the output without hanging
        // forever if some grandchild process keeps the pipe open.
        final long drainTimeoutMillis = 5000L;
        stdoutPump.join(drainTimeoutMillis);
        stderrPump.join(drainTimeoutMillis);

        if (exitCode != 0) {
            log.error("命令执行失败, 退出码: {}, 命令: {}", exitCode, String.join(" ", commands));
        }
    } catch (IOException e) {
        throw new RuntimeException(e);
    } catch (InterruptedException e) {
        if (process != null) {
            // Do not leave an orphaned child process running after we give up waiting.
            process.destroy();
        }
        // Restore the interrupt flag so code further up the stack can observe it.
        Thread.currentThread().interrupt();
        throw new RuntimeException(e);
    }
}

/**
 * Starts a thread that forwards every line read from {@code source} to the logger.
 *
 * @param source reader over one of the process's output streams; closed by the thread
 * @param prefix text placed in front of every logged line
 * @return the started thread
 */
private static Thread startStreamLogger(BufferedReader source, String prefix) {
    Thread pump = new Thread(() -> {
        try (BufferedReader reader = source) {
            String line;
            while ((line = reader.readLine()) != null) {
                log.info("{}{}", prefix, line);
            }
        } catch (IOException e) {
            log.warn("读取命令输出失败: {}", e.getMessage());
        }
    });
    pump.start();
    return pump;
}
得到正确的wav音频文件后,即可生成字幕;字幕中可能出现同音但不正确的字,需要再调用DeepSeek修正一下
package com.ruoyi.image.utils;
import cn.hutool.core.io.FileUtil;
import com.google.gson.Gson;
import com.ruoyi.common.core.exception.ServiceException;
import com.ruoyi.common.core.utils.deepseek.DeepSeekChat;
import com.ruoyi.image.utils.dto.SrtBlockDto;
import com.ruoyi.image.utils.ffmpeg.FfmpegUtil;
import java.io.File;
import java.util.ArrayList;
import java.util.HashMap;
import java.util.List;
import java.util.Map;
/**
 * Generates Simplified Chinese SRT subtitles from a WAV file.
 *
 * <p>Pipeline: {@code whisper-cli} transcribes the audio to an SRT file (Traditional
 * Chinese, according to this project's notes), {@code opencc} converts it to Simplified
 * Chinese, and DeepSeek is then asked to fix homophone mistakes in the subtitle text.
 * Both command-line tools must be on the {@code PATH}.
 */
public class SubtitlesUtil {

    /** Model location used in the production container. */
    private static final String DEFAULT_WHISPER_MODEL_PATH =
            "/opt/whisper/whisper.cpp-master/build/models/ggml-base.bin";

    /** Optional JVM system property that overrides the model path (e.g. on a dev machine). */
    private static final String WHISPER_MODEL_PROPERTY = "whisper.model.path";

    private static final String WAV_EXTENSION = ".wav";
    private static final String SRT_EXTENSION = ".srt";

    /** System prompt sent to DeepSeek together with the subtitle JSON. */
    private static final String TYPO_CORRECTION_PROMPT =
            "我发给你json格式文件内容,其中有同音但是字词不符合语境的句子,你原位置替换为正确的字词后,在把内容返回给我,返回内容不要makdown格式,返回json,最外层的map key为'data'";

    /**
     * Generates subtitles for the given audio file.
     *
     * @param audioPath path of an existing {@code .wav} file
     * @return path of the corrected Simplified Chinese SRT file, written next to the audio
     * @throws ServiceException if the audio file does not exist or a subtitle file was not produced
     * @throws IllegalArgumentException if {@code audioPath} does not end with {@code .wav}
     */
    public static String generateSubtitles(String audioPath) {
        if (audioPath == null || !new File(audioPath).exists()) {
            throw new ServiceException("生成字幕输入音频不存在");
        }

        String traditionalChineseSrt = convertToSrtPath(audioPath, "traditionalChinese");
        // whisper-cli takes the output path WITHOUT the extension. Strip only the trailing
        // ".srt"; a plain replace() would also mangle any ".srt" elsewhere in the path.
        String whisperOutputBase = traditionalChineseSrt.substring(
                0, traditionalChineseSrt.length() - SRT_EXTENSION.length());

        List<String> whisperCommand = new ArrayList<>();
        whisperCommand.add("whisper-cli");
        whisperCommand.add("-m");
        whisperCommand.add(System.getProperty(WHISPER_MODEL_PROPERTY, DEFAULT_WHISPER_MODEL_PATH));
        whisperCommand.add("-f");
        whisperCommand.add(audioPath);
        whisperCommand.add("-l");
        whisperCommand.add("zh");
        whisperCommand.add("-of");
        whisperCommand.add(whisperOutputBase);
        whisperCommand.add("-osrt");
        FfmpegUtil.exec(whisperCommand);
        if (!new File(traditionalChineseSrt).exists()) {
            // Fail here rather than letting opencc run against a missing input file.
            throw new ServiceException("生成字幕失败");
        }

        String simplifiedChinese = convertToSrtPath(audioPath, "simplifiedChinese");
        List<String> openccCommand = new ArrayList<>();
        openccCommand.add("opencc");
        openccCommand.add("-i");
        openccCommand.add(traditionalChineseSrt);
        openccCommand.add("-o");
        openccCommand.add(simplifiedChinese);
        openccCommand.add("-c");
        openccCommand.add("t2s.json");
        FfmpegUtil.exec(openccCommand);

        File simplifiedChineseFile = new File(simplifiedChinese);
        if (!simplifiedChineseFile.exists()) {
            throw new ServiceException("生成字幕失败");
        }

        // Fix homophone typos in place.
        correctTypos(simplifiedChineseFile.getAbsolutePath());
        return simplifiedChinese;
    }

    /**
     * Derives an SRT path from an audio path: same directory, file name
     * {@code <baseName>_<srtFileNameSuffix>.srt}.
     *
     * @param audioPath path ending in {@code .wav} (case-insensitive)
     * @param srtFileNameSuffix suffix appended to the base name, separated by an underscore
     * @return the SRT path; a bare file name when {@code audioPath} has no directory part
     * @throws IllegalArgumentException if {@code audioPath} is null or is not a {@code .wav} path
     */
    public static String convertToSrtPath(String audioPath, String srtFileNameSuffix) {
        boolean isWav = audioPath != null && audioPath.regionMatches(
                true, audioPath.length() - WAV_EXTENSION.length(), WAV_EXTENSION, 0, WAV_EXTENSION.length());
        if (!isWav) {
            throw new IllegalArgumentException("无效的音频文件路径: " + audioPath);
        }
        File audioFile = new File(audioPath);
        String fileName = audioFile.getName();
        String baseName = fileName.substring(0, fileName.lastIndexOf('.'));
        String newFileName = baseName + "_" + srtFileNameSuffix + SRT_EXTENSION;
        // File(parent, child) copes with a null parent (relative file name) and does not
        // double the separator when the parent is a root directory.
        return new File(audioFile.getParent(), newFileName).getPath();
    }

    /**
     * Asks DeepSeek to fix homophone mistakes and rewrites the SRT file in place.
     *
     * <p>A block keeps its original text when the reply has no non-blank text for its
     * sequence number, so a partial reply can never blank out subtitles.
     *
     * @param simplifiedChineseSrtPath path of the SRT file to correct
     * @throws ServiceException if the file contains no subtitle blocks or the reply is unusable
     */
    public static void correctTypos(String simplifiedChineseSrtPath) {
        List<SrtBlockDto> srtBlockDtos = SrtParserUtil.parseSrtFile(new File(simplifiedChineseSrtPath));
        if (srtBlockDtos == null || srtBlockDtos.isEmpty()) {
            throw new ServiceException("格式化字幕为空");
        }

        List<Map<String, String>> payload = new ArrayList<>();
        for (SrtBlockDto block : srtBlockDtos) {
            Map<String, String> entry = new HashMap<>();
            entry.put("sequence", String.valueOf(block.getSequence()));
            entry.put("text", block.getText());
            payload.add(entry);
        }

        Gson gson = new Gson();
        String reply = DeepSeekChat.chat(TYPO_CORRECTION_PROMPT, gson.toJson(payload), "json_object");
        Map<String, String> correctedBySequence = parseCorrections(gson, reply);

        for (SrtBlockDto block : srtBlockDtos) {
            String corrected = correctedBySequence.get(String.valueOf(block.getSequence()));
            if (corrected != null && !corrected.trim().isEmpty()) {
                block.setText(corrected);
            }
        }
        SrtParserUtil.writeToFile(srtBlockDtos, simplifiedChineseSrtPath);
    }

    /** Turns the DeepSeek reply ({@code {"data":[{"sequence":..,"text":..}]}}) into a sequence-to-text map. */
    private static Map<String, String> parseCorrections(Gson gson, String reply) {
        if (reply == null || reply.trim().isEmpty()) {
            throw new ServiceException("字幕纠错失败: DeepSeek返回内容为空");
        }
        Map<?, ?> root = gson.fromJson(reply, Map.class);
        Object data = root == null ? null : root.get("data");
        if (!(data instanceof List)) {
            throw new ServiceException("字幕纠错失败: DeepSeek返回内容格式不正确");
        }
        Map<String, String> corrections = new HashMap<>();
        for (Object item : (List<?>) data) {
            if (!(item instanceof Map)) {
                continue;
            }
            Map<?, ?> entry = (Map<?, ?>) item;
            Object sequence = entry.get("sequence");
            Object text = entry.get("text");
            if (sequence != null && text != null) {
                corrections.put(normalizeSequence(sequence), String.valueOf(text));
            }
        }
        return corrections;
    }

    /** Normalizes a sequence value; Gson decodes a JSON number 1 as the Double 1.0, which must still match "1". */
    private static String normalizeSequence(Object sequence) {
        if (sequence instanceof Number) {
            return String.valueOf(((Number) sequence).longValue());
        }
        return String.valueOf(sequence).trim();
    }

    /**
     * Manual entry point: corrects typos in the SRT file given as the first argument,
     * or in the original sample file when no argument is passed.
     *
     * @param args optional; {@code args[0]} is the SRT path
     */
    public static void main(String[] args) {
        String srtPath = (args != null && args.length > 0)
                ? args[0]
                : "C:\\Users\\Administrator\\Desktop\\ss\\fdf2e20f-68e3-4407-967c-1fec087b4973.srt";
        SubtitlesUtil.correctTypos(srtPath);
    }
}
/**
 * HTTP client shared by all DeepSeek calls. OkHttp recommends reusing one client,
 * because every client owns its own connection pool and threads.
 */
private static final OkHttpClient DEEPSEEK_HTTP_CLIENT = new OkHttpClient.Builder()
        .readTimeout(180, TimeUnit.SECONDS)
        .writeTimeout(180, TimeUnit.SECONDS)
        .connectTimeout(30, TimeUnit.SECONDS)
        .build();

/**
 * Sends one system + user message pair to the DeepSeek chat-completions endpoint
 * (model {@code deepseek-chat}, non-streaming) and returns the reply text.
 *
 * @param systemRole system prompt
 * @param content user message
 * @param responseFormat value for {@code response_format.type}, e.g. {@code "json_object"};
 *        inserted into the request as-is, so it must not contain JSON special characters
 * @return the {@code content} of the first choice, or {@code null} when the reply carries
 *         no choice, message or textual content
 * @throws ServiceException if the server answers with a non-2xx status
 * @throws RuntimeException if the HTTP call fails with an I/O error
 */
public static String chat(String systemRole,String content,String responseFormat){
    MediaType mediaType = MediaType.parse("application/json");
    String requestBody = "{\n \"model\": \"deepseek-chat\",\n \"messages\": [\n {\n \"role\": \"system\",\n \"content\": \""
            + escapeContent(systemRole)
            + "\"\n },\n {\n \"role\": \"user\",\n \"content\": \""
            + escapeContent(content)
            + "\"\n }\n ],\n \"stream\": false,\"response_format\": { \"type\": \""
            + responseFormat
            + "\" }\n}";
    RequestBody body = RequestBody.create(mediaType, requestBody);
    logger.info("DeepSeek请求参数:{}", requestBody);

    // Prefer a key supplied through the environment so the secret need not live in source control.
    String apiKey = System.getenv("DEEPSEEK_API_KEY");
    if (apiKey == null || apiKey.trim().isEmpty()) {
        // NOTE(review): legacy hard-coded key kept only for backward compatibility;
        // rotate it and remove this fallback once DEEPSEEK_API_KEY is configured everywhere.
        apiKey = "sk-a49c86";
    }

    Request request = new Request.Builder()
            .url("https://api.deepseek.com/chat/completions")
            .method("POST", body)
            .addHeader("Authorization", "Bearer " + apiKey)
            .addHeader("Content-Type", "application/json")
            .build();

    // try-with-resources: an OkHttp Response must be closed or its connection is leaked.
    try (Response response = DEEPSEEK_HTTP_CLIENT.newCall(request).execute()) {
        if (!response.isSuccessful()) {
            logger.error("调用DeepSeek出错,返回结果:{}", response);
            throw new ServiceException("请求异常,请联系管理员");
        }
        String payload = response.body() == null ? null : response.body().string();
        if (payload == null) {
            return null;
        }
        Map<String, Object> map = gson.fromJson(payload, new TypeToken<Map<String, Object>>() {
        }.getType());
        if (map == null) {
            return null;
        }
        // Walk the reply defensively: an unexpected shape yields null, not a cast failure.
        Object choices = map.get("choices");
        if (!(choices instanceof List) || ((List<?>) choices).isEmpty()) {
            return null;
        }
        Object firstChoice = ((List<?>) choices).get(0);
        if (!(firstChoice instanceof Map)) {
            return null;
        }
        Object message = ((Map<?, ?>) firstChoice).get("message");
        if (!(message instanceof Map)) {
            return null;
        }
        Object text = ((Map<?, ?>) message).get("content");
        return text instanceof String ? (String) text : null;
    } catch (IOException e) {
        throw new RuntimeException(e);
    }
}
package com.ruoyi.image.utils.dto;
import lombok.Data;
/**
 * One subtitle block (cue) of an SRT file.
 *
 * <p>Accessors are generated by Lombok's {@code @Data}.
 */
@Data
public class SrtBlockDto {
/**
 * Sequence number of the block.
 */
private Integer sequence;
/**
 * Start time in milliseconds.
 */
private Long startTime;
/**
 * End time in milliseconds.
 */
private Long endTime;
/**
 * Original time line as text, e.g. "00:00:00,000 --> 00:00:02,320".
 */
private String timeString;
/**
 * Subtitle text (may span multiple lines).
 */
private String text;
}
package com.ruoyi.image.utils;
import cn.hutool.core.io.FileUtil;
import cn.hutool.core.util.StrUtil;
import com.ruoyi.image.utils.dto.SrtBlockDto;
import java.io.File;
import java.nio.charset.StandardCharsets;
import java.util.ArrayList;
import java.util.List;
/**
 * Reads and writes SRT subtitle files as lists of {@link SrtBlockDto}.
 */
public class SrtParserUtil {

    /** One line break followed by one or more empty or whitespace-only lines. */
    private static final String BLOCK_SEPARATOR_REGEX = "\\r?\\n(?:[ \\t]*\\r?\\n)+";

    /** Arrow between start and end time, with any amount of surrounding whitespace. */
    private static final String TIME_ARROW_REGEX = "\\s*-->\\s*";

    /**
     * Parses SRT text into subtitle blocks.
     *
     * <p>A leading byte-order mark is ignored, and blocks may be separated by any number
     * of blank (or whitespace-only) lines. Blocks that cannot be parsed are skipped.
     *
     * @param srtContent raw SRT text
     * @return the parsed blocks; empty when the content is blank
     */
    public static List<SrtBlockDto> parseSrtContent(String srtContent) {
        List<SrtBlockDto> blocks = new ArrayList<>();
        if (StrUtil.isBlank(srtContent)) {
            return blocks;
        }
        String normalized = srtContent;
        // A BOM (U+FEFF) would glue itself to the first sequence number and make it unparseable.
        if (normalized.charAt(0) == '\uFEFF') {
            normalized = normalized.substring(1);
        }
        normalized = normalized.trim();
        if (normalized.isEmpty()) {
            return blocks;
        }
        for (String blockStr : normalized.split(BLOCK_SEPARATOR_REGEX)) {
            SrtBlockDto block = parseSingleBlock(blockStr);
            if (block != null) {
                blocks.add(block);
            }
        }
        return blocks;
    }

    /**
     * Reads a UTF-8 SRT file and parses it.
     *
     * @param srtFile SRT file
     * @return the parsed blocks
     */
    public static List<SrtBlockDto> parseSrtFile(File srtFile) {
        String content = FileUtil.readUtf8String(srtFile);
        return parseSrtContent(content);
    }

    /**
     * Parses a single block: sequence line, time line, then one or more text lines.
     *
     * @param blockStr raw text of one block
     * @return the block, or {@code null} when it has fewer than three lines or a number is malformed
     */
    private static SrtBlockDto parseSingleBlock(String blockStr) {
        String[] lines = blockStr.split("\\r?\\n");
        if (lines.length < 3) {
            return null; // incomplete block
        }
        try {
            SrtBlockDto block = new SrtBlockDto();
            // 1. sequence number
            block.setSequence(Integer.parseInt(lines[0].trim()));
            // 2. time line; the original text is kept so it can be written back unchanged
            String timeLine = lines[1].trim();
            block.setTimeString(timeLine);
            String[] times = timeLine.split(TIME_ARROW_REGEX);
            if (times.length == 2) {
                block.setStartTime(parseTimeToMillis(times[0]));
                block.setEndTime(parseTimeToMillis(times[1]));
            }
            // 3. text, possibly several lines
            StringBuilder textBuilder = new StringBuilder();
            for (int i = 2; i < lines.length; i++) {
                if (textBuilder.length() > 0) {
                    textBuilder.append("\n");
                }
                textBuilder.append(lines[i]);
            }
            block.setText(textBuilder.toString());
            return block;
        } catch (NumberFormatException e) {
            // Best effort: report the broken block and carry on with the rest of the file.
            System.err.println("解析字幕块失败: " + blockStr);
            e.printStackTrace();
            return null;
        }
    }

    /**
     * Converts a timestamp to milliseconds.
     *
     * @param timeStr "00:00:00,000" or "00:00:00.000"
     * @return milliseconds, or 0 when the string has fewer than four numeric fields
     * @throws NumberFormatException if a field is not a number
     */
    private static Long parseTimeToMillis(String timeStr) {
        // Accept either comma or dot as the millisecond separator.
        String normalized = timeStr.trim().replace(',', '.');
        String[] parts = normalized.split("[:.]");
        if (parts.length >= 4) {
            long hours = Long.parseLong(parts[0]);
            long minutes = Long.parseLong(parts[1]);
            long seconds = Long.parseLong(parts[2]);
            long millis = Long.parseLong(parts[3]);
            return hours * 3600000 + minutes * 60000 + seconds * 1000 + millis;
        }
        return 0L;
    }

    /**
     * Returns the blocks whose sequence number lies in {@code [startSeq, endSeq]}.
     *
     * @param blocks all blocks
     * @param startSeq first sequence number (inclusive)
     * @param endSeq last sequence number (inclusive)
     * @return matching blocks in their original order; blocks without a sequence number are skipped
     */
    public static List<SrtBlockDto> getBlocksBySequence(List<SrtBlockDto> blocks, int startSeq, int endSeq) {
        List<SrtBlockDto> result = new ArrayList<>();
        for (SrtBlockDto block : blocks) {
            Integer sequence = block.getSequence();
            if (sequence != null && sequence >= startSeq && sequence <= endSeq) {
                result.add(block);
            }
        }
        return result;
    }

    // ==================== Writing ====================

    /**
     * Writes the blocks to a file as UTF-8.
     *
     * @param blocks subtitle blocks
     * @param outputFile target file
     */
    public static void writeToFile(List<SrtBlockDto> blocks, File outputFile) {
        String content = formatToSrtContent(blocks);
        FileUtil.writeString(content, outputFile, StandardCharsets.UTF_8);
    }

    /**
     * Writes the blocks to a file using the given charset.
     *
     * @param blocks subtitle blocks
     * @param outputFile target file
     * @param charsetName charset name, e.g. "GBK"
     */
    public static void writeToFile(List<SrtBlockDto> blocks, File outputFile, String charsetName) {
        String content = formatToSrtContent(blocks);
        FileUtil.writeString(content, outputFile, charsetName);
    }

    /**
     * Writes the blocks to the file at the given path as UTF-8.
     *
     * @param blocks subtitle blocks
     * @param outputPath target file path
     */
    public static void writeToFile(List<SrtBlockDto> blocks, String outputPath) {
        writeToFile(blocks, new File(outputPath));
    }

    /**
     * Formats the blocks as SRT text.
     *
     * <p>Sequence numbers are regenerated as 1..n from list order; the stored sequence is
     * ignored. A null time line or text is written as an empty string, never as "null".
     *
     * @param blocks subtitle blocks
     * @return SRT text; empty when {@code blocks} is null or empty
     */
    public static String formatToSrtContent(List<SrtBlockDto> blocks) {
        if (blocks == null || blocks.isEmpty()) {
            return "";
        }
        StringBuilder sb = new StringBuilder();
        for (int i = 0; i < blocks.size(); i++) {
            SrtBlockDto block = blocks.get(i);
            String timeString = block.getTimeString();
            String text = block.getText();
            sb.append(i + 1).append("\n");
            sb.append(timeString == null ? "" : timeString).append("\n");
            sb.append(text == null ? "" : text);
            // Blank line between blocks, none after the last one.
            if (i < blocks.size() - 1) {
                sb.append("\n\n");
            }
        }
        return sb.toString();
    }

    /**
     * Renumbers the blocks in place, consecutively from 1.
     *
     * @param blocks subtitle blocks
     */
    public static void renumberBlocks(List<SrtBlockDto> blocks) {
        for (int i = 0; i < blocks.size(); i++) {
            blocks.get(i).setSequence(i + 1);
        }
    }
}