使用 whisper.cpp 和 OpenCC 从音频文件生成字幕

whisper:将指定格式的wav音频文件(16kHz、单声道)生成srt字幕;但生成的字幕是繁体中文,需要用OpenCC转成简体中文

开源项目地址:https://github.com/ggml-org/whisper.cpp

windows二进制包

在开源项目地址releases中下载

下载后将bin目录添加到Path环境变量中

Docker/Linux

复制代码

FROM openjdk:8-jdk
# Install the build toolchain (compiler, cmake, unzip).
# The two sed calls rewrite the apt sources to the Aliyun mirror before updating;
# the apt lists are removed afterwards.
RUN sed -i 's/deb.debian.org/mirrors.aliyun.com/g' /etc/apt/sources.list && \
    sed -i 's/security.debian.org/mirrors.aliyun.com/g' /etc/apt/sources.list && \
    apt-get update && apt-get install -y \
    build-essential \
    cmake \
    unzip \
    && rm -rf /var/lib/apt/lists/*

# Install and build whisper.cpp here
# ========== Added: copy and build whisper.cpp ==========
# Create the whisper working directory
RUN mkdir -p /opt/whisper
# Copy the source zip from the build context into the container
COPY ./whisper/whisper.cpp-master.zip /tmp/
# Unpack the zip into the working directory, then delete the archive
RUN unzip /tmp/whisper.cpp-master.zip -d /opt/whisper/ && \
    rm /tmp/whisper.cpp-master.zip

# Build whisper.cpp: configure a Release build and compile only the whisper-cli target
RUN cd /opt/whisper/whisper.cpp-master && \
mkdir -p build && \
cd build && \
cmake .. -DCMAKE_BUILD_TYPE=Release && \
make -j$(nproc) whisper-cli

# Create the models directory
RUN mkdir -p /opt/whisper/whisper.cpp-master/build/models
# Copy the model file into the models directory inside the container
COPY ./whisper/models/ggml-base.bin /opt/whisper/whisper.cpp-master/build/models/

# Put the build output directory on PATH so whisper-cli can be called by name
ENV WHISPER_PATH=/opt/whisper/whisper.cpp-master/build/bin
ENV PATH="${WHISPER_PATH}:${PATH}"

# Smoke test at image build time: transcribe a sample WAV as Chinese (-l zh)
# using /opt/whisper/testOutput as the output base name with SRT output enabled.
COPY ./whisper/test/test.wav /opt/whisper/
RUN whisper-cli -m /opt/whisper/whisper.cpp-master/build/models/ggml-base.bin -f /opt/whisper/test.wav -l zh -of /opt/whisper/testOutput -osrt
# Print the generated subtitle file to the build log
RUN cat /opt/whisper/testOutput.srt

模型文件下载地址

https://huggingface.co/ggerganov/whisper.cpp/tree/main

需指定模型文件路径

-m /opt/whisper/whisper.cpp-master/build/models/ggml-base.bin

OpenCC:将繁体转换为简体

开源项目地址: https://github.com/BYVoid/OpenCC?tab=readme-ov-file

windows

windows二进制包下载地址在开源项目地址中有展示

下载后将bin目录添加到Path环境变量中

Docker/Linux

复制代码

FROM openjdk:8-jdk
# Install the build toolchain (compiler, cmake, unzip).
# The two sed calls rewrite the apt sources to the Aliyun mirror before updating;
# the apt lists are removed afterwards.
RUN sed -i 's/deb.debian.org/mirrors.aliyun.com/g' /etc/apt/sources.list && \
    sed -i 's/security.debian.org/mirrors.aliyun.com/g' /etc/apt/sources.list && \
    apt-get update && apt-get install -y \
    build-essential \
    cmake \
    unzip \
    && rm -rf /var/lib/apt/lists/*

# Install OpenCC
# Create the opencc working directory
RUN mkdir -p /opt/opencc
# Copy the source zip from the build context into the container
COPY ./opencc/OpenCC-master.zip /tmp/
# Unpack the zip into the working directory, then delete the archive
RUN unzip /tmp/OpenCC-master.zip -d /opt/opencc/ && \
    rm /tmp/OpenCC-master.zip

# Build OpenCC, install it system-wide and refresh the shared-library cache
RUN cd /opt/opencc/OpenCC-master && \
mkdir -p build && \
cd build && \
cmake .. && \
make -j$(nproc) && \
make install && \
    ldconfig

# Add the build directory to PATH
ENV OPENCC_PATH=/opt/opencc/OpenCC-master/build/bin
ENV PATH="${OPENCC_PATH}:${PATH}"

# Test run: convert the traditional-Chinese SRT to simplified Chinese (t2s.json).
# NOTE(review): the input /opt/whisper/testOutput.srt is not created anywhere in this
# snippet; presumably these steps are appended to the whisper Dockerfile - confirm.
RUN opencc -i /opt/whisper/testOutput.srt -o /opt/whisper/output_simplified.srt -c t2s.json

如果wav或其他音频格式不正确,需要用ffmpeg转一下

复制代码

// Build the ffmpeg command that resamples the input audio to a 16 kHz, single-channel WAV.
List<String> commandList = new ArrayList<>();
commandList.add(ffmpegPath);
// Overwrite an existing output file instead of prompting. FfmpegUtil.exec never writes to
// the child's stdin, so ffmpeg's interactive overwrite prompt would otherwise block forever.
commandList.add("-y");
commandList.add("-i");
commandList.add(audioFile.getAbsolutePath());
// Sample rate: 16000 Hz
commandList.add("-ar");
commandList.add("16000");
// Channel count: 1 (mono)
commandList.add("-ac");
commandList.add("1");
// The explicit "-acodec pcm_s16le" / "-f s16le" options were left disabled by the original
// author; the output container is chosen from the ".wav" extension below.
// NOTE(review): tempDir is concatenated directly, so it must already end with a path separator.
commandList.add(tempDir + baseName + ".wav");
FfmpegUtil.exec(commandList);


/**
     * Runs an external command and blocks until it has exited.
     *
     * <p>The child's standard output and standard error are each drained on their own thread
     * and echoed to {@code System.out} with an {@code "Output: "} / {@code "Error: "} prefix.
     * Both threads are joined before this method returns, so all output has been printed by
     * then. A non-zero exit code is logged as a warning; it is not turned into an exception,
     * so callers must still verify the expected result (for example the output file).
     *
     * @param commands the executable followed by its arguments, one list element per argument
     * @throws RuntimeException if the process cannot be started, or if the calling thread is
     *                          interrupted while waiting (the interrupt flag is restored first)
     */
    public static void exec(List<String> commands) {

        log.info("执行的FFmpeg命令: {}", String.join(" ", commands));

        try {
            ProcessBuilder pb = new ProcessBuilder(commands);
            Process process = pb.start();

            // Drain both pipes concurrently; an undrained pipe can fill up and stall the child.
            Thread stdoutPump = startPump(process.getInputStream(), "Output: ");
            Thread stderrPump = startPump(process.getErrorStream(), "Error: ");

            // Block the current thread until the command has finished.
            int exitCode = process.waitFor();

            // Wait for the readers too, so no output is still being printed after we return.
            stdoutPump.join();
            stderrPump.join();

            if (exitCode != 0) {
                log.warn("命令执行失败, 退出码: {}, 命令: {}", exitCode, String.join(" ", commands));
            }
        } catch (IOException e) {
            throw new RuntimeException(e);
        } catch (InterruptedException e) {
            // Restore the interrupt flag so code further up the stack can still observe it.
            Thread.currentThread().interrupt();
            throw new RuntimeException(e);
        }
    }

    /**
     * Starts a thread that copies every line of {@code stream} to {@code System.out},
     * prefixed with {@code prefix}, and closes the stream when it ends.
     */
    private static Thread startPump(java.io.InputStream stream, String prefix) {
        Thread pump = new Thread(() -> {
            try (BufferedReader reader = new BufferedReader(new InputStreamReader(stream))) {
                String line;
                while ((line = reader.readLine()) != null) {
                    System.out.println(prefix + line);
                }
            } catch (IOException e) {
                log.error("读取命令输出失败", e);
            }
        });
        pump.start();
        return pump;
    }

得到正确的wav音频文件后,即可生成字幕。字幕中可能出现同音但不正确的字,需要再调用deepseek修正一下

复制代码

package com.ruoyi.image.utils;

import cn.hutool.core.io.FileUtil;
import com.google.gson.Gson;
import com.ruoyi.common.core.exception.ServiceException;
import com.ruoyi.common.core.utils.deepseek.DeepSeekChat;
import com.ruoyi.image.utils.dto.SrtBlockDto;
import com.ruoyi.image.utils.ffmpeg.FfmpegUtil;

import java.io.File;
import java.util.ArrayList;
import java.util.HashMap;
import java.util.List;
import java.util.Map;

/**
 * Generates simplified-Chinese SRT subtitles for a WAV file.
 *
 * <p>The pipeline is: {@code whisper-cli} transcribes the audio to an SRT file,
 * {@code opencc} converts that file from traditional to simplified Chinese, and finally
 * DeepSeek is asked to fix homophone errors in the subtitle text. Both command-line tools
 * are expected to be on the {@code PATH}.
 */
public class SubtitlesUtil {

    /**
     * Model file passed to {@code whisper-cli -m}. This is the path used inside the
     * production container; local development needs a different path.
     */
    private static final String WHISPER_MODEL_PATH = "/opt/whisper/whisper.cpp-master/build/models/ggml-base.bin";

    /** Extension of the generated subtitle files. */
    private static final String SRT_EXTENSION = ".srt";

    /**
     * Generates a corrected, simplified-Chinese SRT file next to the given audio file.
     *
     * @param audioPath path of an existing {@code .wav} file
     * @return path of the simplified-Chinese SRT file
     * @throws ServiceException         if the audio file does not exist, or no subtitle file
     *                                  was produced
     * @throws IllegalArgumentException if {@code audioPath} does not end with {@code .wav}
     */
    public static String generateSubtitles(String audioPath){
        if (audioPath == null || !new File(audioPath).exists()) {
            throw new ServiceException("生成字幕输入音频不存在");
        }

        String traditionalChineseSrt = convertToSrtPath(audioPath, "traditionalChinese");
        // whisper-cli takes the output name WITHOUT extension (-of) and appends ".srt" itself.
        // Only the trailing extension is removed, so a ".srt" elsewhere in the path survives.
        String whisperOutputBase = traditionalChineseSrt.substring(
                0, traditionalChineseSrt.length() - SRT_EXTENSION.length());

        List<String> whisperCommand = new ArrayList<>();
        whisperCommand.add("whisper-cli");
        whisperCommand.add("-m");
        whisperCommand.add(WHISPER_MODEL_PATH);
        whisperCommand.add("-f");
        whisperCommand.add(audioPath);
        whisperCommand.add("-l");
        whisperCommand.add("zh");
        whisperCommand.add("-of");
        whisperCommand.add(whisperOutputBase);
        whisperCommand.add("-osrt");
        FfmpegUtil.exec(whisperCommand);

        String simplifiedChinese = convertToSrtPath(audioPath, "simplifiedChinese");

        List<String> openccCommand = new ArrayList<>();
        openccCommand.add("opencc");
        openccCommand.add("-i");
        openccCommand.add(traditionalChineseSrt);
        openccCommand.add("-o");
        openccCommand.add(simplifiedChinese);
        openccCommand.add("-c");
        openccCommand.add("t2s.json");
        FfmpegUtil.exec(openccCommand);

        // exec() does not fail on a non-zero exit code, so the output file is the success signal.
        File simplifiedChineseFile = new File(simplifiedChinese);
        if (!simplifiedChineseFile.exists()) {
            throw new ServiceException("生成字幕失败");
        }

        // Fix wrongly transcribed homophones.
        correctTypos(simplifiedChineseFile.getAbsolutePath());

        return simplifiedChinese;
    }

    /**
     * Derives the SRT path for an audio file: same directory, same base name, with
     * {@code _<srtFileNameSuffix>.srt} in place of the {@code .wav} extension.
     *
     * @param audioPath         path ending in {@code .wav} (case-insensitive)
     * @param srtFileNameSuffix tag appended to the base name, e.g. {@code simplifiedChinese}
     * @return the subtitle path, e.g. {@code /tmp/a.wav -> /tmp/a_simplifiedChinese.srt}
     * @throws IllegalArgumentException if {@code audioPath} is null or is not a {@code .wav} path
     */
    public static String convertToSrtPath(String audioPath,String srtFileNameSuffix) {
        if (audioPath == null || !audioPath.toLowerCase().endsWith(".wav")) {
            throw new IllegalArgumentException("无效的音频文件路径: " + audioPath);
        }

        File audioFile = new File(audioPath);
        String fileName = audioFile.getName();
        String baseName = fileName.substring(0, fileName.lastIndexOf('.'));
        String newFileName = baseName + "_" + srtFileNameSuffix + SRT_EXTENSION;

        // File(parent, child) copes with a null parent (bare file name), where plain string
        // concatenation would produce a path starting with the text "null".
        return new File(audioFile.getParent(), newFileName).getPath();
    }


    /**
     * Asks DeepSeek to fix homophone errors in an SRT file and rewrites the file in place.
     *
     * <p>Each block is sent as {@code {"sequence": ..., "text": ...}}; the reply is expected
     * to be a JSON object whose {@code data} entry is a list of the same shape. Blocks that
     * are missing from the reply keep their original text.
     *
     * @param simplifiedChineseSrtPath path of the SRT file to correct
     * @throws ServiceException if the file contains no subtitle blocks, or the reply contains
     *                          no usable corrections
     */
    public static void correctTypos(String simplifiedChineseSrtPath) {
        List<SrtBlockDto> srtBlockDtos = SrtParserUtil.parseSrtFile(new File(simplifiedChineseSrtPath));
        if(srtBlockDtos == null || srtBlockDtos.isEmpty()){
            throw new ServiceException("格式化字幕为空");
        }

        List<Map<String,String>> payload = new ArrayList<>();
        for (SrtBlockDto block : srtBlockDtos) {
            Map<String,String> item = new HashMap<>();
            item.put("sequence", String.valueOf(block.getSequence()));
            item.put("text", block.getText());
            payload.add(item);
        }

        Gson gson = new Gson();

        String chat = DeepSeekChat.chat("我发给你json格式文件内容,其中有同音但是字词不符合语境的句子,你原位置替换为正确的字词后,在把内容返回给我,返回内容不要makdown格式,返回json,最外层的map key为'data'"
                ,gson.toJson(payload),"json_object");

        Map<String,String> corrections = parseCorrections(gson, chat);
        if (corrections.isEmpty()) {
            // Nothing usable came back; leave the subtitle file untouched and report it.
            throw new ServiceException("字幕纠错结果为空");
        }

        for (SrtBlockDto block : srtBlockDtos) {
            String corrected = corrections.get(String.valueOf(block.getSequence()));
            // Keep the original text when the model skipped this block.
            if (corrected != null) {
                block.setText(corrected);
            }
        }

        SrtParserUtil.writeToFile(srtBlockDtos,simplifiedChineseSrtPath);
    }

    /**
     * Extracts a {@code sequence -> text} map from the model reply; returns an empty map when
     * the reply is blank or has no {@code data} list. Malformed entries are skipped.
     */
    private static Map<String,String> parseCorrections(Gson gson, String chat) {
        Map<String,String> corrections = new HashMap<>();
        if (chat == null || chat.trim().isEmpty()) {
            return corrections;
        }

        Map<?,?> root = gson.fromJson(chat, Map.class);
        Object data = root == null ? null : root.get("data");
        if (!(data instanceof List)) {
            return corrections;
        }

        for (Object element : (List<?>) data) {
            if (!(element instanceof Map)) {
                continue;
            }
            Map<?,?> entry = (Map<?,?>) element;
            String sequence = normalizeSequence(entry.get("sequence"));
            Object text = entry.get("text");
            if (sequence != null && text instanceof String) {
                corrections.put(sequence, (String) text);
            }
        }
        return corrections;
    }

    /**
     * Renders a sequence value as the plain integer string used as lookup key. Gson parses
     * untyped JSON numbers as Double, so a numeric 3 must become "3" rather than "3.0".
     */
    private static String normalizeSequence(Object raw) {
        if (raw instanceof Number) {
            return String.valueOf(((Number) raw).longValue());
        }
        return raw == null ? null : raw.toString().trim();
    }

    /** Manual entry point for local testing against a subtitle file on the developer machine. */
    public static void main(String[] args) {
        SubtitlesUtil.correctTypos("C:\\Users\\Administrator\\Desktop\\ss\\fdf2e20f-68e3-4407-967c-1fec087b4973.srt");
    }
}

/**
     * Sends a single non-streaming chat completion request to DeepSeek and returns the text
     * of the first choice.
     *
     * @param systemRole     content of the system message
     * @param content        content of the user message
     * @param responseFormat value for {@code response_format.type}, e.g. {@code json_object}
     * @return the content of the first choice's message, or {@code null} if the reply has no
     *         choices or no message
     * @throws ServiceException if the HTTP status is not successful
     * @throws RuntimeException wrapping the {@link IOException} if the call itself fails
     */
    public static String chat(String systemRole,String content,String responseFormat){
        OkHttpClient client = new OkHttpClient.Builder()
                .readTimeout(180, TimeUnit.SECONDS)
                .writeTimeout(180, TimeUnit.SECONDS)
                .connectTimeout(30, TimeUnit.SECONDS)
                .build();

        MediaType mediaType = MediaType.parse("application/json");
        String requestBody = "{\n  \"model\": \"deepseek-chat\",\n  \"messages\": [\n    {\n      \"role\": \"system\",\n      \"content\": \""+escapeContent(systemRole)+"\"\n    },\n    {\n      \"role\": \"user\",\n      \"content\": \""+escapeContent(content)+"\"\n    }\n  ],\n  \"stream\": false,\"response_format\": { \"type\": \""+responseFormat+"\" }\n}";
        RequestBody body = RequestBody.create(mediaType, requestBody);
        logger.info("DeepSeek请求参数：{}", requestBody);
        // NOTE(review): the API key is hard-coded in source; it should be loaded from
        // configuration or the environment instead of being committed.
        Request request = new Request.Builder()
                .url("https://api.deepseek.com/chat/completions")
                .method("POST", body)
                .addHeader("Authorization", "Bearer sk-a49c86")
                .addHeader("Content-Type", "application/json")
                .build();

        // try-with-resources closes the response (and its body) on every path, including
        // the error branch, so the underlying connection is always released.
        try (Response response = client.newCall(request).execute()) {
            if (!response.isSuccessful()) {
                logger.error("调用DeepSeek出错，返回结果：{}", response);
                throw new ServiceException("请求异常，请联系管理员");
            }
            Map<String, Object> reply = gson.fromJson(response.body().string(), new TypeToken<Map<String, Object>>() {
            }.getType());
            if (reply == null) {
                return null;
            }

            List<Map<String, Object>> choices = (List<Map<String, Object>>) reply.get("choices");
            if (choices == null || choices.isEmpty()) {
                return null;
            }
            Map<String, Object> message = (Map<String, Object>) choices.get(0).get("message");
            if (message == null) {
                return null;
            }
            return (String) message.get("content");
        } catch (IOException e) {
            throw new RuntimeException(e);
        }
    }

package com.ruoyi.image.utils.dto;

import lombok.Data;

/**
 * One subtitle block of an SRT file: sequence number, time range and text.
 */
@Data
public class SrtBlockDto {
    /**
     * Sequence number of the block.
     */
    private Integer sequence;

    /**
     * Start time in milliseconds.
     */
    private Long startTime;

    /**
     * End time in milliseconds.
     */
    private Long endTime;

    /**
     * Original time line, e.g. "00:00:00,000 --> 00:00:02,320".
     */
    private String timeString;

    /**
     * Subtitle text (may span several lines).
     */
    private String text;
}

package com.ruoyi.image.utils;

import cn.hutool.core.io.FileUtil;
import cn.hutool.core.util.StrUtil;
import com.ruoyi.image.utils.dto.SrtBlockDto;

import java.io.File;
import java.nio.charset.StandardCharsets;
import java.util.ArrayList;
import java.util.List;

/**
 * Reads and writes SRT subtitle files as lists of {@link SrtBlockDto}.
 */
public class SrtParserUtil {
    /**
     * Parses raw SRT text into subtitle blocks.
     *
     * <p>Blocks are separated by one or more blank (or whitespace-only) lines. A leading
     * byte-order mark is ignored. Blocks that are incomplete or cannot be parsed are skipped.
     *
     * @param srtContent raw content of an SRT file
     * @return the parsed blocks in file order; empty if the content is blank
     */
    public static List<SrtBlockDto> parseSrtContent(String srtContent) {
        List<SrtBlockDto> blocks = new ArrayList<>();

        if (StrUtil.isBlank(srtContent)) {
            return blocks;
        }

        // String.trim() does not remove a BOM, and a BOM in front of the first sequence
        // number would make that block fail to parse.
        String content = srtContent;
        if (content.charAt(0) == '\uFEFF') {
            content = content.substring(1);
        }

        // Split on a line break followed by at least one blank line, so that extra or
        // whitespace-only separator lines do not leak into the next block.
        String[] blockStrings = content.trim().split("\\r?\\n(?:[ \\t]*\\r?\\n)+");

        for (String blockStr : blockStrings) {
            SrtBlockDto block = parseSingleBlock(blockStr);
            if (block != null) {
                blocks.add(block);
            }
        }

        return blocks;
    }

    /**
     * Reads a file as UTF-8 and parses it.
     *
     * @param srtFile the SRT file
     * @return the parsed blocks
     */
    public static List<SrtBlockDto> parseSrtFile(File srtFile) {
        String content = FileUtil.readUtf8String(srtFile);
        return parseSrtContent(content);
    }

    /**
     * Parses one block: sequence line, time line, then one or more text lines.
     *
     * @param blockStr raw text of a single block
     * @return the block, or {@code null} if it has fewer than three lines or a number in it
     *         cannot be parsed
     */
    private static SrtBlockDto parseSingleBlock(String blockStr) {
        String[] lines = blockStr.split("\\r?\\n");

        if (lines.length < 3) {
            return null; // incomplete block
        }

        try {
            SrtBlockDto block = new SrtBlockDto();

            // 1. Sequence number
            block.setSequence(Integer.parseInt(lines[0].trim()));

            // 2. Time line; the original string is kept so it can be written back unchanged
            String timeLine = lines[1].trim();
            block.setTimeString(timeLine);

            // Whitespace around the arrow is optional
            String[] times = timeLine.split("\\s*-->\\s*");
            if (times.length == 2) {
                block.setStartTime(parseTimeToMillis(times[0]));
                block.setEndTime(parseTimeToMillis(times[1]));
            }

            // 3. Text, possibly several lines, joined with "\n"
            StringBuilder textBuilder = new StringBuilder();
            for (int i = 2; i < lines.length; i++) {
                if (textBuilder.length() > 0) {
                    textBuilder.append("\n");
                }
                textBuilder.append(lines[i]);
            }
            block.setText(textBuilder.toString());

            return block;

        } catch (NumberFormatException e) {
            // Skip the malformed block instead of aborting the whole file.
            System.err.println("解析字幕块失败: " + blockStr);
            e.printStackTrace();
            return null;
        }
    }

    /**
     * Converts a timestamp to milliseconds.
     *
     * @param timeStr timestamp such as "00:00:00,000" or "00:00:00.000"; surrounding
     *                whitespace is ignored
     * @return the time in milliseconds, or 0 if the string has fewer than four numeric fields
     */
    private static Long parseTimeToMillis(String timeStr) {
        // Accept either a comma or a dot before the milliseconds
        String normalized = timeStr.trim().replace(',', '.');
        String[] parts = normalized.split("[:.]");

        if (parts.length >= 4) {
            long hours = Long.parseLong(parts[0].trim());
            long minutes = Long.parseLong(parts[1].trim());
            long seconds = Long.parseLong(parts[2].trim());
            long millis = Long.parseLong(parts[3].trim());

            return hours * 3600000 + minutes * 60000 + seconds * 1000 + millis;
        }

        return 0L;
    }

    /**
     * Returns the blocks whose sequence number lies in a range.
     *
     * @param blocks   all blocks
     * @param startSeq first sequence number (inclusive)
     * @param endSeq   last sequence number (inclusive)
     * @return the matching blocks in their original order; blocks without a sequence
     *         number are ignored
     */
    public static List<SrtBlockDto> getBlocksBySequence(List<SrtBlockDto> blocks, int startSeq, int endSeq) {
        List<SrtBlockDto> result = new ArrayList<>();
        for (SrtBlockDto block : blocks) {
            Integer sequence = block.getSequence();
            // A null sequence would otherwise fail on unboxing.
            if (sequence != null && sequence >= startSeq && sequence <= endSeq) {
                result.add(block);
            }
        }
        return result;
    }

    // ==================== Writing ====================

    /**
     * Writes the blocks to a file as UTF-8.
     *
     * @param blocks     blocks to write
     * @param outputFile target file
     */
    public static void writeToFile(List<SrtBlockDto> blocks, File outputFile) {
        String content = formatToSrtContent(blocks);
        FileUtil.writeString(content, outputFile, StandardCharsets.UTF_8);
    }

    /**
     * Writes the blocks to a file using the given charset.
     *
     * @param blocks      blocks to write
     * @param outputFile  target file
     * @param charsetName charset name, e.g. "GBK"
     */
    public static void writeToFile(List<SrtBlockDto> blocks, File outputFile, String charsetName) {
        String content = formatToSrtContent(blocks);
        FileUtil.writeString(content, outputFile, charsetName);
    }

    /**
     * Writes the blocks to the file at the given path as UTF-8.
     *
     * @param blocks     blocks to write
     * @param outputPath target file path
     */
    public static void writeToFile(List<SrtBlockDto> blocks, String outputPath) {
        writeToFile(blocks, new File(outputPath));
    }

    /**
     * Formats blocks as SRT text.
     *
     * <p>Sequence numbers are regenerated as 1..n from the list order; the stored sequence is
     * not used. Blocks are separated by one blank line and nothing follows the last block.
     * A block whose text is {@code null} is written with empty text.
     *
     * @param blocks blocks to format
     * @return the SRT text; empty if {@code blocks} is null or empty
     */
    public static String formatToSrtContent(List<SrtBlockDto> blocks) {
        if (blocks == null || blocks.isEmpty()) {
            return "";
        }

        StringBuilder sb = new StringBuilder();

        for (int i = 0; i < blocks.size(); i++) {
            SrtBlockDto block = blocks.get(i);

            // Sequence number (regenerated, consecutive)
            sb.append(i + 1).append("\n");

            // Time line
            sb.append(block.getTimeString()).append("\n");

            // Text; appending null directly would write the literal "null" into the file
            String text = block.getText();
            sb.append(text == null ? "" : text);

            // No blank line after the last block
            if (i < blocks.size() - 1) {
                sb.append("\n\n");
            }
        }

        return sb.toString();
    }

    /**
     * Renumbers the blocks in place, consecutively from 1 in list order.
     *
     * @param blocks blocks to renumber
     */
    public static void renumberBlocks(List<SrtBlockDto> blocks) {
        for (int i = 0; i < blocks.size(); i++) {
            blocks.get(i).setSequence(i + 1);
        }
    }
}