Java 实现将 Word 转换成 Markdown

在日常开发中,经常需要将 Word 等各类文档的内容转换成结构化的标记语言。本文介绍如何使用 Java(基于 Apache POI)将 Word 文档转换成 Markdown。

1、引入 Maven 依赖(Apache POI)

复制代码
  <dependency>
            <groupId>org.apache.poi</groupId>
            <artifactId>poi-ooxml</artifactId>
            <version>5.2.3</version>
        </dependency>

2、编写转换工具类,完整代码如下

java 复制代码
package com.nd.ai.test.service.utils;

import com.alibaba.fastjson.JSONObject;
import com.nd.ai.test.service.dto.apitest.FileMarkdownDTO;
import org.apache.commons.compress.utils.Lists;
import org.apache.poi.xwpf.usermodel.*;

import java.io.*;
import java.nio.charset.StandardCharsets;
import java.util.ArrayList;
import java.util.List;
import java.util.UUID;

/**
 * Converts a Word ({@code .docx}) document to Markdown using Apache POI.
 *
 * <p>Body paragraphs (plain text, headings, list items), tables and embedded pictures are
 * handled. The Markdown file and an image directory are created next to the source document;
 * both are named after a freshly generated UUID.
 *
 * @author Administrator
 */
public class WordToMarkdownConverter {

    /** Style-id prefix used to recognise heading paragraphs (e.g. {@code Heading1}). */
    private static final String HEADING_STYLE_PREFIX = "Heading";

    /** Deepest heading level that Markdown can express. */
    private static final int MAX_HEADING_LEVEL = 6;

    /** Number of spaces emitted per nesting level of a list item. */
    private static final int LIST_INDENT_WIDTH = 4;

    /**
     * Converts the Word document at {@code wordPath} to Markdown.
     *
     * <p>The Markdown text is written (UTF-8) to {@code <uuid>.md} in the same directory as the
     * source file, and embedded pictures are saved to {@code <uuid>-images} in that directory.
     *
     * @param wordPath path of the {@code .docx} file to convert
     * @return a DTO holding the Markdown file path, the Markdown text, the saved image paths and
     *         the parser status {@code "success"}
     * @throws IOException if the document cannot be read or an output file cannot be written
     */
    public static FileMarkdownDTO convertWordToMarkdown(String wordPath) throws IOException {
        List<String> imagePathList = new ArrayList<>();
        FileMarkdownDTO dto = new FileMarkdownDTO();

        File file = new File(wordPath);
        // new File(parent, child) accepts a null parent, so a bare file name resolves against
        // the working directory instead of producing a "null/<uuid>.md" path.
        File parentDir = file.getParentFile();
        String uuid = UUID.randomUUID().toString();
        File outputFile = new File(parentDir, uuid + ".md");
        File imageDir = new File(parentDir, uuid + "-images");

        String markdownContent;
        // try-with-resources guarantees the input stream and the document are released even
        // when conversion fails half-way.
        try (InputStream in = new FileInputStream(file);
             XWPFDocument document = new XWPFDocument(in)) {

            if (!imageDir.mkdirs() && !imageDir.isDirectory()) {
                throw new IOException("Cannot create image directory: " + imageDir);
            }

            // Build the Markdown in memory so it can be returned without re-reading the file.
            StringWriter markdown = new StringWriter();
            boolean previousWasListItem = false;
            for (IBodyElement element : document.getBodyElements()) {
                boolean isListItem = element instanceof XWPFParagraph
                        && isListParagraph((XWPFParagraph) element);
                // A blank line must terminate a list, otherwise the next paragraph is absorbed
                // into the last list item by Markdown's lazy-continuation rule.
                if (previousWasListItem && !isListItem) {
                    markdown.write("\n");
                }
                if (element instanceof XWPFParagraph) {
                    processParagraph((XWPFParagraph) element, markdown, imageDir.getPath(), imagePathList);
                } else if (element instanceof XWPFTable) {
                    processTable((XWPFTable) element, markdown, imageDir.getPath(), imagePathList);
                }
                previousWasListItem = isListItem;
            }
            markdownContent = markdown.toString();
        }

        // Explicit UTF-8: the platform default charset may not be able to encode the text.
        try (Writer out = new OutputStreamWriter(new FileOutputStream(outputFile), StandardCharsets.UTF_8)) {
            out.write(markdownContent);
        }

        dto.setMdPath(outputFile.getPath());
        dto.setMarkdownContent(markdownContent);
        dto.setParserStatus("success");
        dto.setFileImagePathList(imagePathList);
        return dto;
    }

    /**
     * Writes one body paragraph as a heading, a list item or a plain paragraph.
     * Paragraphs without any text or picture are skipped.
     *
     * @param paragraph     paragraph to convert
     * @param writer        destination of the generated Markdown
     * @param imageDir      directory that receives embedded pictures
     * @param imageNamePath collects the paths of the pictures that were saved
     * @throws IOException if writing the Markdown or a picture fails
     */
    private static void processParagraph(XWPFParagraph paragraph, Writer writer, String imageDir,
                                         List<String> imageNamePath) throws IOException {
        String content = processParagraphContent(paragraph, imageDir, imageNamePath);
        if (content.isEmpty()) {
            return;
        }

        int level = headingLevel(paragraph.getStyle());
        if (level > 0) {
            StringBuilder heading = new StringBuilder();
            for (int i = 0; i < level; i++) {
                heading.append('#');
            }
            heading.append(' ').append(content).append("\n\n");
            writer.write(heading.toString());
        } else if (isListParagraph(paragraph)) {
            writer.write(getListMark(paragraph) + " " + content + "\n");
        } else {
            writer.write(content + "\n\n");
        }
    }

    /**
     * Derives the Markdown heading level from a paragraph style id.
     *
     * @param style style id of the paragraph, may be {@code null}
     * @return a level between 1 and 6 for ids of the form {@code Heading<digit>...}
     *         (digits above 6 are clamped to 6), or 0 when the style is not such a heading
     */
    private static int headingLevel(String style) {
        if (style == null
                || !style.startsWith(HEADING_STYLE_PREFIX)
                || style.length() <= HEADING_STYLE_PREFIX.length()) {
            return 0;
        }
        // Character.digit (unlike getNumericValue) rejects letters, so an id such as
        // "HeadingTitle" is not mistaken for a heading.
        int digit = Character.digit(style.charAt(HEADING_STYLE_PREFIX.length()), 10);
        if (digit < 1) {
            return 0;
        }
        return Math.min(digit, MAX_HEADING_LEVEL);
    }

    /**
     * Builds the Markdown list marker for a list paragraph: indentation for the nesting level
     * followed by {@code 1.} for ordered lists or {@code -} for every other list.
     * Markdown renderers renumber ordered items, so a constant {@code 1.} is sufficient.
     *
     * @param para list paragraph
     * @return the marker without a trailing space
     */
    private static String getListMark(XWPFParagraph para) {
        int indentLevel = para.getNumIlvl() != null ? para.getNumIlvl().intValue() : 0;
        StringBuilder mark = new StringBuilder();
        for (int i = 0; i < indentLevel * LIST_INDENT_WIDTH; i++) {
            mark.append(' ');
        }
        mark.append(isOrderedList(para) ? "1." : "-");
        return mark.toString();
    }

    /**
     * @param paragraph paragraph to inspect
     * @return {@code true} when the paragraph's numbering format is one of the recognised
     *         ordered or unordered list formats
     */
    private static boolean isListParagraph(XWPFParagraph paragraph) {
        return isOrderedList(paragraph) || isUnorderedList(paragraph);
    }

    /**
     * @param paragraph paragraph to inspect
     * @return {@code true} when the numbering format is decimal, roman or alphabetic
     */
    private static boolean isOrderedList(XWPFParagraph paragraph) {
        String numFmt = paragraph.getNumFmt();
        if (numFmt == null) {
            return false;
        }
        return "decimal".equals(numFmt) || "upperRoman".equals(numFmt) || "lowerRoman".equals(numFmt)
                || "upperLetter".equals(numFmt) || "lowerLetter".equals(numFmt);
    }

    /**
     * @param paragraph paragraph to inspect
     * @return {@code true} when the numbering format is {@code bullet}
     */
    private static boolean isUnorderedList(XWPFParagraph paragraph) {
        return "bullet".equals(paragraph.getNumFmt());
    }

    /**
     * Writes a table as a Markdown pipe table; the first row is used as the header row.
     * Paragraphs inside one cell are joined with {@code <br>} and literal {@code |} characters
     * are escaped so they cannot break the column layout.
     *
     * @param table         table to convert
     * @param writer        destination of the generated Markdown
     * @param imageDir      directory that receives embedded pictures
     * @param imageNamePath collects the paths of the pictures that were saved
     * @throws IOException if writing the Markdown or a picture fails
     */
    private static void processTable(XWPFTable table, Writer writer, String imageDir,
                                     List<String> imageNamePath) throws IOException {
        List<XWPFTableRow> rows = table.getRows();
        if (rows.isEmpty()) {
            return;
        }

        StringBuilder mdTable = new StringBuilder();
        for (int i = 0; i < rows.size(); i++) {
            List<XWPFTableCell> cells = rows.get(i).getTableCells();
            mdTable.append("|");

            for (XWPFTableCell cell : cells) {
                StringBuilder cellContent = new StringBuilder();
                for (XWPFParagraph para : cell.getParagraphs()) {
                    String text = processParagraphContent(para, imageDir, imageNamePath);
                    if (text.isEmpty()) {
                        continue;
                    }
                    if (cellContent.length() > 0) {
                        cellContent.append("<br>");
                    }
                    cellContent.append(text.replace("\n", "<br>").replace("|", "\\|"));
                }
                mdTable.append(cellContent).append("|");
            }
            mdTable.append("\n");

            // Header separator line directly after the first row.
            if (i == 0) {
                mdTable.append("|");
                for (int j = 0; j < cells.size(); j++) {
                    mdTable.append(" --- |");
                }
                mdTable.append("\n");
            }
        }
        // One blank line terminates the table.
        mdTable.append("\n");
        writer.write(mdTable.toString());
    }

    /**
     * Renders the inline content of a paragraph: embedded pictures are saved and emitted as
     * image links, text runs are emitted with their bold/italic/underline styling.
     * No list marker is added here; {@link #processParagraph} is the single place that does so.
     *
     * @param paragraph     paragraph to render
     * @param imageDir      directory that receives embedded pictures
     * @param imageNamePath collects the paths of the pictures that were saved
     * @return the trimmed inline Markdown, empty when the paragraph has no text or picture
     * @throws IOException if a picture cannot be written
     */
    private static String processParagraphContent(XWPFParagraph paragraph, String imageDir,
                                                  List<String> imageNamePath) throws IOException {
        StringBuilder sb = new StringBuilder();

        for (XWPFRun run : paragraph.getRuns()) {
            for (XWPFPicture picture : run.getEmbeddedPictures()) {
                sb.append(saveImage(picture, imageDir, imageNamePath)).append(" ");
            }
            String text = run.getText(0);
            if (text == null) {
                continue;
            }
            sb.append(applyTextStyles(run, text));
        }

        return sb.toString().trim();
    }

    /**
     * Wraps {@code text} in the Markdown markers matching the run's styling.
     * Surrounding whitespace is kept outside the markers because emphasis delimiters that
     * touch whitespace are not rendered; whitespace-only text is returned unchanged.
     * Underline is emitted as {@code <u>} because {@code __text__} means bold in Markdown.
     *
     * @param run  run that supplies the styling
     * @param text text of the run
     * @return the styled text
     */
    private static String applyTextStyles(XWPFRun run, String text) {
        int start = 0;
        int end = text.length();
        while (start < end && Character.isWhitespace(text.charAt(start))) {
            start++;
        }
        while (end > start && Character.isWhitespace(text.charAt(end - 1))) {
            end--;
        }
        if (start == end) {
            return text;
        }

        String core = text.substring(start, end);
        if (run.isBold()) {
            core = "**" + core + "**";
        }
        if (run.isItalic()) {
            core = "*" + core + "*";
        }
        if (run.getUnderline() != UnderlinePatterns.NONE) {
            core = "<u>" + core + "</u>";
        }
        return text.substring(0, start) + core + text.substring(end);
    }

    /**
     * Saves an embedded picture to {@code imageDir} under a random file name and records its path.
     *
     * @param picture       picture to save
     * @param imageDir      directory that receives the picture file
     * @param imageNamePath collects the path of the saved picture
     * @return a Markdown image link to the saved file, or an empty string when the picture has
     *         no backing data
     * @throws IOException if the picture file cannot be written
     */
    private static String saveImage(XWPFPicture picture, String imageDir,
                                    List<String> imageNamePath) throws IOException {
        XWPFPictureData picData = picture.getPictureData();
        if (picData == null) {
            return "";
        }

        String fileName = "img_" + UUID.randomUUID() + "." + picData.suggestFileExtension();
        File output = new File(imageDir, fileName);
        try (FileOutputStream fos = new FileOutputStream(output)) {
            fos.write(picData.getData());
        }
        imageNamePath.add(output.getPath());

        // Backslash is Markdown's escape character, so links always use forward slashes.
        String linkTarget = output.getPath().replace(File.separatorChar, '/');
        return "![" + fileName + "](" + linkTarget + ")";
    }

    /**
     * Demo entry point: converts a document and prints the resulting DTO as JSON.
     *
     * @param args unused
     * @throws Exception if the conversion fails
     */
    public static void main(String[] args) throws Exception {
        System.out.println(JSONObject.toJSONString(convertWordToMarkdown("word path")));
    }
}

3、运行后返回的信息示例如下

java 复制代码
{
"fileImagePathList":["文件中图片路径1","文件中图片路径2"],
"markdownContent": "Markdown 内容",
"mdPath": "Markdown 文件地址"
}

运行上方的程序将会得到

1、解析文档中的所有图片,并保存到 Word 文件同级目录下的 `<uuid>-images` 文件夹中(图片路径记录在 fileImagePathList 里)

2、将 Word 文档转换成 Markdown,并写入 Word 文件同级目录下的 `<uuid>.md` 文件

3、返回 Markdown 文件的路径(mdPath)及其文本内容(markdownContent)

相关推荐
珹洺39 分钟前
C++从入门到实战(十)类和对象(最终部分)static成员,内部类,匿名对象与对象拷贝时的编译器优化详解
java·数据结构·c++·redis·后端·算法·链表
一 乐42 分钟前
网红酒店|基于java+vue的网红酒店预定系统(源码+数据库+文档)
java·开发语言·数据库·毕业设计·论文·springboot·网红酒店预定系统
xyliiiiiL3 小时前
从责任链模式聊到aware接口
java·开发语言
码农老起6 小时前
与Aspose.pdf类似的jar库分享
java·pdf·jar
程序猿小D6 小时前
第三百八十九节 JavaFX教程 - JavaFX WebEngine
java·eclipse·intellij-idea·vr·javafx
self-discipline6348 小时前
【Java】Java核心知识点与相应面试技巧(七)——类与对象(二)
java·开发语言·面试
wei3872452328 小时前
java笔记02
java·开发语言·笔记
zjj5878 小时前
Docker使用ubuntu
java·docker·eureka
士别三日&&当刮目相看8 小时前
JAVA学习*简单的代理模式
java·学习·代理模式
ShareBeHappy_Qin9 小时前
设计模式——设计模式理念
java·设计模式