在日常开发中,经常需要将 Word 等各类文档转换成结构化的标记语言,本文介绍如何使用 Java(基于 Apache POI)将 Word(.docx)文档转换成 Markdown
1、引入 jar 包(Maven 依赖)
<dependency>
<groupId>org.apache.poi</groupId>
<artifactId>poi-ooxml</artifactId>
<version>5.2.3</version>
</dependency>
2、编写转换代码,完整实现如下
java
package com.nd.ai.test.service.utils;
import com.alibaba.fastjson.JSONObject;
import com.nd.ai.test.service.dto.apitest.FileMarkdownDTO;
import org.apache.commons.compress.utils.Lists;
import org.apache.poi.xwpf.usermodel.*;
import java.io.*;
import java.nio.charset.StandardCharsets;
import java.util.ArrayList;
import java.util.List;
import java.util.UUID;
/**
 * Converts a Word ({@code .docx}) document to Markdown using Apache POI.
 *
 * <p>Body paragraphs and tables are rendered in document order. Embedded pictures are
 * written to a {@code <uuid>-images} directory next to the source file and linked from
 * the generated Markdown, which is saved as {@code <uuid>.md} in the same directory.
 *
 * @author Administrator
 */
public class WordToMarkdownConverter {

    /** Markdown supports heading levels 1 through 6 only. */
    private static final int MAX_HEADING_LEVEL = 6;

    /** Number of spaces used to indent one nesting level of a list. */
    private static final int SPACES_PER_LIST_LEVEL = 4;

    /** Prefix of the paragraph style ids that are treated as headings. */
    private static final String HEADING_STYLE_PREFIX = "Heading";

    /**
     * Converts the Word document at {@code wordPath} to Markdown.
     *
     * <p>The Markdown is assembled in memory and written to disk (UTF-8) only after the
     * whole document has been processed, so a failed conversion leaves no partial
     * {@code .md} file behind.
     *
     * @param wordPath path of the {@code .docx} file to convert
     * @return a DTO carrying the Markdown file path, the Markdown text, the parser status
     *         {@code "success"} and the paths of all extracted images
     * @throws IOException if the document cannot be read, the image directory cannot be
     *                     created, or an output file cannot be written
     */
    public static FileMarkdownDTO convertWordToMarkdown(String wordPath) throws IOException {
        List<String> imagePathList = new ArrayList<>();
        FileMarkdownDTO dto = new FileMarkdownDTO();

        File file = new File(wordPath);
        // getParentFile() is null for a bare file name; File(null, child) then resolves the
        // child against the working directory instead of producing a "null/..." path.
        File parentDir = file.getParentFile();
        String uuid = UUID.randomUUID().toString();
        File outputFile = new File(parentDir, uuid + ".md");
        File imageDir = new File(parentDir, uuid + "-images");

        StringWriter markdown = new StringWriter();
        try (InputStream in = new FileInputStream(file);
             XWPFDocument document = new XWPFDocument(in)) {
            if (!imageDir.isDirectory() && !imageDir.mkdirs()) {
                throw new IOException("Cannot create image directory: " + imageDir.getPath());
            }
            boolean afterListItem = false;
            for (IBodyElement element : document.getBodyElements()) {
                if (element instanceof XWPFParagraph) {
                    afterListItem = processParagraph((XWPFParagraph) element, markdown,
                            imageDir.getPath(), imagePathList, afterListItem);
                } else if (element instanceof XWPFTable) {
                    if (afterListItem) {
                        // A blank line is needed to terminate the preceding list.
                        markdown.write("\n");
                        afterListItem = false;
                    }
                    processTable((XWPFTable) element, markdown, imageDir.getPath(), imagePathList);
                }
            }
        }

        String markdownContent = markdown.toString();
        try (Writer out = new OutputStreamWriter(new FileOutputStream(outputFile), StandardCharsets.UTF_8)) {
            out.write(markdownContent);
        }

        dto.setMdPath(outputFile.getPath());
        dto.setMarkdownContent(markdownContent);
        dto.setParserStatus("success");
        dto.setFileImagePathList(imagePathList);
        return dto;
    }

    /**
     * Writes one body paragraph as a heading, a list item or a plain paragraph.
     * Paragraphs without any rendered content are skipped.
     *
     * @param afterListItem whether the last thing written was a list item
     * @return whether the last thing written is now a list item
     */
    private static boolean processParagraph(XWPFParagraph paragraph, Writer writer, String imageDir,
                                            List<String> imageNamePath, boolean afterListItem) throws IOException {
        String content = processParagraphContent(paragraph, imageDir, imageNamePath);
        if (content.isEmpty()) {
            return afterListItem;
        }

        if (!isHeading(paragraph) && isListParagraph(paragraph)) {
            writer.write(getListMark(paragraph) + " " + content + "\n");
            return true;
        }

        if (afterListItem) {
            // Without a blank line the new block would be absorbed into the last list item.
            writer.write("\n");
        }
        int level = headingLevel(paragraph.getStyle());
        if (level > 0) {
            writer.write(repeat("#", level) + " " + content + "\n\n");
        } else {
            writer.write(content + "\n\n");
        }
        return false;
    }

    /** Returns whether the paragraph's style id maps to a Markdown heading. */
    private static boolean isHeading(XWPFParagraph paragraph) {
        return headingLevel(paragraph.getStyle()) > 0;
    }

    /**
     * Derives the Markdown heading level from a style id such as {@code "Heading2"}.
     *
     * @return a level in 1..6, or 0 when the style id is not a recognised heading
     */
    private static int headingLevel(String style) {
        if (style == null || !style.startsWith(HEADING_STYLE_PREFIX)
                || style.length() <= HEADING_STYLE_PREFIX.length()) {
            return 0;
        }
        int digit = Character.digit(style.charAt(HEADING_STYLE_PREFIX.length()), 10);
        if (digit < 1) {
            return 0;
        }
        return Math.min(digit, MAX_HEADING_LEVEL);
    }

    /**
     * Builds the Markdown list marker for a list paragraph: indentation for the nesting
     * level followed by {@code 1.} for ordered lists or {@code -} for everything else.
     */
    private static String getListMark(XWPFParagraph para) {
        Number ilvl = para.getNumIlvl();
        int indentLevel = ilvl != null ? Math.max(ilvl.intValue(), 0) : 0;
        String indent = repeat(" ", indentLevel * SPACES_PER_LIST_LEVEL);
        // Markdown renumbers ordered items itself, so every item can use "1.".
        return indent + (isOrderedList(para) ? "1." : "-");
    }

    /** Returns whether the paragraph belongs to an ordered or an unordered list. */
    private static boolean isListParagraph(XWPFParagraph paragraph) {
        return isOrderedList(paragraph) || isUnorderedList(paragraph);
    }

    /** Returns whether the paragraph's numbering format is one of the ordered formats. */
    private static boolean isOrderedList(XWPFParagraph paragraph) {
        String numFmt = paragraph.getNumFmt();
        return "decimal".equals(numFmt) || "upperRoman".equals(numFmt) || "lowerRoman".equals(numFmt)
                || "upperLetter".equals(numFmt) || "lowerLetter".equals(numFmt);
    }

    /** Returns whether the paragraph's numbering format is {@code bullet}. */
    private static boolean isUnorderedList(XWPFParagraph paragraph) {
        return "bullet".equals(paragraph.getNumFmt());
    }

    /**
     * Writes a table as a Markdown pipe table; the first row is used as the header row.
     */
    private static void processTable(XWPFTable table, Writer writer, String imageDir,
                                     List<String> imageNamePath) throws IOException {
        StringBuilder mdTable = new StringBuilder();
        List<XWPFTableRow> rows = table.getRows();
        for (int i = 0; i < rows.size(); i++) {
            List<XWPFTableCell> cells = rows.get(i).getTableCells();
            mdTable.append("|");
            for (XWPFTableCell cell : cells) {
                mdTable.append(renderCell(cell, imageDir, imageNamePath)).append("|");
            }
            mdTable.append("\n");
            // Markdown requires a separator line right after the header row.
            if (i == 0) {
                mdTable.append("|");
                for (int j = 0; j < cells.size(); j++) {
                    mdTable.append(" --- |");
                }
                mdTable.append("\n");
            }
        }
        writer.write(mdTable + "\n\n");
    }

    /**
     * Renders a table cell on a single line: non-empty paragraphs are joined with
     * {@code <br>} and pipe characters are escaped so they do not split the cell.
     */
    private static String renderCell(XWPFTableCell cell, String imageDir,
                                     List<String> imageNamePath) throws IOException {
        StringBuilder cellContent = new StringBuilder();
        for (XWPFParagraph para : cell.getParagraphs()) {
            String text = processParagraphContent(para, imageDir, imageNamePath);
            if (text.isEmpty()) {
                continue;
            }
            if (cellContent.length() > 0) {
                cellContent.append("<br>");
            }
            cellContent.append(text.replace("|", "\\|").replace("\n", "<br>"));
        }
        return cellContent.toString().trim();
    }

    /**
     * Renders the inline content of a paragraph: embedded pictures are saved and linked,
     * text runs are emitted with their bold/italic/underline styling. No heading or list
     * marker is added here.
     *
     * @return the trimmed inline Markdown, possibly empty
     */
    private static String processParagraphContent(XWPFParagraph paragraph, String imageDir,
                                                  List<String> imageNamePath) throws IOException {
        StringBuilder sb = new StringBuilder();
        for (XWPFRun run : paragraph.getRuns()) {
            for (XWPFPicture picture : run.getEmbeddedPictures()) {
                String imageLink = saveImage(picture, imageDir, imageNamePath);
                if (!imageLink.isEmpty()) {
                    sb.append(imageLink).append(" ");
                }
            }
            String text = run.getText(0);
            if (text == null) {
                continue;
            }
            sb.append(applyTextStyles(run, text));
        }
        return sb.toString().trim();
    }

    /**
     * Wraps the run text in Markdown emphasis markers according to the run's styling.
     * Surrounding whitespace is kept outside the markers, and blank text is returned
     * unchanged, because markers adjacent to whitespace are not rendered as emphasis.
     */
    private static String applyTextStyles(XWPFRun run, String text) {
        int start = 0;
        int end = text.length();
        while (start < end && text.charAt(start) <= ' ') {
            start++;
        }
        while (end > start && text.charAt(end - 1) <= ' ') {
            end--;
        }
        if (start == end) {
            return text;
        }
        String styled = text.substring(start, end);
        if (run.isBold()) {
            styled = "**" + styled + "**";
        }
        if (run.isItalic()) {
            styled = "*" + styled + "*";
        }
        if (run.getUnderline() != UnderlinePatterns.NONE) {
            // Markdown has no underline syntax ("__x__" means bold), so inline HTML is used.
            styled = "<u>" + styled + "</u>";
        }
        return text.substring(0, start) + styled + text.substring(end);
    }

    /**
     * Saves an embedded picture into {@code imageDir} and records its path.
     *
     * @return a Markdown image link relative to the directory containing {@code imageDir},
     *         or an empty string when the picture has no data
     */
    private static String saveImage(XWPFPicture picture, String imageDir,
                                    List<String> imageNamePath) throws IOException {
        XWPFPictureData picData = picture.getPictureData();
        if (picData == null) {
            // Defensive: nothing to save if the picture data cannot be resolved.
            return "";
        }
        String extension = picData.suggestFileExtension();
        String suffix = (extension == null || extension.isEmpty()) ? "" : "." + extension;
        String fileName = "img_" + UUID.randomUUID() + suffix;
        File output = new File(imageDir, fileName);
        try (FileOutputStream fos = new FileOutputStream(output)) {
            fos.write(picData.getData());
        }
        // Record the path only once the file has actually been written.
        imageNamePath.add(output.getPath());
        // The Markdown file sits next to the image directory, so link relative to it.
        return "![" + fileName + "](" + new File(imageDir).getName() + "/" + fileName + ")";
    }

    /** Returns {@code unit} repeated {@code count} times (empty when {@code count <= 0}). */
    private static String repeat(String unit, int count) {
        StringBuilder sb = new StringBuilder();
        for (int i = 0; i < count; i++) {
            sb.append(unit);
        }
        return sb.toString();
    }

    /**
     * Command-line entry point: converts the document given as the first argument and
     * prints the resulting DTO as JSON.
     */
    public static void main(String[] args) throws Exception {
        if (args.length < 1) {
            System.err.println("Usage: WordToMarkdownConverter <path-to-docx>");
            return;
        }
        System.out.println(JSONObject.toJSONString(convertWordToMarkdown(args[0])));
    }
}
运行后返回的结果示例如下
json
{
"fileImagePathList":["文件中图片路径1","文件中图片路径2"],
"markdownContent": "markdown 信息",
"mdPath": "markdown文件地址"
}
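运行上方的程序后将会:
1、提取文档中的所有图片,保存到 Word 文件同目录下的 <uuid>-images 目录,并通过 fileImagePathList 返回各图片的路径
2、将 Word 文档转换成 Markdown,并通过 markdownContent 返回转换后的内容
3、在 Word 文件同目录下生成 <uuid>.md 文件,并通过 mdPath 返回该文件的路径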