一.pom引入依赖
xml
<dependency>
<groupId>com.aspose</groupId>
<artifactId>aspose-words</artifactId>
<version>15.12.0</version>
<classifier>jdk16</classifier>
</dependency>
二.代码实现
java
package com.example.demo.handler;
import com.alibaba.fastjson.JSONObject;
import com.aspose.words.HtmlSaveOptions;
import org.jsoup.Jsoup;
import org.jsoup.nodes.Document;
import org.jsoup.nodes.Element;
import org.jsoup.select.Elements;
import org.springframework.stereotype.Component;
import org.springframework.web.multipart.MultipartFile;
import java.io.*;
import java.util.*;
@Component
public class WordAnalysis {

    private static final java.util.logging.Logger LOG =
            java.util.logging.Logger.getLogger(WordAnalysis.class.getName());

    /**
     * Rows whose inline style contains this marker are skipped.
     * NOTE(review): presumably these are layout-only rows produced by the HTML export - confirm.
     */
    private static final String ZERO_HEIGHT_STYLE = "height:0pt";

    /** Span value used when a cell has no usable rowspan/colspan attribute. */
    private static final int DEFAULT_SPAN = 1;

    /**
     * Converts an uploaded Word document to HTML and extracts every table found in it.
     *
     * <p>Images are not inlined as Base64; the export is pointed at the JVM temp directory
     * ({@code java.io.tmpdir}) as its images folder.
     *
     * @param multipartFile the uploaded Word file
     * @return a list of {@code Map<String, Object>} entries, one per table, as produced by
     *         {@link #analysisDoc(Document)}; an empty list if conversion or parsing fails
     * @throws IOException if the bytes of the upload cannot be read
     */
    public List wordAnalysis(MultipartFile multipartFile) throws IOException {
        // Reading the upload stays outside the try block so that I/O failures still propagate.
        byte[] wordBytes = multipartFile.getBytes();

        try (InputStream inputStream = new ByteArrayInputStream(wordBytes);
             ByteArrayOutputStream htmlBuffer = new ByteArrayOutputStream()) {
            // Configure the HTML export.
            HtmlSaveOptions saveOptions = new HtmlSaveOptions();
            saveOptions.setExportImagesAsBase64(false);
            // Images are written to the temp directory and referenced from the HTML.
            saveOptions.setImagesFolder(System.getProperty("java.io.tmpdir"));

            com.aspose.words.Document doc = new com.aspose.words.Document(inputStream);
            doc.save(htmlBuffer, saveOptions);

            // Decode explicitly instead of relying on the platform default charset, which
            // garbles non-ASCII text on non-UTF-8 hosts.
            // NOTE(review): assumes the export keeps its default UTF-8 encoding - confirm if
            // an encoding is ever set on saveOptions.
            Document htmlDoc = Jsoup.parse(htmlBuffer.toString("UTF-8"));
            return analysisDoc(htmlDoc);
        } catch (Exception e) {
            // Best-effort contract kept from the original: report the failure, return no tables.
            LOG.log(java.util.logging.Level.SEVERE, "Failed to convert or parse the Word document", e);
            return new ArrayList<Map<String, Object>>();
        }
    }

    /**
     * Extracts all {@code <table>} elements from the given HTML document.
     *
     * <p>Each returned map contains:
     * <ul>
     *   <li>{@code tableId} - a random {@link UUID}</li>
     *   <li>{@code tableName} - a sequential name built from the 1-based table index</li>
     *   <li>{@code tableHtml} - the table's HTML as returned by {@code Element.toString()}</li>
     *   <li>{@code tableContent} - a list of rows, each a list of cell objects with
     *       {@code content}, {@code rowspan} and {@code colspan}</li>
     * </ul>
     *
     * @param htmlDoc the parsed HTML document
     * @return a list of {@code Map<String, Object>}, one per table, in document order
     */
    public List analysisDoc(Document htmlDoc) {
        Elements tables = htmlDoc.getElementsByTag("table");
        List<Map<String, Object>> tableList = new ArrayList<>(tables.size());
        for (int i = 0; i < tables.size(); i++) {
            Element table = tables.get(i);
            Map<String, Object> tableInfo = new HashMap<>();
            tableInfo.put("tableId", UUID.randomUUID());
            tableInfo.put("tableName", "表" + (i + 1));
            tableInfo.put("tableHtml", table.toString());
            tableInfo.put("tableContent", extractRows(table));
            tableList.add(tableInfo);
        }
        return tableList;
    }

    /** Collects the cells of every row in the table, skipping rows marked as zero-height. */
    private static List<List<JSONObject>> extractRows(Element table) {
        List<List<JSONObject>> rowList = new ArrayList<>();
        for (Element row : table.select("tr")) {
            if (row.attr("style").contains(ZERO_HEIGHT_STYLE)) {
                continue;
            }
            List<JSONObject> rowInfo = new ArrayList<>();
            for (Element cell : row.select("td")) {
                JSONObject cellInfo = new JSONObject();
                cellInfo.put("content", cell.text());
                cellInfo.put("rowspan", parseSpan(cell, "rowspan"));
                cellInfo.put("colspan", parseSpan(cell, "colspan"));
                rowInfo.add(cellInfo);
            }
            rowList.add(rowInfo);
        }
        return rowList;
    }

    /** Reads a span attribute as an int, falling back to 1 when it is absent, blank or malformed. */
    private static int parseSpan(Element cell, String attributeName) {
        String raw = cell.attr(attributeName).trim();
        if (raw.isEmpty()) {
            return DEFAULT_SPAN;
        }
        try {
            return Integer.parseInt(raw);
        } catch (NumberFormatException e) {
            // A malformed span should not abort parsing of the whole document.
            return DEFAULT_SPAN;
        }
    }
}