一.pom文件里引入相关jar包
<!--
  EasyPoi dependencies: the base, web and annotation artifacts from cn.afterturn,
  all declared at the same version (4.1.0).
  NOTE(review): the XWPF classes used by the Java snippets in this note presumably
  arrive transitively through these artifacts (Apache POI) - confirm against the
  resolved dependency tree.
-->
<dependency>
<groupId>cn.afterturn</groupId>
<artifactId>easypoi-base</artifactId>
<version>4.1.0</version>
</dependency>
<dependency>
<groupId>cn.afterturn</groupId>
<artifactId>easypoi-web</artifactId>
<version>4.1.0</version>
</dependency>
<dependency>
<groupId>cn.afterturn</groupId>
<artifactId>easypoi-annotation</artifactId>
<version>4.1.0</version>
</dependency>
二.文档解析
1.通过URL读取docx文档的文本内容
/**
 * Reads the text content of a .docx document located at the given URL.
 *
 * <p>The text of all body paragraphs is concatenated (no separator is inserted between
 * paragraphs). If the document exposes no paragraphs, the text of its tables is returned
 * instead via {@code WordUtils.getTableCellsText}.
 *
 * <p>Failures are logged rather than propagated: any exception raised while opening,
 * parsing or reading the document results in a {@code null} return value.
 *
 * @param docxUrl URL of the .docx resource to read
 * @return the extracted text, or {@code null} if the document could not be read
 */
public static String readDocxUrlContent(String docxUrl) {
    String content = null;
    // try-with-resources releases the document first and then the stream on every exit
    // path; the original only closed the stream and leaked the XWPFDocument.
    try (InputStream inputStream = new URL(docxUrl).openStream();
         XWPFDocument document = new XWPFDocument(inputStream)) {
        List<XWPFParagraph> paragraphs = document.getParagraphs();
        if (ObjectUtils.isEmpty(paragraphs)) {
            // No body paragraphs available: fall back to the text held in tables.
            content = WordUtils.getTableCellsText(document);
        } else {
            StringBuilder paragraphText = new StringBuilder();
            for (XWPFParagraph paragraph : paragraphs) {
                paragraphText.append(paragraph.getText());
            }
            content = paragraphText.toString();
        }
    } catch (Exception e) {
        // Also reached when closing a resource fails; in that case the text that was
        // already extracted is kept in 'content' and still returned.
        log.error("FileUtil.readDocxUrlContent:读取文件内容失败", e);
    }
    return content;
}
2.获取文档内所有表格的文本内容
/**
 * Collects the text of every table cell in the document.
 *
 * <p>Tables, rows and cells are visited in the order the document returns them, and the
 * cell texts are concatenated without any separator.
 *
 * @param document the Word document whose tables are read
 * @return the concatenated text of all table cells; empty if the document has no tables
 */
public static String getTableCellsText(XWPFDocument document) {
    StringBuilder cellText = new StringBuilder();
    // Walk tables -> rows -> cells, appending each cell's text as it is encountered.
    for (XWPFTable table : document.getTables()) {
        for (XWPFTableRow row : table.getRows()) {
            for (XWPFTableCell cell : row.getTableCells()) {
                cellText.append(cell.getText());
            }
        }
    }
    return cellText.toString();
}