一、思路
- 入口→分发:extractFromWord 是总入口,核心是“格式分发”,将 .doc 和 .docx 分流到不同处理逻辑;
- .doc 核心:绕开路径解析,用“逐层遍历 + 兜底读取”确保文件能读到,再交给 extractFromOLE 解析;
- .docx 核心:直接遍历 PackagePart,利用 POI 对 OOXML 的原生支持,快速识别 Excel 附件;
- 解析核心:extractFromOLE 是格式兼容关键,区分 .xls/.xlsx 用不同 POI 模块,避免解析失败;
- 稳定性保障:多层过滤(过小文件 / 特殊字符)+ 异常捕获(单个文件失败不中断)+ 格式适配,确保程序稳定运行。
二、核心依赖:
<!-- Apache POI 处理 Word 和 Excel -->
<dependency>
<groupId>org.apache.poi</groupId>
<artifactId>poi-ooxml</artifactId>
<version>5.2.4</version>
</dependency>
<!-- Apache POI 处理 OLE 对象(嵌入式附件) -->
<dependency>
<groupId>org.apache.poi</groupId>
<artifactId>poi-scratchpad</artifactId>
<version>5.2.4</version>
</dependency>
<!-- Apache Tika 识别文件类型(辅助提取附件) -->
<dependency>
<groupId>org.apache.tika</groupId>
<artifactId>tika-core</artifactId>
<version>2.9.1</version>
</dependency>
三、源码
import java.io.*;
import java.util.ArrayList;
import java.util.List;
import java.util.Locale;
import java.util.regex.Pattern;
import org.apache.poi.openxml4j.opc.OPCPackage;
import org.apache.poi.openxml4j.opc.PackagePart;
import org.apache.poi.poifs.filesystem.*;
import org.apache.poi.xssf.usermodel.XSSFWorkbook;
import org.apache.poi.xwpf.usermodel.XWPFDocument;
import org.apache.tika.Tika;
/**
 * Extracts Excel workbooks that are embedded in Word documents.
 *
 * <p>{@link #extractFromWord(File)} is the single entry point. It dispatches on the file
 * extension: {@code .docx} files are scanned through their embedded package parts, while
 * {@code .doc} files are scanned through the {@code ObjectPool} storage of the OLE2 container.
 * Every candidate is validated before it is returned, so embedded objects that merely share
 * the OLE2 or ZIP magic bytes with Excel (Word documents, equations, ...) are not reported.
 *
 * <p>Failures on a single embedded object are logged to standard output and skipped; they do
 * not abort the extraction of the remaining objects.
 */
public class WordExcelExtractor {
    private static final Tika tika = new Tika();
    private static final Pattern NON_PRINTABLE_CHAR_PATTERN = Pattern.compile("[^\\x20-\\x7E]");
    private static final int MIN_EXCEL_SIZE = 100;

    /** Spellings of the embedded-object storage looked up in the root of a .doc file. */
    private static final String[] OBJECT_POOL_NAMES = {"ObjectPool", "OBJECTPOOL"};
    /** Stream names that mark an OLE2 storage as a legacy (.xls) workbook. */
    private static final String[] WORKBOOK_STREAM_NAMES = {"Workbook", "WORKBOOK", "Book", "BOOK"};
    /** Streams that wrap the real payload inside an OLE2 container, in lookup order. */
    private static final String[] WRAPPER_STREAM_NAMES = {"Package", "Contents"};

    /**
     * Extracts every embedded Excel workbook from a Word document.
     *
     * @param wordFile an existing, readable {@code .doc} or {@code .docx} file
     * @return the raw bytes of each extracted workbook; empty when none is found
     * @throws IllegalArgumentException if the extension is neither {@code .doc} nor {@code .docx}
     * @throws IOException if the file is missing, unreadable, or cannot be parsed
     */
    public static List<byte[]> extractFromWord(File wordFile) throws IOException {
        validateFile(wordFile);
        List<byte[]> excelDataList = new ArrayList<>();
        // Locale.ROOT keeps the extension check independent of the default locale.
        String fileName = wordFile.getName().toLowerCase(Locale.ROOT);
        try {
            if (fileName.endsWith(".docx")) {
                extractFromDocx(wordFile, excelDataList);
            } else if (fileName.endsWith(".doc")) {
                extractFromDocDirect(wordFile, excelDataList);
            } else {
                throw new IllegalArgumentException("不支持的格式!仅支持 .doc/.docx");
            }
        } catch (IllegalArgumentException e) {
            throw e;
        } catch (Exception e) {
            throw new IOException("提取 Excel 附件失败:" + e.getMessage(), e);
        }
        return excelDataList;
    }

    /**
     * Scans the ObjectPool storage of a .doc file and collects embedded workbooks.
     *
     * @param docFile the .doc file to read
     * @param excelDataList receives the bytes of every workbook found
     * @throws IOException if the file cannot be opened as an OLE2 container
     */
    private static void extractFromDocDirect(File docFile, List<byte[]> excelDataList) throws IOException {
        try (FileInputStream fis = new FileInputStream(docFile);
             POIFSFileSystem poifs = new POIFSFileSystem(fis)) {
            DirectoryEntry objectPool = getObjectPoolDirectory(poifs.getRoot());
            if (objectPool == null) {
                System.out.println("无 ObjectPool 目录,无嵌入式附件");
                return;
            }
            for (Entry child : objectPool) {
                String dirName = filterSpecialChars(child.getName());
                if (dirName.isEmpty() || !(child instanceof DirectoryEntry)) {
                    continue;
                }
                DirectoryEntry objectDir = (DirectoryEntry) child;
                System.out.println("找到 OBJECTPOOL 子目录:" + dirName);
                // A legacy workbook is stored as loose streams directly inside the object
                // storage, so no single stream carries the OLE2 header; rebuild a container.
                if (hasAnyEntry(objectDir, WORKBOOK_STREAM_NAMES)
                        && extractEmbeddedXls(objectDir, dirName, excelDataList)) {
                    continue;
                }
                for (Entry stream : objectDir) {
                    if (stream instanceof DocumentEntry) {
                        extractFromStream((DocumentEntry) stream, dirName, excelDataList);
                    }
                }
            }
        }
    }

    /**
     * Reads one stream of an embedded object and adds it to the result if it holds a workbook.
     *
     * @param docEntry the stream to inspect
     * @param dirName sanitized name of the enclosing object storage, used for logging
     * @param excelDataList receives the workbook bytes on success
     */
    private static void extractFromStream(DocumentEntry docEntry, String dirName, List<byte[]> excelDataList) {
        String fileName = filterSpecialChars(docEntry.getName());
        if (fileName.isEmpty()) {
            return;
        }
        String path = dirName + "/" + fileName;
        long fileSize = docEntry.getSize();
        System.out.println("找到文件:" + path + "(大小:" + fileSize + " 字节)");
        if (fileSize < MIN_EXCEL_SIZE) {
            System.out.println("⚠️ 跳过过小文件(非 Excel):" + path);
            return;
        }
        try {
            // Read through the entry itself: name-based lookup on the file system only
            // resolves entries that sit directly in the root directory.
            byte[] data = readDocumentEntryDirect(docEntry);
            if (data != null && data.length >= MIN_EXCEL_SIZE) {
                extractAndAddExcel(data, path, excelDataList);
            } else {
                System.out.println("❌ 读取失败(数据无效):" + path);
            }
        } catch (RuntimeException e) {
            // Best effort: one broken stream must not stop the remaining objects.
            System.out.println("❌ 处理文件失败,跳过:" + path + " → " + e.getMessage());
        }
    }

    /**
     * Rebuilds a standalone OLE2 (.xls) file from an object storage that contains a
     * workbook stream and adds it to the result.
     *
     * @param objectDir the ObjectPool sub-storage holding the workbook streams
     * @param dirName sanitized storage name, used for logging
     * @param excelDataList receives the rebuilt workbook bytes
     * @return {@code true} if the workbook was rebuilt and added
     */
    private static boolean extractEmbeddedXls(DirectoryEntry objectDir, String dirName, List<byte[]> excelDataList) {
        try (POIFSFileSystem rebuilt = new POIFSFileSystem();
             ByteArrayOutputStream out = new ByteArrayOutputStream()) {
            rebuilt.getRoot().setStorageClsid(objectDir.getStorageClsid());
            copyTree(objectDir, rebuilt.getRoot());
            rebuilt.writeFilesystem(out);
            excelDataList.add(out.toByteArray());
            System.out.println("✅ 成功提取 Excel:" + dirName);
            return true;
        } catch (Exception e) {
            System.out.println("❌ 提取 Excel 失败:" + dirName + " → " + e.getMessage());
            return false;
        }
    }

    /** Recursively copies every stream and sub-storage of {@code source} into {@code target}. */
    private static void copyTree(DirectoryEntry source, DirectoryEntry target) throws IOException {
        for (Entry entry : source) {
            if (entry instanceof DirectoryEntry) {
                DirectoryEntry sourceDir = (DirectoryEntry) entry;
                DirectoryEntry targetDir = target.createDirectory(entry.getName());
                targetDir.setStorageClsid(sourceDir.getStorageClsid());
                copyTree(sourceDir, targetDir);
            } else if (entry instanceof DocumentEntry) {
                try (InputStream in = new DocumentInputStream((DocumentEntry) entry)) {
                    target.createDocument(entry.getName(), in);
                }
            }
        }
    }

    /**
     * Validates the candidate bytes and, when they hold a workbook, appends it to the result.
     *
     * @param oleData raw bytes of the candidate stream
     * @param filePath display path used in log messages
     * @param excelDataList receives the workbook bytes on success
     */
    private static void extractAndAddExcel(byte[] oleData, String filePath, List<byte[]> excelDataList) {
        try {
            byte[] excelData = extractFromOLE(oleData);
            if (excelData != null) {
                excelDataList.add(excelData);
                System.out.println("✅ 成功提取 Excel:" + filePath);
            } else {
                String fileHeader = getFileHeader(oleData);
                System.out.println("❌ 非 Excel 文件(文件头:" + fileHeader + "):" + filePath);
            }
        } catch (Exception e) {
            System.out.println("❌ 提取 Excel 失败:" + filePath + " → " + e.getMessage());
        }
    }

    /**
     * Reads the complete content of a stream.
     *
     * @param docEntry the stream to read
     * @return the stream content, or {@code null} if it is empty or cannot be read
     */
    private static byte[] readDocumentEntryDirect(DocumentEntry docEntry) {
        try (InputStream is = new DocumentInputStream(docEntry)) {
            // readAllBytes loops until EOF; a single read() call may return fewer bytes.
            byte[] data = is.readAllBytes();
            return data.length > 0 ? data : null;
        } catch (IOException e) {
            System.out.println("⚠️ 直接读取字节失败:" + e.getMessage());
            return null;
        }
    }

    /**
     * Returns the workbook bytes contained in {@code oleData}, or {@code null} if there is none.
     *
     * <p>ZIP data is accepted only when it opens as an OOXML workbook. OLE2 data is accepted
     * when its root holds a workbook stream; otherwise the payload of a {@code Package} or
     * {@code Contents} wrapper stream is validated in turn. As a last resort Tika is asked
     * to classify the OLE2 bytes.
     *
     * @param oleData candidate bytes, may be {@code null}
     * @return bytes of a standalone .xls/.xlsx file, or {@code null}
     */
    private static byte[] extractFromOLE(byte[] oleData) {
        if (isXlsxFile(oleData)) {
            return isValidXlsx(oleData) ? oleData : null;
        }
        if (!isXlsFile(oleData)) {
            return null;
        }
        try (POIFSFileSystem poifs = new POIFSFileSystem(new ByteArrayInputStream(oleData))) {
            DirectoryEntry root = poifs.getRoot();
            if (hasAnyEntry(root, WORKBOOK_STREAM_NAMES)) {
                // The container itself is a legacy workbook.
                return oleData;
            }
            for (String wrapper : WRAPPER_STREAM_NAMES) {
                if (root.hasEntry(wrapper)) {
                    try (InputStream is = poifs.createDocumentInputStream(wrapper)) {
                        // Validate the payload instead of trusting its magic bytes:
                        // a wrapped .docx/.pptx is a ZIP as well.
                        return extractFromOLE(is.readAllBytes());
                    }
                }
            }
        } catch (NotOLE2FileException e) {
            System.out.println("⚠️ 非 OLE2 格式文件:" + e.getMessage());
        } catch (IOException e) {
            System.out.println("⚠️ 解析 .xls 文件失败:" + e.getMessage());
        }
        // Fallback for OLE2 data that could not be classified structurally.
        String fileType = tika.detect(oleData, "");
        if (fileType != null && (fileType.contains("excel") || fileType.contains("spreadsheet"))) {
            return oleData;
        }
        return null;
    }

    /** Returns {@code true} if {@code data} opens as an OOXML spreadsheet. */
    private static boolean isValidXlsx(byte[] data) {
        try (OPCPackage opcPackage = OPCPackage.open(new ByteArrayInputStream(data));
             XSSFWorkbook workbook = new XSSFWorkbook(opcPackage)) {
            return true;
        } catch (Exception e) {
            System.out.println("⚠️ 无效的 .xlsx 文件:" + e.getMessage());
            return false;
        }
    }

    /** Returns {@code true} if {@code data} starts with the OLE2 signature {@code D0 CF 11 E0}. */
    private static boolean isXlsFile(byte[] data) {
        return data != null && data.length >= 4
                && data[0] == (byte) 0xD0 && data[1] == (byte) 0xCF
                && data[2] == (byte) 0x11 && data[3] == (byte) 0xE0;
    }

    /** Returns {@code true} if {@code data} starts with the ZIP signature {@code 50 4B 03 04}. */
    private static boolean isXlsxFile(byte[] data) {
        return data != null && data.length >= 4
                && data[0] == (byte) 0x50 && data[1] == (byte) 0x4B
                && data[2] == (byte) 0x03 && data[3] == (byte) 0x04;
    }

    // ---------------------- Utility methods ----------------------

    /**
     * Finds the embedded-object storage of a .doc file.
     *
     * @param root root directory of the OLE2 container
     * @return the ObjectPool storage, or {@code null} if the document has none
     * @throws IOException if the entry cannot be read
     */
    private static DirectoryEntry getObjectPoolDirectory(DirectoryEntry root) throws IOException {
        for (String name : OBJECT_POOL_NAMES) {
            if (root.hasEntry(name)) {
                Entry entry = root.getEntry(name);
                if (entry instanceof DirectoryEntry) {
                    return (DirectoryEntry) entry;
                }
            }
        }
        return null;
    }

    /** Returns {@code true} if {@code dir} directly contains an entry with one of {@code names}. */
    private static boolean hasAnyEntry(DirectoryEntry dir, String[] names) {
        for (String name : names) {
            if (dir.hasEntry(name)) {
                return true;
            }
        }
        return false;
    }

    /** Removes every character outside printable ASCII; {@code null} becomes an empty string. */
    private static String filterSpecialChars(String name) {
        return name == null ? "" : NON_PRINTABLE_CHAR_PATTERN.matcher(name).replaceAll("");
    }

    /** Returns {@code true} if {@code contentType} is one of the known Excel content types. */
    private static boolean isExcelContentType(String contentType) {
        return "application/vnd.ms-excel".equals(contentType)
                || "application/vnd.openxmlformats-officedocument.spreadsheetml.sheet".equals(contentType)
                || "application/vnd.ms-excel.sheet.macroEnabled.12".equals(contentType);
    }

    /** Returns {@code true} if {@code data} carries the OLE2 or the ZIP signature. */
    private static boolean isExcelFile(byte[] data) {
        return isXlsFile(data) || isXlsxFile(data);
    }

    /** Formats the first four bytes of {@code data} as hex for log output. */
    private static String getFileHeader(byte[] data) {
        if (data == null || data.length < 4) {
            return "不足4字节";
        }
        StringBuilder sb = new StringBuilder();
        for (int i = 0; i < 4; i++) {
            sb.append(String.format("%02X ", data[i]));
        }
        return sb.toString().trim();
    }

    /**
     * Checks that {@code file} exists, is a regular file and is readable.
     *
     * @throws NullPointerException if {@code file} is {@code null}
     * @throws IOException if any of the checks fails
     */
    private static void validateFile(File file) throws IOException {
        if (file == null) {
            throw new NullPointerException("file must not be null");
        }
        if (!file.exists()) {
            throw new FileNotFoundException("文件不存在:" + file.getAbsolutePath());
        }
        if (!file.isFile()) {
            throw new IOException("路径不是文件:" + file.getAbsolutePath());
        }
        if (!file.canRead()) {
            throw new IOException("文件不可读:" + file.getAbsolutePath());
        }
    }

    /**
     * Scans the embedded parts of a .docx file and collects embedded workbooks.
     *
     * @param docxFile the .docx file to read
     * @param excelDataList receives the bytes of every workbook found
     * @throws IOException if the document cannot be parsed
     */
    private static void extractFromDocx(File docxFile, List<byte[]> excelDataList) throws IOException {
        // Open from a stream: a package opened from a File is read/write and is saved back
        // to that file when closed, which would modify the caller's input document.
        try (InputStream in = new FileInputStream(docxFile);
             XWPFDocument doc = new XWPFDocument(in)) {
            for (PackagePart part : doc.getAllEmbeddedParts()) {
                try (InputStream is = part.getInputStream()) {
                    byte[] data = is.readAllBytes();
                    if (data.length < MIN_EXCEL_SIZE) {
                        continue;
                    }
                    // Parts declared as Excel are taken as-is; everything else (oleObject
                    // wrappers, other Office files) must pass structural validation.
                    byte[] excelData = isExcelContentType(part.getContentType())
                            ? data
                            : extractFromOLE(data);
                    if (excelData != null) {
                        excelDataList.add(excelData);
                        System.out.println("✅ 提取 .docx 中的 Excel 附件");
                    }
                } catch (Exception e) {
                    System.out.println("❌ 处理 .docx 附件失败:" + e.getMessage());
                }
            }
        } catch (Exception e) {
            throw new IOException("解析 .docx 文件失败:" + e.getMessage(), e);
        }
    }
}