Java 解析 Word 文档中嵌入的 Excel 附件

一、思路

  1. 入口→分发:extractFromWord 是总入口,核心是 "格式分发",将.doc 和.docx 分流到不同处理逻辑;

  2. .doc 核心:不依赖路径字符串定位数据流,而是从 ObjectPool 目录开始逐层遍历条目;按路径读取失败时,再直接读取条目字节作为兜底,确保数据能读到,然后交给 extractFromOLE 解析;

  3. .docx 核心:直接遍历 PackagePart,利用 POI 对 OOXML 的原生支持,快速识别 Excel 附件;

  4. 解析核心:extractFromOLE 是格式兼容关键,区分.xls/.xlsx 用不同 POI 模块,避免解析失败;

  5. 稳定性保障:多层过滤(过小文件 / 特殊字符)+ 异常捕获(单个文件失败不中断)+ 格式适配,确保程序稳定运行。

二、核心依赖:

复制代码
<!-- Apache POI 处理 Word 和 Excel -->
<dependency>
<groupId>org.apache.poi</groupId>
<artifactId>poi-ooxml</artifactId>
<version>5.2.4</version>
</dependency>
<!-- Apache POI 处理 OLE 对象(嵌入式附件) -->
<dependency>
<groupId>org.apache.poi</groupId>
<artifactId>poi-scratchpad</artifactId>
<version>5.2.4</version>
</dependency>
<!-- Apache Tika 识别文件类型(辅助提取附件) -->
<dependency>
<groupId>org.apache.tika</groupId>
<artifactId>tika-core</artifactId>
<version>2.9.1</version>
</dependency>

三、源码

复制代码
import org.apache.poi.openxml4j.opc.OPCPackage;
import org.apache.poi.openxml4j.opc.PackageAccess;
import org.apache.poi.openxml4j.opc.PackagePart;
import org.apache.poi.poifs.filesystem.*;
import org.apache.poi.xssf.usermodel.XSSFWorkbook;
import org.apache.poi.xwpf.usermodel.XWPFDocument;
import org.apache.tika.Tika;

import java.io.*;
import java.util.ArrayList;
import java.util.List;
import java.util.Locale;
import java.util.regex.Pattern;

/**
 * Extracts Excel workbooks that are embedded in Word documents.
 *
 * <p>{@link #extractFromWord(File)} is the single entry point. It dispatches on the file
 * extension: {@code .docx} files are walked through their embedded package parts, while
 * {@code .doc} files are walked through the {@code ObjectPool} storage of the OLE2 container.
 * Every candidate payload is validated before it is returned, and a failure on one embedded
 * object is logged and skipped so that it does not abort the whole extraction.
 */
public class WordExcelExtractor {
    private static final Tika TIKA = new Tika();
    /** Matches every character outside printable ASCII (0x20-0x7E). */
    private static final Pattern NON_PRINTABLE_CHAR_PATTERN = Pattern.compile("[^\\x20-\\x7E]");
    /** Payloads smaller than this many bytes are skipped without further inspection. */
    private static final int MIN_EXCEL_SIZE = 100;
    /**
     * Stream names that mark an OLE2 storage as a binary Excel workbook. Several spellings are
     * listed because entry lookup may be case-sensitive depending on the POI version.
     */
    private static final String[] WORKBOOK_STREAM_NAMES = {
        "Workbook", "WORKBOOK", "WorkBook", "BOOK", "Book"
    };

    /**
     * Extracts all embedded Excel workbooks from a Word file.
     *
     * @param wordFile an existing, readable {@code .doc} or {@code .docx} file
     * @return the raw bytes of each extracted workbook, in discovery order; empty if none found
     * @throws IllegalArgumentException if the extension is neither {@code .doc} nor {@code .docx}
     * @throws IOException if the file is missing or unreadable, or the document cannot be parsed
     */
    public static List<byte[]> extractFromWord(File wordFile) throws IOException {
        validateFile(wordFile);
        List<byte[]> excelDataList = new ArrayList<>();
        // Locale.ROOT keeps the extension check independent of the JVM's default locale.
        String fileName = wordFile.getName().toLowerCase(Locale.ROOT);

        try {
            if (fileName.endsWith(".docx")) {
                extractFromDocx(wordFile, excelDataList);
            } else if (fileName.endsWith(".doc")) {
                extractFromDocDirect(wordFile, excelDataList);
            } else {
                throw new IllegalArgumentException("不支持的格式!仅支持 .doc/.docx");
            }
        } catch (IllegalArgumentException e) {
            throw e;
        } catch (Exception e) {
            throw new IOException("提取 Excel 附件失败:" + e.getMessage(), e);
        }
        return excelDataList;
    }

    /**
     * Walks the {@code ObjectPool} storage of a binary {@code .doc} file and collects every
     * embedded Excel workbook found in its sub-storages.
     *
     * <p>A sub-storage that itself carries a workbook stream is rebuilt into a standalone OLE2
     * file. Otherwise each stream inside the sub-storage is read and handed to
     * {@link #extractFromOLE(byte[])}.
     *
     * @param docFile the {@code .doc} file to read
     * @param excelDataList receives the bytes of every workbook that is found
     * @throws IOException if the file cannot be opened as an OLE2 container
     */
    private static void extractFromDocDirect(File docFile, List<byte[]> excelDataList) throws IOException {
        try (FileInputStream fis = new FileInputStream(docFile);
             POIFSFileSystem poifs = new POIFSFileSystem(fis)) {
            DirectoryEntry objectPool = getObjectPoolDirectory(poifs.getRoot());
            if (objectPool == null) {
                System.out.println("无 ObjectPool 目录,无嵌入式附件");
                return;
            }

            for (Entry poolEntry : objectPool) {
                String dirName = filterSpecialChars(poolEntry.getName());
                if (dirName.isEmpty() || !(poolEntry instanceof DirectoryEntry)) {
                    continue;
                }
                DirectoryEntry objectDir = (DirectoryEntry) poolEntry;
                System.out.println("找到 OBJECTPOOL 子目录:" + dirName);

                if (hasWorkbookStream(objectDir)) {
                    // The storage itself is the workbook; its individual streams are not
                    // valid files on their own, so rebuild the container instead.
                    extractNativeWorkbook(objectDir, dirName, excelDataList);
                    continue;
                }
                for (Entry child : objectDir) {
                    processObjectStream(child, dirName, excelDataList);
                }
            }
        }
    }

    /**
     * Copies an embedded workbook storage into a fresh OLE2 container and adds the resulting
     * bytes to the result list. Failures are logged and swallowed so that other embedded
     * objects are still processed.
     */
    private static void extractNativeWorkbook(DirectoryEntry objectDir, String dirName,
                                              List<byte[]> excelDataList) {
        try (POIFSFileSystem standalone = new POIFSFileSystem();
             ByteArrayOutputStream out = new ByteArrayOutputStream()) {
            EntryUtils.copyNodes(objectDir, standalone.getRoot());
            standalone.writeFilesystem(out);
            excelDataList.add(out.toByteArray());
            System.out.println("✅ 成功提取 Excel:" + dirName);
        } catch (Exception e) {
            System.out.println("❌ 处理文件失败,跳过:" + dirName + " → " + e.getMessage());
        }
    }

    /**
     * Reads one stream of an embedded-object storage and, if it holds an Excel payload, adds it
     * to the result list. Non-stream entries, entries whose filtered name is empty, and streams
     * below {@link #MIN_EXCEL_SIZE} are skipped.
     */
    private static void processObjectStream(Entry entry, String dirName, List<byte[]> excelDataList) {
        String fileName = filterSpecialChars(entry.getName());
        if (fileName.isEmpty() || !(entry instanceof DocumentEntry)) {
            return;
        }
        DocumentEntry docEntry = (DocumentEntry) entry;
        String displayPath = dirName + "/" + fileName;
        long fileSize = docEntry.getSize();
        System.out.println("找到文件:" + displayPath + "(大小:" + fileSize + " 字节)");

        if (fileSize < MIN_EXCEL_SIZE) {
            System.out.println("⚠️ 跳过过小文件(非 Excel):" + displayPath);
            return;
        }

        try {
            // Read through the entry handle. A slash-separated path cannot be resolved by
            // POIFSFileSystem#createDocumentInputStream(String), which only looks up names
            // directly under the root.
            byte[] oleData = readDocumentEntryDirect(docEntry);
            if (oleData == null || oleData.length < MIN_EXCEL_SIZE) {
                System.out.println("❌ 读取失败(数据无效):" + displayPath);
                return;
            }
            extractAndAddExcel(oleData, displayPath, excelDataList);
        } catch (Exception e) {
            System.out.println("❌ 处理文件失败,跳过:" + displayPath + " → " + e.getMessage());
        }
    }

    /**
     * Validates a candidate payload and appends the extracted workbook to the result list.
     * Any failure is logged and swallowed.
     *
     * @param oleData raw bytes of the candidate payload
     * @param filePath display path used in log messages only
     * @param excelDataList receives the workbook bytes on success
     */
    private static void extractAndAddExcel(byte[] oleData, String filePath, List<byte[]> excelDataList) {
        try {
            byte[] excelData = extractFromOLE(oleData);
            if (excelData != null) {
                excelDataList.add(excelData);
                System.out.println("✅ 成功提取 Excel:" + filePath);
            } else {
                String fileHeader = getFileHeader(oleData);
                System.out.println("❌ 非 Excel 文件(文件头:" + fileHeader + "):" + filePath);
            }
        } catch (Exception e) {
            System.out.println("❌ 提取 Excel 失败:" + filePath + " → " + e.getMessage());
        }
    }

    /**
     * Reads the full content of a document entry.
     *
     * @param docEntry the stream entry to read
     * @return the complete stream content, or {@code null} if the stream is empty or unreadable
     */
    private static byte[] readDocumentEntryDirect(DocumentEntry docEntry) {
        try (InputStream is = new DocumentInputStream(docEntry)) {
            // readAllBytes loops until end of stream, so a short read cannot leave the
            // tail of the buffer zero-filled.
            byte[] data = is.readAllBytes();
            return data.length > 0 ? data : null;
        } catch (IOException e) {
            System.out.println("⚠️  直接读取字节失败:" + e.getMessage());
            return null;
        }
    }

    /**
     * Returns the Excel workbook contained in {@code oleData}, or {@code null} if there is none.
     *
     * <ul>
     *   <li>ZIP-based data is accepted only if it opens as an {@link XSSFWorkbook}.</li>
     *   <li>An OLE2 container with a {@code Package} or {@code Contents} stream is treated as a
     *       wrapper; the stream content is validated by a recursive call.</li>
     *   <li>An OLE2 container with a workbook stream is returned as-is.</li>
     *   <li>Anything else is accepted only if Tika reports an Excel/spreadsheet media type.</li>
     * </ul>
     *
     * @param oleData candidate bytes; may be {@code null}
     * @return bytes of a standalone workbook, or {@code null}
     */
    private static byte[] extractFromOLE(byte[] oleData) {
        // Cheap magic-byte filter before any parsing.
        if (!isExcelFile(oleData)) {
            return null;
        }

        if (isXlsxFile(oleData)) {
            // ZIP magic alone also matches .docx/.pptx, so require that the package
            // actually opens as a spreadsheet.
            try (OPCPackage opcPackage = OPCPackage.open(new ByteArrayInputStream(oleData));
                 XSSFWorkbook workbook = new XSSFWorkbook(opcPackage)) {
                return oleData;
            } catch (Exception e) {
                System.out.println("⚠️  无效的 .xlsx 文件:" + e.getMessage());
                return null;
            }
        }

        try (POIFSFileSystem poifs = new POIFSFileSystem(new ByteArrayInputStream(oleData))) {
            DirectoryEntry root = poifs.getRoot();

            String payloadName = null;
            if (root.hasEntry("Package")) {
                payloadName = "Package";
            } else if (root.hasEntry("Contents")) {
                payloadName = "Contents";
            }
            if (payloadName != null) {
                try (InputStream is = poifs.createDocumentInputStream(payloadName)) {
                    // Validate the wrapped payload with the same rules as a top-level one.
                    return extractFromOLE(is.readAllBytes());
                }
            }

            // OLE2 magic is shared by .doc/.ppt/.msg and others; only a workbook stream
            // identifies the container as a binary Excel file.
            if (hasWorkbookStream(root)) {
                return oleData;
            }
        } catch (NotOLE2FileException e) {
            System.out.println("⚠️  非 OLE2 格式文件:" + e.getMessage());
        } catch (IOException e) {
            System.out.println("⚠️  解析 .xls 文件失败:" + e.getMessage());
        }

        // Last resort: let Tika decide from the content.
        String fileType = TIKA.detect(oleData, "");
        if (fileType.contains("excel") || fileType.contains("spreadsheet")) {
            return oleData;
        }

        return null;
    }

    /** Returns whether the directory directly contains a stream named like an Excel workbook. */
    private static boolean hasWorkbookStream(DirectoryEntry dir) {
        for (String name : WORKBOOK_STREAM_NAMES) {
            if (dir.hasEntry(name)) {
                return true;
            }
        }
        return false;
    }

    /**
     * Returns whether the data starts with the OLE2 compound-file magic {@code D0 CF 11 E0}.
     * This is the container format of .xls, but also of other OLE2-based files.
     */
    private static boolean isXlsFile(byte[] data) {
        if (data == null || data.length < 4) {
            return false;
        }
        return data[0] == (byte) 0xD0 && data[1] == (byte) 0xCF
                && data[2] == (byte) 0x11 && data[3] == (byte) 0xE0;
    }

    /**
     * Returns whether the data starts with the ZIP local-file magic {@code 50 4B 03 04}.
     * This is the container format of .xlsx, but also of every other ZIP-based file.
     */
    private static boolean isXlsxFile(byte[] data) {
        if (data == null || data.length < 4) {
            return false;
        }
        return data[0] == (byte) 0x50 && data[1] == (byte) 0x4B
                && data[2] == (byte) 0x03 && data[3] == (byte) 0x04;
    }

    // ---------------------- Utility methods ----------------------

    /**
     * Looks up the embedded-object storage under the root, trying both {@code ObjectPool} and
     * {@code OBJECTPOOL}.
     *
     * @return the storage, or {@code null} if neither name exists
     * @throws IOException if the entry lookup fails
     */
    private static DirectoryEntry getObjectPoolDirectory(DirectoryEntry root) throws IOException {
        if (root.hasEntry("ObjectPool")) {
            return (DirectoryEntry) root.getEntry("ObjectPool");
        } else if (root.hasEntry("OBJECTPOOL")) {
            return (DirectoryEntry) root.getEntry("OBJECTPOOL");
        }
        return null;
    }

    /** Strips every character outside printable ASCII; {@code null} becomes an empty string. */
    private static String filterSpecialChars(String name) {
        return name == null ? "" : NON_PRINTABLE_CHAR_PATTERN.matcher(name).replaceAll("");
    }

    /** Returns whether the content type is one of the three recognised Excel media types. */
    private static boolean isExcelContentType(String contentType) {
        // Constant-first equals keeps this null-safe.
        return "application/vnd.ms-excel".equals(contentType)
                || "application/vnd.openxmlformats-officedocument.spreadsheetml.sheet".equals(contentType)
                || "application/vnd.ms-excel.sheet.macroEnabled.12".equals(contentType);
    }

    /** Returns whether the data carries OLE2 or ZIP magic bytes (a container check only). */
    private static boolean isExcelFile(byte[] data) {
        return isXlsFile(data) || isXlsxFile(data);
    }

    /** Formats the first four bytes as upper-case hex for log messages. */
    private static String getFileHeader(byte[] data) {
        if (data.length < 4) return "不足4字节";
        StringBuilder sb = new StringBuilder();
        for (int i = 0; i < 4; i++) {
            sb.append(String.format("%02X ", data[i]));
        }
        return sb.toString().trim();
    }

    /**
     * Verifies that the path exists, is a regular file, and is readable.
     *
     * @throws FileNotFoundException if the path does not exist
     * @throws IOException if the path is not a regular file or cannot be read
     */
    private static void validateFile(File file) throws IOException {
        if (!file.exists()) throw new FileNotFoundException("文件不存在:" + file.getAbsolutePath());
        if (!file.isFile()) throw new IOException("路径不是文件:" + file.getAbsolutePath());
        if (!file.canRead()) throw new IOException("文件不可读:" + file.getAbsolutePath());
    }

    /**
     * Collects embedded Excel workbooks from a {@code .docx} file.
     *
     * <p>Parts with an Excel content type are taken as-is. Every other part is passed through
     * {@link #extractFromOLE(byte[])}, which unwraps OLE containers and rejects payloads that
     * are not spreadsheets.
     *
     * @param docxFile the {@code .docx} file to read
     * @param excelDataList receives the bytes of every workbook that is found
     * @throws IOException if the document cannot be opened or its parts cannot be enumerated
     */
    private static void extractFromDocx(File docxFile, List<byte[]> excelDataList) throws IOException {
        // Open read-only: with the default READ_WRITE access, closing the document saves
        // the package back over the source file.
        try (XWPFDocument doc = new XWPFDocument(OPCPackage.open(docxFile, PackageAccess.READ))) {
            for (PackagePart part : doc.getAllEmbeddedParts()) {
                String contentType = part.getContentType();
                try (InputStream is = part.getInputStream()) {
                    byte[] data = is.readAllBytes();
                    if (data.length < MIN_EXCEL_SIZE) continue;

                    byte[] excelData = isExcelContentType(contentType) ? data : extractFromOLE(data);
                    if (excelData != null) {
                        excelDataList.add(excelData);
                        System.out.println("✅ 提取 .docx 中的 Excel 附件");
                    }
                } catch (Exception e) {
                    System.out.println("❌ 处理 .docx 附件失败:" + e.getMessage());
                }
            }
        } catch (Exception e) {
            throw new IOException("解析 .docx 文件失败:" + e.getMessage(), e);
        }
    }
}
相关推荐
百锦再3 小时前
第6章 结构体与方法
android·java·c++·python·rust·go
lang201509283 小时前
Maven 4:20年老工具的重生之路
java·maven
音符犹如代码3 小时前
ArrayList常见面试题二
java·开发语言·面试·职场和发展
JanelSirry3 小时前
Java + Spring Boot + Redis技术栈,在实际使用缓存时遇到 缓存击穿、缓存穿透、缓存雪崩
java·spring boot·缓存
NO.10243 小时前
11.4八股
java·linux·数据库
天工无极3 小时前
基于Spring AI实现法律咨询AI助手
java
乐悠小码3 小时前
Java设计模式精讲---01工厂方法模式
java·设计模式·工厂方法模式
cherry--3 小时前
集合(开发重点)
java·开发语言
寻星探路3 小时前
测试开发话题10---自动化测试常用函数(2)
java·前端·python