Word、PDF 文档内容提取工具类
1.依赖(除下列依赖外,代码还使用了 Hutool 的 cn.hutool.core.util.StrUtil,需另行引入 hutool-core 或 hutool-all)
xml
<dependency>
<groupId>org.apache.poi</groupId>
<artifactId>poi</artifactId>
<version>5.2.5</version>
</dependency>
<dependency>
<groupId>org.apache.poi</groupId>
<artifactId>poi-scratchpad</artifactId>
<version>5.2.5</version>
</dependency>
<dependency>
<groupId>org.apache.poi</groupId>
<artifactId>poi-ooxml</artifactId>
<version>5.2.5</version> <!-- 解析 DOCX(XWPFDocument)所需,本工具类必须引入 -->
</dependency>
<dependency>
<groupId>org.apache.pdfbox</groupId>
<artifactId>pdfbox</artifactId>
<version>3.0.5</version>
</dependency>
2.代码
java
import cn.hutool.core.util.StrUtil;
import org.apache.pdfbox.Loader;
import org.apache.pdfbox.pdmodel.PDDocument;
import org.apache.pdfbox.text.PDFTextStripper;
import org.apache.poi.hwpf.HWPFDocument;
import org.apache.poi.hwpf.extractor.WordExtractor;
import org.apache.poi.xwpf.usermodel.IBodyElement;
import org.apache.poi.xwpf.usermodel.XWPFDocument;
import org.apache.poi.xwpf.usermodel.XWPFParagraph;
import org.apache.poi.xwpf.usermodel.XWPFTable;
import java.io.ByteArrayInputStream;
import java.io.ByteArrayOutputStream;
import java.io.IOException;
import java.io.InputStream;
import java.nio.charset.StandardCharsets;
import java.util.List;
import java.util.Locale;
/**
 * Utility for extracting plain text from PDF, DOCX and legacy DOC documents.
 *
 * <p>Any other file type is treated as UTF-8 encoded plain text.
 *
 * <p>Created 2025/9/29 17:32.
 */
public class DocumentTextExtractUtil {

    /**
     * Reads the whole stream into memory and extracts its text according to {@code fileType}.
     *
     * <p>The stream is consumed but not closed here; closing it stays with the caller.
     *
     * @param inputStream document content; {@code null} yields an empty string
     * @param fileType a file extension ({@code "pdf"}, {@code ".docx"}, {@code "DOC"}) or a MIME
     *     type ({@code "application/pdf"}, {@code "application/msword"}, ...), case-insensitive
     * @return the extracted text, trimmed for pdf/docx/doc; for every other type the bytes
     *     decoded as UTF-8, unmodified
     * @throws IllegalArgumentException if {@code fileType} is blank according to
     *     {@code StrUtil.isBlank}
     * @throws IOException if the stream cannot be read or a parser reports an I/O failure
     */
    public static String extractText(InputStream inputStream, String fileType) throws IOException {
        if (inputStream == null) {
            return "";
        }
        if (StrUtil.isBlank(fileType)) {
            throw new IllegalArgumentException("文件类型不能为空");
        }
        String normalizedType = normalizeFileType(fileType);
        // All parsers below work on an in-memory copy, so the stream is drained exactly once.
        byte[] data = inputStream.readAllBytes();
        return switch (normalizedType) {
            case "pdf" -> extractPdfText(data);
            case "docx" -> extractDocxText(data);
            case "doc" -> extractDocText(data);
            // Unknown types are assumed to be plain text.
            default -> new String(data, StandardCharsets.UTF_8);
        };
    }

    /**
     * Reduces an extension or MIME type to the short key used by {@link #extractText}.
     *
     * <p>Handles a leading dot, MIME parameters ({@code "; charset=..."}), structured-syntax
     * suffixes ({@code "+xml"}) and the registered Word/PDF MIME subtypes, which do not spell
     * out the file extension.
     *
     * @param fileType non-blank extension or MIME type
     * @return lower-case type key such as {@code "pdf"}, {@code "docx"} or {@code "doc"}
     */
    private static String normalizeFileType(String fileType) {
        String type = fileType.trim().toLowerCase(Locale.ROOT);
        if (type.startsWith(".")) {
            type = type.substring(1);
        }
        int slash = type.indexOf('/');
        if (slash < 0) {
            return type;
        }

        String subtype = type.substring(slash + 1);
        // "application/pdf; charset=binary" -> "pdf"
        int semicolon = subtype.indexOf(';');
        if (semicolon >= 0) {
            subtype = subtype.substring(0, semicolon).trim();
        }
        // "application/xhtml+xml" -> "xhtml"
        int plus = subtype.indexOf('+');
        if (plus >= 0) {
            subtype = subtype.substring(0, plus);
        }
        // MIME subtypes whose name differs from the file extension; without this mapping
        // Word documents sent with their real MIME type would be decoded as raw UTF-8 text.
        return switch (subtype) {
            case "msword" -> "doc";
            case "vnd.openxmlformats-officedocument.wordprocessingml.document" -> "docx";
            case "x-pdf" -> "pdf";
            default -> subtype;
        };
    }

    /**
     * Extracts the text of a PDF, sorted by position on the page.
     *
     * @param data raw PDF bytes
     * @return trimmed text, or an empty string when the stripper returns {@code null}
     * @throws IOException if the PDF cannot be loaded or read
     */
    private static String extractPdfText(byte[] data) throws IOException {
        // PDFBox 3.x loads documents through Loader.loadPDF.
        try (PDDocument doc = Loader.loadPDF(data)) {
            PDFTextStripper stripper = new PDFTextStripper();
            stripper.setSortByPosition(true);
            String text = stripper.getText(doc);
            return text == null ? "" : text.trim();
        }
    }

    /**
     * Extracts the text of a DOCX body: paragraphs and tables, in document order.
     *
     * <p>Empty paragraphs are skipped; every non-empty paragraph is followed by a newline.
     * Table text is taken as rendered by {@code XWPFTable#getText()}.
     *
     * @param data raw DOCX bytes
     * @return trimmed text
     * @throws IOException if the document cannot be opened
     */
    private static String extractDocxText(byte[] data) throws IOException {
        try (XWPFDocument doc = new XWPFDocument(new ByteArrayInputStream(data))) {
            StringBuilder sb = new StringBuilder();
            // getParagraphs() would skip tables entirely, so walk the body elements instead.
            for (IBodyElement element : doc.getBodyElements()) {
                if (element instanceof XWPFParagraph) {
                    String paragraphText = ((XWPFParagraph) element).getText();
                    if (paragraphText != null && !paragraphText.isEmpty()) {
                        sb.append(paragraphText).append('\n');
                    }
                } else if (element instanceof XWPFTable) {
                    String tableText = ((XWPFTable) element).getText();
                    if (tableText != null && !tableText.isEmpty()) {
                        sb.append(tableText);
                        // Keep the following element on its own line.
                        if (tableText.charAt(tableText.length() - 1) != '\n') {
                            sb.append('\n');
                        }
                    }
                }
            }
            return sb.toString().trim();
        }
    }

    /**
     * Extracts the text of a legacy binary DOC file, one non-empty paragraph per line.
     *
     * @param data raw DOC bytes
     * @return trimmed text
     * @throws IOException if the document cannot be opened
     */
    private static String extractDocText(byte[] data) throws IOException {
        try (HWPFDocument doc = new HWPFDocument(new ByteArrayInputStream(data))) {
            WordExtractor extractor = new WordExtractor(doc);
            String[] paragraphs = extractor.getParagraphText();
            StringBuilder sb = new StringBuilder();
            if (paragraphs != null) {
                for (String paragraph : paragraphs) {
                    if (paragraph == null) {
                        continue;
                    }
                    // Drop NUL characters before trimming.
                    String clean = paragraph.replace("\0", "").trim();
                    if (!clean.isEmpty()) {
                        sb.append(clean).append('\n');
                    }
                }
            }
            return sb.toString().trim();
        }
    }
}
3.如何使用
java
// Wrap the already-loaded bytes in a stream and hand them to the extractor.
try (InputStream textStream = new ByteArrayInputStream(content)) {
    // No manual lower-casing: extractText normalizes the type itself (Locale.ROOT) and
    // rejects a blank type with a descriptive IllegalArgumentException.
    String text = DocumentTextExtractUtil.extractText(textStream, fileType);
} catch (Exception e) {
    // Best effort: a failed extraction is logged and otherwise ignored.
    log.warn("文本提取失败,文件: {},类型: {}", fileName, fileType, e);
}