LangChain4j 企业知识库实战:PDF 解析、OCR 与文档加载器生态
前言
企业知识库建设面临复杂文档处理挑战:PDF 表格、扫描件 OCR、多格式文档解析。LangChain4j 提供了丰富的文档加载器生态,支持 Apache Tika、PDFBox、Tesseract OCR 等工具,构建完整的文档处理链路。
一、文档加载器生态
1.1 支持的文档类型
| 格式 | 加载器 | 特性 |
|---|---|---|
| PDF | ApachePdfDocumentLoader, PdfBoxDocumentLoader | 文本、表格、元数据 |
| Word | ApacheTikaDocumentLoader | .doc, .docx |
| Excel | ApacheTikaDocumentLoader | .xls, .xlsx |
| HTML | HtmlDocumentLoader | 网页解析 |
| Markdown | MarkdownDocumentLoader | .md 文件 |
| TXT | TextDocumentLoader | 纯文本 |
1.2 基础加载示例
java
import dev.langchain4j.data.document.Document;
import dev.langchain4j.data.document.DocumentLoader;
import dev.langchain4j.data.document.loader.FileSystemDocumentLoader;
// 1. Load a single PDF file
Document reportPdf = FileSystemDocumentLoader.loadDocument(
    "docs/report.pdf",
    DocumentType.PDF
);

// 2. Load a Word document
Document specification = FileSystemDocumentLoader.loadDocument(
    "docs/spec.docx",
    DocumentType.DOCX
);

// 3. Load an Excel workbook
Document spreadsheet = FileSystemDocumentLoader.loadDocument(
    "docs/data.xlsx",
    DocumentType.XLSX
);

// 4. Load every supported document under a directory at once
List<Document> allDocuments = FileSystemDocumentLoader.loadDocuments(
    "docs/",
    DocumentType.PDF,
    DocumentType.DOCX,
    DocumentType.TXT
);
二、PDF 解析
2.1 Apache Tika 集成
xml
<dependency>
<groupId>org.apache.tika</groupId>
<artifactId>tika-core</artifactId>
<version>2.9.1</version>
</dependency>
java
import org.apache.tika.Tika;
import org.apache.tika.metadata.Metadata;
Tika tika = new Tika();
// Parse the PDF into plain text (Tika auto-detects the file format)
String content = tika.parseToString(new File("docs/report.pdf"));
// To extract metadata, pass a Metadata object that Tika fills in while parsing
Metadata metadata = new Metadata();
metadata.set(Metadata.RESOURCE_NAME_KEY, "report.pdf");
String text = tika.parseToString(new File("docs/report.pdf"), metadata);
// NOTE(review): in Tika 2.x these properties are exposed as TikaCoreProperties
// ("dc:title", "dc:creator", "dcterms:created"); bare "title"/"author"/"created"
// keys may return null — confirm against the Tika version in use.
System.out.println("标题: " + metadata.get("title"));
System.out.println("作者: " + metadata.get("author"));
System.out.println("创建日期: " + metadata.get("created"));
2.2 PDFBox 表格提取
xml
<dependency>
<groupId>org.apache.pdfbox</groupId>
<artifactId>pdfbox</artifactId>
<version>2.0.29</version>
</dependency>
java
import org.apache.pdfbox.pdmodel.PDDocument;
import org.apache.pdfbox.text.PDFTextStripper;
public class PdfTableExtractor {

    /**
     * Extracts the full text (table content included, as flat text) from a PDF.
     *
     * @param pdfPath path to the PDF file on disk
     * @return the extracted text of all pages
     * @throws IOException if the file cannot be opened or parsed
     */
    public String extractTables(String pdfPath) throws IOException {
        // try-with-resources guarantees the document is closed even when
        // getText() throws — the original leaked the PDDocument on failure.
        try (PDDocument document = PDDocument.load(new File(pdfPath))) {
            PDFTextStripper stripper = new PDFTextStripper();
            return stripper.getText(document);
        }
    }
}
2.3 表格结构化提取
java
public class TableProcessor {

    /**
     * Parses pipe-delimited ("|") tabular text into structured rows, where each
     * row maps a trimmed header cell to the corresponding trimmed value.
     *
     * <p>The first non-blank line is treated as the header row; every following
     * non-blank line becomes one data row. Cells beyond the shorter of
     * header/value counts are ignored.
     *
     * @param text raw table text, e.g. {@code "Name|Age\nAlice|30"}; may be
     *             {@code null} or empty
     * @return one map per data row; an empty list when there is nothing to parse
     */
    public List<Map<String, String>> extractStructuredTables(String text) {
        List<Map<String, String>> tables = new ArrayList<>();
        // Guard: the original threw on null and indexed lines[0] blindly.
        if (text == null || text.trim().isEmpty()) {
            return tables;
        }
        String[] lines = text.split("\n");
        // Header row = first non-blank line.
        int headerIndex = 0;
        while (headerIndex < lines.length && lines[headerIndex].trim().isEmpty()) {
            headerIndex++;
        }
        if (headerIndex >= lines.length) {
            return tables;
        }
        List<String> headers = Arrays.asList(lines[headerIndex].split("\\|"));
        for (int i = headerIndex + 1; i < lines.length; i++) {
            // Skip blank lines instead of emitting junk single-cell rows.
            if (lines[i].trim().isEmpty()) {
                continue;
            }
            String[] values = lines[i].split("\\|");
            Map<String, String> row = new HashMap<>();
            for (int j = 0; j < headers.size() && j < values.length; j++) {
                row.put(headers.get(j).trim(), values[j].trim());
            }
            tables.add(row);
        }
        return tables;
    }
}
三、OCR 文字识别
3.1 Tesseract OCR 集成
bash
# 安装 Tesseract
# Ubuntu: sudo apt-get install tesseract-ocr
# macOS: brew install tesseract
# Windows: 下载安装包
# 安装中文语言包
# Ubuntu: sudo apt-get install tesseract-ocr-chi-sim
java
import net.sourceforge.tess4j.Tesseract;
import net.sourceforge.tess4j.TesseractException;
public class OcrProcessor {
    // Shared Tesseract engine instance configured once in the constructor.
    private final Tesseract tesseract;

    public OcrProcessor() {
        this.tesseract = new Tesseract();
        // Must point at the directory containing the *.traineddata files.
        // NOTE(review): this path is distro/version specific — make it configurable.
        tesseract.setDatapath("/usr/share/tesseract-ocr/4.00/tessdata/");
        tesseract.setLanguage("chi_sim+eng"); // Simplified Chinese + English
    }

    /** Runs OCR on a single image file and returns the recognized text. */
    public String recognizeText(String imagePath) throws TesseractException {
        File imageFile = new File(imagePath);
        return tesseract.doOCR(imageFile);
    }

    /**
     * Runs OCR directly on a PDF file.
     * NOTE(review): tess4j's doOCR(File) on a PDF depends on an external
     * renderer (e.g. Ghostscript) being installed; without it this call fails —
     * confirm, or render pages to images first (see PdfOcrProcessor below).
     */
    public String recognizePdf(String pdfPath) throws TesseractException {
        File pdfFile = new File(pdfPath);
        return tesseract.doOCR(pdfFile);
    }
}
3.2 PDF 转 OCR
java
public class PdfOcrProcessor {

    private final OcrProcessor ocrProcessor;

    /**
     * @param ocrProcessor the OCR engine wrapper used for each rendered page.
     *        (The original declared this final field but never initialized it,
     *        which does not compile — it must be injected.)
     */
    public PdfOcrProcessor(OcrProcessor ocrProcessor) {
        this.ocrProcessor = ocrProcessor;
    }

    /**
     * OCRs a scanned (image-only) PDF: renders each page to a PNG, runs
     * Tesseract over it, and concatenates the per-page texts.
     *
     * @param pdfPath path to the scanned PDF
     * @return recognized text, pages separated by blank lines
     * @throws Exception if rendering or OCR fails
     */
    public String processScannedPdf(String pdfPath) throws Exception {
        // 1. Render PDF pages to images
        List<BufferedImage> images = pdfToImages(pdfPath);
        // 2. OCR each page
        StringBuilder sb = new StringBuilder();
        for (int i = 0; i < images.size(); i++) {
            // Use a real temp file (not "temp_N.png" in the working directory)
            // and delete it in finally so a failing OCR call no longer leaks it.
            File tempFile = File.createTempFile("ocr_page_" + i + "_", ".png");
            try {
                ImageIO.write(images.get(i), "png", tempFile);
                String text = ocrProcessor.recognizeText(tempFile.getAbsolutePath());
                sb.append(text).append("\n\n");
            } finally {
                tempFile.delete();
            }
        }
        return sb.toString();
    }

    /**
     * Renders each page of the PDF to a BufferedImage.
     * NOTE(review): placeholder — wire up PDFBox's PDFRenderer here; as written
     * it returns an empty list, so processScannedPdf produces "".
     */
    private List<BufferedImage> pdfToImages(String pdfPath) throws Exception {
        return new ArrayList<>();
    }
}
四、1.4.0 新增文档加载器
4.1 Office 文档专用解析器
java
import dev.langchain4j.data.document.loader.OfficeDocumentLoader;

// Word document (.docx)
Document word = OfficeDocumentLoader.loadWord(new File("docs/spec.docx"));
// Excel workbook (.xlsx)
Document workbook = OfficeDocumentLoader.loadExcel(new File("docs/data.xlsx"));
// PowerPoint deck (.pptx)
Document slides = OfficeDocumentLoader.loadPowerPoint(new File("docs/slides.pptx"));
4.2 HTML/Markdown 专用解析器
java
import dev.langchain4j.data.document.loader.HtmlDocumentLoader;
import dev.langchain4j.data.document.loader.MarkdownDocumentLoader;

// Fetch and parse an HTML page from a URL
Document pageDoc = HtmlDocumentLoader.load(new URL("https://example.com"));
// Parse a local Markdown file
Document guideDoc = MarkdownDocumentLoader.load(new File("docs/guide.md"));
五、增量更新
5.1 基于文档哈希的去重
java
public class DocumentIngestor {
    // NOTE(review): these two final fields are never assigned and the class has
    // no constructor — this does not compile as written. Inject both via a
    // constructor (e.g. DocumentIngestor(EmbeddingStore, EmbeddingModel)); note
    // that callers elsewhere in this article use `new DocumentIngestor()`.
    private final EmbeddingStore<TextSegment> embeddingStore;
    private final EmbeddingModel embeddingModel;
    // In-memory path -> content-hash map used for change detection.
    // NOTE(review): lost on restart, so every document is re-ingested after a
    // reboot; persist this map if that matters.
    private final Map<String, String> documentHashes = new ConcurrentHashMap<>();

    /**
     * Ingests a document idempotently: skips it when its content hash is
     * unchanged since the last run, otherwise removes the previously stored
     * embeddings and re-processes the file. Exceptions are logged, not thrown.
     */
    public void ingestDocument(String docPath) {
        try {
            // 1. Hash the file content (SHA-256, Base64-encoded)
            String fileHash = calculateFileHash(docPath);
            // 2. Skip when this path was already ingested with the same hash
            if (documentHashes.containsKey(docPath) &&
                documentHashes.get(docPath).equals(fileHash)) {
                System.out.println("文档未变更,跳过: " + docPath);
                return;
            }
            // 3. Load the document
            // NOTE(review): DocumentType.PDF is hard-coded — non-PDF paths
            // (e.g. .docx passed by EnterpriseKnowledgeBase) would be parsed as
            // PDF; derive the type from the file extension instead.
            Document document = FileSystemDocumentLoader.loadDocument(
                docPath, DocumentType.PDF
            );
            // 4. Drop embeddings belonging to the previous version of this file
            if (documentHashes.containsKey(docPath)) {
                removeOldDocument(docPath);
            }
            // 5. Split/embed/store the new content
            // NOTE(review): processDocument is not defined in this snippet.
            processDocument(document, docPath);
            // 6. Record the new hash only after successful processing
            documentHashes.put(docPath, fileHash);
        } catch (Exception e) {
            // Best-effort: log and continue with other documents
            System.err.println("处理文档失败: " + docPath);
            e.printStackTrace();
        }
    }

    /** Returns the Base64-encoded SHA-256 hash of the file's raw bytes. */
    private String calculateFileHash(String filePath) throws Exception {
        MessageDigest md = MessageDigest.getInstance("SHA-256");
        byte[] fileBytes = Files.readAllBytes(Paths.get(filePath));
        byte[] hashBytes = md.digest(fileBytes);
        return Base64.getEncoder().encodeToString(hashBytes);
    }

    /** Removes every stored segment whose "source" metadata equals docPath. */
    private void removeOldDocument(String docPath) {
        // NOTE(review): in langchain4j the filter builder is
        // MetadataFilterBuilder.metadataKey(...), not Metadata.metadataKey —
        // confirm this call and its import against the library version in use.
        Filter filter = Metadata.metadataKey("source").isEqualTo(docPath);
        embeddingStore.removeAll(filter);
    }
}
5.2 定时增量更新
java
import java.io.IOException;
import java.nio.file.Files;
import java.nio.file.Path;
import java.nio.file.Paths;
import java.util.concurrent.Executors;
import java.util.concurrent.ScheduledExecutorService;
import java.util.concurrent.TimeUnit;
import java.util.stream.Stream;
public class DocumentUpdater {

    private final DocumentIngestor ingestor;
    private final ScheduledExecutorService scheduler;

    public DocumentUpdater() {
        this.ingestor = new DocumentIngestor();
        this.scheduler = Executors.newScheduledThreadPool(1);
    }

    /**
     * Periodically re-ingests every regular file directly under {@code docsDir}.
     * Runs immediately, then at the given fixed rate.
     *
     * @param docsDir  directory to scan on each tick
     * @param interval delay between runs, in {@code unit}
     * @param unit     time unit for {@code interval}
     */
    public void startScheduledUpdate(String docsDir, long interval, TimeUnit unit) {
        scheduler.scheduleAtFixedRate(() -> {
            // try-with-resources: Files.list holds an open directory handle
            // until the stream is closed (the original leaked it every tick).
            try (Stream<Path> files = Files.list(Paths.get(docsDir))) {
                files.filter(Files::isRegularFile)
                     .forEach(path -> ingestor.ingestDocument(path.toString()));
            } catch (Exception e) {
                // Catch broadly: an uncaught RuntimeException would silently
                // cancel all future runs of a scheduleAtFixedRate task.
                e.printStackTrace();
            }
        }, 0, interval, unit);
    }

    /** Stops the background update loop. Call on application shutdown. */
    public void shutdown() {
        scheduler.shutdown();
    }
}
六、完整实战
6.1 企业知识库构建
java
public class EnterpriseKnowledgeBase {

    private final DocumentIngestor ingestor;
    private final RAGPipeline ragPipeline;

    public EnterpriseKnowledgeBase() {
        this.ingestor = new DocumentIngestor();
        this.ragPipeline = new RAGPipeline();
        // Eagerly index everything under docs/ so queries work immediately.
        initializeKnowledgeBase();
    }

    /**
     * Scans the docs/ directory once and ingests every supported document
     * (.pdf and .docx). Errors are logged, not propagated.
     */
    private void initializeKnowledgeBase() {
        String docsDir = "docs/";
        // try-with-resources: Files.list keeps a directory handle open until
        // the stream is closed (the original leaked it).
        try (Stream<Path> files = Files.list(Paths.get(docsDir))) {
            files.filter(Files::isRegularFile)
                 .map(Path::toString)
                 // Both branches of the original if/else-if called the same
                 // method — collapsed into a single suffix filter.
                 .filter(p -> p.endsWith(".pdf") || p.endsWith(".docx"))
                 .forEach(ingestor::ingestDocument);
        } catch (IOException e) {
            e.printStackTrace();
        }
    }

    /** Answers a question against the ingested knowledge base via RAG. */
    public String query(String question) {
        return ragPipeline.query(question);
    }
}
七、小结
本文介绍了 LangChain4j 企业知识库实战:
- 文档加载器生态:PDF、Word、Excel、HTML、Markdown
- PDF 解析:Apache Tika、PDFBox、表格提取
- OCR 文字识别:Tesseract OCR、扫描件处理
- 1.4.0 新增加载器:Office、HTML、Markdown 专用解析器
- 增量更新:文档哈希去重、定时更新
下一步学习:
- 文章 10:《LangChain4j 记忆架构:ChatMemory、持久化与跨会话状态》