添加依赖:
xml
<dependency>
<groupId>org.springframework.ai</groupId>
<artifactId>spring-ai-tika-document-reader</artifactId>
</dependency>
Java代码:
java
package com.zkwm.springai.rag;
import org.springframework.ai.document.Document;
import org.springframework.ai.reader.tika.TikaDocumentReader;
import org.springframework.ai.transformer.splitter.TokenTextSplitter;
import org.springframework.ai.vectorstore.VectorStore;
import org.springframework.beans.factory.annotation.Autowired;
import org.springframework.core.io.FileSystemResource;
import org.springframework.web.bind.annotation.GetMapping;
import org.springframework.web.bind.annotation.RequestMapping;
import org.springframework.web.bind.annotation.RestController;
import java.util.List;
/**
 * REST controller that reads a document from the local file system, splits it
 * into token-bounded chunks, and adds those chunks to the injected
 * {@link VectorStore}.
 */
@RestController
@RequestMapping("/docChunk")
public class DocumentChunkController {

    /** Maximum number of tokens per chunk. */
    private static final int CHUNK_SIZE_TOKENS = 500;

    /** Minimum number of characters per chunk. */
    private static final int MIN_CHUNK_SIZE_CHARS = 50;

    /** Upper bound on the number of chunks produced by the splitter. */
    private static final int MAX_NUM_CHUNKS = 10000;

    @Autowired
    private VectorStore vectorStore;

    /**
     * Reads the file at {@code filePath}, splits its content into chunks and
     * adds the chunks to the vector store.
     *
     * @param filePath path of the document on the server's file system
     * @return the literal string {@code "ok"} once processing has finished
     * @throws IllegalArgumentException if {@code filePath} is null or blank, or
     *         does not point to a readable file
     */
    @GetMapping("/processAndStore")
    public String processAndStore(String filePath) {
        if (filePath == null || filePath.isBlank()) {
            throw new IllegalArgumentException("filePath must not be blank");
        }

        // NOTE(review): filePath is taken directly from the request, so any caller
        // can make the server read an arbitrary local file. Restrict it to an
        // allowed base directory (resolve + normalize + startsWith check) before
        // exposing this endpoint outside a trusted environment.
        FileSystemResource resource = new FileSystemResource(filePath);
        if (!resource.isReadable()) {
            // Fail early with a clear message instead of an opaque parser error.
            throw new IllegalArgumentException(
                    "File does not exist or is not readable: " + filePath);
        }

        // 1. Read the document (PDF / Word / TXT).
        List<Document> documents = new TikaDocumentReader(resource).read();

        // 2. Split the documents into chunks (the core step).
        List<Document> chunks = newSplitter().apply(documents);

        // 3. Add the chunks to the vector store; skip the call when the document
        //    produced no chunks, since there is nothing to store.
        if (!chunks.isEmpty()) {
            vectorStore.add(chunks);
        }
        return "ok";
    }

    /** Builds the token splitter used for chunking, keeping separators in the output. */
    private TokenTextSplitter newSplitter() {
        return TokenTextSplitter.builder()
                .withChunkSize(CHUNK_SIZE_TOKENS)
                .withMinChunkSizeChars(MIN_CHUNK_SIZE_CHARS)
                .withMaxNumChunks(MAX_NUM_CHUNKS)
                .withKeepSeparator(true)
                .build();
    }
}