RAG (Retrieval-Augmented Generation) ETL Best Practices
When building RAG (retrieval-augmented generation) systems, there is a widely shared piece of wisdom: "Garbage In, Garbage Out." Roughly 60% of a system's final performance is determined by the quality of the data-processing (ETL) stage.
This article takes a close look at how to deeply parse and semantically enrich documents of different formats in order to build a production-grade, high-quality knowledge base.
E - Extract
Extraction principles:
- Semantic completeness: the core idea is that it is better to take one paragraph too many than to cut a sentence in half.
- Structure preservation: keep the document's "skeleton" and convert it into a format that large models read easily (such as Markdown).
- Noise removal: thoroughly strip out content that carries no information and only distracts (see the cleanup sketch after this list).
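To make the noise-removal principle concrete, here is a minimal, hypothetical cleanup helper (not part of Spring AI; the patterns are illustrative): it strips page-number-only lines and collapses runs of blank lines before the text moves further down the pipeline.
```java
import java.util.regex.Pattern;

// Hypothetical pre-cleaning step: removes typical extraction noise before chunking.
public final class NoiseCleaner {

    // Lines containing nothing but a page number, e.g. "12" or "- 12 -"
    private static final Pattern PAGE_NUMBER = Pattern.compile("(?m)^\\s*-?\\s*\\d{1,4}\\s*-?\\s*$");
    // Three or more consecutive newlines collapse into a single paragraph break
    private static final Pattern EXTRA_BLANK_LINES = Pattern.compile("\\n{3,}");

    public static String clean(String rawText) {
        String text = PAGE_NUMBER.matcher(rawText).replaceAll("");
        text = EXTRA_BLANK_LINES.matcher(text).replaceAll("\n\n");
        return text.strip();
    }
}
```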
Reader selection strategy
- Choose the matching DocumentReader for each file type
| File type | DocumentReader | Notes |
|---|---|---|
| PDF | Prefer ParagraphPdfDocumentReader; if the PDF lacks a table of contents (TOC), Paragraph mode throws an error, so fall back automatically to PagePdfDocumentReader | Dependency: spring-ai-pdf-document-reader |
| Excel | Extracting Excel as plain text loses the correspondence between headers and values, so convert each row into a {"column name": "value"} JSON object | Custom implementation required: EasyExcelJsonReader |
| Word | Map Word styles to Markdown markers (#, ##). Mammoth extracts semantic HTML from .docx (turning Word heading styles into structured tags), and Flexmark then converts that HTML into Markdown | Custom implementation required: WordMarkdownReader |
| HTML | Start with JsoupDocumentReader; page source is full of tags and scripts unrelated to the content, so use a CSS selector (such as article or #content) to extract only the body text | spring-ai-jsoup-document-reader |
| JSON | JsonReader | |
| Markdown | MarkdownDocumentReader | spring-ai-markdown-document-reader |
| Default fallback | TikaDocumentReader | spring-ai-tika-document-reader |
Extraction code
- A document-reader factory provides a single entry point for all file types
```java
@Slf4j
public class SmartDocumentReaderFactory {
// 默认表头行号
private static final int DEFAULT_EXCEL_HEAD_ROW = 1;
public static List<Document> read(String fileType, Resource resource) {
return read(fileType, resource, DEFAULT_EXCEL_HEAD_ROW);
}
public static List<Document> read(String fileType, Resource resource, int excelHeadRowNumber) {
if (fileType == null) {
return new TikaDocumentReader(resource).read();
}
try {
return switch (fileType) {
case ".pdf" -> readPdfSmartly(resource);
case ".html", ".htm" -> new JsoupDocumentReader(resource).read();
// --- 优化点:针对 Excel 采用自定义 JSON 解析器 ---
case ".xlsx", ".xls" -> new EasyExcelJsonReader(resource, excelHeadRowNumber).read();
// 默认解析整行,可自定义解析 key
case ".json" -> new JsonReader(resource).read();
case ".docx", ".doc" -> new WordMarkdownReader(resource).read();
case ".md", ".markdown" -> new MarkdownDocumentReader(resource,
MarkdownDocumentReaderConfig.builder()
// 保留水平分割线
.withHorizontalRuleCreateDocument(true)
// 包含代码块
.withIncludeCodeBlock(true)
.build()).read();
case ".txt" -> new TextReader(resource).read();
// 默认统一交给 Tika
default -> new TikaDocumentReader(resource).read();
};
} catch (Exception e) {
log.warn("DocumentReader 读取失败fileType:{} 通过TikaDocumentReader兜底", fileType, e);
return new TikaDocumentReader(resource).read();
}
}
/**
* 针对 PDF 的智能读取策略
*/
private static List<Document> readPdfSmartly(Resource resource) {
try {
// 1. 尝试按段落读取(依赖 TOC/目录)
log.info("尝试使用 ParagraphPdfDocumentReader 解析 PDF...");
return new ParagraphPdfDocumentReader(resource,
PdfDocumentReaderConfig.builder()
.withPageTopMargin(0)
.withPageBottomMargin(0)
.build()).read();
} catch (IllegalArgumentException e) {
// 2. 如果报 TOC 缺失错误,自动降级为 Page 模式
if (e.getMessage().contains("Document outline") || e.getMessage().contains("TOC")) {
log.warn("PDF 缺少目录结构,自动降级为 PagePdfDocumentReader 解析");
return new PagePdfDocumentReader(resource,
PdfDocumentReaderConfig.builder()
.withPageTopMargin(0)
.withPageBottomMargin(0)
.build()).read();
}
throw e;
} catch (Exception e) {
// 3. 其他解析异常交由上层 switch 块中的 Tika 最终兜底
throw e;
}
}
}
```
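A quick usage sketch of the factory (the file path is made up; in practice the extension usually comes from the uploaded file's original name):
```java
import java.util.List;
import org.springframework.ai.document.Document;
import org.springframework.core.io.FileSystemResource;
import org.springframework.core.io.Resource;

public class ExtractDemo {
    public static void main(String[] args) {
        // Hypothetical local file; the ".pdf" extension would normally be parsed from the filename
        Resource resource = new FileSystemResource("/data/uploads/product-manual.pdf");
        List<Document> rawDocuments = SmartDocumentReaderFactory.read(".pdf", resource);
        System.out.println("Extracted " + rawDocuments.size() + " raw documents");
    }
}
```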
- WordMarkdownReader
```java
public class WordMarkdownReader implements DocumentReader {
private final Resource resource;
public WordMarkdownReader(Resource resource) {
this.resource = resource;
}
@Override
public List<Document> get() {
try {
String markdownContent = WordToMarkdownConverter.convertToMarkdown(resource);
// 将 Markdown 作为内容,注入元数据
Map<String, Object> metadata = Map.of(
"source", resource.getFilename(),
"file_type", "docx_as_markdown"
);
return List.of(new Document(markdownContent, metadata));
} catch (Exception e) {
throw new RuntimeException("Word转Markdown失败", e);
}
}
}
public class WordToMarkdownConverter {
public static String convertToMarkdown(Resource resource) throws Exception {
try (InputStream is = resource.getInputStream()) {
DocumentConverter converter = new DocumentConverter()
// 核心设置:忽略图片。如果不加这一行,result 里的 HTML 会包含巨大的 Base64 字符串
.imageConverter(image -> Map.of("alt", "[图片已忽略]"))
// 或者直接彻底删除:.addStyleMap("img => ")
.addStyleMap("p[style-name='Heading 1'] => h1:fresh") // 确保标题映射正确
.addStyleMap("p[style-name='Heading 2'] => h2:fresh");
Result<String> result = converter.convertToHtml(is);
String html = result.getValue();
// 转换为 Markdown
return FlexmarkHtmlConverter.builder().build().convert(html);
}
}
}
```
- EasyExcelJsonReader
```java
public class EasyExcelJsonReader implements DocumentReader {
private final Resource resource;
private final int headRowNumber; // 表头行数
private final ObjectMapper objectMapper = new ObjectMapper();
public EasyExcelJsonReader(Resource resource) {
this(resource, 1);
}
public EasyExcelJsonReader(Resource resource, int headRowNumber) {
this.resource = resource;
this.headRowNumber = headRowNumber;
}
@Override
public List<Document> get() {
List<Document> documents = new ArrayList<>();
try {
// 通过 .sheet() 只读取第一个 Sheet;如需读取全部 Sheet,可改用 doReadAll()
EasyExcel.read(resource.getInputStream(), new AnalysisEventListener<Map<Integer, String>>() {
private Map<Integer, String> headerMap = new HashMap<>();
// 获取表头:当读取到 headRowNumber 这一行时,触发此方法
@Override
public void invokeHeadMap(Map<Integer, String> headMap, AnalysisContext context) {
this.headerMap = headMap;
}
@Override
public void invoke(Map<Integer, String> data, AnalysisContext context) {
// 将数据行与表头对应,转化为 JSON
Map<String, Object> rowMap = new LinkedHashMap<>();
for (Map.Entry<Integer, String> entry : headerMap.entrySet()) {
String value = data.get(entry.getKey());
rowMap.put(entry.getValue(), value != null ? value : "");
}
try {
String jsonContent = objectMapper.writeValueAsString(rowMap);
// 提取元数据
Map<String, Object> metadata = new HashMap<>();
metadata.put("source", resource.getFilename());
metadata.put("sheet", context.readSheetHolder().getSheetName());
metadata.put("row_index", context.readRowHolder().getRowIndex() + 1);
documents.add(new Document(jsonContent, metadata));
} catch (Exception e) {
// 忽略转换失败的行
}
}
@Override
public void doAfterAllAnalysed(AnalysisContext context) {}
})
.headRowNumber(headRowNumber) // 关键设置:指定表头在第几行
.sheet()
.doRead();
} catch (IOException e) {
throw new RuntimeException("Excel读取失败", e);
}
return documents;
}
}
```
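A usage sketch (the spreadsheet path and columns are invented for illustration). Each data row becomes one Document whose text is a self-describing JSON object, so the header-to-value mapping survives chunking and retrieval, while the source, sheet and row_index metadata keep the row traceable:
```java
import java.util.List;
import org.springframework.ai.document.Document;
import org.springframework.core.io.FileSystemResource;

public class ExcelDemo {
    public static void main(String[] args) {
        List<Document> rows = new EasyExcelJsonReader(new FileSystemResource("/data/orders.xlsx"), 1).read();
        // A row's text might look like: {"Order ID":"A1001","Customer":"Acme","Amount":"1200.50"}
        rows.forEach(doc -> System.out.println(doc.getText() + " -> " + doc.getMetadata()));
    }
}
```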
T - Transform
- Transformation is where the extracted raw data is cleaned, formatted, de-identified, merged and enriched.
- Let users pick the chunking strategy that fits their data; after the structure-aware split, run the chunks through a TokenTextSplitter once more (see the pipeline sketch below).
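A minimal sketch of that two-stage pipeline, assuming the ParagraphTextSplitter implemented later in this section and Spring AI's built-in TokenTextSplitter; the chunk sizes are illustrative:
```java
import java.util.List;
import org.springframework.ai.document.Document;
import org.springframework.ai.transformer.splitter.TokenTextSplitter;

public class TransformPipelineDemo {
    public static List<Document> transform(List<Document> rawDocuments) {
        // 1. Structure-aware pass: keep paragraphs intact (custom splitter shown below)
        ParagraphTextSplitter paragraphSplitter = ParagraphTextSplitter.builder()
                .chunkSize(800)
                .paragraphOverlapChars(100)
                .build();
        List<Document> paragraphChunks = paragraphSplitter.apply(rawDocuments);

        // 2. Token-level safety net: make sure no chunk exceeds the embedding window
        TokenTextSplitter tokenSplitter = TokenTextSplitter.builder()
                .withChunkSize(512)
                .build();
        return tokenSplitter.apply(paragraphChunks);
    }
}
```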
Choosing a transformation strategy
| Text splitter | Characteristics | Typical use cases |
|---|---|---|
| ParagraphTextSplitter | Paragraph-based splitter; recursive fallback strategy (paragraph -> sentence -> character) | 📝 Splits along natural paragraphs and keeps them intact; good for legal clauses, books, blog posts |
| SemanticTextSplitter | Strategy: sliding-window embeddings + semantic-similarity splitting + forced split at a maximum length. It keeps packing sentences into the current chunk (for context completeness) and only cuts once the chunk is full (over the length limit) or the topic shifts (semantic break) | 🧠 Semantic-similarity chunking that finds topic boundaries automatically; good for core technical docs, in-depth papers, research material |
| SentenceSplitter | Makes up for the weaknesses of character-based splitting (which breaks semantics) and paragraph-based splitting (which can produce oversized chunks) | ✨ Keeps every sentence intact so meaning is never cut mid-sentence; good for FAQ libraries, support scripts, real-time dialogue |
| RecursiveCharacterTextSplitter | Uses a prioritized list of separators (from coarse to fine) to recursively split text into chunks that are as large as possible without exceeding chunkSize, preserving semantic continuity | 📚 The balanced choice; keeps document structure (paragraphs, sections); good for mixed-layout PDFs, Word files, general enterprise documents |
| TokenTextSplitter | Splits by token count rather than by characters or paragraphs | ⚡ Fastest; fixed token-count chunks; good for code bases, raw logs, mixed-language text |
- ParagraphTextSplitter
```java
@Slf4j
@Builder
public class ParagraphTextSplitter extends TextSplitter {
private final int chunkSize;
/**
* 段落重叠字符数
*/
private final int paragraphOverlapChars;
/**
* 段落分隔符:至少两个连续的换行符
*/
private static final Pattern PARAGRAPH_PATTERN = Pattern.compile("\\n\\s*\\n+");
private static final Pattern sentencePattern = Pattern.compile("[^。!?.!?\\n]+[。!?.!?\\n]*");
@Override
public List<String> splitText(String text) {
if (text == null || text.trim().isEmpty()) {
return List.of();
}
// 1. 按段落粗切
String[] paragraphs = PARAGRAPH_PATTERN.split(text);
log.debug("Split text into {} paragraphs", paragraphs.length);
List<String> chunks = new ArrayList<>();
StringBuilder currentChunk = new StringBuilder();
for (String paragraph : paragraphs) {
String trimmedParagraph = paragraph.trim();
if (trimmedParagraph.isEmpty()) {
continue;
}
// --- 情况 A: 遇到超大段落 (递归处理) ---
if (trimmedParagraph.length() > chunkSize) {
log.debug("Processing large paragraph length: {}", trimmedParagraph.length());
// 1. 先结算当前缓存区 (Buffer),确保之前的上下文不丢失
if (currentChunk.length() > 0) {
chunks.add(currentChunk.toString().trim());
// 提取 Overlap 留给大段落的开头使用
currentChunk = extractOverlap(currentChunk.toString());
}
// 2. 切分大段落
// 注意:这里切分出来的 subChunks 每一个都已经接近 chunkSize 了
List<String> subChunks = splitLargeParagraph(trimmedParagraph);
// 3. 逐个处理子块
for (String subChunk : subChunks) {
// 检查:Overlap + 当前子块 是否会超限?
int potentialLen = currentChunk.length() + (currentChunk.length() > 0 ? 2 : 0) + subChunk.length();
if (potentialLen > chunkSize) {
// 如果加上 Overlap 会超限,那也没办法,只能舍弃 Overlap (或者先把 Overlap
// 存为一个独立块,但这通常没必要)
// 我们选择:强制结算 Overlap (如果有的话),然后重新开始当前 subChunk
if (currentChunk.length() > 0) {
// 这里通常意味着 Overlap 本身就挺大,或者 subChunk 很大
// 为了简单,我们放弃将 Overlap 拼接到这个巨型 subChunk 前面,防止溢出
// 而是直接让 subChunk 独立成块
currentChunk = new StringBuilder();
}
}
if (currentChunk.length() > 0) {
currentChunk.append("\n\n");
}
currentChunk.append(subChunk);
// 当前子块处理完,立即结算,为下一个子块准备 overlap
chunks.add(currentChunk.toString().trim());
currentChunk = extractOverlap(currentChunk.toString());
}
continue; // 大段落处理完毕,进入下一个循环
}
// --- 情况 B: 普通段落 (积累处理) ---
// 计算添加这个段落后的总长度
int separatorLength = currentChunk.length() > 0 ? 2 : 0;
int potentialLength = currentChunk.length() + separatorLength + trimmedParagraph.length();
// 如果加上当前段落会超过 chunkSize,先保存当前块
if (potentialLength > chunkSize && currentChunk.length() > 0) {
chunks.add(currentChunk.toString().trim());
// 提取 overlap 内容作为新块的开始
currentChunk = extractOverlap(currentChunk.toString());
}
// 添加当前段落
if (currentChunk.length() > 0) {
currentChunk.append("\n\n");
}
currentChunk.append(trimmedParagraph);
}
// 处理最后的尾巴
if (currentChunk.length() > 0) {
chunks.add(currentChunk.toString().trim());
}
log.info("Created {} paragraph chunks", chunks.size());
return chunks;
}
/**
* 从已完成的块中提取 overlap 内容 策略:尝试智能贴合段落边界,如果找不到段落边界,则硬截取
*/
private StringBuilder extractOverlap(String chunk) {
if (paragraphOverlapChars <= 0 || chunk == null || chunk.isEmpty()) {
return new StringBuilder();
}
int len = chunk.length();
// 如果块本身就很小,小于 overlap 要求,那就全拿
if (len <= paragraphOverlapChars) {
return new StringBuilder(chunk);
}
// 1. 初步截取
int overlapStart = len - paragraphOverlapChars;
String rawOverlap = chunk.substring(overlapStart);
// 2. 尝试寻找最近的段落边界 "\n\n",让 Overlap 从完整的段落开始
// 这里的逻辑是:不要从段落中间截断,尽量从段落头开始
int firstParagraphBreak = rawOverlap.indexOf("\n\n");
if (firstParagraphBreak != -1 && firstParagraphBreak + 2 < rawOverlap.length()) {
// 找到了边界,且边界后还有内容。丢弃边界前的半截段落,保留后面的完整段落
return new StringBuilder(rawOverlap.substring(firstParagraphBreak + 2));
}
// 3. 如果找不到段落边界(说明最后一段很长),那就只能硬截取了,但最好避开句子中间
// 可以在这里加一个寻找句号的逻辑,但为了性能和通用性,直接返回 rawOverlap 也是可接受的
return new StringBuilder(rawOverlap.trim());
}
/**
* 切分过大的段落
*/
private List<String> splitLargeParagraph(String paragraph) {
List<String> subChunks = new ArrayList<>();
// 1. 尝试按句子切分
Matcher matcher = sentencePattern.matcher(paragraph);
StringBuilder currentChunk = new StringBuilder();
int lastMatchEnd = 0;
while (matcher.find()) {
String sentence = matcher.group();
lastMatchEnd = matcher.end();
if (sentence.length() > chunkSize) {
if (currentChunk.length() > 0) {
subChunks.add(currentChunk.toString().trim());
currentChunk = new StringBuilder();
}
subChunks.addAll(splitByChars(sentence));
continue;
}
if (currentChunk.length() + sentence.length() > chunkSize && currentChunk.length() > 0) {
subChunks.add(currentChunk.toString().trim());
currentChunk = new StringBuilder();
}
currentChunk.append(sentence);
}
if (lastMatchEnd < paragraph.length()) {
String remaining = paragraph.substring(lastMatchEnd);
if (!remaining.trim().isEmpty()) {
if (remaining.length() > chunkSize) {
if (currentChunk.length() > 0) {
subChunks.add(currentChunk.toString().trim());
currentChunk = new StringBuilder();
}
subChunks.addAll(splitByChars(remaining));
}
else {
if (currentChunk.length() + remaining.length() > chunkSize && currentChunk.length() > 0) {
subChunks.add(currentChunk.toString().trim());
currentChunk = new StringBuilder();
}
currentChunk.append(remaining);
}
}
}
if (currentChunk.length() > 0) {
subChunks.add(currentChunk.toString().trim());
}
return subChunks;
}
private List<String> splitByChars(String text) {
List<String> chunks = new ArrayList<>();
int start = 0;
while (start < text.length()) {
int end = Math.min(start + chunkSize, text.length());
chunks.add(text.substring(start, end).trim());
start = end;
}
return chunks;
}
}
```
- SemanticTextSplitter
```java
@Slf4j
@Builder
public class SemanticTextSplitter extends TextSplitter {
private final EmbeddingModel embeddingModel;
private final int minChunkSize; // 建议 200
private final int maxChunkSize; // 建议 1000
private final double similarityThreshold;
/**
* Embedding API 每批次最大句子数 阿里 text-embedding-v4 支持 10,OpenAI 支持更多。建议可配置。
*/
@Builder.Default
private int embeddingBatchSize = 10;
/**
* 句子正则:匹配标点或换行
*/
private static final Pattern SENTENCE_PATTERN = Pattern.compile("([^。!?;.!?;\\n]+[。!?;.!?;]?|[^。!?;.!?;\\n]*\\n)");
@Override
public List<String> splitText(String text) {
if (text == null || text.trim().isEmpty()) {
return List.of();
}
// 1. 提取句子
List<String> sentences = extractSentences(text);
if (sentences.isEmpty()) {
return List.of(text);
}
log.debug("Extracted {} sentences", sentences.size());
// 2. 只有一句,直接返回(或者检查长度)
if (sentences.size() == 1) {
return splitLargeChunk(sentences.get(0));
}
// 3. 构建滑动窗口上下文 (Windowed Context)
List<String> contextSentences = buildContextSentences(sentences);
// 4. 计算 Embeddings
List<float[]> embeddings = batchEmbed(contextSentences);
// 5. 核心:基于 语义+长度 双重约束进行合并
return combineSentences(sentences, embeddings);
}
/**
* 核心逻辑:合并句子
*/
private List<String> combineSentences(List<String> sentences, List<float[]> embeddings) {
List<String> chunks = new ArrayList<>();
StringBuilder currentChunk = new StringBuilder();
for (int i = 0; i < sentences.size(); i++) {
String sentence = sentences.get(i);
// 先检查单句是否本身就是超出最大长度
if (sentence.length() > maxChunkSize) {
// 1. 先把当前的结算了
if (!currentChunk.isEmpty()) {
chunks.add(currentChunk.toString().trim());
currentChunk.setLength(0);
}
// 2. 巨无霸单独切分
chunks.addAll(splitLargeChunk(sentence));
// 处理下一句
continue;
}
// --- 决策:是否需要在当前句子之前切一刀? ---
boolean shouldSplit = false;
// 如果 currentChunk 为空,说明是新的一块,不需要 split,直接 add 即可
if (!currentChunk.isEmpty()) {
// 1. 长度检查:加上这句是否超长?
if (currentChunk.length() + sentence.length() > maxChunkSize) {
log.debug("Splitting at index {} due to max size limit", i);
shouldSplit = true;
}
// 2. 语义检查:语义突变?
else if (i < embeddings.size()) {
double similarity = cosineSimilarity(embeddings.get(i - 1), embeddings.get(i));
// 只有当当前块已经达到最小长度时,才允许按语义切分
// 否则即使语义变了,为了保证块不太碎,也强行合并
if (similarity < similarityThreshold && currentChunk.length() >= minChunkSize) {
log.debug("Splitting at index {} due to semantic shift (sim={})", i, similarity);
shouldSplit = true;
}
}
}
// --- 执行动作 ---
if (shouldSplit) {
chunks.add(currentChunk.toString().trim());
currentChunk.setLength(0);
}
// 拼接空格逻辑
if (!currentChunk.isEmpty() && !isChinese(sentence)) {
currentChunk.append(" ");
}
currentChunk.append(sentence);
}
// 处理最后一个块
if (!currentChunk.isEmpty()) {
chunks.add(currentChunk.toString().trim());
}
return chunks;
}
// 简单的中文判断,用于决定拼接时加不加空格
private boolean isChinese(String str) {
return str.codePoints()
.anyMatch(codepoint -> Character.UnicodeScript.of(codepoint) == Character.UnicodeScript.HAN);
}
/**
* 提取句子
*/
private List<String> extractSentences(String text) {
List<String> sentences = new ArrayList<>();
Matcher matcher = SENTENCE_PATTERN.matcher(text);
int lastEnd = 0;
while (matcher.find()) {
String s = matcher.group().trim();
if (!s.isEmpty())
sentences.add(s);
lastEnd = matcher.end();
}
// 兜底:防止正则漏掉最后一段没有标点的文本
if (lastEnd < text.length()) {
String tail = text.substring(lastEnd).trim();
if (!tail.isEmpty()) {
sentences.add(tail);
}
}
return sentences;
}
/**
* 滑动窗口构建上下文 在计算第 i 句的向量时,我们实际送给模型的文本是:[第 i-1 句] + [第 i 句] + [第 i+1 句]。 目的:让 Embedding
* 向量包含上下文信息,算出来的相似度更准。
*/
private List<String> buildContextSentences(List<String> sentences) {
List<String> contextSentences = new ArrayList<>();
for (int i = 0; i < sentences.size(); i++) {
StringBuilder context = new StringBuilder();
if (i > 0)
context.append(sentences.get(i - 1)).append(" ");
context.append(sentences.get(i));
if (i < sentences.size() - 1)
context.append(" ").append(sentences.get(i + 1));
contextSentences.add(context.toString());
}
return contextSentences;
}
/**
* 批量 Embedding (带容错)
*/
private List<float[]> batchEmbed(List<String> texts) {
// 获取向量维度的占位符
int dimensions = embeddingModel.dimensions();
List<float[]> allEmbeddings = new ArrayList<>();
// 这里为了安全,建议 catch 异常时填入 new float[0],计算相似度时做空检查
for (int i = 0; i < texts.size(); i += embeddingBatchSize) {
int endIdx = Math.min(i + embeddingBatchSize, texts.size());
List<String> batch = texts.subList(i, endIdx);
try {
EmbeddingResponse response = embeddingModel.embedForResponse(batch);
// 假设 Spring AI 的 EmbeddingResponse 结构
for (var result : response.getResults()) {
allEmbeddings.add(result.getOutput());
}
}
catch (Exception e) {
log.error("Embedding failed for batch {}-{}", i, endIdx, e);
// 填充零向量
for (int k = 0; k < batch.size(); k++)
allEmbeddings.add(new float[dimensions]);
}
}
return allEmbeddings;
}
private double cosineSimilarity(float[] vec1, float[] vec2) {
if (vec1 == null || vec2 == null || vec1.length != vec2.length)
return 0.0;
double dot = 0.0, norm1 = 0.0, norm2 = 0.0;
for (int i = 0; i < vec1.length; i++) {
dot += vec1[i] * vec2[i];
norm1 += vec1[i] * vec1[i];
norm2 += vec2[i] * vec2[i];
}
if (norm1 == 0 || norm2 == 0)
return 0.0; // 零向量处理
return dot / (Math.sqrt(norm1) * Math.sqrt(norm2));
}
/**
* 保底策略:如果单个句子本身就超长,还是得硬切
*/
private List<String> splitLargeChunk(String text) {
List<String> result = new ArrayList<>();
for (int i = 0; i < text.length(); i += maxChunkSize) {
result.add(text.substring(i, Math.min(i + maxChunkSize, text.length())));
}
return result;
}
}
```
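A configuration sketch for the semantic splitter (threshold and size values are illustrative; the EmbeddingModel is whichever embedding bean your Spring AI setup provides):
```java
import java.util.List;
import org.springframework.ai.document.Document;
import org.springframework.ai.embedding.EmbeddingModel;

public class SemanticSplitDemo {
    public static List<Document> split(EmbeddingModel embeddingModel, List<Document> docs) {
        SemanticTextSplitter splitter = SemanticTextSplitter.builder()
                .embeddingModel(embeddingModel)
                .minChunkSize(200)          // do not cut on a topic shift before this size is reached
                .maxChunkSize(1000)         // hard upper bound per chunk
                .similarityThreshold(0.75)  // below this cosine similarity, treat it as a topic change
                .embeddingBatchSize(10)     // match the provider's per-request limit
                .build();
        return splitter.apply(docs);
    }
}
```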
- SentenceSplitter
```java
@Slf4j
public class SentenceSplitter extends TextSplitter {
/**
* 正则说明: 1. ([^。!?;.!?;\n]+) : 非分隔符内容 2. (?:[。!?;.!?;]|\n+) : 分隔符(标点或换行) 3.
* ["')\)\\]]* : 可能跟随的后引号/括号 4. \\s* : 尾随空白(将被 trim 掉)
*/
private static final Pattern SENTENCE_PATTERN = Pattern
.compile("([^。!?;.!?;\n]+(?:[。!?;.!?;]|\\n+)[\"')\\)\\]]*\\s*)");
private static final int DEFAULT_CHUNK_SIZE = 1000;
private static final int DEFAULT_SENTENCE_OVERLAP = 1;
private final int chunkSize;
private final int sentenceOverlap;
// 私有构造器 & Builder
private SentenceSplitter(Builder builder) {
this.chunkSize = builder.chunkSize > 0 ? builder.chunkSize : DEFAULT_CHUNK_SIZE;
this.sentenceOverlap = builder.sentenceOverlap >= 0 ? builder.sentenceOverlap : DEFAULT_SENTENCE_OVERLAP;
}
public static Builder builder() {
return new Builder();
}
public static class Builder {
private int chunkSize = DEFAULT_CHUNK_SIZE;
private int sentenceOverlap = DEFAULT_SENTENCE_OVERLAP;
public Builder withChunkSize(int chunkSize) {
this.chunkSize = chunkSize;
return this;
}
public Builder withSentenceOverlap(int overlap) {
this.sentenceOverlap = overlap;
return this;
}
public SentenceSplitter build() {
return new SentenceSplitter(this);
}
}
@Override
protected List<String> splitText(String text) {
return extractSentences(text);
}
@Override
public List<Document> apply(List<Document> documents) {
if (CollectionUtils.isEmpty(documents))
return new ArrayList<>();
List<Document> result = new ArrayList<>();
for (Document doc : documents) {
if (StringUtils.hasText(doc.getText())) {
result.addAll(splitDocument(doc));
}
}
log.info("Split {} documents into {} chunks", documents.size(), result.size());
return result;
}
private List<Document> splitDocument(Document document) {
List<String> sentences = extractSentences(document.getText());
if (sentences.isEmpty())
return List.of(document);
List<Document> result = new ArrayList<>();
List<String> currentChunk = new ArrayList<>();
int currentSize = 0;
for (String sentence : sentences) {
int sentenceLen = sentence.length();
// 1. 处理巨型句子 (单句本身超长)
if (sentenceLen > this.chunkSize) {
if (!currentChunk.isEmpty()) {
flushChunk(result, currentChunk, document);
// 巨型句子打断了上下文,Overlap 意义不大,直接清空
currentChunk.clear();
currentSize = 0;
}
// 切分巨型句子
List<String> parts = splitLongSentence(sentence);
for (String part : parts) {
// 检查 part 是否能放入 (通常 splitLongSentence 保证了 part <= chunkSize)
if (currentSize + part.length() > this.chunkSize && !currentChunk.isEmpty()) {
flushChunk(result, currentChunk, document);
handleOverlap(currentChunk);
currentSize = calculateSize(currentChunk);
}
currentChunk.add(part);
currentSize += part.length();
}
continue;
}
// 2. 普通句子处理
if (currentSize + sentenceLen > this.chunkSize && !currentChunk.isEmpty()) {
flushChunk(result, currentChunk, document);
handleOverlap(currentChunk);
currentSize = calculateSize(currentChunk);
}
currentChunk.add(sentence);
currentSize += sentenceLen;
}
// 处理剩余
if (!currentChunk.isEmpty()) {
flushChunk(result, currentChunk, document);
}
return result;
}
/**
* 核心方法:将当前 Buffer 生成 Document,并添加到结果集
*/
private void flushChunk(List<Document> result, List<String> chunkSentences, Document originalDoc) {
StringBuilder content = new StringBuilder();
for (int i = 0; i < chunkSentences.size(); i++) {
String s = chunkSentences.get(i);
// 智能拼接:如果前一句不是中文,且当前句不是中文,才加空格
// 或者简单点:只要不是中文就加空格。这里用严谨判断。
if (i > 0 && !isChinese(s)) {
content.append(" ");
}
content.append(s);
}
String text = content.toString();
Document chunkDoc = new Document(text);
if (originalDoc.getMetadata() != null) {
chunkDoc.getMetadata().putAll(originalDoc.getMetadata());
}
chunkDoc.getMetadata().put("chunk_index", result.size());
chunkDoc.getMetadata().put("chunk_size", text.length());
chunkDoc.getMetadata().put("splitter_type", "sentence");
result.add(chunkDoc);
}
/**
* 处理 Overlap:保留 list 的最后 N 个元素,修改 list 本身
*/
private void handleOverlap(List<String> currentChunk) {
if (this.sentenceOverlap > 0 && currentChunk.size() > this.sentenceOverlap) {
List<String> overlap = new ArrayList<>(
currentChunk.subList(currentChunk.size() - this.sentenceOverlap, currentChunk.size()));
currentChunk.clear();
currentChunk.addAll(overlap);
}
else {
currentChunk.clear();
}
}
private int calculateSize(List<String> chunk) {
return chunk.stream().mapToInt(String::length).sum();
}
private List<String> extractSentences(String text) {
List<String> sentences = new ArrayList<>();
Matcher matcher = SENTENCE_PATTERN.matcher(text);
int lastEnd = 0;
while (matcher.find()) {
String sentence = matcher.group(1).trim();
if (StringUtils.hasText(sentence)) {
sentences.add(sentence);
}
lastEnd = matcher.end();
}
if (lastEnd < text.length()) {
String remaining = text.substring(lastEnd).trim();
if (StringUtils.hasText(remaining)) {
// 如果剩余部分本身就超大,也需要切
if (remaining.length() > this.chunkSize) {
sentences.addAll(splitLongSentence(remaining));
}
else {
sentences.add(remaining);
}
}
}
return sentences;
}
/**
* 分割超长句子
*/
private List<String> splitLongSentence(String sentence) {
List<String> result = new ArrayList<>();
int i = 0;
int len = sentence.length();
while (i < len) {
int end = Math.min(i + this.chunkSize, len);
// 优化:只有当截断点在字符串中间,且是 ASCII 字符时才尝试回溯
if (end < len && isAsciiLetter(sentence.charAt(end))) {
int adjustedEnd = end;
// 限制回溯范围 (例如最大回溯 50 个字符),防止极端情况退回太多
int minEnd = Math.max(i, end - 50);
while (adjustedEnd > minEnd && isAsciiLetter(sentence.charAt(adjustedEnd))) {
adjustedEnd--;
}
// 只有找到了合适的分割点(非字母数字)才更新 end
if (adjustedEnd > minEnd && adjustedEnd < end) {
end = adjustedEnd;
}
}
result.add(sentence.substring(i, end));
// 下一次的起点,必须是这一次的终点
i = end;
}
return result;
}
// 判断是否为 ASCII 字母或数字 (用于英文单词边界判断)
private boolean isAsciiLetter(char c) {
return (c >= 'a' && c <= 'z') || (c >= 'A' && c <= 'Z') || (c >= '0' && c <= '9');
}
// 判断是否包含汉字 (用于拼接空格判断)
private boolean isChinese(String str) {
if (str == null || str.isEmpty())
return false;
// 简单判断首字符是否为汉字即可满足大部分拼接场景
int codePoint = str.codePointAt(0);
return Character.UnicodeScript.of(codePoint) == Character.UnicodeScript.HAN;
}
}
```
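A usage sketch for the FAQ-style scenario described in the table above (values are illustrative):
```java
import java.util.List;
import org.springframework.ai.document.Document;

public class SentenceSplitDemo {
    public static List<Document> split(List<Document> faqDocuments) {
        SentenceSplitter splitter = SentenceSplitter.builder()
                .withChunkSize(500)      // target chunk size in characters
                .withSentenceOverlap(1)  // carry the last sentence over into the next chunk
                .build();
        return splitter.apply(faqDocuments);
    }
}
```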
L - Load
- Loading writes the processed data into the target store: the transformed List<Document> and the corresponding vectors go into a vector database (such as Milvus, Elasticsearch or Pinecone).
- Before loading, always attach the surrounding business context as metadata (a minimal write example follows the helper below).
```java
public static List<Document> convertAgentKnowledgeDocumentsWithMetadata(List<Document> documents,
AgentKnowledge knowledge) {
List<Document> documentsWithMetadata = new ArrayList<>();
for (Document doc : documents) {
// 创建元数据
Map<String, Object> metadata = new HashMap<>(doc.getMetadata());
metadata.put(Constant.AGENT_ID, knowledge.getAgentId().toString());
metadata.put(DocumentMetadataConstant.DB_AGENT_KNOWLEDGE_ID, knowledge.getId());
metadata.put(DocumentMetadataConstant.VECTOR_TYPE, DocumentMetadataConstant.AGENT_KNOWLEDGE);
metadata.put(DocumentMetadataConstant.CONCRETE_AGENT_KNOWLEDGE_TYPE, knowledge.getType().getCode());
// 创建带有元数据的新文档
Document docWithMetadata = new Document(doc.getId(), doc.getText(), metadata);
documentsWithMetadata.add(docWithMetadata);
}
return documentsWithMetadata;
}
```
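And the load step itself: a minimal sketch assuming a Spring AI VectorStore bean (Milvus, Elasticsearch, PGVector and others implement the same interface). The store embeds the documents with its configured EmbeddingModel when they are added, so all that remains is attaching the metadata and writing:
```java
import java.util.List;
import org.springframework.ai.document.Document;
import org.springframework.ai.vectorstore.VectorStore;

public class KnowledgeLoader {
    private final VectorStore vectorStore; // e.g. a MilvusVectorStore or ElasticsearchVectorStore bean

    public KnowledgeLoader(VectorStore vectorStore) {
        this.vectorStore = vectorStore;
    }

    public void load(List<Document> chunks, AgentKnowledge knowledge) {
        // Attach the business context as metadata (static import of the helper above is assumed), then persist
        List<Document> enriched = convertAgentKnowledgeDocumentsWithMetadata(chunks, knowledge);
        vectorStore.add(enriched);
    }
}
```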