SpringAI Rag 文件读取

SpringAI Rag 文件读取

  • [1 依赖](#1-依赖)
  • [2 配置](#2-配置)
  • [3 代码](#3-代码)
  • [4 中文拆分](#4-中文拆分)

1 依赖

xml
<?xml version="1.0" encoding="UTF-8"?>
<project xmlns="http://maven.apache.org/POM/4.0.0" xmlns:xsi="http://www.w3.org/2001/XMLSchema-instance"
	xsi:schemaLocation="http://maven.apache.org/POM/4.0.0 https://maven.apache.org/xsd/maven-4.0.0.xsd">
	<modelVersion>4.0.0</modelVersion>

	<parent>
		<groupId>org.springframework.boot</groupId>
		<artifactId>spring-boot-starter-parent</artifactId>
		<version>3.5.9</version>
		<relativePath/> <!-- lookup parent from repository -->
	</parent>

	<groupId>com.xu</groupId>
	<artifactId>spring-openai-onnx</artifactId>
	<version>0.0.1-SNAPSHOT</version>
	<name>spring-openai-onnx</name>
	<description>Demo project for Spring Boot</description>

	<properties>
		<java.version>25</java.version>
		<spring-ai.version>1.1.2</spring-ai.version>
	</properties>

	<dependencies>

		<!-- Spring Boot web layer (serves the front-end HTTP requests) -->
		<dependency>
			<groupId>org.springframework.boot</groupId>
			<artifactId>spring-boot-starter-web</artifactId>
		</dependency>

		<!-- Redis vector store used as the RAG document store -->
		<dependency>
			<groupId>org.springframework.ai</groupId>
			<artifactId>spring-ai-starter-vector-store-redis</artifactId>
		</dependency>

		<!-- Apache Tika reader: parses PDF/Word/Excel documents for RAG ingestion -->
		<dependency>
			<groupId>org.springframework.ai</groupId>
			<artifactId>spring-ai-tika-document-reader</artifactId>
		</dependency>

		<!-- hutool general-purpose utility library -->
		<dependency>
			<groupId>cn.hutool</groupId>
			<artifactId>hutool-all</artifactId>
			<version>5.8.42</version>
		</dependency>

		<!-- devtools: automatic restart during development -->
		<dependency>
			<groupId>org.springframework.boot</groupId>
			<artifactId>spring-boot-devtools</artifactId>
			<scope>runtime</scope>
			<optional>true</optional>
		</dependency>

		<!-- lombok: compile-time boilerplate generation -->
		<dependency>
			<groupId>org.projectlombok</groupId>
			<artifactId>lombok</artifactId>
			<optional>true</optional>
		</dependency>

		<!-- test support (JUnit, Spring Test) -->
		<dependency>
			<groupId>org.springframework.boot</groupId>
			<artifactId>spring-boot-starter-test</artifactId>
			<scope>test</scope>
		</dependency>

	</dependencies>

	<!-- Spring AI BOM pins the versions of all spring-ai artifacts above -->
	<dependencyManagement>
		<dependencies>
			<dependency>
				<groupId>org.springframework.ai</groupId>
				<artifactId>spring-ai-bom</artifactId>
				<version>${spring-ai.version}</version>
				<type>pom</type>
				<scope>import</scope>
			</dependency>
		</dependencies>
	</dependencyManagement>

	<build>
		<plugins>
			<plugin>
				<groupId>org.apache.maven.plugins</groupId>
				<artifactId>maven-compiler-plugin</artifactId>
				<configuration>
					<annotationProcessorPaths>
						<path>
							<groupId>org.projectlombok</groupId>
							<artifactId>lombok</artifactId>
						</path>
					</annotationProcessorPaths>
				</configuration>
			</plugin>
			<plugin>
				<groupId>org.springframework.boot</groupId>
				<artifactId>spring-boot-maven-plugin</artifactId>
				<configuration>
					<!-- lombok is compile-time only; keep it out of the runnable jar -->
					<excludes>
						<exclude>
							<groupId>org.projectlombok</groupId>
							<artifactId>lombok</artifactId>
						</exclude>
					</excludes>
				</configuration>
			</plugin>
		</plugins>
	</build>

</project>

2 配置

yml
spring:
  data:
    redis:
      host: 127.0.0.1
      port: 6379
      password:
  application:
    name: spring-openai-onnx
  ai:
    vectorstore:
      redis:
        initialize-schema: true # create the required index schema on startup
        index-name: zhaoxin-rag # name of the Redis index that stores the vectors
        prefix: 'zhaoxin:rag:file:info' # prefix for the Redis keys of stored documents

3 代码

java
package com.xu;

import org.springframework.ai.document.Document;
import org.springframework.ai.reader.TextReader;
import org.springframework.ai.reader.tika.TikaDocumentReader;
import org.springframework.ai.transformer.splitter.TextSplitter;
import org.springframework.ai.vectorstore.redis.RedisVectorStore;
import org.springframework.beans.factory.annotation.Autowired;
import org.springframework.boot.test.context.SpringBootTest;
import org.springframework.core.io.FileSystemResource;
import org.springframework.core.io.Resource;

import org.junit.jupiter.api.Test;

import com.xu.conf.ZhaoXinTextSplitter;

import java.util.List;

@SpringBootTest
class RagTest {

    @Autowired
    private RedisVectorStore redisVectorStore;

    /**
     * Ingests a binary document (PDF here; TikaDocumentReader handles
     * .docx / .xlsx resources the same way) into the Redis vector store.
     */
    @Test
    void file() {
        Resource pdf = new FileSystemResource("D:\\SourceCode\\简历\\个人简历.pdf");
        ingest(new TikaDocumentReader(pdf).get());
    }

    /**
     * Ingests a plain-text file into the Redis vector store.
     */
    @Test
    void text() {
        Resource resource = new FileSystemResource("D:\\SourceCode\\简历\\a.txt");
        ingest(new TextReader(resource).get());
    }

    /**
     * Shared ingestion pipeline: split the raw documents on Chinese
     * semantic boundaries, then persist the chunks to the vector store.
     *
     * @param origin documents produced by a reader (Tika or plain text)
     */
    private void ingest(List<Document> origin) {
        TextSplitter splitter = new ZhaoXinTextSplitter();
        List<Document> documents = splitter.split(origin);
        redisVectorStore.add(documents);
    }

}

4 中文拆分

java
package com.xu.conf;

import org.springframework.ai.transformer.splitter.TextSplitter;

import java.util.ArrayList;
import java.util.List;
import java.util.regex.Matcher;
import java.util.regex.Pattern;
import java.util.stream.Collectors;

/**
 * 优化后的中文专属文本拆分器
 * 适配中文语义边界,避免拆分后语义断裂
 *
 * @author hyacinth
 */
/*
 * NOTE(review): the separator pattern contains "\n\n" and "\n" branches, but
 * preprocessText() collapses all whitespace (including newlines) to single
 * spaces before splitting, so those branches can never match. Kept for
 * backward compatibility; confirm whether paragraph-level splitting is wanted.
 */
public class ZhaoXinTextSplitter extends TextSplitter {

    // Chinese semantic boundaries, ordered strong → weak: sentence enders
    // first (。?!;:), then paragraph/line breaks, then the comma.
    private static final Pattern CHINESE_SEPARATOR_PATTERN = Pattern.compile(
            "(。|?|!|;|:|\\n\\n|\\n|,)",
            Pattern.UNICODE_CASE | Pattern.MULTILINE
    );

    // Collapses runs of whitespace (space, tab, newline) to a single space.
    private static final Pattern BLANK_PATTERN = Pattern.compile("\\s+");

    // Matches chunks that are nothing but punctuation/whitespace.
    // Fixed: the original class wrote [。|?|...] — the '|' characters were
    // literal members of the class (alternation has no meaning inside []),
    // unintentionally classifying "|" as punctuation.
    private static final Pattern PUNCTUATION_ONLY_PATTERN =
            Pattern.compile("[。?!;:,、\\s]+");

    /** Minimum chunk length in characters; shorter fragments are merged. */
    private static final int MIN_CHUNK_LENGTH = 100;

    /**
     * Splits raw text into semantically complete Chinese chunks:
     * preprocess → split on punctuation boundaries → filter and merge.
     *
     * @param text raw document text (may be null or blank)
     * @return non-null list of cleaned chunks
     */
    @Override
    protected List<String> splitText(String text) {
        String processedText = preprocessText(text);
        List<String> initialSplits = splitByChineseSemantic(processedText);
        return filterAndOptimizeChunks(initialSplits);
    }

    /**
     * Normalizes whitespace: maps the ideographic space to ASCII space,
     * collapses whitespace runs, trims the ends.
     */
    private String preprocessText(String text) {
        if (text == null || text.isBlank()) {
            return "";
        }
        // U+3000 (full-width ideographic space) is not matched by \s in Java
        // regex by default, so normalize it explicitly first.
        String result = text.replace('\u3000', ' ');
        result = BLANK_PATTERN.matcher(result).replaceAll(" ");
        return result.trim();
    }

    /**
     * Splits on Chinese punctuation while keeping each separator attached to
     * the chunk it terminates (punctuation carries sentence semantics).
     *
     * Rewritten as a single matcher walk: the previous split()+matcher pairing
     * silently dropped a separator whenever a split part was blank (e.g. text
     * starting with punctuation).
     */
    private List<String> splitByChineseSemantic(String text) {
        List<String> chunks = new ArrayList<>();
        if (text.isBlank()) {
            return chunks;
        }

        Matcher matcher = CHINESE_SEPARATOR_PATTERN.matcher(text);
        int start = 0;
        while (matcher.find()) {
            // Content up to and including its trailing separator.
            String chunk = text.substring(start, matcher.end()).trim();
            if (!chunk.isBlank()) {
                chunks.add(chunk);
            }
            start = matcher.end();
        }
        // Trailing text after the last separator (unterminated sentence).
        if (start < text.length()) {
            String tail = text.substring(start).trim();
            if (!tail.isBlank()) {
                chunks.add(tail);
            }
        }
        return chunks;
    }

    /**
     * Drops empty / punctuation-only fragments, then merges short fragments
     * so every chunk is a usable retrieval unit.
     */
    private List<String> filterAndOptimizeChunks(List<String> initialSplits) {
        List<String> validChunks = initialSplits.stream()
                .filter(chunk -> chunk != null && !chunk.isBlank() && !isOnlyPunctuation(chunk))
                .collect(Collectors.toList());
        return mergeShortChunks(validChunks, MIN_CHUNK_LENGTH);
    }

    /** True when the chunk carries no content beyond punctuation/whitespace. */
    private boolean isOnlyPunctuation(String chunk) {
        return PUNCTUATION_ONLY_PATTERN.matcher(chunk).matches();
    }

    /**
     * Accumulates fragments until the minimum length is reached, then emits
     * the accumulated chunk.
     *
     * Bug fix: the original flushed the still-undersized accumulator and
     * emitted the incoming fragment separately whenever their combined length
     * reached the minimum — so merging never actually produced chunks of the
     * requested size. Append-then-flush guarantees every emitted chunk
     * (except the final remainder) is at least {@code minChunkLength} long.
     *
     * @param validChunks    filtered fragments, in document order
     * @param minChunkLength minimum characters per emitted chunk
     * @return merged chunk list
     */
    private List<String> mergeShortChunks(List<String> validChunks, int minChunkLength) {
        List<String> mergedChunks = new ArrayList<>();
        StringBuilder currentChunk = new StringBuilder();

        for (String chunk : validChunks) {
            currentChunk.append(chunk);
            if (currentChunk.length() >= minChunkLength) {
                mergedChunks.add(currentChunk.toString().trim());
                currentChunk.setLength(0);
            }
        }

        // Emit whatever is left, even if below the minimum (last remainder).
        if (!currentChunk.isEmpty()) {
            mergedChunks.add(currentChunk.toString().trim());
        }

        return mergedChunks;
    }

    // The original also overrode split(List<Document>) with a pure
    // super.split(documents) delegation; that override was a no-op and has
    // been removed — the inherited implementation is identical.
}
相关推荐
二进制_博客1 天前
SpringAI智能助手案例
大模型·springai
Haooog1 天前
RAG医疗问答系统
java·大模型·项目·rag
Haooog1 天前
Spring AI 与 LangChain4j 对比
人工智能·大模型·springai·langchain4j
一个无名的炼丹师2 天前
GraphRAG深度解析:从原理到实战,重塑RAG检索增强生成的未来
人工智能·python·rag
FranzLiszt18472 天前
基于One API 将本地 Ollama 模型接入 FastGPT
langchain·fastgpt·rag·ollama·one api
石去皿2 天前
从本地知识库到“活”知识——RAG 落地全景指南
c++·python·大模型·rag
小小工匠2 天前
LLM - 从通用对话到自治智能体:Agent / Skills / MCP / RAG 三层架构实战
agent·rag·skill·mcp
沛沛老爹3 天前
Web开发者5分钟上手:Agent Skills环境搭建与基础使用实战
java·人工智能·llm·llama·rag·agent skills
沛沛老爹3 天前
深入理解Agent Skills——AI助手的“专业工具箱“实战入门
java·人工智能·交互·rag·企业开发·web转型ai