Scenario
Build an internal enterprise document Q&A system: employees upload PDF documents, the system extracts the text, generates vector embeddings and stores them in Qdrant, and employees can then ask questions in natural language to get answers grounded in the document content.
Step 1: Project dependencies
First, add the required dependencies to pom.xml:
```xml
<dependencies>
    <dependency>
        <groupId>org.springframework.boot</groupId>
        <artifactId>spring-boot-starter-web</artifactId>
    </dependency>
    <!-- Spring AI Core -->
    <dependency>
        <groupId>org.springframework.ai</groupId>
        <artifactId>spring-ai-core</artifactId>
        <version>0.8.1</version>
    </dependency>
    <!-- Spring AI OpenAI -->
    <dependency>
        <groupId>org.springframework.ai</groupId>
        <artifactId>spring-ai-openai-spring-boot-starter</artifactId>
        <version>0.8.1</version>
    </dependency>
    <!-- Qdrant client -->
    <dependency>
        <groupId>io.qdrant</groupId>
        <artifactId>client</artifactId>
        <version>1.7.0</version>
    </dependency>
    <!-- PDF processing -->
    <dependency>
        <groupId>org.apache.pdfbox</groupId>
        <artifactId>pdfbox</artifactId>
        <version>2.0.29</version>
    </dependency>
    <!-- File uploads (multipart) are already covered by spring-boot-starter-web above -->
</dependencies>
```
Step 2: Configuration
Add the following to application.yml:
```yaml
spring:
  ai:
    openai:
      api-key: ${OPENAI_API_KEY}
      embedding:
        options:
          model: text-embedding-ada-002   # Spring AI 0.8.x expects the model under "options"
      chat:
        options:
          model: gpt-3.5-turbo
  # File upload limits
  servlet:
    multipart:
      max-file-size: 10MB
      max-request-size: 10MB

qdrant:
  host: localhost
  port: 6334   # gRPC port used by the Java client (the REST API listens on 6333)
  collection-name: documents

server:
  port: 8080
```
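The custom qdrant.* keys above are read with @Value in the configuration class of step 3. If you prefer type-safe binding, a small @ConfigurationProperties holder is an alternative; a minimal sketch (the class name is illustrative and not used elsewhere in this article, and it would need @ConfigurationPropertiesScan or @EnableConfigurationProperties to be registered):
```java
// Sketch: type-safe alternative to the @Value injections used in QdrantConfig below.
@ConfigurationProperties(prefix = "qdrant")
public record QdrantProperties(String host, int port, String collectionName) {
}
```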
Step 3: Qdrant configuration class
```java
@Configuration
public class QdrantConfig {

    @Value("${qdrant.host}")
    private String host;

    @Value("${qdrant.port}")
    private int port;

    @Bean
    public QdrantClient qdrantClient() {
        // false = plain gRPC without TLS, suitable for a local Qdrant instance
        return new QdrantClient(
                QdrantGrpcClient.newBuilder(host, port, false).build()
        );
    }
}
```
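If the Qdrant instance is remote (for example a managed Qdrant Cloud cluster) rather than a local container, the same builder can be used with TLS and an API key. A minimal sketch, assuming the `withApiKey` builder method available in recent io.qdrant client versions and a hypothetical `qdrant.api-key` property that is not part of the configuration above:
```java
// Sketch: TLS + API key connection, e.g. for a managed Qdrant instance.
// withApiKey(...) is assumed from recent io.qdrant client versions;
// the qdrant.api-key property is hypothetical.
@Bean
public QdrantClient secureQdrantClient(@Value("${qdrant.api-key}") String apiKey) {
    return new QdrantClient(
            QdrantGrpcClient.newBuilder(host, port, true)   // true = use TLS
                    .withApiKey(apiKey)
                    .build()
    );
}
```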
Step 4: Document entity classes
```java
@Data
@AllArgsConstructor
@NoArgsConstructor
public class DocumentChunk {
    private String id;
    private String content;
    private String fileName;
    private int chunkIndex;
    private Map<String, Object> metadata;
    private List<Float> embedding;
}

@Data
@AllArgsConstructor
@NoArgsConstructor
public class QueryRequest {
    private String question;
    private int topK = 5;
}

@Data
@AllArgsConstructor
@NoArgsConstructor
public class QueryResponse {
    private String answer;
    private List<DocumentChunk> relevantChunks;
}
```
Step 5: Document processing service
```java
@Service
@Slf4j
public class DocumentService {

    @Autowired
    private QdrantClient qdrantClient;

    @Autowired
    private EmbeddingClient embeddingClient;

    @Value("${qdrant.collection-name}")
    private String collectionName;

    private static final int CHUNK_SIZE = 1000;
    private static final int CHUNK_OVERLAP = 200;

    @PostConstruct
    public void initializeCollection() {
        try {
            // Check whether the collection already exists
            List<String> collections = qdrantClient.listCollectionsAsync().get();
            if (!collections.contains(collectionName)) {
                // Create the collection; 1536 is the dimensionality of text-embedding-ada-002
                qdrantClient.createCollectionAsync(collectionName,
                        VectorParams.newBuilder()
                                .setSize(1536)
                                .setDistance(Distance.Cosine)
                                .build()
                ).get();
                log.info("Created Qdrant collection: {}", collectionName);
            }
        } catch (Exception e) {
            log.error("Failed to initialize Qdrant collection", e);
            throw new RuntimeException("Failed to initialize Qdrant collection", e);
        }
    }

    public void processDocument(MultipartFile file) throws Exception {
        String fileName = file.getOriginalFilename();
        String content = extractTextFromPdf(file);

        // Split the document into chunks
        List<String> chunks = splitTextIntoChunks(content);

        // Generate an embedding for each chunk and store it
        for (int i = 0; i < chunks.size(); i++) {
            String chunk = chunks.get(i);

            // Generate the embedding vector; Spring AI 0.8.x returns List<Double>,
            // while the Qdrant client expects floats
            List<Float> embedding = embeddingClient.embed(chunk).stream()
                    .map(Double::floatValue)
                    .collect(Collectors.toList());

            // Build the vector point
            String pointId = UUID.randomUUID().toString();
            Map<String, Value> payload = new HashMap<>();
            payload.put("content", Value.newBuilder().setStringValue(chunk).build());
            payload.put("fileName", Value.newBuilder().setStringValue(fileName).build());
            payload.put("chunkIndex", Value.newBuilder().setIntegerValue(i).build());

            PointStruct point = PointStruct.newBuilder()
                    .setId(PointId.newBuilder().setUuid(pointId).build())
                    .setVectors(Vectors.newBuilder().setVector(
                            Vector.newBuilder().addAllData(embedding).build()
                    ).build())
                    .putAllPayload(payload)
                    .build();

            // Upsert the point
            qdrantClient.upsertAsync(collectionName, List.of(point)).get();
        }
        log.info("Successfully processed document: {}, created {} chunks", fileName, chunks.size());
    }

    private String extractTextFromPdf(MultipartFile file) throws Exception {
        try (PDDocument document = PDDocument.load(file.getInputStream())) {
            PDFTextStripper stripper = new PDFTextStripper();
            return stripper.getText(document);
        }
    }

    private List<String> splitTextIntoChunks(String text) {
        // Example: with CHUNK_SIZE=1000 and CHUNK_OVERLAP=200, a 2400-character text
        // yields chunks covering roughly [0,1000), [800,1800) and [1600,2400).
        List<String> chunks = new ArrayList<>();
        int start = 0;
        while (start < text.length()) {
            int end = Math.min(start + CHUNK_SIZE, text.length());
            // Try to split on a word boundary
            if (end < text.length()) {
                int lastSpace = text.lastIndexOf(' ', end);
                if (lastSpace > start) {
                    end = lastSpace;
                }
            }
            chunks.add(text.substring(start, end).trim());
            if (end >= text.length()) {
                break; // last chunk reached; stop instead of re-reading the tail forever
            }
            // Step back by CHUNK_OVERLAP to keep context between chunks,
            // but always advance so the loop terminates
            start = Math.max(end - CHUNK_OVERLAP, start + 1);
        }
        return chunks;
    }
}
```
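Upserting one point per chunk costs one Qdrant round trip per chunk. For larger documents the points can be collected and written in a single call. A minimal sketch, assuming the `upsertAsync(String, List<PointStruct>)` overload of the Qdrant Java client and the same fields (`qdrantClient`, `embeddingClient`, `collectionName`) as the service above; the method name is illustrative:
```java
// Sketch: batch all chunks of one document into a single upsert call.
private void upsertChunksInOneBatch(String fileName, List<String> chunks) throws Exception {
    List<PointStruct> points = new ArrayList<>();
    for (int i = 0; i < chunks.size(); i++) {
        // Same Double -> Float conversion as in processDocument above
        List<Float> embedding = embeddingClient.embed(chunks.get(i)).stream()
                .map(Double::floatValue)
                .collect(Collectors.toList());

        Map<String, Value> payload = new HashMap<>();
        payload.put("content", Value.newBuilder().setStringValue(chunks.get(i)).build());
        payload.put("fileName", Value.newBuilder().setStringValue(fileName).build());
        payload.put("chunkIndex", Value.newBuilder().setIntegerValue(i).build());

        points.add(PointStruct.newBuilder()
                .setId(PointId.newBuilder().setUuid(UUID.randomUUID().toString()).build())
                .setVectors(Vectors.newBuilder()
                        .setVector(Vector.newBuilder().addAllData(embedding).build())
                        .build())
                .putAllPayload(payload)
                .build());
    }
    // One round trip for the whole document instead of one per chunk
    qdrantClient.upsertAsync(collectionName, points).get();
}
```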
Step 6: Question answering service
```java
@Service
@Slf4j
public class QuestionAnsweringService {

    @Autowired
    private QdrantClient qdrantClient;

    @Autowired
    private EmbeddingClient embeddingClient;

    @Autowired
    private ChatClient chatClient;

    @Value("${qdrant.collection-name}")
    private String collectionName;

    public QueryResponse answerQuestion(QueryRequest request) throws Exception {
        // 1. Generate an embedding for the question (Double -> Float for the Qdrant client)
        List<Float> queryEmbedding = embeddingClient.embed(request.getQuestion()).stream()
                .map(Double::floatValue)
                .collect(Collectors.toList());

        // 2. Search Qdrant for the most similar document chunks
        SearchPoints searchPoints = SearchPoints.newBuilder()
                .setCollectionName(collectionName)
                .addAllVector(queryEmbedding)
                .setLimit(request.getTopK())
                .setWithPayload(WithPayloadSelector.newBuilder().setEnable(true).build())
                .build();
        List<ScoredPoint> searchResults = qdrantClient.searchAsync(searchPoints).get();

        // 3. Collect the relevant chunks and build the context
        List<DocumentChunk> relevantChunks = new ArrayList<>();
        StringBuilder contextBuilder = new StringBuilder();
        for (ScoredPoint point : searchResults) {
            Map<String, Value> payload = point.getPayloadMap();
            String content = payload.get("content").getStringValue();
            String fileName = payload.get("fileName").getStringValue();
            int chunkIndex = (int) payload.get("chunkIndex").getIntegerValue();

            DocumentChunk chunk = new DocumentChunk();
            chunk.setId(point.getId().getUuid());
            chunk.setContent(content);
            chunk.setFileName(fileName);
            chunk.setChunkIndex(chunkIndex);
            relevantChunks.add(chunk);

            contextBuilder.append(content).append("\n\n");
        }

        // 4. Let the LLM generate the answer
        String prompt = String.format(
                "Answer the question based on the following document content. " +
                "If the documents do not contain the relevant information, say that no answer could be found.\n\n" +
                "Document content:\n%s\n\n" +
                "Question: %s\n\n" +
                "Answer:",
                contextBuilder.toString(),
                request.getQuestion()
        );
        ChatResponse chatResponse = chatClient.call(new Prompt(prompt));
        String answer = chatResponse.getResult().getOutput().getContent();

        return new QueryResponse(answer, relevantChunks);
    }
}
```
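The SearchPoints request also supports a minimum similarity score, which helps keep irrelevant chunks out of the prompt when the collection contains nothing related to the question. A minimal sketch of the same request with a threshold (the 0.7 value is an arbitrary illustration, not a tuned number):
```java
// Sketch: only keep chunks whose cosine similarity exceeds a threshold.
SearchPoints searchPoints = SearchPoints.newBuilder()
        .setCollectionName(collectionName)
        .addAllVector(queryEmbedding)
        .setLimit(request.getTopK())
        .setScoreThreshold(0.7f)   // illustrative cutoff; tune per corpus
        .setWithPayload(WithPayloadSelector.newBuilder().setEnable(true).build())
        .build();
```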
Step 7: REST controller
```java
@RestController
@RequestMapping("/api/documents")
@Slf4j
public class DocumentController {

    @Autowired
    private DocumentService documentService;

    @Autowired
    private QuestionAnsweringService qaService;

    @PostMapping("/upload")
    public ResponseEntity<?> uploadDocument(@RequestParam("file") MultipartFile file) {
        try {
            if (file.isEmpty()) {
                return ResponseEntity.badRequest().body("The file must not be empty");
            }
            String fileName = file.getOriginalFilename();
            if (fileName == null || !fileName.toLowerCase().endsWith(".pdf")) {
                return ResponseEntity.badRequest().body("Only PDF files are supported");
            }
            documentService.processDocument(file);
            return ResponseEntity.ok().body(Map.of(
                    "message", "Document uploaded successfully",
                    "fileName", fileName
            ));
        } catch (Exception e) {
            log.error("Document upload failed", e);
            return ResponseEntity.status(HttpStatus.INTERNAL_SERVER_ERROR)
                    .body(Map.of("error", "Document processing failed: " + e.getMessage()));
        }
    }

    @PostMapping("/query")
    public ResponseEntity<?> queryDocuments(@RequestBody QueryRequest request) {
        try {
            if (request.getQuestion() == null || request.getQuestion().trim().isEmpty()) {
                return ResponseEntity.badRequest().body("The question must not be empty");
            }
            QueryResponse response = qaService.answerQuestion(request);
            return ResponseEntity.ok(response);
        } catch (Exception e) {
            log.error("Query failed", e);
            return ResponseEntity.status(HttpStatus.INTERNAL_SERVER_ERROR)
                    .body(Map.of("error", "Query failed: " + e.getMessage()));
        }
    }
}
```
Step 8: Application entry point
```java
@SpringBootApplication
public class DocumentQAApplication {
    public static void main(String[] args) {
        SpringApplication.run(DocumentQAApplication.class, args);
    }
}
```
Usage examples
Once the application is running, it can be tested as follows:
- Upload a document:
```bash
curl -X POST -F "[email protected]" http://localhost:8080/api/documents/upload
```
- Query the documents:
```bash
curl -X POST -H "Content-Type: application/json" \
  -d '{"question":"What are the main points made in the document?","topK":3}' \
  http://localhost:8080/api/documents/query
```
System architecture
The workflow is: an uploaded document is split into small chunks, each chunk is turned into a vector representation by OpenAI's embedding model and stored in Qdrant. When a user asks a question, the question is also converted into a vector, the most similar document chunks are retrieved from Qdrant, and those chunks are passed as context to the GPT model, which generates the final answer.
This RAG (retrieval-augmented generation) architecture lets the AI answer questions grounded in internal company documents, reduces hallucinations, and makes it possible to cite the specific source documents.
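As a closing illustration of that last point, one way to make source citations explicit is to label each retrieved chunk in the context so the model can refer back to it. A minimal sketch of such a prompt-building step, as a variation on step 6 (the labelling format and method name are just illustrations):
```java
// Sketch: label each retrieved chunk so the model can cite [fileName#chunkIndex] in its answer.
private String buildPromptWithSources(String question, List<DocumentChunk> relevantChunks) {
    StringBuilder contextBuilder = new StringBuilder();
    for (DocumentChunk chunk : relevantChunks) {
        contextBuilder.append("[").append(chunk.getFileName())
                .append("#").append(chunk.getChunkIndex()).append("]\n")
                .append(chunk.getContent())
                .append("\n\n");
    }
    return "Answer the question based on the labelled excerpts below, " +
            "and cite the [fileName#chunkIndex] labels you relied on. " +
            "If the excerpts do not contain the answer, say so.\n\n" +
            "Excerpts:\n" + contextBuilder + "\nQuestion: " + question + "\n\nAnswer:";
}
```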