项目原型
text
🏥 医疗知识图谱智能问答系统
==================================================
[知识图谱统计]
📊 实体总数: 1,245,678 🔗 关系总数: 2,567,890
🔄 最后更新: 2024-01-15 14:30:25
[文档处理面板]
待处理医学文献: 125篇
┌─────────────────────────────────────────────────────┐
│ 最新文献: 《2型糖尿病药物治疗新进展》 │
│ 抽取状态: ✅ 完成 (实体: 23, 关系: 45) │
└─────────────────────────────────────────────────────┘
[智能问答界面]
用户问题: 糖尿病有哪些典型症状和常用药物?
🤖 系统回答:
根据医疗知识图谱,糖尿病相关信息如下:
【典型症状】
• 多饮、多尿、多食
• 体重下降
• 视力模糊
• 疲劳乏力
【常用药物】
• 二甲双胍
• 胰岛素
• 格列美脲
• 西格列汀
【相关检查】
• 血糖检测
• 糖化血红蛋白
• 口服葡萄糖耐量试验
💡 温馨提示: 以上信息仅供参考,具体诊疗请咨询专业医生。
[图谱可视化]
(显示糖尿病节点及其相关症状、药物、检查项目的网络图)
[操作选项]
1. 🔍 查看详细图谱 2. 📚 文献管理 3. ⚙️ 系统配置
4. 📊 统计分析 5. 🔄 更新图谱 6. ❓ 帮助
请输入选择 [1-6]:
配置参数
yml
# config/kg_config.yaml
uie_model:
model_path: "/models/uie-medical/"
schema:
- "疾病"
- "症状"
- "药物"
- "检查项目"
- "治疗方法"
- "基因"
- "蛋白质"
- "科室"
batch_size: 16
max_seq_len: 512
neo4j:
uri: "bolt://localhost:7687"
username: "neo4j"
password: "medical_kg_2024"
database: "medical_knowledge"
encrypted: false
knowledge_graph:
entity_threshold: 0.7
relation_threshold: 0.6
max_entities_per_doc: 50
qa_system:
cache_size: 1000
timeout: 30
max_results: 10
核心代码实现
python
import torch
from paddlenlp import Taskflow
from py2neo import Graph, Node, Relationship
from datetime import datetime
import re
from typing import List, Dict, Tuple
class MedicalKGSystem:
"""
知识图谱系统
包含信息抽取、图谱构建、智能回答
"""
def __init__(self, neo4j_uri: str, username: str, password: str):
#初始化UIE信息抽取模型
self.schema = [
"疾病", "症状", "药物", "检查项目",
"治疗方法", "基因", "蛋白质", "科室"
]
self.uie = Taskflow(
"information_extraction",
schema=self.schema,
model="uie-base",
task_path="/models/uie-medical/",
device_id=0
)
#连接Neo4j数据库
self.graph = Graph(neo4j_uri,auth=(username,password))
#定义医疗实体关系
self.relationships = {
"疾病-症状": "has_symptom",
"疾病-检查项目": "need_check",
"疾病-药物": "treated_by",
"疾病-治疗方法": "treated_with",
"药物-疾病": "treats",
"症状-疾病": "symptom_of",
"疾病-科室": "belongs_to_department"
}
def extract_from_text(self,text:str)->Dict:
"""
从医疗文本中抽取实体和关系
"""
try:
#使用UIE进行实体抽取
extraction_result = self.uie(text)[0]  # Taskflow对单条文本返回长度为1的结果列表,取第一个元素
#后处理抽取结果
processed_entities = self._process_extraction_result(extraction_result)
#基于规则的关系抽取
relations = self._extract_relations(text,processed_entities)
#更新知识图谱
self._update_knowledge_graph(processed_entities,relations)
return {
"entities": processed_entities,
"relations": relations,
"status": "success"
}
except Exception as e:
return {
"entities": [],
"relations": [],
"status": f"error: {str(e)}"
}
def _process_extraction_result(self, extraction_result: Dict)->List[Dict]:
"""处理UIE抽取结果,标准化实体"""
processed_entities = []
for entity_type,entities in extraction_result.items():
for entity in entities:
#实体标准化和去重
normalized_entity = self._normalize_entity(
entity['text'],
entity_type
)
processed_entities.append({
'name': normalized_entity,
'type': entity_type,
'original_text': entity['text'],
'confidence': entity.get('probability', 0.0)
})
return processed_entities
def _normalize_entity(self,entity_text:str,entity_type:str)->str:
"""实体标准化,统一同义词,去除修饰词等"""
#医疗实体标准化规则
normalization_rules = {
"疾病": {
"糖尿病 mellitus": "糖尿病",
"DM": "糖尿病",
"高血压病": "高血压"
},
"药物": {
"阿司匹林片": "阿司匹林",
"ASP": "阿司匹林"
}
}
#应用标准化规则
if entity_type in normalization_rules:
for original, normalized in normalization_rules[entity_type].items():
if original in entity_text:
return normalized
#去除修饰词
modifiers = ["急性", "慢性", "重度", "轻度", "典型"]
cleaned_text = entity_text
for modifier in modifiers:
cleaned_text = cleaned_text.replace(modifier,"")
return cleaned_text.strip()
def _extract_relations(self, text: str, entities: List[Dict]) -> List[Dict]:
"""基于规则和上下文的关系抽取"""
relations = []
#构建实体位置索引
entity_positions = {}
for entity in entities:
start_pos = text.find(entity['original_text'])
if start_pos != -1:
entity_positions[entity['name']] = {
'start': start_pos,
'end': start_pos + len(entity['original_text']),
'type': entity['type']
}
#基于距离和模式的关系抽取
entity_names = [entity['name'] for entity in entities]
for i,entity1 in enumerate(entities):
for j,entity2 in enumerate(entities):
if i != j:
#检查是否存在预定义关系
relation_key = f"{entity1['type']}-{entity2['type']}"  # 与self.relationships的键格式保持一致
if relation_key in self.relationships:
#计算实体在文本中的距离
pos1 = entity_positions.get(entity1['name'], {})
pos2 = entity_positions.get(entity2['name'], {})
if pos1 and pos2:
distance = abs(pos1['start'] - pos2['start'])
#如果实体在文本中距离较近,则认为存在关系
if distance < 100:#100个字符以内
relations.append({
'source': entity1['name'],
'target': entity2['name'],
'relation': self.relationships[relation_key],
'confidence': 0.8,
'source_type': entity1['type'],
'target_type': entity2['type']
})
return relations
def _update_knowledge_graph(self,entities:List[Dict],relations:List[Dict]):
"""将抽取的实体和关系更新到Neo4j知识图谱"""
tx = self.graph.begin()
try:
#创建或更新实体节点
entity_nodes = {}
for entity in entities:
#检查节点是否已经存在
existing_node = self.graph.nodes.match(
entity['type'],
name=entity['name']
).first()
if existing_node:
node = existing_node
#更新节点属性
node['count'] = node.get('count', 0) + 1
node['last_updated'] = datetime.now().isoformat()
tx.push(node)  # 将已有节点的属性更新写回图数据库
else:
#创建新节点
node = Node(
entity['type'],
name=entity['name'],
count=1,
created_time=datetime.now().isoformat(),
last_updated=datetime.now().isoformat()
)
tx.create(node)
entity_nodes[entity['name']] = node
#创建关系
for relation in relations:
source_node = entity_nodes.get(relation['source'])
target_node = entity_nodes.get(relation['target'])
if source_node and target_node:
#检查关系是否已存在
existing_rel = self.graph.relationships.match(
(source_node, target_node),
relation['relation']
).first()
if not existing_rel:
#创建新关系
new_rel = Relationship(
source_node,
relation['relation'],
target_node,
confidence=relation['confidence'],
created_time=datetime.now().isoformat()
)
tx.create(new_rel)
else:
#更新关系权重
existing_rel['weight'] = existing_rel.get('weight', 1) + 1
existing_rel['last_updated'] = datetime.now().isoformat()
tx.push(existing_rel)  # 将关系属性更新写回图数据库
tx.commit()
except Exception as e:
tx.rollback()
raise e
class GraphQASystem:
"""图谱问答系统,将自然语言转换为Cypher查询"""
def __init__(self,graph:Graph):
self.graph = graph
# 问题模式在 _classify_question 中按问题类型定义
def answer_question(self, question: str) -> Dict:
"""回答自然语言问题"""
#解析问题类型
question_type = self._classify_question(question)
#生成Cypher查询
cypher_query = self._generate_cypher(question,question_type)
#执行查询
try:
result = self.graph.run(cypher_query).data()
answer = self._format_answer(result, question_type)
return {
"question": question,
"cypher_query": cypher_query,
"answer": answer,
"raw_result": result,
"status": "success"
}
except Exception as e:
return {
"question": question,
"cypher_query": cypher_query,
"answer": f"查询失败: {str(e)}",
"raw_result": [],
"status": "error"
}
def _classify_question(self, question: str) -> str:
"""分类问题类型"""
question_patterns = {
"symptom_query": [
r".*([糖尿病高血压]).*症状.*",
r".*症状.*([糖尿病高血压]).*"
],
"treatment_query": [
r".*([糖尿病高血压]).*治疗.*",
r".*治疗.*([糖尿病高血压]).*"
],
"drug_query": [
r".*([糖尿病高血压]).*药.*",
r".*药.*治疗.*([糖尿病高血压]).*"
],
"department_query": [
r".*([糖尿病高血压]).*挂.*科.*",
r".*看.*科.*"
]
}
for q_type, patterns in question_patterns.items():
for pattern in patterns:
if re.search(pattern, question):
return q_type
return "general_query"
def _generate_cypher(self,question:str,question_type:str)->str:
"""根据问题类型生成Cypher查询"""
# 提取疾病实体
disease_pattern = r"(糖尿病|高血压|冠心病|哮喘|肺炎)"
disease_match = re.search(disease_pattern, question)
disease = disease_match.group(1) if disease_match else ""
cypher_templates = {
"symptom_query": f"""
MATCH (d:疾病 {{name: '{disease}'}})-[r:has_symptom]->(s:症状)
RETURN d.name as disease, collect(s.name) as symptoms
""",
"treatment_query": f"""
MATCH (d:疾病 {{name: '{disease}'}})-[r:treated_with]->(t:治疗方法)
RETURN d.name as disease, collect(t.name) as treatments
""",
"drug_query": f"""
MATCH (d:疾病 {{name: '{disease}'}})-[r:treated_by]->(m:药物)
RETURN d.name as disease, collect(m.name) as drugs
""",
"department_query": f"""
MATCH (d:疾病 {{name: '{disease}'}})-[r:belongs_to_department]->(dept:科室)
RETURN d.name as disease, dept.name as department
""",
"general_query": """
MATCH (d:疾病)-[r]-(related)
WHERE d.name CONTAINS $disease_name
RETURN d.name as disease, type(r) as relation, collect(related.name) as related_entities
LIMIT 10
"""
}
return cypher_templates.get(question_type, cypher_templates["general_query"])
def _format_answer(self, result: List[Dict], question_type: str) -> str:
"""格式化查询结果为自然语言回答"""
if not result:
return "抱歉,没有找到相关信息。"
answer_templates = {
"symptom_query": lambda r: f"{r['disease']}的常见症状包括:{', '.join(r['symptoms'])}。",
"treatment_query": lambda r: f"{r['disease']}的治疗方法包括:{', '.join(r['treatments'])}。",
"drug_query": lambda r: f"用于治疗{r['disease']}的药物有:{', '.join(r['drugs'])}。",
"department_query": lambda r: f"{r['disease']}应该挂{r['department']}科室。"
}
template = answer_templates.get(question_type)
if template:
return template(result[0])
else:
# 通用回答格式
answers = []
for item in result[:3]: # 最多显示3条结果
answers.append(f"{item['disease']} - {item['relation']} - {item['related_entities']}")
return "相关结果:\n" + "\n".join(answers)
===========================================
Java代码
系统架构
text
信息抽取与图谱问答系统
├── 前端界面 (Vue.js + Element Plus + ECharts)
├── RESTful API (Spring Boot)
├── 业务逻辑层
│ ├── 信息抽取服务
│ ├── 知识图谱构建服务
│ ├── 图谱问答服务
│ └── 图谱分析服务
├── 数据访问层
├── 图数据库 (Neo4j)
├── NLP处理层 (Stanford CoreNLP)
└── 缓存层 (Redis)
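在上述分层架构下,Spring Boot 只需要一个启动类即可把各层装配起来。下面给出一个最小的启动类示意(基础包名沿用后文出现的 com.company.kg,类名 KgQaApplication 为本文为演示所起,并非原项目固定命名):
java
package com.company.kg;

import org.springframework.boot.SpringApplication;
import org.springframework.boot.autoconfigure.SpringBootApplication;

/**
 * 启动类示意:扫描 com.company.kg 下的控制器、服务与数据访问组件
 */
@SpringBootApplication
public class KgQaApplication {

    public static void main(String[] args) {
        SpringApplication.run(KgQaApplication.class, args);
    }
}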
依赖文件
xml
<?xml version="1.0" encoding="UTF-8"?>
<project xmlns="http://maven.apache.org/POM/4.0.0"
xmlns:xsi="http://www.w3.org/2001/XMLSchema-instance"
xsi:schemaLocation="http://maven.apache.org/POM/4.0.0
http://maven.apache.org/xsd/maven-4.0.0.xsd">
<modelVersion>4.0.0</modelVersion>
<groupId>com.company</groupId>
<artifactId>kg-qa-system</artifactId>
<version>1.0.0</version>
<packaging>jar</packaging>
<name>Knowledge Graph QA System</name>
<description>Enterprise-level information extraction and knowledge graph QA system</description>
<parent>
<groupId>org.springframework.boot</groupId>
<artifactId>spring-boot-starter-parent</artifactId>
<version>2.7.0</version>
<relativePath/>
</parent>
<properties>
<java.version>11</java.version>
<neo4j.version>4.4.9</neo4j.version>
<stanford-corenlp.version>4.5.0</stanford-corenlp.version>
</properties>
<dependencies>
<!-- Spring Boot Starters -->
<dependency>
<groupId>org.springframework.boot</groupId>
<artifactId>spring-boot-starter-web</artifactId>
</dependency>
<dependency>
<groupId>org.springframework.boot</groupId>
<artifactId>spring-boot-starter-data-jpa</artifactId>
</dependency>
<dependency>
<groupId>org.springframework.boot</groupId>
<artifactId>spring-boot-starter-data-neo4j</artifactId>
</dependency>
<dependency>
<groupId>org.springframework.boot</groupId>
<artifactId>spring-boot-starter-data-redis</artifactId>
</dependency>
<dependency>
<groupId>org.springframework.boot</groupId>
<artifactId>spring-boot-starter-websocket</artifactId>
</dependency>
<dependency>
<groupId>org.springframework.boot</groupId>
<artifactId>spring-boot-starter-validation</artifactId>
</dependency>
<dependency>
<groupId>org.springframework.boot</groupId>
<artifactId>spring-boot-starter-actuator</artifactId>
</dependency>
<!-- Database -->
<dependency>
<groupId>mysql</groupId>
<artifactId>mysql-connector-java</artifactId>
<scope>runtime</scope>
</dependency>
<!-- Neo4j -->
<dependency>
<groupId>org.neo4j</groupId>
<artifactId>neo4j-ogm-core</artifactId>
<version>${neo4j.version}</version>
</dependency>
<!-- NLP Processing -->
<dependency>
<groupId>edu.stanford.nlp</groupId>
<artifactId>stanford-corenlp</artifactId>
<version>${stanford-corenlp.version}</version>
</dependency>
<dependency>
<groupId>edu.stanford.nlp</groupId>
<artifactId>stanford-corenlp</artifactId>
<version>${stanford-corenlp.version}</version>
<classifier>models</classifier>
</dependency>
<!-- PDF Processing -->
<dependency>
<groupId>org.apache.pdfbox</groupId>
<artifactId>pdfbox</artifactId>
<version>2.0.27</version>
</dependency>
<!-- Word Processing -->
<dependency>
<groupId>org.apache.poi</groupId>
<artifactId>poi</artifactId>
<version>5.2.3</version>
</dependency>
<dependency>
<groupId>org.apache.poi</groupId>
<artifactId>poi-ooxml</artifactId>
<version>5.2.3</version>
</dependency>
<!-- Utilities -->
<dependency>
<groupId>org.apache.commons</groupId>
<artifactId>commons-lang3</artifactId>
</dependency>
<dependency>
<groupId>com.google.guava</groupId>
<artifactId>guava</artifactId>
<version>31.1-jre</version>
</dependency>
<dependency>
<groupId>org.jsoup</groupId>
<artifactId>jsoup</artifactId>
<version>1.15.3</version>
</dependency>
<!-- JSON Processing -->
<dependency>
<groupId>com.fasterxml.jackson.core</groupId>
<artifactId>jackson-databind</artifactId>
</dependency>
<dependency>
<groupId>com.fasterxml.jackson.datatype</groupId>
<artifactId>jackson-datatype-jsr310</artifactId>
</dependency>
<!-- Test -->
<dependency>
<groupId>org.springframework.boot</groupId>
<artifactId>spring-boot-starter-test</artifactId>
<scope>test</scope>
</dependency>
<dependency>
<groupId>org.testcontainers</groupId>
<artifactId>neo4j</artifactId>
<version>1.17.6</version>
<scope>test</scope>
</dependency>
<dependency>
<groupId>org.testcontainers</groupId>
<artifactId>junit-jupiter</artifactId>
<version>1.17.6</version>
<scope>test</scope>
</dependency>
</dependencies>
<build>
<plugins>
<plugin>
<groupId>org.springframework.boot</groupId>
<artifactId>spring-boot-maven-plugin</artifactId>
</plugin>
</plugins>
</build>
</project>
application.yml
yml
server:
port: 8082
servlet:
context-path: /kg-qa
spring:
datasource:
url: jdbc:mysql://localhost:3306/kg_qa_system?useSSL=false&serverTimezone=UTC
username: root
password: password
driver-class-name: com.mysql.cj.jdbc.Driver
jpa:
hibernate:
ddl-auto: update
show-sql: true
properties:
hibernate:
dialect: org.hibernate.dialect.MySQL8Dialect
format_sql: true
data:
neo4j:
uri: bolt://localhost:7687
username: neo4j
password: password
auto-index: update
redis:
host: localhost
port: 6379
password:
database: 0
timeout: 2000ms
lettuce:
pool:
max-active: 8
max-wait: -1ms
max-idle: 8
min-idle: 0
websocket:
allowed-origins: "*"
# Knowledge Graph Configuration
kg:
# Information Extraction Configuration
extraction:
max-text-length: 10000
batch-size: 50
enable-relation-extraction: true
enable-attribute-extraction: true
# Graph Configuration
graph:
auto-indexing: true
batch-insert-size: 1000
cache-enabled: true
# QA Configuration
qa:
max-path-length: 3
timeout: 30000
enable-fallback: true
# NLP Configuration
nlp:
stanford:
annotators: tokenize, ssplit, pos, lemma, ner, parse, depparse, coref
threads: 4
timeout: 30000
# Application Configuration
app:
cache:
extraction-results-ttl: 3600
query-results-ttl: 1800
storage:
upload-path: ./uploads/
processed-path: ./processed/
# Logging
logging:
level:
com.company.kg: DEBUG
org.springframework.web: INFO
org.hibernate: WARN
org.neo4j: WARN
file:
name: logs/kg-qa-system.log
pattern:
file: "%d{yyyy-MM-dd HH:mm:ss} - %logger{36} - %msg%n"
# Management Endpoints
management:
endpoints:
web:
exposure:
include: health,info,metrics,prometheus
endpoint:
health:
show-details: always
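application.yml 中 kg.* 下的自定义配置,可以用 @ConfigurationProperties 绑定为类型安全的配置类,避免在各个服务里零散地写 @Value。下面是一个示意(类名 KgProperties 与默认值为本文假设,字段与上面的配置项一一对应):
java
package com.company.kg.config;

import lombok.Data;
import org.springframework.boot.context.properties.ConfigurationProperties;
import org.springframework.stereotype.Component;

/**
 * 绑定 kg.* 配置的示意类
 */
@Data
@Component
@ConfigurationProperties(prefix = "kg")
public class KgProperties {

    private final Extraction extraction = new Extraction();
    private final Graph graph = new Graph();
    private final Qa qa = new Qa();

    @Data
    public static class Extraction {
        private int maxTextLength = 10000;
        private int batchSize = 50;
        private boolean enableRelationExtraction = true;
        private boolean enableAttributeExtraction = true;
    }

    @Data
    public static class Graph {
        private boolean autoIndexing = true;
        private int batchInsertSize = 1000;
        private boolean cacheEnabled = true;
    }

    @Data
    public static class Qa {
        private int maxPathLength = 3;
        private long timeout = 30000;
        private boolean enableFallback = true;
    }
}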
数据模型层
文本源实体
java
/**
* 文本源实体 - 存储待处理的文本数据源
*/
@Entity
@Table(name = "text_source", indexes = {
@Index(name = "idx_source_type", columnList = "sourceType"),
@Index(name = "idx_source_status", columnList = "status"),
@Index(name = "idx_source_domain", columnList = "domain")
})
@Data
@NoArgsConstructor
@EqualsAndHashCode(onlyExplicitlyIncluded = true)
public class TextSource {
public enum SourceType {
PDF, DOC, DOCX, TXT, HTML, WEB_PAGE, DATABASE, API
}
public enum ProcessingStatus {
PENDING, PROCESSING, EXTRACTED, FAILED, DELETED
}
@Id
@GeneratedValue(strategy = GenerationType.IDENTITY)
@EqualsAndHashCode.Include
private Long id;
@NotBlank(message = "源名称不能为空")
@Column(nullable = false, length = 500)
private String name;
@Enumerated(EnumType.STRING)
@Column(nullable = false, length = 20)
private SourceType sourceType;
@Column(length = 50)
private String fileType;
@Column
private Long fileSize;
@Column(length = 500)
private String filePath;
@Column(columnDefinition = "TEXT")
private String content;
@Column(length = 100)
private String domain = "default";
@Column(length = 100)
private String category;
@Enumerated(EnumType.STRING)
@Column(nullable = false, length = 20)
private ProcessingStatus status = ProcessingStatus.PENDING;
@Column(nullable = false)
private Integer entityCount = 0;
@Column(nullable = false)
private Integer relationCount = 0;
@Column(nullable = false)
private Integer attributeCount = 0;
@Column(length = 1000)
private String processingError;
@Column(length = 500)
private String metadata; // JSON格式的元数据
@OneToMany(mappedBy = "textSource", cascade = CascadeType.ALL, fetch = FetchType.LAZY)
private List<ExtractionResult> extractionResults = new ArrayList<>();
@CreationTimestamp
@Column(updatable = false)
private LocalDateTime createdAt;
@UpdateTimestamp
private LocalDateTime updatedAt;
@Version
private Long version;
public TextSource(String name, SourceType sourceType, String content) {
this.name = name;
this.sourceType = sourceType;
this.content = content;
}
public boolean isProcessed() {
return ProcessingStatus.EXTRACTED.equals(this.status);
}
}
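TextSource 由业务层通过 Spring Data JPA 仓库读写。原文未给出仓库接口,这里补充一个最小示意(接口名与查询方法为本文假设,方法名遵循 Spring Data 的派生查询约定):
java
package com.company.kg.repository;

import com.company.kg.entity.TextSource;
import org.springframework.data.domain.Page;
import org.springframework.data.domain.Pageable;
import org.springframework.data.jpa.repository.JpaRepository;

/**
 * 文本源仓库示意
 */
public interface TextSourceRepository extends JpaRepository<TextSource, Long> {

    // 按处理状态分页查询,用于轮询待处理文献
    Page<TextSource> findByStatus(TextSource.ProcessingStatus status, Pageable pageable);

    // 按领域统计文本源数量
    long countByDomain(String domain);
}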
抽取结果实体
java
/**
* 抽取结果实体 - 存储信息抽取的结果
*/
@Entity
@Table(name = "extraction_result", indexes = {
@Index(name = "idx_result_source", columnList = "textSource_id"),
@Index(name = "idx_result_created", columnList = "createdAt")
})
@Data
@NoArgsConstructor
@EqualsAndHashCode(onlyExplicitlyIncluded = true)
public class ExtractionResult {
@Id
@GeneratedValue(strategy = GenerationType.IDENTITY)
@EqualsAndHashCode.Include
private Long id;
@ManyToOne(fetch = FetchType.LAZY)
@JoinColumn(name = "textSource_id", nullable = false)
private TextSource textSource;
@Column(nullable = false)
private Integer entityCount = 0;
@Column(nullable = false)
private Integer relationCount = 0;
@Column(nullable = false)
private Integer attributeCount = 0;
@Column(precision = 5, scale = 4)
private Double extractionConfidence;
@Column(length = 50)
private String extractionModel;
@Column(columnDefinition = "TEXT")
private String extractedEntities; // JSON格式的实体列表
@Column(columnDefinition = "TEXT")
private String extractedRelations; // JSON格式的关系列表
@Column(columnDefinition = "TEXT")
private String extractedAttributes; // JSON格式的属性列表
@Column(length = 1000)
private String processingLog;
@Column(nullable = false)
private Long processingTimeMs;
@ElementCollection
@CollectionTable(name = "extraction_statistics",
joinColumns = @JoinColumn(name = "extraction_result_id"))
@MapKeyColumn(name = "stat_key")
@Column(name = "stat_value")
private Map<String, String> statistics = new HashMap<>();
@CreationTimestamp
@Column(updatable = false)
private LocalDateTime createdAt;
@Version
private Long version;
public ExtractionResult(TextSource textSource) {
this.textSource = textSource;
}
}
Neo4j图数据库模型
实体节点
java
/**
* 实体节点 - 知识图谱中的实体
*/
@NodeEntity
@Data
@NoArgsConstructor
@EqualsAndHashCode(onlyExplicitlyIncluded = true)
public class EntityNode {
@Id
@GeneratedValue
@EqualsAndHashCode.Include
private Long id;
@Index(unique = true)
@Property(name = "entity_id")
private String entityId;
@Property(name = "name")
private String name;
@Property(name = "type")
private String type;
@Property(name = "description")
private String description;
@Property(name = "domain")
private String domain = "default";
@Property(name = "source")
private String source;
@Property(name = "confidence")
private Double confidence;
@Property(name = "created_at")
private LocalDateTime createdAt;
@Property(name = "updated_at")
private LocalDateTime updatedAt;
@Property(name = "metadata")
private String metadata; // JSON格式的元数据
@Relationship(type = "HAS_ATTRIBUTE", direction = Relationship.OUTGOING)
private Set<AttributeRelation> attributes = new HashSet<>();
@Relationship(type = "RELATED_TO", direction = Relationship.OUTGOING)
private Set<EntityRelation> relations = new HashSet<>();
public EntityNode(String entityId, String name, String type) {
this.entityId = entityId;
this.name = name;
this.type = type;
this.createdAt = LocalDateTime.now();
this.updatedAt = LocalDateTime.now();
}
public EntityNode(String entityId, String name, String type, String domain) {
this(entityId, name, type);
this.domain = domain;
}
/**
* 添加属性
*/
public void addAttribute(String key, String value, Double confidence) {
AttributeRelation attribute = new AttributeRelation(this, key, value, confidence);
this.attributes.add(attribute);
}
/**
* 添加关系
*/
public void addRelation(EntityNode target, String relationType, Double confidence) {
EntityRelation relation = new EntityRelation(this, target, relationType, confidence);
this.relations.add(relation);
}
/**
* 获取实体显示标签
*/
public String getDisplayLabel() {
return String.format("%s (%s)", name, type);
}
}
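下面用一小段示意代码演示 EntityNode 对象模型的用法:构造疾病与药物两个节点、建立 treated_by 关系,并通过 Neo4j OGM 的 Session 持久化。此处为独立演示用法,实际项目中 Session 通常由 Spring 注入;类名与示例数据均为本文假设:
java
package com.company.kg.example;

import com.company.kg.graph.EntityNode;
import org.neo4j.ogm.config.Configuration;
import org.neo4j.ogm.session.Session;
import org.neo4j.ogm.session.SessionFactory;

public class EntityNodeExample {

    public static void main(String[] args) {
        // 连接参数沿用前文配置示例
        Configuration config = new Configuration.Builder()
                .uri("bolt://localhost:7687")
                .credentials("neo4j", "password")
                .build();
        SessionFactory sessionFactory = new SessionFactory(config, "com.company.kg.graph");
        Session session = sessionFactory.openSession();

        EntityNode disease = new EntityNode("ENT_DISEASE_1", "糖尿病", "疾病", "medical");
        EntityNode drug = new EntityNode("ENT_DRUG_1", "二甲双胍", "药物", "medical");
        disease.addRelation(drug, "treated_by", 0.85);

        session.save(disease, 2); // depth=2:连同关系与目标节点一起写入
        sessionFactory.close();
    }
}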
实体关系
java
/**
* 实体关系 - 知识图谱中实体之间的关系
*/
@RelationshipEntity(type = "RELATED_TO")
@Data
@NoArgsConstructor
@EqualsAndHashCode(onlyExplicitlyIncluded = true)
public class EntityRelation {
@Id
@GeneratedValue
@EqualsAndHashCode.Include
private Long id;
@StartNode
private EntityNode startEntity;
@EndNode
private EntityNode endEntity;
@Property(name = "relation_id")
private String relationId;
@Property(name = "type")
private String type;
@Property(name = "description")
private String description;
@Property(name = "confidence")
private Double confidence;
@Property(name = "source")
private String source;
@Property(name = "domain")
private String domain = "default";
@Property(name = "created_at")
private LocalDateTime createdAt;
@Property(name = "metadata")
private String metadata; // JSON格式的元数据
public EntityRelation(EntityNode startEntity, EntityNode endEntity, String type, Double confidence) {
this.startEntity = startEntity;
this.endEntity = endEntity;
this.type = type;
this.confidence = confidence;
this.relationId = generateRelationId(startEntity, endEntity, type);
this.createdAt = LocalDateTime.now();
}
/**
* 生成关系ID
*/
private String generateRelationId(EntityNode start, EntityNode end, String relationType) {
return String.format("%s_%s_%s_%d",
start.getEntityId(), relationType, end.getEntityId(), System.currentTimeMillis());
}
/**
* 获取关系显示标签
*/
public String getDisplayLabel() {
return String.format("%s → %s", type, confidence != null ? String.format("(%.2f)", confidence) : "");
}
}
属性关系
java
/**
* 属性关系 - 实体与属性之间的关系
*/
@RelationshipEntity(type = "HAS_ATTRIBUTE")
@Data
@NoArgsConstructor
@EqualsAndHashCode(onlyExplicitlyIncluded = true)
public class AttributeRelation {
@Id
@GeneratedValue
@EqualsAndHashCode.Include
private Long id;
@StartNode
private EntityNode entity;
@Property(name = "attribute_id")
private String attributeId;
@Property(name = "key")
private String key;
@Property(name = "value")
private String value;
@Property(name = "data_type")
private String dataType;
@Property(name = "confidence")
private Double confidence;
@Property(name = "source")
private String source;
@Property(name = "created_at")
private LocalDateTime createdAt;
@Property(name = "metadata")
private String metadata; // JSON格式的元数据
public AttributeRelation(EntityNode entity, String key, String value, Double confidence) {
this.entity = entity;
this.key = key;
this.value = value;
this.confidence = confidence;
this.attributeId = generateAttributeId(entity, key);
this.createdAt = LocalDateTime.now();
this.dataType = inferDataType(value);
}
/**
* 生成属性ID
*/
private String generateAttributeId(EntityNode entity, String key) {
return String.format("%s_%s_%d", entity.getEntityId(), key, System.currentTimeMillis());
}
/**
* 推断数据类型
*/
private String inferDataType(String value) {
if (value == null) return "STRING";
// 检查是否为数字
if (value.matches("-?\\d+")) return "INTEGER";
if (value.matches("-?\\d+\\.\\d+")) return "FLOAT";
// 检查是否为布尔值
if ("true".equalsIgnoreCase(value) || "false".equalsIgnoreCase(value)) return "BOOLEAN";
// 检查是否为日期
if (value.matches("\\d{4}-\\d{2}-\\d{2}")) return "DATE";
if (value.matches("\\d{4}-\\d{2}-\\d{2} \\d{2}:\\d{2}:\\d{2}")) return "DATETIME";
return "STRING";
}
/**
* 获取属性显示标签
*/
public String getDisplayLabel() {
return String.format("%s: %s", key, value);
}
}
Web控制器层
信息抽取API控制器
java
@RestController
@RequestMapping("/api/v1/extraction")
@Validated
@Slf4j
public class ExtractionController {
@Autowired
private InformationExtractionService extractionService;
@Autowired
private KnowledgeGraphService knowledgeGraphService;
/**
* 从文本中抽取信息
*/
@PostMapping("/extract-text")
public ResponseEntity<ExtractionResponse> extractFromText(@Valid @RequestBody TextExtractionRequest request) {
log.info("Text extraction request - Domain: {}, Length: {}",
request.getDomain(), request.getText().length());
try {
InformationExtractionService.ExtractionData extractionData =
extractionService.extractInformation(request.getText(), request.getDomain());
ExtractionResponse response = new ExtractionResponse();
response.setSuccess(true);
response.setEntityCount(extractionData.getEntities().size());
response.setRelationCount(extractionData.getRelations().size());
response.setAttributeCount(extractionData.getAttributes().size());
response.setExtractionData(extractionData);
response.setMessage("信息抽取完成");
return ResponseEntity.ok(response);
} catch (Exception e) {
log.error("Text extraction failed", e);
return ResponseEntity.badRequest()
.body(ExtractionResponse.error("文本抽取失败: " + e.getMessage()));
}
}
/**
* 上传文件并抽取信息
*/
@PostMapping("/upload-file")
public ResponseEntity<ExtractionResponse> uploadAndExtract(
@RequestParam("file") MultipartFile file,
@RequestParam("domain") String domain,
@RequestParam(value = "category", required = false) String category) {
log.info("File upload and extraction - File: {}, Domain: {}", file.getOriginalFilename(), domain);
try {
// 检查文件类型
String contentType = file.getContentType();
if (!isSupportedFileType(contentType)) {
return ResponseEntity.badRequest()
.body(ExtractionResponse.error("不支持的文件类型: " + contentType));
}
// 读取文件内容
String content = readFileContent(file);
if (content == null || content.trim().isEmpty()) {
return ResponseEntity.badRequest()
.body(ExtractionResponse.error("文件内容为空"));
}
// 创建文本源
TextSource textSource = new TextSource(
file.getOriginalFilename(),
TextSource.SourceType.valueOf(getSourceType(contentType)),
content
);
textSource.setDomain(domain);
textSource.setCategory(category);
textSource.setFileSize(file.getSize());
textSource.setFileType(getFileExtension(file.getOriginalFilename()));
// 执行信息抽取
InformationExtractionService.ExtractionData extractionData =
extractionService.extractInformation(content, domain);
ExtractionResponse response = new ExtractionResponse();
response.setSuccess(true);
response.setEntityCount(extractionData.getEntities().size());
response.setRelationCount(extractionData.getRelations().size());
response.setAttributeCount(extractionData.getAttributes().size());
response.setExtractionData(extractionData);
response.setMessage("文件上传和信息抽取完成");
return ResponseEntity.ok(response);
} catch (Exception e) {
log.error("File upload and extraction failed", e);
return ResponseEntity.badRequest()
.body(ExtractionResponse.error("文件上传和抽取失败: " + e.getMessage()));
}
}
/**
* 批量处理文本源
*/
@PostMapping("/batch-process")
public ResponseEntity<BatchExtractionResponse> batchProcess(@Valid @RequestBody BatchExtractionRequest request) {
log.info("Batch extraction request - Count: {}, Domain: {}",
request.getTexts().size(), request.getDomain());
try {
BatchExtractionResponse response = new BatchExtractionResponse();
response.setTotalCount(request.getTexts().size());
response.setSuccessCount(0);
response.setFailedCount(0);
for (String text : request.getTexts()) {
try {
InformationExtractionService.ExtractionData extractionData =
extractionService.extractInformation(text, request.getDomain());
ExtractionResult result = new ExtractionResult();
result.setEntityCount(extractionData.getEntities().size());
result.setRelationCount(extractionData.getRelations().size());
result.setAttributeCount(extractionData.getAttributes().size());
response.getResults().add(result);
response.setSuccessCount(response.getSuccessCount() + 1);
} catch (Exception e) {
log.error("Batch extraction failed for one text", e);
response.setFailedCount(response.getFailedCount() + 1);
response.getErrors().add("处理失败: " + e.getMessage());
}
}
response.setSuccess(true);
response.setMessage("批量处理完成");
return ResponseEntity.ok(response);
} catch (Exception e) {
log.error("Batch extraction failed", e);
return ResponseEntity.badRequest()
.body(BatchExtractionResponse.error("批量处理失败: " + e.getMessage()));
}
}
/**
* 获取抽取结果统计
*/
@GetMapping("/statistics")
public ResponseEntity<ExtractionStatisticsResponse> getExtractionStatistics(
@RequestParam(value = "domain", required = false) String domain) {
log.info("Getting extraction statistics - Domain: {}", domain);
try {
ExtractionStatisticsResponse response = knowledgeGraphService.getExtractionStatistics(domain);
return ResponseEntity.ok(response);
} catch (Exception e) {
log.error("Failed to get extraction statistics", e);
return ResponseEntity.badRequest()
.body(ExtractionStatisticsResponse.error("获取统计信息失败: " + e.getMessage()));
}
}
// 工具方法
private boolean isSupportedFileType(String contentType) {
return contentType != null && (
contentType.equals("text/plain") ||
contentType.equals("application/pdf") ||
contentType.equals("application/msword") ||
contentType.equals("application/vnd.openxmlformats-officedocument.wordprocessingml.document")
);
}
private String getSourceType(String contentType) {
if (contentType == null) return "TXT";
switch (contentType) {
case "application/pdf": return "PDF";
case "application/msword": return "DOC";
case "application/vnd.openxmlformats-officedocument.wordprocessingml.document": return "DOCX";
default: return "TXT";
}
}
private String getFileExtension(String filename) {
if (filename == null) return "";
int lastDot = filename.lastIndexOf(".");
return lastDot > 0 ? filename.substring(lastDot + 1) : "";
}
private String readFileContent(MultipartFile file) throws IOException {
String contentType = file.getContentType();
if ("text/plain".equals(contentType)) {
return new String(file.getBytes(), StandardCharsets.UTF_8);
} else if ("application/pdf".equals(contentType)) {
// 使用PDFBox读取PDF内容
try (var pdfDocument = org.apache.pdfbox.pdmodel.PDDocument.load(file.getInputStream())) {
var stripper = new org.apache.pdfbox.text.PDFTextStripper();
return stripper.getText(pdfDocument);
}
} else {
// 对于其他格式,返回简单提示
return "文件内容需要特殊处理: " + contentType;
}
}
}
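控制器中引用的 TextExtractionRequest、ExtractionResponse 等 DTO 原文未列出,下面按照控制器里的调用方式补一个最小示意(两个类各自为独立文件,字段为依据用法的推断):
java
// TextExtractionRequest.java(示意)
import lombok.Data;
import javax.validation.constraints.NotBlank;

@Data
public class TextExtractionRequest {

    @NotBlank(message = "待抽取文本不能为空")
    private String text;

    private String domain = "default";
}

// ExtractionResponse.java(示意)
import com.company.kg.service.InformationExtractionService;
import lombok.Data;

@Data
public class ExtractionResponse {

    private boolean success;
    private String message;
    private int entityCount;
    private int relationCount;
    private int attributeCount;
    private InformationExtractionService.ExtractionData extractionData;

    public static ExtractionResponse error(String message) {
        ExtractionResponse resp = new ExtractionResponse();
        resp.setSuccess(false);
        resp.setMessage(message);
        return resp;
    }
}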
知识图谱问答API控制器
java
@RestController
@RequestMapping("/api/v1/qa")
@Validated
@Slf4j
public class QAController {
@Autowired
private KnowledgeGraphQAService qaService;
/**
* 回答自然语言问题
*/
@PostMapping("/answer")
public ResponseEntity<QAResponse> answerQuestion(@Valid @RequestBody QARequest request) {
log.info("QA request - Question: {}, Domain: {}", request.getQuestion(), request.getDomain());
try {
KnowledgeGraphQAService.QAAnswer answer =
qaService.answerQuestion(request.getQuestion(), request.getDomain());
QAResponse response = new QAResponse();
response.setSuccess(answer.isSuccess());
response.setQuestion(answer.getQuestion());
response.setAnswer(answer.getAnswer());
response.setDomain(answer.getDomain());
response.setProcessingTimeMs(answer.getProcessingTimeMs());
response.setResultCount(answer.getResultCount());
response.setFallbackUsed(answer.isFallbackUsed());
if (!answer.isSuccess()) {
response.setErrorMessage(answer.getErrorMessage());
}
return ResponseEntity.ok(response);
} catch (Exception e) {
log.error("QA request failed", e);
return ResponseEntity.badRequest()
.body(QAResponse.error("问答请求失败: " + e.getMessage()));
}
}
/**
* 解析问题(不执行查询)
*/
@PostMapping("/parse-question")
public ResponseEntity<QuestionParseResponse> parseQuestion(@Valid @RequestBody QARequest request) {
log.info("Question parse request - Question: {}", request.getQuestion());
try {
KnowledgeGraphQAService.QueryTemplate queryTemplate =
qaService.parseQuestion(request.getQuestion(), request.getDomain());
QuestionParseResponse response = new QuestionParseResponse();
response.setSuccess(true);
response.setOriginalQuestion(queryTemplate.getOriginalQuestion());
response.setQuestionType(queryTemplate.getQuestionType());
response.setDomain(queryTemplate.getDomain());
response.setEntities(queryTemplate.getEntities());
response.setResolvedEntities(queryTemplate.getResolvedEntities());
response.setParameters(queryTemplate.getParameters());
// 生成Cypher查询预览
String cypherQuery = qaService.generateCypherQuery(queryTemplate);
response.setCypherQuery(cypherQuery);
return ResponseEntity.ok(response);
} catch (Exception e) {
log.error("Question parse failed", e);
return ResponseEntity.badRequest()
.body(QuestionParseResponse.error("问题解析失败: " + e.getMessage()));
}
}
/**
* 执行Cypher查询
*/
@PostMapping("/execute-cypher")
public ResponseEntity<CypherExecutionResponse> executeCypher(@Valid @RequestBody CypherExecutionRequest request) {
log.info("Cypher execution request - Query: {}", request.getQuery());
try {
// 这里应该添加查询验证和限制,防止恶意查询
if (!isSafeQuery(request.getQuery())) {
return ResponseEntity.badRequest()
.body(CypherExecutionResponse.error("查询包含不安全操作"));
}
var results = qaService.executeCypherQuery(request.getQuery());
CypherExecutionResponse response = new CypherExecutionResponse();
response.setSuccess(true);
response.setQuery(request.getQuery());
response.setResults(results);
response.setResultCount(results.size());
return ResponseEntity.ok(response);
} catch (Exception e) {
log.error("Cypher execution failed", e);
return ResponseEntity.badRequest()
.body(CypherExecutionResponse.error("Cypher查询执行失败: " + e.getMessage()));
}
}
/**
* 获取知识图谱统计信息
*/
@GetMapping("/graph-statistics")
public ResponseEntity<GraphStatisticsResponse> getGraphStatistics(
@RequestParam(value = "domain", required = false) String domain) {
log.info("Getting graph statistics - Domain: {}", domain);
try {
GraphStatisticsResponse response = new GraphStatisticsResponse();
response.setSuccess(true);
response.setDomain(domain);
// 获取实体统计
var entityStats = qaService.findEntitiesByName(".*", domain);
response.setTotalEntities(entityStats.size());
// 获取实体类型分布
var typeDistribution = entityStats.stream()
.collect(java.util.stream.Collectors.groupingBy(
e -> e.getType(),
java.util.stream.Collectors.counting()
));
response.setEntityTypeDistribution(typeDistribution);
// 这里可以添加更多统计信息...
return ResponseEntity.ok(response);
} catch (Exception e) {
log.error("Failed to get graph statistics", e);
return ResponseEntity.badRequest()
.body(GraphStatisticsResponse.error("获取图谱统计失败: " + e.getMessage()));
}
}
/**
* 检查查询安全性
*/
private boolean isSafeQuery(String query) {
if (query == null) return false;
// 检查是否包含危险操作
String lowerQuery = query.toLowerCase();
return !lowerQuery.contains("delete") &&
!lowerQuery.contains("remove") &&
!lowerQuery.contains("drop") &&
!lowerQuery.contains("create") &&
!lowerQuery.contains("merge") &&
!lowerQuery.contains("set") &&
lowerQuery.contains("match");
}
}
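同样地,问答接口的请求体 QARequest 可以按控制器的用法补一个最小示意;QAResponse 等响应类与前面 ExtractionResponse 的写法相同,这里从略(字段均为依据用法的推断):
java
// QARequest.java(示意)
import lombok.Data;
import javax.validation.constraints.NotBlank;

@Data
public class QARequest {

    @NotBlank(message = "问题不能为空")
    private String question;

    // 领域标识,用于限定查询的知识子图
    private String domain = "default";
}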
核心业务服务层
信息抽取服务
java
package com.company.kg.service;
import com.company.kg.entity.ExtractionResult;
import com.company.kg.entity.TextSource;
import com.company.kg.graph.EntityNode;
import edu.stanford.nlp.coref.CorefCoreAnnotations;
import edu.stanford.nlp.coref.data.CorefChain;
import edu.stanford.nlp.ling.CoreAnnotations;
import edu.stanford.nlp.ling.CoreLabel;
import edu.stanford.nlp.pipeline.Annotation;
import edu.stanford.nlp.pipeline.StanfordCoreNLP;
import edu.stanford.nlp.semgraph.SemanticGraph;
import edu.stanford.nlp.semgraph.SemanticGraphCoreAnnotations;
import edu.stanford.nlp.trees.Tree;
import edu.stanford.nlp.trees.TreeCoreAnnotations;
import edu.stanford.nlp.util.CoreMap;
import lombok.extern.slf4j.Slf4j;
import org.springframework.beans.factory.annotation.Autowired;
import org.springframework.stereotype.Service;
import java.util.*;
import java.util.concurrent.ConcurrentHashMap;
import java.util.regex.Pattern;
import java.util.stream.Collectors;
/**
* 信息抽取服务 - 负责从文本中抽取实体、关系和属性
*/
@Service
@Slf4j
public class InformationExtractionService {
private final StanfordCoreNLP pipeline;
private final KnowledgeGraphService knowledgeGraphService;
// 实体类型映射
// 注:Map.of 最多支持 10 组键值对,条目较多时使用 Map.ofEntries
private static final Map<String, String> ENTITY_TYPE_MAPPING = Map.ofEntries(
Map.entry("PERSON", "Person"),
Map.entry("ORGANIZATION", "Organization"),
Map.entry("LOCATION", "Location"),
Map.entry("CITY", "Location"),
Map.entry("STATE_OR_PROVINCE", "Location"),
Map.entry("COUNTRY", "Location"),
Map.entry("NATIONALITY", "Nationality"),
Map.entry("MISC", "Miscellaneous"),
Map.entry("DATE", "Date"),
Map.entry("TIME", "Time"),
Map.entry("MONEY", "Money"),
Map.entry("PERCENT", "Percent"),
Map.entry("NUMBER", "Number")
);
// 关系模式(加 UNICODE_CHARACTER_CLASS 标志,使 \w 能匹配中文词)
private static final int PATTERN_FLAGS = Pattern.UNICODE_CHARACTER_CLASS;
private static final Map<Pattern, String> RELATION_PATTERNS = Map.of(
Pattern.compile("(\\w+)是(\\w+)的", PATTERN_FLAGS), "IS_PART_OF",
Pattern.compile("(\\w+)在(\\w+)工作", PATTERN_FLAGS), "WORKS_AT",
Pattern.compile("(\\w+)出生于(\\w+)", PATTERN_FLAGS), "BORN_IN",
Pattern.compile("(\\w+)创建了(\\w+)", PATTERN_FLAGS), "FOUNDED_BY",
Pattern.compile("(\\w+)位于(\\w+)", PATTERN_FLAGS), "LOCATED_IN",
Pattern.compile("(\\w+)属于(\\w+)", PATTERN_FLAGS), "BELONGS_TO"
);
@Autowired
public InformationExtractionService(KnowledgeGraphService knowledgeGraphService) {
this.knowledgeGraphService = knowledgeGraphService;
// 初始化Stanford CoreNLP管道
Properties props = new Properties();
props.setProperty("annotators", "tokenize, ssplit, pos, lemma, ner, parse, depparse, coref");
props.setProperty("coref.algorithm", "statistical");
props.setProperty("threads", "4");
this.pipeline = new StanfordCoreNLP(props);
log.info("Stanford CoreNLP pipeline initialized successfully");
}
/**
* 从文本源抽取信息
*/
public ExtractionResult extractFromTextSource(TextSource textSource) {
long startTime = System.currentTimeMillis();
ExtractionResult result = new ExtractionResult(textSource);
try {
log.info("Starting information extraction for text source: {}", textSource.getName());
String text = textSource.getContent();
if (text == null || text.trim().isEmpty()) {
throw new IllegalArgumentException("文本内容为空");
}
// 执行信息抽取
ExtractionData extractionData = extractInformation(text, textSource.getDomain());
// 构建抽取结果
result.setEntityCount(extractionData.getEntities().size());
result.setRelationCount(extractionData.getRelations().size());
result.setAttributeCount(extractionData.getAttributes().size());
result.setExtractionConfidence(calculateAverageConfidence(extractionData));
result.setExtractionModel("Stanford-CoreNLP-4.5.0");
result.setProcessingTimeMs(System.currentTimeMillis() - startTime);
// 转换为JSON存储
result.setExtractedEntities(convertToJson(extractionData.getEntities()));
result.setExtractedRelations(convertToJson(extractionData.getRelations()));
result.setExtractedAttributes(convertToJson(extractionData.getAttributes()));
// 构建知识图谱
buildKnowledgeGraph(extractionData, textSource.getDomain(), textSource.getName());
log.info("Information extraction completed: {} entities, {} relations, {} attributes",
extractionData.getEntities().size(),
extractionData.getRelations().size(),
extractionData.getAttributes().size());
} catch (Exception e) {
log.error("Information extraction failed for text source: {}", textSource.getName(), e);
result.setProcessingError("抽取失败: " + e.getMessage());
}
return result;
}
/**
* 从文本中抽取信息
*/
public ExtractionData extractInformation(String text, String domain) {
ExtractionData extractionData = new ExtractionData();
try {
// 创建文档注解
Annotation document = new Annotation(text);
// 执行所有注解
pipeline.annotate(document);
// 处理句子
List<CoreMap> sentences = document.get(CoreAnnotations.SentencesAnnotation.class);
for (CoreMap sentence : sentences) {
// 抽取实体
extractEntities(sentence, extractionData, domain);
// 抽取关系
extractRelations(sentence, extractionData, domain);
// 抽取属性
extractAttributes(sentence, extractionData, domain);
}
// 处理共指消解
resolveCoreferences(document, extractionData);
// 后处理:合并重复实体,验证关系等
postProcessExtraction(extractionData);
} catch (Exception e) {
log.error("Information extraction failed", e);
throw new RuntimeException("信息抽取失败: " + e.getMessage());
}
return extractionData;
}
/**
* 抽取实体
*/
private void extractEntities(CoreMap sentence, ExtractionData extractionData, String domain) {
List<CoreLabel> tokens = sentence.get(CoreAnnotations.TokensAnnotation.class);
StringBuilder currentEntity = new StringBuilder();
String currentType = null;
int startPosition = -1;
for (int i = 0; i < tokens.size(); i++) {
CoreLabel token = tokens.get(i);
String word = token.word();
String ner = token.get(CoreAnnotations.NamedEntityTagAnnotation.class);
if (!"O".equals(ner)) {
// 开始新实体或继续当前实体
if (currentEntity.length() == 0) {
startPosition = token.beginPosition();
currentType = normalizeEntityType(ner);
}
currentEntity.append(word);
// 检查下一个token是否属于同一实体
if (i + 1 < tokens.size()) {
CoreLabel nextToken = tokens.get(i + 1);
String nextNer = nextToken.get(CoreAnnotations.NamedEntityTagAnnotation.class);
if (ner.equals(nextNer)) {
currentEntity.append(" ");
continue;
}
}
// 实体结束,添加到结果
if (currentEntity.length() > 0 && currentType != null) {
String entityName = currentEntity.toString().trim();
if (isValidEntity(entityName, currentType)) {
EntityInfo entity = new EntityInfo(
generateEntityId(entityName, currentType),
entityName,
currentType,
domain,
calculateEntityConfidence(entityName, currentType),
startPosition,
token.endPosition()
);
extractionData.addEntity(entity);
}
}
// 重置
currentEntity.setLength(0);
currentType = null;
startPosition = -1;
}
}
}
/**
* 抽取关系
*/
private void extractRelations(CoreMap sentence, ExtractionData extractionData, String domain) {
// 获取依存句法树
SemanticGraph dependencies = sentence.get(SemanticGraphCoreAnnotations.EnhancedDependenciesAnnotation.class);
if (dependencies == null) {
return;
}
// 获取实体列表
List<EntityInfo> sentenceEntities = extractionData.getEntities().stream()
.filter(e -> e.getDomain().equals(domain))
.collect(Collectors.toList());
// 基于依存关系抽取关系
for (EntityInfo entity1 : sentenceEntities) {
for (EntityInfo entity2 : sentenceEntities) {
if (!entity1.equals(entity2)) {
// 检查是否存在依存路径
String relationType = detectRelationByDependencies(dependencies, entity1, entity2);
if (relationType != null) {
RelationInfo relation = new RelationInfo(
generateRelationId(entity1, entity2, relationType),
entity1.getEntityId(),
entity2.getEntityId(),
relationType,
domain,
calculateRelationConfidence(entity1, entity2, relationType)
);
extractionData.addRelation(relation);
}
}
}
}
// 基于模式匹配抽取关系
extractRelationsByPatterns(sentence.toString(), extractionData, domain);
}
/**
* 基于依存关系检测关系
*/
private String detectRelationByDependencies(SemanticGraph dependencies, EntityInfo entity1, EntityInfo entity2) {
// 简化的依存关系分析
// 实际项目中应该使用更复杂的规则
String text = dependencies.toList();
if (text.contains(entity1.getName()) && text.contains(entity2.getName())) {
// 检查常见的依存关系模式
if (text.contains("nsubj") && text.contains("dobj")) {
return "SUBJECT_OBJECT";
} else if (text.contains("prep") && (text.contains("in") || text.contains("at"))) {
return "LOCATED_IN";
} else if (text.contains("appos")) {
return "ALSO_KNOWN_AS";
}
}
return null;
}
/**
* 基于模式匹配抽取关系
*/
private void extractRelationsByPatterns(String sentence, ExtractionData extractionData, String domain) {
for (Map.Entry<Pattern, String> entry : RELATION_PATTERNS.entrySet()) {
java.util.regex.Matcher matcher = entry.getKey().matcher(sentence);
while (matcher.find()) {
String entity1Name = matcher.group(1);
String entity2Name = matcher.group(2);
String relationType = entry.getValue();
// 查找匹配的实体
Optional<EntityInfo> entity1 = findEntityByName(extractionData, entity1Name);
Optional<EntityInfo> entity2 = findEntityByName(extractionData, entity2Name);
if (entity1.isPresent() && entity2.isPresent()) {
RelationInfo relation = new RelationInfo(
generateRelationId(entity1.get(), entity2.get(), relationType),
entity1.get().getEntityId(),
entity2.get().getEntityId(),
relationType,
domain,
0.8 // 模式匹配的置信度
);
extractionData.addRelation(relation);
}
}
}
}
/**
* 抽取属性
*/
private void extractAttributes(CoreMap sentence, ExtractionData extractionData, String domain) {
// 获取句法树
Tree tree = sentence.get(TreeCoreAnnotations.TreeAnnotation.class);
if (tree == null) {
return;
}
// 简化的属性抽取:基于名词短语和动词短语
List<EntityInfo> entities = extractionData.getEntities();
for (EntityInfo entity : entities) {
// 抽取基本属性:类型、长度等
AttributeInfo typeAttr = new AttributeInfo(
generateAttributeId(entity, "type"),
entity.getEntityId(),
"type",
entity.getType(),
"STRING",
domain,
0.9
);
extractionData.addAttribute(typeAttr);
// 基于上下文抽取其他属性
extractContextualAttributes(sentence, entity, extractionData, domain);
}
}
/**
* 抽取上下文属性
*/
private void extractContextualAttributes(CoreMap sentence, EntityInfo entity,
ExtractionData extractionData, String domain) {
String sentenceText = sentence.toString();
int entityPosition = sentenceText.indexOf(entity.getName());
if (entityPosition >= 0) {
// 在实体周围寻找可能的属性
String context = getContextAround(sentenceText, entityPosition, entity.getName().length(), 50);
// 抽取数字属性
extractNumericAttributes(context, entity, extractionData, domain);
// 抽取描述性属性
extractDescriptiveAttributes(context, entity, extractionData, domain);
}
}
/**
* 抽取数字属性
*/
private void extractNumericAttributes(String context, EntityInfo entity,
ExtractionData extractionData, String domain) {
java.util.regex.Pattern numberPattern = java.util.regex.Pattern.compile("\\d+(\\.\\d+)?");
java.util.regex.Matcher matcher = numberPattern.matcher(context);
while (matcher.find()) {
String number = matcher.group();
String attributeKey = inferNumericAttributeKey(context, number, entity.getType());
if (attributeKey != null) {
AttributeInfo attribute = new AttributeInfo(
generateAttributeId(entity, attributeKey),
entity.getEntityId(),
attributeKey,
number,
"NUMBER",
domain,
0.7
);
extractionData.addAttribute(attribute);
}
}
}
/**
* 推断数字属性的键
*/
private String inferNumericAttributeKey(String context, String number, String entityType) {
context = context.toLowerCase();
if (context.contains("年龄") || context.contains("岁")) {
return "age";
} else if (context.contains("工资") || context.contains("薪资") || context.contains("收入")) {
return "salary";
} else if (context.contains("身高")) {
return "height";
} else if (context.contains("体重")) {
return "weight";
} else if (context.contains("成立于") || context.contains("建立于")) {
return "founded_year";
} else if (context.contains("人口")) {
return "population";
}
return "numeric_value";
}
/**
* 抽取描述性属性
*/
private void extractDescriptiveAttributes(String context, EntityInfo entity,
ExtractionData extractionData, String domain) {
// 基于实体类型和上下文的简单规则
switch (entity.getType()) {
case "Person":
extractPersonAttributes(context, entity, extractionData, domain);
break;
case "Organization":
extractOrganizationAttributes(context, entity, extractionData, domain);
break;
case "Location":
extractLocationAttributes(context, entity, extractionData, domain);
break;
}
}
/**
* 抽取人物属性
*/
private void extractPersonAttributes(String context, EntityInfo entity,
ExtractionData extractionData, String domain) {
if (context.contains("博士") || context.contains("教授")) {
AttributeInfo attribute = new AttributeInfo(
generateAttributeId(entity, "title"),
entity.getEntityId(),
"title",
"博士",
"STRING",
domain,
0.8
);
extractionData.addAttribute(attribute);
}
if (context.contains("男")) {
AttributeInfo attribute = new AttributeInfo(
generateAttributeId(entity, "gender"),
entity.getEntityId(),
"gender",
"男",
"STRING",
domain,
0.9
);
extractionData.addAttribute(attribute);
} else if (context.contains("女")) {
AttributeInfo attribute = new AttributeInfo(
generateAttributeId(entity, "gender"),
entity.getEntityId(),
"gender",
"女",
"STRING",
domain,
0.9
);
extractionData.addAttribute(attribute);
}
}
/**
* 抽取组织属性
*/
private void extractOrganizationAttributes(String context, EntityInfo entity,
ExtractionData extractionData, String domain) {
if (context.contains("公司") || context.contains("企业")) {
AttributeInfo attribute = new AttributeInfo(
generateAttributeId(entity, "organization_type"),
entity.getEntityId(),
"organization_type",
"公司",
"STRING",
domain,
0.8
);
extractionData.addAttribute(attribute);
}
}
/**
* 抽取地点属性
*/
private void extractLocationAttributes(String context, EntityInfo entity,
ExtractionData extractionData, String domain) {
if (context.contains("市") || context.contains("省") || context.contains("区")) {
AttributeInfo attribute = new AttributeInfo(
generateAttributeId(entity, "location_type"),
entity.getEntityId(),
"location_type",
"行政区划",
"STRING",
domain,
0.8
);
extractionData.addAttribute(attribute);
}
}
/**
* 处理共指消解
*/
private void resolveCoreferences(Annotation document, ExtractionData extractionData) {
Map<Integer, CorefChain> corefChains = document.get(CorefCoreAnnotations.CorefChainAnnotation.class);
if (corefChains == null) {
return;
}
for (CorefChain chain : corefChains.values()) {
CorefChain.CorefMention representative = chain.getRepresentativeMention();
if (representative != null) {
String representativeText = representative.mentionSpan;
// 为共指链中的其他提及创建别名关系
for (CorefChain.CorefMention mention : chain.getMentionsInTextualOrder()) {
if (!mention.equals(representative)) {
String mentionText = mention.mentionSpan;
// 查找对应的实体
Optional<EntityInfo> representativeEntity = findEntityByName(extractionData, representativeText);
Optional<EntityInfo> mentionEntity = findEntityByName(extractionData, mentionText);
if (representativeEntity.isPresent() && mentionEntity.isPresent()) {
// 创建别名关系
RelationInfo aliasRelation = new RelationInfo(
generateRelationId(representativeEntity.get(), mentionEntity.get(), "ALIAS"),
representativeEntity.get().getEntityId(),
mentionEntity.get().getEntityId(),
"ALIAS",
representativeEntity.get().getDomain(),
0.95
);
extractionData.addRelation(aliasRelation);
}
}
}
}
}
}
/**
* 后处理抽取结果
*/
private void postProcessExtraction(ExtractionData extractionData) {
// 合并重复实体(基于名称和类型的简单合并)
mergeDuplicateEntities(extractionData);
// 验证和清理关系
validateRelations(extractionData);
// 计算统计信息
calculateStatistics(extractionData);
}
/**
* 合并重复实体
*/
private void mergeDuplicateEntities(ExtractionData extractionData) {
Map<String, EntityInfo> entityMap = new HashMap<>();
List<EntityInfo> uniqueEntities = new ArrayList<>();
for (EntityInfo entity : extractionData.getEntities()) {
String key = entity.getName() + "|" + entity.getType();
EntityInfo existing = entityMap.get(key);
if (existing == null) {
entityMap.put(key, entity);
uniqueEntities.add(entity);
} else {
// 合并置信度
double newConfidence = Math.max(existing.getConfidence(), entity.getConfidence());
existing.setConfidence(newConfidence);
}
}
extractionData.setEntities(uniqueEntities);
}
/**
* 验证关系
*/
private void validateRelations(ExtractionData extractionData) {
List<RelationInfo> validRelations = extractionData.getRelations().stream()
.filter(relation -> {
// 检查关系中的实体是否存在
boolean startExists = extractionData.getEntities().stream()
.anyMatch(e -> e.getEntityId().equals(relation.getStartEntityId()));
boolean endExists = extractionData.getEntities().stream()
.anyMatch(e -> e.getEntityId().equals(relation.getEndEntityId()));
return startExists && endExists;
})
.collect(Collectors.toList());
extractionData.setRelations(validRelations);
}
/**
* 计算统计信息
*/
private void calculateStatistics(ExtractionData extractionData) {
Map<String, Long> entityTypeCount = extractionData.getEntities().stream()
.collect(Collectors.groupingBy(EntityInfo::getType, Collectors.counting()));
Map<String, Long> relationTypeCount = extractionData.getRelations().stream()
.collect(Collectors.groupingBy(RelationInfo::getType, Collectors.counting()));
extractionData.getStatistics().put("entity_types", String.valueOf(entityTypeCount.size()));
extractionData.getStatistics().put("relation_types", String.valueOf(relationTypeCount.size()));
extractionData.getStatistics().put("total_confidence",
String.valueOf(extractionData.getEntities().stream()
.mapToDouble(EntityInfo::getConfidence)
.average()
.orElse(0.0)));
}
// 工具方法
private String normalizeEntityType(String nerType) {
return ENTITY_TYPE_MAPPING.getOrDefault(nerType, "Other");
}
private boolean isValidEntity(String name, String type) {
return name != null && !name.trim().isEmpty() && name.length() > 1;
}
private double calculateEntityConfidence(String name, String type) {
// 基于实体长度和类型的简单置信度计算
double confidence = 0.5;
if (name.length() >= 3) confidence += 0.2;
if (name.length() >= 5) confidence += 0.1;
if (!"Other".equals(type)) confidence += 0.2;
return Math.min(confidence, 1.0);
}
private double calculateRelationConfidence(EntityInfo entity1, EntityInfo entity2, String relationType) {
// 基于实体置信度和关系类型的简单置信度计算
return (entity1.getConfidence() + entity2.getConfidence()) / 2 * 0.8;
}
private String generateEntityId(String name, String type) {
return String.format("ENT_%s_%s_%d",
type.toUpperCase(),
name.hashCode() & 0xfffffff,
System.currentTimeMillis() % 10000);
}
private String generateRelationId(EntityInfo entity1, EntityInfo entity2, String relationType) {
return String.format("REL_%s_%s_%s_%d",
entity1.getEntityId(),
relationType,
entity2.getEntityId(),
System.currentTimeMillis() % 10000);
}
private String generateAttributeId(EntityInfo entity, String key) {
return String.format("ATTR_%s_%s_%d",
entity.getEntityId(),
key,
System.currentTimeMillis() % 10000);
}
private Optional<EntityInfo> findEntityByName(ExtractionData extractionData, String name) {
return extractionData.getEntities().stream()
.filter(e -> e.getName().equals(name))
.findFirst();
}
private String getContextAround(String text, int position, int length, int contextSize) {
int start = Math.max(0, position - contextSize);
int end = Math.min(text.length(), position + length + contextSize);
return text.substring(start, end);
}
private double calculateAverageConfidence(ExtractionData extractionData) {
double entityAvg = extractionData.getEntities().stream()
.mapToDouble(EntityInfo::getConfidence)
.average()
.orElse(0.0);
double relationAvg = extractionData.getRelations().stream()
.mapToDouble(RelationInfo::getConfidence)
.average()
.orElse(0.0);
return (entityAvg + relationAvg) / 2;
}
private String convertToJson(List<?> list) {
try {
com.fasterxml.jackson.databind.ObjectMapper mapper = new com.fasterxml.jackson.databind.ObjectMapper();
return mapper.writeValueAsString(list);
} catch (Exception e) {
log.error("Failed to convert list to JSON", e);
return "[]";
}
}
/**
* 构建知识图谱
*/
private void buildKnowledgeGraph(ExtractionData extractionData, String domain, String source) {
try {
log.info("Building knowledge graph for domain: {}, source: {}", domain, source);
// 创建实体节点
for (EntityInfo entityInfo : extractionData.getEntities()) {
EntityNode entityNode = new EntityNode(
entityInfo.getEntityId(),
entityInfo.getName(),
entityInfo.getType(),
domain
);
entityNode.setConfidence(entityInfo.getConfidence());
entityNode.setSource(source);
knowledgeGraphService.saveEntity(entityNode);
}
// 创建关系
for (RelationInfo relationInfo : extractionData.getRelations()) {
EntityNode startEntity = knowledgeGraphService.findEntityById(relationInfo.getStartEntityId());
EntityNode endEntity = knowledgeGraphService.findEntityById(relationInfo.getEndEntityId());
if (startEntity != null && endEntity != null) {
startEntity.addRelation(endEntity, relationInfo.getType(), relationInfo.getConfidence());
knowledgeGraphService.saveEntity(startEntity);
}
}
// 创建属性
for (AttributeInfo attributeInfo : extractionData.getAttributes()) {
EntityNode entity = knowledgeGraphService.findEntityById(attributeInfo.getEntityId());
if (entity != null) {
entity.addAttribute(attributeInfo.getKey(), attributeInfo.getValue(), attributeInfo.getConfidence());
knowledgeGraphService.saveEntity(entity);
}
}
log.info("Knowledge graph built successfully: {} entities, {} relations, {} attributes",
extractionData.getEntities().size(),
extractionData.getRelations().size(),
extractionData.getAttributes().size());
} catch (Exception e) {
log.error("Failed to build knowledge graph", e);
throw new RuntimeException("知识图谱构建失败: " + e.getMessage());
}
}
/**
* 抽取数据封装类
*/
@Data
public static class ExtractionData {
private List<EntityInfo> entities = new ArrayList<>();
private List<RelationInfo> relations = new ArrayList<>();
private List<AttributeInfo> attributes = new ArrayList<>();
private Map<String, String> statistics = new HashMap<>();
public void addEntity(EntityInfo entity) {
this.entities.add(entity);
}
public void addRelation(RelationInfo relation) {
this.relations.add(relation);
}
public void addAttribute(AttributeInfo attribute) {
this.attributes.add(attribute);
}
}
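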
/**
* 实体信息类
*/
@Data
public static class EntityInfo {
private String entityId;
private String name;
private String type;
private String domain;
private Double confidence;
private Integer startPosition;
private Integer endPosition;
private Map<String, Object> metadata = new HashMap<>();
public EntityInfo(String entityId, String name, String type, String domain,
Double confidence, Integer startPosition, Integer endPosition) {
this.entityId = entityId;
this.name = name;
this.type = type;
this.domain = domain;
this.confidence = confidence;
this.startPosition = startPosition;
this.endPosition = endPosition;
}
}
/**
* 关系信息类
*/
@Data
public static class RelationInfo {
private String relationId;
private String startEntityId;
private String endEntityId;
private String type;
private String domain;
private Double confidence;
private Map<String, Object> metadata = new HashMap<>();
public RelationInfo(String relationId, String startEntityId, String endEntityId,
String type, String domain, Double confidence) {
this.relationId = relationId;
this.startEntityId = startEntityId;
this.endEntityId = endEntityId;
this.type = type;
this.domain = domain;
this.confidence = confidence;
}
}
/**
* 属性信息类
*/
@Data
public static class AttributeInfo {
private String attributeId;
private String entityId;
private String key;
private String value;
private String dataType;
private String domain;
private Double confidence;
private Map<String, Object> metadata = new HashMap<>();
public AttributeInfo(String attributeId, String entityId, String key, String value,
String dataType, String domain, Double confidence) {
this.attributeId = attributeId;
this.entityId = entityId;
this.key = key;
this.value = value;
this.dataType = dataType;
this.domain = domain;
this.confidence = confidence;
}
}
}
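上文 buildKnowledgeGraph 等方法依赖图模型类 EntityNode,下文问答服务还会引用 com.company.kg.graph.EntityNode 与 EntityRelation,但原文未给出这两个类的定义。下面是一个基于 Spring Data Neo4j 6.1+ 的最小示意,并非原文实现:构造器与 addRelation、addAttribute 的签名按上文调用方式假设;实体属性在此简化为节点上的复合属性,若要支持后文按 HAS_ATTRIBUTE 关系查询属性,需要改为独立的属性关系建模。
java
package com.company.kg.graph;

import lombok.Data;
import lombok.NoArgsConstructor;
import org.springframework.data.neo4j.core.schema.CompositeProperty;
import org.springframework.data.neo4j.core.schema.Id;
import org.springframework.data.neo4j.core.schema.Node;
import org.springframework.data.neo4j.core.schema.Relationship;
import org.springframework.data.neo4j.core.schema.RelationshipId;
import org.springframework.data.neo4j.core.schema.RelationshipProperties;
import org.springframework.data.neo4j.core.schema.TargetNode;
import java.util.ArrayList;
import java.util.HashMap;
import java.util.List;
import java.util.Map;

/**
 * 实体节点(示意):字段覆盖上文服务代码用到的 name/type/domain/confidence/source/description
 */
@Data
@NoArgsConstructor
@Node("EntityNode")
public class EntityNode {
    @Id
    private String entityId;
    private String name;
    private String type;
    private String domain;
    private Double confidence;
    private String source;
    private String description;

    // 简化处理:实体属性以复合属性(key -> value)的形式保存在节点上
    @CompositeProperty
    private Map<String, String> attributes = new HashMap<>();

    // 指向其他实体的RELATED_TO关系,关系上的type/confidence由EntityRelation承载
    @Relationship(type = "RELATED_TO")
    private List<EntityRelation> relations = new ArrayList<>();

    public EntityNode(String entityId, String name, String type, String domain) {
        this.entityId = entityId;
        this.name = name;
        this.type = type;
        this.domain = domain;
    }

    public void addRelation(EntityNode target, String relationType, Double relConfidence) {
        EntityRelation relation = new EntityRelation();
        relation.setType(relationType);
        relation.setConfidence(relConfidence);
        relation.setTarget(target);
        this.relations.add(relation);
    }

    public void addAttribute(String key, String value, Double attrConfidence) {
        // 简化处理:仅保存键值,置信度可按需扩展为独立的属性关系
        this.attributes.put(key, value);
    }
}

// ---- EntityRelation.java(同包,单独文件)----
@RelationshipProperties
@Data
public class EntityRelation {
    @RelationshipId
    private Long id;
    private String type;
    private Double confidence;
    @TargetNode
    private EntityNode target;
}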
知识图谱问答服务
java
package com.company.kg.service;
import com.company.kg.graph.EntityNode;
import com.company.kg.graph.EntityRelation;
import lombok.extern.slf4j.Slf4j;
import org.springframework.beans.factory.annotation.Autowired;
import org.springframework.beans.factory.annotation.Value;
import org.springframework.data.neo4j.core.Neo4jTemplate;
import org.springframework.stereotype.Service;
import java.util.*;
import java.util.regex.Pattern;
import java.util.stream.Collectors;
/**
* 知识图谱问答服务 - 提供基于知识图谱的智能问答功能
*/
@Service
@Slf4j
public class KnowledgeGraphQAService {
@Autowired
private Neo4jTemplate neo4jTemplate;
@Value("${kg.qa.max-path-length:3}")
private int maxPathLength;
@Value("${kg.qa.timeout:30000}")
private int timeout;
@Value("${kg.qa.enable-fallback:true}")
private boolean enableFallback;
// 问题模式映射:用LinkedHashMap按"从具体到一般"的顺序注册并匹配,
// 避免"(.*)是谁"等宽泛模式抢先命中"(.*)的创始人是谁"这类更具体的问题
private static final Map<Pattern, String> QUESTION_PATTERNS = new LinkedHashMap<>();
static {
QUESTION_PATTERNS.put(Pattern.compile("(.*)的创始人是谁"), "FOUNDER_QUERY");
QUESTION_PATTERNS.put(Pattern.compile("(.*)成立于什么时候"), "FOUNDED_QUERY");
QUESTION_PATTERNS.put(Pattern.compile("(.*)的(.*)是什么"), "ATTRIBUTE_QUERY");
QUESTION_PATTERNS.put(Pattern.compile("(.*)和(.*)有什么关系"), "RELATION_QUERY");
QUESTION_PATTERNS.put(Pattern.compile("(.*)有哪些属性"), "PROPERTIES_QUERY");
QUESTION_PATTERNS.put(Pattern.compile("(.*)属于哪个(.*)"), "CATEGORY_QUERY");
QUESTION_PATTERNS.put(Pattern.compile("哪些(.*)位于(.*)"), "LOCATED_ENTITY_QUERY");
QUESTION_PATTERNS.put(Pattern.compile("(.*)在哪里"), "LOCATION_QUERY");
QUESTION_PATTERNS.put(Pattern.compile("(.*)是谁"), "PERSON_QUERY");
QUESTION_PATTERNS.put(Pattern.compile("(.*)是什么"), "ENTITY_QUERY");
}
/**
* 回答自然语言问题
*/
public QAAnswer answerQuestion(String question, String domain) {
long startTime = System.currentTimeMillis();
QAAnswer answer = new QAAnswer();
answer.setQuestion(question);
answer.setDomain(domain);
try {
log.info("Processing question: {}", question);
// 1. 问题解析
QueryTemplate queryTemplate = parseQuestion(question, domain);
answer.setQueryTemplate(queryTemplate);
// 2. 转换为Cypher查询
String cypherQuery = generateCypherQuery(queryTemplate);
answer.setCypherQuery(cypherQuery);
// 3. 执行查询
List<Map<String, Object>> results = executeCypherQuery(cypherQuery);
answer.setQueryResults(results);
// 4. 生成自然语言回答
String naturalAnswer = generateNaturalLanguageAnswer(results, queryTemplate);
answer.setAnswer(naturalAnswer);
// 5. 设置回答元数据
answer.setSuccess(true);
answer.setProcessingTimeMs(System.currentTimeMillis() - startTime);
answer.setResultCount(results.size());
log.info("Question answered successfully: {} results in {}ms",
results.size(), answer.getProcessingTimeMs());
} catch (Exception e) {
log.error("Failed to answer question: {}", question, e);
answer.setSuccess(false);
answer.setErrorMessage("回答问题失败: " + e.getMessage());
// 降级回答
if (enableFallback) {
answer.setAnswer(generateFallbackAnswer(question));
answer.setFallbackUsed(true);
}
}
return answer;
}
/**
* 解析自然语言问题
*/
public QueryTemplate parseQuestion(String question, String domain) {
QueryTemplate template = new QueryTemplate();
template.setOriginalQuestion(question);
template.setDomain(domain);
// 识别问题类型
for (Map.Entry<Pattern, String> entry : QUESTION_PATTERNS.entrySet()) {
java.util.regex.Matcher matcher = entry.getKey().matcher(question);
if (matcher.find()) {
template.setQuestionType(entry.getValue());
// 提取实体和参数
extractEntitiesAndParameters(matcher, template, question);
break;
}
}
// 如果未匹配到已知模式,使用默认查询
if (template.getQuestionType() == null) {
template.setQuestionType("ENTITY_SEARCH");
template.getParameters().put("search_term", question);
}
return template;
}
/**
* 提取实体和参数
*/
private void extractEntitiesAndParameters(java.util.regex.Matcher matcher,
QueryTemplate template, String question) {
String questionType = template.getQuestionType();
switch (questionType) {
case "PERSON_QUERY":
template.getEntities().add(matcher.group(1));
template.getParameters().put("entity_name", matcher.group(1));
template.getParameters().put("entity_type", "Person");
break;
case "ENTITY_QUERY":
template.getEntities().add(matcher.group(1));
template.getParameters().put("entity_name", matcher.group(1));
break;
case "LOCATION_QUERY":
template.getEntities().add(matcher.group(1));
template.getParameters().put("entity_name", matcher.group(1));
template.getParameters().put("relation_type", "LOCATED_IN");
break;
case "ATTRIBUTE_QUERY":
template.getEntities().add(matcher.group(1));
template.getParameters().put("entity_name", matcher.group(1));
template.getParameters().put("attribute_key", matcher.group(2));
break;
case "RELATION_QUERY":
template.getEntities().add(matcher.group(1));
template.getEntities().add(matcher.group(2));
template.getParameters().put("entity1_name", matcher.group(1));
template.getParameters().put("entity2_name", matcher.group(2));
break;
case "PROPERTIES_QUERY":
template.getEntities().add(matcher.group(1));
template.getParameters().put("entity_name", matcher.group(1));
break;
case "CATEGORY_QUERY":
template.getEntities().add(matcher.group(1));
template.getParameters().put("entity_name", matcher.group(1));
template.getParameters().put("category_type", matcher.group(2));
break;
case "LOCATED_ENTITY_QUERY":
template.getParameters().put("entity_type", matcher.group(1));
template.getEntities().add(matcher.group(2));
template.getParameters().put("location_name", matcher.group(2));
break;
case "FOUNDED_QUERY":
template.getEntities().add(matcher.group(1));
template.getParameters().put("entity_name", matcher.group(1));
template.getParameters().put("attribute_key", "founded_year");
break;
case "FOUNDER_QUERY":
template.getEntities().add(matcher.group(1));
template.getParameters().put("entity_name", matcher.group(1));
template.getParameters().put("relation_type", "FOUNDED_BY");
break;
}
// 在知识图谱中解析问题涉及的实体
resolveEntitiesInGraph(template);
}
/**
* 解析知识图谱中的实体
*/
private void resolveEntitiesInGraph(QueryTemplate template) {
for (String entityName : template.getEntities()) {
List<EntityNode> matchedEntities = findEntitiesByName(entityName, template.getDomain());
if (!matchedEntities.isEmpty()) {
template.getResolvedEntities().addAll(matchedEntities);
}
}
}
/**
* 生成Cypher查询
*/
public String generateCypherQuery(QueryTemplate template) {
String questionType = template.getQuestionType();
switch (questionType) {
case "PERSON_QUERY":
return generatePersonQuery(template);
case "ENTITY_QUERY":
return generateEntityQuery(template);
case "LOCATION_QUERY":
return generateLocationQuery(template);
case "ATTRIBUTE_QUERY":
return generateAttributeQuery(template);
case "RELATION_QUERY":
return generateRelationQuery(template);
case "PROPERTIES_QUERY":
return generatePropertiesQuery(template);
case "CATEGORY_QUERY":
return generateCategoryQuery(template);
case "LOCATED_ENTITY_QUERY":
return generateLocatedEntityQuery(template);
case "FOUNDED_QUERY":
return generateFoundedQuery(template);
case "FOUNDER_QUERY":
return generateFounderQuery(template);
case "ENTITY_SEARCH":
return generateEntitySearchQuery(template);
default:
return generateDefaultQuery(template);
}
}
/**
* 生成人物查询
*/
private String generatePersonQuery(QueryTemplate template) {
String entityName = (String) template.getParameters().get("entity_name");
return "MATCH (e:EntityNode) " +
"WHERE e.name =~ $name AND e.type = 'Person' AND e.domain = $domain " +
"RETURN e.name AS name, e.type AS type, e.description AS description, " +
" e.confidence AS confidence, e.source AS source " +
"LIMIT 10";
}
/**
* 生成实体查询
*/
private String generateEntityQuery(QueryTemplate template) {
String entityName = (String) template.getParameters().get("entity_name");
return "MATCH (e:EntityNode) " +
"WHERE e.name =~ $name AND e.domain = $domain " +
"RETURN e.name AS name, e.type AS type, e.description AS description, " +
" e.confidence AS confidence " +
"LIMIT 10";
}
/**
* 生成位置查询
*/
private String generateLocationQuery(QueryTemplate template) {
String entityName = (String) template.getParameters().get("entity_name");
String relationType = (String) template.getParameters().get("relation_type");
return "MATCH (e:EntityNode)-[r:RELATED_TO]->(loc:EntityNode) " +
"WHERE e.name =~ $name AND r.type = $relationType AND e.domain = $domain " +
"RETURN loc.name AS location, r.type AS relation, r.confidence AS confidence " +
"LIMIT 10";
}
/**
* 生成属性查询
*/
private String generateAttributeQuery(QueryTemplate template) {
String entityName = (String) template.getParameters().get("entity_name");
String attributeKey = (String) template.getParameters().get("attribute_key");
return "MATCH (e:EntityNode)-[a:HAS_ATTRIBUTE]->() " +
"WHERE e.name =~ $name AND a.key = $attributeKey AND e.domain = $domain " +
"RETURN a.key AS attribute_key, a.value AS attribute_value, " +
" a.confidence AS confidence " +
"LIMIT 10";
}
/**
* 生成关系查询
*/
private String generateRelationQuery(QueryTemplate template) {
String entity1Name = (String) template.getParameters().get("entity1_name");
String entity2Name = (String) template.getParameters().get("entity2_name");
return "MATCH (e1:EntityNode)-[r:RELATED_TO]-(e2:EntityNode) " +
"WHERE e1.name =~ $entity1Name AND e2.name =~ $entity2Name " +
"AND e1.domain = $domain AND e2.domain = $domain " +
"RETURN e1.name AS entity1, e2.name AS entity2, r.type AS relation, " +
" r.confidence AS confidence " +
"LIMIT 10";
}
/**
* 生成属性列表查询
*/
private String generatePropertiesQuery(QueryTemplate template) {
String entityName = (String) template.getParameters().get("entity_name");
return "MATCH (e:EntityNode)-[a:HAS_ATTRIBUTE]->() " +
"WHERE e.name =~ $name AND e.domain = $domain " +
"RETURN a.key AS attribute_key, a.value AS attribute_value, " +
" a.data_type AS data_type, a.confidence AS confidence " +
"ORDER BY a.confidence DESC " +
"LIMIT 20";
}
/**
* 生成分类查询
*/
private String generateCategoryQuery(QueryTemplate template) {
String entityName = (String) template.getParameters().get("entity_name");
String categoryType = (String) template.getParameters().get("category_type");
return "MATCH (e:EntityNode)-[r:RELATED_TO]->(c:EntityNode) " +
"WHERE e.name =~ $name AND c.type = $categoryType AND e.domain = $domain " +
"RETURN c.name AS category, r.type AS relation, r.confidence AS confidence " +
"LIMIT 10";
}
/**
* 生成位置实体查询
*/
private String generateLocatedEntityQuery(QueryTemplate template) {
String entityType = (String) template.getParameters().get("entity_type");
String locationName = (String) template.getParameters().get("location_name");
return "MATCH (e:EntityNode)-[r:RELATED_TO]->(loc:EntityNode) " +
"WHERE e.type = $entityType AND loc.name =~ $locationName " +
"AND r.type = 'LOCATED_IN' AND e.domain = $domain " +
"RETURN e.name AS entity_name, e.type AS entity_type, " +
" r.confidence AS confidence " +
"LIMIT 10";
}
/**
* 生成成立时间查询
*/
private String generateFoundedQuery(QueryTemplate template) {
String entityName = (String) template.getParameters().get("entity_name");
return "MATCH (e:EntityNode)-[a:HAS_ATTRIBUTE]->() " +
"WHERE e.name =~ $name AND a.key = 'founded_year' AND e.domain = $domain " +
"RETURN a.value AS founded_year, a.confidence AS confidence " +
"LIMIT 5";
}
/**
* 生成创始人查询
*/
private String generateFounderQuery(QueryTemplate template) {
String entityName = (String) template.getParameters().get("entity_name");
return "MATCH (e:EntityNode)<-[r:RELATED_TO]-(founder:EntityNode) " +
"WHERE e.name =~ $name AND r.type = 'FOUNDED_BY' AND e.domain = $domain " +
"RETURN founder.name AS founder_name, r.confidence AS confidence " +
"LIMIT 10";
}
/**
* 生成实体搜索查询
*/
private String generateEntitySearchQuery(QueryTemplate template) {
String searchTerm = (String) template.getParameters().get("search_term");
return "MATCH (e:EntityNode) " +
"WHERE e.name =~ $searchTerm OR e.description =~ $searchTerm " +
"AND e.domain = $domain " +
"RETURN e.name AS name, e.type AS type, e.description AS description, " +
" e.confidence AS confidence " +
"ORDER BY e.confidence DESC " +
"LIMIT 10";
}
/**
* 生成默认查询
*/
private String generateDefaultQuery(QueryTemplate template) {
return "MATCH (e:EntityNode) " +
"WHERE e.domain = $domain " +
"RETURN e.name AS name, e.type AS type " +
"LIMIT 5";
}
/**
* 执行Cypher查询
*/
private List<Map<String, Object>> executeCypherQuery(String cypherQuery) {
try {
Map<String, Object> parameters = new HashMap<>();
// 注意:上面生成的Cypher使用了$name、$domain等占位符,执行前应按查询类型绑定对应参数,
// 否则Neo4j会因缺少参数而报错;一种参数绑定思路见本方法之后的buildQueryParameters示意
return neo4jTemplate.query(cypherQuery, parameters).queryResults();
} catch (Exception e) {
log.error("Failed to execute Cypher query: {}", cypherQuery, e);
throw new RuntimeException("Cypher查询执行失败: " + e.getMessage());
}
}
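/**
 * 参数绑定示意(非原文实现,仅供参考):将QueryTemplate中的参数
 * 映射为generate*Query方法里使用的Cypher占位符($name、$domain等)。
 * 若采用此思路,可在answerQuestion中先构造该参数Map,再传给查询执行逻辑。
 */
private Map<String, Object> buildQueryParameters(QueryTemplate template) {
    Map<String, Object> params = new HashMap<>();
    params.put("domain", template.getDomain());
    Map<String, Object> p = template.getParameters();
    // 实体名称统一转为不区分大小写的模糊匹配正则,与findEntitiesByName保持一致
    if (p.containsKey("entity_name")) {
        params.put("name", "(?i).*" + p.get("entity_name") + ".*");
    }
    if (p.containsKey("entity1_name")) {
        params.put("entity1Name", "(?i).*" + p.get("entity1_name") + ".*");
    }
    if (p.containsKey("entity2_name")) {
        params.put("entity2Name", "(?i).*" + p.get("entity2_name") + ".*");
    }
    if (p.containsKey("location_name")) {
        params.put("locationName", "(?i).*" + p.get("location_name") + ".*");
    }
    if (p.containsKey("search_term")) {
        params.put("searchTerm", "(?i).*" + p.get("search_term") + ".*");
    }
    // 其余参数按原始值直接绑定
    if (p.containsKey("entity_type")) params.put("entityType", p.get("entity_type"));
    if (p.containsKey("relation_type")) params.put("relationType", p.get("relation_type"));
    if (p.containsKey("attribute_key")) params.put("attributeKey", p.get("attribute_key"));
    if (p.containsKey("category_type")) params.put("categoryType", p.get("category_type"));
    return params;
}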
/**
* 生成自然语言回答
*/
private String generateNaturalLanguageAnswer(List<Map<String, Object>> results,
QueryTemplate template) {
if (results.isEmpty()) {
return "抱歉,我没有找到相关的信息。";
}
String questionType = template.getQuestionType();
switch (questionType) {
case "PERSON_QUERY":
return generatePersonAnswer(results, template);
case "ENTITY_QUERY":
return generateEntityAnswer(results, template);
case "LOCATION_QUERY":
return generateLocationAnswer(results, template);
case "ATTRIBUTE_QUERY":
return generateAttributeAnswer(results, template);
case "RELATION_QUERY":
return generateRelationAnswer(results, template);
case "PROPERTIES_QUERY":
return generatePropertiesAnswer(results, template);
default:
return generateDefaultAnswer(results, template);
}
}
/**
* 生成人物回答
*/
private String generatePersonAnswer(List<Map<String, Object>> results, QueryTemplate template) {
StringBuilder answer = new StringBuilder();
String entityName = (String) template.getParameters().get("entity_name");
if (results.size() == 1) {
Map<String, Object> result = results.get(0);
answer.append(entityName).append("是");
answer.append(result.get("type")).append("。");
if (result.get("description") != null) {
answer.append(" ").append(result.get("description"));
}
} else {
answer.append("找到了多个名为").append(entityName).append("的人物:\n");
for (Map<String, Object> result : results) {
answer.append("• ").append(result.get("name")).append(" (").append(result.get("type")).append(")");
if (result.get("description") != null) {
answer.append(" - ").append(result.get("description"));
}
answer.append("\n");
}
}
return answer.toString();
}
/**
* 生成实体回答
*/
private String generateEntityAnswer(List<Map<String, Object>> results, QueryTemplate template) {
StringBuilder answer = new StringBuilder();
String entityName = (String) template.getParameters().get("entity_name");
if (results.size() == 1) {
Map<String, Object> result = results.get(0);
answer.append(entityName).append("是");
answer.append(result.get("type")).append("。");
if (result.get("description") != null) {
answer.append(" ").append(result.get("description"));
}
} else {
answer.append("找到了多个名为").append(entityName).append("的实体:\n");
for (Map<String, Object> result : results) {
answer.append("• ").append(result.get("name")).append(" (").append(result.get("type")).append(")");
if (result.get("description") != null) {
answer.append(" - ").append(result.get("description"));
}
answer.append("\n");
}
}
return answer.toString();
}
/**
* 生成位置回答
*/
private String generateLocationAnswer(List<Map<String, Object>> results, QueryTemplate template) {
StringBuilder answer = new StringBuilder();
String entityName = (String) template.getParameters().get("entity_name");
if (!results.isEmpty()) {
Map<String, Object> result = results.get(0);
answer.append(entityName).append("位于").append(result.get("location")).append("。");
} else {
answer.append("没有找到").append(entityName).append("的位置信息。");
}
return answer.toString();
}
/**
* 生成属性回答
*/
private String generateAttributeAnswer(List<Map<String, Object>> results, QueryTemplate template) {
StringBuilder answer = new StringBuilder();
String entityName = (String) template.getParameters().get("entity_name");
String attributeKey = (String) template.getParameters().get("attribute_key");
if (!results.isEmpty()) {
Map<String, Object> result = results.get(0);
answer.append(entityName).append("的").append(attributeKey).append("是");
answer.append(result.get("attribute_value")).append("。");
} else {
answer.append("没有找到").append(entityName).append("的").append(attributeKey).append("信息。");
}
return answer.toString();
}
/**
* 生成关系回答
*/
private String generateRelationAnswer(List<Map<String, Object>> results, QueryTemplate template) {
StringBuilder answer = new StringBuilder();
String entity1Name = (String) template.getParameters().get("entity1_name");
String entity2Name = (String) template.getParameters().get("entity2_name");
if (!results.isEmpty()) {
Map<String, Object> result = results.get(0);
answer.append(entity1Name).append("和").append(entity2Name).append("之间存在");
answer.append(result.get("relation")).append("的关系。");
} else {
answer.append("没有找到").append(entity1Name).append("和").append(entity2Name).append("之间的关系。");
}
return answer.toString();
}
/**
* 生成属性列表回答
*/
private String generatePropertiesAnswer(List<Map<String, Object>> results, QueryTemplate template) {
StringBuilder answer = new StringBuilder();
String entityName = (String) template.getParameters().get("entity_name");
if (!results.isEmpty()) {
answer.append(entityName).append("的属性包括:\n");
for (Map<String, Object> result : results) {
answer.append("• ").append(result.get("attribute_key")).append(": ");
answer.append(result.get("attribute_value")).append("\n");
}
} else {
answer.append("没有找到").append(entityName).append("的属性信息。");
}
return answer.toString();
}
/**
* 生成默认回答
*/
private String generateDefaultAnswer(List<Map<String, Object>> results, QueryTemplate template) {
StringBuilder answer = new StringBuilder();
answer.append("找到以下相关信息:\n");
for (Map<String, Object> result : results) {
answer.append("• ").append(result.get("name")).append(" (").append(result.get("type")).append(")\n");
}
return answer.toString();
}
/**
* 生成降级回答
*/
private String generateFallbackAnswer(String question) {
return "抱歉,我暂时无法回答这个问题。您可以尝试:\n" +
"1. 重新表述您的问题\n" +
"2. 查询特定的实体或关系\n" +
"3. 检查知识图谱中是否有相关数据\n\n" +
"对于给您带来的不便,我们深表歉意。";
}
/**
* 根据名称查找实体
*/
public List<EntityNode> findEntitiesByName(String name, String domain) {
String query = "MATCH (e:EntityNode) " +
"WHERE e.name =~ $name AND e.domain = $domain " +
"RETURN e";
Map<String, Object> parameters = new HashMap<>();
parameters.put("name", "(?i).*" + name + ".*");
parameters.put("domain", domain);
return neo4jTemplate.query(query, parameters).to(EntityNode.class);
}
/**
* 查找实体之间的关系路径
*/
public List<Map<String, Object>> findRelationPath(String entity1Id, String entity2Id, int maxPathLength) {
String query = "MATCH path = (e1:EntityNode)-[r*1.." + maxPathLength + "]-(e2:EntityNode) " +
"WHERE e1.entityId = $entity1Id AND e2.entityId = $entity2Id " +
"RETURN path, length(path) as pathLength " +
"ORDER BY pathLength " +
"LIMIT 10";
Map<String, Object> parameters = new HashMap<>();
parameters.put("entity1Id", entity1Id);
parameters.put("entity2Id", entity2Id);
return neo4jTemplate.query(query, parameters).queryResults();
}
/**
* 问答答案封装类
*/
@Data
public static class QAAnswer {
private boolean success;
private String question;
private String domain;
private String answer;
private QueryTemplate queryTemplate;
private String cypherQuery;
private List<Map<String, Object>> queryResults;
private String errorMessage;
private boolean fallbackUsed;
private Long processingTimeMs;
private Integer resultCount;
public QAAnswer() {
this.success = false;
this.queryResults = new ArrayList<>();
}
}
/**
* 查询模板类
*/
@Data
public static class QueryTemplate {
private String originalQuestion;
private String questionType;
private String domain;
private List<String> entities = new ArrayList<>();
private List<EntityNode> resolvedEntities = new ArrayList<>();
private Map<String, Object> parameters = new HashMap<>();
public QueryTemplate() {
this.parameters = new HashMap<>();
this.entities = new ArrayList<>();
this.resolvedEntities = new ArrayList<>();
}
}
}
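后文的curl示例会调用 /kg-qa/api/v1/qa/answer 等REST接口,但原文未给出对应的Controller。下面是一个最小示意,并非原文实现:假设上下文路径 /kg-qa 已在Spring Boot配置(server.servlet.context-path)中设置,QAController、QARequest等命名均为示例;/api/v1/extraction/*、/api/v1/qa/execute-cypher 等接口可按同样方式补充。
java
package com.company.kg.controller;

import com.company.kg.service.KnowledgeGraphQAService;
import lombok.Data;
import org.springframework.beans.factory.annotation.Autowired;
import org.springframework.web.bind.annotation.PostMapping;
import org.springframework.web.bind.annotation.RequestBody;
import org.springframework.web.bind.annotation.RequestMapping;
import org.springframework.web.bind.annotation.RestController;

/**
 * 问答REST接口(示意):对应后文curl示例中的 POST /api/v1/qa/answer
 */
@RestController
@RequestMapping("/api/v1/qa")
public class QAController {

    @Autowired
    private KnowledgeGraphQAService qaService;

    @PostMapping("/answer")
    public KnowledgeGraphQAService.QAAnswer answer(@RequestBody QARequest request) {
        // 直接返回QAAnswer,由Jackson序列化为JSON响应
        return qaService.answerQuestion(request.getQuestion(), request.getDomain());
    }

    /** 请求体:question为自然语言问题,domain用于限定知识图谱领域 */
    @Data
    public static class QARequest {
        private String question;
        private String domain;
    }
}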
测试用例
信息抽取服务测试
java
package com.company.kg.service;
import com.company.kg.entity.TextSource;
import org.junit.jupiter.api.Test;
import org.junit.jupiter.api.extension.ExtendWith;
import org.springframework.beans.factory.annotation.Autowired;
import org.springframework.boot.test.context.SpringBootTest;
import org.springframework.test.context.junit.jupiter.SpringExtension;
import static org.junit.jupiter.api.Assertions.*;
/**
* 信息抽取服务测试
*/
@ExtendWith(SpringExtension.class)
@SpringBootTest
class InformationExtractionServiceTest {
@Autowired
private InformationExtractionService extractionService;
@Test
void testExtractInformation() {
String text = "张三在北京的清华大学工作。李四是清华大学的教授。清华大学位于北京市。";
String domain = "test";
InformationExtractionService.ExtractionData result =
extractionService.extractInformation(text, domain);
assertNotNull(result);
assertTrue(result.getEntities().size() >= 3); // 张三, 李四, 清华大学, 北京, 北京市
assertTrue(result.getRelations().size() >= 2); // 工作在, 位于
// 验证实体
assertTrue(result.getEntities().stream()
.anyMatch(e -> "张三".equals(e.getName()) && "Person".equals(e.getType())));
assertTrue(result.getEntities().stream()
.anyMatch(e -> "清华大学".equals(e.getName()) && "Organization".equals(e.getType())));
}
@Test
void testEntityExtraction() {
String text = "苹果公司由史蒂夫·乔布斯在1976年创立。公司总部位于加利福尼亚州。";
String domain = "test";
InformationExtractionService.ExtractionData result =
extractionService.extractInformation(text, domain);
assertNotNull(result);
// 验证实体识别
var entities = result.getEntities();
assertTrue(entities.stream().anyMatch(e -> "苹果公司".equals(e.getName())));
assertTrue(entities.stream().anyMatch(e -> "史蒂夫·乔布斯".equals(e.getName())));
assertTrue(entities.stream().anyMatch(e -> "加利福尼亚州".equals(e.getName())));
}
@Test
void testRelationExtraction() {
String text = "马云是阿里巴巴集团的创始人。阿里巴巴集团位于杭州市。";
String domain = "test";
InformationExtractionService.ExtractionData result =
extractionService.extractInformation(text, domain);
assertNotNull(result);
// 验证关系抽取
var relations = result.getRelations();
assertTrue(relations.size() > 0);
// 应该包含创建关系和位置关系
// 注意contains的方向:应判断关系类型中是否包含中文关键词,而不是反过来
boolean hasFounderRelation = relations.stream()
.anyMatch(r -> "FOUNDED_BY".equals(r.getType()) || (r.getType() != null && r.getType().contains("创始人")));
boolean hasLocationRelation = relations.stream()
.anyMatch(r -> "LOCATED_IN".equals(r.getType()) || (r.getType() != null && r.getType().contains("位于")));
assertTrue(hasFounderRelation || hasLocationRelation);
}
@Test
void testEmptyText() {
String text = "";
String domain = "test";
assertThrows(IllegalArgumentException.class, () -> {
extractionService.extractInformation(text, domain);
});
}
@Test
void testTextSourceExtraction() {
TextSource textSource = new TextSource("测试文档", TextSource.SourceType.TXT,
"百度公司由李彦宏于2000年在北京创立。");
textSource.setDomain("test");
var result = extractionService.extractFromTextSource(textSource);
assertNotNull(result);
assertTrue(result.getEntityCount() > 0);
assertTrue(result.getRelationCount() > 0);
assertNotNull(result.getExtractionConfidence());
}
}
知识图谱问答服务测试
java
package com.company.kg.service;
import org.junit.jupiter.api.Test;
import org.junit.jupiter.api.extension.ExtendWith;
import org.springframework.beans.factory.annotation.Autowired;
import org.springframework.boot.test.context.SpringBootTest;
import org.springframework.test.context.junit.jupiter.SpringExtension;
import static org.junit.jupiter.api.Assertions.*;
/**
* 知识图谱问答服务测试
*/
@ExtendWith(SpringExtension.class)
@SpringBootTest
class KnowledgeGraphQAServiceTest {
@Autowired
private KnowledgeGraphQAService qaService;
@Test
void testParseQuestion() {
String question = "马云是谁?";
String domain = "test";
KnowledgeGraphQAService.QueryTemplate template =
qaService.parseQuestion(question, domain);
assertNotNull(template);
assertEquals("PERSON_QUERY", template.getQuestionType());
assertEquals("马云", template.getParameters().get("entity_name"));
assertEquals("Person", template.getParameters().get("entity_type"));
}
@Test
void testGenerateCypherQuery() {
KnowledgeGraphQAService.QueryTemplate template = new KnowledgeGraphQAService.QueryTemplate();
template.setQuestionType("PERSON_QUERY");
template.getParameters().put("entity_name", "马云");
template.setDomain("test");
String cypherQuery = qaService.generateCypherQuery(template);
assertNotNull(cypherQuery);
assertTrue(cypherQuery.contains("MATCH"));
assertTrue(cypherQuery.contains("EntityNode"));
assertTrue(cypherQuery.contains("Person"));
}
@Test
void testAnswerQuestion() {
String question = "清华大学在哪里?";
String domain = "test";
KnowledgeGraphQAService.QAAnswer answer =
qaService.answerQuestion(question, domain);
assertNotNull(answer);
// 由于测试环境可能没有数据,主要测试流程是否正常
assertTrue(answer.isSuccess() || answer.isFallbackUsed());
assertNotNull(answer.getAnswer());
}
@Test
void testFindEntitiesByName() {
String name = "测试实体";
String domain = "test";
var entities = qaService.findEntitiesByName(name, domain);
assertNotNull(entities);
// 测试环境可能没有数据,主要测试方法是否正常执行
}
@Test
void testQuestionTypes() {
String[] questions = {
"马云是谁?",
"阿里巴巴是什么?",
"清华大学在哪里?",
"马云的年龄是多少?",
"马云和阿里巴巴有什么关系?",
"阿里巴巴有哪些属性?"
};
String domain = "test";
for (String question : questions) {
KnowledgeGraphQAService.QueryTemplate template =
qaService.parseQuestion(question, domain);
assertNotNull(template);
assertNotNull(template.getQuestionType());
assertFalse(template.getQuestionType().isEmpty());
// 生成Cypher查询应该不抛出异常
String cypherQuery = qaService.generateCypherQuery(template);
assertNotNull(cypherQuery);
assertFalse(cypherQuery.isEmpty());
}
}
}
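上述两个测试类都标注了@SpringBootTest,依赖测试环境中已有可连接的Neo4j(以及其余数据源)。一种可选做法是用Testcontainers在测试时拉起临时Neo4j容器,下面是一个示意,并非原文实现:假设项目已引入 org.testcontainers:neo4j 与 org.testcontainers:junit-jupiter 依赖,spring.neo4j.* 属性名以Spring Boot 2.4+/Spring Data Neo4j 6为例,MySQL、Redis等其余依赖仍需在测试配置中提供或屏蔽。
java
package com.company.kg.service;

import org.junit.jupiter.api.Test;
import org.springframework.beans.factory.annotation.Autowired;
import org.springframework.boot.test.context.SpringBootTest;
import org.springframework.test.context.DynamicPropertyRegistry;
import org.springframework.test.context.DynamicPropertySource;
import org.testcontainers.containers.Neo4jContainer;
import org.testcontainers.junit.jupiter.Container;
import org.testcontainers.junit.jupiter.Testcontainers;

import static org.junit.jupiter.api.Assertions.assertNotNull;

/**
 * 基于Testcontainers启动临时Neo4j的集成测试示意,避免依赖本地已安装的Neo4j
 */
@SpringBootTest
@Testcontainers
class KnowledgeGraphQAServiceIT {

    @Container
    static Neo4jContainer<?> neo4j = new Neo4jContainer<>("neo4j:4.4").withAdminPassword("password");

    // 将容器的连接信息注入Spring上下文;属性名取决于Spring Boot/SDN版本
    @DynamicPropertySource
    static void neo4jProperties(DynamicPropertyRegistry registry) {
        registry.add("spring.neo4j.uri", neo4j::getBoltUrl);
        registry.add("spring.neo4j.authentication.username", () -> "neo4j");
        registry.add("spring.neo4j.authentication.password", () -> "password");
    }

    @Autowired
    private KnowledgeGraphQAService qaService;

    @Test
    void answerQuestionAgainstEmptyGraph() {
        // 空图谱下应返回降级回答而不是抛出异常
        KnowledgeGraphQAService.QAAnswer answer = qaService.answerQuestion("清华大学在哪里?", "test");
        assertNotNull(answer);
        assertNotNull(answer.getAnswer());
    }
}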
Docker部署
Dockerfile
dockerfile
FROM openjdk:11-jre-slim
# 安装系统依赖
RUN apt-get update && apt-get install -y \
curl \
gnupg \
&& rm -rf /var/lib/apt/lists/*
# 创建应用目录
WORKDIR /app
# 创建非root用户
RUN groupadd -r appuser && useradd -r -g appuser appuser
# 复制JAR文件
COPY target/kg-qa-system-1.0.0.jar app.jar
# 创建存储目录
RUN mkdir -p /app/uploads /app/processed /app/logs && \
chown -R appuser:appuser /app
# 切换用户
USER appuser
# 暴露端口
EXPOSE 8082
# 健康检查
HEALTHCHECK --interval=30s --timeout=3s --start-period=5s --retries=3 \
CMD curl -f http://localhost:8082/kg-qa/actuator/health || exit 1
# 启动应用
ENTRYPOINT ["java", "-jar", "app.jar"]
docker-compose.yml
yml
version: '3.8'
services:
  kg-qa-system:
    build: .
    ports:
      - "8082:8082"
    environment:
      - SPRING_PROFILES_ACTIVE=prod
      - SPRING_DATASOURCE_URL=jdbc:mysql://mysql:3306/kg_qa_system
      - SPRING_DATA_NEO4J_URI=bolt://neo4j:7687
      - SPRING_DATA_NEO4J_USERNAME=neo4j
      - SPRING_DATA_NEO4J_PASSWORD=password
      - SPRING_REDIS_HOST=redis
    depends_on:
      - mysql
      - redis
      - neo4j
    volumes:
      - ./uploads:/app/uploads
      - ./processed:/app/processed
      - ./logs:/app/logs
    networks:
      - kg-network
  mysql:
    image: mysql:8.0
    environment:
      - MYSQL_ROOT_PASSWORD=rootpassword
      - MYSQL_DATABASE=kg_qa_system
      - MYSQL_USER=kg_user
      - MYSQL_PASSWORD=kg_password
    volumes:
      - mysql_data:/var/lib/mysql
    networks:
      - kg-network
  redis:
    image: redis:7-alpine
    command: redis-server --appendonly yes
    volumes:
      - redis_data:/data
    networks:
      - kg-network
  neo4j:
    image: neo4j:4.4
    environment:
      - NEO4J_AUTH=neo4j/password
      - NEO4J_PLUGINS=["apoc"]
    volumes:
      - neo4j_data:/data
      - neo4j_logs:/logs
    ports:
      - "7474:7474"
      - "7687:7687"
    networks:
      - kg-network
volumes:
  mysql_data:
  redis_data:
  neo4j_data:
  neo4j_logs:
networks:
  kg-network:
    driver: bridge
使用Docker快速启动
bash
# 克隆项目
git clone <repository-url>
cd kg-qa-system
# 构建项目
mvn clean package
# 启动所有服务
docker-compose up -d
# 查看日志
docker-compose logs -f kg-qa-system
手动启动
bash
# 创建数据库
mysql -u root -p -e "CREATE DATABASE kg_qa_system;"
# 启动Neo4j
neo4j start
# 构建项目
mvn clean package
# 启动应用
java -jar target/kg-qa-system-1.0.0.jar
前端界面: http://localhost:8082/kg-qa
API文档: http://localhost:8082/kg-qa/swagger-ui.html
健康检查: http://localhost:8082/kg-qa/actuator/health
Neo4j浏览器: http://localhost:7474 (用户名: neo4j, 密码: password)
上传示例数据
bash
# 创建示例文本文件
echo "阿里巴巴由马云在1999年创立。公司总部位于杭州市。马云是阿里巴巴的主要创始人。" > example.txt
# 上传并抽取信息
curl -X POST "http://localhost:8082/kg-qa/api/v1/extraction/upload-file" \
-F "file=@example.txt" \
-F "domain=default" \
-F "category=company"
测试问答功能
bash
curl -X POST "http://localhost:8082/kg-qa/api/v1/qa/answer" \
-H "Content-Type: application/json" \
-d '{
"question": "阿里巴巴的创始人是谁?",
"domain": "default"
}'
信息抽取
bash
curl -X POST "http://localhost:8082/kg-qa/api/v1/extraction/extract-text" \
-H "Content-Type: application/json" \
-d '{
"text": "清华大学位于北京市海淀区。清华大学是中国著名的高等学府。",
"domain": "education"
}'
知识图谱问答
bash
curl -X POST "http://localhost:8082/kg-qa/api/v1/qa/answer" \
-H "Content-Type: application/json" \
-d '{
"question": "清华大学在哪里?",
"domain": "education"
}'
执行Cypher查询
bash
curl -X POST "http://localhost:8082/kg-qa/api/v1/qa/execute-cypher" \
-H "Content-Type: application/json" \
-d '{
"query": "MATCH (e:EntityNode) WHERE e.domain = \"education\" RETURN e.name, e.type LIMIT 5"
}'