目录
Python实例题
题目
基于知识图谱的智能问答系统(NLP、图数据库)
问题描述
开发一个基于知识图谱的智能问答系统,包含以下功能:
- 知识图谱构建:从结构化和非结构化数据中构建知识图谱
- 实体识别与关系抽取:从文本中提取实体和关系
- 问题理解:解析用户问题,识别意图和关键实体
- 知识推理:基于知识图谱进行推理和查询
- 自然语言生成:生成自然语言回答
解题思路
- 使用图数据库(Neo4j 或 JanusGraph)存储知识图谱
- 应用 NLP 技术(BERT、GPT 等)处理自然语言
- 设计实体识别和关系抽取模型
- 开发查询转换模块将自然语言问题转换为图查询
- 实现答案生成和排序机制
关键代码框架
python
# 知识图谱构建模块
import pandas as pd
import numpy as np
from py2neo import Graph, Node, Relationship
import spacy
from transformers import pipeline
import re
class KnowledgeGraphBuilder:
    """Build a knowledge graph in a graph database from CSV data and free text.

    CSV rows are converted directly into nodes and relationships. Free
    text is passed through an NER pipeline and a text-classification
    pipeline, and the resulting entities and relations are merged into
    the graph.
    """

    def __init__(self, db_uri, username, password):
        """Connect to the graph database and load the NLP models.

        Args:
            db_uri: Connection URI handed to ``Graph``.
            username: Database user name.
            password: Database password.
        """
        # Graph database connection.
        self.graph = Graph(db_uri, auth=(username, password))
        # spaCy is used for sentence splitting; the two transformers
        # pipelines are used for NER and relation classification.
        self.nlp = spacy.load("en_core_web_sm")
        self.ner = pipeline("ner", model="dslim/bert-base-NER")
        # NOTE(review): "distilbert-base-uncased" is presumably a base
        # checkpoint rather than a trained relation classifier - confirm
        # that its labels are meaningful relation types.
        self.relation_extractor = pipeline("text-classification", model="distilbert-base-uncased")

    def build_from_csv(self, csv_path, entity_columns, relationship_columns):
        """Build nodes and relationships from a CSV file.

        Args:
            csv_path: Path of the CSV file to read.
            entity_columns: Mapping ``entity_type -> [id_column, name_column]``.
            relationship_columns: Mapping
                ``rel_type -> (source_entity_type, target_entity_type,
                [source_id_column, target_id_column])``.

        Raises:
            KeyError: If a relationship refers to an entity type that is
                not a key of ``entity_columns``.
        """
        df = pd.read_csv(csv_path)
        # Create the entity nodes first so relationships can refer to them.
        entities = {}
        for entity_type, columns in entity_columns.items():
            entities[entity_type] = self._create_entities(df, entity_type, columns)
        # Create the relationships between the nodes created above.
        for rel_type, (source_entity, target_entity, rel_columns) in relationship_columns.items():
            self._create_relationships(
                df, rel_type,
                entities[source_entity], entities[target_entity],
                rel_columns
            )

    def _create_entities(self, df, entity_type, columns):
        """Create one node per distinct entity id found in ``df``.

        Args:
            df: DataFrame holding the rows to scan.
            entity_type: Label given to every created node.
            columns: ``[id_column]`` or ``[id_column, name_column]``. When
                no name column is given (or it is absent from the row) the
                id doubles as the name.

        Returns:
            dict mapping each entity id to the node created for it.
        """
        id_column = columns[0]
        # Tolerate an id-only column list instead of raising IndexError.
        name_column = columns[1] if len(columns) > 1 else None
        entities = {}
        for _, row in df.iterrows():
            entity_id = row[id_column]
            # A missing id would otherwise become a node with id "nan".
            if pd.isna(entity_id) or entity_id in entities:
                continue
            if name_column is None:
                entity_name = entity_id
            else:
                entity_name = row.get(name_column, entity_id)
            node = Node(entity_type, id=str(entity_id), name=str(entity_name))
            self.graph.create(node)
            entities[entity_id] = node
        return entities

    def _create_relationships(self, df, rel_type, source_entities, target_entities, columns):
        """Create a relationship for every row whose two ids are both known.

        Args:
            df: DataFrame holding the rows to scan.
            rel_type: Relationship type to create.
            source_entities: Mapping of source id to node.
            target_entities: Mapping of target id to node.
            columns: ``[source_id_column, target_id_column]``.
        """
        for _, row in df.iterrows():
            source_id = row[columns[0]]
            target_id = row[columns[1]]
            # Rows that reference an unknown id on either side are skipped.
            if source_id in source_entities and target_id in target_entities:
                rel = Relationship(source_entities[source_id], rel_type, target_entities[target_id])
                self.graph.create(rel)

    def extract_entities_and_relations(self, text):
        """Extract entities and relations from free text.

        Args:
            text: The text to analyse.

        Returns:
            Tuple ``(entities, relations)``: the NER pipeline output for
            the whole text, and a list with at most one relation dict per
            sentence.
        """
        # Entity recognition over the whole text.
        entities = self.ner(text)
        # Relation extraction sentence by sentence.
        doc = self.nlp(text)
        sentences = [sent.text for sent in doc.sents]
        relations = []
        for sentence in sentences:
            rel = self._extract_relation(sentence, entities)
            if rel:
                relations.append(rel)
        return entities, relations

    def _extract_relation(self, sentence, entities):
        """Extract at most one relation from a single sentence.

        This is a simplified implementation: the first pair of entities
        with different entity labels is classified and returned.

        Args:
            sentence: Text of the sentence.
            entities: Entity dicts with at least ``'entity'`` and
                ``'word'`` keys; may cover more text than this sentence.

        Returns:
            A dict with ``source``, ``target``, ``type`` and ``score``
            keys, or ``None`` when no suitable pair is mentioned.
        """
        # ``entities`` comes from the whole text, so keep only the ones
        # that actually occur in this sentence; otherwise every sentence
        # would yield the same first pair of entities.
        mentioned = [ent for ent in entities if ent['word'] and ent['word'] in sentence]
        for ent1 in mentioned:
            for ent2 in mentioned:
                if ent1['entity'] != ent2['entity']:
                    # Ask the classification model for the relation label.
                    relation = self.relation_extractor(
                        f"{ent1['word']} [SEP] {sentence} [SEP] {ent2['word']}"
                    )
                    return {
                        'source': ent1['word'],
                        'target': ent2['word'],
                        'type': relation[0]['label'],
                        'score': relation[0]['score']
                    }
        return None

    def add_text_to_knowledge_graph(self, text):
        """Extract entities and relations from ``text`` and merge them into the graph.

        Args:
            text: The text to add.
        """
        entities, relations = self.extract_entities_and_relations(text)
        # Merge one node per entity, keyed on its label and name.
        entity_nodes = {}
        for ent in entities:
            node = Node(ent['entity'], name=ent['word'])
            self.graph.merge(node, ent['entity'], 'name')
            entity_nodes[ent['word']] = node
        # Merge the relations, storing the classifier score as confidence.
        for rel in relations:
            if rel['source'] in entity_nodes and rel['target'] in entity_nodes:
                rel_obj = Relationship(
                    entity_nodes[rel['source']],
                    rel['type'],
                    entity_nodes[rel['target']],
                    confidence=rel['score']
                )
                self.graph.merge(rel_obj)
python
# 问题理解与回答模块
from transformers import AutoTokenizer, AutoModelForQuestionAnswering
import torch
from py2neo import Graph
class QuestionAnsweringSystem:
    """Answer natural-language questions by querying the knowledge graph."""

    # Result keys under which the generated queries return the matched node.
    _PRIMARY_NODE_KEYS = ('n', 'p', 'l')

    # Question words (matched as whole words) and the type they indicate,
    # in priority order.
    _QUESTION_KEYWORDS = (
        (('who', 'whom', 'whose'), "person"),
        (('where',), "location"),
        (('when',), "time"),
        (('what',), "thing"),
        (('how',), "method"),
    )

    def __init__(self, db_uri, username, password, qa_model="bert-large-uncased-whole-word-masking-finetuned-squad"):
        """Connect to the graph database and load the question-answering model.

        Args:
            db_uri: Connection URI handed to ``Graph``.
            username: Database user name.
            password: Database password.
            qa_model: Name of the pretrained model and tokenizer to load.
        """
        # Graph database connection.
        self.graph = Graph(db_uri, auth=(username, password))
        # Question-answering model and its tokenizer.
        self.tokenizer = AutoTokenizer.from_pretrained(qa_model)
        self.model = AutoModelForQuestionAnswering.from_pretrained(qa_model)

    def answer_question(self, question):
        """Answer a user question.

        Args:
            question: The question text.

        Returns:
            A natural-language answer string.
        """
        # 1. Work out the question type and the key entity.
        entities, question_type = self._understand_question(question)
        if not entities:
            # An empty entity would turn the CONTAINS filter into a
            # match-everything query, so give the fallback answer instead.
            return self._generate_answer(question, [])
        # 2. Build the graph query.
        query = self._generate_query(entities, question_type)
        # 3. Run it.
        results = self._execute_query(query)
        # 4. Turn the rows into a natural-language answer.
        answer = self._generate_answer(question, results)
        return answer

    def _understand_question(self, question):
        """Return ``(entities, question_type)`` for a question."""
        entities = self._extract_entities(question)
        # Simplified keyword-based question typing.
        question_type = self._classify_question_type(question)
        return entities, question_type

    def _extract_entities(self, question):
        """Extract the key entity text from a question.

        The question is fed to the question-answering model and the
        highest-scoring start/end span is decoded.

        Args:
            question: The question text.

        Returns:
            The decoded span as a string; empty when the predicted end
            precedes the predicted start.
        """
        inputs = self.tokenizer(question, return_tensors="pt")
        with torch.no_grad():
            outputs = self.model(**inputs)
        answer_start_index = int(outputs.start_logits.argmax())
        answer_end_index = int(outputs.end_logits.argmax())
        predict_answer_tokens = inputs.input_ids[0, answer_start_index : answer_end_index + 1]
        # Drop markers such as [CLS]/[SEP]; they can never match a node name.
        entities = self.tokenizer.decode(predict_answer_tokens, skip_special_tokens=True)
        return entities.strip()

    def _classify_question_type(self, question):
        """Classify a question by its question word.

        Keywords are matched as whole words, so "whole" is not read as
        "who" and "show" is not read as "how".

        Args:
            question: The question text.

        Returns:
            One of ``"person"``, ``"location"``, ``"time"``, ``"thing"``,
            ``"method"`` or ``"general"``.
        """
        words = set(re.findall(r"[a-z]+", question.lower()))
        for keywords, question_type in self._QUESTION_KEYWORDS:
            if any(keyword in words for keyword in keywords):
                return question_type
        return "general"

    @staticmethod
    def _escape_cypher_string(value):
        """Escape ``value`` for use inside a single-quoted Cypher string literal."""
        # Backslashes first, so the escapes added for quotes are not doubled.
        return str(value).replace("\\", "\\\\").replace("'", "\\'")

    def _generate_query(self, entities, question_type):
        """Build the Cypher query for an entity and question type.

        Args:
            entities: Entity text to search node names for.
            question_type: Type returned by ``_classify_question_type``.

        Returns:
            The Cypher query string.
        """
        # The entity text comes from user input, so escape it before
        # embedding it in the query.
        name = self._escape_cypher_string(entities)
        if question_type == "person":
            return f"MATCH (p:Person)-[r]-(o) WHERE p.name CONTAINS '{name}' RETURN p, r, o LIMIT 5"
        elif question_type == "location":
            return f"MATCH (l:Location)-[r]-(o) WHERE l.name CONTAINS '{name}' RETURN l, r, o LIMIT 5"
        else:
            # Generic query over any node label.
            return f"MATCH (n)-[r]-(m) WHERE n.name CONTAINS '{name}' RETURN n, r, m LIMIT 5"

    def _execute_query(self, query):
        """Run a graph query.

        Args:
            query: Cypher query string.

        Returns:
            List of result rows; empty when the query fails.
        """
        try:
            results = self.graph.run(query).data()
            return results
        except Exception as e:
            # Best effort: report the failure and answer with no results.
            print(f"查询执行错误: {e}")
            return []

    def _primary_node(self, result):
        """Return the matched node of a result row, or ``None`` if absent."""
        for key in self._PRIMARY_NODE_KEYS:
            if key in result:
                return result[key]
        return None

    def _generate_answer(self, question, results):
        """Turn query results into a natural-language answer.

        Args:
            question: The original question text.
            results: Result rows as returned by ``_execute_query``.

        Returns:
            The answer string, or a fixed apology when there is nothing
            to report.
        """
        if not results:
            return "抱歉,我无法回答这个问题。"
        # Person and location queries return the matched node under 'p' or
        # 'l' rather than 'n', so look the node up under whichever key is
        # present.
        nodes = [self._primary_node(result) for result in results]
        nodes = [node for node in nodes if node is not None]
        if not nodes:
            return "抱歉,我无法回答这个问题。"
        if len(nodes) == 1:
            # Single result: describe the node.
            node = nodes[0]
            return f"{question} {node['name']} 是 {node.get('description', '一个实体')}。"
        # Several results: list the node names.
        answers = [node['name'] for node in nodes]
        return f"{question} 相关的实体有: {', '.join(answers)}。"
python
# 主应用
class KnowledgeGraphQASystem:
    """Main application: builds the knowledge graph and answers questions."""

    def __init__(self, config):
        """Create the builder and the question-answering component.

        Args:
            config: Mapping with ``'db_uri'``, ``'db_username'`` and
                ``'db_password'`` keys; the same values are used for both
                components.
        """
        self.builder = KnowledgeGraphBuilder(
            config['db_uri'],
            config['db_username'],
            config['db_password']
        )
        self.qa_system = QuestionAnsweringSystem(
            config['db_uri'],
            config['db_username'],
            config['db_password']
        )

    def build_knowledge_graph(self, data_sources):
        """Feed every data source into the knowledge graph builder.

        Args:
            data_sources: Iterable of source dicts. A ``'csv'`` source
                needs ``'path'``, ``'entity_columns'`` and
                ``'relationship_columns'``; a ``'text'`` source needs
                ``'path'``. Sources of any other type are ignored.
        """
        for source in data_sources:
            if source['type'] == 'csv':
                self.builder.build_from_csv(
                    source['path'],
                    source['entity_columns'],
                    source['relationship_columns']
                )
            elif source['type'] == 'text':
                # Read as UTF-8 explicitly so non-ASCII text is decoded the
                # same way regardless of the platform's default encoding.
                with open(source['path'], 'r', encoding='utf-8') as f:
                    text = f.read()
                self.builder.add_text_to_knowledge_graph(text)

    def answer_question(self, question):
        """Answer a user question.

        Args:
            question: The question text.

        Returns:
            The answer produced by the question-answering component.
        """
        return self.qa_system.answer_question(question)
# Example configuration and usage
if __name__ == "__main__":
    # Connection settings for the graph database.
    config = dict(
        db_uri='bolt://localhost:7687',
        db_username='neo4j',
        db_password='password',
    )

    # Initialise the system.
    system = KnowledgeGraphQASystem(config)

    # Structured source: entity types and relationships read from a CSV file.
    # NOTE(review): POSITION uses the 'position' column as the target id
    # for Company entities - confirm this is intended.
    csv_source = {
        'type': 'csv',
        'path': 'data/people.csv',
        'entity_columns': {
            'Person': ['id', 'name'],
            'Company': ['company_id', 'company_name'],
        },
        'relationship_columns': {
            'WORKS_AT': ('Person', 'Company', ['id', 'company_id']),
            'POSITION': ('Person', 'Company', ['id', 'position']),
        },
    }
    # Unstructured source: a plain text file.
    text_source = {
        'type': 'text',
        'path': 'data/knowledge.txt',
    }

    # Build the knowledge graph from both sources, CSV first.
    data_sources = [csv_source, text_source]
    system.build_knowledge_graph(data_sources)

    # Ask a question and print both the question and the answer.
    question = "谁是苹果公司的CEO?"
    answer = system.answer_question(question)
    print(f"问题: {question}")
    print(f"回答: {answer}")
难点分析
- 知识抽取准确性:从非结构化文本中准确提取实体和关系
- 知识图谱构建:处理数据不一致性和冗余问题
- 问题理解:准确解析自然语言问题的意图
- 推理能力:实现复杂的知识推理和逻辑推断
- 可扩展性:支持大规模知识图谱的高效查询
扩展方向
- 添加多语言支持
- 实现知识图谱的持续更新和学习
- 开发可视化界面展示知识图谱
- 集成深度学习模型提高问答准确性
- 添加对话管理功能实现多轮对话