Python实例题:基于知识图谱的智能问答系统(NLP、图数据库)

目录

Python实例题

题目

问题描述

解题思路

关键代码框架

难点分析

扩展方向

Python实例题

题目

基于知识图谱的智能问答系统(NLP、图数据库)

问题描述

开发一个基于知识图谱的智能问答系统,包含以下功能:

  • 知识图谱构建:从结构化和非结构化数据中构建知识图谱
  • 实体识别与关系抽取:从文本中提取实体和关系
  • 问题理解:解析用户问题,识别意图和关键实体
  • 知识推理:基于知识图谱进行推理和查询
  • 自然语言生成:生成自然语言回答

解题思路

  • 使用图数据库(Neo4j 或 JanusGraph)存储知识图谱
  • 应用 NLP 技术(BERT、GPT 等)处理自然语言
  • 设计实体识别和关系抽取模型
  • 开发查询转换模块将自然语言问题转换为图查询
  • 实现答案生成和排序机制

关键代码框架

Python 代码:
# 知识图谱构建模块
import pandas as pd
import numpy as np
from py2neo import Graph, Node, Relationship
import spacy
from transformers import pipeline
import re

class KnowledgeGraphBuilder:
    """Populate a Neo4j knowledge graph from CSV tables and free text.

    Entities are written as nodes and relations as relationships through the
    py2neo ``Graph`` opened in ``__init__``.
    """

    def __init__(self, db_uri, username, password):
        """Connect to the graph database and load the NLP models.

        Args:
            db_uri: Connection URI handed to ``py2neo.Graph``.
            username: Database user name.
            password: Database password.
        """
        # Connect to the graph database.
        self.graph = Graph(db_uri, auth=(username, password))
        # spaCy is only used to split text into sentences.
        self.nlp = spacy.load("en_core_web_sm")
        # Aggregate word pieces into whole entity spans; without aggregation
        # the pipeline reports one item per token (e.g. "##ple" tagged "I-ORG"),
        # which would end up as node names and labels.
        self.ner = pipeline(
            "ner",
            model="dslim/bert-base-NER",
            aggregation_strategy="simple",
        )
        # NOTE(review): "distilbert-base-uncased" looks like a base checkpoint
        # rather than a relation classifier, so the predicted labels are
        # presumably placeholders; swap in a fine-tuned model for real use.
        self.relation_extractor = pipeline("text-classification", model="distilbert-base-uncased")

    @staticmethod
    def _entity_label(ent):
        """Return the entity type of one NER result.

        Aggregated pipeline output stores the type under ``entity_group``;
        token-level output stores it under ``entity``. Both are accepted.
        """
        return ent.get('entity_group') or ent['entity']

    def build_from_csv(self, csv_path, entity_columns, relationship_columns):
        """Build graph nodes and relationships from one CSV file.

        Args:
            csv_path: Path of the CSV file to read.
            entity_columns: Maps an entity type to ``[id_column, name_column]``
                (the name column is optional).
            relationship_columns: Maps a relationship type to
                ``(source_entity_type, target_entity_type,
                [source_id_column, target_id_column])``.
        """
        df = pd.read_csv(csv_path)

        # Create the entities first so relationships can refer to them.
        entities = {
            entity_type: self._create_entities(df, entity_type, columns)
            for entity_type, columns in entity_columns.items()
        }

        # Create the relationships.
        for rel_type, (source_entity, target_entity, rel_columns) in relationship_columns.items():
            self._create_relationships(
                df, rel_type,
                entities[source_entity], entities[target_entity],
                rel_columns,
            )

    def _create_entities(self, df, entity_type, columns):
        """Create one node per distinct id found in ``columns[0]``.

        Args:
            df: DataFrame holding the rows.
            entity_type: Node label.
            columns: ``[id_column]`` or ``[id_column, name_column]``.

        Returns:
            Dict mapping each entity id to its node.
        """
        id_column = columns[0]
        # The name column is optional; fall back to the id when it is absent.
        name_column = columns[1] if len(columns) > 1 else None
        entities = {}

        for _, row in df.iterrows():
            entity_id = row[id_column]
            # Rows without an id cannot identify a node.
            if pd.isna(entity_id) or entity_id in entities:
                continue

            entity_name = entity_id
            if name_column is not None:
                entity_name = row.get(name_column, entity_id)
                if pd.isna(entity_name):
                    entity_name = entity_id

            node = Node(entity_type, id=str(entity_id), name=str(entity_name))
            # Merge on (label, id), matching the text-ingestion path, so
            # rebuilding from the same CSV does not duplicate nodes.
            self.graph.merge(node, entity_type, 'id')
            entities[entity_id] = node

        return entities

    def _create_relationships(self, df, rel_type, source_entities, target_entities, columns):
        """Create a relationship for each row whose two ids are both known.

        Args:
            df: DataFrame holding the rows.
            rel_type: Relationship type.
            source_entities: Id-to-node dict for the start nodes.
            target_entities: Id-to-node dict for the end nodes.
            columns: ``[source_id_column, target_id_column]``.
        """
        source_column, target_column = columns[0], columns[1]
        for _, row in df.iterrows():
            source_node = source_entities.get(row[source_column])
            target_node = target_entities.get(row[target_column])
            # Rows pointing at an unknown entity are skipped.
            if source_node is None or target_node is None:
                continue
            self.graph.merge(Relationship(source_node, rel_type, target_node))

    def extract_entities_and_relations(self, text):
        """Extract entities and relations from free text.

        Returns:
            ``(entities, relations)``: the NER pipeline output for the whole
            text, and a list with at most one relation dict per sentence.
        """
        # Entity recognition over the whole text.
        entities = self.ner(text)

        # Relation extraction sentence by sentence.
        doc = self.nlp(text)
        relations = []
        for sent in doc.sents:
            rel = self._extract_relation(sent.text, entities)
            if rel:
                relations.append(rel)

        return entities, relations

    def _extract_relation(self, sentence, entities):
        """Classify the relation between the first entity pair in a sentence.

        This is a simplified implementation: only entities whose surface form
        occurs in ``sentence`` are considered, and only the first pair of
        distinct entities is classified.

        Returns:
            Dict with ``source``, ``target``, ``type`` and ``score``, or
            ``None`` when the sentence mentions fewer than two distinct
            entities.
        """
        # Restrict to entities that actually occur in this sentence; the
        # entity list comes from the whole text.
        mentioned = [
            ent for ent in entities
            if ent.get('word') and ent['word'] in sentence
        ]
        for ent1 in mentioned:
            for ent2 in mentioned:
                # Compare identity, not type: two people can be related.
                if ent1['word'] == ent2['word']:
                    continue
                relation = self.relation_extractor(
                    f"{ent1['word']} [SEP] {sentence} [SEP] {ent2['word']}"
                )
                return {
                    'source': ent1['word'],
                    'target': ent2['word'],
                    'type': relation[0]['label'],
                    'score': relation[0]['score'],
                }
        return None

    def add_text_to_knowledge_graph(self, text):
        """Extract entities and relations from ``text`` and merge them into the graph."""
        entities, relations = self.extract_entities_and_relations(text)

        # Merge one node per entity, keyed by (entity type, name).
        entity_nodes = {}
        for ent in entities:
            label = self._entity_label(ent)
            node = Node(label, name=ent['word'])
            self.graph.merge(node, label, 'name')
            entity_nodes[ent['word']] = node

        # Merge the relationships between the nodes created above.
        for rel in relations:
            source_node = entity_nodes.get(rel['source'])
            target_node = entity_nodes.get(rel['target'])
            if source_node is None or target_node is None:
                continue
            rel_obj = Relationship(
                source_node,
                rel['type'],
                target_node,
                confidence=rel['score'],
            )
            self.graph.merge(rel_obj)
Python 代码:
# 问题理解与回答模块
from transformers import AutoTokenizer, AutoModelForQuestionAnswering
import torch
from py2neo import Graph

class QuestionAnsweringSystem:
    """Answer natural-language questions by querying the knowledge graph.

    The flow is: pick a key phrase and a question type from the question,
    turn them into a Cypher query, run it, and format the rows as a sentence.
    """

    # Interrogative word -> question type, in priority order.
    _QUESTION_TYPES = (
        ("who", "person"),
        ("where", "location"),
        ("when", "time"),
        ("what", "thing"),
        ("how", "method"),
    )

    # Question types that restrict the matched node to a label.
    _NODE_LABELS = {"person": "Person", "location": "Location"}

    def __init__(self, db_uri, username, password, qa_model="bert-large-uncased-whole-word-masking-finetuned-squad"):
        """Connect to the graph database and load the question-answering model.

        Args:
            db_uri: Connection URI handed to ``py2neo.Graph``.
            username: Database user name.
            password: Database password.
            qa_model: Hugging Face model name for the tokenizer and model.
        """
        # Connect to the graph database.
        self.graph = Graph(db_uri, auth=(username, password))

        # Load the question-answering model.
        self.tokenizer = AutoTokenizer.from_pretrained(qa_model)
        self.model = AutoModelForQuestionAnswering.from_pretrained(qa_model)

    def answer_question(self, question):
        """Return a natural-language answer for ``question``."""
        # 1. Work out the question type and the key phrase.
        entities, question_type = self._understand_question(question)
        if not entities:
            # An empty phrase would make ``CONTAINS`` match every node.
            return self._generate_answer(question, [])

        # 2. Build the graph query (the phrase is passed as a parameter).
        query = self._generate_query(entities, question_type)

        # 3. Run it.
        results = self._execute_query(query, {"entity": entities})

        # 4. Turn the rows into a sentence.
        return self._generate_answer(question, results)

    def _understand_question(self, question):
        """Return ``(entities, question_type)`` for ``question``."""
        entities = self._extract_entities(question)
        # Simplified keyword-based typing.
        question_type = self._classify_question_type(question)
        return entities, question_type

    def _extract_entities(self, question):
        """Return the text span the loaded model selects inside ``question``.

        The model is a span-extraction question-answering model run on the
        question alone; the highest-scoring start/end span is decoded and used
        as the key phrase. Returns ``""`` when no valid span is found.
        """
        inputs = self.tokenizer(question, return_tensors="pt")
        with torch.no_grad():
            outputs = self.model(**inputs)

        answer_start_index = int(outputs.start_logits.argmax())
        answer_end_index = int(outputs.end_logits.argmax())
        # The two argmaxes are independent, so the end can precede the start.
        if answer_end_index < answer_start_index:
            return ""

        predict_answer_tokens = inputs.input_ids[0, answer_start_index : answer_end_index + 1]
        # Drop [CLS]/[SEP] so they never end up inside the graph query.
        return self.tokenizer.decode(predict_answer_tokens, skip_special_tokens=True).strip()

    def _classify_question_type(self, question):
        """Classify ``question`` by the first known interrogative word it contains.

        Matching is on whole words, so "whole" does not count as "who" and
        "show" does not count as "how". Returns ``"general"`` when no
        interrogative word is present.
        """
        words = set(re.findall(r"\w+", question.lower()))
        for keyword, question_type in self._QUESTION_TYPES:
            if keyword in words:
                return question_type
        return "general"

    def _generate_query(self, entities, question_type):
        """Return the Cypher query for ``question_type``.

        The key phrase is not interpolated into the query text; the query
        refers to it as the ``$entity`` parameter, which must be supplied when
        the query is run. Every query returns the aliases ``n`` (matched
        node), ``r`` (relationship) and ``m`` (neighbour), which is what
        ``_generate_answer`` reads.

        Args:
            entities: Key phrase; kept for interface compatibility.
            question_type: Value produced by ``_classify_question_type``.
        """
        label = self._NODE_LABELS.get(question_type)
        node_pattern = f"(n:{label})" if label else "(n)"
        return f"MATCH {node_pattern}-[r]-(m) WHERE n.name CONTAINS $entity RETURN n, r, m LIMIT 5"

    def _execute_query(self, query, parameters=None):
        """Run ``query`` and return its rows as a list of dicts.

        Args:
            query: Cypher text.
            parameters: Optional dict of query parameters.

        Returns:
            The result rows, or ``[]`` when the query fails (the error is
            printed, not raised).
        """
        try:
            return self.graph.run(query, parameters or {}).data()
        except Exception as e:
            # Best effort: a failed query becomes "no answer" for the caller.
            print(f"查询执行错误: {e}")
            return []

    def _generate_answer(self, question, results):
        """Format query rows as a natural-language answer.

        Args:
            question: The original question, echoed in the answer.
            results: Rows with an ``n`` entry holding the matched node.
        """
        if not results:
            return "抱歉,我无法回答这个问题。"

        if len(results) == 1:
            # Single row: describe the matched node.
            node = results[0]['n']
            return f"{question} {node['name']} 是 {node.get('description', '一个实体')}。"

        # Several rows: list each matched name once, in first-seen order.
        answers = list(dict.fromkeys(str(result['n']['name']) for result in results))
        return f"{question} 相关的实体有: {', '.join(answers)}。"
Python 代码:
# 主应用
class KnowledgeGraphQASystem:
    """Facade that wires the graph builder and the question-answering system together."""

    def __init__(self, config):
        """Create the builder and the QA system from one shared configuration.

        Args:
            config: Dict with the keys ``db_uri``, ``db_username`` and
                ``db_password``.
        """
        self.builder = KnowledgeGraphBuilder(
            config['db_uri'],
            config['db_username'],
            config['db_password'],
        )

        self.qa_system = QuestionAnsweringSystem(
            config['db_uri'],
            config['db_username'],
            config['db_password'],
        )

    def build_knowledge_graph(self, data_sources):
        """Load every data source into the knowledge graph.

        Args:
            data_sources: Iterable of dicts. A ``'csv'`` source carries
                ``path``, ``entity_columns`` and ``relationship_columns``; a
                ``'text'`` source carries ``path``. Sources of any other type
                are ignored.
        """
        for source in data_sources:
            source_type = source['type']
            if source_type == 'csv':
                self.builder.build_from_csv(
                    source['path'],
                    source['entity_columns'],
                    source['relationship_columns'],
                )
            elif source_type == 'text':
                # Decode explicitly as UTF-8 so the result does not depend on
                # the platform's default encoding.
                with open(source['path'], 'r', encoding='utf-8') as f:
                    text = f.read()
                # The file is closed before the slower graph ingestion starts.
                self.builder.add_text_to_knowledge_graph(text)

    def answer_question(self, question):
        """Return the QA system's answer to ``question``."""
        return self.qa_system.answer_question(question)

# 示例配置和使用
def main():
    """Build the example knowledge graph and answer one sample question."""
    import os

    # Connection settings; the environment can override the local defaults so
    # real credentials need not be written into the source.
    config = {
        'db_uri': os.environ.get('NEO4J_URI', 'bolt://localhost:7687'),
        'db_username': os.environ.get('NEO4J_USERNAME', 'neo4j'),
        'db_password': os.environ.get('NEO4J_PASSWORD', 'password'),
    }

    # Initialise the system.
    system = KnowledgeGraphQASystem(config)

    # Describe the data sources for the knowledge graph.
    data_sources = [
        {
            'type': 'csv',
            'path': 'data/people.csv',
            'entity_columns': {
                'Person': ['id', 'name'],
                'Company': ['company_id', 'company_name'],
            },
            'relationship_columns': {
                'WORKS_AT': ('Person', 'Company', ['id', 'company_id']),
                # NOTE(review): the target column here is 'position', but
                # targets are looked up among Company ids, so this mapping
                # presumably never produces a relationship -- confirm the
                # intended schema.
                'POSITION': ('Person', 'Company', ['id', 'position']),
            },
        },
        {
            'type': 'text',
            'path': 'data/knowledge.txt',
        },
    ]

    system.build_knowledge_graph(data_sources)

    # Ask a question. It is phrased in English because the question
    # classifier keys off English interrogatives ("who", "where", ...) and
    # the loaded models are English ones.
    question = "Who is the CEO of Apple?"
    answer = system.answer_question(question)
    print(f"问题: {question}")
    print(f"回答: {answer}")


# Example configuration and usage.
if __name__ == "__main__":
    main()

难点分析

  • 知识抽取准确性:从非结构化文本中准确提取实体和关系
  • 知识图谱构建:处理数据不一致性和冗余问题
  • 问题理解:准确解析自然语言问题的意图
  • 推理能力:实现复杂的知识推理和逻辑推断
  • 可扩展性:支持大规模知识图谱的高效查询

扩展方向

  • 添加多语言支持
  • 实现知识图谱的持续更新和学习
  • 开发可视化界面展示知识图谱
  • 集成深度学习模型提高问答准确性
  • 添加对话管理功能实现多轮对话