LLM之RAG实战(五十一)| 使用python和Cypher解析PDF数据,并加载到Neo4j数据库

一、必备条件:

  • python语言
  • Neo4j数据库
  • python库:neo4j、llmsherpa、glob、dotenv

二、代码:

复制代码
from llmsherpa.readers import LayoutPDFReaderfrom neo4j import GraphDatabaseimport uuidimport hashlibimport osimport globfrom datetime import datetimeimport timefrom dotenv import load_dotenv​# Load environment variablespath = "/home/QA/Neo4j_Stage1/.env"load_dotenv(path)​# Neo4j configurationNEO4J_URL = os.environ["NEO4J_URI"]NEO4J_USER = "neo4j"NEO4J_PASSWORD = os.environ["NEO4J_PASSWORD"]NEO4J_DATABASE = "neo4j"​# File location for PDFsfile_location = '/home/QA/Neo4j_Stage1/PDFs'​# Initialize Neo4jdef initialiseNeo4j():    cypher_schema = [        "CREATE CONSTRAINT sectionKey IF NOT EXISTS FOR (c:Section) REQUIRE (c.key) IS UNIQUE;",        "CREATE CONSTRAINT chunkKey IF NOT EXISTS FOR (c:Chunk) REQUIRE (c.key) IS UNIQUE;",        "CREATE CONSTRAINT documentKey IF NOT EXISTS FOR (c:Document) REQUIRE (c.url_hash) IS UNIQUE;",        "CREATE CONSTRAINT tableKey IF NOT EXISTS FOR (c:Table) REQUIRE (c.key) IS UNIQUE;",        "CALL db.index.vector.createNodeIndex('chunkVectorIndex', 'Embedding', 'value', 1536, 'COSINE');"    ]​    driver = GraphDatabase.driver(NEO4J_URL, database=NEO4J_DATABASE, auth=(NEO4J_USER, NEO4J_PASSWORD))    with driver.session() as session:        for cypher in cypher_schema:            session.run(cypher)    driver.close()​# Ingest document into Neo4jdef ingestDocumentNeo4j(doc, doc_location):    cypher_pool = [        "MERGE (d:Document {name: $doc_name_val}) ON CREATE SET d.url = $doc_url_val RETURN d;",        "MERGE (p:Section {key: $doc_name_val+'|'+$block_idx_val+'|'+$title_hash_val}) ON CREATE SET p.page_idx = $page_idx_val, p.title_hash = $title_hash_val, p.block_idx = $block_idx_val, p.title = $title_val, p.tag = $tag_val, p.level = $level_val RETURN p;",        "MATCH (d:Document {name: $doc_name_val}) MATCH (s:Section {key: $doc_name_val+'|'+$block_idx_val+'|'+$title_hash_val}) MERGE (d)<-[:HAS_DOCUMENT]-(s);",        "MATCH (s1:Section {key: $doc_name_val+'|'+$parent_block_idx_val+'|'+$parent_title_hash_val}) MATCH (s2:Section {key: $doc_name_val+'|'+$block_idx_val+'|'+$title_hash_val}) MERGE (s1)<-[:UNDER_SECTION]-(s2);",        "MERGE (c:Chunk {key: $doc_name_val+'|'+$block_idx_val+'|'+$sentences_hash_val}) ON CREATE SET c.sentences = $sentences_val, c.sentences_hash = $sentences_hash_val, c.block_idx = $block_idx_val, c.page_idx = $page_idx_val, c.tag = $tag_val, c.level = $level_val RETURN c;",        "MATCH (c:Chunk {key: $doc_name_val+'|'+$block_idx_val+'|'+$sentences_hash_val}) MATCH (s:Section {key:$doc_name_val+'|'+$parent_block_idx_val+'|'+$parent_hash_val}) MERGE (s)<-[:HAS_PARENT]-(c);",        "MERGE (t:Table {key: $doc_name_val+'|'+$block_idx_val+'|'+$name_val}) ON CREATE SET t.name = $name_val, t.doc_name = $doc_name_val, t.block_idx = $block_idx_val, t.page_idx = $page_idx_val, t.html = $html_val, t.rows = $rows_val RETURN t;",        "MATCH (t:Table {key: $doc_name_val+'|'+$block_idx_val+'|'+$name_val}) MATCH (s:Section {key: $doc_name_val+'|'+$parent_block_idx_val+'|'+$parent_hash_val}) MERGE (s)<-[:HAS_PARENT]-(t);",        "MATCH (t:Table {key: $doc_name_val+'|'+$block_idx_val+'|'+$name_val}) MATCH (s:Document {name: $doc_name_val}) MERGE (s)<-[:HAS_PARENT]-(t);"    ]​    driver = GraphDatabase.driver(NEO4J_URL, database=NEO4J_DATABASE, auth=(NEO4J_USER, NEO4J_PASSWORD))    with driver.session() as session:        doc_name_val = os.path.basename(doc_location)        doc_url_val = doc_location        cypher = cypher_pool[0]        session.run(cypher, doc_name_val=doc_name_val, doc_url_val=doc_url_val)​        for sec in doc.sections():            sec_title_val = sec.title            sec_title_hash_val = hashlib.md5(sec_title_val.encode("utf-8")).hexdigest()            sec_tag_val = sec.tag            sec_level_val = sec.level            sec_page_idx_val = sec.page_idx            sec_block_idx_val = sec.block_idx​            if sec_tag_val != 'table':                cypher = cypher_pool[1]                session.run(cypher, page_idx_val=sec_page_idx_val, title_hash_val=sec_title_hash_val, title_val=sec_title_val, tag_val=sec_tag_val, level_val=sec_level_val, block_idx_val=sec_block_idx_val, doc_name_val=doc_name_val)​                sec_parent_val = str(sec.parent.to_text())                if sec_parent_val == "None":                    cypher = cypher_pool[2]                    session.run(cypher, page_idx_val=sec_page_idx_val, title_hash_val=sec_title_hash_val, doc_name_val=doc_name_val, block_idx_val=sec_block_idx_val)                else:                    sec_parent_title_hash_val = hashlib.md5(sec_parent_val.encode("utf-8")).hexdigest()                    sec_parent_page_idx_val = sec.parent.page_idx                    sec_parent_block_idx_val = sec.parent.block_idx                    cypher = cypher_pool[3]                    session.run(cypher, page_idx_val=sec_page_idx_val, title_hash_val=sec_title_hash_val, block_idx_val=sec_block_idx_val, parent_page_idx_val=sec_parent_page_idx_val, parent_title_hash_val=sec_parent_title_hash_val, parent_block_idx_val=sec_parent_block_idx_val, doc_name_val=doc_name_val)​        for chk in doc.chunks():            chunk_block_idx_val = chk.block_idx            chunk_page_idx_val = chk.page_idx            chunk_tag_val = chk.tag            chunk_level_val = chk.level            chunk_sentences = "\n".join(chk.sentences)​            if chunk_tag_val != 'table':                chunk_sentences_hash_val = hashlib.md5(chunk_sentences.encode("utf-8")).hexdigest()                cypher = cypher_pool[4]                session.run(cypher, sentences_hash_val=chunk_sentences_hash_val, sentences_val=chunk_sentences, block_idx_val=chunk_block_idx_val, page_idx_val=chunk_page_idx_val, tag_val=chunk_tag_val, level_val=chunk_level_val, doc_name_val=doc_name_val)​                chk_parent_val = str(chk.parent.to_text())                if chk_parent_val != "None":                    chk_parent_hash_val = hashlib.md5(chk_parent_val.encode("utf-8")).hexdigest()                    chk_parent_page_idx_val = chk.parent.page_idx                    chk_parent_block_idx_val = chk.parent.block_idx                    cypher = cypher_pool[5]                    session.run(cypher, sentences_hash_val=chunk_sentences_hash_val, block_idx_val=chunk_block_idx_val, parent_hash_val=chk_parent_hash_val, parent_block_idx_val=chk_parent_block_idx_val, doc_name_val=doc_name_val)​        for tb in doc.tables():            page_idx_val = tb.page_idx            block_idx_val = tb.block_idx            name_val = 'block#' + str(block_idx_val) + '_' + tb.name            html_val = tb.to_html()            rows_val = len(tb.rows)            cypher = cypher_pool[6]            session.run(cypher, block_idx_val=block_idx_val, page_idx_val=page_idx_val, name_val=name_val, html_val=html_val, rows_val=rows_val, doc_name_val=doc_name_val)​            table_parent_val = str(tb.parent.to_text())            if table_parent_val != "None":                table_parent_hash_val = hashlib.md5(table_parent_val.encode("utf-8")).hexdigest()                table_parent_page_idx_val = tb.parent.page_idx                table_parent_block_idx_val = tb.parent.block_idx                cypher = cypher_pool[7]                session.run(cypher, name_val=name_val, block_idx_val=block_idx_val, parent_page_idx_val=table_parent_page_idx_val, parent_hash_val=table_parent_hash_val, parent_block_idx_val=table_parent_block_idx_val, doc_name_val=doc_name_val)            else:                cypher = cypher_pool[8]                session.run(cypher, name_val=name_val, block_idx_val=block_idx_val, doc_name_val=doc_name_val)​        print(f'\'{doc_name_val}\' Done! Summary: ')        print('#Sections: ' + str(len(doc.sections())))        print('#Chunks: ' + str(len(doc.chunks())))        print('#Tables: ' + str(len(doc.tables())))​    driver.close()​# Parse PDFs and ingest into Neo4jdef parseAndIngestPDFs():    pdf_files = glob.glob(file_location + '/*.pdf')    print(f'#PDF files found: {len(pdf_files)}!')​    pdf_reader = LayoutPDFReader("https://readers.llmsherpa.com/api/document/developer/parseDocument?renderFormat=all")​    startTime = datetime.now()​    for pdf_file in pdf_files:        doc = pdf_reader.read_pdf(pdf_file)        ingestDocumentNeo4j(doc, pdf_file)​    print(f'Total time: {datetime.now() - startTime}')​# Initialize Neo4jinitialiseNeo4j()​# Parse PDFs and ingest into Neo4jparseAndIngestPDFs()

三、代码解释

3.1 设置

  • 导入Neo4j环境变量
  • 设置Neo4j唯一key

3.2 初始化Neo4j

  • 建立与 Neo4j 的连接并创建必要的约束以确保数据完整性。

3.3 提取文档内容

  • 抽取PDFsection、块和表格数据
  • 使用 Cypher 查询在 Neo4j 图形中创建和链接节点

3.4 解析PDF内容

  1. 查找指定目录中的所有 PDF 文件;

  2. 使用 LayoutPDFReader 解析每个 PDF;

  3. 将解析后的数据加入到Neo4j数据库中;

相关推荐
ZTLJQ7 小时前
序列化的艺术:Python JSON处理完全解析
开发语言·python·json
l1t7 小时前
DeepSeek总结的 pg_regresql插件:真正可移植的 PostgreSQL 统计信息
数据库·postgresql
oradh7 小时前
Oracle 11.2.0.1版本升级至11.2.0.4_单机环境
数据库·oracle·oracle11g·oracle升级
l1t7 小时前
用docker安装测试crate数据库
数据库·docker·容器·cratedb
H5css�海秀8 小时前
今天是自学大模型的第一天(sanjose)
后端·python·node.js·php
anzhxu8 小时前
QT数据库(三):QSqlQuery使用
数据库·qt·oracle
身如柳絮随风扬8 小时前
MySQL核心知识
数据库·mysql
德彪稳坐倒骑驴8 小时前
Oracle 11g安装
数据库·oracle
韩立学长8 小时前
Springboot校园跑腿业务系统0b7amk02(程序、源码、数据库、调试部署方案及开发环境)系统界面展示及获取方式置于文档末尾,可供参考。
数据库·spring boot·后端
阿贵---8 小时前
使用XGBoost赢得Kaggle比赛
jvm·数据库·python