neo4j 图表数据导入到 TuGraph
前言:近期在引入阿里的 TuGraph 图数据库,需要将 原 neo4j 数据导入到新的 tugraph 数据库中。预期走csv文件导入导出,但因为格式和数据库设计问题,操作起来比较麻烦(可能是个人没有发现其他比较方便的办法),因此写了一个 python 脚本进行数据导入操作。
使用:python3,TuGraph 4.5.1
遇到的问题:tugraph 的节点需要一个主键,这个只能自行指定。
支持:指定节点,指定边。自动创建不存在的节点/边,数据导入批量导入节点,单条导入边(试过批量的,tugraph好像不支持 官网的 CALL db.upsertEdge 我的版本也还没实现)。
导入图示:
代码文件
python
# import time
import json
from typing import Dict, List, cast
class GraphConnector():
db_type: str = "tugraph"
driver: str = "bolt"
dialect: str = "cypher"
batch_size: int = 100
# 指定节点的主键
node_pro_key: dict = dict({'Ren':'zjhm','Aj':'ajbh','Che':'rowkey','Hh':'rowkey','Sj':'dhhm'})
#指定需要导入的边
specified_relation = ['ajgx','th','tfj','sysj','sycl']
#指定需要导入的节点
specified_node = ['Ren','Aj','Che','Sj','Hh']
def __init__(self, driver, graph):
"""Initialize the connector with a Neo4j driver."""
self._driver = driver
self._schema = None
self._graph = graph
self._session = None
@classmethod
def from_uri_db(
cls, host: str, port: int, user: str, pwd: str, db_name: str, db_type: str
) -> "GraphConnector":
"""Create a new TuGraphConnector from host, port, user, pwd, db_name."""
try:
from neo4j import GraphDatabase
cls.db_type = db_type
db_url = f"{cls.driver}://{host}:{str(port)}"
driver = GraphDatabase.driver(db_url, auth=(user, pwd))
driver.verify_connectivity()
return cast(GraphConnector, cls(driver=driver, graph=db_name))
except ImportError as err:
raise ImportError(
"neo4j package is not installed, please install it with "
"`pip install neo4j`"
) from err
def create_graph_new(self, graph_name: str) -> bool:
"""Create a new graph in the database if it doesn't already exist."""
try:
with self._driver.session() as session:
graph_list = session.run("CALL dbms.graph.listGraphs()").data()
exists = any(item["graph_name"] == graph_name for item in graph_list)
if not exists:
session.run(
f"CALL dbms.graph.createGraph('{graph_name}', '', 2048)"
)
except Exception as e:
raise Exception(f"Failed to create graph '{graph_name}': {str(e)}") from e
return not exists
def create_vertex_labels(self, json_data):
try:
with self._driver.session(database=self._graph) as session:
# graph_list = session.run(f"CALL db.createVertexLabelByJson({json_data})").data()
session.run(
"CALL db.createVertexLabelByJson($json_data)",
json_data=json_data
)
except Exception as e:
raise Exception(f"Failed to create vertex_labels ") from e
# 批量更新节点,没有就新增,有就更新
def batch_update_node(self, json_data):
try:
with self._driver.session(database=self._graph) as session:
# graph_list = session.run(f"CALL db.createVertexLabelByJson({json_data})").data()
session.upsertVertex(
"CALL db.upsertVertex($json_data)",
json_data=json_data
)
except Exception as e:
raise Exception(f"Failed to create vertex_labels ") from e
# 批量更新关系,没有就新增,有就更新
def batch_update_edge(self, json_data):
try:
with self._driver.session(database=self._graph) as session:
# graph_list = session.run(f"CALL db.createVertexLabelByJson({json_data})").data()
session.upsertVertex(
"CALL db.upsertEdge($json_data)",
json_data=json_data
)
except Exception as e:
raise Exception(f"Failed to create vertex_labels ") from e
def create_edge_labels(self, json_data):
try:
with self._driver.session(database=self._graph) as session:
# graph_list = session.run(f"CALL db.createVertexLabelByJson({json_data})").data()
session.run(
"CALL db.createEdgeLabelByJson($json_data)",
json_data=json_data
)
except Exception as e:
raise Exception(f"Failed to create vertex_labels ") from e
def run(self, query: str, fetch: str = "all") -> List:
"""Run query."""
with self._driver.session(database=self._graph) as session:
try:
result = session.run(query)
return list(result)
except Exception as e:
raise Exception(f"Query execution failed: {e}\nQuery: {query}") from e
def check_label_exists(self, label: str, label_type: str) -> bool:
with self._driver.session(database=self._graph) as session:
# Run the query to get vertex labels
if label_type == "node":
raw_vertex_labels = session.run("CALL db.vertexLabels()").data()
vertex_labels = [table_name["label"] for table_name in raw_vertex_labels]
if label in vertex_labels:
return True
else:
# Run the query to get edge labels
raw_edge_labels = session.run("CALL db.edgeLabels()").data()
edge_labels = [table_name["label"] for table_name in raw_edge_labels]
if label in edge_labels:
return True
return False
# 获取节点或边的结构
def get_columns(self, table_name: str, table_type: str = "vertex") -> List[Dict]:
"""Retrieve the column for a specified vertex or edge table in the graph db."""
with self._driver.session(database=self._graph) as session:
data = []
result = None
if table_type == "vertex":
result = session.run(f"CALL db.getVertexSchema('{table_name}')").data()
else:
result = session.run(f"CALL db.getEdgeSchema('{table_name}')").data()
schema_info = json.loads(result[0]["schema"])
for prop in schema_info.get("properties", []):
prop_dict = {
"name": prop["name"],
"type": prop["type"],
"default_expression": "",
"is_in_primary_key": bool(
"primary" in schema_info
and prop["name"] == schema_info["primary"]
),
"comment": prop["name"],
}
data.append(prop_dict)
return data
def close(self):
"""Close the Neo4j driver."""
self._driver.close()
# {"name": "id", "type": "STRING", "optional": False},
# {"name": "name", "type": "STRING", "optional": False, "index": True},
# {"name": "num", "type": "STRING", "optional": False, "unique": True},
# {"name": "desc", "type": "STRING", "optional": True}
# 构建节点json语句用于tugraph创建节点
def bulid_node_json(node_name:str,pro_key:str ,node_properties):
vertex_label_json = {
"label": node_name,
"primary": pro_key,
"type": "VERTEX",
"detach_property": True,
"properties": [
]
}
for node_property in node_properties:
proper_info = {"name": node_property[0], "type": "STRING", "optional": False}
vertex_label_json['properties'].append(proper_info)
return json.dumps(vertex_label_json)
def bulid_edge_json(edge_name:str,edge_properties,start_node_key,end_node_key):
edge_label_json = {
"label": edge_name,
"type": "EDGE",
"detach_property": True,
"constraints": [],
"properties": []
}
edge_label_json['constraints'].append([edge_properties[0][1][0],edge_properties[0][2][0]])
# 这是在边属性中存储节点的主键(不需要也可以)
# edge_label_json['properties'].append({"name": start_node_key if start_node_key != end_node_key else start_node_key+'1', "type": "STRING", "optional": False})
# edge_label_json['properties'].append({"name": end_node_key if start_node_key != end_node_key else start_node_key+'2', "type": "STRING", "optional": False})
for edge_property in edge_properties:
proper_info = {"name": edge_property[0], "type": "STRING", "optional": True}
edge_label_json['properties'].append(proper_info)
return json.dumps(edge_label_json)
def neo4jNode2Tugrapg(connector,tugraphConn):
query = """CALL db.labels() YIELD label
RETURN label;"""
print("Executing query:", query)
results_nodes = connector.run(query)
print(f"所有的节点:{results_nodes}")
print("指定的节点:",connector.specified_node)
for node in results_nodes:
#获取节点结构
query = f"""
MATCH (n:{node[0]})
UNWIND keys(n) AS key
RETURN DISTINCT key"""
node_properties = connector.run(query)
if node[0] not in connector.specified_node and len(connector.specified_node) != 0:
continue
print(f"当前 neo4j 节点 {node[0]} , roperties : {node_properties}!!")
if tugraphConn.check_label_exists(node[0],"node"):
print(node[0],"节点已经存在!")
else:
print(node[0],"节点不存在,需要新建!")
node_json = bulid_node_json(node[0],connector.node_pro_key[node[0]],node_properties)
# 新建不存在的节点
tugraphConn.create_vertex_labels(node_json)
# neo4j中查询出当前节点标签下所有节点
queryNode = f"MATCH (n:{node[0]}) RETURN n"
# 构建插入语句同步节点
synchronize_node(node[0],connector.run(queryNode),node_properties,tugraphConn)
# node_name 当前节点标签名
# node_result neo4j中查询出的节点结果
# tugraphConn tugraph连接器
# 构建新增节点语句并tugraphConn 执行,一次执行300条
# CREATE (:node1 {id: "2", name: "李四", num: "001", desc: "李四的信息"}),
# (:node1 {id: "3", name: "李四", num: "001", desc: "李四的信息"});
def synchronize_node(node_name:str,node_result,node_properties,tugraphConn):
# 构建Cypher查询语句
print(f"同步 {node_name} 节点共 {len(node_result)} 记录,请等待执行完成...")
create_node_cypher_parts = []
count = 0
skip_num = 0
for node in node_result:
# print("aa",aa)
item = node[0]._properties
properties_list = []
is_skip = False
for key in node_properties:
# 如果节点结构与当前节点属性结构不一致,则跳过当前节点
if key[0] not in item.keys():
skip_num += 1
is_skip = True
break
if is_skip:
continue
for key, value in item.items():
properties_list.append(f"{key}: '{value}'")
# if isinstance(value, str):
# # 如果是字符串,则添加引号
# properties_list.append(f"{key}: '{value}'")
# else:
# # 否则直接添加
# properties_list.append(f"{key}: {value}")
cypher_query = f"(:{node_name} {{{', '.join(properties_list)}}})"
create_node_cypher_parts.append(cypher_query)
count += 1
# 每300个节点执行一次TuGraph数据库操作
if count % 300 == 0:
create_node_cypher = f"CREATE {', '.join(create_node_cypher_parts)}"
# print(create_node_cypher) # 打印生成的Cypher查询语句以便调试
tugraphConn.run(create_node_cypher)
create_node_cypher_parts = [] # 清空列表以准备下一批节点
# 处理剩余的节点
if create_node_cypher_parts:
create_node_cypher = f"CREATE {', '.join(create_node_cypher_parts)}"
# print(create_node_cypher) # 打印生成的Cypher查询语句以便调试
tugraphConn.run(create_node_cypher)
print(f"所有 {node_name} 节点同步完成,共 {len(node_result)} 条记录,不符合要求 {skip_num} 条;成功导入 {count} 条!")
# 导入边
def neo4jEdge2Tugrapg(connector,tugraphConn):
query = """CALL db.relationshipTypes() YIELD relationshipType
RETURN relationshipType;"""
print("Executing query:", query)
results_dege = connector.run(query)
print(f"所有的关系:{results_dege}")
print(f"指定的关系:{connector.specified_relation}")
for edge in results_dege:
if edge[0] not in connector.specified_relation and len(connector.specified_relation) != 0:
continue
# 获取关系结构
query = f"""
MATCH (n1)-[r:{edge[0]}]->(n2) UNWIND keys(r) AS key RETURN DISTINCT key, labels(n1) AS start_node_labels, labels(n2) AS end_node_labels"""
edge_properties = connector.run(query)
start_node = edge_properties[0][1][0]
end_node = edge_properties[0][2][0]
if start_node not in connector.specified_node or end_node not in connector.specified_node:
print(f"{edge[0]}关系中存在不符合要求的节点,跳过!")
continue
if tugraphConn.check_label_exists(edge[0],"edge"):
print(edge[0],"关系已经存在!")
else:
print(edge[0],"关系不存在,需要新建!")
#获取节点结构
node_json = bulid_edge_json(edge[0],edge_properties, connector.node_pro_key[start_node], connector.node_pro_key[end_node])
# 新建不存在的节点
tugraphConn.create_edge_labels(node_json)
# neo4j中查询出当前节点标签下所有节点
queryNode = f"MATCH (n1)-[r:{edge[0]}]->(n2) RETURN n1,r,n2;"
results = connector.run(queryNode)
# 构建插入语句同步节点
synchronize_edge(edge[0],results,start_node,end_node,tugraphConn)
def synchronize_edge(edge_name:str,edge_results,start_node_name,end_node_name,tugraphConn):
# 构建Cypher查询语句
print(f"同步 {edge_name} 关系共 {len(edge_results)} 记录,请等待执行完成...")
create_node_cypher_parts = []
count = 0
skip_num = 0
for edge in edge_results:
properties_list = []
for gx in edge:
if hasattr(gx, 'type'):
if list(gx.start_node.labels)[0] == start_node_name and list(gx.end_node.labels)[0] == end_node_name:
start_node = gx.start_node
end_node = gx.end_node
start_pro_key = tugraphConn.node_pro_key[start_node_name]
end_pro_key = tugraphConn.node_pro_key[end_node_name]
start_pro_val = start_node[start_pro_key]
end_pro_val = end_node[end_pro_key]
# 创建一个字典来存储所有属性
csv_map = {
# start_pro_key if start_node_name != end_node_name else start_pro_key+'1': start_pro_val,
# end_pro_key if start_node_name != end_node_name else end_pro_key+'2': end_pro_val
}
csv_map.update(gx)
# 将属性字典转换为 JSON 风格的字符串
# 构造关系属性字符串
rel_props_list = [f"{key}: '{value}'" for key, value in csv_map.items()]
rel_props_str = "{ " + ", ".join(rel_props_list) + " }"
# todo 批量操作存储属性的
# str1 = f"{{startId:'{start_pro_val}', endId:'{end_pro_val}', relProps:{rel_props_str}}}"
# properties_list.append(str1)
# create_node_cypher_parts.append(str1)
create_edge_cypher = f"""
MATCH (n1:{start_node_name} {{{start_pro_key}: '{start_pro_val}'}}),
(n2:{end_node_name} {{{end_pro_key}: '{end_pro_val}'}})
CREATE (n1)-[:{edge_name} {rel_props_str}]->(n2);
"""
# print(f"执行新增关系[{edge_name}]的cypher:{create_edge_cypher}")
tugraphConn.run(create_edge_cypher)
count += 1
else:
break
# 批量操作 (tugraph不支持)
# if count % 3 == 0 and create_node_cypher_parts:
# map = {
#
# }
# queue_cypher = f"""
# UNWIND [{', '.join(create_node_cypher_parts)}] AS relData
# MATCH (a:{start_node_name} {{{tugraphConn.node_pro_key[start_node_name]}: relData.startId}}), (b:{end_node_name} {{{tugraphConn.node_pro_key[end_node_name]}: relData.endId}})
# MERGE (a)-[r:{edge_name}]->(b)
# SET r += relData.relProps
# RETURN r;
# """
# print(f"执行新增关系[{edge_name}]的cypher:{queue_cypher}")
# # tugraphConn.run(queue_cypher)
# create_node_cypher_parts = []
# if create_node_cypher_parts:
# queue_cypher = f"""
# UNWIND [{', '.join(create_node_cypher_parts)}] AS relData
# MATCH (a:{start_node_name} {{{tugraphConn.node_pro_key[start_node_name]}: relData.startId}}), (b:{end_node_name} {{{tugraphConn.node_pro_key[end_node_name]}: relData.endId}})
# MERGE (a)-[r:{edge_name}]->(b)
# SET r += relData.relProps
# RETURN r;
# """
# print(f"执行新增关系[{edge_name}]的cypher:{queue_cypher}")
# tugraphConn.run(queue_cypher)
print(f"所有 {edge_name} 节点同步完成,共 {len(edge_results)} 条记录,不符合要求 {skip_num} 条;成功导入 {count} 条!")
# 创建连接器
def conn_tugraph():
# 配置连接信息
host = "1111"
port = 111
user = "111"
password = "111"
db_name = "test121"
db_type = "tugraph"
connector = GraphConnector.from_uri_db(host, port, user, password, db_name, db_type)
return connector
def conn_neo4j():
# 配置连接信息
host = "11111"
port = 111
user = "111"
password = "111111"
db_name = "111"
db_type = "neo4j"
connector = GraphConnector.from_uri_db(host, port, user, password, db_name, db_type)
return connector
def main():
neo4jConn = conn_neo4j()
tugraphConn = conn_tugraph()
print("Successfully connected to Graph!")
# 创建TuGraph新图库 - 连接时选中,可以手动创建,或者在初始化方法中创建
tugraphConn.create_graph_new("test121")
# 导入节点
neo4jNode2Tugrapg(neo4jConn,tugraphConn)
# 导入边
neo4jEdge2Tugrapg(neo4jConn,tugraphConn)
# get_relation_tocsv(connector)
# 关闭连接
neo4jConn.close()
tugraphConn.close()
print("Connection closed.")
if __name__ == "__main__":
main()
说明
只是用Python简单写了一个可以执行导入操作的脚本,欢迎指正和优化。边的导入比较慢(单条导入)。
有两种优化思路:
一、Cypher 语句:
UNWIND [{startId:'11', endId:'21', relProps:{ hphm: '33', sj: '44' }},
{startId:'22', endId:'23', relProps:{ hphm: '44', sj: '20080102' }},
{startId:'33', endId:'24', relProps:{ hphm: '55, sj: '20120110' }}] AS relData
MATCH (a:Ren {zjhm: relData.startId}), (b:Che {rowkey: relData.endId})
MERGE (a)-[r:sycl]->(b)
SET r += relData.relProps
RETURN r;
二、
https://tugraph-db.readthedocs.io/zh-cn/latest/development_guide.html Tugraph 官网的批量操作
CALL db.upsertEdge('edge1',{type:'node1',key:'node1_id'}, {type:'node2',key:'node2_id'}, [{node1_id:1,node2_id:2,score:10},{node1_id:3,node2_id:4,score:20}])
代码里面留了
但 我的版本好像都不支持!!!
后文
欢迎讨论