# Pasted from a web code block (the "python" language tag and "copy code" button text were residue, not code).
import duckdb as QhDB
import concurrent.futures
from datetime import date,timedelta
import time,os,traceback,glob
from loguru import logger
def QhConnDB(QhPath = r"D:\duckdb_cli",              # root directory that holds the data lakes
             QhDuckLakeName = "QhTestDuckLake001",    # data lake name (also its sub-directory / catalog file stem)
             QhDuckLakeAsName = "QhTestDuckLake001",  # alias the lake is attached under
             QhBaseDbType = "sqlite"                  # catalog backend: "duckdb" or "sqlite"
             ):
    """
    Connect to a DuckLake data lake, creating it when it does not exist yet.

    Author: QueHui

    The catalog file is looked up at
    ``<QhPath>\\<QhDuckLakeName>\\<QhDuckLakeName>.ducklake`` for the "duckdb"
    backend and at ``<QhPath>\\<QhDuckLakeName>\\<QhDuckLakeName>.sqlite`` for
    the "sqlite" backend.  For "sqlite" the ATTACH statement additionally
    passes ``DATA_PATH '<QhPath with forward slashes>/<QhDuckLakeName>/'``.

    Returns:
        The DuckDB connection with the lake attached as ``QhDuckLakeAsName``,
        or ``None`` when the backend type is unsupported or the ATTACH fails
        (the failure is logged, not raised).
    """
    if QhBaseDbType == "duckdb":
        QhSuffix = "ducklake"
    elif QhBaseDbType == "sqlite":
        QhSuffix = "sqlite"
    else:
        # Previously this case fell through to an undefined QhSql and was only
        # reported indirectly as a NameError traceback; report it explicitly.
        logger.error("【QH_INIT_DUCKLAKE】[{}]不支持的数据库类型[{}],QueHui!".format(QhDuckLakeName, QhBaseDbType))
        return None

    # 1. Assemble the catalog file path (Windows-style separators, as before).
    QhDataPath = "{}\\{}\\{}.{}".format(QhPath, QhDuckLakeName, QhDuckLakeName, QhSuffix)

    # 2. Report whether this call will attach an existing lake or create a new one.
    if os.path.isfile(QhDataPath):
        logger.info("【QH_INIT_DUCKLAKE】[{}]||数据湖已存在,本次为数据湖[连接],QueHui!".format(QhDuckLakeName))
    else:
        logger.info("【QH_INIT_DUCKLAKE】[{}]||数据湖不存在,本次为数据湖[创建并连接],QueHui!".format(QhDuckLakeName))

    # 3. Build the ATTACH statement, e.g.
    #    attach 'ducklake:D:\duckdb_cli\QhTestDuckLake001\QhTestDuckLake001.ducklake' as QhTestDuckLake001;
    if QhBaseDbType == "duckdb":
        QhSql = "attach 'ducklake:{}' as {};".format(QhDataPath, QhDuckLakeAsName)
    else:
        # The data directory is passed with forward slashes.
        QhPathother = "{}/{}/".format(QhPath.replace("\\", "/"), QhDuckLakeName)
        QhSql = "attach 'ducklake:sqlite:{}' as {} (DATA_PATH '{}');".format(
            QhDataPath, QhDuckLakeAsName, QhPathother)

    QhCon = None
    try:
        logger.info("【QH_INIT_DUCKLAKE】连接数据湖代码,QueHui!\n\n{}\n".format(QhSql))
        QhCon = QhDB.connect()
        QhCon.sql(QhSql)
        logger.info("【QH_INIT_DUCKLAKE】[{}]数据湖连接成功,QueHui!".format(QhDuckLakeName))
        return QhCon
    except Exception:
        QhError_msg = traceback.format_exc()
        logger.error("【QH_INIT_DUCKLAKE】[{}]数据湖连接失败,QueHui!\n{}".format(QhDuckLakeName, QhError_msg))
        # Do not leak the in-memory connection when the ATTACH failed.
        if QhCon is not None:
            QhCon.close()
        return None
def QhIsTableExist(QhCon,
                   QhSchema_name = "",           # data lake layer (schema) name; empty/None = any schema
                   QhTable_name = "QhTablePer"): # data lake table name
    """
    Check whether a table exists according to ``duckdb_tables``.

    Author: QueHui

    Args:
        QhCon: DuckDB connection; a cursor is opened from it for the lookup.
        QhSchema_name: when non-empty, the lookup is restricted to this schema.
        QhTable_name: table name to look for.

    Returns:
        ``True`` if a matching row is found, ``False`` if not, and ``None``
        when no table name was given or the lookup itself failed (the error is
        logged, not raised).

    NOTE(review): the lookup is not restricted to a database/catalog, so a
    same-named table in any attached database matches — confirm this is intended.
    """
    # Equivalent interactive query:
    #   select table_name from duckdb_tables where 1=1 and schema_name = 'ods' and table_name = 'QhTablePer';
    QhSql = "select table_name from duckdb_tables\nwhere 1=1\n"
    QhParams = []

    # Optional schema (layer) filter.
    if QhSchema_name not in ["", None]:
        QhSql += " and schema_name = ?\n"
        QhParams.append(QhSchema_name)
        logger.info("【QH_IS_TABLE】[{}]表-数据湖有数据分层,数据湖层为[{}],QueHui!".format(QhTable_name, QhSchema_name))
    else:
        logger.info("【QH_IS_TABLE】[{}]表-数据湖无数据分层,QueHui!".format(QhTable_name))

    if QhTable_name in ["", None]:
        logger.info("【QH_IS_TABLE】[{}]表-请传入表名,QueHui!".format(QhTable_name))
        return None

    # Names are bound as parameters instead of being pasted into the SQL text.
    QhSql += " and table_name = ?;\n"
    QhParams.append(QhTable_name)
    logger.info("【QH_IS_TABLE】[{}]表-判断表是否存在的SQL,QueHui!\n\n{}\n".format(
        QhTable_name, "{}-- params: {}".format(QhSql, QhParams)))

    try:
        QhResCueson = QhCon.cursor()
        try:
            QhRows = QhResCueson.execute(QhSql, QhParams).fetchall()
        finally:
            # Close the cursor on every path (it used to leak on "not found"
            # and on errors).
            QhResCueson.close()
    except Exception:
        QhError_msg = traceback.format_exc()
        logger.error("【QH_IS_TABLE】[{}]表-判断表失败,QueHui!\n{}".format(QhTable_name, QhError_msg))
        return None

    if not QhRows:
        return False  # table does not exist
    return QhRows[0][0] == QhTable_name
def QhTablePartition(QhCon,
                     QhDuckLakeName = "QhTestDuckLake001",  # data lake name
                     QhSchema_name = "",                    # data lake layer (schema) name
                     QhTable_name = "QhTablePer",           # data lake table name
                     Qhpartitioned = ""):                   # partition column(s)
    """
    Set the partitioning of a data lake table.

    Author: QueHui

    Runs ``alter table <lake>.<schema>.<table> set partitioned by(<Qhpartitioned>);``
    on ``QhCon``.  Partitioning should be decided when the table is designed,
    before data is written (per the original author's note).

    When ``QhSchema_name`` is empty or ``None`` the schema ``main`` is used;
    previously that case raised ``UnboundLocalError`` because the qualified
    name was never initialised.

    Any error raised by DuckDB (for example an empty ``Qhpartitioned``)
    propagates to the caller.
    """
    QhSchema = QhSchema_name if QhSchema_name not in ["", None] else "main"
    QhTablename = "{}.{}.{}".format(QhDuckLakeName, QhSchema, QhTable_name)
    logger.info("【QH_FENQU_TABLE】数据湖表名是[{}],QueHui!".format(QhTablename))

    QhSql = """alter table {} set partitioned by({});""".format(QhTablename, Qhpartitioned)
    logger.info("【QH_FENQU_TABLE】[{}]数据湖表分区SQL,QueHui!\n\n{}\n".format(QhTablename, QhSql))
    QhCon.sql(QhSql)
    logger.info("【QH_FENQU_TABLE】[{}]数据湖表分区成功,QueHui!".format(QhTablename))
def QhCreateTable(QhCon,
                  QhDuckLakeName = "QhTestDuckLake001",  # data lake name
                  QhSchema_name = "",                    # data lake layer (schema) name
                  QhTable_name = "QhTablePer",           # data lake table name
                  Qhpartitioned = "",                    # partition column(s)
                  QhMuBiaoHost = "",                     # source database host
                  QhMuBiaoPort = "",                     # source database port
                  QhMuBiaoDB = "",                       # source database name
                  QhMuBiaoUser = "",                     # source database user
                  QhMuBiaoPwd = "",                      # source database password
                  QhMuBiaoTable = "",                    # source table name
                  ):
    """
    Create the data lake table from the structure of a source MySQL table.

    Author: QueHui

    Steps:
      1. Check whether the lake table already exists (``QhIsTableExist``).
      2. If it does not, attach the source database as ``db`` and create an
         empty copy with ``create table ... as select * from db.<table> where 1=0``.
      3. Optionally set the partitioning, then detach ``db``.

    When ``QhSchema_name`` is empty or ``None`` the table is created in schema
    ``main`` (this case used to raise ``UnboundLocalError``).  ``db`` is now
    detached even when the create/partition step fails, so a later attach
    under the same alias is not blocked.

    Returns ``None``.  Errors from attaching or from creating the table
    propagate to the caller.
    """
    # 1. Existence check.
    QhIsTable = QhIsTableExist(QhCon,
                               QhSchema_name = QhSchema_name,
                               QhTable_name = QhTable_name)
    # None means the check itself could not be performed.
    if QhIsTable in [None, ""]:
        logger.warning("【QH_CREATE_TABLE】[{}]判断表是否存在程序可能有报错,QueHui!".format(QhTable_name))
        return
    if QhIsTable:
        logger.info("【QH_CREATE_TABLE】[{}]表已存在,无需创建,QueHui!".format(QhTable_name))
        return

    # 2. Fully qualified lake table name; fall back to the "main" schema.
    QhSchema = QhSchema_name if QhSchema_name not in ["", None] else "main"
    QhTablename = "{}.{}.{}".format(QhDuckLakeName, QhSchema, QhTable_name)

    # Attach the source database.  The statement contains the password and is
    # therefore deliberately not logged.
    QhMuBiaoSql = ("attach 'host={QhMuBiaoHost} user={QhMuBiaoUser} password={QhMuBiaoPwd} "
                   "port={QhMuBiaoPort} database={QhMuBiaoDB}' As db (TYPE mysql_scanner);").format(
        QhMuBiaoHost=QhMuBiaoHost,
        QhMuBiaoUser=QhMuBiaoUser,
        QhMuBiaoPwd=QhMuBiaoPwd,
        QhMuBiaoPort=QhMuBiaoPort,
        QhMuBiaoDB=QhMuBiaoDB)
    logger.info("【QH_CREATE_TABLE】[{}]开始连接目标数据库,QueHui!".format(QhTable_name))
    QhCon.sql(QhMuBiaoSql)
    logger.info("【QH_CREATE_TABLE】[{}]连接目标数据库成功,QueHui!".format(QhTable_name))

    try:
        # "where 1=0" copies the column layout without copying any rows.
        QhSql = """create table {} as select * from db.{} where 1=0;""".format(QhTablename, QhMuBiaoTable)
        logger.info("【QH_CREATE_TABLE】[{}]创建数据湖表SQL,QueHui!\n\n{}\n".format(QhTable_name, QhSql))
        QhCon.sql(QhSql)

        # 3. Optional partitioning; the resolved schema is passed on so the
        #    partition step targets exactly the table created above.
        if Qhpartitioned not in ["", None]:
            QhTablePartition(QhCon,
                             QhDuckLakeName = QhDuckLakeName,
                             QhSchema_name = QhSchema,
                             QhTable_name = QhTable_name,
                             Qhpartitioned = Qhpartitioned)
        logger.info("【QH_CREATE_TABLE】[{}]创建数据湖表成功,QueHui!".format(QhTable_name))
    finally:
        # Always release the source database alias.
        QhCon.sql("detach db;")
        logger.info("【QH_CREATE_TABLE】[{}]关闭目标数据库,QueHui!".format(QhTable_name))
def QhremoveTableFile(QhDataPath):
    """
    Delete every ``*.parquet`` file directly inside ``QhDataPath``.

    Author: QueHui

    Used after deleting rows from a lake table so stale data files do not keep
    occupying disk space.  Does nothing when ``QhDataPath`` is not an existing
    directory or contains no parquet files.  Sub-directories are not visited.

    Errors from ``os.remove`` (for example a file that is locked) propagate to
    the caller.

    NOTE(review): the files are removed behind DuckLake's back; confirm the
    catalog no longer references them (e.g. for older snapshots) before
    relying on this.
    """
    if not os.path.isdir(QhDataPath):
        return
    logger.info("【QH_REMOVE_TABLE】[{}]数据湖表路径存在,QueHui!".format(QhDataPath))

    # glob.escape keeps characters such as "[" in a directory name from being
    # interpreted as wildcards.
    QhTableFiles = glob.glob(os.path.join(glob.escape(QhDataPath), "*.parquet"))
    if not QhTableFiles:
        return

    logger.info("【QH_REMOVE_TABLE】数据湖表文件列表,QueHui!\n\n{}\n".format(QhTableFiles))
    for QhTableFile in QhTableFiles:
        # glob already returns the path including QhDataPath; joining it onto
        # QhDataPath again duplicated the directory for relative paths.
        os.remove(QhTableFile)
        logger.info("【QH_REMOVE_TABLE】[{}]数据湖表文件删除成功,QueHui!".format(QhTableFile))
def QhDeleteTableData(QhResCueson,
                      QhPath = r"D:\duckdb_cli",             # root directory that holds the data lakes
                      QhDuckLakeName = "QhTestDuckLake001",   # data lake name
                      QhSchema_name = "",                     # data lake layer (schema) name
                      QhTable_name = "QhTablePer",            # data lake table name
                      Qhpartitioned = "",                     # partition column
                      QhpartitionedValue = "",                # partition value to delete
                      QhBaseDbType = "sqlite",                # catalog backend: "duckdb" or "sqlite"
                      ):
    """
    Delete the rows that are about to be re-synced and remove their data files.

    Author: QueHui

    With a partition column, only ``where <Qhpartitioned> = <value>`` rows are
    deleted and the parquet files in ``<table dir>\\<column>=<value>`` are
    removed; otherwise the whole table is emptied and the parquet files in the
    table directory are removed.

    Fixes compared with the previous version:
      * an empty/None ``QhSchema_name`` now targets schema ``main`` instead of
        raising ``UnboundLocalError``;
      * non-numeric partition values are quoted as SQL string literals, the
        same way the INSERT side does it;
      * the "duckdb" backend looks in ``<name>.ducklake.files`` (matching the
        ``.ducklake`` catalog file that QhConnDB attaches) rather than
        ``<name>.sqlite.files``;
      * an unsupported ``QhBaseDbType`` raises ``ValueError``.

    Returns:
        ``True`` when the delete and file cleanup succeeded, ``False`` when
        either failed (the traceback is logged).

    Raises:
        ValueError: ``QhBaseDbType`` is neither "duckdb" nor "sqlite".
    """
    QhSchema = QhSchema_name if QhSchema_name not in ["", None] else "main"

    # Root of the data files for this lake.
    # NOTE(review): the ".ducklake.files" location assumes DuckLake's default
    # data path for a DuckDB catalog file — confirm against the deployment.
    if QhBaseDbType == "duckdb":
        QhDataRoot = "{}\\{}\\{}.ducklake.files".format(QhPath, QhDuckLakeName, QhDuckLakeName)
    elif QhBaseDbType == "sqlite":
        QhDataRoot = "{}\\{}".format(QhPath, QhDuckLakeName)
    else:
        raise ValueError("unsupported QhBaseDbType: {!r}".format(QhBaseDbType))

    QhTablename = "{}.{}.{}".format(QhDuckLakeName, QhSchema, QhTable_name)
    QhDataPath0 = "{}\\{}\\{}".format(QhDataRoot, QhSchema, QhTable_name)  # table directory
    logger.info("【QH_DELETE_TABLEDATA】数据湖表名是[{}],QueHui!".format(QhTablename))
    logger.info("【QH_DELETE_TABLEDATA】[{}]数据湖表文件路径[{}],QueHui!".format(QhTablename, QhDataPath0))

    try:
        if Qhpartitioned not in ["", None]:
            # e.g. delete from QhTestDuckLake001.ods.some_table where partion_value = 'x';
            QhLiteral, QhRawValue = _QhPartitionLiteral(QhpartitionedValue)
            QhSql = """delete from {QhTablename} where {Qhpartitioned} = {QhpartitionedValue};""".format(
                QhTablename = QhTablename,
                Qhpartitioned = Qhpartitioned,
                QhpartitionedValue = QhLiteral)
            # Partition directory of the deleted value: <column>=<value>
            QhDataPath = "{}\\{}={}".format(QhDataPath0, Qhpartitioned, QhRawValue)
        else:
            QhSql = """delete from {QhTablename};""".format(QhTablename = QhTablename)
            QhDataPath = QhDataPath0

        logger.info("【QH_DELETE_TABLEDATA】[{}]数据湖表DELECTE_SQL,QueHui!\n\n{}\n".format(QhTablename, QhSql))
        QhResCueson.execute(QhSql)
        logger.info("【QH_DELETE_TABLEDATA】[{}]数据湖表数据删除成功,QueHui!".format(QhTablename))
        QhremoveTableFile(QhDataPath)
        return True
    except Exception:
        # Best effort: a freshly created table without data ends up here too.
        QhError_msg = traceback.format_exc()
        logger.error("【QH_DELETE_TABLEDATA】[{}]表-可能是新建表,还未存数据,QueHui!\n{}".format(QhTable_name, QhError_msg))
        return False


def _QhPartitionLiteral(QhValue):
    """Return ``(sql_literal, raw_text)`` for a partition value.

    Numbers are used as-is.  Anything else becomes a single-quoted SQL string
    with embedded quotes doubled.  A string that is already wrapped in single
    quotes is kept unchanged as the literal (for callers that pre-quoted it),
    and the raw text is the value without those outer quotes.
    """
    if isinstance(QhValue, (int, float, complex)):
        return str(QhValue), str(QhValue)
    QhText = str(QhValue)
    if len(QhText) >= 2 and QhText[0] == "'" and QhText[-1] == "'":
        return QhText, QhText[1:-1]
    return "'{}'".format(QhText.replace("'", "''")), QhText
def QhInsertTableData(QhCon,
                      QhPath = r"D:\duckdb_cli\QhTestDuckLake001",  # data lake path (forwarded to QhDeleteTableData)
                      QhDuckLakeName = "QhTestDuckLake001",          # data lake name
                      QhSchema_name = "",                            # data lake layer (schema) name
                      QhTable_name = "QhTablePer",                   # data lake table name
                      Qhpartitioned = "",                            # partition column
                      QhpartitionedValue = "",                       # partition value to sync
                      QhBaseDbType = "sqlite",                       # catalog backend: "duckdb" or "sqlite"
                      QhMuBiaoHost = "",                             # source database host
                      QhMuBiaoPort = "",                             # source database port
                      QhMuBiaoDB = "",                               # source database name
                      QhMuBiaoUser = "",                             # source database user
                      QhMuBiaoPwd = "",                              # source database password
                      QhMuBiaoTable = "",                            # source table name
                      ):
    """
    Sync a source MySQL table (or one partition of it) into a data lake table.

    Author: QueHui

    Steps:
      1. Create the lake table if it does not exist (``QhCreateTable``).
      2. Delete the rows/files that are about to be re-synced
         (``QhDeleteTableData``); a failure there is logged and the sync
         continues, as before.
      3. Attach the source database as ``db`` and run
         ``insert into <lake table> select * from db.<source table>``,
         filtered by ``<Qhpartitioned> = <value>`` when a partition column is
         given.  Non-numeric partition values are quoted as SQL strings.

    An empty/None ``QhSchema_name`` now targets schema ``main`` (it used to
    raise ``UnboundLocalError``).  The cursor is closed and ``db`` is detached
    even when the insert fails.

    NOTE(review): the default ``QhPath`` already ends with the lake name while
    QhDeleteTableData appends the lake name again — confirm which convention
    callers use.

    Returns ``None``.  Errors from attaching or inserting propagate.
    """
    QhSchema = QhSchema_name if QhSchema_name not in ["", None] else "main"

    # 1. Create the lake table when it is missing.
    QhCreateTable(QhCon = QhCon,
                  QhDuckLakeName = QhDuckLakeName,
                  QhSchema_name = QhSchema,
                  QhTable_name = QhTable_name,
                  Qhpartitioned = Qhpartitioned,
                  QhMuBiaoHost = QhMuBiaoHost,
                  QhMuBiaoPort = QhMuBiaoPort,
                  QhMuBiaoDB = QhMuBiaoDB,
                  QhMuBiaoUser = QhMuBiaoUser,
                  QhMuBiaoPwd = QhMuBiaoPwd,
                  QhMuBiaoTable = QhMuBiaoTable,
                  )

    QhTablename = "{}.{}.{}".format(QhDuckLakeName, QhSchema, QhTable_name)
    logger.info("【QH_INSERT_TABLEDATA】数据湖表名是[{}],QueHui!".format(QhTablename))

    QhResCueson = QhCon.cursor()
    QhAttached = False  # whether the "db" alias must be detached during cleanup
    try:
        # 2. Remove the data that is about to be replaced.
        logger.info("【QH_INSERT_TABLEDATA】[{}]数据湖执行删除数据表数据,QueHui!".format(QhTablename))
        if QhDeleteTableData(QhResCueson,
                             QhPath = QhPath,
                             QhDuckLakeName = QhDuckLakeName,
                             QhSchema_name = QhSchema,
                             QhTable_name = QhTable_name,
                             Qhpartitioned = Qhpartitioned,
                             QhpartitionedValue = QhpartitionedValue,
                             QhBaseDbType = QhBaseDbType,
                             ):
            logger.info("【QH_INSERT_TABLEDATA】[{}]数据湖执行删除数据表数据,QueHui!".format(QhTablename))
        else:
            logger.error("【QH_INSERT_TABLEDATA】[{}]数据湖执行删除数据表数据报错或者无数据需要删除,QueHui!".format(QhTablename))

        # 3. Attach the source database.  The statement contains the password
        #    and is therefore deliberately not logged.
        QhMuBiaoSql = ("attach 'host={QhMuBiaoHost} user={QhMuBiaoUser} password={QhMuBiaoPwd} "
                       "port={QhMuBiaoPort} database={QhMuBiaoDB}' As db (TYPE mysql_scanner);").format(
            QhMuBiaoHost=QhMuBiaoHost,
            QhMuBiaoUser=QhMuBiaoUser,
            QhMuBiaoPwd=QhMuBiaoPwd,
            QhMuBiaoPort=QhMuBiaoPort,
            QhMuBiaoDB=QhMuBiaoDB)
        logger.info("【QH_INSERT_TABLEDATA】[{}]开始连接目标数据库,QueHui!".format(QhTable_name))
        QhCon.sql(QhMuBiaoSql)
        QhAttached = True
        logger.info("【QH_INSERT_TABLEDATA】[{}]连接目标数据库成功,QueHui!".format(QhTable_name))

        QhStartTime = time.time()
        logger.info("【QH_INSERT_TABLEDATA】[{}]开始同步目标数据库到数据湖表,QueHui!".format(QhTable_name))
        if Qhpartitioned not in ["", None]:
            # e.g. insert into lake.ods.tbl select * from db.tbl where partion_value = 'x';
            if isinstance(QhpartitionedValue, (int, float, complex)):
                QhLiteral = QhpartitionedValue
            else:
                # Quote as a SQL string; embedded single quotes are doubled.
                QhLiteral = "'{}'".format(str(QhpartitionedValue).replace("'", "''"))
            QhSql = """insert into {QhTablename} select * from db.{QhMuBiaoTable} where {Qhpartitioned} = {QhpartitionedValue};""".format(
                QhTablename = QhTablename,
                QhMuBiaoTable = QhMuBiaoTable,
                Qhpartitioned = Qhpartitioned,
                QhpartitionedValue = QhLiteral)
        else:
            QhSql = """insert into {QhTablename} select * from db.{QhMuBiaoTable};""".format(
                QhTablename = QhTablename,
                QhMuBiaoTable = QhMuBiaoTable)
        logger.info("【QH_INSERT_TABLEDATA】[{}]数据湖表INSERT_SQL,QueHui!\n\n{}\n".format(QhTablename, QhSql))
        QhResCueson.execute(QhSql)

        QhHaoShiTime = time.time() - QhStartTime
        logger.info("【QH_INSERT_TABLEDATA】[{}]同步目标数据库到数据湖表成功,共耗时【{}】秒,QueHui!".format(QhTable_name, QhHaoShiTime))
    finally:
        # Release the cursor first, then the source database alias.
        QhResCueson.close()
        if QhAttached:
            QhCon.sql("detach db;")
            logger.info("【QH_INSERT_TABLEDATA】[{}]关闭目标数据库,QueHui!".format(QhTable_name))
if __name__ == '__main__':
    # Manual smoke run: attach (or create) a SQLite-backed lake.
    QhPath = r"D:\duckdb_cli"              # data lake root directory (raw string: "\d" is not a valid escape)
    QhDuckLakeName = "QhSqliteLake005"     # data lake name
    QhDuckLakeAsName = "QhSqliteLake005"   # alias the lake is attached under
    QhBaseDbType = "sqlite"                # catalog backend
    QhCon = QhConnDB(QhPath = QhPath,
                     QhDuckLakeName = QhDuckLakeName,
                     QhDuckLakeAsName = QhDuckLakeAsName,
                     QhBaseDbType = QhBaseDbType)
    # QhSelectLakeBASE(QhCon,QhDuckLakeName = "QhSqliteLake001")
    # QhConnDB returns None when the connection failed; close it otherwise.
    if QhCon is not None:
        QhCon.close()