DuckLake 同步数据库(将 MySQL 表数据同步到 DuckLake 数据湖)

python 复制代码
import duckdb as QhDB
import concurrent.futures
from datetime import date,timedelta
import time,os,traceback,glob
from loguru import logger


def QhConnDB(QhPath = r"D:\duckdb_cli",   # root directory that contains the data lake folders
             QhDuckLakeName = "QhTestDuckLake001",         # data lake name (also its folder name)
             QhDuckLakeAsName = "QhTestDuckLake001",       # alias the lake is attached under
             QhBaseDbType = "sqlite"                       # catalog backend: "duckdb" or "sqlite"
             ):
    """
    Connect to a DuckLake data lake, creating it if its catalog file is missing.

    The catalog file is expected at
    ``<QhPath>\\<QhDuckLakeName>\\<QhDuckLakeName>.<ducklake|sqlite>``.
    For the "sqlite" backend the attach statement also passes
    ``<QhPath>/<QhDuckLakeName>/`` (forward slashes) as DATA_PATH.

    Returns the DuckDB connection with the lake attached, or None when the
    backend type is unsupported or the attach fails (the error is logged).

    Author: Que Hui
    """
    # Reject unknown backends explicitly instead of failing later on an
    # unbound SQL variable.
    if QhBaseDbType not in ("duckdb", "sqlite"):
        logger.error("【QH_INIT_DUCKLAKE】[{}]不支持的元数据库类型[{}],仅支持duckdb/sqlite,QueHui!".format(
            QhDuckLakeName, QhBaseDbType))
        return None

    # 1. Assemble the catalog file path; raw strings keep the Windows
    #    backslashes without relying on invalid escape sequences.
    QhSuffix = "ducklake" if QhBaseDbType == "duckdb" else "sqlite"
    QhDataPath = r"{QhPath}\{QhDuckLakePath}\{QhDuckLakeName}.{QhSuffix}".format(
        QhPath=QhPath,
        QhDuckLakePath=QhDuckLakeName,
        QhDuckLakeName=QhDuckLakeName,
        QhSuffix=QhSuffix)

    # 2. Report whether this call connects to an existing lake or creates one.
    if os.path.isfile(QhDataPath):
        logger.info("【QH_INIT_DUCKLAKE】[{}]||数据湖已存在,本次为数据湖[连接],QueHui!".format(QhDuckLakeName))
    else:
        logger.info("【QH_INIT_DUCKLAKE】[{}]||数据湖不存在,本次为数据湖[创建并连接],QueHui!".format(QhDuckLakeName))

    # 3. Build the attach statement, e.g.
    #    attach 'ducklake:D:\duckdb_cli\QhTestDuckLake001\QhTestDuckLake001.ducklake' as QhTestDuckLake001;
    if QhBaseDbType == "duckdb":
        QhSql = r"attach 'ducklake:{QhPath}\{QhDuckLakePath}\{QhDuckLakeName}.ducklake' as {QhDuckLakeAsName};".format(
            QhPath=QhPath,
            QhDuckLakePath=QhDuckLakeName,
            QhDuckLakeName=QhDuckLakeName,
            QhDuckLakeAsName=QhDuckLakeAsName)
    else:
        # The data path is passed with forward slashes.
        QhPath00 = QhPath.replace("\\", "/")
        QhPathother = "{QhPath00}/{QhDuckLakePath}/".format(QhPath00=QhPath00, QhDuckLakePath=QhDuckLakeName)
        QhSql = r"attach 'ducklake:sqlite:{QhPath}\{QhDuckLakePath}\{QhDuckLakeName}.sqlite' as {QhDuckLakeAsName}  (DATA_PATH '{QhPathother}');".format(
            QhPath=QhPath,
            QhDuckLakePath=QhDuckLakeName,
            QhDuckLakeName=QhDuckLakeName,
            QhDuckLakeAsName=QhDuckLakeAsName,
            QhPathother=QhPathother)

    # 4. Open an in-memory DuckDB connection and attach the lake to it.
    QhCon = None
    try:
        logger.info("【QH_INIT_DUCKLAKE】连接数据湖代码,QueHui!\n\n{}\n".format(QhSql))
        QhCon = QhDB.connect()
        QhCon.sql(QhSql)
        logger.info("【QH_INIT_DUCKLAKE】[{}]数据湖连接成功,QueHui!".format(QhDuckLakeName))
        return QhCon
    except Exception:
        QhError_msg = traceback.format_exc()
        logger.error("【QH_INIT_DUCKLAKE】[{}]数据湖连接失败,QueHui!\n{}".format(QhDuckLakeName,QhError_msg))
        # Do not leak the half-initialised connection when the attach fails.
        if QhCon is not None:
            QhCon.close()
        return None
        
def QhIsTableExist(QhCon,
                   QhSchema_name = "",           # data lake layer (schema) name; empty = any schema
                   QhTable_name = "QhTablePer"):        # data lake table name
    """
    Check whether a table is listed in ``duckdb_tables``.

    The lookup filters on ``table_name`` and, when QhSchema_name is given,
    also on ``schema_name``. It does not filter on the database name.

    Returns True when the table is found, False when it is not, and None when
    no table name was supplied or the query failed (the error is logged).

    Author: Que Hui
    """
    # Base query; the optional filters are appended below.
    QhSql = "\n    select table_name from duckdb_tables \n        where 1=1 \n    "

    # Optional schema (layer) filter.
    if QhSchema_name not in ["",None]:
        QhSql += f"    and schema_name = '{QhSchema_name}'\n"
        logger.info("【QH_IS_TABLE】[{}]表-数据湖有数据分层,数据湖层为[{}],QueHui!".format(QhTable_name,QhSchema_name))
    else:
        logger.info("【QH_IS_TABLE】[{}]表-数据湖无数据分层,QueHui!".format(QhTable_name))

    # A table name is mandatory.
    if QhTable_name in ["",None]:
        logger.info("【QH_IS_TABLE】[{}]表-请传入表名,QueHui!".format(QhTable_name))
        return None

    QhSql += f"        and table_name = '{QhTable_name}';\n"
    logger.info("【QH_IS_TABLE】[{}]表-判断表是否存在的SQL,QueHui!\n\n{}\n".format(QhTable_name,QhSql))
    try:
        QhResCueson = QhCon.cursor()
        try:
            QhRows = QhResCueson.execute(QhSql).fetchall()
        finally:
            # Always release the cursor, including on the "no rows" and
            # error paths.
            QhResCueson.close()
    except Exception:
        QhError_msg = traceback.format_exc()
        logger.error("【QH_IS_TABLE】[{}]表-判断表失败,QueHui!\n{}".format(QhTable_name,QhError_msg))
        return None

    # The table exists when the first returned name matches exactly.
    return len(QhRows) > 0 and QhRows[0][0] == QhTable_name

def QhTablePartition(QhCon,
                   QhDuckLakeName = "QhTestDuckLake001",         # data lake name
                   QhSchema_name = "",               # data lake layer (schema) name; empty = "main"
                   QhTable_name = "QhTablePer",         # data lake table name
                   Qhpartitioned = ""):                # partition column(s)
    """
    Set the partitioning of a data lake table.

    Runs ``alter table <lake>.<schema>.<table> set partitioned by(<columns>);``
    on QhCon. Exceptions raised by the statement propagate to the caller.

    Per the original author's note: decide on partitioning when the table is
    designed, because it cannot be applied after data has been written.

    Author: Que Hui
    """
    # Fall back to "main" when no layer is given, so the qualified name is
    # always defined; "main" is also the folder name QhDeleteTableData uses
    # for the no-schema case.
    QhSchema = QhSchema_name if QhSchema_name not in ["",None] else "main"
    QhTablename = QhDuckLakeName + "." + QhSchema + "." + QhTable_name
    logger.info("【QH_FENQU_TABLE】数据湖表名是[{}],QueHui!".format(QhTablename))
    QhSql= """alter table {} set partitioned by({});""".format(QhTablename,Qhpartitioned)
    logger.info("【QH_FENQU_TABLE】[{}]数据湖表分区SQL,QueHui!\n\n{}\n".format(QhTablename,QhSql))
    QhCon.sql(QhSql)
    logger.info("【QH_FENQU_TABLE】[{}]数据湖表分区成功,QueHui!".format(QhTablename))


def QhCreateTable(QhCon,
                QhDuckLakeName = "QhTestDuckLake001",         # data lake name
                QhSchema_name = "",               # data lake layer (schema) name; empty = "main"
                QhTable_name = "QhTablePer",         # data lake table name
                Qhpartitioned = "",               # partition column(s); empty = no partitioning
                QhMuBiaoHost = "",                # source MySQL host
                QhMuBiaoPort = "",                # source MySQL port
                QhMuBiaoDB = "",                  # source MySQL database name
                QhMuBiaoUser = "",                # source MySQL user
                QhMuBiaoPwd = "",                # source MySQL password
                QhMuBiaoTable = "",              # source table name
                ):
    """
    Create the data lake table from the structure of a source MySQL table.

    Steps:
    1. Check whether the lake table already exists; return if it does or if
       the check itself failed.
    2. Attach the source database under the alias ``db`` and create an empty
       lake table with ``create table ... as select * from db.<table> where 1=0``.
    3. Apply partitioning when Qhpartitioned is given.
    4. Detach ``db`` again, also when step 2 or 3 raises.

    Returns None. Exceptions from the attach/create/partition statements
    propagate to the caller.

    Author: Que Hui
    """
    # Step 1: existence check (None means the check could not be performed).
    QhIsTable = QhIsTableExist(QhCon,
               QhSchema_name = QhSchema_name,
               QhTable_name = QhTable_name)
    if QhIsTable in [None,""]:
        logger.info("【QH_CREATE_TABLE】[{}]判断表是否存在程序可能有报错,QueHui!".format(QhTable_name))
        return
    if QhIsTable:
        logger.info("【QH_CREATE_TABLE】[{}]表已存在,无需创建,QueHui!".format(QhTable_name))
        return

    # Fall back to "main" when no layer is given so the qualified name is
    # always defined.
    QhSchema = QhSchema_name if QhSchema_name not in ["",None] else "main"
    QhTablename = QhDuckLakeName + "." + QhSchema + "." + QhTable_name

    # Step 2: attach the source database. The statement contains the
    # password, so it is deliberately not logged.
    QhMuBiaoSql = "attach 'host={QhMuBiaoHost} user={QhMuBiaoUser} password={QhMuBiaoPwd} port={QhMuBiaoPort} database={QhMuBiaoDB}' As db (TYPE mysql_scanner);"\
            .format(QhMuBiaoHost=QhMuBiaoHost,
                    QhMuBiaoUser=QhMuBiaoUser,
                    QhMuBiaoPwd=QhMuBiaoPwd,
                    QhMuBiaoPort=QhMuBiaoPort,
                    QhMuBiaoDB=QhMuBiaoDB)
    logger.info("【QH_CREATE_TABLE】[{}]开始连接目标数据库,QueHui!".format(QhTable_name))
    QhCon.sql(QhMuBiaoSql)
    logger.info("【QH_CREATE_TABLE】[{}]连接目标数据库成功,QueHui!".format(QhTable_name))

    try:
        # "where 1=0" copies the column layout without copying any rows.
        QhSql= """create table {} as select * from db.{} where 1=0;""".format(QhTablename,QhMuBiaoTable)
        logger.info("【QH_CREATE_TABLE】[{}]创建数据湖表SQL,QueHui!\n\n{}\n".format(QhTable_name,QhSql))
        QhCon.sql(QhSql)
        # Step 3: optional partitioning.
        if Qhpartitioned not in ["",None]:
            QhTablePartition(QhCon,
                        QhDuckLakeName = QhDuckLakeName,
                        QhSchema_name = QhSchema_name,
                        QhTable_name = QhTable_name,
                        Qhpartitioned = Qhpartitioned)
        logger.info("【QH_CREATE_TABLE】[{}]创建数据湖表成功,QueHui!".format(QhTable_name))
    finally:
        # Step 4: always release the "db" alias; a leftover alias would make
        # the next attach under the same name fail.
        QhCon.sql("detach db;")
        logger.info("【QH_CREATE_TABLE】[{}]关闭目标数据库,QueHui!".format(QhTable_name))

def QhremoveTableFile(QhDataPath):
    """
    Delete every ``*.parquet`` file directly inside QhDataPath.

    Used to drop the data files of a table (or partition) that is about to be
    re-synchronised, so stale files do not take up disk space. Sub-directories
    are not searched. Does nothing when QhDataPath is not an existing
    directory. Returns None; errors from ``os.remove`` propagate.

    Author: Que Hui
    """
    if not os.path.isdir(QhDataPath):
        return
    logger.info("【QH_REMOVE_TABLE】[{}]数据湖表路径存在,QueHui!".format(QhDataPath))
    # Build the pattern with os.path.join so it works with any path
    # separator, and escape the directory part so characters such as "["
    # in the path are not treated as glob wildcards.
    QhTableFiles = glob.glob(os.path.join(glob.escape(QhDataPath), "*.parquet"))
    if not QhTableFiles:
        return
    logger.info("【QH_REMOVE_TABLE】数据湖表文件列表,QueHui!\n\n{}\n".format(QhTableFiles))
    for QhTableFile in QhTableFiles:
        # glob already returns paths that include QhDataPath; joining the
        # directory again would duplicate it for relative paths.
        os.remove(QhTableFile)
        logger.info("【QH_REMOVE_TABLE】[{}]数据湖表文件删除成功,QueHui!".format(QhTableFile))

def QhDeleteTableData(QhResCueson,
                      QhPath = r"D:\duckdb_cli",   # root directory that contains the data lake folders
                      QhDuckLakeName = "QhTestDuckLake001",         # data lake name
                      QhSchema_name = "",                           # data lake layer (schema) name; empty = "main"
                      QhTable_name = "QhTablePer",                  # data lake table name
                      Qhpartitioned = "",                           # partition column; empty = whole table
                      QhpartitionedValue = "",                      # partition value to delete
                      QhBaseDbType = "sqlite",                    # catalog backend: "duckdb" or "sqlite"
                      ):
    """
    Delete the rows that are about to be re-synchronised and remove their
    parquet files from disk.

    With a partition column, only rows where ``<column> = <value>`` are
    deleted and the files under ``<table dir>\\<column>=<value>`` are removed;
    otherwise the whole table is emptied and the files directly under the
    table directory are removed.

    Returns True on success and False when the delete or file removal raised
    (the error is logged). Raises ValueError for an unsupported QhBaseDbType.

    Author: Que Hui
    """
    # Fall back to "main" when no layer is given, for both the qualified
    # table name and the on-disk folder.
    QhSchema = QhSchema_name if QhSchema_name not in ["",None] else "main"
    QhTablename = QhDuckLakeName + "." + QhSchema + "." + QhTable_name

    if QhBaseDbType == "duckdb":
        # NOTE(review): this branch points at "<name>.sqlite.files" even for
        # the duckdb backend; kept as in the original, but a DuckDB-backed
        # catalog may keep its data under "<name>.ducklake.files" - confirm.
        QhDataPath0 = r"{QhPath}\{QhDuckLakePath}\{QhDuckLakeName}.sqlite.files\{QhSchema_name}".format(QhPath=QhPath,
                    QhDuckLakePath=QhDuckLakeName,QhDuckLakeName=QhDuckLakeName,QhSchema_name=QhSchema)
    elif QhBaseDbType == "sqlite":
        QhDataPath0 = r"{QhPath}\{QhDuckLakePath}\{QhSchema_name}".format(QhPath=QhPath,
                    QhDuckLakePath=QhDuckLakeName,QhSchema_name=QhSchema)
    else:
        raise ValueError("unsupported QhBaseDbType: {!r} (expected 'duckdb' or 'sqlite')".format(QhBaseDbType))

    QhDataPath0 = r"{QhDataPath0}\{QhTable_name}".format(QhDataPath0=QhDataPath0,QhTable_name=QhTable_name)  # table directory
    logger.info("【QH_DELETE_TABLEDATA】数据湖表名是[{}],QueHui!".format(QhTablename))
    logger.info("【QH_DELETE_TABLEDATA】[{}]数据湖表文件路径[{}],QueHui!".format(QhTablename,QhDataPath0))
    try:
        if Qhpartitioned not in ["",None]:
            # Non-numeric values must be SQL string literals; the caller
            # passes the raw value, so quote it here (embedded quotes are
            # doubled) unless it already arrives quoted.
            QhSqlValue = QhpartitionedValue
            if not isinstance(QhpartitionedValue, (int, float, complex)):
                QhSqlValue = str(QhpartitionedValue)
                QhAlreadyQuoted = (len(QhSqlValue) >= 2
                                   and QhSqlValue.startswith("'")
                                   and QhSqlValue.endswith("'"))
                if not QhAlreadyQuoted:
                    QhSqlValue = "'{}'".format(QhSqlValue.replace("'", "''"))
            # e.g. delete from QhTestDuckLake001.ods.some_table where part_col = 1;
            QhSql= """delete from {QhTablename} where {Qhpartitioned} = {QhpartitionedValue};""".format(QhTablename = QhTablename,
                                            Qhpartitioned = Qhpartitioned,QhpartitionedValue = QhSqlValue)
            # The partition folder name uses the raw (unquoted) value.
            QhDataPath = r"{}\{}={}".format(QhDataPath0,Qhpartitioned,QhpartitionedValue)
        else:
            QhSql= """delete from {QhTablename};""".format(QhTablename = QhTablename)
            QhDataPath = QhDataPath0
        logger.info("【QH_DELETE_TABLEDATA】[{}]数据湖表DELECTE_SQL,QueHui!\n\n{}\n".format(QhTablename,QhSql))
        QhResCueson.execute(QhSql)
        logger.info("【QH_DELETE_TABLEDATA】[{}]数据湖表数据删除成功,QueHui!".format(QhTablename))
        # NOTE(review): the parquet files are removed behind the catalog's
        # back; confirm the lake metadata no longer references them.
        QhremoveTableFile(QhDataPath)
        return True
    except Exception:
        QhError_msg = traceback.format_exc()
        logger.error("【QH_DELETE_TABLEDATA】[{}]表-可能是新建表,还未存数据,QueHui!\n{}".format(QhTable_name,QhError_msg))
        return False
    
def QhInsertTableData(QhCon,
                      # NOTE(review): QhDeleteTableData appends QhDuckLakeName to
                      # QhPath itself, so this default (which already ends with the
                      # lake folder) looks one level too deep - confirm; callers
                      # should pass the lake root directory.
                      QhPath = r"D:\duckdb_cli\QhTestDuckLake001",   # data lake path
                      QhDuckLakeName = "QhTestDuckLake001",         # data lake name
                      QhSchema_name = "",                           # data lake layer (schema) name; empty = "main"
                      QhTable_name = "QhTablePer",                  # data lake table name
                      Qhpartitioned = "",                           # partition column; empty = full-table sync
                      QhpartitionedValue = "",                      # partition value to sync
                      QhBaseDbType = "sqlite",                      # catalog backend: "duckdb" or "sqlite"
                      QhMuBiaoHost = "",                # source MySQL host
                      QhMuBiaoPort = "",                # source MySQL port
                      QhMuBiaoDB = "",                  # source MySQL database name
                      QhMuBiaoUser = "",                # source MySQL user
                      QhMuBiaoPwd = "",                 # source MySQL password
                      QhMuBiaoTable = "",               # source table name
                     ):
    """
    Synchronise a source MySQL table (or one partition of it) into the lake.

    Steps:
    1. Create the lake table if it does not exist yet.
    2. Delete the rows (and parquet files) that are about to be replaced.
    3. Attach the source database as ``db`` and run
       ``insert into <lake table> select * from db.<source table>``, filtered
       on ``<partition column> = <value>`` when a partition column is given.
    4. Close the cursor and detach ``db``, also when step 2 or 3 raises.

    Returns None. Exceptions from the attach/insert statements propagate.

    Author: Que Hui
    """
    # Step 1: create the lake table when it is missing.
    QhCreateTable(QhCon = QhCon,
                QhDuckLakeName = QhDuckLakeName,
                QhSchema_name = QhSchema_name,
                QhTable_name = QhTable_name,
                Qhpartitioned = Qhpartitioned,
                QhMuBiaoHost = QhMuBiaoHost,
                QhMuBiaoPort = QhMuBiaoPort,
                QhMuBiaoDB = QhMuBiaoDB,
                QhMuBiaoUser = QhMuBiaoUser,
                QhMuBiaoPwd = QhMuBiaoPwd,
                QhMuBiaoTable =QhMuBiaoTable,
                )

    # Fall back to "main" when no layer is given so the qualified name is
    # always defined.
    QhSchema = QhSchema_name if QhSchema_name not in ["",None] else "main"
    QhTablename = QhDuckLakeName + "." + QhSchema + "." + QhTable_name
    logger.info("【QH_INSERT_TABLEDATA】数据湖表名是[{}],QueHui!".format(QhTablename))

    QhResCueson = QhCon.cursor()
    QhAttached = False   # True once the source database is attached as "db"
    try:
        # Step 2: remove the data that is about to be replaced.
        logger.info("【QH_INSERT_TABLEDATA】[{}]数据湖执行删除数据表数据,QueHui!".format(QhTablename))
        if QhDeleteTableData(QhResCueson,
                          QhPath = QhPath,
                          QhDuckLakeName = QhDuckLakeName,
                          QhSchema_name = QhSchema_name,
                          QhTable_name = QhTable_name,
                          Qhpartitioned = Qhpartitioned,
                          QhpartitionedValue = QhpartitionedValue,
                          QhBaseDbType = QhBaseDbType,
                          ):
            logger.info("【QH_INSERT_TABLEDATA】[{}]数据湖执行删除数据表数据,QueHui!".format(QhTablename))
        else:
            logger.error("【QH_INSERT_TABLEDATA】[{}]数据湖执行删除数据表数据报错或者无数据需要删除,QueHui!".format(QhTablename))

        # Step 3: attach the source database. The statement contains the
        # password, so it is deliberately not logged.
        QhMuBiaoSql = "attach 'host={QhMuBiaoHost} user={QhMuBiaoUser} password={QhMuBiaoPwd} port={QhMuBiaoPort} database={QhMuBiaoDB}' As db (TYPE mysql_scanner);"\
                    .format(QhMuBiaoHost=QhMuBiaoHost,
                            QhMuBiaoUser=QhMuBiaoUser,
                            QhMuBiaoPwd=QhMuBiaoPwd,
                            QhMuBiaoPort=QhMuBiaoPort,
                            QhMuBiaoDB=QhMuBiaoDB)
        logger.info("【QH_INSERT_TABLEDATA】[{}]开始连接目标数据库,QueHui!".format(QhTable_name))
        QhCon.sql(QhMuBiaoSql)
        QhAttached = True
        logger.info("【QH_INSERT_TABLEDATA】[{}]连接目标数据库成功,QueHui!".format(QhTable_name))

        QhStartTime = time.time()
        logger.info("【QH_INSERT_TABLEDATA】[{}]开始同步目标数据库到数据湖表,QueHui!".format(QhTable_name))
        if Qhpartitioned not in ["",None]:
            # Non-numeric values must be SQL string literals: wrap them in
            # single quotes (doubling embedded quotes) unless already quoted.
            QhSqlValue = QhpartitionedValue
            if not isinstance(QhpartitionedValue, (int, float, complex)):
                QhSqlValue = str(QhpartitionedValue)
                QhAlreadyQuoted = (len(QhSqlValue) >= 2
                                   and QhSqlValue.startswith("'")
                                   and QhSqlValue.endswith("'"))
                if not QhAlreadyQuoted:
                    QhSqlValue = "'{}'".format(QhSqlValue.replace("'", "''"))
            QhSql= """insert into {QhTablename} select * from db.{QhMuBiaoTable} where {Qhpartitioned} = {QhpartitionedValue};""".format(QhTablename = QhTablename,
                    QhMuBiaoTable=QhMuBiaoTable,Qhpartitioned = Qhpartitioned,QhpartitionedValue = QhSqlValue)
        else:
            QhSql= """insert into {QhTablename} select * from db.{QhMuBiaoTable};""".format(QhTablename = QhTablename,
                    QhMuBiaoTable=QhMuBiaoTable)
        logger.info("【QH_INSERT_TABLEDATA】[{}]数据湖表INSERT_SQL,QueHui!\n\n{}\n".format(QhTablename,QhSql))
        QhResCueson.execute(QhSql)
        QhHaoShiTime = time.time() - QhStartTime
        logger.info("【QH_INSERT_TABLEDATA】[{}]同步目标数据库到数据湖表成功,共耗时【{}】秒,QueHui!".format(QhTable_name,QhHaoShiTime))
    finally:
        # Step 4: release the cursor and the "db" alias even on failure, so
        # the next sync can attach under the same alias again.
        QhResCueson.close()
        if QhAttached:
            QhCon.sql("detach db;")
            logger.info("【QH_INSERT_TABLEDATA】[{}]关闭目标数据库,QueHui!".format(QhTable_name))

if __name__ == '__main__':
    # Demo entry point: connect to (or create) a SQLite-backed data lake.

    # Raw string keeps the Windows backslash without an invalid "\d" escape.
    QhPath = r"D:\duckdb_cli"   # root directory that contains the data lake folders
    QhDuckLakeName = "QhSqliteLake005"        # data lake name
    QhDuckLakeAsName = "QhSqliteLake005"      # alias the lake is attached under
    QhBaseDbType = "sqlite"                   # catalog backend


    QhCon=QhConnDB(QhPath = QhPath,
                    QhDuckLakeName = QhDuckLakeName,
                    QhDuckLakeAsName = QhDuckLakeAsName,
                    QhBaseDbType = QhBaseDbType)

    # QhSelectLakeBASE(QhCon,QhDuckLakeName = "QhSqliteLake001")

    # QhConnDB returns None when the connection failed; only close a real
    # connection.
    if QhCon is not None:
        QhCon.close()
    
相关推荐
^辞安2 小时前
什么是Mvcc
java·数据库·mysql
王百万_3 小时前
【浅谈Spark和Flink区别及应用】
大数据·数据库·分布式·flink·spark·数据治理·数据库架构
白云偷星子4 小时前
MySQL笔记8
数据库·笔记·mysql
维尔切4 小时前
MySQL 主从复制
linux·运维·数据库·mysql·adb
wow_DG4 小时前
【MySQL✨】MySQL 入门之旅 · 第十一篇:MySQL 表连接(JOIN)基础
数据库·mysql
sun_qqq5 小时前
数据可视化的中间表方案
数据库·mysql·数据分析·数据库开发·数据可视化
马克学长5 小时前
SSM滁州学院考研信息分享论坛0iaj2 (程序+源码+数据库+调试部署+开发环境)带论文文档1万字以上,文末可获取,系统界面在最后面
数据库·考研·ssm
先做个垃圾出来………5 小时前
Pydantic库应用
java·数据库·python
深耕AI6 小时前
【12/20】数据库高级查询:MongoDB 聚合管道在用户数据分析中的应用,实现报告生成
数据库·mongodb·数据分析