MySQL 增量同步脚本

一、整体说明

这套脚本实现:

  • 源库 自动增量同步数据到目标库
  • 自动创建表结构
  • 按时间字段增量同步(create_time/update_time)
  • 支持批量、多线程、断点续传
  • 自动记录同步状态、日志、异常

核心文件:3 个

  1. 增量同步-BIDB.py(你自己的启动脚本)
  2. mysql_diff_sync.py(核心同步逻辑)
  3. mysql_diff_common.py(工具函数)

二、完整脚本代码(可直接复制使用)

1. 启动入口:增量同步-BIDB.py

python
from sqlalchemy import create_engine
from mysql_diff_sync import sync_data

# --- Database connection URLs -------------------------------------------
# Source instance (the "xingyue" production library).
db_prod_xingyue = 'mysql+pymysql://xxx:xxx@101.42.40.19:3306'

# Config instance that stores the diff_sync_config driver table.
db_config = 'mysql+pymysql://root:root@localhost:3306/ops_config?charset=utf8mb4'

# Target instance that receives the synced data.
db_all = 'mysql+pymysql://root:root@localhost:3306/ops_xingyue?charset=utf8mb4'

# --- Engines ------------------------------------------------------------
config_engine = create_engine(db_config)
source_engine = create_engine(db_prod_xingyue)
target_engine = create_engine(db_all)

# --- Kick off the incremental sync for this source instance -------------
sync_data(
    config_engine=config_engine,
    source_engine=source_engine,
    target_engine=target_engine,
    source_flag='ob_xingyue'
)

2. 核心同步逻辑:mysql_diff_sync.py

python
import pandas as pd
from sqlalchemy import create_engine, text, Table, MetaData, inspect, Column
from sqlalchemy.orm import Session
from sqlalchemy.dialects.mysql import insert
from datetime import datetime, timedelta
from concurrent.futures import ThreadPoolExecutor, as_completed
from mysql_diff_common import *

# Module-level logger writing to log_MySQL_diff_sync.log.
# NOTE(review): this rebinds the name `logger` from the factory function
# (pulled in via `from mysql_diff_common import *`) to a Logger instance,
# shadowing the factory for the rest of this module.
logger = logger('log_MySQL_diff_sync')

def sync_data(config_engine, source_engine, target_engine, source_flag, only_compare=False, vs_input_time='', max_workers=4):
    """Incrementally sync configured table pairs from source to target.

    The work list comes from the ``diff_sync_config`` table on
    *config_engine*: every enabled row whose ``source_instance`` equals
    *source_flag* and whose watermark (``vs_time_value``) is older than the
    comparison date.  For each pair the function:
      1. creates the target table from the source schema if missing;
      2. compares row counts below ("already synced") and at/above
         ("pending") the watermark;
      3. copies the delta — server-side INSERT when source and target share
         an instance, otherwise in pandas batches — or, when already-synced
         data diverged, truncates the target and restarts from 2020-01-01;
      4. records progress/outcome back into diff_sync_config.
    Pairs are processed concurrently on a thread pool of *max_workers*.

    Args:
        config_engine: engine for the DB holding diff_sync_config.
        source_engine: engine for the source instance.
        target_engine: engine for the target instance.
        source_flag: matched against diff_sync_config.source_instance.
        only_compare: when True, report differences but write no data.
        vs_input_time: comparison date 'YYYY-MM-DD'; '' means today.
        max_workers: thread-pool size.

    Returns:
        1 when *vs_input_time* is not a valid date, otherwise None.
    """
    logger.info(f"{source_flag} 开始数据增量同步 =====>")
    batch_size = 10000  # rows fetched per page in the cross-instance path
    vs_today_time = datetime.now().date()

    # Default the comparison date to today; otherwise parse the given string.
    if vs_input_time == '':
        vs_input_time = vs_today_time
    else:
        try:
            date_format = "%Y-%m-%d"
            vs_input_time = datetime.strptime(vs_input_time, date_format)
        except ValueError:
            logger.info(f"错误:提供的字符串不是有效的日期格式!")
            return 1

    # Pending work: enabled rows for this instance whose watermark is older
    # than the comparison date.
    # NOTE(review): source_flag/vs_input_time are interpolated straight into
    # the SQL text — safe only while these values are trusted.
    config_table = f"""
        select * from diff_sync_config 
        where is_enable=1 
        and source_instance='{source_flag}'  
        and vs_time_value<'{vs_input_time}' 
        order by id;
    """
    config_df = pd.read_sql(config_table, config_engine)

    inspector_source = inspect(source_engine)
    inspector_target = inspect(target_engine)
    logger.info(f"{source_flag} 待执行操作的源-目标对有: {len(config_df)} ,比较的基准时间值为: {vs_input_time}")

    def process_row(index, row):
        # Sync one config row (one source->target table pair); runs on a
        # worker thread.
        syn_tag = True  # False => skip the copy phase for this pair
        syned_msg = ""
        source_db, source_table, target_db, target_table = row[['source_db', 'source_table', 'target_db', 'target_table']]

        # Watermark time column(s); a ';'-separated pair may be configured,
        # but only the first entry is ever used below.
        vs_time_col = row['vs_time_col'] if row['vs_time_col'] else 'create_time;update_time'
        vs_time_col_list = vs_time_col.split(';')
        vs_time_col_sync = vs_time_col_list[0]
        vs_time_col_old = vs_time_col_list[0]

        vs_time_value = row['vs_time_value'] if row['vs_time_value'] else vs_input_time
        start_time = datetime.now()
        count = 0
        view_names = inspector_source.get_view_names(schema=source_db)
        view_names_lower_set = {v.lower() for v in view_names}

        # Auto-create the target table (schema only) when it does not exist.
        if not inspector_target.has_table(target_table, schema=target_db):
            logger.info(f"目标表不存在,创建: {target_db}.{target_table} ")
            copy_table_schema(source_engine, target_engine, source_db, source_table, target_db, target_table)
            logger.info(f"成功创建目标表: {target_db}.{target_table}")

        target = f"目标表: {target_db}.{target_table}"

        # Row-count diff BELOW the watermark (this data should already match).
        old_diff_rows, source_cc_old, old_msg = check_data_consistency(
            source_engine, target_engine,
            source_db, target_db,
            source_table, target_table,
            vs_time_col_old, '<', vs_time_value
        )

        # Row-count diff AT/ABOVE the watermark (the pending delta).
        new_diff_rows, source_cc_new, new_msg = check_data_consistency(
            source_engine, target_engine,
            source_db, target_db,
            source_table, target_table,
            vs_time_col_sync, '>=', vs_time_value
        )

        msg = f'已同步的检查: {old_msg}; 待同步的对比: {new_msg} '
        update_config_table(config_engine, ID=row['ID'], flag='ok-1-check', msg=msg, diff_rows=new_diff_rows, syned_diff_rows=old_diff_rows, syned_msg='同步前的检查')

        if only_compare:
            syned_msg = '当前处于比较模式,不执行同步。'
            syn_tag = False
            logger.info(f"{target}: {syned_msg}")
        else:
            if old_diff_rows == 0:
                if source_cc_new == 0:
                    # Historical data matches and no new source rows: no-op.
                    syned_msg = '无增量,不执行同步。'
                    syn_tag = False
            else:
                # Already-synced data diverged: reset the watermark, empty the
                # target table and re-initialize the whole sync.
                syned_msg = '在已同步数据中发现差异,因此重置比较时间,清空目标表,重新初始化同步。'
                logger.info(f"{target}:  {syned_msg}")
                vs_time_value = '2020-01-01'
                try:
                    with Session(target_engine) as session:
                        session.execute(text(f"TRUNCATE TABLE `{target_db}`.`{target_table}`"))
                        session.commit()
                        logger.info(f"{target}: truncate表:{target_db}.{target_table}。")
                except Exception as e:
                    logger.error(f"{target}: Error truncating table: {e}")

        if syn_tag:
            logger.info(f"{target}: 源表和目标表数据不一致,需要同步数据. 源: {source_db}.{source_table} ,目标: {target_db}.{target_table} ")
            # Everything after '@' (host:port[/db?params]) matches => treated
            # as the same instance, so the copy runs entirely server-side.
            if str(source_engine.url).split('@')[-1] == str(target_engine.url).split('@')[-1]:
                try:
                    with Session(target_engine) as session:
                        if source_table.lower() in view_names_lower_set:
                            # Source is a view: full reload (truncate + insert).
                            logger.info(f"{target}: 源为同实例的视图,先truncate再insert. 源: {source_db}.{source_table}")
                            session.execute(text(f"TRUNCATE TABLE `{target_db}`.{target_table}"))
                            session.commit()
                            session.execute(text(f"INSERT INTO `{target_db}`.{target_table} SELECT * FROM `{source_db}`.{source_table}"))
                            session.commit()
                        else:
                            # Upsert every source row via ON DUPLICATE KEY UPDATE.
                            columns = pd.read_sql(f"SHOW COLUMNS FROM `{source_db}`.`{source_table}`", source_engine)["Field"].tolist()
                            update_cols = [f"{col} = VALUES({col})" for col in columns]
                            insert_sql = f"INSERT INTO `{target_db}`.`{target_table}` ({','.join(columns)}) SELECT {','.join(columns)} FROM `{source_db}`.`{source_table}` ON DUPLICATE KEY UPDATE {','.join(update_cols)}"
                            session.execute(text(insert_sql))
                            session.commit()
                    # NOTE(review): `count` is never incremented anywhere, so
                    # this message always reports 0 rows — looks like a bug.
                    msg = f'插入数据{count}条. <== '+msg
                    flag = 'ok-2'
                except Exception as e:
                    logger.error(f"{target}: 同实例源和目标同步报错: {e}")
                    msg = f'{str(e)[:100]}. <== '+msg
                    flag = 'error-2'
                    return -1  # the finally below still records the failure
                finally:
                    end_time = datetime.now()
                    duration = format_duration(int((end_time - start_time).total_seconds()))
                    update_config_table(config_engine, ID=row['ID'], flag=flag, duration=duration, msg=msg)
            else:
                # Cross-instance path: page through the source delta with
                # LIMIT/OFFSET and upsert each batch through pandas.
                # NOTE(review): OFFSET pagination ordered by a non-unique time
                # column can skip or duplicate rows if the source changes
                # mid-run — confirm this is acceptable for these tables.
                offset = 0
                total_diff_rows = 0
                while True:
                    query_sql = f"SELECT * FROM `{source_db}`.{source_table} WHERE {vs_time_col_sync} >= '{vs_time_value}' ORDER BY {vs_time_col_sync} LIMIT {batch_size} OFFSET {offset}"

                    try:
                        source_df = pd.read_sql(query_sql, source_engine)
                        if source_df.empty:
                            flag = 'ok-2'
                            msg = f'无增量数据. <== '+msg
                            break
                        current_diff_rows = len(source_df)
                        total_diff_rows += current_diff_rows
                        source_df.to_sql(target_table, con=target_engine, index=False, if_exists='append', schema=target_db, method=mysql_replace_into)
                        flag = 'doing-2'
                    except Exception as e:
                        flag = 'error-2'
                        msg = f'{str(e)[:100]}. <== '+msg
                        syn_tag = False
                        logger.error(f"{target}: 错误信息:{str(e)}")
                    finally:
                        update_config_table(config_engine, ID=row['ID'], flag=flag, msg=msg, diff_rows=total_diff_rows)
                        # Log the paging SQL for the first two pages only.
                        if offset // batch_size < 2:
                            logger.info(f"{target}: query_sql:\n {query_sql} ")
                        if flag == 'error-2':
                            break
                        offset += batch_size

        if not syn_tag:
            # Nothing copied (compare-only / no delta / batch error): advance
            # the watermark and record the reason.
            update_config_table(config_engine, ID=row['ID'], flag='ok-3', syned_diff_rows=0, syned_msg=syned_msg, vs_time_value=vs_input_time)
        else:
            # Post-sync verification below today's date, then advance the
            # watermark to the comparison date.
            old_diff_rows, source_cc, syned_msg = check_data_consistency(
                source_engine, target_engine,
                source_db, target_db,
                source_table, target_table,
                vs_time_col_old, '<', vs_today_time
            )
            update_config_table(config_engine, ID=row['ID'], flag='ok-3', syned_diff_rows=old_diff_rows, syned_msg=syned_msg, vs_time_value=vs_input_time)

        end_time = datetime.now()
        duration = format_duration(int((end_time - start_time).total_seconds()))
        update_config_table(config_engine, ID=row['ID'], duration=duration)

    # Fan the pairs out over a thread pool; per-row failures are recorded
    # against the matching config row instead of aborting the batch.
    with ThreadPoolExecutor(max_workers=max_workers) as executor:
        futures = {executor.submit(process_row, index, row): index for index, row in config_df.iterrows()}
        for future in as_completed(futures):
            index = futures[future]
            try:
                future.result()
            except Exception as exc:
                logger.error(f"Config ID {config_df.loc[index, 'ID']} 生成异常: {exc}")
                update_config_table(config_engine, ID=config_df.loc[index, 'ID'], flag='error-0', msg=str(exc))

    logger.info(f"{source_flag} 数据增量同步完成 <=====")

3. 工具类:mysql_diff_common.py

python
import pandas as pd
from sqlalchemy import create_engine, text, Table, MetaData, inspect, Column
from sqlalchemy.orm import Session
from sqlalchemy.dialects.mysql import insert
from datetime import datetime, timedelta
import logging
from logging.handlers import TimedRotatingFileHandler

def logger(log_file_name):
    """Return an INFO-level logger that writes to ``<log_file_name>.log``.

    The file handler rotates at midnight and keeps 14 backups.

    Args:
        log_file_name: logger name; also used as the log file basename.

    Returns:
        The (cached) ``logging.Logger`` instance.
    """
    logger_instance = logging.getLogger(log_file_name)
    logger_instance.setLevel(logging.INFO)
    # Bug fix: logging.getLogger caches loggers by name, and the original
    # code appended a fresh TimedRotatingFileHandler on every call — each
    # record was then written once per call. Only attach a handler the
    # first time this name is configured.
    if not logger_instance.handlers:
        handler = TimedRotatingFileHandler(
            filename=f"{log_file_name}.log",
            when="midnight",
            interval=1,
            backupCount=14,
            encoding='utf-8'
        )
        formatter = logging.Formatter('%(asctime)s \t %(levelname)s \t %(message)s\n')
        handler.setFormatter(formatter)
        logger_instance.addHandler(handler)
    return logger_instance

def copy_table_schema(source_engine, target_engine, source_db, source_table, target_db, target_table):
    """Clone the column layout of ``source_db.source_table`` into
    ``target_db.target_table`` on the target engine (structure only,
    no rows are copied; creation is skipped if the table exists).

    Raises: re-raises any reflection/DDL error after logging it.
    """
    try:
        # Reflect only the one source table we need.
        src_meta = MetaData()
        src_meta.reflect(bind=source_engine, schema=source_db, only=[source_table])
        reflected = src_meta.tables[f'{source_db}.{source_table}']

        # Rebuild each column definition for the new table object.
        cloned_cols = []
        for col in reflected.columns:
            cloned_cols.append(
                Column(col.name, col.type, primary_key=col.primary_key, default=col.default,
                       nullable=col.nullable, autoincrement=col.autoincrement, comment=col.comment)
            )

        # Emit CREATE TABLE on the target; checkfirst avoids clobbering.
        new_table = Table(target_table, MetaData(), *cloned_cols, schema=target_db)
        new_table.create(bind=target_engine, checkfirst=True)
        logging.info(f"Schema copied: {source_db}.{source_table} -> {target_db}.{target_table}")
    except Exception as e:
        logging.error(f"Failed to copy schema: {e}")
        raise

def mysql_replace_into(table, conn, keys, data_iter):
    """pandas ``to_sql`` ``method=`` callable implementing MySQL upserts.

    Builds one INSERT ... ON DUPLICATE KEY UPDATE statement covering every
    row in *data_iter* and returns how many rows were handed to MySQL.
    Re-raises any execution error after logging it.
    """
    rows = [dict(zip(keys, values)) for values in data_iter]
    if not rows:
        return 0
    try:
        stmt = insert(table.table).values(rows)
        # On key collision, overwrite every column with the incoming value.
        upsert = stmt.on_duplicate_key_update(**dict(zip(stmt.inserted.keys(), stmt.inserted.values())))
        conn.execute(upsert)
        return len(rows)
    except Exception as e:
        logging.error(f"Error: {e}")
        raise

def format_duration(seconds):
    """Render an elapsed-seconds count as a compact Chinese duration string.

    Examples: 0 -> "0秒", 61 -> "1分钟1秒", 3600 -> "1小时".
    Zero-valued units are omitted; a zero total still yields "0秒".
    """
    total = int(seconds)
    hours = total // 3600
    minutes = (total % 3600) // 60
    secs = total % 60
    pieces = []
    if hours > 0:
        pieces.append(f"{hours}小时")
    if minutes > 0:
        pieces.append(f"{minutes}分钟")
    # Always show the seconds part when everything else is zero.
    if secs > 0 or not pieces:
        pieces.append(f"{secs}秒")
    return ''.join(pieces)

def update_config_table(config_engine, ID, **kwargs):
    """Persist status fields for one diff_sync_config row (matched by ID).

    Only whitelisted keyword arguments are written (values are bound as SQL
    parameters); ``uptime`` is always refreshed to the current timestamp.
    Failures are logged and swallowed so a status write never aborts a sync.
    """
    writable = ('vs_time_value', 'diff_rows', 'syned_diff_rows', 'syned_msg', 'flag', 'msg', 'duration')
    params = {'ID': ID}
    params.update({k: v for k, v in kwargs.items() if k in writable})
    try:
        with Session(config_engine) as session:
            assignments = [f"{k} = :{k}" for k in params if k != 'ID']
            assignments.append("uptime = current_timestamp")
            sql = f"UPDATE diff_sync_config SET {', '.join(assignments)} WHERE ID = :ID"
            session.execute(text(sql), params)
            session.commit()
    except Exception as e:
        logging.error(f"Failed to update config: {e}")

def check_data_consistency(source_engine, target_engine, source_db, target_db, source_table, target_table, vs_time_col, vs_tag, vs_time_value):
    """Compare row counts on both sides of a time-column predicate.

    Counts rows satisfying ``vs_time_col vs_tag 'vs_time_value'`` (e.g.
    ``update_time >= '2024-01-01'``) in the source and target tables and
    returns ``(target_count - source_count, source_count, message)``.

    NOTE(review): the predicate pieces are interpolated into the SQL text,
    so the inputs must come from trusted configuration only.
    """
    predicate = f"{vs_time_col} {vs_tag} '{vs_time_value}'"
    count_source = pd.read_sql(
        f"SELECT COUNT(*) AS total FROM `{source_db}`.{source_table} WHERE {predicate}",
        source_engine,
    ).iloc[0]['total']
    count_target = pd.read_sql(
        f"SELECT COUNT(*) AS total FROM `{target_db}`.{target_table} WHERE {predicate}",
        target_engine,
    ).iloc[0]['total']
    diff_rows = count_target - count_source
    if diff_rows == 0:
        msg = f'same. '
    else:
        direction = "目标多" if diff_rows > 0 else "目标少"
        msg = f'{direction} {abs(diff_rows)}条.'
    msg += f'目标: {count_target}, 源: {count_source}'
    return diff_rows, count_source, msg

三、配置表结构(必须创建)

库:ops_config

表:diff_sync_config

sql
-- One row per (source table -> target table) sync pair; sync_data reads
-- enabled rows and writes progress/status fields back into this table.
CREATE TABLE `diff_sync_config` (
  `ID` int NOT NULL AUTO_INCREMENT,
  `source_instance` varchar(64) COLLATE utf8mb4_general_ci NOT NULL DEFAULT '', -- matched against sync_data's source_flag
  `source_db` varchar(64) COLLATE utf8mb4_general_ci NOT NULL DEFAULT '',
  `source_table` varchar(80) COLLATE utf8mb4_general_ci NOT NULL DEFAULT '',
  `target_db` varchar(64) COLLATE utf8mb4_general_ci NOT NULL DEFAULT '',
  `target_table` varchar(80) COLLATE utf8mb4_general_ci NOT NULL DEFAULT '',
  `vs_time_col` varchar(64) COLLATE utf8mb4_general_ci NOT NULL DEFAULT '', -- watermark time column(s), ';'-separated
  `vs_time_value` date NOT NULL DEFAULT '2000-01-01', -- watermark: rows at/after this date are synced
  `is_enable` tinyint NOT NULL DEFAULT '1', -- 1 = sync this pair, 0 = skip
  `flag` varchar(18) CHARACTER SET utf8mb4 COLLATE utf8mb4_general_ci NOT NULL DEFAULT '', -- status written by sync_data (ok-* / doing-* / error-*)
  `diff_rows` int NOT NULL DEFAULT '-1', -- pending-delta row diff from the pre-sync check
  `msg` varchar(300) CHARACTER SET utf8mb4 COLLATE utf8mb4_general_ci NOT NULL DEFAULT '',
  `syned_diff_rows` int NOT NULL DEFAULT '-1', -- already-synced row diff
  `syned_msg` varchar(300) CHARACTER SET utf8mb4 COLLATE utf8mb4_general_ci DEFAULT NULL,
  `duration` varchar(64) COLLATE utf8mb4_general_ci NOT NULL DEFAULT '', -- human-readable run time per pair
  `uptime` timestamp NOT NULL DEFAULT CURRENT_TIMESTAMP ON UPDATE CURRENT_TIMESTAMP,
  `relyon` varchar(160) COLLATE utf8mb4_general_ci NOT NULL DEFAULT '',
  PRIMARY KEY (`ID`),
  UNIQUE KEY `ux` (`source_instance`,`source_db`,`source_table`,`target_db`,`target_table`)
) ENGINE=InnoDB AUTO_INCREMENT=10235 DEFAULT CHARSET=utf8mb4 COLLATE=utf8mb4_general_ci;

四、最关键配置说明(决定是否同步)

1. 是否启用同步

示例:
is_enable = 1   启用
is_enable = 0   禁用(不同步)

2. 按哪个时间字段同步

示例:
vs_time_col = create_time

3. 从哪个时间开始同步

示例:
vs_time_value = '2026-04-01'

同步完成后会自动更新为当前日期,所以第二遍不会重复同步。


相关推荐
穷人小水滴2 小时前
手机安装 Cross Profile Test App (Android)
android·手机·一加
ningmengjing_2 小时前
从零推导出 Redis
数据库·redis
zh_xuan2 小时前
Android Paging 3实现分页加载
android·git·github·paging 3
殷紫川2 小时前
InnoDB 索引性能天花板:聚簇 & 二级索引存储本质拆解,覆盖索引零回表优化全攻略
mysql
IvorySQL2 小时前
PostgreSQL & IvorySQL 技术交流 Meetup・郑州站| 4.18 线下开讲,只聊硬核技术
数据库·postgresql·开源
殷紫川2 小时前
MySQL IN 里塞 10000 个值?90% 开发者都踩过的坑,底层原理 + 全场景解决方案一次讲透
mysql
iOS妖狐小北2 小时前
mysql中主键索引和联合索引的原理解析
数据库·mysql
大嘴皮猴儿2 小时前
AI图片翻译技术解析:以跨马翻译为例看电商图片翻译的实际效果
大数据·数据库·人工智能·自动翻译·教育电商
光泽雨2 小时前
一、什么是 MySQL 函数?
mysql