一、整体说明
这套脚本实现:
- 从源库 自动增量同步数据到目标库
- 自动创建表结构
- 按时间字段增量同步(create_time/update_time)
- 支持批量、多线程、断点续传
- 自动记录同步状态、日志、异常
核心文件:3 个
增量同步入口.py(你自己的启动脚本)、mysql_diff_sync.py(核心同步逻辑)、mysql_diff_common.py(工具函数)
二、完整脚本代码(可直接复制使用)
1. 启动入口:增量同步-BIDB.py
python
"""Launcher: incremental sync from the Xingyue source instance into the local target DB."""
from sqlalchemy import create_engine
from mysql_diff_sync import sync_data

# ---- connection URLs ------------------------------------------------------
# Source instance (Xingyue).
# NOTE(review): no default database or charset in this URL — confirm intended.
db_prod_xingyue = 'mysql+pymysql://xxx:xxx@101.42.40.19:3306'
# Config database holding the diff_sync_config bookkeeping table.
db_config = 'mysql+pymysql://root:root@localhost:3306/ops_config?charset=utf8mb4'
# Target database that data is synced into.
db_all = 'mysql+pymysql://root:root@localhost:3306/ops_xingyue?charset=utf8mb4'

# ---- engines --------------------------------------------------------------
source_engine = create_engine(db_prod_xingyue)
target_engine = create_engine(db_all)
config_engine = create_engine(db_config)

# ---- run the incremental sync --------------------------------------------
sync_data(
    config_engine=config_engine,
    source_engine=source_engine,
    target_engine=target_engine,
    source_flag='ob_xingyue',
)
2. 核心同步逻辑:mysql_diff_sync.py
python
import pandas as pd
from sqlalchemy import create_engine, text, Table, MetaData, inspect, Column
from sqlalchemy.orm import Session
from sqlalchemy.dialects.mysql import insert
from datetime import datetime, timedelta
from concurrent.futures import ThreadPoolExecutor, as_completed
from mysql_diff_common import *
logger = logger('log_MySQL_diff_sync')
def sync_data(config_engine, source_engine, target_engine, source_flag, only_compare=False, vs_input_time='', max_workers=4):
    # Incremental sync driver: reads enabled rows from diff_sync_config, then
    # for each (source table -> target table) pair runs a consistency check
    # and copies pending rows, writing progress/state back to the config row.
    #
    # Args:
    #   config_engine: engine for the database holding diff_sync_config.
    #   source_engine / target_engine: engines for the data being synced.
    #   source_flag: matched against diff_sync_config.source_instance.
    #   only_compare: when True, only run the consistency checks — no writes.
    #   vs_input_time: '' (defaults to today) or a 'YYYY-MM-DD' baseline
    #     string; returns 1 if the string does not parse.
    #   max_workers: thread-pool size; one worker per config row.
    logger.info(f"{source_flag} 开始数据增量同步 =====>")
    batch_size = 10000
    vs_today_time = datetime.now().date()
    if vs_input_time == '':
        vs_input_time = vs_today_time
    else:
        try:
            date_format = "%Y-%m-%d"
            vs_input_time = datetime.strptime(vs_input_time, date_format)
        except ValueError:
            logger.info(f"错误:提供的字符串不是有效的日期格式!")
            return 1
    # Only rows that are enabled and whose baseline is older than the input
    # baseline are processed. NOTE(review): source_flag / vs_input_time are
    # interpolated directly into SQL; values come from trusted callers here,
    # but this is not injection-safe for arbitrary input.
    config_table = f"""
    select * from diff_sync_config
    where is_enable=1
    and source_instance='{source_flag}'
    and vs_time_value<'{vs_input_time}'
    order by id;
    """
    config_df = pd.read_sql(config_table, config_engine)
    inspector_source = inspect(source_engine)
    inspector_target = inspect(target_engine)
    logger.info(f"{source_flag} 待执行操作的源-目标对有: {len(config_df)} ,比较的基准时间值为: {vs_input_time}")

    def process_row(index, row):
        # Sync one config row. Flow: ensure target table exists -> check the
        # already-synced (< baseline) and pending (>= baseline) ranges ->
        # copy data (same-instance SQL or batched pandas) -> final check and
        # config-table bookkeeping.
        syn_tag = True
        syned_msg = ""
        source_db, source_table, target_db, target_table = row[['source_db', 'source_table', 'target_db', 'target_table']]
        # Semicolon-separated time column list. NOTE(review): both the "sync"
        # and "old" checks use element [0]; the second element of the default
        # 'create_time;update_time' is never used — possibly vs_time_col_old
        # was meant to be vs_time_col_list[-1]. Confirm intent.
        vs_time_col = row['vs_time_col'] if row['vs_time_col'] else 'create_time;update_time'
        vs_time_col_list = vs_time_col.split(';')
        vs_time_col_sync = vs_time_col_list[0]
        vs_time_col_old = vs_time_col_list[0]
        vs_time_value = row['vs_time_value'] if row['vs_time_value'] else vs_input_time
        start_time = datetime.now()
        # NOTE(review): never incremented — the "插入数据{count}条" message
        # below always reports 0 rows.
        count = 0
        view_names = inspector_source.get_view_names(schema=source_db)
        view_names_lower_set = {v.lower() for v in view_names}
        # First run for this pair: clone the table structure from the source.
        if not inspector_target.has_table(target_table, schema=target_db):
            logger.info(f"目标表不存在,创建: {target_db}.{target_table} ")
            copy_table_schema(source_engine, target_engine, source_db, source_table, target_db, target_table)
            logger.info(f"成功创建目标表: {target_db}.{target_table}")
        target = f"目标表: {target_db}.{target_table}"
        # Rows before the baseline should already match on both sides;
        # rows at/after the baseline are the pending increment.
        old_diff_rows, source_cc_old, old_msg = check_data_consistency(
            source_engine, target_engine,
            source_db, target_db,
            source_table, target_table,
            vs_time_col_old, '<', vs_time_value
        )
        new_diff_rows, source_cc_new, new_msg = check_data_consistency(
            source_engine, target_engine,
            source_db, target_db,
            source_table, target_table,
            vs_time_col_sync, '>=', vs_time_value
        )
        msg = f'已同步的检查: {old_msg}; 待同步的对比: {new_msg} '
        update_config_table(config_engine, ID=row['ID'], flag='ok-1-check', msg=msg, diff_rows=new_diff_rows, syned_diff_rows=old_diff_rows, syned_msg='同步前的检查')
        if only_compare:
            syned_msg = '当前处于比较模式,不执行同步。'
            syn_tag = False
            logger.info(f"{target}: {syned_msg}")
        else:
            if old_diff_rows == 0:
                # Historical data is consistent; skip entirely when there is
                # nothing pending on the source side.
                if source_cc_new == 0:
                    syned_msg = '无增量,不执行同步。'
                    syn_tag = False
            else:
                # Historical drift detected: rewind the baseline far back and
                # truncate the target so the table is fully re-initialized by
                # the sync step below.
                syned_msg = '在已同步数据中发现差异,因此重置比较时间,清空目标表,重新初始化同步。'
                logger.info(f"{target}: {syned_msg}")
                vs_time_value = '2020-01-01'
                try:
                    with Session(target_engine) as session:
                        session.execute(text(f"TRUNCATE TABLE `{target_db}`.`{target_table}`"))
                        session.commit()
                    logger.info(f"{target}: truncate表:{target_db}.{target_table}。")
                except Exception as e:
                    logger.error(f"{target}: Error truncating table: {e}")
            if syn_tag:
                logger.info(f"{target}: 源表和目标表数据不一致,需要同步数据. 源: {source_db}.{source_table} ,目标: {target_db}.{target_table} ")
                # Same MySQL instance (host part of both URLs matches): copy
                # server-side with INSERT ... SELECT instead of pulling rows
                # through pandas.
                if str(source_engine.url).split('@')[-1] == str(target_engine.url).split('@')[-1]:
                    try:
                        with Session(target_engine) as session:
                            if source_table.lower() in view_names_lower_set:
                                # Source is a view: full reload (truncate + insert).
                                logger.info(f"{target}: 源为同实例的视图,先truncate再insert. 源: {source_db}.{source_table}")
                                session.execute(text(f"TRUNCATE TABLE `{target_db}`.{target_table}"))
                                session.commit()
                                session.execute(text(f"INSERT INTO `{target_db}`.{target_table} SELECT * FROM `{source_db}`.{source_table}"))
                                session.commit()
                            else:
                                # Upsert all rows via ON DUPLICATE KEY UPDATE.
                                columns = pd.read_sql(f"SHOW COLUMNS FROM `{source_db}`.`{source_table}`", source_engine)["Field"].tolist()
                                update_cols = [f"{col} = VALUES({col})" for col in columns]
                                insert_sql = f"INSERT INTO `{target_db}`.`{target_table}` ({','.join(columns)}) SELECT {','.join(columns)} FROM `{source_db}`.`{source_table}` ON DUPLICATE KEY UPDATE {','.join(update_cols)}"
                                session.execute(text(insert_sql))
                                session.commit()
                        msg = f'插入数据{count}条. <== '+msg
                        flag = 'ok-2'
                    except Exception as e:
                        logger.error(f"{target}: 同实例源和目标同步报错: {e}")
                        msg = f'{str(e)[:100]}. <== '+msg
                        flag = 'error-2'
                        # Early exit on failure: the final ok-3 bookkeeping
                        # below is skipped, but the finally clause still runs.
                        return -1
                    finally:
                        end_time = datetime.now()
                        duration = format_duration(int((end_time - start_time).total_seconds()))
                        update_config_table(config_engine, ID=row['ID'], flag=flag, duration=duration, msg=msg)
                else:
                    # Cross-instance: page through the pending window with
                    # LIMIT/OFFSET and upsert each batch via pandas to_sql.
                    # NOTE(review): OFFSET pagination assumes the source
                    # result set is stable while the loop runs.
                    offset = 0
                    total_diff_rows = 0
                    while True:
                        query_sql = f"SELECT * FROM `{source_db}`.{source_table} WHERE {vs_time_col_sync} >= '{vs_time_value}' ORDER BY {vs_time_col_sync} LIMIT {batch_size} OFFSET {offset}"
                        try:
                            source_df = pd.read_sql(query_sql, source_engine)
                            if source_df.empty:
                                flag = 'ok-2'
                                msg = f'无增量数据. <== '+msg
                                break
                            current_diff_rows = len(source_df)
                            total_diff_rows += current_diff_rows
                            source_df.to_sql(target_table, con=target_engine, index=False, if_exists='append', schema=target_db, method=mysql_replace_into)
                            flag = 'doing-2'
                        except Exception as e:
                            flag = 'error-2'
                            msg = f'{str(e)[:100]}. <== '+msg
                            syn_tag = False
                            logger.error(f"{target}: 错误信息:{str(e)}")
                        finally:
                            # Record progress after every batch; log the SQL
                            # for the first two batches only.
                            update_config_table(config_engine, ID=row['ID'], flag=flag, msg=msg, diff_rows=total_diff_rows)
                            if offset // batch_size < 2:
                                logger.info(f"{target}: query_sql:\n {query_sql} ")
                        if flag == 'error-2':
                            break
                        offset += batch_size
        # Final bookkeeping: advance the baseline to vs_input_time; when a
        # sync was attempted, also record the post-sync consistency check.
        if not syn_tag:
            update_config_table(config_engine, ID=row['ID'], flag='ok-3', syned_diff_rows=0, syned_msg=syned_msg, vs_time_value=vs_input_time)
        else:
            old_diff_rows, source_cc, syned_msg = check_data_consistency(
                source_engine, target_engine,
                source_db, target_db,
                source_table, target_table,
                vs_time_col_old, '<', vs_today_time
            )
            update_config_table(config_engine, ID=row['ID'], flag='ok-3', syned_diff_rows=old_diff_rows, syned_msg=syned_msg, vs_time_value=vs_input_time)
        end_time = datetime.now()
        duration = format_duration(int((end_time - start_time).total_seconds()))
        update_config_table(config_engine, ID=row['ID'], duration=duration)

    # Fan out one worker per config row; a row whose worker raises is marked
    # error-0 in the config table, and the remaining rows keep running.
    with ThreadPoolExecutor(max_workers=max_workers) as executor:
        futures = {executor.submit(process_row, index, row): index for index, row in config_df.iterrows()}
        for future in as_completed(futures):
            index = futures[future]
            try:
                future.result()
            except Exception as exc:
                logger.error(f"Config ID {config_df.loc[index, 'ID']} 生成异常: {exc}")
                update_config_table(config_engine, ID=config_df.loc[index, 'ID'], flag='error-0', msg=str(exc))
    logger.info(f"{source_flag} 数据增量同步完成 <=====")
3. 工具类:mysql_diff_common.py
python
import pandas as pd
from sqlalchemy import create_engine, text, Table, MetaData, inspect, Column
from sqlalchemy.orm import Session
from sqlalchemy.dialects.mysql import insert
from datetime import datetime, timedelta
import logging
from logging.handlers import TimedRotatingFileHandler
def logger(log_file_name):
    """Create (or fetch) a named logger writing to a midnight-rotating file.

    Args:
        log_file_name: logger name, also used as the log file basename
            ("<log_file_name>.log" in the current working directory).

    Returns:
        logging.Logger: INFO-level logger with one TimedRotatingFileHandler.
    """
    logger_instance = logging.getLogger(log_file_name)
    logger_instance.setLevel(logging.INFO)
    # Bug fix: the original attached a new handler on *every* call, so calling
    # this factory twice for the same name duplicated every log line. Attach
    # the file handler only once per logger name.
    if not logger_instance.handlers:
        handler = TimedRotatingFileHandler(
            filename=f"{log_file_name}.log",
            when="midnight",
            interval=1,
            backupCount=14,  # keep two weeks of rotated files
            encoding='utf-8'
        )
        formatter = logging.Formatter('%(asctime)s \t %(levelname)s \t %(message)s\n')
        handler.setFormatter(formatter)
        logger_instance.addHandler(handler)
    return logger_instance
def copy_table_schema(source_engine, target_engine, source_db, source_table, target_db, target_table):
    """Clone one table's definition from the source database into the target.

    Reflects the source table with SQLAlchemy and recreates an equivalent
    table (column names/types, primary keys, defaults, nullability,
    autoincrement, comments) under the target schema. Creation is skipped if
    the target already exists (checkfirst). Logs and re-raises on failure.
    """
    try:
        src_meta = MetaData()
        src_meta.reflect(bind=source_engine, schema=source_db, only=[source_table])
        reflected = src_meta.tables[f'{source_db}.{source_table}']
        cloned_columns = []
        for col in reflected.columns:
            cloned_columns.append(
                Column(
                    col.name,
                    col.type,
                    primary_key=col.primary_key,
                    default=col.default,
                    nullable=col.nullable,
                    autoincrement=col.autoincrement,
                    comment=col.comment,
                )
            )
        new_table = Table(target_table, MetaData(), *cloned_columns, schema=target_db)
        new_table.create(bind=target_engine, checkfirst=True)
        logging.info(f"Schema copied: {source_db}.{source_table} -> {target_db}.{target_table}")
    except Exception as e:
        logging.error(f"Failed to copy schema: {e}")
        raise
def mysql_replace_into(table, conn, keys, data_iter):
    """pandas ``to_sql`` method hook performing a MySQL upsert.

    Builds an INSERT ... ON DUPLICATE KEY UPDATE statement so rows whose
    unique/primary key already exists in the target are updated in place.

    Returns:
        int: number of rows submitted (0 for an empty batch).
    """
    rows = [dict(zip(keys, values)) for values in data_iter]
    if not rows:
        return 0
    try:
        insert_stmt = insert(table.table).values(rows)
        upsert_stmt = insert_stmt.on_duplicate_key_update(
            **dict(zip(insert_stmt.inserted.keys(), insert_stmt.inserted.values()))
        )
        conn.execute(upsert_stmt)
        return len(rows)
    except Exception as e:
        logging.error(f"Error: {e}")
        raise
def format_duration(seconds):
    """Render a duration in seconds as a Chinese "X小时Y分钟Z秒" string.

    Zero-valued leading units are omitted; a zero duration renders as "0秒".

    Args:
        seconds: duration in seconds (fractions are truncated via int()).

    Returns:
        str: human-readable duration, e.g. 3661 -> "1小时1分钟1秒".
    """
    seconds = int(seconds)
    hours, remainder = divmod(seconds, 3600)
    minutes, seconds = divmod(remainder, 60)
    parts = []
    if hours > 0:
        parts.append(f"{hours}小时")
    if minutes > 0:
        parts.append(f"{minutes}分钟")
    # Always emit the seconds component when nothing else was added, so the
    # result is never empty. (The original's trailing `if parts else "0秒"`
    # fallback was unreachable because of this guard; removed as dead code.)
    if seconds > 0 or not parts:
        parts.append(f"{seconds}秒")
    return ''.join(parts)
def update_config_table(config_engine, ID, **kwargs):
    """Persist sync progress/state into one diff_sync_config row.

    Only whitelisted columns may be written; any other kwarg is silently
    ignored. ``uptime`` is always refreshed to the current timestamp.
    Errors are logged and swallowed so a bookkeeping failure cannot abort
    the sync run itself.
    """
    allowed_fields = ('vs_time_value', 'diff_rows', 'syned_diff_rows', 'syned_msg', 'flag', 'msg', 'duration')
    update_data = {'ID': ID}
    update_data.update({k: v for k, v in kwargs.items() if k in allowed_fields})
    try:
        with Session(config_engine) as session:
            # Column names come from the whitelist above, so building the SET
            # clause with f-strings is safe; values are bound parameters.
            assignments = [f"{col} = :{col}" for col in update_data if col != 'ID']
            assignments.append("uptime = current_timestamp")
            sql = f"UPDATE diff_sync_config SET {', '.join(assignments)} WHERE ID = :ID"
            session.execute(text(sql), update_data)
            session.commit()
    except Exception as e:
        logging.error(f"Failed to update config: {e}")
def check_data_consistency(source_engine, target_engine, source_db, target_db, source_table, target_table, vs_time_col, vs_tag, vs_time_value):
    """Compare row counts between source and target over a time window.

    Counts rows satisfying ``vs_time_col vs_tag vs_time_value`` (e.g.
    ``create_time < '2024-01-01'``) on both sides and reports the difference.

    Returns:
        tuple: (diff_rows, source_cc, msg) where diff_rows = target - source
        (positive means the target has extra rows), source_cc is the source
        count, and msg is a human-readable summary.

    NOTE(review): column/table names and the value are interpolated directly
    into SQL; inputs come from the trusted diff_sync_config table, but this
    is not injection-safe for arbitrary input.
    """
    # Consistency fix: backtick-quote the table names as well as the schema
    # names (the original quoted only the schema), matching the quoting used
    # elsewhere in this project and protecting reserved-word table names.
    source_cc_sql = f"SELECT COUNT(*) AS total FROM `{source_db}`.`{source_table}` WHERE {vs_time_col} {vs_tag} '{vs_time_value}'"
    target_cc_sql = f"SELECT COUNT(*) AS total FROM `{target_db}`.`{target_table}` WHERE {vs_time_col} {vs_tag} '{vs_time_value}'"
    source_cc = pd.read_sql(source_cc_sql, source_engine).iloc[0]['total']
    target_cc = pd.read_sql(target_cc_sql, target_engine).iloc[0]['total']
    diff_rows = target_cc - source_cc
    msg = f'same. ' if diff_rows == 0 else f'{"目标多" if diff_rows > 0 else "目标少"} {abs(diff_rows)}条.'
    msg += f'目标: {target_cc}, 源: {source_cc}'
    return diff_rows, source_cc, msg
三、配置表结构(必须创建)
库:ops_config
表:diff_sync_config
sql
-- Bookkeeping table for the incremental sync scripts: one row per
-- (source table -> target table) pair. The sync job reads rows with
-- is_enable=1 and writes flag/msg/diff_rows/duration/vs_time_value back
-- after each run.
CREATE TABLE `diff_sync_config` (
  `ID` int NOT NULL AUTO_INCREMENT,
  -- Source instance label, matched against the job's source_flag argument.
  `source_instance` varchar(64) COLLATE utf8mb4_general_ci NOT NULL DEFAULT '',
  `source_db` varchar(64) COLLATE utf8mb4_general_ci NOT NULL DEFAULT '',
  `source_table` varchar(80) COLLATE utf8mb4_general_ci NOT NULL DEFAULT '',
  `target_db` varchar(64) COLLATE utf8mb4_general_ci NOT NULL DEFAULT '',
  `target_table` varchar(80) COLLATE utf8mb4_general_ci NOT NULL DEFAULT '',
  -- Semicolon-separated time column(s) for the incremental window; the
  -- script defaults to 'create_time;update_time' when this is empty.
  `vs_time_col` varchar(64) COLLATE utf8mb4_general_ci NOT NULL DEFAULT '',
  -- Baseline date: rows at/after this value are treated as pending; the
  -- script advances it automatically after a successful run.
  `vs_time_value` date NOT NULL DEFAULT '2000-01-01',
  -- 1 = sync this pair, 0 = skip.
  `is_enable` tinyint NOT NULL DEFAULT '1',
  -- Last run state written by the job: ok-1-check / doing-2 / ok-2 / ok-3 /
  -- error-0 / error-2.
  `flag` varchar(18) CHARACTER SET utf8mb4 COLLATE utf8mb4_general_ci NOT NULL DEFAULT '',
  -- Pending-row count from the pre-sync check (-1 = not yet measured).
  `diff_rows` int NOT NULL DEFAULT '-1',
  `msg` varchar(300) CHARACTER SET utf8mb4 COLLATE utf8mb4_general_ci NOT NULL DEFAULT '',
  `syned_diff_rows` int NOT NULL DEFAULT '-1',
  `syned_msg` varchar(300) CHARACTER SET utf8mb4 COLLATE utf8mb4_general_ci DEFAULT NULL,
  `duration` varchar(64) COLLATE utf8mb4_general_ci NOT NULL DEFAULT '',
  `uptime` timestamp NOT NULL DEFAULT CURRENT_TIMESTAMP ON UPDATE CURRENT_TIMESTAMP,
  -- NOTE(review): not referenced by the scripts shown; presumably a
  -- dependency/ordering hint — confirm before relying on it.
  `relyon` varchar(160) COLLATE utf8mb4_general_ci NOT NULL DEFAULT '',
  PRIMARY KEY (`ID`),
  UNIQUE KEY `ux` (`source_instance`,`source_db`,`source_table`,`target_db`,`target_table`)
) ENGINE=InnoDB AUTO_INCREMENT=10235 DEFAULT CHARSET=utf8mb4 COLLATE=utf8mb4_general_ci;
四、最关键配置说明(决定是否同步)
1. 是否启用同步
is_enable = 1 启用
is_enable = 0 禁用(不同步)
2. 按哪个时间字段同步
vs_time_col = create_time
3. 从哪个时间开始同步
vs_time_value = '2026-04-01'
同步完成后会自动更新为当前日期,所以第二遍不会重复同步。