MySQL数据库容灾设计案例与SQL实现
一、主从复制容灾方案
1. 配置主从复制
sql
-- 在主库执行(创建复制账号)
CREATE USER 'repl_user'@'%' IDENTIFIED BY 'SecurePass123!';
GRANT REPLICATION SLAVE ON *.* TO 'repl_user'@'%';
-- 查看主库状态(记录File和Position)
SHOW MASTER STATUS;
-- 示例输出:
-- File: mysql-bin.000001
-- Position: 107
-- 在从库执行配置
CHANGE MASTER TO
MASTER_HOST='master_host',
MASTER_USER='repl_user',
MASTER_PASSWORD='SecurePass123!',
MASTER_LOG_FILE='mysql-bin.000001',
MASTER_LOG_POS=107;
-- 启动从库复制
START SLAVE;
-- 检查从库状态
SHOW SLAVE STATUS\G
2. 自动故障转移检测
sql
-- 创建监控表
CREATE TABLE replication_monitor (
id INT AUTO_INCREMENT PRIMARY KEY,
check_time DATETIME DEFAULT CURRENT_TIMESTAMP,
master_status VARCHAR(20),
slave_io_running VARCHAR(3),
slave_sql_running VARCHAR(3),
seconds_behind INT,
last_error TEXT
);
-- 创建监控存储过程
DELIMITER //
CREATE PROCEDURE check_replication_status()
BEGIN
DECLARE io_status VARCHAR(3);
DECLARE sql_status VARCHAR(3);
DECLARE behind INT;
-- 获取从库状态
SELECT Slave_IO_Running, Slave_SQL_Running, Seconds_Behind_Master
INTO io_status, sql_status, behind
FROM performance_schema.replication_applier_status;
-- 记录状态
INSERT INTO replication_monitor (master_status, slave_io_running, slave_sql_running, seconds_behind)
VALUES ('OK', io_status, sql_status, behind);
-- 如果复制中断,发送警报
IF io_status != 'Yes' OR sql_status != 'Yes' THEN
INSERT INTO alert_logs (alert_type, alert_message)
VALUES ('REPLICATION_ERROR', CONCAT('Replication error - IO: ', io_status, ', SQL: ', sql_status));
END IF;
END //
DELIMITER ;
-- 创建定时事件
CREATE EVENT ev_replication_monitor
ON SCHEDULE EVERY 1 MINUTE
DO CALL check_replication_status();
二、MGR(MySQL Group Replication)高可用方案
1. 初始化MGR集群
sql
-- 在所有节点执行基础配置
SET SQL_LOG_BIN=0;
CREATE USER 'mgr_user'@'%' IDENTIFIED BY 'MgrSecurePass456!';
GRANT REPLICATION SLAVE ON *.* TO 'mgr_user'@'%';
GRANT BACKUP_ADMIN ON *.* TO 'mgr_user'@'%';
FLUSH PRIVILEGES;
SET SQL_LOG_BIN=1;
-- 配置第一个节点(引导节点)
SET GLOBAL group_replication_bootstrap_group=ON;
START GROUP_REPLICATION USER='mgr_user', PASSWORD='MgrSecurePass456!';
SET GLOBAL group_replication_bootstrap_group=OFF;
-- 其他节点加入集群
START GROUP_REPLICATION USER='mgr_user', PASSWORD='MgrSecurePass456!';
-- 查看集群状态
SELECT * FROM performance_schema.replication_group_members;
2. MGR故障自动处理
sql
-- 创建故障处理存储过程
DELIMITER //
CREATE PROCEDURE handle_mgr_failure()
BEGIN
DECLARE member_count INT;
DECLARE primary_member VARCHAR(255);
-- 检查集群成员数量
SELECT COUNT(*) INTO member_count
FROM performance_schema.replication_group_members
WHERE MEMBER_STATE = 'ONLINE';
-- 如果少于2个在线成员,触发警报
IF member_count < 2 THEN
INSERT INTO alert_logs (alert_type, alert_message)
VALUES ('MGR_WARNING', CONCAT('Only ', member_count, ' members online'));
-- 尝试自动恢复
IF member_count = 0 THEN
SET GLOBAL group_replication_bootstrap_group=ON;
START GROUP_REPLICATION;
SET GLOBAL group_replication_bootstrap_group=OFF;
END IF;
END IF;
-- 检查主节点是否存在
SELECT MEMBER_HOST INTO primary_member
FROM performance_schema.replication_group_members
WHERE MEMBER_ROLE = 'PRIMARY' LIMIT 1;
IF primary_member IS NULL THEN
-- 触发选举新主节点
STOP GROUP_REPLICATION;
START GROUP_REPLICATION;
END IF;
END //
DELIMITER ;
-- 创建监控事件
CREATE EVENT ev_mgr_monitor
ON SCHEDULE EVERY 30 SECOND
DO CALL handle_mgr_failure();
三、跨机房容灾方案
1. 配置异地灾备
sql
-- 主中心配置
CREATE USER 'dr_user'@'backup_center_IP' IDENTIFIED BY 'DrPass789!';
GRANT REPLICATION SLAVE ON *.* TO 'dr_user'@'backup_center_IP';
-- 灾备中心配置(延迟复制,避免逻辑错误传播)
CHANGE MASTER TO
MASTER_HOST='primary_center_IP',
MASTER_USER='dr_user',
MASTER_PASSWORD='DrPass789!',
MASTER_DELAY=3600; -- 延迟1小时
START SLAVE;
-- 配置半同步复制(确保数据安全)
-- 在主库执行
INSTALL PLUGIN rpl_semi_sync_master SONAME 'semisync_master.so';
SET GLOBAL rpl_semi_sync_master_enabled = 1;
SET GLOBAL rpl_semi_sync_master_timeout = 30000; -- 30秒超时
-- 在灾备库执行
INSTALL PLUGIN rpl_semi_sync_slave SONAME 'semisync_slave.so';
SET GLOBAL rpl_semi_sync_slave_enabled = 1;
2. 灾备切换流程
sql
-- 1. 停止应用写入
-- 2. 确保所有数据同步完成
STOP SLAVE IO_THREAD;
SHOW PROCESSLIST; -- 确保所有SQL线程完成
-- 3. 提升灾备库为主库
STOP SLAVE;
RESET MASTER; -- 清除复制信息
SET GLOBAL read_only = OFF;
-- 4. 在原主库配置为新从库(故障恢复后)
CHANGE MASTER TO
MASTER_HOST='new_master_IP',
MASTER_USER='dr_user',
MASTER_PASSWORD='DrPass789!';
START SLAVE;
四、数据校验与修复
1. 定期数据校验
sql
-- 创建校验表
CREATE TABLE data_checksum (
table_name VARCHAR(64) PRIMARY KEY,
source_checksum BIGINT,
target_checksum BIGINT,
last_check_time DATETIME,
status ENUM('OK', 'MISMATCH', 'NOT_CHECKED')
);
-- 创建校验存储过程
DELIMITER //
CREATE PROCEDURE verify_data_consistency()
BEGIN
DECLARE done INT DEFAULT FALSE;
DECLARE tname VARCHAR(64);
DECLARE cur CURSOR FOR SELECT table_name FROM information_schema.tables
WHERE table_schema = DATABASE() AND table_type = 'BASE TABLE';
DECLARE CONTINUE HANDLER FOR NOT FOUND SET done = TRUE;
OPEN cur;
read_loop: LOOP
FETCH cur INTO tname;
IF done THEN
LEAVE read_loop;
END IF;
SET @sql = CONCAT('INSERT INTO data_checksum (table_name, source_checksum, last_check_time, status)
VALUES (?, (SELECT COUNT(*) FROM ', tname, '), NOW(), ''NOT_CHECKED'')
ON DUPLICATE KEY UPDATE source_checksum = VALUES(source_checksum),
last_check_time = VALUES(last_check_time)');
PREPARE stmt FROM @sql;
EXECUTE stmt USING tname;
DEALLOCATE PREPARE stmt;
END LOOP;
CLOSE cur;
-- 标记不一致的表
UPDATE data_checksum dc
JOIN (SELECT table_name, COUNT(*) AS cnt
FROM information_schema.tables
WHERE table_schema = DATABASE()
GROUP BY table_name) t
ON dc.table_name = t.table_name
SET dc.status = IF(dc.source_checksum = t.cnt, 'OK', 'MISMATCH');
END //
DELIMITER ;
-- 创建定时校验任务
CREATE EVENT ev_data_verification
ON SCHEDULE EVERY 6 HOUR
DO CALL verify_data_consistency();
五、自动化容灾演练
sql
-- 创建演练日志表
CREATE TABLE disaster_recovery_drill (
drill_id INT AUTO_INCREMENT PRIMARY KEY,
start_time DATETIME NOT NULL,
end_time DATETIME,
scenario VARCHAR(50) NOT NULL,
status ENUM('RUNNING', 'SUCCESS', 'FAILED') DEFAULT 'RUNNING',
details TEXT
);
-- 创建演练存储过程
DELIMITER //
CREATE PROCEDURE execute_drill(IN scenario VARCHAR(50))
BEGIN
DECLARE drill_start DATETIME DEFAULT NOW();
DECLARE drill_id INT;
-- 记录演练开始
INSERT INTO disaster_recovery_drill (start_time, scenario, status)
VALUES (drill_start, scenario, 'RUNNING');
SET drill_id = LAST_INSERT_ID();
-- 根据场景执行不同操作
CASE scenario
WHEN 'MASTER_FAILOVER' THEN
-- 模拟主库故障
SET @kill_id = (SELECT ID FROM information_schema.processlist
WHERE USER = 'app_user' LIMIT 1);
SET @sql = CONCAT('KILL ', @kill_id);
PREPARE stmt FROM @sql;
EXECUTE stmt;
DEALLOCATE PREPARE stmt;
-- 这里添加实际的故障转移逻辑
-- ...
UPDATE disaster_recovery_drill
SET status = 'SUCCESS', end_time = NOW()
WHERE drill_id = drill_id;
WHEN 'DATACENTER_FAILURE' THEN
-- 模拟数据中心故障
-- ...
ELSE
UPDATE disaster_recovery_drill
SET status = 'FAILED', end_time = NOW(),
details = 'Unknown scenario'
WHERE drill_id = drill_id;
END CASE;
END //
DELIMITER ;
六、监控与告警系统
sql
-- 创建监控仪表板视图
CREATE VIEW disaster_recovery_status AS
SELECT
(SELECT COUNT(*) FROM replication_monitor WHERE slave_io_running = 'No' OR slave_sql_running = 'No') AS replication_errors,
(SELECT COUNT(*) FROM performance_schema.replication_group_members WHERE MEMBER_STATE != 'ONLINE') AS mgr_offline_members,
(SELECT COUNT(*) FROM data_checksum WHERE status = 'MISMATCH') AS data_mismatches,
(SELECT MAX(TIMESTAMPDIFF(MINUTE, last_check_time, NOW())) FROM data_checksum) AS minutes_since_last_check,
(SELECT COUNT(*) FROM alert_logs WHERE alert_time > DATE_SUB(NOW(), INTERVAL 1 HOUR)) AS recent_alerts;
以上SQL语句实现了一个完整的MySQL数据库容灾方案,包括主从复制、MGR高可用、异地灾备、数据校验和自动化演练等功能模块。实际应用时,需要根据具体环境调整IP地址、用户名密码等参数。