MySQL数据库容灾设计案例与SQL实现

MySQL数据库容灾设计案例与SQL实现

一、主从复制容灾方案

1. 配置主从复制

sql 复制代码
-- On the source (master): create the account that replicas connect with.
CREATE USER 'repl_user'@'%'
    IDENTIFIED BY 'SecurePass123!';
GRANT REPLICATION SLAVE
    ON *.*
    TO 'repl_user'@'%';

-- Read the source's current binary log coordinates. Record File and
-- Position; they are the values passed to CHANGE MASTER TO below.
SHOW MASTER STATUS;
-- Example output:
--   File:     mysql-bin.000001
--   Position: 107

-- On the replica (slave): point it at the source, starting from the
-- coordinates recorded above.
CHANGE MASTER TO
    MASTER_HOST     = 'master_host',
    MASTER_USER     = 'repl_user',
    MASTER_PASSWORD = 'SecurePass123!',
    MASTER_LOG_FILE = 'mysql-bin.000001',
    MASTER_LOG_POS  = 107;

-- Start the replication threads on the replica.
START SLAVE;

-- Inspect the replica's state (\G is the mysql client's vertical output terminator).
SHOW SLAVE STATUS\G

2. 自动故障转移检测

sql 复制代码
-- Monitoring table: one row per replication health check.
CREATE TABLE replication_monitor (
    id INT AUTO_INCREMENT PRIMARY KEY,
    check_time DATETIME DEFAULT CURRENT_TIMESTAMP,  -- when the check ran
    master_status VARCHAR(20),
    slave_io_running VARCHAR(3),                    -- 'Yes' / 'No'
    slave_sql_running VARCHAR(3),                   -- 'Yes' / 'No'
    seconds_behind INT,                             -- replication lag in seconds, NULL when unknown
    last_error TEXT
);

-- Alert table written by the monitoring procedures and read by the
-- dashboard view. It is referenced throughout this script but was never
-- created; IF NOT EXISTS keeps this a no-op when it already exists.
CREATE TABLE IF NOT EXISTS alert_logs (
    id INT AUTO_INCREMENT PRIMARY KEY,
    alert_time DATETIME NOT NULL DEFAULT CURRENT_TIMESTAMP,
    alert_type VARCHAR(50) NOT NULL,
    alert_message TEXT
);

-- Monitoring procedure: records the state of the default replication
-- channel in replication_monitor and writes an alert_logs row when either
-- replication thread is not running.
--
-- Takes no parameters and returns no result set.
DELIMITER //
CREATE PROCEDURE check_replication_status()
BEGIN
    DECLARE io_status VARCHAR(3);
    DECLARE sql_status VARCHAR(3);

    -- Slave_IO_Running / Slave_SQL_Running / Seconds_Behind_Master are
    -- columns of SHOW SLAVE STATUS output only; they do not exist in any
    -- performance_schema table. The thread states are exposed as
    -- SERVICE_STATE in replication_connection_status (receiver / IO thread)
    -- and replication_applier_status (applier / SQL thread).
    -- CHANNEL_NAME = '' selects the default channel so that additional
    -- channels (multi-source, group replication) cannot return extra rows.
    -- A missing row (replication not configured) is reported as 'No'.
    SET io_status = COALESCE(
        (SELECT CASE WHEN SERVICE_STATE = 'ON' THEN 'Yes' ELSE 'No' END
         FROM performance_schema.replication_connection_status
         WHERE CHANNEL_NAME = ''
         LIMIT 1),
        'No');

    SET sql_status = COALESCE(
        (SELECT CASE WHEN SERVICE_STATE = 'ON' THEN 'Yes' ELSE 'No' END
         FROM performance_schema.replication_applier_status
         WHERE CHANNEL_NAME = ''
         LIMIT 1),
        'No');

    -- Record the observation. seconds_behind is stored as NULL because the
    -- lag figure is only available from SHOW SLAVE STATUS, whose output
    -- cannot be selected into variables inside a stored procedure.
    INSERT INTO replication_monitor (master_status, slave_io_running, slave_sql_running, seconds_behind)
    VALUES ('OK', io_status, sql_status, NULL);

    -- Raise an alert when either thread is down. Both values are non-NULL
    -- here, so neither the comparison nor CONCAT can silently yield NULL.
    IF io_status <> 'Yes' OR sql_status <> 'Yes' THEN
        INSERT INTO alert_logs (alert_type, alert_message)
        VALUES ('REPLICATION_ERROR', CONCAT('Replication error - IO: ', io_status, ', SQL: ', sql_status));
    END IF;
END //
DELIMITER ;

-- Scheduled event: run the replication health check once per minute.
CREATE EVENT ev_replication_monitor
    ON SCHEDULE
        EVERY 1 MINUTE
    DO
        CALL check_replication_status();

二、MGR(MySQL Group Replication)高可用方案

1. 初始化MGR集群

sql 复制代码
-- Run on every node: create the group replication account.
-- Binary logging is switched off for the session around the account
-- statements and switched back on afterwards.
SET SQL_LOG_BIN = 0;
CREATE USER 'mgr_user'@'%'
    IDENTIFIED BY 'MgrSecurePass456!';
GRANT REPLICATION SLAVE
    ON *.*
    TO 'mgr_user'@'%';
GRANT BACKUP_ADMIN
    ON *.*
    TO 'mgr_user'@'%';
FLUSH PRIVILEGES;
SET SQL_LOG_BIN = 1;

-- Run on the first node only (the bootstrap node): create the group,
-- then turn the bootstrap flag off again straight away.
SET GLOBAL group_replication_bootstrap_group = ON;
START GROUP_REPLICATION
    USER = 'mgr_user',
    PASSWORD = 'MgrSecurePass456!';
SET GLOBAL group_replication_bootstrap_group = OFF;

-- Run on each remaining node: join the existing group.
START GROUP_REPLICATION
    USER = 'mgr_user',
    PASSWORD = 'MgrSecurePass456!';

-- Inspect group membership.
SELECT * FROM performance_schema.replication_group_members;

2. MGR故障自动处理

sql 复制代码
-- Failure-handling procedure for MySQL Group Replication.
--
-- Logs a warning to alert_logs when fewer than two members are ONLINE,
-- and restarts group replication on this node when no PRIMARY member is
-- visible so that the node tries to (re)join the group.
--
-- Takes no parameters and returns no result set.
DELIMITER //
CREATE PROCEDURE handle_mgr_failure()
BEGIN
    DECLARE member_count INT DEFAULT 0;
    DECLARE primary_member VARCHAR(255) DEFAULT NULL;

    -- Count the members this node currently sees as ONLINE.
    SELECT COUNT(*) INTO member_count
    FROM performance_schema.replication_group_members
    WHERE MEMBER_STATE = 'ONLINE';

    -- Fewer than two ONLINE members: raise an alert.
    -- NOTE(review): a member that has left the group is normally
    -- super_read_only, which would make this INSERT fail - confirm how
    -- alerts should be delivered from such a node.
    IF member_count < 2 THEN
        INSERT INTO alert_logs (alert_type, alert_message)
        VALUES ('MGR_WARNING', CONCAT('Only ', member_count, ' members online'));
    END IF;

    -- Deliberately NO automatic bootstrap when member_count = 0.
    -- This procedure runs from an event on every node; if each node that
    -- sees zero ONLINE members set group_replication_bootstrap_group = ON,
    -- every node would create its own independent group with the same name
    -- (split brain). A failed START would also leave the flag ON.
    -- Bootstrapping must be a manual, single-node operator action.

    -- Look for a PRIMARY member. A scalar subquery is used so that an empty
    -- result simply leaves primary_member NULL.
    SET primary_member = (
        SELECT MEMBER_HOST
        FROM performance_schema.replication_group_members
        WHERE MEMBER_ROLE = 'PRIMARY'
        LIMIT 1
    );

    IF primary_member IS NULL THEN
        -- No primary visible from this node: restart group replication
        -- locally so the node attempts to rejoin the group.
        -- NOTE(review): START GROUP_REPLICATION is issued without
        -- USER/PASSWORD, so distributed recovery relies on credentials
        -- already stored for the recovery channel - confirm they are set.
        STOP GROUP_REPLICATION;
        START GROUP_REPLICATION;
    END IF;
END //
DELIMITER ;

-- Scheduled event: run the group replication health check every 30 seconds.
CREATE EVENT ev_mgr_monitor
    ON SCHEDULE
        EVERY 30 SECOND
    DO
        CALL handle_mgr_failure();

三、跨机房容灾方案

1. 配置异地灾备

sql 复制代码
-- Primary site: create the account used by the disaster-recovery (DR)
-- replica, restricted to the DR host.
CREATE USER 'dr_user'@'backup_center_IP' IDENTIFIED BY 'DrPass789!';
GRANT REPLICATION SLAVE ON *.* TO 'dr_user'@'backup_center_IP';

-- DR site: configure a delayed replica so that logical mistakes
-- (accidental DROP/DELETE) on the primary are not applied immediately.
-- NOTE(review): no MASTER_LOG_FILE / MASTER_LOG_POS (or
-- MASTER_AUTO_POSITION) is given here - confirm the intended starting
-- coordinates for the DR replica.
CHANGE MASTER TO
    MASTER_HOST='primary_center_IP',
    MASTER_USER='dr_user',
    MASTER_PASSWORD='DrPass789!',
    MASTER_DELAY=3600;  -- apply events 3600 seconds (1 hour) late

START SLAVE;

-- Semisynchronous replication: the primary waits for a replica to
-- acknowledge receipt of each transaction before the commit returns.
-- On the primary:
INSTALL PLUGIN rpl_semi_sync_master SONAME 'semisync_master.so';
SET GLOBAL rpl_semi_sync_master_enabled = 1;
SET GLOBAL rpl_semi_sync_master_timeout = 30000;  -- milliseconds (30 s)

-- On the DR replica:
INSTALL PLUGIN rpl_semi_sync_slave SONAME 'semisync_slave.so';
SET GLOBAL rpl_semi_sync_slave_enabled = 1;

-- Replication was already started above, so the receiver (IO) thread must
-- be restarted; the replica only registers as semisynchronous when the
-- receiver thread connects after the plugin has been enabled.
STOP SLAVE IO_THREAD;
START SLAVE IO_THREAD;

2. 灾备切换流程

sql 复制代码
-- Step 1: stop application writes (done outside of SQL).

-- Step 2: make sure the DR replica has applied everything it received.
-- Stop fetching new events, then watch the process list until the SQL
-- thread has finished working through the relay log.
STOP SLAVE IO_THREAD;
SHOW PROCESSLIST;

-- Step 3: promote the DR replica to be the new source.
STOP SLAVE;
-- RESET MASTER discards this server's existing binary logs so that it
-- starts a fresh binary log as the new source.
RESET MASTER;
SET GLOBAL read_only = OFF;

-- Step 4: once the failed original source is back, run this on it to
-- attach it as a replica of the new source.
CHANGE MASTER TO
    MASTER_HOST     = 'new_master_IP',
    MASTER_USER     = 'dr_user',
    MASTER_PASSWORD = 'DrPass789!';

START SLAVE;

四、数据校验与修复

1. 定期数据校验

sql 复制代码
-- Verification table: one row per checked table, holding the value
-- computed on the source, the value for the target, and the comparison result.
CREATE TABLE data_checksum (
    table_name      VARCHAR(64) NOT NULL,
    source_checksum BIGINT,
    target_checksum BIGINT,
    last_check_time DATETIME,
    status          ENUM('OK', 'MISMATCH', 'NOT_CHECKED'),
    PRIMARY KEY (table_name)
);

-- Verification procedure: for every base table in the current schema,
-- stores its row count in data_checksum.source_checksum, then sets each
-- row's status by comparing source_checksum with target_checksum.
--
-- Takes no parameters and returns no result set.
-- NOTE(review): target_checksum is not written anywhere in this script;
-- it is presumably filled in from the replica / DR side by another job.
-- Rows without a target value are left as 'NOT_CHECKED'.
DELIMITER //
CREATE PROCEDURE verify_data_consistency()
BEGIN
    DECLARE done INT DEFAULT FALSE;
    DECLARE tname VARCHAR(64);
    DECLARE cur CURSOR FOR
        SELECT table_name
        FROM information_schema.tables
        WHERE table_schema = DATABASE()
          AND table_type = 'BASE TABLE';
    DECLARE CONTINUE HANDLER FOR NOT FOUND SET done = TRUE;

    OPEN cur;
    read_loop: LOOP
        FETCH cur INTO tname;
        IF done THEN
            LEAVE read_loop;
        END IF;

        -- Count the rows first, in a separate statement. The identifier
        -- cannot be bound as a parameter, so it is backtick-quoted (with
        -- embedded backticks doubled) to cope with reserved words and
        -- special characters. Counting separately also avoids selecting
        -- from data_checksum inside an INSERT into data_checksum when the
        -- cursor reaches that table itself.
        SET @count_sql = CONCAT(
            'SELECT COUNT(*) INTO @row_count FROM `',
            REPLACE(tname, '`', '``'),
            '`');
        PREPARE stmt FROM @count_sql;
        EXECUTE stmt;
        DEALLOCATE PREPARE stmt;

        -- Upsert the source-side value; an existing row keeps its
        -- target_checksum and has its status recomputed below.
        INSERT INTO data_checksum (table_name, source_checksum, last_check_time, status)
        VALUES (tname, @row_count, NOW(), 'NOT_CHECKED')
        ON DUPLICATE KEY UPDATE
            source_checksum = VALUES(source_checksum),
            last_check_time = VALUES(last_check_time);
    END LOOP;
    CLOSE cur;

    -- Flag inconsistent tables by comparing the two stored values.
    -- (The previous version compared the row count with the number of
    -- information_schema.tables rows per table name, which is always 1.)
    -- Intentionally no WHERE clause: every row's status is recomputed.
    UPDATE data_checksum
    SET status = CASE
        WHEN source_checksum IS NULL OR target_checksum IS NULL THEN 'NOT_CHECKED'
        WHEN source_checksum = target_checksum THEN 'OK'
        ELSE 'MISMATCH'
    END;
END //
DELIMITER ;

-- Scheduled event: run the data consistency check every 6 hours.
CREATE EVENT ev_data_verification
    ON SCHEDULE
        EVERY 6 HOUR
    DO
        CALL verify_data_consistency();

五、自动化容灾演练

sql 复制代码
-- Drill log table: one row per disaster-recovery drill run.
CREATE TABLE disaster_recovery_drill (
    drill_id   INT NOT NULL AUTO_INCREMENT,
    start_time DATETIME NOT NULL,
    end_time   DATETIME,
    scenario   VARCHAR(50) NOT NULL,
    status     ENUM('RUNNING', 'SUCCESS', 'FAILED') DEFAULT 'RUNNING',
    details    TEXT,
    PRIMARY KEY (drill_id)
);

-- Drill procedure: logs a drill run in disaster_recovery_drill, performs
-- the steps for the requested scenario, and records the outcome on that
-- drill's row only.
--
-- Parameters:
--   scenario  'MASTER_FAILOVER' or 'DATACENTER_FAILURE'; any other value is
--             recorded as FAILED with details 'Unknown scenario'.
DELIMITER //
CREATE PROCEDURE execute_drill(IN scenario VARCHAR(50))
BEGIN
    -- Locals carry a v_ prefix so they can never shadow a column name.
    -- The previous "WHERE drill_id = drill_id" compared the local variable
    -- with itself and therefore updated every row in the table.
    DECLARE v_drill_start DATETIME DEFAULT NOW();
    DECLARE v_drill_id INT;
    DECLARE v_kill_id BIGINT;

    -- Record the start of the drill.
    INSERT INTO disaster_recovery_drill (start_time, scenario, status)
    VALUES (v_drill_start, scenario, 'RUNNING');
    SET v_drill_id = LAST_INSERT_ID();

    -- Dispatch on the requested scenario.
    CASE scenario
        WHEN 'MASTER_FAILOVER' THEN
            -- Simulate a source failure by killing one application session.
            SET v_kill_id = (
                SELECT ID
                FROM information_schema.processlist
                WHERE USER = 'app_user'
                LIMIT 1
            );

            IF v_kill_id IS NULL THEN
                -- No session to kill: CONCAT('KILL ', NULL) is NULL and
                -- PREPARE would fail, so record the reason instead.
                UPDATE disaster_recovery_drill AS d
                SET d.status = 'FAILED',
                    d.end_time = NOW(),
                    d.details = 'No app_user session found to kill'
                WHERE d.drill_id = v_drill_id;
            ELSE
                SET @drill_kill_sql = CONCAT('KILL ', v_kill_id);
                PREPARE stmt FROM @drill_kill_sql;
                EXECUTE stmt;
                DEALLOCATE PREPARE stmt;

                -- Placeholder: add the actual failover logic here.

                UPDATE disaster_recovery_drill AS d
                SET d.status = 'SUCCESS',
                    d.end_time = NOW()
                WHERE d.drill_id = v_drill_id;
            END IF;

        WHEN 'DATACENTER_FAILURE' THEN
            -- Placeholder: data-centre failure simulation is not written
            -- yet. A CASE branch may not have an empty statement list, so
            -- close the drill row explicitly instead of leaving it RUNNING.
            UPDATE disaster_recovery_drill AS d
            SET d.status = 'FAILED',
                d.end_time = NOW(),
                d.details = 'Scenario not implemented'
            WHERE d.drill_id = v_drill_id;

        ELSE
            UPDATE disaster_recovery_drill AS d
            SET d.status = 'FAILED',
                d.end_time = NOW(),
                d.details = 'Unknown scenario'
            WHERE d.drill_id = v_drill_id;
    END CASE;
END //
DELIMITER ;

六、监控与告警系统

sql 复制代码
-- Dashboard view: a single row of disaster-recovery health indicators.
CREATE VIEW disaster_recovery_status AS
SELECT
    -- Recorded checks in which either replication thread was 'No'.
    (
        SELECT COUNT(*)
        FROM replication_monitor
        WHERE slave_io_running = 'No'
           OR slave_sql_running = 'No'
    ) AS replication_errors,
    -- Group members whose state is anything other than ONLINE.
    (
        SELECT COUNT(*)
        FROM performance_schema.replication_group_members
        WHERE MEMBER_STATE <> 'ONLINE'
    ) AS mgr_offline_members,
    -- Tables currently flagged as inconsistent.
    (
        SELECT COUNT(*)
        FROM data_checksum
        WHERE status = 'MISMATCH'
    ) AS data_mismatches,
    -- Age in minutes of the oldest per-table check.
    (
        SELECT MAX(TIMESTAMPDIFF(MINUTE, last_check_time, NOW()))
        FROM data_checksum
    ) AS minutes_since_last_check,
    -- Alerts raised within the last hour.
    (
        SELECT COUNT(*)
        FROM alert_logs
        WHERE alert_time > NOW() - INTERVAL 1 HOUR
    ) AS recent_alerts;

以上SQL语句给出了一套MySQL数据库容灾方案的参考实现,涵盖主从复制、MGR高可用、异地灾备、数据校验和自动化演练等功能模块。实际应用时,需要根据具体环境调整主机地址、用户名和密码等参数,并先在测试环境中充分验证,再用于生产环境。

相关推荐
码农黛兮_4632 分钟前
MySQL 数据库集群部署、性能优化及高可用架构设计
数据库·mysql·性能优化
AllenO.o1 小时前
Redis五种数据结构详解
java·数据结构·数据库·redis·缓存
消失在人海中1 小时前
数据分析基础:需要掌握的入门知识
数据库·人工智能·数据分析
闪电麦坤951 小时前
SQL:SELF JOIN(自连接)与CROSS JOIN(交叉连接)
数据库·sql·mysql
码农飞哥2 小时前
互联网大厂Java面试实战:从Spring Boot到微服务的技术问答与解析
java·数据库·spring boot·安全·微服务·面试·电商
周盛欢2 小时前
数据库故障排查指南
数据库·mysql
计算机学姐2 小时前
基于SpringBoot的在线教育管理系统
java·vue.js·spring boot·后端·mysql·spring·mybatis
onkel in blog2 小时前
【Docker】Docker Compose方式搭建分布式内存数据库(Redis)集群
数据库·redis·分布式·docker
Minyy112 小时前
“爱生活”小项目问题总结
java·数据库·spring boot·spring·maven·intellij-idea
大G哥3 小时前
【SQL 周周练】爬取短视频发现数据缺失,如何用 SQL 填充
数据库·sql