MySQL主从架构深度解析:原理、优化与实践指南
文档名称
MySQL高可用架构:主从复制原理与读写分离实践
一、核心概念体系
1.1 主从复制基础架构
sql
-- 架构示意图
Master (主库) → Binary Log → Relay Log → Slave (从库)
      ↓                                       ↓
Write Operations                       Read Operations
1.2 复制类型对比
| 复制类型 | 数据一致性 | 性能影响 | 适用场景 |
|---|---|---|---|
| 异步复制 | 最终一致 | 低延迟 | 读写分离 |
| 半同步复制 | 较强一致 | 中等延迟 | 金融交易 |
| 全同步复制 | 强一致 | 高延迟 | 数据强一致 |
二、主从复制工作原理
2.1 二进制日志(Binlog)格式
sql
-- Check which binlog format is currently in use
SHOW VARIABLES WHERE Variable_name = 'binlog_format';
-- The three formats compared:
--   STATEMENT : logs the SQL statements
--   ROW       : logs the changed row data (recommended)
--   MIXED     : mixed mode
2.2 复制工作流程
sql
-- Master side: example configuration
-- my.cnf settings
[mysqld]
server-id        = 1
log-bin          = mysql-bin
binlog_format    = ROW
sync_binlog      = 1
max_binlog_size  = 100M
expire_logs_days = 7
-- Account used by replicas to connect for replication
CREATE USER 'repl'@'%'
    IDENTIFIED BY 'Repl123456';
GRANT REPLICATION SLAVE
    ON *.*
    TO 'repl'@'%';
-- Read the master's current binlog file and position
SHOW MASTER STATUS;
/*
Sample output:
+------------------+----------+--------------+------------------+
| File             | Position | Binlog_Do_DB | Binlog_Ignore_DB |
+------------------+----------+--------------+------------------+
| mysql-bin.000001 |      107 |              |                  |
+------------------+----------+--------------+------------------+
*/
2.3 Slave端配置
sql
-- Replica side: my.cnf settings
[mysqld]
server-id         = 2
read_only         = 1
relay-log         = mysql-relay-bin
log_slave_updates = 1
-- Point the replica at the master; file and position are the values
-- reported by SHOW MASTER STATUS on the master
CHANGE MASTER TO
    MASTER_HOST             = 'master_host',
    MASTER_PORT             = 3306,
    MASTER_USER             = 'repl',
    MASTER_PASSWORD         = 'Repl123456',
    MASTER_LOG_FILE         = 'mysql-bin.000001',
    MASTER_LOG_POS          = 107,
    MASTER_CONNECT_RETRY    = 60,
    MASTER_HEARTBEAT_PERIOD = 10;
-- Start the replication threads
START SLAVE;
-- Inspect replication status
SHOW SLAVE STATUS\G
/*
Fields to watch:
Slave_IO_Running: Yes        # state of the I/O thread
Slave_SQL_Running: Yes       # state of the SQL thread
Seconds_Behind_Master: 0     # replication lag in seconds
Last_IO_Error:               # most recent I/O error
Last_SQL_Error:              # most recent SQL error
*/
三、主从延迟分析与优化
3.1 延迟原因深度分析
3.1.1 硬件资源瓶颈
sql
-- Queries for spotting resource bottlenecks
-- Disk I/O: InnoDB data read/write counters
SHOW GLOBAL STATUS LIKE 'Innodb_data_%';
-- Network latency / clock check
-- Run on the master:
SELECT NOW();
-- Run the same statement on the replica and compare the two timestamps
-- CPU load: list the currently running threads
SHOW PROCESSLIST;
3.1.2 配置参数影响
sql
-- Check the key parameters that influence replication lag
SHOW VARIABLES LIKE '%sync_binlog%'; -- recommended: 1
SHOW VARIABLES LIKE '%innodb_flush_log_at_trx_commit%'; -- recommended: 1
SHOW VARIABLES LIKE '%slave_parallel_workers%'; -- parallel replication worker count
SHOW VARIABLES LIKE '%slave_parallel_type%'; -- expected: LOGICAL_CLOCK
3.2 延迟监控方案
sql
-- Lag-monitoring history table: one row per replication health sample.
CREATE TABLE replication_monitor (
    id BIGINT AUTO_INCREMENT PRIMARY KEY,
    check_time DATETIME DEFAULT CURRENT_TIMESTAMP,  -- when the sample was taken
    slave_host VARCHAR(255),                        -- wide enough for a full host name
    seconds_behind_master INT,                      -- NULL when the lag is unknown
    -- Thread state columns are wider than 'Yes'/'No' because the I/O thread
    -- state reported by SHOW SLAVE STATUS can also be 'Connecting'.
    slave_io_running VARCHAR(16),
    slave_sql_running VARCHAR(16),
    last_io_error TEXT,
    last_sql_error TEXT,
    KEY idx_check_time (check_time)                 -- supports time-range queries and purging
);
-- Monitoring procedure: samples the default replication channel and appends
-- one row to replication_monitor.
--
-- SHOW SLAVE STATUS fields (Seconds_Behind_Master, Slave_IO_Running, ...) are
-- not columns of any performance_schema table, so the values are derived from
-- the replication_* tables instead:
--   * I/O thread state and error : replication_connection_status
--   * SQL thread state           : replication_applier_status
--   * SQL error and lag          : replication_applier_status_by_worker
-- The lag calculation relies on the APPLYING_TRANSACTION_* timestamp columns,
-- which require MySQL 8.0; on 5.7 use a heartbeat table (e.g. pt-heartbeat).
DELIMITER $$
CREATE PROCEDURE monitor_replication_lag()
BEGIN
    DECLARE v_seconds_behind_master INT DEFAULT NULL;
    DECLARE v_slave_io_running VARCHAR(3) DEFAULT 'No';
    DECLARE v_slave_sql_running VARCHAR(3) DEFAULT 'No';
    DECLARE v_last_io_error TEXT;
    DECLARE v_last_sql_error TEXT;

    -- Receiver (I/O) thread. Aggregates guarantee exactly one result row even
    -- when replication is not configured, so SELECT ... INTO never fails.
    -- Any state other than ON (OFF, CONNECTING) is recorded as 'No'.
    SELECT
        COALESCE(MAX(CASE WHEN SERVICE_STATE = 'ON' THEN 'Yes' ELSE 'No' END), 'No'),
        COALESCE(MAX(LAST_ERROR_MESSAGE), '')
    INTO
        v_slave_io_running,
        v_last_io_error
    FROM performance_schema.replication_connection_status
    WHERE CHANNEL_NAME = '';

    -- Applier (SQL) thread state.
    SELECT
        COALESCE(MAX(CASE WHEN SERVICE_STATE = 'ON' THEN 'Yes' ELSE 'No' END), 'No')
    INTO
        v_slave_sql_running
    FROM performance_schema.replication_applier_status
    WHERE CHANNEL_NAME = '';

    -- With parallel replication there is one row per worker, so collapse them:
    -- keep any non-empty error and the age of the oldest transaction that is
    -- still being applied.
    SELECT
        COALESCE(MAX(NULLIF(LAST_ERROR_MESSAGE, '')), ''),
        MAX(CASE
                WHEN APPLYING_TRANSACTION <> ''
                THEN TIMESTAMPDIFF(SECOND, APPLYING_TRANSACTION_ORIGINAL_COMMIT_TIMESTAMP, NOW(6))
            END)
    INTO
        v_last_sql_error,
        v_seconds_behind_master
    FROM performance_schema.replication_applier_status_by_worker
    WHERE CHANNEL_NAME = '';

    -- Mirror SHOW SLAVE STATUS: lag is NULL while the SQL thread is stopped,
    -- and 0 when it is running with nothing left to apply.
    IF v_slave_sql_running = 'Yes' THEN
        SET v_seconds_behind_master = GREATEST(COALESCE(v_seconds_behind_master, 0), 0);
    ELSE
        SET v_seconds_behind_master = NULL;
    END IF;

    INSERT INTO replication_monitor
        (slave_host, seconds_behind_master, slave_io_running,
         slave_sql_running, last_io_error, last_sql_error)
    VALUES
        (@@hostname, v_seconds_behind_master, v_slave_io_running,
         v_slave_sql_running, v_last_io_error, v_last_sql_error);
END$$
DELIMITER ;
-- Scheduled event: call monitor_replication_lag() every 30 seconds
CREATE EVENT IF NOT EXISTS monitor_replication_event
    ON SCHEDULE
        EVERY 30 SECOND
    ON COMPLETION NOT PRESERVE
    ENABLE
    DO
        CALL monitor_replication_lag();
3.3 优化策略实践
3.3.1 并行复制配置
sql
-- Parallel replication setup (MySQL 5.7+)
-- Replication is stopped while the applier settings are changed, then restarted
STOP SLAVE;
SET GLOBAL slave_parallel_type = 'LOGICAL_CLOCK';
SET GLOBAL slave_parallel_workers = 8; -- tune to the number of CPU cores
START SLAVE;
-- Inspect the parallel replication settings and the per-worker applier state
SHOW VARIABLES LIKE 'slave_parallel%';
SELECT * FROM performance_schema.replication_applier_status_by_worker;
3.3.2 大事务优化
sql
-- 1. Split large transactions
-- Not recommended: one huge transaction
START TRANSACTION;
INSERT INTO large_table SELECT * FROM huge_source; -- millions of rows
COMMIT;
-- Recommended: commit in batches, walking the source by key range.
-- A bare "LIMIT 10000" repeated in a loop would copy the same rows every
-- time, so each batch is bounded by an explicit key interval instead.
-- NOTE(review): assumes huge_source has an increasing numeric key column `id`
-- with values > 0 — adjust the column name and start value to the real schema.
SET @last_id = 0;
-- Repeat the block below (from application code or a stored procedure loop)
-- until @batch_end comes back NULL, i.e. no rows are left.
SELECT MAX(id) INTO @batch_end
FROM (
    SELECT id
    FROM huge_source
    WHERE id > @last_id
    ORDER BY id
    LIMIT 10000
) AS next_batch;
START TRANSACTION;
INSERT INTO large_table
SELECT *
FROM huge_source
WHERE id > @last_id
  AND id <= @batch_end;
COMMIT;
SET @last_id = @batch_end;
-- 2. 使用pt-online-schema-change避免DDL锁表
-- 示例命令
pt-online-schema-change \
--alter="ADD INDEX idx_name (name)" \
D=test,t=large_table \
--execute
3.3.3 网络优化配置
sql
-- Replica side: connection handling
-- Seconds the replica waits for data from the master before it treats the
-- connection as broken and reconnects
SET GLOBAL slave_net_timeout = 60;
SET GLOBAL slave_compressed_protocol = ON; -- compress master/replica traffic
-- Master side: larger binlog cache and binlog file size.
-- SET does not accept K/M/G suffixes (those are only valid in my.cnf or on
-- the command line), so the sizes are written as byte expressions.
SET GLOBAL binlog_cache_size = 4 * 1024 * 1024;   -- 4M
SET GLOBAL max_binlog_size = 512 * 1024 * 1024;   -- 512M
四、读写分离实现方案
4.1 应用层分离方案
java
// Spring Boot + MyBatis multi-data-source configuration example
/**
 * Declares the master and slave data sources plus a routing data source that
 * dispatches between them.
 *
 * The routing bean is the {@code @Primary} one: components that inject a plain
 * {@code DataSource} then go through the router. With {@code @Primary} on the
 * master bean they would always talk to the master and the routing would
 * never be used.
 */
@Configuration
public class DataSourceConfig {

    /** Master (write) data source, bound to spring.datasource.master.*. */
    @Bean(name = "masterDataSource")
    @ConfigurationProperties(prefix = "spring.datasource.master")
    public DataSource masterDataSource() {
        return DataSourceBuilder.create().build();
    }

    /** Slave (read) data source, bound to spring.datasource.slave.*. */
    @Bean(name = "slaveDataSource")
    @ConfigurationProperties(prefix = "spring.datasource.slave")
    public DataSource slaveDataSource() {
        return DataSourceBuilder.create().build();
    }

    /**
     * Routing data source keyed by "master" / "slave"; falls back to the
     * master when no lookup key is set.
     */
    @Primary
    @Bean(name = "dynamicDataSource")
    public DataSource dynamicDataSource() {
        DataSource master = masterDataSource();
        DataSource slave = slaveDataSource();

        Map<Object, Object> targetDataSources = new HashMap<>();
        targetDataSources.put("master", master);
        targetDataSources.put("slave", slave);

        DynamicDataSource dynamicDataSource = new DynamicDataSource();
        dynamicDataSource.setTargetDataSources(targetDataSources);
        dynamicDataSource.setDefaultTargetDataSource(master);
        return dynamicDataSource;
    }
}
// Custom routing data source
/**
 * Routing data source whose lookup key is whatever
 * DynamicDataSourceContextHolder currently reports.
 */
public class DynamicDataSource extends AbstractRoutingDataSource {

    @Override
    protected Object determineCurrentLookupKey() {
        // The holder carries the key chosen via the data source annotation
        final Object lookupKey = DynamicDataSourceContextHolder.getDataSourceType();
        return lookupKey;
    }
}
// Custom annotation
/**
 * Names the data source key for a method or type; the key defaults to
 * "master" when no value is given.
 */
@Documented
@Retention(RetentionPolicy.RUNTIME)
@Target({ElementType.TYPE, ElementType.METHOD})
public @interface DataSource {

    /** Data source key, e.g. "master" or "slave". */
    String value() default "master";
}
// Usage example
/**
 * Example service: writes are annotated for the master, reads for the slave.
 */
@Service
public class UserService {

    // Mapper used by both methods; injected through the constructor
    private final UserMapper userMapper;

    public UserService(UserMapper userMapper) {
        this.userMapper = userMapper;
    }

    /** Inserts a user; write operation, routed to the master. */
    @DataSource("master")
    public void createUser(User user) {
        userMapper.insert(user);
    }

    /** Loads a user by id; read operation, routed to the slave. */
    @DataSource("slave")
    public User getUserById(Long id) {
        return userMapper.selectById(id);
    }
}
4.2 中间件方案(ProxySQL示例)
sql
-- Run these statements on the ProxySQL admin interface.
-- 1. Register the backend servers
INSERT INTO mysql_servers (hostgroup_id, hostname, port)
VALUES
    (10, 'master_host', 3306), -- writer hostgroup
    (20, 'slave1_host', 3306), -- reader hostgroup
    (20, 'slave2_host', 3306);
-- 2. Configure the monitoring account
UPDATE global_variables
SET variable_value = 'monitor'
WHERE variable_name = 'mysql-monitor_username';
UPDATE global_variables
SET variable_value = 'monitor_password'
WHERE variable_name = 'mysql-monitor_password';
-- 3. Configure the routing rules (evaluated in rule_id order)
-- Writes and locking reads go to hostgroup 10
INSERT INTO mysql_query_rules (rule_id, active, match_digest, destination_hostgroup, apply)
VALUES
    (1, 1, '^INSERT', 10, 1),
    (2, 1, '^UPDATE', 10, 1),
    (3, 1, '^DELETE', 10, 1),
    (4, 1, '^SELECT.*FOR UPDATE', 10, 1);
-- Remaining reads go to hostgroup 20
INSERT INTO mysql_query_rules (rule_id, active, match_digest, destination_hostgroup, apply)
VALUES
    (5, 1, '^SELECT', 20, 1);
-- 4. Create the application user (anything not matched by a rule goes to hostgroup 10)
INSERT INTO mysql_users (username, password, default_hostgroup)
VALUES ('app_user', 'password', 10);
-- 5. Activate and persist the configuration.
-- Every edited table has to be loaded to runtime and saved to disk; this
-- includes the users and the monitor variables, otherwise those changes never
-- take effect and are lost on restart.
LOAD MYSQL SERVERS TO RUNTIME;
SAVE MYSQL SERVERS TO DISK;
LOAD MYSQL USERS TO RUNTIME;
SAVE MYSQL USERS TO DISK;
LOAD MYSQL QUERY RULES TO RUNTIME;
SAVE MYSQL QUERY RULES TO DISK;
LOAD MYSQL VARIABLES TO RUNTIME;
SAVE MYSQL VARIABLES TO DISK;
4.3 延迟感知路由
java
// Lag-aware load-balancing strategy
/**
 * Picks a read data source among the slaves whose replication lag is within
 * {@link #MAX_LAG_THRESHOLD}, weighting low-lag slaves higher, and falls back
 * to the master when no slave qualifies.
 */
public class LagAwareLoadBalance {

    private static final int MAX_LAG_THRESHOLD = 5; // maximum tolerated lag, in seconds

    // Fallback target when every slave is lagging too much
    private final DataSource masterDataSource;
    private final List<SlaveInfo> slaves;

    /**
     * @param masterDataSource data source returned when no slave is usable
     * @param slaves           candidate slaves; the list is copied, the
     *                         SlaveInfo objects themselves are shared
     */
    public LagAwareLoadBalance(DataSource masterDataSource, List<SlaveInfo> slaves) {
        this.masterDataSource = masterDataSource;
        this.slaves = new ArrayList<>(slaves);
    }

    /**
     * @return a slave data source chosen by lag-weighted random selection, or
     *         the master data source when all slaves exceed the threshold
     */
    public DataSource selectSlave() {
        // NOTE(review): a negative lag is treated as "unknown / not replicating"
        // and excluded; otherwise it would receive the highest weight below.
        // Confirm how SlaveInfo represents a stopped replica.
        List<SlaveInfo> availableSlaves = slaves.stream()
                .filter(s -> s.getLagSeconds() >= 0 && s.getLagSeconds() <= MAX_LAG_THRESHOLD)
                .collect(Collectors.toList());
        if (availableSlaves.isEmpty()) {
            // Every slave is lagging too much: degrade to the master
            return masterDataSource;
        }
        // Weighted random choice (lower lag => higher weight)
        return selectByWeight(availableSlaves);
    }

    /** Weighted random pick; {@code candidates} must be non-empty. */
    private DataSource selectByWeight(List<SlaveInfo> candidates) {
        // Compute each weight once so the total and the walk below agree even
        // if a lag value changes concurrently.
        int[] weights = new int[candidates.size()];
        int totalWeight = 0;
        for (int i = 0; i < weights.length; i++) {
            weights[i] = calculateWeight(candidates.get(i).getLagSeconds());
            totalWeight += weights[i];
        }
        // Every weight is >= 1, so totalWeight is strictly positive here
        int random = java.util.concurrent.ThreadLocalRandom.current().nextInt(totalWeight);
        int current = 0;
        for (int i = 0; i < weights.length; i++) {
            current += weights[i];
            if (random < current) {
                return candidates.get(i).getDataSource();
            }
        }
        return candidates.get(0).getDataSource();
    }

    /** Lower lag gives a higher weight; never less than 1. */
    private int calculateWeight(int lagSeconds) {
        return Math.max(10 - lagSeconds, 1);
    }
}
五、故障处理与恢复
5.1 常见故障处理
sql
-- 1. Replication has stopped
-- Look at the error details
SHOW SLAVE STATUS\G
-- Common error 1: duplicate primary key
-- Fix: skip the failing event (file/position-based replication)
-- NOTE(review): skipping an event leaves master and replica data different;
-- verify the data afterwards.
STOP SLAVE;
SET GLOBAL sql_slave_skip_counter = 1;
START SLAVE;
-- Or, with GTID replication, skip one specific GTID by committing an empty
-- transaction under it ('aaa-bbb-ccc-ddd:N' is a placeholder for the
-- failing transaction's GTID)
STOP SLAVE;
SET GTID_NEXT='aaa-bbb-ccc-ddd:N';
BEGIN; COMMIT;
SET GTID_NEXT='AUTOMATIC';
START SLAVE;
-- 2. 数据不一致修复
-- 使用pt-table-checksum检查
pt-table-checksum \
--replicate=test.checksums \
--databases=your_db \
h=master_host,u=user,p=password
-- 使用pt-table-sync修复
pt-table-sync \
--execute \
--replicate test.checksums \
h=master_host,u=user,p=password \
h=slave_host,u=user,p=password
-- 3. Re-seeding a replica from scratch
-- Method 1: mysqldump
-- On the master. --master-data=1 writes an executable CHANGE MASTER TO
-- statement with the dump's binlog coordinates into the file. With
-- --master-data=2 that statement is only a comment, and START SLAVE below
-- would resume from the replica's old, now wrong, position.
mysqldump --master-data=1 --single-transaction -uroot -p dbname > dbname.sql
-- On the replica: loading the dump also applies the new coordinates
STOP SLAVE;
source dbname.sql;
START SLAVE;
-- Method 2: xtrabackup
# Backup on the master
xtrabackup --backup --target-dir=/backup/master/
# Restore on the replica (after copying /backup/master/ to it).
# --copy-back expects mysqld to be stopped and the data directory to be empty.
xtrabackup --prepare --target-dir=/backup/master/
xtrabackup --copy-back --target-dir=/backup/master/
# After starting mysqld again, run CHANGE MASTER TO with the binlog file and
# position recorded in /backup/master/xtrabackup_binlog_info, then START SLAVE.
5.2 高可用架构
sql
-- MHA (Master High Availability) configuration example
-- 1. Set up password-less SSH between the nodes
-- 2. Edit the configuration file /etc/mha/app1.cnf
-- NOTE(review): MHA uses `user`/`password` to administer the MySQL servers
-- during failover; the 'repl' account created earlier only has REPLICATION
-- SLAVE, which is presumably not enough here — confirm and use an
-- administrative account if needed.
[server default]
manager_workdir=/var/log/mha/app1
manager_log=/var/log/mha/app1/manager.log
ssh_user=mysql
user=repl
password=Repl123456
repl_user=repl
repl_password=Repl123456
[server1]
hostname=master_host
candidate_master=1
[server2]
hostname=slave1_host
candidate_master=1
[server3]
hostname=slave2_host
no_master=1
-- 3. Start the MHA manager (shell command)
masterha_manager --conf=/etc/mha/app1.cnf
六、监控与告警体系
6.1 Prometheus监控配置
yaml
# prometheus.yml: scrape the mysqld_exporter instances on the master and one
# slave. Nesting matters in YAML: static_configs and params belong to the
# 'mysql' job, and the collector names are the values of collect[].
scrape_configs:
  - job_name: 'mysql'
    static_configs:
      - targets: ['master_host:9104', 'slave1_host:9104']
    params:
      collect[]:
        - global_status
        - slave_status
        - info_schema.innodb_metrics
# Start mysqld_exporter (shell command) with the slave status, InnoDB metrics
# and global status collectors enabled, listening on port 9104
./mysqld_exporter \
--collect.slave_status \
--collect.info_schema.innodb_metrics \
--collect.global_status \
--web.listen-address=":9104"
6.2 Grafana监控面板
json
// Key monitoring panels
{
  "panels": [
    {
      "title": "复制延迟",
      "targets": [
        {
          "expr": "mysql_slave_status_seconds_behind_master",
          "legendFormat": "{{instance}}"
        }
      ],
      "thresholds": [
        { "color": "green", "value": 0 },
        { "color": "yellow", "value": 5 },
        { "color": "red", "value": 30 }
      ]
    },
    {
      "title": "复制线程状态",
      "targets": [
        {
          "expr": "mysql_slave_status_slave_io_running",
          "legendFormat": "IO线程 {{instance}}"
        }
      ]
    }
  ]
}
6.3 告警规则
yaml
# Prometheus alerting rules (a rule file referenced from rule_files in
# prometheus.yml). This is not Alertmanager configuration: Alertmanager only
# routes the alerts that these rules fire.
groups:
  - name: mysql_alerts
    rules:
      # Lag above 30 seconds for 2 minutes
      - alert: HighReplicationLag
        expr: mysql_slave_status_seconds_behind_master > 30
        for: 2m
        labels:
          severity: warning
        annotations:
          summary: "MySQL复制延迟过高"
          description: "实例 {{ $labels.instance }} 复制延迟已达 {{ $value }} 秒"
      # Either replication thread stopped for 1 minute
      - alert: SlaveNotRunning
        expr: mysql_slave_status_slave_io_running == 0 or mysql_slave_status_slave_sql_running == 0
        for: 1m
        labels:
          severity: critical
        annotations:
          summary: "MySQL从库复制停止"
七、最佳实践总结
7.1 配置清单
sql
-- Master: key settings
[mysqld]
server-id                      = 1
log-bin                        = mysql-bin
binlog_format                  = ROW
sync_binlog                    = 1
innodb_flush_log_at_trx_commit = 1
expire_logs_days               = 7
-- Replica: key settings
[mysqld]
server-id              = 2
read_only              = 1
relay-log              = mysql-relay-bin
log_slave_updates      = 1
slave_parallel_workers = 4
slave_parallel_type    = LOGICAL_CLOCK
7.2 维护脚本
bash
#!/bin/bash
# 复制状态检查脚本
# Check replication health on one replica.
#
# Arguments: $1 host, $2 user, $3 password (passed to the mysql client).
# Returns:   0 both threads running and lag <= 30 seconds
#            1 status query failed or a replication thread is not running
#            2 threads running but lag > 30 seconds or lag is not a number
check_replication() {
    local host=$1
    local user=$2
    local password=$3
    local result io_running sql_running lag

    # Declared separately above so that $? here is mysql's exit status
    if ! result=$(mysql -h"$host" -u"$user" -p"$password" -e "SHOW SLAVE STATUS\G"); then
        echo "ERROR: Cannot query replication status!"
        return 1
    fi

    # Match the field name exactly: a plain grep for "Slave_SQL_Running" also
    # hits the "Slave_SQL_Running_State" line and corrupts the value.
    io_running=$(printf '%s\n' "$result" | awk '$1 == "Slave_IO_Running:" {print $2}')
    sql_running=$(printf '%s\n' "$result" | awk '$1 == "Slave_SQL_Running:" {print $2}')
    lag=$(printf '%s\n' "$result" | awk '$1 == "Seconds_Behind_Master:" {print $2}')

    if [ "$io_running" != "Yes" ] || [ "$sql_running" != "Yes" ]; then
        echo "ERROR: Replication is broken!"
        return 1
    fi

    # Seconds_Behind_Master can be NULL; only compare real numbers
    case "$lag" in
        ''|*[!0-9]*)
            echo "WARNING: Replication lag is unknown (Seconds_Behind_Master: ${lag:-empty})"
            return 2
            ;;
    esac

    echo "Replication is running. Lag: ${lag} seconds"
    if [ "$lag" -gt 30 ]; then
        echo "WARNING: High replication lag detected!"
        return 2
    fi
    return 0
}
# Poll the replica once every 60 seconds, forever
while :
do
    check_replication "slave_host" "monitor" "password"
    sleep 60
done
7.3 性能优化检查表
- 使用ROW格式的binlog
- 开启并行复制
- 合理设置innodb_buffer_pool_size
- 监控和优化慢查询
- 定期清理无用binlog
- 配置适当的网络超时时间
- 实现延迟感知的路由策略
- 建立完善的监控告警体系
结论
MySQL主从架构是企业级应用的基础架构,合理配置和优化主从复制、有效管理主从延迟、智能实现读写分离是保障系统稳定性和扩展性的关键。本文提供了从基础配置到高级优化的完整解决方案,结合实际案例代码,可以帮助开发者构建高性能、高可用的数据库架构。
建议根据实际业务场景选择合适的复制策略和读写分离方案,并建立完善的监控体系,确保数据库服务的稳定可靠。