14-运维手册
本文档提供完整的运维指南,包括日常巡检、故障处理、备份恢复等内容。
日常巡检
1. 容器状态巡检
#!/bin/bash
# Daily inspection script for the Docker-based cluster.
#
# Prints a report with eight sections: container status, VIP binding,
# resource usage, Docker networks, service health checks, MySQL Group
# Replication membership, Redis replication and recent error log lines.
#
# Every probe is best-effort: a failing check is reported with "✗" and the
# script carries on, so `set -e` is intentionally NOT enabled.
#
# Environment:
#   CLUSTER_PASSWORD  Password for Redis and the MySQL root account.
#                     Defaults to the sample password used in this manual.

readonly VIP_ADDR='172.20.1.100'
readonly VIP_IFACE='ens33'
# Single quotes keep the "!" literal (no history expansion if pasted).
readonly DEFAULT_PASSWORD='YourStr0ng!Pass'
readonly CLUSTER_PASS="${CLUSTER_PASSWORD:-$DEFAULT_PASSWORD}"

#######################################
# Run a probe silently and print a one-line verdict.
# Arguments: $1 - label to print; $2.. - command (or function) to run
# Outputs:   "✓ <label>" on success, "✗ <label>" on failure
#######################################
report() {
  local label=$1
  shift
  if "$@" >/dev/null 2>&1; then
    echo "✓ ${label}"
  else
    echo "✗ ${label}"
  fi
}

# HTTP probe: -f makes 4xx/5xx responses count as failures, and --max-time
# keeps an unreachable node from stalling the whole report.
http_ok() {
  curl -fsS --max-time 5 "$1"
}

# Redis probe: require an actual PONG so an auth error is not reported as OK.
redis_ok() {
  docker exec "$1" redis-cli -a "$CLUSTER_PASS" ping 2>/dev/null | grep -q PONG
}

# MySQL liveness probe via mysqladmin ping inside the given container.
mysql_ok() {
  docker exec "$1" mysqladmin -uroot -p"$CLUSTER_PASS" ping -h127.0.0.1
}

echo "============================================"
echo " Docker容器化集群日常巡检"
echo " $(date '+%Y-%m-%d %H:%M:%S')"
echo "============================================"
echo ""

echo "【1. 容器状态】"
docker ps -a --format "table {{.Names}}\t{{.Status}}\t{{.Ports}}"
echo ""

echo "【2. VIP状态】"
# -F: match the address literally (dots are not regex wildcards).
vip=$(docker exec keepalived ip addr show "$VIP_IFACE" 2>/dev/null | grep -F -- "$VIP_ADDR")
if [[ -n "$vip" ]]; then
  echo "✓ VIP已绑定: $vip"
else
  echo "✗ VIP未绑定"
fi
echo ""

echo "【3. 资源使用】"
docker stats --no-stream --format "table {{.Name}}\t{{.CPUPerc}}\t{{.MemUsage}}"
echo ""

echo "【4. 网络状态】"
for net in frontend-net backend-net cache-net database-net; do
  # The name filter is a substring match, so compose-prefixed network names
  # (e.g. "<project>_frontend-net") are still found.
  if [[ -n "$(docker network ls -q -f "name=${net}")" ]]; then
    echo "✓ $net 存在"
  else
    echo "✗ $net 缺失"
  fi
done
echo ""

echo "【5. 健康检查】"
# Nginx-LB nodes 172.20.1.11 - 172.20.1.13
for i in 1 2 3; do
  report "Nginx-LB-0${i}" http_ok "http://172.20.1.1${i}/health"
done
# Redis
report "Redis Master" redis_ok redis-master
report "Redis Slave" redis_ok redis-slave
# MySQL
for node in mysql-01 mysql-02 mysql-03; do
  report "MySQL-${node#mysql-}" mysql_ok "$node"
done
echo ""

echo "【6. MGR状态】"
docker exec mysql-01 mysql -uroot -p"$CLUSTER_PASS" -h127.0.0.1 -e "
SELECT member_host, member_role, member_state
FROM performance_schema.replication_group_members
WHERE channel_name = 'group_replication_applier';
" 2>/dev/null
echo ""

echo "【7. Redis复制状态】"
docker exec redis-master redis-cli -a "$CLUSTER_PASS" info replication 2>/dev/null \
  | grep -E "role:|connected_slaves:"
echo ""

echo "【8. 日志检查】"
echo "最近错误日志:"
for container in nginx-lb mysql-01; do
  # 2>&1: docker logs replays the container's stderr on its own stderr.
  docker logs --tail 20 "$container" 2>&1 | grep -i error || echo "无错误"
done
echo ""

echo "============================================"
echo " 巡检完成"
echo "============================================"
2. 资源使用巡检
# Show a one-off snapshot of per-container CPU and memory usage
docker stats --no-stream
# Show disk space used by Docker
docker system df
# List running containers together with their size on disk
docker ps --size
3. 日志巡检
# Show the last 100 log lines of a container
docker logs --tail 100 nginx-lb
docker logs --tail 100 mysql-01
# Show the complete Keepalived log
docker logs keepalived
# Follow the log in real time (keeps running until interrupted)
docker logs -f nginx-lb
故障处理
1. VIP故障处理
现象
- VIP无法ping通
- 服务不可用
排查步骤
# 1. Check whether the Keepalived container is running
docker ps | grep keepalived
# 2. Read the Keepalived log
docker logs keepalived
# 3. Check whether the VIP is bound on ens33 (as seen from the container)
docker exec keepalived ip addr show ens33
# 4. Check whether an nginx process is visible from the Keepalived container
#    (grep runs on the host and filters the container's ps output)
docker exec keepalived ps aux | grep nginx
# 5. Review the VRRP configuration
docker exec keepalived cat /etc/keepalived/keepalived.conf
处理方法
# Restart the Keepalived container
docker restart keepalived
# Restart Nginx-LB as well if required
docker restart nginx-lb
# Bind the VIP by hand (temporary workaround).
# The interface must be the one Keepalived manages; the checks elsewhere in
# this manual look for the VIP on ens33, so use the same interface here.
# NOTE(review): confirm against the `interface` setting in keepalived.conf.
VIP_IFACE=ens33
ip addr add 172.20.1.100/24 dev "$VIP_IFACE"
2. Nginx-LB故障处理
现象
- Nginx-LB健康检查失败
- 上游服务全部不可达
排查步骤
# 1. Check whether the nginx-lb container is running
docker ps | grep nginx-lb
# 2. Read its log
docker logs nginx-lb
# 3. Validate the nginx configuration
docker exec nginx-lb nginx -t
# 4. Test an upstream from inside the container (-f: fail on HTTP errors)
docker exec nginx-lb curl -f http://172.20.2.11/health
处理方法
# Restart the Nginx-LB container
docker restart nginx-lb
# Reload the nginx configuration without restarting the container
docker exec nginx-lb nginx -s reload
3. PHP服务故障处理
现象
- PHP页面返回502/503
- 健康检查失败
排查步骤
# 1. Check the state of the PHP containers
docker ps | grep -E "php|php-fpm"
# 2. Read the logs of both PHP containers
docker logs php
docker logs php-fpm
# 3. Validate the PHP-FPM configuration
docker exec php-fpm php-fpm -t
# 4. Check for a listener on the FastCGI port 9000 in the php container
docker exec php netstat -tlnp | grep 9000
处理方法
# Restart both PHP containers
docker restart php
docker restart php-fpm
# Show the fastcgi_pass target configured in the php container's nginx.conf
docker exec php cat /etc/nginx/nginx.conf | grep fastcgi_pass
4. Redis故障处理
现象
- Redis连接失败
- 主从复制中断
排查步骤
# 1. Check the state of the Redis containers
docker ps | grep redis
# 2. Test connectivity to master and slave
docker exec redis-master redis-cli -a 'YourStr0ng!Pass' ping
docker exec redis-slave redis-cli -a 'YourStr0ng!Pass' ping
# 3. Show replication status as seen by the master
docker exec redis-master redis-cli -a 'YourStr0ng!Pass' info replication
# 4. Ask Sentinel (port 26379) for its view of the master named "mymaster"
docker exec sentinel-01 redis-cli -p 26379 SENTINEL master mymaster
处理方法
# Restart the Redis master
docker restart redis-master
# Restart the Redis slave
docker restart redis-slave
# If required, point the slave back at the master (172.20.3.11:6379)
docker exec redis-slave redis-cli -a 'YourStr0ng!Pass' SLAVEOF 172.20.3.11 6379
5. MySQL故障处理
现象
- MySQL连接失败
- MGR成员离线
- 复制中断
排查步骤
# 1. Check the state of the MySQL containers
docker ps | grep mysql
# 2. Test connectivity
docker exec mysql-01 mysqladmin -uroot -p'YourStr0ng!Pass' ping -h127.0.0.1
# 3. Show Group Replication (MGR) membership
docker exec mysql-01 mysql -uroot -p'YourStr0ng!Pass' -h127.0.0.1 -e "
SELECT * FROM performance_schema.replication_group_members;
"
# 4. Show the last 50 log lines.
#    Use --tail rather than piping into `tail`: docker logs replays the
#    container's stderr on its own stderr, which a plain pipe does not
#    capture, so the whole stderr stream would flood the terminal unfiltered.
docker logs --tail 50 mysql-01
处理方法
# Restart the MySQL container
docker restart mysql-01
# If an MGR member is offline, start Group Replication on it again
# (mysql-02 is used as the example member here)
docker exec mysql-02 mysql -uroot -p'YourStr0ng!Pass' -h127.0.0.1 -e "
START GROUP_REPLICATION;
"
6. 容器退出故障
排查步骤
# 1. List exited containers.
#    The status filter is exact, unlike grepping the table output, which
#    would also match any other column containing the word "Exited".
docker ps -a --filter status=exited
# 2. Find out why the container exited (replace the placeholder)
docker logs <container_name>
# 3. Inspect the container's configuration
docker inspect <container_name>
处理方法
# Start the container again (replace the placeholder)
docker start <container_name>
# If the cause was a configuration problem: fix it, remove the container
# and let compose recreate it
docker rm <container_name>
docker compose -f /opt/cluster-deploy/docker-compose-node1.yml up -d
备份与恢复
1. Redis备份
# Create the Redis backup directory
mkdir -p /backup/redis
# Ask the master to write its dataset to disk
# NOTE(review): SAVE is the synchronous variant and blocks the server while
# it runs — confirm this is acceptable for the dataset size.
docker exec redis-master redis-cli -a 'YourStr0ng!Pass' SAVE
# Copy the RDB file out of the container, stamped with today's date
docker cp redis-master:/data/dump.rdb /backup/redis/dump-$(date +%Y%m%d).rdb
# Delete RDB backups last modified more than 7 days ago
find /backup/redis -name "*.rdb" -mtime +7 -delete
2. MySQL备份
# 创建备份目录
mkdir -p /backup/mysql
# 备份所有数据库
docker exec mysql-01 mysqldump -uroot -p'YourStr0ng!Pass' -h127.0.0.1 \
--all-databases --single-transaction --routines --triggers --events \
> /backup/mysql/all-databases-$(date +%Y%m%d).sql
# 保留最近7天的备份
find /backup/mysql -name "*.sql" -mtime +7 -delete
3. 恢复MySQL
# Restore all databases by feeding a dump file to mysql inside mysql-01
# (-i keeps stdin open; replace the file name with the backup to restore)
docker exec -i mysql-01 mysql -uroot -p'YourStr0ng!Pass' -h127.0.0.1 < /backup/mysql/all-databases-20240101.sql
4. 备份配置
# Archive the configuration directory into a dated, gzip-compressed tarball
tar -czf /backup/config-$(date +%Y%m%d).tar.gz /opt/cluster-deploy/config/
# Delete configuration archives last modified more than 30 days ago
find /backup -name "config-*.tar.gz" -mtime +30 -delete
版本升级
1. Docker镜像升级
# The compose file is referenced by a relative path, so change into the
# deployment directory before running any compose command.
cd /opt/cluster-deploy
# Pull the new images
docker pull nginx:alpine
docker compose -f docker-compose-node1.yml build php-fpm # custom Dockerfile that installs pdo_mysql
docker pull redis:7-alpine
docker pull mysql:8.0
docker pull ednxzu/keepalived:2.3.4
# Recreate the containers from the updated images
docker compose -f docker-compose-node1.yml up -d
2. PHP版本升级
# Both paths below are relative to the deployment directory
cd /opt/cluster-deploy
# Bump the base image version in the php-fpm Dockerfile
# (dots in the search pattern are escaped so they match literally)
sed -i 's/php:8\.2-fpm-alpine/php:8.3-fpm-alpine/g' config/php-fpm/Dockerfile
# Rebuild the image and recreate the PHP container; without --build compose
# would reuse the already-built image and the new base version would not
# take effect
docker compose -f docker-compose-node1.yml up -d --build php-fpm
容量扩展
1. 添加新节点
参见13-快速部署.md中的Node部署脚本。
2. 调整Redis内存
# Raise maxmemory from 256mb to 512mb in the master's config file
# (the substitution only applies if the file contains exactly "maxmemory 256mb")
sed -i 's/maxmemory 256mb/maxmemory 512mb/g' /opt/cluster-deploy/config/redis/redis-master.conf
# Restart Redis so it re-reads the configuration file
docker restart redis-master
3. 调整MySQL资源
# Add resource limits to the service in docker-compose
# (this fragment belongs under the top-level `services:` key)
mysql-01:
  image: mysql:8.0
  deploy:
    resources:
      limits:
        memory: 2G
      reservations:
        memory: 1G
安全管理
1. 修改密码
# Change the Redis password on BOTH nodes.
# masterauth is set before requirepass so each node can still authenticate
# against its peer after the switch; changing only the master would leave
# the replica unable to resynchronise after its next reconnect.
for node in redis-master redis-slave; do
  docker exec "$node" redis-cli -a 'YourStr0ng!Pass' CONFIG SET masterauth 'NewPass123!'
  docker exec "$node" redis-cli -a 'YourStr0ng!Pass' CONFIG SET requirepass 'NewPass123!'
done
# Change the MySQL root password
docker exec mysql-01 mysql -uroot -p'YourStr0ng!Pass' -h127.0.0.1 -e "
ALTER USER 'root'@'%' IDENTIFIED BY 'NewPass123!';
"
# Update the password in every configuration file. CONFIG SET only changes
# the running instance, so the Redis config files must be updated too or
# the old password comes back on the next container restart.
# NOTE(review): the glob assumes all Redis/Sentinel configs live in this
# directory (redis-master.conf and sentinel.conf do) — confirm for the slave.
sed -i 's/YourStr0ng!Pass/NewPass123!/g' /opt/cluster-deploy/config/redis/*.conf
sed -i 's/YourStr0ng!Pass/NewPass123!/g' /opt/cluster-deploy/config/php/php.ini
2. 防火墙配置
# Open the required public ports
firewall-cmd --permanent --add-port=80/tcp
firewall-cmd --permanent --add-port=443/tcp
# Restrict the management port (2375) to the 192.168.64.0/24 subnet
firewall-cmd --permanent --add-rich-rule='rule family="ipv4" source address="192.168.64.0/24" port port="2375" protocol="tcp" accept'
# --permanent rules only take effect after a reload, so reload once at the
# very end, after ALL rules have been added
firewall-cmd --reload
3. 日志审计
# Configure Docker log rotation (json-file driver, 3 files of 10 MB each).
# The file is overwritten, so keep a timestamped copy of any existing
# daemon.json; merge other settings it contained back in by hand.
if [ -f /etc/docker/daemon.json ]; then
  cp -p /etc/docker/daemon.json "/etc/docker/daemon.json.bak-$(date +%Y%m%d%H%M%S)"
fi
cat > /etc/docker/daemon.json << 'EOF'
{
  "log-driver": "json-file",
  "log-opts": {
    "max-size": "10m",
    "max-file": "3"
  }
}
EOF
# Restart the daemon to apply the new settings
# NOTE(review): restarting dockerd normally interrupts running containers —
# schedule this in a maintenance window.
systemctl restart docker
应急预案
1. 整体宕机
# Restart all services node by node, in order
# Node1 (local)
cd /opt/cluster-deploy
docker compose -f docker-compose-node1.yml restart
# Once Node1 is ready: Node2 (over SSH)
ssh root@192.168.64.129 "cd /opt/cluster-deploy && docker compose -f docker-compose-node2.yml restart"
# Once Node2 is ready: Node3 (over SSH)
ssh root@192.168.64.130 "cd /opt/cluster-deploy && docker compose -f docker-compose-node3.yml restart"
# Verify: wait 30 seconds, then query the health endpoint through the VIP
sleep 30
curl http://172.20.1.100/health
2. VIP漂移异常
# Find out why the VIP moved.
# 2>&1: docker logs replays the container's stderr on its own stderr, which
# the pipe would otherwise not pass to grep.
docker logs keepalived 2>&1 | grep -i "vrrp"
# Take over the VIP by hand.
# Use the same interface the other checks in this manual inspect (ens33).
# NOTE(review): confirm against the `interface` setting in keepalived.conf.
docker exec keepalived ip addr add 172.20.1.100/24 dev ens33
3. 数据不一致
# Redis data inconsistency: detach the slave, wipe it, then re-attach it to
# the master at 172.20.3.11:6379
# NOTE(review): FLUSHALL deletes ALL data on redis-slave — double-check the
# target container before running it.
docker exec redis-slave redis-cli -a 'YourStr0ng!Pass' SLAVEOF NO ONE
docker exec redis-slave redis-cli -a 'YourStr0ng!Pass' FLUSHALL
docker exec redis-slave redis-cli -a 'YourStr0ng!Pass' SLAVEOF 172.20.3.11 6379
# MySQL data inconsistency (resynchronise): stop Group Replication on the
# affected member (mysql-02 here) and clear its replication configuration
docker exec mysql-02 mysql -uroot -p'YourStr0ng!Pass' -h127.0.0.1 -e "
STOP GROUP_REPLICATION;
RESET SLAVE ALL;
"
# Then rejoin the MGR group (see 11-MySQL-MGR初始化.md)
命令速查
容器管理
# Start all containers defined in the Node1 compose file (detached)
docker compose -f /opt/cluster-deploy/docker-compose-node1.yml up -d
# Stop and remove all containers defined in the Node1 compose file
docker compose -f /opt/cluster-deploy/docker-compose-node1.yml down
# Restart a single container
docker restart nginx-lb
# List all containers, including stopped ones
docker ps -a
日志查看
# Follow a container's log in real time
docker logs -f nginx-lb
# Show only the last 100 lines
docker logs --tail 100 nginx-lb
网络检查
# List Docker networks
docker network ls
# Check connectivity to the VIP and to 172.20.2.11
# (no -c count is given, so on Linux ping runs until interrupted)
ping 172.20.1.100
ping 172.20.2.11
服务测试
# Test the HTTP service through the VIP
curl http://172.20.1.100/health
# Test Redis
docker exec redis-master redis-cli -a 'YourStr0ng!Pass' ping
# Test MySQL
docker exec mysql-01 mysqladmin -uroot -p'YourStr0ng!Pass' ping -h127.0.0.1
下一步
- 12-验证测试.md - 完整验证
- 答辩演讲稿.md - 项目答辩