一、监控Nginx
1.Nginx需要开启stub_status
这里我的nginx容器名为mynginx,进入容器查看。
docker exec -it mynginx bash #进入容器
nginx -V 2>&1 | grep -o with-http_stub_status_module #查看(注意是大写-V,才会输出编译参数)
修改nginx.conf
...
location /stub_status {
stub_status on;
access_log off;
#allow nginx_export的ip;
allow 0.0.0.0/0;
deny all;
}
...
#检查配置文件
docker exec -it mynginx nginx -t
#重启容器使新配置生效(若只想重新加载配置,可执行 docker exec mynginx nginx -s reload)
docker restart mynginx
#检查,若监听默认80端口,端口号可不填,192.168.88.129/stub_status
curl http://192.168.88.129:5173/stub_status
2.DockerCompose安装nginx_exporter
容器内默认监听9113,注意端口映射
#注意文件缩进两个空格
cat >docker-compose.yaml <<EOF
version: '3.8'
services:
nginx_exporter:
image: nginx/nginx-prometheus-exporter
container_name: nginx_exporter
hostname: nginx_exporter
#设置nginx stub_status的抓取地址(exporter 1.0及以上版本参数需写成双横线 --nginx.scrape-uri)
command:
- '-nginx.scrape-uri=http://192.168.88.129:5173/stub_status'
restart: always
#容器内默认监听9113
ports:
- 5174:9113
EOF
#启动
docker compose up -d
#访问
http://192.168.88.129:5174
3.修改Prometheus配置
bash
cat >> prometheus/prometheus.yml << "EOF"
- job_name: 'nginx_exporter'
static_configs:
- targets: ['192.168.88.129:5174']
labels:
instance: Nginx服务器
EOF
bash
#Prometheus热加载配置(需要启动时带 --web.enable-lifecycle 参数),或者重新启动Prometheus也可
curl -X POST http://localhost:9090/-/reload
4.常用监控数据
5.添加触发器
bash
cat >>prometheus/alert.yml << "EOF"
- name: nginx
rules:
# nginx_up为1表示exporter能正常抓取nginx状态;当nginx_up为0且持续超过30秒时发出警报
- alert: NginxDown
expr: nginx_up == 0
for: 30s
labels:
severity: critical
annotations:
summary: "nginx异常,实例:{{ $labels.instance }}"
description: "{{ $labels.job }} nginx已关闭"
EOF
bash
#检查
docker exec -it prometheus promtool check config /etc/prometheus/prometheus.yml
bash
curl -X POST http://localhost:9090/-/reload
成功。
二、监控Redis
1.安装exporter
bash
#直接运行
docker run -d --name redis_exporter -p 9121:9121 --restart always oliver006/redis_exporter --redis.addr redis://192.168.88.129:6379 --redis.password '123456'
默认端口为9121,访问192.168.88.129:9121
2.修改Prometheus配置
用vim编辑prometheus/prometheus.yml:shift+v(大写V)进入可视行模式选中上面的nginx配置,y复制,p粘贴,然后把job_name和targets改为redis_exporter对应的值(如192.168.88.129:9121)
curl -X POST http://localhost:9090/-/reload
3.常用监控指标
4.触发器配置
创建一个alert.yml存放规则,并且把该文件配置到Prometheus中。
bash
#Prometheus
rule_files:
- "alert.yml"
- "rules/*.yml"
bash
# 文件
redis-exporter.yml
groups:
- name: Redis #报警规则组的名字
rules:
- alert: RedisDown
expr: redis_up == 0
for: 5m
labels:
severity: critical
annotations:
summary: "Redis down (instance {{ $labels.instance }})"
description: "Redis instance is down\n VALUE = {{ $value }}\n LABELS: {{ $labels }}"
- alert: RedisMissingMaster
expr: count(redis_instance_info{role="master"}) == 0
for: 5m
labels:
severity: critical
annotations:
summary: "Redis missing master (instance {{ $labels.instance }})"
description: "Redis cluster has no node marked as master.\n VALUE = {{ $value }}\n LABELS: {{ $labels }}"
- alert: RedisTooManyMasters
expr: count(redis_instance_info{role="master"}) > 1
for: 5m
labels:
severity: critical
annotations:
summary: "Redis too many masters (instance {{ $labels.instance }})"
description: "Redis cluster has too many nodes marked as master.\n VALUE = {{ $value }}\n LABELS: {{ $labels }}"
- alert: RedisDisconnectedSlaves
expr: count without (instance, job) (redis_connected_slaves) - sum without (instance, job) (redis_connected_slaves) - 1 > 1
for: 5m
labels:
severity: critical
annotations:
summary: "Redis disconnected slaves (instance {{ $labels.instance }})"
description: "Redis not replicating for all slaves. Consider reviewing the redis replication status.\n VALUE = {{ $value }}\n LABELS: {{ $labels }}"
- alert: RedisReplicationBroken
expr: delta(redis_connected_slaves[1m]) < 0
for: 5m
labels:
severity: critical
annotations:
summary: "Redis replication broken (instance {{ $labels.instance }})"
description: "Redis instance lost a slave\n VALUE = {{ $value }}\n LABELS: {{ $labels }}"
- alert: RedisClusterFlapping
expr: changes(redis_connected_slaves[5m]) > 2
for: 5m
labels:
severity: critical
annotations:
summary: "Redis cluster flapping (instance {{ $labels.instance }})"
description: "Changes have been detected in Redis replica connection. This can occur when replica nodes lose connection to the master and reconnect (a.k.a flapping).\n VALUE = {{ $value }}\n LABELS: {{ $labels }}"
- alert: RedisMissingBackup
expr: time() - redis_rdb_last_save_timestamp_seconds > 60 * 60 * 24
for: 5m
labels:
severity: critical
annotations:
summary: "Redis missing backup (instance {{ $labels.instance }})"
description: "Redis has not been backuped for 24 hours\n VALUE = {{ $value }}\n LABELS: {{ $labels }}"
- alert: RedisOutOfMemory
expr: redis_memory_used_bytes / redis_total_system_memory_bytes * 100 > 90
for: 5m
labels:
severity: warning
annotations:
summary: "Redis out of memory (instance {{ $labels.instance }})"
description: "Redis is running out of memory (> 90%)\n VALUE = {{ $value }}\n LABELS: {{ $labels }}"
- alert: RedisTooManyConnections
expr: redis_connected_clients > 100
for: 5m
labels:
severity: warning
annotations:
summary: "Redis too many connections (instance {{ $labels.instance }})"
description: "Redis instance has too many connections\n VALUE = {{ $value }}\n LABELS: {{ $labels }}"
- alert: RedisNotEnoughConnections
expr: redis_connected_clients < 5
for: 5m
labels:
severity: warning
annotations:
summary: "Redis not enough connections (instance {{ $labels.instance }})"
description: "Redis instance should have more connections (> 5)\n VALUE = {{ $value }}\n LABELS: {{ $labels }}"
- alert: RedisRejectedConnections
expr: increase(redis_rejected_connections_total[1m]) > 0
for: 5m
labels:
severity: critical
annotations:
summary: "Redis rejected connections (instance {{ $labels.instance }})"
description: "Some connections to Redis has been rejected\n VALUE = {{ $value }}\n LABELS: {{ $labels }}"
bash
#检查
docker exec -it prometheus promtool check config /etc/prometheus/prometheus.yml
curl -X POST http://localhost:9090/-/reload
maxmemory为0表示不限制Redis可使用的内存。设置maxmemory后,如果淘汰策略(maxmemory-policy)为noeviction,当内存达到上限时Redis不会淘汰数据,而是拒绝写入命令、只允许读取数据。
bash
#进入redis容器,redis-cli连接,设置最大内存
config set maxmemory 3G
http://192.168.88.129:9090/rules
5.Dashboard
Grafana可以导入ID为11835的Dashboard模板