Deploy the monitoring stack
```yaml
version: '3.7'
services:
  # dingtalk webhook relay
  dingtalk:
    image: timonwong/prometheus-webhook-dingtalk:latest
    container_name: dingtalk
    restart: always
    command:
      - '--config.file=/etc/prometheus-webhook-dingtalk/config.yml'
    volumes:
      - /data/monitor/dingtalk/config.yml:/etc/prometheus-webhook-dingtalk/config.yml
      - /etc/localtime:/etc/localtime:ro
    ports:
      - "8060:8060"
  # alertmanager
  alertmanager:
    image: prom/alertmanager:latest
    container_name: alertmanager
    restart: always
    volumes:
      - /data/monitor/alertmanager/config/alertmanager.yml:/etc/alertmanager/alertmanager.yml
    ports:
      - "9093:9093"
  # prometheus
  prometheus:
    image: prom/prometheus
    container_name: prometheus
    restart: always
    ports:
      - "9090:9090"
    volumes:
      - /data/monitor/promethues/prometheus.yml:/etc/prometheus/prometheus.yml
      - /data/monitor/promethues/alert.yml:/etc/prometheus/rule.yml
      - /etc/localtime:/etc/localtime:ro
  # grafana
  grafana:
    image: grafana/grafana
    container_name: grafana
    restart: always
    ports:
      - "3000:3000"
    volumes:
      - /data/monitor/grafana:/var/lib/grafana
  # node-exporter
  node-exporter:
    image: prom/node-exporter
    container_name: node-exporter
    restart: always
    command:
      # point node_exporter at the host filesystems mounted read-only below
      - '--path.procfs=/host/proc'
      - '--path.sysfs=/host/sys'
      - '--path.rootfs=/rootfs'
    ports:
      - "9100:9100"
    volumes:
      - /proc:/host/proc:ro
      - /sys:/host/sys:ro
      - /:/rootfs:ro
```
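With the compose file in place, the stack can be started on the host. A minimal sketch, assuming the file is saved as /data/monitor/docker-compose.yml and the commands are run on the deployment host (the file name and host paths are assumptions taken from the volume mounts above):

```bash
# Create the host directories referenced by the volume mounts
mkdir -p /data/monitor/dingtalk /data/monitor/alertmanager/config /data/monitor/promethues /data/monitor/grafana
# The Grafana container runs as UID 472 and needs write access to its data directory
chown -R 472:472 /data/monitor/grafana

# Start the five containers and check that they are all running
cd /data/monitor
docker compose up -d        # use "docker-compose up -d" on older installations
docker compose ps
```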
Dingtalk configuration file
/data/monitor/dingtalk/config.yml
```yaml
templates:
  - /etc/prometheus-webhook-dingtalk/templates/templates.tmpl
targets: # multiple receiving targets can be configured here
  webhook2:
    url: https://oapi.dingtalk.com/robot/send?access_token=<dingtalk-access-token>
    secret: <dingtalk-signing-secret>
```
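To confirm the relay works before wiring it into Alertmanager, you can POST a minimal Alertmanager-style payload to the webhook2 target. A hedged sketch, assuming the templates.tmpl referenced above exists and the command runs on the docker host; the alert name, instance and annotation text are made up for the test:

```bash
# Send a fake alert to the dingtalk container; if the robot token and secret
# are valid, a message should appear in the DingTalk group
curl -s -X POST http://localhost:8060/dingtalk/webhook2/send \
  -H 'Content-Type: application/json' \
  -d '{
        "version": "4",
        "status": "firing",
        "receiver": "default",
        "groupLabels": {"alertname": "TestAlert"},
        "commonLabels": {"alertname": "TestAlert", "severity": "warning"},
        "commonAnnotations": {"summary": "webhook connectivity test"},
        "alerts": [{
          "status": "firing",
          "labels": {"alertname": "TestAlert", "instance": "test-host"},
          "annotations": {"summary": "webhook connectivity test"},
          "startsAt": "2024-01-01T00:00:00Z"
        }]
      }'
```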
Alertmanager configuration file
/data/monitor/alertmanager/config/alertmanager.yml
```yaml
global:
  resolve_timeout: 5m
  smtp_smarthost: 'smtp.qiye.163.com:465' # SMTP relay; port 465 normally uses SSL
  smtp_from: 'user@163.com'               # sender address
  smtp_auth_username: 'user@163.com'      # mailbox account
  smtp_auth_password: 'password'          # mailbox password or authorization code
  smtp_require_tls: false
route:
  receiver: 'default'
  group_wait: 10s
  group_interval: 1m
  repeat_interval: 1h
  group_by: ['alertname']
inhibit_rules:
  - source_match:
      severity: 'critical'
    target_match:
      severity: 'warning'
    equal: ['alertname', 'instance']
receivers:
  - name: 'default'
    webhook_configs:
      - url: 'http://dingtalk-IP:8060/dingtalk/webhook2/send' # "webhook2" must match the target name in the dingtalk config
        send_resolved: true
```
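The prom/alertmanager image ships with amtool, which can validate the mounted file, and a test alert can be pushed straight to the v2 API. A sketch, assuming the container name and port from the compose file above:

```bash
# Syntax-check the configuration inside the running container
docker exec alertmanager amtool check-config /etc/alertmanager/alertmanager.yml

# Fire a short-lived test alert through the API; it should reach DingTalk via the default receiver
curl -s -X POST http://localhost:9093/api/v2/alerts \
  -H 'Content-Type: application/json' \
  -d '[{"labels": {"alertname": "TestAlert", "severity": "warning", "instance": "test-host"}}]'
```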
Prometheus main configuration file
/data/monitor/promethues/prometheus.yml
```yaml
global:
  scrape_interval: 60s
  evaluation_interval: 60s
alerting:
  alertmanagers:
    - static_configs:
        - targets: ['IP:9093']
rule_files:
  - "/etc/prometheus/rule.yml"
  - "rules/*.yml"
scrape_configs:
  - job_name: prometheus
    static_configs:
      - targets: ['localhost:9090']
        labels:
          instance: prometheus
  - job_name: lite
    static_configs:
      - targets: ['IP:9100']
        labels:
          env: dev
  - job_name: redis_exporter
    static_configs:
      - targets: ['IP:9121']
        labels:
          env: dev
          ident: redis
  - job_name: mysql_exporter
    static_configs:
      - targets: ['IP:9104']
        labels:
          env: dev
          ident: mysql
  - job_name: emqx_exporter
    metrics_path: /api/v5/prometheus/stats
    scrape_interval: 5s
    honor_labels: true
    static_configs:
      - targets: ['IP:18083']
  - job_name: 'alertmanager'
    scrape_interval: 15s
    static_configs:
      - targets: ['IP:9093']
```
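After editing the file, promtool (bundled in the prom/prometheus image) can validate it. Since the compose file does not enable the --web.enable-lifecycle hot-reload flag, a container restart is used to apply changes. A sketch using the container name and port from the compose file:

```bash
# Validate the main configuration together with the referenced rule files
docker exec prometheus promtool check config /etc/prometheus/prometheus.yml

# Apply the change by restarting the container (hot reload would require --web.enable-lifecycle)
docker restart prometheus

# Confirm all scrape targets report as healthy
curl -s http://localhost:9090/api/v1/targets | grep -o '"health":"[a-z]*"'
```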
Prometheus alert rules file
/data/monitor/promethues/alert.yml
```yaml
groups:
  - name: Server host monitoring alerts
    rules:
      - alert: InternalServerDown
        expr: up{job="lite"} == 0
        for: 0m
        labels:
          severity: critical
        annotations:
          description: "The monitored target has disappeared; check the server itself or its node_exporter service"
      - alert: MemoryUsageHigh
        expr: 100 - ((node_memory_MemAvailable_bytes * 100) / node_memory_MemTotal_bytes) > 95
        for: 1m # the condition must hold for this long before the alert is sent to Alertmanager
        labels:
          severity: warning
        annotations:
          summary: "{{ $labels.instance }} memory usage is too high, please handle it as soon as possible!"
          description: "{{ $labels.instance }} memory usage exceeds 95%, current value {{ $value }}%."
      - alert: DiskSpaceUsageHigh
        expr: (1 - node_filesystem_avail_bytes{fstype=~"ext4|xfs"} / node_filesystem_size_bytes{fstype=~"ext4|xfs"}) * 100 > 95
        for: 60s
        labels:
          severity: warning
        annotations:
          summary: "{{ $labels.instance }} disk space usage exceeds 95%"
          description: "{{ $labels.instance }} disk usage exceeds 95%, current value {{ $value }}%."
      - alert: CpuUsageHigh
        expr: 100 - (avg(irate(node_cpu_seconds_total{mode="idle"}[5m])) by (instance) * 100) > 95
        for: 120s
        labels:
          severity: warning
          instance: "{{ $labels.instance }}"
        annotations:
          summary: "{{ $labels.instance }} CPU usage exceeds 95%"
          description: "{{ $labels.instance }} CPU usage exceeds 95%, current value {{ $value }}%."
      - alert: DiskIoHigh
        expr: (irate(node_disk_io_time_seconds_total[30m]) * 100) > 95
        for: 3m
        labels:
          severity: warning
        annotations:
          summary: "{{ $labels.instance }} disk I/O utilization is too high, please handle it as soon as possible!"
          description: "{{ $labels.instance }} disk I/O utilization exceeds 95%, current value {{ $value }}%."
```
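The rule file can be checked with promtool as well, and the loaded rule groups and firing alerts can be inspected through the Prometheus HTTP API. A sketch, again assuming the container name and port from the compose file:

```bash
# Validate the rule file that is mounted as /etc/prometheus/rule.yml
docker exec prometheus promtool check rules /etc/prometheus/rule.yml

# List loaded rule groups and the states of any active alerts
curl -s http://localhost:9090/api/v1/rules  | grep -o '"name":"[^"]*"'
curl -s http://localhost:9090/api/v1/alerts | grep -o '"state":"[a-z]*"'
```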