1、部署VictoriaMetrics
# Work from the install prefix and download the v1.65.0 single-node release.
cd /usr/local
wget https://github.com/VictoriaMetrics/VictoriaMetrics/releases/download/v1.65.0/victoria-metrics-amd64-v1.65.0.tar.gz
# Unpack straight into a dedicated directory, rename the shipped binary
# (victoria-metrics-prod) to victoria-metrics, then step into that directory.
mkdir victoria-metrics \
  && tar -xvzf victoria-metrics-amd64-v1.65.0.tar.gz -C victoria-metrics \
  && mv victoria-metrics/victoria-metrics-prod victoria-metrics/victoria-metrics \
  && cd victoria-metrics
# Start the server in the background, detached from the terminal:
# 30-day retention, data stored under ./data.
nohup ./victoria-metrics -retentionPeriod=30d -storageDataPath=data &
2、配置Prometheus
# Prometheus server configuration: scrape targets, alerting, rule files and
# remote write of all samples into VictoriaMetrics.

# my global config
global:
  scrape_interval: 15s # Set the scrape interval to every 15 seconds. Default is every 1 minute.
  evaluation_interval: 15s # Evaluate rules every 15 seconds. The default is every 1 minute.
  # scrape_timeout is set to the global default (10s).
  external_labels:
    datacenter: "victoria-1"

# Remote-write scraped samples to VictoriaMetrics
remote_write:
  - url: "http://127.0.0.1:8428/api/v1/write"
    queue_config:
      max_samples_per_send: 10000

# Alertmanager configuration
alerting:
  alertmanagers:
    - static_configs:
        - targets:
            - 'localhost:9093'

# Load rules once and periodically evaluate them according to the global 'evaluation_interval'.
rule_files:
  - "/usr/local/prometheus/rules/rule_node_down.yml"
  - "/usr/local/prometheus/rules/rule_disk_over.yml"
  - "/usr/local/prometheus/rules/rule_cpu_over.yml"
  - "/usr/local/prometheus/rules/rule_memory_over.yml"
  # - "second_rules.yml"

# A scrape configuration containing exactly one endpoint to scrape:
# Here it's Prometheus itself.
scrape_configs:
  # The job name is added as a label to any timeseries scraped from this config.
  - job_name: 'prometheus'
    static_configs:
      - targets: ['localhost:9090']

  # Scrape VictoriaMetrics itself.
  # NOTE(review): 8480/8481/8482 look like the ports of the cluster components,
  # while remote_write above targets a single-node instance on 8428 - confirm
  # that these targets are the ones intended.
  - job_name: 'victoria'
    static_configs:
      - targets: ['47.105.38.224:8480']
      - targets: ['47.105.38.224:8481']
      - targets: ['47.105.38.224:8482']
    # metrics_path defaults to '/metrics'
    # scheme defaults to 'http'.

  # Services discovered through Consul; only instances tagged metrics=true are kept.
  - job_name: 'consul-prometheus'
    # metrics_path: "/v1/agent/metrics"
    scrape_interval: 60s
    scrape_timeout: 10s
    scheme: http
    params:
      format: ['prometheus']
    # static_configs:
    #   - targets:
    #       - 47.105.38.224:8500
    consul_sd_configs:
      - server: '47.105.38.224:8500'
        services: []
    relabel_configs:
      # Rewrite the default /metrics path to /actuator/prometheus.
      - source_labels: [__metrics_path__]
        separator: ;
        regex: /metrics
        target_label: __metrics_path__
        replacement: /actuator/prometheus
        action: replace
      # Drop every discovered target whose Consul tags lack metrics=true.
      - source_labels: ['__meta_consul_tags']
        regex: '^.*,metrics=true,.*$'
        action: keep

  - job_name: "node20_exporter"
    static_configs:
      - targets: ["localhost:9100"]

  - job_name: "node21_exporter"
    static_configs:
      - targets: ["172.16.17.21:9100"] # monitored host

  - job_name: "node22_exporter"
    static_configs:
      - targets: ["172.16.17.22:9100"]

  - job_name: "node23_exporter"
    static_configs:
      - targets: ["172.16.17.23:9100"] # monitored host

  - job_name: "alertmanager"
    static_configs:
      - targets: ["localhost:9093"]
3、配置Grafana数据源
在Grafana中添加一个Prometheus类型的数据源,URL填写VictoriaMetrics的地址(例如 http://127.0.0.1:8428),保存并测试即可。
4、构建vmalert
从源代码构建vmalert:
# Fetch the VictoriaMetrics sources (the repository also contains vmalert).
git clone https://github.com/VictoriaMetrics/VictoriaMetrics
# Build from the repository root.
cd VictoriaMetrics
# Build only the vmalert binary.
# NOTE(review): presumably requires a Go toolchain on the build host - confirm.
make vmalert
构建完成后,生成的二进制文件位于VictoriaMetrics/bin目录下。
5、添加alert.rules
vim alert.rules
# Example rule file for vmalert: one group with a single "target is down" alert.
groups:
  - name: test-rule
    rules:
      # Fires when a scrape target has reported up == 0 for 2 minutes.
      # NOTE(review): this alert name matches neither match_re route in
      # alertmanager.yml, so it would go to the default receiver - confirm.
      - alert: 主机状态
        expr: up == 0
        for: 2m
        labels:
          # NOTE(review): the Alertmanager inhibit rules match on `severity`,
          # not `status` - confirm which label name is intended.
          status: warning
        annotations:
          summary: "{{$labels.instance}}:服务器关闭"
          description: "{{$labels.instance}}:服务器关闭"
6、修改钉钉prometheus-webhook-dingtalk配置文件
vim /usr/local/prometheus-webhook-dingtalk/config.example.yml
# prometheus-webhook-dingtalk configuration: template location and the
# DingTalk robot targets that Alertmanager webhooks are forwarded to.

## Request timeout
# timeout: 5s

## Uncomment following line in order to write template from scratch (be careful!)
# no_builtin_template: true

## Customizable templates path
templates:
  - '/usr/local/alertmanager/template/default.tmpl'

## You can also override default template using `default_message`
## The following example to use the 'legacy' template from v0.3.0
# default_message:
#   title: '{{ template "legacy.title" . }}'
#   text: '{{ template "legacy.content" . }}'

## Targets, previously was known as "profiles"
targets:
  webhook1:
    url: https://oapi.dingtalk.com/robot/send?access_token=XXXXXXXX
    # secret for signature
    secret: SEC000000000000000000000
    # Customize template content
    message:
      # Use legacy template
      # title: '{{ template "legacy.title" . }}'
      text: '{{ template "wechat.default.message" . }}'
  # webhook2:
  #   url: https://oapi.dingtalk.com/robot/send?access_token=xxxxxxxxxxxx
  # webhook_legacy:
  #   url: https://oapi.dingtalk.com/robot/send?access_token=xxxxxxxxxxxx
  webhook_mention_all:
    url: https://oapi.dingtalk.com/robot/send?access_token=xxxxxxxxxxxx
    mention:
      all: true
  webhook_mention_users:
    url: https://oapi.dingtalk.com/robot/send?access_token=xxxxxxxxxxxx
    mention:
      mobiles: ['156xxxx8827', '189xxxx8325']
7、修改alertmanager配置文件
vim /usr/local/alertmanager/alertmanager.yml
# Alertmanager configuration: routes alerts by alertname to DingTalk webhook
# receivers and suppresses warnings while a matching critical alert is firing.
global:
  resolve_timeout: 5m # resolve timeout

# Notification template files; relative paths and globs such as
# template/*.tmpl are also accepted.
templates:
  - '/usr/local/alertmanager/template/default.tmpl'

# Routing tree
route:
  group_by: [alertname] # labels used to group alerts
  receiver: ops_notify # default receiver
  group_wait: 30s # wait before sending the first notification for a new group
  group_interval: 60s # wait before notifying about new alerts added to a group
  repeat_interval: 1h # interval for re-sending a notification that was already sent
  routes:
    - receiver: ops_notify # receiver for basic alerts
      group_wait: 10s
      match_re:
        alertname: 实例存活告警|磁盘使用率告警 # matched against the alert names defined in the rules
    - receiver: info_notify # receiver for informational alerts
      group_wait: 10s
      match_re:
        alertname: 内存使用率告警|CPU使用率告警|目录大小告警

receivers:
  # Receiver for basic alerts
  - name: ops_notify
    webhook_configs:
      - url: http://localhost:8060/dingtalk/webhook1/send
        send_resolved: true # also notify when the alert is resolved
        # message: '{{ template "wechat.default.message" . }}'
  # Receiver for informational alerts
  - name: info_notify
    webhook_configs:
      - url: http://localhost:8060/dingtalk/webhook1/send
        send_resolved: true
        # message: '{{ template "wechat.default.message" . }}'

# An inhibition rule mutes alerts matching one set of matchers while an alert
# matching another set of matchers is firing. Both alerts must share the same
# values for the labels listed in `equal`.
inhibit_rules:
  - source_match:
      severity: 'critical'
    target_match:
      severity: 'warning'
    equal: ['alertname', 'dev', 'instance']
8、启动vmalert
# Start vmalert in the background: evaluate alert.rules against VictoriaMetrics
# (port 8428) and send firing alerts to Alertmanager (port 9093).
# nohup keeps the process alive after the terminal is closed, matching how
# victoria-metrics itself is started.
nohup ./bin/vmalert -rule=alert.rules \
  -datasource.url=http://localhost:8428 \
  -notifier.url=http://localhost:9093 &