prometheus部署及钉钉告警集成Grafana

1、准备工作

安装包

📎alertmanager-0.23.0.linux-amd64.tar.gz

📎node_exporter-1.3.1.linux-amd64.tar.gz

📎prometheus-webhook-dingtalk-1.4.0.linux-amd64.tar.gz

服务端口

| 服务 | 端口 |
|-----------------------------|------|
| Prometheus | 9090 |
| node_exporter | 9100 |
| alertmanager | 9093 |
| prometheus-webhook-dingtalk | 8060 |

#修改配置文件之前先备份

复制代码
systemctl stop firewalld
setenforce 0

2、安装go环境

/usr/local

复制代码
curl -O https://storage.googleapis.com/golang/go1.8.3.linux-amd64.tar.gz  或者   wget -c https://storage.googleapis.com/golang/go1.8.3.linux-amd64.tar.gz
tar -C /usr/local -zxvf go1.8.3.linux-amd64.tar.gz
vim /etc/profile   #修改配置文件
	export PATH=$PATH:/usr/local/go/bin   #文件末添加
source /etc/profile  #重新加载配置文件,使PATH修改在当前shell中生效
go version  #验证go环境是否安装成功

3、部署prometheus

(1)下载安装Prometheus

/usr/local

复制代码
curl -O https://blockchain-sre.oss-cn-hangzhou.aliyuncs.com/prometheus-2.31.1.linux-amd64.tar.gz  或者  wget -c https://blockchain-sre.oss-cn-hangzhou.aliyuncs.com/prometheus-2.31.1.linux-amd64.tar.gz
tar -C /usr/local -zxvf prometheus-2.31.1.linux-amd64.tar.gz
cd /usr/local
mv prometheus-2.31.1.linux-amd64 prometheus   #为方便进入目录,修改目录名为prometheus
cd
useradd -M -s /sbin/nologin prometheus
mkdir -p /data/prometheus
chown -R prometheus:prometheus /usr/local/prometheus /data/prometheus    # 修改权限 新增配置文件之后最好也执行一下这步
cd /usr/local/prometheus
mkdir bin
mv promtool bin
vim /etc/profile
	export PATH=$PATH:/sbin:/usr/bin:/usr/sbin
	export PATH=$PATH:/usr/local/go/bin
	export PATH=/usr/local/prometheus/bin:$PATH:$HOME/bin
source /etc/profile

(2)修改配置文件

/usr/local/prometheus/prometheus.yml

复制代码
cd /usr/local/prometheus
cp prometheus.yml prometheus.yml.bak   #修改配置文件前先进行备份

vim prometheus.yml
修改添加
# my global config
global:
  scrape_interval: 15s # Set the scrape interval to every 15 seconds. Default is every 1 minute.
  evaluation_interval: 15s # Evaluate rules every 15 seconds. The default is every 1 minute.
  # scrape_timeout is set to the global default (10s).

# Alertmanager configuration
alerting:
  alertmanagers:
    - static_configs:
        - targets:
          - localhost:9093

# Load rules once and periodically evaluate them according to the global 'evaluation_interval'.
rule_files:
   - "first_rules.yml"
  # - "second_rules.yml"

# A scrape configuration containing exactly one endpoint to scrape:
# Here it's Prometheus itself.
scrape_configs:
  # The job name is added as a label `job=<job_name>` to any timeseries scraped from this config.
  - job_name: "prometheus"

    # metrics_path defaults to '/metrics'
    # scheme defaults to 'http'.

    static_configs:
      - targets: ["localhost:9090"]

  - job_name: "node_exporter"
    static_configs:
      - targets: ["172.19.88.86:9100"]

  - job_name: "node2_exporter"
    static_configs:
      - targets: ["localhost:9100"]

  - job_name: "alertmanager"
    static_configs:
      - targets: ["localhost:9093"]

#  - job_name: 'consul'
#    consul_sd_configs:
#      - server:   'localhost:8500'
#        services: ['test']

#  - job_name: 'blackbox'
#    metrics_path: /probe
#    params:
#      module: [http_2xx]  # Look for a HTTP 200 response.
#    file_sd_configs:
#      - refresh_interval: 1m
#        files:
#          - "/usr/local/prometheus/conf/blackbox*.yml"
#    relabel_configs:
#      - source_labels: [__address__]
#        target_label: __param_target
#      - source_labels: [__param_target]
#        target_label: instance
#      - target_label: __address__
#        replacement: localhost:9115   # The blackbox exporter's real hostname:port.

#检查配置文件
promtool check config /usr/local/prometheus/prometheus.yml

(3)配置服务启动脚本

/usr/lib/systemd/system/prometheus.service

复制代码
cat > /usr/lib/systemd/system/prometheus.service <<'EOF'
[Unit]
Description=Prometheus
After=network.target

[Service]
Type=simple
Environment="GOMAXPROCS=4"
User=prometheus
Group=prometheus
ExecReload=/bin/kill -HUP $MAINPID
ExecStart=/usr/local/prometheus/prometheus \
  --config.file=/usr/local/prometheus/prometheus.yml \
  --storage.tsdb.path=/data/prometheus \
  --storage.tsdb.retention.time=30d \
  --web.console.libraries=/usr/local/prometheus/console_libraries \
  --web.console.templates=/usr/local/prometheus/consoles \
  --web.listen-address=0.0.0.0:9090 \
  --web.read-timeout=5m \
  --web.max-connections=10 \
  --query.max-concurrency=20 \
  --query.timeout=2m \
  --web.enable-lifecycle
PrivateTmp=true
PrivateDevices=true
ProtectHome=true
NoNewPrivileges=true
LimitNOFILE=infinity
ReadWriteDirectories=/data/prometheus
ProtectSystem=full

SyslogIdentifier=prometheus
Restart=always

[Install]
WantedBy=multi-user.target

EOF


#启动服务
systemctl daemon-reload
systemctl enable prometheus
systemctl start prometheus
systemctl status prometheus

netstat -ntlp | grep 9090

#网页访问:localhost:9090  进入prometheus监控界面

4、部署node_exporter

(1)下载安装node_exporter

/usr/local

复制代码
curl -LO https://github.com/prometheus/node_exporter/releases/download/v1.3.1/node_exporter-1.3.1.linux-amd64.tar.gz 或者 wget -c https://github.com/prometheus/node_exporter/releases/download/v1.3.1/node_exporter-1.3.1.linux-amd64.tar.gz

exit
scp C:\Users\wangdachu\Desktop\node_exporter-1.3.1.linux-amd64.tar.gz root@<服务器IP>:/root
#按提示输入root密码(请勿在文档中保存明文密码)
ssh root@<服务器IP>

tar -C /usr/local -zxvf node_exporter-1.3.1.linux-amd64.tar.gz
cd /usr/local
mv node_exporter-1.3.1.linux-amd64 node_exporter
chown -R root:root /usr/local/node_exporter

(2)配置服务启动脚本

/usr/lib/systemd/system/node_exporter.service

复制代码
cat >> /usr/lib/systemd/system/node_exporter.service <<EOF
[Unit]
Description=node_exporter
After=network.target

[Service]
Type=simple
User=root
Group=root
ExecStart=/usr/local/node_exporter/node_exporter \
  --collector.textfile.directory=/var/lib/node_exporter/textfile_collector \
  --web.listen-address=0.0.0.0:9100 \
  --web.telemetry-path=/metrics \
  --log.level=info \
  --log.format=logfmt
Restart=always

[Install]
WantedBy=multi-user.target

EOF


#启动服务
systemctl daemon-reload
systemctl enable node_exporter
systemctl start node_exporter
systemctl status node_exporter

netstat -ntlp | grep 9100

#网页访问:localhost:9100

(3)被监控主机安装node_exporter及配置服务启动脚本

/usr/lib/systemd/system/node_exporter.service

复制代码
tar -C /usr/local -zxvf node_exporter-1.3.1.linux-amd64.tar.gz
cd /usr/local
mv node_exporter-1.3.1.linux-amd64 node_exporter
id prometheus &>/dev/null || useradd -M -s /sbin/nologin prometheus   #被监控主机上若没有prometheus用户需先创建
chown -R prometheus:prometheus /usr/local/node_exporter

cat >> /usr/lib/systemd/system/node_exporter.service <<EOF
[Unit]
Description=node_export
Documentation=https://github.com/prometheus/node_exporter
After=network.target

[Service]
Type=simple
User=prometheus
# --collector.textfile.directory 指定数据采集的路径(systemd不支持行尾注释,注释必须独占一行)
ExecStart=/usr/local/node_exporter/node_exporter --collector.textfile.directory=/var/lib/node_exporter/textfile_collector
Restart=on-failure
[Install]
WantedBy=multi-user.target
EOF

#启动服务
systemctl daemon-reload
systemctl enable node_exporter
systemctl start node_exporter
systemctl status node_exporter

netstat -ntlp | grep 9100

#网页访问:被监控主机IP:9100

#localhost:9090进入监控界面 -> Graph -> 键入up -> Execute  查看监控状态

5、配置定时任务(采集目录)

复制代码
cd /var/lib
mkdir -p node_exporter/textfile_collector
vim /etc/cron.d/directory_size
*/5 * * * * root du -sb /var/log /var/cache/apt /var/lib/prometheus | sed -ne 's/^\([0-9]\+\)\t\(.*\)$/node_directory_size_bytes{directory="\2"} \1/p' > /var/lib/node_exporter/textfile_collector/directory_size.prom.$$ && mv /var/lib/node_exporter/textfile_collector/directory_size.prom.$$ /var/lib/node_exporter/textfile_collector/directory_size.prom

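#/etc/cron.d/ 下的文件会被crond自动加载,无需(也不应)再用 crontab -u root 导入:该文件带有用户字段root,导入用户crontab后root会被当作命令执行,并且会覆盖root原有的crontab
cat /etc/cron.d/directory_size   #确认定时任务内容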

修改node_exporter自启动脚本
cat /usr/lib/systemd/system/node_exporter.service
[Unit]
Description=node_export
Documentation=https://github.com/prometheus/node_exporter
After=network.target

[Service]
Type=simple
User=prometheus
# --collector.textfile.directory 指定路径(systemd不支持行尾注释,注释必须独占一行)
ExecStart=/usr/local/node_exporter/node_exporter --collector.textfile.directory=/var/lib/node_exporter/textfile_collector
Restart=on-failure
[Install]
WantedBy=multi-user.target

表达式:node_directory_size_bytes{directory="/var/lib/prometheus"}
查看监控图形

6、部署alertmanager

(1)下载安装alertmanager

/usr/local

复制代码
curl -LO https://github.com/prometheus/alertmanager/releases/download/v0.23.0/alertmanager-0.23.0.linux-amd64.tar.gz 或者 wget -c https://github.com/prometheus/alertmanager/releases/download/v0.23.0/alertmanager-0.23.0.linux-amd64.tar.gz
tar -C /usr/local -zxvf alertmanager-0.23.0.linux-amd64.tar.gz
mv /usr/local/alertmanager-0.23.0.linux-amd64 /usr/local/alertmanager
mkdir /usr/local/alertmanager/data
chown -R prometheus:prometheus /usr/local/alertmanager

(2)修改配置文件

/usr/local/alertmanager/alertmanager.yml

复制代码
cd /usr/local/alertmanager && cp alertmanager.yml alertmanager.yml.bak   #备份配置文件

# 全局配置项
global:
  resolve_timeout: 5m # 处理超时时间,默认为5min

# 定义路由树信息
route:
  group_by: [alertname]  # 报警分组依据
  receiver: ops_notify   # 设置默认接收人
  group_wait: 30s        # 最初即第一次等待多久时间发送一组警报的通知
  group_interval: 60s    # 在发送新警报前的等待时间
  repeat_interval: 1h    # 重复发送告警时间。默认1h
  routes:

  - receiver: ops_notify  # 基础告警通知
    group_wait: 10s
    match_re:
      alertname: 实例存活告警|磁盘使用率告警   # 匹配告警规则中的名称发送

  - receiver: info_notify  # 消息告警通知
    group_wait: 10s
    match_re:
      alertname: 内存使用率告警|CPU使用率告警

# 定义基础告警接收者
receivers:
- name: ops_notify
  webhook_configs:
  - url: http://localhost:8060/dingtalk/webhook2/send        #prometheus-webhook-dingtalk的url地址
    send_resolved: true  # 警报被解决之后是否通知

# 定义消息告警接收者
- name: info_notify
  webhook_configs:
  - url: http://localhost:8060/dingtalk/webhook2/send          #prometheus-webhook-dingtalk的url地址
    send_resolved: true

# 一个inhibition规则是在与另一组匹配器匹配的警报存在的条件下,使匹配一组匹配器的警报失效的规则。两个警报必须具有一组相同的标签。
inhibit_rules:
  - source_match:
      severity: 'critical'
    target_match:
      severity: 'warning'
    equal: ['alertname', 'dev', 'instance']

(3)配置服务启动脚本

/usr/lib/systemd/system/alertmanager.service

复制代码
cat >> /usr/lib/systemd/system/alertmanager.service <<EOF
[Unit]
Description=Prometheus: the alerting system
Documentation=http://prometheus.io/docs/
After=prometheus.service

[Service]
ExecStart=/usr/local/alertmanager/alertmanager --config.file=/usr/local/alertmanager/alertmanager.yml
Restart=always
StartLimitInterval=0
RestartSec=10

[Install]
WantedBy=multi-user.target
EOF

#启动服务
systemctl daemon-reload && systemctl enable alertmanager && systemctl start alertmanager
systemctl status alertmanager

netstat -natp | grep alertmanager

#网页:localhost:9093

7、部署prometheus-webhook-dingtalk

(1)下载安装prometheus-webhook-dingtalk

/usr/local

复制代码
curl -LO https://github.com/timonwong/prometheus-webhook-dingtalk/releases/download/v2.0.0/prometheus-webhook-dingtalk-2.0.0.linux-amd64.tar.gz 或者 wget -c https://github.com/timonwong/prometheus-webhook-dingtalk/releases/download/v2.0.0/prometheus-webhook-dingtalk-2.0.0.linux-amd64.tar.gz
tar -C /usr/local -zxvf prometheus-webhook-dingtalk-2.0.0.linux-amd64.tar.gz
mv /usr/local/prometheus-webhook-dingtalk-2.0.0.linux-amd64 /usr/local/prometheus-webhook-dingtalk

#修改配置文件     /usr/local/prometheus-webhook-dingtalk/config.yml
cd /usr/local/prometheus-webhook-dingtalk
cp config.yml config.yml.bak

targets:
  webhook2:
    url: https://oapi.dingtalk.com/robot/send?access_token=<你的access_token>        #修改url为钉钉机器人的webhook地址(含access_token),请勿在文档中保留真实token
    # secret for signature
    secret: SEC000000000000000000000

其余配置项(一直到message部分)全部用 # 注释掉

(2)配置服务启动脚本

/usr/lib/systemd/system/prometheus-webhook-dingtalk.service

复制代码
cat >> /usr/lib/systemd/system/prometheus-webhook-dingtalk.service <<EOF
[Unit]
Description='start prometheus-webhook-dingtalk service'
Documentation='https://github.com/timonwong/prometheus-webhook-dingtalk'
After=network.target

[Service]
Type=simple
User=root
PIDFile=/var/run/prometheus-webhook-dingtalk.pid
ExecStart=/usr/local/prometheus-webhook-dingtalk/prometheus-webhook-dingtalk \
      --web.listen-address=:8060 \
          --web.enable-lifecycle \
          --web.enable-ui \
          --config.file=/usr/local/prometheus-webhook-dingtalk/config.yml
Restart=on-failure

[Install]
WantedBy=multi-user.target
EOF

systemctl daemon-reload && systemctl enable prometheus-webhook-dingtalk && systemctl start prometheus-webhook-dingtalk
systemctl status prometheus-webhook-dingtalk

netstat -natp | grep 8060

8、命令行测试机器人发送消息,验证是否可以发送成功

复制代码
#将 <你的access_token> 替换为钉钉机器人webhook地址中的access_token(不要把真实token写入文档);URL需加引号,避免其中的 ? 被shell当作通配符
curl -H "Content-Type: application/json" -d '{"msgtype":"text","text":{"content":"prometheus alert test"}}' 'https://oapi.dingtalk.com/robot/send?access_token=<你的access_token>'

查看prometheus-webhook-dingtalk的url地址,alertmanager会将通知向这个地址发送
journalctl -u prometheus-webhook-dingtalk -f

可以看到url    urls=http://localhost:8060/dingtalk/webhook2/send(与config.yml中的target名称webhook2以及alertmanager.yml中的webhook地址一致)

9、配置告警规则

/usr/local/prometheus/first_rules.yml

复制代码
cat > /usr/local/prometheus/first_rules.yml << 'EOF'
groups:
# 实例存活报警
- name: 实例存活告警规则
  rules:
  - alert: 实例存活告警
    expr: up == 0
    for: 1m
    labels:
      user: prometheus
      severity: warning
    annotations:
      description: "{{ $labels.instance }} of job {{ $labels.job }} has been down for more than 1 minutes."

# mem报警
- name: 内存报警规则
  rules:
  - alert: 内存使用率告警
    expr: (node_memory_MemTotal_bytes - (node_memory_MemFree_bytes+node_memory_Buffers_bytes+node_memory_Cached_bytes )) / node_memory_MemTotal_bytes * 100 > 80
    for: 1m
    labels:
      user: prometheus
      severity: warning
    annotations:
      description: "服务器: 内存使用超过80%!(当前值: {{ $value }}%)"


# disk报警
- name: 磁盘报警规则
  rules:
  - alert: 磁盘使用率告警
    expr: (node_filesystem_size_bytes - node_filesystem_avail_bytes) / node_filesystem_size_bytes * 100 > 30
    for: 1m
    labels:
      user: prometheus
      severity: warning
    annotations:
      description: "服务器: 磁盘设备: 使用超过30%!(挂载点: {{ $labels.mountpoint }} 当前值: {{ $value }}%)"


# cpu报警
- name: CPU报警规则
  rules:
  - alert: CPU使用率告警
    expr: 100 - (avg by (instance)(irate(node_cpu_seconds_total{mode="idle"}[1m]) )) * 100 > 30
    for: 1m
    labels:
      user: prometheus
      severity: warning
    annotations:
      description: "服务器: CPU使用超过30%!(当前值: {{ $value }}%)"

#目录大小告警(复制前面规则不会出错)
- name: 目录报警规则
  rules:
  - alert: 目录大小告警
    expr: node_directory_size_bytes > 10
    for: 1m
    labels:
      user: prometheus
      severity: warning
    annotations:
      description: "服务器: 目录大小超过10字节!(当前值: {{ $value }})"
EOF


#检查规则配置文件语法是否正确
promtool check rules /usr/local/prometheus/first_rules.yml

10、告警规则模板

/usr/local/prometheus-webhook-dingtalk/contrib/templates/legacy/template.tmpl

11、部署grafana

(1)下载安装grafana

复制代码
cd prometheus-1/
curl -O https://dl.grafana.com/oss/release/grafana-7.1.3.linux-amd64.tar.gz 或者 wget -c https://dl.grafana.com/oss/release/grafana-7.1.3.linux-amd64.tar.gz
tar -C /usr/local -zxvf grafana-7.1.3.linux-amd64.tar.gz
mv /usr/local/grafana-7.1.3 /usr/local/grafana
mkdir /usr/local/grafana/{data,log}
chown -R prometheus:prometheus /usr/local/grafana

(2)修改配置文件

/usr/local/grafana/conf/

复制代码
cd /usr/local/grafana/conf/
cp defaults.ini grafana.ini

vim grafana.ini
	# logs = data/log
	logs = log

(3)配置服务启动脚本

/usr/lib/systemd/system/grafana-server.service

复制代码
cat >> /usr/lib/systemd/system/grafana-server.service <<EOF
[Unit]
Description=Grafana instance
Documentation=http://docs.grafana.org
Wants=network-online.target
After=network-online.target
After=postgresql.service mariadb.service mysqld.service

[Service]
Type=simple
User=prometheus
Group=prometheus
WorkingDirectory=/usr/local/grafana
ExecStart=/usr/local/grafana/bin/grafana-server \
    --config=/usr/local/grafana/conf/grafana.ini \
    --pidfile=/usr/local/grafana/grafana-server.pid

Restart=on-failure
LimitNOFILE=10000
TimeoutStopSec=20

[Install]
WantedBy=multi-user.target
EOF

systemctl daemon-reload && systemctl enable grafana-server && systemctl start grafana-server
systemctl status grafana-server
netstat -ntlp | grep 3000

网页访问:139.224.12.165:3000
默认账号/密码:admin/admin


URL:http://139.224.12.165:9090
相关推荐
weixin_307779131 小时前
Clickhouse统计指定表中各字段的空值、空字符串或零值比例
运维·数据仓库·clickhouse
bubiyoushang8882 小时前
Windows11 WSL2 Ubuntu编译安装perf工具
linux·运维·ubuntu
xuanwojiuxin3 小时前
linux panic-propagation
linux·运维·服务器
藥瓿亭5 小时前
K8S认证|CKS题库+答案| 9. 网络策略 NetworkPolicy
linux·运维·docker·云原生·容器·kubernetes·cks
liuzhenghua666 小时前
Python任务调度模型
java·运维·python
黎相思6 小时前
应用层自定义协议与序列化
运维·服务器·网络
测试开发Kevin6 小时前
详解Jenkins Pipeline 中git 命令的使用方法
运维·jenkins
什么半岛铁盒7 小时前
Linux线程与进程关系及底层实现
java·linux·运维
langmeng1107 小时前
使用docker在3台服务器上搭建基于版本redis 6.x的一主两从模式
运维·redis·docker·容器·集群
jllllyuz7 小时前
如何为服务器生成TLS证书
运维·服务器·数据库