Helm 部署 Kube-Prometheus + Grafana + 钉钉告警 —— 部署 Kube-Prometheus

背景

| 角色 | IP | K8S 版本 | 容器运行时 |
| --- | --- | --- | --- |
| k8s-master-1 | 172.16.16.108 | v1.24.1 | containerd://1.6.8 |
| k8s-node-1 | 172.16.16.109 | v1.24.1 | containerd://1.6.8 |
| k8s-node-2 | 172.16.16.110 | v1.24.1 | containerd://1.6.8 |

安装 kube-prometheus

复制代码
# Create the working directory for this deployment and switch into it
# (the cd only runs if mkdir succeeded).
mkdir -p /data/yaml/kube-prometheus/prometheus && cd /data/yaml/kube-prometheus/prometheus

# Register the Bitnami charts repository under the local name "bitnami".
helm repo add bitnami https://charts.bitnami.com/bitnami

# List charts in the configured repositories that match "kube-prometheus".
helm search repo kube-prometheus

# Download chart version 8.3.0 as an archive into the current directory.
helm pull bitnami/kube-prometheus --version 8.3.0

# Unpack the downloaded archive.
# NOTE(review): presumably this creates ./kube-prometheus, the chart path the
# later "helm install" commands refer to - confirm.
tar -zxvf kube-prometheus-8.3.0.tgz

# Write the chart value overrides to my-values.yaml. The quoted 'EOF' delimiter
# makes the shell copy the body verbatim (no parameter or command expansion).
cat > my-values.yaml << 'EOF'
global:
  storageClass: "nfs-client"  # default storageClass

prometheus:
  service:
    type: NodePort      # expose the service as a NodePort
    nodePorts:
      http: 30090       # NodePort port number
  persistence:
    enabled: true       # enable persistence
    size: 9Gi           # storage size

alertmanager:
  service:
    type: NodePort      # expose the service as a NodePort
    nodePorts:
      http: 30093       # NodePort port number
  persistence:
    enabled: true       # enable persistence
    size: 9Gi           # storage size
  config:
    route:
      receiver: 'devops'   # alert receiver
      routes:
        # Single child route with no label conditions. The empty map is written
        # explicitly instead of a bare "match:" (which parses as null).
        # NOTE(review): this list presumably replaces the chart's default
        # routes, so it is kept rather than removed - confirm against the chart.
        - match: {}
          receiver: 'devops'
    receivers:
      - name: 'devops'       # alert receiver
        webhook_configs:
          # The "devops" path segment must equal the --ding.profile value
          # configured in prometheus-webhook-dingtalk.
          - url: 'http://prometheus-webhook-dingtalk.kube-prometheus:8060/dingtalk/devops/send'
            send_resolved: true
EOF

# Create the namespace that will hold the release.
kubectl create namespace kube-prometheus

# Dry run: simulate the install with the overrides applied.
helm install prometheus kube-prometheus \
  --namespace kube-prometheus \
  --values my-values.yaml \
  --dry-run

# Install the release for real.
helm install prometheus kube-prometheus \
  --namespace kube-prometheus \
  --values my-values.yaml

# Inspect the release and the pods in the namespace.
helm list --namespace kube-prometheus

kubectl get pods --namespace kube-prometheus

访问 Prometheus

http://172.16.16.108:30090/

配置 Pod 告警策略

复制代码
# Create the directory for rule manifests and switch into it.
mkdir -p /data/yaml/kube-prometheus/prometheus/rules && cd /data/yaml/kube-prometheus/prometheus/rules

# Write the PrometheusRule manifest.
# - '>' truncates the file, so re-running this step does not append a second
#   copy of the document to an existing k8s-pod-rules.yaml.
# - The quoted 'EOF' delimiter stops the shell from expanding $labels / $value;
#   they are alert template variables and must reach the file verbatim.
cat > k8s-pod-rules.yaml << 'EOF'
apiVersion: monitoring.coreos.com/v1
kind: PrometheusRule
metadata:
  labels:
    prometheus-name: kube-prometheus-prometheus
    managed-by: prometheus-operator
  name: prometheus-k8s-pod-rules
  namespace: kube-prometheus
spec:
  groups:
  # Memory: working set as a percentage of the container memory limit.
  - name: PodMemUsage
    rules:
    - alert: Pod内存使用率告警
      # "!= +Inf" drops series whose division result is infinite (limit of 0).
      expr: sum by (pod, namespace, job, container) (container_memory_working_set_bytes{pod!="",container !=""}) / sum by (pod, namespace, job, container) (container_spec_memory_limit_bytes{pod!="",container !=""}) * 100 != +Inf > 95
      for: 1m
      labels:
        severity: 紧急告警
        service: pods
      annotations:
        # The expression keeps only pod/namespace/job/container labels, so the
        # message identifies the pod through those instead of "instance".
        description: "{{ $labels.namespace }}/{{ $labels.pod }}: 当前Pod内存使用率大于95% ,使用率为: {{ $value }}"
        summary: "Pod:{{ $labels.pod }} 检测到内存使用率超过limit值95%"

  # CPU: usage rate as a percentage of the CPU quota (quota / 100000).
  - name: Pod_cpu
    rules:
    - alert: Pod_CPU使用率告警
      expr: sum(irate(container_cpu_usage_seconds_total{pod!="",container !=""}[1m])) by (container, pod) / (sum(container_spec_cpu_quota{pod!="",container !=""}/100000) by (container, pod)) * 100 > 130
      for: 1m
      labels:
        severity: 严重告警
        service: pods
      annotations:
        description: "{{$labels.pod}}: 一分钟内Pod的cpu使用率大于130%,当前的使用率为: {{ $value }}"

  # Network receive: bytes/s divided by 1024 twice, i.e. MiB/s per pod.
  - name: Pod_Network_rx
    rules:
    - alert: Pod网络IO(rx方向)告警
      expr: (sum (rate (container_network_receive_bytes_total{pod!=""}[1m])) by (pod)) / 1024  / 1024 > 200
      for: 1m
      labels:
        severity: 严重告警
        service: pods
      annotations:
        # Only the "pod" label survives the aggregation; the unit matches the
        # bytes -> MiB conversion in the expression.
        description: "{{ $labels.pod }}: 一分钟内Pod的Pod网络IO(rx方向)大于200MiB/s,当前的值为: {{ $value }} MiB/s"
        summary: "Pod:{{ $labels.pod }} 检测到一分钟内网络IO(rx方向)过高"

  # Network transmit: bytes/s divided by 1024 twice, i.e. MiB/s per pod.
  - name: Pod_Network_tx
    rules:
    - alert: Pod网络IO(tx方向)告警
      expr: (sum (rate (container_network_transmit_bytes_total{pod!=""}[1m])) by (pod)) / 1024 / 1024 > 200
      for: 1m
      labels:
        severity: 严重告警
        service: pods
      annotations:
        # Only the "pod" label survives the aggregation; the unit matches the
        # bytes -> MiB conversion in the expression.
        description: "{{ $labels.pod }}: 一分钟内Pod的Pod网络IO(tx方向)大于200MiB/s,当前的值为: {{ $value }} MiB/s"
        summary: "Pod:{{ $labels.pod }} 检测到一分钟内网络IO(tx方向)过高"

  # Container stuck waiting with reason ImagePullBackOff.
  - name: imagepullbackoff
    rules:
    - alert: 拉取镜像失败
      expr: kube_pod_container_status_waiting_reason{reason="ImagePullBackOff"} == 1
      for: 1m
      labels:
        severity: 紧急告警
      annotations:
        summary: "POD:{{ $labels.pod }} 拉取镜像失败,无法创建容器"
        description: "请确认镜像是否存在"

  # Pod reported in phase Pending or Unknown.
  - name: Pod_Start_Exception
    rules:
    - alert: POD 资源配置不正确
      expr: sum by (namespace, pod) (kube_pod_status_phase{ phase=~"Pending|Unknown"}) == 1
      for: 15s
      labels:
        severity: 紧急告警
      annotations:
        summary: "POD:{{ $labels.pod }} 启动失败,请及时查看"
        description: "POD 无法正常启动,请查看资源是否配置正确"

  # Container stuck waiting with reason CrashLoopBackOff.
  - name: crashloopbackoff
    rules:
    - alert: POD启动失败
      expr: kube_pod_container_status_waiting_reason{reason="CrashLoopBackOff"} == 1
      for: 1m
      labels:
        severity: 紧急告警
      annotations:
        summary: "POD:{{ $labels.pod }} 启动失败,请查看程序日志"
        description: "确认配置参数是否正确"
EOF

# Submit the rule manifest to the cluster.
kubectl apply --filename k8s-pod-rules.yaml

# Check: list the ConfigMaps in the kube-prometheus namespace.
kubectl get configmaps --namespace kube-prometheus
相关推荐
江南风月18 小时前
WGCLOUD保姆级教程最新版整理
运维·zabbix·运维开发·prometheus·日志审计
江南风月1 天前
Hermes Agent 接入WGCLOUD实战:打造团队 AI 智能运维解决方案
运维·zabbix·运维开发·prometheus
蓝宝石的傻话1 天前
VictoriaMetrics指标流聚合三年回顾与现状(2026)
go·prometheus·victoriametrics
_codemonster1 天前
Prometheus + Grafana + Alertmanager和ELK 栈(Elasticsearch + Logstash + Kibana)
elk·grafana·prometheus
IT WorryFree2 天前
Zabbix7.4 + Grafana 天蝎(Zabbix 官方插件数据源)完整配置 + 解决 412 报错
zabbix·grafana
gws8135391622 天前
Hyperf3.1接入服务器监控
grafana·prometheus·hyperf·metrics
Adorable老犀牛2 天前
MySQL Server Exporter:Prometheus 监控 MySQL/MariaDB 指南
mysql·prometheus·mariadb
成为你的宁宁2 天前
【K8S黑盒监控实践:Probe配置、Prometheus验证与Grafana可视化】
kubernetes·grafana·prometheus
成为你的宁宁2 天前
【Prometheus Operator监控K8S Nginx】
nginx·kubernetes·prometheus
人生匆匆2 天前
部署cadvisor+prometheus+grafana
grafana·prometheus