Helm 部署 Kube-Prometheus + Grafana + 钉钉告警:部署 Kube-Prometheus

背景

| 角色 | IP | K8S 版本 | 容器运行时 |
| --- | --- | --- | --- |
| k8s-master-1 | 172.16.16.108 | v1.24.1 | containerd://1.6.8 |
| k8s-node-1 | 172.16.16.109 | v1.24.1 | containerd://1.6.8 |
| k8s-node-2 | 172.16.16.110 | v1.24.1 | containerd://1.6.8 |

安装 kube-prometheus

复制代码
# Create the working directory for the chart and its values file, then enter it.
mkdir -p /data/yaml/kube-prometheus/prometheus && cd /data/yaml/kube-prometheus/prometheus

# Add the bitnami charts repository.
helm repo add bitnami https://charts.bitnami.com/bitnami

# Search the configured repositories for kube-prometheus charts.
helm search repo kube-prometheus

# Download the chart archive, pinned to version 8.3.0.
helm pull bitnami/kube-prometheus --version 8.3.0

# Unpack the downloaded archive.
# NOTE(review): the helm install commands below pass `kube-prometheus` as the
# chart path, presumably the directory created by this extraction - confirm.
tar -zxvf kube-prometheus-8.3.0.tgz

# Write the values overrides used by both helm install invocations below:
# storage class, NodePort services and persistence for Prometheus and
# Alertmanager, and an Alertmanager receiver that posts to a DingTalk webhook.
# The heredoc delimiter is unquoted, so the shell would expand any `$` or
# backticks inside the body; the body currently contains none.
# NOTE(review): under alertmanager.config.route.routes the list item has a bare
# `match:` (parsed as null) with `receiver` as its sibling key - confirm that a
# catch-all child route is what is intended here.
cat > my-values.yaml << EOF
global:
  storageClass: "nfs-client"  # 默认 storageClass

prometheus:
  service:
    type: NodePort      # 配置 NodePort
    nodePorts: 
      http: 30090       # 配置 NodePort 端口
  persistence:
    enabled: true       # 开启持久化
    size: 9Gi           # 存储大小

alertmanager:
  service:
    type: NodePort      # 配置 NodePort
    nodePorts: 
      http: 30093       # 配置 NodePort 端口
  persistence:
    enabled: true       # 开启持久化
    size: 9Gi           # 存储大小
  config:
    route:
      receiver: 'devops'   # 告警接收者
      routes:
        - match:
          receiver: 'devops'
    receivers:
      - name: 'devops'       # 告警接收者
        webhook_configs:
        - url: 'http://prometheus-webhook-dingtalk.kube-prometheus:8060/dingtalk/devops/send'     # 注意这里的 devops 需要与 prometheus-webhook-dingtalk 中的 --ding.profile 值相同
          send_resolved: true
EOF

# Create the namespace the release is installed into.
kubectl create ns kube-prometheus

# Dry run: same arguments as the real install below, plus --dry-run.
helm install --namespace kube-prometheus prometheus -f my-values.yaml --dry-run  kube-prometheus

# Install the release named `prometheus` with the overrides from my-values.yaml.
helm install --namespace kube-prometheus prometheus -f my-values.yaml  kube-prometheus

# List the Helm releases in the namespace.
helm -n kube-prometheus ls

# List the pods in the namespace.
kubectl -n kube-prometheus get pod

访问 Prometheus

http://172.16.16.108:30090/

配置 Pod 告警策略

复制代码
# Create a directory for the alerting-rule manifests and enter it.
mkdir -p /data/yaml/kube-prometheus/prometheus/rules && cd /data/yaml/kube-prometheus/prometheus/rules

# Write the PrometheusRule manifest holding the Pod-level alerts.
#   * `>` (not `>>`) overwrites the file, so re-running this step cannot append
#     a second copy of the manifest to an existing k8s-pod-rules.yaml.
#   * The quoted 'EOF' delimiter keeps the body literal: the {{ $labels.* }} and
#     {{ $value }} templates must reach Prometheus unexpanded by the shell.
#   * Annotations only reference labels kept by each expression's `by (...)`
#     clause; `instance` is aggregated away, so `pod` is used instead.
#   * The network rules divide bytes/s by 1024 twice, so the value is reported
#     in MB/s (megabytes), not Mbps (megabits).
cat > k8s-pod-rules.yaml << 'EOF'
apiVersion: monitoring.coreos.com/v1
kind: PrometheusRule
metadata:
  labels:
    prometheus-name: kube-prometheus-prometheus
    managed-by: prometheus-operator
  name: prometheus-k8s-pod-rules
  namespace: kube-prometheus
spec:
  groups:
  - name: PodMemUsage
    rules:
    - alert: Pod内存使用率告警
      expr: sum by (pod, namespace, job, container) (container_memory_working_set_bytes{pod!="",container !=""}) / sum by (pod, namespace, job, container) (container_spec_memory_limit_bytes{pod!="",container !=""}) * 100 != +Inf > 95
      for: 1m
      labels:
        severity: 紧急告警
        service: pods
      annotations:
        description: "{{ $labels.pod }}: 当前Pod内存使用率大于95% ,使用率为: {{ $value }}"
        summary: "Pod:{{ $labels.pod }} 检测到内存使用率超过limit值95%"

  - name: Pod_cpu
    rules:
    - alert: Pod_CPU使用率告警
      expr: sum(irate(container_cpu_usage_seconds_total{pod!="",container !=""}[1m])) by (container, pod) / (sum(container_spec_cpu_quota{pod!="",container !=""}/100000) by (container, pod)) * 100 > 130
      for: 1m
      labels:
        severity: 严重告警
        service: pods
      annotations:
        description: "{{$labels.pod}}: 一分钟内Pod的cpu使用率大于130%,当前的使用率为: {{ $value }}"


  - name: Pod_Network_rx
    rules:
    - alert: Pod网络IO(rx方向)告警
      expr: (sum (rate (container_network_receive_bytes_total{pod!=""}[1m])) by (pod)) / 1024  / 1024 > 200
      for: 1m
      labels:
        severity: 严重告警
        service: pods
      annotations:
        description: "{{ $labels.pod }}: 一分钟内Pod的Pod网络IO(rx方向)大于200MB/s,当前的值为: {{ $value }} MB/s"
        summary: "Pod:{{ $labels.pod }} 检测到一分钟内网络IO(rx方向)过高"

  - name: Pod_Network_tx
    rules:
    - alert: Pod网络IO(tx方向)告警
      expr: (sum (rate (container_network_transmit_bytes_total{pod!=""}[1m])) by (pod)) / 1024 / 1024 > 200
      for: 1m
      labels:
        severity: 严重告警
        service: pods
      annotations:
        description: "{{ $labels.pod }}: 一分钟内Pod的Pod网络IO(tx方向)大于200MB/s,当前的值为: {{ $value }} MB/s"
        summary: "检测到一分钟内Pod网络IO(tx方向)过高"

  - name: imagepullbackoff
    rules:
    - alert: 拉取镜像失败
      expr: kube_pod_container_status_waiting_reason{reason="ImagePullBackOff"} == 1
      for: 1m
      labels:
        severity: 紧急告警
      annotations:
        summary: "POD:{{ $labels.pod }} 拉取镜像失败,无法创建容器"
        description: "请确认镜像是否存在"

  - name: Pod_Start_Exception
    rules:
    - alert: POD 资源配置不正确
      expr: sum by (namespace, pod) (kube_pod_status_phase{ phase=~"Pending|Unknown"}) == 1
      for: 15s
      labels:
        severity: 紧急告警
      annotations:
        summary: "POD:{{ $labels.pod }} 启动失败,请及时查看"
        description: "POD 无法正常启动,请查看资源是否配置正确"

  - name: crashloopbackoff
    rules:
    - alert: POD启动失败
      expr: kube_pod_container_status_waiting_reason{reason="CrashLoopBackOff"} == 1
      for: 1m
      labels:
        severity: 紧急告警
      annotations:
        summary: "POD:{{ $labels.pod }} 启动失败,请查看程序日志"
        description: "确认配置参数是否正确"
EOF

# Create or update the PrometheusRule object from the manifest.
kubectl apply -f k8s-pod-rules.yaml

# List the ConfigMaps in the namespace after applying the rules.
# NOTE(review): presumably this is to confirm the operator regenerated its
# rule-file ConfigMap - verify against the deployed operator.
kubectl -n kube-prometheus get cm
相关推荐
企鹅侠客1 天前
Prometheus operator怎么添加targets和告警规则
运维·云原生·kubernetes·prometheus·pod
wu8587734572 天前
【实战指南】Spring Boot + Grafana 实时监控API请求与异常,让系统问题无处可藏
spring boot·grafana
木二2 天前
附042.Kubernetes_v1.32.3生成环境高可用部署
云原生·kubernetes·prometheus·ingress·longhorn
yunson_Liu3 天前
kubernet在prometheus+alertmanager+grafana框架下新增部署loki模块
grafana·prometheus·loki
时空无限3 天前
grafana 配置页面告警
grafana
维C°3 天前
Grafana-查询和转换数据
grafana
树下一少年3 天前
docker-compose部署prometheus+grafana+node_exporter
docker·json·grafana·prometheus·node_exporter
云上艺旅9 天前
K8S学习之基础四十一:Prometheus基于Pushgateway采集数据
学习·云原生·容器·kubernetes·prometheus
云上艺旅10 天前
K8S学习之基础三十八:Kube-static-metrics监控
学习·云原生·容器·kubernetes·prometheus
sj116373940312 天前
Prometheus Exporter系列-Mysql_Exporter一键部署
prometheus