3. 部署redis服务并监控redis

一、环境清单

1. 节点信息

机器名称	机器 IP	操作系统	角色
k8s-master	10.132.47.60	Centos7.9	Master
k8s-node1	10.132.47.61	Centos7.9	Worker
k8s-node2	10.132.47.62	Centos7.9	Worker

2. 验证集群环境

bash 复制代码

# Node status: every node is expected to show Ready.
kubectl get node

# Prometheus components running in the monitoring namespace.
kubectl get pods --namespace monitoring

# Pod network: the CNI plugin pods (calico/flannel/...) should be healthy.
kubectl get pods --namespace kube-system --output wide

# Storage classes: nfs-client must be listed and usable.
kubectl get storageclass

二、部署redis服务

1. 部署redis

shell 复制代码

vim redis.yaml

yaml 复制代码

# ============== Part 1: dedicated namespace for Redis ==============
apiVersion: v1
kind: Namespace
metadata:
  labels:
    app: redis             # label shared by the Redis objects in this file
  name: redis-test         # Redis-only namespace, counterpart of mysql-test
---
# ============== Part 2: ConfigMap holding the Redis configuration ==============
apiVersion: v1
kind: ConfigMap
metadata:
  name: redis-config
  namespace: redis-test
data:
  # Full redis.conf; the Redis Deployment starts redis-server with
  # /etc/redis/redis.conf, which is where this key is mounted.
  redis.conf: |
    # Listen on all interfaces (required inside a container).
    bind 0.0.0.0
    # Protected mode off, otherwise only local clients may connect.
    # NOTE(review): with no requirepass and a NodePort Service, this instance is
    # reachable unauthenticated from outside the cluster - test use only.
    protected-mode no
    # Default Redis port.
    port 6379
    # Must stay in the foreground, or the container exits right after start.
    daemonize no
    # Log level.
    loglevel notice
    # Data directory; matches the PVC mount path.
    dir /data
    # AOF persistence.
    appendonly yes
    # fsync once per second.
    appendfsync everysec
    # Password placeholder; uncomment when authentication is needed.
    # requirepass "Redis@123456"
    # Keep maxmemory below the container memory limit (256Mi in the Redis
    # Deployment). Redis needs headroom on top of the dataset for buffers and
    # AOF rewrite; at 256mb the pod would be OOM-killed before eviction starts.
    maxmemory 192mb
    # Eviction policy once maxmemory is reached.
    maxmemory-policy allkeys-lru
---
# ============== Part 3: PVC that persists the Redis data directory ==============
apiVersion: v1
kind: PersistentVolumeClaim
metadata:
  labels:
    app: redis
  name: redis-pvc
  namespace: redis-test
spec:
  # Storage class that provisions the volume.
  storageClassName: nfs-client
  # Requested capacity; adjust to the workload.
  resources:
    requests:
      storage: 1Gi
  # Single-node read/write, matching the single Redis instance.
  accessModes:
    - ReadWriteOnce
---
# ============== Part 4: Redis Deployment ==============
apiVersion: apps/v1
kind: Deployment
metadata:
  labels:
    app: redis
  name: redis
  namespace: redis-test
spec:
  # Single instance; a Redis cluster would use a StatefulSet instead.
  replicas: 1
  # Recreate: the old pod is removed before the new one starts, so two pods
  # never mount the same PVC at once.
  strategy:
    type: Recreate
  selector:
    matchLabels:
      app: redis
  template:
    metadata:
      labels:
        app: redis
    spec:
      restartPolicy: Always
      # Volumes: configuration from the ConfigMap, data from the PVC.
      volumes:
      - name: redis-config-volume
        configMap:
          name: redis-config
      - name: redis-data
        persistentVolumeClaim:
          claimName: redis-pvc
      containers:
      - name: redis
        image: crpi-vdf183p7g8m4zpud.cn-hangzhou.personal.cr.aliyuncs.com/shy_my/redis:7.2.6
        imagePullPolicy: Always
        # Start the server with the mounted configuration file.
        command: ["redis-server", "/etc/redis/redis.conf"]
        ports:
        - containerPort: 6379
        # The ConfigMap is mounted as a whole directory (no subPath).
        volumeMounts:
        - name: redis-config-volume
          mountPath: /etc/redis/
        - name: redis-data
          mountPath: /data
        # Resource requests and limits.
        resources:
          requests:
            cpu: 100m
            memory: 128Mi
          limits:
            cpu: 200m
            memory: 256Mi
        # Readiness: `redis-cli ping` must succeed before traffic is routed.
        readinessProbe:
          initialDelaySeconds: 10
          periodSeconds: 5
          timeoutSeconds: 3
          exec:
            command:
            - redis-cli
            - ping
        # Liveness: the same ping, checked less aggressively.
        livenessProbe:
          initialDelaySeconds: 30
          periodSeconds: 10
          timeoutSeconds: 5
          exec:
            command:
            - redis-cli
            - ping
---
# ============== Part 5: Redis Service (stable access point) ==============
apiVersion: v1
kind: Service
metadata:
  labels:
    app: redis
  name: redis              # in-cluster DNS name: redis.redis-test.svc
  namespace: redis-test
spec:
  # NodePort exposes Redis on every node for access from outside the cluster.
  type: NodePort
  selector:
    app: redis
  ports:
  - name: redis-port       # named so monitoring can match on it later
    protocol: TCP
    port: 6379             # Service port inside the cluster
    targetPort: 6379       # container port
    nodePort: 30379        # port opened on each node

2. 验证服务

bash 复制代码

kubectl apply -f redis.yaml

# redis.yaml creates no Secret, so check the namespace itself instead.
kubectl get namespace redis-test

# The redis-config ConfigMap should exist.
kubectl get configmaps -n redis-test

# The PVC should be Bound (a PV was dynamically provisioned).
kubectl get pvc -n redis-test

# The Redis pod should be Running and Ready.
kubectl get pods -n redis-test

# The Service should exist; note the NodePort.
kubectl get svc -n redis-test

三、创建redis监控服务

1.创建文件

复制代码

vim redis-exporter.yaml

yaml 复制代码

# 1. ConfigMap for redis-exporter (counterpart of mysql-exporter-cm).
#    The monitored Redis has no password, so there is nothing to configure yet.
apiVersion: v1
kind: ConfigMap
metadata:
  name: redis-exporter-cm
  namespace: monitoring
# Explicit empty map: a bare `data:` followed only by comments parses as null.
data: {}
# If Redis gets a password later, keep it in a Secret rather than in this
# ConfigMap, and expose it to the exporter as the REDIS_PASSWORD env variable.
---
# 2. redis-exporter Deployment (counterpart of the mysql-exporter Deployment).
apiVersion: apps/v1
kind: Deployment
metadata:
  name: redis-exporter
  namespace: monitoring
  labels:
    app: redis-exporter
spec:
  replicas: 1
  selector:
    matchLabels:
      app: redis-exporter
  template:
    metadata:
      labels:
        app: redis-exporter
    spec:
      containers:
      - name: redis-exporter
        image: oliver006/redis_exporter:v1.50.0
        command:
        - /redis_exporter
        # Use the in-cluster Service DNS name instead of a node IP + NodePort:
        # it keeps working when a node is replaced, and scraping no longer
        # depends on Redis being exposed outside the cluster.
        - --redis.addr=redis://redis.redis-test.svc:6379
        # No securityContext: the exporter only listens on the unprivileged
        # port 9121, so it does not need to run as root (runAsUser: 0 removed).
        ports:
        - containerPort: 9121  # redis_exporter default port
          name: metrics        # port name referenced by the Service/ServiceMonitor
        # When Redis gets a password, create a Secret and uncomment the block
        # below; the exporter reads REDIS_PASSWORD from its environment.
        # env:
        # - name: REDIS_PASSWORD
        #   valueFrom:
        #     secretKeyRef:
        #       name: redis-exporter-secret
        #       key: REDIS_PASSWORD
        # No volumeMounts are needed; the previous bare `volumeMounts:` key
        # (null value) has been dropped.
        resources:
          limits:
            cpu: 100m
            memory: 128Mi
          requests:
            cpu: 50m
            memory: 64Mi
---
# 3. redis-exporter Service (counterpart of the mysql-exporter Service).
apiVersion: v1
kind: Service
metadata:
  labels:
    app: redis-exporter  # label the ServiceMonitor selects on
  name: redis-exporter
  namespace: monitoring
spec:
  type: ClusterIP
  selector:
    app: redis-exporter
  ports:
  - name: metrics        # port name matched by the ServiceMonitor endpoint
    port: 9121           # redis_exporter port
    targetPort: 9121
---
# 4. redis-exporter ServiceMonitor (counterpart of the mysql-exporter one).
apiVersion: monitoring.coreos.com/v1
kind: ServiceMonitor
metadata:
  labels:
    app: redis-exporter
  name: redis-exporter
  namespace: monitoring
spec:
  # Only look for Services in the monitoring namespace ...
  namespaceSelector:
    matchNames:
    - monitoring
  # ... that carry the exporter Service's label.
  selector:
    matchLabels:
      app: redis-exporter
  endpoints:
  - path: /metrics       # metrics path
    port: metrics        # Service port name
    interval: 15s        # scrape interval

2. 验证资源创建

shell 复制代码

kubectl apply --filename redis-exporter.yaml

# ConfigMap exists.
kubectl get configmaps --namespace monitoring | grep redis-exporter-cm

# Pod and Service exist.
kubectl get pod,svc --namespace monitoring --selector app=redis-exporter

# ServiceMonitor exists.
kubectl get servicemonitor redis-exporter --namespace monitoring

# End-to-end check.
# Fetch the ClusterIP of the redis-exporter Service ...
REDIS_EXPORTER_IP=$(kubectl get svc redis-exporter --namespace monitoring --output jsonpath='{.spec.clusterIP}')
# ... print it ...
echo "$REDIS_EXPORTER_IP"
# ... and query the metrics endpoint for redis_up (the key check).
curl -s "http://${REDIS_EXPORTER_IP}:9121/metrics" | grep redis_up
#返回
# HELP redis_up Information about the Redis instance
# TYPE redis_up gauge
redis_up 1
# HELP redis_uptime_in_seconds uptime_in_seconds metric
# TYPE redis_uptime_in_seconds gauge
redis_uptime_in_seconds 2485

3. 访问 Prometheus Web UI

查看 redis-exporter 的 Target 状态（核心验证）

进入 UI 后，点击左侧菜单栏的 Status → Targets；
在页面右上角的搜索框输入 redis-exporter，过滤出目标；
重点看 State列：
- ✅ UP：说明 Prometheus 已成功发现并采集 redis-exporter 的指标（监控链路完全打通）；
- ❌ Down：采集失败（鼠标悬停在红色的 Down 上，会显示具体错误原因，比如端口不通、超时等）；
- 📶 无此条目：极少数情况，可重新应用 ServiceMonitor 配置：
  复制代码
```
kubectl apply -f redis-exporter.yaml -n monitoring
```

4. 导入 redis 监控面板

Grafana 官方有成熟的 redis 监控面板

ID：11835、763

四、Prometheus指标查询语句

bash 复制代码

# 3.1 Is the Redis exporter able to reach Redis?
redis_up{job="redis-exporter"}
# 3.2 Used memory; falls back to the second metric name when the first is absent.
# NOTE(review): confirm which of the two names exporter v1.50.0 actually exposes.
redis_used_memory{job="redis-exporter"} OR redis_memory_used_bytes{job="redis-exporter"}
# 3.3 Configured maximum memory, with the same fallback pattern.
redis_maxmemory{job="redis-exporter"} OR redis_memory_max_bytes{job="redis-exporter"}
# 3.4 Memory usage as a percentage of maxmemory.
(redis_memory_used_bytes{job="redis-exporter"} / redis_memory_max_bytes{job="redis-exporter"}) * 100
# 3.5 Cache hit ratio in percent (computed from the lifetime counters).
redis_keyspace_hits_total{job="redis-exporter"} / (redis_keyspace_hits_total{job="redis-exporter"} + redis_keyspace_misses_total{job="redis-exporter"}) * 100
# 3.6 Connected clients.
redis_connected_clients{job="redis-exporter"}
# 3.7 Command rate per second over a 1-minute window.
rate(redis_commands_processed_total{job="redis-exporter"}[1m])
# 3.8 Total number of expired keys.
redis_expired_keys_total{job="redis-exporter"}
# 3.9 Blocked clients.
redis_blocked_clients{job="redis-exporter"}
# 3.10 RDB background save in progress.
redis_rdb_bgsave_in_progress{job="redis-exporter"}
# 3.11 AOF rewrite in progress.
redis_aof_rewrite_in_progress{job="redis-exporter"}

五、添加redis告警规则

bash 复制代码

vim redis-rules.yaml

yaml 复制代码

apiVersion: monitoring.coreos.com/v1
kind: PrometheusRule
metadata:
  name: redis-rules
  namespace: monitoring
  labels:
    custom-rule: "true"  # 专属标签，用于筛选
    prometheus: k8s
    role: alert-rules
spec:
  groups:
  - name: redis.rules  # 保持和你模板一致的group名称
    rules:
    # === P0级（严重，立即处理）===
    # Redis down: fires when redis_up has been 0 for one minute.
    # NOTE(review): the runbook interpolates the namespace/pod labels of the
    # alerting series - confirm they identify the Redis pod and not the scraped
    # exporter pod.
    - alert: Redis服务不可用
      expr: redis_up == 0
      for: 1m
      labels:
        severity: critical
        level: P0
      annotations:
        summary: "Redis服务不可用"
        description: "Redis实例 {{ $labels.instance }} 无法连接 (状态码: {{ $value }})"
        runbook: |
          【K8s环境】
          1. 检查Redis Pod状态: kubectl get pods -n {{ $labels.namespace }} | grep redis
          2. 查看Redis容器日志: kubectl logs -n {{ $labels.namespace }} {{ $labels.pod }}
          3. 重启Redis Pod: kubectl delete pod -n {{ $labels.namespace }} {{ $labels.pod }}
          【Linux原生环境】
          1. 检查Redis进程状态: ps -ef | grep redis
          2. 查看Redis日志: cat /var/log/redis/redis-server.log
          3. 测试网络连通性: telnet {{ $labels.instance }} 6379
          4. 重启Redis服务: systemctl restart redis (或 ./redis-server redis.conf)

    # Replication broken: the replica's link to its master is down.
    # redis_exporter publishes this as redis_master_link_up (1 = up, 0 = down);
    # the previous name redis_master_link_status is not exported, so the alert
    # could never fire.
    - alert: Redis主从复制异常
      expr: redis_master_link_up == 0
      for: 1m
      labels:
        severity: critical
        level: P0
      annotations:
        summary: "Redis主从复制异常"
        description: "Redis从库 {{ $labels.instance }} 复制状态异常"
        runbook: |
          【K8s环境】
          1. 进入Redis容器: kubectl exec -it -n {{ $labels.namespace }} {{ $labels.pod }} -- bash
          2. 查看复制状态: redis-cli info replication
          3. 检查主库Service连通性: ping 主库Service名称.{{ $labels.namespace }}.svc.cluster.local
          【Linux原生环境】
          1. 登录Redis查看复制状态: redis-cli -h {{ $labels.instance }} info replication
          2. 检查主库地址/密码配置
          3. 重新配置主从: redis-cli -h {{ $labels.instance }} slaveof 主库IP 主库端口
          4. 检查主从网络互通性

    # Memory almost exhausted: used memory above 95% of maxmemory.
    # Uses the metric names exported by redis_exporter v1.x
    # (redis_memory_used_bytes / redis_memory_max_bytes). The trailing `and`
    # drops instances with maxmemory = 0 (unlimited), where the ratio is +Inf.
    # NOTE(review): the runbook steps that pipe `--scan` into `del` delete every
    # key in the instance - review before using them in an incident.
    - alert: Redis内存即将耗尽
      expr: redis_memory_used_bytes / redis_memory_max_bytes > 0.95 and redis_memory_max_bytes > 0
      for: 1m
      labels:
        severity: critical
        level: P0
      annotations:
        summary: "Redis内存即将耗尽"
        description: "Redis实例 {{ $labels.instance }} 内存使用率: {{ $value | humanizePercentage }} (超过95%)"
        runbook: |
          【K8s环境】
          1. 临时扩容Redis资源: kubectl edit statefulset -n {{ $labels.namespace }} redis名称
          2. 进入容器清理key: kubectl exec -it -n {{ $labels.namespace }} {{ $labels.pod }} -- redis-cli --scan | xargs redis-cli del
          3. 长期优化: 修改Redis ConfigMap中的maxmemory参数并重启
          【Linux原生环境】
          1. 紧急清理无效key: redis-cli -h {{ $labels.instance }} --scan | xargs redis-cli -h {{ $labels.instance }} del
          2. 临时扩容maxmemory: redis-cli -h {{ $labels.instance }} config set maxmemory 更大值
          3. 排查内存泄漏: redis-cli -h {{ $labels.instance }} info memory
          4. 重启Redis释放碎片（需先备份）

    # Persistence failure: the last RDB bgsave or the last AOF rewrite failed
    # (status metric == 0).
    # The description no longer references {{ $labels.aof_status }}: no such
    # label exists, so it always rendered empty, and $value may come from
    # either side of the `or`.
    - alert: Redis持久化失败
      expr: redis_rdb_last_bgsave_status == 0 or redis_aof_last_bgrewrite_status == 0
      for: 1m
      labels:
        severity: critical
        level: P0
      annotations:
        summary: "Redis持久化失败"
        description: "Redis实例 {{ $labels.instance }} 最近一次RDB或AOF持久化失败 (状态值: {{ $value }})"
        runbook: |
          【K8s环境】
          1. 检查持久化目录PVC空间: kubectl exec -it -n {{ $labels.namespace }} {{ $labels.pod }} -- df -h /data
          2. 查看容器内持久化日志: kubectl logs -n {{ $labels.namespace }} {{ $labels.pod }} | grep -i 'rdb\|aof'
          3. 扩容PVC: kubectl edit pvc -n {{ $labels.namespace }} redis-pvc名称
          【Linux原生环境】
          1. 查看持久化日志: cat /var/log/redis/redis-server.log | grep -i 'rdb\|aof'
          2. 检查磁盘空间: df -h (持久化目录所在分区)
          3. 手动触发RDB持久化: redis-cli -h {{ $labels.instance }} bgsave
          4. 检查AOF文件完整性: redis-check-aof --fix 持久化目录/appendonly.aof

    # Cluster node offline: fires when any series with state="fail" is above 0.
    # NOTE(review): redis_cluster_nodes_state does not look like a metric that
    # redis_exporter publishes (its cluster metrics are e.g. redis_cluster_state,
    # redis_cluster_slots_fail) - verify against the exporter's /metrics output,
    # otherwise this alert never fires.
    - alert: Redis集群节点下线
      expr: redis_cluster_nodes_state{state="fail"} > 0
      for: 1m
      labels:
        severity: critical
        level: P0
      annotations:
        summary: "Redis集群节点下线"
        description: "Redis集群 {{ $labels.instance }} 有 {{ $value }} 个节点处于故障状态"
        runbook: |
          【K8s环境】
          1. 查看集群所有Pod状态: kubectl get pods -n {{ $labels.namespace }} -l app=redis-cluster
          2. 重启故障Pod: kubectl delete pod -n {{ $labels.namespace }} 故障Pod名称
          3. 检查集群Service: kubectl get svc -n {{ $labels.namespace }} redis-cluster
          【Linux原生环境】
          1. 查看集群状态: redis-cli -h {{ $labels.instance }} cluster nodes
          2. 检查故障节点进程: ps -ef | grep redis (故障节点IP)
          3. 重启故障节点: systemctl restart redis (或 ./redis-server redis.conf)
          4. 重新加入集群: redis-cli -h 故障节点IP cluster meet 集群主节点IP 6379

    # === P1级（警告，重点关注）===
    # Memory usage above 85% of maxmemory for 5 minutes.
    # Uses the metric names exported by redis_exporter v1.x; the trailing `and`
    # skips instances with maxmemory = 0 (unlimited), where the ratio is +Inf.
    - alert: Redis内存使用率过高
      expr: redis_memory_used_bytes / redis_memory_max_bytes > 0.85 and redis_memory_max_bytes > 0
      for: 5m
      labels:
        severity: warning
        level: P1
      annotations:
        summary: "Redis内存使用率过高"
        description: "Redis实例 {{ $labels.instance }} 内存使用率: {{ $value | humanizePercentage }} (超过85%)"
        runbook: |
          【K8s环境】
          1. 进入Redis容器: kubectl exec -it -n {{ $labels.namespace }} {{ $labels.pod }} -- bash
          2. 查看内存详情: redis-cli info memory
          3. 清理过期key: redis-cli --scan --pattern '*' | xargs -I {} redis-cli ttl {} | grep -E '^[0-9]+$' | xargs redis-cli del
          【Linux原生环境】
          1. 登录Redis: redis-cli -h {{ $labels.instance }}
          2. 查看内存详情: info memory
          3. 清理过期key: redis-cli -h {{ $labels.instance }} --scan --pattern '*' | xargs -I {} redis-cli -h {{ $labels.instance }} ttl {} | grep -E '^[0-9]+$' | xargs redis-cli -h {{ $labels.instance }} del
          4. 分析大key: redis-cli -h {{ $labels.instance }} --bigkeys

    # Cache hit ratio below 80% over the last 5 minutes.
    # The exporter's counters are redis_keyspace_hits_total and
    # redis_keyspace_misses_total; rate() turns them into a windowed ratio
    # instead of a lifetime average. With no traffic the ratio is NaN and the
    # comparison is false, so idle instances do not alert.
    - alert: Redis缓存命中率低
      expr: rate(redis_keyspace_hits_total[5m]) / (rate(redis_keyspace_hits_total[5m]) + rate(redis_keyspace_misses_total[5m])) < 0.8
      for: 5m
      labels:
        severity: warning
        level: P1
      annotations:
        summary: "Redis缓存命中率低"
        description: "Redis实例 {{ $labels.instance }} 命中率: {{ $value | humanizePercentage }} (低于80%)"
        runbook: |
          【K8s环境】
          1. 进入容器分析访问统计: kubectl exec -it -n {{ $labels.namespace }} {{ $labels.pod }} -- redis-cli info stats
          2. 修改ConfigMap调整缓存策略: kubectl edit configmap -n {{ $labels.namespace }} redis-config
          3. 重启Redis生效: kubectl rollout restart statefulset -n {{ $labels.namespace }} redis名称
          【Linux原生环境】
          1. 分析访问统计: redis-cli -h {{ $labels.instance }} info stats
          2. 优化缓存策略: 增加热点key过期时间
          3. 检查缓存穿透/击穿问题
          4. 调整maxmemory-policy为allkeys-lru: redis-cli -h {{ $labels.instance }} config set maxmemory-policy allkeys-lru

    # More than 10 blocked clients for 5 minutes.
    - alert: Redis阻塞客户端过多
      expr: redis_blocked_clients > 10
      for: 5m
      labels:
        severity: warning
        level: P1
      annotations:
        summary: "Redis阻塞客户端过多"
        description: "Redis实例 {{ $labels.instance }} 阻塞客户端数: {{ $value | humanize }}"
        runbook: |
          【K8s环境】
          1. 进入容器查看阻塞客户端: kubectl exec -it -n {{ $labels.namespace }} {{ $labels.pod }} -- redis-cli client list | grep blocked
          2. 查看慢查询: kubectl exec -it -n {{ $labels.namespace }} {{ $labels.pod }} -- redis-cli slowlog get 10
          3. 临时调整超时时间: kubectl exec -it -n {{ $labels.namespace }} {{ $labels.pod }} -- redis-cli config set timeout 300
          【Linux原生环境】
          1. 查看阻塞客户端: redis-cli -h {{ $labels.instance }} client list | grep blocked
          2. 分析阻塞命令: redis-cli -h {{ $labels.instance }} slowlog get 10
          3. 优化慢查询命令（如大key操作）
          4. 调整超时时间: redis-cli -h {{ $labels.instance }} config set timeout 300

    # Connected clients above 80% of the configured maxclients for 5 minutes.
    - alert: Redis连接数接近上限
      expr: redis_connected_clients / redis_config_maxclients > 0.8
      for: 5m
      labels:
        severity: warning
        level: P1
      annotations:
        summary: "Redis连接数接近上限"
        description: "Redis实例 {{ $labels.instance }} 连接数: {{ $value | humanizePercentage }} (超过最大连接数80%)"
        runbook: |
          【K8s环境】
          1. 进入容器查看连接: kubectl exec -it -n {{ $labels.namespace }} {{ $labels.pod }} -- redis-cli client list
          2. 修改ConfigMap调整maxclients: kubectl edit configmap -n {{ $labels.namespace }} redis-config
          3. 重启Redis生效: kubectl rollout restart statefulset -n {{ $labels.namespace }} redis名称
          【Linux原生环境】
          1. 查看所有连接: redis-cli -h {{ $labels.instance }} client list
          2. 关闭无效连接: redis-cli -h {{ $labels.instance }} client kill 客户端IP:端口
          3. 临时调整最大连接数: redis-cli -h {{ $labels.instance }} config set maxclients 更大值
          4. 长期优化: 修改redis.conf中maxclients参数并重启

    # Slow-query surge: more than 10 new slowlog entries within 5 minutes.
    # redis_slowlog_length is a gauge capped by slowlog-max-len, so rate() on it
    # is meaningless. redis_slowlog_last_id grows with every logged slow query,
    # so increase() over the window gives the number of new entries.
    - alert: Redis慢查询数激增
      expr: increase(redis_slowlog_last_id[5m]) > 10
      for: 5m
      labels:
        severity: warning
        level: P1
      annotations:
        summary: "Redis慢查询数激增"
        description: "Redis实例 {{ $labels.instance }} 5分钟内慢查询数: {{ $value | humanize }}"
        runbook: |
          【K8s环境】
          1. 查看慢查询日志: kubectl exec -it -n {{ $labels.namespace }} {{ $labels.pod }} -- redis-cli slowlog get 20
          2. 调整慢查询阈值: kubectl exec -it -n {{ $labels.namespace }} {{ $labels.pod }} -- redis-cli config set slowlog-log-slower-than 10000
          3. 长期优化: 修改ConfigMap中的slowlog配置并重启
          【Linux原生环境】
          1. 查看慢查询日志: redis-cli -h {{ $labels.instance }} slowlog get 20
          2. 优化慢查询命令（拆分大key、避免全量扫描）
          3. 调整慢查询阈值: redis-cli -h {{ $labels.instance }} config set slowlog-log-slower-than 10000
          4. 监控慢查询趋势: 定期执行slowlog get

    # Expired-key surge: more than 1000 keys expired within 5 minutes.
    # rate() yields a per-second value, which contradicts both the threshold's
    # intent and the description; increase() gives the count over the window.
    - alert: RedisKey过期数突增
      expr: increase(redis_expired_keys_total[5m]) > 1000
      for: 5m
      labels:
        severity: warning
        level: P1
      annotations:
        summary: "Redis Key过期数突增"
        description: "Redis实例 {{ $labels.instance }} 5分钟内过期key数: {{ $value | humanize }}"
        runbook: |
          【K8s环境】
          1. 分析key过期分布: kubectl exec -it -n {{ $labels.namespace }} {{ $labels.pod }} -- redis-cli info keyspace
          2. 开启异步删除: kubectl exec -it -n {{ $labels.namespace }} {{ $labels.pod }} -- redis-cli config set lazyfree-lazy-expire yes
          【Linux原生环境】
          1. 分析key过期分布: redis-cli -h {{ $labels.instance }} info keyspace
          2. 调整非热点key过期时间: 避免集中过期
          3. 检查是否批量设置过期key: 排查业务代码
          4. 开启过期key异步删除: redis-cli -h {{ $labels.instance }} config set lazyfree-lazy-expire yes

    # AOF rewrite running longer than 30 minutes.
    # redis_exporter exposes the duration of the in-progress rewrite as
    # redis_aof_current_rewrite_duration_sec (-1 when no rewrite is running);
    # redis_aof_rewrite_time_in_seconds is not an exported metric.
    - alert: RedisAOF重写耗时过长
      expr: redis_aof_current_rewrite_duration_sec > 1800
      for: 5m
      labels:
        severity: warning
        level: P1
      annotations:
        summary: "Redis AOF重写耗时过长"
        description: "Redis实例 {{ $labels.instance }} AOF重写耗时: {{ $value }} 秒 (超过30分钟)"
        runbook: |
          【K8s环境】
          1. 查看AOF状态: kubectl exec -it -n {{ $labels.namespace }} {{ $labels.pod }} -- redis-cli info aof
          2. 检查容器磁盘IO: kubectl exec -it -n {{ $labels.namespace }} {{ $labels.pod }} -- iostat -x 1
          3. 临时关闭自动AOF重写: kubectl exec -it -n {{ $labels.namespace }} {{ $labels.pod }} -- redis-cli config set auto-aof-rewrite-percentage 0
          【Linux原生环境】
          1. 查看AOF状态: redis-cli -h {{ $labels.instance }} info aof
          2. 检查磁盘IO负载: iostat -x 1 (持久化目录所在磁盘)
          3. 临时关闭自动AOF重写: redis-cli -h {{ $labels.instance }} config set auto-aof-rewrite-percentage 0
          4. 手动触发AOF重写（低峰期）: redis-cli -h {{ $labels.instance }} bgrewriteaof

    # === P2级（提示，按需处理）===
    # More than 1000 connected clients for 5 minutes.
    - alert: Redis连接数过高
      expr: redis_connected_clients > 1000
      for: 5m
      labels:
        severity: warning
        level: P2
      annotations:
        summary: "Redis连接数过高"
        description: "Redis实例 {{ $labels.instance }} 当前连接数: {{ $value | humanize }}"
        runbook: |
          【K8s环境】
          1. 查看连接详情: kubectl exec -it -n {{ $labels.namespace }} {{ $labels.pod }} -- redis-cli client list | wc -l
          2. 筛选无效连接: kubectl exec -it -n {{ $labels.namespace }} {{ $labels.pod }} -- redis-cli client list | grep -i 'idle' | awk '{print $1}' | xargs -I {} redis-cli client kill {}
          3. 调整连接超时: kubectl exec -it -n {{ $labels.namespace }} {{ $labels.pod }} -- redis-cli config set timeout 60
          【Linux原生环境】
          1. 查看连接详情: redis-cli -h {{ $labels.instance }} client list | wc -l
          2. 筛选无效连接: redis-cli -h {{ $labels.instance }} client list | grep -i 'idle' | awk '{print $1}' | xargs -I {} redis-cli -h {{ $labels.instance }} client kill {}
          3. 调整连接超时: redis-cli -h {{ $labels.instance }} config set timeout 60
          4. 检查业务侧连接池配置

    # Command rate above 10000 commands per second (1-minute rate window),
    # sustained for 5 minutes.
    - alert: Redis命令执行速率突增
      expr: rate(redis_commands_processed_total[1m]) > 10000
      for: 5m
      labels:
        severity: warning
        level: P2
      annotations:
        summary: "Redis命令执行速率突增"
        description: "Redis实例 {{ $labels.instance }} 命令执行速率: {{ $value | humanize }}次/秒 (阈值10000)"
        runbook: |
          【K8s环境】
          1. 查看高频命令: kubectl exec -it -n {{ $labels.namespace }} {{ $labels.pod }} -- redis-cli info commandstats | sort -k 2 -r
          2. 检查Pod网络: kubectl exec -it -n {{ $labels.namespace }} {{ $labels.pod }} -- netstat -an | grep 6379 | wc -l
          3. 结合NetworkPolicy限流高频IP访问
          【Linux原生环境】
          1. 查看高频命令: redis-cli -h {{ $labels.instance }} info commandstats | sort -k 2 -r
          2. 检查异常访问: netstat -an | grep 6379 | wc -l
          3. 临时限流: 结合iptables限制高频IP访问
          4. 优化高频命令: 如使用pipeline批量执行

    # Expired-key surge (P2): more than 1000 keys expired within 5 minutes.
    # increase() gives the count over the window; the previous rate() was a
    # per-second value and did not match the stated threshold.
    # NOTE(review): this has the same condition and threshold as the P1 alert
    # "RedisKey过期数突增" earlier in this group, so both fire together -
    # consider removing one or giving them different thresholds.
    - alert: Redis过期key数量突增
      expr: increase(redis_expired_keys_total[5m]) > 1000
      for: 5m
      labels:
        severity: warning
        level: P2
      annotations:
        summary: "Redis过期key数量突增"
        description: "Redis实例 {{ $labels.instance }} 5分钟内过期key数: {{ $value | humanize }} (阈值1000)"
        runbook: |
          【K8s环境】
          1. 分析过期key分布: kubectl exec -it -n {{ $labels.namespace }} {{ $labels.pod }} -- redis-cli --scan | xargs -I {} redis-cli ttl {} | sort | uniq -c
          2. 调整缓存淘汰策略: kubectl exec -it -n {{ $labels.namespace }} {{ $labels.pod }} -- redis-cli config set maxmemory-policy volatile-lru
          【Linux原生环境】
          1. 分析过期key分布: redis-cli -h {{ $labels.instance }} --scan | xargs -I {} redis-cli -h {{ $labels.instance }} ttl {} | sort | uniq -c
          2. 分散过期时间: 给key添加随机过期偏移量
          3. 优化缓存淘汰策略: redis-cli -h {{ $labels.instance }} config set maxmemory-policy volatile-lru
          4. 监控过期key趋势: 定期执行info stats

    # Redis内存碎片率过高（>1.5）
#    - alert: Redis内存碎片率过高
#      expr: redis_mem_fragmentation_ratio > 1.5
#      for: 10m
#      labels:
#        severity: warning
#        level: P2
#      annotations:
#        summary: "Redis内存碎片率过高"
#        description: "Redis实例 {{ $labels.instance }} 内存碎片率: {{ $value }} (阈值1.5)"
#        runbook: |
#          【K8s环境】
#          1. 查看碎片率详情: kubectl exec -it -n {{ $labels.namespace }} {{ $labels.pod }} -- redis-cli info memory | grep fragment
#          2. 开启主动碎片整理: kubectl exec -it -n {{ $labels.namespace }} {{ $labels.pod }} -- redis-cli config set activedefrag yes
#          3. 低峰期重启Pod: kubectl delete pod -n {{ $labels.namespace }} {{ $labels.pod }}
#          【Linux原生环境】
#          1. 查看碎片率详情: redis-cli -h {{ $labels.instance }} info memory | grep fragment
#          2. 临时缓解: 执行redis-cli -h {{ $labels.instance }} config set activedefrag yes
#          3. 彻底解决: 低峰期重启Redis（需先确认持久化完成）
#          4. 调整内存分配器: 修改redis.conf中allocator为jemalloc

    # Uneven slot distribution: a single node holding more than 5000 slots for 1h.
    # NOTE(review): redis_cluster_nodes_slots does not look like a metric that
    # redis_exporter publishes - verify against the exporter's /metrics output.
    # NOTE(review): a balanced 3-master cluster holds about 5461 slots per
    # master (16384 / 3), which is already above this threshold - confirm the
    # intended cluster size.
    - alert: Redis集群槽位分配不均
      expr: redis_cluster_nodes_slots > 5000
      for: 1h
      labels:
        severity: warning
        level: P2
      annotations:
        summary: "Redis集群槽位分配不均"
        description: "Redis集群 {{ $labels.instance }} 节点 {{ $labels.node }} 槽位数: {{ $value }} (超过5000)"
        runbook: |
          【K8s环境】
          1. 查看槽位分配详情: kubectl exec -it -n {{ $labels.namespace }} {{ $labels.pod }} -- redis-cli cluster slots
          2. 自动平衡槽位: kubectl exec -it -n {{ $labels.namespace }} {{ $labels.pod }} -- redis-cli cluster rebalance
          【Linux原生环境】
          1. 查看槽位分配详情: redis-cli -h {{ $labels.instance }} cluster slots
          2. 手动重新分配槽位: redis-cli -h {{ $labels.instance }} cluster reshard 目标节点ID
          3. 自动平衡槽位: redis-cli -h {{ $labels.instance }} cluster rebalance
          4. 验证平衡结果: redis-cli -h {{ $labels.instance }} cluster nodes | grep -i master | awk '{print $2, $8}'

验证规则生效

1.用 kubectl 验证 PrometheusRule 资源是否被正确创建

bash 复制代码

kubectl get prometheusrules -n monitoring | grep "redis-rules"

#看到：
NAME               AGE
redis-rules        2m11s

2.Prometheus UI

打开 Prometheus UI（http://<节点IP>:30883，将 <节点IP> 替换为任一集群节点的 IP）点击菜单：

Status → Rules

你应该能看到你创建的 rule group（名称取自 PrometheusRule 中 groups 下的 name 字段）：

redis.rules

如果这些 groups 都存在，说明 Prometheus 已经成功加载你的规则。

3. Grafana 导入告警规则面板

推荐用于展示告警信息的中文面板：

模板 ID：14858（k8s - alert）；

导入步骤：

Grafana → + → Import；
输入模板 ID（如14858）；
选择 Prometheus 数据源；