一、环境清单
1. 节点信息
| 机器名称 | 机器 IP | 操作系统 | 角色 |
|---|---|---|---|
| k8s-master | 10.132.47.60 | Centos7.9 | Master |
| k8s-node1 | 10.132.47.61 | Centos7.9 | Worker |
| k8s-node2 | 10.132.47.62 | Centos7.9 | Worker |
2. 验证集群环境
bash
# Check node status (every node should be in the Ready state)
kubectl get nodes
# Check the status of the Prometheus components
kubectl get pod -n monitoring
# Check the pod network (make sure the calico/flannel network plugin pods are healthy)
kubectl get pods -n kube-system -o wide
# Check storage classes (confirm that nfs-client exists and is usable)
kubectl get sc
二、部署postgresql服务
1. 部署postgresql
shell
vim postgresql.yaml
yaml
# ============== Part 1: dedicated namespace for PostgreSQL ==============
apiVersion: v1
kind: Namespace
metadata:
  name: postgresql-test  # mirrors the redis-test / mysql-test naming
  labels:
    app: postgresql  # common label convention used by the resources below
---
# ============== Part 2: ConfigMap holding the core PostgreSQL configuration ==============
# The text inside each block scalar is the literal content of the generated
# file, so it is kept exactly as authored (including its own comments).
apiVersion: v1
kind: ConfigMap
metadata:
  name: postgresql-config
  namespace: postgresql-test
data:
  # pg_hba.conf: client authentication rules; the last rule allows remote
  # access from any IPv4 address.
  pg_hba.conf: |
    # "local" is for Unix domain socket connections only
    local all all peer
    # IPv4 local connections:
    host all all 127.0.0.1/32 md5
    # IPv6 local connections:
    host all all ::1/128 md5
    # Allow replication connections from localhost
    local replication all peer
    host replication all 127.0.0.1/32 scram-sha-256
    host replication all ::1/128 scram-sha-256
    # 新增:允许所有IP远程访问(核心配置)
    host all all 0.0.0.0/0 md5
  # postgresql.conf: listen address, password hashing method and basic
  # server settings.
  postgresql.conf: |
    # 监听所有地址(核心配置)
    listen_addresses = '*'
    # 密码加密方式(和yum安装一致)
    password_encryption = md5
    # 基础配置(按需调整)
    port = 5432
    max_connections = 100
    shared_buffers = 128MB
    dynamic_shared_memory_type = posix
    log_destination = 'stderr'
    logging_collector = on
    log_directory = 'pg_log'
    log_filename = 'postgresql-%Y-%m-%d_%H%M%S.log'
    log_truncate_on_rotation = off
    log_rotation_age = 1d
    log_rotation_size = 100MB
    datestyle = 'iso, mdy'
    timezone = 'UTC'
    lc_messages = 'en_US.UTF-8'
    lc_monetary = 'en_US.UTF-8'
    lc_numeric = 'en_US.UTF-8'
    lc_time = 'en_US.UTF-8'
    default_text_search_config = 'pg_catalog.english'
---
# ============== Part 3: Secret holding the postgres user's password ==============
apiVersion: v1
kind: Secret
metadata:
  name: postgresql-secret
  namespace: postgresql-test
type: Opaque
data:
  # Base64 of the password Shyshy521521!
  # (generate with: echo -n "Shyshy521521!" | base64)
  postgres-password: U2h5c2h5NTIxNTIxIQ==
---
# ============== Part 4: PVC that persists the PostgreSQL data ==============
apiVersion: v1
kind: PersistentVolumeClaim
metadata:
  name: postgresql-pvc
  namespace: postgresql-test
  labels:
    app: postgresql
spec:
  accessModes:
    - ReadWriteOnce  # read-write from a single node
  storageClassName: nfs-client  # same storage class as the Redis/MySQL setups
  resources:
    requests:
      storage: 2Gi  # sized for PostgreSQL's comparatively larger data volume
---
# ============== Part 5: PostgreSQL Deployment ==============
apiVersion: apps/v1
kind: Deployment
metadata:
  name: postgresql
  namespace: postgresql-test
  labels:
    app: postgresql
spec:
  replicas: 1  # single instance
  selector:
    matchLabels:
      app: postgresql
  strategy:
    # Recreate terminates the old pod before the new one starts, so two
    # servers never open the same data directory during an update.
    type: Recreate
  template:
    metadata:
      labels:
        app: postgresql
    spec:
      restartPolicy: Always
      containers:
        - name: postgresql
          # PostgreSQL 15.7 image pulled from a private Aliyun registry.
          image: crpi-vdf183p7g8m4zpud.cn-hangzhou.personal.cr.aliyuncs.com/shy_my/postgresql:15.7
          imagePullPolicy: Always
          # Point the server at the ConfigMap-provided files explicitly.
          # PGDATA is /var/lib/postgresql/data/pgdata, so files mounted at
          # /var/lib/postgresql/data/*.conf are outside the data directory
          # and would never be read; they now live under /etc/postgresql.
          # NOTE(review): assumes the image keeps the official
          # docker-entrypoint.sh, which forwards these args to `postgres`
          # — confirm for this mirrored image.
          args:
            - postgres
            - "-c"
            - config_file=/etc/postgresql/postgresql.conf
            - "-c"
            - hba_file=/etc/postgresql/pg_hba.conf
          ports:
            - containerPort: 5432  # PostgreSQL default port
          env:
            # Superuser password, read from the Secret.
            - name: POSTGRES_PASSWORD
              valueFrom:
                secretKeyRef:
                  name: postgresql-secret
                  key: postgres-password
            - name: POSTGRES_USER
              value: postgres  # default superuser
            # Data directory: a subdirectory of the volume mount point.
            - name: PGDATA
              value: /var/lib/postgresql/data/pgdata
          # Probes use pg_isready rather than an authenticated psql query:
          # the mounted pg_hba.conf applies `peer` to Unix-socket logins, so
          # `psql -U postgres` run by the probe may be rejected even though
          # the server is healthy. pg_isready only checks that the server
          # accepts connections.
          # NOTE(review): for the same reason an interactive
          # `kubectl exec ... psql -U postgres` needs `-h 127.0.0.1`
          # (password auth) unless it runs as the postgres OS user.
          livenessProbe:
            exec:
              command: ["pg_isready", "-U", "postgres"]
            initialDelaySeconds: 30
            periodSeconds: 10
            timeoutSeconds: 5
          readinessProbe:
            exec:
              command: ["pg_isready", "-U", "postgres"]
            initialDelaySeconds: 10
            periodSeconds: 5
            timeoutSeconds: 3
          resources:
            limits:
              cpu: 250m
              memory: 256Mi
            requests:
              cpu: 125m
              memory: 128Mi
          volumeMounts:
            # Persistent data volume.
            - name: postgresql-data
              mountPath: /var/lib/postgresql/data
            # Both configuration files, mounted read-only outside the data
            # volume so they neither shadow nor pollute it.
            - name: postgresql-config
              mountPath: /etc/postgresql
              readOnly: true
      volumes:
        - name: postgresql-config
          configMap:
            name: postgresql-config
            items:
              - key: pg_hba.conf
                path: pg_hba.conf
              - key: postgresql.conf
                path: postgresql.conf
        - name: postgresql-data
          persistentVolumeClaim:
            claimName: postgresql-pvc
---
# ============== Part 6: PostgreSQL Service (stable access endpoint) ==============
apiVersion: v1
kind: Service
metadata:
  name: postgresql
  namespace: postgresql-test
  labels:
    app: postgresql
spec:
  type: NodePort  # exposed on every node for access from outside the cluster
  selector:
    app: postgresql
  ports:
    - name: postgresql-port
      protocol: TCP
      port: 5432  # port used inside the cluster
      targetPort: 5432  # container port
      nodePort: 30432  # node port (custom value chosen to avoid conflicts)
2. 验证服务
bash
# 1. Apply the manifest
kubectl apply -f postgresql.yaml
# 2. Verify that the resources were created
# Namespace
kubectl get ns | grep postgresql-test
# ConfigMap
kubectl get configmaps -n postgresql-test | grep postgresql-config
# Secret
kubectl get secrets -n postgresql-test | grep postgresql-secret
# PVC (should be Bound)
kubectl get pvc -n postgresql-test | grep postgresql-pvc
# Pod (should be Running)
kubectl get pods -n postgresql-test | grep postgresql
# Service (shows the NodePort)
kubectl get svc -n postgresql-test | grep postgresql
# 3. Core check: connectivity test
# From inside the pod. `deploy/postgresql` lets kubectl pick the pod, so no
# pod name has to be pasted in (a literal <...> placeholder would be parsed
# by the shell as redirection). `-h 127.0.0.1` connects over TCP, so the check
# does not depend on the `peer` rule that the ConfigMap's pg_hba.conf applies
# to Unix-socket connections; enter the password if prompted.
kubectl exec -it -n postgresql-test deploy/postgresql -- psql -h 127.0.0.1 -U postgres -c "SELECT version();"
# Or from outside the cluster (requires a psql client; replace the IP with one of your node IPs)
psql -h 10.132.47.60 -p 30432 -U postgres
# Enter the password Shyshy521521! when prompted; a successful login means remote access is configured correctly
三、创建postgresql监控服务
1.创建文件
vim postgres-exporter.yaml
yaml
# 1. ConfigMap: connection settings for the exporter
apiVersion: v1
kind: ConfigMap
metadata:
  name: postgres-exporter-cm  # counterpart of redis-exporter-cm
  namespace: monitoring
data:
  # Connection string, injected into the exporter as an environment variable
  # (simpler than mounting a file). `%21` is the URL-encoded `!` of the
  # password. Quoted so YAML always reads it as one plain string.
  # NOTE(review): this keeps the database password in plain text in a
  # ConfigMap; consider moving it to a Secret.
  DATA_SOURCE_NAME: 'postgresql://postgres:Shyshy521521%21@10.132.47.60:30432/postgres?sslmode=disable'
---
# 2. Postgres Exporter Deployment
apiVersion: apps/v1
kind: Deployment
metadata:
  name: postgres-exporter
  namespace: monitoring
  labels:
    app: postgres-exporter
spec:
  replicas: 1
  selector:
    matchLabels:
      app: postgres-exporter
  template:
    metadata:
      labels:
        app: postgres-exporter
    spec:
      containers:
        - name: postgres-exporter
          # postgres_exporter image from the prometheuscommunity repository
          image: prometheuscommunity/postgres-exporter:v0.15.0
          command:
            - /bin/postgres_exporter
          env:
            # Connection string read from the ConfigMap.
            - name: DATA_SOURCE_NAME
              valueFrom:
                configMapKeyRef:
                  name: postgres-exporter-cm
                  key: DATA_SOURCE_NAME
          # The exporter only opens an outbound database connection and
          # listens on an unprivileged port, so it does not need root.
          # 65534 is the conventional `nobody` UID; a numeric UID is set
          # explicitly so runAsNonRoot can be verified by the kubelet.
          securityContext:
            runAsNonRoot: true
            runAsUser: 65534
            allowPrivilegeEscalation: false
          ports:
            - name: metrics  # port name referenced by the ServiceMonitor
              containerPort: 9187  # postgres_exporter default port
          resources:
            limits:
              cpu: 100m
              memory: 128Mi
            requests:
              cpu: 50m
              memory: 64Mi
---
# 3. Postgres Exporter Service
apiVersion: v1
kind: Service
metadata:
  name: postgres-exporter  # counterpart of redis-exporter
  namespace: monitoring
  labels:
    app: postgres-exporter
spec:
  type: ClusterIP
  selector:
    app: postgres-exporter
  ports:
    - name: metrics  # port name referenced by the ServiceMonitor
      port: 9187  # postgres_exporter default port
      targetPort: 9187
---
# 4. Postgres Exporter ServiceMonitor
apiVersion: monitoring.coreos.com/v1
kind: ServiceMonitor
metadata:
  name: postgres-exporter  # counterpart of redis-exporter
  namespace: monitoring
  labels:
    app: postgres-exporter
spec:
  # Select the exporter Service by label, restricted to the monitoring namespace.
  selector:
    matchLabels:
      app: postgres-exporter
  namespaceSelector:
    matchNames:
      - monitoring
  endpoints:
    - port: metrics  # must match the Service port name
      interval: 15s  # same scrape interval as the other exporters
      path: /metrics  # postgres_exporter default path
2. 验证资源创建
shell
# 1. Apply the manifest
kubectl apply -f postgres-exporter.yaml
# 2. Verify that the resources were created
# ConfigMap
kubectl get configmaps -n monitoring | grep postgres-exporter-cm
# Pod + Service
kubectl get pod,svc -n monitoring -l app=postgres-exporter
# ServiceMonitor
kubectl get servicemonitor postgres-exporter -n monitoring
# 3. Core check: inspect the exporter's metrics endpoint
# Get the Service cluster IP
POSTGRES_EXPORTER_IP=$(kubectl get svc postgres-exporter -n monitoring -o jsonpath='{.spec.clusterIP}')
# Show the IP
echo $POSTGRES_EXPORTER_IP
# Query the metrics endpoint (this filter shows the exporter's own metrics)
curl -s http://${POSTGRES_EXPORTER_IP}:9187/metrics | grep postgres
# Expected output:
# HELP postgres_exporter_build_info A metric with a constant '1' value labeled by version, revision, branch, goversion from which postgres_exporter was built, and the goos and goarch for the build.
# TYPE postgres_exporter_build_info gauge
postgres_exporter_build_info{branch="HEAD",goarch="amd64",goos="linux",goversion="go1.24.7",revision="320b684f3dcd3f09b6e694e43f9b389b1a8199e5",tags="unknown",version="0.18.1"} 1
# HELP postgres_exporter_config_last_reload_success_timestamp_seconds Timestamp of the last successful configuration reload.
# TYPE postgres_exporter_config_last_reload_success_timestamp_seconds gauge
postgres_exporter_config_last_reload_success_timestamp_seconds 0
# HELP postgres_exporter_config_last_reload_successful Postgres exporter config loaded successfully.
# TYPE postgres_exporter_config_last_reload_successful gauge
postgres_exporter_config_last_reload_successful 0
# Confirm the exporter can actually reach PostgreSQL. The database metrics
# are named pg_*, so the `grep postgres` filter above never shows pg_up.
curl -s http://${POSTGRES_EXPORTER_IP}:9187/metrics | grep '^pg_up'
# Expected output:
# pg_up 1
3. Prometheus Target 验证
- 访问 Prometheus Web UI(http://<Prometheus地址>:9090);
- 左侧菜单 Status → Targets;
- 搜索 postgres-exporter,确认 State 为 UP;
- 若为 Down,检查连接字符串(密码、IP、端口是否正确)。
4. Grafana 导入 PostgreSQL 中文监控面板
推荐适配 postgres_exporter 的中文面板:
- 模板 ID:9628(PostgreSQL Overview,可汉化);
导入步骤:
- Grafana → + → Import;
- 输入模板 ID(如 9628);
- 选择 Prometheus 数据源;
- 点击 Import,即可看到 PostgreSQL 监控图表(连接数、CPU、内存、表大小、查询速率等)。
四、Prometheus指标查询语句
bash
# PostgreSQL 监控查询语句(适配 postgres-exporter v0.15.0)
# 4.1 Check that the PostgreSQL exporter is working (per instance)
pg_up{job="postgres-exporter"}
# 4.2 PostgreSQL connection count
pg_stat_activity_count{job="postgres-exporter"}
# 4.3 PostgreSQL active sessions
pg_stat_activity_count{job="postgres-exporter",state="active"}
# 4.4 PostgreSQL database size
pg_database_size_bytes{job="postgres-exporter"}
# 4.10 PostgreSQL lock count
pg_locks_count{job="postgres-exporter"}
# --- The queries below are marked "unavailable" by the author ---
# NOTE(review): presumably these metric names are not exported by this
# exporter version, so the queries return no data — verify before use.
# 4.5 PostgreSQL tablespace size
pg_tablespace_size_bytes{job="postgres-exporter"}
# 4.6 PostgreSQL buffer cache hit ratio (percent)
sum(pg_buffer_cache_hits{job="postgres-exporter"}) / (sum(pg_buffer_cache_hits{job="postgres-exporter"}) + sum(pg_buffer_cache_misses{job="postgres-exporter"})) * 100
# 4.7 PostgreSQL committed transactions
pg_transactions_committed_total{job="postgres-exporter"}
# 4.8 PostgreSQL rolled-back transactions
pg_transactions_rolled_back_total{job="postgres-exporter"}
# 4.9 PostgreSQL slow queries (requires pg_stat_statements to be enabled)
sum(pg_stat_statements_total{job="postgres-exporter"}) by (queryid, datname)
# 4.11 PostgreSQL disk bytes read
pg_disk_read_bytes_total{job="postgres-exporter"}
# 4.12 PostgreSQL disk bytes written
pg_disk_write_bytes_total{job="postgres-exporter"}
五、添加PostgreSQL告警规则
bash
vim postgresql-rules.yaml
yaml
# Alerting rules for PostgreSQL, evaluated against postgres_exporter metrics.
# postgres_exporter publishes database metrics under the `pg_` prefix (the
# `postgres_exporter_` prefix is only used for the exporter's own build/config
# metrics), so expressions written against `postgres_*` database metrics never
# match any series. Rules with a known `pg_*` equivalent use it below; rules
# without one keep their original expression and carry a NOTE(review).
apiVersion: monitoring.coreos.com/v1
kind: PrometheusRule
metadata:
  name: postgresql-rules
  namespace: monitoring
  labels:
    custom-rule: "true"  # dedicated label used for filtering
    prometheus: k8s
    role: alert-rules
spec:
  groups:
    - name: postgresql.rules
      rules:
        # === P0 (critical, act immediately) ===
        # PostgreSQL is down / unreachable from the exporter.
        - alert: PostgreSQL服务不可用
          expr: pg_up == 0
          for: 1m
          labels:
            severity: critical
            level: P0
          annotations:
            summary: "PostgreSQL服务不可用"
            description: "PostgreSQL实例 {{ $labels.instance }} 无法连接 (状态码: {{ $value }})"
            runbook: "1. 检查PG容器状态: kubectl get pods -n {{ $labels.namespace }} 2. 查看PG日志: kubectl logs -n {{ $labels.namespace }} {{ $labels.pod }} 3. 检查连接串是否正确"
        # Primary/standby replication interrupted.
        # NOTE(review): pg_stat_wal_receiver_status presumably requires the
        # exporter's stat_wal_receiver collector to be enabled — confirm the
        # metric name and flag against the exporter documentation.
        - alert: PostgreSQL主备同步中断
          expr: pg_replication_lag_seconds > 300 or pg_stat_wal_receiver_status == 0
          for: 1m
          labels:
            severity: critical
            level: P0
          annotations:
            summary: "PostgreSQL主备同步中断"
            description: "PostgreSQL备库 {{ $labels.instance }} 同步延迟: {{ $value }} 秒 (超过5分钟)"
            runbook: "1. 查看备库状态: select * from pg_stat_replication; 2. 重启wal receiver: select pg_wal_receiver_restart(); 3. 检查主库wal日志"
        # Data directory disk almost full.
        # NOTE(review): postgres_exporter does not appear to export filesystem
        # usage metrics, so this expression presumably never matches; it needs
        # a disk/volume metric from another source — confirm and replace.
        - alert: PostgreSQL数据目录磁盘耗尽
          expr: postgres_disk_used_bytes{mountpoint=~"/var/lib/postgresql|/data/pgdata"} / postgres_disk_total_bytes{mountpoint=~"/var/lib/postgresql|/data/pgdata"} > 0.95
          for: 1m
          labels:
            severity: critical
            level: P0
          annotations:
            summary: "PostgreSQL数据目录磁盘耗尽"
            description: "PostgreSQL实例 {{ $labels.instance }} 数据目录 {{ $labels.mountpoint }} 使用率: {{ $value | humanizePercentage }} (超过95%)"
            runbook: "1. 登录节点清理日志: ssh {{ $labels.instance }} 2. 清理WAL日志: pg_archivecleanup /var/lib/postgresql/wal 3. 扩容磁盘"
        # Connections above 95% of max_connections. The activity metric is
        # split by database and state, so it is summed per instance and then
        # matched to the setting on the instance label only.
        - alert: PostgreSQL连接数即将耗尽
          expr: sum by (instance) (pg_stat_activity_count) / on (instance) pg_settings_max_connections > 0.95
          for: 1m
          labels:
            severity: critical
            level: P0
          annotations:
            summary: "PostgreSQL连接数即将耗尽"
            description: "PostgreSQL实例 {{ $labels.instance }} 连接数占比: {{ $value | humanizePercentage }} (超过最大连接数95%)"
            runbook: "1. 紧急终止空闲连接: select pg_terminate_backend(pid) from pg_stat_activity where state='idle' and now()-query_start > '1 hour'::interval; 2. 临时调整max_connections"
        # === P1 (warning, watch closely) ===
        # More than 50 active sessions on an instance.
        - alert: PostgreSQL活跃会话过多
          expr: sum by (instance) (pg_stat_activity_count{state="active"}) > 50
          for: 5m
          labels:
            severity: warning
            level: P1
          annotations:
            summary: "PostgreSQL活跃会话过多"
            description: "PostgreSQL实例 {{ $labels.instance }} 活跃会话数: {{ $value | humanize }}"
            runbook: "1. 查看活跃会话: select pid, query from pg_stat_activity where state='active'; 2. 终止慢会话: select pg_terminate_backend(pid);"
        # More than 100 locks on an instance (summed over databases and modes).
        - alert: PostgreSQL锁等待过多
          expr: sum by (instance) (pg_locks_count) > 100
          for: 5m
          labels:
            severity: warning
            level: P1
          annotations:
            summary: "PostgreSQL锁等待过多"
            description: "PostgreSQL实例 {{ $labels.instance }} 锁数量: {{ $value | humanize }}"
            runbook: "1. 查看锁: select * from pg_locks where granted=false; 2. 优化SQL减少锁竞争 3. 缩短事务时长"
        # Surge of slow queries.
        # NOTE(review): these metric names are presumably not exported by
        # postgres_exporter (this document's query list marks the same
        # pg_stat_statements query as unavailable); needs the stat_statements
        # collector or a custom query — confirm and replace.
        - alert: PostgreSQL慢查询数激增
          expr: rate(postgres_stat_statements_total{queryid!="0"}[5m]) > 10 and postgres_stat_statements_mean_time > 1000
          for: 5m
          labels:
            severity: warning
            level: P1
          annotations:
            summary: "PostgreSQL慢查询数激增"
            description: "PostgreSQL实例 {{ $labels.instance }} 5分钟内慢查询数: {{ $value | humanize }}"
            runbook: "1. 查看慢查询: select query, mean_time from pg_stat_statements where mean_time > 1000; 2. 执行explain分析慢SQL 3. 补充缺失索引"
        # WAL segments piling up.
        # NOTE(review): pg_wal_segments presumably counts all segments in the
        # WAL directory, not only un-archived ones — confirm the threshold.
        - alert: PostgreSQL WAL日志堆积
          expr: pg_wal_segments > 100
          for: 5m
          labels:
            severity: warning
            level: P1
          annotations:
            summary: "PostgreSQL WAL日志堆积"
            description: "PostgreSQL实例 {{ $labels.instance }} 未归档WAL文件数: {{ $value | humanize }} (超过100个)"
            runbook: "1. 查看归档状态: select * from pg_stat_archiver; 2. 检查归档脚本: cat /usr/local/bin/pg_archivewal.sh 3. 手动归档WAL文件"
        # Table bloat above 50%.
        # NOTE(review): no bloat metric is known to be exported by
        # postgres_exporter by default; this presumably needs a custom
        # query — confirm and replace.
        - alert: PostgreSQL表膨胀率过高
          expr: postgres_table_bloat_ratio > 0.5
          for: 5m
          labels:
            severity: warning
            level: P1
          annotations:
            summary: "PostgreSQL表膨胀率过高"
            description: "PostgreSQL实例 {{ $labels.instance }} 表 {{ $labels.relname }} 膨胀率: {{ $value | humanizePercentage }} (超过50%)"
            runbook: "1. 查看表膨胀: select pg_size_pretty(pg_table_size('{{ $labels.relname }}')); 2. 清理膨胀: VACUUM FULL {{ $labels.relname }}; 3. 调整autovacuum配置"
        # === P2 (informational, handle as needed) ===
        # A database larger than 100 GiB.
        - alert: PostgreSQL数据库过大
          expr: pg_database_size_bytes > 100 * 1024 * 1024 * 1024
          for: 1d
          labels:
            severity: warning
            level: P2
          annotations:
            summary: "PostgreSQL数据库过大"
            description: "数据库 {{ $labels.datname }} 大小超过100GB"
            runbook: "1. 分析表大小: select pg_size_pretty(pg_total_relation_size('table_name')); 2. 清理历史数据/分区表 3. 归档冷数据"
        # Connections more than twice the 30-minute average. A subquery is
        # used because the baseline is an aggregate, not a raw series.
        - alert: PostgreSQL连接数突增
          expr: sum by (instance) (pg_stat_activity_count) > 2 * avg_over_time((sum by (instance) (pg_stat_activity_count))[30m:1m])
          for: 5m
          labels:
            severity: warning
            level: P2
          annotations:
            summary: "PostgreSQL连接数突增"
            description: "PostgreSQL实例 {{ $labels.instance }} 连接数: {{ $value | humanize }} (超过基线2倍)"
            runbook: "1. 查看连接来源: select client_addr, count(*) from pg_stat_activity group by client_addr; 2. 检查是否有异常连接/批量任务"
        # No autovacuum on any table for more than an hour: compares now with
        # the most recent per-table autovacuum timestamp of the instance.
        # NOTE(review): only tables of the database the exporter connects to
        # are presumably covered — confirm.
        - alert: PostgreSQL autovacuum未运行
          expr: time() - max by (instance) (pg_stat_user_tables_last_autovacuum) > 3600
          for: 1h
          labels:
            severity: warning
            level: P2
          annotations:
            summary: "PostgreSQL autovacuum未运行"
            description: "PostgreSQL实例 {{ $labels.instance }} autovacuum已超过1小时未运行"
            runbook: "1. 查看autovacuum状态: select * from pg_stat_bgwriter; 2. 手动执行VACUUM: VACUUM ANALYZE; 3. 调整autovacuum参数"
        # Index usage below 50%.
        # NOTE(review): no index-usage-ratio metric is known to be exported by
        # postgres_exporter by default; this presumably needs a custom
        # query — confirm and replace.
        - alert: PostgreSQL索引使用率低
          expr: postgres_index_usage_ratio < 0.5
          for: 12h
          labels:
            severity: warning
            level: P2
          annotations:
            summary: "PostgreSQL索引使用率低"
            description: "PostgreSQL实例 {{ $labels.instance }} 索引 {{ $labels.indexname }} 使用率: {{ $value | humanizePercentage }} (低于50%)"
            runbook: "1. 分析索引使用: select indexrelname, idx_scan from pg_stat_user_indexes; 2. 删除无用索引: DROP INDEX {{ $labels.indexname }}; 3. 优化索引设计"
        # More than 1 GiB of temporary files written within 5 minutes.
        # increase() yields the growth over the window; rate() would yield
        # bytes per second and so compare 1 GiB/s against the threshold.
        - alert: PostgreSQL临时文件创建过多
          expr: sum by (instance) (increase(pg_stat_database_temp_bytes[5m])) > 1024 * 1024 * 1024
          for: 5m
          labels:
            severity: warning
            level: P2
          annotations:
            summary: "PostgreSQL临时文件创建过多"
            description: "PostgreSQL实例 {{ $labels.instance }} 5分钟内临时文件大小: {{ $value | humanize }} (超过1GB)"
            runbook: "1. 查看临时文件SQL: select query from pg_stat_activity where temp_files > 0; 2. 增大work_mem: set global work_mem=64MB; 3. 优化SQL避免大排序/聚合"
验证规则生效
1.用 kubectl 验证 PrometheusRule 资源是否被正确创建
bash
# Apply the rule file first (harmless if it has already been applied)
kubectl apply -f postgresql-rules.yaml
# Query the rule by name: unlike piping through grep, this keeps the header row
kubectl get prometheusrules postgresql-rules -n monitoring
# Expected output:
NAME AGE
postgresql-rules 2m11s
2.Prometheus UI
打开 Prometheus UI(http://<节点IP>:30883),点击菜单:
Status → Rules
你应该能看到你创建的 rule groups:
- postgresql.rules(即 PrometheusRule 资源 postgresql-rules 中定义的 group 名称)
如果这些 groups 都存在,说明 Prometheus 已经成功加载你的规则。
3. Grafana 导入告警规则面板
推荐的告警规则展示面板:
- 模板 ID:14858(k8s - alert);
导入步骤:
- Grafana → + → Import;
- 输入模板 ID(如14858);
- 选择 Prometheus 数据源;