注:除标明"以 root 身份执行"的步骤外,本方案的所有操作均在 postgres 用户下执行
前言
流程介绍
bash
┌─────────────────────────────────────────────────────────────────────────┐
│ WAL 恢复完整流程 │
├─────────────────────────────────────────────────────────────────────────┤
│ │
│ 【备份阶段】 │
│ 1. 全量备份 → 记录 LSN 范围 (0/1000000 - 0/2000000) │
│ 2. 增量备份1 → 记录 LSN 范围 (0/2000000 - 0/3000000) │
│ 3. 增量备份2 → 记录 LSN 范围 (0/3000000 - 0/4000000) │
│ 4. WAL 持续归档 → 001, 002, 003, 004, 005, 006, 007, 008, 009, 010 │
│ │
│ 【故障发生】→ 时间点:WAL 010 生成后 │
│ │
│ 【恢复阶段】 │
│ 1. 合并备份 → 全量 + 增量1+ 增量2 = 状态恢复到 0/4000000 │
│ 2. 重放 WAL 009 → 状态推进到 0/5000000 │
│ 3. 重放 WAL 010 → 状态推进到 0/6000000 (故障前最新) │
│ 4. 恢复完成 → 数据库可正常访问 │
│ │
│ 【关键点】 │
│ ✓ WAL 归档必须连续,不能有缺口 │
│ ✓ pg_archivecleanup 不能过早清理 WAL │
│ ✓ restore_command 必须正确配置归档路径 │
│ ✓ recovery.signal 必须存在才能触发恢复 │
│ │
└─────────────────────────────────────────────────────────────────────────┘
一,环境准备
1.1 系统信息确认
bash
su - postgres
# Report the server version
postgres --version
# Show where each client/backup tool used by this runbook is installed
for tool in psql pg_basebackup pg_combinebackup pg_archivecleanup; do
  which "${tool}"
done
# Ask the running server for its data directory and configuration file
for setting in data_directory config_file; do
  psql -c "SHOW ${setting};"
done
创建备份目录结构
bash
# Run once as root: create the backup tree, hand it to postgres, and make the
# top-level directory accessible to its owner only.
for sub in base incremental merged wal log scripts; do
  sudo mkdir -p "/backup/${sub}"
done
sudo chown -R postgres:postgres /backup
sudo chmod 700 /backup
1.2 配置 postgres 用户 sudo 免密
bash
# File: /etc/sudoers
# Run as root
visudo
# Append the following lines at the end of the file.
# The unit name must match the one the scripts actually call
# (systemctl ... postgresql-18); sudoers matches the full command line, so a
# rule for "postgresql" does not cover "postgresql-18".
postgres ALL=(ALL) NOPASSWD: /usr/bin/systemctl start postgresql-18
postgres ALL=(ALL) NOPASSWD: /usr/bin/systemctl stop postgresql-18
postgres ALL=(ALL) NOPASSWD: /usr/bin/systemctl restart postgresql-18
postgres ALL=(ALL) NOPASSWD: /usr/bin/systemctl reload postgresql-18
postgres ALL=(ALL) NOPASSWD: /usr/bin/systemctl status postgresql-18
postgres ALL=(ALL) NOPASSWD: /usr/bin/systemctl is-active --quiet postgresql-18
1.3 配置 PostgreSQL 环境变量
bash
# Become the postgres user
su - postgres
# Open ~/.bash_profile for editing
vim ~/.bash_profile
# Add the lines below: installation root, data directory, search paths,
# locale, and default connection parameters.
export PGHOME="/opt/pg_install"
export PGDATA="/opt/pg_install/pg_data"
export PATH="${PGHOME}/bin:${PATH}"
export MANPATH="${PGHOME}/share/man:${MANPATH}"
export LANG="zh_CN.utf8"
export LD_LIBRARY_PATH="${PGHOME}/lib:${LD_LIBRARY_PATH}"
export PGPORT="5432"
export PGUSER="postgres"
#alias psql='psql --timing'
# Save and quit the editor
# Load the new settings into the current shell
source ~/.bash_profile
二,PostgreSQL 配置
2.1 主库配置(postgresql.conf)
bash
su - postgres
vim $PGDATA/postgresql.conf
# ============ Basic settings ============
listen_addresses = '*' # Listen on every network interface so remote clients can connect.
port = 5432 # TCP port the server listens on (PostgreSQL default).
max_connections = 200 # Upper bound on concurrent client connections.
# ============ WAL settings ============
wal_level = replica # Log enough WAL for archiving and streaming replication.
summarize_wal = on # Required by pg_basebackup --incremental: the WAL summarizer records which blocks changed.
max_wal_senders = 10 # Maximum concurrent WAL sender processes (replicas and pg_basebackup streams).
max_replication_slots = 10 # Maximum number of replication slots.
wal_keep_size = 1GB # Minimum amount of past WAL kept in pg_wal for standbys.
# ============ Archiving ============
archive_mode = on # Archive completed WAL segments; prerequisite for PITR.
# %p = path of the segment to archive, %f = its file name. The "test ! -f"
# guard refuses to overwrite a segment that is already in the archive.
archive_command = 'test ! -f /backup/wal/%f && cp %p /backup/wal/%f'
archive_timeout = 300 # Force a segment switch at least every 300 s so the archive never lags far behind.
# ============ Checkpoints ============
checkpoint_timeout = 15min # Maximum time between automatic checkpoints.
checkpoint_completion_target = 0.9 # Spread checkpoint writes over 90% of the interval to smooth I/O.
max_wal_size = 2GB # WAL volume that triggers a checkpoint (soft limit).
min_wal_size = 512MB # WAL kept for recycling instead of being removed.
# ============ Logging ============
logging_collector = on # Capture server output into log files via the collector process.
log_directory = 'log' # Log directory, relative to the data directory.
log_filename = 'postgresql-%Y-%m-%d_%H%M%S.log' # Timestamped log file names.
log_rotation_age = 1d # Start a new log file every day.
log_rotation_size = 100MB # Start a new log file once the current one reaches 100MB.
2.2 复制用户配置(pg_hba.conf)
bash
su - postgres
vim $PGDATA/pg_hba.conf
# Rules are evaluated top to bottom; the first matching line wins.
# ============ Local connections ============
local all all trust
host all all 127.0.0.1/32 trust
host all all ::1/128 trust
# ============ Replication connections ============
local replication all trust
host replication all 127.0.0.1/32 trust
# The backup scripts connect to "localhost", which may resolve to ::1.
host replication all ::1/128 trust
# Never use "trust" for remote replication: it would let any host stream a
# full copy of the cluster without credentials. Narrow 0.0.0.0/0 to the
# replica/backup subnet where possible.
host replication all 0.0.0.0/0 scram-sha-256
2.3 创建备份专用用户
bash
su - postgres
# Dedicated role for backups. Replace 'backup_pwd' with a real secret before
# running this (the statement ends up in the shell history).
psql -d postgres -c "CREATE USER backup_user WITH REPLICATION LOGIN PASSWORD 'backup_pwd';"
psql -d postgres -c "GRANT pg_read_all_settings TO backup_user;"
# The backup scripts call pg_switch_wal() as backup_user; by default only
# superusers may execute it, so the privilege is granted explicitly.
psql -d postgres -c "GRANT EXECUTE ON FUNCTION pg_switch_wal() TO backup_user;"
2.4 重启 PostgreSQL
bash
# Restart PostgreSQL after the configuration changes made in the previous
# steps. The service is addressed by its systemd unit name, postgresql-18.
su - postgres
sudo systemctl restart postgresql-18
三,完整脚本集合(postgres 用户执行)
3.1 全量备份脚本 /backup/scripts/backup_full.sh
bash
#!/bin/bash
#===============================================================================
# Script : backup_full.sh
# Purpose: Full (baseline) backup of a PostgreSQL 18 cluster via pg_basebackup,
#          followed by verification and pruning of expired full backups.
# Run as : postgres
# Usage  : ./backup_full.sh
#===============================================================================
# pipefail matters: the backup and verify commands are piped into tee, and
# without it their failures would be hidden behind tee's exit status.
set -euo pipefail

# ============ Configuration ============
PG_BIN="/opt/pg_install/bin"
BACKUP_BASE="/backup/base"
BACKUP_LOG="/backup/log"
PG_HOST="localhost"
PG_PORT="5432"
PG_USER="backup_user"
# psql defaults the database name to the user name, so the database is named
# explicitly instead of relying on a "backup_user" database existing.
PG_DB="postgres"
DATE=$(date +%Y%m%d_%H%M%S)
RETENTION_DAYS=28
BACKUP_DIR="${BACKUP_BASE}/FULL_${DATE}"
BACKUP_OK=0

# ============ Helpers ============
# Print a timestamped message and append it to the full-backup log.
log() {
  echo "[$(date '+%Y-%m-%d %H:%M:%S')] $1" | tee -a "${BACKUP_LOG}/backup_full.log"
}

# EXIT handler: if the backup did not complete, remove the partial directory so
# it can never be selected as the "latest" full backup, and record the failure.
on_exit() {
  if (( BACKUP_OK == 0 )) && [[ -d "${BACKUP_DIR}" ]]; then
    log "错误:全量备份失败,删除不完整的备份目录:${BACKUP_DIR}"
    rm -rf -- "${BACKUP_DIR:?}"
    echo "${DATE}|${BACKUP_DIR}|FAILED" >> "${BACKUP_LOG}/backup_history.csv"
  fi
}

# ============ Main flow ============
mkdir -p "${BACKUP_BASE}" "${BACKUP_LOG}"
log "========== 开始全量备份 =========="
log "备份时间:${DATE}"

# 1. Warn when the backup volume is more than 80% full.
#    df -P guarantees one output line per filesystem.
DISK_USAGE=$(df -P "${BACKUP_BASE}" | awk 'NR==2 {gsub(/%/, "", $5); print $5}')
if [[ "${DISK_USAGE}" =~ ^[0-9]+$ ]] && (( DISK_USAGE > 80 )); then
  log "警告:备份目录磁盘使用率 ${DISK_USAGE}%,超过 80%"
fi

# 2. Create the target directory (pg_basebackup accepts an existing empty one).
mkdir -p "${BACKUP_DIR}"
trap on_exit EXIT
log "备份目录:${BACKUP_DIR}"

# 3. Take the base backup: plain format, WAL streamed alongside, progress shown.
#    -R is deliberately not used: it would write standby.signal into the
#    backup, and a cluster restored from it would start as a standby.
log "执行 pg_basebackup..."
"${PG_BIN}/pg_basebackup" \
  -h "${PG_HOST}" \
  -p "${PG_PORT}" \
  -U "${PG_USER}" \
  -D "${BACKUP_DIR}" \
  -Fp \
  -Xs \
  -P \
  --checkpoint=fast \
  2>&1 | tee -a "${BACKUP_LOG}/basebackup.log"

# 4. Verify the backup against its manifest.
log "验证备份完整性..."
"${PG_BIN}/pg_verifybackup" "${BACKUP_DIR}" 2>&1 | tee -a "${BACKUP_LOG}/verify.log"
BACKUP_OK=1

# 5. Force a WAL segment switch. The backup is already complete and verified
#    at this point, so a failure here is reported but not fatal.
log "触发 WAL 切换..."
if ! "${PG_BIN}/psql" -X -h "${PG_HOST}" -p "${PG_PORT}" -U "${PG_USER}" -d "${PG_DB}" \
  -c "SELECT pg_switch_wal();"; then
  log "警告:pg_switch_wal() 执行失败,请确认 ${PG_USER} 拥有该函数的 EXECUTE 权限"
fi

# 6. Prune expired full backups. Only direct children of BACKUP_BASE are
#    considered, and the list is taken before deleting so the count is real.
log "清理${RETENTION_DAYS}天前的全量备份..."
mapfile -t OLD_BACKUPS < <(
  find "${BACKUP_BASE}" -mindepth 1 -maxdepth 1 -type d -name "FULL_*" \
    -mtime "+${RETENTION_DAYS}"
)
OLD_COUNT=${#OLD_BACKUPS[@]}
if (( OLD_COUNT > 0 )); then
  rm -rf -- "${OLD_BACKUPS[@]}"
fi
log "清理完成,删除 ${OLD_COUNT} 个旧备份"

# 7. Record the result.
echo "${DATE}|${BACKUP_DIR}|SUCCESS" >> "${BACKUP_LOG}/backup_history.csv"
log "========== 全量备份完成 =========="
log "备份大小:$(du -sh "${BACKUP_DIR}" | cut -f1)"
3.2 增量备份脚本 /backup/scripts/backup_incremental.sh
bash
#!/bin/bash
#===============================================================================
# Script : backup_incremental.sh
# Purpose: Block-level incremental backup of a PostgreSQL 18 cluster
# Run as : postgres
# Usage  : ./backup_incremental.sh
#===============================================================================
set -e

# ============ Configuration ============
# Binaries and backup locations
PG_BIN="/opt/pg_install/bin"
BACKUP_BASE="/backup/base"
BACKUP_INCR="/backup/incremental"
BACKUP_WAL="/backup/wal"
BACKUP_LOG="/backup/log"
# Connection parameters
PG_HOST="localhost"
PG_PORT="5432"
PG_USER="backup_user"
# Run timestamp (used in directory names) and retention in days
DATE="$(date +%Y%m%d_%H%M%S)"
RETENTION_DAYS=7

# Print a timestamped message and append it to the incremental-backup log.
log() {
  local stamp
  stamp="$(date '+%Y-%m-%d %H:%M:%S')"
  printf '%s\n' "[${stamp}] $1" | tee -a "${BACKUP_LOG}/backup_incremental.log"
}
# ============ Locate the manifest to build on ============
#######################################
# Print the path of the backup_manifest the next incremental backup should be
# based on, or an empty line when no usable backup exists.
#
# The newest full backup that has a manifest starts the current chain. The
# newest incremental is used only if it was taken AFTER that full backup;
# otherwise the next incremental would extend a chain that belongs to an
# older full backup.
#
# Backups are ordered by the YYYYMMDD_HHMMSS stamp in their directory names
# (glob results are name-sorted), not by directory mtime.
# Globals:   BACKUP_BASE, BACKUP_INCR (read)
# Outputs:   manifest path (or empty string) on stdout
#######################################
get_latest_manifest() {
  local dir stamp
  local full_dir="" full_stamp="" incr_dir=""

  for dir in "${BACKUP_BASE}"/FULL_*; do
    # Also skips the literal pattern left behind when nothing matches.
    if [[ -f "${dir}/backup_manifest" ]]; then
      full_dir="${dir}"
    fi
  done

  if [[ -n "${full_dir}" ]]; then
    full_stamp="${full_dir##*/FULL_}"
    for dir in "${BACKUP_INCR}"/INC_*; do
      if [[ -f "${dir}/backup_manifest" ]]; then
        stamp="${dir##*/INC_}"
        if [[ "${stamp}" > "${full_stamp}" ]]; then
          incr_dir="${dir}"
        fi
      fi
    done
  fi

  if [[ -n "${incr_dir}" ]]; then
    echo "${incr_dir}/backup_manifest"
  elif [[ -n "${full_dir}" ]]; then
    echo "${full_dir}/backup_manifest"
  else
    echo ""
  fi
}
# ============ Main flow ============
# The backup and verify commands below are piped into tee; pipefail makes their
# failures abort the script instead of being hidden by tee's exit status.
set -o pipefail
mkdir -p "${BACKUP_INCR}" "${BACKUP_LOG}"
log "========== 开始增量备份 =========="
log "备份时间:${DATE}"

# 1. Find the manifest of the backup this incremental builds on.
MANIFEST=$(get_latest_manifest)
if [ -z "$MANIFEST" ]; then
  log "错误:找不到有效的 manifest 文件,请先执行全量备份"
  exit 1
fi
log "使用 manifest: ${MANIFEST}"

# 2. Create the target directory. If anything fails from here on, the partial
#    directory is removed so it can never become part of a backup chain.
BACKUP_DIR="${BACKUP_INCR}/INC_${DATE}"
INCR_OK=0
incr_on_exit() {
  if [ "${INCR_OK}" -eq 0 ] && [ -d "${BACKUP_DIR}" ]; then
    log "错误:增量备份失败,删除不完整的备份目录:${BACKUP_DIR}"
    rm -rf -- "${BACKUP_DIR:?}"
    echo "${DATE}|${BACKUP_DIR}|FAILED|${MANIFEST}" >> "${BACKUP_LOG}/backup_history.csv"
  fi
}
mkdir -p "${BACKUP_DIR}"
trap incr_on_exit EXIT

# 3. Take the incremental backup relative to that manifest.
log "执行增量备份..."
"${PG_BIN}/pg_basebackup" \
  -h "${PG_HOST}" \
  -p "${PG_PORT}" \
  -U "${PG_USER}" \
  -D "${BACKUP_DIR}" \
  -Fp \
  -Xs \
  -P \
  --incremental="${MANIFEST}" \
  --checkpoint=fast \
  2>&1 | tee -a "${BACKUP_LOG}/incremental.log"

# 4. Verify the backup against its manifest.
log "验证备份..."
"${PG_BIN}/pg_verifybackup" "${BACKUP_DIR}" 2>&1 | tee -a "${BACKUP_LOG}/verify_incr.log"
INCR_OK=1

# 5. Force a WAL segment switch. psql defaults the database to the user name,
#    so the database is given explicitly. The backup is already complete, so a
#    failure here is reported but not fatal.
log "触发 WAL 切换..."
if ! "${PG_BIN}/psql" -X -h "${PG_HOST}" -p "${PG_PORT}" -U "${PG_USER}" -d postgres \
  -c "SELECT pg_switch_wal();"; then
  log "警告:pg_switch_wal() 执行失败,请确认 ${PG_USER} 拥有该函数的 EXECUTE 权限"
fi

# 6. Prune expired incremental backups. -maxdepth 1 keeps find from descending
#    into directories it has just deleted.
log "清理${RETENTION_DAYS}天前的增量备份..."
find "${BACKUP_INCR}" -mindepth 1 -maxdepth 1 -type d -name "INC_*" \
  -mtime "+${RETENTION_DAYS}" -exec rm -rf -- {} +

# 7. Record the result.
echo "${DATE}|${BACKUP_DIR}|SUCCESS|${MANIFEST}" >> "${BACKUP_LOG}/backup_history.csv"
log "========== 增量备份完成 =========="
log "备份大小:$(du -sh "${BACKUP_DIR}" | cut -f1)"
3.3 备份合并脚本 /backup/scripts/merge_backup.sh
bash
#!/bin/bash
#===============================================================================
# Script : merge_backup.sh
# Purpose: Combine the newest full backup with the incremental backups taken
#          after it into one synthetic full backup (pg_combinebackup).
# Run as : postgres
# Usage  : ./merge_backup.sh
#===============================================================================
# pipefail: pg_combinebackup / pg_verifybackup are piped into tee and must
# still be able to fail the script.
set -euo pipefail

# ============ Configuration ============
PG_BIN="/opt/pg_install/bin"
BACKUP_BASE="/backup/base"
BACKUP_INCR="/backup/incremental"
BACKUP_MERGED="/backup/merged"
BACKUP_LOG="/backup/log"
DATE=$(date +%Y%m%d_%H%M%S)
KEEP_MERGED=2
MERGE_OUTPUT="${BACKUP_MERGED}/MERGED_${DATE}"
MERGE_OK=0

# Print a timestamped message and append it to the merge log.
log() {
  echo "[$(date '+%Y-%m-%d %H:%M:%S')] $1" | tee -a "${BACKUP_LOG}/merge_backup.log"
}

# EXIT handler: a failed merge must not leave a directory that restore.sh
# could pick up as the newest merged backup.
on_exit() {
  if (( MERGE_OK == 0 )) && [[ -d "${MERGE_OUTPUT}" ]]; then
    log "错误:备份合并失败,删除不完整的合并目录:${MERGE_OUTPUT}"
    rm -rf -- "${MERGE_OUTPUT:?}"
    echo "${DATE}|${MERGE_OUTPUT}|FAILED" >> "${BACKUP_LOG}/merge_history.csv"
  fi
}

mkdir -p "${BACKUP_MERGED}" "${BACKUP_LOG}"
log "========== 开始备份合并 =========="

# 1. Newest full backup that has a manifest. Directory names embed
#    YYYYMMDD_HHMMSS and globs are name-sorted, so the last match is the newest.
LATEST_FULL=""
for dir in "${BACKUP_BASE}"/FULL_*; do
  if [[ -f "${dir}/backup_manifest" ]]; then
    LATEST_FULL="${dir}"
  fi
done
if [ -z "$LATEST_FULL" ]; then
  log "错误:找不到全量备份"
  exit 1
fi
log "全量备份:${LATEST_FULL}"

# 2. Incremental backups in chronological order. Only those taken after the
#    selected full backup belong to its chain; older ones would make
#    pg_combinebackup reject the chain.
FULL_STAMP="${LATEST_FULL##*/FULL_}"
INCREMENTALS=()
for dir in "${BACKUP_INCR}"/INC_*; do
  if [[ -f "${dir}/backup_manifest" && "${dir##*/INC_}" > "${FULL_STAMP}" ]]; then
    INCREMENTALS+=("${dir}")
  fi
done
INCR_COUNT=${#INCREMENTALS[@]}
if (( INCR_COUNT > 0 )); then
  log "增量备份数量:${INCR_COUNT}"
  for inc in "${INCREMENTALS[@]}"; do
    log " - ${inc}"
  done
else
  log "无增量备份"
fi

# 3. Create the (empty) output directory.
mkdir -p "${MERGE_OUTPUT}"
trap on_exit EXIT

# 4. Combine: full backup first, then the incrementals oldest to newest.
log "执行 pg_combinebackup..."
if (( INCR_COUNT > 0 )); then
  "${PG_BIN}/pg_combinebackup" \
    -o "${MERGE_OUTPUT}" \
    "${LATEST_FULL}" \
    "${INCREMENTALS[@]}" \
    2>&1 | tee -a "${BACKUP_LOG}/combine.log"
else
  log "无增量备份,直接复制全量备份..."
  # "dir/." with -a copies hidden files too and preserves modes and timestamps.
  cp -a "${LATEST_FULL}/." "${MERGE_OUTPUT}/"
fi

# 5. Verify the merged result against its manifest.
log "验证合并备份..."
"${PG_BIN}/pg_verifybackup" "${MERGE_OUTPUT}" 2>&1 | tee -a "${BACKUP_LOG}/verify_merge.log"
MERGE_OK=1

# 6. Keep only the newest KEEP_MERGED merged backups.
#    The FULL_*/INC_* source backups are intentionally NOT removed here: the
#    next incremental backup needs their manifests, and their retention is
#    handled by the backup scripts and the cron cleanup jobs.
log "清理旧合并备份..."
MERGED_DIRS=()
for dir in "${BACKUP_MERGED}"/MERGED_*; do
  if [[ -d "${dir}" ]]; then
    MERGED_DIRS+=("${dir}")
  fi
done
EXCESS=$(( ${#MERGED_DIRS[@]} - KEEP_MERGED ))
if (( EXCESS > 0 )); then
  rm -rf -- "${MERGED_DIRS[@]:0:EXCESS}"
fi

# 7. Record the result.
echo "${DATE}|${MERGE_OUTPUT}|SUCCESS" >> "${BACKUP_LOG}/merge_history.csv"
log "========== 备份合并完成 =========="
log "合并备份:${MERGE_OUTPUT}"
log "合并备份大小:$(du -sh "${MERGE_OUTPUT}" | cut -f1)"
3.4 WAL 清理脚本 /backup/scripts/cleanup_wal.sh
bash
#!/bin/bash
#===============================================================================
# Script : cleanup_wal.sh
# Purpose: Remove archived WAL that no retained backup (and no active
#          replication slot) still needs.
# Run as : postgres
# Usage  : ./cleanup_wal.sh
#===============================================================================
set -euo pipefail

# ============ Configuration ============
PG_BIN="/opt/pg_install/bin"
BACKUP_WAL="/backup/wal"
BACKUP_LOG="/backup/log"
# Every directory below these roots that contains a backup_label is treated
# as a backup that must stay restorable.
BACKUP_ROOTS=("/backup/base" "/backup/incremental" "/backup/merged")
PG_HOST="localhost"
PG_PORT="5432"
PG_USER="backup_user"
# psql defaults the database to the user name; name it explicitly.
PG_DB="postgres"

# Print a timestamped message and append it to the cleanup log.
log() {
  echo "[$(date '+%Y-%m-%d %H:%M:%S')] $1" | tee -a "${BACKUP_LOG}/cleanup_wal.log"
}

# Print the WAL segment named on the "START WAL LOCATION: ... (file NAME)"
# line of <backup dir>/backup_label; print nothing when there is no label.
# Arguments: $1 - backup directory
start_wal_of_backup() {
  local label="$1/backup_label"
  if [[ -f "${label}" ]]; then
    sed -n 's/^START WAL LOCATION: .*(file \([0-9A-F]\{24\}\)).*$/\1/p' "${label}" | head -n 1
  fi
}

mkdir -p "${BACKUP_LOG}"
log "========== 开始 WAL 清理 =========="

# 1. Collect the first WAL segment each retained backup needs for recovery.
#    Deleting anything newer than the oldest of these would break PITR.
CANDIDATES=()
for root in "${BACKUP_ROOTS[@]}"; do
  for dir in "${root}"/*; do
    if [[ -d "${dir}" ]]; then
      wal=$(start_wal_of_backup "${dir}")
      if [[ -n "${wal}" ]]; then
        CANDIDATES+=("${wal}")
      fi
    fi
  done
done

if (( ${#CANDIDATES[@]} == 0 )); then
  # Without a backup to anchor on there is no safe cut-off point.
  log "未找到任何带 backup_label 的备份,跳过 WAL 清理"
else
  # 2. Also respect the oldest WAL an active replication slot still needs.
  #    Best effort: the archive can be pruned even if the server is down.
  SLOT_WAL=$("${PG_BIN}/psql" -h "${PG_HOST}" -p "${PG_PORT}" -U "${PG_USER}" -d "${PG_DB}" -t -A -X -c \
    "SELECT pg_walfile_name(restart_lsn) FROM pg_replication_slots WHERE active=true AND restart_lsn IS NOT NULL ORDER BY restart_lsn LIMIT 1;" \
    2>/dev/null | tr -d ' ') || SLOT_WAL=""
  if [[ "${SLOT_WAL}" =~ ^[0-9A-F]{24}$ ]]; then
    log "检测到复制槽,最旧保留 WAL: ${SLOT_WAL}"
    CANDIDATES+=("${SLOT_WAL}")
  fi

  # 3. Oldest segment to keep. pg_archivecleanup ignores the timeline part of
  #    the name (first 8 characters) when comparing, so sort the same way.
  OLDEST_WAL=$(printf '%s\n' "${CANDIDATES[@]}" | LC_ALL=C sort -k1.9,1.24 | head -n 1)
  log "最旧需要保留的 WAL: ${OLDEST_WAL}"

  # 4. Remove everything that precedes it; -b also removes obsolete
  #    *.backup history files, -d prints what is being removed.
  log "清理 .backup 文件..."
  "${PG_BIN}/pg_archivecleanup" -d -b "${BACKUP_WAL}" "${OLDEST_WAL}" \
    2>&1 | tee -a "${BACKUP_LOG}/archive_cleanup.log"
fi

# 5. Report archive size and volume usage (df -P: one line per filesystem).
DISK_USAGE=$(df -P "${BACKUP_WAL}" | awk 'NR==2 {print $5}')
WAL_SIZE=$(du -sh "${BACKUP_WAL}" | cut -f1)
log "归档目录大小:${WAL_SIZE}, 磁盘使用率:${DISK_USAGE}"

# 6. Alert when the volume is more than 80% full.
USAGE_NUM=${DISK_USAGE%\%}
if [[ "${USAGE_NUM}" =~ ^[0-9]+$ ]] && (( USAGE_NUM > 80 )); then
  log "警告:WAL 归档目录磁盘使用率超过 80%!"
fi
log "========== WAL 清理完成 =========="
3.5 恢复脚本 /backup/scripts/restore.sh
bash
#!/bin/bash
#===============================================================================
# Script : restore.sh
# Purpose: Restore a PostgreSQL 18 cluster from the newest merged backup and
#          replay archived WAL, optionally up to a point in time (PITR).
# Run as : postgres (needs passwordless sudo for systemctl start/stop)
# Usage  : ./restore.sh [recovery target time]
#          ./restore.sh                        # recover to the latest state
#          ./restore.sh "2026-02-19 12:00:00"  # recover to the given time
# Env    : MAX_WAIT - seconds to wait for recovery to finish (default 300)
#===============================================================================
set -euo pipefail

# ============ Configuration ============
PG_BIN="/opt/pg_install/bin"
PG_DATA="/opt/pg_install/pg_data"
PG_USER="postgres"
PG_DB="postgres"
PG_SERVICE="postgresql-18"
BACKUP_MERGED="/backup/merged"
BACKUP_WAL="/backup/wal"
BACKUP_LOG="/backup/log"
RECOVERY_TIME="${1:-}"
MAX_WAIT="${MAX_WAIT:-300}"

# Print a timestamped message and append it to the restore log.
log() {
  echo "[$(date '+%Y-%m-%d %H:%M:%S')] $1" | tee -a "${BACKUP_LOG}/restore.log"
}

mkdir -p "${BACKUP_LOG}"

# The target time is written verbatim into a configuration file, so reject
# anything that is not a plain timestamp (optional fraction / UTC offset).
TIME_RE='^[0-9]{4}-[0-9]{2}-[0-9]{2} [0-9]{2}:[0-9]{2}:[0-9]{2}(\.[0-9]+)?([+-][0-9]{2}(:?[0-9]{2})?)?$'
if [[ -n "${RECOVERY_TIME}" && ! "${RECOVERY_TIME}" =~ ${TIME_RE} ]]; then
  log "错误:恢复时间格式无效,应为 YYYY-MM-DD HH:MM:SS:${RECOVERY_TIME}"
  exit 1
fi

# ============ Pick the newest merged backup ============
# Names embed YYYYMMDD_HHMMSS and globs are name-sorted: last match = newest.
LATEST_MERGED=""
for dir in "${BACKUP_MERGED}"/MERGED_*; do
  if [[ -d "${dir}" ]]; then
    LATEST_MERGED="${dir}"
  fi
done
if [ -z "$LATEST_MERGED" ]; then
  log "错误:找不到合并备份"
  exit 1
fi
log "========== 开始恢复 =========="
log "使用备份:${LATEST_MERGED}"
if [ -n "$RECOVERY_TIME" ]; then
  log "恢复目标时间:${RECOVERY_TIME}"
fi

# ============ Step 1: stop PostgreSQL ============
log "停止 PostgreSQL 服务..."
sudo systemctl stop "${PG_SERVICE}"
sleep 3
# is-active is a read-only query and needs no sudo.
if systemctl is-active --quiet "${PG_SERVICE}"; then
  log "错误:PostgreSQL 服务未能停止"
  exit 1
fi
log "PostgreSQL 服务已停止"

# ============ Step 2: set the current data directory aside ============
log "备份当前数据目录..."
if [ -d "${PG_DATA}" ]; then
  FAILED_DIR="${PG_DATA}.failed.$(date +%Y%m%d_%H%M%S)"
  mv -- "${PG_DATA}" "${FAILED_DIR}"
  log "原数据目录已备份:${FAILED_DIR}"
fi

# ============ Step 3: copy the merged backup into place ============
log "复制备份到数据目录..."
cp -a -- "${LATEST_MERGED}" "${PG_DATA}"
chown -R postgres:postgres "${PG_DATA}"
chmod 700 "${PG_DATA}"
log "备份复制完成"

# ============ Step 4: configure recovery ============
log "配置恢复参数..."
# A standby.signal inherited from the backup would take precedence over
# recovery.signal and keep the server in standby mode indefinitely.
rm -f -- "${PG_DATA}/standby.signal"
touch "${PG_DATA}/recovery.signal"
# Keep settings made with ALTER SYSTEM, but drop stale recovery/replication
# parameters before appending the ones for this restore.
AUTO_CONF="${PG_DATA}/postgresql.auto.conf"
if [ -f "${AUTO_CONF}" ]; then
  # grep exits 1 when every line is filtered out; that is not an error here.
  grep -Ev '^[[:space:]]*(restore_command|recovery_target[a-z_]*|primary_conninfo|primary_slot_name)[[:space:]]*=' \
    "${AUTO_CONF}" > "${AUTO_CONF}.tmp" || true
  mv -- "${AUTO_CONF}.tmp" "${AUTO_CONF}"
fi
cat >> "${AUTO_CONF}" << EOF
restore_command = 'cp ${BACKUP_WAL}/%f %p'
recovery_target_timeline = 'latest'
EOF
if [ -n "$RECOVERY_TIME" ]; then
  cat >> "${AUTO_CONF}" << EOF
recovery_target_time = '${RECOVERY_TIME}'
recovery_target_action = 'promote'
EOF
  log "已配置 PITR 恢复时间:${RECOVERY_TIME}"
fi

# ============ Step 5: start PostgreSQL ============
log "启动 PostgreSQL 进行恢复..."
sudo systemctl start "${PG_SERVICE}"
sleep 3
if ! systemctl is-active --quiet "${PG_SERVICE}"; then
  log "错误:PostgreSQL 服务启动失败"
  exit 1
fi
log "PostgreSQL 服务已启动"

# ============ Step 6: wait for recovery to finish ============
log "等待恢复完成..."
WAITED=0
RECOVERED=0
while :; do
  # psql fails while the server is still starting up; treat that as "unknown".
  RECOVERY_STATUS=$("${PG_BIN}/psql" -U "${PG_USER}" -d "${PG_DB}" -t -A -X \
    -c "SELECT pg_is_in_recovery();" 2>/dev/null | tr -d ' ') || RECOVERY_STATUS=""
  if [ "$RECOVERY_STATUS" = "f" ]; then
    log "恢复完成!"
    RECOVERED=1
    break
  fi
  if (( WAITED >= MAX_WAIT )); then
    break
  fi
  sleep 5
  WAITED=$((WAITED + 5))
  log "恢复中... (${WAITED}s) - 状态:${RECOVERY_STATUS:-连接中}"
done
if (( RECOVERED == 0 )); then
  # Recovery may still be running: leave recovery.signal in place and do not
  # report success.
  log "错误:恢复超时,请检查日志:${PG_DATA}/log/"
  echo "$(date +%Y%m%d_%H%M%S)|${LATEST_MERGED}|${RECOVERY_TIME:-LATEST}|TIMEOUT" >> "${BACKUP_LOG}/restore_history.csv"
  exit 1
fi

# ============ Step 7: verify ============
log "验证恢复状态..."
"${PG_BIN}/psql" -X -U "${PG_USER}" -d "${PG_DB}" \
  -c "SELECT pg_is_in_recovery() AS 恢复中,pg_current_wal_lsn() AS 当前LSN;"
log "数据库列表:"
"${PG_BIN}/psql" -X -U "${PG_USER}" -d "${PG_DB}" -c "\\l"

# ============ Step 8: remove a leftover recovery signal ============
# Only reached once the server reports that recovery has ended.
if [ -f "${PG_DATA}/recovery.signal" ]; then
  log "清理恢复信号文件..."
  rm -f -- "${PG_DATA}/recovery.signal"
fi

# ============ Record the result ============
echo "$(date +%Y%m%d_%H%M%S)|${LATEST_MERGED}|${RECOVERY_TIME:-LATEST}|SUCCESS" >> "${BACKUP_LOG}/restore_history.csv"
log "========== 恢复完成 =========="
3.6 监控脚本 /backup/scripts/monitor_backup.sh
bash
#!/bin/bash
#===============================================================================
# Script : monitor_backup.sh
# Purpose: Health check of the backup system (server, archiver, backup
#          directories, disk space, replication slots).
# Run as : postgres
# Usage  : ./monitor_backup.sh
#===============================================================================
set -euo pipefail

# ============ Configuration ============
PG_BIN="/opt/pg_install/bin"
BACKUP_BASE="/backup/base"
BACKUP_INCR="/backup/incremental"
BACKUP_MERGED="/backup/merged"
BACKUP_WAL="/backup/wal"
PG_HOST="localhost"
PG_PORT="5432"
PG_USER="backup_user"
# psql defaults the database to the user name; without an explicit database
# every check below would fail on a perfectly healthy server.
PG_DB="postgres"

# Print a timestamped message on stdout.
log() {
  echo "[$(date '+%Y-%m-%d %H:%M:%S')] $1"
}

# Run one SQL statement as the monitoring user; errors are suppressed so the
# caller can print its own fallback message.
# Arguments: $1 - SQL text
run_sql() {
  "${PG_BIN}/psql" -X -h "${PG_HOST}" -p "${PG_PORT}" -U "${PG_USER}" -d "${PG_DB}" \
    -c "$1" 2>/dev/null
}

# Print how many entries match the given path prefix followed by '*'.
# Arguments: $1 - path prefix, e.g. /backup/base/FULL_
count_matches() {
  local entry total=0
  for entry in "$1"*; do
    if [[ -e "${entry}" ]]; then
      total=$((total + 1))
    fi
  done
  echo "${total}"
}

# Print the name-sorted last (= newest, names embed a timestamp) match of the
# given path prefix, or nothing when there is none.
# Arguments: $1 - path prefix
newest_match() {
  local entry newest=""
  for entry in "$1"*; do
    if [[ -e "${entry}" ]]; then
      newest="${entry}"
    fi
  done
  echo "${newest}"
}

echo "=============================================="
echo " PostgreSQL 备份系统健康检查"
echo "=============================================="

# 1. Server reachability
log "1. PostgreSQL 服务状态"
if run_sql "SELECT 1;" >/dev/null; then
  log " ✓ PostgreSQL 运行正常"
else
  log " ✗ PostgreSQL 连接失败"
fi

# 2. Archiver statistics
log "2. WAL 归档状态"
run_sql "SELECT archived_count, failed_count, last_archived_time FROM pg_stat_archiver;" || \
  log " 无法查询归档状态"

# 3. Backup inventory
log "3. 备份目录状态"
log " 全量备份:$(count_matches "${BACKUP_BASE}/FULL_") 个"
log " 增量备份:$(count_matches "${BACKUP_INCR}/INC_") 个"
log " 合并备份:$(count_matches "${BACKUP_MERGED}/MERGED_") 个"
log " WAL 文件:$(count_matches "${BACKUP_WAL}/") 个"

# 4. Disk space
log "4. 磁盘空间"
df -h /backup | tail -1 | awk '{print " 使用率:", $5, "可用:", $4}'

# 5. Newest backup of each kind
log "5. 最新备份时间"
LATEST_FULL=$(newest_match "${BACKUP_BASE}/FULL_")
LATEST_INCR=$(newest_match "${BACKUP_INCR}/INC_")
LATEST_MERGED=$(newest_match "${BACKUP_MERGED}/MERGED_")
if [ -n "$LATEST_FULL" ]; then
  log " 全量:$(basename "$LATEST_FULL")"
fi
if [ -n "$LATEST_INCR" ]; then
  log " 增量:$(basename "$LATEST_INCR")"
fi
if [ -n "$LATEST_MERGED" ]; then
  log " 合并:$(basename "$LATEST_MERGED")"
fi

# 6. Replication slots
log "6. 复制槽状态"
run_sql "SELECT slot_name, active, restart_lsn FROM pg_replication_slots;" || \
  log " 无复制槽"
echo "=============================================="
log "健康检查完成"
四,定时任务配置
4.1 创建 crontab(postgres 用户)
bash
su - root
# /etc/cron.allow restricts cron to the users listed in it. Only add postgres
# when the file already exists (creating it would lock every other user out of
# cron) and the entry is not there yet.
if [ -f /etc/cron.allow ] && ! grep -qx 'postgres' /etc/cron.allow; then
  echo "postgres" >> /etc/cron.allow
fi
su - postgres
crontab -e
#
# ============ PostgreSQL backup jobs ============
# Sunday 02:00 - full backup
0 2 * * 0 /backup/scripts/backup_full.sh
# Monday to Saturday 02:00 - incremental backup
0 2 * * 1-6 /backup/scripts/backup_incremental.sh
# Daily 03:00 - merge full + incrementals
0 3 * * * /backup/scripts/merge_backup.sh
# Hourly - prune the WAL archive
0 * * * * /backup/scripts/cleanup_wal.sh
# Daily 08:00 - health report
0 8 * * * /backup/scripts/monitor_backup.sh >> /backup/log/daily_report.log 2>&1
# ============ Cleanup jobs ============
# Sunday 04:00 - remove expired backups. -mindepth/-maxdepth 1 limits the
# match to the backup directories themselves and keeps find from descending
# into a directory it has just removed.
0 4 * * 0 find /backup/base -mindepth 1 -maxdepth 1 -type d -name "FULL_*" -mtime +28 -exec rm -rf {} +
0 4 * * 0 find /backup/incremental -mindepth 1 -maxdepth 1 -type d -name "INC_*" -mtime +7 -exec rm -rf {} +
0 4 * * 0 find /backup/merged -mindepth 1 -maxdepth 1 -type d -name "MERGED_*" -mtime +14 -exec rm -rf {} +
# ============ Log rotation ============
# 1st of each month 05:00 - compress logs older than 30 days
0 5 1 * * find /backup/log -name "*.log" -mtime +30 -exec gzip {} \;
五,恢复演练流程
5.1 定期演练计划
| 演练类型 | 频率 | 目标 | 负责人 |
|---|---|---|---|
| 完整恢复演练 | 每月 | 验证备份可用性 | DBA |
| PITR 恢复演练 | 每季度 | 验证时间点恢复 | DBA |
| 灾难恢复演练 | 每半年 | 全流程验证 | 运维团队 |
表:定期演练计划
5.2 恢复演练步骤
bash
# ============ Before the drill ============
# 1. Record the current database state
su - postgres
psql -c "SELECT count(*) FROM pg_database;" > /tmp/pre_restore.txt
# 2. Record the start time (human-readable, plus epoch seconds for the RTO)
date > /tmp/restore_start_time.txt
date +%s > /tmp/restore_start_epoch.txt
# ============ Run the restore ============
# 3. Run the restore script
/backup/scripts/restore.sh
# ============ After the drill ============
# 4. Capture the database state again
psql -c "SELECT count(*) FROM pg_database;" > /tmp/post_restore.txt
# 5. Compare before and after
diff /tmp/pre_restore.txt /tmp/post_restore.txt
# 6. Record the end time
date > /tmp/restore_end_time.txt
date +%s > /tmp/restore_end_epoch.txt
# 7. Compute the measured recovery time (compare against the RTO target)
echo "恢复开始:$(cat /tmp/restore_start_time.txt)"
echo "恢复结束:$(cat /tmp/restore_end_time.txt)"
echo "恢复耗时:$(( $(cat /tmp/restore_end_epoch.txt) - $(cat /tmp/restore_start_epoch.txt) )) 秒"
# ============ Clean up the drill environment ============
# 8. Put the original data directory back (if required)
# 9. Update the drill report
5.3 恢复演练报告模板
bash
# PostgreSQL 恢复演练报告
## 基本信息
- 演练日期:YYYY-MM-DD
- 演练类型:完整恢复 / PITR 恢复
- 参与人员:XXX
## 备份信息
- 使用备份:/backup/merged/MERGED_YYYYMMDD_HHMMSS
- 备份大小:XX GB
- 备份时间:YYYY-MM-DD HH:MM:SS
## 恢复过程
- 开始时间:HH:MM:SS
- 结束时间:HH:MM:SS
- 总耗时:XX 分钟
## 验证结果
- [ ] PostgreSQL 服务启动成功
- [ ] 数据库连接正常
- [ ] 数据完整性验证通过
- [ ] 应用连接测试通过
## 问题与改进
- 问题1: ...
- 改进措施: ...
## 结论
[ ] 演练成功,备份可用
[ ] 演练失败,需要改进
六、常见问题排查
6.1 故障排查手册
| 问题 | 可能原因 | 解决方案 |
|---|---|---|
| pg_basebackup 失败 | 复制用户权限不足 | 检查 pg_hba.conf 和用户权限 |
| 归档失败 | archive_command 路径错误 | 检查归档目录权限和路径 |
| pg_archivecleanup 不删除文件 | 复制槽未消费 | 检查 pg_replication_slots |
| 恢复后无法连接 | pg_hba.conf 丢失 | 单独备份配置文件 |
| PITR 恢复超时 | WAL 文件不完整 | 检查 WAL 归档连续性 |
| pg_combinebackup 失败 | 备份链断裂 | 重新执行全量备份 |
| sudo 权限不足 | 未配置免密 | 检查 /etc/sudoers 配置 |

表:故障排查手册
6.2 关键检查命令
bash
su - postgres
# Archiver status
psql -c "SELECT * FROM pg_stat_archiver;"
# Replication slots
psql -c "SELECT * FROM pg_replication_slots;"
# Replication status
psql -c "SELECT * FROM pg_stat_replication;"
# WAL positions. pg_current_wal_lsn() raises an error during recovery, so it
# is only evaluated on a primary; the other two are NULL there.
psql -c "SELECT CASE WHEN NOT pg_is_in_recovery() THEN pg_current_wal_lsn() END AS current_lsn, pg_last_wal_receive_lsn(), pg_last_wal_replay_lsn();"
# First 50 lines of the newest full backup's manifest
# (names embed a timestamp and globs are name-sorted, so the last match is newest)
for dir in /backup/base/FULL_*; do latest_full="${dir}"; done
head -n 50 "${latest_full}/backup_manifest"
# Verify every merged backup (pg_verifybackup takes exactly one directory)
for dir in /backup/merged/MERGED_*; do
  /opt/pg_install/bin/pg_verifybackup "${dir}"
done
# Disk space
df -h /backup
# Last 100 lines of the most recent PostgreSQL log file
tail -n 100 "$(ls -t /opt/pg_install/pg_data/log/postgresql-*.log | head -n 1)"
七,备份策略总结
7.1 推荐备份计划
bash
┌─────────────────────────────────────────────────────────────────┐
│ 备份周期示意图 │
├─────────────────────────────────────────────────────────────────┤
│ │
│ 周日 周一 周二 周三 周四 周五 周六 │
│ │ │ │ │ │ │ │ │
│ ▼ ▼ ▼ ▼ ▼ ▼ ▼ │
│ 全量 增量 增量 增量 增量 增量 增量 │
│ │ │ │ │ │ │ │ │
│ └───────┴───────┴───────┴───────┴───────┴───────┘ │
│ │ │
│ ▼ │
│ 每天合并备份 │
│ │ │
│ ▼ │
│ 每小时 WAL 清理 │
│ │
└─────────────────────────────────────────────────────────────────┘
7.2 保留策略
| 备份类型 | 保留周期 | 存储空间估算(100GB 数据库) |
|---|---|---|
| 全量备份 | 28 天 | 400GB(4 个全量) |
| 增量备份 | 7 天 | 50GB(7 个增量) |
| 合并备份 | 14 天 | 200GB(2 个合并) |
| WAL 归档 | 14 天 | 100GB |
| 合计 | - | ~750GB |
7.3 RTO/RPO 目标
| 指标 | 目标值 | 说明 |
|---|---|---|
| RPO(数据丢失) | < 5 分钟 | WAL 归档间隔 |
| RTO(恢复时间) | < 30 分钟 | 取决于数据量 |
| 备份成功率 | > 99.9% | 监控告警保障 |
八,附录
8.1 脚本权限设置
bash
# One-off: make postgres the owner of every script under /backup/scripts and
# set mode 755 (owner rwx, group/others r-x) on them.
sudo chown postgres:postgres /backup/scripts/*.sh
sudo chmod 755 /backup/scripts/*.sh
8.2 日志文件说明
| 日志文件 | 说明 |
|---|---|
| /backup/log/backup_full.log | 全量备份日志 |
| /backup/log/backup_incremental.log | 增量备份日志 |
| /backup/log/merge_backup.log | 合并备份日志 |
| /backup/log/cleanup_wal.log | WAL 清理日志 |
| /backup/log/restore.log | 恢复操作日志 |
| /backup/log/backup_history.csv | 备份历史记录 |
| /backup/log/restore_history.csv | 恢复历史记录 |
8.3 psql 常用参数速查表
| 参数 | 全称 | 说明 | 脚本中推荐 |
|---|---|---|---|
| -t | --tuples-only | 只输出数据行 | ✓ |
| -A | --no-align | 非对齐输出(不带边框) | ✓ |
| -X | --no-psqlrc | 不读取配置文件 | ✓ |
| -c | --command | 执行命令 | ✓ |
| -h | --host | 指定主机 | 按需 |
| -p | --port | 指定端口 | 按需 |
| -U | --username | 指定用户 | 按需 |
| -d | --dbname | 指定数据库 | 按需 |
| -f | --file | 执行文件 | 按需 |
| -1 | --single-transaction | 单事务执行 | 按需 |
8.4 紧急联系人
| 角色 | 姓名 | 联系方式 |
|---|---|---|
| DBA 负责人 | XXX | phone/email |
| 运维负责人 | XXX | phone/email |
| 应用负责人 | XXX | phone/email |
文档修订记录
| 版本 | 日期 | 修订内容 | 修订人 |
|---|---|---|---|
| v1.0 | 2026-02-19 | 初始版本 | - |