在上一篇《PostgreSQL源码分析——基础备份》中,我们分析了PG中基础备份的过程及其源码。备份与恢复密不可分,这里我们继续分析从基础备份中进行恢复的源码。
备份过程
执行备份:
sql
postgres=# select pg_start_backup('bak3');
pg_start_backup
-----------------
0/6000060
(1 row)
postgres=# insert into t1 values(5);
INSERT 0 1
postgres=# select pg_stop_backup();
NOTICE: WAL archiving is not enabled; you must ensure that all required WAL segments are copied through other means to complete the backup
pg_stop_backup
----------------
0/60002D8
(1 row)
查看日志:
sql
postgres@slpc:~/pgsql/pgdata/pg_wal$ pg_waldump -p ../pg_wal 000000010000000000000006
rmgr: Standby len (rec/tot): 50/ 50, tx: 0, lsn: 0/06000028, prev 0/05000110, desc: RUNNING_XACTS nextXid 738 latestCompletedXid 737 oldestRunningXid 738
rmgr: Standby len (rec/tot): 50/ 50, tx: 0, lsn: 0/06000060, prev 0/06000028, desc: RUNNING_XACTS nextXid 738 latestCompletedXid 737 oldestRunningXid 738
rmgr: XLOG len (rec/tot): 114/ 114, tx: 0, lsn: 0/06000098, prev 0/06000060, desc: CHECKPOINT_ONLINE redo 0/6000060; tli 1; prev tli 1; fpw true; xid 0:738; oid 16387; multi 1; offset 0; oldest xid 726 in DB 1; oldest multi 1 in DB 1; oldest/newest commit timestamp xid: 0/0; oldest running xid 738; online
rmgr: Standby len (rec/tot): 50/ 50, tx: 0, lsn: 0/06000110, prev 0/06000098, desc: RUNNING_XACTS nextXid 738 latestCompletedXid 737 oldestRunningXid 738
rmgr: Heap len (rec/tot): 54/ 258, tx: 738, lsn: 0/06000148, prev 0/06000110, desc: INSERT off 5 flags 0x00, blkref #0: rel 1663/13010/16384 blk 0 FPW
rmgr: Transaction len (rec/tot): 34/ 34, tx: 738, lsn: 0/06000250, prev 0/06000148, desc: COMMIT 2023-09-18 14:40:06.694650 CST
rmgr: Standby len (rec/tot): 50/ 50, tx: 0, lsn: 0/06000278, prev 0/06000250, desc: RUNNING_XACTS nextXid 739 latestCompletedXid 738 oldestRunningXid 739
rmgr: XLOG len (rec/tot): 34/ 34, tx: 0, lsn: 0/060002B0, prev 0/06000278, desc: BACKUP_END 0/6000060
rmgr: XLOG len (rec/tot): 24/ 24, tx: 0, lsn: 0/060002D8, prev 0/060002B0, desc: SWITCH
查看backup_label文件:
sql
postgres@slpc:~/pgsql/pgbak2$ cat backup_label
START WAL LOCATION: 0/6000060 (file 000000010000000000000006)
CHECKPOINT LOCATION: 0/6000098
BACKUP METHOD: pg_start_backup
BACKUP FROM: primary
START TIME: 2023-09-18 14:39:50 CST
LABEL: bak3
START TIMELINE: 1
恢复源码分析
用备份的数据目录启动数据库时,如果检测到backup_label文件,则认为是从基础备份中进行恢复:此时从backup_label中读取检查点位置,而不是从pg_control中读取。
c
// Outline of the call path from main() down to the WAL redo loop during startup.
main(int argc, char *argv[])
--> PostmasterMain(argc, argv);
--> LocalProcessControlFile(false); // read the pg_control file
--> StartupPID = StartupDataBase(); // launch the startup child process
--> StartupProcessMain();
--> StartupXLOG();
--> ValidateXLOGDirectoryStructure(); // Verify that pg_wal and pg_wal/archive_status exist.
--> readRecoverySignalFile(); // Check for signal files, and if so set up state for offline recovery
--> validateRecoveryParameters();
--> XLogReaderAllocate // Allocate and initialize a new XLogReader.
// Check whether a backup_label file exists; if it does, this is treated as recovery from a backup
--> read_backup_label(&checkPointLoc, &backupEndRequired, &backupFromStandby)
--> record = ReadCheckpointRecord(xlogreader, checkPointLoc, 0, true); // replay starts from the checkpoint named in backup_label
--> XLogBeginRead(xlogreader, RecPtr); // Begin reading WAL at 'RecPtr'.
--> record = ReadRecord(xlogreader, LOG, true);
for (;;)
{
record = XLogReadRecord(xlogreader, &errormsg); // Attempt to read an XLOG record.
}
--> StartupCLOG();
/* REDO */
if (InRecovery)
{
UpdateControlFile();
CheckRecoveryConsistency();
if (checkPoint.redo < RecPtr)
{
/* back up to find the record */
XLogBeginRead(xlogreader, checkPoint.redo);
record = ReadRecord(xlogreader, PANIC, false);
} else {
/* just have to read next record after CheckPoint */
record = ReadRecord(xlogreader, LOG, false);
}
if (record != NULL)
{
/* main redo apply loop */
do // replay WAL records
{
// check whether the requested recovery target has been reached (used for PITR)
if (recoveryStopsBefore(xlogreader))
{
reachedRecoveryTarget = true;
break;
}
/* Now apply the WAL record itself */
RmgrTable[record->xl_rmid].rm_redo(xlogreader);
}
}
}
核心函数StartupXLOG
源码分析:
c
/*
 * StartupXLOG (abridged excerpt; elided parts are marked with "// ...").
 *
 * Chooses the checkpoint to start from -- the one named in backup_label when
 * that file is present, otherwise the one recorded in pg_control -- and, when
 * InRecovery is set, replays WAL records starting at the checkpoint's redo
 * location until no further record is read or a recovery target is reached.
 */
void StartupXLOG(void)
{
// ...
/* Set up XLOG reader facility */
MemSet(&private, 0, sizeof(XLogPageReadPrivate));
xlogreader = XLogReaderAllocate(wal_segment_size, NULL, XL_ROUTINE(.page_read = &XLogPageRead, .segment_open = NULL, .segment_close = wal_segment_close), &private);
// read the backup_label file
if (read_backup_label(&checkPointLoc, &backupEndRequired, &backupFromStandby))
{
/* Archive recovery was requested, and thanks to the backup label
* file, we know how far we need to replay to reach consistency. Enter
* archive recovery directly. */
InArchiveRecovery = true;
/* When a backup_label file is present, we want to roll forward from
* the checkpoint it identifies, rather than using pg_control. */
record = ReadCheckpointRecord(xlogreader, checkPointLoc, 0, true);
if (record != NULL)
{
memcpy(&checkPoint, XLogRecGetData(xlogreader), sizeof(CheckPoint));
/* Make sure that REDO location exists. This may not be the case
* if there was a crash during an online backup, which left a
* backup_label around that references a WAL segment that's already been archived. */
if (checkPoint.redo < checkPointLoc)
{
XLogBeginRead(xlogreader, checkPoint.redo);
if (!ReadRecord(xlogreader, LOG, false))
ereport(FATAL,(errmsg("could not find redo location referenced by checkpoint record"), errhint("If you are restoring from a backup, touch \"%s/recovery.signal\" and add required recovery options.\n" "If you are not restoring from a backup, try removing the file \"%s/backup_label\".\n" "Be careful: removing \"%s/backup_label\" will result in a corrupt cluster if restoring from a backup.", DataDir, DataDir, DataDir)));
}
}
else
{
ereport(FATAL,
(errmsg("could not locate required checkpoint record"),
errhint("If you are restoring from a backup, touch \"%s/recovery.signal\" and add required recovery options.\n"
"If you are not restoring from a backup, try removing the file \"%s/backup_label\".\n"
"Be careful: removing \"%s/backup_label\" will result in a corrupt cluster if restoring from a backup.", DataDir, DataDir, DataDir)));
wasShutdown = false; /* keep compiler quiet */
}
/* set flag to delete it later */
haveBackupLabel = true;
}
else // No backup_label file: use the checkpoint recorded in pg_control instead. When restoring a standby, if backup_label was lost and the pg_control checkpoint is used, replay starts from the wrong location, consistency cannot be reached, and recovery fails.
{
/* Get the last valid checkpoint record. */
checkPointLoc = ControlFile->checkPoint;
RedoStartLSN = ControlFile->checkPointCopy.redo;
record = ReadCheckpointRecord(xlogreader, checkPointLoc, 1, true);
if (record != NULL)
{
ereport(DEBUG1,(errmsg_internal("checkpoint record is at %X/%X", LSN_FORMAT_ARGS(checkPointLoc))));
}
else
{
/*
* We used to attempt to go back to a secondary checkpoint record
* here, but only when not in standby mode. We now just fail if we
* can't read the last checkpoint because this allows us to
* simplify processing around checkpoints.
*/
ereport(PANIC,(errmsg("could not locate a valid checkpoint record")));
}
memcpy(&checkPoint, XLogRecGetData(xlogreader), sizeof(CheckPoint));
}
/* REDO */
if (InRecovery)
{
/*
* Set backupStartPoint if we're starting recovery from a base backup.
*
* Also set backupEndPoint and use minRecoveryPoint as the backup end
* location if we're starting recovery from a base backup which was
* taken from a standby. In this case, the database system status in
* pg_control must indicate that the database was already in recovery.
* Usually that will be DB_IN_ARCHIVE_RECOVERY but also can be
* DB_SHUTDOWNED_IN_RECOVERY if recovery previously was interrupted
* before reaching this point; e.g. because restore_command or primary_conninfo were faulty.
*
* Any other state indicates that the backup somehow became corrupted and we can't sensibly continue with recovery.
*/
if (haveBackupLabel)
{
ControlFile->backupStartPoint = checkPoint.redo; // recovering from a base backup
ControlFile->backupEndRequired = backupEndRequired;
if (backupFromStandby)
{
if (dbstate_at_startup != DB_IN_ARCHIVE_RECOVERY &&
dbstate_at_startup != DB_SHUTDOWNED_IN_RECOVERY)
ereport(FATAL,
(errmsg("backup_label contains data inconsistent with control file"),
errhint("This means that the backup is corrupted and you will "
"have to use another backup for recovery.")));
ControlFile->backupEndPoint = ControlFile->minRecoveryPoint;
}
}
UpdateControlFile(); // update pg_control, mainly to persist the backup start location
/*
* We're in recovery, so unlogged relations may be trashed and must be
* reset. This should be done BEFORE allowing Hot Standby
* connections, so that read-only backends don't try to read whatever
* garbage is left over from before.
*/
ResetUnloggedRelations(UNLOGGED_RELATION_CLEANUP);
/* Initialize resource managers */
for (rmid = 0; rmid <= RM_MAX_ID; rmid++)
{
if (RmgrTable[rmid].rm_startup != NULL)
RmgrTable[rmid].rm_startup();
}
CheckRecoveryConsistency(); // Checks if recovery has reached a consistent state.
/*
* Find the first record that logically follows the checkpoint --- it
* might physically precede it, though. */
if (checkPoint.redo < RecPtr)
{
/* back up to find the record */
XLogBeginRead(xlogreader, checkPoint.redo);
record = ReadRecord(xlogreader, PANIC, false);
}
else
{
/* just have to read next record after CheckPoint */
record = ReadRecord(xlogreader, LOG, false);
}
if (record != NULL)
{
// the actual WAL replay happens here
/* main redo apply loop */
do
{
bool switchedTLI = false;
// used for PITR: checks whether replay has reached the requested target
/* Have we reached our recovery target? */
if (recoveryStopsBefore(xlogreader))
{
reachedRecoveryTarget = true;
break;
}
/* Now apply the WAL record itself */
RmgrTable[record->xl_rmid].rm_redo(xlogreader); // dispatches to standby_redo, xlog_redo, heap_redo, xact_redo, etc. to replay the record
/* Allow read-only connections if we're consistent now */
CheckRecoveryConsistency();
/* Exit loop if we reached inclusive recovery target */
if (recoveryStopsAfter(xlogreader))
{
reachedRecoveryTarget = true;
break;
}
/* Else, try to fetch the next WAL record */
record = ReadRecord(xlogreader, LOG, false);
} while (record != NULL); // loop until no further record can be read
}
}
/*
* Determine where to start writing WAL next.
*
* When recovery ended in an incomplete record, write a WAL record about
* that and continue after it. In all other cases, re-fetch the last
* valid or last applied record, so we can identify the exact endpoint of
* what we consider the valid portion of WAL.
*/
XLogBeginRead(xlogreader, LastRec);
record = ReadRecord(xlogreader, PANIC, false);
EndOfLog = EndRecPtr;
// ...
}
日志会一直回放到XLOG_BACKUP_END记录为止,该记录由xlog_redo函数负责回放:
c
/*
 * XLOG resource manager's routines
 *
 * Definitions of info values are in include/catalog/pg_control.h, though
 * not all record types are related to control file updates.
 *
 * xlog_redo (abridged excerpt; elided parts are marked with "// ...")
 * dispatches on the record's info bits (with XLR_INFO_MASK cleared) and
 * handles each XLOG record type in its own branch.
 */
void xlog_redo(XLogReaderState *record)
{
uint8 info = XLogRecGetInfo(record) & ~XLR_INFO_MASK;
XLogRecPtr lsn = record->EndRecPtr;
if (info == XLOG_NEXTOID)
{
// ...
}
else if (info == XLOG_CHECKPOINT_SHUTDOWN)
{
CheckPoint checkPoint;
memcpy(&checkPoint, XLogRecGetData(record), sizeof(CheckPoint));
// ...
RecoveryRestartPoint(&checkPoint);
}
else if (info == XLOG_CHECKPOINT_ONLINE)
{
CheckPoint checkPoint;
memcpy(&checkPoint, XLogRecGetData(record), sizeof(CheckPoint));
// ...
RecoveryRestartPoint(&checkPoint);
}
else if (info == XLOG_OVERWRITE_CONTRECORD)
{
xl_overwrite_contrecord xlrec;
memcpy(&xlrec, XLogRecGetData(record), sizeof(xl_overwrite_contrecord));
VerifyOverwriteContrecord(&xlrec, record);
}
else if (info == XLOG_END_OF_RECOVERY)
{
xl_end_of_recovery xlrec;
memcpy(&xlrec, XLogRecGetData(record), sizeof(xl_end_of_recovery));
/*
* For Hot Standby, we could treat this like a Shutdown Checkpoint,
* but this case is rarer and harder to test, so the benefit doesn't
* outweigh the potential extra cost of maintenance.
*/
/*
* We should've already switched to the new TLI before replaying this
* record.
*/
/*
* NOTE(review): the message below says "checkpoint record" although this
* branch handles XLOG_END_OF_RECOVERY -- confirm the wording against
* upstream before relying on it when reading logs.
*/
if (xlrec.ThisTimeLineID != ThisTimeLineID)
ereport(PANIC,
(errmsg("unexpected timeline ID %u (should be %u) in checkpoint record",
xlrec.ThisTimeLineID, ThisTimeLineID)));
}
else if (info == XLOG_NOOP)
{
/* nothing to do here */
}
else if (info == XLOG_SWITCH)
{
/* nothing to do here */
}
else if (info == XLOG_RESTORE_POINT)
{
/* nothing to do here */
}
else if (info == XLOG_FPI || info == XLOG_FPI_FOR_HINT)
{
/*
* Full-page image (FPI) records contain nothing else but a backup
* block (or multiple backup blocks). Every block reference must
* include a full-page image - otherwise there would be no point in
* this record.
*
* No recovery conflicts are generated by these generic records - if a
* resource manager needs to generate conflicts, it has to define a
* separate WAL record type and redo routine.
*
* XLOG_FPI_FOR_HINT records are generated when a page needs to be
* WAL-logged because of a hint bit update. They are only generated
* when checksums are enabled. There is no difference in handling
* XLOG_FPI and XLOG_FPI_FOR_HINT records, they use a different info
* code just to distinguish them for statistics purposes.
*/
for (uint8 block_id = 0; block_id <= record->max_block_id; block_id++)
{
Buffer buffer;
if (XLogReadBufferForRedo(record, block_id, &buffer) != BLK_RESTORED)
elog(ERROR, "unexpected XLogReadBufferForRedo result when restoring backup block");
UnlockReleaseBuffer(buffer);
}
}
else if (info == XLOG_BACKUP_END) // replaying this record marks the end of the backup recovery phase
{
XLogRecPtr startpoint;
/* The record payload is the start LSN of the backup that this record ends. */
memcpy(&startpoint, XLogRecGetData(record), sizeof(startpoint));
/* Only act when it matches the backup start point stored in the control file. */
if (ControlFile->backupStartPoint == startpoint)
{
/*
* We have reached the end of base backup, the point where
* pg_stop_backup() was done. The data on disk is now consistent.
* Reset backupStartPoint, and update minRecoveryPoint to make
* sure we don't allow starting up at an earlier point even if
* recovery is stopped and restarted soon after this.
*/
elog(DEBUG1, "end of backup reached");
LWLockAcquire(ControlFileLock, LW_EXCLUSIVE);
if (ControlFile->minRecoveryPoint < lsn)
{
ControlFile->minRecoveryPoint = lsn;
ControlFile->minRecoveryPointTLI = ThisTimeLineID;
}
ControlFile->backupStartPoint = InvalidXLogRecPtr;
ControlFile->backupEndRequired = false;
UpdateControlFile();
LWLockRelease(ControlFileLock);
}
}
else if (info == XLOG_PARAMETER_CHANGE)
{
// ...
}
else if (info == XLOG_FPW_CHANGE)
{
// ...
}
}