PostgreSQL源码分析——备份恢复

在上一篇《PostgreSQL源码分析——基础备份》中，我们分析了PG中基础备份的过程以及源码。备份与恢复是密不可分的，这里我们继续分析从基础备份中进行恢复的源码。

备份过程

执行备份：

sql 复制代码

postgres=# select pg_start_backup('bak3');
 pg_start_backup 
-----------------
 0/6000060
(1 row)

postgres=# insert into t1 values(5);
INSERT 0 1
postgres=# select pg_stop_backup();
NOTICE:  WAL archiving is not enabled; you must ensure that all required WAL segments are copied through other means to complete the backup
 pg_stop_backup 
----------------
 0/60002D8
(1 row)

查看日志：

bash 复制代码

postgres@slpc:~/pgsql/pgdata/pg_wal$ pg_waldump -p ../pg_wal 000000010000000000000006
rmgr: Standby     len (rec/tot):     50/    50, tx:          0, lsn: 0/06000028, prev 0/05000110, desc: RUNNING_XACTS nextXid 738 latestCompletedXid 737 oldestRunningXid 738
rmgr: Standby     len (rec/tot):     50/    50, tx:          0, lsn: 0/06000060, prev 0/06000028, desc: RUNNING_XACTS nextXid 738 latestCompletedXid 737 oldestRunningXid 738
rmgr: XLOG        len (rec/tot):    114/   114, tx:          0, lsn: 0/06000098, prev 0/06000060, desc: CHECKPOINT_ONLINE redo 0/6000060; tli 1; prev tli 1; fpw true; xid 0:738; oid 16387; multi 1; offset 0; oldest xid 726 in DB 1; oldest multi 1 in DB 1; oldest/newest commit timestamp xid: 0/0; oldest running xid 738; online
rmgr: Standby     len (rec/tot):     50/    50, tx:          0, lsn: 0/06000110, prev 0/06000098, desc: RUNNING_XACTS nextXid 738 latestCompletedXid 737 oldestRunningXid 738
rmgr: Heap        len (rec/tot):     54/   258, tx:        738, lsn: 0/06000148, prev 0/06000110, desc: INSERT off 5 flags 0x00, blkref #0: rel 1663/13010/16384 blk 0 FPW
rmgr: Transaction len (rec/tot):     34/    34, tx:        738, lsn: 0/06000250, prev 0/06000148, desc: COMMIT 2023-09-18 14:40:06.694650 CST
rmgr: Standby     len (rec/tot):     50/    50, tx:          0, lsn: 0/06000278, prev 0/06000250, desc: RUNNING_XACTS nextXid 739 latestCompletedXid 738 oldestRunningXid 739
rmgr: XLOG        len (rec/tot):     34/    34, tx:          0, lsn: 0/060002B0, prev 0/06000278, desc: BACKUP_END 0/6000060
rmgr: XLOG        len (rec/tot):     24/    24, tx:          0, lsn: 0/060002D8, prev 0/060002B0, desc: SWITCH

查看backup_label文件：

bash 复制代码

postgres@slpc:~/pgsql/pgbak2$ cat backup_label 
START WAL LOCATION: 0/6000060 (file 000000010000000000000006)
CHECKPOINT LOCATION: 0/6000098
BACKUP METHOD: pg_start_backup
BACKUP FROM: primary
START TIME: 2023-09-18 14:39:50 CST
LABEL: bak3
START TIMELINE: 1

恢复源码分析

启动由基础备份恢复出来的数据库时，如果检测到backup_label文件，则认为是从基础备份中进行恢复：此时读取backup_label中记录的检查点位置作为恢复的依据，而不是使用pg_control中记录的检查点。

c 复制代码

main(int argc, char *argv[])
--> PostmasterMain(argc, argv);
	--> LocalProcessControlFile(false);		// 读pg_control文件
	--> StartupPID = StartupDataBase();		// 启动startup子进程
		--> StartupProcessMain();
			--> StartupXLOG();
				--> ValidateXLOGDirectoryStructure();   // Verify that pg_wal and pg_wal/archive_status exist.
				--> readRecoverySignalFile();		// Check for signal files, and if so set up state for offline recovery
				--> validateRecoveryParameters();
				--> XLogReaderAllocate    // Allocate and initialize a new XLogReader.
				// 是否存在backup_label文件，如果存在的话，则认为是从一个备份文件进行恢复
				--> read_backup_label(&checkPointLoc, &backupEndRequired, &backupFromStandby)
				--> record = ReadCheckpointRecord(xlogreader, checkPointLoc, 0, true); // 读取backup_label中记录的检查点，回放的起点为该检查点的redo位置
					--> XLogBeginRead(xlogreader, RecPtr);   // Begin reading WAL at 'RecPtr'.
					--> record = ReadRecord(xlogreader, LOG, true);
									for (;;)
									{
										record = XLogReadRecord(xlogreader, &errormsg);		// Attempt to read an XLOG record.
									}
				--> StartupCLOG();
				/* REDO */
				if (InRecovery)
				{
					UpdateControlFile();
					CheckRecoveryConsistency();
					if (checkPoint.redo < RecPtr)
					{
						/* back up to find the record */
						XLogBeginRead(xlogreader, checkPoint.redo);
						record = ReadRecord(xlogreader, PANIC, false);
					} else {
						/* just have to read next record after CheckPoint */
						record = ReadRecord(xlogreader, LOG, false);
					}

					if (record != NULL)
					{
						/* main redo apply loop */
						do  // 回放日志
						{	
							// 判断是否已达到指定恢复位置，PITR用
							if (recoveryStopsBefore(xlogreader))
							{
								reachedRecoveryTarget = true;
								break;
							}

							/* Now apply the WAL record itself */
							RmgrTable[record->xl_rmid].rm_redo(xlogreader);
						}
					}
				}

核心函数StartupXLOG源码分析：

c 复制代码

/*
 * StartupXLOG (abridged excerpt; elided parts are marked with "// ...").
 *
 * Flow visible in this excerpt:
 *   1. Allocate the WAL reader.
 *   2. Pick the starting checkpoint: from backup_label if that file can be
 *      read, otherwise from pg_control.
 *   3. If InRecovery is set, record the backup start point in the control
 *      file, reset unlogged relations, run each resource manager's startup
 *      hook, and replay WAL records one by one until no further record can
 *      be read or the recovery target is reached.
 *   4. Re-read the last record to determine the end of valid WAL.
 *
 * NOTE(review): InRecovery, RecPtr, LastRec, EndRecPtr, dbstate_at_startup
 * and the local declarations are set up in code not shown here.
 */
void StartupXLOG(void)
{
	// ...

	/* Set up XLOG reader facility */
	MemSet(&private, 0, sizeof(XLogPageReadPrivate));
	xlogreader = XLogReaderAllocate(wal_segment_size, NULL, XL_ROUTINE(.page_read = &XLogPageRead, .segment_open = NULL, .segment_close = wal_segment_close), &private);

	// Read the backup_label file
	if (read_backup_label(&checkPointLoc, &backupEndRequired, &backupFromStandby))
	{
		/* Archive recovery was requested, and thanks to the backup label
		 * file, we know how far we need to replay to reach consistency. Enter
		 * archive recovery directly. */
		InArchiveRecovery = true;

		/* When a backup_label file is present, we want to roll forward from
		 * the checkpoint it identifies, rather than using pg_control. */
		record = ReadCheckpointRecord(xlogreader, checkPointLoc, 0, true);
		if (record != NULL)
		{
			memcpy(&checkPoint, XLogRecGetData(xlogreader), sizeof(CheckPoint));

			/* Make sure that REDO location exists. This may not be the case
			 * if there was a crash during an online backup, which left a
			 * backup_label around that references a WAL segment that's already been archived. */
			if (checkPoint.redo < checkPointLoc)
			{
				XLogBeginRead(xlogreader, checkPoint.redo);
				if (!ReadRecord(xlogreader, LOG, false))
					ereport(FATAL,(errmsg("could not find redo location referenced by checkpoint record"), errhint("If you are restoring from a backup, touch \"%s/recovery.signal\" and add required recovery options.\n" "If you are not restoring from a backup, try removing the file \"%s/backup_label\".\n" "Be careful: removing \"%s/backup_label\" will result in a corrupt cluster if restoring from a backup.", DataDir, DataDir, DataDir)));
			}
		}
		else
		{
			ereport(FATAL,
					(errmsg("could not locate required checkpoint record"),
					 errhint("If you are restoring from a backup, touch \"%s/recovery.signal\" and add required recovery options.\n"
							 "If you are not restoring from a backup, try removing the file \"%s/backup_label\".\n"
							 "Be careful: removing \"%s/backup_label\" will result in a corrupt cluster if restoring from a backup.", DataDir, DataDir, DataDir)));
			wasShutdown = false;	/* keep compiler quiet */
		}

		/* set flag to delete it later */
		haveBackupLabel = true;
	}
	else  // No backup_label file: fall back to the checkpoint recorded in pg_control.
	{
		/*
		 * Article author's note: when restoring a standby from a base
		 * backup, if backup_label has been lost and the checkpoint in
		 * pg_control is used instead, replay starts from the wrong
		 * location, consistency cannot be reached and recovery fails.
		 */
		/* Get the last valid checkpoint record. */
		checkPointLoc = ControlFile->checkPoint;
		RedoStartLSN = ControlFile->checkPointCopy.redo;
		record = ReadCheckpointRecord(xlogreader, checkPointLoc, 1, true);
		if (record != NULL)
		{
			ereport(DEBUG1,(errmsg_internal("checkpoint record is at %X/%X", LSN_FORMAT_ARGS(checkPointLoc))));
		}
		else
		{
			/*
			 * We used to attempt to go back to a secondary checkpoint record
			 * here, but only when not in standby mode. We now just fail if we
			 * can't read the last checkpoint because this allows us to
			 * simplify processing around checkpoints.
			 */
			ereport(PANIC,(errmsg("could not locate a valid checkpoint record")));
		}
		memcpy(&checkPoint, XLogRecGetData(xlogreader), sizeof(CheckPoint));
	}

	/* REDO */
	if (InRecovery)
	{
		/*
		 * Set backupStartPoint if we're starting recovery from a base backup.
		 *
		 * Also set backupEndPoint and use minRecoveryPoint as the backup end
		 * location if we're starting recovery from a base backup which was
		 * taken from a standby. In this case, the database system status in
		 * pg_control must indicate that the database was already in recovery.
		 * Usually that will be DB_IN_ARCHIVE_RECOVERY but also can be
		 * DB_SHUTDOWNED_IN_RECOVERY if recovery previously was interrupted
		 * before reaching this point; e.g. because restore_command or primary_conninfo were faulty.
		 *
		 * Any other state indicates that the backup somehow became corrupted and we can't sensibly continue with recovery.
		 */
		if (haveBackupLabel)
		{
			ControlFile->backupStartPoint = checkPoint.redo;		// restoring from a base backup: remember the checkpoint's redo location as the backup start point
			ControlFile->backupEndRequired = backupEndRequired;

			if (backupFromStandby)
			{
				if (dbstate_at_startup != DB_IN_ARCHIVE_RECOVERY &&
					dbstate_at_startup != DB_SHUTDOWNED_IN_RECOVERY)
					ereport(FATAL,
							(errmsg("backup_label contains data inconsistent with control file"),
							 errhint("This means that the backup is corrupted and you will "
									 "have to use another backup for recovery.")));
				ControlFile->backupEndPoint = ControlFile->minRecoveryPoint;
			}
		}

		UpdateControlFile();	// Update pg_control, mainly to persist the backup start location


		/*
		 * We're in recovery, so unlogged relations may be trashed and must be
		 * reset.  This should be done BEFORE allowing Hot Standby
		 * connections, so that read-only backends don't try to read whatever
		 * garbage is left over from before.
		 */
		ResetUnloggedRelations(UNLOGGED_RELATION_CLEANUP);

		/* Initialize resource managers */
		for (rmid = 0; rmid <= RM_MAX_ID; rmid++)
		{
			if (RmgrTable[rmid].rm_startup != NULL)
				RmgrTable[rmid].rm_startup();
		}

		CheckRecoveryConsistency();		// Checks if recovery has reached a consistent state.
	
		/*
		 * Find the first record that logically follows the checkpoint --- it
		 * might physically precede it, though. */
		if (checkPoint.redo < RecPtr)
		{
			/* back up to find the record */
			XLogBeginRead(xlogreader, checkPoint.redo);
			record = ReadRecord(xlogreader, PANIC, false);
		}
		else
		{
			/* just have to read next record after CheckPoint */
			record = ReadRecord(xlogreader, LOG, false);
		}

		if (record != NULL)
		{
			// The actual WAL replay happens here
			/* main redo apply loop */
			do
			{
				bool		switchedTLI = false;
				// Used for PITR: check whether replay has already reached the specified target
				/* Have we reached our recovery target? */
				if (recoveryStopsBefore(xlogreader))
				{
					reachedRecoveryTarget = true;
					break;
				}

				/* Now apply the WAL record itself */
				RmgrTable[record->xl_rmid].rm_redo(xlogreader);		// Dispatches on the record's resource manager id to standby_redo, xlog_redo, heap_redo, xact_redo, etc. to replay it

				/* Allow read-only connections if we're consistent now */
				CheckRecoveryConsistency();

				/* Exit loop if we reached inclusive recovery target */
				if (recoveryStopsAfter(xlogreader))
				{
					reachedRecoveryTarget = true;
					break;
				}
				
				/* Else, try to fetch the next WAL record */
				record = ReadRecord(xlogreader, LOG, false);	
			} while (record != NULL);  // keep going until no further record can be read
		}
	}

	/*
	 * Determine where to start writing WAL next.
	 *
	 * When recovery ended in an incomplete record, write a WAL record about
	 * that and continue after it.  In all other cases, re-fetch the last
	 * valid or last applied record, so we can identify the exact endpoint of
	 * what we consider the valid portion of WAL.
	 */
	XLogBeginRead(xlogreader, LastRec);
	record = ReadRecord(xlogreader, PANIC, false);
	EndOfLog = EndRecPtr;

	// ...

}

日志会一直回放到XLOG_BACKUP_END记录。回放该记录时，xlog_redo会清除pg_control中的backupStartPoint，标志着从基础备份恢复的过程结束：

c 复制代码

/*
 * XLOG resource manager's routines
 *
 * Definitions of info values are in include/catalog/pg_control.h, though
 * not all record types are related to control file updates.
 *
 * xlog_redo (abridged excerpt; elided parts are marked with "// ...")
 * replays one XLOG-rmgr record.  The record type is taken from the info
 * bits with XLR_INFO_MASK cleared, and the handling is selected by an
 * if/else chain on that type.  'lsn' is the record's end position
 * (record->EndRecPtr).
 *
 * The branch of interest for restoring from a base backup is
 * XLOG_BACKUP_END: when the start point stored in the record matches
 * ControlFile->backupStartPoint, the control file is updated to mark the
 * backup as finished.
 */
void xlog_redo(XLogReaderState *record)
{
	uint8		info = XLogRecGetInfo(record) & ~XLR_INFO_MASK;
	XLogRecPtr	lsn = record->EndRecPtr;


	if (info == XLOG_NEXTOID)
	{
		// ...
	}
	else if (info == XLOG_CHECKPOINT_SHUTDOWN)
	{
		CheckPoint	checkPoint;
		memcpy(&checkPoint, XLogRecGetData(record), sizeof(CheckPoint));

		// ...

		RecoveryRestartPoint(&checkPoint);
	}
	else if (info == XLOG_CHECKPOINT_ONLINE)
	{
		CheckPoint	checkPoint;
		memcpy(&checkPoint, XLogRecGetData(record), sizeof(CheckPoint));
		// ...

		RecoveryRestartPoint(&checkPoint);
	}
	else if (info == XLOG_OVERWRITE_CONTRECORD)
	{
		xl_overwrite_contrecord xlrec;

		memcpy(&xlrec, XLogRecGetData(record), sizeof(xl_overwrite_contrecord));
		VerifyOverwriteContrecord(&xlrec, record);
	}
	else if (info == XLOG_END_OF_RECOVERY)
	{
		xl_end_of_recovery xlrec;

		memcpy(&xlrec, XLogRecGetData(record), sizeof(xl_end_of_recovery));

		/*
		 * For Hot Standby, we could treat this like a Shutdown Checkpoint,
		 * but this case is rarer and harder to test, so the benefit doesn't
		 * outweigh the potential extra cost of maintenance.
		 */

		/*
		 * We should've already switched to the new TLI before replaying this
		 * record.
		 */
		if (xlrec.ThisTimeLineID != ThisTimeLineID)
			ereport(PANIC,
					(errmsg("unexpected timeline ID %u (should be %u) in checkpoint record",
							xlrec.ThisTimeLineID, ThisTimeLineID)));
	}
	else if (info == XLOG_NOOP)
	{
		/* nothing to do here */
	}
	else if (info == XLOG_SWITCH)
	{
		/* nothing to do here */
	}
	else if (info == XLOG_RESTORE_POINT)
	{
		/* nothing to do here */
	}
	else if (info == XLOG_FPI || info == XLOG_FPI_FOR_HINT)
	{
		/*
		 * Full-page image (FPI) records contain nothing else but a backup
		 * block (or multiple backup blocks). Every block reference must
		 * include a full-page image - otherwise there would be no point in
		 * this record.
		 *
		 * No recovery conflicts are generated by these generic records - if a
		 * resource manager needs to generate conflicts, it has to define a
		 * separate WAL record type and redo routine.
		 *
		 * XLOG_FPI_FOR_HINT records are generated when a page needs to be
		 * WAL- logged because of a hint bit update. They are only generated
		 * when checksums are enabled. There is no difference in handling
		 * XLOG_FPI and XLOG_FPI_FOR_HINT records, they use a different info
		 * code just to distinguish them for statistics purposes.
		 */
		for (uint8 block_id = 0; block_id <= record->max_block_id; block_id++)
		{
			Buffer		buffer;

			/* Every block must come back as restored from its image; anything else is an error. */
			if (XLogReadBufferForRedo(record, block_id, &buffer) != BLK_RESTORED)
				elog(ERROR, "unexpected XLogReadBufferForRedo result when restoring backup block");
			UnlockReleaseBuffer(buffer);
		}
	}
	else if (info == XLOG_BACKUP_END)	// Replaying this record marks the end of restoring from the base backup
	{
		XLogRecPtr	startpoint;

		/* The record payload is the start location of the backup it terminates. */
		memcpy(&startpoint, XLogRecGetData(record), sizeof(startpoint));

		/* Only act if this record ends the backup we are restoring from. */
		if (ControlFile->backupStartPoint == startpoint)
		{
			/*
			 * We have reached the end of base backup, the point where
			 * pg_stop_backup() was done. The data on disk is now consistent.
			 * Reset backupStartPoint, and update minRecoveryPoint to make
			 * sure we don't allow starting up at an earlier point even if
			 * recovery is stopped and restarted soon after this.
			 */
			elog(DEBUG1, "end of backup reached");

			LWLockAcquire(ControlFileLock, LW_EXCLUSIVE);

			/* Only ever move minRecoveryPoint forward, never backward. */
			if (ControlFile->minRecoveryPoint < lsn)
			{
				ControlFile->minRecoveryPoint = lsn;
				ControlFile->minRecoveryPointTLI = ThisTimeLineID;
			}
			ControlFile->backupStartPoint = InvalidXLogRecPtr;
			ControlFile->backupEndRequired = false;
			UpdateControlFile();

			LWLockRelease(ControlFileLock);
		}
	}
	else if (info == XLOG_PARAMETER_CHANGE)
	{
		// ...
	}
	else if (info == XLOG_FPW_CHANGE)
	{
		// ...
	}
}