以insert into语句插入元组生成的wal日志为例:
准备数据
1、XLogRegisterData:
把xl_heap_insert链接到rdatas[num_rdatas++]
再把mainrdata_last指向rdatas[num_rdatas++]
2、XLogRegisterBuffer:
将用于插入该元组的缓冲区rnode、fork、block等存入registered_buffers[0]
3、XLogRegisterBufData:
将xl_heap_header存入rdatas[num_rdatas++]
再将rdatas[num_rdatas++]链接入registered_buffers[0]
4、XLogRegisterBufData:
将去除头部信息的元组数据存入rdatas[num_rdatas++];
再将再将rdatas[num_rdatas++]链接入registered_buffers[0]
写入xlog(XLogInsert)
1、构造xlog记录(XLogRecordAssemble)
构造xlog数据到已经预先分配空间的hdr_scratch中,依次为:
XLogRecord:最后填充
XLogRecordBlockHeader:
id:准备数据中使用block块可能有多个,这里从0开始依次标示第几块
fork_flags:准备数据中第2步的fork
data_length:数据准备中第3第4步数据总长度
BlockNumber:数据准备中第2步中查询到的block号(该插入位置为该表文件的第几页)
XLR_BLOCK_ID_DATA_LONG or XLR_BLOCK_ID_DATA_SHORT:取决于下面的mainrdata_len是否大于255
mainrdata_len:准备数据中第1步数据的长度
mainrdata:准备数据中第1第3第4中保存到rdatas中的数据
2、写入xlog缓存空间并刷新到磁盘
未完待续
详细代码分析
准备数据函数:
cpp
void
heap_insert(Relation relation, HeapTuple tup, CommandId cid,
int options, BulkInsertState bistate)
{
......
// 准备:开始拼装一个新的wal记录
XLogBeginInsert();
// 数据准备1:注册mainrdata到rdatas[num_rdatas++]
XLogRegisterData((char *) &xlrec, SizeOfHeapInsert);
xlhdr.t_infomask2 = heaptup->t_data->t_infomask2;
xlhdr.t_infomask = heaptup->t_data->t_infomask;
xlhdr.t_hoff = heaptup->t_data->t_hoff;
/*
* note we mark xlhdr as belonging to buffer; if XLogInsert decides to
* write the whole page to the xlog, we don't need to store
* xl_heap_header in the xlog.
*/
// 数据准备2:获取要插入哪个表的哪个分支的那一页等信息,保存到registered_buffers[block_id],block_id=0:因为此处只需要修改一个数据表页,所以只需要一个
XLogRegisterBuffer(0, buffer, REGBUF_STANDARD | bufflags);
// 数据准备3:继续向rdatas[num_rdatas++]添加简化的元组头信息。并将rdatas[num_rdatas++]链接到链表registered_buffers[block_id]中
XLogRegisterBufData(0, (char *) &xlhdr, SizeOfHeapHeader);
/* PG73FORMAT: write bitmap [+ padding] [+ oid] + data */
// 数据准备4:继续向rdatas[num_rdatas++]添加完整的元组数据。并将rdatas[num_rdatas++]链接到链表registered_buffers[block_id]中
XLogRegisterBufData(0,
(char *) heaptup->t_data + SizeofHeapTupleHeader,
heaptup->t_len - SizeofHeapTupleHeader);
/* filtering by origin on a row level is much more efficient */
XLogSetRecordFlags(XLOG_INCLUDE_ORIGIN);
// 数据插入:将wal日志数据复制到wal日志缓冲区并刷盘
recptr = XLogInsert(RM_HEAP_ID, info);
PageSetLSN(page, recptr);
......
}
写入wal日志函数
函数XLogInsert主要分为两步完成写入动作,分别由XLogRecordAssemble生成wal日志记录,由XLogInsertRecord完成复制到wal缓冲区并刷新到磁盘。
cpp
static XLogRecData *
XLogRecordAssemble(RmgrId rmid, uint8 info,
XLogRecPtr RedoRecPtr, bool doPageWrites,
XLogRecPtr *fpw_lsn, int *num_fpi)
{
XLogRecData *rdt;
uint32 total_len = 0;
int block_id;
pg_crc32c rdata_crc;
registered_buffer *prev_regbuf = NULL;
XLogRecData *rdt_datas_last;
XLogRecord *rechdr;
// 初始化时已分配好内存空间hdr_scratch用于拼装wal日志
// scratch指向内存开始处,每添加一部分内容就指向后续空间继续添加其他内容
char *scratch = hdr_scratch;
/*
* Note: this function can be called multiple times for the same record.
* All the modifications we do to the rdata chains below must handle that.
*/
/* The record begins with the fixed-size header */
// wal日志头部固定为结构体XLogRecord ,因为wal日志总长度最后才能确定,所以函数最后再填充这一部分
rechdr = (XLogRecord *) scratch;
scratch += SizeOfXLogRecord;
hdr_rdt.next = NULL;
rdt_datas_last = &hdr_rdt;
hdr_rdt.data = hdr_scratch;
/*
* Enforce consistency checks for this record if user is looking for it.
* Do this before at the beginning of this routine to give the possibility
* for callers of XLogInsert() to pass XLR_CHECK_CONSISTENCY directly for
* a record.
*/
if (wal_consistency_checking[rmid])
info |= XLR_CHECK_CONSISTENCY;
/*
* Make an rdata chain containing all the data portions of all block
* references. This includes the data for full-page images. Also append
* the headers for the block references in the scratch buffer.
*/
*fpw_lsn = InvalidXLogRecPtr;
for (block_id = 0; block_id < max_registered_block_id; block_id++)
{
registered_buffer *regbuf = ®istered_buffers[block_id];
bool needs_backup;
bool needs_data;
XLogRecordBlockHeader bkpb;
XLogRecordBlockImageHeader bimg;
XLogRecordBlockCompressHeader cbimg = {0};
bool samerel;
bool is_compressed = false;
bool include_image;
if (!regbuf->in_use)
continue;
/* Determine if this block needs to be backed up */
if (regbuf->flags & REGBUF_FORCE_IMAGE)
needs_backup = true;
else if (regbuf->flags & REGBUF_NO_IMAGE)
needs_backup = false;
else if (!doPageWrites)
needs_backup = false;
else
{
/*
* We assume page LSN is first data on *every* page that can be
* passed to XLogInsert, whether it has the standard page layout
* or not.
*/
XLogRecPtr page_lsn = PageGetLSN(regbuf->page);
needs_backup = (page_lsn <= RedoRecPtr);
if (!needs_backup)
{
if (*fpw_lsn == InvalidXLogRecPtr || page_lsn < *fpw_lsn)
*fpw_lsn = page_lsn;
}
}
/* Determine if the buffer data needs to included */
if (regbuf->rdata_len == 0)
needs_data = false;
else if ((regbuf->flags & REGBUF_KEEP_DATA) != 0)
needs_data = true;
else
needs_data = !needs_backup;
bkpb.id = block_id;
bkpb.fork_flags = regbuf->forkno;
bkpb.data_length = 0;
if ((regbuf->flags & REGBUF_WILL_INIT) == REGBUF_WILL_INIT)
bkpb.fork_flags |= BKPBLOCK_WILL_INIT;
/*
* If needs_backup is true or WAL checking is enabled for current
* resource manager, log a full-page write for the current block.
*/
include_image = needs_backup || (info & XLR_CHECK_CONSISTENCY) != 0;
if (include_image)
{
Page page = regbuf->page;
uint16 compressed_len = 0;
/*
* The page needs to be backed up, so calculate its hole length
* and offset.
*/
if (regbuf->flags & REGBUF_STANDARD)
{
/* Assume we can omit data between pd_lower and pd_upper */
uint16 lower = ((PageHeader) page)->pd_lower;
uint16 upper = ((PageHeader) page)->pd_upper;
if (lower >= SizeOfPageHeaderData &&
upper > lower &&
upper <= BLCKSZ)
{
bimg.hole_offset = lower;
cbimg.hole_length = upper - lower;
}
else
{
/* No "hole" to remove */
bimg.hole_offset = 0;
cbimg.hole_length = 0;
}
}
else
{
/* Not a standard page header, don't try to eliminate "hole" */
bimg.hole_offset = 0;
cbimg.hole_length = 0;
}
/*
* Try to compress a block image if wal_compression is enabled
*/
if (wal_compression)
{
is_compressed =
XLogCompressBackupBlock(page, bimg.hole_offset,
cbimg.hole_length,
regbuf->compressed_page,
&compressed_len);
}
/*
* Fill in the remaining fields in the XLogRecordBlockHeader
* struct
*/
bkpb.fork_flags |= BKPBLOCK_HAS_IMAGE;
/* Report a full page image constructed for the WAL record */
*num_fpi += 1;
/*
* Construct XLogRecData entries for the page content.
*/
rdt_datas_last->next = ®buf->bkp_rdatas[0];
rdt_datas_last = rdt_datas_last->next;
bimg.bimg_info = (cbimg.hole_length == 0) ? 0 : BKPIMAGE_HAS_HOLE;
/*
* If WAL consistency checking is enabled for the resource manager
* of this WAL record, a full-page image is included in the record
* for the block modified. During redo, the full-page is replayed
* only if BKPIMAGE_APPLY is set.
*/
if (needs_backup)
bimg.bimg_info |= BKPIMAGE_APPLY;
if (is_compressed)
{
bimg.length = compressed_len;
bimg.bimg_info |= BKPIMAGE_IS_COMPRESSED;
rdt_datas_last->data = regbuf->compressed_page;
rdt_datas_last->len = compressed_len;
}
else
{
bimg.length = BLCKSZ - cbimg.hole_length;
if (cbimg.hole_length == 0)
{
rdt_datas_last->data = page;
rdt_datas_last->len = BLCKSZ;
}
else
{
/* must skip the hole */
rdt_datas_last->data = page;
rdt_datas_last->len = bimg.hole_offset;
rdt_datas_last->next = ®buf->bkp_rdatas[1];
rdt_datas_last = rdt_datas_last->next;
rdt_datas_last->data =
page + (bimg.hole_offset + cbimg.hole_length);
rdt_datas_last->len =
BLCKSZ - (bimg.hole_offset + cbimg.hole_length);
}
}
total_len += bimg.length;
}
if (needs_data)
{
/*
* Link the caller-supplied rdata chain for this buffer to the
* overall list.
*/
bkpb.fork_flags |= BKPBLOCK_HAS_DATA;
bkpb.data_length = regbuf->rdata_len;
total_len += regbuf->rdata_len;
rdt_datas_last->next = regbuf->rdata_head;
rdt_datas_last = regbuf->rdata_tail;
}
if (prev_regbuf && RelFileNodeEquals(regbuf->rnode, prev_regbuf->rnode))
{
samerel = true;
bkpb.fork_flags |= BKPBLOCK_SAME_REL;
}
else
samerel = false;
prev_regbuf = regbuf;
/* Ok, copy the header to the scratch buffer */
// 在XLogRecord之后紧接着是XLogRecordBlockHeader,
// id:准备数据中使用block块可能有多个,这里从0开始依次标示第几块
// fork_flags:准备数据中第2步的fork
// data_length:数据准备中第3第4步数据总长度
memcpy(scratch, &bkpb, SizeOfXLogRecordBlockHeader);
scratch += SizeOfXLogRecordBlockHeader;
if (include_image)
{
memcpy(scratch, &bimg, SizeOfXLogRecordBlockImageHeader);
scratch += SizeOfXLogRecordBlockImageHeader;
if (cbimg.hole_length != 0 && is_compressed)
{
memcpy(scratch, &cbimg,
SizeOfXLogRecordBlockCompressHeader);
scratch += SizeOfXLogRecordBlockCompressHeader;
}
}
if (!samerel)
{
memcpy(scratch, ®buf->rnode, sizeof(RelFileNode));
scratch += sizeof(RelFileNode);
}
// 数据准备第2步中查询到的block号(该插入位置为该表文件的第几页)
memcpy(scratch, ®buf->block, sizeof(BlockNumber));
scratch += sizeof(BlockNumber);
}
/* followed by the record's origin, if any */
if ((curinsert_flags & XLOG_INCLUDE_ORIGIN) &&
replorigin_session_origin != InvalidRepOriginId)
{
*(scratch++) = (char) XLR_BLOCK_ID_ORIGIN;
memcpy(scratch, &replorigin_session_origin, sizeof(replorigin_session_origin));
scratch += sizeof(replorigin_session_origin);
}
/* followed by toplevel XID, if not already included in previous record */
if (IsSubTransactionAssignmentPending())
{
TransactionId xid = GetTopTransactionIdIfAny();
/* update the flag (later used by XLogResetInsertion) */
XLogSetRecordFlags(XLOG_INCLUDE_XID);
*(scratch++) = (char) XLR_BLOCK_ID_TOPLEVEL_XID;
memcpy(scratch, &xid, sizeof(TransactionId));
scratch += sizeof(TransactionId);
}
/* followed by main data, if any */
if (mainrdata_len > 0)
{
if (mainrdata_len > 255)
{
// 标示出mainrdata_len是否大于255
*(scratch++) = (char) XLR_BLOCK_ID_DATA_LONG;
// mainrdata_len的具体值
memcpy(scratch, &mainrdata_len, sizeof(uint32));
scratch += sizeof(uint32);
}
else
{
*(scratch++) = (char) XLR_BLOCK_ID_DATA_SHORT;
*(scratch++) = (uint8) mainrdata_len;
}
rdt_datas_last->next = mainrdata_head;
rdt_datas_last = mainrdata_last;
total_len += mainrdata_len;
}
rdt_datas_last->next = NULL;
hdr_rdt.len = (scratch - hdr_scratch);
total_len += hdr_rdt.len;
/*
* Calculate CRC of the data
*
* Note that the record header isn't added into the CRC initially since we
* don't know the prev-link yet. Thus, the CRC will represent the CRC of
* the whole record in the order: rdata, then backup blocks, then record
* header.
*/
// 初始化CRC上下文
INIT_CRC32C(rdata_crc);
// 计算已经复制到hdr_scratch 中除XLogRecord以外的wal头部数据
COMP_CRC32C(rdata_crc, hdr_scratch + SizeOfXLogRecord, hdr_rdt.len - SizeOfXLogRecord);
// 计算所有注册到链表中的wal日志数据
for (rdt = hdr_rdt.next; rdt != NULL; rdt = rdt->next)
COMP_CRC32C(rdata_crc, rdt->data, rdt->len);
/*
* Fill in the fields in the record header. Prev-link is filled in later,
* once we know where in the WAL the record will be inserted. The CRC does
* not include the record header yet.
*/
// 最后可以填充XLogRecord了
rechdr->xl_xid = GetCurrentTransactionIdIfAny();
rechdr->xl_tot_len = total_len;
rechdr->xl_info = info;
rechdr->xl_rmid = rmid;
rechdr->xl_prev = InvalidXLogRecPtr;
rechdr->xl_crc = rdata_crc;
return &hdr_rdt;
}