Redis源码分析之持久化
- Redis持久化原理可以查看本人另一篇文章:Redis原理之持久化
- 在进入源码分析前,需要简单了解一下Redis的主要流程,可以参考本人另外一篇文章:Redis源码分析之基础流程
1. RDB
1.1 RDB文件自动保存
在serverCron函数中,有一段判断,如果配置中设置了save <seconds> <changes>,则会判断是否满足条件。saveparam即为该配置对应的结构体。save可以配置多条,只要满足其中一条配置的条件,就会执行持久化,即调用rdbSaveBackground方法。
c
//server.c#serverCron
for (j = 0; j < server.saveparamslen; j++) {
struct saveparam *sp = server.saveparams+j;
/* Save if we reached the given amount of changes,
* the given amount of seconds, and if the latest bgsave was
* successful or if, in case of an error, at least
* CONFIG_BGSAVE_RETRY_DELAY seconds already elapsed. */
if (server.dirty >= sp->changes &&
server.unixtime-server.lastsave > sp->seconds &&
(server.unixtime-server.lastbgsave_try >
CONFIG_BGSAVE_RETRY_DELAY ||
server.lastbgsave_status == C_OK)) {
serverLog(LL_NOTICE,"%d changes in %d seconds. Saving...",
sp->changes, (int)sp->seconds);
rdbSaveInfo rsi, *rsiptr;
rsiptr = rdbPopulateSaveInfo(&rsi);
rdbSaveBackground(server.rdb_filename,rsiptr);
break;
}
}
在rdbSaveBackground方法中,会调用fork函数,创建一个子进程,然后在子进程中执行rdbSave方法。
c
//rdb.c#rdbSaveBackground
if ((childpid = fork()) == 0) {
int retval;
/* Child */
closeClildUnusedResourceAfterFork();
redisSetProcTitle("redis-rdb-bgsave");
retval = rdbSave(filename,rsi);
if (retval == C_OK) {
size_t private_dirty = zmalloc_get_private_dirty(-1);
if (private_dirty) {
serverLog(LL_NOTICE,
"RDB: %zu MB of memory used by copy-on-write",
private_dirty/(1024*1024));
}
server.child_info_data.cow_size = private_dirty;
sendChildInfo(CHILD_INFO_TYPE_RDB);
}
exitFromChild((retval == C_OK) ? 0 : 1);
}
在rdbSave中,会调用最重要的方法rdbSaveRio将数据持久化,同时也会进行一些其他操作,如记录信息。
c
//rdb.c#rdbSave
if (rdbSaveRio(&rdb,&error,RDB_SAVE_NONE,rsi) == C_ERR) {...}
...
if (fflush(fp) == EOF) goto werr;
if (fsync(fileno(fp)) == -1) goto werr;
if (fclose(fp) == EOF) goto werr;
if (rename(tmpfile,filename) == -1) {...}
...
server.dirty = 0;
server.lastsave = time(NULL);
server.lastbgsave_status = C_OK;
1.2 RDB子进程执行完毕
在server.c文件中的serverCron,有一段代码进行判断
c
//server.c#serverCron
if (server.rdb_child_pid != -1 || server.aof_child_pid != -1 ||
ldbPendingChildren())
{
int statloc;
pid_t pid;
if ((pid = wait3(&statloc,WNOHANG,NULL)) != 0) {
int exitcode = WEXITSTATUS(statloc);
int bysignal = 0;
if (WIFSIGNALED(statloc)) bysignal = WTERMSIG(statloc);
if (pid == -1) {
serverLog(LL_WARNING,"wait3() returned an error: %s. "
"rdb_child_pid = %d, aof_child_pid = %d",
strerror(errno),
(int) server.rdb_child_pid,
(int) server.aof_child_pid);
} else if (pid == server.rdb_child_pid) {
backgroundSaveDoneHandler(exitcode,bysignal);
if (!bysignal && exitcode == 0) receiveChildInfo();
} else if (pid == server.aof_child_pid) {
backgroundRewriteDoneHandler(exitcode,bysignal);
if (!bysignal && exitcode == 0) receiveChildInfo();
} else {
if (!ldbRemoveChild(pid)) {
serverLog(LL_WARNING,
"Warning, detected child with unmatched pid: %ld",
(long)pid);
}
}
updateDictResizePolicy();
closeChildInfoPipe();
}
}
最终会调用backgroundSaveDoneHandlerDisk方法,做一些统计工作。
c
//rdb.c#backgroundSaveDoneHandlerDisk
void backgroundSaveDoneHandlerDisk(int exitcode, int bysignal) {
if (!bysignal && exitcode == 0) {
serverLog(LL_NOTICE,
"Background saving terminated with success");
server.dirty = server.dirty - server.dirty_before_bgsave;
server.lastsave = time(NULL);
server.lastbgsave_status = C_OK;
} else if (!bysignal && exitcode != 0) {
serverLog(LL_WARNING, "Background saving error");
server.lastbgsave_status = C_ERR;
} else {
mstime_t latency;
serverLog(LL_WARNING,
"Background saving terminated by signal %d", bysignal);
latencyStartMonitor(latency);
rdbRemoveTempFile(server.rdb_child_pid);
latencyEndMonitor(latency);
latencyAddSampleIfNeeded("rdb-unlink-temp-file",latency);
/* SIGUSR1 is whitelisted, so we have a way to kill a child without
* tirggering an error condition. */
if (bysignal != SIGUSR1)
server.lastbgsave_status = C_ERR;
}
server.rdb_child_pid = -1;
server.rdb_child_type = RDB_CHILD_TYPE_NONE;
server.rdb_save_time_last = time(NULL)-server.rdb_save_time_start;
server.rdb_save_time_start = -1;
/* Possibly there are slaves waiting for a BGSAVE in order to be served
* (the first stage of SYNC is a bulk transfer of dump.rdb) */
updateSlavesWaitingBgsave((!bysignal && exitcode == 0) ? C_OK : C_ERR, RDB_CHILD_TYPE_DISK);
}
1.3 保存时机
1)执行命令save,内部会走rdbSave逻辑。
2)执行命令bgsave,内部会走rdbSaveBackground逻辑。
3)根据配置save,内部会走rdbSaveBackground逻辑。
1.4 RDB持久化日志
csharp
859:M 13 May 09:08:08.097 * Background saving started by pid 4422
4422:C 13 May 09:08:08.105 * DB saved on disk
4422:C 13 May 09:08:08.106 * RDB: 0 MB of memory used by copy-on-write
859:M 13 May 09:08:08.197 * Background saving terminated with success
2. AOF
每一条命令的执行都会调用server.c#call函数,AOF命令的同步就是在该函数中实现。如果开启了AOF,则每条命令执行完毕后都会同步写入到aof_buf中,aof_buf是一个全局的SDS类型的缓冲区。
2.1 数据格式
Redis通过catAppendOnlyGenericCommand函数,将命令(过期命令除外)转换成保存在缓冲区中的数据结构。以set key aof命令为例,实际存储保存在缓冲区的格式为:
*3\r\n$3\r\nset\r\n$3\r\nkey\r\n$3\r\naof\r\n
其中,\r\n为分隔符;*3表示命令有3个参数;第一个$3表示接下来的第1个参数长度为3,顺序读取到的字符为set;第二个$3表示第2个参数的长度为3,读取到key;同时,第三个$3表示第3个参数长度为3,读取到aof。
2.2 写入缓冲区
在server.c#call函数内部,会进行一系列的判断,然后会调用propagate方法。
c
//server.c#propagate
void propagate(struct redisCommand *cmd, int dbid, robj **argv, int argc,
int flags)
{
if (server.aof_state != AOF_OFF && flags & PROPAGATE_AOF)
feedAppendOnlyFile(cmd,dbid,argv,argc);
if (flags & PROPAGATE_REPL)
replicationFeedSlaves(server.slaves,dbid,argv,argc);
}
然后调用feedAppendOnlyFile函数,把命令追加到aof_buf中。
c
//aof.c#feedAppendOnlyFile
if (server.aof_state == AOF_ON)
server.aof_buf = sdscatlen(server.aof_buf,buf,sdslen(buf));
如果此时正在进行重写的话,那么也需要把该修改写到aof_rewrite_buf_blocks中
c
//aof.c#feedAppendOnlyFile
if (server.aof_child_pid != -1)
aofRewriteBufferAppend((unsigned char*)buf,sdslen(buf));
在call函数里只看到了将命令写入到aof_buf,并没有看到文件持久化策略。其实这部分是在事件循环的beforeSleep以及定时任务serverCron中执行的。
2.3 AOF文件写入策略
本质上,Redis的执行是依靠事件循环,然后进行事件处理。入口如下:
c
//ae.c#aeMain
void aeMain(aeEventLoop *eventLoop) {
eventLoop->stop = 0;
while (!eventLoop->stop) {
if (eventLoop->beforesleep != NULL)
eventLoop->beforesleep(eventLoop);
aeProcessEvents(eventLoop, AE_ALL_EVENTS|AE_CALL_AFTER_SLEEP);
}
}
1)而eventLoop->beforesleep实际上就是server.c中的beforeSleep函数。在里面会调用两个函数,一个是flushAppendOnlyFile,将AOF缓存内容持久化;另一个是handleClientsWithPendingWrites,用于回复客户端。
c
//server.c#beforeSleep
/* Write the AOF buffer on disk */
flushAppendOnlyFile(0);
/* Handle writes with pending output buffers. */
handleClientsWithPendingWrites();
2)在flushAppendOnlyFile函数中,会将aof_buf的数据,写入到文件缓冲区里去。
c
//aof.c#flushAppendOnlyFile
nwritten = aofWrite(server.aof_fd,server.aof_buf,sdslen(server.aof_buf));
3)然后会进行判断,如果是always策略,那么会调用fsync操作,将文件缓冲区数据写到磁盘中。如果是everysec策略的话,需要判断当前unixtime是否大于上一次执行fsync操作的时间,因为unixtime的单位为秒,所以保证至少间隔1秒才执行一次。另外,此时fsync是通过后台线程执行的。后台线程是在服务启动的时候创建的,这里是将要执行的任务封装成job,同时将该线程从阻塞中唤醒,然后由该线程执行fsync操作。
c
/* Perform the fsync if needed. */
if (server.aof_fsync == AOF_FSYNC_ALWAYS) {
/* redis_fsync is defined as fdatasync() for Linux in order to avoid
* flushing metadata. */
latencyStartMonitor(latency);
redis_fsync(server.aof_fd); /* Let's try to get this data on the disk */
latencyEndMonitor(latency);
latencyAddSampleIfNeeded("aof-fsync-always",latency);
server.aof_fsync_offset = server.aof_current_size;
server.aof_last_fsync = server.unixtime;
} else if ((server.aof_fsync == AOF_FSYNC_EVERYSEC &&
server.unixtime > server.aof_last_fsync)) {
if (!sync_in_progress) {
aof_background_fsync(server.aof_fd);
server.aof_fsync_offset = server.aof_current_size;
}
server.aof_last_fsync = server.unixtime;
}
4)实际上,如果是everysec策略,Redis考虑到fsync操作可能会比较慢,因此有相应的判断。如果有任务正在执行(sync_in_progress大于0),再判断一下aof_flush_postponed_start时间,如果是第一次推迟,则会直接返回。(另外serverCron中也有针对aof_flush_postponed_start参数的flushAppendOnlyFile调用)。如果该参数大于0,则会判断是否超过2s,如果没超过则返回。如果超过则会继续往下执行write()方法。此处有阻塞主线程的风险,如果后台子线程由于磁盘负载过高,导致fsync发生阻塞,迟迟不能返回,那么主线程在执行write()系统调用时,也会阻塞住,直到后台线程fsync执行完成,主线程执行write才能成功返回。
所以实际上,在极端情况下,everysec配置最多可能丢失2s数据。
c
if (server.aof_fsync == AOF_FSYNC_EVERYSEC)
sync_in_progress = aofFsyncInProgress();
if (server.aof_fsync == AOF_FSYNC_EVERYSEC && !force) {
/* With this append fsync policy we do background fsyncing.
* If the fsync is still in progress we can try to delay
* the write for a couple of seconds. */
if (sync_in_progress) {
if (server.aof_flush_postponed_start == 0) {
/* No previous write postponing, remember that we are
* postponing the flush and return. */
server.aof_flush_postponed_start = server.unixtime;
return;
} else if (server.unixtime - server.aof_flush_postponed_start < 2) {
/* We were already waiting for fsync to finish, but for less
* than two seconds this is still ok. Postpone again. */
return;
}
/* Otherwise fall trough, and go write since we can't wait
* over two seconds. */
server.aof_delayed_fsync++;
serverLog(LL_NOTICE,"Asynchronous AOF fsync is taking too long (disk is busy?). Writing the AOF buffer without waiting for fsync to complete, this may slow down Redis.");
}
}
在flushAppendOnlyFile()执行完之后,会执行handleClientsWithPendingWrites(),将命令的执行结果返回给客户端。因此如果是always策略,可以保证,返回给客户端之前,AOF的内容已经写到磁盘中了。
如果写入策略是No,那么可以看到,只有write操作,并没有fsync操作,该操作交给操作系统处理,由操作系统决定什么时候执行fsync。
2.4 AOF重写
触发AOF重写有两个地方:
1)命令输入,bgrewriteaof命令
2)定时判断
在server.c的serverCron方法中
c
//server.c#serverCron
/* Trigger an AOF rewrite if needed. */
if (server.aof_state == AOF_ON &&
server.rdb_child_pid == -1 &&
server.aof_child_pid == -1 &&
server.aof_rewrite_perc &&
server.aof_current_size > server.aof_rewrite_min_size)
{
long long base = server.aof_rewrite_base_size ?
server.aof_rewrite_base_size : 1;
long long growth = (server.aof_current_size*100/base) - 100;
if (growth >= server.aof_rewrite_perc) {
serverLog(LL_NOTICE,"Starting automatic rewriting of AOF on %lld%% growth",growth);
rewriteAppendOnlyFileBackground();
}
}
这两种方式,最终都会执行rewriteAppendOnlyFileBackground方法。
在该方法中,首先会创建管道(pipe)。在了解管道作用之前,先弄清楚另外一个问题。在重写时,父进程仍然会有命令在执行,那么如何保证重写完成后的文件也包括父进程在这段时间操作的命令呢?首先,需要父进程将重写过程中执行的命令进行保存,其次需要将这些命令在重写后的文件中进行回放。Redis为了尽量减少主进程阻塞的时间,会通过管道按批次将父进程累计的命令发送给子进程,由子进程重写完成后进行回放。因此子进程退出后只会有少量的命令还累计在父进程中,父进程只需要回放这些命令。
c
//aof.c#rewriteAppendOnlyFileBackground
if (aofCreatePipes() != C_OK) return C_ERR;
openChildInfoPipe();
另外在重写过程中,会将接收到的命令保存到aof_rewrite_buf_blocks中,这是一个list类型的缓冲区,每个节点保存一个aofrwblock类型的数据。aofrwblock是一个结构体,会保存10MB大小的缓冲区内容,并且有缓冲区使用和空闲长度的记录,当一个节点缓冲区写满后,会开辟一个新的节点继续保存执行过的命令。该结构体如下:
c
typedef struct aofrwblock {
unsigned long used, free;
char buf[AOF_RW_BUF_BLOCK_SIZE];//#define AOF_RW_BUF_BLOCK_SIZE (1024*1024*10) /* 10 MB per block */
} aofrwblock;
在aofCreatePipes方法里,会创建三对管道。fd0/fd1、fd2/fd3、fd4/fd5。它们各自配对。父进程通过fd1将重写时累计的命令发送给子进程,子进程通过fd0进行接收保存到aof_child_diff中。当子进程完成重写时,向fd3写入一个"!"字符,通知父进程不需要继续通过管道发送累计命令。父进程fd2接收到"!"符号后,也向fd5写入一个"!"符号进行确认。子进程通过fd4同步阻塞收到"!"号后,才可进行后续的退出操作。退出时将收到的累计命令进行回放,然后执行fsync
c
//aof.c#aofCreatePipes
int aofCreatePipes(void) {
int fds[6] = {-1, -1, -1, -1, -1, -1};
int j;
if (pipe(fds) == -1) goto error; /* parent -> children data. */
if (pipe(fds+2) == -1) goto error; /* children -> parent ack. */
if (pipe(fds+4) == -1) goto error; /* parent -> children ack. */
/* Parent -> children data is non blocking. */
if (anetNonBlock(NULL,fds[0]) != ANET_OK) goto error;
if (anetNonBlock(NULL,fds[1]) != ANET_OK) goto error;
if (aeCreateFileEvent(server.el, fds[2], AE_READABLE, aofChildPipeReadable, NULL) == AE_ERR) goto error;
server.aof_pipe_write_data_to_child = fds[1];
server.aof_pipe_read_data_from_parent = fds[0];
server.aof_pipe_write_ack_to_parent = fds[3];
server.aof_pipe_read_ack_from_child = fds[2];
server.aof_pipe_write_ack_to_child = fds[5];
server.aof_pipe_read_ack_from_parent = fds[4];
server.aof_stop_sending_diff = 0;
return C_OK;
}
管道用途如下图所示:
下面对于上述描述,进行详细分析。主要流程如下:
1)调用fork()函数,复制一个子进程。然后调用rewriteAppendOnlyFile()方法进行重写。最后完成后,会进行退出,调用exitFromChild()方法
c
//aof.c#rewriteAppendOnlyFileBackground
if ((childpid = fork()) == 0) {
char tmpfile[256];
/* Child */
closeClildUnusedResourceAfterFork();
redisSetProcTitle("redis-aof-rewrite");
snprintf(tmpfile,256,"temp-rewriteaof-bg-%d.aof", (int) getpid());
if (rewriteAppendOnlyFile(tmpfile) == C_OK) {
size_t private_dirty = zmalloc_get_private_dirty(-1);
if (private_dirty) {
serverLog(LL_NOTICE,
"AOF rewrite: %zu MB of memory used by copy-on-write",
private_dirty/(1024*1024));
}
server.child_info_data.cow_size = private_dirty;
sendChildInfo(CHILD_INFO_TYPE_AOF);
exitFromChild(0);
} else {
exitFromChild(1);
}
}
2)在rewriteAppendOnlyFile方法里会调用rewriteAppendOnlyFileRio执行重写的逻辑。重写的逻辑这里就不详细介绍,大致就是将数据库中的数据转换成命令,写到文件中。
c
//aof#rewriteAppendOnlyFile
...
snprintf(tmpfile,256,"temp-rewriteaof-%d.aof", (int) getpid());
fp = fopen(tmpfile,"w");
rioInitWithFile(&aof,fp);
...
if (rewriteAppendOnlyFileRio(&aof) == C_ERR) goto werr;
3)子进程会花费一些时间,等待从父进程读取数据。如果总耗时超过1s,或者连续20次(每次最多等待1ms)都没有可读数据,那么进行后续步骤。在这过程中,会接收父进程的数据。
c
//aof#rewriteAppendOnlyFile
int nodata = 0;
mstime_t start = mstime();
while(mstime()-start < 1000 && nodata < 20) {
if (aeWait(server.aof_pipe_read_data_from_parent, AE_READABLE, 1) <= 0)
{
nodata++;
continue;
}
nodata = 0; /* Start counting from zero, we stop on N *contiguous*
timeouts. */
aofReadDiffFromParent();
}
4)aofReadDiffFromParent就是从aof_pipe_read_data_from_parent(fd0)该管道中读取数据。然后保存到aof_child_diff中
c
//aof.c#aofReadDiffFromParent
ssize_t aofReadDiffFromParent(void) {
char buf[65536]; /* Default pipe buffer size on most Linux systems. */
ssize_t nread, total = 0;
while ((nread =
read(server.aof_pipe_read_data_from_parent,buf,sizeof(buf))) > 0) {
server.aof_child_diff = sdscatlen(server.aof_child_diff,buf,nread);
total += nread;
}
return total;
}
5)那么父进程何时发送数据给子进程呢?在之前aof有提到,如果进行重写的话,会将命令存储起来。aofRewriteBufferAppend方法中,会将命令存储到aof_rewrite_buf_blocks中,同时也会创建文件事件,该事件的处理逻辑在aofChildWriteDiffData中。
c
//aof.c#feedAppendOnlyFile
if (server.aof_child_pid != -1)
aofRewriteBufferAppend((unsigned char*)buf,sdslen(buf));
c
//aof#aofRewriteBufferAppend
if (aeGetFileEvents(server.el,server.aof_pipe_write_data_to_child) == 0) {
aeCreateFileEvent(server.el, server.aof_pipe_write_data_to_child,
AE_WRITABLE, aofChildWriteDiffData, NULL);
}
6)在aofChildWriteDiffData方法中,如果aof_stop_sending_diff已被置位(即已收到子进程要求停止发送的通知),或者aof_rewrite_buf_blocks中已经没有数据,则会移除该事件。否则会将aof_rewrite_buf_blocks中的数据发送给子进程。
c
//aof#aofChildWriteDiffData
while(1) {
ln = listFirst(server.aof_rewrite_buf_blocks);
block = ln ? ln->value : NULL;
if (server.aof_stop_sending_diff || !block) {
aeDeleteFileEvent(server.el,server.aof_pipe_write_data_to_child,
AE_WRITABLE);
return;
}
if (block->used > 0) {
nwritten = write(server.aof_pipe_write_data_to_child,
block->buf,block->used);
if (nwritten <= 0) return;
memmove(block->buf,block->buf+nwritten,block->used-nwritten);
block->used -= nwritten;
block->free += nwritten;
}
if (block->used == 0) listDelNode(server.aof_rewrite_buf_blocks,ln);
}
7)在花费一段时间接收重写过程中的命令后,会向父进程发送"!",让父进程停止发送命令。然后等待父进程回复"!"命令。
c
//aof.c#rewriteAppendOnlyFile
/* Ask the master to stop sending diffs. */
if (write(server.aof_pipe_write_ack_to_parent,"!",1) != 1) goto werr;
if (anetNonBlock(NULL,server.aof_pipe_read_ack_from_parent) != ANET_OK)
goto werr;
/* We read the ACK from the server using a 10 seconds timeout. Normally
* it should reply ASAP, but just in case we lose its reply, we are sure
* the child will eventually get terminated. */
if (syncRead(server.aof_pipe_read_ack_from_parent,&byte,1,5000) != 1 ||
byte != '!') goto werr;
8)那么父进程哪里有监听该管道呢?在aofCreatePipes方法中,会创建一个文件事件,处理函数为aofChildPipeReadable。在aofChildPipeReadable函数中,会从子进程中接收到"!"命令,然后回复给子进程"!"字符,并且删除该事件。
c
//aof.c#aofCreatePipes
if (aeCreateFileEvent(server.el, fds[2], AE_READABLE, aofChildPipeReadable, NULL) == AE_ERR) goto error;
c
//aof.c#aofChildPipeReadable
void aofChildPipeReadable(aeEventLoop *el, int fd, void *privdata, int mask) {
...
if (read(fd,&byte,1) == 1 && byte == '!') {
serverLog(LL_NOTICE,"AOF rewrite child asks to stop sending diffs.");
server.aof_stop_sending_diff = 1;
if (write(server.aof_pipe_write_ack_to_child,"!",1) != 1) {
/* If we can't send the ack, inform the user, but don't try again
* since in the other side the children will use a timeout if the
* kernel can't buffer our write, or, the children was
* terminated. */
serverLog(LL_WARNING,"Can't send ACK to AOF child: %s",
strerror(errno));
}
}
/* Remove the handler since this can be called only one time during a
* rewrite. */
aeDeleteFileEvent(server.el,server.aof_pipe_read_ack_from_child,AE_READABLE);
}
9)收到父进程的"!"之后,会再调用aofReadDiffFromParent()方法。因为在子进程发送"!"和父进程接收"!"的过程中,主进程仍然可能接收到命令。在这之后就是将文件刷盘,然后是文件重命名了。
c
//aof.c#rewriteAppendOnlyFile
/* Read the final diff if any. */
aofReadDiffFromParent();
10)子进程退出后,会调用backgroundRewriteDoneHandler方法进行后续的处理。该函数的调用位于server.c#serverCron,和rdb在同一个代码块内进行判断。
c
//server.c#serverCron
if ((pid = wait3(&statloc,WNOHANG,NULL)) != 0) {
int exitcode = WEXITSTATUS(statloc);
int bysignal = 0;
if (WIFSIGNALED(statloc)) bysignal = WTERMSIG(statloc);
if (pid == -1) {
serverLog(LL_WARNING,"wait3() returned an error: %s. "
"rdb_child_pid = %d, aof_child_pid = %d",
strerror(errno),
(int) server.rdb_child_pid,
(int) server.aof_child_pid);
} else if (pid == server.rdb_child_pid) {
backgroundSaveDoneHandler(exitcode,bysignal);
if (!bysignal && exitcode == 0) receiveChildInfo();
} else if (pid == server.aof_child_pid) {
backgroundRewriteDoneHandler(exitcode,bysignal);
if (!bysignal && exitcode == 0) receiveChildInfo();
} else {
if (!ldbRemoveChild(pid)) {
serverLog(LL_WARNING,
"Warning, detected child with unmatched pid: %ld",
(long)pid);
}
}
updateDictResizePolicy();
closeChildInfoPipe();
}
11)在backgroundRewriteDoneHandler函数中,会执行aofRewriteBufferWrite方法,将停止父进程发送命令之后,产生的命令写到文件中。因为serverCron执行是在主进程中,所以可以保证,此时不会再有命令执行,造成有新的命令没有记录到。
c
//aof.c#backgroundRewriteDoneHandler
if (aofRewriteBufferWrite(newfd) == -1) {
12)最后是一些清除工作,将该部分数据刷盘。更新重写大小,清空aof_buf。清空管道,设置aof_child_pid为-1(命令不会写到aof_rewrite_buf_blocks中)等等工作
c
//aof.c#backgroundRewriteDoneHandler
/* AOF enabled, replace the old fd with the new one. */
oldfd = server.aof_fd;
server.aof_fd = newfd;
if (server.aof_fsync == AOF_FSYNC_ALWAYS)
redis_fsync(newfd);
else if (server.aof_fsync == AOF_FSYNC_EVERYSEC)
aof_background_fsync(newfd);
server.aof_selected_db = -1; /* Make sure SELECT is re-issued */
aofUpdateCurrentSize();
server.aof_rewrite_base_size = server.aof_current_size;
server.aof_fsync_offset = server.aof_current_size;
/* Clear regular AOF buffer since its contents was just written to
* the new AOF from the background rewrite buffer. */
sdsfree(server.aof_buf);
server.aof_buf = sdsempty();
...
aofClosePipes();
aofRewriteBufferReset();
aofRemoveTempFile(server.aof_child_pid);
server.aof_child_pid = -1;
server.aof_rewrite_time_last = time(NULL)-server.aof_rewrite_time_start;
server.aof_rewrite_time_start = -1;
2.5 混合持久化
混合持久化由以下配置设置:
bash
aof-use-rdb-preamble yes
在子进程执行rewriteAppendOnlyFile方法时,会判断该配置是否开启,如果开启,则按RDB的保存方式保存当前数据快照。保存完毕后回放累计命令到文件末尾即可。
c
//aof#rewriteAppendOnlyFile
if (server.aof_use_rdb_preamble) {
int error;
if (rdbSaveRio(&aof,&error,RDB_SAVE_AOF_PREAMBLE,NULL) == C_ERR) {
errno = error;
goto werr;
}
} else {
if (rewriteAppendOnlyFileRio(&aof) == C_ERR) goto werr;
}
在加载时,首先会识别AOF文件是否以REDIS字符串开头,如果是,则按RDB格式加载,加载完RDB之后,继续按AOF格式加载剩余部分。
在main函数执行时,会调用loadDataFromDisk(),从磁盘中加载数据。
AOF开启的话,优先加载AOF。
c
//server.c#loadDataFromDisk
if (server.aof_state == AOF_ON) {
if (loadAppendOnlyFile(server.aof_filename) == C_OK)
serverLog(LL_NOTICE,"DB loaded from append only file: %.3f seconds",(float)(ustime()-start)/1000000);
}
如果AOF文件的开头是"REDIS"字符,则进行RDB加载
c
//aof.c#loadAppendOnlyFile
/* Check if this AOF file has an RDB preamble. In that case we need to
* load the RDB file and later continue loading the AOF tail. */
char sig[5]; /* "REDIS" */
if (fread(sig,1,5,fp) != 5 || memcmp(sig,"REDIS",5) != 0) {
/* No RDB preamble, seek back at 0 offset. */
if (fseek(fp,0,SEEK_SET) == -1) goto readerr;
} else {
/* RDB preamble. Pass loading the RDB functions. */
rio rdb;
serverLog(LL_NOTICE,"Reading RDB preamble from AOF file...");
if (fseek(fp,0,SEEK_SET) == -1) goto readerr;
rioInitWithFile(&rdb,fp);
if (rdbLoadRio(&rdb,NULL,1) != C_OK) {
serverLog(LL_WARNING,"Error reading the RDB preamble of the AOF file, AOF loading aborted");
goto readerr;
} else {
serverLog(LL_NOTICE,"Reading the remaining AOF tail...");
}
}
加载完RDB之后,模拟客户端,进行命令重放。
3. 问题分析
3.1 线上RDB持久化失败,导致数据没写成功
stop-writes-on-bgsave-error参数,默认是yes。
在server.c的processCommand中(调用call函数之前),会调用writeCommandsDeniedByDiskError的判断,如果上一次RDB执行失败,则会直接返回错误信息,不执行命令。
c
//server.c#processCommand
int deny_write_type = writeCommandsDeniedByDiskError();
if (deny_write_type != DISK_ERROR_TYPE_NONE &&
server.masterhost == NULL &&
(c->cmd->flags & CMD_WRITE ||
c->cmd->proc == pingCommand))
{
flagTransaction(c);
if (deny_write_type == DISK_ERROR_TYPE_RDB)
addReply(c, shared.bgsaveerr);
else
addReplySds(c,
sdscatprintf(sdsempty(),
"-MISCONF Errors writing to the AOF file: %s\r\n",
strerror(server.aof_last_write_errno)));
return C_OK;
}
在writeCommandsDeniedByDiskError方法中,会判断stop_writes_on_bgsave_err配置,以及saveparamslen配置数量,以及上一次RDB的结果是否正常。
c
//server.c#writeCommandsDeniedByDiskError
int writeCommandsDeniedByDiskError(void) {
if (server.stop_writes_on_bgsave_err &&
server.saveparamslen > 0 &&
server.lastbgsave_status == C_ERR)
{
return DISK_ERROR_TYPE_RDB;
} else if (server.aof_state != AOF_OFF &&
server.aof_last_write_status == C_ERR)
{
return DISK_ERROR_TYPE_AOF;
} else {
return DISK_ERROR_TYPE_NONE;
}
}
4. 附录
4.1 write和fwrite区别
在AOF落盘过程中,我们知道,会操作write方法,然后根据写回策略执行fsync。先了解下write和fwrite的区别。
fwrite是C语言标准库函数,内部也是通过write来实现的,在应用层级别有缓冲区。write是系统调用。
对于write方法来说,每次要写的数据是调用者要求的大小,如要求写入10字节数据,write就会写10个字节数据到内核缓冲区中,因此每次调用都涉及到用户态和内核态之间的切换。操作系统会定期把这些内核缓冲区的数据写回磁盘(如果不调用fsync方法)。
fwrite方法每次都会把数据写入一个应用进程缓冲区,等到该缓冲区满了,或者调用fflush冲洗缓冲区的方法,系统会调用write一次性把数据写入到内核缓冲区。这样可以减少write的系统调用。
对于write系统调用,操作系统内部其实会把I/O文件操作分成两种类型:
1)**缓存I/O。**大多数文件系统的默认I/O操作都是缓存I/O。对于读操作,操作系统会先检查内核缓冲区有没有需要的数据。如果已经缓存了,那就直接从缓冲区中返回,否则从磁盘读取,然后缓存在操作系统的缓冲中。对于写操作,操作系统会将数据从用户空间复制到内核空间的缓存。对于用户进程来说,写操作已经完成。至于什么时候再写到磁盘由操作系统决定,除非显式调用sync同步命令。
2)**直接I/O。**应用程序直接访问磁盘数据,而不经过内核缓冲区,从而减少了在内核缓冲区和用户程序之间的数据复制。
4.2 AOF同步阻塞资料
在redis.conf中,有描述到AOF可能存在的阻塞情况。该段描述在redis.conf的no-appendfsync-on-rewrite 上,该配置是为了解决此类可能出现的情况。
ini
# When the AOF fsync policy is set to always or everysec, and a background
# saving process (a background save or AOF log background rewriting) is
# performing a lot of I/O against the disk, in some Linux configurations
# Redis may block too long on the fsync() call. Note that there is no fix for
# this currently, as even performing fsync in a different thread will block
# our synchronous write(2) call.
5. 参考资料
1)Redis persistence demystified
2)《Redis 5设计与源码分析》
3)Redis 5.0.12 源码
4)一文讲透如何排查Redis性能问题#开启AOF kaito-kidd.com/2021/01/23/...
5)关于write()和fsync() blog.csdn.net/javashareau...
6)Linux 的进程间通信:管道 zhuanlan.zhihu.com/p/58489873
7)read/fread write/fwrite 的区别 blog.csdn.net/ljlstart/ar...
8)《趣谈Linux 30 | 文件缓存:常用文档应该放在触手可得的地方》极客时间