1、目标
本文的主要目标是探究ziplist压缩列表和quicklist快表的数据结构并进行源码分析
2、ZipList压缩列表
ziplist是一个连续内存空间的双端链表,ziplist可以存储字符串或者整数,ziplist可以从头部或者尾部进入或者弹出,时间复杂度是O(1),每次新增或者删除数据都会造成ziplist内存的重新分配因为ziplist是连续内存空间
2.1 ZipList的数据结构
c
* <zlbytes> <zltail> <zllen> <entry> <entry> ... <entry> <zlend>
ziplist包括zlbytes、zltail、zllen、多个entry、zlend,zlbytes保存了ziplist占用的字节大小,zltail记录最后一个entry的偏移量,作用是可以快速从压缩列表的尾部弹出数据,zllen记录ziplist包含的entry结点数量,zllen的最大值是65535,zlend是特殊值0xFF表示ziplist的结束
c
* <prevlen> <encoding> <entry-data>
ziplist的entry是ziplistEntry,它包括prevlen用来保存前一个entry的字节长度,encoding是编码格式,它表示这个entry是整数还是字符串,如果是整数的话encoding保存了整数,如果是字符串的话encoding记录了字符串的个数,entry-data是保存的实际数据,如果是整数的话可能没有entry-data因为整数会保存在encoding中,如果是字符串的话entry-data保存的是多个字符串
因为多个entry是连续内存,因此entry没有用指针记录前一个和后一个结点,而是用前一个entry的长度
prevlen占用1个字节或者5个字节,如果entry的长度小于254个字节那么prevlen占用1个字节,如果entry的长度大于等于254个字节那么prevlen占用5个字节并且第1个字节固定是0xFE(254),后面4个字节才是entry的实际长度
encoding占用1个字节、2个字节或者5个字节,如果encoding的开头2位是00、01、10则表示字符串,如果encoding的开头2位是11表示整数,encoding的开头2位是00表示字符串小于等于63个字节并且encoding占用1个字节,encoding的开头2位是01表示字符串大于63个字节并且小于等于16383个字节并且encoding占用2个字节,encoding的开头2位是10表示字符串大于16383个字节并且encoding占用5个字节,encoding的开头2位是11表示是整数并且encoding占用1个字节,如果encoding是1111xxxx(xxxx在0001到1101之间)则保存的整数放到xxxx,整数是xxxx减去1
举例:
c
* [0f 00 00 00] [0c 00 00 00] [02 00] [00 f3] [02 f6] [ff]
* | | | | | |
* zlbytes zltail zllen "2" "5" end
zlbytes表示这个ziplist的字节长度是15(0f),zltail表示这个ziplist最后一个entry(字符串5)的偏移量是12(0c),zllen表示这个ziplist有2个entry,[00 f3]是这个ziplist的第一个entry,00表示prevlen,即前一个entry的偏移量是0因为第一个entry没有前一个entry,f3表示encoding,即满足1111xxxx(xxxx在0001到1101之间)则保存的整数放到xxxx中,因此整数是3-1=2,[02 f6]是这个ziplist的第二个entry,02表示prevlen,即前一个entry的偏移量是2个字节,f6表示encoding,即满足1111xxxx(xxxx在0001到1101之间)则保存的整数放到xxxx中,因此整数是6-1=5,end表示ziplist的结束
c
* [02] [0b] [48 65 6c 6c 6f 20 57 6f 72 6c 64]
添加一个entry,这个entry的prevlen是02表示前一个entry的偏移量是2,encoding是0b表示字符串,其中b是11表示这个entry保存的字符串有11个,48到64分为是Hello World字符串中每个字符的ASCII值
2.2 ziplistNew方法创建一个新的ziplist
c
/* Create a new empty ziplist. */
unsigned char *ziplistNew(void) {
unsigned int bytes = ZIPLIST_HEADER_SIZE+ZIPLIST_END_SIZE;
unsigned char *zl = zmalloc(bytes);
ZIPLIST_BYTES(zl) = intrev32ifbe(bytes);
ZIPLIST_TAIL_OFFSET(zl) = intrev32ifbe(ZIPLIST_HEADER_SIZE);
ZIPLIST_LENGTH(zl) = 0;
zl[bytes-1] = ZIP_END;
return zl;
}
ziplistNew方法通过动态分配内存来创建一个ziplist并返回指针,ZIPLIST_BYTES是ziplist占用的字节大小,ZIPLIST_TAIL_OFFSET是ziplist最后一个entry的偏移量,ZIPLIST_LENGTH是ziplist包含的entry结点数量,初始化数量为0,ZIP_END值是255表示ziplist的结束
2.3 ziplistPush方法
c
unsigned char *ziplistPush(unsigned char *zl, unsigned char *s, unsigned int slen, int where) {
unsigned char *p;
p = (where == ZIPLIST_HEAD) ? ZIPLIST_ENTRY_HEAD(zl) : ZIPLIST_ENTRY_END(zl);
return __ziplistInsert(zl,p,s,slen);
}
ziplistPush方法会判断入参的where是头部和尾部,然后调用__ziplistInsert方法可以将数据插入到ziplist的头部entry或者尾部entry,即__ziplistInsert(zl,p,s,slen)是将字符串或者整数s插入到zl这个ziplist的p指针位置
c
/* Insert item at "p". */
unsigned char *__ziplistInsert(unsigned char *zl, unsigned char *p, unsigned char *s, unsigned int slen) {
size_t curlen = intrev32ifbe(ZIPLIST_BYTES(zl)), reqlen, newlen;
unsigned int prevlensize, prevlen = 0;
size_t offset;
int nextdiff = 0;
unsigned char encoding = 0;
long long value = 123456789; /* initialized to avoid warning. Using a value
that is easy to see if for some reason
we use it uninitialized. */
zlentry tail;
/* Find out prevlen for the entry that is inserted. */
if (p[0] != ZIP_END) {
ZIP_DECODE_PREVLEN(p, prevlensize, prevlen);
} else {
unsigned char *ptail = ZIPLIST_ENTRY_TAIL(zl);
if (ptail[0] != ZIP_END) {
prevlen = zipRawEntryLengthSafe(zl, curlen, ptail);
}
}
/* See if the entry can be encoded */
if (zipTryEncoding(s,slen,&value,&encoding)) {
/* 'encoding' is set to the appropriate integer encoding */
reqlen = zipIntSize(encoding);
} else {
/* 'encoding' is untouched, however zipStoreEntryEncoding will use the
* string length to figure out how to encode it. */
reqlen = slen;
}
/* We need space for both the length of the previous entry and
* the length of the payload. */
reqlen += zipStorePrevEntryLength(NULL,prevlen);
reqlen += zipStoreEntryEncoding(NULL,encoding,slen);
/* When the insert position is not equal to the tail, we need to
* make sure that the next entry can hold this entry's length in
* its prevlen field. */
int forcelarge = 0;
nextdiff = (p[0] != ZIP_END) ? zipPrevLenByteDiff(p,reqlen) : 0;
if (nextdiff == -4 && reqlen < 4) {
nextdiff = 0;
forcelarge = 1;
}
/* Store offset because a realloc may change the address of zl. */
offset = p-zl;
newlen = curlen+reqlen+nextdiff;
zl = ziplistResize(zl,newlen);
p = zl+offset;
/* Apply memory move when necessary and update tail offset. */
if (p[0] != ZIP_END) {
/* Subtract one because of the ZIP_END bytes */
memmove(p+reqlen,p-nextdiff,curlen-offset-1+nextdiff);
/* Encode this entry's raw length in the next entry. */
if (forcelarge)
zipStorePrevEntryLengthLarge(p+reqlen,reqlen);
else
zipStorePrevEntryLength(p+reqlen,reqlen);
/* Update offset for tail */
ZIPLIST_TAIL_OFFSET(zl) =
intrev32ifbe(intrev32ifbe(ZIPLIST_TAIL_OFFSET(zl))+reqlen);
/* When the tail contains more than one entry, we need to take
* "nextdiff" in account as well. Otherwise, a change in the
* size of prevlen doesn't have an effect on the *tail* offset. */
assert(zipEntrySafe(zl, newlen, p+reqlen, &tail, 1));
if (p[reqlen+tail.headersize+tail.len] != ZIP_END) {
ZIPLIST_TAIL_OFFSET(zl) =
intrev32ifbe(intrev32ifbe(ZIPLIST_TAIL_OFFSET(zl))+nextdiff);
}
} else {
/* This element will be the new tail. */
ZIPLIST_TAIL_OFFSET(zl) = intrev32ifbe(p-zl);
}
/* When nextdiff != 0, the raw length of the next entry has changed, so
* we need to cascade the update throughout the ziplist */
if (nextdiff != 0) {
offset = p-zl;
zl = __ziplistCascadeUpdate(zl,p+reqlen);
p = zl+offset;
}
/* Write the entry */
p += zipStorePrevEntryLength(p,prevlen);
p += zipStoreEntryEncoding(p,encoding,slen);
if (ZIP_IS_STR(encoding)) {
memcpy(p,s,slen);
} else {
zipSaveInteger(p,value,encoding);
}
ZIPLIST_INCR_LENGTH(zl,1);
return zl;
}
__ziplistInsert方法先调用zipTryEncoding方法尝试编码数据s,然后计算所需的内存空间大小reqlen,接着调用ziplistResize方法重新分配这个ziplist的内存空间,然后判断如果插入位置p不是尾部entry的话会将插入位置后面的entry向后移动,它的作用是为了方便插入数据,接着调用__ziplistCascadeUpdate方法连续更新后面entry的prevlen字段,这个字段是前一个entry占用的字节长度,最后插入新的entry到ziplist中,会判断entry的encoding是字符串还是整数然后插入,最后返回ziplist的指针
因此,插入或删除新的entry可能会导致连续更新问题,因为如果新增一个entry的字节长度超过254个字节的话会更新后一个entry的prevlen字段占用的字节个数从1个字节增加到5个字节,如果现在有多个entry的字节个数都是253个字节,然后新增一个超过254个字节的entry会导致后面的entry的prevlen字段都被更新,这就是连续更新问题,解决办法是ziplist存储entry较少性能就较好
2.4 __ziplistCascadeUpdate方法
c
unsigned char *__ziplistCascadeUpdate(unsigned char *zl, unsigned char *p) {
zlentry cur;
size_t prevlen, prevlensize, prevoffset; /* Informat of the last changed entry. */
size_t firstentrylen; /* Used to handle insert at head. */
size_t rawlen, curlen = intrev32ifbe(ZIPLIST_BYTES(zl));
size_t extra = 0, cnt = 0, offset;
size_t delta = 4; /* Extra bytes needed to update a entry's prevlen (5-1). */
unsigned char *tail = zl + intrev32ifbe(ZIPLIST_TAIL_OFFSET(zl));
/* Empty ziplist */
if (p[0] == ZIP_END) return zl;
zipEntry(p, &cur); /* no need for "safe" variant since the input pointer was validated by the function that returned it. */
firstentrylen = prevlen = cur.headersize + cur.len;
prevlensize = zipStorePrevEntryLength(NULL, prevlen);
prevoffset = p - zl;
p += prevlen;
/* Iterate ziplist to find out how many extra bytes do we need to update it. */
while (p[0] != ZIP_END) {
assert(zipEntrySafe(zl, curlen, p, &cur, 0));
/* Abort when "prevlen" has not changed. */
if (cur.prevrawlen == prevlen) break;
/* Abort when entry's "prevlensize" is big enough. */
if (cur.prevrawlensize >= prevlensize) {
if (cur.prevrawlensize == prevlensize) {
zipStorePrevEntryLength(p, prevlen);
} else {
/* This would result in shrinking, which we want to avoid.
* So, set "prevlen" in the available bytes. */
zipStorePrevEntryLengthLarge(p, prevlen);
}
break;
}
/* cur.prevrawlen means cur is the former head entry. */
assert(cur.prevrawlen == 0 || cur.prevrawlen + delta == prevlen);
/* Update prev entry's info and advance the cursor. */
rawlen = cur.headersize + cur.len;
prevlen = rawlen + delta;
prevlensize = zipStorePrevEntryLength(NULL, prevlen);
prevoffset = p - zl;
p += rawlen;
extra += delta;
cnt++;
}
/* Extra bytes is zero all update has been done(or no need to update). */
if (extra == 0) return zl;
/* Update tail offset after loop. */
if (tail == zl + prevoffset) {
/* When the last entry we need to update is also the tail, update tail offset
* unless this is the only entry that was updated (so the tail offset didn't change). */
if (extra - delta != 0) {
ZIPLIST_TAIL_OFFSET(zl) =
intrev32ifbe(intrev32ifbe(ZIPLIST_TAIL_OFFSET(zl))+extra-delta);
}
} else {
/* Update the tail offset in cases where the last entry we updated is not the tail. */
ZIPLIST_TAIL_OFFSET(zl) =
intrev32ifbe(intrev32ifbe(ZIPLIST_TAIL_OFFSET(zl))+extra);
}
/* Now "p" points at the first unchanged byte in original ziplist,
* move data after that to new ziplist. */
offset = p - zl;
zl = ziplistResize(zl, curlen + extra);
p = zl + offset;
memmove(p + extra, p, curlen - offset - 1);
p += extra;
/* Iterate all entries that need to be updated tail to head. */
while (cnt) {
zipEntry(zl + prevoffset, &cur); /* no need for "safe" variant since we already iterated on all these entries above. */
rawlen = cur.headersize + cur.len;
/* Move entry to tail and reset prevlen. */
memmove(p - (rawlen - cur.prevrawlensize),
zl + prevoffset + cur.prevrawlensize,
rawlen - cur.prevrawlensize);
p -= (rawlen + delta);
if (cur.prevrawlen == 0) {
/* "cur" is the previous head entry, update its prevlen with firstentrylen. */
zipStorePrevEntryLength(p, firstentrylen);
} else {
/* An entry's prevlen can only increment 4 bytes. */
zipStorePrevEntryLength(p, cur.prevrawlen+delta);
}
/* Forward to previous entry. */
prevoffset -= cur.prevrawlen;
cnt--;
}
return zl;
}
__ziplistCascadeUpdate方法是级联更新ziplist的多个entry的prevlen字段,即前一个entry的字节长度,它会从当前entry开始向后遍历ziplist的多个entry,循环更新entry的prevlen字段,如果entry的prevlen字段等于前一个entry的字节长度就停止循环更新,最后调用zipStorePrevEntryLength方法从后向前逆序更新entry的prevlen字段
为什么要逆序更新entry的prevlen字段?因为更新entry的prevlen字段会改变entry的内存地址,如果从前向后的话会导致后面的entry找不到前一个entry结点
3、quicklist快速链表(快表)
3.1 quicklist快表的结构
c
typedef struct quicklist {
quicklistNode *head;
quicklistNode *tail;
unsigned long count; /* total count of all entries in all listpacks */
unsigned long len; /* number of quicklistNodes */
signed int fill : QL_FILL_BITS; /* fill factor for individual nodes */
unsigned int compress : QL_COMP_BITS; /* depth of end nodes not to compress;0=off */
unsigned int bookmark_count: QL_BM_BITS;
quicklistBookmark bookmarks[];
} quicklist;
quicklist是一个双向链表,它包含头结点head,尾结点tail,count表示listpack(Redis7)或者ziplist(Redis6)中的entry个数,len表示quicklistNode(listpack或者ziplist)的数量
quicklist快表用来解决ziplist存储entry个数少的问题,quicklist快表中的每个结点都是quicklistNode,quicklistNode包含了listpack或者ziplist,因此quicklist快表结合了listpack或者ziplist存储连续内存的优点和双向链表存储结点多的优点
c
typedef struct quicklistNode {
struct quicklistNode *prev;
struct quicklistNode *next;
unsigned char *entry;
size_t sz; /* entry size in bytes */
unsigned int count : 16; /* count of items in listpack */
unsigned int encoding : 2; /* RAW==1 or LZF==2 */
unsigned int container : 2; /* PLAIN==1 or PACKED==2 */
unsigned int recompress : 1; /* was this node previous compressed? */
unsigned int attempted_compress : 1; /* node can't compress; too small */
unsigned int dont_compress : 1; /* prevent compression of entry that will be used later */
unsigned int extra : 9; /* more bits to steal for future usage */
} quicklistNode;
quicklistNode是quicklist快表的一个结点,它包含了prev表示前一个结点,next表示下一个结点从而构成双向链表,entry是listpack(Redis7)或者ziplist(Redis6)的指针,count表示listpack或者ziplist中entry的个数
3.2 quicklistCreate方法
c
/* Create a new quicklist.
* Free with quicklistRelease(). */
quicklist *quicklistCreate(void) {
struct quicklist *quicklist;
quicklist = zmalloc(sizeof(*quicklist));
quicklist->head = quicklist->tail = NULL;
quicklist->len = 0;
quicklist->count = 0;
quicklist->compress = 0;
quicklist->fill = -2;
quicklist->bookmark_count = 0;
return quicklist;
}
quicklistCreate方法会初始化quicklist快表的每个变量,头结点和尾结点都是null,quicklistNode的长度len为0,listpack或者ziplist中entry的个数count为0