Redis源码初探核心数据结构
本文主要根据redis源码来了解实操过程中常用的redis的数据类型和在使用数据类型过程中redis底层的各种处理(数据结构变化、临界点等),本文源码对应的redis版本为5.0.3。
一、redis object
1.redis object的定义
redis对象定义在server.h里,具体源码如下:
/* Header carried by every Redis value: a type tag, an encoding tag,
 * LRU/LFU bookkeeping bits, a reference count and a pointer to the payload. */
typedef struct redisObject {
/**
* Object type (4 bits). The five types discussed in this article:
* #define OBJ_STRING 0 //String object.
* #define OBJ_LIST 1 //List object.
* #define OBJ_SET 2 //Set object.
* #define OBJ_ZSET 3 //Sorted set object.
* #define OBJ_HASH 4 //Hash object.
*/
unsigned type:4;
/*
* Encoding (4 bits), one of the following 11 values:
* #define OBJ_ENCODING_RAW 0 // Raw representation
* #define OBJ_ENCODING_INT 1 // Encoded as integer
* #define OBJ_ENCODING_HT 2 // Encoded as hash table
* #define OBJ_ENCODING_ZIPMAP 3 // Encoded as zipmap
* #define OBJ_ENCODING_LINKEDLIST 4 // No longer used: old list encoding.
* #define OBJ_ENCODING_ZIPLIST 5 // Encoded as ziplist
* #define OBJ_ENCODING_INTSET 6 // Encoded as intset
* #define OBJ_ENCODING_SKIPLIST 7 // Encoded as skiplist
* #define OBJ_ENCODING_EMBSTR 8 // Embedded sds string encoding
* #define OBJ_ENCODING_QUICKLIST 9 // Encoded as linked list of ziplists
* #define OBJ_ENCODING_STREAM 10 // Encoded as a radix tree of listpacks
*/
unsigned encoding:4;
/*
* #define LRU_BITS 24 -- eviction bookkeeping field (24 bits)
*/
unsigned lru:LRU_BITS; /* LRU time (relative to global lru_clock) or
* LFU data (least significant 8 bits frequency
* and most significant 16 bits access time). */
/*
* Reference count, a signed int; #define OBJ_SHARED_REFCOUNT INT_MAX
*/
int refcount;
/*
* Pointer to the actual data.
*/
void *ptr;
} robj;
由源码可知redis对象类型包括String、list、set、zset、hash五种,该数据结构存储了对象类型、编码、LRU替换算法、引用计数、数据指针。
对象的ptr指针指向对象的底层实现数据结构,而这些数据结构由对象的encoding属性决定。
针对不同类型的对象,其中每种类型对应的encoding将在后面按章分析。
2.redis object的创建
在object.c里面,列举的redis object创建的方法很多,针对不同类型的对象有不同的创建逻辑,这里只列举一个主要的函数:
/* Allocate a new robj of the given type that wraps ptr.
 * The object starts with encoding OBJ_ENCODING_RAW and a reference count
 * of 1; its lru field is seeded from the LFU or LRU clock depending on
 * server.maxmemory_policy. Returns the new object. */
robj *createObject(int type, void *ptr) {
robj *o = zmalloc(sizeof(*o)); // allocate the object header
o->type = type; // object type
o->encoding = OBJ_ENCODING_RAW; // default encoding
o->ptr = ptr; // payload pointer
o->refcount = 1; // initial reference count
/* Set the LRU to the current lruclock (minutes resolution), or
* alternatively the LFU counter. */
if (server.maxmemory_policy & MAXMEMORY_FLAG_LFU) {
o->lru = (LFUGetTimeInMinutes()<<8) | LFU_INIT_VAL;
} else {
o->lru = LRU_CLOCK();
}
return o;
}
针对redis不同类型的对象通过命令创建的逻辑根据类型分章分析。
二、redis字符串
redis字符串对象是redisObject结构体中,type为0(OBJ_STRING)的对象。
1.Redis字符串简述
Redis没有直接使用c语言传统的字符串表示(以空字符结尾的字符数组),而是自己构建了一种名为简单动态字符串(simple dynamic String,SDS)的抽象类型,并将SDS用作Redis的默认字符串表示。
在redis里面,C字符串只会作为字符串常量用在一些无须对字符串值进行修改的地方。
2.SDS(simple dynamic String)简单动态字符串
2.1 redis sds代码解析
从redis3.2开始,sds就有了5种类型,5种类型分别存放不同大小的字符串:
/* SDS header variants. Each sdshdrN (N = 8/16/32/64) stores len and alloc
 * in N-bit unsigned integers, followed by a flags byte and the string bytes
 * in the flexible array member buf. The structs are declared packed. */
/* Note: sdshdr5 is never used, we just access the flags byte directly.
* However is here to document the layout of type 5 SDS strings. */
struct __attribute__ ((__packed__)) sdshdr5 {
unsigned char flags; /* 3 lsb of type, and 5 msb of string length */
char buf[];
};
struct __attribute__ ((__packed__)) sdshdr8 {
uint8_t len; /* used */
uint8_t alloc; /* excluding the header and null terminator */
unsigned char flags; /* 3 lsb of type, 5 unused bits */
char buf[];
};
struct __attribute__ ((__packed__)) sdshdr16 {
uint16_t len; /* used */
uint16_t alloc; /* excluding the header and null terminator */
unsigned char flags; /* 3 lsb of type, 5 unused bits */
char buf[];
};
struct __attribute__ ((__packed__)) sdshdr32 {
uint32_t len; /* used */
uint32_t alloc; /* excluding the header and null terminator */
unsigned char flags; /* 3 lsb of type, 5 unused bits */
char buf[];
};
struct __attribute__ ((__packed__)) sdshdr64 {
uint64_t len; /* used */
uint64_t alloc; /* excluding the header and null terminator */
unsigned char flags; /* 3 lsb of type, 5 unused bits */
char buf[];
};
在redis源码中,有一个创建字符串的函数,具体代码如下:
/* Create a string object from len bytes at ptr, picking the constructor by
 * length: inputs of at most OBJ_ENCODING_EMBSTR_SIZE_LIMIT bytes go to
 * createEmbeddedStringObject, longer inputs go to createRawStringObject. */
robj *createStringObject(const char *ptr, size_t len) {
// #define OBJ_ENCODING_EMBSTR_SIZE_LIMIT 44
if (len <= OBJ_ENCODING_EMBSTR_SIZE_LIMIT)
return createEmbeddedStringObject(ptr,len);
else
return createRawStringObject(ptr,len);
}
从源码中我们可以知道,redis在创建字符串的时候,会根据长度来进行不同方式的创建。
当字符串长度小于等于44时,redis会创建一种encoding为OBJ_ENCODING_EMBSTR的对象,而当字符串长度大于44,则会创建一种encoding为OBJ_ENCODING_RAW的对象。故在redis客户端可以看到如下输出:
127.0.0.1:6379> set a3 aaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaa
OK
127.0.0.1:6379> object encoding a3
"raw"
127.0.0.1:6379> set a4 aaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaa
OK
127.0.0.1:6379> object encoding a4
"embstr"
继续跟踪创建字符串的源码,其中函数createEmbeddedStringObject的源码如下:
/* Create an OBJ_ENCODING_EMBSTR string object of length len.
 * The robj, an sdshdr8 header and the string bytes are placed in a single
 * zmalloc'ed chunk. ptr selects the initial contents:
 *   - SDS_NOINIT: only the terminating '\0' is written;
 *   - non-NULL:   len bytes are copied from ptr and terminated;
 *   - NULL:       the buffer (len+1 bytes) is zero-filled.
 * Returns the new object with refcount 1. */
robj *createEmbeddedStringObject(const char *ptr, size_t len) {
robj *o = zmalloc(sizeof(robj)+sizeof(struct sdshdr8)+len+1);// one allocation: robj + sdshdr8 + bytes + '\0'
struct sdshdr8 *sh = (void*)(o+1); // the sdshdr8 header starts right after the robj, in the same chunk
o->type = OBJ_STRING;
o->encoding = OBJ_ENCODING_EMBSTR;
o->ptr = sh+1; /* address just past the sdshdr8 header */
o->refcount = 1;
if (server.maxmemory_policy & MAXMEMORY_FLAG_LFU) {
o->lru = (LFUGetTimeInMinutes()<<8) | LFU_INIT_VAL;
} else {
o->lru = LRU_CLOCK();
}
sh->len = len;
sh->alloc = len;
sh->flags = SDS_TYPE_8;
if (ptr == SDS_NOINIT)
sh->buf[len] = '\0';
else if (ptr) {
memcpy(sh->buf,ptr,len);
sh->buf[len] = '\0';
} else {
memset(sh->buf,0,len+1);
}
return o;
}
从函数createEmbeddedStringObject的源码可以看出:
1.embstr底层用的sds数据结构为sdshdr8
2.embstr编码将创建字符串对象的内存分配次数为1次
3.embstr编码的字符串对象的所有数据都保存在一块连续的内存里面
创建raw编码的sds的函数createRawStringObject的源码如下:
/* Create a string object via createObject(OBJ_STRING, ...) whose payload is
 * a separately built sds (sdsnewlen) holding len bytes from ptr. */
robj *createRawStringObject(const char *ptr, size_t len) {
return createObject(OBJ_STRING, sdsnewlen(ptr,len));
}
其中,创建字符串最终由sdsnewlen函数创建,sdsnewlen函数如下:
/* Create a new sds string of length initlen.
 *
 * init selects the initial contents:
 *   - SDS_NOINIT: the buffer is left uninitialized (only the terminator
 *                 is written);
 *   - NULL:       the whole allocation is zero-filled;
 *   - otherwise:  initlen bytes are copied from init.
 *
 * The header type is chosen from initlen via sdsReqType(); an empty string
 * is promoted from type 5 to type 8. The string is always terminated with
 * '\0', which is not counted in len.
 *
 * Returns the sds (pointer to the byte buffer just after the header), or
 * NULL if the allocation fails.
 *
 * NOTE: this differs from the quoted 5.0.3 text in one respect: the NULL
 * check on the allocation now comes before the memset, so a failed
 * allocation with init == NULL no longer writes through a NULL pointer. */
sds sdsnewlen(const void *init, size_t initlen) {
    void *sh;
    sds s;
    char type = sdsReqType(initlen);
    /* Empty strings are usually created in order to append. Use type 8
     * since type 5 is not good at this. */
    if (type == SDS_TYPE_5 && initlen == 0) type = SDS_TYPE_8;
    int hdrlen = sdsHdrSize(type);
    unsigned char *fp; /* flags pointer. */

    sh = s_malloc(hdrlen+initlen+1);
    /* Bail out before touching the buffer if the allocation failed. */
    if (sh == NULL) return NULL;
    if (init==SDS_NOINIT)
        init = NULL;
    else if (!init)
        memset(sh, 0, hdrlen+initlen+1);
    s = (char*)sh+hdrlen;
    fp = ((unsigned char*)s)-1; /* flags byte sits right before the buffer */
    switch(type) {
        case SDS_TYPE_5: {
            *fp = type | (initlen << SDS_TYPE_BITS);
            break;
        }
        case SDS_TYPE_8: {
            SDS_HDR_VAR(8,s);
            sh->len = initlen;
            sh->alloc = initlen;
            *fp = type;
            break;
        }
        case SDS_TYPE_16: {
            SDS_HDR_VAR(16,s);
            sh->len = initlen;
            sh->alloc = initlen;
            *fp = type;
            break;
        }
        case SDS_TYPE_32: {
            SDS_HDR_VAR(32,s);
            sh->len = initlen;
            sh->alloc = initlen;
            *fp = type;
            break;
        }
        case SDS_TYPE_64: {
            SDS_HDR_VAR(64,s);
            sh->len = initlen;
            sh->alloc = initlen;
            *fp = type;
            break;
        }
    }
    if (initlen && init)
        memcpy(s, init, initlen);
    s[initlen] = '\0';
    return s;
}
从上面sdsnewlen的源码中可知:
1.执行sdsnewlen函数的时候,会分配一次内存
2.执行sdsnewlen函数的时候,会根据initlen选择不同结构的sds
3.执行sdsnewlen函数的时候,字符串末尾会加上‘\0’,SDS遵循了C字符串以空字符结尾的惯例,保存空字符的1字节空间不计算在SDS的len属性里面。
在函数createRawStringObject调用createObject函数和sdsnewlen函数时,都进行了内存分配,故raw相对于embstr来说,会进行两次内存分配,且raw编码的对象大多数情况下不是保存在一块连续的内存里面,当redis内存释放的时候,embstr对象只需要调用一次内存释放函数,而raw编码的对象需要调用两次内存释放函数。
当我们在客户端执行set命令时,会执行如下函数:
/* SET key value [NX] [XX] [EX <seconds>] [PX <milliseconds>] */
/* Parses the optional arguments (argv[3] onward) into flags, an expire
 * object and its unit, then hands over to setGenericCommand. Any argument
 * that matches none of the branches below gets a shared.syntaxerr reply
 * and the command returns without setting anything. */
void setCommand(client *c) {
int j;
robj *expire = NULL;
int unit = UNIT_SECONDS;
int flags = OBJ_SET_NO_FLAGS;
for (j = 3; j < c->argc; j++) {
char *a = c->argv[j]->ptr;
/* Argument following the current one, or NULL if this is the last. */
robj *next = (j == c->argc-1) ? NULL : c->argv[j+1];
/* NX: accepted only if XX has not been seen. */
if ((a[0] == 'n' || a[0] == 'N') &&
(a[1] == 'x' || a[1] == 'X') && a[2] == '\0' &&
!(flags & OBJ_SET_XX))
{
flags |= OBJ_SET_NX;
/* XX: accepted only if NX has not been seen. */
} else if ((a[0] == 'x' || a[0] == 'X') &&
(a[1] == 'x' || a[1] == 'X') && a[2] == '\0' &&
!(flags & OBJ_SET_NX))
{
flags |= OBJ_SET_XX;
/* EX <seconds>: needs a following argument and no earlier PX. */
} else if ((a[0] == 'e' || a[0] == 'E') &&
(a[1] == 'x' || a[1] == 'X') && a[2] == '\0' &&
!(flags & OBJ_SET_PX) && next)
{
flags |= OBJ_SET_EX;
unit = UNIT_SECONDS;
expire = next;
j++; /* skip the value just consumed */
/* PX <milliseconds>: needs a following argument and no earlier EX. */
} else if ((a[0] == 'p' || a[0] == 'P') &&
(a[1] == 'x' || a[1] == 'X') && a[2] == '\0' &&
!(flags & OBJ_SET_EX) && next)
{
flags |= OBJ_SET_PX;
unit = UNIT_MILLISECONDS;
expire = next;
j++; /* skip the value just consumed */
} else {
addReply(c,shared.syntaxerr);
return;
}
}
/* Replace the value argument with whatever tryObjectEncoding returns. */
c->argv[2] = tryObjectEncoding(c->argv[2]);
setGenericCommand(c,flags,c->argv[1],c->argv[2],expire,unit,NULL,NULL);
}
继续跟踪tryObjectEncoding函数,我们会发现在tryObjectEncoding函数里面有如下代码:
len = sdslen(s);
if (len <= 20 && string2l(s,len,&value)) {
/* This object is encodable as a long. Try to use a shared object.
* Note that we avoid using shared integers when maxmemory is used
* because every object needs to have a private LRU field for the LRU
* algorithm to work well. */
if ((server.maxmemory == 0 ||
!(server.maxmemory_policy & MAXMEMORY_FLAG_NO_SHARED_INTEGERS)) &&
value >= 0 &&
value < OBJ_SHARED_INTEGERS)
{
decrRefCount(o);
incrRefCount(shared.integers[value]);
return shared.integers[value];
} else {
if (o->encoding == OBJ_ENCODING_RAW) sdsfree(o->ptr);
o->encoding = OBJ_ENCODING_INT;
o->ptr = (void*) value;
return o;
}
}
我们可以发现set字符串的时候,如果是满足条件的整型,那么encoding还可以为OBJ_ENCODING_INT,所以针对Redis的字符串,可以有以下三种编码:
1.OBJ_ENCODING_INT
2.OBJ_ENCODING_EMBSTR
3.OBJ_ENCODING_RAW
2.2 sds和c语言字符串的区别
1.c字符串获取字符串长度的复杂度为O(N),sds获取字符串长度的复杂度为O(1)
2.C字符串可能会造成缓冲区溢出,SDS杜绝缓冲区溢出
3.SDS减少修改字符串时带来的内存重分配次数
sds通过未使用空间,实现了空间预分配和惰性空间释放两种优化策略。
4.c字符串只能保存文本数据,SDS可以保存文本或者二进制数据
5.c字符串可以使用所有<string.h>库中的函数,SDS可以使用部分<string.h>库中的函数。
三、列表对象
列表对象是redisObject结构体中,type为1(OBJ_LIST)的对象。
1.lpush/rpush源码解析
跟踪列表push(lpush/rpush)命令,我们可以看到如下源码:
/* LPUSH: delegate to pushGenericCommand with LIST_HEAD. */
void lpushCommand(client *c) {
pushGenericCommand(c,LIST_HEAD);
}
/* RPUSH: delegate to pushGenericCommand with LIST_TAIL. */
void rpushCommand(client *c) {
pushGenericCommand(c,LIST_TAIL);
}
两个指令最终都走的pushGenericCommand函数,其中pushGenericCommand函数源码如下:
/* Shared implementation of LPUSH/RPUSH.
 * Pushes argv[2..argc-1] onto the list stored at argv[1], at the end given
 * by where (LIST_HEAD or LIST_TAIL). Replies with shared.wrongtypeerr if
 * the key holds a non-list value; otherwise replies with the list length
 * after the pushes, signals the key as modified, emits an "lpush"/"rpush"
 * keyspace event and adds the number of pushed values to server.dirty. */
void pushGenericCommand(client *c, int where) {
int j, pushed = 0;
// Look the key up for writing; the result is NULL when the key does not exist
robj *lobj = lookupKeyWrite(c->db,c->argv[1]);
if (lobj && lobj->type != OBJ_LIST) {
addReply(c,shared.wrongtypeerr);
return;
}
for (j = 2; j < c->argc; j++) {
if (!lobj) {
// Key does not exist yet: create a quicklist object and add it to the db
lobj = createQuicklistObject();
quicklistSetOptions(lobj->ptr, server.list_max_ziplist_size,
server.list_compress_depth);
dbAdd(c->db,c->argv[1],lobj);
}
listTypePush(lobj,c->argv[j],where);
pushed++;
}
addReplyLongLong(c, (lobj ? listTypeLength(lobj) : 0));
if (pushed) {
char *event = (where == LIST_HEAD) ? "lpush" : "rpush";
signalModifiedKey(c->db,c->argv[1]);
notifyKeyspaceEvent(NOTIFY_LIST,event,c->argv[1],c->db->id);
}
server.dirty += pushed;
}
从源码中可以知道,redis列表对象底层数据结构为quicklist。注:《redis设计与实现》中描述的ziplist和linkedlist在redis3.2版本开始,由quicklist替代。
quicklist结构的源码如下:
/* List container: holds the head and tail quicklistNode pointers plus
 * aggregate counters and the per-list fill/compress settings. */
typedef struct quicklist {
quicklistNode *head;
quicklistNode *tail;
unsigned long count; /* total count of all entries in all ziplists */
unsigned long len; /* number of quicklistNodes */
int fill : 16; /* fill factor for individual nodes */
unsigned int compress : 16; /* depth of end nodes not to compress;0=off */
} quicklist;
/* One node of a quicklist: doubly linked via prev/next, with its data
 * reachable through zl. */
typedef struct quicklistNode {
struct quicklistNode *prev; // previous node
struct quicklistNode *next; // next node
// Data pointer: a ziplist when not compressed, a quicklistLZF when compressed
unsigned char *zl;
unsigned int sz; /* ziplist size in bytes */
unsigned int count : 16; /* count of items in ziplist */
unsigned int encoding : 2; /* RAW==1 or LZF==2 */
unsigned int container : 2; /* NONE==1 or ZIPLIST==2 */
unsigned int recompress : 1; /* was this node previous compressed? */
unsigned int attempted_compress : 1; /* node can't compress; too small */
unsigned int extra : 10; /* more bits to steal for future usage */
} quicklistNode;
在pushGenericCommand源码中,函数listTypePush中调用了quicklistPush函数,根据quicklistPush的int where执行quicklistPushHead函数或quicklistPushTail函数,此处选择跟quicklistPushHead函数源码:
/* Push value (sz bytes) at the head of the quicklist.
 * If _quicklistNodeAllowInsert accepts the current head node, the value is
 * pushed into that node's ziplist; otherwise a new node with a fresh
 * ziplist is created and linked in before the old head.
 * Returns non-zero when the head node changed (a new node was created),
 * 0 when the existing head was reused. */
int quicklistPushHead(quicklist *quicklist, void *value, size_t sz) {
quicklistNode *orig_head = quicklist->head;
if (likely(
_quicklistNodeAllowInsert(quicklist->head, quicklist->fill, sz))) {
quicklist->head->zl =
ziplistPush(quicklist->head->zl, value, sz, ZIPLIST_HEAD);
quicklistNodeUpdateSz(quicklist->head);
} else {
quicklistNode *node = quicklistCreateNode();
node->zl = ziplistPush(ziplistNew(), value, sz, ZIPLIST_HEAD);
quicklistNodeUpdateSz(node);
_quicklistInsertNodeBefore(quicklist, quicklist->head, node);
}
/* Both the list-wide and the head node's entry counters grow by one. */
quicklist->count++;
quicklist->head->count++;
return (orig_head != quicklist->head);
}
根据源码,我们可以知道quicklistNode的*zl指向该节点保存数据的ziplist(未压缩时为ziplist结构,压缩后为quicklistLZF结构),由此可以看出quicklist的数据结构实际上为ziplist和linkedlist的混合,每一个节点quicklistNode使用ziplist来进行数据的存储(压缩/未压缩)。
四、hash对象
hash对象是redisObject结构体中,type为4(OBJ_HASH)的对象。
1.hset源码解析
hash对象源码从hset命令出发进行跟踪,在t_hash.c文件中有如下源码:
/* HSET / HMSET key field value [field value ...]
 * Rejects an odd argc with an error reply, looks up or creates the hash,
 * lets hashTypeTryConversion inspect the arguments, then stores every
 * field/value pair. The reply depends on the command name: an integer
 * count for HSET, shared.ok for HMSET. */
void hsetCommand(client *c) {
int i, created = 0;
robj *o;
if ((c->argc % 2) == 1) {
addReplyError(c,"wrong number of arguments for HMSET");
return;
}
if ((o = hashTypeLookupWriteOrCreate(c,c->argv[1])) == NULL) return;
hashTypeTryConversion(o,c->argv,2,c->argc-1);
/* NOTE(review): counts pairs for which hashTypeSet returns 0 --
 * presumably newly created fields; confirm against hashTypeSet. */
for (i = 2; i < c->argc; i += 2)
created += !hashTypeSet(o,c->argv[i]->ptr,c->argv[i+1]->ptr,HASH_SET_COPY);
/* HMSET (deprecated) and HSET return value is different. */
char *cmdname = c->argv[0]->ptr;
/* The second character of the command name tells HSET from HMSET. */
if (cmdname[1] == 's' || cmdname[1] == 'S') {
/* HSET */
addReplyLongLong(c, created);
} else {
/* HMSET */
addReply(c, shared.ok);
}
signalModifiedKey(c->db,c->argv[1]);
notifyKeyspaceEvent(NOTIFY_HASH,"hset",c->argv[1],c->db->id);
server.dirty++;
}
其中,hashTypeLookupWriteOrCreate函数会对键是否存在进行判断,如果不存在就先创建key的对象,源码如下:
/* Look up key for writing. If it does not exist, create a new hash object
 * and add it to the db. If it exists but is not a hash, reply with
 * shared.wrongtypeerr and return NULL. Otherwise return the hash object. */
robj *hashTypeLookupWriteOrCreate(client *c, robj *key) {
robj *o = lookupKeyWrite(c->db,key);
if (o == NULL) {
o = createHashObject();
dbAdd(c->db,key,o);
} else {
if (o->type != OBJ_HASH) {
addReply(c,shared.wrongtypeerr);
return NULL;
}
}
return o;
}
hashTypeLookupWriteOrCreate源码中的createHashObject的源码为:
/* Create an empty hash object backed by a new ziplist, with encoding
 * OBJ_ENCODING_ZIPLIST. */
robj *createHashObject(void) {
unsigned char *zl = ziplistNew();
robj *o = createObject(OBJ_HASH, zl);
o->encoding = OBJ_ENCODING_ZIPLIST;
return o;
}
根据这段源码可知,hset在执行的时候,会先把key创建为一种编码为OBJ_ENCODING_ZIPLIST的对象。
在执行完hashTypeLookupWriteOrCreate函数后,如果返回值不为NULL(键不存在时已新建,或键存在且类型为hash),则会继续执行hashTypeTryConversion,其源码如下:
/* Scan argv[start..end] (inclusive) and convert the hash o from ziplist to
 * hash table encoding as soon as one sds-encoded argument is longer than
 * server.hash_max_ziplist_value. Does nothing if o is not ziplist-encoded. */
void hashTypeTryConversion(robj *o, robj **argv, int start, int end) {
int i;
if (o->encoding != OBJ_ENCODING_ZIPLIST) return;
for (i = start; i <= end; i++) {
if (sdsEncodedObject(argv[i]) &&
sdslen(argv[i]->ptr) > server.hash_max_ziplist_value)
{ // hash_max_ziplist_value defaults to 64
// convert ziplist -> hashtable
hashTypeConvert(o, OBJ_ENCODING_HT);
break;
}
}
}
从上述源码可以看出,当向hash对象里面添加的某个键或值长度超过64字节(hash_max_ziplist_value的默认值)时,hash对象编码会由ziplist转为hashtable。
在执行完上述代码后,会根据hset后需要插入的键值对进行遍历存数据,执行hashTypeSet函数,hashTypeSet函数的源码中有一句:
/* Check if the ziplist needs to be converted to a hash table */
if (hashTypeLength(o) > server.hash_max_ziplist_entries)
//hash_max_ziplist_entries默认值为512
hashTypeConvert(o, OBJ_ENCODING_HT);
由此可以看出,当hash对象保存的键值对数量大于512时,hash对象编码也会由ziplist转为hashtable。
2.hash总结
2.1 hash对象在redis中编码有OBJ_ENCODING_ZIPLIST和OBJ_ENCODING_HT两种,当hash对象同时满足下面两个条件时,其编码为OBJ_ENCODING_ZIPLIST,否则就会进行编码转换为OBJ_ENCODING_HT,这两个条件为:
(1)hash对象中保存的所有键值对的键和值的字符串长度都小于等于64字节(超过64才会触发转换)
(2)hash对象保存的键值对数量小于等于512(超过512才会触发转换)。
2.2 新创建的hash对象(createHashObject)默认使用压缩列表(ziplist)来保存键值对,即hset时键和值最初都保存在压缩列表中
五、集合对象
集合对象是redisObject结构体中,type为2(OBJ_SET)的对象。
1.sadd源码解析
通过sadd源码来对set对象进行分析,在t_set.c中,sadd源码如下:
/* SADD key member [member ...]
 * Adds argv[2..argc-1] to the set at argv[1], creating the set when the
 * key is missing. Replies with shared.wrongtypeerr if the key holds a
 * non-set value, otherwise with the number of members actually added. */
void saddCommand(client *c) {
robj *set;
int j, added = 0;
// look up the set stored at key in the db
set = lookupKeyWrite(c->db,c->argv[1]);
if (set == NULL) {
// not found: create it; the first member decides the initial encoding
set = setTypeCreate(c->argv[2]->ptr);
dbAdd(c->db,c->argv[1],set);
} else {
if (set->type != OBJ_SET) {
addReply(c,shared.wrongtypeerr);
return;
}
}
for (j = 2; j < c->argc; j++) {
if (setTypeAdd(set,c->argv[j]->ptr)) added++;
}
if (added) {
signalModifiedKey(c->db,c->argv[1]);
notifyKeyspaceEvent(NOTIFY_SET,"sadd",c->argv[1],c->db->id);
}
server.dirty += added;
addReplyLongLong(c,added);
}
源码中,当当前键在db中不存在,则会调用setTypeCreate函数进行创建,setTypeCreate的源码如下:
/* Create an empty set whose encoding is chosen from value: an intset
 * object if value is representable as a long long, a regular set object
 * otherwise. value itself is not added. */
robj *setTypeCreate(sds value) {
if (isSdsRepresentableAsLongLong(value,NULL) == C_OK)
return createIntsetObject();
return createSetObject();
}
由此,我们可以看到set创建有两种编码,整数集合的编码为OBJ_ENCODING_INTSET。其他的集合编码为OBJ_ENCODING_HT。
createIntsetObject函数的源码为:
/* Create an empty set object backed by a new intset, with encoding
 * OBJ_ENCODING_INTSET. */
robj *createIntsetObject(void) {
intset *is = intsetNew();
robj *o = createObject(OBJ_SET,is);
o->encoding = OBJ_ENCODING_INTSET;
return o;
}
createIntsetObject将创建如下结构的整数集合:
/* Integer set: an encoding tag, an element count and the element bytes in
 * the flexible array member contents.
 * NOTE(review): contents is declared int8_t[]; presumably the real element
 * width is selected by encoding -- confirm against intset.c. */
typedef struct intset {
uint32_t encoding;
uint32_t length;
int8_t contents[];
} intset;
createSetObject的源码如下:
/* Create an empty set object backed by a new dict (setDictType), with
 * encoding OBJ_ENCODING_HT. */
robj *createSetObject(void) {
dict *d = dictCreate(&setDictType,NULL);
robj *o = createObject(OBJ_SET,d);
o->encoding = OBJ_ENCODING_HT;
return o;
}
在创建set(setTypeCreate)和setTypeAdd的时候,都会通过isSdsRepresentableAsLongLong调用string2ll对传入的值进行转换;执行sadd命令且键不存在时,如果第一个待添加的值能转换为long long类型的整数值,就会先创建一个intset编码的集合,否则创建hashtable编码的集合。
当创建完robj *set后,会对set的值进行setTypeAdd,其源码如下:
/* Add value to the set subject.
 * Returns 1 if the value was added, 0 otherwise.
 * An intset-encoded set is converted to OBJ_ENCODING_HT when value is not
 * representable as a long long, or when the intset grows beyond
 * server.set_max_intset_entries after a successful insert. */
int setTypeAdd(robj *subject, sds value) {
long long llval;
if (subject->encoding == OBJ_ENCODING_HT) {
dict *ht = subject->ptr;
dictEntry *de = dictAddRaw(ht,value,NULL);
if (de) {
/* The dict stores its own copy of the key; the value slot is NULL. */
dictSetKey(ht,de,sdsdup(value));
dictSetVal(ht,de,NULL);
return 1;
}
} else if (subject->encoding == OBJ_ENCODING_INTSET) {
// isSdsRepresentableAsLongLong also goes through string2ll
if (isSdsRepresentableAsLongLong(value,&llval) == C_OK) {
uint8_t success = 0;
subject->ptr = intsetAdd(subject->ptr,llval,&success);
if (success) {
/* Convert to regular set when the intset contains
* too many entries. */
// set_max_intset_entries defaults to 512
if (intsetLen(subject->ptr) > server.set_max_intset_entries)
setTypeConvert(subject,OBJ_ENCODING_HT);
return 1;
}
} else {
/* Failed to get integer from object, convert to regular set. */
setTypeConvert(subject,OBJ_ENCODING_HT);
/* The set *was* an intset and this value is not integer
* encodable, so dictAdd should always work. */
serverAssert(dictAdd(subject->ptr,sdsdup(value),NULL) == DICT_OK);
return 1;
}
} else {
serverPanic("Unknown set encoding");
}
return 0;
}
由上面的源码可以知道,即使在setTypeCreate时创建编码为OBJ_ENCODING_INTSET的集合对象,当集合中保存的元素(或新增的元素中)有无法转换为long long类型的整数值时或者是集合中保存元素数量大于512时,都会调用setTypeConvert函数对集合进行转换,使集合变为编码为OBJ_ENCODING_HT的hashtable结构。
从上面源码中,也可以看出OBJ_ENCODING_HT编码的集合对象使用字典作为底层实现,字典的每个键都是一个SDS对象,每个SDS对象包含一个集合元素,而字典的值则全部被设置为NULL(dictSetVal(ht,de,NULL))。
2.集合总结
1.集合对象的编码有两种:OBJ_ENCODING_INTSET和OBJ_ENCODING_HT。
2.当集合对象中保存的元素(或新增的元素中)有无法转换为long long类型的整数值时或者是集合中保存元素数量大于512时,都会调用setTypeConvert函数对集合进行转换,使集合变为编码为OBJ_ENCODING_HT的hashtable结构。
六、有序集合对象
有序集合对象是redisObject结构体中type为3(OBJ_ZSET)的对象。
1.zadd源码解析
zadd指令和zincrby指令都是直接对函数zaddGenericCommand进行调用,zaddGenericCommand的源码中有如下代码:
/* Lookup the key and create the sorted set if does not exist. */
zobj = lookupKeyWrite(c->db,key);
if (zobj == NULL) {
if (xx) goto reply_to_client; /* No key + XX option: nothing to do. */
if (server.zset_max_ziplist_entries == 0 ||
server.zset_max_ziplist_value < sdslen(c->argv[scoreidx+1]->ptr))
{
zobj = createZsetObject();
} else {
zobj = createZsetZiplistObject();
}
dbAdd(c->db,key,zobj);
} else {
if (zobj->type != OBJ_ZSET) {
addReply(c,shared.wrongtypeerr);
goto cleanup;
}
}
server.zset_max_ziplist_entries默认值为128,当server.zset_max_ziplist_entries被设置为0,或者本次要添加的第一个元素成员(c->argv[scoreidx+1])的长度大于64(server.zset_max_ziplist_value默认值为64)时,则会调用createZsetObject函数,createZsetObject的源码如下:
/* Create an empty sorted set object with encoding OBJ_ENCODING_SKIPLIST.
 * The payload is a zset holding a new dict (zsetDictType) and a new skip
 * list from zslCreate(). */
robj *createZsetObject(void) {
zset *zs = zmalloc(sizeof(*zs));
robj *o;
zs->dict = dictCreate(&zsetDictType,NULL);
zs->zsl = zslCreate();
o = createObject(OBJ_ZSET,zs);
o->encoding = OBJ_ENCODING_SKIPLIST;
return o;
}
zset的源码结构为:
/* Payload of a skiplist-encoded sorted set: a dict paired with a skip list. */
typedef struct zset {
dict *dict;
zskiplist *zsl;
} zset;
其中包括一个字典和一个跳跃表zskiplist。zskiplist的源码为:
/* Skip list container: header and tail node pointers, a length counter and
 * a level field (zslCreate initializes them to length 0 and level 1). */
typedef struct zskiplist {
struct zskiplistNode *header, *tail;
unsigned long length;
int level;
} zskiplist;
zskiplistNode的源码为:
/* Skip list node: the member string (ele), its score, a backward pointer,
 * and a flexible array of per-level entries, each with a forward pointer
 * and a span. */
typedef struct zskiplistNode {
sds ele;
double score;
struct zskiplistNode *backward;
struct zskiplistLevel {
struct zskiplistNode *forward;
unsigned long span;
} level[];
} zskiplistNode;
在创建编码为跳跃表的zset对象时(createZsetObject),zslCreate即为跳跃表zskiplist的创建函数,其源码为:
/* Create an empty skip list: level 1, length 0, no tail, and a header node
 * allocated with ZSKIPLIST_MAXLEVEL levels whose forward pointers are NULL
 * and spans are 0. Returns the new skip list. */
zskiplist *zslCreate(void) {
int j;
zskiplist *zsl;
zsl = zmalloc(sizeof(*zsl));
zsl->level = 1;
zsl->length = 0;
zsl->header = zslCreateNode(ZSKIPLIST_MAXLEVEL,0,NULL);
//#define ZSKIPLIST_MAXLEVEL 64 /* Should be enough for 2^64 elements */
/* zslCreateNode leaves the level array uninitialized, so clear it here. */
for (j = 0; j < ZSKIPLIST_MAXLEVEL; j++) {
zsl->header->level[j].forward = NULL;
zsl->header->level[j].span = 0;
}
zsl->header->backward = NULL;
zsl->tail = NULL;
return zsl;
}
其中创建header时调用的函数为zslCreateNode,其源码为:
/* Allocate a skip list node with room for `level` zskiplistLevel entries
 * and set its score and ele. The backward pointer and the level entries
 * are not initialized here; the caller must fill them in. */
zskiplistNode *zslCreateNode(int level, double score, sds ele) {
zskiplistNode *zn =
zmalloc(sizeof(*zn)+level*sizeof(struct zskiplistLevel));
zn->score = score;
zn->ele = ele;
return zn;
}
根据上面这两段创建zset的逻辑,大致可以画出在创建zskiplist时生成的结构,如下:

针对Redis跳跃表将在后面选择利用详细的篇幅去进行探讨,此处仅根据源码进行前期的简单介绍。
当server.zset_max_ziplist_entries == 0 || server.zset_max_ziplist_value < sdslen(c->argv[scoreidx+1]->ptr)不满足时,会直接创建一个ziplist压缩列表。
在存值时,会调用函数zsetAdd,zsetAdd中会在满足条件后对有序集合的编码结构进行转换,具体源码如下:
if (zobj->encoding == OBJ_ENCODING_ZIPLIST) {
unsigned char *eptr;
if ((eptr = zzlFind(zobj->ptr,ele,&curscore)) != NULL) {
/* NX? Return, same element already exists. */
if (nx) {
*flags |= ZADD_NOP;
return 1;
}
/* Prepare the score for the increment if needed. */
if (incr) {
score += curscore;
if (isnan(score)) {
*flags |= ZADD_NAN;
return 0;
}
if (newscore) *newscore = score;
}
/* Remove and re-insert when score changed. */
if (score != curscore) {
zobj->ptr = zzlDelete(zobj->ptr,eptr);
zobj->ptr = zzlInsert(zobj->ptr,ele,score);
*flags |= ZADD_UPDATED;
}
return 1;
} else if (!xx) {
/* Optimize: check if the element is too large or the list
* becomes too long *before* executing zzlInsert. */
zobj->ptr = zzlInsert(zobj->ptr,ele,score);
if (zzlLength(zobj->ptr) > server.zset_max_ziplist_entries)
zsetConvert(zobj,OBJ_ENCODING_SKIPLIST);
if (sdslen(ele) > server.zset_max_ziplist_value)
zsetConvert(zobj,OBJ_ENCODING_SKIPLIST);
if (newscore) *newscore = score;
*flags |= ZADD_ADDED;
return 1;
} else {
*flags |= ZADD_NOP;
return 1;
}
} else if (zobj->encoding == OBJ_ENCODING_SKIPLIST) {
zset *zs = zobj->ptr;
zskiplistNode *znode;
dictEntry *de;
de = dictFind(zs->dict,ele);
if (de != NULL) {
/* NX? Return, same element already exists. */
if (nx) {
*flags |= ZADD_NOP;
return 1;
}
curscore = *(double*)dictGetVal(de);
/* Prepare the score for the increment if needed. */
if (incr) {
score += curscore;
if (isnan(score)) {
*flags |= ZADD_NAN;
return 0;
}
if (newscore) *newscore = score;
}
/* Remove and re-insert when score changes. */
if (score != curscore) {
znode = zslUpdateScore(zs->zsl,curscore,ele,score);
/* Note that we did not removed the original element from
* the hash table representing the sorted set, so we just
* update the score. */
dictGetVal(de) = &znode->score; /* Update score ptr. */
*flags |= ZADD_UPDATED;
}
return 1;
} else if (!xx) {
ele = sdsdup(ele);
znode = zslInsert(zs->zsl,score,ele);
serverAssert(dictAdd(zs->dict,ele,&znode->score) == DICT_OK);
*flags |= ZADD_ADDED;
if (newscore) *newscore = score;
return 1;
} else {
*flags |= ZADD_NOP;
return 1;
}
} else {
serverPanic("Unknown sorted set encoding");
}
从代码中可以看出,当有序集合编码为OBJ_ENCODING_ZIPLIST且满足下列任一条件时,会转换为OBJ_ENCODING_SKIPLIST编码的跳跃表底层数据结构:
1.插入新元素后,有序集合保存的元素数量大于128个(zzlLength(zobj->ptr) > server.zset_max_ziplist_entries,其中server.zset_max_ziplist_entries默认为128);
2.新插入的元素成员的长度大于64字节(sdslen(ele) > server.zset_max_ziplist_value,其中server.zset_max_ziplist_value默认为64);
我们还可以看到,当score改变时,底层会先移除然后再重新插入。
在上面的源码中,zzlInsert时会对score进行比较,压缩列表中的集合元素按分值从小到大进行排序,分值较小的元素被放置在靠近表头的位置,而分值较大的元素则被放置在靠近表尾的方向。当插入的score与原本存在的元素的score相同时,会按照被插入的键的字典顺序进行排序,比如abc和acd的score相同,则abc会在acd之前,具体源码实现是在函数zzlInsert中的zzlCompareElements函数中进行。
2.有序集合总结
1.有序集合的编码有两种:OBJ_ENCODING_ZIPLIST和OBJ_ENCODING_SKIPLIST。
2.当有序集合同时满足下面两个条件时,对象使用ziplist编码,否则使用skiplist编码:
- 有序集合保存的元素数量不超过128个
- 有序集合保存的所有元素成员的长度都不超过64字节
本文深入剖析Redis源码,讲解了Redis对象的结构、字符串、列表、哈希、集合和有序集合的底层实现,包括SDS、ziplist、skiplist等数据结构的使用与转换策略。

1775

被折叠的 条评论
为什么被折叠?



