linux radix-tree 基数树实现详解

radix tree,又称基数树,是一种适合构建key(index)与value(item)映射关系的数据结构,在内核中使用非常广泛。本文主要聚焦Linux内核基数树的代码实现,并附有大量带注释的代码。

radix-tree组织结构如下:

1、数据结构

复制代码
/*
 * The bottom two bits of the slot determine how the remaining bits in the slot are interpreted:
 *
 * 00 - data pointer (the stored item itself)
 * 01 - internal entry (pointer to the next tree level, a sibling entry, or a marker that the entry in this slot was moved elsewhere)
 * 10 - exceptional entry (e.g. a swap entry, marked so it is not mistaken for a pointer to struct page)
 * 11 - this bit combination is currently unused/reserved
 *
 * The internal entry may be a pointer to the next level in the tree, a
 * sibling entry, or an indicator that the entry in this slot has been moved
 * to another location in the tree and the lookup should be restarted.  While
 * NULL fits the 'data pointer' pattern, it means that there is no entry in
 * the tree for this index (no matter what level of the tree it is found at).
 * This means that you cannot store NULL in the tree as a value for the index.
 */
/* Mask selecting the two low type bits of a slot value. */
#define RADIX_TREE_ENTRY_MASK		3UL
/* Value of the type bits (01) that marks an internal entry. */
#define RADIX_TREE_INTERNAL_NODE	1UL

/*
 * Most users of the radix tree store pointers but shmem/tmpfs stores swap
 * entries in the same tree.  They are marked as exceptional entries to
 * distinguish them from pointers to struct page.
 * EXCEPTIONAL_ENTRY tests the bit, EXCEPTIONAL_SHIFT shifts content past it.
 */
#define RADIX_TREE_EXCEPTIONAL_ENTRY	2
#define RADIX_TREE_EXCEPTIONAL_SHIFT	2

下面两个函数用于判断从slot中取出的值是对齐良好的普通指针,还是异常项(exceptional entry)/内部项:
/**
 * radix_tree_exceptional_entry - is this slot value an exceptional entry?
 * @arg:	value returned by radix_tree_deref_slot
 *
 * Tests the RADIX_TREE_EXCEPTIONAL_ENTRY bit of @arg.
 *
 * Returns:	0 for a well-aligned pointer, non-0 for an exceptional entry.
 */
static inline int radix_tree_exceptional_entry(void *arg)
{
	const unsigned long bits = (unsigned long)arg;

	/* No unlikely() hint here: radix_tree_exception is often tested first. */
	return bits & RADIX_TREE_EXCEPTIONAL_ENTRY;
}

/**
 * radix_tree_exception - is this slot value any kind of exception?
 * @arg:	value returned by radix_tree_deref_slot
 *
 * Tests both low type bits (RADIX_TREE_ENTRY_MASK) of @arg.
 *
 * Returns:	0 for a well-aligned pointer, non-0 if either type bit is set.
 */
static inline int radix_tree_exception(void *arg)
{
	const unsigned long bits = (unsigned long)arg;

	return unlikely(bits & RADIX_TREE_ENTRY_MASK);
}

/*config文件配置*/
CONFIG_BASE_SMALL=0 

/* Number of tag bitmaps kept per node. */
#define RADIX_TREE_MAX_TAGS 3

/* Basic step: number of index bits consumed by each tree level. */
#define RADIX_TREE_MAP_SHIFT	(CONFIG_BASE_SMALL ? 4 : 6) // 6 when CONFIG_BASE_SMALL=0

/* Number of elements in a node's slots[] array. */
#define RADIX_TREE_MAP_SIZE	(1UL << RADIX_TREE_MAP_SHIFT) // 1 << 6 = 64 when the shift is 6
#define RADIX_TREE_MAP_MASK	(RADIX_TREE_MAP_SIZE-1) // 64 - 1 = 0x3f when the shift is 6

/* Longs needed to hold one bit per slot (RADIX_TREE_MAP_SIZE rounded up to whole longs). */
#define RADIX_TREE_TAG_LONGS	((RADIX_TREE_MAP_SIZE + BITS_PER_LONG - 1) / BITS_PER_LONG)

#define RADIX_TREE_INDEX_BITS  (8 /* CHAR_BIT */ * sizeof(unsigned long)) // 8 * 8 = 64 when unsigned long is 8 bytes

/*
 * Maximum tree depth: index bits divided by bits per level, rounded up.
 * With a 64-bit index and 6 bits per level this is 11.
 */
#define RADIX_TREE_MAX_PATH (DIV_ROUND_UP(RADIX_TREE_INDEX_BITS, RADIX_TREE_MAP_SHIFT))

/* The top bits of gfp_mask are used to store the root tags and the IDR flag */
#define ROOT_IS_IDR		((__force gfp_t)(1 << __GFP_BITS_SHIFT)) // 1 << 25 if __GFP_BITS_SHIFT is 25 (value assumed by this article)
#define ROOT_TAG_SHIFT	(__GFP_BITS_SHIFT + 1)  // 26 under the same assumption

Linux radix-tree的存储键值index是无符号长整型。index每右移一个固定步长,对应树的深度加一,因此树的最大深度也是固定的(比特位数/步长,向上取整)。
而 RADIX_TREE_MAP_SHIFT 就是这个以bit为单位的步长,Linux默认值为6,其含义是index中每6 bit对应树的一层。


数据结构:
/*
 * @count is the count of every non-NULL element in the ->slots array
 * whether that is an exceptional entry, a retry entry, a user pointer,
 * a sibling entry or a pointer to the next level of the tree.
 * @exceptional is the count of every element in ->slots which is
 * either radix_tree_exceptional_entry() or is a sibling entry for an
 * exceptional entry.
 */
/* One node of the radix tree. */
struct radix_tree_node {
	/*
	 * Shift of this level: 0 for bottom-level nodes, RADIX_TREE_MAP_SHIFT
	 * for their parents, and so on upwards.
	 */
	unsigned char	shift;		/* Bits remaining in each slot */
	/* Position of this node within its parent's slots[] */
	unsigned char	offset;		/* Slot offset in parent */
	/* Number of non-NULL elements in slots[] */
	unsigned char	count;		/* Total entry count */
	unsigned char	exceptional;	/* Exceptional entry count */
	/* Parent node */
	struct radix_tree_node *parent;		/* Used when ascending tree */
	/* Root of the tree that owns this node */
	struct radix_tree_root *root;		/* The tree we belong to */
	union {
		struct list_head private_list;	/* For tree user */
		struct rcu_head	rcu_head;	    /* Used when freeing node */
	};
	/* Children (or items) of this node */
	void __rcu	*slots[RADIX_TREE_MAP_SIZE]; // 64 entries when RADIX_TREE_MAP_SHIFT is 6
	/* Per-tag bitmaps: one bit per slot for each of the RADIX_TREE_MAX_TAGS tags */
	unsigned long	tags[RADIX_TREE_MAX_TAGS][RADIX_TREE_TAG_LONGS]; // tags[3][1] with 64 slots and 64-bit longs
};

如果用1个bit标记1个slots[]元素的状态,64个slots[]元素刚好需要1个64bit的变量。
一个slot如果有3种状态(tag),自然需要3个64bit的变量,这就是 unsigned long tags[3][1] 的来历。

在page cache radix tree中,三个状态定义如下:

/*
 * Radix-tree tags, for tagging dirty and writeback pages within the pagecache radix trees
 * Each value is the first index into radix_tree_node::tags[][].
 */
#define PAGECACHE_TAG_DIRTY	0	/* tags[0]: page is dirty */
#define PAGECACHE_TAG_WRITEBACK	1	/* tags[1]: page is under writeback */
#define PAGECACHE_TAG_TOWRITE	2	/* tags[2]: page is marked "to write" */
tags[0][0]的64bit用来标记page 的dirty状态,bit0-63的值分别对应slots[]0-63的状态
tags[1][0]的64bit用来标记page 的WRITEBACK状态,bit0-63的值分别对应slots[]0-63的状态
tags[2][0]的64bit用来标记page 的TOWRITE状态,bit0-63的值分别对应slots[]0-63的状态

/* Root of a radix tree. */
struct radix_tree_root {
	/* Allocation flags; the top bits also carry the root tags and the IDR flag. */
	gfp_t		gfp_mask;
	/* Top of the tree: NULL when empty, otherwise a single item or an internal entry. */
	struct radix_tree_node	__rcu *rnode;
};

2、接口实现

2.1、准备工作

linux内核通过kmem_cache_create为radix-tree提前分配了一块slab内存空间(radix_tree_node_cachep)用于快速分配struct radix_tree_node节点。

又通过radix_tree_node_cachep提前为每个CPU分配了一些struct radix_tree_node节点,通过struct radix_tree_preload将其串联起来进行管理。

复制代码
/*
 * Boot-time initialisation of the radix tree code: creates the slab cache
 * used for struct radix_tree_node, fills in height_to_maxnodes[] and
 * registers radix_tree_cpu_dead as the CPUHP_RADIX_DEAD hotplug callback.
 */
void __init radix_tree_init(void)
{
	int ret;

	/* The root tags are stored in gfp_mask above the GFP bits; they must fit in 32 bits. */
	BUILD_BUG_ON(RADIX_TREE_MAX_TAGS + __GFP_BITS_SHIFT > 32);
	radix_tree_node_cachep = kmem_cache_create("radix_tree_node",
			sizeof(struct radix_tree_node), 0,
			SLAB_PANIC | SLAB_RECLAIM_ACCOUNT,
			radix_tree_node_ctor);

	/*
	 * Compute, for every height, the total number of nodes in the tree
	 * (root excluded) and store it in height_to_maxnodes[].
	 */
	radix_tree_init_maxnodes();
	ret = cpuhp_setup_state_nocalls(CPUHP_RADIX_DEAD, "lib/radix:dead",NULL, radix_tree_cpu_dead);
	WARN_ON(ret < 0);
}


/*
 * Largest index a tree of the given @height can hold.
 *
 * A tree of @height levels covers height * RADIX_TREE_MAP_SHIFT index bits;
 * the result is an all-ones value of that width, clamped to 0 and ~0UL at
 * the two ends.
 */
static __init unsigned long __maxindex(unsigned int height)
{
	unsigned int used_bits = height * RADIX_TREE_MAP_SHIFT;
	int spare_bits = RADIX_TREE_INDEX_BITS - used_bits;
	unsigned long maxindex;

	if (spare_bits < 0)
		maxindex = ~0UL;	/* more levels than index bits */
	else if (spare_bits >= BITS_PER_LONG)
		maxindex = 0UL;		/* no index bits covered at all */
	else
		maxindex = ~0UL >> spare_bits;

	return maxindex;
}

/*计算不同height时radix-tree中节点的总数,不包括root*/
static __init void radix_tree_init_maxnodes(void)
{
	unsigned long height_to_maxindex[RADIX_TREE_MAX_PATH + 1]; //12
	unsigned int i, j;

	for (i = 0; i < ARRAY_SIZE(height_to_maxindex); i++)
		height_to_maxindex[i] = __maxindex(i); /*0,64-1,4096-1,....height为不同值时,radix-tree树中能够存放的index的最大值*/

	for (i = 0; i < ARRAY_SIZE(height_to_maxnodes); i++) {
		for (j = i; j > 0; j--)
			height_to_maxnodes[i] += height_to_maxindex[j - 1] + 1;
	}
}

/*
 * Per-cpu pool of preloaded nodes
 */
struct radix_tree_preload {
	/* Number of nodes currently on the ->nodes list */
	unsigned nr;
	/* nodes->parent points to next preallocated node */
	struct radix_tree_node *nodes;
};
/* The per-cpu pool instance, initially empty. */
static DEFINE_PER_CPU(struct radix_tree_preload, radix_tree_preloads) = { 0, };

/* Number of nodes radix_tree_preload() keeps ready on each CPU. */
#define RADIX_TREE_PRELOAD_SIZE (RADIX_TREE_MAX_PATH * 2 - 1) // 11 * 2 - 1 = 21 when RADIX_TREE_MAX_PATH is 11

/*
 * Fill this CPU's radix_tree_node pool with RADIX_TREE_PRELOAD_SIZE nodes so
 * that the addition of a single element to a tree cannot fail.
 *
 * Returns zero with preemption disabled on success, or -ENOMEM with
 * preemption not disabled on failure.
 *
 * To make use of this facility, the radix tree must be initialised without
 * __GFP_DIRECT_RECLAIM being passed to INIT_RADIX_TREE().
 */
int radix_tree_preload(gfp_t gfp_mask)
{
	const bool may_block = gfpflags_allow_blocking(gfp_mask);

	/* Preloading with a non-blocking mask makes no sense; warn once. */
	WARN_ON_ONCE(!may_block);

	return __radix_tree_preload(gfp_mask, RADIX_TREE_PRELOAD_SIZE);
}

/*
 * Load up this CPU's radix_tree_node buffer with sufficient objects to
 * ensure that the addition of a single element in the tree cannot fail.  On
 * success, return zero, with preemption disabled.  On error, return -ENOMEM
 * with preemption not disabled.
 *
 * To make use of this facility, the radix tree must be initialised without
 * __GFP_DIRECT_RECLAIM being passed to INIT_RADIX_TREE().
 *
 * @gfp_mask: allocation flags (__GFP_ACCOUNT is stripped before use)
 * @nr:       number of nodes the per-cpu pool must hold on return
 */
static __must_check int __radix_tree_preload(gfp_t gfp_mask, unsigned nr)
{
	struct radix_tree_preload *rtp;
	struct radix_tree_node *node;
	int ret = -ENOMEM;

	/*
	 * Nodes preloaded by one cgroup can be used by another cgroup, so
	 * they should never be accounted to any particular memory cgroup.
	 */
	gfp_mask &= ~__GFP_ACCOUNT;

	/*
	 * The per-cpu pool is about to be accessed: disable preemption so the
	 * task cannot migrate to another CPU in the middle of the access.
	 */
	preempt_disable();
	rtp = this_cpu_ptr(&radix_tree_preloads);
	while (rtp->nr < nr) {
		/* The allocation may block, so re-enable preemption around it. */
		preempt_enable();
		/* Preallocate one radix tree node. */
		node = kmem_cache_alloc(radix_tree_node_cachep, gfp_mask);
		if (node == NULL)
			goto out;	/* error path: preemption is left enabled */
		/*
		 * Allocation done: disable preemption again, re-fetch the
		 * per-cpu pool (the CPU may have changed) and re-check whether
		 * it still needs more nodes.
		 */
		preempt_disable();
		rtp = this_cpu_ptr(&radix_tree_preloads);
		if (rtp->nr < nr) {
			/* Push the node onto the pool's list, linked through ->parent. */
			node->parent = rtp->nodes;
			rtp->nodes = node;
			rtp->nr++;
		} else {
			/* The pool is already full; give the node back. */
			kmem_cache_free(radix_tree_node_cachep, node);
		}
	}
	ret = 0;
out:
	return ret;
}

/*
 * Counterpart of radix_tree_preload(): re-enables the preemption that a
 * successful preload left disabled. The two must be used as a pair.
 */
static inline void radix_tree_preload_end(void)
{
	preempt_enable();
}

如下示例:
radix_tree_preload(GFP_NOFS | __GFP_NOFAIL);
spin_lock(&im->ino_lock);
...
spin_unlock(&im->ino_lock);
radix_tree_preload_end();

2.2、初始化radix-tree root

复制代码
/*
 * Static initializer for a struct radix_tree_root: stores @mask in gfp_mask
 * and leaves the tree empty (rnode == NULL).
 *
 * Note: every continuation line must directly follow its backslash; a blank
 * line in between would end the macro definition early.
 */
#define RADIX_TREE_INIT(mask) { \
	.gfp_mask = (mask), \
	.rnode = NULL, \
}

/* Define a radix tree root called @name, initialised empty with @mask. */
#define RADIX_TREE(name, mask) \
	struct radix_tree_root name = RADIX_TREE_INIT(mask)

/* Run-time initialisation of an existing root: set @mask and mark it empty. */
#define INIT_RADIX_TREE(root, mask) \
do { \
	(root)->gfp_mask = (mask); \
	(root)->rnode = NULL; \
} while (0)

2.3、往radix-tree中插入节点

复制代码
/*
 * Insert @entry into the tree at key @index.
 * Thin wrapper around __radix_tree_insert() with order 0 (a single index).
 * Returns whatever __radix_tree_insert() returns.
 */
static inline int radix_tree_insert(struct radix_tree_root *root,unsigned long index, void *entry)
{
	return __radix_tree_insert(root, index, 0, entry); /* order is 0 */
}

/**
 *	__radix_tree_insert    -    insert into a radix tree
 *	@root:		radix tree root
 *	@index:		index key
 *	@order:		key covers the 2^order indices around index, i.e. a run of
 *			2^order consecutive indices all refer to the same item
 *	@item:		item to insert
 *
 *	Insert an item into the radix tree at position @index.
 *
 *	Returns 0 on success, or the negative error returned by
 *	__radix_tree_create() or insert_entries().
 */
int __radix_tree_insert(struct radix_tree_root *root, unsigned long index, unsigned order, void *item)
{
	struct radix_tree_node *node;
	void __rcu **slot;
	int error;

	/* An item that looks like an internal entry must never be inserted. */
	BUG_ON(radix_tree_is_internal_node(item));

	/*
	 * Build the path for @index. On return, node is the lowest node on
	 * that path (NULL if the root slot is used directly) and slot is the
	 * slot inside it that corresponds to @index.
	 */
	error = __radix_tree_create(root, index, order, &node, &slot);
	if (error)
		return error;

	/* Store the item in the slot(s) found above. */
	error = insert_entries(node, slot, item, order, false);
	if (error < 0)
		return error;

	/* A freshly inserted entry must not have any tag set. */
	if (node) {
		unsigned offset = get_slot_offset(node, slot);
		BUG_ON(tag_get(node, 0, offset));
		BUG_ON(tag_get(node, 1, offset));
		BUG_ON(tag_get(node, 2, offset));
	} else {
		BUG_ON(root_tags_get(root));
	}

	return 0;
}

/**
 *	__radix_tree_create	-	create a slot in a radix tree
 *	@root:		radix tree root
 *	@index:		index key
 *	@order:		index occupies 2^order aligned slots
 *	@nodep:		returns node
 *	@slotp:		returns slot
 *
 *	Create, if necessary, and return the node and slot for an item
 *	at position @index in the radix tree @root.
 *
 *	Until there is more than one item in the tree, no nodes are
 *	allocated and @root->rnode is used as a direct slot instead of
 *	pointing to a node, in which case *@nodep will be NULL.
 *
 *	Returns -ENOMEM, or 0 for success.
 */
int __radix_tree_create(struct radix_tree_root *root, unsigned long index,
			unsigned order, struct radix_tree_node **nodep,
			void __rcu ***slotp)
{
	struct radix_tree_node *node = NULL, *child;

	/* Start at the root slot; root->rnode is NULL for a freshly initialised tree. */
	void __rcu **slot = (void __rcu **)&root->rnode;

	unsigned long maxindex;
	unsigned int shift, offset = 0;

	/* Last index of the range covered by (index, order). */
	unsigned long max = index | ((1UL << order) - 1);

	gfp_t gfp = root_gfp_mask(root);

	/*
	 * If root->rnode is not an internal entry this yields maxindex = 0 and
	 * shift = 0. Otherwise maxindex is the largest index the tree can hold
	 * and shift is the top node's shift plus RADIX_TREE_MAP_SHIFT.
	 * child is always set to root->rnode.
	 */
	shift = radix_tree_load_root(root, &child, &maxindex);

	/* Make sure the tree is high enough.  */
	if (order > 0 && max == ((1UL << order) - 1)) /* range covers exactly the low 'order' bits */
		max++;

	if (max > maxindex) {
		/*
		 * The tree cannot hold max yet. radix_tree_extend() adds levels
		 * above an existing root entry; for an empty tree it only
		 * computes the required shift, and the nodes are created by the
		 * loop below. Either way the returned shift is one
		 * RADIX_TREE_MAP_SHIFT larger than the top level's shift.
		 */
		int error = radix_tree_extend(root, gfp, max, shift);
		if (error < 0)
			return error;
		shift = error;
		child = rcu_dereference_raw(root->rnode);
	}

	while (shift > order) {
		/* Undo the extra RADIX_TREE_MAP_SHIFT carried in from above. */
		shift -= RADIX_TREE_MAP_SHIFT;
		if (child == NULL) {
			/* Have to add a child node.  */
			child = radix_tree_node_alloc(gfp, node, root, shift,offset, 0, 0);
			if (!child)
				return -ENOMEM;

			/* Publish the new node in its parent slot, encoded as an internal entry. */
			rcu_assign_pointer(*slot, node_to_entry(child));
			if (node)
				node->count++;
		} else if (!radix_tree_is_internal_node(child))
			break;

		/* Go a level down */
		node = entry_to_node(child);
		/*
		 * offset is the position of index within node->slots[];
		 * child receives the value stored in that slot.
		 */
		offset = radix_tree_descend(node, &child, index);
		slot = &node->slots[offset]; /* slot for index inside this node */
	}

	if (nodep) /* lowest node reached for index (NULL if none) */
		*nodep = node;
	if (slotp)
		*slotp = slot; /* slot for index within that node, or the root slot */
	return 0;
}

/*
 * Read the root of the tree.
 *
 * @nodep:    receives root->rnode as stored (still carrying its type bits)
 * @maxindex: receives the largest index the tree can currently hold
 *
 * Returns the top node's shift plus RADIX_TREE_MAP_SHIFT when root->rnode is
 * an internal entry, otherwise 0 (with *@maxindex set to 0). The extra
 * RADIX_TREE_MAP_SHIFT is subtracted again by __radix_tree_create() before
 * it uses the value for a level.
 */
static unsigned radix_tree_load_root(const struct radix_tree_root *root,struct radix_tree_node **nodep, unsigned long *maxindex)
{
	struct radix_tree_node *top = rcu_dereference_raw(root->rnode);

	*nodep = top;

	/* Empty tree, or a single item stored directly in the root slot. */
	if (unlikely(!radix_tree_is_internal_node(top))) {
		*maxindex = 0;
		return 0;
	}

	top = entry_to_node(top);
	*maxindex = node_maxindex(top);
	return top->shift + RADIX_TREE_MAP_SHIFT;
}

/* True when the two low type bits of @ptr are exactly RADIX_TREE_INTERNAL_NODE. */
static inline bool radix_tree_is_internal_node(void *ptr)
{
	const unsigned long type_bits = (unsigned long)ptr & RADIX_TREE_ENTRY_MASK;

	return type_bits == RADIX_TREE_INTERNAL_NODE;
}

/* Strip the internal-entry marker bit from @ptr, yielding the node pointer. */
static inline struct radix_tree_node *entry_to_node(void *ptr)
{
	unsigned long raw = (unsigned long)ptr;

	raw &= ~RADIX_TREE_INTERNAL_NODE;
	return (void *)raw;
}

/* Set the internal-entry marker bit on @ptr, yielding an internal entry. */
static inline void *node_to_entry(void *ptr)
{
	unsigned long raw = (unsigned long)ptr;

	raw |= RADIX_TREE_INTERNAL_NODE;
	return (void *)raw;
}

/* Internal entry built from NULL; lookups that meet it restart from the root. */
#define RADIX_TREE_RETRY	node_to_entry(NULL)

/*
 * The maximum index which can be stored in a radix tree whose top node has
 * the given @shift.
 *
 * @shift is always a multiple of RADIX_TREE_MAP_SHIFT because levels are
 * added in steps of that size; the result is (MAP_SIZE << shift) - 1.
 */
static inline unsigned long shift_maxindex(unsigned int shift)
{
	const unsigned long span = RADIX_TREE_MAP_SIZE << shift;

	return span - 1;
}

/* Largest index that a subtree rooted at @node can hold, derived from node->shift. */
static inline unsigned long node_maxindex(const struct radix_tree_node *node)
{
	const unsigned int shift = node->shift;

	return shift_maxindex(shift);
}


/*
 *	Extend a radix tree so it can store key @index.
 *
 *	@root:	tree to grow
 *	@gfp:	allocation flags for the new nodes
 *	@index:	key that must fit after the extension
 *	@shift:	current shift as returned by radix_tree_load_root()
 *
 *	Returns -ENOMEM on allocation failure, otherwise the shift of the new
 *	top level plus RADIX_TREE_MAP_SHIFT (the same convention as
 *	radix_tree_load_root(); __radix_tree_create() subtracts it again).
 */
static int radix_tree_extend(struct radix_tree_root *root, gfp_t gfp,unsigned long index, unsigned int shift)
{
	void *entry;
	unsigned int maxshift;
	int tag;

	/* Figure out what the shift should be.  */
	maxshift = shift;

	/* Raise maxshift one level at a time until @index fits. */
	while (index > shift_maxindex(maxshift))
		maxshift += RADIX_TREE_MAP_SHIFT;

	entry = rcu_dereference_raw(root->rnode);
	/*
	 * Empty tree (and either not an IDR, or the root already has IDR_FREE
	 * set): nothing to move, the caller creates the nodes from the shift.
	 */
	if (!entry && (!is_idr(root) || root_tag_get(root, IDR_FREE)))
		goto out;

	/*
	 * Grow the tree upwards from shift to maxshift. Each new node takes
	 * the old root entry in its slots[0] and then becomes root->rnode.
	 */
	do {
		/* count starts at 1 because slots[0] will hold the old root entry. */
		struct radix_tree_node *node = radix_tree_node_alloc(gfp, NULL/*parent*/,root, shift, 0/*offset*/, 1/*count*/, 0/*exceptional*/);
		if (!node)
			return -ENOMEM;

		if (is_idr(root)) {
			/* Mark every slot of the new node as free ... */
			all_tag_set(node, IDR_FREE);
			if (!root_tag_get(root, IDR_FREE)) {
				/*
				 * ... except slot 0 when the root was not
				 * tagged free, i.e. the old tree had no free
				 * space left.
				 */
				tag_clear(node, IDR_FREE, 0);
				root_tag_set(root, IDR_FREE);
			}
		} else {
			/* Propagate the aggregated tag info to the new child */
			for (tag = 0; tag < RADIX_TREE_MAX_TAGS; tag++) {
				if (root_tag_get(root, tag))
					tag_set(node, tag, 0); /* set bit 0 of node->tags[tag] */
			}
		}

		BUG_ON(shift > BITS_PER_LONG);
		if (radix_tree_is_internal_node(entry)) {
			entry_to_node(entry)->parent = node;
		} else if (radix_tree_exceptional_entry(entry)) {
			/* Moving an exceptional root->rnode to a node */
			node->exceptional = 1;
		}
		/*
		 * entry was already in the radix tree, so we do not need
		 * rcu_assign_pointer here
		 */
		node->slots[0] = (void __rcu *)entry; /* old root entry now hangs off slots[0] */
		entry = node_to_entry(node);
		rcu_assign_pointer(root->rnode, entry); /* publish the new top node as an internal entry */
		shift += RADIX_TREE_MAP_SHIFT; /* next level up */
	} while (shift <= maxshift);
out:
	return maxshift + RADIX_TREE_MAP_SHIFT;
}

/*
 * Step one level down from @parent towards @index.
 *
 * Returns the offset of @index within parent->slots[] and stores the value
 * found in that slot in *@nodep. With CONFIG_RADIX_TREE_MULTIORDER, a sibling
 * entry is resolved first: the offset and value of the slot it points at are
 * returned instead.
 */
static unsigned int radix_tree_descend(const struct radix_tree_node *parent,struct radix_tree_node **nodep, unsigned long index)
{
	/* Position of index within parent->slots[] */
	unsigned int offset = (index >> parent->shift) & RADIX_TREE_MAP_MASK;
	/* Value currently stored in that slot */
	void __rcu **entry = rcu_dereference_raw(parent->slots[offset]);

#ifdef CONFIG_RADIX_TREE_MULTIORDER
	/* With multiorder entries the slot may hold a sibling entry instead of the value. */
	if (radix_tree_is_internal_node(entry)) {
		if (is_sibling_entry(parent, entry)) { /* a sibling entry encodes the address of the first slot of the run */
			void __rcu **sibentry;
			sibentry = (void __rcu **) entry_to_node(entry);
			offset = get_slot_offset(parent, sibentry);
			entry = rcu_dereference_raw(*sibentry);
		}
	}
#endif

	*nodep = (void *)entry; /* contents of slots[offset] */
	return offset;
}

/*
 * This assumes that the caller has performed appropriate preallocation, and
 * that the caller has pinned this thread of control to the current CPU.
 *
 * Allocates a node and initialises its shift, offset, count, exceptional,
 * parent and root fields from the arguments. Returns NULL if no node could
 * be obtained.
 */
static struct radix_tree_node * radix_tree_node_alloc(gfp_t gfp_mask, struct radix_tree_node *parent,
			struct radix_tree_root *root,
			unsigned int shift, unsigned int offset,
			unsigned int count, unsigned int exceptional)
{
	struct radix_tree_node *ret = NULL;

	/*
	 * Preload code isn't irq safe and it doesn't make sense to use
	 * preloading during an interrupt anyway as all the allocations have
	 * to be atomic. So just do normal allocation when in interrupt.
	 */
	/* Taken when the mask does not allow blocking and we are not in interrupt context. */
	if (!gfpflags_allow_blocking(gfp_mask) && !in_interrupt()) {
		struct radix_tree_preload *rtp;

		/*
		 * Even if the caller has preloaded, try to allocate from the
		 * cache first for the new node to get accounted to the memory
		 * cgroup.
		 */
		ret = kmem_cache_alloc(radix_tree_node_cachep, gfp_mask | __GFP_NOWARN);
		if (ret)
			goto out;

		/*
		 * Provided the caller has preloaded here, we will always
		 * succeed in getting a node here (and never reach
		 * kmem_cache_alloc)
		 */
		/* radix_tree_preloads is the per-cpu pool of preallocated nodes. */
		rtp = this_cpu_ptr(&radix_tree_preloads);
		if (rtp->nr) {
			/* Pop one node off the per-cpu pool (linked through ->parent). */
			ret = rtp->nodes;
			rtp->nodes = ret->parent;
			rtp->nr--;
		}
		/*
		 * Update the allocation stack trace as this is more useful
		 * for debugging.
		 */
		kmemleak_update_trace(ret);
		goto out;
	}
	ret = kmem_cache_alloc(radix_tree_node_cachep, gfp_mask);
out:
	BUG_ON(radix_tree_is_internal_node(ret));
	if (ret) {
		/* Initialise the new node from the arguments. */
		ret->shift = shift;
		ret->offset = offset;
		ret->count = count;
		ret->exceptional = exceptional;
		ret->parent = parent;
		ret->root = root;
	}
	return ret;
}

#ifdef CONFIG_RADIX_TREE_MULTIORDER
/*
 * Multiorder variant (CONFIG_RADIX_TREE_MULTIORDER=y).
 *
 * Store @item so that it covers 2^@order indices: the first slot of the
 * aligned run holds @item itself, the remaining slots hold a sibling entry
 * that points back at the first slot.
 *
 * @node:    node containing @slot, or NULL when @slot is the root slot
 * @replace: if false, an occupied slot in the run makes the call fail
 *
 * Returns the number of slots written, or -EEXIST.
 */
static inline int insert_entries(struct radix_tree_node *node,
		void __rcu **slot, void *item, unsigned order, bool replace)
{
	struct radix_tree_node *child;
	unsigned i, n, tag, offset, tags = 0;

	/* n = number of slots of this node covered by the entry */
	if (node) {
		if (order > node->shift) /* entry spans more than one slot at this level */
			n = 1 << (order - node->shift);
		else
			n = 1;
		offset = get_slot_offset(node, slot); /* position of slot within node->slots[] */
	} else { /* no node: the item goes directly into the root slot */
		n = 1;
		offset = 0;
	}

	if (n > 1) { /* the entry covers several slots */
		offset = offset & ~(n - 1); /* align offset down to the start of the run */
		slot = &node->slots[offset]; /* first slot of the run */
	}
	child = node_to_entry(slot); /* sibling entry: address of the first slot, tagged as internal */

	for (i = 0; i < n; i++) {
		if (slot[i]) { /* slot already occupied */
			if (replace) { /* replacing: drop the old entry from the count and remember its tags */
				node->count--;
				for (tag = 0; tag < RADIX_TREE_MAX_TAGS; tag++)
					if (tag_get(node, tag, offset + i)) /* is bit offset + i of node->tags[tag] set? */
						tags |= 1 << tag;
			} else
				return -EEXIST; /* not replacing: report that an entry exists */
		}
	}

	for (i = 0; i < n; i++) {
		struct radix_tree_node *old = rcu_dereference_raw(slot[i]);
		/* Only the first slot of the run stores item; the others store the sibling entry. */
		if (i) { /* i != 0: sibling slot */
			rcu_assign_pointer(slot[i], child); /* store the internal-tagged address of the first slot */
			for (tag = 0; tag < RADIX_TREE_MAX_TAGS; tag++)
				if (tags & (1 << tag))
					tag_clear(node, tag, offset + i); /* siblings carry no tags */
		} else { /* i == 0: first slot */
			rcu_assign_pointer(slot[i], item); /* store the item itself */
			for (tag = 0; tag < RADIX_TREE_MAX_TAGS; tag++)
				if (tags & (1 << tag))
					tag_set(node, tag, offset); /* collected tags end up on the first slot */
		}
		if (radix_tree_is_internal_node(old) &&
					!is_sibling_entry(node, old) &&
					(old != RADIX_TREE_RETRY))
			radix_tree_free_nodes(old); /* old was a subtree that is no longer reachable: free every node below it */
		if (radix_tree_exceptional_entry(old))
			node->exceptional--;
	}
	if (node) {
		node->count += n; /* account for the n slots just filled */
		if (radix_tree_exceptional_entry(item))
			node->exceptional += n;
	}
	return n;
}
#else
/*
 * Single-slot variant (no multiorder support): store @item in *@slot.
 * @order and @replace are not used here.
 *
 * Returns 1 on success, or -EEXIST if the slot is already occupied.
 */
static inline int insert_entries(struct radix_tree_node *node,
		void __rcu **slot, void *item, unsigned order, bool replace)
{
	if (*slot)
		return -EEXIST;
	rcu_assign_pointer(*slot, item); /* publish the item in the slot for its index */
	if (node) {
		node->count++;
		if (radix_tree_exceptional_entry(item))
			node->exceptional++;
	}
	return 1;
}
#endif


/*
 * Free any nodes below this node.
 * The tree is presumed to not need shrinking, and any user data in the tree is presumed to not need a destructor called on it.
 * If we need to add a destructor, we can add that functionality later.
 * Note that we may not clear tags or slots from the tree as an RCU walker may still have a pointer into this subtree.
 * We could replace the entries with RADIX_TREE_RETRY,
 * but we'll still have to clear those in rcu_free.
 */
/*
 * @node is an internal entry. The subtree is walked depth first without
 * recursion, and each node is passed to radix_tree_node_free() once all of
 * its slots have been visited.
 */
static void radix_tree_free_nodes(struct radix_tree_node *node)
{
	unsigned offset = 0;
	struct radix_tree_node *child = entry_to_node(node);

	/* Depth-first walk over everything below node. */
	for (;;) {
		void *entry = rcu_dereference_raw(child->slots[offset]);
		/*
		 * Descend only into internal entries that are not sibling
		 * entries; a sibling entry only refers to another slot of the
		 * same node and has no subtree of its own. Anything else is
		 * skipped by advancing offset.
		 */
		if (radix_tree_is_internal_node(entry) && !is_sibling_entry(child, entry)) {
			child = entry_to_node(entry); /* go down into this child */
			offset = 0;
			continue;
		}
		offset++; /* nothing to descend into: move to the next slot */
		while (offset == RADIX_TREE_MAP_SIZE) { /* every slot of child has been visited: child can be freed */
			struct radix_tree_node *old = child;
			offset = child->offset + 1;
			child = child->parent; /* resume in the parent at the slot after this child */
			WARN_ON_ONCE(!list_empty(&old->private_list));
			/* Hand the finished node to radix_tree_node_free(). */
			radix_tree_node_free(old);
			if (old == entry_to_node(node)) /* just freed the starting node: the whole subtree is done */
				return;
		}
	}
}

2.4、从radix-tree中删除index

复制代码
/**
 * radix_tree_delete - delete an entry from a radix tree
 * @root: radix tree root
 * @index: index key
 *
 * Remove the entry at @index from the radix tree rooted at @root.
 *
 * Return: The deleted entry, or %NULL if it was not present.
 */
/*
 * Wrapper around radix_tree_delete_item() with no expected item. Removing
 * nodes that become empty and shrinking the tree are handled further down
 * the call chain.
 */
void *radix_tree_delete(struct radix_tree_root *root, unsigned long index)
{
	return radix_tree_delete_item(root, index, NULL);
}

/**
 * radix_tree_delete_item - delete an item from a radix tree
 * @root: radix tree root
 * @index: index key
 * @item: expected item
 *
 * Remove @item at @index from the radix tree rooted at @root.
 *
 * Return: the deleted entry, or %NULL if it was not present
 * or the entry at the given @index was not @item.
 */
void *radix_tree_delete_item(struct radix_tree_root *root,unsigned long index, void *item)
{
	struct radix_tree_node *node = NULL;
	void __rcu **slot = NULL;
	void *entry;

	/*
	 * Find the lowest node and the slot for index; entry is the value
	 * currently stored in that slot.
	 */
	entry = __radix_tree_lookup(root, index, &node, &slot);
	if (!slot)
		return NULL;

	/*
	 * Empty slot: nothing to delete, unless this is an IDR whose slot is
	 * not yet tagged IDR_FREE.
	 */
	if (!entry && (!is_idr(root) || node_tag_get(root, node, IDR_FREE,
						get_slot_offset(node, slot))))
		return NULL;

	/* The caller expected a specific item and something else is stored. */
	if (item && entry != item)
		return NULL;

/*
	1. Set the slot for index to NULL.
	2. Then deal with node:
	   - node->count > 0: node still holds other entries and must stay.
	   - node->count == 0: node is freed, and the same check is repeated
	     for its parent, walking upwards.
	   - if node is root->rnode the tree may be shrunk.
*/
	__radix_tree_delete(root, node, slot);

	return entry;
}


/**
 *	__radix_tree_lookup	-lookup an item in a radix tree
 *	@root:		radix tree root
 *	@index:		index key
 *	@nodep:		returns node
 *	@slotp:		returns slot
 *
 *	Lookup and return the item at position @index in the radix
 *	tree @root.
 *
 *	Until there is more than one item in the tree, no nodes are
 *	allocated and @root->rnode is used as a direct slot instead of
 *	pointing to a node, in which case *@nodep will be NULL.
 *
 *	Returns NULL without touching @nodep/@slotp when @index is larger
 *	than the tree can currently hold.
 */
void *__radix_tree_lookup(const struct radix_tree_root *root,
			  unsigned long index, struct radix_tree_node **nodep,
			  void __rcu ***slotp)
{
	struct radix_tree_node *node, *parent;
	unsigned long maxindex;
	void __rcu **slot;

 restart:
	parent = NULL;
	slot = (void __rcu **)&root->rnode;

	/* Fetch root->rnode and the largest index the tree can hold. */
	radix_tree_load_root(root, &node, &maxindex);
	if (index > maxindex)
		return NULL;

	/* Walk down level by level until a non-internal entry is reached. */
	while (radix_tree_is_internal_node(node)) {
		unsigned offset;

		/* A retry entry means the lookup must start over from the root. */
		if (node == RADIX_TREE_RETRY)
			goto restart;

		parent = entry_to_node(node);
		/* offset: position of index in parent->slots[]; node: value stored there */
		offset = radix_tree_descend(parent, &node, index);
		slot = parent->slots + offset;
	}

	if (nodep)
		*nodep = parent;
	if (slotp)
		*slotp = slot;
	return node;
}

/*
 * Remove the entry stored in @slot of @node (@node is NULL for the root slot).
 *
 * 1. Update the tags for the slot and set the slot to NULL.
 * 2. If @node is non-NULL, let delete_node() decide what happens to it:
 *    - node->count > 0: the node still holds other entries and stays;
 *      if it is root->rnode the tree may be shrunk.
 *    - node->count == 0: the node is freed, and its parent is examined the
 *      same way, walking upwards.
 *
 * Returns false when @node is NULL, otherwise the result of delete_node().
 */
static bool __radix_tree_delete(struct radix_tree_root *root,
				struct radix_tree_node *node, void __rcu **slot)
{
	void *old = rcu_dereference_raw(*slot);

	int exceptional = radix_tree_exceptional_entry(old) ? -1 : 0;
	unsigned offset = get_slot_offset(node, slot);
	int tag;

	if (is_idr(root)) /* IDR: the slot becomes free */
		node_tag_set(root, node, IDR_FREE, offset);
	else
		for (tag = 0; tag < RADIX_TREE_MAX_TAGS; tag++)
			node_tag_clear(root, node, tag, offset);

	/* Clear the slot: count delta is -1 and the new value is NULL. */
	replace_slot(slot, NULL, node, -1, exceptional);

	/* With a node, try to free it (and its ancestors) or shrink the tree. */
	return node && delete_node(root, node, NULL, NULL);
}

/*
 * Store @item in *@slot and adjust the accounting of @node.
 *
 * @count and @exceptional are deltas added to node->count and
 * node->exceptional. Nothing is done (beyond a one-time warning) if @item
 * looks like an internal entry.
 */
static void replace_slot(void __rcu **slot, void *item,struct radix_tree_node *node, int count, int exceptional)
{
	if (WARN_ON_ONCE(radix_tree_is_internal_node(item)))
		return;

	/* Accounting only changes when there is a node and at least one non-zero delta. */
	if (node && (count || exceptional)) {
		node->count += count;
		node->exceptional += exceptional;
		/* Apply the same change to the sibling slots that follow this one. */
		replace_sibling_entries(node, slot, count, exceptional);
	}

	/* Publish the new value (NULL when deleting). */
	rcu_assign_pointer(*slot, item);
}

/*
 * Apply a slot replacement to the sibling entries that directly follow @slot
 * in @node. Only does anything with CONFIG_RADIX_TREE_MULTIORDER.
 *
 * For every consecutive slot after @slot that holds the sibling entry of
 * @slot: when @count is negative the slot is cleared and node->count is
 * decremented; @exceptional is always added to node->exceptional.
 */
static inline void replace_sibling_entries(struct radix_tree_node *node,
				void __rcu **slot, int count, int exceptional)
{
#ifdef CONFIG_RADIX_TREE_MULTIORDER
	void *sibling = node_to_entry(slot);
	unsigned pos;

	for (pos = get_slot_offset(node, slot) + 1;
	     pos < RADIX_TREE_MAP_SIZE; pos++) {
		/* The run of siblings ends at the first slot holding anything else. */
		if (rcu_dereference_raw(node->slots[pos]) != sibling)
			return;
		if (count < 0) {
			node->slots[pos] = NULL;
			node->count--;
		}
		node->exceptional += exceptional;
	}
#endif
}


/*
 * Free @node if it has become empty, then repeat for its ancestors.
 *
 * - node->count > 0: the node still holds entries and is kept. If it is the
 *   node root->rnode points at, radix_tree_shrink() is attempted.
 * - node->count == 0: the node is unlinked from its parent (or from the
 *   root) and freed, and the walk continues with the parent.
 *
 * Returns true if at least one node was freed or the tree was shrunk.
 */
static bool delete_node(struct radix_tree_root *root,struct radix_tree_node *node,radix_tree_update_node_t update_node, void *private)
{
	bool deleted = false;

	do {
		struct radix_tree_node *parent;

		if (node->count) {  /* node still holds entries: it cannot be removed, but the tree may be shrunk */
			if (node_to_entry(node) == rcu_dereference_raw(root->rnode)) /* shrinking only applies at the top node */
				deleted |= radix_tree_shrink(root, update_node,private);
			return deleted;
		}

		/* node is empty and can be removed */
		parent = node->parent;
		if (parent) {
			/* Unlink node from its parent. */
			parent->slots[node->offset] = NULL;
			parent->count--;
		} else {
			/*
			 * Shouldn't the tags already have all been cleared
			 * by the caller?
			 */
			if (!is_idr(root))
				root_tag_clear_all(root);
			root->rnode = NULL;
		}

		WARN_ON_ONCE(!list_empty(&node->private_list));
		radix_tree_node_free(node); /* release the node */
		deleted = true;

		node = parent; /* the parent may now be empty as well: check it next */
	} while (node);

	return deleted;
}

/**
 *	radix_tree_shrink    -    shrink radix tree to minimum height
 *	@root		radix tree root
 *	@update_node	optional callback, invoked on a node whose slot 0 was
 *			replaced with RADIX_TREE_RETRY
 *	@private	opaque argument passed to @update_node
 *
 *	Repeatedly replaces the node referenced by root->rnode with its only
 *	child for as long as that child sits in slot 0, then frees the replaced
 *	node.
 *
 *	Returns true if at least one node was removed.
 */
static inline bool radix_tree_shrink(struct radix_tree_root *root, radix_tree_update_node_t update_node, void *private)
{
	bool shrunk = false;

	for (;;) {
		/* Shrinking only ever looks at the node referenced by root->rnode. */
		struct radix_tree_node *node = rcu_dereference_raw(root->rnode);
		struct radix_tree_node *child;

		if (!radix_tree_is_internal_node(node))
			break;
		node = entry_to_node(node);

		/*
		 * The candidate node has more than one child, or its child
		 * is not at the leftmost slot, or the child is a multiorder
		 * entry, we cannot shrink.
		 */
		if (node->count != 1) /* count != 1: node cannot be replaced by a single child */
			break;

		/*
		 * Only a child stored in slots[0] qualifies: per the original
		 * author's note, such a child has zero high-order index bits at
		 * this level, so removing this node does not affect later
		 * lookups of the child.
		 */
		child = rcu_dereference_raw(node->slots[0]); /* NULL: the only child is not in the leftmost slot */
		if (!child)
			break;

		if (!radix_tree_is_internal_node(child) && node->shift) /* child is a multiorder entry */
			break;

		if (radix_tree_is_internal_node(child))
			entry_to_node(child)->parent = NULL;

		/*
		 * We don't need rcu_assign_pointer(), since we are simply
		 * moving the node from one part of the tree to another: if it
		 * was safe to dereference the old pointer to it
		 * (node->slots[0]), it will be safe to dereference the new
		 * one (root->rnode) as far as dependent read barriers go.
		 */
		root->rnode = (void __rcu *)child; /* root now refers to the child (the original note calls this the inverse of extending the tree) */
		if (is_idr(root) && !tag_get(node, IDR_FREE, 0))
			root_tag_clear(root, IDR_FREE);

		/*
		 * We have a dilemma here. The node's slot[0] must not be
		 * NULLed in case there are concurrent lookups expecting to
		 * find the item. However if this was a bottom-level node,
		 * then it may be subject to the slot pointer being visible
		 * to callers dereferencing it. If item corresponding to
		 * slot[0] is subsequently deleted, these callers would expect
		 * their slot to become empty sooner or later.
		 *
		 * For example, lockless pagecache will look up a slot, deref
		 * the page pointer, and if the page has 0 refcount it means it
		 * was concurrently deleted from pagecache so try the deref
		 * again. Fortunately there is already a requirement for logic
		 * to retry the entire slot lookup -- the indirect pointer
		 * problem (replacing direct root node with an indirect pointer
		 * also results in a stale slot). So tag the slot as indirect
		 * to force callers to retry.
		 */
		node->count = 0; /* the old node no longer counts any child */
		if (!radix_tree_is_internal_node(child)) {
			node->slots[0] = (void __rcu *)RADIX_TREE_RETRY;
			if (update_node)
				update_node(node, private);
		}

		WARN_ON_ONCE(!list_empty(&node->private_list));
		radix_tree_node_free(node); /* hand the replaced node to radix_tree_node_free() */
		shrunk = true;
	}

	return shrunk;
}

/*
 * radix_tree_node_rcu_free - RCU callback that returns a node to the slab.
 * @head: the rcu_head embedded in the radix_tree_node being released
 */
static void radix_tree_node_rcu_free(struct rcu_head *head)
{
	struct radix_tree_node *stale;

	stale = container_of(head, struct radix_tree_node, rcu_head);

	/*
	 * Only zeroed nodes may go back into the slab.  radix_tree_free_nodes
	 * can leave non-NULL entries behind, so wipe the tags and the slots
	 * and reset the private list before freeing.
	 */
	memset(stale->tags, 0, sizeof(stale->tags));
	memset(stale->slots, 0, sizeof(stale->slots));
	INIT_LIST_HEAD(&stale->private_list);

	kmem_cache_free(radix_tree_node_cachep, stale);
}

/*
 * radix_tree_node_free - queue @node for deferred release.
 *
 * Registers radix_tree_node_rcu_free() through call_rcu() on the node's
 * embedded rcu_head; the callback performs the actual release once the
 * read accesses that already exist have completed.
 */
static inline void radix_tree_node_free(struct radix_tree_node *node)
{
	struct rcu_head *deferred = &node->rcu_head;

	call_rcu(deferred, radix_tree_node_rcu_free);
}

注意区分radix_tree_node_free和radix_tree_free_nodes的差异:
radix_tree_node_free只释放单个radix_tree_node节点的内存空间(通过call_rcu注册回调,待现存的读访问全部完成后才真正释放);
radix_tree_free_nodes则是一种非常粗暴的方式,直接将node下的所有节点全部释放掉;

2.5、从radix-tree中查找index,返回其item

复制代码
/**
 *	radix_tree_lookup    -    perform lookup operation on a radix tree
 *	@root:		radix tree root
 *	@index:		index key
 *
 *	Looks up @index in the radix tree @root and returns the item stored
 *	there (the result of __radix_tree_lookup()).
 *
 *	This function can be called under rcu_read_lock, however the caller
 *	must manage lifetimes of leaf nodes (eg. RCU may also be used to free
 *	them safely). No RCU barriers are required to access or modify the
 *	returned item, however.
 */
void *radix_tree_lookup(const struct radix_tree_root *root, unsigned long index)
{
	void *item;

	/* Only the stored value is wanted, so no node/slot is requested back. */
	item = __radix_tree_lookup(root, index, NULL, NULL);

	return item;
}

/**
 *	__radix_tree_lookup	-	lookup an item in a radix tree
 *	@root:		radix tree root
 *	@index:		index key
 *	@nodep:		if non-NULL, receives the node that holds the slot
 *	@slotp:		if non-NULL, receives the address of the slot
 *
 *	Walks down from @root one level at a time until a non-internal entry
 *	is reached and returns that entry.  Returns NULL when @index is larger
 *	than the maximum index reported by radix_tree_load_root().
 *
 *	Until there is more than one item in the tree, no nodes are
 *	allocated and @root->rnode is used as a direct slot instead of
 *	pointing to a node, in which case *@nodep will be NULL.
 */
void *__radix_tree_lookup(const struct radix_tree_root *root,
			  unsigned long index, struct radix_tree_node **nodep,
			  void __rcu ***slotp)
{
	struct radix_tree_node *entry, *holder;
	unsigned long limit;
	void __rcu **where;
	bool retry;

	do {
		retry = false;
		holder = NULL;
		where = (void __rcu **)&root->rnode;

		/* Fetch the top-level entry and the largest index it can cover. */
		radix_tree_load_root(root, &entry, &limit);
		if (index > limit)
			return NULL;

		while (radix_tree_is_internal_node(entry)) {
			unsigned pos;

			/* A retry marker means the walk must start over at the root. */
			if (entry == RADIX_TREE_RETRY) {
				retry = true;
				break;
			}

			/* Step into the node and pick the slot for @index at this level. */
			holder = entry_to_node(entry);
			pos = radix_tree_descend(holder, &entry, index);
			where = &holder->slots[pos];
		}
	} while (retry);

	if (slotp)
		*slotp = where;
	if (nodep)
		*nodep = holder;

	return entry;
}

2.8、遍历radix-tree中的index

/**
 * radix_tree_for_each_slot - iterate over non-empty slots
 *
 * @slot:	the void** variable for pointer to slot
 * @root:	the struct radix_tree_root pointer
 * @iter:	the struct radix_tree_iter pointer
 * @start:	iteration starting index
 *
 * @slot points to radix tree slot, @iter->index contains its index.
 */
/*
 * How the loop is built:
 * - radix_tree_iter_init() supplies the initial value of @slot for an
 *   iteration that begins at @start.
 * - radix_tree_next_chunk() is called whenever @slot is NULL; it searches
 *   @root for a valid slot (slots[offset] != NULL) matching the flags, and
 *   the index it finds may be >= the requested one.  The loop ends when it
 *   returns NULL.
 * - radix_tree_next_slot() advances within the chunk [index, next_index)
 *   that radix_tree_next_chunk() recorded in @iter.
 * Flags are 0 here, i.e. no tag filtering is requested.
 */
#define radix_tree_for_each_slot(slot, root, iter, start)		\
	for (slot = radix_tree_iter_init(iter, start) ;			\
	     slot || (slot = radix_tree_next_chunk(root, iter, 0)) ;	\
	     slot = radix_tree_next_slot(slot, iter, 0))

/**
 * radix_tree_for_each_contig - iterate over contiguous slots
 *
 * @slot:	the void** variable for pointer to slot
 * @root:	the struct radix_tree_root pointer
 * @iter:	the struct radix_tree_iter pointer
 * @start:	iteration starting index
 *
 * @slot points to radix tree slot, @iter->index contains its index.
 */
/*
 * Same loop shape as radix_tree_for_each_slot, but RADIX_TREE_ITER_CONTIG
 * ("stop at first hole") is passed to both radix_tree_next_chunk() and
 * radix_tree_next_slot(), so the iteration ends at the first empty slot
 * instead of skipping over it.
 */
#define radix_tree_for_each_contig(slot, root, iter, start)			\
	for (slot = radix_tree_iter_init(iter, start) ;			\
	     slot || (slot = radix_tree_next_chunk(root, iter,		\
				RADIX_TREE_ITER_CONTIG)) ;	\
	     slot = radix_tree_next_slot(slot, iter,			\
				RADIX_TREE_ITER_CONTIG))

/**
 * radix_tree_for_each_tagged - iterate over tagged slots
 *
 * @slot:	the void** variable for pointer to slot
 * @root:	the struct radix_tree_root pointer
 * @iter:	the struct radix_tree_iter pointer
 * @start:	iteration starting index
 * @tag:	tag index
 *
 * @slot points to radix tree slot, @iter->index contains its index.
 */
/*
 * Same loop shape as radix_tree_for_each_slot, but RADIX_TREE_ITER_TAGGED
 * ("lookup tagged slots") combined with @tag is passed to both
 * radix_tree_next_chunk() and radix_tree_next_slot(), so only slots carrying
 * @tag are visited.
 *
 * @tag is parenthesized in the expansion so that an argument built from a
 * lower-precedence operator (for example "cond ? TAG_A : TAG_B") is OR-ed
 * into the flags as a whole rather than being split by operator precedence.
 * Note that @tag is expanded in two places and therefore may be evaluated
 * more than once.
 */
#define radix_tree_for_each_tagged(slot, root, iter, start, tag)		\
	for (slot = radix_tree_iter_init(iter, start) ;			\
	     slot || (slot = radix_tree_next_chunk(root, iter,		\
			      RADIX_TREE_ITER_TAGGED | (tag))) ;	\
	     slot = radix_tree_next_slot(slot, iter,			\
				RADIX_TREE_ITER_TAGGED | (tag)))


radix_tree_next_chunk和radix_tree_next_slot 配合使用。

 /**
 * radix_tree_next_chunk - find next chunk of slots for iteration
 *
 * @root:	radix tree root
 * @iter:	iterator state
 * @flags:	RADIX_TREE_ITER_* flags and tag index
 * Returns:	pointer to chunk first slot, or NULL if there no more left
 *
 * This function looks up the next chunk in the radix tree starting from @iter->next_index.
 *  It returns a pointer to the chunk's first slot.
 * Also it fills @iter with data about chunk: position in the tree (index),
 * its end (next_index), and constructs a bit mask for tagged iterating (tags).
 */
/*
 * Searches the tree for a slot that satisfies @flags (a non-NULL slot, or a
 * slot with the requested tag bit set); the index that is finally found may
 * be >= iter->next_index.
 */
void __rcu **radix_tree_next_chunk(const struct radix_tree_root *root,struct radix_tree_iter *iter, unsigned flags)
{
	unsigned tag = flags & RADIX_TREE_ITER_TAG_MASK; /* tag index in lower nybble */
	struct radix_tree_node *node, *child;
	unsigned long index, offset, maxindex;

	/* Tagged lookup, but the root does not carry the tag: nothing to find. */
	if ((flags & RADIX_TREE_ITER_TAGGED) && !root_tag_get(root, tag))
		return NULL;

	/*
	 * Catch next_index overflow after ~0UL. iter->index never overflows
	 * during iterating; it can be zero only at the beginning.
	 * And we cannot overflow iter->next_index in a single step,
	 * because RADIX_TREE_MAP_SHIFT < BITS_PER_LONG.
	 *
	 * This condition also used by radix_tree_next_slot() to stop
	 * contiguous iterating, and forbid switching to the next chunk.
	 */
	index = iter->next_index;
	if (!index && iter->index)
		return NULL;

 restart:
	radix_tree_load_root(root, &child, &maxindex);
	if (index > maxindex)
		return NULL;

	if (!child)
		return NULL;

	if (!radix_tree_is_internal_node(child)) { /* the root entry is not a node: single-item tree */
		/* Single-slot tree */
		iter->index = index;
		iter->next_index = maxindex + 1;
		iter->tags = 1;
		iter->node = NULL;
		__set_iter_shift(iter, 0);
		return (void __rcu **)&root->rnode;
	}

	/*
	 * Descend level by level looking for an index that satisfies @flags;
	 * the index that is found is >= the index we started with.
	 */
	do {
		node = entry_to_node(child);
		/* offset of index within this level's slots[]; child receives the entry there */
		offset = radix_tree_descend(node, &child, index);

		/*
		 * A "hole" is:
		 * - tagged lookup: the tag bit for this offset is not set in node;
		 * - untagged lookup: the child at this offset is NULL.
		 * In both cases move on to a later slot of the same node.
		 */
		if ((flags & RADIX_TREE_ITER_TAGGED) ? !tag_get(node, tag, offset) : !child) {
			/* Hole detected */
			if (flags & RADIX_TREE_ITER_CONTIG) /* stop at first hole */
				return NULL;

			if (flags & RADIX_TREE_ITER_TAGGED) /* tagged lookup and the tag bit at offset is clear */
				offset = radix_tree_find_next_bit(node, tag,offset + 1);  /* search the node's tag bits for the next set bit, starting at offset + 1 */
			else
				while (++offset < RADIX_TREE_MAP_SIZE) { /* untagged lookup: scan the following slots of this node */
					void *slot = rcu_dereference_raw(node->slots[offset]);
					if (is_sibling_entry(node, slot))
						continue;
					if (slot) /* found a non-NULL slot */
						break;
				}

			/* offset moved, so rebuild index for this level from the new offset */
			index &= ~node_maxindex(node);
			index += offset << node->shift; /* place the new offset at this level's position */

			/* Overflow after ~0UL */
			if (!index)
				return NULL;

			if (offset == RADIX_TREE_MAP_SIZE) /* no suitable slot left in this node */
				goto restart; /* retry from the root with the advanced index; returns NULL there once index > maxindex */

			child = rcu_dereference_raw(node->slots[offset]);
		}

		if (!child)
			goto restart;

		if (child == RADIX_TREE_RETRY)
			break;

	} while (radix_tree_is_internal_node(child));

	/* Update the iterator state */
	/* A matching slot was found: record its position in the iterator. */
	iter->index = (index &~ node_maxindex(node)) | (offset << node->shift); /* index of the found slot within node */
	iter->next_index = (index | node_maxindex(node)) + 1; /* one past the last index reachable through node (per node_maxindex()) */
	iter->node = node; /* node that holds the found slot */
	__set_iter_shift(iter, node->shift);

	if (flags & RADIX_TREE_ITER_TAGGED)  /* lookup tagged slots */
		set_iter_tags(iter, node, offset, tag); /* fill in iter->tags */

	return node->slots + offset; /* address of the found slot (not the item itself) */
}

 /**
 * radix_tree_next_slot - find next slot in chunk
 *
 * @slot:	pointer to current slot
 * @iter:	pointer to iterator state
 * @flags:	RADIX_TREE_ITER_*, should be constant
 * Returns:	pointer to next slot, or NULL if there no more left
 *
 * This function updates @iter->index in the case of a successful lookup.
 * For tagged lookup it also eats @iter->tags.
 *
 * There are several cases where 'slot' can be passed in as NULL to this
 * function.  These cases result from the use of radix_tree_iter_resume() or
 * radix_tree_iter_retry().  In these cases we don't end up dereferencing
 * 'slot' because either:
 * a) we are doing tagged iteration and iter->tags has been set to 0, or
 * b) we are doing non-tagged iteration, and iter->index and iter->next_index
 *    have been set up so that radix_tree_chunk_size() returns 1 or 0.
 */
static __always_inline void __rcu **radix_tree_next_slot(void __rcu **slot, struct radix_tree_iter *iter, unsigned flags)
{
	if (flags & RADIX_TREE_ITER_TAGGED) {
		unsigned step;

		/* Drop the bit of the current slot; bit 0 now describes the next one. */
		iter->tags >>= 1;
		if (unlikely(!iter->tags))
			return NULL;	/* no tagged slot left in this chunk */

		if (likely(iter->tags & 1ul)) {
			/* The very next slot is tagged. */
			step = 1;
		} else if (flags & RADIX_TREE_ITER_CONTIG) {
			/* Contiguous iteration stops at the first untagged slot. */
			return NULL;
		} else {
			/* Jump over the untagged slots to the next set tag bit. */
			step = __ffs(iter->tags);
			iter->tags >>= step;
			step++;
		}

		iter->index = __radix_tree_iter_add(iter, step);
		slot += step;
	} else {
		long remaining = radix_tree_chunk_size(iter);

		/* Untagged iteration: examine the remaining slots one by one. */
		for (;;) {
			if (--remaining <= 0)
				return NULL;

			slot++;
			iter->index = __radix_tree_iter_add(iter, 1);

			if (likely(*slot))
				break;

			if (flags & RADIX_TREE_ITER_CONTIG) {
				/* forbid switching to the next chunk */
				iter->next_index = 0;
				return NULL;
			}
		}
	}

	/* A candidate slot was found; internal entries need the slow path. */
	if (unlikely(radix_tree_is_internal_node(rcu_dereference_raw(*slot))))
		return __radix_tree_next_slot(slot, iter, flags);
	return slot;
}

感兴趣的读者可以尝试手动推演:将下面的数据依次插入基数树时,数据结构是如何一步步变化的。

相关推荐
wdxylb6 小时前
云原生俱乐部-shell知识点归纳(1)
linux·云原生
飞雪20077 小时前
Alibaba Cloud Linux 3 在 Apple M 芯片 Mac 的 VMware Fusion 上部署的完整密码重置教程(二)
linux·macos·阿里云·vmware·虚拟机·aliyun·alibaba cloud
路溪非溪7 小时前
关于Linux内核中头文件问题相关总结
linux
Lovyk9 小时前
Linux 正则表达式
linux·运维
Fireworkitte10 小时前
Ubuntu、CentOS、AlmaLinux 9.5的 rc.local实现 开机启动
linux·ubuntu·centos
sword devil90011 小时前
ubuntu常见问题汇总
linux·ubuntu
ac.char11 小时前
在CentOS系统中查询已删除但仍占用磁盘空间的文件
linux·运维·centos
淮北也生橘1213 小时前
Linux的ALSA音频框架学习笔记
linux·笔记·学习
华强笔记16 小时前
Linux内存管理系统性总结
linux·运维·网络
十五年专注C++开发16 小时前
CMake进阶: CMake Modules---简化CMake配置的利器
linux·c++·windows·cmake·自动化构建