linux内存管理-页面回收之内核线程 kswapd (二)

上面章节已经把kswapd内核线程的整体逻辑梳理出来了,本节将介绍其中涉及的各个细节函数。

1、balance_pgdat函数

balance_pgdat函数是回收页面的主函数。这个函数比较长,查看时可以先看整体框架。

复制代码

/*
 * balance_pgdat - reclaim pages on one node until it is balanced for the
 * requested order or the scan priority has been exhausted.
 * @pgdat: node to reclaim from
 * @order: allocation order kswapd is balancing for
 * @classzone_idx: in: zone index used for the watermark / balance checks;
 *                 out: overwritten with end_zone, the highest zone index
 *                 this call decided to scan
 *
 * Returns the order that was being reclaimed for when the function left the
 * loop; this is 0 if the high-order target was abandoned along the way.
 */
static unsigned long balance_pgdat(pg_data_t *pgdat, int order/* order of the contiguous allocation being balanced for */,
			int *classzone_idx/* in: zone index of the request; out: highest zone index scanned */)
{
	int i;
	/* Highest zone index that has to be scanned in this pass */
	int end_zone = 0;	/* Inclusive.  0 = ZONE_DMA */
	unsigned long nr_soft_reclaimed;
	unsigned long nr_soft_scanned;

	/* Control parameters and result counters for this reclaim run */
	struct scan_control sc = {
		.gfp_mask = GFP_KERNEL,
		.order = order, /* target allocation order */
		.priority = DEF_PRIORITY, /* scan priority; it is decremented below when reclaim makes too little progress */
		.may_writepage = !laptop_mode, /* writeback is initially disabled in laptop mode */
		.may_unmap = 1, /* mapped pages may be unmapped and reclaimed */
		.may_swap = 1,  /* pages may be swapped out */
	};
	count_vm_event(PAGEOUTRUN);

	do {
		/*
		 * Sum of the reclaim targets accumulated by
		 * kswapd_shrink_zone() during this pass; compared with
		 * sc.nr_reclaimed before deciding to compact.
		 */
		unsigned long nr_attempted = 0;
		/* Whether sc.priority should be decremented at the end of this pass */
		bool raise_priority = true;
		/* Compaction is only considered for order > 0 requests */
		bool pgdat_needs_compaction = (order > 0);

		/* Restart the reclaimed-page count for this pass */
		sc.nr_reclaimed = 0;

		/*
		 * Scan in the highmem->dma direction for the highest
		 * zone which needs scanning
		 */
		/*
		 * Pass 1: determine end_zone by walking the zones from the
		 * highest index down to 0.
		 */

		for (i = pgdat->nr_zones - 1; i >= 0; i--) {
			struct zone *zone = pgdat->node_zones + i;

			if (!populated_zone(zone)) /* skip unpopulated zones */
				continue;

			/*
			 * Once the priority has moved away from DEF_PRIORITY,
			 * skip zones that zone_reclaimable() no longer
			 * considers worth scanning.
			 */
			if (sc.priority != DEF_PRIORITY &&
			    !zone_reclaimable(zone)/* zone has already been scanned too much */)
				continue;

			/*
			 * Do some background aging of the anon list, to give
			 * pages a chance to be referenced before reclaiming.
			 */
			/*
			 * age_active_anon() asks shrink_active_list() to scan
			 * SWAP_CLUSTER_MAX active anon pages when swap exists
			 * and the inactive anon list is low.
			 */
			age_active_anon(zone, &sc);

			/*
			 * If the number of buffer_heads in the machine
			 * exceeds the maximum allowed level and this node
			 * has a highmem zone, force kswapd to reclaim from
			 * it to relieve lowmem pressure.
			 */
			/*
			 * Special case: when triggered, this zone becomes
			 * end_zone and the search stops without consulting
			 * the watermarks.
			 */
			if (buffer_heads_over_limit && is_highmem_idx(i)) {
				end_zone = i;
				break;
			}

			/*
			 * zone_balanced() checks the high watermark for an
			 * order-sized allocation. The first zone (from the
			 * top) that fails the check becomes end_zone; zones
			 * that pass get their congested/dirty flags cleared.
			 */
			if (!zone_balanced(zone, order, 0/* no balance_gap */, 0)) {
				end_zone = i;
				break;
			} else {
				/*
				 * If balanced, clear the dirty and congested
				 * flags
				 */
				clear_bit(ZONE_CONGESTED, &zone->flags);
				clear_bit(ZONE_DIRTY, &zone->flags);
			}
		}

		if (i < 0) /* the loop ran to completion (i == -1): no zone needs scanning */
			goto out;

		/*
		 * Pass 2: decide whether compaction is still needed.
		 * Walks the zones from index 0 up to end_zone.
		 */
		for (i = 0; i <= end_zone; i++) {
			struct zone *zone = pgdat->node_zones + i;

			if (!populated_zone(zone))
				continue;

			/*
			 * If any zone is currently balanced then kswapd will
			 * not call compaction as it is expected that the
			 * necessary pages are already available.
			 */
			/*
			 * "Balanced" here means the zone passes
			 * zone_watermark_ok() against its low watermark for
			 * this order; one such zone is enough to drop the
			 * compaction request.
			 */
			if (pgdat_needs_compaction &&
					zone_watermark_ok(zone, order,
						low_wmark_pages(zone),
						*classzone_idx, 0))
				pgdat_needs_compaction = false;
		}

		/*
		 * If we're getting trouble reclaiming, start doing writepage
		 * even in laptop mode.
		 */
		/*
		 * After more than two priority decrements, writeback is
		 * allowed regardless of laptop_mode.
		 */
		if (sc.priority < DEF_PRIORITY - 2)
			sc.may_writepage = 1; /* under pressure: allow dirty pages to be written back */

		/*
		 * Now scan the zone in the dma->highmem direction, stopping
		 * at the last zone which needs scanning.
		 *
		 * We do this because the page allocator works in the opposite
		 * direction.  This prevents the page allocator from allocating
		 * pages behind kswapd's direction of progress, which would
		 * cause too much scanning of the lower zones.
		 */
		/*
		 * Pass 3: perform the actual reclaim on every zone from
		 * index 0 up to and including end_zone.
		 */
		for (i = 0; i <= end_zone; i++) {
			struct zone *zone = pgdat->node_zones + i;

			if (!populated_zone(zone))
				continue;

			if (sc.priority != DEF_PRIORITY &&
			    !zone_reclaimable(zone))
				continue;

			/* Reset the scanned-page count before working on this zone */
			sc.nr_scanned = 0;

			nr_soft_scanned = 0;
			/*
			 * Call soft limit reclaim before calling shrink_zone.
			 */
			/* Pages reclaimed by the soft-limit pass count towards sc.nr_reclaimed */
			nr_soft_reclaimed = mem_cgroup_soft_limit_reclaim(zone,
							order, sc.gfp_mask,
							&nr_soft_scanned);
			sc.nr_reclaimed += nr_soft_reclaimed;

			/*
			 * There should be no need to raise the scanning
			 * priority if enough pages are already being scanned
			 * that the high watermark would be met at 100%
			 * efficiency.
			 */
			/*
			 * kswapd_shrink_zone() returns true when the zone was
			 * already balanced or when it scanned at least its
			 * reclaim target; either way the priority need not be
			 * raised because of this zone.
			 */
			if (kswapd_shrink_zone(zone, end_zone,
					       &sc, &nr_attempted))
				raise_priority = false;
		}

		/*
		 * If the low watermark is met there is no need for processes
		 * to be throttled on pfmemalloc_wait as they should now be
		 * able to safely make forward progress. Wake them
		 */
		/*
		 * Only wake waiters if some task is actually queued on
		 * pfmemalloc_wait and pfmemalloc_watermark_ok() reports the
		 * node as OK.
		 */
		if (waitqueue_active(&pgdat->pfmemalloc_wait) &&
				pfmemalloc_watermark_ok(pgdat))
			wake_up_all(&pgdat->pfmemalloc_wait); /* let all throttled tasks continue */

		/*
		 * Fragmentation may mean that the system cannot be rebalanced
		 * for high-order allocations in all zones. If twice the
		 * allocation size has been reclaimed and the zones are still
		 * not balanced then recheck the watermarks at order-0 to
		 * prevent kswapd reclaiming excessively. Assume that a
		 * process requested a high-order can direct reclaim/compact.
		 */
		/*
		 * 2UL << order is 2^(order+1) pages, i.e. twice the size of
		 * the requested block.
		 */
		if (order && sc.nr_reclaimed >= 2UL << order)
			order = sc.order = 0; /* give up the high-order target and balance for single pages only */

		/* Exit check */
		/* Check if kswapd should be suspending */
		if (try_to_freeze() || kthread_should_stop())
			break;

		/*
		 * Compact if necessary and kswapd is reclaiming at least the
		 * high watermark number of pages as requested
		 */
		/*
		 * Compaction runs only if pass 2 left the flag set and more
		 * pages were reclaimed than the accumulated reclaim targets.
		 */
		if (pgdat_needs_compaction && sc.nr_reclaimed > nr_attempted)
			compact_pgdat(pgdat, order);

		/*
		 * Raise priority if scanning rate is too low or there was no
		 * progress in reclaiming pages
		 */
		/* A numerically smaller sc.priority is the "raised" priority */
		if (raise_priority || !sc.nr_reclaimed)
			sc.priority--;
	} while (sc.priority >= 1 && !pgdat_balanced(pgdat, order, *classzone_idx));
		/* The loop ends once sc.priority drops below 1 or pgdat_balanced() reports the node balanced for this order */
out:
	/*
	 * Return the order we were reclaiming at so prepare_kswapd_sleep()
	 * makes a decision on the order we were last reclaiming at. However,
	 * if another caller entered the allocator slow path while kswapd
	 * was awake, order will remain at the higher level
	 */
	/* Report the highest zone index chosen for scanning back to the caller */
	*classzone_idx = end_zone;
	return order; /* possibly lowered to 0 by the fragmentation check above */
}

整体循环的退出条件，当回收优先级降至1以下,或整个节点的内存已满足order阶分配需求时，循环结束。

复制代码

/*
 * pgdat_balanced - decide whether a node counts as balanced for @order.
 * @pgdat: node to inspect
 * @order: allocation order being balanced for
 * @classzone_idx: highest zone index to include in the check
 *
 * For order-0 every populated, reclaimable zone up to @classzone_idx must
 * pass zone_balanced(). For higher orders the node is balanced as soon as
 * the zones that pass hold at least a quarter of the managed pages.
 */
static bool pgdat_balanced(pg_data_t *pgdat, int order, int classzone_idx)
{
	unsigned long total = 0;	/* managed pages of all populated zones seen */
	unsigned long ok = 0;		/* managed pages of zones counted as balanced */
	int idx;

	/* Check the watermark levels */
	for (idx = 0; idx <= classzone_idx; idx++) {
		struct zone *z = &pgdat->node_zones[idx];

		if (populated_zone(z)) {
			total += z->managed_pages;

			/*
			 * balance_pgdat() skips zones that are not
			 * reclaimable once the priority has left
			 * DEF_PRIORITY, which effectively treats them as
			 * balanced, so they are counted as balanced here
			 * too without consulting the watermarks.
			 */
			if (!zone_reclaimable(z) ||
			    zone_balanced(z, order, 0, idx))
				ok += z->managed_pages;
			else if (order == 0)
				return false;	/* order-0: a single unbalanced zone fails the node */
		}
	}

	/* order-0 got here only if every zone passed; otherwise apply the 25% rule */
	return order == 0 || ok >= total / 4;
}

pgdat_balanced()需要注意参数classzone_idx，它表示在页面分配路径上计算出来第一个最合适内存分配的zone的编号，通过wake_all_kswapds()传递下来。

复制代码

判断zone是否还值得继续回收的标准是:已扫描的页面数是否小于可回收页面总数(文件页+有swap时的匿名页)的6倍。如果zone中的页面已经被扫描了足够多次(达到或超过该阈值),再次扫描的意义就不大了。
/*
 * zone_reclaimable - is it still worth scanning this zone?
 *
 * Returns true while the NR_PAGES_SCANNED counter of the zone is below six
 * times zone_reclaimable_pages(); returns false once the zone has been
 * scanned that much, at which point callers treat it as not reclaimable.
 */
bool zone_reclaimable(struct zone *zone)
{
	/* Value of the zone's NR_PAGES_SCANNED counter */
	unsigned long scanned = zone_page_state(zone, NR_PAGES_SCANNED);
	/* Scan budget: six times the pages that could currently be reclaimed */
	unsigned long budget = zone_reclaimable_pages(zone) * 6;

	return scanned < budget;
}

/*
 * zone_reclaimable_pages - count the pages of a zone that could be reclaimed.
 *
 * File pages (active + inactive) are always counted. Anonymous pages are
 * only counted while get_nr_swap_pages() reports free swap.
 *
 * The accumulator is unsigned long to match both the return type and the
 * per-zone counters; accumulating in an int would truncate large sums and
 * could turn them negative before the implicit conversion on return.
 */
static unsigned long zone_reclaimable_pages(struct zone *zone)
{
	unsigned long nr;

	/* Active plus inactive file pages */
	nr = zone_page_state(zone, NR_ACTIVE_FILE) +
	     zone_page_state(zone, NR_INACTIVE_FILE);

	/*
	 * Anonymous pages have no backing file, so they are only counted as
	 * reclaimable when swap space is available to receive them.
	 */
	if (get_nr_swap_pages() > 0) /* free swap pages */
		nr += zone_page_state(zone, NR_ACTIVE_ANON) +
		      zone_page_state(zone, NR_INACTIVE_ANON); /* active plus inactive anon pages */

	return nr;
}

注意,zone中可回收页面的统计对匿名页面的处理比较特殊:如果系统中没有可用的swap分区/文件,匿名页面就无法被换出回收,因此这种情况下匿名页面不会被纳入可回收页面的统计范围。

/*
 * age_active_anon - background aging of the active anonymous list.
 *
 * Does nothing when no swap is configured. Otherwise walks every memory
 * cgroup and, for each lruvec of @zone whose inactive anon list is low,
 * asks shrink_active_list() to scan SWAP_CLUSTER_MAX pages of the active
 * anon list.
 */
static void age_active_anon(struct zone *zone, struct scan_control *sc)
{
	struct mem_cgroup *cg;

	/* Without any swap space there is no point in aging anon pages */
	if (total_swap_pages == 0)
		return;

	cg = mem_cgroup_iter(NULL, NULL, NULL);
	for (;;) {
		struct lruvec *vec = mem_cgroup_zone_lruvec(zone, cg);

		/* Only touch the active list when the inactive one is low */
		if (inactive_anon_is_low(vec)) {
			shrink_active_list(SWAP_CLUSTER_MAX, vec, sc,
					   LRU_ACTIVE_ANON);
		}

		/* The body runs once even if the first iterator result is NULL */
		cg = mem_cgroup_iter(NULL, cg, NULL);
		if (!cg)
			break;
	}
}

判断zone中非活跃匿名页面数量和活跃匿名页面数量之间的关系:当"非活跃匿名页面数量 × inactive_ratio < 活跃匿名页面数量"时,认为inactive is low。

这时可以将活跃链表中近期未被访问的页面放到非活跃链表上,为后续的回收做准备。

复制代码

/**
 * inactive_anon_is_low - check if anonymous pages need to be deactivated
 * @lruvec: LRU vector to check
 *
 * Returns non-zero if there are not enough inactive anon pages, meaning
 * some active anon pages need to be deactivated. Always returns 0 when no
 * swap space is configured.
 */
static int inactive_anon_is_low(struct lruvec *lruvec)
{
	/* Deactivating anon pages is pointless without swap space */
	if (total_swap_pages == 0)
		return 0;

	/* Use the per-cgroup check unless memory cgroups are disabled */
	return mem_cgroup_disabled()
		? inactive_anon_is_low_global(lruvec_zone(lruvec))
		: mem_cgroup_inactive_anon_is_low(lruvec);
}

/*
 * inactive_anon_is_low_global - zone-wide inactive/active anon ratio check.
 *
 * Returns 1 when the inactive anon count multiplied by the zone's
 * inactive_ratio is still smaller than the active anon count, 0 otherwise.
 */
static int inactive_anon_is_low_global(struct zone *zone)
{
	const unsigned long nr_active = zone_page_state(zone, NR_ACTIVE_ANON);
	const unsigned long nr_inactive = zone_page_state(zone, NR_INACTIVE_ANON);

	/* The comparison itself yields the 1/0 result */
	return nr_inactive * zone->inactive_ratio < nr_active;
}
shrink_active_list函数在后面部分进行详细讲解。

如何评判一个zone是否到达平衡即其内部页面数量充足,能够进行页面的分配?即判断在分配2^order个页面后是否还满足高水位值+balance_gap。

复制代码

/*
 * zone_balanced - is @zone balanced for an allocation of @order?
 * @balance_gap: extra pages required on top of the high watermark
 * @classzone_idx: zone index passed through to the watermark and
 *                 compaction checks
 *
 * The zone must pass zone_watermark_ok_safe() against high watermark plus
 * @balance_gap. For order > 0 with CONFIG_COMPACTION enabled it must
 * additionally not be reported as COMPACT_SKIPPED by compaction_suitable().
 */
static bool zone_balanced(struct zone *zone, int order,unsigned long balance_gap, int classzone_idx)
{
	unsigned long mark = high_wmark_pages(zone) + balance_gap;

	/* Watermark check for an order-sized allocation */
	if (!zone_watermark_ok_safe(zone, order, mark, classzone_idx, 0))
		return false;

	/* The compaction check only applies to high-order requests */
	if (!IS_ENABLED(CONFIG_COMPACTION) || !order)
		return true;

	return compaction_suitable(zone, order, 0, classzone_idx) != COMPACT_SKIPPED;
}

mm/page_alloc.c(zone_watermark_ok_safe)、include/linux/vmstat.h(zone_page_state_snapshot)
/*
 * zone_watermark_ok_safe - watermark check that falls back to an exact
 * free-page count when the cheap counter is close to the drift mark.
 *
 * If the zone has a percpu_drift_mark and the cheap NR_FREE_PAGES reading
 * is below it, the reading is replaced by zone_page_state_snapshot(), which
 * also folds in the per-CPU deltas.
 */
bool zone_watermark_ok_safe(struct zone *z, unsigned int order,
			unsigned long mark, int classzone_idx, int alloc_flags)
{
	/* Cheap reading of the zone's free-page counter */
	long nr_free = zone_page_state(z, NR_FREE_PAGES);
	const bool need_exact = z->percpu_drift_mark &&
				nr_free < z->percpu_drift_mark;

	if (need_exact)
		nr_free = zone_page_state_snapshot(z, NR_FREE_PAGES);

	return __zone_watermark_ok(z, order, mark, classzone_idx, alloc_flags,
				   nr_free);
}

/*
 * zone_page_state_snapshot - more accurate reading of a zone counter.
 *
 * Starts from zone->vm_stat[item] and, on SMP builds, adds the
 * vm_stat_diff[item] delta of every online CPU's pageset. A negative sum
 * is clamped to 0 on SMP builds.
 */
static inline unsigned long zone_page_state_snapshot(struct zone *zone,enum zone_stat_item item)
{
	/* Zone-wide value of the counter */
	long total = atomic_long_read(&zone->vm_stat[item]);

#ifdef CONFIG_SMP
	int cpu;

	/* Fold in each online CPU's per-CPU delta for this counter */
	for_each_online_cpu(cpu) {
		total += per_cpu_ptr(zone->pageset, cpu)->vm_stat_diff[item];
	}

	total = (total < 0) ? 0 : total;
#endif
	return total;
}

判断zone内部的水位值情况,去除需要分配的2^order个页面后,剩余可用页面和调整后水位值之间的关系。
mm/page_alloc.c
/*
 * zone_watermark_ok - watermark check based on the cheap NR_FREE_PAGES
 * reading of the zone; thin wrapper around __zone_watermark_ok().
 */
bool zone_watermark_ok(struct zone *z, unsigned int order/* allocation order */, unsigned long mark,
		      int classzone_idx, int alloc_flags)
{
	long nr_free = zone_page_state(z, NR_FREE_PAGES);

	return __zone_watermark_ok(z, order, mark, classzone_idx, alloc_flags,
				   nr_free);
}

/*
 * __zone_watermark_ok - core watermark test.
 * @z: zone to test
 * @order: allocation order
 * @mark: watermark (in pages) the zone must stay above
 * @classzone_idx: index into z->lowmem_reserve[] added to the threshold
 * @alloc_flags: ALLOC_* flags; ALLOC_HIGH and ALLOC_HARDER lower the
 *               threshold, ALLOC_CMA allows free CMA pages to be counted
 * @free_pages: free-page count to test with (supplied by the caller)
 *
 * Returns true if, after discounting the request, the free pages stay above
 * the adjusted mark plus lowmem reserve and, for each order below @order,
 * above a threshold that is halved per order.
 */
static bool __zone_watermark_ok(struct zone *z, unsigned int order,
			unsigned long mark, int classzone_idx, int alloc_flags,
			long free_pages)
{
	/* free_pages may go negative - that's OK */
	/* min starts as the watermark and is lowered by the flags below */
	long min = mark;
	int o;
	long free_cma = 0;

	/* Discount the pages of the request itself (2^order - 1) */
	free_pages -= (1 << order) - 1;

	/* ALLOC_HIGH and ALLOC_HARDER relax the threshold; both may apply */
	if (alloc_flags & ALLOC_HIGH) /* threshold is halved */
		min -= min / 2;
	if (alloc_flags & ALLOC_HARDER)
		min -= min / 4;
#ifdef CONFIG_CMA
	/* If allocation can't use CMA areas don't use free CMA pages */
	if (!(alloc_flags & ALLOC_CMA))
		free_cma = zone_page_state(z, NR_FREE_CMA_PAGES);
#endif

	/* Fail if usable free pages are at or below min + lowmem_reserve[classzone_idx] */
	if (free_pages - free_cma <= min + z->lowmem_reserve[classzone_idx])
		return false;

	for (o = 0; o < order; o++) {
		/* At the next order, this order's pages become unavailable */
		/* Blocks smaller than the request cannot satisfy it, so drop them from the count */
		free_pages -= z->free_area[o].nr_free << o;

		/* Require fewer higher order pages to be free */
		min >>= 1; /* halve the threshold for each order step */

		if (free_pages <= min) /* too few pages left in blocks of the higher orders */
			return false;
	}
	return true;
}

/*
 * kswapd_shrink_zone - reclaim from a single zone on behalf of kswapd.
 * @zone: zone to reclaim from
 * @classzone_idx: zone index passed to the balance / compaction checks
 * @sc: scan control; nr_to_reclaim is set here and nr_scanned is read back
 * @nr_attempted: in/out accumulator of the reclaim targets attempted
 *
 * Returns true if the zone was already balanced (no reclaim done) or if at
 * least sc->nr_to_reclaim pages were scanned.
 *
 * Both debug printk() calls use 0x%lx because the printed values are
 * unsigned long; the first one originally had two %x conversions for a
 * single argument.
 */
static bool kswapd_shrink_zone(struct zone *zone,
			       int classzone_idx,
			       struct scan_control *sc,
			       unsigned long *nr_attempted/* in/out: running total of attempted reclaim targets */)
{
	int testorder = sc->order;
	/* Extra margin above the high watermark used by the "already balanced" test */
	unsigned long balance_gap;
	/* Set when buffer heads are over their limit and this is a highmem zone */
	bool lowmem_pressure;

	/* Reclaim above the high watermark. */
	/*
	 * The reclaim target is the larger of SWAP_CLUSTER_MAX and the
	 * zone's high watermark.
	 */
	sc->nr_to_reclaim = max(SWAP_CLUSTER_MAX, high_wmark_pages(zone)/* high watermark */);
	printk("-----%s,sc->nr_to_reclaim:0x%lx \n",__func__,sc->nr_to_reclaim);

	/*
	 * Kswapd reclaims only single pages with compaction enabled. Trying
	 * too hard to reclaim until contiguous free pages have become
	 * available can hurt performance by evicting too much useful data
	 * from memory. Do not reclaim more than needed for compaction.
	 */
	/*
	 * When compaction is enabled, the request is high-order and
	 * compaction_suitable() does not report COMPACT_SKIPPED, the balance
	 * checks below are done at order 0 instead of sc->order.
	 */
	if (IS_ENABLED(CONFIG_COMPACTION) && sc->order &&
			compaction_suitable(zone, sc->order, 0, classzone_idx)
							!= COMPACT_SKIPPED)
		testorder = 0;

	/*
	 * We put equal pressure on every zone, unless one zone has way too
	 * many pages free already. The "too many pages" is defined as the
	 * high wmark plus a "gap" where the gap is either the low
	 * watermark or 1% of the zone, whichever is smaller.
	 */
	/*
	 * balance_gap = min(low watermark,
	 *                   managed_pages / KSWAPD_ZONE_BALANCE_GAP_RATIO,
	 *                   rounded up)
	 */
	balance_gap = min(low_wmark_pages(zone)/* low watermark */, DIV_ROUND_UP(
			zone->managed_pages, KSWAPD_ZONE_BALANCE_GAP_RATIO));

	printk("-----%s,balance_gap:0x%lx \n",__func__,balance_gap);
	/*
	 * If there is no low memory pressure or the zone is balanced then no
	 * reclaim is necessary
	 */
	/* lowmem_pressure only ever applies to highmem zones */
	lowmem_pressure = (buffer_heads_over_limit && is_highmem(zone));
	/*
	 * Without that special pressure, a zone that is already balanced at
	 * testorder with the extra balance_gap needs no reclaim at all.
	 */
	if (!lowmem_pressure && zone_balanced(zone, testorder,balance_gap, classzone_idx))
		return true;

	/*
	 * Do the actual reclaim. The third argument tells shrink_zone()
	 * whether this zone is the one identified by classzone_idx.
	 */
	shrink_zone(zone, sc, zone_idx(zone) == classzone_idx);

	/* Account for the number of pages attempted to reclaim */
	/*
	 * balance_pgdat() compares this running total with sc.nr_reclaimed
	 * when deciding whether to run compaction.
	 */
	*nr_attempted += sc->nr_to_reclaim;

	/* The ZONE_WRITEBACK flag is cleared unconditionally after the reclaim pass */
	clear_bit(ZONE_WRITEBACK, &zone->flags);

	/*
	 * If a zone reaches its high watermark, consider it to be no longer
	 * congested. It's possible there are dirty pages backed by congested
	 * BDIs but as pressure is relieved, speculatively avoid congestion
	 * waits.
	 */
	/*
	 * Only done for zones that are still reclaimable and now pass
	 * zone_balanced() at testorder without any balance_gap.
	 */
	if (zone_reclaimable(zone) &&
	    zone_balanced(zone, testorder, 0, classzone_idx)) {
		clear_bit(ZONE_CONGESTED, &zone->flags);
		clear_bit(ZONE_DIRTY, &zone->flags);
	}

	/*
	 * Report whether the scan effort reached the target: true when the
	 * number of pages scanned is at least sc->nr_to_reclaim.
	 */
	return sc->nr_scanned >= sc->nr_to_reclaim;
}