文章目录
- [1. 前言](#1. 前言)
- [2. buddy 分配流程](#2. buddy 分配流程)
  - [2.1 快速分配路径](#2.1 快速分配路径)
  - [2.2 慢速分配路径](#2.2 慢速分配路径)
上一篇: Linux 内存管理 (4):buddy 管理系统的建立
1. 前言
限于作者能力水平,本文可能存在谬误,因此而给读者带来的损失,作者不做任何承诺。
2. buddy 分配流程
头文件 include/linux/gfp.h 导出了多个从 buddy 分配器分配页面的接口:

```c
static inline struct page *
__alloc_pages(gfp_t gfp_mask, unsigned int order, int preferred_nid)
{
return __alloc_pages_nodemask(gfp_mask, order, preferred_nid, NULL);
}
/*
* Allocate pages, preferring the node given as nid. The node must be valid and
* online. For more general interface, see alloc_pages_node().
*/
static inline struct page *
__alloc_pages_node(int nid, gfp_t gfp_mask, unsigned int order)
{
VM_BUG_ON(nid < 0 || nid >= MAX_NUMNODES);
VM_WARN_ON(!node_online(nid));
return __alloc_pages(gfp_mask, order, nid);
}
/*
* Allocate pages, preferring the node given as nid. When nid == NUMA_NO_NODE,
* prefer the current CPU's closest node. Otherwise node must be valid and
* online.
*/
static inline struct page *alloc_pages_node(int nid, gfp_t gfp_mask,
unsigned int order)
{
if (nid == NUMA_NO_NODE)
nid = numa_mem_id();
return __alloc_pages_node(nid, gfp_mask, order);
}
#ifdef CONFIG_NUMA
extern struct page *alloc_pages_current(gfp_t gfp_mask, unsigned order);
static inline struct page *
alloc_pages(gfp_t gfp_mask, unsigned int order)
{
return alloc_pages_current(gfp_mask, order);
}
extern struct page *alloc_pages_vma(gfp_t gfp_mask, int order,
struct vm_area_struct *vma, unsigned long addr,
int node, bool hugepage);
#define alloc_hugepage_vma(gfp_mask, vma, addr, order) \
alloc_pages_vma(gfp_mask, order, vma, addr, numa_node_id(), true)
#else
#define alloc_pages(gfp_mask, order) \
alloc_pages_node(numa_node_id(), gfp_mask, order)
#define alloc_pages_vma(gfp_mask, order, vma, addr, node, false)\
alloc_pages(gfp_mask, order)
#define alloc_hugepage_vma(gfp_mask, vma, addr, order) \
alloc_pages(gfp_mask, order)
#endif
#define alloc_page(gfp_mask) alloc_pages(gfp_mask, 0)
#define alloc_page_vma(gfp_mask, vma, addr) \
alloc_pages_vma(gfp_mask, 0, vma, addr, numa_node_id(), false)
#define alloc_page_vma_node(gfp_mask, vma, addr, node) \
alloc_pages_vma(gfp_mask, 0, vma, addr, node, false)
extern unsigned long __get_free_pages(gfp_t gfp_mask, unsigned int order);
extern unsigned long get_zeroed_page(gfp_t gfp_mask);
#define __get_free_page(gfp_mask) \
__get_free_pages((gfp_mask), 0)
#define __get_dma_pages(gfp_mask, order) \
__get_free_pages((gfp_mask) | GFP_DMA, (order))
```
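这些接口的典型用法,可以用下面一段假设性的演示模块来说明(模块名、变量名均为虚构,仅作示意,并非内核源码的一部分),重点在于分配与释放接口的配对:

```c
#include <linux/module.h>
#include <linux/gfp.h>
#include <linux/mm.h>

static struct page *demo_pages;		/* 记录 alloc_pages() 分配的首页 */
static unsigned long demo_vaddr;	/* 记录 __get_free_pages() 返回的虚拟地址 */

static int __init buddy_demo_init(void)
{
	/* 分配 2^2 = 4 个物理连续页面, 返回首页的 struct page 指针 */
	demo_pages = alloc_pages(GFP_KERNEL, 2);
	if (!demo_pages)
		return -ENOMEM;

	/* 分配单个页面, 直接返回内核线性映射区的虚拟地址 */
	demo_vaddr = __get_free_pages(GFP_KERNEL, 0);
	if (!demo_vaddr) {
		__free_pages(demo_pages, 2);
		return -ENOMEM;
	}

	pr_info("buddy_demo: pages at %p, vaddr at 0x%lx\n",
		page_address(demo_pages), demo_vaddr);
	return 0;
}

static void __exit buddy_demo_exit(void)
{
	free_pages(demo_vaddr, 0);	/* 与 __get_free_pages() 配对释放 */
	__free_pages(demo_pages, 2);	/* 与 alloc_pages() 配对释放 */
}

module_init(buddy_demo_init);
module_exit(buddy_demo_exit);
MODULE_LICENSE("GPL");
```

注意 __get_free_pages() 直接返回内核线性映射的虚拟地址,因此不能搭配 __GFP_HIGHMEM 使用;需要 highmem 页面时,应使用 alloc_pages() 拿到 struct page,再按需映射。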
所有这些接口,最终都会调用同一接口 __alloc_pages_nodemask(),其主要逻辑可以概括为:
- 尝试从满足分配要求的 NUMA node/zone 空闲链表进行分配,称为快速分配路径
- 如果从快速路径分配失败,则先后尝试内存回收、内存规整后再进行分配,称为慢速分配路径
下面来看细节:
```c
/*
* This is the 'heart' of the zoned buddy allocator.
*/
struct page *
__alloc_pages_nodemask(gfp_t gfp_mask, unsigned int order, int preferred_nid,
nodemask_t *nodemask)
{
struct page *page;
unsigned int alloc_flags = ALLOC_WMARK_LOW; /* 默认使用 WMARK_LOW 水准线做分配检查 */
gfp_t alloc_mask; /* The gfp_t that was actually used for allocation */
struct alloc_context ac = { };
...
/*
* 页面分配准备工作:
* 设定分配上下文 (alloc_context), allocate mask 等
*/
gfp_mask &= gfp_allowed_mask;
alloc_mask = gfp_mask;
if (!prepare_alloc_pages(gfp_mask, order, preferred_nid, nodemask, &ac, &alloc_mask, &alloc_flags))
return NULL;
finalise_ac(gfp_mask, order, &ac);
/* First allocation attempt */
/*
* 快速分配路径:
* 首先尝试从匹配分配条件的 NUMA node/zone 的当前空闲页面分配
*/
page = get_page_from_freelist(alloc_mask, order, alloc_flags, &ac);
if (likely(page)) /* 分配成功:当前空闲页面满足分配请求 */
goto out;
/*
* 当前空闲页面不满足分配请求,需做进一步
* 内存碎片整理(compact)、回收(swap)工作之后,
* 然后再次进行分配请求。
*/
/*
* Apply scoped allocation constraints. This is mainly about GFP_NOFS
* resp. GFP_NOIO which has to be inherited for all allocation requests
* from a particular context which has been marked by
* memalloc_no{fs,io}_{save,restore}.
*/
alloc_mask = current_gfp_context(gfp_mask);
ac.spread_dirty_pages = false;
/*
* Restore the original nodemask if it was potentially replaced with
* &cpuset_current_mems_allowed to optimize the fast-path attempt.
*/
if (unlikely(ac.nodemask != nodemask))
ac.nodemask = nodemask;
/*
* 慢速分配路径:
* 从快速分配路径分配失败, 接下来可能要进行 内存规整、内存回收 来满足分配要求
*/
page = __alloc_pages_slowpath(alloc_mask, order, &ac);
...
out:
...
return page;
}
```
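在分别展开两条路径之前,先用一段假设性的示意代码(函数名为虚构,非内核源码)说明 gfp_mask 对路径行为的影响:GFP_KERNEL 允许睡眠,快速路径失败后可以走完整的慢速路径;GFP_ATOMIC 不允许睡眠,慢速路径中不会做直接回收/规整:

```c
#include <linux/gfp.h>
#include <linux/mm.h>

/* 示意性代码(假设场景): 两种常见 gfp_mask 在分配路径上的差异 */
static struct page *demo_alloc(bool in_atomic_context)
{
	if (in_atomic_context)
		/*
		 * GFP_ATOMIC: 不带 __GFP_DIRECT_RECLAIM, 不允许睡眠;
		 * 慢速路径中不会做直接回收/规整, 只能唤醒 kswapd,
		 * 并凭借 __GFP_HIGH 动用更低水准线下的保留页面。
		 */
		return alloc_pages(GFP_ATOMIC, 0);

	/*
	 * GFP_KERNEL: 带 __GFP_DIRECT_RECLAIM | __GFP_KSWAPD_RECLAIM,
	 * 快速路径失败后, 可以在慢速路径中睡眠, 等待直接回收、内存规整完成。
	 */
	return alloc_pages(GFP_KERNEL, 0);
}
```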
2.1 快速分配路径
先看快速分配路径,其分配又可分为两种情形:
- 分配单个页面(order == 0),从 per-CPU 的 PCP 空闲链表分配
- 分配多个页面(order > 0),从满足分配要求的 NUMA node/zone 的 free_area 空闲链表分配
来看细节:
```c
static struct page *
get_page_from_freelist(gfp_t gfp_mask, unsigned int order, int alloc_flags,
const struct alloc_context *ac)
{
struct zoneref *z = ac->preferred_zoneref;
struct zone *zone;
struct pglist_data *last_pgdat_dirty_limit = NULL;
/*
* Scan zonelist, looking for a zone with enough free.
* See also __cpuset_node_allowed() comment in kernel/cpuset.c.
*/
for_next_zone_zonelist_nodemask(zone, z, ac->zonelist, ac->high_zoneidx,
ac->nodemask) {
struct page *page;
unsigned long mark;
...
/* 检测内存区域 @zone 的页面数是否触及了 @alloc_flags 设定的水准线 */
mark = zone->watermark[alloc_flags & ALLOC_WMARK_MASK];
if (!zone_watermark_fast(zone, order, mark, /* 内存区域 @zone 的页面数在设定的水准线之下, NUMA 尝试回收 @zone 的页面 */
ac_classzone_idx(ac), alloc_flags)) {
int ret;
...
/*
* 分配请求不要求对内存区域 @zone 做水准线检测:
* 即使页面数在水准线之下, 仍然尝试进行分配.
*/
if (alloc_flags & ALLOC_NO_WATERMARKS)
goto try_this_zone;
/*
* 检测是否要对 内存区域 @zone 进行内存回收工作:
* NUMA 会存在不同情形, UMA 总是不进行回收工作.
*/
if (node_reclaim_mode == 0 ||
!zone_allows_reclaim(ac->preferred_zoneref->zone, zone))
continue; /* 内存区域 @zone 没有足够页面, 也不支持页面回收, 只能尝试下一个内存区域(struct zone) */
/* 内存区域 @zone 内存回收(仅 NUMA) */
ret = node_reclaim(zone->zone_pgdat, gfp_mask, order);
...
}
try_this_zone:
/* 从内存区域 @zone 的 ac->migratetype 类型页面空闲列表 分配一个页面 */
page = rmqueue(ac->preferred_zoneref->zone, zone, order,
gfp_mask, alloc_flags, ac->migratetype);
if (page) { /* 分配页面成功 */
prep_new_page(page, order, gfp_mask, alloc_flags);
/*
* If this is a high-order atomic allocation then check
* if the pageblock should be reserved for the future
*/
if (unlikely(order && (alloc_flags & ALLOC_HARDER)))
reserve_highatomic_pageblock(page, zone, order);
return page; /* 返回分配的页面 */
}
}
return NULL; /* 分配失败 */
}
/*
* Allocate a page from the given zone. Use pcplists for order-0 allocations.
*/
static inline
struct page *rmqueue(struct zone *preferred_zone,
struct zone *zone, unsigned int order,
gfp_t gfp_flags, unsigned int alloc_flags,
int migratetype)
{
unsigned long flags;
struct page *page;
if (likely(order == 0)) { /* 【单个页面】从 per-cpu 的 PCP 列表分配 */
page = rmqueue_pcplist(preferred_zone, zone, order,
gfp_flags, migratetype);
goto out;
}
/* 【非单个页面】从 zone 的 free_area 空闲列表分配 */
...
spin_lock_irqsave(&zone->lock, flags);
do {
page = NULL;
if (alloc_flags & ALLOC_HARDER) {
page = __rmqueue_smallest(zone, order, MIGRATE_HIGHATOMIC);
...
}
if (!page) /* 分配失败, 尝试 fallback 分配 */
page = __rmqueue(zone, order, migratetype);
} while (page && check_new_pages(page, order));
spin_unlock(&zone->lock);
if (!page)
goto failed;
...
local_irq_restore(flags);
out:
...
return page; /* 分配成功,返回分配页面 */
failed:
local_irq_restore(flags);
return NULL; /* 分配失败 */
}
/*
* Go through the free lists for the given migratetype and remove
* the smallest available page from the freelists
*/
/* 从 [order -> MAX_ORDER] 分配 */
static inline
struct page *__rmqueue_smallest(struct zone *zone, unsigned int order,
int migratetype)
{
unsigned int current_order;
struct free_area *area;
struct page *page;
/* Find a page of the appropriate size in the preferred list */
for (current_order = order; current_order < MAX_ORDER; ++current_order) {
area = &(zone->free_area[current_order]);
page = list_first_entry_or_null(&area->free_list[migratetype],
struct page, lru);
if (!page)
continue;
list_del(&page->lru); /* 将分配的 page 从空闲列表移除 */
rmv_page_order(page);
area->nr_free--;
/* 当从更高 order 空闲列表分配时, 要把剩余的 page 放到合适 order 空闲列表 */
expand(zone, page, order, current_order, area, migratetype);
set_pcppage_migratetype(page, migratetype);
return page;
}
return NULL;
}
/*
* Do the hard work of removing an element from the buddy allocator.
* Call me with the zone->lock already held.
*/
static struct page *__rmqueue(struct zone *zone, unsigned int order,
int migratetype)
{
struct page *page;
retry:
page = __rmqueue_smallest(zone, order, migratetype); /* 从 [order -> MAX_ORDER] 分配 */
if (unlikely(!page)) { /* 分配失败, 尝试从 CMA 或 migratetype fallback 分配 */
if (migratetype == MIGRATE_MOVABLE) /* 尝试从 CMA 分配 */
page = __rmqueue_cma_fallback(zone, order);
/*
* fallback 分配:
* 找一个合适的 fallback migratetype(兼容请求的 migratetype), 然后尝试分配。
*/
if (!page && __rmqueue_fallback(zone, order, migratetype))
goto retry;
}
trace_mm_page_alloc_zone_locked(page, order, migratetype);
return page;
}
#ifdef CONFIG_CMA
static struct page *__rmqueue_cma_fallback(struct zone *zone,
unsigned int order)
{
return __rmqueue_smallest(zone, order, MIGRATE_CMA);
}
#else
static inline struct page *__rmqueue_cma_fallback(struct zone *zone,
unsigned int order) { return NULL; }
#endif
/*
* Try finding a free buddy page on the fallback list and put it on the free
* list of requested migratetype, possibly along with other pages from the same
* block, depending on fragmentation avoidance heuristics. Returns true if
* fallback was found so that __rmqueue_smallest() can grab it.
*
* The use of signed ints for order and current_order is a deliberate
* deviation from the rest of this file, to make the for loop
* condition simpler.
*/
/*
* fallback 分配:
* 找一个合适的 fallback migratetype(兼容请求的 migratetype), 然后尝试分配。
*/
static inline bool
__rmqueue_fallback(struct zone *zone, int order, int start_migratetype)
{
struct free_area *area;
int current_order;
struct page *page;
int fallback_mt;
bool can_steal;
/*
* Find the largest available free page in the other list. This roughly
* approximates finding the pageblock with the most free pages, which
* would be too costly to do exactly.
*/
for (current_order = MAX_ORDER - 1; current_order >= order;
--current_order) {
area = &(zone->free_area[current_order]);
fallback_mt = find_suitable_fallback(area, current_order,
start_migratetype, false, &can_steal);
if (fallback_mt == -1)
continue;
/*
* We cannot steal all free pages from the pageblock and the
* requested migratetype is movable. In that case it's better to
* steal and split the smallest available page instead of the
* largest available page, because even if the next movable
* allocation falls back into a different pageblock than this
* one, it won't cause permanent fragmentation.
*/
if (!can_steal && start_migratetype == MIGRATE_MOVABLE
&& current_order > order)
goto find_smallest;
goto do_steal;
}
return false;
find_smallest:
for (current_order = order; current_order < MAX_ORDER;
current_order++) {
area = &(zone->free_area[current_order]);
fallback_mt = find_suitable_fallback(area, current_order,
start_migratetype, false, &can_steal);
if (fallback_mt != -1)
break;
}
...
do_steal:
page = list_first_entry(&area->free_list[fallback_mt],
struct page, lru);
steal_suitable_fallback(zone, page, start_migratetype, can_steal);
...
return true;
}
```
快速分配路径的分析到此为止,不再做进一步深入,其中 fallback 的规则详见 fallbacks[] 数组的定义:

```c
/*
* This array describes the order lists are fallen back to when
* the free lists for the desirable migrate type are depleted
*/
static int fallbacks[MIGRATE_TYPES][4] = {
[MIGRATE_UNMOVABLE] = { MIGRATE_RECLAIMABLE, MIGRATE_MOVABLE, MIGRATE_TYPES },
[MIGRATE_RECLAIMABLE] = { MIGRATE_UNMOVABLE, MIGRATE_MOVABLE, MIGRATE_TYPES },
[MIGRATE_MOVABLE] = { MIGRATE_RECLAIMABLE, MIGRATE_UNMOVABLE, MIGRATE_TYPES },
#ifdef CONFIG_CMA
[MIGRATE_CMA] = { MIGRATE_TYPES }, /* Never used */
#endif
#ifdef CONFIG_MEMORY_ISOLATION
[MIGRATE_ISOLATE] = { MIGRATE_TYPES }, /* Never used */
#endif
};
```
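为便于理解上表的用法,下面给出一段示意性代码(假设实现,函数名为虚构,并非内核中 find_suitable_fallback() 的原文):按 fallbacks[start_migratetype][] 的顺序,查找第一个空闲链表非空的 migratetype,以 MIGRATE_TYPES 作为结束标记;内核的实际实现还会结合 can_steal 等反碎片化的启发式判断:

```c
/* 示意性代码(假设实现): 按 fallbacks[] 的优先级挑选 fallback migratetype */
static int demo_pick_fallback(struct free_area *area, int start_migratetype)
{
	int i;

	for (i = 0; ; i++) {
		int fallback_mt = fallbacks[start_migratetype][i];

		if (fallback_mt == MIGRATE_TYPES)	/* 数组以 MIGRATE_TYPES 结尾: 没有更多候选 */
			return -1;

		if (!list_empty(&area->free_list[fallback_mt]))
			return fallback_mt;		/* 该类型的空闲链表上有页面可供偷取 */
	}
}
```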
2.2 慢速分配路径
慢速分配路径可以概括为:
- 调整分配标志和 zone 列表,尝试分配
- 如果调整后分配仍然失败,进行直接内存回收,然后再次尝试分配
- 如果内存回收后分配仍然失败,则进行内存规整(compact),然后再次尝试分配
- 如果内存规整后分配还是失败,则触发 OOM-killer,之后再次尝试分配(调用者可以通过 gfp 修饰符影响这些重试行为,见下面的示意代码)
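下面是一段假设性的示意代码(函数名为虚构),说明调用者如何通过 __GFP_NORETRY 这类修饰符收紧慢速路径的重试行为:

```c
#include <linux/gfp.h>
#include <linux/mm.h>

/*
 * 示意性代码(假设场景): 高阶分配优先尝试 __GFP_NORETRY,
 * 即回收/规整一轮失败后就放弃, 不触发 OOM-killer,
 * 适合调用者自己有降级方案的情况。
 */
static struct page *demo_try_order3(void)
{
	struct page *page;

	/* 先尝试一次 2^3 = 8 页的连续分配, 失败也不惊动 OOM-killer */
	page = alloc_pages(GFP_KERNEL | __GFP_NORETRY, 3);
	if (page)
		return page;

	/* 降级为单页分配(仅为示意, 实际代码需要自行管理多次单页分配) */
	return alloc_pages(GFP_KERNEL, 0);
}
```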
来看细节:
```c
static inline struct page *
__alloc_pages_slowpath(gfp_t gfp_mask, unsigned int order,
struct alloc_context *ac)
{
...
if (gfp_mask & __GFP_KSWAPD_RECLAIM)
wake_all_kswapds(order, ac); /* 唤醒 每 NUMA 内存节点 的 kswapd,进行内存回收 */
/*
* The adjusted alloc_flags might result in immediate success, so try
* that first
*/
/* 按调整后的 分配标志,再次尝试分配 */
page = get_page_from_freelist(gfp_mask, order, alloc_flags, ac);
if (page)
goto got_pg;
...
retry:
/* Ensure kswapd doesn't accidentally go to sleep as long as we loop */
if (gfp_mask & __GFP_KSWAPD_RECLAIM)
wake_all_kswapds(order, ac);
...
/* Attempt with potentially adjusted zonelist and alloc_flags */
/* 按调整后的 分配标志 和 zone 列表,再次尝试分配 */
page = get_page_from_freelist(gfp_mask, order, alloc_flags, ac);
if (page)
goto got_pg;
/* Try direct reclaim and then allocating */
/*
* 直接进行内存回收,并等待回收操作完成后,再次尝试分配.
* 这不同于唤醒 kswapd 的间接回收,时间上更为确定.
*/
page = __alloc_pages_direct_reclaim(gfp_mask, order, alloc_flags, ac,
&did_some_progress);
if (page)
goto got_pg;
/* Try direct compaction and then allocating */
/* 内存页面回收分配失败,进行内存规整后再尝试分配 */
page = __alloc_pages_direct_compact(gfp_mask, order, alloc_flags, ac,
compact_priority, &compact_result);
if (page)
goto got_pg;
...
/* Reclaim has failed us, start killing things */
/* 触发 OOM-killer 再次尝试分配 */
page = __alloc_pages_may_oom(gfp_mask, order, ac, &did_some_progress);
if (page)
goto got_pg;
...
got_pg:
return page; /* 分配完成,返回 分配页面 或 NULL */
}
```
本文对于慢速分配路径的分析到此为止,期间可能触发的内存回收(shrink)、内存规整(compact)、OOM-killer 过程,限于篇幅,将不做展开。