文章目录
- [1. 前言](#1. 前言)
- [2. buddy 分配流程](#2. buddy 分配流程)
  - [2.1 快速分配路径](#2.1 快速分配路径)
  - [2.2 慢速分配路径](#2.2 慢速分配路径)
上一篇: Linux 内存管理 (4):buddy 管理系统的建立
1. 前言
限于作者能力水平,本文可能存在谬误,因此而给读者带来的损失,作者不做任何承诺。
2. buddy 分配流程
头文件 include/linux/gfp.h 导出了多个从 buddy 分配器分配页面的接口:

```c
static inline struct page *
__alloc_pages(gfp_t gfp_mask, unsigned int order, int preferred_nid)
{
return __alloc_pages_nodemask(gfp_mask, order, preferred_nid, NULL);
}
/*
* Allocate pages, preferring the node given as nid. The node must be valid and
* online. For more general interface, see alloc_pages_node().
*/
static inline struct page *
__alloc_pages_node(int nid, gfp_t gfp_mask, unsigned int order)
{
VM_BUG_ON(nid < 0 || nid >= MAX_NUMNODES);
VM_WARN_ON(!node_online(nid));
return __alloc_pages(gfp_mask, order, nid);
}
/*
* Allocate pages, preferring the node given as nid. When nid == NUMA_NO_NODE,
* prefer the current CPU's closest node. Otherwise node must be valid and
* online.
*/
static inline struct page *alloc_pages_node(int nid, gfp_t gfp_mask,
unsigned int order)
{
if (nid == NUMA_NO_NODE)
nid = numa_mem_id();
return __alloc_pages_node(nid, gfp_mask, order);
}
#ifdef CONFIG_NUMA
extern struct page *alloc_pages_current(gfp_t gfp_mask, unsigned order);
static inline struct page *
alloc_pages(gfp_t gfp_mask, unsigned int order)
{
return alloc_pages_current(gfp_mask, order);
}
extern struct page *alloc_pages_vma(gfp_t gfp_mask, int order,
struct vm_area_struct *vma, unsigned long addr,
int node, bool hugepage);
#define alloc_hugepage_vma(gfp_mask, vma, addr, order) \
alloc_pages_vma(gfp_mask, order, vma, addr, numa_node_id(), true)
#else
#define alloc_pages(gfp_mask, order) \
alloc_pages_node(numa_node_id(), gfp_mask, order)
#define alloc_pages_vma(gfp_mask, order, vma, addr, node, false)\
alloc_pages(gfp_mask, order)
#define alloc_hugepage_vma(gfp_mask, vma, addr, order) \
alloc_pages(gfp_mask, order)
#endif
#define alloc_page(gfp_mask) alloc_pages(gfp_mask, 0)
#define alloc_page_vma(gfp_mask, vma, addr) \
alloc_pages_vma(gfp_mask, 0, vma, addr, numa_node_id(), false)
#define alloc_page_vma_node(gfp_mask, vma, addr, node) \
alloc_pages_vma(gfp_mask, 0, vma, addr, node, false)
extern unsigned long __get_free_pages(gfp_t gfp_mask, unsigned int order);
extern unsigned long get_zeroed_page(gfp_t gfp_mask);
#define __get_free_page(gfp_mask) \
__get_free_pages((gfp_mask), 0)
#define __get_dma_pages(gfp_mask, order) \
__get_free_pages((gfp_mask) | GFP_DMA, (order))
```
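这些接口的典型用法,可以用下面一段假设性的演示模块来说明(模块名、变量名均为虚构,仅作示意,并非内核源码的一部分),重点在于分配与释放接口的配对:

```c
#include <linux/module.h>
#include <linux/gfp.h>
#include <linux/mm.h>

static struct page *demo_pages;		/* 记录 alloc_pages() 分配的首页 */
static unsigned long demo_vaddr;	/* 记录 __get_free_pages() 返回的虚拟地址 */

static int __init buddy_demo_init(void)
{
	/* 分配 2^2 = 4 个物理连续页面, 返回首页的 struct page 指针 */
	demo_pages = alloc_pages(GFP_KERNEL, 2);
	if (!demo_pages)
		return -ENOMEM;

	/* 分配单个页面, 直接返回内核线性映射区的虚拟地址 */
	demo_vaddr = __get_free_pages(GFP_KERNEL, 0);
	if (!demo_vaddr) {
		__free_pages(demo_pages, 2);
		return -ENOMEM;
	}

	pr_info("buddy_demo: pages at %p, vaddr at 0x%lx\n",
		page_address(demo_pages), demo_vaddr);
	return 0;
}

static void __exit buddy_demo_exit(void)
{
	free_pages(demo_vaddr, 0);	/* 与 __get_free_pages() 配对释放 */
	__free_pages(demo_pages, 2);	/* 与 alloc_pages() 配对释放 */
}

module_init(buddy_demo_init);
module_exit(buddy_demo_exit);
MODULE_LICENSE("GPL");
```

注意 __get_free_pages() 直接返回内核线性映射的虚拟地址,因此不能搭配 __GFP_HIGHMEM 使用;需要 highmem 页面时,应使用 alloc_pages() 拿到 struct page,再按需映射。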
所有这些接口,最终都会调用同一接口 __alloc_pages_nodemask(),其主要逻辑可以概括为:
- 尝试从满足分配要求的 NUMA node/zone 空闲链表进行分配,称为快速分配路径
- 如果从快速路径分配失败,则先后尝试内存回收、内存规整后再进行分配,称为慢速分配路径
下面来看细节:
```c
/*
* This is the 'heart' of the zoned buddy allocator.
*/
struct page *
__alloc_pages_nodemask(gfp_t gfp_mask, unsigned int order, int preferred_nid,
nodemask_t *nodemask)
{
struct page *page;
unsigned int alloc_flags = ALLOC_WMARK_LOW; /* 默认使用 WMARK_LOW 水准线做分配检查 */
gfp_t alloc_mask; /* The gfp_t that was actually used for allocation */
struct alloc_context ac = { };
...
/*
* 页面分配准备工作:
* 设定分配上下文 (alloc_context), allocate mask 等
*/
gfp_mask &= gfp_allowed_mask;
alloc_mask = gfp_mask;
if (!prepare_alloc_pages(gfp_mask, order, preferred_nid, nodemask, &ac, &alloc_mask, &alloc_flags))
return NULL;
finalise_ac(gfp_mask, order, &ac);
/* First allocation attempt */
/*
* 快速分配路径:
* 首先尝试从匹配分配条件的 NUMA node/zone 的当前空闲页面分配
*/
page = get_page_from_freelist(alloc_mask, order, alloc_flags, &ac);
if (likely(page)) /* 分配成功:当前空闲页面满足分配请求 */
goto out;
/*
* 当前空闲页面不满足分配请求,需做进一步
* 内存碎片整理(compact)、回收(swap)工作之后,
* 然后再次进行分配请求。
*/
/*
* Apply scoped allocation constraints. This is mainly about GFP_NOFS
* resp. GFP_NOIO which has to be inherited for all allocation requests
* from a particular context which has been marked by
* memalloc_no{fs,io}_{save,restore}.
*/
alloc_mask = current_gfp_context(gfp_mask);
ac.spread_dirty_pages = false;
/*
* Restore the original nodemask if it was potentially replaced with
* &cpuset_current_mems_allowed to optimize the fast-path attempt.
*/
if (unlikely(ac.nodemask != nodemask))
ac.nodemask = nodemask;
/*
* 慢速分配路径:
* 从快速分配路径分配失败, 接下来可能要进行 内存规整、内存回收 来满足分配要求
*/
page = __alloc_pages_slowpath(alloc_mask, order, &ac);
...
out:
...
return page;
}
```
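在分别展开两条路径之前,先用一段假设性的示意代码(函数名为虚构,非内核源码)说明 gfp_mask 对路径行为的影响:GFP_KERNEL 允许睡眠,快速路径失败后可以走完整的慢速路径;GFP_ATOMIC 不允许睡眠,慢速路径中不会做直接回收/规整:

```c
#include <linux/gfp.h>
#include <linux/mm.h>

/* 示意性代码(假设场景): 两种常见 gfp_mask 在分配路径上的差异 */
static struct page *demo_alloc(bool in_atomic_context)
{
	if (in_atomic_context)
		/*
		 * GFP_ATOMIC: 不带 __GFP_DIRECT_RECLAIM, 不允许睡眠;
		 * 慢速路径中不会做直接回收/规整, 只能唤醒 kswapd,
		 * 并凭借 __GFP_HIGH 动用更低水准线下的保留页面。
		 */
		return alloc_pages(GFP_ATOMIC, 0);

	/*
	 * GFP_KERNEL: 带 __GFP_DIRECT_RECLAIM | __GFP_KSWAPD_RECLAIM,
	 * 快速路径失败后, 可以在慢速路径中睡眠, 等待直接回收、内存规整完成。
	 */
	return alloc_pages(GFP_KERNEL, 0);
}
```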
2.1 快速分配路径
先看快速分配路径,其分配又可分为两种情形:
- 分配单个页面(order == 0),从 per-CPU 的 PCP 空闲链表分配
- 分配多个页面(order > 0),从满足分配要求的 NUMA node/zone 的 free_area 空闲链表分配
来看细节:
```c
static struct page *
get_page_from_freelist(gfp_t gfp_mask, unsigned int order, int alloc_flags,
const struct alloc_context *ac)
{
struct zoneref *z = ac->preferred_zoneref;
struct zone *zone;
struct pglist_data *last_pgdat_dirty_limit = NULL;
/*
* Scan zonelist, looking for a zone with enough free.
* See also __cpuset_node_allowed() comment in kernel/cpuset.c.
*/
for_next_zone_zonelist_nodemask(zone, z, ac->zonelist, ac->high_zoneidx,
ac->nodemask) {
struct page *page;
unsigned long mark;
...
/* 检测内存区域 @zone 的页面数是否触及了 @alloc_flags 设定的水准线 */
mark = zone->watermark[alloc_flags & ALLOC_WMARK_MASK];
if (!zone_watermark_fast(zone, order, mark, /* 内存区域 @zone 的页面数在设定的水准线之下, NUMA 尝试回收 @zone 的页面 */
ac_classzone_idx(ac), alloc_flags)) {
int ret;
...
/*
* 分配请求不要求对内存区域 @zone 做水准线检测:
* 即使页面数在水准线之下, 仍然尝试进行分配.
*/
if (alloc_flags & ALLOC_NO_WATERMARKS)
goto try_this_zone;
/*
* 检测是否要对 内存区域 @zone 进行内存回收工作:
* NUMA 会存在不同情形, UMA 总是不进行回收工作.
*/
if (node_reclaim_mode == 0 ||
!zone_allows_reclaim(ac->preferred_zoneref->zone, zone))
continue; /* 内存区域 @zone 没有足够页面, 也不支持页面回收, 只能尝试下一个内存区域(struct zone) */
/* 内存区域 @zone 内存回收(仅 NUMA) */
ret = node_reclaim(zone->zone_pgdat, gfp_mask, order);
...
}
try_this_zone:
/* 从内存区域 @zone 的 ac->migratetype 类型页面空闲列表 分配一个页面 */
page = rmqueue(ac->preferred_zoneref->zone, zone, order,
gfp_mask, alloc_flags, ac->migratetype);
if (page) { /* 分配页面成功 */
prep_new_page(page, order, gfp_mask, alloc_flags);
/*
* If this is a high-order atomic allocation then check
* if the pageblock should be reserved for the future
*/
if (unlikely(order && (alloc_flags & ALLOC_HARDER)))
reserve_highatomic_pageblock(page, zone, order);
return page; /* 返回分配的页面 */
}
}
return NULL; /* 分配失败 */
}
/*
* Allocate a page from the given zone. Use pcplists for order-0 allocations.
*/
static inline
struct page *rmqueue(struct zone *preferred_zone,
struct zone *zone, unsigned int order,
gfp_t gfp_flags, unsigned int alloc_flags,
int migratetype)
{
unsigned long flags;
struct page *page;
if (likely(order == 0)) { /* 【单个页面】从 per-cpu 的 PCP 列表分配 */
page = rmqueue_pcplist(preferred_zone, zone, order,
gfp_flags, migratetype);
goto out;
}
/* 【非单个页面】从 zone 的 free_area 空闲列表分配 */
...
spin_lock_irqsave(&zone->lock, flags);
do {
page = NULL;
if (alloc_flags & ALLOC_HARDER) {
page = __rmqueue_smallest(zone, order, MIGRATE_HIGHATOMIC);
...
}
if (!page) /* 分配失败, 尝试 fallback 分配 */
page = __rmqueue(zone, order, migratetype);
} while (page && check_new_pages(page, order));
spin_unlock(&zone->lock);
if (!page)
goto failed;
...
local_irq_restore(flags);
out:
...
return page; /* 分配成功,返回分配页面 */
failed:
local_irq_restore(flags);
return NULL; /* 分配失败 */
}
/*
* Go through the free lists for the given migratetype and remove
* the smallest available page from the freelists
*/
/* 从 [order -> MAX_ORDER] 分配 */
static inline
struct page *__rmqueue_smallest(struct zone *zone, unsigned int order,
int migratetype)
{
unsigned int current_order;
struct free_area *area;
struct page *page;
/* Find a page of the appropriate size in the preferred list */
for (current_order = order; current_order < MAX_ORDER; ++current_order) {
area = &(zone->free_area[current_order]);
page = list_first_entry_or_null(&area->free_list[migratetype],
struct page, lru);
if (!page)
continue;
list_del(&page->lru); /* 将分配的 page 从空闲列表移除 */
rmv_page_order(page);
area->nr_free--;
/* 当从更高 order 空闲列表分配时, 要把剩余的 page 放到合适 order 空闲列表 */
expand(zone, page, order, current_order, area, migratetype);
set_pcppage_migratetype(page, migratetype);
return page;
}
return NULL;
}
/*
* Do the hard work of removing an element from the buddy allocator.
* Call me with the zone->lock already held.
*/
static struct page *__rmqueue(struct zone *zone, unsigned int order,
int migratetype)
{
struct page *page;
retry:
page = __rmqueue_smallest(zone, order, migratetype); /* 从 [order -> MAX_ORDER] 分配 */
if (unlikely(!page)) { /* 分配失败, 尝试从 CMA 或 migratetype fallback 分配 */
if (migratetype == MIGRATE_MOVABLE) /* 尝试从 CMA 分配 */
page = __rmqueue_cma_fallback(zone, order);
/*
* fallback 分配:
* 找一个合适的 fallback migratetype(兼容请求的 migratetype), 然后尝试分配。
*/
if (!page && __rmqueue_fallback(zone, order, migratetype))
goto retry;
}
trace_mm_page_alloc_zone_locked(page, order, migratetype);
return page;
}
#ifdef CONFIG_CMA
static struct page *__rmqueue_cma_fallback(struct zone *zone,
unsigned int order)
{
return __rmqueue_smallest(zone, order, MIGRATE_CMA);
}
#else
static inline struct page *__rmqueue_cma_fallback(struct zone *zone,
unsigned int order) { return NULL; }
#endif
/*
* Try finding a free buddy page on the fallback list and put it on the free
* list of requested migratetype, possibly along with other pages from the same
* block, depending on fragmentation avoidance heuristics. Returns true if
* fallback was found so that __rmqueue_smallest() can grab it.
*
* The use of signed ints for order and current_order is a deliberate
* deviation from the rest of this file, to make the for loop
* condition simpler.
*/
/*
* fallback 分配:
* 找一个合适的 fallback migratetype(兼容请求的 migratetype), 然后尝试分配。
*/
static inline bool
__rmqueue_fallback(struct zone *zone, int order, int start_migratetype)
{
struct free_area *area;
int current_order;
struct page *page;
int fallback_mt;
bool can_steal;
/*
* Find the largest available free page in the other list. This roughly
* approximates finding the pageblock with the most free pages, which
* would be too costly to do exactly.
*/
for (current_order = MAX_ORDER - 1; current_order >= order;
--current_order) {
area = &(zone->free_area[current_order]);
fallback_mt = find_suitable_fallback(area, current_order,
start_migratetype, false, &can_steal);
if (fallback_mt == -1)
continue;
/*
* We cannot steal all free pages from the pageblock and the
* requested migratetype is movable. In that case it's better to
* steal and split the smallest available page instead of the
* largest available page, because even if the next movable
* allocation falls back into a different pageblock than this
* one, it won't cause permanent fragmentation.
*/
if (!can_steal && start_migratetype == MIGRATE_MOVABLE
&& current_order > order)
goto find_smallest;
goto do_steal;
}
return false;
find_smallest:
for (current_order = order; current_order < MAX_ORDER;
current_order++) {
area = &(zone->free_area[current_order]);
fallback_mt = find_suitable_fallback(area, current_order,
start_migratetype, false, &can_steal);
if (fallback_mt != -1)
break;
}
...
do_steal:
page = list_first_entry(&area->free_list[fallback_mt],
struct page, lru);
steal_suitable_fallback(zone, page, start_migratetype, can_steal);
...
return true;
}
```
快速分配路径的分析到此为止,不再做进一步深入,其中 fallback 的规则详见 fallbacks[] 数组的定义:

```c
/*
* This array describes the order lists are fallen back to when
* the free lists for the desirable migrate type are depleted
*/
static int fallbacks[MIGRATE_TYPES][4] = {
[MIGRATE_UNMOVABLE] = { MIGRATE_RECLAIMABLE, MIGRATE_MOVABLE, MIGRATE_TYPES },
[MIGRATE_RECLAIMABLE] = { MIGRATE_UNMOVABLE, MIGRATE_MOVABLE, MIGRATE_TYPES },
[MIGRATE_MOVABLE] = { MIGRATE_RECLAIMABLE, MIGRATE_UNMOVABLE, MIGRATE_TYPES },
#ifdef CONFIG_CMA
[MIGRATE_CMA] = { MIGRATE_TYPES }, /* Never used */
#endif
#ifdef CONFIG_MEMORY_ISOLATION
[MIGRATE_ISOLATE] = { MIGRATE_TYPES }, /* Never used */
#endif
};
```
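为便于理解上表的用法,下面给出一段示意性代码(假设实现,函数名为虚构,并非内核中 find_suitable_fallback() 的原文):按 fallbacks[start_migratetype][] 的顺序,查找第一个空闲链表非空的 migratetype,以 MIGRATE_TYPES 作为结束标记;内核的实际实现还会结合 can_steal 等反碎片化的启发式判断:

```c
/* 示意性代码(假设实现): 按 fallbacks[] 的优先级挑选 fallback migratetype */
static int demo_pick_fallback(struct free_area *area, int start_migratetype)
{
	int i;

	for (i = 0; ; i++) {
		int fallback_mt = fallbacks[start_migratetype][i];

		if (fallback_mt == MIGRATE_TYPES)	/* 数组以 MIGRATE_TYPES 结尾: 没有更多候选 */
			return -1;

		if (!list_empty(&area->free_list[fallback_mt]))
			return fallback_mt;		/* 该类型的空闲链表上有页面可供偷取 */
	}
}
```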
2.2 慢速分配路径
慢速分配路径可以概括为:
- 调整分配标志和 zone 列表,尝试分配
- 如果调整后分配仍然失败,进行直接内存回收,然后再次尝试分配
- 如果内存回收后分配仍然失败,则进行内存规整(compact),然后再次尝试分配
- 如果内存规整后分配还是失败,则触发 OOM-killer,之后再次尝试分配(调用者可以通过 gfp 修饰符影响这些重试行为,见下面的示意代码)
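下面是一段假设性的示意代码(函数名为虚构),说明调用者如何通过 __GFP_NORETRY 这类修饰符收紧慢速路径的重试行为:

```c
#include <linux/gfp.h>
#include <linux/mm.h>

/*
 * 示意性代码(假设场景): 高阶分配优先尝试 __GFP_NORETRY,
 * 即回收/规整一轮失败后就放弃, 不触发 OOM-killer,
 * 适合调用者自己有降级方案的情况。
 */
static struct page *demo_try_order3(void)
{
	struct page *page;

	/* 先尝试一次 2^3 = 8 页的连续分配, 失败也不惊动 OOM-killer */
	page = alloc_pages(GFP_KERNEL | __GFP_NORETRY, 3);
	if (page)
		return page;

	/* 降级为单页分配(仅为示意, 实际代码需要自行管理多次单页分配) */
	return alloc_pages(GFP_KERNEL, 0);
}
```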
来看细节:
```c
static inline struct page *
__alloc_pages_slowpath(gfp_t gfp_mask, unsigned int order,
struct alloc_context *ac)
{
...
if (gfp_mask & __GFP_KSWAPD_RECLAIM)
wake_all_kswapds(order, ac); /* 唤醒 每 NUMA 内存节点 的 kswapd,进行内存回收 */
/*
* The adjusted alloc_flags might result in immediate success, so try
* that first
*/
/* 按调整后的 分配标志,再次尝试分配 */
page = get_page_from_freelist(gfp_mask, order, alloc_flags, ac);
if (page)
goto got_pg;
...
retry:
/* Ensure kswapd doesn't accidentally go to sleep as long as we loop */
if (gfp_mask & __GFP_KSWAPD_RECLAIM)
wake_all_kswapds(order, ac);
...
/* Attempt with potentially adjusted zonelist and alloc_flags */
/* 按调整后的 分配标志 和 zone 列表,再次尝试分配 */
page = get_page_from_freelist(gfp_mask, order, alloc_flags, ac);
if (page)
goto got_pg;
/* Try direct reclaim and then allocating */
/*
* 直接进行内存回收,并等待回收操作完成后,再次尝试分配.
* 这不同于唤醒 kswapd 的间接回收,时间上更为确定.
*/
page = __alloc_pages_direct_reclaim(gfp_mask, order, alloc_flags, ac,
&did_some_progress);
if (page)
goto got_pg;
/* Try direct compaction and then allocating */
/* 内存页面回收分配失败,进行内存规整后再尝试分配 */
page = __alloc_pages_direct_compact(gfp_mask, order, alloc_flags, ac,
compact_priority, &compact_result);
if (page)
goto got_pg;
...
/* Reclaim has failed us, start killing things */
/* 触发 OOM-killer 再次尝试分配 */
page = __alloc_pages_may_oom(gfp_mask, order, ac, &did_some_progress);
if (page)
goto got_pg;
...
got_pg:
return page; /* 分配完成,返回 分配页面 或 NULL */
}
```
本文对于慢速分配路径的分析到此为止,期间可能触发的内存回收(shrink)、内存规整(compact)、OOM-killer 过程,限于篇幅,将不做展开。