kcompactd 是内核线程,主要解决系统长时间运行引发的内存碎片化问题
mm/compaction.c:
低地址 -------------------- 高地址
c
↑ migrate scanner
↓ free scanner
migrate scanner:从低地址扫描占用页
free scanner:从高地址扫描空闲页
Compact: 将 migrate scanner 扫描到的占用页迁移到 free scanner 扫描到的空闲页
最终效果:占用页集中在高地址、低地址让出大量连续的物理地址,供其他业务用。
kcompactd_init:
c
1 static int __init kcompactd_init(void)
|3262 {
|3263 int nid;
|3264 int ret;
|3265
|3266 ret = cpuhp_setup_state_nocalls(CPUHP_AP_ONLINE_DYN,
|3267 "mm/compaction:online",
|3268 kcompactd_cpu_online, NULL);
|3269 if (ret < 0) {
|3270 pr_err("kcompactd: failed to register hotplug callbacks.\n");
|3271 return ret;
|3272 }
|3273
|3274 for_each_node_state(nid, N_MEMORY)
|3275 kcompactd_run(nid);
|3276 register_sysctl_init("vm", vm_compaction);
|3277 return 0;
|3278 }
|3279 subsys_initcall(kcompactd_init)
OS bringup阶段会执行kcompactd_init, 主要是通过kcompactd_run 为每个有内存的node(N_MEMORY)启动kcompactd 内核线程, 并在/proc/sys/vm/建立一些节点:
- compact_memory: echo 1 >/proc/sys/vm/compact_memory 手动执行一次碎片化治理
- compaction_proactiveness:用户设置的主动规整积极程度,取值范围[0~100],0表示不做主动碎片化治理,100表示最积极地进行碎片化治理。
  - 用户设置此值,可以调节是否进行碎片化治理:内核根据此值换算出碎片化水位(见下文should_proactive_compact_node中的fragmentation_score_wmark),当内存碎片化实际评分 大于 该水位时,才进行主动碎片化治理。
- extfrag_threshold: 暂未研究
- compact_unevictable_allowed: 表示是否对unevictable page进行治理(mlock的页就是unevictable 页)。
- kcompactd:
c
|3077 static int kcompactd(void *p)
n|3078 {
f|3079 pg_data_t *pgdat = (pg_data_t *)p;
i|3080 struct task_struct *tsk = current;
c|3081 long default_timeout = msecs_to_jiffies(HPAGE_FRAG_CHECK_INTERVAL_MSEC);
c|3082 long timeout = default_timeout;
u|3083
r|3084 const struct cpumask *cpumask = cpumask_of_node(pgdat->node_id);
f|3085
i|3086 if (!cpumask_empty(cpumask))
i|3087 set_cpus_allowed_ptr(tsk, cpumask);
k|3088
f|3089 set_freezable();
f|3090
f|3091 pgdat->kcompactd_max_order = 0;
f|3092 pgdat->kcompactd_highest_zoneidx = pgdat->nr_zones - 1;
s|3093
_|3094 while (!kthread_should_stop()) {
c|3095 unsigned long pflags;
_|3096
c|3097 /*
c|3098 * Avoid the unnecessary wakeup for proactive compaction
c|3099 * when it is disabled.
c|3100 */
c|3101 if (!sysctl_compaction_proactiveness)
t|3102 timeout = MAX_SCHEDULE_TIMEOUT;
p|3103 trace_mm_compaction_kcompactd_sleep(pgdat->node_id);
c|3104 if (wait_event_freezable_timeout(pgdat->kcompactd_wait,
c|3105 kcompactd_work_requested(pgdat), timeout) &&
c|3106 !pgdat->proactive_compact_trigger) {
s|3107
c|3108 psi_memstall_enter(&pflags);
c|3109 kcompactd_do_work(pgdat);
c|3110 psi_memstall_leave(&pflags);
k|3111 /*
k|3112 * Reset the timeout value. The defer timeout from
k|3113 * proactive compaction is lost here but that is fine
w|3114 * as the condition of the zone changing substantionally
k|3115 * then carrying on with the previous defer interval is
k|3116 * not useful.
k|3117 */
k|3118 timeout = default_timeout;
p|3119 continue;
k|3120 }
|3121
d|3122 /*
r|3123 * Start the proactive work with default timeout. Based
_|3124 * on the fragmentation score, this timeout is updated.
_|3125 */
_|3126 timeout = default_timeout;
|3127 if (should_proactive_compact_node(pgdat)) {
c|3128 unsigned int prev_score, score;
_|3129
T|3130 prev_score = fragmentation_score_node(pgdat);
T|3131 proactive_compact_node(pgdat);
T|3132 score = fragmentation_score_node(pgdat);
_|3133 /*
_|3134 * Defer proactive compaction if the fragmentation
E|3135 * score did not go down i.e. no progress made.
E|3136 */
E|3137 if (unlikely(score >= prev_score))
T|3138 timeout =
T|3139 default_timeout << COMPACT_MAX_DEFER_SHIFT;
T|3140 }
T|3141 if (unlikely(pgdat->proactive_compact_trigger))
T|3142 pgdat->proactive_compact_trigger = false;
T|3143 }
T|3144
T|3145 return 0;
kcompactd主体结构就是个大的while循环,kcompactd支持两种唤醒模式:
- 3126~3145行: HPAGE_FRAG_CHECK_INTERVAL_MSEC=500, 500ms循环一次;执行proactive_compact_node函数,根据碎片化分数和水位,来选择是否处理碎片化。
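- 3101~3120行:wakeup唤醒模式。kcompactd睡在pgdat->kcompactd_wait上,在超时之前被叫醒(例如在kswapd等逻辑中被wakeup)且kcompactd_work_requested为真时,不需要计算碎片化分数,直接调用kcompactd_do_work处理碎片化。如果sysctl_compaction_proactiveness设置为0, 则timeout被设为MAX_SCHEDULE_TIMEOUT, 周期性的主动治理暂停, kcompactd_wait会无限等待,只支持wakeup叫醒处理碎片化。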
should_proactive_compact_node:
c
82 static bool should_proactive_compact_node(pg_data_t *pgdat)
w|2183 {
c|2184 int wmark_high;
|2185
|2186 if (!sysctl_compaction_proactiveness || kswapd_is_running(pgdat))
p|2187 return false;
_|2188
_|2189 wmark_high = fragmentation_score_wmark(false);
t|2190 return fragmentation_score_node(pgdat) > wmark_high;
|2191 }
d|2192
此函数根据用户传入参数sysctl_compaction_proactiveness来计算碎片化水位wmark_high, 只有fragmentation_score_node算出的碎片化分数大于wmark_high,才会真正启动碎片化治理。
fragmentation_score_node:
c
|2152 static unsigned int fragmentation_score_node(pg_data_t *pgdat)
_|2153 {
c|2154 unsigned int score = 0;
_|2155 int zoneid;
c|2156
c|2157 for (zoneid = 0; zoneid < MAX_NR_ZONES; zoneid++) {
c|2158 struct zone *zone;
c|2159
c|2160 zone = &pgdat->node_zones[zoneid];
t|2161 if (!populated_zone(zone))
p|2162 continue;
c|2163 score += fragmentation_score_zone_weighted(zone);
c|2164 }
c|2165
s|2166 return score;
c|2167 }
c|2168
对当前node(pgdat)中每个已填充(populated)的zone单独计算加权碎片化分数,最终加到一起变成score, 就是该node的实际碎片化分数。
fragmentation_score_zone_weighted:
c
37 static unsigned int fragmentation_score_zone_weighted(struct zone *zone)
n|2138 {
f|2139 unsigned long score;
i|2140
c|2141 score = zone->present_pages * fragmentation_score_zone(zone);
c|2142 return div64_ul(score, zone->zone_pgdat->node_present_pages + 1);
u|2143 }
这里主要计算的是,每个zone的权重得分:
公式: (当前zone的页总数 * 当前zone的碎片化得分) / (当前zone所在node的页总数 + 1)
c
引入"加权"机制的主要目的是优化主动规整的触发时机:
- 聚焦大内存域: 像 ZONE_NORMAL 这样的大区域对系统可用性影响更大。通过按比例加权,大区域的碎片化情况会显著拉高整个 Node 的总分。
- 忽略小内存域: 像 ZONE_DMA 或 ZONE_DMA32 这样通常较小的区域,即使它们的碎片化程度很高(fragmentation_score_zone 返回值很大),由于其 present_pages 占比较小,加权后的得分依然会接近于 0。
- 避免无效规整: 这种设计防止了系统因为一些极小的、不重要的区域出现碎片化而频繁触发开销巨大的主动规整操作。
当然我的系统只有一个node。
/sys/devices/system/node/node0 可以看到只有一个node,这里主要是适配numa架构多node。
最重要的每个zone的碎片化得分怎么算的?
fragmentation_score_zone:
fragmentation_score_zone
->extfrag_for_order:
c
|1119 unsigned int extfrag_for_order(struct zone *zone, unsigned int order)
|1120 {
|1121 struct contig_page_info info;
|1122
v|1123 fill_contig_page_info(zone, order, &info);
|1124 if (info.free_pages == 0)
|1125 return 0;
|1126
|1127 return div_u64((info.free_pages -
|1128 (info.free_blocks_suitable << order)) * 100,
|1129 info.free_pages);
|1130 }
order 是fragmentation_score_zone传进来的参数COMPACTION_HPAGE_ORDER,代表2MB的页,
extfrag_for_order通过fill_contig_page_info 计算当前zone 有多少空闲的page,写入info.free_pages, 有多少空闲且size大于等于2M的block,记录到info.free_blocks_suitable。
注意free_blocks_suitable记录的是满足条件的block个数,不是page个数。free_blocks_suitable << order (order是2M大页的order) 才是这些block一共包含多少个page
最后 extfrag_for_order 返回值: (当前zone空闲页数量 减去 当前zone空闲且连续,size大于等于2MB的页数量) * 100 除以 当前zone空闲页数量,即一个0~100的百分比;当前zone没有空闲页时直接返回0。 这个值表示当前zone的碎片化得分。
c
结论:
1. node碎片化得分 = zone0 加权碎片化得分 + zone1 加权碎片化得分 + zone2 加权碎片化得分 +....
2. zone碎片化得分 = (当前zone空闲页数量 - 当前zone里 order大于等于2M的空闲页数量) * 100 / 当前zone空闲页数量
回到kcompactd:
3130行:再次算下碎片化分数fragmentation_score_node,
3131行:会进行真正的碎片化治理:proactive_compact_node
3132行:算治理后的碎片化分数fragmentation_score_node
3137~3140行:如果治理后分数 大于等于 治理前(score >= prev_score), 说明治理未达到效果,也说明OS 碎片化目前无法治理,可能系统碎片化不严重,也可能所有内存都在用, 总之推迟下次治理时间(timeout = default_timeout << COMPACT_MAX_DEFER_SHIFT)
proactive_compact_node
对每个zone调用compact_zone
->compact_zone
c
compact_zone
{
....
isolate_migratepages
....
migrate_pages
->isolate_freepages
....
}
核心函数:isolate_migratepages, 扫描所有可被迁移的页。实际的逐页扫描由下面贴出的isolate_migratepages_block完成。
c
static int
isolate_migratepages_block(struct compact_control *cc, unsigned long low_pfn,
unsigned long end_pfn, isolate_mode_t mode)
{
pg_data_t *pgdat = cc->zone->zone_pgdat;
unsigned long nr_scanned = 0, nr_isolated = 0;
struct lruvec *lruvec;
unsigned long flags = 0;
struct lruvec *locked = NULL;
struct folio *folio = NULL;
struct page *page = NULL, *valid_page = NULL;
struct address_space *mapping;
unsigned long start_pfn = low_pfn;
bool skip_on_failure = false;
unsigned long next_skip_pfn = 0;
bool skip_updated = false;
int ret = 0;
cc->migrate_pfn = low_pfn;
/*
* Ensure that there are not too many pages isolated from the LRU
* list by either parallel reclaimers or compaction. If there are,
* delay for some time until fewer pages are isolated
*/
while (unlikely(too_many_isolated(cc))) {
/* stop isolation if there are still pages not migrated */
if (cc->nr_migratepages)
return -EAGAIN;
/* async migration should just abort */
if (cc->mode == MIGRATE_ASYNC)
return -EAGAIN;
reclaim_throttle(pgdat, VMSCAN_THROTTLE_ISOLATED);
if (fatal_signal_pending(current))
return -EINTR;
}
cond_resched();
if (cc->direct_compaction && (cc->mode == MIGRATE_ASYNC)) {
skip_on_failure = true;
next_skip_pfn = block_end_pfn(low_pfn, cc->order);
}
/* Time to isolate some pages for migration */
for (; low_pfn < end_pfn; low_pfn++) {
bool is_dirty, is_unevictable;
if (skip_on_failure && low_pfn >= next_skip_pfn) {
/*
* We have isolated all migration candidates in the
* previous order-aligned block, and did not skip it due
* to failure. We should migrate the pages now and
* hopefully succeed compaction.
*/
if (nr_isolated)
break;
/*
* We failed to isolate in the previous order-aligned
* block. Set the new boundary to the end of the
* current block. Note we can't simply increase
* next_skip_pfn by 1 << order, as low_pfn might have
* been incremented by a higher number due to skipping
* a compound or a high-order buddy page in the
* previous loop iteration.
*/
next_skip_pfn = block_end_pfn(low_pfn, cc->order);
}
/*
* Periodically drop the lock (if held) regardless of its
* contention, to give chance to IRQs. Abort completely if
* a fatal signal is pending.
*/
if (!(low_pfn % COMPACT_CLUSTER_MAX)) {
if (locked) {
unlock_page_lruvec_irqrestore(locked, flags);
locked = NULL;
}
if (fatal_signal_pending(current)) {
cc->contended = true;
ret = -EINTR;
goto fatal_pending;
}
cond_resched();
}
nr_scanned++;
page = pfn_to_page(low_pfn);
/*
* Check if the pageblock has already been marked skipped.
* Only the first PFN is checked as the caller isolates
* COMPACT_CLUSTER_MAX at a time so the second call must
* not falsely conclude that the block should be skipped.
*/
if (!valid_page && (pageblock_aligned(low_pfn) ||
low_pfn == cc->zone->zone_start_pfn)) {
if (!isolation_suitable(cc, page)) {
low_pfn = end_pfn;
folio = NULL;
goto isolate_abort;
}
valid_page = page;
}
if (PageHuge(page) && cc->alloc_contig) {
if (locked) {
unlock_page_lruvec_irqrestore(locked, flags);
locked = NULL;
}
ret = isolate_or_dissolve_huge_page(page, &cc->migratepages);
/*
* Fail isolation in case isolate_or_dissolve_huge_page()
* reports an error. In case of -ENOMEM, abort right away.
*/
if (ret < 0) {
/* Do not report -EBUSY down the chain */
if (ret == -EBUSY)
ret = 0;
low_pfn += compound_nr(page) - 1;
nr_scanned += compound_nr(page) - 1;
goto isolate_fail;
}
if (PageHuge(page)) {
/*
* Hugepage was successfully isolated and placed
* on the cc->migratepages list.
*/
folio = page_folio(page);
low_pfn += folio_nr_pages(folio) - 1;
goto isolate_success_no_list;
}
/*
* Ok, the hugepage was dissolved. Now these pages are
* Buddy and cannot be re-allocated because they are
* isolated. Fall-through as the check below handles
* Buddy pages.
*/
}
/*
* Skip if free. We read page order here without zone lock
* which is generally unsafe, but the race window is small and
* the worst thing that can happen is that we skip some
* potential isolation targets.
*/
if (PageBuddy(page)) {
unsigned long freepage_order = buddy_order_unsafe(page);
/*
* Without lock, we cannot be sure that what we got is
* a valid page order. Consider only values in the
* valid order range to prevent low_pfn overflow.
*/
if (freepage_order > 0 && freepage_order <= MAX_PAGE_ORDER) {
low_pfn += (1UL << freepage_order) - 1;
nr_scanned += (1UL << freepage_order) - 1;
}
continue;
}
/*
* Regardless of being on LRU, compound pages such as THP and
* hugetlbfs are not to be compacted unless we are attempting
* an allocation much larger than the huge page size (eg CMA).
* We can potentially save a lot of iterations if we skip them
* at once. The check is racy, but we can consider only valid
* values and the only danger is skipping too much.
*/
if (PageCompound(page) && !cc->alloc_contig) {
const unsigned int order = compound_order(page);
if (likely(order <= MAX_PAGE_ORDER)) {
low_pfn += (1UL << order) - 1;
nr_scanned += (1UL << order) - 1;
}
goto isolate_fail;
}
/*
* Check may be lockless but that's ok as we recheck later.
* It's possible to migrate LRU and non-lru movable pages.
* Skip any other type of page
*/
if (!PageLRU(page)) {
/*
* __PageMovable can return false positive so we need
* to verify it under page_lock.
*/
if (unlikely(__PageMovable(page)) &&
!PageIsolated(page)) {
if (locked) {
unlock_page_lruvec_irqrestore(locked, flags);
locked = NULL;
}
if (isolate_movable_page(page, mode)) {
folio = page_folio(page);
goto isolate_success;
}
}
goto isolate_fail;
}
/*
* Be careful not to clear PageLRU until after we're
* sure the page is not being freed elsewhere -- the
* page release code relies on it.
*/
folio = folio_get_nontail_page(page);
if (unlikely(!folio))
goto isolate_fail;
/*
* Migration will fail if an anonymous page is pinned in memory,
* so avoid taking lru_lock and isolating it unnecessarily in an
* admittedly racy check.
*/
mapping = folio_mapping(folio);
if (!mapping && (folio_ref_count(folio) - 1) > folio_mapcount(folio))
goto isolate_fail_put;
/*
* Only allow to migrate anonymous pages in GFP_NOFS context
* because those do not depend on fs locks.
*/
if (!(cc->gfp_mask & __GFP_FS) && mapping)
goto isolate_fail_put;
/* Only take pages on LRU: a check now makes later tests safe */
if (!folio_test_lru(folio))
goto isolate_fail_put;
is_unevictable = folio_test_unevictable(folio);
/* Compaction might skip unevictable pages but CMA takes them */
if (!(mode & ISOLATE_UNEVICTABLE) && is_unevictable)
goto isolate_fail_put;
/*
* To minimise LRU disruption, the caller can indicate with
* ISOLATE_ASYNC_MIGRATE that it only wants to isolate pages
* it will be able to migrate without blocking - clean pages
* for the most part. PageWriteback would require blocking.
*/
if ((mode & ISOLATE_ASYNC_MIGRATE) && folio_test_writeback(folio))
goto isolate_fail_put;
is_dirty = folio_test_dirty(folio);
if (((mode & ISOLATE_ASYNC_MIGRATE) && is_dirty) ||
(mapping && is_unevictable)) {
bool migrate_dirty = true;
bool is_unmovable;
/*
* Only folios without mappings or that have
* a ->migrate_folio callback are possible to migrate
* without blocking.
*
* Folios from unmovable mappings are not migratable.
*
* However, we can be racing with truncation, which can
* free the mapping that we need to check. Truncation
* holds the folio lock until after the folio is removed
* from the page so holding it ourselves is sufficient.
*
* To avoid locking the folio just to check unmovable,
* assume every unmovable folio is also unevictable,
* which is a cheaper test. If our assumption goes
* wrong, it's not a correctness bug, just potentially
* wasted cycles.
*/
if (!folio_trylock(folio))
goto isolate_fail_put;
mapping = folio_mapping(folio);
if ((mode & ISOLATE_ASYNC_MIGRATE) && is_dirty) {
migrate_dirty = !mapping ||
mapping->a_ops->migrate_folio;
}
is_unmovable = mapping && mapping_unmovable(mapping);
folio_unlock(folio);
if (!migrate_dirty || is_unmovable)
goto isolate_fail_put;
}
/* Try isolate the folio */
if (!folio_test_clear_lru(folio))
goto isolate_fail_put;
lruvec = folio_lruvec(folio);
/* If we already hold the lock, we can skip some rechecking */
if (lruvec != locked) {
if (locked)
unlock_page_lruvec_irqrestore(locked, flags);
compact_lock_irqsave(&lruvec->lru_lock, &flags, cc);
locked = lruvec;
lruvec_memcg_debug(lruvec, folio);
/*
* Try get exclusive access under lock. If marked for
* skip, the scan is aborted unless the current context
* is a rescan to reach the end of the pageblock.
*/
if (!skip_updated && valid_page) {
skip_updated = true;
if (test_and_set_skip(cc, valid_page) &&
!cc->finish_pageblock) {
low_pfn = end_pfn;
goto isolate_abort;
}
}
/*
* folio become large since the non-locked check,
* and it's on LRU.
*/
if (unlikely(folio_test_large(folio) && !cc->alloc_contig)) {
low_pfn += folio_nr_pages(folio) - 1;
nr_scanned += folio_nr_pages(folio) - 1;
folio_set_lru(folio);
goto isolate_fail_put;
}
}
/* The folio is taken off the LRU */
if (folio_test_large(folio))
low_pfn += folio_nr_pages(folio) - 1;
/* Successfully isolated */
lruvec_del_folio(lruvec, folio);
node_stat_mod_folio(folio,
NR_ISOLATED_ANON + folio_is_file_lru(folio),
folio_nr_pages(folio));
isolate_success:
list_add(&folio->lru, &cc->migratepages);
isolate_success_no_list:
cc->nr_migratepages += folio_nr_pages(folio);
nr_isolated += folio_nr_pages(folio);
nr_scanned += folio_nr_pages(folio) - 1;
/*
* Avoid isolating too much unless this block is being
* fully scanned (e.g. dirty/writeback pages, parallel allocation)
* or a lock is contended. For contention, isolate quickly to
* potentially remove one source of contention.
*/
if (cc->nr_migratepages >= COMPACT_CLUSTER_MAX &&
!cc->finish_pageblock && !cc->contended) {
++low_pfn;
break;
}
continue;
isolate_fail_put:
/* Avoid potential deadlock in freeing page under lru_lock */
if (locked) {
unlock_page_lruvec_irqrestore(locked, flags);
locked = NULL;
}
folio_put(folio);
isolate_fail:
if (!skip_on_failure && ret != -ENOMEM)
continue;
/*
* We have isolated some pages, but then failed. Release them
* instead of migrating, as we cannot form the cc->order buddy
* page anyway.
*/
if (nr_isolated) {
if (locked) {
unlock_page_lruvec_irqrestore(locked, flags);
locked = NULL;
}
putback_movable_pages(&cc->migratepages);
cc->nr_migratepages = 0;
nr_isolated = 0;
}
if (low_pfn < next_skip_pfn) {
low_pfn = next_skip_pfn - 1;
/*
* The check near the loop beginning would have updated
* next_skip_pfn too, but this is a bit simpler.
*/
next_skip_pfn += 1UL << cc->order;
}
if (ret == -ENOMEM)
break;
}
/*
* The PageBuddy() check could have potentially brought us outside
* the range to be scanned.
*/
if (unlikely(low_pfn > end_pfn))
low_pfn = end_pfn;
folio = NULL;
isolate_abort:
if (locked)
unlock_page_lruvec_irqrestore(locked, flags);
if (folio) {
folio_set_lru(folio);
folio_put(folio);
}
/*
* Update the cached scanner pfn once the pageblock has been scanned.
* Pages will either be migrated in which case there is no point
* scanning in the near future or migration failed in which case the
* failure reason may persist. The block is marked for skipping if
* there were no pages isolated in the block or if the block is
* rescanned twice in a row.
*/
if (low_pfn == end_pfn && (!nr_isolated || cc->finish_pageblock)) {
if (!cc->no_set_skip_hint && valid_page && !skip_updated)
set_pageblock_skip(valid_page);
update_cached_migrate(cc, low_pfn);
}
trace_mm_compaction_isolate_migratepages(start_pfn, low_pfn,
nr_scanned, nr_isolated);
fatal_pending:
cc->total_migrate_scanned += nr_scanned;
if (nr_isolated)
count_compact_events(COMPACTISOLATED, nr_isolated);
cc->migrate_pfn = low_pfn;
return ret;
}
函数从start_pfn 到 end_pfn 逐一扫描,发现可以迁移的页,就把它放进cc->migratepages
c
isolate_success:
list_add(&folio->lru, &cc->migratepages);
- 115~153: 如果页是hugetlb大页 :
PageHuge(page):判断当前页面是否属于 Hugetlbfs 管理的大页。
cc->alloc_contig:这是一个关键标志:
普通的内存规整,不会为了腾出空间去搬运一个 2MB 甚至 1GB 的 Hugetlbfs 大页,因为开销巨大(page_copy)
只有在 CMA 或 alloc_contig_range() 这种"必须拿到底下这段物理内存"的场景下,才会进入这个分支,此时 cc->alloc_contig = true
核心处理:隔离或溶解:
c
ret = isolate_or_dissolve_huge_page(page, &cc->migratepages)
隔离 (Isolate):如果该大页正在使用,尝试将其隔离并加入迁移列表 cc->migratepages,准备整体搬迁。
溶解 (Dissolve):如果这是一个空闲的、巨大的(比如 1GB)大页,尝试将其拆解回普通的 Buddy 系统页面。
- 168~173行,如果是空闲页PageBuddy
直接跳过整个空闲块 , 空闲页是不用迁移的。
优化:如果你碰到了一个 Order-9 的空闲块,没必要一个 PFN 一个 PFN 往后挪,直接加 511 个 PFN 跳过去。
- 183~191行,如果是复合页PageCompound
普通规整(cc->alloc_contig 为 false)时直接跳过整个复合页 , 不做迁移;只有CMA这类alloc_contig场景才会处理。
- 处理不可移动页与非 LRU 页 、文件系统保护页,不概述
- 248行:处理不可回收(unevictable)页:
c
is_unevictable = folio_test_unevictable(folio);
/* Compaction might skip unevictable pages but CMA takes them */
if (!(mode & ISOLATE_UNEVICTABLE) && is_unevictable)
goto isolate_fail_put;
如果用户没有在/proc/sys/vm/compact_unevictable_allowed写入1的话, 这里是不处理unevictable 页的, 也就是说用户mlock的页会被kcompactd 忽略。
总结:
1. 物理扫描所有 PFN。
2. 跳过空闲的、已跳过的、不可移动的、已经被钉住的页面。
3. 筛选出"干净"的可移动页面(匿名页或文件页)。
4. 隔离:改变页面标志位,脱离 LRU 全局链表,进入规整私有链表。
接下来介绍真正的迁移函数migrate_pages前, 先介绍如何扫描free page
isolate_freepages: 作用:找到内存空闲的页,让记录在 cc->migratepages 的页迁移到内存空闲页。 实现:
1. 快速通道:fast_isolate_freepages: 去 Buddy 系统的链表里找。
2. 全局逆向扫描:isolate_freepages
把找到的空闲页记录在cc->freepages
migrate_pages:
```c
|1918 */
|1919 int migrate_pages(struct list_head *from, new_folio_t get_new_folio,
|1920 free_folio_t put_new_folio, unsigned long private,
|1921 enum migrate_mode mode, int reason, unsigned int *ret_succeeded)
|1922 {
|1923 int rc, rc_gather;
|1924 int nr_pages;
|1925 struct folio *folio, *folio2;
|1926 LIST_HEAD(folios);
|1927 LIST_HEAD(ret_folios);
|1928 LIST_HEAD(split_folios);
|1929 struct migrate_pages_stats stats;
|1930
|1931 trace_mm_migrate_pages_start(mode, reason);
|1932
|1933 memset(&stats, 0, sizeof(stats));
|1934
|1935 rc_gather = migrate_hugetlbs(from, get_new_folio, put_new_folio, private,
|1936 mode, reason, &stats, &ret_folios);
|1937 if (rc_gather < 0)
|1938 goto out;
|1939
|1940 again:
|1941 nr_pages = 0;
|1942 list_for_each_entry_safe(folio, folio2, from, lru) {
|1943 /* Retried hugetlb folios will be kept in list */
|1944 if (folio_test_hugetlb(folio)) {
|1945 list_move_tail(&folio->lru, &ret_folios);
|1946 continue;
|1947 }
|1948
|1949 nr_pages += folio_nr_pages(folio);
|1950 if (nr_pages >= NR_MAX_BATCHED_MIGRATION)
|1951 break;
|1952 }
m|1953 if (nr_pages >= NR_MAX_BATCHED_MIGRATION)
|1954 list_cut_before(&folios, from, &folio2->lru);
|1955 else
|1956 list_splice_init(from, &folios);
|1957 if (mode == MIGRATE_ASYNC)
|1958 rc = migrate_pages_batch(&folios, get_new_folio, put_new_folio,
|1959 private, mode, reason, &ret_folios,
|1960 &split_folios, &stats,
|1961 NR_MAX_MIGRATE_PAGES_RETRY);
|1962 else
|1963 rc = migrate_pages_sync(&folios, get_new_folio, put_new_folio,
|1964 private, mode, reason, &ret_folios,
|1965 &split_folios, &stats);
|1966 list_splice_tail_init(&folios, &ret_folios);
|1967 if (rc < 0) {
|1968 rc_gather = rc;
|1969 list_splice_tail(&split_folios, &ret_folios);
|1970 goto out;
|1971 }
|1972 if (!list_empty(&split_folios)) {
|1973 /*
|1974 * Failure isn't counted since all split folios of a large folio
|1975 * is counted as 1 failure already. And, we only try to migrate
|1976 * with minimal effort, force MIGRATE_ASYNC mode and retry once.
|1977 */
|1978 migrate_pages_batch(&split_folios, get_new_folio,
|1979 put_new_folio, private, MIGRATE_ASYNC, reason,
|1980 &ret_folios, NULL, &stats, 1);
|1981 list_splice_tail_init(&split_folios, &ret_folios);
|1982 }
|1983 rc_gather += rc;
|1984 if (!list_empty(from))
|1985 goto again;
|1986 out:
|1987 /*
|1988 * Put the permanent failure folio back to migration list, they
|1989 * will be put back to the right list by the caller.
|1990 */
|1991 list_splice(&ret_folios, from);
|1992
|1993 /*
|1994 * Return 0 in case all split folios of fail-to-migrate large folios
|1995 * are migrated successfully.
|1996 */
|1997 if (list_empty(from))
|1998 rc_gather = 0;
|1999
|2000 count_vm_events(PGMIGRATE_SUCCESS, stats.nr_succeeded);
|2001 count_vm_events(PGMIGRATE_FAIL, stats.nr_failed_pages);
|2002 count_vm_events(THP_MIGRATION_SUCCESS, stats.nr_thp_succeeded);
|2003 count_vm_events(THP_MIGRATION_FAIL, stats.nr_thp_failed);
|2004 count_vm_events(THP_MIGRATION_SPLIT, stats.nr_thp_split);
|2005 trace_mm_migrate_pages(stats.nr_succeeded, stats.nr_failed_pages,
|2006 stats.nr_thp_succeeded, stats.nr_thp_failed,
|2007 stats.nr_thp_split, stats.nr_split, mode,
|2008 reason);
|2009
|2010 if (ret_succeeded)
|2011 *ret_succeeded = stats.nr_succeeded;
|2012
|2013 return rc_gather;
|2014 }
|2015
1935~1939: 处理hugetlb 暂不分析
1957~1965:
MIGRATE_ASYNC mode:调用migrate_pages_batch
其他mode(非MIGRATE_ASYNC, 例如MIGRATE_SYNC): 调用migrate_pages_sync
下面是AI回答 暂时分析不动了,有点累:
migrate_pages_batch
c
1. 核心流程控制:migrate_pages_batch
这是异步(Async)或批量模式下的主引擎。它的精髓在于 "批量解映射 + 统一刷新 TLB + 批量移动"。
A. 第一阶段:批量解映射 (migrate_folio_unmap)
- 循环扫描:遍历 from 链表中的所有 folio。
- 条件过滤:如果是大页(THP)且硬件不支持迁移,尝试将其拆分(try_split_folio)。
- 尝试解映射:调用 migrate_folio_unmap。
- 这个函数会尝试锁定 folio (如果 mode 是异步且拿不到锁,直接返回 -EAGAIN)。
- 分配目标页(调用回调函数)。
- 调用 try_to_migrate(修改页表,插入迁移条目)。
- 结果处理:
- 解映射成功的 folio 放入 unmap_folios 链表。
- 记录对应的目标页到 dst_folios。
B. 第二阶段:刷新 TLB
- 代码:try_to_unmap_flush()。
- 作用:在第一阶段,页表虽然改了,但各 CPU 的 TLB 缓存可能还没更新。在真正拷贝数据前,必须执行一次昂贵的全局 TLB 刷新,确保没有任何 CPU 还能通过旧地址写入数据。批量刷新比单个刷新效率高得多。
C. 第三阶段:批量物理移动 (migrate_folio_move)
- 执行拷贝:遍历 unmap_folios。
- 数据迁移:调用 migrate_folio_move。
- 执行 copy_highpage(数据拷贝)。
- 通过 remove_migration_ptes 将页表中的"迁移条目"替换为指向新物理页。
- 处理页面标志位、脏位同步。
- 释放旧页:迁移完成后,旧页引用计数减一,最终回归伙伴系统。
migrate_pages_sync:
c
2. 核心流程控制:migrate_pages_sync
这是同步(Sync)模式下的逻辑(通常在 migrate_pages 的后半段被调用,或者 from 链表只有一个页时使用)。
- 设计哲学:不计代价,必须成功。
- 行为差异:
- 它不会分阶段执行。它会针对每一个 folio,在一个循环里完成"锁定 -> 解映射 -> 移动 -> 刷新"。
- 重试机制:如果页面正在写回(PageWriteback),它会调用 folio_wait_writeback 阻塞等待。
- 锁竞争:它不使用 trylock,而是直接调用 folio_lock 阻塞等待直到拿锁。
- 强制拆分:如果大页迁移一直失败,它会更积极地拆分大页。
unmap_and_move:
c
关键原子函数:unmap_and_move 系列
不管是 batch 还是 sync,最终都会走到这几个关键动作:
1. folio_get_nontail_page: 确保在处理期间 folio 不会被意外销毁。
2. try_to_migrate:
- 它是基于反向映射(RMAP)实现的。
- 它会找到所有映射了该 PFN 的进程页表项(PTE),并将其替换为 swp_entry_t(类型为 MIGRATE)。
3. move_to_new_folio:
- 这是数据真正"易主"的地方。
- 它会交换 mapping、index 等核心元数据,让文件系统层级认为新页就是原来的那个页。