尝试回收页面try_to_free_pages
c
/*
 * try_to_free_pages() - run reclaim passes over @zones, lowering the scan
 * priority value each pass, until a single pass frees enough pages.
 *
 * Each pass calls shrink_caches() on @zones and then shrink_slab(). The
 * function stops as soon as one pass reclaims at least SWAP_CLUSTER_MAX
 * pages.
 *
 * @zones:    NULL-terminated array of zones to reclaim from.
 * @gfp_mask: allocation flags; copied into the scan_control, passed to
 *            shrink_slab(), and tested before calling out_of_memory().
 * @order:    not referenced anywhere in this function body.
 *
 * Returns 1 if some pass reclaimed >= SWAP_CLUSTER_MAX pages, otherwise 0
 * (this includes the path that calls out_of_memory()).
 */
int try_to_free_pages(struct zone **zones,
unsigned int gfp_mask, unsigned int order)
{
int priority;
int ret = 0;
int total_scanned = 0, total_reclaimed = 0;
struct reclaim_state *reclaim_state = current->reclaim_state;
struct scan_control sc;
unsigned long lru_pages = 0;
int i;
/*
 * Writeout starts disabled; it is switched on further down once
 * total_scanned crosses the threshold.
 */
sc.gfp_mask = gfp_mask;
sc.may_writepage = 0;
inc_page_state(allocstall);
/*
 * Reset every zone's temp_priority and sum the active + inactive page
 * counts; the sum is handed to shrink_slab() below.
 */
for (i = 0; zones[i] != NULL; i++) {
struct zone *zone = zones[i];
zone->temp_priority = DEF_PRIORITY;
lru_pages += zone->nr_active + zone->nr_inactive;
}
/* One reclaim pass per priority value, from DEF_PRIORITY down to 0. */
for (priority = DEF_PRIORITY; priority >= 0; priority--) {
/* The per-pass counters start from zero on every pass. */
sc.nr_mapped = read_page_state(nr_mapped);
sc.nr_scanned = 0;
sc.nr_reclaimed = 0;
sc.priority = priority;
shrink_caches(zones, &sc);
shrink_slab(sc.nr_scanned, gfp_mask, lru_pages);
/*
 * Add the slab count recorded in current->reclaim_state to this
 * pass's total and clear it for the next pass.
 */
if (reclaim_state) {
sc.nr_reclaimed += reclaim_state->reclaimed_slab;
reclaim_state->reclaimed_slab = 0;
}
/* Success is judged per pass, not on the running totals. */
if (sc.nr_reclaimed >= SWAP_CLUSTER_MAX) {
ret = 1;
goto out;
}
/*
 * NOTE(review): total_reclaimed is accumulated here but never read
 * anywhere in this function.
 */
total_scanned += sc.nr_scanned;
total_reclaimed += sc.nr_reclaimed;
/*
* Try to write back as many pages as we just scanned. This
* tends to cause slow streaming writers to write data to the
* disk smoothly, at the dirtying rate, which is nice. But
* that's undesirable in laptop mode, where we *want* lumpy
* writeout. So in laptop mode, write out the whole world.
*/
if (total_scanned > SWAP_CLUSTER_MAX + SWAP_CLUSTER_MAX/2) {
wakeup_bdflush(laptop_mode ? 0 : total_scanned);
sc.may_writepage = 1;
}
/* Take a nap, wait for some writeback to complete */
if (sc.nr_scanned && priority < DEF_PRIORITY - 2)
blk_congestion_wait(WRITE, HZ/10);
}
/*
 * No pass reached the target. out_of_memory() is called only when the
 * caller set __GFP_FS and did not set __GFP_NORETRY; ret stays 0.
 */
if ((gfp_mask & __GFP_FS) && !(gfp_mask & __GFP_NORETRY))
out_of_memory(gfp_mask);
out:
/* On every exit path, copy each zone's temp_priority to prev_priority. */
for (i = 0; zones[i] != 0; i++)
zones[i]->prev_priority = zones[i]->temp_priority;
return ret;
}
1. 函数功能
在内存压力下尝试回收页面,这是直接内存回收的核心函数。通过多优先级扫描和回收机制,尝试释放足够的内存来满足分配请求
2. 逐行代码解析
c
int try_to_free_pages(struct zone **zones,
unsigned int gfp_mask, unsigned int order)
{
- 函数定义:返回int类型,1表示成功回收足够页面,0表示失败
- zones:要回收的内存区域数组(以NULL结尾)
- gfp_mask:分配标志,控制回收行为
- order:请求的分配阶数(在本函数体内并未使用)
c
int priority;
- 优先级变量:控制回收的激进程度,从高到低(数值从大到小)
c
int ret = 0;
- 返回值初始化:默认返回0(回收失败)
c
int total_scanned = 0, total_reclaimed = 0;
- 统计变量:
  - total_scanned:累计扫描的页面数
  - total_reclaimed:累计回收的页面数(本函数中只做累加,之后没有被读取)
c
struct reclaim_state *reclaim_state = current->reclaim_state;
- 获取当前进程的回收状态:
  - current->reclaim_state:当前进程的回收状态指针
  - 用于跟踪slab回收器回收的页面数量
c
struct scan_control sc;
- 扫描控制结构:包含页面回收的所有控制参数和统计信息
c
unsigned long lru_pages = 0;
- LRU页面总数:所有zone中活跃和非活跃页面的总和
c
int i;
- 循环计数器
c
sc.gfp_mask = gfp_mask;
- 设置扫描控制的GFP掩码:传递分配标志给回收器
c
sc.may_writepage = 0;
- 初始化写页面权限:初始不允许写页面到磁盘
c
inc_page_state(allocstall);
- 增加分配停顿统计:记录发生了一次内存回收事件
c
for (i = 0; zones[i] != NULL; i++) {
- 遍历所有zone:初始化每个zone的回收参数
c
struct zone *zone = zones[i];
- 获取当前zone指针
c
zone->temp_priority = DEF_PRIORITY;
- 设置临时优先级 :
DEF_PRIORITY
表示默认优先级
c
lru_pages += zone->nr_active + zone->nr_inactive;
- 累计LRU页面总数:计算所有zone中可回收页面的基数
c
for (priority = DEF_PRIORITY; priority >= 0; priority--) {
- 主回收循环:从默认优先级12开始,逐步降低到0
- 优先级含义:数值越高越温和,数值越低越激进
c
sc.nr_mapped = read_page_state(nr_mapped);
- 读取已映射页面数:获取系统当前被进程映射的页面总数
c
sc.nr_scanned = 0;
- 重置本次扫描计数:每轮优先级循环重新计数
c
sc.nr_reclaimed = 0;
- 重置本次回收计数:每轮优先级循环重新计数
c
sc.priority = priority;
- 设置当前优先级:控制本次循环的回收激进程度
c
shrink_caches(zones, &sc);
- 核心回收函数:扫描并回收页面缓存和匿名页面
c
shrink_slab(sc.nr_scanned, gfp_mask, lru_pages);
- 收缩slab缓存:回收内核对象缓存
c
if (reclaim_state) {
- 检查是否有回收状态:slab回收器可能已经回收了页面
c
sc.nr_reclaimed += reclaim_state->reclaimed_slab;
- 累加slab回收的页面:将slab回收的页面数加到总回收数
c
reclaim_state->reclaimed_slab = 0;
- 重置slab回收计数:为下一轮循环准备
c
if (sc.nr_reclaimed >= SWAP_CLUSTER_MAX) {
- 检查是否回收足够页面 :
SWAP_CLUSTER_MAX
通常是32页
c
ret = 1;
- 设置成功标志:表示回收了足够页面
c
goto out;
- 跳转到清理代码:直接退出回收循环
c
total_scanned += sc.nr_scanned;
- 累计总扫描页面数
c
total_reclaimed += sc.nr_reclaimed;
- 累计总回收页面数
c
if (total_scanned > SWAP_CLUSTER_MAX + SWAP_CLUSTER_MAX/2) {
- 检查是否需要唤醒bdflush:当累计扫描数total_scanned超过48页时(32+16)
c
wakeup_bdflush(laptop_mode ? 0 : total_scanned);
- 唤醒磁盘刷写线程 :
- 笔记本模式:参数0,刷写所有脏页
- 正常模式:参数为扫描数量,按需刷写
c
sc.may_writepage = 1;
- 允许写页面:在后续循环中可以写页面到磁盘
c
if (sc.nr_scanned && priority < DEF_PRIORITY - 2)
- 检查是否需要等待(两个条件同时满足):
  - sc.nr_scanned:本轮扫描了页面
  - priority < DEF_PRIORITY - 2:优先级低于10(比较激进时)
c
blk_congestion_wait(WRITE, HZ/10);
- 等待IO拥塞缓解
c
if ((gfp_mask & __GFP_FS) && !(gfp_mask & __GFP_NORETRY))
- 检查是否触发OOM(两个条件同时满足):
  - gfp_mask & __GFP_FS:允许文件系统操作
  - !(gfp_mask & __GFP_NORETRY):允许重试(不禁止OOM)
c
out_of_memory(gfp_mask);
- 调用OOM killer:选择并杀死一个进程来释放内存
c
out:
- 标签:清理代码的开始
c
for (i = 0; zones[i] != 0; i++)
- 遍历所有zone进行清理
c
zones[i]->prev_priority = zones[i]->temp_priority;
- 保存优先级历史 :将本次回收的优先级记录到
prev_priority
c
return ret;
- 返回结果:1表示成功,0表示失败
3. 回收策略详解
3.1. 成功条件
- 单轮回收 ≥ 32页(SWAP_CLUSTER_MAX),此时ret = 1,函数返回1
- 注意:触发OOM killer并不算成功——调用out_of_memory()之后ret仍为0,函数返回0
3.2. 退出条件
- 成功退出:回收足够页面
- 循环结束:所有优先级都尝试过
- OOM触发:无法回收足够页面且允许OOM
协调多个内存区域的页面回收shrink_caches
c
/*
 * shrink_caches() - call shrink_zone() on each zone in @zones at the
 * priority given by @sc.
 *
 * @zones: NULL-terminated array of zone pointers.
 * @sc:    scan control; only sc->priority is read here, the structure is
 *         otherwise passed through to shrink_zone().
 *
 * Zones with present_pages == 0 are skipped before anything is touched.
 * Zones flagged all_unreclaimable are shrunk only when sc->priority equals
 * DEF_PRIORITY; their priority fields are still updated on other passes.
 */
static void
shrink_caches(struct zone **zones, struct scan_control *sc)
{
int i;
for (i = 0; zones[i] != NULL; i++) {
struct zone *zone = zones[i];
if (zone->present_pages == 0)
continue;
/* Record the priority this zone is being visited at. */
zone->temp_priority = sc->priority;
/* prev_priority is only ever lowered here, never raised. */
if (zone->prev_priority > sc->priority)
zone->prev_priority = sc->priority;
if (zone->all_unreclaimable && sc->priority != DEF_PRIORITY)
continue; /* Let kswapd poll it */
shrink_zone(zone, sc);
}
}
1. 函数功能
协调多个内存区域的页面回收工作,根据内存区域的状态和回收优先级决定是否对每个zone执行页面回收。这是内存回收的调度器,负责将回收请求分发到各个内存区域
2. 逐行代码解析
c
static void
shrink_caches(struct zone **zones, struct scan_control *sc)
{
- static void:静态函数,无返回值
- zones:内存区域指针数组,包含所有需要回收的zone
- sc:扫描控制结构指针,包含回收参数和统计信息
c
int i;
- 循环计数器:用于遍历zones数组
c
for (i = 0; zones[i] != NULL; i++) {
- 遍历所有zone:
  - i = 0:从第一个zone开始
  - zones[i] != NULL:循环条件,直到遇到NULL指针(数组结束)
  - i++:每次循环处理一个zone
c
struct zone *zone = zones[i];
- 获取当前zone指针:
  - zones[i]:访问zones数组的第i个元素
  - 将当前zone的指针保存到局部变量zone中便于使用
c
if (zone->present_pages == 0)
- 检查zone是否有物理内存:
  - zone->present_pages:zone中实际存在的物理页面数量
  - 如果为0,表示这个zone没有任何物理内存
c
continue;
- 跳过空zone:
  - 如果zone没有物理内存,执行continue跳过当前循环迭代
  - 直接进入下一个zone的处理
c
zone->temp_priority = sc->priority;
- 设置zone的临时优先级:
  - sc->priority:当前扫描控制的优先级(从12到0)
  - zone->temp_priority:zone的临时优先级字段
  - 作用:记录本次回收使用的优先级
c
if (zone->prev_priority > sc->priority)
- 检查是否需要更新历史优先级:
  - zone->prev_priority:zone的上次回收使用的优先级
  - sc->priority:当前优先级
  - 条件:如果历史优先级大于当前优先级(历史更温和)
c
zone->prev_priority = sc->priority;
- 更新历史优先级:
  - 将zone的prev_priority设置为当前优先级
  - 设计意义:记录最近使用的最激进优先级,用于后续回收决策
c
if (zone->all_unreclaimable && sc->priority != DEF_PRIORITY)
- 检查不可回收zone:
  - zone->all_unreclaimable:zone是否被标记为完全不可回收
  - sc->priority != DEF_PRIORITY:当前优先级不是默认优先级(12)
  - 条件:如果zone不可回收且当前不是温和回收
c
continue; /* Let kswapd poll it */
- 跳过不可回收zone:
  - 执行continue跳过当前zone的回收(代码注释:交给kswapd去轮询它)
  - 设计意义:避免在激进回收时浪费CPU在不可回收的zone上
c
shrink_zone(zone, sc);
- 执行zone回收:
  - shrink_zone(zone, sc):核心回收函数,对该zone执行实际的页面回收
  - 参数:当前zone指针和扫描控制结构
指定内存区域页面回收shrink_zone
c
/*
 * shrink_zone() - scan one zone's active and inactive lists in batches.
 *
 * Scan amounts are accumulated in zone->nr_scan_active and
 * zone->nr_scan_inactive across calls. A list is scanned on this call only
 * if its accumulator has reached SWAP_CLUSTER_MAX; the accumulator is then
 * reset and the accumulated amount is worked off in chunks of at most
 * SWAP_CLUSTER_MAX pages.
 *
 * @zone: zone to shrink.
 * @sc:   scan control; sc->priority is read, sc->nr_to_scan and
 *        sc->nr_to_reclaim are written.
 */
static void
shrink_zone(struct zone *zone, struct scan_control *sc)
{
unsigned long nr_active;
unsigned long nr_inactive;
/*
* Add one to `nr_to_scan' just to make sure that the kernel will
* slowly sift through the active list.
*/
zone->nr_scan_active += (zone->nr_active >> sc->priority) + 1;
nr_active = zone->nr_scan_active;
/* Below the batch size: leave the accumulator alone and scan nothing. */
if (nr_active >= SWAP_CLUSTER_MAX)
zone->nr_scan_active = 0;
else
nr_active = 0;
/* Same accumulate-then-batch scheme for the inactive list. */
zone->nr_scan_inactive += (zone->nr_inactive >> sc->priority) + 1;
nr_inactive = zone->nr_scan_inactive;
if (nr_inactive >= SWAP_CLUSTER_MAX)
zone->nr_scan_inactive = 0;
else
nr_inactive = 0;
sc->nr_to_reclaim = SWAP_CLUSTER_MAX;
/* Alternate between the two lists, one chunk each per iteration. */
while (nr_active || nr_inactive) {
if (nr_active) {
sc->nr_to_scan = min(nr_active,
(unsigned long)SWAP_CLUSTER_MAX);
nr_active -= sc->nr_to_scan;
refill_inactive_zone(zone, sc);
}
if (nr_inactive) {
sc->nr_to_scan = min(nr_inactive,
(unsigned long)SWAP_CLUSTER_MAX);
nr_inactive -= sc->nr_to_scan;
shrink_cache(zone, sc);
/*
 * The reclaim target is tested only after an inactive-list
 * chunk; once it is met, any remaining scan budget is dropped.
 */
if (sc->nr_to_reclaim <= 0)
break;
}
}
}
1. 函数功能
在指定内存区域中执行页面回收,通过平衡活跃和非活跃链表的管理,将页面从活跃链表移动到非活跃链表并最终回收
2. 逐行代码解析
c
static void
shrink_zone(struct zone *zone, struct scan_control *sc)
{
- static void:静态函数,无返回值
- zone:目标内存区域指针
- sc:扫描控制结构指针,包含回收参数和统计
c
unsigned long nr_active;
- 活跃页面扫描计数:本次要扫描的活跃页面数量
c
unsigned long nr_inactive;
- 非活跃页面扫描计数:本次要扫描的非活跃页面数量
c
zone->nr_scan_active += (zone->nr_active >> sc->priority) + 1;
- 累计活跃页面扫描计数:
  - zone->nr_active:zone中活跃页面的总数
  - zone->nr_active >> sc->priority:根据优先级计算扫描比例
    - 优先级越高(数值大),右移位数越多,扫描比例越小(温和)
    - 优先级越低(数值小),右移位数越少,扫描比例越大(激进)
  - + 1:保证累加器每次调用至少增加1,这样即使链表很短,内核也会缓慢地扫描活跃链表(见代码注释)
  - 结果累加到zone->nr_scan_active(zone的活跃扫描累加器)
c
nr_active = zone->nr_scan_active;
- 获取当前活跃扫描计数:将累加值保存到局部变量
c
if (nr_active >= SWAP_CLUSTER_MAX)
- 检查是否达到批量扫描阈值 :
SWAP_CLUSTER_MAX
通常是32页
c
zone->nr_scan_active = 0;
- 重置活跃扫描累加器:如果达到阈值,清零准备下一轮累计
c
else
nr_active = 0;
- 不足阈值则本次不扫描活跃页面 :如果累计不足32页,设置
nr_active = 0
,本次跳过活跃链表扫描
c
zone->nr_scan_inactive += (zone->nr_inactive >> sc->priority) + 1;
- 累计非活跃页面扫描计数:
  - 同样的逻辑应用于非活跃页面
  - zone->nr_inactive >> sc->priority:根据优先级计算非活跃页面扫描比例
  - + 1:保证累加器每次调用至少增加1
c
nr_inactive = zone->nr_scan_inactive;
- 获取当前非活跃扫描计数
c
if (nr_inactive >= SWAP_CLUSTER_MAX)
zone->nr_scan_inactive = 0;
else
nr_inactive = 0;
- 同样的阈值检查逻辑应用于非活跃页面
c
sc->nr_to_reclaim = SWAP_CLUSTER_MAX;
- 设置回收目标:本次回收希望回收32个页面
c
while (nr_active || nr_inactive) {
- 主回收循环:只要还有活跃或非活跃页面需要扫描就继续
nr_active || nr_inactive
: 任一不为零就继续循环
c
if (nr_active) {
- 检查是否需要扫描活跃页面
c
sc->nr_to_scan = min(nr_active,
(unsigned long)SWAP_CLUSTER_MAX);
- 计算本次扫描数量:
  - min(nr_active, (unsigned long)SWAP_CLUSTER_MAX):取剩余活跃页面数和32中的较小值
  - 确保单次扫描不超过32个页面(避免长时间持有锁)
c
nr_active -= sc->nr_to_scan;
- 更新剩余活跃页面数:减去本次要扫描的数量
c
refill_inactive_zone(zone, sc);
- 核心函数:补充非活跃zone :
- 扫描活跃链表,将符合条件的页面移动到非活跃链表
- 这是页面回收的第一步:将"热"页面降级为"冷"页面
c
if (nr_inactive) {
- 检查是否需要扫描非活跃页面
c
sc->nr_to_scan = min(nr_inactive,
(unsigned long)SWAP_CLUSTER_MAX);
- 计算非活跃页面扫描数量:同样的限制逻辑
c
nr_inactive -= sc->nr_to_scan;
- 更新剩余非活跃页面数
c
shrink_cache(zone, sc);
- 核心函数:收缩缓存 :
- 扫描非活跃链表,实际回收页面
- 可能将页面写回磁盘或直接释放
c
if (sc->nr_to_reclaim <= 0)
- 检查是否达到回收目标 :
sc->nr_to_reclaim
在回收过程中递减
c
break;
- 提前退出循环:如果已经回收了足够页面,立即退出
3. 双阶段回收策略
3.1. 阶段1: refill_inactive_zone
- 目的: 将活跃链表中"冷却"的页面移动到非活跃链表
- 策略: 基于页面访问频率和年龄
- 效果: 准备可回收的候选页面
3.2. 阶段2: shrink_cache
- 目的: 实际回收非活跃链表中的页面
- 动作: 写回脏页、释放干净页、交换匿名页
- 效果: 真正释放物理内存
移动到非活跃链表refill_inactive_zone
c
/*
 * refill_inactive_zone() - take up to sc->nr_to_scan pages off the zone's
 * active list and redistribute them between the inactive and active lists.
 *
 * The work is done in three phases:
 *   1. Under zone->lru_lock, detach pages from zone->active_list onto the
 *      private list l_hold.
 *   2. With the lock dropped, compute swap_tendency and sort l_hold into
 *      l_inactive and l_active.
 *   3. Under the lock again, move both private lists back onto the zone's
 *      lists, dropping the lock around each pagevec release.
 *
 * @zone: zone whose active list is scanned.
 * @sc:   scan control; nr_to_scan, nr_mapped and priority are read.
 */
static void
refill_inactive_zone(struct zone *zone, struct scan_control *sc)
{
int pgmoved;
int pgdeactivate = 0;
int pgscanned = 0;
int nr_pages = sc->nr_to_scan;
LIST_HEAD(l_hold); /* The pages which were snipped off */
LIST_HEAD(l_inactive); /* Pages to go onto the inactive_list */
LIST_HEAD(l_active); /* Pages to go onto the active_list */
struct page *page;
struct pagevec pvec;
int reclaim_mapped = 0;
long mapped_ratio;
long distress;
long swap_tendency;
/* Phase 1: detach up to nr_pages pages from the active list. */
lru_add_drain();
pgmoved = 0;
spin_lock_irq(&zone->lru_lock);
while (pgscanned < nr_pages && !list_empty(&zone->active_list)) {
page = lru_to_page(&zone->active_list);
prefetchw_prev_lru_page(page, &zone->active_list, flags);
if (!TestClearPageLRU(page))
BUG();
list_del(&page->lru);
if (get_page_testone(page)) {
/*
* It was already free! release_pages() or put_page()
* are about to remove it from the LRU and free it. So
* put the refcount back and put the page back on the
* LRU
*/
__put_page(page);
SetPageLRU(page);
list_add(&page->lru, &zone->active_list);
} else {
list_add(&page->lru, &l_hold);
pgmoved++;
}
/* Counted as scanned whether or not the page was taken. */
pgscanned++;
}
zone->pages_scanned += pgscanned;
zone->nr_active -= pgmoved;
spin_unlock_irq(&zone->lru_lock);
/* Phase 2: decide, without the lock, where each held page goes. */
/*
* `distress' is a measure of how much trouble we're having reclaiming
* pages. 0 -> no problems. 100 -> great trouble.
*/
distress = 100 >> zone->prev_priority;
/*
* The point of this algorithm is to decide when to start reclaiming
* mapped memory instead of just pagecache. Work out how much memory
* is mapped.
*/
mapped_ratio = (sc->nr_mapped * 100) / total_memory;
/*
* Now decide how much we really want to unmap some pages. The mapped
* ratio is downgraded - just because there's a lot of mapped memory
* doesn't necessarily mean that page reclaim isn't succeeding.
*
* The distress ratio is important - we don't want to start going oom.
*
* A 100% value of vm_swappiness overrides this algorithm altogether.
*/
swap_tendency = mapped_ratio / 2 + distress + vm_swappiness;
/*
* Now use this metric to decide whether to start moving mapped memory
* onto the inactive list.
*/
if (swap_tendency >= 100)
reclaim_mapped = 1;
/*
 * A mapped page stays active when mapped reclaim is off, when it is
 * anonymous and total_swap_pages is 0, or when page_referenced()
 * returns non-zero. Unmapped pages always go to l_inactive.
 */
while (!list_empty(&l_hold)) {
page = lru_to_page(&l_hold);
list_del(&page->lru);
if (page_mapped(page)) {
if (!reclaim_mapped ||
(total_swap_pages == 0 && PageAnon(page)) ||
page_referenced(page, 0, sc->priority <= 0)) {
list_add(&page->lru, &l_active);
continue;
}
}
list_add(&page->lru, &l_inactive);
}
/* Phase 3a: put the deactivated pages on the zone's inactive list. */
pagevec_init(&pvec, 1);
pgmoved = 0;
spin_lock_irq(&zone->lru_lock);
while (!list_empty(&l_inactive)) {
page = lru_to_page(&l_inactive);
prefetchw_prev_lru_page(page, &l_inactive, flags);
if (TestSetPageLRU(page))
BUG();
if (!TestClearPageActive(page))
BUG();
list_move(&page->lru, &zone->inactive_list);
pgmoved++;
/*
 * When pagevec_add() returns 0, flush the batch: fold the count
 * into the zone, then drop the lock around the release.
 */
if (!pagevec_add(&pvec, page)) {
zone->nr_inactive += pgmoved;
spin_unlock_irq(&zone->lru_lock);
pgdeactivate += pgmoved;
pgmoved = 0;
if (buffer_heads_over_limit)
pagevec_strip(&pvec);
__pagevec_release(&pvec);
spin_lock_irq(&zone->lru_lock);
}
}
/* Account for the pages moved since the last flush. */
zone->nr_inactive += pgmoved;
pgdeactivate += pgmoved;
/* pagevec_strip() is called with the LRU lock dropped. */
if (buffer_heads_over_limit) {
spin_unlock_irq(&zone->lru_lock);
pagevec_strip(&pvec);
spin_lock_irq(&zone->lru_lock);
}
/* Phase 3b: return the pages that stay active to the active list. */
pgmoved = 0;
while (!list_empty(&l_active)) {
page = lru_to_page(&l_active);
prefetchw_prev_lru_page(page, &l_active, flags);
if (TestSetPageLRU(page))
BUG();
BUG_ON(!PageActive(page));
list_move(&page->lru, &zone->active_list);
pgmoved++;
if (!pagevec_add(&pvec, page)) {
zone->nr_active += pgmoved;
pgmoved = 0;
spin_unlock_irq(&zone->lru_lock);
__pagevec_release(&pvec);
spin_lock_irq(&zone->lru_lock);
}
}
zone->nr_active += pgmoved;
spin_unlock_irq(&zone->lru_lock);
pagevec_release(&pvec);
/* Statistics: pages scanned here and pages moved to the inactive list. */
mod_page_state_zone(zone, pgrefill, pgscanned);
mod_page_state(pgdeactivate, pgdeactivate);
}
1. 函数功能
将页面从活跃链表移动到非活跃链表,这是页面回收的关键步骤。通过智能算法决定哪些活跃页面应该被"降级"到非活跃状态,为后续的实际回收做准备
2. 第一段:变量声明和初始化
c
static void
refill_inactive_zone(struct zone *zone, struct scan_control *sc)
{
int pgmoved;
int pgdeactivate = 0;
int pgscanned = 0;
int nr_pages = sc->nr_to_scan;
LIST_HEAD(l_hold); /* The pages which were snipped off */
LIST_HEAD(l_inactive); /* Pages to go onto the inactive_list */
LIST_HEAD(l_active); /* Pages to go onto the active_list */
struct page *page;
struct pagevec pvec;
int reclaim_mapped = 0;
long mapped_ratio;
long distress;
long swap_tendency;
变量说明:
- pgmoved:移动的页面计数
- pgdeactivate:停用页面计数(最终进入非活跃链表的页面)
- pgscanned:已扫描页面计数
- nr_pages:要扫描的总页面数
- l_hold:临时存放从活跃链表取下的页面
- l_inactive:将要放入非活跃链表的页面
- l_active:将要放回活跃链表的页面
- reclaim_mapped:是否回收映射页面的标志
- mapped_ratio:映射内存比例
- distress:内存压力程度
- swap_tendency:交换倾向性评分
3. 第二段:LRU准备和页面提取
c
lru_add_drain();
pgmoved = 0;
spin_lock_irq(&zone->lru_lock);
while (pgscanned < nr_pages && !list_empty(&zone->active_list)) {
page = lru_to_page(&zone->active_list);
prefetchw_prev_lru_page(page, &zone->active_list, flags);
if (!TestClearPageLRU(page))
BUG();
list_del(&page->lru);
if (get_page_testone(page)) {
/*
* It was already free! release_pages() or put_page()
* are about to remove it from the LRU and free it. So
* put the refcount back and put the page back on the
* LRU
*/
__put_page(page);
SetPageLRU(page);
list_add(&page->lru, &zone->active_list);
} else {
list_add(&page->lru, &l_hold);
pgmoved++;
}
pgscanned++;
}
zone->pages_scanned += pgscanned;
zone->nr_active -= pgmoved;
spin_unlock_irq(&zone->lru_lock);
这段代码的作用:从活跃链表中批量提取页面到临时链表
关键操作:
- lru_add_drain():清空LRU缓存,确保所有待添加页面已加入相应链表
- 循环通过lru_to_page()从活跃链表取页面(该宏取的是链表尾部,即最久没有被移动过的一端)
- get_page_testone(page):检查页面是否正在被释放
  - 如果正在释放,将页面放回活跃链表
  - 否则,加入临时链表l_hold
- 更新zone统计信息
4. 第三段:回收策略决策算法
c
/*
* `distress' is a measure of how much trouble we're having reclaiming
* pages. 0 -> no problems. 100 -> great trouble.
*/
distress = 100 >> zone->prev_priority;
/*
* The point of this algorithm is to decide when to start reclaiming
* mapped memory instead of just pagecache. Work out how much memory
* is mapped.
*/
mapped_ratio = (sc->nr_mapped * 100) / total_memory;
/*
* Now decide how much we really want to unmap some pages. The mapped
* ratio is downgraded - just because there's a lot of mapped memory
* doesn't necessarily mean that page reclaim isn't succeeding.
*
* The distress ratio is important - we don't want to start going oom.
*
* A 100% value of vm_swappiness overrides this algorithm altogether.
*/
swap_tendency = mapped_ratio / 2 + distress + vm_swappiness;
/*
* Now use this metric to decide whether to start moving mapped memory
* onto the inactive list.
*/
if (swap_tendency >= 100)
reclaim_mapped = 1;
决策算法详解:
- 内存压力计算:distress = 100 >> zone->prev_priority
  - 优先级越低(越激进),distress值越大;例如prev_priority为0时distress为100
- 映射内存比例:mapped_ratio = (sc->nr_mapped * 100) / total_memory
  - 计算被进程映射的内存占总内存的比例
- 交换倾向性:swap_tendency = mapped_ratio / 2 + distress + vm_swappiness
  - 综合三个因素:映射内存比例、内存压力、系统交换倾向设置
- 决策:如果swap_tendency >= 100,则设置reclaim_mapped = 1
  - 表示开始回收映射内存(进程的工作集)
5. 第四段:页面分类决策
c
while (!list_empty(&l_hold)) {
page = lru_to_page(&l_hold);
list_del(&page->lru);
if (page_mapped(page)) {
if (!reclaim_mapped ||
(total_swap_pages == 0 && PageAnon(page)) ||
page_referenced(page, 0, sc->priority <= 0)) {
list_add(&page->lru, &l_active);
continue;
}
}
list_add(&page->lru, &l_inactive);
}
页面分类逻辑:
对于每个临时页面:
- 如果是映射页面(page_mapped(page)):
  - 如果!reclaim_mapped(不回收映射页面),放回活跃链表
  - 如果没有交换空间且是匿名页面,放回活跃链表
  - 如果页面最近被访问(page_referenced),放回活跃链表
  - 否则,放入非活跃链表
- 如果是非映射页面(文件缓存),直接放入非活跃链表
6. 第五段:页面批量放回非活跃链表
c
pagevec_init(&pvec, 1);
pgmoved = 0;
spin_lock_irq(&zone->lru_lock);
while (!list_empty(&l_inactive)) {
page = lru_to_page(&l_inactive);
prefetchw_prev_lru_page(page, &l_inactive, flags);
if (TestSetPageLRU(page))
BUG();
if (!TestClearPageActive(page))
BUG();
list_move(&page->lru, &zone->inactive_list);
pgmoved++;
if (!pagevec_add(&pvec, page)) {
zone->nr_inactive += pgmoved;
spin_unlock_irq(&zone->lru_lock);
pgdeactivate += pgmoved;
pgmoved = 0;
if (buffer_heads_over_limit)
pagevec_strip(&pvec);
__pagevec_release(&pvec);
spin_lock_irq(&zone->lru_lock);
}
}
zone->nr_inactive += pgmoved;
pgdeactivate += pgmoved;
if (buffer_heads_over_limit) {
spin_unlock_irq(&zone->lru_lock);
pagevec_strip(&pvec);
spin_lock_irq(&zone->lru_lock);
}
操作流程:
- 初始化页面向量用于批量操作
- 将l_inactive中的页面移动到zone的非活跃链表
- 清除PG_active标志,设置PG_LRU标志
- 使用pagevec批量处理,提高效率
- 如果buffer头超过限制(buffer_heads_over_limit),调用pagevec_strip()进行特殊处理
7. 第六段:页面放回活跃链表和统计更新
c
pgmoved = 0;
while (!list_empty(&l_active)) {
page = lru_to_page(&l_active);
prefetchw_prev_lru_page(page, &l_active, flags);
if (TestSetPageLRU(page))
BUG();
BUG_ON(!PageActive(page));
list_move(&page->lru, &zone->active_list);
pgmoved++;
if (!pagevec_add(&pvec, page)) {
zone->nr_active += pgmoved;
pgmoved = 0;
spin_unlock_irq(&zone->lru_lock);
__pagevec_release(&pvec);
spin_lock_irq(&zone->lru_lock);
}
}
zone->nr_active += pgmoved;
spin_unlock_irq(&zone->lru_lock);
pagevec_release(&pvec);
mod_page_state_zone(zone, pgrefill, pgscanned);
mod_page_state(pgdeactivate, pgdeactivate);
}
最后阶段:
- 将l_active中的页面放回zone的活跃链表
- 确保这些页面保持PG_active标志
- 更新zone的活跃页面计数
- 更新内核统计信息:
  - pgrefill:页面补充统计
  - pgdeactivate:页面停用统计
从非活跃链表中回收页面shrink_cache
c
/*
 * shrink_cache() - pull batches of pages off the zone's inactive list, hand
 * them to shrink_list(), and put back whatever was not freed.
 *
 * @zone: zone whose inactive list is scanned.
 * @sc:   scan control; sc->nr_to_scan gives the scan budget and
 *        sc->nr_to_reclaim is decreased by the number of pages freed.
 *
 * zone->lru_lock is held while pages are detached from or returned to the
 * lists, and is dropped around shrink_list() and around each
 * __pagevec_release().
 */
static void shrink_cache(struct zone *zone, struct scan_control *sc)
{
LIST_HEAD(page_list);
struct pagevec pvec;
int max_scan = sc->nr_to_scan;
pagevec_init(&pvec, 1);
lru_add_drain();
spin_lock_irq(&zone->lru_lock);
while (max_scan > 0) {
struct page *page;
int nr_taken = 0;
int nr_scan = 0;
int nr_freed;
/*
 * Detach one batch onto page_list.
 * NOTE(review): nr_scan is post-incremented in the loop condition,
 * including on the test that ends the loop, so afterwards it is one
 * greater than the number of iterations performed.
 */
while (nr_scan++ < SWAP_CLUSTER_MAX &&
!list_empty(&zone->inactive_list)) {
page = lru_to_page(&zone->inactive_list);
prefetchw_prev_lru_page(page,
&zone->inactive_list, flags);
if (!TestClearPageLRU(page))
BUG();
list_del(&page->lru);
if (get_page_testone(page)) {
/*
* It is being freed elsewhere
*/
__put_page(page);
SetPageLRU(page);
list_add(&page->lru, &zone->inactive_list);
continue;
}
list_add(&page->lru, &page_list);
nr_taken++;
}
zone->nr_inactive -= nr_taken;
spin_unlock_irq(&zone->lru_lock);
/* The lock is already released here, so `done' skips the final unlock. */
if (nr_taken == 0)
goto done;
max_scan -= nr_scan;
/* Scan statistics are kept separately for kswapd and other callers. */
if (current_is_kswapd())
mod_page_state_zone(zone, pgscan_kswapd, nr_scan);
else
mod_page_state_zone(zone, pgscan_direct, nr_scan);
/* shrink_list() runs without the LRU lock held. */
nr_freed = shrink_list(&page_list, sc);
if (current_is_kswapd())
mod_page_state(kswapd_steal, nr_freed);
mod_page_state_zone(zone, pgsteal, nr_freed);
sc->nr_to_reclaim -= nr_freed;
spin_lock_irq(&zone->lru_lock);
/*
* Put back any unfreeable pages.
*/
while (!list_empty(&page_list)) {
page = lru_to_page(&page_list);
if (TestSetPageLRU(page))
BUG();
list_del(&page->lru);
/* Pages that have PageActive set are returned to the active list. */
if (PageActive(page))
add_page_to_active_list(zone, page);
else
add_page_to_inactive_list(zone, page);
if (!pagevec_add(&pvec, page)) {
spin_unlock_irq(&zone->lru_lock);
__pagevec_release(&pvec);
spin_lock_irq(&zone->lru_lock);
}
}
}
spin_unlock_irq(&zone->lru_lock);
done:
pagevec_release(&pvec);
}
1. 函数功能
从非活跃链表中回收页面,这是实际执行页面释放操作的地方。包括将脏页写回磁盘、交换匿名页面、释放干净页面等具体回收操作
2. 第一段:变量声明和初始化
c
static void shrink_cache(struct zone *zone, struct scan_control *sc)
{
LIST_HEAD(page_list);
struct pagevec pvec;
int max_scan = sc->nr_to_scan;
pagevec_init(&pvec, 1);
变量说明:
- page_list:临时链表,存放从非活跃链表取出的待处理页面
- pvec:页面向量,用于批量释放页面
- max_scan:最大扫描页面数,从扫描控制结构复制而来
- pagevec_init(&pvec, 1):初始化页面向量,参数1表示冷页面
3. 第二段:准备工作和锁获取
c
lru_add_drain();
spin_lock_irq(&zone->lru_lock);
准备工作:
- lru_add_drain():清空Per-CPU的LRU缓存,确保所有待添加页面已加入相应链表
- spin_lock_irq(&zone->lru_lock):获取zone的LRU锁并禁用中断,保护LRU链表操作
4. 第三段:主扫描循环
c
while (max_scan > 0) {
struct page *page;
int nr_taken = 0;
int nr_scan = 0;
int nr_freed;
主循环条件:max_scan > 0,还有页面需要扫描
局部变量:
- nr_taken:本次从非活跃链表取出的页面数
- nr_scan:本次扫描的页面计数器
- nr_freed:实际释放的页面数
5. 第四段:从非活跃链表提取页面
c
while (nr_scan++ < SWAP_CLUSTER_MAX &&
!list_empty(&zone->inactive_list)) {
page = lru_to_page(&zone->inactive_list);
prefetchw_prev_lru_page(page,
&zone->inactive_list, flags);
if (!TestClearPageLRU(page))
BUG();
list_del(&page->lru);
批量提取逻辑:
- 循环条件1:nr_scan++ < SWAP_CLUSTER_MAX,最多批量提取32个页面(注意:这里是后置自增,循环结束时的那次判断也会自增,所以循环结束后nr_scan比实际循环次数多1)
- 循环条件2:!list_empty(&zone->inactive_list),非活跃链表不为空
- lru_to_page(&zone->inactive_list):从链表尾部获取页面
- prefetchw_prev_lru_page():预取下一个要处理的页面,提高缓存性能
- TestClearPageLRU(page):原子地清除LRU标志,如果失败触发BUG
- list_del(&page->lru):从非活跃链表中删除页面
6. 第五段:页面引用检查
c
if (get_page_testone(page)) {
/*
* It is being freed elsewhere
*/
__put_page(page);
SetPageLRU(page);
list_add(&page->lru, &zone->inactive_list);
continue;
}
list_add(&page->lru, &page_list);
nr_taken++;
}
引用检查逻辑:
- get_page_testone(page):检查页面引用计数,判断页面是否正在被其他地方释放;如果是:
  - __put_page(page):减少引用计数
  - SetPageLRU(page):重新设置LRU标志
  - list_add(&page->lru, &zone->inactive_list):将页面放回非活跃链表
  - continue:跳过这个页面,处理下一个
- 否则:将页面加入临时链表page_list,增加nr_taken计数
7. 第六段:统计更新和实际回收
c
zone->nr_inactive -= nr_taken;
spin_unlock_irq(&zone->lru_lock);
if (nr_taken == 0)
goto done;
max_scan -= nr_scan;
if (current_is_kswapd())
mod_page_state_zone(zone, pgscan_kswapd, nr_scan);
else
mod_page_state_zone(zone, pgscan_direct, nr_scan);
统计更新:
- zone->nr_inactive -= nr_taken:更新zone的非活跃页面计数
- spin_unlock_irq(&zone->lru_lock):释放锁,允许其他操作
- 如果nr_taken == 0,跳转到done完成处理(此时锁已经释放)
- max_scan -= nr_scan:减少剩余扫描数量
- 根据当前进程是否是kswapd更新不同的统计信息(pgscan_kswapd或pgscan_direct)
8. 第七段:实际回收操作
c
nr_freed = shrink_list(&page_list, sc);
if (current_is_kswapd())
mod_page_state(kswapd_steal, nr_freed);
mod_page_state_zone(zone, pgsteal, nr_freed);
sc->nr_to_reclaim -= nr_freed;
核心回收:
- nr_freed = shrink_list(&page_list, sc):实际回收页面,返回释放的页面数
  - 这个函数内部处理页面的具体回收逻辑(写回、交换、释放)
- 更新回收统计信息:
  - kswapd_steal:kswapd回收的页面数
  - pgsteal:总的页面窃取数
- sc->nr_to_reclaim -= nr_freed:减少待回收页面目标
9. 第八段:未回收页面的处理
c
spin_lock_irq(&zone->lru_lock);
/*
* Put back any unfreeable pages.
*/
while (!list_empty(&page_list)) {
page = lru_to_page(&page_list);
if (TestSetPageLRU(page))
BUG();
list_del(&page->lru);
if (PageActive(page))
add_page_to_active_list(zone, page);
else
add_page_to_inactive_list(zone, page);
if (!pagevec_add(&pvec, page)) {
spin_unlock_irq(&zone->lru_lock);
__pagevec_release(&pvec);
spin_lock_irq(&zone->lru_lock);
}
}
}
未回收页面处理:
- 重新获取锁,处理未能回收的页面
- 遍历page_list中剩余的页面(未能被回收的)
- TestSetPageLRU(page):设置LRU标志,如果已设置则触发BUG
- 根据页面是否活跃,放回相应的链表:
  - PageActive(page):放回活跃链表
  - 否则:放回非活跃链表
- 使用pagevec批量操作提高效率
10. 第九段:清理工作
c
spin_unlock_irq(&zone->lru_lock);
done:
pagevec_release(&pvec);
}
收尾工作:
- spin_unlock_irq(&zone->lru_lock):最终释放锁
- done::标签,用于前面goto跳转
- pagevec_release(&pvec):释放页面向量中剩余的页面
实际执行页面回收shrink_list
c
/*
 * shrink_list() - try to free every page on @page_list.
 *
 * Each page is removed from @page_list and either freed or placed on a
 * private list of kept pages. The kept pages are spliced back onto
 * @page_list before returning, so on return the list holds exactly the
 * pages that were not freed.
 *
 * @page_list: pages to process.
 * @sc:        scan control; gfp_mask, priority and may_writepage are read,
 *             nr_scanned and nr_reclaimed are increased.
 *
 * Returns the number of pages freed.
 *
 * Exit labels used for each page:
 *   free_it         - unlock, count as reclaimed, queue for release
 *   activate_locked - set PageActive, then fall into keep_locked
 *   keep_locked     - unlock the page, then fall into keep
 *   keep            - add the page to the list of kept pages
 */
static int shrink_list(struct list_head *page_list, struct scan_control *sc)
{
LIST_HEAD(ret_pages);
struct pagevec freed_pvec;
int pgactivate = 0;
int reclaimed = 0;
cond_resched();
pagevec_init(&freed_pvec, 1);
while (!list_empty(page_list)) {
struct address_space *mapping;
struct page *page;
int may_enter_fs;
int referenced;
page = lru_to_page(page_list);
list_del(&page->lru);
/* Could not take the page lock: keep the page without touching it. */
if (TestSetPageLocked(page))
goto keep;
BUG_ON(PageActive(page));
if (PageWriteback(page))
goto keep_locked;
/* Only pages that were locked and are not under writeback are counted. */
sc->nr_scanned++;
/* Double the slab pressure for mapped and swapcache pages */
if (page_mapped(page) || PageSwapCache(page))
sc->nr_scanned++;
referenced = page_referenced(page, 1, sc->priority <= 0);
/* In active use or really unfreeable? Activate it. */
if (referenced && page_mapping_inuse(page))
goto activate_locked;
#ifdef CONFIG_SWAP
/*
* Anonymous process memory has backing store?
* Try to allocate it some swap space here.
*/
if (PageAnon(page) && !PageSwapCache(page)) {
if (!add_to_swap(page))
goto activate_locked;
}
#endif /* CONFIG_SWAP */
mapping = page_mapping(page);
/*
 * __GFP_FS always permits it; for swap-cache pages __GFP_IO is
 * enough.
 */
may_enter_fs = (sc->gfp_mask & __GFP_FS) ||
(PageSwapCache(page) && (sc->gfp_mask & __GFP_IO));
/*
* The page is mapped into the page tables of one or more
* processes. Try to unmap it here.
*/
if (page_mapped(page) && mapping) {
switch (try_to_unmap(page)) {
case SWAP_FAIL:
goto activate_locked;
case SWAP_AGAIN:
goto keep_locked;
case SWAP_SUCCESS:
; /* try to free the page below */
}
}
if (PageDirty(page)) {
/* A dirty page is kept, not written, in each of these three cases. */
if (referenced)
goto keep_locked;
if (!may_enter_fs)
goto keep_locked;
if (laptop_mode && !sc->may_writepage)
goto keep_locked;
/* Page is dirty, try to write it out here */
switch(pageout(page, mapping)) {
case PAGE_KEEP:
goto keep_locked;
case PAGE_ACTIVATE:
goto activate_locked;
case PAGE_SUCCESS:
/*
 * NOTE(review): jumping to keep (not keep_locked) and re-taking
 * the page lock below suggest that pageout() returns with the
 * page unlocked on PAGE_SUCCESS - confirm against pageout().
 */
if (PageWriteback(page) || PageDirty(page))
goto keep;
/*
* A synchronous write - probably a ramdisk. Go
* ahead and try to reclaim the page.
*/
if (TestSetPageLocked(page))
goto keep;
if (PageDirty(page) || PageWriteback(page))
goto keep_locked;
/* Re-read the mapping after re-locking the page. */
mapping = page_mapping(page);
/* falls through */
case PAGE_CLEAN:
; /* try to free the page below */
}
}
/*
* If the page has buffers, try to free the buffer mappings
* associated with this page. If we succeed we try to free
* the page as well.
*
* We do this even if the page is PageDirty().
* try_to_release_page() does not perform I/O, but it is
* possible for a page to have PageDirty set, but it is actually
* clean (all its buffers are clean). This happens if the
* buffers were written out directly, with submit_bh(). ext3
* will do this, as well as the blockdev mapping.
* try_to_release_page() will discover that cleanness and will
* drop the buffers and mark the page clean - it can be freed.
*
* Rarely, pages can have buffers and no ->mapping. These are
* the pages which were not successfully invalidated in
* truncate_complete_page(). We try to drop those buffers here
* and if that worked, and the page is no longer mapped into
* process address space (page_count == 1) it can be freed.
* Otherwise, leave the page on the LRU so it is swappable.
*/
if (PagePrivate(page)) {
if (!try_to_release_page(page, sc->gfp_mask))
goto activate_locked;
if (!mapping && page_count(page) == 1)
goto free_it;
}
if (!mapping)
goto keep_locked; /* truncate got there first */
spin_lock_irq(&mapping->tree_lock);
/*
* The non-racy check for busy page. It is critical to check
* PageDirty _after_ making sure that the page is freeable and
* not in use by anybody. (pagecache + us == 2)
*/
if (page_count(page) != 2 || PageDirty(page)) {
spin_unlock_irq(&mapping->tree_lock);
goto keep_locked;
}
#ifdef CONFIG_SWAP
if (PageSwapCache(page)) {
/* Save the swap entry before the page leaves the swap cache. */
swp_entry_t swap = { .val = page->private };
__delete_from_swap_cache(page);
spin_unlock_irq(&mapping->tree_lock);
swap_free(swap);
__put_page(page); /* The pagecache ref */
goto free_it;
}
#endif /* CONFIG_SWAP */
__remove_from_page_cache(page);
spin_unlock_irq(&mapping->tree_lock);
__put_page(page);
free_it:
unlock_page(page);
reclaimed++;
/* When pagevec_add() returns 0, release the queued pages now. */
if (!pagevec_add(&freed_pvec, page))
__pagevec_release_nonlru(&freed_pvec);
continue;
activate_locked:
SetPageActive(page);
pgactivate++;
keep_locked:
unlock_page(page);
keep:
list_add(&page->lru, &ret_pages);
BUG_ON(PageLRU(page));
}
/* Hand the pages that were not freed back to the caller. */
list_splice(&ret_pages, page_list);
if (pagevec_count(&freed_pvec))
__pagevec_release_nonlru(&freed_pvec);
mod_page_state(pgactivate, pgactivate);
sc->nr_reclaimed += reclaimed;
return reclaimed;
}
1. 函数功能
实际执行页面回收操作,包括解除映射、写回脏页、释放缓存页面等。这是页面回收管道中真正释放内存的地方
2. 第一段:变量声明和初始化
c
static int shrink_list(struct list_head *page_list, struct scan_control *sc)
{
LIST_HEAD(ret_pages);
struct pagevec freed_pvec;
int pgactivate = 0;
int reclaimed = 0;
cond_resched();
pagevec_init(&freed_pvec, 1);
变量说明:
- ret_pages:临时链表,存放未能回收需要返回的页面
- freed_pvec:页面向量,用于批量释放已回收的页面
- pgactivate:激活页面计数(从非活跃提升到活跃的页面)
- reclaimed:成功回收的页面计数
- cond_resched():在开始前让出CPU,避免长时间占用
- pagevec_init(&freed_pvec, 1):初始化页面向量用于批量释放
3. 第二段:主循环和页面锁定
c
while (!list_empty(page_list)) {
struct address_space *mapping;
struct page *page;
int may_enter_fs;
int referenced;
page = lru_to_page(page_list);
list_del(&page->lru);
if (TestSetPageLocked(page))
goto keep;
主循环:处理输入链表中的所有页面
页面锁定:
- TestSetPageLocked(page):尝试锁定页面,如果已被锁定则跳转到keep
- 页面锁定防止在回收过程中被其他操作修改
4. 第三段:基本状态检查
c
BUG_ON(PageActive(page));
if (PageWriteback(page))
goto keep_locked;
sc->nr_scanned++;
/* Double the slab pressure for mapped and swapcache pages */
if (page_mapped(page) || PageSwapCache(page))
sc->nr_scanned++;
状态检查:
- BUG_ON(PageActive(page)):确保页面不在活跃状态(应该是非活跃的)
- PageWriteback(page):如果页面正在写回,跳过回收
- 扫描计数:映射页面或交换缓存页面的sc->nr_scanned计数加倍;该计数随后传给shrink_slab(),因此相当于把slab的回收压力加倍(见代码注释)
5. 第四段:页面引用检查
c
referenced = page_referenced(page, 1, sc->priority <= 0);
/* In active use or really unfreeable? Activate it. */
if (referenced && page_mapping_inuse(page))
goto activate_locked;
引用检查:
- page_referenced(page, 1, sc->priority <= 0):检查页面是否最近被引用
- 如果被引用且映射还在使用中(page_mapping_inuse),激活页面(提升到活跃链表)
6. 第五段:匿名页面处理
c
#ifdef CONFIG_SWAP
/*
* Anonymous process memory has backing store?
* Try to allocate it some swap space here.
*/
if (PageAnon(page) && !PageSwapCache(page)) {
if (!add_to_swap(page))
goto activate_locked;
}
#endif /* CONFIG_SWAP */
匿名页面:
- 如果是匿名页面且不在交换缓存中,尝试分配交换空间
- 如果分配失败,激活页面(无法回收)
7. 第六段:映射页面解除映射
c
mapping = page_mapping(page);
may_enter_fs = (sc->gfp_mask & __GFP_FS) ||
(PageSwapCache(page) && (sc->gfp_mask & __GFP_IO));
/*
* The page is mapped into the page tables of one or more
* processes. Try to unmap it here.
*/
if (page_mapped(page) && mapping) {
switch (try_to_unmap(page)) {
case SWAP_FAIL:
goto activate_locked;
case SWAP_AGAIN:
goto keep_locked;
case SWAP_SUCCESS:
; /* try to free the page below */
}
}
解除映射:
- try_to_unmap(page):尝试从所有进程的页表中解除页面映射
- 三种结果:
  - SWAP_FAIL:解除失败,激活页面
  - SWAP_AGAIN:本次暂不回收,跳转到keep_locked(先解锁页面,再把它保留在返回链表中,留待下次重试)
  - SWAP_SUCCESS:成功解除映射,继续回收
8. 第七段:脏页写回处理
c
if (PageDirty(page)) {
if (referenced)
goto keep_locked;
if (!may_enter_fs)
goto keep_locked;
if (laptop_mode && !sc->may_writepage)
goto keep_locked;
/* Page is dirty, try to write it out here */
switch(pageout(page, mapping)) {
case PAGE_KEEP:
goto keep_locked;
case PAGE_ACTIVATE:
goto activate_locked;
case PAGE_SUCCESS:
if (PageWriteback(page) || PageDirty(page))
goto keep;
/*
* A synchronous write - probably a ramdisk. Go
* ahead and try to reclaim the page.
*/
if (TestSetPageLocked(page))
goto keep;
if (PageDirty(page) || PageWriteback(page))
goto keep_locked;
mapping = page_mapping(page);
case PAGE_CLEAN:
; /* try to free the page below */
}
}
脏页处理:
- 多种情况跳过写回:被引用、不允许文件系统操作、笔记本模式下尚未允许写页面(!sc->may_writepage)
- pageout(page, mapping):执行页面写回
- 四种结果:
  - PAGE_KEEP:保持页面
  - PAGE_ACTIVATE:激活页面
  - PAGE_SUCCESS:写回已提交;如果页面仍在写回中或仍为脏页,则保留页面(goto keep);只有同步写完成(例如ramdisk)时,才重新加锁并继续回收
  - PAGE_CLEAN:页面变干净,继续回收
9. 第八段:缓冲区页面处理
c
if (PagePrivate(page)) {
if (!try_to_release_page(page, sc->gfp_mask))
goto activate_locked;
if (!mapping && page_count(page) == 1)
goto free_it;
}
缓冲区页面:
- PagePrivate(page):页面有缓冲区(文件系统元数据)
- try_to_release_page():尝试释放缓冲区;释放失败则激活页面
- 如果没有映射且只有一个引用,可以直接释放
10. 第九段:页面缓存检查
c
if (!mapping)
goto keep_locked; /* truncate got there first */
spin_lock_irq(&mapping->tree_lock);
/*
* The non-racy check for busy page. It is critical to check
* PageDirty _after_ making sure that the page is freeable and
* not in use by anybody. (pagecache + us == 2)
*/
if (page_count(page) != 2 || PageDirty(page)) {
spin_unlock_irq(&mapping->tree_lock);
goto keep_locked;
}
页面缓存检查:
- 检查页面是否可释放:引用计数必须为2(页面缓存+当前回收)
- 页面必须干净(非脏页)
11. 第十段:交换缓存页面释放
c
#ifdef CONFIG_SWAP
if (PageSwapCache(page)) {
swp_entry_t swap = { .val = page->private };
__delete_from_swap_cache(page);
spin_unlock_irq(&mapping->tree_lock);
swap_free(swap);
__put_page(page); /* The pagecache ref */
goto free_it;
}
#endif /* CONFIG_SWAP */
交换缓存页面:
- 从交换缓存中删除页面
- 释放交换条目
- 减少页面缓存引用
12. 第十一段:普通页面缓存释放
c
__remove_from_page_cache(page);
spin_unlock_irq(&mapping->tree_lock);
__put_page(page);
free_it:
unlock_page(page);
reclaimed++;
if (!pagevec_add(&freed_pvec, page))
__pagevec_release_nonlru(&freed_pvec);
continue;
页面缓存释放:
- 从页面缓存中移除页面
- 减少引用计数
- 批量释放已回收的页面
13. 第十二段:失败处理和统计
c
activate_locked:
SetPageActive(page);
pgactivate++;
keep_locked:
unlock_page(page);
keep:
list_add(&page->lru, &ret_pages);
BUG_ON(PageLRU(page));
}
list_splice(&ret_pages, page_list);
if (pagevec_count(&freed_pvec))
__pagevec_release_nonlru(&freed_pvec);
mod_page_state(pgactivate, pgactivate);
sc->nr_reclaimed += reclaimed;
return reclaimed;
}
收尾工作:
- 将未能回收的页面返回原链表
- 批量释放已回收的页面
- 更新统计信息
收缩内核对象缓存shrink_slab
c
/*
 * shrink_slab - walk shrinker_list and ask each shrinker to scan objects.
 *
 * @scanned:   number of pages scanned by the caller; 0 is replaced by
 *             SWAP_CLUSTER_MAX so that some slab work is still requested.
 * @gfp_mask:  passed through unchanged to every shrinker callback.
 * @lru_pages: divisor (plus one) used to scale each shrinker's scan target.
 *
 * For every shrinker the scan target grows by
 *     (4 * scanned / seeks) * shrinker(0, gfp_mask) / (lru_pages + 1)
 * and is then consumed in chunks of SHRINK_BATCH.
 *
 * Returns 0 on every path, including when the lock could not be taken.
 */
static int shrink_slab(unsigned long scanned, unsigned int gfp_mask,
unsigned long lru_pages)
{
struct shrinker *shrinker;
/* No scan count reported by the caller: fall back to SWAP_CLUSTER_MAX. */
if (scanned == 0)
scanned = SWAP_CLUSTER_MAX;
/* Non-blocking attempt; if the read lock is unavailable, skip this pass. */
if (!down_read_trylock(&shrinker_rwsem))
return 0;
list_for_each_entry(shrinker, &shrinker_list, list) {
unsigned long long delta;
unsigned long total_scan;
/* A larger ->seeks value yields a smaller delta for this shrinker. */
delta = (4 * scanned) / shrinker->seeks;
/*
 * Callback invoked with a scan count of 0; presumably this only
 * queries the number of freeable objects - confirm against the
 * shrinker callback contract.
 */
delta *= (*shrinker->shrinker)(0, gfp_mask);
/* The +1 keeps the divisor non-zero when lru_pages is 0. */
do_div(delta, lru_pages + 1);
shrinker->nr += delta;
if (shrinker->nr < 0)
shrinker->nr = LONG_MAX; /* It wrapped! */
/* Take the whole accumulated target and reset the stored counter. */
total_scan = shrinker->nr;
shrinker->nr = 0;
/* Only full batches are issued; a smaller remainder is carried over. */
while (total_scan >= SHRINK_BATCH) {
long this_scan = SHRINK_BATCH;
int shrink_ret;
shrink_ret = (*shrinker->shrinker)(this_scan, gfp_mask);
/* -1 from the callback stops scanning this shrinker for this pass. */
if (shrink_ret == -1)
break;
mod_page_state(slabs_scanned, this_scan);
total_scan -= this_scan;
cond_resched();
}
/* Put back whatever was not scanned so the next call starts from it. */
shrinker->nr += total_scan;
}
up_read(&shrinker_rwsem);
return 0;
}
1. 函数功能
收缩内核对象缓存(slab缓存),通过调用所有注册的shrinker函数来回收内核数据结构使用的内存
2. 第一段:函数定义和初始检查
c
static int shrink_slab(unsigned long scanned, unsigned int gfp_mask,
unsigned long lru_pages)
{
struct shrinker *shrinker;
if (scanned == 0)
scanned = SWAP_CLUSTER_MAX;
参数说明:
- scanned:页面回收过程中扫描的页面数量,反映内存压力程度
- gfp_mask:分配标志,控制回收行为
- lru_pages:调用者统计的LRU页面总数(各zone的nr_active + nr_inactive之和),用于计算回收比例
初始检查:
- 如果scanned为0,设置为SWAP_CLUSTER_MAX(通常32)
- 确保即使没有页面扫描信息,也能执行一定程度的slab回收
3. 第二段:锁获取和遍历准备
c
if (!down_read_trylock(&shrinker_rwsem))
return 0;
list_for_each_entry(shrinker, &shrinker_list, list) {
锁机制:
- down_read_trylock(&shrinker_rwsem):尝试获取shrinker列表的读锁
- 如果获取失败(返回0),直接返回,不执行slab回收
- 使用trylock避免在锁争用时阻塞
- shrinker_rwsem:保护shrinker列表的读写信号量
遍历开始:
- list_for_each_entry(shrinker, &shrinker_list, list):遍历shrinker链表
- 每个内核子系统可以注册自己的shrinker来管理其缓存
4. 第三段:回收量计算算法
c
unsigned long long delta;
unsigned long total_scan;
delta = (4 * scanned) / shrinker->seeks;
delta *= (*shrinker->shrinker)(0, gfp_mask);
do_div(delta, lru_pages + 1);
shrinker->nr += delta;
if (shrinker->nr < 0)
shrinker->nr = LONG_MAX; /* It wrapped! */
回收量计算步骤:
- 基础增量: delta = (4 * scanned) / shrinker->seeks
  - scanned:反映内存压力,扫描越多压力越大
  - shrinker->seeks:该缓存的重建成本,值越大表示回收代价越高,因此得到的增量越小
  - 系数4:经验值,调整回收强度
- 乘以可回收对象数: delta *= (*shrinker->shrinker)(0, gfp_mask)
  - 调用shrinker函数,参数0表示只查询可回收数量,不实际回收
  - 获取该缓存中可回收的对象数量
- 按比例缩放: do_div(delta, lru_pages + 1)
  - 根据系统总LRU页面数缩放回收量
  - 系统内存越大,单次回收比例越小
  - +1防止除零
- 累计和边界检查:
  - shrinker->nr += delta:累计到该shrinker的待回收计数
  - 如果溢出(小于0),设置为LONG_MAX
5. 第四段:批量回收执行
c
total_scan = shrinker->nr;
shrinker->nr = 0;
while (total_scan >= SHRINK_BATCH) {
long this_scan = SHRINK_BATCH;
int shrink_ret;
shrink_ret = (*shrinker->shrinker)(this_scan, gfp_mask);
if (shrink_ret == -1)
break;
mod_page_state(slabs_scanned, this_scan);
total_scan -= this_scan;
cond_resched();
}
批量回收逻辑:
- 初始化: total_scan = shrinker->nr,然后清零shrinker->nr
  - 保存累计的待回收量,并重置计数器
- 批量循环: while (total_scan >= SHRINK_BATCH)
  - SHRINK_BATCH:批量大小(通常128),避免频繁调用shrinker
- 执行回收: shrink_ret = (*shrinker->shrinker)(this_scan, gfp_mask)
  - 实际调用shrinker函数回收指定数量的对象
  - 参数this_scan:本次要尝试回收的对象数量
- 错误检查: if (shrink_ret == -1) break
  - 如果shrinker返回-1,表示无法继续回收,提前退出
- 更新统计: mod_page_state(slabs_scanned, this_scan)
  - 更新slab扫描统计信息
- 调度机会: cond_resched()
  - 在长时间循环中让出CPU,避免饿死其他进程
6. 第五段:清理和返回
c
shrinker->nr += total_scan;
}
up_read(&shrinker_rwsem);
return 0;
}
收尾工作:
- 保存剩余量: shrinker->nr += total_scan
  - 将未处理完的回收量保存回shrinker,供下次使用
  - 实现渐进式回收,避免丢失回收进度
- 释放锁: up_read(&shrinker_rwsem)
  - 释放shrinker列表的读锁
- 返回: return 0
  - 总是返回0;实际回收的slab页数由调用者try_to_free_pages通过reclaim_state->reclaimed_slab读取
基于我们前面分析的所有函数,我来总结一个完整的页面回收工作流程图。
完整的内存回收工作流程图
否 是 是 否 是 否 是 否 是 否 是 否 是 否 是 否 是 否 是 否 内存分配失败 try_to_free_pages
直接内存回收入口 初始化优先级12->0循环 设置扫描控制参数 shrink_caches
协调各zone回收 遍历所有zone zone有物理内存? 跳过空zone 更新zone优先级 zone不可回收且非默认优先级? 跳过不可回收zone shrink_zone
zone级别回收 继续下一个zone 计算活跃/非活跃扫描量 活跃链表处理循环 refill_inactive_zone
活跃->非活跃 lru_add_drain
清空LRU缓存 从活跃链表提取页面 计算回收策略参数 distress=100>>prev_priority mapped_ratio=映射内存比例 swap_tendency=综合评分 swap_tendency>=100? reclaim_mapped=1 reclaim_mapped=0 页面分类决策 页面映射? 满足回收条件? 放入非活跃链表 放回活跃链表 批量放回非活跃链表 批量放回活跃链表 更新zone统计 非活跃链表处理循环 shrink_cache
实际回收页面 检查页面类型和状态 文件页面? 页面脏? 匿名页面交换 写回磁盘 直接释放 更新回收统计 回收目标达成? 提前退出 shrink_slab
slab缓存回收 获取shrinker锁 遍历所有shrinker 计算回收量delta 批量回收循环 调用shrinker函数 更新统计和调度 批量完成? 处理下一个shrinker 释放锁 回收页面>=32? 返回成功 继续下一优先级 内存分配成功
1. 关键函数职责总结
1.1. 顶层协调层
- try_to_free_pages(): 回收入口,管理优先级循环
- shrink_caches(): 协调各个zone的回收工作
1.2. Zone级别回收层
- shrink_zone(): 单个zone的回收调度,计算扫描量
- refill_inactive_zone(): 活跃→非活跃链表转换
- shrink_cache(): 实际回收非活跃链表中的页面
1.3. 页面处理层
- 页面分类: 映射 vs 非映射,文件 vs 匿名页面
- 回收策略: 基于访问频率、内存压力、交换成本
- 实际操作: 写回脏页、释放干净页、交换匿名页
1.4. Slab回收层
- shrink_slab(): 内核对象缓存回收调度
- Shrinker机制: 各子系统注册的缓存回收器
1.5. 辅助功能层
- lru_add_drain(): LRU缓存刷新
- 统计更新: 各种页面和回收统计
2. 回收策略决策矩阵
页面类型 | 映射状态 | 回收策略 | 成本 |
---|---|---|---|
文件页面 | 未映射 | 直接释放 | 低 |
文件页面 | 已映射 | 谨慎回收 | 中 |
匿名页面 | 未映射 | 交换释放 | 中 |
匿名页面 | 已映射 | 避免回收 | 高 |
3. 终止条件
- 成功: 某一优先级的单轮回收 ≥ 32页 (SWAP_CLUSTER_MAX),立即返回1
- 失败: 所有优先级循环完成后仍未达到目标,返回0
- 失败时如果gfp_mask包含__GFP_FS且不包含__GFP_NORETRY,则触发OOM Killer (最终手段)