struct sched_ext_ops代表了一个调度器,里面定义了很多回调函数,本文分析select_cpu这个hook,sched ext的部分hook如下:

一、唤醒task时,执行select_cpu的调用链
wake_up_process
└─ try_to_wake_up
├─(场景A) p == current 特殊处理
| └─ ttwu_do_wakeup
| └─ WRITE_ONCE(p->__state, TASK_RUNNING),继续执行p,退出
| (场景B)
├─ select_task_rq,返回cpu
| ├─ cpu = p->sched_class->select_task_rq ← 即 select_task_rq_scx
| | ├─(场景1) SCX_CALL_OP_TASK_RET(..., select_cpu ← 即 select_cpu 回调
| | └─(场景2) scx_select_cpu_dfl (如果sched ext没有定义select_cpu)
|
├─ ttwu_queue(p, cpu, wake_flags)
| ├─ rq = cpu_rq(cpu) 参数cpu是前面select_cpu返回的
| └─ ttwu_do_activate(rq, p, wake_flags, &rf) 将task放入到cpu代表的队列中
| ├─ activate_task ,将task加入到调度器自己的queue
| | ├─ enqueue_task
| | | └─ p->sched_class->enqueue_task ← 即 enqueue_task_scx
| | | ├─ p->scx.flags |= SCX_TASK_QUEUED
| | | ├─ SCX_CALL_OP_TASK(..., runnable ← 即 runnable 回调
| | | └─ do_enqueue_task
| | ├─ p不允许migration,goto local
| | ├─ sched ext无enqueue回调, goto global
| | ├─ p->scx.ddsp_dsq_id != SCX_DSQ_INVALID, goto direct
| | ├─ SCX_CALL_OP_TASK(..., enqueue ← 即 enqueue 回调
| | ├─ p->scx.ddsp_dsq_id != SCX_DSQ_INVALID, goto direct
| | direct:├─ direct_dispatch(p, enq_flags)
| | local:├─ dispatch_enqueue(&rq->scx.local_dsq, p, enq_flags)
| | global:├─ dispatch_enqueue(find_global_dsq(p), p, enq_flags)
| |
| └─ WRITE_ONCE(p->on_rq, TASK_ON_RQ_QUEUED)
|
├─ wakeup_preempt,enqueue的task是否抢占正在运行的task
| ├─(场景1)donor->sched_class->wakeup_preempt,task属于同一个调度类
| └─(场景2)resched_curr(rq),enqueue的task调度类更高
|
├─ ttwu_do_wakeup,设置task的状态为running
└─ WRITE_ONCE(p->__state, TASK_RUNNING)
二、exec时,执行select_cpu的调用链
SYSCALL_DEFINE3(execve, ...
└─ do_execve
└─ do_execveat_common
├─ alloc_bprm
├─ copy_string_kernel(bprm->filename, bprm)
├─ copy_strings(bprm->envc, envp, bprm)
├─ copy_strings(bprm->argc, argv, bprm)
└─ bprm_execve
├─ sched_exec
| ├─ p->sched_class->select_task_rq(p, task_cpu(p), WF_EXEC) ← 注1
| | ├─(场景1) SCX_CALL_OP_TASK_RET(..., select_cpu ← 即select_cpu
| | └─(场景2) scx_select_cpu_dfl (如果sched ext没有定义select_cpu)
| |
| ├─ arg = (struct migration_arg){ p, dest_cpu }
| ├─ stop_one_cpu(task_cpu(p), migration_cpu_stop, &arg) ← 注2
|
├─ exec_binprm
注1:对于sched ext,p->sched_class->select_task_rq即 select_task_rq_scx,另外第2个参数task_cpu(p),是execve() 发生时当前正在运行的 CPU。
注2:由于exec_binprm之后,地址空间、代码、数据都会变,cache / TLB / 内存局部性几乎全部失效,这是一个"迁移成本极低"的好时机。如果注1处选出的cpu不同于当前cpu,就立即做迁移。
三、唤醒流程中,select cpu用到的p->wake_cpu
try_to_wake_up --> select_task_rq(p, p->wake_cpu, &wake_flags):
int try_to_wake_up(struct task_struct *p, unsigned int state, int wake_flags)
{
cpu = select_task_rq(p, p->wake_cpu, &wake_flags);
if (task_cpu(p) != cpu) {
if (p->in_iowait) {
delayacct_blkio_end(p);
atomic_dec(&task_rq(p)->nr_iowait);
}
wake_flags |= WF_MIGRATED;
psi_ttwu_dequeue(p);
set_task_cpu(p, cpu);
}
}
p->wake_cpu告诉调度器,希望任务醒来时"应该尽量考虑这个 CPU",但是否真的使用这个CPU,由调度器根据NUMA、能耗、负载、绑定关系等策略决定。细节参考各调度类的选核函数:select_task_rq_fair、select_task_rq_scx、select_task_rq_idle等等。
p->wake_cpu在commit ac66f5477239ebd3c4e2cbf2f591ef387aa09884中引入,用于解决numa balance迁移task时,task不在运行队列中(!p->on_rq)的问题:此时不立即迁移,而是把迁移延迟到唤醒时刻,这样既保证了迁移交换操作的原子性和一致性,又避免了复杂的锁竞争和状态管理。wake_cpu字段记录该任务应该被迁移到的目标CPU,try_to_wake_up --> select_task_rq会优先考虑选择该cpu。
task_numa_migrate --> migrate_swap :
int migrate_swap(struct task_struct *cur, struct task_struct *p,
int target_cpu, int curr_cpu)
{
arg = (struct migration_swap_arg){
.src_task = cur,
.src_cpu = curr_cpu,
.dst_task = p,
.dst_cpu = target_cpu,
};
ret = stop_two_cpus(arg.dst_cpu, arg.src_cpu, migrate_swap_stop, &arg);
}
static int migrate_swap_stop(void *data)
{
struct migration_swap_arg *arg = data; <== migrate_swap -> stop_two_cpus的arg参数
__migrate_swap_task(arg->src_task, arg->dst_cpu);
__migrate_swap_task(arg->dst_task, arg->src_cpu);
return 0;
}
static void __migrate_swap_task(struct task_struct *p, int cpu)
{
if (task_on_rq_queued(p)) {
......
} else {
/*
* Task isn't running anymore; make it appear like we migrated
* it before it went to sleep. This means on wakeup we make the
* previous CPU our target instead of where it really is.
*/
p->wake_cpu = cpu; <== task不在运行队列中,设置wake_cpu,try_to_wake_up优先选择该cpu
}
}
四、p->wake_cpu在sched ext中的使用
select_task_rq_scx代码:
cpp
static int select_task_rq_scx(struct task_struct *p, int prev_cpu, int wake_flags)
{
/*
* sched_exec() calls with %WF_EXEC when @p is about to exec(2) as it
* can be a good migration opportunity with low cache and memory
* footprint. Returning a CPU different than @prev_cpu triggers
* immediate rq migration. However, for SCX, as the current rq
* association doesn't dictate where the task is going to run, this
* doesn't fit well. If necessary, we can later add a dedicated method
* which can decide to preempt self to force it through the regular
* scheduling path.
*/
if (unlikely(wake_flags & WF_EXEC)) <== 注3
return prev_cpu;
if (SCX_HAS_OP(select_cpu) && !rq_bypass) {
cpu = SCX_CALL_OP_TASK_RET(SCX_KF_ENQUEUE | SCX_KF_SELECT_CPU,
select_cpu, NULL, p, prev_cpu, wake_flags); <== 注4
p->scx.selected_cpu = cpu;
*ddsp_taskp = NULL;
if (ops_cpu_valid(cpu, "from ops.select_cpu()"))
return cpu;
else
return prev_cpu;
} else {
cpu = scx_select_cpu_dfl(p, prev_cpu, wake_flags, 0);
return cpu;
}
}
注3:kernel_execve -> bprm_execve -> sched_exec -> p->sched_class->select_task_rq(p, task_cpu(p), WF_EXEC),传递的是WF_EXEC标记,select_task_rq_scx检测到该标记,会直接返回prev_cpu。根据注释,exec时,如果select_task_rq返回一个不同于当前cpu的cpu,会立即迁移task到新cpu上(参考第二节),但对于sched ext调度器来说,返回的cpu并不代表task就会在该cpu上运行,dispatch阶段可以将task放到任何cpu上执行,所以立即迁过去没有意义。
注4:执行sched ext的select_cpu回调。
下面代码中的prev_cpu就是p->wake_cpu。
cpp
SCX_CALL_OP_TASK_RET(SCX_KF_ENQUEUE | SCX_KF_SELECT_CPU,
select_cpu, NULL, p, prev_cpu, wake_flags)
try_to_wake_up --> select_task_rq(p, p->wake_cpu, &wake_flags)
所以怎么使用p->wake_cpu是由用户实现的select_cpu回调决定的。
五、select_cpu的注释说明
cpp
struct sched_ext_ops {
/**
* @select_cpu: Pick the target CPU for a task which is being woken up
* @p: task being woken up
* @prev_cpu: the cpu @p was on before sleeping
* @wake_flags: SCX_WAKE_*
*
* Decision made here isn't final. @p may be moved to any CPU while it
* is getting dispatched for execution later. However, as @p is not on
* the rq at this point, getting the eventual execution CPU right here
* saves a small bit of overhead down the line.
<== 注5
*
* If an idle CPU is returned, the CPU is kicked and will try to
* dispatch. While an explicit custom mechanism can be added,
* select_cpu() serves as the default way to wake up idle CPUs.
<== 注6
*
* @p may be inserted into a DSQ directly by calling
* scx_bpf_dsq_insert(). If so, the ops.enqueue() will be skipped.
* Directly inserting into %SCX_DSQ_LOCAL will put @p in the local DSQ
* of the CPU returned by this operation.
<== 注7
*
* Note that select_cpu() is never called for tasks that can only run
* on a single CPU or tasks with migration disabled, as they don't have
* the option to select a different CPU. See select_task_rq() for
* details.
<== 注8
*/
s32 (*select_cpu)(struct task_struct *p, s32 prev_cpu, u64 wake_flags);
注5:当任务被唤醒时,select_cpu为即将唤醒的任务选择目标cpu,但task最终不一定运行在该cpu,因为dispatch可将task放到任意cpu上执行。
注6:内核默认行为是,如果返回的cpu是idle的,sched ext会立即kick这个cpu,让cpu退出idle状态,尝试从其本地或全局运行队列中取出任务执行。select_cpu是唤醒idle cpu的标准且推荐方式。只要select_cpu回调返回一个idle cpu,内核就会自动处理唤醒逻辑。
用户也可以在eBPF程序中(比如enqueue回调中),调用scx_bpf_kick_cpu主动kick cpu。
注7:如果用户在select_cpu回调中调用scx_bpf_dsq_insert将task直接加入dsq,那么enqueue回调会被跳过,不会再执行。如果直接插入的是SCX_DSQ_LOCAL,task会被放入select_cpu返回的那个cpu的local dsq。
注8:如果task的cpumask只允许它运行在一个cpu上,或者task被禁止迁移(migration disabled),select_cpu不会被执行。因为这个task没有机会运行在其他cpu上,所以就不需要select_cpu了,细节见select_task_rq。