Sched ext回调3——select_cpu（linux 6.15.7）

struct sched_ext_ops代表了一个调度器，里面定义了很多回调函数，本文分析select_cpu这个hook，sched ext的部分hook如下：

一、唤醒task时，执行select_cpu的调用链

wake_up_process

└─ try_to_wake_up

├─(场景A) p == current 特殊处理

| └─ ttwu_do_wakeup

| └─ WRITE_ONCE(p->__state, TASK_RUNNING)，继续执行p，退出

| (场景B)

├─ select_task_rq，返回cpu

| ├─ cpu = p->sched_class->select_task_rq ← 即 select_task_rq_scx

| | ├─(场景1) SCX_CALL_OP_TASK_RET(..., select_cpu ← 即 select_cpu 回调

| | └─(场景2) scx_select_cpu_dfl （如果sched ext没有定义select_cpu）

├─ ttwu_queue(p, cpu, wake_flags)

| ├─ rq = cpu_rq(cpu) 参数cpu是前面select_cpu返回的

| └─ ttwu_do_activate(rq, p, wake_flags, &rf) 将task放入到cpu代表的队列中

| ├─ activate_task ，将task加入到调度器自己的queue

| | ├─ enqueue_task

| | | └─ p->sched_class->enqueue_task ← 即 enqueue_task_scx

| | | ├─ p->scx.flags |= SCX_TASK_QUEUED

| | | ├─ SCX_CALL_OP_TASK(..., runnable ← 即 runnable 回调

| | | └─ do_enqueue_task

| | ├─ p不允许migration，goto local

| | ├─ sched ext无enqueue回调, goto global

| | ├─ p->scx.ddsp_dsq_id != SCX_DSQ_INVALID, goto direct

| | ├─ SCX_CALL_OP_TASK(..., enqueue ← 即 enqueue 回调

| | ├─ p->scx.ddsp_dsq_id != SCX_DSQ_INVALID, goto direct

| | direct:├─ direct_dispatch(p, enq_flags)

| | local:├─ dispatch_enqueue(&rq->scx.local_dsq, p, enq_flags)

| | global:├─ dispatch_enqueue(find_global_dsq(p), p, enq_flags)

| |

| └─ WRITE_ONCE(p->on_rq, TASK_ON_RQ_QUEUED)

├─ wakeup_preempt，enqueue的task是否抢占正在运行的task

| ├─(场景1)donor->sched_class->wakeup_preempt，task属于同一个调度类

| └─(场景2)resched_curr(rq)，enqueue的task调度类更高

├─ ttwu_do_wakeup，设置task的状态为running

└─ WRITE_ONCE(p->__state, TASK_RUNNING)

二、exec时，执行select_cpu的调用链

SYSCALL_DEFINE3(execve, ...

└─ do_execve

└─ do_execveat_common

├─ alloc_bprm

├─ copy_string_kernel(bprm->filename, bprm)

├─ copy_strings(bprm->envc, envp, bprm)

├─ copy_strings(bprm->argc, argv, bprm)

└─ bprm_execve

├─ sched_exec

| ├─ p->sched_class->select_task_rq(p, task_cpu(p), WF_EXEC) ← 注1

| | ├─(场景1) SCX_CALL_OP_TASK_RET(..., select_cpu ← 即select_cpu

| | └─(场景2) scx_select_cpu_dfl （如果sched ext没有定义select_cpu）

| |

| ├─ arg = (struct migration_arg){ p, dest_cpu }

| ├─ stop_one_cpu(task_cpu(p), migration_cpu_stop, &arg) ← 注2

├─ exec_binprm

注1：对于sched ext，p->sched_class->select_task_rq即select_task_rq_scx；另外第2个参数task_cpu(p)，是execve()发生时task当前所在的CPU。

注2：由于exec_binprm之后，地址空间、代码、数据都会改变，cache / TLB / 内存局部性几乎全部失效，这是一个"迁移成本极低"的好时机。如果注1处选择的cpu不同于当前cpu，则立即做迁移。

三、唤醒流程中，select cpu用到的p->wake_cpu

try_to_wake_up --> select_task_rq(p, p->wake_cpu, &wake_flags):

复制代码

int try_to_wake_up(struct task_struct *p, unsigned int state, int wake_flags)
{

	cpu = select_task_rq(p, p->wake_cpu, &wake_flags);
	if (task_cpu(p) != cpu) {
		if (p->in_iowait) {
			delayacct_blkio_end(p);
			atomic_dec(&task_rq(p)->nr_iowait);
		}

		wake_flags |= WF_MIGRATED;
		psi_ttwu_dequeue(p);
		set_task_cpu(p, cpu);
	}

}

p->wake_cpu告诉调度器，希望任务醒来时"应该尽量考虑这个 CPU"，但是否真的使用这个CPU，由调度器根据NUMA、能耗、负载、绑定关系等策略决定。细节参考各调度类的选核函数：select_task_rq_fair、select_task_rq_scx、select_task_rq_idle等等。

p->wake_cpu在commit ac66f5477239ebd3c4e2cbf2f591ef387aa09884中引入，用于处理numa balance迁移task时task不在运行队列中（!p->on_rq）的情况：此时不立即迁移，而是把迁移延迟到唤醒时刻，这样既保证了迁移交换操作的原子性和一致性，又避免了复杂的锁竞争和状态管理。wake_cpu字段记录该任务应该被迁移到的目标CPU，try_to_wake_up --> select_task_rq会把它作为prev_cpu参数传给选核函数，优先考虑选择该cpu。

task_numa_migrate --> migrate_swap ：

复制代码

int migrate_swap(struct task_struct *cur, struct task_struct *p,
		int target_cpu, int curr_cpu)
{

	arg = (struct migration_swap_arg){
		.src_task = cur,
		.src_cpu = curr_cpu,
		.dst_task = p,
		.dst_cpu = target_cpu,
	};

	ret = stop_two_cpus(arg.dst_cpu, arg.src_cpu, migrate_swap_stop, &arg);
}

static int migrate_swap_stop(void *data)
{
	struct migration_swap_arg *arg = data; <== migrate_swap -> stop_two_cpus的arg参数

	__migrate_swap_task(arg->src_task, arg->dst_cpu);
	__migrate_swap_task(arg->dst_task, arg->src_cpu);

	return 0;
}

static void __migrate_swap_task(struct task_struct *p, int cpu)
{
	if (task_on_rq_queued(p)) {
		......

	} else {
		/*
		 * Task isn't running anymore; make it appear like we migrated
		 * it before it went to sleep. This means on wakeup we make the
		 * previous CPU our target instead of where it really is.
		 */
		p->wake_cpu = cpu; <== task不在运行队列中，设置wake_cpu，try_to_wake_up优先选择该cpu
	}
}

四、p->wake_cpu在sched ext中的使用

select_task_rq_scx代码：

cpp 复制代码

static int select_task_rq_scx(struct task_struct *p, int prev_cpu, int wake_flags)
{
    /*
	 * sched_exec() calls with %WF_EXEC when @p is about to exec(2) as it
	 * can be a good migration opportunity with low cache and memory
	 * footprint. Returning a CPU different than @prev_cpu triggers
	 * immediate rq migration. However, for SCX, as the current rq
	 * association doesn't dictate where the task is going to run, this
	 * doesn't fit well. If necessary, we can later add a dedicated method
	 * which can decide to preempt self to force it through the regular
	 * scheduling path.
	 */

	if (unlikely(wake_flags & WF_EXEC)) <== 注3
		return prev_cpu;

	if (SCX_HAS_OP(select_cpu) && !rq_bypass) {
		cpu = SCX_CALL_OP_TASK_RET(SCX_KF_ENQUEUE | SCX_KF_SELECT_CPU,
					   select_cpu, NULL, p, prev_cpu, wake_flags); <== 注4
		p->scx.selected_cpu = cpu;
		*ddsp_taskp = NULL;
		if (ops_cpu_valid(cpu, "from ops.select_cpu()"))
			return cpu;
		else
			return prev_cpu;
	} else {

		cpu = scx_select_cpu_dfl(p, prev_cpu, wake_flags, 0);

		return cpu;
	}
}

注3：kernel_execve -> bprm_execve -> sched_exec -> p->sched_class->select_task_rq(p, task_cpu(p), WF_EXEC)，传递的是WF_EXEC标记，select_task_rq_scx检测到该标记，会直接返回prev_cpu。根据注释，exec时，如果select_task_rq返回一个不同于当前cpu的cpu，会立即迁移task到新cpu上（参考第二节），但对于sched ext调度器来说，返回的cpu并不代表task就会在该cpu上运行，dispatch阶段可以将task放到任何cpu上执行，所以立即迁过去没有意义。

注4：执行sched ext的select_cpu回调。

下面代码中的prev_cpu就是p->wake_cpu。

cpp 复制代码

SCX_CALL_OP_TASK_RET(SCX_KF_ENQUEUE | SCX_KF_SELECT_CPU,
        select_cpu, NULL, p, prev_cpu, wake_flags)

try_to_wake_up --> select_task_rq(p, p->wake_cpu, &wake_flags)

所以怎么使用p->wake_cpu是由用户实现的select_cpu回调决定的。

五、select_cpu的注释说明

cpp 复制代码

struct sched_ext_ops {
	/**
	 * @select_cpu: Pick the target CPU for a task which is being woken up
	 * @p: task being woken up
	 * @prev_cpu: the cpu @p was on before sleeping
	 * @wake_flags: SCX_WAKE_*
	 *
	 * Decision made here isn't final. @p may be moved to any CPU while it
	 * is getting dispatched for execution later. However, as @p is not on
	 * the rq at this point, getting the eventual execution CPU right here
	 * saves a small bit of overhead down the line. 
        <== 注5
	 *
	 * If an idle CPU is returned, the CPU is kicked and will try to
	 * dispatch. While an explicit custom mechanism can be added,
	 * select_cpu() serves as the default way to wake up idle CPUs. 
       <== 注6
	 *
	 * @p may be inserted into a DSQ directly by calling
	 * scx_bpf_dsq_insert(). If so, the ops.enqueue() will be skipped.
	 * Directly inserting into %SCX_DSQ_LOCAL will put @p in the local DSQ
	 * of the CPU returned by this operation.
        <== 注7
	 *
	 * Note that select_cpu() is never called for tasks that can only run
	 * on a single CPU or tasks with migration disabled, as they don't have
	 * the option to select a different CPU. See select_task_rq() for
	 * details.
        <== 注8
	 */
	s32 (*select_cpu)(struct task_struct *p, s32 prev_cpu, u64 wake_flags);

注5：当任务被唤醒时，select_cpu为即将唤醒的任务选择目标cpu，但task最终不一定运行在该cpu，因为dispatch可将task放到任意cpu上执行。

注6：内核默认行为是，如果返回的cpu是idle的，sched ext会立即kick这个cpu，让cpu退出idle状态，尝试从其本地或全局运行队列中取出任务执行。select_cpu是唤醒idle cpu的默认方式：只要select_cpu回调返回一个idle cpu，内核就会自动处理唤醒逻辑。

用户也可以在eBPF程序中（比如enqueue回调中），调用scx_bpf_kick_cpu主动kick cpu。

注7：如果用户在select_cpu回调中调用scx_bpf_dsq_insert将task直接加入dsq，那么该task的enqueue回调就不会再执行了。如果直接插入的是SCX_DSQ_LOCAL，task会被放入select_cpu返回的那个cpu的本地dsq。

注8：如果task的cpumask只允许它运行在一个cpu上，或者task被禁止迁移（migration disabled），select_cpu不会被执行。因为这个task没有机会运行在其他cpu上，所以就不需要select_cpu了，详见select_task_rq()。