Sched ext回调3——select_cpu(linux 6.15.7)

struct sched_ext_ops代表了一个调度器,里面定义了很多回调函数,本文分析select_cpu这个hook,sched ext的部分hook如下:

一、唤醒task时,执行select_cpu的调用链

wake_up_process

└─ try_to_wake_up

├─(场景A) p == current 特殊处理

| └─ ttwu_do_wakeup

| └─ WRITE_ONCE(p->__state, TASK_RUNNING),继续执行p,退出

| (场景B)

├─ select_task_rq,返回cpu

| ├─ cpu = p->sched_class->select_task_rq ← 即 select_task_rq_scx

| | ├─(场景1) SCX_CALL_OP_TASK_RET(..., select_cpu ← 即 select_cpu 回调

| | └─(场景2) scx_select_cpu_dfl (如果sched ext没有定义select_cpu)

|

├─ ttwu_queue(p, cpu, wake_flags)

| ├─ rq = cpu_rq(cpu) 参数cpu是前面select_cpu返回的

| └─ ttwu_do_activate(rq, p, wake_flags, &rf) 将task放入到cpu代表的队列中

| ├─ activate_task ,将task加入到调度器自己的queue

| | ├─ enqueue_task

| | | └─ p->sched_class->enqueue_task ← 即 enqueue_task_scx

| | | ├─ p->scx.flags |= SCX_TASK_QUEUED

| | | ├─ SCX_CALL_OP_TASK(..., runnable ← 即 runnable 回调

| | | └─ do_enqueue_task

| | ├─ p不允许migration,goto local

| | ├─ sched ext无enqueue回调, goto global

| | ├─ p->scx.ddsp_dsq_id != SCX_DSQ_INVALID, goto direct

| | ├─ SCX_CALL_OP_TASK(..., enqueue ← 即 enqueue 回调

| | ├─ p->scx.ddsp_dsq_id != SCX_DSQ_INVALID, goto direct

| | direct:├─ direct_dispatch(p, enq_flags)

| | local:├─ dispatch_enqueue(&rq->scx.local_dsq, p, enq_flags)

| | global:├─ dispatch_enqueue(find_global_dsq(p), p, enq_flags)

| |

| └─ WRITE_ONCE(p->on_rq, TASK_ON_RQ_QUEUED)

|

├─ wakeup_preempt,enqueue的task是否抢占正在运行的task

| ├─(场景1)donor->sched_class->wakeup_preempt,task属于同一个调度类

| └─(场景2)resched_curr(rq),enqueue的task调度类更高

|

├─ ttwu_do_wakeup,设置task的状态为running

└─ WRITE_ONCE(p->__state, TASK_RUNNING)

二、exec时,执行select_cpu的调用链

SYSCALL_DEFINE3(execve, ...

└─ do_execve

└─ do_execveat_common

├─ alloc_bprm

├─ copy_string_kernel(bprm->filename, bprm)

├─ copy_strings(bprm->envc, envp, bprm)

├─ copy_strings(bprm->argc, argv, bprm)

└─ bprm_execve

├─ sched_exec

| ├─ p->sched_class->select_task_rq(p, task_cpu(p), WF_EXEC) ← 注1

| | ├─(场景1) SCX_CALL_OP_TASK_RET(..., select_cpu ← 即select_cpu

| | └─(场景2) scx_select_cpu_dfl (如果sched ext没有定义select_cpu)

| |

| ├─ arg = (struct migration_arg){ p, dest_cpu }

| ├─ stop_one_cpu(task_cpu(p), migration_cpu_stop, &arg) ← 注2

|

├─ exec_binprm

注1:对于sched ext,p->sched_class->select_task_rq即 select_task_rq_scx,另外第2个参数task_cpu(p),是execve() 发生时当前正在运行的 CPU。

注2:由于exec_binprm之后,地址空间、代码、数据都会变,cache / TLB / 内存局部性几乎全部失效,所以这是一个"迁移成本极低"的好时机。如果注1处选择的cpu不同于当前cpu,就立即做迁移。

三、唤醒流程中,select cpu用到的p->wake_cpu

try_to_wake_up --> select_task_rq(p, p->wake_cpu, &wake_flags):

复制代码
int try_to_wake_up(struct task_struct *p, unsigned int state, int wake_flags)
{

	cpu = select_task_rq(p, p->wake_cpu, &wake_flags);
	if (task_cpu(p) != cpu) {
		if (p->in_iowait) {
			delayacct_blkio_end(p);
			atomic_dec(&task_rq(p)->nr_iowait);
		}

		wake_flags |= WF_MIGRATED;
		psi_ttwu_dequeue(p);
		set_task_cpu(p, cpu);
	}

}

p->wake_cpu告诉调度器,希望任务醒来时"应该尽量考虑这个 CPU",但是否真的用这个CPU,由调度器根据自身策略(NUMA、能耗、负载、绑定关系等)决定。细节参考各调度类的选核函数:select_task_rq_fair、select_task_rq_scx、select_task_rq_idle等等。

p->wake_cpu在提交ac66f5477239ebd3c4e2cbf2f591ef387aa09884中引入,用于处理numa balance迁移task时的一种情况:如果task不在运行队列中(!p->on_rq),则不立即迁移,而是把迁移延迟到唤醒时刻。这样既保证了迁移交换操作的原子性和一致性,又避免了复杂的锁竞争和状态管理。wake_cpu字段记录该任务应该被迁移到的目标CPU,try_to_wake_up --> select_task_rq会优先考虑选择该cpu。

task_numa_migrate --> migrate_swap :

复制代码
int migrate_swap(struct task_struct *cur, struct task_struct *p,
		int target_cpu, int curr_cpu)
{

	arg = (struct migration_swap_arg){
		.src_task = cur,
		.src_cpu = curr_cpu,
		.dst_task = p,
		.dst_cpu = target_cpu,
	};

	ret = stop_two_cpus(arg.dst_cpu, arg.src_cpu, migrate_swap_stop, &arg);
}

static int migrate_swap_stop(void *data)
{
	struct migration_swap_arg *arg = data; <== migrate_swap -> stop_two_cpus的arg参数

	__migrate_swap_task(arg->src_task, arg->dst_cpu);
	__migrate_swap_task(arg->dst_task, arg->src_cpu);

	return 0;
}

static void __migrate_swap_task(struct task_struct *p, int cpu)
{
	if (task_on_rq_queued(p)) {
		......

	} else {
		/*
		 * Task isn't running anymore; make it appear like we migrated
		 * it before it went to sleep. This means on wakeup we make the
		 * previous CPU our target instead of where it really is.
		 */
		p->wake_cpu = cpu; <== task不在运行队列中,设置wake_cpu,try_to_wake_up优先选择该cpu
	}
}

四、p->wake_cpu在sched ext中的使用

select_task_rq_scx代码:

cpp 复制代码
static int select_task_rq_scx(struct task_struct *p, int prev_cpu, int wake_flags)
{
    /*
	 * sched_exec() calls with %WF_EXEC when @p is about to exec(2) as it
	 * can be a good migration opportunity with low cache and memory
	 * footprint. Returning a CPU different than @prev_cpu triggers
	 * immediate rq migration. However, for SCX, as the current rq
	 * association doesn't dictate where the task is going to run, this
	 * doesn't fit well. If necessary, we can later add a dedicated method
	 * which can decide to preempt self to force it through the regular
	 * scheduling path.
	 */

	if (unlikely(wake_flags & WF_EXEC)) <== 注3
		return prev_cpu;

	if (SCX_HAS_OP(select_cpu) && !rq_bypass) {
		cpu = SCX_CALL_OP_TASK_RET(SCX_KF_ENQUEUE | SCX_KF_SELECT_CPU,
					   select_cpu, NULL, p, prev_cpu, wake_flags); <== 注4
		p->scx.selected_cpu = cpu;
		*ddsp_taskp = NULL;
		if (ops_cpu_valid(cpu, "from ops.select_cpu()"))
			return cpu;
		else
			return prev_cpu;
	} else {

		cpu = scx_select_cpu_dfl(p, prev_cpu, wake_flags, 0);

		return cpu;
	}
}

注3:kernel_execve -> bprm_execve -> sched_exec -> p->sched_class->select_task_rq(p, task_cpu(p), WF_EXEC),传递的是WF_EXEC标记,select_task_rq_scx检测到该标记,会直接返回prev_cpu。根据注释,exec时,如果select_task_rq返回一个不同于当前cpu的cpu,会立即迁移task到新cpu上(参考第二节);但对于sched ext调度器来说,返回的cpu并不代表task就会在该cpu上运行,dispatch阶段可以将task放到任何cpu上执行,所以立即迁过去没有意义。

注4:执行sched ext的select_cpu回调。

下面代码中的prev_cpu就是p->wake_cpu。

cpp 复制代码
SCX_CALL_OP_TASK_RET(SCX_KF_ENQUEUE | SCX_KF_SELECT_CPU,
        select_cpu, NULL, p, prev_cpu, wake_flags)

try_to_wake_up --> select_task_rq(p, p->wake_cpu, &wake_flags)

所以怎么使用p->wake_cpu是由用户实现的select_cpu回调决定的。

五、select_cpu的注释说明

cpp 复制代码
struct sched_ext_ops {
	/**
	 * @select_cpu: Pick the target CPU for a task which is being woken up
	 * @p: task being woken up
	 * @prev_cpu: the cpu @p was on before sleeping
	 * @wake_flags: SCX_WAKE_*
	 *
	 * Decision made here isn't final. @p may be moved to any CPU while it
	 * is getting dispatched for execution later. However, as @p is not on
	 * the rq at this point, getting the eventual execution CPU right here
	 * saves a small bit of overhead down the line. 
        <== 注5
	 *
	 * If an idle CPU is returned, the CPU is kicked and will try to
	 * dispatch. While an explicit custom mechanism can be added,
	 * select_cpu() serves as the default way to wake up idle CPUs. 
       <== 注6
	 *
	 * @p may be inserted into a DSQ directly by calling
	 * scx_bpf_dsq_insert(). If so, the ops.enqueue() will be skipped.
	 * Directly inserting into %SCX_DSQ_LOCAL will put @p in the local DSQ
	 * of the CPU returned by this operation.
        <== 注7
	 *
	 * Note that select_cpu() is never called for tasks that can only run
	 * on a single CPU or tasks with migration disabled, as they don't have
	 * the option to select a different CPU. See select_task_rq() for
	 * details.
        <== 注8
	 */
	s32 (*select_cpu)(struct task_struct *p, s32 prev_cpu, u64 wake_flags);

注5:当任务被唤醒时,select_cpu为即将唤醒的任务选择目标cpu,但task最终不一定运行在该cpu,因为dispatch可将task放到任意cpu上执行。

注6:内核默认行为是,如果返回的cpu是idle的,sched ext会立即kick这个cpu,让cpu退出idle状态,尝试从其本地或全局运行队列中取出任务执行。select_cpu是唤醒idle cpu的默认方式:只要select_cpu回调返回一个idle cpu,内核就会自动处理唤醒逻辑。

用户也可以在eBPF程序中(比如enqueue回调中),调用scx_bpf_kick_cpu主动kick cpu。

注7:如果用户在select_cpu回调中调用scx_bpf_dsq_insert将task直接加入dsq,那么enqueue回调就不会再执行了。如果直接插入的是SCX_DSQ_LOCAL,task会被放入select_cpu所返回的那个cpu的local DSQ中。

注8:如果task的cpumask只允许它运行在一个cpu上,或者task被禁止迁移(migration disabled),select_cpu不会被执行。因为这个task没有机会运行在其他cpu上,所以就不需要select_cpu了,细节参考select_task_rq。

相关推荐
代码游侠2 小时前
C语言核心概念复习——网络协议与TCP/IP
linux·运维·服务器·网络·算法
你真是饿了2 小时前
6.库制作与原理
linux·服务器
Zach_yuan3 小时前
深入浅出 JSONCpp
linux·服务器·网络·c++
北京迅为4 小时前
《【北京迅为】itop-3568开发板NPU使用手册》- 第 7章 使用RKNN-Toolkit-lite2
linux·人工智能·嵌入式·npu
Dragon~Snow4 小时前
Linux Centos9 安装 Elasticsearch
linux·elasticsearch·jenkins
熊延4 小时前
麒麟V10系统安装部署elasticsearch
linux·运维·服务器·elasticsearch·搜索引擎·全文检索
Jia ming4 小时前
跟踪器与事件使用举例
linux·事件·跟踪器
生活很暖很治愈5 小时前
Linux——基础IO&软硬链接
linux·ubuntu
2401_858936885 小时前
【Linux C 编程】标准 IO 详解与实战:从基础接口到文件操作实战
linux·c语言