什么时候触发负载均衡(kernel 6.12)

一、 负载均衡只有在 sched_tick 时才会发生吗?

答案:绝对不是!Tick 触发的仅仅是"兜底"机制,真正的负载均衡在日常调度中无处不在。

如果只靠 Tick(比如每 4ms 一次)来做负载均衡,系统的响应延迟会非常高。Linux 内核公平调度器(6.12 中已基于 EEVDF,负载均衡代码仍在 fair.c 中)主要有三大类负载均衡场景:

1. 周期性负载均衡 (Periodic Load Balance)
(以下为 C 源码节选)
/*
6203   * This function gets called by the timer code, with HZ frequency.
6204   * We call it with interrupts disabled.
6205   */
6206  void sched_tick(void)
6207  {
6208  	int cpu = smp_processor_id();
6209  	struct rq *rq = cpu_rq(cpu);
6210  	/* accounting goes to the donor task */
6211  	struct task_struct *donor;
6212  	struct rq_flags rf;
6213  	unsigned long hw_pressure;
6214  	u64 resched_latency;
6215  
6216  	if (housekeeping_cpu(cpu, HK_TYPE_TICK))
6217  		arch_scale_freq_tick();
6218  
6219  	sched_clock_tick();
6220  
6221  	rq_lock(rq, &rf);
6222  	donor = rq->donor;
6223  
6224  	psi_account_irqtime(rq, donor, NULL);
6225  
6226  	update_rq_clock(rq);
6227  	trace_android_rvh_tick_entry(rq);
6228  	hw_pressure = arch_scale_hw_pressure(cpu_of(rq));
6229  	update_hw_load_avg(rq_clock_task(rq), rq, hw_pressure);
6230  	donor->sched_class->task_tick(rq, donor, 0);	/* per-sched-class periodic tick hook */
6231  	if (sched_feat(LATENCY_WARN))
6232  		resched_latency = cpu_resched_latency(rq);
6233  	calc_global_load_tick(rq);
6234  	sched_core_tick(rq);
6235  	task_tick_mm_cid(rq, donor);
6236  	scx_tick(rq);
6237  
6238  	rq_unlock(rq, &rf);
6239  
6240  	if (sched_feat(LATENCY_WARN) && resched_latency)
6241  		resched_latency_warn(cpu, resched_latency);
6242  
6243  	perf_event_task_tick();
6244  
6245  	if (donor->flags & PF_WQ_WORKER)
6246  		wq_worker_tick(donor);
6247  
6248  #ifdef CONFIG_SMP
6249  	if (!scx_switched_all()) {
6250  		rq->idle_balance = idle_cpu(cpu);	/* record whether this CPU ticked while idle */
6251  		sched_balance_trigger(rq);		/* periodic load-balance entry point */
6252  	}
6253  #endif
6254  
6255  	trace_android_vh_scheduler_tick(rq);
6256  }
6257  



/*
13238   * Trigger the SCHED_SOFTIRQ if it is time to do periodic load balancing.
13239   */
13240  void sched_balance_trigger(struct rq *rq)
13241  {
13242  	/*
13243  	 * Don't need to rebalance while attached to NULL domain or
13244  	 * runqueue CPU is not active
13245  	 */
13246  	if (unlikely(on_null_domain(rq) || !cpu_active(cpu_of(rq))))
13247  		return;
13248  
13249  	if (time_after_eq(jiffies, rq->next_balance))	/* periodic balance is due on this CPU */
13250  		raise_softirq(SCHED_SOFTIRQ);
13251  
13252  	nohz_balancer_kick(rq);	/* maybe ask an idle CPU to balance on behalf of NOHZ-idle CPUs */
13253  }


/*
12625   * Current decision point for kicking the idle load balancer in the presence
12626   * of idle CPUs in the system.
12627   */
12628  static void nohz_balancer_kick(struct rq *rq)
12629  {
12630  	unsigned long now = jiffies;
12631  	struct sched_domain_shared *sds;
12632  	struct sched_domain *sd;
12633  	int nr_busy, i, cpu = rq->cpu;
12634  	unsigned int flags = 0;
12635  	int done = 0;
12636  
12637  	if (unlikely(rq->idle_balance))	/* an idle CPU does not kick the ILB */
12638  		return;
12639  
12640  	/*
12641  	 * We may be recently in ticked or tickless idle mode. At the first
12642  	 * busy tick after returning from idle, we will update the busy stats.
12643  	 */
12644  	nohz_balance_exit_idle(rq);
12645  
12646  	/*
12647  	 * None are in tickless mode and hence no need for NOHZ idle load
12648  	 * balancing:
12649  	 */
12650  	if (likely(!atomic_read(&nohz.nr_cpus)))
12651  		return;
12652  
12653  	if (READ_ONCE(nohz.has_blocked) &&
12654  	    time_after(now, READ_ONCE(nohz.next_blocked)))
12655  		flags = NOHZ_STATS_KICK;
12656  
12657  	if (time_before(now, nohz.next_balance))
12658  		goto out;
12659  
12660  	trace_android_rvh_sched_nohz_balancer_kick(rq, &flags, &done);
12661  	if (done)
12662  		goto out;
12663  
12664  	if (rq->nr_running >= 2) {	/* more than one runnable task: this CPU is overloaded */
12665  		flags = NOHZ_STATS_KICK | NOHZ_BALANCE_KICK;
12666  		goto out;
12667  	}
12668  
12669  	rcu_read_lock();
12670  
12671  	sd = rcu_dereference(rq->sd);
12672  	if (sd) {
12673  		/*
12674  		 * If there's a runnable CFS task and the current CPU has reduced
12675  		 * capacity, kick the ILB to see if there's a better CPU to run on:
12676  		 */
12677  		if (rq->cfs.h_nr_running >= 1 && check_cpu_capacity(rq, sd)) {
12678  			flags = NOHZ_STATS_KICK | NOHZ_BALANCE_KICK;
12679  			goto unlock;
12680  		}
12681  	}
12682  
12683  	sd = rcu_dereference(per_cpu(sd_asym_packing, cpu));
12684  	if (sd) {
12685  		/*
12686  		 * When ASYM_PACKING; see if there's a more preferred CPU
12687  		 * currently idle; in which case, kick the ILB to move tasks
12688  		 * around.
12689  		 *
12690  		 * When balancing between cores, all the SMT siblings of the
12691  		 * preferred CPU must be idle.
12692  		 */
12693  		for_each_cpu_and(i, sched_domain_span(sd), nohz.idle_cpus_mask) {
12694  			if (sched_asym(sd, i, cpu)) {
12695  				flags = NOHZ_STATS_KICK | NOHZ_BALANCE_KICK;
12696  				goto unlock;
12697  			}
12698  		}
12699  	}
12700  
12701  	sd = rcu_dereference(per_cpu(sd_asym_cpucapacity, cpu));
12702  	if (sd) {
12703  		/*
12704  		 * When ASYM_CPUCAPACITY; see if there's a higher capacity CPU
12705  		 * to run the misfit task on.
12706  		 */
12707  		if (check_misfit_status(rq)) {
12708  			flags = NOHZ_STATS_KICK | NOHZ_BALANCE_KICK;
12709  			goto unlock;
12710  		}
12711  
12712  		/*
12713  		 * For asymmetric systems, we do not want to nicely balance
12714  		 * cache use, instead we want to embrace asymmetry and only
12715  		 * ensure tasks have enough CPU capacity.
12716  		 *
12717  		 * Skip the LLC logic because it's not relevant in that case.
12718  		 */
12719  		goto unlock;
12720  	}
12721  
12722  	sds = rcu_dereference(per_cpu(sd_llc_shared, cpu));
12723  	if (sds) {
12724  		/*
12725  		 * If there is an imbalance between LLC domains (IOW we could
12726  		 * increase the overall cache utilization), we need a less-loaded LLC
12727  		 * domain to pull some load from. Likewise, we may need to spread
12728  		 * load within the current LLC domain (e.g. packed SMT cores but
12729  		 * other CPUs are idle). We can't really know from here how busy
12730  		 * the others are - so just get a NOHZ balance going if it looks
12731  		 * like this LLC domain has tasks we could move.
12732  		 */
12733  		nr_busy = atomic_read(&sds->nr_busy_cpus);
12734  		if (nr_busy > 1) {
12735  			flags = NOHZ_STATS_KICK | NOHZ_BALANCE_KICK;
12736  			goto unlock;
12737  		}
12738  	}
12739  unlock:
12740  	rcu_read_unlock();
12741  out:
12742  	if (READ_ONCE(nohz.needs_update))
12743  		flags |= NOHZ_NEXT_KICK;
12744  
12745  	if (flags)
12746  		kick_ilb(flags);	/* deliver the kick to an idle CPU to run the balance */
12747  }
12748  
12748  

这就是你贴出的代码。它由 Tick 驱动,属于一种"亡羊补牢"的兜底策略。用于纠正系统长时间运行后产生的不平衡(比如 CPU 0 上的几个任务突然变成了死循环,而 CPU 1 上的任务全睡着了)。

2. 空闲负载均衡 (New Idle Load Balance) ------ 【极其主动,极度硬核】
(以下为 C 源码节选)
/*
7595   * __schedule() is the main scheduler function.
7596   *
7597   * The main means of driving the scheduler and thus entering this function are:
7598   *
7599   *   1. Explicit blocking: mutex, semaphore, waitqueue, etc.
7600   *
7601   *   2. TIF_NEED_RESCHED flag is checked on interrupt and userspace return
7602   *      paths. For example, see arch/x86/entry_64.S.
7603   *
7604   *      To drive preemption between tasks, the scheduler sets the flag in timer
7605   *      interrupt handler sched_tick().
7606   *
7607   *   3. Wakeups don't really cause entry into schedule(). They add a
7608   *      task to the run-queue and that's it.
7609   *
7610   *      Now, if the new task added to the run-queue preempts the current
7611   *      task, then the wakeup sets TIF_NEED_RESCHED and schedule() gets
7612   *      called on the nearest possible occasion:
7613   *
7614   *       - If the kernel is preemptible (CONFIG_PREEMPTION=y):
7615   *
7616   *         - in syscall or exception context, at the next outmost
7617   *           preempt_enable(). (this might be as soon as the wake_up()'s
7618   *           spin_unlock()!)
7619   *
7620   *         - in IRQ context, return from interrupt-handler to
7621   *           preemptible context
7622   *
7623   *       - If the kernel is not preemptible (CONFIG_PREEMPTION is not set)
7624   *         then at the next:
7625   *
7626   *          - cond_resched() call
7627   *          - explicit schedule() call
7628   *          - return from syscall or exception to user-space
7629   *          - return from interrupt-handler to user-space
7630   *
7631   * WARNING: must be called with preemption disabled!
7632   */
7633  static void __sched notrace __schedule(int sched_mode)
7634  {
7635  	struct task_struct *prev, *next;
7636  	/*
7637  	 * On PREEMPT_RT kernel, SM_RTLOCK_WAIT is noted
7638  	 * as a preemption by schedule_debug() and RCU.
7639  	 */
7640  	bool preempt = sched_mode > SM_NONE;
7641  	bool block = false;
7642  	unsigned long *switch_count;
7643  	unsigned long prev_state;
7644  	struct rq_flags rf;
7645  	struct rq *rq;
7646  	bool prev_not_proxied;
7647  	int cpu;
7648  
7649  	cpu = smp_processor_id();
7650  	rq = cpu_rq(cpu);
7651  	prev = rq->curr;
7652  
7653  	schedule_debug(prev, preempt);
7654  
7655  	if (sched_feat(HRTICK) || sched_feat(HRTICK_DL))
7656  		hrtick_clear(rq);
7657  
7658  	local_irq_disable();
7659  	rcu_note_context_switch(preempt);
7660  
7661  	/*
7662  	 * Make sure that signal_pending_state()->signal_pending() below
7663  	 * can't be reordered with __set_current_state(TASK_INTERRUPTIBLE)
7664  	 * done by the caller to avoid the race with signal_wake_up():
7665  	 *
7666  	 * __set_current_state(@state)		signal_wake_up()
7667  	 * schedule()				  set_tsk_thread_flag(p, TIF_SIGPENDING)
7668  	 *					  wake_up_state(p, state)
7669  	 *   LOCK rq->lock			    LOCK p->pi_state
7670  	 *   smp_mb__after_spinlock()		    smp_mb__after_spinlock()
7671  	 *     if (signal_pending_state())	    if (p->state & @state)
7672  	 *
7673  	 * Also, the membarrier system call requires a full memory barrier
7674  	 * after coming from user-space, before storing to rq->curr; this
7675  	 * barrier matches a full barrier in the proximity of the membarrier
7676  	 * system call exit.
7677  	 */
7678  	rq_lock(rq, &rf);
7679  	smp_mb__after_spinlock();
7680  
7681  	/* Promote REQ to ACT */
7682  	rq->clock_update_flags <<= 1;
7683  	update_rq_clock(rq);
7684  	rq->clock_update_flags = RQCF_UPDATED;
7685  
7686  	switch_count = &prev->nivcsw;
7687  
7688  	/* Task state changes only considers SM_PREEMPT as preemption */
7689  	preempt = sched_mode == SM_PREEMPT;
7690  
7691  	/*
7692  	 * We must load prev->state once (task_struct::state is volatile), such
7693  	 * that we form a control dependency vs deactivate_task() below.
7694  	 */
7695  	prev_state = READ_ONCE(prev->__state);
7696  	if (sched_mode == SM_IDLE) {
7697  		/* SCX must consult the BPF scheduler to tell if rq is empty */
7698  		if (!rq->nr_running && !scx_enabled()) {
7699  			next = prev;	/* rq empty: keep running the current (idle) task */
7700  			goto picked;
7701  		}
7702  	} else if (!preempt && prev_state) {
7703  		/*
7704  		 * We pass task_is_blocked() as the should_block arg
7705  		 * in order to keep mutex-blocked tasks on the runqueue
7706  		 * for selection with proxy-exec (without proxy-exec
7707  		 * task_is_blocked() will always be false).
7708  		 */
7709  		block = try_to_block_task(rq, prev, &prev_state,
7710  					  !task_is_blocked(prev));
7711  		switch_count = &prev->nvcsw;
7712  	}
7713  
7714  	prev_not_proxied = !prev->blocked_donor;
7715  
7716  	trace_sched_start_task_selection(prev, cpu, task_is_blocked(prev));
7717  pick_again:
7718  	next = pick_next_task(rq, rq->donor, &rf);   --》 cfs执行pick_next_task_fair
7719  	rq_set_donor(rq, next);
7720  	next->blocked_donor = NULL;
7721  	if (unlikely(task_is_blocked(next))) {
7722  		next = find_proxy_task(rq, next, &rf);
7723  		if (!next)
7724  			goto pick_again;
7725  		if (next == rq->idle)
7726  			goto keep_resched;
7727  	}
7728  	trace_sched_finish_task_selection(rq->donor, next, cpu);
7729  picked:
7730  	clear_tsk_need_resched(prev);
7731  	clear_preempt_need_resched();
7732  keep_resched:
7733  #ifdef CONFIG_SCHED_DEBUG
7734  	rq->last_seen_need_resched_ns = 0;
7735  #endif
7736  
7737  	trace_android_rvh_schedule(prev, next, rq);
7738  	if (likely(prev != next)) {
7739  		rq->nr_switches++;
7740  		/*
7741  		 * RCU users of rcu_dereference(rq->curr) may not see
7742  		 * changes to task_struct made by pick_next_task().
7743  		 */
7744  		RCU_INIT_POINTER(rq->curr, next);
7745  
7746  		if (!task_current_donor(rq, next))
7747  			proxy_tag_curr(rq, next);
7748  
7749  		/*
7750  		 * The membarrier system call requires each architecture
7751  		 * to have a full memory barrier after updating
7752  		 * rq->curr, before returning to user-space.
7753  		 *
7754  		 * Here are the schemes providing that barrier on the
7755  		 * various architectures:
7756  		 * - mm ? switch_mm() : mmdrop() for x86, s390, sparc, PowerPC,
7757  		 *   RISC-V.  switch_mm() relies on membarrier_arch_switch_mm()
7758  		 *   on PowerPC and on RISC-V.
7759  		 * - finish_lock_switch() for weakly-ordered
7760  		 *   architectures where spin_unlock is a full barrier,
7761  		 * - switch_to() for arm64 (weakly-ordered, spin_unlock
7762  		 *   is a RELEASE barrier),
7763  		 *
7764  		 * The barrier matches a full barrier in the proximity of
7765  		 * the membarrier system call entry.
7766  		 *
7767  		 * On RISC-V, this barrier pairing is also needed for the
7768  		 * SYNC_CORE command when switching between processes, cf.
7769  		 * the inline comments in membarrier_arch_switch_mm().
7770  		 */
7771  		++*switch_count;
7772  
7773  		migrate_disable_switch(rq, prev);
7774  		psi_account_irqtime(rq, prev, next);
7775  		psi_sched_switch(prev, next, !task_on_rq_queued(prev) ||
7776  					     prev->se.sched_delayed);
7777  
7778  		trace_sched_switch(preempt, prev, next, prev_state);
7779  
7780  		if (block && (prev_state & TASK_UNINTERRUPTIBLE)
7781  			&& trace_sched_blocked_reason_enabled()) {
7782  			unsigned long blocked_func = 0;
7783  
7784  #ifdef CONFIG_STACKTRACE
7785  			stack_trace_save_tsk(prev, &blocked_func, 1, 0);
7786  #endif
7787  			trace_sched_blocked_reason(prev, (void *)blocked_func);
7788  		}
7789  
7790  		/* Also unlocks the rq: */
7791  		rq = context_switch(rq, prev, next, &rf);
7792  	} else {
7793  		/* In case next was already curr but just got blocked_donor */
7794  		if (prev_not_proxied && next->blocked_donor)
7795  			proxy_tag_curr(rq, next);
7796  
7797  		rq_unpin_lock(rq, &rf);
7798  		__balance_callbacks(rq);
7799  		raw_spin_rq_unlock_irq(rq);
7800  	}
7801  }
7802  
7802  


struct task_struct *
9113  pick_next_task_fair(struct rq *rq, struct task_struct *prev, struct rq_flags *rf)
9114  {
9115  	struct sched_entity *se;
9116  	struct task_struct *p = NULL;
9117  	int new_tasks;
9118  
9119  again:
9120  	trace_android_rvh_before_pick_task_fair(rq, &p, prev, rf);
9121  	if (!p) {
9122  		p = pick_task_fair(rq);	/* pick the best fair-class task from this rq */
9123  		trace_android_rvh_replace_next_task_fair(rq, &p, prev);
9124  	}
9125  
9126  	if (!p)
9127  		goto idle;
9128  	se = &p->se;
9129  
9130  #ifdef CONFIG_FAIR_GROUP_SCHED
9131  	if (prev->sched_class != &fair_sched_class ||
9132  	    rq->curr != rq->donor)
9133  		goto simple;
9134  
9135  	__put_prev_set_next_dl_server(rq, prev, p);
9136  
9137  	/*
9138  	 * Because of the set_next_buddy() in dequeue_task_fair() it is rather
9139  	 * likely that a next task is from the same cgroup as the current.
9140  	 *
9141  	 * Therefore attempt to avoid putting and setting the entire cgroup
9142  	 * hierarchy, only change the part that actually changes.
9143  	 *
9144  	 * Since we haven't yet done put_prev_entity and if the selected task
9145  	 * is a different task than we started out with, try and touch the
9146  	 * least amount of cfs_rqs.
9147  	 */
9148  	if (prev != p) {
9149  		struct sched_entity *pse = &prev->se;
9150  		struct cfs_rq *cfs_rq;
9151  
9152  		while (!(cfs_rq = is_same_group(se, pse))) {
9153  			int se_depth = se->depth;
9154  			int pse_depth = pse->depth;
9155  
9156  			if (se_depth <= pse_depth) {
9157  				put_prev_entity(cfs_rq_of(pse), pse);
9158  				pse = parent_entity(pse);
9159  			}
9160  			if (se_depth >= pse_depth) {
9161  				set_next_entity(cfs_rq_of(se), se);
9162  				se = parent_entity(se);
9163  			}
9164  		}
9165  
9166  		put_prev_entity(cfs_rq, pse);
9167  		set_next_entity(cfs_rq, se);
9168  
9169  		__set_next_task_fair(rq, p, true);
9170  	}
9171  
9172  	return p;
9173  
9174  simple:
9175  #endif
9176  	put_prev_set_next_task(rq, prev, p);
9177  	return p;
9178  
9179  idle:
9180  	if (!rf)	/* without rf we cannot release rq->lock, so skip newidle balance */
9181  		return NULL;
9182  
9183  	new_tasks = sched_balance_newidle(rq, rf);   --> 这里会拉取其它task
9184  
9185  	/*
9186  	 * Because sched_balance_newidle() releases (and re-acquires) rq->lock, it is
9187  	 * possible for any higher priority task to appear. In that case we
9188  	 * must re-start the pick_next_entity() loop.
9189  	 */
9190  	if (new_tasks < 0)
9191  		return RETRY_TASK;
9192  
9193  	if (new_tasks > 0)
9194  		goto again;
9195  
9196  	/*
9197  	 * rq is about to be idle, check if we need to update the
9198  	 * lost_idle_time of clock_pelt
9199  	 */
9200  	update_idle_rq_clock_pelt(rq);
9201  
9202  	return NULL;
9203  }

假设 CPU 0 上的任务刚刚执行完毕,或者主动睡眠了。此时 CPU 0 的可运行队列空了,它马上就要进入 Idle(发呆)状态了。 内核会想:"与其让你闲着,不如你去看看别人忙不忙?"

  • 调用时机 :在 __schedule() 函数中,当调度器发现下一个要运行的任务是 idle 线程时,会立刻调用 sched_balance_newidle()

  • 行为:CPU 0 会赶在自己彻底睡死之前,主动去扒拉其他忙碌 CPU(比如 CPU 1)的队列,强行把任务"偷"过来自己运行。这大大降低了任务的排队延迟。

3. 唤醒/新建负载均衡 (Wakeup/Exec Load Balance) ------ 【防患于未然】

假设进程 A 在 CPU 0 上调用了 fork() 创建了进程 B,或者进程 B 一直在睡眠,现在被硬件中断唤醒了。 内核会想:"我该把进程 B 放在哪个 CPU 上最合适呢?"

  • 调用时机 :在 try_to_wake_up()(唤醒进程)或系统调用 execve() 时,内核会调用 select_task_rq_fair()

  • 行为 :内核会在将进程 B 放入就绪队列之前 ,纵观全局,计算各个 CPU 的负载(利用前面提到的 util_avg 等数据)。如果发现 CPU 0 很忙,而 CPU 2 很闲,内核会直接把进程 B 唤醒到 CPU 2 的队列里。这就是一种"主动分配"的负载均衡。

总结

你看到的 sched_tick 里的代码,只是 Linux 复杂负载均衡宏大版图的三分之一。

  • 唤醒负载均衡 是"新生儿分班",尽量一开始就分均匀。

  • 空闲负载均衡 是"主动抢活干",绝不让自己闲着。

  • 周期性负载均衡(Tick) 则是"定期大盘点",防止出现长期积劳成疾的极端不平衡。这三者结合,才造就了 Linux 在多核时代强悍的调度性能。

相关推荐
kainx2 小时前
Linux编译eeprom
linux·运维·c语言·eeprom
攻城狮在此2 小时前
MobaXterm下载安装及SSH远程连接(交换机/路由器/服务器)
linux·运维·服务器·网络
花间相见2 小时前
【Agent开发】—— ToolCall 、 FunctionCall 底层原理与极简实现
运维·服务器
mounter6253 小时前
【LSF/MM内核前沿】Linux 内存回收推倒重来?解析 MGLRU 与传统 LRU 的“统一之战”
linux·运维·服务器·网络·内核·内存回收
Exquisite.3 小时前
k8s的Pod管理
linux·运维·服务器
IMPYLH3 小时前
Linux 的 env 命令
linux·运维·服务器·数据库
牛奶咖啡133 小时前
DevOps自动化运维实践_搭建UEFI网络引导的自动安装Debian系统
运维·自动化·devops·uefi·pxe·debian自动应答文件·debian网络自动化安装系统
拾贰_C3 小时前
【Ubuntu | Nvidia 】nvidia 驱动安装
linux·运维·ubuntu
zzzsde3 小时前
【Linux】EXT文件系统(2)
linux·运维·服务器