什么时候触发负载均衡(kernel 6.12)

一、 负载均衡只有在 sched_tick 时才会发生吗?

答案:绝对不是!Tick 触发的仅仅是"兜底"机制,真正的负载均衡在日常调度中无处不在。

如果只靠 Tick(比如 HZ=250 时每 4ms 一次)来做负载均衡,系统的响应延迟会非常高。Linux 内核的公平调度类(习惯上仍称 CFS)主要有三大类负载均衡场景:

1. 周期性负载均衡 (Periodic Load Balance)
cs 复制代码
/*
6203   * This function gets called by the timer code, with HZ frequency.
6204   * We call it with interrupts disabled.
       *
       * Per-tick work for the local CPU. With the runqueue lock held it
       * refreshes the rq clock and the hardware-pressure load average, forwards
       * the tick to the donor task's scheduling class and runs the remaining
       * per-rq tick hooks. After the lock is dropped it handles the optional
       * latency warning, the perf and workqueue ticks and, on SMP, calls
       * sched_balance_trigger().
6205   */
6206  void sched_tick(void)
6207  {
6208  	int cpu = smp_processor_id();
6209  	struct rq *rq = cpu_rq(cpu);
6210  	/* accounting goes to the donor task */
6211  	struct task_struct *donor;
6212  	struct rq_flags rf;
6213  	unsigned long hw_pressure;
6214  	u64 resched_latency;
6215  
6216  	if (housekeeping_cpu(cpu, HK_TYPE_TICK))
6217  		arch_scale_freq_tick();
6218  
6219  	sched_clock_tick();
6220  
6221  	rq_lock(rq, &rf);
6222  	donor = rq->donor;
6223  
6224  	psi_account_irqtime(rq, donor, NULL);
6225  
6226  	update_rq_clock(rq);
6227  	trace_android_rvh_tick_entry(rq);
6228  	hw_pressure = arch_scale_hw_pressure(cpu_of(rq));
6229  	update_hw_load_avg(rq_clock_task(rq), rq, hw_pressure);
      	/* Class-specific tick handling, dispatched through the donor's class. */
6230  	donor->sched_class->task_tick(rq, donor, 0);
      	/*
      	 * resched_latency is assigned only when LATENCY_WARN is enabled; the
      	 * read after rq_unlock() is guarded by the same feature check.
      	 */
6231  	if (sched_feat(LATENCY_WARN))
6232  		resched_latency = cpu_resched_latency(rq);
6233  	calc_global_load_tick(rq);
6234  	sched_core_tick(rq);
6235  	task_tick_mm_cid(rq, donor);
6236  	scx_tick(rq);
6237  
6238  	rq_unlock(rq, &rf);
6239  
6240  	if (sched_feat(LATENCY_WARN) && resched_latency)
6241  		resched_latency_warn(cpu, resched_latency);
6242  
6243  	perf_event_task_tick();
6244  
6245  	if (donor->flags & PF_WQ_WORKER)
6246  		wq_worker_tick(donor);
6247  
6248  #ifdef CONFIG_SMP
      	/*
      	 * Periodic load-balance entry point, skipped when scx_switched_all()
      	 * is true. rq->idle_balance records whether this CPU is idle at this
      	 * tick; nohz_balancer_kick() reads it to bail out early.
      	 */
6249  	if (!scx_switched_all()) {
6250  		rq->idle_balance = idle_cpu(cpu);
6251  		sched_balance_trigger(rq);
6252  	}
6253  #endif
6254  
6255  	trace_android_vh_scheduler_tick(rq);
6256  }
6257  



/*
13238   * Trigger the SCHED_SOFTIRQ if it is time to do periodic load balancing.
        *
        * @rq: runqueue of the CPU taking the tick (sched_tick() passes its own).
        *
        * No balancing is performed in this function itself: it only raises the
        * softirq and then lets nohz_balancer_kick() decide about idle CPUs.
13239   */
13240  void sched_balance_trigger(struct rq *rq)
13241  {
13242  	/*
13243  	 * Don't need to rebalance while attached to NULL domain or
13244  	 * runqueue CPU is not active
13245  	 */
13246  	if (unlikely(on_null_domain(rq) || !cpu_active(cpu_of(rq))))
13247  		return;
13248  
       	/* Rate limit: the softirq is raised only once jiffies reaches rq->next_balance. */
13249  	if (time_after_eq(jiffies, rq->next_balance))
13250  		raise_softirq(SCHED_SOFTIRQ);
13251  
       	/* Called on every tick that gets past the check above, softirq raised or not. */
13252  	nohz_balancer_kick(rq);
13253  }


/*
12625   * Current decision point for kicking the idle load balancer in the presence
12626   * of idle CPUs in the system.
        *
        * @rq: runqueue of the CPU taking the tick.
        *
        * Accumulates NOHZ_* bits in a local flags word while walking a series of
        * checks (rq->nr_running, reduced capacity, asym packing, misfit status,
        * busy CPUs in the LLC) and calls kick_ilb() at the end if any bit is set.
12627   */
12628  static void nohz_balancer_kick(struct rq *rq)
12629  {
12630  	unsigned long now = jiffies;
12631  	struct sched_domain_shared *sds;
12632  	struct sched_domain *sd;
12633  	int nr_busy, i, cpu = rq->cpu;
12634  	unsigned int flags = 0;
12635  	int done = 0;
12636  
       	/* rq->idle_balance is set by sched_tick() from idle_cpu(): return early if set. */
12637  	if (unlikely(rq->idle_balance))
12638  		return;
12639  
12640  	/*
12641  	 * We may be recently in ticked or tickless idle mode. At the first
12642  	 * busy tick after returning from idle, we will update the busy stats.
12643  	 */
12644  	nohz_balance_exit_idle(rq);
12645  
12646  	/*
12647  	 * None are in tickless mode and hence no need for NOHZ idle load
12648  	 * balancing:
12649  	 */
12650  	if (likely(!atomic_read(&nohz.nr_cpus)))
12651  		return;
12652  
       	/* Ask for NOHZ_STATS_KICK when nohz.has_blocked is set and nohz.next_blocked has passed. */
12653  	if (READ_ONCE(nohz.has_blocked) &&
12654  	    time_after(now, READ_ONCE(nohz.next_blocked)))
12655  		flags = NOHZ_STATS_KICK;
12656  
       	/* Before nohz.next_balance: skip the checks below, keeping any flag set above. */
12657  	if (time_before(now, nohz.next_balance))
12658  		goto out;
12659  
       	/* The hook receives &flags and &done; a non-zero done skips the remaining checks. */
12660  	trace_android_rvh_sched_nohz_balancer_kick(rq, &flags, &done);
12661  	if (done)
12662  		goto out;
12663  
       	/* rq->nr_running >= 2: request both kicks without inspecting any domain. */
12664  	if (rq->nr_running >= 2) {
12665  		flags = NOHZ_STATS_KICK | NOHZ_BALANCE_KICK;
12666  		goto out;
12667  	}
12668  
12669  	rcu_read_lock();
12670  
12671  	sd = rcu_dereference(rq->sd);
12672  	if (sd) {
12673  		/*
12674  		 * If there's a runnable CFS task and the current CPU has reduced
12675  		 * capacity, kick the ILB to see if there's a better CPU to run on:
12676  		 */
12677  		if (rq->cfs.h_nr_running >= 1 && check_cpu_capacity(rq, sd)) {
12678  			flags = NOHZ_STATS_KICK | NOHZ_BALANCE_KICK;
12679  			goto unlock;
12680  		}
12681  	}
12682  
12683  	sd = rcu_dereference(per_cpu(sd_asym_packing, cpu));
12684  	if (sd) {
12685  		/*
12686  		 * When ASYM_PACKING; see if there's a more preferred CPU
12687  		 * currently idle; in which case, kick the ILB to move tasks
12688  		 * around.
12689  		 *
12690  		 * When balancing between cores, all the SMT siblings of the
12691  		 * preferred CPU must be idle.
12692  		 */
12693  		for_each_cpu_and(i, sched_domain_span(sd), nohz.idle_cpus_mask) {
12694  			if (sched_asym(sd, i, cpu)) {
12695  				flags = NOHZ_STATS_KICK | NOHZ_BALANCE_KICK;
12696  				goto unlock;
12697  			}
12698  		}
12699  	}
12700  
12701  	sd = rcu_dereference(per_cpu(sd_asym_cpucapacity, cpu));
12702  	if (sd) {
12703  		/*
12704  		 * When ASYM_CPUCAPACITY; see if there's a higher capacity CPU
12705  		 * to run the misfit task on.
12706  		 */
12707  		if (check_misfit_status(rq)) {
12708  			flags = NOHZ_STATS_KICK | NOHZ_BALANCE_KICK;
12709  			goto unlock;
12710  		}
12711  
12712  		/*
12713  		 * For asymmetric systems, we do not want to nicely balance
12714  		 * cache use, instead we want to embrace asymmetry and only
12715  		 * ensure tasks have enough CPU capacity.
12716  		 *
12717  		 * Skip the LLC logic because it's not relevant in that case.
12718  		 */
12719  		goto unlock;
12720  	}
12721  
12722  	sds = rcu_dereference(per_cpu(sd_llc_shared, cpu));
12723  	if (sds) {
12724  		/*
12725  		 * If there is an imbalance between LLC domains (IOW we could
12726  		 * increase the overall cache utilization), we need a less-loaded LLC
12727  		 * domain to pull some load from. Likewise, we may need to spread
12728  		 * load within the current LLC domain (e.g. packed SMT cores but
12729  		 * other CPUs are idle). We can't really know from here how busy
12730  		 * the others are - so just get a NOHZ balance going if it looks
12731  		 * like this LLC domain has tasks we could move.
12732  		 */
12733  		nr_busy = atomic_read(&sds->nr_busy_cpus);
12734  		if (nr_busy > 1) {
12735  			flags = NOHZ_STATS_KICK | NOHZ_BALANCE_KICK;
12736  			goto unlock;
12737  		}
12738  	}
       /* 'unlock' is the exit for paths that hold rcu_read_lock(); 'out' for those that never took it. */
12739  unlock:
12740  	rcu_read_unlock();
12741  out:
       	/* NOHZ_NEXT_KICK is OR-ed in on every path that reaches here when nohz.needs_update is set. */
12742  	if (READ_ONCE(nohz.needs_update))
12743  		flags |= NOHZ_NEXT_KICK;
12744  
12745  	if (flags)
12746  		kick_ilb(flags);
12747  }
12748  

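这就是你贴出的代码。它由 Tick 驱动,属于一种"亡羊补牢"的兜底策略。注意并不是每个 Tick 都会真正做一次均衡:sched_balance_trigger() 只有在 jiffies 到达 rq->next_balance 时才会触发 SCHED_SOFTIRQ 软中断,真正的均衡工作在软中断处理中完成。它用于纠正系统长时间运行后产生的不平衡(比如 CPU 0 上的几个任务突然变成了死循环,而 CPU 1 上的任务全睡着了)。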

2. 空闲负载均衡 (New Idle Load Balance) ------ 【极其主动,极度硬核】
cs 复制代码
/*
7595   * __schedule() is the main scheduler function.
7596   *
7597   * The main means of driving the scheduler and thus entering this function are:
7598   *
7599   *   1. Explicit blocking: mutex, semaphore, waitqueue, etc.
7600   *
7601   *   2. TIF_NEED_RESCHED flag is checked on interrupt and userspace return
7602   *      paths. For example, see arch/x86/entry_64.S.
7603   *
7604   *      To drive preemption between tasks, the scheduler sets the flag in timer
7605   *      interrupt handler sched_tick().
7606   *
7607   *   3. Wakeups don't really cause entry into schedule(). They add a
7608   *      task to the run-queue and that's it.
7609   *
7610   *      Now, if the new task added to the run-queue preempts the current
7611   *      task, then the wakeup sets TIF_NEED_RESCHED and schedule() gets
7612   *      called on the nearest possible occasion:
7613   *
7614   *       - If the kernel is preemptible (CONFIG_PREEMPTION=y):
7615   *
7616   *         - in syscall or exception context, at the next outmost
7617   *           preempt_enable(). (this might be as soon as the wake_up()'s
7618   *           spin_unlock()!)
7619   *
7620   *         - in IRQ context, return from interrupt-handler to
7621   *           preemptible context
7622   *
7623   *       - If the kernel is not preemptible (CONFIG_PREEMPTION is not set)
7624   *         then at the next:
7625   *
7626   *          - cond_resched() call
7627   *          - explicit schedule() call
7628   *          - return from syscall or exception to user-space
7629   *          - return from interrupt-handler to user-space
7630   *
7631   * WARNING: must be called with preemption disabled!
       *
       * @sched_mode: one of the SM_* values; this function compares it against
       *              SM_NONE, SM_PREEMPT and SM_IDLE.
7632   */
7633  static void __sched notrace __schedule(int sched_mode)
7634  {
7635  	struct task_struct *prev, *next;
7636  	/*
7637  	 * On PREEMPT_RT kernel, SM_RTLOCK_WAIT is noted
7638  	 * as a preemption by schedule_debug() and RCU.
7639  	 */
7640  	bool preempt = sched_mode > SM_NONE;
7641  	bool block = false;
7642  	unsigned long *switch_count;
7643  	unsigned long prev_state;
7644  	struct rq_flags rf;
7645  	struct rq *rq;
7646  	bool prev_not_proxied;
7647  	int cpu;
7648  
7649  	cpu = smp_processor_id();
7650  	rq = cpu_rq(cpu);
7651  	prev = rq->curr;
7652  
7653  	schedule_debug(prev, preempt);
7654  
7655  	if (sched_feat(HRTICK) || sched_feat(HRTICK_DL))
7656  		hrtick_clear(rq);
7657  
7658  	local_irq_disable();
7659  	rcu_note_context_switch(preempt);
7660  
7661  	/*
7662  	 * Make sure that signal_pending_state()->signal_pending() below
7663  	 * can't be reordered with __set_current_state(TASK_INTERRUPTIBLE)
7664  	 * done by the caller to avoid the race with signal_wake_up():
7665  	 *
7666  	 * __set_current_state(@state)		signal_wake_up()
7667  	 * schedule()				  set_tsk_thread_flag(p, TIF_SIGPENDING)
7668  	 *					  wake_up_state(p, state)
7669  	 *   LOCK rq->lock			    LOCK p->pi_state
7670  	 *   smp_mb__after_spinlock()		    smp_mb__after_spinlock()
7671  	 *     if (signal_pending_state())	    if (p->state & @state)
7672  	 *
7673  	 * Also, the membarrier system call requires a full memory barrier
7674  	 * after coming from user-space, before storing to rq->curr; this
7675  	 * barrier matches a full barrier in the proximity of the membarrier
7676  	 * system call exit.
7677  	 */
7678  	rq_lock(rq, &rf);
7679  	smp_mb__after_spinlock();
7680  
7681  	/* Promote REQ to ACT */
7682  	rq->clock_update_flags <<= 1;
7683  	update_rq_clock(rq);
7684  	rq->clock_update_flags = RQCF_UPDATED;
7685  
      	/* Default to the nivcsw counter; the blocking branch below switches to nvcsw. */
7686  	switch_count = &prev->nivcsw;
7687  
7688  	/* Task state changes only considers SM_PREEMPT as preemption */
7689  	preempt = sched_mode == SM_PREEMPT;
7690  
7691  	/*
7692  	 * We must load prev->state once (task_struct::state is volatile), such
7693  	 * that we form a control dependency vs deactivate_task() below.
7694  	 */
7695  	prev_state = READ_ONCE(prev->__state);
      	/*
      	 * SM_IDLE shortcut: with nothing on the rq and scx_enabled() false,
      	 * keep prev as next and jump past task selection entirely.
      	 */
7696  	if (sched_mode == SM_IDLE) {
7697  		/* SCX must consult the BPF scheduler to tell if rq is empty */
7698  		if (!rq->nr_running && !scx_enabled()) {
7699  			next = prev;
7700  			goto picked;
7701  		}
7702  	} else if (!preempt && prev_state) {
7703  		/*
7704  		 * We pass task_is_blocked() as the should_block arg
7705  		 * in order to keep mutex-blocked tasks on the runqueue
7706  		 * for slection with proxy-exec (without proxy-exec
7707  		 * task_is_blocked() will always be false).
7708  		 */
7709  		block = try_to_block_task(rq, prev, &prev_state,
7710  					  !task_is_blocked(prev));
7711  		switch_count = &prev->nvcsw;
7712  	}
7713  
7714  	prev_not_proxied = !prev->blocked_donor;
7715  
7716  	trace_sched_start_task_selection(prev, cpu, task_is_blocked(prev));
      /*
       * Selection loop: the picked task becomes the rq donor. If that task is
       * blocked, find_proxy_task() supplies the task to actually run; a NULL
       * result restarts the pick, and rq->idle jumps to keep_resched, which
       * bypasses the two need_resched clears under 'picked'.
       */
7717  pick_again:
7718  	next = pick_next_task(rq, rq->donor, &rf);   --》 cfs执行pick_next_task_fair
7719  	rq_set_donor(rq, next);
7720  	next->blocked_donor = NULL;
7721  	if (unlikely(task_is_blocked(next))) {
7722  		next = find_proxy_task(rq, next, &rf);
7723  		if (!next)
7724  			goto pick_again;
7725  		if (next == rq->idle)
7726  			goto keep_resched;
7727  	}
7728  	trace_sched_finish_task_selection(rq->donor, next, cpu);
7729  picked:
7730  	clear_tsk_need_resched(prev);
7731  	clear_preempt_need_resched();
7732  keep_resched:
7733  #ifdef CONFIG_SCHED_DEBUG
7734  	rq->last_seen_need_resched_ns = 0;
7735  #endif
7736  
7737  	trace_android_rvh_schedule(prev, next, rq);
7738  	if (likely(prev != next)) {
7739  		rq->nr_switches++;
7740  		/*
7741  		 * RCU users of rcu_dereference(rq->curr) may not see
7742  		 * changes to task_struct made by pick_next_task().
7743  		 */
7744  		RCU_INIT_POINTER(rq->curr, next);
7745  
7746  		if (!task_current_donor(rq, next))
7747  			proxy_tag_curr(rq, next);
7748  
7749  		/*
7750  		 * The membarrier system call requires each architecture
7751  		 * to have a full memory barrier after updating
7752  		 * rq->curr, before returning to user-space.
7753  		 *
7754  		 * Here are the schemes providing that barrier on the
7755  		 * various architectures:
7756  		 * - mm ? switch_mm() : mmdrop() for x86, s390, sparc, PowerPC,
7757  		 *   RISC-V.  switch_mm() relies on membarrier_arch_switch_mm()
7758  		 *   on PowerPC and on RISC-V.
7759  		 * - finish_lock_switch() for weakly-ordered
7760  		 *   architectures where spin_unlock is a full barrier,
7761  		 * - switch_to() for arm64 (weakly-ordered, spin_unlock
7762  		 *   is a RELEASE barrier),
7763  		 *
7764  		 * The barrier matches a full barrier in the proximity of
7765  		 * the membarrier system call entry.
7766  		 *
7767  		 * On RISC-V, this barrier pairing is also needed for the
7768  		 * SYNC_CORE command when switching between processes, cf.
7769  		 * the inline comments in membarrier_arch_switch_mm().
7770  		 */
7771  		++*switch_count;
7772  
7773  		migrate_disable_switch(rq, prev);
7774  		psi_account_irqtime(rq, prev, next);
7775  		psi_sched_switch(prev, next, !task_on_rq_queued(prev) ||
7776  					     prev->se.sched_delayed);
7777  
7778  		trace_sched_switch(preempt, prev, next, prev_state);
7779  
      		/*
      		 * Emit the blocked-reason tracepoint only when prev was actually
      		 * blocked above, in a state with TASK_UNINTERRUPTIBLE set, and the
      		 * tracepoint is enabled. blocked_func stays 0 without
      		 * CONFIG_STACKTRACE.
      		 */
7780  		if (block && (prev_state & TASK_UNINTERRUPTIBLE)
7781  			&& trace_sched_blocked_reason_enabled()) {
7782  			unsigned long blocked_func = 0;
7783  
7784  #ifdef CONFIG_STACKTRACE
7785  			stack_trace_save_tsk(prev, &blocked_func, 1, 0);
7786  #endif
7787  			trace_sched_blocked_reason(prev, (void *)blocked_func);
7788  		}
7789  
7790  		/* Also unlocks the rq: */
7791  		rq = context_switch(rq, prev, next, &rf);
7792  	} else {
7793  		/* In case next was already curr but just got blocked_donor */
7794  		if (prev_not_proxied && next->blocked_donor)
7795  			proxy_tag_curr(rq, next);
7796  
      		/* No context switch on this path, so the rq lock is released here instead. */
7797  		rq_unpin_lock(rq, &rf);
7798  		__balance_callbacks(rq);
7799  		raw_spin_rq_unlock_irq(rq);
7800  	}
7801  }
7802  


/*
 * Pick the next fair-class task for @rq, falling back to a newidle balance
 * when pick_task_fair() finds nothing to run.
 *
 * @rq:   runqueue to pick from
 * @prev: task the caller is switching away from
 * @rf:   rq_flags handed to sched_balance_newidle(); when NULL the idle path
 *        returns NULL without attempting a balance
 *
 * Returns the selected task, NULL when nothing is runnable, or RETRY_TASK
 * when sched_balance_newidle() returns a negative value.
 */
struct task_struct *
9113  pick_next_task_fair(struct rq *rq, struct task_struct *prev, struct rq_flags *rf)
9114  {
9115  	struct sched_entity *se;
9116  	struct task_struct *p = NULL;
9117  	int new_tasks;
9118  
9119  again:
      	/* The hook receives &p; pick_task_fair() is consulted only if p is still NULL afterwards. */
9120  	trace_android_rvh_before_pick_task_fair(rq, &p, prev, rf);
9121  	if (!p) {
9122  		p = pick_task_fair(rq);
9123  		trace_android_rvh_replace_next_task_fair(rq, &p, prev);
9124  	}
9125  
9126  	if (!p)
9127  		goto idle;
9128  	se = &p->se;
9129  
9130  #ifdef CONFIG_FAIR_GROUP_SCHED
      	/*
      	 * The partial put/set walk below is used only when prev belongs to
      	 * the fair class and rq->curr equals rq->donor; otherwise take the
      	 * plain put_prev_set_next_task() path at 'simple'.
      	 */
9131  	if (prev->sched_class != &fair_sched_class ||
9132  	    rq->curr != rq->donor)
9133  		goto simple;
9134  
9135  	__put_prev_set_next_dl_server(rq, prev, p);
9136  
9137  	/*
9138  	 * Because of the set_next_buddy() in dequeue_task_fair() it is rather
9139  	 * likely that a next task is from the same cgroup as the current.
9140  	 *
9141  	 * Therefore attempt to avoid putting and setting the entire cgroup
9142  	 * hierarchy, only change the part that actually changes.
9143  	 *
9144  	 * Since we haven't yet done put_prev_entity and if the selected task
9145  	 * is a different task than we started out with, try and touch the
9146  	 * least amount of cfs_rqs.
9147  	 */
9148  	if (prev != p) {
9149  		struct sched_entity *pse = &prev->se;
9150  		struct cfs_rq *cfs_rq;
9151  
      		/*
      		 * Climb from both entities, deeper one first (both when the
      		 * depths are equal), until is_same_group() yields a shared cfs_rq.
      		 */
9152  		while (!(cfs_rq = is_same_group(se, pse))) {
9153  			int se_depth = se->depth;
9154  			int pse_depth = pse->depth;
9155  
9156  			if (se_depth <= pse_depth) {
9157  				put_prev_entity(cfs_rq_of(pse), pse);
9158  				pse = parent_entity(pse);
9159  			}
9160  			if (se_depth >= pse_depth) {
9161  				set_next_entity(cfs_rq_of(se), se);
9162  				se = parent_entity(se);
9163  			}
9164  		}
9165  
9166  		put_prev_entity(cfs_rq, pse);
9167  		set_next_entity(cfs_rq, se);
9168  
9169  		__set_next_task_fair(rq, p, true);
9170  	}
9171  
9172  	return p;
9173  
9174  simple:
9175  #endif
9176  	put_prev_set_next_task(rq, prev, p);
9177  	return p;
9178  
9179  idle:
      	/* A NULL rf means no newidle balance is attempted: report "nothing to run". */
9180  	if (!rf)
9181  		return NULL;
9182  
9183  	new_tasks = sched_balance_newidle(rq, rf);   --> 这里会拉取其它task
9184  
9185  	/*
9186  	 * Because sched_balance_newidle() releases (and re-acquires) rq->lock, it is
9187  	 * possible for any higher priority task to appear. In that case we
9188  	 * must re-start the pick_next_entity() loop.
9189  	 */
9190  	if (new_tasks < 0)
9191  		return RETRY_TASK;
9192  
      	/* A positive result restarts the pick from 'again'. */
9193  	if (new_tasks > 0)
9194  		goto again;
9195  
9196  	/*
9197  	 * rq is about to be idle, check if we need to update the
9198  	 * lost_idle_time of clock_pelt
9199  	 */
9200  	update_idle_rq_clock_pelt(rq);
9201  
9202  	return NULL;
9203  }

假设 CPU 0 上的任务刚刚执行完毕,或者主动睡眠了。此时 CPU 0 的可运行队列空了,它马上就要进入 Idle(发呆)状态了。 内核会想:"与其让你闲着,不如你去看看别人忙不忙?"

  • 调用时机 :在 __schedule() → pick_next_task() → pick_next_task_fair() 中,当公平调度类在本 CPU 上挑不出任何可运行任务(走到 idle: 标签)、眼看就要切换到 idle 线程时,会立刻调用 sched_balance_newidle()

  • 行为:CPU 0 会赶在自己彻底睡死之前,主动去扒拉其他忙碌 CPU(比如 CPU 1)的队列,强行把任务"偷"过来自己运行。这大大降低了任务的排队延迟。

3. 唤醒/新建负载均衡 (Wakeup/Fork/Exec Load Balance) ------ 【防患于未然】

假设进程 A 在 CPU 0 上调用了 fork() 创建了进程 B,或者进程 B 一直在睡眠,现在被硬件中断唤醒了。 内核会想:"我该把进程 B 放在哪个 CPU 上最合适呢?"

  • 调用时机 :在 try_to_wake_up()(唤醒进程)、wake_up_new_task()(fork 出的新进程首次被唤醒)或系统调用 execve() 时,内核会调用 select_task_rq_fair()

  • 行为 :内核会在将进程 B 放入就绪队列之前 ,纵观全局,计算各个 CPU 的负载(利用前面提到的 util_avg 等数据)。如果发现 CPU 0 很忙,而 CPU 2 很闲,内核会直接把进程 B 唤醒到 CPU 2 的队列里。这就是一种"主动分配"的负载均衡。

总结

你看到的 sched_tick 里的代码,只是 Linux 复杂负载均衡宏大版图的三分之一。

  • 唤醒负载均衡 是"新生儿分班",尽量一开始就分均匀。

  • 空闲负载均衡 是"主动抢活干",绝不让自己闲着。

  • 周期性负载均衡(Tick) 则是"定期大盘点",防止出现长期积劳成疾的极端不平衡。这三者结合,才造就了 Linux 在多核时代强悍的调度性能。

相关推荐
SelectDB18 小时前
Litefuse 开源并推出单进程轻量模式,25 秒就能跑起来的 Agent 可观测与评估平台
运维·后端·自动化运维
XIAOHEZIcode2 天前
Linux系统鼠标偏移常见原因以及修复方案
linux·运维·游戏
用户0328472220703 天前
如何搭建本地yum源(上)
运维
大树886 天前
金刚石散热越强,管路越先见顶
大数据·运维·服务器·人工智能·ai
摇滚侠6 天前
Linux CentOS7 rpm 安装 MySQL 5.7
linux·运维·mysql
霸道流氓气质6 天前
领域驱动设计(DDD)在 Spring Boot 微服务中的实践指南
运维·spring boot·微服务
Inhand陈工6 天前
基于台达PLC与映翰通IG502的智慧水产养殖精准投喂与远程运维解决方案
运维·人工智能·物联网·阿里云·信息与通信
酣大智6 天前
ARP代理--工作原理
运维·网络·arp·arp代理
shushangyun_6 天前
2026年快消品B2B系统推荐:支持终端门店订货、促销政策自动化的工具?
java·运维·网络·数据库·人工智能·spring·自动化
施努卡机器视觉6 天前
SNK施努卡侧滑门锁上滑轮总成自动化装配线,从零件到组件,全流程精密制造方案
运维·自动化·制造