一、 负载均衡只有在 sched_tick 时才会发生吗?
答案:绝对不是!Tick 触发的仅仅是"兜底"机制,真正的负载均衡在日常调度中无处不在。
如果只靠 Tick(比如每 4ms 一次)来做负载均衡,系统的响应延迟会非常高。Linux 内核 CFS 调度器主要有三大类负载均衡场景:
1. 周期性负载均衡 (Periodic Load Balance)
c
/*
6203 * This function gets called by the timer code, with HZ frequency.
6204 * We call it with interrupts disabled.
6205 */
6206 void sched_tick(void)
6207 {
6208 int cpu = smp_processor_id();
6209 struct rq *rq = cpu_rq(cpu);
6210 /* accounting goes to the donor task */
6211 struct task_struct *donor;
6212 struct rq_flags rf;
6213 unsigned long hw_pressure;
6214 u64 resched_latency;
6215
6216 if (housekeeping_cpu(cpu, HK_TYPE_TICK))
6217 arch_scale_freq_tick();
6218
6219 sched_clock_tick();
6220
6221 rq_lock(rq, &rf);
6222 donor = rq->donor;
6223
6224 psi_account_irqtime(rq, donor, NULL);
6225
6226 update_rq_clock(rq);
6227 trace_android_rvh_tick_entry(rq);
6228 hw_pressure = arch_scale_hw_pressure(cpu_of(rq));
6229 update_hw_load_avg(rq_clock_task(rq), rq, hw_pressure);
6230 donor->sched_class->task_tick(rq, donor, 0);
6231 if (sched_feat(LATENCY_WARN))
6232 resched_latency = cpu_resched_latency(rq);
6233 calc_global_load_tick(rq);
6234 sched_core_tick(rq);
6235 task_tick_mm_cid(rq, donor);
6236 scx_tick(rq);
6237
6238 rq_unlock(rq, &rf);
6239
6240 if (sched_feat(LATENCY_WARN) && resched_latency)
6241 resched_latency_warn(cpu, resched_latency);
6242
6243 perf_event_task_tick();
6244
6245 if (donor->flags & PF_WQ_WORKER)
6246 wq_worker_tick(donor);
6247
6248 #ifdef CONFIG_SMP
6249 if (!scx_switched_all()) {
6250 rq->idle_balance = idle_cpu(cpu);
6251 sched_balance_trigger(rq);
6252 }
6253 #endif
6254
6255 trace_android_vh_scheduler_tick(rq);
6256 }
6257
/*
13238 * Trigger the SCHED_SOFTIRQ if it is time to do periodic load balancing.
13239 */
13240 void sched_balance_trigger(struct rq *rq)
13241 {
13242 /*
13243 * Don't need to rebalance while attached to NULL domain or
13244 * runqueue CPU is not active
13245 */
13246 if (unlikely(on_null_domain(rq) || !cpu_active(cpu_of(rq))))
13247 return;
13248
13249 if (time_after_eq(jiffies, rq->next_balance))
13250 raise_softirq(SCHED_SOFTIRQ);
13251
13252 nohz_balancer_kick(rq);
13253 }
/*
12625 * Current decision point for kicking the idle load balancer in the presence
12626 * of idle CPUs in the system.
12627 */
12628 static void nohz_balancer_kick(struct rq *rq)
12629 {
12630 unsigned long now = jiffies;
12631 struct sched_domain_shared *sds;
12632 struct sched_domain *sd;
12633 int nr_busy, i, cpu = rq->cpu;
12634 unsigned int flags = 0;
12635 int done = 0;
12636
12637 if (unlikely(rq->idle_balance))
12638 return;
12639
12640 /*
12641 * We may be recently in ticked or tickless idle mode. At the first
12642 * busy tick after returning from idle, we will update the busy stats.
12643 */
12644 nohz_balance_exit_idle(rq);
12645
12646 /*
12647 * None are in tickless mode and hence no need for NOHZ idle load
12648 * balancing:
12649 */
12650 if (likely(!atomic_read(&nohz.nr_cpus)))
12651 return;
12652
12653 if (READ_ONCE(nohz.has_blocked) &&
12654 time_after(now, READ_ONCE(nohz.next_blocked)))
12655 flags = NOHZ_STATS_KICK;
12656
12657 if (time_before(now, nohz.next_balance))
12658 goto out;
12659
12660 trace_android_rvh_sched_nohz_balancer_kick(rq, &flags, &done);
12661 if (done)
12662 goto out;
12663
12664 if (rq->nr_running >= 2) {
12665 flags = NOHZ_STATS_KICK | NOHZ_BALANCE_KICK;
12666 goto out;
12667 }
12668
12669 rcu_read_lock();
12670
12671 sd = rcu_dereference(rq->sd);
12672 if (sd) {
12673 /*
12674 * If there's a runnable CFS task and the current CPU has reduced
12675 * capacity, kick the ILB to see if there's a better CPU to run on:
12676 */
12677 if (rq->cfs.h_nr_running >= 1 && check_cpu_capacity(rq, sd)) {
12678 flags = NOHZ_STATS_KICK | NOHZ_BALANCE_KICK;
12679 goto unlock;
12680 }
12681 }
12682
12683 sd = rcu_dereference(per_cpu(sd_asym_packing, cpu));
12684 if (sd) {
12685 /*
12686 * When ASYM_PACKING; see if there's a more preferred CPU
12687 * currently idle; in which case, kick the ILB to move tasks
12688 * around.
12689 *
12690 * When balancing between cores, all the SMT siblings of the
12691 * preferred CPU must be idle.
12692 */
12693 for_each_cpu_and(i, sched_domain_span(sd), nohz.idle_cpus_mask) {
12694 if (sched_asym(sd, i, cpu)) {
12695 flags = NOHZ_STATS_KICK | NOHZ_BALANCE_KICK;
12696 goto unlock;
12697 }
12698 }
12699 }
12700
12701 sd = rcu_dereference(per_cpu(sd_asym_cpucapacity, cpu));
12702 if (sd) {
12703 /*
12704 * When ASYM_CPUCAPACITY; see if there's a higher capacity CPU
12705 * to run the misfit task on.
12706 */
12707 if (check_misfit_status(rq)) {
12708 flags = NOHZ_STATS_KICK | NOHZ_BALANCE_KICK;
12709 goto unlock;
12710 }
12711
12712 /*
12713 * For asymmetric systems, we do not want to nicely balance
12714 * cache use, instead we want to embrace asymmetry and only
12715 * ensure tasks have enough CPU capacity.
12716 *
12717 * Skip the LLC logic because it's not relevant in that case.
12718 */
12719 goto unlock;
12720 }
12721
12722 sds = rcu_dereference(per_cpu(sd_llc_shared, cpu));
12723 if (sds) {
12724 /*
12725 * If there is an imbalance between LLC domains (IOW we could
12726 * increase the overall cache utilization), we need a less-loaded LLC
12727 * domain to pull some load from. Likewise, we may need to spread
12728 * load within the current LLC domain (e.g. packed SMT cores but
12729 * other CPUs are idle). We can't really know from here how busy
12730 * the others are - so just get a NOHZ balance going if it looks
12731 * like this LLC domain has tasks we could move.
12732 */
12733 nr_busy = atomic_read(&sds->nr_busy_cpus);
12734 if (nr_busy > 1) {
12735 flags = NOHZ_STATS_KICK | NOHZ_BALANCE_KICK;
12736 goto unlock;
12737 }
12738 }
12739 unlock:
12740 rcu_read_unlock();
12741 out:
12742 if (READ_ONCE(nohz.needs_update))
12743 flags |= NOHZ_NEXT_KICK;
12744
12745 if (flags)
12746 kick_ilb(flags);
12747 }
12748
这就是你贴出的代码。它由 Tick 驱动,属于一种"亡羊补牢"的兜底策略。用于纠正系统长时间运行后产生的不平衡(比如 CPU 0 上的几个任务突然变成了死循环,而 CPU 1 上的任务全睡着了)。
2. 新空闲负载均衡 (New Idle Load Balance) ------ 【极其主动,极度硬核】
c
/*
7595 * __schedule() is the main scheduler function.
7596 *
7597 * The main means of driving the scheduler and thus entering this function are:
7598 *
7599 * 1. Explicit blocking: mutex, semaphore, waitqueue, etc.
7600 *
7601 * 2. TIF_NEED_RESCHED flag is checked on interrupt and userspace return
7602 * paths. For example, see arch/x86/entry_64.S.
7603 *
7604 * To drive preemption between tasks, the scheduler sets the flag in timer
7605 * interrupt handler sched_tick().
7606 *
7607 * 3. Wakeups don't really cause entry into schedule(). They add a
7608 * task to the run-queue and that's it.
7609 *
7610 * Now, if the new task added to the run-queue preempts the current
7611 * task, then the wakeup sets TIF_NEED_RESCHED and schedule() gets
7612 * called on the nearest possible occasion:
7613 *
7614 * - If the kernel is preemptible (CONFIG_PREEMPTION=y):
7615 *
7616 * - in syscall or exception context, at the next outmost
7617 * preempt_enable(). (this might be as soon as the wake_up()'s
7618 * spin_unlock()!)
7619 *
7620 * - in IRQ context, return from interrupt-handler to
7621 * preemptible context
7622 *
7623 * - If the kernel is not preemptible (CONFIG_PREEMPTION is not set)
7624 * then at the next:
7625 *
7626 * - cond_resched() call
7627 * - explicit schedule() call
7628 * - return from syscall or exception to user-space
7629 * - return from interrupt-handler to user-space
7630 *
7631 * WARNING: must be called with preemption disabled!
7632 */
7633 static void __sched notrace __schedule(int sched_mode)
7634 {
7635 struct task_struct *prev, *next;
7636 /*
7637 * On PREEMPT_RT kernel, SM_RTLOCK_WAIT is noted
7638 * as a preemption by schedule_debug() and RCU.
7639 */
7640 bool preempt = sched_mode > SM_NONE;
7641 bool block = false;
7642 unsigned long *switch_count;
7643 unsigned long prev_state;
7644 struct rq_flags rf;
7645 struct rq *rq;
7646 bool prev_not_proxied;
7647 int cpu;
7648
7649 cpu = smp_processor_id();
7650 rq = cpu_rq(cpu);
7651 prev = rq->curr;
7652
7653 schedule_debug(prev, preempt);
7654
7655 if (sched_feat(HRTICK) || sched_feat(HRTICK_DL))
7656 hrtick_clear(rq);
7657
7658 local_irq_disable();
7659 rcu_note_context_switch(preempt);
7660
7661 /*
7662 * Make sure that signal_pending_state()->signal_pending() below
7663 * can't be reordered with __set_current_state(TASK_INTERRUPTIBLE)
7664 * done by the caller to avoid the race with signal_wake_up():
7665 *
7666 * __set_current_state(@state) signal_wake_up()
7667 * schedule() set_tsk_thread_flag(p, TIF_SIGPENDING)
7668 * wake_up_state(p, state)
7669 * LOCK rq->lock LOCK p->pi_state
7670 * smp_mb__after_spinlock() smp_mb__after_spinlock()
7671 * if (signal_pending_state()) if (p->state & @state)
7672 *
7673 * Also, the membarrier system call requires a full memory barrier
7674 * after coming from user-space, before storing to rq->curr; this
7675 * barrier matches a full barrier in the proximity of the membarrier
7676 * system call exit.
7677 */
7678 rq_lock(rq, &rf);
7679 smp_mb__after_spinlock();
7680
7681 /* Promote REQ to ACT */
7682 rq->clock_update_flags <<= 1;
7683 update_rq_clock(rq);
7684 rq->clock_update_flags = RQCF_UPDATED;
7685
7686 switch_count = &prev->nivcsw;
7687
7688 /* Task state changes only considers SM_PREEMPT as preemption */
7689 preempt = sched_mode == SM_PREEMPT;
7690
7691 /*
7692 * We must load prev->state once (task_struct::state is volatile), such
7693 * that we form a control dependency vs deactivate_task() below.
7694 */
7695 prev_state = READ_ONCE(prev->__state);
7696 if (sched_mode == SM_IDLE) {
7697 /* SCX must consult the BPF scheduler to tell if rq is empty */
7698 if (!rq->nr_running && !scx_enabled()) {
7699 next = prev;
7700 goto picked;
7701 }
7702 } else if (!preempt && prev_state) {
7703 /*
7704 * We pass task_is_blocked() as the should_block arg
7705 * in order to keep mutex-blocked tasks on the runqueue
7706 * for slection with proxy-exec (without proxy-exec
7707 * task_is_blocked() will always be false).
7708 */
7709 block = try_to_block_task(rq, prev, &prev_state,
7710 !task_is_blocked(prev));
7711 switch_count = &prev->nvcsw;
7712 }
7713
7714 prev_not_proxied = !prev->blocked_donor;
7715
7716 trace_sched_start_task_selection(prev, cpu, task_is_blocked(prev));
7717 pick_again:
7718 next = pick_next_task(rq, rq->donor, &rf); --》 cfs执行pick_next_task_fair
7719 rq_set_donor(rq, next);
7720 next->blocked_donor = NULL;
7721 if (unlikely(task_is_blocked(next))) {
7722 next = find_proxy_task(rq, next, &rf);
7723 if (!next)
7724 goto pick_again;
7725 if (next == rq->idle)
7726 goto keep_resched;
7727 }
7728 trace_sched_finish_task_selection(rq->donor, next, cpu);
7729 picked:
7730 clear_tsk_need_resched(prev);
7731 clear_preempt_need_resched();
7732 keep_resched:
7733 #ifdef CONFIG_SCHED_DEBUG
7734 rq->last_seen_need_resched_ns = 0;
7735 #endif
7736
7737 trace_android_rvh_schedule(prev, next, rq);
7738 if (likely(prev != next)) {
7739 rq->nr_switches++;
7740 /*
7741 * RCU users of rcu_dereference(rq->curr) may not see
7742 * changes to task_struct made by pick_next_task().
7743 */
7744 RCU_INIT_POINTER(rq->curr, next);
7745
7746 if (!task_current_donor(rq, next))
7747 proxy_tag_curr(rq, next);
7748
7749 /*
7750 * The membarrier system call requires each architecture
7751 * to have a full memory barrier after updating
7752 * rq->curr, before returning to user-space.
7753 *
7754 * Here are the schemes providing that barrier on the
7755 * various architectures:
7756 * - mm ? switch_mm() : mmdrop() for x86, s390, sparc, PowerPC,
7757 * RISC-V. switch_mm() relies on membarrier_arch_switch_mm()
7758 * on PowerPC and on RISC-V.
7759 * - finish_lock_switch() for weakly-ordered
7760 * architectures where spin_unlock is a full barrier,
7761 * - switch_to() for arm64 (weakly-ordered, spin_unlock
7762 * is a RELEASE barrier),
7763 *
7764 * The barrier matches a full barrier in the proximity of
7765 * the membarrier system call entry.
7766 *
7767 * On RISC-V, this barrier pairing is also needed for the
7768 * SYNC_CORE command when switching between processes, cf.
7769 * the inline comments in membarrier_arch_switch_mm().
7770 */
7771 ++*switch_count;
7772
7773 migrate_disable_switch(rq, prev);
7774 psi_account_irqtime(rq, prev, next);
7775 psi_sched_switch(prev, next, !task_on_rq_queued(prev) ||
7776 prev->se.sched_delayed);
7777
7778 trace_sched_switch(preempt, prev, next, prev_state);
7779
7780 if (block && (prev_state & TASK_UNINTERRUPTIBLE)
7781 && trace_sched_blocked_reason_enabled()) {
7782 unsigned long blocked_func = 0;
7783
7784 #ifdef CONFIG_STACKTRACE
7785 stack_trace_save_tsk(prev, &blocked_func, 1, 0);
7786 #endif
7787 trace_sched_blocked_reason(prev, (void *)blocked_func);
7788 }
7789
7790 /* Also unlocks the rq: */
7791 rq = context_switch(rq, prev, next, &rf);
7792 } else {
7793 /* In case next was already curr but just got blocked_donor */
7794 if (prev_not_proxied && next->blocked_donor)
7795 proxy_tag_curr(rq, next);
7796
7797 rq_unpin_lock(rq, &rf);
7798 __balance_callbacks(rq);
7799 raw_spin_rq_unlock_irq(rq);
7800 }
7801 }
7802
struct task_struct *
9113 pick_next_task_fair(struct rq *rq, struct task_struct *prev, struct rq_flags *rf)
9114 {
9115 struct sched_entity *se;
9116 struct task_struct *p = NULL;
9117 int new_tasks;
9118
9119 again:
9120 trace_android_rvh_before_pick_task_fair(rq, &p, prev, rf);
9121 if (!p) {
9122 p = pick_task_fair(rq);
9123 trace_android_rvh_replace_next_task_fair(rq, &p, prev);
9124 }
9125
9126 if (!p)
9127 goto idle;
9128 se = &p->se;
9129
9130 #ifdef CONFIG_FAIR_GROUP_SCHED
9131 if (prev->sched_class != &fair_sched_class ||
9132 rq->curr != rq->donor)
9133 goto simple;
9134
9135 __put_prev_set_next_dl_server(rq, prev, p);
9136
9137 /*
9138 * Because of the set_next_buddy() in dequeue_task_fair() it is rather
9139 * likely that a next task is from the same cgroup as the current.
9140 *
9141 * Therefore attempt to avoid putting and setting the entire cgroup
9142 * hierarchy, only change the part that actually changes.
9143 *
9144 * Since we haven't yet done put_prev_entity and if the selected task
9145 * is a different task than we started out with, try and touch the
9146 * least amount of cfs_rqs.
9147 */
9148 if (prev != p) {
9149 struct sched_entity *pse = &prev->se;
9150 struct cfs_rq *cfs_rq;
9151
9152 while (!(cfs_rq = is_same_group(se, pse))) {
9153 int se_depth = se->depth;
9154 int pse_depth = pse->depth;
9155
9156 if (se_depth <= pse_depth) {
9157 put_prev_entity(cfs_rq_of(pse), pse);
9158 pse = parent_entity(pse);
9159 }
9160 if (se_depth >= pse_depth) {
9161 set_next_entity(cfs_rq_of(se), se);
9162 se = parent_entity(se);
9163 }
9164 }
9165
9166 put_prev_entity(cfs_rq, pse);
9167 set_next_entity(cfs_rq, se);
9168
9169 __set_next_task_fair(rq, p, true);
9170 }
9171
9172 return p;
9173
9174 simple:
9175 #endif
9176 put_prev_set_next_task(rq, prev, p);
9177 return p;
9178
9179 idle:
9180 if (!rf)
9181 return NULL;
9182
9183 new_tasks = sched_balance_newidle(rq, rf); --> 这里会拉取其它task
9184
9185 /*
9186 * Because sched_balance_newidle() releases (and re-acquires) rq->lock, it is
9187 * possible for any higher priority task to appear. In that case we
9188 * must re-start the pick_next_entity() loop.
9189 */
9190 if (new_tasks < 0)
9191 return RETRY_TASK;
9192
9193 if (new_tasks > 0)
9194 goto again;
9195
9196 /*
9197 * rq is about to be idle, check if we need to update the
9198 * lost_idle_time of clock_pelt
9199 */
9200 update_idle_rq_clock_pelt(rq);
9201
9202 return NULL;
9203 }
假设 CPU 0 上的任务刚刚执行完毕,或者主动睡眠了。此时 CPU 0 的可运行队列空了,它马上就要进入 Idle(发呆)状态了。 内核会想:"与其让你闲着,不如你去看看别人忙不忙?"
- 调用时机:在 __schedule() 中,当 pick_next_task_fair() 发现本地运行队列没有可选任务、CPU 即将进入 idle 时,会在上面代码的 idle 路径里调用 sched_balance_newidle()。
- 行为:CPU 0 会赶在自己彻底睡死之前,主动去扒拉其他忙碌 CPU(比如 CPU 1)的队列,强行把任务"偷"过来自己运行。这大大降低了任务的排队延迟。
3. 唤醒/新建负载均衡 (Wakeup/Exec Load Balance) ------ 【防患于未然】
假设进程 A 在 CPU 0 上调用了 fork() 创建了进程 B,或者进程 B 一直在睡眠,现在被硬件中断唤醒了。 内核会想:"我该把进程 B 放在哪个 CPU 上最合适呢?"
- 调用时机:在 try_to_wake_up()(唤醒进程)、fork() 或系统调用 execve() 时,内核会调用 select_task_rq_fair() 为任务挑选 CPU。
- 行为:内核会在将进程 B 放入就绪队列之前,纵观全局,计算各个 CPU 的负载(利用前面提到的 util_avg 等数据)。如果发现 CPU 0 很忙,而 CPU 2 很闲,内核会直接把进程 B 唤醒到 CPU 2 的队列里。这就是一种"主动分配"的负载均衡。
总结
你看到的 sched_tick 里的代码,只是 Linux 复杂负载均衡宏大版图的三分之一。
- 唤醒负载均衡 是"新生儿分班",尽量一开始就分均匀。
- 新空闲负载均衡 是"主动抢活干",绝不让自己闲着。
- 周期性负载均衡(Tick) 则是"定期大盘点",防止出现长期积劳成疾的极端不平衡。这三者结合,才造就了 Linux 在多核时代强悍的调度性能。