负载均衡流程

1、负载均衡流程图

2、触发负载均衡函数trigger_load_balance

/*
 * trigger_load_balance - kick load balancing for a runqueue when it is due.
 * @rq: runqueue to examine.
 *
 * Raises SCHED_SOFTIRQ once jiffies has reached rq->next_balance and, when
 * CONFIG_NO_HZ_COMMON is enabled, kicks the nohz balancer if
 * nohz_kick_needed() asks for it.
 */
void trigger_load_balance(struct rq *rq)
{
	/* Nothing to rebalance while the runqueue is attached to the NULL domain. */
	if (unlikely(on_null_domain(rq))) {
		return;
	}

	/*
	 * The next balance point has been reached: raise the scheduler
	 * softirq. Its handler is registered in init_sched_fair_class() with
	 * open_softirq(SCHED_SOFTIRQ, run_rebalance_domains).
	 */
	if (time_after_eq(jiffies, rq->next_balance)) {
		raise_softirq(SCHED_SOFTIRQ);
	}
#ifdef CONFIG_NO_HZ_COMMON
	if (nohz_kick_needed(rq, false)) {
		nohz_balancer_kick(false);
	}
#endif
}

2.1 run_rebalance_domains

/*
 * run_rebalance_domains - softirq-side entry point of the periodic balance.
 * @h: softirq action descriptor; not used by this function.
 *
 * Balances on behalf of nohz-idle CPUs first, refreshes the blocked averages
 * of the current CPU, and then rebalances the current runqueue's own domains.
 */
static __latent_entropy void run_rebalance_domains(struct softirq_action *h)
{
	/* Runqueue of the CPU executing this function. */
	struct rq *this_rq = this_rq();
	/* Classify the runqueue as idle or not idle from rq->idle_balance. */
	enum cpu_idle_type idle = this_rq->idle_balance ?
						CPU_IDLE : CPU_NOT_IDLE;

	/*
	 * If this cpu has a pending nohz_balance_kick, then do the
	 * balancing on behalf of the other idle cpus whose ticks are
	 * stopped. Do nohz_idle_balance *before* rebalance_domains to
	 * give the idle cpus a chance to load balance. Else we may
	 * load balance only within the local sched_domain hierarchy
	 * and abort nohz_idle_balance altogether if we pull some load.
	 */
	nohz_idle_balance(this_rq, idle);

	/* Update the blocked averages of this CPU. */
	update_blocked_averages(this_rq->cpu);
#ifdef CONFIG_NO_HZ_COMMON
	/*
	 * Skip the rebalance when NOHZ_STATS_KICK is set for this CPU;
	 * the flag is cleared in either case.
	 */
	if (!test_bit(NOHZ_STATS_KICK, nohz_flags(this_rq->cpu)))
		rebalance_domains(this_rq, idle);
	clear_bit(NOHZ_STATS_KICK, nohz_flags(this_rq->cpu));
#else
	rebalance_domains(this_rq, idle);
#endif
}

2.1.1 nohz_idle_balance

static void nohz_idle_balance(struct rq *this_rq, enum cpu_idle_type idle)

{

int this_cpu = this_rq->cpu;//获取cpu

struct rq *rq;

struct sched_domain *sd;

int balance_cpu;

/* Earliest time when we have to do rebalance again */

unsigned long next_balance = jiffies + 60*HZ;

int update_next_balance = 0;

#ifdef CONFIG_SPRD_CORE_CTL

cpumask_t cpus;

#endif

if (idle != CPU_IDLE ||

!test_bit(NOHZ_BALANCE_KICK, nohz_flags(this_cpu)))//如果cpu不是空闲，或者设置了NOHZ_BALANCE_KICK，则返回

goto end;

* This cpu is going to update the blocked load of idle CPUs either

* before doing a rebalancing or just to keep metrics up to date. we

* can safely update the next update timestamp

rcu_read_lock();//rcu读锁

sd = rcu_dereference(this_rq->sd);//获取当前this_rq的调度域

* Check whether there is a sched_domain available for this cpu.

* The last other cpu can have been unplugged since the ILB has been

* triggered and the sched_domain can now be null. The idle balance

* sequence will quickly be aborted as there is no more idle CPUs

if (sd)

nohz.next_update = jiffies + msecs_to_jiffies(LOAD_AVG_PERIOD);//计算下一次空闲cpu负载均衡的时间

rcu_read_unlock();

cpumask_andnot(&cpus, nohz.idle_cpus_mask, cpu_isolated_mask);移除隔离的cpu

for_each_cpu(balance_cpu, &cpus) {//遍历空闲cpu

if (balance_cpu == this_cpu || !idle_cpu(balance_cpu))//如果均衡cpu是当前cpu或者不是空闲的，则进行下一个循环。

continue;

* If this cpu gets work to do, stop the load balancing

* work being done for other cpus. Next load

* balancing owner will pick it up.

if (need_resched())//判断如果此cpu需要调度，则停止均衡

break;

rq = cpu_rq(balance_cpu);//获取要均衡cpu的运行队列

* If time for next balance is due,

* do the balance.

if (time_after_eq(jiffies, rq->next_balance)) {//判断均衡时间有没有到

struct rq_flags rf;

rq_lock_irq(rq, &rf);//获取运行队列锁

update_rq_clock(rq);//更新运行队列时钟

cpu_load_update_idle(rq);//更新队列负载

rq_unlock_irq(rq, &rf);//释放锁

update_blocked_averages(balance_cpu);//更新均衡cpu的阻塞平均值

* This idle load balance softirq may have been

* triggered only to update the blocked load and shares

* of idle CPUs (which we have just done for

* balance_cpu). In that case skip the actual balance.

if (!test_bit(NOHZ_STATS_KICK, nohz_flags(this_cpu)))//如果没有设置NOHZ_STATS_KICK，则进行均衡

rebalance_domains(rq, idle);//域负载均衡

}

if (time_after(next_balance, rq->next_balance)) {//更新下一次均衡时间

next_balance = rq->next_balance;

update_next_balance = 1;

}

* next_balance will be updated only when there is a need.

* When the CPU is attached to null domain for ex, it will not be

* updated.

if (likely(update_next_balance))//更新下一次均衡时间

nohz.next_balance = next_balance;

end:

clear_bit(NOHZ_BALANCE_KICK, nohz_flags(this_cpu));

}

2.2 rebalance_domains函数

/*
 * rebalance_domains - balance each sched domain of a runqueue that is due.
 * @rq: runqueue whose domains are visited (rq->cpu selects the hierarchy).
 * @idle: idle state of the CPU; may be refreshed after a successful balance.
 *
 * Visits every domain of rq->cpu, calls load_balance() on those whose
 * balance interval has expired, decays the newidle balance cost, and
 * records in rq->next_balance the earliest time a domain is due again.
 */
static void rebalance_domains(struct rq *rq, enum cpu_idle_type idle)
{
	int continue_balancing = 1;
	int cpu = rq->cpu;
	unsigned long interval;
	struct sched_domain *sd;
	/* Earliest time when we have to do rebalance again */
	unsigned long next_balance = jiffies + 60*HZ;
	int update_next_balance = 0;
	int need_serialize, need_decay = 0;
	u64 max_cost = 0;

	rcu_read_lock();
	/* Visit each sched domain that this CPU belongs to. */
	for_each_domain(cpu, sd) {
		/*
		 * Decay the newidle max times here because this is a regular
		 * visit to all the domains. Decay ~1% per second.
		 */
		if (time_after(jiffies, sd->next_decay_max_lb_cost)) {
			sd->max_newidle_lb_cost =
				(sd->max_newidle_lb_cost * 253) / 256;
			sd->next_decay_max_lb_cost = jiffies + HZ;
			need_decay = 1;
		}
		max_cost += sd->max_newidle_lb_cost;

		/*
		 * With energy-aware scheduling enabled, skip a top-level
		 * domain (no parent) that is not overutilized.
		 */
		if (energy_aware() && !sd_overutilized(sd) && !sd->parent)
			continue;

		/*
		 * Domains without SD_LOAD_BALANCE are not balanced; only
		 * refresh their group capacity when that update is due.
		 */
		if (!(sd->flags & SD_LOAD_BALANCE)) {
			if (time_after_eq(jiffies,
					  sd->groups->sgc->next_update))
				update_group_capacity(sd, cpu);
			continue;
		}

		/*
		 * Stop the load balance at this level. There is another
		 * CPU in our sched group which is doing load balancing more
		 * actively.
		 */
		if (!continue_balancing) {
			/* Keep iterating only to finish the cost decay above. */
			if (need_decay)
				continue;
			break;
		}

		interval = get_sd_balance_interval(sd, idle != CPU_IDLE);

		/* SD_SERIALIZE domains are balanced under the 'balancing' lock. */
		need_serialize = sd->flags & SD_SERIALIZE;
		if (need_serialize) {
			if (!spin_trylock(&balancing))
				goto out;
		}

		if (time_after_eq(jiffies, sd->last_balance + interval)) {
			if (load_balance(cpu, rq, sd, idle, &continue_balancing)) {
				/*
				 * The LBF_DST_PINNED logic could have changed
				 * env->dst_cpu, so we can't know our idle
				 * state even if we migrated tasks. Update it.
				 */
				idle = idle_cpu(cpu) ? CPU_IDLE : CPU_NOT_IDLE;
			}
			/* Record when this domain was last balanced. */
			sd->last_balance = jiffies;
			interval = get_sd_balance_interval(sd, idle != CPU_IDLE);
		}
		if (need_serialize)
			spin_unlock(&balancing);
out:
		/* Track the earliest time any domain needs balancing again. */
		if (time_after(next_balance, sd->last_balance + interval)) {
			next_balance = sd->last_balance + interval;
			update_next_balance = 1;
		}
	}
	if (need_decay) {
		/*
		 * Ensure the rq-wide value also decays but keep it at a
		 * reasonable floor to avoid funnies with rq->avg_idle.
		 */
		rq->max_idle_balance_cost =
			max((u64)sysctl_sched_migration_cost, max_cost);
	}
	rcu_read_unlock();

	/*
	 * next_balance will be updated only when there is a need.
	 * When the cpu is attached to null domain for ex, it will not be
	 * updated.
	 */
	if (likely(update_next_balance)) {
		rq->next_balance = next_balance;

#ifdef CONFIG_NO_HZ_COMMON
		/*
		 * If this CPU has been elected to perform the nohz idle
		 * balance. Other idle CPUs have already rebalanced with
		 * nohz_idle_balance() and nohz.next_balance has been
		 * updated accordingly. This CPU is now running the idle load
		 * balance for itself and we need to update the
		 * nohz.next_balance accordingly.
		 */
		if ((idle == CPU_IDLE) && time_after(nohz.next_balance, rq->next_balance))
			nohz.next_balance = rq->next_balance;
#endif
	}
}

2.2.1 load_balance

/*
 * load_balance - check that this_cpu is balanced within a sched domain and
 * attempt to move tasks to it if there is an imbalance.
 * @this_cpu: destination CPU.
 * @this_rq: runqueue of @this_cpu.
 * @sd: sched domain to balance within.
 * @idle: idle state of @this_cpu.
 * @continue_balancing: set to 0 when should_we_balance() decides that this
 *	CPU should not balance at this level.
 *
 * Return: the accumulated result of detach_tasks() (ld_moved); 0 on the
 * balanced and pinned exit paths.
 */
static int load_balance(int this_cpu, struct rq *this_rq,
			struct sched_domain *sd, enum cpu_idle_type idle,
			int *continue_balancing)
{
	int ld_moved, cur_ld_moved, active_balance = 0;
	struct sched_domain *sd_parent = lb_sd_parent(sd) ? sd->parent : NULL;
	struct sched_group *group;
	struct rq *busiest;
	struct rq_flags rf;
	struct cpumask *cpus = this_cpu_cpumask_var_ptr(load_balance_mask);

	/* Load-balance environment: parameters and state for this attempt. */
	struct lb_env env = {
		.sd		= sd,		/* domain being balanced */
		.dst_cpu	= this_cpu,	/* CPU that receives tasks */
		.dst_rq		= this_rq,	/* runqueue that receives tasks */
		.dst_grpmask    = sched_group_span(sd->groups), /* span of the destination group */
		.idle		= idle,		/* idle state of the destination CPU */
		.loop_break	= sched_nr_migrate_break,
		.cpus		= cpus,
		.fbq_type	= all,
		.tasks		= LIST_HEAD_INIT(env.tasks),
	};

	/* Only consider CPUs of the domain that are currently active. */
	cpumask_and(cpus, sched_domain_span(sd), cpu_active_mask);

	schedstat_inc(sd->lb_count[idle]);

redo:
	if (!should_we_balance(&env)) {
		*continue_balancing = 0;
		goto out_balanced;
	}

	/* Pick the busiest group of the domain. */
	group = find_busiest_group(&env);
	if (!group) {
		schedstat_inc(sd->lb_nobusyg[idle]);
		goto out_balanced;
	}

	/* Pick the busiest runqueue within that group. */
	busiest = find_busiest_queue(&env, group);
	if (!busiest) {
		schedstat_inc(sd->lb_nobusyq[idle]);
		goto out_balanced;
	}

	/* The busiest runqueue must never be the destination runqueue. */
	BUG_ON(busiest == env.dst_rq);

	schedstat_add(sd->lb_imbalance[idle], env.imbalance);

	/* The busiest runqueue becomes the migration source. */
	env.src_cpu = busiest->cpu;
	env.src_rq = busiest;

	ld_moved = 0;
	if (busiest->nr_running > 1) {
		/*
		 * Attempt to move tasks. If find_busiest_group has found
		 * an imbalance but busiest->nr_running <= 1, the group is
		 * still unbalanced. ld_moved simply stays zero, so it is
		 * correctly treated as an imbalance.
		 */
		env.flags |= LBF_ALL_PINNED;
		/* Upper bound on the number of loop iterations. */
		env.loop_max  = min(sysctl_sched_nr_migrate, busiest->nr_running);

more_balance:
		rq_lock_irqsave(busiest, &rf);
		update_rq_clock(busiest);

		/*
		 * cur_ld_moved - load moved in current iteration
		 * ld_moved     - cumulative load moved across iterations
		 */
		cur_ld_moved = detach_tasks(&env, &rf);

		/*
		 * We've detached some tasks from busiest_rq. Every
		 * task is masked "TASK_ON_RQ_MIGRATING", so we can safely
		 * unlock busiest->lock, and we are able to be sure
		 * that nobody can manipulate the tasks in parallel.
		 * See task_rq_lock() family for the details.
		 */

		rq_unlock(busiest, &rf);

		if (cur_ld_moved) {
			/* Enqueue the detached tasks on the destination runqueue. */
			attach_tasks(&env);
			ld_moved += cur_ld_moved;
		}

		/* Restore the interrupt state saved by rq_lock_irqsave(). */
		local_irq_restore(rf.flags);

		if (env.flags & LBF_NEED_BREAK) {
			env.flags &= ~LBF_NEED_BREAK;
			goto more_balance;
		}

		/*
		 * Revisit (affine) tasks on src_cpu that couldn't be moved to
		 * us and move them to an alternate dst_cpu in our sched_group
		 * where they can run. The upper limit on how many times we
		 * iterate on same src_cpu is dependent on number of cpus in our
		 * sched_group.
		 *
		 * This changes load balance semantics a bit on who can move
		 * load to a given_cpu. In addition to the given_cpu itself
		 * (or a ilb_cpu acting on its behalf where given_cpu is
		 * nohz-idle), we now have balance_cpu in a position to move
		 * load to given_cpu. In rare situations, this may cause
		 * conflicts (balance_cpu and given_cpu/ilb_cpu deciding
		 * independently and at same time to move some load to
		 * given_cpu) causing excess load to be moved to given_cpu.
		 * This however should not happen so much in practice and
		 * moreover subsequent load balance cycles should correct the
		 * excess load moved.
		 */
		if ((env.flags & LBF_DST_PINNED) && env.imbalance > 0) {
			/*
			 * Still imbalanced, and affinity kept some tasks off
			 * dst_cpu: retry from the same source runqueue with
			 * the alternate destination (new_dst_cpu) and the
			 * loop counter reset.
			 */

			/* Prevent to re-select dst_cpu via env's cpus */
			cpumask_clear_cpu(env.dst_cpu, env.cpus);

			env.dst_rq	 = cpu_rq(env.new_dst_cpu);
			env.dst_cpu	 = env.new_dst_cpu;
			env.flags	&= ~LBF_DST_PINNED;
			env.loop	 = 0;
			env.loop_break	 = sched_nr_migrate_break;

			/*
			 * Go back to "more_balance" rather than "redo" since we
			 * need to continue with same src_cpu.
			 */
			goto more_balance;
		}

		/*
		 * We failed to reach balance because of affinity.
		 */
		if (sd_parent) {
			int *group_imbalance = &sd_parent->groups->sgc->imbalance;

			/* Flag the imbalance in the parent domain's group. */
			if ((env.flags & LBF_SOME_PINNED) && env.imbalance > 0)
				*group_imbalance = 1;
		}

		/* All tasks on this runqueue were pinned by CPU affinity */
		if (unlikely(env.flags & LBF_ALL_PINNED)) {
			/* Exclude the busiest CPU from the candidate mask. */
			cpumask_clear_cpu(cpu_of(busiest), cpus);
			/*
			 * Attempting to continue load balancing at the current
			 * sched_domain level only makes sense if there are
			 * active CPUs remaining as possible busiest CPUs to
			 * pull load from which are not contained within the
			 * destination group that is receiving any migrated
			 * load.
			 */
			if (!cpumask_subset(cpus, env.dst_grpmask)) {
				/* Search for a new source CPU from scratch. */
				env.loop = 0;
				env.loop_break = sched_nr_migrate_break;
				goto redo;
			}
			goto out_all_pinned;
		}
	}

	if (!ld_moved) {
		/* Nothing was migrated above. */
		schedstat_inc(sd->lb_failed[idle]);
		/*
		 * Increment the failure counter only on periodic balance.
		 * We do not want newidle balance, which can be very
		 * frequent, pollute the failure counter causing
		 * excessive cache_hot migrations and active balances.
		 */
		if (idle != CPU_NEWLY_IDLE)
			if (env.src_grp_nr_running > 1)
				sd->nr_balance_failed++;

		/*
		 * Decide whether to start an active balance, i.e. push the
		 * task currently running on the busiest CPU to this_cpu
		 * because no runnable task could be migrated.
		 */
		if (need_active_balance(&env)) {
			unsigned long flags;

			raw_spin_lock_irqsave(&busiest->lock, flags);

			/*
			 * don't kick the active_load_balance_cpu_stop,
			 * if the curr task on busiest cpu can't be
			 * moved to this_cpu
			 */
			if (!cpumask_test_cpu(this_cpu, &busiest->curr->cpus_allowed)) {
				raw_spin_unlock_irqrestore(&busiest->lock,
							    flags);
				env.flags |= LBF_ALL_PINNED;
				goto out_one_pinned;
			}

			/*
			 * ->active_balance synchronizes accesses to
			 * ->active_balance_work.  Once set, it's cleared
			 * only after active load balance is finished.
			 */
#ifdef CONFIG_SPRD_CORE_CTL
			if (!busiest->active_balance &&
			    !cpu_isolated(cpu_of(busiest))) {
#else
			if (!busiest->active_balance) {
#endif
				/* Mark the busiest runqueue as being actively balanced. */
				busiest->active_balance = 1;
				busiest->push_cpu = this_cpu;
				active_balance = 1;
			}
			raw_spin_unlock_irqrestore(&busiest->lock, flags);

			/* Queue the stopper work that performs the migration on the busiest CPU. */
			if (active_balance) {
				stop_one_cpu_nowait(cpu_of(busiest),
					active_load_balance_cpu_stop, busiest,
					&busiest->active_balance_work);
			}

			/* We've kicked active balancing, force task migration. */
			sd->nr_balance_failed = sd->cache_nice_tries+1;
		}
	} else
		sd->nr_balance_failed = 0;	/* at least one task was migrated */

	if (likely(!active_balance)) {
		/* We were unbalanced, so reset the balancing interval */
		sd->balance_interval = sd->min_interval;
	} else {
		/*
		 * If we've begun active balancing, start to back off. This
		 * case may not be covered by the all_pinned logic if there
		 * is only 1 task on the busy runqueue (because we don't call
		 * detach_tasks).
		 */
		if (sd->balance_interval < sd->max_interval)
			sd->balance_interval *= 2;
	}

	goto out;

out_balanced:
	/*
	 * We reach balance although we may have faced some affinity
	 * constraints. Clear the imbalance flag if it was set.
	 */
	if (sd_parent) {
		int *group_imbalance = &sd_parent->groups->sgc->imbalance;

		if (*group_imbalance)
			*group_imbalance = 0;
	}

out_all_pinned:
	/*
	 * We reach balance because all tasks are pinned at this level so
	 * we can't migrate them. Let the imbalance flag set so parent level
	 * can try to migrate them.
	 */
	schedstat_inc(sd->lb_balanced[idle]);

	sd->nr_balance_failed = 0;

out_one_pinned:
	/* Reached when the task running on the busiest CPU is pinned away from this_cpu. */
	ld_moved = 0;

	/*
	 * idle_balance() disregards balance intervals, so we could repeatedly
	 * reach this code, which would lead to balance_interval skyrocketting
	 * in a short amount of time. Skip the balance_interval increase logic
	 * to avoid that.
	 */
	if (env.idle == CPU_NEWLY_IDLE)
		goto out;

	/* tune up the balancing interval */
	if (((env.flags & LBF_ALL_PINNED) &&
			sd->balance_interval < MAX_PINNED_INTERVAL) ||
			(sd->balance_interval < sd->max_interval))
		sd->balance_interval *= 2;
out:
	return ld_moved;
}