Load Balancing Flow

1. Load-balancing flow chart

2. The load-balance trigger function trigger_load_balance

void trigger_load_balance(struct rq *rq)
{
    /* Don't need to rebalance while attached to NULL domain */
    if (unlikely(on_null_domain(rq)))    // nothing to do if this runqueue's sched domain is NULL
        return;

    if (time_after_eq(jiffies, rq->next_balance))    // has the next balance time arrived?
        raise_softirq(SCHED_SOFTIRQ);    // raise the softirq; the handler is registered in init_sched_fair_class() via open_softirq(SCHED_SOFTIRQ, run_rebalance_domains)

#ifdef CONFIG_NO_HZ_COMMON
    if (nohz_kick_needed(rq, false))
        nohz_balancer_kick(false);
#endif
}
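The softirq raised here is wired up at boot: as the inline note says, init_sched_fair_class() registers run_rebalance_domains() as the SCHED_SOFTIRQ handler. A minimal sketch of that registration, trimmed to the relevant call (the rest of the function is elided):

__init void init_sched_fair_class(void)
{
#ifdef CONFIG_SMP
    open_softirq(SCHED_SOFTIRQ, run_rebalance_domains);
    /* ... nohz idle-balance initialisation elided ... */
#endif /* SMP */
}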

2.1 run_rebalance_domains

static __latent_entropy void run_rebalance_domains(struct softirq_action *h)
{
    struct rq *this_rq = this_rq();    // current CPU's runqueue
    enum cpu_idle_type idle = this_rq->idle_balance ?
                        CPU_IDLE : CPU_NOT_IDLE;    // is this runqueue idle or not?

    /*
     * If this cpu has a pending nohz_balance_kick, then do the
     * balancing on behalf of the other idle cpus whose ticks are
     * stopped. Do nohz_idle_balance *before* rebalance_domains to
     * give the idle cpus a chance to load balance. Else we may
     * load balance only within the local sched_domain hierarchy
     * and abort nohz_idle_balance altogether if we pull some load.
     */
    nohz_idle_balance(this_rq, idle);    // balance on behalf of the nohz-idle CPUs first
    update_blocked_averages(this_rq->cpu);    // update this CPU's blocked load averages
#ifdef CONFIG_NO_HZ_COMMON
    if (!test_bit(NOHZ_STATS_KICK, nohz_flags(this_rq->cpu)))    // if NOHZ_STATS_KICK is set the kick was stats-only, so skip the rebalance; otherwise rebalance
        rebalance_domains(this_rq, idle);
    clear_bit(NOHZ_STATS_KICK, nohz_flags(this_rq->cpu));
#else
    rebalance_domains(this_rq, idle);
#endif
}
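Both the trigger path and the functions below gate their work on jiffies comparisons such as time_after_eq(jiffies, rq->next_balance). These macros from include/linux/jiffies.h do a wrap-safe signed comparison, so they stay correct when jiffies overflows; stripped of their type checking they reduce to:

/* simplified: the real macros also type-check their arguments */
#define time_after(a, b)     ((long)((b) - (a)) < 0)   /* true if a is strictly later than b */
#define time_after_eq(a, b)  ((long)((a) - (b)) >= 0)  /* true if a is at or after b */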

2.1.1 nohz_idle_balance

static void nohz_idle_balance(struct rq *this_rq, enum cpu_idle_type idle)
{
    int this_cpu = this_rq->cpu;    // this CPU
    struct rq *rq;
    struct sched_domain *sd;
    int balance_cpu;
    /* Earliest time when we have to do rebalance again */
    unsigned long next_balance = jiffies + 60*HZ;
    int update_next_balance = 0;
#ifdef CONFIG_SPRD_CORE_CTL    // vendor (SPRD core control) hook
    cpumask_t cpus;
#endif

    if (idle != CPU_IDLE ||
        !test_bit(NOHZ_BALANCE_KICK, nohz_flags(this_cpu)))    // bail out if this CPU is not idle or no NOHZ_BALANCE_KICK was set for it
        goto end;

    /*
     * This cpu is going to update the blocked load of idle CPUs either
     * before doing a rebalancing or just to keep metrics up to date. we
     * can safely update the next update timestamp
     */
    rcu_read_lock();    // RCU read-side lock
    sd = rcu_dereference(this_rq->sd);    // this_rq's sched domain
    /*
     * Check whether there is a sched_domain available for this cpu.
     * The last other cpu can have been unplugged since the ILB has been
     * triggered and the sched_domain can now be null. The idle balance
     * sequence will quickly be aborted as there is no more idle CPUs
     */
    if (sd)
        nohz.next_update = jiffies + msecs_to_jiffies(LOAD_AVG_PERIOD);    // next time the idle CPUs' load statistics need refreshing
    rcu_read_unlock();

    cpumask_andnot(&cpus, nohz.idle_cpus_mask, cpu_isolated_mask);    // drop isolated CPUs from the candidate mask

    for_each_cpu(balance_cpu, &cpus) {    // walk the nohz-idle CPUs
        if (balance_cpu == this_cpu || !idle_cpu(balance_cpu))    // skip ourselves and CPUs that are no longer idle
            continue;

        /*
         * If this cpu gets work to do, stop the load balancing
         * work being done for other cpus. Next load
         * balancing owner will pick it up.
         */
        if (need_resched())    // stop balancing if this CPU itself has work to do
            break;

        rq = cpu_rq(balance_cpu);    // runqueue of the CPU being balanced

        /*
         * If time for next balance is due,
         * do the balance.
         */
        if (time_after_eq(jiffies, rq->next_balance)) {    // is that runqueue's balance due?
            struct rq_flags rf;

            rq_lock_irq(rq, &rf);    // take the runqueue lock
            update_rq_clock(rq);    // update the runqueue clock
            cpu_load_update_idle(rq);    // update the idle CPU's load
            rq_unlock_irq(rq, &rf);    // release the lock

            update_blocked_averages(balance_cpu);    // update balance_cpu's blocked load averages
            /*
             * This idle load balance softirq may have been
             * triggered only to update the blocked load and shares
             * of idle CPUs (which we have just done for
             * balance_cpu). In that case skip the actual balance.
             */
            if (!test_bit(NOHZ_STATS_KICK, nohz_flags(this_cpu)))    // if this was not a stats-only kick, do the real balance
                rebalance_domains(rq, idle);    // per-domain load balancing
        }

        if (time_after(next_balance, rq->next_balance)) {    // keep track of the earliest next balance time
            next_balance = rq->next_balance;
            update_next_balance = 1;
        }
    }

    /*
     * next_balance will be updated only when there is a need.
     * When the CPU is attached to null domain for ex, it will not be
     * updated.
     */
    if (likely(update_next_balance))    // publish the earliest next balance time
        nohz.next_balance = next_balance;
end:
    clear_bit(NOHZ_BALANCE_KICK, nohz_flags(this_cpu));
}
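The next_balance bookkeeping in the loop above simply keeps the earliest rq->next_balance seen among the idle CPUs, starting from a 60-second upper bound, and publishes it to nohz.next_balance only if something earlier was found. A self-contained toy sketch of the same pattern (plain userspace C with illustrative values, not kernel code):

#include <stdio.h>

int main(void)
{
    unsigned long now = 1000;                      /* stand-in for jiffies */
    unsigned long next_balance = now + 60 * 100;   /* stand-in for jiffies + 60*HZ */
    unsigned long rq_next_balance[] = { 1250, 1040, 1520 };  /* per-CPU deadlines */
    int update_next_balance = 0;

    for (int i = 0; i < 3; i++) {
        if (rq_next_balance[i] < next_balance) {   /* same test as time_after(next_balance, rq->next_balance) */
            next_balance = rq_next_balance[i];
            update_next_balance = 1;
        }
    }
    if (update_next_balance)
        printf("nohz.next_balance = %lu\n", next_balance);  /* prints 1040, the earliest deadline */
    return 0;
}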

2.2 The rebalance_domains function

static void rebalance_domains(struct rq *rq, enum cpu_idle_type idle)
{
    int continue_balancing = 1;
    int cpu = rq->cpu;
    unsigned long interval;
    struct sched_domain *sd;
    /* Earliest time when we have to do rebalance again */
    unsigned long next_balance = jiffies + 60*HZ;
    int update_next_balance = 0;
    int need_serialize, need_decay = 0;
    u64 max_cost = 0;

    rcu_read_lock();
    for_each_domain(cpu, sd) {    // walk this CPU's sched domains from bottom to top
        /*
         * Decay the newidle max times here because this is a regular
         * visit to all the domains. Decay ~1% per second.
         */
        if (time_after(jiffies, sd->next_decay_max_lb_cost)) {    // has the decay period expired?
            sd->max_newidle_lb_cost =
                (sd->max_newidle_lb_cost * 253) / 256;    // decay by roughly 1% (253/256)
            sd->next_decay_max_lb_cost = jiffies + HZ;    // next decay one second later
            need_decay = 1;
        }
        max_cost += sd->max_newidle_lb_cost;

        if (energy_aware() && !sd_overutilized(sd) && !sd->parent)    // with EAS enabled, skip the top-level domain (no parent) while it is not overutilized
            continue;

        if (!(sd->flags & SD_LOAD_BALANCE)) {    // this domain does not participate in load balancing
            if (time_after_eq(jiffies,
                      sd->groups->sgc->next_update))
                update_group_capacity(sd, cpu);    // still refresh the group capacity periodically
            continue;
        }

        /*
         * Stop the load balance at this level. There is another
         * CPU in our sched group which is doing load balancing more
         * actively.
         */
        if (!continue_balancing) {    // another CPU in the group owns balancing at this level
            if (need_decay)
                continue;
            break;
        }

        interval = get_sd_balance_interval(sd, idle != CPU_IDLE);    // balance interval of this domain

        need_serialize = sd->flags & SD_SERIALIZE;    // does this domain require serialized balancing?
        if (need_serialize) {
            if (!spin_trylock(&balancing))    // try to take the global serialization lock
                goto out;
        }

        if (time_after_eq(jiffies, sd->last_balance + interval)) {    // is this domain's balance due?
            if (load_balance(cpu, rq, sd, idle, &continue_balancing)) {    // do the balance
                /*
                 * The LBF_DST_PINNED logic could have changed
                 * env->dst_cpu, so we can't know our idle
                 * state even if we migrated tasks. Update it.
                 */
                idle = idle_cpu(cpu) ? CPU_IDLE : CPU_NOT_IDLE;    // re-read this CPU's idle state
            }
            sd->last_balance = jiffies;    // record when this domain was last balanced
            interval = get_sd_balance_interval(sd, idle != CPU_IDLE);    // re-read the balance interval
        }
        if (need_serialize)
            spin_unlock(&balancing);    // release the serialization lock
out:
        if (time_after(next_balance, sd->last_balance + interval)) {    // keep track of the earliest next balance time
            next_balance = sd->last_balance + interval;
            update_next_balance = 1;
        }
    }
    if (need_decay) {    // propagate the decayed cost to the runqueue
        /*
         * Ensure the rq-wide value also decays but keep it at a
         * reasonable floor to avoid funnies with rq->avg_idle.
         */
        rq->max_idle_balance_cost =
            max((u64)sysctl_sched_migration_cost, max_cost);
    }
    rcu_read_unlock();

    /*
     * next_balance will be updated only when there is a need.
     * When the cpu is attached to null domain for ex, it will not be
     * updated.
     */
    if (likely(update_next_balance)) {
        rq->next_balance = next_balance;    // update this runqueue's next balance time

#ifdef CONFIG_NO_HZ_COMMON
        /*
         * If this CPU has been elected to perform the nohz idle
         * balance. Other idle CPUs have already rebalanced with
         * nohz_idle_balance() and nohz.next_balance has been
         * updated accordingly. This CPU is now running the idle load
         * balance for itself and we need to update the
         * nohz.next_balance accordingly.
         */
        if ((idle == CPU_IDLE) && time_after(nohz.next_balance, rq->next_balance))    // if this CPU is idle and its next balance is due before nohz.next_balance
            nohz.next_balance = rq->next_balance;    // pull nohz.next_balance forward
#endif
    }
}
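get_sd_balance_interval(), called twice above, decides how often a domain is rebalanced: a busy CPU stretches the domain's balance_interval by busy_factor, and the result is converted to jiffies and clamped. A sketch of that behaviour, assuming the fair.c implementation of this kernel generation (details may differ slightly between versions):

static inline unsigned long
get_sd_balance_interval(struct sched_domain *sd, int cpu_busy)
{
    unsigned long interval = sd->balance_interval;    /* in ms */

    if (cpu_busy)
        interval *= sd->busy_factor;    /* balance less often when busy */

    /* convert ms to jiffies and keep the result within a sane range */
    interval = msecs_to_jiffies(interval);
    interval = clamp(interval, 1UL, max_load_balance_interval);

    return interval;
}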

2.2.1 load_balance

static int load_balance(int this_cpu, struct rq *this_rq,
            struct sched_domain *sd, enum cpu_idle_type idle,
            int *continue_balancing)
{
    int ld_moved, cur_ld_moved, active_balance = 0;
    struct sched_domain *sd_parent = lb_sd_parent(sd) ? sd->parent : NULL;
    struct sched_group *group;
    struct rq *busiest;
    struct rq_flags rf;
    struct cpumask *cpus = this_cpu_cpumask_var_ptr(load_balance_mask);

    struct lb_env env = {    // the balancing "environment": parameters and state shared by this balance pass
        .sd          = sd,                           // sched domain being balanced
        .dst_cpu     = this_cpu,                     // CPU that pulls the load
        .dst_rq      = this_rq,                      // runqueue that pulls the load
        .dst_grpmask = sched_group_span(sd->groups), // CPUs of the destination group
        .idle        = idle,                         // idle state of the destination CPU
        .loop_break  = sched_nr_migrate_break,       // how many tasks to scan before taking a break
        .cpus        = cpus,
        .fbq_type    = all,
        .tasks       = LIST_HEAD_INIT(env.tasks),
    };

    cpumask_and(cpus, sched_domain_span(sd), cpu_active_mask);    // only consider active CPUs of this domain

    schedstat_inc(sd->lb_count[idle]);    // count this balance attempt for the current idle type

redo:
    if (!should_we_balance(&env)) {    // is this CPU the one that should balance at this level?
        *continue_balancing = 0;
        goto out_balanced;
    }

    group = find_busiest_group(&env);    // find the busiest group in the domain
    if (!group) {
        schedstat_inc(sd->lb_nobusyg[idle]);
        goto out_balanced;
    }

    busiest = find_busiest_queue(&env, group);    // find the busiest runqueue in that group
    if (!busiest) {
        schedstat_inc(sd->lb_nobusyq[idle]);
        goto out_balanced;
    }

    BUG_ON(busiest == env.dst_rq);    // the busiest runqueue must never be the destination runqueue

    schedstat_add(sd->lb_imbalance[idle], env.imbalance);    // account the measured imbalance for this idle type

    env.src_cpu = busiest->cpu;    // pull from the busiest runqueue's CPU
    env.src_rq = busiest;          // pull from the busiest runqueue

    ld_moved = 0;

    if (busiest->nr_running > 1) {    // only worth detaching tasks if the busiest runqueue has more than one
        /*
         * Attempt to move tasks. If find_busiest_group has found
         * an imbalance but busiest->nr_running <= 1, the group is
         * still unbalanced. ld_moved simply stays zero, so it is
         * correctly treated as an imbalance.
         */
        env.flags |= LBF_ALL_PINNED;
        env.loop_max = min(sysctl_sched_nr_migrate, busiest->nr_running);    // upper bound on tasks scanned per pass

more_balance:
        rq_lock_irqsave(busiest, &rf);    // lock the busiest runqueue
        update_rq_clock(busiest);    // update its clock

        /*
         * cur_ld_moved - load moved in current iteration
         * ld_moved - cumulative load moved across iterations
         */
        cur_ld_moved = detach_tasks(&env, &rf);    // detach migratable tasks from the source runqueue and return how many were detached

        /*
         * We've detached some tasks from busiest_rq. Every
         * task is masked "TASK_ON_RQ_MIGRATING", so we can safely
         * unlock busiest->lock, and we are able to be sure
         * that nobody can manipulate the tasks in parallel.
         * See task_rq_lock() family for the details.
         */
        rq_unlock(busiest, &rf);    // release the lock

        if (cur_ld_moved) {
            attach_tasks(&env);    // attach the detached tasks to the destination runqueue
            ld_moved += cur_ld_moved;
        }

        local_irq_restore(rf.flags);    // restore the local interrupt state

        if (env.flags & LBF_NEED_BREAK) {    // detach_tasks asked for a break; resume scanning the same source
            env.flags &= ~LBF_NEED_BREAK;
            goto more_balance;
        }

        /*
         * Revisit (affine) tasks on src_cpu that couldn't be moved to
         * us and move them to an alternate dst_cpu in our sched_group
         * where they can run. The upper limit on how many times we
         * iterate on same src_cpu is dependent on number of cpus in our
         * sched_group.
         *
         * This changes load balance semantics a bit on who can move
         * load to a given_cpu. In addition to the given_cpu itself
         * (or a ilb_cpu acting on its behalf where given_cpu is
         * nohz-idle), we now have balance_cpu in a position to move
         * load to given_cpu. In rare situations, this may cause
         * conflicts (balance_cpu and given_cpu/ilb_cpu deciding
         * independently and at same time to move some load to
         * given_cpu) causing excess load to be moved to given_cpu.
         * This however should not happen so much in practice and
         * moreover subsequent load balance cycles should correct the
         * excess load moved.
         */
        if ((env.flags & LBF_DST_PINNED) && env.imbalance > 0) {    // the domain is still unbalanced and some tasks could not move to dst_cpu because of affinity: retry against the alternate destination CPU with the scan loop reset
            /* Prevent to re-select dst_cpu via env's cpus */
            cpumask_clear_cpu(env.dst_cpu, env.cpus);

            env.dst_rq   = cpu_rq(env.new_dst_cpu);    // alternate destination runqueue
            env.dst_cpu  = env.new_dst_cpu;
            env.flags   &= ~LBF_DST_PINNED;
            env.loop     = 0;
            env.loop_break = sched_nr_migrate_break;

            /*
             * Go back to "more_balance" rather than "redo" since we
             * need to continue with same src_cpu.
             */
            goto more_balance;
        }

        /*
         * We failed to reach balance because of affinity.
         */
        if (sd_parent) {    // there is a parent domain
            int *group_imbalance = &sd_parent->groups->sgc->imbalance;

            if ((env.flags & LBF_SOME_PINNED) && env.imbalance > 0)    // some tasks were pinned away from the destination CPU: let the parent level know about the imbalance
                *group_imbalance = 1;
        }

        /* All tasks on this runqueue were pinned by CPU affinity */
        if (unlikely(env.flags & LBF_ALL_PINNED)) {    // every task on the busiest runqueue is pinned and cannot be migrated
            cpumask_clear_cpu(cpu_of(busiest), cpus);    // drop the busiest CPU from the candidate mask so the next round ignores it
            /*
             * Attempting to continue load balancing at the current
             * sched_domain level only makes sense if there are
             * active CPUs remaining as possible busiest CPUs to
             * pull load from which are not contained within the
             * destination group that is receiving any migrated
             * load.
             */
            if (!cpumask_subset(cpus, env.dst_grpmask)) {    // other candidate source CPUs remain, so restart the whole pass and look for a new busiest CPU
                env.loop = 0;
                env.loop_break = sched_nr_migrate_break;
                goto redo;
            }
            goto out_all_pinned;
        }
    }

    if (!ld_moved) {    // no task could be migrated above
        schedstat_inc(sd->lb_failed[idle]);    // count the failed balance
        /*
         * Increment the failure counter only on periodic balance.
         * We do not want newidle balance, which can be very
         * frequent, pollute the failure counter causing
         * excessive cache_hot migrations and active balances.
         */
        if (idle != CPU_NEWLY_IDLE)    // only periodic balancing contributes to the failure counter
            if (env.src_grp_nr_running > 1)    // and only if the source group had more than one running task to give
                sd->nr_balance_failed++;

        if (need_active_balance(&env)) {    // decide whether to start an active balance: migrating the task currently *running* on the busiest CPU, since no runnable task could be pulled
            unsigned long flags;

            raw_spin_lock_irqsave(&busiest->lock, flags);

            /* don't kick the active_load_balance_cpu_stop,
             * if the curr task on busiest cpu can't be
             * moved to this_cpu
             */
            if (!cpumask_test_cpu(this_cpu, &busiest->curr->cpus_allowed)) {    // if the busiest CPU's current task is not allowed on this_cpu, give up and go to out_one_pinned
                raw_spin_unlock_irqrestore(&busiest->lock,
                                flags);
                env.flags |= LBF_ALL_PINNED;
                goto out_one_pinned;
            }

            /*
             * ->active_balance synchronizes accesses to
             * ->active_balance_work. Once set, it's cleared
             * only after active load balance is finished.
             */
#ifdef CONFIG_SPRD_CORE_CTL
            if (!busiest->active_balance &&
                !cpu_isolated(cpu_of(busiest))) {
#else
            if (!busiest->active_balance) {    // mark the busiest runqueue as having an active balance in flight
#endif
                busiest->active_balance = 1;
                busiest->push_cpu = this_cpu;
                active_balance = 1;
            }
            raw_spin_unlock_irqrestore(&busiest->lock, flags);

            if (active_balance) {    // stop the busiest CPU and let the stopper push its running task over to this CPU
                stop_one_cpu_nowait(cpu_of(busiest),
                    active_load_balance_cpu_stop, busiest,
                    &busiest->active_balance_work);
            }

            /* We've kicked active balancing, force task migration. */
            sd->nr_balance_failed = sd->cache_nice_tries+1;
        }
    } else
        sd->nr_balance_failed = 0;    // at least one task was migrated

    if (likely(!active_balance)) {
        /* We were unbalanced, so reset the balancing interval */
        sd->balance_interval = sd->min_interval;    // balancing worked; go back to the minimum interval
    } else {
        /*
         * If we've begun active balancing, start to back off. This
         * case may not be covered by the all_pinned logic if there
         * is only 1 task on the busy runqueue (because we don't call
         * detach_tasks).
         */
        if (sd->balance_interval < sd->max_interval)
            sd->balance_interval *= 2;
    }

    goto out;

out_balanced:
    /*
     * We reach balance although we may have faced some affinity
     * constraints. Clear the imbalance flag if it was set.
     */
    if (sd_parent) {
        int *group_imbalance = &sd_parent->groups->sgc->imbalance;

        if (*group_imbalance)
            *group_imbalance = 0;
    }

out_all_pinned:    // every task on the busiest runqueue was pinned by affinity
    /*
     * We reach balance because all tasks are pinned at this level so
     * we can't migrate them. Let the imbalance flag set so parent level
     * can try to migrate them.
     */
    schedstat_inc(sd->lb_balanced[idle]);

    sd->nr_balance_failed = 0;

out_one_pinned:    // the single candidate (the running task) was pinned by affinity
    ld_moved = 0;

    /*
     * idle_balance() disregards balance intervals, so we could repeatedly
     * reach this code, which would lead to balance_interval skyrocketing
     * in a short amount of time. Skip the balance_interval increase logic
     * to avoid that.
     */
    if (env.idle == CPU_NEWLY_IDLE)
        goto out;

    /* tune up the balancing interval */
    if (((env.flags & LBF_ALL_PINNED) &&
         sd->balance_interval < MAX_PINNED_INTERVAL) ||
        (sd->balance_interval < sd->max_interval))
        sd->balance_interval *= 2;

out:
    return ld_moved;
}
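should_we_balance(), checked right after the redo: label, is what keeps every CPU of a group from balancing the same domain at once: a newly-idle CPU may always balance; otherwise only the first idle CPU of the group (or, failing that, the group's designated balance CPU) is allowed to proceed, and every other CPU clears continue_balancing. A hedged sketch of that logic; helper names such as group_balance_mask()/group_balance_cpu() follow kernels of this generation and may differ elsewhere:

static int should_we_balance(struct lb_env *env)
{
    struct sched_group *sg = env->sd->groups;
    int cpu, balance_cpu = -1;

    /* A newly idle CPU is always allowed to pull. */
    if (env->idle == CPU_NEWLY_IDLE)
        return 1;

    /* Prefer the first idle CPU of the group... */
    for_each_cpu_and(cpu, group_balance_mask(sg), env->cpus) {
        if (!idle_cpu(cpu))
            continue;
        balance_cpu = cpu;
        break;
    }

    /* ...otherwise fall back to the group's designated balance CPU. */
    if (balance_cpu == -1)
        balance_cpu = group_balance_cpu(sg);

    /* Only that CPU may balance at this domain level (and above). */
    return balance_cpu == env->dst_cpu;
}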
