Linux RT带宽控制

所谓带宽控制，指的是限制任务长时间占用CPU的能力。Deadline调度器、RT调度器、CFS调度器均支持这种能力。这篇笔记记录了RT调度器的带宽控制相关实现。

RT调度器的带宽控制核心思想是：保证CPU运行队列上的任务在每个检测周期内占用的CPU时长不超过限定时长，检测周期和限定时长正是RT调度器带宽控制的两个可配置参数。带宽控制可以在CPU级别（即CPU运行队列上）实现；支持组调度后，带宽控制还可以扩展到分组级别（即任务组的运行队列上）。

可配置参数

带宽控制的可配置参数也分为CPU和任务组两个级别。

CPU级别配置参数

在CPU级别上，由下面两个全局变量表示检测周期和限定时长两个带宽控制参数，单位均为微秒。默认的周期为1s，RT任务可运行时长为0.95s，即最多可以占用95%的CPU。这两个参数对应的用户态配置节点为 **/proc/sys/kernel/sched_rt_period_us** 和 **/proc/sys/kernel/sched_rt_runtime_us**。

cpp 复制代码

/*
 * Length of the accounting window, in microseconds, over which the CPU
 * time consumed by RT tasks is measured.
 * Default: one second.
 */
unsigned int sysctl_sched_rt_period = 1000 * 1000;
/*
 * Portion of each accounting window, in microseconds, during which RT
 * tasks are allowed to run.
 * Default: 0.95 seconds, i.e. 95% of the default window.
 */
int sysctl_sched_rt_runtime = 950 * 1000;

RT调度器就是通过判断sysctl_sched_rt_runtime变量来确定系统是否开启带宽控制的：该值大于等于0表示开启，为负数（例如向sched_rt_runtime_us写入-1）表示关闭。

cpp 复制代码

/*
 * Report whether RT bandwidth control is switched on.
 *
 * Returns 1 when sysctl_sched_rt_runtime is zero or positive, and 0 when
 * it is negative.
 */
static inline int rt_bandwidth_enabled(void)
{
    if (sysctl_sched_rt_runtime < 0)
        return 0;

    return 1;
}

任务组级别配置参数

支持组调度后，用户态的每个控制组的目录下增加了rt_period_us 和rt_runtime_us，分别表示该任务组的检测周期和运行时长。

cpp 复制代码

/*
 * Control files exposed in each control group directory (excerpt;
 * unrelated entries are elided with "...").
 */
static struct cftype cpu_files[] = {
...

#ifdef CONFIG_RT_GROUP_SCHED
    /* Group RT runtime limit; served by the signed 64-bit handlers. */
    {
        .name = "rt_runtime_us",
        .read_s64 = cpu_rt_runtime_read,
        .write_s64 = cpu_rt_runtime_write,
    },
    /* Group RT accounting period; served by the unsigned 64-bit handlers. */
    {
        .name = "rt_period_us",
        .read_u64 = cpu_rt_period_read_uint,
        .write_u64 = cpu_rt_period_write_uint,
    },
#endif
    { }    /* empty entry terminates the array */
};

数据结构

RT带宽控制: rt_bandwidth

调度过程中，RT调度器设计了rt_bandwidth结构来保存带宽控制信息。对于任务组，在task_group中包含了该结构；对于CPU级别的控制，则设计了一个全局变量def_rt_bandwidth。

cpp 复制代码

/*
 * Bandwidth-control state: the accounting period, the runtime limit and
 * the timer that lifts throttling.
 */
struct rt_bandwidth {
    /* Held around starting rt_period_timer in start_rt_bandwidth(). */
    raw_spinlock_t rt_runtime_lock;
    /* Accounting period, in nanoseconds. */
    ktime_t rt_period;
    /*
     * Runtime limit per period, in nanoseconds.
     * NOTE(review): the original note said a value of 0 means "no limit".
     * The code in this article tests for "no limit" by comparing against
     * RUNTIME_INF, which mainline defines as ((u64)~0ULL), not 0 -
     * confirm against the kernel version being described.
     */
    u64 rt_runtime;
    struct hrtimer rt_period_timer; /* timer used to lift throttling */
};

/* Task group (excerpt; other members are elided with "..."). */
struct task_group {
...
#ifdef CONFIG_RT_GROUP_SCHED

    /* RT bandwidth settings and period timer belonging to this group. */
    struct rt_bandwidth rt_bandwidth;
#endif
}

/* Default RT bandwidth used for CPU-level control; initialised in sched_init(). */
struct rt_bandwidth def_rt_bandwidth;

RT运行队列: rt_rq

带宽控制在运行队列中设计了如下字段。rt_time 累计了该运行队列上任务占用的CPU时长。rt_runtime 就是该运行队列的带宽控制限定时长，来自默认rt_bandwidth.rt_runtime或者任务组的rt_bandwidth.rt_runtime。rt_throttled是一个标记，非0表示该运行队列的任务运行已经超过了带宽限制，进入了限流状态，此时这些任务不会被CPU调度运行。

cpp 复制代码

/* RT run queue (excerpt; only the bandwidth-control members are shown). */
struct rt_rq {
...
    /* Non-zero while this queue is throttled for exceeding its bandwidth. */
    int rt_throttled;
    /* CPU time consumed by this queue's tasks; accumulated in update_curr_rt(). */
    u64 rt_time;
    /* Runtime limit that applies to this queue. */
    u64 rt_runtime;
    /* Nests inside the rq lock: */
    raw_spinlock_t rt_runtime_lock;
};

开机初始化

在**sched_init()**中，有如下和带宽控制相关的初始化逻辑。

cpp 复制代码

/* Scheduler boot-time setup (excerpt; only the RT bandwidth parts are shown). */
void __init sched_init(void)
{
...
    /*
     * Seed the default rt_bandwidth and, with RT group scheduling, the
     * root task group's rt_bandwidth from the global (CPU-level) period
     * and runtime settings.
     */
    init_rt_bandwidth(&def_rt_bandwidth,
        global_rt_period(), global_rt_runtime());


#ifdef CONFIG_RT_GROUP_SCHED
    init_rt_bandwidth(&root_task_group.rt_bandwidth,
        global_rt_period(), global_rt_runtime());
#endif /* CONFIG_RT_GROUP_SCHED */

    for_each_possible_cpu(i) {
        struct rq *rq = cpu_rq(i);
        /* Give every CPU run queue the default RT runtime limit. */
        rq->rt.rt_runtime = def_rt_bandwidth.rt_runtime;
    } 
}

/*
 * Prepare an rt_bandwidth for use.
 *
 * @rt_b:    structure to initialise
 * @period:  accounting period in nanoseconds (converted with ns_to_ktime())
 * @runtime: runtime limit, stored unchanged in rt_b->rt_runtime
 *
 * The period timer is set up here but not started.
 */
void init_rt_bandwidth(struct rt_bandwidth *rt_b, u64 period, u64 runtime)
{
    struct hrtimer *period_timer = &rt_b->rt_period_timer;

    raw_spin_lock_init(&rt_b->rt_runtime_lock);

    rt_b->rt_runtime = runtime;
    rt_b->rt_period = ns_to_ktime(period);

    hrtimer_init(period_timer, CLOCK_MONOTONIC, HRTIMER_MODE_REL);
    /* sched_rt_period_timer() runs each time the period timer expires. */
    period_timer->function = sched_rt_period_timer;
}

此外，在任务组创建时，也会调用init_rt_bandwidth()函数为新的task_group初始化rt_bandwidth，具体见**alloc_rt_sched_group()**函数。

带宽控制

在周期性的tick处理过程中，或者在任务切出CPU时，都会调用update_curr_rt()函数更新任务的CPU使用信息，这里是触发带宽控制的关键流程。

cpp 复制代码

/*
 * Account the CPU time used by the currently running task on @rq and
 * trigger bandwidth enforcement (excerpt; some code is elided with "...").
 */
static void update_curr_rt(struct rq *rq)
{
    struct task_struct *curr = rq->curr;
    struct sched_rt_entity *rt_se = &curr->rt;
    u64 delta_exec;
...
    /* CPU time the current task consumed since the previous update. */
    delta_exec = rq_clock_task(rq) - curr->se.exec_start;
    if (unlikely((s64)delta_exec <= 0))
        return;

    /* Add it to the task's total execution time. */
    curr->se.sum_exec_runtime += delta_exec;

    curr->se.exec_start = rq_clock_task(rq);

    /* Nothing more to do when RT bandwidth control is disabled. */
    if (!rt_bandwidth_enabled())
        return;
    /*
     * Walk upward from the task's own scheduling entity through the
     * entities of its enclosing task groups, charging and checking the
     * run queue at every level.
     */
    for_each_sched_rt_entity(rt_se) {
        struct rt_rq *rt_rq = rt_rq_of_se(rt_se);
        /*
         * Queues whose limit is RUNTIME_INF are unrestricted and are not
         * charged.
         * NOTE(review): the original note claimed RUNTIME_INF is 0;
         * mainline defines it as ((u64)~0ULL) - confirm for the kernel
         * version being described.
         */
        if (sched_rt_runtime(rt_rq) != RUNTIME_INF) {
            raw_spin_lock(&rt_rq->rt_runtime_lock);
            rt_rq->rt_time += delta_exec;
            /* Over the limit: request a reschedule of this CPU. */
            if (sched_rt_runtime_exceeded(rt_rq))
                resched_curr(rq);
            raw_spin_unlock(&rt_rq->rt_runtime_lock);
        }
    }
}

sched_rt_runtime_exceeded()

该函数检查某个运行队列的使用时长是否超过了带宽限制，指定的运行队列可能是CPU运行队列，也可能是某个任务组的运行队列，这取决于当前运行的任务属于哪个任务组。

cpp 复制代码

/*
 * Check whether an RT run queue has consumed more CPU time than its
 * runtime limit and, if so, throttle it.
 *
 * Returns the result of rt_rq_throttled() when the queue was already
 * marked throttled, 1 when this call throttled and dequeued it, and 0
 * otherwise.
 */
static int sched_rt_runtime_exceeded(struct rt_rq *rt_rq)
{
    struct rt_bandwidth *rt_b;
    u64 limit = sched_rt_runtime(rt_rq);

    /* Already marked throttled: report the state without re-evaluating. */
    if (rt_rq->rt_throttled)
        return rt_rq_throttled(rt_rq);

    /* A limit at least as long as the period restricts nothing. */
    if (limit >= sched_rt_period(rt_rq))
        return 0;

    /*
     * balance_runtime() may adjust this queue's limit (runtime can be
     * borrowed between CPUs when RT_RUNTIME_SHARE is on), so read the
     * limit again afterwards.
     */
    balance_runtime(rt_rq);
    limit = sched_rt_runtime(rt_rq);
    if (limit == RUNTIME_INF)
        return 0;

    /* Still within budget. */
    if (rt_rq->rt_time <= limit)
        return 0;

    rt_b = sched_rt_bandwidth(rt_rq);
    if (likely(rt_b->rt_runtime)) {
        /* A non-zero runtime is configured: mark the queue throttled. */
        rt_rq->rt_throttled = 1;
        printk_deferred_once("sched: RT throttling activated\n");
    } else {
        /* No runtime configured: drop the accumulated time instead. */
        rt_rq->rt_time = 0;
    }

    if (!rt_rq_throttled(rt_rq))
        return 0;

    /* Throttled: take this queue's entity off its parent run queue. */
    sched_rt_rq_dequeue(rt_rq);
    return 1;
}

/*
 * Remove a throttled RT run queue from scheduling.
 *
 * Looks up the task group's scheduling entity for the CPU this queue
 * belongs to. With no such entity the top-level RT queue is dequeued;
 * otherwise the group's entity is dequeued if it is currently queued.
 */
static void sched_rt_rq_dequeue(struct rt_rq *rt_rq)
{
    struct sched_rt_entity *group_se;

    group_se = rt_rq->tg->rt_se[cpu_of(rq_of_rt_rq(rt_rq))];

    if (!group_se) {
        /* No group entity: the CPU's own run queue is being throttled. */
        dequeue_top_rt_rq(rt_rq);
        return;
    }

    /* A task group is being throttled. */
    if (on_rt_rq(group_se))
        dequeue_rt_entity(group_se);
}

带宽控制定时器

任务在运行过程中会不断地更新其CPU使用时长，如果发生限流则将其从队列中移除，那么必然还需要一种机制来解除限流并将其重新加入队列，这就是带宽控制定时器的作用。每个任务组的rt_bandwidth结构中都包含一个该定时器，从前面可以看到，定时器处理函数为sched_rt_period_timer()。

cpp 复制代码

/*
 * Expiry handler of the RT bandwidth period timer.
 *
 * Pushes the timer forward one period at a time and runs the per-period
 * processing for every period that has elapsed. Returns
 * HRTIMER_NORESTART when the last processing pass reported idle, and
 * HRTIMER_RESTART otherwise.
 */
static enum hrtimer_restart sched_rt_period_timer(struct hrtimer *timer)
{
    struct rt_bandwidth *rt_b;
    int elapsed;
    int idle = 0;

    /* The timer is embedded in its rt_bandwidth; recover the owner. */
    rt_b = container_of(timer, struct rt_bandwidth, rt_period_timer);

    /* Stop once hrtimer_forward() reports that no further period elapsed. */
    while ((elapsed = hrtimer_forward(timer, hrtimer_cb_get_time(timer),
                                      rt_b->rt_period)) != 0)
        idle = do_sched_rt_period_timer(rt_b, elapsed);

    if (idle)
        return HRTIMER_NORESTART;

    return HRTIMER_RESTART;
}

/*
 * Per-period processing for one rt_bandwidth: replenish the runtime of
 * every run queue it covers and lift throttling where possible.
 *
 * @rt_b:    bandwidth whose period timer expired
 * @overrun: number of periods that elapsed since the last run
 *
 * Returns non-zero when the period timer does NOT need to keep running
 * (the caller maps that to HRTIMER_NORESTART), and 0 when it must be
 * re-armed.
 */
static int do_sched_rt_period_timer(struct rt_bandwidth *rt_b, int overrun)
{
    int i, idle = 1, throttled = 0;
    const struct cpumask *span;

    span = sched_rt_period_mask();
#ifdef CONFIG_RT_GROUP_SCHED

    /* The root task group's bandwidth is processed on every online CPU. */
    if (rt_b == &root_task_group.rt_bandwidth)
        span = cpu_online_mask;
#endif
    /* Handle this bandwidth's run queue on each CPU in turn. */
    for_each_cpu(i, span) {
        int enqueue = 0;
        struct rt_rq *rt_rq = sched_rt_period_rt_rq(rt_b, i);
        struct rq *rq = rq_of_rt_rq(rt_rq);

        raw_spin_lock(&rq->lock);
        update_rq_clock(rq);

        if (rt_rq->rt_time) {
            u64 runtime;

            raw_spin_lock(&rt_rq->rt_runtime_lock);
            /* A throttled queue first tries to rebalance its runtime. */
            if (rt_rq->rt_throttled)
                balance_runtime(rt_rq);
            /*
             * Credit back up to overrun periods' worth of runtime,
             * never taking rt_time below zero.
             */
            runtime = rt_rq->rt_runtime;
            rt_rq->rt_time -= min(rt_rq->rt_time, overrun*runtime);
            if (rt_rq->rt_throttled && rt_rq->rt_time < runtime) {
                /*
                 * Usage has dropped back under the limit: lift the
                 * throttle. Setting enqueue makes the code below put
                 * the queue back.
                 */
                rt_rq->rt_throttled = 0;
                enqueue = 1;

                /*
                 * NOTE(review): presumably forces a fresh rq clock
                 * update when the CPU was idle - confirm against the
                 * kernel version being described.
                 */
                if (rt_rq->rt_nr_running && rq->curr == rq->idle)
                    rq->skip_clock_update = -1;
            }
            /* Outstanding usage or queued tasks keep the timer needed. */
            if (rt_rq->rt_time || rt_rq->rt_nr_running)
                idle = 0;
            raw_spin_unlock(&rt_rq->rt_runtime_lock);
        } else if (rt_rq->rt_nr_running) {
            idle = 0;
            if (!rt_rq_throttled(rt_rq))
                enqueue = 1;
        }
        if (rt_rq->rt_throttled)
            throttled = 1;

        /* Put the queue's scheduling entity back on its run queue. */
        if (enqueue)
            sched_rt_rq_enqueue(rt_rq);
        raw_spin_unlock(&rq->lock);
    }

    /*
     * With nothing throttled, and bandwidth control disabled or this
     * bandwidth unlimited, the timer can stop.
     */
    if (!throttled && (!rt_bandwidth_enabled() || rt_b->rt_runtime == RUNTIME_INF))
        return 1;

    return idle;
}

从实现上可以看出，该定时器的核心逻辑包括两点：

1. 递减运行队列上任务的CPU使用时长。因为检测是按周期进行的，这样相当于按周期将使用时长复位。
2. 对于限流的运行队列则解除限流，并将其重新入队。

下面还需要看一下该定时器是如何启动的。每个任务组都有一个自己的定时器，因此只需要在该任务组有任务被加入队列时启动即可，调用流程为__enqueue_rt_entity()->inc_rt_tasks()->inc_rt_group()->start_rt_bandwidth()。

cpp 复制代码

/*
 * Group-side bookkeeping on the enqueue path (excerpt; earlier code is
 * elided with "...").
 */
static void
inc_rt_group(struct sched_rt_entity *rt_se, struct rt_rq *rt_rq)
{
...
    /* If the queue belongs to a task group, make sure that group's period timer is running. */
    if (rt_rq->tg)
        start_rt_bandwidth(&rt_rq->tg->rt_bandwidth);
}

/*
 * Arm the period timer of @rt_b if it is needed and not already active.
 *
 * Nothing is done when RT bandwidth control is disabled, when this
 * bandwidth's runtime is RUNTIME_INF, or when the timer is already
 * active.
 */
static void start_rt_bandwidth(struct rt_bandwidth *rt_b)
{
    struct hrtimer *period_timer = &rt_b->rt_period_timer;

    if (rt_bandwidth_enabled() && rt_b->rt_runtime != RUNTIME_INF &&
        !hrtimer_active(period_timer)) {
        /* Start the timer with the accounting period as its interval. */
        raw_spin_lock(&rt_b->rt_runtime_lock);
        start_bandwidth_timer(period_timer, rt_b->rt_period);
        raw_spin_unlock(&rt_b->rt_runtime_lock);
    }
}