rt-linux下的cgroup cpu的死锁bug

一、背景

rt-linux系统有其非常大的实时性的优势，但是与之俱来的是该系统上有一些天然的缺陷。由于rt-linux系统允许进程在内核态执行的逻辑里，在持锁期间，甚至持spinlock锁期间，都能被其他进程抢占。这一特性能带来实时性的好处，即能最大限度的满足高优先级进程的优先运行，但是势必会带来一定坏处，也就是某些底层锁会因此在持有的情况下被迫不能继续执行（被别的任务抢占），导致别的任务也需要访问该底层锁的时候，就会阻塞，这种情况会因为开启cgroup后更加加剧。

它会导致一些看似很不合理的情况，比如两个看似毫无关系的进程却会因为其中一个执行到一半被cgroup限额了导致另外一个进程一直在等前者继续运行从而来唤醒自己。老实说，这种情况在普通内核开启cgroup的cpu限额后应该也能出现，但是普通内核由于spinlock是禁用抢占的，而底层逻辑里的锁大部分还是spinlock，所以普通内核里因为cgroup限额导致底层逻辑阻塞两个看似不相关的进程的情况还是比较少的。另外一方面，普通内核里如果发生类似的情况就是使用了mutex，而使用了mutex那就是可以认为是能忍受一定的等待的，同样的，对于底层逻辑也一样，底层逻辑里要使用mutex的地方肯定也都是默认能忍受一定等待的，那么这时候发生cgroup cpu的限额导致这样的等待又多持续了一段时间，那也理应能接受的。

虽然rt-linux针对锁有优先级继承的逻辑来保证杜绝优先级反转的情况，比如如果普通进程访问了一个spinlock但是被RT进程抢占了，而RT进程执行完之后，普通进程所在的cgroup组又限额导致普通进程不能进一步执行逻辑，而导致spinlock一直退不出来。这时候，如果是一个实时进程要访问该锁，那么由于优先级继承的逻辑，普通进程就会被临时提高优先级，提高到要用锁的进程的优先级和自己优先级中的较高者，这个例子里就是提高到实时优先级，所以普通进程又能执行下去了。关于该优先级反转的进一步细节见之前的博客 rt-linux之防止优先级反转-CSDN博客。

但是，刚才说是要使用该锁的进程是实时进程的情况，但是如果要使用该锁的后者并不是实时进程，而是普通进程的话，那么就算提高优先级也是在普通进程这个调度类范畴，也受cgroup cpu的管控，仍然无法拿到额外的运行时间，还得等cgroup cpu的period timer重新补充时间来运行。这就会导致后面要拿锁的进程要等很长的时间，而后面要拿锁的进程和前者持锁被throttle的进程可能它们之间是表面并不关联的，关联的部分可能就是底层的逻辑，这就会导致一些比较诡异难理解的现象出来。这种现象，我们后面的博客会用一些例子程序来模拟出来，并用图示来进一步解释。

这篇博客里，我们只讨论rt-linux内核里会因为cgroup限额导致死锁的情况，在下面第二章里，我们给出复现的程序和复现方法，在第三章里，我们来阐述其原理，并给出相应的解法。

二、复现程序和复现方法

2.1 复现程序

2.1.1 复现程序用的内核模块

cpp 复制代码

#include <linux/module.h>
#include <linux/capability.h>
#include <linux/sched.h>
#include <linux/uaccess.h>
#include <linux/proc_fs.h>
#include <linux/ctype.h>
#include <linux/seq_file.h>
#include <linux/poll.h>
#include <linux/types.h>
#include <linux/ioctl.h>
#include <linux/errno.h>
#include <linux/stddef.h>
#include <linux/lockdep.h>
#include <linux/kthread.h>
#include <linux/sched.h>
#include <linux/delay.h>
#include <linux/wait.h>
#include <linux/init.h>
#include <asm/atomic.h>
#include <trace/events/workqueue.h>
#include <linux/sched/clock.h>
#include <linux/string.h>
#include <linux/mm.h>
#include <linux/interrupt.h>
#include <linux/tracepoint.h>
#include <trace/events/osmonitor.h>
#include <trace/events/sched.h>
#include <trace/events/irq.h>
#include <trace/events/kmem.h>
#include <linux/ptrace.h>
#include <linux/uaccess.h>
#include <asm/processor.h>
#include <linux/sched/task_stack.h>
#include <linux/nmi.h>
#include <linux/version.h>
#include <linux/sched/mm.h>
#include <asm/irq_regs.h>
#include <linux/kallsyms.h>
#include <linux/kprobes.h>
#include <linux/stop_machine.h>

/* kprobe instance; kprobe_register_func_vdso_fault() attaches it to "sched_cfs_period_timer". */
struct kprobe _kp1;

//static bool _blog = false;

/*
 * getfullpath - copy the path of @inode's dentry alias(es) into @i_buffer.
 * @inode:    inode whose i_dentry alias list is walked
 * @i_buffer: destination buffer supplied by the caller
 * @i_len:    size of @i_buffer in bytes
 *
 * Every alias whose path can be resolved overwrites @i_buffer, so the last
 * resolvable alias wins; @i_buffer is left untouched when no alias resolves.
 *
 * Return: 0 on success (including "no alias found"), -ENOMEM when the
 * temporary page cannot be allocated.
 *
 * NOTE(review): the alias list is walked without taking inode->i_lock (the
 * lock calls were commented out in the original, and GFP_KERNEL may sleep) -
 * confirm the callers guarantee the list is stable.
 */
int getfullpath(struct inode* inode, char* i_buffer, int i_len)
{
    struct dentry* dentry;

    hlist_for_each_entry(dentry, &inode->i_dentry, d_u.d_alias) {
        char* buffer, * path;

        buffer = (char*)__get_free_page(GFP_KERNEL);
        if (!buffer)
            return -ENOMEM;

        path = dentry_path_raw(dentry, buffer, PAGE_SIZE);
        /* Skip aliases whose path cannot be built, but still fall through to free the page. */
        if (!IS_ERR(path))
            strlcpy(i_buffer, path, i_len);

        /* Released on every iteration; the original leaked it on the error path. */
        free_page((unsigned long)buffer);
    }
    return 0;
}

/* Name of the /proc entry created by proctestcgroup_init(). */
#define DEVICE_NAME "testcgroupbug"
/* The only ioctl command accepted by proc_ioctl(); its argument is a user pointer copied in as a testpara. */
#define IOCTL_SET_MODE _IOW('a', 1, long)

/* Handle returned by proc_create(); released in proctestcgroup_exit(). */
static struct proc_dir_entry* proc_entry;
// static rwlock_t my_rwlock;
/* Reader/writer lock taken by the reader ioctl, the writer ioctl and the hrtimer callback. */
DEFINE_RWLOCK(my_rwlock);

/* ioctl payload copied from user space by proc_ioctl(). */
typedef struct testpara {
    int mode;        /* 0 selects the reader path, 1 the writer path; anything else is rejected */
    int sleepsecond; /* seconds to busy-loop while holding the lock (see deadloop_second*) */
} testpara;

// static ssize_t proc_read(struct file *file, char __user *buf, size_t count, loff_t *offset) {
//     char message[256];
//     ssize_t len = snprintf(message, sizeof(message), "Current mode: %d\n", mode);

//     return simple_read_from_buffer(buf, count, offset, message, len);
// }

/*
 * Busy-wait (no sleeping, no yielding) for @i_time seconds, measured in jiffies.
 *
 *   i_time  < 0 : spin forever, never returns
 *   i_time == 0 : return immediately
 *   i_time  > 0 : spin until HZ * i_time jiffies have elapsed since entry
 */
void deadloop_second(int i_time) {
    const unsigned long begin = jiffies;

    if (i_time == 0)
        return;

    if (i_time < 0) {
        for (;;)
            ;
    }

    while (time_before(jiffies, begin + (unsigned long)(HZ * i_time)))
        ;
}

#include <linux/perf_event.h>
#include <linux/hw_breakpoint.h>

/* Per-CPU perf events returned by register_wide_hw_breakpoint(). */
struct perf_event* __percpu* sample_hbp;

/*
 * Hardware-breakpoint handler: dumps the current stack every time the
 * breakpoint fires. None of the three parameters is used.
 */
static void sample_hbp_handler(struct perf_event* bp,
    struct perf_sample_data* data,
    struct pt_regs* regs)
{
    //printk(KERN_INFO "%s value is changed\n", ksym_name);
    dump_stack();
    //printk(KERN_INFO "Dump stack from sample_hbp_handler\n");
}

void register_prioritychange_dumpstack(void)
{
    int ret;
    struct perf_event_attr attr;
    void* addr = &current->prio;

    hw_breakpoint_init(&attr);
    attr.bp_addr = (unsigned long)addr;
    attr.bp_len = HW_BREAKPOINT_LEN_4;
    attr.bp_type = HW_BREAKPOINT_W;

    sample_hbp = register_wide_hw_breakpoint(&attr, sample_hbp_handler, NULL);
    if (IS_ERR((void __force*)sample_hbp)) {
        ret = PTR_ERR((void __force*)sample_hbp);
        return;
    }

    printk(KERN_INFO "HW Breakpoint for write installed\n");

}

void unregister_prioritychange_dumpstack(void)
{
    unregister_wide_hw_breakpoint(sample_hbp);
}

/*
 * Set to true when a reader ioctl starts and back to false when a writer
 * ioctl finishes; the kprobe pre-handler only logs while it is true.
 */
static bool _bneedoutput = false;

/*
 * Expiry callback of the test hrtimer.
 *
 * Takes my_rwlock for reading and releases it again straight away, logging
 * the CPU and current task name before the lock, while holding it, and after
 * the unlock. @timer is unused.
 *
 * Return: HRTIMER_NORESTART, i.e. the timer is not re-armed.
 */
static enum hrtimer_restart hrtimer_callback(struct hrtimer* timer)
{
    printk(KERN_INFO "<cpu%d><comm%s>hrtimer_callback before read_lock!\n", smp_processor_id(), current->comm);
    /* Same lock the reader/writer ioctl paths contend on. */
    read_lock(&my_rwlock);
    printk(KERN_INFO "<cpu%d><comm%s>hrtimer_callback before read_unlock!\n", smp_processor_id(), current->comm);
    read_unlock(&my_rwlock);
    printk(KERN_INFO "<cpu%d><comm%s>hrtimer_callback after read_unlock!\n", smp_processor_id(), current->comm);
    return HRTIMER_NORESTART;
}

/* The single test timer armed by register_hrtimer_soft(). */
static struct hrtimer		_testtimer;

/*
 * Initialise and start the test timer on CLOCK_MONOTONIC in absolute, pinned
 * mode with hrtimer_callback() as its handler.
 *
 * NOTE(review): no *_HARD mode flag is used; per the surrounding article this
 * makes the callback run in ktimer/ksoftirqd context on PREEMPT_RT - confirm
 * for the target kernel.
 */
void register_hrtimer_soft(void)
{
    hrtimer_init(&_testtimer, CLOCK_MONOTONIC, HRTIMER_MODE_ABS_PINNED);
    _testtimer.function = hrtimer_callback;
    /* Move the expiry past "now" using a 10000 ns interval before starting. */
    hrtimer_forward_now(&_testtimer, ns_to_ktime(10000));
    hrtimer_start_expires(&_testtimer, HRTIMER_MODE_ABS_PINNED);
}

/* Cancel the test timer via hrtimer_cancel(). */
void unregister_hrtimer_soft(void)
{
    hrtimer_cancel(&_testtimer);
}

/* Set by the reader ioctl once it holds the read lock; the writer ioctl spins until it is true. */
static volatile bool bhasreadenterlock = false;

/* Set by the writer ioctl right before it calls write_lock(); polled by deadloop_second_special(). */
static volatile bool bhaswriter = false;

/* Makes deadloop_second_special() call register_hrtimer_soft() at most once. */
static volatile bool bhasregisterhrtimersoft = false;

/*
 * Busy-wait variant used by the reader while it holds the read lock.
 *
 *   i_time  < 0 : spin forever, never returns
 *   i_time == 0 : return immediately
 *   i_time  > 0 : spin until HZ * i_time jiffies have elapsed since entry
 *
 * While spinning in the positive case, each pass that observes bhaswriter
 * first burns another HZ / 25 jiffies and then, the first time only, arms
 * the test hrtimer through register_hrtimer_soft().
 */
void deadloop_second_special(int i_time) {
    const unsigned long begin = jiffies;

    if (i_time == 0)
        return;

    if (i_time < 0) {
        for (;;)
            ;
    }

    while (time_before(jiffies, begin + (unsigned long)(HZ * i_time))) {
        unsigned long settle_until;

        /* Nothing to do until a writer has announced itself. */
        if (!bhaswriter)
            continue;

        /* Extra delay after the writer showed up. */
        settle_until = jiffies + (unsigned long)(HZ / 25);
        while (time_before(jiffies, settle_until))
            ;

        if (bhasregisterhrtimersoft)
            continue;

        bhasregisterhrtimersoft = true;
        register_hrtimer_soft();
    }
}

/* Running counters; each reader / writer ioctl call snapshots one to tag its log lines. */
static int _readindex = 0;
static int _writeindex = 0;

static long proc_ioctl(struct file* file, unsigned int cmd, unsigned long arg) {
    testpara para;
    if (cmd == IOCTL_SET_MODE) {
        if (copy_from_user(&para, (int __user*)arg, sizeof(testpara))) {
            return -EFAULT;
        }

        if (para.mode == 0) {
            int readindex = _readindex;
            _bneedoutput = true;
            _readindex++;
            if (bhaswriter) {
                printk(KERN_ERR "<cpu%d>readindex[%d]Not expected bhaswriter value! Need reload ko!\n", smp_processor_id(),
                    readindex);
                return -EFAULT;
            }
            //register_prioritychange_dumpstack();
            printk(KERN_INFO "<cpu%d>readindex[%d]before read lock\n", smp_processor_id(), readindex);
            read_lock(&my_rwlock);
            bhasreadenterlock = true;
            printk(KERN_INFO "<cpu%d>readindex[%d]Read lock acquired.\n", smp_processor_id(), readindex);
            printk(KERN_INFO "<cpu%d>readindex[%d]Sleep second[%d]\n", smp_processor_id(), readindex, para.sleepsecond);
            // wait bhaswriter true and then register_hrtimer_soft
            deadloop_second_special(para.sleepsecond);
            // Perform read operations here
            read_unlock(&my_rwlock);
            printk(KERN_INFO "<cpu%d>readindex[%d]Read lock released.\n", smp_processor_id(), readindex);
            //unregister_prioritychange_dumpstack();
        }
        else if (para.mode == 1) {
            int writeindex = _writeindex;
            _writeindex++;
            printk(KERN_INFO "<cpu%d>writerindex[%d]writer check bhasreadenterlock\n", smp_processor_id(), writeindex);
            while (1) {
                if (bhasreadenterlock) break;
            }
            printk(KERN_INFO "<cpu%d>writerindex[%d]before write lock\n", smp_processor_id(), writeindex);
            bhaswriter = true;
            write_lock(&my_rwlock);
            printk(KERN_INFO "<cpu%d>writerindex[%d]Write lock acquired.\n", smp_processor_id(), writeindex);
            printk(KERN_INFO "<cpu%d>writerindex[%d]Sleep second[%d]\n", smp_processor_id(), writeindex, para.sleepsecond);
            deadloop_second(para.sleepsecond);
            // Perform write operations here
            write_unlock(&my_rwlock);
            printk(KERN_INFO "<cpu%d>writerindex[%d]Write lock released.\n", smp_processor_id(), writeindex);
            _bneedoutput = false;
        }
        else {
            printk(KERN_ERR "<cpu%d>Invalid mode: %d\n", smp_processor_id(), para.mode);
            return -EINVAL;
        }

        return 0;
    }

    return -EINVAL;
}

/* /proc operations table: only ioctl is implemented. */
static const struct proc_ops proc_fops = {
    .proc_ioctl = proc_ioctl,
};

/*
 * Create /proc/testcgroupbug (mode 0666) backed by proc_fops.
 *
 * Return: 0 on success, -ENOMEM when proc_create() fails.
 */
static int proctestcgroup_init(void) {
    /*
     * Initialise the lock BEFORE the /proc entry is published. The original
     * called rwlock_init() after proc_create(), so an ioctl arriving in
     * between could have the lock re-initialised underneath it.
     * (my_rwlock is also statically initialised by DEFINE_RWLOCK.)
     */
    rwlock_init(&my_rwlock);

    proc_entry = proc_create(DEVICE_NAME, 0666, NULL, &proc_fops);
    if (!proc_entry) {
        return -ENOMEM;
    }

    printk(KERN_INFO "Module loaded: /proc/%s created.\n", DEVICE_NAME);
    return 0;
}

/* Remove the /proc entry created by proctestcgroup_init() and log the removal. */
static void proctestcgroup_exit(void) {
    proc_remove(proc_entry);
    printk(KERN_INFO "Module unloaded: /proc/%s removed.\n", DEVICE_NAME);
}

/*
 * kprobe pre-handler installed on sched_cfs_period_timer (the "vdso_fault"
 * in the name does not reflect the probed symbol).
 *
 * While _bneedoutput is set it logs the CPU and the current task name each
 * time the probed function is entered; otherwise it does nothing.
 * @i_k and @i_p are unused.
 *
 * Return: always 0.
 */
int kprobecb_vdso_fault_pre(struct kprobe* i_k, struct pt_regs* i_p)
{
    if (!_bneedoutput)
        return 0;

    printk(KERN_INFO "<cpu%d><comm%s>sched_cfs_period_timer\n", smp_processor_id(), current->comm);
    return 0;
}

/*
 * Register _kp1 as a kprobe on "sched_cfs_period_timer" with
 * kprobecb_vdso_fault_pre() as pre-handler and no post-handler.
 *
 * Return: 0 on success, otherwise the negative error code reported by
 * register_kprobe() (the original collapsed every failure to -1).
 */
int kprobe_register_func_vdso_fault(void)
{
    int ret;

    memset(&_kp1, 0, sizeof(_kp1));
    _kp1.symbol_name = "sched_cfs_period_timer";
    _kp1.pre_handler = kprobecb_vdso_fault_pre;
    _kp1.post_handler = NULL;

    ret = register_kprobe(&_kp1);
    if (ret < 0) {
        /* Include the error code so the reason is visible in dmesg. */
        printk("register_kprobe fail! ret=%d\n", ret);
        return ret;
    }

    printk("register_kprobe success!\n");
    return 0;
}

/* Unregister the _kp1 kprobe. */
void kprobe_unregister_func_vdso_fault(void)
{
    unregister_kprobe(&_kp1);
}

/*
 * Module entry point: install the diagnostic kprobe and create the /proc
 * entry.
 *
 * A kprobe registration failure is tolerated, because the probe only adds
 * log output and the reproducer works without it. Failing to create the
 * /proc entry makes the module useless, so that error is propagated (the
 * original ignored both results and always returned 0).
 *
 * Return: 0 on success, negative errno from proctestcgroup_init() otherwise.
 */
static int __init testcgroupbug_init(void)
{
    int ret;
    bool kprobe_ok;

    kprobe_ok = (kprobe_register_func_vdso_fault() == 0);

    ret = proctestcgroup_init();
    if (ret < 0) {
        /* Unwind what was set up so far before failing the load. */
        if (kprobe_ok)
            kprobe_unregister_func_vdso_fault();
        return ret;
    }

    return 0;
}

/*
 * Module exit point.
 *
 * Teardown order:
 *   1. remove the /proc entry so no new ioctl can arm the timer;
 *   2. cancel the test hrtimer if it was ever set up - the original never
 *      did, so a pending or running callback could execute after the module
 *      text was freed. The flag check avoids cancelling a timer that was
 *      never initialised;
 *   3. unregister the kprobe.
 */
static void __exit testcgroupbug_exit(void)
{
    proctestcgroup_exit();

    if (bhasregisterhrtimersoft)
        unregister_hrtimer_soft();

    kprobe_unregister_func_vdso_fault();
}

module_init(testcgroupbug_init);
module_exit(testcgroupbug_exit);
MODULE_AUTHOR("zhaoxin");
MODULE_DESCRIPTION("Module for testcgroupbug debug.");
MODULE_LICENSE("GPL");

2.1.2 复现程序用的rwlock_read用户态程序

cpp 复制代码

#include <cstring>
#include <iostream>
#include <csignal>
#include <thread>
#include <chrono>
#include <ctime>
#include <atomic>
#include <cmath>
#include <fstream>
#include <vector>
#include <memory>
#include <map>
#include <getopt.h>
#include <sys/ioctl.h>
#include <sys/mman.h>
#include <linux/ioctl.h>
#include <linux/types.h>
#include <signal.h>
#include <fcntl.h>
#include <sys/types.h>
#include <sys/syscall.h>

/* These definitions must stay identical to the ones in the testcgroupbug kernel module. */
#define DEVICE_NAME "testcgroupbug"
/* The only ioctl command the module accepts (defined once; the duplicate definition was dropped). */
#define IOCTL_SET_MODE _IOW('a', 1, long)

/* ioctl payload: mode 0 = reader, mode 1 = writer; sleepsecond = seconds to hold the lock. */
typedef struct testpara {
    int mode;
    int sleepsecond;
} testpara;

/*
 * Reader side of the reproducer: asks the kernel module (mode 0) to take the
 * rwlock for reading and hold it for <sleep_time_in_seconds>.
 *
 * A negative duration is passed through unchanged (the module then holds the
 * lock forever); non-numeric input is rejected.
 *
 * Exit status: 0 on success, 1 on usage error, open failure or ioctl failure.
 */
int main(int argc, char *argv[])
{
    if (argc != 2) {
        fprintf(stderr, "Usage: %s <sleep_time_in_seconds>\n", argv[0]);
        return 1;
    }

    /* strtol instead of atoi so that garbage input is detected. */
    errno = 0;
    char *end = NULL;
    long parsed = strtol(argv[1], &end, 10);
    if (end == argv[1] || *end != '\0' || errno == ERANGE || (long)(int)parsed != parsed) {
        fprintf(stderr, "Invalid sleep time: %s\n", argv[1]);
        return 1;
    }
    int duration = (int)parsed;

    int fd = open("/proc/testcgroupbug", O_RDWR);
    if (fd < 0) {
        /* Capture errno before any stdio call can overwrite it. */
        int err = errno;
        fprintf(stderr, "open /proc/testcgroupbug failed: %s\n", strerror(err));
        return 1;
    }
    printf("/proc/testcgroupbug open succeed!\n");

    testpara para;
    para.mode = 0; /* reader */
    para.sleepsecond = duration;

    if (ioctl(fd, IOCTL_SET_MODE, &para) < 0) {
        int err = errno;
        fprintf(stderr, "/proc/testcgroupbug ioctl fail: %s\n", strerror(err));
        return 1;
    }
    printf("/proc/testcgroupbug ioctl success!\n");

    return 0;
}

2.1.3 复现程序用的rwlock_write用户态程序

cpp 复制代码

#include <cstring>
#include <iostream>
#include <csignal>
#include <thread>
#include <chrono>
#include <ctime>
#include <atomic>
#include <cmath>
#include <fstream>
#include <vector>
#include <memory>
#include <map>
#include <getopt.h>
#include <sys/ioctl.h>
#include <sys/mman.h>
#include <linux/ioctl.h>
#include <linux/types.h>
#include <signal.h>
#include <fcntl.h>
#include <sys/types.h>
#include <sys/syscall.h>

/* These definitions must stay identical to the ones in the testcgroupbug kernel module. */
#define DEVICE_NAME "testcgroupbug"
/* The only ioctl command the module accepts (defined once; the duplicate definition was dropped). */
#define IOCTL_SET_MODE _IOW('a', 1, long)

/* ioctl payload: mode 0 = reader, mode 1 = writer; sleepsecond = seconds to hold the lock. */
typedef struct testpara {
    int mode;
    int sleepsecond;
} testpara;

/*
 * Writer side of the reproducer: asks the kernel module (mode 1) to take the
 * rwlock for writing and hold it for <sleep_time_in_seconds>.
 *
 * A negative duration is passed through unchanged (the module then holds the
 * lock forever); non-numeric input is rejected.
 *
 * Exit status: 0 on success, 1 on usage error, open failure or ioctl failure.
 */
int main(int argc, char *argv[])
{
    if (argc != 2) {
        fprintf(stderr, "Usage: %s <sleep_time_in_seconds>\n", argv[0]);
        return 1;
    }

    /* strtol instead of atoi so that garbage input is detected. */
    errno = 0;
    char *end = NULL;
    long parsed = strtol(argv[1], &end, 10);
    if (end == argv[1] || *end != '\0' || errno == ERANGE || (long)(int)parsed != parsed) {
        fprintf(stderr, "Invalid sleep time: %s\n", argv[1]);
        return 1;
    }
    int duration = (int)parsed;

    int fd = open("/proc/testcgroupbug", O_RDWR);
    if (fd < 0) {
        /* Capture errno before any stdio call can overwrite it. */
        int err = errno;
        fprintf(stderr, "open /proc/testcgroupbug failed: %s\n", strerror(err));
        return 1;
    }
    printf("/proc/testcgroupbug open succeed!\n");

    testpara para;
    para.mode = 1; /* writer */
    para.sleepsecond = duration;

    if (ioctl(fd, IOCTL_SET_MODE, &para) < 0) {
        int err = errno;
        fprintf(stderr, "/proc/testcgroupbug ioctl fail: %s\n", strerror(err));
        return 1;
    }
    printf("/proc/testcgroupbug ioctl success!\n");

    return 0;
}

2.1.4 依次启动程序的脚本

再准备一个死循环程序，代码如下，比较简单：

cpp 复制代码

#include <stdio.h>

/*
 * CPU hog: spins forever to keep one core 100% busy. Never returns.
 */
int main()
{
	/*
	 * The volatile counter gives the loop an observable side effect. An
	 * empty `while(1);` is undefined behaviour when this file is built as
	 * C++, and an optimising compiler may then delete the loop.
	 */
	volatile unsigned long spin = 0;

	for (;;)
		spin = spin + 1;
}

然后就是一个脚本，来按照如下的时序来依次启动程序：

bash 复制代码

#!/bin/bash
# Drives the cgroup-cpu / rwlock deadlock reproducer on a 6-CPU machine.
# Must run as root from the directory holding testcgroupbug.ko, deadloop,
# rwlock_read and rwlock_write.

# Clean up leftovers from a previous run (rmmod complains harmlessly if the
# module is not loaded yet).
pkill deadloop

rmmod testcgroupbug
insmod testcgroupbug.ko

# Print and clear the kernel log so only this run's messages remain.
dmesg -c

# -p: do not fail when the cgroup already exists from an earlier run.
mkdir -p /sys/fs/cgroup/test
# Quota 500000us per 1000000us period, i.e. at most 50% of one CPU.
echo "500000 1000000" > /sys/fs/cgroup/test/cpu.max

# Keep every CPU except CPU 1 busy.
taskset -c 0 ./deadloop &
taskset -c 2 ./deadloop &
taskset -c 3 ./deadloop &
taskset -c 4 ./deadloop &
taskset -c 5 ./deadloop &

sleep 3

# Reader: pinned to CPU 1, holds the read lock for 5 seconds.
taskset -c 1 ./rwlock_read 5 &

pid=$!

# Move the reader into the throttled cgroup only AFTER it was started on CPU 1.
# NOTE(review): presumably deliberate, so that the cgroup's period timer is
# armed on CPU 1 - confirm before reordering.
echo $pid > /sys/fs/cgroup/test/cgroup.procs

#sleep 2

# Writer: SCHED_FIFO priority 60, holds the write lock for 3 seconds.
chrt -f 60 ./rwlock_write 3 &

#sleep 2

#./rwlock_read 10 &

2.2 复现方法

复现方法比较简单，就是在rt-linux内核上执行上面 2.1.4 里的脚本即可。

要注意的是 2.1.4 里的脚本是针对只有6个cpu的情况，如果cpu的数量较多，请自行根据cpu的数量调整脚本，意思就是把cpu1以外的其他核都跑一个死循环的任务，让cpu 1相对比较闲，这样更容易复现该死锁问题。

三、原理及相应解法

3.1 死锁原理

死锁发生需要依赖一定的时序：

1）进程A在持有了rwlock的读锁之后，被其所在的cgroup cpu给throttle了

2）进程B尝试获取rwlock的写锁，但因为进程A还持有读锁而被阻塞；由于rwlock的机制（一旦有人尝试拿写锁，后续的读者都会被阻塞），此后新来的读者也都拿不到读锁

3）一个软timer的任务也需要访问该rwlock的读锁，该软timer的任务这时候访问了该rwlock的读锁，所以只能等在那儿，执行不下去了

4）由于之前进程A已经被cgroup cpu给throttle了，且持有着读锁没有释放，软timer的任务假设是在ktimer内核线程上运行，由于也需要拿读锁且读锁没有释放，所以该ktimer内核线程的当前这个处理timer到期的任务包括后面的处理timer到期的任务都得不到执行

5）而cgroup的period timer是pinned的timer，在一开始创建这个period timer如果绑定在某个核上，那因为pinned的模式就一直得在这个核上运行，所以假设绑定的这个核恰好就是使用上面说的读锁而被阻塞的软timer任务所在的核上，那么这个period timer就一直得不到执行。因为另外一个细节是，CONFIG_PREEMPT_RT的系统上该cgroup的period timer设的HRTIMER_MODE_ABS_PINNED和HRTIMER_MODE_REL都没带HARD所以都是在ktimer或ksoftirqd里运行，而不是硬中断处理直接触发执行的

6）另外，如果是mutex而不是rwlock，由于ktimer是一个FIFO 1的进程，根据rt-linux上的优先级继承策略，被cgroup限制执行的进程A如果使用的不是rwlock而是mutex的话，会被临时提高优先级到ktimer的FIFO 1优先级，这样就不死锁了。可是很遗憾，这个例子里，进程A是用的rwlock而不是mutex，由于rwlock允许有多个读者，所以它并不能实施该优先级继承的策略，因为优先级继承的实现针对的是一个持锁人的情况。

3.1.1 代码解释

其实在上面的逻辑描述还是比较清晰的，我们再来跟着看一下复现该bug用到的代码。

先启动的rwlock的读者进程，走到了下面的内核模块里的逻辑：

然后执行rwlock的writer，上图里执行了deadloop_second_special函数，确保有rwlock的writer了之后再启动软hrtimer：

启动软hrtimer并设定PINNED：

在该软hrtimer的callback里使用rwlock进行读：

这时候就可能发生循环依赖的死锁。

3.2 死锁解法

我们有一个相对简单的针对该死锁问题的解法，改动如下：

意思就是在rt-linux下把该cgroup cpu相关的两个period相关的timer都改成和普通内核里的timer的实现方式一样，即使用硬中断响应该时间到期的事件，而不用当前rt-linux里采用的hrtimer的软处理逻辑。