进程内mmap锁相互干扰问题

一、背景

在调查进程内非预期的、耗时较长的唤醒关系时，我们已经在用户态代码里增加了ftrace的trace_marker打点。从抓到的异常trace里发现，进程内的两个线程之间存在一次唤醒关系，而从trace_marker所标记的用户态函数逻辑来看，这两个线程之间并不存在锁关联。在这次唤醒关系里，被唤醒者是高优线程，唤醒者是低优线程；唤醒者由于优先级低，被其他中优或者高优线程抢占，等了较长的时间后才继续执行逻辑并唤醒高优线程，如下图：

从上图里可以看到低优线程528169被抢占了很久时间后才继续运行，唤醒了528164线程，而528164线程是以D的状态陷入了睡眠，说明是在等内核的锁，而我们是rt-linux系统，底层的内核锁如果是用的mutex或者spinlock的话，它是有优先级继承的，怎么会没有更早让唤醒者528169提高优先级而得到调度，这样如此高优（都已经设置成了99这种rt了）的528164线程可以早一些被唤醒。所以可以说明这个锁并不是内核里的mutex或者spinlock。那么到底是什么锁呢？我们在下面第二章里通过ko来抓这样的唤醒栈，另外，这部分逻辑可能也涉及用户栈，所以抓取程序也抓用户栈的内容，相关用户态解析的程序及相关的逻辑见之前的博客内核逻辑里抓取用户栈的几种方法和用户栈的高效解析逻辑。另外，在第三章里，我们针对抓取程序及抓取到的内容进行相关解释。

二、抓取程序的ko

2.1 抓取唤醒栈的程序

cpp 复制代码

#include <linux/module.h>
#include <linux/capability.h>
#include <linux/sched.h>
#include <linux/uaccess.h>
#include <linux/proc_fs.h>
#include <linux/ctype.h>
#include <linux/seq_file.h>
#include <linux/poll.h>
#include <linux/types.h>
#include <linux/ioctl.h>
#include <linux/errno.h>
#include <linux/stddef.h>
#include <linux/lockdep.h>
#include <linux/kthread.h>
#include <linux/sched.h>
#include <linux/delay.h>
#include <linux/wait.h>
#include <linux/init.h>
#include <asm/atomic.h>
#include <trace/events/workqueue.h>
#include <linux/sched/clock.h>
#include <linux/string.h>
#include <linux/mm.h>
#include <linux/interrupt.h>
#include <linux/tracepoint.h>
#include <trace/events/osmonitor.h>
#include <trace/events/sched.h>
#include <trace/events/irq.h>
#include <trace/events/kmem.h>
#include <linux/ptrace.h>
#include <linux/uaccess.h>
#include <asm/processor.h>
#include <linux/sched/task_stack.h>
#include <linux/nmi.h>
#include <linux/version.h>
#include <linux/sched/mm.h>
#include <asm/irq_regs.h>
#include <linux/kallsyms.h>
#include <linux/kprobes.h>
#include <linux/stop_machine.h>
#include <linux/perf_event.h>

/* Module metadata. */
MODULE_LICENSE("GPL");
MODULE_AUTHOR("zhaoxin");
MODULE_DESCRIPTION("Module for monitor wakeup.");
MODULE_VERSION("1.0");

/*
 * Load-time filters. A wakeup is sampled only when the waking thread's pid
 * equals wakerpid and the woken thread's pid equals wakeepid (see
 * cb_sched_waking()). Both must be non-zero or module init refuses to load.
 * 0444 is the octal spelling of S_IRUGO (read-only for everyone).
 */
static int wakerpid;
module_param(wakerpid, int, 0444);
static int wakeepid;
module_param(wakeepid, int, 0444);

/* Capacity, in frames, of each kernel-stack array stored in a sample. */
#define TEST_STACK_TRACE_ENTRIES   32

/*
 * Signature of the function looked up under the symbol name
 * "stack_trace_save_tsk" in module init; the result is kept in
 * _stack_trace_save_tsk and used to capture kernel stacks.
 */
typedef unsigned int (*stack_trace_save_tsk_func)(struct task_struct *task,
				  unsigned long *store, unsigned int size,
				  unsigned int skipnr);
stack_trace_save_tsk_func _stack_trace_save_tsk;

/*
 * Signature of the function looked up under the symbol name
 * "perf_callchain_user"; the result is kept in _perf_callchain_user and
 * used by store_user_stack() to capture user-space return addresses.
 */
typedef void (*perf_callchain_user_func)(struct perf_callchain_entry_ctx *entry,
				struct pt_regs *regs);
perf_callchain_user_func _perf_callchain_user;

void init_get_perf_callchain_func(void)
{
	int ret;
	struct kprobe kp;
	memset(&kp, 0, sizeof(kp));
	kp.symbol_name = "perf_callchain_user";
	kp.pre_handler = NULL;
	kp.addr = NULL;	// 作为强调，提示使用symbol_name
	ret = register_kprobe(&kp);
	if (ret < 0) {
		printk("register_kprobe fail!\n");
		return;
	}
	printk("register_kprobe succeed!\n");
	_perf_callchain_user = (void*)kp.addr;
	unregister_kprobe(&kp);
}

/*
 * Name of the output file opened by init_file(). It is a relative path;
 * NOTE(review): presumably resolved against the working directory of the
 * process that loads the module - confirm.
 */
#define FILENAME        "dlog.txt"

/* Not referenced anywhere in the visible part of this file. */
#define TESTDIOMONITOR_FILE_MAXLEN  1024
/* Capacity, in frames, of each user-stack array and of the per-CPU callchain buffers. */
#define PERF_MAX_STACK_DEPTH		127

/*
 * One recorded wakeup. The first group of fields describes the task being
 * woken (the tracepoint argument); the "waker" group describes `current`
 * at the time the tracepoint fired. Filled by testdmonitor_add_sample() and
 * printed by write_file().
 */
typedef struct testdmonitor_sample {
    /* Timestamps taken when the sample is filled. */
    struct timespec64 time;     /* from ktime_get_real_ts64() */
    u64 timens;                 /* from gettscns() */

    /* Woken task. */
    int cpu;
    int pid;
    int tgid;
    int ppid;
    char comm[TASK_COMM_LEN];
    char ppidcomm[TASK_COMM_LEN];
    int stackn;                 /* valid entries in parray_stack */
    void *parray_stack[TEST_STACK_TRACE_ENTRIES];
    int stackn_user;            /* valid entries in parray_stack_user */
    void *parray_stack_user[PERF_MAX_STACK_DEPTH];

    /* Waking task. */
    int wakercpu;
    int wakerpid;
    int wakertgid;
    int wakerppid;
    char wakercomm[TASK_COMM_LEN];
    char wakerppidcomm[TASK_COMM_LEN];
    int wakerstackn;            /* valid entries in parray_wakerstack */
    void *parray_wakerstack[TEST_STACK_TRACE_ENTRIES];
    int waker_stackn_user;      /* valid entries in waker_parray_stack_user */
    void *waker_parray_stack_user[PERF_MAX_STACK_DEPTH];

    /* Set to 1 once the slot is completely filled; reset to 0 after it was written out. */
    u32 writedone;
} testdmonitor_sample;

/*
 * Number of slots in the sample ring. The value is used as an index mask
 * (MAXCOUNT - 1), so it has to stay a power of two. The expansion is
 * parenthesized so the macro is safe inside any surrounding expression
 * (e.g. `x / MAXCOUNT` or `x % MAXCOUNT`).
 */
#define TESTDIOMONITOR_SAMPLE_RINGBUFF_MAXCOUNT  (8192 * 4)

/*
 * Ring of samples shared between the tracepoint callback (producer) and the
 * delayed work that writes the file (consumer). wp and rp are free-running
 * counters; the slot index is the counter masked with (MAXCOUNT - 1).
 */
typedef struct testdmonitor_sample_ringbuff {
    testdmonitor_sample* parray_sample;     /* MAXCOUNT slots */
    volatile u64 wp;    /* next slot to claim; advanced with a cmpxchg by producers */
    volatile u64 rp;    /* next slot to write out; advanced by the file writer */
    u32 skipcount;      /* events dropped because the ring was full; 0 means none */
} testdmonitor_sample_ringbuff;

/* Size in bytes of the buffer used to format one output line. */
#define TESTDIOMONITOR_LINEBUFF  1024
/* Number of per-CPU callchain scratch buffers. */
#define MAX_CPU_COUNT   64

/* All module state: output file, line formatting buffer, sample ring and callchain buffers. */
typedef struct testdmonitor_env {
    struct file *file;                      /* output file, NULL when it could not be opened */
    char file_linebuff[TESTDIOMONITOR_LINEBUFF];
    int headoffset;                         /* length of the "[index] " prefix kept at the start of file_linebuff */
    loff_t file_pos;                        /* write offset handed to kernel_write() */
    testdmonitor_sample_ringbuff ringbuff;
    struct perf_callchain_entry *pentry[MAX_CPU_COUNT];    /* indexed by smp_processor_id() */
} testdmonitor_env;

/* Single global instance holding all module state. */
static testdmonitor_env _env;

/* Self-requeueing delayed work (handler: write_file) and the workqueue it runs on. */
static struct delayed_work work_write_file;
static struct workqueue_struct *wq_write_file;

void init_file(void)
{
    _env.file = filp_open(FILENAME, O_WRONLY | O_CREAT | O_TRUNC, 0644);
    if (IS_ERR(_env.file)) {
        _env.file = NULL;
    }
}

/* Close the output file if init_file() managed to open it. */
void exit_file(void)
{
    if (!_env.file)
        return;

    filp_close(_env.file, NULL);
}

/*
 * Append i_size bytes from i_pchar to the output file at the tracked offset.
 * Does nothing when no file is open; the result of kernel_write() is not
 * inspected (best effort).
 */
void testdmonitor_write_file(char* i_pchar, int i_size)
{
    if (!_env.file)
        return;

    kernel_write(_env.file, i_pchar, i_size, &_env.file_pos);
}

/* Write a single newline character to the output file. */
void testdmonitor_write_file_emptyline(void)
{
    char newline[] = "\n";

    /* sizeof counts the terminating NUL, which must not be written. */
    testdmonitor_write_file(newline, sizeof(newline) - 1);
}

/*
 * Format one line and write it to the output file.
 *
 * The text is formatted into _env.file_linebuff right after the first
 * _env.headoffset bytes, so whatever prefix is already stored there is
 * emitted in front of every line. Output that does not fit is truncated.
 */
void testdmonitor_file_oneline(const char* i_format, ...)
{
    va_list ap;
    int room = TESTDIOMONITOR_LINEBUFF - _env.headoffset;

    va_start(ap, i_format);
    vsnprintf(_env.file_linebuff + _env.headoffset, room, i_format, ap);
    va_end(ap);

    /* The whole buffer (prefix + formatted text) goes out in one write. */
    testdmonitor_write_file(_env.file_linebuff, strlen(_env.file_linebuff));
}

/*
 * Record the pid and comm of i_ptask's real parent in the woken-task fields
 * (ppid / ppidcomm) of io_psample.
 *
 * real_parent is read under rcu_read_lock(), and the copy happens inside
 * the same read-side section.
 */
void testdmonitor_checkget_parentinfo(testdmonitor_sample* io_psample, struct task_struct* i_ptask)
{
    struct task_struct* parent;

    rcu_read_lock();
    parent = rcu_dereference(i_ptask->real_parent);
    io_psample->ppid = parent->pid;
    /* strscpy replaces the deprecated strlcpy, which is gone from recent kernels. */
    strscpy(io_psample->ppidcomm, parent->comm, TASK_COMM_LEN);
    rcu_read_unlock();
}

/*
 * Record the pid and comm of i_ptask's real parent in the waker fields
 * (wakerppid / wakerppidcomm) of io_psample.
 *
 * real_parent is read under rcu_read_lock(), and the copy happens inside
 * the same read-side section.
 */
void testdmonitor_checkget_parentinfo_waker(testdmonitor_sample* io_psample, struct task_struct* i_ptask)
{
    struct task_struct* parent;

    rcu_read_lock();
    parent = rcu_dereference(i_ptask->real_parent);
    io_psample->wakerppid = parent->pid;
    /* strscpy replaces the deprecated strlcpy, which is gone from recent kernels. */
    strscpy(io_psample->wakerppidcomm, parent->comm, TASK_COMM_LEN);
    rcu_read_unlock();
}

static inline u64 gettscns(void)
{
    struct timespec64 ts;
    ktime_get_ts64(&ts);
    return timespec64_to_ns(&ts);
}

#define HEAD_USER       "user "

static void write_file(struct work_struct *w)
{
    //ssize_t ret;
    u32 index;
    testdmonitor_sample* psample;
    struct tm t;
    char timestr[64];
    int stacki;
    while (_env.ringbuff.rp != _env.ringbuff.wp) {
        index = (_env.ringbuff.rp & (TESTDIOMONITOR_SAMPLE_RINGBUFF_MAXCOUNT - 1));
        psample = &_env.ringbuff.parray_sample[index];
        if (psample->writedone != 1) {
            break;
        }
        testdmonitor_write_file_emptyline();
        _env.headoffset = sprintf(_env.file_linebuff, "[%llu] ", _env.ringbuff.rp);
        time64_to_tm(psample->time.tv_sec + 8 * 60 * 60, 0, &t);
        snprintf(timestr, 64, "%04ld-%02d-%02d-%02d_%02d_%02d.%09ld",
		    1900 + t.tm_year, t.tm_mon + 1, t.tm_mday, t.tm_hour, t.tm_min, t.tm_sec, psample->time.tv_nsec);
        testdmonitor_file_oneline("[skipcount:%u]begin...time[%s][%llu]wakercpu[%d]\n", 
            _env.ringbuff.skipcount, timestr, psample->timens, psample->wakercpu);
        testdmonitor_file_oneline("wakertgid[%d]wakerpid[%d]wakercomm[%s]wakerppid[%d]wakerppidcomm[%s]\n",
            psample->wakertgid, psample->wakerpid, psample->wakercomm, psample->wakerppid, psample->wakerppidcomm);
        testdmonitor_file_oneline("stack[%d]:\n", psample->wakerstackn);
        for (stacki = 0; stacki < psample->wakerstackn; stacki++) {
            testdmonitor_file_oneline("%*c%pS\n", 5, ' ', (void *)psample->parray_wakerstack[stacki]);
        }
        for (stacki = 0; stacki < psample->waker_stackn_user; stacki++) {
            testdmonitor_file_oneline(HEAD_USER "%d %*c%llx\n",
                psample->wakertgid, 5, ' ', (u64)psample->waker_parray_stack_user[stacki]);
        }
        testdmonitor_file_oneline("cpu[%d]tgid[%d]pid[%d]comm[%s]ppid[%d]ppidcomm[%s]\n",
            psample->cpu, psample->tgid, psample->pid, psample->comm, psample->ppid, psample->ppidcomm);
        testdmonitor_file_oneline("stack[%d]:\n", psample->stackn);
        for (stacki = 0; stacki < psample->stackn; stacki++) {
            testdmonitor_file_oneline("%*c%pS\n", 5, ' ', (void *)psample->parray_stack[stacki]);
        }
        for (stacki = 0; stacki < psample->stackn_user; stacki++) {
            testdmonitor_file_oneline(HEAD_USER "%d %*c%llx\n",
                psample->tgid, 5, ' ', (u64)psample->parray_stack_user[stacki]);
        }
        testdmonitor_write_file_emptyline();
        smp_wmb();
        psample->writedone = 0;
        _env.ringbuff.rp ++;
    }
    queue_delayed_work_on(nr_cpu_ids - 1, wq_write_file,
        &work_write_file, 1);
}

static void init_write_file(void)
{
    init_file();
    wq_write_file = alloc_workqueue("testdmonitor_write_file", WQ_MEM_RECLAIM, 0);
    INIT_DELAYED_WORK(&work_write_file, write_file);
    queue_delayed_work_on(nr_cpu_ids - 1, wq_write_file,
        &work_write_file, 3);
}

static void exit_write_file(void)
{
    cancel_delayed_work_sync(&work_write_file);
    destroy_workqueue(wq_write_file);
    exit_file();
}

void init_testdmonitor_sample_ringbuff(void) 
{
    int i;
    _env.ringbuff.parray_sample = kvzalloc(sizeof(testdmonitor_sample) * TESTDIOMONITOR_SAMPLE_RINGBUFF_MAXCOUNT, GFP_KERNEL);
    for (i = 0; i < MAX_CPU_COUNT; i++) {
        _env.pentry[i] = kmalloc(sizeof(struct perf_callchain_entry) + sizeof(__u64) * PERF_MAX_STACK_DEPTH, 
            GFP_KERNEL);
    }
}

/* Free every per-CPU callchain buffer and then the sample ring itself. */
void exit_testdmonitor_sample_ringbuff(void)
{
    int cpu = 0;

    while (cpu < MAX_CPU_COUNT) {
        kvfree(_env.pentry[cpu]);
        cpu++;
    }

    kvfree(_env.ringbuff.parray_sample);
}

/*
 * Claim the next free slot of the sample ring.
 *
 * The write counter is advanced with a compare-and-exchange, retrying when
 * another claimer got there first. When the distance between the write and
 * read counters has reached the ring capacity, the event is dropped:
 * skipcount is incremented and NULL is returned.
 *
 * Returns a pointer to the claimed slot, or NULL when the ring is full.
 */
testdmonitor_sample* testdmonitor_get_psample(void)
{
    u64 claimed;
    u64 seen;
    u32 slot;

    for (;;) {
        claimed = _env.ringbuff.wp;
        if (claimed - _env.ringbuff.rp >= (u64)(TESTDIOMONITOR_SAMPLE_RINGBUFF_MAXCOUNT)) {
            _env.ringbuff.skipcount ++;
            return NULL;
        }

        /* The cmpxchg hands back the value it found; a match means we own `claimed`. */
        seen = atomic64_cmpxchg((atomic64_t*)&_env.ringbuff.wp,
            claimed, claimed + 1);
        if (seen != claimed) {
            continue;
        }

        slot = (u32)(claimed & (u64)(TESTDIOMONITOR_SAMPLE_RINGBUFF_MAXCOUNT - 1));
        return &_env.ringbuff.parray_sample[slot];
    }
}

void store_user_stack(int *o_pstackn_user, void** parray_stack_user, struct task_struct *i_p)
{
    int i;
    struct perf_callchain_entry_ctx ctx;
    struct perf_callchain_entry* pentry = NULL;
    struct pt_regs *regs = task_pt_regs(i_p);
    *o_pstackn_user = 0;
    pentry = _env.pentry[smp_processor_id()];
    ctx.entry     = pentry;
    ctx.max_stack = PERF_MAX_STACK_DEPTH;
    ctx.nr	      = pentry->nr = 0;
    ctx.contexts       = 0;
    ctx.contexts_maxed = false;
    _perf_callchain_user(&ctx, regs);
    *o_pstackn_user = pentry->nr;
    for (i = 0; i < pentry->nr; i++) {
        parray_stack_user[i] = (void*)pentry->ip[i];
    }
}

/*
 * Record one wakeup: i_task is the task being woken, `current` is the waker.
 *
 * A ring slot is claimed first; when the ring is full the event is dropped.
 * For both tasks the identity, parent info, kernel stack and user stack are
 * stored. The slot is published last by setting writedone to 1 after a write
 * barrier, so the file writer never sees a half-filled sample.
 */
void testdmonitor_add_sample(struct task_struct* i_task)
{
    testdmonitor_sample* psample = testdmonitor_get_psample();
    if (!psample) {
        return;
    }
    ktime_get_real_ts64(&psample->time);
    psample->timens = gettscns();

    /* Woken task. */
    psample->cpu = task_cpu(i_task);
    psample->pid = i_task->pid;
    psample->tgid = i_task->tgid;
    /* strscpy replaces the deprecated strlcpy, which is gone from recent kernels. */
    strscpy(psample->comm, i_task->comm, TASK_COMM_LEN);
    testdmonitor_checkget_parentinfo(psample, i_task);
    psample->stackn = _stack_trace_save_tsk(i_task, (unsigned long*)psample->parray_stack, TEST_STACK_TRACE_ENTRIES, 0);
    store_user_stack(&psample->stackn_user, psample->parray_stack_user, i_task);

    /* Waker (the task running this callback). */
    psample->wakercpu = smp_processor_id();
    psample->wakerpid = current->pid;
    psample->wakertgid = current->tgid;
    strscpy(psample->wakercomm, current->comm, TASK_COMM_LEN);
    testdmonitor_checkget_parentinfo_waker(psample, current);
    psample->wakerstackn = _stack_trace_save_tsk(current, (unsigned long*)psample->parray_wakerstack, TEST_STACK_TRACE_ENTRIES, 0);
    store_user_stack(&psample->waker_stackn_user, psample->waker_parray_stack_user, current);

    /* Publish: all fields above must be visible before the flag. */
    smp_wmb();
    psample->writedone = 1;
}

/*
 * sched_waking tracepoint probe.
 *
 * @i_data: probe private data (unused; registered as NULL).
 * @i_p:    task being woken.
 *
 * Ignores tasks that are on a CPU or have on_rq == 1, then records a sample
 * only for the configured pair: `current` must be wakerpid and the woken
 * task must be wakeepid.
 */
static void cb_sched_waking(void *i_data, struct task_struct *i_p)
{
    if (i_p->on_cpu || i_p->on_rq == 1)
        return;

    if (current->pid != wakerpid || i_p->pid != wakeepid)
        return;

    testdmonitor_add_sample(i_p);
}

/* Bookkeeping for one kernel tracepoint this module attaches to. */
struct kern_tracepoint {
    void *callback;             /* probe function */
    struct tracepoint *ptr;     /* tracepoint found by name, NULL if not found */
    bool bregister;             /* true once the probe has been registered */
};

/* Unregister the probe of @tp, but only if it was registered before. */
static void clear_kern_tracepoint(struct kern_tracepoint *tp)
{
    if (!tp->bregister)
        return;

    tracepoint_probe_unregister(tp->ptr, tp->callback, NULL);
}

/*
 * INIT_KERN_TRACEPOINT(name) defines the static bookkeeping object
 * mykern_<name>, initially empty. The trailing semicolon is part of the
 * macro, so it is used without one.
 */
#define INIT_KERN_TRACEPOINT(tracepoint_name)                       \
    static struct kern_tracepoint mykern_##tracepoint_name = {      \
        .callback = NULL,                                           \
        .ptr = NULL,                                                \
        .bregister = false,                                         \
    };

/*
 * TRACEPOINT_CHECK_AND_SET(name) defines <name>_tracepoint_check_and_set(),
 * a visitor for for_each_kernel_tracepoint(): when the visited tracepoint is
 * called <name>, it is stored in the kern_tracepoint passed as priv.
 */
#define TRACEPOINT_CHECK_AND_SET(tracepoint_name)                                             \
    static void tracepoint_name##_tracepoint_check_and_set(struct tracepoint *tp, void *priv) \
    {                                                                                         \
        struct kern_tracepoint *slot = priv;                                                  \
        if (strcmp(#tracepoint_name, tp->name) == 0)                                          \
            slot->ptr = tp;                                                                   \
    }

INIT_KERN_TRACEPOINT(sched_waking)
TRACEPOINT_CHECK_AND_SET(sched_waking)

typedef unsigned long (*kallsyms_lookup_name_func)(const char *name);
kallsyms_lookup_name_func _kallsyms_lookup_name_func;

void* get_func_by_symbol_name_kallsyms_lookup_name(void)
{
    int ret;
    void* pfunc = NULL;
	struct kprobe kp;
	memset(&kp, 0, sizeof(kp));
	kp.symbol_name = "kallsyms_lookup_name";
	kp.pre_handler = NULL;
	kp.addr = NULL;	// 作为强调，提示使用symbol_name
	ret = register_kprobe(&kp);
	if (ret < 0) {
		printk("register_kprobe fail!\n");
		return NULL;
	}
	printk("register_kprobe succeed!\n");
    pfunc = (void*)kp.addr;
	unregister_kprobe(&kp);
    return pfunc;
}

/*
 * Look up i_symbol through the previously resolved lookup function.
 *
 * Returns whatever the lookup function reports for the symbol, or NULL when
 * the lookup function itself has not been resolved.
 */
void* get_func_by_symbol_name(const char* i_symbol)
{
    if (_kallsyms_lookup_name_func != NULL)
        return (void *)_kallsyms_lookup_name_func(i_symbol);

    return NULL;
}

/*
 * Module entry point.
 *
 * Order matters: every lookup that can fail without side effects (symbols,
 * tracepoint) is done first, so those failures need no cleanup. Only then
 * are the buffers allocated and the writer work started, and every failure
 * after that point undoes what was set up. Returning an error while the
 * delayed work is still queued would leave it running in a module that
 * failed to load.
 *
 * Returns 0 on success, -EINVAL for missing parameters, -ENOENT when a
 * symbol or the tracepoint cannot be found, -ENOMEM on allocation failure,
 * or the error reported by tracepoint_probe_register().
 */
static int __init testwakeup_init(void)
{
    int ret;
    int i;

    if (wakerpid == 0 || wakeepid == 0) {
        printk(KERN_ERR "wakerpid and wakeepid should NOT be 0\n");
        return -EINVAL;
    }

    /* Step 1: lookups, nothing to undo on failure. */
    _kallsyms_lookup_name_func = get_func_by_symbol_name_kallsyms_lookup_name();
    _stack_trace_save_tsk = get_func_by_symbol_name("stack_trace_save_tsk");
    if (_stack_trace_save_tsk == NULL) {
        printk(KERN_ERR "get_func_by_symbol_name stack_trace_save_tsk failed!\n");
        return -ENOENT;
    }
    init_get_perf_callchain_func();
    if (_perf_callchain_user == NULL) {
        /* Would otherwise be called through a NULL pointer on the first sample. */
        printk(KERN_ERR "resolve perf_callchain_user failed!\n");
        return -ENOENT;
    }
    mykern_sched_waking.callback = cb_sched_waking;
    for_each_kernel_tracepoint(sched_waking_tracepoint_check_and_set, &mykern_sched_waking);
    if (!mykern_sched_waking.ptr) {
        printk(KERN_ERR "mykern_sched_waking register failed!\n");
        return -ENOENT;
    }

    /* Step 2: buffers. */
    init_testdmonitor_sample_ringbuff();
    if (!_env.ringbuff.parray_sample) {
        ret = -ENOMEM;
        goto err_free_ringbuff;
    }
    for (i = 0; i < MAX_CPU_COUNT; i++) {
        if (!_env.pentry[i]) {
            ret = -ENOMEM;
            goto err_free_ringbuff;
        }
    }

    /* Step 3: output file and writer work. */
    init_write_file();
    if (!wq_write_file) {
        /* No workqueue means nothing was queued; only the file needs closing. */
        exit_file();
        ret = -ENOMEM;
        goto err_free_ringbuff;
    }

    /* Step 4: attach the probe last, once everything it uses is ready. */
    ret = tracepoint_probe_register(mykern_sched_waking.ptr, mykern_sched_waking.callback, NULL);
    if (ret) {
        printk(KERN_ERR "mykern_sched_waking register failed!\n");
        goto err_stop_writer;
    }
    mykern_sched_waking.bregister = true;
    printk(KERN_INFO "mykern_sched_waking register succeeded!\n");
    return 0;

err_stop_writer:
    exit_write_file();
err_free_ringbuff:
    exit_testdmonitor_sample_ringbuff();
    return ret;
}

/*
 * Module exit: detach the probe (if it was registered), wait for in-flight
 * probe calls to finish, stop the file writer and release the buffers.
 */
static void __exit testwakeup_exit(void)
{
    struct kern_tracepoint *waking = &mykern_sched_waking;

    if (waking->bregister)
        tracepoint_probe_unregister(waking->ptr, waking->callback, NULL);
    tracepoint_synchronize_unregister();

    exit_write_file();
    exit_testdmonitor_sample_ringbuff();
}

/* Register the load and unload entry points of the module. */
module_init(testwakeup_init);
module_exit(testwakeup_exit);

上面的ko程序有抓取用户栈的PC，相关的用户栈的解析见之前的博客。

2.2 使用方法

使用方法还是比较简单的，就是insmod时传入wakerpid和wakeepid这两个模块参数（参数名必须与代码里module_param声明的名字一致，否则两个值都保持为0，模块会直接返回-EINVAL加载失败），如下：

insmod testwakeup.ko wakerpid=1 wakeepid=10

抓取的就是tid是1的线程唤醒tid是10的线程的唤醒栈。

三、相关解释

3.1 抓到的这次没有优先级继承的唤醒栈

通过ftrace开启抓取叠加同时insmod上面第二章的ko，在抓取到异常事件的trace后，rmmod掉ko，运行用户态解析程序，解析出完整的带用户栈的调用栈信息。根据perfetto里打开异常的trace里看到的时间来找对应的包含用户栈的完整调用栈，得到如下这个唤醒栈：

可以从上图里看到，唤醒者在执行内存分配的动作，内存分配用的jemalloc的库，在执行内存分配逻辑时触发了madvise的操作，而madvise的操作里有用到读写sem（上图里的up_read就是线索），这个读写sem，其实就是著名的mmap的读写sem，而被唤醒者也正在需要用这把mmap读写sem，因为被唤醒者在执行mmap的逻辑。do_madvise里调用的up_read的相关代码逻辑如下：

mmap的读写sem是一个进程内的锁，是一个与映射有关的内存操作都需要用到的一把常见锁，虽然rt-linux里的mutex和spinlock都是优先级继承的，但是读写锁并不是。

3.2 非正在运行的调用栈会抓取不全用户栈

可以从 3.1 里说到的抓到的下图的堆栈里可以看到：

被唤醒者，也就是在抓取时候没有正在运行的任务，其用户栈的抓取可能是不全的，无论是用perf抓还是用自己写的ko抓，都是一样的现象。

这种情况下，如果要进一步来找更细的用户栈，就需要去on-cpu的进行抓取，可通过：

bash 复制代码

perf record -g -t <tid>

来抓一个具体的线程，然后再perf script去找到底是哪个具体的调用链路执行的mmap系统调用。