进程内mmap锁相互干扰问题

一、背景

在调查进程内非预期的较长时间的唤醒关系时,已经增加了 ftrace 的 trace_marker 打点,并从抓到的异常状态的 trace 里发现:进程内的两个线程之间有一次唤醒关系,而从增加了 trace_marker 的用户态函数逻辑来看,这两个线程之间并不存在锁关联。且这次唤醒关系里,被唤醒者是高优线程,唤醒者是低优线程;唤醒者由于优先级低,被其他中优或者高优线程抢占,导致等了较长的时间后才继续执行逻辑,再去唤醒高优线程,如下图:

从上图里可以看到,低优线程528169被抢占了很久之后才继续运行,并唤醒了528164线程,而528164线程是以D状态陷入睡眠的,说明它是在等内核的锁。我们用的是rt-linux系统,底层的内核锁如果是mutex或者spinlock的话,是有优先级继承的;如果是这样,唤醒者528169应该更早被提高优先级而得到调度,优先级如此之高(已经设置成了99这种rt优先级)的528164线程也就可以更早被唤醒。所以可以说明这个锁并不是内核里的mutex或者spinlock。那么到底是什么锁呢?我们在下面第二章里通过ko来抓这样的唤醒栈。另外,这部分逻辑可能也涉及用户栈,所以抓取程序也抓用户栈的内容,相关用户态解析的程序及相关的逻辑见之前的博客《内核逻辑里抓取用户栈的几种方法》和《用户栈的高效解析逻辑》。另外,在第三章里,我们针对抓取程序及抓取到的内容进行相关解释。

二、抓取程序的ko

2.1 抓取唤醒栈的程序

cpp 复制代码
#include <linux/module.h>
#include <linux/capability.h>
#include <linux/sched.h>
#include <linux/uaccess.h>
#include <linux/proc_fs.h>
#include <linux/ctype.h>
#include <linux/seq_file.h>
#include <linux/poll.h>
#include <linux/types.h>
#include <linux/ioctl.h>
#include <linux/errno.h>
#include <linux/stddef.h>
#include <linux/lockdep.h>
#include <linux/kthread.h>
#include <linux/sched.h>
#include <linux/delay.h>
#include <linux/wait.h>
#include <linux/init.h>
#include <asm/atomic.h>
#include <trace/events/workqueue.h>
#include <linux/sched/clock.h>
#include <linux/string.h>
#include <linux/mm.h>
#include <linux/interrupt.h>
#include <linux/tracepoint.h>
#include <trace/events/osmonitor.h>
#include <trace/events/sched.h>
#include <trace/events/irq.h>
#include <trace/events/kmem.h>
#include <linux/ptrace.h>
#include <linux/uaccess.h>
#include <asm/processor.h>
#include <linux/sched/task_stack.h>
#include <linux/nmi.h>
#include <linux/version.h>
#include <linux/sched/mm.h>
#include <asm/irq_regs.h>
#include <linux/kallsyms.h>
#include <linux/kprobes.h>
#include <linux/stop_machine.h>
#include <linux/perf_event.h>

/* Kernel module metadata. */
MODULE_LICENSE("GPL");
MODULE_AUTHOR("zhaoxin");
MODULE_DESCRIPTION("Module for monitor wakeup.");
MODULE_VERSION("1.0");

/*
 * Module parameters, declared with S_IRUGO permissions.  Both values are
 * compared against task_struct::pid in cb_sched_waking(), and
 * testwakeup_init() refuses to load when either one is left at 0.
 */
/* pid of the task performing the wakeup. */
static int wakerpid = 0;
module_param(wakerpid, int, S_IRUGO);
/* pid of the task being woken up. */
static int wakeepid = 0;
module_param(wakeepid, int, S_IRUGO);

/* Maximum number of kernel stack frames stored per task in one sample. */
#define TEST_STACK_TRACE_ENTRIES   32

/*
 * Signature used to call the function looked up under the name
 * "stack_trace_save_tsk" in testwakeup_init().  The pointer is filled in at
 * load time through get_func_by_symbol_name().
 */
typedef unsigned int (*stack_trace_save_tsk_func)(struct task_struct *task,
				  unsigned long *store, unsigned int size,
				  unsigned int skipnr);
stack_trace_save_tsk_func _stack_trace_save_tsk;

/*
 * Signature used to call the function looked up under the name
 * "perf_callchain_user"; the pointer is filled in by
 * init_get_perf_callchain_func() and used by store_user_stack().
 */
typedef void (*perf_callchain_user_func)(struct perf_callchain_entry_ctx *entry,
				struct pt_regs *regs);
perf_callchain_user_func _perf_callchain_user;

void init_get_perf_callchain_func(void)
{
	int ret;
	struct kprobe kp;
	memset(&kp, 0, sizeof(kp));
	kp.symbol_name = "perf_callchain_user";
	kp.pre_handler = NULL;
	kp.addr = NULL;	// 作为强调,提示使用symbol_name
	ret = register_kprobe(&kp);
	if (ret < 0) {
		printk("register_kprobe fail!\n");
		return;
	}
	printk("register_kprobe succeed!\n");
	_perf_callchain_user = (void*)kp.addr;
	unregister_kprobe(&kp);
}

/* Path passed to filp_open() in init_file(); note that it is a relative path. */
#define FILENAME        "dlog.txt"

/* Not referenced anywhere in the visible part of this file. */
#define TESTDIOMONITOR_FILE_MAXLEN  1024
/* Maximum number of user-space frames stored per task in one sample. */
#define PERF_MAX_STACK_DEPTH		127

/*
 * One captured wakeup event: a snapshot of the woken task (wakee) followed by
 * a snapshot of the task performing the wakeup (waker).  Slots are filled by
 * testdmonitor_add_sample() and printed by write_file().
 */
typedef struct testdmonitor_sample {
    struct timespec64 time;                             /* wall-clock time from ktime_get_real_ts64() */
    u64 timens;                                         /* gettscns() value taken at capture time */
    int cpu;                                            /* task_cpu() of the wakee */
    int pid;                                            /* wakee pid */
    int tgid;                                           /* wakee tgid */
    int ppid;                                           /* pid of the wakee's real_parent */
    char comm[TASK_COMM_LEN];                           /* wakee comm */
    char ppidcomm[TASK_COMM_LEN];                       /* comm of the wakee's real_parent */
    int stackn;                                         /* number of valid entries in parray_stack */
    void* parray_stack[TEST_STACK_TRACE_ENTRIES];       /* wakee kernel stack */
    int stackn_user;                                    /* number of valid entries in parray_stack_user */
    void* parray_stack_user[PERF_MAX_STACK_DEPTH];      /* wakee user-space call chain */
    int wakercpu;                                       /* CPU the capture ran on (smp_processor_id()) */
    int wakerpid;                                       /* waker pid (current->pid) */
    int wakertgid;                                      /* waker tgid */
    int wakerppid;                                      /* pid of the waker's real_parent */
    char wakercomm[TASK_COMM_LEN];                      /* waker comm */
    char wakerppidcomm[TASK_COMM_LEN];                  /* comm of the waker's real_parent */
    int wakerstackn;                                    /* number of valid entries in parray_wakerstack */
    void* parray_wakerstack[TEST_STACK_TRACE_ENTRIES];  /* waker kernel stack */
    int waker_stackn_user;                              /* number of valid entries in waker_parray_stack_user */
    void* waker_parray_stack_user[PERF_MAX_STACK_DEPTH];/* waker user-space call chain */
    u32 writedone;  /* set to 1 by the producer once the slot is filled; reset to 0 by write_file() */
} testdmonitor_sample;

/*
 * Number of slots in the sample ring buffer.  It has to stay a power of two
 * because slot indices are computed as (counter & (COUNT - 1)).  The
 * expansion is parenthesized so the macro evaluates correctly inside any
 * expression (e.g. as the right-hand operand of '/' or '%').
 */
#define TESTDIOMONITOR_SAMPLE_RINGBUFF_MAXCOUNT  (8192 * 4)

/*
 * Ring buffer of samples.  Producers claim slots by advancing wp in
 * testdmonitor_get_psample(); write_file() drains slots and advances rp.
 * Both counters only ever grow; the slot index is the counter masked with
 * (TESTDIOMONITOR_SAMPLE_RINGBUFF_MAXCOUNT - 1).
 */
typedef struct testdmonitor_sample_ringbuff {
    testdmonitor_sample* parray_sample; /* slot array allocated in init_testdmonitor_sample_ringbuff() */
    volatile u64 wp;    /* write counter; index is wp & (TESTDIOMONITOR_SAMPLE_RINGBUFF_MAXCOUNT - 1) */
    volatile u64 rp;    /* read counter; index is rp & (TESTDIOMONITOR_SAMPLE_RINGBUFF_MAXCOUNT - 1) */
    u32 skipcount;  /* number of events dropped because the ring was full; 0 means nothing was dropped */
} testdmonitor_sample_ringbuff;

/* Size in bytes of the line buffer used to format one output line. */
#define TESTDIOMONITOR_LINEBUFF  1024
/* Number of per-CPU call-chain scratch entries kept in testdmonitor_env. */
#define MAX_CPU_COUNT   64

/* All mutable state of the module, gathered in one structure. */
typedef struct testdmonitor_env {
    struct file* file;                          /* output file, or NULL when it could not be opened */
    char file_linebuff[TESTDIOMONITOR_LINEBUFF];/* line under construction: prefix followed by formatted text */
    int headoffset;                             /* length of the prefix that write_file() puts at the start of file_linebuff */
    loff_t file_pos;                            /* file position passed to kernel_write() */
    testdmonitor_sample_ringbuff ringbuff;      /* captured samples waiting to be written */
    struct perf_callchain_entry* pentry[MAX_CPU_COUNT]; /* scratch call-chain entries, indexed by smp_processor_id() */
} testdmonitor_env;

/* The single instance of the module state. */
static testdmonitor_env _env;

/* Delayed work item whose handler is write_file(); set up in init_write_file(). */
static struct delayed_work work_write_file;
/* Workqueue on which work_write_file is queued. */
static struct workqueue_struct *wq_write_file;

/*
 * Open (create/truncate) the output file FILENAME for writing and remember
 * it in _env.file.  When the open fails, _env.file ends up NULL so that the
 * write helpers silently skip output.
 */
void init_file(void)
{
    struct file* opened = filp_open(FILENAME, O_WRONLY | O_CREAT | O_TRUNC, 0644);

    _env.file = IS_ERR(opened) ? NULL : opened;
}

/* Close the output file if init_file() managed to open it. */
void exit_file(void)
{
    if (!_env.file) {
        return;
    }
    filp_close(_env.file, NULL);
}

/*
 * Append i_size bytes starting at i_pchar to the output file at the position
 * tracked in _env.file_pos.  Does nothing when no file is open; the result
 * of kernel_write() is not inspected.
 */
void testdmonitor_write_file(char* i_pchar, int i_size)
{
    if (!_env.file) {
        return;
    }
    kernel_write(_env.file, i_pchar, i_size, &_env.file_pos);
}

/* Write a single newline character to the output file. */
void testdmonitor_write_file_emptyline(void)
{
    /* The separator is exactly one byte long. */
    testdmonitor_write_file("\n", 1);
}

/*
 * Format one line behind the prefix already stored in _env.file_linebuff
 * (the first _env.headoffset bytes) and write the whole buffer, prefix
 * included, to the output file.
 *
 * i_format: printf-style format string, followed by its arguments.
 */
void testdmonitor_file_oneline(const char* i_format, ...)
{
    const int room = TESTDIOMONITOR_LINEBUFF - _env.headoffset;
    va_list ap;

    va_start(ap, i_format);
    vsnprintf(_env.file_linebuff + _env.headoffset, room, i_format, ap);
    va_end(ap);

    testdmonitor_write_file(_env.file_linebuff, strlen(_env.file_linebuff));
}

/*
 * Record the pid and comm of i_ptask's real_parent in the wakee fields
 * (ppid / ppidcomm) of the sample.  The parent pointer is only dereferenced
 * inside an RCU read-side critical section.
 */
void testdmonitor_checkget_parentinfo(testdmonitor_sample* io_psample, struct task_struct* i_ptask)
{
    struct task_struct* real_parent;

    rcu_read_lock();
    real_parent = rcu_dereference(i_ptask->real_parent);
    io_psample->ppid = real_parent->pid;
    strlcpy(io_psample->ppidcomm, real_parent->comm, TASK_COMM_LEN);
    rcu_read_unlock();
}

/*
 * Record the pid and comm of i_ptask's real_parent in the waker fields
 * (wakerppid / wakerppidcomm) of the sample.  The parent pointer is only
 * dereferenced inside an RCU read-side critical section.
 */
void testdmonitor_checkget_parentinfo_waker(testdmonitor_sample* io_psample, struct task_struct* i_ptask)
{
    struct task_struct* real_parent;

    rcu_read_lock();
    real_parent = rcu_dereference(i_ptask->real_parent);
    io_psample->wakerppid = real_parent->pid;
    strlcpy(io_psample->wakerppidcomm, real_parent->comm, TASK_COMM_LEN);
    rcu_read_unlock();
}

static inline u64 gettscns(void)
{
    struct timespec64 ts;
    ktime_get_ts64(&ts);
    return timespec64_to_ns(&ts);
}

/* Prefix that write_file() puts in front of every user-space frame line. */
#define HEAD_USER       "user "

static void write_file(struct work_struct *w)
{
    //ssize_t ret;
    u32 index;
    testdmonitor_sample* psample;
    struct tm t;
    char timestr[64];
    int stacki;
    while (_env.ringbuff.rp != _env.ringbuff.wp) {
        index = (_env.ringbuff.rp & (TESTDIOMONITOR_SAMPLE_RINGBUFF_MAXCOUNT - 1));
        psample = &_env.ringbuff.parray_sample[index];
        if (psample->writedone != 1) {
            break;
        }
        testdmonitor_write_file_emptyline();
        _env.headoffset = sprintf(_env.file_linebuff, "[%llu] ", _env.ringbuff.rp);
        time64_to_tm(psample->time.tv_sec + 8 * 60 * 60, 0, &t);
        snprintf(timestr, 64, "%04ld-%02d-%02d-%02d_%02d_%02d.%09ld",
		    1900 + t.tm_year, t.tm_mon + 1, t.tm_mday, t.tm_hour, t.tm_min, t.tm_sec, psample->time.tv_nsec);
        testdmonitor_file_oneline("[skipcount:%u]begin...time[%s][%llu]wakercpu[%d]\n", 
            _env.ringbuff.skipcount, timestr, psample->timens, psample->wakercpu);
        testdmonitor_file_oneline("wakertgid[%d]wakerpid[%d]wakercomm[%s]wakerppid[%d]wakerppidcomm[%s]\n",
            psample->wakertgid, psample->wakerpid, psample->wakercomm, psample->wakerppid, psample->wakerppidcomm);
        testdmonitor_file_oneline("stack[%d]:\n", psample->wakerstackn);
        for (stacki = 0; stacki < psample->wakerstackn; stacki++) {
            testdmonitor_file_oneline("%*c%pS\n", 5, ' ', (void *)psample->parray_wakerstack[stacki]);
        }
        for (stacki = 0; stacki < psample->waker_stackn_user; stacki++) {
            testdmonitor_file_oneline(HEAD_USER "%d %*c%llx\n",
                psample->wakertgid, 5, ' ', (u64)psample->waker_parray_stack_user[stacki]);
        }
        testdmonitor_file_oneline("cpu[%d]tgid[%d]pid[%d]comm[%s]ppid[%d]ppidcomm[%s]\n",
            psample->cpu, psample->tgid, psample->pid, psample->comm, psample->ppid, psample->ppidcomm);
        testdmonitor_file_oneline("stack[%d]:\n", psample->stackn);
        for (stacki = 0; stacki < psample->stackn; stacki++) {
            testdmonitor_file_oneline("%*c%pS\n", 5, ' ', (void *)psample->parray_stack[stacki]);
        }
        for (stacki = 0; stacki < psample->stackn_user; stacki++) {
            testdmonitor_file_oneline(HEAD_USER "%d %*c%llx\n",
                psample->tgid, 5, ' ', (u64)psample->parray_stack_user[stacki]);
        }
        testdmonitor_write_file_emptyline();
        smp_wmb();
        psample->writedone = 0;
        _env.ringbuff.rp ++;
    }
    queue_delayed_work_on(nr_cpu_ids - 1, wq_write_file,
        &work_write_file, 1);
}

static void init_write_file(void)
{
    init_file();
    wq_write_file = alloc_workqueue("testdmonitor_write_file", WQ_MEM_RECLAIM, 0);
    INIT_DELAYED_WORK(&work_write_file, write_file);
    queue_delayed_work_on(nr_cpu_ids - 1, wq_write_file,
        &work_write_file, 3);
}

static void exit_write_file(void)
{
    cancel_delayed_work_sync(&work_write_file);
    destroy_workqueue(wq_write_file);
    exit_file();
}

void init_testdmonitor_sample_ringbuff(void) 
{
    int i;
    _env.ringbuff.parray_sample = kvzalloc(sizeof(testdmonitor_sample) * TESTDIOMONITOR_SAMPLE_RINGBUFF_MAXCOUNT, GFP_KERNEL);
    for (i = 0; i < MAX_CPU_COUNT; i++) {
        _env.pentry[i] = kmalloc(sizeof(struct perf_callchain_entry) + sizeof(__u64) * PERF_MAX_STACK_DEPTH, 
            GFP_KERNEL);
    }
}

/*
 * Free every per-CPU call-chain scratch entry and then the sample ring
 * itself.
 */
void exit_testdmonitor_sample_ringbuff(void)
{
    int slot = 0;

    while (slot < MAX_CPU_COUNT) {
        kvfree(_env.pentry[slot]);
        slot++;
    }

    kvfree(_env.ringbuff.parray_sample);
}

/*
 * Claim the next free slot of the sample ring for the caller.
 *
 * The write counter wp is advanced with a compare-and-exchange loop, so
 * several callers can claim distinct slots concurrently.
 *
 * Returns a pointer to the claimed slot, or NULL when
 *  - the ring was never allocated (allocation failed at init), or
 *  - the ring is full, i.e. wp is a whole ring ahead of rp; in that case
 *    skipcount is incremented to record the dropped event.
 */
testdmonitor_sample* testdmonitor_get_psample(void)
{
    u64 windex_raw, windex_raw_old;
    u32 windex;

    /* Nothing to hand out if the ring allocation failed. */
    if (!_env.ringbuff.parray_sample) {
        return NULL;
    }
    while (1) {
        windex_raw = _env.ringbuff.wp;
        if (windex_raw - _env.ringbuff.rp >= (u64)(TESTDIOMONITOR_SAMPLE_RINGBUFF_MAXCOUNT)) {
            _env.ringbuff.skipcount ++;
            return NULL;
        }
        /* atomic64_cmpxchg returns the old value; the claim succeeded only
         * if nobody else moved wp in the meantime. */
        windex_raw_old = atomic64_cmpxchg((atomic64_t*)&_env.ringbuff.wp,
            windex_raw, windex_raw + 1);
        if (windex_raw_old == windex_raw) {
            break;
        }
    }
    windex = (u32)(windex_raw & (u64)(TESTDIOMONITOR_SAMPLE_RINGBUFF_MAXCOUNT - 1));
    return &_env.ringbuff.parray_sample[windex];
}

void store_user_stack(int *o_pstackn_user, void** parray_stack_user, struct task_struct *i_p)
{
    int i;
    struct perf_callchain_entry_ctx ctx;
    struct perf_callchain_entry* pentry = NULL;
    struct pt_regs *regs = task_pt_regs(i_p);
    *o_pstackn_user = 0;
    pentry = _env.pentry[smp_processor_id()];
    ctx.entry     = pentry;
    ctx.max_stack = PERF_MAX_STACK_DEPTH;
    ctx.nr	      = pentry->nr = 0;
    ctx.contexts       = 0;
    ctx.contexts_maxed = false;
    _perf_callchain_user(&ctx, regs);
    *o_pstackn_user = pentry->nr;
    for (i = 0; i < pentry->nr; i++) {
        parray_stack_user[i] = (void*)pentry->ip[i];
    }
}

void testdmonitor_add_sample(struct task_struct* i_task)
{
    testdmonitor_sample* psample = testdmonitor_get_psample();
    if (!psample) {
        return;
    }
    ktime_get_real_ts64(&psample->time);
    psample->timens = gettscns();
    psample->cpu = task_cpu(i_task);
    psample->pid = i_task->pid;
    psample->tgid = i_task->tgid;
    strlcpy(psample->comm, i_task->comm, TASK_COMM_LEN);
    testdmonitor_checkget_parentinfo(psample, i_task);
    psample->stackn = _stack_trace_save_tsk(i_task, (unsigned long*)psample->parray_stack, TEST_STACK_TRACE_ENTRIES, 0);
    store_user_stack(&psample->stackn_user, psample->parray_stack_user, i_task);
    psample->wakercpu = smp_processor_id();
    psample->wakerpid = current->pid;
    psample->wakertgid = current->tgid;
    strlcpy(psample->wakercomm, current->comm, TASK_COMM_LEN);
    testdmonitor_checkget_parentinfo_waker(psample, current);
    psample->wakerstackn = _stack_trace_save_tsk(current, (unsigned long*)psample->parray_wakerstack, TEST_STACK_TRACE_ENTRIES, 0);
    store_user_stack(&psample->waker_stackn_user, psample->waker_parray_stack_user, current);
    smp_wmb();
    psample->writedone = 1;
}

/*
 * Probe attached to the sched_waking tracepoint.
 *
 * Wakeups of a task that is currently on a CPU, or whose on_rq field equals
 * 1, are ignored.  A sample is recorded only when the current task's pid
 * matches the wakerpid parameter and the woken task's pid matches wakeepid.
 *
 * i_data: probe private data (unused).
 * i_p:    task being woken.
 */
static void cb_sched_waking(void *i_data, struct task_struct *i_p)
{
    if (i_p->on_cpu || i_p->on_rq == 1) {
        return;
    }
    if (current->pid != wakerpid || i_p->pid != wakeepid) {
        return;
    }
    testdmonitor_add_sample(i_p);
}

/* Bookkeeping for one kernel tracepoint this module attaches a probe to. */
struct kern_tracepoint {
    void *callback;             /* probe function to attach */
    struct tracepoint *ptr;     /* tracepoint found by name, or NULL if not found */
    bool bregister;             /* true once the probe has been registered */
};
/* Detach the probe described by tp, if it was registered. */
static void clear_kern_tracepoint(struct kern_tracepoint *tp)
{
    if (!tp->bregister) {
        return;
    }
    tracepoint_probe_unregister(tp->ptr, tp->callback, NULL);
}

/*
 * Define the static bookkeeping object mykern_<tracepoint_name>, with all
 * fields cleared.  The macro supplies the terminating semicolon itself.
 */
#define INIT_KERN_TRACEPOINT(tracepoint_name)                     \
    static struct kern_tracepoint mykern_##tracepoint_name = {    \
        .callback = NULL,                                         \
        .ptr = NULL,                                              \
        .bregister = false,                                       \
    };

/*
 * Define <tracepoint_name>_tracepoint_check_and_set(), a visitor suitable
 * for for_each_kernel_tracepoint(): when the visited tracepoint's name
 * equals tracepoint_name, it is stored in the kern_tracepoint passed as
 * priv.
 */
#define TRACEPOINT_CHECK_AND_SET(tracepoint_name)                                             \
    static void tracepoint_name##_tracepoint_check_and_set(struct tracepoint *tp, void *priv) \
    {                                                                                         \
        struct kern_tracepoint *found = priv;                                                 \
                                                                                              \
        if (strcmp(#tracepoint_name, tp->name) == 0)                                          \
            found->ptr = tp;                                                                  \
    }

/* Bookkeeping object and name-matching visitor for sched_waking. */
INIT_KERN_TRACEPOINT(sched_waking)
TRACEPOINT_CHECK_AND_SET(sched_waking)

/*
 * Signature used to call the function looked up under the name
 * "kallsyms_lookup_name".  The pointer is filled in by testwakeup_init()
 * from get_func_by_symbol_name_kallsyms_lookup_name() and used by
 * get_func_by_symbol_name().
 */
typedef unsigned long (*kallsyms_lookup_name_func)(const char *name);
kallsyms_lookup_name_func _kallsyms_lookup_name_func;

void* get_func_by_symbol_name_kallsyms_lookup_name(void)
{
    int ret;
    void* pfunc = NULL;
	struct kprobe kp;
	memset(&kp, 0, sizeof(kp));
	kp.symbol_name = "kallsyms_lookup_name";
	kp.pre_handler = NULL;
	kp.addr = NULL;	// 作为强调,提示使用symbol_name
	ret = register_kprobe(&kp);
	if (ret < 0) {
		printk("register_kprobe fail!\n");
		return NULL;
	}
	printk("register_kprobe succeed!\n");
    pfunc = (void*)kp.addr;
	unregister_kprobe(&kp);
    return pfunc;
}

/*
 * Resolve a kernel symbol name to its address through the previously
 * resolved kallsyms_lookup_name() pointer.
 *
 * Returns NULL when that pointer has not been resolved; otherwise returns
 * whatever the lookup function reports for i_symbol.
 */
void* get_func_by_symbol_name(const char* i_symbol)
{
    return _kallsyms_lookup_name_func
        ? (void*)_kallsyms_lookup_name_func(i_symbol)
        : NULL;
}

/*
 * Module entry point.
 *
 * Order of operations:
 *  1. validate the wakerpid / wakeepid parameters;
 *  2. resolve every symbol and the sched_waking tracepoint -- these steps
 *     acquire no resources, so a failure can simply return;
 *  3. allocate the ring buffer and start the file writer;
 *  4. attach the probe to the tracepoint.
 *
 * Any failure after step 3 undoes what has been set up so far: a failed
 * module load must not leave the self-rearming delayed work running or the
 * buffers and output file leaked.
 *
 * Returns 0 on success or a negative errno value.
 */
static int __init testwakeup_init(void)
{
    int ret;
    int i;

    if (wakerpid == 0 || wakeepid == 0) {
        printk(KERN_ERR "wakerpid and wakeepid should NOT be 0\n");
        return -EINVAL;
    }
    /* The per-CPU scratch entries are indexed by CPU id. */
    if (nr_cpu_ids > MAX_CPU_COUNT) {
        printk(KERN_ERR "nr_cpu_ids exceeds MAX_CPU_COUNT!\n");
        return -EINVAL;
    }

    _kallsyms_lookup_name_func = get_func_by_symbol_name_kallsyms_lookup_name();
    _stack_trace_save_tsk = get_func_by_symbol_name("stack_trace_save_tsk");
    if (_stack_trace_save_tsk == NULL) {
        printk(KERN_ERR "get_func_by_symbol_name stack_trace_save_tsk failed!\n");
        return -ENOENT;
    }
    init_get_perf_callchain_func();
    if (_perf_callchain_user == NULL) {
        printk(KERN_ERR "lookup of perf_callchain_user failed!\n");
        return -ENOENT;
    }
    mykern_sched_waking.callback = cb_sched_waking;
    for_each_kernel_tracepoint(sched_waking_tracepoint_check_and_set, &mykern_sched_waking);
    if (!mykern_sched_waking.ptr) {
        printk(KERN_ERR "mykern_sched_waking register failed!\n");
        return -ENOENT;
    }

    init_testdmonitor_sample_ringbuff();
    ret = -ENOMEM;
    if (!_env.ringbuff.parray_sample) {
        goto err_ringbuff;
    }
    for (i = 0; i < MAX_CPU_COUNT; i++) {
        if (!_env.pentry[i]) {
            goto err_ringbuff;
        }
    }

    init_write_file();

    ret = tracepoint_probe_register(mykern_sched_waking.ptr, mykern_sched_waking.callback, NULL);
    if (ret) {
        printk(KERN_ERR "mykern_sched_waking register failed!\n");
        goto err_write_file;
    }
    mykern_sched_waking.bregister = true;
    printk(KERN_INFO "mykern_sched_waking register succeeded!\n");
    return 0;

err_write_file:
    exit_write_file();
err_ringbuff:
    printk(KERN_ERR "testwakeup init failed: %d\n", ret);
    exit_testdmonitor_sample_ringbuff();
    return ret;
}

/*
 * Module exit point: detach the sched_waking probe (if attached), wait until
 * no probe invocation can still be running, stop the file writer and finally
 * free the buffers.
 */
static void __exit testwakeup_exit(void)
{
    struct kern_tracepoint *tp = &mykern_sched_waking;

    if (tp->bregister) {
        tracepoint_probe_unregister(tp->ptr, tp->callback, NULL);
    }
    tracepoint_synchronize_unregister();

    exit_write_file();
    exit_testdmonitor_sample_ringbuff();
}

/* Register the module's load and unload entry points. */
module_init(testwakeup_init);
module_exit(testwakeup_exit);

上面的ko程序有抓取用户栈的PC,相关的用户栈的解析见之前的博客。

2.2 使用方法

使用方法还是比较简单的,就是insmod时传入wakerpid和wakeepid两个参数(即唤醒者和被唤醒者的线程tid),如下:

insmod testwakeup.ko wakerpid=1 wakeepid=10

抓取的就是tid是1的线程唤醒tid是10的线程的唤醒栈。

三、相关解释

3.1 抓到的这次没有优先级继承的唤醒栈

通过ftrace开启抓取叠加同时insmod上面第二章的ko,在抓取到异常事件的trace后,rmmod掉ko,运行用户态解析程序,解析出完整的带用户栈的调用栈信息。根据perfetto里打开异常的trace里看到的时间来找对应的包含用户栈的完整调用栈,得到如下这个唤醒栈:

可以从上图里看到,唤醒者在执行内存分配的动作,内存分配用的jemalloc的库,在执行内存分配逻辑时触发了madvise的操作,而madvise的操作里有用到读写sem(上图里的up_read就是线索),这个读写sem,其实就是著名的mmap的读写sem,而被唤醒者也正在需要用这把mmap读写sem,因为被唤醒者在执行mmap的逻辑。do_madvise里调用的up_read的相关代码逻辑如下:

mmap的读写sem是一个进程内的锁,是一个与映射有关的内存操作都需要用到的一把常见锁,虽然rt-linux里的mutex和spinlock都是优先级继承的,但是读写锁并不是。

3.2 非正在运行的调用栈会抓取不全用户栈

可以从 3.1 里说到的抓到的下图的堆栈里可以看到:

被唤醒者,也就是在抓取时候没有正在运行的任务,其用户栈的抓取可能是不全的,无论是用perf抓还是用自己写的ko抓,都是一样的现象。

这种情况下,如果要进一步找到更细的用户栈,就需要在线程on-cpu(正在运行)时进行抓取,可通过:

bash 复制代码
perf record -g -t <tid>

来抓一个具体的线程,然后再perf script去找到底是哪个具体的调用链路执行的mmap系统调用。

相关推荐
木泽八1 小时前
python实现pdf拆分与合并
服务器·python·pdf
2501_924064112 小时前
2025年一站式测试平台对比:可视化报告与自动化监控最佳实践
运维·自动化
恒创科技HK2 小时前
2026年香港服务器有哪些区域可选?
运维·服务器
xjxijd2 小时前
工业元宇宙 IDC 支撑:数字孪生算法 + 边缘服务器,生产调度响应速度提 3 倍
运维·服务器·算法
xlp666hub2 小时前
从零手写一个 printf 函数:变参宏与默认参数提升
linux
dblens 数据库管理和开发工具2 小时前
DBLens:让 SQL 查询更智能、更高效的数据库利器
服务器·数据库·sql·数据库连接工具·dblens
程序员zgh2 小时前
代码重构 —— 读后感
运维·c语言·开发语言·c++·重构
迅为电子2 小时前
迅为iTOP-Hi3516开发板linux驱动开发资料全面上线,构建从入门到精通的完整学习路径!
linux·驱动开发·学习
代码游侠2 小时前
应用——Linux进程通信与信号处理
linux·运维·服务器·笔记·学习·信号处理