ftrace驱动源码解析

1、ftrace的静动态实现

ftrace是function trace的简称,但是function trace只是ftrace里面的一个基本功能。function trace是基于gcc的-pg参数插入mcount函数实现的,这部分依赖于处理器架构。mcount在不同处理器的gcc实现名字可能略有差异,可能是mcount、_mcount或者__mcount,arm64中是_mcount,本文统一使用mcount。

arm64的实现主要有以下三个文件:

c 复制代码
arch/arm64/kernel/entry-ftrace.S   // mcount的核心实现
arch/arm64/kernel/ftrace.c         // CONFIG_DYNAMIC_FTRACE 的支持接口
arch/arm64/include/asm/ftrace.h    // 声明和定义arm64为ftrace核心模块提供的接口

mcount的主要任务是根据是否打开了ftrace相关的宏,在每个内核函数入口加入trace代码,跳转到对应的注册函数中,做进一步处理。但是,在每个内核函数入口加入trace代码,必然影响内核的性能,为了减小对内核性能的影响,ftrace支持动态trace功能。下面来一一介绍这两种trace功能是如何实现的。

1.1 静态mcount实现

Kernel中打开CONFIG_FUNCTION_TRACER 后,会增加-pg编译选项,这样在每个函数入口处都会插入bl mcount跳转指令,函数运行时会进入mcount函数。mcount会判断函数指针ftrace_trace_function是否被注册,默认注册的是空函数ftrace_stub,只有打开function tracer后才会注册具体的处理函数ftrace_trace_function(插个眼,具体ftrace_trace_function如何赋值,后面分析)。

c 复制代码
//代码在kernel源码目录的Makefile:
# When CONFIG_FUNCTION_TRACER is enabled, -pg becomes the ftrace compile flag.
ifdef CONFIG_FUNCTION_TRACER
  CC_FLAGS_FTRACE := -pg
endif

下图是静态ftrace中mcount调用的function trace和function graph trace的实现流程与代码实现:

c 复制代码
// Source: arch/arm64/kernel/entry-ftrace.S
// CONFIG_DYNAMIC_FTRACE is not defined here, i.e. this is the static mcount
#ifndef CONFIG_DYNAMIC_FTRACE
/*
 * void _mcount(unsigned long return_address)
 * @return_address: return address to instrumented function
 */
ENTRY(_mcount)
	mcount_enter

	ldr_l	x2, ftrace_trace_function // a real handler is stored in ftrace_trace_function only after the function tracer is enabled
	adr	x0, ftrace_stub         // ftrace_stub is an empty function
	cmp	x0, x2			// compare x0 with x2
	b.eq	skip_ftrace_call	// if x0 == x2, branch to skip_ftrace_call

	mcount_get_pc	x0		// function's pc
	mcount_get_lr	x1		// function's lr (= parent's pc)
	blr	x2			// if x0 != x2: (*ftrace_trace_function)(pc, lr);

skip_ftrace_call:	
#ifdef CONFIG_FUNCTION_GRAPH_TRACER
	ldr_l	x2, ftrace_graph_return // load ftrace_graph_return into x2
	cmp	x0, x2			// compare x0 with x2
	b.ne	ftrace_graph_caller	// if x0 != x2, branch to ftrace_graph_caller()

	ldr_l	x2, ftrace_graph_entry	// load ftrace_graph_entry into x2
	adr_l	x0, ftrace_graph_entry_stub // load the address of ftrace_graph_entry_stub into x0
	cmp	x0, x2                  // compare x0 with x2
	b.ne	ftrace_graph_caller	// if x0 != x2, branch to ftrace_graph_caller();
#endif /* CONFIG_FUNCTION_GRAPH_TRACER */
	mcount_exit
ENDPROC(_mcount)

#else /* CONFIG_DYNAMIC_FTRACE */
......
#endif /* CONFIG_DYNAMIC_FTRACE */

1.2 动态mcount实现

static ftrace一旦使能,对kernel中所有的函数(除开notrace、inline、其他特殊函数)进行插桩,这带来的性能开销是惊人的,有可能导致人们弃用ftrace功能。

为了解决这个问题,内核开发者推出了dynamic ftrace,因为实际上调用者一般不需要对所有函数进行追踪,只会对感兴趣的一部分函数进行追踪。如果配置了CONFIG_DYNAMIC_FTRACE, mcount会被实现成一个空函数(只有一条ret 指令)。在系统启动时ftrace_init()中,mcount会被替换成nop指令。打开tracer后,需要跟踪的函数的对应位置会被动态替换成跳转到ftrace_caller()的指令。

c 复制代码
// Source: arch/arm64/kernel/entry-ftrace.S
// The #ifndef branch (CONFIG_DYNAMIC_FTRACE not defined) is the static mcount
#ifndef CONFIG_DYNAMIC_FTRACE
......
#else /* CONFIG_DYNAMIC_FTRACE */
// With CONFIG_DYNAMIC_FTRACE, mcount is implemented as an empty function (a single ret instruction).
// At boot the mcount call sites are replaced with nop instructions. Once a tracer is enabled,
// the call sites of the functions to be traced are dynamically patched into a branch to ftrace_caller().
ENTRY(_mcount)
	ret
ENDPROC(_mcount)
/*
 * void ftrace_caller(unsigned long return_address)
 * @return_address: return address to instrumented function
 */
ENTRY(ftrace_caller)
	mcount_enter
	mcount_get_pc0	x0		//     function's pc
	mcount_get_lr	x1		//     function's lr
GLOBAL(ftrace_call)			// tracer(pc, lr); patch site for the function tracer callback
	nop				// This will be replaced with "bl xxx"
					// where xxx can be any kind of tracer.
#ifdef CONFIG_FUNCTION_GRAPH_TRACER
GLOBAL(ftrace_graph_call)		// ftrace_graph_caller(); patch site for the function graph tracer callback
	nop				// If enabled, this will be replaced
					// "b ftrace_graph_caller"
#endif
	mcount_exit
ENDPROC(ftrace_caller)
#endif /* CONFIG_DYNAMIC_FTRACE */

在编译的时候调用recordmcount.pl搜索所有静态mcount实现中的bl _mcount函数调用点,并且把所有的调用点地址保存到section __mcount_loc,其定义在include/asm-generic/vmlinux.lds.h,详细的见文件scripts/recordmcount.pl、scripts/recordmcount.c

在初始化时,遍历section __mcount_loc的调用点地址,默认给所有bl _mcount替换成nop

c 复制代码
// kernel/trace/ftrace.c
/*
 * Boot-time ftrace setup: runs the arch hook, then hands the
 * [__start_mcount_loc, __stop_mcount_loc) address table to
 * ftrace_process_locs(). On arch-init failure ftrace_disabled is set.
 */
void __init ftrace_init(void)
{
        extern unsigned long __start_mcount_loc[];
        extern unsigned long __stop_mcount_loc[];
        unsigned long count, flags;
        int ret;

        local_irq_save(flags);
        ret = ftrace_dyn_arch_init();
        local_irq_restore(flags);
        if (ret)
                goto failed;
        // Number of entries stored in __mcount_loc
        count = __stop_mcount_loc - __start_mcount_loc;
        ......
        last_ftrace_enabled = ftrace_enabled = 1;

        // Walk section __mcount_loc and process the call-site addresses stored in it
        ret = ftrace_process_locs(NULL,
                                  __start_mcount_loc,
                                  __stop_mcount_loc);
        set_ftrace_early_filters();

        return;
 failed:
        ftrace_disabled = 1;
}
c 复制代码
// struct ftrace_page is the container for the recorded call-site addresses.
// Each call-site address is stored in ftrace_page->records.
// index is the subscript into records.
struct ftrace_page {
        struct ftrace_page      *next;
        struct dyn_ftrace       *records;
        int                     index;
        int                     size;
};
/* One record per mcount call site. */
struct dyn_ftrace {
        unsigned long           ip; /* address of mcount call-site */
        unsigned long           flags;
        struct dyn_arch_ftrace  arch;
};
c 复制代码
/*
 * Turn the raw call-site address table [start, end) into dyn_ftrace records.
 * @mod:   owning module, or NULL for the core kernel
 * @start: first entry of the address table
 * @end:   one past the last entry of the address table
 *
 * Returns 0 on success (including an empty table), -ENOMEM if the pages
 * cannot be allocated or, for a module, if ftrace_pages is not set up yet.
 */
static int ftrace_process_locs(struct module *mod,
                               unsigned long *start,
                               unsigned long *end)
{
        struct ftrace_page *start_pg;
        struct ftrace_page *pg;
        struct dyn_ftrace *rec;
        unsigned long count;
        unsigned long *p;
        unsigned long addr;
        unsigned long flags = 0; /* Shut up gcc */
        int ret = -ENOMEM;

        count = end - start;
        if (!count)
                return 0;
        // Sort the addresses
        sort(start, count, sizeof(*start),
             ftrace_cmp_ips, NULL);
        // Allocate new ftrace_page storage for the addresses. Section __mcount_loc
        // only stores the call-site addresses as plain unsigned long values;
        // struct dyn_ftrace keeps the address in ->ip and additionally uses
        // ->flags to hold the current state and the reference count
        start_pg = ftrace_allocate_pages(count);
        if (!start_pg)
                return -ENOMEM;
        mutex_lock(&ftrace_lock);

        /*
         * Core and each module needs their own pages, as
         * modules will free them when they are removed.
         * Force a new page to be allocated for modules.
         */
        if (!mod) {
                WARN_ON(ftrace_pages || ftrace_pages_start);
                /* First initialization */
                ftrace_pages = ftrace_pages_start = start_pg;
        } else {
                if (!ftrace_pages)
                        goto out;
                if (WARN_ON(ftrace_pages->next)) {
                        /* Hmm, we have free pages? */
                        while (ftrace_pages->next)
                                ftrace_pages = ftrace_pages->next;
                }
                ftrace_pages->next = start_pg;
        }
        // Fill in the ->ip field of the new dyn_ftrace records
        p = start;
        pg = start_pg;
        while (p < end) {
                addr = ftrace_call_adjust(*p++);
                if (!addr)
                        continue;
                if (pg->index == pg->size) {
                        /* We should have allocated enough */
                        if (WARN_ON(!pg->next))
                                break;
                        pg = pg->next;
                }
                // addr is stored in the records array of the ftrace_page
                rec = &pg->records[pg->index++];
                rec->ip = addr;
        }
        WARN_ON(pg->next);

        /* Assign the last page to ftrace_pages */
        ftrace_pages = pg;
        if (!mod)
                local_irq_save(flags);
                
        // Update the ->flags field of the new dyn_ftrace records; by default
        // every call site is replaced with a "nop" instruction
        ftrace_update_code(mod, start_pg);
        if (!mod)
                local_irq_restore(flags);
        ret = 0;
 out:
        mutex_unlock(&ftrace_lock);

        return ret;
}
c 复制代码
/*
 * Initialise the flags of every record in @new_pgs and, unless
 * CC_USING_NOP_MCOUNT is defined, disable each call site via
 * ftrace_code_disable().
 * @mod:     owning module, or NULL for the core kernel
 * @new_pgs: first page of the newly added records
 *
 * Returns -1 if ftrace_disabled is observed while iterating, 0 otherwise.
 */
static int ftrace_update_code(struct module *mod, struct ftrace_page *new_pgs)
{
        struct ftrace_page *pg;
        struct dyn_ftrace *p;
        u64 start, stop;
        unsigned long update_cnt = 0;
        unsigned long rec_flags = 0;
        int i;

        start = ftrace_now(raw_smp_processor_id());

        // When a module is loaded, this function is called to convert the mcount
        // calls in its text to nops and to create an entry in the ftrace data.
        // If ftrace is activated after this call but before the module sets its
        // text to read-only, the modification to enable ftrace can fail once the
        // read-only switch happens during the conversion.
        // To prevent this, the module's records are marked disabled here and are
        // enabled after the module's text has been set to read-only.
        if (mod)
                rec_flags |= FTRACE_FL_DISABLED;

        for (pg = new_pgs; pg; pg = pg->next) {

                for (i = 0; i < pg->index; i++) {

                        /* If something went wrong, bail without enabling anything */
                        if (unlikely(ftrace_disabled))
                                return -1;
                        p = &pg->records[i];
                        p->flags = rec_flags;
#ifndef CC_USING_NOP_MCOUNT
                        // Turn off the call to ftrace_caller() in the instrumented function by using a nop
                        if (!ftrace_code_disable(mod, p))
                                break;
#endif

                        update_cnt++;
                }
        }

        stop = ftrace_now(raw_smp_processor_id());
        ftrace_update_time = stop - start;
        ftrace_update_tot_cnt += update_cnt;

        return 0;
}

总体过程如下:

2、function tracer

2.1 tracer注册

c 复制代码
// kernel/trace/trace_functions.c
/* Option table of the function tracer; terminated by an empty entry. */
static struct tracer_opt func_opts[] = {
#ifdef CONFIG_STACKTRACE
        { TRACER_OPT(func_stack_trace, TRACE_FUNC_OPT_STACK) },
#endif
        { } /* Always set a last empty entry */
};

/* Flag state of the function tracer, backed by the func_opts table. */
static struct tracer_flags func_flags = {
        .val = 0, /* By default: all flags disabled */
        .opts = func_opts
};

/* Descriptor of the tracer named "function"; passed to register_tracer() by init_function_trace(). */
static struct tracer function_trace __tracer_data =
{
        .name           = "function",
        .init           = function_trace_init,
        .reset          = function_trace_reset,
        .start          = function_trace_start,
        .flags          = &func_flags,
        .set_flag       = func_set_flag,
        .allow_instances = true,
#ifdef CONFIG_FTRACE_SELFTEST
        .selftest       = trace_selftest_startup_function,
#endif
};

/*
 * Register the function-tracer commands, then the "function" tracer itself.
 * Returns the result of register_tracer().
 */
__init int init_function_trace(void)
{
        init_func_cmd_traceon();
        return register_tracer(&function_trace);
}
c 复制代码
/**
 * register_tracer - register a tracer with the ftrace system.
 * @type - the plugin for the tracer
 *
 * Register a new plugin tracer.
 *
 * Fills in dummy set_flag/flags/opts when the tracer supplies none, runs the
 * tracer selftest, links the tracer into trace_types and, when its name
 * matches default_bootup_tracer, makes it the current tracer of global_trace.
 */
int __init register_tracer(struct tracer *type)
{
        struct tracer *t;
        int ret = 0;

        ......
        // A tracer that sets nothing gets dummy_set_flag and dummy_tracer_opt;
        // function_trace supplies func_set_flag and func_flags
        if (!type->set_flag)
                type->set_flag = &dummy_set_flag;
        if (!type->flags) {
                /*allocate a dummy tracer_flags*/
                type->flags = kmalloc(sizeof(*type->flags), GFP_KERNEL);
                if (!type->flags) {
                        ret = -ENOMEM;
                        goto out;
                }
                type->flags->val = 0;
                type->flags->opts = dummy_tracer_opt;
        } else
                if (!type->flags->opts)
                        type->flags->opts = dummy_tracer_opt;
        
        /* store the tracer for __set_tracer_option */
        type->flags->trace = type;
 
        ret = run_tracer_selftest(type);
        if (ret < 0) 
                goto out;
        // Add the new tracer to the trace_types list
        //The global_trace is the descriptor that holds the top-level tracing buffers for the live tracing.
        type->next = trace_types;
        trace_types = type;
        add_tracer_options(&global_trace, type);

 out:
        tracing_selftest_running = false;
        mutex_unlock(&trace_types_lock);

        if (ret || !default_bootup_tracer)
                goto out_unlock;

        if (strncmp(default_bootup_tracer, type->name, MAX_TRACER_SIZE))
                goto out_unlock;

        printk(KERN_INFO "Starting tracer '%s'\n", type->name);
        /* Do we want this tracer to start on bootup? */
        // Install the default boot-time tracer
        tracing_set_tracer(&global_trace, type->name);
        default_bootup_tracer = NULL;

        apply_trace_boot_options();
        /* disable other selftests, since this will break it. */
        tracing_selftest_disabled = true;
#ifdef CONFIG_FTRACE_STARTUP_TEST
        printk(KERN_INFO "Disabling FTRACE selftests due to running tracer '%s'\n",
               type->name);
#endif

 out_unlock:
        return ret;
}
c 复制代码
/* Create the option files of tracer @t for @tr, but only once tr->dir exists. */
static void add_tracer_options(struct trace_array *tr, struct tracer *t)
{
        /* Only enable if the directory has been created already. */
        if (!tr->dir)
                return;

        create_trace_option_files(tr, t);
}
c 复制代码
/*
 * create_trace_option_files - create one option file per tracer option.
 * @tr:     trace array whose option table (tr->topts) is extended
 * @tracer: tracer whose flags->opts entries are turned into files
 *
 * Appends a { tracer, topts } entry to tr->topts and calls
 * create_trace_option_file() for every named entry of tracer->flags->opts.
 * Returns silently when the tracer is not allowed for @tr, when its flags
 * are already present in tr->topts, or when an allocation fails.
 */
static void
create_trace_option_files(struct trace_array *tr, struct tracer *tracer)
{
        struct trace_option_dentry *topts;
        struct trace_options *tr_topts;
        struct tracer_flags *flags;
        struct tracer_opt *opts;
        int cnt;
        int i;

        flags = tracer->flags;
        /*
         * If this is an instance, only create flags for tracers
         * the instance may have.
         */
        if (!trace_ok_for_array(tracer, tr))
                return;

        for (i = 0; i < tr->nr_topts; i++) {
                /* Make sure there's no duplicate flags. */
                if (WARN_ON_ONCE(tr->topts[i].tracer->flags == tracer->flags))
                        return;
        }

        opts = flags->opts;

        /*
         * Count the options first: cnt sizes the allocation below, so it
         * must not be read before this loop has run. The table ends with
         * an entry whose name is NULL.
         */
        for (cnt = 0; opts[cnt].name; cnt++)
                ;

        topts = kcalloc(cnt + 1, sizeof(*topts), GFP_KERNEL);
        if (!topts)
                return;

        tr_topts = krealloc(tr->topts, sizeof(*tr->topts) * (tr->nr_topts + 1),
                            GFP_KERNEL);
        if (!tr_topts) {
                /* tr->topts is left untouched; only the new array is dropped. */
                kfree(topts);
                return;
        }

        tr->topts = tr_topts;
        tr->topts[tr->nr_topts].tracer = tracer;
        tr->topts[tr->nr_topts].topts = topts;
        tr->nr_topts++;

        for (cnt = 0; opts[cnt].name; cnt++) {
                create_trace_option_file(tr, &topts[cnt], flags,
                                         &opts[cnt]);
                WARN_ONCE(topts[cnt].entry == NULL,
                          "Failed to create trace option: %s",
                          opts[cnt].name);
        }
}
c 复制代码
/*
 * Fill in @topt for option @opt of @flags and create its file, named
 * opt->name with mode 0644, under the options directory of @tr.
 * Does nothing if the options directory cannot be obtained.
 */
static void
create_trace_option_file(struct trace_array *tr,
                         struct trace_option_dentry *topt,
                         struct tracer_flags *flags,
                         struct tracer_opt *opt)
{
        struct dentry *t_options;

        t_options = trace_options_init_dentry(tr);
        if (!t_options)
                return;

        topt->flags = flags;
        topt->opt = opt;
        topt->tr = tr;

        topt->entry = trace_create_file(opt->name, 0644, t_options, topt, &trace_options_fops);
}

2.2 tracer使能

可以使用echo function > current_tracer命令来使能或者切换tracer。

c 复制代码
/* Populate the top-level tracing directory for global_trace. */
static __init int tracer_init_tracefs(void)
{
        struct dentry *d_tracer;
        ......
        // Returns the /sys/kernel/debug/tracing directory
        d_tracer = tracing_init_dentry();
        event_trace_init();
        
        init_tracer_tracefs(&global_trace, d_tracer);
        ftrace_init_tracefs_toplevel(&global_trace, d_tracer);
        ......
        create_trace_instances(d_tracer);
        update_tracer_options(&global_trace);

        return 0;
}
c 复制代码
/*
 * Create the per-trace-array control files under @d_tracer, among them
 * "current_tracer" (set_tracer_fops), "trace" and "tracing_on".
 */
static void
init_tracer_tracefs(struct trace_array *tr, struct dentry *d_tracer)
{
        struct trace_event_file *file;
        int cpu;

        ......
        trace_create_file("current_tracer", 0644, d_tracer,
                        tr, &set_tracer_fops);
        ......
        trace_create_file("trace", 0644, d_tracer,
                          tr, &tracing_fops);
        ......
        trace_create_file("tracing_on", 0644, d_tracer,
                          tr, &rb_simple_fops);

        ftrace_init_tracefs(tr, d_tracer);
}
c 复制代码
/* File operations of the "current_tracer" file; writes go to tracing_set_trace_write(). */
static const struct file_operations set_tracer_fops = {
        .open           = tracing_open_generic,
        .read           = tracing_set_trace_read,
        .write          = tracing_set_trace_write,
        .llseek         = generic_file_llseek,
};

/* Write handler of "current_tracer": switches the trace array to the tracer named in the buffer. */
static ssize_t
tracing_set_trace_write(struct file *filp, const char __user *ubuf,
                        size_t cnt, loff_t *ppos)
{
        // Trace buffer that this file belongs to; global_trace by default
        struct trace_array *tr = filp->private_data;
        char buf[MAX_TRACER_SIZE+1];
        ......
        // Enable the new tracer
        err = tracing_set_tracer(tr, buf);
        ......
        return ret;
}
c 复制代码
/*
 * Make the tracer named @buf the current tracer of @tr.
 * Returns 0 on success or the error returned by tracer_init().
 */
static int tracing_set_tracer(struct trace_array *tr, const char *buf)
{
        struct tracer *t;
#ifdef CONFIG_TRACER_MAX_TRACE
        bool had_max_tr;
#endif
        int ret = 0;

        ......
        // Look up the tracer by name in the trace_types list
        for (t = trace_types; t; t = t->next) {
                if (strcmp(t->name, buf) == 0)
                        break;
        }
        ......
        // Call the init function of the new tracer
        if (t->init) {
                ret = tracer_init(t, tr);
                if (ret)
                        goto out;
        }
        // Install the new tracer as the current tracer
        tr->current_trace = t;
        tr->current_trace->enabled++;
        trace_branch_enable(tr);
 out:
        mutex_unlock(&trace_types_lock);

        return ret;
}
c 复制代码
/* Reset the trace buffer of @tr, then run the tracer's init callback and return its result. */
int tracer_init(struct tracer *t, struct trace_array *tr)
{
        tracing_reset_online_cpus(&tr->trace_buffer);
        return t->init(tr);
}
2.2.1 function tracer的使能

以function tracer为例,t->init调用到function_trace_init():

c 复制代码
start_kernel() -> early_trace_init() -> tracer_alloc_buffers()

/* ftrace_ops attached to the top-level trace array; its callback starts out as ftrace_stub and both hashes start empty. */
struct ftrace_ops global_ops = {
        .func                           = ftrace_stub,
        .local_hash.notrace_hash        = EMPTY_HASH,
        .local_hash.filter_hash         = EMPTY_HASH,
        INIT_OPS_HASH(global_ops)
        .flags                          = FTRACE_OPS_FL_RECURSION_SAFE |
                                          FTRACE_OPS_FL_INITIALIZED |
                                          FTRACE_OPS_FL_PID,
};

/* Point @tr at global_ops and store @tr back in the ops' private field. */
__init void ftrace_init_global_array_ops(struct trace_array *tr)
{
        tr->ops = &global_ops;
        tr->ops->private = tr;
        ftrace_init_trace_array(tr);
}
/*
 * Attach global_ops to the top-level trace array.
 * NOTE(review): this looks like an abbreviated excerpt showing only the
 * call relevant to the walkthrough - confirm against the kernel source.
 *
 * Returns 0.
 */
__init static int tracer_alloc_buffers(void)
{
        ftrace_init_global_array_ops(&global_trace);

        /* The function is declared int, so it must return a value. */
        return 0;
}
c 复制代码
// By default tr here is global_trace
/*
 * init callback of the function tracer: selects the trace callback and
 * starts function tracing on @tr.
 * Returns -ENOMEM when tr->ops is not set, 0 otherwise.
 */
static int function_trace_init(struct trace_array *tr)
{
        ftrace_func_t func;

        /*
         * Instance trace_arrays get their ops allocated
         * at instance creation. Unless it failed
         * the allocation.
         */
        if (!tr->ops)
                return -ENOMEM;
        /* Currently only the global instance can do stack tracing */
        if (tr->flags & TRACE_ARRAY_FL_GLOBAL &&
            func_flags.val & TRACE_FUNC_OPT_STACK)
                func = function_stack_trace_call;
        else
                func = function_trace_call;
        ftrace_init_array_ops(tr, func);
        tr->trace_buffer.cpu = get_cpu();
        put_cpu();
        tracing_start_cmdline_record();

        // Start the function tracer; this also adds tr->ops to ftrace_ops_list
        tracing_start_function_trace(tr);
        return 0;
}

/* Register tr->ops with ftrace; function_enabled is 0 during registration and 1 afterwards. */
static void tracing_start_function_trace(struct trace_array *tr)
{
        tr->function_enabled = 0;
        // At this point ops is global_ops
        register_ftrace_function(tr->ops);
        tr->function_enabled = 1;
}
c 复制代码
/*
 * Initialise @ops and start it up while holding ftrace_lock.
 * Returns the result of ftrace_startup().
 */
int register_ftrace_function(struct ftrace_ops *ops)
{
        int ret = -1;
        // Initialise the hash table: ops->func_hash = &ops->local_hash;
        ftrace_ops_init(ops);
        mutex_lock(&ftrace_lock);
        // Add global_ops to ftrace_ops_list and, depending on the situation,
        // patch the instructions at the individual call sites
        ret = ftrace_startup(ops, 0);
        mutex_unlock(&ftrace_lock);

        return ret;
}
c 复制代码
/*
 * Register @ops and update the patched call sites accordingly.
 * @ops:     ftrace_ops being started
 * @command: FTRACE_* command bits to apply in addition to those derived here
 *
 * Returns 0 on success, or the error from __register_ftrace_function() or
 * ftrace_hash_ipmodify_enable() (the registration is rolled back for the
 * latter).
 */
int ftrace_startup(struct ftrace_ops *ops, int command)
{
        int ret;
        // 1. Add global_ops to ftrace_ops_list
        // 2. Set the ftrace_trace_function pointer according to the members of ftrace_ops_list:
        //    list empty:            ftrace_trace_function = ftrace_stub
        //    list has one member:   ftrace_trace_function = ftrace_ops_get_list_func(ftrace_ops_list)
        //    list has many members: ftrace_trace_function = ftrace_ops_list_func
        // 3. Update ftrace_graph_entry

        ret = __register_ftrace_function(ops);
        if (ret)
                return ret;

        ftrace_start_up++;

        /*
         * Note that ftrace probes uses this to start up
         * and modify functions it will probe. But we still
         * set the ADDING flag for modification, as probes
         * do not have trampolines. If they add them in the
         * future, then the probes will need to distinguish
         * between adding and updating probes.
         */ 
        ops->flags |= FTRACE_OPS_FL_ENABLED | FTRACE_OPS_FL_ADDING;
        // Walk every _mcount call-site record (ftrace_rec) and, based on how the ip
        // changes between the old and new hash, set FTRACE_FL_IPMODIFY in rec->flags
        ret = ftrace_hash_ipmodify_enable(ops);
        if (ret < 0) {
                /* Rollback registration process */
                __unregister_ftrace_function(ops);
                ftrace_start_up--;
                ops->flags &= ~FTRACE_OPS_FL_ENABLED;
                return ret;
        }
        // Walk every _mcount call-site record (ftrace_rec) and, depending on whether
        // filter_hash / notrace_hash match the ip, increment or decrement ref_cnt in rec->flags
        if (ftrace_hash_rec_enable(ops, 1))
                command |= FTRACE_UPDATE_CALLS; // consumed in ftrace_startup_enable()
        // Update the patch sites:
        // FTRACE_UPDATE_CALLS set: update the _mcount call sites; those with ref_cnt > 0 become calls to ftrace_caller()
        // FTRACE_UPDATE_TRACE_FUNC set: update the ftrace_call site to the function ftrace_trace_function points to
        // FTRACE_START_FUNC_RET set: update the ftrace_graph_call site to ftrace_graph_caller()
        ftrace_startup_enable(command);

        ops->flags &= ~FTRACE_OPS_FL_ADDING;

        return 0;
}
c 复制代码
/* Link @ops into ftrace_ops_list and, when ftrace is enabled, refresh ftrace_trace_function. */
int __register_ftrace_function(struct ftrace_ops *ops)
{
        ......
        // Add global_ops to ftrace_ops_list
        add_ftrace_ops(&ftrace_ops_list, ops);

        ......
        // Set the ftrace_trace_function pointer according to the members of ftrace_ops_list:
        //    list empty:            ftrace_trace_function = ftrace_stub
        //    list has one member:   ftrace_trace_function = ftrace_ops_get_list_func(ftrace_ops_list)
        //    list has many members: ftrace_trace_function = ftrace_ops_list_func
        if (ftrace_enabled)
                update_ftrace_function();

        return 0;
}
c 复制代码
/* Run the code update for @command; does nothing when no command bit is set or ftrace is disabled. */
static void ftrace_startup_enable(int command)
{
        // If saved_ftrace_func != ftrace_trace_function, set command |= FTRACE_UPDATE_TRACE_FUNC;
        if (saved_ftrace_func != ftrace_trace_function) {
                saved_ftrace_func = ftrace_trace_function;
                command |= FTRACE_UPDATE_TRACE_FUNC; //function trace
        }

        if (!command || !ftrace_enabled)
                return;

        ftrace_run_update_code(command);
}

ftrace_run_update_code() -> arch_ftrace_update_code() -> ftrace_modify_all_code() 

/*
 * Apply the patching requested by the FTRACE_* bits in @command: the
 * ftrace_call site, the _mcount call sites and the ftrace_graph_call site.
 */
void ftrace_modify_all_code(int command)
{
        int update = command & FTRACE_UPDATE_TRACE_FUNC;
        int mod_flags = 0;
        int err = 0;

        if (command & FTRACE_MAY_SLEEP)
                mod_flags = FTRACE_MODIFY_MAY_SLEEP_FL;
        
        // If FTRACE_UPDATE_TRACE_FUNC is set:
        // for the ftrace_call site, calling one particular ftrace_ops of ftrace_ops_list directly needs care.
        // To be safe, ftrace_call is first pointed at ftrace_ops_list_func(), which iterates over
        // every ftrace_ops in ftrace_ops_list
        if (update) {
                err = ftrace_update_ftrace_func(ftrace_ops_list_func);
                if (FTRACE_WARN_ON(err)) 
                        return;
        }
        // If FTRACE_UPDATE_CALLS is set, for the _mcount call sites:
        // walk every ftrace_rec; sites with ref_cnt > 0 become calls to ftrace_caller()
        if (command & FTRACE_UPDATE_CALLS)
                ftrace_replace_code(mod_flags | FTRACE_MODIFY_ENABLE_FL);
        else if (command & FTRACE_DISABLE_CALLS)
                ftrace_replace_code(mod_flags);

        // If FTRACE_UPDATE_TRACE_FUNC is set:
        // for the ftrace_call site, if ftrace_trace_function really differs from
        // ftrace_ops_list_func(), switch to the function ftrace_trace_function points to
        if (update && ftrace_trace_function != ftrace_ops_list_func) {
                function_trace_op = set_function_trace_op;
                smp_wmb();
                /* If irqs are disabled, we are in stop machine */
                if (!irqs_disabled())
                        smp_call_function(ftrace_sync_ipi, NULL, 1);
                err = ftrace_update_ftrace_func(ftrace_trace_function);
                if (FTRACE_WARN_ON(err))
                        return;
        }
        // If FTRACE_START_FUNC_RET is set, for the ftrace_graph_call site:
        // update it to ftrace_graph_caller()
        if (command & FTRACE_START_FUNC_RET)
                err = ftrace_enable_ftrace_graph_caller();
        else if (command & FTRACE_STOP_FUNC_RET)
                err = ftrace_disable_ftrace_graph_caller();
        FTRACE_WARN_ON(err);
}
c 复制代码
/* Forwards to __ftrace_ops_list_func(), passing NULL in place of @op. */
static void ftrace_ops_list_func(unsigned long ip, unsigned long parent_ip,
                                 struct ftrace_ops *op, struct pt_regs *regs)
{
        __ftrace_ops_list_func(ip, parent_ip, NULL, regs);
}
/*
 * Call op->func(ip, parent_ip, op, regs) for every ftrace_ops in
 * ftrace_ops_list that passes ftrace_ops_test(). @ignored is not used.
 * Returns early if trace_test_and_set_recursion() yields a negative bit.
 */
static inline void
__ftrace_ops_list_func(unsigned long ip, unsigned long parent_ip,
                       struct ftrace_ops *ignored, struct pt_regs *regs)
{
        struct ftrace_ops *op;
        int bit;

        bit = trace_test_and_set_recursion(TRACE_LIST_START, TRACE_LIST_MAX);
        if (bit < 0)
                return;
        preempt_disable_notrace();
        // By default ftrace_call is patched to ftrace_ops_list_func(), which iterates
        // over every ftrace_ops in ftrace_ops_list
        do_for_each_ftrace_op(op, ftrace_ops_list) {
                if ((!(op->flags & FTRACE_OPS_FL_RCU) || rcu_is_watching()) &&
                    ftrace_ops_test(op, ip, regs)) {
                        if (FTRACE_WARN_ON(!op->func)) {
                                pr_warn("op=%p %pS\n", op, op);
                                goto out;
                        }
                        op->func(ip, parent_ip, op, regs);
                }
        } while_for_each_ftrace_op(op);
out:
        preempt_enable_notrace();
        trace_clear_recursion(bit);
}
2.2.2 function tracer的filter

通过set_ftrace_filter、set_ftrace_notrace设置function tracer的filter。本质上是操作global_ops的filter_hash、notrace_hash。

c 复制代码
tracer_init_tracefs() -> ftrace_init_tracefs_toplevel() -> ftrace_init_dyn_tracefs()

/* Create the dynamic-ftrace control files under @d_tracer. Returns 0. */
static __init int ftrace_init_dyn_tracefs(struct dentry *d_tracer)
{

        trace_create_file("available_filter_functions", 0444,
                        d_tracer, NULL, &ftrace_avail_fops);

        trace_create_file("enabled_functions", 0444,
                        d_tracer, NULL, &ftrace_enabled_fops);
        // Create set_ftrace_filter and set_ftrace_notrace under /sys/kernel/debug/tracing
        ftrace_create_filter_files(&global_ops, d_tracer);

#ifdef CONFIG_FUNCTION_GRAPH_TRACER
        trace_create_file("set_graph_function", 0644, d_tracer,
                                    NULL,
                                    &ftrace_graph_fops);
        trace_create_file("set_graph_notrace", 0644, d_tracer,
                                    NULL,
                                    &ftrace_graph_notrace_fops);
#endif /* CONFIG_FUNCTION_GRAPH_TRACER */

        return 0;
}

/* Create "set_ftrace_filter" and "set_ftrace_notrace" under @parent, both carrying @ops as their data. */
void ftrace_create_filter_files(struct ftrace_ops *ops,
                                struct dentry *parent)
{
        trace_create_file("set_ftrace_filter", 0644, parent,
                          ops, &ftrace_filter_fops);
        trace_create_file("set_ftrace_notrace", 0644, parent,
                          ops, &ftrace_notrace_fops);
}

/* File operations of the "set_ftrace_filter" file. */
static const struct file_operations ftrace_filter_fops = {
        .open = ftrace_filter_open,
        .read = seq_read,
        .write = ftrace_filter_write,
        .llseek = tracing_lseek,
        .release = ftrace_regex_release,
};

set_ftrace_filter、set_ftrace_notrace的文件操作有个技巧:

  • open的时候分配一个临时hash表iter->hash来拷贝global_ops的filter_hash/notrace_hash的内容,
  • 在write操作实际设置filter时对iter->hash操作(遍历ftrace_rec,如果ip在filter中存在,将其加入/删除到iter->hash中),
  • 在close的时候使用新的hash表iter->hash来更新global_ops。根据最新hash表的内容,更新_mcount插桩点,遍历全部ftrace_rec:
    • ref_cnt大于0的插桩点,更新成ftrace_caller()
    • ref_cnt等于0的插桩点,更新成nop。
c 复制代码
ftrace_filter_write() -> ftrace_regex_write() -> ftrace_process_regex()
/*
 * Process one string written to a filter file. Without a ':' in @buff the
 * string is a plain filter and is matched into iter->hash; otherwise it is
 * treated as a command (elided in this excerpt).
 * Returns 0 when a plain filter matched, -EINVAL when nothing matched, or a
 * negative error from ftrace_match_records().
 */
static int ftrace_process_regex(struct ftrace_iterator *iter,
                                char *buff, int len, int enable)
{
        struct ftrace_hash *hash = iter->hash;
        struct trace_array *tr = iter->ops->private;
        char *func, *command, *next = buff;
        struct ftrace_func_command *p;
        int ret = -EINVAL;

        func = strsep(&next, ":");
        if (!next) {
                // Parse the filter expression and store it in iter->hash; this path operates on global_ops
                ret = ftrace_match_records(hash, func, len);
                if (!ret)
                        ret = -EINVAL;
                if (ret < 0)
                        return ret;
                return 0;
        }

        /* command found */
        ......
 out_unlock:
        mutex_unlock(&ftrace_cmd_mutex);

        return ret;
}
2.2.3 function tracer的filter command

通过set_ftrace_filter设置function tracer的filter command。本质上向trace_probe_ops注册cmd,以及操作trace_probe_ops的filter_hash、notrace_hash。

虽然同样是操作set_ftrace_filter,但是配置filter和配置filter command是操作到不同的实体:

  • 配置filter:操作的是global_ops的filter_hash/notrace_hash的内容;
  • 配置filter command:是把command向trace_probe_ops注册,并且操作trace_probe_ops的filter_hash/notrace_hash的内容;

在配置filter command之前首先得注册command:

c 复制代码
init_function_trace() -> init_func_cmd_traceon()

/*
 * init_func_cmd_traceon - register the function tracer's filter commands.
 *
 * Adds the traceoff, traceon, stacktrace, dump and cpudump commands to the
 * ftrace_commands list. If one registration fails, the commands registered
 * before it are unregistered again in reverse order.
 *
 * Returns 0 on success, otherwise the error from register_ftrace_command().
 */
static int __init init_func_cmd_traceon(void)
{
        int ret;

        /* Each call adds one command to the ftrace_commands list. */
        ret = register_ftrace_command(&ftrace_traceoff_cmd);
        if (ret)
                return ret;
        ret = register_ftrace_command(&ftrace_traceon_cmd);
        if (ret)
                goto out_free_traceoff;
        ret = register_ftrace_command(&ftrace_stacktrace_cmd);
        if (ret)
                goto out_free_traceon;
        ret = register_ftrace_command(&ftrace_dump_cmd);
        if (ret)
                goto out_free_stacktrace;
        ret = register_ftrace_command(&ftrace_cpudump_cmd);
        if (ret)
                goto out_free_dump;

        return 0;

        /* Error unwinding: each label falls through to undo the earlier steps. */
 out_free_dump:
        unregister_ftrace_command(&ftrace_dump_cmd);
 out_free_stacktrace:
        unregister_ftrace_command(&ftrace_stacktrace_cmd);
 out_free_traceon:
        unregister_ftrace_command(&ftrace_traceon_cmd);
 out_free_traceoff:
        unregister_ftrace_command(&ftrace_traceoff_cmd);

        return ret;
}
c 复制代码
/* The "traceon" filter command; handled by ftrace_trace_onoff_callback(). */
static struct ftrace_func_command ftrace_traceon_cmd = {
        .name                   = "traceon",
        .func                   = ftrace_trace_onoff_callback,
};

以traceon command为例,继续分析上一节对set_ftrace_filter的文件操作:

c 复制代码
ftrace_filter_write() -> ftrace_regex_write() -> ftrace_process_regex()
/*
 * Process one string written to a filter file (command path). The text after
 * the first ':' names a command; the matching entry of ftrace_commands is
 * looked up under ftrace_cmd_mutex and its func callback is invoked.
 * Returns the callback's result, or -EINVAL when no command matches.
 */
static int ftrace_process_regex(struct ftrace_iterator *iter,
                                char *buff, int len, int enable)
{
        struct ftrace_hash *hash = iter->hash;
        struct trace_array *tr = iter->ops->private;
        char *func, *command, *next = buff;
        struct ftrace_func_command *p;
        int ret = -EINVAL;

        func = strsep(&next, ":");
        ......
        /* command found */
        command = strsep(&next, ":");
        // Filter-command setup; in the end this operates on trace_probe_ops
        mutex_lock(&ftrace_cmd_mutex);
        list_for_each_entry(p, &ftrace_commands, list) {
                if (strcmp(p->name, command) == 0) {
                        ret = p->func(tr, hash, func, command, next, enable);
                        goto out_unlock;
                }
        }
 out_unlock:
        mutex_unlock(&ftrace_cmd_mutex);

        return ret;
}
c 复制代码
/*
 * Handler shared by the traceon and traceoff commands: picks the probe ops
 * (the *_count_* variant when @param is present) and hands over to
 * ftrace_trace_probe_callback(). Returns -ENODEV when @tr is NULL.
 */
static int
ftrace_trace_onoff_callback(struct trace_array *tr, struct ftrace_hash *hash,
                            char *glob, char *cmd, char *param, int enable)
{
        struct ftrace_probe_ops *ops;

        if (!tr)
                return -ENODEV;

        /* we register both traceon and traceoff to this callback */
        if (strcmp(cmd, "traceon") == 0)
                ops = param ? &traceon_count_probe_ops : &traceon_probe_ops;
        else
                ops = param ? &traceoff_count_probe_ops : &traceoff_probe_ops;
        // Register the command with trace_probe_ops
        return ftrace_trace_probe_callback(tr, ops, hash, glob, cmd,
                                           param, enable);
}
c 复制代码
/*
 * Register or unregister a function probe described by a filter command.
 *
 * A glob starting with '!' unregisters the probe. Otherwise the optional
 * first ':'-separated field of param is parsed as a number and carried in
 * the probe's data pointer; when it is absent or empty the data pointer
 * stays at (void *)-1.
 *
 * Returns -EINVAL when enable is zero, the kstrtoul() error on a bad number,
 * a negative registration error, or 0 on success.
 */
static int
ftrace_trace_probe_callback(struct trace_array *tr,
                            struct ftrace_probe_ops *ops,
                            struct ftrace_hash *hash, char *glob,
                            char *cmd, char *param, int enable)
{
        void *count = (void *)-1;
        char *field;
        int rc;

        /* hash funcs only work with set_ftrace_filter */
        if (!enable)
                return -EINVAL;

        /* A leading '!' removes a previously registered filter command. */
        if (glob[0] == '!')
                return unregister_ftrace_function_probe_func(glob+1, tr, ops);

        if (param) {
                field = strsep(&param, ":");
                if (field[0] != '\0') {
                        /* The parsed number is stored directly in the pointer. */
                        rc = kstrtoul(field, 0, (unsigned long *)&count);
                        if (rc)
                                return rc;
                }
        }

        rc = register_ftrace_function_probe(glob, tr, ops, count);
        if (rc < 0)
                return rc;

        return 0;
}
c 复制代码
/*
 * Attach probe_ops to every function matching glob for trace array tr.
 *
 * The visible part of the body creates a ftrace_func_probe when needed,
 * copies that probe's filter hash, adds the matches for glob to the copy,
 * installs the copy, and starts the probe's ftrace_ops if it is not yet
 * enabled. Several sections are elided ("......") in this excerpt,
 * including the final return and the err_unlock label.
 */
int
register_ftrace_function_probe(char *glob, struct trace_array *tr,
                               struct ftrace_probe_ops *probe_ops,
                               void *data)
{
        struct ftrace_func_entry *entry;
        struct ftrace_func_probe *probe;
        struct ftrace_hash **orig_hash;
        struct ftrace_hash *old_hash;
        struct ftrace_hash *hash;
        int count = 0;
        int size;
        int ret;
        int i;

        ......
        if (&probe->list == &tr->func_probes) {
                // NOTE(review): in this excerpt the kzalloc() result is
                // dereferenced without a NULL check - confirm against the
                // full source.
                probe = kzalloc(sizeof(*probe), GFP_KERNEL);
                probe->probe_ops = probe_ops;
                // Callback invoked through this ops when a traced site fires.
                probe->ops.func = function_trace_probe_call;
                probe->tr = tr;
                ftrace_ops_init(&probe->ops);
                list_add(&probe->list, &tr->func_probes);
        }

        mutex_lock(&probe->ops.func_hash->regex_lock);
        // Copy the probe ops' current filter_hash into a temporary hash.
        orig_hash = &probe->ops.func_hash->filter_hash;
        old_hash = *orig_hash;
        hash = alloc_and_copy_ftrace_hash(FTRACE_HASH_DEFAULT_BITS, old_hash);

        // Add the functions matching glob to the temporary hash
        // (presumably by walking the ftrace records - confirm in
        // ftrace_match_records()).
        ret = ftrace_match_records(hash, glob, strlen(glob));

        ......
        // Install the temporary hash as the probe ops' filter_hash.
        ret = ftrace_hash_move_and_update_ops(&probe->ops, orig_hash,
                                              hash, 1);
        if (ret < 0)
                goto err_unlock;

        /* One ref for each new function traced */
        probe->ref += count;

        if (!(probe->ops.flags & FTRACE_OPS_FL_ENABLED))
                // Start the probe ops only if it is not enabled yet.
                ret = ftrace_startup(&probe->ops, 0);
        ......
}

trace_probe_ops在被调用的时候,执行ftrace_func_hash中的filter command:

c 复制代码
ftrace_caller() -> ftrace_ops_list_func() -> __ftrace_ops_list_func()

/*
 * Walk ftrace_ops_list for one traced call and invoke the callback of every
 * ops whose filter accepts ip.
 *
 * @ip:        passed to ftrace_ops_test() and to each callback
 * @parent_ip: passed through to each callback
 * @ignored:   unused
 * @regs:      passed to ftrace_ops_test() and to each callback
 */
static inline void
__ftrace_ops_list_func(unsigned long ip, unsigned long parent_ip,
                       struct ftrace_ops *ignored, struct pt_regs *regs)
{
        struct ftrace_ops *op;
        int bit;

        bit = trace_test_and_set_recursion(TRACE_LIST_START, TRACE_LIST_MAX);
        /*
         * A negative result means no recursion bit could be taken, i.e. we
         * are already inside this handler in the current context. Bail out
         * rather than recursing and later clearing a bit we never set.
         */
        if (bit < 0)
                return;

        preempt_disable_notrace();

        do_for_each_ftrace_op(op, ftrace_ops_list) {
                /*
                 * Call the ops only if it does not require RCU (or RCU is
                 * watching) and its hashes accept this ip.
                 */
                if ((!(op->flags & FTRACE_OPS_FL_RCU) || rcu_is_watching()) &&
                    ftrace_ops_test(op, ip, regs)) {
                        /* For filter-command probes: function_trace_probe_call() */
                        op->func(ip, parent_ip, op, regs);
                }
        } while_for_each_ftrace_op(op);

        preempt_enable_notrace();
        trace_clear_recursion(bit);
}
c 复制代码
/*
 * ftrace_ops callback installed for filter-command probes.
 *
 * Recovers the ftrace_func_probe that embeds op and calls its probe_ops
 * handler with preemption disabled. For "traceon" that handler is the .func
 * of traceon_probe_ops or traceon_count_probe_ops.
 */
static void function_trace_probe_call(unsigned long ip, unsigned long parent_ip,
                                      struct ftrace_ops *op, struct pt_regs *pt_regs)
{
        struct ftrace_func_probe *owner =
                container_of(op, struct ftrace_func_probe, ops);
        struct ftrace_probe_ops *handler = owner->probe_ops;

        preempt_disable_notrace();
        handler->func(ip, parent_ip, owner->tr, handler, owner->data);
        preempt_enable_notrace();
}

static struct ftrace_probe_ops traceon_count_probe_ops = {
        .func                   = ftrace_traceon_count, //根据指定的计数值来控制追踪的启用或禁用
        .print                  = ftrace_traceon_print,
        .init                   = ftrace_count_init,
        .free                   = ftrace_count_free,
};

static struct ftrace_probe_ops traceon_probe_ops = {
        .func                   = ftrace_traceon, //enable tracing buffers
        .print                  = ftrace_traceon_print,
};

/*
 * "traceon" probe handler: turn tracing on for tr unless it is already on.
 * ip, parent_ip, ops and data are not used.
 */
static void
ftrace_traceon(unsigned long ip, unsigned long parent_ip,
               struct trace_array *tr, struct ftrace_probe_ops *ops,
               void *data)
{
        if (!tracer_tracing_is_on(tr))
                tracer_tracing_on(tr);
}
/*
 * Enable recording for trace array tr: switch its ring buffer on (when one
 * is set) and clear the buffer_disabled flag.
 */
void tracer_tracing_on(struct trace_array *tr)
{
        /* The buffer pointer may be unset; only then is the call skipped. */
        if (tr->trace_buffer.buffer)
                ring_buffer_record_on(tr->trace_buffer.buffer);
        tr->buffer_disabled = 0;
        /* Make the flag seen by readers */
        smp_wmb();
}
2.2.4 function tracer的数据存入
c 复制代码
ftrace_caller() -> ftrace_ops_list_func() -> __ftrace_ops_list_func() -> global_ops->func() -> function_trace_call() -> trace_function()

/*
 * Record one function-trace event (ip and parent_ip) into tr's ring buffer.
 *
 * Reserves a TRACE_FN event, fills in the ftrace_entry payload and commits
 * it unless call_filter_check_discard() reports it as discarded. Returns
 * silently when no event could be reserved.
 */
void
trace_function(struct trace_array *tr,
               unsigned long ip, unsigned long parent_ip, unsigned long flags,
               int pc)
{
        struct trace_event_call *call = &event_function;
        struct ring_buffer *buffer = tr->trace_buffer.buffer;
        struct ring_buffer_event *event;
        struct ftrace_entry *entry;
        // Reserve space for one entry in the ring buffer.
        event = __trace_buffer_lock_reserve(buffer, TRACE_FN, sizeof(*entry),
                                            flags, pc);
        if (!event)
                return;
        entry   = ring_buffer_event_data(event);
        entry->ip                       = ip;
        entry->parent_ip                = parent_ip;
        // The function tracer's own payload is just ip and parent_ip;
        // commit it unless the event filter discards it.
        if (!call_filter_check_discard(call, entry, buffer, event)) {
                if (static_branch_unlikely(&ftrace_exports_enabled))
                        ftrace_exports(event);
                __buffer_unlock_commit(buffer, event);
        }
}

function tracer自定义的trace数据非常简单:ip、parent_ip

c 复制代码
struct ring_buffer_event {
        u32             type_len:5, time_delta:27;
        u32             array[];
};

/*
 * The trace entry - the most basic unit of tracing. This is what
 * is printed in the end as a single line in the trace output, such as:
 *
 *     bash-15816 [01]   235.197585: idle_cpu <- irq_enter
 */
struct trace_entry {
        unsigned short          type;
        unsigned char           flags;
        unsigned char           preempt_count;
        int                     pid;
};
 
#define FTRACE_ENTRY(name, struct_name, id, tstruct, print, filter)     \
        struct struct_name {                                            \
                struct trace_entry      ent;                            \
                tstruct                                                 \
        }

#undef FTRACE_ENTRY_REG
#define FTRACE_ENTRY_REG(name, struct_name, id, tstruct, print, \
                         filter, regfn) \
        FTRACE_ENTRY(name, struct_name, id, PARAMS(tstruct), PARAMS(print), \
                     filter)

FTRACE_ENTRY_REG(function, ftrace_entry,
        TRACE_FN,
        F_STRUCT(
                __field(        unsigned long,  ip              )
                __field(        unsigned long,  parent_ip       )
        ),
        F_printk(" %lx <-- %lx", __entry->ip, __entry->parent_ip),
        FILTER_TRACE_FN,
        perf_ftrace_event_register
);
c 复制代码
struct ftrace_entry {
        struct trace_entry      ent;
        unsigned long  ip;
        unsigned long  parent_ip;
};
2.2.5 function tracer的数据读出

从trace文件读出的function tracer默认数据格式为:

kernel/trace/trace_output.c文件中,注册了系统默认的几种trace_event。function tracer使用TRACE_FN类型的trace_fn_event:

c 复制代码
static struct trace_event_functions trace_fn_funcs = {
        .trace          = trace_fn_trace,
        .raw            = trace_fn_raw,
        .hex            = trace_fn_hex,
        .binary         = trace_fn_bin,
};

static struct trace_event trace_fn_event = {
        .type           = TRACE_FN,
        .funcs          = &trace_fn_funcs,
};

static struct trace_event *events[] __initdata = {
        &trace_fn_event,
        &trace_ctx_event,
        &trace_wake_event,
        &trace_stack_event,
        &trace_user_stack_event,
        &trace_bputs_event,
        &trace_bprint_event,
        &trace_print_event,
        &trace_hwlat_event,
        &trace_raw_data_event,
        NULL
};

__init static int init_events(void)
{
        struct trace_event *event;
        int i, ret;

        for (i = 0; events[i]; i++) {
                event = events[i];

                ret = register_trace_event(event);
                if (!ret) {
                        printk(KERN_WARNING "event %d failed to register\n",
                               event->type);
                        WARN_ON_ONCE(1);
                }
        }

        return 0;
}

在数据读出时,会调用到event对应的event->funcs->trace()函数,seq_read() -> s_show() -> print_trace_line() -> print_trace_fmt() -> event->funcs->trace(): 由上可知,TRACE_FN的type,event->funcs->trace()对应trace_fn_trace()。

c 复制代码
/*
 * Text formatter for TRACE_FN events: prints the symbol for the traced ip
 * and, when TRACE_ITER_PRINT_PARENT is set in flags and a parent ip is
 * recorded, " <-" followed by the parent's symbol, then a newline.
 */
static enum print_line_t trace_fn_trace(struct trace_iterator *iter, int flags,
                                        struct trace_event *event)
{
        struct trace_seq *out = &iter->seq;
        struct ftrace_entry *rec;

        trace_assign_type(rec, iter->ent);

        /* Symbol of the traced function itself. */
        seq_print_ip_sym(out, rec->ip, flags);

        /* Parent symbol only if the trace option allows it and it is known. */
        if (flags & TRACE_ITER_PRINT_PARENT) {
                if (rec->parent_ip) {
                        trace_seq_puts(out, " <-");
                        seq_print_ip_sym(out, rec->parent_ip, flags);
                }
        }

        trace_seq_putc(out, '\n');

        return trace_handle_return(out);
}
2.2.6 function tracer总结
  • tracer的使能:当使用echo xxx_tracer > current_tracer时,会关闭旧的current tracer并使能新的tracer。典型的包括function tracer和function_graph tracer
  • filter的配置:使用echo function_name > set_ftrace_filter/set_ftrace_notrace,可以配置部分function被trace,而不是所有function被trace
  • filter command的配置:使用echo '!__schedule_bug:traceoff' > set_ftrace_filter,类似命令可以配置条件触发的command,当条件满足后command会被执行

3、function graph tracer

function_graph tracer由function tracer发展而来,function tracer使用_mcount插桩可以跟踪到每个函数的调用入口,而function_graph tracer既可以跟踪到函数的入口,还可以跟踪到函数的返回。

一切的关键是在入口桩函数被调用时,修改了func()的返回地址,不是返回到func's parent()函数继续去执行,而是返回到return桩函数return_to_handler()中。return_to_handler()中执行完自己的return处理函数以后,再把返回地址恢复成func's parent中的地址,返回继续执行原有的路径。

原本的入口处插桩,只能追踪到函数的切换。现在入口、出口同时插桩,还能获得函数的执行时长,做更多的分析。

3.1 function graph tracer注册

c 复制代码
/*
 * Early init for the function_graph tracer: computes the print width needed
 * for the largest CPU id, registers the graph entry and return trace events,
 * then registers the tracer itself.
 *
 * Returns 1 if either event fails to register, otherwise the result of
 * register_tracer().
 */
static __init int init_graph_trace(void)
{
        max_bytes_for_cpu = snprintf(NULL, 0, "%u", nr_cpu_ids - 1);

        /* Short-circuit: the return event is not tried if the entry event fails. */
        if (!register_trace_event(&graph_trace_entry_event) ||
            !register_trace_event(&graph_trace_ret_event)) {
                pr_warn("Warning: could not register graph trace events\n");
                return 1;
        }

        /* Register the function graph tracer. */
        return register_tracer(&graph_trace);
}
core_initcall(init_graph_trace);

static struct tracer graph_trace __tracer_data = {
        .name           = "function_graph",
        .update_thresh  = graph_trace_update_thresh,
        .open           = graph_trace_open,
        .pipe_open      = graph_trace_open,
        .close          = graph_trace_close,
        .pipe_close     = graph_trace_close,
        .init           = graph_trace_init,
        .reset          = graph_trace_reset,
        .print_line     = print_graph_function,
        .print_header   = print_graph_headers,
        .flags          = &tracer_flags,
        .set_flag       = func_graph_set_flag,
#ifdef CONFIG_FTRACE_SELFTEST
        .selftest       = trace_selftest_startup_function_graph,
#endif
};

3.2 function graph tracer使能

3.2.1 function graph tracer的使能

如2.2节 function tracer使能可知,可以使用echo function_graph > current_tracer命令来使能或者切换tracer。

c 复制代码
static int graph_trace_init(struct trace_array *tr)
{
        int ret;

        set_graph_array(tr);
        if (tracing_thresh)
                ret = register_ftrace_graph(&funcgraph_thresh_ops);
        else
                ret = register_ftrace_graph(&funcgraph_ops);
        if (ret)
                return ret;
        tracing_start_cmdline_record();

        return 0;
}

static struct fgraph_ops funcgraph_thresh_ops = {
        .entryfunc = &trace_graph_entry,
        .retfunc = &trace_graph_thresh_return,
};

static struct fgraph_ops funcgraph_ops = {
        .entryfunc = &trace_graph_entry,
        .retfunc = &trace_graph_return,
};

/*
 * Activate function-graph tracing with the entry/return callbacks in gops.
 *
 * The visible part bumps ftrace_graph_active, starts graph tracing, points
 * the global entry/return hooks at gops, and starts graph_ops with
 * FTRACE_START_FUNC_RET. Part of the body is elided ("......") in this
 * excerpt, including the matching lock for the unlock at the end.
 */
int register_ftrace_graph(struct fgraph_ops *gops)
{
        int ret = 0;

        ......
        ftrace_graph_active++;
        // NOTE(review): in this excerpt the start_graph_tracing() result is
        // overwritten below without being checked - confirm against the
        // full source.
        ret = start_graph_tracing();
        
        // Point the ftrace_graph_return / ftrace_graph_entry hooks at gops.
        ftrace_graph_return = gops->retfunc;

        __ftrace_graph_entry = gops->entryfunc;
        ftrace_graph_entry = ftrace_graph_entry_test;
        update_function_graph_func();
        // Start graph_ops. Per the accompanying article this:
        // 1. adds graph_ops to the ftrace_ops_list;
        // 2. updates the _mcount call sites according to graph_ops' hash;
        // 3. patches the ftrace_graph_call site to ftrace_graph_caller().
        ret = ftrace_startup(&graph_ops, FTRACE_START_FUNC_RET);
out:
        mutex_unlock(&ftrace_lock);
        return ret;
}
3.2.2 function graph tracer的filter

因为function_graph tracer的graph_ops继续共用global_ops的hash表,所以可以继续使用set_ftrace_filter/set_ftrace_notrace来配置function_graph tracer的filter。

function_graph tracer还可以使用set_graph_function/set_graph_notrace接口来配置过滤,需要两种过滤条件都满足的函数才能被trace。实际上是配置到ftrace_graph_funcs[]/ftrace_graph_notrace_funcs[]表中。

c 复制代码
static const struct file_operations ftrace_graph_fops = {
        .open           = ftrace_graph_open,
        .read           = seq_read,
        .write          = ftrace_graph_write,
        .llseek         = tracing_lseek,
        .release        = ftrace_graph_release,
};

static const struct file_operations ftrace_graph_notrace_fops = {
        .open           = ftrace_graph_notrace_open,
        .read           = seq_read,
        .write          = ftrace_graph_write,
        .llseek         = tracing_lseek,
        .release        = ftrace_graph_release,
};

tracer_init_tracefs() -> ftrace_init_tracefs_toplevel() -> ftrace_init_dyn_tracefs()

static __init int ftrace_init_dyn_tracefs(struct dentry *d_tracer)
{
        ......
#ifdef CONFIG_FUNCTION_GRAPH_TRACER
        trace_create_file("set_graph_function", 0644, d_tracer,
                                    NULL,
                                    &ftrace_graph_fops);
        trace_create_file("set_graph_notrace", 0644, d_tracer,
                                    NULL,
                                    &ftrace_graph_notrace_fops);
#endif /* CONFIG_FUNCTION_GRAPH_TRACER */
        return 0;
}

static ssize_t
ftrace_graph_write(struct file *file, const char __user *ubuf,
                   size_t cnt, loff_t *ppos)
{
        ssize_t read, ret = 0;
        struct ftrace_graph_data *fgd = file->private_data;
        struct trace_parser *parser;

        ......
        if (read >= 0 && trace_parser_loaded(parser) &&
            !trace_parser_cont(parser)) {
                //根据filter设置条件hash表
                ret = ftrace_graph_set_hash(fgd->new_hash,
                                            parser->buffer);
                trace_parser_clear(parser);
        }
        ......
        return ret;
}

trace_graph_entry()中会进行filter的合法性判断,函数必须是set_graph_function/set_graph_notrace配置中允许的函数或者是被它们调用的子函数才能继续执行,否则出错返回,后面的trace_graph_return()都不会被执行。

c 复制代码
/*
 * Entry callback of the function_graph tracer.
 *
 * Applies the notrace / task / graph-function / irq filters and, when the
 * function passes them, records an entry event. In the visible code a
 * return value of 0 is used on the filtered-out paths and 1 on the notrace
 * paths. Part of the body is elided ("......") in this excerpt.
 */
int trace_graph_entry(struct ftrace_graph_ent *trace)
{
        struct trace_array *tr = graph_array;
        struct trace_array_cpu *data;
        unsigned long flags;
        long disabled;
        int ret;
        int cpu;
        int pc;

        if (trace_recursion_test(TRACE_GRAPH_NOTRACE_BIT))
                return 0;
        // func is in the notrace hash: set the NOTRACE bit and return 1.
        if (ftrace_graph_notrace_addr(trace->func)) {
                trace_recursion_set(TRACE_GRAPH_NOTRACE_BIT);
                return 1;
        }

        if (!ftrace_trace_task(tr))
                return 0;
        // Per the accompanying article: keep tracing only when func is in
        // the graph-function hash or is called from such a function.
        if (ftrace_graph_ignore_func(trace))
                return 0;

        if (ftrace_graph_ignore_irqs())
                return 0;

        //Do not trace a function if it's filtered by set_graph_notrace.
        // NOTE(review): ftrace_graph_notrace_addr() was already tested above;
        // this second test looks redundant here - confirm against the full source.
        if (ftrace_graph_notrace_addr(trace->func))
                return 1;
        ......
        if (likely(disabled == 1)) {
                pc = preempt_count();
                // Store the trace data in the ring buffer.
                ret = __trace_graph_entry(tr, trace, flags, pc);
        } else {
                ret = 0;
        }
        atomic_dec(&data->disabled);
        local_irq_restore(flags);

        return ret;
}
3.2.3 function graph tracer的数据写入

和function tracer不一样的是,function_graph在进入函数和返回函数时都有trace数据存入。

c 复制代码
/* Function call entry */
FTRACE_ENTRY_PACKED(funcgraph_entry, ftrace_graph_ent_entry,

        TRACE_GRAPH_ENT,

        F_STRUCT(
                __field_struct( struct ftrace_graph_ent,        graph_ent       )
                __field_desc(   unsigned long,  graph_ent,      func            )
                __field_desc(   int,            graph_ent,      depth           )
        ),

        F_printk("--> %lx (%d)", __entry->func, __entry->depth),

        FILTER_OTHER
);

/* Function return entry */
FTRACE_ENTRY_PACKED(funcgraph_exit, ftrace_graph_ret_entry,

        TRACE_GRAPH_RET,

        F_STRUCT(
                __field_struct( struct ftrace_graph_ret,        ret     )
                __field_desc(   unsigned long,  ret,            func    )
                __field_desc(   unsigned long long, ret,        calltime)
                __field_desc(   unsigned long long, ret,        rettime )
                __field_desc(   unsigned long,  ret,            overrun )
                __field_desc(   int,            ret,            depth   )
        ),

        F_printk("<-- %lx (%d) (start: %llx  end: %llx) over: %d",
                 __entry->func, __entry->depth,
                 __entry->calltime, __entry->rettime,
                 __entry->depth),

        FILTER_OTHER
);
c 复制代码
trace_graph_entry() -> __trace_graph_entry()
/*
 * Write one function-graph entry event for *trace into tr's ring buffer.
 *
 * Returns 0 when no event could be reserved, 1 otherwise (including when
 * the event filter discards the record).
 */
int __trace_graph_entry(struct trace_array *tr,
                                struct ftrace_graph_ent *trace,
                                unsigned long flags,
                                int pc)
{
        struct ring_buffer *rb = tr->trace_buffer.buffer;
        struct ftrace_graph_ent_entry *rec;
        struct ring_buffer_event *ev;

        ev = trace_buffer_lock_reserve(rb, TRACE_GRAPH_ENT,
                                       sizeof(*rec), flags, pc);
        if (!ev)
                return 0;

        rec = ring_buffer_event_data(ev);
        rec->graph_ent = *trace;

        /* Commit only when the filter did not discard the record. */
        if (call_filter_check_discard(&event_funcgraph_entry, rec, rb, ev))
                return 1;

        trace_buffer_unlock_commit_nostack(rb, ev);
        return 1;
}
c 复制代码
trace_graph_return() -> __trace_graph_return()
/*
 * Return callback of the function_graph tracer.
 *
 * If the NOTRACE recursion bit is set it is cleared and nothing is
 * recorded. Otherwise a return event is written with local interrupts
 * disabled, and only when this CPU's disabled counter reads 1 after the
 * increment.
 */
void trace_graph_return(struct ftrace_graph_ret *trace)
{
        struct trace_array *tr = graph_array;
        struct trace_array_cpu *data;
        unsigned long flags;
        long disabled;
        int cpu;
        int pc;

        ftrace_graph_addr_finish(trace);

        /* Matches the trace_recursion_set() done in trace_graph_entry(). */
        if (trace_recursion_test(TRACE_GRAPH_NOTRACE_BIT)) {
                trace_recursion_clear(TRACE_GRAPH_NOTRACE_BIT);
                return;
        }

        local_irq_save(flags);
        cpu = raw_smp_processor_id();
        data = per_cpu_ptr(tr->trace_buffer.data, cpu);
        disabled = atomic_inc_return(&data->disabled);
        /* Record only when the counter was 0 before the increment above. */
        if (likely(disabled == 1)) {
                pc = preempt_count();
                __trace_graph_return(tr, trace, flags, pc);
        }
        atomic_dec(&data->disabled);
        local_irq_restore(flags);
}
/*
 * Write one function-graph return event for *trace into tr's ring buffer.
 * Does nothing when no event could be reserved; the event is committed
 * unless the event filter discards it.
 */
void __trace_graph_return(struct trace_array *tr,
                                struct ftrace_graph_ret *trace,
                                unsigned long flags,
                                int pc)
{
        struct ring_buffer *rb = tr->trace_buffer.buffer;
        struct ftrace_graph_ret_entry *rec;
        struct ring_buffer_event *ev;

        ev = trace_buffer_lock_reserve(rb, TRACE_GRAPH_RET,
                                       sizeof(*rec), flags, pc);
        if (!ev)
                return;

        rec = ring_buffer_event_data(ev);
        rec->ret = *trace;

        if (call_filter_check_discard(&event_funcgraph_exit, rec, rb, ev))
                return;

        trace_buffer_unlock_commit_nostack(rb, ev);
}

那么目标函数执行时,是怎么执行到trace_graph_entry与trace_graph_return的呢:

c 复制代码
ftrace_startup() -> ftrace_startup_enable() -> ftrace_run_update_code() -> arch_ftrace_update_code() -> 
ftrace_modify_all_code() -> ftrace_enable_ftrace_graph_caller() -> ftrace_modify_graph_caller()

/*
 * Patch the ftrace_graph_call site inside ftrace_caller(): when @enable is
 * true the nop there is replaced with a branch to ftrace_graph_caller(),
 * otherwise the branch is replaced with a nop.
 *
 * Returns the result of ftrace_modify_code().
 */
static int ftrace_modify_graph_caller(bool enable)
{
        unsigned long site = (unsigned long)&ftrace_graph_call;
        u32 branch_insn, nop_insn;
        u32 expected, replacement;

        /* Non-linking branch from the patch site to ftrace_graph_caller. */
        branch_insn = aarch64_insn_gen_branch_imm(site,
                                                  (unsigned long)ftrace_graph_caller,
                                                  AARCH64_INSN_BRANCH_NOLINK);
        nop_insn = aarch64_insn_gen_nop();

        if (enable) {
                expected = nop_insn;
                replacement = branch_insn;
        } else {
                expected = branch_insn;
                replacement = nop_insn;
        }

        return ftrace_modify_code(site, expected, replacement, true);
}

ftrace_graph_entry的执行:

c 复制代码
//arch/arm64/kernel/entry-ftrace.S
ENTRY(ftrace_graph_caller)
       mcount_get_lr_addr      x0      // pointer to function's saved lr
       mcount_get_pc           x1      // function's pc
       mcount_get_parent_fp    x2      // parent's fp
       bl      prepare_ftrace_return   // prepare_ftrace_return(&lr, pc, fp)

       mcount_exit
ENDPROC(ftrace_graph_caller)

/*
 * Hook the return path of a traced function for the function_graph tracer.
 *
 * ftrace_graph_caller invokes this as prepare_ftrace_return(&lr, pc, fp):
 * x0 holds the pointer to the saved lr, x1 the function's pc and x2 the
 * parent's fp, so the parameters must be declared in that order.
 *
 * @parent:        location of the traced function's saved return address
 * @self_addr:     pc of the traced function
 * @frame_pointer: parent's frame pointer, forwarded to function_graph_enter()
 */
void prepare_ftrace_return(unsigned long *parent, unsigned long self_addr,
                           unsigned long frame_pointer)
{
        unsigned long return_hooker = (unsigned long)&return_to_handler;
        unsigned long old;

        /* Graph tracing is paused for this task: leave the return address alone. */
        if (unlikely(atomic_read(&current->tracing_graph_pause)))
                return;
        old = *parent;

        /*
         * On success, redirect the function's return to return_to_handler.
         * The original return address (old) is handed to
         * function_graph_enter() so it can be returned to afterwards.
         */
        if (!function_graph_enter(old, self_addr, frame_pointer, NULL))
                *parent = return_hooker;
}

/*
 * Record entry into a traced function for the function_graph tracer.
 *
 * Increments the task's curr_ret_depth, pushes the return information via
 * ftrace_push_return_trace() and calls the ftrace_graph_entry hook.
 *
 * Returns 0 when the push succeeded and the hook returned non-zero.
 * Otherwise the depth (and, if the push had succeeded, curr_ret_stack) is
 * decremented again and -EBUSY is returned.
 */
int function_graph_enter(unsigned long ret, unsigned long func,
                         unsigned long frame_pointer, unsigned long *retp)
{
        struct ftrace_graph_ent trace;

        trace.func = func;
        trace.depth = ++current->curr_ret_depth;

        if (!ftrace_push_return_trace(ret, func, frame_pointer, retp)) {
                /* Only trace if the calling function expects to */
                if (ftrace_graph_entry(&trace))
                        return 0;

                /* Hook declined: undo the push done above. */
                current->curr_ret_stack--;
        }

        current->curr_ret_depth--;
        return -EBUSY;
}

trace_graph_return的执行(注意:下面的return_to_handler汇编取自x86_32的实现,arm64对应的实现在arch/arm64/kernel/entry-ftrace.S中,处理流程相同):

c 复制代码
.globl return_to_handler
return_to_handler:
        pushl   %eax
        pushl   %edx
#ifdef CC_USING_FENTRY
        movl    $0, %eax
#else
        movl    %ebp, %eax
#endif
        call    ftrace_return_to_handler
        movl    %eax, %ecx
        popl    %edx
        popl    %eax
        JMP_NOSPEC %ecx

/*
 * Called from the return_to_handler trampoline when a traced function
 * returns.
 *
 * Pops the saved return information, stamps the return time, calls the
 * ftrace_graph_return hook and returns the address that return_to_handler
 * should jump to. If the popped address is 0, graph tracing is stopped and
 * the address of panic() is returned instead.
 */
unsigned long ftrace_return_to_handler(unsigned long frame_pointer)
{
        struct ftrace_graph_ret trace;
        unsigned long ret;
        
        ftrace_pop_return_trace(&trace, &ret, frame_pointer);
        trace.rettime = trace_clock_local();
        // Per the accompanying article, ftrace_graph_return points at
        // trace_graph_return here.
        ftrace_graph_return(&trace);
        /* Compiler barrier between the hook call and the decrement below. */
        barrier();
        current->curr_ret_stack--;
        if (unlikely(!ret)) {
                ftrace_graph_stop();
                WARN_ON(1);
                /* Might as well panic. What else to do? */
                ret = (unsigned long)panic;
        }
        
        return ret; 
}
3.2.4 function graph tracer的数据读出

从trace文件读出的function_graph tracer默认数据格式为:(显示function_graph中设置的函数或者是被它们调用的子函数)

缺一个图!!!

3.2.5 function graph tracer总结

4、irqsoff tracer

irqsoff tracer用来追踪最大关中断时间。它的trace会提供几部分信息:

1、irqoff的最大时长:latency;

2、在最大irqoff这期间所有的function trace信息;

3、最后的irqon的函数回调信息;

irqsoff tracer的插桩方法,是直接在local_irq_enable()、local_irq_disable()中直接插入钩子函数trace_hardirqs_on()、trace_hardirqs_off()。

5、参考文档

pwl999 的tracer (function、function_graph、irq_off)

相关推荐
何陈陈4 分钟前
【Linux】线程池
linux·服务器·开发语言·c++
S hh5 分钟前
【Linux 】文件描述符fd、重定向、缓冲区(超详解)
linux·运维·服务器
DuoRuaiMiFa6 分钟前
Linux系统性能调优实战指南
linux
憧憬一下8 分钟前
线程池的实现和讲解:解决多线程并发服务器创建销毁线程消耗过大的问题
linux·线程池·c/c++·嵌入式linux
朝九晚五ฺ1 小时前
【Linux探索学习】第三弹——Linux的基础指令(下)——开启新篇章的大门
linux·运维·学习
肥or胖1 小时前
【MySQL】索引
linux·数据库·mysql
卓琢2 小时前
(九)Shell 脚本(四):正则表达式、sed 和 awk 详解
linux·mysql·正则表达式
一切皆是定数3 小时前
Linux驱动开发——LED驱动开发
linux·驱动开发·b树
小小不董3 小时前
图文深入理解Oracle DB Scheduler
linux·运维·服务器·数据库·oracle
不烦下雨c4 小时前
[网络]抓包工具介绍 tcpdump
linux·tcpdump