1、ftrace的静动态实现
ftrace是function trace的简称,但是function trace只是ftrace里面的一个基本功能。function trace是基于gcc的-pg
参数插入mcount
函数实现的,这部分依赖于处理器架构。mcount
在不同处理器的gcc
实现名字可能略有差异,可能是mcount
,_mcount
或者__mcount
,arm64中是_mcount
,本文统一使用mcount。
arm64的实现主要有以下三个文件:
c
arch/arm64/kernel/entry-ftrace.S // mcount的核心实现
arch/arm64/kernel/ftrace.c // CONFIG_DYNAMIC_FTRACE 的支持接口
arch/arm64/include/asm/ftrace.h // 声明和定义arm64为ftrace核心模块提供的接口
mcount的主要任务是根据是否打开了ftrace相关的宏,在每个内核函数入口加入trace代码,跳转到对应的注册函数中,做进一步处理。但是,在每个内核函数入口加入trace代码,必然影响内核的性能,为了减小对内核性能的影响,ftrace支持动态trace功能。下面来一一介绍这两种trace功能是如何实现的。
1.1 静态mcount实现
Kernel中打开CONFIG_FUNCTION_TRACER
后,会增加-pg
编译选项,这样在每个函数入口处都会插入bl mcount
跳转指令,函数运行时会进入mcount
函数。mcount
会判断函数指针ftrace_trace_function
是否被注册,默认注册的是空函数ftrace_stub
,只有打开function tracer后才会注册具体的处理函数ftrace_trace_function
(插个眼,具体ftrace_trace_function如何赋值,后面分析)。
c
//代码在kernel源码目录的Makefile:
ifdef CONFIG_FUNCTION_TRACER
CC_FLAGS_FTRACE := -pg
endif
下图是静态ftrace中mcount调用的function trace和function graph trace的实现流程与代码实现:
c
//代码在arch/arm64/kernel/entry-ftrace.S
//没有定义CONFIG_DYNAMIC_FTRACE,即静态mcount
#ifndef CONFIG_DYNAMIC_FTRACE
/*
* void _mcount(unsigned long return_address)
* @return_address: return address to instrumented function
*/
ENTRY(_mcount)
mcount_enter
ldr_l x2, ftrace_trace_function // 只有打开function tracer后才会注册具体的处理函数ftrace_trace_function
adr x0, ftrace_stub // ftrace_stub是空函数
cmp x0, x2 // 比较x0与x2
b.eq skip_ftrace_call // 若x0 == x2,则跳转到skip_ftrace_call
mcount_get_pc x0 // function's pc
mcount_get_lr x1 // function's lr (= parent's pc)
blr x2 // 若x0 != x2, 则 (*ftrace_trace_function)(pc, lr);
skip_ftrace_call:
#ifdef CONFIG_FUNCTION_GRAPH_TRACER
ldr_l x2, ftrace_graph_return // 加载ftrace_graph_return到x2
cmp x0, x2 // 比较x0与x2
b.ne ftrace_graph_caller // 若x0 != x2,则跳转到ftrace_graph_caller()
ldr_l x2, ftrace_graph_entry // 加载ftrace_graph_entry到x2
adr_l x0, ftrace_graph_entry_stub // 加载ftrace_graph_entry_stub到x0
cmp x0, x2 // 比较x0与x2
b.ne ftrace_graph_caller // 若x0 != x2,则跳转到ftrace_graph_caller();
#endif /* CONFIG_FUNCTION_GRAPH_TRACER */
mcount_exit
ENDPROC(_mcount)
#else /* CONFIG_DYNAMIC_FTRACE */
......
#endif /* CONFIG_DYNAMIC_FTRACE */
1.2 动态mcount实现
static ftrace一旦使能,对kernel中所有的函数(除开notrace、inline、其他特殊函数)进行插桩,这带来的性能开销是惊人的,有可能导致人们弃用ftrace功能。
为了解决这个问题,内核开发者推出了dynamic ftrace,因为实际上调用者一般不需要对所有函数进行追踪,只会对感兴趣的一部分函数进行追踪。如果配置了CONFIG_DYNAMIC_FTRACE
, mcount
会被实现成一个空函数(只有一条ret
指令)。在系统启动时ftrace_init()中,mcount会被替换成nop
指令。打开tracer后,需要跟踪的函数的对应位置会被动态替换成跳转到ftrace_caller()
的指令。
c
//代码在arch/arm64/kernel/entry-ftrace.S
//没有定义CONFIG_DYNAMIC_FTRACE,即静态mcount
#ifndef CONFIG_DYNAMIC_FTRACE
......
#else /* CONFIG_DYNAMIC_FTRACE */
//如果配置了CONFIG_DYNAMIC_FTRACE, mcount会被实现成一个空函数(只有一条ret指令)。
//在系统启动时,mcount会被替换成nop指令。打开tracer后,需要跟踪的函数的对应位置会被动态替换成跳转到ftrace_caller()的指令。
ENTRY(_mcount)
ret
ENDPROC(_mcount)
/*
* void ftrace_caller(unsigned long return_address)
* @return_address: return address to instrumented function
*/
ENTRY(ftrace_caller)
mcount_enter
mcount_get_pc0 x0 // function's pc
mcount_get_lr x1 // function's lr
GLOBAL(ftrace_call) // tracer(pc, lr); function tracer的回调函数
nop // This will be replaced with "bl xxx"
// where xxx can be any kind of tracer.
#ifdef CONFIG_FUNCTION_GRAPH_TRACER
GLOBAL(ftrace_graph_call) // ftrace_graph_caller(); function graph tracer的回调函数
nop // If enabled, this will be replaced
// "b ftrace_graph_caller"
#endif
mcount_exit
ENDPROC(ftrace_caller)
#endif /* CONFIG_DYNAMIC_FTRACE */
在编译的时候调用recordmcount.pl
搜索所有静态mcount实现中的bl _mcount
函数调用点,并且把所有的调用点地址保存到section __mcount_loc
,其定义在include/asm-generic/vmlinux.lds.h
,详细的见文件scripts/recordmcount.pl、scripts/recordmcount.c
。
在初始化时,遍历section __mcount_loc
的调用点地址,默认给所有bl _mcount
替换成nop
。
c
// kernel/trace/ftrace.c
void __init ftrace_init(void)
{
extern unsigned long __start_mcount_loc[];
extern unsigned long __stop_mcount_loc[];
unsigned long count, flags;
int ret;
local_irq_save(flags);
ret = ftrace_dyn_arch_init();
local_irq_restore(flags);
if (ret)
goto failed;
//计算__mcount_loc的大小
count = __stop_mcount_loc - __start_mcount_loc;
......
last_ftrace_enabled = ftrace_enabled = 1;
//遍历section __mcount_loc,处理其中保存的调用地址
ret = ftrace_process_locs(NULL,
__start_mcount_loc,
__stop_mcount_loc);
set_ftrace_early_filters();
return;
failed:
ftrace_disabled = 1;
}
c
//使用ftrace_page来表示每一个调用地址
//每一个调用地址保存在ftrace_page的records中
//index表示在records中的下标
struct ftrace_page {
struct ftrace_page *next;
struct dyn_ftrace *records;
int index;
int size;
};
struct dyn_ftrace {
unsigned long ip; /* address of mcount call-site */
unsigned long flags;
struct dyn_arch_ftrace arch;
};
c
static int ftrace_process_locs(struct module *mod,
unsigned long *start,
unsigned long *end)
{
struct ftrace_page *start_pg;
struct ftrace_page *pg;
struct dyn_ftrace *rec;
unsigned long count;
unsigned long *p;
unsigned long addr;
unsigned long flags = 0; /* Shut up gcc */
int ret = -ENOMEM;
count = end - start;
if (!count)
return 0;
//对地址进行排序
sort(start, count, sizeof(*start),
ftrace_cmp_ips, NULL);
//对每个地址分配新的ftrace_page结构来存储,在section __mcount_loc中,
//只是简单的存储了unsigned long类型的调用地址
//dyn_ftrace结构除了使用->ip来存储地址,还使用->flags来存储当前的状态和被引用计数
start_pg = ftrace_allocate_pages(count);
if (!start_pg)
return -ENOMEM;
mutex_lock(&ftrace_lock);
/*
* Core and each module needs their own pages, as
* modules will free them when they are removed.
* Force a new page to be allocated for modules.
*/
if (!mod) {
WARN_ON(ftrace_pages || ftrace_pages_start);
/* First initialization */
ftrace_pages = ftrace_pages_start = start_pg;
} else {
if (!ftrace_pages)
goto out;
if (WARN_ON(ftrace_pages->next)) {
/* Hmm, we have free pages? */
while (ftrace_pages->next)
ftrace_pages = ftrace_pages->next;
}
ftrace_pages->next = start_pg;
}
//更新dyn_ftrace新结构中的->ip字段
p = start;
pg = start_pg;
while (p < end) {
addr = ftrace_call_adjust(*p++);
if (!addr)
continue;
if (pg->index == pg->size) {
/* We should have allocated enough */
if (WARN_ON(!pg->next))
break;
pg = pg->next;
}
//addr保存在ftrace_page的records中
rec = &pg->records[pg->index++];
rec->ip = addr;
}
WARN_ON(pg->next);
/* Assign the last page to ftrace_pages */
ftrace_pages = pg;
if (!mod)
local_irq_save(flags);
//更新dyn_ftrace新结构中的->flags字段,默认给所有调用点替换成"nop"指令
ftrace_update_code(mod, start_pg);
if (!mod)
local_irq_restore(flags);
ret = 0;
out:
mutex_unlock(&ftrace_lock);
return ret;
}
c
static int ftrace_update_code(struct module *mod, struct ftrace_page *new_pgs)
{
struct ftrace_page *pg;
struct dyn_ftrace *p;
u64 start, stop;
unsigned long update_cnt = 0;
unsigned long rec_flags = 0;
int i;
start = ftrace_now(raw_smp_processor_id());
//当一个模块被加载时,会调用此函数来将其文本中的 mcount 调用转换为 nops,并且在 ftrace 数据中创建一个条目。
//如果在调用此函数之后、模块文本被设置为只读之前激活了 ftrace,那么当只读设置完成后,再修改这些调用点以使能 ftrace 就可能失败
//为了防止这种情况发生,模块的记录被设置为禁用,并将在将模块的文本设置为只读后启用
if (mod)
rec_flags |= FTRACE_FL_DISABLED;
for (pg = new_pgs; pg; pg = pg->next) {
for (i = 0; i < pg->index; i++) {
/* If something went wrong, bail without enabling anything */
if (unlikely(ftrace_disabled))
return -1;
p = &pg->records[i];
p->flags = rec_flags;
#ifndef CC_USING_NOP_MCOUNT
//在被插装的函数中关闭对 ftrace_caller() 的调用,使用nop
if (!ftrace_code_disable(mod, p))
break;
#endif
update_cnt++;
}
}
stop = ftrace_now(raw_smp_processor_id());
ftrace_update_time = stop - start;
ftrace_update_tot_cnt += update_cnt;
return 0;
}
总体过程如下:
2、function tracer
2.1 tracer注册
c
//kernel/trace/trace_functions.c
static struct tracer_opt func_opts[] = {
#ifdef CONFIG_STACKTRACE
{ TRACER_OPT(func_stack_trace, TRACE_FUNC_OPT_STACK) },
#endif
{ } /* Always set a last empty entry */
};
static struct tracer_flags func_flags = {
.val = 0, /* By default: all flags disabled */
.opts = func_opts
};
static struct tracer function_trace __tracer_data =
{
.name = "function",
.init = function_trace_init,
.reset = function_trace_reset,
.start = function_trace_start,
.flags = &func_flags,
.set_flag = func_set_flag,
.allow_instances = true,
#ifdef CONFIG_FTRACE_SELFTEST
.selftest = trace_selftest_startup_function,
#endif
};
__init int init_function_trace(void)
{
init_func_cmd_traceon();
return register_tracer(&function_trace);
}
c
/**
* register_tracer - register a tracer with the ftrace system.
* @type - the plugin for the tracer
*
* Register a new plugin tracer.
*/
int __init register_tracer(struct tracer *type)
{
struct tracer *t;
int ret = 0;
......
//tracer没有设置,就使用dummy_set_flag与dummy_tracer_opt,function_trace使用func_set_flag与func_flags
if (!type->set_flag)
type->set_flag = &dummy_set_flag;
if (!type->flags) {
/*allocate a dummy tracer_flags*/
type->flags = kmalloc(sizeof(*type->flags), GFP_KERNEL);
if (!type->flags) {
ret = -ENOMEM;
goto out;
}
type->flags->val = 0;
type->flags->opts = dummy_tracer_opt;
} else
if (!type->flags->opts)
type->flags->opts = dummy_tracer_opt;
/* store the tracer for __set_tracer_option */
type->flags->trace = type;
ret = run_tracer_selftest(type);
if (ret < 0)
goto out;
//将新的tracer加入到trace_types链表中
//The global_trace is the descriptor that holds the top-level tracing buffers for the live tracing.
type->next = trace_types;
trace_types = type;
add_tracer_options(&global_trace, type);
out:
tracing_selftest_running = false;
mutex_unlock(&trace_types_lock);
if (ret || !default_bootup_tracer)
goto out_unlock;
if (strncmp(default_bootup_tracer, type->name, MAX_TRACER_SIZE))
goto out_unlock;
printk(KERN_INFO "Starting tracer '%s'\n", type->name);
/* Do we want this tracer to start on bootup? */
//设置boot默认的tracer
tracing_set_tracer(&global_trace, type->name);
default_bootup_tracer = NULL;
apply_trace_boot_options();
/* disable other selftests, since this will break it. */
tracing_selftest_disabled = true;
#ifdef CONFIG_FTRACE_STARTUP_TEST
printk(KERN_INFO "Disabling FTRACE selftests due to running tracer '%s'\n",
type->name);
#endif
out_unlock:
return ret;
}
c
static void add_tracer_options(struct trace_array *tr, struct tracer *t)
{
/* Only enable if the directory has been created already. */
if (!tr->dir)
return;
create_trace_option_files(tr, t);
}
c
static void
create_trace_option_files(struct trace_array *tr, struct tracer *tracer)
{
struct trace_option_dentry *topts;
struct trace_options *tr_topts;
struct tracer_flags *flags;
struct tracer_opt *opts;
int cnt;
int i;
flags = tracer->flags;
/*
* If this is an instance, only create flags for tracers
* the instance may have.
*/
if (!trace_ok_for_array(tracer, tr))
return;
for (i = 0; i < tr->nr_topts; i++) {
/* Make sure there's no duplicate flags. */
if (WARN_ON_ONCE(tr->topts[i].tracer->flags == tracer->flags))
return;
}
opts = flags->opts;
for (cnt = 0; opts[cnt].name; cnt++)
;
topts = kcalloc(cnt + 1, sizeof(*topts), GFP_KERNEL);
if (!topts)
return;
tr_topts = krealloc(tr->topts, sizeof(*tr->topts) * (tr->nr_topts + 1),
GFP_KERNEL);
tr->topts = tr_topts;
tr->topts[tr->nr_topts].tracer = tracer;
tr->topts[tr->nr_topts].topts = topts;
tr->nr_topts++;
for (cnt = 0; opts[cnt].name; cnt++) {
create_trace_option_file(tr, &topts[cnt], flags,
&opts[cnt]);
WARN_ONCE(topts[cnt].entry == NULL,
"Failed to create trace option: %s",
opts[cnt].name);
}
}
c
static void
create_trace_option_file(struct trace_array *tr,
struct trace_option_dentry *topt,
struct tracer_flags *flags,
struct tracer_opt *opt)
{
struct dentry *t_options;
t_options = trace_options_init_dentry(tr);
if (!t_options)
return;
topt->flags = flags;
topt->opt = opt;
topt->tr = tr;
topt->entry = trace_create_file(opt->name, 0644, t_options, topt, &trace_options_fops);
}
2.2 tracer使能
可以使用echo function > current_tracer
命令来使能或者切换tracer。
c
static __init int tracer_init_tracefs(void)
{
struct dentry *d_tracer;
......
//返回/sys/kernel/debug/tracing目录
d_tracer = tracing_init_dentry();
event_trace_init();
init_tracer_tracefs(&global_trace, d_tracer);
ftrace_init_tracefs_toplevel(&global_trace, d_tracer);
......
create_trace_instances(d_tracer);
update_tracer_options(&global_trace);
return 0;
}
c
static void
init_tracer_tracefs(struct trace_array *tr, struct dentry *d_tracer)
{
struct trace_event_file *file;
int cpu;
......
trace_create_file("current_tracer", 0644, d_tracer,
tr, &set_tracer_fops);
......
trace_create_file("trace", 0644, d_tracer,
tr, &tracing_fops);
......
trace_create_file("tracing_on", 0644, d_tracer,
tr, &rb_simple_fops);
ftrace_init_tracefs(tr, d_tracer);
}
c
static const struct file_operations set_tracer_fops = {
.open = tracing_open_generic,
.read = tracing_set_trace_read,
.write = tracing_set_trace_write,
.llseek = generic_file_llseek,
};
static ssize_t
tracing_set_trace_write(struct file *filp, const char __user *ubuf,
size_t cnt, loff_t *ppos)
{
//当前文件所在trace buffer,默认是global_trace
struct trace_array *tr = filp->private_data;
char buf[MAX_TRACER_SIZE+1];
......
//使能新的tracer
err = tracing_set_tracer(tr, buf);
......
return ret;
}
c
static int tracing_set_tracer(struct trace_array *tr, const char *buf)
{
struct tracer *t;
#ifdef CONFIG_TRACER_MAX_TRACE
bool had_max_tr;
#endif
int ret = 0;
......
//根据名字,在trace_types链表中找到对应的tracer
for (t = trace_types; t; t = t->next) {
if (strcmp(t->name, buf) == 0)
break;
}
......
//调用新tracer的init函数
if (t->init) {
ret = tracer_init(t, tr);
if (ret)
goto out;
}
//把新的tracer设置为当前tracer
tr->current_trace = t;
tr->current_trace->enabled++;
trace_branch_enable(tr);
out:
mutex_unlock(&trace_types_lock);
return ret;
}
c
int tracer_init(struct tracer *t, struct trace_array *tr)
{
tracing_reset_online_cpus(&tr->trace_buffer);
return t->init(tr);
}
2.2.1 function tracer的使能
以function tracer为例,t->init调用到function_trace_init():
c
start_kernel() -> early_trace_init() -> tracer_alloc_buffers()
struct ftrace_ops global_ops = {
.func = ftrace_stub,
.local_hash.notrace_hash = EMPTY_HASH,
.local_hash.filter_hash = EMPTY_HASH,
INIT_OPS_HASH(global_ops)
.flags = FTRACE_OPS_FL_RECURSION_SAFE |
FTRACE_OPS_FL_INITIALIZED |
FTRACE_OPS_FL_PID,
};
__init void ftrace_init_global_array_ops(struct trace_array *tr)
{
tr->ops = &global_ops;
tr->ops->private = tr;
ftrace_init_trace_array(tr);
}
__init static int tracer_alloc_buffers(void)
{
ftrace_init_global_array_ops(&global_trace);
}
c
//这里的tr默认就是global_trace
static int function_trace_init(struct trace_array *tr)
{
ftrace_func_t func;
/*
* Instance trace_arrays get their ops allocated
* at instance creation. Unless it failed
* the allocation.
*/
if (!tr->ops)
return -ENOMEM;
/* Currently only the global instance can do stack tracing */
if (tr->flags & TRACE_ARRAY_FL_GLOBAL &&
func_flags.val & TRACE_FUNC_OPT_STACK)
func = function_stack_trace_call;
else
func = function_trace_call;
ftrace_init_array_ops(tr, func);
tr->trace_buffer.cpu = get_cpu();
put_cpu();
tracing_start_cmdline_record();
//启动function tracer,将tr->ops也加入到ftrace_ops_list当中
tracing_start_function_trace(tr);
return 0;
}
static void tracing_start_function_trace(struct trace_array *tr)
{
tr->function_enabled = 0;
//此时ops是global_ops
register_ftrace_function(tr->ops);
tr->function_enabled = 1;
}
c
int register_ftrace_function(struct ftrace_ops *ops)
{
int ret = -1;
//初始化hash表,ops->func_hash = &ops->local_hash;
ftrace_ops_init(ops);
mutex_lock(&ftrace_lock);
//将global_ops加入ftrace_ops_list,并且根据情况,修改各个桩位置的指令
ret = ftrace_startup(ops, 0);
mutex_unlock(&ftrace_lock);
return ret;
}
c
int ftrace_startup(struct ftrace_ops *ops, int command)
{
int ret;
//1、把global_ops加入ftrace_ops_list
//2、根据ftrace_ops_list链表中成员的情况给ftrace_trace_function指针赋值:
// ftrace_ops_list链表为空:ftrace_trace_function= ftrace_stub
// ftrace_ops_list链表有1个成员:ftrace_trace_function= ftrace_ops_get_list_func(ftrace_ops_list)
// ftrace_ops_list链表有多个成员:ftrace_trace_function= ftrace_ops_list_func
//3、更新ftrace_graph_entry
ret = __register_ftrace_function(ops);
if (ret)
return ret;
ftrace_start_up++;
/*
* Note that ftrace probes uses this to start up
* and modify functions it will probe. But we still
* set the ADDING flag for modification, as probes
* do not have trampolines. If they add them in the
* future, then the probes will need to distinguish
* between adding and updating probes.
*/
ops->flags |= FTRACE_OPS_FL_ENABLED | FTRACE_OPS_FL_ADDING;
//遍历全部_mcount插桩点ftrace_rec
//根据ip在新、旧hash表中的变化,设置对应rec->flags中的FTRACE_FL_IPMODIFY
ret = ftrace_hash_ipmodify_enable(ops);
if (ret < 0) {
/* Rollback registration process */
__unregister_ftrace_function(ops);
ftrace_start_up--;
ops->flags &= ~FTRACE_OPS_FL_ENABLED;
return ret;
}
//遍历全部_mcount插桩点ftrace_rec
//根据filter_hash、notrace_hash是否match ip,给对应rec->flags中ref_cnt进行加1/减1操作
if (ftrace_hash_rec_enable(ops, 1))
command |= FTRACE_UPDATE_CALLS; //ftrace_startup_enable()中使用
//更新插桩点:
//FTRACE_UPDATE_CALLS被设置,更新_mcount插桩点:ref_cnt大于0的插桩点,更新成ftrace_caller()
//FTRACE_UPDATE_TRACE_FUNC被设置,更新ftrace_call插桩点:更新成ftrace_trace_function指向的函数
//FTRACE_START_FUNC_RET被设置,更新ftrace_graph_call插桩点:更新成ftrace_graph_caller()
ftrace_startup_enable(command);
ops->flags &= ~FTRACE_OPS_FL_ADDING;
return 0;
}
c
int __register_ftrace_function(struct ftrace_ops *ops)
{
......
//把global_ops加入ftrace_ops_list
add_ftrace_ops(&ftrace_ops_list, ops);
......
//根据ftrace_ops_list链表中成员的情况给ftrace_trace_function指针赋值:
// ftrace_ops_list链表为空:ftrace_trace_function = ftrace_stub
// ftrace_ops_list链表有1个成员:ftrace_trace_function = ftrace_ops_get_list_func(ftrace_ops_list)
// ftrace_ops_list链表有多个成员:ftrace_trace_function = ftrace_ops_list_func
if (ftrace_enabled)
update_ftrace_function();
return 0;
}
c
static void ftrace_startup_enable(int command)
{
//saved_ftrace_func != ftrace_trace_function则设置command |= FTRACE_UPDATE_TRACE_FUNC;
if (saved_ftrace_func != ftrace_trace_function) {
saved_ftrace_func = ftrace_trace_function;
command |= FTRACE_UPDATE_TRACE_FUNC; //function trace
}
if (!command || !ftrace_enabled)
return;
ftrace_run_update_code(command);
}
ftrace_run_update_code() -> arch_ftrace_update_code() -> ftrace_modify_all_code()
void ftrace_modify_all_code(int command)
{
int update = command & FTRACE_UPDATE_TRACE_FUNC;
int mod_flags = 0;
int err = 0;
if (command & FTRACE_MAY_SLEEP)
mod_flags = FTRACE_MODIFY_MAY_SLEEP_FL;
//如果FTRACE_UPDATE_TRACE_FUNC被设置,
//对于ftrace_call插桩点,直接调用ftrace_ops_list链表中某个ftrace_ops的操作需要谨慎
//保险起见,默认还是使用ftrace_ops_list_func()替换ftrace_call(),它会轮询ftrace_ops_list链表中所有ftrace_ops
if (update) {
err = ftrace_update_ftrace_func(ftrace_ops_list_func);
if (FTRACE_WARN_ON(err))
return;
}
//如果FTRACE_UPDATE_CALLS被设置,对于_mcount插桩点,
//遍历全部ftrace_rec,ref_cnt大于0的插桩点,更新成ftrace_caller()
if (command & FTRACE_UPDATE_CALLS)
ftrace_replace_code(mod_flags | FTRACE_MODIFY_ENABLE_FL);
else if (command & FTRACE_DISABLE_CALLS)
ftrace_replace_code(mod_flags);
//如果FTRACE_UPDATE_TRACE_FUNC被设置,
//对于ftrace_call插桩点,如果ftrace_trace_function确实不等于ftrace_ops_list_func(),更新成ftrace_trace_function指向的函数
if (update && ftrace_trace_function != ftrace_ops_list_func) {
function_trace_op = set_function_trace_op;
smp_wmb();
/* If irqs are disabled, we are in stop machine */
if (!irqs_disabled())
smp_call_function(ftrace_sync_ipi, NULL, 1);
err = ftrace_update_ftrace_func(ftrace_trace_function);
if (FTRACE_WARN_ON(err))
return;
}
//如果FTRACE_START_FUNC_RET被设置,对于ftrace_graph_call插桩点,
//更新成ftrace_graph_caller()
if (command & FTRACE_START_FUNC_RET)
err = ftrace_enable_ftrace_graph_caller();
else if (command & FTRACE_STOP_FUNC_RET)
err = ftrace_disable_ftrace_graph_caller();
FTRACE_WARN_ON(err);
}
c
static void ftrace_ops_list_func(unsigned long ip, unsigned long parent_ip,
struct ftrace_ops *op, struct pt_regs *regs)
{
__ftrace_ops_list_func(ip, parent_ip, NULL, regs);
}
static inline void
__ftrace_ops_list_func(unsigned long ip, unsigned long parent_ip,
struct ftrace_ops *ignored, struct pt_regs *regs)
{
struct ftrace_ops *op;
int bit;
bit = trace_test_and_set_recursion(TRACE_LIST_START, TRACE_LIST_MAX);
if (bit < 0)
return;
preempt_disable_notrace();
//默认还是使用ftrace_ops_list_func()替换ftrace_call(),它会轮询ftrace_ops_list链表中所有ftrace_ops
do_for_each_ftrace_op(op, ftrace_ops_list) {
if ((!(op->flags & FTRACE_OPS_FL_RCU) || rcu_is_watching()) &&
ftrace_ops_test(op, ip, regs)) {
if (FTRACE_WARN_ON(!op->func)) {
pr_warn("op=%p %pS\n", op, op);
goto out;
}
op->func(ip, parent_ip, op, regs);
}
} while_for_each_ftrace_op(op);
out:
preempt_enable_notrace();
trace_clear_recursion(bit);
}
2.2.2 function tracer的filter
通过set_ftrace_filter
、set_ftrace_notrace
设置function tracer的filter。本质上是操作global_ops的filter_hash
、notrace_hash
。
c
tracer_init_tracefs() -> ftrace_init_tracefs_toplevel() -> ftrace_init_dyn_tracefs()
static __init int ftrace_init_dyn_tracefs(struct dentry *d_tracer)
{
trace_create_file("available_filter_functions", 0444,
d_tracer, NULL, &ftrace_avail_fops);
trace_create_file("enabled_functions", 0444,
d_tracer, NULL, &ftrace_enabled_fops);
//在/sys/kernel/debug/tracing目录下创建set_ftrace_filter与set_ftrace_notrace
ftrace_create_filter_files(&global_ops, d_tracer);
#ifdef CONFIG_FUNCTION_GRAPH_TRACER
trace_create_file("set_graph_function", 0644, d_tracer,
NULL,
&ftrace_graph_fops);
trace_create_file("set_graph_notrace", 0644, d_tracer,
NULL,
&ftrace_graph_notrace_fops);
#endif /* CONFIG_FUNCTION_GRAPH_TRACER */
return 0;
}
void ftrace_create_filter_files(struct ftrace_ops *ops,
struct dentry *parent)
{
trace_create_file("set_ftrace_filter", 0644, parent,
ops, &ftrace_filter_fops);
trace_create_file("set_ftrace_notrace", 0644, parent,
ops, &ftrace_notrace_fops);
}
static const struct file_operations ftrace_filter_fops = {
.open = ftrace_filter_open,
.read = seq_read,
.write = ftrace_filter_write,
.llseek = tracing_lseek,
.release = ftrace_regex_release,
};
set_ftrace_filter
、set_ftrace_notrace
的文件操作有个技巧:
- open的时候分配一个临时hash表iter->hash来拷贝global_ops的filter_hash/notrace_hash的内容,
- 在write操作实际设置filter时对iter->hash操作(遍历ftrcae_rec,如果ip在filter中存在,将其加入/删除到iter->hash中),
- 在close的时候使用新的hash表iter->hash来更新global_ops。根据最新hash表的内容,更新_mcount插桩点,遍历全部ftrace_rec:
- ref_cnt大于0的插桩点,更新成ftrace_caller()
- ref_cnt等于0的插桩点,更新成nop。
c
ftrace_filter_write() -> ftrace_regex_write() -> ftrace_process_regex()
static int ftrace_process_regex(struct ftrace_iterator *iter,
char *buff, int len, int enable)
{
struct ftrace_hash *hash = iter->hash;
struct trace_array *tr = iter->ops->private;
char *func, *command, *next = buff;
struct ftrace_func_command *p;
int ret = -EINVAL;
func = strsep(&next, ":");
if (!next) {
//解析filter配置命令,配置到iter->hash中,操作global_ops
ret = ftrace_match_records(hash, func, len);
if (!ret)
ret = -EINVAL;
if (ret < 0)
return ret;
return 0;
}
/* command found */
......
out_unlock:
mutex_unlock(&ftrace_cmd_mutex);
return ret;
}
2.2.3 function tracer的filter command
通过set_ftrace_filter
设置function tracer的filter command。本质上向trace_probe_ops
注册cmd,以及操作trace_probe_ops
的filter_hash、notrace_hash。
虽然同样是操作set_ftrace_filter
,但是配置filter和配置filter command是操作到不同的实体:
- 配置filter:操作的是global_ops的filter_hash/notrace_hash的内容;
- 配置filter command:是把command向
trace_probe_ops
注册,并且操作trace_probe_ops
的filter_hash/notrace_hash的内容;
在配置filter command之前首先得注册command:
c
init_function_trace() -> init_func_cmd_traceon()
static int __init init_func_cmd_traceon(void)
{
int ret;
//把command加入到ftrace_commands链表
ret = register_ftrace_command(&ftrace_traceoff_cmd);
if (ret)
return ret;
ret = register_ftrace_command(&ftrace_traceon_cmd);
if (ret)
goto out_free_traceoff;
ret = register_ftrace_command(&ftrace_stacktrace_cmd);
if (ret)
goto out_free_traceon;
ret = register_ftrace_command(&ftrace_dump_cmd);
if (ret)
goto out_free_stacktrace;
ret = register_ftrace_command(&ftrace_cpudump_cmd);
if (ret)
goto out_free_dump;
return 0;
}
c
static struct ftrace_func_command ftrace_traceon_cmd = {
.name = "traceon",
.func = ftrace_trace_onoff_callback,
};
以traceon command
为例,继续分析上一节对set_ftrace_filter
的文件操作:
c
ftrace_filter_write() -> ftrace_regex_write() -> ftrace_process_regex()
static int ftrace_process_regex(struct ftrace_iterator *iter,
char *buff, int len, int enable)
{
struct ftrace_hash *hash = iter->hash;
struct trace_array *tr = iter->ops->private;
char *func, *command, *next = buff;
struct ftrace_func_command *p;
int ret = -EINVAL;
func = strsep(&next, ":");
......
/* command found */
command = strsep(&next, ":");
//filter command配置,最后实际会操作到trace_probe_ops
mutex_lock(&ftrace_cmd_mutex);
list_for_each_entry(p, &ftrace_commands, list) {
if (strcmp(p->name, command) == 0) {
ret = p->func(tr, hash, func, command, next, enable);
goto out_unlock;
}
}
out_unlock:
mutex_unlock(&ftrace_cmd_mutex);
return ret;
}
c
static int
ftrace_trace_onoff_callback(struct trace_array *tr, struct ftrace_hash *hash,
char *glob, char *cmd, char *param, int enable)
{
struct ftrace_probe_ops *ops;
if (!tr)
return -ENODEV;
/* we register both traceon and traceoff to this callback */
if (strcmp(cmd, "traceon") == 0)
ops = param ? &traceon_count_probe_ops : &traceon_probe_ops;
else
ops = param ? &traceoff_count_probe_ops : &traceoff_probe_ops;
//注册command到trace_probe_ops
return ftrace_trace_probe_callback(tr, ops, hash, glob, cmd,
param, enable);
}
c
static int
ftrace_trace_probe_callback(struct trace_array *tr,
struct ftrace_probe_ops *ops,
struct ftrace_hash *hash, char *glob,
char *cmd, char *param, int enable)
{
void *count = (void *)-1;
char *number;
int ret;
/* hash funcs only work with set_ftrace_filter */
if (!enable)
return -EINVAL;
//如果命令是"!",注销filter command
if (glob[0] == '!')
return unregister_ftrace_function_probe_func(glob+1, tr, ops);
if (!param)
goto out_reg;
number = strsep(¶m, ":");
if (!strlen(number))
goto out_reg;
//解析到filter command中的"count"字段
ret = kstrtoul(number, 0, (unsigned long *)&count);
if (ret)
return ret;
out_reg:
//继续注册filter command
ret = register_ftrace_function_probe(glob, tr, ops, count);
return ret < 0 ? ret : 0;
}
c
int
register_ftrace_function_probe(char *glob, struct trace_array *tr,
struct ftrace_probe_ops *probe_ops,
void *data)
{
struct ftrace_func_entry *entry;
struct ftrace_func_probe *probe;
struct ftrace_hash **orig_hash;
struct ftrace_hash *old_hash;
struct ftrace_hash *hash;
int count = 0;
int size;
int ret;
int i;
......
if (&probe->list == &tr->func_probes) {
probe = kzalloc(sizeof(*probe), GFP_KERNEL);
probe->probe_ops = probe_ops;
//插桩点执行后的回调
probe->ops.func = function_trace_probe_call;
probe->tr = tr;
ftrace_ops_init(&probe->ops);
list_add(&probe->list, &tr->func_probes);
}
mutex_lock(&probe->ops.func_hash->regex_lock);
//将trace_probe_ops的filter_hash拷贝到临时hash表
orig_hash = &probe->ops.func_hash->filter_hash;
old_hash = *orig_hash;
hash = alloc_and_copy_ftrace_hash(FTRACE_HASH_DEFAULT_BITS, old_hash);
//遍历ftrace_rec,如果当前ip符合当前函数filter规则,加入到临时hash表
ret = ftrace_match_records(hash, glob, strlen(glob));
......
//把临时hash表更新到trace_probe_ops的filter_hash中
ret = ftrace_hash_move_and_update_ops(&probe->ops, orig_hash,
hash, 1);
if (ret < 0)
goto err_unlock;
/* One ref for each new function traced */
probe->ref += count;
if (!(probe->ops.flags & FTRACE_OPS_FL_ENABLED))
//注册trace_probe_ops到ftrace_ops_list,根据hash表的更新来更新_mcount插桩点
ret = ftrace_startup(&probe->ops, 0);
......
}
trace_probe_ops在被调用的时候,执行ftrace_func_hash中的filter command:
c
ftrace_caller() -> ftrace_ops_list_func() -> __ftrace_ops_list_func()
static inline void
__ftrace_ops_list_func(unsigned long ip, unsigned long parent_ip,
struct ftrace_ops *ignored, struct pt_regs *regs)
{
struct ftrace_ops *op;
int bit;
bit = trace_test_and_set_recursion(TRACE_LIST_START, TRACE_LIST_MAX);
if (bit < 0)
return;
preempt_disable_notrace();
do_for_each_ftrace_op(op, ftrace_ops_list) {
//遍历ftrace_ops_list链表,在当前ip满足ops hash的情况下,逐个执行ftrace_ops->func()
if ((!(op->flags & FTRACE_OPS_FL_RCU) || rcu_is_watching()) &&
ftrace_ops_test(op, ip, regs)) {
//函数function_trace_probe_call()
op->func(ip, parent_ip, op, regs);
}
} while_for_each_ftrace_op(op);
out:
preempt_enable_notrace();
trace_clear_recursion(bit);
}
c
static void function_trace_probe_call(unsigned long ip, unsigned long parent_ip,
struct ftrace_ops *op, struct pt_regs *pt_regs)
{
struct ftrace_probe_ops *probe_ops;
struct ftrace_func_probe *probe;
probe = container_of(op, struct ftrace_func_probe, ops);
probe_ops = probe->probe_ops;
preempt_disable_notrace();
//probe_ops->func为ftrace_trace_onoff_callback()中定义的ops->func
//traceon为traceon_count_probe_ops或者traceon_probe_ops的func
probe_ops->func(ip, parent_ip, probe->tr, probe_ops, probe->data);
preempt_enable_notrace();
}
static struct ftrace_probe_ops traceon_count_probe_ops = {
.func = ftrace_traceon_count, //根据指定的计数值来控制追踪的启用或禁用
.print = ftrace_traceon_print,
.init = ftrace_count_init,
.free = ftrace_count_free,
};
static struct ftrace_probe_ops traceon_probe_ops = {
.func = ftrace_traceon, //enable tracing buffers
.print = ftrace_traceon_print,
};
//enable tracing buffers
static void
ftrace_traceon(unsigned long ip, unsigned long parent_ip,
struct trace_array *tr, struct ftrace_probe_ops *ops,
void *data)
{
if (tracer_tracing_is_on(tr))
return;
tracer_tracing_on(tr);
}
void tracer_tracing_on(struct trace_array *tr)
{
if (tr->trace_buffer.buffer)
ring_buffer_record_on(tr->trace_buffer.buffer);
tr->buffer_disabled = 0;
/* Make the flag seen by readers */
smp_wmb();
}
2.2.4 function tracer的数据存入
c
ftrace_caller() -> ftrace_ops_list_func() -> __ftrace_ops_list_func() -> global_ops->func() -> function_trace_call() -> trace_function()
void
trace_function(struct trace_array *tr,
unsigned long ip, unsigned long parent_ip, unsigned long flags,
int pc)
{
struct trace_event_call *call = &event_function;
struct ring_buffer *buffer = tr->trace_buffer.buffer;
struct ring_buffer_event *event;
struct ftrace_entry *entry;
//从ringbuffer中分配空间
event = __trace_buffer_lock_reserve(buffer, TRACE_FN, sizeof(*entry),
flags, pc);
if (!event)
return;
entry = ring_buffer_event_data(event);
entry->ip = ip;
entry->parent_ip = parent_ip;
//存入function tracer自定义的trace数据:ip、parent_ip
if (!call_filter_check_discard(call, entry, buffer, event)) {
if (static_branch_unlikely(&ftrace_exports_enabled))
ftrace_exports(event);
__buffer_unlock_commit(buffer, event);
}
}
function tracer自定义的trace数据非常简单:ip、parent_ip
c
struct ring_buffer_event {
u32 type_len:5, time_delta:27;
u32 array[];
};
/*
* The trace entry - the most basic unit of tracing. This is what
* is printed in the end as a single line in the trace output, such as:
*
* bash-15816 [01] 235.197585: idle_cpu <- irq_enter
*/
struct trace_entry {
unsigned short type;
unsigned char flags;
unsigned char preempt_count;
int pid;
};
#define FTRACE_ENTRY(name, struct_name, id, tstruct, print, filter) \
struct struct_name { \
struct trace_entry ent; \
tstruct \
}
#undef FTRACE_ENTRY_REG
#define FTRACE_ENTRY_REG(name, struct_name, id, tstruct, print, \
filter, regfn) \
FTRACE_ENTRY(name, struct_name, id, PARAMS(tstruct), PARAMS(print), \
filter)
FTRACE_ENTRY_REG(function, ftrace_entry,
TRACE_FN,
F_STRUCT(
__field( unsigned long, ip )
__field( unsigned long, parent_ip )
),
F_printk(" %lx <-- %lx", __entry->ip, __entry->parent_ip),
FILTER_TRACE_FN,
perf_ftrace_event_register
);
c
struct ftrace_entry {
struct trace_entry ent;
unsigned long ip;
unsigned long parent_ip;
};
2.2.5 function tracer的数据读出
从trace文件读出的function tracer默认数据格式为:
在kernel/trace/trace_output.c
文件中,注册了系统默认的几种trace_event。function tracer使用TRACE_FN类型的trace_fn_event:
c
static struct trace_event_functions trace_fn_funcs = {
.trace = trace_fn_trace,
.raw = trace_fn_raw,
.hex = trace_fn_hex,
.binary = trace_fn_bin,
};
static struct trace_event trace_fn_event = {
.type = TRACE_FN,
.funcs = &trace_fn_funcs,
};
static struct trace_event *events[] __initdata = {
&trace_fn_event,
&trace_ctx_event,
&trace_wake_event,
&trace_stack_event,
&trace_user_stack_event,
&trace_bputs_event,
&trace_bprint_event,
&trace_print_event,
&trace_hwlat_event,
&trace_raw_data_event,
NULL
};
__init static int init_events(void)
{
struct trace_event *event;
int i, ret;
for (i = 0; events[i]; i++) {
event = events[i];
ret = register_trace_event(event);
if (!ret) {
printk(KERN_WARNING "event %d failed to register\n",
event->type);
WARN_ON_ONCE(1);
}
}
return 0;
}
在数据读出时,会调用到event对应的event->funcs->trace()
函数,seq_read() -> s_show() -> print_trace_line() -> print_trace_fmt() -> event->funcs->trace()
: 由上可知,TRACE_FN的type,event->funcs->trace()对应trace_fn_trace()。
c
/*
 * "trace" (default text) formatter for TRACE_FN records: prints the
 * traced function's symbol, optionally followed by " <-" and its
 * caller's symbol.
 */
static enum print_line_t trace_fn_trace(struct trace_iterator *iter, int flags,
struct trace_event *event)
{
struct ftrace_entry *field;
struct trace_seq *s = &iter->seq;
trace_assign_type(field, iter->ent);
/* print the symbol corresponding to this ip */
seq_print_ip_sym(s, field->ip, flags);
/* if the trace option allows it, also print the parent ip's symbol */
if ((flags & TRACE_ITER_PRINT_PARENT) && field->parent_ip) {
trace_seq_puts(s, " <-");
seq_print_ip_sym(s, field->parent_ip, flags);
}
trace_seq_putc(s, '\n');
return trace_handle_return(s);
}
2.2.6 function tracer总结
- tracer的使能:当使用 `echo xxx_tracer > current_tracer` 时,会关闭旧的current tracer并使能新的tracer,典型的包括function tracer和function_graph tracer。
- filter的配置:使用 `echo function_name > set_ftrace_filter/set_ftrace_notrace`,可以配置只有部分function被trace,而不是所有function都被trace。
- filter command的配置:使用 `echo '!__schedule_bug:traceoff' > set_ftrace_filter` 等类似命令,可以配置条件触发的command,当条件满足后command会被执行。
3、function graph tracer
function_graph tracer
从function tracer
发展而来,function tracer
使用_mcount
插桩可以跟踪到每个函数的调用入口,而function_graph tracer
即可以跟踪到函数的入口还可以跟踪到函数的返回。
一切的关键是在入口桩函数被调用时,修改了func()的返回地址,不是返回到func's parent()函数继续去执行,而是返回到return桩函数return_to_handler()
中。return_to_handler()
中执行完自己的return处理函数以后,再把返回地址恢复成func's parent中的地址,返回继续执行原有的路径。
原本的入口处插桩,只能追踪到函数的切换。现在入口、出口同时插桩,还能获得函数的执行时长,做更多的分析。
3.1 function graph tracer注册
c
/*
 * Register the two graph trace_events (function entry / function return
 * record formats) and then the function_graph tracer itself.
 */
static __init int init_graph_trace(void)
{
max_bytes_for_cpu = snprintf(NULL, 0, "%u", nr_cpu_ids - 1);
if (!register_trace_event(&graph_trace_entry_event)) {
pr_warn("Warning: could not register graph trace events\n");
return 1;
}
if (!register_trace_event(&graph_trace_ret_event)) {
pr_warn("Warning: could not register graph trace events\n");
return 1;
}
/* register the function graph tracer */
return register_tracer(&graph_trace);
}
core_initcall(init_graph_trace);
/*
 * Descriptor of the function_graph tracer, registered with the trace
 * core by init_graph_trace() via register_tracer().
 */
static struct tracer graph_trace __tracer_data = {
.name = "function_graph",
.update_thresh = graph_trace_update_thresh,
.open = graph_trace_open,
.pipe_open = graph_trace_open,
.close = graph_trace_close,
.pipe_close = graph_trace_close,
.init = graph_trace_init, /* runs when this tracer is selected */
.reset = graph_trace_reset,
.print_line = print_graph_function,
.print_header = print_graph_headers,
.flags = &tracer_flags,
.set_flag = func_graph_set_flag,
#ifdef CONFIG_FTRACE_SELFTEST
.selftest = trace_selftest_startup_function_graph,
#endif
};
3.2 function graph tracer使能
3.2.1 function graph tracer的使能
如2.2节 function tracer使能可知,可以使用echo function_graph > current_tracer
命令来使能或者切换tracer。
c
/*
 * .init callback of graph_trace: runs when "function_graph" is written
 * to current_tracer. Chooses the thresh variant of the hooks when
 * tracing_thresh is set, then registers them with the fgraph core.
 */
static int graph_trace_init(struct trace_array *tr)
{
int ret;
set_graph_array(tr);
if (tracing_thresh)
ret = register_ftrace_graph(&funcgraph_thresh_ops);
else
ret = register_ftrace_graph(&funcgraph_ops);
if (ret)
return ret;
tracing_start_cmdline_record();
return 0;
}
/* hook pair selected when tracing_thresh is set */
static struct fgraph_ops funcgraph_thresh_ops = {
.entryfunc = &trace_graph_entry,
.retfunc = &trace_graph_thresh_return,
};
/* default entry/return hooks for the function_graph tracer */
static struct fgraph_ops funcgraph_ops = {
.entryfunc = &trace_graph_entry,
.retfunc = &trace_graph_return,
};
/*
 * Install a set of graph entry/return callbacks. The body is abbreviated
 * in this excerpt: "......" marks elided lines, and the out: label
 * belongs to error paths that were cut.
 */
int register_ftrace_graph(struct fgraph_ops *gops)
{
int ret = 0;
......
ftrace_graph_active++;
ret = start_graph_tracing();
/* assign gops' hooks to the ftrace_graph_entry/ftrace_graph_return pointers */
ftrace_graph_return = gops->retfunc;
__ftrace_graph_entry = gops->entryfunc;
ftrace_graph_entry = ftrace_graph_entry_test;
update_function_graph_func();
/*
 * Register graph_ops:
 * 1. add graph_ops to the ftrace_ops_list;
 * 2. update the _mcount patch sites according to graph_ops' hash table;
 * 3. patch the ftrace_graph_call site to call ftrace_graph_caller()
 */
ret = ftrace_startup(&graph_ops, FTRACE_START_FUNC_RET);
out:
mutex_unlock(&ftrace_lock);
return ret;
}
3.2.2 function graph tracer的filter
因为function_graph tracer
的graph_ops继续共用global_ops的hash表,所以可以继续使用set_ftrace_filter/set_ftrace_notrace
来配置function_graph tracer的filter。
function_graph tracer
还可以使用set_graph_function/set_graph_notrace
接口来配置过滤,需要两种过滤条件都满足的函数才能被trace。实际上是配置到ftrace_graph_funcs[]/ftrace_graph_notrace_funcs[]
表中。
c
/* tracefs file operations for set_graph_function */
static const struct file_operations ftrace_graph_fops = {
.open = ftrace_graph_open,
.read = seq_read,
.write = ftrace_graph_write,
.llseek = tracing_lseek,
.release = ftrace_graph_release,
};
/* tracefs file operations for set_graph_notrace; shares the write handler */
static const struct file_operations ftrace_graph_notrace_fops = {
.open = ftrace_graph_notrace_open,
.read = seq_read,
.write = ftrace_graph_write,
.llseek = tracing_lseek,
.release = ftrace_graph_release,
};
tracer_init_tracefs() -> ftrace_init_tracefs_toplevel() -> ftrace_init_dyn_tracefs()
/* Creates the set_graph_function / set_graph_notrace tracefs files. */
static __init int ftrace_init_dyn_tracefs(struct dentry *d_tracer)
{
......
#ifdef CONFIG_FUNCTION_GRAPH_TRACER
trace_create_file("set_graph_function", 0644, d_tracer,
NULL,
&ftrace_graph_fops);
trace_create_file("set_graph_notrace", 0644, d_tracer,
NULL,
&ftrace_graph_notrace_fops);
#endif /* CONFIG_FUNCTION_GRAPH_TRACER */
return 0;
}
/*
 * Shared write handler for both graph filter files: parse the
 * user-supplied function name and store it into the matching filter
 * hash. ("......" marks lines elided from this excerpt.)
 */
static ssize_t
ftrace_graph_write(struct file *file, const char __user *ubuf,
size_t cnt, loff_t *ppos)
{
ssize_t read, ret = 0;
struct ftrace_graph_data *fgd = file->private_data;
struct trace_parser *parser;
......
if (read >= 0 && trace_parser_loaded(parser) &&
!trace_parser_cont(parser)) {
/* build the condition hash table from the parsed filter */
ret = ftrace_graph_set_hash(fgd->new_hash,
parser->buffer);
trace_parser_clear(parser);
}
......
return ret;
}
在trace_graph_entry()
中会进行filter的合法性判断,函数必须是set_graph_function/set_graph_notrace
配置中允许的函数或者是被它们调用的子函数才能继续执行,否则出错返回,后面的trace_graph_return()
都不会被执行。
c
/*
 * Entry hook of the function_graph tracer: runs the filter checks and,
 * if the function passes, writes a TRACE_GRAPH_ENT record into the ring
 * buffer. Returning 0 aborts graph tracing for this call, so the
 * matching trace_graph_return() will not run either.
 */
int trace_graph_entry(struct ftrace_graph_ent *trace)
{
struct trace_array *tr = graph_array;
struct trace_array_cpu *data;
unsigned long flags;
long disabled;
int ret;
int cpu;
int pc;
if (trace_recursion_test(TRACE_GRAPH_NOTRACE_BIT))
return 0;
/* func is listed in the notrace hash */
if (ftrace_graph_notrace_addr(trace->func)) {
trace_recursion_set(TRACE_GRAPH_NOTRACE_BIT);
return 1;
}
if (!ftrace_trace_task(tr))
return 0;
/* trace only when func is in the hash, or is called by such a function */
if (ftrace_graph_ignore_func(trace))
return 0;
if (ftrace_graph_ignore_irqs())
return 0;
/* Do not trace a function if it's filtered by set_graph_notrace. */
/* NOTE(review): the notrace check appears twice in this excerpt — why? */
if (ftrace_graph_notrace_addr(trace->func))
return 1;
......
if (likely(disabled == 1)) {
pc = preempt_count();
/* store the trace record into the ring buffer */
ret = __trace_graph_entry(tr, trace, flags, pc);
} else {
ret = 0;
}
atomic_dec(&data->disabled);
local_irq_restore(flags);
return ret;
}
3.2.3 function graph tracer的数据写入
和function tracer不一样的是,function_graph在进入函数和返回函数时都有trace数据存入。
c
/* Function call entry: layout of the TRACE_GRAPH_ENT ring-buffer record */
FTRACE_ENTRY_PACKED(funcgraph_entry, ftrace_graph_ent_entry,
TRACE_GRAPH_ENT,
F_STRUCT(
__field_struct( struct ftrace_graph_ent, graph_ent )
__field_desc( unsigned long, graph_ent, func )
__field_desc( int, graph_ent, depth )
),
F_printk("--> %lx (%d)", __entry->func, __entry->depth),
FILTER_OTHER
);
/* Function return entry: layout of the TRACE_GRAPH_RET record */
FTRACE_ENTRY_PACKED(funcgraph_exit, ftrace_graph_ret_entry,
TRACE_GRAPH_RET,
F_STRUCT(
__field_struct( struct ftrace_graph_ret, ret )
__field_desc( unsigned long, ret, func )
__field_desc( unsigned long long, ret, calltime)
__field_desc( unsigned long long, ret, rettime )
__field_desc( unsigned long, ret, overrun )
__field_desc( int, ret, depth )
),
/* NOTE(review): the last F_printk argument prints depth, not overrun,
 * despite the "over:" label — verify against the kernel source. */
F_printk("<-- %lx (%d) (start: %llx end: %llx) over: %d",
__entry->func, __entry->depth,
__entry->calltime, __entry->rettime,
__entry->depth),
FILTER_OTHER
);
c
trace_graph_entry() -> __trace_graph_entry()
/*
 * Reserve a TRACE_GRAPH_ENT event in the ring buffer, copy the
 * ftrace_graph_ent payload into it, and commit it unless the event
 * filter discards it. Returns 1 on success, 0 if no buffer space.
 */
int __trace_graph_entry(struct trace_array *tr,
struct ftrace_graph_ent *trace,
unsigned long flags,
int pc)
{
struct trace_event_call *call = &event_funcgraph_entry;
struct ring_buffer_event *event;
struct ring_buffer *buffer = tr->trace_buffer.buffer;
struct ftrace_graph_ent_entry *entry;
event = trace_buffer_lock_reserve(buffer, TRACE_GRAPH_ENT,
sizeof(*entry), flags, pc);
if (!event)
return 0;
entry = ring_buffer_event_data(event);
entry->graph_ent = *trace;
if (!call_filter_check_discard(call, entry, buffer, event))
trace_buffer_unlock_commit_nostack(buffer, event);
return 1;
}
c
trace_graph_return() -> __trace_graph_return()
/*
 * Return hook of the function_graph tracer: invoked from
 * ftrace_return_to_handler() when a traced function returns. Writes a
 * TRACE_GRAPH_RET record unless this CPU's buffer is marked disabled.
 */
void trace_graph_return(struct ftrace_graph_ret *trace)
{
struct trace_array *tr = graph_array;
struct trace_array_cpu *data;
unsigned long flags;
long disabled;
int cpu;
int pc;
ftrace_graph_addr_finish(trace);
if (trace_recursion_test(TRACE_GRAPH_NOTRACE_BIT)) {
trace_recursion_clear(TRACE_GRAPH_NOTRACE_BIT);
return;
}
local_irq_save(flags);
cpu = raw_smp_processor_id();
data = per_cpu_ptr(tr->trace_buffer.data, cpu);
disabled = atomic_inc_return(&data->disabled);
if (likely(disabled == 1)) { /* skip if re-entered on this CPU */
pc = preempt_count();
__trace_graph_return(tr, trace, flags, pc);
}
atomic_dec(&data->disabled);
local_irq_restore(flags);
}
/*
 * Reserve a TRACE_GRAPH_RET event in the ring buffer, copy the
 * ftrace_graph_ret payload into it, and commit it unless the event
 * filter discards it.
 */
void __trace_graph_return(struct trace_array *tr,
struct ftrace_graph_ret *trace,
unsigned long flags,
int pc)
{
struct trace_event_call *call = &event_funcgraph_exit;
struct ring_buffer_event *event;
struct ring_buffer *buffer = tr->trace_buffer.buffer;
struct ftrace_graph_ret_entry *entry;
event = trace_buffer_lock_reserve(buffer, TRACE_GRAPH_RET,
sizeof(*entry), flags, pc);
if (!event)
return;
entry = ring_buffer_event_data(event);
entry->ret = *trace;
if (!call_filter_check_discard(call, entry, buffer, event))
trace_buffer_unlock_commit_nostack(buffer, event);
}
那么目标函数执行时,是怎么执行到trace_graph_entry与trace_graph_return
的呢:
c
ftrace_startup() -> ftrace_startup_enable() -> ftrace_run_update_code() -> arch_ftrace_update_code() ->
ftrace_modify_all_code() -> ftrace_enable_ftrace_graph_caller() -> ftrace_modify_graph_caller()
/*
 * Turn on/off the call to ftrace_graph_caller() in ftrace_caller()
 * depending on @enable.
 */
static int ftrace_modify_graph_caller(bool enable)
{
unsigned long pc = (unsigned long)&ftrace_graph_call;
u32 branch, nop;
/* build a branch instruction at the ftrace_graph_call site that
 * jumps to ftrace_graph_caller */
branch = aarch64_insn_gen_branch_imm(pc,
(unsigned long)ftrace_graph_caller,
AARCH64_INSN_BRANCH_NOLINK);
nop = aarch64_insn_gen_nop();
if (enable)
return ftrace_modify_code(pc, nop, branch, true);
else
return ftrace_modify_code(pc, branch, nop, true);
}
ftrace_graph_entry的执行:
c
//arch/arm64/kernel/entry-ftrace.S
// Graph-tracing entry stub: gather the address of the saved lr slot,
// the function's pc and the parent frame pointer, then let C code
// decide whether to hook the return path.
ENTRY(ftrace_graph_caller)
mcount_get_lr_addr x0 // pointer to function's saved lr
mcount_get_pc x1 // function's pc
mcount_get_parent_fp x2 // parent's fp
bl prepare_ftrace_return // prepare_ftrace_return(&lr, pc, fp)
mcount_exit
ENDPROC(ftrace_graph_caller)
void prepare_ftrace_return(unsigned long self_addr, unsigned long *parent,
unsigned long frame_pointer)
{
unsigned long return_hooker = (unsigned long)&return_to_handler;
unsigned long old;
if (unlikely(atomic_read(¤t->tracing_graph_pause)))
return;
old = *parent;
if (!function_graph_enter(old, self_addr, frame_pointer, NULL))
//修改func的返回地址为return_to_handler,return_to_handler()中执行完自己的return处理函数以后,
//再把返回地址恢复成func's parent中的地址,返回继续执行原有的路径。
*parent = return_hooker;
}
/*
 * Called from prepare_ftrace_return(): push the real return address onto
 * the task's return stack and run the entry hook. Returns 0 when the
 * caller may redirect the return address to return_to_handler, -EBUSY
 * when the caller must leave it untouched.
 */
int function_graph_enter(unsigned long ret, unsigned long func,
unsigned long frame_pointer, unsigned long *retp)
{
struct ftrace_graph_ent trace;
trace.func = func;
trace.depth = ++current->curr_ret_depth;
if (ftrace_push_return_trace(ret, func, frame_pointer, retp))
goto out;
/* Only trace if the calling function expects to */
if (!ftrace_graph_entry(&trace))
goto out_ret;
return 0;
out_ret:
current->curr_ret_stack--;
out:
current->curr_ret_depth--;
return -EBUSY;
}
trace_graph_return的执行:
c
// NOTE(review): this is the x86-32 return_to_handler, not the arm64 one,
// although the article targets arm64 — confirm whether the arm64 listing
// was intended. It calls ftrace_return_to_handler() to run the exit hook
// and fetch the original return address, then jumps back to it.
.globl return_to_handler
return_to_handler:
pushl %eax
pushl %edx
#ifdef CC_USING_FENTRY
movl $0, %eax
#else
movl %ebp, %eax // pass the frame pointer as the first argument
#endif
call ftrace_return_to_handler
movl %eax, %ecx // returned value = original return address
popl %edx
popl %eax
JMP_NOSPEC %ecx // resume execution at the real caller
/*
 * Exit-path C handler: pop the saved frame from the per-task return
 * stack, time-stamp it, invoke the registered return hook, and hand the
 * original return address back to the assembly stub.
 */
unsigned long ftrace_return_to_handler(unsigned long frame_pointer)
{
struct ftrace_graph_ret trace;
unsigned long ret;
ftrace_pop_return_trace(&trace, &ret, frame_pointer);
trace.rettime = trace_clock_local();
/* ftrace_graph_return was set to trace_graph_return at registration */
ftrace_graph_return(&trace);
barrier();
current->curr_ret_stack--;
if (unlikely(!ret)) { /* corrupt return stack: cannot continue safely */
ftrace_graph_stop();
WARN_ON(1);
/* Might as well panic. What else to do? */
ret = (unsigned long)panic;
}
return ret;
}
3.2.4 function graph tracer的数据读出
从trace文件读出的function_graph tracer默认数据格式为:(显示function_graph中设置的函数或者是被它们调用的子函数)
(此处缺少一张示例输出图,待补充)
3.2.5 function graph tracer总结
4、irqsoff tracer
irqsoff tracer
用来追踪最大关中断时间。它的trace会提供几部分信息:
1、irqoff的最大时长:latency;
2、在最大irqoff这期间所有的function trace信息;
3、最后的irqon的函数回调信息;
irqsoff tracer的插桩方法,是直接在local_irq_enable()、local_irq_disable()中直接插入钩子函数trace_hardirqs_on()、trace_hardirqs_off()。