原文地址:
https://gitee.com/kiraskyler/Articles/blob/master/eBPF/debugfs中的追踪点format实现原理.md
文章目录
- How
- Start
- init
- _ftrace_events section
-
- SYSCALL_DEFINEx Macro
- ftrace_event_field
- SYSCALL_DEFINEx
- TRACE_EVENT Macro
- End

How
debugfs中可以通过format查看追踪点的参数
# cat /sys/kernel/debug/tracing/events/sched/sched_wakeup/format
name: sched_wakeup
ID: 333
format:
field:unsigned short common_type; offset:0; size:2; signed:0;
......
那么,这里是如何实现的呢?这些信息从哪里来的?
原来以为这些内容来自内核的.BTF节,现在发现其实是定义追踪点时由宏保存下来的……
Start
# cat /sys/kernel/debug/tracing/events/sched/sched_wakeup/format
name: sched_wakeup
ID: 333
根据上面的一点点信息,在内核源码中直接搜索"name: "这个字符串,找到位置:
/root/qemu/linux-6.6.58/kernel/trace/trace_events.c: 1572
static int f_show(struct seq_file *m, void *v)
{
struct trace_event_file *file = event_file_data(m->private);
struct trace_event_call *call = file->event_call;
struct ftrace_event_field *field;
const char *array_descriptor;
switch ((unsigned long)v) {
case FORMAT_HEADER:
seq_printf(m, "name: %s\n", trace_event_name(call));
seq_printf(m, "ID: %d\n", call->event.type);
seq_puts(m, "format:\n");
return 0;
不过seq_puts(m, "format:\n")之后就return了,详细的format不是从这里来的
当前文件内有这样一个结构体定义用到了f_show
/root/qemu/linux-6.6.58/kernel/trace/trace_events.c: 1651
static const struct seq_operations trace_format_seq_ops = {
.start = f_start,
.next = f_next,
.stop = f_stop,
.show = f_show,
重要的是前三个函数,读取这个文件时首先被调用的是f_start
它返回FORMAT_HEADER,作为f_next/f_show的第二个参数v,即先打印开头的name、ID、format:这三行提示信息
static void *f_start(struct seq_file *m, loff_t *pos)
{
struct trace_event_file *file;
void *p = (void *)FORMAT_HEADER;
/* ->stop() is called even if ->start() fails */
file = event_file_file(m->private);
return p; // 返回的是f_next/f_show的第二个参数v
接下来在f_next函数中,第一次进入时的v参数即f_start函数中初始化的FORMAT_HEADER
每显示一行内容就执行一次node = node->prev;,也即沿着ftrace_event_field->link.prev向前移动,每个ftrace_event_field存储的正是format中一行显示的参数信息
也就是说,参数列表存储在file->event_call->class->fields/get_fields
/root/qemu/linux-6.6.58/kernel/trace/trace_events.c: 1539
static void *f_next(struct seq_file *m, void *v, loff_t *pos)
{
struct trace_event_file *file = event_file_data(m->private); // file_inode(m->private)->i_private
struct trace_event_call *call = file->event_call;
struct list_head *common_head = &ftrace_common_fields;
struct list_head *head = trace_get_fields(call); // 参数列表来源:call->class->fields/get_fields
struct list_head *node = v; // ftrace_event_field->link,参数信息列表
(*pos)++; // 递增,用来f_show控制显示内容
switch ((unsigned long)v) {
case FORMAT_HEADER:
node = common_head;
break;
case FORMAT_FIELD_SEPERATOR:
node = head; // 参数列表头,呼应1563行
break;
case FORMAT_PRINTFMT:
/* all done */
return NULL;
}
node = node->prev;
if (node == common_head)
return (void *)FORMAT_FIELD_SEPERATOR;
else if (node == head)
return (void *)FORMAT_PRINTFMT;
else
return node;
static inline struct list_head *
trace_get_fields(struct trace_event_call *event_call)
{
if (!event_call->class->get_fields)
return &event_call->class->fields;
return event_call->class->get_fields(event_call);
f_show负责打印struct ftrace_event_field *field中的内容
/root/qemu/linux-6.6.58/kernel/trace/trace_events.c: 1572
static int f_show(struct seq_file *m, void *v)
{
struct trace_event_file *file = event_file_data(m->private);
struct trace_event_call *call = file->event_call;
struct ftrace_event_field *field;
const char *array_descriptor;
switch ((unsigned long)v) {
case FORMAT_HEADER:
seq_printf(m, "name: %s\n", trace_event_name(call));
seq_printf(m, "ID: %d\n", call->event.type);
seq_puts(m, "format:\n");
return 0;
case FORMAT_FIELD_SEPERATOR:
seq_putc(m, '\n');
return 0;
case FORMAT_PRINTFMT:
seq_printf(m, "\nprint fmt: %s\n",
call->print_fmt);
return 0;
}
field = list_entry(v, struct ftrace_event_field, link); // v是ftrace_event_field->link,找到field
array_descriptor = strchr(field->type, '['); // eg: unsigned char
seq_printf(m, "\tfield:%.*s %s[];\toffset:%u;\tsize:%u;\tsigned:%d;\n",
(int)(array_descriptor - field->type),
field->type, field->name,
field->offset, field->size, !!field->is_signed); // 在此显示
ok,知道了参数列表保存在file->event_call->class->fields/get_fields,那么,再看看在哪里初始化
init
当前文件内只有一处设置file->event_call,断点后重新启动kernel
/root/qemu/linux-6.6.58/kernel/trace/trace_events.c: 2920
static struct trace_event_file *
trace_create_new_event(struct trace_event_call *call,
struct trace_array *tr)
{
......
file->event_call = call
-exec bt
#0 trace_create_new_event (call=call@entry=0xffffffff82e25a60 <event_ma_op>, tr=tr@entry=0xffffffff82d4b5e0 <global_trace>) at kernel/trace/trace_events.c:2929
#1 0xffffffff811b616a in __trace_early_add_new_event (tr=0xffffffff82d4b5e0 <global_trace>, call=0xffffffff82e25a60 <event_ma_op>) at kernel/trace/trace_events.c:3035
#2 __trace_early_add_events (tr=tr@entry=0xffffffff82d4b5e0 <global_trace>) at kernel/trace/trace_events.c:3727
#3 0xffffffff8336d2fd in event_trace_enable () at kernel/trace/trace_events.c:3987
#4 trace_event_init () at kernel/trace/trace_events.c:4067
#5 0xffffffff8336ccb1 in trace_init () at kernel/trace/trace.c:10594
#6 0xffffffff83338ba0 in start_kernel () at init/main.c:970
call参数在#2传递而来,来自于ftrace_events
/root/qemu/linux-6.6.58/kernel/trace/trace_events.c: 3716
void __trace_early_add_events(struct trace_array *tr)
{
struct trace_event_call *call;
int ret;
list_for_each_entry(call, &ftrace_events, list) {
/* Early boot up should not have any modules loaded */
if (!(call->flags & TRACE_EVENT_FL_DYNAMIC) &&
WARN_ON_ONCE(call->module))
continue;
ret = __trace_early_add_new_event(call, tr);
再继续找到call是从__start_ftrace_events加载而来
kernel/trace/trace_events.c:3962
static __init int event_trace_enable(void)
{
struct trace_array *tr = top_trace_array();
struct trace_event_call **iter, *call;
int ret;
for_each_event(iter, __start_ftrace_events, __stop_ftrace_events) {
call = *iter;
ret = event_init(call);
if (!ret)
list_add(&call->list, &ftrace_events);
-exec bt
#0 event_trace_enable () at kernel/trace/trace_events.c:3968
#1 trace_event_init () at kernel/trace/trace_events.c:4067
#2 0xffffffff8336ccb1 in trace_init () at kernel/trace/trace.c:10594
#3 0xffffffff83338ba0 in start_kernel () at init/main.c:970
__start_ftrace_events则是vmlinux的_ftrace_events节地址
/root/qemu/linux-6.6.58/arch/x86/kernel/vmlinux.lds: 73
__start_ftrace_events = .; KEEP(*(_ftrace_events))
_ftrace_events section
SYSCALL_DEFINEx Macro
这里是一个系统调用追踪点定义:
/root/qemu/linux-6.6.58/include/linux/syscalls.h: 141
#define SYSCALL_TRACE_ENTER_EVENT(sname) \
static struct syscall_metadata __syscall_meta_##sname; \ // 前向声明,实际在SYSCALL_METADATA中定义
static struct trace_event_call __used \
event_enter_##sname = { \
.class = &event_class_syscall_enter, \
{ \
.name = "sys_enter"#sname, \
}, \
.event.funcs = &enter_syscall_print_funcs, \
.data = (void *)&__syscall_meta_##sname,\ // 这里的types/args分别保存参数类型和参数名称列表
.flags = TRACE_EVENT_FL_CAP_ANY, \
}; \
static struct trace_event_call __used \
__section("_ftrace_events") \
*__event_enter_##sname = &event_enter_##sname; // event_enter_##sname指针地址放到_ftrace_events节中
#define SYSCALL_METADATA(sname, nb, ...) \
static const char *types_##sname[] = { \ // 前向声明参数列表
__MAP(nb,__SC_STR_TDECL,__VA_ARGS__) \
}; \
static const char *args_##sname[] = { \
__MAP(nb,__SC_STR_ADECL,__VA_ARGS__) \ // 前向声明参数名称
}; \
SYSCALL_TRACE_ENTER_EVENT(sname); \
SYSCALL_TRACE_EXIT_EVENT(sname); \
static struct syscall_metadata __used \
__syscall_meta_##sname = { \
.name = "sys"#sname, \
.syscall_nr = -1, /* Filled in at boot */ \
.nb_args = nb, \
.types = nb ? types_##sname : NULL, \ // 参数列表
.args = nb ? args_##sname : NULL, \ // 参数名称列表
.enter_event = &event_enter_##sname, \
.exit_event = &event_exit_##sname, \
.enter_fields = LIST_HEAD_INIT(__syscall_meta_##sname.enter_fields), \
}; \
static struct syscall_metadata __used \
__section("__syscalls_metadata") \
*__p_syscall_meta_##sname = &__syscall_meta_##sname; // __syscall_meta_##sname指针地址放到__syscalls_metadata节中
#define SYSCALL_DEFINE1(name, ...) SYSCALL_DEFINEx(1, _##name, __VA_ARGS__) // 一个参数的系统调用宏定义
#define SYSCALL_DEFINEx(x, sname, ...) \
SYSCALL_METADATA(sname, x, __VA_ARGS__) \ // 这里定义参数
_ftrace_events节中存储的是struct trace_event_call的地址
struct trace_event_call->data->types/args分别是在定义追踪点时通过宏创建出的参数类型列表/参数名称列表,接下来看看这两个列表如何被使用到
ftrace_event_field
还是在内核启动阶段,会遍历call
对每个call遍历其class->fields_array,把其中的参数信息逐个添加到call->class->fields/get_fields链表中;而当field->type == TRACE_FUNCTION_TYPE时,则改为调用该项的define_fields回调,由回调自己把参数列表添加到call->class->fields/get_fields中
/root/qemu/linux-6.6.58/kernel/trace/trace_events.c: 2415
static int
event_define_fields(struct trace_event_call *call)
{
head = trace_get_fields(call);
if (list_empty(head)) {
struct trace_event_fields *field = call->class->fields_array;
unsigned int offset = sizeof(struct trace_entry);
for (; field->type; field++) {
if (field->type == TRACE_FUNCTION_TYPE) {// syscall_enter类型fields在此填充
field->define_fields(call);
break;
}
// 大多其他类型tracepoint走下面的define_fields_ext,作用差不多都是填充到call->class->fields
offset = ALIGN(offset, field->align);
ret = trace_define_field_ext(call, field->type, field->name,
offset, field->size,
field->is_signed, field->filter_type,
field->len); // 每个参数信息独立添加到对应的ftrace_event_field
/root/qemu/linux-6.6.58/kernel/trace/trace_events.c: 115
static int __trace_define_field(struct list_head *head, const char *type,
const char *name, int offset, int size,
int is_signed, int filter_type, int len)
{
struct ftrace_event_field *field;
field = kmem_cache_alloc(field_cachep, GFP_TRACE);
field->name = name;
field->type = type;
......
list_add(&field->link, head); // head = call->class->fields/get_fields
#0 __trace_define_field (len=0, filter_type=0, is_signed=0, size=8, offset=8, name=0xffffffff82446089 "fn", type=0xffffffff8243b227 "const char *", head=0xffffffff82e35b50 <event_class_ma_op+48>) at kernel/trace/trace_events.c:121
#1 trace_define_field_ext (len=<optimized out>, filter_type=0, is_signed=<optimized out>, size=8, offset=8, name=0xffffffff82446089 "fn", type=0xffffffff8243b227 "const char *", call=0xffffffff82e25a60 <event_ma_op>) at kernel/trace/trace_events.c:168
#2 event_define_fields (call=call@entry=0xffffffff82e25a60 <event_ma_op>) at kernel/trace/trace_events.c:2437
#3 0xffffffff811b6264 in event_define_fields (call=0xffffffff82e25a60 <event_ma_op>) at kernel/trace/trace_events.c:3732
#4 __trace_early_add_new_event (tr=0xffffffff82d4b5e0 <global_trace>, call=0xffffffff82e25a60 <event_ma_op>) at kernel/trace/trace_events.c:3039
#5 __trace_early_add_events (tr=tr@entry=0xffffffff82d4b5e0 <global_trace>) at kernel/trace/trace_events.c:3727
#6 0xffffffff8336d2fd in event_trace_enable () at kernel/trace/trace_events.c:3987
#7 trace_event_init () at kernel/trace/trace_events.c:4067
#8 0xffffffff8336ccb1 in trace_init () at kernel/trace/trace.c:10594
#9 0xffffffff83338ba0 in start_kernel () at init/main.c:970
so,参数来源于trace_event_class->fields_array
SYSCALL_DEFINEx
再看一下系统调用追踪点时候fields_array内容:
fields_array开头的SYSCALL_FIELD(int, __syscall_nr)是通用的
接下来通过type = TRACE_FUNCTION_TYPE转而调用syscall提供的函数syscall_enter_define_fields,由它自己遍历call->data中的types类型列表和args参数名列表,并添加到call->class->fields/get_fields中
/root/qemu/linux-6.6.58/kernel/trace/trace_syscalls.c:270
static int __init syscall_enter_define_fields(struct trace_event_call *call)
{
struct syscall_trace_enter trace;
struct syscall_metadata *meta = call->data;
int offset = offsetof(typeof(trace), args);
int ret = 0;
int i;
for (i = 0; i < meta->nb_args; i++) {
ret = trace_define_field(call, meta->types[i],
meta->args[i], offset,
sizeof(unsigned long), 0,
FILTER_OTHER);
static struct trace_event_fields __refdata syscall_enter_fields_array[] = {
SYSCALL_FIELD(int, __syscall_nr),
{ .type = TRACE_FUNCTION_TYPE,
.define_fields = syscall_enter_define_fields },
struct trace_event_class __refdata event_class_syscall_enter = {
.fields_array = syscall_enter_fields_array,
TRACE_EVENT Macro
再看一下散布于内核各处的大多数其他追踪点的定义方法:
/root/qemu/linux-6.6.58/include/trace/events/sched.h: 222
TRACE_EVENT(sched_switch,
......
TP_STRUCT__entry(
__array( char, prev_comm, TASK_COMM_LEN )
__field( pid_t, prev_pid )
__field( int, prev_prio )
__field( long, prev_state )
__array( char, next_comm, TASK_COMM_LEN )
TRACE_EVENT在include/trace/trace_events.h被分步解析,很像lsm框架的用法,每一次步骤替换一下某些宏的实现
最开始是定义TRACE_EVENT为DECLARE_EVENT_CLASS,后面分步骤替换DECLARE_EVENT_CLASS实现
/root/qemu/linux-6.6.58/include/trace/trace_events.h: 38
#undef TRACE_EVENT
#define TRACE_EVENT(name, proto, args, tstruct, assign, print) \
DECLARE_EVENT_CLASS(name, \
PARAMS(proto), \
PARAMS(args), \
PARAMS(tstruct), \
PARAMS(assign), \
PARAMS(print)); \
第4步时候
以TP_STRUCT__entry中的__array( char, next_comm, TASK_COMM_LEN )为例,__array的内容会被展开成一个struct trace_event_fields结构体的初始化信息,这个结构体正是trace_event_class->fields_array需要的
/root/qemu/linux-6.6.58/include/trace/stages/stage4_event_fields.h: 26
#define __array(_type, _item, _len) { \
.type = #_type"["__stringify(_len)"]", .name = #_item, \
.size = sizeof(_type[_len]), .align = ALIGN_STRUCTFIELD(_type), \
.is_signed = is_signed_type(_type), .filter_type = FILTER_OTHER,\
.len = _len },
#include "stages/stage4_event_fields.h"
#define DECLARE_EVENT_CLASS(call, proto, args, tstruct, func, print) \
static struct trace_event_fields trace_custom_event_fields_##call[] = { \
tstruct \
{} };
所以第7步时候,.fields_array被设置为第4步生成的结构体数组
/root/qemu/linux-6.6.58/include/trace/trace_events.h: 421
#include "stages/stage7_class_define.h"
#define DECLARE_EVENT_CLASS(call, proto, args, tstruct, assign, print) \
static char custom_print_fmt_##call[] = print; \
static struct trace_event_class __used __refdata custom_event_class_##call = { \
.system = TRACE_SYSTEM_STRING, \
.fields_array = trace_custom_event_fields_##call, \
同样,_ftrace_events节中存储trace_event_call地址,trace_event_call->class->fields_array就是参数和参数名列表
/root/qemu/linux-6.6.58/include/trace/trace_events.h: 440
static struct trace_event_call __used event_##call = { \
.class = &event_class_##template, \
static struct trace_event_call __used \
__section("_ftrace_events") *__event_##call = &event_##call
即,TRACE_EVENT时候通过分步的宏解析,参数列表和参数名列表都直接被存储到trace_event_class->fields_array中,在内核启动时候这些信息会被展开到trace_event_class->fields中以供读取format文件时使用。
End
- 定义追踪点时候通过宏定义生成了trace_event_class结构,trace_event_class->fields_array包含了参数类型和参数名称列表(TRACE_EVENT),或者包含用于填充参数类型和参数名列表的函数(syscall宏),trace_event_call->class指向trace_event_class地址,所有追踪点的trace_event_call的地址则存放到_ftrace_events节中
- 构建内核时,通过链接脚本将__start_ftrace_events设置为_ftrace_events节的起始地址
- 启动内核时,遍历__start_ftrace_events指向的区域中的地址,将这些trace_event_class->fields_array中的列表保存到trace_event_class->fields中
- 当读取到[event]/format时候遍历trace_event_class->fields链表即可