eBPF debugfs中的追踪点format实现原理

原文地址:

https://gitee.com/kiraskyler/Articles/blob/master/eBPF/debugfs中的追踪点format实现原理.md

文章目录

How

debugfs中可以通过format查看追踪点的参数

复制代码
# cat /sys/kernel/debug/tracing/events/sched/sched_wakeup/format
name: sched_wakeup
ID: 333
format:
        field:unsigned short common_type;       offset:0;       size:2; signed:0;
......

那么,这里是如何实现的呢?这些信息从哪里来的?

原来以为这些内容来自内核的.BTF节,现在发现它们其实是定义追踪点时由宏保存下来的信息。

Start

复制代码
# cat /sys/kernel/debug/tracing/events/sched/sched_wakeup/format
name: sched_wakeup
ID: 333

根据上面的一点点信息,在内核源码中直接搜索字符串"name: ",找到位置:

复制代码
/root/qemu/linux-6.6.58/kernel/trace/trace_events.c: 1572

static int f_show(struct seq_file *m, void *v)
{
	struct trace_event_file *file = event_file_data(m->private);
	struct trace_event_call *call = file->event_call;
	struct ftrace_event_field *field;
	const char *array_descriptor;

	switch ((unsigned long)v) {
	case FORMAT_HEADER:
		seq_printf(m, "name: %s\n", trace_event_name(call));
		seq_printf(m, "ID: %d\n", call->event.type);
		seq_puts(m, "format:\n");
		return 0;

不过seq_puts(m, "format:\n")之后就return了,详细的format不是从这里来的

当前文件内有这样一个结构体定义用到了f_show

复制代码
/root/qemu/linux-6.6.58/kernel/trace/trace_events.c: 1651

static const struct seq_operations trace_format_seq_ops = {
	.start		= f_start,
	.next		= f_next,
	.stop		= f_stop,
	.show		= f_show,

重要的是前三个函数:读取这个文件时,seq_file首先调用的是f_start

将f_next/f_show的第二个参数v设置成FORMAT_HEADER,即先打印开头的name、ID、format:这几行提示信息

复制代码
static void *f_start(struct seq_file *m, loff_t *pos)
{
	struct trace_event_file *file;
	void *p = (void *)FORMAT_HEADER;

	/* ->stop() is called even if ->start() fails */
	file = event_file_file(m->private);

	return p; // 返回的是f_next/f_show的第二个参数v

接下来在f_next函数中,第一次进入时的v参数即f_start函数中初始化的FORMAT_HEADER

每次显示一行内容时都会执行node = node->prev,也即沿着ftrace_event_field->link.prev移动(字段是通过list_add头插进链表的,所以沿prev方向遍历得到的正是定义顺序),每个ftrace_event_field存储的正是format中一行显示的参数信息

也就是,参数列表存储在file->event_call->class->fields/get_fields

复制代码
/root/qemu/linux-6.6.58/kernel/trace/trace_events.c: 1539

static void *f_next(struct seq_file *m, void *v, loff_t *pos)
{
	struct trace_event_file *file = event_file_data(m->private); // file_inode(m->private)->i_private
	struct trace_event_call *call = file->event_call;
	struct list_head *common_head = &ftrace_common_fields;
	struct list_head *head = trace_get_fields(call); // 参数列表来源:call->class->fields/get_fields
	struct list_head *node = v; // ftrace_event_field->link,参数信息列表

	(*pos)++; // 递增,用来f_show控制显示内容

	switch ((unsigned long)v) {
	case FORMAT_HEADER:
		node = common_head;
		break;

	case FORMAT_FIELD_SEPERATOR:
		node = head; // 参数列表头,呼应1563行
		break;

	case FORMAT_PRINTFMT:
		/* all done */
		return NULL;
	}

	node = node->prev;
	if (node == common_head)
		return (void *)FORMAT_FIELD_SEPERATOR;
	else if (node == head)
		return (void *)FORMAT_PRINTFMT;
	else
		return node;

static inline struct list_head *
trace_get_fields(struct trace_event_call *event_call)
{
	if (!event_call->class->get_fields)
		return &event_call->class->fields;
	return event_call->class->get_fields(event_call);

f_show负责打印struct ftrace_event_field *field中的内容

复制代码
/root/qemu/linux-6.6.58/kernel/trace/trace_events.c: 1572

static int f_show(struct seq_file *m, void *v)
{
	struct trace_event_file *file = event_file_data(m->private);
	struct trace_event_call *call = file->event_call;
	struct ftrace_event_field *field;
	const char *array_descriptor;

	switch ((unsigned long)v) {
	case FORMAT_HEADER:
		seq_printf(m, "name: %s\n", trace_event_name(call));
		seq_printf(m, "ID: %d\n", call->event.type);
		seq_puts(m, "format:\n");
		return 0;

	case FORMAT_FIELD_SEPERATOR:
		seq_putc(m, '\n');
		return 0;

	case FORMAT_PRINTFMT:
		seq_printf(m, "\nprint fmt: %s\n",
			   call->print_fmt);
		return 0;
	}

	field = list_entry(v, struct ftrace_event_field, link); // v是ftrace_event_field->link,找到field

	array_descriptor = strchr(field->type, '['); // eg: unsigned char

		seq_printf(m, "\tfield:%.*s %s[];\toffset:%u;\tsize:%u;\tsigned:%d;\n",
				(int)(array_descriptor - field->type),
				field->type, field->name,
				field->offset, field->size, !!field->is_signed); // 在此显示

ok,知道了参数列表保存在file->event_call->class->fields/get_fields,那么,再看看在哪里初始化

init

当前文件内只有一处设置file->event_call,断点后重新启动kernel

复制代码
/root/qemu/linux-6.6.58/kernel/trace/trace_events.c: 2920

static struct trace_event_file *
trace_create_new_event(struct trace_event_call *call,
		       struct trace_array *tr)
{
    ......

	file->event_call = call

-exec bt
#0  trace_create_new_event (call=call@entry=0xffffffff82e25a60 <event_ma_op>, tr=tr@entry=0xffffffff82d4b5e0 <global_trace>) at kernel/trace/trace_events.c:2929
#1  0xffffffff811b616a in __trace_early_add_new_event (tr=0xffffffff82d4b5e0 <global_trace>, call=0xffffffff82e25a60 <event_ma_op>) at kernel/trace/trace_events.c:3035
#2  __trace_early_add_events (tr=tr@entry=0xffffffff82d4b5e0 <global_trace>) at kernel/trace/trace_events.c:3727
#3  0xffffffff8336d2fd in event_trace_enable () at kernel/trace/trace_events.c:3987
#4  trace_event_init () at kernel/trace/trace_events.c:4067
#5  0xffffffff8336ccb1 in trace_init () at kernel/trace/trace.c:10594
#6  0xffffffff83338ba0 in start_kernel () at init/main.c:970

call参数在#2传递而来,来自于ftrace_events

复制代码
/root/qemu/linux-6.6.58/kernel/trace/trace_events.c: 3716

void __trace_early_add_events(struct trace_array *tr)
{
	struct trace_event_call *call;
	int ret;

	list_for_each_entry(call, &ftrace_events, list) {
		/* Early boot up should not have any modules loaded */
		if (!(call->flags & TRACE_EVENT_FL_DYNAMIC) &&
		    WARN_ON_ONCE(call->module))
			continue;

		ret = __trace_early_add_new_event(call, tr);

再继续找到call是从__start_ftrace_events加载而来

复制代码
kernel/trace/trace_events.c:3962

static __init int event_trace_enable(void)
{
	struct trace_array *tr = top_trace_array();
	struct trace_event_call **iter, *call;
	int ret;

	for_each_event(iter, __start_ftrace_events, __stop_ftrace_events) {

		call = *iter;
		ret = event_init(call);
		if (!ret)
			list_add(&call->list, &ftrace_events);

-exec bt
#0  event_trace_enable () at kernel/trace/trace_events.c:3968
#1  trace_event_init () at kernel/trace/trace_events.c:4067
#2  0xffffffff8336ccb1 in trace_init () at kernel/trace/trace.c:10594
#3  0xffffffff83338ba0 in start_kernel () at init/main.c:970

__start_ftrace_events则是vmlinux的_ftrace_events节地址

复制代码
/root/qemu/linux-6.6.58/arch/x86/kernel/vmlinux.lds: 73

__start_ftrace_events = .; KEEP(*(_ftrace_events))

_ftrace_events section

SYSCALL_DEFINEx Macro

这里是一个系统调用追踪点定义:

复制代码
/root/qemu/linux-6.6.58/include/linux/syscalls.h: 141

#define SYSCALL_TRACE_ENTER_EVENT(sname)				\
	static struct syscall_metadata __syscall_meta_##sname;		\ // 前向声明,实际在SYSCALL_METADATA中定义
	static struct trace_event_call __used				\
	  event_enter_##sname = {					\
		.class			= &event_class_syscall_enter,	\
		{							\
			.name                   = "sys_enter"#sname,	\
		},							\
		.event.funcs            = &enter_syscall_print_funcs,	\
		.data			= (void *)&__syscall_meta_##sname,\ // 这里的types/args分别保存参数类型和参数名称列表
		.flags                  = TRACE_EVENT_FL_CAP_ANY,	\
	};								\
	static struct trace_event_call __used				\
	  __section("_ftrace_events")					\
	 *__event_enter_##sname = &event_enter_##sname; // event_enter_##sname指针地址放到_ftrace_events节中

#define SYSCALL_METADATA(sname, nb, ...)			\
	static const char *types_##sname[] = {			\ // 前向声明参数列表
		__MAP(nb,__SC_STR_TDECL,__VA_ARGS__)		\
	};							\
	static const char *args_##sname[] = {			\
		__MAP(nb,__SC_STR_ADECL,__VA_ARGS__)		\ // 前向声明参数名称
	};							\
	SYSCALL_TRACE_ENTER_EVENT(sname);			\
	SYSCALL_TRACE_EXIT_EVENT(sname);			\
	static struct syscall_metadata __used			\
	  __syscall_meta_##sname = {				\
		.name 		= "sys"#sname,			\
		.syscall_nr	= -1,	/* Filled in at boot */	\
		.nb_args 	= nb,				\
		.types		= nb ? types_##sname : NULL,	\ // 参数列表
		.args		= nb ? args_##sname : NULL,	\ // 参数名称列表
		.enter_event	= &event_enter_##sname,		\
		.exit_event	= &event_exit_##sname,		\
		.enter_fields	= LIST_HEAD_INIT(__syscall_meta_##sname.enter_fields), \
	};							\
	static struct syscall_metadata __used			\
	  __section("__syscalls_metadata")			\
	 *__p_syscall_meta_##sname = &__syscall_meta_##sname; // __syscall_meta_##sname指针地址放到__syscalls_metadata节中

#define SYSCALL_DEFINE1(name, ...) SYSCALL_DEFINEx(1, _##name, __VA_ARGS__) // 一个参数的系统调用宏定义

#define SYSCALL_DEFINEx(x, sname, ...)				\
	SYSCALL_METADATA(sname, x, __VA_ARGS__)			\ // 这里定义参数

_ftrace_events节中存储的是struct trace_event_call的地址

struct trace_event_call->data->types/args分别是在定义追踪点时通过宏创建出的参数类型列表和参数名称列表,接下来看看这两个列表如何被使用到

ftrace_event_field

还是在内核启动阶段,会遍历call

遍历fields_array将参数列表放到call->class->fields/get_fields中,而field->type == TRACE_FUNCTION_TYPE时候会调用define_fields将参数列表放到call->class->fields/get_fields

复制代码
/root/qemu/linux-6.6.58/kernel/trace/trace_events.c: 2415

static int
event_define_fields(struct trace_event_call *call)
{
	head = trace_get_fields(call);
	if (list_empty(head)) {
	
		struct trace_event_fields *field = call->class->fields_array;
		unsigned int offset = sizeof(struct trace_entry);

		for (; field->type; field++) {
			if (field->type == TRACE_FUNCTION_TYPE) {// syscall_enter类型fields在此填充
				field->define_fields(call);
				break;
			}
			// 大多其他类型tracepoint走下面的trace_define_field_ext,作用差不多都是填充到call->class->fields
			offset = ALIGN(offset, field->align);
			ret = trace_define_field_ext(call, field->type, field->name,
						 offset, field->size,
						 field->is_signed, field->filter_type,
						 field->len); // 每个参数信息独立添加到对应的ftrace_event_field

/root/qemu/linux-6.6.58/kernel/trace/trace_events.c: 115

static int __trace_define_field(struct list_head *head, const char *type,
				const char *name, int offset, int size,
				int is_signed, int filter_type, int len)
{
	struct ftrace_event_field *field;

	field = kmem_cache_alloc(field_cachep, GFP_TRACE);

	field->name = name;
	field->type = type;
    ......

    list_add(&field->link, head); // head = call->class->fields/get_fields

#0  __trace_define_field (len=0, filter_type=0, is_signed=0, size=8, offset=8, name=0xffffffff82446089 "fn", type=0xffffffff8243b227 "const char *", head=0xffffffff82e35b50 <event_class_ma_op+48>) at kernel/trace/trace_events.c:121
#1  trace_define_field_ext (len=<optimized out>, filter_type=0, is_signed=<optimized out>, size=8, offset=8, name=0xffffffff82446089 "fn", type=0xffffffff8243b227 "const char *", call=0xffffffff82e25a60 <event_ma_op>) at kernel/trace/trace_events.c:168
#2  event_define_fields (call=call@entry=0xffffffff82e25a60 <event_ma_op>) at kernel/trace/trace_events.c:2437
#3  0xffffffff811b6264 in event_define_fields (call=0xffffffff82e25a60 <event_ma_op>) at kernel/trace/trace_events.c:3732
#4  __trace_early_add_new_event (tr=0xffffffff82d4b5e0 <global_trace>, call=0xffffffff82e25a60 <event_ma_op>) at kernel/trace/trace_events.c:3039
#5  __trace_early_add_events (tr=tr@entry=0xffffffff82d4b5e0 <global_trace>) at kernel/trace/trace_events.c:3727
#6  0xffffffff8336d2fd in event_trace_enable () at kernel/trace/trace_events.c:3987
#7  trace_event_init () at kernel/trace/trace_events.c:4067
#8  0xffffffff8336ccb1 in trace_init () at kernel/trace/trace.c:10594
#9  0xffffffff83338ba0 in start_kernel () at init/main.c:970

so,参数来源于trace_event_class->fields_array

SYSCALL_DEFINEx

再看一下系统调用追踪点时候fields_array内容:

fields_array开头的SYSCALL_FIELD(int, __syscall_nr)是通用的

接下来通过type = TRACE_FUNCTION_TYPE转而调用syscall提供的函数syscall_enter_define_fields,由它自己遍历call->data中的types类型列表和args参数名列表,并添加到call->class->fields/get_fields

复制代码
/root/qemu/linux-6.6.58/kernel/trace/trace_syscalls.c:270

static int __init syscall_enter_define_fields(struct trace_event_call *call)
{
	struct syscall_trace_enter trace;
	struct syscall_metadata *meta = call->data;
	int offset = offsetof(typeof(trace), args);
	int ret = 0;
	int i;

	for (i = 0; i < meta->nb_args; i++) {
		ret = trace_define_field(call, meta->types[i],
					 meta->args[i], offset,
					 sizeof(unsigned long), 0,
					 FILTER_OTHER);

static struct trace_event_fields __refdata syscall_enter_fields_array[] = {
	SYSCALL_FIELD(int, __syscall_nr),
	{ .type = TRACE_FUNCTION_TYPE,
	  .define_fields = syscall_enter_define_fields },

struct trace_event_class __refdata event_class_syscall_enter = {
	.fields_array	= syscall_enter_fields_array,

TRACE_EVENT Macro

再看一下大多埋于各处的其他追踪点的定义方法:

复制代码
/root/qemu/linux-6.6.58/include/trace/events/sched.h: 222

TRACE_EVENT(sched_switch,

    ......

	TP_STRUCT__entry(
		__array(	char,	prev_comm,	TASK_COMM_LEN	)
		__field(	pid_t,	prev_pid			)
		__field(	int,	prev_prio			)
		__field(	long,	prev_state			)
		__array(	char,	next_comm,	TASK_COMM_LEN	)

TRACE_EVENT在include/trace/trace_events.h中被分步解析,很像lsm框架的用法,每一个步骤都会替换某些宏的实现

最开始是将TRACE_EVENT定义为DECLARE_EVENT_CLASS,后面再分步骤替换DECLARE_EVENT_CLASS的实现

复制代码
/root/qemu/linux-6.6.58/include/trace/trace_events.h: 38

#undef TRACE_EVENT
#define TRACE_EVENT(name, proto, args, tstruct, assign, print) \
	DECLARE_EVENT_CLASS(name,			       \
			     PARAMS(proto),		       \
			     PARAMS(args),		       \
			     PARAMS(tstruct),		       \
			     PARAMS(assign),		       \
			     PARAMS(print));		       \

第4步时候

以TP_STRUCT__entry中的__array(char, next_comm, TASK_COMM_LEN)为例,__array的内容会被展开成一个struct trace_event_fields结构体初始化项,这个结构体正是trace_event_class->fields_array需要的

复制代码
/root/qemu/linux-6.6.58/include/trace/stages/stage4_event_fields.h: 26

#define __array(_type, _item, _len) {					\
	.type = #_type"["__stringify(_len)"]", .name = #_item,		\
	.size = sizeof(_type[_len]), .align = ALIGN_STRUCTFIELD(_type),	\
	.is_signed = is_signed_type(_type), .filter_type = FILTER_OTHER,\
	.len = _len },

#include "stages/stage4_event_fields.h"

#define DECLARE_EVENT_CLASS(call, proto, args, tstruct, func, print)	\
static struct trace_event_fields trace_custom_event_fields_##call[] = {	\
	tstruct								\
	{} };

所以第7步时候,.fields_array指向的就是第4步生成的结构体数组

复制代码
/root/qemu/linux-6.6.58/include/trace/trace_events.h: 421

#include "stages/stage7_class_define.h"

#define DECLARE_EVENT_CLASS(call, proto, args, tstruct, assign, print) \
static char custom_print_fmt_##call[] = print;					\
static struct trace_event_class __used __refdata custom_event_class_##call = { \
	.system			= TRACE_SYSTEM_STRING,			\
	.fields_array		= trace_custom_event_fields_##call,		\

同样,_ftrace_events节中存储trace_event_call地址,trace_event_call->class->fields_array就是参数和参数名列表

复制代码
/root/qemu/linux-6.6.58/include/trace/trace_events.h: 440

static struct trace_event_call __used event_##call = {			\
	.class			= &event_class_##template,		\

static struct trace_event_call __used					\
__section("_ftrace_events") *__event_##call = &event_##call

即,TRACE_EVENT时候通过分步的宏解析,参数列表和参数名列表都直接被存储到trace_event_class->fields_array中,在内核启动时候这些信息会被展开到trace_event_class->fields中以供读取format文件时使用。

End

  • 定义追踪点时候通过宏定义生成了trace_event_class结构,trace_event_class->fields_array要么直接包含参数类型和参数名称列表(TRACE_EVENT),要么提供一个用来填充参数类型和参数名列表的函数(syscall宏);trace_event_call->class指向trace_event_class的地址,所有追踪点的trace_event_call的地址则存放到_ftrace_events节中
  • 构建内核时,通过链接脚本将__start_ftrace_events设置为_ftrace_events节的起始地址
  • 启动内核时,遍历__start_ftrace_events指向的区域中的地址,将这些trace_event_class->fields_array中的列表保存到trace_event_class->fields
  • 当读取到[event]/format时候遍历trace_event_class->fields链表即可
相关推荐
2501_927773073 小时前
uboot挂载
linux·运维·服务器
wdfk_prog3 小时前
[Linux]学习笔记系列 -- [drivers][dma]dmapool
linux·笔记·学习
goxingman4 小时前
在 Linux 中查看磁盘运行占用(I/O 使用率)
linux·运维·chrome
STCNXPARM4 小时前
Linux camera之Media子系统
linux·camera·v4l2·media子系统
小天源4 小时前
XShell一台控制多台操作详情
linux·运维·服务器
xu_yule4 小时前
网络和Linux网络-13(高级IO+多路转接)五种IO模型+select编程
linux·网络·c++·select·i/o
夜流冰5 小时前
编程参考 - Linux kernel代码查看
linux·运维·服务器
xu_yule5 小时前
网络和Linux网络-14(IO多路转接)poll和epoll编程-服务器
linux·运维·服务器·epoll·poll
timi先生5 小时前
全新的linux如何进行远程xshell操作?
linux·运维·服务器