内核观测工具BPF
总体思路
bpf uprobe
bpf程序除了通过kprobe监控内核函数外,还能通过uprobe监控用户态函数,包括动态库的调用,如线程库libpthread.so里锁的调用。
- 线程里锁的操作在动态库libpthread.so里。
- 调用pthread_mutex_lock是申请锁
- 从pthread_mutex_lock返回是得到锁
- 调用pthread_mutex_unlock是释放锁
perf buffer
-
perf_buffer 是 libbpf 提供的一个高性能事件传输机制,用于将 eBPF 程序中产生的数据高效、可靠地传递到用户态程序。
┌─────────────┐ ┌─────────────┐ ┌─────────────┐
│ eBPF 内核态 │ │ Perf Buffer │ │ 用户态程序 │
│ │ ────→ │ │ ────→ │ │
│ 产生事件数据 │ │ 环形缓冲区 │ │ 接收并处理 │
└─────────────┘ └─────────────┘ └─────────────┘ -
Perf Event Array Map
struct {
__uint(type, BPF_MAP_TYPE_PERF_EVENT_ARRAY); // map 类型:perf event 数组
__uint(key_size, sizeof(u32)); // key 类型:CPU 编号
__uint(value_size, sizeof(u32)); // value 类型:文件描述符
} events SEC(".maps");
- type BPF_MAP_TYPE_PERF_EVENT_ARRAY 专门用于 perf_event 的 map 类型
- key_size sizeof(u32) key 是 CPU 编号(0,1,2...)
- value_size sizeof(u32) value 是 perf_event 的文件描述符
- max_entries 未指定 默认为系统 CPU 数量
-
bpf输出perf事件
struct lock_event event = {}; event.timestamp = get_time_us(); event.pid = pid; event.tid = tid; event.mutex = mutex; event.action = ACTION_REQUEST; bpf_get_current_comm(&event.comm, sizeof(event.comm)); bpf_perf_event_output(ctx, &events, BPF_F_CURRENT_CPU, &event, sizeof(event)); -
用户端程序申请perf buffer绑定到events
struct perf_buffer_opts pb_opts = { .sample_cb = handle_event, .lost_cb = handle_lost_events, }; pb = perf_buffer__new(bpf_map__fd(skel->maps.events), 4096, &pb_opts); -
用户端程序主循环中请求perf事件
通过 perf_buffer__poll 轮询 perf buffer 中的事件,取到事件后会自动调用注册的回调处理函数。
// 事件循环
while (!exiting) {
    err = perf_buffer__poll(pb, 100);
    if (err < 0 && err != -EINTR) {
        fprintf(stderr, "Error polling perf buffer: %d\n", err);
        break;
    }
}
BPF内核态程序
// mutex_trace.bpf.c
#include "vmlinux.h"
#include <bpf/bpf_helpers.h>
typedef unsigned int u32;
typedef unsigned long long u64;
char LICENSE[] SEC("license") = "GPL";
// Event record sent to user space through the `events` perf map, one per
// traced mutex operation.
struct lock_event {
u64 timestamp; // timestamp in microseconds (from get_time_us())
u32 pid; // process ID (upper 32 bits of bpf_get_current_pid_tgid())
u32 tid; // thread ID (lower 32 bits of bpf_get_current_pid_tgid())
u64 mutex; // address of the mutex
u32 action; // operation type, one of the ACTION_* constants
int ret; // return value of the traced call (only set by the return probes)
char comm[16]; // task name, filled by bpf_get_current_comm()
};
// Operation types stored in lock_event.action:
//   ACTION_REQUEST    - set by trace_mutex_lock (entry probe)
//   ACTION_GOT        - set by trace_mutex_lock_ret (return probe)
//   ACTION_RELEASE    - set by trace_mutex_unlock (entry probe)
//   ACTION_TRY_LOCK   - set by trace_mutex_trylock (entry probe)
//   ACTION_TRY_RESULT - set by trace_mutex_trylock_ret (return probe)
#define ACTION_REQUEST 1
#define ACTION_GOT 2
#define ACTION_RELEASE 3
#define ACTION_TRY_LOCK 4
#define ACTION_TRY_RESULT 5
// Set of process/thread IDs to monitor. Only key presence matters:
// is_target() checks whether the lookup succeeds and never reads the u8 value.
struct {
__uint(type, BPF_MAP_TYPE_HASH);
__uint(max_entries, 10240);
__type(key, u32);
__type(value, u8);
} target_pids SEC(".maps");
// Perf event array that the probes write lock_event records to with
// bpf_perf_event_output(); max_entries is left unspecified here.
struct {
__uint(type, BPF_MAP_TYPE_PERF_EVENT_ARRAY);
__uint(key_size, sizeof(u32));
__uint(value_size, sizeof(u32));
} events SEC(".maps");
// Return the probed function's first argument as read from the saved
// registers (x86_64 only: the value of the `di` register).
static __always_inline u64 get_first_param(struct pt_regs *ctx)
{
    u64 first_arg = (u64)ctx->di;

    return first_arg;
}
// Current bpf_ktime_get_ns() reading converted from nanoseconds to
// microseconds (integer division, remainder dropped).
static __always_inline u64 get_time_us(void)
{
    u64 now_ns = bpf_ktime_get_ns();

    return now_ns / 1000;
}
// Return 1 when `id` (a process or thread ID) is a key in target_pids,
// 0 otherwise.
static __always_inline int is_target(u32 id)
{
    if (bpf_map_lookup_elem(&target_pids, &id))
        return 1;

    return 0;
}
// 申请锁
SEC("uprobe")
int trace_mutex_lock(struct pt_regs *ctx)
{
u64 pid_tgid = bpf_get_current_pid_tgid();
u32 pid = pid_tgid >> 32;
u32 tid = (u32)pid_tgid;
u64 mutex = get_first_param(ctx);
if (!is_target(pid) && !is_target(tid)) {
return 0;
}
struct lock_event event = {};
event.timestamp = get_time_us();
event.pid = pid;
event.tid = tid;
event.mutex = mutex;
event.action = ACTION_REQUEST;
bpf_get_current_comm(&event.comm, sizeof(event.comm));
bpf_perf_event_output(ctx, &events, BPF_F_CURRENT_CPU, &event, sizeof(event));
return 0;
}
// 得到锁
SEC("uretprobe")
int trace_mutex_lock_ret(struct pt_regs *ctx)
{
u64 pid_tgid = bpf_get_current_pid_tgid();
u32 pid = pid_tgid >> 32;
u32 tid = (u32)pid_tgid;
u64 mutex = get_first_param(ctx);
int ret = (int)ctx->ax;
if (!is_target(pid) && !is_target(tid)) {
return 0;
}
struct lock_event event = {};
event.timestamp = get_time_us();
event.pid = pid;
event.tid = tid;
event.mutex = mutex;
event.action = ACTION_GOT;
event.ret = ret;
bpf_get_current_comm(&event.comm, sizeof(event.comm));
bpf_perf_event_output(ctx, &events, BPF_F_CURRENT_CPU, &event, sizeof(event));
return 0;
}
// Lock released: entry probe that emits an ACTION_RELEASE event for
// monitored tasks.
SEC("uprobe")
int trace_mutex_unlock(struct pt_regs *ctx)
{
    const u64 ids = bpf_get_current_pid_tgid();
    const u32 tgid = ids >> 32;
    const u32 thread = (u32)ids;
    const u64 mutex_addr = get_first_param(ctx);

    // Skip tasks that are registered neither by process ID nor by thread ID.
    if (!(is_target(tgid) || is_target(thread)))
        return 0;

    // Fields not listed (ret, comm) start out zeroed.
    struct lock_event ev = {
        .timestamp = get_time_us(),
        .pid = tgid,
        .tid = thread,
        .mutex = mutex_addr,
        .action = ACTION_RELEASE,
    };

    bpf_get_current_comm(&ev.comm, sizeof(ev.comm));
    bpf_perf_event_output(ctx, &events, BPF_F_CURRENT_CPU, &ev, sizeof(ev));
    return 0;
}
// trylock
SEC("uprobe")
int trace_mutex_trylock(struct pt_regs *ctx)
{
u64 pid_tgid = bpf_get_current_pid_tgid();
u32 pid = pid_tgid >> 32;
u32 tid = (u32)pid_tgid;
u64 mutex = get_first_param(ctx);
if (!is_target(pid) && !is_target(tid)) {
return 0;
}
struct lock_event event = {};
event.timestamp = get_time_us();
event.pid = pid;
event.tid = tid;
event.mutex = mutex;
event.action = ACTION_TRY_LOCK;
bpf_get_current_comm(&event.comm, sizeof(event.comm));
bpf_perf_event_output(ctx, &events, BPF_F_CURRENT_CPU, &event, sizeof(event));
return 0;
}
// trylock 结果
SEC("uretprobe")
int trace_mutex_trylock_ret(struct pt_regs *ctx)
{
u64 pid_tgid = bpf_get_current_pid_tgid();
u32 pid = pid_tgid >> 32;
u32 tid = (u32)pid_tgid;
u64 mutex = get_first_param(ctx);
int ret = (int)ctx->ax;
if (!is_target(pid) && !is_target(tid)) {
return 0;
}
struct lock_event event = {};
event.timestamp = get_time_us();
event.pid = pid;
event.tid = tid;
event.mutex = mutex;
event.action = ACTION_TRY_RESULT;
event.ret = ret;
bpf_get_current_comm(&event.comm, sizeof(event.comm));
bpf_perf_event_output(ctx, &events, BPF_F_CURRENT_CPU, &event, sizeof(event));
return 0;
}
用户态程序
// loader.c
#include <stdio.h>
#include <stdlib.h>
#include <unistd.h>
#include <signal.h>
#include <string.h>
#include <errno.h>
#include <time.h>
#include <sys/resource.h>
#include <sys/time.h>
#include <dirent.h>
#include <bpf/libbpf.h>
#include <bpf/bpf.h>
#include "mutex_trace_bpf.skel.h"
typedef unsigned int u32;
typedef unsigned long long u64;
typedef unsigned char u8;
// Operation types carried in lock_event.action; handle_event() and
// get_action_name() switch on these values.
#define ACTION_REQUEST 1
#define ACTION_GOT 2
#define ACTION_RELEASE 3
#define ACTION_TRY_LOCK 4
#define ACTION_TRY_RESULT 5
// Lock event record as received from the perf buffer; handle_event() casts
// the raw sample to this type.
struct lock_event {
u64 timestamp; // microseconds; format_time() adds boot_time_us to it
u32 pid; // process ID
u32 tid; // thread ID
u64 mutex; // address of the mutex (printed with %p)
u32 action; // one of the ACTION_* constants
int ret; // call result; handle_event() treats 0 as SUCCESS
char comm[16]; // task name
};
// Exit flag: set by sig_handler() and polled by the event loop in main().
// volatile sig_atomic_t is the object type the C standard guarantees can be
// written safely from an asynchronous signal handler.
static volatile sig_atomic_t exiting = 0;
// PID given on the command line; -1 until main() parses it.
static int target_pid = -1;
// Value returned by get_boot_time_us(); format_time() adds it to event
// timestamps to obtain wall-clock time.
static u64 boot_time_us = 0;
// Signal handler: raises the exit flag so the main loop stops.
static void sig_handler(int sig)
{
    (void)sig; // the signal number itself is not used

    exiting = true;
}
// Set both the soft and hard RLIMIT_MEMLOCK limits to 128 MiB.
// Returns the result of setrlimit(): 0 on success, -1 on failure.
static int bump_memlock_rlimit(void)
{
    const rlim_t limit_bytes = 128 * 1024 * 1024;
    struct rlimit memlock;

    memlock.rlim_cur = limit_bytes;
    memlock.rlim_max = limit_bytes;

    return setrlimit(RLIMIT_MEMLOCK, &memlock);
}
// Compute the wall-clock time of system boot in microseconds since the epoch,
// as (current time of day) - (first value in /proc/uptime).
//
// Returns 0 when /proc/uptime cannot be opened or parsed, when
// gettimeofday() fails, or when the computed value would be negative.
//
// NOTE(review): event timestamps come from bpf_ktime_get_ns() in the BPF
// program. If that clock excludes time spent suspended while /proc/uptime
// includes it, displayed times will be offset after a suspend — confirm.
static u64 get_boot_time_us(void)
{
    FILE *fp = fopen("/proc/uptime", "r");
    if (!fp) return 0;

    // Only the first field (seconds since boot) is needed.
    double uptime = 0.0;
    int parsed = fscanf(fp, "%lf", &uptime);
    fclose(fp);
    if (parsed != 1 || uptime < 0.0) return 0;

    // Current wall-clock time.
    struct timeval tv;
    if (gettimeofday(&tv, NULL) != 0) return 0;

    // Boot time = now - uptime; guard the unsigned subtraction.
    u64 now_us = (u64)tv.tv_sec * 1000000 + (u64)tv.tv_usec;
    u64 uptime_us = (u64)(uptime * 1000000);
    if (uptime_us > now_us) return 0;

    return now_us - uptime_us;
}
// Format an event timestamp as local wall-clock time "HH:MM:SS.uuuuuu".
//
// timestamp_us: microseconds, relative to the same origin as boot_time_us
//               (boot_time_us is added to obtain absolute time).
// buf/buf_size: destination buffer; the result is always NUL-terminated and
//               truncated if the buffer is too small. Nothing is written when
//               buf is NULL or buf_size is 0.
static void format_time(u64 timestamp_us, char *buf, size_t buf_size)
{
    if (!buf || buf_size == 0) return;

    // Convert the relative timestamp to absolute time.
    u64 abs_time_us = boot_time_us + timestamp_us;
    time_t sec = (time_t)(abs_time_us / 1000000);
    long usec = (long)(abs_time_us % 1000000);

    // Render HH:MM:SS into a private buffer so a failed conversion never
    // leaves the caller's buffer in an indeterminate state.
    char hms[16];
    struct tm *tm = localtime(&sec);
    if (!tm || strftime(hms, sizeof(hms), "%H:%M:%S", tm) == 0) {
        snprintf(hms, sizeof(hms), "--:--:--");
    }

    // snprintf always terminates the output, unlike strncpy.
    snprintf(buf, buf_size, "%s.%06ld", hms, usec);
}
// Map an ACTION_* code to its short display name; any other value yields
// "UNKNOWN".
static const char* get_action_name(u32 action)
{
    // Indexed by action code; slots without an entry stay NULL.
    static const char *const names[] = {
        [ACTION_REQUEST]    = "REQUEST",
        [ACTION_GOT]        = "GOT",
        [ACTION_RELEASE]    = "RELEASE",
        [ACTION_TRY_LOCK]   = "TRY",
        [ACTION_TRY_RESULT] = "TRY_RES",
    };
    const size_t count = sizeof(names) / sizeof(names[0]);

    if (action < count && names[action] != NULL)
        return names[action];

    return "UNKNOWN";
}
// Look up the address of a dynamic symbol in a shared library by running
// `nm -D <libpath>` and scanning its output.
//
// libpath: path of the shared object (inserted into a shell command, so it
//          must come from a trusted source).
// symbol:  exact symbol name. A version suffix printed by nm
//          (e.g. "name@@GLIBC_2.2.5") is accepted as a match.
//
// Returns the value nm prints for the first matching line, or 0 when the
// command cannot be run, the command line does not fit, or no defined symbol
// matches.
static size_t get_symbol_offset(const char *libpath, const char *symbol)
{
    char cmd[512];
    char line[512];
    size_t offset = 0;
    size_t symlen = strlen(symbol);

    int n = snprintf(cmd, sizeof(cmd), "nm -D %s 2>/dev/null", libpath);
    if (n < 0 || (size_t)n >= sizeof(cmd)) return 0;

    FILE *fp = popen(cmd, "r");
    if (!fp) return 0;

    while (fgets(line, sizeof(line), fp)) {
        size_t addr;
        char type;
        char name[256];

        // Defined symbols look like "<hex addr> <type> <name>"; undefined
        // ones have no address and fail to parse here.
        if (sscanf(line, "%zx %c %255s", &addr, &type, name) != 3) continue;

        // Match the whole name, optionally followed by "@version".
        if (strncmp(name, symbol, symlen) != 0) continue;
        if (name[symlen] != '\0' && name[symlen] != '@') continue;

        offset = addr;
        break;
    }

    pclose(fp);
    return offset;
}
// Register every thread of `pid` in the target_pids map by listing
// /proc/<pid>/task. Entries whose name does not start with a digit are
// skipped. Returns silently when the map or the directory is unavailable;
// a failed map update is reported on stderr and the scan continues.
static void add_threads(int pid, struct mutex_trace_bpf *skel)
{
    int map_fd = bpf_map__fd(skel->maps.target_pids);
    if (map_fd < 0) return;

    char path[256];
    snprintf(path, sizeof(path), "/proc/%d/task", pid);

    DIR *dir = opendir(path);
    if (!dir) return;

    struct dirent *entry;
    while ((entry = readdir(dir)) != NULL) {
        if (entry->d_name[0] < '0' || entry->d_name[0] > '9') continue;

        int tid = atoi(entry->d_name);
        u8 value = 1;

        // Only announce the thread once it is actually in the map.
        if (bpf_map_update_elem(map_fd, &tid, &value, BPF_ANY) != 0) {
            fprintf(stderr, " Failed to add thread %d: %s\n", tid, strerror(errno));
            continue;
        }
        printf(" Adding thread: %d\n", tid);
    }

    closedir(dir);
}
// Recursively register all descendant processes of `parent_pid` (and their
// threads) in the target_pids map.
//
// The kernel exposes child lists per thread, at
// /proc/<pid>/task/<tid>/children; there is no /proc/<pid>/children file.
// Every thread of the parent is scanned so children forked by any thread
// are found. Returns silently when the map or the task directory is
// unavailable; threads whose children file cannot be opened are skipped.
static void add_child_processes(int parent_pid, struct mutex_trace_bpf *skel)
{
    int map_fd = bpf_map__fd(skel->maps.target_pids);
    if (map_fd < 0) return;

    char task_dir[256];
    snprintf(task_dir, sizeof(task_dir), "/proc/%d/task", parent_pid);

    DIR *dir = opendir(task_dir);
    if (!dir) return;

    struct dirent *entry;
    while ((entry = readdir(dir)) != NULL) {
        if (entry->d_name[0] < '0' || entry->d_name[0] > '9') continue;

        char path[600];
        snprintf(path, sizeof(path), "%s/%s/children", task_dir, entry->d_name);

        FILE *fp = fopen(path, "r");
        if (!fp) continue;

        // The file holds a whitespace-separated list of child PIDs.
        int child_pid;
        while (fscanf(fp, "%d", &child_pid) == 1) {
            u8 value = 1;
            bpf_map_update_elem(map_fd, &child_pid, &value, BPF_ANY);
            printf(" Adding child process: %d\n", child_pid);
            add_threads(child_pid, skel);
            add_child_processes(child_pid, skel);
        }
        fclose(fp);
    }

    closedir(dir);
}
// Perf buffer sample callback: print one line per lock event.
//
// ctx, cpu: unused.
// data:     raw sample, expected to hold a struct lock_event.
// data_sz:  sample size in bytes; samples shorter than struct lock_event are
//           reported on stderr and ignored instead of being read past their
//           end.
// Events with an unrecognised action code produce no output.
static void handle_event(void *ctx, int cpu, void *data, __u32 data_sz)
{
    (void)ctx;
    (void)cpu;

    if (!data || data_sz < sizeof(struct lock_event)) {
        fprintf(stderr, "Ignoring short perf sample (%u bytes)\n", data_sz);
        return;
    }

    const struct lock_event *e = data;
    char time_str[64];
    format_time(e->timestamp, time_str, sizeof(time_str));

    switch (e->action) {
    case ACTION_REQUEST:
        printf("[%s] REQUEST | PID=%d TID=%d COMM=%-15s MUTEX=%p\n",
               time_str, e->pid, e->tid, e->comm, (void*)e->mutex);
        break;
    case ACTION_GOT:
        // A zero return value from the traced call is shown as success.
        printf("[%s] GOT | PID=%d TID=%d COMM=%-15s MUTEX=%p RESULT=%s\n",
               time_str, e->pid, e->tid, e->comm, (void*)e->mutex,
               e->ret == 0 ? "SUCCESS" : "FAIL");
        break;
    case ACTION_RELEASE:
        printf("[%s] RELEASE | PID=%d TID=%d COMM=%-15s MUTEX=%p\n",
               time_str, e->pid, e->tid, e->comm, (void*)e->mutex);
        break;
    case ACTION_TRY_LOCK:
        printf("[%s] TRY | PID=%d TID=%d COMM=%-15s MUTEX=%p\n",
               time_str, e->pid, e->tid, e->comm, (void*)e->mutex);
        break;
    case ACTION_TRY_RESULT:
        printf("[%s] TRY_RES | PID=%d TID=%d COMM=%-15s MUTEX=%p RESULT=%s\n",
               time_str, e->pid, e->tid, e->comm, (void*)e->mutex,
               e->ret == 0 ? "SUCCESS" : "FAIL");
        break;
    default:
        break;
    }

    // Flush so output appears promptly even when stdout is not a terminal.
    fflush(stdout);
}
// Perf buffer lost-sample callback: report on stderr how many events were
// dropped and on which CPU.
static void handle_lost_events(void *ctx, int cpu, __u64 lost_cnt)
{
    (void)ctx; // callback context is not used

    fprintf(stderr, "Lost %llu events on CPU %d\n", lost_cnt, cpu);
}
// Entry point of the loader.
//
// Usage: <prog> <PID>
//
// Steps: validate the PID, locate libpthread and the pthread_mutex_* symbol
// offsets, open and load the BPF skeleton, register the target process with
// its threads and children in target_pids, attach the uprobes/uretprobes,
// then poll the perf buffer until SIGINT/SIGTERM.
//
// Returns 0 after a clean shutdown and 1 on any setup or polling failure.
int main(int argc, char **argv)
{
    struct mutex_trace_bpf *skel = NULL;
    const char *pthread_path = "/lib/x86_64-linux-gnu/libpthread.so.0";
    size_t lock_offset, unlock_offset, trylock_offset;
    struct perf_buffer *pb = NULL;
    struct perf_buffer_opts pb_opts = {
        .sample_cb = handle_event,
        .lost_cb = handle_lost_events,
    };
    char proc_path[256];
    int map_fd;
    u8 value = 1;
    int err;
    int exit_code = 1; // pessimistic default; cleared once monitoring starts

    if (argc < 2) {
        fprintf(stderr, "Usage: %s <PID>\n", argv[0]);
        fprintf(stderr, "Example: sudo %s 1234\n", argv[0]);
        return 1;
    }

    // Parse the PID strictly: the whole argument must be a positive decimal
    // number that fits in an int.
    char *end = NULL;
    errno = 0;
    long pid_arg = strtol(argv[1], &end, 10);
    if (errno != 0 || end == argv[1] || *end != '\0' ||
        pid_arg <= 0 || (long)(int)pid_arg != pid_arg) {
        fprintf(stderr, "Invalid PID: %s\n", argv[1]);
        return 1;
    }
    target_pid = (int)pid_arg;

    signal(SIGINT, sig_handler);
    signal(SIGTERM, sig_handler);

    // Reference point for converting event timestamps to wall-clock time.
    boot_time_us = get_boot_time_us();

    if (bump_memlock_rlimit()) {
        fprintf(stderr, "Failed to increase memlock limit\n");
        return 1;
    }

    // Make sure the target process exists.
    snprintf(proc_path, sizeof(proc_path), "/proc/%d", target_pid);
    if (access(proc_path, F_OK) != 0) {
        fprintf(stderr, "Process %d does not exist\n", target_pid);
        return 1;
    }

    // Locate libpthread (two common install locations are tried).
    // NOTE(review): on systems where the pthread functions live in libc
    // rather than libpthread, the symbol lookup below will fail — confirm
    // whether a libc.so.6 fallback is needed for the target distribution.
    if (access(pthread_path, F_OK) != 0) {
        pthread_path = "/usr/lib/x86_64-linux-gnu/libpthread.so.0";
        if (access(pthread_path, F_OK) != 0) {
            fprintf(stderr, "libpthread not found\n");
            return 1;
        }
    }

    // Resolve symbol offsets; trylock is optional, lock/unlock are required.
    lock_offset = get_symbol_offset(pthread_path, "pthread_mutex_lock");
    unlock_offset = get_symbol_offset(pthread_path, "pthread_mutex_unlock");
    trylock_offset = get_symbol_offset(pthread_path, "pthread_mutex_trylock");
    if (lock_offset == 0 || unlock_offset == 0) {
        fprintf(stderr, "Failed to find symbols in %s\n", pthread_path);
        fprintf(stderr, "Try: nm -D %s | grep pthread_mutex\n", pthread_path);
        return 1;
    }

    printf("========================================\n");
    printf("Mutex Trace Tool - Real-time Monitor\n");
    printf("========================================\n");
    printf("Target PID: %d\n", target_pid);
    printf("libpthread: %s\n", pthread_path);
    printf("lock offset: 0x%zx\n", lock_offset);
    printf("unlock offset: 0x%zx\n", unlock_offset);
    if (trylock_offset) {
        printf("trylock offset: 0x%zx\n", trylock_offset);
    }
    printf("========================================\n\n");

    // Open the skeleton.
    skel = mutex_trace_bpf__open();
    if (!skel) {
        fprintf(stderr, "Failed to open BPF skeleton\n");
        return 1;
    }

    // Set program types explicitly.
    bpf_program__set_type(skel->progs.trace_mutex_lock, BPF_PROG_TYPE_KPROBE);
    bpf_program__set_type(skel->progs.trace_mutex_lock_ret, BPF_PROG_TYPE_KPROBE);
    bpf_program__set_type(skel->progs.trace_mutex_unlock, BPF_PROG_TYPE_KPROBE);
    if (trylock_offset) {
        bpf_program__set_type(skel->progs.trace_mutex_trylock, BPF_PROG_TYPE_KPROBE);
        bpf_program__set_type(skel->progs.trace_mutex_trylock_ret, BPF_PROG_TYPE_KPROBE);
    }

    // Load the BPF programs.
    err = mutex_trace_bpf__load(skel);
    if (err) {
        fprintf(stderr, "Failed to load BPF skeleton: %d\n", err);
        goto cleanup;
    }

    // Register the target process in the monitored set.
    printf("Adding target process: %d\n", target_pid);
    map_fd = bpf_map__fd(skel->maps.target_pids);
    if (bpf_map_update_elem(map_fd, &target_pid, &value, BPF_ANY) != 0) {
        fprintf(stderr, "Failed to add target PID\n");
        goto cleanup;
    }

    // Register its threads and child processes.
    printf("Scanning threads...\n");
    add_threads(target_pid, skel);
    printf("Scanning child processes...\n");
    add_child_processes(target_pid, skel);

    // Attach the probes. pid = -1 attaches for all processes; filtering is
    // done inside the BPF programs via target_pids.
    skel->links.trace_mutex_lock = bpf_program__attach_uprobe(
        skel->progs.trace_mutex_lock, false, -1, pthread_path, lock_offset);
    if (!skel->links.trace_mutex_lock) {
        fprintf(stderr, "Failed to attach lock uprobe\n");
        goto cleanup;
    }
    skel->links.trace_mutex_lock_ret = bpf_program__attach_uprobe(
        skel->progs.trace_mutex_lock_ret, true, -1, pthread_path, lock_offset);
    if (!skel->links.trace_mutex_lock_ret) {
        fprintf(stderr, "Failed to attach lock ret uprobe\n");
        goto cleanup;
    }
    skel->links.trace_mutex_unlock = bpf_program__attach_uprobe(
        skel->progs.trace_mutex_unlock, false, -1, pthread_path, unlock_offset);
    if (!skel->links.trace_mutex_unlock) {
        fprintf(stderr, "Failed to attach unlock uprobe\n");
        goto cleanup;
    }

    // trylock tracing is optional: report a failed attach but keep going.
    if (trylock_offset) {
        skel->links.trace_mutex_trylock = bpf_program__attach_uprobe(
            skel->progs.trace_mutex_trylock, false, -1, pthread_path, trylock_offset);
        if (!skel->links.trace_mutex_trylock) {
            fprintf(stderr, "Warning: failed to attach trylock uprobe\n");
        }
        skel->links.trace_mutex_trylock_ret = bpf_program__attach_uprobe(
            skel->progs.trace_mutex_trylock_ret, true, -1, pthread_path, trylock_offset);
        if (!skel->links.trace_mutex_trylock_ret) {
            fprintf(stderr, "Warning: failed to attach trylock ret uprobe\n");
        }
    }

    // Create the perf buffer bound to the `events` map.
    // NOTE(review): this is the three-argument perf_buffer__new() form with
    // callbacks passed in perf_buffer_opts; newer libbpf releases take the
    // callbacks as separate arguments — confirm against the installed libbpf.
    pb = perf_buffer__new(bpf_map__fd(skel->maps.events), 4096, &pb_opts);
    if (!pb) {
        fprintf(stderr, "Failed to create perf buffer\n");
        goto cleanup;
    }

    printf("\n✅ Monitoring PID %d and its children/threads...\n", target_pid);
    printf("Press Ctrl-C to stop...\n\n");

    // Event loop: each poll invokes handle_event()/handle_lost_events().
    exit_code = 0;
    while (!exiting) {
        err = perf_buffer__poll(pb, 100);
        if (err < 0 && err != -EINTR) {
            fprintf(stderr, "Error polling perf buffer: %d\n", err);
            exit_code = 1;
            break;
        }
    }

cleanup:
    if (pb) perf_buffer__free(pb);
    mutex_trace_bpf__destroy(skel);
    printf("\n👋 Exiting...\n");
    return exit_code;
}
效果

测试程序代码
写一个测试程序,作为被测进程,用来被跟踪其锁的操作。
// test_lock.c - 测试锁监控
#include <stdio.h>
#include <pthread.h>
#include <unistd.h>
pthread_mutex_t mutex = PTHREAD_MUTEX_INITIALIZER;
// Worker thread body: after an initial delay, acquire and release the shared
// mutex five times, logging each step. `arg` points to the thread's int id.
void* thread_func(void* arg) {
    const int id = *(int*)arg;
    int round = 0;

    sleep(20); // 20-second delay before the first lock attempt

    while (round < 5) {
        printf("Thread %d: trying to acquire lock\n", id);
        pthread_mutex_lock(&mutex);
        printf("Thread %d: acquired lock\n", id);

        sleep(5); // hold the lock for 5 seconds

        pthread_mutex_unlock(&mutex);
        printf("Thread %d: released lock\n", id);

        usleep(50000); // pause 50 ms before the next round
        round++;
    }

    return NULL;
}
// Test driver: start four worker threads (ids 1..4) running thread_func,
// wait for all of them to finish, then exit.
int main() {
    pthread_t workers[4];
    int ids[4] = {1, 2, 3, 4};

    printf("Starting lock test...\n");

    // Launch the workers in id order; each receives a pointer to its own id.
    for (int i = 0; i < 4; i++) {
        pthread_create(&workers[i], NULL, thread_func, &ids[i]);
    }

    // Join in the same order.
    for (int i = 0; i < 4; i++) {
        pthread_join(workers[i], NULL);
    }

    printf("Test complete\n");
    return 0;
}