Linux eBPF 案例:sk_filter 读取 IP 地址崩溃

文章目录

  • [1. 前言](#1. 前言)
  • [2. 案例](#2. 案例)
  • [3. 解析](#3. 解析)

1. 前言

限于作者能力水平,本文可能存在谬误,因此而给读者带来的损失,作者不做任何承诺。

2. 案例

笔者在制作一个使用 eBPF 分析网络收发流程耗时的工具,以 Linux 自带的示例代码为起点。eBPF 测试代码包括用户空间和内核空间两部分。用户空间部分测试代码如下:

c 复制代码
// SPDX-License-Identifier: GPL-2.0
#include <stdio.h>
#include <assert.h>
#include <linux/bpf.h>
#include "libbpf.h"
#include "bpf_load.h"
#include "sock_example.h"
#include <unistd.h>
#include <arpa/inet.h>
#include <linux/net_tstamp.h>

/*
 * Read the entry stored under @key in map_fd[0] into *@value.
 * Reports the failing key on stderr and returns -1 on error, 0 on success.
 */
static int read_counter(int key, long long *value)
{
	if (bpf_map_lookup_elem(map_fd[0], &key, value) != 0) {
		fprintf(stderr, "bpf_map_lookup_elem(key=%d) failed\n", key);
		return -1;
	}
	return 0;
}

/*
 * Load "<bpf-prog-name>_kern.o", attach prog_fd[0] to a raw socket opened on
 * "lo", start a background ping to generate traffic, then print the map
 * values stored under the TCP, UDP and ICMP keys five times, one second
 * apart.
 *
 * Returns 0 on success (and after printing the usage message), 1 on failure.
 */
int main(int argc, char **argv)
{
	char filename[256];
	FILE *f;
	int i, sock;
	int ret = 1;

	if (argc != 2) {
		printf("usage: %s <bpf-prog-name>\n", argv[0]);
		return 0;
	}

	snprintf(filename, sizeof(filename), "%s_kern.o", argv[1]);

	if (load_bpf_file(filename)) {
		printf("%s(): %s", __func__, bpf_log_buf);
		return 1;
	}

	sock = open_raw_sock("lo");

	/*
	 * The call must not live inside assert(): a build with NDEBUG would
	 * drop it and silently leave the program unattached. An invalid
	 * socket descriptor is also caught here, since setsockopt() fails.
	 */
	if (setsockopt(sock, SOL_SOCKET, SO_ATTACH_BPF, prog_fd,
		       sizeof(prog_fd[0])) != 0) {
		perror("setsockopt(SO_ATTACH_BPF)");
		return 1;
	}

	/* Background traffic generator; its output is never read. */
	f = popen("ping -c5 localhost", "r");
	if (!f)
		perror("popen(ping)");	/* keep going: the map can still be read */

	/* sizeof yields size_t, which needs %zu (%d is undefined behaviour). */
	printf("sizeof(long long) = %zu\n", sizeof(long long));
	for (i = 0; i < 5; i++) {
		long long tcp_cnt, udp_cnt, icmp_cnt;

		if (read_counter(IPPROTO_TCP, &tcp_cnt) ||
		    read_counter(IPPROTO_UDP, &udp_cnt) ||
		    read_counter(IPPROTO_ICMP, &icmp_cnt))
			goto out;

		printf("[%d] TCP: 0x%08llx UDP: 0x%08llx ICMP: 0x%08llx\n",
		       i + 1, (unsigned long long)tcp_cnt,
		       (unsigned long long)udp_cnt,
		       (unsigned long long)icmp_cnt);

		sleep(1);
	}

	ret = 0;
out:
	/* pclose() waits for ping to exit and releases the stream. */
	if (f)
		pclose(f);

	return ret;
}

内核空间部分代码如下:

c 复制代码
#include <linux/bpf.h>
#include <linux/if_ether.h>
#include <linux/if_packet.h>
#include <linux/ip.h>
#include "bpf_helpers.h"

#include <stddef.h>

/*
 * printf-style debug helper: copies the format string into a local char
 * array and hands it, together with its size and any extra arguments, to
 * bpf_trace_printk(). bpf_prog1 in this listing does not use it.
 */
#define bpf_printk(fmt, ...)                                    \
({                                                              \
               char ____fmt[] = fmt;                            \
               bpf_trace_printk(____fmt, sizeof(____fmt),       \
                                ##__VA_ARGS__);                 \
})

typedef unsigned int u32;

/*
 * Array map with 256 entries, keyed by a u32 and holding one long long per
 * entry. bpf_prog1 indexes it with the byte it reads from the protocol field
 * of the IP header.
 */
struct bpf_map_def SEC("maps") my_map = {
	.type = BPF_MAP_TYPE_ARRAY,
	.key_size = sizeof(u32),
	.value_size = sizeof(long long)/* * 4*/,
	.max_entries = 256,
};

/*
 * Socket filter: for outgoing packets, store skb->remote_ip4 in the my_map
 * entry selected by the IP protocol byte. Always returns 0.
 */
SEC("socket1")
int bpf_prog1(struct __sk_buff *skb)
{
	/* Protocol byte of the IP header that follows the Ethernet header. */
	int proto = load_byte(skb, ETH_HLEN + offsetof(struct iphdr, protocol));
	long long *slot;

	if (skb->pkt_type == PACKET_OUTGOING) {
		slot = bpf_map_lookup_elem(&my_map, &proto);
		if (slot)
			*slot = skb->remote_ip4;
	}

	return 0;
}
char _license[] SEC("license") = "GPL";

由于内核对一些 __sk_buff 成员的访问做了限制(如 Linux eBPF 错误:invalid bpf_context access 所述),所以这里先修改下内核代码函数 sk_filter_is_valid_access() 跳过这个访问限制:

c 复制代码
/*
 * Decide whether a socket-filter program may access the __sk_buff context at
 * byte offset @off with the given @size and access @type.
 *
 * Accesses to tc_classid, data and data_end are rejected outright, writes are
 * only accepted inside cb[0]..cb[4], and everything else is delegated to
 * bpf_skb_is_valid_access().
 */
static bool sk_filter_is_valid_access(int off, int size,
				      enum bpf_access_type type,
				      struct bpf_insn_access_aux *info)
{
	switch (off) {
	case bpf_ctx_range(struct __sk_buff, tc_classid):
	case bpf_ctx_range(struct __sk_buff, data):
	case bpf_ctx_range(struct __sk_buff, data_end):
	/*
	 * Deliberately commented out for this experiment, so the fields from
	 * family through local_port are no longer rejected here:
	 * case bpf_ctx_range_till(struct __sk_buff, family, local_port):
	 */
		return false;
	}

	if (type == BPF_WRITE) {
		switch (off) {
		/* cb[0]..cb[4] is the only range a filter may write to. */
		case bpf_ctx_range_till(struct __sk_buff, cb[0], cb[4]):
			break;
		default:
			return false;
		}
	}

	return bpf_skb_is_valid_access(off, size, type, info);
}

重新编译内核,然后再编译本文的测试程序后运行:

c 复制代码
# ./sockex1_user sockex1

内核崩了,日志如下:

bash 复制代码
Unhandled fault: page domain fault (0x01b) at 0x00000000
pgd = 9ea28000
[00000000] *pgd=7ede3831, *pte=00000000, *ppte=00000000
Internal error: : 1b [#1] SMP ARM
Modules linked in:
CPU: 2 PID: 944 Comm: ping Not tainted 4.14.111 #18
Hardware name: ARM-Versatile Express
task: 9edbc200 task.stack: 9ef54000
PC is at ___bpf_prog_run+0x142c/0x19a8
LR is at bpf_map_lookup_elem+0x24/0x2c
pc : [<801e9ab0>]    lr : [<801f5654>]    psr: a00e0013
sp : 9ef55998  ip : 00000000  fp : 9ef559fc
r10: 801e6064  r9 : 8080bd34  r8 : 9ef55a20
r7 : 00000370  r6 : 00000000  r5 : 801f5630  r4 : a12be094
r3 : 00000001  r2 : 9ef55a28  r1 : 00000000  r0 : 00000000
Flags: NzCv  IRQs on  FIQs on  Mode SVC_32  ISA ARM  Segment none
Control: 10c5387d  Table: 7ea2806a  DAC: 00000051
Process ping (pid: 944, stack limit = 0x9ef54210)
Stack: (0x9ef55998 to 0x9ef56000)
5980:                                                       00000000 00000370
59a0: 00000003 9eed8bc0 9efa9cc0 9f411000 600e0013 00000000 00000000 00000000
59c0: 00000000 00000000 9ef55a14 9ef559d8 8016ad6c a12be000 9efa9cc0 9f411000
59e0: 9ee9e002 9f411000 00000062 00000062 9ef55a84 9ef55a00 801ea464 801e8690
5a00: 00000062 00000062 9ef55a2c 9ef55a18 8016b068 8016ac8c 80925a90 00000001
5a20: 9f64d098 00000000 00000000 00000000 9ef55a1c 00000000 00000000 00000370
5a40: 00000003 9eed8bc0 9efa9cc0 9f411000 9efa9cc0 00000000 8016bda4 8016c3f0
5a60: 9ef55adc 9ef55a7c 9ef55a9c 9ef55a88 9ef55a20 00000000 9ef55a9c 9ef55a88
5a80: 806bd948 801ea434 9efa9cc0 9ee62400 9ef55ad4 9ef55aa0 806bd9e8 806bd8f4
5aa0: 1a407200 188e4ce0 9efa99c0 806bd960 9efa9cc0 9ee62780 9efa99c0 9f411054
5ac0: 9f411000 8091fe28 9ef55b14 9ef55ad8 8061afd8 806bd96c 80b04e7c 9f411000
5ae0: 01080020 00000000 401d7c69 9efa99c0 80a85144 00000000 9f411000 9ed57500
5b00: 9f411000 00000000 9ef55b6c 9ef55b18 80620a84 8061ad74 9efa99c0 80a85144
5b20: 9ef55b88 80b02d00 80b04e7c 80b03c6c 9ef54000 80b65084 80b65070 9ed57500
5b40: 9fbd4800 9efa99c0 80a85144 00000002 00000000 9ed57500 9f411000 80649984
5b60: 9ef55bcc 9ef55b70 80621388 806209e4 8063007c 9efa9780 9ec0dac8 9f411000
5b80: 9ef55bac 00000200 fffffff4 ffffe000 806312c8 9efa9794 00000008 806499b0
5ba0: 00000800 9efa99c0 9efa9780 9f411000 00000000 00000000 9efa97dc 80649984
5bc0: 9ef55bdc 9ef55bd0 8062142c 80620d40 9ef55c14 9ef55be0 80631264 8062141c
5be0: 00000000 00000054 9ef55c14 9f411000 9efa99c0 9efa9780 9f58ef80 80b54580
5c00: 80b54580 00000000 9ef55c44 9ef55c18 8065b8f4 80631160 00000008 0100007f
5c20: 9efa99c0 9efa99c0 9ef03180 0000ffff 9ef03180 80b54580 9ef55c7c 9ef55c48
5c40: 8065d7a0 8065b6f0 9ef55d14 00000000 9ef55c7c 9efa99c0 80b54580 9ef03180
5c60: 9f411000 9ef55d14 00000000 00000000 9ef55c9c 9ef55c80 8065dab8 8065d5d4
5c80: 9efa99c0 80b54580 9ef03180 00000000 9ef55cbc 9ef55ca0 8065b50c 8065da14
5ca0: 00000040 80b54580 9ef03180 00000000 9ef55cd4 9ef55cc0 8065e388 8065b4ac
5cc0: 00000040 9ef55f48 9ef55ce4 9ef55cd8 8065e43c 8065e370 9ef55dec 9ef55ce8
5ce0: 80687030 8065e414 00000040 00000000 9ef55d2c 9ef55d14 00000000 808d4f8c
5d00: 9f06ff30 00000001 9ef55d4c 9ef55d18 80289aec 00000000 9f2edaa0 9ef55f48
5d20: 9f770008 9efa9b40 00000002 9f06fee0 802f0000 0100007f 00000000 00000000
5d40: ffff0000 00000200 00000001 00000001 00000000 00010000 00000000 9ef55e48
5d60: 00000000 00000000 00000000 802712a0 0100007f 0100007f 00000008 9ef55e24
5d80: 00000001 00000001 9ef54000 00000128 9ef55dd4 9ef55da0 8042703c 80271218
5da0: 9ef55e48 9ef55dac 00000010 9ef55e48 00000010 9ef55f50 00000000 9ef55e24
5dc0: 00000051 9ef03180 00000000 00000000 9f2c1380 801080a4 9ef55e28 00000000
5de0: 9ef55e0c 9ef55df0 8069517c 80686bac 00000001 0001814c 9ef55f48 00000000
5e00: 9ef55e1c 9ef55e10 80600c34 80695130 9ef55f34 9ef55e20 80601704 80600c1c
5e20: 9eeeba80 00000000 9ef55ea4 9ef55e38 80249584 8021266c 9f2c1520 fffff000
5e40: 9eeeba80 00000054 00019314 00000040 76e23000 9ea29db8 9ea29db8 00000000
5e60: 00000000 00000000 00000000 00000000 9ffe6dbc 00000000 9ef55ea4 7ecd9ca0
5e80: 376eac80 000000c5 00000002 0100007f 00000000 00000000 00000051 80275d6c
5ea0: 0000000a 00000000 00000000 806065c8 00001180 00000001 00000000 00000200
5ec0: 00000001 ffffe000 80606604 7ecda1b0 9f2c1380 00000001 9ef55f04 9ef55ee8
5ee0: 80128944 801d115c 9ef03180 80283dc8 7ecda114 9f31aec8 9f5d69c0 802909b8
5f00: 9ef55f1c 9ef55f10 802909b8 00018164 00000000 9f2c1380 00000128 801080a4
5f20: 9ef54000 00000128 9ef55f94 9ef55f38 8060259c 80601524 00000000 00000000
5f40: 00000000 fffffff7 9ef55e88 00000010 00000001 00000000 00000000 9ef55e50
5f60: 00000000 0000004e 0001814c 00000000 00000000 00000000 7ecd9150 00019304
5f80: 00000000 00000040 9ef55fa4 9ef55f98 806025e0 80602558 00000000 9ef55fa8
5fa0: 80107ec0 806025d4 00019304 00000000 00000003 00018164 00000000 0001813c
5fc0: 00019304 00000000 00000040 00000128 0002b350 00018230 000192f0 10624dd3
5fe0: 00000000 7ecd914c 0000af4b 76e80ad8 400e0030 00000003 00000000 00000000
[<801e9ab0>] (___bpf_prog_run) from [<801ea464>] (__bpf_prog_run32+0x3c/0x44)
[<801ea464>] (__bpf_prog_run32) from [<806bd948>] (run_filter+0x60/0x78)
[<806bd948>] (run_filter) from [<806bd9e8>] (packet_rcv+0x88/0x38c)
[<806bd9e8>] (packet_rcv) from [<8061afd8>] (dev_queue_xmit_nit+0x270/0x298)
[<8061afd8>] (dev_queue_xmit_nit) from [<80620a84>] (dev_hard_start_xmit+0xac/0x258)
[<80620a84>] (dev_hard_start_xmit) from [<80621388>] (__dev_queue_xmit+0x654/0x6dc)
[<80621388>] (__dev_queue_xmit) from [<8062142c>] (dev_queue_xmit+0x1c/0x20)
[<8062142c>] (dev_queue_xmit) from [<80631264>] (neigh_resolve_output+0x110/0x198)
[<80631264>] (neigh_resolve_output) from [<8065b8f4>] (ip_finish_output2+0x210/0x44c)
[<8065b8f4>] (ip_finish_output2) from [<8065d7a0>] (ip_finish_output+0x1d8/0x1f0)
[<8065d7a0>] (ip_finish_output) from [<8065dab8>] (ip_output+0xb0/0xbc)
[<8065dab8>] (ip_output) from [<8065b50c>] (ip_local_out+0x6c/0x78)
[<8065b50c>] (ip_local_out) from [<8065e388>] (ip_send_skb+0x24/0xa4)
[<8065e388>] (ip_send_skb) from [<8065e43c>] (ip_push_pending_frames+0x34/0x40)
[<8065e43c>] (ip_push_pending_frames) from [<80687030>] (raw_sendmsg+0x490/0x83c)
[<80687030>] (raw_sendmsg) from [<8069517c>] (inet_sendmsg+0x58/0xf4)
[<8069517c>] (inet_sendmsg) from [<80600c34>] (sock_sendmsg+0x24/0x34)
[<80600c34>] (sock_sendmsg) from [<80601704>] (___sys_sendmsg+0x1ec/0x214)
[<80601704>] (___sys_sendmsg) from [<8060259c>] (__sys_sendmsg+0x50/0x7c)
[<8060259c>] (__sys_sendmsg) from [<806025e0>] (SyS_sendmsg+0x18/0x1c)
[<806025e0>] (SyS_sendmsg) from [<80107ec0>] (ret_fast_syscall+0x0/0x28)
Code: e1a02223 e203300f e798c182 e0882183 (e79c0000) 
---[ end trace 2168b38dee2f6dfc ]---
Kernel panic - not syncing: Fatal exception in interrupt
CPU3: stopping
CPU: 3 PID: 0 Comm: swapper/3 Tainted: G      D         4.14.111 #18
Hardware name: ARM-Versatile Express
[<80110f40>] (unwind_backtrace) from [<8010c3e0>] (show_stack+0x20/0x24)
[<8010c3e0>] (show_stack) from [<806f1a78>] (dump_stack+0xac/0xd8)
[<806f1a78>] (dump_stack) from [<8010f324>] (handle_IPI+0x2d0/0x34c)
[<8010f324>] (handle_IPI) from [<80101570>] (gic_handle_irq+0x9c/0xa0)
[<80101570>] (gic_handle_irq) from [<8010d170>] (__irq_svc+0x70/0x98)
Exception stack(0x9f50bf28 to 0x9f50bf70)
bf20:                   00000001 00000000 00000000 80b0402c 9f50a000 00000000
bf40: 00000000 80b03cb8 80b03c6c 80a84b48 9f50bf98 9f50bf84 9f50bf78 9f50bf78
bf60: 801088c8 801088cc 60070013 ffffffff
[<8010d170>] (__irq_svc) from [<801088cc>] (arch_cpu_idle+0x34/0x4c)
[<801088cc>] (arch_cpu_idle) from [<8070cf14>] (default_idle_call+0x34/0x48)
[<8070cf14>] (default_idle_call) from [<8015f7c4>] (do_idle+0x16c/0x218)
[<8015f7c4>] (do_idle) from [<8015fb1c>] (cpu_startup_entry+0x28/0x2c)
[<8015fb1c>] (cpu_startup_entry) from [<8010ede4>] (secondary_start_kernel+0x168/0x170)
[<8010ede4>] (secondary_start_kernel) from [<60101a0c>] (0x60101a0c)
CPU0: stopping
CPU: 0 PID: 0 Comm: swapper/0 Tainted: G      D         4.14.111 #18
Hardware name: ARM-Versatile Express
[<80110f40>] (unwind_backtrace) from [<8010c3e0>] (show_stack+0x20/0x24)
[<8010c3e0>] (show_stack) from [<806f1a78>] (dump_stack+0xac/0xd8)
[<806f1a78>] (dump_stack) from [<8010f324>] (handle_IPI+0x2d0/0x34c)
[<8010f324>] (handle_IPI) from [<80101570>] (gic_handle_irq+0x9c/0xa0)
[<80101570>] (gic_handle_irq) from [<8010d170>] (__irq_svc+0x70/0x98)
Exception stack(0x80b01ee0 to 0x80b01f28)
1ee0: 00000001 00000000 00000000 80b0402c 80b00000 00000000 00000000 80b03cb8
1f00: 80b03c6c 80a84b48 80b01f50 80b01f3c 80b01f30 80b01f30 801088c8 801088cc
1f20: 600e0013 ffffffff
[<8010d170>] (__irq_svc) from [<801088cc>] (arch_cpu_idle+0x34/0x4c)
[<801088cc>] (arch_cpu_idle) from [<8070cf14>] (default_idle_call+0x34/0x48)
[<8070cf14>] (default_idle_call) from [<8015f7c4>] (do_idle+0x16c/0x218)
[<8015f7c4>] (do_idle) from [<8015fb1c>] (cpu_startup_entry+0x28/0x2c)
[<8015fb1c>] (cpu_startup_entry) from [<80706940>] (rest_init+0xbc/0xc0)
[<80706940>] (rest_init) from [<80a00dec>] (start_kernel+0x3a8/0x3b4)
CPU1: stopping
CPU: 1 PID: 0 Comm: swapper/1 Tainted: G      D         4.14.111 #18
Hardware name: ARM-Versatile Express
[<80110f40>] (unwind_backtrace) from [<8010c3e0>] (show_stack+0x20/0x24)
[<8010c3e0>] (show_stack) from [<806f1a78>] (dump_stack+0xac/0xd8)
[<806f1a78>] (dump_stack) from [<8010f324>] (handle_IPI+0x2d0/0x34c)
[<8010f324>] (handle_IPI) from [<80101570>] (gic_handle_irq+0x9c/0xa0)
[<80101570>] (gic_handle_irq) from [<8010d170>] (__irq_svc+0x70/0x98)
Exception stack(0x9f507f28 to 0x9f507f70)
7f20:                   00000001 00000000 00000000 80b0402c 9f506000 00000000
7f40: 00000000 80b03cb8 80b03c6c 80a84b48 9f507f98 9f507f84 9f507f78 9f507f78
7f60: 801088c8 801088cc 600f0013 ffffffff
[<8010d170>] (__irq_svc) from [<801088cc>] (arch_cpu_idle+0x34/0x4c)
[<801088cc>] (arch_cpu_idle) from [<8070cf14>] (default_idle_call+0x34/0x48)
[<8070cf14>] (default_idle_call) from [<8015f7c4>] (do_idle+0x16c/0x218)
[<8015f7c4>] (do_idle) from [<8015fb1c>] (cpu_startup_entry+0x28/0x2c)
[<8015fb1c>] (cpu_startup_entry) from [<8010ede4>] (secondary_start_kernel+0x168/0x170)
[<8010ede4>] (secondary_start_kernel) from [<60101a0c>] (0x60101a0c)
---[ end Kernel panic - not syncing: Fatal exception in interrupt

测试环境为 QEMU 模拟的 ARM-Versatile Express 开发板,通过 addr2line 定位到 ___bpf_prog_run() 的对应代码行:

c 复制代码
/* Excerpt of the eBPF interpreter; elided parts are marked with "...". */
static unsigned int ___bpf_prog_run(u64 *regs, const struct bpf_insn *insn,
				    u64 *stack)
{
	...
	// The code below is what the LDST(W,  u32) macro looks like once expanded.
	// It loads a u32 from the address regs[src_reg] + off into regs[dst_reg],
	// then advances to the next instruction.
	LDX_MEM_W:
		regs[insn->dst_reg] = *(u32 *)(unsigned long) (regs[insn->src_reg] + insn->off);
		({
			insn++;
			goto select_insn;
		});
	...
}

崩溃时 eBPF 解释器正解释执行一条 eBPF 指令 61 11 00 00 00 00 00 00,该指令中,regs[insn->src_reg] 存储的指针值为 0,同时 insn->off 也为 0,所以出现了 NULL 指针访问错误。

这里要说明一下,这条崩溃指令 61 11 00 00 00 00 00 00 的内容,是笔者插入日志打印出来的。

3. 解析

首先看下 eBPF 内核程序 sockex1_kern.o 的反汇编代码:

c 复制代码
$ llvm-objdump-8 -d sockex1_kern.o

sockex1_kern.o:	file format ELF64-BPF

Disassembly of section socket1:
0000000000000000 bpf_prog1:
       0:	bf 16 00 00 00 00 00 00 	r6 = r1
       1:	30 00 00 00 17 00 00 00 	r0 = *(u8 *)skb[23]
       2:	63 0a fc ff 00 00 00 00 	*(u32 *)(r10 - 4) = r0
       3:	61 61 04 00 00 00 00 00 	r1 = *(u32 *)(r6 + 4)
       4:	55 01 08 00 04 00 00 00 	if r1 != 4 goto +8 <LBB0_3>
       5:	bf a2 00 00 00 00 00 00 	r2 = r10
       6:	07 02 00 00 fc ff ff ff 	r2 += -4
       7:	18 01 00 00 00 00 00 00 00 00 00 00 00 00 00 00 	r1 = 0 ll
       9:	85 00 00 00 01 00 00 00 	call 1
      10:	15 00 02 00 00 00 00 00 	if r0 == 0 goto +2 <LBB0_3>
      11:	61 61 5c 00 00 00 00 00 	r1 = *(u32 *)(r6 + 92)
      12:	7b 10 00 00 00 00 00 00 	*(u64 *)(r0 + 0) = r1

0000000000000068 LBB0_3:
      13:	b7 00 00 00 00 00 00 00 	r0 = 0
      14:	95 00 00 00 00 00 00 00 	exit

细心比对,并没有前面提到的崩溃指令 61 11 00 00 00 00 00 00,这是怎么一回事?原因是 eBPF 程序注入内核时,一些对内核数据结构的访问指令,需要进行修正,这期间会修改和插入新的 eBPF 指令。具体到本文测试的 socket 的 eBPF 钩子程序而言,需要将 eBPF 程序 sockex1_kern.o 中对 __sk_buff 的成员访问,修正为对 sk_buff 中对应成员的访问。一方面,因为在调用 eBPF 程序 sockex1_kern.o 时传递的参数实际是一个 sk_buff 指针,而不是 __sk_buff 指针:

c 复制代码
/*
 * Run the eBPF filter attached to @sk, if there is one, on @skb.
 *
 * Returns the filter program's result, or @res unchanged when the socket has
 * no filter attached.
 */
static unsigned int run_filter(struct sk_buff *skb,
			       const struct sock *sk,
			       unsigned int res)
{
	struct sk_filter *attached;

	rcu_read_lock();
	attached = rcu_dereference(sk->sk_filter);
	if (attached)
		res = bpf_prog_run_clear_cb(attached->prog, skb);
	rcu_read_unlock();

	return res;
}

bpf_prog_run_clear_cb(filter->prog, skb)
	// 传递的是 struct sk_buff * 而不是 struct __sk_buff *
	__bpf_prog_run32(skb, filter->insnsi)

另一方面,通过这样的方式,可以不将整个 sk_buff 的定义细节暴露到用户空间。

将 eBPF 程序 sockex1_kern.o 注入内核时,把 eBPF 程序 sockex1_kern.o 中对 __sk_buff 的成员访问,转换为 sk_buff 成员的访问细节如下:

c 复制代码
SYSCALL_DEFINE3(bpf, int, cmd, union bpf_attr __user *, uattr, unsigned int, size)
{
	...
	switch (cmd) {
	...
	case BPF_PROG_LOAD:
		err = bpf_prog_load(&attr);
		break;
	...
	}
	...
}

static int bpf_prog_load(union bpf_attr *attr)
{
	...
	err = bpf_check(&prog, attr);
	...
}

int bpf_check(struct bpf_prog **prog, union bpf_attr *attr)
{
	...
	if (ret == 0)
		/* program is valid, convert *(u32*)(ctx + off) accesses */
		ret = convert_ctx_accesses(env);
	...
}

static int convert_ctx_accesses(struct bpf_verifier_env *env)
{
	...
	for (i = 0; i < insn_cnt; i++, insn++) {
		...
		/* bpf_convert_ctx_access() */
		cnt = ops->convert_ctx_access(type, insn, insn_buf, env->prog,
					      &target_size);
		...
	}
	...
}

/*
 * Excerpt: rewrites one __sk_buff context access (@si) into instructions that
 * operate on the real sk_buff. Elided parts are marked with "...".
 */
static u32 bpf_convert_ctx_access(enum bpf_access_type type,
				  const struct bpf_insn *si,
				  struct bpf_insn *insn_buf,
				  struct bpf_prog *prog, u32 *target_size)
{
	...
	switch (si->off) {
	...
	case offsetof(struct __sk_buff, pkt_type):
		/* One-byte load at PKT_TYPE_OFFSET(), then mask with PKT_TYPE_MAX. */
		*target_size = 1;
		*insn++ = BPF_LDX_MEM(BPF_B, si->dst_reg, si->src_reg,
				      PKT_TYPE_OFFSET());
		*insn++ = BPF_ALU32_IMM(BPF_AND, si->dst_reg, PKT_TYPE_MAX);
#ifdef __BIG_ENDIAN_BITFIELD
		*insn++ = BPF_ALU32_IMM(BPF_RSH, si->dst_reg, 5);
#endif
		break;
	...
	case offsetof(struct __sk_buff, remote_ip4):
		BUILD_BUG_ON(FIELD_SIZEOF(struct sock_common, skc_daddr) != 4);

		/*
		 * Two dependent loads: first dst_reg = skb->sk, then
		 * dst_reg = skc_daddr read through that pointer. The lines
		 * shown here emit no NULL check between the two loads.
		 */
		*insn++ = BPF_LDX_MEM(BPF_FIELD_SIZEOF(struct sk_buff, sk),
				      si->dst_reg, si->src_reg,
				      offsetof(struct sk_buff, sk));
		*insn++ = BPF_LDX_MEM(BPF_W, si->dst_reg, si->dst_reg,
				      bpf_target_off(struct sock_common,
						     skc_daddr,
						     4, target_size));
		break;
	...
	}
	...
}

修正后的 eBPF 程序变为:

c 复制代码
bf 16 00 00 00 00 00 00
30 00 00 00 17 00 00 00
63 0a fc ff 00 00 00 00
71 61 64 00 00 00 00 00
54 01 00 00 07 00 00 00
55 01 09 00 04 00 00 00
bf a2 00 00 00 00 00 00
07 02 00 00 fc ff ff ff
18 01 00 00 00 d0 64 9f
00 00 00 00 00 00 00 00
85 00 00 00 cc f5 00 00
15 00 03 00 00 00 00 00
61 61 0c 00 00 00 00 00
61 11 00 00 00 00 00 00
7b 10 00 00 00 00 00 00
b7 00 00 00 00 00 00 00
95 00 00 00 00 00 00 00

相对于原程序,将原指令 61 61 04 00 00 00 00 00 修改为如下两条指令(即代码里的 skb->pkt_type 读取语句):

c 复制代码
71 61 64 00 00 00 00 00 // r1 = *(u8 *)(r6 + 100)
54 01 00 00 07 00 00 00 // r1 = r1 & 0x00000007

即将 __sk_buff::pkt_type 的访问,转换为 sk_buff::__pkt_type_offset[0] 的访问,看一下 __sk_buff 和 sk_buff 相关定义:

c 复制代码
struct __sk_buff {
	__u32 len;
	__u32 pkt_type;
	...
};

struct sk_buff {
	...
	__u8			__pkt_type_offset[0];
	__u8			pkt_type:3;
	...
};

另外,将原指令 61 61 5c 00 00 00 00 00 修改为(即代码里的 skb->remote_ip4 读取语句):

c 复制代码
61 61 0c 00 00 00 00 00 // r1 = *(u32 *)(r6 + 12) [r1 = sk_buff::sk]
61 11 00 00 00 00 00 00 // r1 = *(u32 *)r1

即将对 __sk_buff::remote_ip4 的访问,转换为对 sk_buff::sk->__sk_common.skc_daddr 的访问。

说完 eBPF 程序的修正,回到本文崩溃的根因分析,也即函数 ___bpf_prog_run() 中:

c 复制代码
/* Excerpt of the eBPF interpreter; elided parts are marked with "...". */
static unsigned int ___bpf_prog_run(u64 *regs, const struct bpf_insn *insn,
				    u64 *stack)
{
	...
	// The code below is what the LDST(W,  u32) macro looks like once expanded.
	// It loads a u32 from the address regs[src_reg] + off into regs[dst_reg],
	// then advances to the next instruction.
	LDX_MEM_W:
		regs[insn->dst_reg] = *(u32 *)(unsigned long) (regs[insn->src_reg] + insn->off);
		({
			insn++;
			goto select_insn;
		});
	...
}

前面提到崩溃是在解释执行指令 61 11 00 00 00 00 00 00 时,那这是一条什么指令?看一下 eBPF 虚拟机对指令的定义:

c 复制代码
/* Layout of one eBPF instruction (8 bytes: 1 + 1 + 2 + 4). */
struct bpf_insn {
	__u8	code;		/* opcode */
	__u8	dst_reg:4;	/* dest register */
	__u8	src_reg:4;	/* source register */ /* author's note: the value BPF_PSEUDO_MAP_FD marks a bpf_map access instruction */
	__s16	off;		/* signed offset */
	__s32	imm;		/* signed immediate constant */
};

所以指令 61 11 00 00 00 00 00 00 解释为 r1 = *(u32 *)r1,那 eBPF 解释器寄存器 r1 的内容又是谁设定的?61 11 00 00 00 00 00 00 的上一条指令 61 61 0c 00 00 00 00 00 设定了 r1:r1 = *(u32 *)(r6 + 12)。而 r6 在 eBPF 程序开头设置为 sk_buff 指针,所以综合起来,崩溃指令 61 11 00 00 00 00 00 00 的操作就是读取 sk_buff::sk->__sk_common.skc_daddr,内核崩溃是因为 sk_buff::sk == NULL (即 r1 = 0) 导致,也即 skb 关联的 sock 成员 sk_buff::sk 没有设定导致:

c 复制代码
packet_rcv()
	run_filter()
		__bpf_prog_run32()
			___bpf_prog_run()
相关推荐
zhangrelay2 小时前
如何让手机电脑流畅飞起低碳节能性能拉满-软件安装篇-ESR-Extended Support Release-延长支持版-LTS
linux·运维·笔记·学习
hinewcc2 小时前
Linux电源管理 - wakelocks
android·linux
qq_229058012 小时前
Docker常用命令
linux·服务器·docker
Vect__2 小时前
基于抢票系统的线程互斥详解
linux
是个西兰花2 小时前
进程间通信:匿名管道
linux·运维·服务器
小北方城市网2 小时前
Spring Cloud Gateway 生产级微内核架构设计与可插拔过滤器开发
java·大数据·linux·运维·spring boot·redis·分布式
wacpguo2 小时前
Ubuntu 24.04 安装 Docker
linux·ubuntu·docker
Lenyiin3 小时前
Linux 进程控制
linux·运维·服务器
春日见3 小时前
Git 相关操作大全
linux·人工智能·驱动开发·git·算法·机器学习