文章目录
- [1. 前言](#1. 前言)
- [2. 案例](#2. 案例)
- [3. 解析](#3. 解析)
1. 前言
限于作者能力水平,本文可能存在谬误;由此给读者带来的损失,作者不承担任何责任。
2. 案例
笔者在制作一个使用 eBPF 分析网络收发流程耗时的工具,以 Linux 自带的示例代码为起点。eBPF 测试代码包括用户空间和内核空间两部分。用户空间部分测试代码如下:
c
// SPDX-License-Identifier: GPL-2.0
#include <stdio.h>
#include <assert.h>
#include <linux/bpf.h>
#include "libbpf.h"
#include "bpf_load.h"
#include "sock_example.h"
#include <unistd.h>
#include <arpa/inet.h>
#include <linux/net_tstamp.h>
/*
 * User-space half of the test.
 *
 * Loads the eBPF object "<argv[1]>_kern.o", attaches prog_fd[0] to the raw
 * socket returned by open_raw_sock("lo"), starts "ping -c5 localhost" in the
 * background, and then once per second (five iterations) prints the map
 * values stored under the IPPROTO_TCP, IPPROTO_UDP and IPPROTO_ICMP keys.
 *
 * Returns 0 on success and on a usage error, 1 if loading the object,
 * attaching the program or a map lookup fails.
 */
int main(int argc, char **argv)
{
	char filename[256];
	FILE *f;
	int i, sock;

	if (argc != 2) {
		printf("usage: %s <bpf-prog-name>\n", argv[0]);
		return 0;
	}

	snprintf(filename, sizeof(filename), "%s_kern.o", argv[1]);
	if (load_bpf_file(filename)) {
		printf("%s(): %s", __func__, bpf_log_buf);
		return 1;
	}

	sock = open_raw_sock("lo");

	/*
	 * Calls with side effects must not live inside assert(): building with
	 * NDEBUG would silently drop the call and the program would never be
	 * attached.
	 */
	if (setsockopt(sock, SOL_SOCKET, SO_ATTACH_BPF, prog_fd,
		       sizeof(prog_fd[0])) != 0) {
		perror("setsockopt(SO_ATTACH_BPF)");
		return 1;
	}

	/* Generate loopback traffic; ping's output is deliberately not read. */
	f = popen("ping -c5 localhost", "r");
	(void) f;

	/* sizeof yields size_t, which must be printed with %zu. */
	printf("sizeof(long long) = %zu\n", sizeof(long long));

	for (i = 0; i < 5; i++) {
		long long tcp_cnt, udp_cnt, icmp_cnt;
		int key;

		key = IPPROTO_TCP;
		if (bpf_map_lookup_elem(map_fd[0], &key, &tcp_cnt) != 0) {
			fprintf(stderr, "map lookup failed for key %d\n", key);
			return 1;
		}

		key = IPPROTO_UDP;
		if (bpf_map_lookup_elem(map_fd[0], &key, &udp_cnt) != 0) {
			fprintf(stderr, "map lookup failed for key %d\n", key);
			return 1;
		}

		key = IPPROTO_ICMP;
		if (bpf_map_lookup_elem(map_fd[0], &key, &icmp_cnt) != 0) {
			fprintf(stderr, "map lookup failed for key %d\n", key);
			return 1;
		}

		/* %llx expects an unsigned long long argument. */
		printf("[%d] TCP: 0x%08llx UDP: 0x%08llx ICMP: 0x%08llx\n",
		       i + 1,
		       (unsigned long long)tcp_cnt,
		       (unsigned long long)udp_cnt,
		       (unsigned long long)icmp_cnt);
		sleep(1);
	}

	return 0;
}
内核空间部分代码如下:
c
#include <linux/bpf.h>
#include <linux/if_ether.h>
#include <linux/if_packet.h>
#include <linux/ip.h>
#include "bpf_helpers.h"
#include <stddef.h>
/*
 * bpf_printk(fmt, ...) - convenience wrapper around bpf_trace_printk().
 *
 * Copies the format string into a local char array and passes that array,
 * its size (sizeof, so the terminating NUL is counted) and any extra
 * arguments to bpf_trace_printk(). Being a statement expression, the macro
 * evaluates to the value of the bpf_trace_printk() call.
 */
#define bpf_printk(fmt, ...) \
({ \
char ____fmt[] = fmt; \
bpf_trace_printk(____fmt, sizeof(____fmt), \
##__VA_ARGS__); \
})
typedef unsigned int u32;
/*
 * Array map placed in the "maps" section: 256 entries, keyed by a u32,
 * each value one long long wide.
 */
struct bpf_map_def SEC("maps") my_map = {
.type = BPF_MAP_TYPE_ARRAY,
.key_size = sizeof(u32),
.value_size = sizeof(long long)/* the "* 4" multiplier (four slots per key) is disabled */,
.max_entries = 256,
};
/*
 * Socket filter program (section "socket1").
 *
 * Uses the byte at ETH_HLEN + offsetof(struct iphdr, protocol) as the map
 * key. Packets whose pkt_type is not PACKET_OUTGOING are ignored. For the
 * others, skb->remote_ip4 is stored into the first slot of the map value
 * found under that key. Always returns 0.
 */
SEC("socket1")
int bpf_prog1(struct __sk_buff *skb)
{
/* Map key: the byte read from the IP header's protocol field offset. */
int index = load_byte(skb, ETH_HLEN + offsetof(struct iphdr, protocol));
long long *value;
/* Only outgoing packets are recorded. */
if (skb->pkt_type != PACKET_OUTGOING)
return 0;
value = bpf_map_lookup_elem(&my_map, &index);
if (value) {
/* Context read of __sk_buff::remote_ip4. */
value[0] = skb->remote_ip4;
/* Disabled: additional fields that would need a 4-slot map value. */
//value[1] = skb->remote_port;
//value[2] = skb->local_ip4;
//value[3] = skb->local_port;
}
return 0;
}
char _license[] SEC("license") = "GPL";
由于内核对一些 __sk_buff 成员的访问做了限制(如 Linux eBPF 错误:invalid bpf_context access 所述),所以这里先修改下内核代码函数 sk_filter_is_valid_access() 跳过这个访问限制:
c
/*
 * Decides whether a socket-filter program may access `size` bytes at offset
 * `off` of struct __sk_buff with access type `type`.
 *
 * Offsets inside tc_classid, data and data_end are always rejected. Writes
 * are only allowed inside cb[0]..cb[4]. Everything else is delegated to
 * bpf_skb_is_valid_access().
 */
static bool sk_filter_is_valid_access(int off, int size,
enum bpf_access_type type,
struct bpf_insn_access_aux *info)
{
switch (off) {
case bpf_ctx_range(struct __sk_buff, tc_classid):
case bpf_ctx_range(struct __sk_buff, data):
case bpf_ctx_range(struct __sk_buff, data_end):
//case bpf_ctx_range_till(struct __sk_buff, family, local_port): // this line is commented out to lift the access restriction on family..local_port
return false;
}
/* Writes are restricted to the cb[] scratch area. */
if (type == BPF_WRITE) {
switch (off) {
case bpf_ctx_range_till(struct __sk_buff, cb[0], cb[4]):
break;
default:
return false;
}
}
return bpf_skb_is_valid_access(off, size, type, info);
}
重新编译内核,然后再编译本文的测试程序后运行:
c
# ./sockex1_user sockex1
内核崩了,日志如下:
bash
Unhandled fault: page domain fault (0x01b) at 0x00000000
pgd = 9ea28000
[00000000] *pgd=7ede3831, *pte=00000000, *ppte=00000000
Internal error: : 1b [#1] SMP ARM
Modules linked in:
CPU: 2 PID: 944 Comm: ping Not tainted 4.14.111 #18
Hardware name: ARM-Versatile Express
task: 9edbc200 task.stack: 9ef54000
PC is at ___bpf_prog_run+0x142c/0x19a8
LR is at bpf_map_lookup_elem+0x24/0x2c
pc : [<801e9ab0>] lr : [<801f5654>] psr: a00e0013
sp : 9ef55998 ip : 00000000 fp : 9ef559fc
r10: 801e6064 r9 : 8080bd34 r8 : 9ef55a20
r7 : 00000370 r6 : 00000000 r5 : 801f5630 r4 : a12be094
r3 : 00000001 r2 : 9ef55a28 r1 : 00000000 r0 : 00000000
Flags: NzCv IRQs on FIQs on Mode SVC_32 ISA ARM Segment none
Control: 10c5387d Table: 7ea2806a DAC: 00000051
Process ping (pid: 944, stack limit = 0x9ef54210)
Stack: (0x9ef55998 to 0x9ef56000)
5980: 00000000 00000370
59a0: 00000003 9eed8bc0 9efa9cc0 9f411000 600e0013 00000000 00000000 00000000
59c0: 00000000 00000000 9ef55a14 9ef559d8 8016ad6c a12be000 9efa9cc0 9f411000
59e0: 9ee9e002 9f411000 00000062 00000062 9ef55a84 9ef55a00 801ea464 801e8690
5a00: 00000062 00000062 9ef55a2c 9ef55a18 8016b068 8016ac8c 80925a90 00000001
5a20: 9f64d098 00000000 00000000 00000000 9ef55a1c 00000000 00000000 00000370
5a40: 00000003 9eed8bc0 9efa9cc0 9f411000 9efa9cc0 00000000 8016bda4 8016c3f0
5a60: 9ef55adc 9ef55a7c 9ef55a9c 9ef55a88 9ef55a20 00000000 9ef55a9c 9ef55a88
5a80: 806bd948 801ea434 9efa9cc0 9ee62400 9ef55ad4 9ef55aa0 806bd9e8 806bd8f4
5aa0: 1a407200 188e4ce0 9efa99c0 806bd960 9efa9cc0 9ee62780 9efa99c0 9f411054
5ac0: 9f411000 8091fe28 9ef55b14 9ef55ad8 8061afd8 806bd96c 80b04e7c 9f411000
5ae0: 01080020 00000000 401d7c69 9efa99c0 80a85144 00000000 9f411000 9ed57500
5b00: 9f411000 00000000 9ef55b6c 9ef55b18 80620a84 8061ad74 9efa99c0 80a85144
5b20: 9ef55b88 80b02d00 80b04e7c 80b03c6c 9ef54000 80b65084 80b65070 9ed57500
5b40: 9fbd4800 9efa99c0 80a85144 00000002 00000000 9ed57500 9f411000 80649984
5b60: 9ef55bcc 9ef55b70 80621388 806209e4 8063007c 9efa9780 9ec0dac8 9f411000
5b80: 9ef55bac 00000200 fffffff4 ffffe000 806312c8 9efa9794 00000008 806499b0
5ba0: 00000800 9efa99c0 9efa9780 9f411000 00000000 00000000 9efa97dc 80649984
5bc0: 9ef55bdc 9ef55bd0 8062142c 80620d40 9ef55c14 9ef55be0 80631264 8062141c
5be0: 00000000 00000054 9ef55c14 9f411000 9efa99c0 9efa9780 9f58ef80 80b54580
5c00: 80b54580 00000000 9ef55c44 9ef55c18 8065b8f4 80631160 00000008 0100007f
5c20: 9efa99c0 9efa99c0 9ef03180 0000ffff 9ef03180 80b54580 9ef55c7c 9ef55c48
5c40: 8065d7a0 8065b6f0 9ef55d14 00000000 9ef55c7c 9efa99c0 80b54580 9ef03180
5c60: 9f411000 9ef55d14 00000000 00000000 9ef55c9c 9ef55c80 8065dab8 8065d5d4
5c80: 9efa99c0 80b54580 9ef03180 00000000 9ef55cbc 9ef55ca0 8065b50c 8065da14
5ca0: 00000040 80b54580 9ef03180 00000000 9ef55cd4 9ef55cc0 8065e388 8065b4ac
5cc0: 00000040 9ef55f48 9ef55ce4 9ef55cd8 8065e43c 8065e370 9ef55dec 9ef55ce8
5ce0: 80687030 8065e414 00000040 00000000 9ef55d2c 9ef55d14 00000000 808d4f8c
5d00: 9f06ff30 00000001 9ef55d4c 9ef55d18 80289aec 00000000 9f2edaa0 9ef55f48
5d20: 9f770008 9efa9b40 00000002 9f06fee0 802f0000 0100007f 00000000 00000000
5d40: ffff0000 00000200 00000001 00000001 00000000 00010000 00000000 9ef55e48
5d60: 00000000 00000000 00000000 802712a0 0100007f 0100007f 00000008 9ef55e24
5d80: 00000001 00000001 9ef54000 00000128 9ef55dd4 9ef55da0 8042703c 80271218
5da0: 9ef55e48 9ef55dac 00000010 9ef55e48 00000010 9ef55f50 00000000 9ef55e24
5dc0: 00000051 9ef03180 00000000 00000000 9f2c1380 801080a4 9ef55e28 00000000
5de0: 9ef55e0c 9ef55df0 8069517c 80686bac 00000001 0001814c 9ef55f48 00000000
5e00: 9ef55e1c 9ef55e10 80600c34 80695130 9ef55f34 9ef55e20 80601704 80600c1c
5e20: 9eeeba80 00000000 9ef55ea4 9ef55e38 80249584 8021266c 9f2c1520 fffff000
5e40: 9eeeba80 00000054 00019314 00000040 76e23000 9ea29db8 9ea29db8 00000000
5e60: 00000000 00000000 00000000 00000000 9ffe6dbc 00000000 9ef55ea4 7ecd9ca0
5e80: 376eac80 000000c5 00000002 0100007f 00000000 00000000 00000051 80275d6c
5ea0: 0000000a 00000000 00000000 806065c8 00001180 00000001 00000000 00000200
5ec0: 00000001 ffffe000 80606604 7ecda1b0 9f2c1380 00000001 9ef55f04 9ef55ee8
5ee0: 80128944 801d115c 9ef03180 80283dc8 7ecda114 9f31aec8 9f5d69c0 802909b8
5f00: 9ef55f1c 9ef55f10 802909b8 00018164 00000000 9f2c1380 00000128 801080a4
5f20: 9ef54000 00000128 9ef55f94 9ef55f38 8060259c 80601524 00000000 00000000
5f40: 00000000 fffffff7 9ef55e88 00000010 00000001 00000000 00000000 9ef55e50
5f60: 00000000 0000004e 0001814c 00000000 00000000 00000000 7ecd9150 00019304
5f80: 00000000 00000040 9ef55fa4 9ef55f98 806025e0 80602558 00000000 9ef55fa8
5fa0: 80107ec0 806025d4 00019304 00000000 00000003 00018164 00000000 0001813c
5fc0: 00019304 00000000 00000040 00000128 0002b350 00018230 000192f0 10624dd3
5fe0: 00000000 7ecd914c 0000af4b 76e80ad8 400e0030 00000003 00000000 00000000
[<801e9ab0>] (___bpf_prog_run) from [<801ea464>] (__bpf_prog_run32+0x3c/0x44)
[<801ea464>] (__bpf_prog_run32) from [<806bd948>] (run_filter+0x60/0x78)
[<806bd948>] (run_filter) from [<806bd9e8>] (packet_rcv+0x88/0x38c)
[<806bd9e8>] (packet_rcv) from [<8061afd8>] (dev_queue_xmit_nit+0x270/0x298)
[<8061afd8>] (dev_queue_xmit_nit) from [<80620a84>] (dev_hard_start_xmit+0xac/0x258)
[<80620a84>] (dev_hard_start_xmit) from [<80621388>] (__dev_queue_xmit+0x654/0x6dc)
[<80621388>] (__dev_queue_xmit) from [<8062142c>] (dev_queue_xmit+0x1c/0x20)
[<8062142c>] (dev_queue_xmit) from [<80631264>] (neigh_resolve_output+0x110/0x198)
[<80631264>] (neigh_resolve_output) from [<8065b8f4>] (ip_finish_output2+0x210/0x44c)
[<8065b8f4>] (ip_finish_output2) from [<8065d7a0>] (ip_finish_output+0x1d8/0x1f0)
[<8065d7a0>] (ip_finish_output) from [<8065dab8>] (ip_output+0xb0/0xbc)
[<8065dab8>] (ip_output) from [<8065b50c>] (ip_local_out+0x6c/0x78)
[<8065b50c>] (ip_local_out) from [<8065e388>] (ip_send_skb+0x24/0xa4)
[<8065e388>] (ip_send_skb) from [<8065e43c>] (ip_push_pending_frames+0x34/0x40)
[<8065e43c>] (ip_push_pending_frames) from [<80687030>] (raw_sendmsg+0x490/0x83c)
[<80687030>] (raw_sendmsg) from [<8069517c>] (inet_sendmsg+0x58/0xf4)
[<8069517c>] (inet_sendmsg) from [<80600c34>] (sock_sendmsg+0x24/0x34)
[<80600c34>] (sock_sendmsg) from [<80601704>] (___sys_sendmsg+0x1ec/0x214)
[<80601704>] (___sys_sendmsg) from [<8060259c>] (__sys_sendmsg+0x50/0x7c)
[<8060259c>] (__sys_sendmsg) from [<806025e0>] (SyS_sendmsg+0x18/0x1c)
[<806025e0>] (SyS_sendmsg) from [<80107ec0>] (ret_fast_syscall+0x0/0x28)
Code: e1a02223 e203300f e798c182 e0882183 (e79c0000)
---[ end trace 2168b38dee2f6dfc ]---
Kernel panic - not syncing: Fatal exception in interrupt
CPU3: stopping
CPU: 3 PID: 0 Comm: swapper/3 Tainted: G D 4.14.111 #18
Hardware name: ARM-Versatile Express
[<80110f40>] (unwind_backtrace) from [<8010c3e0>] (show_stack+0x20/0x24)
[<8010c3e0>] (show_stack) from [<806f1a78>] (dump_stack+0xac/0xd8)
[<806f1a78>] (dump_stack) from [<8010f324>] (handle_IPI+0x2d0/0x34c)
[<8010f324>] (handle_IPI) from [<80101570>] (gic_handle_irq+0x9c/0xa0)
[<80101570>] (gic_handle_irq) from [<8010d170>] (__irq_svc+0x70/0x98)
Exception stack(0x9f50bf28 to 0x9f50bf70)
bf20: 00000001 00000000 00000000 80b0402c 9f50a000 00000000
bf40: 00000000 80b03cb8 80b03c6c 80a84b48 9f50bf98 9f50bf84 9f50bf78 9f50bf78
bf60: 801088c8 801088cc 60070013 ffffffff
[<8010d170>] (__irq_svc) from [<801088cc>] (arch_cpu_idle+0x34/0x4c)
[<801088cc>] (arch_cpu_idle) from [<8070cf14>] (default_idle_call+0x34/0x48)
[<8070cf14>] (default_idle_call) from [<8015f7c4>] (do_idle+0x16c/0x218)
[<8015f7c4>] (do_idle) from [<8015fb1c>] (cpu_startup_entry+0x28/0x2c)
[<8015fb1c>] (cpu_startup_entry) from [<8010ede4>] (secondary_start_kernel+0x168/0x170)
[<8010ede4>] (secondary_start_kernel) from [<60101a0c>] (0x60101a0c)
CPU0: stopping
CPU: 0 PID: 0 Comm: swapper/0 Tainted: G D 4.14.111 #18
Hardware name: ARM-Versatile Express
[<80110f40>] (unwind_backtrace) from [<8010c3e0>] (show_stack+0x20/0x24)
[<8010c3e0>] (show_stack) from [<806f1a78>] (dump_stack+0xac/0xd8)
[<806f1a78>] (dump_stack) from [<8010f324>] (handle_IPI+0x2d0/0x34c)
[<8010f324>] (handle_IPI) from [<80101570>] (gic_handle_irq+0x9c/0xa0)
[<80101570>] (gic_handle_irq) from [<8010d170>] (__irq_svc+0x70/0x98)
Exception stack(0x80b01ee0 to 0x80b01f28)
1ee0: 00000001 00000000 00000000 80b0402c 80b00000 00000000 00000000 80b03cb8
1f00: 80b03c6c 80a84b48 80b01f50 80b01f3c 80b01f30 80b01f30 801088c8 801088cc
1f20: 600e0013 ffffffff
[<8010d170>] (__irq_svc) from [<801088cc>] (arch_cpu_idle+0x34/0x4c)
[<801088cc>] (arch_cpu_idle) from [<8070cf14>] (default_idle_call+0x34/0x48)
[<8070cf14>] (default_idle_call) from [<8015f7c4>] (do_idle+0x16c/0x218)
[<8015f7c4>] (do_idle) from [<8015fb1c>] (cpu_startup_entry+0x28/0x2c)
[<8015fb1c>] (cpu_startup_entry) from [<80706940>] (rest_init+0xbc/0xc0)
[<80706940>] (rest_init) from [<80a00dec>] (start_kernel+0x3a8/0x3b4)
CPU1: stopping
CPU: 1 PID: 0 Comm: swapper/1 Tainted: G D 4.14.111 #18
Hardware name: ARM-Versatile Express
[<80110f40>] (unwind_backtrace) from [<8010c3e0>] (show_stack+0x20/0x24)
[<8010c3e0>] (show_stack) from [<806f1a78>] (dump_stack+0xac/0xd8)
[<806f1a78>] (dump_stack) from [<8010f324>] (handle_IPI+0x2d0/0x34c)
[<8010f324>] (handle_IPI) from [<80101570>] (gic_handle_irq+0x9c/0xa0)
[<80101570>] (gic_handle_irq) from [<8010d170>] (__irq_svc+0x70/0x98)
Exception stack(0x9f507f28 to 0x9f507f70)
7f20: 00000001 00000000 00000000 80b0402c 9f506000 00000000
7f40: 00000000 80b03cb8 80b03c6c 80a84b48 9f507f98 9f507f84 9f507f78 9f507f78
7f60: 801088c8 801088cc 600f0013 ffffffff
[<8010d170>] (__irq_svc) from [<801088cc>] (arch_cpu_idle+0x34/0x4c)
[<801088cc>] (arch_cpu_idle) from [<8070cf14>] (default_idle_call+0x34/0x48)
[<8070cf14>] (default_idle_call) from [<8015f7c4>] (do_idle+0x16c/0x218)
[<8015f7c4>] (do_idle) from [<8015fb1c>] (cpu_startup_entry+0x28/0x2c)
[<8015fb1c>] (cpu_startup_entry) from [<8010ede4>] (secondary_start_kernel+0x168/0x170)
[<8010ede4>] (secondary_start_kernel) from [<60101a0c>] (0x60101a0c)
---[ end Kernel panic - not syncing: Fatal exception in interrupt
测试环境为 QEMU 模拟的 ARM-Versatile Express 开发板,通过 addr2line 定位到 ___bpf_prog_run() 的对应代码行:
c
/* Excerpt of the eBPF interpreter; only the 32-bit load handler is shown. */
static unsigned int ___bpf_prog_run(u64 *regs, const struct bpf_insn *insn,
u64 *stack)
{
...
// The code below is what the macro LDST(W, u32) looks like after expansion
LDX_MEM_W:
/* dst = 32-bit value read from address (src register + signed offset) */
regs[insn->dst_reg] = *(u32 *)(unsigned long) (regs[insn->src_reg] + insn->off);
({
insn++;
goto select_insn;
});
...
}
崩溃时 eBPF 解释器正解释执行一条 eBPF 指令 61 11 00 00 00 00 00 00,该指令中,regs[insn->src_reg] 存储的指针值为 0,同时 insn->off 也为 0,所以出现了 NULL 指针访问错误。
这里要说明一下,这条崩溃指令 61 11 00 00 00 00 00 00 的内容,是笔者插入日志打印出来的。
3. 解析
首先看下 eBPF 内核程序 sockex1_kern.o 的反汇编代码:
c
$ llvm-objdump-8 -d sockex1_kern.o
sockex1_kern.o: file format ELF64-BPF
Disassembly of section socket1:
0000000000000000 bpf_prog1:
0: bf 16 00 00 00 00 00 00 r6 = r1
1: 30 00 00 00 17 00 00 00 r0 = *(u8 *)skb[23]
2: 63 0a fc ff 00 00 00 00 *(u32 *)(r10 - 4) = r0
3: 61 61 04 00 00 00 00 00 r1 = *(u32 *)(r6 + 4)
4: 55 01 08 00 04 00 00 00 if r1 != 4 goto +8 <LBB0_3>
5: bf a2 00 00 00 00 00 00 r2 = r10
6: 07 02 00 00 fc ff ff ff r2 += -4
7: 18 01 00 00 00 00 00 00 00 00 00 00 00 00 00 00 r1 = 0 ll
9: 85 00 00 00 01 00 00 00 call 1
10: 15 00 02 00 00 00 00 00 if r0 == 0 goto +2 <LBB0_3>
11: 61 61 5c 00 00 00 00 00 r1 = *(u32 *)(r6 + 92)
12: 7b 10 00 00 00 00 00 00 *(u64 *)(r0 + 0) = r1
0000000000000068 LBB0_3:
13: b7 00 00 00 00 00 00 00 r0 = 0
14: 95 00 00 00 00 00 00 00 exit
细心比对,并没有前面提到的崩溃指令 61 11 00 00 00 00 00 00,这是怎么一回事?原因是 eBPF 程序注入内核时,一些对内核数据结构的访问指令需要进行修正,这期间会修改和插入新的 eBPF 指令。具体到本文测试的 socket 的 eBPF 钩子程序而言,需要将 eBPF 程序 sockex1_kern.o 对 __sk_buff 成员的访问,修正为对 sk_buff 中对应成员的访问。一方面,因为在调用 eBPF 程序 sockex1_kern.o 时,传递的参数实际是一个 sk_buff 指针,而不是 __sk_buff 指针:
c
/*
 * Run the eBPF hook program attached to the socket.
 *
 * Inside an RCU read-side section, fetches sk->sk_filter; if a filter is
 * attached, runs its program on skb via bpf_prog_run_clear_cb() and returns
 * that result, otherwise returns the incoming `res` unchanged.
 */
static unsigned int run_filter(struct sk_buff *skb,
const struct sock *sk,
unsigned int res)
{
struct sk_filter *filter;
rcu_read_lock();
filter = rcu_dereference(sk->sk_filter);
if (filter != NULL)
/* The program receives the struct sk_buff pointer itself. */
res = bpf_prog_run_clear_cb(filter->prog, skb);
rcu_read_unlock();
return res;
}
bpf_prog_run_clear_cb(filter->prog, skb)
// 传递的是 struct sk_buff * 而不是 struct __sk_buff *
__bpf_prog_run32(skb, filter->insnsi)
另一方面,通过这样的方式,可以不将整个 sk_buff 的定义细节暴露到用户空间。
将 eBPF 程序 sockex1_kern.o 注入内核时,内核会把程序中对 __sk_buff 成员的访问转换为对 sk_buff 成员的访问,其细节如下:
c
/* Excerpt of the bpf() system call: BPF_PROG_LOAD is dispatched to bpf_prog_load(). */
SYSCALL_DEFINE3(bpf, int, cmd, union bpf_attr __user *, uattr, unsigned int, size)
{
...
switch (cmd) {
...
case BPF_PROG_LOAD:
err = bpf_prog_load(&attr);
break;
...
}
...
}
/* Excerpt of the program loader: the step shown hands the program to bpf_check(). */
static int bpf_prog_load(union bpf_attr *attr)
{
...
err = bpf_check(&prog, attr);
...
}
/*
 * Excerpt of bpf_check(): when the preceding steps left ret == 0, the
 * program's context accesses are rewritten by convert_ctx_accesses().
 */
int bpf_check(struct bpf_prog **prog, union bpf_attr *attr)
{
...
if (ret == 0)
/* program is valid, convert *(u32*)(ctx + off) accesses */
ret = convert_ctx_accesses(env);
...
}
/*
 * Excerpt of convert_ctx_accesses(): walks the program's instructions and
 * calls the ops->convert_ctx_access callback, which writes replacement
 * instructions into insn_buf and returns a count (stored in cnt).
 */
static int convert_ctx_accesses(struct bpf_verifier_env *env)
{
...
for (i = 0; i < insn_cnt; i++, insn++) {
...
/* bpf_convert_ctx_access() */
cnt = ops->convert_ctx_access(type, insn, insn_buf, env->prog,
&target_size);
...
}
...
}
/*
 * Excerpt of bpf_convert_ctx_access(): rewrites one access to a
 * struct __sk_buff field (identified by si->off) into instructions that
 * access the corresponding struct sk_buff data, emitted through `insn`.
 * Only the pkt_type and remote_ip4 cases are shown.
 */
static u32 bpf_convert_ctx_access(enum bpf_access_type type,
const struct bpf_insn *si,
struct bpf_insn *insn_buf,
struct bpf_prog *prog, u32 *target_size)
{
...
switch (si->off) {
...
case offsetof(struct __sk_buff, pkt_type):
*target_size = 1;
/* Load the single byte at PKT_TYPE_OFFSET() ... */
*insn++ = BPF_LDX_MEM(BPF_B, si->dst_reg, si->src_reg,
PKT_TYPE_OFFSET());
/* ... and mask it with PKT_TYPE_MAX. */
*insn++ = BPF_ALU32_IMM(BPF_AND, si->dst_reg, PKT_TYPE_MAX);
#ifdef __BIG_ENDIAN_BITFIELD
/* With big-endian bitfields the value is additionally shifted right by 5. */
*insn++ = BPF_ALU32_IMM(BPF_RSH, si->dst_reg, 5);
#endif
break;
...
case offsetof(struct __sk_buff, remote_ip4):
BUILD_BUG_ON(FIELD_SIZEOF(struct sock_common, skc_daddr) != 4);
/* First load: dst = skb->sk (pointer-sized read at offsetof(struct sk_buff, sk)). */
*insn++ = BPF_LDX_MEM(BPF_FIELD_SIZEOF(struct sk_buff, sk),
si->dst_reg, si->src_reg,
offsetof(struct sk_buff, sk));
/*
 * Second load: dst = 32-bit skc_daddr read through the pointer just
 * loaded. No NULL check of that pointer is emitted in the code shown.
 */
*insn++ = BPF_LDX_MEM(BPF_W, si->dst_reg, si->dst_reg,
bpf_target_off(struct sock_common,
skc_daddr,
4, target_size));
break;
...
}
...
}
修正后的 eBPF 程序变为:
c
bf 16 00 00 00 00 00 00
30 00 00 00 17 00 00 00
63 0a fc ff 00 00 00 00
71 61 64 00 00 00 00 00
54 01 00 00 07 00 00 00
55 01 09 00 04 00 00 00
bf a2 00 00 00 00 00 00
07 02 00 00 fc ff ff ff
18 01 00 00 00 d0 64 9f
00 00 00 00 00 00 00 00
85 00 00 00 cc f5 00 00
15 00 03 00 00 00 00 00
61 61 0c 00 00 00 00 00
61 11 00 00 00 00 00 00
7b 10 00 00 00 00 00 00
b7 00 00 00 00 00 00 00
95 00 00 00 00 00 00 00
相对于原程序,将原指令 61 61 04 00 00 00 00 00 修改为如下两条指令(即代码里的 skb->pkt_type 读取语句):
c
71 61 64 00 00 00 00 00 // r1 = *(u8 *)(r6 + 100)
54 01 00 00 07 00 00 00 // r1 = r1 & 0x00000007
即将 __sk_buff::pkt_type 的访问,转换为 sk_buff::__pkt_type_offset[0] 的访问,看一下 __sk_buff 和 sk_buff 相关定义:
c
/* Excerpt of the context structure seen by the eBPF program. */
struct __sk_buff {
__u32 len;
__u32 pkt_type; /* second __u32 of the excerpt, i.e. offset 4 when len is the first member */
...
};
/* Excerpt of the kernel-internal packet structure. */
struct sk_buff {
...
__u8 __pkt_type_offset[0]; /* zero-length marker declared immediately before the pkt_type bitfield */
__u8 pkt_type:3; /* 3-bit field */
...
};
另外,将原指令 61 61 5c 00 00 00 00 00 修改为(即代码里的 skb->remote_ip4 读取语句):
c
61 61 0c 00 00 00 00 00 // r1 = *(u32 *)(r6 + 12) [r1 = sk_buff::sk]
61 11 00 00 00 00 00 00 // r1 = *(u32 *)r1
即将对 __sk_buff::remote_ip4 的访问,转换为对 sk_buff::sk->__sk_common.skc_daddr 的访问。
说完 eBPF 程序的修正,回到本文崩溃的根因分析,也即函数 ___bpf_prog_run() 中:
c
/* Excerpt of the eBPF interpreter; only the 32-bit load handler is shown. */
static unsigned int ___bpf_prog_run(u64 *regs, const struct bpf_insn *insn,
u64 *stack)
{
...
// The code below is what the macro LDST(W, u32) looks like after expansion
LDX_MEM_W:
/* dst = 32-bit value read from address (src register + signed offset) */
regs[insn->dst_reg] = *(u32 *)(unsigned long) (regs[insn->src_reg] + insn->off);
({
insn++;
goto select_insn;
});
...
}
前面提到崩溃是在解释执行指令 61 11 00 00 00 00 00 00 时,那这是一条什么指令?看一下 eBPF 虚拟机对指令的定义:
c
/* Layout of a single eBPF instruction. */
struct bpf_insn {
__u8 code; /* opcode */
__u8 dst_reg:4; /* dest register */
__u8 src_reg:4; /* source register */ /* the value BPF_PSEUDO_MAP_FD marks a bpf_map access instruction */
__s16 off; /* signed offset */
__s32 imm; /* signed immediate constant */
};
所以指令 61 11 00 00 00 00 00 00 解释为 r1 = *(u32 *)r1,那 eBPF 解释器寄存器 r1 的内容又是谁设定的?61 11 00 00 00 00 00 00 的上一条指令 61 61 0c 00 00 00 00 00 设定了 r1:r1 = *(u32 *)(r6 + 12)。而 r6 在 eBPF 程序开头被设置为 sk_buff 指针,所以综合起来,崩溃指令 61 11 00 00 00 00 00 00 的操作就是读取 sk_buff::sk->__sk_common.skc_daddr。内核崩溃是由 sk_buff::sk == NULL(即 r1 = 0)导致的,也即 skb 关联的 sock 成员 sk_buff::sk 没有设定:
c
packet_rcv()
run_filter()
__bpf_prog_run32()
__bpf_prog_run()