连接跟踪conntrack的基本信息
Linux内核的conntrack模块是网络过滤子系统netfilter重要组成部分,它是网络地址转换NAT
和防火墙等网络功能的基础。Linux内核中一个连接(可以为UDP
或TCP
，或其他)的建立是一个冗长耗时的过程,例如,该连接经过内核过滤规则(对应防火墙的规则)或端口转发等规则的确认,最终成功建立。当连接建立后,为了让后续数据量庞大、数量众多的网络包能够快速检测通过(从而降低Linux内核网络的负载),跟踪连接是十分必要的。为了跟踪一个已存在的网络连接,Linux内核(版本为6.6.67
)使用了以下结构体作为一个连接的指纹:
c
/* include/net/netfilter/nf_conntrack_tuple.h */
/* This contains the information to distinguish a connection. */
struct nf_conntrack_tuple {
struct nf_conntrack_man src;
/* These are the parts of the tuple which are fixed. */
struct {
union nf_inet_addr u3;
union {
/* Add other protocols here. */
__be16 all;
struct {
__be16 port;
} tcp;
struct {
__be16 port;
} udp;
......
}
可以看到,它包含了一个连接的重要信息:源和目标IP地址、源和目标端口号等。对于NAT
，它还包含转换的IP地址和端口号等。该nf_conntrack_tuple
结构体在内核函数nf_ct_get_tuple
中被填充:
c
/* net/netfilter/nf_conntrack_core.c */
static bool
nf_ct_get_tuple(const struct sk_buff *skb,
unsigned int nhoff,
unsigned int dataoff,
u_int16_t l3num,
u_int8_t protonum,
struct net *net,
struct nf_conntrack_tuple *tuple)
{
unsigned int size;
const __be32 *ap;
__be32 _addrs[8];
memset(tuple, 0, sizeof(*tuple));
之后通过 __nf_conntrack_find_get
函数将该结构体映射到struct nf_conn
指针;可以把这一过程简化成把nf_conntrack_tuple
结构体作为一个哈希表的键值,查找得到struct nf_conn
指针:
c
/* net/netfilter/nf_conntrack_core.c */
/* Find a connection corresponding to a tuple. */
static struct nf_conntrack_tuple_hash *
__nf_conntrack_find_get(struct net *net, const struct nf_conntrack_zone *zone,
const struct nf_conntrack_tuple *tuple, u32 hash)
{
struct nf_conntrack_tuple_hash *h;
struct nf_conn *ct;
h = ____nf_conntrack_find(net, zone, tuple, hash);
......
ct = nf_ct_tuplehash_to_ctrack(h);
最后,结构体struct nf_conn
包含了一个已建立的(严格地说,也包含待建立的)连接的状态信息:
c
/* include/net/netfilter/nf_conntrack.h */
struct nf_conn {
struct nf_conntrack ct_general;
spinlock_t lock;
/* jiffies32 when this ct is considered dead */
u32 timeout;
#ifdef CONFIG_NF_CONNTRACK_ZONES
struct nf_conntrack_zone zone;
#endif
/* XXX should I move this to the tail ? - Y.K */
/* These are my tuples; original and reply */
struct nf_conntrack_tuple_hash tuplehash[IP_CT_DIR_MAX];
/* Have we seen traffic both ways yet? (bitset) */
unsigned long status;
这里我们重点关注timeout/status
;其中timeout
以jiffies
为单位,表示该连接跟踪失效的时间;status
则提供了该连接的状态比特标志位等信息。
UDP的连接状态跟踪
笔者为了加深对conntrack
的了解,修改了busybox的代码,在其中增加了绑定本地端口的功能:
diff
diff --git a/libbb/xconnect.c b/libbb/xconnect.c
index 0e0b247..6456c65 100644
--- a/libbb/xconnect.c
+++ b/libbb/xconnect.c
@@ -369,6 +369,25 @@ int FAST_FUNC xsocket_type(len_and_sockaddr **lsap, int family, int sock_type)
lsa = xzalloc(LSA_LEN_SIZE + len);
lsa->len = len;
lsa->u.sa.sa_family = family;
+
+ /* bind to local port number for IPv4/IPv6 */
+ if (family == AF_INET || family == AF_INET6) {
+ int pno = -1;
+ const char * lport = getenv("BB_PORTNO");
+ if (lport && lport[0])
+ pno = (int) strtol(lport, NULL, 0);
+ if (pno > 0 && pno < 65536) {
+ if (family == AF_INET) {
+ struct sockaddr_in * addr;
+ addr = (struct sockaddr_in *) &(lsa->u.sa);
+ addr->sin_port = htons((unsigned short) pno);
+ } else {
+ struct sockaddr_in6 * addr;
+ addr = (struct sockaddr_in6 *) &(lsa->u.sa);
+ addr->sin6_port = htons((unsigned short) pno);
+ }
+ }
+ }
*lsap = lsa;
return fd;
}
这样,通过配置环境变量BB_PORTNO
固定为4321
,可以强制nslookup
命令行工具多次调用时,使用同一端口:
root@localhost:~# export BB_PORTNO=4321
root@localhost:~# nslookup www.baidu.com 223.5.5.5
Server: 223.5.5.5
Address: 223.5.5.5:53
Non-authoritative answer:
www.baidu.com canonical name = www.a.shifen.com
Name: www.a.shifen.com
Address: 223.109.82.16
Name: www.a.shifen.com
Address: 223.109.82.212
以上命令在PC侧执行;此时,在路由器设备上(笔者使用了树莓派做软路由),可以通过conntrack
命令行工具查看已建立的UDP
连接信息:
root@OpenWrt:~# conntrack -L | grep -e udp
conntrack v1.4.8 (conntrack-tools): 13 flow entries have been shown.
udp 17 49 src=192.167.7.169 dst=223.5.5.5 sport=4321 dport=53 packets=1 bytes=90 src=223.5.5.5 dst=192.168.1.3 sport=53 dport=4321 packets=2 bytes=266 mark=0 use=1
root@OpenWrt:~# conntrack -L | grep -e udp
conntrack v1.4.8 (conntrack-tools): 13 flow entries have been shown.
udp 17 46 src=192.167.7.169 dst=223.5.5.5 sport=4321 dport=53 packets=1 bytes=90 src=223.5.5.5 dst=192.168.1.3 sport=53 dport=4321 packets=2 bytes=266 mark=0 use=1
root@OpenWrt:~# conntrack -L | grep -e udp
conntrack v1.4.8 (conntrack-tools): 13 flow entries have been shown.
udp 17 176 src=192.167.7.169 dst=223.5.5.5 sport=4321 dport=53 packets=2 bytes=180 src=223.5.5.5 dst=192.168.1.3 sport=53 dport=4321 packets=4 bytes=532 [ASSURED] mark=0 use=1
root@OpenWrt:~# conntrack -L | grep -e udp
conntrack v1.4.8 (conntrack-tools): 13 flow entries have been shown.
udp 17 174 src=192.167.7.169 dst=223.5.5.5 sport=4321 dport=53 packets=3 bytes=270 src=223.5.5.5 dst=192.168.1.3 sport=53 dport=4321 packets=6 bytes=798 [ASSURED] mark=0 use=1
其中,17为网络协议编号,对应UDP
;之后的数值单位为秒,即该连接跟踪在多少秒后超时。超时后,DNS
服务器223.5.5.5
的回应不会被软路由NAT转发。注意到,一开始该UDP
连接的超时时间分别为49秒和46秒;但之后变成了176秒,这是笔者在PC上多次执行nslookup www.baidu.com 223.5.5.5
命令的结果;简单地说,当一个UDP
有了初次的回应后,它的超时时间会变成60秒;当有多次回应后,超时时间会变成180秒。这一变化过程下面有相关说明。
UDP连接的状态的内核调试
笔者编写了一个简单的bpftrace脚本,用于跟踪UDP
的连接状态信息:
bpftrace
#!/usr/bin/bpftrace
// Trace the lifecycle of a conntrack entry: allocation, per-UDP-packet
// state updates, and final release. Timestamps are printed as
// seconds.microseconds since the script started (elapsed is in ns... no,
// elapsed is in nanoseconds; dividing by 1e6 yields milliseconds — the
// printed value is ms.sub-ms, TODO confirm intended unit).
#include <net/netfilter/nf_conntrack.h>
// Fires on every UDP packet that conntrack inspects; arg0 is the
// struct nf_conn * whose status/timeout this call may update
// (net/netfilter/nf_conntrack_proto_udp.c).
kprobe:nf_conntrack_udp_packet {
$c = (struct nf_conn *) arg0;
// Print the status bits and timeout (jiffies) as they are BEFORE this
// invocation applies its refresh.
printf("%8d.%06d: PID: %d, comm: %s, nf_conntrack_udp_packet(0x%lx, 0x%lx, 0x%lx), status: 0x%x, timeout: %u",
elapsed / 1000000, elapsed % 1000000, pid, comm, arg0, arg1, arg2, $c->status, $c->timeout);
print(kstack);
}
// Fires when a new conntrack entry has been allocated; retval is the
// freshly allocated struct nf_conn * (matches the pointers seen above).
kretprobe:__nf_conntrack_alloc {
printf("%8d.%06d: PID: %d, comm: %s, __nf_conntrack_alloc has returned: 0x%lx",
elapsed / 1000000, elapsed % 1000000, pid, comm, retval);
print(kstack);
}
// Fires when a conntrack entry is released (e.g. after GC detects it
// expired); arg0 is the struct nf_conn * being freed.
kprobe:nf_conntrack_free {
printf("%8d.%06d: PID: %d, comm: %s, nf_conntrack_free(0x%lx)",
elapsed / 1000000, elapsed % 1000000, pid, comm, arg0);
print(kstack);
}
使用该脚本对这一过程进行调试,得到的结果如下(调试结果有精简):
4675.050648: PID: 0, comm: swapper/0, __nf_conntrack_alloc has returned: 0xffffff8006469200
init_conntrack.isra.0+976
nf_conntrack_in+912
ipv4_conntrack_in+24
nf_hook_slow+72
br_nf_pre_routing+444
br_handle_frame+404
__netif_receive_skb_core.constprop.0+500
__netif_receive_skb_one_core+44
process_backlog+168
__napi_poll.constprop.0+56
net_rx_action+344
handle_softirqs+352
__softirqentry_text_start+20
____do_softirq+16
4675.180752: PID: 0, comm: swapper/0, nf_conntrack_udp_packet(0xffffff8006469200, 0xffffff80050e0e00, 0x14), status: 0x0, timeout: 0
nf_conntrack_udp_packet+0
ipv4_conntrack_in+24
nf_hook_slow+72
br_nf_pre_routing+444
br_handle_frame+404
__netif_receive_skb_core.constprop.0+500
__netif_receive_skb_one_core+44
process_backlog+168
__napi_poll.constprop.0+56
net_rx_action+344
handle_softirqs+352
4681.793946: PID: 2774, comm: bpftrace, nf_conntrack_udp_packet(0xffffff8006469200, 0xffffff8001681b00, 0x14), status: 0x198, timeout: 372070
nf_conntrack_udp_packet+0
ipv4_conntrack_in+24
nf_hook_slow+72
ip_rcv+92
__netif_receive_skb_one_core+72
process_backlog+168
__napi_poll.constprop.0+56
net_rx_action+344
handle_softirqs+352
4682.189260: PID: 2774, comm: bpftrace, nf_conntrack_udp_packet(0xffffff8006469200, 0xffffff8001681600, 0x14), status: 0x19a, timeout: 372071
nf_conntrack_udp_packet+0
ipv4_conntrack_in+24
nf_hook_slow+72
ip_rcv+92
__netif_receive_skb_one_core+72
process_backlog+168
__napi_poll.constprop.0+56
net_rx_action+344
handle_softirqs+352
当一个连接生成时,会调用__nf_conntrack_alloc
函数分配连接跟踪结构体nf_conn
。函数nf_conntrack_udp_packet
用于检查并更新一个UDP
连接的跟踪信息;第一次调用时,可以看到nf_conn
中的status
和timeout
都为0,此时会默认使用以下代码更新跟踪信息:
c
nf_ct_refresh_acct(ct, ctinfo, skb, timeouts[UDP_CT_UNREPLIED]);
注意,timeouts[UDP_CT_UNREPLIED]
默认值为 30*HZ
,但openwrt
系统将之配置为60秒:
root@OpenWrt:~# cat /proc/sys/net/netfilter/nf_conntrack_udp_timeout
60
root@OpenWrt:~# cat /proc/sys/net/netfilter/nf_conntrack_udp_timeout_stream
180
针对笔者使用的树莓派设备,使能了内核选项CONFIG_HZ_100=y
,那么HZ
值为100;上面的调试结果:status: 0x198, timeout: 372070
,表明该UDP
连接跟踪会在系统启动后的第3720.7
秒后超时失效。可以确定,该UDP
连接是树莓派设备启动的第3720.7 - 60
秒,即第3660.7秒时从PC机上收到的(此时树莓派启动了约一小时)。下面会有数据的变化与此印证。此时,该UDP连接的状态位为0x198,对应着:
c
-----------------------------------------------
Value [0x198] (0x198, 408):
28 24 20 16 12 8 4 0
0000 0000 0000 0000 0000 0001 1001 1000
31 27 23 19 15 11 7 3
/* Connection is confirmed: originating packet has left box */
IPS_CONFIRMED_BIT = 3,
IPS_CONFIRMED = (1 << IPS_CONFIRMED_BIT),
/* Connection needs src nat in orig dir. This bit never changed. */
IPS_SRC_NAT_BIT = 4,
IPS_SRC_NAT = (1 << IPS_SRC_NAT_BIT)
上面调用了nf_conntrack_udp_packet
函数两次,分别对应对DNS 223.5.5.5
服务器的一发一收,连接已确认,第3位比特会置1。
下面笔者再次(即第二次)在PC机上执行了nslookup www.baidu.com 223.5.5.5
,但连接跟踪信息的超时时间没有变化,仍是系统启动的第3720.7
秒。此时,状态位由之前的0x198
变为0x19a
,即第1位置1(其实是第二次调用nf_conntrack_udp_packet
函数返回后的状态值):
c
/* We've seen packets both ways: bit 1 set. Can be set, not unset. */
IPS_SEEN_REPLY_BIT = 1,
IPS_SEEN_REPLY = (1 << IPS_SEEN_REPLY_BIT),
第二次执行nslookup
的内核调试结果如下:
21181.259131: PID: 0, comm: swapper/0, nf_conntrack_udp_packet(0xffffff8006469200, 0xffffff8001681e00, 0x14), status: 0x19a, timeout: 372071
nf_conntrack_udp_packet+0
ipv4_conntrack_in+24
nf_hook_slow+72
br_nf_pre_routing+444
br_handle_frame+404
__netif_receive_skb_core.constprop.0+500
__netif_receive_skb_one_core+44
process_backlog+168
__napi_poll.constprop.0+56
net_rx_action+344
handle_softirqs+352
21187.751231: PID: 308, comm: kworker/u13:1, nf_conntrack_udp_packet(0xffffff8006469200, 0xffffff80065e6700, 0x14), status: 0x19e, timeout: 385721
nf_conntrack_udp_packet+0
ipv4_conntrack_in+24
nf_hook_slow+72
ip_rcv+92
__netif_receive_skb_one_core+72
process_backlog+168
__napi_poll.constprop.0+56
net_rx_action+344
handle_softirqs+352
除了status
中的比特位2置1外:
c
/* Conntrack should never be early-expired. */
IPS_ASSURED_BIT = 2,
IPS_ASSURED = (1 << IPS_ASSURED_BIT),
该UDP跟踪信息的超时时间由原来的372071
变成了385721
,二者相差了13650 jiffies
，对应着136.5秒;也就是说,内核把这个UDP连接跟踪失效的时间在原来的基础上又推迟了136.5秒,这个超时时间接近nf_conntrack_udp_timeout_stream
中指定的180秒超时时间:
cat /proc/sys/net/netfilter/nf_conntrack_udp_timeout_stream
180
最后,UDP
连接跟踪的状态更新函数内容如下:
c
/* net/netfilter/nf_conntrack_proto_udp.c */
/* Verdict/refresh handler for UDP conntrack entries: validates the
 * packet, then extends the entry's timeout. Unreplied entries get the
 * short nf_conntrack_udp_timeout; once replies are seen and the flow
 * stays active past ~2s it is treated as a stream, gets the longer
 * nf_conntrack_udp_timeout_stream and is marked ASSURED.
 */
int nf_conntrack_udp_packet(struct nf_conn *ct,
struct sk_buff *skb,
unsigned int dataoff,
enum ip_conntrack_info ctinfo,
const struct nf_hook_state *state)
{
unsigned int *timeouts;
unsigned long status;
/* Malformed UDP header/checksum: do not update state. */
if (udp_error(skb, dataoff, state))
return -NF_ACCEPT;
/* Per-zone/attached timeout policy, else the net's sysctl defaults. */
timeouts = nf_ct_timeout_lookup(ct);
if (!timeouts)
timeouts = udp_get_timeouts(nf_ct_net(ct));
status = READ_ONCE(ct->status);
/* First sighting (not yet confirmed): remember "now + 2s" as the
 * threshold after which continued traffic qualifies as a stream.
 */
if ((status & IPS_CONFIRMED) == 0)
ct->proto.udp.stream_ts = 2 * HZ + jiffies;
/* If we've seen traffic both ways, this is some kind of UDP
 * stream. Set Assured.
 */
if (status & IPS_SEEN_REPLY) {
unsigned long extra = timeouts[UDP_CT_UNREPLIED];
bool stream = false;
/* Still active after two seconds? Extend timeout. */
if (time_after(jiffies, ct->proto.udp.stream_ts)) {
extra = timeouts[UDP_CT_REPLIED];
stream = (status & IPS_ASSURED) == 0;
}
nf_ct_refresh_acct(ct, ctinfo, skb, extra);
/* never set ASSURED for IPS_NAT_CLASH, they time out soon */
if (unlikely((status & IPS_NAT_CLASH)))
return NF_ACCEPT;
/* Also, more likely to be important, and not a probe */
if (stream && !test_and_set_bit(IPS_ASSURED_BIT, &ct->status))
nf_conntrack_event_cache(IPCT_ASSURED, ct);
} else {
/* No reply yet: keep refreshing with the short timeout. */
nf_ct_refresh_acct(ct, ctinfo, skb, timeouts[UDP_CT_UNREPLIED]);
}
return NF_ACCEPT;
}
以上代码中的UDP_CT_REPLIED
即对应内核配置/proc/sys/net/netfilter/nf_conntrack_udp_timeout_stream
的值(UDP_CT_UNREPLIED
对应nf_conntrack_udp_timeout
)。注意到,把超时时间更新到timeouts[UDP_CT_REPLIED]
是有条件的,其条件就是间隔两秒之后仍有数据活动(从而该连接被视为持续活动的连接,即udp_stream
)。
上面提到,连接跟踪结构体struct nf_conn
包含了一些NAT
的信息,这个信息是网络地址转换需要的;例如上面的status
字段中第4位比特位置1(对应IPS_SRC_NAT
),则以下代码会执行:
c
/* net/netfilter/nf_conntrack_core.c */
static int __nf_conntrack_update(struct net *net, struct sk_buff *skb, ...) {
if (ct->status & IPS_SRC_NAT) {
memcpy(tuple.src.u3.all,
ct->tuplehash[IP_CT_DIR_ORIGINAL].tuple.src.u3.all,
sizeof(tuple.src.u3.all));
tuple.src.u.all =
ct->tuplehash[IP_CT_DIR_ORIGINAL].tuple.src.u.all;
}
......
if (status & IPS_SRC_NAT &&
nat_hook->manip_pkt(skb, ct, NF_NAT_MANIP_SRC,
IP_CT_DIR_ORIGINAL) == NF_DROP)
return -1;
}
上面根据标志位IPS_SRC_NAT
更新了tuple
中的UDP源地址。之后调用了manip_pkt
来进一步处理。下面笔者找到了函数__udp_manip_pkt
对应的汇编代码,编写了另一个bpftrace
脚本,查看对网络数据包的UDP端口的修改:
#!/usr/bin/bpftrace
/*
net/netfilter/nf_nat_proto.c
static void __udp_manip_pkt(struct sk_buff *skb,
unsigned int iphdroff, struct udphdr *hdr,
const struct nf_conntrack_tuple *tuple,
enum nf_nat_manip_type maniptype, bool do_csum)
{
__be16 *portptr, newport;
......
*portptr = newport; // => 0xffffffc080810410 <l4proto_manip_pkt+428>: strh w25, [x24]
}
Dump of assembler code from 0xffffffc080810400 to 0xffffffc080810420:
0xffffffc080810400 <l4proto_manip_pkt+412>: cbnz w21, 0xffffffc08081063c <l4proto_manip_pkt+984>
0xffffffc080810404 <l4proto_manip_pkt+416>: ldrh w25, [x22, #16]
0xffffffc080810408 <l4proto_manip_pkt+420>: mov x24, x19
0xffffffc08081040c <l4proto_manip_pkt+424>: cbnz w0, 0xffffffc080810398 <l4proto_manip_pkt+308>
0xffffffc080810410 <l4proto_manip_pkt+428>: strh w25, [x24]
*/
// Probe the instruction at offset 0x1ac (=428) inside l4proto_manip_pkt,
// i.e. the inlined `*portptr = newport` store shown in the disassembly
// above. NOTE(review): this offset is specific to this exact kernel
// build; it must be re-derived after any rebuild.
kprobe:l4proto_manip_pkt+0x1ac {
// w25 holds the new (NAT-translated) port, x24 points at the UDP
// port field inside the packet (see strh w25, [x24]).
$n = reg("r25");
$r = (uint16 *) reg("r24");
$o = *kptr($r);
// Both values are big-endian on the wire; swap bytes (ntohs) so the
// ports print as ordinary host-order numbers.
$o = ($o >> 8) | (($o << 8) & 0x00FF00);
$n = ($n >> 8) | (($n << 8) & 0x00FF00);
printf("PID: %d, comm: %s, UDP/NAT replacing port from %d to %d",
pid, comm, $o, $n);
print(kstack);
}
上面脚本的调试结果只有把路由端口修改为4321
端口的操作,却没有把4321
端口替代成路由端口的操作,需要进一步探究:
PID: 0, comm: swapper/1, UDP/NAT replacing port from 57616 to 4321
l4proto_manip_pkt+428
nf_nat_ipv4_manip_pkt+116
nf_nat_manip_pkt+192
nf_nat_inet_fn+460
nf_nat_ipv4_pre_routing+84
nf_hook_slow+72
ip_rcv+92
跟踪连接的超时失效
结构体struct nf_conn
保存了已建立连接的基本信息;当一个nf_conn
失效时,Linux内核会丢弃该数据包(必要时返回TCP/RST
或icmp/unreachable
),因为不知道如何对该数据包进行NAT转发。连接跟踪超时的判断,目前的调试观察到有两种方式,分别是内核工作线程周期性检测,和应用层的netlink
访问(例如上面的conntrack
命令行工具)。当一个连接跟踪失效时,会调用nf_conntrack_free
释放内存:
146678.923239: PID: 36, comm: kworker/u12:0, nf_conntrack_free(0xffffff8006469900)
nf_conntrack_free+0
nf_ct_gc_expired.part.0+152
nf_ct_gc_expired+96
gc_worker+592
process_one_work+408
worker_thread+768
kthread+220
ret_from_fork+16
235795.008082: PID: 2817, comm: conntrack, nf_conntrack_free(0xffffff8006469200)
nf_conntrack_free+0
ctnetlink_dump_table+1024
netlink_dump+300
__netlink_dump_start+364
ctnetlink_get_conntrack+484
nfnetlink_rcv_msg+560
netlink_rcv_skb+96
nfnetlink_rcv+108
netlink_unicast+488
netlink_sendmsg+412
__sys_sendto+224
__arm64_sys_sendto+40
invoke_syscall.constprop.0+92
do_el0_svc+64
el0_svc+48
el0t_64_sync_handler+288
el0t_64_sync+376
至此,我们就对Linux内核的连接跟踪conntrack
有了初步的了解;这一块比较复杂,值得深入探究。