深度剖析 Java NIO Socket 从 read() 调用到 Linux 内核 tcp_recvmsg,以及 epoll 从注册到唤醒的完整源码路径,逐行追踪,揭示 Java 网络通信与 epoll 事件机制的底层奥秘
1. 引言
当我们在 Java 中写下 socketChannel.read(byteBuffer) 这行代码时,一条跨越用户态与内核态边界的调用链路被瞬间激活。当我们在 NIO 编程中调用 selector.select() 时,另一条从 epoll 到 socket 的事件通知链路也在并行运转。这两条链路——数据读取路径和事件通知路径——共同构成了 Java NIO 高性能网络通信的基石。
本文将从源码层面,逐层追踪两条完整的调用链路:
- 数据读取路径:从 SocketChannel.read() 到内核 tcp_recvmsg,完成数据从内核到用户空间的拷贝。
- 事件通知路径:从 epoll_ctl 注册 socket,到数据到达时 ep_poll_callback 被触发,再到 epoll_wait 返回就绪事件。
两条链路通过 socket 的等待队列(sk_wq) 这个核心数据结构紧密耦合。理解这两条链路,就理解了 Java NIO 高性能的底层秘密。
阅读提示:本文基于 OpenJDK 源码和 Linux 6.x 内核源码,所有核心调用关系均来源于真实代码。
2. 全景图:两条完整的调用链路
text
scss
┌─────────────────────────────────────────────────────────────────────────────┐
│ Java 应用层 │
│ socketChannel.read(buffer) selector.select() │
└─────────────────────────────────────────────────────────────────────────────┘
↓
┌─────────────────────────────────────────────────────────────────────────────┐
│ Java NIO 层 (SocketChannelImpl/EPollSelectorImpl) │
│ IOUtil.read() EPollArrayWrapper.poll() │
│ park(Net.POLLIN) epollWait() │
└─────────────────────────────────────────────────────────────────────────────┘
↓ (JNI 调用)
┌─────────────────────────────────────────────────────────────────────────────┐
│ JNI 本地方法层 (SocketDispatcher.c / EPoll.c) │
│ NET_Read() → recv() Java_sun_nio_ch_EPoll_ctl/Wait() │
└─────────────────────────────────────────────────────────────────────────────┘
↓ (系统调用)
┌─────────────────────────────────────────────────────────────────────────────┐
│ Linux 内核系统调用层 │
│ SYSCALL_DEFINE6(recvfrom) SYSCALL_DEFINE4(epoll_ctl) │
│ → __sys_recvfrom() → do_epoll_ctl() │
│ SYSCALL_DEFINE4(epoll_wait) │
│ → do_epoll_wait() │
└─────────────────────────────────────────────────────────────────────────────┘
↓
┌─────────────────────────────────────────────────────────────────────────────┐
│ Socket / epoll 核心层 │
│ sock_recvmsg() ep_insert() / ep_poll() │
│ → inet_recvmsg() → ep_ptable_queue_proc() │
│ → tcp_recvmsg() → ep_poll_callback() │
│ → tcp_recvmsg_locked() → ep_send_events() │
└─────────────────────────────────────────────────────────────────────────────┘
↓
┌─────────────────────────────────────────────────────────────────────────────┐
│ TCP 层 / 等待队列 │
│ skb_queue_walk() → 遍历接收队列 sk_wq (socket 等待队列) │
│ skb_copy_datagram_msg() → 拷数据 ep_poll_callback() 挂载在 sk_wq 上 │
│ tcp_eat_recv_skb() → 移除 skb wake_up(&sk_wq) → 触发回调 │
└─────────────────────────────────────────────────────────────────────────────┘
3. 第一部分:数据读取路径 —— 从 Java 到 tcp_recvmsg
3.1 Java NIO 层:SocketChannelImpl.read()
Java NIO 的 SocketChannel 在 Linux 下的具体实现是 sun.nio.ch.SocketChannelImpl。其 read 方法的核心逻辑如下:
java
scss
// sun.nio.ch.SocketChannelImpl.java
@Override
public long read(ByteBuffer[] dsts, int offset, int length) throws IOException {
readLock.lock();
try {
ensureOpenAndConnected();
boolean blocking = isBlocking();
long n = 0;
try {
beginRead(blocking);
// 第一次尝试读取:直接调用 native 方法
n = IOUtil.read(fd, dsts, offset, length, nd);
if (blocking) {
// 阻塞模式:如果返回 RETRY,则进入循环等待
while (IOStatus.okayToRetry(n) && isOpen()) {
park(Net.POLLIN); // ★ 阻塞等待可读事件
n = IOUtil.read(fd, dsts, offset, length, nd);
}
}
} finally {
endRead(blocking, n > 0);
}
return IOStatus.normalize(n);
} finally {
readLock.unlock();
}
}
这段代码体现了 Java NIO 的一个精妙设计:
- 第一次尝试:无论阻塞还是非阻塞模式,都先调用 IOUtil.read 尝试读取数据。如果内核接收队列中有数据,直接返回;如果没有数据且 fd 处于非阻塞状态(O_NONBLOCK / MSG_DONTWAIT),tcp_recvmsg 会返回 -EAGAIN,JNI 层将其转换为 IOStatus.UNAVAILABLE(值为 -2)。
- 阻塞模式的重试循环:如果 IOStatus.okayToRetry(n) 返回 true(即返回值为 UNAVAILABLE(-2)或 INTERRUPTED(-3)),且当前是阻塞模式,则调用 park(Net.POLLIN) 将当前线程挂起,等待该 socket 有数据可读时被唤醒,然后再次尝试读取。
- 非阻塞模式:不会进入 while 循环,IOUtil.read 返回 -2 后直接向上层返回,由上层(如 Selector)决定如何处理。
park(Net.POLLIN) 本身是一个 Java 方法;在 Linux 下对于单个 fd 的阻塞等待,它会通过 native 方法(Net.poll)最终调用 poll 或 ppoll 系统调用,将当前线程挂起。
3.2 JNI 本地方法层:SocketDispatcher.read0
IOUtil.read 本身是 Java 方法,它通过传入的 NativeDispatcher(此处为 SocketDispatcher)调用 native 方法 read0,其实现位于 src/java.base/unix/native/libnio/ch/SocketDispatcher.c:
c
scss
JNIEXPORT jlong JNICALL
Java_sun_nio_ch_SocketDispatcher_read0(JNIEnv *env, jobject this, jobject fdo,
jlong address, jint len)
{
jint fd = fdval(env, fdo); // 从 FileDescriptor 对象中取出 int 类型的 fd
void *buf = (void *)jlong_to_ptr(address); // 获取 DirectBuffer 的内存地址
// ★ 核心:调用 NET_Read 宏,最终触发 recv 系统调用
return convertReturnVal(env, NET_Read(fd, buf, len), JNI_FALSE);
}
NET_Read 定义在 net_util_md.h 中:
c
scss
#define NET_Read(fd, buf, n) recv(fd, buf, n, 0)
recv 是 POSIX 标准函数,在 glibc 中会触发 sys_recvfrom 系统调用,陷入内核。
3.3 Linux 内核系统调用层:__sys_recvfrom
recv 系统调用在内核中的入口是 SYSCALL_DEFINE6(recvfrom),定义在 net/socket.c 中:
c
arduino
// net/socket.c
SYSCALL_DEFINE6(recvfrom, int, fd, void __user *, ubuf, size_t, size,
unsigned int, flags, struct sockaddr __user *, addr,
int __user *, addr_len)
{
return __sys_recvfrom(fd, ubuf, size, flags, addr, addr_len);
}
__sys_recvfrom 是真正的执行者:
c
ini
int __sys_recvfrom(int fd, void __user *ubuf, size_t size, unsigned int flags,
struct sockaddr __user *addr, int __user *addr_len)
{
struct sockaddr_storage address;
struct msghdr msg = {
.msg_name = addr ? (struct sockaddr *)&address : NULL,
};
struct socket *sock;
int err, err2;
int fput_needed;
// 1. 将用户空间缓冲区信息导入内核
err = import_ubuf(ITER_DEST, ubuf, size, &msg.msg_iter);
if (unlikely(err))
return err;
// 2. 通过 fd 查找对应的 socket 对象
sock = sockfd_lookup_light(fd, &err, &fput_needed);
if (!sock)
goto out;
// 3. 如果 socket 是非阻塞的,设置 MSG_DONTWAIT 标志
if (sock->file->f_flags & O_NONBLOCK)
flags |= MSG_DONTWAIT;
// 4. ★ 核心:调用 sock_recvmsg 进行实际的数据接收
err = sock_recvmsg(sock, &msg, flags);
// 5. 如果需要返回发送方地址,拷贝回用户空间
if (err >= 0 && addr != NULL) {
err2 = move_addr_to_user(&address, msg.msg_namelen, addr, addr_len);
if (err2 < 0)
err = err2;
}
fput_light(sock->file, fput_needed);
out:
return err;
}
import_ubuf 函数将用户态传入的缓冲区地址和长度封装成内核的 struct iov_iter 结构体,这个结构体是后续 tcp_recvmsg 将数据从内核拷贝到用户空间的关键。
sockfd_lookup_light 通过 fd 从当前进程的文件描述符表中查找对应的 struct socket 对象:
text
scss
当前进程 (current)
│
└── files_struct (进程的文件描述符表)
│
└── fd_array[fd] // fd 作为下标
│
▼
struct file *file
│
│ (file->private_data)
▼
struct socket *sock // ★ 拿到了 TCP Socket 的内核对象
3.4 Socket 抽象层与 INET 层:协议多态分发
sock_recvmsg 是 Socket 抽象层的核心函数:
c
arduino
// net/socket.c
int sock_recvmsg(struct socket *sock, struct msghdr *msg, unsigned int flags)
{
// ★ 多态调用:根据 socket 类型调用不同的 recvmsg 实现
return sock->ops->recvmsg(sock, msg, msg_data_left(msg), flags);
}
对于 TCP Socket,sock->ops 指向 inet_stream_ops,因此这里会调用 inet_recvmsg:
c
arduino
// net/ipv4/af_inet.c
int inet_recvmsg(struct socket *sock, struct msghdr *msg, size_t size,
int flags)
{
struct sock *sk = sock->sk;
// ★ 再次多态:调用传输层协议的处理函数
return sk->sk_prot->recvmsg(sk, msg, size, flags, NULL);
}
对于 TCP,sk->sk_prot 指向 tcp_prot,因此最终调用 tcp_recvmsg。
3.5 TCP 层:tcp_recvmsg 与 tcp_recvmsg_locked
tcp_prot 是 TCP 协议的方法函数表:
c
ini
struct proto tcp_prot = {
.name = "TCP",
.close = tcp_close,
.connect = tcp_v4_connect,
.accept = inet_csk_accept,
.recvmsg = tcp_recvmsg, // ★ 数据接收
.sendmsg = tcp_sendmsg, // ★ 数据发送
.backlog_rcv = tcp_v4_do_rcv, // ★ 处理 backlog 队列(socket 被用户进程持锁期间到达的包,在释放锁时处理)
// ...
};
tcp_recvmsg 是 tcp_prot 中的 .recvmsg 实现:
c
scss
// net/ipv4/tcp.c
int tcp_recvmsg(struct sock *sk, struct msghdr *msg, size_t len, int flags,
int *addr_len)
{
int cmsg_flags = 0, ret;
struct scm_timestamping_internal tss;
if (unlikely(flags & MSG_ERRQUEUE))
return inet_recv_error(sk, msg, len, addr_len);
// 忙轮询(busy polling)优化(用于低延迟场景)
if (sk_can_busy_loop(sk) &&
skb_queue_empty_lockless(&sk->sk_receive_queue) &&
sk->sk_state == TCP_ESTABLISHED)
sk_busy_loop(sk, flags & MSG_DONTWAIT);
// ★ 加锁,调用加锁版本的接收函数
lock_sock(sk);
ret = tcp_recvmsg_locked(sk, msg, len, flags, &tss, &cmsg_flags);
release_sock(sk);
// 处理时间戳等附属信息
if ((cmsg_flags || msg->msg_get_inq) && ret >= 0) {
// ...
}
return ret;
}
tcp_recvmsg 的主要职责是加锁 和处理附属信息 (如时间戳),真正的"取数据"和"拷数据"发生在 tcp_recvmsg_locked 中。
3.6 tcp_recvmsg_locked:取数据与拷数据的核心
tcp_recvmsg_locked 是 TCP 数据接收的核心实现:
c
arduino
static int tcp_recvmsg_locked(struct sock *sk, struct msghdr *msg, size_t len,
int flags, struct scm_timestamping_internal *tss,
int *cmsg_flags)
{
struct tcp_sock *tp = tcp_sk(sk);
int copied = 0;
u32 *seq;
struct sk_buff *skb, *last;
// ...
do {
u32 offset;
// ★ 遍历接收队列,查找包含待读取数据的 sk_buff
last = skb_peek_tail(&sk->sk_receive_queue);
skb_queue_walk(&sk->sk_receive_queue, skb) {
last = skb;
// ...
offset = *seq - TCP_SKB_CB(skb)->seq;
if (offset < skb->len)
goto found_ok_skb; // ★ 找到了包含数据的 skb
// ...
}
// ...
} while (len > 0);
// ...
}
skb_queue_walk 是一个宏,它从队列头部开始遍历,每次循环将 skb 指针指向当前的 sk_buff。这一步只是"取出来看",并没有从队列中删除它。
当找到包含数据的 sk_buff 后,代码跳转到 found_ok_skb 标签:
c
ini
found_ok_skb:
/* Ok so how much can we use? */
used = skb->len - offset;
if (len < used)
used = len;
// ★★★ 核心:将内核 skb 中的数据拷贝到用户空间 ★★★
if (!(flags & MSG_TRUNC)) {
err = skb_copy_datagram_msg(skb, offset, msg, used);
if (err) {
if (!copied)
copied = -EFAULT;
break;
}
}
WRITE_ONCE(*seq, *seq + used);
copied += used;
len -= used;
// ...
skb_copy_datagram_msg 是数据拷贝的核心函数。它内部会根据 msg->msg_iter 的类型(ITER_UBUF 或 ITER_IOVEC)选择正确的拷贝方式,将内核 sk_buff 中的数据直接拷贝到用户态内存。
当数据被完全读取后,从队列中正式移除:
c
scss
if (used + offset < skb->len)
continue; // 只读了部分数据,skb 保留在队列中
if (TCP_SKB_CB(skb)->tcp_flags & TCPHDR_FIN)
goto found_fin_ok;
if (!(flags & MSG_PEEK))
tcp_eat_recv_skb(sk, skb); // ★ 从接收队列中正式移除 skb
continue;
found_fin_ok:
/* Process the FIN. */
WRITE_ONCE(*seq, *seq + 1);
if (!(flags & MSG_PEEK))
tcp_eat_recv_skb(sk, skb); // ★ 处理 FIN 包时也移除
break;
tcp_eat_recv_skb 内部封装了 __skb_unlink(skb, &sk->sk_receive_queue),这才是真正把 sk_buff 从 sk_receive_queue 链表中摘除并释放的动作。
重要细节 :如果 used + offset < skb->len(即只读取了 skb 的一部分),代码会 continue,不会调用 tcp_eat_recv_skb 。这意味着同一个 sk_buff 会保留在队列中,下次读取时 offset 后移,继续从同一个 skb 中取数据。这正是 TCP 流式读取的体现。
4. 第二部分:事件通知路径 —— epoll 与 socket 的关联
理解了数据读取路径后,我们需要回答一个核心问题:epoll 是如何知道 socket 有数据可读的?
答案在于 Linux 内核的等待队列(Wait Queue) 机制。socket 和 epoll 通过 sk_wq 这个数据结构建立了关联。
4.1 socket 的等待队列 —— sk_wq
每个 struct sock 都包含一个等待队列头:
c
arduino
struct sock {
// ...
struct socket_wq *sk_wq; // ★ 等待队列头
// ...
};
sock_def_readable 是 socket 的默认数据就绪回调函数:
c
scss
void sock_def_readable(struct sock *sk)
{
struct socket_wq *wq;
rcu_read_lock();
wq = rcu_dereference(sk->sk_wq);
if (skwq_has_sleeper(wq))
// ★ 唤醒等待队列上的所有等待者
wake_up_interruptible_sync_poll(&wq->wait, EPOLLIN | EPOLLPRI | ...);
rcu_read_unlock();
}
当数据到达时,软中断处理函数会调用 sk->sk_data_ready(sk),即 sock_def_readable,从而唤醒等待队列上的所有等待者。
4.2 epoll 的核心数据结构
在深入源码之前,先了解 epoll 的三个核心数据结构:
① struct eventpoll :每个 epoll 实例对应一个 eventpoll 结构。
- rbr:红黑树根节点,用于快速查找被监控的文件描述符。
- rdllist:就绪链表,存放已经就绪的 epitem。
- wq:等待队列,存放调用 epoll_wait 而阻塞的进程。
② struct epitem :每个被监控的文件描述符对应一个 epitem。
- rbn:红黑树节点,用于插入 ep->rbr。
- rdllink:链表节点,用于链入 ep->rdllist(就绪列表)。
- ffd:记录被监控的 struct file 指针及其 fd 编号(struct epoll_filefd)。
- event:用户注册的事件(EPOLLIN、EPOLLOUT 等)。
③ struct eppoll_entry :连接 epitem 和目标文件等待队列的桥梁。
- wait:等待队列项,func 指向 ep_poll_callback。
- whead:指向目标文件的等待队列头(如 socket 的 sk_wq 中的 wait)。
- base:指回 epitem。
4.3 关联阶段:epoll_ctl(EPOLL_CTL_ADD) 时
当用户调用 epoll_ctl(EPOLL_CTL_ADD, socket_fd, ...) 时,内核执行 do_epoll_ctl → ep_insert。
在 ep_insert 中,初始化 poll_table,将 qproc 指向 ep_ptable_queue_proc:
c
csharp
static int ep_insert(struct eventpoll *ep, const struct epoll_event *event,
struct file *tfile, int fd, int full_check)
{
// ...
struct ep_pqueue epq;
epq.epi = epi;
init_poll_funcptr(&epq.pt, ep_ptable_queue_proc); // ★ 设置回调函数
// ★ 调用文件对象的 poll 方法,触发 ep_ptable_queue_proc
revents = ep_item_poll(epi, &epq.pt, 1);
// ...
}
ep_item_poll 调用 vfs_poll,对于 socket 文件,最终调用 tcp_poll,而 tcp_poll 内部调用 poll_wait,触发 ep_ptable_queue_proc。
ep_ptable_queue_proc 是关联的核心:
c
ini
static void ep_ptable_queue_proc(struct file *file, wait_queue_head_t *whead,
poll_table *pt)
{
struct ep_pqueue *epq = container_of(pt, struct ep_pqueue, pt);
struct epitem *epi = epq->epi;
struct eppoll_entry *pwq;
pwq = kmem_cache_alloc(pwq_cache, GFP_KERNEL);
// ★★★ 关键 1:将 epoll 的回调函数设置为等待项的 func ★★★
init_waitqueue_func_entry(&pwq->wait, ep_poll_callback);
pwq->whead = whead; // whead 就是 socket 的 sk_wq
pwq->base = epi;
// ★★★ 关键 2:将 epoll 等待项加入到 socket 的等待队列中 ★★★
add_wait_queue(whead, &pwq->wait);
pwq->next = epi->pwqlist;
epi->pwqlist = pwq;
}
这段代码完成了 epoll 与 socket 的"物理关联" :
- init_waitqueue_func_entry 将等待项的 func 指针设置为 ep_poll_callback。
- add_wait_queue 将这个等待项加入到 socket 的 sk_wq 等待队列中。
从此,socket 的 sk_wq 队列上多了一个属于 epoll 的节点,其 func 指向 ep_poll_callback。
4.4 触发阶段:数据到达时
当数据包到达 socket 时,完整的唤醒流程如下:
① 软中断处理 :tcp_v4_rcv → tcp_v4_do_rcv → tcp_rcv_established → tcp_data_queue → 数据入队。
② 调用数据就绪回调 :tcp_data_ready(sk) → sk->sk_data_ready(sk)(即 sock_def_readable)。
③ 唤醒等待队列 :sock_def_readable 调用 wake_up_interruptible_sync_poll(&sk_wq->wait, EPOLLIN)。
④ 遍历等待队列 :wake_up 遍历 sk_wq 链表上的所有等待项:
- 对于普通阻塞读的进程:执行 woken_wake_function,唤醒该进程。
- 对于 epoll 注册的等待项:执行 ep_poll_callback。
4.5 ep_poll_callback —— epoll 被唤醒
ep_poll_callback 是 epoll 被 socket 数据到达事件唤醒时的回调函数:
c
scss
static int ep_poll_callback(wait_queue_entry_t *wait, unsigned mode, int sync, void *key)
{
struct epitem *epi = ep_item_from_wait(wait);
struct eventpoll *ep = epi->ep;
__poll_t pollflags = key_to_poll(key);
// 1. 检查事件是否匹配
if (pollflags && !(pollflags & epi->event.events))
goto out_unlock;
// 2. ★ 将 epitem 加入到 epoll 的就绪链表 (rdllist)
if (READ_ONCE(ep->ovflist) != EP_UNACTIVE_PTR) {
// 如果正在向用户空间传递事件,加入 overflow 链表
if (chain_epi_lockless(epi))
ep_pm_stay_awake_rcu(epi);
} else if (!ep_is_linked(epi)) {
list_add_tail_lockless(&epi->rdllink, &ep->rdllist);
ep_pm_stay_awake_rcu(epi);
}
// 3. ★ 唤醒正在 epoll_wait 上阻塞的进程
if (waitqueue_active(&ep->wq))
wake_up(&ep->wq);
// ...
}
ep_poll_callback 做了三件关键的事情:
- 事件匹配检查:检查 key 中携带的 pollflags 是否与用户注册的 epi->event.events 匹配。
- 加入就绪链表:将对应的 epitem 加入到 epoll 实例的就绪链表(rdllist)中。
- 唤醒 epoll_wait:调用 wake_up(&ep->wq),唤醒正在 epoll_wait 系统调用上阻塞的用户进程。
4.6 epoll_wait 的唤醒与事件返回
用户进程调用 epoll_wait 时,内核进入 do_epoll_wait → ep_poll:
c
scss
static int ep_poll(struct eventpoll *ep, struct epoll_event __user *events,
int maxevents, struct timespec64 *timeout)
{
// ...
while (1) {
if (eavail) {
// ★ 有就绪事件,调用 ep_send_events 传递到用户空间
res = ep_send_events(ep, events, maxevents);
if (res)
return res;
}
// 没有就绪事件,进入睡眠
init_wait(&wait);
wait.func = ep_autoremove_wake_function;
__set_current_state(TASK_INTERRUPTIBLE);
if (!eavail)
__add_wait_queue_exclusive(&ep->wq, &wait);
// ★ 睡眠,等待被 ep_poll_callback 唤醒
timed_out = !schedule_hrtimeout_range(to, slack, HRTIMER_MODE_ABS);
// ...
}
}
当 ep_poll_callback 调用 wake_up(&ep->wq) 后:
- ep_poll 中被唤醒的进程继续执行。
- 重新检查 eavail,发现 rdllist 非空。
- 调用 ep_send_events,遍历 rdllist,将就绪事件拷贝到用户空间。
ep_send_events 的核心逻辑:
c
scss
static int ep_send_events(struct eventpoll *ep,
struct epoll_event __user *events, int maxevents)
{
// 从 rdllist 取出就绪的 epitem
list_for_each_entry_safe(epi, tmp, &txlist, rdllink) {
// 调用 ep_item_poll 获取当前事件
revents = ep_item_poll(epi, &pt, 1);
if (!revents)
continue;
// ★ 将事件写入用户空间
events = epoll_put_uevent(revents, epi->event.data, events);
res++;
// LT 模式:重新插入就绪链表
if (!(epi->event.events & EPOLLET))
list_add_tail(&epi->rdllink, &ep->rdllist);
// ET 模式:不重新插入
}
return res;
}
LT(水平触发)与 ET(边缘触发)的区别在此体现:
- LT 模式:事件处理后,epitem 重新插入 rdllist,下次 epoll_wait 会再次报告。
- ET 模式:事件处理后,epitem 从 rdllist 移除,直到有新事件到来才会再次加入。
5. 第三部分:JVM 层面的桥梁 —— SelectionKey
在 Java NIO 中,SelectionKey 是连接 SelectableChannel 和 Selector 的纽带。
5.1 JVM 内部的 fd 到 SelectionKey 映射
sun.nio.ch.EPollSelectorImpl 内部维护了一个 fdToKey 映射表:
java
scala
// sun.nio.ch.EPollSelectorImpl
class EPollSelectorImpl extends SelectorImpl {
// fd 到 SelectionKey 的映射
private Map<Integer, SelectionKeyImpl> fdToKey = new HashMap<>();
// ...
}
当 SocketChannel.register(selector, OP_READ) 被调用时:
- JNI 层调用 epoll_ctl(EPOLL_CTL_ADD, fd, EPOLLIN) 将 fd 注册到 epoll 实例(在较新的 JDK 中,这一步会推迟到下一次 select() 处理更新队列时才真正执行)。
- JVM 在 fdToKey 映射表中记录 fd → SelectionKeyImpl 的对应关系。
5.2 epoll_wait 返回后的处理
当 EPollArrayWrapper.poll()(封装了 epoll_wait 系统调用)返回后:
- 内核返回就绪的 fd 列表(即被加入 rdllist 的那些 epitem 对应的 fd)。
- JVM 遍历这个 fd 列表,对每个 fd:
  - 在 fdToKey 映射表中查找对应的 SelectionKeyImpl。
  - 将内核返回的 EPOLLIN/EPOLLOUT 等事件转换为 Java 的 OP_READ/OP_WRITE。
  - 设置 SelectionKeyImpl.readyOps = 就绪事件。
- 应用程序通过 key.isReadable() 检查就绪事件,然后调用 SocketChannel.read() 读取数据。
5.3 SelectionKey 的核心字段
java
scala
// sun.nio.ch.SelectionKeyImpl
class SelectionKeyImpl extends SelectionKey {
private int interestOps; // 用户感兴趣的事件(OP_READ, OP_WRITE...)
private int readyOps; // ★ JVM 根据内核返回的事件填充的就绪事件
private Object attachment; // 用户挂载的业务对象
private SelChImpl channel; // 指向 SocketChannel
// ...
}
关键点 :readyOps 字段是 JVM 从内核获取就绪事件后填充的,应用程序通过 key.isReadable() 等便捷方法检查的就是这个字段。
6. 数据结构全景图
下图展示了从用户态到内核态的完整数据结构关联:
text
arduino
┌─────────────────────────────────────────────────────────────────────────────┐
│ 【用户态】 │
├─────────────────────────────────────────────────────────────────────────────┤
│ ┌──────────────────────────────────────────────────────────────────┐ │
│ │ sun.nio.ch.SelectionKeyImpl │ │
│ │ ┌────────────────────────────────────────────────────────────┐ │ │
│ │ │ int interestOps = OP_READ; │ │ │
│ │ │ int readyOps = 0; ← 内核返回后由 JVM 填充 │ │ │
│ │ │ Object attachment; ← 用户挂载的业务对象 │ │ │
│ │ │ SelChImpl channel; ← 指向 SocketChannel │ │ │
│ │ └────────────────────────────────────────────────────────────┘ │ │
│ └──────────────────────────┬───────────────────────────────────────┘ │
│ │ (持有) │
│ ▼ │
│ ┌──────────────────────────────────────────────────────────────────┐ │
│ │ java.io.FileDescriptor │ │
│ │ ┌────────────────────────────────────────────────────────────┐ │ │
│ │ │ int fd = 0x1a; ← Linux 文件描述符编号 │ │ │
│ │ └────────────────────────────────────────────────────────────┘ │ │
│ └──────────────────────────────────────────────────────────────────┘ │
│ │
│ ┌──────────────────────────────────────────────────────────────────┐ │
│ │ EPollSelectorImpl (JVM 内部) │ │
│ │ Map<Integer, SelectionKeyImpl> fdToKey = {0x1a → key} │ │
│ └──────────────────────────────────────────────────────────────────┘ │
└─────────────────────────────────────────────────────────────────────────────┘
│
syscall (epoll_ctl / epoll_wait / recv)
│
════════════════════════════════════╪═════════════════════════════════════════
│
┌───────────────────────────────────┼─────────────────────────────────────────┐
│ 【内核态】 │
│ │ │
│ ┌───────────────────────────────┼───────────────────────────────────┐ │
│ │ struct eventpoll (epoll 实例) │ │
│ │ ┌─────────────────────────────────────────────────────────────┐ │ │
│ │ │ struct rb_root rbr; // 红黑树根 → 用于查找 fd │ │ │
│ │ │ struct list_head rdllist; // 就绪链表 (有数据的fd) ←────┼──┘ │
│ │ │ struct wait_queue_head wq; // epoll_wait 等待队列 │ │ │
│ │ └─────────────────────────────────────────────────────────────┘ │ │
│ └───────────────────────────────┬───────────────────────────────────┘ │
│ │ 红黑树节点 │
│ ▼ │
│ ┌───────────────────────────────────────────────────────────────────┐ │
│ │ struct epitem (每个 fd 一个) │ │
│ │ ┌─────────────────────────────────────────────────────────────┐ │ │
│ │ │ struct rb_node rbn; // 红黑树节点 │ │ │
│ │ │ struct list_head rdllink; // 就绪链表节点 (可入链) ←─────┼──┘ │
│ │ │ struct epoll_event event; // {events, data} 用户事件 │ │ │
│ │ │ struct epoll_filefd ffd; // 被监控的文件对象 (socket) 指针及其 fd │ │ │
│ │ │ struct eppoll_entry *pwqlist; // 等待队列项链表 │ │ │
│ │ └─────────────────────────────────────────────────────────────┘ │ │
│ └───────────────────────────────┬───────────────────────────────────┘ │
│ │ pwqlist │
│ ▼ │
│ ┌───────────────────────────────────────────────────────────────────┐ │
│ │ struct eppoll_entry (连接的桥梁) │ │
│ │ ┌─────────────────────────────────────────────────────────────┐ │ │
│ │ │ wait_queue_entry_t wait; │ │ │
│ │ │ └── wait_queue_func_t func = ep_poll_callback; │ │ │
│ │ │ wait_queue_head_t *whead; // 指向 socket->sk_wq │ │ │
│ │ │ struct epitem *base; // 指回 epitem │ │ │
│ │ └─────────────────────────────────────────────────────────────┘ │ │
│ └───────────────────────────────┬───────────────────────────────────┘ │
│ │ 加入 socket->sk_wq │
│ ▼ │
│ ┌───────────────────────────────────────────────────────────────────┐ │
│ │ struct sock (TCP Socket 内核对象) │ │
│ │ ┌─────────────────────────────────────────────────────────────┐ │ │
│ │ │ struct socket_wq *sk_wq; // ★ 等待队列头 │ │ │
│ │ │ └── wait_queue_head_t wait; (链表头) │ │ │
│ │ │ ▲ │ │ │
│ │ │ └── 链表包含 eppoll_entry │ │ │
│ │ │ struct sk_buff_head sk_receive_queue; // ★ 数据队列 │ │ │
│ │ │ void (*sk_data_ready)(struct sock *sk) │ │ │
│ │ │ = sock_def_readable; // ★ 数据到达回调函数 │ │ │
│ │ │ struct proto *sk_prot = &tcp_prot; // TCP 操作函数表 │ │ │
│ │ └─────────────────────────────────────────────────────────────┘ │ │
│ └───────────────────────────────────────────────────────────────────┘ │
└─────────────────────────────────────────────────────────────────────────────┘
7. 两条链路的交汇点
数据读取路径和事件通知路径在 socket 的等待队列(sk_wq) 处交汇:
| 路径 | 方向 | 关键操作 | 涉及函数 |
|---|---|---|---|
| 事件通知路径 | socket → epoll | 数据到达时,通过 sk_wq 唤醒 ep_poll_callback | sock_def_readable → wake_up(&sk_wq) → ep_poll_callback |
| 数据读取路径 | 用户 → socket | 用户调用 read,从 sk_receive_queue 取数据 | tcp_recvmsg → skb_queue_walk → skb_copy_datagram_msg |
核心洞察 :socket 的 sk_wq 是"门铃",sk_receive_queue 是"信箱"。epoll 通过把自己的回调函数挂在"门铃"上,实现了事件通知;而 tcp_recvmsg 通过访问"信箱"实现了数据读取。
8. 总结
从 Java 的 SocketChannel.read() 到 Linux 内核的 tcp_recvmsg,从 epoll_ctl 注册到 ep_poll_callback 被触发,我们追踪了两条跨越多个层次的完整调用链路:
8.1 数据读取路径
| 层次 | 关键函数/组件 | 作用 |
|---|---|---|
| Java 应用层 | SocketChannel.read() | 发起读操作 |
| Java NIO 层 | IOUtil.read() / park(Net.POLLIN) | native 调用 / 阻塞等待 |
| JNI 层 | SocketDispatcher.read0() | 调用 recv 系统调用 |
| 系统调用层 | __sys_recvfrom() | 根据 fd 查找 socket,准备 msg |
| Socket 抽象层 | sock_recvmsg() → inet_recvmsg() | 协议多态分发 |
| TCP 层 | tcp_recvmsg() → tcp_recvmsg_locked() | 取数据、拷数据、移除数据 |
| 数据拷贝 | skb_copy_datagram_msg() | 从内核 sk_buff 拷贝到用户空间 |
8.2 事件通知路径
| 阶段 | 关键函数 | 作用 |
|---|---|---|
| 注册关联 | ep_insert() → ep_ptable_queue_proc() | 将 ep_poll_callback 挂载到 socket 的 sk_wq |
| 事件触发 | sock_def_readable() | 数据到达时唤醒 sk_wq 等待队列 |
| 回调执行 | ep_poll_callback() | 将 epitem 加入就绪链表,唤醒 epoll_wait |
| 事件返回 | ep_send_events() | 遍历就绪链表,将事件拷贝到用户空间 |
| JVM 映射 | EPollSelectorImpl.updateSelectedKeys() | 将 fd 映射为 SelectionKey,填充 readyOps |
8.3 核心设计思想
- 等待队列解耦:socket 不关心谁在等待它,只负责在数据就绪时唤醒 sk_wq 队列上的所有等待者。epoll 通过将自己伪装成一个"等待者"挂载到 sk_wq 上,实现了与 socket 的解耦关联。
- 红黑树 + 就绪链表:epoll 使用红黑树(rbr)管理所有被监控的 fd,实现 O(log N) 的查找、插入、删除;使用就绪链表(rdllist)管理就绪的 fd,实现 O(1) 的事件获取。
- 协议多态:通过 sock->ops 和 sk->sk_prot 两层级的多态函数指针,实现了从 Socket 抽象层到具体传输层协议(TCP/UDP)的灵活分发。
- 流式读取:TCP 层通过保留未读完的 sk_buff 在队列中并记录偏移量,实现了对用户透明的字节流语义。
理解这两条完整的调用链路,有助于我们在编写高性能网络程序时做出更优的设计决策,也能在排查性能问题、分析网络异常时快速定位问题所在。
#源码
ini
/*
 * Socket-layer operation table (struct proto_ops) for PF_INET stream sockets.
 * Each member is a function pointer the generic socket code dispatches
 * through; for example .recvmsg resolves to inet_recvmsg() and .poll to
 * tcp_poll(). Some entries exist only under the given config options.
 */
const struct proto_ops inet_stream_ops = {
.family = PF_INET,
.owner = THIS_MODULE,
.release = inet_release,
.bind = inet_bind,
.connect = inet_stream_connect,
.socketpair = sock_no_socketpair,
.accept = inet_accept,
.getname = inet_getname,
/* Readiness query used by poll/epoll on this socket. */
.poll = tcp_poll,
.ioctl = inet_ioctl,
.gettstamp = sock_gettstamp,
.listen = inet_listen,
.shutdown = inet_shutdown,
.setsockopt = sock_common_setsockopt,
.getsockopt = sock_common_getsockopt,
.sendmsg = inet_sendmsg,
/* Receive entry point reached from sock_recvmsg_nosec(). */
.recvmsg = inet_recvmsg,
#ifdef CONFIG_MMU
.mmap = tcp_mmap,
#endif
.splice_eof = inet_splice_eof,
.splice_read = tcp_splice_read,
.read_sock = tcp_read_sock,
.read_skb = tcp_read_skb,
.sendmsg_locked = tcp_sendmsg_locked,
.peek_len = tcp_peek_len,
#ifdef CONFIG_COMPAT
.compat_ioctl = inet_compat_ioctl,
#endif
.set_rcvlowat = tcp_set_rcvlowat,
};
EXPORT_SYMBOL(inet_stream_ops);
/*
 * recvfrom system call entry point. It performs no work of its own and
 * forwards all six arguments unchanged to __sys_recvfrom(), whose result
 * is returned to the caller.
 */
SYSCALL_DEFINE6(recvfrom, int, fd, void __user *, ubuf, size_t, size,
unsigned int, flags, struct sockaddr __user *, addr,
int __user *, addr_len)
{
return __sys_recvfrom(fd, ubuf, size, flags, addr, addr_len);
}
/*
 * Receive a frame from the socket and optionally record the address of the
 * sender. We verify the buffers are writable and if needed move the
 * sender address from kernel to user space.
 *
 * @fd:       descriptor used to look up the socket
 * @ubuf:     user-space destination buffer
 * @size:     capacity of @ubuf in bytes
 * @flags:    receive flags; MSG_DONTWAIT is added for O_NONBLOCK files
 * @addr:     optional user buffer for the sender address (may be NULL)
 * @addr_len: user pointer passed to move_addr_to_user() with @addr
 *
 * Returns the value from sock_recvmsg() on success, or a negative error
 * from buffer import, socket lookup, receive or address copy-out.
 */
int __sys_recvfrom(int fd, void __user *ubuf, size_t size, unsigned int flags,
struct sockaddr __user *addr, int __user *addr_len)
{
struct sockaddr_storage address;
struct msghdr msg = {
/* Save some cycles and don't copy the address if not needed */
.msg_name = addr ? (struct sockaddr *)&address : NULL,
};
struct socket *sock;
int err, err2;
int fput_needed;
/* Describe the user buffer as the destination iterator of msg. */
err = import_ubuf(ITER_DEST, ubuf, size, &msg.msg_iter);
if (unlikely(err))
return err;
/* On failure err is expected to have been set through the &err argument. */
sock = sockfd_lookup_light(fd, &err, &fput_needed);
if (!sock)
goto out;
/* A non-blocking file turns every receive into a MSG_DONTWAIT receive. */
if (sock->file->f_flags & O_NONBLOCK)
flags |= MSG_DONTWAIT;
err = sock_recvmsg(sock, &msg, flags);
/* Copy the sender address out only after a successful receive. */
if (err >= 0 && addr != NULL) {
err2 = move_addr_to_user(&address,
msg.msg_namelen, addr, addr_len);
/* A failed address copy-out overrides the receive result. */
if (err2 < 0)
err = err2;
}
/* Pairs with sockfd_lookup_light() above. */
fput_light(sock->file, fput_needed);
out:
return err;
}
/**
* sock_recvmsg - receive a message from @sock
* @sock: socket
* @msg: message to receive
* @flags: message flags
*
* Receives @msg from @sock, passing through LSM. Returns the total number
* of bytes received, or an error.
*/
int sock_recvmsg(struct socket *sock, struct msghdr *msg, int flags)
{
int err = security_socket_recvmsg(sock, msg, msg_data_left(msg), flags);
return err ?: sock_recvmsg_nosec(sock, msg, flags);
}
EXPORT_SYMBOL(sock_recvmsg);
/*
 * Receive without the security check: dispatch to the socket's
 * ops->recvmsg handler with the remaining space in @msg as the length,
 * then emit the receive-length tracepoint if it is enabled.
 * Returns whatever the recvmsg handler returned.
 */
static inline int sock_recvmsg_nosec(struct socket *sock, struct msghdr *msg,
int flags)
{
/*
 * NOTE(review): INDIRECT_CALL_INET presumably turns the indirect call
 * into a direct one when the target is inet6_recvmsg or inet_recvmsg -
 * confirm against the macro definition.
 */
int ret = INDIRECT_CALL_INET(READ_ONCE(sock->ops)->recvmsg,
inet6_recvmsg,
inet_recvmsg, sock, msg,
msg_data_left(msg), flags);
if (trace_sock_recv_length_enabled())
call_trace_sock_recv_length(sock->sk, ret, flags);
return ret;
}
/*
 * INET-layer receive: hand the request to the transport protocol's
 * recvmsg handler (sk->sk_prot->recvmsg) and, on success, store the
 * address length it reported in msg->msg_namelen.
 *
 * @sock:  socket to receive from
 * @msg:   message header describing the destination
 * @size:  maximum number of bytes to receive
 * @flags: receive flags
 *
 * Returns the protocol handler's result (negative on error).
 */
int inet_recvmsg(struct socket *sock, struct msghdr *msg, size_t size,
int flags)
{
struct sock *sk = sock->sk;
int addr_len = 0;
int err;
/* Skipped when the caller is reading the error queue. */
if (likely(!(flags & MSG_ERRQUEUE)))
sock_rps_record_flow(sk);
/* tcp_recvmsg and udp_recvmsg are the two candidates named for the call. */
err = INDIRECT_CALL_2(sk->sk_prot->recvmsg, tcp_recvmsg, udp_recvmsg,
sk, msg, size, flags, &addr_len);
if (err >= 0)
msg->msg_namelen = addr_len;
return err;
}
EXPORT_SYMBOL(inet_recvmsg);
/*
 * TCP recvmsg entry point. Handles the MSG_ERRQUEUE case, optionally
 * busy-loops while the receive queue is empty, then runs
 * tcp_recvmsg_locked() with the socket locked and finally delivers the
 * ancillary data (timestamps, in-queue hint) that call requested.
 *
 * @sk:       socket to read from
 * @msg:      message header describing the destination
 * @len:      maximum number of bytes to receive
 * @flags:    receive flags
 * @addr_len: only forwarded to inet_recv_error() for MSG_ERRQUEUE
 *
 * Returns the result of tcp_recvmsg_locked() (or of inet_recv_error()).
 */
int tcp_recvmsg(struct sock *sk, struct msghdr *msg, size_t len, int flags,
int *addr_len)
{
int cmsg_flags = 0, ret;
struct scm_timestamping_internal tss;
if (unlikely(flags & MSG_ERRQUEUE))
return inet_recv_error(sk, msg, len, addr_len);
/* Busy-loop only for an established socket with nothing queued yet. */
if (sk_can_busy_loop(sk) &&
skb_queue_empty_lockless(&sk->sk_receive_queue) &&
sk->sk_state == TCP_ESTABLISHED)
sk_busy_loop(sk, flags & MSG_DONTWAIT);
/* The actual dequeue and copy run with the socket lock held. */
lock_sock(sk);
ret = tcp_recvmsg_locked(sk, msg, len, flags, &tss, &cmsg_flags);
release_sock(sk);
/* Ancillary data is produced only after a non-error receive. */
if ((cmsg_flags || msg->msg_get_inq) && ret >= 0) {
if (cmsg_flags & TCP_CMSG_TS)
tcp_recv_timestamp(msg, sk, &tss);
if (msg->msg_get_inq) {
msg->msg_inq = tcp_inq_hint(sk);
if (cmsg_flags & TCP_CMSG_INQ)
put_cmsg(msg, SOL_TCP, TCP_CM_INQ,
sizeof(msg->msg_inq), &msg->msg_inq);
}
}
return ret;
}
EXPORT_SYMBOL(tcp_recvmsg);
/*
 * This routine copies from a sock struct into the user buffer.
 *
 * Technical note: in 2.3 we work on _locked_ socket, so that
 * tricks with *seq access order and skb->users are not required.
 * Probably, code can be easily improved even more.
 *
 * @sk:         socket to read from; tcp_recvmsg() calls this between
 *              lock_sock() and release_sock()
 * @msg:        destination message
 * @len:        maximum number of bytes to copy
 * @flags:      receive flags (MSG_PEEK, MSG_OOB, MSG_TRUNC, MSG_DONTWAIT,
 *              MSG_WAITALL are examined here)
 * @tss:        updated with receive timestamps of consumed skbs
 * @cmsg_flags: out: TCP_CMSG_* bits telling the caller which ancillary
 *              data to emit
 *
 * Returns the number of bytes copied, or a negative error.
 */
static int tcp_recvmsg_locked(struct sock *sk, struct msghdr *msg, size_t len,
int flags, struct scm_timestamping_internal *tss,
int *cmsg_flags)
{
struct tcp_sock *tp = tcp_sk(sk);
int copied = 0;
u32 peek_seq;
u32 *seq;
unsigned long used;
int err;
int target; /* Read at least this many bytes */
long timeo;
struct sk_buff *skb, *last;
u32 urg_hole = 0;
/* A listening socket has no data stream to read. */
err = -ENOTCONN;
if (sk->sk_state == TCP_LISTEN)
goto out;
if (tp->recvmsg_inq) {
*cmsg_flags = TCP_CMSG_INQ;
msg->msg_get_inq = 1;
}
/* NOTE(review): presumably 0 when MSG_DONTWAIT is set - confirm in sock_rcvtimeo(). */
timeo = sock_rcvtimeo(sk, flags & MSG_DONTWAIT);
/* Urgent data needs to be handled specially. */
if (flags & MSG_OOB)
goto recv_urg;
/* In repair mode only MSG_PEEK reads are permitted. */
if (unlikely(tp->repair)) {
err = -EPERM;
if (!(flags & MSG_PEEK))
goto out;
if (tp->repair_queue == TCP_SEND_QUEUE)
goto recv_sndq;
err = -EINVAL;
if (tp->repair_queue == TCP_NO_QUEUE)
goto out;
/* 'common' recv queue MSG_PEEK-ing */
}
/*
 * seq is the read cursor. When peeking it points at a local copy so
 * that tp->copied_seq itself is not advanced by this call.
 */
seq = &tp->copied_seq;
if (flags & MSG_PEEK) {
peek_seq = tp->copied_seq;
seq = &peek_seq;
}
target = sock_rcvlowat(sk, flags & MSG_WAITALL, len);
do {
u32 offset;
/* Are we at urgent data? Stop if we have read anything or have SIGURG pending. */
if (unlikely(tp->urg_data) && tp->urg_seq == *seq) {
if (copied)
break;
if (signal_pending(current)) {
copied = timeo ? sock_intr_errno(timeo) : -EAGAIN;
break;
}
}
/* Next get a buffer. */
last = skb_peek_tail(&sk->sk_receive_queue);
skb_queue_walk(&sk->sk_receive_queue, skb) {
last = skb;
/* Now that we have two receive queues this
 * shouldn't happen.
 */
if (WARN(before(*seq, TCP_SKB_CB(skb)->seq),
"TCP recvmsg seq # bug: copied %X, seq %X, rcvnxt %X, fl %X\n",
*seq, TCP_SKB_CB(skb)->seq, tp->rcv_nxt,
flags))
break;
/* Distance of the read cursor from the start of this skb. */
offset = *seq - TCP_SKB_CB(skb)->seq;
if (unlikely(TCP_SKB_CB(skb)->tcp_flags & TCPHDR_SYN)) {
pr_err_once("%s: found a SYN, please report !\n", __func__);
offset--;
}
/* The cursor falls inside this skb: it still has unread bytes. */
if (offset < skb->len)
goto found_ok_skb;
if (TCP_SKB_CB(skb)->tcp_flags & TCPHDR_FIN)
goto found_fin_ok;
WARN(!(flags & MSG_PEEK),
"TCP recvmsg seq # bug 2: copied %X, seq %X, rcvnxt %X, fl %X\n",
*seq, TCP_SKB_CB(skb)->seq, tp->rcv_nxt, flags);
}
/* Well, if we have backlog, try to process it now yet. */
if (copied >= target && !READ_ONCE(sk->sk_backlog.tail))
break;
/*
 * No readable skb was found. Decide whether to stop: with some data
 * already copied the loop just ends; with nothing copied the reason
 * is reported through the value placed in copied.
 */
if (copied) {
if (!timeo ||
sk->sk_err ||
sk->sk_state == TCP_CLOSE ||
(sk->sk_shutdown & RCV_SHUTDOWN) ||
signal_pending(current))
break;
} else {
if (sock_flag(sk, SOCK_DONE))
break;
if (sk->sk_err) {
copied = sock_error(sk);
break;
}
if (sk->sk_shutdown & RCV_SHUTDOWN)
break;
if (sk->sk_state == TCP_CLOSE) {
/* This occurs when user tries to read
 * from never connected socket.
 */
copied = -ENOTCONN;
break;
}
/* Zero timeout and nothing copied: report -EAGAIN instead of sleeping. */
if (!timeo) {
copied = -EAGAIN;
break;
}
if (signal_pending(current)) {
copied = sock_intr_errno(timeo);
break;
}
}
if (copied >= target) {
/* Do not sleep, just process backlog. */
__sk_flush_backlog(sk);
} else {
/* Not enough data yet: wait for more to arrive. */
tcp_cleanup_rbuf(sk, copied);
err = sk_wait_data(sk, &timeo, last);
if (err < 0) {
/* Prefer the byte count over the error if anything was copied. */
err = copied ? : err;
goto out;
}
}
/* While peeking, resynchronise if tp->copied_seq moved underneath us. */
if ((flags & MSG_PEEK) &&
(peek_seq - copied - urg_hole != tp->copied_seq)) {
net_dbg_ratelimited("TCP(%s:%d): Application bug, race in MSG_PEEK\n",
current->comm,
task_pid_nr(current));
peek_seq = tp->copied_seq;
}
continue;
found_ok_skb:
/* Ok so how much can we use? */
used = skb->len - offset;
if (len < used)
used = len;
/* Do we have urgent data here? */
if (unlikely(tp->urg_data)) {
u32 urg_offset = tp->urg_seq - *seq;
if (urg_offset < used) {
if (!urg_offset) {
/* Unless SOCK_URGINLINE is set, step over the urgent byte. */
if (!sock_flag(sk, SOCK_URGINLINE)) {
WRITE_ONCE(*seq, *seq + 1);
urg_hole++;
offset++;
used--;
if (!used)
goto skip_copy;
}
} else
/* Copy only up to the urgent byte in this pass. */
used = urg_offset;
}
}
/* With MSG_TRUNC the copy is skipped but the accounting below still runs. */
if (!(flags & MSG_TRUNC)) {
err = skb_copy_datagram_msg(skb, offset, msg, used);
if (err) {
/* Exception. Bailout! */
if (!copied)
copied = -EFAULT;
break;
}
}
/* Advance the read cursor and the caller-visible byte count. */
WRITE_ONCE(*seq, *seq + used);
copied += used;
len -= used;
tcp_rcv_space_adjust(sk);
skip_copy:
if (unlikely(tp->urg_data) && after(tp->copied_seq, tp->urg_seq)) {
WRITE_ONCE(tp->urg_data, 0);
tcp_fast_path_check(sk);
}
if (TCP_SKB_CB(skb)->has_rxtstamp) {
tcp_update_recv_tstamps(skb, tss);
*cmsg_flags |= TCP_CMSG_TS;
}
/* skb only partly consumed: leave it queued for the next read. */
if (used + offset < skb->len)
continue;
if (TCP_SKB_CB(skb)->tcp_flags & TCPHDR_FIN)
goto found_fin_ok;
/* Fully consumed: drop it from the receive queue unless peeking. */
if (!(flags & MSG_PEEK))
tcp_eat_recv_skb(sk, skb);
continue;
found_fin_ok:
/* Process the FIN. */
WRITE_ONCE(*seq, *seq + 1);
if (!(flags & MSG_PEEK))
tcp_eat_recv_skb(sk, skb);
break;
} while (len > 0);
/* According to UNIX98, msg_name/msg_namelen are ignored
 * on connected socket. I was just happy when found this 8) --ANK
 */
/* Clean up data we have read: This will do ACK frames. */
tcp_cleanup_rbuf(sk, copied);
return copied;
out:
return err;
/* MSG_OOB path: urgent data is read by a dedicated helper. */
recv_urg:
err = tcp_recv_urg(sk, msg, len, flags);
goto out;
/* Repair-mode peek of the send queue. */
recv_sndq:
err = tcp_peek_sndq(sk, msg, len);
goto out;
}
#源码
static
/**
 * Creates the selector: opens an epoll instance, allocates the native
 * array that receives events, and creates a non-blocking eventfd that is
 * registered with the epoll instance for wakeups.
 *
 * @param sp the provider passed to the superclass constructor
 * @throws IOException if any step fails; when the eventfd set-up fails,
 *         the poll array and the epoll descriptor are released first
 */
EPollSelectorImpl(SelectorProvider sp) throws IOException {
super(sp);
this.epfd = EPoll.create();
this.pollArrayAddress = EPoll.allocatePollArray(NUM_EPOLLEVENTS);
try {
this.eventfd = new EventFD();
// put the eventfd descriptor into non-blocking mode
IOUtil.configureBlocking(IOUtil.newFD(eventfd.efd()), false);
} catch (IOException ioe) {
// undo the two allocations made above before propagating the failure
EPoll.freePollArray(pollArrayAddress);
FileDispatcherImpl.closeIntFD(epfd);
throw ioe;
}
// register the eventfd object for wakeups
EPoll.ctl(epfd, EPOLL_CTL_ADD, eventfd.efd(), EPOLLIN);
}
/*
 * JNI implementation of sun.nio.ch.EPoll.ctl: issues epoll_ctl() on epfd
 * for the given opcode, target fd and event mask.
 *
 * Returns 0 on success, otherwise the errno value left by epoll_ctl().
 */
JNIEXPORT jint JNICALL
Java_sun_nio_ch_EPoll_ctl(JNIEnv *env, jclass clazz, jint epfd,
jint opcode, jint fd, jint events)
{
struct epoll_event event;
int res;
event.events = events;
/* The fd itself is stored as the event's user data. */
event.data.fd = fd;
res = epoll_ctl(epfd, (int)opcode, (int)fd, &event);
return (res == 0) ? 0 : errno;
}
/*
 * do_epoll_ctl - apply one EPOLL_CTL_ADD / EPOLL_CTL_DEL / EPOLL_CTL_MOD
 * operation to an eventpoll file.
 *
 * @epfd:     descriptor that must refer to an eventpoll file
 * @op:       EPOLL_CTL_ADD, EPOLL_CTL_DEL or EPOLL_CTL_MOD
 * @fd:       target descriptor being added, removed or modified
 * @epds:     event description; for ADD and MOD its ->events mask is
 *            modified in place (EPOLLERR | EPOLLHUP are OR-ed in)
 * @nonblock: forwarded to every epoll_mutex_lock() call made here
 *
 * Return: 0 on success, otherwise a negative errno:
 *   -EBADF   either descriptor could not be resolved by fdget()
 *   -EPERM   the target file fails file_can_poll()
 *   -EINVAL  epfd is not an eventpoll file, epfd and fd are the same file,
 *            an EPOLLEXCLUSIVE request is not allowed, op is unknown, or
 *            MOD targets an item registered with EPOLLEXCLUSIVE
 *   -ELOOP   ep_loop_check() rejected the nesting
 *   -EEXIST  ADD for an item that is already present
 *   -ENOENT  DEL/MOD for an item that is not present
 *   or whatever epoll_mutex_lock(), ep_insert() or ep_modify() return.
 */
int do_epoll_ctl(int epfd, int op, int fd, struct epoll_event *epds,
bool nonblock)
{
int error;
int full_check = 0;
struct fd f, tf;
struct eventpoll *ep;
struct epitem *epi;
struct eventpoll *tep = NULL;
error = -EBADF;
f = fdget(epfd);
if (!f.file)
goto error_return;
/* Get the "struct file *" for the target file */
tf = fdget(fd);
if (!tf.file)
goto error_fput;
/* The target file descriptor must support poll */
error = -EPERM;
if (!file_can_poll(tf.file))
goto error_tgt_fput;
/* Check if EPOLLWAKEUP is allowed */
if (ep_op_has_event(op))
ep_take_care_of_epollwakeup(epds);
/*
 * We have to check that the file structure underneath the file descriptor
 * the user passed to us _is_ an eventpoll file. And also we do not permit
 * adding an epoll file descriptor inside itself.
 */
error = -EINVAL;
if (f.file == tf.file || !is_file_epoll(f.file))
goto error_tgt_fput;
/*
 * epoll adds to the wakeup queue at EPOLL_CTL_ADD time only,
 * so EPOLLEXCLUSIVE is not allowed for a EPOLL_CTL_MOD operation.
 * Also, we do not currently support nested exclusive wakeups.
 */
/* error is still -EINVAL for every rejection in this block. */
if (ep_op_has_event(op) && (epds->events & EPOLLEXCLUSIVE)) {
if (op == EPOLL_CTL_MOD)
goto error_tgt_fput;
if (op == EPOLL_CTL_ADD && (is_file_epoll(tf.file) ||
(epds->events & ~EPOLLEXCLUSIVE_OK_BITS)))
goto error_tgt_fput;
}
/*
 * At this point it is safe to assume that the "private_data" contains
 * our own data structure.
 */
ep = f.file->private_data;
/*
 * When we insert an epoll file descriptor inside another epoll file
 * descriptor, there is the chance of creating closed loops, which are
 * better be handled here, than in more critical paths. While we are
 * checking for loops we also determine the list of files reachable
 * and hang them on the tfile_check_list, so we can check that we
 * haven't created too many possible wakeup paths.
 *
 * We do not need to take the global 'epnested_mutex' on EPOLL_CTL_ADD when
 * the epoll file descriptor is attaching directly to a wakeup source,
 * unless the epoll file descriptor is nested. The purpose of taking the
 * 'epnested_mutex' on add is to prevent complex topologies such as loops and
 * deep wakeup paths from forming in parallel through multiple
 * EPOLL_CTL_ADD operations.
 */
error = epoll_mutex_lock(&ep->mtx, 0, nonblock);
if (error)
goto error_tgt_fput;
if (op == EPOLL_CTL_ADD) {
if (READ_ONCE(f.file->f_ep) || ep->gen == loop_check_gen ||
is_file_epoll(tf.file)) {
/*
 * Nested case: ep->mtx is dropped first, epnested_mutex is taken,
 * and ep->mtx is re-acquired only after the loop check below.
 */
mutex_unlock(&ep->mtx);
error = epoll_mutex_lock(&epnested_mutex, 0, nonblock);
if (error)
goto error_tgt_fput;
loop_check_gen++;
/*
 * From here on the shared exit path (error_tgt_fput) releases
 * epnested_mutex, because full_check is set.
 */
full_check = 1;
if (is_file_epoll(tf.file)) {
tep = tf.file->private_data;
error = -ELOOP;
if (ep_loop_check(ep, tep) != 0)
goto error_tgt_fput;
}
error = epoll_mutex_lock(&ep->mtx, 0, nonblock);
if (error)
goto error_tgt_fput;
}
}
/*
 * Try to lookup the file inside our RB tree. Since we grabbed "mtx"
 * above, we can be sure to be able to use the item looked up by
 * ep_find() till we release the mutex.
 */
epi = ep_find(ep, tf.file, fd);
/* Default result for an unrecognised op (no case below matches). */
error = -EINVAL;
switch (op) {
case EPOLL_CTL_ADD:
if (!epi) {
epds->events |= EPOLLERR | EPOLLHUP;
error = ep_insert(ep, epds, tf.file, fd, full_check);
} else
error = -EEXIST;
break;
case EPOLL_CTL_DEL:
if (epi) {
/*
 * The eventpoll itself is still alive: the refcount
 * can't go to zero here.
 */
ep_remove_safe(ep, epi);
error = 0;
} else {
error = -ENOENT;
}
break;
case EPOLL_CTL_MOD:
if (epi) {
/*
 * An item registered with EPOLLEXCLUSIVE is not modified;
 * error keeps the -EINVAL assigned before the switch.
 */
if (!(epi->event.events & EPOLLEXCLUSIVE)) {
epds->events |= EPOLLERR | EPOLLHUP;
error = ep_modify(ep, epi, epds);
}
} else
error = -ENOENT;
break;
}
mutex_unlock(&ep->mtx);
/* Shared cleanup: every path that reaches here no longer holds ep->mtx. */
error_tgt_fput:
if (full_check) {
clear_tfile_check_list();
loop_check_gen++;
mutex_unlock(&epnested_mutex);
}
fdput(tf);
error_fput:
fdput(f);
error_return:
return error;
}
/*
 * epoll_pwait(2) entry point: converts the integer timeout into a timespec
 * through ep_timeout_to_timespec() and hands everything to do_epoll_pwait().
 *
 * NOTE(review): the unit of @timeout and the meaning of the pointer returned
 * by ep_timeout_to_timespec() are defined by that helper, which is not part
 * of this excerpt - confirm there.
 */
SYSCALL_DEFINE6(epoll_pwait, int, epfd, struct epoll_event __user *, events,
		int, maxevents, int, timeout, const sigset_t __user *, sigmask,
		size_t, sigsetsize)
{
	struct timespec64 ts;
	struct timespec64 *deadline = ep_timeout_to_timespec(&ts, timeout);

	return do_epoll_pwait(epfd, events, maxevents, deadline,
			      sigmask, sigsetsize);
}
/*
 * Kernel side of the user space epoll_pwait(2): an event wait on the
 * eventpoll file bracketed by signal-mask handling.
 *
 * The caller-supplied signal mask is installed first via
 * set_user_sigmask(); if that fails its error is returned and no wait takes
 * place. After the wait, restore_saved_sigmask_unless() is invoked with the
 * condition "the wait returned -EINTR".
 *
 * Return: the result of do_epoll_wait(), or the error from
 * set_user_sigmask().
 */
static int do_epoll_pwait(int epfd, struct epoll_event __user *events,
			  int maxevents, struct timespec64 *to,
			  const sigset_t __user *sigmask, size_t sigsetsize)
{
	int ret = set_user_sigmask(sigmask, sigsetsize);

	if (ret)
		return ret;

	ret = do_epoll_wait(epfd, events, maxevents, to);
	restore_saved_sigmask_unless(ret == -EINTR);

	return ret;
}
/*
 * Kernel side of the user space epoll_wait(2): validates the arguments,
 * resolves @epfd to its eventpoll context and collects events via ep_poll().
 *
 * Return: the value returned by ep_poll(), or
 *   -EINVAL  @maxevents is not in 1..EP_MAX_EVENTS, or @epfd is not an
 *            eventpoll file
 *   -EFAULT  the user buffer fails access_ok()
 *   -EBADF   @epfd cannot be resolved by fdget()
 */
static int do_epoll_wait(int epfd, struct epoll_event __user *events,
			 int maxevents, struct timespec64 *to)
{
	struct fd f;
	int ret = -EINVAL;

	/* At least one event slot, and no more than the supported maximum. */
	if (maxevents <= 0 || maxevents > EP_MAX_EVENTS)
		return -EINVAL;

	/* The whole user array must pass the access check. */
	if (!access_ok(events, maxevents * sizeof(struct epoll_event)))
		return -EFAULT;

	/* Resolve the descriptor to its "struct file *". */
	f = fdget(epfd);
	if (!f.file)
		return -EBADF;

	/*
	 * Only an eventpoll file keeps a struct eventpoll in private_data;
	 * for anything else ret stays -EINVAL.
	 */
	if (is_file_epoll(f.file)) {
		struct eventpoll *ep = f.file->private_data;

		/* Time to fish for events ... */
		ret = ep_poll(ep, events, maxevents, to);
	}

	fdput(f);
	return ret;
}
/**
 * ep_poll - Retrieves ready events, and delivers them to the caller-supplied
 *           event buffer.
 *
 * @ep: Pointer to the eventpoll context.
 * @events: Pointer to the userspace buffer where the ready events should be
 *          stored.
 * @maxevents: Size (in terms of number of events) of the caller event buffer.
 * @timeout: Maximum timeout for the ready events fetch operation, in
 *           timespec. If the timeout is zero, the function will not block,
 *           while if the @timeout ptr is NULL, the function will block
 *           until at least one event has been retrieved (or an error
 *           occurred).
 *
 * Return: the number of ready events which have been fetched, 0 when the
 * wait timed out without events, -EINTR when a signal is pending before
 * sleeping, or another error code propagated from ep_send_events().
 */
static int ep_poll(struct eventpoll *ep, struct epoll_event __user *events,
		   int maxevents, struct timespec64 *timeout)
{
	int res, eavail, timed_out = 0;
	u64 slack = 0;
	wait_queue_entry_t wait;
	ktime_t expires, *to = NULL;

	lockdep_assert_irqs_enabled();

	if (timeout && (timeout->tv_sec | timeout->tv_nsec)) {
		slack = select_estimate_accuracy(timeout);
		to = &expires;
		*to = timespec64_to_ktime(*timeout);
	} else if (timeout) {
		/*
		 * Avoid the unnecessary trip to the wait queue loop, if the
		 * caller specified a non blocking operation.
		 */
		timed_out = 1;
	}

	/*
	 * This call is racy: We may or may not see events that are being added
	 * to the ready list under the lock (e.g., in IRQ callbacks). For cases
	 * with a non-zero timeout, this thread will check the ready list under
	 * lock and will add to the wait queue.  For cases with a zero
	 * timeout, the user by definition should not care and will have to
	 * recheck again.
	 */
	eavail = ep_events_available(ep);

	while (1) {
		if (eavail) {
			/*
			 * Try to transfer events to user space. In case we get
			 * 0 events and there's still timeout left over, we go
			 * trying again in search of more luck.
			 */
			res = ep_send_events(ep, events, maxevents);
			if (res)
				return res;
		}

		if (timed_out)
			return 0;

		eavail = ep_busy_loop(ep, timed_out);
		if (eavail)
			continue;

		if (signal_pending(current))
			return -EINTR;

		/*
		 * Internally init_wait() uses autoremove_wake_function(),
		 * thus wait entry is removed from the wait queue on each
		 * wakeup. Why it is important? In case of several waiters
		 * each new wakeup will hit the next waiter, giving it the
		 * chance to harvest new event. Otherwise wakeup can be
		 * lost. This is also good performance-wise, because on
		 * normal wakeup path no need to call __remove_wait_queue()
		 * explicitly, thus ep->lock is not taken, which halts the
		 * event delivery.
		 *
		 * In fact, we now use an even more aggressive function that
		 * unconditionally removes, because we don't reuse the wait
		 * entry between loop iterations. This lets us also avoid the
		 * performance issue if a process is killed, causing all of its
		 * threads to wake up without being removed normally.
		 */
		init_wait(&wait);
		wait.func = ep_autoremove_wake_function;

		write_lock_irq(&ep->lock);
		/*
		 * Barrierless variant, waitqueue_active() is called under
		 * the same lock on wakeup ep_poll_callback() side, so it
		 * is safe to avoid an explicit barrier.
		 */
		__set_current_state(TASK_INTERRUPTIBLE);

		/*
		 * Do the final check under the lock. ep_scan_ready_list()
		 * plays with two lists (->rdllist and ->ovflist) and there
		 * is always a race when both lists are empty for short
		 * period of time although events are pending, so lock is
		 * important.
		 */
		eavail = ep_events_available(ep);
		if (!eavail)
			__add_wait_queue_exclusive(&ep->wq, &wait);

		write_unlock_irq(&ep->lock);

		/* Local modification (yym): debug trace right before sleeping. */
		if (!eavail) {
			/*
			 * "expires" is only assigned when a non-zero timeout
			 * was supplied (to != NULL). For an unbounded wait it
			 * is uninitialized, so it must not be read: print -1
			 * in that case. The printed value is the expiry that
			 * is passed to schedule_hrtimeout_range() below with
			 * HRTIMER_MODE_ABS.
			 */
			pr_debug("ep_poll: [process=%d, thread=%d] going to sleep, timeout=%lld ns\n",
				 current->tgid, current->pid,
				 to ? (long long)ktime_to_ns(*to) : -1LL);
			timed_out = !schedule_hrtimeout_range(to, slack,
							      HRTIMER_MODE_ABS);
		}
		__set_current_state(TASK_RUNNING);

		/*
		 * We were woken up, thus go and try to harvest some events.
		 * If timed out and still on the wait queue, recheck eavail
		 * carefully under lock, below.
		 */
		eavail = 1;

		if (!list_empty_careful(&wait.entry)) {
			write_lock_irq(&ep->lock);
			/*
			 * If the thread timed out and is not on the wait queue,
			 * it means that the thread was woken up after its
			 * timeout expired before it could reacquire the lock.
			 * Thus, when wait.entry is empty, it needs to harvest
			 * events.
			 */
			if (timed_out)
				eavail = list_empty(&wait.entry);
			__remove_wait_queue(&ep->wq, &wait);
			write_unlock_irq(&ep->lock);
		}
	}
}
/*
 * ep_send_events - copy ready events from @ep to the user buffer.
 *
 * @ep:        eventpoll context whose ready items are scanned
 * @events:    user space array receiving the events
 * @maxevents: maximum number of entries to store in @events
 *
 * Under ep->mtx, ep_start_scan() hands the ready items over on a private
 * list (txlist). Each item is unlinked, re-polled with ep_item_poll(), and
 * reported only if it yields a non-zero event mask. Items registered with
 * EPOLLONESHOT have their event mask reduced to EP_PRIVATE_BITS after
 * delivery; items without EPOLLET are appended to ep->rdllist again.
 *
 * Return: the number of events stored, -EFAULT if the very first copy to
 * user space failed, or -EINTR if a fatal signal is pending on entry.
 */
static int ep_send_events(struct eventpoll *ep,
struct epoll_event __user *events, int maxevents)
{
struct epitem *epi, *tmp;
LIST_HEAD(txlist);
poll_table pt;
int res = 0;
/*
 * Always short-circuit for fatal signals to allow threads to make a
 * timely exit without the chance of finding more events available and
 * fetching repeatedly.
 */
if (fatal_signal_pending(current))
return -EINTR;
/* The poll table is initialised with a NULL queueing callback. */
init_poll_funcptr(&pt, NULL);
mutex_lock(&ep->mtx);
ep_start_scan(ep, &txlist);
/*
 * We can loop without lock because we are passed a task private list.
 * Items cannot vanish during the loop we are holding ep->mtx.
 */
list_for_each_entry_safe(epi, tmp, &txlist, rdllink) {
struct wakeup_source *ws;
__poll_t revents;
/* User buffer is full: the remaining items stay on txlist. */
if (res >= maxevents)
break;
/*
 * Activate ep->ws before deactivating epi->ws to prevent
 * triggering auto-suspend here (in case we reactive epi->ws
 * below).
 *
 * This could be rearranged to delay the deactivation of epi->ws
 * instead, but then epi->ws would temporarily be out of sync
 * with ep_is_linked().
 */
ws = ep_wakeup_source(epi);
if (ws) {
if (ws->active)
__pm_stay_awake(ep->ws);
__pm_relax(ws);
}
list_del_init(&epi->rdllink);
/*
 * If the event mask intersect the caller-requested one,
 * deliver the event to userspace. Again, we are holding ep->mtx,
 * so no operations coming from userspace can change the item.
 */
revents = ep_item_poll(epi, &pt, 1);
if (!revents)
continue;
events = epoll_put_uevent(revents, epi->event.data, events);
if (!events) {
/*
 * Copy to user space failed: put the item back at the head of
 * txlist and report -EFAULT only if nothing was delivered yet.
 */
list_add(&epi->rdllink, &txlist);
ep_pm_stay_awake(epi);
if (!res)
res = -EFAULT;
break;
}
res++;
if (epi->event.events & EPOLLONESHOT)
epi->event.events &= EP_PRIVATE_BITS;
else if (!(epi->event.events & EPOLLET)) {
/*
 * If this file has been added with Level
 * Trigger mode, we need to insert back inside
 * the ready list, so that the next call to
 * epoll_wait() will check again the events
 * availability. At this point, no one can insert
 * into ep->rdllist besides us. The epoll_ctl()
 * callers are locked out by
 * ep_scan_ready_list() holding "mtx" and the
 * poll callback will queue them in ep->ovflist.
 */
list_add_tail(&epi->rdllink, &ep->rdllist);
ep_pm_stay_awake(epi);
}
}
ep_done_scan(ep, &txlist);
mutex_unlock(&ep->mtx);
return res;
}
/*
 * Store one event (mask plus user data) into the user space slot @uevent.
 *
 * Return: pointer to the slot following @uevent on success, NULL as soon
 * as one of the two __put_user() stores fails (the second store is not
 * attempted when the first one fails).
 */
static inline struct epoll_event __user *
epoll_put_uevent(__poll_t revents, __u64 data,
		 struct epoll_event __user *uevent)
{
	if (__put_user(revents, &uevent->events))
		return NULL;

	if (__put_user(data, &uevent->data))
		return NULL;

	return uevent + 1;
}
/*
 * This is the callback that is passed to the wait queue wakeup
 * mechanism. It is called by the stored file descriptors when they
 * have events to report.
 *
 * This callback takes a read lock in order not to contend with concurrent
 * events from another file descriptor, thus all modifications to ->rdllist
 * or ->ovflist are lockless.  Read lock is paired with the write lock from
 * ep_scan_ready_list(), which stops all list modifications and guarantees
 * that lists state is seen correctly.
 *
 * Another thing worth to mention is that ep_poll_callback() can be called
 * concurrently for the same @epi from different CPUs if poll table was inited
 * with several wait queues entries.  Plural wakeup from different CPUs of a
 * single wait queue is serialized by wq.lock, but the case when multiple wait
 * queues are used should be detected accordingly.  This is detected using
 * cmpxchg() operation.
 *
 * @wait: wait queue entry of the hook; ep_item_from_wait() maps it back to
 *        the owning epitem.
 * @mode: not used by this function.
 * @sync: not used by this function.
 * @key:  event mask supplied by the waker, decoded with key_to_poll(); a
 *        zero mask skips the event-match filter.
 *
 * Return: 1 for items registered without EPOLLEXCLUSIVE. For
 * EPOLLEXCLUSIVE items, 1 only when ep->wq had waiters, POLLFREE was not
 * signalled, and the EPOLLIN/EPOLLOUT bits of @key matched one of the cases
 * in the switch below; 0 otherwise.
 */
static int ep_poll_callback(wait_queue_entry_t *wait, unsigned mode, int sync, void *key)
{
int pwake = 0;
struct epitem *epi = ep_item_from_wait(wait);
struct eventpoll *ep = epi->ep;
__poll_t pollflags = key_to_poll(key);
unsigned long flags;
int ewake = 0;
read_lock_irqsave(&ep->lock, flags);
ep_set_busy_poll_napi_id(epi);
/*
 * If the event mask does not contain any poll(2) event, we consider the
 * descriptor to be disabled. This condition is likely the effect of the
 * EPOLLONESHOT bit that disables the descriptor when an event is received,
 * until the next EPOLL_CTL_MOD will be issued.
 */
if (!(epi->event.events & ~EP_PRIVATE_BITS))
goto out_unlock;
/*
 * Check the events coming with the callback. At this stage, not
 * every device reports the events in the "key" parameter of the
 * callback. We need to be able to handle both cases here, hence the
 * test for "key" != NULL before the event match test.
 */
if (pollflags && !(pollflags & epi->event.events))
goto out_unlock;
/*
 * If we are transferring events to userspace, we can hold no locks
 * (because we're accessing user memory, and because of linux f_op->poll()
 * semantics). All the events that happen during that period of time are
 * chained in ep->ovflist and requeued later on.
 */
if (READ_ONCE(ep->ovflist) != EP_UNACTIVE_PTR) {
if (chain_epi_lockless(epi))
ep_pm_stay_awake_rcu(epi);
} else if (!ep_is_linked(epi)) {
/* In the usual case, add event to ready list. */
if (list_add_tail_lockless(&epi->rdllink, &ep->rdllist))
ep_pm_stay_awake_rcu(epi);
}
/*
 * Wake up ( if active ) both the eventpoll wait list and the ->poll()
 * wait list.
 */
if (waitqueue_active(&ep->wq)) {
if ((epi->event.events & EPOLLEXCLUSIVE) &&
!(pollflags & POLLFREE)) {
/*
 * Only an exact EPOLLIN, an exact EPOLLOUT, or neither bit is
 * handled; when both bits are set no case matches and ewake
 * stays 0.
 */
switch (pollflags & EPOLLINOUT_BITS) {
case EPOLLIN:
if (epi->event.events & EPOLLIN)
ewake = 1;
break;
case EPOLLOUT:
if (epi->event.events & EPOLLOUT)
ewake = 1;
break;
case 0:
ewake = 1;
break;
}
}
wake_up(&ep->wq);
}
/* The ->poll() waiters are woken after the lock is dropped, see below. */
if (waitqueue_active(&ep->poll_wait))
pwake++;
out_unlock:
read_unlock_irqrestore(&ep->lock, flags);
/* We have to call this outside the lock */
if (pwake)
ep_poll_safewake(ep, epi, pollflags & EPOLL_URING_WAKE);
/* Non-exclusive items always report a wakeup to the caller. */
if (!(epi->event.events & EPOLLEXCLUSIVE))
ewake = 1;
if (pollflags & POLLFREE) {
/*
 * If we race with ep_remove_wait_queue() it can miss
 * ->whead = NULL and do another remove_wait_queue() after
 * us, so we can't use __remove_wait_queue().
 */
list_del_init(&wait->entry);
/*
 * ->whead != NULL protects us from the race with
 * ep_clear_and_put() or ep_remove(), ep_remove_wait_queue()
 * takes whead->lock held by the caller. Once we nullify it,
 * nothing protects ep/epi or even wait.
 */
smp_store_release(&ep_pwq_from_wait(wait)->whead, NULL);
}
return ewake;
}
/*
 * ep_insert - allocate a new epitem for (@tfile, @fd) and link it into @ep.
 *
 * Must be called with "mtx" held.
 *
 * @ep:         eventpoll context receiving the new item
 * @event:      event description copied into the item
 * @tfile:      target file being watched
 * @fd:         descriptor number under which @tfile is registered
 * @full_check: non-zero when the caller ran the nested/loop check; enables
 *              list_file() for non-epoll targets and reverse_path_check()
 *
 * Return: 0 on success, otherwise
 *   -ENOSPC  the per-user watch counter reached max_user_watches
 *   -ENOMEM  the epitem allocation, attach_epitem(), or a wait queue entry
 *            allocation during the initial poll failed
 *   -EINVAL  reverse_path_check() reported a problem
 *   or the error returned by ep_create_wakeup_source().
 */
static int ep_insert(struct eventpoll *ep, const struct epoll_event *event,
struct file *tfile, int fd, int full_check)
{
int error, pwake = 0;
__poll_t revents;
struct epitem *epi;
struct ep_pqueue epq;
struct eventpoll *tep = NULL;
/* tep is set only when the target itself is an eventpoll file. */
if (is_file_epoll(tfile))
tep = tfile->private_data;
lockdep_assert_irqs_enabled();
if (unlikely(percpu_counter_compare(&ep->user->epoll_watches,
max_user_watches) >= 0))
return -ENOSPC;
percpu_counter_inc(&ep->user->epoll_watches);
if (!(epi = kmem_cache_zalloc(epi_cache, GFP_KERNEL))) {
percpu_counter_dec(&ep->user->epoll_watches);
return -ENOMEM;
}
/* Item initialization follow here ... */
INIT_LIST_HEAD(&epi->rdllink);
epi->ep = ep;
ep_set_ffd(&epi->ffd, tfile, fd);
epi->event = *event;
epi->next = EP_UNACTIVE_PTR;
if (tep)
mutex_lock_nested(&tep->mtx, 1);
/* Add the current item to the list of active epoll hook for this file */
if (unlikely(attach_epitem(tfile, epi) < 0)) {
/* Undo in reverse order: nested mutex, item memory, watch counter. */
if (tep)
mutex_unlock(&tep->mtx);
kmem_cache_free(epi_cache, epi);
percpu_counter_dec(&ep->user->epoll_watches);
return -ENOMEM;
}
if (full_check && !tep)
list_file(tfile);
/*
 * Add the current item to the RB tree. All RB tree operations are
 * protected by "mtx", and ep_insert() is called with "mtx" held.
 */
ep_rbtree_insert(ep, epi);
if (tep)
mutex_unlock(&tep->mtx);
/*
 * ep_remove_safe() calls in the later error paths can't lead to
 * ep_free() as the ep file itself still holds an ep reference.
 */
ep_get(ep);
/* now check if we've created too many backpaths */
if (unlikely(full_check && reverse_path_check())) {
ep_remove_safe(ep, epi);
return -EINVAL;
}
if (epi->event.events & EPOLLWAKEUP) {
error = ep_create_wakeup_source(epi);
if (error) {
ep_remove_safe(ep, epi);
return error;
}
}
/* Initialize the poll table using the queue callback */
epq.epi = epi;
init_poll_funcptr(&epq.pt, ep_ptable_queue_proc);
/*
 * Attach the item to the poll hooks and get current event bits.
 * We can safely use the file* here because its usage count has
 * been increased by the caller of this function. Note that after
 * this operation completes, the poll callback can start hitting
 * the new item.
 */
revents = ep_item_poll(epi, &epq.pt, 1);
/*
 * We have to check if something went wrong during the poll wait queue
 * install process. Namely an allocation for a wait queue failed due
 * high memory pressure.
 */
/* ep_ptable_queue_proc() signals that failure by clearing epq.epi. */
if (unlikely(!epq.epi)) {
ep_remove_safe(ep, epi);
return -ENOMEM;
}
/* We have to drop the new item inside our item list to keep track of it */
write_lock_irq(&ep->lock);
/* record NAPI ID of new item if present */
ep_set_busy_poll_napi_id(epi);
/* If the file is already "ready" we drop it inside the ready list */
if (revents && !ep_is_linked(epi)) {
list_add_tail(&epi->rdllink, &ep->rdllist);
ep_pm_stay_awake(epi);
/* Notify waiting tasks that events are available */
if (waitqueue_active(&ep->wq))
wake_up(&ep->wq);
if (waitqueue_active(&ep->poll_wait))
pwake++;
}
write_unlock_irq(&ep->lock);
/* We have to call this outside the lock */
if (pwake)
ep_poll_safewake(ep, NULL, 0);
return 0;
}
/*
 * Poll-table queueing callback: hooks the epitem carried by @pt onto the
 * wait queue @whead of the target file, with ep_poll_callback() as the
 * wakeup function.
 *
 * Failure is reported in-band: when the hook entry cannot be allocated,
 * the epitem pointer inside the enclosing ep_pqueue is cleared, and every
 * later invocation with that cleared pointer returns immediately.
 */
static void ep_ptable_queue_proc(struct file *file, wait_queue_head_t *whead,
				 poll_table *pt)
{
	struct ep_pqueue *queue = container_of(pt, struct ep_pqueue, pt);
	struct epitem *item = queue->epi;
	struct eppoll_entry *hook;

	/* An earlier allocation has already failed. */
	if (unlikely(!item))
		return;

	hook = kmem_cache_alloc(pwq_cache, GFP_KERNEL);
	if (unlikely(!hook)) {
		queue->epi = NULL;
		return;
	}

	init_waitqueue_func_entry(&hook->wait, ep_poll_callback);
	hook->whead = whead;
	hook->base = item;

	/* Exclusive items are queued through the exclusive variant. */
	if (item->event.events & EPOLLEXCLUSIVE)
		add_wait_queue_exclusive(whead, &hook->wait);
	else
		add_wait_queue(whead, &hook->wait);

	/* Push the hook onto the front of the item's hook list. */
	hook->next = item->pwqlist;
	item->pwqlist = hook;
}
/*
 * The core wakeup function. It walks the wait queue and invokes each
 * entry's wakeup callback with @mode, @wake_flags and @key.
 *
 * @nr_exclusive is the budget of exclusive wakeups: it is decremented each
 * time an entry flagged WQ_FLAG_EXCLUSIVE reports a successful wakeup
 * (non-zero return), and the walk stops when it reaches zero. With
 * nr_exclusive == 0 on entry the decrement never hits zero, so the walk is
 * not cut short by the budget. A negative return from a callback also stops
 * the walk.
 *
 * Must be called with wq_head->lock held.
 *
 * Return: the remaining exclusive budget.
 */
static int __wake_up_common(struct wait_queue_head *wq_head, unsigned int mode,
			    int nr_exclusive, int wake_flags, void *key)
{
	wait_queue_entry_t *pos, *upcoming;

	lockdep_assert_held(&wq_head->lock);

	pos = list_first_entry(&wq_head->head, wait_queue_entry_t, entry);

	/* Empty queue: the "first entry" is the list head itself. */
	if (&pos->entry == &wq_head->head)
		return nr_exclusive;

	list_for_each_entry_safe_from(pos, upcoming, &wq_head->head, entry) {
		/* Flags are sampled before the callback is invoked. */
		unsigned entry_flags = pos->flags;
		int woken = pos->func(pos, mode, wake_flags, key);

		if (woken < 0)
			break;

		if (!woken || !(entry_flags & WQ_FLAG_EXCLUSIVE))
			continue;

		if (--nr_exclusive == 0)
			break;
	}

	return nr_exclusive;
}
/**
 * __wake_up_sync_key - wake up threads blocked on a waitqueue.
 * @wq_head: the waitqueue; a NULL pointer makes the call a no-op
 * @mode: which threads
 * @key: opaque value to be passed to wakeup targets
 *
 * The sync wakeup differs in that the waker knows that it will schedule
 * away soon, so while the target thread will be woken up, it will not
 * be migrated to another CPU - ie. the two threads are 'synchronized'
 * with each other. This can prevent needless bouncing between CPUs.
 *
 * On UP it can prevent extra preemption.
 *
 * If this function wakes up a task, it executes a full memory barrier before
 * accessing the task state.
 *
 * The wakeup is issued with an exclusive budget of 1 and the WF_SYNC flag.
 */
void __wake_up_sync_key(struct wait_queue_head *wq_head, unsigned int mode,
			void *key)
{
	if (likely(wq_head))
		__wake_up_common_lock(wq_head, mode, 1, WF_SYNC, key);
}
EXPORT_SYMBOL_GPL(__wake_up_sync_key);
/*
 * Locked wrapper around __wake_up_common(): takes wq_head->lock with
 * interrupts saved/disabled for the duration of the queue walk.
 *
 * Return: how much of the exclusive budget was consumed, i.e. @nr_exclusive
 * minus the budget left over after the walk.
 */
static int __wake_up_common_lock(struct wait_queue_head *wq_head, unsigned int mode,
				 int nr_exclusive, int wake_flags, void *key)
{
	unsigned long irq_state;
	int leftover;

	spin_lock_irqsave(&wq_head->lock, irq_state);
	leftover = __wake_up_common(wq_head, mode, nr_exclusive, wake_flags,
				    key);
	spin_unlock_irqrestore(&wq_head->lock, irq_state);

	return nr_exclusive - leftover;
}
void sock_def_readable(struct sock *sk)
{
struct socket_wq *wq;
trace_sk_data_ready(sk);
rcu_read_lock();
wq = rcu_dereference(sk->sk_wq);
if (skwq_has_sleeper(wq))
wake_up_interruptible_sync_poll(&wq->wait, EPOLLIN | EPOLLPRI |
EPOLLRDNORM | EPOLLRDBAND);
sk_wake_async(sk, SOCK_WAKE_WAITD, POLL_IN);
rcu_read_unlock();
}
/*
 * Sync wakeup of the waiters on wait queue head x, restricted to mode
 * TASK_INTERRUPTIBLE. The poll event mask m is converted with poll_to_key()
 * and handed to the wakeup callbacks as their key argument.
 */
#define wake_up_interruptible_sync_poll(x, m) \
__wake_up_sync_key((x), TASK_INTERRUPTIBLE, poll_to_key(m))
/* Declaration of the function the macro above expands to. */
void __wake_up_sync_key(struct wait_queue_head *wq_head, unsigned int mode, void *key);
/*
 * sock_init_data_uid - initialise the generic fields of a struct sock.
 *
 * @sock: owning socket, or NULL; when present, @sk and @sock are linked to
 *        each other and sk->sk_wq is pointed at the socket's wait queue
 * @sk:   sock being initialised
 * @uid:  value stored in sk->sk_uid
 *
 * Sets buffer sizes from the rmem/wmem sysctl defaults, puts the sock in
 * TCP_CLOSE state, installs the default callbacks (state change, data
 * ready, write space, error report, destruct) and finally sets the
 * reference count to 1.
 */
void sock_init_data_uid(struct socket *sock, struct sock *sk, kuid_t uid)
{
sk_init_common(sk);
sk->sk_send_head = NULL;
timer_setup(&sk->sk_timer, NULL, 0);
sk->sk_allocation = GFP_KERNEL;
sk->sk_rcvbuf = READ_ONCE(sysctl_rmem_default);
sk->sk_sndbuf = READ_ONCE(sysctl_wmem_default);
sk->sk_state = TCP_CLOSE;
sk->sk_use_task_frag = true;
sk_set_socket(sk, sock);
sock_set_flag(sk, SOCK_ZAPPED);
if (sock) {
sk->sk_type = sock->type;
/* sk_wq is the wait queue later read through rcu_dereference(). */
RCU_INIT_POINTER(sk->sk_wq, &sock->wq);
sock->sk = sk;
} else {
RCU_INIT_POINTER(sk->sk_wq, NULL);
}
sk->sk_uid = uid;
rwlock_init(&sk->sk_callback_lock);
/* Pick the lockdep class by socket family, separately for kernel sockets. */
if (sk->sk_kern_sock)
lockdep_set_class_and_name(
&sk->sk_callback_lock,
af_kern_callback_keys + sk->sk_family,
af_family_kern_clock_key_strings[sk->sk_family]);
else
lockdep_set_class_and_name(
&sk->sk_callback_lock,
af_callback_keys + sk->sk_family,
af_family_clock_key_strings[sk->sk_family]);
/* Default notification callbacks. */
sk->sk_state_change = sock_def_wakeup;
sk->sk_data_ready = sock_def_readable;
sk->sk_write_space = sock_def_write_space;
sk->sk_error_report = sock_def_error_report;
sk->sk_destruct = sock_def_destruct;
sk->sk_frag.page = NULL;
sk->sk_frag.offset = 0;
sk->sk_peek_off = -1;
sk->sk_peer_pid = NULL;
sk->sk_peer_cred = NULL;
spin_lock_init(&sk->sk_peer_lock);
sk->sk_write_pending = 0;
sk->sk_rcvlowat = 1;
sk->sk_rcvtimeo = MAX_SCHEDULE_TIMEOUT;
sk->sk_sndtimeo = MAX_SCHEDULE_TIMEOUT;
sk->sk_stamp = SK_DEFAULT_STAMP;
#if BITS_PER_LONG==32
seqlock_init(&sk->sk_stamp_seq);
#endif
atomic_set(&sk->sk_zckey, 0);
#ifdef CONFIG_NET_RX_BUSY_POLL
sk->sk_napi_id = 0;
sk->sk_ll_usec = READ_ONCE(sysctl_net_busy_read);
#endif
sk->sk_max_pacing_rate = ~0UL;
sk->sk_pacing_rate = ~0UL;
WRITE_ONCE(sk->sk_pacing_shift, 10);
sk->sk_incoming_cpu = -1;
sk_rx_queue_clear(sk);
/*
 * Before updating sk_refcnt, we must commit prior changes to memory
 * (Documentation/RCU/rculist_nulls.rst for details)
 */
smp_wmb();
refcount_set(&sk->sk_refcnt, 1);
atomic_set(&sk->sk_drops, 0);
}
EXPORT_SYMBOL(sock_init_data_uid);
/*
 * tcp_data_queue - queue the payload of an incoming segment on @sk.
 *
 * @sk:  receiving socket
 * @skb: incoming segment; its TCP header is pulled off here
 *
 * Segments without sequence space (seq == end_seq) are freed immediately.
 * Otherwise the segment is classified against tp->rcv_nxt and the receive
 * window:
 *   - in sequence: queued via tcp_queue_rcv(), then the reader is notified
 *     through tcp_data_ready() unless the socket is SOCK_DEAD;
 *   - entirely old data (end_seq not after rcv_nxt): D-SACK is recorded and
 *     the segment is dropped;
 *   - starting at or beyond the right window edge: dropped;
 *   - partially old (seq < rcv_nxt < end_seq): D-SACK for the head, then
 *     handled like an in-sequence segment unless the window is zero;
 *   - anything else: handed to tcp_data_queue_ofo().
 */
static void tcp_data_queue(struct sock *sk, struct sk_buff *skb)
{
struct tcp_sock *tp = tcp_sk(sk);
enum skb_drop_reason reason;
bool fragstolen;
int eaten;
/* If a subflow has been reset, the packet should not continue
 * to be processed, drop the packet.
 */
if (sk_is_mptcp(sk) && !mptcp_incoming_options(sk, skb)) {
__kfree_skb(skb);
return;
}
/* Nothing to queue when the segment occupies no sequence space. */
if (TCP_SKB_CB(skb)->seq == TCP_SKB_CB(skb)->end_seq) {
__kfree_skb(skb);
return;
}
skb_dst_drop(skb);
/* doff counts 32-bit words, hence the multiplication by 4. */
__skb_pull(skb, tcp_hdr(skb)->doff * 4);
reason = SKB_DROP_REASON_NOT_SPECIFIED;
tp->rx_opt.dsack = 0;
/* Queue data for delivery to the user.
 * Packets in sequence go to the receive queue.
 * Out of sequence packets to the out_of_order_queue.
 */
if (TCP_SKB_CB(skb)->seq == tp->rcv_nxt) {
if (tcp_receive_window(tp) == 0) {
reason = SKB_DROP_REASON_TCP_ZEROWINDOW;
NET_INC_STATS(sock_net(sk), LINUX_MIB_TCPZEROWINDOWDROP);
goto out_of_window;
}
/* Ok. In sequence. In window. */
/* Also entered by goto from the partial-packet case further down. */
queue_and_out:
if (tcp_try_rmem_schedule(sk, skb, skb->truesize)) {
/* TODO: maybe ratelimit these WIN 0 ACK ? */
inet_csk(sk)->icsk_ack.pending |=
(ICSK_ACK_NOMEM | ICSK_ACK_NOW);
inet_csk_schedule_ack(sk);
sk->sk_data_ready(sk);
/*
 * Drop only if the receive queue already holds data; with an
 * empty queue the memory is force-scheduled instead.
 */
if (skb_queue_len(&sk->sk_receive_queue)) {
reason = SKB_DROP_REASON_PROTO_MEM;
NET_INC_STATS(sock_net(sk), LINUX_MIB_TCPRCVQDROP);
goto drop;
}
sk_forced_mem_schedule(sk, skb->truesize);
}
eaten = tcp_queue_rcv(sk, skb, &fragstolen);
if (skb->len)
tcp_event_data_recv(sk, skb);
if (TCP_SKB_CB(skb)->tcp_flags & TCPHDR_FIN)
tcp_fin(sk);
if (!RB_EMPTY_ROOT(&tp->out_of_order_queue)) {
tcp_ofo_queue(sk);
/* RFC5681. 4.2. SHOULD send immediate ACK, when
 * gap in queue is filled.
 */
if (RB_EMPTY_ROOT(&tp->out_of_order_queue))
inet_csk(sk)->icsk_ack.pending |= ICSK_ACK_NOW;
}
if (tp->rx_opt.num_sacks)
tcp_sack_remove(tp);
tcp_fast_path_check(sk);
if (eaten > 0)
kfree_skb_partial(skb, fragstolen);
/* Notify the reader; this is where the data-ready wakeup originates. */
if (!sock_flag(sk, SOCK_DEAD))
tcp_data_ready(sk);
return;
}
if (!after(TCP_SKB_CB(skb)->end_seq, tp->rcv_nxt)) {
tcp_rcv_spurious_retrans(sk, skb);
/* A retransmit, 2nd most common case.  Force an immediate ack. */
reason = SKB_DROP_REASON_TCP_OLD_DATA;
NET_INC_STATS(sock_net(sk), LINUX_MIB_DELAYEDACKLOST);
tcp_dsack_set(sk, TCP_SKB_CB(skb)->seq, TCP_SKB_CB(skb)->end_seq);
/* Shared tail for every "ack quickly, then drop" case. */
out_of_window:
tcp_enter_quickack_mode(sk, TCP_MAX_QUICKACKS);
inet_csk_schedule_ack(sk);
drop:
tcp_drop_reason(sk, skb, reason);
return;
}
/* Out of window. F.e. zero window probe. */
if (!before(TCP_SKB_CB(skb)->seq,
tp->rcv_nxt + tcp_receive_window(tp))) {
reason = SKB_DROP_REASON_TCP_OVERWINDOW;
goto out_of_window;
}
if (before(TCP_SKB_CB(skb)->seq, tp->rcv_nxt)) {
/* Partial packet, seq < rcv_next < end_seq */
tcp_dsack_set(sk, TCP_SKB_CB(skb)->seq, tp->rcv_nxt);
/* If window is closed, drop tail of packet. But after
 * remembering D-SACK for its head made in previous line.
 */
if (!tcp_receive_window(tp)) {
reason = SKB_DROP_REASON_TCP_ZEROWINDOW;
NET_INC_STATS(sock_net(sk), LINUX_MIB_TCPZEROWINDOWDROP);
goto out_of_window;
}
goto queue_and_out;
}
/* Remaining case: seq is past rcv_nxt but inside the window. */
tcp_data_queue_ofo(sk, skb);
}
/*
 *	TCP receive function for the ESTABLISHED state.
 *
 *	It is split into a fast path and a slow path. The fast path is
 *	disabled when:
 *	- A zero window was announced from us - zero window probing
 *        is only handled properly in the slow path.
 *	- Out of order segments arrived.
 *	- Urgent data is expected.
 *	- There is no buffer space left
 *	- Unexpected TCP flags/window values/header lengths are received
 *	  (detected by checking the TCP header against pred_flags)
 *	- Data is sent in both directions. Fast path only supports pure senders
 *	  or pure receivers (this means either the sequence number or the ack
 *	  value must stay constant)
 *	- Unexpected TCP option.
 *
 *	When these conditions are not satisfied it drops into a standard
 *	receive procedure patterned after RFC793 to handle all cases.
 *	The first three cases are guaranteed by proper pred_flags setting,
 *	the rest is checked inline. Fast processing is turned on in
 *	tcp_data_queue when everything is OK.
 *
 *	@sk:  the established socket the segment belongs to
 *	@skb: the incoming segment; skb->data is read as the TCP header
 *
 *	The function returns nothing. On every path the segment is either
 *	queued, freed, or dropped through tcp_drop_reason().
 */
void tcp_rcv_established(struct sock *sk, struct sk_buff *skb)
{
enum skb_drop_reason reason = SKB_DROP_REASON_NOT_SPECIFIED;
const struct tcphdr *th = (const struct tcphdr *)skb->data;
struct tcp_sock *tp = tcp_sk(sk);
/* Total length including the TCP header (the header is not pulled yet). */
unsigned int len = skb->len;
/* TCP congestion window tracking */
trace_tcp_probe(sk, skb);
tcp_mstamp_refresh(tp);
if (unlikely(!rcu_access_pointer(sk->sk_rx_dst)))
inet_csk(sk)->icsk_af_ops->sk_rx_dst_set(sk, skb);
/*
 *	Header prediction.
 *	The code loosely follows the one in the famous
 *	"30 instruction TCP receive" Van Jacobson mail.
 *
 *	Van's trick is to deposit buffers into socket queue
 *	on a device interrupt, to call tcp_recv function
 *	on the receive process context and checksum and copy
 *	the buffer to user space. smart...
 *
 *	Our current scheme is not silly either but we take the
 *	extra cost of the net_bh soft interrupt processing...
 *	We do checksum and copy also but from device to kernel.
 */
tp->rx_opt.saw_tstamp = 0;
/*	pred_flags is 0xS?10 << 16 + snd_wnd
 *	if header_prediction is to be made
 *	'S' will always be tp->tcp_header_len >> 2
 *	'?' will be 0 for the fast path, otherwise pred_flags is 0 to
 *  turn it off	(when there are holes in the receive
 *	 space for instance)
 *	PSH flag is ignored.
 */
if ((tcp_flag_word(th) & TCP_HP_BITS) == tp->pred_flags &&
TCP_SKB_CB(skb)->seq == tp->rcv_nxt &&
!after(TCP_SKB_CB(skb)->ack_seq, tp->snd_nxt)) {
int tcp_header_len = tp->tcp_header_len;
/* Timestamp header prediction: tcp_header_len
 * is automatically equal to th->doff*4 due to pred_flags
 * match.
 */
/* Check timestamp */
if (tcp_header_len == sizeof(struct tcphdr) + TCPOLEN_TSTAMP_ALIGNED) {
/* No? Slow path! */
if (!tcp_parse_aligned_timestamp(tp, th))
goto slow_path;
/* If PAWS failed, check it more carefully in slow path */
if ((s32)(tp->rx_opt.rcv_tsval - tp->rx_opt.ts_recent) < 0)
goto slow_path;
/* DO NOT update ts_recent here, if checksum fails
 * and timestamp was corrupted part, it will result
 * in a hung connection since we will drop all
 * future packets due to the PAWS test.
 */
}
/* No payload beyond the header: pure ACK, or a truncated header. */
if (len <= tcp_header_len) {
/* Bulk data transfer: sender */
if (len == tcp_header_len) {
/* Predicted packet is in window by definition.
 * seq == rcv_nxt and rcv_wup <= rcv_nxt.
 * Hence, check seq<=rcv_wup reduces to:
 */
if (tcp_header_len ==
(sizeof(struct tcphdr) + TCPOLEN_TSTAMP_ALIGNED) &&
tp->rcv_nxt == tp->rcv_wup)
tcp_store_ts_recent(tp);
/* We know that such packets are checksummed
 * on entry.
 */
tcp_ack(sk, skb, 0);
__kfree_skb(skb);
tcp_data_snd_check(sk);
/* When receiving pure ack in fast path, update
 * last ts ecr directly instead of calling
 * tcp_rcv_rtt_measure_ts()
 */
tp->rcv_rtt_last_tsecr = tp->rx_opt.rcv_tsecr;
return;
} else { /* Header too small */
reason = SKB_DROP_REASON_PKT_TOO_SMALL;
TCP_INC_STATS(sock_net(sk), TCP_MIB_INERRS);
goto discard;
}
} else {
int eaten = 0;
bool fragstolen = false;
if (tcp_checksum_complete(skb))
goto csum_error;
/*
 * Not enough forward-allocated memory for this skb: continue at
 * step5 of the slow path, skipping its header validation.
 */
if ((int)skb->truesize > sk->sk_forward_alloc)
goto step5;
/* Predicted packet is in window by definition.
 * seq == rcv_nxt and rcv_wup <= rcv_nxt.
 * Hence, check seq<=rcv_wup reduces to:
 */
if (tcp_header_len ==
(sizeof(struct tcphdr) + TCPOLEN_TSTAMP_ALIGNED) &&
tp->rcv_nxt == tp->rcv_wup)
tcp_store_ts_recent(tp);
tcp_rcv_rtt_measure_ts(sk, skb);
NET_INC_STATS(sock_net(sk), LINUX_MIB_TCPHPHITS);
/* Bulk data transfer: receiver */
skb_dst_drop(skb);
__skb_pull(skb, tcp_header_len);
eaten = tcp_queue_rcv(sk, skb, &fragstolen);
tcp_event_data_recv(sk, skb);
if (TCP_SKB_CB(skb)->ack_seq != tp->snd_una) {
/* Well, only one small jumplet in fast path... */
tcp_ack(sk, skb, FLAG_DATA);
tcp_data_snd_check(sk);
if (!inet_csk_ack_scheduled(sk))
goto no_ack;
} else {
tcp_update_wl(tp, TCP_SKB_CB(skb)->seq);
}
__tcp_ack_snd_check(sk, 0);
no_ack:
if (eaten)
kfree_skb_partial(skb, fragstolen);
/* Fast-path notification of the reader that data was queued. */
tcp_data_ready(sk);
return;
}
}
slow_path:
/* Header longer than the segment, or a bad checksum. */
if (len < (th->doff << 2) || tcp_checksum_complete(skb))
goto csum_error;
if (!th->ack && !th->rst && !th->syn) {
reason = SKB_DROP_REASON_TCP_FLAGS;
goto discard;
}
/*
 *	Standard slow path.
 */
if (!tcp_validate_incoming(sk, skb, th, 1))
return;
step5:
reason = tcp_ack(sk, skb, FLAG_SLOWPATH | FLAG_UPDATE_TS_RECENT);
/* A negative tcp_ack() result is a negated drop reason. */
if ((int)reason < 0) {
reason = -reason;
goto discard;
}
tcp_rcv_rtt_measure_ts(sk, skb);
/* Process urgent data. */
tcp_urg(sk, skb, th);
/* step 7: process the segment text */
tcp_data_queue(sk, skb);
tcp_data_snd_check(sk);
tcp_ack_snd_check(sk);
return;
csum_error:
reason = SKB_DROP_REASON_TCP_CSUM;
trace_tcp_bad_csum(skb);
TCP_INC_STATS(sock_net(sk), TCP_MIB_CSUMERRORS);
TCP_INC_STATS(sock_net(sk), TCP_MIB_INERRS);
discard:
tcp_drop_reason(sk, skb, reason);
}
EXPORT_SYMBOL(tcp_rcv_established);
/* The socket must have its spinlock held when we get
* here, unless it is a TCP_LISTEN socket.
*
* We have a potential double-lock case here, so even when
* doing backlog processing we use the BH locking scheme.
* This is because we cannot sleep with the original spinlock
* held.
*
* Processes one segment (skb) for socket sk, dispatching on sk->sk_state:
*  - TCP_ESTABLISHED: revalidate the cached receive dst, then call
*    tcp_rcv_established().
*  - TCP_LISTEN: tcp_v4_cookie_check() yields NULL (drop), another socket
*    (handed to tcp_child_process()) or sk itself (continue below).
*  - anything else, and a listener that got sk back: tcp_rcv_state_process().
* A non-zero result from tcp_child_process() or tcp_rcv_state_process()
* sends a reset on behalf of the failing socket and frees the skb.
*
* Every return path in this function returns 0.
*/
int tcp_v4_do_rcv(struct sock *sk, struct sk_buff *skb)
{
enum skb_drop_reason reason;
struct sock *rsk;
if (sk->sk_state == TCP_ESTABLISHED) { /* Fast path */
struct dst_entry *dst;
dst = rcu_dereference_protected(sk->sk_rx_dst,
lockdep_sock_is_held(sk));
sock_rps_save_rxhash(sk, skb);
sk_mark_napi_id(sk, skb);
if (dst) {
/* Forget the cached dst when skb->skb_iif no longer matches
* sk->sk_rx_dst_ifindex or when dst->ops->check() rejects it.
*/
if (sk->sk_rx_dst_ifindex != skb->skb_iif ||
!INDIRECT_CALL_1(dst->ops->check, ipv4_dst_check,
dst, 0)) {
RCU_INIT_POINTER(sk->sk_rx_dst, NULL);
dst_release(dst);
}
}
tcp_rcv_established(sk, skb);
return 0;
}
/* Default drop reason for the discard label; csum_err overrides it. */
reason = SKB_DROP_REASON_NOT_SPECIFIED;
if (tcp_checksum_complete(skb))
goto csum_err;
if (sk->sk_state == TCP_LISTEN) {
struct sock *nsk = tcp_v4_cookie_check(sk, skb);
if (!nsk)
goto discard;
if (nsk != sk) {
/* A different socket came back: let it process the skb and
* reset on its behalf if that fails.
*/
if (tcp_child_process(sk, nsk, skb)) {
rsk = nsk;
goto reset;
}
return 0;
}
} else
sock_rps_save_rxhash(sk, skb);
if (tcp_rcv_state_process(sk, skb)) {
rsk = sk;
goto reset;
}
return 0;
reset:
/* rsk is the socket whose processing failed (nsk or sk). */
tcp_v4_send_reset(rsk, skb);
discard:
kfree_skb_reason(skb, reason);
/* Be careful here. If this function gets more complicated and
* gcc suffers from register pressure on the x86, sk (in %ebx)
* might be destroyed here. This current version compiles correctly,
* but you have been warned.
*/
return 0;
csum_err:
/* Record the checksum failure, then share the discard path above. */
reason = SKB_DROP_REASON_TCP_CSUM;
trace_tcp_bad_csum(skb);
TCP_INC_STATS(sock_net(sk), TCP_MIB_CSUMERRORS);
TCP_INC_STATS(sock_net(sk), TCP_MIB_INERRS);
goto discard;
}
EXPORT_SYMBOL(tcp_v4_do_rcv);
/*
* From tcp_input.c
*
* tcp_v4_rcv - receive one skb as an IPv4 TCP segment.
*
* Validates the TCP header and sets up checksum state, looks up the owning
* socket with __inet_lookup_skb(), then dispatches on the socket state:
*  - TCP_TIME_WAIT is handled at do_time_wait;
*  - TCP_NEW_SYN_RECV is resolved against its listener via tcp_check_req();
*  - TCP_LISTEN is passed to tcp_v4_do_rcv() without taking the socket lock;
*  - every other state runs under bh_lock_sock_nested(): tcp_v4_do_rcv() is
*    called directly when the socket is not owned by user context, otherwise
*    the skb is queued with tcp_add_backlog().
*
* Returns the result of tcp_v4_do_rcv() on the delivery paths, and 0 on every
* path that drops, queues or hands off the skb.
*/
int tcp_v4_rcv(struct sk_buff *skb)
{
struct net *net = dev_net(skb->dev);
enum skb_drop_reason drop_reason;
int sdif = inet_sdif(skb);
int dif = inet_iif(skb);
const struct iphdr *iph;
const struct tcphdr *th;
bool refcounted;
struct sock *sk;
int ret;
drop_reason = SKB_DROP_REASON_NOT_SPECIFIED;
/* Packets whose pkt_type is not PACKET_HOST are freed before the
* TCP_MIB_INSEGS counter is touched.
*/
if (skb->pkt_type != PACKET_HOST)
goto discard_it;
/* Count it even if it's bad */
__TCP_INC_STATS(net, TCP_MIB_INSEGS);
if (!pskb_may_pull(skb, sizeof(struct tcphdr)))
goto discard_it;
th = (const struct tcphdr *)skb->data;
/* doff is compared in 4-byte units: reject a header length smaller
* than struct tcphdr.
*/
if (unlikely(th->doff < sizeof(struct tcphdr) / 4)) {
drop_reason = SKB_DROP_REASON_PKT_TOO_SMALL;
goto bad_packet;
}
if (!pskb_may_pull(skb, th->doff * 4))
goto discard_it;
/* An explanation is required here, I think.
* Packet length and doff are validated by header prediction,
* provided case of th->doff==0 is eliminated.
* So, we defer the checks. */
if (skb_checksum_init(skb, IPPROTO_TCP, inet_compute_pseudo))
goto csum_error;
/* Re-derive the header pointers from the skb.
* NOTE(review): presumably needed because pskb_may_pull() above can
* move skb->data - confirm.
*/
th = (const struct tcphdr *)skb->data;
iph = ip_hdr(skb);
lookup:
/* refcounted is filled in by the lookup; when it is true the exit
* paths below release the socket with sock_put().
*/
sk = __inet_lookup_skb(net->ipv4.tcp_death_row.hashinfo,
skb, __tcp_hdrlen(th), th->source,
th->dest, sdif, &refcounted);
if (!sk)
goto no_tcp_socket;
process:
if (sk->sk_state == TCP_TIME_WAIT)
goto do_time_wait;
if (sk->sk_state == TCP_NEW_SYN_RECV) {
/* The lookup result is viewed as a request_sock here; processing
* continues on its listener (req->rsk_listener).
*/
struct request_sock *req = inet_reqsk(sk);
bool req_stolen = false;
struct sock *nsk;
sk = req->rsk_listener;
if (!xfrm4_policy_check(sk, XFRM_POLICY_IN, skb))
drop_reason = SKB_DROP_REASON_XFRM_POLICY;
else
drop_reason = tcp_inbound_hash(sk, req, skb,
&iph->saddr, &iph->daddr,
AF_INET, dif, sdif);
if (unlikely(drop_reason)) {
sk_drops_add(sk, skb);
reqsk_put(req);
goto discard_it;
}
if (tcp_checksum_complete(skb)) {
reqsk_put(req);
goto csum_error;
}
if (unlikely(sk->sk_state != TCP_LISTEN)) {
/* The listener left TCP_LISTEN: try to move the request to
* another socket; if none is found, drop the request and
* redo the lookup.
*/
nsk = reuseport_migrate_sock(sk, req_to_sk(req), skb);
if (!nsk) {
inet_csk_reqsk_queue_drop_and_put(sk, req);
goto lookup;
}
sk = nsk;
/* reuseport_migrate_sock() has already held one sk_refcnt
* before returning.
*/
} else {
/* We own a reference on the listener, increase it again
* as we might lose it too soon.
*/
sock_hold(sk);
}
/* Either branch above left us holding a reference on sk. */
refcounted = true;
nsk = NULL;
/* A non-zero tcp_filter() result leaves nsk NULL, so the skb is
* dropped below with reason SOCKET_FILTER.
*/
if (!tcp_filter(sk, skb)) {
th = (const struct tcphdr *)skb->data;
iph = ip_hdr(skb);
tcp_v4_fill_cb(skb, iph, th);
nsk = tcp_check_req(sk, skb, req, false, &req_stolen);
} else {
drop_reason = SKB_DROP_REASON_SOCKET_FILTER;
}
if (!nsk) {
reqsk_put(req);
if (req_stolen) {
/* Another cpu got exclusive access to req
* and created a full blown socket.
* Try to feed this packet to this socket
* instead of discarding it.
*/
tcp_v4_restore_cb(skb);
sock_put(sk);
goto lookup;
}
goto discard_and_relse;
}
nf_reset_ct(skb);
if (nsk == sk) {
/* tcp_check_req() returned the listener itself: undo the cb
* setup and continue with the common processing below.
*/
reqsk_put(req);
tcp_v4_restore_cb(skb);
} else if (tcp_child_process(sk, nsk, skb)) {
tcp_v4_send_reset(nsk, skb);
goto discard_and_relse;
} else {
/* The child socket took the skb; release the listener. */
sock_put(sk);
return 0;
}
}
if (static_branch_unlikely(&ip4_min_ttl)) {
/* min_ttl can be changed concurrently from do_ip_setsockopt() */
if (unlikely(iph->ttl < READ_ONCE(inet_sk(sk)->min_ttl))) {
__NET_INC_STATS(net, LINUX_MIB_TCPMINTTLDROP);
drop_reason = SKB_DROP_REASON_TCP_MINTTL;
goto discard_and_relse;
}
}
if (!xfrm4_policy_check(sk, XFRM_POLICY_IN, skb)) {
drop_reason = SKB_DROP_REASON_XFRM_POLICY;
goto discard_and_relse;
}
drop_reason = tcp_inbound_hash(sk, NULL, skb, &iph->saddr, &iph->daddr,
AF_INET, dif, sdif);
if (drop_reason)
goto discard_and_relse;
nf_reset_ct(skb);
if (tcp_filter(sk, skb)) {
drop_reason = SKB_DROP_REASON_SOCKET_FILTER;
goto discard_and_relse;
}
/* Header pointers are re-read after tcp_filter() before the control
* block is filled in.
*/
th = (const struct tcphdr *)skb->data;
iph = ip_hdr(skb);
tcp_v4_fill_cb(skb, iph, th);
skb->dev = NULL;
/* Listeners are processed without bh_lock_sock_nested(). */
if (sk->sk_state == TCP_LISTEN) {
ret = tcp_v4_do_rcv(sk, skb);
goto put_and_return;
}
sk_incoming_cpu_update(sk);
bh_lock_sock_nested(sk);
tcp_segs_in(tcp_sk(sk), skb);
ret = 0;
if (!sock_owned_by_user(sk)) {
ret = tcp_v4_do_rcv(sk, skb);
} else {
/* NOTE(review): this goto skips the bh_unlock_sock() below;
* presumably tcp_add_backlog() releases the lock itself when it
* returns non-zero - confirm.
*/
if (tcp_add_backlog(sk, skb, &drop_reason))
goto discard_and_relse;
}
bh_unlock_sock(sk);
put_and_return:
if (refcounted)
sock_put(sk);
return ret;
no_tcp_socket:
drop_reason = SKB_DROP_REASON_NO_SOCKET;
if (!xfrm4_policy_check(NULL, XFRM_POLICY_IN, skb))
goto discard_it;
tcp_v4_fill_cb(skb, iph, th);
/* csum_error and bad_packet are labels inside this if-body, so the
* earlier "goto csum_error" / "goto bad_packet" sites share its
* statistics updates and then fall out to discard_it. Only a segment
* with a good checksum and no socket reaches the reset in the else.
*/
if (tcp_checksum_complete(skb)) {
csum_error:
drop_reason = SKB_DROP_REASON_TCP_CSUM;
trace_tcp_bad_csum(skb);
__TCP_INC_STATS(net, TCP_MIB_CSUMERRORS);
bad_packet:
__TCP_INC_STATS(net, TCP_MIB_INERRS);
} else {
tcp_v4_send_reset(NULL, skb);
}
discard_it:
SKB_DR_OR(drop_reason, NOT_SPECIFIED);
/* Discard frame. */
kfree_skb_reason(skb, drop_reason);
return 0;
discard_and_relse:
/* Account the drop on sk and release our reference, if any. */
sk_drops_add(sk, skb);
if (refcounted)
sock_put(sk);
goto discard_it;
do_time_wait:
/* sk is accessed through inet_twsk() on this path; every early exit
* drops it with inet_twsk_put().
*/
if (!xfrm4_policy_check(NULL, XFRM_POLICY_IN, skb)) {
drop_reason = SKB_DROP_REASON_XFRM_POLICY;
inet_twsk_put(inet_twsk(sk));
goto discard_it;
}
tcp_v4_fill_cb(skb, iph, th);
if (tcp_checksum_complete(skb)) {
inet_twsk_put(inet_twsk(sk));
goto csum_error;
}
switch (tcp_timewait_state_process(inet_twsk(sk), skb, th)) {
case TCP_TW_SYN: {
/* Look for a listener; if one exists, retire the timewait socket
* and restart processing on the listener with refcounted cleared.
* Without a listener, fall through and answer with an ACK.
*/
struct sock *sk2 = inet_lookup_listener(net,
net->ipv4.tcp_death_row.hashinfo,
skb, __tcp_hdrlen(th),
iph->saddr, th->source,
iph->daddr, th->dest,
inet_iif(skb),
sdif);
if (sk2) {
inet_twsk_deschedule_put(inet_twsk(sk));
sk = sk2;
tcp_v4_restore_cb(skb);
refcounted = false;
goto process;
}
}
/* to ACK */
fallthrough;
case TCP_TW_ACK:
tcp_v4_timewait_ack(sk, skb);
break;
case TCP_TW_RST:
tcp_v4_send_reset(sk, skb);
inet_twsk_deschedule_put(inet_twsk(sk));
goto discard_it;
case TCP_TW_SUCCESS:;
}
/* All remaining timewait outcomes free the skb. */
goto discard_it;
}
/*
 * Deliver an skb to the handler registered in inet_protos[] for @protocol.
 *
 * Raw sockets are offered the packet first through raw_local_deliver().
 * When a handler exists it is invoked after an optional xfrm policy check;
 * a negative handler result is taken as "-next_protocol" and the whole
 * delivery is repeated for that protocol. Without a handler the skb is
 * either consumed (raw delivery happened) or freed with
 * SKB_DROP_REASON_IP_NOPROTO, sending an ICMP protocol-unreachable first
 * when the xfrm policy check passes.
 */
void ip_protocol_deliver_rcu(struct net *net, struct sk_buff *skb, int protocol)
{
	for (;;) {
		const struct net_protocol *proto_ops;
		int raw_hit, rc;

		raw_hit = raw_local_deliver(skb, protocol);
		proto_ops = rcu_dereference(inet_protos[protocol]);

		if (!proto_ops) {
			/* No registered handler for this protocol number. */
			if (raw_hit) {
				__IP_INC_STATS(net, IPSTATS_MIB_INDELIVERS);
				consume_skb(skb);
				return;
			}

			if (xfrm4_policy_check(NULL, XFRM_POLICY_IN, skb)) {
				__IP_INC_STATS(net, IPSTATS_MIB_INUNKNOWNPROTOS);
				icmp_send(skb, ICMP_DEST_UNREACH,
					  ICMP_PROT_UNREACH, 0);
			}
			kfree_skb_reason(skb, SKB_DROP_REASON_IP_NOPROTO);
			return;
		}

		if (!proto_ops->no_policy) {
			if (!xfrm4_policy_check(NULL, XFRM_POLICY_IN, skb)) {
				kfree_skb_reason(skb,
						 SKB_DROP_REASON_XFRM_POLICY);
				return;
			}
			nf_reset_ct(skb);
		}

		rc = INDIRECT_CALL_2(proto_ops->handler, tcp_v4_rcv, udp_rcv,
				     skb);
		if (rc >= 0) {
			__IP_INC_STATS(net, IPSTATS_MIB_INDELIVERS);
			return;
		}

		/* Negative result: resubmit the skb as protocol -rc. */
		protocol = -rc;
	}
}
/*
 * Final step of local delivery: clear the delivery time, pull the network
 * header off the skb and dispatch it by the IP header's protocol field
 * inside an RCU read-side section. Always returns 0.
 */
static int ip_local_deliver_finish(struct net *net, struct sock *sk, struct sk_buff *skb)
{
	int l4_proto;

	skb_clear_delivery_time(skb);
	__skb_pull(skb, skb_network_header_len(skb));

	rcu_read_lock();
	l4_proto = ip_hdr(skb)->protocol;
	ip_protocol_deliver_rcu(net, skb, l4_proto);
	rcu_read_unlock();

	return 0;
}
/*
 *	Hand a locally destined IPv4 packet to the upper layers.
 *
 *	A fragment is first passed to ip_defrag(); when that returns non-zero
 *	this function returns 0 without invoking the hook. Otherwise the skb
 *	traverses NF_INET_LOCAL_IN and continues in ip_local_deliver_finish().
 */
int ip_local_deliver(struct sk_buff *skb)
{
	struct net *ns = dev_net(skb->dev);

	/* Reassemble IP fragments. */
	if (ip_is_fragment(ip_hdr(skb)) &&
	    ip_defrag(ns, skb, IP_DEFRAG_LOCAL_DELIVER))
		return 0;

	/* skb->dev is read only here, after any reassembly has run. */
	return NF_HOOK(NFPROTO_IPV4, NF_INET_LOCAL_IN,
		       ns, NULL, skb, skb->dev, NULL,
		       ip_local_deliver_finish);
}
EXPORT_SYMBOL(ip_local_deliver);
/* Pass an incoming skb to the ->input handler of its attached dst entry
 * and return that handler's result.
 */
static inline int dst_input(struct sk_buff *skb)
{
	int verdict;

	verdict = INDIRECT_CALL_INET(skb_dst(skb)->input,
				     ip6_input, ip_local_deliver, skb);
	return verdict;
}
/*
 * Continuation of ip_rcv(), passed to NF_HOOK as its final callback: run the
 * skb through l3mdev_ip_rcv() and ip_rcv_finish_core(), then hand it to
 * dst_input() unless the core step reported NET_RX_DROP.
 */
static int ip_rcv_finish(struct net *net, struct sock *sk, struct sk_buff *skb)
{
	/* Captured before l3mdev_ip_rcv() gets a chance to replace the skb. */
	struct net_device *ingress = skb->dev;
	int verdict;

	/* if ingress device is enslaved to an L3 master device pass the
	 * skb to its handler for processing
	 */
	skb = l3mdev_ip_rcv(skb);
	if (skb == NULL)
		return NET_RX_SUCCESS;

	verdict = ip_rcv_finish_core(net, sk, skb, ingress, NULL);
	if (verdict == NET_RX_DROP)
		return verdict;

	return dst_input(skb);
}
/*
 * IP receive entry point.
 *
 * ip_rcv_core() may return NULL, in which case NET_RX_DROP is reported.
 * Otherwise the skb traverses the NF_INET_PRE_ROUTING hook and continues
 * in ip_rcv_finish().
 */
int ip_rcv(struct sk_buff *skb, struct net_device *dev, struct packet_type *pt,
	   struct net_device *orig_dev)
{
	struct net *ns = dev_net(dev);

	skb = ip_rcv_core(skb, ns);
	if (skb)
		return NF_HOOK(NFPROTO_IPV4, NF_INET_PRE_ROUTING,
			       ns, NULL, skb, dev, NULL,
			       ip_rcv_finish);

	return NET_RX_DROP;
}
/*
* __netif_receive_skb_core - core receive processing for one skb.
*
* Runs the skb through the ptype_all handler lists, the optional ingress
* hooks, VLAN handling and the device rx_handler, then walks the protocol
* handler lists for skb->protocol. Handlers are delivered one step late:
* each match first delivers to the previously remembered handler (pt_prev)
* and then remembers itself, so the last match is never called here.
* It is returned through @ppt_prev for the caller to invoke.
*
* @pskb: in/out skb pointer; the skb may be replaced or become NULL during
*        processing, and the final value is written back on exit.
* @pfmemalloc: when true, the ptype_all lists are skipped and the skb is
*        dropped unless skb_pfmemalloc_protocol() accepts it.
* @ppt_prev: set to the last pending handler; written only when one exists.
*
* Returns NET_RX_DROP by default or on drop, NET_RX_SUCCESS when the
* rx_handler consumed the skb, or the result of the last deliver_skb() /
* ingress step that assigned ret.
*/
static int __netif_receive_skb_core(struct sk_buff **pskb, bool pfmemalloc,
struct packet_type **ppt_prev)
{
struct packet_type *ptype, *pt_prev;
rx_handler_func_t *rx_handler;
struct sk_buff *skb = *pskb;
struct net_device *orig_dev;
bool deliver_exact = false;
int ret = NET_RX_DROP;
__be16 type;
net_timestamp_check(!READ_ONCE(netdev_tstamp_prequeue), skb);
trace_netif_receive_skb(skb);
/* Remember the original device; skb->dev can differ from it later
* (see the skb->dev != orig_dev check near the end).
*/
orig_dev = skb->dev;
skb_reset_network_header(skb);
if (!skb_transport_header_was_set(skb))
skb_reset_transport_header(skb);
skb_reset_mac_len(skb);
pt_prev = NULL;
another_round:
/* Re-entered whenever a later stage asks for the skb to be processed
* again (ingress hook, VLAN receive, rx_handler).
*/
skb->skb_iif = skb->dev->ifindex;
__this_cpu_inc(softnet_data.processed);
if (static_branch_unlikely(&generic_xdp_needed_key)) {
int ret2;
migrate_disable();
ret2 = do_xdp_generic(rcu_dereference(skb->dev->xdp_prog), skb);
migrate_enable();
/* Any result other than XDP_PASS ends processing here. */
if (ret2 != XDP_PASS) {
ret = NET_RX_DROP;
goto out;
}
}
if (eth_type_vlan(skb->protocol)) {
skb = skb_vlan_untag(skb);
if (unlikely(!skb))
goto out;
}
if (skb_skip_tc_classify(skb))
goto skip_classify;
if (pfmemalloc)
goto skip_taps;
/* Deferred delivery over the global and per-device ptype_all lists:
* deliver to the previous entry, remember the current one.
*/
list_for_each_entry_rcu(ptype, &ptype_all, list) {
if (pt_prev)
ret = deliver_skb(skb, pt_prev, orig_dev);
pt_prev = ptype;
}
list_for_each_entry_rcu(ptype, &skb->dev->ptype_all, list) {
if (pt_prev)
ret = deliver_skb(skb, pt_prev, orig_dev);
pt_prev = ptype;
}
skip_taps:
#ifdef CONFIG_NET_INGRESS
if (static_branch_unlikely(&ingress_needed_key)) {
bool another = false;
nf_skip_egress(skb, true);
skb = sch_handle_ingress(skb, &pt_prev, &ret, orig_dev,
&another);
if (another)
goto another_round;
if (!skb)
goto out;
nf_skip_egress(skb, false);
if (nf_ingress(skb, &pt_prev, &ret, orig_dev) < 0)
goto out;
}
#endif
skb_reset_redirect(skb);
skip_classify:
if (pfmemalloc && !skb_pfmemalloc_protocol(skb))
goto drop;
if (skb_vlan_tag_present(skb)) {
/* Flush the pending handler before the skb is handed to
* vlan_do_receive(), which takes &skb and may change it.
*/
if (pt_prev) {
ret = deliver_skb(skb, pt_prev, orig_dev);
pt_prev = NULL;
}
if (vlan_do_receive(&skb))
goto another_round;
else if (unlikely(!skb))
goto out;
}
rx_handler = rcu_dereference(skb->dev->rx_handler);
if (rx_handler) {
/* Same flush as above before the rx_handler sees &skb. */
if (pt_prev) {
ret = deliver_skb(skb, pt_prev, orig_dev);
pt_prev = NULL;
}
switch (rx_handler(&skb)) {
case RX_HANDLER_CONSUMED:
ret = NET_RX_SUCCESS;
goto out;
case RX_HANDLER_ANOTHER:
goto another_round;
case RX_HANDLER_EXACT:
deliver_exact = true;
break;
case RX_HANDLER_PASS:
break;
default:
BUG();
}
}
if (unlikely(skb_vlan_tag_present(skb)) && !netdev_uses_dsa(skb->dev)) {
/* check_vlan_id is re-entered from below after an outer tag with
* id 0 has been stripped.
*/
check_vlan_id:
if (skb_vlan_tag_get_id(skb)) {
/* Vlan id is non 0 and vlan_do_receive() above couldn't
* find vlan device.
*/
skb->pkt_type = PACKET_OTHERHOST;
} else if (eth_type_vlan(skb->protocol)) {
/* Outer header is 802.1P with vlan 0, inner header is
* 802.1Q or 802.1AD and vlan_do_receive() above could
* not find vlan dev for vlan id 0.
*/
__vlan_hwaccel_clear_tag(skb);
skb = skb_vlan_untag(skb);
if (unlikely(!skb))
goto out;
if (vlan_do_receive(&skb))
/* After stripping off 802.1P header with vlan 0
* vlan dev is found for inner header.
*/
goto another_round;
else if (unlikely(!skb))
goto out;
else
/* We have stripped outer 802.1P vlan 0 header.
* But could not find vlan dev.
* check again for vlan id to set OTHERHOST.
*/
goto check_vlan_id;
}
/* Note: we might in the future use prio bits
* and set skb->priority like in vlan_do_receive()
* For the time being, just ignore Priority Code Point
*/
__vlan_hwaccel_clear_tag(skb);
}
type = skb->protocol;
/* deliver only exact match when indicated */
if (likely(!deliver_exact)) {
deliver_ptype_list_skb(skb, &pt_prev, orig_dev, type,
&ptype_base[ntohs(type) &
PTYPE_HASH_MASK]);
}
deliver_ptype_list_skb(skb, &pt_prev, orig_dev, type,
&orig_dev->ptype_specific);
/* Also consult the current device's list when skb->dev has changed. */
if (unlikely(skb->dev != orig_dev)) {
deliver_ptype_list_skb(skb, &pt_prev, orig_dev, type,
&skb->dev->ptype_specific);
}
if (pt_prev) {
if (unlikely(skb_orphan_frags_rx(skb, GFP_ATOMIC)))
goto drop;
/* Leave the final handler for the caller to invoke. */
*ppt_prev = pt_prev;
} else {
/* The drop label lives inside this else-body so the "goto drop"
* sites above share the no-handler accounting and free.
*/
drop:
if (!deliver_exact)
dev_core_stats_rx_dropped_inc(skb->dev);
else
dev_core_stats_rx_nohandler_inc(skb->dev);
kfree_skb_reason(skb, SKB_DROP_REASON_UNHANDLED_PROTO);
/* Jamal, now you will not able to escape explaining
* me how you were going to use this. :-)
*/
ret = NET_RX_DROP;
}
out:
/* The invariant here is that if *ppt_prev is not NULL
* then skb should also be non-NULL.
*
* Apparently *ppt_prev assignment above holds this invariant due to
* skb dereferencing near it.
*/
*pskb = skb;
return ret;
}
/*
 * Run one skb through __netif_receive_skb_core() and, when that leaves a
 * pending packet_type handler behind, invoke it. Returns the handler's
 * result in that case, otherwise the core function's result.
 */
static int __netif_receive_skb_one_core(struct sk_buff *skb, bool pfmemalloc)
{
	/* Device the skb arrived on, saved before the core may change skb. */
	struct net_device *orig_dev = skb->dev;
	struct packet_type *last_handler = NULL;
	int status;

	status = __netif_receive_skb_core(&skb, pfmemalloc, &last_handler);
	if (!last_handler)
		return status;

	return INDIRECT_CALL_INET(last_handler->func, ipv6_rcv, ip_rcv, skb,
				  skb->dev, last_handler, orig_dev);
}
/**
 *	netif_receive_skb_core - special purpose version of netif_receive_skb
 *	@skb: buffer to process
 *
 *	A more direct receive path than netif_receive_skb(): the skb is passed
 *	to __netif_receive_skb_one_core() with pfmemalloc set to false, inside
 *	an RCU read-side critical section. Intended only for callers that need
 *	to skip RPS and Generic XDP. Handling of ``(page_is_)pfmemalloc`` is
 *	left to the caller.
 *
 *	This function may only be called from softirq context and interrupts
 *	should be enabled.
 *
 *	Return values (usually ignored):
 *	NET_RX_SUCCESS: no congestion
 *	NET_RX_DROP: packet was dropped
 */
int netif_receive_skb_core(struct sk_buff *skb)
{
	int status;

	rcu_read_lock();
	status = __netif_receive_skb_one_core(skb, false);
	rcu_read_unlock();

	return status;
}
EXPORT_SYMBOL(netif_receive_skb_core);
/*
 * Poll routine for the lapbeth device that embeds @napi.
 *
 * Dequeues up to @budget skbs from the device's rx_queue and feeds each one
 * to netif_receive_skb_core(). napi_complete() is called when fewer than
 * @budget packets were handled. Returns the number of packets processed.
 */
static int lapbeth_napi_poll(struct napi_struct *napi, int budget)
{
	struct lapbethdev *lapbeth = container_of(napi, struct lapbethdev,
						  napi);
	int done = 0;

	while (done < budget) {
		struct sk_buff *frame = skb_dequeue(&lapbeth->rx_queue);

		if (!frame)
			break;

		netif_receive_skb_core(frame);
		done++;
	}

	/* Stopping short of the budget means the queue ran empty. */
	if (done < budget)
		napi_complete(napi);

	return done;
}