创建网络套接字的时候,操作系统会创建很多数据结构,其中有一个叫做:
c
/*
 * One of the data structures the kernel creates when a network socket is
 * created. It links the file layer (file) to the protocol layer (sk, ops).
 */
struct socket {
socket_state state;
kmemcheck_bitfield_begin(type);
short type; /* presumably the SOCK_STREAM / SOCK_DGRAM value given at creation - TODO confirm */
kmemcheck_bitfield_end(type);
unsigned long flags;
/*
* Please keep fasync_list & wait fields in the same cache line
*/
struct fasync_struct *fasync_list;
wait_queue_head_t wait; /* wait queue: a task is linked in here while network data is not ready */
struct file *file; /* back-pointer to the struct file this socket is attached to */
struct sock *sk; /* network-side object; points at the start of a struct udp_sock or struct tcp_sock */
const struct proto_ops *ops; /* table of the socket methods that get called (bind, connect, accept, ...) */
};
而网络服务的本质是进程,进程在内核当中叫做struct task_struct
,Linux一切皆文件,每个进程都有自己的文件描述符表struct files_struct
,里面包含了struct file *fd_array[]
,当文件被打开之后会创建struct file
,里面有一个字段叫:
c
/* Member of struct file; for a network file it points at the struct socket. */
void *private_data;
当是网络服务的时候,它就会指向struct socket
,而struct socket
里面有一个字段struct file *file;
,它会回指向文件。
所以最终网络文件挂接到struct file
之下,在应用层通过文件描述符就能找到网络文件。
struct socket
结构体里面有一个wait_queue_head_t wait;
,它是一个自定义类型,转到定义:
c
/*
 * Head of a wait queue: a blocking task links itself into task_list and
 * waits there.
 */
struct __wait_queue_head {
spinlock_t lock; /* presumably serializes access to task_list - TODO confirm */
struct list_head task_list; /* queue of tasks waiting on this head */
};
/* Short name used by embedding structures, e.g. the `wait` member of struct socket. */
typedef struct __wait_queue_head wait_queue_head_t;
里面有一个进程队列,进程在阻塞等待的时候,实际上要将自己的pcb
(即struct task_struct)链入指定的数据结构里面。
所以,当网络数据不就绪的时候,将指定的进程挂接到这个wait
里面等待即可。
const struct proto_ops *ops
字段就是我们所调用的方法:
cstruct proto_ops { int family; struct module *owner; int (*release) (struct socket *sock); int (*bind) (struct socket *sock, struct sockaddr *myaddr, int sockaddr_len); int (*connect) (struct socket *sock, struct sockaddr *vaddr, int sockaddr_len, int flags); int (*socketpair)(struct socket *sock1, struct socket *sock2); int (*accept) (struct socket *sock, struct socket *newsock, int flags); int (*getname) (struct socket *sock, struct sockaddr *addr, int *sockaddr_len, int peer); unsigned int (*poll) (struct file *file, struct socket *sock, struct poll_table_struct *wait); int (*ioctl) (struct socket *sock, unsigned int cmd, unsigned long arg); int (*compat_ioctl) (struct socket *sock, unsigned int cmd, unsigned long arg); int (*listen) (struct socket *sock, int len); int (*shutdown) (struct socket *sock, int flags); int (*setsockopt)(struct socket *sock, int level, int optname, char __user *optval, unsigned int optlen); int (*getsockopt)(struct socket *sock, int level, int optname, char __user *optval, int __user *optlen); int (*compat_setsockopt)(struct socket *sock, int level, int optname, char __user *optval, unsigned int optlen); int (*compat_getsockopt)(struct socket *sock, int level, int optname, char __user *optval, int __user *optlen); int (*sendmsg) (struct kiocb *iocb, struct socket *sock, struct msghdr *m, size_t total_len); int (*recvmsg) (struct kiocb *iocb, struct socket *sock, struct msghdr *m, size_t total_len, int flags); int (*mmap) (struct file *file, struct socket *sock, struct vm_area_struct * vma); ssize_t (*sendpage) (struct socket *sock, struct page *page, int offset, size_t size, int flags); ssize_t (*splice_read)(struct socket *sock, loff_t *ppos, struct pipe_inode_info *pipe, size_t len, unsigned int flags); };
从操作系统(文件层)通往网络协议栈,靠的是struct sock *sk;
,它指向的struct sock里面有接收队列和发送队列:
c
/* Members of struct sock: its two packet queues. */
struct sk_buff_head sk_receive_queue; /* receive queue */
struct sk_buff_head sk_write_queue; /* send queue */
在创建套接字的时候传入SOCK_STREAM
或者SOCK_DGRAM
,这就表明是面向字节流的还是面向数据报的,sk
就指向struct udp_sock
或者struct tcp_sock
的开头
struct udp_sock
:
cstruct udp_sock { /* inet_sock has to be the first member */ struct inet_sock inet; int pending; /* Any pending frames ? */ unsigned int corkflag; /* Cork is required */ __u16 encap_type; /* Is this an Encapsulation socket? */ /* * Following member retains the information to create a UDP header * when the socket is uncorked. */ __u16 len; /* total length of pending frames */ /* * Fields specific to UDP-Lite. */ __u16 pcslen; __u16 pcrlen; /* indicator bits used by pcflag: */ #define UDPLITE_BIT 0x1 /* set by udplite proto init function */ #define UDPLITE_SEND_CC 0x2 /* set via udplite setsockopt */ #define UDPLITE_RECV_CC 0x4 /* set via udplite setsocktopt */ __u8 pcflag; /* marks socket as UDP-Lite if > 0 */ __u8 unused[3]; /* * For encapsulation sockets. */ int (*encap_rcv)(struct sock *sk, struct sk_buff *skb); };
struct tcp_sock
:
cstruct tcp_sock { /* inet_connection_sock has to be the first member of tcp_sock */ struct inet_connection_sock inet_conn; u16 tcp_header_len; /* Bytes of tcp header to send */ u16 xmit_size_goal_segs; /* Goal for segmenting output packets */ /* * Header prediction flags * 0x5?10 << 16 + snd_wnd in net byte order */ __be32 pred_flags; /* * RFC793 variables by their proper names. This means you can * read the code and the spec side by side (and laugh ...) * See RFC793 and RFC1122. The RFC writes these in capitals. */ u32 rcv_nxt; /* What we want to receive next */ u32 copied_seq; /* Head of yet unread data */ u32 rcv_wup; /* rcv_nxt on last window update sent */ u32 snd_nxt; /* Next sequence we send */ u32 snd_una; /* First byte we want an ack for */ u32 snd_sml; /* Last byte of the most recently transmitted small packet */ u32 rcv_tstamp; /* timestamp of last received ACK (for keepalives) */ u32 lsndtime; /* timestamp of last sent data packet (for restart window) */ /* Data for direct copy to user */ struct { struct sk_buff_head prequeue; struct task_struct *task; struct iovec *iov; int memory; int len; #ifdef CONFIG_NET_DMA /* members for async copy */ struct dma_chan *dma_chan; int wakeup; struct dma_pinned_list *pinned_list; dma_cookie_t dma_cookie; #endif } ucopy; u32 snd_wl1; /* Sequence for window update */ u32 snd_wnd; /* The window we expect to receive */ u32 max_window; /* Maximal window ever seen from peer */ u32 mss_cache; /* Cached effective mss, not including SACKS */ u32 window_clamp; /* Maximal window to advertise */ u32 rcv_ssthresh; /* Current window clamp */ u32 frto_highmark; /* snd_nxt when RTO occurred */ u16 advmss; /* Advertised MSS */ u8 frto_counter; /* Number of new acks after RTO */ u8 nonagle; /* Disable Nagle algorithm? 
*/ /* RTT measurement */ u32 srtt; /* smoothed round trip time << 3 */ u32 mdev; /* medium deviation */ u32 mdev_max; /* maximal mdev for the last rtt period */ u32 rttvar; /* smoothed mdev_max */ u32 rtt_seq; /* sequence number to update rttvar */ u32 packets_out; /* Packets which are "in flight" */ u32 retrans_out; /* Retransmitted packets out */ u16 urg_data; /* Saved octet of OOB data and control flags */ u8 ecn_flags; /* ECN status bits. */ u8 reordering; /* Packet reordering metric. */ u32 snd_up; /* Urgent pointer */ u8 keepalive_probes; /* num of allowed keep alive probes */ /* * Options received (usually on last packet, some only on SYN packets). */ struct tcp_options_received rx_opt; /* * Slow start and congestion control (see also Nagle, and Karn & Partridge) */ u32 snd_ssthresh; /* Slow start size threshold */ u32 snd_cwnd; /* Sending congestion window */ u32 snd_cwnd_cnt; /* Linear increase counter */ u32 snd_cwnd_clamp; /* Do not allow snd_cwnd to grow above this */ u32 snd_cwnd_used; u32 snd_cwnd_stamp; u32 rcv_wnd; /* Current receiver window */ u32 write_seq; /* Tail(+1) of data held in tcp send buffer */ u32 pushed_seq; /* Last pushed seq, required to talk to windows */ u32 lost_out; /* Lost packets */ u32 sacked_out; /* SACK'd packets */ u32 fackets_out; /* FACK'd packets */ u32 tso_deferred; u32 bytes_acked; /* Appropriate Byte Counting - RFC3465 */ /* from STCP, retrans queue hinting */ struct sk_buff* lost_skb_hint; struct sk_buff *scoreboard_skb_hint; struct sk_buff *retransmit_skb_hint; struct sk_buff_head out_of_order_queue; /* Out of order segments go here */ /* SACKs data, these 2 need to be together (see tcp_build_and_update_options) */ struct tcp_sack_block duplicate_sack[1]; /* D-SACK block */ struct tcp_sack_block selective_acks[4]; /* The SACKS themselves*/ struct tcp_sack_block recv_sack_cache[4]; struct sk_buff *highest_sack; /* highest skb with SACK received * (validity guaranteed only if * sacked_out > 0) */ int lost_cnt_hint; u32 
retransmit_high; /* L-bits may be on up to this seqno */ u32 lost_retrans_low; /* Sent seq after any rxmit (lowest) */ u32 prior_ssthresh; /* ssthresh saved at recovery start */ u32 high_seq; /* snd_nxt at onset of congestion */ u32 retrans_stamp; /* Timestamp of the last retransmit, * also used in SYN-SENT to remember stamp of * the first SYN. */ u32 undo_marker; /* tracking retrans started here. */ int undo_retrans; /* number of undoable retransmissions. */ u32 total_retrans; /* Total retransmits for entire connection */ u32 urg_seq; /* Seq of received urgent pointer */ unsigned int keepalive_time; /* time before keep alive takes place */ unsigned int keepalive_intvl; /* time interval between keep alive probes */ int linger2; /* Receiver side RTT estimation */ struct { u32 rtt; u32 seq; u32 time; } rcv_rtt_est; /* Receiver queue space */ struct { int space; u32 seq; u32 time; } rcvq_space; /* TCP-specific MTU probe information. */ struct { u32 probe_seq_start; u32 probe_seq_end; } mtu_probe; #ifdef CONFIG_TCP_MD5SIG /* TCP AF-Specific parts; only used by MD5 Signature support so far */ const struct tcp_sock_af_ops *af_specific; /* TCP MD5 Signature Option information */ struct tcp_md5sig_info *md5sig_info; #endif };
之后要访问其他属性内容,只需强转就能访问(struct tcp_sock*)sk
,本质就是C语言的多态
我们的网络协议栈的本质就是:
- 用特定数据结构表述的协议
- 和特定协议匹配的方法集
如果是网络文件,
struct file
里面的const struct file_operations *f_op;
有指向网络的方法;而struct socket
里面的const struct proto_ops *ops;
也有指向网络的方法。前者解决的是对上(向应用层提供文件接口)的问题,后者解决的是对下(向协议栈)交付的问题。
操作系统会同时收到很多报文,如果这些报文上层来不及处理,那么操作系统内就会积压很多报文。对于这些报文,操作系统是需要管理起来的——先描述,再组织:
c
/*
 * Head of a queue of struct sk_buff. Its first two members are the same
 * next/prev pair that struct sk_buff starts with.
 */
struct sk_buff_head {
/* These two members must be first. */
struct sk_buff *next;
struct sk_buff *prev;
__u32 qlen; /* presumably the number of buffers currently queued - TODO confirm */
spinlock_t lock; /* presumably guards the queue against concurrent access - TODO confirm */
};
这个sk_buff
也是自定义类型:
c
/*
 * Describes one packet. The surrounding text explains that this object is
 * what moves between protocol layers and what gets queued on a socket's
 * buffers. Members elided by the author are marked with "//...".
 */
struct sk_buff {
/* These two members must be first. */
struct sk_buff *next;
struct sk_buff *prev;
//...
//...
/* NOTE(review): the three fields below presumably locate each layer's header in the buffer - confirm. */
sk_buff_data_t transport_header;
sk_buff_data_t network_header;
sk_buff_data_t mac_header;
/* These elements must be at the end, see alloc_skb() for details. */
sk_buff_data_t tail;
sk_buff_data_t end;
/* NOTE(review): head/data presumably mark the buffer start and the current packet start - confirm. */
unsigned char *head,
*data;
unsigned int truesize;
atomic_t users; /* presumably a reference count - TODO confirm */
};
将报文交给每层,实际上就是让sk_buff
在层和层之间流动,加报头就是头指针向上移动(封装),去掉报头就是头指针向下移动(解包),用sk_buff来表示一个报文,这就是先描述;
报文到了传输层之后,将报文分发给不同的文件描述符,实际上就是将sk_buff
组织到对应套接字的缓冲区(接收队列)当中,这就是再组织。
所以说建立连接和维护连接是有成本的,因为要在内核当中创建大量的数据结构。
简单示意图: