参考资料

<<linux内核网络栈源代码情景分析>>

af_inet.c文件中调用函数在协议层的实现

本文主要分析af_inet.c文件中如何根据初始化时指定的协议,调用不同协议的处理函数。af_inet.c中的inet_create函数就是根据传入的套接字类型,选择对应的协议操作函数集。

/*
 *	Excerpt of inet_create(): selects the protocol operations table from
 *	the socket type.  Parts elided by the article are marked with "...".
 */
static int inet_create(struct socket *sock, int protocol)
{
	...
	switch(sock->type)		/* dispatch on the requested socket type */
	{
		case SOCK_STREAM:	/* stream sockets use the TCP operations */
		case SOCK_SEQPACKET:
			/* A non-zero protocol other than IPPROTO_TCP is rejected. */
			if (protocol && protocol != IPPROTO_TCP)
			{
				kfree_s((void *)sk, sizeof(*sk));
				return(-EPROTONOSUPPORT);
			}
			protocol = IPPROTO_TCP;
			/* Checksum policy for TCP comes from TCP_NO_CHECK (defined elsewhere). */
			sk->no_check = TCP_NO_CHECK;
			prot = &tcp_prot;	/* protocol operations table for TCP */
			break;
		...
	}
	...
}

此时选择的是TCP协议,对应的协议操作函数集就是tcp_prot,其类型struct proto以及tcp_prot的定义如下:

struct proto {struct sk_buff *   (*wmalloc)(struct sock *sk,unsigned long size, int force,int priority);struct sk_buff * (*rmalloc)(struct sock *sk,unsigned long size, int force,int priority);void         (*wfree)(struct sock *sk, struct sk_buff *skb,unsigned long size);void          (*rfree)(struct sock *sk, struct sk_buff *skb,unsigned long size);unsigned long (*rspace)(struct sock *sk);unsigned long    (*wspace)(struct sock *sk);void         (*close)(struct sock *sk, int timeout);int              (*read)(struct sock *sk, unsigned char *to,int len, int nonblock, unsigned flags);int               (*write)(struct sock *sk, unsigned char *to,int len, int nonblock, unsigned flags);int              (*sendto)(struct sock *sk,unsigned char *from, int len, int noblock,unsigned flags, struct sockaddr_in *usin,int addr_len);int              (*recvfrom)(struct sock *sk,unsigned char *from, int len, int noblock,unsigned flags, struct sockaddr_in *usin,int *addr_len);int               (*build_header)(struct sk_buff *skb,unsigned long saddr,unsigned long daddr,struct device **dev, int type,struct options *opt, int len, int tos, int ttl);int               (*connect)(struct sock *sk,struct sockaddr_in *usin, int addr_len);struct sock *    (*accept) (struct sock *sk, int flags);void         (*queue_xmit)(struct sock *sk,struct device *dev, struct sk_buff *skb,int free);void            (*retransmit)(struct sock *sk, int all);void            (*write_wakeup)(struct sock *sk);void           (*read_wakeup)(struct sock *sk);int         (*rcv)(struct sk_buff *buff, struct device *dev,struct options *opt, unsigned long daddr,unsigned short len, unsigned long saddr,int redo, struct inet_protocol *protocol);int          (*select)(struct sock *sk, int which,select_table *wait);int            (*ioctl)(struct sock *sk, int cmd,unsigned long arg);int            (*init)(struct sock *sk);void       (*shutdown)(struct sock *sk, int how);int           (*setsockopt)(struct sock *sk, int level, int 
optname,char *optval, int optlen);int         (*getsockopt)(struct sock *sk, int level, int optname,char *optval, int *option);    unsigned short max_header;unsigned long    retransmits;struct sock *   sock_array[SOCK_ARRAY_SIZE];    // sock数组char           name[80];int                inuse,          // 是否正在被使用highestinuse;
};// 定义tcp_prot如下
struct proto tcp_prot = {sock_wmalloc,sock_rmalloc,sock_wfree,sock_rfree,sock_rspace,sock_wspace,tcp_close,tcp_read,tcp_write,tcp_sendto,tcp_recvfrom,ip_build_header,tcp_connect,tcp_accept,ip_queue_xmit,tcp_retransmit,tcp_write_wakeup,tcp_read_wakeup,tcp_rcv,tcp_select,tcp_ioctl,NULL,              // inittcp_shutdown,tcp_setsockopt,tcp_getsockopt,128,0,{NULL,},"TCP",0, 0
};

如上可知,在af_inet文件中调用的协议处理函数如,connect对应的协议函数为tcp_connect,accept接受的函数就是tcp_accept,inet_release中prot调用的close方法等。本文就来分析一下该函数的具体操作。

socket执行过程

主要就是操作连接connect,accept等操作。

tcp_connect函数
/**  This will initiate an outgoing connection. */static int tcp_connect(struct sock *sk, struct sockaddr_in *usin, int addr_len)
{struct sk_buff *buff;struct device *dev=NULL;unsigned char *ptr;int tmp;int atype;struct tcphdr *t1;struct rtable *rt;if (sk->state != TCP_CLOSE)                                                             // 检查当前状态{return(-EISCONN);}if (addr_len < 8)                                                                        // 检查当前长度return(-EINVAL);if (usin->sin_family && usin->sin_family != AF_INET)                                    // 检查当前协议类型return(-EAFNOSUPPORT);/**    connect() to INADDR_ANY means loopback (BSD'ism).*/if(usin->sin_addr.s_addr==INADDR_ANY)usin->sin_addr.s_addr=ip_my_addr();                                               // 获取地址/**  Don't want a TCP connection going to a broadcast address */if ((atype=ip_chk_addr(usin->sin_addr.s_addr)) == IS_BROADCAST || atype==IS_MULTICAST)          // 获取广播地址 并判断类型return -ENETUNREACH;sk->inuse = 1;                                                                                   // 设置正在使用 sk->daddr = usin->sin_addr.s_addr;sk->write_seq = tcp_init_seq();                                                              // 获取待发送的seqsk->window_seq = sk->write_seq;sk->rcv_ack_seq = sk->write_seq -1;                                                            // 希望接受的队列sk->err = 0;sk->dummy_th.dest = usin->sin_port;release_sock(sk);buff = sk->prot->wmalloc(sk,MAX_SYN_SIZE,0, GFP_KERNEL);                                        // 申请内存大小if (buff == NULL) {return(-ENOMEM);}sk->inuse = 1;buff->len = 24;buff->sk = sk;buff->free = 0;buff->localroute = sk->localroute;t1 = (struct tcphdr *) buff->data;/** Put in the IP header and routing stuff. */rt=ip_rt_route(sk->daddr, NULL, NULL);                                                        // 设置ip信息/**    We need to build the routing stuff from the things saved in skb. 
*/tmp = sk->prot->build_header(buff, sk->saddr, sk->daddr, &dev,IPPROTO_TCP, NULL, MAX_SYN_SIZE,sk->ip_tos,sk->ip_ttl);                     // 建立头部信息if (tmp < 0) {sk->prot->wfree(sk, buff->mem_addr, buff->mem_len);                                   // 如果失败则释放内容release_sock(sk);return(-ENETUNREACH);}buff->len += tmp;t1 = (struct tcphdr *)((char *)t1 +tmp);memcpy(t1,(void *)&(sk->dummy_th), sizeof(*t1));                                          // 拷贝内容t1->seq = ntohl(sk->write_seq++);sk->sent_seq = sk->write_seq;buff->h.seq = sk->write_seq;t1->ack = 0;t1->window = 2;t1->res1=0;t1->res2=0;t1->rst = 0;t1->urg = 0;t1->psh = 0;t1->syn = 1;t1->urg_ptr = 0;t1->doff = 6;/* use 512 or whatever user asked for */if(rt!=NULL && (rt->rt_flags&RTF_WINDOW))sk->window_clamp=rt->rt_window;elsesk->window_clamp=0;if (sk->user_mss)sk->mtu = sk->user_mss;else if(rt!=NULL && (rt->rt_flags&RTF_MTU))sk->mtu = rt->rt_mss;else {
#ifdef CONFIG_INET_SNARLif ((sk->saddr ^ sk->daddr) & default_mask(sk->saddr))
#elseif ((sk->saddr ^ sk->daddr) & dev->pa_mask)
#endifsk->mtu = 576 - HEADER_SIZE;elsesk->mtu = MAX_WINDOW;}/** but not bigger than device MTU */if(sk->mtu <32)sk->mtu = 32; /* Sanity limit */sk->mtu = min(sk->mtu, dev->mtu - HEADER_SIZE);/**  Put in the TCP options to say MTU. */ptr = (unsigned char *)(t1+1);ptr[0] = 2;ptr[1] = 4;ptr[2] = (sk->mtu) >> 8;ptr[3] = (sk->mtu) & 0xff;tcp_send_check(t1, sk->saddr, sk->daddr,sizeof(struct tcphdr) + 4, sk);                                                 // 校验和/**   This must go first otherwise a really quick response will get reset. */tcp_set_state(sk,TCP_SYN_SENT);                                                  // 设置状态为TCP_SYN_SENTsk->rto = TCP_TIMEOUT_INIT;
#if 0 /* we already did this */init_timer(&sk->retransmit_timer);
#endifsk->retransmit_timer.function=&retransmit_timer;                                  // 设置会到函数并保持回调数据sk->retransmit_timer.data = (unsigned long)sk;reset_xmit_timer(sk, TIME_WRITE, sk->rto); /* Timer for repeating the SYN until an answer */sk->retransmits = TCP_SYN_RETRIES;sk->prot->queue_xmit(sk, dev, buff, 0);                                            //发送数据reset_xmit_timer(sk, TIME_WRITE, sk->rto);tcp_statistics.TcpActiveOpens++;tcp_statistics.TcpOutSegs++;release_sock(sk);return(0);
}

该函数是connect系统调用在传输层的实现,主要工作是发送SYN请求连接数据包。在主动打开的一端,该函数依次完成参数检查、对TCP首部各字段赋值、更新socket状态,最后启动超时重发定时器。

tcp_accept函数
/**  This will accept the next outstanding connection.**/static struct sock *tcp_accept(struct sock *sk, int flags)
{struct sock *newsk;struct sk_buff *skb;/** We need to make sure that this socket is listening,* and that it has something pending.*/if (sk->state != TCP_LISTEN)                           // 检查必须为监听套接字 如果不是监听套接字则返回错误{sk->err = EINVAL;return(NULL); }/* Avoid the race. */cli();                                                    // 禁止中断sk->inuse = 1;while((skb = tcp_dequeue_established(sk)) == NULL)      // 监听套接字接受队列,检查是否存在已经完成连接的数据包{if (flags & O_NONBLOCK)                            // 如果是非阻塞状态{sti();release_sock(sk);sk->err = EAGAIN;return(NULL);                                   // 返回错误}release_sock(sk);                                   // 监听并解析头部interruptible_sleep_on(sk->sleep);if (current->signal & ~current->blocked) {sti();sk->err = ERESTARTSYS;return(NULL);}sk->inuse = 1;}sti();                                                  // 使能中断 证明有接受数据/**  Now all we need to do is return skb->sk. */newsk = skb->sk;                                      // 获取新sockkfree_skb(skb, FREE_READ);sk->ack_backlog--;release_sock(sk);                                      return(newsk);                                          // 返回
}

该函数是accept系统调用的传输层的实现,该函数主要是从侦听套接字接受队列中取数据包,查看其是否完成TCP三次握手建立过程,如果没有则等待完成,否则返回数据包对应的sock,实际上该函数获取了处理结果,其过程都由其他函数如tcp_conn_request,tcp_ack完成。

tcp_write函数
/**  This routine copies from a user buffer into a socket,*  and starts the transmit system.                             被上层调用用于发送数据 是系统调用write函数对应的传输层处理函数 对于TCP协议而言send sendto系统调用都会调用tcp_write进行处理*/static int tcp_write(struct sock *sk, unsigned char *from,int len, int nonblock, unsigned flags)
{int copied = 0;int copy;int tmp;struct sk_buff *skb;struct sk_buff *send_tmp;unsigned char *buff;struct proto *prot;struct device *dev = NULL;sk->inuse=1;                                           // 设置sk正在使用prot = sk->prot;                                         // 获取sock的协议sockwhile(len > 0)                                       // 如果长度大于0{if (sk->err)                                          // 判断是否有错误 如果有错误 则返回错误 {            /* Stop on an error */release_sock(sk);                                 // 检查是否有数据输入if (copied) return(copied);tmp = -sk->err;sk->err = 0;return(tmp);}/**  First thing we do is make sure that we are established. */if (sk->shutdown & SEND_SHUTDOWN)                      // 判断是否sk关闭 {release_sock(sk);                              // 检查是否有输入到达sk->err = EPIPE;if (copied) return(copied);sk->err = 0;return(-EPIPE);                              // 返回错误}/* *    Wait for a connection to finish.*/while(sk->state != TCP_ESTABLISHED && sk->state != TCP_CLOSE_WAIT)    // 如果套接字为已经建立连接或者tcp关闭时 就退出循环{if (sk->err)                                                       // 如果出错则返回错误信息{release_sock(sk);if (copied)                                                     // 如果已经发送了字节数 则返回当前发送出去的长度return(copied);tmp = -sk->err;sk->err = 0;return(tmp);}if (sk->state != TCP_SYN_SENT && sk->state != TCP_SYN_RECV)      // 如果状态不是TCP_SYN_SENT 或者状态不是TCP_SYN_RECV 处理错误{release_sock(sk);                                               if (copied) return(copied);if (sk->err) {tmp = -sk->err;sk->err = 0;return(tmp);}if (sk->keepopen) {send_sig(SIGPIPE, current, 0);}return(-EPIPE);}if (nonblock || copied)                                        // 如果设置是否是阻塞 或者 已经发送的长度{release_sock(sk);if (copied) return(copied);                                        // 返回长度return(-EAGAIN);                                             // 返回错误}release_sock(sk);                                               // 检查是否有数据到cli();                                                           // 关闭中断if 
(sk->state != TCP_ESTABLISHED &&sk->state != TCP_CLOSE_WAIT && sk->err == 0)             // 如果状态不是连接已建立并且不是关闭状态 并且没有错误{interruptible_sleep_on(sk->sleep);                             // 进入等待休眠if (current->signal & ~current->blocked)                     // 获取当前的信号量 如果有信号量则返回{sti();if (copied) return(copied);return(-ERESTARTSYS);}}sk->inuse = 1;                                                    // 证明 此时连接完成 设置正在使用该套接字sti();                                                           // 使能中断}/** The following code can result in copy <= if sk->mss is ever* decreased.  It shouldn't be.  sk->mss is min(sk->mtu, sk->max_window).* sk->mtu is constant once SYN processing is finished.  I.e. we* had better not get here until we've seen his SYN and at least one* valid ack.  (The SYN sets sk->mtu and the ack sets sk->max_window.)* But ESTABLISHED should guarantee that.  sk->max_window is by definition* non-decreasing.  Note that any ioctl to set user_mss must be done* before the exchange of SYN's.  If the initial ack from the other* end has a window of 0, max_window and thus mss will both be 0.*//* *   Now we need to check if we have a half built packet. */if ((skb = tcp_dequeue_partial(sk)) != NULL)                           // 获取partial数据{int hdrlen;/* IP header + TCP header */hdrlen = ((unsigned long)skb->h.th - (unsigned long)skb->data)+ sizeof(struct tcphdr);                                   // 获取TP TCP 头部长度信息/* Add more stuff to the end of skb->len */if (!(flags & MSG_OOB)) {copy = min(sk->mss - (skb->len - hdrlen), len);                 // 检查剩余长度/* FIXME: this is really a bug. 
*/if (copy <= 0) {printk("TCP: **bug**: \"copy\" <= 0!!\n");copy = 0;}memcpy_fromfs(skb->data + skb->len, from, copy);                 // 拷贝数据skb->len += copy;from += copy;copied += copy;len -= copy;sk->write_seq += copy;                                           // 添加数据到write_seq}if ((skb->len - hdrlen) >= sk->mss ||(flags & MSG_OOB) || !sk->packets_out)                          // 当前数据足够发送  获取当前数据需要立马发送tcp_send_skb(sk, skb);                                             // 发送数据elsetcp_enqueue_partial(skb, sk);                                    // 将数据添加到skb中continue;}/** We also need to worry about the window.* If window < 1/2 the maximum window we've seen from this*   host, don't use it.  This is sender side*   silly window prevention, as specified in RFC1122.*   (Note that this is different than earlier versions of*   SWS prevention, e.g. RFC813.).  What we actually do is *   use the whole MSS.  Since the results in the right*   edge of the packet being outside the window, it will*   be queued for later rather than sent.*/copy = sk->window_seq - sk->write_seq;                                     // 当前窗口可发送的数据长度if (copy <= 0 || copy < (sk->max_window >> 1) || copy > sk->mss)       // 如果窗口小于最大窗口值的一半 或者大于MSS值 都设置为MSScopy = sk->mss;if (copy > len)                                                         // 如果大于实际要发送的数据长度则设置为最大值copy = len;/** We should really check the window here also. */send_tmp = NULL;if (copy < sk->mss && !(flags & MSG_OOB))                                 // 如果数据为非OOB数据 则进行等待以进行数据合并发送,所以更新容量为MSS值{/**    We will release the socket in case we sleep here. 
*/release_sock(sk);/**    NB: following must be mtu, because mss can be increased.*   mss is always <= mtu */skb = prot->wmalloc(sk, sk->mtu + 128 + prot->max_header, 0, GFP_KERNEL);    // 获取长度sk->inuse = 1;                                                               // 设置当前sk正在使用send_tmp = skb;                                                           // 保存当前skb} else {/**   We will release the socket in case we sleep here. */release_sock(sk);                                                           skb = prot->wmalloc(sk, copy + prot->max_header , 0, GFP_KERNEL);           // 直接分配对应的数据长度sk->inuse = 1;}/**    If we didn't get any memory, we need to sleep. */if (skb == NULL)                                                                // 如果skb为空则证明数据分配失败{sk->socket->flags |= SO_NOSPACE;if (nonblock)                                                                // 如果是非阻塞模式则直接返回{release_sock(sk);if (copied) return(copied);                                                   // 如果已经发送了部分数据则直接返回return(-EAGAIN);}/** FIXME: here is another race condition. */tmp = sk->wmem_alloc;                                                      // 获取保存的数据release_sock(sk);cli();                                                                       // 禁止中断/**  Again we will try to avoid it. 
*/if (tmp <= sk->wmem_alloc &&(sk->state == TCP_ESTABLISHED||sk->state == TCP_CLOSE_WAIT)           // 检查写缓冲区 检查套接字状态是否为连接状态 或者关闭状态&& sk->err == 0) {sk->socket->flags &= ~SO_NOSPACE;interruptible_sleep_on(sk->sleep);                                         // 睡眠等待if (current->signal & ~current->blocked) {sti();if (copied) return(copied);return(-ERESTARTSYS);}}sk->inuse = 1;                                                               // 设置当前sk正在使用sti();continue;}skb->len = 0;                                                                  // 设置skbskb->sk = sk;skb->free = 0;skb->localroute = sk->localroute|(flags&MSG_DONTROUTE);buff = skb->data;                                                              // 获取data/** FIXME: we need to optimize this.* Perhaps some hints here would be good.*/tmp = prot->build_header(skb, sk->saddr, sk->daddr, &dev,IPPROTO_TCP, sk->opt, skb->mem_len,sk->ip_tos,sk->ip_ttl);            // 建立mac ip 等数据if (tmp < 0 ) {prot->wfree(sk, skb->mem_addr, skb->mem_len);release_sock(sk);if (copied) return(copied);return(tmp);}skb->len += tmp;skb->dev = dev;buff += tmp;skb->h.th =(struct tcphdr *) buff;                                                // 更新字段值tmp = tcp_build_header((struct tcphdr *)buff, sk, len-copy);                   // 建立tcp头部if (tmp < 0)                                                                   // 如果小于0 则创建失败{prot->wfree(sk, skb->mem_addr, skb->mem_len);release_sock(sk);if (copied) return(copied);return(tmp);}if (flags & MSG_OOB) {((struct tcphdr *)buff)->urg = 1;                                           // 设置该值为1((struct tcphdr *)buff)->urg_ptr = ntohs(copy);}skb->len += tmp;                                                              // 增加长度memcpy_fromfs(buff+tmp, from, copy);                        from += copy;copied += copy;len -= copy;skb->len += copy;skb->free = 0;sk->write_seq += copy;                                                            // 增加数据if (send_tmp != NULL && sk->packets_out)                    
                     // 如果不为空则表示可以合并数据{tcp_enqueue_partial(send_tmp, sk);                                            // 将数据缓存到队列中continue;}tcp_send_skb(sk, skb);                                                            // 调用该函数发送给下层}sk->err = 0;                                                                      // 此次数据都成功处理 重置错误字段值/** Nagle's rule. Turn Nagle off with TCP_NODELAY for highly*  interactive fast network servers. It's meant to be on and* it really improves the throughput though not the echo time* on my slow slip link - Alan*//**    Avoid possible race on send_tmp - c/o Johannes Stille */if(sk->partial && ((!sk->packets_out) /* If not nagling we can send on the before case too.. */|| (sk->nonagle && before(sk->write_seq , sk->window_seq))                // 如果当前partial队列中缓存有数据包并且之前发送数据包得到应答 或者未采用nagle算法且数据包长度在窗口限制内,则直接发送数据))tcp_send_partial(sk);                                                         // 发送数据release_sock(sk);return(copied);                                                                 // 返回发送的长度
}

该函数是系统调用write函数的传输层处理函数,对于TCP协议而言,send、sendto系统调用最后都将调用tcp_write函数进行处理。该函数本身由一个while循环构成,循环条件就是len>0。首先判断之前套接字操作是否出现错误,如果出现错误,则返回错误;在发送数据之前必须确定套接字状态为可发送数据状态,如果状态条件不满足,则根据是否阻塞来决定是否等待。当套接字状态允许发送数据时,则进行内核数据结构的创建,复制用户缓冲区数据到内核缓冲区,在成功完成帧首部创建后,将封装后的数据发送给下层进一步处理。

tcp_read函数
/**  This routine copies from a sock struct into the user buffer.   主要是系统调用read的调用将内核态接受的数据返回用户态*/static int tcp_read(struct sock *sk, unsigned char *to,int len, int nonblock, unsigned flags)
{struct wait_queue wait = { current, NULL };int copied = 0;unsigned long peek_seq;volatile unsigned long *seq;    /* So gcc doesn't overoptimise */unsigned long used;/* *   This error should be checked. */if (sk->state == TCP_LISTEN)                   // 如果传入套接字为监听状态 则返回错误return -ENOTCONN;/**   Urgent data needs to be handled specially. */if (flags & MSG_OOB)                           // 检查是否有紧急数据return tcp_read_urg(sk, nonblock, to, len, flags);          // 读取紧急数据信息/**  Copying sequence to update. This is volatile to handle* the multi-reader case neatly (memcpy_to/fromfs might be *   inline and thus not flush cached variables otherwise).*/peek_seq = sk->copied_seq;                      // 更新变量值seq = &sk->copied_seq;  if (flags & MSG_PEEK)                           // 如果仅仅是PEEK则不更新内容seq = &peek_seq;add_wait_queue(sk->sleep, &wait);                 // 添加wait到休眠队列中sk->inuse = 1;                                   // 设置正在使用while (len > 0)                                 // 如果长度大于0 {struct sk_buff * skb;unsigned long offset;/** Are we at urgent data? 
Stop if we have read anything.*/if (copied && sk->urg_data && sk->urg_seq == *seq)         // 是否是紧急数据break;/** Next get a buffer.*/current->state = TASK_INTERRUPTIBLE;                    // 设置为可中断状态skb = skb_peek(&sk->receive_queue);                  // 读receive_queue队列的数据do {if (!skb)                                             // 检查队列中是否有数据 如果没有则停止break;if (before(*seq, skb->h.th->seq))                  // 检查获取的数据是否出现断裂break;offset = *seq - skb->h.th->seq;                    // 读取当前字节偏移的位置if (skb->h.th->syn)                                 // 如果syn设置了值 则偏移位减一offset--;if (offset < skb->len)                                // 如果偏移量小于skb长度则表示有可用数据 跳转到found_ok_skbgoto found_ok_skb;if (skb->h.th->fin)                              // 如果设置了fingoto found_fin_ok;                               // 跳转到found_fin_okif (!(flags & MSG_PEEK))                          // 检查是否是PEEKskb->used = 1;                                  // 设置为使用标志位 处理下一个数据skb = skb->next;                                     // 循环下一个skb}while (skb != (struct sk_buff *)&sk->receive_queue);    // 循环判断是否有内容if (copied)                                             break;if (sk->err)                                           // 如果处理过程中出现错误  {copied = -sk->err;                                     // 保存错误值sk->err = 0;                                        // 重置返回break;}if (sk->state == TCP_CLOSE)                              // 如果状态是关闭状态{if (!sk->done)                                      // 如果不是done状态 则设置为1{sk->done = 1;break;}copied = -ENOTCONN;                                // 设置错误码break;}if (sk->shutdown & RCV_SHUTDOWN)                          // 如果是关闭状态{sk->done = 1;                                        // 设置done 为1break;}if (nonblock)                                            // 如果设置为非阻塞{copied = -EAGAIN;                                  // 返回错误码break;}cleanup_rbuf(sk);                                        // 清理数据release_sock(sk);sk->socket->flags |= SO_WAITDATA; schedule();                               
             // 调度该程序sk->socket->flags &= ~SO_WAITDATA;sk->inuse = 1;if (current->signal & ~current->blocked)               // 检查是否有信号量{copied = -ERESTARTSYS;break;}continue;found_ok_skb:                                                // 找到一个可读的数据包/**    Lock the buffer. We can be fairly relaxed as*   an interrupt will never steal a buffer we are * using unless I've missed something serious in* tcp_data.*/skb->users++;                                           // 保护该数据以免被释放/**    Ok so how much can we use ? */used = skb->len - offset;                                 // 获取已经使用的长度if (len < used)                                      // 如果使用的大于读取长度used = len;                                      // 则设置为传入长度/**  Do we have urgent data here? */if (sk->urg_data)                                         // 是否紧急数据{unsigned long urg_offset = sk->urg_seq - *seq;if (urg_offset < used) {if (!urg_offset) {if (!sk->urginline) {++*seq;offset++;used--;}}elseused = urg_offset;}}/**  Copy it - We _MUST_ update *seq first so that we*   don't ever double read when we have dual readers*/*seq += used;                                          // 更新读取的字节数偏移量/**   This memcpy_tofs can sleep. If it sleeps and we*    do a second read it relies on the skb->users to avoid*   a crash when cleanup_rbuf() gets called.*/memcpy_tofs(to,((unsigned char *)skb->h.th) +skb->h.th->doff*4 + offset, used);copied += used;                                       // 获取新数据len -= used;                                           // 获取剩余的长度to += used;                                             // 新增读取的长度/**   We now will not sleep again until we are finished*  with skb. 
Sorry if you are doing the SMP port*  but you'll just have to fix it neatly ;)*/skb->users --;                                            // 减掉if (after(sk->copied_seq,sk->urg_seq))                   // 比较是否有紧急数据sk->urg_data = 0;                                   // 设置为空if (used + offset < skb->len)                             // 如果还有数据没有读完 则继续读continue;/**  Process the FIN.*/if (skb->h.th->fin)                                     // 如果是fingoto found_fin_ok;                                     // 跳转到找到fin处执行if (flags & MSG_PEEK)                                     // 是否是PEEKcontinue;skb->used = 1;                                           // 重置为1 继续continue;found_fin_ok:++*seq;                                               // seq加1if (flags & MSG_PEEK)break;/**  All is done*/skb->used = 1;sk->shutdown |= RCV_SHUTDOWN;                            // 设置为used 并设置为关闭状态break;}remove_wait_queue(sk->sleep, &wait);                       // 移除等待队列current->state = TASK_RUNNING;                                 // 修改当前进程状态为 task_running/* Clean up data we have read: This will do ACK frames */cleanup_rbuf(sk);                                             // 清除最后数据release_sock(sk);  return copied;
}

该函数就是系统调用read的传输层实现,其操作的主要对象就是sock结构的receive_queue队列,只有进入该队列中的数据包,其中的数据才会交给应用程序处理。tcp_read主要就是从receive_queue队列中获取数据包,检查其中数据的合法性,并根据用户所要求读取的数据量,尽量拷贝该数量的数据到用户缓冲区中。大致流程:首先检查套接字状态是否正确,然后检查是否是读取紧急数据,然后进入一个while循环,该循环的退出条件是读取到指定数量的数据;如果设置为非阻塞则会立即返回而不会等待。此处将当前进程状态设置为可中断状态,当条件不满足时进入睡眠等待,当有数据时就从receive_queue队列中取数据包,进入内层循环;如果找到可用数据则直接跳转到found_ok_skb处。内层循环的主要作用就是检查接收队列是否存在可读的数据包:如果没有则跳出;检查数据流是否出现断裂,如果出现断裂则跳出;检查数据包是否有可用数据,如有则跳转到found_ok_skb标号处进行处理;检查数据包FIN字段设置情况,如果被设置则表示该数据包还携带一个关闭发送通道的请求;如果数据包中包含的是已读过的重复数据,则(非MSG_PEEK时)将该数据包标记为已使用(skb->used = 1)并处理下一个可能的数据包。

tcp_rcv函数

分析完主要的函数后,接下来看获取数据的入口tcp_rcv函数。该函数就是TCP协议数据包接收的总入口函数,网络层协议在判断数据包使用的是TCP协议后,将调用tcp_rcv函数对该数据包进行传输层的处理。该函数更像是一个任务分发器,根据数据包中各标志位的设置,将数据包进一步分发给相关具体函数进行具体处理。数据包主要可分为如下几种:SYN请求连接数据包、ACK应答数据包、RST数据包、普通数据包、FIN断开连接数据包等。在TCP连接期间,ACK应答数据包和普通数据包作为一个数据包传输,即数据包中包含普通数据且TCP首部中ACK字段被设置为1。

/**  A TCP packet has arrived.               TCP协议数据包接受的总入口函数   数据包类型大致有 SYN请求连接数据包 ACK应答数据包 RST数据包 普通数据包 FIN断开连接数据包等*/int tcp_rcv(struct sk_buff *skb, struct device *dev, struct options *opt,unsigned long daddr, unsigned short len,unsigned long saddr, int redo, struct inet_protocol * protocol)
{struct tcphdr *th;struct sock *sk;int syn_ok=0;if (!skb) {printk("IMPOSSIBLE 1\n");return(0);}if (!dev)                                         // 数据包没有经过网口返回{printk("IMPOSSIBLE 2\n");return(0);}tcp_statistics.TcpInSegs++;if(skb->pkt_type!= PACKET_HOST)               // 如果不是发送给本地的数据包在网络层就已经处理{kfree_skb(skb,FREE_READ);return(0);}th = skb->h.th;/**    Find the socket.*/sk = get_sock(&tcp_prot, th->dest, saddr, th->source, daddr);                  // 根据传入条件查找对应的sock/**   If this socket has got a reset it's to all intents and purposes *  really dead. Count closed sockets as dead.**    Note: BSD appears to have a bug here. A 'closed' TCP in BSD*  simply drops data. This seems incorrect as a 'closed' TCP doesn't*   exist so should cause resets as if the port was unreachable.*/if (sk!=NULL && (sk->zapped || sk->state==TCP_CLOSE))                            // 如果找到的sk不为空 并且没有被复位或者关闭 则不可接受数据包sk=NULL;if (!redo)                                                                   // 是否是新的数据包{if (tcp_check(th, len, saddr, daddr ))                                      // 检验是否正确{skb->sk = NULL;kfree_skb(skb,FREE_READ);/**   We don't release the socket because it was*    never marked in use.*/return(0);}th->seq = ntohl(th->seq);                                                   // 获取seq/* See if we know about the socket. */if (sk == NULL)                                                             // 如果为空则返回{/**  No such TCB. If th->rst is 0 send a reset (checked in tcp_reset)*/tcp_reset(daddr, saddr, th, &tcp_prot, opt,dev,skb->ip_hdr->tos,255);        // 重置TCPskb->sk = NULL;/**  Discard frame*/kfree_skb(skb, FREE_READ);return(0);}skb->len = len;                                                                 // 初始化值skb->acked = 0;skb->used = 0;skb->free = 0;skb->saddr = daddr;skb->daddr = saddr;/* We may need to add it to the backlog here. 
*/cli();if (sk->inuse)                                                                 // 如果在使用{skb_queue_tail(&sk->back_log, skb);                                         // 放入到back_log中并返回sti();return(0);}sk->inuse = 1;                                                                   // 设置为正在使用sti();                                                            }else{if (sk==NULL) {tcp_reset(daddr, saddr, th, &tcp_prot, opt,dev,skb->ip_hdr->tos,255);      // 如果为空则重置skb->sk = NULL;kfree_skb(skb, FREE_READ);return(0);}}if (!sk->prot)                                                                    // 如果没有协议操作函数则返回{printk("IMPOSSIBLE 3\n");return(0);}/**  Charge the memory to the socket. */if (sk->rmem_alloc + skb->mem_len >= sk->rcvbuf)                                   // 对接受缓冲区空余空间进行检查 如果空间过小则丢弃该数据包{kfree_skb(skb, FREE_READ);release_sock(sk);return(0);}skb->sk=sk;sk->rmem_alloc += skb->mem_len;                                                                // 如果可以接受则更新当前接受的数据长度/**    This basically follows the flow suggested by RFC793, with the corrections in RFC1122. We*   don't implement precedence and we process URG incorrectly (deliberately so) for BSD bug*   compatibility. We also set up variables more thoroughly [Karn notes in the* KA9Q code the RFC793 incoming segment rules don't initialise the variables for all paths].*/if(sk->state!=TCP_ESTABLISHED)     /* Skip this lot for normal flow */                         // 如果sk状态不是已连接状态{/**    Now deal with unusual cases.*/if(sk->state==TCP_LISTEN)                                                                    // 如果是监听状态{if(th->ack)   /* These use the socket TOS.. might want to be the received TOS */          // 如果是ack数据包tcp_reset(daddr,saddr,th,sk->prot,opt,dev,sk->ip_tos, sk->ip_ttl);                     // 重置/**    We don't care for RST, and non SYN are absorbed (old segments)*    Broadcast/multicast SYN isn't allowed. Note - bug if you change the*   netmask on a running connection it can go broadcast. 
Even Sun's have*  this problem so I'm ignoring it */if(th->rst || !th->syn || th->ack || ip_chk_addr(daddr)!=IS_MYADDR)                    // 如果待回复或者rst复位都丢弃该数据{kfree_skb(skb, FREE_READ);release_sock(sk);return 0;}/*   *   Guess we need to make a new socket up */tcp_conn_request(sk, skb, daddr, saddr, opt, dev, tcp_init_seq());                      // 经过检查之后该数据包为SYN数据包/** Now we have several options: In theory there is nothing else*   in the frame. KA9Q has an option to send data with the syn,*    BSD accepts data with the syn up to the [to be] advertised window*  and Solaris 2.1 gives you a protocol error. For now we just ignore* it, that fits the spec precisely and avoids incompatibilities. It*  would be nice in future to drop through and process the data.*/release_sock(sk);return 0;}/* retransmitted SYN? */if (sk->state == TCP_SYN_RECV && th->syn && th->seq+1 == sk->acked_seq)                  // 如果当前状态为TCP_SYN_RECV 检查是否是重复发送的SYN数据包 如果重新发送的则丢弃{kfree_skb(skb, FREE_READ);release_sock(sk);return 0;}/** SYN sent means we have to look for a suitable ack and either reset* for bad matches or go to connected */if(sk->state==TCP_SYN_SENT)                                                       // 如果是TCP_SYN_SENT状态{/* Crossed SYN or previous junk segment */if(th->ack)                                                                   // 是否发送ACK{/* We got an ack, but it's not a good ack */if(!tcp_ack(sk,th,saddr,len))                                               // 检查ACK标志位 如果出现错误则报错返回{/* Reset the ack - its an ack from a different connection  [ th->rst is checked in tcp_reset()] */tcp_statistics.TcpAttemptFails++;tcp_reset(daddr, saddr, th,sk->prot, opt,dev,sk->ip_tos,sk->ip_ttl);kfree_skb(skb, FREE_READ);release_sock(sk);return(0);}if(th->rst)                                                               // 如果是RST数据包则重置return tcp_std_reset(sk,skb);if(!th->syn)                                                                 // 如果不是SYN数据包则丢弃{/* A valid ack from a different 
connectionstart. Shouldn't happen but cover it */kfree_skb(skb, FREE_READ);release_sock(sk);return 0;}/** Ok.. it's good. Set up sequence numbers and*   move to established.*/syn_ok=1;    /* Don't reset this connection for the syn */sk->acked_seq=th->seq+1;sk->fin_seq=th->seq;tcp_send_ack(sk->sent_seq,sk->acked_seq,sk,th,sk->daddr);                 // 回复一个应答数据包tcp_set_state(sk, TCP_ESTABLISHED);                                         // 设置状态为建立连接状态tcp_options(sk,th);                                                       // 设置TCP选项值sk->dummy_th.dest=th->source;sk->copied_seq = sk->acked_seq;if(!sk->dead){sk->state_change(sk);                                                  // 唤醒等待进程sock_wake_async(sk->socket, 0);}if(sk->max_window==0){sk->max_window = 32;sk->mss = min(sk->max_window, sk->mtu);                                // 设置MSS}}else{/* See if SYN's cross. Drop if boring */if(th->syn && !th->rst)                                                   // 如果是syn{/* Crossed SYN's are fine - but talking toyourself is right out... */if(sk->saddr==saddr && sk->daddr==daddr &&sk->dummy_th.source==th->source &&sk->dummy_th.dest==th->dest){tcp_statistics.TcpAttemptFails++;return tcp_std_reset(sk,skb);                                         // 检查状态重置}tcp_set_state(sk,TCP_SYN_RECV);                                           // 设置状态为TCP_SYN_RECV/** FIXME:* Must send SYN|ACK here*/}       /* Discard junk segment */                                                  // 释放数据kfree_skb(skb, FREE_READ);release_sock(sk);return 0;}/** SYN_RECV with data maybe.. drop through*/goto rfc_step6;}/**    BSD has a funny hack with TIME_WAIT and fast reuse of a port. There is* a more complex suggestion for fixing these reuse issues in RFC1644* but not yet ready for general use. Also see RFC1379.*/#define BSD_TIME_WAIT
#ifdef BSD_TIME_WAITif (sk->state == TCP_TIME_WAIT && th->syn && sk->dead && after(th->seq, sk->acked_seq) && !th->rst)                                     // 处于2MSL状态的套接字 是否接受一个连接请求进行判读{long seq=sk->write_seq;if(sk->debug)printk("Doing a BSD time wait\n");tcp_statistics.TcpEstabResets++;       sk->rmem_alloc -= skb->mem_len;skb->sk = NULL;sk->err=ECONNRESET;tcp_set_state(sk, TCP_CLOSE);                                                // 将套接字置于关闭状态sk->shutdown = SHUTDOWN_MASK;release_sock(sk);sk=get_sock(&tcp_prot, th->dest, saddr, th->source, daddr);               // 获取skif (sk && sk->state==TCP_LISTEN)                                            // 如果sk为监听状态{sk->inuse=1;skb->sk = sk;sk->rmem_alloc += skb->mem_len;tcp_conn_request(sk, skb, daddr, saddr,opt, dev,seq+128000);          // 处理请求release_sock(sk);return 0;}kfree_skb(skb, FREE_READ);return 0;}
#endif  }/**    We are now in normal data flow (see the step list in the RFC)*  Note most of these are inline now. I'll inline the lot when*   I have time to test it hard and look at what gcc outputs */if(!tcp_sequence(sk,th,len,opt,saddr,dev))                               // 对数据包中数据序列号进行合法性检查 如果不合法则释放{kfree_skb(skb, FREE_READ);release_sock(sk);return 0;}if(th->rst)                                                           // 是否是RST数据包return tcp_std_reset(sk,skb);                                       // 处理RST标志位/**  !syn_ok is effectively the state test in RFC793.*/if(th->syn && !syn_ok)                                                     // 如果不在相应状态则发送SYN请求连接数据包{tcp_reset(daddr,saddr,th, &tcp_prot, opt, dev, skb->ip_hdr->tos, 255);return tcp_std_reset(sk,skb);  }/**    Process the ACK*/if(th->ack && !tcp_ack(sk,th,saddr,len))                                // 是否需要回复ACK 如果需要则回复ACK信息{/**   Our three way handshake failed.*/if(sk->state==TCP_SYN_RECV)                                       // 如果状态为TCP_SYN_RECV 则重置{tcp_reset(daddr, saddr, th,sk->prot, opt, dev,sk->ip_tos,sk->ip_ttl);}kfree_skb(skb, FREE_READ);release_sock(sk);return 0;}rfc_step6:     /* I'll clean this up later *//**  Process urgent data*/if(tcp_urg(sk, th, saddr, len))                        // 处理紧急数据{kfree_skb(skb, FREE_READ);release_sock(sk);return 0;}/**  Process the encapsulated data*/if(tcp_data(skb,sk, saddr, len))                     // 获取数据{kfree_skb(skb, FREE_READ);release_sock(sk);return 0;}/**    And done*/  release_sock(sk);return 0;
}

该函数主要由以下几个模块组成:数据包合法性检测模块tcp_sequence、请求连接处理模块tcp_conn_request、RST相关处理模块(收到RST数据包时调用tcp_std_reset,需要向对端发送RST时调用tcp_reset)、应答处理模块tcp_ack、数据处理模块tcp_urg和tcp_data,以及断开连接处理模块tcp_fin。函数根据套接字当前所处的状态和数据包中的标志位,分别调用相应的模块进行处理。

总结

本文只是对照书籍简单地了解了TCP协议部分的内容。由上述内容可知,TCP协议本身较为复杂:该协议要求保证数据的可靠传输,并采用流式传输方式,因此实现上必须进行数据重传以及重新排序等操作。TCP协议的实现需要考虑的方面很多,其核心思想是对数据进行编号,并配合应答机制。本文还有大量相关的函数没有列出,大家可对照书本阅读。由于本人才疏学浅,如有错误请批评指正。

Linux内核网络栈1.2.13-tcp.c概述相关推荐

  1. Linux内核网络栈1.2.13-socket.c函数概述

    参考资料 <<linux内核网络栈源代码情景分析>> socket常用函数概述 根据socket提供的常用的库函数,socket,read,write等函数, 执行的过程 in ...

  2. Linux内核网络栈1.2.13-route.c概述

    参考资料 <<linux内核网络栈源代码情景分析>> route路由表概述 在IP协议的实现中,只要发送数据包都要查询路由表,选择合适的路由选项,确定下一站的地址,并构造MAC ...

  3. Linux内核网络栈1.2.13-有关tcp/ip协议的基础入门

    参考资料 <<linux内核网络栈源代码情景分析>> Linux内核网络栈的基础内容 主要分析tcp/ip相关的基本构成,概述了socket的系统调用进入内核的一个流程,并了解 ...

  4. Linux内核--网络栈实现分析(二)--数据包的传递过程--转

    转载地址http://blog.csdn.net/yming0221/article/details/7492423 作者:闫明 本文分析基于Linux Kernel 1.2.13 注:标题中的&qu ...

  5. Linux内核--网络栈实现分析(一)--网络栈初始化--转

    转载地址 http://blog.csdn.net/yming0221/article/details/7488828 作者:闫明 本文分析基于内核Linux Kernel 1.2.13 以后的系列博 ...

  6. Linux内核网络栈1.2.13-icmp.c概述

    参考资料 <<linux内核网络栈源代码情景分析>> icmp协议 在实现的过程中, ICMP协议工作在IP协议之上,但又不与TCP协议工作在同一级,而是在下一级,在一般ICMP ...

  7. Linux内核网络栈1.2.13-af_inet.c概述

    参考资料 <<linux内核网络栈源代码情景分析>> socket常用函数继续调用分析 根据socket提供的常用库函数,socket、read和write等函数,继续往下一层 ...

  8. linux内核网络初始化,Linux内核--网络栈实现分析

    本文分析基于内核Linux Kernel 1.2.13 以后的系列博文将深入分析Linux内核的网络栈实现原理,这里看到曹桂平博士的分析后,也决定选择Linux内核1.2.13版本进行分析. 原因如下 ...

  9. Linux内核--网络栈实现分析(三)--驱动程序层+链路层(上)

    本文分析基于Linux Kernel 1.2.13 原创作品,转载请标明http://blog.csdn.net/yming0221/article/details/7497260 更多请看专栏,地址 ...

最新文章

  1. 胡浩:人人能学的AI《从零开始机器学习》苏州.NET俱乐部课程分享
  2. Centos 7源码编译搭建Nginx
  3. [读书笔记] - 《深度探索C++对象模型》第5章 构造、解构、拷贝语意学
  4. SQL OUTER JOIN概述和示例
  5. 关于IE浏览器以及Firefox下冒泡事件的响应层级
  6. Codeforces 374A - Inna and Pink Pony
  7. python_PDF合成软件_ZHOU125disorder_
  8. 在IDEA上配置SonarLint以及代码质量分析报告模板
  9. Modelica技术教程 | 状态机教程
  10. 【FPGA的小娱乐】tft显示屏生成信号辅助测试阵列
  11. prometheus如何评估告警策略以及如何推送告警消息到alertmanager?
  12. 2021最新Java学科全阶段视频教程(从入门到精通)
  13. perspective 3D 效果和动画配合
  14. 区块链应用 | 一篇文章让你搞懂区块链,非标通证应用将首先落地
  15. 有5个人坐在一起,问第5个人多少岁?他说比第4个人大2岁
  16. CultureInfo 类中需要的【区域性名称】查询
  17. 实战 - Java程序最后的清道夫Runtime.addShutdowHook
  18. HTML表单中get 和post 的区别
  19. Linux 常用命令实践
  20. 2022 年工业工程、人工智能、计算机和应用科学前沿国际会议RTIACA2022征稿通知

热门文章

  1. 10 行代码玩转 NumPy!
  2. 958毕业,苦学Java,竟被二本毕业生吊打!网友:确实厉害!
  3. 遮挡也能识别?地平线提出时序信息提升行人检测准确度|​CVPR 2020
  4. AI芯片行业发展的来龙去脉
  5. 如何在3天内拿下DigSci大赛的亚军?| DigSci科学数据挖掘大赛
  6. 谷歌新研究:基于数据共享的神经网络快速训练方法
  7. 2018CTDC风暴来袭乌镇 互联网大佬再续前缘
  8. 【AI参赛经验】汉字书法识别比赛经验心得——by:microfat_htu
  9. Google母公司5600万美元首投AI芯片初创公司,看上了SambaNova的什么?
  10. 别再用 BeanUtils 了,这款 PO VO DTO 转换神器不香么?