1 tcp_push()

2 __tcp_push_pending_frames()

2.1 sk_write_queue队列未发送数据推送 tcp_write_xmit()

2.1.1 拥塞窗口检测 tcp_cwnd_test()

2.1.2 发送窗口检测 tcp_snd_wnd_test()

2.1.3 发送数据窗口限制 tcp_mss_split_point()

2.1.4 skb分段处理 tso_fragment()

2.1.5 sk_buff构造tcp头推送 tcp_transmit_skb()

2.1.6 更新统计信息 tcp_event_new_data_sent()

3 发送一包数据 tcp_push_one()

4 ACK报文回复 tcp_data_snd_check()

从TCP发送相关系统调用的内核核心处理函数tcp_sendmsg()可以看出，该函数的核心工作就是将待发送的数据组织成一个个skb，并按先后顺序放入发送队列sk_write_queue中。此外，该函数也会尝试调用tcp_push()（以及其它两个接口）进行一次新数据发送。

此外，在收到确认后，TCP会调用 tcp_data_snd_check() 检查是否可以发送数据，这里也会有机会发送新数据。

注意：这里的接口tcp_push、tcp_push_one、__tcp_push_pending_frames 均是指新数据发送过程（注意是新数据，而不是重传数据），即从sk_write_queue队列的sk_send_head指针处开始发送。关于 tcp_push 的调用堆栈如下：

tcp_push
  └─ __tcp_push_pending_frames
       └─ tcp_write_xmit                   // 推送sk_write_queue队列中尚未发送的所有数据
            ├─ tcp_cwnd_test               // 拥塞窗口检测
            ├─ tcp_snd_wnd_test            // 发送窗口检测
            ├─ tcp_mss_split_point         // 发送数据窗口限制
            ├─ tso_fragment                // skb超出门限进行分段
            ├─ tcp_transmit_skb            // skb推送，构造tcp报文调用ip层接口推送
            └─ tcp_event_new_data_sent     // 更新统计

1 tcp_push()

从下面的实现中可以看出，tcp_push()在判断了是否需要设置PUSH标记位之后，会调用__tcp_push_pending_frames()。

static inline void tcp_push(struct sock *sk, int flags, int mss_now,int nonagle)
{struct tcp_sock *tp = tcp_sk(sk);if (tcp_send_head(sk)) {//判断是否需要设置PUSH标记struct sk_buff *skb = tcp_write_queue_tail(sk);if (!(flags & MSG_MORE) || forced_push(tp))tcp_mark_push(tp, skb);//MSG_OOB相关，忽略tcp_mark_urg(tp, flags, skb);//调用__tcp_push_pending_frames()尝试发送__tcp_push_pending_frames(sk, mss_now,(flags & MSG_MORE) ? TCP_NAGLE_CORK : nonagle);}
}

2 __tcp_push_pending_frames()

该函数调用tcp_write_xmit()完成发送。

/* Push out any pending frames which were held back due to
 * TCP_CORK or attempt at coalescing tiny packets.
 * The socket must be locked by the caller.
 *
 * @sk:      socket to push
 * @cur_mss: current MSS handed to tcp_write_xmit()
 * @nonagle: Nagle mode handed to tcp_write_xmit()
 */
void __tcp_push_pending_frames(struct sock *sk, unsigned int cur_mss,
			       int nonagle)
{
	struct sk_buff *skb = tcp_send_head(sk);

	/* Only call tcp_write_xmit() when there is new data to send. */
	if (skb) {
		/*
		 * tcp_write_xmit() returns non-zero when nothing is in flight
		 * but queued data could not be sent; in that case let
		 * tcp_check_probe_timer() take care of the probe timer.
		 */
		if (tcp_write_xmit(sk, cur_mss, nonagle))
			tcp_check_probe_timer(sk);
	}
}

2.1 sk_write_queue队列未发送数据推送 tcp_write_xmit()

该函数是TCP发送新数据的核心函数，包括发送窗口判断、拥塞控制判断等核心操作都是在该函数中完成。

/* This routine writes packets to the network.  It advances the* send_head.  This happens as incoming acks open up the remote* window for us.** Returns 1, if no segments are in flight and we have queued segments, but* cannot send anything now because of SWS or another problem.*/
static int tcp_write_xmit(struct sock *sk, unsigned int mss_now, int nonagle)
{struct tcp_sock *tp = tcp_sk(sk);struct sk_buff *skb;unsigned int tso_segs, sent_pkts;int cwnd_quota;int result;/* If we are closed, the bytes will have to remain here.* In time closedown will finish, we empty the write queue and all* will be happy.*///检查TCB的状态if (unlikely(sk->sk_state == TCP_CLOSE))return 0;//sent_pkts将记录本次调用发送的数据段数sent_pkts = 0;//PMTU探测相关，如果发送了探测报文，则sent_pkts加1if ((result = tcp_mtu_probe(sk)) == 0) {return 0;} else if (result > 0) {sent_pkts = 1;}//循环发送尚未发送过的数据包while ((skb = tcp_send_head(sk))) {unsigned int limit;//设置skb中的GSO分段信息。返回值tso_segs表示该skb中的数据需要分成几个段发送tso_segs = tcp_init_tso_segs(sk, skb, mss_now);BUG_ON(!tso_segs);//获取拥塞窗口允许发送的数据段数。如果为0，表示拥塞窗口不允许发送数据，结束发送过程cwnd_quota = tcp_cwnd_test(tp, skb);if (!cwnd_quota)break;//检测发送窗口是否至少允许发送skb中的一个的段。如果不允许，结束发送过程if (unlikely(!tcp_snd_wnd_test(tp, skb, mss_now)))break;if (tso_segs == 1) {//tso_segs为1，说明skb只有一个段，而且长度可能小于MSS，即是一个小数据包，//所以需要检测nagle算法是否允许发送该skbif (unlikely(!tcp_nagle_test(tp, skb, mss_now,(tcp_skb_is_last(sk, skb) ?nonagle : TCP_NAGLE_PUSH))))break;} else {//tso_segs>1，需要TSO分段，判断是否需要推迟发送，这种推迟主要是为了提高GSO性能if (tcp_tso_should_defer(sk, skb))break;}//通过上面的拥塞窗口和发送窗口的检测后，我们知道，目前至少是可以发送一个//TCP段的。当然也有可能还可以发送更多，所以下面需要根据条件调整limit//如果skb有多个段，需要检查到底可以发送多少数据limit = mss_now;if (tso_segs > 1)//tcp_mss_split_point()返回的是发送窗口和拥塞窗口允许发送的最大字节数，//可能会超过skb本身的数据量，见下文limit = tcp_mss_split_point(sk, skb, mss_now, cwnd_quota);//skb的数据量超过了限定值，需要分段。这种情况只可能发生在TSO情形，因为非TSO场景，skb//的长度是不可能超过MSS的。此外，这种分段完全是因为拥塞控制和流量控制算法限制了发包大小，//所以才需要分割，和TSO本身没有任何关系if (skb->len > limit &&unlikely(tso_fragment(sk, skb, limit, mss_now)))break;//更新数据包的发送时间戳TCP_SKB_CB(skb)->when = tcp_time_stamp;//发送数据，如果返回非0，表示本次发送失败（如qdisc队列已满等），那么结束本次发送过程//第三个参数为1，表示让tcp_transmit_skb()发送时克隆一份skb首部if (unlikely(tcp_transmit_skb(sk, skb, 1, GFP_ATOMIC)))break;//发送了新数据，更新发送队列以及相关统计tcp_event_new_data_sent(sk, skb);//Nagle算法相关，如果当前发送的数据量小于MSS，认为是小包，所以更新snd_sml的值tcp_minshall_update(tp, mss_now, skb);//累加发包计数sent_pkts++;}//end of while((skb = 
tcp_send_head(sk)))//如果本次发送了数据包，则调整拥塞控制相关变量if (likely(sent_pkts)) {tcp_cwnd_validate(sk);return 0;}//这两种特殊情况中的任意一种也认为是成功返回（返回0表示函数执行成功）：//1. 当前已有未确认的数据包在发送；//2. sk->sk_send_head为NULL，即当前已没有新数据需要发送return !tp->packets_out && tcp_send_head(sk);
}

2.1.1 拥塞窗口检测 tcp_cwnd_test()

该函数检测拥塞窗口是否允许发送数据段，如果允许，返回在拥塞窗口限制范围内，可用于发送的段数（注意：不是字节数）。

/* Can at least one segment of SKB be sent right now, according to the* congestion window rules?  If so, return how many segments are allowed.*/
static inline unsigned int tcp_cwnd_test(struct tcp_sock *tp, struct sk_buff *skb)
{u32 in_flight, cwnd;/* Don't be strict about the congestion window for the final FIN.  *///如果是FIN段，并且只有一个段（FIN有可能会携带很多数据），那么总是可以发送，不会被拥塞窗口限制if ((TCP_SKB_CB(skb)->flags & TCPCB_FLAG_FIN) && tcp_skb_pcount(skb) == 1)return 1;//估算当前还在网络中传输的TCP段的数目in_flight = tcp_packets_in_flight(tp);//snd_cwnd就是当前拥塞窗口的大小，以TCP段为单位cwnd = tp->snd_cwnd;//比较拥塞窗口大小和飞行报文数目，余量就是拥塞控制还允许发送的段数if (in_flight < cwnd)return (cwnd - in_flight);//拥塞窗口已经好耗尽，返回0表示不允许发送数据return 0;
}//该函数估算的是那些已经发送出去（初传+重传）并且已经离开
//网络的段的数目，这些段主要是SACK确认的+已经判定为丢失的段
static inline unsigned int tcp_left_out(const struct tcp_sock *tp)
{//sacked_out：启用SACK时，表示已经被SACK选项确认的段的数量；//            不启用SACK时，记录了收到的重复ACK的次数，因为重复ACK不会自动发送，一定是对端收到了数据包；//lost_out：记录发送后在传输过程中丢失的段的数目，因为TCP没有一种机制可以准确的知道//        发出去的段是否真的丢了，所以这只是一种算法上的估计值//无论如何，这两种段属于已经发送，但是可以确定它们在网络中已经不存在了return tp->sacked_out + tp->lost_out;
}/* This determines how many packets are "in the network" to the best* of our knowledge.  In many cases it is conservative, but where* detailed information is available from the receiver (via SACK* blocks etc.) we can make more aggressive calculations.** Use this for decisions involving congestion control, use just* tp->packets_out to determine if the send queue is empty or not.** Read this equation as:**   "Packets sent once on transmission queue" MINUS*  "Packets left network, but not honestly ACKed yet" PLUS*  "Packets fast retransmitted"*/
static inline unsigned int tcp_packets_in_flight(const struct tcp_sock *tp)
{//packets_out记录的是已经从发送队列发出，但是尚未被确认的段的数目（不包括重传）//retrans_out表示的是因为重传才发送出去，但是还没有被确认的段的数目//tcp_left_out()：发出去了但是已经离开了网络的数据包数目return tp->packets_out - tcp_left_out(tp) + tp->retrans_out;
}

可以看出，拥塞窗口的检测实际上非常的简单，就是看当前网络中还在传输的报文（即飞行报文）数量是否超过了拥塞窗口的限制。拥塞控制的核心在于如何在各种情况下合理的设定拥塞窗口tp->snd_cwnd的值。

2.1.2 发送窗口检测 tcp_snd_wnd_test()

该函数判断当前发送窗口是否至少允许发送一个段，如果允许，返回1，否则返回0。如果skb的大小超过了一个MSS，那么只要允许发送一个MSS，就返回1；如果skb的大小小于一个MSS，那么只要允许发送所需的数据量就会返回1。

/* Does at least the first segment of SKB fit into the send window? */
static inline int tcp_snd_wnd_test(struct tcp_sock *tp, struct sk_buff *skb, unsigned int cur_mss)
{u32 end_seq = TCP_SKB_CB(skb)->end_seq;//如果skb中数据超过了一个段大小，则调整end_seq为一个段大小的序号if (skb->len > cur_mss)end_seq = TCP_SKB_CB(skb)->seq + cur_mss;//检查一个段的末尾序号是否超过了发送窗口的右边界return !after(end_seq, tcp_wnd_end(tp));
}//返回发送窗口的右边界
static inline u32 tcp_wnd_end(const struct tcp_sock *tp)
{//snd_una：已经发送但是还没有被确认的最小序号//snd_wnd：当前发送窗口大小，即接收方剩余的接收缓冲区return tp->snd_una + tp->snd_wnd;
}

2.1.3 发送数据窗口限制 tcp_mss_split_point()

该函数综合skb中数据长度、发送窗口允许发送数据量、拥塞窗口允许发送数据量，计算本次允许当前skb发送的数据量，以字节为单位。

/* Returns the portion of skb which can be sent right away without* introducing MSS oddities to segment boundaries. In rare cases where* mss_now != mss_cache, we will request caller to create a small skb* per input skb which could be mostly avoided here (if desired).** We explicitly want to create a request for splitting write queue tail* to a small skb for Nagle purposes while avoiding unnecessary modulos,* thus all the complexity (cwnd_len is always MSS multiple which we* return whenever allowed by the other factors). Basically we need the* modulo only when the receiver window alone is the limiting factor or* when we would be allowed to send the split-due-to-Nagle skb fully.*/
@skb：待判断的skb
@mss_now：当前MSS
@cwnd：拥塞窗口允许发送的段数，cwnd*mss_now即拥塞窗口允许发送的字节数；
static unsigned int tcp_mss_split_point(struct sock *sk, struct sk_buff *skb,unsigned int mss_now, unsigned int cwnd)
{struct tcp_sock *tp = tcp_sk(sk);u32 needed, window, cwnd_len;//window为发送窗口允许当前skb发送的最大字节数（可能会超过skb->len)window = tcp_wnd_end(tp) - TCP_SKB_CB(skb)->seq;//cwnd_len为拥塞窗口允许发送的字节数cwnd_len = mss_now * cwnd;//这段逻辑要实现的效果见下面的注释if (likely(cwnd_len <= window && skb != tcp_write_queue_tail(sk)))return cwnd_len;//needed为经过发送窗口矫正后的实际要发送的数据量needed = min(skb->len, window);if (skb == tcp_write_queue_tail(sk) && cwnd_len <= needed)return cwnd_len;//最终返回值是MSS的整数倍，当然单位依然是字节return needed - needed % mss_now;
}

上面的实现不是很好理解，实际上该函数的逻辑如下：

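最后一个skb、拥塞窗口受限（cwnd_len <= min(skb->len, window)）-----返回拥塞窗口允许发送的数据量cwnd_len；
最后一个skb、拥塞窗口不受限（cwnd_len > min(skb->len, window)）-----返回min(发送窗口允许的数据量window，实际要发送的数据量skb->len)向下取整到mss_now整数倍后的值；
不是最后一个skb、拥塞窗口受限（cwnd_len <= window）-----返回拥塞窗口允许发送的数据量cwnd_len，这种情况返回的允许值可能会大于skb中要发送的数据量，因为可能是这样的关系：skb->len < cwnd_len <= window；
不是最后一个skb、拥塞窗口不受限（cwnd_len > window）-----返回min(发送窗口允许的数据量window，实际要发送的数据量skb->len)向下取整到mss_now整数倍后的值。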

2.1.4 skb分段处理 tso_fragment()

在tcp_write_xmit()中，如果skb中数据量过大，超过了发送窗口和拥塞窗口的限定，只允许发送skb的一部分，那么就需要将skb拆分成两段，前半段长度为len，本次可以发送，后半段保存在新分配的skb中，在发送队列sk_write_queue中将后半段插入到前半段的后面，这样可以保证数据的顺序发送。

注：由于这种分割只是修改struct skb_shared_info的frags[]中的指针关系，不涉及内存拷贝，所以速度是很快的。

/* Trim TSO SKB to LEN bytes, put the remaining data into a new packet* which is put after SKB on the list.  It is very much like* tcp_fragment() except that it may make several kinds of assumptions* in order to speed up the splitting operation.  In particular, we* know that all the data is in scatter-gather pages, and that the* packet has never been sent out before (and thus is not cloned).*/
static int tso_fragment(struct sock *sk, struct sk_buff *skb, unsigned int len,unsigned int mss_now)
{struct sk_buff *buff;int nlen = skb->len - len;u16 flags;/* All of a TSO frame must be composed of paged data.  */if (skb->len != skb->data_len)return tcp_fragment(sk, skb, len, mss_now);buff = sk_stream_alloc_skb(sk, 0, GFP_ATOMIC);if (unlikely(buff == NULL))return -ENOMEM;sk->sk_wmem_queued += buff->truesize;sk_mem_charge(sk, buff->truesize);buff->truesize += nlen;skb->truesize -= nlen;/* Correct the sequence numbers. */TCP_SKB_CB(buff)->seq = TCP_SKB_CB(skb)->seq + len;TCP_SKB_CB(buff)->end_seq = TCP_SKB_CB(skb)->end_seq;TCP_SKB_CB(skb)->end_seq = TCP_SKB_CB(buff)->seq;/* PSH and FIN should only be set in the second packet. */flags = TCP_SKB_CB(skb)->flags;TCP_SKB_CB(skb)->flags = flags & ~(TCPCB_FLAG_FIN | TCPCB_FLAG_PSH);TCP_SKB_CB(buff)->flags = flags;/* This packet was never sent out yet, so no SACK bits. */TCP_SKB_CB(buff)->sacked = 0;buff->ip_summed = skb->ip_summed = CHECKSUM_PARTIAL;skb_split(skb, buff, len);/* Fix up tso_factor for both original and new SKB.  */tcp_set_skb_tso_segs(sk, skb, mss_now);tcp_set_skb_tso_segs(sk, buff, mss_now);/* Link BUFF into the send queue. */skb_header_release(buff);tcp_insert_write_queue_after(skb, buff, sk);return 0;
}/*** skb_split - Split fragmented skb to two parts at length len.* @skb: the buffer to split* @skb1: the buffer to receive the second part* @len: new length for skb*/
void skb_split(struct sk_buff *skb, struct sk_buff *skb1, const u32 len)
{int pos = skb_headlen(skb);if (len < pos)  /* Split line is inside header. */skb_split_inside_header(skb, skb1, len, pos);else     /* Second chunk has no header, nothing to copy. */skb_split_no_header(skb, skb1, len, pos);
}static inline void skb_split_no_header(struct sk_buff *skb,struct sk_buff* skb1,const u32 len, int pos)
{int i, k = 0;const int nfrags = skb_shinfo(skb)->nr_frags;skb_shinfo(skb)->nr_frags = 0;skb1->len        = skb1->data_len = skb->len - len;skb->len       = len;skb->data_len         = len - pos;for (i = 0; i < nfrags; i++) {int size = skb_shinfo(skb)->frags[i].size;if (pos + size > len) {skb_shinfo(skb1)->frags[k] = skb_shinfo(skb)->frags[i];if (pos < len) {/* Split frag.* We have two variants in this case:* 1. Move all the frag to the second*    part, if it is possible. F.e.*    this approach is mandatory for TUX,*    where splitting is expensive.* 2. Split is accurately. We make this.*/get_page(skb_shinfo(skb)->frags[i].page);skb_shinfo(skb1)->frags[0].page_offset += len - pos;skb_shinfo(skb1)->frags[0].size -= len - pos;skb_shinfo(skb)->frags[i].size = len - pos;skb_shinfo(skb)->nr_frags++;}k++;} elseskb_shinfo(skb)->nr_frags++;pos += size;}skb_shinfo(skb1)->nr_frags = k;
}

2.1.5 sk_buff构造tcp头推送 tcp_transmit_skb()

该函数为传入的skb构造TCP首部，然后调用IP层的输出接口完成数据发送。

/* This routine actually transmits TCP packets queued in by
 * tcp_do_sendmsg().  This is used by both the initial
 * transmission and possible later retransmissions.
 * All SKB's seen here are completely headerless.  It is our
 * job to build the TCP header, and pass the packet down to
 * IP so it can do the same plus pass the packet off to the
 * device.
 *
 * We are working here with either a clone of the original
 * SKB, or a fresh unique copy made by the retransmit engine.
 *
 * Returns -ENOBUFS when cloning/copying the skb fails; otherwise the
 * (possibly net_xmit_eval()-mapped) result of the queue_xmit call.
 */
static int tcp_transmit_skb(struct sock *sk, struct sk_buff *skb, int clone_it,
			    gfp_t gfp_mask)
{
	const struct inet_connection_sock *icsk = inet_csk(sk);
	struct inet_sock *inet;
	struct tcp_sock *tp;
	struct tcp_skb_cb *tcb;
	int tcp_header_size;
#ifdef CONFIG_TCP_MD5SIG
	struct tcp_md5sig_key *md5;
	__u8 *md5_hash_location;
#endif
	struct tcphdr *th;
	int sysctl_flags;
	int err;

	BUG_ON(!skb || !tcp_skb_pcount(skb));

	/* If congestion control is doing timestamping, we must
	 * take such a timestamp before we potentially clone/copy.
	 */
	if (icsk->icsk_ca_ops->flags & TCP_CONG_RTT_STAMP)
		__net_timestamp(skb);

	/* When the caller asks for it, transmit a clone (or, if the skb is
	 * already cloned, a copy) instead of the skb itself.
	 */
	if (likely(clone_it)) {
		if (unlikely(skb_cloned(skb)))
			skb = pskb_copy(skb, gfp_mask);
		else
			skb = skb_clone(skb, gfp_mask);
		if (unlikely(!skb))
			return -ENOBUFS;
	}

	inet = inet_sk(sk);
	tp = tcp_sk(sk);
	tcb = TCP_SKB_CB(skb);
	tcp_header_size = tp->tcp_header_len;

#define SYSCTL_FLAG_TSTAMPS	0x1
#define SYSCTL_FLAG_WSCALE	0x2
#define SYSCTL_FLAG_SACK	0x4

	/* Work out the TCP header length by segment type: some options may
	 * only be carried by SYN segments, so SYN is computed separately.
	 */
	sysctl_flags = 0;
	if (unlikely(tcb->flags & TCPCB_FLAG_SYN)) {
		tcp_header_size = sizeof(struct tcphdr) + TCPOLEN_MSS;
		if (sysctl_tcp_timestamps) {
			tcp_header_size += TCPOLEN_TSTAMP_ALIGNED;
			sysctl_flags |= SYSCTL_FLAG_TSTAMPS;
		}
		if (sysctl_tcp_window_scaling) {
			tcp_header_size += TCPOLEN_WSCALE_ALIGNED;
			sysctl_flags |= SYSCTL_FLAG_WSCALE;
		}
		if (sysctl_tcp_sack) {
			sysctl_flags |= SYSCTL_FLAG_SACK;
			if (!(sysctl_flags & SYSCTL_FLAG_TSTAMPS))
				tcp_header_size += TCPOLEN_SACKPERM_ALIGNED;
		}
	} else if (unlikely(tp->rx_opt.eff_sacks)) {
		/* A SACK is 2 pad bytes, a 2 byte header, plus
		 * 2 32-bit sequence numbers for each SACK block.
		 */
		tcp_header_size += (TCPOLEN_SACK_BASE_ALIGNED +
				    (tp->rx_opt.eff_sacks *
				     TCPOLEN_SACK_PERBLOCK));
	}

	/* Congestion control: with nothing in flight this transmission
	 * restarts sending, so raise the CA_EVENT_TX_START event.
	 */
	if (tcp_packets_in_flight(tp) == 0)
		tcp_ca_event(sk, CA_EVENT_TX_START);

#ifdef CONFIG_TCP_MD5SIG
	/*
	 * Are we doing MD5 on this segment? If so - make
	 * room for it.
	 */
	md5 = tp->af_specific->md5_lookup(sk, sk);
	if (md5)
		tcp_header_size += TCPOLEN_MD5SIG_ALIGNED;
#endif

	/* Make room for the TCP header and fill in its fields. */
	skb_push(skb, tcp_header_size);
	skb_reset_transport_header(skb);
	skb_set_owner_w(skb, sk);

	/* Build TCP header and checksum it. */
	th = tcp_hdr(skb);
	th->source		= inet->sport;
	th->dest		= inet->dport;
	th->seq			= htonl(tcb->seq);
	th->ack_seq		= htonl(tp->rcv_nxt);
	*(((__be16 *)th) + 6)	= htons(((tcp_header_size >> 2) << 12) |
					tcb->flags);

	if (unlikely(tcb->flags & TCPCB_FLAG_SYN)) {
		/* RFC1323: The window in SYN & SYN/ACK segments
		 * is never scaled.
		 */
		th->window	= htons(min(tp->rcv_wnd, 65535U));
	} else {
		th->window	= htons(tcp_select_window(sk));
	}
	th->check		= 0;
	th->urg_ptr		= 0;

	if (unlikely(tp->urg_mode &&
		     between(tp->snd_up, tcb->seq + 1, tcb->seq + 0xFFFF))) {
		th->urg_ptr		= htons(tp->snd_up - tcb->seq);
		th->urg			= 1;
	}

	/* Build the option part of the TCP header. */
	if (unlikely(tcb->flags & TCPCB_FLAG_SYN)) {
		tcp_syn_build_options((__be32 *)(th + 1),
				      tcp_advertise_mss(sk),
				      (sysctl_flags & SYSCTL_FLAG_TSTAMPS),
				      (sysctl_flags & SYSCTL_FLAG_SACK),
				      (sysctl_flags & SYSCTL_FLAG_WSCALE),
				      tp->rx_opt.rcv_wscale,
				      tcb->when,
				      tp->rx_opt.ts_recent,

#ifdef CONFIG_TCP_MD5SIG
				      md5 ? &md5_hash_location :
#endif
				      NULL);
	} else {
		tcp_build_and_update_options((__be32 *)(th + 1),
					     tp, tcb->when,
#ifdef CONFIG_TCP_MD5SIG
					     md5 ? &md5_hash_location :
#endif
					     NULL);
		TCP_ECN_send(sk, skb, tcp_header_size);
	}

#ifdef CONFIG_TCP_MD5SIG
	/* Calculate the MD5 hash, as we have all we need now */
	if (md5) {
		tp->af_specific->calc_md5_hash(md5_hash_location,
					       md5,
					       sk, NULL, NULL,
					       tcp_hdr(skb),
					       sk->sk_protocol,
					       skb->len);
	}
#endif

	/* Checksum handling; for TCP over IPv4 this is tcp_v4_send_check(). */
	icsk->icsk_af_ops->send_check(sk, skb->len, skb);

	/* The segment carries an ACK: the delayed-ACK machinery needs to be
	 * updated.
	 */
	if (likely(tcb->flags & TCPCB_FLAG_ACK))
		tcp_event_ack_sent(sk, tcp_skb_pcount(skb));

	/* The segment carries payload: perform the related congestion-control
	 * bookkeeping.
	 */
	if (skb->len != tcp_header_size)
		tcp_event_data_sent(tp, skb, sk);

	if (after(tcb->end_seq, tp->snd_nxt) || tcb->seq == tcb->end_seq)
		TCP_INC_STATS(TCP_MIB_OUTSEGS);

	/* Hand the packet to the IP layer through queue_xmit; for TCP over
	 * IPv4 that is ip_queue_xmit().  A result <= 0 is returned as is.
	 */
	err = icsk->icsk_af_ops->queue_xmit(skb, 0);
	if (likely(err <= 0))
		return err;

	/* Positive result: treated as a congestion indication, enter CWR. */
	tcp_enter_cwr(sk, 1);

	/* Map the transmit code to the value returned to the caller. */
	return net_xmit_eval(err);

#undef SYSCTL_FLAG_TSTAMPS
#undef SYSCTL_FLAG_WSCALE
#undef SYSCTL_FLAG_SACK
}

注：上面tcp_transmit_skb()中有些内容涉及到TCP的其它机制，后面有时间再来分析。

2.1.6 更新统计信息 tcp_event_new_data_sent()

发送队列中有新数据被发送出去时，调用该函数更新数据段统计信息。

static void tcp_event_new_data_sent(struct sock *sk, struct sk_buff *skb)
{struct tcp_sock *tp = tcp_sk(sk);unsigned int prior_packets = tp->packets_out;//将发送队列指针sk_send_head前移tcp_advance_send_head(sk, skb);//更新下一个待发送的段的TCP序号tp->snd_nxt = TCP_SKB_CB(skb)->end_seq;/* Don't override Nagle indefinately with F-RTO *///F-RTO算法if (tp->frto_counter == 2)tp->frto_counter = 3;//累加已经发送，但是尚未被确认的TCP段个数统计tp->packets_out += tcp_skb_pcount(skb);//如果之前没有发送过数据，则启动超时重传定时器if (!prior_packets)inet_csk_reset_xmit_timer(sk, ICSK_TIME_RETRANS, inet_csk(sk)->icsk_rto, TCP_RTO_MAX);
}

3 发送一包数据 tcp_push_one()

看了上面tcp_write_xmit()的处理后，再来看看 tcp_push_one() 的实现，会发现二者基本一致，区别正如函数名，本函数只尝试发送一包数据，而 tcp_push() 会尝试遍历整个发送队列，直到无法继续发送为止。

/* Send _single_ skb sitting at the send head. This function requires* true push pending frames to setup probe timer etc.*/
void tcp_push_one(struct sock *sk, unsigned int mss_now)
{struct tcp_sock *tp = tcp_sk(sk);struct sk_buff *skb = tcp_send_head(sk);unsigned int tso_segs, cwnd_quota;BUG_ON(!skb || skb->len < mss_now);tso_segs = tcp_init_tso_segs(sk, skb, mss_now);cwnd_quota = tcp_snd_test(sk, skb, mss_now, TCP_NAGLE_PUSH);if (likely(cwnd_quota)) {unsigned int limit;BUG_ON(!tso_segs);limit = mss_now;if (tso_segs > 1 && !tcp_urg_mode(tp))limit = tcp_mss_split_point(sk, skb, mss_now,cwnd_quota);if (skb->len > limit &&unlikely(tso_fragment(sk, skb, limit, mss_now)))return;/* Send it out now. */TCP_SKB_CB(skb)->when = tcp_time_stamp;if (likely(!tcp_transmit_skb(sk, skb, 1, sk->sk_allocation))) {tcp_event_new_data_sent(sk, skb);tcp_cwnd_validate(sk);return;}}
}

4 ACK报文回复 tcp_data_snd_check()

接收过程中，在收到ACK后，更新了发送窗口、拥塞窗口之后，也会调用 tcp_data_snd_check() 检查是否可以发送新数据。

static inline void tcp_data_snd_check(struct sock *sk)
{//是上面__tcp_push_pending_frames()的包装函数tcp_push_pending_frames(sk);//内存管理相关tcp_check_space(sk);
}static inline void tcp_push_pending_frames(struct sock *sk)
{struct tcp_sock *tp = tcp_sk(sk);__tcp_push_pending_frames(sk, tcp_current_mss(sk, 1), tp->nonagle);
}

linux内核协议栈 TCP层数据发送之发送新数相关推荐

linux内核协议栈 TCP层数据发送之TSO/GSO
目录 1 基本概念 2 TCP延迟分段判定 2.1 客户端初始化 2.2 服务器端初始化 2.3 sk_setup_caps() 3 整体结构 4. TCP发送路径TSO处理 4.1 tcp_send ...
linux内核协议栈 TCP数据发送之发送窗口
目录 1 发送窗口概述 2 snd_una 和 snd_wnd 的更新 2.1 发送窗口初始化 2.1.1 客户端初始化 2.1.2 服务器端初始化 2.2 本地接收窗口 rcv_wnd 通告 2.2 ...
linux内核协议栈 TCP选项之SACK选项概述
目录 1 SACK概述 1.1 SACK允许选项 1.2 SACK选项 2 SACK允许选项的发送和接收 2.1 客户端 SYN 段发送 tcp_transmit_skb 2.1.1 syn包构造可选 ...
linux内核协议栈 TCP连接探测中的Keepalive和心跳包使用
目录 1 TCP保活的必要性 2 导致TCP断连的因素 3 保活的两种方式 3.1 应用层面的心跳机制 3.2 TCP协议自带的保活功能 4 两种方式的优劣性 5 到底选用那种心跳方式? 6 配置 K ...
linux内核协议栈 TCP选项之MSS
目录 1 MSS概述 2 客户端三次握手 2.1 发送SYN段MSS选项值 2.1.1 tcp_advertise_mss() 2.1.2 tp->advmss的初始化 2.2 接收SYN+AC ...
Linux 内核协议栈之TCP连接关闭
Close行为: 当应用程序在调用close()函数关闭TCP连接时,Linux内核的默认行为是将套接口发送队列里的原有数据(比如之前残留的数据)以及新加入的数据(比如函数close()产生的FIN ...
什么叫linux网络协议栈,我们为什么使用Linux内核的TCP协议栈
最近的一篇文章提出了"我们为什么使用Linux内核的TCP协议栈"的问题,并在Hacker News引发了非常有意思的讨论. 在CloudFlare的时候我也曾思考这个问题.我的经 ...
Linux 内核协议栈学习资料
终极资料 1.<Understanding Linux Network Internals> 2.<TCP/IP Architecture, Design and Implement ...
linux内核中TCP接收的实现
linux内核中TCP接收的实现入口函数是tcp_v4_rcv 1. 数据包检查处理一开始做一些数据包详细检查处理,一旦出错,可能导致内核挂掉 int tcp_v4_rcv(struct sk_bu ...

linux内核协议栈 TCP层数据发送之发送新数

1 tcp_push()

2 __tcp_push_pending_frames()

2.1 sk_write_queue队列未发送数据推送 tcp_write_xmit()

2.1.1 拥塞窗口检测 tcp_cwnd_test()

2.1.2 发送窗口检测 tcp_snd_wnd_test()

2.1.3 发送数据窗口限制 tcp_mss_split_point()

2.1.4 skb分段处理 tso_fragment()

2.1.5 sk_buff构造tcp头推送 tcp_transmit_skb()

2.1.6 更新统计信息 tcp_event_new_data_sent()

3 发送一包数据 tcp_push_one()

4 ACK报文回复 tcp_data_snd_check()

linux内核协议栈 TCP层数据发送之发送新数相关推荐

最新文章

热门文章