内核网络输出帧的处理

首先来看如何打开和关闭一个输出队列。

帧的输出状态是通过netdev_queue->state中的__QUEUE_STATE_XOFF位来表示的（较早的内核版本使用的是net_device->state的__LINK_STATE_XOFF位）。而打开和关闭队列也就是通过这个状态位来处理。

/*
 * TX queue start/stop helpers.
 *
 * The per-queue helpers are defined before the single-queue wrappers so
 * that every call refers to a function that is already declared.
 */

/* Mark one TX queue as started by clearing its __QUEUE_STATE_XOFF bit. */
static inline void netif_tx_start_queue(struct netdev_queue *dev_queue)
{
    clear_bit(__QUEUE_STATE_XOFF, &dev_queue->state);
}

/* Start TX queue 0 of @dev. */
static inline void netif_start_queue(struct net_device *dev)
{
    netif_tx_start_queue(netdev_get_tx_queue(dev, 0));
}

/* Mark one TX queue as stopped by setting its __QUEUE_STATE_XOFF bit. */
static inline void netif_tx_stop_queue(struct netdev_queue *dev_queue)
{
    set_bit(__QUEUE_STATE_XOFF, &dev_queue->state);
}

/* Stop TX queue 0 of @dev. */
static inline void netif_stop_queue(struct net_device *dev)
{
    netif_tx_stop_queue(netdev_get_tx_queue(dev, 0));
}

通过上面的函数我们可以看到开启和关闭队列都是通过设置state的状态位进行实现的。

为什么要在设备运行的时候开启或关闭队列呢？举个例子，当设备用完了它的内存，也就是无法再处理更多的帧时，我们就需要停止队列；而当设备可以接受新的帧时，我们又需要唤醒队列。

可以看下3c59x.c中的代码片段：

/* Decide whether the TX queue may keep running, based on free FIFO space. */
if (ioread16(ioaddr + TxFree) > 1536) {
    netif_start_queue (dev);    /* AKPM: redundant? */
} else {
    /* Interrupt us when the FIFO has room for max-sized packet. */
    /* Not enough room left for an MTU-sized frame, so stop the queue. */
    netif_stop_queue(dev);
    iowrite16(SetTxThreshold + (1536>>2), ioaddr + EL3_CMD);
}

当有可用的空间时，设备会执行一个中断来通知内核驱动，有足够的内存，此时我们就需要通过netif_wake_queue来唤醒队列。

继续来看代码片段，在3c59x.c的中断处理函数中:

/* Interrupt path: the device reports that TX FIFO space is available again. */
if (status & TxAvailable) {
    if (vortex_debug > 5)
        printk(KERN_DEBUG " TX room bit was handled.\n");
    /* There's room in the FIFO for a full-sized packet. */
    iowrite16(AckIntr | TxAvailable, ioaddr + EL3_CMD);
    /* Wake the queue so that transmission can resume. */
    netif_wake_queue (dev);
}

为什么要用netif_wake_queue,而不是netif_start_queue呢，我们先来看它的实现：

static inline void netif_wake_queue(struct net_device *dev)
{  netif_tx_wake_queue(netdev_get_tx_queue(dev, 0));
}  static inline void netif_tx_wake_queue(struct netdev_queue *dev_queue)
{
#ifdef CONFIG_NETPOLL_TRAP  if (netpoll_trap()) {  clear_bit(__QUEUE_STATE_XOFF, &dev_queue->state);  return;  }
#endif  if (test_and_clear_bit(__QUEUE_STATE_XOFF, &dev_queue->state))  //相比于netif_start_queue多了着一个函数调用。  __netif_schedule(dev_queue->qdisc);
}

我们这里要先明确一下，这里的关闭和打开队列，是相对于上层来说的。当我们关闭掉队列之后，要恢复传输就不仅需要打开队列，而且还要将设备（的qdisc）重新加入到output_queue，并触发软中断，执行新的包的传输。因此这里会调用__netif_schedule，它最终通过下面的__netif_reschedule把qdisc挂到当前cpu的output_queue上，并触发NET_TX_SOFTIRQ软中断：

/*
 * Put qdisc @q on the softnet_data output_queue obtained through
 * __get_cpu_var() and raise NET_TX_SOFTIRQ.
 */
static inline void __netif_reschedule(struct Qdisc *q)
{
    struct softnet_data *sd;
    unsigned long flags;

    /* The list update below runs between local_irq_save/restore. */
    local_irq_save(flags);
    sd = &__get_cpu_var(softnet_data);
    /* Push q onto the head of the output_queue list (linked by next_sched). */
    q->next_sched = sd->output_queue;
    sd->output_queue = q;
    raise_softirq_irqoff(NET_TX_SOFTIRQ);
    local_irq_restore(flags);
}

帧的输出和输入最大的一个差别就是帧的输出包含一个流量控制模块，相当于一个过滤器，它提供了一些虚的操作方法，不同的排队策略实现自己的虚方法。而每个帧在送出去之前都会调用qdisc_run方法，这个方法最终会调用qdisc_restart方法。

先来看下排队策略的主要的数据结构：

struct Qdisc_ops{  struct Qdisc_ops    *next;  //这里将数据分为不同的类别，每个类别都有自己的处理方法  const struct Qdisc_class_ops    *cl_ops;  char            id[IFNAMSIZ];  int         priv_size;  //入队列虚函数  int             (*enqueue)(struct sk_buff *, struct Qdisc *);  //出队列虚函数  struct sk_buff *    (*dequeue)(struct Qdisc *);  //把数据包放回队列，比如这次传输失败，则需要这样做  int             (*requeue)(struct sk_buff *, struct Qdisc *);  //丢包虚函数  unsigned int        (*drop)(struct Qdisc *);  int         (*init)(struct Qdisc *, struct nlattr *arg);  void            (*reset)(struct Qdisc *);  void            (*destroy)(struct Qdisc *);  int         (*change)(struct Qdisc *, struct nlattr *arg);  int         (*dump)(struct Qdisc *, struct sk_buff *);  int         (*dump_stats)(struct Qdisc *, struct gnet_dump *);  struct module       *owner;
};

来看qdisc_restart函数，这个函数会通过排队策略的出队列函数来取得数据包，然后调用每个驱动都会实现的帧传输虚函数，最后根据返回值做相应的处理：

static inline int qdisc_restart(struct Qdisc *q)
{  struct netdev_queue *txq;  int ret = NETDEV_TX_BUSY;  struct net_device *dev;  spinlock_t *root_lock;  struct sk_buff *skb;
//调用相应的设备的排队策略的出队列方法，取出数据包  /* Dequeue packet */  if (unlikely((skb = dequeue_skb(q)) == NULL))  return 0;  //取得数据包的锁。  root_lock = qdisc_lock(q);
//释放锁  /* And release qdisc */  spin_unlock(root_lock);  //得到设备  dev = qdisc_dev(q);  txq = netdev_get_tx_queue(dev, skb_get_queue_mapping(skb));  //请求设备队列的xmit_lock锁  HARD_TX_LOCK(dev, txq, smp_processor_id());  if (!netif_tx_queue_stopped(txq) &&  !netif_tx_queue_frozen(txq))
//执行每一个驱动所需实现的帧的传递的虚函数。  ret = dev_hard_start_xmit(skb, dev, txq);  HARD_TX_UNLOCK(dev, txq);  //给数据包加锁。  spin_lock(root_lock);
//开始判断返回值  switch (ret) {  case NETDEV_TX_OK:  /* Driver sent out skb successfully */  ret = qdisc_qlen(q);  break;  case NETDEV_TX_LOCKED:  /* Driver try lock failed */  ret = handle_dev_cpu_collision(skb, txq, q);  break;  default:  /* Driver returned NETDEV_TX_BUSY - requeue skb */  if (unlikely (ret != NETDEV_TX_BUSY && net_ratelimit()))  printk(KERN_WARNING "BUG %s code %d qlen %d\n",  dev->name, ret, q->q.qlen);  ret = dev_requeue_skb(skb, q);  break;  }  if (ret && (netif_tx_queue_stopped(txq) ||  netif_tx_queue_frozen(txq)))  ret = 0;  return ret;
}

dev_queue_xmit这个函数和帧的输入中的netif_rx很类似。它不管怎么样都会调用hard_start_xmit也就是每个驱动自己实现的帧输出虚函数，可是它有两种途径，一种就是流量控制模块，也就是调用qdisc_run，第二种是直接调用hard_start_xmit(主要是虚拟设备，或者回环。).

先来看他的调用图：

int dev_queue_xmit(struct sk_buff *skb)
{  .............................  /* GSO will handle the following emulations directly. */  //有关gso的文章可以看这里：http://lwn.net/Articles/189970/  if (netif_needs_gso(dev, skb))  goto gso;
..................................
......................................  /* If packet is not checksummed and device does not support * checksumming for this protocol, complete checksumming here. */  if (skb->ip_summed == CHECKSUM_PARTIAL) {  skb_set_transport_header(skb, skb->csum_start -  skb_headroom(skb));  //进行包的校验。  if (!dev_can_checksum(dev, skb) && skb_checksum_help(skb))  goto out_kfree_skb;  }  gso:
............................................  //如果有定义enqueue，其实也就是判断是否有流量控制了。  if (q->enqueue) {  spinlock_t *root_lock = qdisc_lock(q);  spin_lock(root_lock);  //测试状态。如果是__QDISC_STATE_DEACTIVATED,则丢掉此包。  if (unlikely(test_bit(__QDISC_STATE_DEACTIVATED, &q->state))) {  kfree_skb(skb);  rc = NET_XMIT_DROP;  } else {  //否则调用qdisc_run来处理  rc = qdisc_enqueue_root(skb, q);  qdisc_run(q);  }  spin_unlock(root_lock);  //返回  goto out;  }  /* The device has no queue. Common case for software devices: loopback, all the sorts of tunnels... Really, it is unlikely that netif_tx_lock protection is necessary here.  (f.e. loopback and IP tunnels are clean ignoring statistics counters.) However, it is possible, that they rely on protection made by us here. Check this and shot the lock. It is not prone from deadlocks. Either shot noqueue qdisc, it is even simpler 8) */  //下面省略的代码就是处理一些虚拟设备。  ......................................  rc = -ENETDOWN;  rcu_read_unlock_bh();  out_kfree_skb:  kfree_skb(skb);  return rc;
out:  rcu_read_unlock_bh();  return rc;
}

最后我们来看输出队列的软中断处理函数net_tx_action，它主要会在两种情况下被触发：

1 当传输被设备打开，此时设备调用netif_wake_queue.
2 当传输已经完毕，此时设备会调用dev_kfree_skb_irq.

static void net_tx_action(struct softirq_action *h)
{  //取得cpu的softnet_data变量  struct softnet_data *sd = &__get_cpu_var(softnet_data);  if (sd->completion_queue) {  struct sk_buff *clist;  //关闭中断  local_irq_disable();  //清除sd的完成队列列表  clist = sd->completion_queue;  sd->completion_queue = NULL;  local_irq_enable();  while (clist) {  struct sk_buff *skb = clist;  clist = clist->next;  WARN_ON(atomic_read(&skb->users));  //循环释放数据包  __kfree_skb(skb);  }  }  //开始释放排队策略的相关资源。  if (sd->output_queue) {  struct Qdisc *head;  local_irq_disable();  head = sd->output_queue;  sd->output_queue = NULL;  local_irq_enable();  while (head) {  struct Qdisc *q = head;  spinlock_t *root_lock;  head = head->next_sched;  ................................  }  }
}

只有一个buffer的引用计数为0时，才能释放它，这里就是skb->users为0.

void dev_kfree_skb_irq(struct sk_buff *skb)
{  if (atomic_dec_and_test(&skb->users)) {  struct softnet_data *sd;  unsigned long flags;  local_irq_save(flags);  sd = &__get_cpu_var(softnet_data);
//将所有需要dealloc的数据包加到completime_queue中。  skb->next = sd->completion_queue;  sd->completion_queue = skb;  raise_softirq_irqoff(NET_TX_SOFTIRQ);  local_irq_restore(flags);  }
}

看段代码片段，3c59x.c的中断处理函数中的：

/* DMA has finished, which means this packet's transmission is complete. */
if (status & DMADone) {
    if (ioread16(ioaddr + Wn7_MasterStatus) & 0x1000) {
        iowrite16(0x1000, ioaddr + Wn7_MasterStatus); /* Ack the event. */
        pci_unmap_single(VORTEX_PCI(vp), vp->tx_skb_dma, (vp->tx_skb->len + 3) & ~3, PCI_DMA_TODEVICE);
        /* Hand the skb to dev_kfree_skb_irq(), which defers the free to the softirq. */
        dev_kfree_skb_irq(vp->tx_skb); /* Release the transferred buffer */
        if (ioread16(ioaddr + TxFree) > 1536) {
            /*
             * AKPM: FIXME: I don't think we need this.  If the queue was stopped due to
             * insufficient FIFO room, the TxAvailable test will succeed and call
             * netif_wake_queue()
             */
            netif_wake_queue(dev);
        } else { /* Interrupt when FIFO has room for max-sized packet. */
            iowrite16(SetTxThreshold + (1536>>2), ioaddr + EL3_CMD);
            netif_stop_queue(dev);
        }
    }
}

内核网络输出帧的处理相关推荐

Linux内核网络（一）——初探内核网络
本文将从宏观上介绍Linux内核网络协议栈和网络设备驱动程序,介绍了两个很重要的结构(net_device和sk_buff),更深入更详细的内容将在以后的文章中介绍. 首先,我们需要了解网络分层模型. ...
linux内核网络协议栈--监控和调优：接收数据（十五）
译者序本文翻译自 2016 年的一篇英文博客 Monitoring and Tuning the Linux Networking Stack: Receiving Data.如果能看懂英文,建议阅 ...
Linux内核网络栈1.2.13-tcp.c概述
参考资料 <<linux内核网络栈源代码情景分析>> af_inet.c文件中调用函数在协议层的实现本文主要根据在af_inet.c文件中根据初始化不同的协议,来调用不同的协 ...
深入Linux内核网络堆栈
前一段时间看到这篇帖子,确实很经典,于是翻出了英文原版再读,顺便再翻译出来供大家学习,这篇文章的中文版也早都有了,不过出于完全理解的目的,我还是将它翻译了出来,加进了自己的代码,虽然在上一周的翻译过程 ...
linux 内核网络协议栈
Linux网络协议栈之数据包处理过程 1前言本来是想翻译<The journey of a packet through the linux 2.4 network stack>这篇文 ...
Linux内核网络性能优化
Linux内核网络性能优化 1. 前言 2. Linux网络协议栈 3. DPDK 4. XDP 4.1 XDP主要的特性 4.2 XDP与DPDK的对比 4.3 应用场景 5. CPU负载均衡 5. ...
Linux内核网络数据发送（五）——排队规则
Linux内核网络数据发送(五)--排队规则 1. 前言 2. `qdisc_run_begin()` and `qdisc_run_end()`:仅设置 qdisc 状态位 3. `__qdisc_ ...
使用eBPFBCC提取内核网络流量信息
前言本文将分享从0开始编写自己的bcc程序.那么开始编写bcc之前,自己一定要明确,我们要用bcc提取什么数据.本文的实例是统计内核网络中的流量,我要提取的数据关键字段为进程的PID,进程的名字,进 ...
eBPF学习——抓取内核网络中的socket信息
bcc 是基于 LLVM 的工具集,用 Python 封装了底层机器相关的细节,bcc工具使得 eBPF 的使用更加方便,使用时内核探测代码用 C 写, 数据处理用 Python .本文将使用 bcc ...

内核网络输出帧的处理

内核网络输出帧的处理相关推荐

最新文章

热门文章