linux内核包转发过程（三）NIC帧接收分析

每一个CPU都有自己的队列来处理接收到的帧，也都有各自的数据结构来处理入口和出口流量，因此不同CPU之间没有必要使用上锁机制。

此队列数据结构为softnet_data(定义在include/linux/netdevice.h中):

/** Incoming packets are placed on per-cpu queues so that* no locking is needed.*/
struct softnet_data
{
struct Qdisc *output_queue;
struct sk_buff_headinput_pkt_queue;//有数据要传输的设备列表
struct list_headpoll_list; //双向链表，当中的设备有输入帧等着被处理。
struct sk_buff*completion_queue;//缓冲区列表。当中缓冲区已成功传输，能够释放掉struct napi_structbacklog;
};

此结构字段可用于传输和接收。

换而言之，NET_RX_SOFTIRQ和NET_TX_SOFTIRQ软IRQ都引用此结构。入口帧会排入input_pkt_queue(NAPI有所不同)。

softnet_data是在net_dev_init函数中初始化的：

/*
 *       This is called single threaded during boot, so no need
 *       to take the rtnl semaphore.
 *
 * Excerpt only: the parts of the function that are not related to the
 * receive path were elided by the article and are marked below.
 */
static int __init net_dev_init(void)
{
	int i, rc = -ENOMEM;

	/* ... code elided in this excerpt ... */

	/*
	 *	Initialise the packet receive queues.
	 */
	for_each_possible_cpu(i) {
		struct softnet_data *queue;

		queue = &per_cpu(softnet_data, i);
		skb_queue_head_init(&queue->input_pkt_queue);
		queue->completion_queue = NULL;
		INIT_LIST_HEAD(&queue->poll_list);

		/* The per-cpu backlog NAPI instance is polled by process_backlog. */
		queue->backlog.poll = process_backlog;
		queue->backlog.weight = weight_p;
		queue->backlog.gro_list = NULL;
		queue->backlog.gro_count = 0;
	}

	/* ... code elided in this excerpt ... */

	/* Register the handlers for the transmit and receive softirqs. */
	open_softirq(NET_TX_SOFTIRQ, net_tx_action);
	open_softirq(NET_RX_SOFTIRQ, net_rx_action);

	/* ... remainder of the function elided in this excerpt ... */
}

非NAPI设备驱动会为其所接收的每个帧产生一个中断事件，在高流量负载下，会花掉大量时间处理中断事件，造成资源浪费。

而NAPI驱动混合了中断事件和轮询。在高流量负载下其性能会比旧方法要好。
NAPI的主要思想是混合使用中断事件和轮询，而不是只使用中断事件驱动模型：当收到新的帧时，先关闭中断，再一次性处理完入口队列中的全部帧。

从内核的观点来看，NAPI方法由于中断事件少了，降低了CPU负载。

使用非NAPI的驱动程序的xx_rx()函数一般如下：

void xx_rx()
{
struct sk_buff *skb;skb = dev_alloc_skb(pkt_len + 5);
if (skb != NULL) {
skb_reserve(skb, 2);/* Align IP on 16 byte boundaries *//*memcpy(skb_put(skb, 2), pkt, pkt_len);*/ //copy data to skbskb->protocol = eth_type_trans(skb, dev);
netif_rx(skb);
}
}

第一步是分配一个缓存区来保存报文。

注意缓存分配函数 (dev_alloc_skb) 需要知道数据长度。

第二步将报文数据复制到缓存区；skb_put 函数更新缓存中的数据末尾指针，并返回指向新建空间的指针。

第三步提取协议标识及获取其它信息。

最后调用netif_rx(skb)做进一步处理。该函数一般定义在net/core/dev.c中。

int netif_rx(struct sk_buff *skb)
{
struct softnet_data *queue;
unsigned long flags;/* if netpoll wants it, pretend we never saw it */
if (netpoll_rx(skb))
return NET_RX_DROP;if (!skb->tstamp.tv64)
net_timestamp(skb);/*
* The code is rearranged so that the path is the most
* short when CPU is congested, but is still operating.
*/
local_irq_save(flags);
queue = &__get_cpu_var(softnet_data);__get_cpu_var(netdev_rx_stat).total++;
if (queue->input_pkt_queue.qlen <= netdev_max_backlog) {//是否还有空间,netdev_max_backlog一般为300
//仅仅有当新缓冲区为空时。才会触发软中断（napi_schedule()）,假设缓冲区不为空，软中断已被触发。没有必要再去触发一次。

if (queue->input_pkt_queue.qlen) { enqueue: __skb_queue_tail(&queue->input_pkt_queue, skb);//这里是关键之处。将skb增加input_pkt_queue之中。 local_irq_restore(flags); return NET_RX_SUCCESS; } napi_schedule(&queue->backlog);//触发软中断 goto enqueue; } __get_cpu_var(netdev_rx_stat).dropped++; local_irq_restore(flags); kfree_skb(skb); return NET_RX_DROP; } EXPORT_SYMBOL(netif_rx);

/*
 * Schedule NAPI instance @n for polling, but only when
 * napi_schedule_prep() says it may be scheduled.
 */
static inline void napi_schedule(struct napi_struct *n)
{
	if (!napi_schedule_prep(n))
		return;

	__napi_schedule(n);
}

void __napi_schedule(struct napi_struct *n)
{unsigned long flags;local_irq_save(flags);list_add_tail(&n->poll_list, &__get_cpu_var(softnet_data).poll_list);//将该设备增加轮询链表，等待该设备的帧被处理__raise_softirq_irqoff(NET_RX_SOFTIRQ);//终于触发软中断local_irq_restore(flags);
}
EXPORT_SYMBOL(__napi_schedule);

至此中断的上半部完成，其它的工作交由下半部来实现。napi_schedule(&queue->backlog)函数将有数据包等待接收的NIC链入softnet_data的poll_list队列，然后触发软中断，让下半部去完成数据的处理工作。
而使用NAPI的设备在接收数据时直接触发软中断，不需要通过netif_rx()函数设置好接收队列再触发软中断。

比方e100硬中断处理函数为：

static irqreturn_t e100_intr(int irq, void *dev_id)
{struct net_device *netdev = dev_id;struct nic *nic = netdev_priv(netdev);u8 stat_ack = ioread8(&nic->csr->scb.stat_ack);DPRINTK(INTR, DEBUG, "stat_ack = 0x%02X\n", stat_ack);if (stat_ack == stat_ack_not_ours ||   /* Not our interrupt */stat_ack == stat_ack_not_present)  /* Hardware is ejected */return IRQ_NONE;/* Ack interrupt(s) */iowrite8(stat_ack, &nic->csr->scb.stat_ack);/* We hit Receive No Resource (RNR); restart RU after cleaning */if (stat_ack & stat_ack_rnr)nic->ru_running = RU_SUSPENDED;if (likely(napi_schedule_prep(&nic->napi))) {e100_disable_irq(nic);__napi_schedule(&nic->napi);//此处触发软中断}return IRQ_HANDLED;
}

在前面我们已经知道，net_dev_init()函数中注册了收包软中断处理函数net_rx_action()，当软中断被触发之后，该函数将被调用。

net_rx_action()函数为：

static void net_rx_action(struct softirq_action *h)
{struct list_head *list = &__get_cpu_var(softnet_data).poll_list;unsigned long time_limit = jiffies + 2;int budget = netdev_budget;void *have;local_irq_disable();while (!list_empty(list)) {struct napi_struct *n;int work, weight;/* If softirq window is exhuasted then punt.* Allow this to run for 2 jiffies since which will allow* an average latency of 1.5/HZ.*/if (unlikely(budget <= 0 || time_after(jiffies, time_limit)))//入口队列仍然有缓冲区。软IRQ再度被调度运行。goto softnet_break;local_irq_enable();/* Even though interrupts have been re-enabled, this* access is safe because interrupts can only add new* entries to the tail of this list, and only ->poll()* calls can remove this head entry from the list.*/n = list_entry(list->next, struct napi_struct, poll_list);have = netpoll_poll_lock(n);weight = n->weight;/* This NAPI_STATE_SCHED test is for avoiding a race* with netpoll's poll_napi().  Only the entity which* obtains the lock and sees NAPI_STATE_SCHED set will* actually make the ->poll() call.  Therefore we avoid* accidently calling ->poll() when NAPI is not scheduled.*/work = 0;if (test_bit(NAPI_STATE_SCHED, &n->state)) {work = n->poll(n, weight);//运行poll函数，返回已处理的帧trace_napi_poll(n);}WARN_ON_ONCE(work > weight);budget -= work;local_irq_disable();/* Drivers must not modify the NAPI state if they* consume the entire weight.  In such cases this code* still "owns" the NAPI instance and therefore can* move the instance around on the list at-will.*/if (unlikely(work == weight)) {//队列被清空。

调用napi_complete()负责此事。 if (unlikely(napi_disable_pending(n))) { local_irq_enable(); napi_complete(n); local_irq_disable(); } else list_move_tail(&n->poll_list, list); } netpoll_poll_unlock(have); } out: local_irq_enable(); #ifdef CONFIG_NET_DMA /* * There may not be any more sk_buffs coming right now, so push * any pending DMA copies to hardware */ dma_issue_pending_all(); #endif return; softnet_break: __get_cpu_var(netdev_rx_stat).time_squeeze++; __raise_softirq_irqoff(NET_RX_SOFTIRQ); goto out; }

由上可见，下半部的主要工作是遍历有数据帧等待接收的设备链表，对于每一个设备，运行它对应的poll函数。
对非NAPI设备来说，poll函数在net_dev_init()函数中初始化为process_backlog()。
process_backlog()函数定义为：

static int process_backlog(struct napi_struct *napi, int quota)
{int work = 0;struct softnet_data *queue = &__get_cpu_var(softnet_data);unsigned long start_time = jiffies;napi->weight = weight_p;do {struct sk_buff *skb;local_irq_disable();skb = __skb_dequeue(&queue->input_pkt_queue);if (!skb) {__napi_complete(napi);local_irq_enable();break;}local_irq_enable();netif_receive_skb(skb);} while (++work < quota && jiffies == start_time);return work;
}

对NAPI设备来说，驱动程序必须提供一个poll方法，poll方法有以下原型:
/*
 * NOTE(review): net_rx_action() in this article calls n->poll(n, weight)
 * with a plain int, and process_backlog() is declared as
 * (struct napi_struct *, int). The second parameter here presumably should
 * be "int budget" rather than "int *budget" -- confirm against the kernel
 * version being described.
 */
int (*poll)(struct napi_struct *dev, int *budget);
在初始化时须要加入该方法：
netif_napi_add(netdev, &nic->napi, xx_poll, XX_NAPI_WEIGHT);

NAPI驱动的 poll 方法实现一般如下（借用《Linux设备驱动程序》中的代码，其接口与本文分析的内核版本不完全对应，这里未作改写）:

static int xx_poll(struct net_device *dev, int *budget)
{int npackets = 0, quota = min(dev->quota, *budget);struct sk_buff *skb;struct xx_priv *priv = netdev_priv(dev);struct xx_packet *pkt;while (npackets < quota && priv->rx_queue) {pkt = xx_dequeue_buf(dev);skb = dev_alloc_skb(pkt->datalen + 2);if (! skb) {if (printk_ratelimit())printk(KERN_NOTICE "xx: packet dropped\n"); priv->stats.rx_dropped++; xx_release_buffer(pkt); continue;}memcpy(skb_put(skb, pkt->datalen), pkt->data, pkt->datalen);skb->dev = dev;skb->protocol = eth_type_trans(skb, dev);skb->ip_summed = CHECKSUM_UNNECESSARY; /* don't check it */netif_receive_skb(skb);/* Maintain stats */npackets++;priv->stats.rx_packets++;priv->stats.rx_bytes += pkt->datalen;xx_release_buffer(pkt);}/* If we processed all packets, we're done; tell the kernel and reenable ints */*budget -= npackets;dev->quota -= npackets;if (! priv->rx_queue) {netif_rx_complete(dev);xx_rx_ints(dev, 1);return 0;}/* We couldn't process everything. */return 1;}

NAPI驱动提供自己的poll函数和私有队列。
无论是非NAPI或NAPI，他们的poll函数最后都会调用netif_receive_skb(skb)来处理接收到的帧。

该函数会向各个已注册的协议例程递交skb，之后数据进入Linux内核协议栈处理。

本文转自mfrbuaa博客园博客，原文链接：http://www.cnblogs.com/mfrbuaa/p/4642266.html，如需转载请自行联系原作者

linux内核包转发过程（三）NIC帧接收分析相关推荐

实验三：跟踪分析Linux内核的启动过程 ----- 20135108 李泽源
实验要求: 使用gdb跟踪调试内核从start_kernel到init进程启动详细分析从start_kernel到init进程启动的过程并结合实验截图撰写一篇署名博客,并在博客文章中注明" ...
linux内核启动分析三,Linux内核分析实验三：跟踪分析Linux内核的启动过程
贺邦 + 原创作品转载请注明出处 + <Linux内核分析>MOOC课程 http://mooc.study.163.com/course/USTC-1000029000 一. 实验过程 ...
图解分析 Linux 网络包发送过程
大家好,下面的文章转发一个鹅厂同学的文章,这篇文章从应用到内核,写的非常不错,希望大家分析某个技术也可以从这方面入手. ----- 大家好,我是飞哥! 半年前我以源码的方式描述了网络包的接收过程.之后 ...
拆解 Linux 网络包发送过程
半年前我以源码的方式描述了网络包的接收过程.之后不断有粉丝提醒我还没聊发送过程呢.好,安排! 在开始今天的文章之前,我先来请大家思考几个小问题. 问1:我们在查看内核发送数据消耗的 CPU 时,是应该 ...
编译linux内核生成.ko,Linux内核编译完整过程
通过网上的资料我自己的实际内核编译,我把对Linux内核编译的过程写在这里,也许对其他的Linux爱好者的编译学习有些帮助,其中很大部分是网上的资料,另外就是我在实际编译过程中的一些实际经验. 内核简 ...
linux内核的配置过程,linux内核的配置机制及其编译过程
linux内核的配置机制及其编译过程. 一.配置系统的基本结构 Linux内核的配置系统由三个部分组成,分别是: 1.Makefile:分布在 Linux 内核源代码根目录及各层目录中,定义 Linu ...
Bochs调试Linux内核6 - 启动过程调试 - 跳到bootsect引导程序执行
接此,Bochs调试Linux内核5 - 启动过程调试 - 认识Bootsect.S_bcbobo21cn的专栏-CSDN博客看一下,0x00007c11 这里是重复执行串传送:而后一条 ...
Linux内核 eBPF基础：kprobe原理源码分析：源码分析
Linux内核 eBPF基础 kprobe原理源码分析:源码分析荣涛 2021年5月11日在 <Linux内核 eBPF基础:kprobe原理源码分析:基本介绍与使用>中已经介绍了kp ...
Linux内核 eBPF基础：kprobe原理源码分析：基本介绍与使用示例
Linux内核 eBPF基础 kprobe原理源码分析:基本介绍与使用示例荣涛 2021年5月11日 kprobe调试技术是为了便于跟踪内核函数执行状态所设计的一种轻量级内核调试技术. 利用kpro ...

linux内核包转发过程（三）NIC帧接收分析

linux内核包转发过程（三）NIC帧接收分析相关推荐

最新文章

热门文章