1 自带 irqbalance 瓶颈

2 RPS/RFS 数据结构以及更新函数

2.1 CPU 负载表 rps_map

2.1.1 rps_map 解析函数（store_rps_map）

2.2 设备流表 rps_dev_flow_table

2.2.1 设备流表初始化函数 store_rps_dev_flow_table_cnt

2.3 全局的数据流表（rps_sock_flow_table）

2.3.1 全局流表更新函数 ( rps_record_sock_flow )

3 RPS 工作流程　　

4 RFS 工作流程　

4.1 负载均衡策略 get_rps_cpu()【核心】

5 RPS/RFS 配置方法

1 自带 irqbalance 瓶颈

基于简单的中断负载均衡(如系统自带的irqbalance进程)可能会弄巧成拙。因为其并不识别网络流，只识别到这是一个数据包，不能识别到数据包的元组信息。

在多处理器系统中，每个处理器都有单独的硬件高速缓存，如果其中一个 CPU 修改了自己的硬件高速缓存，它就必须检查该数据是否包含在其它 CPU 的硬件高速缓存中，如果存在，必须通知其它 CPU 更新硬件高速缓存，这叫做 CPU 的 cache 一致性。所以为了降低 CPU 硬件高速缓存的刷新频率，需要把特征相似的数据包分配给同一个 CPU 处理。

另外还有 TCP 的 IP 包分段重组问题：一旦乱序就要重传。一个 linux 主机如果只是作为一台路由器的话，那么进入系统的一个 TCP 包的不同分段如果被不同的 cpu 处理并向一个网卡转发了，那么同步问题会很麻烦；如果不做同步处理，那么很可能后面的段被一个 cpu 先发出去了，最后接收方接收到乱序的包后就会请求重发，这样还不如由一个 cpu 串行处理。

这个需要一套方案，需要弄清楚三个问题：

哪个 CPU 在消耗这个网络数据
哪个 CPU 在处理中断
哪个 CPU 在做软中断

这三个问题就是 RPS/RFS 需要去解决的。

2 RPS/RFS 数据结构以及更新函数

linux 底层的很多机密都藏在数据结构中，包括其面向对象的设计，所以先来看下这些数据结构。网卡的硬件接收队列 netdev_rx_queue（include/linux/netdevice.h）。

/* This structure contains an instance of an RX queue. */
struct netdev_rx_queue {
#ifdef CONFIG_RPSstruct rps_map __rcu       *rps_map;struct rps_dev_flow_table __rcu    *rps_flow_table;
#endifstruct kobject            kobj;struct net_device      *dev;
} ____cacheline_aligned_in_smp;

2.1 CPU 负载表 rps_map

存放解析结果的就是网卡硬件接收队列实例的 rps_map 成员, cpus数组用来记录配置文件中配置的参与报文分发处理的cpu的数组，而len成员就是cpus数组的长度。

/*
 * This structure holds an RPS map which can be of variable length.  The
 * map is an array of CPUs.
 */
struct rps_map {
	unsigned int len;	/* number of entries stored in cpus[] */
	struct rcu_head rcu;	/* handed to kfree_rcu() when the map is replaced */
	u16 cpus[];		/* flexible array member: CPU ids */
};

2.1.1 rps_map 解析函数（store_rps_map）

// net/core/net-sysfs.c
/*
 * Parse the CPU bitmap written to an RX queue's rps_cpus attribute and
 * install the resulting RPS map on that queue.
 *
 * @queue: RX queue whose rps_map pointer is replaced
 * @buf:   bitmap string to parse
 * @len:   length of @buf
 *
 * Returns @len on success, -EPERM when the caller lacks CAP_NET_ADMIN,
 * -ENOMEM when an allocation fails, or the non-zero result of
 * bitmap_parse().
 */
static ssize_t store_rps_map(struct netdev_rx_queue *queue,
			     const char *buf, size_t len)
{
	static DEFINE_MUTEX(rps_map_mutex);
	struct rps_map *new_map, *prev_map;
	cpumask_var_t mask;
	ssize_t ret;
	int cpu, nr;

	if (!capable(CAP_NET_ADMIN))
		return -EPERM;

	if (!alloc_cpumask_var(&mask, GFP_KERNEL))
		return -ENOMEM;

	ret = bitmap_parse(buf, len, cpumask_bits(mask), nr_cpumask_bits);
	if (ret)
		goto out_free_mask;

	new_map = kzalloc(max_t(unsigned int,
				RPS_MAP_SIZE(cpumask_weight(mask)),
				L1_CACHE_BYTES),
			  GFP_KERNEL);
	if (!new_map) {
		ret = -ENOMEM;
		goto out_free_mask;
	}

	/* Fill the map from the CPUs visited by for_each_cpu_and(). */
	nr = 0;
	for_each_cpu_and(cpu, mask, cpu_online_mask)
		new_map->cpus[nr++] = cpu;

	if (nr) {
		new_map->len = nr;
	} else {
		/* Nothing selected: publish NULL instead of an empty map. */
		kfree(new_map);
		new_map = NULL;
	}

	/* Swap the queue's map pointer under the mutex. */
	mutex_lock(&rps_map_mutex);
	prev_map = rcu_dereference_protected(queue->rps_map,
					     mutex_is_locked(&rps_map_mutex));
	rcu_assign_pointer(queue->rps_map, new_map);

	/* Keep rps_needed balanced: +1 for a new map, -1 for a removed one. */
	if (new_map)
		static_key_slow_inc(&rps_needed);
	if (prev_map)
		static_key_slow_dec(&rps_needed);
	mutex_unlock(&rps_map_mutex);

	/* The previous map is released through its rcu head. */
	if (prev_map)
		kfree_rcu(prev_map, rcu);

	ret = len;

out_free_mask:
	free_cpumask_var(mask);
	return ret;
}

2.2 设备流表 rps_dev_flow_table

设备流表 rps_dev_flow_table (include/linux/netdevice.h)。mask 成员是类型为 struct rps_dev_flow 的数组的索引掩码，其值为数组大小减 1（数组大小即流表项的数目，会被向上取整为 2 的幂）；流表项的数目是通过配置文件 /sys/class/net/(dev)/queues/rx-(n)/rps_flow_cnt 进行指定的。当设置了配置文件，那么内核就会获取到数据，并初始化网卡硬件接收队列中的设备流表成员 rps_flow_table (初始化过程函数 store_rps_dev_flow_table_cnt)。

/*
 * The rps_dev_flow_table structure contains a table of flow mappings.
 */
struct rps_dev_flow_table {
	unsigned int mask;		/* index mask: flows[hash & mask] */
	struct rcu_head rcu;		/* handed to call_rcu() when the table is replaced */
	struct rps_dev_flow flows[];	/* flexible array member: mask + 1 entries */
};

rps_dev_flow 类型的实例则主要包括存放着上次处理该流中报文的 cpu 以及所在 cpu 私有数据对象 softnet_data 的 input_pkt_queue 队列尾部索引的两个成员。

/*
 * The rps_dev_flow structure contains the mapping of a flow to a CPU, the
 * tail pointer for that CPU's input queue at the time of last enqueue, and
 * a hardware filter index.
 */
struct rps_dev_flow {
	u16		cpu;		/* CPU the flow is mapped to */
	u16		filter;		/* hardware filter index */
	unsigned int	last_qtail;	/* input queue tail at last enqueue */
};

2.2.1 设备流表初始化函数 store_rps_dev_flow_table_cnt

/*
 * Parse the entry count written to an RX queue's rps_flow_cnt attribute and
 * replace the queue's device flow table with a freshly allocated one.
 *
 * @queue: RX queue whose rps_flow_table pointer is replaced
 * @buf:   string holding the requested number of entries
 * @len:   length of @buf
 *
 * A count of 0 removes the table (the pointer is set to NULL).  Any other
 * count is rounded up to a power of two.
 *
 * Returns @len on success, -EPERM when the caller lacks CAP_NET_ADMIN,
 * the negative result of kstrtoul(), -EINVAL when the rounded size does not
 * fit, or -ENOMEM when vmalloc() fails.
 */
static ssize_t store_rps_dev_flow_table_cnt(struct netdev_rx_queue *queue,
					    const char *buf, size_t len)
{
	unsigned long mask, count;
	struct rps_dev_flow_table *table, *old_table;
	static DEFINE_SPINLOCK(rps_dev_flow_lock);
	int rc;

	if (!capable(CAP_NET_ADMIN))
		return -EPERM;

	rc = kstrtoul(buf, 0, &count);
	if (rc < 0)
		return rc;

	if (count) {
		mask = count - 1;
		/* mask = roundup_pow_of_two(count) - 1;
		 * without overflows...
		 */
		while ((mask | (mask >> 1)) != mask)
			mask |= (mask >> 1);
		/* On 64 bit arches, must check mask fits in table->mask (u32),
		 * and on 32bit arches, must check
		 * RPS_DEV_FLOW_TABLE_SIZE(mask + 1) doesn't overflow.
		 */
#if BITS_PER_LONG > 32
		if (mask > (unsigned long)(u32)mask)
			return -EINVAL;
#else
		if (mask > (ULONG_MAX - RPS_DEV_FLOW_TABLE_SIZE(1))
				/ sizeof(struct rps_dev_flow)) {
			/* Enforce a limit to prevent overflow */
			return -EINVAL;
		}
#endif
		table = vmalloc(RPS_DEV_FLOW_TABLE_SIZE(mask + 1));
		if (!table)
			return -ENOMEM;

		table->mask = mask;
		/* Mark every entry as not yet mapped to a CPU. */
		for (count = 0; count <= mask; count++)
			table->flows[count].cpu = RPS_NO_CPU;
	} else {
		table = NULL;
	}

	/* Swap the queue's table pointer under the spinlock. */
	spin_lock(&rps_dev_flow_lock);
	old_table = rcu_dereference_protected(queue->rps_flow_table,
					      lockdep_is_held(&rps_dev_flow_lock));
	rcu_assign_pointer(queue->rps_flow_table, table);
	spin_unlock(&rps_dev_flow_lock);

	/* The previous table is released through its rcu head. */
	if (old_table)
		call_rcu(&old_table->rcu, rps_dev_flow_table_release);

	return len;
}

2.3 全局的数据流表（rps_sock_flow_table）

该表中包含了数据流期望被处理的 CPU，是当前处理流中报文的应用程序所在的 CPU。全局 socket 流表会在调用 recvmsg、sendmsg (inet_accept(), inet_recvmsg(), inet_sendmsg(), inet_sendpage() and tcp_splice_read()) 时被设置更新，最终调用函数 rps_record_sock_flow 来更新 ents 数组。

/*
 * The rps_sock_flow_table contains mappings of flows to the last CPU
 * on which they were processed by the application (set in recvmsg).
 * Each entry is a 32bit value. Upper part is the high-order bits
 * of flow hash, lower part is CPU number.
 * rps_cpu_mask is used to partition the space, depending on number of
 * possible CPUs : rps_cpu_mask = roundup_pow_of_two(nr_cpu_ids) - 1
 * For example, if 64 CPUs are possible, rps_cpu_mask = 0x3f,
 * meaning we use 32-6=26 bits for the hash.
 */
struct rps_sock_flow_table {
	u32	mask;	/* index mask: ents[hash & mask] */
	u32	ents[] ____cacheline_aligned_in_smp;	/* flexible array member */
};

mask 成员存放的是 ents 这个数组的索引掩码（访问时使用 ents[hash & mask]），数组的大小是通过配置文件 /proc/sys/net/core/rps_sock_flow_entries 的方式指定的。

2.3.1 全局流表更新函数 ( rps_record_sock_flow )

每次用户程序读取数据包都会更新 rps_sock_flow_table 表，保证其中的 CPU号是最新的。

// include/linux/netdevice.h
/*
 * Record in the global socket flow table that the flow identified by @hash
 * was last handled on the CPU this code is running on.
 *
 * @table: global socket flow table; nothing is done when NULL
 * @hash:  flow hash; nothing is done when 0
 *
 * The stored entry combines the hash bits outside rps_cpu_mask with the
 * current CPU id.
 */
static inline void rps_record_sock_flow(struct rps_sock_flow_table *table,
					u32 hash)
{
	unsigned int slot;
	u32 entry;

	if (!table || !hash)
		return;

	slot = hash & table->mask;

	/* We only give a hint, preemption can change CPU under us */
	entry = (hash & ~rps_cpu_mask) | raw_smp_processor_id();

	/* Write only when the value actually changes. */
	if (table->ents[slot] != entry)
		table->ents[slot] = entry;
}

3 RPS 工作流程　　

rps 和 rfs 的工作都是在软中断上下文中执行，因为该阶段处理工作是和进程无关的，又和底层硬件剥离了。能够实现网络协议栈软中断的负载均衡。

rps 实现的总流程如下：将数据包加入其它 CPU 的接收队列，其它 CPU 将会在自己的软中断中执行 process_backlog，process_backlog 将会取出并处理接收队列中的所有数据包，并调用 __netif_receive_skb() 执行后续工作。

Linux是通过配置文件的方式指定哪些cpu核参与到报文的分发处理，配置文件存放的路径是：/sys/class/net/(dev)/queues/rx-(n)/rps_cpus。设置好该配置文件之后，内核就会去获取该配置文件的内容，然后根据解析的结果生成一个用于参与报文分发处理的cpu列表，这样当收到报文之后，就可以建立起 hash-cpu 的映射关系了，解析函数为store_rps_map，结果存放在 rps_map 中。

rps 会根据数据包的hash值(报文hash值，可以由网卡计算得到，也可以是由软件计算得到，具体的计算也因报文协议不同而有所差异，如tcp报文的hash值是根据四元组信息，即源ip、源端口、目的ip和目的端口进行hash计算得到的)来选择CPU，选目标cpu的动作具体的实现函数是get_rps_cpu，其定义在net/core/dev.c文件中, 实现从cpu列表中获取核号的：

static int get_rps_cpu(struct net_device *dev, struct sk_buff *skb, struct rps_dev_flow **rflowp)

rps 是单纯用报文的hash值来分发报文，而不关注处理该流中报文的应用程序所在的cpu。

那么何时调用函数get_rps_cpu呢？

支持 NAPI 接口的驱动而言，在上半部主要就是将设备加入到 cpu 的私有数据 softnet_data 的待轮询设备列表中，下半部主要就是调用poll回调函数从网卡缓冲区中获取报文，然后向上给协议栈。

函数 get_rps_cpu 会被 netif_receive_skb 调用，获取到用于分发处理报文的目标 cpu，如果目标 cpu 有效，则会调用 enqueue_to_backlog()（net/core/dev.c）函数，尝试将报文加入到 cpu 私有数据对象 softnet_data 的 input_pkt_queue 队列中。

数据对象 softnet_data 中有 backlog 的成员，该成员类型为 struct napi_struct，在函数 enqueue_to_backlog() 中，会将 backlog 加入到 cpu 的待轮询设备列表中，并触发软中断，在软中断处理函数 net_rx_action() 中会依次遍历待轮询设备列表中的设备，并调用设备注册的 poll 回调函数来进行报文处理。

从 input_pkt_queue 队列中取出报文，然后调用 __netif_receive_skb() 将报文上送至协议栈进行后续处理。

如果没有rps的处理流程（现在一般均采用的是NAPI收包方式），软中断处理函数net_rx_action()会调用网卡驱动注册的poll回调函数从网卡中获取到报文数据后就将报文数据上送至协议栈。

对于有rps的处理流程，软中断处理函数 net_rx_action() 会调用网卡驱动注册的 poll 回调函数从网卡中获取到报文数据后，暂时不直接送至协议栈，而是选择一个目标cpu，将报文放到该cpu私有数据对象 softnet_data 的 input_pkt_queue 队列中，待队列 input_pkt_queue 满了之后，就将该cpu对应的backlog设备对象加入到该cpu的待轮询设备列表中，并触发软中断，软中断处理函数轮询到 backlog 设备对象后，调用poll回调函数 process_backlog() 从 input_pkt_queue 队列中取出报文，再上送至协议栈。

如下图左边是没有使用 rps，右边是使用 rps。

4 RFS 工作流程　

RFS 是在 RPS 上的改进：通过 RPS 可以把同一流的数据包分发给同一个 CPU 核来处理，但是该数据流被分发到的 CPU 核和执行处理该数据流的应用程序的 CPU 核有可能不是同一个。

当用户态处理报文的 cpu 和内核处理报文软中断的 cpu 不同的时候，就会导致 cpu 的缓存不命中。而 rfs 就是用来处理这种情况的，rfs 的目标是通过指派处理报文的应用程序所在的 cpu 来在内核态处理报文，以此来增加 cpu 的缓存命中率。rfs 和 rps 的主要差别就是在选取分发处理报文的目标cpu上。

rfs 实现指派处理报文的应用程序所在的cpu来在内核态处理报文主要是依靠两个流表来实现的：

一个是设备流表 rps_dev_flow_table，记录的是上次在内核态处理该流中报文的cpu；
一个是全局的socket流表 rps_sock_flow_table，记录报文期望被处理的目标cpu。

4.1 负载均衡策略 get_rps_cpu()【核心】

/** get_rps_cpu is called from netif_receive_skb and returns the target* CPU from the RPS map of the receiving queue for a given skb.* rcu_read_lock must be held on entry.*/
static int get_rps_cpu(struct net_device *dev, struct sk_buff *skb,struct rps_dev_flow **rflowp)
{const struct rps_sock_flow_table *sock_flow_table;struct netdev_rx_queue *rxqueue = dev->_rx;struct rps_dev_flow_table *flow_table;struct rps_map *map;int cpu = -1;u32 tcpu;u32 hash;if (skb_rx_queue_recorded(skb)) {u16 index = skb_get_rx_queue(skb);//获取网卡的rx队列if (unlikely(index >= dev->real_num_rx_queues)) {WARN_ONCE(dev->real_num_rx_queues > 1,"%s received packet on queue %u, but number ""of RX queues is %u\n",dev->name, index, dev->real_num_rx_queues);goto done;}rxqueue += index;}/* Avoid computing hash if RFS/RPS is not active for this rxqueue */flow_table = rcu_dereference(rxqueue->rps_flow_table);// RPS 逻辑map = rcu_dereference(rxqueue->rps_map);if (!flow_table && !map)goto done;skb_reset_network_header(skb);hash = skb_get_hash(skb);if (!hash)goto done;sock_flow_table = rcu_dereference(rps_sock_flow_table);// RFS 逻辑if (flow_table && sock_flow_table) {struct rps_dev_flow *rflow;u32 next_cpu;u32 ident;/* First check into global flow table if there is a match */ident = sock_flow_table->ents[hash & sock_flow_table->mask];if ((ident ^ hash) & ~rps_cpu_mask)goto try_rps;next_cpu = ident & rps_cpu_mask;/* OK, now we know there is a match,* we can look at the local (per receive queue) flow table*/rflow = &flow_table->flows[hash & flow_table->mask];tcpu = rflow->cpu;/** If the desired CPU (where last recvmsg was done) is* different from current CPU (one in the rx-queue flow* table entry), switch if one of the following holds:*   - Current CPU is unset (>= nr_cpu_ids).*   - Current CPU is offline.*   - The current CPU's queue tail has advanced beyond the*     last packet that was enqueued using this table entry.*     This guarantees that all previous packets for the flow*     have been dequeued, thus preserving in order delivery.*/if (unlikely(tcpu != next_cpu) &&(tcpu >= nr_cpu_ids || !cpu_online(tcpu) ||((int)(per_cpu(softnet_data, tcpu).input_queue_head -rflow->last_qtail)) >= 0)) {tcpu = next_cpu;rflow = set_rps_cpu(dev, skb, rflow, next_cpu);}if (tcpu < nr_cpu_ids 
&& cpu_online(tcpu)) {*rflowp = rflow;cpu = tcpu;goto done;}}try_rps:if (map) { //RPS 逻辑tcpu = map->cpus[reciprocal_scale(hash, map->len)];if (cpu_online(tcpu)) {cpu = tcpu;goto done;}}done:return cpu;
}

rfs 的负载均衡策略通过判断报文的 hash 值(流 hash 值)所对应的两个流表（设备流表和全局socket流表）中的记录的cpu是否相同来实施的。

1、如果当前CPU表（设备流表）对应表项未设置或者当前CPU表对应表项映射的 CPU 核处于离线状态，那么使用期望CPU表（全局流表）对应表项映射的CPU核。

2、如果当前CPU表对应表项映射的CPU核和期望CPU表对应表项映射的CPU核为同一个，就使用这一个核。

3、如果当前CPU表对应表项映射的CPU核和期望CPU表对应表项映射的CPU核不为同一个：

a) 如果同一流的前一段数据包未处理完，必须使用当前CPU表对应表项映射的CPU核，避免乱序。
b) 如果同一流的前一段数据包已经处理完，那么则可以使用期望CPU表对应表项映射的CPU核。

4、如果设备流表和全局流表均未设置，则使用 rps 策略中的 CPU 负载表（rps_map）。

5、如果 rps 表、设备流表、全局流表均未设置，返回无效 cpu_index(-1)。

5 RPS/RFS 配置方法

在大于等于2.6.35版本的Linux kernel上可以直接使用，默认都是关闭的。

RPS设置：

RPS指定哪些接收队列需要通过rps平均到配置的cpu列表上。

/sys/class/net/(dev)/queues/rx-(n)/rps_cpus

RFS设置：

每个队列的数据流表总数可以通过下面的参数来设置：

该值设置成rps_sock_flow_entries/N，其中Ｎ表示设备的接收队列数量。

/sys/class/net/(dev)/queues/rx-(n)/rps_flow_cnt

全局数据流表(rps_sock_flow_table)的总数，红帽是建议设置成 32768，一般设置成最大并发链接数量

/proc/sys/net/core/rps_sock_flow_entries

linux 内核网络中 RPS/RFS 原理Ⅰ相关推荐

Linux内核网络中的软中断ksoftirqd
1. 前言之前分享过Linux内核网络数据包的接收过程,当执行到网卡通过硬件中断(IRQ)通知CPU,告诉它有数据来了,CPU会根据中断表,调用已经注册的中断函数,这个中断函数会调到驱动程序(NIC ...
Linux内核网络中数据报在协议层的处理
1. 前言本文主要分析数据报从 IP 协议层进入协议栈,通过udp协议层,到达 socket,最终被用户程序读取的过程.在此过程中,介绍了 IP 协议层和 UDP 协议层中监测数据和网络调优的方法. ...
精通Linux内核网络 -(以)罗森
讨论了Linux内核网络栈的实现原理,并对网络子系统及其体系结构进行了深入细致的分析.主要内容包括:核心网络的基础知识,Netlink socket.ARP.邻居发现和ICMP等重要协议的实现,IPv ...
Linux内核网络丢包查看工具dropwatch的安装和使用
本文将安装并使用dropwatch工具,来收集并查看Linux内核网络中丢包的数量和位置. 安装 sudo apt-get install -y libnl-3-dev libnl-genl-3-de ...
LINUX内核网络丢包监控
2020年8月11日 | 由梁金荣 | 800字 | 阅读大约需要2分钟 | 归档于内核网络 | 原文:http://kerneltravel.net/blog/2020/network_ljr6 ...
内核网络中的GRO、RFS、RPS技术介绍和调优
内核网络中的GRO.RFS.RPS技术介绍和调优 1. 前言 2. GRO(Generic Receive Offloading) 2.1 使用 ethtool 修改 GRO 配置 2.2 napi_ ...
Linux内核网络性能优化
Linux内核网络性能优化 1. 前言 2. Linux网络协议栈 3. DPDK 4. XDP 4.1 XDP主要的特性 4.2 XDP与DPDK的对比 4.3 应用场景 5. CPU负载均衡 5. ...
Linux网络协议：当eBPF遇上Linux内核网络 | Linux内核之旅
<直播预告 | 当eBPF遇见Linux内核网络> 哔哩哔哩:https://www.bilibili.com/video/BV1ch411U75f?from=search&sei ...
Linux内核网络数据包发送（四）——Linux netdevice 子系统
Linux内核网络数据包发送(四)--Linux netdevice 子系统 1. 前言 2. `dev_queue_xmit` and `__dev_queue_xmit` 2.1 `netdev_ ...
Linux内核--网络栈实现分析（二）--数据包的传递过程--转
转载地址http://blog.csdn.net/yming0221/article/details/7492423 作者:闫明本文分析基于Linux Kernel 1.2.13 注:标题中的&qu ...

linux 内核网络中 RPS/RFS 原理Ⅰ

1 自带 irqbalance 瓶颈

2 RPS/RFS 数据结构以及更新函数

2.1 CPU 负载表 rps_map

2.1.1 rps_map 解析函数（store_rps_map）

2.2 设备流表 rps_dev_flow_table

2.2.1 设备流表初始化函数 store_rps_dev_flow_table_cnt

2.3 全局的数据流表（rps_sock_flow_table）

2.3.1 全局流表更新函数 ( rps_record_sock_flow )

3 RPS 工作流程

4 RFS 工作流程

4.1 负载均衡策略 get_rps_cpu()【核心】

5 RPS/RFS 配置方法

linux 内核网络中 RPS/RFS 原理Ⅰ相关推荐

最新文章

热门文章