dpvs中fdir与sa_pool介绍
问题引入
- dpvs是dpdk程序,特点就是每个核尽可能不与其他核交互,这就要求共享数据都有一份拷贝,或是数据私有。fnat模式中流表(session)保存连接信息,每个核独有。但这里有个问题,full-nat模式下,返程数据outbound packet也必须分配到与inbound packet收取的同一个lcore,否则在流表中找不到conn
- dpvs解决方案
- 引入fdir机制
- lcore上lip都是同步一致的,根据本地端口lport,来分配正确的核,此处用于fdir计算的lport根据启用的lcore数量做掩码(即dst_port_mask)
配置文件fdir
- 先看一下配置文件
<init> device dpdk0 {rx {queue_number 8descriptor_number 1024rss all}tx {queue_number 8descriptor_number 1024}fdir {mode perfectpballoc 64kstatus matched}! promisc_modekni_name dpdk0.kni
}
- 可以看到,rx队列配置了rss,并且dpdk0网卡配置了fdir
默认fdir配置
- 在对网卡初始化时,会用到default_port_conf,这里有关于网卡fdir默认配置
static struct rte_eth_conf default_port_conf =
{.rxmode ={.mq_mode = ETH_MQ_RX_RSS,.max_rx_pkt_len = ETHER_MAX_LEN,.split_hdr_size = 0,.offloads = DEV_RX_OFFLOAD_IPV4_CKSUM,},.rx_adv_conf ={.rss_conf ={.rss_key = NULL,.rss_hf = /*ETH_RSS_IP*/ ETH_RSS_TCP,},},.txmode ={.mq_mode = ETH_MQ_TX_NONE,},.fdir_conf ={.mode = RTE_FDIR_MODE_PERFECT,.pballoc = RTE_FDIR_PBALLOC_64K,.status = RTE_FDIR_REPORT_STATUS /*_ALWAYS*/,.mask ={.vlan_tci_mask = 0x0,.ipv4_mask ={.src_ip = 0x00000000,.dst_ip = 0xFFFFFFFF,},.ipv6_mask ={.src_ip ={0, 0, 0, 0},.dst_ip ={ 0xFFFFFFFF, 0xFFFFFFFF, 0xFFFFFFFF, 0xFFFFFFFF },},.src_port_mask = 0x0000,/* to be changed according to slave lcore number in use */.dst_port_mask = 0x00F8,.mac_addr_byte_mask = 0x00,.tunnel_type_mask = 0,.tunnel_id_mask = 0,},.drop_queue = 127,.flex_conf ={.nb_payloads = 0,.nb_flexmasks = 0,},},
};
- rx_adv_conf关于RSS配置,默认是ETH_RSS_TCP
- 最重要的是fdir_conf,可以看到mode,pballoc,status等。这里关注mask即可,fdir支持不同层的导流,ipv4_mask.src_ip掩码是0,ipv4_mask.dst_ip位全置1,所以fdir只看目的地ip不看源ip,src_port_mask是0,dst_port_mask非0,也就是说dpvs fdir只根据dst_ip,dst_port_mask计算,也就是对应<lip,lport>,由于lip只有一个,所以等同于只看lport,那么如何如何设置dst_port_mask掩码呢?
- dst_port_mask设置参见下面netif_start_port流程
sa_pool与fdir初始化
每个lcore都有自己的sa_pool,用于管理本地分配的<lip,lport>,假如当前启用了64个lcore,一共有65535-1024可用端口,那么每个lcore在同一个lip上最多使用(65535-1024)/64个地址
相关结构体
enum {SA_F_USED = 0x01, };/*** if really need to to save memory, we can;* 1. use hlist_head* 2. use uint8_t flag* 3. remove sa_entry.addr, and get IP from sa_pool->ifa* 4. to __packed__ sa_entry.* 5. alloc sa_entries[] for 65536/cpu_num only.* 6. create sa_entry_pool only if pool_hash hit.* since when dest (like RS) num may small.*//* socket address (sa) is <ip, port> pair. */ struct sa_entry {struct list_head list; /* node of sa_pool. *///标志,目前只有SA_F_USED,主要标记该端口是否被使用了(空闲/busy状态)uint32_t flags; /* SA_F_XXX */union inet_addr addr;__be16 port; };struct sa_entry_pool {//sa_entry hash表struct sa_entry sa_entries[MAX_PORT];//used sa_entry链表头struct list_head used_enties;//free sa_entry链表头struct list_head free_enties;/* another way is use total_used/free_cnt in sa_pool,* so that we need not travels the hash to get stats.* we use cnt here, since we may need per-pool stats. *///统计计数uint16_t used_cnt;uint16_t free_cnt;uint32_t miss_cnt; };/* no lock needed because inet_ifaddr.sa_pool* is per-lcore. */ struct sa_pool {//sa_pool所属的ip层地址块struct inet_ifaddr * ifa; /* back-pointer *///low,high端口uint16_t low; /* min port */uint16_t high; /* max port *///引用计数rte_atomic32_t refcnt;/* hashed pools by dest's <ip/port>. if no dest provided,* just use first pool. it's not need create/destroy pool* for each dest, that'll be too complicated. */struct sa_entry_pool *pool_hash;//hash表中桶个数uint8_t pool_hash_sz;uint32_t flags; /* SA_POOL_F_XXX *//* fdir filter ID */uint32_t filter_id[MAX_FDIR_PROTO]; };struct sa_fdir {/* the ports one lcore can use means* "(fdir.mask & port) == port_base" *///掩码uint16_t mask; /* filter's port mask */ //所在lcore id,每个lcore上分配一个lcoreid_t lcore; //在选取本地lcore可用port时时,lport 经过掩码后,得到的值如果等于 port_base 就会分配到这个核心__be16 port_base;uint16_t soft_id; /* current unsed soft-id,* increase after use. */ };
程序初始化时调用sa_pool_init初始化全局fdir表
static struct sa_fdir sa_fdirs[DPVS_MAX_LCORE];
int sa_pool_init(void)
{int shift;lcoreid_t cid;uint16_t port_base;/* enabled lcore should not change after init *///获取已启用的lcore个数和掩码netif_get_slave_lcores(&sa_nlcore, &sa_lcore_mask);/* how many mask bits needed ? *///计算log2(sas_nlcore),向上取整for (shift = 0; (0x1 << shift) < sa_nlcore; shift++){;}if (shift >= 16){return(EDPVS_INVAL); /* bad config */}port_base = 0;for (cid = 0; cid < DPVS_MAX_LCORE; cid++){//如果cid>=64或者该lcore没有被启用,跳过if (cid >= 64 || !(sa_lcore_mask & (1L << cid))){continue;}assert(rte_lcore_is_enabled(cid) && cid != rte_get_master_lcore());//sa_fdirs是per-lcore数据结构,设置mask掩码,主要是通过&操作代替取余操作sa_fdirs[cid].mask = ~((~0x0) << shift);sa_fdirs[cid].lcore = cid;//fdir计算时,lport 经过掩码后,得到的值如果等于 port_base 就会分配到这个核心sa_fdirs[cid].port_base = htons(port_base);sa_fdirs[cid].soft_id = 0;port_base++;}return(EDPVS_OK);
}
在使用ipvsadm添加lip时,ifa_add_set调用sa_pool_create初始化sa_pool
int sa_pool_create(struct inet_ifaddr *ifa, uint16_t low, uint16_t high) {int err;struct sa_pool *ap;struct sa_fdir *fdir;uint32_t filtids[MAX_FDIR_PROTO];lcoreid_t cid = rte_lcore_id();//判断当前lcore是否在sa_lcore_mask中if (cid > 64 || !((sa_lcore_mask & (1UL << cid)))){if (cid == rte_get_master_lcore()){return(EDPVS_OK); /* no sapool on master */}return(EDPVS_INVAL);}//low,high端口默认值分别是1025,65535low = low ? : DEF_MIN_PORT;high = high ? : DEF_MAX_PORT;//参数校验if (!ifa || low > high || low == 0 || high >= MAX_PORT){RTE_LOG(ERR, SAPOOL, "%s: bad arguments\\n", __func__);return(EDPVS_INVAL);}//通过lcore id获取对应的sa_fdir对象fdir = &sa_fdirs[cid];//分配sa_pool对象ap = rte_zmalloc(NULL, sizeof(struct sa_pool), 0);if (unlikely(!ap)){return(EDPVS_NOMEM);}//设置sa_pool相关参数ap->ifa = ifa;ap->low = low;ap->high = high;ap->flags = 0;rte_atomic32_set(&ap->refcnt, 1);//分配sa_pool中socket地址的哈希表,默认hash桶的数量为16err = sa_pool_alloc_hash(ap, sa_pool_hash_size, fdir);if (err != EDPVS_OK){goto free_ap;}filtids[0] = fdir->soft_id++;filtids[1] = fdir->soft_id++;//增加fdir filter,用于fdir匹配err = sa_add_filter(ifa->af, ifa->idev->dev, cid, &ifa->addr,fdir->port_base, filtids); /* thread-safe ? */if (err != EDPVS_OK){goto free_hash;}ap->filter_id[0] = filtids[0];ap->filter_id[1] = filtids[1];ifa->sa_pool = ap;/* inc ifa->refcnt to hold it */rte_atomic32_inc(&ifa->refcnt);#ifdef CONFIG_DPVS_SAPOOL_DEBUG{char addr[64];RTE_LOG(INFO, SAPOOL, "[%02d] %s: sa pool created -- %s\\n", rte_lcore_id(),__func__, inet_ntop(ifa->af, &ifa->addr, addr, sizeof(addr)) ? : NULL);} #endifreturn(EDPVS_OK);free_hash:sa_pool_free_hash(ap); free_ap:rte_free(ap);return(err); }
sa_pool_alloc_hash
static int sa_pool_alloc_hash(struct sa_pool *ap, uint8_t hash_sz,const struct sa_fdir *fdir) {int hash;struct sa_entry_pool *pool;uint32_t port; /* should be u32 or 65535==0 *///分配sa_pool->pool_hash桶分配ap->pool_hash = rte_malloc(NULL, sizeof(struct sa_entry_pool) * hash_sz,RTE_CACHE_LINE_SIZE);if (!ap->pool_hash){return(EDPVS_NOMEM);}//设置哈希表桶大小为hash_szap->pool_hash_sz = hash_sz;/* the big loop takes about 17ms */for (hash = 0; hash < hash_sz; hash++){pool = &ap->pool_hash[hash];//初始化used_entry和free_entry链表头INIT_LIST_HEAD(&pool->used_enties);INIT_LIST_HEAD(&pool->free_enties);pool->used_cnt = 0;pool->free_cnt = 0;//根据fdir->mask &&((uint16_t)port & fdir->mask) == ntohs(fdir->port_base) 为当前lcore分配地址for (port = ap->low; port <= ap->high; port++){struct sa_entry *sa;if (fdir->mask &&((uint16_t)port & fdir->mask) != ntohs(fdir->port_base)){continue;}sa = &pool->sa_entries[(uint16_t)port];sa->addr = ap->ifa->addr;sa->port = htons((uint16_t)port);list_add_tail(&sa->list, &pool->free_enties);pool->free_cnt++;}}return(EDPVS_OK); }
sa_add_filter
static inline int sa_add_filter(int af, struct netif_port *dev, lcoreid_t cid,const union inet_addr *dip, __be16 dport,uint32_t filter_id[MAX_FDIR_PROTO]) {return(__add_del_filter(af, dev, cid, dip, dport, filter_id, true)); }
__add_del_filter
- 主要对(目的ip,目的port),更确切讲是dst_port&fdir_conf.mask.dst_port_mask后的端口进行flow director设置
- fdir_conf.mask.dst_port_mask的设置详见netif_start中调用netif_port_fdir_dstport_mask_set
static int __add_del_filter(int af, struct netif_port *dev, lcoreid_t cid,const union inet_addr *dip, __be16 dport,uint32_t filter_id[MAX_FDIR_PROTO], bool add) {queueid_t queue;int err;enum rte_filter_op op, rop;//flow Director设置,action为接收数据struct rte_eth_fdir_filter filt[MAX_FDIR_PROTO] ={{.action.behavior = RTE_ETH_FDIR_ACCEPT,.action.report_status = RTE_ETH_FDIR_REPORT_ID,.soft_id = filter_id[0],},{.action.behavior = RTE_ETH_FDIR_ACCEPT,.action.report_status = RTE_ETH_FDIR_REPORT_ID,.soft_id = filter_id[1],},};if (af == AF_INET){//IPv4 flow dirctor设置,只对(目的ip,目的端口)进行过滤,,具体为做过mask.dst_port_mask掩码后的dport//当dport为0,lcore_count=4,则0,4,8,12......的目的端口会进入该lcorefilt[0].input.flow_type = RTE_ETH_FLOW_NONFRAG_IPV4_TCP;filt[0].input.flow.tcp4_flow.ip.dst_ip = dip->in.s_addr;filt[0].input.flow.tcp4_flow.dst_port = dport;filt[1].input.flow_type = RTE_ETH_FLOW_NONFRAG_IPV4_UDP;filt[1].input.flow.udp4_flow.ip.dst_ip = dip->in.s_addr;filt[1].input.flow.udp4_flow.dst_port = dport;}else if (af == AF_INET6){filt[0].input.flow_type = RTE_ETH_FLOW_NONFRAG_IPV6_TCP;memcpy(filt[0].input.flow.ipv6_flow.dst_ip, &dip->in6, sizeof(struct in6_addr));filt[0].input.flow.tcp6_flow.dst_port = dport;filt[1].input.flow_type = RTE_ETH_FLOW_NONFRAG_IPV6_UDP;memcpy(filt[1].input.flow.ipv6_flow.dst_ip, &dip->in6, sizeof(struct in6_addr));filt[1].input.flow.udp6_flow.dst_port = dport;}else{return(EDPVS_NOTSUPP);}//判断设备是否flow director功能,主要对dpdk设备和bond设备,内部调用rte_eth_dev_filter_supportedif (dev->netif_ops && dev->netif_ops->op_filter_supported){if (dev->netif_ops->op_filter_supported(dev, RTE_ETH_FILTER_FDIR) < 0){if (dev->nrxq <= 1){return(EDPVS_OK);}RTE_LOG(ERR, SAPOOL, "%s: FDIR is not supported by device %s. Only"" single rxq can be configured.\\n", __func__, dev->name);return(EDPVS_NOTSUPP);}}else{RTE_LOG(ERR, SAPOOL, "%s: FDIR support of device %s is not known.\\n",__func__, dev->name);return(EDPVS_INVAL);}//获取lcore上处理port的哪些接收队列err = netif_get_queue(dev, cid, &queue);if (err != EDPVS_OK){return(err);}//设置flow Director的接收队列为queue,绑定网卡硬件队列filt[0].action.rx_queue = filt[1].action.rx_queue = queue;op = add ? RTE_ETH_FILTER_ADD : RTE_ETH_FILTER_DELETE;//将过滤条件更新到网卡netif_mask_fdir_filter(af, dev, &filt[0]);netif_mask_fdir_filter(af, dev, &filt[1]);//内部调用rte_eth_dev_filter_ctrl添加flow filter规则err = netif_fdir_filter_set(dev, op, &filt[0]);if (err != EDPVS_OK){return(err);}err = netif_fdir_filter_set(dev, op, &filt[1]);if (err != EDPVS_OK){rop = add ? RTE_ETH_FILTER_DELETE : RTE_ETH_FILTER_ADD;netif_fdir_filter_set(dev, rop, &filt[0]);return(err);}#ifdef CONFIG_DPVS_SAPOOL_DEBUG{char ipaddr[64];RTE_LOG(DEBUG, SAPOOL, "FDIR: %s %s %s TCP/UDP ""ip %s port %d (0x%04x) mask 0x%04X queue %d lcore %2d filterID %d/%d\\n",add ? "add" : "del", dev->name,af == AF_INET ? "IPv4" : "IPv6",inet_ntop(af, dip, ipaddr, sizeof(ipaddr)) ? : "::",ntohs(dport), ntohs(dport), sa_fdirs[cid].mask, queue, cid,filter_id[0], filter_id[1]);} #endifreturn(err); }
netif_mask_fdir_filter
- 对目前设置做与操作,屏蔽之前设置的flow director信息
void netif_mask_fdir_filter(int af, const struct netif_port *port,struct rte_eth_fdir_filter *filt) {struct rte_eth_fdir_info fdir_info;const struct rte_eth_fdir_masks *fmask;//首先获取fdir_filter中的input flow定义union rte_eth_fdir_flow * flow = &filt->input.flow;/**union rte_eth_fdir_flow {struct rte_eth_l2_flow l2_flow;struct rte_eth_udpv4_flow udp4_flow;struct rte_eth_tcpv4_flow tcp4_flow;struct rte_eth_sctpv4_flow sctp4_flow;struct rte_eth_ipv4_flow ip4_flow;struct rte_eth_udpv6_flow udp6_flow;struct rte_eth_tcpv6_flow tcp6_flow;struct rte_eth_sctpv6_flow sctp6_flow;struct rte_eth_ipv6_flow ipv6_flow;struct rte_eth_mac_vlan_flow mac_vlan_flow;struct rte_eth_tunnel_flow tunnel_flow;};*//* There exists a defect here. If the netif_port 'port' is not PORT_TYPE_GENERAL,* mask fdir_filter of the port would fail. The correct way to accomplish the* function is to register this method for all device types. Considering the flow* is not changed after masking, we just skip netif_ports other than physical ones. */if (port->type != PORT_TYPE_GENERAL){return;}//retrieve fdir informationif (rte_eth_dev_filter_ctrl(port->id, RTE_ETH_FILTER_FDIR,RTE_ETH_FILTER_INFO, &fdir_info) < 0){RTE_LOG(DEBUG, NETIF, "%s: Fail to fetch fdir info of %s !\\n",__func__, port->name);return;}fmask = &fdir_info.mask;/* ipv4 flow */if (af == AF_INET){flow->ip4_flow.src_ip &= fmask->ipv4_mask.src_ip;flow->ip4_flow.dst_ip &= fmask->ipv4_mask.dst_ip;flow->ip4_flow.tos &= fmask->ipv4_mask.tos;flow->ip4_flow.ttl &= fmask->ipv4_mask.ttl;flow->ip4_flow.proto &= fmask->ipv4_mask.proto;flow->tcp4_flow.src_port &= fmask->src_port_mask;flow->tcp4_flow.dst_port &= fmask->dst_port_mask;return;}/* ipv6 flow */if (af == AF_INET6){flow->ipv6_flow.src_ip[0] &= fmask->ipv6_mask.src_ip[0];flow->ipv6_flow.src_ip[1] &= fmask->ipv6_mask.src_ip[1];flow->ipv6_flow.src_ip[2] &= fmask->ipv6_mask.src_ip[2];flow->ipv6_flow.src_ip[3] &= fmask->ipv6_mask.src_ip[3];flow->ipv6_flow.dst_ip[0] &= fmask->ipv6_mask.dst_ip[0];flow->ipv6_flow.dst_ip[1] &= fmask->ipv6_mask.dst_ip[1];flow->ipv6_flow.dst_ip[2] &= fmask->ipv6_mask.dst_ip[2];flow->ipv6_flow.dst_ip[3] &= fmask->ipv6_mask.dst_ip[3];flow->ipv6_flow.tc &= fmask->ipv6_mask.tc;flow->ipv6_flow.proto &= fmask->ipv6_mask.proto;flow->ipv6_flow.hop_limits &= fmask->ipv6_mask.hop_limits;flow->tcp6_flow.src_port &= fmask->src_port_mask;flow->tcp6_flow.dst_port &= fmask->dst_port_mask;return;} }
netif_port_fdir_dstport_mask_set
- 在netif_port_start启动网卡时调用
/** fdir mask must be set according to configured slave lcore number* */ inline static int netif_port_fdir_dstport_mask_set(struct netif_port *port) {uint8_t slave_nb;int shift;netif_get_slave_lcores(&slave_nb, NULL);for (shift = 0; (0x1 << shift) < slave_nb; shift++){;}if (shift >= 16){RTE_LOG(ERR, NETIF, "%s: %s's fdir dst_port_mask init failed\\n",__func__, port->name);return(EDPVS_NOTSUPP);} #if RTE_VERSION >= 0x10040010port->dev_conf.fdir_conf.mask.dst_port_mask = htons(~((~0x0) << shift)); #elseport->dev_conf.fdir_conf.mask.dst_port_mask = ~((~0x0) << shift); #endifRTE_LOG(INFO, NETIF, "%s:dst_port_mask=%0x\\n", port->name,port->dev_conf.fdir_conf.mask.dst_port_mask);return(EDPVS_OK); }
conn流表中使用fdir
上文fdir设置主要是用于fnat中outbound方向数据流量回到dpvs时,与inbound流量进入dpvs所在lcore保持一致,避免流表查找的锁和缓存失效等。
主要看下dpvs中如何从sa_pool中分配本地地址和本地端口,对于所有新建连接,都会调用dp_vs_conn_new注册流表
dp_vs_conn_new
- 只有 full-nat 才支持 local address/port
struct dp_vs_conn *dp_vs_conn_new(struct rte_mbuf *mbuf,const struct dp_vs_iphdr *iph,struct dp_vs_conn_param *param,struct dp_vs_dest *dest,uint32_t flags) {//full-nat 特殊处理if (dest->fwdmode == DPVS_FWD_MODE_FNAT){//绑定LB本地socket,fullnat做了双向natif ((err = dp_vs_laddr_bind(new, dest->svc)) != EDPVS_OK){goto unbind_dest;}} }
dp_vs_laddr_bind
int dp_vs_laddr_bind(struct dp_vs_conn *conn, struct dp_vs_service *svc) {struct dp_vs_laddr *laddr = NULL;int i;uint16_t sport = 0;struct sockaddr_storage dsin, ssin;bool found = false;//一些常规校验if (!conn || !conn->dest || !svc){return(EDPVS_INVAL);}//如果传输层协议不是TCP或者UDP也直接返回错误,因为在设置fdir时只关注了tcp和udp协议if (svc->proto != IPPROTO_TCP && svc->proto != IPPROTO_UDP){return(EDPVS_NOTSUPP);}if (dp_vs_conn_is_template(conn)){return(EDPVS_OK);}/** some time allocate lport fails for one laddr,* but there's also some resource on another laddr.*/for (i = 0; i < dp_vs_laddr_max_trails && i < svc->num_laddrs; i++){/* select a local IP from service *///首先选择一个laddrladdr = __get_laddr(svc);if (!laddr){RTE_LOG(ERR, IPVS, "%s: no laddr available.\\n", __func__);return(EDPVS_RESOURCE);}//首先将dsin与ssin清零memset(&dsin, 0, sizeof(struct sockaddr_storage));memset(&ssin, 0, sizeof(struct sockaddr_storage));if (laddr->af == AF_INET){//fnat要将目的信息修改为conn选择的RS的ip:port,同时将源ip和port修改为local ip相关,因为要保证outbound方向数据//同样返回值dpvsstruct sockaddr_in *daddr, *saddr;daddr = (struct sockaddr_in *)&dsin;daddr->sin_family = laddr->af;daddr->sin_addr = conn->daddr.in;daddr->sin_port = conn->dport;saddr = (struct sockaddr_in *)&ssin;saddr->sin_family = laddr->af;saddr->sin_addr = laddr->addr.in;}else{struct sockaddr_in6 *daddr, *saddr;daddr = (struct sockaddr_in6 *)&dsin;daddr->sin6_family = laddr->af;daddr->sin6_addr = conn->daddr.in6;daddr->sin6_port = conn->dport;saddr = (struct sockaddr_in6 *)&ssin;saddr->sin6_family = laddr->af;saddr->sin6_addr = laddr->addr.in6;}//sa_fetch获取端口,来填充完整的dsin,ssin地址,选取port主要用于通过fdir后outbound方向数据返回当前lcore处理if (sa_fetch(laddr->af, laddr->iface, &dsin, &ssin) != EDPVS_OK){char buf[64];if (inet_ntop(laddr->af, &laddr->addr, buf, sizeof(buf)) == NULL){snprintf(buf, sizeof(buf), "::");}#ifdef CONFIG_DPVS_IPVS_DEBUGRTE_LOG(DEBUG, IPVS, "%s: [%02d] no lport available on %s, ""try next laddr.\\n", __func__, rte_lcore_id(), buf); #endifput_laddr(laddr);continue;}//sport为选取出来的portsport = (laddr->af == AF_INET ? (((struct sockaddr_in *)&ssin)->sin_port): (((struct sockaddr_in6 *)&ssin)->sin6_port));found = true;break;}//如果选取laddr和lport失败,返回错误if (!found){ #ifdef CONFIG_DPVS_IPVS_DEBUGRTE_LOG(ERR, IPVS, "%s: [%02d] no lip/lport available !!\\n",__func__, rte_lcore_id()); #endifreturn(EDPVS_RESOURCE);}rte_atomic32_inc(&laddr->conn_counts);/* overwrite related fields in out-tuplehash and conn *///设置conn的local address和local portconn->laddr = laddr->addr;conn->lport = sport;//设置outbound方向tuplehash_entry的连接地址信息tuplehash_out(conn).daddr = laddr->addr;tuplehash_out(conn).dport = sport;//关联选取的laddrconn->local = laddr;return(EDPVS_OK); }
__get_laddr
static inline struct dp_vs_laddr *__get_laddr(struct dp_vs_service *svc) {int step;struct dp_vs_laddr *laddr = NULL;/* if list not inited ? list_empty() returns true ! */assert(svc->laddr_list.next);//如果没有laddr,直接返回if (list_empty(&svc->laddr_list)){return(NULL);}//随记产生一个步数step = __laddr_step(svc);while (step-- > 0){if (unlikely(!svc->laddr_curr)){svc->laddr_curr = svc->laddr_list.next;}else{svc->laddr_curr = svc->laddr_curr->next;}//循环链表,过滤链表头if (svc->laddr_curr == &svc->laddr_list){svc->laddr_curr = svc->laddr_list.next;}}//获取struct dp_vs_laddrladdr = list_entry(svc->laddr_curr, struct dp_vs_laddr, list);rte_atomic32_inc(&laddr->refcnt);return(laddr); }
__laddr_step
static inline int __laddr_step(struct dp_vs_service *svc) {/* Why can't we always use the next laddr(rr scheduler) to setup new session?* Because realserver rr/wrr scheduler may get synchronous with the laddr rr* scheduler. If so, the local IP may stay invariant for a specified realserver,* which is a hurt for realserver concurrency performance. To avoid the problem,* we just choose 5% sessions to use the one after the next laddr randomly.* */if (strncmp(svc->scheduler->name, "rr", 2) == 0 ||strncmp(svc->scheduler->name, "wrr", 3) == 0){return((random() % 100) < 5 ? 2 : 1);}return(1); }
sa_fetch
- 包裹函数
int sa_fetch(int af, struct netif_port *dev,const struct sockaddr_storage *daddr,struct sockaddr_storage *saddr) {if (unlikely(daddr && daddr->ss_family != af)){return(EDPVS_INVAL);}if (unlikely(saddr && saddr->ss_family != af)){return(EDPVS_INVAL);}if (AF_INET == af){return(sa4_fetch(dev, (const struct sockaddr_in *)daddr,(struct sockaddr_in *)saddr));}else if (AF_INET6 == af){return(sa6_fetch(dev, (const struct sockaddr_in6 *)daddr,(struct sockaddr_in6 *)saddr));}else{return(EDPVS_NOTSUPP);} }
sa4_fetch
- 关注下ipv4 lport的选取
- 查找未使用的<saddr,sport>
/** fetch unused <saddr, sport> pair by given hint.* given @ap equivalent to @dev+@saddr, and dport is useless.* with routing's help, the mapping looks like,** +------+------------+-------+-------------------* | | ap | | Is possible to* |daddr | dev & saddr| sport | fetch addr pair?* +------+------------+-------+-------------------* Y Y ? Y Possible* Y Y Y ? Possible* Y Y ? ? Possible* Y N ? Y Possible* Y N Y ? Possible* Y N ? ? Possible* N Y ? Y Possible* N Y Y ? Possible* N Y ? ? Possible* N N ? Y Not Possible* N N Y ? Possible* N N ? ? Not Possible** daddr is a hint to found dev/saddr (by route/netif module).* dev is also a hint, the saddr(ifa) is the key.* af is needed when both saddr and daddr are NULL.*/ static int sa4_fetch(struct netif_port *dev,const struct sockaddr_in *daddr,struct sockaddr_in *saddr) {struct inet_ifaddr *ifa;struct flow4 fl;struct route_entry *rt;int err;assert(saddr);//首先对参数进行校验if (saddr && saddr->sin_addr.s_addr != INADDR_ANY && saddr->sin_port != 0){return(EDPVS_OK); /* everything is known, why call this function ? */}/* if source IP is assiged, we can find ifa->sa_pool* without @daddr and @dev. *///如果指定了sin_addrif (saddr->sin_addr.s_addr){//在net_device上获取对应local ip对应的IP地址信息块inet_ifaddr,主要用于获取sa_poolifa = inet_addr_ifa_get(AF_INET, dev, (union inet_addr *)&saddr->sin_addr);if (!ifa){return(EDPVS_NOTEXIST);}//如果没有配置sa_pool,则返回出错if (!ifa->sa_pool){RTE_LOG(WARNING, SAPOOL, "%s: fetch addr on IP without sapool.", __func__);inet_addr_ifa_put(ifa);return(EDPVS_INVAL);}//获取lporterr = sa_pool_fetch(sa_pool_hash(ifa->sa_pool, (struct sockaddr_storage *)daddr),(struct sockaddr_storage *)saddr);if (err == EDPVS_OK){rte_atomic32_inc(&ifa->sa_pool->refcnt);}inet_addr_ifa_put(ifa);return(err);}/* try to find source ifa by @dev and @daddr *///如果未指定saddr,则首先根据目的地址查找出口路由memset(&fl, 0, sizeof(struct flow4));fl.fl4_oif = dev;fl.fl4_daddr.s_addr = daddr ? daddr->sin_addr.s_addr : htonl(INADDR_ANY);fl.fl4_saddr.s_addr = saddr ? saddr->sin_addr.s_addr : htonl(INADDR_ANY);rt = route4_output(&fl);if (!rt){return(EDPVS_NOROUTE);}/* select source address. *///选择一个源ip地址if (!rt->src.s_addr){inet_addr_select(AF_INET, rt->port, (union inet_addr *)&rt->dest,RT_SCOPE_UNIVERSE, (union inet_addr *)&rt->src);}//根据选取的源ip获取对应的ip信息控制块ifa = inet_addr_ifa_get(AF_INET, rt->port, (union inet_addr *)&rt->src);if (!ifa){route4_put(rt);return(EDPVS_NOTEXIST);}route4_put(rt);if (!ifa->sa_pool){RTE_LOG(WARNING, SAPOOL, "%s: fetch addr on IP without pool.",__func__);inet_addr_ifa_put(ifa);return(EDPVS_INVAL);}/* do fetch socket address */err = sa_pool_fetch(sa_pool_hash(ifa->sa_pool,(struct sockaddr_storage *)daddr),(struct sockaddr_storage *)saddr);if (err == EDPVS_OK){rte_atomic32_inc(&ifa->sa_pool->refcnt);}inet_addr_ifa_put(ifa);return(err); }
sa_pool_hash
- 主要用于在sa_pool的不同sa_entry_pool哈希桶中查找
/* hash dest's <ip/port>. if no dest provided, just use first pool. */ static inline struct sa_entry_pool * sa_pool_hash(const struct sa_pool *ap, const struct sockaddr_storage *ss) {uint32_t hashkey;assert(ap && ap->pool_hash && ap->pool_hash_sz >= 1);if (!ss){return(&ap->pool_hash[0]);}if (ss->ss_family == AF_INET){uint16_t vect[2];const struct sockaddr_in *sin = (const struct sockaddr_in *)ss;vect[0] = ntohl(sin->sin_addr.s_addr) & 0xffff;vect[1] = ntohs(sin->sin_port);hashkey = (vect[0] + vect[1]) % ap->pool_hash_sz;return(&ap->pool_hash[hashkey]);}else if (ss->ss_family == AF_INET6){uint32_t vect[5] = { 0 };const struct sockaddr_in6 *sin6 = (const struct sockaddr_in6 *)ss;vect[0] = sin6->sin6_port;memcpy(&vect[1], &sin6->sin6_addr, 16);hashkey = rte_jhash_32b(vect, 5, sin6->sin6_family) % ap->pool_hash_sz;return(&ap->pool_hash[hashkey]);}else{return(NULL);} }
sa_pool_fetch
- 从free_enties中获取第一个free sa_entry节点,设置local ip和port
- 移动到used列表
- 更新统计计数
static inline int sa_pool_fetch(struct sa_entry_pool *pool,struct sockaddr_storage *ss) {assert(pool && ss);struct sa_entry * ent;struct sockaddr_in * sin = (struct sockaddr_in *)ss;struct sockaddr_in6 *sin6 = (struct sockaddr_in6 *)ss;//从free_entries链表中获取第一个可用sa_entryent = list_first_entry_or_null(&pool->free_enties, struct sa_entry, list);if (!ent){ #ifdef CONFIG_DPVS_SAPOOL_DEBUGRTE_LOG(DEBUG, SAPOOL, "%s: no entry (used/free %d/%d)\\n", __func__,pool->used_cnt, pool->free_cnt); #endifpool->miss_cnt++;return(EDPVS_RESOURCE);}//设置local ip和portif (ss->ss_family == AF_INET){sin->sin_family = AF_INET;sin->sin_addr.s_addr = ent->addr.in.s_addr;sin->sin_port = ent->port;}else if (ss->ss_family == AF_INET6){sin6->sin6_family = AF_INET6;sin6->sin6_addr = ent->addr.in6;sin6->sin6_port = ent->port;}else{return(EDPVS_NOTSUPP);}//标记entry正在使用中ent->flags |= SA_F_USED;//将其移动到used列表中list_move_tail(&ent->list, &pool->used_enties);//同时更新使用统计信息pool->used_cnt++;pool->free_cnt--;#ifdef CONFIG_DPVS_SAPOOL_DEBUG{char addr[64];RTE_LOG(DEBUG, SAPOOL, "%s: %s:%d fetched!\\n", __func__,inet_ntop(ss->ss_family, &ent->addr, addr, sizeof(addr)) ? : NULL,ntohs(ent->port));} #endifreturn(EDPVS_OK); }
dpvs中fdir与sa_pool介绍相关推荐
- DPDK在DPVS中的应用及原理分析
上一篇文章中我们已经介绍了DPVS的特点和部署方式,本文主要是用于介绍DPVS是如何实现前面所说的特点,或者说是如何提高性能的. 下图是爱奇艺的DPVS开发团队给出的DPVS在提高性能方面的操作,我们 ...
- Blender中的Python脚本介绍学习教程
Blender中的Python脚本介绍学习教程 MP4 |视频:h264,1280×720 |音频:AAC,48000 Hz 语言:英语+中英文字幕(根据原英文字幕机译更准确)|大小解压后:1.63 ...
- django中使用celery简单介绍
链客,专为开发者而生,有问必答! 此文章来自区块链技术社区,未经允许拒绝转载. 本章节我们重点在于实现,如何存储任务的结果. 我们将任务函数改为: from celery_demo.celery im ...
- pythonexcel介绍_Python 中pandas.read_excel详细介绍
Python 中pandas.read_excel详细介绍 #coding:utf-8 import pandas as pd import numpy as np filefullpath = r& ...
- php中使用mysql的视图_MYSQL中视图的用法介绍(代码示例)
本篇文章给大家带来的内容是关于MYSQL中视图的用法介绍(代码示例),有一定的参考价值,有需要的朋友可以参考一下,希望对你有所帮助. 1.什么是视图 执行一条SQL,将结果集保存在一张虚拟表中 (相关 ...
- Java基础-JAVA中常见的数据结构介绍
Java基础-JAVA中常见的数据结构介绍 作者:尹正杰 版权声明:原创作品,谢绝转载!否则将追究法律责任. 一.什么是数据结构 答:数据结构是指数据存储的组织方式.大致上分为线性表.栈(Stack) ...
- html的meta总结,html标签中meta属性使用介绍和 动态替换字符串
http://www.haorooms.com/post/html_meta_ds http://www.haorooms.com/archives里面的东西比较多,需要细看一下 http://www ...
- vuex结合php,vuex中store的使用介绍(附实例)
本篇文章给大家带来的内容是关于vuex中store的使用介绍(附实例),有一定的参考价值,有需要的朋友可以参考一下,希望对你有所帮助. 一.状态管理(vuex)简介 vuex是专为vue.js应用程序 ...
- python在日常工作处理中的应用-python在工作中的应用场景介绍
python在工作中的应用场景介绍 发布时间:2020-04-21 14:44:30 来源:亿速云 阅读:277 作者:小新 今天小编给大家分享的是python在工作中的应用场景介绍,相信很多人都不太 ...
最新文章
- (NO.00003)iOS游戏简单的机器人投射游戏成形记(二)
- 配置kali linux
- Android性能优化之一:ViewStub
- NPM酷库:dateformat 时间字符串格式化
- 高安全性同态加密算法_坏的同态性教程
- SVD++:推荐系统的基于矩阵分解的协同过滤算法的提高
- [设计模式]抽象工厂模式
- [Leedcode][JAVA][第102题][二叉树的层序遍历][递归][迭代][BFS]
- 【微信小程序】带你做一个公众号留言系统(附源码)
- 重温《数据库系统概论》【第一篇 基础篇】【第4章 数据库安全性】
- java面试要点---Hibernate面试系统知识点复习,hibernate原理,缓冲---随时更新
- Android NFC详解
- (第3章)Docker核心原理解读
- Java、JSP网上订餐系统
- 什么是搜索引擎优化(SEO)
- html涟漪动画效果,CSS 在按钮上做个涟漪效果(Ripple Animation)
- windows11安装wsa安卓子系统
- 网站建设项目合同撰写
- MySQL笔记(基础)
- 给video视频自定义添加中间播放按钮
热门文章
- 硅谷码农35岁危机:Java之父也找不到工作!程序员整容成风!
- Photoshop:制作一张光盘
- WP采集汇集WP采集插件-WP关键词采集文章
- c语言中fun和main,功能:编写函数fun求1!+2!+3!+ …… +n!的和,在main函 数中由键盘输入n值,并输出运算结果。请编写fun 函数...
- python 实现九型人格测试小程序
- Amesim(六):如何使用回放功能
- 给layim和websocket 菜鸟一个温馨的提示
- 第四章 存货 详细笔记
- 微软应用商店点了安装没反应,没法下载和更新
- 硅谷钢铁侠:埃隆 · 马斯克告诉你这个残酷的世界规则