对于mellanox网卡,使用dpdk driver时,在多线程场景下,如果这几个线程分别获取link状态,或者统计计数,或者设置mtu等,就会出现线程被堵塞的问题,下面使用dpdk的example l2fwd复现此问题,并分析原因。

复现

# cd dpdk-stable-18.11.2
# export RTE_TARGET=build
# export RTE_SDK=`pwd`
# make config T=x86_64-native-linuxapp-gcc
# make -j32
# cd examples/l2fwd

稍微修改下l2fwd的main.c文件,如下,第一个线程获取link状态,第二个线程设置mtu

/*
 * Modified l2fwd per-lcore loop used to reproduce the hang.
 *
 * Until force_quit is set, lcore 0 repeatedly queries the link state and
 * lcore 1 repeatedly sets the MTU to 1500, each on the first entry of its own
 * rx_port_list; any other lcore only sleeps. Every iteration ends with a
 * 300 us sleep.
 *
 * NOTE(review): prev_tsc, timer_tsc, qconf, lcore_queue_conf and force_quit
 * are not declared in this excerpt; they are presumably the declarations from
 * the stock l2fwd main.c - confirm against the full file.
 */
static void
l2fwd_main_loop(void)
{
	unsigned lcore_id;
	struct rte_eth_link eth_link;

	prev_tsc = 0;
	timer_tsc = 0;
	lcore_id = rte_lcore_id();
	qconf = &lcore_queue_conf[lcore_id];

	for (;;) {
		if (force_quit)
			break;

		switch (lcore_id) {
		case 0:
			/* First thread: poll the link status. */
			rte_eth_link_get(qconf->rx_port_list[0], &eth_link);
			RTE_LOG(INFO, L2FWD, "link is %d on core %d\n", eth_link.link_status, lcore_id);
			break;
		case 1:
			/* Second thread: set the MTU over and over. */
			rte_eth_dev_set_mtu(qconf->rx_port_list[0], 1500);
			RTE_LOG(INFO, L2FWD, "set mtu on core %d\n", lcore_id);
			break;
		default:
			/* Remaining lcores do no port work. */
			break;
		}

		usleep(300);
	}
}

编译并运行l2fwd,通过 -c 指定两个cpu,可看到线程已经hang住了

#make
# ./build/l2fwd -c3 -n4 -w 82:00.1 -- -p1
EAL: Detected 40 lcore(s)
EAL: Detected 2 NUMA nodes
EAL: Multi-process socket /var/run/dpdk/rte/mp_socket
EAL: Probing VFIO support...
EAL: VFIO support initialized
EAL: PCI device 0000:82:00.1 on NUMA socket 1
EAL:   probe driver: 15b3:1015 net_mlx5
MAC updating enabled
Notice: odd number of ports in portmask.
Lcore 0: RX port 0
Initializing port 0... done:
Port 0, MAC address: 50:6B:4B:C0:9B:C5

Checking link statusdone
Port0 Link Up. Speed 25000 Mbps - full-duplex
^C

分析原因

使用gdb来查看线程状态和调用栈

# ps -ef | grep l2fwd
root       8344   7232  0 05:45 pts/3    00:00:00 ./build/l2fwd -c3 -n4 -w 82:00.1 -- -p1
root       8353   7790  0 05:47 pts/0    00:00:00 grep --color=auto l2fwd
# gdb -p 8344
...
//一共四个线程,线程1和线程4为获取link状态和设置mtu的线程,都已经堵塞在了recvmsg调用上。
(gdb) info thread
  Id   Target Id         Frame
  1    Thread 0x7f68e4981c00 (LWP 8344) "l2fwd" 0x00007f68e35ce94d in recvmsg () at ../sysdeps/unix/syscall-template.S:84
  2    Thread 0x7f68e2d71700 (LWP 8345) "eal-intr-thread" 0x00007f68e32fba13 in epoll_wait () at ../sysdeps/unix/syscall-template.S:84
  3    Thread 0x7f68e2570700 (LWP 8346) "rte_mp_handle" 0x00007f68e35ce94d in recvmsg () at ../sysdeps/unix/syscall-template.S:84
* 4    Thread 0x7f68e1d6f700 (LWP 8347) "lcore-slave-1" 0x00007f68e35ce94d in recvmsg () at ../sysdeps/unix/syscall-template.S:84
//线程1堵塞在获取link状态上
(gdb) thread 1
[Switching to thread 1 (Thread 0x7f68e4981c00 (LWP 8344))]
#0  0x00007f68e35ce94d in recvmsg () at ../sysdeps/unix/syscall-template.S:84
84      ../sysdeps/unix/syscall-template.S: No such file or directory.
(gdb) bt
#0  0x00007f68e35ce94d in recvmsg () at ../sysdeps/unix/syscall-template.S:84
#1  0x00000000007f2ad6 in mlx5_nl_recv (nlsk_fd=18, sn=2089018456, cb=0x7f2c70 <mlx5_nl_ifindex_cb>, arg=0x7fff4edcd440)at /root/dpdk-stable-18.11.2/drivers/net/mlx5/mlx5_nl.c:266
#2  0x00000000007f41de in mlx5_nl_ifindex (nl=18, name=name@entry=0x43003e75f8 "mlx5_1")at /root/dpdk-stable-18.11.2/drivers/net/mlx5/mlx5_nl.c:782
#3  0x00000000007d6015 in mlx5_get_ifname (dev=0xf7cf40 <rte_eth_devices>, ifname=0x7fff4edcd780)at /root/dpdk-stable-18.11.2/drivers/net/mlx5/mlx5_ethdev.c:225
#4  0x00000000007d6869 in mlx5_ifreq (ifr=0x7fff4edcd780, req=35091, dev=0xf7cf40 <rte_eth_devices>)at /root/dpdk-stable-18.11.2/drivers/net/mlx5/mlx5_ethdev.c:285
#5  mlx5_link_update_unlocked_gs (dev=dev@entry=0xf7cf40 <rte_eth_devices>, link=link@entry=0x7fff4edcd830)at /root/dpdk-stable-18.11.2/drivers/net/mlx5/mlx5_ethdev.c:695
#6  0x00000000007d8833 in mlx5_link_update (dev=0xf7cf40 <rte_eth_devices>, wait_to_complete=1)at /root/dpdk-stable-18.11.2/drivers/net/mlx5/mlx5_ethdev.c:804
#7  0x000000000051b1cf in rte_eth_link_get (port_id=<optimized out>, eth_link=0x7fff4edcd8a0)at /root/dpdk-stable-18.11.2/lib/librte_ethdev/rte_ethdev.c:1913
#8  0x000000000047be2e in l2fwd_main_loop () at /root/dpdk-stable-18.11.2/examples/l2fwd/main.c:210
#9  0x000000000047c1dc in l2fwd_launch_one_lcore (dummy=0x0) at /root/dpdk-stable-18.11.2/examples/l2fwd/main.c:296
#10 0x0000000000562b7b in rte_eal_mp_remote_launch (f=0x47c1cb <l2fwd_launch_one_lcore>, arg=0x0, call_master=CALL_MASTER)at /root/dpdk-stable-18.11.2/lib/librte_eal/common/eal_common_launch.c:62
#11 0x000000000047d234 in main (argc=2, argv=0x7fff4edce890) at /root/dpdk-stable-18.11.2/examples/l2fwd/main.c:739
(gdb) info local
No locals.
(gdb) f 1
#1  0x00000000007f2ad6 in mlx5_nl_recv (nlsk_fd=18, sn=2089018456, cb=0x7f2c70 <mlx5_nl_ifindex_cb>, arg=0x7fff4edcd440)at /root/dpdk-stable-18.11.2/drivers/net/mlx5/mlx5_nl.c:266
266                             recv_bytes = recvmsg(nlsk_fd, &msg, 0);
(gdb) info local
nh = <optimized out>
recv_bytes = <optimized out>
sa = {nl_family = 16, nl_pad = 0, nl_pid = 0, nl_groups = 0}
buf = "|\000\000\000\001\024\002\000X\344\203|\230 \000\000\b\000\001\000\003\000\000\000\v\000\002\000mlx5_3\000\000\b\000\003\000\001\000\000\000\f\000\004\000\066\034r\255\027\000\000 \017\000\005\000\061\064.29.2002\000\000\f\000\006\000\000\000\000\000\000\000\000\000\f\000\a\000ě\300\000\003KkP\005\000\016\000\001\000\000\000\005\000T\000\001\000\000\000\t\000C\000roce", '\000' <repeats 16 times>, "\060-\177", '\000' <repeats 13 times>, "\001\000\000\000\v\000\000\000T\000\000\000\001\024\000\000\000U\334N\377\177\000\000\022\000\000\000\000\000\000\000\016P\233Q", '\000' <repeats 12 times>, "T"...
iov = {iov_base = 0x7fff4edc53e0, iov_len = 32768}
msg = {msg_name = 0x7fff4edc5380, msg_namelen = 12, msg_iov = 0x7fff4edc5390, msg_iovlen = 1, msg_control = 0x0, msg_controllen = 0,msg_flags = 0}
multipart = <optimized out>
ret = <optimized out>
//接收数据的seq为2089018456,send时的seq也为2089018456,说
//明接收到了期望数据。nlmsg_flags 为2(NLM_F_MULTI),
//nlmsg_type 不为0x3(NLMSG_DONE)说明还有后续的数据,所以
//继续调用recvmsg接收
(gdb) p *(struct nlmsghdr *)buf
$7 = {nlmsg_len = 124, nlmsg_type = 5121, nlmsg_flags = 2, nlmsg_seq = 2089018456, nlmsg_pid = 8344}
(gdb) f 2
#2  0x00000000007f41de in mlx5_nl_ifindex (nl=18, name=name@entry=0x43003e75f8 "mlx5_1")at /root/dpdk-stable-18.11.2/drivers/net/mlx5/mlx5_nl.c:782
782             ret = mlx5_nl_recv(nl, seq, mlx5_nl_ifindex_cb, &data);
//ibindex = 1 是第一次接收的数据解析出来的内容
(gdb) info local
seq = 2089018456
data = {name = 0x43003e75f8 "mlx5_1", ibindex = 1, ifindex = 0}
req = {nh = {nlmsg_len = 16, nlmsg_type = 5121, nlmsg_flags = 773, nlmsg_seq = 2089018456, nlmsg_pid = 0},buf = "\020\000\000\000\001\024\005\003X\344\203|", '\000' <repeats 19 times>}
na = <optimized out>
ret = <optimized out>
//线程4堵塞在set mtu上
(gdb) thread 4
[Switching to thread 4 (Thread 0x7f68e1d6f700 (LWP 8347))]
#0  0x00007f68e35ce94d in recvmsg () at ../sysdeps/unix/syscall-template.S:84
84      ../sysdeps/unix/syscall-template.S: No such file or directory.
(gdb) bt
#0  0x00007f68e35ce94d in recvmsg () at ../sysdeps/unix/syscall-template.S:84
#1  0x00000000007f2ad6 in mlx5_nl_recv (nlsk_fd=18, sn=628175011, cb=0x7f2c70 <mlx5_nl_ifindex_cb>, arg=0x7f68e1d6d020)at /root/dpdk-stable-18.11.2/drivers/net/mlx5/mlx5_nl.c:266
#2  0x00000000007f41de in mlx5_nl_ifindex (nl=18, name=name@entry=0x43003e75f8 "mlx5_1")at /root/dpdk-stable-18.11.2/drivers/net/mlx5/mlx5_nl.c:782
#3  0x00000000007d6015 in mlx5_get_ifname (dev=0xf7cf40 <rte_eth_devices>, ifname=0x7f68e1d6d360)at /root/dpdk-stable-18.11.2/drivers/net/mlx5/mlx5_ethdev.c:225
#4  0x00000000007d6869 in mlx5_ifreq (ifr=0x7f68e1d6d360, req=35091, dev=0xf7cf40 <rte_eth_devices>)at /root/dpdk-stable-18.11.2/drivers/net/mlx5/mlx5_ethdev.c:285
#5  mlx5_link_update_unlocked_gs (dev=dev@entry=0xf7cf40 <rte_eth_devices>, link=link@entry=0x7f68e1d6d410)at /root/dpdk-stable-18.11.2/drivers/net/mlx5/mlx5_ethdev.c:695
#6  0x00000000007d8833 in mlx5_link_update (dev=0xf7cf40 <rte_eth_devices>, wait_to_complete=1)at /root/dpdk-stable-18.11.2/drivers/net/mlx5/mlx5_ethdev.c:804
#7  0x000000000051b1cf in rte_eth_link_get (port_id=<optimized out>, eth_link=0x7f68e1d6d480)at /root/dpdk-stable-18.11.2/lib/librte_ethdev/rte_ethdev.c:1913
#8  0x000000000047be2e in l2fwd_main_loop () at /root/dpdk-stable-18.11.2/examples/l2fwd/main.c:210
#9  0x000000000047c1dc in l2fwd_launch_one_lcore (dummy=0x0) at /root/dpdk-stable-18.11.2/examples/l2fwd/main.c:296
#10 0x0000000000557ae1 in eal_thread_loop (arg=<optimized out>)at /root/dpdk-stable-18.11.2/lib/librte_eal/linuxapp/eal/eal_thread.c:153
#11 0x00007f68e35c56ba in start_thread (arg=0x7f68e1d6f700) at pthread_create.c:333
#12 0x00007f68e32fb41d in clone () at ../sysdeps/unix/sysv/linux/x86_64/clone.S:109
(gdb) f 1
#1  0x00000000007f2ad6 in mlx5_nl_recv (nlsk_fd=18, sn=628175011, cb=0x7f2c70 <mlx5_nl_ifindex_cb>, arg=0x7f68e1d6d020)at /root/dpdk-stable-18.11.2/drivers/net/mlx5/mlx5_nl.c:266
266                             recv_bytes = recvmsg(nlsk_fd, &msg, 0);
(gdb) info local
nh = <optimized out>
recv_bytes = <optimized out>
sa = {nl_family = 16, nl_pad = 0, nl_pid = 0, nl_groups = 0}
buf = "\024\000\000\000\003\000\002\000X\344\203|\230 ", '\000' <repeats 28106 times>...
iov = {iov_base = 0x7f68e1d64fc0, iov_len = 32768}
msg = {msg_name = 0x7f68e1d64f60, msg_namelen = 12, msg_iov = 0x7f68e1d64f70, msg_iovlen = 1, msg_control = 0x0, msg_controllen = 0,msg_flags = 0}
multipart = <optimized out>
ret = <optimized out>
//接收数据的seq为2089018456,但是send时的seq为628175011,
//说明接收到了错误数据。seq为2089018456的数据应该是线程1接
//收的数据。
(gdb) p *(struct nlmsghdr *)buf
$5 = {nlmsg_len = 20, nlmsg_type = 3, nlmsg_flags = 2, nlmsg_seq = 2089018456, nlmsg_pid = 8344}
(gdb) f 2
#2  0x00000000007f41de in mlx5_nl_ifindex (nl=18, name=name@entry=0x43003e75f8 "mlx5_1")at /root/dpdk-stable-18.11.2/drivers/net/mlx5/mlx5_nl.c:782
782             ret = mlx5_nl_recv(nl, seq, mlx5_nl_ifindex_cb, &data);
(gdb) info local
seq = 628175011
data = {name = 0x43003e75f8 "mlx5_1", ibindex = 0, ifindex = 0}
req = {nh = {nlmsg_len = 16, nlmsg_type = 5121, nlmsg_flags = 773, nlmsg_seq = 628175011, nlmsg_pid = 0},buf = "\020\000\000\000\001\024\005\003\243\060q%", '\000' <repeats 19 times>}
na = <optimized out>
ret = <optimized out>

通过分析调用栈可知,两个线程都会依次调用 mlx5_ifreq -> mlx5_get_ifname -> mlx5_nl_ifindex -> mlx5_nl_recv -> recvmsg,最终堵塞在 recvmsg 上。

下面分析下函数mlx5_nl_ifindex,为什么多个线程同时调用会出问题

//驱动初始化时,创建netlink类型的socket,将fd保存到
//nl_socket_rdma,多线程共用这一个fd
mlx5_pci_probe->mlx5_dev_spawn
    priv->nl_socket_rdma = mlx5_nl_init(NETLINK_RDMA);

//通过 nl_socket_rdma 到kernel获取信息
int
mlx5_get_ifname(const struct rte_eth_dev *dev, char (*ifname)[IF_NAMESIZE])
{struct mlx5_priv *priv = dev->data->dev_private;unsigned int ifindex =priv->nl_socket_rdma >= 0 ?mlx5_nl_ifindex(priv->nl_socket_rdma, priv->ibdev_name) : 0;...
}unsigned int
mlx5_nl_ifindex(int nl, const char *name)
{static const uint32_t pindex = 1;//随机分配一个序列号,用来标识一对sendmsg和recvmsguint32_t seq = random();struct mlx5_nl_ifindex_data data = {.name = name,.ibindex = 0, /* Determined during first pass. */.ifindex = 0, /* Determined during second pass. */};union {struct nlmsghdr nh;uint8_t buf[NLMSG_HDRLEN +NLA_HDRLEN + NLA_ALIGN(sizeof(data.ibindex)) +NLA_HDRLEN + NLA_ALIGN(sizeof(pindex))];} req = {.nh = {.nlmsg_len = NLMSG_LENGTH(0),.nlmsg_type = RDMA_NL_GET_TYPE(RDMA_NL_NLDEV,RDMA_NLDEV_CMD_GET),.nlmsg_flags = NLM_F_REQUEST | NLM_F_ACK | NLM_F_DUMP,},};struct nlattr *na;int ret;//先发送RDMA_NLDEV_CMD_GET消息,请求获取ibindex ret = mlx5_nl_send(nl, &req.nh, seq);if (ret < 0)return 0;//请求后的数据需要recvmsg来接收ret = mlx5_nl_recv(nl, seq, mlx5_nl_ifindex_cb, &data);if (ret < 0)return 0;...
}static int
mlx5_nl_recv(int nlsk_fd, uint32_t sn, int (*cb)(struct nlmsghdr *, void *arg),void *arg)do {recv_bytes = recvmsg(nlsk_fd, &msg, 0);if (recv_bytes == -1) {rte_errno = errno;return -rte_errno;}nh = (struct nlmsghdr *)buf;//接收的消息携带的seq必须和send时的seq相同,否则继续recvmsg} while (nh->nlmsg_seq != sn);
}

由上面代码可知,每次获取ifindex时,都会先sendmsg,再recvmsg数据,并且seq必须相同。
由上面gdb分析结果可知,线程1发送消息给kernel获取数据,kernel返回的数据分为两次才能接收完,第一次的数据被线程1接收到,但是第二次的数据被线程4接收到,而线程1还在等待接收第二次数据,所以一直堵塞在recvmsg上。

综上,只要多线程情况下,同时调用mlx5_ifreq的api都有可能导致线程hang住。

解决办法

a. 修改业务代码,加锁。或者修改dpdk driver,加锁
b. dpdk在19年的版本已经修复了此问题,patch认为dpdk应用启动后,ifindex应该一直是不变的,所以在驱动初始化时获取一次ifindex,保存下来即可,后续可以直接使用,不用再次从kernel中获取。

也可参考:dpdk mellanox网卡 多线程hang住的问题 - 简书 (jianshu.com)

dpdk mellanox网卡 多线程hang住的问题相关推荐

  1. oracle数据库延迟执行,如何诊断oracle数据库运行缓慢或hang住的问题

    为了诊断oracle运行缓慢的问题首先要决定收集哪些诊断信息,可以采取下面的诊断方法: 1.数据库运行缓慢这个问题是常见还是在特定时间出现 如果数据库运行缓慢是一个常见的问题那么可以在问题出现的时候收 ...

  2. MySQL 执行DDL语句 hang住了怎么办?

    MySQL 执行DDL语句 hang住了怎么办? 不要慌,先点支烟,听我娓娓道来! 前两天,早上7点多的时候,抓起手机忽然看到了圈内的一则DDL语句 hang住的案例,阅读到文末,发现文中留有一些疑问 ...

  3. HANA 数据库备份hang住的解决办法

    今天遇到 HANA 数据库备份hang住的情况.经过查 SAP NOTE 解决,记录一下过程.两个NOTE如下: 2452735 - HANA Backup failing with "[4 ...

  4. oracle删除表不等待,oracle故障处理之删除大表空间hang住

    背景 数据库分区表数据越来越大,需要对过期话的数据进行迁移,以及大的分区表需要进行数据的清理和删除,达到释放磁盘空间的目的. 问题说明 环境:linux 6.X 数据库:oracle 11.2.0.4 ...

  5. c++排查线程hang住_Kafka学习笔记之kafka高版本Client连接0.9Server引发的血案排查 - 时光飞逝,逝者如斯...

    0x00 概述 kafka server虽然原则上是兼容详细的client,但只是高版本的Server端兼容低版本的Client端: 在有高版本Client端连接时,会导致低版本Server集群会ha ...

  6. OGG目标端复制Sequence时Hang住的问题

    昨天遇到一个问题一个OGG的复制进程在复制序列(Sequence)时Hang住不动,进程状态一直是Running状态但是不往前进行复制,导致进程延迟6个多小时 GGSCI (ctm-3) 2> ...

  7. mysql 备库 hang住_mysql主键的缺少导致备库hang住

    最近线上频繁的出现slave延时的情况,经排查发现为用户在删除数据的时候,由于表主键的主键的缺少,同时删除条件没有索引,或或者删除的条件过滤性极差,导致slave出现hang住,严重的影响了生产环境的 ...

  8. mysql客户端hang_MySQL所有操作hang住了,怎么破?

    <MySQL所有操作hang住了,怎么破?>要点: 本文介绍了MySQL所有操作hang住了,怎么破?,希望对您有用.如果有疑问,可以联系我们. 作者介绍 王松磊,现任职于UCloud,从 ...

  9. plsql一直正在编译_使用plsql/devlop编译过程hang住案列小结

    今天有位同事在使用plsql/devleop编译存储过程时老是导致整个操作界面hang住长时间无法响应,以下是我的处理过程,简要记录之. 1.查询v$session视图确定hang住的会话相关信息,比 ...

最新文章

  1. react控制 input 框回车之后内容清空
  2. 2019.04.09 电商23 用户未登录操作数据库
  3. windows下安装consul
  4. 4.线性和卷积——边界问题、解决边界方法和Matlab实战_3
  5. OpenShift 4之设置用户/组对项目的访问权限
  6. OI树上问题 简单学习笔记
  7. 创建连接数据库(DBLink)
  8. MicrosoftStore无法下载软件
  9. 教你安装ps,pr,ae,ai等Adobe软件,办公必备
  10. 物理专业英语词汇(H-N)
  11. 亲测免费下载知网论文方法
  12. 【算法随记二】线卷积积分及其在图像增强和特效方面的应用(一)
  13. Cadence PSpice 补充1:脉冲信号源的详细介绍与使用方法图文演示
  14. 米兔机器人终于拼完了_米兔机器人上手组装如果你不会拼装或拆卸建议收藏
  15. Tiny210--3--基于Tiny210的CMOS摄像头图像采集
  16. WPF使用Live Chart之动态更新数据
  17. (一)OSG初学者入门基础教程
  18. 蓝桥杯大学JAVA题型_蓝桥杯 2020年省赛真题 10月第二场 (Java 大学B组)
  19. 浪潮服务器sa5212m4虚拟化,SA5212M4 BMC设置
  20. 基于Kafka-Zookeeper-Nginx-FIlebeat-MySQL的日志清洗分析平台搭建

热门文章

  1. 设计思维从1到100
  2. mysql 5.7 ga_mysql 5.7.9(GA) 安装
  3. 心流状态---人们做事时内心的一种状态
  4. 流量从“海量”到“僵化”,精细化运营是企业最后一根救命稻草
  5. ASP.NET MVC 分部页 PartialViewResult
  6. Python之input()函数用法,如何接收单行或多行的输入多个参数
  7. 32位操作系统升级为64位步奏
  8. JMeter 进行压力测试并发测试步骤,及文件上传并发测试演示
  9. 呼叫中心静态座席的配置
  10. 八字易经算法之用JAVA实现排八字神煞