
DPDK Mellanox NIC: multi-threaded hang issue (rte_eth_link_get)

For Mellanox NICs using the DPDK driver, in a multi-threaded scenario where several threads concurrently query the link status, read statistics counters, set the MTU, and so on, the threads can end up blocked. Below we reproduce the problem with the DPDK example l2fwd and analyze the cause.

Reproduction

    # cd dpdk-stable-18.11.2
    # export RTE_TARGET=build
    # export RTE_SDK=`pwd`
    # make config T=x86_64-native-linuxapp-gcc
    # make -j32
    # cd examples/l2fwd

Modify l2fwd's main.c slightly, as follows: the thread on the first core queries the link status, and the thread on the second core sets the MTU.

    static void
    l2fwd_main_loop(void)
    {
        unsigned lcore_id;
        struct lcore_queue_conf *qconf;
        struct rte_eth_link eth_link;

        lcore_id = rte_lcore_id();
        qconf = &lcore_queue_conf[lcore_id];
        while (!force_quit) {
            if (lcore_id == 0) {
                /* thread on core 0 queries the link status */
                rte_eth_link_get(qconf->rx_port_list[0], &eth_link);
                RTE_LOG(INFO, L2FWD, "link is %d on core %d\n",
                        eth_link.link_status, lcore_id);
            } else if (lcore_id == 1) {
                /* thread on core 1 sets the MTU */
                rte_eth_dev_set_mtu(qconf->rx_port_list[0], 1500);
                RTE_LOG(INFO, L2FWD, "set mtu on core %d\n", lcore_id);
            }
            usleep(300);
        }
    }

Compile and run l2fwd, using -c to select two CPUs. You can see the threads have hung:

    # make
    # ./build/l2fwd -c3 -n4 -w 82:00.1 -- -p1
    EAL: Detected 40 lcore(s)
    EAL: Detected 2 NUMA nodes
    EAL: Multi-process socket /var/run/dpdk/rte/mp_socket
    EAL: Probing VFIO support...
    EAL: VFIO support initialized
    EAL: PCI device 0000:82:00.1 on NUMA socket 1
    EAL: probe driver: 15b3:1015 net_mlx5
    MAC updating enabled
    Notice: odd number of ports in portmask.
    Lcore 0: RX port 0
    Initializing port 0... done:
    Port 0, MAC address: 50:6B:4B:C0:9B:C5
    Checking link status...done
    Port0 Link Up. Speed 25000 Mbps - full-duplex
    ^C

Root cause analysis

Use gdb to inspect the thread states and call stacks:

    # ps -ef | grep l2fwd
    root 8344 7232 0 05:45 pts/3 00:00:00 ./build/l2fwd -c3 -n4 -w 82:00.1 -- -p1
    root 8353 7790 0 05:47 pts/0 00:00:00 grep --color=auto l2fwd
    # gdb -p 8344
    ...
    //Four threads in total. Thread 1 and thread 4 are the ones querying the
    //link status and setting the MTU; both are blocked in recvmsg.
    (gdb) info thread
    Id Target Id Frame
    1 Thread 0x7f68e4981c00 (LWP 8344) "l2fwd" 0x00007f68e35ce94d in recvmsg () at ../sysdeps/unix/syscall-template.S:84
    2 Thread 0x7f68e2d71700 (LWP 8345) "eal-intr-thread" 0x00007f68e32fba13 in epoll_wait () at ../sysdeps/unix/syscall-template.S:84
    3 Thread 0x7f68e2570700 (LWP 8346) "rte_mp_handle" 0x00007f68e35ce94d in recvmsg () at ../sysdeps/unix/syscall-template.S:84
    * 4 Thread 0x7f68e1d6f700 (LWP 8347) "lcore-slave-1" 0x00007f68e35ce94d in recvmsg () at ../sysdeps/unix/syscall-template.S:84
    //Thread 1 is blocked while querying the link status.
    (gdb) thread 1
    [Switching to thread 1 (Thread 0x7f68e4981c00 (LWP 8344))]
    #0 0x00007f68e35ce94d in recvmsg () at ../sysdeps/unix/syscall-template.S:84
    84 ../sysdeps/unix/syscall-template.S: No such file or directory.
    (gdb) bt
    #0 0x00007f68e35ce94d in recvmsg () at ../sysdeps/unix/syscall-template.S:84
    #1 0x00000000007f2ad6 in mlx5_nl_recv (nlsk_fd=18, sn=2089018456, cb=0x7f2c70 <mlx5_nl_ifindex_cb>, arg=0x7fff4edcd440)
    at /root/dpdk-stable-18.11.2/drivers/net/mlx5/mlx5_nl.c:266
    #2 0x00000000007f41de in mlx5_nl_ifindex (nl=18, name=name@entry=0x43003e75f8 "mlx5_1")
    at /root/dpdk-stable-18.11.2/drivers/net/mlx5/mlx5_nl.c:782
    #3 0x00000000007d6015 in mlx5_get_ifname (dev=0xf7cf40 <rte_eth_devices>, ifname=0x7fff4edcd780)
    at /root/dpdk-stable-18.11.2/drivers/net/mlx5/mlx5_ethdev.c:225
    #4 0x00000000007d6869 in mlx5_ifreq (ifr=0x7fff4edcd780, req=35091, dev=0xf7cf40 <rte_eth_devices>)
    at /root/dpdk-stable-18.11.2/drivers/net/mlx5/mlx5_ethdev.c:285
    #5 mlx5_link_update_unlocked_gs (dev=dev@entry=0xf7cf40 <rte_eth_devices>, link=link@entry=0x7fff4edcd830)
    at /root/dpdk-stable-18.11.2/drivers/net/mlx5/mlx5_ethdev.c:695
    #6 0x00000000007d8833 in mlx5_link_update (dev=0xf7cf40 <rte_eth_devices>, wait_to_complete=1)
    at /root/dpdk-stable-18.11.2/drivers/net/mlx5/mlx5_ethdev.c:804
    #7 0x000000000051b1cf in rte_eth_link_get (port_id=<optimized out>, eth_link=0x7fff4edcd8a0)
    at /root/dpdk-stable-18.11.2/lib/librte_ethdev/rte_ethdev.c:1913
    #8 0x000000000047be2e in l2fwd_main_loop ()
    at /root/dpdk-stable-18.11.2/examples/l2fwd/main.c:210
    #9 0x000000000047c1dc in l2fwd_launch_one_lcore (dummy=0x0) at /root/dpdk-stable-18.11.2/examples/l2fwd/main.c:296
    #10 0x0000000000562b7b in rte_eal_mp_remote_launch (f=0x47c1cb <l2fwd_launch_one_lcore>, arg=0x0, call_master=CALL_MASTER)
    at /root/dpdk-stable-18.11.2/lib/librte_eal/common/eal_common_launch.c:62
    #11 0x000000000047d234 in main (argc=2, argv=0x7fff4edce890) at /root/dpdk-stable-18.11.2/examples/l2fwd/main.c:739
    (gdb) info local
    No locals.
    (gdb) f 1
    #1 0x00000000007f2ad6 in mlx5_nl_recv (nlsk_fd=18, sn=2089018456, cb=0x7f2c70 <mlx5_nl_ifindex_cb>, arg=0x7fff4edcd440)
    at /root/dpdk-stable-18.11.2/drivers/net/mlx5/mlx5_nl.c:266
    266 recv_bytes = recvmsg(nlsk_fd, &msg, 0);
    (gdb) info local
    nh = <optimized out>
    recv_bytes = <optimized out>
    sa = {nl_family = 16, nl_pad = 0, nl_pid = 0, nl_groups = 0}
    buf = "|\000\000\000\001\024\002\000X\344\203|\230 \000\000\b\000\001\000\003\000\000\000\v\000\002\000mlx5_3\000\000\b\000\003\000\001\000\000\000\f\000\004\000\066\034r\255\027\000\000 \017\000\005\000\061\064.29.2002\000\000\f\000\006\000\000\000\000\000\000\000\000\000\f\000\a\000ě\300\000\003KkP\005\000\016\000\001\000\000\000\005\000T\000\001\000\000\000\t\000C\000roce", '\000' <repeats 16 times>, "\060-\177", '\000' <repeats 13 times>, "\001\000\000\000\v\000\000\000T\000\000\000\001\024\000\000\000U\334N\377\177\000\000\022\000\000\000\000\000\000\000\016P\233Q", '\000' <repeats 12 times>, "T"...
    iov = {iov_base = 0x7fff4edc53e0, iov_len = 32768}
    msg = {msg_name = 0x7fff4edc5380, msg_namelen = 12, msg_iov = 0x7fff4edc5390, msg_iovlen = 1, msg_control = 0x0, msg_controllen = 0,
    msg_flags = 0}
    multipart = <optimized out>
    ret = <optimized out>
    //The received message's seq is 2089018456, and the seq used at send time
    //was also 2089018456, so this is the expected reply. nlmsg_flags is 2
    //(NLM_F_MULTI) and nlmsg_type is not 0x3 (NLMSG_DONE), meaning more data
    //follows, so recvmsg is called again.
    (gdb) p *(struct nlmsghdr *)buf
    $7 = {nlmsg_len = 124, nlmsg_type = 5121, nlmsg_flags = 2, nlmsg_seq = 2089018456, nlmsg_pid = 8344}
    (gdb) f 2
    #2 0x00000000007f41de in mlx5_nl_ifindex (nl=18, name=name@entry=0x43003e75f8 "mlx5_1")
    at /root/dpdk-stable-18.11.2/drivers/net/mlx5/mlx5_nl.c:782
    782 ret = mlx5_nl_recv(nl, seq, mlx5_nl_ifindex_cb, &data);
    //ibindex = 1 was parsed out of the first batch of received data.
    (gdb) info local
    seq = 2089018456
    data = {name = 0x43003e75f8 "mlx5_1", ibindex = 1, ifindex = 0}
    req = {nh = {nlmsg_len = 16, nlmsg_type = 5121, nlmsg_flags = 773, nlmsg_seq = 2089018456, nlmsg_pid = 0},
    buf = "\020\000\000\000\001\024\005\003X\344\203|", '\000' <repeats 19 times>}
    na = <optimized out>
    ret = <optimized out>
    //Thread 4 is blocked while setting the MTU.
    (gdb) thread 4
    [Switching to thread 4 (Thread 0x7f68e1d6f700 (LWP 8347))]
    #0 0x00007f68e35ce94d in recvmsg () at ../sysdeps/unix/syscall-template.S:84
    84 ../sysdeps/unix/syscall-template.S: No such file or directory.
    (gdb) bt
    #0 0x00007f68e35ce94d in recvmsg () at ../sysdeps/unix/syscall-template.S:84
    #1 0x00000000007f2ad6 in mlx5_nl_recv (nlsk_fd=18, sn=628175011, cb=0x7f2c70 <mlx5_nl_ifindex_cb>, arg=0x7f68e1d6d020)
    at /root/dpdk-stable-18.11.2/drivers/net/mlx5/mlx5_nl.c:266
    #2 0x00000000007f41de in mlx5_nl_ifindex (nl=18, name=name@entry=0x43003e75f8 "mlx5_1")
    at /root/dpdk-stable-18.11.2/drivers/net/mlx5/mlx5_nl.c:782
    #3 0x00000000007d6015 in mlx5_get_ifname (dev=0xf7cf40 <rte_eth_devices>, ifname=0x7f68e1d6d360)
    at /root/dpdk-stable-18.11.2/drivers/net/mlx5/mlx5_ethdev.c:225
    #4 0x00000000007d6869 in mlx5_ifreq (ifr=0x7f68e1d6d360, req=35091, dev=0xf7cf40 <rte_eth_devices>)
    at /root/dpdk-stable-18.11.2/drivers/net/mlx5/mlx5_ethdev.c:285
    #5 mlx5_link_update_unlocked_gs (dev=dev@entry=0xf7cf40 <rte_eth_devices>, link=link@entry=0x7f68e1d6d410)
    at /root/dpdk-stable-18.11.2/drivers/net/mlx5/mlx5_ethdev.c:695
    #6 0x00000000007d8833 in mlx5_link_update (dev=0xf7cf40 <rte_eth_devices>, wait_to_complete=1)
    at /root/dpdk-stable-18.11.2/drivers/net/mlx5/mlx5_ethdev.c:804
    #7 0x000000000051b1cf in rte_eth_link_get (port_id=<optimized out>, eth_link=0x7f68e1d6d480)
    at /root/dpdk-stable-18.11.2/lib/librte_ethdev/rte_ethdev.c:1913
    #8 0x000000000047be2e in l2fwd_main_loop () at /root/dpdk-stable-18.11.2/examples/l2fwd/main.c:210
    #9 0x000000000047c1dc in l2fwd_launch_one_lcore (dummy=0x0) at /root/dpdk-stable-18.11.2/examples/l2fwd/main.c:296
    #10 0x0000000000557ae1 in eal_thread_loop (arg=<optimized out>)
    at /root/dpdk-stable-18.11.2/lib/librte_eal/linuxapp/eal/eal_thread.c:153
    #11 0x00007f68e35c56ba in start_thread (arg=0x7f68e1d6f700) at pthread_create.c:333
    #12 0x00007f68e32fb41d in clone () at ../sysdeps/unix/sysv/linux/x86_64/clone.S:109
    (gdb) f 1
    #1 0x00000000007f2ad6 in mlx5_nl_recv (nlsk_fd=18, sn=628175011, cb=0x7f2c70 <mlx5_nl_ifindex_cb>, arg=0x7f68e1d6d020)
    at /root/dpdk-stable-18.11.2/drivers/net/mlx5/mlx5_nl.c:266
    266 recv_bytes = recvmsg(nlsk_fd, &msg, 0);
    (gdb) info local
    nh = <optimized out>
    recv_bytes = <optimized out>
    sa = {nl_family = 16, nl_pad = 0, nl_pid = 0, nl_groups = 0}
    buf = "\024\000\000\000\003\000\002\000X\344\203|\230 ", '\000' <repeats 28106 times>...
    iov = {iov_base = 0x7f68e1d64fc0, iov_len = 32768}
    msg = {msg_name = 0x7f68e1d64f60, msg_namelen = 12, msg_iov = 0x7f68e1d64f70, msg_iovlen = 1, msg_control = 0x0, msg_controllen = 0,
    msg_flags = 0}
    multipart = <optimized out>
    ret = <optimized out>
    //The last received message's seq is 2089018456, but the seq used at send
    //time was 628175011: this thread received the wrong data. The message
    //with seq 2089018456 should have gone to thread 1.
    (gdb) p *(struct nlmsghdr *)buf
    $5 = {nlmsg_len = 20, nlmsg_type = 3, nlmsg_flags = 2, nlmsg_seq = 2089018456, nlmsg_pid = 8344}
    (gdb) f 2
    #2 0x00000000007f41de in mlx5_nl_ifindex (nl=18, name=name@entry=0x43003e75f8 "mlx5_1")
    at /root/dpdk-stable-18.11.2/drivers/net/mlx5/mlx5_nl.c:782
    782 ret = mlx5_nl_recv(nl, seq, mlx5_nl_ifindex_cb, &data);
    (gdb) info local
    seq = 628175011
    data = {name = 0x43003e75f8 "mlx5_1", ibindex = 0, ifindex = 0}
    req = {nh = {nlmsg_len = 16, nlmsg_type = 5121, nlmsg_flags = 773, nlmsg_seq = 628175011, nlmsg_pid = 0},
    buf = "\020\000\000\000\001\024\005\003\243\060q%", '\000' <repeats 19 times>}
    na = <optimized out>
    ret = <optimized out>

Analyzing the call stacks shows that both threads go through mlx5_ifreq->mlx5_get_ifname->mlx5_nl_ifindex->mlx5_nl_recv->recvmsg and end up blocked in recvmsg.

Let's now look at mlx5_nl_ifindex to understand why calling it from multiple threads concurrently goes wrong:

    //At driver initialization, a netlink socket is created and its fd is
    //saved in nl_socket_rdma; all threads share this one fd.
    mlx5_pci_probe->mlx5_dev_spawn
        priv->nl_socket_rdma = mlx5_nl_init(NETLINK_RDMA);

    //Information is fetched from the kernel over nl_socket_rdma.
    int
    mlx5_get_ifname(const struct rte_eth_dev *dev, char (*ifname)[IF_NAMESIZE])
    {
        struct mlx5_priv *priv = dev->data->dev_private;
        unsigned int ifindex =
            priv->nl_socket_rdma >= 0 ?
            mlx5_nl_ifindex(priv->nl_socket_rdma, priv->ibdev_name) : 0;
        ...
    }

    unsigned int
    mlx5_nl_ifindex(int nl, const char *name)
    {
        static const uint32_t pindex = 1;
        //A random sequence number identifies one sendmsg/recvmsg pair.
        uint32_t seq = random();
        struct mlx5_nl_ifindex_data data = {
            .name = name,
            .ibindex = 0, /* Determined during first pass. */
            .ifindex = 0, /* Determined during second pass. */
        };
        union {
            struct nlmsghdr nh;
            uint8_t buf[NLMSG_HDRLEN +
                        NLA_HDRLEN + NLA_ALIGN(sizeof(data.ibindex)) +
                        NLA_HDRLEN + NLA_ALIGN(sizeof(pindex))];
        } req = {
            .nh = {
                .nlmsg_len = NLMSG_LENGTH(0),
                .nlmsg_type = RDMA_NL_GET_TYPE(RDMA_NL_NLDEV,
                                               RDMA_NLDEV_CMD_GET),
                .nlmsg_flags = NLM_F_REQUEST | NLM_F_ACK | NLM_F_DUMP,
            },
        };
        struct nlattr *na;
        int ret;

        //First send an RDMA_NLDEV_CMD_GET request asking for the ibindex.
        ret = mlx5_nl_send(nl, &req.nh, seq);
        if (ret < 0)
            return 0;
        //The reply then has to be collected with recvmsg.
        ret = mlx5_nl_recv(nl, seq, mlx5_nl_ifindex_cb, &data);
        if (ret < 0)
            return 0;
        ...
    }

    static int
    mlx5_nl_recv(int nlsk_fd, uint32_t sn, int (*cb)(struct nlmsghdr *, void *arg),
                 void *arg)
    {
        ...
        do {
            recv_bytes = recvmsg(nlsk_fd, &msg, 0);
            if (recv_bytes == -1) {
                rte_errno = errno;
                return -rte_errno;
            }
            nh = (struct nlmsghdr *)buf;
        //The seq carried by the received message must equal the seq used at
        //send time; otherwise recvmsg is called again.
        } while (nh->nlmsg_seq != sn);
        ...
    }

The code above shows that every ifindex lookup first does a sendmsg and then a recvmsg, and the reply's seq must match the request's.
The gdb analysis shows what went wrong: thread 1 sent a request to the kernel, and the kernel's reply arrived in two batches. Thread 1 received the first batch, but the second batch was consumed by thread 4, so thread 1 keeps waiting for it and blocks in recvmsg forever. Thread 4, in turn, discards the mismatched message and blocks waiting for its own reply, which was presumably drained (and discarded for its mismatched seq) by thread 1's loop.
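
To make the race concrete, below is a minimal standalone sketch of the same pattern, using an ordinary rtnetlink RTM_GETLINK dump as a stand-in for the RDMA netlink request (everything here is illustrative, not the mlx5 code): two threads share one netlink fd, each sends a dump tagged with its own seq, then loops in recv() keeping only messages that carry its seq, exactly like mlx5_nl_recv. A reply batch pulled by the wrong thread is consumed and lost, so its rightful owner can block forever; depending on timing, the kernel may instead reject the second concurrent dump with an error.

    #include <sys/socket.h>
    #include <linux/netlink.h>
    #include <linux/rtnetlink.h>
    #include <pthread.h>
    #include <stdint.h>
    #include <stdio.h>
    #include <unistd.h>

    static int nlsk_fd; /* one socket shared by both threads, like nl_socket_rdma */

    static void *
    dump_links(void *arg)
    {
        uint32_t seq = (uint32_t)(uintptr_t)arg;
        struct {
            struct nlmsghdr nh;
            struct ifinfomsg ifm;
        } req = {
            .nh = {
                .nlmsg_len = NLMSG_LENGTH(sizeof(struct ifinfomsg)),
                .nlmsg_type = RTM_GETLINK,
                .nlmsg_flags = NLM_F_REQUEST | NLM_F_DUMP,
                .nlmsg_seq = seq,
            },
            .ifm = { .ifi_family = AF_UNSPEC },
        };
        char buf[32768];
        struct nlmsghdr *nh;
        int len;

        send(nlsk_fd, &req, req.nh.nlmsg_len, 0);
        for (;;) {
            /* A datagram pulled here may belong to the other thread's dump;
             * once read, it is gone from the socket for good. */
            len = (int)recv(nlsk_fd, buf, sizeof(buf), 0);
            if (len < 0)
                return NULL;
            for (nh = (struct nlmsghdr *)buf; NLMSG_OK(nh, len);
                 nh = NLMSG_NEXT(nh, len)) {
                if (nh->nlmsg_seq != seq)
                    continue; /* not ours: silently dropped, as in mlx5_nl_recv */
                if (nh->nlmsg_type == NLMSG_ERROR) {
                    /* e.g. EBUSY if the kernel refuses a second dump */
                    printf("seq %u: kernel returned an error\n", seq);
                    return NULL;
                }
                if (nh->nlmsg_type == NLMSG_DONE) {
                    printf("seq %u: dump complete\n", seq);
                    return NULL;
                }
            }
        }
    }

    int
    main(void)
    {
        pthread_t t1, t2;

        nlsk_fd = socket(AF_NETLINK, SOCK_RAW, NETLINK_ROUTE);
        if (nlsk_fd < 0)
            return 1;
        pthread_create(&t1, NULL, dump_links, (void *)(uintptr_t)1111);
        pthread_create(&t2, NULL, dump_links, (void *)(uintptr_t)2222);
        pthread_join(t1, NULL);
        pthread_join(t2, NULL);
        close(nlsk_fd);
        return 0;
    }

Build with gcc -pthread. Whether a given run hangs depends on how the two dumps interleave, which is exactly why the production problem is intermittent.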

In summary, in a multi-threaded scenario, any APIs that concurrently reach mlx5_ifreq may hang the calling threads.

Solutions

a. Add locking: either serialize these calls in the application code, or add a lock in the dpdk driver (see the first sketch after this list).
b. DPDK fixed this problem in its 2019 releases. The patch's reasoning is that the ifindex should never change once the dpdk application has started, so the driver resolves it once at initialization and caches it; subsequent calls use the cached value instead of querying the kernel again (see the second sketch below).
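
For option (a), here is a minimal sketch of application-side locking, assuming DPDK 18.11 (the wrapper names and the mutex are made up for this example; rte_eth_link_get returns void in this version):

    #include <pthread.h>
    #include <rte_ethdev.h>

    /* One lock serializing every control-path call that can reach mlx5_ifreq. */
    static pthread_mutex_t ctrl_lock = PTHREAD_MUTEX_INITIALIZER;

    static void
    safe_link_get(uint16_t port_id, struct rte_eth_link *link)
    {
        pthread_mutex_lock(&ctrl_lock);
        rte_eth_link_get(port_id, link);
        pthread_mutex_unlock(&ctrl_lock);
    }

    static int
    safe_set_mtu(uint16_t port_id, uint16_t mtu)
    {
        int ret;

        pthread_mutex_lock(&ctrl_lock);
        ret = rte_eth_dev_set_mtu(port_id, mtu);
        pthread_mutex_unlock(&ctrl_lock);
        return ret;
    }

And the shape of option (b), with made-up names (not the actual patch): resolve the ifindex once in the single-threaded probe path, cache it, and never touch the shared netlink fd from the control path again.

    /* declared by the driver */
    unsigned int mlx5_nl_ifindex(int nl, const char *name);

    struct port_priv {
        int nl_socket_rdma;    /* shared netlink fd, only used at probe time */
        unsigned int if_index; /* cached ifindex, read-only afterwards */
    };

    /* probe time (single-threaded): the one and only netlink round-trip */
    static void
    probe_cache_ifindex(struct port_priv *priv, const char *ibdev_name)
    {
        priv->if_index = mlx5_nl_ifindex(priv->nl_socket_rdma, ibdev_name);
    }

    /* control path: no netlink traffic, hence no cross-thread seq race */
    static unsigned int
    port_ifindex(const struct port_priv *priv)
    {
        return priv->if_index;
    }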

See also: dpdk mellanox网卡 多线程hang住的问题 - 简书 (jianshu.com)
