root@hpc118:~/project/rdma/perftest# ./ib_send_bw
WARNING: BW peak won't be measured in this run.
************************************
* Waiting for client to connect... *
************************************
---------------------------------------------------------------------------------------
Send BW Test
Dual-port : OFF Device : mlx5_0
Number of qps : 1 Transport type : IB
Connection type : RC Using SRQ : OFF
PCIe relax order: ON Lock-free : OFF
ibv_wr* API : ON Using DDP : OFF
RX depth : 512
CQ Moderation : 1
Mtu : 1024[B]
Link type : Ethernet
GID index : 3
Max inline data : 0[B]
rdma_cm QPs : OFF
Data ex. method : Ethernet
---------------------------------------------------------------------------------------
local address: LID 0000 QPN 0x00b5 PSN 0x50af45
GID: 00:00:00:00:00:00:00:00:00:00:255:255:192:168:01:118
remote address: LID 0000 QPN 0x00b4 PSN 0x13043e
GID: 00:00:00:00:00:00:00:00:00:00:255:255:192:168:01:117
---------------------------------------------------------------------------------------
#bytes #iterations BW peak[MiB/sec] BW average[MiB/sec] MsgRate[Mpps]
Did not get Message for 120 Seconds, exiting..
Total Received=1, Total Iters Required=1000
root@hpc118:~/project/rdma/perftest#
--------------------- Our rdma-core --------------------- (using a self-compiled rdma-core so it can be single-stepped in a debugger)
---------------------------------------------------------------------------------------
Send BW Test
Dual-port : OFF Device : mlx5_0
Number of qps : 1 Transport type : IB
Connection type : RC Using SRQ : OFF
PCIe relax order: ON Lock-free : OFF
ibv_wr* API : ON Using DDP : OFF
TX depth : 128
CQ Moderation : 1
Mtu : 1024[B]
Link type : Ethernet
GID index : 3
Max inline data : 0[B]
rdma_cm QPs : OFF
Data ex. method : Ethernet
---------------------------------------------------------------------------------------
local address: LID 0000 QPN 0x00b4 PSN 0x13043e
GID: 00:00:00:00:00:00:00:00:00:00:255:255:192:168:01:117
remote address: LID 0000 QPN 0x00b5 PSN 0x50af45
GID: 00:00:00:00:00:00:00:00:00:00:255:255:192:168:01:118
---------------------------------------------------------------------------------------
#bytes #iterations BW peak[MiB/sec] BW average[MiB/sec] MsgRate[Mpps]
Completion with error at client
Failed status 12: wr_id 0 syndrom 0x81
scnt=128, ccnt=1
[1] + Done "/bin/gdb" --interpreter=mi --tty=${DbgTerm} 0<"/tmp/Microsoft-MIEngine-In-kqns3cnj.eq5" 1>"/tmp/Microsoft-MIEngine-Out-iuyqjstb.12s"
Analysis of the WR-posting flow in the perftest send-bandwidth test:
libmlx5.so.1!_common_wqe_init_op(uint8_t mlx5_op, int ib_op, struct ibv_qp_ex * ibqp) (\root\project\rdma\rdma-core\providers\mlx5\qp.c:1253)
libmlx5.so.1!_common_wqe_init(enum ibv_wr_opcode ib_op, struct ibv_qp_ex * ibqp) (\root\project\rdma\rdma-core\providers\mlx5\qp.c:1309)
libmlx5.so.1!_mlx5_send_wr_send(enum ibv_wr_opcode ib_op, struct ibv_qp_ex * ibqp) (\root\project\rdma\rdma-core\providers\mlx5\qp.c:1346)
libmlx5.so.1!mlx5_send_wr_send_other(struct ibv_qp_ex * ibqp) (\root\project\rdma\rdma-core\providers\mlx5\qp.c:1369)
ibv_wr_send(struct ibv_qp_ex * qp) (\usr\include\infiniband\verbs.h:1409)
_new_post_send(int index, struct perftest_parameters * user_param, struct pingpong_context * ctx) (\root\project\rdma\perftest\src\perftest_resources.c:316)
new_post_send_sge_rc(struct pingpong_context * ctx, int index, struct perftest_parameters * user_param) (\root\project\rdma\perftest\src\perftest_resources.c:493)
post_send_method(struct perftest_parameters * user_param, int index, struct pingpong_context * ctx) (\root\project\rdma\perftest\src\perftest_resources.c:681)
post_send_method(struct perftest_parameters * user_param, int index, struct pingpong_context * ctx) (\root\project\rdma\perftest\src\perftest_resources.c:676)
run_iter_bw(struct pingpong_context * ctx, struct perftest_parameters * user_param) (\root\project\rdma\perftest\src\perftest_resources.c:3629)
main(int argc, char ** argv) (\root\project\rdma\perftest\src\send_bw.c:506)
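For reference, here is a minimal sketch (not the perftest source; post_one_send and its arguments are illustrative names) of the ibv_wr_* sequence that _new_post_send() drives in the stack above:

#include <infiniband/verbs.h>

/* Post one signaled RC SEND through the extended post-send API. */
static int post_one_send(struct ibv_qp_ex *qpx, struct ibv_mr *mr,
                         void *buf, uint32_t len)
{
    ibv_wr_start(qpx);                  /* qp->wr_start -> mlx5_send_wr_start */
    qpx->wr_id    = 0;
    qpx->wr_flags = IBV_SEND_SIGNALED;  /* request a CQE for this WR */
    ibv_wr_send(qpx);                   /* -> mlx5_send_wr_send_other -> _mlx5_send_wr_send */
    ibv_wr_set_sge(qpx, mr->lkey, (uintptr_t)buf, len);
                                        /* -> mlx5_send_wr_set_sge_rc_uc */
    return ibv_wr_complete(qpx);        /* -> mlx5_send_wr_complete -> post_send_db */
}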
new_post_send_sge_rc -> _new_post_send
ibv_wr_start(ctx->qpx[index])
qp->wr_start(qp) -> mlx5_send_wr_start
ibv_wr_send(ctx->qpx[index]) -> mlx5_send_wr_send_other -> _mlx5_send_wr_send
_common_wqe_init(ibqp, ib_op)
mlx5_get_send_wqe
mqp->cur_size = (sizeof(struct mlx5_wqe_ctrl_seg) + transport_seg_sz) / 16
ibv_wr_set_sge -> mlx5_send_wr_set_sge_rc_uc
ibv_wr_complete -> mlx5_send_wr_complete
post_send_db(mqp, mqp->bf, mqp->nreq, mqp->inl_wqe, mqp->cur_size, mqp->cur_ctrl)
udma_to_device_barrier() -> make sure the WQE descriptors are written to memory before the doorbell is rung
mmio_wc_start() -> prevents the WC write from being reordered relative to other MMIO writes; it must be issued before writing to WC memory and acts as a barrier across memory types (mmio_mem = 1; mmio_flush_writes(); wc_mem = 2; must always produce TLP '1' followed by '2'). This barrier implies udma_to_device_barrier() and is intended to be used with WC memory so the CPU can emit large PCIe MemWr TLPs.
mlx5_bf_copy
mmio_memcpy_x64(dst, src, 64) -> copies data in 64-byte chunks
or mmio_write64_be
mmio_flush_writes() -> ensures the write-combining buffer is flushed out of the running CPU. This must happen inside the spinlock; otherwise a race is possible: CPU A writes doorbell 1, which waits in its WC buffer, while CPU B writes doorbell 2 and has its write flushed earlier. Since mmio_flush_writes is CPU-local, the HCA would then see doorbell 2 before doorbell 1. The flush is done before toggling bf_offset, favoring latency.
https://github.com/linux-rdma/rdma-core/commit/0d6f52347438379c905614b1bc32f824cf0cfc36
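Putting the pieces together, post_send_db() in providers/mlx5/qp.c enforces roughly the following ordering. This is a simplified sketch, not the verbatim source: the real code chooses between mlx5_bf_copy() and mmio_write64_be() based on WQE size and BlueFlame availability, and the variable names here are abbreviated.

udma_to_device_barrier();                    /* WQEs visible before the doorbell record update */
qp->db[MLX5_SND_DBR] = htobe32(qp->sq.cur_post & 0xffff);

mmio_wc_start();                             /* doorbell record committed before the BF copy */
mlx5_bf_copy(bf->reg + bf->offset, ctrl,     /* or mmio_write64_be() for a single 8-byte write */
             align(size * 16, 64), qp);

mmio_flush_writes();                         /* flush the WC buffer before releasing the lock */
bf->offset ^= bf->buf_size;                  /* switch to the other BlueFlame buffer */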
enum ibv_wr_opcode {
    IBV_WR_RDMA_WRITE,
    IBV_WR_RDMA_WRITE_WITH_IMM,
    IBV_WR_SEND,
    IBV_WR_SEND_WITH_IMM,
    IBV_WR_RDMA_READ,
    IBV_WR_ATOMIC_CMP_AND_SWP,
    IBV_WR_ATOMIC_FETCH_AND_ADD,
    IBV_WR_LOCAL_INV,
    IBV_WR_BIND_MW,
    IBV_WR_SEND_WITH_INV,
    IBV_WR_TSO,
    IBV_WR_DRIVER1,
    IBV_WR_FLUSH = 14,
    IBV_WR_ATOMIC_WRITE = 15,
};

enum {
    MLX5_OPCODE_NOP              = 0x00,
    MLX5_OPCODE_SEND_INVAL       = 0x01,
    MLX5_OPCODE_RDMA_WRITE       = 0x08,
    MLX5_OPCODE_RDMA_WRITE_IMM   = 0x09,
    MLX5_OPCODE_SEND             = 0x0a,
    MLX5_OPCODE_SEND_IMM         = 0x0b,
    MLX5_OPCODE_TSO              = 0x0e,
    MLX5_OPCODE_RDMA_READ        = 0x10,
    MLX5_OPCODE_ATOMIC_CS        = 0x11,
    MLX5_OPCODE_ATOMIC_FA        = 0x12,
    MLX5_OPCODE_ATOMIC_MASKED_CS = 0x14,
    MLX5_OPCODE_ATOMIC_MASKED_FA = 0x15,
    MLX5_OPCODE_FMR              = 0x19,
    MLX5_OPCODE_LOCAL_INVAL      = 0x1b,
    MLX5_OPCODE_CONFIG_CMD       = 0x1f,
    MLX5_OPCODE_SET_PSV          = 0x20,
    MLX5_OPCODE_UMR              = 0x25,
    MLX5_OPCODE_TAG_MATCHING     = 0x28,
    MLX5_OPCODE_FLOW_TBL_ACCESS  = 0x2c,
    MLX5_OPCODE_MMO              = 0x2F,
};

static const uint32_t mlx5_ib_opcode[] = {
    [IBV_WR_SEND]                 = MLX5_OPCODE_SEND,
    [IBV_WR_SEND_WITH_INV]        = MLX5_OPCODE_SEND_INVAL,
    [IBV_WR_SEND_WITH_IMM]        = MLX5_OPCODE_SEND_IMM,
    [IBV_WR_RDMA_WRITE]           = MLX5_OPCODE_RDMA_WRITE,
    [IBV_WR_RDMA_WRITE_WITH_IMM]  = MLX5_OPCODE_RDMA_WRITE_IMM,
    [IBV_WR_RDMA_READ]            = MLX5_OPCODE_RDMA_READ,
    [IBV_WR_ATOMIC_CMP_AND_SWP]   = MLX5_OPCODE_ATOMIC_CS,
    [IBV_WR_ATOMIC_FETCH_AND_ADD] = MLX5_OPCODE_ATOMIC_FA,
    [IBV_WR_BIND_MW]              = MLX5_OPCODE_UMR,
    [IBV_WR_LOCAL_INV]            = MLX5_OPCODE_UMR,
    [IBV_WR_TSO]                  = MLX5_OPCODE_TSO,
    [IBV_WR_DRIVER1]              = MLX5_OPCODE_UMR,
};
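This table is what _common_wqe_init_op() consults when building the WQE control segment. Below is a hedged sketch of that step, simplified from providers/mlx5/qp.c; the local variable names are assumptions, not the exact source.

uint32_t idx = mqp->sq.cur_post & (mqp->sq.wqe_cnt - 1);
struct mlx5_wqe_ctrl_seg *ctrl = mlx5_get_send_wqe(mqp, idx);

/* Pack the 16-bit send-queue index and the 8-bit mlx5 opcode into
 * opmod_idx_opcode; for IBV_WR_SEND this selects MLX5_OPCODE_SEND (0x0a). */
ctrl->opmod_idx_opcode = htobe32(((mqp->sq.cur_post & 0xffff) << 8) |
                                 mlx5_ib_opcode[IBV_WR_SEND]);
mqp->cur_ctrl = ctrl;   /* size and qpn are filled in when the WR is finalized */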
Reference: rdma-core util/udma_barrier.h. The comment for mmio_wc_start (quoted below) explains the WC-write ordering requirement:
/* Prevent WC writes from being re-ordered relative to other MMIO
writes. This should be used before a write to WC memory.
This must act as a barrier to prevent write re-ordering from different
memory types:
*mmio_mem = 1;
mmio_flush_writes();
*wc_mem = 2;
Must always produce a TLP '1' followed by '2'.
This barrier implies udma_to_device_barrier()
This is intended to be used in conjunction with WC memory to generate large
PCI-E MemWr TLPs from the CPU.
*/
#define mmio_wc_start() mmio_flush_writes()
// Memory barrier implementations for the different architectures:
#if defined(__i386__)
#define mmio_flush_writes() asm volatile("lock; addl $0,0(%%esp) " ::: "memory")
#elif defined(__x86_64__)
#define mmio_flush_writes() asm volatile("sfence" ::: "memory")
#elif defined(__PPC64__)
#define mmio_flush_writes() asm volatile("sync" ::: "memory")
#elif defined(__PPC__)
#define mmio_flush_writes() asm volatile("sync" ::: "memory")
#elif defined(__ia64__)
#define mmio_flush_writes() asm volatile("fwb" ::: "memory")
#elif defined(__sparc_v9__)
#define mmio_flush_writes() asm volatile("membar #StoreStore" ::: "memory")
#elif defined(__aarch64__)
#define mmio_flush_writes() asm volatile("dsb st" ::: "memory");
#elif defined(__sparc__)
#define mmio_flush_writes() asm volatile("" ::: "memory")
#elif defined(__loongarch__)
#define mmio_flush_writes() asm volatile("dbar 0" ::: "memory")
#elif defined(__riscv)
#define mmio_flush_writes() asm volatile("fence ow,ow" ::: "memory")
#elif defined(__s390x__)
#include "s390_mmio_insn.h"
#define mmio_flush_writes() s390_pciwb()
#elif defined(__mips__)
#define mmio_flush_writes() asm volatile("sync" ::: "memory")
#else
#error No architecture specific memory barrier defines found!
#endif
perftest: ib_send_bw
rdma-core (mlx5 user-space driver)