RDMA内核态函数ib_post_send()源码分析

最近调用linux内核下RDMA的Verb API ib_post_send()出现了问题,因此从源码分析一下这个函数的调用过程。

我使用的内核版本为5.15.0-94

这是函数ib_post_send的头文件定义,这个函数的意义是向发送队列提交发送请求,他会调用qp对应设备的post_send操作

post_send是一个函数指针

post_send函数的具体实现在infiniband驱动程序里,这是驱动程序的路径

mlx5驱动作为内核模块载入内核:

模块载入过程中会进行相关变量、函数的绑定和初始化。其中post_send函数指针对应的函数为mlx5_ib_post_send_nodrain

而这个函数会调用mlx5_ib_post_send函数。

下面这个函数就是最后调用的函数。

这个函数的完整代码如下:

c 复制代码
int mlx5_ib_post_send(struct ib_qp *ibqp, const struct ib_send_wr *wr,
		      const struct ib_send_wr **bad_wr, bool drain)
{
	struct mlx5_wqe_ctrl_seg *ctrl = NULL;  /* compiler warning */
	struct mlx5_ib_dev *dev = to_mdev(ibqp->device);
	struct mlx5_core_dev *mdev = dev->mdev;
	struct mlx5_ib_qp *qp = to_mqp(ibqp);
	struct mlx5_wqe_xrc_seg *xrc;
	struct mlx5_bf *bf;
	void *cur_edge;
	int size;
	unsigned long flags;
	unsigned int idx;
	int err = 0;
	int num_sge;
	void *seg;
	int nreq;
	int i;
	u8 next_fence = 0;
	u8 fence;

	if (unlikely(mdev->state == MLX5_DEVICE_STATE_INTERNAL_ERROR &&
		     !drain)) {
		*bad_wr = wr;
		return -EIO;
	}

	if (qp->type == IB_QPT_GSI)
		return mlx5_ib_gsi_post_send(ibqp, wr, bad_wr);

	bf = &qp->bf;

	spin_lock_irqsave(&qp->sq.lock, flags);

	for (nreq = 0; wr; nreq++, wr = wr->next) {
		if (unlikely(wr->opcode >= ARRAY_SIZE(mlx5_ib_opcode))) {
			mlx5_ib_warn(dev, "\n");
			err = -EINVAL;
			*bad_wr = wr;
			goto out;
		}

		num_sge = wr->num_sge;
		if (unlikely(num_sge > qp->sq.max_gs)) {
			mlx5_ib_warn(dev, "\n");
			err = -EINVAL;
			*bad_wr = wr;
			goto out;
		}

		err = begin_wqe(qp, &seg, &ctrl, wr, &idx, &size, &cur_edge,
				nreq);
		if (err) {
			mlx5_ib_warn(dev, "\n");
			err = -ENOMEM;
			*bad_wr = wr;
			goto out;
		}

		if (wr->opcode == IB_WR_REG_MR ||
		    wr->opcode == IB_WR_REG_MR_INTEGRITY) {
			fence = dev->umr_fence;
			next_fence = MLX5_FENCE_MODE_INITIATOR_SMALL;
		} else  {
			if (wr->send_flags & IB_SEND_FENCE) {
				if (qp->next_fence)
					fence = MLX5_FENCE_MODE_SMALL_AND_FENCE;
				else
					fence = MLX5_FENCE_MODE_FENCE;
			} else {
				fence = qp->next_fence;
			}
		}

		switch (qp->type) {
		case IB_QPT_XRC_INI:
			xrc = seg;
			seg += sizeof(*xrc);
			size += sizeof(*xrc) / 16;
			fallthrough;
		case IB_QPT_RC:
			err = handle_qpt_rc(dev, qp, wr, &ctrl, &seg, &size,
					    &cur_edge, &idx, nreq, fence,
					    next_fence, &num_sge);
			if (unlikely(err)) {
				*bad_wr = wr;
				goto out;
			} else if (wr->opcode == IB_WR_REG_MR_INTEGRITY) {
				goto skip_psv;
			}
			break;

		case IB_QPT_UC:
			handle_qpt_uc(wr, &seg, &size);
			break;
		case IB_QPT_SMI:
			if (unlikely(!dev->port_caps[qp->port - 1].has_smi)) {
				mlx5_ib_warn(dev, "Send SMP MADs is not allowed\n");
				err = -EPERM;
				*bad_wr = wr;
				goto out;
			}
			fallthrough;
		case MLX5_IB_QPT_HW_GSI:
			handle_qpt_hw_gsi(qp, wr, &seg, &size, &cur_edge);
			break;
		case IB_QPT_UD:
			handle_qpt_ud(qp, wr, &seg, &size, &cur_edge);
			break;
		case MLX5_IB_QPT_REG_UMR:
			err = handle_qpt_reg_umr(dev, qp, wr, &ctrl, &seg,
						       &size, &cur_edge, idx);
			if (unlikely(err))
				goto out;
			break;

		default:
			break;
		}

		if (wr->send_flags & IB_SEND_INLINE && num_sge) {
			err = set_data_inl_seg(qp, wr, &seg, &size, &cur_edge);
			if (unlikely(err)) {
				mlx5_ib_warn(dev, "\n");
				*bad_wr = wr;
				goto out;
			}
		} else {
			for (i = 0; i < num_sge; i++) {
				handle_post_send_edge(&qp->sq, &seg, size,
						      &cur_edge);
				if (unlikely(!wr->sg_list[i].length))
					continue;

				set_data_ptr_seg(
					(struct mlx5_wqe_data_seg *)seg,
					wr->sg_list + i);
				size += sizeof(struct mlx5_wqe_data_seg) / 16;
				seg += sizeof(struct mlx5_wqe_data_seg);
			}
		}

		qp->next_fence = next_fence;
		finish_wqe(qp, ctrl, seg, size, cur_edge, idx, wr->wr_id, nreq,
			   fence, mlx5_ib_opcode[wr->opcode]);
skip_psv:
		if (0)
			dump_wqe(qp, idx, size);
	}

out:
	if (likely(nreq)) {
		qp->sq.head += nreq;

		/* Make sure that descriptors are written before
		 * updating doorbell record and ringing the doorbell
		 */
		wmb();

		qp->db.db[MLX5_SND_DBR] = cpu_to_be32(qp->sq.cur_post);

		/* Make sure doorbell record is visible to the HCA before
		 * we hit doorbell.
		 */
		wmb();

		mlx5_write64((__be32 *)ctrl, bf->bfreg->map + bf->offset);
		/* Make sure doorbells don't leak out of SQ spinlock
		 * and reach the HCA out of order.
		 */
		bf->offset ^= bf->buf_size;
	}

	spin_unlock_irqrestore(&qp->sq.lock, flags);

	return err;
}

此函数介绍如下:

1.函数参数:这个函数接收四个参数,分别是一个指向ib_qp结构的指针(代表一个InfiniBand队列对),一个指向ib_send_wr结构的指针(代表一个发送工作请求),一个指向ib_send_wr指针的指针(用于返回出错的工作请求),以及一个布尔值(表示是否为"drain"操作)。

2.函数首先进行错误检查,例如检查设备状态,检查工作请求的操作码,以及检查工作请求的SGE(Scatter/Gather Entry)数量。

3.函数的主要部分是一个循环,对每个工作请求进行处理。处理包括开始一个WQE(Work Queue Entry),设置控制段,处理不同类型的QP(例如RC、UC、UD等),设置数据段,以及完成WQE。

4.在处理完所有的工作请求后,函数会更新doorbell record,并通过写入doorbell来通知硬件开始处理WQE。

综上所述,当在内核中调用ib_post_send函数时,会触发infiniBand模块的回调函数mlx5_ib_post_send_nodrain,最后会调用到函数

c 复制代码
int mlx5_ib_post_send(struct ib_qp *ibqp, const struct ib_send_wr *wr,
		      const struct ib_send_wr **bad_wr, bool drain)
相关推荐
KIDGINBROOK1 个月前
RDMA驱动学习(一)- 用户态到内核态的过程
rdma
羌俊恩2 个月前
Linux 常见的冷知识集锦
linux·rdma·posix
我想学LINUX2 个月前
【RDMA项目】如何使用rdma-core进行调用开发一个实战项目
服务器·网络·github·客户端·rdma·rdma-core·内存访问
我想学LINUX3 个月前
RDMA 高性能架构基本原理与设计方案
linux·架构·嵌入式·rdma·系统开发·rdma设计思路·rdma基本元素架构
北冥有鱼被烹3 个月前
RDMA建链的3次握手和断链的4次挥手流程?
rdma·rdma_cm
妙BOOK言3 个月前
Rcmp: Reconstructing RDMA-Based Memory Disaggregation via CXL——论文阅读
论文阅读·cxl·rdma·内存分解
Pretend ^^4 个月前
3. 使用tcpdump抓取rdma数据包
网络·wireshark·tcpdump·rdma
北冥有鱼被烹5 个月前
【DPU系列之】Bluefield 2 DPU卡的功能图,ConnectX网卡、ARM OS、Host OS的关系?(通过PCIe Switch连接)
rdma·dpu·mellanox
qq_426328407 个月前
RDMA内核态函数ib_post_recv()源码分析
rdma