1. rdma 命令官网查看
rdma 命令是 iproute2 工具包的一部分,主要用于管理 RDMA(Remote Direct Memory Access)设备。相关文档可以在以下位置找到:
官方文档
- iproute2 官方文档 :https://wiki.linuxfoundation.org/networking/iproute2
- Linux 内核文档 :https://www.kernel.org/doc/html/latest/
- man 手册 :
man rdma 或 man 8 rdma
GitHub 仓库
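iproute2 的源代码和文档:https://github.com/iproute2/iproute2(上游仓库:https://git.kernel.org/pub/scm/network/iproute2/iproute2.git)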
2. rdma 命令用法
版本:
rdma -V
rdma utility, iproute2-5.15.0
ofed_info -s
MLNX_OFED_LINUX-23.10-2.1.3.1:
# 查看 rdma 命令的帮助
man rdma # 查看完整手册
rdma help
rdma resource help
# 尝试最基本的命令
rdma
rdma resource
# 尝试其他资源类型
rdma resource show qp
rdma resource show cq
rdma resource show pd
rdma resource show mr
# 输出格式
尝试不同的输出格式(-d 显示详细信息,-j 以 JSON 格式输出,-p 美化输出)
rdma -d resource show qp
rdma -j resource show qp
rdma -p resource show qp
设备管理 (dev)
rdma dev
0: mlx5_0: node_type ca fw 32.42.1000 node_guid b8e9:2403:000a:b6c6 sys_image_guid b8e9:2403:000a:b6c6
1: mlx5_1: node_type ca fw 32.42.1000 node_guid b8e9:2403:000a:b6c7 sys_image_guid b8e9:2403:000a:b6c6
2: mlx5_2: node_type ca fw 28.39.3900 node_guid 5c25:7303:00a9:0de0 sys_image_guid 5c25:7303:00a9:0de0
rdma dev show mlx5_0
链路管理 (link)
rdma link
rdma link show mlx5_0/1
资源管理 (resource)
显示 CQ(完成队列)
rdma resource show cq
显示 PD(保护域)
rdma resource show pd
显示 MR(内存区域)
rdma resource show mr
显示 QP(队列对)
rdma resource show qp
显示 CM(连接管理器)ID
rdma resource show cm_id
显示特定设备的资源
rdma resource show cq dev mlx5_0
系统参数管理 (system)
显示系统参数
rdma system show
显示特定参数
rdma system show netns
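rdma system show caps # 注意:rdma-system(8) 手册中 system 对象只有 netns 参数(较新版本另有 privileged-qkey),没有 caps;设备能力请用 rdma -d dev show 查看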
设置系统参数
rdma system set netns exclusive
rdma system set caps exclusive # 注意:rdma-system(8) 手册中没有 caps 参数,shared/exclusive 只适用于 netns,使用前请先用 rdma system help 确认
统计信息 (statistic)
rdma statistic help
rdma statistic
rdma stat
用例
显示网卡上创建的 QP
# rdma resource show qp
link mlx5_0/1 lqpn 72 type UD state RTS sq-psn 1816732 comm [mlx5_ib]
link mlx5_0/1 lqpn 91 rqpn 10637 type RC state RTS rq-psn 4820664 sq-psn 7848440 path-mig-state MIGRATED pdn 6 pid 7433 comm NIO_SERV/0
link mlx5_0/1 lqpn 92 rqpn 10638 type RC state RTS rq-psn 1700215 sq-psn 6347068 path-mig-state MIGRATED pdn 6 pid 7443 comm NIO_SERV/3
link mlx5_0/1 lqpn 94 rqpn 10640 type RC state RTS rq-psn 12590440 sq-psn 13302608 path-mig-state MIGRATED pdn 6 pid 7433 comm NIO_SERV/0
link mlx5_0/1 lqpn 97 rqpn 10643 type RC state RTS rq-psn 12216651 sq-psn 15013520 path-mig-state MIGRATED pdn 6 pid 7443 comm NIO_SERV/3
link mlx5_0/1 lqpn 136 rqpn 27665 type RC state RTS rq-psn 5297131 sq-psn 6313467 path-mig-state MIGRATED comm [rdma_cm]
link mlx5_0/1 lqpn 137 rqpn 27667 type RC state RTS rq-psn 387433 sq-psn 14565436 path-mig-state MIGRATED comm [rdma_cm]
link mlx5_0/1 lqpn 142 rqpn 27692 type RC state RTS rq-psn 4361743 sq-psn 8995677 path-mig-state MIGRATED comm [rdma_cm]
link mlx5_0/1 lqpn 147 rqpn 27679 type RC state RTS rq-psn 987401 sq-psn 10446402 path-mig-state MIGRATED comm [rdma_cm]
link mlx5_0/1 lqpn 150 rqpn 27682 type RC state RTS rq-psn 197389 sq-psn 9257886 path-mig-state MIGRATED comm [rdma_cm]
link <设备/端口> lqpn <本地QP号> [rqpn <远程QP号>] type <类型> state <状态> ... comm <进程名>
各字段说明:
link mlx5_0/1 - 设备 mlx5_0 的端口 1
lqpn - Local QP Number(本地 QP 号)
rqpn - Remote QP Number(远程 QP 号,如果有的话)
type - QP 类型:UD(不可靠数据报)或 RC(可靠连接)
state - QP 状态:RTS(准备发送)
sq-psn - 发送队列数据包序列号
rq-psn - 接收队列数据包序列号
path-mig-state - 路径迁移状态
pdn - Protection Domain Number(保护域号)
pid - 进程ID
comm - 进程名
查看 mlx5_2 的 QP
# 1. 查看 mlx5_2 的所有 QP(模式以 "/" 结尾,避免误匹配 mlx5_20 等名称以 mlx5_2 开头的设备)
rdma resource show qp | grep "link mlx5_2/"
# 2. 统计 mlx5_2 的 QP 数量
rdma resource show qp | grep -c "link mlx5_2/"
# 3. 查看 mlx5_2 的 QP(带行号)
rdma resource show qp | grep -n "link mlx5_2/"
# 4. 查看所有设备(不筛选)
rdma resource show qp | head -20
# 5. 统计每个设备的 QP
rdma resource show qp | awk '{split($2,a,"/"); dev=a[1]; count[dev]++} END{for(d in count) print d, count[d]}' | sort
快速统计命令
# 1. 统计所有 QP
rdma resource show qp | wc -l
# 2. 统计 mlx5_2 的 QP(模式以 "/" 结尾,避免把 mlx5_20 等设备也算进去)
rdma resource show qp | grep -c "link mlx5_2/"
# 3. 统计每个设备的 QP
rdma resource show qp | awk '{split($2,a,"/"); dev=a[1]; count[dev]++} END{for(d in count) print d, count[d]}' | sort
# 4. 统计 QP 类型
rdma resource show qp | awk '{for(i=1;i<=NF;i++) if($i=="type"){type=$(i+1); count[type]++}} END{for(t in count) print t, count[t]}'
# 5. 统计使用 QP 的进程
rdma resource show qp | awk '{for(i=1;i<=NF;i++) if($i=="comm"){print $(i+1)}}' | sort | uniq -c | sort -nr
# 6. 实时监控 mlx5_2 QP 数量(模式以 "/" 结尾,避免把 mlx5_20 等设备也算进去)
watch -n 1 'echo -n "mlx5_2 QP count: "; rdma resource show qp | grep -c "link mlx5_2/"'
# 只看前20行
rdma resource show qp | head -20
# 统计总数
rdma resource show qp | wc -l
# 注意:rdma resource show qp 的输出没有标题行(见上文示例输出),每一行就是一个 QP,
# 因此上面的 wc -l 已经是实际数量;下面的 tail -n +2 会少算一个,仅当输出确实带标题行时才需要
rdma resource show qp | tail -n +2 | wc -l
检查系统日志
# 查看 RDMA 相关日志
dmesg | grep -i rdma | tail -20
dmesg | grep -i mlx | tail -20
# 检查系统日志
journalctl -xe | grep -i rdma | tail -20
统计和分析脚本
#!/bin/bash
# qp_analysis.sh
#
# Summarise the RDMA queue pairs (QPs) listed by `rdma resource show qp`:
# total count, per-device and per-port distribution, QP type and state
# distribution, the ten processes owning the most QPs, and a detail view
# of one device.
#
# Usage: qp_analysis.sh [device]      (device defaults to mlx5_2)
#
# The QP list is captured once and every section works on that snapshot,
# so the numbers in different sections always agree with each other.
# Exit status: 0 on success, 1 when `rdma` is missing or the query fails.

#######################################
# Count the values that follow a keyword in the records read from stdin.
# Arguments: $1 - keyword to look for (e.g. "type", "state")
#            $2 - awk printf format taking (value, count), e.g. '%s: %d\n'
# Outputs:   one formatted line per distinct value, in unspecified order
#######################################
tally_after_key() {
  awk -v key="$1" -v fmt="$2" '
    {
      # Stop before the last field so a trailing keyword without a
      # value is not tallied as an empty string.
      for (i = 1; i < NF; i++) {
        if ($i == key) {
          seen[$(i + 1)]++
          break
        }
      }
    }
    END {
      for (v in seen) {
        printf fmt, v, seen[v]
      }
    }'
}

#######################################
# Print the full report.
# Arguments: $1 - device for the detail section (optional, default mlx5_2)
# Outputs:   report on stdout, diagnostics on stderr
# Returns:   0 on success, 1 if rdma is unavailable or the query fails
#######################################
main() {
  local target_dev=${1:-mlx5_2}
  local tab=$'\t'
  local qp_dump total_qps target_qps target_count

  if ! command -v rdma >/dev/null 2>&1; then
    echo "错误: 未找到 rdma 命令(请安装 iproute2)" >&2
    return 1
  fi
  if ! qp_dump=$(rdma resource show qp); then
    echo "错误: rdma resource show qp 执行失败" >&2
    return 1
  fi

  echo "=== QP 统计分析 ==="
  echo "时间: $(date)"
  echo ""

  # 1. Total number of QPs. Only non-empty lines are counted so that an
  #    empty snapshot yields 0.
  total_qps=$(awk 'NF { n++ } END { print n + 0 }' <<<"$qp_dump")
  echo "1. 总 QP 数量: $total_qps"
  echo ""

  # 2. Per device: field 2 looks like "mlx5_0/1"; keep the part before "/".
  echo "2. 各设备 QP 分布:"
  echo "------------------"
  awk '
    NF >= 2 {
      split($2, parts, "/")
      count[parts[1]]++
    }
    END {
      for (d in count) {
        printf "%-10s: %4d QPs\n", d, count[d]
      }
    }' <<<"$qp_dump" | sort
  echo ""

  # 3. Per device/port pair (field 2 taken verbatim).
  echo "3. 各设备端口分布:"
  echo "------------------"
  awk '
    NF >= 2 { count[$2]++ }
    END {
      for (dp in count) {
        printf "%-12s: %3d QPs\n", dp, count[dp]
      }
    }' <<<"$qp_dump" | sort
  echo ""

  echo "4. QP 类型分布:"
  echo "---------------"
  tally_after_key type '%-5s: %4d QPs\n' <<<"$qp_dump"
  echo ""

  echo "5. QP 状态分布:"
  echo "---------------"
  tally_after_key state '%-10s: %4d QPs\n' <<<"$qp_dump"
  echo ""

  # 6. Top ten QP owners. The count is emitted as the first tab-separated
  #    column so that sort really orders by it; the rows are formatted
  #    only after sorting and truncation.
  echo "6. 进程使用 QP 情况:"
  echo "-------------------"
  awk -v OFS='\t' '
    {
      for (i = 1; i < NF; i++) {
        if ($i == "comm") {
          # The token after "comm" is always taken; following tokens
          # are appended only while they start with "[".
          comm = $(i + 1)
          for (j = i + 2; j <= NF && $j ~ /^\[/; j++) {
            comm = comm " " $j
          }
          count[comm]++
          break
        }
      }
    }
    END {
      for (c in count) {
        print count[c], c
      }
    }' <<<"$qp_dump" \
    | sort -t "$tab" -k1,1nr -k2 \
    | head -10 \
    | awk -F '\t' '{ printf "%-30s: %3d QPs\n", $2, $1 }'
  echo ""

  # 7. Detail view of one device. The device name is compared exactly
  #    (up to the "/" before the port) so that mlx5_2 does not also
  #    select mlx5_20, mlx5_21, ...
  echo "7. ${target_dev} 设备详情(如果存在):"
  echo "-----------------------------"
  target_qps=$(awk -v dev="$target_dev" \
    'NF >= 2 && ($2 == dev || index($2, dev "/") == 1)' <<<"$qp_dump")
  target_count=$(awk 'NF { n++ } END { print n + 0 }' <<<"$target_qps")
  if (( target_count > 0 )); then
    echo "${target_dev} 有 ${target_count} 个 QP:"
    head -5 <<<"$target_qps"
    echo ""
    echo "${target_dev} QP 类型分布:"
    tally_after_key type '  %-5s: %d\n' <<<"$target_qps"
  else
    echo "${target_dev} 没有找到 QP"
    echo "检查设备是否存在:"
    # Prints nothing when the device is not listed by `rdma dev`.
    rdma dev | grep -w -F -- "$target_dev"
  fi
  return 0
}

main "$@"
#!/bin/bash
# qp_check_mlxofed.sh
#
# Quick QP sanity check for a host running MLNX OFED: dumps the QP list,
# filters it for one device in two different ways, then prints device,
# link and ibverbs/ibstat information.
#
# Usage: qp_check_mlxofed.sh [device]      (device defaults to mlx5_2)
#
# No `set -e` on purpose: grep exiting 1 on "no match" and tools that are
# not installed are expected outcomes that the script reports itself.

#######################################
# Run all checks.
# Arguments: $1 - device to filter for (optional, default mlx5_2)
# Outputs:   report on stdout
#######################################
main() {
  local target_dev=${1:-mlx5_2}
  local ofed_version rdma_version qp_dump

  # Report the versions actually installed instead of fixed strings.
  # `ofed_info -s` prints e.g. "MLNX_OFED_LINUX-23.10-2.1.3.1:" and
  # `rdma -V` prints e.g. "rdma utility, iproute2-5.15.0".
  ofed_version=$(ofed_info -s 2>/dev/null) || ofed_version=""
  ofed_version=${ofed_version%:}
  rdma_version=$(rdma -V 2>/dev/null) || rdma_version=""
  rdma_version=${rdma_version#*utility, }

  echo "=== MLNX OFED 23.10 QP Check ==="
  echo "OFED Version: ${ofed_version:-unknown}"
  echo "RDMA Version: ${rdma_version:-unknown}"
  echo "Time: $(date)"
  echo ""

  # Method 1: plain rdma output. The list is captured once and reused by
  # the filters below. An empty result is reported explicitly, because
  # rdma succeeding with no QPs prints nothing at all.
  echo "1. Using rdma command:"
  echo "----------------------"
  qp_dump=$(rdma resource show qp 2>/dev/null) || qp_dump=""
  if [[ -n "$qp_dump" ]]; then
    printf '%s\n' "$qp_dump"
  else
    echo "Command failed or no QPs"
  fi
  echo ""

  echo "2. Filtering for ${target_dev}:"
  echo "-----------------------"
  # -w keeps mlx5_2 from also matching mlx5_20, mlx5_21, ...
  echo "Method 1 - grep:"
  grep -i -w -F -- "$target_dev" <<<"$qp_dump" \
    || echo "No ${target_dev} found with grep"
  echo ""

  echo "Method 2 - awk (all columns):"
  awk -v dev="$target_dev" '
    NF == 0 { next }
    # rdma prints no header row; the first record is dumped field by
    # field under the "Header:" label to show the column layout.
    ++records == 1 {
      print "Header:", $0
      print "Column count:", NF
      for (i = 1; i <= NF; i++) print " Col " i ": \"" $i "\""
    }
    {
      for (i = 1; i <= NF; i++) {
        if ($i == dev || index($i, dev "/") == 1) {
          print "Found:", $0
          break
        }
      }
    }
    # awk exits 0 even on empty input, so the fallback is printed here
    # rather than after a shell "||".
    END { if (records == 0) print "No output" }' <<<"$qp_dump"
  echo ""

  echo "3. Device information:"
  echo "----------------------"
  rdma dev
  echo ""
  echo "Active links:"
  rdma link show 2>/dev/null | head -10
  echo ""

  echo "4. Using ibverbs tools:"
  echo "-----------------------"
  if command -v ibv_devices &>/dev/null; then
    ibv_devices
  else
    echo "ibv_devices not found"
  fi
  if command -v ibstat &>/dev/null; then
    echo ""
    ibstat "$target_dev" 2>/dev/null \
      || echo "ibstat not available for ${target_dev}"
  else
    echo "ibstat not found"
  fi
}

main "$@"