Lss-bev系列-1-部署插件inverse

总结

在将该项目导出为 ONNX 时，会出现算子不支持的报错。这里首先分析 torch.inverse 这个函数。

pytorch引用

原始代码中对该算子的使用主要出现在下面两处。可以看到，它实际上是对张量最后两维构成的 3×3 矩阵求逆，主要用于坐标转换。

python 复制代码

# Undo the post-rotation: invert post_rots, reshape the inverses to
# (B, N, 1, 1, 1, 3, 3), and left-multiply the point column vectors.
points = torch.inverse(post_rots).view(B, N, 1, 1, 1, 3, 3).matmul(points.unsqueeze(-1))

python 复制代码

# Right-multiply rots by the inverse of intrins.
combine = rots.matmul(torch.inverse(intrins))

自定义插件-pytorch

进行定义一个求逆计算规则

python 复制代码

class InverseFunction(torch.autograd.Function):
    """Matrix inverse that exports to ONNX as a custom ``MatrixInverse`` node.

    In eager mode this behaves like ``torch.inverse`` (including gradients for
    real-valued inputs). During ONNX export, ``symbolic`` emits the custom op
    ``xyz.onnx.contrib::MatrixInverse`` in place of the inverse.
    """

    @staticmethod
    def forward(ctx, input):
        """Return ``torch.inverse(input)`` and keep the result for ``backward``."""
        output = torch.inverse(input)
        ctx.save_for_backward(output)
        return output

    @staticmethod
    def backward(ctx, grad_output):
        """Return the gradient w.r.t. ``input`` for real-valued matrices.

        With Y = X^-1, dL/dX = -Y^T @ dL/dY @ Y^T. Without this method any
        autograd pass through the function raises NotImplementedError.
        """
        (inverse,) = ctx.saved_tensors
        inverse_t = inverse.transpose(-1, -2)
        return -inverse_t.matmul(grad_output).matmul(inverse_t)

    @staticmethod
    def symbolic(g, input):
        """Emit the custom ONNX node; its output type is copied from the input."""
        output = g.op("xyz.onnx.contrib::MatrixInverse", input)
        # The inverse has the same type/shape as the input, so reuse its type.
        output.setType(input.type())
        return output

在pytorch中使用此计算规则，替换不可直接支持的计算语句

python 复制代码

# Original call, kept for reference:
# points = torch.inverse(post_rots).view(B, N, 1, 1, 1, 3, 3).matmul(points.unsqueeze(-1))     # undo the rotation -> B N D H W 3 1
points = InverseFunction.apply(post_rots).view(B, N, 1, 1, 1, 3, 3).matmul(points.unsqueeze(-1))

自定义插件-c++

C++ 插件主要用于在导出 TRT 引擎时为自定义算子提供实现，因此这里需要实现矩阵求逆算子。好在这里只涉及 3×3 的矩阵，我们可以直接利用线性代数中 3×3 矩阵求逆的解析公式（伴随矩阵除以行列式）来实现，不需要写成复杂的 n×n 通用矩阵求逆，只给出 3×3 的特例计算即可。

cu文件

c 复制代码

#include <cuda_runtime.h>
#include <cublas_v2.h>
#include <stdio.h>

// CUDA kernel for 3x3 matrix inversion using analytical method
// Inverts a batch of 3x3 float matrices analytically, one matrix per thread.
//
// @param input        Pointer to numMatrices * 9 floats; each matrix occupies
//                     9 consecutive floats laid out as
//                         [0 1 2]
//                         [3 4 5]
//                         [6 7 8]
// @param output       Pointer to numMatrices * 9 floats that receives the
//                     inverses in the same layout.
// @param numMatrices  Number of matrices; threads whose global index is not
//                     below this value return without touching memory.
//
// If |det| < 1e-10 the matrix is treated as singular and the identity matrix
// is written instead of an inverse. All nine inputs are read into locals
// before the first store, so the result stays correct when `output` points at
// the same memory as `input`.
__global__ void matrixInverse3x3Kernel(const float* input, float* output, int numMatrices)
{
    const int idx = blockIdx.x * blockDim.x + threadIdx.x;
    if (idx >= numMatrices) return;

    // Widen before scaling so idx * 9 cannot overflow int for large batches.
    const size_t offset = static_cast<size_t>(idx) * 9;
    const float* mat = input + offset;
    float* inv = output + offset;

    // Snapshot the whole matrix before writing anything.
    const float m0 = mat[0];
    const float m1 = mat[1];
    const float m2 = mat[2];
    const float m3 = mat[3];
    const float m4 = mat[4];
    const float m5 = mat[5];
    const float m6 = mat[6];
    const float m7 = mat[7];
    const float m8 = mat[8];

    // Determinant by cofactor expansion along the first row.
    const float det = m0 * (m4 * m8 - m5 * m7)
                    - m1 * (m3 * m8 - m5 * m6)
                    + m2 * (m3 * m7 - m4 * m6);

    // Singular (or numerically near-singular) matrix: fall back to identity.
    if (fabsf(det) < 1e-10f) {
        for (int i = 0; i < 9; ++i) {
            inv[i] = (i % 4 == 0) ? 1.0f : 0.0f;  // indices 0, 4, 8 are the diagonal
        }
        return;
    }

    const float invDet = 1.0f / det;

    // inverse = adjugate / det, where the adjugate is the transposed cofactor matrix.
    inv[0] = (m4 * m8 - m5 * m7) * invDet;
    inv[1] = (m2 * m7 - m1 * m8) * invDet;
    inv[2] = (m1 * m5 - m2 * m4) * invDet;

    inv[3] = (m5 * m6 - m3 * m8) * invDet;
    inv[4] = (m0 * m8 - m2 * m6) * invDet;
    inv[5] = (m2 * m3 - m0 * m5) * invDet;

    inv[6] = (m3 * m7 - m4 * m6) * invDet;
    inv[7] = (m1 * m6 - m0 * m7) * invDet;
    inv[8] = (m0 * m4 - m1 * m3) * invDet;
}

// Host-side launcher: inverts `batchSize` matrices of size 3x3 on `stream`.
//
// @param input       Device pointer to batchSize * 9 floats.
// @param output      Device pointer to batchSize * 9 floats for the inverses.
// @param batchSize   Number of matrices; values <= 0 make the call a no-op.
// @param matrixSize  Must be 3; any other value prints an error and returns.
// @param stream      CUDA stream the kernel is launched on.
//
// Errors are reported with printf; the function itself returns nothing.
extern "C" void matrixInverseBatchedCUDA(
    const float* input, float* output, int batchSize, int matrixSize, cudaStream_t stream)
{
    if (matrixSize != 3) {
        printf("Error: Only 3x3 matrices are supported. Got size: %d\n", matrixSize);
        return;
    }

    // Nothing to invert. Launching with a zero-block grid would be rejected
    // as an invalid configuration, so return before computing the grid.
    if (batchSize <= 0) {
        return;
    }

    if (input == nullptr || output == nullptr) {
        printf("Error: matrixInverseBatchedCUDA received a null buffer\n");
        return;
    }

    // One thread per matrix; round the block count up to cover the whole batch.
    constexpr int threadsPerBlock = 256;
    const int blocksPerGrid = (batchSize + threadsPerBlock - 1) / threadsPerBlock;

    matrixInverse3x3Kernel<<<blocksPerGrid, threadsPerBlock, 0, stream>>>(
        input, output, batchSize);

    // Surface launch-time errors (the kernel itself runs asynchronously).
    const cudaError_t err = cudaGetLastError();
    if (err != cudaSuccess) {
        printf("CUDA Error in matrixInverseBatchedCUDA: %s\n", cudaGetErrorString(err));
    }
}

cpp文件

c 复制代码

#include "MatrixInversePlugin.h"
#include "PluginUtils.h"
#include <cassert>
#include <cstring>

using namespace nvinfer1;
using namespace plugin;

namespace {
// Version string returned by the plugin and by its creator.
const char* MATRIX_INVERSE_PLUGIN_VERSION = "1";
// Plugin type/name string returned by the plugin and by its creator.
const char* MATRIX_INVERSE_PLUGIN_NAME = "MatrixInverse";
}  // namespace

// Declaration of the CUDA launcher that inverts a batch of matrices on `stream`.
// `input` and `output` are float buffers; `batchSize` is the number of matrices
// and `matrixSize` the matrix dimension passed through to the launcher.
extern "C" void matrixInverseBatchedCUDA(
    const float* input, float* output, int batchSize, int matrixSize, cudaStream_t stream);

// 构造函数
MatrixInversePlugin::MatrixInversePlugin()
    : mNamespace("")
    , mDataType(DataType::kFLOAT)
    , mNumMatrices(0)
    , mMatrixSize(3)
{
}

// Deserializing constructor: restores the fields written by serialize().
//
// @param data    Buffer holding, in order, the data type, the matrix count and
//                the matrix size.
// @param length  Size of `data` in bytes; must equal getSerializationSize().
//
// The length is validated BEFORE reading so that a short or null buffer cannot
// cause an out-of-bounds read in builds where assert() is compiled out. On a
// mismatch the plugin keeps the same defaults as the default constructor.
MatrixInversePlugin::MatrixInversePlugin(const void* data, size_t length)
    : mNamespace("")
    , mDataType(DataType::kFLOAT)
    , mNumMatrices(0)
    , mMatrixSize(3)
{
    const bool bufferIsValid = (data != nullptr) && (length == getSerializationSize());
    assert(bufferIsValid);
    if (!bufferIsValid) {
        return;
    }

    const char* cursor = static_cast<const char*>(data);
    mDataType = plugin::read<DataType>(cursor);
    mNumMatrices = plugin::read<int32_t>(cursor);
    mMatrixSize = plugin::read<int32_t>(cursor);
}

// Destructor: invokes terminate() before the object goes away.
MatrixInversePlugin::~MatrixInversePlugin()
{
    this->terminate();
}
// Returns a heap-allocated copy of this plugin carrying the same data type,
// matrix count, matrix size and namespace, or nullptr if the copy could not be
// created. The function is noexcept, so an exception from `new` is caught here
// instead of escaping and terminating the process.
IPluginV2DynamicExt* MatrixInversePlugin::clone() const noexcept
{
    try {
        MatrixInversePlugin* copy = new MatrixInversePlugin();
        copy->mDataType = this->mDataType;
        copy->mNumMatrices = this->mNumMatrices;
        copy->mMatrixSize = this->mMatrixSize;
        copy->setPluginNamespace(mNamespace.c_str());
        return copy;
    } catch (...) {
        // Allocation failed; signal failure through the return value.
        return nullptr;
    }
}

// Output shape query: the single output has exactly the dimensions of the
// single input. `exprBuilder` is not used.
DimsExprs MatrixInversePlugin::getOutputDimensions(
    int32_t outputIndex, const DimsExprs* inputs, int32_t nbInputs, IExprBuilder& exprBuilder) noexcept
{
    assert(outputIndex == 0 && nbInputs == 1);
    const DimsExprs& inputShape = inputs[0];
    return inputShape;
}

bool MatrixInversePlugin::supportsFormatCombination(
    int32_t pos, const PluginTensorDesc* inOut, int32_t nbInputs, int32_t nbOutputs) noexcept
{
    assert(nbInputs == 1 && nbOutputs == 1);
    assert(pos < nbInputs + nbOutputs);
    
    // 只支持 FP32 + Linear 格式
    const PluginTensorDesc& desc = inOut[pos];
    return (desc.type == DataType::kFLOAT) && (desc.format == TensorFormat::kLINEAR);
}

void MatrixInversePlugin::configurePlugin(
    const DynamicPluginTensorDesc* in, int32_t nbInputs,
    const DynamicPluginTensorDesc* out, int32_t nbOutputs) noexcept
{
    assert(nbInputs == 1 && nbOutputs == 1);
    
    mDataType = in[0].desc.type;
    const auto& dims = in[0].desc.dims;
    int32_t totalSize = 1;
    for (int32_t i = 0; i < dims.nbDims; ++i) {
        totalSize *= dims.d[i];
    }
    mMatrixSize = 3;
    mNumMatrices = totalSize / (mMatrixSize * mMatrixSize);
}

// Returns the scratch-memory size the plugin needs, in bytes.
// enqueue() never reads or writes its `workspace` pointer, so no scratch
// memory is requested. The previous value was derived from the cached matrix
// count, which reserved unused memory and could turn into a huge size_t if
// that count was ever negative.
size_t MatrixInversePlugin::getWorkspaceSize(
    const PluginTensorDesc* inputs, int32_t nbInputs,
    const PluginTensorDesc* outputs, int32_t nbOutputs) const noexcept
{
    return 0;
}

int32_t MatrixInversePlugin::enqueue(
    const PluginTensorDesc* inputDesc, const PluginTensorDesc* outputDesc,
    const void* const* inputs, void* const* outputs,
    void* workspace, cudaStream_t stream) noexcept
{
    const float* input = static_cast<const float*>(inputs[0]);
    float* output = static_cast<float*>(outputs[0]);
    
    // 调用CUDA kernel进行批量矩阵求逆
    matrixInverseBatchedCUDA(input, output, mNumMatrices, mMatrixSize, stream);
    
    return 0;
}

// Returns the plugin type string (MATRIX_INVERSE_PLUGIN_NAME).
const char* MatrixInversePlugin::getPluginType() const noexcept
{
    const char* const pluginType = MATRIX_INVERSE_PLUGIN_NAME;
    return pluginType;
}

// Returns the plugin version string (MATRIX_INVERSE_PLUGIN_VERSION).
const char* MatrixInversePlugin::getPluginVersion() const noexcept
{
    const char* const pluginVersion = MATRIX_INVERSE_PLUGIN_VERSION;
    return pluginVersion;
}

// The plugin always produces exactly one output tensor.
int32_t MatrixInversePlugin::getNbOutputs() const noexcept
{
    constexpr int32_t outputCount = 1;
    return outputCount;
}

// No resources are acquired here; the function always returns 0.
int32_t MatrixInversePlugin::initialize() noexcept
{
    constexpr int32_t status = 0;
    return status;
}

// Intentionally empty: initialize() acquires nothing, so there is nothing to
// release. Also called from the destructor.
void MatrixInversePlugin::terminate() noexcept
{
}

size_t MatrixInversePlugin::getSerializationSize() const noexcept
{
    return sizeof(DataType) + sizeof(int32_t) * 2;
}

// Writes the plugin state into `buffer` in the order: data type, matrix count,
// matrix size. In debug builds, checks that exactly getSerializationSize()
// bytes were produced.
void MatrixInversePlugin::serialize(void* buffer) const noexcept
{
    char* cursor = static_cast<char*>(buffer);
    const char* begin = cursor;

    plugin::write(cursor, mDataType);
    plugin::write(cursor, mNumMatrices);
    plugin::write(cursor, mMatrixSize);

    assert(cursor == begin + getSerializationSize());
}

// Deletes this plugin object. The object must have been allocated with `new`
// (as clone() and the creator do) and must not be used after this call.
void MatrixInversePlugin::destroy() noexcept
{
    delete this;
}

// Stores the plugin namespace. A null pointer is treated as the empty
// namespace, because assigning a null `const char*` to std::string is
// undefined behaviour.
void MatrixInversePlugin::setPluginNamespace(const char* pluginNamespace) noexcept
{
    mNamespace = (pluginNamespace != nullptr) ? pluginNamespace : "";
}

// Returns the stored namespace as a null-terminated string owned by this object.
const char* MatrixInversePlugin::getPluginNamespace() const noexcept
{
    const std::string& stored = mNamespace;
    return stored.c_str();
}

// The single output has the same data type as the first input.
DataType MatrixInversePlugin::getOutputDataType(
    int32_t index, const nvinfer1::DataType* inputTypes, int32_t nbInputs) const noexcept
{
    assert(index == 0);
    const DataType firstInputType = inputTypes[0];
    return firstInputType;
}

// MatrixInversePluginCreator implementation.
// Definitions of the creator's static members: the field collection returned
// by getFieldNames() and the vector that backs its `fields` pointer.
PluginFieldCollection MatrixInversePluginCreator::mFC{};
std::vector<PluginField> MatrixInversePluginCreator::mPluginAttributes;

// The plugin takes no attributes: empty the shared attribute list and point
// the shared field collection at it.
MatrixInversePluginCreator::MatrixInversePluginCreator()
{
    mPluginAttributes.clear();
    mFC.fields = mPluginAttributes.data();
    mFC.nbFields = mPluginAttributes.size();
}

// Returns the plugin name this creator reports (MATRIX_INVERSE_PLUGIN_NAME).
const char* MatrixInversePluginCreator::getPluginName() const noexcept
{
    const char* const pluginName = MATRIX_INVERSE_PLUGIN_NAME;
    return pluginName;
}

// Returns the plugin version this creator reports (MATRIX_INVERSE_PLUGIN_VERSION).
const char* MatrixInversePluginCreator::getPluginVersion() const noexcept
{
    const char* const pluginVersion = MATRIX_INVERSE_PLUGIN_VERSION;
    return pluginVersion;
}

// Returns the address of the shared (static) field collection set up by the
// constructor.
const PluginFieldCollection* MatrixInversePluginCreator::getFieldNames() noexcept
{
    const PluginFieldCollection* const fieldCollection = &mFC;
    return fieldCollection;
}

// Creates a default-configured plugin and tags it with the creator's
// namespace. `name` and `fc` are not used because the plugin has no
// attributes. Returns nullptr if allocation fails: the function is noexcept,
// so an exception from `new` must not be allowed to escape.
IPluginV2* MatrixInversePluginCreator::createPlugin(const char* name, const PluginFieldCollection* fc) noexcept
{
    try {
        MatrixInversePlugin* created = new MatrixInversePlugin();
        created->setPluginNamespace(mNamespace.c_str());
        return created;
    } catch (...) {
        return nullptr;
    }
}

// Rebuilds a plugin from `serialLength` bytes at `serialData` and tags it with
// the creator's namespace. `name` is not used. Returns nullptr when
// `serialData` is null or when allocation fails: the function is noexcept, so
// an exception from `new` must not be allowed to escape.
IPluginV2* MatrixInversePluginCreator::deserializePlugin(
    const char* name, const void* serialData, size_t serialLength) noexcept
{
    if (serialData == nullptr) {
        return nullptr;
    }
    try {
        MatrixInversePlugin* restored = new MatrixInversePlugin(serialData, serialLength);
        restored->setPluginNamespace(mNamespace.c_str());
        return restored;
    } catch (...) {
        return nullptr;
    }
}

// Stores the namespace handed to plugins this creator produces. A null pointer
// is treated as the empty namespace, because assigning a null `const char*`
// to std::string is undefined behaviour.
void MatrixInversePluginCreator::setPluginNamespace(const char* pluginNamespace) noexcept
{
    mNamespace = (pluginNamespace != nullptr) ? pluginNamespace : "";
}

// Returns the stored namespace as a null-terminated string owned by this creator.
const char* MatrixInversePluginCreator::getPluginNamespace() const noexcept
{
    const std::string& stored = mNamespace;
    return stored.c_str();
}

h文件

c 复制代码

#ifndef MATRIX_INVERSE_PLUGIN_H
#define MATRIX_INVERSE_PLUGIN_H

#include <NvInfer.h>
#include <vector>
#include <string>
#include <cuda_runtime.h>

namespace nvinfer1 {
namespace plugin {

// Plugin class derived from IPluginV2DynamicExt. Only declarations live here;
// every member function is defined out of line.
class MatrixInversePlugin : public IPluginV2DynamicExt {
public:
    // ---- Construction / destruction ----
    MatrixInversePlugin();
    // Builds a plugin from `length` bytes of previously serialized state at `data`.
    MatrixInversePlugin(const void* data, size_t length);
    ~MatrixInversePlugin() override;

    // ---- Dynamic-shape plugin interface ----
    IPluginV2DynamicExt* clone() const noexcept override;

    DimsExprs getOutputDimensions(
        int32_t outputIndex, const DimsExprs* inputs, int32_t nbInputs,
        IExprBuilder& exprBuilder) noexcept override;

    bool supportsFormatCombination(
        int32_t pos, const PluginTensorDesc* inOut,
        int32_t nbInputs, int32_t nbOutputs) noexcept override;

    void configurePlugin(
        const DynamicPluginTensorDesc* in, int32_t nbInputs,
        const DynamicPluginTensorDesc* out, int32_t nbOutputs) noexcept override;

    size_t getWorkspaceSize(
        const PluginTensorDesc* inputs, int32_t nbInputs,
        const PluginTensorDesc* outputs, int32_t nbOutputs) const noexcept override;

    int32_t enqueue(
        const PluginTensorDesc* inputDesc, const PluginTensorDesc* outputDesc,
        const void* const* inputs, void* const* outputs,
        void* workspace, cudaStream_t stream) noexcept override;

    // ---- Identification, lifecycle and serialization ----
    const char* getPluginType() const noexcept override;
    const char* getPluginVersion() const noexcept override;
    int32_t getNbOutputs() const noexcept override;
    int32_t initialize() noexcept override;
    void terminate() noexcept override;
    size_t getSerializationSize() const noexcept override;
    void serialize(void* buffer) const noexcept override;
    void destroy() noexcept override;
    void setPluginNamespace(const char* pluginNamespace) noexcept override;
    const char* getPluginNamespace() const noexcept override;
    DataType getOutputDataType(
        int32_t index, const nvinfer1::DataType* inputTypes,
        int32_t nbInputs) const noexcept override;

private:
    std::string mNamespace;  // plugin namespace string
    DataType mDataType;      // tensor data type
    int32_t mNumMatrices;    // number of matrices
    int32_t mMatrixSize;     // matrix size
};

// Creator class derived from IPluginCreator. Only declarations live here;
// every member function except the defaulted destructor is defined out of line.
class MatrixInversePluginCreator : public IPluginCreator {
public:
    MatrixInversePluginCreator();
    ~MatrixInversePluginCreator() override = default;

    // ---- Identification ----
    const char* getPluginName() const noexcept override;
    const char* getPluginVersion() const noexcept override;
    const PluginFieldCollection* getFieldNames() noexcept override;

    // ---- Plugin construction ----
    IPluginV2* createPlugin(
        const char* name, const PluginFieldCollection* fc) noexcept override;
    IPluginV2* deserializePlugin(
        const char* name, const void* serialData,
        size_t serialLength) noexcept override;

    // ---- Namespace handling ----
    void setPluginNamespace(const char* pluginNamespace) noexcept override;
    const char* getPluginNamespace() const noexcept override;

private:
    static PluginFieldCollection mFC;                   // shared by all creator instances
    static std::vector<PluginField> mPluginAttributes;  // shared by all creator instances
    std::string mNamespace;                             // per-instance namespace string
};

} // namespace plugin
} // namespace nvinfer1

#endif // MATRIX_INVERSE_PLUGIN_H

测试

写好插件后，为了确认它与 Python 实现的结果一致，最好的验证方式是写一个验证脚本：先把 PyTorch 推理过程中该算子的输入数据保存下来，然后构建一个只包含这一层插件的 TRT 引擎，将同样的输入分别送入 PyTorch 代码和 TRT 引擎，通过对比两者的输出来判断插件是否正确工作。

数据保存与加载

保存的例子

python 复制代码

# np.save(f"{debug_dir}/post_rots.npy", post_rots.cpu().numpy())
# np.save(f"{debug_dir}/intrins.npy", intrins.cpu().numpy())

加载并对比插件

python 复制代码

# TensorRT MatrixInverse 插件验证
import torch
import numpy as np
import tensorrt as trt
import os
import sys
import ctypes
from typing import Tuple, Optional

# Absolute path of the plugin shared library, resolved relative to this
# script's directory: ../cpp/build/lib/liblss_trt_plugins.so
PLUGIN_PATH = os.path.abspath(
    os.path.join(os.path.dirname(__file__), "..", "cpp", "build", "lib", "liblss_trt_plugins.so")
)


def load_trt_plugins():
    """Load the TensorRT plugin shared library located at ``PLUGIN_PATH``.

    Raises:
        FileNotFoundError: if ``PLUGIN_PATH`` does not exist.
    """
    print(f"Loading plugin: {PLUGIN_PATH}")
    if not os.path.exists(PLUGIN_PATH):
        raise FileNotFoundError(f"Plugin not found: {PLUGIN_PATH}")

    # Load the library through ctypes so that its static initializers run.
    ctypes.CDLL(PLUGIN_PATH)
    trt.init_libnvinfer_plugins(None, "")
    print("Plugin loaded successfully!")


def create_inverse_engine():
    """Build a TensorRT engine containing only the MatrixInverse plugin layer.

    The network has one dynamic 4D input ``[batch, N, 3, 3]`` named "input"
    and one output, the plugin's result.

    Returns:
        The deserialized TensorRT engine.

    Raises:
        RuntimeError: if the plugin creator is not registered, the plugin
            cannot be created, or the engine fails to build or deserialize.
    """
    logger = trt.Logger(trt.Logger.ERROR)
    builder = trt.Builder(logger)

    network = builder.create_network(1 << int(trt.NetworkDefinitionCreationFlag.EXPLICIT_BATCH))

    # Dynamic shape: batch and camera count vary, each matrix is 3x3.
    input_tensor = network.add_input("input", trt.DataType.FLOAT, [-1, -1, 3, 3])

    # Look up the plugin creator; it is None when the library was not loaded.
    plugin_creator = trt.get_plugin_registry().get_plugin_creator("MatrixInverse", "1", "")
    if plugin_creator is None:
        raise RuntimeError("MatrixInverse plugin creator not found in the TensorRT plugin registry")

    # The plugin takes no attributes, so an empty field collection is enough.
    plugin = plugin_creator.create_plugin("MatrixInverse", trt.PluginFieldCollection())
    if plugin is None:
        raise RuntimeError("Failed to create the MatrixInverse plugin")

    inverse_layer = network.add_plugin_v2([input_tensor], plugin)
    network.mark_output(inverse_layer.get_output(0))

    config = builder.create_builder_config()
    # Replaces the deprecated `max_workspace_size` attribute (1 MB limit).
    config.set_memory_pool_limit(trt.MemoryPoolType.WORKSPACE, 1 << 20)

    # Optimization profile covering the dynamic batch / camera dimensions.
    profile = builder.create_optimization_profile()
    profile.set_shape("input", [1, 1, 3, 3], [10, 10, 3, 3], [100, 20, 3, 3])
    config.add_optimization_profile(profile)

    # Replaces the deprecated `build_engine`: build a serialized plan, then
    # deserialize it with a runtime.
    serialized_engine = builder.build_serialized_network(network, config)
    if serialized_engine is None:
        raise RuntimeError("TensorRT failed to build the MatrixInverse engine")

    runtime = trt.Runtime(logger)
    engine = runtime.deserialize_cuda_engine(serialized_engine)
    if engine is None:
        raise RuntimeError("TensorRT failed to deserialize the MatrixInverse engine")
    return engine


def run_trt_inference(engine, input_data: np.ndarray) -> np.ndarray:
    """Run one inference on ``engine`` and return the output as a NumPy array.

    Args:
        engine: TensorRT engine with an input tensor named "input" and one
            output of the same shape.
        input_data: input matrices; converted to contiguous float32.

    Returns:
        float32 array with the same shape as ``input_data``.

    Raises:
        RuntimeError: if TensorRT reports that execution failed.
    """
    context = engine.create_execution_context()

    # The raw device pointer is handed to TensorRT, so the host data must be
    # contiguous float32 before it is copied to the GPU.
    input_np = np.ascontiguousarray(input_data, dtype=np.float32)
    context.set_input_shape("input", tuple(input_np.shape))

    # Device buffers (replaces the deprecated torch.cuda.FloatTensor constructors).
    d_input = torch.from_numpy(input_np).cuda()
    d_output = torch.empty(input_np.shape, dtype=torch.float32, device="cuda")

    if not context.execute_v2([d_input.data_ptr(), d_output.data_ptr()]):
        raise RuntimeError("TensorRT execute_v2 failed")

    torch.cuda.synchronize()
    return d_output.cpu().numpy()


def torch_inverse(post_rots: torch.Tensor) -> torch.Tensor:
    """Reference implementation: invert ``post_rots`` with ``torch.inverse``."""
    reference = torch.inverse(post_rots)
    return reference


def verify_with_npy(data_path: str) -> Tuple[bool, dict]:
    """Verify the plugin against matrices stored in a ``.npy`` file.

    Args:
        data_path: path of the ``.npy`` file to load.

    Returns:
        ``(passed, stats)`` as returned by ``verify_with_tensor``.

    Raises:
        FileNotFoundError: if ``data_path`` does not exist.
    """
    if not os.path.exists(data_path):
        raise FileNotFoundError(f"数据文件不存在: {data_path}")

    matrices_np = np.load(data_path)
    print(f"加载数据: {data_path}")
    print(f"形状: {matrices_np.shape}, dtype: {matrices_np.dtype}")

    # Hand the data over as a float32 tensor.
    return verify_with_tensor(torch.from_numpy(matrices_np).float())


def verify_with_tensor(test_matrices: torch.Tensor) -> Tuple[bool, dict]:
    """Compare the TensorRT plugin against ``torch.inverse`` on given matrices.

    Args:
        test_matrices: input matrices whose last two dimensions are 3x3. The
            engine built here expects a 4D tensor.

    Returns:
        ``(passed, stats)`` where ``stats`` contains:
          * ``num_samples``: number of 3x3 matrices tested,
          * ``max_error`` / ``mean_error``: absolute difference between the
            plugin output and the PyTorch reference,
          * ``inverse_property_error``: max ``|A @ A_inv - I|`` using the
            PLUGIN output as ``A_inv``, so it checks the code under test
            rather than the reference,
          * ``passed``: True when both max errors are below 1e-5.
    """
    load_trt_plugins()
    engine = create_inverse_engine()

    print(f"测试数据形状: {test_matrices.shape}")

    # Compare on the CPU in contiguous float32, whatever the caller passed in.
    test_matrices = test_matrices.detach().cpu().float().contiguous()

    # Reference result.
    pytorch_results = torch_inverse(test_matrices)

    # Plugin result.
    trt_results = torch.from_numpy(run_trt_inference(engine, test_matrices.numpy()))

    # Element-wise deviation from the reference.
    diff = torch.abs(pytorch_results - trt_results)
    max_error = float(torch.max(diff))
    mean_error = float(torch.mean(diff))

    # Check A @ A^-1 = I with the plugin's inverse.
    flat_inputs = test_matrices.reshape(-1, 3, 3)
    product = torch.bmm(flat_inputs, trt_results.reshape(-1, 3, 3))
    inv_error = torch.abs(product - torch.eye(3))
    inv_max_error = float(torch.max(inv_error))

    stats = {
        # Count every 3x3 matrix, not just the leading batch dimension.
        "num_samples": flat_inputs.shape[0],
        "max_error": max_error,
        "mean_error": mean_error,
        "inverse_property_error": inv_max_error,
        "passed": max_error < 1e-5 and inv_max_error < 1e-5
    }

    return stats["passed"], stats


# python test_inverse_plugin.py /path/to/post_rots.npy


if __name__ == "__main__":
    # Script entry point: verify the plugin against the .npy file given as the
    # first command-line argument and report the statistics.
    print("=" * 50)
    print("TensorRT MatrixInverse Plugin Verification")
    print("=" * 50)

    # Without this check a missing argument surfaces as a bare IndexError.
    if len(sys.argv) < 2:
        print(f"Usage: python {os.path.basename(sys.argv[0])} /path/to/post_rots.npy")
        sys.exit(2)

    passed, stats = verify_with_npy(sys.argv[1])

    print(f"\n Test Results:")
    print(f"   Samples: {stats.get('num_samples', stats.get('num_tests', 'N/A'))}")
    print(f"   Max Error (vs PyTorch): {stats['max_error']:.2e}")
    print(f"   Mean Error (vs PyTorch): {stats['mean_error']:.2e}")
    print(f"   A @ A⁻¹ = I Error: {stats['inverse_property_error']:.2e}")
    print(f"\n{'PASSED' if passed else 'FAILED'}")
    print("=" * 50)

    # Make the result visible to shells and CI through the exit status.
    sys.exit(0 if passed else 1)

结果

shell 复制代码

Loading plugin: /workspace/current_workspace/cpp/build/lib/liblss_trt_plugins.so
Plugin loaded successfully!
/workspace/current_workspace/py/test_inverse_plugin.py:56: DeprecationWarning: Use set_memory_pool_limit instead.
  config.max_workspace_size = 1 << 20  # 1MB
/workspace/current_workspace/py/test_inverse_plugin.py:63: DeprecationWarning: Use build_serialized_network instead.
  engine = builder.build_engine(network, config)
测试数据形状: torch.Size([1, 5, 3, 3])
/workspace/current_workspace/py/test_inverse_plugin.py:79: UserWarning: The torch.cuda.*DtypeTensor constructors are no longer recommended. It's best to use methods such as torch.tensor(data, dtype=*, device='cuda') to create tensors. (Triggered internally at ../torch/csrc/tensor/python_tensor.cpp:78.)
  d_input = torch.cuda.FloatTensor(input_np).cuda()

 Test Results:
   Samples: 1
   Max Error (vs PyTorch): 4.77e-07
   Mean Error (vs PyTorch): 4.34e-08
   A @ A⁻¹ = I Error: 3.73e-09

PASSED
==================================================