Lss-bev系列-1-部署插件inverse

Lss-bev系列-1-部署插件inverse

总结

在导出该项目onnx时候,会产生不支持算子的报错。这里首先分析torch.inverse这个函数。

pytorch引用

原始代码中的使用主要发生在这两个部分,其中可以看到的是该算子其实是对最后的3*3维度的矩阵进行矩阵求逆,主要应用在进行坐标转换上的。

python 复制代码
# Invert the trailing 3x3 matrices of `post_rots`, reshape to (B, N, 1, 1, 1, 3, 3) and apply them to the points.
points = torch.inverse(post_rots).view(B, N, 1, 1, 1, 3, 3).matmul(points.unsqueeze(-1))
python 复制代码
# Multiply `rots` by the inverse of `intrins` (presumably the camera intrinsics -- confirm against the caller).
combine = rots.matmul(torch.inverse(intrins))

自定义插件-pytorch

定义一个求逆计算规则(自定义的 autograd Function,用于在导出ONNX时映射为自定义算子)

python 复制代码
class InverseFunction(torch.autograd.Function):
    """Matrix inverse that exports to ONNX as a custom ``MatrixInverse`` node.

    ``forward`` runs the native ``torch.inverse`` so eager execution is
    unchanged, while ``symbolic`` tells the ONNX exporter to emit the custom
    op ``xyz.onnx.contrib::MatrixInverse`` instead of failing on an
    unsupported operator.
    """

    @staticmethod
    def forward(ctx, input):
        """Return the inverse of ``input`` computed with ``torch.inverse``."""
        output = torch.inverse(input)
        # The inverse itself is all that the gradient formula needs.
        ctx.save_for_backward(output)
        return output

    @staticmethod
    def backward(ctx, grad_output):
        """Gradient of the matrix inverse for real-valued inputs.

        For Y = X^-1 the gradient is dL/dX = -Y^T @ dL/dY @ Y^T.  Without this
        method, calling ``backward()`` through the function raises because
        ``torch.autograd.Function`` has no default implementation.
        """
        (output,) = ctx.saved_tensors
        inv_t = output.transpose(-2, -1)
        return -inv_t.matmul(grad_output).matmul(inv_t)

    @staticmethod
    def symbolic(g, input):
        """Emit the custom ONNX node used in place of ``torch.inverse``."""
        output = g.op("xyz.onnx.contrib::MatrixInverse", input)
        # The inverse has the same dtype and shape as its input; set it
        # explicitly because the exporter cannot infer it for a custom op.
        output.setType(input.type())
        return output

在pytorch中使用此计算规则,替换不可直接支持的计算语句

python 复制代码
# Original call, kept for reference; replaced because torch.inverse is not supported by the ONNX export:
# points = torch.inverse(post_rots).view(B, N, 1, 1, 1, 3, 3).matmul(points.unsqueeze(-1))     # undo the rotation -> B N D H W 3 1
points = InverseFunction.apply(post_rots).view(B, N, 1, 1, 1, 3, 3).matmul(points.unsqueeze(-1))

自定义插件-c++

C++插件主要用于在导出TensorRT引擎时为自定义算子提供实现,因此这里需要实现矩阵求逆算子。好在这里只涉及3×3的矩阵,可以直接套用线性代数中3×3矩阵求逆的解析公式(伴随矩阵除以行列式),不需要实现通用的n×n矩阵求逆,只给出3×3的特例计算即可。

cu文件

c 复制代码
#include <cuda_runtime.h>
#include <cublas_v2.h>
#include <stdio.h>

// Inverts a batch of 3x3 float matrices with the closed-form formula
// inverse = adjugate / determinant. Each thread handles one matrix.
//
// input       : numMatrices * 9 floats, each matrix stored as
//               [0 1 2]
//               [3 4 5]
//               [6 7 8]
// output      : numMatrices * 9 floats, same layout
// numMatrices : number of matrices in the batch
//
// When |det| < 1e-10 the matrix is treated as singular and the identity is
// written instead. NOTE(review): the threshold is absolute, so a valid matrix
// with a tiny determinant (e.g. 1e-4 * I, det = 1e-12) also gets the identity,
// which differs from torch.inverse -- confirm this is acceptable for the inputs.
__global__ void matrixInverse3x3Kernel(const float* input, float* output, int numMatrices)
{
    const int idx = blockIdx.x * blockDim.x + threadIdx.x;

    if (idx >= numMatrices) return;

    // Compute the element offset in size_t so idx * 9 cannot overflow int.
    const size_t offset = static_cast<size_t>(idx) * 9;
    const float* mat = input + offset;
    float* inv = output + offset;

    // Take a local copy of the matrix before writing anything, so the result
    // stays correct even when output aliases input (in-place inversion).
    float m[9];
    for (int i = 0; i < 9; ++i) {
        m[i] = mat[i];
    }

    // Determinant by cofactor expansion along the first row.
    const float det = m[0] * (m[4] * m[8] - m[5] * m[7])
                    - m[1] * (m[3] * m[8] - m[5] * m[6])
                    + m[2] * (m[3] * m[7] - m[4] * m[6]);

    // Singular matrix: fall back to the identity matrix.
    if (fabsf(det) < 1e-10f) {
        for (int i = 0; i < 9; ++i) {
            inv[i] = (i % 4 == 0) ? 1.0f : 0.0f;  // indices 0, 4, 8 are the diagonal
        }
        return;
    }

    const float invDet = 1.0f / det;

    // Inverse = adjugate (transpose of the cofactor matrix) scaled by 1/det.
    inv[0] = (m[4] * m[8] - m[5] * m[7]) * invDet;
    inv[1] = (m[2] * m[7] - m[1] * m[8]) * invDet;
    inv[2] = (m[1] * m[5] - m[2] * m[4]) * invDet;

    inv[3] = (m[5] * m[6] - m[3] * m[8]) * invDet;
    inv[4] = (m[0] * m[8] - m[2] * m[6]) * invDet;
    inv[5] = (m[2] * m[3] - m[0] * m[5]) * invDet;

    inv[6] = (m[3] * m[7] - m[4] * m[6]) * invDet;
    inv[7] = (m[1] * m[6] - m[0] * m[7]) * invDet;
    inv[8] = (m[0] * m[4] - m[1] * m[3]) * invDet;
}

// Host-side launcher for matrixInverse3x3Kernel.
//
// input      : batchSize * 9 floats (device pointer passed straight to the kernel)
// output     : batchSize * 9 floats (device pointer passed straight to the kernel)
// batchSize  : number of 3x3 matrices; values <= 0 are a no-op
// matrixSize : must be 3, anything else is rejected with a message
// stream     : CUDA stream the kernel is launched on
//
// Errors are reported with printf only; the function returns void.
extern "C" void matrixInverseBatchedCUDA(
    const float* input, float* output, int batchSize, int matrixSize, cudaStream_t stream)
{
    if (matrixSize != 3) {
        printf("Error: Only 3x3 matrices are supported. Got size: %d\n", matrixSize);
        return;
    }

    // An empty batch would give a grid of 0 blocks, which is an invalid launch
    // configuration; there is simply nothing to do.
    if (batchSize <= 0) {
        return;
    }

    if (input == nullptr || output == nullptr) {
        printf("Error: matrixInverseBatchedCUDA received a null buffer\n");
        return;
    }

    // Launch configuration: one thread per matrix.
    constexpr int kThreadsPerBlock = 256;
    // Ceil-division written so it cannot overflow for batchSize near INT_MAX.
    const int blocksPerGrid = (batchSize - 1) / kThreadsPerBlock + 1;

    matrixInverse3x3Kernel<<<blocksPerGrid, kThreadsPerBlock, 0, stream>>>(
        input, output, batchSize);

    // Report launch errors (the launch itself is asynchronous).
    const cudaError_t err = cudaGetLastError();
    if (err != cudaSuccess) {
        printf("CUDA Error in matrixInverseBatchedCUDA: %s\n", cudaGetErrorString(err));
    }
}

cpp文件

c 复制代码
#include "MatrixInversePlugin.h"
#include "PluginUtils.h"
#include <cassert>
#include <cstring>

using namespace nvinfer1;
using namespace plugin;

namespace {
// Version and type name returned by getPluginVersion() / getPluginType() /
// getPluginName(). constexpr so the pointers themselves cannot be reassigned.
constexpr const char* MATRIX_INVERSE_PLUGIN_VERSION{"1"};
constexpr const char* MATRIX_INVERSE_PLUGIN_NAME{"MatrixInverse"};
} // namespace

// Declaration of the CUDA launcher that inverts a batch of matrices on the
// given stream; called from enqueue().
extern "C" void matrixInverseBatchedCUDA(
    const float* input, float* output, int batchSize, int matrixSize, cudaStream_t stream);

// Default constructor: empty namespace, FP32 data type, no matrices yet and a
// fixed matrix size of 3.
MatrixInversePlugin::MatrixInversePlugin()
    : mNamespace("")
    , mDataType(DataType::kFLOAT)
    , mNumMatrices(0)
    , mMatrixSize(3)
{
}

// Deserialization constructor: restores the fields written by serialize()
// (data type, matrix count, matrix size, in that order).
//
// data   : serialized plugin bytes
// length : number of bytes available at `data`
//
// The length is validated BEFORE reading, so a truncated or foreign buffer is
// never read past its end in release builds (where assert is compiled out).
// On a mismatch the members keep the same defaults as the default constructor.
MatrixInversePlugin::MatrixInversePlugin(const void* data, size_t length)
    : mNamespace("")
    , mDataType(DataType::kFLOAT)
    , mNumMatrices(0)
    , mMatrixSize(3)
{
    // Must match getSerializationSize().
    const size_t expectedLength = sizeof(DataType) + sizeof(int32_t) * 2;
    assert(data != nullptr && length == expectedLength);
    if (data == nullptr || length != expectedLength) {
        return;
    }

    const char* d = static_cast<const char*>(data);

    mDataType = plugin::read<DataType>(d);
    mNumMatrices = plugin::read<int32_t>(d);
    mMatrixSize = plugin::read<int32_t>(d);

    // The read cursor must have consumed exactly the whole buffer.
    assert(d == static_cast<const char*>(data) + length);
}

// Destructor: runs terminate() so the plugin is shut down on destruction.
MatrixInversePlugin::~MatrixInversePlugin()
{
    this->terminate();
}
// Creates an independent copy of this plugin carrying the same data type,
// matrix count, matrix size and namespace. The caller owns the returned object.
//
// Returns nullptr if allocation fails: the function is noexcept, so letting
// std::bad_alloc escape would terminate the process instead.
IPluginV2DynamicExt* MatrixInversePlugin::clone() const noexcept
{
    try {
        MatrixInversePlugin* copy = new MatrixInversePlugin();
        copy->mDataType = this->mDataType;
        copy->mNumMatrices = this->mNumMatrices;
        copy->mMatrixSize = this->mMatrixSize;
        copy->setPluginNamespace(mNamespace.c_str());
        return copy;
    } catch (...) {
        // Deliberately converted to a null result at the noexcept boundary.
        return nullptr;
    }
}

// The single output has exactly the same dimensions as the single input
// (the inverse of a matrix has the shape of the matrix).
DimsExprs MatrixInversePlugin::getOutputDimensions(
    int32_t outputIndex, const DimsExprs* inputs, int32_t nbInputs, IExprBuilder& exprBuilder) noexcept
{
    assert(outputIndex == 0 && nbInputs == 1);
    return inputs[0];
}

bool MatrixInversePlugin::supportsFormatCombination(
    int32_t pos, const PluginTensorDesc* inOut, int32_t nbInputs, int32_t nbOutputs) noexcept
{
    assert(nbInputs == 1 && nbOutputs == 1);
    assert(pos < nbInputs + nbOutputs);
    
    // 只支持 FP32 + Linear 格式
    const PluginTensorDesc& desc = inOut[pos];
    return (desc.type == DataType::kFLOAT) && (desc.format == TensorFormat::kLINEAR);
}

// Records the input data type and the number of 3x3 matrices in the input.
//
// in / out : descriptors of the single input and single output
//
// During the build phase `desc.dims` may still contain wildcard (-1)
// dimensions. Multiplying those into the element count gives a meaningless
// number, so in that case the matrix count is recorded as 0 (unknown) rather
// than a bogus positive value.
void MatrixInversePlugin::configurePlugin(
    const DynamicPluginTensorDesc* in, int32_t nbInputs,
    const DynamicPluginTensorDesc* out, int32_t nbOutputs) noexcept
{
    assert(nbInputs == 1 && nbOutputs == 1);

    mDataType = in[0].desc.type;
    mMatrixSize = 3;

    const auto& dims = in[0].desc.dims;
    bool shapeKnown = true;
    int64_t elementCount = 1;  // 64-bit so large shapes cannot overflow
    for (int32_t i = 0; i < dims.nbDims; ++i) {
        if (dims.d[i] < 0) {
            shapeKnown = false;
            break;
        }
        elementCount *= dims.d[i];
    }

    int64_t matrixCount = shapeKnown ? elementCount / (mMatrixSize * mMatrixSize) : 0;
    if (matrixCount > INT32_MAX) {
        matrixCount = INT32_MAX;
    }
    mNumMatrices = static_cast<int32_t>(matrixCount);
}

// Scratch memory required by enqueue().
//
// enqueue() hands the input and output buffers straight to the CUDA launcher
// and never touches `workspace`, so no scratch memory is requested. The
// previous value also depended on mNumMatrices, which may not reflect the
// shape being queried.
size_t MatrixInversePlugin::getWorkspaceSize(
    const PluginTensorDesc* inputs, int32_t nbInputs,
    const PluginTensorDesc* outputs, int32_t nbOutputs) const noexcept
{
    return 0;
}

int32_t MatrixInversePlugin::enqueue(
    const PluginTensorDesc* inputDesc, const PluginTensorDesc* outputDesc,
    const void* const* inputs, void* const* outputs,
    void* workspace, cudaStream_t stream) noexcept
{
    const float* input = static_cast<const float*>(inputs[0]);
    float* output = static_cast<float*>(outputs[0]);
    
    // 调用CUDA kernel进行批量矩阵求逆
    matrixInverseBatchedCUDA(input, output, mNumMatrices, mMatrixSize, stream);
    
    return 0;
}

// Returns the plugin type name ("MatrixInverse").
const char* MatrixInversePlugin::getPluginType() const noexcept
{
    return MATRIX_INVERSE_PLUGIN_NAME;
}

// Returns the plugin version string ("1").
const char* MatrixInversePlugin::getPluginVersion() const noexcept
{
    return MATRIX_INVERSE_PLUGIN_VERSION;
}

// The plugin produces exactly one output tensor (the inverted matrices).
int32_t MatrixInversePlugin::getNbOutputs() const noexcept
{
    return 1;
}

// Nothing to set up; always reports success (0).
int32_t MatrixInversePlugin::initialize() noexcept
{
    return 0;
}

// Nothing to release: the plugin holds no resources of its own.
void MatrixInversePlugin::terminate() noexcept
{
}

size_t MatrixInversePlugin::getSerializationSize() const noexcept
{
    return sizeof(DataType) + sizeof(int32_t) * 2;
}

// Writes the plugin state into `buffer` in the order data type, matrix count,
// matrix size. The buffer must hold at least getSerializationSize() bytes.
void MatrixInversePlugin::serialize(void* buffer) const noexcept
{
    char* cursor = static_cast<char*>(buffer);
    const char* const begin = cursor;

    plugin::write(cursor, mDataType);
    plugin::write(cursor, mNumMatrices);
    plugin::write(cursor, mMatrixSize);

    // The write cursor must have advanced by exactly the advertised size.
    assert(cursor == begin + getSerializationSize());
}

// Deletes this plugin object; it must not be used after this call.
void MatrixInversePlugin::destroy() noexcept
{
    delete this;
}

// Stores the namespace this plugin belongs to.
// A null pointer is treated as the empty namespace: assigning a null
// const char* to std::string is undefined behaviour.
void MatrixInversePlugin::setPluginNamespace(const char* pluginNamespace) noexcept
{
    mNamespace = (pluginNamespace != nullptr) ? pluginNamespace : "";
}

// Returns the stored namespace; the pointer is owned by this object.
const char* MatrixInversePlugin::getPluginNamespace() const noexcept
{
    return mNamespace.c_str();
}

// The output has the same data type as the (single) input.
DataType MatrixInversePlugin::getOutputDataType(
    int32_t index, const nvinfer1::DataType* inputTypes, int32_t nbInputs) const noexcept
{
    assert(index == 0);
    return inputTypes[0];
}

// MatrixInversePluginCreator implementation.
// Definitions of the creator's static members: the field collection returned
// by getFieldNames() and the attribute list backing it.
PluginFieldCollection MatrixInversePluginCreator::mFC{};
std::vector<PluginField> MatrixInversePluginCreator::mPluginAttributes;

// The plugin takes no creation-time attributes, so the field collection is
// set up to describe an empty attribute list.
MatrixInversePluginCreator::MatrixInversePluginCreator()
{
    mPluginAttributes.clear();
    mFC.nbFields = mPluginAttributes.size();
    mFC.fields = mPluginAttributes.data();
}

// Returns the name of the plugin this creator builds ("MatrixInverse").
const char* MatrixInversePluginCreator::getPluginName() const noexcept
{
    return MATRIX_INVERSE_PLUGIN_NAME;
}

// Returns the version of the plugin this creator builds ("1").
const char* MatrixInversePluginCreator::getPluginVersion() const noexcept
{
    return MATRIX_INVERSE_PLUGIN_VERSION;
}

// Returns the (empty) collection of attributes accepted by createPlugin().
const PluginFieldCollection* MatrixInversePluginCreator::getFieldNames() noexcept
{
    return &mFC;
}

// Creates a fresh plugin instance in this creator's namespace.
// `name` and `fc` are ignored because the plugin has no attributes.
//
// Returns nullptr if allocation fails: the function is noexcept, so a thrown
// std::bad_alloc would otherwise terminate the process.
IPluginV2* MatrixInversePluginCreator::createPlugin(const char* name, const PluginFieldCollection* fc) noexcept
{
    try {
        MatrixInversePlugin* created = new MatrixInversePlugin();
        created->setPluginNamespace(mNamespace.c_str());
        return created;
    } catch (...) {
        // Deliberately converted to a null result at the noexcept boundary.
        return nullptr;
    }
}

// Rebuilds a plugin from the bytes produced by MatrixInversePlugin::serialize()
// and places it in this creator's namespace. `name` is ignored.
//
// Returns nullptr if allocation fails: the function is noexcept, so a thrown
// std::bad_alloc would otherwise terminate the process.
IPluginV2* MatrixInversePluginCreator::deserializePlugin(
    const char* name, const void* serialData, size_t serialLength) noexcept
{
    try {
        MatrixInversePlugin* restored = new MatrixInversePlugin(serialData, serialLength);
        restored->setPluginNamespace(mNamespace.c_str());
        return restored;
    } catch (...) {
        // Deliberately converted to a null result at the noexcept boundary.
        return nullptr;
    }
}

// Stores the namespace given to plugins built by this creator.
// A null pointer is treated as the empty namespace: assigning a null
// const char* to std::string is undefined behaviour.
void MatrixInversePluginCreator::setPluginNamespace(const char* pluginNamespace) noexcept
{
    mNamespace = (pluginNamespace != nullptr) ? pluginNamespace : "";
}

// Returns the stored namespace; the pointer is owned by this creator.
const char* MatrixInversePluginCreator::getPluginNamespace() const noexcept
{
    return mNamespace.c_str();
}

h文件

c 复制代码
#ifndef MATRIX_INVERSE_PLUGIN_H
#define MATRIX_INVERSE_PLUGIN_H

#include <NvInfer.h>
#include <vector>
#include <string>
#include <cuda_runtime.h>

namespace nvinfer1 {
namespace plugin {

/// TensorRT dynamic-shape plugin that inverts batches of 3x3 FP32 matrices.
///
/// It has one input and one output of identical shape and data type. The
/// serialized state consists of the data type, the matrix count and the
/// matrix size.
class MatrixInversePlugin : public IPluginV2DynamicExt {
public:
    /// Creates a plugin with default state (FP32, 0 matrices, matrix size 3).
    MatrixInversePlugin();
    /// Restores a plugin from `length` bytes previously written by serialize().
    MatrixInversePlugin(const void* data, size_t length);
    ~MatrixInversePlugin() override;

    /// Returns a heap-allocated copy of this plugin.
    IPluginV2DynamicExt* clone() const noexcept override;

    /// The output dimensions equal the input dimensions.
    DimsExprs getOutputDimensions(int32_t outputIndex, const DimsExprs* inputs,
                                  int32_t nbInputs, IExprBuilder& exprBuilder) noexcept override;

    /// Accepts only FP32 tensors in linear format.
    bool supportsFormatCombination(int32_t pos, const PluginTensorDesc* inOut,
                                   int32_t nbInputs, int32_t nbOutputs) noexcept override;

    /// Records the input data type and the number of matrices in the input.
    void configurePlugin(const DynamicPluginTensorDesc* in, int32_t nbInputs,
                         const DynamicPluginTensorDesc* out, int32_t nbOutputs) noexcept override;

    /// Scratch memory, in bytes, requested for enqueue().
    size_t getWorkspaceSize(const PluginTensorDesc* inputs, int32_t nbInputs,
                            const PluginTensorDesc* outputs, int32_t nbOutputs) const noexcept override;

    /// Launches the batched inversion on `stream`; returns 0 on success.
    int32_t enqueue(const PluginTensorDesc* inputDesc, const PluginTensorDesc* outputDesc,
                    const void* const* inputs, void* const* outputs,
                    void* workspace, cudaStream_t stream) noexcept override;

    const char* getPluginType() const noexcept override;
    const char* getPluginVersion() const noexcept override;
    int32_t getNbOutputs() const noexcept override;
    int32_t initialize() noexcept override;
    void terminate() noexcept override;
    size_t getSerializationSize() const noexcept override;
    void serialize(void* buffer) const noexcept override;
    /// Deletes this object.
    void destroy() noexcept override;
    void setPluginNamespace(const char* pluginNamespace) noexcept override;
    const char* getPluginNamespace() const noexcept override;
    /// The output data type equals the input data type.
    DataType getOutputDataType(int32_t index, const nvinfer1::DataType* inputTypes, int32_t nbInputs) const noexcept override;

private:
    // In-class defaults guarantee the members are never read uninitialized,
    // whichever constructor is used.
    std::string mNamespace;                 // plugin namespace
    DataType mDataType{DataType::kFLOAT};   // data type of the input tensor
    int32_t mNumMatrices{0};                // number of matrices in the configured input
    int32_t mMatrixSize{3};                 // side length of each square matrix (only 3 is supported)
};

/// Factory for MatrixInversePlugin: creates new instances and restores
/// serialized ones. The plugin takes no creation attributes.
class MatrixInversePluginCreator : public IPluginCreator {
public:
    MatrixInversePluginCreator();
    ~MatrixInversePluginCreator() override = default;

    const char* getPluginName() const noexcept override;
    const char* getPluginVersion() const noexcept override;
    /// Returns the (empty) attribute collection.
    const PluginFieldCollection* getFieldNames() noexcept override;
    /// Creates a new plugin instance.
    IPluginV2* createPlugin(const char* name, const PluginFieldCollection* fc) noexcept override;
    /// Restores a plugin from serialized bytes.
    IPluginV2* deserializePlugin(const char* name, const void* serialData, size_t serialLength) noexcept override;
    void setPluginNamespace(const char* pluginNamespace) noexcept override;
    const char* getPluginNamespace() const noexcept override;

private:
    static PluginFieldCollection mFC;                    // field collection returned by getFieldNames()
    static std::vector<PluginField> mPluginAttributes;   // backing storage for mFC (left empty)
    std::string mNamespace;                              // namespace passed on to created plugins
};

} // namespace plugin
} // namespace nvinfer1

#endif // MATRIX_INVERSE_PLUGIN_H

测试

写好插件后,为了确认其结果与Python实现一致,最好的方式是写一个验证脚本:先把输入数据从PyTorch的推理过程中保存出来,然后构建一个只包含该插件这一层的TensorRT引擎,将同样的输入分别送入PyTorch代码和TensorRT引擎,通过对比两者的输出来判断插件是否工作正常。

数据保存与加载

保存的例子

python 复制代码
# np.save(f"{debug_dir}/post_rots.npy", post_rots.cpu().numpy())
# np.save(f"{debug_dir}/intrins.npy", intrins.cpu().numpy())

加载并对比插件

python 复制代码
# TensorRT MatrixInverse 插件验证
import torch
import numpy as np
import tensorrt as trt
import os
import sys
import ctypes
from typing import Tuple, Optional

# Absolute path of the plugin shared library, resolved relative to this script's directory.
PLUGIN_PATH = os.path.abspath(
    os.path.join(os.path.dirname(__file__), "..", "cpp", "build", "lib", "liblss_trt_plugins.so")
)


def load_trt_plugins():
    """Load the TensorRT plugin shared library.

    Raises:
        FileNotFoundError: if the library at ``PLUGIN_PATH`` does not exist.
    """
    print(f"Loading plugin: {PLUGIN_PATH}")
    if not os.path.exists(PLUGIN_PATH):
        raise FileNotFoundError(f"Plugin not found: {PLUGIN_PATH}")

    # Loading the library through ctypes runs its static initializers.
    ctypes.CDLL(PLUGIN_PATH)
    trt.init_libnvinfer_plugins(None, "")
    print("Plugin loaded successfully!")


def create_inverse_engine():
    """Build a TensorRT engine that contains only the MatrixInverse plugin layer.

    The network has a single FP32 input named ``"input"`` with dynamic shape
    ``[-1, -1, 3, 3]`` (batch, N, 3, 3) and a single output produced by the
    plugin.  The optimization profile covers shapes from ``[1, 1, 3, 3]`` to
    ``[100, 20, 3, 3]``.

    Returns:
        The built TensorRT engine.

    Raises:
        RuntimeError: if the plugin creator is not registered, the plugin
            cannot be created, or the engine build fails.
    """
    logger = trt.Logger(trt.Logger.ERROR)
    builder = trt.Builder(logger)

    # Explicit-batch network with one dynamic-shape input.
    network = builder.create_network(1 << int(trt.NetworkDefinitionCreationFlag.EXPLICIT_BATCH))
    input_tensor = network.add_input("input", trt.DataType.FLOAT, [-1, -1, 3, 3])

    # Look up the plugin creator; it is None when the plugin library was not
    # loaded or did not register itself.
    plugin_creator = trt.get_plugin_registry().get_plugin_creator("MatrixInverse", "1", "")
    if plugin_creator is None:
        raise RuntimeError("Plugin creator 'MatrixInverse' (version 1) is not registered")

    # The plugin takes no attributes, so an empty field collection is enough.
    plugin = plugin_creator.create_plugin("MatrixInverse", trt.PluginFieldCollection())
    if plugin is None:
        raise RuntimeError("Failed to create the MatrixInverse plugin")

    inverse_layer = network.add_plugin_v2([input_tensor], plugin)
    network.mark_output(inverse_layer.get_output(0))

    config = builder.create_builder_config()
    workspace_bytes = 1 << 20  # 1MB
    # Prefer the non-deprecated API and fall back for older TensorRT releases.
    if hasattr(config, "set_memory_pool_limit"):
        config.set_memory_pool_limit(trt.MemoryPoolType.WORKSPACE, workspace_bytes)
    else:
        config.max_workspace_size = workspace_bytes

    # Optimization profile for the dynamic batch and camera dimensions.
    profile = builder.create_optimization_profile()
    profile.set_shape("input", [1, 1, 3, 3], [10, 10, 3, 3], [100, 20, 3, 3])
    config.add_optimization_profile(profile)

    if hasattr(builder, "build_serialized_network"):
        serialized_engine = builder.build_serialized_network(network, config)
        if serialized_engine is None:
            raise RuntimeError("TensorRT failed to build the MatrixInverse engine")
        engine = trt.Runtime(logger).deserialize_cuda_engine(serialized_engine)
    else:
        engine = builder.build_engine(network, config)

    if engine is None:
        raise RuntimeError("TensorRT failed to create the MatrixInverse engine")
    return engine


def run_trt_inference(engine, input_data: np.ndarray) -> np.ndarray:
    """Run one inference with the TensorRT engine and return the output.

    Args:
        engine: TensorRT engine with one input named ``"input"`` and one
            output of the same shape.
        input_data: input array; it is converted to contiguous float32.

    Returns:
        The engine output as a float32 array with the shape of ``input_data``.

    Raises:
        RuntimeError: if TensorRT reports that execution failed.
    """
    context = engine.create_execution_context()

    # TensorRT reads raw device memory, so the data must be contiguous float32.
    host_input = np.ascontiguousarray(input_data, dtype=np.float32)

    # Resolve the dynamic input shape (by tensor name).
    context.set_input_shape("input", tuple(host_input.shape))

    d_input = torch.from_numpy(host_input).cuda()
    # Pre-fill the output with NaN so any element the plugin fails to write
    # shows up as an error instead of as leftover memory contents.
    d_output = torch.full_like(d_input, float("nan"))
    # Make sure the fill has finished before TensorRT writes to the buffer.
    torch.cuda.synchronize()

    if not context.execute_v2([d_input.data_ptr(), d_output.data_ptr()]):
        raise RuntimeError("TensorRT execute_v2 failed")

    torch.cuda.synchronize()
    return d_output.cpu().numpy()


def torch_inverse(post_rots: torch.Tensor) -> torch.Tensor:
    """Reference result: invert the matrices with PyTorch."""
    reference = torch.inverse(post_rots)
    return reference


def verify_with_npy(data_path: str) -> Tuple[bool, dict]:
    """Check the plugin against PyTorch using matrices stored in a .npy file.

    Args:
        data_path: path of the .npy file to load.

    Returns:
        ``(passed, stats)`` as produced by ``verify_with_tensor``.

    Raises:
        FileNotFoundError: if ``data_path`` does not exist.
    """
    file_missing = not os.path.exists(data_path)
    if file_missing:
        raise FileNotFoundError(f"数据文件不存在: {data_path}")

    # Load the array and report what was read.
    loaded = np.load(data_path)
    print(f"加载数据: {data_path}")
    print(f"形状: {loaded.shape}, dtype: {loaded.dtype}")

    # Hand over as a float32 tensor.
    as_tensor = torch.from_numpy(loaded).float()
    return verify_with_tensor(as_tensor)


def verify_with_tensor(test_matrices: torch.Tensor) -> Tuple[bool, dict]:
    """Check the TensorRT plugin against PyTorch on the given matrices.

    Args:
        test_matrices: tensor whose trailing two dimensions are 3x3 matrices,
            e.g. shape ``[batch, N, 3, 3]``.

    Returns:
        ``(passed, stats)`` where ``stats`` holds the number of matrices, the
        max/mean absolute difference to ``torch.inverse`` and the max error of
        ``A @ A^-1 - I`` computed with the PLUGIN's inverse.
    """
    # Load the plugin library and build a single-layer engine.
    load_trt_plugins()
    engine = create_inverse_engine()

    print(f"测试数据形状: {test_matrices.shape}")

    # Reference result from PyTorch.
    pytorch_results = torch_inverse(test_matrices)

    # Result from the TensorRT plugin.
    trt_results = torch.from_numpy(run_trt_inference(engine, test_matrices.numpy()))

    # Element-wise difference between the two implementations.
    diff = torch.abs(pytorch_results - trt_results)
    max_error = float(torch.max(diff))
    mean_error = float(torch.mean(diff))

    # Verify A @ A^-1 = I using the plugin output: checking the PyTorch result
    # here would say nothing about the plugin.
    flat_inputs = test_matrices.reshape(-1, 3, 3)
    product = torch.bmm(flat_inputs, trt_results.reshape(-1, 3, 3))
    identity = torch.eye(3, dtype=product.dtype).expand_as(product)
    inv_max_error = float(torch.max(torch.abs(product - identity)))

    stats = {
        # Number of 3x3 matrices, not just the size of the first dimension.
        "num_samples": flat_inputs.shape[0],
        "max_error": max_error,
        "mean_error": mean_error,
        "inverse_property_error": inv_max_error,
        # NaN errors compare False, so unwritten or invalid outputs fail.
        "passed": max_error < 1e-5 and inv_max_error < 1e-5
    }

    return stats["passed"], stats


# python test_inverse_plugin.py /path/to/post_rots.npy


if __name__ == "__main__":
    # Entry point: verify the plugin against the .npy file given on the command line.
    if len(sys.argv) != 2:
        # Without this check a missing argument fails with a bare IndexError.
        print(f"Usage: python {os.path.basename(sys.argv[0])} /path/to/post_rots.npy")
        sys.exit(2)

    print("=" * 50)
    print("TensorRT MatrixInverse Plugin Verification")
    print("=" * 50)

    passed, stats = verify_with_npy(sys.argv[1])

    print(f"\n Test Results:")
    print(f"   Samples: {stats.get('num_samples', stats.get('num_tests', 'N/A'))}")
    print(f"   Max Error (vs PyTorch): {stats['max_error']:.2e}")
    print(f"   Mean Error (vs PyTorch): {stats['mean_error']:.2e}")
    print(f"   A @ A⁻¹ = I Error: {stats['inverse_property_error']:.2e}")
    print(f"\n{'PASSED' if passed else 'FAILED'}")
    print("=" * 50)

    # Non-zero exit status on failure so the script can gate automated checks.
    sys.exit(0 if passed else 1)

结果

shell 复制代码
Loading plugin: /workspace/current_workspace/cpp/build/lib/liblss_trt_plugins.so
Plugin loaded successfully!
/workspace/current_workspace/py/test_inverse_plugin.py:56: DeprecationWarning: Use set_memory_pool_limit instead.
  config.max_workspace_size = 1 << 20  # 1MB
/workspace/current_workspace/py/test_inverse_plugin.py:63: DeprecationWarning: Use build_serialized_network instead.
  engine = builder.build_engine(network, config)
测试数据形状: torch.Size([1, 5, 3, 3])
/workspace/current_workspace/py/test_inverse_plugin.py:79: UserWarning: The torch.cuda.*DtypeTensor constructors are no longer recommended. It's best to use methods such as torch.tensor(data, dtype=*, device='cuda') to create tensors. (Triggered internally at ../torch/csrc/tensor/python_tensor.cpp:78.)
  d_input = torch.cuda.FloatTensor(input_np).cuda()

 Test Results:
   Samples: 1
   Max Error (vs PyTorch): 4.77e-07
   Mean Error (vs PyTorch): 4.34e-08
   A @ A⁻¹ = I Error: 3.73e-09

PASSED
==================================================
相关推荐
小七-七牛开发者3 小时前
周一上线 | SpaceX 收购 Cursor、支付宝进入 AI 时代、DeepSeek 完成 500 亿元融资
ai·agent·token·glm·智谱·claudecode·ai coding·周一上线
doiito21 小时前
【Agent Harness】为什么我把 JSON‑LD “编译成 DAG” 后,整个 Agent 平台立刻聪明了
ai·rust·架构设计·系统设计·ai agent
xiezhr1 天前
折腾半小时,终于让AI 能直接帮我写飞书文档了
ai·飞书·ai agent·飞书cli·飞书文档
岳小哥AI1 天前
Claude Fable和Claude Mythos 5同时发布:注意力机制下愈加强大的AI大模型
ai·ai基础
Artech1 天前
[MAF预定义的AIContextProvider-04]Mem0Provider——长期记忆基于的云端解决方案
ai·agent·maf·aicontextprovider·chathistorymemoryprovider·mem0provider
哥不是小萝莉2 天前
一文读懂 OpenAI Codex 源码的原理、架构与未来
ai
AlfredZhao2 天前
AI 编程工作总结:从体验问题到模块能力建设
ai·codex
cup113 天前
[技术复盘] Windows Python 打包实战:Nuitka 环境踩坑总结与 CI 自动化构建全指南
python·ai·环境变量·ci·nuitka·skill
IT王师傅3 天前
从 豆包 到 Codex CLI:一名普通开发者的 AI 工具进化路线
ai·codex cli·openclaw
岳小哥AI3 天前
Siri要接入AI了,苹果手机上一句话让GPT写文案、DeepSeek写代码的时刻来了
ai·ai基础