LSS-BEV系列-1-部署插件inverse
总结
在导出该项目onnx时候,会产生不支持算子的报错。这里首先分析torch.inverse这个函数。
pytorch引用
原始代码中的使用主要发生在这两个部分,其中可以看到的是该算子其实是对最后的3*3维度的矩阵进行矩阵求逆,主要应用在进行坐标转换上的。
python
points = torch.inverse(post_rots).view(B, N, 1, 1, 1, 3, 3).matmul(points.unsqueeze(-1))
python
combine = rots.matmul(torch.inverse(intrins))
自定义插件-pytorch
进行定义一个求逆计算规则
python
class InverseFunction(torch.autograd.Function):
    """Autograd function wrapping torch.inverse so the ONNX exporter emits a
    custom MatrixInverse op instead of failing on the unsupported operator."""

    @staticmethod
    def forward(ctx, input):
        # Eager-mode path: plain PyTorch matrix inversion.
        result = torch.inverse(input)
        ctx.save_for_backward(result)
        return result

    @staticmethod
    def symbolic(g, input):
        # Export path: emit the custom op that the TensorRT plugin implements.
        node = g.op("xyz.onnx.contrib::MatrixInverse", input)
        # The output has exactly the same type and shape as the input.
        node.setType(input.type())
        return node
在pytorch中使用此计算规则,替换不可直接支持的计算语句
python
# points = torch.inverse(post_rots).view(B, N, 1, 1, 1, 3, 3).matmul(points.unsqueeze(-1)) # 撤销旋转 -> B N D H W 3 1
points = InverseFunction.apply(post_rots).view(B, N, 1, 1, 1, 3, 3).matmul(points.unsqueeze(-1))
自定义插件-c++
c++插件主要是导出trt引擎时对于自定义的算子需要进行实现,因此这里需要对矩阵求逆算子进行实现。好在这里只是3×3的矩阵,我们可以直接利用线性代数中3×3矩阵求逆的公式进行实现就好了,不需要写成复杂的n×n通用矩阵求逆,只给出3×3的特例计算就好了。
cu文件
c
#include <cuda_runtime.h>
#include <cublas_v2.h>
#include <stdio.h>
// CUDA kernel for 3x3 matrix inversion using the analytical (adjugate) method.
// input/output point to batches of row-major 3x3 matrices (9 floats each);
// one thread handles one matrix.
__global__ void matrixInverse3x3Kernel(const float* input, float* output, int numMatrices)
{
int idx = blockIdx.x * blockDim.x + threadIdx.x;
if (idx >= numMatrices) return;
// Each matrix occupies 9 contiguous elements.
const float* mat = input + idx * 9;
float* inv = output + idx * 9;
// Row-major 3x3 layout:
// [0 1 2]
// [3 4 5]
// [6 7 8]
// Determinant via cofactor expansion along the first row.
float det = mat[0] * (mat[4] * mat[8] - mat[5] * mat[7])
- mat[1] * (mat[3] * mat[8] - mat[5] * mat[6])
+ mat[2] * (mat[3] * mat[7] - mat[4] * mat[6]);
// Near-singular guard: an exact inverse does not exist.
if (fabs(det) < 1e-10f) {
// Fall back to the identity matrix for singular inputs.
for (int i = 0; i < 9; ++i) {
inv[i] = (i % 4 == 0) ? 1.0f : 0.0f; // indices 0, 4, 8 form the diagonal
}
return;
}
float invDet = 1.0f / det;
// Inverse = adjugate (transpose of the cofactor matrix) divided by det.
inv[0] = (mat[4] * mat[8] - mat[5] * mat[7]) * invDet;
inv[1] = (mat[2] * mat[7] - mat[1] * mat[8]) * invDet;
inv[2] = (mat[1] * mat[5] - mat[2] * mat[4]) * invDet;
inv[3] = (mat[5] * mat[6] - mat[3] * mat[8]) * invDet;
inv[4] = (mat[0] * mat[8] - mat[2] * mat[6]) * invDet;
inv[5] = (mat[2] * mat[3] - mat[0] * mat[5]) * invDet;
inv[6] = (mat[3] * mat[7] - mat[4] * mat[6]) * invDet;
inv[7] = (mat[1] * mat[6] - mat[0] * mat[7]) * invDet;
inv[8] = (mat[0] * mat[4] - mat[1] * mat[3]) * invDet;
}
// Host-side launcher for batched 3x3 matrix inversion.
// input/output: device pointers to batchSize row-major 3x3 matrices.
// Only matrixSize == 3 is supported; other sizes are rejected with a message.
extern "C" void matrixInverseBatchedCUDA(
    const float* input, float* output, int batchSize, int matrixSize, cudaStream_t stream)
{
    if (matrixSize != 3) {
        printf("Error: Only 3x3 matrices are supported. Got size: %d\n", matrixSize);
        return;
    }
    // Nothing to do for an empty batch; launching with 0 blocks would be an
    // invalid CUDA launch configuration, so bail out early.
    if (batchSize <= 0) {
        return;
    }
    // One thread per matrix.
    const int threadsPerBlock = 256;
    const int blocksPerGrid = (batchSize + threadsPerBlock - 1) / threadsPerBlock;
    matrixInverse3x3Kernel<<<blocksPerGrid, threadsPerBlock, 0, stream>>>(
        input, output, batchSize);
    // Surface launch errors (does not synchronize the stream).
    cudaError_t err = cudaGetLastError();
    if (err != cudaSuccess) {
        printf("CUDA Error in matrixInverseBatchedCUDA: %s\n", cudaGetErrorString(err));
    }
}
cpp文件
c
#include "MatrixInversePlugin.h"
#include "PluginUtils.h"
#include <cassert>
#include <cstring>
using namespace nvinfer1;
using namespace plugin;
// Plugin identification strings used for registry lookup; must match the
// values reported by the creator.
namespace {
const char* MATRIX_INVERSE_PLUGIN_VERSION{"1"};
const char* MATRIX_INVERSE_PLUGIN_NAME{"MatrixInverse"};
}
// Forward declaration of the CUDA launcher implemented in the .cu file.
extern "C" void matrixInverseBatchedCUDA(
const float* input, float* output, int batchSize, int matrixSize, cudaStream_t stream);
// 构造函数
MatrixInversePlugin::MatrixInversePlugin()
: mNamespace("")
, mDataType(DataType::kFLOAT)
, mNumMatrices(0)
, mMatrixSize(3)
{
}
// Deserialization constructor: restores the fields written by serialize(),
// reading them in the same order (data type, matrix count, matrix size).
MatrixInversePlugin::MatrixInversePlugin(const void* data, size_t length)
: mNamespace("")
{
const char* d = static_cast<const char*>(data);
const char* a = d; // buffer start, used to verify the consumed length below
mDataType = plugin::read<DataType>(d);
mNumMatrices = plugin::read<int32_t>(d);
mMatrixSize = plugin::read<int32_t>(d);
assert(d == a + length);
}
// Destructor: release per-engine resources (terminate() is currently a no-op,
// kept for symmetry with initialize()).
MatrixInversePlugin::~MatrixInversePlugin()
{
    terminate();
}
// Creates a copy of this plugin for a new builder/execution context.
// Returns nullptr on allocation failure instead of letting an exception
// escape the noexcept boundary (which would call std::terminate).
IPluginV2DynamicExt* MatrixInversePlugin::clone() const noexcept
{
    try {
        MatrixInversePlugin* plugin = new MatrixInversePlugin();
        plugin->mDataType = this->mDataType;
        plugin->mNumMatrices = this->mNumMatrices;
        plugin->mMatrixSize = this->mMatrixSize;
        plugin->setPluginNamespace(mNamespace.c_str());
        return plugin;
    } catch (...) {
        return nullptr;
    }
}
// The inverse has exactly the same shape as the input batch of matrices.
DimsExprs MatrixInversePlugin::getOutputDimensions(
    int32_t outputIndex, const DimsExprs* inputs, int32_t nbInputs, IExprBuilder& exprBuilder) noexcept
{
    assert(outputIndex == 0 && nbInputs == 1);
    const DimsExprs& inputDims = inputs[0];
    return inputDims;
}
// Accept only FP32 tensors in linear (row-major) layout, for both the input
// and the output.
bool MatrixInversePlugin::supportsFormatCombination(
    int32_t pos, const PluginTensorDesc* inOut, int32_t nbInputs, int32_t nbOutputs) noexcept
{
    assert(nbInputs == 1 && nbOutputs == 1);
    assert(pos < nbInputs + nbOutputs);
    const bool isFloat = inOut[pos].type == DataType::kFLOAT;
    const bool isLinear = inOut[pos].format == TensorFormat::kLINEAR;
    return isFloat && isLinear;
}
// Caches the matrix count for enqueue().
// During the build phase dynamic dimensions may be wildcards (-1); multiplying
// them through would yield a negative, meaningless count. In that case the
// count is left at 0; TensorRT calls configurePlugin again with concrete
// shapes before execution, which then sets the real value.
void MatrixInversePlugin::configurePlugin(
    const DynamicPluginTensorDesc* in, int32_t nbInputs,
    const DynamicPluginTensorDesc* out, int32_t nbOutputs) noexcept
{
    assert(nbInputs == 1 && nbOutputs == 1);
    mDataType = in[0].desc.type;
    mMatrixSize = 3;
    const auto& dims = in[0].desc.dims;
    int32_t totalSize = 1;
    bool hasWildcard = false;
    for (int32_t i = 0; i < dims.nbDims; ++i) {
        if (dims.d[i] < 0) {
            hasWildcard = true;
            break;
        }
        totalSize *= dims.d[i];
    }
    mNumMatrices = hasWildcard ? 0 : totalSize / (mMatrixSize * mMatrixSize);
}
// No scratch memory is needed: the kernel writes its results directly to the
// output tensor, so request zero workspace instead of an unused buffer.
size_t MatrixInversePlugin::getWorkspaceSize(
    const PluginTensorDesc* inputs, int32_t nbInputs,
    const PluginTensorDesc* outputs, int32_t nbOutputs) const noexcept
{
    return 0;
}
int32_t MatrixInversePlugin::enqueue(
const PluginTensorDesc* inputDesc, const PluginTensorDesc* outputDesc,
const void* const* inputs, void* const* outputs,
void* workspace, cudaStream_t stream) noexcept
{
const float* input = static_cast<const float*>(inputs[0]);
float* output = static_cast<float*>(outputs[0]);
// 调用CUDA kernel进行批量矩阵求逆
matrixInverseBatchedCUDA(input, output, mNumMatrices, mMatrixSize, stream);
return 0;
}
// Registry type name; must match the creator's getPluginName().
const char* MatrixInversePlugin::getPluginType() const noexcept
{
return MATRIX_INVERSE_PLUGIN_NAME;
}
// Plugin version string used together with the type name for registry lookup.
const char* MatrixInversePlugin::getPluginVersion() const noexcept
{
return MATRIX_INVERSE_PLUGIN_VERSION;
}
// The plugin produces a single output tensor (the inverted matrices).
int32_t MatrixInversePlugin::getNbOutputs() const noexcept
{
return 1;
}
// No per-engine resources to acquire.
int32_t MatrixInversePlugin::initialize() noexcept
{
return 0;
}
// No per-engine resources to release.
void MatrixInversePlugin::terminate() noexcept
{
}
// Size in bytes of the serialized state: the data type plus two int32 fields.
size_t MatrixInversePlugin::getSerializationSize() const noexcept
{
return sizeof(DataType) + sizeof(int32_t) * 2;
}
// Writes the plugin state; the field order must match the deserialization
// constructor exactly.
void MatrixInversePlugin::serialize(void* buffer) const noexcept
{
char* d = static_cast<char*>(buffer);
const char* a = d; // buffer start, used to verify the number of bytes written
plugin::write(d, mDataType);
plugin::write(d, mNumMatrices);
plugin::write(d, mMatrixSize);
assert(d == a + getSerializationSize());
}
// Called by TensorRT when the plugin object is no longer needed.
void MatrixInversePlugin::destroy() noexcept
{
delete this;
}
// Stores the namespace assigned by the plugin registry.
void MatrixInversePlugin::setPluginNamespace(const char* pluginNamespace) noexcept
{
mNamespace = pluginNamespace;
}
const char* MatrixInversePlugin::getPluginNamespace() const noexcept
{
return mNamespace.c_str();
}
// The output keeps the data type of the (single) input tensor.
DataType MatrixInversePlugin::getOutputDataType(
    int32_t index, const nvinfer1::DataType* inputTypes, int32_t nbInputs) const noexcept
{
    assert(index == 0);
    const DataType outputType = inputTypes[0];
    return outputType;
}
// MatrixInversePluginCreator implementation
PluginFieldCollection MatrixInversePluginCreator::mFC{};
std::vector<PluginField> MatrixInversePluginCreator::mPluginAttributes;
// The plugin takes no creation-time attributes, so the field collection
// stays empty.
MatrixInversePluginCreator::MatrixInversePluginCreator()
{
mPluginAttributes.clear();
mFC.nbFields = mPluginAttributes.size();
mFC.fields = mPluginAttributes.data();
}
// Type name under which the plugin is registered ("MatrixInverse").
const char* MatrixInversePluginCreator::getPluginName() const noexcept
{
return MATRIX_INVERSE_PLUGIN_NAME;
}
const char* MatrixInversePluginCreator::getPluginVersion() const noexcept
{
return MATRIX_INVERSE_PLUGIN_VERSION;
}
// Returns the (empty) list of supported creation attributes.
const PluginFieldCollection* MatrixInversePluginCreator::getFieldNames() noexcept
{
return &mFC;
}
// Builds a fresh plugin instance during network construction.
// fc is ignored because the plugin has no creation-time attributes.
// Returns nullptr on allocation failure rather than throwing across the
// noexcept boundary (which would call std::terminate).
IPluginV2* MatrixInversePluginCreator::createPlugin(const char* name, const PluginFieldCollection* fc) noexcept
{
    try {
        MatrixInversePlugin* plugin = new MatrixInversePlugin();
        plugin->setPluginNamespace(mNamespace.c_str());
        return plugin;
    } catch (...) {
        return nullptr;
    }
}
// Recreates a plugin instance from a serialized engine blob.
// Returns nullptr on allocation failure rather than throwing across the
// noexcept boundary (which would call std::terminate).
IPluginV2* MatrixInversePluginCreator::deserializePlugin(
    const char* name, const void* serialData, size_t serialLength) noexcept
{
    try {
        MatrixInversePlugin* plugin = new MatrixInversePlugin(serialData, serialLength);
        plugin->setPluginNamespace(mNamespace.c_str());
        return plugin;
    } catch (...) {
        return nullptr;
    }
}
// Stores the namespace assigned by the plugin registry.
void MatrixInversePluginCreator::setPluginNamespace(const char* pluginNamespace) noexcept
{
mNamespace = pluginNamespace;
}
const char* MatrixInversePluginCreator::getPluginNamespace() const noexcept
{
return mNamespace.c_str();
}
h文件
c
#ifndef MATRIX_INVERSE_PLUGIN_H
#define MATRIX_INVERSE_PLUGIN_H
#include <NvInfer.h>
#include <vector>
#include <string>
#include <cuda_runtime.h>
namespace nvinfer1 {
namespace plugin {
// Dynamic-shape TensorRT plugin that performs batched 3x3 matrix inversion.
// Implements the custom MatrixInverse ONNX op emitted during export.
class MatrixInversePlugin : public IPluginV2DynamicExt {
public:
MatrixInversePlugin();
// Deserialization constructor used when loading a serialized engine.
MatrixInversePlugin(const void* data, size_t length);
~MatrixInversePlugin() override;
// IPluginV2DynamicExt methods
IPluginV2DynamicExt* clone() const noexcept override;
DimsExprs getOutputDimensions(int32_t outputIndex, const DimsExprs* inputs,
int32_t nbInputs, IExprBuilder& exprBuilder) noexcept override;
bool supportsFormatCombination(int32_t pos, const PluginTensorDesc* inOut,
int32_t nbInputs, int32_t nbOutputs) noexcept override;
void configurePlugin(const DynamicPluginTensorDesc* in, int32_t nbInputs,
const DynamicPluginTensorDesc* out, int32_t nbOutputs) noexcept override;
size_t getWorkspaceSize(const PluginTensorDesc* inputs, int32_t nbInputs,
const PluginTensorDesc* outputs, int32_t nbOutputs) const noexcept override;
int32_t enqueue(const PluginTensorDesc* inputDesc, const PluginTensorDesc* outputDesc,
const void* const* inputs, void* const* outputs,
void* workspace, cudaStream_t stream) noexcept override;
// IPluginV2 methods
const char* getPluginType() const noexcept override;
const char* getPluginVersion() const noexcept override;
int32_t getNbOutputs() const noexcept override;
int32_t initialize() noexcept override;
void terminate() noexcept override;
size_t getSerializationSize() const noexcept override;
void serialize(void* buffer) const noexcept override;
void destroy() noexcept override;
void setPluginNamespace(const char* pluginNamespace) noexcept override;
const char* getPluginNamespace() const noexcept override;
// IPluginV2Ext method
DataType getOutputDataType(int32_t index, const nvinfer1::DataType* inputTypes, int32_t nbInputs) const noexcept override;
private:
std::string mNamespace; // namespace assigned by the registry
DataType mDataType; // tensor data type (only kFLOAT is supported)
int32_t mNumMatrices; // cached number of 3x3 matrices in the batch
int32_t mMatrixSize; // matrix dimension, fixed at 3
};
// Factory that creates/deserializes MatrixInversePlugin instances and exposes
// the plugin to the TensorRT registry.
class MatrixInversePluginCreator : public IPluginCreator {
public:
MatrixInversePluginCreator();
~MatrixInversePluginCreator() override = default;
const char* getPluginName() const noexcept override;
const char* getPluginVersion() const noexcept override;
const PluginFieldCollection* getFieldNames() noexcept override;
IPluginV2* createPlugin(const char* name, const PluginFieldCollection* fc) noexcept override;
IPluginV2* deserializePlugin(const char* name, const void* serialData, size_t serialLength) noexcept override;
void setPluginNamespace(const char* pluginNamespace) noexcept override;
const char* getPluginNamespace() const noexcept override;
private:
static PluginFieldCollection mFC; // empty: the plugin has no attributes
static std::vector<PluginField> mPluginAttributes;
std::string mNamespace;
};
} // namespace plugin
} // namespace nvinfer1
#endif // MATRIX_INVERSE_PLUGIN_H
测试
写好插件后,为了判定其与python实现的结果一致,最好的验证方式是写一个验证脚本:我们可以把输入数据从pytorch的推理过程中保存下来,然后构建一个只有插件这一层的trt引擎,将同样的输入分别给到pytorch代码和trt代码,通过对比两者的输出来判断插件是否工作正确。
数据保存与加载
保存的例子
python
# np.save(f"{debug_dir}/post_rots.npy", post_rots.cpu().numpy())
# np.save(f"{debug_dir}/intrins.npy", intrins.cpu().numpy())
加载并对比插件
python
# TensorRT MatrixInverse 插件验证
import torch
import numpy as np
import tensorrt as trt
import os
import sys
import ctypes
from typing import Tuple, Optional
# 插件路径
PLUGIN_PATH = os.path.join(os.path.dirname(__file__), "..", "cpp", "build", "lib", "liblss_trt_plugins.so")
PLUGIN_PATH = os.path.abspath(PLUGIN_PATH)
def load_trt_plugins():
    """Load the custom TensorRT plugin shared library and register its creators."""
    print(f"Loading plugin: {PLUGIN_PATH}")
    if not os.path.exists(PLUGIN_PATH):
        raise FileNotFoundError(f"Plugin not found: {PLUGIN_PATH}")
    # Loading the .so via ctypes runs its static initializers, which register
    # the plugin creators with the TensorRT plugin registry.
    ctypes.CDLL(PLUGIN_PATH)
    trt.init_libnvinfer_plugins(None, "")
    print("Plugin loaded successfully!")
def create_inverse_engine():
    """Build a TensorRT engine containing only the MatrixInverse plugin layer.

    The input is a dynamic-shape FP32 tensor [-1, -1, 3, 3]
    (batch, cameras, 3, 3).

    Returns:
        A deserialized ICudaEngine ready for inference.
    """
    logger = trt.Logger(trt.Logger.ERROR)
    builder = trt.Builder(logger)
    # Explicit-batch network (required for dynamic shapes).
    network = builder.create_network(1 << int(trt.NetworkDefinitionCreationFlag.EXPLICIT_BATCH))
    input_tensor = network.add_input("input", trt.DataType.FLOAT, [-1, -1, 3, 3])
    # Look up the creator registered by the plugin shared library.
    plugin_registry = trt.get_plugin_registry()
    plugin_creator = plugin_registry.get_plugin_creator("MatrixInverse", "1", "")
    # The plugin takes no attributes, so the field collection is empty.
    fc = trt.PluginFieldCollection()
    plugin = plugin_creator.create_plugin("MatrixInverse", fc)
    inverse_layer = network.add_plugin_v2([input_tensor], plugin)
    network.mark_output(inverse_layer.get_output(0))
    config = builder.create_builder_config()
    # set_memory_pool_limit replaces the deprecated max_workspace_size attribute.
    config.set_memory_pool_limit(trt.MemoryPoolType.WORKSPACE, 1 << 20)  # 1MB
    # Optimization profile covering dynamic batch and camera counts.
    profile = builder.create_optimization_profile()
    profile.set_shape("input", [1, 1, 3, 3], [10, 10, 3, 3], [100, 20, 3, 3])
    config.add_optimization_profile(profile)
    # build_serialized_network replaces the deprecated build_engine API.
    serialized = builder.build_serialized_network(network, config)
    runtime = trt.Runtime(logger)
    engine = runtime.deserialize_cuda_engine(serialized)
    return engine
def run_trt_inference(engine, input_data: np.ndarray) -> np.ndarray:
    """Run one inference through the plugin-only engine.

    Args:
        engine: a built TensorRT engine with a single dynamic "input" tensor.
        input_data: matrices to invert; any shape ending in (3, 3).

    Returns:
        np.ndarray with the same shape as input_data, containing the inverses.
    """
    context = engine.create_execution_context()
    # Bind the concrete runtime shape for the dynamic input.
    context.set_input_shape("input", input_data.shape)
    input_np = np.ascontiguousarray(input_data, dtype=np.float32)
    # Allocate device buffers via torch (replaces the deprecated
    # torch.cuda.FloatTensor constructor flagged by the warning).
    d_input = torch.from_numpy(input_np).cuda()
    d_output = torch.empty(input_np.shape, dtype=torch.float32, device="cuda")
    context.execute_v2([d_input.data_ptr(), d_output.data_ptr()])
    # Wait for the kernel to finish before reading the result back.
    torch.cuda.synchronize()
    return d_output.cpu().numpy()
def torch_inverse(post_rots: torch.Tensor) -> torch.Tensor:
    """Reference matrix inversion computed with PyTorch (ground truth)."""
    inverted = torch.linalg.inv(post_rots)
    return inverted
def verify_with_npy(data_path: str) -> Tuple[bool, dict]:
    """Validate the plugin against matrices loaded from a .npy file.

    Args:
        data_path: path to the .npy file holding the input matrices.

    Returns:
        Tuple of (passed, statistics dict) from verify_with_tensor.
    """
    if not os.path.exists(data_path):
        raise FileNotFoundError(f"数据文件不存在: {data_path}")
    loaded = np.load(data_path)
    print(f"加载数据: {data_path}")
    print(f"形状: {loaded.shape}, dtype: {loaded.dtype}")
    matrices = torch.from_numpy(loaded).float()
    return verify_with_tensor(matrices)
def verify_with_tensor(test_matrices: torch.Tensor) -> Tuple[bool, dict]:
    """Validate the TensorRT plugin against PyTorch on the given matrices.

    Args:
        test_matrices: input matrices with trailing shape (..., 3, 3).

    Returns:
        Tuple of (passed, statistics dict).
    """
    # Register the plugin and build the single-layer engine.
    load_trt_plugins()
    engine = create_inverse_engine()
    print(f"测试数据形状: {test_matrices.shape}")
    # Reference (PyTorch) result vs. plugin (TensorRT) result.
    reference = torch_inverse(test_matrices)
    plugin_out = torch.from_numpy(run_trt_inference(engine, test_matrices.numpy()))
    # Element-wise absolute error between the two implementations.
    abs_diff = (reference - plugin_out).abs()
    max_error = float(abs_diff.max())
    mean_error = float(abs_diff.mean())
    # Sanity check on the reference itself: A @ A^-1 should equal I.
    product = torch.bmm(test_matrices.view(-1, 3, 3), reference.view(-1, 3, 3))
    product = product.view(test_matrices.shape)
    identity = torch.eye(3).unsqueeze(0).expand_as(product)
    inv_max_error = float((product - identity).abs().max())
    stats = {
        "num_samples": test_matrices.shape[0],
        "max_error": max_error,
        "mean_error": mean_error,
        "inverse_property_error": inv_max_error,
        "passed": max_error < 1e-5 and inv_max_error < 1e-5,
    }
    return stats["passed"], stats
# Usage: python test_inverse_plugin.py /path/to/post_rots.npy
if __name__ == "__main__":
    print("=" * 50)
    print("TensorRT MatrixInverse Plugin Verification")
    print("=" * 50)
    # Validate command-line arguments before touching any data; a missing
    # argument previously raised a bare IndexError.
    if len(sys.argv) < 2:
        print("Usage: python test_inverse_plugin.py <path/to/matrices.npy>")
        sys.exit(1)
    passed, stats = verify_with_npy(sys.argv[1])
    print(f"\n Test Results:")
    print(f" Samples: {stats.get('num_samples', stats.get('num_tests', 'N/A'))}")
    print(f" Max Error (vs PyTorch): {stats['max_error']:.2e}")
    print(f" Mean Error (vs PyTorch): {stats['mean_error']:.2e}")
    print(f" A @ A⁻¹ = I Error: {stats['inverse_property_error']:.2e}")
    print(f"\n{'PASSED' if passed else 'FAILED'}")
    print("=" * 50)
结果
shell
Loading plugin: /workspace/current_workspace/cpp/build/lib/liblss_trt_plugins.so
Plugin loaded successfully!
/workspace/current_workspace/py/test_inverse_plugin.py:56: DeprecationWarning: Use set_memory_pool_limit instead.
config.max_workspace_size = 1 << 20 # 1MB
/workspace/current_workspace/py/test_inverse_plugin.py:63: DeprecationWarning: Use build_serialized_network instead.
engine = builder.build_engine(network, config)
测试数据形状: torch.Size([1, 5, 3, 3])
/workspace/current_workspace/py/test_inverse_plugin.py:79: UserWarning: The torch.cuda.*DtypeTensor constructors are no longer recommended. It's best to use methods such as torch.tensor(data, dtype=*, device='cuda') to create tensors. (Triggered internally at ../torch/csrc/tensor/python_tensor.cpp:78.)
d_input = torch.cuda.FloatTensor(input_np).cuda()
Test Results:
Samples: 1
Max Error (vs PyTorch): 4.77e-07
Mean Error (vs PyTorch): 4.34e-08
A @ A⁻¹ = I Error: 3.73e-09
PASSED
==================================================