使用：PyTorch C++ API

快速上手 PyTorch C++（LibTorch）部署

核心管线：图像/数据 → Tensor → model.forward → 解析结果

Part 1：环境搭建

组件	作用
LibTorch	PyTorch 的 C++ 分发包，从官网下载，别自己编译
OpenCV	图像读、写、预处理
CMake	构建工具
CUDA	（可选）GPU 推理

Part 2：Tensor 基础

PyTorch C++ 里 Tensor 是核心。但你不需要记所有 API，记住 4 个操作就够了：

cpp 复制代码

#include <torch/torch.h>
#include <iostream>

// Walk-through of the four tensor operations used throughout this guide:
// permute, unsqueeze, squeeze and to().
int main() {
    // Simulate an 8-bit HWC image of shape [480, 640, 3].
    // torch::randint excludes the upper bound, so 256 is required to cover 0..255.
    torch::Tensor img = torch::randint(0, 256, {480, 640, 3}, torch::kUInt8);

    // 1. permute -- reorder the dimensions (HWC -> CHW)
    img = img.permute({2, 0, 1});           // [3, 480, 640]

    // 2. unsqueeze -- insert a dimension (here: the batch dimension)
    img = img.unsqueeze(0);                  // [1, 3, 480, 640]

    // 3. squeeze -- drop a dimension
    img = img.squeeze(0);                    // [3, 480, 640]

    // 4. to -- change the dtype and/or the device
    img = img.to(torch::kFloat32);           // uint8 -> float32
    // Moving to CUDA throws on a CPU-only build or machine, so only do it
    // when a GPU is actually usable.
    if (torch::cuda::is_available()) {
        img = img.to(torch::kCUDA);          // CPU -> GPU
    }

    // The three things worth printing when debugging a tensor
    std::cout << img.sizes() << std::endl;   // shape
    std::cout << img.dtype() << std::endl;   // element type
    std::cout << img.device() << std::endl;  // CPU / CUDA
}

为什么是 HWC → CHW？

OpenCV 读图是 HWC 格式（行、列、通道），PyTorch 模型输入是 NCHW（batch、通道、高、宽）。

真实的 demo_001.cpp 运行效果

cpp 复制代码

#include <torch/torch.h>
#include <iostream>
using namespace std;

int main(){
    torch::Tensor x = torch::randn({224, 224, 3});
    cout << "原始 Tensor x:\n" << x.sizes() << " " << x.dtype() << " " << x.device() << endl;
    torch::Tensor y = torch::zeros({480, 640, 3}, torch::kUInt8);   // 数据类型为 uint8
    cout << "原始 Tensor y:\n" << y.sizes() << " " << y.dtype() << " " << y.device() << endl;
    // Tensor 的维度转换
    x = x.permute({2, 0, 1});       // HWC→CHW
    cout << "x 经过 permute 后:\n" << x.sizes() << " " << x.dtype() << " " << x.device() << endl;
    x = x.unsqueeze(0);             // 加 batch 维
    cout << "x 经过 unsqueeze 后:\n" << x.sizes() << " " << x.dtype() << " " << x.device() << endl;
    x = x.squeeze(0);               // 去 batch 维
    cout << "x 经过 squeeze 后:\n" << x.sizes() << " " << x.dtype() << " " << x.device() << endl;
    x = x.reshape({1, 3, 224, 224});// reshape 也可以改变维度，但不要求连续内存
    cout << "x 经过 reshape 后:\n" << x.sizes() << " " << x.dtype() << " " << x.device() << endl;

    // 转换数据类型
    y = y.to(torch::kCUDA, torch::kFloat32).div(255.0);
    cout << "y 经过 to + div 后:\n" << y.sizes() << " " << y.dtype() << " " << y.device() << endl;
    y = y.to(torch::kInt32);
    cout << "y 经过 to int32 后:\n" << y.sizes() << " " << y.dtype() << " " << y.device() << endl;

    return 0;
}

输出：

复制代码

原始 Tensor x:
[224, 224, 3] float cpu
原始 Tensor y:
[480, 640, 3] unsigned char cpu
x 经过 permute 后:
[3, 224, 224] float cpu
x 经过 unsqueeze 后:
[1, 3, 224, 224] float cpu
x 经过 squeeze 后:
[3, 224, 224] float cpu
x 经过 reshape 后:
[1, 3, 224, 224] float cpu
y 经过 to + div 后:
[480, 640, 3] float cuda:0
y 经过 to int32 后:
[480, 640, 3] int cuda:0

Part 3：预处理

图像部署中最常写的代码就是这部分。

cpp 复制代码

/// Converts an OpenCV image into a normalized NCHW float tensor on `device`.
///
/// Pipeline: resize to w x h, convert to RGB, scale to [0, 1], reorder
/// HWC -> NCHW, then normalize with the ImageNet per-channel mean / std.
///
/// @param img     8-bit image with 1 (gray), 3 (BGR) or 4 (BGRA) channels.
///                The caller's pixel data is never modified.
/// @param w       target width in pixels (must be > 0)
/// @param h       target height in pixels (must be > 0)
/// @param device  device the returned tensor is placed on
/// @return float32 tensor of shape [1, 3, h, w] on `device`
/// @throws c10::Error if the image is empty, not 8-bit, has an unsupported
///         channel count, or the target size is not positive
torch::Tensor preprocess(cv::Mat img, int w, int h, torch::Device device) {
    TORCH_CHECK(!img.empty(), "preprocess: input image is empty");
    TORCH_CHECK(img.depth() == CV_8U,
                "preprocess: expected an 8-bit image, got OpenCV depth ", img.depth());
    TORCH_CHECK(w > 0 && h > 0, "preprocess: target size must be positive, got ", w, "x", h);

    // A cv::Mat passed by value still shares its pixel buffer with the caller.
    // Writing the results into separate Mats guarantees the caller's image is
    // never overwritten (in-place resize/cvtColor would do that whenever the
    // input already has the target size).
    // 1. resize
    cv::Mat resized;
    cv::resize(img, resized, cv::Size(w, h));

    // 2. -> RGB (OpenCV loads colour images as BGR)
    cv::Mat rgb;
    switch (resized.channels()) {
        case 1:
            cv::cvtColor(resized, rgb, cv::COLOR_GRAY2RGB);
            break;
        case 3:
            cv::cvtColor(resized, rgb, cv::COLOR_BGR2RGB);
            break;
        case 4:
            cv::cvtColor(resized, rgb, cv::COLOR_BGRA2RGB);
            break;
        default:
            TORCH_CHECK(false, "preprocess: unsupported channel count ", resized.channels());
    }

    // from_blob reads the buffer as one dense block, so make sure it is one.
    if (!rgb.isContinuous()) {
        rgb = rgb.clone();
    }

    // 3. Mat -> Tensor. from_blob does not take ownership of the buffer, so
    //    clone() is required for the tensor to outlive `rgb`.
    torch::Tensor t = torch::from_blob(
        rgb.data, {rgb.rows, rgb.cols, 3}, torch::kUInt8
    ).clone();

    // 4. uint8 -> float32 in [0, 1]
    t = t.to(torch::kFloat32).div(255.0);

    // 5. HWC -> NCHW
    t = t.permute({2, 0, 1}).unsqueeze(0);

    // 6. normalize (ImageNet statistics), broadcast over N, H and W
    const torch::Tensor mean = torch::tensor({0.485, 0.456, 0.406}).view({1, 3, 1, 1});
    const torch::Tensor stddev = torch::tensor({0.229, 0.224, 0.225}).view({1, 3, 1, 1});
    t = t.sub(mean).div(stddev);

    return t.contiguous().to(device);
}

经常踩的坑

#	问题	后果
1	BGR → RGB 忘了	颜色颠倒，结果全错
2	HWC → CHW 忘了	模型崩溃或结果离谱
3	忘了 /255	输入范围不对，预测概率极低
4	`from_blob` 没 `clone`	Mat 释放后 Tensor 野指针，crash
5	CPU Tensor 给了 CUDA 模型	运行时崩溃

Part 4：模型推理

cpp 复制代码

#include <torch/script.h>

// 加载模型
torch::jit::script::Module model = torch::jit::load("model.pt");
model.to(device);
model.eval();

// 推理
torch::Tensor input = preprocess(img, 224, 224, device);

c10::InferenceMode guard;                  // 关闭梯度计算，更快
auto output = model.forward({input});
torch::Tensor result = output.toTensor();   // 如果是单 Tensor 输出

处理复杂的模型输出

不是所有模型都返回单个 Tensor。如果返回 Tuple：

cpp 复制代码

// Run the model, view its result as a tuple and read the fields by position.
// The (boxes, scores, labels) order is assumed here -- confirm it against the
// exported model.
auto output = model.forward({input}).toTuple();
const auto& fields = output->elements();
torch::Tensor boxes = fields[0].toTensor();
torch::Tensor scores = fields[1].toTensor();
torch::Tensor labels = fields[2].toTensor();

如果返回 Dict：

cpp 复制代码

auto dict = model.forward({input}).toGenericDict();
for (auto& item : dict) {
    std::cout << item.key() << std::endl;
}

Part 5：后处理

分类

cpp 复制代码

// Print the highest-scoring classes of the first sample.
// NOTE(review): `output` is assumed to be the [N, num_classes] logits tensor
// of the model -- confirm against the surrounding code.
constexpr int64_t kTopK = 5;
auto prob = torch::softmax(output, 1);
// topk throws when asked for more entries than the class dimension holds,
// so clamp k for models with fewer than five classes.
const int64_t k = prob.size(1) < kTopK ? prob.size(1) : kTopK;
auto [values, indices] = prob.topk(k, 1);
for (int i = 0; i < static_cast<int>(k); i++) {
    int cls = indices[0][i].item<int>();
    float score = values[0][i].item<float>();
    printf("  [%d] class=%d score=%.4f\n", i, cls, score);
}

检测 NMS

cpp 复制代码

// Plain record for one detection: a class id, a score and a box.
// The default member initializers make `Detection d;` well-defined (no
// indeterminate values) and equivalent to `Detection d{};`. The struct stays
// an aggregate, so `Detection{id, score, rect}` keeps working.
struct Detection {
    int class_id = 0;
    float score = 0.0f;
    cv::Rect box{};
};

/// Greedy non-maximum suppression: visit boxes from highest to lowest score,
/// keep a box unless an already kept box overlaps it by more than
/// `iou_thresh`, and return the indices of the kept boxes in that order.
///
/// @param boxes       tensor of shape [N, 4]. NOTE(review): rows are treated
///                    as corner format (x1, y1, x2, y2) -- confirm this matches
///                    the model output and convert beforehand if it emits
///                    (cx, cy, w, h) or (x, y, w, h).
/// @param scores      N scores, one per box (any shape with N elements)
/// @param iou_thresh  boxes whose IoU with a kept box is strictly greater
///                    than this value are suppressed
/// @return indices into `boxes` of the kept boxes, highest score first
/// @throws c10::Error if the shapes of `boxes` and `scores` do not match
std::vector<int> nms(torch::Tensor boxes, torch::Tensor scores, float iou_thresh) {
    TORCH_CHECK(boxes.dim() == 2 && boxes.size(1) == 4,
                "nms: boxes must have shape [N, 4]");
    const int64_t num_boxes = boxes.size(0);
    TORCH_CHECK(scores.numel() == num_boxes,
                "nms: expected one score per box, got ", scores.numel(),
                " scores for ", num_boxes, " boxes");

    std::vector<int> keep;
    if (num_boxes == 0) {
        return keep;
    }

    // Copy everything to the CPU once; calling item() per element would force
    // one device synchronization per box for CUDA tensors.
    const torch::Tensor boxes_cpu = boxes.to(torch::kCPU, torch::kFloat32).contiguous();
    const torch::Tensor order_cpu =
        std::get<1>(scores.reshape({-1}).sort(0, /*descending=*/true)).to(torch::kCPU).contiguous();
    const auto box = boxes_cpu.accessor<float, 2>();
    const auto order = order_cpu.accessor<int64_t, 1>();

    // Area of box k; degenerate boxes (x2 < x1 or y2 < y1) count as empty.
    const auto area = [&box](int64_t k) {
        const float bw = box[k][2] - box[k][0];
        const float bh = box[k][3] - box[k][1];
        return (bw > 0.0f && bh > 0.0f) ? bw * bh : 0.0f;
    };

    // vector<char> instead of vector<bool>: plain bytes, no proxy references.
    std::vector<char> suppressed(static_cast<size_t>(num_boxes), 0);
    for (int64_t i = 0; i < num_boxes; ++i) {
        const int64_t cur = order[i];
        if (suppressed[cur]) {
            continue;
        }
        keep.push_back(static_cast<int>(cur));

        const float cur_area = area(cur);
        // Only lower-scored boxes can still be suppressed by `cur`.
        for (int64_t j = i + 1; j < num_boxes; ++j) {
            const int64_t other = order[j];
            if (suppressed[other]) {
                continue;
            }
            // Intersection rectangle of `cur` and `other`.
            const float left   = box[cur][0] > box[other][0] ? box[cur][0] : box[other][0];
            const float top    = box[cur][1] > box[other][1] ? box[cur][1] : box[other][1];
            const float right  = box[cur][2] < box[other][2] ? box[cur][2] : box[other][2];
            const float bottom = box[cur][3] < box[other][3] ? box[cur][3] : box[other][3];
            const float iw = right - left;
            const float ih = bottom - top;
            if (iw <= 0.0f || ih <= 0.0f) {
                continue;  // no overlap
            }
            const float inter = iw * ih;
            const float uni = cur_area + area(other) - inter;
            if (uni > 0.0f && inter / uni > iou_thresh) {
                suppressed[other] = 1;
            }
        }
    }
    return keep;
}

Part 6：一个完整例子 --- MNIST 手写数字识别

让我们把上面所有步骤串起来。

第一步：Python 训练并导出模型

python 复制代码

import torch
import torch.nn as nn
import torch.nn.functional as F
import torch.optim as optim
from torchvision import datasets, transforms
from torch.utils.data import DataLoader
from PIL import Image
import os


# 网络
class MNIST_CNN(nn.Module):
    """Small convolutional classifier mapping [B, 1, 28, 28] images to [B, 10] logits.

    Two stages of conv(3x3, padding=1) -> ReLU -> 2x2 max-pool halve the
    spatial size each time (28 -> 14 -> 7).  The pooled features are then
    flattened and passed through two fully connected layers.
    """

    def __init__(self):
        super().__init__()
        # Stage 1: 1x28x28 -> 32x28x28 (-> 32x14x14 after pooling)
        self.conv1 = nn.Conv2d(in_channels=1, out_channels=32, kernel_size=3, padding=1)
        # Stage 2: 32x14x14 -> 64x14x14 (-> 64x7x7 after pooling)
        self.conv2 = nn.Conv2d(in_channels=32, out_channels=64, kernel_size=3, padding=1)
        # One parameter-free pooling layer shared by both stages
        self.pool = nn.MaxPool2d(kernel_size=2, stride=2)
        # Classifier head: 64*7*7 = 3136 -> 128 -> 10
        self.fc1 = nn.Linear(in_features=64 * 7 * 7, out_features=128)
        self.fc2 = nn.Linear(in_features=128, out_features=10)

    def forward(self, x):
        """Return unnormalized class scores of shape [B, 10] for x of shape [B, 1, 28, 28]."""
        features = x
        for conv in (self.conv1, self.conv2):
            # conv -> ReLU -> pool; ends at [B, 64, 7, 7] after both stages
            features = self.pool(F.relu(conv(features)))
        flat = features.view(features.size(0), -1)   # [B, 3136]
        hidden = F.relu(self.fc1(flat))              # [B, 128]
        return self.fc2(hidden)                      # [B, 10]


# 2. 训练
def train():
    """Train MNIST_CNN on MNIST for three epochs and print the test accuracy.

    Returns:
        tuple: (model, device) -- the trained network and the torch.device
        it was trained on.
    """
    device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
    print(f"[INFO] 使用设备: {device}")

    # ToTensor turns each image into a single-channel float tensor in [0, 1].
    to_tensor = transforms.Compose([transforms.ToTensor()])

    # The dataset is downloaded into ./data the first time this runs.
    print("[INFO] 加载 MNIST 数据集...")
    train_set = datasets.MNIST(root="./data", train=True, download=True, transform=to_tensor)
    test_set = datasets.MNIST(root="./data", train=False, download=True, transform=to_tensor)
    train_loader = DataLoader(train_set, batch_size=64, shuffle=True)
    test_loader = DataLoader(test_set, batch_size=1000, shuffle=False)

    model = MNIST_CNN().to(device)
    criterion = nn.CrossEntropyLoss()
    optimizer = optim.Adam(model.parameters(), lr=0.001)

    # Three passes over the training set.
    num_epochs = 3
    print("[INFO] 开始训练...")
    for epoch in range(1, num_epochs + 1):
        model.train()
        loss_sum = 0.0
        for batch_idx, (images, labels) in enumerate(train_loader):
            images = images.to(device)
            labels = labels.to(device)

            optimizer.zero_grad()
            loss = criterion(model(images), labels)
            loss.backward()
            optimizer.step()
            loss_sum += loss.item()

            # Progress line every 100 batches
            if batch_idx % 100 == 0:
                print(f"  Epoch {epoch}, Batch {batch_idx}/{len(train_loader)}, Loss: {loss.item():.4f}")

        avg_loss = loss_sum / len(train_loader)
        print(f"[Epoch {epoch}] 平均 Loss: {avg_loss:.4f}")

    # Accuracy on the held-out test split, without gradient tracking
    model.eval()
    hits, seen = 0, 0
    with torch.no_grad():
        for images, labels in test_loader:
            images, labels = images.to(device), labels.to(device)
            predictions = model(images).argmax(dim=1)
            hits += predictions.eq(labels).sum().item()
            seen += labels.size(0)
    acc = 100.0 * hits / seen
    print(f"[INFO] 测试准确率: {acc:.2f}%")

    return model, device


# 3. 导出 TorchScript
def export_torchscript(model, device, save_path="models/mnist_cnn.pt"):
    """Trace `model` to TorchScript, save it to `save_path` and sanity-check the file.

    Args:
        model: module that accepts a [1, 1, 28, 28] float tensor.  It is
            switched to eval mode and moved to the CPU in place.
        device: not used by this function; kept so existing callers keep working.
        save_path: destination file.  Missing parent directories are created;
            a bare file name (no directory part) is saved into the current
            working directory.
    """
    # os.path.dirname() returns "" for a bare file name and os.makedirs("")
    # raises, so only create a directory when the path actually has one.
    target_dir = os.path.dirname(save_path)
    if target_dir:
        os.makedirs(target_dir, exist_ok=True)
    model.eval()
    model.to("cpu")

    # Export by tracing: run one example input through forward() and record
    # the executed operations.
    example_input = torch.randn(1, 1, 28, 28)  # [batch=1, channel=1, H=28, W=28]
    traced_model = torch.jit.trace(model, example_input)
    traced_model.save(save_path)
    print(f"[OK] TorchScript 模型已保存: {save_path}")

    # Verification: reload the saved file and run one inference.
    loaded = torch.jit.load(save_path)
    loaded.eval()
    with torch.no_grad():
        test_input = torch.randn(1, 1, 28, 28)
        out = loaded(test_input)
        print(f"[OK] 模型验证通过，输出 shape: {out.shape}")  # expected: [1, 10]


# 4. 导出一张测试图片
def export_test_image():
    """Save the first MNIST test sample as images/mnist_test.png.

    The PNG gives the C++ program an image it can read with OpenCV to check
    its own inference result.
    """
    os.makedirs("images", exist_ok=True)
    test_dataset = datasets.MNIST(
        root="./data",
        train=False,
        download=True,
        transform=transforms.ToTensor(),
    )

    # Sample 0 of the test split (noted by the original author as the digit 7)
    img_tensor, label = test_dataset[0]
    print(f"[INFO] 测试图片标签: {label}")

    # Drop the channel axis, rescale from [0, 1] to [0, 255], store as 8-bit PNG
    pixels = img_tensor.squeeze(0).numpy() * 255
    Image.fromarray(pixels.astype("uint8")).save("images/mnist_test.png")
    print(f"[OK] 测试图片已保存: images/mnist_test.png (标签={label})")


# main
if __name__ == "__main__":
    # Train, export the TorchScript model, export one sample image, then list
    # the files that were produced.
    model, device = train()
    export_torchscript(model, device)
    export_test_image()
    for line in (
        "\n[DONE] 全部完成！",
        "  - 模型: models/mnist_cnn.pt",
        "  - 测试图: images/mnist_test.png",
    ):
        print(line)

第二步：Python 推理比对

完整 Python 推理代码，与下面第三步的 C++ 流程一一对应，可用来比对两边的推理结果：

python 复制代码

import torch
import torch.nn.functional as F
import numpy as np
import cv2
import sys
import os


def preprocess_mnist(image_path: str, device: torch.device) -> torch.Tensor:
    """Load an image file as a [1, 1, 28, 28] float32 tensor in [0, 1] on `device`.

    The file is read as grayscale and resized to 28x28 when it has another size.

    Raises:
        FileNotFoundError: if OpenCV cannot read `image_path`.
    """
    gray = cv2.imread(image_path, cv2.IMREAD_GRAYSCALE)
    if gray is None:
        raise FileNotFoundError(f"无法读取图片: {image_path}")

    # The network expects 28x28 input; rescale anything else and say so.
    if gray.shape != (28, 28):
        gray = cv2.resize(gray, (28, 28))
        print("[WARN] 图片尺寸不是 28x28，已自动 resize")

    # uint8 -> float32 in [0, 1]
    scaled = torch.from_numpy(gray.astype(np.float32)) / 255.0
    # [H, W] -> [1, 1, H, W]: prepend the batch and channel axes
    return scaled[None, None].to(device)


def postprocess_mnist(output: torch.Tensor) -> int:
    """Turn [1, 10] logits into a predicted class, printing a probability report.

    Prints the winning class, its confidence and one bar per class (one '#'
    per 2% probability), then returns the winning class index.
    """
    prob = torch.softmax(output, dim=1)  # [1, 10]
    top = prob.max(dim=1)                # named tuple (values, indices)

    predicted_class = top.indices.item()
    confidence_val = top.values.item()

    print("\n========== 预测结果 ==========")
    print(f"  类别: {predicted_class}")
    print(f"  置信度: {confidence_val * 100:.2f}%")

    print("\n  各类别概率:")
    per_class = prob.cpu().squeeze(0).tolist()
    for i in range(10):
        p = per_class[i]
        bar = "#" * int(p * 50)
        print(f"    {i}: {bar} {p * 100:.2f}%")

    return predicted_class


def main():
    """Run one MNIST inference: preprocess an image, load the model, predict, report.

    Usage: script.py [model_path image_path]; the default paths are used
    unless both arguments are given.
    """
    # Command line: both paths or neither
    if len(sys.argv) >= 3:
        model_path, image_path = sys.argv[1], sys.argv[2]
    else:
        model_path, image_path = "models/mnist_cnn.pt", "images/mnist_test.png"
        print(f"[INFO] 使用默认路径:")
        print(f"  模型: {model_path}")
        print(f"  图片: {image_path}")

    # Prefer the GPU when one is available
    device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
    print(f"[INFO] 使用设备: {device}")

    # Stop at the first input file that does not exist
    for f in (model_path, image_path):
        if not os.path.exists(f):
            print(f"[ERROR] 文件不存在: {f}")
            return

    # Step 1: image -> input tensor
    input_tensor = preprocess_mnist(image_path, device)
    print(f"[OK] 预处理完成, tensor shape: {input_tensor.shape}, device: {input_tensor.device}")

    # Step 2: load the TorchScript model onto the chosen device
    print(f"[INFO] 加载模型: {model_path}")
    try:
        model = torch.jit.load(model_path, map_location=device)
        model.to(device)
        model.eval()
        print(f"[OK] 模型加载成功, device: {device}")
    except Exception as e:
        print(f"[ERROR] 加载模型失败: {e}")
        return

    # Step 3: forward pass in inference mode
    print("[INFO] 开始推理...")
    with torch.inference_mode():
        output = model(input_tensor)
    print(f"[OK] 推理完成, output shape: {output.shape}")

    # Step 4: logits -> predicted digit
    predicted = postprocess_mnist(output)
    print(f"\n[DONE] 预测数字: {predicted}")


if __name__ == "__main__":
    main()

第三步：C++ 推理

完整代码。你可以分成 5 步来理解：

cpp 复制代码

#include <torch/torch.h>
#include <torch/script.h>
#include <opencv2/opencv.hpp>
#include <iostream>
#include <vector>

using namespace std;

// Convert an OpenCV image into the network input: a float32 tensor of shape
// [1, 1, 28, 28] with values in [0, 1], placed on `device`.
torch::Tensor preprocess_mnist(const cv::Mat& image, torch::Device device) {
    // Collapse 3- and 4-channel images to a single gray channel; any other
    // input is copied as-is, so the caller's Mat is never written to.
    cv::Mat gray;
    switch (image.channels()) {
        case 3:
            cv::cvtColor(image, gray, cv::COLOR_BGR2GRAY);
            break;
        case 4:
            cv::cvtColor(image, gray, cv::COLOR_BGRA2GRAY);
            break;
        default:
            gray = image.clone();
            break;
    }

    // The network expects 28x28 input; rescale anything else and say so.
    const bool needs_resize = !(gray.rows == 28 && gray.cols == 28);
    if (needs_resize) {
        cv::resize(gray, gray, cv::Size(28, 28));
        cout << "[WARN] 图片尺寸不是 28x28，已自动 resize" << endl;
    }

    // from_blob only wraps the Mat's buffer without owning it; clone() gives
    // the tensor its own copy, which stays valid after `gray` is destroyed.
    const torch::Tensor pixels =
        torch::from_blob(gray.data, {gray.rows, gray.cols}, torch::kUInt8).clone();

    // uint8 -> float32 in [0, 1], then prepend the channel and batch
    // dimensions: [28, 28] -> [1, 28, 28] -> [1, 1, 28, 28]
    const torch::Tensor batched =
        pixels.to(torch::kFloat32).div(255.0).unsqueeze(0).unsqueeze(0);

    // Make the layout dense and move the tensor to the requested device
    return batched.contiguous().to(device);
}

// Turn the network output ([1, 10] logits) into a predicted digit. Prints the
// winning class, its confidence and one bar per class, then returns the class.
int postprocess_mnist(const torch::Tensor& output) {
    // logits -> probabilities along the class dimension
    const torch::Tensor prob = torch::softmax(output, 1);

    // max(dim) yields a (values, indices) pair
    const auto [top_prob, top_index] = prob.max(1);
    const int predicted_class = top_index.item<int>();
    const float confidence = top_prob.item<float>();

    cout << "\n========== 预测结果 ==========" << endl;
    cout << "  类别: " << predicted_class << endl;
    cout << "  置信度: " << confidence * 100 << "%" << endl;

    // One line per class; the bar gets one '#' per 2% of probability
    cout << "\n  各类别概率:" << endl;
    const torch::Tensor prob_cpu = prob.cpu().squeeze(0);  // [10]
    for (int digit = 0; digit < 10; ++digit) {
        const float p = prob_cpu[digit].item<float>();
        const string bar(static_cast<int>(p * 50), '#');
        cout << "    " << digit << ": " << bar << " " << p * 100 << "%" << endl;
    }

    return predicted_class;
}

// main
int main(int argc, char** argv) {
    // 参数解析
    string model_path;
    string image_path;

    if (argc >= 3) {
        model_path = argv[1];
        image_path = argv[2];
    } else {
        // 默认路径
        model_path = "models/mnist_cnn.pt";
        image_path = "images/mnist_test.png";
        cout << "[INFO] 使用默认路径:" << endl;
        cout << "  模型: " << model_path << endl;
        cout << "  图片: " << image_path << endl;
    }

    // 选择设备
    torch::Device device(torch::kCPU);
    if (torch::cuda::is_available()) {
        device = torch::Device(torch::kCUDA);
        cout << "[INFO] 使用 GPU (CUDA)" << endl;
    } else {
        cout << "[INFO] 使用 CPU" << endl;
    }

    // Step 1: 读图
    cv::Mat image = cv::imread(image_path, cv::IMREAD_GRAYSCALE);
    if (image.empty()) {
        cerr << "[ERROR] 无法读取图片: " << image_path << endl;
        return -1;
    }
    cout << "[OK] 图片已加载: " << image.cols << "x" << image.rows << ", 通道数=" << image.channels() << endl;

    // Step 2: 预处理 → Tensor
    auto input = preprocess_mnist(image, device);
    cout << "[OK] 预处理完成, tensor shape: " << input.sizes() << ", dtype: " << input.dtype() << ", device: " << input.device() << endl;

    // Step 3: 加载模型
    cout << "[INFO] 加载模型: " << model_path << endl;
    torch::jit::script::Module model;
    try {
        model = torch::jit::load(model_path);
        model.to(device);
        model.eval();
        cout << "[OK] 模型加载成功, device: " << (device.is_cuda() ? "CUDA" : "CPU") << endl;
    } catch (const std::exception& e) {
        cerr << "[ERROR] 加载模型失败: " << e.what() << endl;
        return -1;
    }

    // Step 4: 推理
    cout << "[INFO] 开始推理..." << endl;
    torch::Tensor output;
    {
        c10::InferenceMode guard;  // 推理模式，不计算梯度，更快
        auto result = model.forward({input});
        output = result.toTensor();
    }
    cout << "[OK] 推理完成, output shape: " << output.sizes() << endl;

    // Step 5: 后处理 → 打印结果
    int predicted = postprocess_mnist(output);

    cout << "\n[DONE] 预测数字: " << predicted << endl;
    return 0;
}