Windows上GPU版本的Paddle Inference 3.2.1安装和使用教程

Windows上Paddle Inference安装和使用教程

一、环境准备

1.1 系统要求

Windows 10/11 64位
Visual Studio 2019
CMake 3.10+
CUDA（根据版本选择）
cuDNN（根据版本选择）

1.2 下载链接

根据您的CUDA版本选择合适的包：

复制代码

# CUDA 11.8版本
paddle_inference_3.2.1_windows-x86-64_cuda11.8_cudnn8.6.0_trt8.5.1.7_mkl_avx_vs2019.zip

# CUDA 12.6版本  
paddle_inference_3.2.1_windows-x86-64_cuda12.6_cudnn9.5.1_trt10.5.0.18_mkl_avx_vs2019.zip

# CUDA 12.9版本
paddle_inference_3.2.1_windows-x86-64_cuda12.9_cudnn9.9.0_trt10.5.0.18_mkl_avx_vs2019.zip

下载地址：

安装包名称	下载地址
x86-64-cuda12.6-cudnn9.5.1-trt10.5.0.18-mkl-avx-vs2019-paddle-inference-3.0.0.zip	点我下载
x86-64-cuda11.8-cudnn8.6.0-trt8.5.1.7-mkl-avx-vs2019-paddle-inference-3.0.0.zip	点我下载
paddle-inference-3.0.0-cpu.zip	点我下载
paddle-inference-3.2.1-windows-x86-64-cuda12.6-cudnn9.5.1-trt10.5.0.18-mkl-avx-vs2019.zip	点我下载
paddle-inference-3.2.1-windows-x86-64-cuda12.9-cudnn9.9.0-trt10.5.0.18-mkl-avx-vs2019.zip	点我下载
paddle-inference-3.2.1-windows-x86-64-cuda11.8-cudnn8.6.0-trt8.5.1.7-mkl-avx-vs2019.zip	点我下载

二、安装步骤

2.1 安装依赖

powershell 复制代码

# 1. 安装Visual Studio 2019
# 确保安装"使用C++的桌面开发"工作负载

# 2. 安装对应版本的CUDA
# CUDA 11.8: https://developer.nvidia.com/cuda-11-8-0-download-archive
# CUDA 12.6: https://developer.nvidia.com/cuda-12-6-0-download-archive

# 3. 安装对应版本的cuDNN
# 从NVIDIA官网下载并解压到CUDA安装目录

2.2 解压Paddle Inference

powershell 复制代码

# 解压下载的zip文件到指定目录，例如：
D:\Paddle\paddle_inference

目录结构：

复制代码

paddle_inference/
├── paddle/
│   ├── include/     # 头文件
│   └── lib/         # 库文件
├── third_party/     # 第三方依赖
└── version.txt      # 版本信息

2.3 配置环境变量

powershell 复制代码

# 1. Append the Paddle Inference library directory to the user-level PATH.
#    `%PATH%` is cmd.exe syntax and is NOT expanded by PowerShell, and `setx`
#    truncates values longer than 1024 characters, so the .NET API is used instead.
$paddleLib = "D:\Paddle\paddle_inference\paddle\lib"
$userPath  = [Environment]::GetEnvironmentVariable("Path", "User")
[Environment]::SetEnvironmentVariable("Path", "$userPath;$paddleLib", "User")

# 2. CUDA environment variables (only needed if the CUDA installer did not set them).
#    Change the version folder (v11.8 / v12.6 / v12.9) to match the toolkit you installed.
$cudaPath = "C:\Program Files\NVIDIA GPU Computing Toolkit\CUDA\v11.8"
[Environment]::SetEnvironmentVariable("CUDA_PATH", $cudaPath, "User")
# Re-read PATH so the entry added in step 1 is preserved rather than overwritten.
$userPath = [Environment]::GetEnvironmentVariable("Path", "User")
[Environment]::SetEnvironmentVariable("Path", "$userPath;$cudaPath\bin;$cudaPath\libnvvp", "User")

# Open a new terminal afterwards: shells that are already running keep the old values.

三、CMake项目配置

3.1 CMakeLists.txt示例

cmake 复制代码

cmake_minimum_required(VERSION 3.10)
project(paddle_demo)

# Root directory of the unpacked Paddle Inference package.
set(PADDLE_INFERENCE_DIR "D:/Paddle/paddle_inference")

# Compiler settings.
# NOTE(review): C++17 is requested because recent Paddle headers presumably rely on
# C++17 features; VS2019 supports it. Confirm against the official demo CMakeLists.
set(CMAKE_CXX_STANDARD 17)
set(CMAKE_CXX_STANDARD_REQUIRED ON)
set(CMAKE_CXX_FLAGS "${CMAKE_CXX_FLAGS} /EHsc")

# Header search paths.
include_directories(
    ${PADDLE_INFERENCE_DIR}/paddle/include
    ${PADDLE_INFERENCE_DIR}/third_party/install/mklml/include
    ${PADDLE_INFERENCE_DIR}/third_party/install/onnxruntime/include
)

# Library search paths.
link_directories(
    ${PADDLE_INFERENCE_DIR}/paddle/lib
    ${PADDLE_INFERENCE_DIR}/third_party/install/mklml/lib
    ${PADDLE_INFERENCE_DIR}/third_party/install/onnxruntime/lib
)

# CUDA libraries (GPU builds only).
find_package(CUDA REQUIRED)
# FindCUDA does not define CUDA_LIBRARY_DIRS (the original call expanded to nothing);
# derive the 64-bit library directory from the toolkit root it does provide.
link_directories(${CUDA_TOOLKIT_ROOT_DIR}/lib/x64)

# The demo executable.
add_executable(inference_demo inference_demo.cpp)

# Libraries to link against.
target_link_libraries(inference_demo
    paddle_inference
    paddle_inference_c
    # other required libraries...
)

四、基础使用示例

4.1 C++示例代码

cpp 复制代码

#include <functional>  // std::multiplies
#include <iostream>
#include <numeric>
#include <vector>
#include "paddle_inference_api.h"

using namespace paddle_infer;

/// Minimal end-to-end example: load a model, feed an all-ones 1x3x224x224
/// float tensor to the first input, run inference and print the shape of the
/// first output.
///
/// @return 0 on success, 1 if the predictor could not be created, the model
///         exposes no inputs/outputs, or inference fails.
int main() {
    // 1. Create the configuration.
    Config config;

    // 2. Point it at the model and parameter files.
    config.SetModel("model.pdmodel", "model.pdiparams");

    // 3. Enable GPU (optional): 100 MB initial memory pool on GPU 0.
    config.EnableUseGpu(100, 0);

    // 4. Enable MKLDNN (CPU acceleration).
    // NOTE(review): presumably has no effect while GPU is enabled — confirm.
    config.EnableMKLDNN();

    // 5. Enable IR graph optimization.
    config.SwitchIrOptim(true);

    // 6. Create the predictor.
    auto predictor = CreatePredictor(config);
    if (!predictor) {
        std::cerr << "Failed to create predictor" << '\n';
        return 1;
    }

    // 7. Prepare the input data.
    const auto input_names = predictor->GetInputNames();
    if (input_names.empty()) {
        std::cerr << "Model has no inputs" << '\n';
        return 1;
    }
    auto input_tensor = predictor->GetInputHandle(input_names[0]);

    const std::vector<int> input_shape = {1, 3, 224, 224};
    const std::vector<float> input_data(1 * 3 * 224 * 224, 1.0f);

    input_tensor->Reshape(input_shape);
    input_tensor->CopyFromCpu(input_data.data());

    // 8. Run inference; do not read outputs if it failed.
    if (!predictor->Run()) {
        std::cerr << "Inference failed" << '\n';
        return 1;
    }

    // 9. Fetch the first output.
    const auto output_names = predictor->GetOutputNames();
    if (output_names.empty()) {
        std::cerr << "Model has no outputs" << '\n';
        return 1;
    }
    auto output_tensor = predictor->GetOutputHandle(output_names[0]);

    const std::vector<int> output_shape = output_tensor->shape();
    // Element count = product of all dimensions.
    const int output_size = std::accumulate(
        output_shape.begin(), output_shape.end(), 1, std::multiplies<int>());

    std::vector<float> output_data(output_size);
    output_tensor->CopyToCpu(output_data.data());

    // 10. Print the result shape.
    std::cout << "Output shape: ";
    for (const auto dim : output_shape) {
        std::cout << dim << " ";
    }
    std::cout << '\n';

    return 0;
}

4.2 Python示例代码

python 复制代码

import numpy as np
import paddle.inference as paddle_infer

def run_inference():
    """Run a single GPU/TensorRT forward pass on an all-ones input.

    Loads ``model.pdmodel`` / ``model.pdiparams``, feeds a float32 array of
    shape (1, 3, 224, 224) to the first input, prints the first output's shape
    and data, and returns that output.
    """
    # Configuration: model files, GPU and the optional TensorRT engine.
    cfg = paddle_infer.Config()
    cfg.set_prog_file("model.pdmodel")
    cfg.set_params_file("model.pdiparams")
    cfg.enable_use_gpu(memory_pool_init_size_mb=100, device_id=0)

    trt_options = dict(
        workspace_size=1 << 30,
        max_batch_size=1,
        min_subgraph_size=3,
        precision_mode=paddle_infer.PrecisionType.Half,
    )
    cfg.enable_tensorrt_engine(**trt_options)

    # Build the predictor from the finished configuration.
    engine = paddle_infer.create_predictor(cfg)

    # Feed the first input with an all-ones float32 array.
    first_input = engine.get_input_names()[0]
    in_handle = engine.get_input_handle(first_input)
    feed = np.ones((1, 3, 224, 224)).astype('float32')
    in_handle.copy_from_cpu(feed)

    # Execute the model.
    engine.run()

    # Copy the first output back to host memory.
    first_output = engine.get_output_names()[0]
    result = engine.get_output_handle(first_output).copy_to_cpu()

    print(f"Output shape: {result.shape}")
    print(f"Output data: {result}")

    return result


if __name__ == "__main__":
    run_inference()

五、常见问题解决

5.1 DLL加载失败

powershell 复制代码

# Make sure the following DLLs can be found at run time
# (in the same directory as your .exe, or in a directory listed in PATH):
# 1. paddle_inference.dll
# 2. CUDA runtime DLLs (cudart64_110.dll for CUDA 11.x, cudart64_12.dll for CUDA 12.x, etc.)
# 3. MKLDNN-related DLLs

# Temporary workaround: copy the DLLs next to your executable
# (adjust the destination to your own build output directory).
# Do NOT copy them into C:\Windows\System32: that requires administrator rights,
# pollutes the system directory and can shadow DLLs used by other applications.
copy "D:\Paddle\paddle_inference\paddle\lib\*.dll" "D:\your_project\build\Release\"

5.2 CUDA版本不匹配

复制代码

错误：cudaGetDeviceCount failed
解决方案：确保Paddle Inference版本与CUDA版本完全匹配

5.3 内存不足

cpp 复制代码

// Reduce GPU memory usage
config.EnableUseGpu(100, 0);  // 100 MB initial GPU memory pool on device 0 — NOTE(review): an initial pool size, presumably not a hard cap; confirm
config.EnableMemoryOptim();    // enable memory optimization

六、高级功能

6.1 TensorRT加速

cpp 复制代码

// Enable the TensorRT subgraph engine.
config.EnableTensorRtEngine(
    1 << 30,               // workspace size in bytes (1 GiB)
    1,                     // max batch size
    3,                     // min subgraph size
    PrecisionType::kHalf,  // precision mode (FP16)
    false,                 // use_static: do not serialize the built engine to disk
    false                  // use_calib_mode: no INT8 calibration
);

// Dynamic shape: a min, max and optimal shape must be supplied for every
// dynamic input; passing empty maps for max/optimal is not valid.
// "input" and the shapes below are placeholders; use your model's real input
// name and the range of shapes you expect at run time.
std::map<std::string, std::vector<int>> min_input_shape = {
    {"input", {1, 3, 224, 224}}
};
std::map<std::string, std::vector<int>> max_input_shape = {
    {"input", {1, 3, 448, 448}}
};
std::map<std::string, std::vector<int>> opt_input_shape = {
    {"input", {1, 3, 224, 224}}
};
config.SetTRTDynamicShapeInfo(min_input_shape, max_input_shape, opt_input_shape);

6.2 多线程推理

cpp 复制代码

// 设置线程数
config.SetCpuMathLibraryNumThreads(4);
config.EnableMKLDNN();

// 多预测器并行
std::vector<std::thread> threads;
for (int i = 0; i < 4; ++i) {
    threads.emplace_back([&]() {
        auto predictor = CreatePredictor(config);
        // 推理代码...
    });
}

七、性能优化建议

使用混合精度：FP16或INT8量化
批处理：适当增大batch size

启用所有优化：

cpp 复制代码

// Turn on IR optimization and memory optimization.
config.SwitchIrOptim(true);
config.EnableMemoryOptim();
// Profiling is a diagnostic aid, not an optimization.
// NOTE(review): presumably adds overhead — likely should be removed in production; confirm.
config.EnableProfile();  // performance profiling

使用ONNX Runtime后端（如果适用）

这个教程涵盖了从安装到基础使用的完整流程。根据您的具体需求，可以选择合适的CUDA版本和配置选项。