A Complete Guide to GPU Programming: From Graphics Rendering to General-Purpose Compute

0 Introduction

The GPU (Graphics Processing Unit) has evolved from a dedicated graphics-rendering device into a powerful general-purpose compute engine. Modern GPUs contain thousands of parallel cores and deliver remarkable performance in graphics rendering, scientific computing, machine learning, and beyond [1].

This article surveys GPU programming end to end: from the traditional graphics APIs (OpenGL, Vulkan) to general-purpose GPU compute frameworks (CUDA, OpenCL) to machine-learning acceleration. Each topic comes with a minimal runnable example and detailed build and run instructions, covering both Linux and Android. Whether you are a game developer, a researcher, or an AI engineer, you should find practical GPU programming knowledge here.


1 GPU Programming Overview

1.1 GPU Architecture Characteristics

Unlike a CPU, a GPU uses a SIMT (Single Instruction, Multiple Threads) execution model layered over wide SIMD hardware, providing a very large number of parallel compute units. CPUs excel at complex control flow, while GPUs hold an overwhelming advantage in massively parallel computation [2].

CPU vs. GPU comparison:

cpp
// CPU: serial processing; few cores, but strong control logic
int sum = 0;
for (int i = 0; i < n; i++) {
    if (data[i] > 0)  // complex branching is cheap on a CPU
        sum += data[i];
}

// GPU: parallel processing; thousands of cores, weaker control logic.
// All elements are processed simultaneously; divergent branches hurt
// performance.
__global__ void parallelSum(int* data, int n, int* result) {
    int idx = blockIdx.x * blockDim.x + threadIdx.x;
    if (idx < n) {
        atomicAdd(result, data[idx]);
    }
}

// @file: examples/cpu_gpu_comparison.cu

1.2 Main Areas of GPU Programming

Modern GPU programming falls into three broad categories: graphics rendering, general-purpose GPU compute (GPGPU), and machine-learning acceleration. Graphics rendering covers APIs such as OpenGL, Vulkan, and DirectX; GPGPU covers CUDA, OpenCL, compute shaders, and related technologies; machine-learning acceleration is provided by frameworks such as TensorFlow and PyTorch [3].

GPU programming technology stack:

GPU programming stack
├── Graphics rendering
│   ├── OpenGL ES (Android/Linux)
│   ├── Vulkan (cross-platform)
│   └── DirectX (Windows)
├── General-purpose compute (GPGPU)
│   ├── CUDA (NVIDIA GPUs)
│   ├── OpenCL (cross-platform)
│   └── Compute shaders
└── Machine learning
    ├── TensorFlow
    ├── PyTorch
    └── Dedicated inference engines

2 OpenGL Graphics Programming

2.1 A Basic OpenGL Triangle

OpenGL is the most widely used cross-platform graphics API. Below is a minimal program that draws a simple triangle on Linux [4].

C version (Linux):

c
#include <GL/glew.h>
#include <GLFW/glfw3.h>
#include <stdio.h>

int main(void) {
    // 1. Initialize GLFW
    if (!glfwInit()) {
        fprintf(stderr, "Failed to initialize GLFW\n");
        return -1;
    }

    // 2. Create a window with an OpenGL 3.3 core profile context
    //    (required by the #version 330 core shaders below)
    glfwWindowHint(GLFW_CONTEXT_VERSION_MAJOR, 3);
    glfwWindowHint(GLFW_CONTEXT_VERSION_MINOR, 3);
    glfwWindowHint(GLFW_OPENGL_PROFILE, GLFW_OPENGL_CORE_PROFILE);
    GLFWwindow* window = glfwCreateWindow(640, 480,
                                          "OpenGL Triangle",
                                          NULL, NULL);
    if (!window) {
        glfwTerminate();
        return -1;
    }
    glfwMakeContextCurrent(window);

    // 3. Initialize GLEW (glewExperimental is needed for core profiles)
    glewExperimental = GL_TRUE;
    if (glewInit() != GLEW_OK) {
        fprintf(stderr, "Failed to initialize GLEW\n");
        return -1;
    }

    // 4. Define the vertex data
    float vertices[] = {
        -0.5f, -0.5f, 0.0f,  // bottom left
         0.5f, -0.5f, 0.0f,  // bottom right
         0.0f,  0.5f, 0.0f   // top
    };

    unsigned int VBO, VAO;
    glGenVertexArrays(1, &VAO);
    glGenBuffers(1, &VBO);

    glBindVertexArray(VAO);
    glBindBuffer(GL_ARRAY_BUFFER, VBO);
    glBufferData(GL_ARRAY_BUFFER, sizeof(vertices), vertices,
                 GL_STATIC_DRAW);

    glVertexAttribPointer(0, 3, GL_FLOAT, GL_FALSE,
                         3 * sizeof(float), (void*)0);
    glEnableVertexAttribArray(0);

    // 5. A minimal vertex shader
    const char* vertexShaderSource =
        "#version 330 core\n"
        "layout (location = 0) in vec3 aPos;\n"
        "void main() {\n"
        "   gl_Position = vec4(aPos.x, aPos.y, aPos.z, 1.0);\n"
        "}\0";

    // 6. A minimal fragment shader
    const char* fragmentShaderSource =
        "#version 330 core\n"
        "out vec4 FragColor;\n"
        "void main() {\n"
        "   FragColor = vec4(1.0, 0.5, 0.2, 1.0);\n"
        "}\0";

    // 7. Compile the shaders
    unsigned int vertexShader = glCreateShader(GL_VERTEX_SHADER);
    glShaderSource(vertexShader, 1, &vertexShaderSource, NULL);
    glCompileShader(vertexShader);

    unsigned int fragmentShader = glCreateShader(GL_FRAGMENT_SHADER);
    glShaderSource(fragmentShader, 1, &fragmentShaderSource, NULL);
    glCompileShader(fragmentShader);

    // 8. Link the shader program
    unsigned int shaderProgram = glCreateProgram();
    glAttachShader(shaderProgram, vertexShader);
    glAttachShader(shaderProgram, fragmentShader);
    glLinkProgram(shaderProgram);

    glDeleteShader(vertexShader);
    glDeleteShader(fragmentShader);

    // 9. Render loop
    while (!glfwWindowShouldClose(window)) {
        glClear(GL_COLOR_BUFFER_BIT);

        glUseProgram(shaderProgram);
        glBindVertexArray(VAO);
        glDrawArrays(GL_TRIANGLES, 0, 3);

        glfwSwapBuffers(window);
        glfwPollEvents();
    }

    glfwTerminate();
    return 0;
}

// @file: opengl/triangle.c
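
For brevity, the example above skips shader error handling. A minimal sketch of the compile and link checks one would normally call right after glCompileShader()/glLinkProgram() (the helper names checkCompile and checkLink are ours, not part of any API):

c
// Print the GLSL info log if compilation failed
static void checkCompile(unsigned int shader, const char* label) {
    int ok;
    char log[512];
    glGetShaderiv(shader, GL_COMPILE_STATUS, &ok);
    if (!ok) {
        glGetShaderInfoLog(shader, sizeof(log), NULL, log);
        fprintf(stderr, "%s compile error: %s\n", label, log);
    }
}

// Print the info log if linking failed
static void checkLink(unsigned int program) {
    int ok;
    char log[512];
    glGetProgramiv(program, GL_LINK_STATUS, &ok);
    if (!ok) {
        glGetProgramInfoLog(program, sizeof(log), NULL, log);
        fprintf(stderr, "program link error: %s\n", log);
    }
}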

Compile and run (Linux):

bash
# Install dependencies
sudo apt-get install libglfw3-dev libglew-dev

# Compile
gcc -o triangle triangle.c -lglfw -lGLEW -lGL -lm

# Run
./triangle

# @file: opengl/build_triangle.sh

2.2 OpenGL ES (Android)

On Android, OpenGL ES is used from Java/Kotlin or through the NDK (C++). Below is a minimal NDK example [5].

C++ renderer (Android NDK):

cpp
#include <GLES3/gl3.h>
#include <android/log.h>
#include <android/native_window.h>

#define LOG_TAG "OpenGLDemo"
#define LOGI(...) __android_log_print(ANDROID_LOG_INFO, LOG_TAG, __VA_ARGS__)

class TriangleRenderer {
public:
    void init() {
        // Vertex shader
        const char* vertexShaderSource =
            "#version 300 es\n"
            "layout (location = 0) in vec3 aPos;\n"
            "void main() {\n"
            "   gl_Position = vec4(aPos.x, aPos.y, aPos.z, 1.0);\n"
            "}\n";

        // Fragment shader
        const char* fragmentShaderSource =
            "#version 300 es\n"
            "precision mediump float;\n"
            "out vec4 FragColor;\n"
            "void main() {\n"
            "   FragColor = vec4(1.0, 0.5, 0.2, 1.0);\n"
            "}\n";

        // Compile the shaders
        GLuint vertexShader = compileShader(GL_VERTEX_SHADER,
                                           vertexShaderSource);
        GLuint fragmentShader = compileShader(GL_FRAGMENT_SHADER,
                                             fragmentShaderSource);

        // Link the program
        program = glCreateProgram();
        glAttachShader(program, vertexShader);
        glAttachShader(program, fragmentShader);
        glLinkProgram(program);

        glDeleteShader(vertexShader);
        glDeleteShader(fragmentShader);

        // Set up the vertex data
        float vertices[] = {
            -0.5f, -0.5f, 0.0f,
             0.5f, -0.5f, 0.0f,
             0.0f,  0.5f, 0.0f
        };

        glGenVertexArrays(1, &VAO);
        glGenBuffers(1, &VBO);

        glBindVertexArray(VAO);
        glBindBuffer(GL_ARRAY_BUFFER, VBO);
        glBufferData(GL_ARRAY_BUFFER, sizeof(vertices), vertices,
                     GL_STATIC_DRAW);

        glVertexAttribPointer(0, 3, GL_FLOAT, GL_FALSE,
                             3 * sizeof(float), (void*)0);
        glEnableVertexAttribArray(0);

        glBindVertexArray(0);

        LOGI("OpenGL ES initialized successfully");
    }

    void draw() {
        glClearColor(0.2f, 0.3f, 0.3f, 1.0f);
        glClear(GL_COLOR_BUFFER_BIT);

        glUseProgram(program);
        glBindVertexArray(VAO);
        glDrawArrays(GL_TRIANGLES, 0, 3);

        glFinish();
    }

private:
    GLuint program;
    GLuint VAO, VBO;

    GLuint compileShader(GLenum type, const char* source) {
        GLuint shader = glCreateShader(type);
        glShaderSource(shader, 1, &source, NULL);
        glCompileShader(shader);

        int success;
        glGetShaderiv(shader, GL_COMPILE_STATUS, &success);
        if (!success) {
            char infoLog[512];
            glGetShaderInfoLog(shader, 512, NULL, infoLog);
            LOGI("Shader compilation failed: %s", infoLog);
        }

        return shader;
    }
};

// @file: jni/triangle_renderer.cpp

CMakeLists.txt configuration:

cmake
cmake_minimum_required(VERSION 3.18.1)

project("opengles_demo")

# EGL and GLESv3 are system libraries shipped with the NDK; they are
# linked directly below (no find_package() modules exist for them)

add_library(opengles_demo SHARED
    jni/triangle_renderer.cpp
    jni/android_main.cpp
)

target_link_libraries(opengles_demo
    EGL
    GLESv3
    android
    log
)

# @file: jni/CMakeLists.txt
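
The CMake file references jni/android_main.cpp, which is not shown here. Before TriangleRenderer::init() can run, an EGL context must be created on the app's ANativeWindow. A minimal sketch of that bring-up (initEGL is our own name; error checking is mostly omitted, and a production app should prefer an ES3 config via EGL_OPENGL_ES3_BIT_KHR where available):

cpp
#include <EGL/egl.h>
#include <android/native_window.h>

// Minimal EGL bring-up for an OpenGL ES 3.x context on an ANativeWindow
static bool initEGL(ANativeWindow* win, EGLDisplay* outDpy,
                    EGLSurface* outSurf, EGLContext* outCtx) {
    EGLDisplay dpy = eglGetDisplay(EGL_DEFAULT_DISPLAY);
    eglInitialize(dpy, nullptr, nullptr);

    // Pick an RGB888 window config (simplified: takes the first match)
    const EGLint cfgAttribs[] = {
        EGL_RENDERABLE_TYPE, EGL_OPENGL_ES2_BIT,
        EGL_SURFACE_TYPE, EGL_WINDOW_BIT,
        EGL_RED_SIZE, 8, EGL_GREEN_SIZE, 8, EGL_BLUE_SIZE, 8,
        EGL_NONE
    };
    EGLConfig cfg;
    EGLint numCfg = 0;
    eglChooseConfig(dpy, cfgAttribs, &cfg, 1, &numCfg);

    // Request an OpenGL ES 3 context on a window surface
    const EGLint ctxAttribs[] = { EGL_CONTEXT_CLIENT_VERSION, 3, EGL_NONE };
    EGLSurface surf = eglCreateWindowSurface(dpy, cfg, win, nullptr);
    EGLContext ctx = eglCreateContext(dpy, cfg, EGL_NO_CONTEXT, ctxAttribs);

    eglMakeCurrent(dpy, surf, surf, ctx);
    *outDpy = dpy; *outSurf = surf; *outCtx = ctx;
    return eglGetError() == EGL_SUCCESS;
}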

3 Vulkan Graphics Programming

3.1 A Basic Vulkan Framework

Vulkan is the new generation of cross-platform graphics API, providing much lower-level access to the hardware. Below is a minimal Vulkan program skeleton [6].

Vulkan initialization (Linux):

cpp
#define GLFW_INCLUDE_VULKAN
#include <GLFW/glfw3.h>
#include <vector>
#include <iostream>

const uint32_t WIDTH = 800;
const uint32_t HEIGHT = 600;
const std::vector<const char*> validationLayers = {
    "VK_LAYER_KHRONOS_validation"
};

class HelloTriangleApplication {
public:
    void run() {
        initWindow();
        initVulkan();
        mainLoop();
        cleanup();
    }

private:
    GLFWwindow* window;
    VkInstance instance;
    VkPhysicalDevice physicalDevice = VK_NULL_HANDLE;
    VkDevice device;
    VkQueue graphicsQueue;

    void initWindow() {
        glfwInit();
        glfwWindowHint(GLFW_CLIENT_API, GLFW_NO_API);
        glfwWindowHint(GLFW_RESIZABLE, GLFW_FALSE);

        window = glfwCreateWindow(WIDTH, HEIGHT, "Vulkan",
                                   nullptr, nullptr);
    }

    void initVulkan() {
        createInstance();
        pickPhysicalDevice();
        createLogicalDevice();
    }

    void createInstance() {
        VkApplicationInfo appInfo{};
        appInfo.sType = VK_STRUCTURE_TYPE_APPLICATION_INFO;
        appInfo.pApplicationName = "Hello Triangle";
        appInfo.applicationVersion = VK_MAKE_VERSION(1, 0, 0);
        appInfo.pEngineName = "No Engine";
        appInfo.engineVersion = VK_MAKE_VERSION(1, 0, 0);
        appInfo.apiVersion = VK_API_VERSION_1_0;

        VkInstanceCreateInfo createInfo{};
        createInfo.sType = VK_STRUCTURE_TYPE_INSTANCE_CREATE_INFO;
        createInfo.pApplicationInfo = &appInfo;

        uint32_t glfwExtensionCount = 0;
        const char** glfwExtensions;
        glfwExtensions = glfwGetRequiredInstanceExtensions(&glfwExtensionCount);

        createInfo.enabledExtensionCount = glfwExtensionCount;
        createInfo.ppEnabledExtensionNames = glfwExtensions;
        // Note: the validationLayers declared above are not enabled in
        // this minimal example; pass them via enabledLayerCount /
        // ppEnabledLayerNames when debugging
        createInfo.enabledLayerCount = 0;

        if (vkCreateInstance(&createInfo, nullptr, &instance)
            != VK_SUCCESS) {
            throw std::runtime_error("failed to create instance!");
        }
    }

    void pickPhysicalDevice() {
        uint32_t deviceCount = 0;
        vkEnumeratePhysicalDevices(instance, &deviceCount, nullptr);

        if (deviceCount == 0) {
            throw std::runtime_error("failed to find GPUs with Vulkan support!");
        }

        std::vector<VkPhysicalDevice> devices(deviceCount);
        vkEnumeratePhysicalDevices(instance, &deviceCount, devices.data());

        physicalDevice = devices[0];  // pick the first GPU (a real app should check suitability)
    }

    void createLogicalDevice() {
        // Simplified: a real application must first query the queue
        // families (see the sketch after this code block)
        float queuePriority = 1.0f;
        VkDeviceQueueCreateInfo queueCreateInfo{};
        queueCreateInfo.sType = VK_STRUCTURE_TYPE_DEVICE_QUEUE_CREATE_INFO;
        queueCreateInfo.queueFamilyIndex = 0;
        queueCreateInfo.queueCount = 1;
        queueCreateInfo.pQueuePriorities = &queuePriority;

        VkPhysicalDeviceFeatures deviceFeatures{};

        VkDeviceCreateInfo createInfo{};
        createInfo.sType = VK_STRUCTURE_TYPE_DEVICE_CREATE_INFO;
        createInfo.pQueueCreateInfos = &queueCreateInfo;
        createInfo.queueCreateInfoCount = 1;
        createInfo.pEnabledFeatures = &deviceFeatures;
        createInfo.enabledExtensionCount = 0;
        createInfo.enabledLayerCount = 0;

        if (vkCreateDevice(physicalDevice, &createInfo, nullptr, &device)
            != VK_SUCCESS) {
            throw std::runtime_error("failed to create logical device!");
        }

        vkGetDeviceQueue(device, 0, 0, &graphicsQueue);
    }

    void mainLoop() {
        while (!glfwWindowShouldClose(window)) {
            glfwPollEvents();
        }
    }

    void cleanup() {
        vkDestroyDevice(device, nullptr);
        vkDestroyInstance(instance, nullptr);
        glfwDestroyWindow(window);
        glfwTerminate();
    }
};

int main() {
    HelloTriangleApplication app;

    try {
        app.run();
    } catch (const std::exception& e) {
        std::cerr << e.what() << std::endl;
        return EXIT_FAILURE;
    }

    return EXIT_SUCCESS;
}

// @file: vulkan/hello_triangle.cpp
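
As noted in the code, createLogicalDevice() hard-codes queue family 0, which is not guaranteed to support graphics. A sketch of the queue-family query a real application performs first (findGraphicsQueueFamily is our own helper name; <vector> is already included above):

cpp
// Return the index of a queue family with graphics support, or -1
static int findGraphicsQueueFamily(VkPhysicalDevice gpu) {
    uint32_t count = 0;
    vkGetPhysicalDeviceQueueFamilyProperties(gpu, &count, nullptr);
    std::vector<VkQueueFamilyProperties> families(count);
    vkGetPhysicalDeviceQueueFamilyProperties(gpu, &count, families.data());

    for (uint32_t i = 0; i < count; i++) {
        if (families[i].queueFlags & VK_QUEUE_GRAPHICS_BIT) {
            return (int)i;
        }
    }
    return -1;  // no graphics-capable queue family found
}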

Compile and run:

bash
# Install the Vulkan SDK
# Download: https://vulkan.lunarg.com/sdk/home

# Compile
g++ -o hello_triangle hello_triangle.cpp \
    -lvulkan -lglfw -std=c++17

# Run
./hello_triangle

# @file: vulkan/build.sh

4 General-Purpose GPU Computing with CUDA

4.1 CUDA Vector Addition

CUDA is NVIDIA's parallel computing platform and programming model. Below is the simplest possible vector addition program [7].

CUDA kernel code:

cpp
#include <stdio.h>
#include <math.h>   // for fabs() in the verification step
#include <cuda_runtime.h>

// CUDA kernel: runs on the GPU
__global__ void vectorAdd(const float* A, const float* B,
                          float* C, int N) {
    int idx = blockIdx.x * blockDim.x + threadIdx.x;

    if (idx < N) {
        C[idx] = A[idx] + B[idx];
    }
}

// Host code
int main(int argc, char** argv) {
    int N = 1024 * 1024;
    size_t size = N * sizeof(float);

    // Allocate host memory
    float *h_A = (float*)malloc(size);
    float *h_B = (float*)malloc(size);
    float *h_C = (float*)malloc(size);

    // Initialize the data
    for (int i = 0; i < N; i++) {
        h_A[i] = i * 1.0f;
        h_B[i] = i * 2.0f;
    }

    // Allocate device memory
    float *d_A, *d_B, *d_C;
    cudaMalloc(&d_A, size);
    cudaMalloc(&d_B, size);
    cudaMalloc(&d_C, size);

    // Copy the inputs to the GPU
    cudaMemcpy(d_A, h_A, size, cudaMemcpyHostToDevice);
    cudaMemcpy(d_B, h_B, size, cudaMemcpyHostToDevice);

    // Work out the execution configuration
    int threadsPerBlock = 256;
    int blocksPerGrid = (N + threadsPerBlock - 1) / threadsPerBlock;

    // Launch the CUDA kernel
    vectorAdd<<<blocksPerGrid, threadsPerBlock>>>(d_A, d_B, d_C, N);

    // Wait for the GPU to finish
    cudaDeviceSynchronize();

    // Copy the result back to the host
    cudaMemcpy(h_C, d_C, size, cudaMemcpyDeviceToHost);

    // Verify the result
    bool success = true;
    for (int i = 0; i < N; i++) {
        if (fabs(h_C[i] - (h_A[i] + h_B[i])) > 1e-5) {
            success = false;
            break;
        }
    }

    if (success) {
        printf("Vector addition verified successfully!\n");
    } else {
        printf("Vector addition verification FAILED!\n");
    }

    // Free memory
    free(h_A);
    free(h_B);
    free(h_C);
    cudaFree(d_A);
    cudaFree(d_B);
    cudaFree(d_C);

    return 0;
}

// @file: cuda/vector_add.cu

Compile and run:

bash
# Compile with nvcc
nvcc -o vector_add vector_add.cu

# Run
./vector_add

# Output: Vector addition verified successfully!

# @file: cuda/build.sh
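
Note that the example ignores CUDA API return codes; on failure it would silently produce wrong results. A common pattern is to wrap every runtime call in a checking macro (this CHECK macro is our own convention, not part of the CUDA toolkit):

cpp
#include <stdio.h>
#include <stdlib.h>
#include <cuda_runtime.h>

// Abort with a readable message if a CUDA runtime call fails
#define CHECK(call)                                                   \
    do {                                                              \
        cudaError_t err = (call);                                     \
        if (err != cudaSuccess) {                                     \
            fprintf(stderr, "CUDA error %s:%d: %s\n",                 \
                    __FILE__, __LINE__, cudaGetErrorString(err));     \
            exit(EXIT_FAILURE);                                       \
        }                                                             \
    } while (0)

// Usage:
//   CHECK(cudaMalloc(&d_A, size));
//   vectorAdd<<<blocksPerGrid, threadsPerBlock>>>(d_A, d_B, d_C, N);
//   CHECK(cudaGetLastError());        // catches kernel launch errors
//   CHECK(cudaDeviceSynchronize());   // catches errors raised during execution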

4.2 CUDA Matrix Multiplication

Matrix multiplication is a canonical GPU workload. Below are a naive implementation and a tiled version optimized with shared memory [8].

CUDA matrix multiplication:

cpp
#include <stdio.h>
#include <cuda_runtime.h>

#define TILE_SIZE 16

// Naive matrix multiplication kernel
__global__ void matrixMulSimple(float* C, const float* A,
                                 const float* B, int width) {
    int row = blockIdx.y * blockDim.y + threadIdx.y;
    int col = blockIdx.x * blockDim.x + threadIdx.x;

    if (row < width && col < width) {
        float sum = 0.0f;
        for (int k = 0; k < width; k++) {
            sum += A[row * width + k] * B[k * width + col];
        }
        C[row * width + col] = sum;
    }
}

// Tiled matrix multiplication using shared memory
__global__ void matrixMulTiled(float* C, const float* A,
                                const float* B, int width) {
    __shared__ float As[TILE_SIZE][TILE_SIZE];
    __shared__ float Bs[TILE_SIZE][TILE_SIZE];

    int row = blockIdx.y * TILE_SIZE + threadIdx.y;
    int col = blockIdx.x * TILE_SIZE + threadIdx.x;

    float sum = 0.0f;

    for (int tile = 0; tile < (width + TILE_SIZE - 1) / TILE_SIZE;
         tile++) {
        // Load one tile of A and B into shared memory
        if (row < width && tile * TILE_SIZE + threadIdx.x < width) {
            As[threadIdx.y][threadIdx.x] =
                A[row * width + tile * TILE_SIZE + threadIdx.x];
        } else {
            As[threadIdx.y][threadIdx.x] = 0.0f;
        }

        if (col < width && tile * TILE_SIZE + threadIdx.y < width) {
            Bs[threadIdx.y][threadIdx.x] =
                B[(tile * TILE_SIZE + threadIdx.y) * width + col];
        } else {
            Bs[threadIdx.y][threadIdx.x] = 0.0f;
        }

        __syncthreads();

        // Accumulate the partial dot product for this tile
        for (int k = 0; k < TILE_SIZE; k++) {
            sum += As[threadIdx.y][k] * Bs[k][threadIdx.x];
        }

        __syncthreads();
    }

    if (row < width && col < width) {
        C[row * width + col] = sum;
    }
}

int main() {
    int width = 1024;
    size_t size = width * width * sizeof(float);

    // Allocate memory
    float *h_A, *h_B, *h_C, *d_A, *d_B, *d_C;
    h_A = (float*)malloc(size);
    h_B = (float*)malloc(size);
    h_C = (float*)malloc(size);

    cudaMalloc(&d_A, size);
    cudaMalloc(&d_B, size);
    cudaMalloc(&d_C, size);

    // Initialize the matrices
    for (int i = 0; i < width * width; i++) {
        h_A[i] = 1.0f;
        h_B[i] = 1.0f;
    }

    cudaMemcpy(d_A, h_A, size, cudaMemcpyHostToDevice);
    cudaMemcpy(d_B, h_B, size, cudaMemcpyHostToDevice);

    // Execution configuration
    dim3 threadsPerBlock(TILE_SIZE, TILE_SIZE);
    dim3 blocksPerGrid((width + TILE_SIZE - 1) / TILE_SIZE,
                       (width + TILE_SIZE - 1) / TILE_SIZE);

    // Launch the kernel
    matrixMulTiled<<<blocksPerGrid, threadsPerBlock>>>(
        d_C, d_A, d_B, width);

    cudaDeviceSynchronize();
    cudaMemcpy(h_C, d_C, size, cudaMemcpyDeviceToHost);

    // Verify: every element of C should equal the matrix width
    printf("C[0] = %f (expected: %f)\n", h_C[0], width * 1.0f);

    // Free memory
    free(h_A);
    free(h_B);
    free(h_C);
    cudaFree(d_A);
    cudaFree(d_B);
    cudaFree(d_C);

    return 0;
}

// @file: cuda/matrix_mul.cu

Compile and run:

bash
# Compile
nvcc -o matrix_mul matrix_mul.cu -O3

# Run
./matrix_mul

# Output: C[0] = 1024.000000 (expected: 1024.000000)

# @file: cuda/build_matrix.sh
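
To actually compare matrixMulSimple against matrixMulTiled, GPU-side timing with CUDA events is more reliable than host clocks. A sketch, reusing the kernels and device pointers from matrix_mul.cu above:

cpp
// Time one kernel launch with CUDA events (measured on the GPU)
cudaEvent_t start, stop;
cudaEventCreate(&start);
cudaEventCreate(&stop);

cudaEventRecord(start);
matrixMulTiled<<<blocksPerGrid, threadsPerBlock>>>(d_C, d_A, d_B, width);
cudaEventRecord(stop);
cudaEventSynchronize(stop);

float ms = 0.0f;
cudaEventElapsedTime(&ms, start, stop);
printf("matrixMulTiled: %.3f ms\n", ms);

cudaEventDestroy(start);
cudaEventDestroy(stop);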

5 Cross-Platform Computing with OpenCL

5.1 OpenCL Vector Addition

OpenCL is an open, cross-platform parallel programming standard that targets CPUs, GPUs, DSPs, and other devices [9].

Host code:

c
#include <CL/cl.h>
#include <stdio.h>
#include <stdlib.h>

// OpenCL kernel source (as a string)
const char* kernelSource =
"__kernel void vector_add(__global const float* A,\n"
"                        __global const float* B,\n"
"                        __global float* C,\n"
"                        const int N) {\n"
"    int idx = get_global_id(0);\n"
"    if (idx < N) {\n"
"        C[idx] = A[idx] + B[idx];\n"
"    }\n"
"}\n";

int main() {
    int N = 1024 * 1024;
    size_t size = N * sizeof(float);

    // Host data
    float *h_A = (float*)malloc(size);
    float *h_B = (float*)malloc(size);
    float *h_C = (float*)malloc(size);

    for (int i = 0; i < N; i++) {
        h_A[i] = i * 1.0f;
        h_B[i] = i * 2.0f;
    }

    // Get a platform
    cl_platform_id platform;
    clGetPlatformIDs(1, &platform, NULL);

    // Get a GPU device
    cl_device_id device;
    clGetDeviceIDs(platform, CL_DEVICE_TYPE_GPU, 1, &device, NULL);

    // Create a context
    cl_context context = clCreateContext(NULL, 1, &device, NULL, NULL, NULL);

    // Create a command queue
    cl_command_queue queue = clCreateCommandQueue(context, device, 0, NULL);

    // Create the buffers
    cl_mem bufA = clCreateBuffer(context, CL_MEM_READ_ONLY, size, NULL, NULL);
    cl_mem bufB = clCreateBuffer(context, CL_MEM_READ_ONLY, size, NULL, NULL);
    cl_mem bufC = clCreateBuffer(context, CL_MEM_WRITE_ONLY, size, NULL, NULL);

    // Copy the inputs to the device
    clEnqueueWriteBuffer(queue, bufA, CL_TRUE, 0, size, h_A, 0, NULL, NULL);
    clEnqueueWriteBuffer(queue, bufB, CL_TRUE, 0, size, h_B, 0, NULL, NULL);

    // Create and build the program
    cl_program program = clCreateProgramWithSource(context, 1,
                                                   &kernelSource, NULL, NULL);
    clBuildProgram(program, 1, &device, NULL, NULL, NULL);

    // Create the kernel
    cl_kernel kernel = clCreateKernel(program, "vector_add", NULL);

    // Set the kernel arguments
    clSetKernelArg(kernel, 0, sizeof(cl_mem), &bufA);
    clSetKernelArg(kernel, 1, sizeof(cl_mem), &bufB);
    clSetKernelArg(kernel, 2, sizeof(cl_mem), &bufC);
    clSetKernelArg(kernel, 3, sizeof(int), &N);

    // Run the kernel
    size_t globalSize = N;
    clEnqueueNDRangeKernel(queue, kernel, 1, NULL, &globalSize, NULL,
                          0, NULL, NULL);

    // Read back the result
    clEnqueueReadBuffer(queue, bufC, CL_TRUE, 0, size, h_C, 0, NULL, NULL);

    // Verify
    printf("OpenCL vector add: C[0] = %f (expected: %f)\n", h_C[0], h_A[0] + h_B[0]);

    // Clean up
    clReleaseMemObject(bufA);
    clReleaseMemObject(bufB);
    clReleaseMemObject(bufC);
    clReleaseKernel(kernel);
    clReleaseProgram(program);
    clReleaseCommandQueue(queue);
    clReleaseContext(context);

    free(h_A);
    free(h_B);
    free(h_C);

    return 0;
}

// @file: opencl/vector_add.c

Compile and run (Linux + NVIDIA GPU):

bash
# Install the OpenCL headers and ICD loader
sudo apt-get install opencl-headers ocl-icd-opencl-dev

# Compile
gcc -o vector_add_opencl vector_add.c -lOpenCL

# Run
./vector_add_opencl

# @file: opencl/build.sh

Compile and run (Linux + AMD GPU):

bash
# Install AMD ROCm
# See: https://rocm.docs.amd.com/

# Compile
gcc -o vector_add_opencl vector_add.c \
    -I/opt/rocm/opencl/include \
    -L/opt/rocm/opencl/lib \
    -lOpenCL

# Run
./vector_add_opencl

# @file: opencl/build_amd.sh
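
One pitfall in the host code above: clBuildProgram() errors are ignored, so a kernel syntax error simply makes clCreateKernel() return NULL. A sketch of retrieving the build log (a drop-in replacement for the clBuildProgram() line in vector_add.c):

c
// Build the program and print the compiler log on failure
cl_int err = clBuildProgram(program, 1, &device, NULL, NULL, NULL);
if (err != CL_SUCCESS) {
    size_t logSize = 0;
    clGetProgramBuildInfo(program, device, CL_PROGRAM_BUILD_LOG,
                          0, NULL, &logSize);
    char* log = (char*)malloc(logSize + 1);
    clGetProgramBuildInfo(program, device, CL_PROGRAM_BUILD_LOG,
                          logSize, log, NULL);
    log[logSize] = '\0';
    fprintf(stderr, "OpenCL build log:\n%s\n", log);
    free(log);
}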

6 GPU Acceleration for Machine Learning

6.1 GPU Computing with PyTorch

PyTorch is one of the most popular deep-learning frameworks and supports CUDA acceleration [10].

PyTorch GPU example:

python
import torch
import time

# Check CUDA availability
print(f"CUDA available: {torch.cuda.is_available()}")
if torch.cuda.is_available():
    print(f"CUDA version: {torch.version.cuda}")
    print(f"GPU device: {torch.cuda.get_device_name(0)}")

# Matrix size
N = 4096

# Matrix multiplication on the CPU
print("\n=== CPU matrix multiplication ===")
a_cpu = torch.randn(N, N)
b_cpu = torch.randn(N, N)

start = time.time()
c_cpu = torch.matmul(a_cpu, b_cpu)
cpu_time = time.time() - start
print(f"CPU时间: {cpu_time:.4f}秒")

# GPU上的矩阵乘法
if torch.cuda.is_available():
    print("\n=== GPU矩阵乘法 ===")

    # 将数据移动到GPU
    a_gpu = a_cpu.cuda()
    b_gpu = b_cpu.cuda()

    # 预热GPU
    for _ in range(5):
        _ = torch.matmul(a_gpu, b_gpu)
    torch.cuda.synchronize()

    # 计时
    start = time.time()
    c_gpu = torch.matmul(a_gpu, b_gpu)
    torch.cuda.synchronize()
    gpu_time = time.time() - start

    print(f"GPU时间: {gpu_time:.4f}秒")
    print(f"加速比: {cpu_time / gpu_time:.2f}x")

    # 简单的神经网络示例
    print("\n=== GPU神经网络 ===")

    import torch.nn as nn
    import torch.optim as optim

    # Define the network
    class SimpleNet(nn.Module):
        def __init__(self):
            super(SimpleNet, self).__init__()
            self.fc1 = nn.Linear(784, 256)
            self.fc2 = nn.Linear(256, 128)
            self.fc3 = nn.Linear(128, 10)
            self.relu = nn.ReLU()

        def forward(self, x):
            x = self.relu(self.fc1(x))
            x = self.relu(self.fc2(x))
            x = self.fc3(x)
            return x

    # Create the model and move it to the GPU
    model = SimpleNet().cuda()
    criterion = nn.CrossEntropyLoss()
    optimizer = optim.SGD(model.parameters(), lr=0.01)

    # Simulated training loop
    batch_size = 64
    for epoch in range(3):
        # Generate random data
        inputs = torch.randn(batch_size, 784).cuda()
        labels = torch.randint(0, 10, (batch_size,)).cuda()

        # Forward pass
        optimizer.zero_grad()
        outputs = model(inputs)
        loss = criterion(outputs, labels)

        # Backward pass
        loss.backward()
        optimizer.step()

        print(f"Epoch {epoch+1}, Loss: {loss.item():.4f}")

# @file: ml/pytorch_gpu.py

Run (requires PyTorch with CUDA):

bash
# Install PyTorch (CUDA build)
pip install torch torchvision torchaudio

# Run
python pytorch_gpu.py

# Sample output:
# CUDA available: True
# GPU device: NVIDIA GeForce RTX 3090
# CPU time: 2.3456s
# GPU time: 0.0234s
# Speedup: 100.24x

# @file: ml/run_pytorch.sh
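
The script above calls .cuda() directly and therefore only runs on CUDA machines. A device-agnostic sketch of the same idea, which falls back to the CPU automatically:

python
import torch

# Pick the best available device once, then move tensors and models to it
device = torch.device("cuda" if torch.cuda.is_available() else "cpu")

model = torch.nn.Linear(784, 10).to(device)
x = torch.randn(64, 784, device=device)
logits = model(x)  # runs on the GPU if available, otherwise on the CPU
print(logits.device)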

6.2 GPU Acceleration with TensorFlow

TensorFlow, developed by Google, is another widely used deep-learning framework with GPU support [11].

TensorFlow GPU example:

python
import tensorflow as tf
import time

# Check for GPUs
print("TensorFlow version:", tf.__version__)
gpus = tf.config.list_physical_devices('GPU')
if gpus:
    print(f"Found {len(gpus)} GPU(s):")
    for gpu in gpus:
        print(f"  - {gpu.name}")
else:
    print("No GPU found, falling back to CPU")

# Matrix multiplication performance comparison
N = 4096

print("\n=== CPU matrix multiplication ===")
with tf.device('/CPU:0'):
    a_cpu = tf.random.normal((N, N))
    b_cpu = tf.random.normal((N, N))

    start = time.time()
    for _ in range(10):
        c_cpu = tf.matmul(a_cpu, b_cpu)
    cpu_time = (time.time() - start) / 10
    print(f"Average CPU time: {cpu_time:.4f}s")

if gpus:
    print("\n=== GPU matrix multiplication ===")
    with tf.device('/GPU:0'):
        a_gpu = tf.random.normal((N, N))
        b_gpu = tf.random.normal((N, N))

        # Warm-up
        for _ in range(5):
            _ = tf.matmul(a_gpu, b_gpu)

        start = time.time()
        for _ in range(10):
            c_gpu = tf.matmul(a_gpu, b_gpu)
        gpu_time = (time.time() - start) / 10

        print(f"Average GPU time: {gpu_time:.4f}s")
        print(f"Speedup: {cpu_time / gpu_time:.2f}x")

# A simple neural network
print("\n=== Neural network training ===")

# Load the data
(x_train, y_train), _ = tf.keras.datasets.mnist.load_data()
x_train = x_train.reshape(-1, 784).astype('float32') / 255.0
y_train = tf.one_hot(y_train, 10)

# Define the model
model = tf.keras.Sequential([
    tf.keras.layers.Dense(256, activation='relu', input_shape=(784,)),
    tf.keras.layers.Dense(128, activation='relu'),
    tf.keras.layers.Dense(10, activation='softmax')
])

model.compile(
    optimizer='adam',
    loss='categorical_crossentropy',
    metrics=['accuracy']
)

# Train (first 1000 samples for a quick demo)
start = time.time()
history = model.fit(
    x_train[:1000], y_train[:1000],
    epochs=3,
    batch_size=64,
    verbose=1
)
train_time = time.time() - start

print(f"\n训练完成,用时: {train_time:.2f}秒")

# @file: ml/tensorflow_gpu.py

Run:

bash
# Install TensorFlow (GPU support is included in recent releases)
pip install tensorflow

# Run
python tensorflow_gpu.py

# @file: ml/run_tensorflow.sh
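
By default TensorFlow reserves most of the GPU's memory at startup. When sharing a GPU with other processes, it is common to enable on-demand allocation instead (this must run before the GPU is first used):

python
import tensorflow as tf

# Allocate GPU memory on demand instead of grabbing it all up front
for gpu in tf.config.list_physical_devices('GPU'):
    tf.config.experimental.set_memory_growth(gpu, True)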

7 Compute Shaders on Android

7.1 OpenGL ES Compute Shaders

Android 5.0+ supports OpenGL ES 3.1, which introduced compute shaders for general-purpose GPU computation [12].

Compute shader code:

glsl
#version 310 es

// Work group size
layout (local_size_x = 128) in;

// Input and output buffers. Note: "output" is a reserved word in GLSL,
// so the output block instance is named result_buf.
layout (std430, binding = 0) buffer InputA {
    float data[];
} input_a;

layout (std430, binding = 1) buffer InputB {
    float data[];
} input_b;

layout (std430, binding = 2) buffer Output {
    float data[];
} result_buf;

void main() {
    uint idx = gl_GlobalInvocationID.x;

    // Vector addition
    result_buf.data[idx] = input_a.data[idx] + input_b.data[idx];
}

// @file: android/app/src/main/res/raw/compute.glsl

Java-side invocation code:

java
import android.content.Context;
import android.opengl.GLES31;
import java.nio.ByteBuffer;
import java.nio.ByteOrder;
import java.nio.FloatBuffer;

public class ComputeShader {
    private int program;
    private FloatBuffer bufferA;
    private FloatBuffer bufferB;
    private FloatBuffer bufferResult;
    private int ssboA, ssboB, ssboResult;
    private final int DATA_SIZE = 1024;

    public ComputeShader(Context context) {
        init();
    }

    private void init() {
        // 1. Create the compute shader program
        int computeShader = GLES31.glCreateShader(GLES31.GL_COMPUTE_SHADER);

        String shaderSource = "#version 310 es\n" +
            "layout (local_size_x = 128) in;\n" +
            "layout (std430, binding = 0) buffer InputA {\n" +
            "    float data[];\n" +
            "} input_a;\n" +
            "layout (std430, binding = 1) buffer InputB {\n" +
            "    float data[];\n" +
            "} input_b;\n" +
            "layout (std430, binding = 2) buffer Output {\n" +
            "    float data[];\n" +
            "} output;\n" +
            "void main() {\n" +
            "    uint idx = gl_GlobalInvocationID.x;\n" +
            "    output.data[idx] = input_a.data[idx] + input_b.data[idx];\n" +
            "}\n";

        GLES31.glShaderSource(computeShader, shaderSource);
        GLES31.glCompileShader(computeShader);

        program = GLES31.glCreateProgram();
        GLES31.glAttachShader(program, computeShader);
        GLES31.glLinkProgram(program);

        // 2. Prepare the data
        bufferA = ByteBuffer.allocateDirect(DATA_SIZE * 4)
            .order(ByteOrder.nativeOrder())
            .asFloatBuffer();
        bufferB = ByteBuffer.allocateDirect(DATA_SIZE * 4)
            .order(ByteOrder.nativeOrder())
            .asFloatBuffer();
        bufferResult = ByteBuffer.allocateDirect(DATA_SIZE * 4)
            .order(ByteOrder.nativeOrder())
            .asFloatBuffer();

        // Initialize the data
        for (int i = 0; i < DATA_SIZE; i++) {
            bufferA.put(i, i * 1.0f);
            bufferB.put(i, i * 2.0f);
        }

        // 3. Create the SSBOs
        int[] ssbos = new int[3];
        GLES31.glGenBuffers(3, ssbos, 0);
        ssboA = ssbos[0];
        ssboB = ssbos[1];
        ssboResult = ssbos[2];

        // Upload the inputs to their SSBOs
        bufferA.position(0);
        GLES31.glBindBuffer(GLES31.GL_SHADER_STORAGE_BUFFER, ssboA);
        GLES31.glBufferData(GLES31.GL_SHADER_STORAGE_BUFFER,
                           DATA_SIZE * 4, bufferA, GLES31.GL_STATIC_DRAW);

        bufferB.position(0);
        GLES31.glBindBuffer(GLES31.GL_SHADER_STORAGE_BUFFER, ssboB);
        GLES31.glBufferData(GLES31.GL_SHADER_STORAGE_BUFFER,
                           DATA_SIZE * 4, bufferB, GLES31.GL_STATIC_DRAW);

        // Create the (initially empty) output buffer
        GLES31.glBindBuffer(GLES31.GL_SHADER_STORAGE_BUFFER, ssboResult);
        GLES31.glBufferData(GLES31.GL_SHADER_STORAGE_BUFFER,
                           DATA_SIZE * 4, null, GLES31.GL_DYNAMIC_DRAW);
    }

    public void compute() {
        // Bind the SSBOs to their binding points
        GLES31.glBindBufferBase(GLES31.GL_SHADER_STORAGE_BUFFER, 0, ssboA);
        GLES31.glBindBufferBase(GLES31.GL_SHADER_STORAGE_BUFFER, 1, ssboB);
        GLES31.glBindBufferBase(GLES31.GL_SHADER_STORAGE_BUFFER, 2, ssboResult);

        // Dispatch the compute shader
        GLES31.glUseProgram(program);
        GLES31.glDispatchCompute(DATA_SIZE / 128, 1, 1);

        // Make the shader writes visible to the buffer read below
        // (GL_BUFFER_UPDATE_BARRIER_BIT covers map/read-back access)
        GLES31.glMemoryBarrier(GLES31.GL_BUFFER_UPDATE_BARRIER_BIT);

        // Read back the result. OpenGL ES has no glGetBufferSubData,
        // so map the buffer into client memory instead
        GLES31.glBindBuffer(GLES31.GL_SHADER_STORAGE_BUFFER, ssboResult);
        ByteBuffer mapped = (ByteBuffer) GLES31.glMapBufferRange(
                GLES31.GL_SHADER_STORAGE_BUFFER, 0, DATA_SIZE * 4,
                GLES31.GL_MAP_READ_BIT);
        mapped.order(ByteOrder.nativeOrder());
        bufferResult.position(0);
        bufferResult.put(mapped.asFloatBuffer());
        GLES31.glUnmapBuffer(GLES31.GL_SHADER_STORAGE_BUFFER);

        // Verify the result
        float expected = bufferA.get(0) + bufferB.get(0);
        float actual = bufferResult.get(0);
        android.util.Log.d("ComputeShader",
            "Result[0] = " + actual + " (expected: " + expected + ")");
    }
}

// @file: app/src/main/java/com/example/compute/ComputeShader.java
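
Compute shaders require an OpenGL ES 3.1 context, which not every device provides. A sketch of the usual capability check before instantiating ComputeShader (the GlesCapability class is our own; 0x00030001 encodes major version 3, minor version 1):

java
import android.app.ActivityManager;
import android.content.Context;
import android.content.pm.ConfigurationInfo;

public final class GlesCapability {
    // Returns true if the device reports OpenGL ES 3.1 or newer
    public static boolean supportsGles31(Context context) {
        ActivityManager am = (ActivityManager)
                context.getSystemService(Context.ACTIVITY_SERVICE);
        ConfigurationInfo info = am.getDeviceConfigurationInfo();
        return info.reqGlEsVersion >= 0x00030001;
    }
}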

8 Putting It All Together: Performance Comparison

8.1 Cross-Platform Performance Benchmark

Below is a combined benchmark that compares the compute performance of the different platforms and technologies [13].

Benchmark framework:

cpp
#include <stdio.h>
#include <stdlib.h>
#include <time.h>

// CPU version
void matrix_mul_cpu(float* C, const float* A, const float* B, int N) {
    for (int i = 0; i < N; i++) {
        for (int j = 0; j < N; j++) {
            float sum = 0.0f;
            for (int k = 0; k < N; k++) {
                sum += A[i * N + k] * B[k * N + j];
            }
            C[i * N + j] = sum;
        }
    }
}

#ifdef USE_CUDA
// CUDA version
__global__ void matrix_mul_cuda(float* C, const float* A,
                                 const float* B, int N) {
    int row = blockIdx.y * blockDim.y + threadIdx.y;
    int col = blockIdx.x * blockDim.x + threadIdx.x;

    if (row < N && col < N) {
        float sum = 0.0f;
        for (int k = 0; k < N; k++) {
            sum += A[row * N + k] * B[k * N + col];
        }
        C[row * N + col] = sum;
    }
}

void launch_cuda_kernel(float* d_C, float* d_A, float* d_B, int N) {
    dim3 threadsPerBlock(16, 16);
    dim3 blocksPerGrid((N + 15) / 16, (N + 15) / 16);
    matrix_mul_cuda<<<blocksPerGrid, threadsPerBlock>>>(d_C, d_A, d_B, N);
    cudaDeviceSynchronize();
}
#endif

double get_time() {
    struct timespec ts;
    clock_gettime(CLOCK_MONOTONIC, &ts);
    return ts.tv_sec + ts.tv_nsec * 1e-9;
}

int main(int argc, char** argv) {
    int N = 1024;
    size_t size = N * N * sizeof(float);

    // Allocate and initialize the data
    float *A = (float*)malloc(size);
    float *B = (float*)malloc(size);
    float *C_cpu = (float*)malloc(size);

    for (int i = 0; i < N * N; i++) {
        A[i] = 1.0f;
        B[i] = 1.0f;
    }

    printf("矩阵大小: %dx%d\n\n", N, N);

    // CPU性能测试
    printf("=== CPU性能测试 ===\n");
    double start = get_time();
    matrix_mul_cpu(C_cpu, A, B, N);
    double cpu_time = get_time() - start;
    printf("CPU时间: %.4f秒\n", cpu_time);
    printf("GFLOPS: %.2f\n", 2.0 * N * N * N / cpu_time / 1e9);

#ifdef USE_CUDA
    // CUDA benchmark
    printf("\n=== CUDA benchmark ===\n");
    float *d_A, *d_B, *d_C;
    cudaMalloc(&d_A, size);
    cudaMalloc(&d_B, size);
    cudaMalloc(&d_C, size);

    cudaMemcpy(d_A, A, size, cudaMemcpyHostToDevice);
    cudaMemcpy(d_B, B, size, cudaMemcpyHostToDevice);

    // Warm-up
    for (int i = 0; i < 5; i++) {
        launch_cuda_kernel(d_C, d_A, d_B, N);
    }

    start = get_time();
    for (int i = 0; i < 10; i++) {
        launch_cuda_kernel(d_C, d_A, d_B, N);
    }
    double cuda_time = (get_time() - start) / 10.0;

    printf("CUDA时间: %.4f秒\n", cuda_time);
    printf("GFLOPS: %.2f\n", 2.0 * N * N * N / cuda_time / 1e9);
    printf("加速比: %.2fx\n", cpu_time / cuda_time);

    cudaFree(d_A);
    cudaFree(d_B);
    cudaFree(d_C);
#endif

    free(A);
    free(B);
    free(C_cpu);

    return 0;
}

// @file: benchmark/performance_test.cpp

Compile the CPU version:

bash
g++ -o perf_cpu performance_test.cpp -O3 -std=c++11 -lrt
./perf_cpu

# @file: benchmark/build_cpu.sh

Compile the CUDA version:

bash
# -x cu makes nvcc treat the .cpp file as CUDA source (needed for __global__)
nvcc -x cu -o perf_cuda performance_test.cpp -O3 -DUSE_CUDA -std=c++11
./perf_cuda

# @file: benchmark/build_cuda.sh

References

[1] [NVIDIA CUDA Programming Guide](https://docs.nvidia.com/cuda/cuda-c-programming-guide/)
[2] [OpenGL Programming Guide](https://www.opengl.org/redbook/)
[3] [Vulkan Tutorial](https://vulkan-tutorial.com/)
[4] [Learn OpenGL - Getting Started](https://learnopengl.com/)
[5] [Android Graphics Architecture](https://source.android.com/devices/graphics/architecture)
[6] [OpenCL Specification](https://www.khronos.org/opencl/)
[7] [PyTorch CUDA Semantics](https://pytorch.org/docs/stable/cuda.html)
[8] [TensorFlow GPU Support](https://www.tensorflow.org/install/gpu)
[9] [GPU Gems](https://developer.nvidia.com/gpugems/)
[10] [CUDA C Best Practices Guide](https://docs.nvidia.com/cuda/cuda-c-best-practices-guide/)
[11] [OpenCL Programming Guide](https://www.khronos.org/opencl/opencl-book/)
[12] [OpenGL ES Compute Shaders](https://www.khronos.org/opengl/wiki/Compute_Shader)
[13] [GPU Roam: High-Performance GPU Computing](https://www.nvidia.com/en-us/data-center/)

c++·windows·图形渲染