0 Preface
The GPU (Graphics Processing Unit) has evolved from a dedicated graphics-rendering device into a powerful general-purpose compute engine. A modern GPU contains thousands of parallel execution units and delivers remarkable performance in graphics rendering, scientific computing, machine learning, and beyond[1].
This article surveys GPU programming end to end: the traditional graphics APIs (OpenGL, Vulkan), general-purpose GPU compute frameworks (CUDA, OpenCL), and machine-learning acceleration. Every topic comes with a minimal runnable example and detailed build/run instructions, covering both Linux and Android. Whether you are a game developer, a researcher, or an AI engineer, you should find practical GPU programming material here.
1 GPU Programming Overview
1.1 GPU Architecture Characteristics
Unlike a CPU, a GPU is built around a SIMD (Single Instruction, Multiple Data) style architecture with a very large number of parallel execution units (NVIDIA describes its variant as SIMT, Single Instruction Multiple Threads). A CPU excels at complex control logic, while a GPU holds a decisive throughput advantage on massively parallel workloads[2].
CPU vs. GPU comparison:
cpp
// CPU: serial processing; few cores, but strong at control flow
int sum = 0;
for (int i = 0; i < n; i++) {
    if (data[i] > 0)      // complex conditional logic is cheap here
        sum += data[i];
}

// GPU: parallel processing; thousands of cores, but weak control flow.
// All elements are processed simultaneously; divergent branches hurt performance.
// (The same positive-only condition is kept so both versions compute the same sum.)
__global__ void parallelSum(int* data, int n, int* result) {
    int idx = blockIdx.x * blockDim.x + threadIdx.x;
    if (idx < n && data[idx] > 0) {
        atomicAdd(result, data[idx]);
    }
}
// @file: examples/cpu_gpu_comparison.cu
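The comment above says divergent branches hurt GPU performance; here is why. Threads execute in groups of 32 (warps) that share one instruction stream, so when lanes of the same warp take different branches the hardware runs both paths back to back. A hypothetical sketch, not part of the original sample:
cpp
// Divergent: odd and even lanes take different branches, so every warp
// executes BOTH paths serially -- roughly halving throughput here.
__global__ void divergentKernel(float* out, const float* in, int n) {
    int idx = blockIdx.x * blockDim.x + threadIdx.x;
    if (idx >= n) return;
    if (idx % 2 == 0)
        out[idx] = in[idx] * 2.0f;
    else
        out[idx] = in[idx] + 1.0f;
}

// Uniform: all 32 lanes of a warp agree on the branch (whole warps
// alternate instead), so there is no intra-warp serialization.
__global__ void uniformKernel(float* out, const float* in, int n) {
    int idx = blockIdx.x * blockDim.x + threadIdx.x;
    if (idx >= n) return;
    if ((idx / 32) % 2 == 0)
        out[idx] = in[idx] * 2.0f;
    else
        out[idx] = in[idx] + 1.0f;
}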
1.2 Main Areas of GPU Programming
Modern GPU programming falls into three broad categories: graphics rendering, general-purpose GPU computing (GPGPU), and machine-learning acceleration. Graphics rendering covers APIs such as OpenGL, Vulkan, and DirectX; GPGPU covers CUDA, OpenCL, compute shaders, and related technologies; machine-learning acceleration is driven by frameworks such as TensorFlow and PyTorch[3].
GPU programming technology stack:
GPU programming stack
├── Graphics rendering
│   ├── OpenGL ES (Android/Linux)
│   ├── Vulkan (cross-platform)
│   └── DirectX (Windows)
├── General-purpose compute (GPGPU)
│   ├── CUDA (NVIDIA GPUs)
│   ├── OpenCL (cross-platform)
│   └── Compute Shaders
└── Machine learning
    ├── TensorFlow
    ├── PyTorch
    └── Dedicated inference engines
2 OpenGL Graphics Programming
2.1 A Basic OpenGL Triangle
OpenGL is one of the most widely used cross-platform graphics APIs. Below is a minimal program that draws a triangle on Linux[4].
C version (Linux):
c
#include <GL/glew.h>
#include <GLFW/glfw3.h>
#include <stdio.h>

int main(void) {
    // 1. Initialize GLFW
    if (!glfwInit()) {
        fprintf(stderr, "Failed to initialize GLFW\n");
        return -1;
    }
    // Request a 3.3 core-profile context to match the shaders below
    glfwWindowHint(GLFW_CONTEXT_VERSION_MAJOR, 3);
    glfwWindowHint(GLFW_CONTEXT_VERSION_MINOR, 3);
    glfwWindowHint(GLFW_OPENGL_PROFILE, GLFW_OPENGL_CORE_PROFILE);

    // 2. Create a window
    GLFWwindow* window = glfwCreateWindow(640, 480,
                                          "OpenGL Triangle",
                                          NULL, NULL);
    if (!window) {
        glfwTerminate();
        return -1;
    }
    glfwMakeContextCurrent(window);

    // 3. Initialize GLEW (glewExperimental is needed for core profiles)
    glewExperimental = GL_TRUE;
    if (glewInit() != GLEW_OK) {
        fprintf(stderr, "Failed to initialize GLEW\n");
        return -1;
    }

    // 4. Define the vertex data
    float vertices[] = {
        -0.5f, -0.5f, 0.0f,  // bottom left
         0.5f, -0.5f, 0.0f,  // bottom right
         0.0f,  0.5f, 0.0f   // top
    };
    unsigned int VBO, VAO;
    glGenVertexArrays(1, &VAO);
    glGenBuffers(1, &VBO);
    glBindVertexArray(VAO);
    glBindBuffer(GL_ARRAY_BUFFER, VBO);
    glBufferData(GL_ARRAY_BUFFER, sizeof(vertices), vertices,
                 GL_STATIC_DRAW);
    glVertexAttribPointer(0, 3, GL_FLOAT, GL_FALSE,
                          3 * sizeof(float), (void*)0);
    glEnableVertexAttribArray(0);

    // 5. A minimal vertex shader
    const char* vertexShaderSource =
        "#version 330 core\n"
        "layout (location = 0) in vec3 aPos;\n"
        "void main() {\n"
        "    gl_Position = vec4(aPos.x, aPos.y, aPos.z, 1.0);\n"
        "}\0";

    // 6. A minimal fragment shader
    const char* fragmentShaderSource =
        "#version 330 core\n"
        "out vec4 FragColor;\n"
        "void main() {\n"
        "    FragColor = vec4(1.0, 0.5, 0.2, 1.0);\n"
        "}\0";

    // 7. Compile the shaders
    unsigned int vertexShader = glCreateShader(GL_VERTEX_SHADER);
    glShaderSource(vertexShader, 1, &vertexShaderSource, NULL);
    glCompileShader(vertexShader);
    unsigned int fragmentShader = glCreateShader(GL_FRAGMENT_SHADER);
    glShaderSource(fragmentShader, 1, &fragmentShaderSource, NULL);
    glCompileShader(fragmentShader);

    // 8. Link the shader program
    unsigned int shaderProgram = glCreateProgram();
    glAttachShader(shaderProgram, vertexShader);
    glAttachShader(shaderProgram, fragmentShader);
    glLinkProgram(shaderProgram);
    glDeleteShader(vertexShader);
    glDeleteShader(fragmentShader);

    // 9. Render loop
    while (!glfwWindowShouldClose(window)) {
        glClear(GL_COLOR_BUFFER_BIT);
        glUseProgram(shaderProgram);
        glBindVertexArray(VAO);
        glDrawArrays(GL_TRIANGLES, 0, 3);
        glfwSwapBuffers(window);
        glfwPollEvents();
    }
    glfwTerminate();
    return 0;
}
// @file: opengl/triangle.c
Build and run (Linux):
bash
# Install dependencies
sudo apt-get install libglfw3-dev libglew-dev
# Build
gcc -o triangle triangle.c -lglfw -lGLEW -lGL -lm
# Run
./triangle
# @file: opengl/build_triangle.sh
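The listing above skips compile and link status checks to stay short; a failed shader otherwise fails silently and just yields a black window. A minimal check, using the same GL calls already introduced, might look like this:
c
/* Minimal compile-status check; call after glCompileShader(). */
void checkShaderCompile(GLuint shader, const char* label) {
    GLint ok = GL_FALSE;
    glGetShaderiv(shader, GL_COMPILE_STATUS, &ok);
    if (!ok) {
        char log[512];
        glGetShaderInfoLog(shader, sizeof(log), NULL, log);
        fprintf(stderr, "%s shader compile failed: %s\n", label, log);
    }
}
/* Link status is checked the same way with glGetProgramiv(program,
   GL_LINK_STATUS, ...) and glGetProgramInfoLog(). */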
2.2 OpenGL ES (Android)
On Android, OpenGL ES is driven either from Java/Kotlin or from C++ via the NDK. Below is a minimal NDK example[5].
C++ renderer (Android NDK):
cpp
#include <GLES3/gl3.h>
#include <android/log.h>
#include <android/native_window.h>

#define LOG_TAG "OpenGLDemo"
#define LOGI(...) __android_log_print(ANDROID_LOG_INFO, LOG_TAG, __VA_ARGS__)

class TriangleRenderer {
public:
    void init() {
        // Vertex shader
        const char* vertexShaderSource =
            "#version 300 es\n"
            "layout (location = 0) in vec3 aPos;\n"
            "void main() {\n"
            "    gl_Position = vec4(aPos.x, aPos.y, aPos.z, 1.0);\n"
            "}\n";
        // Fragment shader
        const char* fragmentShaderSource =
            "#version 300 es\n"
            "precision mediump float;\n"
            "out vec4 FragColor;\n"
            "void main() {\n"
            "    FragColor = vec4(1.0, 0.5, 0.2, 1.0);\n"
            "}\n";

        // Compile the shaders
        GLuint vertexShader = compileShader(GL_VERTEX_SHADER,
                                            vertexShaderSource);
        GLuint fragmentShader = compileShader(GL_FRAGMENT_SHADER,
                                              fragmentShaderSource);
        // Link the program
        program = glCreateProgram();
        glAttachShader(program, vertexShader);
        glAttachShader(program, fragmentShader);
        glLinkProgram(program);
        glDeleteShader(vertexShader);
        glDeleteShader(fragmentShader);

        // Set up the vertex data
        float vertices[] = {
            -0.5f, -0.5f, 0.0f,
             0.5f, -0.5f, 0.0f,
             0.0f,  0.5f, 0.0f
        };
        glGenVertexArrays(1, &VAO);
        glGenBuffers(1, &VBO);
        glBindVertexArray(VAO);
        glBindBuffer(GL_ARRAY_BUFFER, VBO);
        glBufferData(GL_ARRAY_BUFFER, sizeof(vertices), vertices,
                     GL_STATIC_DRAW);
        glVertexAttribPointer(0, 3, GL_FLOAT, GL_FALSE,
                              3 * sizeof(float), (void*)0);
        glEnableVertexAttribArray(0);
        glBindVertexArray(0);
        LOGI("OpenGL ES initialized successfully");
    }

    void draw() {
        glClearColor(0.2f, 0.3f, 0.3f, 1.0f);
        glClear(GL_COLOR_BUFFER_BIT);
        glUseProgram(program);
        glBindVertexArray(VAO);
        glDrawArrays(GL_TRIANGLES, 0, 3);
        // No eglSwapBuffers here; glFinish blocks until the GPU is done.
        // In real code, the caller would swap buffers instead.
        glFinish();
    }

private:
    GLuint program;
    GLuint VAO, VBO;

    GLuint compileShader(GLenum type, const char* source) {
        GLuint shader = glCreateShader(type);
        glShaderSource(shader, 1, &source, NULL);
        glCompileShader(shader);
        int success;
        glGetShaderiv(shader, GL_COMPILE_STATUS, &success);
        if (!success) {
            char infoLog[512];
            glGetShaderInfoLog(shader, 512, NULL, infoLog);
            LOGI("Shader compilation failed: %s", infoLog);
        }
        return shader;
    }
};
// @file: jni/triangle_renderer.cpp
CMakeLists.txt configuration:
cmake
cmake_minimum_required(VERSION 3.18.1)
project("opengles_demo")

add_library(opengles_demo SHARED
    jni/triangle_renderer.cpp
    jni/android_main.cpp
)

# EGL, GLESv3, android and log are NDK system libraries;
# they are linked by name -- no find_package() call is needed (or valid).
target_link_libraries(opengles_demo
    EGL
    GLESv3
    android
    log
)
# @file: jni/CMakeLists.txt
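The CMake file references jni/android_main.cpp, which is not shown, and the renderer above assumes a current EGL context already exists. A minimal sketch of creating one on an ANativeWindow (the helper name initEGL is illustrative, and real code should check every EGL call):
cpp
#include <EGL/egl.h>
#include <EGL/eglext.h>   // for EGL_OPENGL_ES3_BIT_KHR
#include <android/native_window.h>

// Hypothetical helper: create an ES 3.x context on a native window.
bool initEGL(ANativeWindow* window, EGLDisplay* outDpy,
             EGLSurface* outSurf, EGLContext* outCtx) {
    EGLDisplay dpy = eglGetDisplay(EGL_DEFAULT_DISPLAY);
    eglInitialize(dpy, nullptr, nullptr);

    const EGLint cfgAttribs[] = {
        EGL_RENDERABLE_TYPE, EGL_OPENGL_ES3_BIT_KHR,
        EGL_SURFACE_TYPE, EGL_WINDOW_BIT,
        EGL_RED_SIZE, 8, EGL_GREEN_SIZE, 8, EGL_BLUE_SIZE, 8,
        EGL_NONE
    };
    EGLConfig config;
    EGLint numConfigs = 0;
    eglChooseConfig(dpy, cfgAttribs, &config, 1, &numConfigs);
    if (numConfigs == 0) return false;

    EGLSurface surf = eglCreateWindowSurface(dpy, config, window, nullptr);
    const EGLint ctxAttribs[] = { EGL_CONTEXT_CLIENT_VERSION, 3, EGL_NONE };
    EGLContext ctx = eglCreateContext(dpy, config, EGL_NO_CONTEXT, ctxAttribs);
    if (!eglMakeCurrent(dpy, surf, surf, ctx)) return false;

    *outDpy = dpy; *outSurf = surf; *outCtx = ctx;
    return true;   // after each frame, call eglSwapBuffers(dpy, surf)
}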
3 Vulkan Graphics Programming
3.1 A Basic Vulkan Skeleton
Vulkan is the new generation of cross-platform graphics API, exposing much lower-level access to the hardware. Below is a minimal Vulkan program skeleton[6].
Vulkan initialization (Linux):
cpp
#define GLFW_INCLUDE_VULKAN
#include <GLFW/glfw3.h>

#include <cstdlib>
#include <iostream>
#include <stdexcept>
#include <vector>

const uint32_t WIDTH = 800;
const uint32_t HEIGHT = 600;

const std::vector<const char*> validationLayers = {
    "VK_LAYER_KHRONOS_validation"
};

class HelloTriangleApplication {
public:
    void run() {
        initWindow();
        initVulkan();
        mainLoop();
        cleanup();
    }

private:
    GLFWwindow* window;
    VkInstance instance;
    VkPhysicalDevice physicalDevice = VK_NULL_HANDLE;
    VkDevice device;
    VkQueue graphicsQueue;

    void initWindow() {
        glfwInit();
        glfwWindowHint(GLFW_CLIENT_API, GLFW_NO_API);
        glfwWindowHint(GLFW_RESIZABLE, GLFW_FALSE);
        window = glfwCreateWindow(WIDTH, HEIGHT, "Vulkan",
                                  nullptr, nullptr);
    }

    void initVulkan() {
        createInstance();
        pickPhysicalDevice();
        createLogicalDevice();
    }

    void createInstance() {
        VkApplicationInfo appInfo{};
        appInfo.sType = VK_STRUCTURE_TYPE_APPLICATION_INFO;
        appInfo.pApplicationName = "Hello Triangle";
        appInfo.applicationVersion = VK_MAKE_VERSION(1, 0, 0);
        appInfo.pEngineName = "No Engine";
        appInfo.engineVersion = VK_MAKE_VERSION(1, 0, 0);
        appInfo.apiVersion = VK_API_VERSION_1_0;

        VkInstanceCreateInfo createInfo{};
        createInfo.sType = VK_STRUCTURE_TYPE_INSTANCE_CREATE_INFO;
        createInfo.pApplicationInfo = &appInfo;

        uint32_t glfwExtensionCount = 0;
        const char** glfwExtensions =
            glfwGetRequiredInstanceExtensions(&glfwExtensionCount);
        createInfo.enabledExtensionCount = glfwExtensionCount;
        createInfo.ppEnabledExtensionNames = glfwExtensions;
        createInfo.enabledLayerCount = 0;

        if (vkCreateInstance(&createInfo, nullptr, &instance)
                != VK_SUCCESS) {
            throw std::runtime_error("failed to create instance!");
        }
    }

    void pickPhysicalDevice() {
        uint32_t deviceCount = 0;
        vkEnumeratePhysicalDevices(instance, &deviceCount, nullptr);
        if (deviceCount == 0) {
            throw std::runtime_error("failed to find GPUs with Vulkan support!");
        }
        std::vector<VkPhysicalDevice> devices(deviceCount);
        vkEnumeratePhysicalDevices(instance, &deviceCount, devices.data());
        physicalDevice = devices[0]; // take the first GPU for simplicity
    }

    void createLogicalDevice() {
        // Simplified: a real application must query the queue families and
        // pick one with graphics support (see the sketch after the build
        // instructions). Note the priority must outlive this struct, so it
        // cannot be the address of a temporary.
        float queuePriority = 1.0f;
        VkDeviceQueueCreateInfo queueCreateInfo{};
        queueCreateInfo.sType = VK_STRUCTURE_TYPE_DEVICE_QUEUE_CREATE_INFO;
        queueCreateInfo.queueFamilyIndex = 0;
        queueCreateInfo.queueCount = 1;
        queueCreateInfo.pQueuePriorities = &queuePriority;

        VkPhysicalDeviceFeatures deviceFeatures{};
        VkDeviceCreateInfo createInfo{};
        createInfo.sType = VK_STRUCTURE_TYPE_DEVICE_CREATE_INFO;
        createInfo.pQueueCreateInfos = &queueCreateInfo;
        createInfo.queueCreateInfoCount = 1;
        createInfo.pEnabledFeatures = &deviceFeatures;
        createInfo.enabledExtensionCount = 0;
        createInfo.enabledLayerCount = 0;

        if (vkCreateDevice(physicalDevice, &createInfo, nullptr, &device)
                != VK_SUCCESS) {
            throw std::runtime_error("failed to create logical device!");
        }
        vkGetDeviceQueue(device, 0, 0, &graphicsQueue);
    }

    void mainLoop() {
        while (!glfwWindowShouldClose(window)) {
            glfwPollEvents();
        }
    }

    void cleanup() {
        vkDestroyDevice(device, nullptr);
        vkDestroyInstance(instance, nullptr);
        glfwDestroyWindow(window);
        glfwTerminate();
    }
};

int main() {
    HelloTriangleApplication app;
    try {
        app.run();
    } catch (const std::exception& e) {
        std::cerr << e.what() << std::endl;
        return EXIT_FAILURE;
    }
    return EXIT_SUCCESS;
}
// @file: vulkan/hello_triangle.cpp
Build and run:
bash
# Install the Vulkan SDK
# Download: https://vulkan.lunarg.com/sdk/home
# Build
g++ -o hello_triangle hello_triangle.cpp \
    -lvulkan -lglfw -std=c++17
# Run
./hello_triangle
# @file: vulkan/build.sh
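createLogicalDevice() above hard-codes queue family 0, which is not guaranteed to support graphics. A minimal sketch of the proper query, usable as a drop-in for the sample (std::optional needs C++17, which the build line already enables):
cpp
#include <optional>
#include <vector>
#include <vulkan/vulkan.h>

// Find the first queue family that supports graphics commands.
std::optional<uint32_t> findGraphicsQueueFamily(VkPhysicalDevice dev) {
    uint32_t count = 0;
    vkGetPhysicalDeviceQueueFamilyProperties(dev, &count, nullptr);
    std::vector<VkQueueFamilyProperties> families(count);
    vkGetPhysicalDeviceQueueFamilyProperties(dev, &count, families.data());
    for (uint32_t i = 0; i < count; i++) {
        if (families[i].queueFlags & VK_QUEUE_GRAPHICS_BIT) {
            return i;   // use this as queueFamilyIndex
        }
    }
    return std::nullopt; // no graphics-capable queue family found
}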
4 General-Purpose GPU Computing with CUDA
4.1 CUDA Vector Addition
CUDA is NVIDIA's parallel computing platform and programming model. Below is the simplest possible vector-addition program[7].
CUDA kernel code:
cpp
#include <stdio.h>
#include <math.h>
#include <cuda_runtime.h>

// CUDA kernel: runs on the GPU
__global__ void vectorAdd(const float* A, const float* B,
                          float* C, int N) {
    int idx = blockIdx.x * blockDim.x + threadIdx.x;
    if (idx < N) {
        C[idx] = A[idx] + B[idx];
    }
}

int main(int argc, char** argv) {
    int N = 1024 * 1024;
    size_t size = N * sizeof(float);

    // Allocate host memory
    float *h_A = (float*)malloc(size);
    float *h_B = (float*)malloc(size);
    float *h_C = (float*)malloc(size);

    // Initialize the input data
    for (int i = 0; i < N; i++) {
        h_A[i] = i * 1.0f;
        h_B[i] = i * 2.0f;
    }

    // Allocate device memory
    float *d_A, *d_B, *d_C;
    cudaMalloc(&d_A, size);
    cudaMalloc(&d_B, size);
    cudaMalloc(&d_C, size);

    // Copy the inputs to the GPU
    cudaMemcpy(d_A, h_A, size, cudaMemcpyHostToDevice);
    cudaMemcpy(d_B, h_B, size, cudaMemcpyHostToDevice);

    // Launch configuration
    int threadsPerBlock = 256;
    int blocksPerGrid = (N + threadsPerBlock - 1) / threadsPerBlock;

    // Launch the CUDA kernel
    vectorAdd<<<blocksPerGrid, threadsPerBlock>>>(d_A, d_B, d_C, N);

    // Wait for the GPU to finish
    cudaDeviceSynchronize();

    // Copy the result back to the host
    cudaMemcpy(h_C, d_C, size, cudaMemcpyDeviceToHost);

    // Verify the result
    bool success = true;
    for (int i = 0; i < N; i++) {
        if (fabs(h_C[i] - (h_A[i] + h_B[i])) > 1e-5) {
            success = false;
            break;
        }
    }
    if (success) {
        printf("Vector addition verified successfully!\n");
    } else {
        printf("Vector addition verification FAILED!\n");
    }

    // Free memory
    free(h_A);
    free(h_B);
    free(h_C);
    cudaFree(d_A);
    cudaFree(d_B);
    cudaFree(d_C);
    return 0;
}
// @file: cuda/vector_add.cu
Build and run:
bash
# Build with nvcc
nvcc -o vector_add vector_add.cu
# Run
./vector_add
# Output: Vector addition verified successfully!
# @file: cuda/build.sh
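The sample ignores CUDA error codes to stay minimal. In real code, every runtime call and kernel launch should be checked; a common pattern is a macro like the following (the name CUDA_CHECK is just a convention, not a CUDA API):
cpp
#include <stdio.h>
#include <stdlib.h>
#include <cuda_runtime.h>

// Abort with a readable message if a CUDA runtime call fails.
#define CUDA_CHECK(call)                                            \
    do {                                                            \
        cudaError_t err = (call);                                   \
        if (err != cudaSuccess) {                                   \
            fprintf(stderr, "CUDA error %s at %s:%d\n",             \
                    cudaGetErrorString(err), __FILE__, __LINE__);   \
            exit(EXIT_FAILURE);                                     \
        }                                                           \
    } while (0)

// Usage:
//   CUDA_CHECK(cudaMalloc(&d_A, size));
//   vectorAdd<<<blocksPerGrid, threadsPerBlock>>>(d_A, d_B, d_C, N);
//   CUDA_CHECK(cudaGetLastError());       // launch-time errors
//   CUDA_CHECK(cudaDeviceSynchronize());  // errors during execution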
4.2 CUDA Matrix Multiplication
Matrix multiplication is a classic GPU workload. Below are a naive implementation and a tiled version optimized with shared memory[8].
CUDA matrix multiplication:
cpp
#include <stdio.h>
#include <cuda_runtime.h>

#define TILE_SIZE 16

// Naive matrix-multiplication kernel
__global__ void matrixMulSimple(float* C, const float* A,
                                const float* B, int width) {
    int row = blockIdx.y * blockDim.y + threadIdx.y;
    int col = blockIdx.x * blockDim.x + threadIdx.x;
    if (row < width && col < width) {
        float sum = 0.0f;
        for (int k = 0; k < width; k++) {
            sum += A[row * width + k] * B[k * width + col];
        }
        C[row * width + col] = sum;
    }
}

// Tiled matrix multiplication using shared memory
__global__ void matrixMulTiled(float* C, const float* A,
                               const float* B, int width) {
    __shared__ float As[TILE_SIZE][TILE_SIZE];
    __shared__ float Bs[TILE_SIZE][TILE_SIZE];
    int row = blockIdx.y * TILE_SIZE + threadIdx.y;
    int col = blockIdx.x * TILE_SIZE + threadIdx.x;
    float sum = 0.0f;

    for (int tile = 0; tile < (width + TILE_SIZE - 1) / TILE_SIZE;
         tile++) {
        // Load one tile of A and B into shared memory
        if (row < width && tile * TILE_SIZE + threadIdx.x < width) {
            As[threadIdx.y][threadIdx.x] =
                A[row * width + tile * TILE_SIZE + threadIdx.x];
        } else {
            As[threadIdx.y][threadIdx.x] = 0.0f;
        }
        if (col < width && tile * TILE_SIZE + threadIdx.y < width) {
            Bs[threadIdx.y][threadIdx.x] =
                B[(tile * TILE_SIZE + threadIdx.y) * width + col];
        } else {
            Bs[threadIdx.y][threadIdx.x] = 0.0f;
        }
        __syncthreads();

        // Accumulate the partial dot product for this tile
        for (int k = 0; k < TILE_SIZE; k++) {
            sum += As[threadIdx.y][k] * Bs[k][threadIdx.x];
        }
        __syncthreads();
    }
    if (row < width && col < width) {
        C[row * width + col] = sum;
    }
}

int main() {
    int width = 1024;
    size_t size = width * width * sizeof(float);

    // Allocate memory
    float *h_A, *h_B, *h_C, *d_A, *d_B, *d_C;
    h_A = (float*)malloc(size);
    h_B = (float*)malloc(size);
    h_C = (float*)malloc(size);
    cudaMalloc(&d_A, size);
    cudaMalloc(&d_B, size);
    cudaMalloc(&d_C, size);

    // Initialize the matrices
    for (int i = 0; i < width * width; i++) {
        h_A[i] = 1.0f;
        h_B[i] = 1.0f;
    }
    cudaMemcpy(d_A, h_A, size, cudaMemcpyHostToDevice);
    cudaMemcpy(d_B, h_B, size, cudaMemcpyHostToDevice);

    // Launch configuration
    dim3 threadsPerBlock(TILE_SIZE, TILE_SIZE);
    dim3 blocksPerGrid((width + TILE_SIZE - 1) / TILE_SIZE,
                       (width + TILE_SIZE - 1) / TILE_SIZE);

    // Launch the tiled kernel
    matrixMulTiled<<<blocksPerGrid, threadsPerBlock>>>(
        d_C, d_A, d_B, width);
    cudaDeviceSynchronize();
    cudaMemcpy(h_C, d_C, size, cudaMemcpyDeviceToHost);

    // Verify: with all-ones inputs, every element equals width
    printf("C[0] = %f (expected: %f)\n", h_C[0], width * 1.0f);

    // Free memory
    free(h_A);
    free(h_B);
    free(h_C);
    cudaFree(d_A);
    cudaFree(d_B);
    cudaFree(d_C);
    return 0;
}
// @file: cuda/matrix_mul.cu
Build and run:
bash
# Build
nvcc -o matrix_mul matrix_mul.cu -O3
# Run
./matrix_mul
# Output: C[0] = 1024.000000 (expected: 1024.000000)
# @file: cuda/build_matrix.sh
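Note that matrixMulSimple is never launched in main() above; to actually see what shared-memory tiling buys, the two kernels can be timed with CUDA events. A sketch, assuming the buffers and launch configuration from the listing:
cpp
cudaEvent_t start, stop;
cudaEventCreate(&start);
cudaEventCreate(&stop);

// Time the naive kernel
cudaEventRecord(start);
matrixMulSimple<<<blocksPerGrid, threadsPerBlock>>>(d_C, d_A, d_B, width);
cudaEventRecord(stop);
cudaEventSynchronize(stop);
float msSimple = 0.0f;
cudaEventElapsedTime(&msSimple, start, stop);

// Time the tiled kernel
cudaEventRecord(start);
matrixMulTiled<<<blocksPerGrid, threadsPerBlock>>>(d_C, d_A, d_B, width);
cudaEventRecord(stop);
cudaEventSynchronize(stop);
float msTiled = 0.0f;
cudaEventElapsedTime(&msTiled, start, stop);

printf("simple: %.3f ms, tiled: %.3f ms\n", msSimple, msTiled);
cudaEventDestroy(start);
cudaEventDestroy(stop);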
5 Cross-Platform Computing with OpenCL
5.1 OpenCL Vector Addition
OpenCL is a cross-platform parallel programming standard that targets CPUs, GPUs, DSPs, and other devices[9].
Host code:
c
#include <CL/cl.h>
#include <stdio.h>
#include <stdlib.h>

// OpenCL kernel source (as a string)
const char* kernelSource =
    "__kernel void vector_add(__global const float* A,\n"
    "                         __global const float* B,\n"
    "                         __global float* C,\n"
    "                         const int N) {\n"
    "    int idx = get_global_id(0);\n"
    "    if (idx < N) {\n"
    "        C[idx] = A[idx] + B[idx];\n"
    "    }\n"
    "}\n";

int main() {
    int N = 1024 * 1024;
    size_t size = N * sizeof(float);

    // Host data
    float *h_A = (float*)malloc(size);
    float *h_B = (float*)malloc(size);
    float *h_C = (float*)malloc(size);
    for (int i = 0; i < N; i++) {
        h_A[i] = i * 1.0f;
        h_B[i] = i * 2.0f;
    }

    // Get a platform
    cl_platform_id platform;
    clGetPlatformIDs(1, &platform, NULL);
    // Get a GPU device
    cl_device_id device;
    clGetDeviceIDs(platform, CL_DEVICE_TYPE_GPU, 1, &device, NULL);
    // Create a context
    cl_context context = clCreateContext(NULL, 1, &device, NULL, NULL, NULL);
    // Create a command queue (deprecated since OpenCL 2.0;
    // clCreateCommandQueueWithProperties is the modern equivalent)
    cl_command_queue queue = clCreateCommandQueue(context, device, 0, NULL);

    // Create the buffers
    cl_mem bufA = clCreateBuffer(context, CL_MEM_READ_ONLY, size, NULL, NULL);
    cl_mem bufB = clCreateBuffer(context, CL_MEM_READ_ONLY, size, NULL, NULL);
    cl_mem bufC = clCreateBuffer(context, CL_MEM_WRITE_ONLY, size, NULL, NULL);

    // Copy the inputs to the device
    clEnqueueWriteBuffer(queue, bufA, CL_TRUE, 0, size, h_A, 0, NULL, NULL);
    clEnqueueWriteBuffer(queue, bufB, CL_TRUE, 0, size, h_B, 0, NULL, NULL);

    // Create and build the program
    cl_program program = clCreateProgramWithSource(context, 1,
                                                   &kernelSource, NULL, NULL);
    clBuildProgram(program, 1, &device, NULL, NULL, NULL);

    // Create the kernel
    cl_kernel kernel = clCreateKernel(program, "vector_add", NULL);

    // Set the kernel arguments
    clSetKernelArg(kernel, 0, sizeof(cl_mem), &bufA);
    clSetKernelArg(kernel, 1, sizeof(cl_mem), &bufB);
    clSetKernelArg(kernel, 2, sizeof(cl_mem), &bufC);
    clSetKernelArg(kernel, 3, sizeof(int), &N);

    // Enqueue the kernel
    size_t globalSize = N;
    clEnqueueNDRangeKernel(queue, kernel, 1, NULL, &globalSize, NULL,
                           0, NULL, NULL);

    // Read back the result (blocking)
    clEnqueueReadBuffer(queue, bufC, CL_TRUE, 0, size, h_C, 0, NULL, NULL);

    // Verify
    printf("OpenCL vector add: C[0] = %f (expected: %f)\n",
           h_C[0], h_A[0] + h_B[0]);

    // Clean up
    clReleaseMemObject(bufA);
    clReleaseMemObject(bufB);
    clReleaseMemObject(bufC);
    clReleaseKernel(kernel);
    clReleaseProgram(program);
    clReleaseCommandQueue(queue);
    clReleaseContext(context);
    free(h_A);
    free(h_B);
    free(h_C);
    return 0;
}
// @file: opencl/vector_add.c
Build and run (Linux + NVIDIA GPU):
bash
# Install the OpenCL headers and ICD loader
sudo apt-get install opencl-headers ocl-icd-opencl-dev
# Build
gcc -o vector_add_opencl vector_add.c -lOpenCL
# Run
./vector_add_opencl
# @file: opencl/build.sh
Build and run (Linux + AMD GPU):
bash
# Install AMD ROCm
# See: https://rocm.docs.amd.com/
# Build
gcc -o vector_add_opencl vector_add.c \
    -I/opt/rocm/opencl/include \
    -L/opt/rocm/opencl/lib \
    -lOpenCL
# Run
./vector_add_opencl
# @file: opencl/build_amd.sh
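The host code above discards the return value of clBuildProgram, so a kernel syntax error would only surface as a later failure in clCreateKernel. A minimal sketch of retrieving the build log on failure:
c
/* Replace the bare clBuildProgram() call in the sample with this. */
cl_int err = clBuildProgram(program, 1, &device, NULL, NULL, NULL);
if (err != CL_SUCCESS) {
    size_t logSize = 0;
    clGetProgramBuildInfo(program, device, CL_PROGRAM_BUILD_LOG,
                          0, NULL, &logSize);
    char* log = (char*)malloc(logSize + 1);
    clGetProgramBuildInfo(program, device, CL_PROGRAM_BUILD_LOG,
                          logSize, log, NULL);
    log[logSize] = '\0';
    fprintf(stderr, "OpenCL build log:\n%s\n", log);
    free(log);
}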
6 GPU Acceleration for Machine Learning
6.1 GPU Computing with PyTorch
PyTorch is one of the most popular deep-learning frameworks and supports CUDA acceleration[10].
PyTorch GPU example:
python
import torch
import time

# Check CUDA availability
print(f"CUDA available: {torch.cuda.is_available()}")
if torch.cuda.is_available():
    print(f"CUDA version: {torch.version.cuda}")
    print(f"GPU device: {torch.cuda.get_device_name(0)}")

# Matrix size
N = 4096

# Matrix multiplication on the CPU
print("\n=== CPU matrix multiply ===")
a_cpu = torch.randn(N, N)
b_cpu = torch.randn(N, N)
start = time.time()
c_cpu = torch.matmul(a_cpu, b_cpu)
cpu_time = time.time() - start
print(f"CPU time: {cpu_time:.4f} s")

# Matrix multiplication on the GPU
if torch.cuda.is_available():
    print("\n=== GPU matrix multiply ===")
    # Move the data to the GPU
    a_gpu = a_cpu.cuda()
    b_gpu = b_cpu.cuda()
    # Warm up the GPU
    for _ in range(5):
        _ = torch.matmul(a_gpu, b_gpu)
    torch.cuda.synchronize()
    # Timed run
    start = time.time()
    c_gpu = torch.matmul(a_gpu, b_gpu)
    torch.cuda.synchronize()
    gpu_time = time.time() - start
    print(f"GPU time: {gpu_time:.4f} s")
    print(f"Speedup: {cpu_time / gpu_time:.2f}x")

# A simple neural-network example
print("\n=== Neural network on the GPU ===")
import torch.nn as nn
import torch.optim as optim

# Define the network
class SimpleNet(nn.Module):
    def __init__(self):
        super(SimpleNet, self).__init__()
        self.fc1 = nn.Linear(784, 256)
        self.fc2 = nn.Linear(256, 128)
        self.fc3 = nn.Linear(128, 10)
        self.relu = nn.ReLU()

    def forward(self, x):
        x = self.relu(self.fc1(x))
        x = self.relu(self.fc2(x))
        x = self.fc3(x)
        return x

# Create the model and move it to the GPU
model = SimpleNet().cuda()
criterion = nn.CrossEntropyLoss()
optimizer = optim.SGD(model.parameters(), lr=0.01)

# Simulated training on random data
batch_size = 64
for epoch in range(3):
    inputs = torch.randn(batch_size, 784).cuda()
    labels = torch.randint(0, 10, (batch_size,)).cuda()
    # Forward pass
    optimizer.zero_grad()
    outputs = model(inputs)
    loss = criterion(outputs, labels)
    # Backward pass
    loss.backward()
    optimizer.step()
    print(f"Epoch {epoch+1}, Loss: {loss.item():.4f}")
# @file: ml/pytorch_gpu.py
Run (requires PyTorch + CUDA):
bash
# Install PyTorch (CUDA build)
pip install torch torchvision torchaudio
# Run
python pytorch_gpu.py
# Example output:
# CUDA available: True
# GPU device: NVIDIA GeForce RTX 3090
# CPU time: 2.3456 s
# GPU time: 0.0234 s
# Speedup: 100.24x
# @file: ml/run_pytorch.sh
6.2 GPU Acceleration with TensorFlow
TensorFlow, developed by Google, is another deep-learning framework with GPU support[11].
TensorFlow GPU example:
python
import tensorflow as tf
import time

# Check for GPUs
print("TensorFlow version:", tf.__version__)
gpus = tf.config.list_physical_devices('GPU')
if gpus:
    print(f"Found {len(gpus)} GPU(s):")
    for gpu in gpus:
        print(f"  - {gpu.name}")
else:
    print("No GPU found; running on the CPU")

# Matrix multiplication benchmark
N = 4096
print("\n=== CPU matrix multiply ===")
with tf.device('/CPU:0'):
    a_cpu = tf.random.normal((N, N))
    b_cpu = tf.random.normal((N, N))
    start = time.time()
    for _ in range(10):
        c_cpu = tf.matmul(a_cpu, b_cpu)
    cpu_time = (time.time() - start) / 10
print(f"Average CPU time: {cpu_time:.4f} s")

if gpus:
    print("\n=== GPU matrix multiply ===")
    with tf.device('/GPU:0'):
        a_gpu = tf.random.normal((N, N))
        b_gpu = tf.random.normal((N, N))
        # Warm-up
        for _ in range(5):
            _ = tf.matmul(a_gpu, b_gpu)
        start = time.time()
        for _ in range(10):
            c_gpu = tf.matmul(a_gpu, b_gpu)
        # GPU ops are dispatched asynchronously; force completion
        # before stopping the clock.
        _ = c_gpu.numpy()
        gpu_time = (time.time() - start) / 10
    print(f"Average GPU time: {gpu_time:.4f} s")
    print(f"Speedup: {cpu_time / gpu_time:.2f}x")

# A simple neural network
print("\n=== Neural network training ===")
# Load the data
(x_train, y_train), _ = tf.keras.datasets.mnist.load_data()
x_train = x_train.reshape(-1, 784).astype('float32') / 255.0
y_train = tf.one_hot(y_train, 10)

# Define the model
model = tf.keras.Sequential([
    tf.keras.layers.Dense(256, activation='relu', input_shape=(784,)),
    tf.keras.layers.Dense(128, activation='relu'),
    tf.keras.layers.Dense(10, activation='softmax')
])
model.compile(
    optimizer='adam',
    loss='categorical_crossentropy',
    metrics=['accuracy']
)

# Train (first 1000 samples for a quick demo)
start = time.time()
history = model.fit(
    x_train[:1000], y_train[:1000],
    epochs=3,
    batch_size=64,
    verbose=1
)
train_time = time.time() - start
print(f"\nTraining finished in {train_time:.2f} s")
# @file: ml/tensorflow_gpu.py
Run:
bash
# Install TensorFlow (the standard Linux package includes GPU support)
pip install tensorflow
# Run
python tensorflow_gpu.py
# @file: ml/run_tensorflow.sh
7 Android Compute Shaders
7.1 OpenGL ES Compute Shaders
Android 5.0 and later support OpenGL ES 3.1, which introduces compute shaders for general-purpose GPU computation[12].
Compute shader code:
glsl
#version 310 es

// Work group size
layout (local_size_x = 128) in;

// Input/output buffers. Note: "output" is a reserved word in GLSL,
// so the output instance is named "result" instead.
layout (std430, binding = 0) buffer InputA {
    float data[];
} input_a;
layout (std430, binding = 1) buffer InputB {
    float data[];
} input_b;
layout (std430, binding = 2) buffer Result {
    float data[];
} result;

void main() {
    uint idx = gl_GlobalInvocationID.x;
    // Vector addition
    result.data[idx] = input_a.data[idx] + input_b.data[idx];
}
// @file: android/app/src/main/res/raw/compute.glsl
Java-side invocation code:
java
import android.content.Context;
import android.opengl.GLES31;
import java.nio.ByteBuffer;
import java.nio.ByteOrder;
import java.nio.FloatBuffer;

public class ComputeShader {
    private int program;
    private FloatBuffer bufferA;
    private FloatBuffer bufferB;
    private int ssboA, ssboB, ssboResult;
    private final int DATA_SIZE = 1024;

    public ComputeShader(Context context) {
        init();
    }

    private void init() {
        // 1. Create the compute shader program
        //    ("output" is reserved in GLSL, hence the name "result")
        int computeShader = GLES31.glCreateShader(GLES31.GL_COMPUTE_SHADER);
        String shaderSource = "#version 310 es\n" +
            "layout (local_size_x = 128) in;\n" +
            "layout (std430, binding = 0) buffer InputA {\n" +
            "    float data[];\n" +
            "} input_a;\n" +
            "layout (std430, binding = 1) buffer InputB {\n" +
            "    float data[];\n" +
            "} input_b;\n" +
            "layout (std430, binding = 2) buffer Result {\n" +
            "    float data[];\n" +
            "} result;\n" +
            "void main() {\n" +
            "    uint idx = gl_GlobalInvocationID.x;\n" +
            "    result.data[idx] = input_a.data[idx] + input_b.data[idx];\n" +
            "}\n";
        GLES31.glShaderSource(computeShader, shaderSource);
        GLES31.glCompileShader(computeShader);
        program = GLES31.glCreateProgram();
        GLES31.glAttachShader(program, computeShader);
        GLES31.glLinkProgram(program);

        // 2. Prepare the input data
        bufferA = ByteBuffer.allocateDirect(DATA_SIZE * 4)
                .order(ByteOrder.nativeOrder())
                .asFloatBuffer();
        bufferB = ByteBuffer.allocateDirect(DATA_SIZE * 4)
                .order(ByteOrder.nativeOrder())
                .asFloatBuffer();
        for (int i = 0; i < DATA_SIZE; i++) {
            bufferA.put(i, i * 1.0f);
            bufferB.put(i, i * 2.0f);
        }

        // 3. Create the SSBOs
        int[] ssbos = new int[3];
        GLES31.glGenBuffers(3, ssbos, 0);
        ssboA = ssbos[0];
        ssboB = ssbos[1];
        ssboResult = ssbos[2];

        // Upload the inputs
        bufferA.position(0);
        GLES31.glBindBuffer(GLES31.GL_SHADER_STORAGE_BUFFER, ssboA);
        GLES31.glBufferData(GLES31.GL_SHADER_STORAGE_BUFFER,
                DATA_SIZE * 4, bufferA, GLES31.GL_STATIC_DRAW);
        bufferB.position(0);
        GLES31.glBindBuffer(GLES31.GL_SHADER_STORAGE_BUFFER, ssboB);
        GLES31.glBufferData(GLES31.GL_SHADER_STORAGE_BUFFER,
                DATA_SIZE * 4, bufferB, GLES31.GL_STATIC_DRAW);

        // Allocate the output buffer (no initial data)
        GLES31.glBindBuffer(GLES31.GL_SHADER_STORAGE_BUFFER, ssboResult);
        GLES31.glBufferData(GLES31.GL_SHADER_STORAGE_BUFFER,
                DATA_SIZE * 4, null, GLES31.GL_DYNAMIC_DRAW);
    }

    public void compute() {
        // Bind the SSBOs to their binding points
        GLES31.glBindBufferBase(GLES31.GL_SHADER_STORAGE_BUFFER, 0, ssboA);
        GLES31.glBindBufferBase(GLES31.GL_SHADER_STORAGE_BUFFER, 1, ssboB);
        GLES31.glBindBufferBase(GLES31.GL_SHADER_STORAGE_BUFFER, 2, ssboResult);

        // Dispatch the compute shader (round up to cover all elements)
        GLES31.glUseProgram(program);
        GLES31.glDispatchCompute((DATA_SIZE + 127) / 128, 1, 1);

        // Make the shader writes visible to the buffer readback below
        GLES31.glMemoryBarrier(GLES31.GL_BUFFER_UPDATE_BARRIER_BIT);

        // Read the results back. OpenGL ES has no glGetBufferSubData,
        // so map the buffer into client memory instead.
        GLES31.glBindBuffer(GLES31.GL_SHADER_STORAGE_BUFFER, ssboResult);
        ByteBuffer mapped = (ByteBuffer) GLES31.glMapBufferRange(
                GLES31.GL_SHADER_STORAGE_BUFFER, 0, DATA_SIZE * 4,
                GLES31.GL_MAP_READ_BIT);
        FloatBuffer resultData =
                mapped.order(ByteOrder.nativeOrder()).asFloatBuffer();

        // Verify one element
        float expected = bufferA.get(0) + bufferB.get(0);
        float actual = resultData.get(0);
        GLES31.glUnmapBuffer(GLES31.GL_SHADER_STORAGE_BUFFER);
        android.util.Log.d("ComputeShader",
                "Result[0] = " + actual + " (expected: " + expected + ")");
    }
}
// @file: app/src/main/java/com/example/compute/ComputeShader.java
8 Putting It All Together: Performance Comparison
8.1 Cross-Platform Performance Test
Below is a combined benchmark that compares compute performance across the platforms and techniques covered above[13].
Performance test harness:
cpp
#include <stdio.h>
#include <stdlib.h>
#include <time.h>
#ifdef USE_CUDA
#include <cuda_runtime.h>
#endif

// CPU version
void matrix_mul_cpu(float* C, const float* A, const float* B, int N) {
    for (int i = 0; i < N; i++) {
        for (int j = 0; j < N; j++) {
            float sum = 0.0f;
            for (int k = 0; k < N; k++) {
                sum += A[i * N + k] * B[k * N + j];
            }
            C[i * N + j] = sum;
        }
    }
}

#ifdef USE_CUDA
// CUDA version
__global__ void matrix_mul_cuda(float* C, const float* A,
                                const float* B, int N) {
    int row = blockIdx.y * blockDim.y + threadIdx.y;
    int col = blockIdx.x * blockDim.x + threadIdx.x;
    if (row < N && col < N) {
        float sum = 0.0f;
        for (int k = 0; k < N; k++) {
            sum += A[row * N + k] * B[k * N + col];
        }
        C[row * N + col] = sum;
    }
}

void launch_cuda_kernel(float* d_C, float* d_A, float* d_B, int N) {
    dim3 threadsPerBlock(16, 16);
    dim3 blocksPerGrid((N + 15) / 16, (N + 15) / 16);
    matrix_mul_cuda<<<blocksPerGrid, threadsPerBlock>>>(d_C, d_A, d_B, N);
    cudaDeviceSynchronize();
}
#endif

double get_time() {
    struct timespec ts;
    clock_gettime(CLOCK_MONOTONIC, &ts);
    return ts.tv_sec + ts.tv_nsec * 1e-9;
}

int main(int argc, char** argv) {
    int N = 1024;
    size_t size = N * N * sizeof(float);

    // Allocate and initialize the data
    float *A = (float*)malloc(size);
    float *B = (float*)malloc(size);
    float *C_cpu = (float*)malloc(size);
    for (int i = 0; i < N * N; i++) {
        A[i] = 1.0f;
        B[i] = 1.0f;
    }
    printf("Matrix size: %dx%d\n\n", N, N);

    // CPU benchmark
    printf("=== CPU benchmark ===\n");
    double start = get_time();
    matrix_mul_cpu(C_cpu, A, B, N);
    double cpu_time = get_time() - start;
    printf("CPU time: %.4f s\n", cpu_time);
    // An NxN matrix multiply does N^3 multiplies and N^3 adds,
    // hence 2*N^3 floating-point operations in total.
    printf("GFLOPS: %.2f\n", 2.0 * N * N * N / cpu_time / 1e9);

#ifdef USE_CUDA
    // CUDA benchmark
    printf("\n=== CUDA benchmark ===\n");
    float *d_A, *d_B, *d_C;
    cudaMalloc(&d_A, size);
    cudaMalloc(&d_B, size);
    cudaMalloc(&d_C, size);
    cudaMemcpy(d_A, A, size, cudaMemcpyHostToDevice);
    cudaMemcpy(d_B, B, size, cudaMemcpyHostToDevice);

    // Warm-up
    for (int i = 0; i < 5; i++) {
        launch_cuda_kernel(d_C, d_A, d_B, N);
    }
    start = get_time();
    for (int i = 0; i < 10; i++) {
        launch_cuda_kernel(d_C, d_A, d_B, N);
    }
    double cuda_time = (get_time() - start) / 10.0;
    printf("CUDA time: %.4f s\n", cuda_time);
    printf("GFLOPS: %.2f\n", 2.0 * N * N * N / cuda_time / 1e9);
    printf("Speedup: %.2fx\n", cpu_time / cuda_time);

    cudaFree(d_A);
    cudaFree(d_B);
    cudaFree(d_C);
#endif

    free(A);
    free(B);
    free(C_cpu);
    return 0;
}
// @file: benchmark/performance_test.cpp
Build the CPU version:
bash
g++ -o perf_cpu performance_test.cpp -O3 -std=c++11 -lrt
./perf_cpu
# @file: benchmark/build_cpu.sh
Build the CUDA version:
bash
# -x cu tells nvcc to treat the .cpp file as CUDA source
nvcc -x cu -o perf_cuda performance_test.cpp -O3 -DUSE_CUDA -std=c++11
./perf_cuda
# @file: benchmark/build_cuda.sh
References
[1] NVIDIA CUDA Programming Guide: https://docs.nvidia.com/cuda/cuda-c-programming-guide/
[2] OpenGL Programming Guide: https://www.opengl.org/redbook/
[3] Vulkan Tutorial: https://vulkan-tutorial.com/
[4] Learn OpenGL - Getting Started: https://learnopengl.com/
[5] Android Graphics Architecture: https://source.android.com/devices/graphics/architecture
[6] OpenCL Specification: https://www.khronos.org/opencl/
[7] PyTorch CUDA Semantics: https://pytorch.org/docs/stable/cuda.html
[8] TensorFlow GPU Support: https://www.tensorflow.org/install/gpu
[9] GPU Gems: https://developer.nvidia.com/gpugems/
[10] CUDA C Best Practices Guide: https://docs.nvidia.com/cuda/cuda-c-best-practices-guide/
[11] OpenCL Programming Guide: https://www.khronos.org/opencl/opencl-book/
[12] OpenGL ES Compute Shaders: https://www.khronos.org/opengl/wiki/Compute_Shader
[13] GPU Roam: High-Performance GPU Computing: https://www.nvidia.com/en-us/data-center/