文章目录
-
-
- 一、线程组织结构回顾
- 二、内置变量速查表
- 三、一维线程配置(最常见)
- 四、二维线程配置
- 五、三维线程配置
-
- 计算公式
- [完整示例 - 体素处理](#完整示例 - 体素处理)
- 六、常见计算模式
- 七、实用工具函数
- 八、注意事项和最佳实践
-
- [1. 边界检查](#1. 边界检查)
- [2. warp对齐考虑](#2. warp对齐考虑)
- [3. 性能优化](#3. 性能优化)
- [4. 调试技巧](#4. 调试技巧)
- 九、完整示例:向量加法与性能统计
-
一、线程组织结构回顾
CUDA采用三层线程组织结构:
- Thread:最小执行单元
- Block:线程块,包含多个线程
- Grid:网格,包含多个线程块
Grid
├── Block (0,0)
│   ├── Thread (0,0)
│   ├── Thread (1,0)
│   └── ...
├── Block (1,0)
└── Block (0,1)
    └── ...
二、内置变量速查表
| 变量 | 类型 | 含义 |
|---|---|---|
| threadIdx | uint3 | 线程在block内的索引 |
| blockIdx | uint3 | block在grid内的索引 |
| blockDim | dim3 | block的维度(每block线程数) |
| gridDim | dim3 | grid的维度(每grid block数) |
三、一维线程配置(最常见)
计算公式
cuda
// Global 1D thread index: index of this block's first thread plus the thread's index inside the block.
int global_thread_id = blockIdx.x * blockDim.x + threadIdx.x;
示例代码
cuda
// Example configuration: N elements, 256 threads per block
int N = 1000;
int threadsPerBlock = 256;
// Ceiling division: enough blocks to cover all N elements even when N is not a multiple of threadsPerBlock
int blocksPerGrid = (N + threadsPerBlock - 1) / threadsPerBlock;
// Stores each element's own global index into data, one thread per element.
//   data: output array holding at least N floats
//   N:    number of elements to write
__global__ void kernel(float* data, int N) {
    const int idx = blockIdx.x * blockDim.x + threadIdx.x;
    // Threads past the end of the array have nothing to do.
    if (idx >= N) {
        return;
    }
    data[idx] = idx;
}
// Launch blocksPerGrid blocks of threadsPerBlock threads each.
kernel<<<blocksPerGrid, threadsPerBlock>>>(d_data, N);
计算示例
配置:gridDim.x=4, blockDim.x=256
线程索引计算:
- Block 0: tid = 0*256 + 0~255 = 0~255
- Block 1: tid = 1*256 + 0~255 = 256~511
- Block 2: tid = 2*256 + 0~255 = 512~767
- Block 3: tid = 3*256 + 0~255 = 768~1023(其中 tid 1000~1023 超出 N=1000,由 `if (tid < N)` 边界检查跳过)
四、二维线程配置
计算公式
cuda
int thread_id_2d = threadIdx.y * blockDim.x + threadIdx.x; // linear ID inside the block
int block_id_2d = blockIdx.y * gridDim.x + blockIdx.x; // linear ID of the block inside the grid
// Block-major numbering: all threads of block 0 come first, then all threads of block 1, ...
int global_thread_id = block_id_2d * (blockDim.x * blockDim.y) + thread_id_2d;
// Alternatively, compute the global row and column indices separately
int row = blockIdx.y * blockDim.y + threadIdx.y;
int col = blockIdx.x * blockDim.x + threadIdx.x;
// Row-major numbering over the whole grid (gridDim.x * blockDim.x threads per row).
// Note: this generally differs from global_thread_id above, which is block-major.
int global_tid = row * (gridDim.x * blockDim.x) + col;
完整示例 - 图像处理
cuda
// Process an 800x600 image
int width = 800, height = 600;
// NOTE(review): these host variables reuse the names of the blockDim/gridDim device built-ins,
// which is easy to misread — consider renaming them (e.g. block / grid).
dim3 blockDim(16, 16); // 16x16 = 256 threads per block
// Ceiling division in each dimension so the grid covers the whole image.
dim3 gridDim(
(width + blockDim.x - 1) / blockDim.x,
(height + blockDim.y - 1) / blockDim.y
);
// Sets every pixel of a width x height image (one byte per pixel, row-major) to 255.
// Each thread handles the pixel at its own global (column, row) coordinate.
__global__ void imageKernel(unsigned char* image, int width, int height) {
    const int col = blockIdx.x * blockDim.x + threadIdx.x;
    const int row = blockIdx.y * blockDim.y + threadIdx.y;
    // Threads that fall outside the image do nothing.
    if (col >= width || row >= height) {
        return;
    }
    const int offset = row * width + col;  // row-major linear index
    image[offset] = 255;
}
计算示例
配置:gridDim(2,2), blockDim(3,3)
线程总数:6x6 = 36线程
Block(0,0)内线程:
threadIdx(0,0): global (0,0)
threadIdx(1,0): global (1,0)
threadIdx(2,0): global (2,0)
threadIdx(0,1): global (0,1)
...
Block(1,0)内线程:
threadIdx(0,0): global (3,0)
threadIdx(1,0): global (4,0)
...
Block(0,1)内线程:
threadIdx(0,0): global (0,3)
...
五、三维线程配置
计算公式
cuda
// Global linear ID for a 3D launch.
// The first three terms are the linear block ID (x fastest, then y, then z) times the
// number of threads per block; the last three are the thread's linear ID inside its block.
int global_tid =
blockIdx.z * gridDim.x * gridDim.y * blockDim.x * blockDim.y * blockDim.z +
blockIdx.y * gridDim.x * blockDim.x * blockDim.y * blockDim.z +
blockIdx.x * blockDim.x * blockDim.y * blockDim.z +
threadIdx.z * blockDim.x * blockDim.y +
threadIdx.y * blockDim.x +
threadIdx.x;
// Clearer alternative: per-axis global coordinates
int x = blockIdx.x * blockDim.x + threadIdx.x;
int y = blockIdx.y * blockDim.y + threadIdx.y;
int z = blockIdx.z * blockDim.z + threadIdx.z;
// Linearize over the full grid extent with x varying fastest, then y, then z.
int linear_id = z * (gridDim.x * blockDim.x) * (gridDim.y * blockDim.y) +
y * (gridDim.x * blockDim.x) + x;
完整示例 - 体素处理
cuda
// Process a 64x64x64 voxel volume
dim3 blockDim(4, 4, 4); // 64 threads per block
dim3 gridDim(16, 16, 16); // 4096 blocks; 16 blocks * 4 threads = 64 along each axis
// Writes each voxel's own linear index into a size x size x size volume.
// Layout is x-fastest: index = (z * size + y) * size + x.
__global__ void voxelKernel(float* volume, int size) {
    const int vx = blockIdx.x * blockDim.x + threadIdx.x;
    const int vy = blockIdx.y * blockDim.y + threadIdx.y;
    const int vz = blockIdx.z * blockDim.z + threadIdx.z;

    // Only threads whose coordinates lie inside the cube write anything.
    const bool inside = vx < size && vy < size && vz < size;
    if (!inside) {
        return;
    }

    const int flat = (vz * size + vy) * size + vx;
    volume[flat] = flat;
}
六、常见计算模式
模式1:一维处理N个元素
cuda
int tid = blockIdx.x * blockDim.x + threadIdx.x;
int stride = blockDim.x * gridDim.x; // total number of threads in the grid (along x)
// Grid-stride loop: each thread handles elements tid, tid + stride, tid + 2*stride, ...
// so N may exceed the number of launched threads
for (int i = tid; i < N; i += stride) {
data[i] = process(data[i]);
}
模式2:矩阵处理(行优先)
cuda
// y dimension selects the row, x dimension selects the column
int row = blockIdx.y * blockDim.y + threadIdx.y;
int col = blockIdx.x * blockDim.x + threadIdx.x;
// Row-major offset of element (row, col) in a matrix with `width` columns
int index = row * width + col;
模式3:卷积/图像处理(带边界)
cuda
// 3x3 mean (box) filter over a row-major width x height float image.
// Each thread produces one output pixel. Pixels on the one-pixel image border are
// skipped (their 3x3 window would reach outside the image), so the corresponding
// entries of output are left untouched.
__global__ void convolution(float* input, float* output, int width, int height) {
    // Output pixel handled by this thread
    const int px = blockIdx.x * blockDim.x + threadIdx.x;
    const int py = blockIdx.y * blockDim.y + threadIdx.y;

    const bool interior = px > 0 && px < width - 1 && py > 0 && py < height - 1;
    if (!interior) {
        return;
    }

    // Accumulate the 3x3 neighbourhood, row by row.
    float acc = 0;
    for (int dy = -1; dy <= 1; ++dy) {
        const int rowBase = (py + dy) * width;
        for (int dx = -1; dx <= 1; ++dx) {
            acc += input[rowBase + (px + dx)];
        }
    }
    output[py * width + px] = acc / 9.0f;
}
七、实用工具函数
cuda
// Returns the calling thread's global linear ID along x (1D launch).
__device__ int getGlobalTid1D() {
    const unsigned int blockStart = blockIdx.x * blockDim.x;  // ID of the block's first thread
    return blockStart + threadIdx.x;
}
// Returns the row-major linear ID of the calling thread in a 2D launch,
// where `width` is the number of elements per row.
__device__ int getGlobalTid2D(int width) {
    const int gx = blockIdx.x * blockDim.x + threadIdx.x;  // global column
    const int gy = blockIdx.y * blockDim.y + threadIdx.y;  // global row
    return gy * width + gx;
}
// Returns the calling thread's linear ID inside its block (x fastest, then y, then z).
__device__ int getLocalTid() {
    // Horner form of x + y * dimX + z * dimX * dimY
    return (threadIdx.z * blockDim.y + threadIdx.y) * blockDim.x + threadIdx.x;
}
// Debug helper: every thread prints its block index, its thread index inside the
// block, its per-axis global coordinates, and its 1D global ID (getGlobalTid1D()).
// Device printf is slow, so launch this with a small grid.
__global__ void debugKernel() {
    // The built-in index/dimension components are unsigned int, so they are printed
    // with %u; getGlobalTid1D() returns int and keeps %d.
    printf("Block(%u,%u,%u) Thread(%u,%u,%u) Global(%u,%u,%u) TID=%d\n",
           blockIdx.x, blockIdx.y, blockIdx.z,
           threadIdx.x, threadIdx.y, threadIdx.z,
           blockIdx.x * blockDim.x + threadIdx.x,
           blockIdx.y * blockDim.y + threadIdx.y,
           blockIdx.z * blockDim.z + threadIdx.z,
           getGlobalTid1D());
}
八、注意事项和最佳实践
1. 边界检查
cuda
// Always check bounds to avoid out-of-range accesses
if (tid < N) {
// safe to process element tid here
}
2. warp对齐考虑
cuda
// Warp size is 32 threads; keep consecutive indices aligned to warps
// NOTE(review): if tid is a global index, tid / 32 corresponds to an actual warp only when
// blockDim.x is a multiple of 32 (warps do not span blocks) — confirm how tid is derived.
int warp_id = tid / 32;
int lane_id = tid % 32;
3. 性能优化
cuda
// Prefer a 1D configuration (simplest)
// 2D suits matrix operations
// 3D suits voxels and physics simulation
// Avoid computing complicated global indices
int pitch = blockDim.x * gridDim.x; // total number of threads along x in the grid
int tid = blockIdx.x * blockDim.x + threadIdx.x;
4. 调试技巧
cuda
// Kernel skeleton showing where to put debug output: at the top of the kernel,
// compiled in only when DEBUG is defined.
//   data, N: the kernel's working array and its element count (used by the elided body)
__global__ void myKernel(float* data, int N) {
    int tid = blockIdx.x * blockDim.x + threadIdx.x;
#ifdef DEBUG
    // Only global thread 0 prints, so the launch configuration appears once.
    if (tid == 0) {
        // gridDim/blockDim components are unsigned int, hence %u.
        printf("Grid: %u,%u,%u Block: %u,%u,%u\n",
               gridDim.x, gridDim.y, gridDim.z,
               blockDim.x, blockDim.y, blockDim.z);
    }
#endif
    // ... regular kernel code
}
九、完整示例:向量加法与性能统计
cuda
#include <cuda_runtime.h>
#include <stdio.h>
// Element-wise vector addition: c[i] = a[i] + b[i] for 0 <= i < n.
//
// Uses a grid-stride loop, so the result is correct for any 1D launch
// configuration, including grids with fewer threads than n. (A direct
// one-thread-per-element mapping, `if (i < n) c[i] = a[i] + b[i];`, would only be
// correct when the grid has at least n threads.)
//
//   a, b: input arrays of at least n floats
//   c:    output array of at least n floats
//   n:    number of elements; n <= 0 performs no work
__global__ void vectorAdd(const float* a, const float* b, float* c, int n) {
    // 64-bit index arithmetic: `i += stride` cannot overflow even when n is close to INT_MAX.
    const long long first = (long long)blockIdx.x * blockDim.x + threadIdx.x;
    const long long stride = (long long)blockDim.x * gridDim.x;  // threads in the whole grid

    for (long long i = first; i < n; i += stride) {
        c[i] = a[i] + b[i];
    }
}
// Host driver: allocates three vectors, runs vectorAdd on them, and verifies the result.
// Returns 0 on success and 1 on any CUDA error or result mismatch.
int main() {
    const int n = 1000000;
    const size_t bytes = n * sizeof(float);

    // Managed (unified) memory keeps the sample short: the same pointers are valid
    // on the host and on the device, so no explicit cudaMemcpy is needed.
    float* d_a = nullptr;
    float* d_b = nullptr;
    float* d_c = nullptr;
    cudaError_t err = cudaMallocManaged(&d_a, bytes);
    if (err == cudaSuccess) err = cudaMallocManaged(&d_b, bytes);
    if (err == cudaSuccess) err = cudaMallocManaged(&d_c, bytes);
    if (err != cudaSuccess) {
        fprintf(stderr, "cudaMallocManaged failed: %s\n", cudaGetErrorString(err));
        cudaFree(d_a);  // cudaFree(nullptr) is a no-op
        cudaFree(d_b);
        cudaFree(d_c);
        return 1;
    }

    // Inputs are small integers, so every sum is exactly representable in float.
    for (int i = 0; i < n; ++i) {
        d_a[i] = (float)i;
        d_b[i] = 2.0f * (float)i;
    }

    // Launch configuration: the grid is capped at 32 blocks; the grid-stride loop
    // inside vectorAdd covers the remaining elements.
    const int threadsPerBlock = 256;
    const int blocksNeeded = (n + threadsPerBlock - 1) / threadsPerBlock;
    const int blocksPerGrid = blocksNeeded < 32 ? blocksNeeded : 32;
    printf("Launching kernel with %d blocks, %d threads per block\n",
           blocksPerGrid, threadsPerBlock);
    printf("Total threads: %d, Processing %d elements\n",
           blocksPerGrid * threadsPerBlock, n);

    vectorAdd<<<blocksPerGrid, threadsPerBlock>>>(d_a, d_b, d_c, n);
    // A launch does not return an error itself: configuration errors surface via
    // cudaGetLastError(), execution errors at the next synchronizing call.
    err = cudaGetLastError();
    if (err == cudaSuccess) err = cudaDeviceSynchronize();

    int status = 0;
    if (err != cudaSuccess) {
        fprintf(stderr, "vectorAdd failed: %s\n", cudaGetErrorString(err));
        status = 1;
    } else {
        // Compare against a host-side reference.
        for (int i = 0; i < n; ++i) {
            if (d_c[i] != d_a[i] + d_b[i]) {
                fprintf(stderr, "Mismatch at index %d\n", i);
                status = 1;
                break;
            }
        }
    }

    cudaFree(d_a);
    cudaFree(d_b);
    cudaFree(d_c);
    return status;
}
理解线程号的计算是CUDA编程的基础,正确计算全局线程ID可以确保每个线程处理正确的数据元素,避免数据竞争和内存访问错误。