(5)cuda中的grid、block

文章目录

概要

在CUDA中,host和device是两个重要的概念:我们用host指代CPU及其内存,用device指代GPU及其内存。

一般的CUDA程序的执行流程如下:

  1. 分配host内存,并进行数据初始化;
  2. 分配device内存,并从host将数据拷贝到device上;
  3. 调用CUDA的核函数在device上完成指定的运算;
  4. 将device上的运算结果拷贝到host上;
  5. 释放device和host上分配的内存。

整体架构流程

一般来说:

一个kernel对应一个grid

一个grid可以包含多个block,block在grid中可以按一维、二维或三维排布

一个block可以包含多个thread,thread在block中可以按一维、二维或三维排布

我们写的kernel function运行在block中的每个thread中。

https://cuda-programming.blogspot.com/2013/01/thread-and-block-heuristics-in-cuda.html

cpp 复制代码
#include <cuda_runtime.h>
#include <stdio.h>

// Kernel: every thread prints its block index and its thread index within the
// block. Both triples are printed in (z, y, x) order. Works with any 1-D to
// 3-D launch configuration; uses no shared memory.
__global__ void print_idx(){
    // Snapshot the built-in coordinates into named locals first.
    const unsigned int bz = blockIdx.z;
    const unsigned int by = blockIdx.y;
    const unsigned int bx = blockIdx.x;
    const unsigned int tz = threadIdx.z;
    const unsigned int ty = threadIdx.y;
    const unsigned int tx = threadIdx.x;

    printf("block idx: (%3d, %3d, %3d), thread idx: (%3d, %3d, %3d)\n",
           bz, by, bx, tz, ty, tx);
}

// Launches print_idx on a 1-D grid (2 blocks of 4 threads for the sizes below)
// and waits for the kernel to finish before returning.
//
// If the launch or the execution fails, the CUDA error string is written to
// stderr and the function returns early instead of failing silently.
void demo_print(){
    // Local names deliberately avoid the CUDA built-ins blockDim / gridDim.
    const int inputSize = 8;
    const int threadsPerBlock = 4;  // number of threads in each block
    // Ceil-division: still yields at least one block when inputSize is not a
    // multiple of threadsPerBlock (floor division could produce a 0-block grid,
    // which is an invalid launch configuration).
    const int numBlocks = (inputSize + threadsPerBlock - 1) / threadsPerBlock;

    dim3 block(threadsPerBlock);
    dim3 grid(numBlocks);

    print_idx<<<grid, block>>>();

    // A kernel launch returns no status itself; configuration errors are
    // reported through cudaGetLastError().
    cudaError_t err = cudaGetLastError();
    if (err != cudaSuccess) {
        fprintf(stderr, "print_idx launch failed: %s\n", cudaGetErrorString(err));
        return;
    }

    // cudaDeviceSynchronize() blocks the host until the kernel has finished;
    // errors raised while the kernel was running are reported here.
    err = cudaDeviceSynchronize();
    if (err != cudaSuccess) {
        fprintf(stderr, "print_idx execution failed: %s\n", cudaGetErrorString(err));
        return;
    }
}

// Program entry point: runs the demo. Reaching the end of main() without an
// explicit return statement is equivalent to returning 0.
int main() {
    demo_print();
}

打印grid和block的维度

cpp 复制代码
// Kernel: every thread prints the grid dimensions and the block dimensions of
// the current launch, each as a (z, y, x) triple.
__global__ void print_dim(){
    // Copy the launch dimensions into named locals before printing.
    const unsigned int gz = gridDim.z;
    const unsigned int gy = gridDim.y;
    const unsigned int gx = gridDim.x;
    const unsigned int bz = blockDim.z;
    const unsigned int by = blockDim.y;
    const unsigned int bx = blockDim.x;

    printf("grid dimension: (%3d, %3d, %3d), block dimension: (%3d, %3d, %3d)\n",
           gz, gy, gx, bz, by, bx);
}

计算每个线程在block中的索引

cpp 复制代码
// Kernel: every thread prints its block index as (z, y, x) followed by its own
// linear index inside the block. The linear index is laid out with x varying
// fastest, then y, then z.
__global__ void print_thread_idx_per_block(){
    // Nested (Horner) form of z * (dimX * dimY) + y * dimX + x.
    const int linear_tid =
        (threadIdx.z * blockDim.y + threadIdx.y) * blockDim.x + threadIdx.x;

    printf("block idx: (%3d, %3d, %3d), thread idx: %3d\n",
           blockIdx.z, blockIdx.y, blockIdx.x, linear_tid);
}

计算每个线程在grid中的索引

cpp 复制代码
// Kernel: every thread prints the linear index of its block in the grid, its
// linear index inside that block, and its linear index across the whole grid.
// All linear indices are laid out with x varying fastest, then y, then z.
__global__ void print_thread_idx_per_grid(){
    // Number of threads in one block.
    const int threads_per_block = blockDim.x * blockDim.y * blockDim.z;

    // Linear position of this block inside the grid (nested form of
    // z * (gridX * gridY) + y * gridX + x).
    const int block_linear =
        (blockIdx.z * gridDim.y + blockIdx.y) * gridDim.x + blockIdx.x;

    // Linear position of this thread inside its block.
    const int thread_linear =
        (threadIdx.z * blockDim.y + threadIdx.y) * blockDim.x + threadIdx.x;

    // Skip over all threads of the preceding blocks, then add the local offset.
    const int global_linear = block_linear * threads_per_block + thread_linear;

    printf("block idx: %3d, thread idx in block: %3d, thread index in grid: %3d\n",
           block_linear, thread_linear, global_linear);
}

完整代码与输出

cpp 复制代码
#include <cuda_runtime.h>
#include <iostream>
#include <stdio.h>

// Kernel: every thread prints its block index and its thread index within the
// block. Both triples are printed in (z, y, x) order. Works with any 1-D to
// 3-D launch configuration; uses no shared memory.
__global__ void print_idx(){
    // Snapshot the built-in coordinates into named locals first.
    const unsigned int bz = blockIdx.z;
    const unsigned int by = blockIdx.y;
    const unsigned int bx = blockIdx.x;
    const unsigned int tz = threadIdx.z;
    const unsigned int ty = threadIdx.y;
    const unsigned int tx = threadIdx.x;

    printf("block idx: (%3d, %3d, %3d), thread idx: (%3d, %3d, %3d)\n",
           bz, by, bx, tz, ty, tx);
}
// Kernel: every thread prints the grid dimensions and the block dimensions of
// the current launch, each as a (z, y, x) triple.
__global__ void print_dim(){
    // Copy the launch dimensions into named locals before printing.
    const unsigned int gz = gridDim.z;
    const unsigned int gy = gridDim.y;
    const unsigned int gx = gridDim.x;
    const unsigned int bz = blockDim.z;
    const unsigned int by = blockDim.y;
    const unsigned int bx = blockDim.x;

    printf("grid dimension: (%3d, %3d, %3d), block dimension: (%3d, %3d, %3d)\n",
           gz, gy, gx, bz, by, bx);
}
// Kernel: every thread prints its block index as (z, y, x) followed by its own
// linear index inside the block. The linear index is laid out with x varying
// fastest, then y, then z.
__global__ void print_thread_idx_per_block(){
    // Nested (Horner) form of z * (dimX * dimY) + y * dimX + x.
    const int linear_tid =
        (threadIdx.z * blockDim.y + threadIdx.y) * blockDim.x + threadIdx.x;

    printf("block idx: (%3d, %3d, %3d), thread idx: %3d\n",
           blockIdx.z, blockIdx.y, blockIdx.x, linear_tid);
}

// Kernel: every thread prints the linear index of its block in the grid, its
// linear index inside that block, and its linear index across the whole grid.
// All linear indices are laid out with x varying fastest, then y, then z.
__global__ void print_thread_idx_per_grid(){
    // Number of threads in one block.
    const int threads_per_block = blockDim.x * blockDim.y * blockDim.z;

    // Linear position of this block inside the grid (nested form of
    // z * (gridX * gridY) + y * gridX + x).
    const int block_linear =
        (blockIdx.z * gridDim.y + blockIdx.y) * gridDim.x + blockIdx.x;

    // Linear position of this thread inside its block.
    const int thread_linear =
        (threadIdx.z * blockDim.y + threadIdx.y) * blockDim.x + threadIdx.x;

    // Skip over all threads of the preceding blocks, then add the local offset.
    const int global_linear = block_linear * threads_per_block + thread_linear;

    printf("block idx: %3d, thread idx in block: %3d, thread index in grid: %3d\n",
           block_linear, thread_linear, global_linear);
}



// Runs the four demo kernels one after another on the same 1-D launch
// configuration (2 blocks of 4 threads for the sizes below), printing a
// separator line on the host between them.
//
// After each launch the host checks for a launch error and then waits for the
// kernel to finish. On any failure the CUDA error string is written to stderr
// and the function returns early instead of failing silently.
void demo_print(){
    // Local names deliberately avoid the CUDA built-ins blockDim / gridDim.
    const int inputSize = 8;
    const int threadsPerBlock = 4;  // number of threads in each block
    // Ceil-division: still yields at least one block when inputSize is not a
    // multiple of threadsPerBlock (floor division could produce a 0-block grid,
    // which is an invalid launch configuration).
    const int numBlocks = (inputSize + threadsPerBlock - 1) / threadsPerBlock;

    dim3 block(threadsPerBlock);
    dim3 grid(numBlocks);

    // Returns true when the most recent launch was accepted and the kernel ran
    // to completion. A launch returns no status itself, so configuration errors
    // are read with cudaGetLastError(); cudaDeviceSynchronize() then blocks the
    // host until the kernel is done and reports errors raised while it ran.
    auto finished_ok = [](const char* kernel_name) -> bool {
        cudaError_t err = cudaGetLastError();
        if (err == cudaSuccess) {
            err = cudaDeviceSynchronize();
        }
        if (err != cudaSuccess) {
            fprintf(stderr, "%s failed: %s\n", kernel_name, cudaGetErrorString(err));
            return false;
        }
        return true;
    };

    print_idx<<<grid, block>>>();
    if (!finished_ok("print_idx")) return;

    // The literal below is the separator line printed between the demos.
    std::cout << "---------------分割线---------------------------" << std::endl;
    print_dim<<<grid, block>>>();
    if (!finished_ok("print_dim")) return;

    std::cout << "---------------分割线---------------------------" << std::endl;
    print_thread_idx_per_block<<<grid, block>>>();
    if (!finished_ok("print_thread_idx_per_block")) return;

    std::cout << "---------------分割线---------------------------" << std::endl;
    print_thread_idx_per_grid<<<grid, block>>>();
    if (!finished_ok("print_thread_idx_per_grid")) return;
}

// Program entry point: runs the demo. Reaching the end of main() without an
// explicit return statement is equivalent to returning 0.
int main() {
    demo_print();
}
cmake 复制代码
# Build script for the grid/block index demo.
cmake_minimum_required(VERSION 3.10)

# CUDA is the only language enabled for this project.
project(test LANGUAGES CUDA)

# Single executable built from the demo source file.
add_executable(test1 print_index_demo1.cu)

# Request the CUDA C++20 dialect for this target; this is the per-target form
# of setting CMAKE_CUDA_STANDARD before the target is created.
set_target_properties(test1 PROPERTIES CUDA_STANDARD 20)
相关推荐
我也不曾来过123 分钟前
list底层原理
数据结构·c++·list
A charmer30 分钟前
C++ 日志系统实战第三步:熟悉掌握各种设计模式
c++·日志系统
Ethon_王39 分钟前
STL容器适配器详解:queue篇
c++
静听夜半雨43 分钟前
CANoe入门——3、新建LIN工程及LIN DataBase(LDF文件)的创建
网络·数据库·c++·编辑器
梁下轻语的秋缘1 小时前
每日c/c++题 备战蓝桥杯 ([洛谷 P1226] 快速幂求模题解)
c++·算法·蓝桥杯
虾球xz2 小时前
游戏引擎学习第244天: 完成异步纹理下载
c++·学习·游戏引擎
矛取矛求2 小时前
C++区别于C语言的提升用法(万字总结)
c语言·c++
ephemerals__2 小时前
【c++11】c++11新特性(下)(可变参数模板、default和delete、容器新设定、包装器)
开发语言·c++
egoist20233 小时前
【C++指南】告别C字符串陷阱:如何实现封装string?
开发语言·数据结构·c++·c++11·string·auto·深/浅拷贝
Zfox_3 小时前
【Qt】文件
c++·qt·qt5·客户端开发