目录
一、概述
HIP属于显式编程模型,需要在程序中明确写出并行控制语句,包括数据传输、核函数启动等。核函数是运行在DCU上的函数,在CPU端运行的部分称为主机端(主要是执行管理和启动),DCU端运行的部分称为设备端(用于执行计算)。大概的流程如下图:
HIP程序流程
①主机端将需要并行计算的数据通过hipMemcpy()传递给DCU(将CPU存储的内容传递给DCU的显存);
②调用核函数启动函数hipLaunchKernelGGL()启动DCU,开始执行计算;
③设备端将计算好的结果数据通过hipMemcpy()从DCU复制回CPU。
hipMemcpy()是阻塞式的,数据复制完成后才可以执行后续的程序;hipLaunchKernelGGL()是非阻塞式的,调用后立即返回,主机端程序继续向后执行。但是在Kernel计算完成之前,最后一个hipMemcpy()不会开始执行,这是由HIP的Stream机制保证的:同一个Stream中的操作按提交顺序依次执行。
二、程序实现
下面是对Bank冲突的具体实现,sharememBankConflict.cpp:
cpp
#include <stdio.h>
#include <stdlib.h>
#include <hip/hip_runtime.h>
#include <hip/hip_runtime_api.h>
#include <sys/time.h>
#define BDIMX 32
#define BDIMY 16
#define IPAD 4
/*
 * Wall-clock stopwatch built on gettimeofday().
 *
 * Usage: call start(), run the work to be measured, call stop(), then read
 * time_use, which holds the elapsed time in microseconds. time_use is only
 * meaningful after stop() has been called.
 */
struct my_timer
{
    struct timeval start_time, end_time;
    double time_use; /* elapsed microseconds, written by stop() */

    /* Record the current time as the beginning of the interval. */
    void start()
    {
        gettimeofday(&start_time, NULL);
    }

    /* Record the current time as the end of the interval and update time_use. */
    void stop()
    {
        gettimeofday(&end_time, NULL);

        /* Whole seconds are scaled to microseconds first, then the
           microsecond fields are folded in. */
        double elapsed_us = (end_time.tv_sec - start_time.tv_sec) * 1.0e6;
        elapsed_us = elapsed_us + end_time.tv_usec;
        elapsed_us = elapsed_us - start_time.tv_usec;
        time_use = elapsed_us;
    }
};
/*
 * Print a label followed by the sum of an int array, as a compact checksum
 * of a kernel's output.
 *
 * msg  : label printed before the checksum (followed by ':').
 * in   : host array to sum; must hold at least `size` elements.
 * size : number of elements to sum; a non-positive size prints a sum of 0.
 *
 * Output format is "<msg>:" followed by the sum right-aligned in 5 columns.
 * No newline is written; stdout is flushed before returning.
 */
void printData(char *msg, int *in, const int size)
{
    printf("%s:", msg);

    /* The accumulator must start at zero; it was previously left
       uninitialized, which made the printed checksum undefined. */
    int sum = 0;
    for (int i = 0; i < size; i++)
    {
        sum += in[i];
    }

    printf("%5d", sum);
    fflush(stdout);
    return;
}
/*
 * Stores each thread's block-local linear index into a static shared tile
 * and loads it back from the same element. Both accesses put threadIdx.x on
 * the innermost tile dimension.
 *
 * Launch layout: the tile is BDIMY x BDIMX, so blockDim is expected to be
 * (BDIMX, BDIMY). `out` must hold blockDim.x * blockDim.y ints. Only
 * threadIdx is used for addressing, so every block writes the same range.
 */
__global__ void setRowReadRow(int *out)
{
    /* static shared memory */
    __shared__ int tile[BDIMY][BDIMX];

    const unsigned int ty = threadIdx.y;
    const unsigned int tx = threadIdx.x;
    const unsigned int linear = ty * blockDim.x + tx;

    tile[ty][tx] = linear;

    /* barrier: all stores to the tile finish before any thread loads */
    __syncthreads();

    out[linear] = tile[ty][tx];
}
/*
 * Stores each thread's block-local linear index into a static shared tile
 * and loads it back from the same element. Both accesses put threadIdx.y on
 * the innermost tile dimension (the tile is declared BDIMX x BDIMY).
 *
 * Launch layout: blockDim is expected to be (BDIMX, BDIMY). `out` must hold
 * blockDim.x * blockDim.y ints. Only threadIdx is used for addressing.
 */
__global__ void setColReadCol(int *out)
{
    /* static shared memory */
    __shared__ int tile[BDIMX][BDIMY];

    const unsigned int tx = threadIdx.x;
    const unsigned int ty = threadIdx.y;
    const unsigned int linear = ty * blockDim.x + tx;

    tile[tx][ty] = linear;

    /* barrier: all stores to the tile finish before any thread loads */
    __syncthreads();

    out[linear] = tile[tx][ty];
}
/*
 * Stores each thread's block-local linear index into a static shared tile
 * with threadIdx.x on the innermost dimension, then loads from a remapped
 * element tile[linear % blockDim.y][linear / blockDim.y], so the load walks
 * the tile's outer dimension fastest as the linear index increases.
 *
 * Launch layout: the tile is BDIMY x BDIMX, so blockDim is expected to be
 * (BDIMX, BDIMY). `out` must hold blockDim.x * blockDim.y ints. Only
 * threadIdx is used for addressing.
 */
__global__ void setRowReadCol(int *out)
{
    /* static shared memory */
    __shared__ int tile[BDIMY][BDIMX];

    const unsigned int linear = threadIdx.y * blockDim.x + threadIdx.x;

    /* coordinates used for the load: linear index re-split by blockDim.y */
    const unsigned int readInner = linear / blockDim.y;
    const unsigned int readOuter = linear % blockDim.y;

    tile[threadIdx.y][threadIdx.x] = linear;

    /* barrier: all stores to the tile finish before any thread loads */
    __syncthreads();

    out[linear] = tile[readOuter][readInner];
}
/*
 * Same store/load pattern as the static row-store / remapped-load kernel,
 * but on a flat, dynamically sized shared array: each thread stores its
 * block-local linear index at tile[linear] and loads from
 * tile[(linear % blockDim.y) * blockDim.x + linear / blockDim.y].
 *
 * Launch layout: the launch must supply at least
 * blockDim.x * blockDim.y * sizeof(int) bytes of dynamic shared memory, and
 * `out` must hold blockDim.x * blockDim.y ints. Only threadIdx is used for
 * addressing.
 */
__global__ void setRowReadColDyn(int *out)
{
    /* dynamic shared memory, sized by the launch configuration */
    extern __shared__ int tile[];

    const unsigned int linear = threadIdx.y * blockDim.x + threadIdx.x;

    /* coordinates used for the load: linear index re-split by blockDim.y */
    const unsigned int readInner = linear / blockDim.y;
    const unsigned int readOuter = linear % blockDim.y;
    const unsigned int readOffset = readOuter * blockDim.x + readInner;

    tile[linear] = linear;

    /* barrier: all stores to the tile finish before any thread loads */
    __syncthreads();

    out[linear] = tile[readOffset];
}
/*
 * Row-store / remapped-load on a static shared tile whose innermost
 * dimension is widened by IPAD unused ints per row. Each thread stores its
 * block-local linear index at tile[threadIdx.y][threadIdx.x] and loads from
 * tile[linear % blockDim.y][linear / blockDim.y].
 *
 * Launch layout: the tile is BDIMY x (BDIMX + IPAD), so blockDim is expected
 * to be (BDIMX, BDIMY). `out` must hold blockDim.x * blockDim.y ints. Only
 * threadIdx is used for addressing.
 */
__global__ void setRowReadColPad(int *out)
{
    /* static shared memory, padded by IPAD columns */
    __shared__ int tile[BDIMY][BDIMX + IPAD];

    const unsigned int linear = threadIdx.y * blockDim.x + threadIdx.x;

    /* coordinates used for the load: linear index re-split by blockDim.y */
    const unsigned int readInner = linear / blockDim.y;
    const unsigned int readOuter = linear % blockDim.y;

    tile[threadIdx.y][threadIdx.x] = linear;

    /* barrier: all stores to the tile finish before any thread loads */
    __syncthreads();

    out[linear] = tile[readOuter][readInner];
}
/*
 * Row-store / remapped-load on a flat, dynamically sized shared array whose
 * logical row pitch is blockDim.x + IPAD ints. Each thread stores its
 * block-local linear index at row threadIdx.y, column threadIdx.x, and loads
 * from row (linear % blockDim.y), column (linear / blockDim.y).
 *
 * Launch layout: the launch must supply at least
 * (blockDim.x + IPAD) * blockDim.y * sizeof(int) bytes of dynamic shared
 * memory, and `out` must hold blockDim.x * blockDim.y ints. Only threadIdx
 * is used for addressing.
 */
__global__ void setRowReadColDynPad(int *out)
{
    /* dynamic shared memory, sized by the launch configuration */
    extern __shared__ int tile[];

    const unsigned int linear = threadIdx.y * blockDim.x + threadIdx.x;

    /* coordinates used for the load: linear index re-split by blockDim.y */
    const unsigned int readInner = linear / blockDim.y;
    const unsigned int readOuter = linear % blockDim.y;

    /* padded offsets into the flat array */
    const unsigned int storeOffset = threadIdx.y * (blockDim.x + IPAD) + threadIdx.x;
    const unsigned int loadOffset = readOuter * (blockDim.x + IPAD) + readInner;

    tile[storeOffset] = linear;

    /* barrier: all stores to the tile finish before any thread loads */
    __syncthreads();

    out[linear] = tile[loadOffset];
}
int main(int argc, char *argv[])
{
hipDeviceProp_t props;
int deviceID = 0;
hipGetDeviceProperties(&props, deviceID);
printf("%s at ", argv[0]);
printf("device %d: %s ", deviceID, props.name);
hipSetDevice(deviceID);
hipSharedMemConfig pConfig;
hipDeviceGetSharedMemConfig(&pConfig);
printf("with Bank Mode:%s ", pConfig == 1 ? "4-Byte" : "8-byte");
int nx = BDIMX;
int ny = BDIMY;
bool iprintf(0);
if(argc > 1) iprintf = atoi(argv[1]);
size_t nBytes = nx * ny * sizeof(int);
dim3 block(BDIMX, BDIMY);
dim3 grid(nx / BDIMX, ny/BDIMY);
printf("<<< grid (%d,%d) block (%d, %d)>>>\n", grid.x, grid.y, block.x, block.y);
int *d_C;
hipMalloc((int **)&d_C,nBytes);
int *gpuRef = (int *)malloc(nBytes);
hipLaunchKernelGGL(setRowReadRow, grid, block, 0, 0, d_C);
hipLaunchKernelGGL(setColReadCol, grid, block, 0, 0, d_C);
my_timer timer1;
hipMemset(d_C, 0, nBytes);
timer1.start();
hipLaunchKernelGGL(setRowReadRow, grid, block, 0, 0, d_C);
hipDeviceSynchronize();
timer1.stop();
hipMemcpy(gpuRef, d_C, nBytes, hipMemcpyDeviceToHost);
printData((char*)"set row read row ", gpuRef, nx * ny);
printf("\t cost time : %0.5f ms\n", timer1.time_use/1000);
my_timer timer2;
hipMemset(d_C, 0, nBytes);
timer2.start();
hipLaunchKernelGGL(setColReadCol, grid, block, 0, 0, d_C);
hipDeviceSynchronize();
timer2.stop();
hipMemcpy(gpuRef, d_C, nBytes, hipMemcpyDeviceToHost);
printData((char*)"set col read col ", gpuRef, nx * ny);
printf("\t cost time : %0.5f ms\n", timer2.time_use/1000);
my_timer timer6;
hipMemset(d_C, 0, nBytes);
timer6.start();
hipLaunchKernelGGL(setRowReadCol, grid, block, 0, 0, d_C);
hipDeviceSynchronize();
timer6.stop();
hipMemcpy(gpuRef, d_C, nBytes, hipMemcpyDeviceToHost);
printData((char*)"set row read col ", gpuRef, nx * ny);
printf("\t cost time : %0.5f ms\n", timer6.time_use/1000);
my_timer timer3;
hipMemset(d_C, 0, nBytes);
timer3.start();
hipLaunchKernelGGL(setRowReadColDyn, grid, block, BDIMX*BDIMY*sizeof(int), 0, d_C);
hipDeviceSynchronize();
timer3.stop();
hipMemcpy(gpuRef, d_C, nBytes, hipMemcpyDeviceToHost);
printData((char*)"set row read col dyn ", gpuRef, nx * ny);
printf("\t cost time : %0.5f ms\n", timer3.time_use/1000);
my_timer timer4;
hipMemset(d_C, 0, nBytes);
timer4.start();
hipLaunchKernelGGL(setRowReadColPad, grid, block, 0, 0, d_C);
hipDeviceSynchronize();
timer4.stop();
hipMemcpy(gpuRef, d_C, nBytes, hipMemcpyDeviceToHost);
printData((char*)"set row read col pad ", gpuRef, nx * ny);
printf("\t cost time : %0.5f ms\n", timer4.time_use/1000);
my_timer timer5;
hipMemset(d_C, 0, nBytes);
timer5.start();
hipLaunchKernelGGL(setRowReadColDynPad, grid, block, (BDIMX+IPAD)*BDIMY*sizeof(int), 0, d_C);
hipDeviceSynchronize();
timer5.stop();
hipMemcpy(gpuRef, d_C, nBytes, hipMemcpyDeviceToHost);
printData((char*)"set row read col dyn pad ", gpuRef, nx * ny);
printf("\t cost time : %0.5f ms\n", timer5.time_use/1000);
hipFree(d_C);
free(gpuRef);
/*重置设备*/
hipDeviceReset();
return 0;
}
三、编译运行
HIP程序采用hipcc编译。
运行结果: