DCU异构程序——Bank冲突

目录

一、概述

二、程序实现

三、编译运行


一、概述

HIP属于显式编程模型,需要在程序中明确写出并行控制语句,包括数据传输、核函数启动等。核函数是运行在DCU上的函数,在CPU端运行的部分称为主机端(主要是执行管理和启动),DCU端运行的部分称为设备端(用于执行计算)。大概的流程如下图:
HIP程序流程

①主机端将需要并行计算的数据通过hipMemcpy()传递给DCU(将CPU存储的内容传递给DCU的显存);

②调用核函数启动函数hipLaunchKernelGGL()启动DCU,开始执行计算;

③设备端将计算好的结果数据通过hipMemcpy()从DCU复制回CPU。

hipMemcpy()是阻塞式的,数据复制完成后才会继续执行后续的程序;hipLaunchKernelGGL()是非阻塞式的,调用后立即返回,主机端程序继续向后执行。但是在Kernel计算完成之前,最后一个hipMemcpy()不会开始执行,这是由HIP的Stream机制保证的:同一个Stream中的操作按提交顺序依次执行。

二、程序实现

下面是对Bank冲突的具体实现,sharememBankConflict.cpp:

cpp 复制代码
#include <stdio.h>
#include <stdlib.h>
#include <hip/hip_runtime.h>
#include <hip/hip_runtime_api.h>
#include <sys/time.h>

#define BDIMX 32
#define BDIMY 16
#define IPAD  4

/* Wall-clock stopwatch built on gettimeofday(); the measured interval is stored in microseconds. */
struct my_timer
{
    struct timeval start_time, end_time;
    double time_use; /* interval length in microseconds; only written by stop() */

    /* Record the current wall-clock time as the beginning of the interval. */
    void start()
    {
        gettimeofday(&start_time, NULL);
    }

    /* Record the end of the interval and store end - start, in microseconds, in time_use. */
    void stop()
    {
        gettimeofday(&end_time, NULL);

        /* Whole-second part of the difference, converted to microseconds. */
        const double seconds_part_us = (end_time.tv_sec - start_time.tv_sec) * 1.0e6;
        time_use = seconds_part_us + end_time.tv_usec - start_time.tv_usec;
    }
};

/*
 * Print a one-line checksum of an int buffer.
 *
 * Writes msg, a colon, and the sum of the first `size` elements of `in`
 * (formatted with %5d) to stdout, then flushes stdout. No newline is emitted.
 *
 * msg  - label printed before the colon
 * in   - host buffer holding at least `size` ints
 * size - number of elements to add up; a size <= 0 prints a sum of 0
 */
void printData(char *msg, int *in, const int size)
{
    printf("%s:", msg);

    /* The accumulator must start at zero; it was previously left uninitialized. */
    int sum = 0;

    for (int i = 0; i < size; i++)
    {
        sum += in[i];
    }

    printf("%5d", sum);
    fflush(stdout);
}

/*
 * Store to and load from a BDIMY x BDIMX shared-memory tile using the same
 * [threadIdx.y][threadIdx.x] element for both accesses, so threads with
 * consecutive threadIdx.x touch consecutive ints of one tile row.
 *
 * Each thread writes its flattened in-block index into the tile and, after a
 * block barrier, copies that element to out at the same flattened index.
 *
 * out - device buffer with room for blockDim.x * blockDim.y ints. blockIdx is
 *       not part of the index, so every block writes the same range.
 * NOTE(review): the tile extents presumably require a (BDIMX, BDIMY) block -
 * confirm at the launch site.
 */
__global__ void setRowReadRow(int *out)
{
    /* Statically sized shared-memory tile. */
    __shared__ int tile[BDIMY][BDIMX];

    const unsigned int tx = threadIdx.x;
    const unsigned int ty = threadIdx.y;
    const unsigned int linear = ty * blockDim.x + tx;

    tile[ty][tx] = linear;

    /* Block-wide barrier between the shared-memory store and the load. */
    __syncthreads();

    out[linear] = tile[ty][tx];
}

/*
 * Store to and load from a BDIMX x BDIMY shared-memory tile using the
 * transposed subscript [threadIdx.x][threadIdx.y] for both accesses, so
 * threads with consecutive threadIdx.x touch elements BDIMY ints apart.
 *
 * Each thread writes its flattened in-block index into the tile and, after a
 * block barrier, copies that element to out at the same flattened index.
 *
 * out - device buffer with room for blockDim.x * blockDim.y ints. blockIdx is
 *       not part of the index, so every block writes the same range.
 * NOTE(review): the tile extents presumably require a (BDIMX, BDIMY) block -
 * confirm at the launch site.
 */
__global__ void setColReadCol(int *out)
{
    /* Statically sized shared-memory tile (note the swapped extents). */
    __shared__ int tile[BDIMX][BDIMY];

    const unsigned int tx = threadIdx.x;
    const unsigned int ty = threadIdx.y;
    const unsigned int linear = ty * blockDim.x + tx;

    tile[tx][ty] = linear;

    /* Block-wide barrier between the shared-memory store and the load. */
    __syncthreads();

    out[linear] = tile[tx][ty];
}

/*
 * Store into a BDIMY x BDIMX shared-memory tile at [threadIdx.y][threadIdx.x],
 * then load from it in transposed order.
 *
 * The flattened in-block index is split by blockDim.y into a quotient and a
 * remainder. The remainder selects the tile row and the quotient the tile
 * column of the load, so consecutive flattened indices read elements one
 * full tile row (BDIMX ints) apart.
 *
 * out - device buffer with room for blockDim.x * blockDim.y ints. blockIdx is
 *       not part of the index, so every block writes the same range.
 * NOTE(review): the tile extents presumably require a (BDIMX, BDIMY) block -
 * confirm at the launch site.
 */
__global__ void setRowReadCol(int *out)
{
    /* Statically sized shared-memory tile. */
    __shared__ int tile[BDIMY][BDIMX];

    const unsigned int linear = threadIdx.y * blockDim.x + threadIdx.x;

    /* Populate the tile first; the read coordinates below do not depend on it. */
    tile[threadIdx.y][threadIdx.x] = linear;

    /* Coordinates of the transposed read. */
    const unsigned int readCol = linear / blockDim.y;
    const unsigned int readRow = linear % blockDim.y;

    /* Block-wide barrier: the load targets an element written by another thread. */
    __syncthreads();

    out[linear] = tile[readRow][readCol];
}

/*
 * Same store/transposed-load pattern as a 2-D tile of blockDim.y rows by
 * blockDim.x columns, but held in a flat, dynamically sized shared-memory
 * array.
 *
 * Each thread stores its flattened in-block index at that same offset and,
 * after a block barrier, loads from offset
 * (index % blockDim.y) * blockDim.x + (index / blockDim.y).
 *
 * The launch must supply at least blockDim.x * blockDim.y * sizeof(int) bytes
 * of dynamic shared memory.
 *
 * out - device buffer with room for blockDim.x * blockDim.y ints. blockIdx is
 *       not part of the index, so every block writes the same range.
 */
__global__ void setRowReadColDyn(int *out)
{
    /* Dynamically sized shared memory; the size comes from the launch configuration. */
    extern __shared__ int tile[];

    const unsigned int linear = threadIdx.y * blockDim.x + threadIdx.x;

    /* Store at the thread's own flattened offset. */
    tile[linear] = linear;

    /* Offset of the transposed load. */
    const unsigned int quot = linear / blockDim.y;
    const unsigned int rem = linear % blockDim.y;
    const unsigned int loadAt = rem * blockDim.x + quot;

    /* Block-wide barrier: the load targets an element written by another thread. */
    __syncthreads();

    out[linear] = tile[loadAt];
}

/*
 * Store into a shared-memory tile at [threadIdx.y][threadIdx.x], then load
 * from it in transposed order. Each tile row carries IPAD extra, unused ints.
 *
 * The padding makes the distance between vertically adjacent elements
 * BDIMX + IPAD ints instead of BDIMX, which changes how the transposed loads
 * are spread over shared memory.
 *
 * out - device buffer with room for blockDim.x * blockDim.y ints. blockIdx is
 *       not part of the index, so every block writes the same range.
 * NOTE(review): the tile extents presumably require a (BDIMX, BDIMY) block -
 * confirm at the launch site.
 */
__global__ void setRowReadColPad(int *out)
{
    /* Statically sized shared-memory tile with IPAD padding columns per row. */
    __shared__ int tile[BDIMY][BDIMX + IPAD];

    const unsigned int linear = threadIdx.y * blockDim.x + threadIdx.x;

    /* Populate the tile first; the read coordinates below do not depend on it. */
    tile[threadIdx.y][threadIdx.x] = linear;

    /* Coordinates of the transposed read. */
    const unsigned int readCol = linear / blockDim.y;
    const unsigned int readRow = linear % blockDim.y;

    /* Block-wide barrier: the load targets an element written by another thread. */
    __syncthreads();

    out[linear] = tile[readRow][readCol];
}

/*
 * Store/transposed-load pattern on a flat, dynamically sized shared-memory
 * array laid out as blockDim.y rows of (blockDim.x + IPAD) ints each. The
 * last IPAD ints of every row are padding and are never accessed.
 *
 * Each thread stores its flattened in-block index at
 * threadIdx.y * pitch + threadIdx.x and, after a block barrier, loads from
 * (index % blockDim.y) * pitch + (index / blockDim.y), where
 * pitch = blockDim.x + IPAD.
 *
 * The launch must supply at least (blockDim.x + IPAD) * blockDim.y *
 * sizeof(int) bytes of dynamic shared memory.
 *
 * out - device buffer with room for blockDim.x * blockDim.y ints. blockIdx is
 *       not part of the index, so every block writes the same range.
 */
__global__ void setRowReadColDynPad(int *out)
{
    /* Dynamically sized shared memory; the size comes from the launch configuration. */
    extern __shared__ int tile[];

    /* Flattened index of this thread inside the block (also the output index). */
    const unsigned int linear = threadIdx.y * blockDim.x + threadIdx.x;

    /* Offset of the padded, row-ordered store. */
    const unsigned int storeAt = threadIdx.y * (blockDim.x + IPAD) + threadIdx.x;

    tile[storeAt] = linear;

    /* Offset of the padded, transposed load. */
    const unsigned int quot = linear / blockDim.y;
    const unsigned int rem = linear % blockDim.y;
    const unsigned int loadAt = rem * (blockDim.x + IPAD) + quot;

    /* Block-wide barrier: the load targets an element written by another thread. */
    __syncthreads();

    out[linear] = tile[loadAt];
}


int main(int argc, char *argv[])
{
    hipDeviceProp_t props;
    int deviceID = 0;
    hipGetDeviceProperties(&props, deviceID);
    printf("%s at ", argv[0]);
    printf("device %d: %s ", deviceID, props.name);
    hipSetDevice(deviceID);

    hipSharedMemConfig pConfig;
    hipDeviceGetSharedMemConfig(&pConfig);
    printf("with Bank Mode:%s ", pConfig == 1 ? "4-Byte" : "8-byte");

    int nx = BDIMX;
    int ny = BDIMY;

    bool iprintf(0);

    if(argc > 1) iprintf = atoi(argv[1]);
    
    size_t nBytes = nx * ny * sizeof(int);

    dim3 block(BDIMX, BDIMY);
    dim3 grid(nx / BDIMX, ny/BDIMY);
    printf("<<< grid (%d,%d) block (%d, %d)>>>\n", grid.x, grid.y, block.x, block.y);
    
    int *d_C;
    hipMalloc((int **)&d_C,nBytes);
    int *gpuRef = (int *)malloc(nBytes);

    hipLaunchKernelGGL(setRowReadRow, grid, block, 0, 0, d_C);
    hipLaunchKernelGGL(setColReadCol, grid, block, 0, 0, d_C);

    my_timer timer1;
    hipMemset(d_C, 0, nBytes);
    timer1.start();
    hipLaunchKernelGGL(setRowReadRow, grid, block, 0, 0, d_C);
    hipDeviceSynchronize();
    timer1.stop();
    hipMemcpy(gpuRef, d_C, nBytes, hipMemcpyDeviceToHost);

    printData((char*)"set row read row        ", gpuRef, nx * ny);
    printf("\t cost time : %0.5f ms\n", timer1.time_use/1000);

    my_timer timer2;
    hipMemset(d_C, 0, nBytes);
    timer2.start();
    hipLaunchKernelGGL(setColReadCol, grid, block, 0, 0, d_C);
    hipDeviceSynchronize();
    timer2.stop();
    hipMemcpy(gpuRef, d_C, nBytes, hipMemcpyDeviceToHost);

    printData((char*)"set col read col        ", gpuRef, nx * ny);
    printf("\t cost time : %0.5f ms\n", timer2.time_use/1000);
    
    my_timer timer6;
    hipMemset(d_C, 0, nBytes);
    timer6.start();
    hipLaunchKernelGGL(setRowReadCol, grid, block, 0, 0, d_C);
    hipDeviceSynchronize();
    timer6.stop();
    hipMemcpy(gpuRef, d_C, nBytes, hipMemcpyDeviceToHost);

    printData((char*)"set row read col        ", gpuRef, nx * ny);
    printf("\t cost time : %0.5f ms\n", timer6.time_use/1000);

    my_timer timer3;
    hipMemset(d_C, 0, nBytes);
    timer3.start();
    hipLaunchKernelGGL(setRowReadColDyn, grid, block, BDIMX*BDIMY*sizeof(int), 0, d_C);
    hipDeviceSynchronize();
    timer3.stop();
    hipMemcpy(gpuRef, d_C, nBytes, hipMemcpyDeviceToHost);

    printData((char*)"set row read col dyn     ", gpuRef, nx * ny);
    printf("\t cost time : %0.5f ms\n", timer3.time_use/1000);

    my_timer timer4;
    hipMemset(d_C, 0, nBytes);
    timer4.start();
    hipLaunchKernelGGL(setRowReadColPad, grid, block, 0, 0, d_C);
    hipDeviceSynchronize();
    timer4.stop();
    hipMemcpy(gpuRef, d_C, nBytes, hipMemcpyDeviceToHost);
    
    printData((char*)"set row read col pad     ", gpuRef, nx * ny);
    printf("\t cost time : %0.5f ms\n", timer4.time_use/1000);

    my_timer timer5;
    hipMemset(d_C, 0, nBytes);
    timer5.start();
    hipLaunchKernelGGL(setRowReadColDynPad, grid, block, (BDIMX+IPAD)*BDIMY*sizeof(int), 0, d_C);
    hipDeviceSynchronize();
    timer5.stop();
    hipMemcpy(gpuRef, d_C, nBytes, hipMemcpyDeviceToHost);

    printData((char*)"set row read col dyn pad ", gpuRef, nx * ny);
    printf("\t cost time : %0.5f ms\n", timer5.time_use/1000);

    hipFree(d_C);
    free(gpuRef);

    /*重置设备*/
    hipDeviceReset();
    
    return 0;
}

三、编译运行

HIP程序采用hipcc编译,例如:hipcc sharememBankConflict.cpp -o sharememBankConflict,编译完成后执行 ./sharememBankConflict 即可运行。

运行结果:

相关推荐
带鱼吃猫几秒前
Linux系统:文件系统前言,详解CHS&LBA地址
linux·运维·服务器
默默提升实验室18 分钟前
Linux 系统如何挂载U盘
linux·运维·服务器
元亓亓亓23 分钟前
LeetCode热题100--206.反转链表--简单
算法·leetcode·链表
诚丞成38 分钟前
BFS算法篇——从晨曦到星辰,BFS算法在多源最短路径问题中的诗意航行(上)
java·算法·宽度优先
mahuifa38 分钟前
python实现usb热插拔检测(linux)
linux·服务器·python
hongjianMa38 分钟前
2024睿抗编程赛国赛-题解
算法·深度优先·图论·caip
zandy10111 小时前
高并发场景下的BI架构设计:衡石分布式查询引擎与缓存分级策略
分布式·缓存·高并发架构·弹性扩展·分布式查询·缓存分级·mpp引擎
Lw老王要学习1 小时前
Linux架构篇、第五章git2.49.0部署与使用
linux·运维·git·云计算·it
czy87874751 小时前
两种常见的C语言实现64位无符号整数乘以64位无符号整数的实现方法
c语言·算法
yzx9910131 小时前
支持向量机案例
算法·机器学习·支持向量机