DCU异构程序——带宽测试

一、概述

HIP属于显式编程模型，需要在程序中明确写出并行控制语句，包括数据传输、核函数启动等。核函数是运行在DCU上的函数，在CPU端运行的部分称为主机端（主要是执行管理和启动），DCU端运行的部分称为设备端（用于执行计算）。大概的流程如下图：
HIP程序流程

①主机端将需要并行计算的数据通过hipMemcpy()传递给DCU（将CPU存储的内容传递给DCU的显存）；

②调用核函数启动函数hipLaunchKernelGGL()启动DCU，开始执行计算；

③设备端将计算好的结果数据通过hipMemcpy()从DCU复制回CPU。

hipMemcpy()是阻塞式的，数据复制完成后才可以执行后续的程序；hipLaunchKernelGGL()是非阻塞式的，调用后立即返回，主机端程序继续向后执行。但是在Kernel没有计算完成之前，最后一个hipMemcpy()是不会开始的，这是由HIP的Stream机制保证的：同一个Stream中的操作按提交顺序依次执行。

二、程序实现

下面是带宽测试的具体实现，bandwidth.cpp：

cpp 复制代码

#include <assert.h>
#include <stdio.h>
#include <stdlib.h>
#include <string.h>
#include <hip/hip_runtime.h>

/**
 * Debug-build guard for HIP runtime calls.
 *
 * When DEBUG or _DEBUG is defined and `result` is not hipSuccess, the HIP
 * error string is written to stderr and an assert on the status fires.
 * In every other build the status is passed through without inspection.
 *
 * @param result status code returned by a HIP runtime call
 * @return the same status code, unmodified
 */
inline hipError_t checkhip(hipError_t result)
{
#if defined(DEBUG) || defined(_DEBUG)
    // Fast path: nothing to report on success.
    if (result == hipSuccess)
    {
        return result;
    }

    fprintf(stderr, "hip Runtime Error: %s\n", hipGetErrorString(result));
    assert(result == hipSuccess);
#endif
    return result;
}

/**
 * Kernel: each thread adds 1 to a single element of `a`, located at the
 * thread's 1-D global index shifted by `s` elements.
 *
 * No bounds check is performed; the caller must make sure every computed
 * index lies inside the allocation.
 *
 * @param a device array to update in place
 * @param s element offset added to every thread's index
 */
template<typename T>
__global__ void offset(T *a, int s)
{
    const int idx = threadIdx.x + blockIdx.x * blockDim.x + s;
    T *slot = a + idx;
    *slot = *slot + 1;
}

/**
 * Kernel: each thread adds 1 to a single element of `a`, located at the
 * thread's 1-D global index multiplied by `s`.
 *
 * No bounds check is performed; the caller must make sure every computed
 * index lies inside the allocation.
 *
 * @param a device array to update in place
 * @param s distance, in elements, between the slots touched by
 *          consecutive threads
 */
template<typename T>
__global__ void stride(T *a, int s)
{
    const int idx = s * (threadIdx.x + blockIdx.x * blockDim.x);
    T *slot = a + idx;
    *slot = *slot + 1;
}

/**
 * Times the `offset` and `stride` kernels over a range of offsets (0..32)
 * and strides (1..32) and prints one "value, bandwidth" line per run.
 *
 * The printed figure is 2 * nMB / elapsed-milliseconds (one read plus one
 * write per element), i.e. MB per millisecond.
 *
 * @tparam T        element type of the device array (float or double in main)
 * @param deviceId  not used by this function; kept so existing callers
 *                  continue to compile
 * @param nMB       size in MiB of the region touched by one kernel launch;
 *                  non-positive values are rejected
 */
template<typename T>
void runTest(int deviceId, int nMB)
{
    (void)deviceId;

    // A non-positive size would yield an empty grid and a meaningless result.
    if(nMB <= 0)
    {
        fprintf(stderr, "runTest: nMB must be positive (got %d)\n", nMB);
        return;
    }

    const int blockSize = 256;
    float ms = 0.0f;

    T *d_a = nullptr;
    hipEvent_t startEvent, stopEvent;

    // Do the size arithmetic in size_t: the former `n * 33` was evaluated in
    // int and overflowed before being widened once nMB grew past ~248 (float).
    const size_t bytes = static_cast<size_t>(nMB) * 1024 * 1024;
    const int n = static_cast<int>(bytes / sizeof(T));

    // 33x the working set so the largest offset/stride still stays in bounds.
    checkhip(hipMalloc(&d_a, bytes * 33));
    checkhip(hipEventCreate(&startEvent));
    checkhip(hipEventCreate(&stopEvent));

    printf("Offset, Bandwith (GB/s): \n");

    // Untimed warm-up launch.
    offset<<<n / blockSize, blockSize>>>(d_a, 0);

    for(int i = 0; i <= 32; i++)
    {
        checkhip(hipMemset(d_a, 0, bytes));
        checkhip(hipEventRecord(startEvent, 0));
        offset<<<n / blockSize, blockSize>>>(d_a, i);
        checkhip(hipEventRecord(stopEvent, 0));
        // Wait for the stop event so the elapsed time covers the kernel.
        checkhip(hipEventSynchronize(stopEvent));

        checkhip(hipEventElapsedTime(&ms, startEvent, stopEvent));
        printf("%d, %f\n", i, 2 * nMB / ms);
    }

    printf("\n Stride, Bandwidth (GB/s):\n");

    // Untimed warm-up launch.
    stride<<<n / blockSize, blockSize>>>(d_a, 1);

    for(int i = 1; i <= 32; i++)
    {
        checkhip(hipMemset(d_a, 0, bytes));
        checkhip(hipEventRecord(startEvent, 0));
        stride<<<n / blockSize, blockSize>>>(d_a, i);
        checkhip(hipEventRecord(stopEvent, 0));
        checkhip(hipEventSynchronize(stopEvent));

        checkhip(hipEventElapsedTime(&ms, startEvent, stopEvent));
        printf("%d, %f\n", i, 2 * nMB / ms);
    }

    checkhip(hipEventDestroy(startEvent));
    checkhip(hipEventDestroy(stopEvent));
    // Route the free through checkhip like every other runtime call.
    checkhip(hipFree(d_a));
}


/**
 * Entry point of the bandwidth test.
 *
 * Recognised command-line arguments:
 *   dev=N  run on device N (default 0)
 *   fp64   use double instead of float elements
 *
 * Prints the device name, transfer size and precision, then runs the test.
 *
 * @return always 0
 */
int main(int argc, char *argv[])
{
    int nMB = 128;
    int deviceId = 0;
    bool bFp64 = false;

    for(int i = 1; i < argc; i++)
    {
        if(!strncmp(argv[i], "dev=", 4))
        {
            // Skip the 4-character "dev=" prefix; parsing from index 1
            // handed "ev=N" to atoi, which always produced 0.
            deviceId = atoi(argv[i] + 4);
        }
        else if(!strcmp(argv[i], "fp64"))
        {
            bFp64 = true;
        }
    }

    hipDeviceProp_t prop;

    checkhip(hipSetDevice(deviceId));
    checkhip(hipGetDeviceProperties(&prop, deviceId));
    printf("Device: %s\n", prop.name);
    printf("Transfer size (MB): %d\n", nMB);
    printf("%s Precision\n", bFp64 ? "Double":"Single");

    if(bFp64)
    {
        runTest<double>(deviceId, nMB);
    }
    else
    {
        runTest<float>(deviceId, nMB);
    }
    return 0;
}

三、编译运行

HIP程序采用hipcc编译。

运行结果：