Tensor Core的WMMA API编程入门

WMMA (Warp-level Matrix Multiply Accumulate) API

对于计算能力在7.0及以上的CUDA设备，可以使用CUDA C++ API调用Tensor Core，支持形如D = AB + C的混合精度的矩阵乘运算。

C 复制代码

// fragment：Tensor Core数据存储类，支持matrix_a、matrix_b和accumulator
template<typename Use, int m, int n, int k, typename T, typename Layout=void> class fragment;

// load_matrix_sync：Tensor Core数据加载API，支持将矩阵数据从global memory或shared memory加载到fragment
void load_matrix_sync(fragment<...> &a, const T* mptr, unsigned ldm);
void load_matrix_sync(fragment<...> &a, const T* mptr, unsigned ldm, layout_t layout);

// store_matrix_sync：Tensor Core结果存储API，支持将计算结果从fragment存储到global memory或shared memory
void store_matrix_sync(T* mptr, const fragment<...> &a, unsigned ldm, layout_t layout);

// fill_fragment：fragment填充API，支持常数值填充
void fill_fragment(fragment<...> &a, const T& v);

// mma_sync：Tensor Core矩阵乘计算API，支持D = AB + C或者C = AB + C
void mma_sync(fragment<...> &d, const fragment<...> &a, const fragment<...> &b, const fragment<...> &c, bool satf=false);

示例

以m16n16k16为例，实现HGEMM：C = AB，其中矩阵A（M * K，row major）、B（K * N，col major）和C（M * N，row major）的精度均为FP16。首先我们看如何使用CUDA Core写HGEMM naive算法。

CUDA Core

按照每个线程计算矩阵C中的一个元素来构建naive kernel，首先确定当前线程处理矩阵C的元素坐标，再遍历K并直接从global memory中加载所需A、B矩阵元素到寄存器参与计算，最后将计算结果从寄存器直接写回矩阵C。所有block计算完成之后即可得到矩阵C。这个例子不能说简单，只能说技术含量不高，不过我们只是为了对比。

C 复制代码

#define DIV_CEIL(x, y) (((x) + (y) - 1) / (y))
__global__ void naiveKernel(const half *__restrict__ A, const half *__restrict__ B, half *__restrict__ C, size_t M,
                                size_t N, size_t K) {
    size_t row = threadIdx.x + blockDim.x * blockIdx.x;
    size_t col = threadIdx.y + blockDim.y * blockIdx.y;
    if (row < M && col < N) {
        half tmp = 0.0;
        for (size_t i = 0; i < K; ++i) {
            tmp += A[row * K + i] * B[i + col * K];
        }
        C[row * N + col] = tmp;
    }
}
void hgemmNaive(half *A, half *B, half *C, size_t M, size_t N, size_t K) {
    dim3 block(16, 16);
    dim3 grid(DIV_CEIL(M, block.x), DIV_CEIL(N, block.y));
    naiveKernel<<<grid, block>>>(A, B, C, M, N, K);
}

Tensor Core

我们再来看如何用WMMA API来构建naive kernel，参考cuda sample。与CUDA Core naive不同的是，WMMA需要按照每个warp处理一个矩阵C的WMMA_M * WMMA_N大小的tile的思路来构建，因为Tensor Core的计算层级是warp级别，计算的矩阵元素也是二维的。接下来，与CUDA Core naive的处理思路一致，首先确定当前warp处理矩阵C的tile坐标，声明计算tilie所需的fragment，再以WMMA_K为步长遍历K并直接从global memory中加载所需A、B矩阵tile到fragment参与计算，最后将计算结果从fragment直接写回矩阵C。所有block计算完成之后即可得到矩阵C。值得注意的是，load_matrix_sync和store_matrix_sync都是按stride访问矩阵元素。

C 复制代码

#include <mma.h>
#define WARP_SIZE 32
#define WMMA_M 16
#define WMMA_N 16
#define WMMA_K 16
using namespace nvcuda;
 
__global__ void wmmaNaiveKernel(const half *__restrict__ A, const half *__restrict__ B, half *__restrict__ C, size_t M,
                                size_t N, size_t K) {
    size_t warpM = (blockIdx.x * blockDim.x + threadIdx.x) / WARP_SIZE;
    size_t warpN = (blockIdx.y * blockDim.y + threadIdx.y);
    wmma::fragment<wmma::matrix_a, WMMA_M, WMMA_N, WMMA_K, half, wmma::row_major> a_frag;
    wmma::fragment<wmma::matrix_b, WMMA_M, WMMA_N, WMMA_K, half, wmma::col_major> b_frag;
    wmma::fragment<wmma::accumulator, WMMA_M, WMMA_N, WMMA_K, half> c_frag;
    wmma::fill_fragment(c_frag, 0.0f);
    for (size_t i = 0; i < K; i += WMMA_K) {
        size_t aCol = i;
        size_t aRow = warpM * WMMA_M;
        size_t bCol = warpN * WMMA_N;
        size_t bRow = i;
        if (aRow < M && aCol < K && bRow < K && bCol < N) {
            wmma::load_matrix_sync(a_frag, A + aCol + aRow * K, K);
            wmma::load_matrix_sync(b_frag, B + bRow + bCol * K, K);
            wmma::mma_sync(c_frag, a_frag, b_frag, c_frag);
        }
    }
    size_t cCol = warpN * WMMA_N;
    size_t cRow = warpM * WMMA_M;
    if (cRow < M && cCol < N) {
        wmma::store_matrix_sync(C + cCol + cRow * N, c_frag, N, wmma::mem_row_major);
    }
}
void hgemmWmmaNaive(half *A, half *B, half *C, size_t M, size_t N, size_t K) {
    dim3 block(128, 4);
    dim3 grid((M - 1) / (WMMA_M * block.x / WARP_SIZE) + 1, (N - 1) / (WMMA_N * block.y) + 1);
    wmmaNaiveKernel<<<grid, block>>>(A, B, C, M, N, K);
}

从上述两个naive kernel的代码来看调用CUDA Core和Tensor Core的区别如下：

计算层级：CUDA Core是线程级别，Tensor Core是warp级别

计算维度：CUDA Core是一维逐点计算，Tensor Core是二维逐tile计算

计算依赖：WMMA调用Tensor Core需要借助数据存储类fragment，CUDA Core不需要借助其他

示例

C 复制代码

// Wave Matrix Multiply Accumulate (WMMA) using HIP compiler intrinsic
// Does a matrix multiplication of two 16x16, fp16 matrices, and stores them into a 16x16 fp16 result matrix

#include <iostream>
#include <hip/hip_runtime.h>
#include <hip/hip_fp16.h>

using namespace std;

// Use half16 as an alias of the internal clang vector type of 16 fp16 values
typedef _Float16 half16 __attribute__((ext_vector_type(16)));

__global__ void wmma_matmul(__half* a, __half* b, __half* c)
{
    const int gIdx = blockIdx.x * blockDim.x + threadIdx.x;
    const int lIdx = threadIdx.x;

    // a and b fragments are stored in 8 VGPRs each, in packed format, so 16 elements each for a and b
    // a_frag will store one column of the 16x16 matrix A tile
    // b_frag will store one row of the 16x16 matrix B tile
    half16 a_frag;
    half16 b_frag;
    // initialize c fragment to 0
    half16 c_frag = {};

    // lane is (0-31) mod 16 instead of 0-31 due to matrix replication in RDNA 3
    const int lane = lIdx % 16;

    for (int ele = 0; ele < 16; ++ele)
    {
        b_frag[ele] = b[16*ele + lane];
    }

    for (int ele = 0; ele < 16; ++ele)
    {
        a_frag[ele] = a[16 * lane + ele];
    }

    // call the WMMA intrinsic with OPSEL set to "false"
    c_frag = __builtin_amdgcn_wmma_f16_16x16x16_f16_w32(a_frag, b_frag, c_frag, false);

    for (int ele = 0; ele < 8; ++ele)
    {
        const int r = ele * 2 + (lIdx / 16);
        // store results from unpacked c_frag output
        c[16 * r + lane] = c_frag[ele*2];
        // if OPSEL was set to "true", the line above would instead be
        // c[16 * r + lane] = c_frag[ele*2 + 1];
    }

}

int main(int argc, char* argv[])

{
    __half a[16 * 16] = {};
    __half b[16 * 16] = {};
    __half c[16 * 16] = {};
    __half *a_gpu, *b_gpu, *c_gpu;
    hipMalloc(&a_gpu, 16*16 * sizeof(__half));
    hipMalloc(&b_gpu, 16*16 * sizeof(__half));
    hipMalloc(&c_gpu, 16*16 * sizeof(__half));

    // fill in some data into matrices A and B
    for (int i = 0; i < 16; ++i)
    {
        for (int j = 0; j < 16; ++j)
        {
            a[i * 16 + j] = (__half)1.f;
            b[i * 16 + j] = (__half)1.f;
        }
    }

    hipMemcpy(a_gpu, a, (16*16) * sizeof(__half), hipMemcpyHostToDevice);
    hipMemcpy(b_gpu, b, (16*16) * sizeof(__half), hipMemcpyHostToDevice);
    hipMemcpy(c_gpu, c, (16*16) * sizeof(__half), hipMemcpyHostToDevice);

    wmma_matmul<<<dim3(1), dim3(32, 1, 1), 0, 0>>>(a_gpu, b_gpu, c_gpu);

    hipMemcpy(c, c_gpu, (16 * 16) * sizeof(__half), hipMemcpyDeviceToHost);

    hipFree(a_gpu);
    hipFree(b_gpu);
    hipFree(c_gpu);

    for (int i = 0; i < 16; ++i)
    {
        for (int j = 0; j < 16; ++j)
        {
            printf("%f ", (float)c[i * 16 + j]);
        }
        printf("\n");
    }

    return 0;
}