Tensor Core的WMMA API编程入门

WMMA (Warp-level Matrix Multiply Accumulate) API

对于计算能力在7.0及以上的CUDA设备,可以使用CUDA C++ API调用Tensor Core,支持形如D = AB + C的混合精度的矩阵乘运算。

C 复制代码
// fragment:Tensor Core数据存储类,支持matrix_a、matrix_b和accumulator
template<typename Use, int m, int n, int k, typename T, typename Layout=void> class fragment;

// load_matrix_sync:Tensor Core数据加载API,支持将矩阵数据从global memory或shared memory加载到fragment
void load_matrix_sync(fragment<...> &a, const T* mptr, unsigned ldm);
void load_matrix_sync(fragment<...> &a, const T* mptr, unsigned ldm, layout_t layout);

// store_matrix_sync:Tensor Core结果存储API,支持将计算结果从fragment存储到global memory或shared memory
void store_matrix_sync(T* mptr, const fragment<...> &a, unsigned ldm, layout_t layout);

// fill_fragment:fragment填充API,支持常数值填充
void fill_fragment(fragment<...> &a, const T& v);

// mma_sync:Tensor Core矩阵乘计算API,支持D = AB + C或者C = AB + C
void mma_sync(fragment<...> &d, const fragment<...> &a, const fragment<...> &b, const fragment<...> &c, bool satf=false);
示例

以m16n16k16为例,实现HGEMM:C = AB,其中矩阵A(M * K,row major)、B(K * N,col major)和C(M * N,row major)的精度均为FP16。首先我们看如何使用CUDA Core写HGEMM naive算法。

CUDA Core

按照每个线程计算矩阵C中的一个元素来构建naive kernel,首先确定当前线程处理矩阵C的元素坐标,再遍历K并直接从global memory中加载所需A、B矩阵元素到寄存器参与计算,最后将计算结果从寄存器直接写回矩阵C。所有block计算完成之后即可得到矩阵C。这个例子不能说简单,只能说技术含量不高,不过我们只是为了对比。

C 复制代码
#define DIV_CEIL(x, y) (((x) + (y) - 1) / (y))
__global__ void naiveKernel(const half *__restrict__ A, const half *__restrict__ B, half *__restrict__ C, size_t M,
                                size_t N, size_t K) {
    size_t row = threadIdx.x + blockDim.x * blockIdx.x;
    size_t col = threadIdx.y + blockDim.y * blockIdx.y;
    if (row < M && col < N) {
        half tmp = 0.0;
        for (size_t i = 0; i < K; ++i) {
            tmp += A[row * K + i] * B[i + col * K];
        }
        C[row * N + col] = tmp;
    }
}
void hgemmNaive(half *A, half *B, half *C, size_t M, size_t N, size_t K) {
    dim3 block(16, 16);
    dim3 grid(DIV_CEIL(M, block.x), DIV_CEIL(N, block.y));
    naiveKernel<<<grid, block>>>(A, B, C, M, N, K);
}
Tensor Core

我们再来看如何用WMMA API来构建naive kernel,参考cuda sample。与CUDA Core naive不同的是,WMMA需要按照每个warp处理一个矩阵C的WMMA_M * WMMA_N大小的tile的思路来构建,因为Tensor Core的计算层级是warp级别,计算的矩阵元素也是二维的。接下来,与CUDA Core naive的处理思路一致,首先确定当前warp处理矩阵C的tile坐标,声明计算tilie所需的fragment,再以WMMA_K为步长遍历K并直接从global memory中加载所需A、B矩阵tile到fragment参与计算,最后将计算结果从fragment直接写回矩阵C。所有block计算完成之后即可得到矩阵C。值得注意的是,load_matrix_sync和store_matrix_sync都是按stride访问矩阵元素。

C 复制代码
#include <mma.h>
#define WARP_SIZE 32
#define WMMA_M 16
#define WMMA_N 16
#define WMMA_K 16
using namespace nvcuda;
 
__global__ void wmmaNaiveKernel(const half *__restrict__ A, const half *__restrict__ B, half *__restrict__ C, size_t M,
                                size_t N, size_t K) {
    size_t warpM = (blockIdx.x * blockDim.x + threadIdx.x) / WARP_SIZE;
    size_t warpN = (blockIdx.y * blockDim.y + threadIdx.y);
    wmma::fragment<wmma::matrix_a, WMMA_M, WMMA_N, WMMA_K, half, wmma::row_major> a_frag;
    wmma::fragment<wmma::matrix_b, WMMA_M, WMMA_N, WMMA_K, half, wmma::col_major> b_frag;
    wmma::fragment<wmma::accumulator, WMMA_M, WMMA_N, WMMA_K, half> c_frag;
    wmma::fill_fragment(c_frag, 0.0f);
    for (size_t i = 0; i < K; i += WMMA_K) {
        size_t aCol = i;
        size_t aRow = warpM * WMMA_M;
        size_t bCol = warpN * WMMA_N;
        size_t bRow = i;
        if (aRow < M && aCol < K && bRow < K && bCol < N) {
            wmma::load_matrix_sync(a_frag, A + aCol + aRow * K, K);
            wmma::load_matrix_sync(b_frag, B + bRow + bCol * K, K);
            wmma::mma_sync(c_frag, a_frag, b_frag, c_frag);
        }
    }
    size_t cCol = warpN * WMMA_N;
    size_t cRow = warpM * WMMA_M;
    if (cRow < M && cCol < N) {
        wmma::store_matrix_sync(C + cCol + cRow * N, c_frag, N, wmma::mem_row_major);
    }
}
void hgemmWmmaNaive(half *A, half *B, half *C, size_t M, size_t N, size_t K) {
    dim3 block(128, 4);
    dim3 grid((M - 1) / (WMMA_M * block.x / WARP_SIZE) + 1, (N - 1) / (WMMA_N * block.y) + 1);
    wmmaNaiveKernel<<<grid, block>>>(A, B, C, M, N, K);
}

从上述两个naive kernel的代码来看调用CUDA Core和Tensor Core的区别如下:

计算层级:CUDA Core是线程级别,Tensor Core是warp级别

计算维度:CUDA Core是一维逐点计算,Tensor Core是二维逐tile计算

计算依赖:WMMA调用Tensor Core需要借助数据存储类fragment,CUDA Core不需要借助其他

示例
C 复制代码
// Wave Matrix Multiply Accumulate (WMMA) using HIP compiler intrinsic
// Does a matrix multiplication of two 16x16, fp16 matrices, and stores them into a 16x16 fp16 result matrix

#include <iostream>
#include <hip/hip_runtime.h>
#include <hip/hip_fp16.h>

using namespace std;

// Use half16 as an alias of the internal clang vector type of 16 fp16 values
typedef _Float16 half16 __attribute__((ext_vector_type(16)));

__global__ void wmma_matmul(__half* a, __half* b, __half* c)
{
    const int gIdx = blockIdx.x * blockDim.x + threadIdx.x;
    const int lIdx = threadIdx.x;

    // a and b fragments are stored in 8 VGPRs each, in packed format, so 16 elements each for a and b
    // a_frag will store one column of the 16x16 matrix A tile
    // b_frag will store one row of the 16x16 matrix B tile
    half16 a_frag;
    half16 b_frag;
    // initialize c fragment to 0
    half16 c_frag = {};

    // lane is (0-31) mod 16 instead of 0-31 due to matrix replication in RDNA 3
    const int lane = lIdx % 16;

    for (int ele = 0; ele < 16; ++ele)
    {
        b_frag[ele] = b[16*ele + lane];
    }

    for (int ele = 0; ele < 16; ++ele)
    {
        a_frag[ele] = a[16 * lane + ele];
    }

    // call the WMMA intrinsic with OPSEL set to "false"
    c_frag = __builtin_amdgcn_wmma_f16_16x16x16_f16_w32(a_frag, b_frag, c_frag, false);

    for (int ele = 0; ele < 8; ++ele)
    {
        const int r = ele * 2 + (lIdx / 16);
        // store results from unpacked c_frag output
        c[16 * r + lane] = c_frag[ele*2];
        // if OPSEL was set to "true", the line above would instead be
        // c[16 * r + lane] = c_frag[ele*2 + 1];
    }

}

int main(int argc, char* argv[])

{
    __half a[16 * 16] = {};
    __half b[16 * 16] = {};
    __half c[16 * 16] = {};
    __half *a_gpu, *b_gpu, *c_gpu;
    hipMalloc(&a_gpu, 16*16 * sizeof(__half));
    hipMalloc(&b_gpu, 16*16 * sizeof(__half));
    hipMalloc(&c_gpu, 16*16 * sizeof(__half));

    // fill in some data into matrices A and B
    for (int i = 0; i < 16; ++i)
    {
        for (int j = 0; j < 16; ++j)
        {
            a[i * 16 + j] = (__half)1.f;
            b[i * 16 + j] = (__half)1.f;
        }
    }

    hipMemcpy(a_gpu, a, (16*16) * sizeof(__half), hipMemcpyHostToDevice);
    hipMemcpy(b_gpu, b, (16*16) * sizeof(__half), hipMemcpyHostToDevice);
    hipMemcpy(c_gpu, c, (16*16) * sizeof(__half), hipMemcpyHostToDevice);

    wmma_matmul<<<dim3(1), dim3(32, 1, 1), 0, 0>>>(a_gpu, b_gpu, c_gpu);

    hipMemcpy(c, c_gpu, (16 * 16) * sizeof(__half), hipMemcpyDeviceToHost);

    hipFree(a_gpu);
    hipFree(b_gpu);
    hipFree(c_gpu);

    for (int i = 0; i < 16; ++i)
    {
        for (int j = 0; j < 16; ++j)
        {
            printf("%f ", (float)c[i * 16 + j]);
        }
        printf("\n");
    }

    return 0;
}
相关推荐
ALISHENGYA2 分钟前
全国青少年信息学奥林匹克竞赛(信奥赛)备考实战之分支结构(实战项目二)
数据结构·c++·算法
guogaocai1235 分钟前
连续自成核退火热分级(SSA)技术表征共聚聚丙烯(PP)分子链结构
算法
咸鱼桨9 分钟前
《庐山派从入门到...》PWM板载蜂鸣器
人工智能·windows·python·k230·庐山派
arong_xu16 分钟前
现代C++锁介绍
c++·多线程·mutex
强哥之神20 分钟前
Nexa AI发布OmniAudio-2.6B:一款快速的音频语言模型,专为边缘部署设计
人工智能·深度学习·机器学习·语言模型·自然语言处理·音视频·openai
汤姆和杰瑞在瑞士吃糯米粑粑20 分钟前
【C++学习篇】AVL树
开发语言·c++·学习
yusaisai大鱼24 分钟前
tensorflow_probability与tensorflow版本依赖关系
人工智能·python·tensorflow
18号房客24 分钟前
一个简单的深度学习模型例程,使用Keras(基于TensorFlow)构建一个卷积神经网络(CNN)来分类MNIST手写数字数据集。
人工智能·深度学习·机器学习·生成对抗网络·语言模型·自然语言处理·tensorflow
DARLING Zero two♡28 分钟前
【优选算法】Pointer-Slice:双指针的算法切片(下)
java·数据结构·c++·算法·leetcode
神秘的土鸡31 分钟前
神经网络图像隐写术:用AI隐藏信息的艺术
人工智能·深度学习·神经网络