WMMA (Warp-level Matrix Multiply Accumulate) API
对于计算能力在7.0及以上的CUDA设备,可以使用CUDA C++ API调用Tensor Core,支持形如D = AB + C的混合精度的矩阵乘运算。
C
// fragment:Tensor Core数据存储类,支持matrix_a、matrix_b和accumulator
template<typename Use, int m, int n, int k, typename T, typename Layout=void> class fragment;
// load_matrix_sync:Tensor Core数据加载API,支持将矩阵数据从global memory或shared memory加载到fragment
void load_matrix_sync(fragment<...> &a, const T* mptr, unsigned ldm);
void load_matrix_sync(fragment<...> &a, const T* mptr, unsigned ldm, layout_t layout);
// store_matrix_sync:Tensor Core结果存储API,支持将计算结果从fragment存储到global memory或shared memory
void store_matrix_sync(T* mptr, const fragment<...> &a, unsigned ldm, layout_t layout);
// fill_fragment:fragment填充API,支持常数值填充
void fill_fragment(fragment<...> &a, const T& v);
// mma_sync:Tensor Core矩阵乘计算API,支持D = AB + C或者C = AB + C
void mma_sync(fragment<...> &d, const fragment<...> &a, const fragment<...> &b, const fragment<...> &c, bool satf=false);
示例
以m16n16k16为例,实现HGEMM:C = AB,其中矩阵A(M * K,row major)、B(K * N,col major)和C(M * N,row major)的精度均为FP16。首先我们看如何使用CUDA Core写HGEMM naive算法。
CUDA Core
按照每个线程计算矩阵C中的一个元素来构建naive kernel,首先确定当前线程处理矩阵C的元素坐标,再遍历K并直接从global memory中加载所需A、B矩阵元素到寄存器参与计算,最后将计算结果从寄存器直接写回矩阵C。所有block计算完成之后即可得到矩阵C。这个例子不能说简单,只能说技术含量不高,不过我们只是为了对比。
C
#define DIV_CEIL(x, y) (((x) + (y) - 1) / (y))
__global__ void naiveKernel(const half *__restrict__ A, const half *__restrict__ B, half *__restrict__ C, size_t M,
size_t N, size_t K) {
size_t row = threadIdx.x + blockDim.x * blockIdx.x;
size_t col = threadIdx.y + blockDim.y * blockIdx.y;
if (row < M && col < N) {
half tmp = 0.0;
for (size_t i = 0; i < K; ++i) {
tmp += A[row * K + i] * B[i + col * K];
}
C[row * N + col] = tmp;
}
}
void hgemmNaive(half *A, half *B, half *C, size_t M, size_t N, size_t K) {
dim3 block(16, 16);
dim3 grid(DIV_CEIL(M, block.x), DIV_CEIL(N, block.y));
naiveKernel<<<grid, block>>>(A, B, C, M, N, K);
}
Tensor Core
我们再来看如何用WMMA API来构建naive kernel,参考cuda sample。与CUDA Core naive不同的是,WMMA需要按照每个warp处理一个矩阵C的WMMA_M * WMMA_N大小的tile的思路来构建,因为Tensor Core的计算层级是warp级别,计算的矩阵元素也是二维的。接下来,与CUDA Core naive的处理思路一致,首先确定当前warp处理矩阵C的tile坐标,声明计算tilie所需的fragment,再以WMMA_K为步长遍历K并直接从global memory中加载所需A、B矩阵tile到fragment参与计算,最后将计算结果从fragment直接写回矩阵C。所有block计算完成之后即可得到矩阵C。值得注意的是,load_matrix_sync和store_matrix_sync都是按stride访问矩阵元素。
C
#include <mma.h>
#define WARP_SIZE 32
#define WMMA_M 16
#define WMMA_N 16
#define WMMA_K 16
using namespace nvcuda;
__global__ void wmmaNaiveKernel(const half *__restrict__ A, const half *__restrict__ B, half *__restrict__ C, size_t M,
size_t N, size_t K) {
size_t warpM = (blockIdx.x * blockDim.x + threadIdx.x) / WARP_SIZE;
size_t warpN = (blockIdx.y * blockDim.y + threadIdx.y);
wmma::fragment<wmma::matrix_a, WMMA_M, WMMA_N, WMMA_K, half, wmma::row_major> a_frag;
wmma::fragment<wmma::matrix_b, WMMA_M, WMMA_N, WMMA_K, half, wmma::col_major> b_frag;
wmma::fragment<wmma::accumulator, WMMA_M, WMMA_N, WMMA_K, half> c_frag;
wmma::fill_fragment(c_frag, 0.0f);
for (size_t i = 0; i < K; i += WMMA_K) {
size_t aCol = i;
size_t aRow = warpM * WMMA_M;
size_t bCol = warpN * WMMA_N;
size_t bRow = i;
if (aRow < M && aCol < K && bRow < K && bCol < N) {
wmma::load_matrix_sync(a_frag, A + aCol + aRow * K, K);
wmma::load_matrix_sync(b_frag, B + bRow + bCol * K, K);
wmma::mma_sync(c_frag, a_frag, b_frag, c_frag);
}
}
size_t cCol = warpN * WMMA_N;
size_t cRow = warpM * WMMA_M;
if (cRow < M && cCol < N) {
wmma::store_matrix_sync(C + cCol + cRow * N, c_frag, N, wmma::mem_row_major);
}
}
void hgemmWmmaNaive(half *A, half *B, half *C, size_t M, size_t N, size_t K) {
dim3 block(128, 4);
dim3 grid((M - 1) / (WMMA_M * block.x / WARP_SIZE) + 1, (N - 1) / (WMMA_N * block.y) + 1);
wmmaNaiveKernel<<<grid, block>>>(A, B, C, M, N, K);
}
从上述两个naive kernel的代码来看调用CUDA Core和Tensor Core的区别如下:
计算层级:CUDA Core是线程级别,Tensor Core是warp级别
计算维度:CUDA Core是一维逐点计算,Tensor Core是二维逐tile计算
计算依赖:WMMA调用Tensor Core需要借助数据存储类fragment,CUDA Core不需要借助其他
示例
C
// Wave Matrix Multiply Accumulate (WMMA) using HIP compiler intrinsic
// Does a matrix multiplication of two 16x16, fp16 matrices, and stores them into a 16x16 fp16 result matrix
#include <iostream>
#include <hip/hip_runtime.h>
#include <hip/hip_fp16.h>
using namespace std;
// Use half16 as an alias of the internal clang vector type of 16 fp16 values
typedef _Float16 half16 __attribute__((ext_vector_type(16)));
__global__ void wmma_matmul(__half* a, __half* b, __half* c)
{
const int gIdx = blockIdx.x * blockDim.x + threadIdx.x;
const int lIdx = threadIdx.x;
// a and b fragments are stored in 8 VGPRs each, in packed format, so 16 elements each for a and b
// a_frag will store one column of the 16x16 matrix A tile
// b_frag will store one row of the 16x16 matrix B tile
half16 a_frag;
half16 b_frag;
// initialize c fragment to 0
half16 c_frag = {};
// lane is (0-31) mod 16 instead of 0-31 due to matrix replication in RDNA 3
const int lane = lIdx % 16;
for (int ele = 0; ele < 16; ++ele)
{
b_frag[ele] = b[16*ele + lane];
}
for (int ele = 0; ele < 16; ++ele)
{
a_frag[ele] = a[16 * lane + ele];
}
// call the WMMA intrinsic with OPSEL set to "false"
c_frag = __builtin_amdgcn_wmma_f16_16x16x16_f16_w32(a_frag, b_frag, c_frag, false);
for (int ele = 0; ele < 8; ++ele)
{
const int r = ele * 2 + (lIdx / 16);
// store results from unpacked c_frag output
c[16 * r + lane] = c_frag[ele*2];
// if OPSEL was set to "true", the line above would instead be
// c[16 * r + lane] = c_frag[ele*2 + 1];
}
}
int main(int argc, char* argv[])
{
__half a[16 * 16] = {};
__half b[16 * 16] = {};
__half c[16 * 16] = {};
__half *a_gpu, *b_gpu, *c_gpu;
hipMalloc(&a_gpu, 16*16 * sizeof(__half));
hipMalloc(&b_gpu, 16*16 * sizeof(__half));
hipMalloc(&c_gpu, 16*16 * sizeof(__half));
// fill in some data into matrices A and B
for (int i = 0; i < 16; ++i)
{
for (int j = 0; j < 16; ++j)
{
a[i * 16 + j] = (__half)1.f;
b[i * 16 + j] = (__half)1.f;
}
}
hipMemcpy(a_gpu, a, (16*16) * sizeof(__half), hipMemcpyHostToDevice);
hipMemcpy(b_gpu, b, (16*16) * sizeof(__half), hipMemcpyHostToDevice);
hipMemcpy(c_gpu, c, (16*16) * sizeof(__half), hipMemcpyHostToDevice);
wmma_matmul<<<dim3(1), dim3(32, 1, 1), 0, 0>>>(a_gpu, b_gpu, c_gpu);
hipMemcpy(c, c_gpu, (16 * 16) * sizeof(__half), hipMemcpyDeviceToHost);
hipFree(a_gpu);
hipFree(b_gpu);
hipFree(c_gpu);
for (int i = 0; i < 16; ++i)
{
for (int j = 0; j < 16; ++j)
{
printf("%f ", (float)c[i * 16 + j]);
}
printf("\n");
}
return 0;
}