【模型推理优化学习笔记】CUDA加速矩阵乘计算

矩阵乘可以利用gpu多线程并行的特点进行加速计算，但是传统简单的方法需要多次读取数据到寄存器中，增加耗时，因此利用gpu的共享内存可以被一个block内的所有线程访问到的特性，结合tiling技术进行加速计算。

理论部分不解释了，网上有很多，关键在于网上很多利用共享内存计算的代码存在错误（大部分只有在设置blockDim.x == blockDim.y 的时候，凑巧能对齐index给出正确的结果，若这俩不等，结果就错了），这里给出一个修正的版本：

cpp 复制代码

#include <stdio.h>
#include <stdlib.h>
#include <time.h>
#include <math.h>
#include <assert.h>
#include "cuda_runtime.h"
#include "device_launch_parameters.h"

#define M 32
#define K 32
#define N 32


void initial(float *array, int size)
{
	for (int i = 0; i < size; i++)
	{
		array[i] = (float)(1);
	}
}

void printMatrix(float *array, int row, int col)
{
	float *p = array;
	for (int y = 0; y < row; y++)
	{
		for (int x = 0; x < col; x++)
		{
			printf("%.2f ", p[x]);
		}
		p = p + col;
		printf("\n");
	}
	return;
}


__global__ void multiplicateMatrixOnDevice(float *array_A, float *array_B, float *array_C, int M_p, int K_p, int N_p)
{
	int ix = threadIdx.x + blockDim.x*blockIdx.x;//row number
	int iy = threadIdx.y + blockDim.y*blockIdx.y;//col number

	if (ix < N_p && iy < M_p)
	{
		float sum = 0;
		for (int k = 0; k < K_p; k++)
		{
			sum += array_A[iy*K_p + k] * array_B[k*N_p + ix];
		}
		array_C[iy*N_p + ix] = sum;
	}
}

// Compute C = A * B
//  M, K, K, N, M, N
__global__ void matrixMultiplyShared(float *A, float *B, float *C,
	int numARows, int numAColumns, int numBRows, int numBColumns, int numCRows, int numCColumns)
{
	//@@ Insert code to implement matrix multiplication here
	//@@ You have to use shared memory for this MP
 
    // 1. 相比网上代码，修改这里的index
	__shared__ float sharedM[8][16];  
	__shared__ float sharedN[16][8];  

	int bx = blockIdx.x;  
	int by = blockIdx.y; 
	int tx = threadIdx.x; 
	int ty = threadIdx.y; 

	int row = by * blockDim.y + ty;     
	int col = bx * blockDim.x + tx;     


	float Csub = 0.0;
    // for (int i = 0; i < 2; ++i)     
	for (int i = 0; i < (int)(ceil((float)numAColumns / blockDim.x)); i++)
	{
		if (i*blockDim.x + tx < numAColumns && row < numARows)
			sharedM[ty][tx] = A[row*numAColumns + i*blockDim.x + tx];
		else
			sharedM[ty][tx] = 0.0;

        // 2. 相比网上代码，修改这里的index
		if (i*blockDim.x + tx < numBRows && col < numBColumns)
			sharedN[tx][ty] = B[(i*blockDim.x + tx)*numBColumns + col];
		else
			sharedN[tx][ty] = 0.0;

		__syncthreads();

        // if (blockIdx.x == 0 && blockIdx.y == 1 && threadIdx.x == 0 && threadIdx.y ==0 ) {
        //     printf("sharedM: \n");
        //     for (int i = 0; i < 8; ++i) {
        //         for (int j = 0; j < 16; ++j) {
        //             printf("%f ", sharedM[i][j]);
        //         }
        //         printf("\n");
        //     }
        //     printf("sharedN: \n");
        //     for (int i = 0; i < 16; ++i) {
        //         for (int j = 0; j < 8; ++j) {
        //             printf("%f ", sharedM[i][j]);
        //         }
        //         printf("\n");
        //     }
        // }

		for (int j = 0; j < blockDim.x; j++)
        // 3. 相比网上代码，修改这里的index
			Csub += sharedM[ty][j] * sharedN[j][ty];
		__syncthreads();
	}

	if (row < numCRows && col < numCColumns)
		C[row*numCColumns + col] = Csub;

}


int main(int argc, char **argv)
{
	clock_t start = 0, finish = 0;
	float time;

	int Axy = M * K;
	int Bxy = K * N;
	int Cxy = M * N;

    float *h_A, *h_B, *hostRef, *deviceRef;
	h_A = (float*)malloc(Axy * sizeof(float));
	h_B = (float*)malloc(Bxy * sizeof(float));

    int nBytes = M * N * sizeof(float);
	hostRef = (float*)malloc(Cxy * sizeof(float));
	deviceRef = (float*)malloc(Cxy * sizeof(float));

    initial(h_A, Axy);
	initial(h_B, Bxy);
    // printMatrix(h_A, M, K);



    float *d_A, *d_B, *d_C;
	cudaMalloc((void**)&d_A, Axy * sizeof(float));
	cudaMalloc((void**)&d_B, Bxy * sizeof(float));
	cudaMalloc((void**)&d_C, Cxy * sizeof(float));

    cudaMemcpy(d_A, h_A, Axy * sizeof(float), cudaMemcpyHostToDevice);
	cudaMemcpy(d_B, h_B, Bxy * sizeof(float), cudaMemcpyHostToDevice);

    int dimx = 16;
    int dimy = 16;
    dim3 block(dimx, dimy);
    dim3 grid((M + block.x - 1) / block.x, (N + block.y - 1) / block.y);

    cudaEvent_t gpustart, gpustop;
    float elapsedTime = 0.0;
    cudaEventCreate(&gpustart);
    cudaEventCreate(&gpustop);

    cudaEventRecord(gpustart, 0);
    // multiplicateMatrixOnDevice<<<grid,block>>> (d_A, d_B, d_C, M, K, N);
    matrixMultiplyShared << < grid, block >> > (d_A, d_B, d_C, M, K, K, N, M, N);
    cudaDeviceSynchronize();
    cudaEventRecord(gpustop, 0);
    cudaEventSynchronize(gpustop);

    cudaEventElapsedTime(&elapsedTime, gpustart, gpustop);
    cudaEventDestroy(gpustart);
    cudaEventDestroy(gpustop);

    cudaMemcpy(deviceRef, d_C, Cxy * sizeof(float), cudaMemcpyDeviceToHost);



    printMatrix(deviceRef, M, N);
    return 0;
}