并行编程实战——CUDA编程的流的优先级

一、优先级

优先级在计算机编程中可谓是无所不在，尽管很多开发者没有什么机会直接进行优先级相关的编程，但实际上听得到的地方却非常多。从操作系统的进程优先级到数据队列中处理的优先级，甚至到排队策略的优先级等等，可以说到处都有优先级的影子。

优先级很好理解，就是谁先谁后。举个大家非常熟悉的例子，去银行业务大厅办业务，普通的优先级就是先来先服务；可如果来一个VIP，那就是人家直接越过所有人，先行办理业务。这就是优先级的一个典型的例子。

二、CUDA中流的优先级

在计算机的任务中是有优先级处理的，同样在GPU中的任务也一定会有优先级的处理。具体到CUDA编程中，CUDA中的流处理，也是有优先级划分的。正如上面提到的例子，如果没有VIP的加入，CUDA中的流具有相同的优先级，也就是说，它们按照正常的顺序执行。但总有一些任务，需要快速的应急处理，所以CUDA中也提供了流的优先级控制。在主流架构的GPU中，一般只支持高和低两个优先级的情况。同时，需要注意的是，一定要确保硬件本身支持优先级的处理，换句话说，不是所有的GPU都支持优先级操作。所以在使用优先级操作时，需要先进行下列步骤：

查询GPU是否支持
可以使用CUDA提供的接口cudaDeviceGetStreamPriorityRange函数来查询GPU支持的优先级范围（注意：数值越小表示优先级越高）。如果返回的最低优先级（leastPriority）和最高优先级（greatestPriority）的值相等，则表明此GPU并不支持流的优先级。
真实的性能影响
另外一个重要的情况就是，开发者使用优先级来控制流，目的就是能让高优先级的流优先执行、获取更多的资源，从而更快地执行完成。但实际上，却有可能达不到开发者的目的。其实这和CPU中的调度是一致的：优先级只保证调度的顺序，而不保证结果的执行效率。同时，和多线程中的调度一样，不当的优先级设置，可能导致低优先级的流被饿死。
官方文档的提示
在NVIDIA的官方文档中，流优先级应用的某些相关接口在未来可能会有所改变。具体的细节请务必查看相关的文档说明。

其实非常好理解为什么要设置流的优先级，现在的AI技术普遍应用到了非常多的场景下。而这些场景往往都需要处理一些紧急情况，这就需要优先处理一些指令。比如在自动驾驶的过程中，突然发现闯入行人需要紧急刹车，一些大数据计算中临时需要提取一些重要或敏感数据等。

下面看一个官方的设置流优先级的方法代码：

c 复制代码

// Query the range of stream priorities supported by the current device.
// In CUDA a numerically LOWER value means a HIGHER priority, so
// greatestPriority <= leastPriority; if both values are equal the device
// does not support stream priorities.
int leastPriority, greatestPriority;
cudaDeviceGetStreamPriorityRange(&leastPriority, &greatestPriority);
// Create one non-blocking stream at each end of the available range.
cudaStream_t st_high, st_low;
cudaStreamCreateWithPriority(&st_high, cudaStreamNonBlocking, greatestPriority);
cudaStreamCreateWithPriority(&st_low, cudaStreamNonBlocking, leastPriority);

代码中首先通过接口获取流的优先级范围，然后根据返回的值来创建高和低两种优先级的流。

三、应用例程

下面看一个简单的流优先级应用的例子：

c 复制代码

#include "cuda_runtime.h"
#include "device_launch_parameters.h"
#include <stdio.h>
#include <cstdio>


// Element-wise vector addition: c[i] = a[i] + b[i] for each i in [0, n).
//
// Each thread handles the single element selected by its global x index
// (blockIdx.x / blockDim.x / threadIdx.x); threads whose index falls outside
// [0, n) do nothing. The identical store is issued 100 times per element --
// presumably to make the kernel run longer for the timing demo; the final
// contents of c do not depend on the repetition count.
__global__ void vecAdd(const float* a, const float* b, float* c, int n) {
	const int gid = threadIdx.x + blockDim.x * blockIdx.x;

	// Out-of-range threads leave immediately.
	if (gid >= n) {
		return;
	}

	int remaining = 100;
	while (remaining-- > 0) {
		c[gid] = a[gid] + b[gid];
	}
}

// Square matrix product on row-major n x n matrices, with every product term
// accumulated 10 times (so each output element is the sum over k of
// a[row][k] * b[k][col], added ten times per k).
//
// Each thread computes one output element: the column comes from the x
// dimension of the launch and the row from the y dimension. Threads that map
// outside the n x n matrix do nothing. The tenfold accumulation looks like
// artificial extra work for the timing demo -- NOTE(review): confirm intent.
__global__ void matrixMul(const float* a, const float* b, float* c, int n) {
	const int col = threadIdx.x + blockDim.x * blockIdx.x;
	const int row = threadIdx.y + blockDim.y * blockIdx.y;

	// Guard: skip threads outside the matrix.
	if (row >= n || col >= n) {
		return;
	}

	const float* aRow = a + row * n;  // start of this thread's row of a
	float acc = 0.0f;

	int k = 0;
	while (k < n) {
		int rep = 0;
		while (rep < 10) {
			acc += aRow[k] * b[k * n + col];
			++rep;
		}
		++k;
	}

	c[row * n + col] = acc;
}


// Evaluates a CUDA runtime call. On failure it prints the error string and the
// source location to stderr and makes the ENCLOSING function return 1, so it
// may only be used inside functions that return int. Resources acquired
// earlier are not released on this path; the process is about to exit anyway.
#define CUDA_CHECK(call)                                                     \
	do {                                                                     \
		cudaError_t cudaStatus_ = (call);                                    \
		if (cudaStatus_ != cudaSuccess) {                                    \
			fprintf(stderr, "CUDA error at %s:%d: %s\n", __FILE__, __LINE__, \
				cudaGetErrorString(cudaStatus_));                            \
			return 1;                                                        \
		}                                                                    \
	} while (0)

// Stream-priority demo.
//
// Creates three non-blocking streams (highest, middle and lowest priority of
// the range reported by the device), launches one kernel in each of them and
// reports, per stream, the time between a start event recorded before any
// kernel is enqueued and a stop event recorded after that stream's kernel.
//
// Returns 0 on success or when the device has no stream-priority support,
// and 1 if any CUDA runtime call or kernel launch fails.
int main() {
	printf("*************start CUDA Priority demo****************\n");

	int devID = 0;
	CUDA_CHECK(cudaSetDevice(devID));

	cudaDeviceProp prop;
	CUDA_CHECK(cudaGetDeviceProperties(&prop, devID));

	printf("device name: %s\n", prop.name);
	printf("CUDA Capability: %d.%d\n", prop.major, prop.minor);

	// Numerically lower values mean higher priority in CUDA, so
	// highPriority <= lowPriority.
	int lowPriority, highPriority;
	CUDA_CHECK(cudaDeviceGetStreamPriorityRange(&lowPriority, &highPriority));

	printf("PriorityRange: least=%d, greattest=%d\n", lowPriority, highPriority);

	// Equal bounds mean the device exposes a single priority level only.
	if (lowPriority == highPriority) {
		printf("not supprot!\n");
		return 0;
	}

	cudaStream_t highPriorityStream, normalPriorityStream, lowPriorityStream;

	// create high
	CUDA_CHECK(cudaStreamCreateWithPriority(&highPriorityStream,
		cudaStreamNonBlocking,
		highPriority));

	// create normal: midpoint of the range; with a narrow range the integer
	// division can make it coincide with one of the bounds.
	int normalPriority = (lowPriority + highPriority) / 2;
	CUDA_CHECK(cudaStreamCreateWithPriority(&normalPriorityStream,
		cudaStreamNonBlocking,
		normalPriority));

	// create low
	CUDA_CHECK(cudaStreamCreateWithPriority(&lowPriorityStream,
		cudaStreamNonBlocking,
		lowPriority));

	printf("create three priority type stream:\n");
	printf("  *** high (priority=%d)\n", highPriority);
	printf("  *** normal (priority=%d)\n", normalPriority);
	printf("  *** low (priority=%d)\n", lowPriority);


	const int vecSize = 1 << 20;
	const int matrixSize = 512;
	const size_t vecBytes = vecSize * sizeof(float);
	const size_t matrixBytes = matrixSize * matrixSize * sizeof(float);

	float *dA, *dB, *dC;
	float *matrixA, *matrixB, *matrixC;
	// Separate output for the low-priority launch: the normal- and
	// low-priority kernels may run concurrently and must not write the same
	// buffer.
	float *matrixCLow;

	CUDA_CHECK(cudaMalloc(&dA, vecBytes));
	CUDA_CHECK(cudaMalloc(&dB, vecBytes));
	CUDA_CHECK(cudaMalloc(&dC, vecBytes));

	CUDA_CHECK(cudaMalloc(&matrixA, matrixBytes));
	CUDA_CHECK(cudaMalloc(&matrixB, matrixBytes));
	CUDA_CHECK(cudaMalloc(&matrixC, matrixBytes));
	CUDA_CHECK(cudaMalloc(&matrixCLow, matrixBytes));

	// The kernels read these buffers, so give them defined contents instead
	// of leaving freshly allocated device memory uninitialized.
	CUDA_CHECK(cudaMemset(dA, 0, vecBytes));
	CUDA_CHECK(cudaMemset(dB, 0, vecBytes));
	CUDA_CHECK(cudaMemset(matrixA, 0, matrixBytes));
	CUDA_CHECK(cudaMemset(matrixB, 0, matrixBytes));
	// Streams created with cudaStreamNonBlocking do not synchronize with the
	// default stream the memsets ran in, so wait for them explicitly.
	CUDA_CHECK(cudaDeviceSynchronize());

	cudaEvent_t startHigh, stopHigh;
	cudaEvent_t startNormal, stopNormal;
	cudaEvent_t startLow, stopLow;

	CUDA_CHECK(cudaEventCreate(&startHigh));
	CUDA_CHECK(cudaEventCreate(&stopHigh));
	CUDA_CHECK(cudaEventCreate(&startNormal));
	CUDA_CHECK(cudaEventCreate(&stopNormal));
	CUDA_CHECK(cudaEventCreate(&startLow));
	CUDA_CHECK(cudaEventCreate(&stopLow));


	printf("\n  streams:Start the kernel, different priorities...\n");

	// 1D launch covering vecSize elements (rounded up to whole blocks).
	dim3 vecBlockDim(256);
	dim3 vecGridDim((vecSize + vecBlockDim.x - 1) / vecBlockDim.x);

	// 2D launch covering the matrixSize x matrixSize output.
	dim3 matBlockDim(16, 16);
	dim3 gridDim_mat((matrixSize + matBlockDim.x - 1) / matBlockDim.x,
		(matrixSize + matBlockDim.y - 1) / matBlockDim.y);

	// All start events are recorded before any kernel is enqueued.
	CUDA_CHECK(cudaEventRecord(startHigh, highPriorityStream));
	CUDA_CHECK(cudaEventRecord(startNormal, normalPriorityStream));
	CUDA_CHECK(cudaEventRecord(startLow, lowPriorityStream));

	printf("high start add...\n");
	vecAdd<<<vecGridDim, vecBlockDim, 0, highPriorityStream>>>(dA, dB, dC, vecSize);
	CUDA_CHECK(cudaGetLastError());  // launch-configuration errors

	printf("normal start mul...\n");
	matrixMul<<<gridDim_mat, matBlockDim, 0, normalPriorityStream>>>(matrixA, matrixB, matrixC, matrixSize);
	CUDA_CHECK(cudaGetLastError());

	printf("low start mul other...\n");
	matrixMul<<<gridDim_mat, matBlockDim, 0, lowPriorityStream>>>(matrixA, matrixB, matrixCLow, matrixSize);
	CUDA_CHECK(cudaGetLastError());

	// Each stop event completes once the kernel in its stream has finished.
	CUDA_CHECK(cudaEventRecord(stopHigh, highPriorityStream));
	CUDA_CHECK(cudaEventRecord(stopNormal, normalPriorityStream));
	CUDA_CHECK(cudaEventRecord(stopLow, lowPriorityStream));


	printf("\n waitting end...\n");

	// Synchronizing also surfaces errors raised while the kernels executed.
	CUDA_CHECK(cudaStreamSynchronize(highPriorityStream));
	CUDA_CHECK(cudaStreamSynchronize(normalPriorityStream));
	CUDA_CHECK(cudaStreamSynchronize(lowPriorityStream));

	float msHigh, msNormal, msLow;

	CUDA_CHECK(cudaEventElapsedTime(&msHigh, startHigh, stopHigh));
	CUDA_CHECK(cudaEventElapsedTime(&msNormal, startNormal, stopNormal));
	CUDA_CHECK(cudaEventElapsedTime(&msLow, startLow, stopLow));

	printf("\n******** exec time calculate ********\n");
	printf("high task Time consumed by priority: %.2f ms\n", msHigh);
	printf("normal task Time consumed by priority: %.2f ms\n", msNormal);
	printf("low task Time consumed by priority: %.2f ms\n", msLow);


	printf("\n********execution order************\n");
	printf("Tasks in high priority streams usually start executing earlier\n");

	CUDA_CHECK(cudaEventDestroy(startHigh));
	CUDA_CHECK(cudaEventDestroy(stopHigh));
	CUDA_CHECK(cudaEventDestroy(startNormal));
	CUDA_CHECK(cudaEventDestroy(stopNormal));
	CUDA_CHECK(cudaEventDestroy(startLow));
	CUDA_CHECK(cudaEventDestroy(stopLow));

	CUDA_CHECK(cudaStreamDestroy(highPriorityStream));
	CUDA_CHECK(cudaStreamDestroy(normalPriorityStream));
	CUDA_CHECK(cudaStreamDestroy(lowPriorityStream));

	CUDA_CHECK(cudaFree(dA));
	CUDA_CHECK(cudaFree(dB));
	CUDA_CHECK(cudaFree(dC));
	CUDA_CHECK(cudaFree(matrixA));
	CUDA_CHECK(cudaFree(matrixB));
	CUDA_CHECK(cudaFree(matrixC));
	CUDA_CHECK(cudaFree(matrixCLow));

	return 0;
}

上面的代码把事件和优先级的应用结合起来，既复习了事件的使用方法，又对优先级进行了演示。

四、总结

基础的东西往往让开发者一看就明白，觉得没啥意思。这就和中学的学习一样，拿起书本感觉啥都会，放下书本解题却一片茫然。这不是说基础没用，而是并没有真正地把基础学会、学通。还需要不断地历练自己，才能为技术的大厦打好基础，从而能够灵活运用。