**引言:**CUDA的矩阵乘优化经常见到 pragma unroll 的使用,本文通过简单的示例,展示了CPU和CUDA对循环展开前后的性能表现,来通俗理解循环展开的优化策略。
一、什么是循环展开?
简单理解:将代码中的for循环展开,减少循环次数;循环展开的本质是,利用CPU指令级并行,来降低循环的开销,当然,同时也有利于指令流水线的高效调度
优点
- 提高缓存命中(cache hit)率,增加循环体内语句并发执行的可能性(需要循环体内语句不相关);
- 减少分支预测失败的可能性,提高性能
缺点
- 程序代码膨胀、代码可读性降低
- 消耗较多寄存器缓存(SM里的寄存器大小是有限的,SM会根据一个块需要消耗的寄存器大小和线程的个数去分配该SM上块的个数,当一个SM连一个块都分配不了时,就会导致内核启动不了)
二、循环展开的使用
循环展开在CPU和CUDA端都可以使用,但在CPU端可以由程序员手动实现,也可以通过成熟的编译器实现优化。#pragma unroll 是常用在CUDA编程的核函数中对for循环展开的使用方法。
下面通过计算0-100000个数字累加的和为例,展示CPU和CUDA下的对循环展开使用的理解。
CPU端
1)原始不展开
cpp
// Baseline CPU benchmark: sums 0..count-1 with a plain (non-unrolled) loop
// and prints the elapsed wall-clock time plus the result.
// NOTE(review): for count = 100000 the true sum (4999950000) overflows a
// 32-bit int (signed overflow is UB); the printed 704982704 is the wrapped
// value, kept as-is so all variants remain comparable.
void test_cpu_1(int count, const char* name)
{
int sum = 0;
auto start = std::chrono::system_clock::now();
for(int i = 0;i < count;i++){
sum += i;
}
auto end = std::chrono::system_clock::now();
// Elapsed time in whole microseconds.
auto dura = std::chrono::duration_cast<std::chrono::microseconds> (end - start);
std::cout << name <<" cost time: "<< dura.count() << " microseconds" << std::endl;
printf(" sum = %d\n",sum);
}
2)循环展开间隔4次
cpp
// Manually 4-way unrolled variant of test_cpu_1: one loop iteration handles
// four consecutive values, quartering the loop-control overhead.
// Assumes count is a multiple of 4 (true for the article's count = 100000);
// a remainder tail is not handled.
// Note the four statements all read-modify-write the same `sum`, so they form
// a serial dependency chain — test_cpu_3 removes that chain.
void test_cpu_2(int count, const char* name)
{
int sum = 0;
auto start = std::chrono::system_clock::now();
for(int i=0; i<count; i+=4)
{
sum += i;
sum += i+1;
sum += i+2;
sum += i+3;
}
auto end = std::chrono::system_clock::now();
auto dura = std::chrono::duration_cast<std::chrono::microseconds> (end - start);
std::cout << name <<" cost time: "<< dura.count() << " microseconds" << std::endl;
printf(" sum = %d\n",sum);
}
3)循环展开间隔4次,优化循环内的数据依赖关系
上面虽然实现了循环展开,但是循环体内的4行代码之间共用同一个sum变量,所以是有先后依赖的。如果我们把它们之间的依赖关系去掉,则能进一步提升代码性能。
cpp
// 4-way unrolled variant with four independent accumulators: the four adds
// per iteration have no data dependency on each other, so the CPU can issue
// them in parallel (instruction-level parallelism). Partial sums are combined
// once after the loop. Assumes count is a multiple of 4.
void test_cpu_3(int count, const char* name)
{
int sum = 0;
int sum1=0,sum2=0,sum3=0, sum4=0;
auto start = std::chrono::system_clock::now();
for(int i=0;i < count;i+=4){
sum1 += i;
sum2 += i+1;
sum3 += i+2;
sum4 += i+3;
}
// Fold the independent partial sums into the final result.
sum = sum1+sum2+sum3+sum4;
auto end = std::chrono::system_clock::now();
auto dura = std::chrono::duration_cast<std::chrono::microseconds> (end - start);
std::cout << name <<" cost time: "<< dura.count() << " microseconds" << std::endl;
printf(" sum = %d\n",sum);
}
CUDA端
CUDA则主要对比使用 #pragma unroll 前后的区别。
1)原始不展开
cpp
// Serial accumulation kernel without unrolling; sums 0..count-1 into *sum.
// In this article it is launched as <<<1,1>>> (a single thread). With more
// than one thread the unguarded read-modify-write on *sum would be a data
// race (no atomics) — this kernel is a demo, not a parallel reduction.
__global__ void progam_kernel1(int* sum, int count)
{
for(int i = 0;i < count;i++){
*sum += i;
}
}
2)使用循环展开
cpp
// Same serial accumulation as progam_kernel1, but with #pragma unroll so
// nvcc unrolls the loop, reducing loop-control instructions per element.
// Launched as <<<1,1>>> in this article; racy for >1 thread (see kernel1).
__global__ void progam_kernel2(int* sum, int count)
{
#pragma unroll
for(int i = 0;i < count;i++){
*sum += i;
}
}
性能分析与测试接口实现
上面各种对比的方法测试时间如下,可以看到CPU端循环展开比原始不展开时间减少接近一半,而优化后的循环展开时间又减少将近一半。CUDA端使用pragma unroll后,时间减少三分之二。
cpp
cpu origin cost time: 1079 microseconds
sum = 704982704
cpu pragma unroll cost time: 678 microseconds
sum = 704982704
cpu pragma unroll_1 cost time: 374 microseconds
sum = 704982704
cuda origin cost time: 18 microseconds
sum = 704982704
cuda pragma unroll cost time: 6 microseconds
sum = 704982704
编译如下,因为把kernel函数写在一起了,所以用.cu为后缀命名。
bash
nvcc -o test test_performance.cu
下面是总体实现的代码
cpp
// file name: test_performance.cu
#include <stdio.h>
#include <cuda_runtime.h>
#include "cublas_v2.h"
#include <unistd.h>
#include <chrono>
#include <iostream>
#include <string>
using namespace std;
// Baseline CPU benchmark: sums 0..count-1 with a plain (non-unrolled) loop
// and prints the elapsed wall-clock time plus the result.
// NOTE(review): for count = 100000 the true sum (4999950000) overflows a
// 32-bit int (signed overflow is UB); the printed 704982704 is the wrapped
// value, kept as-is so all variants remain comparable.
void test_cpu_1(int count, const char* name)
{
int sum = 0;
auto start = std::chrono::system_clock::now();
for(int i = 0;i < count;i++){
sum += i;
}
auto end = std::chrono::system_clock::now();
auto dura = std::chrono::duration_cast<std::chrono::microseconds> (end - start);
std::cout << name <<" cost time: "<< dura.count() << " microseconds" << std::endl;
printf(" sum = %d\n",sum);
}
// Manually 4-way unrolled variant of test_cpu_1: one loop iteration handles
// four consecutive values, quartering the loop-control overhead.
// Assumes count is a multiple of 4 (true for count = 100000 used here).
// The four statements share `sum`, forming a serial dependency chain that
// test_cpu_3 eliminates with independent accumulators.
void test_cpu_2(int count, const char* name)
{
int sum = 0;
auto start = std::chrono::system_clock::now();
for(int i=0; i<count; i+=4)
{
sum += i;
sum += i+1;
sum += i+2;
sum += i+3;
}
auto end = std::chrono::system_clock::now();
auto dura = std::chrono::duration_cast<std::chrono::microseconds> (end - start);
std::cout << name <<" cost time: "<< dura.count() << " microseconds" << std::endl;
printf(" sum = %d\n",sum);
}
// 4-way unrolled variant with four independent accumulators: the adds in
// each iteration have no data dependency on each other, enabling
// instruction-level parallelism. Partials are folded after the loop.
// Assumes count is a multiple of 4.
void test_cpu_3(int count, const char* name)
{
int sum = 0;
int sum1=0,sum2=0,sum3=0, sum4=0;
auto start = std::chrono::system_clock::now();
for(int i=0;i < count;i+=4){
sum1 += i;
sum2 += i+1;
sum3 += i+2;
sum4 += i+3;
}
// Combine the independent partial sums.
sum = sum1+sum2+sum3+sum4;
auto end = std::chrono::system_clock::now();
auto dura = std::chrono::duration_cast<std::chrono::microseconds> (end - start);
std::cout << name <<" cost time: "<< dura.count() << " microseconds" << std::endl;
printf(" sum = %d\n",sum);
}
// Serial accumulation kernel without unrolling; sums 0..count-1 into *sum.
// Launched as <<<1,1>>> by test_cuda_1 (single thread). With more than one
// thread the unguarded read-modify-write on *sum would be a data race.
__global__ void progam_kernel1(int* sum, int count)
{
for(int i = 0;i < count;i++){
*sum += i;
}
}
// Same serial accumulation as progam_kernel1, with #pragma unroll so nvcc
// unrolls the loop and reduces per-element loop-control instructions.
// Launched as <<<1,1>>> by test_cuda_2; racy for >1 thread (see kernel1).
__global__ void progam_kernel2(int* sum, int count)
{
#pragma unroll
for(int i = 0;i < count;i++){
*sum += i;
}
}
// Times progam_kernel1 (no unrolling) on a single GPU thread and prints the
// elapsed time plus the accumulated sum copied back to the host.
// Fix: kernel launches are asynchronous, so the original code captured `end`
// right after the launch and therefore measured only launch overhead, not
// kernel execution. cudaDeviceSynchronize() before the end timestamp makes
// the measurement cover the kernel's full run; cudaGetLastError() surfaces
// launch-configuration errors that a launch itself never reports.
void test_cuda_1(int count, const char* name)
{
int sum = 0;
int* g_sum = NULL;
cudaMalloc((void **)&g_sum, sizeof(int) * 1);
cudaMemcpy(g_sum, &sum, 1 * sizeof(int),cudaMemcpyHostToDevice);
auto start = std::chrono::system_clock::now();
progam_kernel1<<<1,1>>>(g_sum, count); // launch kernel (asynchronous)
cudaError_t err = cudaGetLastError(); // catch bad launch configuration
if (err != cudaSuccess) {
    fprintf(stderr, "kernel launch failed: %s\n", cudaGetErrorString(err));
}
cudaDeviceSynchronize(); // wait for the kernel to finish before stopping the clock
auto end = std::chrono::system_clock::now();
auto dura = std::chrono::duration_cast<std::chrono::microseconds> (end - start);
std::cout << name <<" cost time: "<< dura.count() << " microseconds" << std::endl;
// Blocking copy of the device result back to the host.
cudaMemcpy(&sum, g_sum, sizeof(int) * 1, cudaMemcpyDeviceToHost);
printf(" sum = %d\n",sum);
cudaFree(g_sum);
}
// Times progam_kernel2 (#pragma unroll) on a single GPU thread and prints the
// elapsed time plus the accumulated sum copied back to the host.
// Fix: kernel launches are asynchronous, so the original code captured `end`
// right after the launch and therefore measured only launch overhead, not
// kernel execution. cudaDeviceSynchronize() before the end timestamp makes
// the measurement cover the kernel's full run; cudaGetLastError() surfaces
// launch-configuration errors that a launch itself never reports.
void test_cuda_2(int count, const char* name)
{
int sum = 0;
int* g_sum = NULL;
cudaMalloc((void **)&g_sum, sizeof(int) * 1);
cudaMemcpy(g_sum, &sum, 1 * sizeof(int),cudaMemcpyHostToDevice);
auto start = std::chrono::system_clock::now();
progam_kernel2<<<1,1>>>(g_sum, count); // launch kernel (asynchronous)
cudaError_t err = cudaGetLastError(); // catch bad launch configuration
if (err != cudaSuccess) {
    fprintf(stderr, "kernel launch failed: %s\n", cudaGetErrorString(err));
}
cudaDeviceSynchronize(); // wait for the kernel to finish before stopping the clock
auto end = std::chrono::system_clock::now();
auto dura = std::chrono::duration_cast<std::chrono::microseconds> (end - start);
std::cout << name <<" cost time: "<< dura.count() << " microseconds" << std::endl;
// Blocking copy of the device result back to the host.
cudaMemcpy(&sum, g_sum, sizeof(int) * 1, cudaMemcpyDeviceToHost);
printf(" sum = %d\n", sum);
cudaFree(g_sum);
}
// Runs every CPU and CUDA unrolling benchmark with the same element count
// so their reported times are directly comparable.
void test_performance()
{
    const int count = 100000;
    test_cpu_1(count, "cpu origin");
    test_cpu_2(count, "cpu pragma unroll");
    test_cpu_3(count, "cpu pragma unroll_1");
    test_cuda_1(count, "cuda origin");
    test_cuda_2(count, "cuda pragma unroll");
}
// Program entry point; command-line arguments are accepted but unused.
int main(int argc, char *argv[])
{
    (void)argc;
    (void)argv;
    test_performance();
    return 0;
}
借助编译器的性能优化
程序员针对CPU端编写代码时候,可以使用上面的循环展开实现,实际上在c/c++的编译器已经非常成熟,针对这种代码都有对应的优化策略。在实际项目部署时候,可以开启编译器自动优化选项,帮助我们进一步提升代码性能。
比如,本次测试我写了CMakeLists.txt脚本,添加编译器优化的参数后执行结果如下。CPU端和未开启编译器优化相比,时间性能有了很大的提升。手动增加的循环展开的代码时间也大大降低了。
bash
cpu origin cost time: 31 microseconds
sum = 704982704
cpu pragma unroll cost time: 0 microseconds
sum = 704982704
cpu pragma unroll_1 cost time: 0 microseconds
sum = 704982704
cuda origin cost time: 18 microseconds
sum = 704982704
cuda pragma unroll cost time: 6 microseconds
sum = 704982704
上面未开启编译器优化的输出:
cpu origin cost time: 1079 microseconds
sum = 704982704
cpu pragma unroll cost time: 678 microseconds
sum = 704982704
cpu pragma unroll_1 cost time: 374 microseconds
sum = 704982704
cuda origin cost time: 18 microseconds
sum = 704982704
cuda pragma unroll cost time: 6 microseconds
sum = 704982704
在CMakeLists.txt添加了如下一行:
bash
set(CMAKE_CXX_FLAGS "${CMAKE_CXX_FLAGS} -O1 -Wall")
参考: