main.cpp
cpp
#include "sum.h"
#include "stdlib.h"
#include <chrono>
// Fills the first l elements of array with the repeating pattern 0, 1, ..., 9.
//   array: destination buffer; must hold at least l floats.
//   l:     number of elements to write; nothing is written when l <= 0.
void init_array(float* array,int l)
{
    int pos = 0;
    while (pos < l)
    {
        // The remainder is computed as an int and converted to float on store.
        const int digit = pos % 10;
        array[pos] = digit;
        ++pos;
    }
}
// Benchmark driver.
// Builds a host array, runs the three GPU reductions and a plain CPU loop over
// the first ARRAY_L elements, and prints the wall-clock time (microseconds)
// and the resulting sum of each variant.
//
// Usage: <program> <threads_per_block>
//   threads_per_block must be a power of two: the kernels halve the stride each
//   round and would skip elements otherwise. It is capped at 1024, the CUDA
//   limit on threads per block.
//
// Returns 0 on success, 1 on a bad argument or a failed host allocation.
int main(int argc,char **argv)
{
    if (argc < 2)
    {
        fprintf(stderr, "usage: %s <threads_per_block>\n", argc > 0 ? argv[0] : "cuda_sum");
        return 1;
    }

    char* endptr = NULL;
    const long requested = strtol(argv[1], &endptr, 10);
    const bool is_number = (endptr != argv[1]) && (*endptr == '\0');
    const bool is_pow2 = requested > 0 && (requested & (requested - 1)) == 0;
    if (!is_number || !is_pow2 || requested > 1024)
    {
        fprintf(stderr, "threads_per_block must be a power of two in [1, 1024], got \"%s\"\n", argv[1]);
        return 1;
    }
    const int block = (int)requested;

    // The array is too large for the stack, so it lives on the heap.
    // The 500 extra elements are padding that the GPU wrappers also copy to the device.
    float *a = (float*)malloc((ARRAY_L+500) * sizeof(float));
    if (a == NULL)
    {
        fprintf(stderr, "failed to allocate %zu bytes of host memory\n", (size_t)(ARRAY_L+500) * sizeof(float));
        return 1;
    }
    float b[1] = {0.0f};
    init_array(a,ARRAY_L+500);

    auto start = std::chrono::steady_clock::now();
    ArraySum( a, b, ARRAY_L, block);
    auto end = std::chrono::steady_clock::now();
    long long us = std::chrono::duration_cast<std::chrono::microseconds>(end - start).count();
    // %lld matches long long; the previous %ld only worked where long is 64-bit.
    printf("gpu1 cost:%lld us;sum=%f\n", us, b[0]);

    start = std::chrono::steady_clock::now();
    ArraySumUnfold( a, b, ARRAY_L, block);
    end = std::chrono::steady_clock::now();
    us = std::chrono::duration_cast<std::chrono::microseconds>(end - start).count();
    printf("gpu2 cost:%lld us;sum=%f\n", us, b[0]);

    start = std::chrono::steady_clock::now();
    ArraySumUnfold3( a, b, ARRAY_L, block);
    end = std::chrono::steady_clock::now();
    us = std::chrono::duration_cast<std::chrono::microseconds>(end - start).count();
    printf("gpu3 cost:%lld us;sum=%f\n", us, b[0]);
    printf("sum=%f\n",b[0]);

    // CPU reference: sequential sum of the same ARRAY_L elements.
    start = std::chrono::steady_clock::now();
    float sum=0;
    for(int i=0;i<ARRAY_L;i++)
    {
        sum += a[i];
    }
    end = std::chrono::steady_clock::now();
    us = std::chrono::duration_cast<std::chrono::microseconds>(end - start).count();
    printf("cpu cost:%lld us;sum=%f\n", us, sum);

    free(a);
    return 0;
}
cpp
#include <cuda_runtime.h>
#include <stdlib.h>
#include "sum.h"
/*
 * sumKernel: in-place tree reduction, one partial sum per block.
 *
 * Each block treats input[blockIdx.x*blockDim.x .. +blockDim.x) as its own small
 * array, folds it onto its first element and stores that value in
 * output[blockIdx.x]. The caller adds the partial sums together.
 *
 * Why the bounds checks matter (blockDim=256, array length=10000, gridDim=40):
 *  - Block 0, warp 0 (blockIdx.x=0, 0<=threadIdx.x<32): idx is 0..31,
 *    idata=&input[0], stride starts at 128 and is halved every round.
 *  - Last block, last warp: tid is 224..255, idx is 256*39+224=10208..10239,
 *    all out of range, so these threads have nothing to add.
 *  - Last block, warp 0: tid is 0..31, idx is 9984..10015, only partly in range.
 *    idata=&input[9984]; the thread with idx 9984 is itself valid, but in the
 *    first round (stride 128) it would read idata[128] = input[9984+128], which
 *    is out of bounds. Checking idx alone is not enough, hence the extra
 *    idx + stride < num test inside the loop.
 *
 * Parameters:
 *   input       device array of at least num floats; overwritten by the reduction.
 *   output      device array with one float per block.
 *   num         number of valid elements in input.
 *   block_times device array with one slot per block; receives the clock64()
 *               cycles thread 0 spent in the kernel.
 *
 * Preconditions: 1-D launch; blockDim.x must be a power of two. With length 4,
 * stride=2 and tid 0,1 do the work; with length 3, stride=1 and only tid 0 works,
 * so input[2] would never be added.
 */
__global__ void sumKernel(float* input,float* output,int num,unsigned long long* block_times)
{
    unsigned long long start = clock64();
    int tid = threadIdx.x;
    int idx = blockIdx.x * blockDim.x + threadIdx.x;
    // Start of this block's small array. Only dereferenced behind a bounds check.
    float* idata = input + blockIdx.x * blockDim.x;

    // No early return for idx >= num: out-of-range threads fail the condition
    // below (idx + stride >= num) and simply take part in every barrier.
    for(int stride = blockDim.x/2; stride > 0; stride /= 2)
    {
        if(tid < stride && ( idx + stride < num)) // first half of the live range
        {
            idata[tid]+=idata[tid + stride];
        }
        __syncthreads(); // the data was updated in place, so every round must synchronize
    }

    if(tid==0)
    {
        // Partial sum of this block; a block that starts past the end contributes 0
        // instead of leaving its output slot unwritten.
        output[blockIdx.x] = (idx < num) ? idata[0] : 0.0f;
        // Single writer: previously every thread stored a different cycle count
        // into the same slot.
        block_times[blockIdx.x] = clock64() - start;
    }
}
/*
 * sumKernelUnfold: like a plain per-block tree reduction, but each block covers
 * two blockDim.x-sized segments of input (the work of two blocks is merged).
 *
 * Step 1: every thread adds the element one segment ahead onto its own element.
 * Step 2: the first segment, which now holds the merged values, is reduced in
 *         place onto its first element, which is stored in output[blockIdx.x].
 *
 * Parameters:
 *   input       device array of at least num floats; overwritten by the reduction.
 *   output      device array with one float per block.
 *   num         number of valid elements in input.
 *   block_times device array with one slot per block; receives the clock64()
 *               cycles thread 0 spent in the kernel.
 *
 * Preconditions: 1-D launch; blockDim.x must be a power of two, otherwise the
 * halving loop skips elements.
 */
__global__ void sumKernelUnfold(float* input,float* output,int num,unsigned long long* block_times)
{
    unsigned long long start = clock64();
    int tid = threadIdx.x;
    // Signed copy so the bounds checks below compare int with int.
    int blockSize = blockDim.x;
    int base = blockIdx.x * blockSize * 2; // two original blocks are merged into one
    int idx = base + tid;
    // Start of this block's small array. Only dereferenced behind a bounds check.
    float* idata = input + base;

    if(idx + blockSize < num) input[idx] += input[idx + blockSize]; // merge the two segments
    __syncthreads();

    // input is merged now, so reduce the merged values of the first segment.
    for(int stride = blockSize/2; stride > 0; stride /= 2)
    {
        if(tid < stride && ( idx + stride < num)) // first half of the live range
        {
            idata[tid]+=idata[tid + stride];
        }
        __syncthreads(); // the data was updated in place, so every round must synchronize
    }

    if(tid==0)
    {
        // Partial sum of this block; a block that starts past the end contributes 0
        // instead of reading beyond the valid data.
        output[blockIdx.x] = (base < num) ? idata[0] : 0.0f;
        // Single writer: previously every thread stored a different cycle count
        // into the same slot.
        block_times[blockIdx.x] = clock64() - start;
    }
}
/*
 * sumKernelUnfold3: per-block in-place reduction where each block covers three
 * blockDim.x-sized segments of input.
 *
 * Phase 1 folds the second and then the third segment onto the first one,
 *         element by element, skipping positions at or beyond num.
 * Phase 2 tree-reduces the first segment onto its first element, which thread 0
 *         stores in output[blockIdx.x].
 *
 * Thread 0 also stores the elapsed clock64() cycles in block_times[blockIdx.x].
 * input is overwritten. The halving loop covers every element only when
 * blockDim.x is a power of two.
 */
__global__ void sumKernelUnfold3(float* input, float* output, int num, unsigned long long* block_times) {
    const unsigned long long t0 = clock64();
    const int lane = threadIdx.x;
    const int width = blockDim.x;
    const int span = width * 3;                 // elements covered by one block
    const int gpos = blockIdx.x * span + lane;  // this thread's global element
    float* tile = input + blockIdx.x * span;    // first segment of this block

    // Phase 1a: second segment (offset width) onto the first.
    if (gpos + width < num) {
        input[gpos] += input[gpos + width];
    }
    __syncthreads();

    // Phase 1b: third segment (offset 2*width) onto the first, which already
    // carries the second segment's contribution.
    if (gpos + 2 * width < num) {
        input[gpos] += input[gpos + 2 * width];
    }
    __syncthreads();

    // Phase 2: halve the live range each round. A partner is used only when it
    // lies inside the first segment and inside the valid part of input.
    int stride = width / 2;
    while (stride > 0) {
        const int partner = lane + stride;
        const bool takes_part = lane < stride;
        const bool partner_in_tile = partner < width;
        if (takes_part && partner_in_tile && (blockIdx.x * span + partner) < num) {
            tile[lane] += tile[partner];
        }
        __syncthreads();
        stride /= 2;
    }

    // Only element 0 of the tile holds the block's result.
    if (lane == 0) {
        output[blockIdx.x] = tile[0];
    }
    const unsigned long long t1 = clock64();
    if (lane == 0) {
        block_times[blockIdx.x] = t1 - t0;
    }
}
/*
 * ArraySum: sums the first num elements of h_a on the GPU with sumKernel
 * (one w-element segment per block) and adds the per-block partial sums on
 * the host.
 *
 * Parameters:
 *   h_a  host array. num + 500 floats are copied to the device, so the buffer
 *        must hold at least that many elements (the extra 500 are padding used
 *        to experiment with out-of-bounds accesses).
 *   h_b  receives the total in *h_b.
 *   num  number of elements to sum; *h_b is set to 0 when num <= 0.
 *   w    threads per block; must be positive (sumKernel needs a power of two).
 *
 * Prints the launch configuration and the average per-block cycle count.
 * Terminates the process on an invalid block size, a failed host allocation or
 * any CUDA error.
 */
void ArraySum(const float* h_a,float* h_b,int num,int w)
{
    if (w <= 0)
    {
        // A zero block size would divide by zero in the grid computation below.
        fprintf(stderr, "ArraySum: block size must be positive, got %d\n", w);
        exit(EXIT_FAILURE);
    }
    if (num <= 0)
    {
        *h_b = 0.0f;
        return;
    }

    float *d_a,*d_b;
    unsigned long long *block_times;
    // Allocate 500 extra elements to be able to observe out-of-bounds accesses.
    CUDA_CHECK(cudaMalloc((void**)&d_a, (num+500) * sizeof(float)));
    CUDA_CHECK(cudaMemcpy(d_a, h_a, (num+500) * sizeof(float), cudaMemcpyHostToDevice));

    // Size the grid from num (not the global ARRAY_L) so it always matches the
    // element count handed to the kernel.
    const int gridSize = (num + w - 1) / w;
    dim3 block(w);
    dim3 grid(gridSize);
    CUDA_CHECK(cudaMalloc((void**)&d_b, gridSize * sizeof(float)));
    CUDA_CHECK(cudaMalloc((void**)&block_times, gridSize * sizeof(unsigned long long)));

    sumKernel<<<grid,block>>>(d_a,d_b,num,block_times);
    // Check the launch and the execution before touching any result buffer.
    CUDA_CHECK(cudaGetLastError());
    CUDA_CHECK(cudaDeviceSynchronize());
    printf("block(%d);grid(%d)\n", w, gridSize);

    unsigned long long* h_block_times = (unsigned long long*)malloc(gridSize * sizeof(unsigned long long));
    float* h_partial = (float*)malloc(gridSize * sizeof(float));
    if (h_block_times == NULL || h_partial == NULL)
    {
        fprintf(stderr, "ArraySum: host allocation failed\n");
        exit(EXIT_FAILURE);
    }
    CUDA_CHECK(cudaMemcpy(h_block_times, block_times, gridSize * sizeof(unsigned long long), cudaMemcpyDeviceToHost));
    CUDA_CHECK(cudaMemcpy(h_partial, d_b, gridSize * sizeof(float), cudaMemcpyDeviceToHost));

    // Accumulate cycles as integers and average in double so large grids do not
    // lose precision.
    unsigned long long total_cycles = 0;
    for (int i = 0; i < gridSize; i++)
    {
        total_cycles += h_block_times[i];
    }
    const double av_time = (double)total_cycles / gridSize;
    printf("block average time:%f cycles\n",av_time);

    // Final result: add the per-block partial sums on the CPU.
    float sum = 0.0f;
    for (int i = 0; i < gridSize; i++) {
        sum += h_partial[i];
    }
    *h_b = sum;

    CUDA_CHECK(cudaFree(d_a));
    CUDA_CHECK(cudaFree(d_b));
    CUDA_CHECK(cudaFree(block_times));
    free(h_partial);
    free(h_block_times);
}
/*
 * ArraySumUnfold: sums the first num elements of h_a on the GPU with
 * sumKernelUnfold (each block covers 2*w elements) and adds the per-block
 * partial sums on the host.
 *
 * Parameters:
 *   h_a  host array. num + 500 floats are copied to the device, so the buffer
 *        must hold at least that many elements (the extra 500 are padding used
 *        to experiment with out-of-bounds accesses).
 *   h_b  receives the total in *h_b.
 *   num  number of elements to sum; *h_b is set to 0 when num <= 0.
 *   w    threads per block; must be positive (the kernel needs a power of two).
 *
 * Prints the launch configuration and the average per-block cycle count.
 * Terminates the process on an invalid block size, a failed host allocation or
 * any CUDA error.
 */
void ArraySumUnfold(const float* h_a,float* h_b,int num,int w)
{
    if (w <= 0)
    {
        // A zero block size would divide by zero in the grid computation below.
        fprintf(stderr, "ArraySumUnfold: block size must be positive, got %d\n", w);
        exit(EXIT_FAILURE);
    }
    if (num <= 0)
    {
        *h_b = 0.0f;
        return;
    }

    float *d_a,*d_b;
    unsigned long long *block_times;
    // Allocate 500 extra elements to be able to observe out-of-bounds accesses.
    CUDA_CHECK(cudaMalloc((void**)&d_a, (num+500) * sizeof(float)));
    CUDA_CHECK(cudaMemcpy(d_a, h_a, (num+500) * sizeof(float), cudaMemcpyHostToDevice));

    // Two segments are merged per block, so round num up to whole 2*w chunks.
    // Halving the length first (ARRAY_L/2) truncates and can drop tail elements.
    const int per_block = 2 * w;
    const int gridSize = (num + per_block - 1) / per_block;
    dim3 block(w);
    dim3 grid(gridSize);
    CUDA_CHECK(cudaMalloc((void**)&d_b, gridSize * sizeof(float)));
    CUDA_CHECK(cudaMalloc((void**)&block_times, gridSize * sizeof(unsigned long long)));

    sumKernelUnfold<<<grid,block>>>(d_a,d_b,num,block_times);
    // Check the launch and the execution before touching any result buffer.
    CUDA_CHECK(cudaGetLastError());
    CUDA_CHECK(cudaDeviceSynchronize());

    unsigned long long* h_block_times = (unsigned long long*)malloc(gridSize * sizeof(unsigned long long));
    float* h_partial = (float*)malloc(gridSize * sizeof(float));
    if (h_block_times == NULL || h_partial == NULL)
    {
        fprintf(stderr, "ArraySumUnfold: host allocation failed\n");
        exit(EXIT_FAILURE);
    }
    CUDA_CHECK(cudaMemcpy(h_block_times, block_times, gridSize * sizeof(unsigned long long), cudaMemcpyDeviceToHost));
    CUDA_CHECK(cudaMemcpy(h_partial, d_b, gridSize * sizeof(float), cudaMemcpyDeviceToHost));
    printf("block(%d);grid(%d)\n", w, gridSize);

    // Accumulate cycles as integers and average in double so large grids do not
    // lose precision.
    unsigned long long total_cycles = 0;
    for (int i = 0; i < gridSize; i++)
    {
        total_cycles += h_block_times[i];
    }
    const double av_time = (double)total_cycles / gridSize;
    printf("unfold block average time:%f cycles\n",av_time);

    // Final result: add the per-block partial sums on the CPU.
    float sum = 0.0f;
    for (int i = 0; i < gridSize; i++) {
        sum += h_partial[i];
    }
    *h_b = sum;

    CUDA_CHECK(cudaFree(d_a));
    CUDA_CHECK(cudaFree(d_b));
    CUDA_CHECK(cudaFree(block_times));
    free(h_partial);
    free(h_block_times);
}
/*
 * ArraySumUnfold3: sums the first num elements of h_a on the GPU with
 * sumKernelUnfold3 (each block covers 3*w elements) and adds the per-block
 * partial sums on the host.
 *
 * Parameters:
 *   h_a  host array. num + 500 floats are copied to the device, so the buffer
 *        must hold at least that many elements (the extra 500 are padding used
 *        to experiment with out-of-bounds accesses).
 *   h_b  receives the total in *h_b.
 *   num  number of elements to sum; *h_b is set to 0 when num <= 0.
 *   w    threads per block; must be positive (the kernel needs a power of two).
 *
 * Prints the launch configuration and the average per-block cycle count.
 * Terminates the process on an invalid block size, a failed host allocation or
 * any CUDA error.
 */
void ArraySumUnfold3(const float* h_a, float* h_b, int num, int w) {
    if (w <= 0) {
        // A zero block size would divide by zero in the grid computation below.
        fprintf(stderr, "ArraySumUnfold3: block size must be positive, got %d\n", w);
        exit(EXIT_FAILURE);
    }
    if (num <= 0) {
        *h_b = 0.0f;
        return;
    }

    float *d_a, *d_b;
    unsigned long long *block_times;
    // Device input; the 500 extra elements are padding for out-of-bounds experiments.
    CUDA_CHECK(cudaMalloc((void**)&d_a, (num + 500) * sizeof(float)));
    CUDA_CHECK(cudaMemcpy(d_a, h_a, (num + 500) * sizeof(float), cudaMemcpyHostToDevice));

    int blockSize = w;
    int total_per_block = blockSize * 3;
    int gridSize = (num + total_per_block - 1) / total_per_block; // round up
    dim3 block(blockSize);
    dim3 grid(gridSize);
    CUDA_CHECK(cudaMalloc((void**)&d_b, gridSize * sizeof(float)));
    CUDA_CHECK(cudaMalloc((void**)&block_times, gridSize * sizeof(unsigned long long)));

    sumKernelUnfold3<<<grid, block>>>(d_a, d_b, num, block_times);
    // A bad launch configuration is only reported through cudaGetLastError();
    // faults during execution surface at the synchronization.
    CUDA_CHECK(cudaGetLastError());
    CUDA_CHECK(cudaDeviceSynchronize());

    // Copy the per-block cycle counts and partial sums back to the host.
    unsigned long long* h_block_times = (unsigned long long*)malloc(gridSize * sizeof(unsigned long long));
    float* h_partial = (float*)malloc(gridSize * sizeof(float));
    if (h_block_times == NULL || h_partial == NULL) {
        fprintf(stderr, "ArraySumUnfold3: host allocation failed\n");
        exit(EXIT_FAILURE);
    }
    CUDA_CHECK(cudaMemcpy(h_block_times, block_times, gridSize * sizeof(unsigned long long), cudaMemcpyDeviceToHost));
    CUDA_CHECK(cudaMemcpy(h_partial, d_b, gridSize * sizeof(float), cudaMemcpyDeviceToHost));

    // Average over blocks that reported a non-zero cycle count (thread 0 of
    // every block writes its slot, so normally that is all of them).
    unsigned long long total_cycles = 0;
    int valid_blocks = 0;
    for (int i = 0; i < gridSize; i++) {
        if (h_block_times[i] > 0) {
            total_cycles += h_block_times[i];
            valid_blocks++;
        }
    }
    double avg_cycles = (valid_blocks > 0) ? (double)total_cycles / valid_blocks : 0.0;
    printf("block(%d);grid(%d)\n", blockSize, gridSize);
    printf("unfold3 block average time: %f cycles\n", avg_cycles);

    // Final result: add the per-block partial sums on the CPU.
    float sum = 0.0f;
    for (int i = 0; i < gridSize; i++) {
        sum += h_partial[i];
    }
    *h_b = sum;

    // Release host and device resources.
    free(h_block_times);
    free(h_partial);
    CUDA_CHECK(cudaFree(d_a));
    CUDA_CHECK(cudaFree(d_b));
    CUDA_CHECK(cudaFree(block_times));
}
sum.h
cpp
#pragma once
#include "stdio.h"
#include "stdlib.h"   // exit / EXIT_FAILURE used by CUDA_CHECK

// Number of elements summed by the benchmark.
#define ARRAY_L 1000000

// CUDA error-checking macro.
// Evaluates `call` once; if the result is not cudaSuccess, prints the file,
// line and CUDA error string to stderr and terminates the process.
// Only expand it in translation units that include the CUDA runtime header.
// The trailing underscore keeps the local from shadowing a caller's `err`.
#define CUDA_CHECK(call) \
do { \
    cudaError_t err_ = (call); \
    if (err_ != cudaSuccess) { \
        fprintf(stderr, "CUDA error at %s:%d: %s\n", __FILE__, __LINE__, \
                cudaGetErrorString(err_)); \
        exit(EXIT_FAILURE); \
    } \
} while (0)

// GPU sum of the first num elements of h_a; the result is stored in *h_b.
// w is the number of threads per block. Each block covers w elements.
void ArraySum(const float* h_a,float* h_b,int num,int w);
// Same contract as ArraySum, using the kernel in which each block covers 2*w elements.
void ArraySumUnfold(const float* h_a,float* h_b,int num,int w);
// Same contract as ArraySum, using the kernel in which each block covers 3*w elements.
void ArraySumUnfold3(const float* h_a, float* h_b, int num, int w);
输出
cpp
eric@eric-virtual-machine:~/cfz/learn_by_DS/CUDA/sum/build$ nvprof ./cuda_sum 256
==162784== NVPROF is profiling process 162784, command: ./cuda_sum 256
block(256);grid(3907)
block average time:1987.840332 cycles
gpu1 cost:239869 us;sum=4500000.000000
block(256);grid(1954)
unfold block average time:2221.188232 cycles
gpu2 cost:713 us;sum=4500000.000000
block(256);grid(1303)
unfold3 block average time: 2941.056792 cycles
gpu3 cost:722 us;sum=4500000.000000
sum=4500000.000000
cpu cost:2214 us;sum=4500000.000000
==162784== Profiling application: ./cuda_sum 256
==162784== Profiling result:
Type Time(%) Time Calls Avg Min Max Name
GPU activities: 78.50% 1.1675ms 3 389.17us 353.86us 430.44us [CUDA memcpy HtoD]
9.94% 147.78us 1 147.78us 147.78us 147.78us sumKernel(float*, float*, int, __int64*)
5.58% 82.945us 1 82.945us 82.945us 82.945us sumKernelUnfold(float*, float*, int, __int64*)
4.76% 70.721us 1 70.721us 70.721us 70.721us sumKernelUnfold3(float*, float*, int, __int64*)
1.23% 18.271us 6 3.0450us 2.3360us 4.3520us [CUDA memcpy DtoH]
API calls: 97.03% 71.709ms 9 7.9677ms 2.8240us 71.389ms cudaMalloc
2.06% 1.5222ms 9 169.14us 11.609us 457.47us cudaMemcpy
0.36% 262.43us 9 29.158us 3.0960us 63.593us cudaFree
0.26% 188.72us 3 62.906us 9.7560us 168.33us cudaLaunchKernel
0.14% 100.87us 3 33.624us 2.2410us 95.706us cudaDeviceSynchronize
0.14% 100.11us 101 991ns 94ns 38.500us cuDeviceGetAttribute
0.02% 14.428us 1 14.428us 14.428us 14.428us cuDeviceGetName
0.01% 4.6110us 1 4.6110us 4.6110us 4.6110us cuDeviceGetPCIBusId
0.00% 1.7840us 3 594ns 149ns 1.3920us cuDeviceGetCount
0.00% 809ns 1 809ns 809ns 809ns cuModuleGetLoadingMode
0.00% 779ns 2 389ns 136ns 643ns cuDeviceGet
0.00% 643ns 2 321ns 272ns 371ns cudaGetLastError
0.00% 244ns 1 244ns 244ns 244ns cuDeviceTotalMem
0.00% 155ns 1 155ns 155ns 155ns cuDeviceGetUuid
| 指标 | sumKernel | sumKernelUnfold | 变化 |
|---|---|---|---|
| Block 平均周期数 | 1985.7 | 2216.1 | +11.6% (每个 Block 变慢) |
| Block 总数 | 3907 | 1954 | -50% |
| 总周期数(平均×个数) | ~7.76M | ~4.33M | -44.2% |
| nvprof 总时间 | 147.74 µs | 82.59 µs | -44.1% |
将原来的两个 block 合并后,虽然每个 block 的计算变复杂了,但是总 block 数减少,最后 GPU 耗时反而减少了,而且总周期数的变化率(-44.2%)和 Kernel 耗时的变化率(-44.1%)几乎完全一样!
- 单 Block 时间(`clock64` 测得):指一个 Block 从开始到结束需要约 2216 / 1.88 GHz ≈ 1.18 µs(假设频率为 1.88 GHz)。
- 内核总时间(`nvprof` 测得):指 1954 个 Block 全部算完需要 82.9 µs。
由于 GTX 1650 只有 14 个 SM,每个 SM 同时最多跑 2 个 Block,即同时只能跑 28 个 Block。
- 1954 个 Block 需要分成 `1954 / 28 ≈ 70` 批(波次)执行。
- 总时间 ≈ `70 批 × 1.18 µs/批 ≈ 82.6 µs`。
- 这完美验证了 `nvprof` 的 82.9 µs!至于为什么每个 SM 最多同时跑 2 个 block,我还不知道;怎么预测每个 block 的周期数,我也不知道。不过现在好歹算出了 `nvprof` 的时间,剩下的再学吧。