cuda编程笔记（6）--流

cudaHostAlloc

cudaHostAlloc 是 CUDA 提供的用于在 主机端申请页锁定内存（pinned memory） 的函数

`cudaHostAlloc()` 和 `malloc()` 的核心区别

特性/方面	`malloc()`（标准 C）	`cudaHostAlloc()`（CUDA）
内存类型	普通主机内存（分页内存）	页锁定内存（pinned memory）
是否支持 DMA 传输	❌ 需要复制缓冲区	✅ 可直接使用 DMA 提升带宽
是否支持异步传输	❌ 不支持	✅ 支持与 CUDA 流异步 `cudaMemcpyAsync`
是否支持映射给 GPU	❌ 不支持	✅ 可映射到设备端地址空间（取决于 flag）
释放函数	`free()`	`cudaFreeHost()`
是否影响主机调度	否	是（锁页后主机无法交换该页）

`cudaHostAlloc()` 函数原型与参数

cpp 复制代码

cudaError_t cudaHostAlloc(void** ptr, size_t size, unsigned int flags);

参数说明：

void** ptr：返回页锁定主机内存的地址；
size：申请的字节数；
flags：控制内存行为，常见值如下：

Flag	含义
`cudaHostAllocDefault`	默认锁页内存，适用于异步传输
`cudaHostAllocMapped`	允许内存映射到设备（即 GPU 可以直接访问）
`cudaHostAllocWriteCombined`	禁止读优化，仅用于主机写后设备读，提高性能
`cudaHostAllocPortable`	多 GPU 上可见（跨 context 使用）

什么是页锁定的内存

页锁定内存（pinned memory）是指不会被操作系统虚拟内存管理机制移到磁盘的主机内存 ，

它是物理内存中被"锁住"的一段区域，可用于GPU 的直接访问（DMA）和异步数据传输

在操作系统中，主机内存通常是分页（pageable）管理的：

普通内存（如 malloc() 得到的）：是可分页的；
- 如果内存长时间未访问，操作系统可能把它换出到磁盘（swap），释放物理内存。
页锁定内存（如 cudaHostAlloc() 得到的）：是不可分页的；
- 它始终驻留在主机物理内存中，不能被 swap 掉。

为什么这很重要？

因为 GPU 想要高效读取主机内存时，必须使用 DMA（直接内存访问），而 DMA 不能访问"随时可能被换出"的普通内存。

优势	说明
✅ 支持异步内存传输	允许 `cudaMemcpyAsync` 与 kernel 并发
✅ 支持更高带宽的内存传输	通过 DMA 直连传输，避免临时缓存
✅ 可被 GPU 映射访问	如使用 `cudaHostAllocMapped`
✅ 节省系统临时复制开销	避免内部"拷贝到临时 pinned 缓冲区"的中间过程

问题	原因与建议
❗ pinned memory 会限制系统物理内存	不应大规模分配，避免影响主机性能
🔄 必须用 `cudaFreeHost()` 释放	不要用 `free()`
❗ 必须配合流和 `cudaMemcpyAsync` 才能异步传输	否则性能不会提升

速度测试

cpp 复制代码

#include <cuda_runtime.h>
#include <device_launch_parameters.h>

#include <iostream>
#include<cstdio>
#define SIZE 10*1024*1024
void error_handling(cudaError_t res) {
    if (res !=cudaSuccess) {
        std::cout << "error!" << std::endl;
    }
}
//up是内存分配的方向
float cuda_malloc_test(int size, bool up) {
    cudaEvent_t start, stop;
    int* a, * dev_a;
    float elapsedTime;
    error_handling(cudaEventCreate(&start));
    error_handling(cudaEventCreate(&stop));
    a = new int[size];
    error_handling(cudaMalloc((void**)&dev_a,size*sizeof(int)));
    error_handling(cudaEventRecord(start,0));
    //执行100复制操作
    for (int i = 0; i < 100; i++) {
        if (up) {//主机到设备
            error_handling(cudaMemcpy(dev_a, a, size * sizeof(int), cudaMemcpyHostToDevice));
        }
        else {//设备到主机
            error_handling(cudaMemcpy(a, dev_a, size * sizeof(int), cudaMemcpyDeviceToHost));
        }
    }
    error_handling(cudaEventRecord(stop, 0));
    error_handling(cudaEventSynchronize(stop));
    error_handling(cudaEventElapsedTime(&elapsedTime, start, stop));
    delete[]a;
    error_handling(cudaFree(dev_a));
    error_handling(cudaEventDestroy(start));
    error_handling(cudaEventDestroy(stop));
    return elapsedTime;
}
float cuda_host_malloc_test(int size, bool up) {
    cudaEvent_t start, stop;
    int* a, * dev_a;
    float elapsedTime;
    error_handling(cudaEventCreate(&start));
    error_handling(cudaEventCreate(&stop));
    //
    error_handling(cudaHostAlloc((void**)&a, size * sizeof(int),cudaHostAllocDefault));
    //
    error_handling(cudaMalloc((void**)&dev_a, size * sizeof(int)));
    error_handling(cudaEventRecord(start, 0));
    //执行100复制操作
    for (int i = 0; i < 100; i++) {
        if (up) {//主机到设备
            error_handling(cudaMemcpy(dev_a, a, size * sizeof(int), cudaMemcpyHostToDevice));
        }
        else {//设备到主机
            error_handling(cudaMemcpy(a, dev_a, size * sizeof(int), cudaMemcpyDeviceToHost));
        }
    }
    error_handling(cudaEventRecord(stop, 0));
    error_handling(cudaEventSynchronize(stop));
    error_handling(cudaEventElapsedTime(&elapsedTime, start, stop));
    error_handling(cudaFreeHost(a));
    error_handling(cudaFree(dev_a));
    error_handling(cudaEventDestroy(start));
    error_handling(cudaEventDestroy(stop));
    return elapsedTime;
}
int main() {
    float elapsedTime;
    float MB = (float)100 * SIZE * sizeof(int) / 1024/1024;
    //可以自行修改调用的函数，进行速度测试
    elapsedTime = cuda_malloc_test(SIZE, true);
    printf("Time using cudaMalloc: %3.1f ms\n", elapsedTime);
    printf("\tMB/s during copy up:  %3.1f\n", MB / (elapsedTime / 1000));
}

流

CUDA 中采用 流（stream） 能带来性能提升的核心原因是：

流允许数据传输（拷贝）和 GPU 计算（kernel） 之间 并行 / 重叠执行 ，最大化利用 CPU、PCIe 带宽、GPU 核心 等多个资源。

默认情况下（不使用流），操作是串行执行的，比如：

cpp 复制代码

cudaMemcpy(...);               // 主机到设备拷贝 (同步，阻塞)
my_kernel<<<...>>>(...);      // kernel 启动
cudaMemcpy(...);               // 设备到主机拷贝 (同步)

这些操作会：

一个结束后才开始下一个（传输和计算不能并发）；
阻塞 CPU，无法做其他事情；
GPU 计算资源闲置（等数据）；
PCIe 传输通道利用率低。

流可以带来哪些 并发重叠？

传输和计算重叠

通过 流 + pinned memory + async copy，可以做到：

在 stream1 中异步拷贝 A 到 GPU；
同时 stream2 中 GPU 正在处理 B 数据；
然后把 B 结果拷回主机的同时拷入 C 数据......

这称为 pipeline 式数据处理。

时间线	Stream 1	Stream 2	Stream 3
T1	HtoD A
T2	Kernel A	HtoD B
T3	DtoH A	Kernel B	HtoD C
T4		DtoH B	Kernel C
T5			DtoH C

这样：数据传输和 GPU 计算可以重叠，整体吞吐量大幅提升。

多核函数并发调度

多个独立 kernel（如多个图像处理任务），如果：

都用默认流：会串行排队；
用不同的流：调度器会并行启动它们！

前提是设备支持并发执行（Concurrent Kernel Execution）。

什么条件下流才能提升效果？

条件	是否必须	说明
✔ 使用 `cudaMemcpyAsync`	✅ 必须	否则不会异步执行
✔ 主机内存是 pinned memory	✅ 必须	普通 malloc 内存不支持异步
✔ 操作放在非默认流上	✅ 必须	默认流（stream 0）会阻塞其他流
✔ 设备支持并发（多数支持）	✅ 强烈建议	多流多 kernel 才能并行调度
❌ 没有内存访问依赖关系	✅ 最佳	否则执行顺序会被限制

代码示例

cpp 复制代码

#include <cuda_runtime.h>
#include <device_launch_parameters.h>

#include <iostream>
#include<cstdio>
const int N = 1024 * 1024;
#define FULL_DATA_SIZE N*20
void error_handling(cudaError_t res) {
    if (res !=cudaSuccess) {
        std::cout << "error!" << std::endl;
    }
}
//核函数不关键，可以不看
__global__ void kernel(int* a, int* b, int* c) {
    int idx = threadIdx.x + blockIdx.x * blockDim.x;
    if (idx < N) {
        int idx1 = (idx + 1) % 256;
        int idx2 = (idx + 2) % 256;
        float as = (a[idx] + a[idx1] + a[idx2]) / 3.0f;
        float bs = (b[idx] + b[idx1] + b[idx2]) / 3.0f;
        c[idx] = (as + bs) / 2;
    }
}
int main() {
    cudaDeviceProp prop;
    int whichDevice;
    error_handling(cudaGetDevice(&whichDevice));
    error_handling(cudaGetDeviceProperties_v2(&prop, whichDevice));
    if (!prop.deviceOverlap) {//选择一个支持设备重叠功能的设备
        printf("Device will not handle overlappps,so no spped up from streams\n");
        return 0;
    }
    cudaEvent_t start, stop;
    float elappsedTime;
    //启动计时器
    error_handling(cudaEventCreate(&start));
    error_handling(cudaEventCreate(&stop));
    error_handling(cudaEventRecord(start, 0));
    //初始化流
    cudaStream_t stream;
    error_handling(cudaStreamCreate(&stream));
    int* host_a, * host_b, * host_c;
    int* dev_a, * dev_b, * dev_c;
    //分配设备内存
    error_handling(cudaMalloc((void**)&dev_a,N*sizeof(int)));
    error_handling(cudaMalloc((void**)&dev_b, N * sizeof(int)));
    error_handling(cudaMalloc((void**)&dev_c, N * sizeof(int)));
    //分配由流使用的页锁定内存
    error_handling(cudaHostAlloc((void**)&host_a, FULL_DATA_SIZE * sizeof(int), cudaHostAllocDefault));
    error_handling(cudaHostAlloc((void**)&host_b, FULL_DATA_SIZE * sizeof(int), cudaHostAllocDefault));
    error_handling(cudaHostAlloc((void**)&host_c, FULL_DATA_SIZE * sizeof(int), cudaHostAllocDefault));
    for (int i = 0; i < FULL_DATA_SIZE; i++) {
        host_a[i] = rand();
        host_b[i] = rand();
    }
    //每一批做三件事，都通过同一个 stream 异步调度：
    for (int i = 0; i < FULL_DATA_SIZE; i += N) {
        error_handling(cudaMemcpyAsync(dev_a, host_a + i, N * sizeof(int), cudaMemcpyHostToDevice, stream));
        error_handling(cudaMemcpyAsync(dev_b, host_b + i, N * sizeof(int), cudaMemcpyHostToDevice, stream));
        kernel << <N / 256, 256, 0, stream >> > (dev_a, dev_b, dev_c);
        error_handling(cudaMemcpyAsync(host_c + i, dev_c, N * sizeof(int), cudaMemcpyHostToDevice, stream));
    }
    error_handling(cudaStreamSynchronize(stream));
    error_handling(cudaEventRecord(stop, 0));
    error_handling(cudaEventSynchronize(stop));
    error_handling(cudaEventElapsedTime(&elappsedTime, start, stop));
    printf("Time taken : %31f ms\n", elappsedTime);
    error_handling(cudaFreeHost(host_a));
    error_handling(cudaFreeHost(host_b));
    error_handling(cudaFreeHost(host_c));
    error_handling(cudaFree(dev_a));
    error_handling(cudaFree(dev_b));
    error_handling(cudaFree(dev_c));
    error_handling(cudaStreamDestroy(stream));
    error_handling(cudaEventDestroy(start));
    error_handling(cudaEventDestroy(stop));
}

多流 pipelining（真正 overlap）

目前所有操作都用一个 stream，所以 一个 batch 的三个操作仍然串行排队 。

可以使用多个 stream 轮流调度每个 batch：

cpp 复制代码

cudaStream_t streams[4];
for (int i = 0; i < 4; ++i)
    cudaStreamCreate(&streams[i]);

for (int i = 0; i < FULL_DATA_SIZE; i += N) {
    int s = (i / N) % 4;
    cudaMemcpyAsync(..., streams[s]);
    kernel<<<..., ..., 0, streams[s]>>>(...);
    cudaMemcpyAsync(..., streams[s]);
}

这样可实现拷贝和 kernel 的真正 overlap。

流+事件计时

例子：两个 kernel 用两个 stream 并发执行，各自测时间

cpp 复制代码

cudaStream_t stream1, stream2;
cudaEvent_t start1, stop1, start2, stop2;
float time1 = 0.0f, time2 = 0.0f;

cudaStreamCreate(&stream1);
cudaStreamCreate(&stream2);

cudaEventCreate(&start1); cudaEventCreate(&stop1);
cudaEventCreate(&start2); cudaEventCreate(&stop2);

// kernel1 时间测量（绑定到 stream1）
cudaEventRecord(start1, stream1);
my_kernel<<<gridDim, blockDim, 0, stream1>>>(...);
cudaEventRecord(stop1, stream1);

// kernel2 时间测量（绑定到 stream2）
cudaEventRecord(start2, stream2);
my_kernel2<<<gridDim, blockDim, 0, stream2>>>(...);
cudaEventRecord(stop2, stream2);

// 等待各自完成
cudaEventSynchronize(stop1);
cudaEventSynchronize(stop2);

// 计算时间（单位 ms）
cudaEventElapsedTime(&time1, start1, stop1);
cudaEventElapsedTime(&time2, start2, stop2);

printf("Kernel 1 time: %.2f ms\n", time1);
printf("Kernel 2 time: %.2f ms\n", time2);

// 清理
cudaEventDestroy(start1); cudaEventDestroy(stop1);
cudaEventDestroy(start2); cudaEventDestroy(stop2);
cudaStreamDestroy(stream1); cudaStreamDestroy(stream2);

cuda编程笔记（6）--流

cudaHostAlloc

cudaHostAlloc() 和 malloc() 的核心区别

cudaHostAlloc() 函数原型与参数

参数说明：

什么是页锁定的内存

速度测试

流

传输和计算重叠

多核函数并发调度

代码示例

多流 pipelining（真正 overlap）

流+事件计时

`cudaHostAlloc()` 和 `malloc()` 的核心区别

`cudaHostAlloc()` 函数原型与参数