第27章 Device_partition - 设备分区测试

27.1 概述

设备分区(Device Partition)是OpenCL 1.2引入的特性,允许将物理设备逻辑地划分为多个子设备(sub-devices)。这一特性使应用程序能够更精细地控制资源分配,实现更好的负载均衡和性能隔离。

27.1.1 设备分区的意义

传统单设备模型的限制:

c 复制代码

// 整个GPU作为单一设备
cl_device_id gpu_device;
clGetDeviceIDs(platform, CL_DEVICE_TYPE_GPU, 1, &gpu_device, NULL);

// 所有工作都在同一设备上调度
// 无法隔离不同任务
// 难以实现细粒度的资源管理

设备分区的优势:

c 复制代码

// Split the GPU into several sub-devices.
// Note: the value after CL_DEVICE_PARTITION_EQUALLY is the number of compute
// units per sub-device (here 4), not the number of sub-devices. The fixed
// array of 4 therefore matches the result only on a 16-CU device, which is
// what this illustration assumes.
cl_device_id sub_devices[4];
cl_device_partition_property props[] = {
    CL_DEVICE_PARTITION_EQUALLY, 4, 0
};
clCreateSubDevices(gpu_device, props, 4, sub_devices, NULL);

// Different tasks use different sub-devices
// Resources are isolated
// Better concurrency

27.1.2 测试目录结构

复制代码

test_conformance/device_partition/
├── main.cpp                        # 测试主入口
├── procs.h                         # 测试函数声明
├── testBase.h                      # 测试基类定义
├── test_device_partition.cpp       # 设备分区测试实现
└── CMakeLists.txt                  # 构建配置

27.1.3 测试覆盖范围

测试类型	分区方式	描述
partition_equally	均匀分区	按计算单元数均匀分配
partition_by_counts	按数量分区	指定每个子设备的计算单元数
partition_by_affinity_domain_numa	NUMA域分区	按NUMA节点分区
partition_by_affinity_domain_l4_cache	L4缓存分区	按L4缓存域分区
partition_by_affinity_domain_l3_cache	L3缓存分区	按L3缓存域分区
partition_by_affinity_domain_l2_cache	L2缓存分区	按L2缓存域分区
partition_by_affinity_domain_l1_cache	L1缓存分区	按L1缓存域分区
partition_by_affinity_domain_next_partitionable	下一可分区域	按下一级可分区域分区
partition_all	所有类型	测试所有分区方式

27.2 设备分区API

27.2.1 查询设备分区能力

c 复制代码

// 查询设备是否支持分区
cl_device_partition_property partition_properties[10];
size_t size_ret;

cl_int err = clGetDeviceInfo(
    device,
    CL_DEVICE_PARTITION_PROPERTIES,  // 查询支持的分区类型
    sizeof(partition_properties),
    partition_properties,
    &size_ret);

// partition_properties包含支持的分区类型:
// - CL_DEVICE_PARTITION_EQUALLY
// - CL_DEVICE_PARTITION_BY_COUNTS
// - CL_DEVICE_PARTITION_BY_AFFINITY_DOMAIN

查询最大子设备数量:

c 复制代码

cl_uint max_sub_devices;
err = clGetDeviceInfo(
    device,
    CL_DEVICE_PARTITION_MAX_SUB_DEVICES,
    sizeof(max_sub_devices),
    &max_sub_devices,
    NULL);

printf("Max sub-devices: %u\n", max_sub_devices);

查询设备的计算单元数量:

c 复制代码

cl_uint max_compute_units;
err = clGetDeviceInfo(
    device,
    CL_DEVICE_MAX_COMPUTE_UNITS,
    sizeof(max_compute_units),
    &max_compute_units,
    NULL);

printf("Max compute units: %u\n", max_compute_units);

27.2.2 创建子设备

c 复制代码

cl_int clCreateSubDevices(
    cl_device_id in_device,                          // 父设备
    const cl_device_partition_property *properties, // 分区属性
    cl_uint num_devices,                             // 子设备数量
    cl_device_id *out_devices,                       // 输出子设备数组
    cl_uint *num_devices_ret);                       // 实际创建的子设备数

// 返回值:
// CL_SUCCESS - 成功
// CL_INVALID_DEVICE - 无效设备
// CL_INVALID_VALUE - 无效属性
// CL_DEVICE_PARTITION_FAILED - 分区失败
// CL_INVALID_DEVICE_PARTITION_COUNT - 分区数量无效

27.2.3 释放子设备

c 复制代码

cl_int clReleaseDevice(cl_device_id device);

// 释放子设备引用
// 当引用计数为0时,子设备被销毁

27.3 均匀分区 (PARTITION_EQUALLY)

27.3.1 均匀分区概念

将设备的计算单元均匀分配给子设备:

c 复制代码

// 假设设备有32个计算单元
// 均匀分成4份,每个子设备8个计算单元

cl_device_partition_property props[] = {
    CL_DEVICE_PARTITION_EQUALLY,  // 分区类型
    8,                             // 每个子设备的计算单元数
    0                              // 结束标记
};

计算规则:

复制代码

子设备数量 = max_compute_units / compute_units_per_sub_device

27.3.2 均匀分区示例

c 复制代码

/**
 * Partitions parent_device with CL_DEVICE_PARTITION_EQUALLY so that every
 * sub-device owns half of the parent's compute units, prints the size of
 * each sub-device and releases the sub-devices again.
 *
 * @param parent_device  Device to partition.
 *
 * Every failure is reported with printf and ends the function early.
 */
void test_partition_equally(cl_device_id parent_device) {
    cl_int err;

    // Zero-initialised so a failed query can never leave an indeterminate
    // value behind.
    cl_uint max_compute_units = 0;
    err = clGetDeviceInfo(parent_device, CL_DEVICE_MAX_COMPUTE_UNITS,
                         sizeof(max_compute_units), &max_compute_units, NULL);
    if (err != CL_SUCCESS) {
        printf("Failed to query compute units: %d\n", err);
        return;
    }

    printf("Parent device has %u compute units\n", max_compute_units);

    if (max_compute_units <= 1) {
        printf("Device cannot be partitioned (only 1 CU)\n");
        return;
    }

    // For EQUALLY the value is the compute-unit count per sub-device.
    // The explicit cast avoids a narrowing conversion inside the braced
    // initialiser on targets where cl_device_partition_property (intptr_t)
    // is only 32 bits wide.
    cl_device_partition_property props[] = {
        CL_DEVICE_PARTITION_EQUALLY,
        (cl_device_partition_property)(max_compute_units / 2),
        0
    };

    // First call: only ask how many sub-devices this partitioning yields.
    cl_uint num_sub_devices = 0;
    err = clCreateSubDevices(parent_device, props, 0, NULL, &num_sub_devices);
    if (err != CL_SUCCESS) {
        printf("Failed to query sub-device count: %d\n", err);
        return;
    }

    printf("Can create %u sub-devices\n", num_sub_devices);

    // Second call: actually create them.
    cl_device_id *sub_devices = new cl_device_id[num_sub_devices];
    err = clCreateSubDevices(parent_device, props, num_sub_devices,
                            sub_devices, NULL);
    if (err != CL_SUCCESS) {
        printf("Failed to create sub-devices: %d\n", err);
        delete[] sub_devices;
        return;
    }

    // Report the size of each sub-device, then drop our reference to it.
    for (cl_uint i = 0; i < num_sub_devices; i++) {
        cl_uint sub_compute_units = 0;
        err = clGetDeviceInfo(sub_devices[i], CL_DEVICE_MAX_COMPUTE_UNITS,
                             sizeof(sub_compute_units), &sub_compute_units,
                             NULL);
        if (err == CL_SUCCESS) {
            printf("Sub-device %u has %u compute units\n",
                   i, sub_compute_units);
        } else {
            printf("Failed to query sub-device %u: %d\n", i, err);
        }

        clReleaseDevice(sub_devices[i]);
    }

    delete[] sub_devices;
}

测试场景:

c 复制代码

// Scenario 1: 32 CUs split into 2 parts
max_compute_units = 32;
props[] = {CL_DEVICE_PARTITION_EQUALLY, 16, 0};
// Result: 2 sub-devices with 16 CUs each

// Scenario 2: 32 CUs split into 4 parts
props[] = {CL_DEVICE_PARTITION_EQUALLY, 8, 0};
// Result: 4 sub-devices with 8 CUs each

// Scenario 3: 30 CUs split into 3 parts
max_compute_units = 30;
props[] = {CL_DEVICE_PARTITION_EQUALLY, 10, 0};
// Result: 3 sub-devices with 10 CUs each

// Scenario 4: compute units are not evenly divisible
max_compute_units = 30;
props[] = {CL_DEVICE_PARTITION_EQUALLY, 8, 0};
// Result: 3 sub-devices (8+8+8=24 CUs); the remaining 6 CUs stay unused

27.4 按数量分区 (PARTITION_BY_COUNTS)

27.4.1 按数量分区概念

显式指定每个子设备的计算单元数量:

c 复制代码

// 假设设备有32个计算单元
// 分成3个子设备: 4个CU, 12个CU, 16个CU

cl_device_partition_property props[] = {
    CL_DEVICE_PARTITION_BY_COUNTS,  // 分区类型
    4,                               // 子设备0: 4个CU
    12,                              // 子设备1: 12个CU
    16,                              // 子设备2: 16个CU
    CL_DEVICE_PARTITION_BY_COUNTS_LIST_END,  // 列表结束标记
    0                                // 属性结束标记
};

约束条件:

复制代码

sum(compute_units) <= max_compute_units

27.4.2 按数量分区示例

c 复制代码

/**
 * Partitions parent_device with CL_DEVICE_PARTITION_BY_COUNTS into two
 * sub-devices: one with a single compute unit and one with all remaining
 * compute units. Prints the compute-unit count of each sub-device and
 * releases them.
 *
 * @param parent_device  Device to partition.
 *
 * Every failure is reported with printf and ends the function early.
 */
void test_partition_by_counts(cl_device_id parent_device) {
    cl_int err;

    // Zero-initialised so a failed query cannot leave an indeterminate value.
    cl_uint max_compute_units = 0;
    err = clGetDeviceInfo(parent_device, CL_DEVICE_MAX_COMPUTE_UNITS,
                         sizeof(max_compute_units), &max_compute_units, NULL);
    if (err != CL_SUCCESS) {
        printf("Failed to query compute units: %d\n", err);
        return;
    }

    printf("Parent device has %u compute units\n", max_compute_units);

    if (max_compute_units < 2) {
        printf("Device cannot be partitioned\n");
        return;
    }

    // The count list is closed by CL_DEVICE_PARTITION_BY_COUNTS_LIST_END and
    // the property list itself by the trailing 0. The cast avoids a
    // narrowing conversion in the braced initialiser on 32-bit targets.
    cl_device_partition_property props[] = {
        CL_DEVICE_PARTITION_BY_COUNTS,
        1,                                                      // sub-device 0: 1 CU
        (cl_device_partition_property)(max_compute_units - 1),  // sub-device 1: the rest
        CL_DEVICE_PARTITION_BY_COUNTS_LIST_END,
        0
    };

    // First call: only ask how many sub-devices will be produced.
    cl_uint num_sub_devices = 0;
    err = clCreateSubDevices(parent_device, props, 0, NULL, &num_sub_devices);
    if (err != CL_SUCCESS) {
        printf("Failed to query: %d\n", err);
        return;
    }

    printf("Will create %u sub-devices\n", num_sub_devices);

    // Second call: create the sub-devices.
    cl_device_id *sub_devices = new cl_device_id[num_sub_devices];
    err = clCreateSubDevices(parent_device, props, num_sub_devices,
                            sub_devices, NULL);
    if (err != CL_SUCCESS) {
        printf("Failed to create sub-devices: %d\n", err);
        delete[] sub_devices;
        return;
    }

    for (cl_uint i = 0; i < num_sub_devices; i++) {
        cl_uint units = 0;
        err = clGetDeviceInfo(sub_devices[i], CL_DEVICE_MAX_COMPUTE_UNITS,
                             sizeof(units), &units, NULL);
        if (err == CL_SUCCESS) {
            printf("Sub-device %u: %u CUs\n", i, units);
        } else {
            printf("Failed to query sub-device %u: %d\n", i, err);
        }
        clReleaseDevice(sub_devices[i]);
    }

    delete[] sub_devices;
}

测试场景:

c 复制代码

// 场景1: 32个CU,不均匀分配
max_compute_units = 32;
props[] = {
    CL_DEVICE_PARTITION_BY_COUNTS,
    4, 8, 20,
    CL_DEVICE_PARTITION_BY_COUNTS_LIST_END,
    0
};
// 结果: 3个子设备(4+8+20=32个CU)

// 场景2: 极端分配
props[] = {
    CL_DEVICE_PARTITION_BY_COUNTS,
    1, 31,
    CL_DEVICE_PARTITION_BY_COUNTS_LIST_END,
    0
};
// 结果: 2个子设备(1+31=32个CU)

// 场景3: 多个小子设备
props[] = {
    CL_DEVICE_PARTITION_BY_COUNTS,
    2, 2, 2, 2, 2, 2, 2, 2, 2, 2,  // 10个子设备
    CL_DEVICE_PARTITION_BY_COUNTS_LIST_END,
    0
};
// 结果: 10个子设备,每个2个CU

27.5 按亲和域分区 (PARTITION_BY_AFFINITY_DOMAIN)

27.5.1 亲和域概念

按硬件拓扑结构分区,使子设备与特定的硬件资源(如缓存、NUMA节点)对应:

复制代码

设备拓扑结构:
┌─────────────────────────────────────┐
│           设备                       │
├─────────────────┬───────────────────┤
│    NUMA节点0    │    NUMA节点1      │
├────────┬────────┼────────┬──────────┤
│ L3缓存0│ L3缓存1│ L3缓存2│ L3缓存3  │
├────┬───┼────┬───┼────┬───┼────┬─────┤
│ L2 │L2 │ L2 │L2 │ L2 │L2 │ L2 │ L2  │
└────┴───┴────┴───┴────┴───┴────┴─────┘

27.5.2 NUMA域分区

按NUMA(Non-Uniform Memory Access)节点分区:

c 复制代码

// 按NUMA节点分区
cl_device_partition_property props[] = {
    CL_DEVICE_PARTITION_BY_AFFINITY_DOMAIN,
    CL_DEVICE_AFFINITY_DOMAIN_NUMA,
    0
};

// 每个子设备对应一个NUMA节点
// 有利于内存访问局部性

使用示例:

c 复制代码

/**
 * Partitions parent_device along NUMA nodes
 * (CL_DEVICE_PARTITION_BY_AFFINITY_DOMAIN / CL_DEVICE_AFFINITY_DOMAIN_NUMA),
 * prints one line per created sub-device and releases the sub-devices.
 *
 * @param parent_device  Device to partition.
 *
 * Returns early with a message when NUMA partitioning is unsupported or any
 * OpenCL call fails.
 */
void test_numa_partition(cl_device_id parent_device) {
    cl_int err;

    // Ask which affinity domains the device supports. A failed query is
    // treated like "not supported" instead of testing an indeterminate value.
    cl_device_affinity_domain affinity_domain = 0;
    err = clGetDeviceInfo(parent_device,
                         CL_DEVICE_PARTITION_AFFINITY_DOMAIN,
                         sizeof(affinity_domain), &affinity_domain, NULL);

    if (err != CL_SUCCESS ||
        !(affinity_domain & CL_DEVICE_AFFINITY_DOMAIN_NUMA)) {
        printf("NUMA partitioning not supported\n");
        return;
    }

    cl_device_partition_property props[] = {
        CL_DEVICE_PARTITION_BY_AFFINITY_DOMAIN,
        CL_DEVICE_AFFINITY_DOMAIN_NUMA,
        0
    };

    // First call: only ask how many sub-devices would be created.
    cl_uint num_sub_devices = 0;
    err = clCreateSubDevices(parent_device, props, 0, NULL, &num_sub_devices);

    if (err == CL_DEVICE_PARTITION_FAILED) {
        printf("Device cannot be partitioned by NUMA\n");
        return;
    }
    // Any other error also leaves the count unusable, so stop here instead
    // of sizing an allocation with it.
    if (err != CL_SUCCESS) {
        printf("Failed to query NUMA sub-device count: %d\n", err);
        return;
    }

    printf("NUMA partitioning created %u sub-devices\n", num_sub_devices);

    cl_device_id *sub_devices = new cl_device_id[num_sub_devices];
    err = clCreateSubDevices(parent_device, props, num_sub_devices,
                            sub_devices, NULL);

    if (err == CL_SUCCESS) {
        for (cl_uint i = 0; i < num_sub_devices; i++) {
            printf("NUMA sub-device %u created\n", i);
            clReleaseDevice(sub_devices[i]);
        }
    } else {
        printf("Failed to create NUMA sub-devices: %d\n", err);
    }

    delete[] sub_devices;
}

27.5.3 缓存域分区

按不同级别的缓存分区:

c 复制代码

// L1缓存分区
cl_device_partition_property props_l1[] = {
    CL_DEVICE_PARTITION_BY_AFFINITY_DOMAIN,
    CL_DEVICE_AFFINITY_DOMAIN_L1_CACHE,
    0
};

// L2缓存分区
cl_device_partition_property props_l2[] = {
    CL_DEVICE_PARTITION_BY_AFFINITY_DOMAIN,
    CL_DEVICE_AFFINITY_DOMAIN_L2_CACHE,
    0
};

// L3缓存分区
cl_device_partition_property props_l3[] = {
    CL_DEVICE_PARTITION_BY_AFFINITY_DOMAIN,
    CL_DEVICE_AFFINITY_DOMAIN_L3_CACHE,
    0
};

// L4缓存分区(如果支持)
cl_device_partition_property props_l4[] = {
    CL_DEVICE_PARTITION_BY_AFFINITY_DOMAIN,
    CL_DEVICE_AFFINITY_DOMAIN_L4_CACHE,
    0
};

测试示例:

c 复制代码

/**
 * Checks whether parent_device can be partitioned along the given cache
 * affinity domain and prints how many sub-devices that would produce.
 * No sub-devices are actually created.
 *
 * @param parent_device  Device to examine.
 * @param cache_domain   One of CL_DEVICE_AFFINITY_DOMAIN_L1_CACHE ..
 *                       CL_DEVICE_AFFINITY_DOMAIN_L4_CACHE; any other value
 *                       is reported under the name "Unknown".
 */
void test_cache_partition(cl_device_id parent_device,
                         cl_device_affinity_domain cache_domain) {
    cl_int err;

    // Human-readable name used in every message below.
    const char* cache_name;
    switch (cache_domain) {
        case CL_DEVICE_AFFINITY_DOMAIN_L1_CACHE: cache_name = "L1"; break;
        case CL_DEVICE_AFFINITY_DOMAIN_L2_CACHE: cache_name = "L2"; break;
        case CL_DEVICE_AFFINITY_DOMAIN_L3_CACHE: cache_name = "L3"; break;
        case CL_DEVICE_AFFINITY_DOMAIN_L4_CACHE: cache_name = "L4"; break;
        default: cache_name = "Unknown"; break;
    }

    // Ask which affinity domains the device supports. A failed query is
    // treated like "not supported".
    cl_device_affinity_domain affinity_domain = 0;
    err = clGetDeviceInfo(parent_device,
                         CL_DEVICE_PARTITION_AFFINITY_DOMAIN,
                         sizeof(affinity_domain), &affinity_domain, NULL);

    if (err != CL_SUCCESS || !(affinity_domain & cache_domain)) {
        printf("%s cache partitioning not supported\n", cache_name);
        return;
    }

    // cl_device_affinity_domain is a 64-bit unsigned bitfield while the
    // property list holds intptr_t values; without the explicit cast the
    // braced initialiser is an ill-formed narrowing conversion in C++11.
    cl_device_partition_property props[] = {
        CL_DEVICE_PARTITION_BY_AFFINITY_DOMAIN,
        (cl_device_partition_property)cache_domain,
        0
    };

    // Only ask for the number of sub-devices.
    cl_uint num_sub_devices = 0;
    err = clCreateSubDevices(parent_device, props, 0, NULL, &num_sub_devices);

    if (err == CL_DEVICE_PARTITION_FAILED) {
        printf("Device cannot be partitioned by %s cache\n", cache_name);
        return;
    }
    // Any other error leaves the count meaningless, so do not print it.
    if (err != CL_SUCCESS) {
        printf("Failed to query %s cache sub-device count: %d\n",
               cache_name, err);
        return;
    }

    printf("%s cache partitioning: %u sub-devices\n",
           cache_name, num_sub_devices);
}

27.5.4 下一可分区域

c 复制代码

// 使用下一级别的可分区域
cl_device_partition_property props[] = {
    CL_DEVICE_PARTITION_BY_AFFINITY_DOMAIN,
    CL_DEVICE_AFFINITY_DOMAIN_NEXT_PARTITIONABLE,
    0
};

// 驱动选择下一级别的分区方式
// 例如:如果当前按NUMA分区,下一级可能按L3缓存分区

27.6 子设备查询

27.6.1 查询子设备的分区方式

c 复制代码

/**
 * Prints how the given device was partitioned, based on
 * CL_DEVICE_PARTITION_TYPE.
 *
 * @param device  Root device or sub-device to inspect.
 *
 * A root device is reported when the query returns no properties at all or
 * a list whose first entry is 0. Failures are reported with printf.
 */
void query_partition_type(cl_device_id device) {
    cl_int err;
    size_t size = 0;

    // First call: ask for the size of the property list.
    err = clGetDeviceInfo(device, CL_DEVICE_PARTITION_TYPE,
                         0, NULL, &size);
    if (err != CL_SUCCESS) {
        printf("Failed to query partition type: %d\n", err);
        return;
    }

    if (size < sizeof(cl_device_partition_property)) {
        printf("This is a root device (not partitioned)\n");
        return;
    }

    cl_device_partition_property *props =
        (cl_device_partition_property*)malloc(size);
    if (props == NULL) {
        printf("Out of memory while querying partition type\n");
        return;
    }

    err = clGetDeviceInfo(device, CL_DEVICE_PARTITION_TYPE,
                         size, props, NULL);
    if (err != CL_SUCCESS) {
        printf("Failed to query partition type: %d\n", err);
        free(props);
        return;
    }

    // Number of entries actually returned; every read below stays inside it.
    const size_t count = size / sizeof(props[0]);

    // An implementation may also describe a root device with a single 0
    // entry instead of an empty result.
    if (props[0] == 0) {
        printf("This is a root device (not partitioned)\n");
        free(props);
        return;
    }

    printf("Device partition type: ");
    switch (props[0]) {
        case CL_DEVICE_PARTITION_EQUALLY:
            if (count > 1) {
                printf("EQUALLY (CUs per sub-device: %lld)\n",
                       (long long)props[1]);
            } else {
                printf("EQUALLY\n");
            }
            break;

        case CL_DEVICE_PARTITION_BY_COUNTS:
            printf("BY_COUNTS (");
            // Stop at the list terminator, or at the end of the returned
            // data if the terminator is missing.
            for (size_t i = 1;
                 i < count &&
                 props[i] != CL_DEVICE_PARTITION_BY_COUNTS_LIST_END;
                 i++) {
                printf("%lld ", (long long)props[i]);
            }
            printf(")\n");
            break;

        case CL_DEVICE_PARTITION_BY_AFFINITY_DOMAIN:
            printf("BY_AFFINITY_DOMAIN (");
            if (count > 1) {
                switch (props[1]) {
                    case CL_DEVICE_AFFINITY_DOMAIN_NUMA:
                        printf("NUMA"); break;
                    case CL_DEVICE_AFFINITY_DOMAIN_L1_CACHE:
                        printf("L1_CACHE"); break;
                    case CL_DEVICE_AFFINITY_DOMAIN_L2_CACHE:
                        printf("L2_CACHE"); break;
                    case CL_DEVICE_AFFINITY_DOMAIN_L3_CACHE:
                        printf("L3_CACHE"); break;
                    case CL_DEVICE_AFFINITY_DOMAIN_L4_CACHE:
                        printf("L4_CACHE"); break;
                    case CL_DEVICE_AFFINITY_DOMAIN_NEXT_PARTITIONABLE:
                        printf("NEXT_PARTITIONABLE"); break;
                    default:
                        printf("UNKNOWN"); break;
                }
            }
            printf(")\n");
            break;

        default:
            printf("UNKNOWN (0x%llx)\n", (unsigned long long)props[0]);
            break;
    }

    free(props);
}

27.6.2 查询父设备

c 复制代码

/**
 * Reports whether the given device has a parent device and, if so, prints
 * the parent's name.
 *
 * @param device  Device to inspect.
 *
 * A failed CL_DEVICE_PARENT_DEVICE query or a NULL parent is reported as
 * "root device", as before.
 */
void query_parent_device(cl_device_id device) {
    cl_device_id parent = NULL;
    cl_int err = clGetDeviceInfo(device, CL_DEVICE_PARENT_DEVICE,
                                 sizeof(parent), &parent, NULL);

    if (err != CL_SUCCESS || parent == NULL) {
        printf("This is a root device\n");
        return;
    }

    printf("This device has a parent device\n");

    // Empty by default so a failed name query never prints stale stack
    // contents.
    char parent_name[256] = "";
    err = clGetDeviceInfo(parent, CL_DEVICE_NAME,
                         sizeof(parent_name), parent_name, NULL);
    if (err == CL_SUCCESS) {
        printf("Parent device: %s\n", parent_name);
    } else {
        printf("Failed to query parent device name: %d\n", err);
    }
}

27.7 多级分区

27.7.1 递归分区

子设备可以进一步分区:

c 复制代码

/**
 * Prints the device's name and compute-unit count, then repeatedly halves
 * it with CL_DEVICE_PARTITION_EQUALLY, recursing into every sub-device
 * until a device has at most one compute unit or cannot be partitioned.
 *
 * @param device  Device (root or sub-device) to partition.
 * @param level   Current recursion depth; used only to indent the output by
 *                two spaces per level.
 *
 * Sub-devices created at one level are released after the recursion into
 * them returns.
 */
void recursive_partition(cl_device_id device, int level) {
    cl_int err;

    // Both values are initialised so that failed queries print defined
    // data and fall into the "cannot partition" path below.
    char device_name[256] = "";
    err = clGetDeviceInfo(device, CL_DEVICE_NAME,
                         sizeof(device_name), device_name, NULL);
    const char *shown_name = (err == CL_SUCCESS) ? device_name : "<unknown>";

    cl_uint compute_units = 0;
    err = clGetDeviceInfo(device, CL_DEVICE_MAX_COMPUTE_UNITS,
                         sizeof(compute_units), &compute_units, NULL);
    if (err != CL_SUCCESS) {
        printf("%*sFailed to query compute units: %d\n",
               level * 2, "", err);
        return;
    }

    printf("%*sDevice: %s (%u CUs)\n",
           level * 2, "", shown_name, compute_units);

    if (compute_units <= 1) {
        printf("%*sCannot partition further\n", level * 2, "");
        return;
    }

    // Half of the compute units per sub-device. The cast avoids a narrowing
    // conversion in the braced initialiser on 32-bit targets.
    cl_device_partition_property props[] = {
        CL_DEVICE_PARTITION_EQUALLY,
        (cl_device_partition_property)(compute_units / 2),
        0
    };

    // First call: only ask how many sub-devices would be created.
    cl_uint num_sub_devices = 0;
    err = clCreateSubDevices(device, props, 0, NULL, &num_sub_devices);

    if (err != CL_SUCCESS || num_sub_devices == 0) {
        printf("%*sCannot partition further\n", level * 2, "");
        return;
    }

    cl_device_id *sub_devices = new cl_device_id[num_sub_devices];
    err = clCreateSubDevices(device, props, num_sub_devices,
                            sub_devices, NULL);

    if (err == CL_SUCCESS) {
        printf("%*sCreated %u sub-devices\n",
               level * 2, "", num_sub_devices);

        // Recurse first, release afterwards: the sub-device must stay alive
        // while its own children are being created and inspected.
        for (cl_uint i = 0; i < num_sub_devices; i++) {
            recursive_partition(sub_devices[i], level + 1);
            clReleaseDevice(sub_devices[i]);
        }
    } else {
        printf("%*sFailed to create sub-devices: %d\n",
               level * 2, "", err);
    }

    delete[] sub_devices;
}

测试示例:

c 复制代码

// 示例: 64个CU的设备,三级分区
// 级别0: 64个CU
//   ├─ 级别1: 32个CU (均匀分成2份)
//   │   ├─ 级别2: 16个CU (再分成2份)
//   │   │   ├─ 级别3: 8个CU (再分成2份)
//   │   │   └─ 级别3: 8个CU
//   │   └─ 级别2: 16个CU
//   │       ├─ 级别3: 8个CU
//   │       └─ 级别3: 8个CU
//   └─ 级别1: 32个CU
//       └─ ... (类似结构)

27.8 子设备上下文与队列

27.8.1 在子设备上创建上下文

c 复制代码

/**
 * Splits parent_device into sub-devices of 2 compute units each, creates
 * one context containing all of them and one command queue per sub-device,
 * then releases everything again.
 *
 * @param parent_device  Device to partition.
 *
 * Every failure is reported with printf; objects that were created are
 * released on all paths.
 */
void test_sub_device_context(cl_device_id parent_device) {
    cl_int err;

    // 2 compute units per sub-device.
    cl_device_partition_property props[] = {
        CL_DEVICE_PARTITION_EQUALLY,
        2,
        0
    };

    // First call: only ask how many sub-devices would be created.
    cl_uint num_sub_devices = 0;
    err = clCreateSubDevices(parent_device, props, 0, NULL, &num_sub_devices);
    if (err != CL_SUCCESS || num_sub_devices == 0) {
        printf("Failed to query sub-device count: %d\n", err);
        return;
    }

    cl_device_id *sub_devices = new cl_device_id[num_sub_devices];
    err = clCreateSubDevices(parent_device, props, num_sub_devices,
                            sub_devices, NULL);
    if (err != CL_SUCCESS) {
        // Nothing was created, so there is nothing to release.
        printf("Failed to create sub-devices: %d\n", err);
        delete[] sub_devices;
        return;
    }

    // One context shared by all sub-devices.
    cl_context context = clCreateContext(NULL, num_sub_devices,
                                        sub_devices, NULL, NULL, &err);

    if (err == CL_SUCCESS) {
        printf("Created context with %u sub-devices\n", num_sub_devices);

        // One command queue per sub-device.
        for (cl_uint i = 0; i < num_sub_devices; i++) {
            cl_command_queue queue = clCreateCommandQueueWithProperties(
                context, sub_devices[i], NULL, &err);

            if (err == CL_SUCCESS) {
                printf("Created queue for sub-device %u\n", i);
                clReleaseCommandQueue(queue);
            } else {
                printf("Failed to create queue for sub-device %u: %d\n",
                       i, err);
            }
        }

        clReleaseContext(context);
    } else {
        printf("Failed to create context: %d\n", err);
    }

    // Cleanup
    for (cl_uint i = 0; i < num_sub_devices; i++) {
        clReleaseDevice(sub_devices[i]);
    }
    delete[] sub_devices;
}

27.8.2 混合父设备和子设备

c 复制代码

/**
 * Splits parent_device into sub-devices of 4 compute units each and creates
 * a single context that contains both the sub-devices and the parent
 * device, then releases the context and the sub-devices.
 *
 * @param parent_device  Device to partition; it is only added to the
 *                       context and never released here.
 */
void test_mixed_device_context(cl_device_id parent_device) {
    cl_int err;

    // 4 compute units per sub-device.
    cl_device_partition_property props[] = {
        CL_DEVICE_PARTITION_EQUALLY, 4, 0
    };

    // First call: only ask how many sub-devices would be created.
    cl_uint num_sub_devices = 0;
    err = clCreateSubDevices(parent_device, props, 0, NULL, &num_sub_devices);
    if (err != CL_SUCCESS) {
        printf("Failed to query sub-device count: %d\n", err);
        return;
    }

    // Device array layout: [sub-device 0 .. sub-device n-1, parent].
    cl_device_id *devices = new cl_device_id[num_sub_devices + 1];
    err = clCreateSubDevices(parent_device, props, num_sub_devices,
                            devices, NULL);
    if (err != CL_SUCCESS) {
        // Nothing was created, so there is nothing to release.
        printf("Failed to create sub-devices: %d\n", err);
        delete[] devices;
        return;
    }
    devices[num_sub_devices] = parent_device;

    // Context containing the parent and all of its sub-devices.
    cl_context context = clCreateContext(NULL, num_sub_devices + 1,
                                        devices, NULL, NULL, &err);

    if (err == CL_SUCCESS) {
        printf("Created mixed context with parent and %u sub-devices\n",
               num_sub_devices);
        clReleaseContext(context);
    } else {
        printf("Failed to create mixed context: %d\n", err);
    }

    // Release only the sub-devices; the last slot is the caller's parent
    // device.
    for (cl_uint i = 0; i < num_sub_devices; i++) {
        clReleaseDevice(devices[i]);
    }
    delete[] devices;
}

27.9 子设备并发测试

27.9.1 多队列并发执行

c 复制代码

/* Triples one element of data per work-item, indexed by the work-item's
 * global id in dimension 0. */
__kernel void test_kernel(__global int *data) {
    int idx = get_global_id(0);
    data[idx] = data[idx] * 3;
}

测试代码:

c 复制代码

/**
 * Runs test_kernel `iterations` times on every sub-device of parent_device
 * (4 compute units per sub-device, one queue and one buffer each) and
 * checks that every buffer element ends up multiplied by 3^iterations.
 *
 * @param parent_device  Device to partition.
 * @param iterations     Number of kernel launches per sub-device.
 *
 * Uses the kernel_source string defined outside this function. Failures are
 * reported with printf, and every object that was created is released on
 * all paths.
 */
void test_concurrent_execution(cl_device_id parent_device, int iterations) {
    cl_int err;

    // 4 compute units per sub-device.
    cl_device_partition_property props[] = {
        CL_DEVICE_PARTITION_EQUALLY, 4, 0
    };

    // First call: only ask how many sub-devices would be created.
    cl_uint num_sub_devices = 0;
    err = clCreateSubDevices(parent_device, props, 0, NULL, &num_sub_devices);
    if (err != CL_SUCCESS || num_sub_devices == 0) {
        printf("Failed to query sub-device count: %d\n", err);
        return;
    }

    cl_device_id *sub_devices = new cl_device_id[num_sub_devices];
    err = clCreateSubDevices(parent_device, props, num_sub_devices,
                            sub_devices, NULL);
    if (err != CL_SUCCESS) {
        printf("Failed to create sub-devices: %d\n", err);
        delete[] sub_devices;
        return;
    }

    const int DATA_SIZE = 512;

    // All handles start out NULL (the arrays are value-initialised) so the
    // cleanup code at the end can tell which objects were really created.
    cl_context context = NULL;
    cl_program program = NULL;
    cl_kernel kernel = NULL;
    cl_command_queue *queues = new cl_command_queue[num_sub_devices]();
    cl_mem *buffers = new cl_mem[num_sub_devices]();
    int *host_data = new int[DATA_SIZE];
    int *result = new int[DATA_SIZE];

    // Every buffer starts with the same contents: element j holds j.
    for (int j = 0; j < DATA_SIZE; j++) {
        host_data[j] = j;
    }

    // Context, program and kernel; each step runs only if the previous one
    // succeeded.
    context = clCreateContext(NULL, num_sub_devices,
                              sub_devices, NULL, NULL, &err);
    if (err == CL_SUCCESS) {
        program = clCreateProgramWithSource(context, 1,
                                            &kernel_source, NULL, &err);
    }
    if (err == CL_SUCCESS) {
        err = clBuildProgram(program, num_sub_devices, sub_devices,
                             NULL, NULL, NULL);
    }
    if (err == CL_SUCCESS) {
        kernel = clCreateKernel(program, "test_kernel", &err);
    }

    // One queue and one buffer per sub-device.
    for (cl_uint i = 0; err == CL_SUCCESS && i < num_sub_devices; i++) {
        queues[i] = clCreateCommandQueueWithProperties(
            context, sub_devices[i], NULL, &err);
        if (err != CL_SUCCESS) {
            break;
        }

        buffers[i] = clCreateBuffer(context, CL_MEM_COPY_HOST_PTR,
                                    DATA_SIZE * sizeof(int),
                                    host_data, &err);
    }

    if (err != CL_SUCCESS) {
        printf("Failed to set up OpenCL objects: %d\n", err);
    }

    if (err == CL_SUCCESS) {
        printf("Running %d iterations on %u sub-devices concurrently\n",
               iterations, num_sub_devices);

        for (int iter = 0; err == CL_SUCCESS && iter < iterations; iter++) {
            // Enqueue on every sub-device before waiting on any of them.
            for (cl_uint i = 0; i < num_sub_devices; i++) {
                err = clSetKernelArg(kernel, 0, sizeof(cl_mem), &buffers[i]);
                if (err != CL_SUCCESS) {
                    break;
                }

                size_t global_size = DATA_SIZE;
                size_t local_size = 64;
                err = clEnqueueNDRangeKernel(queues[i], kernel, 1, NULL,
                                             &global_size, &local_size,
                                             0, NULL, NULL);
                if (err != CL_SUCCESS) {
                    break;
                }
            }

            // Wait for all queues, even after a failed enqueue, so nothing
            // is still in flight when cleanup starts.
            for (cl_uint i = 0; i < num_sub_devices; i++) {
                clFinish(queues[i]);
            }
        }

        if (err != CL_SUCCESS) {
            printf("Failed to run kernel: %d\n", err);
        }
    }

    if (err == CL_SUCCESS) {
        // 3^iterations, computed once. Unsigned arithmetic wraps instead of
        // overflowing a signed int on the host for larger iteration counts.
        cl_uint factor = 1;
        for (int k = 0; k < iterations; k++) {
            factor *= 3u;
        }

        for (cl_uint i = 0; i < num_sub_devices; i++) {
            cl_int read_err = clEnqueueReadBuffer(
                queues[i], buffers[i], CL_TRUE, 0,
                DATA_SIZE * sizeof(int), result, 0, NULL, NULL);
            if (read_err != CL_SUCCESS) {
                printf("Sub-device %u: failed to read results: %d\n",
                       i, read_err);
                continue;
            }

            // Each element should equal j * 3^iterations (compared on the
            // 32-bit pattern); stop at the first mismatch.
            bool correct = true;
            for (int j = 0; j < DATA_SIZE && correct; j++) {
                const cl_uint expected = (cl_uint)j * factor;
                if ((cl_uint)result[j] != expected) {
                    printf("Sub-device %u: Mismatch at index %d: "
                           "expected %d, got %d\n",
                           i, j, (int)expected, result[j]);
                    correct = false;
                }
            }

            if (correct) {
                printf("Sub-device %u: All results correct\n", i);
            }
        }
    }

    // Cleanup: release only what was actually created.
    for (cl_uint i = 0; i < num_sub_devices; i++) {
        if (buffers[i] != NULL) {
            clReleaseMemObject(buffers[i]);
        }
        if (queues[i] != NULL) {
            clReleaseCommandQueue(queues[i]);
        }
    }
    if (kernel != NULL) {
        clReleaseKernel(kernel);
    }
    if (program != NULL) {
        clReleaseProgram(program);
    }
    if (context != NULL) {
        clReleaseContext(context);
    }
    for (cl_uint i = 0; i < num_sub_devices; i++) {
        clReleaseDevice(sub_devices[i]);
    }

    delete[] result;
    delete[] host_data;
    delete[] buffers;
    delete[] queues;
    delete[] sub_devices;
}

27.10 实际应用场景

27.10.1 任务并行

使用子设备实现不同任务的并行执行:

c 复制代码

/**
 * Example: run four independent tasks in parallel, each on its own
 * sub-device with its own command queue.
 *
 * @param device  Device to partition.
 *
 * The device is split with CL_DEVICE_PARTITION_EQUALLY into sub-devices of
 * (compute units / 4) compute units; the first four sub-devices are used.
 * All queues, the context and every sub-device are released before
 * returning.
 */
void parallel_tasks_example(cl_device_id device) {
    const cl_uint NUM_TASKS = 4;

    // The EQUALLY value is a compute-unit count per sub-device, so derive
    // it from the device instead of assuming a 16-CU device.
    cl_uint compute_units = 0;
    cl_int err = clGetDeviceInfo(device, CL_DEVICE_MAX_COMPUTE_UNITS,
                                 sizeof(compute_units), &compute_units, NULL);
    if (err != CL_SUCCESS || compute_units < NUM_TASKS) {
        printf("Device cannot be split into %u sub-devices\n", NUM_TASKS);
        return;
    }

    cl_device_partition_property props[] = {
        CL_DEVICE_PARTITION_EQUALLY,
        (cl_device_partition_property)(compute_units / NUM_TASKS),
        0
    };

    // The partitioning may yield more than NUM_TASKS sub-devices when the
    // compute units do not divide evenly, so ask for the real count.
    cl_uint num_devices = 0;
    err = clCreateSubDevices(device, props, 0, NULL, &num_devices);
    if (err != CL_SUCCESS || num_devices < NUM_TASKS) {
        printf("Failed to query sub-device count: %d\n", err);
        return;
    }

    cl_device_id *sub_devices = new cl_device_id[num_devices];
    err = clCreateSubDevices(device, props, num_devices, sub_devices, NULL);
    if (err != CL_SUCCESS) {
        printf("Failed to create sub-devices: %d\n", err);
        delete[] sub_devices;
        return;
    }

    cl_context context = clCreateContext(NULL, num_devices,
                                        sub_devices, NULL, NULL, &err);
    if (err == CL_SUCCESS) {
        // One queue per task:
        //   queues[0] - image processing
        //   queues[1] - physics simulation
        //   queues[2] - machine-learning inference
        //   queues[3] - video encoding
        cl_command_queue queues[NUM_TASKS] = {};
        for (cl_uint i = 0; i < NUM_TASKS; i++) {
            queues[i] = clCreateCommandQueueWithProperties(
                context, sub_devices[i], NULL, &err);
            if (err != CL_SUCCESS) {
                printf("Failed to create queue %u: %d\n", i, err);
                break;
            }
        }

        if (err == CL_SUCCESS) {
            // Placeholder: enqueue each task's kernels on its own queue
            // here. The tasks then run in parallel on separate sub-devices.
        }

        for (cl_uint i = 0; i < NUM_TASKS; i++) {
            if (queues[i] != NULL) {
                clFinish(queues[i]);
                clReleaseCommandQueue(queues[i]);
            }
        }
        clReleaseContext(context);
    } else {
        printf("Failed to create context: %d\n", err);
    }

    for (cl_uint i = 0; i < num_devices; i++) {
        clReleaseDevice(sub_devices[i]);
    }
    delete[] sub_devices;
}

27.10.2 资源隔离

c 复制代码

/**
 * Example: give each of four users/tenants a dedicated sub-device so their
 * workloads are isolated from each other.
 *
 * @param device  Device to partition.
 *
 * The device is split with CL_DEVICE_PARTITION_EQUALLY into sub-devices of
 * (compute units / number of users) compute units each. All sub-devices
 * created here are released before returning.
 */
void multi_tenant_example(cl_device_id device) {
    const cl_uint num_users = 4;

    // The per-user share is derived from the device's compute-unit count.
    cl_uint compute_units = 0;
    cl_int err = clGetDeviceInfo(device, CL_DEVICE_MAX_COMPUTE_UNITS,
                                 sizeof(compute_units), &compute_units, NULL);
    if (err != CL_SUCCESS || compute_units < num_users) {
        printf("Device cannot provide one sub-device per user\n");
        return;
    }

    cl_device_partition_property props[] = {
        CL_DEVICE_PARTITION_EQUALLY,
        (cl_device_partition_property)(compute_units / num_users),
        0
    };

    // The partitioning may yield more sub-devices than users when the
    // compute units do not divide evenly, so ask for the real count.
    cl_uint num_devices = 0;
    err = clCreateSubDevices(device, props, 0, NULL, &num_devices);
    if (err != CL_SUCCESS || num_devices < num_users) {
        printf("Failed to query sub-device count: %d\n", err);
        return;
    }

    cl_device_id *user_devices = new cl_device_id[num_devices];
    err = clCreateSubDevices(device, props, num_devices, user_devices, NULL);
    if (err != CL_SUCCESS) {
        printf("Failed to create sub-devices: %d\n", err);
        delete[] user_devices;
        return;
    }

    // Each user gets an independent sub-device, which keeps resources
    // isolated and prevents users from affecting each other.
    for (cl_uint i = 0; i < num_users; i++) {
        // Placeholder: allocate_resources_to_user(i, user_devices[i]);
        // NOTE(review): a real implementation that keeps a sub-device beyond
        // this function would presumably need its own reference
        // (clRetainDevice) - confirm against the OpenCL specification.
    }

    for (cl_uint i = 0; i < num_devices; i++) {
        clReleaseDevice(user_devices[i]);
    }
    delete[] user_devices;
}

27.10.3 NUMA优化

c 复制代码

/**
 * Example: partition a device along NUMA nodes so that memory can be
 * allocated local to each node, reducing cross-node accesses.
 *
 * @param device  Device to partition.
 *
 * Failures are reported with printf; all sub-devices created here are
 * released before returning.
 */
void numa_optimized_example(cl_device_id device) {
    cl_device_partition_property props[] = {
        CL_DEVICE_PARTITION_BY_AFFINITY_DOMAIN,
        CL_DEVICE_AFFINITY_DOMAIN_NUMA,
        0
    };

    // First call: only ask how many NUMA sub-devices would be created.
    cl_uint num_numa_nodes = 0;
    cl_int err = clCreateSubDevices(device, props, 0, NULL, &num_numa_nodes);
    if (err != CL_SUCCESS) {
        printf("Device cannot be partitioned by NUMA: %d\n", err);
        return;
    }

    cl_device_id *numa_devices = new cl_device_id[num_numa_nodes];
    err = clCreateSubDevices(device, props, num_numa_nodes,
                             numa_devices, NULL);
    if (err != CL_SUCCESS) {
        printf("Failed to create NUMA sub-devices: %d\n", err);
        delete[] numa_devices;
        return;
    }

    // Allocate node-local memory for each NUMA sub-device to reduce
    // cross-node accesses.
    for (cl_uint i = 0; i < num_numa_nodes; i++) {
        // Placeholder: allocate_numa_local_memory(i, numa_devices[i]);
    }

    for (cl_uint i = 0; i < num_numa_nodes; i++) {
        clReleaseDevice(numa_devices[i]);
    }
    delete[] numa_devices;
}

27.11 小结

设备分区测试验证OpenCL的设备分区功能,使应用程序能够更灵活地管理和利用硬件资源。

27.11.1 核心要点

分区类型: EQUALLY、BY_COUNTS、BY_AFFINITY_DOMAIN
亲和域: NUMA、L1/L2/L3/L4缓存、NEXT_PARTITIONABLE
子设备管理: 创建、查询、释放
多级分区: 递归分区子设备
并发执行: 多个子设备同时工作

27.11.2 分区方式总结

分区类型	特点	适用场景
EQUALLY	均匀分配	负载均衡的并行任务
BY_COUNTS	自定义分配	不同优先级的任务
NUMA	按NUMA节点	内存密集型应用
L1/L2/L3缓存	按缓存域	缓存敏感型应用

27.11.3 实际优势

✅ 更细粒度的资源控制
✅ 任务隔离和优先级管理
✅ 改善负载均衡
✅ 利用硬件拓扑优化性能
✅ 多租户场景支持

27.11.4 注意事项

⚠️ OpenCL 1.2+才支持
⚠️ 不是所有设备都支持分区
⚠️ 子设备数量受计算单元数限制
⚠️ 需要显式释放子设备
⚠️ 分区可能影响性能(额外开销)

27.11.5 最佳实践

查询支持: 使用前检查设备是否支持分区
合理分配: 根据任务需求选择分区方式
资源管理: 及时释放不用的子设备
性能测试: 比较分区前后的性能
错误处理: 妥善处理分区失败的情况

27.11.6 下一步

完成设备分区测试后,下一章(第28章)将介绍 Device_execution(设备端入队测试)。

参考文档:

OpenCL 1.1 - cl_ext_device_fission 扩展(Device Fission,设备分区的前身)
OpenCL 1.2 / 2.0 Specification - Device Partition (Core Feature,自 OpenCL 1.2 起成为核心特性)
test_conformance/device_partition/ 测试源码