27.1 概述
设备分区(Device Partition)是OpenCL 1.2引入的特性,允许将物理设备逻辑地划分为多个子设备(sub-devices)。这一特性使应用程序能够更精细地控制资源分配,实现更好的负载均衡和性能隔离。
27.1.1 设备分区的意义
传统单设备模型的限制:
c
// The whole GPU is used as one single device
cl_device_id gpu_device;
clGetDeviceIDs(platform, CL_DEVICE_TYPE_GPU, 1, &gpu_device, NULL);
// All work is scheduled onto that same device
// Different tasks cannot be isolated from each other
// Fine-grained resource management is hard to achieve
设备分区的优势:
c
// 将GPU划分为多个子设备
cl_device_id sub_devices[4];
cl_device_partition_property props[] = {
CL_DEVICE_PARTITION_EQUALLY, 4, 0
};
clCreateSubDevices(gpu_device, props, 4, sub_devices, NULL);
// 不同任务使用不同子设备
// 资源隔离
// 更好的并发性
27.1.2 测试目录结构
test_conformance/device_partition/
├── main.cpp # 测试主入口
├── procs.h # 测试函数声明
├── testBase.h # 测试基类定义
├── test_device_partition.cpp # 设备分区测试实现
└── CMakeLists.txt # 构建配置
27.1.3 测试覆盖范围
| 测试类型 | 分区方式 | 描述 |
|---|---|---|
| partition_equally | 均匀分区 | 按计算单元数均匀分配 |
| partition_by_counts | 按数量分区 | 指定每个子设备的计算单元数 |
| partition_by_affinity_domain_numa | NUMA域分区 | 按NUMA节点分区 |
| partition_by_affinity_domain_l4_cache | L4缓存分区 | 按L4缓存域分区 |
| partition_by_affinity_domain_l3_cache | L3缓存分区 | 按L3缓存域分区 |
| partition_by_affinity_domain_l2_cache | L2缓存分区 | 按L2缓存域分区 |
| partition_by_affinity_domain_l1_cache | L1缓存分区 | 按L1缓存域分区 |
| partition_by_affinity_domain_next_partitionable | 下一可分区域 | 按下一级可分区域分区 |
| partition_all | 所有类型 | 测试所有分区方式 |
27.2 设备分区API
27.2.1 查询设备分区能力
c
// Query whether the device supports partitioning
cl_device_partition_property partition_properties[10];
size_t size_ret;
cl_int err = clGetDeviceInfo(
device,
CL_DEVICE_PARTITION_PROPERTIES, // ask for the supported partition types
sizeof(partition_properties),
partition_properties,
&size_ret);
// partition_properties holds the supported partition types:
// - CL_DEVICE_PARTITION_EQUALLY
// - CL_DEVICE_PARTITION_BY_COUNTS
// - CL_DEVICE_PARTITION_BY_AFFINITY_DOMAIN
查询最大子设备数量:
c
// Read CL_DEVICE_PARTITION_MAX_SUB_DEVICES into max_sub_devices and print it.
// `device` and `err` are declared outside this snippet.
cl_uint max_sub_devices;
err = clGetDeviceInfo(
device,
CL_DEVICE_PARTITION_MAX_SUB_DEVICES,
sizeof(max_sub_devices),
&max_sub_devices,
NULL);
printf("Max sub-devices: %u\n", max_sub_devices);
查询设备的计算单元数量:
c
// Read CL_DEVICE_MAX_COMPUTE_UNITS into max_compute_units and print it.
// `device` and `err` are declared outside this snippet.
cl_uint max_compute_units;
err = clGetDeviceInfo(
device,
CL_DEVICE_MAX_COMPUTE_UNITS,
sizeof(max_compute_units),
&max_compute_units,
NULL);
printf("Max compute units: %u\n", max_compute_units);
27.2.2 创建子设备
c
cl_int clCreateSubDevices(
cl_device_id in_device, // parent device
const cl_device_partition_property *properties, // partition properties
cl_uint num_devices, // number of sub-devices
cl_device_id *out_devices, // output array of sub-devices
cl_uint *num_devices_ret); // number of sub-devices actually created
// Return values:
// CL_SUCCESS - success
// CL_INVALID_DEVICE - invalid device
// CL_INVALID_VALUE - invalid properties
// CL_DEVICE_PARTITION_FAILED - partitioning failed
// CL_INVALID_DEVICE_PARTITION_COUNT - invalid partition count
27.2.3 释放子设备
c
cl_int clReleaseDevice(cl_device_id device);
// Releases one reference to the sub-device
// The sub-device is destroyed when its reference count reaches 0
27.3 均匀分区 (PARTITION_EQUALLY)
27.3.1 均匀分区概念
将设备的计算单元均匀分配给子设备:
c
// Assume the device has 32 compute units
// Split evenly into 4 parts, 8 compute units per sub-device
cl_device_partition_property props[] = {
CL_DEVICE_PARTITION_EQUALLY, // partition type
8, // compute units per sub-device
0 // end-of-list marker
};
计算规则:
子设备数量 = floor(max_compute_units / compute_units_per_sub_device)(不能整除时,剩余的计算单元不会被使用)
27.3.2 均匀分区示例
c
// Demonstrates CL_DEVICE_PARTITION_EQUALLY: splits parent_device into
// sub-devices of (max_compute_units / 2) compute units each, prints the
// compute-unit count of every sub-device and releases them again.
// Prints a diagnostic and returns early when any OpenCL call fails.
void test_partition_equally(cl_device_id parent_device) {
    cl_int err;

    // Query the compute-unit count. Zero-initialised and checked so that a
    // failed query can never feed an indeterminate value into the partition.
    cl_uint max_compute_units = 0;
    err = clGetDeviceInfo(parent_device, CL_DEVICE_MAX_COMPUTE_UNITS,
                          sizeof(max_compute_units), &max_compute_units, NULL);
    if (err != CL_SUCCESS) {
        printf("Failed to query compute units: %d\n", err);
        return;
    }
    printf("Parent device has %u compute units\n", max_compute_units);
    if (max_compute_units <= 1) {
        printf("Device cannot be partitioned (only 1 CU)\n");
        return;
    }

    // Split in two: each sub-device gets half of the compute units.
    // The explicit cast avoids a narrowing conversion inside the braces.
    cl_device_partition_property props[] = {
        CL_DEVICE_PARTITION_EQUALLY,
        (cl_device_partition_property)(max_compute_units / 2),
        0
    };

    // First call only asks how many sub-devices this scheme produces.
    cl_uint num_sub_devices = 0;
    err = clCreateSubDevices(parent_device, props, 0, NULL, &num_sub_devices);
    if (err != CL_SUCCESS) {
        printf("Failed to query sub-device count: %d\n", err);
        return;
    }
    if (num_sub_devices == 0) {
        printf("Partition scheme yields no sub-devices\n");
        return;
    }
    printf("Can create %u sub-devices\n", num_sub_devices);

    // Second call actually creates the sub-devices.
    cl_device_id *sub_devices = new cl_device_id[num_sub_devices];
    err = clCreateSubDevices(parent_device, props, num_sub_devices,
                             sub_devices, NULL);
    if (err != CL_SUCCESS) {
        printf("Failed to create sub-devices: %d\n", err);
        delete[] sub_devices;
        return;
    }

    // Inspect each sub-device, then drop our reference to it.
    for (cl_uint i = 0; i < num_sub_devices; i++) {
        cl_uint sub_compute_units = 0;
        err = clGetDeviceInfo(sub_devices[i], CL_DEVICE_MAX_COMPUTE_UNITS,
                              sizeof(sub_compute_units), &sub_compute_units,
                              NULL);
        if (err == CL_SUCCESS) {
            printf("Sub-device %u has %u compute units\n", i, sub_compute_units);
        } else {
            printf("Sub-device %u: failed to query compute units: %d\n", i, err);
        }
        clReleaseDevice(sub_devices[i]);
    }
    delete[] sub_devices;
}
测试场景:
c
// Scenario 1: 32 CUs split into 2 parts
max_compute_units = 32;
props[] = {CL_DEVICE_PARTITION_EQUALLY, 16, 0};
// Result: 2 sub-devices with 16 CUs each
// Scenario 2: 32 CUs split into 4 parts
props[] = {CL_DEVICE_PARTITION_EQUALLY, 8, 0};
// Result: 4 sub-devices with 8 CUs each
// Scenario 3: 30 CUs split into 3 parts
max_compute_units = 30;
props[] = {CL_DEVICE_PARTITION_EQUALLY, 10, 0};
// Result: 3 sub-devices with 10 CUs each
// Scenario 4: partition size does not divide the CU count (still 30 CUs)
props[] = {CL_DEVICE_PARTITION_EQUALLY, 8, 0};
// Result: 3 sub-devices (8+8+8=24 CUs); the remaining 6 CUs are unused
27.4 按数量分区 (PARTITION_BY_COUNTS)
27.4.1 按数量分区概念
显式指定每个子设备的计算单元数量:
c
// Assume the device has 32 compute units
// Split into 3 sub-devices: 4 CUs, 12 CUs, 16 CUs
cl_device_partition_property props[] = {
CL_DEVICE_PARTITION_BY_COUNTS, // partition type
4, // sub-device 0: 4 CUs
12, // sub-device 1: 12 CUs
16, // sub-device 2: 16 CUs
CL_DEVICE_PARTITION_BY_COUNTS_LIST_END, // end of the counts list
0 // end of the property list
};
约束条件:
sum(compute_units) <= max_compute_units,且子设备个数 <= CL_DEVICE_PARTITION_MAX_SUB_DEVICES(否则返回 CL_INVALID_DEVICE_PARTITION_COUNT)
27.4.2 按数量分区示例
c
// Demonstrates CL_DEVICE_PARTITION_BY_COUNTS: splits parent_device into two
// sub-devices holding 1 compute unit and (max_compute_units - 1) compute
// units, prints the size of each, and releases them.
// Prints a diagnostic and returns early when any OpenCL call fails.
void test_partition_by_counts(cl_device_id parent_device) {
    cl_int err;

    // Zero-initialised and checked so a failed query cannot leak an
    // indeterminate count into the property list below.
    cl_uint max_compute_units = 0;
    err = clGetDeviceInfo(parent_device, CL_DEVICE_MAX_COMPUTE_UNITS,
                          sizeof(max_compute_units), &max_compute_units, NULL);
    if (err != CL_SUCCESS) {
        printf("Failed to query compute units: %d\n", err);
        return;
    }
    printf("Parent device has %u compute units\n", max_compute_units);
    if (max_compute_units < 2) {
        printf("Device cannot be partitioned\n");
        return;
    }

    // Two sub-devices: one with a single CU, one with everything else.
    cl_device_partition_property props[] = {
        CL_DEVICE_PARTITION_BY_COUNTS,
        1,
        (cl_device_partition_property)(max_compute_units - 1),
        CL_DEVICE_PARTITION_BY_COUNTS_LIST_END,
        0
    };

    // Ask how many sub-devices the scheme produces.
    cl_uint num_sub_devices = 0;
    err = clCreateSubDevices(parent_device, props, 0, NULL, &num_sub_devices);
    if (err != CL_SUCCESS) {
        printf("Failed to query: %d\n", err);
        return;
    }
    if (num_sub_devices == 0) {
        printf("Partition scheme yields no sub-devices\n");
        return;
    }
    printf("Will create %u sub-devices\n", num_sub_devices);

    // Create the sub-devices.
    cl_device_id *sub_devices = new cl_device_id[num_sub_devices];
    err = clCreateSubDevices(parent_device, props, num_sub_devices,
                             sub_devices, NULL);
    if (err != CL_SUCCESS) {
        // Report the failure instead of silently doing nothing.
        printf("Failed to create sub-devices: %d\n", err);
        delete[] sub_devices;
        return;
    }

    for (cl_uint i = 0; i < num_sub_devices; i++) {
        cl_uint units = 0;
        err = clGetDeviceInfo(sub_devices[i], CL_DEVICE_MAX_COMPUTE_UNITS,
                              sizeof(units), &units, NULL);
        if (err == CL_SUCCESS) {
            printf("Sub-device %u: %u CUs\n", i, units);
        } else {
            printf("Sub-device %u: failed to query compute units: %d\n", i, err);
        }
        clReleaseDevice(sub_devices[i]);
    }
    delete[] sub_devices;
}
测试场景:
c
// Scenario 1: 32 CUs, uneven distribution
max_compute_units = 32;
props[] = {
CL_DEVICE_PARTITION_BY_COUNTS,
4, 8, 20,
CL_DEVICE_PARTITION_BY_COUNTS_LIST_END,
0
};
// Result: 3 sub-devices (4+8+20=32 CUs)
// Scenario 2: extreme distribution
props[] = {
CL_DEVICE_PARTITION_BY_COUNTS,
1, 31,
CL_DEVICE_PARTITION_BY_COUNTS_LIST_END,
0
};
// Result: 2 sub-devices (1+31=32 CUs)
// Scenario 3: many small sub-devices
props[] = {
CL_DEVICE_PARTITION_BY_COUNTS,
2, 2, 2, 2, 2, 2, 2, 2, 2, 2, // 10 sub-devices
CL_DEVICE_PARTITION_BY_COUNTS_LIST_END,
0
};
// Result: 10 sub-devices with 2 CUs each
27.5 按亲和域分区 (PARTITION_BY_AFFINITY_DOMAIN)
27.5.1 亲和域概念
按硬件拓扑结构分区,使子设备与特定的硬件资源(如缓存、NUMA节点)对应:
设备拓扑结构:
┌─────────────────────────────────────┐
│ 设备 │
├─────────────────┬───────────────────┤
│ NUMA节点0 │ NUMA节点1 │
├────────┬────────┼────────┬──────────┤
│ L3缓存0│ L3缓存1│ L3缓存2│ L3缓存3 │
├───┬───┬───┬───┬───┬───┬───┬───┬───┤
│L2 │L2 │L2 │L2 │L2 │L2 │L2 │L2 │ │
└───┴───┴───┴───┴───┴───┴───┴───┴───┘
27.5.2 NUMA域分区
按NUMA(Non-Uniform Memory Access)节点分区:
c
// Partition by NUMA node
cl_device_partition_property props[] = {
CL_DEVICE_PARTITION_BY_AFFINITY_DOMAIN,
CL_DEVICE_AFFINITY_DOMAIN_NUMA,
0
};
// Each sub-device corresponds to one NUMA node
// This favours memory-access locality
使用示例:
c
// Partitions parent_device by NUMA affinity domain, prints one line per
// created sub-device and releases them.
// Returns early with a diagnostic when NUMA partitioning is unsupported or
// when any OpenCL call fails.
void test_numa_partition(cl_device_id parent_device) {
    cl_int err;

    // Check that the NUMA affinity domain is advertised. A failed query is
    // treated as "not supported" rather than testing an indeterminate mask.
    cl_device_affinity_domain affinity_domain = 0;
    err = clGetDeviceInfo(parent_device,
                          CL_DEVICE_PARTITION_AFFINITY_DOMAIN,
                          sizeof(affinity_domain), &affinity_domain, NULL);
    if (err != CL_SUCCESS ||
        !(affinity_domain & CL_DEVICE_AFFINITY_DOMAIN_NUMA)) {
        printf("NUMA partitioning not supported\n");
        return;
    }

    cl_device_partition_property props[] = {
        CL_DEVICE_PARTITION_BY_AFFINITY_DOMAIN,
        CL_DEVICE_AFFINITY_DOMAIN_NUMA,
        0
    };

    // Ask how many sub-devices the NUMA split produces.
    cl_uint num_sub_devices = 0;
    err = clCreateSubDevices(parent_device, props, 0, NULL, &num_sub_devices);
    if (err == CL_DEVICE_PARTITION_FAILED) {
        printf("Device cannot be partitioned by NUMA\n");
        return;
    }
    // Any other error would leave the count unusable, so stop here too.
    if (err != CL_SUCCESS || num_sub_devices == 0) {
        printf("Failed to query NUMA sub-device count: %d\n", err);
        return;
    }
    printf("NUMA partitioning created %u sub-devices\n", num_sub_devices);

    cl_device_id *sub_devices = new cl_device_id[num_sub_devices];
    err = clCreateSubDevices(parent_device, props, num_sub_devices,
                             sub_devices, NULL);
    if (err != CL_SUCCESS) {
        printf("Failed to create NUMA sub-devices: %d\n", err);
        delete[] sub_devices;
        return;
    }

    for (cl_uint i = 0; i < num_sub_devices; i++) {
        printf("NUMA sub-device %u created\n", i);
        clReleaseDevice(sub_devices[i]);
    }
    delete[] sub_devices;
}
27.5.3 缓存域分区
按不同级别的缓存分区:
c
// Partition by L1 cache
cl_device_partition_property props_l1[] = {
CL_DEVICE_PARTITION_BY_AFFINITY_DOMAIN,
CL_DEVICE_AFFINITY_DOMAIN_L1_CACHE,
0
};
// Partition by L2 cache
cl_device_partition_property props_l2[] = {
CL_DEVICE_PARTITION_BY_AFFINITY_DOMAIN,
CL_DEVICE_AFFINITY_DOMAIN_L2_CACHE,
0
};
// Partition by L3 cache
cl_device_partition_property props_l3[] = {
CL_DEVICE_PARTITION_BY_AFFINITY_DOMAIN,
CL_DEVICE_AFFINITY_DOMAIN_L3_CACHE,
0
};
// Partition by L4 cache (if supported)
cl_device_partition_property props_l4[] = {
CL_DEVICE_PARTITION_BY_AFFINITY_DOMAIN,
CL_DEVICE_AFFINITY_DOMAIN_L4_CACHE,
0
};
测试示例:
c
// Reports how many sub-devices parent_device would be split into when
// partitioned along the given cache affinity domain. Only the count is
// queried; no sub-devices are created.
//
// cache_domain: one of the CL_DEVICE_AFFINITY_DOMAIN_L{1,2,3,4}_CACHE bits;
//               any other value is reported under the name "Unknown".
void test_cache_partition(cl_device_id parent_device,
                          cl_device_affinity_domain cache_domain) {
    cl_int err;
    const char *cache_name;
    switch (cache_domain) {
    case CL_DEVICE_AFFINITY_DOMAIN_L1_CACHE: cache_name = "L1"; break;
    case CL_DEVICE_AFFINITY_DOMAIN_L2_CACHE: cache_name = "L2"; break;
    case CL_DEVICE_AFFINITY_DOMAIN_L3_CACHE: cache_name = "L3"; break;
    case CL_DEVICE_AFFINITY_DOMAIN_L4_CACHE: cache_name = "L4"; break;
    default: cache_name = "Unknown"; break;
    }

    // Check that the requested domain is advertised. A failed query counts
    // as "not supported" instead of testing an indeterminate mask.
    cl_device_affinity_domain affinity_domain = 0;
    err = clGetDeviceInfo(parent_device,
                          CL_DEVICE_PARTITION_AFFINITY_DOMAIN,
                          sizeof(affinity_domain), &affinity_domain, NULL);
    if (err != CL_SUCCESS || !(affinity_domain & cache_domain)) {
        printf("%s cache partitioning not supported\n", cache_name);
        return;
    }

    // cache_domain is an unsigned 64-bit bitfield while the property type is
    // a signed pointer-sized integer; the explicit cast avoids a narrowing
    // conversion inside the braces.
    cl_device_partition_property props[] = {
        CL_DEVICE_PARTITION_BY_AFFINITY_DOMAIN,
        (cl_device_partition_property)cache_domain,
        0
    };

    cl_uint num_sub_devices = 0;
    err = clCreateSubDevices(parent_device, props, 0, NULL, &num_sub_devices);
    if (err == CL_DEVICE_PARTITION_FAILED) {
        printf("Device cannot be partitioned by %s cache\n", cache_name);
        return;
    }
    // Any other error means the count was never written; do not print it.
    if (err != CL_SUCCESS) {
        printf("Failed to query %s cache sub-device count: %d\n",
               cache_name, err);
        return;
    }
    printf("%s cache partitioning: %u sub-devices\n",
           cache_name, num_sub_devices);
}
27.5.4 下一可分区域
c
// Use the next partitionable affinity domain
cl_device_partition_property props[] = {
CL_DEVICE_PARTITION_BY_AFFINITY_DOMAIN,
CL_DEVICE_AFFINITY_DOMAIN_NEXT_PARTITIONABLE,
0
};
// The driver picks the next level to partition along
// Example: if the device is currently split by NUMA, the next level may be the L3 cache
27.6 子设备查询
27.6.1 查询子设备的分区方式
c
// Prints how `device` was partitioned, based on CL_DEVICE_PARTITION_TYPE.
// A device with no partition type is reported as a root device. That is
// signalled either by a returned size of 0 or by a property list whose first
// entry is 0; both forms are handled.
// Prints a diagnostic and returns when a query or the allocation fails.
void query_partition_type(cl_device_id device) {
    cl_int err;
    size_t size = 0;

    // First call: how many bytes does the property list occupy?
    err = clGetDeviceInfo(device, CL_DEVICE_PARTITION_TYPE,
                          0, NULL, &size);
    if (err != CL_SUCCESS) {
        printf("Failed to query partition type size: %d\n", err);
        return;
    }
    if (size < sizeof(cl_device_partition_property)) {
        printf("This is a root device (not partitioned)\n");
        return;
    }

    // Cast kept so the sample also builds as C++, like the other samples.
    cl_device_partition_property *props =
        (cl_device_partition_property *)malloc(size);
    if (props == NULL) {
        printf("Out of memory while querying partition type\n");
        return;
    }

    // Second call: fetch the property list itself.
    err = clGetDeviceInfo(device, CL_DEVICE_PARTITION_TYPE,
                          size, props, NULL);
    if (err != CL_SUCCESS) {
        printf("Failed to query partition type: %d\n", err);
        free(props);
        return;
    }

    // Number of entries actually returned; bounds every index below.
    const size_t count = size / sizeof(props[0]);

    if (props[0] == 0) {
        printf("This is a root device (not partitioned)\n");
        free(props);
        return;
    }
    if (count < 2) {
        // Every known partition type carries at least one value after it.
        printf("Malformed partition property list\n");
        free(props);
        return;
    }

    printf("Device partition type: ");
    switch (props[0]) {
    case CL_DEVICE_PARTITION_EQUALLY:
        printf("EQUALLY (CUs per sub-device: %lld)\n",
               (long long)props[1]);
        break;
    case CL_DEVICE_PARTITION_BY_COUNTS:
        printf("BY_COUNTS (");
        // Stop at the list terminator or at the end of the buffer,
        // whichever comes first.
        for (size_t i = 1;
             i < count && props[i] != CL_DEVICE_PARTITION_BY_COUNTS_LIST_END;
             i++) {
            printf("%lld ", (long long)props[i]);
        }
        printf(")\n");
        break;
    case CL_DEVICE_PARTITION_BY_AFFINITY_DOMAIN:
        printf("BY_AFFINITY_DOMAIN (");
        switch (props[1]) {
        case CL_DEVICE_AFFINITY_DOMAIN_NUMA:
            printf("NUMA"); break;
        case CL_DEVICE_AFFINITY_DOMAIN_L1_CACHE:
            printf("L1_CACHE"); break;
        case CL_DEVICE_AFFINITY_DOMAIN_L2_CACHE:
            printf("L2_CACHE"); break;
        case CL_DEVICE_AFFINITY_DOMAIN_L3_CACHE:
            printf("L3_CACHE"); break;
        case CL_DEVICE_AFFINITY_DOMAIN_L4_CACHE:
            printf("L4_CACHE"); break;
        case CL_DEVICE_AFFINITY_DOMAIN_NEXT_PARTITIONABLE:
            printf("NEXT_PARTITIONABLE"); break;
        default:
            printf("UNKNOWN"); break;
        }
        printf(")\n");
        break;
    default:
        // Keep the output line terminated for types this sample does not know.
        printf("UNKNOWN (%lld)\n", (long long)props[0]);
        break;
    }
    free(props);
}
27.6.2 查询父设备
c
// Prints whether `device` has a parent device and, if so, the parent's name.
// A failed query is reported as a failure rather than as "root device".
void query_parent_device(cl_device_id device) {
    cl_device_id parent = NULL;
    cl_int err = clGetDeviceInfo(device, CL_DEVICE_PARENT_DEVICE,
                                 sizeof(parent), &parent, NULL);
    if (err != CL_SUCCESS) {
        printf("Failed to query parent device: %d\n", err);
        return;
    }
    if (parent == NULL) {
        printf("This is a root device\n");
        return;
    }

    printf("This device has a parent device\n");

    // Look up the parent's name. The buffer starts out empty so it is a
    // valid string even if the query writes nothing.
    char parent_name[256] = "";
    err = clGetDeviceInfo(parent, CL_DEVICE_NAME,
                          sizeof(parent_name), parent_name, NULL);
    if (err != CL_SUCCESS) {
        printf("Failed to query parent device name: %d\n", err);
        return;
    }
    printf("Parent device: %s\n", parent_name);
}
27.7 多级分区
27.7.1 递归分区
子设备可以进一步分区:
c
// Recursively halves `device` with CL_DEVICE_PARTITION_EQUALLY, printing an
// indented tree of devices. Recursion stops when a device has at most one
// compute unit or when partitioning fails.
//
// level: current depth; each level indents the output by two spaces.
void recursive_partition(cl_device_id device, int level) {
    cl_int err;

    // Describe the current device. Both queries are checked so the printout
    // never uses an unwritten name buffer or compute-unit count.
    char device_name[256] = "";
    err = clGetDeviceInfo(device, CL_DEVICE_NAME,
                          sizeof(device_name), device_name, NULL);
    if (err != CL_SUCCESS) {
        printf("%*sFailed to query device name: %d\n", level * 2, "", err);
        return;
    }
    cl_uint compute_units = 0;
    err = clGetDeviceInfo(device, CL_DEVICE_MAX_COMPUTE_UNITS,
                          sizeof(compute_units), &compute_units, NULL);
    if (err != CL_SUCCESS) {
        printf("%*sFailed to query compute units: %d\n", level * 2, "", err);
        return;
    }
    printf("%*sDevice: %s (%u CUs)\n",
           level * 2, "", device_name, compute_units);
    if (compute_units <= 1) {
        printf("%*sCannot partition further\n", level * 2, "");
        return;
    }

    // Try to split the device into halves.
    cl_device_partition_property props[] = {
        CL_DEVICE_PARTITION_EQUALLY,
        (cl_device_partition_property)(compute_units / 2),
        0
    };
    cl_uint num_sub_devices = 0;
    err = clCreateSubDevices(device, props, 0, NULL, &num_sub_devices);
    if (err != CL_SUCCESS || num_sub_devices == 0) {
        printf("%*sCannot partition further\n", level * 2, "");
        return;
    }

    cl_device_id *sub_devices = new cl_device_id[num_sub_devices];
    err = clCreateSubDevices(device, props, num_sub_devices,
                             sub_devices, NULL);
    if (err == CL_SUCCESS) {
        printf("%*sCreated %u sub-devices\n",
               level * 2, "", num_sub_devices);
        // Descend into each sub-device, then drop our reference to it.
        for (cl_uint i = 0; i < num_sub_devices; i++) {
            recursive_partition(sub_devices[i], level + 1);
            clReleaseDevice(sub_devices[i]);
        }
    }
    delete[] sub_devices;
}
测试示例:
c
// Example: a device with 64 CUs, three levels of partitioning
// Level 0: 64 CUs
// ├─ Level 1: 32 CUs (split evenly into 2)
// │ ├─ Level 2: 16 CUs (split into 2 again)
// │ │ ├─ Level 3: 8 CUs (split into 2 again)
// │ │ └─ Level 3: 8 CUs
// │ └─ Level 2: 16 CUs
// │ ├─ Level 3: 8 CUs
// │ └─ Level 3: 8 CUs
// └─ Level 1: 32 CUs
// └─ ... (same structure)
27.8 子设备上下文与队列
27.8.1 在子设备上创建上下文
c
// Splits parent_device into sub-devices of 2 compute units each, creates one
// context containing all of them and one command queue per sub-device, then
// releases everything.
// Prints a diagnostic and returns early when partitioning fails.
void test_sub_device_context(cl_device_id parent_device) {
    cl_int err;

    cl_device_partition_property props[] = {
        CL_DEVICE_PARTITION_EQUALLY,
        2,  // assume 2 CUs per sub-device
        0
    };

    // Ask for the sub-device count; bail out on failure so the allocation
    // below never sees an indeterminate size.
    cl_uint num_sub_devices = 0;
    err = clCreateSubDevices(parent_device, props, 0, NULL, &num_sub_devices);
    if (err != CL_SUCCESS || num_sub_devices == 0) {
        printf("Failed to query sub-device count: %d\n", err);
        return;
    }

    cl_device_id *sub_devices = new cl_device_id[num_sub_devices];
    err = clCreateSubDevices(parent_device, props, num_sub_devices,
                             sub_devices, NULL);
    if (err != CL_SUCCESS) {
        // Nothing was created, so there are no handles to release.
        printf("Failed to create sub-devices: %d\n", err);
        delete[] sub_devices;
        return;
    }

    // One context shared by all sub-devices.
    cl_context context = clCreateContext(NULL, num_sub_devices,
                                         sub_devices, NULL, NULL, &err);
    if (err == CL_SUCCESS) {
        printf("Created context with %u sub-devices\n", num_sub_devices);
        // One command queue per sub-device.
        for (cl_uint i = 0; i < num_sub_devices; i++) {
            cl_command_queue queue = clCreateCommandQueueWithProperties(
                context, sub_devices[i], NULL, &err);
            if (err == CL_SUCCESS) {
                printf("Created queue for sub-device %u\n", i);
                clReleaseCommandQueue(queue);
            } else {
                printf("Failed to create queue for sub-device %u: %d\n",
                       i, err);
            }
        }
        clReleaseContext(context);
    } else {
        printf("Failed to create context: %d\n", err);
    }

    // Cleanup.
    for (cl_uint i = 0; i < num_sub_devices; i++) {
        clReleaseDevice(sub_devices[i]);
    }
    delete[] sub_devices;
}
27.8.2 混合父设备和子设备
c
// Creates a context that contains parent_device together with its own
// sub-devices (4 compute units each), then releases everything.
// Prints a diagnostic and returns early when partitioning fails.
void test_mixed_device_context(cl_device_id parent_device) {
    cl_int err;

    cl_device_partition_property props[] = {
        CL_DEVICE_PARTITION_EQUALLY, 4, 0
    };

    // Ask for the sub-device count; bail out on failure so the allocation
    // below never sees an indeterminate size.
    cl_uint num_sub_devices = 0;
    err = clCreateSubDevices(parent_device, props, 0, NULL, &num_sub_devices);
    if (err != CL_SUCCESS || num_sub_devices == 0) {
        printf("Failed to query sub-device count: %d\n", err);
        return;
    }

    // Device array layout: [sub-devices..., parent].
    cl_device_id *devices = new cl_device_id[num_sub_devices + 1];
    err = clCreateSubDevices(parent_device, props, num_sub_devices,
                             devices, NULL);
    if (err != CL_SUCCESS) {
        // Nothing was created, so there are no handles to release.
        printf("Failed to create sub-devices: %d\n", err);
        delete[] devices;
        return;
    }
    devices[num_sub_devices] = parent_device;

    // Context holding both the parent and its sub-devices.
    cl_context context = clCreateContext(NULL, num_sub_devices + 1,
                                         devices, NULL, NULL, &err);
    if (err == CL_SUCCESS) {
        printf("Created mixed context with parent and %u sub-devices\n",
               num_sub_devices);
        clReleaseContext(context);
    } else {
        printf("Failed to create mixed context: %d\n", err);
    }

    // Only the sub-devices are released; the parent is owned by the caller.
    for (cl_uint i = 0; i < num_sub_devices; i++) {
        clReleaseDevice(devices[i]);
    }
    delete[] devices;
}
27.9 子设备并发测试
27.9.1 多队列并发执行
c
// Each work-item triples the element at its own global index.
__kernel void test_kernel(__global int *data) {
    const int idx = get_global_id(0);
    data[idx] = data[idx] * 3;
}
测试代码:
c
// Runs test_kernel `iterations` times on every sub-device of parent_device
// (4 compute units per sub-device), each sub-device working on its own
// buffer through its own queue, then checks that every element was
// multiplied by 3^iterations.
//
// Every OpenCL call is checked. On failure a diagnostic is printed and
// control jumps to the single cleanup path, which releases exactly what was
// created. `kernel_source` is expected to be defined elsewhere in the file.
void test_concurrent_execution(cl_device_id parent_device, int iterations) {
    // All resources are declared up front (and nulled) so that `goto cleanup`
    // never jumps over an initialisation and cleanup can test each handle.
    const int DATA_SIZE = 512;
    cl_device_partition_property props[] = {
        CL_DEVICE_PARTITION_EQUALLY, 4, 0
    };
    cl_int err = CL_SUCCESS;
    cl_uint num_sub_devices = 0;
    cl_device_id *sub_devices = NULL;
    bool have_sub_devices = false;
    cl_context context = NULL;
    cl_program program = NULL;
    cl_kernel kernel = NULL;
    cl_command_queue *queues = NULL;
    cl_mem *buffers = NULL;
    int *host_data = NULL;
    int *result = NULL;
    unsigned int scale = 1u;

    // Create the sub-devices.
    err = clCreateSubDevices(parent_device, props, 0, NULL, &num_sub_devices);
    if (err != CL_SUCCESS || num_sub_devices == 0) {
        printf("Failed to query sub-device count: %d\n", err);
        return;
    }
    sub_devices = new cl_device_id[num_sub_devices];
    err = clCreateSubDevices(parent_device, props, num_sub_devices,
                             sub_devices, NULL);
    if (err != CL_SUCCESS) {
        printf("Failed to create sub-devices: %d\n", err);
        goto cleanup;
    }
    have_sub_devices = true;

    // Create the context.
    context = clCreateContext(NULL, num_sub_devices,
                              sub_devices, NULL, NULL, &err);
    if (err != CL_SUCCESS) {
        printf("Failed to create context: %d\n", err);
        goto cleanup;
    }

    // Create the program and the kernel.
    program = clCreateProgramWithSource(context, 1,
                                        &kernel_source, NULL, &err);
    if (err != CL_SUCCESS) {
        printf("Failed to create program: %d\n", err);
        goto cleanup;
    }
    err = clBuildProgram(program, num_sub_devices, sub_devices,
                         NULL, NULL, NULL);
    if (err != CL_SUCCESS) {
        printf("Failed to build program: %d\n", err);
        goto cleanup;
    }
    kernel = clCreateKernel(program, "test_kernel", &err);
    if (err != CL_SUCCESS) {
        printf("Failed to create kernel: %d\n", err);
        goto cleanup;
    }

    // Host-side data: the same initial contents seed every buffer, so it is
    // filled once instead of once per sub-device.
    host_data = new int[DATA_SIZE];
    result = new int[DATA_SIZE];
    for (int j = 0; j < DATA_SIZE; j++) {
        host_data[j] = j;
    }

    // Value-initialised so unused slots are NULL and cleanup can skip them.
    queues = new cl_command_queue[num_sub_devices]();
    buffers = new cl_mem[num_sub_devices]();
    for (cl_uint i = 0; i < num_sub_devices; i++) {
        queues[i] = clCreateCommandQueueWithProperties(
            context, sub_devices[i], NULL, &err);
        if (err != CL_SUCCESS) {
            printf("Failed to create queue for sub-device %u: %d\n", i, err);
            goto cleanup;
        }
        buffers[i] = clCreateBuffer(context, CL_MEM_COPY_HOST_PTR,
                                    DATA_SIZE * sizeof(int),
                                    host_data, &err);
        if (err != CL_SUCCESS) {
            printf("Failed to create buffer for sub-device %u: %d\n", i, err);
            goto cleanup;
        }
    }

    printf("Running %d iterations on %u sub-devices concurrently\n",
           iterations, num_sub_devices);

    // Concurrent execution: enqueue on every queue, then wait for all.
    for (int iter = 0; iter < iterations; iter++) {
        for (cl_uint i = 0; i < num_sub_devices; i++) {
            err = clSetKernelArg(kernel, 0, sizeof(cl_mem), &buffers[i]);
            if (err == CL_SUCCESS) {
                size_t global_size = DATA_SIZE;
                size_t local_size = 64;
                err = clEnqueueNDRangeKernel(queues[i], kernel, 1, NULL,
                                             &global_size, &local_size,
                                             0, NULL, NULL);
            }
            if (err != CL_SUCCESS) {
                printf("Failed to launch kernel on sub-device %u: %d\n",
                       i, err);
                goto cleanup;
            }
        }
        // Synchronise all queues before the next round.
        for (cl_uint i = 0; i < num_sub_devices; i++) {
            err = clFinish(queues[i]);
            if (err != CL_SUCCESS) {
                printf("clFinish failed on sub-device %u: %d\n", i, err);
                goto cleanup;
            }
        }
    }

    // 3^iterations, computed once. Unsigned arithmetic keeps the host side
    // free of signed-overflow undefined behaviour for large iteration counts.
    for (int k = 0; k < iterations; k++) {
        scale *= 3u;
    }

    // Verify: element j must equal j * 3^iterations.
    for (cl_uint i = 0; i < num_sub_devices; i++) {
        err = clEnqueueReadBuffer(queues[i], buffers[i], CL_TRUE, 0,
                                  DATA_SIZE * sizeof(int), result,
                                  0, NULL, NULL);
        if (err != CL_SUCCESS) {
            printf("Sub-device %u: failed to read results: %d\n", i, err);
            continue;
        }
        bool correct = true;
        for (int j = 0; j < DATA_SIZE && correct; j++) {
            int expected = (int)((unsigned int)j * scale);
            if (result[j] != expected) {
                printf("Sub-device %u: Mismatch at index %d: "
                       "expected %d, got %d\n",
                       i, j, expected, result[j]);
                correct = false;
            }
        }
        if (correct) {
            printf("Sub-device %u: All results correct\n", i);
        }
    }

cleanup:
    // Release only what was actually created.
    for (cl_uint i = 0; i < num_sub_devices; i++) {
        if (buffers != NULL && buffers[i] != NULL) {
            clReleaseMemObject(buffers[i]);
        }
        if (queues != NULL && queues[i] != NULL) {
            clReleaseCommandQueue(queues[i]);
        }
        if (have_sub_devices) {
            clReleaseDevice(sub_devices[i]);
        }
    }
    delete[] queues;
    delete[] buffers;
    delete[] host_data;
    delete[] result;
    delete[] sub_devices;
    if (kernel != NULL) {
        clReleaseKernel(kernel);
    }
    if (program != NULL) {
        clReleaseProgram(program);
    }
    if (context != NULL) {
        clReleaseContext(context);
    }
}
27.10 实际应用场景
27.10.1 任务并行
使用子设备实现不同任务的并行执行:
c
// Sketch of task parallelism: splits `device` into sub-devices and gives each
// of four independent tasks its own sub-device and command queue.
//
// CL_DEVICE_PARTITION_EQUALLY takes the number of compute units PER
// sub-device, so the partition size is derived from the device's
// compute-unit count. The resulting sub-device count is queried rather than
// assumed. Everything created here is released before returning.
void parallel_tasks_example(cl_device_id device) {
    enum { NUM_TASKS = 4 };
    cl_int err;

    cl_uint compute_units = 0;
    err = clGetDeviceInfo(device, CL_DEVICE_MAX_COMPUTE_UNITS,
                          sizeof(compute_units), &compute_units, NULL);
    if (err != CL_SUCCESS || compute_units < (cl_uint)NUM_TASKS) {
        printf("Device cannot provide %u sub-devices\n", (unsigned)NUM_TASKS);
        return;
    }

    cl_device_partition_property props[] = {
        CL_DEVICE_PARTITION_EQUALLY,
        (cl_device_partition_property)(compute_units / NUM_TASKS),
        0
    };

    // The scheme may yield more than NUM_TASKS sub-devices when the
    // compute-unit count is not a multiple of NUM_TASKS.
    cl_uint num_devices = 0;
    err = clCreateSubDevices(device, props, 0, NULL, &num_devices);
    if (err != CL_SUCCESS || num_devices < (cl_uint)NUM_TASKS) {
        printf("Failed to partition device for %u tasks: %d\n",
               (unsigned)NUM_TASKS, err);
        return;
    }
    cl_device_id *sub_devices = new cl_device_id[num_devices];
    err = clCreateSubDevices(device, props, num_devices, sub_devices, NULL);
    if (err != CL_SUCCESS) {
        printf("Failed to create sub-devices: %d\n", err);
        delete[] sub_devices;
        return;
    }

    cl_context context = clCreateContext(NULL, num_devices,
                                         sub_devices, NULL, NULL, &err);
    if (err == CL_SUCCESS) {
        // One queue per task, each bound to its own sub-device.
        cl_command_queue queues[NUM_TASKS] = { NULL };
        for (cl_uint t = 0; t < (cl_uint)NUM_TASKS; t++) {
            queues[t] = clCreateCommandQueueWithProperties(
                context, sub_devices[t], NULL, &err);
            if (err != CL_SUCCESS) {
                printf("Failed to create queue for task %u: %d\n", t, err);
            }
        }

        // Task 1: image processing (sub-device 0)
        //   enqueue_image_processing_kernel(queues[0], ...);
        // Task 2: physics simulation (sub-device 1)
        //   enqueue_physics_simulation_kernel(queues[1], ...);
        // Task 3: machine-learning inference (sub-device 2)
        //   enqueue_ml_inference_kernel(queues[2], ...);
        // Task 4: video encoding (sub-device 3)
        //   enqueue_video_encoding_kernel(queues[3], ...);
        // All tasks run in parallel without interfering with each other.

        // Wait for the tasks, then release the queues.
        for (cl_uint t = 0; t < (cl_uint)NUM_TASKS; t++) {
            if (queues[t] != NULL) {
                clFinish(queues[t]);
                clReleaseCommandQueue(queues[t]);
            }
        }
        clReleaseContext(context);
    } else {
        printf("Failed to create context: %d\n", err);
    }

    for (cl_uint i = 0; i < num_devices; i++) {
        clReleaseDevice(sub_devices[i]);
    }
    delete[] sub_devices;
}
27.10.2 资源隔离
c
// Sketch of resource isolation: gives each user/tenant its own sub-device.
//
// The partition size is (compute units / number of users), with the
// compute-unit count queried from the device. The resulting sub-device count
// is queried rather than assumed, and all handles are released on return.
void multi_tenant_example(cl_device_id device) {
    const cl_uint num_users = 4;
    cl_int err;

    cl_uint compute_units = 0;
    err = clGetDeviceInfo(device, CL_DEVICE_MAX_COMPUTE_UNITS,
                          sizeof(compute_units), &compute_units, NULL);
    if (err != CL_SUCCESS || compute_units < num_users) {
        printf("Device cannot provide %u sub-devices\n", num_users);
        return;
    }

    // Partition according to the number of users.
    cl_device_partition_property props[] = {
        CL_DEVICE_PARTITION_EQUALLY,
        (cl_device_partition_property)(compute_units / num_users),
        0
    };

    // The scheme may yield more than num_users sub-devices when the
    // compute-unit count is not a multiple of num_users.
    cl_uint num_devices = 0;
    err = clCreateSubDevices(device, props, 0, NULL, &num_devices);
    if (err != CL_SUCCESS || num_devices < num_users) {
        printf("Failed to partition device for %u users: %d\n",
               num_users, err);
        return;
    }
    cl_device_id *user_devices = new cl_device_id[num_devices];
    err = clCreateSubDevices(device, props, num_devices, user_devices, NULL);
    if (err != CL_SUCCESS) {
        printf("Failed to create sub-devices: %d\n", err);
        delete[] user_devices;
        return;
    }

    // Each user gets an independent sub-device, which keeps resources
    // isolated so users cannot affect each other.
    for (cl_uint i = 0; i < num_users; i++) {
        // allocate_resources_to_user(i, user_devices[i]);
        // NOTE(review): a real allocator would presumably clRetainDevice the
        // handle it keeps, since this function releases its own references.
    }

    for (cl_uint i = 0; i < num_devices; i++) {
        clReleaseDevice(user_devices[i]);
    }
    delete[] user_devices;
}
27.10.3 NUMA优化
c
// Sketch of NUMA-aware partitioning: creates one sub-device per NUMA node so
// that memory can be allocated local to each node.
// Prints a diagnostic and returns when partitioning fails. All handles
// created here are released before returning.
void numa_optimized_example(cl_device_id device) {
    cl_int err;

    // Partition by NUMA node.
    cl_device_partition_property props[] = {
        CL_DEVICE_PARTITION_BY_AFFINITY_DOMAIN,
        CL_DEVICE_AFFINITY_DOMAIN_NUMA,
        0
    };

    // Checked so a failed query never sizes the allocation below.
    cl_uint num_numa_nodes = 0;
    err = clCreateSubDevices(device, props, 0, NULL, &num_numa_nodes);
    if (err != CL_SUCCESS || num_numa_nodes == 0) {
        printf("Device cannot be partitioned by NUMA: %d\n", err);
        return;
    }

    cl_device_id *numa_devices = new cl_device_id[num_numa_nodes];
    err = clCreateSubDevices(device, props, num_numa_nodes, numa_devices, NULL);
    if (err != CL_SUCCESS) {
        printf("Failed to create NUMA sub-devices: %d\n", err);
        delete[] numa_devices;
        return;
    }

    // Allocate node-local memory for each NUMA node to reduce cross-node
    // accesses and improve performance.
    for (cl_uint i = 0; i < num_numa_nodes; i++) {
        // allocate_numa_local_memory(i, numa_devices[i]);
    }

    for (cl_uint i = 0; i < num_numa_nodes; i++) {
        clReleaseDevice(numa_devices[i]);
    }
    delete[] numa_devices;
}
27.11 小结
设备分区测试验证OpenCL的设备分区功能,使应用程序能够更灵活地管理和利用硬件资源。
27.11.1 核心要点
- 分区类型: EQUALLY、BY_COUNTS、BY_AFFINITY_DOMAIN
- 亲和域: NUMA、L1/L2/L3/L4缓存、NEXT_PARTITIONABLE
- 子设备管理: 创建、查询、释放
- 多级分区: 递归分区子设备
- 并发执行: 多个子设备同时工作
27.11.2 分区方式总结
| 分区类型 | 特点 | 适用场景 |
|---|---|---|
| EQUALLY | 均匀分配 | 负载均衡的并行任务 |
| BY_COUNTS | 自定义分配 | 不同优先级的任务 |
| NUMA | 按NUMA节点 | 内存密集型应用 |
| L1/L2/L3/L4缓存 | 按缓存域 | 缓存敏感型应用 |
27.11.3 实际优势
- ✅ 更细粒度的资源控制
- ✅ 任务隔离和优先级管理
- ✅ 改善负载均衡
- ✅ 利用硬件拓扑优化性能
- ✅ 多租户场景支持
27.11.4 注意事项
- ⚠️ OpenCL 1.2+才支持
- ⚠️ 不是所有设备都支持分区
- ⚠️ 子设备数量受计算单元数限制
- ⚠️ 需要显式释放子设备
- ⚠️ 分区可能影响性能(额外开销)
27.11.5 最佳实践
- 查询支持: 使用前检查设备是否支持分区
- 合理分配: 根据任务需求选择分区方式
- 资源管理: 及时释放不用的子设备
- 性能测试: 比较分区前后的性能
- 错误处理: 妥善处理分区失败的情况
27.11.6 下一步
完成设备分区测试后,下一章(第28章)将介绍 Device_execution(设备端入队测试)。
参考文档:
- OpenCL 1.2 Specification - 4.3 Partitioning a Device(自 1.2 起为核心特性,源自 OpenCL 1.1 的 cl_ext_device_fission 扩展)
- OpenCL 2.0 Specification - Device Partition (Core Feature)
- test_conformance/device_partition/ 测试源码