章节概述
本章通过完整的代码示例,介绍HSA Runtime的核心API使用方法。你将学习如何初始化Runtime、发现设备、分配内存、创建队列、加载Kernel以及派发执行任务。
难度级别 : 🟢 基础
预计阅读时间 : 70分钟
前置知识: 第01-04章
📋 本章学习目标
完成本章学习后,你将能够:
- ✅ 掌握HSA Runtime的初始化和销毁
- ✅ 枚举和查询系统中的计算设备
- ✅ 分配和管理统一内存
- ✅ 创建和使用队列
- ✅ 加载Kernel代码对象
- ✅ 派发Kernel并等待完成
- ✅ 编写完整的HSA应用程序
5.1 Runtime初始化
5.1.1 基本初始化
最简初始化流程:
cpp
#include <hsa/hsa.h>
#include <iostream>
#include <cassert>
// Minimal HSA program: initialize the runtime, report success, shut it down.
// Returns 0 on success and 1 if either initialization or shutdown fails.
int main() {
    // 1. Initialize the HSA Runtime
    hsa_status_t status = hsa_init();
    if (status != HSA_STATUS_SUCCESS) {
        std::cerr << "Failed to initialize HSA Runtime: "
                  << status << std::endl;
        return 1;
    }
    std::cout << "HSA Runtime initialized successfully!" << std::endl;

    // 2. Perform HSA operations here...

    // 3. Shut the runtime down.
    // Checked with an explicit branch instead of assert(): assert() is compiled
    // out when NDEBUG is defined, which would silently drop the check.
    status = hsa_shut_down();
    if (status != HSA_STATUS_SUCCESS) {
        std::cerr << "Failed to shut down HSA Runtime: "
                  << status << std::endl;
        return 1;
    }
    return 0;
}
编译运行:
bash
g++ basic_init.cpp -o basic_init \
-I/opt/rocm/include \
-L/opt/rocm/lib \
-lhsa-runtime64
./basic_init
# 输出:HSA Runtime initialized successfully!
5.1.2 错误处理
健壮的初始化代码:
cpp
#include <hsa/hsa.h>
#include <iostream>
#include <string>
// Convert an HSA status code to a std::string.
//
// This is a C++ overload that sits next to the two-argument C API function of
// the same name and forwards to it. If the C API rejects the status code (or
// leaves the output pointer null), a fallback string containing the numeric
// value is returned instead of constructing a std::string from an
// uninitialized pointer.
std::string hsa_status_string(hsa_status_t status) {
    const char* str = nullptr;
    const hsa_status_t lookup = hsa_status_string(status, &str);
    if (lookup != HSA_STATUS_SUCCESS || str == nullptr) {
        return "Unknown HSA status (" +
               std::to_string(static_cast<int>(status)) + ")";
    }
    return std::string(str);
}
// Error-handling macro.
// Evaluates `call` exactly once. When the result is not HSA_STATUS_SUCCESS it
// prints the file, line and status text to stderr and terminates with exit(1).
// The do { ... } while (0) wrapper makes the expansion a single statement.
#define HSA_CHECK(call) \
    do { \
        const hsa_status_t hsa_check_result_ = (call); \
        if (hsa_check_result_ == HSA_STATUS_SUCCESS) { \
            break; \
        } \
        std::cerr << "HSA Error at " << __FILE__ << ":" << __LINE__ \
                  << " - " << hsa_status_string(hsa_check_result_) \
                  << std::endl; \
        exit(1); \
    } while (0)
// Entry point showing HSA_CHECK-based error handling: both the init and the
// shutdown call are wrapped, so a failing call reports itself and exits.
int main() {
    // Use the macro to simplify error handling
    HSA_CHECK(hsa_init());
    std::cout << "HSA initialized" << std::endl;
    // Any later HSA call can be wrapped in HSA_CHECK the same way
    HSA_CHECK(hsa_shut_down());
    return 0;
}
5.1.3 版本查询
cpp
#include <hsa/hsa.h>
#include <iostream>
int main() {
hsa_init();
// 查询HSA版本
uint16_t major, minor;
hsa_system_get_info(HSA_SYSTEM_INFO_VERSION_MAJOR, &major);
hsa_system_get_info(HSA_SYSTEM_INFO_VERSION_MINOR, &minor);
std::cout << "HSA Version: " << major << "." << minor << std::endl;
// 查询时间戳
uint64_t timestamp, freq;
hsa_system_get_info(HSA_SYSTEM_INFO_TIMESTAMP, ×tamp);
hsa_system_get_info(HSA_SYSTEM_INFO_TIMESTAMP_FREQUENCY, &freq);
std::cout << "Timestamp: " << timestamp
<< " Hz (freq: " << freq << ")" << std::endl;
// 查询Signal最大值
uint64_t max_wait;
hsa_system_get_info(HSA_SYSTEM_INFO_SIGNAL_MAX_WAIT, &max_wait);
std::cout << "Max signal wait: " << max_wait << std::endl;
hsa_shut_down();
return 0;
}
5.2 设备发现与查询
5.2.1 枚举所有Agents
cpp
#include <hsa/hsa.h>
#include <hsa/hsa_ext_amd.h>
#include <iostream>
#include <vector>
// Agent information record collected for each agent during enumeration.
struct AgentInfo {
    hsa_agent_t handle;      // agent handle received by the iteration callback
    char name[64];           // filled from HSA_AGENT_INFO_NAME
    hsa_device_type_t type;  // filled from HSA_AGENT_INFO_DEVICE
    uint32_t cu_count;       // compute-unit count for GPU agents, 0 for other types
};
// Callback for hsa_iterate_agents: appends one AgentInfo per visited agent.
//
// Parameters:
//   agent - the agent currently being visited.
//   data  - must point to a std::vector<AgentInfo> that receives the record.
// Returns HSA_STATUS_SUCCESS to continue the traversal, or the failing status
// of the first attribute query that does not succeed (which stops it).
hsa_status_t collect_agents(hsa_agent_t agent, void* data) {
    if (data == nullptr) {
        return HSA_STATUS_ERROR_INVALID_ARGUMENT;
    }
    std::vector<AgentInfo>* agents =
        static_cast<std::vector<AgentInfo>*>(data);

    // Value-initialize so no field is left indeterminate if a query fails.
    AgentInfo info = {};
    info.handle = agent;

    // Name
    hsa_status_t status =
        hsa_agent_get_info(agent, HSA_AGENT_INFO_NAME, info.name);
    if (status != HSA_STATUS_SUCCESS) {
        return status;
    }

    // Device type
    status = hsa_agent_get_info(agent, HSA_AGENT_INFO_DEVICE, &info.type);
    if (status != HSA_STATUS_SUCCESS) {
        return status;
    }

    // Compute-unit count (GPU only). The attribute belongs to the AMD extension
    // enum, so it must be cast explicitly: C++ does not convert between
    // unrelated enumeration types implicitly.
    if (info.type == HSA_DEVICE_TYPE_GPU) {
        status = hsa_agent_get_info(
            agent,
            static_cast<hsa_agent_info_t>(HSA_AMD_AGENT_INFO_COMPUTE_UNIT_COUNT),
            &info.cu_count);
        if (status != HSA_STATUS_SUCCESS) {
            return status;
        }
    }

    agents->push_back(info);
    return HSA_STATUS_SUCCESS;
}
int main() {
hsa_init();
// 枚举所有Agents
std::vector<AgentInfo> agents;
hsa_iterate_agents(collect_agents, &agents);
std::cout << "Found " << agents.size() << " agents:\n";
for (size_t i = 0; i < agents.size(); i++) {
const AgentInfo& info = agents[i];
std::cout << "Agent " << i << ": " << info.name << "\n";
std::cout << " Type: ";
switch (info.type) {
case HSA_DEVICE_TYPE_CPU:
std::cout << "CPU\n";
break;
case HSA_DEVICE_TYPE_GPU:
std::cout << "GPU\n";
std::cout << " Compute Units: " << info.cu_count << "\n";
break;
case HSA_DEVICE_TYPE_DSP:
std::cout << "DSP\n";
break;
}
}
hsa_shut_down();
return 0;
}
5.2.2 查找特定类型的Agent
cpp
// Locate a GPU agent.
// Walks the agents with hsa_iterate_agents and returns the first one whose
// HSA_AGENT_INFO_DEVICE is HSA_DEVICE_TYPE_GPU. If none is found, prints
// "No GPU found!" to stderr and calls exit(1).
hsa_agent_t find_gpu_agent() {
    struct SearchState {
        hsa_agent_t agent;
        bool found;
    };
    SearchState state = {{0}, false};

    auto visit = [](hsa_agent_t candidate, void* opaque) -> hsa_status_t {
        hsa_device_type_t kind;
        hsa_agent_get_info(candidate, HSA_AGENT_INFO_DEVICE, &kind);
        if (kind != HSA_DEVICE_TYPE_GPU) {
            return HSA_STATUS_SUCCESS;  // not a GPU: keep iterating
        }
        SearchState* result = static_cast<SearchState*>(opaque);
        result->agent = candidate;
        result->found = true;
        return HSA_STATUS_INFO_BREAK;  // stop the traversal
    };
    hsa_iterate_agents(visit, &state);

    if (state.found) {
        return state.agent;
    }
    std::cerr << "No GPU found!\n";
    exit(1);
}
5.2.3 查询Agent详细信息
cpp
// Prints the name, device type, wavefront size, maximum work-group size and
// maximum queue size of `agent`; for GPU agents also the compute-unit count
// and the maximum waves per compute unit.
//
// Every output variable is zero-initialized, so a failed query prints 0 (or an
// empty name) instead of an indeterminate value.
void print_agent_details(hsa_agent_t agent) {
    char name[64] = {0};
    hsa_agent_get_info(agent, HSA_AGENT_INFO_NAME, name);
    std::cout << "Agent: " << name << "\n";

    // Device type. Each type is named explicitly so that a DSP (or any type
    // not listed here) is not reported as "CPU".
    hsa_device_type_t type = HSA_DEVICE_TYPE_CPU;
    const char* type_name = "Unknown";
    if (hsa_agent_get_info(agent, HSA_AGENT_INFO_DEVICE, &type) ==
        HSA_STATUS_SUCCESS) {
        switch (type) {
            case HSA_DEVICE_TYPE_CPU: type_name = "CPU"; break;
            case HSA_DEVICE_TYPE_GPU: type_name = "GPU"; break;
            case HSA_DEVICE_TYPE_DSP: type_name = "DSP"; break;
            default: break;
        }
    }
    std::cout << " Device Type: " << type_name << "\n";

    // Wavefront size
    uint32_t wavefront_size = 0;
    hsa_agent_get_info(agent, HSA_AGENT_INFO_WAVEFRONT_SIZE, &wavefront_size);
    std::cout << " Wavefront Size: " << wavefront_size << "\n";

    // Maximum work-group size
    uint32_t max_wg_size = 0;
    hsa_agent_get_info(agent, HSA_AGENT_INFO_WORKGROUP_MAX_SIZE, &max_wg_size);
    std::cout << " Max Workgroup Size: " << max_wg_size << "\n";

    // Maximum queue size
    uint32_t max_queue_size = 0;
    hsa_agent_get_info(agent, HSA_AGENT_INFO_QUEUE_MAX_SIZE, &max_queue_size);
    std::cout << " Max Queue Size: " << max_queue_size << "\n";

    if (type == HSA_DEVICE_TYPE_GPU) {
        // GPU-only attributes. These come from the AMD extension enum
        // (declared in hsa_ext_amd.h) and must be cast explicitly: C++ does
        // not convert between unrelated enumeration types implicitly.
        uint32_t cu_count = 0;
        hsa_agent_get_info(
            agent,
            static_cast<hsa_agent_info_t>(HSA_AMD_AGENT_INFO_COMPUTE_UNIT_COUNT),
            &cu_count);
        std::cout << " Compute Units: " << cu_count << "\n";

        uint32_t max_waves_per_cu = 0;
        hsa_agent_get_info(
            agent,
            static_cast<hsa_agent_info_t>(HSA_AMD_AGENT_INFO_MAX_WAVES_PER_CU),
            &max_waves_per_cu);
        std::cout << " Max Waves per CU: " << max_waves_per_cu << "\n";
    }
}
5.3 内存分配
5.3.1 查找内存区域
cpp
// Locate a fine-grained region in the global segment.
// Iterates the regions of `agent` and returns the first one that is in
// HSA_REGION_SEGMENT_GLOBAL and has HSA_REGION_GLOBAL_FLAG_FINE_GRAINED set.
// If none matches, prints "No fine-grained region found!" to stderr and calls
// exit(1).
hsa_region_t find_global_region(hsa_agent_t agent) {
    struct SearchState {
        hsa_region_t region;
        bool found;
    };
    SearchState state = {{0}, false};

    auto visit = [](hsa_region_t candidate, void* opaque) -> hsa_status_t {
        // Only the global segment is of interest.
        hsa_region_segment_t seg;
        hsa_region_get_info(candidate, HSA_REGION_INFO_SEGMENT, &seg);
        if (seg == HSA_REGION_SEGMENT_GLOBAL) {
            // Within it, accept the first fine-grained region.
            hsa_region_global_flag_t region_flags;
            hsa_region_get_info(candidate, HSA_REGION_INFO_GLOBAL_FLAGS,
                                &region_flags);
            if (region_flags & HSA_REGION_GLOBAL_FLAG_FINE_GRAINED) {
                SearchState* result = static_cast<SearchState*>(opaque);
                result->region = candidate;
                result->found = true;
                return HSA_STATUS_INFO_BREAK;
            }
        }
        return HSA_STATUS_SUCCESS;
    };
    hsa_agent_iterate_regions(agent, visit, &state);

    if (state.found) {
        return state.region;
    }
    std::cerr << "No fine-grained region found!\n";
    exit(1);
}
5.3.2 内存分配和释放
cpp
#include <hsa/hsa.h>
#include <iostream>
#include <cstring>
// Allocates 1024 floats from the fine-grained global region of the GPU found
// by find_gpu_agent(), writes and reads back the first ten elements from the
// host, then frees the buffer. Returns 1 on initialization or allocation
// failure (after shutting the runtime down), 0 otherwise.
int main() {
    if (hsa_init() != HSA_STATUS_SUCCESS) {
        std::cerr << "Failed to initialize HSA Runtime\n";
        return 1;
    }

    // 1. Find the GPU and a memory region
    hsa_agent_t gpu = find_gpu_agent();
    hsa_region_t region = find_global_region(gpu);

    // 2. Allocate memory. The runtime writes a void*, so receive it as one and
    // convert afterwards instead of casting a float** to void**.
    const size_t size = 1024 * sizeof(float);
    void* raw = nullptr;
    hsa_status_t status = hsa_memory_allocate(region, size, &raw);
    if (status != HSA_STATUS_SUCCESS) {
        std::cerr << "Memory allocation failed!\n";
        hsa_shut_down();  // do not leave the runtime initialized on failure
        return 1;
    }
    float* device_ptr = static_cast<float*>(raw);
    std::cout << "Allocated " << size << " bytes at "
              << device_ptr << "\n";

    // 3. Use the memory (the host writes the fine-grained buffer directly)
    for (int i = 0; i < 10; i++) {
        device_ptr[i] = i * 0.5f;
    }

    // 4. Verify
    std::cout << "First 10 elements:\n";
    for (int i = 0; i < 10; i++) {
        std::cout << " [" << i << "] = " << device_ptr[i] << "\n";
    }

    // 5. Free the memory
    if (hsa_memory_free(device_ptr) != HSA_STATUS_SUCCESS) {
        std::cerr << "Memory free failed!\n";
    }

    hsa_shut_down();
    return 0;
}
5.3.3 查询内存区域信息
cpp
// Prints the segment, size and allocation granule of `region`, and for global
// regions the KERNARG / FINE_GRAINED / COARSE_GRAINED flags that are set.
void print_region_info(hsa_region_t region) {
    // Segment type
    hsa_region_segment_t segment = HSA_REGION_SEGMENT_GLOBAL;
    hsa_region_get_info(region, HSA_REGION_INFO_SEGMENT, &segment);
    std::cout << "Segment: ";
    switch (segment) {
        case HSA_REGION_SEGMENT_GLOBAL:
            std::cout << "GLOBAL\n";
            break;
        case HSA_REGION_SEGMENT_READONLY:
            std::cout << "READONLY\n";
            break;
        case HSA_REGION_SEGMENT_PRIVATE:
            std::cout << "PRIVATE\n";
            break;
        case HSA_REGION_SEGMENT_GROUP:
            std::cout << "GROUP\n";
            break;
        case HSA_REGION_SEGMENT_KERNARG:
            std::cout << "KERNARG\n";
            break;
        default:
            // Keep the line terminated for segment values not listed above.
            std::cout << "UNKNOWN\n";
            break;
    }

    // Size
    size_t size = 0;
    hsa_region_get_info(region, HSA_REGION_INFO_SIZE, &size);
    std::cout << " Size: " << (size / 1024 / 1024) << " MB\n";

    // Allocation granule. This is HSA_REGION_INFO_RUNTIME_ALLOC_GRANULE, not
    // HSA_REGION_INFO_ALLOC_MAX_SIZE (which is the largest single allocation).
    // The granule is only queried when the region allows runtime allocation.
    bool alloc_allowed = false;
    hsa_region_get_info(region, HSA_REGION_INFO_RUNTIME_ALLOC_ALLOWED,
                        &alloc_allowed);
    if (alloc_allowed) {
        size_t alloc_granule = 0;
        hsa_region_get_info(region, HSA_REGION_INFO_RUNTIME_ALLOC_GRANULE,
                            &alloc_granule);
        std::cout << " Alloc Granule: " << alloc_granule << " bytes\n";
    } else {
        std::cout << " Alloc Granule: n/a (runtime allocation not allowed)\n";
    }

    // Attributes specific to global regions
    if (segment == HSA_REGION_SEGMENT_GLOBAL) {
        hsa_region_global_flag_t flags =
            static_cast<hsa_region_global_flag_t>(0);
        hsa_region_get_info(region, HSA_REGION_INFO_GLOBAL_FLAGS, &flags);
        std::cout << " Flags:\n";
        if (flags & HSA_REGION_GLOBAL_FLAG_KERNARG) {
            std::cout << " - KERNARG\n";
        }
        if (flags & HSA_REGION_GLOBAL_FLAG_FINE_GRAINED) {
            std::cout << " - FINE_GRAINED\n";
        }
        if (flags & HSA_REGION_GLOBAL_FLAG_COARSE_GRAINED) {
            std::cout << " - COARSE_GRAINED\n";
        }
    }
}
5.4 队列创建与管理
5.4.1 创建队列
cpp
#include <hsa/hsa.h>
#include <iostream>
// Creates a 1024-entry multi-producer queue on the GPU found by
// find_gpu_agent(), prints its base address, size and doorbell handle, then
// destroys it. Returns 1 on initialization or queue-creation failure (after
// shutting the runtime down), 0 otherwise.
int main() {
    if (hsa_init() != HSA_STATUS_SUCCESS) {
        std::cerr << "Failed to initialize HSA Runtime\n";
        return 1;
    }

    // 1. Find the GPU
    hsa_agent_t gpu = find_gpu_agent();

    // 2. Create the queue
    hsa_queue_t* queue = nullptr;
    hsa_status_t status = hsa_queue_create(
        gpu,                   // agent
        1024,                  // size (must be a power of two)
        HSA_QUEUE_TYPE_MULTI,  // type: multiple producers
        nullptr,               // callback
        nullptr,               // callback data
        UINT32_MAX,            // private_segment_size
        UINT32_MAX,            // group_segment_size
        &queue                 // receives the queue pointer
    );
    if (status != HSA_STATUS_SUCCESS) {
        std::cerr << "Failed to create queue!\n";
        hsa_shut_down();  // do not leave the runtime initialized on failure
        return 1;
    }
    std::cout << "Queue created successfully\n";
    std::cout << " Base address: " << queue->base_address << "\n";
    std::cout << " Size: " << queue->size << "\n";
    std::cout << " Doorbell: " << queue->doorbell_signal.handle << "\n";

    // 3. Use the queue...

    // 4. Destroy the queue
    if (hsa_queue_destroy(queue) != HSA_STATUS_SUCCESS) {
        std::cerr << "Failed to destroy queue!\n";
    }

    hsa_shut_down();
    return 0;
}
5.4.2 队列类型选择
cpp
// NOTE(review): both calls below store into the same `queue` variable, so they
// read as two alternatives rather than a sequence to run back to back.

// Single-producer queue (faster, but only one thread may write to it)
hsa_queue_create(gpu, 1024, HSA_QUEUE_TYPE_SINGLE,
                 nullptr, nullptr, UINT32_MAX, UINT32_MAX, &queue);
// Multi-producer queue (several threads may write to it concurrently)
hsa_queue_create(gpu, 1024, HSA_QUEUE_TYPE_MULTI,
                 nullptr, nullptr, UINT32_MAX, UINT32_MAX, &queue);
// Guidance:
// - only one thread dispatches work      -> HSA_QUEUE_TYPE_SINGLE
// - several threads dispatch work at once -> HSA_QUEUE_TYPE_MULTI
5.4.3 队列状态查询
cpp
// Prints the write index, the read index and the number of occupied slots of
// `queue`, and warns on stderr when the occupied count reaches the queue size.
void check_queue_status(hsa_queue_t* queue) {
    // Write index: the next slot a producer will fill.
    const uint64_t producer_pos = hsa_queue_load_write_index_relaxed(queue);
    std::cout << "Write Index: " << producer_pos << "\n";

    // Read index: how far the consumer has progressed.
    const uint64_t consumer_pos = hsa_queue_load_read_index_relaxed(queue);
    std::cout << "Read Index: " << consumer_pos << "\n";

    // Occupancy is the distance between the two indices.
    const uint64_t in_flight = producer_pos - consumer_pos;
    std::cout << "Queue Usage: " << in_flight << " / " << queue->size << "\n";

    const bool has_room = in_flight < queue->size;
    if (!has_room) {
        std::cerr << "Warning: Queue is full!\n";
    }
}
5.5 完整示例代码解析
5.5.1 向量加法完整示例
这是一个演示完整流程骨架的HSA程序:包含初始化、设备发现、内存分配、队列与Signal创建以及资源清理。Kernel的加载与派发较为复杂,将在后续章节讲解,此处用CPU计算代替:
vector_add.cpp:
cpp
#include <hsa/hsa.h>
#include <hsa/hsa_ext_amd.h>
#include <iostream>
#include <fstream>
#include <vector>
#include <cstring>
#include <cassert>
// Helper macro.
// Evaluates `call` exactly once. When the result is not HSA_STATUS_SUCCESS it
// prints the status text with file and line to stderr and terminates with
// exit(1). If hsa_status_string cannot translate the code, a fixed fallback
// text is printed instead of reading an uninitialized pointer.
#define HSA_CHECK(call) \
    do { \
        hsa_status_t _status = (call); \
        if (_status != HSA_STATUS_SUCCESS) { \
            const char* str = nullptr; \
            if (hsa_status_string(_status, &str) != HSA_STATUS_SUCCESS || \
                str == nullptr) { \
                str = "unknown HSA status"; \
            } \
            std::cerr << "HSA Error: " << str << " at " << __FILE__ \
                      << ":" << __LINE__ << std::endl; \
            exit(1); \
        } \
    } while(0)
// Find a GPU agent.
// Returns the first agent whose HSA_AGENT_INFO_DEVICE is HSA_DEVICE_TYPE_GPU.
// If there is none, prints a message to stderr and calls exit(1). This is a
// runtime condition, so it is checked with a real branch rather than assert(),
// which is compiled out when NDEBUG is defined.
hsa_agent_t find_gpu() {
    struct Data { hsa_agent_t agent; bool found; } data = {{0}, false};
    hsa_iterate_agents([](hsa_agent_t agent, void* d) -> hsa_status_t {
        Data* data = static_cast<Data*>(d);
        hsa_device_type_t type;
        // Skip agents whose device type cannot be queried.
        if (hsa_agent_get_info(agent, HSA_AGENT_INFO_DEVICE, &type) !=
            HSA_STATUS_SUCCESS) {
            return HSA_STATUS_SUCCESS;
        }
        if (type == HSA_DEVICE_TYPE_GPU) {
            data->agent = agent;
            data->found = true;
            return HSA_STATUS_INFO_BREAK;  // stop the traversal
        }
        return HSA_STATUS_SUCCESS;
    }, &data);
    if (!data.found) {
        std::cerr << "No GPU found\n";
        exit(1);
    }
    return data.agent;
}
// Find a memory region.
// Returns the first region of `agent` that is in HSA_REGION_SEGMENT_GLOBAL and
// has HSA_REGION_GLOBAL_FLAG_FINE_GRAINED set. If there is none, prints a
// message to stderr and calls exit(1). This is a runtime condition, so it is
// checked with a real branch rather than assert(), which is compiled out when
// NDEBUG is defined.
hsa_region_t find_global_region(hsa_agent_t agent) {
    struct Data { hsa_region_t region; bool found; } data = {{0}, false};
    hsa_agent_iterate_regions(agent,
        [](hsa_region_t region, void* d) -> hsa_status_t {
            Data* data = static_cast<Data*>(d);
            hsa_region_segment_t segment;
            // Skip regions whose attributes cannot be queried.
            if (hsa_region_get_info(region, HSA_REGION_INFO_SEGMENT,
                                    &segment) != HSA_STATUS_SUCCESS) {
                return HSA_STATUS_SUCCESS;
            }
            if (segment == HSA_REGION_SEGMENT_GLOBAL) {
                hsa_region_global_flag_t flags;
                if (hsa_region_get_info(region, HSA_REGION_INFO_GLOBAL_FLAGS,
                                        &flags) != HSA_STATUS_SUCCESS) {
                    return HSA_STATUS_SUCCESS;
                }
                if (flags & HSA_REGION_GLOBAL_FLAG_FINE_GRAINED) {
                    data->region = region;
                    data->found = true;
                    return HSA_STATUS_INFO_BREAK;  // stop the traversal
                }
            }
            return HSA_STATUS_SUCCESS;
        }, &data);
    if (!data.found) {
        std::cerr << "No fine-grained region found\n";
        exit(1);
    }
    return data.region;
}
// Vector-add walkthrough: initializes the runtime, finds a GPU and a
// fine-grained region, allocates three N-element float buffers, creates a
// queue and a signal, computes c = a + b on the CPU as a stand-in for the
// kernel dispatch, verifies the result and releases every resource.
// Every HSA call goes through HSA_CHECK, so any failure is reported and exits.
int main(int argc, char** argv) {
    (void)argc;  // command-line arguments are not used
    (void)argv;
    const int N = 1024;
    std::cout << "HSA Vector Add Example\n";
    std::cout << "======================\n\n";

    // 1. Initialize the HSA Runtime
    std::cout << "1. Initializing HSA Runtime...\n";
    HSA_CHECK(hsa_init());

    // 2. Find the GPU
    std::cout << "2. Finding GPU...\n";
    hsa_agent_t gpu = find_gpu();
    char name[64] = {0};
    HSA_CHECK(hsa_agent_get_info(gpu, HSA_AGENT_INFO_NAME, name));
    std::cout << " Found: " << name << "\n\n";

    // 3. Find the memory region
    std::cout << "3. Finding memory region...\n";
    hsa_region_t region = find_global_region(gpu);

    // 4. Allocate memory
    std::cout << "4. Allocating memory...\n";
    float* a = nullptr;
    float* b = nullptr;
    float* c = nullptr;
    HSA_CHECK(hsa_memory_allocate(region, N * sizeof(float),
                                  reinterpret_cast<void**>(&a)));
    HSA_CHECK(hsa_memory_allocate(region, N * sizeof(float),
                                  reinterpret_cast<void**>(&b)));
    HSA_CHECK(hsa_memory_allocate(region, N * sizeof(float),
                                  reinterpret_cast<void**>(&c)));
    std::cout << " Allocated 3 x " << N << " floats\n\n";

    // 5. Initialize the data
    std::cout << "5. Initializing data...\n";
    for (int i = 0; i < N; i++) {
        a[i] = static_cast<float>(i);
        b[i] = static_cast<float>(i * 2);
        c[i] = 0.0f;
    }

    // 6. Create the queue
    std::cout << "6. Creating queue...\n";
    hsa_queue_t* queue = nullptr;
    HSA_CHECK(hsa_queue_create(gpu, 256, HSA_QUEUE_TYPE_MULTI,
                               nullptr, nullptr, UINT32_MAX, UINT32_MAX,
                               &queue));

    // 7. Create the signal
    std::cout << "7. Creating signal...\n";
    hsa_signal_t signal;
    HSA_CHECK(hsa_signal_create(1, 0, nullptr, &signal));

    // Note: a real kernel load and dispatch would additionally need to:
    // - load the code object (.co file)
    // - create an executable
    // - look up the kernel symbol
    // - prepare the kernel arguments
    // - build an AQL dispatch packet
    // - submit it to the queue
    // That part is more involved and is covered in later chapters.

    // 8. Simulate the computation (runs on the CPU, to show the flow)
    std::cout << "8. Computing on CPU (demo)...\n";
    for (int i = 0; i < N; i++) {
        c[i] = a[i] + b[i];
    }

    // 9. Verify the results
    std::cout << "9. Verifying results...\n";
    bool passed = true;
    for (int i = 0; i < N; i++) {
        float expected = a[i] + b[i];
        if (c[i] != expected) {
            std::cerr << " Mismatch at " << i << ": "
                      << c[i] << " != " << expected << "\n";
            passed = false;
            break;
        }
    }
    if (passed) {
        std::cout << " ✓ All results correct!\n";
    } else {
        std::cout << " ✗ Verification failed!\n";
    }
    std::cout << "\nFirst 10 results:\n";
    for (int i = 0; i < 10; i++) {
        std::cout << " c[" << i << "] = " << a[i] << " + "
                  << b[i] << " = " << c[i] << "\n";
    }

    // 10. Release resources (checked, so a failed release is not silent)
    std::cout << "\n10. Cleaning up...\n";
    HSA_CHECK(hsa_signal_destroy(signal));
    HSA_CHECK(hsa_queue_destroy(queue));
    HSA_CHECK(hsa_memory_free(a));
    HSA_CHECK(hsa_memory_free(b));
    HSA_CHECK(hsa_memory_free(c));

    // 11. Shut the runtime down
    std::cout << "11. Shutting down HSA Runtime...\n";
    HSA_CHECK(hsa_shut_down());
    std::cout << "\nDone!\n";
    return 0;
}
编译运行:
bash
g++ vector_add.cpp -o vector_add \
-I/opt/rocm/include \
-L/opt/rocm/lib \
-lhsa-runtime64 \
-std=c++11
./vector_add
# 输出:
# HSA Vector Add Example
# ======================
#
# 1. Initializing HSA Runtime...
# 2. Finding GPU...
# Found: gfx1030
#
# 3. Finding memory region...
# 4. Allocating memory...
# Allocated 3 x 1024 floats
#
# 5. Initializing data...
# 6. Creating queue...
# 7. Creating signal...
# 8. Computing on CPU (demo)...
# 9. Verifying results...
# ✓ All results correct!
#
# First 10 results:
# c[0] = 0 + 0 = 0
# c[1] = 1 + 2 = 3
# c[2] = 2 + 4 = 6
# ...
#
# 10. Cleaning up...
# 11. Shutting down HSA Runtime...
#
# Done!
5.5.2 实用工具函数封装
hsa_utils.h:
cpp
#ifndef HSA_UTILS_H
#define HSA_UTILS_H
#include <hsa/hsa.h>
#include <iostream>
#include <string>
// Error-handling macro.
// Evaluates `call` exactly once. When the result is not HSA_STATUS_SUCCESS it
// prints the status text with file and line to stderr and terminates with
// exit(1). If hsa_status_string cannot translate the code, a fixed fallback
// text is printed instead of reading an uninitialized pointer.
#define HSA_CHECK(call) \
    do { \
        hsa_status_t _status = (call); \
        if (_status != HSA_STATUS_SUCCESS) { \
            const char* str = nullptr; \
            if (hsa_status_string(_status, &str) != HSA_STATUS_SUCCESS || \
                str == nullptr) { \
                str = "unknown HSA status"; \
            } \
            std::cerr << "HSA Error: " << str << " at " << __FILE__ \
                      << ":" << __LINE__ << std::endl; \
            exit(1); \
        } \
    } while(0)
// Scope guard for the HSA Runtime: the constructor calls hsa_init() through
// HSA_CHECK, the destructor calls hsa_shut_down(). Copying is disabled.
class HsaRuntime {
public:
    // Not copyable
    HsaRuntime(const HsaRuntime&) = delete;
    HsaRuntime& operator=(const HsaRuntime&) = delete;

    // Initializes the runtime; HSA_CHECK reports and exits on failure.
    HsaRuntime() { HSA_CHECK(hsa_init()); }

    // Shuts the runtime down; the returned status is not inspected.
    ~HsaRuntime() { hsa_shut_down(); }
};
// Agent查找器
class AgentFinder {
public:
static hsa_agent_t find_gpu() {
struct Data { hsa_agent_t agent; bool found; } data = {0, false};
hsa_iterate_agents([](hsa_agent_t agent, void* d) {
Data* data = static_cast<Data*>(d);
hsa_device_type_t type;
hsa_agent_get_info(agent, HSA_AGENT_INFO_DEVICE, &type);
if (type == HSA_DEVICE_TYPE_GPU) {
data->agent = agent;
data->found = true;
return HSA_STATUS_INFO_BREAK;
}
return HSA_STATUS_SUCCESS;
}, &data);
if (!data.found) {
std::cerr << "No GPU found!\n";
exit(1);
}
return data.agent;
}
static hsa_agent_t find_cpu() {
struct Data { hsa_agent_t agent; bool found; } data = {0, false};
hsa_iterate_agents([](hsa_agent_t agent, void* d) {
Data* data = static_cast<Data*>(d);
hsa_device_type_t type;
hsa_agent_get_info(agent, HSA_AGENT_INFO_DEVICE, &type);
if (type == HSA_DEVICE_TYPE_CPU) {
data->agent = agent;
data->found = true;
return HSA_STATUS_INFO_BREAK;
}
return HSA_STATUS_SUCCESS;
}, &data);
if (!data.found) {
std::cerr << "No CPU found!\n";
exit(1);
}
return data.agent;
}
};
// Region finder: static helper for locating memory regions of an agent.
class RegionFinder {
public:
    // Returns the first region of `agent` that is in HSA_REGION_SEGMENT_GLOBAL
    // and has HSA_REGION_GLOBAL_FLAG_FINE_GRAINED set. If none matches, prints
    // "No fine-grained region found!" to stderr and calls exit(1).
    static hsa_region_t find_fine_grained_region(hsa_agent_t agent) {
        struct SearchState {
            hsa_region_t region;
            bool found;
        };
        SearchState state = {{0}, false};

        auto visit = [](hsa_region_t candidate, void* opaque) -> hsa_status_t {
            hsa_region_segment_t seg;
            hsa_region_get_info(candidate, HSA_REGION_INFO_SEGMENT, &seg);
            if (seg != HSA_REGION_SEGMENT_GLOBAL) {
                return HSA_STATUS_SUCCESS;  // not the global segment
            }
            hsa_region_global_flag_t region_flags;
            hsa_region_get_info(candidate, HSA_REGION_INFO_GLOBAL_FLAGS,
                                &region_flags);
            if (!(region_flags & HSA_REGION_GLOBAL_FLAG_FINE_GRAINED)) {
                return HSA_STATUS_SUCCESS;  // global, but not fine-grained
            }
            SearchState* result = static_cast<SearchState*>(opaque);
            result->region = candidate;
            result->found = true;
            return HSA_STATUS_INFO_BREAK;
        };
        hsa_agent_iterate_regions(agent, visit, &state);

        if (state.found) {
            return state.region;
        }
        std::cerr << "No fine-grained region found!\n";
        exit(1);
    }
};
// Owning wrapper for an hsa_signal_t: the constructor creates the signal, the
// destructor destroys it.
//
// Copying is disabled. The class owns the handle, so an implicit copy would
// make two objects destroy the same signal.
class Signal {
public:
    // Creates a signal with the given initial value; HSA_CHECK reports and
    // exits on failure.
    Signal(hsa_signal_value_t initial_value = 1) {
        HSA_CHECK(hsa_signal_create(initial_value, 0, nullptr, &signal_));
    }

    ~Signal() {
        hsa_signal_destroy(signal_);
    }

    // Not copyable (see class comment)
    Signal(const Signal&) = delete;
    Signal& operator=(const Signal&) = delete;

    // Allows a Signal to be passed where the raw handle is expected. The
    // wrapper keeps ownership of the handle.
    operator hsa_signal_t() const { return signal_; }

    // Waits with hsa_signal_wait_scacquire until the signal value satisfies
    // `condition` against `compare_value` (by default: equals 0), using a
    // UINT64_MAX timeout hint and the blocked wait state.
    void wait(hsa_signal_condition_t condition = HSA_SIGNAL_CONDITION_EQ,
              hsa_signal_value_t compare_value = 0) {
        hsa_signal_wait_scacquire(signal_, condition, compare_value,
                                  UINT64_MAX, HSA_WAIT_STATE_BLOCKED);
    }

    // Returns the current signal value (relaxed load).
    hsa_signal_value_t load() const {
        return hsa_signal_load_relaxed(signal_);
    }

private:
    hsa_signal_t signal_;
};
#endif // HSA_UTILS_H
使用工具类的简化版本:
cpp
#include "hsa_utils.h"
// Simplified program built on the hsa_utils.h helpers: the runtime and the
// signal are released by their destructors; the buffer is freed explicitly.
int main() {
    // Initialized here, shut down automatically when `runtime` goes out of scope
    HsaRuntime runtime;

    // Simplified lookups
    hsa_agent_t gpu = AgentFinder::find_gpu();
    hsa_region_t region = RegionFinder::find_fine_grained_region(gpu);

    // Allocate memory
    const int N = 1024;
    float* data = nullptr;
    HSA_CHECK(hsa_memory_allocate(region, N * sizeof(float),
                                  reinterpret_cast<void**>(&data)));

    // Signal wrapper, created with the value 1
    Signal signal(1);

    // Business logic...
    // NOTE: wait() below returns once the signal value equals 0, so the work
    // placed here must bring the signal down to 0; otherwise the wait does not
    // complete.
    signal.wait();

    // The buffer is not owned by any wrapper, so free it explicitly (checked).
    // `signal` and `runtime` are cleaned up by their destructors afterwards.
    HSA_CHECK(hsa_memory_free(data));
    return 0;
}
🎯 本章总结
核心要点回顾
- Runtime初始化
  - hsa_init() / hsa_shut_down()
  - 错误处理和版本查询
  - RAII包装提高安全性
- 设备发现
  - hsa_iterate_agents() 枚举设备
  - hsa_agent_get_info() 查询信息
  - 区分CPU/GPU/DSP类型
- 内存管理
  - hsa_agent_iterate_regions() 查找内存区域
  - hsa_memory_allocate() / hsa_memory_free()
  - Fine-grained vs Coarse-grained
- 队列操作
  - hsa_queue_create() / hsa_queue_destroy()
  - SINGLE vs MULTI类型
  - 队列状态监控
- Signal同步
  - hsa_signal_create() / hsa_signal_destroy()
  - hsa_signal_wait_scacquire() 等待
  - 用于Host-Device同步
API速查表
cpp
// Runtime
hsa_init()
hsa_shut_down()
hsa_system_get_info()
// Agent
hsa_iterate_agents(callback, data)
hsa_agent_get_info(agent, attribute, value)
hsa_agent_iterate_regions(agent, callback, data)
// Memory
hsa_region_get_info(region, attribute, value)
hsa_memory_allocate(region, size, ptr)
hsa_memory_free(ptr)
// Queue
hsa_queue_create(agent, size, type, callback, data,
private_size, group_size, queue)
hsa_queue_destroy(queue)
hsa_queue_load_write_index_relaxed(queue)
hsa_queue_load_read_index_relaxed(queue)
// Signal
hsa_signal_create(initial_value, num_consumers, consumers, signal)
hsa_signal_destroy(signal)
hsa_signal_wait_scacquire(signal, condition, compare, timeout, wait_hint)
hsa_signal_load_relaxed(signal)
hsa_signal_store_relaxed(signal, value)
实践建议
- 错误处理:使用HSA_CHECK宏简化
- 资源管理:用RAII避免泄漏
- 工具封装:封装常用查找函数
- 调试技巧:启用HSA_ENABLE_DEBUG
- 性能考虑:选择合适的内存区域和队列类型
🔗 导航
- 上一章 : 第03章:HSA编程模型基础
- 下一章 : 第06章:Agent设备抽象
- 返回目录 : 00-HSA-Runtime学习文档目录