第05章:HSA-API快速入门

章节概述

本章通过完整的代码示例,介绍HSA Runtime的核心API使用方法。你将学习如何初始化Runtime、发现设备、分配内存、创建队列、加载Kernel以及派发执行任务。

难度级别 : 🟢 基础
预计阅读时间 : 70分钟
前置知识: 第01-04章


📋 本章学习目标

完成本章学习后,你将能够:

  • ✅ 掌握HSA Runtime的初始化和销毁
  • ✅ 枚举和查询系统中的计算设备
  • ✅ 分配和管理统一内存
  • ✅ 创建和使用队列
  • ✅ 加载Kernel代码对象
  • ✅ 派发Kernel并等待完成
  • ✅ 编写完整的HSA应用程序

5.1 Runtime初始化

5.1.1 基本初始化

最简初始化流程

cpp 复制代码
#include <hsa/hsa.h>
#include <iostream>
#include <cassert>

// Minimal HSA program: initializes the runtime, prints a success message and
// shuts the runtime down again.
// Returns 0 on success, 1 if hsa_init() or hsa_shut_down() reports an error.
int main() {
    // 1. Initialize the HSA runtime.
    hsa_status_t status = hsa_init();
    
    if (status != HSA_STATUS_SUCCESS) {
        std::cerr << "Failed to initialize HSA Runtime: " 
                  << status << std::endl;
        return 1;
    }
    
    std::cout << "HSA Runtime initialized successfully!" << std::endl;
    
    // 2. Perform HSA operations here...
    
    // 3. Shut the runtime down. The status is checked explicitly instead of
    //    with assert(), because assert() is compiled out when NDEBUG is defined.
    status = hsa_shut_down();
    if (status != HSA_STATUS_SUCCESS) {
        std::cerr << "Failed to shut down HSA Runtime: " 
                  << status << std::endl;
        return 1;
    }
    
    return 0;
}

编译运行

bash 复制代码
g++ basic_init.cpp -o basic_init \
    -I/opt/rocm/include \
    -L/opt/rocm/lib \
    -lhsa-runtime64

./basic_init
# Output: HSA Runtime initialized successfully!

5.1.2 错误处理

健壮的初始化代码

cpp 复制代码
#include <hsa/hsa.h>
#include <iostream>
#include <string>

// Converts an HSA status code into a std::string.
// This is a C++ overload that wraps the two-argument C API
// hsa_status_string(status, &out). When the C API call does not succeed (or
// yields a null pointer) a generic fallback containing the numeric code is
// returned, so the result never depends on an uninitialized pointer.
std::string hsa_status_string(hsa_status_t status) {
    const char* text = nullptr;
    const hsa_status_t lookup = hsa_status_string(status, &text);
    if (lookup != HSA_STATUS_SUCCESS || text == nullptr) {
        return "unknown HSA status " + std::to_string(static_cast<int>(status));
    }
    return std::string(text);
}

// Error-checking macro. Evaluates `call` exactly once; if the resulting status
// is anything other than HSA_STATUS_SUCCESS it prints the source file, line
// number and the textual status to std::cerr and terminates with exit(1).
// The do/while(0) wrapper makes the macro behave like a single statement.
#define HSA_CHECK(call) \
    do { \
        const hsa_status_t hsa_check_result_ = (call); \
        if (hsa_check_result_ == HSA_STATUS_SUCCESS) { \
            break; \
        } \
        std::cerr << "HSA Error at " << __FILE__ << ":" << __LINE__ \
                  << " - " << hsa_status_string(hsa_check_result_) << std::endl; \
        exit(1); \
    } while(0)

// Demonstrates HSA_CHECK: both the initialization and the shutdown call are
// wrapped, so any failing status is reported and ends the process via the macro.
int main() {
    // Use the macro to keep error handling short
    HSA_CHECK(hsa_init());
    
    std::cout << "HSA initialized" << std::endl;
    
    // Subsequent HSA calls can be wrapped in HSA_CHECK in the same way
    
    HSA_CHECK(hsa_shut_down());
    return 0;
}

5.1.3 版本查询

cpp 复制代码
#include <hsa/hsa.h>
#include <iostream>

int main() {
    hsa_init();
    
    // 查询HSA版本
    uint16_t major, minor;
    hsa_system_get_info(HSA_SYSTEM_INFO_VERSION_MAJOR, &major);
    hsa_system_get_info(HSA_SYSTEM_INFO_VERSION_MINOR, &minor);
    
    std::cout << "HSA Version: " << major << "." << minor << std::endl;
    
    // 查询时间戳
    uint64_t timestamp, freq;
    hsa_system_get_info(HSA_SYSTEM_INFO_TIMESTAMP, &timestamp);
    hsa_system_get_info(HSA_SYSTEM_INFO_TIMESTAMP_FREQUENCY, &freq);
    
    std::cout << "Timestamp: " << timestamp 
              << " Hz (freq: " << freq << ")" << std::endl;
    
    // 查询Signal最大值
    uint64_t max_wait;
    hsa_system_get_info(HSA_SYSTEM_INFO_SIGNAL_MAX_WAIT, &max_wait);
    std::cout << "Max signal wait: " << max_wait << std::endl;
    
    hsa_shut_down();
    return 0;
}

5.2 设备发现与查询

5.2.1 枚举所有Agents

cpp 复制代码
#include <hsa/hsa.h>
#include <hsa/hsa_ext_amd.h>
#include <iostream>
#include <vector>

// Per-agent properties gathered while enumerating agents.
// Every member has a default initializer so that an instance never carries
// indeterminate data, even if one of the info queries that fill it fails.
struct AgentInfo {
    hsa_agent_t handle{};        // agent handle as passed to the iteration callback
    char name[64] = {};          // destination buffer for the HSA_AGENT_INFO_NAME query
    hsa_device_type_t type{};    // result of the HSA_AGENT_INFO_DEVICE query
    uint32_t cu_count = 0;       // compute-unit count; remains 0 for non-GPU agents
};

// Callback for hsa_iterate_agents: appends one AgentInfo describing `agent`
// to the std::vector<AgentInfo> that `data` points to.
//
// Returns HSA_STATUS_SUCCESS after recording the agent. If any info query
// fails, its status is returned instead and nothing is appended for that agent.
hsa_status_t collect_agents(hsa_agent_t agent, void* data) {
    auto* agents = static_cast<std::vector<AgentInfo>*>(data);
    
    AgentInfo info;
    info.handle = agent;
    info.cu_count = 0;  // only GPUs get a real value below
    
    // Agent name
    hsa_status_t status = hsa_agent_get_info(agent, HSA_AGENT_INFO_NAME, info.name);
    if (status != HSA_STATUS_SUCCESS) {
        return status;
    }
    
    // Device type
    status = hsa_agent_get_info(agent, HSA_AGENT_INFO_DEVICE, &info.type);
    if (status != HSA_STATUS_SUCCESS) {
        return status;
    }
    
    // Compute-unit count (queried for GPUs only). The attribute belongs to the
    // AMD extension enum, which C++ does not convert implicitly to
    // hsa_agent_info_t, hence the explicit cast.
    if (info.type == HSA_DEVICE_TYPE_GPU) {
        status = hsa_agent_get_info(
            agent,
            static_cast<hsa_agent_info_t>(HSA_AMD_AGENT_INFO_COMPUTE_UNIT_COUNT),
            &info.cu_count);
        if (status != HSA_STATUS_SUCCESS) {
            return status;
        }
    }
    
    agents->push_back(info);
    return HSA_STATUS_SUCCESS;
}

int main() {
    hsa_init();
    
    // 枚举所有Agents
    std::vector<AgentInfo> agents;
    hsa_iterate_agents(collect_agents, &agents);
    
    std::cout << "Found " << agents.size() << " agents:\n";
    for (size_t i = 0; i < agents.size(); i++) {
        const AgentInfo& info = agents[i];
        std::cout << "Agent " << i << ": " << info.name << "\n";
        std::cout << "  Type: ";
        switch (info.type) {
            case HSA_DEVICE_TYPE_CPU:
                std::cout << "CPU\n";
                break;
            case HSA_DEVICE_TYPE_GPU:
                std::cout << "GPU\n";
                std::cout << "  Compute Units: " << info.cu_count << "\n";
                break;
            case HSA_DEVICE_TYPE_DSP:
                std::cout << "DSP\n";
                break;
        }
    }
    
    hsa_shut_down();
    return 0;
}

5.2.2 查找特定类型的Agent

cpp 复制代码
// Returns the first agent whose HSA_AGENT_INFO_DEVICE is HSA_DEVICE_TYPE_GPU.
// Prints a diagnostic to std::cerr and calls exit(1) when the enumeration
// itself fails or when no GPU agent is reported.
hsa_agent_t find_gpu_agent() {
    struct FindData {
        hsa_agent_t agent;
        bool found;
    } data = {{0}, false};
    
    const hsa_status_t status = hsa_iterate_agents(
        [](hsa_agent_t agent, void* d) -> hsa_status_t {
            hsa_device_type_t type;
            const hsa_status_t query =
                hsa_agent_get_info(agent, HSA_AGENT_INFO_DEVICE, &type);
            if (query != HSA_STATUS_SUCCESS) {
                return query;  // do not inspect `type` when the query failed
            }
            
            if (type != HSA_DEVICE_TYPE_GPU) {
                return HSA_STATUS_SUCCESS;  // keep iterating
            }
            
            FindData* data = static_cast<FindData*>(d);
            data->agent = agent;
            data->found = true;
            return HSA_STATUS_INFO_BREAK;  // stop the traversal
        }, &data);
    
    // HSA_STATUS_INFO_BREAK is the status the callback uses to stop early, so
    // it is not treated as an error here.
    if (status != HSA_STATUS_SUCCESS && status != HSA_STATUS_INFO_BREAK) {
        std::cerr << "Agent enumeration failed: " << status << "\n";
        exit(1);
    }
    
    if (!data.found) {
        std::cerr << "No GPU found!\n";
        exit(1);
    }
    
    return data.agent;
}

5.2.3 查询Agent详细信息

cpp 复制代码
// Prints the name, device type, wavefront size, maximum work-group size and
// maximum queue size of `agent` to std::cout; for GPU agents it also prints
// the compute-unit count and the maximum number of waves per compute unit.
// A failed name or type query is reported on std::cerr and ends the function;
// any other failed query prints "<unavailable>" for that line.
void print_agent_details(hsa_agent_t agent) {
    char name[64] = {};
    if (hsa_agent_get_info(agent, HSA_AGENT_INFO_NAME, name) != HSA_STATUS_SUCCESS) {
        std::cerr << "Failed to query agent name\n";
        return;
    }
    std::cout << "Agent: " << name << "\n";
    
    // Device type
    hsa_device_type_t type;
    if (hsa_agent_get_info(agent, HSA_AGENT_INFO_DEVICE, &type) != HSA_STATUS_SUCCESS) {
        std::cerr << "Failed to query agent device type\n";
        return;
    }
    const char* type_name = "Unknown";
    switch (type) {
        case HSA_DEVICE_TYPE_CPU: type_name = "CPU"; break;
        case HSA_DEVICE_TYPE_GPU: type_name = "GPU"; break;
        case HSA_DEVICE_TYPE_DSP: type_name = "DSP"; break;
        default: break;
    }
    std::cout << "  Device Type: " << type_name << "\n";
    
    // Queries one uint32_t attribute and prints it as "  <label>: <value>".
    const auto print_u32 = [agent](hsa_agent_info_t attribute, const char* label) {
        uint32_t value = 0;
        if (hsa_agent_get_info(agent, attribute, &value) == HSA_STATUS_SUCCESS) {
            std::cout << "  " << label << ": " << value << "\n";
        } else {
            std::cout << "  " << label << ": <unavailable>\n";
        }
    };
    
    print_u32(HSA_AGENT_INFO_WAVEFRONT_SIZE, "Wavefront Size");
    print_u32(HSA_AGENT_INFO_WORKGROUP_MAX_SIZE, "Max Workgroup Size");
    print_u32(HSA_AGENT_INFO_QUEUE_MAX_SIZE, "Max Queue Size");
    
    if (type == HSA_DEVICE_TYPE_GPU) {
        // GPU-only attributes from the AMD extension enum. C++ does not convert
        // between the two enum types implicitly, so the cast is required.
        print_u32(static_cast<hsa_agent_info_t>(HSA_AMD_AGENT_INFO_COMPUTE_UNIT_COUNT),
                  "Compute Units");
        print_u32(static_cast<hsa_agent_info_t>(HSA_AMD_AGENT_INFO_MAX_WAVES_PER_CU),
                  "Max Waves per CU");
    }
}

5.3 内存分配

5.3.1 查找内存区域

cpp 复制代码
// Searches the regions reported for `agent` and returns the first one that is
// in the global segment and carries the fine-grained flag.
// If no such region is reported, prints "No fine-grained region found!" to
// std::cerr and terminates the process with exit(1).
hsa_region_t find_global_region(hsa_agent_t agent) {
    struct FindData {
        hsa_region_t region;
        bool found;
    };
    FindData result = {0, false};
    
    // Region visitor: stops the traversal at the first matching region.
    auto pick_fine_grained = [](hsa_region_t candidate, void* user) -> hsa_status_t {
        // Segment of the candidate region
        hsa_region_segment_t seg;
        hsa_region_get_info(candidate, HSA_REGION_INFO_SEGMENT, &seg);
        
        if (seg == HSA_REGION_SEGMENT_GLOBAL) {
            // Global-segment flags of the candidate region
            hsa_region_global_flag_t bits;
            hsa_region_get_info(candidate, HSA_REGION_INFO_GLOBAL_FLAGS, &bits);
            
            if (bits & HSA_REGION_GLOBAL_FLAG_FINE_GRAINED) {
                FindData* out = static_cast<FindData*>(user);
                out->region = candidate;
                out->found = true;
                return HSA_STATUS_INFO_BREAK;
            }
        }
        
        return HSA_STATUS_SUCCESS;
    };
    hsa_agent_iterate_regions(agent, pick_fine_grained, &result);
    
    if (result.found) {
        return result.region;
    }
    
    std::cerr << "No fine-grained region found!\n";
    exit(1);
}

5.3.2 内存分配和释放

cpp 复制代码
#include <hsa/hsa.h>
#include <iostream>
#include <cstring>

int main() {
    hsa_init();
    
    // 1. 查找GPU和内存区域
    hsa_agent_t gpu = find_gpu_agent();
    hsa_region_t region = find_global_region(gpu);
    
    // 2. 分配内存
    const size_t size = 1024 * sizeof(float);
    float* device_ptr = nullptr;
    
    hsa_status_t status = hsa_memory_allocate(region, size, 
                                               (void**)&device_ptr);
    if (status != HSA_STATUS_SUCCESS) {
        std::cerr << "Memory allocation failed!\n";
        return 1;
    }
    
    std::cout << "Allocated " << size << " bytes at " 
              << device_ptr << "\n";
    
    // 3. 使用内存(CPU可以直接访问Fine-grained内存)
    for (int i = 0; i < 10; i++) {
        device_ptr[i] = i * 0.5f;
    }
    
    // 4. 验证
    std::cout << "First 10 elements:\n";
    for (int i = 0; i < 10; i++) {
        std::cout << "  [" << i << "] = " << device_ptr[i] << "\n";
    }
    
    // 5. 释放内存
    hsa_memory_free(device_ptr);
    
    hsa_shut_down();
    return 0;
}

5.3.3 查询内存区域信息

cpp 复制代码
// Prints a description of `region` to std::cout: its segment, its size in MB,
// the runtime allocation granule (only when the region reports that runtime
// allocation is allowed) and, for global-segment regions, the flags that are set.
// A failed segment query is reported on std::cerr and ends the function.
void print_region_info(hsa_region_t region) {
    // Segment type
    hsa_region_segment_t segment;
    if (hsa_region_get_info(region, HSA_REGION_INFO_SEGMENT, &segment)
            != HSA_STATUS_SUCCESS) {
        std::cerr << "Failed to query region segment\n";
        return;
    }
    std::cout << "Segment: ";
    switch (segment) {
        case HSA_REGION_SEGMENT_GLOBAL:
            std::cout << "GLOBAL\n";
            break;
        case HSA_REGION_SEGMENT_READONLY:
            std::cout << "READONLY\n";
            break;
        case HSA_REGION_SEGMENT_PRIVATE:
            std::cout << "PRIVATE\n";
            break;
        case HSA_REGION_SEGMENT_GROUP:
            std::cout << "GROUP\n";
            break;
        case HSA_REGION_SEGMENT_KERNARG:
            std::cout << "KERNARG\n";
            break;
        default:
            // Keeps the "Segment: " line terminated for any other value.
            std::cout << "UNKNOWN\n";
            break;
    }
    
    // Size
    size_t size = 0;
    if (hsa_region_get_info(region, HSA_REGION_INFO_SIZE, &size) == HSA_STATUS_SUCCESS) {
        std::cout << "  Size: " << (size / 1024 / 1024) << " MB\n";
    }
    
    // Allocation granule. This is HSA_REGION_INFO_RUNTIME_ALLOC_GRANULE, not
    // HSA_REGION_INFO_ALLOC_MAX_SIZE (which is the largest single allocation).
    // It is only queried when the region allows runtime allocation.
    bool alloc_allowed = false;
    if (hsa_region_get_info(region, HSA_REGION_INFO_RUNTIME_ALLOC_ALLOWED,
                            &alloc_allowed) == HSA_STATUS_SUCCESS && alloc_allowed) {
        size_t alloc_granule = 0;
        if (hsa_region_get_info(region, HSA_REGION_INFO_RUNTIME_ALLOC_GRANULE,
                                &alloc_granule) == HSA_STATUS_SUCCESS) {
            std::cout << "  Alloc Granule: " << alloc_granule << " bytes\n";
        }
    }
    
    // Information specific to global-segment regions
    if (segment == HSA_REGION_SEGMENT_GLOBAL) {
        // The flags value is a bit mask, so it is read into an integer rather
        // than into a single-enumerator variable.
        uint32_t flags = 0;
        if (hsa_region_get_info(region, HSA_REGION_INFO_GLOBAL_FLAGS, &flags)
                != HSA_STATUS_SUCCESS) {
            return;
        }
        
        std::cout << "  Flags:\n";
        if (flags & HSA_REGION_GLOBAL_FLAG_KERNARG) {
            std::cout << "    - KERNARG\n";
        }
        if (flags & HSA_REGION_GLOBAL_FLAG_FINE_GRAINED) {
            std::cout << "    - FINE_GRAINED\n";
        }
        if (flags & HSA_REGION_GLOBAL_FLAG_COARSE_GRAINED) {
            std::cout << "    - COARSE_GRAINED\n";
        }
    }
}

5.4 队列创建与管理

5.4.1 创建队列

cpp 复制代码
#include <hsa/hsa.h>
#include <iostream>

int main() {
    hsa_init();
    
    // 1. 查找GPU
    hsa_agent_t gpu = find_gpu_agent();
    
    // 2. 创建队列
    hsa_queue_t* queue = nullptr;
    
    hsa_status_t status = hsa_queue_create(
        gpu,                         // agent
        1024,                        // size (必须是2的幂)
        HSA_QUEUE_TYPE_MULTI,        // 类型:多生产者
        nullptr,                     // callback
        nullptr,                     // callback data
        UINT32_MAX,                  // private_segment_size
        UINT32_MAX,                  // group_segment_size
        &queue                       // 输出队列指针
    );
    
    if (status != HSA_STATUS_SUCCESS) {
        std::cerr << "Failed to create queue!\n";
        return 1;
    }
    
    std::cout << "Queue created successfully\n";
    std::cout << "  Base address: " << queue->base_address << "\n";
    std::cout << "  Size: " << queue->size << "\n";
    std::cout << "  Doorbell: " << queue->doorbell_signal.handle << "\n";
    
    // 3. 使用队列...
    
    // 4. 销毁队列
    hsa_queue_destroy(queue);
    
    hsa_shut_down();
    return 0;
}

5.4.2 队列类型选择

cpp 复制代码
// Fragment showing the two queue types side by side; `gpu` and `queue` are
// expected to be declared by the surrounding code. The two calls are
// alternatives - each one writes to `queue`, so use only one of them.

// Single-producer queue (faster, but only one thread may write to it)
hsa_queue_create(gpu, 1024, HSA_QUEUE_TYPE_SINGLE, 
                 nullptr, nullptr, UINT32_MAX, UINT32_MAX, &queue);

// Multi-producer queue (several threads may write to it concurrently)
hsa_queue_create(gpu, 1024, HSA_QUEUE_TYPE_MULTI, 
                 nullptr, nullptr, UINT32_MAX, UINT32_MAX, &queue);

// Recommendation:
// - only one thread dispatches work        -> HSA_QUEUE_TYPE_SINGLE
// - several threads dispatch work at once  -> HSA_QUEUE_TYPE_MULTI

5.4.3 队列状态查询

cpp 复制代码
// Prints the write index, the read index and the number of occupied slots of
// `queue` to std::cout, and warns on std::cerr when the queue is full.
// A null `queue` is reported on std::cerr and otherwise ignored.
void check_queue_status(hsa_queue_t* queue) {
    if (queue == nullptr) {
        std::cerr << "Warning: no queue to inspect\n";
        return;
    }
    
    // Sample the read index first, then the write index. The original order
    // (write first) lets the consumer overtake the stale write sample, which
    // makes `write_index - read_index` wrap to a huge unsigned value.
    // NOTE(review): this relies on the indices only ever increasing, with the
    // read index never ahead of the write index - confirm against the HSA spec.
    // The acquire load keeps the second load from being reordered before it.
    const uint64_t read_index = hsa_queue_load_read_index_scacquire(queue);
    const uint64_t write_index = hsa_queue_load_write_index_relaxed(queue);
    
    // Write index: next position to be written
    std::cout << "Write Index: " << write_index << "\n";
    
    // Read index: position the packet processor has consumed up to
    std::cout << "Read Index: " << read_index << "\n";
    
    // Queue occupancy
    const uint64_t used = write_index - read_index;
    std::cout << "Queue Usage: " << used << " / " << queue->size << "\n";
    
    if (used >= queue->size) {
        std::cerr << "Warning: Queue is full!\n";
    }
}

5.5 完整示例代码解析

5.5.1 向量加法完整示例

这是一个完整的HSA程序骨架,覆盖Runtime初始化、设备与内存区域查找、内存分配、队列和Signal创建以及资源清理;Kernel的加载与派发在本例中由CPU计算代替(见代码中的注释),将在后续章节详细讲解:

vector_add.cpp:

cpp 复制代码
#include <hsa/hsa.h>
#include <hsa/hsa_ext_amd.h>
#include <iostream>
#include <fstream>
#include <vector>
#include <cstring>
#include <cassert>

// Helper macro. Evaluates `call` exactly once; if the status is not
// HSA_STATUS_SUCCESS it prints the status text together with the source file
// and line to std::cerr and terminates with exit(1).
// `str` starts as nullptr and is replaced by a fixed fallback when
// hsa_status_string() cannot describe the code, so an uninitialized pointer is
// never streamed.
#define HSA_CHECK(call) \
    do { \
        hsa_status_t _status = (call); \
        if (_status != HSA_STATUS_SUCCESS) { \
            const char* str = nullptr; \
            if (hsa_status_string(_status, &str) != HSA_STATUS_SUCCESS || \
                str == nullptr) { \
                str = "unknown HSA status"; \
            } \
            std::cerr << "HSA Error: " << str << " at " << __FILE__ \
                      << ":" << __LINE__ << std::endl; \
            exit(1); \
        } \
    } while(0)

// Returns the first agent whose device type is HSA_DEVICE_TYPE_GPU.
// Prints a diagnostic to std::cerr and calls exit(1) when the enumeration
// fails or no GPU agent is reported. The check is a real runtime check, not an
// assert(): with NDEBUG defined an assert would vanish and a zero handle
// would be returned.
hsa_agent_t find_gpu() {
    struct Data { hsa_agent_t agent; bool found; } data = {{0}, false};
    
    const hsa_status_t status = hsa_iterate_agents(
        [](hsa_agent_t agent, void* d) -> hsa_status_t {
            hsa_device_type_t type;
            const hsa_status_t query =
                hsa_agent_get_info(agent, HSA_AGENT_INFO_DEVICE, &type);
            if (query != HSA_STATUS_SUCCESS) {
                return query;  // `type` is not valid; stop and report
            }
            if (type == HSA_DEVICE_TYPE_GPU) {
                Data* data = static_cast<Data*>(d);
                data->agent = agent;
                data->found = true;
                return HSA_STATUS_INFO_BREAK;  // stop at the first GPU
            }
            return HSA_STATUS_SUCCESS;
        }, &data);
    
    // HSA_STATUS_INFO_BREAK is how the callback stops early; it is not an error.
    if (status != HSA_STATUS_SUCCESS && status != HSA_STATUS_INFO_BREAK) {
        std::cerr << "Agent enumeration failed: " << status << "\n";
        exit(1);
    }
    if (!data.found) {
        std::cerr << "No GPU found\n";
        exit(1);
    }
    return data.agent;
}

// Returns the first region reported for `agent` that is in the global segment
// and has the fine-grained flag set.
// Prints a diagnostic to std::cerr and calls exit(1) when the iteration fails
// or no such region exists. The check is a real runtime check, not an
// assert(): with NDEBUG defined an assert would vanish and a zero handle
// would be returned.
hsa_region_t find_global_region(hsa_agent_t agent) {
    struct Data { hsa_region_t region; bool found; } data = {{0}, false};
    
    const hsa_status_t status = hsa_agent_iterate_regions(agent, 
        [](hsa_region_t region, void* d) -> hsa_status_t {
            hsa_region_segment_t segment;
            hsa_status_t query =
                hsa_region_get_info(region, HSA_REGION_INFO_SEGMENT, &segment);
            if (query != HSA_STATUS_SUCCESS) {
                return query;
            }
            if (segment != HSA_REGION_SEGMENT_GLOBAL) {
                return HSA_STATUS_SUCCESS;  // not a global region; keep looking
            }
            
            // The flags value is a bit mask, so read it into an integer.
            uint32_t flags = 0;
            query = hsa_region_get_info(region, HSA_REGION_INFO_GLOBAL_FLAGS, &flags);
            if (query != HSA_STATUS_SUCCESS) {
                return query;
            }
            if (flags & HSA_REGION_GLOBAL_FLAG_FINE_GRAINED) {
                Data* data = static_cast<Data*>(d);
                data->region = region;
                data->found = true;
                return HSA_STATUS_INFO_BREAK;  // stop at the first match
            }
            return HSA_STATUS_SUCCESS;
        }, &data);
    
    // HSA_STATUS_INFO_BREAK is how the callback stops early; it is not an error.
    if (status != HSA_STATUS_SUCCESS && status != HSA_STATUS_INFO_BREAK) {
        std::cerr << "Region iteration failed: " << status << "\n";
        exit(1);
    }
    if (!data.found) {
        std::cerr << "No fine-grained region found\n";
        exit(1);
    }
    return data.region;
}

// Vector-add walkthrough: initializes the runtime, finds a GPU agent and a
// fine-grained region, allocates three float buffers, creates a queue and a
// signal, computes c = a + b on the CPU as a stand-in for a kernel dispatch,
// verifies the result and releases every resource.
// Returns 0 when verification passes and 1 when it fails; any failing HSA
// call terminates the process through HSA_CHECK.
int main(int /*argc*/, char** /*argv*/) {
    const int N = 1024;
    const size_t bytes = N * sizeof(float);
    
    std::cout << "HSA Vector Add Example\n";
    std::cout << "======================\n\n";
    
    // 1. Initialize the HSA runtime
    std::cout << "1. Initializing HSA Runtime...\n";
    HSA_CHECK(hsa_init());
    
    // 2. Find the GPU agent
    std::cout << "2. Finding GPU...\n";
    hsa_agent_t gpu = find_gpu();
    
    char name[64] = {};
    HSA_CHECK(hsa_agent_get_info(gpu, HSA_AGENT_INFO_NAME, name));
    std::cout << "   Found: " << name << "\n\n";
    
    // 3. Find the memory region
    std::cout << "3. Finding memory region...\n";
    hsa_region_t region = find_global_region(gpu);
    
    // 4. Allocate memory. The runtime writes a void*, so void* variables are
    //    passed and converted afterwards instead of casting float** to void**.
    std::cout << "4. Allocating memory...\n";
    void* raw_a = nullptr;
    void* raw_b = nullptr;
    void* raw_c = nullptr;
    HSA_CHECK(hsa_memory_allocate(region, bytes, &raw_a));
    HSA_CHECK(hsa_memory_allocate(region, bytes, &raw_b));
    HSA_CHECK(hsa_memory_allocate(region, bytes, &raw_c));
    float* a = static_cast<float*>(raw_a);
    float* b = static_cast<float*>(raw_b);
    float* c = static_cast<float*>(raw_c);
    std::cout << "   Allocated 3 x " << N << " floats\n\n";
    
    // 5. Initialize the input data
    std::cout << "5. Initializing data...\n";
    for (int i = 0; i < N; i++) {
        a[i] = static_cast<float>(i);
        b[i] = static_cast<float>(i * 2);
        c[i] = 0.0f;
    }
    
    // 6. Create the queue
    std::cout << "6. Creating queue...\n";
    hsa_queue_t* queue = nullptr;
    HSA_CHECK(hsa_queue_create(gpu, 256, HSA_QUEUE_TYPE_MULTI,
                                nullptr, nullptr, UINT32_MAX, UINT32_MAX,
                                &queue));
    
    // 7. Create the signal (initial value 1)
    std::cout << "7. Creating signal...\n";
    hsa_signal_t signal;
    HSA_CHECK(hsa_signal_create(1, 0, nullptr, &signal));
    
    // Note: a real kernel launch additionally requires:
    // - loading a code object (.co file)
    // - creating an executable
    // - looking up the kernel symbol
    // - preparing the kernel arguments
    // - building an AQL dispatch packet
    // - submitting it to the queue
    // These steps are more involved and are covered in later chapters.
    
    // 8. Simulated computation (runs on the CPU to demonstrate the flow)
    std::cout << "8. Computing on CPU (demo)...\n";
    for (int i = 0; i < N; i++) {
        c[i] = a[i] + b[i];
    }
    
    // 9. Verify the results. The expected value is derived from the index, not
    //    re-read from a[] and b[], so the check is not a tautology. All values
    //    are small integers, so exact float comparison is valid here.
    std::cout << "9. Verifying results...\n";
    bool passed = true;
    for (int i = 0; i < N; i++) {
        const float expected = static_cast<float>(i) + static_cast<float>(i * 2);
        if (c[i] != expected) {
            std::cerr << "   Mismatch at " << i << ": " 
                      << c[i] << " != " << expected << "\n";
            passed = false;
            break;
        }
    }
    
    if (passed) {
        std::cout << "   ✓ All results correct!\n";
    } else {
        std::cout << "   ✗ Verification failed!\n";
    }
    
    std::cout << "\nFirst 10 results:\n";
    for (int i = 0; i < 10; i++) {
        std::cout << "   c[" << i << "] = " << a[i] << " + " 
                  << b[i] << " = " << c[i] << "\n";
    }
    
    // 10. Release resources
    std::cout << "\n10. Cleaning up...\n";
    HSA_CHECK(hsa_signal_destroy(signal));
    HSA_CHECK(hsa_queue_destroy(queue));
    HSA_CHECK(hsa_memory_free(a));
    HSA_CHECK(hsa_memory_free(b));
    HSA_CHECK(hsa_memory_free(c));
    
    // 11. Shut the runtime down
    std::cout << "11. Shutting down HSA Runtime...\n";
    HSA_CHECK(hsa_shut_down());
    
    std::cout << "\nDone!\n";
    // Report a verification failure through the exit code as well.
    return passed ? 0 : 1;
}

编译运行

bash 复制代码
g++ vector_add.cpp -o vector_add \
    -I/opt/rocm/include \
    -L/opt/rocm/lib \
    -lhsa-runtime64 \
    -std=c++11

./vector_add

# Output:
# HSA Vector Add Example
# ======================
# 
# 1. Initializing HSA Runtime...
# 2. Finding GPU...
#    Found: gfx1030
# 
# 3. Finding memory region...
# 4. Allocating memory...
#    Allocated 3 x 1024 floats
# 
# 5. Initializing data...
# 6. Creating queue...
# 7. Creating signal...
# 8. Computing on CPU (demo)...
# 9. Verifying results...
#    ✓ All results correct!
# 
# First 10 results:
#    c[0] = 0 + 0 = 0
#    c[1] = 1 + 2 = 3
#    c[2] = 2 + 4 = 6
#    ...
# 
# 10. Cleaning up...
# 11. Shutting down HSA Runtime...
# 
# Done!

5.5.2 实用工具函数封装

hsa_utils.h

cpp 复制代码
#ifndef HSA_UTILS_H
#define HSA_UTILS_H

#include <hsa/hsa.h>
#include <iostream>
#include <string>

// Error-handling macro. Evaluates `call` exactly once; if the status is not
// HSA_STATUS_SUCCESS it prints the status text together with the source file
// and line to std::cerr and terminates with exit(1).
// `str` starts as nullptr and is replaced by a fixed fallback when
// hsa_status_string() cannot describe the code, so an uninitialized pointer is
// never streamed.
#define HSA_CHECK(call) \
    do { \
        hsa_status_t _status = (call); \
        if (_status != HSA_STATUS_SUCCESS) { \
            const char* str = nullptr; \
            if (hsa_status_string(_status, &str) != HSA_STATUS_SUCCESS || \
                str == nullptr) { \
                str = "unknown HSA status"; \
            } \
            std::cerr << "HSA Error: " << str << " at " << __FILE__ \
                      << ":" << __LINE__ << std::endl; \
            exit(1); \
        } \
    } while(0)

// Scope guard for the HSA runtime. Constructing an instance calls hsa_init()
// through HSA_CHECK (so a failure ends the process); destroying it calls
// hsa_shut_down(). Instances cannot be copied.
class HsaRuntime {
public:
    // Copying is disabled.
    HsaRuntime(const HsaRuntime&) = delete;
    HsaRuntime& operator=(const HsaRuntime&) = delete;
    
    // Initializes the runtime.
    HsaRuntime() { HSA_CHECK(hsa_init()); }
    
    // Shuts the runtime down; the returned status is discarded.
    ~HsaRuntime() { hsa_shut_down(); }
};

// Agent查找器
class AgentFinder {
public:
    static hsa_agent_t find_gpu() {
        struct Data { hsa_agent_t agent; bool found; } data = {0, false};
        
        hsa_iterate_agents([](hsa_agent_t agent, void* d) {
            Data* data = static_cast<Data*>(d);
            hsa_device_type_t type;
            hsa_agent_get_info(agent, HSA_AGENT_INFO_DEVICE, &type);
            if (type == HSA_DEVICE_TYPE_GPU) {
                data->agent = agent;
                data->found = true;
                return HSA_STATUS_INFO_BREAK;
            }
            return HSA_STATUS_SUCCESS;
        }, &data);
        
        if (!data.found) {
            std::cerr << "No GPU found!\n";
            exit(1);
        }
        
        return data.agent;
    }
    
    static hsa_agent_t find_cpu() {
        struct Data { hsa_agent_t agent; bool found; } data = {0, false};
        
        hsa_iterate_agents([](hsa_agent_t agent, void* d) {
            Data* data = static_cast<Data*>(d);
            hsa_device_type_t type;
            hsa_agent_get_info(agent, HSA_AGENT_INFO_DEVICE, &type);
            if (type == HSA_DEVICE_TYPE_CPU) {
                data->agent = agent;
                data->found = true;
                return HSA_STATUS_INFO_BREAK;
            }
            return HSA_STATUS_SUCCESS;
        }, &data);
        
        if (!data.found) {
            std::cerr << "No CPU found!\n";
            exit(1);
        }
        
        return data.agent;
    }
};

// Region lookup helper.
class RegionFinder {
public:
    // Returns the first region reported for `agent` that is in the global
    // segment and has the fine-grained flag set. If there is none, prints
    // "No fine-grained region found!" to std::cerr and calls exit(1).
    static hsa_region_t find_fine_grained_region(hsa_agent_t agent) {
        struct Data { hsa_region_t region; bool found; };
        Data result = {0, false};
        
        hsa_agent_iterate_regions(agent, 
            [](hsa_region_t candidate, void* user) -> hsa_status_t {
                // Skip everything outside the global segment.
                hsa_region_segment_t seg;
                hsa_region_get_info(candidate, HSA_REGION_INFO_SEGMENT, &seg);
                if (seg != HSA_REGION_SEGMENT_GLOBAL) {
                    return HSA_STATUS_SUCCESS;
                }
                
                // Skip global regions without the fine-grained flag.
                hsa_region_global_flag_t bits;
                hsa_region_get_info(candidate, HSA_REGION_INFO_GLOBAL_FLAGS, &bits);
                if (!(bits & HSA_REGION_GLOBAL_FLAG_FINE_GRAINED)) {
                    return HSA_STATUS_SUCCESS;
                }
                
                // First match: record it and stop the traversal.
                Data* out = static_cast<Data*>(user);
                out->region = candidate;
                out->found = true;
                return HSA_STATUS_INFO_BREAK;
            }, &result);
        
        if (result.found) {
            return result.region;
        }
        
        std::cerr << "No fine-grained region found!\n";
        exit(1);
    }
};

// Owning wrapper around an hsa_signal_t: the constructor creates the signal
// (through HSA_CHECK, so a failure ends the process) and the destructor
// destroys it.
class Signal {
public:
    // Creates a signal with the given initial value (default 1).
    Signal(hsa_signal_value_t initial_value = 1) {
        HSA_CHECK(hsa_signal_create(initial_value, 0, nullptr, &signal_));
    }
    
    // Destroys the signal; the returned status is discarded.
    ~Signal() {
        hsa_signal_destroy(signal_);
    }
    
    // Copying is disabled: two copies would hold the same handle and both
    // destructors would call hsa_signal_destroy() on it.
    Signal(const Signal&) = delete;
    Signal& operator=(const Signal&) = delete;
    
    // Implicit access to the underlying handle for HSA API calls.
    operator hsa_signal_t() const { return signal_; }
    
    // Calls hsa_signal_wait_scacquire with a timeout of UINT64_MAX and the
    // blocked wait state; defaults wait for the value to equal 0. The value
    // returned by the wait call is discarded.
    void wait(hsa_signal_condition_t condition = HSA_SIGNAL_CONDITION_EQ,
              hsa_signal_value_t compare_value = 0) {
        hsa_signal_wait_scacquire(signal_, condition, compare_value,
                                   UINT64_MAX, HSA_WAIT_STATE_BLOCKED);
    }
    
    // Returns the current signal value using a relaxed load.
    hsa_signal_value_t load() const {
        return hsa_signal_load_relaxed(signal_);
    }
    
private:
    hsa_signal_t signal_{};
};

#endif // HSA_UTILS_H

使用工具类的简化版本

cpp 复制代码
#include "hsa_utils.h"

// Condensed example built on hsa_utils.h: the runtime and the signal are
// managed by their wrapper objects, the data buffer is freed explicitly.
int main() {
    // Initializes the runtime now and shuts it down when `runtime` goes out of scope
    HsaRuntime runtime;
    
    // Lookup helpers
    hsa_agent_t gpu = AgentFinder::find_gpu();
    hsa_region_t region = RegionFinder::find_fine_grained_region(gpu);
    
    // Allocate memory. The runtime writes a void*, so a void* is passed and
    // converted afterwards instead of casting a float** to void**.
    const int N = 1024;
    void* raw = nullptr;
    HSA_CHECK(hsa_memory_allocate(region, N * sizeof(float), &raw));
    float* data = static_cast<float*>(raw);
    
    // Signal wrapper, created with initial value 1
    Signal signal(1);
    
    // Application logic goes here...
    
    // Waits for the signal value to become 0 with a timeout of UINT64_MAX.
    // The work inserted above must bring the signal to 0; as written, nothing
    // in this function changes its value.
    signal.wait();
    
    // The buffer is not owned by a wrapper, so it is freed by hand. `signal`
    // and `runtime` are released by their destructors when main returns.
    HSA_CHECK(hsa_memory_free(data));
    return 0;
}

🎯 本章总结

核心要点回顾

  1. Runtime初始化

    • hsa_init() / hsa_shut_down()
    • 错误处理和版本查询
    • RAII包装提高安全性
  2. 设备发现

    • hsa_iterate_agents() 枚举设备
    • hsa_agent_get_info() 查询信息
    • 区分CPU/GPU/DSP类型
  3. 内存管理

    • hsa_agent_iterate_regions() 查找内存区域
    • hsa_memory_allocate() / hsa_memory_free()
    • Fine-grained vs Coarse-grained
  4. 队列操作

    • hsa_queue_create() / hsa_queue_destroy()
    • SINGLE vs MULTI类型
    • 队列状态监控
  5. Signal同步

    • hsa_signal_create() / hsa_signal_destroy()
    • hsa_signal_wait_scacquire() 等待
    • 用于Host-Device同步

API速查表

cpp 复制代码
// Runtime
hsa_init()
hsa_shut_down()
hsa_system_get_info()

// Agent
hsa_iterate_agents(callback, data)
hsa_agent_get_info(agent, attribute, value)
hsa_agent_iterate_regions(agent, callback, data)

// Memory
hsa_region_get_info(region, attribute, value)
hsa_memory_allocate(region, size, ptr)
hsa_memory_free(ptr)

// Queue
hsa_queue_create(agent, size, type, callback, data, 
                 private_size, group_size, queue)
hsa_queue_destroy(queue)
hsa_queue_load_write_index_relaxed(queue)
hsa_queue_load_read_index_relaxed(queue)

// Signal
hsa_signal_create(initial_value, num_consumers, consumers, signal)
hsa_signal_destroy(signal)
hsa_signal_wait_scacquire(signal, condition, compare, timeout, wait_hint)
hsa_signal_load_relaxed(signal)
hsa_signal_store_relaxed(signal, value)

实践建议

  1. 错误处理:使用HSA_CHECK宏简化
  2. 资源管理:用RAII避免泄漏
  3. 工具封装:封装常用查找函数
  4. 调试技巧:启用HSA_ENABLE_DEBUG
  5. 性能考虑:选择合适的内存区域和队列类型

🔗 导航


📚 延伸阅读

示例代码

API文档


相关推荐
Breath577 小时前
让 AI Agent 操作钉钉文档、表格的技能(Skill)库
人工智能·开源·钉钉·agent·ai agent·skill·agent skil
一个处女座的程序猿8 小时前
AGI之AI-Assistant之MultiAgent之OpenClaw:IronClaw的简介、安装和使用方法、案例应用之详细攻略
agent·multiagent·ai-assistant·ironclaw
熏鱼的小迷弟Liu9 小时前
【AI】ReAct是什么?解决了哪些痛点?有哪些应用场景?
ai·agent
金智维科技官方9 小时前
信通院认证,金智维的政务智能体让政务工作“智能自动化”
人工智能·ai·自动化·agent·智能体
SelectDB技术团队10 小时前
易车 × Apache Doris:构建湖仓一体新架构,加速 AI 业务融合实践
数据仓库·人工智能·数据分析·agent·apache doris·mcp·易车
TechFind11 小时前
OpenClaw 多 Agent 协作实战:从零搭建自动化内容营销系统
人工智能·agent
桦说编程12 小时前
提示词工程的艺术
设计模式·agent·ai编程
明月(Alioo)12 小时前
OpenClaw与ClawHub的关系:当“智能体”遇上“技能商店”
python·ai·agent