HIP Runtime资源分配相关的核心API分类总结

根据HIP Runtime的架构设计，以下是资源分配相关的核心API分类总结，特别关注AI工作负载的优化：

一、内存分配API（核心资源）

1. 设备内存分配（GPU显存）

c 复制代码

// Basic allocation
hipError_t hipMalloc(void** ptr, size_t size);
hipError_t hipMallocPitch(void** ptr, size_t* pitch, size_t width, size_t height);

// AI-oriented allocation (alignment, NUMA awareness)
// NOTE(review): hipMallocAligned and hipMallocNuma do not appear in the public
// HIP runtime API reference — confirm they exist before relying on them.
hipError_t hipMallocAligned(void** ptr, size_t size, size_t alignment);  // tensor alignment
hipError_t hipMallocNuma(void** ptr, size_t size, int numaNode);         // NUMA optimization

// Release
hipError_t hipFree(void* ptr);

2. 主机内存分配（Pinned/Page-Locked）

c 复制代码

// Host memory allocation and release (pinned / page-locked, per the section heading).
// `flags` takes the hipHostMalloc* options listed below.
hipError_t hipHostMalloc(void** ptr, size_t size, unsigned int flags);
hipError_t hipHostFree(void* ptr);

// Flag options for hipHostMalloc
#define hipHostMallocDefault        0x00
#define hipHostMallocPortable       0x01    // accessible from multiple GPUs
#define hipHostMallocMapped         0x02    // mapped into the device address space
#define hipHostMallocWriteCombined  0x04    // write-combined
// hipHostMallocNumaUser is a high bit in HIP's header, not the next low bit
// after hipHostMallocWriteCombined.
#define hipHostMallocNumaUser       0x20000000  // user-specified NUMA node

3. 统一内存分配（UM，AI模型常用）

c 复制代码

// Managed (unified) memory allocation; `flags` takes the hipMemAttach* options below.
hipError_t hipMallocManaged(void** ptr, size_t size, unsigned int flags);
// Flag options
#define hipMemAttachGlobal          0x01    // global attach
#define hipMemAttachHost            0x02    // host attach
#define hipMemAttachSingle          0x04    // single-GPU attach

// Unified-memory hints (working-set tuning)
hipError_t hipMemAdvise(const void* ptr, size_t count, hipMemoryAdvise advice, int deviceId);
hipError_t hipMemPrefetchAsync(const void* ptr, size_t count, int dstDeviceId, hipStream_t stream);

4. 内存池分配（AI训练优化）

c 复制代码

// Memory pool creation and management
hipError_t hipMemPoolCreate(hipMemPool_t* memPool, const hipMemPoolProps* poolProps);
hipError_t hipMemPoolDestroy(hipMemPool_t memPool);

// Allocate from a memory pool
hipError_t hipMallocFromPoolAsync(void** ptr, size_t size, hipMemPool_t memPool, hipStream_t stream);

// Memory pool properties.
// Field names and layout follow hipMemPoolProps in HIP's hip_runtime_api.h:
// the handle field is `handleTypes`, a Windows security-attributes pointer sits
// between `location` and `maxSize`, and the reserved tail is a byte array.
// NOTE(review): `maxSize` and the reserved length depend on the ROCm release —
// confirm against the targeted version.
typedef struct hipMemPoolProps {
    hipMemAllocationType allocType;          // allocation type
    hipMemAllocationHandleType handleTypes;  // handle types
    hipMemLocation location;                 // memory location
    void* win32SecurityAttributes;           // Windows-specific security attributes
    size_t maxSize;                          // maximum pool size
    unsigned char reserved[56];              // reserved
} hipMemPoolProps;

二、流资源分配API（执行并行性）

1. 流创建与管理

c 复制代码

// Stream creation
hipError_t hipStreamCreate(hipStream_t* stream);
hipError_t hipStreamCreateWithFlags(hipStream_t* stream, unsigned int flags);
hipError_t hipStreamCreateWithPriority(hipStream_t* stream, unsigned int flags, int priority);

// Stream flags
#define hipStreamDefault             0x00    // default stream
#define hipStreamNonBlocking         0x01    // non-blocking stream (compute/transfer overlap)
// NOTE(review): hipStreamComputeOnly does not appear among the public HIP
// stream creation flags — confirm before use.
#define hipStreamComputeOnly         0x02    // compute-only stream

// Stream destruction
hipError_t hipStreamDestroy(hipStream_t stream);

2. 默认流管理

c 复制代码

// Get/set the default stream for a device (per the section heading).
// NOTE(review): hipStreamGetDeviceDefault and hipStreamSetDeviceDefault do not
// appear in the public HIP runtime API reference — confirm their origin.
hipError_t hipStreamGetDeviceDefault(hipStream_t* stream, int deviceId);
hipError_t hipStreamSetDeviceDefault(hipStream_t stream, int deviceId);

3. 流同步控制

c 复制代码

// Stream synchronization control: synchronize on a stream, query a stream,
// and make a stream wait on an event.
hipError_t hipStreamSynchronize(hipStream_t stream);
hipError_t hipStreamQuery(hipStream_t stream);
hipError_t hipStreamWaitEvent(hipStream_t stream, hipEvent_t event, unsigned int flags);

三、事件资源分配API（时间测量与同步）

1. 事件创建与管理

c 复制代码

// Event creation; the WithFlags variant takes the hipEvent* flags listed below.
hipError_t hipEventCreate(hipEvent_t* event);
hipError_t hipEventCreateWithFlags(hipEvent_t* event, unsigned int flags);

// Event flags
#define hipEventDefault             0x00
#define hipEventBlockingSync        0x01    // blocking synchronization
#define hipEventDisableTiming       0x02    // disable timing (synchronization only)
#define hipEventInterprocess        0x04    // inter-process event
// In HIP's header the release-scope flags occupy the top bits, so this value
// is not the next low bit after hipEventInterprocess.
#define hipEventReleaseToSystem     0x80000000  // release to system scope

// Event destruction
hipError_t hipEventDestroy(hipEvent_t event);

2. 事件池分配（高性能重用）

c 复制代码

// Event pool creation, destruction, and event creation from a pool.
// NOTE(review): hipEventPool_t, hipEventPoolProps and these three functions do
// not appear in the public HIP runtime API reference — confirm their origin.
hipError_t hipEventPoolCreate(hipEventPool_t* eventPool, const hipEventPoolProps* props);
hipError_t hipEventPoolDestroy(hipEventPool_t eventPool);
hipError_t hipEventCreateFromPool(hipEvent_t* event, hipEventPool_t pool);

四、内核资源分配API（AI计算核心）

1. 模块与函数管理

c 复制代码

// Module loading (from a file name or from an in-memory image) and unloading
hipError_t hipModuleLoad(hipModule_t* module, const char* fname);
hipError_t hipModuleLoadData(hipModule_t* module, const void* image);
hipError_t hipModuleUnload(hipModule_t module);

// Function lookup by kernel name within a module
hipError_t hipModuleGetFunction(hipFunction_t* function, hipModule_t module, const char* kname);

2. 内核参数准备

c 复制代码

// Kernel launch configuration: grid and block dimensions, shared-memory bytes,
// target stream, and the kernel argument arrays.
hipError_t hipModuleLaunchKernel(hipFunction_t f, 
                                 uint32_t gridDimX, uint32_t gridDimY, uint32_t gridDimZ,
                                 uint32_t blockDimX, uint32_t blockDimY, uint32_t blockDimZ,
                                 uint32_t sharedMemBytes, hipStream_t stream,
                                 void** kernelParams, void** extra);

// Kernel cache management
// NOTE(review): hipKernelCacheEnable and hipKernelCacheGetSize do not appear in
// the public HIP runtime API reference — confirm their origin.
// NOTE(review): `bool` requires <stdbool.h> when this is compiled as C.
hipError_t hipKernelCacheEnable(int deviceId, bool enable);
hipError_t hipKernelCacheGetSize(size_t* cacheSize, int deviceId);

五、纹理与表面API（AI图像处理）

1. 纹理资源分配

c 复制代码

// Texture object creation from resource, texture, and resource-view descriptors
hipError_t hipCreateTextureObject(hipTextureObject_t* pTexObject, 
                                  const hipResourceDesc* pResDesc,
                                  const hipTextureDesc* pTexDesc,
                                  const hipResourceViewDesc* pResViewDesc);

// Texture object destruction
hipError_t hipDestroyTextureObject(hipTextureObject_t texObject);

2. 表面内存分配

c 复制代码

// Surface object creation from a resource descriptor
hipError_t hipCreateSurfaceObject(hipSurfaceObject_t* pSurfObject, 
                                  const hipResourceDesc* pResDesc);

// Surface object destruction
hipError_t hipDestroySurfaceObject(hipSurfaceObject_t surfObject);

六、共享内存配置API（AI内核优化）

c 复制代码

// Per-function setters for the shared-memory configuration and the cache configuration
hipError_t hipFuncSetSharedMemConfig(const void* func, hipSharedMemConfig config);
hipError_t hipFuncSetCacheConfig(const void* func, hipFuncCache_t config);

// Shared-memory bank size options.
// Enumerators rely on implicit numbering starting from 0.
typedef enum hipSharedMemConfig {
    hipSharedMemBankSizeDefault = 0,  // default bank size
    hipSharedMemBankSizeFourByte,     // 4-byte banks
    hipSharedMemBankSizeEightByte     // 8-byte banks
} hipSharedMemConfig;

七、资源分配架构流程

AI训练典型资源分配流程：

复制代码

1. 初始化阶段：
   hipMallocManaged()  // 分配模型参数UM
   hipHostMalloc(WriteCombined)  // 分配训练数据pinned内存
   hipStreamCreateWithFlags(NonBlocking)  // 创建计算/传输重叠流

2. 训练迭代：
   hipMemPrefetchAsync()  // 预取下一批数据
   hipMallocFromPoolAsync()  // 从内存池分配临时张量
   hipModuleLaunchKernel()  // 启动AI内核
   hipEventRecord()  // 记录性能事件

3. 清理阶段：
   hipFree() / hipHostFree()
   hipStreamDestroy()
   hipMemPoolTrimTo()  // 内存池整理

八、AI优化专用API

1. 张量内存分配（AI专用）

c 复制代码

// Tensor allocation driven by a tensor descriptor.
// NOTE(review): hipMallocTensor, hipMallocTensorAligned, hipTensorDataType and
// hipTensorDescriptor_t do not appear in the public HIP runtime API reference —
// confirm where these declarations come from.
// NOTE(review): hipTensorDescriptor_t is used here before its typedef below;
// a real header would need the typedef first.
hipError_t hipMallocTensor(void** ptr, const hipTensorDescriptor_t* desc);
hipError_t hipMallocTensorAligned(void** ptr, const hipTensorDescriptor_t* desc, size_t alignment);

typedef struct hipTensorDescriptor {
    hipTensorDataType dataType;      // data type (FP16/FP32/BF16/INT8)
    int dimensions[8];               // tensor dimensions
    int strides[8];                  // memory strides
    size_t alignment;                // alignment requirement
} hipTensorDescriptor_t;

2. 工作空间分配（AI算子临时内存）

c 复制代码

// Workspace allocation: takes a request and a policy, and reports a size
// through `allocatedSize`.
// NOTE(review): hipAllocWorkspace and hipWorkspaceRequest do not appear in the
// public HIP runtime API reference — confirm their origin.
hipError_t hipAllocWorkspace(size_t* allocatedSize, 
                            const hipWorkspaceRequest* request,
                            hipWorkspacePolicy policy);

// Workspace sizing policies, numbered explicitly from 0.
// NOTE(review): hipWorkspacePolicy does not appear in the public HIP runtime
// API reference — confirm its origin.
typedef enum hipWorkspacePolicy {
    hipWorkspacePolicyMinimal      = 0,  // smallest workspace
    hipWorkspacePolicyConservative = 1,  // conservative allocation
    hipWorkspacePolicyAggressive   = 2   // aggressive allocation
} hipWorkspacePolicy;

关键设计特点：

分层分配策略：基础分配 → 优化分配 → AI专用分配
异步分配支持：内存池、流的异步分配减少同步开销
AI工作负载优化：张量对齐、混合精度内存、内核缓存
资源重用机制：内存池、事件池、流池提高资源利用率
硬件感知分配：NUMA优化、bank配置、缓存策略

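这些API为AI应用提供了从基础内存分配到高级优化策略的资源管理能力，适用于大模型训练和推理场景。需要注意的是，文中部分接口（如 hipMallocAligned、hipMallocNuma、hipStreamComputeOnly、hipStreamGetDeviceDefault/hipStreamSetDeviceDefault、事件池、内核缓存、hipMallocTensor、hipAllocWorkspace 等）并未出现在公开的HIP Runtime API中，使用前请以官方HIP文档和头文件为准。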