A Deep Dive into the CANN ACL Application-Layer Interface: Building an Efficient AI Application Development Framework
Introduction
The application-layer interface of the CANN platform (Ascend Computing Language, ACL) gives developers a concise, easy-to-use programming interface and serves as the bridge between upper-layer applications and the underlying runtime environment. ACL encapsulates the complex hardware operations below it so that developers can focus on business logic rather than low-level details. This article dissects ACL's architectural design and code implementation.
Related links:
- CANN organization: https://atomgit.com/cann
- acl repository: https://atomgit.com/cann/acl
1. ACL Core Architecture and Initialization
1.1 ACL Framework Design
cpp
// ACL core framework header
#ifndef CANN_ACL_FRAMEWORK_H
#define CANN_ACL_FRAMEWORK_H

#include <memory>
#include <string>
#include <vector>
#include <map>
#include <mutex>

#include "acl/acl.h"  // CANN runtime: aclInit/aclFinalize and the aclrt* APIs

namespace acl {

// Error codes
enum class Error {
    NONE = 0,
    INVALID_PARAM = 100,
    OUT_OF_MEMORY = 101,
    INTERNAL_ERROR = 102,
    DEVICE_NOT_FOUND = 103,
    // ... more error codes
};

// Return status
class Status {
private:
    Error error_;
    std::string message_;

public:
    Status() : error_(Error::NONE), message_("") {}
    Status(Error error, const std::string& message)
        : error_(error), message_(message) {}

    bool IsOk() const { return error_ == Error::NONE; }
    Error GetError() const { return error_; }
    const std::string& GetMessage() const { return message_; }

    static Status OK() { return Status(); }
    static Status InvalidParam(const std::string& msg) {
        return Status(Error::INVALID_PARAM, msg);
    }
    static Status OutOfMemory(const std::string& msg) {
        return Status(Error::OUT_OF_MEMORY, msg);
    }
};

// ACL context (process-wide singleton)
class Context {
private:
    static std::shared_ptr<Context> instance_;
    static std::mutex mutex_;

    bool initialized_;
    int device_id_;
    std::map<std::string, void*> resources_;

    Context() : initialized_(false), device_id_(-1) {}

public:
    static std::shared_ptr<Context> GetInstance() {
        std::lock_guard<std::mutex> lock(mutex_);
        if (!instance_) {
            instance_ = std::shared_ptr<Context>(new Context());
        }
        return instance_;
    }

    Status Initialize(int device_id = 0) {
        if (initialized_) {
            return Status::OK();
        }
        // Initialize the underlying runtime (nullptr = no config file)
        auto ret = aclInit(nullptr);
        if (ret != 0) {
            return Status(Error::INTERNAL_ERROR, "Failed to initialize ACL");
        }
        // Bind the device
        ret = aclrtSetDevice(device_id);
        if (ret != 0) {
            return Status(Error::DEVICE_NOT_FOUND, "Failed to set device");
        }
        device_id_ = device_id;
        initialized_ = true;
        return Status::OK();
    }

    Status Finalize() {
        if (!initialized_) {
            return Status::OK();
        }
        // Release registered resources
        for (auto& pair : resources_) {
            // Release according to the resource type
        }
        resources_.clear();
        // Reset the device
        aclrtResetDevice(device_id_);
        // Tear down ACL
        aclFinalize();
        initialized_ = false;
        device_id_ = -1;
        return Status::OK();
    }

    bool IsInitialized() const { return initialized_; }
    int GetDeviceId() const { return device_id_; }

    void SetResource(const std::string& key, void* resource) {
        resources_[key] = resource;
    }
    void* GetResource(const std::string& key) {
        auto it = resources_.find(key);
        if (it != resources_.end()) {
            return it->second;
        }
        return nullptr;
    }
};

// Define the static members. Note: defining them in a header is only safe
// if the header is included in a single translation unit; with C++17,
// prefer `inline` definitions.
std::shared_ptr<Context> Context::instance_ = nullptr;
std::mutex Context::mutex_;

} // namespace acl
#endif // CANN_ACL_FRAMEWORK_H
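With the header in place, an application drives the whole lifecycle through the singleton. The following is a minimal usage sketch of the `Context` class defined above; the `ContextGuard` RAII helper is an addition for exception safety, not part of the header:
cpp
#include "acl_framework.h"
#include <iostream>
#include <stdexcept>

// Small RAII guard around Context (illustrative, not part of the header)
struct ContextGuard {
    explicit ContextGuard(int device_id = 0) {
        auto status = acl::Context::GetInstance()->Initialize(device_id);
        if (!status.IsOk()) {
            throw std::runtime_error(status.GetMessage());
        }
    }
    ~ContextGuard() { acl::Context::GetInstance()->Finalize(); }
};

int main() {
    ContextGuard guard(0);  // aclInit + aclrtSetDevice(0)
    auto ctx = acl::Context::GetInstance();
    std::cout << "ACL initialized on device " << ctx->GetDeviceId() << std::endl;
    return 0;  // the guard's destructor resets the device and finalizes ACL
}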
1.2 Python ACL Wrapper
python
import threading
import atexit
from typing import Optional, Dict, Any

import acl  # pyACL bindings shipped with CANN


class ACLContext:
    """ACL context manager (process-wide singleton)"""
    _instance = None
    _lock = threading.Lock()

    def __new__(cls):
        if cls._instance is None:
            with cls._lock:
                if cls._instance is None:
                    cls._instance = super().__new__(cls)
                    cls._instance._initialized = False
                    cls._instance._device_id = -1
                    cls._instance._resources = {}
        return cls._instance

    def initialize(self, device_id: int = 0) -> bool:
        """Initialize ACL and bind the device"""
        if self._initialized:
            return True
        # Initialize ACL
        ret = acl.init()
        if ret != 0:
            raise RuntimeError(f"Failed to initialize ACL: {ret}")
        # Bind the device
        ret = acl.rt.set_device(device_id)
        if ret != 0:
            raise RuntimeError(f"Failed to set device {device_id}: {ret}")
        self._device_id = device_id
        self._initialized = True
        # Register the cleanup hook
        atexit.register(self.finalize)
        return True

    def finalize(self):
        """Tear down ACL"""
        if not self._initialized:
            return
        # Clean up registered resources
        for name, resource in self._resources.items():
            try:
                resource.cleanup()
            except Exception as e:
                print(f"Error cleaning up {name}: {e}")
        self._resources.clear()
        # Reset the device
        acl.rt.reset_device(self._device_id)
        # Finalize ACL
        acl.finalize()
        self._initialized = False
        self._device_id = -1

    def __enter__(self):
        """Context-manager entry"""
        self.initialize()
        return self

    def __exit__(self, exc_type, exc_val, exc_tb):
        """Context-manager exit"""
        self.finalize()

    def register_resource(self, name: str, resource):
        """Register a resource"""
        self._resources[name] = resource

    def unregister_resource(self, name: str):
        """Unregister a resource"""
        if name in self._resources:
            del self._resources[name]

    @property
    def device_id(self) -> int:
        """Device ID"""
        return self._device_id

    @property
    def is_initialized(self) -> bool:
        """Whether ACL has been initialized"""
        return self._initialized


# Global ACL context
acl_context = ACLContext()


def initialize_acl(device_id: int = 0) -> ACLContext:
    """Initialize ACL"""
    acl_context.initialize(device_id)
    return acl_context


def finalize_acl():
    """Tear down ACL"""
    acl_context.finalize()


# Usage example
if __name__ == "__main__":
    # Use the context manager
    with ACLContext() as ctx:
        print(f"ACL initialized on device {ctx.device_id}")
        print(f"Is initialized: {ctx.is_initialized}")
2. Tensors and Data Operations
2.1 Tensor Class Implementation
cpp
// Tensor class definitions
#ifndef CANN_ACL_TENSOR_H
#define CANN_ACL_TENSOR_H

#include <cstdlib>
#include <cstring>
#include <memory>
#include <stdexcept>
#include <string>
#include <vector>

#include "acl_framework.h"

namespace acl {

// Data types
enum class DataType {
    DT_FLOAT = 0,
    DT_FLOAT16 = 1,
    DT_INT8 = 2,
    DT_INT16 = 3,
    DT_INT32 = 4,
    DT_INT64 = 5,
    DT_UINT8 = 6,
    DT_UINT16 = 7,
    DT_UINT32 = 8,
    DT_UINT64 = 9,
    DT_BOOL = 10,
    DT_DOUBLE = 11,
};

// Tensor shape
class Shape {
private:
    std::vector<int64_t> dims_;

public:
    Shape() = default;
    explicit Shape(const std::vector<int64_t>& dims) : dims_(dims) {}

    int64_t operator[](size_t index) const { return dims_[index]; }
    size_t Size() const { return dims_.size(); }

    // Needed by Tensor::CopyFrom / ToHost / ToDevice below
    bool operator==(const Shape& other) const { return dims_ == other.dims_; }
    bool operator!=(const Shape& other) const { return !(*this == other); }

    int64_t NumElements() const {
        if (dims_.empty()) return 0;
        int64_t count = 1;
        for (auto dim : dims_) {
            count *= dim;
        }
        return count;
    }

    const std::vector<int64_t>& GetDims() const { return dims_; }
    void AddDim(int64_t dim) { dims_.push_back(dim); }
    void SetDim(size_t index, int64_t dim) { dims_[index] = dim; }

    std::string ToString() const {
        std::string str = "[";
        for (size_t i = 0; i < dims_.size(); ++i) {
            if (i > 0) str += ", ";
            str += std::to_string(dims_[i]);
        }
        str += "]";
        return str;
    }
};

// Tensor descriptor
class TensorDesc {
private:
    DataType data_type_;
    Shape shape_;
    std::string format_;
    size_t element_size_;

public:
    TensorDesc() : data_type_(DataType::DT_FLOAT),
                   format_("NCHW"), element_size_(4) {}
    TensorDesc(DataType data_type, const Shape& shape,
               const std::string& format = "NCHW")
        : data_type_(data_type), shape_(shape), format_(format) {
        element_size_ = GetDataTypeSize(data_type);
    }

    DataType GetDataType() const { return data_type_; }
    void SetDataType(DataType dtype) {
        data_type_ = dtype;
        element_size_ = GetDataTypeSize(dtype);
    }
    const Shape& GetShape() const { return shape_; }
    void SetShape(const Shape& shape) { shape_ = shape; }
    const std::string& GetFormat() const { return format_; }
    void SetFormat(const std::string& format) { format_ = format; }
    size_t GetElementSize() const { return element_size_; }
    size_t GetTotalSize() const {
        return shape_.NumElements() * element_size_;
    }

    static size_t GetDataTypeSize(DataType dtype) {
        switch (dtype) {
            case DataType::DT_FLOAT:
            case DataType::DT_INT32:
            case DataType::DT_UINT32:
                return 4;
            case DataType::DT_FLOAT16:
            case DataType::DT_INT16:
            case DataType::DT_UINT16:
                return 2;
            case DataType::DT_INT8:
            case DataType::DT_UINT8:
            case DataType::DT_BOOL:
                return 1;
            case DataType::DT_INT64:
            case DataType::DT_UINT64:
            case DataType::DT_DOUBLE:
                return 8;
            default:
                return 4;
        }
    }
};

// Data buffer
class DataBuffer {
private:
    void* data_;
    size_t size_;
    bool is_device_;
    bool owns_data_;

public:
    DataBuffer() : data_(nullptr), size_(0), is_device_(false), owns_data_(false) {}
    DataBuffer(void* data, size_t size, bool is_device = false, bool owns_data = true)
        : data_(data), size_(size), is_device_(is_device), owns_data_(owns_data) {}

    // The buffer may own raw memory, so copying would double-free;
    // allow moves only.
    DataBuffer(const DataBuffer&) = delete;
    DataBuffer& operator=(const DataBuffer&) = delete;
    DataBuffer(DataBuffer&& other) noexcept
        : data_(other.data_), size_(other.size_),
          is_device_(other.is_device_), owns_data_(other.owns_data_) {
        other.data_ = nullptr;
        other.owns_data_ = false;
    }

    ~DataBuffer() {
        if (owns_data_ && data_ != nullptr) {
            if (is_device_) {
                aclrtFree(data_);
            } else {
                free(data_);
            }
        }
    }

    void* GetData() const { return data_; }
    size_t GetSize() const { return size_; }
    bool IsDevice() const { return is_device_; }

    void SetData(void* data, size_t size, bool is_device = false, bool owns_data = true) {
        if (owns_data_ && data_ != nullptr) {
            if (is_device_) {
                aclrtFree(data_);
            } else {
                free(data_);
            }
        }
        data_ = data;
        size_ = size;
        is_device_ = is_device;
        owns_data_ = owns_data;
    }

    void CopyFrom(const DataBuffer& other) {
        if (size_ != other.size_) {
            throw std::runtime_error("Buffer size mismatch");
        }
        if (is_device_ && other.is_device_) {
            aclrtMemcpy(data_, size_, other.data_, other.size_,
                        ACL_MEMCPY_DEVICE_TO_DEVICE);
        } else if (is_device_ && !other.is_device_) {
            aclrtMemcpy(data_, size_, other.data_, other.size_,
                        ACL_MEMCPY_HOST_TO_DEVICE);
        } else if (!is_device_ && other.is_device_) {
            aclrtMemcpy(data_, size_, other.data_, other.size_,
                        ACL_MEMCPY_DEVICE_TO_HOST);
        } else {
            memcpy(data_, other.data_, size_);
        }
    }
};

// Tensor class
class Tensor {
private:
    TensorDesc desc_;
    DataBuffer buffer_;
    std::string name_;

public:
    Tensor() = default;

    // Allocates device memory of the described size
    Tensor(const std::string& name, const TensorDesc& desc)
        : desc_(desc), name_(name) {
        void* data = nullptr;
        size_t size = desc_.GetTotalSize();
        auto ret = aclrtMalloc(&data, size, ACL_MEM_MALLOC_HUGE_FIRST);
        if (ret != ACL_ERROR_NONE) {
            throw std::runtime_error("Failed to allocate device memory");
        }
        buffer_.SetData(data, size, true, true);
    }

    // Wraps externally owned memory (host or device) without taking ownership
    Tensor(const std::string& name, const TensorDesc& desc, void* data, bool is_device = false)
        : desc_(desc), name_(name) {
        size_t size = desc_.GetTotalSize();
        buffer_.SetData(data, size, is_device, false);
    }

    const TensorDesc& GetDesc() const { return desc_; }
    TensorDesc& GetDesc() { return desc_; }
    const DataBuffer& GetBuffer() const { return buffer_; }
    DataBuffer& GetBuffer() { return buffer_; }
    const std::string& GetName() const { return name_; }
    void SetName(const std::string& name) { name_ = name; }
    size_t GetNumElements() const { return desc_.GetShape().NumElements(); }
    size_t GetSizeInBytes() const { return desc_.GetTotalSize(); }

    Status CopyFrom(const Tensor& other) {
        if (desc_.GetShape() != other.desc_.GetShape() ||
            desc_.GetDataType() != other.desc_.GetDataType()) {
            return Status::InvalidParam("Tensor shape or dtype mismatch");
        }
        buffer_.CopyFrom(other.buffer_);
        return Status::OK();
    }

    Status ToHost(Tensor& host_tensor) const {
        if (host_tensor.desc_.GetShape() != desc_.GetShape() ||
            host_tensor.desc_.GetDataType() != desc_.GetDataType()) {
            return Status::InvalidParam("Tensor shape or dtype mismatch");
        }
        auto ret = aclrtMemcpy(host_tensor.buffer_.GetData(),
                               host_tensor.buffer_.GetSize(),
                               buffer_.GetData(), buffer_.GetSize(),
                               ACL_MEMCPY_DEVICE_TO_HOST);
        if (ret != ACL_ERROR_NONE) {
            return Status(Error::INTERNAL_ERROR, "Failed to copy to host");
        }
        return Status::OK();
    }

    Status ToDevice(Tensor& device_tensor) const {
        if (device_tensor.desc_.GetShape() != desc_.GetShape() ||
            device_tensor.desc_.GetDataType() != desc_.GetDataType()) {
            return Status::InvalidParam("Tensor shape or dtype mismatch");
        }
        auto ret = aclrtMemcpy(device_tensor.buffer_.GetData(),
                               device_tensor.buffer_.GetSize(),
                               buffer_.GetData(), buffer_.GetSize(),
                               ACL_MEMCPY_HOST_TO_DEVICE);
        if (ret != ACL_ERROR_NONE) {
            return Status(Error::INTERNAL_ERROR, "Failed to copy to device");
        }
        return Status::OK();
    }
};

} // namespace acl
#endif // CANN_ACL_TENSOR_H
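A typical host/device round trip with these classes looks like the sketch below. It is illustrative usage of the header above, assuming an initialized `Context`; the host buffer is allocated with plain `malloc` and wrapped without ownership transfer:
cpp
#include "acl_tensor.h"
#include <cstdlib>
#include <iostream>

int main() {
    acl::Context::GetInstance()->Initialize(0);
    {
        acl::TensorDesc desc(acl::DataType::DT_FLOAT,
                             acl::Shape({1, 3, 224, 224}));

        // Host-side staging buffer, wrapped without ownership transfer
        void* host_buf = malloc(desc.GetTotalSize());
        acl::Tensor host_in("input_host", desc, host_buf, /*is_device=*/false);

        // Device tensor: the constructor calls aclrtMalloc internally
        acl::Tensor dev("input_dev", desc);

        // Host -> device, then back again
        auto status = host_in.ToDevice(dev);
        if (status.IsOk()) {
            status = dev.ToHost(host_in);
        }
        std::cout << "round trip: " << (status.IsOk() ? "ok" : status.GetMessage())
                  << ", bytes=" << dev.GetSizeInBytes() << std::endl;

        free(host_buf);
    }  // the device tensor is freed here, before Finalize()
    acl::Context::GetInstance()->Finalize();
    return 0;
}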
2.2 Python Tensor Operations
python
import numpy as np
from typing import Union, Tuple, Optional

import acl

# pyACL does not export the C enum constants; the values below mirror
# acl_base.h / acl_rt.h -- verify them against your CANN version.
ACL_MEM_MALLOC_HUGE_FIRST = 0
ACL_MEMCPY_HOST_TO_DEVICE = 1
ACL_MEMCPY_DEVICE_TO_HOST = 2
ACL_MEMCPY_DEVICE_TO_DEVICE = 3
ACL_FLOAT, ACL_FLOAT16, ACL_INT8, ACL_INT32, ACL_UINT8 = 0, 1, 2, 3, 4
ACL_INT16, ACL_UINT16, ACL_UINT32, ACL_INT64, ACL_UINT64 = 6, 7, 8, 9, 10
ACL_DOUBLE, ACL_BOOL = 11, 12


class Tensor:
    """Host-side tensor"""

    def __init__(self, data: Union[np.ndarray, bytes], name: str = ""):
        self.name = name
        if isinstance(data, np.ndarray):
            self._from_numpy(data)
        elif isinstance(data, bytes):
            self._from_bytes(data)
        else:
            raise TypeError("Unsupported data type")

    def _from_numpy(self, arr: np.ndarray):
        """Build from a numpy array"""
        self.dtype = self._numpy_to_acl_dtype(arr.dtype)
        self.shape = arr.shape
        self.data = arr.tobytes()
        self.size = arr.nbytes
        self.is_device = False

    def _from_bytes(self, data: bytes):
        """Build from raw bytes (shape/dtype unknown)"""
        self.data = data
        self.size = len(data)
        self.shape = None
        self.dtype = None
        self.is_device = False

    @staticmethod
    def _numpy_to_acl_dtype(dtype) -> int:
        """Map a numpy dtype to an ACL data type"""
        dtype_map = {
            np.float32: ACL_FLOAT,
            np.float16: ACL_FLOAT16,
            np.int32: ACL_INT32,
            np.int64: ACL_INT64,
            np.int8: ACL_INT8,
            np.uint8: ACL_UINT8,
            np.bool_: ACL_BOOL,
        }
        return dtype_map.get(np.dtype(dtype).type, ACL_FLOAT)

    @staticmethod
    def _acl_to_numpy_dtype(dtype: int):
        """Map an ACL data type back to a numpy dtype"""
        dtype_map = {
            ACL_FLOAT: np.float32,
            ACL_FLOAT16: np.float16,
            ACL_INT32: np.int32,
            ACL_INT64: np.int64,
            ACL_INT8: np.int8,
            ACL_UINT8: np.uint8,
            ACL_BOOL: np.bool_,
        }
        return dtype_map.get(dtype, np.float32)

    def to_device(self) -> 'DeviceTensor':
        """Transfer to the device"""
        # acl.rt.malloc returns (ptr, ret)
        ptr, ret = acl.rt.malloc(self.size, ACL_MEM_MALLOC_HUGE_FIRST)
        if ret != 0:
            raise RuntimeError(f"acl.rt.malloc failed: {ret}")
        # acl.rt.memcpy takes (dst, dst_max, src, count, kind).
        # bytes_to_ptr/ptr_to_bytes exist in recent pyACL; older releases
        # used acl.util.numpy_to_ptr / ptr_to_numpy instead.
        host_ptr = acl.util.bytes_to_ptr(self.data)
        ret = acl.rt.memcpy(ptr, self.size, host_ptr, self.size,
                            ACL_MEMCPY_HOST_TO_DEVICE)
        if ret != 0:
            raise RuntimeError(f"acl.rt.memcpy failed: {ret}")
        return DeviceTensor(ptr, self.size, self.dtype, self.shape)

    def to_numpy(self) -> np.ndarray:
        """Convert to a numpy array"""
        if self.shape is None:
            raise ValueError("Shape is not set")
        dtype = self._acl_to_numpy_dtype(self.dtype)
        return np.frombuffer(self.data, dtype=dtype).reshape(self.shape)

    def __repr__(self):
        return f"Tensor(shape={self.shape}, dtype={self.dtype}, size={self.size})"


class DeviceTensor:
    """Device-side tensor"""

    def __init__(self, ptr: int, size: int, dtype: int, shape: Optional[Tuple] = None):
        self.ptr = ptr
        self.size = size
        self.dtype = dtype
        self.shape = shape
        self.is_device = True

    def __del__(self):
        """Best-effort release; explicit lifetime management is safer"""
        if self.ptr != 0:
            acl.rt.free(self.ptr)
            self.ptr = 0

    def to_host(self) -> Tensor:
        """Transfer back to the host"""
        host_ptr, ret = acl.rt.malloc_host(self.size)
        if ret != 0:
            raise RuntimeError(f"acl.rt.malloc_host failed: {ret}")
        ret = acl.rt.memcpy(host_ptr, self.size, self.ptr, self.size,
                            ACL_MEMCPY_DEVICE_TO_HOST)
        if ret != 0:
            acl.rt.free_host(host_ptr)
            raise RuntimeError(f"acl.rt.memcpy failed: {ret}")
        data = acl.util.ptr_to_bytes(host_ptr, self.size)
        acl.rt.free_host(host_ptr)
        tensor = Tensor(data)
        tensor.shape = self.shape  # carry shape/dtype across the copy
        tensor.dtype = self.dtype
        return tensor

    def copy_from(self, src: Union['DeviceTensor', Tensor]):
        """Copy from a source tensor (device or host)"""
        count = min(self.size, src.size)
        if isinstance(src, DeviceTensor):
            acl.rt.memcpy(self.ptr, self.size, src.ptr, count,
                          ACL_MEMCPY_DEVICE_TO_DEVICE)
        elif isinstance(src, Tensor):
            acl.rt.memcpy(self.ptr, self.size, acl.util.bytes_to_ptr(src.data),
                          count, ACL_MEMCPY_HOST_TO_DEVICE)

    def copy_to(self, dst: 'DeviceTensor'):
        """Copy into another device tensor (use to_host() for host copies)"""
        acl.rt.memcpy(dst.ptr, dst.size, self.ptr, min(self.size, dst.size),
                      ACL_MEMCPY_DEVICE_TO_DEVICE)

    def to_numpy(self) -> np.ndarray:
        """Convert to a numpy array"""
        if self.shape is None:
            raise ValueError("Shape is not set")
        dtype = Tensor._acl_to_numpy_dtype(self.dtype)
        arr = np.empty(self.shape, dtype=dtype)
        # The numpy buffer's address is passed as the host destination
        acl.rt.memcpy(arr.ctypes.data, self.size, self.ptr, self.size,
                      ACL_MEMCPY_DEVICE_TO_HOST)
        return arr

    def __repr__(self):
        return f"DeviceTensor(ptr=0x{self.ptr:x}, size={self.size}, dtype={self.dtype})"


# Factory functions
def from_numpy(arr: np.ndarray, name: str = "") -> Tensor:
    """Create a tensor from a numpy array"""
    return Tensor(arr, name)


def zeros(shape: Tuple, dtype=np.float32, name: str = "") -> Tensor:
    """Create an all-zeros tensor"""
    return Tensor(np.zeros(shape, dtype=dtype), name)


def ones(shape: Tuple, dtype=np.float32, name: str = "") -> Tensor:
    """Create an all-ones tensor"""
    return Tensor(np.ones(shape, dtype=dtype), name)


def randn(shape: Tuple, dtype=np.float32, name: str = "") -> Tensor:
    """Create a random-normal tensor"""
    return Tensor(np.random.randn(*shape).astype(dtype), name)


# Usage example
if __name__ == "__main__":
    # Create a tensor from numpy
    arr = np.random.randn(2, 3, 4).astype(np.float32)
    tensor = from_numpy(arr, "input")
    print(f"Created tensor: {tensor}")
    # Transfer to the device
    device_tensor = tensor.to_device()
    print(f"Device tensor: {device_tensor}")
    # Transfer back to the host
    host_tensor = device_tensor.to_host()
    print(f"Host tensor: {host_tensor}")
    # Convert to numpy
    arr2 = host_tensor.to_numpy()
    print(f"Numpy array shape: {arr2.shape}")
    # Verify the data survived the round trip
    assert np.allclose(arr, arr2)
    print("Data verification passed!")
3. Operator Execution Interface
3.1 Operator Executor Implementation
cpp
// Operator executor
#ifndef CANN_ACL_OPERATOR_H
#define CANN_ACL_OPERATOR_H

#include <map>
#include <string>

#include "acl_tensor.h"

namespace acl {

// Operator attributes
class OpAttr {
private:
    std::map<std::string, int> int_attrs_;
    std::map<std::string, float> float_attrs_;
    std::map<std::string, std::string> string_attrs_;
    std::map<std::string, std::vector<int>> int_list_attrs_;
    std::map<std::string, std::vector<float>> float_list_attrs_;
    std::map<std::string, bool> bool_attrs_;

public:
    void SetInt(const std::string& name, int value) {
        int_attrs_[name] = value;
    }
    void SetFloat(const std::string& name, float value) {
        float_attrs_[name] = value;
    }
    void SetString(const std::string& name, const std::string& value) {
        string_attrs_[name] = value;
    }
    void SetIntList(const std::string& name, const std::vector<int>& value) {
        int_list_attrs_[name] = value;
    }
    void SetFloatList(const std::string& name, const std::vector<float>& value) {
        float_list_attrs_[name] = value;
    }
    void SetBool(const std::string& name, bool value) {
        bool_attrs_[name] = value;
    }

    int GetInt(const std::string& name, int default_value = 0) const {
        auto it = int_attrs_.find(name);
        return it != int_attrs_.end() ? it->second : default_value;
    }
    float GetFloat(const std::string& name, float default_value = 0.0f) const {
        auto it = float_attrs_.find(name);
        return it != float_attrs_.end() ? it->second : default_value;
    }
    std::string GetString(const std::string& name, const std::string& default_value = "") const {
        auto it = string_attrs_.find(name);
        return it != string_attrs_.end() ? it->second : default_value;
    }
    std::vector<int> GetIntList(const std::string& name) const {
        auto it = int_list_attrs_.find(name);
        return it != int_list_attrs_.end() ? it->second : std::vector<int>();
    }
    std::vector<float> GetFloatList(const std::string& name) const {
        auto it = float_list_attrs_.find(name);
        return it != float_list_attrs_.end() ? it->second : std::vector<float>();
    }
    bool GetBool(const std::string& name, bool default_value = false) const {
        auto it = bool_attrs_.find(name);
        return it != bool_attrs_.end() ? it->second : default_value;
    }
};

// Operator descriptor
class OpDesc {
private:
    std::string op_type_;
    std::string op_name_;
    OpAttr attrs_;
    std::vector<TensorDesc> input_descs_;
    std::vector<TensorDesc> output_descs_;

public:
    OpDesc(const std::string& op_type, const std::string& op_name = "")
        : op_type_(op_type), op_name_(op_name.empty() ? op_type : op_name) {}

    const std::string& GetType() const { return op_type_; }
    const std::string& GetName() const { return op_name_; }
    OpAttr& GetAttrs() { return attrs_; }
    const OpAttr& GetAttrs() const { return attrs_; }

    void AddInputDesc(const TensorDesc& desc) {
        input_descs_.push_back(desc);
    }
    void AddOutputDesc(const TensorDesc& desc) {
        output_descs_.push_back(desc);
    }
    const std::vector<TensorDesc>& GetInputDescs() const { return input_descs_; }
    const std::vector<TensorDesc>& GetOutputDescs() const { return output_descs_; }
};

// Operator executor.
// NOTE: the aclopDesc-based calls below are a simplified illustration of
// the single-operator flow; the public ACL API composes aclTensorDesc and
// aclopAttr objects and calls aclopExecuteV2 -- consult the acl repository
// for the exact signatures.
class OperatorExecutor {
private:
    std::shared_ptr<Context> context_;

    Status CreateAclopDesc(const OpDesc& op_desc, aclopDesc*& acl_op_desc) {
        // Create the ACL operator descriptor
        acl_op_desc = aclopCreateDesc(op_desc.GetType().c_str(),
                                      op_desc.GetName().c_str(),
                                      op_desc.GetInputDescs().size(),
                                      op_desc.GetOutputDescs().size(),
                                      ACL_OP_EXEC_MODE_AICPU);
        if (acl_op_desc == nullptr) {
            return Status(Error::INTERNAL_ERROR, "Failed to create op desc");
        }
        // Attach the input/output descriptors
        for (size_t i = 0; i < op_desc.GetInputDescs().size(); ++i) {
            auto& desc = op_desc.GetInputDescs()[i];
            // Set the input descriptor...
        }
        for (size_t i = 0; i < op_desc.GetOutputDescs().size(); ++i) {
            auto& desc = op_desc.GetOutputDescs()[i];
            // Set the output descriptor...
        }
        return Status::OK();
    }

    Status SetOpAttrs(const OpDesc& op_desc, aclopDesc* acl_op_desc) {
        // Set operator attributes. Each attribute type needs its own
        // setter call, so only the skeleton is shown here.
        auto& attrs = op_desc.GetAttrs();
        return Status::OK();
    }

public:
    OperatorExecutor() : context_(Context::GetInstance()) {}

    Status Execute(const OpDesc& op_desc,
                   const std::vector<DataBuffer>& inputs,
                   std::vector<DataBuffer>& outputs,
                   aclrtStream stream = nullptr) {
        // Create the ACL operator descriptor
        aclopDesc* acl_op_desc = nullptr;
        auto status = CreateAclopDesc(op_desc, acl_op_desc);
        if (!status.IsOk()) {
            return status;
        }
        // Set operator attributes
        status = SetOpAttrs(op_desc, acl_op_desc);
        if (!status.IsOk()) {
            aclopDestroyDesc(acl_op_desc);
            return status;
        }
        // Wrap the input/output buffers
        std::vector<aclDataBuffer*> acl_inputs;
        std::vector<aclDataBuffer*> acl_outputs;
        for (const auto& input : inputs) {
            acl_inputs.push_back(aclCreateDataBuffer(
                input.GetData(), input.GetSize()));
        }
        for (const auto& output : outputs) {
            acl_outputs.push_back(aclCreateDataBuffer(
                output.GetData(), output.GetSize()));
        }
        // Launch the operator
        auto ret = aclopExecuteV2(acl_op_desc,
                                  acl_inputs.size(),
                                  acl_inputs.data(),
                                  acl_outputs.size(),
                                  acl_outputs.data(),
                                  stream);
        // Release the wrappers
        for (auto* buffer : acl_inputs) {
            aclDestroyDataBuffer(buffer);
        }
        for (auto* buffer : acl_outputs) {
            aclDestroyDataBuffer(buffer);
        }
        aclopDestroyDesc(acl_op_desc);
        if (ret != ACL_ERROR_NONE) {
            return Status(Error::INTERNAL_ERROR, "Failed to execute operator");
        }
        return Status::OK();
    }
};

} // namespace acl
#endif // CANN_ACL_OPERATOR_H
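Assuming the simplified descriptor API above, driving the executor for a single operator might look like the following sketch (the `Add` op type and tensor sizes are illustrative, and the caveat about the real ACL single-op API from the header applies here too):
cpp
#include "acl_operator.h"
#include <iostream>

int main() {
    acl::Context::GetInstance()->Initialize(0);
    {
        // Element-wise Add on two 1x1024 float tensors (illustrative)
        acl::TensorDesc desc(acl::DataType::DT_FLOAT, acl::Shape({1, 1024}));
        acl::Tensor a("a", desc), b("b", desc), out("out", desc);

        acl::OpDesc op_desc("Add");
        op_desc.AddInputDesc(desc);
        op_desc.AddInputDesc(desc);
        op_desc.AddOutputDesc(desc);

        // Wrap the device pointers without transferring ownership
        std::vector<acl::DataBuffer> inputs;
        inputs.emplace_back(a.GetBuffer().GetData(), a.GetSizeInBytes(), true, false);
        inputs.emplace_back(b.GetBuffer().GetData(), b.GetSizeInBytes(), true, false);
        std::vector<acl::DataBuffer> outputs;
        outputs.emplace_back(out.GetBuffer().GetData(), out.GetSizeInBytes(), true, false);

        acl::OperatorExecutor executor;
        auto status = executor.Execute(op_desc, inputs, outputs);
        std::cout << "Add: " << (status.IsOk() ? "ok" : status.GetMessage()) << std::endl;
    }
    acl::Context::GetInstance()->Finalize();
    return 0;
}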
3.2 Python Operator Interface
python
import numpy as np
from typing import List, Dict, Any, Optional

import acl

# Reuses DeviceTensor, from_numpy, and the ACL_* constants from section 2.2.


class Operator:
    """Single-operator wrapper"""

    def __init__(self, op_type: str, name: str = ""):
        self.op_type = op_type
        self.name = name or op_type
        self.attrs = {}
        self.input_descs = []
        self.output_descs = []

    def set_attr(self, name: str, value: Any):
        """Set an operator attribute"""
        self.attrs[name] = value
        return self

    def add_input(self, tensor: 'DeviceTensor'):
        """Register an input"""
        self.input_descs.append({
            'shape': tensor.shape,
            'dtype': tensor.dtype,
            'format': 'NCHW'
        })
        return self

    def add_output(self, shape: tuple, dtype: int):
        """Register an output"""
        self.output_descs.append({
            'shape': shape,
            'dtype': dtype,
            'format': 'NCHW'
        })
        return self

    def execute(self, inputs: List['DeviceTensor'],
                stream: Optional['Stream'] = None) -> List['DeviceTensor']:
        """Execute the operator"""
        # Allocate the output tensors
        outputs = []
        for desc in self.output_descs:
            size = 1
            for dim in desc['shape']:
                size *= dim
            size *= 4  # assumes float32 elements
            ptr, ret = acl.rt.malloc(size, ACL_MEM_MALLOC_HUGE_FIRST)
            if ret != 0:
                raise RuntimeError(f"acl.rt.malloc failed: {ret}")
            outputs.append(DeviceTensor(ptr, size, desc['dtype'], desc['shape']))
        # Wrap the device pointers in ACL data buffers
        acl_inputs = [acl.create_data_buffer(t.ptr, t.size) for t in inputs]
        acl_outputs = [acl.create_data_buffer(t.ptr, t.size) for t in outputs]
        # Launch the operator.
        # NOTE: acl.op.execute as used here is a simplified stand-in; the
        # actual pyACL single-operator API is acl.op.execute_v2, which takes
        # aclTensorDesc handles and an aclopAttr object instead of plain
        # dicts -- adapt this call to your CANN version.
        stream_handle = stream.handle if stream else None
        ret = acl.op.execute(
            self.op_type,
            self.name,
            len(self.input_descs),
            acl_inputs,
            len(self.output_descs),
            acl_outputs,
            self.attrs,
            stream_handle
        )
        # Release the buffer wrappers
        for buffer in acl_inputs:
            acl.destroy_data_buffer(buffer)
        for buffer in acl_outputs:
            acl.destroy_data_buffer(buffer)
        if ret != 0:
            raise RuntimeError(f"Failed to execute operator {self.op_type}: {ret}")
        return outputs


# Factory helpers for common operators
def conv2d(input_tensor: 'DeviceTensor',
           weight: 'DeviceTensor',
           bias: Optional['DeviceTensor'] = None,
           kernel_size: int = 3,
           stride: int = 1,
           padding: int = 0,
           name: str = "conv2d") -> 'DeviceTensor':
    """2D convolution"""
    op = Operator("Conv2D", name)
    # Attributes
    op.set_attr("strides", [stride, stride])
    op.set_attr("pads", [padding, padding, padding, padding])
    op.set_attr("dilations", [1, 1])
    op.set_attr("groups", 1)
    op.set_attr("data_format", "NCHW")
    # Output shape for an NCHW convolution
    in_shape = input_tensor.shape
    out_shape = (in_shape[0], weight.shape[0],
                 (in_shape[2] + 2 * padding - kernel_size) // stride + 1,
                 (in_shape[3] + 2 * padding - kernel_size) // stride + 1)
    # Inputs and output (the weight tensor must be registered too)
    op.add_input(input_tensor)
    op.add_input(weight)
    if bias is not None:
        op.add_input(bias)
    op.add_output(out_shape, input_tensor.dtype)
    # Execute
    inputs = [input_tensor, weight] + ([bias] if bias is not None else [])
    outputs = op.execute(inputs)
    return outputs[0]


def relu(input_tensor: 'DeviceTensor',
         name: str = "relu") -> 'DeviceTensor':
    """ReLU activation"""
    op = Operator("Relu", name)
    op.add_input(input_tensor)
    op.add_output(input_tensor.shape, input_tensor.dtype)
    outputs = op.execute([input_tensor])
    return outputs[0]


def matmul(a: 'DeviceTensor',
           b: 'DeviceTensor',
           transpose_a: bool = False,
           transpose_b: bool = False,
           name: str = "matmul") -> 'DeviceTensor':
    """Matrix multiplication"""
    op = Operator("MatMul", name)
    op.set_attr("transpose_x1", transpose_a)
    op.set_attr("transpose_x2", transpose_b)
    # Output shape after the optional transposes
    a_shape = a.shape if not transpose_a else (a.shape[1], a.shape[0])
    b_shape = b.shape if not transpose_b else (b.shape[1], b.shape[0])
    out_shape = (a_shape[0], b_shape[1])
    op.add_input(a)
    op.add_input(b)
    op.add_output(out_shape, a.dtype)
    outputs = op.execute([a, b])
    return outputs[0]


# Usage example
if __name__ == "__main__":
    # Initialize ACL (ACLContext from section 1.2)
    acl_context = ACLContext()
    acl_context.initialize()
    # Create a stream (Stream/StreamManager are small wrappers over
    # acl.rt.create_stream; see the sketch after this section)
    stream_mgr = StreamManager()
    stream = stream_mgr.create_stream()
    # Input tensors
    input_arr = np.random.randn(1, 3, 224, 224).astype(np.float32)
    weight_arr = np.random.randn(64, 3, 3, 3).astype(np.float32)
    input_tensor = from_numpy(input_arr).to_device()
    weight_tensor = from_numpy(weight_arr).to_device()
    # Convolution
    output_tensor = conv2d(input_tensor, weight_tensor,
                           kernel_size=3, stride=1, padding=1)
    # ReLU
    output_tensor = relu(output_tensor)
    # Synchronize the stream
    stream.synchronize()
    # Fetch the output
    output_arr = output_tensor.to_host().to_numpy()
    print(f"Output shape: {output_arr.shape}")
4. Model Inference Interface
4.1 Model Inference Engine
python
import numpy as np
from typing import Dict, List, Optional, Any

import acl

# Reuses the ACL_* constants from section 2.2 and the Stream wrapper
# sketched in section 3.2.


class ModelInference:
    """Offline-model (.om) inference engine"""

    def __init__(self, model_path: str, device_id: int = 0):
        self.model_path = model_path
        self.device_id = device_id
        self.model_id = 0
        self.model_desc = None
        self.input_info = []
        self.output_info = []
        self._load_model()

    def _load_model(self):
        """Load the model"""
        # acl.mdl.load_from_file returns (model_id, ret)
        self.model_id, ret = acl.mdl.load_from_file(self.model_path)
        if ret != 0:
            raise RuntimeError(f"Failed to load model: {ret}")
        # Fetch the model description
        self.model_desc = acl.mdl.create_desc()
        ret = acl.mdl.get_desc(self.model_desc, self.model_id)
        if ret != 0:
            raise RuntimeError(f"Failed to get model desc: {ret}")
        # Collect input/output metadata
        self._parse_model_info()
        print(f"Model loaded: {len(self.input_info)} inputs, {len(self.output_info)} outputs")

    def _parse_model_info(self):
        """Parse model input/output metadata.
        acl.mdl.get_input_dims / get_output_dims return a dict holding the
        tensor name and dimensions (check the exact keys against your
        pyACL version)."""
        num_inputs = acl.mdl.get_num_inputs(self.model_desc)
        num_outputs = acl.mdl.get_num_outputs(self.model_desc)
        for i in range(num_inputs):
            dims, ret = acl.mdl.get_input_dims(self.model_desc, i)
            self.input_info.append({
                'index': i,
                'name': dims['name'],
                'shape': tuple(dims['dims']),
                'dtype': acl.mdl.get_input_data_type(self.model_desc, i),
                'size': acl.mdl.get_input_size_by_index(self.model_desc, i)
            })
        for i in range(num_outputs):
            dims, ret = acl.mdl.get_output_dims(self.model_desc, i)
            self.output_info.append({
                'index': i,
                'name': dims['name'],
                'shape': tuple(dims['dims']),
                'dtype': acl.mdl.get_output_data_type(self.model_desc, i),
                'size': acl.mdl.get_output_size_by_index(self.model_desc, i)
            })

    @staticmethod
    def _destroy_dataset(dataset):
        """Destroy a dataset together with the data buffers it holds"""
        num = acl.mdl.get_dataset_num_buffers(dataset)
        for i in range(num):
            buf = acl.mdl.get_dataset_buffer(dataset, i)
            if buf:
                acl.destroy_data_buffer(buf)
        acl.mdl.destroy_dataset(dataset)

    def infer(self, inputs: List[np.ndarray],
              stream: Optional['Stream'] = None) -> List[np.ndarray]:
        """Run inference. acl.mdl.execute is synchronous; when a stream is
        given, acl.mdl.execute_async is used and the stream is synchronized
        before the outputs are copied back."""
        if len(inputs) != len(self.input_info):
            raise ValueError(f"Expected {len(self.input_info)} inputs, got {len(inputs)}")
        # Build the input dataset
        input_dataset = acl.mdl.create_dataset()
        input_mems = []  # keep the device memory alive during execution
        for i, input_data in enumerate(inputs):
            # Validate the input shape
            expected_shape = self.input_info[i]['shape']
            if input_data.shape != expected_shape:
                raise ValueError(f"Input {i} shape mismatch: expected {expected_shape}, got {input_data.shape}")
            # Allocate device memory and upload the input
            # (DeviceMemory is a small helper; see the sketch after this section)
            dev_mem = DeviceMemory.allocate(input_data.nbytes)
            dev_mem.copy_from(input_data)
            input_mems.append(dev_mem)
            # Wrap it in a data buffer
            buffer = acl.create_data_buffer(dev_mem.ptr, input_data.nbytes)
            _, ret = acl.mdl.add_dataset_buffer(input_dataset, buffer)
        # Build the output dataset
        output_dataset = acl.mdl.create_dataset()
        output_mems = []
        for i, output_info in enumerate(self.output_info):
            dev_mem = DeviceMemory.allocate(output_info['size'])
            output_mems.append(dev_mem)
            buffer = acl.create_data_buffer(dev_mem.ptr, output_info['size'])
            _, ret = acl.mdl.add_dataset_buffer(output_dataset, buffer)
        # Execute
        if stream is not None:
            ret = acl.mdl.execute_async(self.model_id, input_dataset,
                                        output_dataset, stream.handle)
        else:
            ret = acl.mdl.execute(self.model_id, input_dataset, output_dataset)
        if ret != 0:
            self._destroy_dataset(input_dataset)
            self._destroy_dataset(output_dataset)
            raise RuntimeError(f"Failed to execute model: {ret}")
        if stream is not None:
            stream.synchronize()
        # Copy the outputs back to the host
        outputs = []
        for i, output_info in enumerate(self.output_info):
            buffer = acl.mdl.get_dataset_buffer(output_dataset, i)
            dev_ptr = acl.get_data_buffer_addr(buffer)
            size = acl.get_data_buffer_size(buffer)
            # Assumes float32 outputs; map output_info['dtype'] for others
            output_data = np.empty(output_info['shape'], dtype=np.float32)
            acl.rt.memcpy(output_data.ctypes.data, output_data.nbytes,
                          dev_ptr, size, ACL_MEMCPY_DEVICE_TO_HOST)
            outputs.append(output_data)
        # Clean up
        self._destroy_dataset(input_dataset)
        self._destroy_dataset(output_dataset)
        return outputs

    def infer_async(self, inputs: List[np.ndarray],
                    stream: 'Stream') -> List[np.ndarray]:
        """Asynchronous inference (synchronizes internally before returning)"""
        return self.infer(inputs, stream)

    def get_input_info(self) -> List[Dict[str, Any]]:
        """Input metadata"""
        return self.input_info

    def get_output_info(self) -> List[Dict[str, Any]]:
        """Output metadata"""
        return self.output_info

    def __del__(self):
        """Unload the model and release the description"""
        if self.model_id != 0:
            ret = acl.mdl.unload(self.model_id)
            if ret != 0:
                print(f"Warning: Failed to unload model: {ret}")
            self.model_id = 0
        if self.model_desc is not None:
            acl.mdl.destroy_desc(self.model_desc)
            self.model_desc = None


# Usage example
if __name__ == "__main__":
    # Initialize ACL (ACLContext from section 1.2)
    acl_context = ACLContext()
    acl_context.initialize()
    # Create a stream (see the Stream sketch in section 3.2)
    stream_mgr = StreamManager()
    stream = stream_mgr.create_stream()
    # Load the model
    model = ModelInference("resnet50.om")
    # Print the model metadata
    print("Input info:")
    for info in model.get_input_info():
        print(f"  {info['name']}: {info['shape']}")
    print("\nOutput info:")
    for info in model.get_output_info():
        print(f"  {info['name']}: {info['shape']}")
    # Prepare the input
    input_data = np.random.randn(1, 3, 224, 224).astype(np.float32)
    # Run inference (synchronized internally when a stream is passed)
    outputs = model.infer([input_data], stream)
    # Inspect the output
    print(f"\nOutput shape: {outputs[0].shape}")
    print(f"Output max: {outputs[0].max():.4f}")
    print(f"Output min: {outputs[0].min():.4f}")
References
Official resources:
- CANN organization: https://atomgit.com/cann
- acl repository: https://atomgit.com/cann/acl
- Ascend AI developer community: https://www.hiascend.com
- CANN documentation: https://www.hiascend.com/document
Learning resources:
- ACL API reference manual
- Application development guide
- Performance optimization practices
- Error handling and debugging