A Deep Dive into the CANN ACL Application-Layer Interface: Building an Efficient AI Application Development Framework
Introduction
The application-layer interface of the CANN platform (Ascend Computing Language, ACL) gives developers a concise, easy-to-use programming interface and serves as the bridge between upper-layer applications and the underlying runtime environment. ACL encapsulates the complex hardware operations below it so that developers can focus on business logic rather than low-level details. This article dissects ACL's architectural design and code implementation.
Related links:
- CANN organization: https://atomgit.com/cann
- acl repository: https://atomgit.com/cann/acl
1. ACL Core Architecture and Initialization
1.1 ACL Framework Design
cpp
// ACL core framework header
#ifndef CANN_ACL_FRAMEWORK_H
#define CANN_ACL_FRAMEWORK_H

#include <memory>
#include <string>
#include <vector>
#include <map>
#include <mutex>

#include "acl/acl.h"  // CANN runtime: aclInit/aclFinalize and the aclrt* APIs

namespace acl {

// Error codes
enum class Error {
    NONE = 0,
    INVALID_PARAM = 100,
    OUT_OF_MEMORY = 101,
    INTERNAL_ERROR = 102,
    DEVICE_NOT_FOUND = 103,
    // ... more error codes
};

// Return status
class Status {
private:
    Error error_;
    std::string message_;

public:
    Status() : error_(Error::NONE), message_("") {}
    Status(Error error, const std::string& message)
        : error_(error), message_(message) {}

    bool IsOk() const { return error_ == Error::NONE; }
    Error GetError() const { return error_; }
    const std::string& GetMessage() const { return message_; }

    static Status OK() { return Status(); }
    static Status InvalidParam(const std::string& msg) {
        return Status(Error::INVALID_PARAM, msg);
    }
    static Status OutOfMemory(const std::string& msg) {
        return Status(Error::OUT_OF_MEMORY, msg);
    }
};

// ACL context (process-wide singleton)
class Context {
private:
    static std::shared_ptr<Context> instance_;
    static std::mutex mutex_;

    bool initialized_;
    int device_id_;
    std::map<std::string, void*> resources_;

    Context() : initialized_(false), device_id_(-1) {}

public:
    static std::shared_ptr<Context> GetInstance() {
        std::lock_guard<std::mutex> lock(mutex_);
        if (!instance_) {
            instance_ = std::shared_ptr<Context>(new Context());
        }
        return instance_;
    }

    Status Initialize(int device_id = 0) {
        if (initialized_) {
            return Status::OK();
        }
        // Initialize the underlying runtime (nullptr = no config file)
        auto ret = aclInit(nullptr);
        if (ret != 0) {
            return Status(Error::INTERNAL_ERROR, "Failed to initialize ACL");
        }
        // Bind the device
        ret = aclrtSetDevice(device_id);
        if (ret != 0) {
            return Status(Error::DEVICE_NOT_FOUND, "Failed to set device");
        }
        device_id_ = device_id;
        initialized_ = true;
        return Status::OK();
    }

    Status Finalize() {
        if (!initialized_) {
            return Status::OK();
        }
        // Release registered resources
        for (auto& pair : resources_) {
            // Release according to the resource type
        }
        resources_.clear();
        // Reset the device
        aclrtResetDevice(device_id_);
        // Tear down ACL
        aclFinalize();
        initialized_ = false;
        device_id_ = -1;
        return Status::OK();
    }

    bool IsInitialized() const { return initialized_; }
    int GetDeviceId() const { return device_id_; }

    void SetResource(const std::string& key, void* resource) {
        resources_[key] = resource;
    }
    void* GetResource(const std::string& key) {
        auto it = resources_.find(key);
        if (it != resources_.end()) {
            return it->second;
        }
        return nullptr;
    }
};

// Define the static members. Note: defining them in a header is only safe
// if the header is included in a single translation unit; with C++17,
// prefer `inline` definitions.
std::shared_ptr<Context> Context::instance_ = nullptr;
std::mutex Context::mutex_;

} // namespace acl
#endif // CANN_ACL_FRAMEWORK_H
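With the header in place, an application drives the whole lifecycle through the singleton. The following is a minimal usage sketch of the `Context` class defined above; the `ContextGuard` RAII helper is an addition for exception safety, not part of the header:
cpp
#include "acl_framework.h"
#include <iostream>
#include <stdexcept>

// Small RAII guard around Context (illustrative, not part of the header)
struct ContextGuard {
    explicit ContextGuard(int device_id = 0) {
        auto status = acl::Context::GetInstance()->Initialize(device_id);
        if (!status.IsOk()) {
            throw std::runtime_error(status.GetMessage());
        }
    }
    ~ContextGuard() { acl::Context::GetInstance()->Finalize(); }
};

int main() {
    ContextGuard guard(0);  // aclInit + aclrtSetDevice(0)
    auto ctx = acl::Context::GetInstance();
    std::cout << "ACL initialized on device " << ctx->GetDeviceId() << std::endl;
    return 0;  // the guard's destructor resets the device and finalizes ACL
}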
1.2 Python ACL Wrapper
python
import threading
import atexit
from typing import Optional, Dict, Any

import acl  # pyACL bindings shipped with CANN


class ACLContext:
    """ACL context manager (process-wide singleton)"""
    _instance = None
    _lock = threading.Lock()

    def __new__(cls):
        if cls._instance is None:
            with cls._lock:
                if cls._instance is None:
                    cls._instance = super().__new__(cls)
                    cls._instance._initialized = False
                    cls._instance._device_id = -1
                    cls._instance._resources = {}
        return cls._instance

    def initialize(self, device_id: int = 0) -> bool:
        """Initialize ACL and bind the device"""
        if self._initialized:
            return True
        # Initialize ACL
        ret = acl.init()
        if ret != 0:
            raise RuntimeError(f"Failed to initialize ACL: {ret}")
        # Bind the device
        ret = acl.rt.set_device(device_id)
        if ret != 0:
            raise RuntimeError(f"Failed to set device {device_id}: {ret}")
        self._device_id = device_id
        self._initialized = True
        # Register the cleanup hook
        atexit.register(self.finalize)
        return True

    def finalize(self):
        """Tear down ACL"""
        if not self._initialized:
            return
        # Clean up registered resources
        for name, resource in self._resources.items():
            try:
                resource.cleanup()
            except Exception as e:
                print(f"Error cleaning up {name}: {e}")
        self._resources.clear()
        # Reset the device
        acl.rt.reset_device(self._device_id)
        # Finalize ACL
        acl.finalize()
        self._initialized = False
        self._device_id = -1

    def __enter__(self):
        """Context-manager entry"""
        self.initialize()
        return self

    def __exit__(self, exc_type, exc_val, exc_tb):
        """Context-manager exit"""
        self.finalize()

    def register_resource(self, name: str, resource):
        """Register a resource"""
        self._resources[name] = resource

    def unregister_resource(self, name: str):
        """Unregister a resource"""
        if name in self._resources:
            del self._resources[name]

    @property
    def device_id(self) -> int:
        """Device ID"""
        return self._device_id

    @property
    def is_initialized(self) -> bool:
        """Whether ACL has been initialized"""
        return self._initialized


# Global ACL context
acl_context = ACLContext()


def initialize_acl(device_id: int = 0) -> ACLContext:
    """Initialize ACL"""
    acl_context.initialize(device_id)
    return acl_context


def finalize_acl():
    """Tear down ACL"""
    acl_context.finalize()


# Usage example
if __name__ == "__main__":
    # Use the context manager
    with ACLContext() as ctx:
        print(f"ACL initialized on device {ctx.device_id}")
        print(f"Is initialized: {ctx.is_initialized}")
2. Tensors and Data Operations
2.1 Tensor Class Implementation
cpp
// Tensor class definitions
#ifndef CANN_ACL_TENSOR_H
#define CANN_ACL_TENSOR_H

#include <cstdlib>
#include <cstring>
#include <memory>
#include <stdexcept>
#include <string>
#include <vector>

#include "acl_framework.h"

namespace acl {

// Data types
enum class DataType {
    DT_FLOAT = 0,
    DT_FLOAT16 = 1,
    DT_INT8 = 2,
    DT_INT16 = 3,
    DT_INT32 = 4,
    DT_INT64 = 5,
    DT_UINT8 = 6,
    DT_UINT16 = 7,
    DT_UINT32 = 8,
    DT_UINT64 = 9,
    DT_BOOL = 10,
    DT_DOUBLE = 11,
};

// Tensor shape
class Shape {
private:
    std::vector<int64_t> dims_;

public:
    Shape() = default;
    explicit Shape(const std::vector<int64_t>& dims) : dims_(dims) {}

    int64_t operator[](size_t index) const { return dims_[index]; }
    size_t Size() const { return dims_.size(); }

    // Needed by Tensor::CopyFrom / ToHost / ToDevice below
    bool operator==(const Shape& other) const { return dims_ == other.dims_; }
    bool operator!=(const Shape& other) const { return !(*this == other); }

    int64_t NumElements() const {
        if (dims_.empty()) return 0;
        int64_t count = 1;
        for (auto dim : dims_) {
            count *= dim;
        }
        return count;
    }

    const std::vector<int64_t>& GetDims() const { return dims_; }
    void AddDim(int64_t dim) { dims_.push_back(dim); }
    void SetDim(size_t index, int64_t dim) { dims_[index] = dim; }

    std::string ToString() const {
        std::string str = "[";
        for (size_t i = 0; i < dims_.size(); ++i) {
            if (i > 0) str += ", ";
            str += std::to_string(dims_[i]);
        }
        str += "]";
        return str;
    }
};

// Tensor descriptor
class TensorDesc {
private:
    DataType data_type_;
    Shape shape_;
    std::string format_;
    size_t element_size_;

public:
    TensorDesc() : data_type_(DataType::DT_FLOAT),
                   format_("NCHW"), element_size_(4) {}
    TensorDesc(DataType data_type, const Shape& shape,
               const std::string& format = "NCHW")
        : data_type_(data_type), shape_(shape), format_(format) {
        element_size_ = GetDataTypeSize(data_type);
    }

    DataType GetDataType() const { return data_type_; }
    void SetDataType(DataType dtype) {
        data_type_ = dtype;
        element_size_ = GetDataTypeSize(dtype);
    }
    const Shape& GetShape() const { return shape_; }
    void SetShape(const Shape& shape) { shape_ = shape; }
    const std::string& GetFormat() const { return format_; }
    void SetFormat(const std::string& format) { format_ = format; }
    size_t GetElementSize() const { return element_size_; }
    size_t GetTotalSize() const {
        return shape_.NumElements() * element_size_;
    }

    static size_t GetDataTypeSize(DataType dtype) {
        switch (dtype) {
            case DataType::DT_FLOAT:
            case DataType::DT_INT32:
            case DataType::DT_UINT32:
                return 4;
            case DataType::DT_FLOAT16:
            case DataType::DT_INT16:
            case DataType::DT_UINT16:
                return 2;
            case DataType::DT_INT8:
            case DataType::DT_UINT8:
            case DataType::DT_BOOL:
                return 1;
            case DataType::DT_INT64:
            case DataType::DT_UINT64:
            case DataType::DT_DOUBLE:
                return 8;
            default:
                return 4;
        }
    }
};

// Data buffer
class DataBuffer {
private:
    void* data_;
    size_t size_;
    bool is_device_;
    bool owns_data_;

public:
    DataBuffer() : data_(nullptr), size_(0), is_device_(false), owns_data_(false) {}
    DataBuffer(void* data, size_t size, bool is_device = false, bool owns_data = true)
        : data_(data), size_(size), is_device_(is_device), owns_data_(owns_data) {}

    // The buffer may own raw memory, so copying would double-free;
    // allow moves only.
    DataBuffer(const DataBuffer&) = delete;
    DataBuffer& operator=(const DataBuffer&) = delete;
    DataBuffer(DataBuffer&& other) noexcept
        : data_(other.data_), size_(other.size_),
          is_device_(other.is_device_), owns_data_(other.owns_data_) {
        other.data_ = nullptr;
        other.owns_data_ = false;
    }

    ~DataBuffer() {
        if (owns_data_ && data_ != nullptr) {
            if (is_device_) {
                aclrtFree(data_);
            } else {
                free(data_);
            }
        }
    }

    void* GetData() const { return data_; }
    size_t GetSize() const { return size_; }
    bool IsDevice() const { return is_device_; }

    void SetData(void* data, size_t size, bool is_device = false, bool owns_data = true) {
        if (owns_data_ && data_ != nullptr) {
            if (is_device_) {
                aclrtFree(data_);
            } else {
                free(data_);
            }
        }
        data_ = data;
        size_ = size;
        is_device_ = is_device;
        owns_data_ = owns_data;
    }

    void CopyFrom(const DataBuffer& other) {
        if (size_ != other.size_) {
            throw std::runtime_error("Buffer size mismatch");
        }
        if (is_device_ && other.is_device_) {
            aclrtMemcpy(data_, size_, other.data_, other.size_,
                        ACL_MEMCPY_DEVICE_TO_DEVICE);
        } else if (is_device_ && !other.is_device_) {
            aclrtMemcpy(data_, size_, other.data_, other.size_,
                        ACL_MEMCPY_HOST_TO_DEVICE);
        } else if (!is_device_ && other.is_device_) {
            aclrtMemcpy(data_, size_, other.data_, other.size_,
                        ACL_MEMCPY_DEVICE_TO_HOST);
        } else {
            memcpy(data_, other.data_, size_);
        }
    }
};

// Tensor class
class Tensor {
private:
    TensorDesc desc_;
    DataBuffer buffer_;
    std::string name_;

public:
    Tensor() = default;

    // Allocates device memory of the described size
    Tensor(const std::string& name, const TensorDesc& desc)
        : desc_(desc), name_(name) {
        void* data = nullptr;
        size_t size = desc_.GetTotalSize();
        auto ret = aclrtMalloc(&data, size, ACL_MEM_MALLOC_HUGE_FIRST);
        if (ret != ACL_ERROR_NONE) {
            throw std::runtime_error("Failed to allocate device memory");
        }
        buffer_.SetData(data, size, true, true);
    }

    // Wraps externally owned memory (host or device) without taking ownership
    Tensor(const std::string& name, const TensorDesc& desc, void* data, bool is_device = false)
        : desc_(desc), name_(name) {
        size_t size = desc_.GetTotalSize();
        buffer_.SetData(data, size, is_device, false);
    }

    const TensorDesc& GetDesc() const { return desc_; }
    TensorDesc& GetDesc() { return desc_; }
    const DataBuffer& GetBuffer() const { return buffer_; }
    DataBuffer& GetBuffer() { return buffer_; }
    const std::string& GetName() const { return name_; }
    void SetName(const std::string& name) { name_ = name; }
    size_t GetNumElements() const { return desc_.GetShape().NumElements(); }
    size_t GetSizeInBytes() const { return desc_.GetTotalSize(); }

    Status CopyFrom(const Tensor& other) {
        if (desc_.GetShape() != other.desc_.GetShape() ||
            desc_.GetDataType() != other.desc_.GetDataType()) {
            return Status::InvalidParam("Tensor shape or dtype mismatch");
        }
        buffer_.CopyFrom(other.buffer_);
        return Status::OK();
    }

    Status ToHost(Tensor& host_tensor) const {
        if (host_tensor.desc_.GetShape() != desc_.GetShape() ||
            host_tensor.desc_.GetDataType() != desc_.GetDataType()) {
            return Status::InvalidParam("Tensor shape or dtype mismatch");
        }
        auto ret = aclrtMemcpy(host_tensor.buffer_.GetData(),
                               host_tensor.buffer_.GetSize(),
                               buffer_.GetData(), buffer_.GetSize(),
                               ACL_MEMCPY_DEVICE_TO_HOST);
        if (ret != ACL_ERROR_NONE) {
            return Status(Error::INTERNAL_ERROR, "Failed to copy to host");
        }
        return Status::OK();
    }

    Status ToDevice(Tensor& device_tensor) const {
        if (device_tensor.desc_.GetShape() != desc_.GetShape() ||
            device_tensor.desc_.GetDataType() != desc_.GetDataType()) {
            return Status::InvalidParam("Tensor shape or dtype mismatch");
        }
        auto ret = aclrtMemcpy(device_tensor.buffer_.GetData(),
                               device_tensor.buffer_.GetSize(),
                               buffer_.GetData(), buffer_.GetSize(),
                               ACL_MEMCPY_HOST_TO_DEVICE);
        if (ret != ACL_ERROR_NONE) {
            return Status(Error::INTERNAL_ERROR, "Failed to copy to device");
        }
        return Status::OK();
    }
};

} // namespace acl
#endif // CANN_ACL_TENSOR_H
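A typical host/device round trip with these classes looks like the sketch below. It is illustrative usage of the header above, assuming an initialized `Context`; the host buffer is allocated with plain `malloc` and wrapped without ownership transfer:
cpp
#include "acl_tensor.h"
#include <cstdlib>
#include <iostream>

int main() {
    acl::Context::GetInstance()->Initialize(0);
    {
        acl::TensorDesc desc(acl::DataType::DT_FLOAT,
                             acl::Shape({1, 3, 224, 224}));

        // Host-side staging buffer, wrapped without ownership transfer
        void* host_buf = malloc(desc.GetTotalSize());
        acl::Tensor host_in("input_host", desc, host_buf, /*is_device=*/false);

        // Device tensor: the constructor calls aclrtMalloc internally
        acl::Tensor dev("input_dev", desc);

        // Host -> device, then back again
        auto status = host_in.ToDevice(dev);
        if (status.IsOk()) {
            status = dev.ToHost(host_in);
        }
        std::cout << "round trip: " << (status.IsOk() ? "ok" : status.GetMessage())
                  << ", bytes=" << dev.GetSizeInBytes() << std::endl;

        free(host_buf);
    }  // the device tensor is freed here, before Finalize()
    acl::Context::GetInstance()->Finalize();
    return 0;
}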
2.2 Python Tensor Operations
python
import numpy as np
from typing import Union, Tuple, Optional

import acl

# pyACL does not export the C enum constants; the values below mirror
# acl_base.h / acl_rt.h -- verify them against your CANN version.
ACL_MEM_MALLOC_HUGE_FIRST = 0
ACL_MEMCPY_HOST_TO_DEVICE = 1
ACL_MEMCPY_DEVICE_TO_HOST = 2
ACL_MEMCPY_DEVICE_TO_DEVICE = 3
ACL_FLOAT, ACL_FLOAT16, ACL_INT8, ACL_INT32, ACL_UINT8 = 0, 1, 2, 3, 4
ACL_INT16, ACL_UINT16, ACL_UINT32, ACL_INT64, ACL_UINT64 = 6, 7, 8, 9, 10
ACL_DOUBLE, ACL_BOOL = 11, 12


class Tensor:
    """Host-side tensor"""

    def __init__(self, data: Union[np.ndarray, bytes], name: str = ""):
        self.name = name
        if isinstance(data, np.ndarray):
            self._from_numpy(data)
        elif isinstance(data, bytes):
            self._from_bytes(data)
        else:
            raise TypeError("Unsupported data type")

    def _from_numpy(self, arr: np.ndarray):
        """Build from a numpy array"""
        self.dtype = self._numpy_to_acl_dtype(arr.dtype)
        self.shape = arr.shape
        self.data = arr.tobytes()
        self.size = arr.nbytes
        self.is_device = False

    def _from_bytes(self, data: bytes):
        """Build from raw bytes (shape/dtype unknown)"""
        self.data = data
        self.size = len(data)
        self.shape = None
        self.dtype = None
        self.is_device = False

    @staticmethod
    def _numpy_to_acl_dtype(dtype) -> int:
        """Map a numpy dtype to an ACL data type"""
        dtype_map = {
            np.float32: ACL_FLOAT,
            np.float16: ACL_FLOAT16,
            np.int32: ACL_INT32,
            np.int64: ACL_INT64,
            np.int8: ACL_INT8,
            np.uint8: ACL_UINT8,
            np.bool_: ACL_BOOL,
        }
        return dtype_map.get(np.dtype(dtype).type, ACL_FLOAT)

    @staticmethod
    def _acl_to_numpy_dtype(dtype: int):
        """Map an ACL data type back to a numpy dtype"""
        dtype_map = {
            ACL_FLOAT: np.float32,
            ACL_FLOAT16: np.float16,
            ACL_INT32: np.int32,
            ACL_INT64: np.int64,
            ACL_INT8: np.int8,
            ACL_UINT8: np.uint8,
            ACL_BOOL: np.bool_,
        }
        return dtype_map.get(dtype, np.float32)

    def to_device(self) -> 'DeviceTensor':
        """Transfer to the device"""
        # acl.rt.malloc returns (ptr, ret)
        ptr, ret = acl.rt.malloc(self.size, ACL_MEM_MALLOC_HUGE_FIRST)
        if ret != 0:
            raise RuntimeError(f"acl.rt.malloc failed: {ret}")
        # acl.rt.memcpy takes (dst, dst_max, src, count, kind).
        # bytes_to_ptr/ptr_to_bytes exist in recent pyACL; older releases
        # used acl.util.numpy_to_ptr / ptr_to_numpy instead.
        host_ptr = acl.util.bytes_to_ptr(self.data)
        ret = acl.rt.memcpy(ptr, self.size, host_ptr, self.size,
                            ACL_MEMCPY_HOST_TO_DEVICE)
        if ret != 0:
            raise RuntimeError(f"acl.rt.memcpy failed: {ret}")
        return DeviceTensor(ptr, self.size, self.dtype, self.shape)

    def to_numpy(self) -> np.ndarray:
        """Convert to a numpy array"""
        if self.shape is None:
            raise ValueError("Shape is not set")
        dtype = self._acl_to_numpy_dtype(self.dtype)
        return np.frombuffer(self.data, dtype=dtype).reshape(self.shape)

    def __repr__(self):
        return f"Tensor(shape={self.shape}, dtype={self.dtype}, size={self.size})"


class DeviceTensor:
    """Device-side tensor"""

    def __init__(self, ptr: int, size: int, dtype: int, shape: Optional[Tuple] = None):
        self.ptr = ptr
        self.size = size
        self.dtype = dtype
        self.shape = shape
        self.is_device = True

    def __del__(self):
        """Best-effort release; explicit lifetime management is safer"""
        if self.ptr != 0:
            acl.rt.free(self.ptr)
            self.ptr = 0

    def to_host(self) -> Tensor:
        """Transfer back to the host"""
        host_ptr, ret = acl.rt.malloc_host(self.size)
        if ret != 0:
            raise RuntimeError(f"acl.rt.malloc_host failed: {ret}")
        ret = acl.rt.memcpy(host_ptr, self.size, self.ptr, self.size,
                            ACL_MEMCPY_DEVICE_TO_HOST)
        if ret != 0:
            acl.rt.free_host(host_ptr)
            raise RuntimeError(f"acl.rt.memcpy failed: {ret}")
        data = acl.util.ptr_to_bytes(host_ptr, self.size)
        acl.rt.free_host(host_ptr)
        tensor = Tensor(data)
        tensor.shape = self.shape  # carry shape/dtype across the copy
        tensor.dtype = self.dtype
        return tensor

    def copy_from(self, src: Union['DeviceTensor', Tensor]):
        """Copy from a source tensor (device or host)"""
        count = min(self.size, src.size)
        if isinstance(src, DeviceTensor):
            acl.rt.memcpy(self.ptr, self.size, src.ptr, count,
                          ACL_MEMCPY_DEVICE_TO_DEVICE)
        elif isinstance(src, Tensor):
            acl.rt.memcpy(self.ptr, self.size, acl.util.bytes_to_ptr(src.data),
                          count, ACL_MEMCPY_HOST_TO_DEVICE)

    def copy_to(self, dst: 'DeviceTensor'):
        """Copy into another device tensor (use to_host() for host copies)"""
        acl.rt.memcpy(dst.ptr, dst.size, self.ptr, min(self.size, dst.size),
                      ACL_MEMCPY_DEVICE_TO_DEVICE)

    def to_numpy(self) -> np.ndarray:
        """Convert to a numpy array"""
        if self.shape is None:
            raise ValueError("Shape is not set")
        dtype = Tensor._acl_to_numpy_dtype(self.dtype)
        arr = np.empty(self.shape, dtype=dtype)
        # The numpy buffer's address is passed as the host destination
        acl.rt.memcpy(arr.ctypes.data, self.size, self.ptr, self.size,
                      ACL_MEMCPY_DEVICE_TO_HOST)
        return arr

    def __repr__(self):
        return f"DeviceTensor(ptr=0x{self.ptr:x}, size={self.size}, dtype={self.dtype})"


# Factory functions
def from_numpy(arr: np.ndarray, name: str = "") -> Tensor:
    """Create a tensor from a numpy array"""
    return Tensor(arr, name)


def zeros(shape: Tuple, dtype=np.float32, name: str = "") -> Tensor:
    """Create an all-zeros tensor"""
    return Tensor(np.zeros(shape, dtype=dtype), name)


def ones(shape: Tuple, dtype=np.float32, name: str = "") -> Tensor:
    """Create an all-ones tensor"""
    return Tensor(np.ones(shape, dtype=dtype), name)


def randn(shape: Tuple, dtype=np.float32, name: str = "") -> Tensor:
    """Create a random-normal tensor"""
    return Tensor(np.random.randn(*shape).astype(dtype), name)


# Usage example
if __name__ == "__main__":
    # Create a tensor from numpy
    arr = np.random.randn(2, 3, 4).astype(np.float32)
    tensor = from_numpy(arr, "input")
    print(f"Created tensor: {tensor}")
    # Transfer to the device
    device_tensor = tensor.to_device()
    print(f"Device tensor: {device_tensor}")
    # Transfer back to the host
    host_tensor = device_tensor.to_host()
    print(f"Host tensor: {host_tensor}")
    # Convert to numpy
    arr2 = host_tensor.to_numpy()
    print(f"Numpy array shape: {arr2.shape}")
    # Verify the data survived the round trip
    assert np.allclose(arr, arr2)
    print("Data verification passed!")
3. Operator Execution Interface
3.1 Operator Executor Implementation
cpp
// Operator executor
#ifndef CANN_ACL_OPERATOR_H
#define CANN_ACL_OPERATOR_H

#include <map>
#include <string>

#include "acl_tensor.h"

namespace acl {

// Operator attributes
class OpAttr {
private:
    std::map<std::string, int> int_attrs_;
    std::map<std::string, float> float_attrs_;
    std::map<std::string, std::string> string_attrs_;
    std::map<std::string, std::vector<int>> int_list_attrs_;
    std::map<std::string, std::vector<float>> float_list_attrs_;
    std::map<std::string, bool> bool_attrs_;

public:
    void SetInt(const std::string& name, int value) {
        int_attrs_[name] = value;
    }
    void SetFloat(const std::string& name, float value) {
        float_attrs_[name] = value;
    }
    void SetString(const std::string& name, const std::string& value) {
        string_attrs_[name] = value;
    }
    void SetIntList(const std::string& name, const std::vector<int>& value) {
        int_list_attrs_[name] = value;
    }
    void SetFloatList(const std::string& name, const std::vector<float>& value) {
        float_list_attrs_[name] = value;
    }
    void SetBool(const std::string& name, bool value) {
        bool_attrs_[name] = value;
    }

    int GetInt(const std::string& name, int default_value = 0) const {
        auto it = int_attrs_.find(name);
        return it != int_attrs_.end() ? it->second : default_value;
    }
    float GetFloat(const std::string& name, float default_value = 0.0f) const {
        auto it = float_attrs_.find(name);
        return it != float_attrs_.end() ? it->second : default_value;
    }
    std::string GetString(const std::string& name, const std::string& default_value = "") const {
        auto it = string_attrs_.find(name);
        return it != string_attrs_.end() ? it->second : default_value;
    }
    std::vector<int> GetIntList(const std::string& name) const {
        auto it = int_list_attrs_.find(name);
        return it != int_list_attrs_.end() ? it->second : std::vector<int>();
    }
    std::vector<float> GetFloatList(const std::string& name) const {
        auto it = float_list_attrs_.find(name);
        return it != float_list_attrs_.end() ? it->second : std::vector<float>();
    }
    bool GetBool(const std::string& name, bool default_value = false) const {
        auto it = bool_attrs_.find(name);
        return it != bool_attrs_.end() ? it->second : default_value;
    }
};

// Operator descriptor
class OpDesc {
private:
    std::string op_type_;
    std::string op_name_;
    OpAttr attrs_;
    std::vector<TensorDesc> input_descs_;
    std::vector<TensorDesc> output_descs_;

public:
    OpDesc(const std::string& op_type, const std::string& op_name = "")
        : op_type_(op_type), op_name_(op_name.empty() ? op_type : op_name) {}

    const std::string& GetType() const { return op_type_; }
    const std::string& GetName() const { return op_name_; }
    OpAttr& GetAttrs() { return attrs_; }
    const OpAttr& GetAttrs() const { return attrs_; }

    void AddInputDesc(const TensorDesc& desc) {
        input_descs_.push_back(desc);
    }
    void AddOutputDesc(const TensorDesc& desc) {
        output_descs_.push_back(desc);
    }
    const std::vector<TensorDesc>& GetInputDescs() const { return input_descs_; }
    const std::vector<TensorDesc>& GetOutputDescs() const { return output_descs_; }
};

// Operator executor.
// NOTE: the aclopDesc-based calls below are a simplified illustration of
// the single-operator flow; the public ACL API composes aclTensorDesc and
// aclopAttr objects and calls aclopExecuteV2 -- consult the acl repository
// for the exact signatures.
class OperatorExecutor {
private:
    std::shared_ptr<Context> context_;

    Status CreateAclopDesc(const OpDesc& op_desc, aclopDesc*& acl_op_desc) {
        // Create the ACL operator descriptor
        acl_op_desc = aclopCreateDesc(op_desc.GetType().c_str(),
                                      op_desc.GetName().c_str(),
                                      op_desc.GetInputDescs().size(),
                                      op_desc.GetOutputDescs().size(),
                                      ACL_OP_EXEC_MODE_AICPU);
        if (acl_op_desc == nullptr) {
            return Status(Error::INTERNAL_ERROR, "Failed to create op desc");
        }
        // Attach the input/output descriptors
        for (size_t i = 0; i < op_desc.GetInputDescs().size(); ++i) {
            auto& desc = op_desc.GetInputDescs()[i];
            // Set the input descriptor...
        }
        for (size_t i = 0; i < op_desc.GetOutputDescs().size(); ++i) {
            auto& desc = op_desc.GetOutputDescs()[i];
            // Set the output descriptor...
        }
        return Status::OK();
    }

    Status SetOpAttrs(const OpDesc& op_desc, aclopDesc* acl_op_desc) {
        // Set operator attributes. Each attribute type needs its own
        // setter call, so only the skeleton is shown here.
        auto& attrs = op_desc.GetAttrs();
        return Status::OK();
    }

public:
    OperatorExecutor() : context_(Context::GetInstance()) {}

    Status Execute(const OpDesc& op_desc,
                   const std::vector<DataBuffer>& inputs,
                   std::vector<DataBuffer>& outputs,
                   aclrtStream stream = nullptr) {
        // Create the ACL operator descriptor
        aclopDesc* acl_op_desc = nullptr;
        auto status = CreateAclopDesc(op_desc, acl_op_desc);
        if (!status.IsOk()) {
            return status;
        }
        // Set operator attributes
        status = SetOpAttrs(op_desc, acl_op_desc);
        if (!status.IsOk()) {
            aclopDestroyDesc(acl_op_desc);
            return status;
        }
        // Wrap the input/output buffers
        std::vector<aclDataBuffer*> acl_inputs;
        std::vector<aclDataBuffer*> acl_outputs;
        for (const auto& input : inputs) {
            acl_inputs.push_back(aclCreateDataBuffer(
                input.GetData(), input.GetSize()));
        }
        for (const auto& output : outputs) {
            acl_outputs.push_back(aclCreateDataBuffer(
                output.GetData(), output.GetSize()));
        }
        // Launch the operator
        auto ret = aclopExecuteV2(acl_op_desc,
                                  acl_inputs.size(),
                                  acl_inputs.data(),
                                  acl_outputs.size(),
                                  acl_outputs.data(),
                                  stream);
        // Release the wrappers
        for (auto* buffer : acl_inputs) {
            aclDestroyDataBuffer(buffer);
        }
        for (auto* buffer : acl_outputs) {
            aclDestroyDataBuffer(buffer);
        }
        aclopDestroyDesc(acl_op_desc);
        if (ret != ACL_ERROR_NONE) {
            return Status(Error::INTERNAL_ERROR, "Failed to execute operator");
        }
        return Status::OK();
    }
};

} // namespace acl
#endif // CANN_ACL_OPERATOR_H
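Assuming the simplified descriptor API above, driving the executor for a single operator might look like the following sketch (the `Add` op type and tensor sizes are illustrative, and the caveat about the real ACL single-op API from the header applies here too):
cpp
#include "acl_operator.h"
#include <iostream>

int main() {
    acl::Context::GetInstance()->Initialize(0);
    {
        // Element-wise Add on two 1x1024 float tensors (illustrative)
        acl::TensorDesc desc(acl::DataType::DT_FLOAT, acl::Shape({1, 1024}));
        acl::Tensor a("a", desc), b("b", desc), out("out", desc);

        acl::OpDesc op_desc("Add");
        op_desc.AddInputDesc(desc);
        op_desc.AddInputDesc(desc);
        op_desc.AddOutputDesc(desc);

        // Wrap the device pointers without transferring ownership
        std::vector<acl::DataBuffer> inputs;
        inputs.emplace_back(a.GetBuffer().GetData(), a.GetSizeInBytes(), true, false);
        inputs.emplace_back(b.GetBuffer().GetData(), b.GetSizeInBytes(), true, false);
        std::vector<acl::DataBuffer> outputs;
        outputs.emplace_back(out.GetBuffer().GetData(), out.GetSizeInBytes(), true, false);

        acl::OperatorExecutor executor;
        auto status = executor.Execute(op_desc, inputs, outputs);
        std::cout << "Add: " << (status.IsOk() ? "ok" : status.GetMessage()) << std::endl;
    }
    acl::Context::GetInstance()->Finalize();
    return 0;
}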
3.2 Python Operator Interface
python
import numpy as np
from typing import List, Dict, Any, Optional

import acl

# Reuses DeviceTensor, from_numpy, and the ACL_* constants from section 2.2.


class Operator:
    """Single-operator wrapper"""

    def __init__(self, op_type: str, name: str = ""):
        self.op_type = op_type
        self.name = name or op_type
        self.attrs = {}
        self.input_descs = []
        self.output_descs = []

    def set_attr(self, name: str, value: Any):
        """Set an operator attribute"""
        self.attrs[name] = value
        return self

    def add_input(self, tensor: 'DeviceTensor'):
        """Register an input"""
        self.input_descs.append({
            'shape': tensor.shape,
            'dtype': tensor.dtype,
            'format': 'NCHW'
        })
        return self

    def add_output(self, shape: tuple, dtype: int):
        """Register an output"""
        self.output_descs.append({
            'shape': shape,
            'dtype': dtype,
            'format': 'NCHW'
        })
        return self

    def execute(self, inputs: List['DeviceTensor'],
                stream: Optional['Stream'] = None) -> List['DeviceTensor']:
        """Execute the operator"""
        # Allocate the output tensors
        outputs = []
        for desc in self.output_descs:
            size = 1
            for dim in desc['shape']:
                size *= dim
            size *= 4  # assumes float32 elements
            ptr, ret = acl.rt.malloc(size, ACL_MEM_MALLOC_HUGE_FIRST)
            if ret != 0:
                raise RuntimeError(f"acl.rt.malloc failed: {ret}")
            outputs.append(DeviceTensor(ptr, size, desc['dtype'], desc['shape']))
        # Wrap the device pointers in ACL data buffers
        acl_inputs = [acl.create_data_buffer(t.ptr, t.size) for t in inputs]
        acl_outputs = [acl.create_data_buffer(t.ptr, t.size) for t in outputs]
        # Launch the operator.
        # NOTE: acl.op.execute as used here is a simplified stand-in; the
        # actual pyACL single-operator API is acl.op.execute_v2, which takes
        # aclTensorDesc handles and an aclopAttr object instead of plain
        # dicts -- adapt this call to your CANN version.
        stream_handle = stream.handle if stream else None
        ret = acl.op.execute(
            self.op_type,
            self.name,
            len(self.input_descs),
            acl_inputs,
            len(self.output_descs),
            acl_outputs,
            self.attrs,
            stream_handle
        )
        # Release the buffer wrappers
        for buffer in acl_inputs:
            acl.destroy_data_buffer(buffer)
        for buffer in acl_outputs:
            acl.destroy_data_buffer(buffer)
        if ret != 0:
            raise RuntimeError(f"Failed to execute operator {self.op_type}: {ret}")
        return outputs


# Factory helpers for common operators
def conv2d(input_tensor: 'DeviceTensor',
           weight: 'DeviceTensor',
           bias: Optional['DeviceTensor'] = None,
           kernel_size: int = 3,
           stride: int = 1,
           padding: int = 0,
           name: str = "conv2d") -> 'DeviceTensor':
    """2D convolution"""
    op = Operator("Conv2D", name)
    # Attributes
    op.set_attr("strides", [stride, stride])
    op.set_attr("pads", [padding, padding, padding, padding])
    op.set_attr("dilations", [1, 1])
    op.set_attr("groups", 1)
    op.set_attr("data_format", "NCHW")
    # Output shape for an NCHW convolution
    in_shape = input_tensor.shape
    out_shape = (in_shape[0], weight.shape[0],
                 (in_shape[2] + 2 * padding - kernel_size) // stride + 1,
                 (in_shape[3] + 2 * padding - kernel_size) // stride + 1)
    # Inputs and output (the weight tensor must be registered too)
    op.add_input(input_tensor)
    op.add_input(weight)
    if bias is not None:
        op.add_input(bias)
    op.add_output(out_shape, input_tensor.dtype)
    # Execute
    inputs = [input_tensor, weight] + ([bias] if bias is not None else [])
    outputs = op.execute(inputs)
    return outputs[0]


def relu(input_tensor: 'DeviceTensor',
         name: str = "relu") -> 'DeviceTensor':
    """ReLU activation"""
    op = Operator("Relu", name)
    op.add_input(input_tensor)
    op.add_output(input_tensor.shape, input_tensor.dtype)
    outputs = op.execute([input_tensor])
    return outputs[0]


def matmul(a: 'DeviceTensor',
           b: 'DeviceTensor',
           transpose_a: bool = False,
           transpose_b: bool = False,
           name: str = "matmul") -> 'DeviceTensor':
    """Matrix multiplication"""
    op = Operator("MatMul", name)
    op.set_attr("transpose_x1", transpose_a)
    op.set_attr("transpose_x2", transpose_b)
    # Output shape after the optional transposes
    a_shape = a.shape if not transpose_a else (a.shape[1], a.shape[0])
    b_shape = b.shape if not transpose_b else (b.shape[1], b.shape[0])
    out_shape = (a_shape[0], b_shape[1])
    op.add_input(a)
    op.add_input(b)
    op.add_output(out_shape, a.dtype)
    outputs = op.execute([a, b])
    return outputs[0]


# Usage example
if __name__ == "__main__":
    # Initialize ACL (ACLContext from section 1.2)
    acl_context = ACLContext()
    acl_context.initialize()
    # Create a stream (Stream/StreamManager are small wrappers over
    # acl.rt.create_stream; see the sketch after this section)
    stream_mgr = StreamManager()
    stream = stream_mgr.create_stream()
    # Input tensors
    input_arr = np.random.randn(1, 3, 224, 224).astype(np.float32)
    weight_arr = np.random.randn(64, 3, 3, 3).astype(np.float32)
    input_tensor = from_numpy(input_arr).to_device()
    weight_tensor = from_numpy(weight_arr).to_device()
    # Convolution
    output_tensor = conv2d(input_tensor, weight_tensor,
                           kernel_size=3, stride=1, padding=1)
    # ReLU
    output_tensor = relu(output_tensor)
    # Synchronize the stream
    stream.synchronize()
    # Fetch the output
    output_arr = output_tensor.to_host().to_numpy()
    print(f"Output shape: {output_arr.shape}")
4. Model Inference Interface
4.1 Model Inference Engine
python
import numpy as np
from typing import Dict, List, Optional, Any

import acl

# Reuses the ACL_* constants from section 2.2 and the Stream wrapper
# sketched in section 3.2.


class ModelInference:
    """Offline-model (.om) inference engine"""

    def __init__(self, model_path: str, device_id: int = 0):
        self.model_path = model_path
        self.device_id = device_id
        self.model_id = 0
        self.model_desc = None
        self.input_info = []
        self.output_info = []
        self._load_model()

    def _load_model(self):
        """Load the model"""
        # acl.mdl.load_from_file returns (model_id, ret)
        self.model_id, ret = acl.mdl.load_from_file(self.model_path)
        if ret != 0:
            raise RuntimeError(f"Failed to load model: {ret}")
        # Fetch the model description
        self.model_desc = acl.mdl.create_desc()
        ret = acl.mdl.get_desc(self.model_desc, self.model_id)
        if ret != 0:
            raise RuntimeError(f"Failed to get model desc: {ret}")
        # Collect input/output metadata
        self._parse_model_info()
        print(f"Model loaded: {len(self.input_info)} inputs, {len(self.output_info)} outputs")

    def _parse_model_info(self):
        """Parse model input/output metadata.
        acl.mdl.get_input_dims / get_output_dims return a dict holding the
        tensor name and dimensions (check the exact keys against your
        pyACL version)."""
        num_inputs = acl.mdl.get_num_inputs(self.model_desc)
        num_outputs = acl.mdl.get_num_outputs(self.model_desc)
        for i in range(num_inputs):
            dims, ret = acl.mdl.get_input_dims(self.model_desc, i)
            self.input_info.append({
                'index': i,
                'name': dims['name'],
                'shape': tuple(dims['dims']),
                'dtype': acl.mdl.get_input_data_type(self.model_desc, i),
                'size': acl.mdl.get_input_size_by_index(self.model_desc, i)
            })
        for i in range(num_outputs):
            dims, ret = acl.mdl.get_output_dims(self.model_desc, i)
            self.output_info.append({
                'index': i,
                'name': dims['name'],
                'shape': tuple(dims['dims']),
                'dtype': acl.mdl.get_output_data_type(self.model_desc, i),
                'size': acl.mdl.get_output_size_by_index(self.model_desc, i)
            })

    @staticmethod
    def _destroy_dataset(dataset):
        """Destroy a dataset together with the data buffers it holds"""
        num = acl.mdl.get_dataset_num_buffers(dataset)
        for i in range(num):
            buf = acl.mdl.get_dataset_buffer(dataset, i)
            if buf:
                acl.destroy_data_buffer(buf)
        acl.mdl.destroy_dataset(dataset)

    def infer(self, inputs: List[np.ndarray],
              stream: Optional['Stream'] = None) -> List[np.ndarray]:
        """Run inference. acl.mdl.execute is synchronous; when a stream is
        given, acl.mdl.execute_async is used and the stream is synchronized
        before the outputs are copied back."""
        if len(inputs) != len(self.input_info):
            raise ValueError(f"Expected {len(self.input_info)} inputs, got {len(inputs)}")
        # Build the input dataset
        input_dataset = acl.mdl.create_dataset()
        input_mems = []  # keep the device memory alive during execution
        for i, input_data in enumerate(inputs):
            # Validate the input shape
            expected_shape = self.input_info[i]['shape']
            if input_data.shape != expected_shape:
                raise ValueError(f"Input {i} shape mismatch: expected {expected_shape}, got {input_data.shape}")
            # Allocate device memory and upload the input
            # (DeviceMemory is a small helper; see the sketch after this section)
            dev_mem = DeviceMemory.allocate(input_data.nbytes)
            dev_mem.copy_from(input_data)
            input_mems.append(dev_mem)
            # Wrap it in a data buffer
            buffer = acl.create_data_buffer(dev_mem.ptr, input_data.nbytes)
            _, ret = acl.mdl.add_dataset_buffer(input_dataset, buffer)
        # Build the output dataset
        output_dataset = acl.mdl.create_dataset()
        output_mems = []
        for i, output_info in enumerate(self.output_info):
            dev_mem = DeviceMemory.allocate(output_info['size'])
            output_mems.append(dev_mem)
            buffer = acl.create_data_buffer(dev_mem.ptr, output_info['size'])
            _, ret = acl.mdl.add_dataset_buffer(output_dataset, buffer)
        # Execute
        if stream is not None:
            ret = acl.mdl.execute_async(self.model_id, input_dataset,
                                        output_dataset, stream.handle)
        else:
            ret = acl.mdl.execute(self.model_id, input_dataset, output_dataset)
        if ret != 0:
            self._destroy_dataset(input_dataset)
            self._destroy_dataset(output_dataset)
            raise RuntimeError(f"Failed to execute model: {ret}")
        if stream is not None:
            stream.synchronize()
        # Copy the outputs back to the host
        outputs = []
        for i, output_info in enumerate(self.output_info):
            buffer = acl.mdl.get_dataset_buffer(output_dataset, i)
            dev_ptr = acl.get_data_buffer_addr(buffer)
            size = acl.get_data_buffer_size(buffer)
            # Assumes float32 outputs; map output_info['dtype'] for others
            output_data = np.empty(output_info['shape'], dtype=np.float32)
            acl.rt.memcpy(output_data.ctypes.data, output_data.nbytes,
                          dev_ptr, size, ACL_MEMCPY_DEVICE_TO_HOST)
            outputs.append(output_data)
        # Clean up
        self._destroy_dataset(input_dataset)
        self._destroy_dataset(output_dataset)
        return outputs

    def infer_async(self, inputs: List[np.ndarray],
                    stream: 'Stream') -> List[np.ndarray]:
        """Asynchronous inference (synchronizes internally before returning)"""
        return self.infer(inputs, stream)

    def get_input_info(self) -> List[Dict[str, Any]]:
        """Input metadata"""
        return self.input_info

    def get_output_info(self) -> List[Dict[str, Any]]:
        """Output metadata"""
        return self.output_info

    def __del__(self):
        """Unload the model and release the description"""
        if self.model_id != 0:
            ret = acl.mdl.unload(self.model_id)
            if ret != 0:
                print(f"Warning: Failed to unload model: {ret}")
            self.model_id = 0
        if self.model_desc is not None:
            acl.mdl.destroy_desc(self.model_desc)
            self.model_desc = None


# Usage example
if __name__ == "__main__":
    # Initialize ACL (ACLContext from section 1.2)
    acl_context = ACLContext()
    acl_context.initialize()
    # Create a stream (see the Stream sketch in section 3.2)
    stream_mgr = StreamManager()
    stream = stream_mgr.create_stream()
    # Load the model
    model = ModelInference("resnet50.om")
    # Print the model metadata
    print("Input info:")
    for info in model.get_input_info():
        print(f"  {info['name']}: {info['shape']}")
    print("\nOutput info:")
    for info in model.get_output_info():
        print(f"  {info['name']}: {info['shape']}")
    # Prepare the input
    input_data = np.random.randn(1, 3, 224, 224).astype(np.float32)
    # Run inference (synchronized internally when a stream is passed)
    outputs = model.infer([input_data], stream)
    # Inspect the output
    print(f"\nOutput shape: {outputs[0].shape}")
    print(f"Output max: {outputs[0].max():.4f}")
    print(f"Output min: {outputs[0].min():.4f}")
References
Official resources:
- CANN organization: https://atomgit.com/cann
- acl repository: https://atomgit.com/cann/acl
- Ascend AI developer community: https://www.hiascend.com
- CANN documentation: https://www.hiascend.com/document
Learning resources:
- ACL API reference manual
- Application development guide
- Performance optimization practices
- Error handling and debugging