Ref
源码
mirage/src/kernel/customized.cc
实现自定义内核算子(KNCustomizedOp)的核心文件,主要负责将内核级图和线程块级图连接起来
Graph::customized() - 自定义算子
cpp
std::vector<DTensor> Graph::customized(std::vector<DTensor> const &inputs,
threadblock::Graph const &bgraph) {
KNOperator *op = create_customized_op(inputs, bgraph);
assert(op != nullptr);
operators.push_back(op);
return op->output_tensors;
}
// Pointer-based overload of Graph::customized.
//
// Parameters:
//   _inputs  Input tensors. A null entry is replaced by
//            DTensor::EMPTY_TENSOR; non-null entries are copied.
//   outputs  Caller-provided array that receives one pointer per output
//            tensor of the new operator. It must have room for every output;
//            the stored pointers refer to elements of the operator's
//            output_tensors, not to copies.
//   bgraph   Thread-block graph passed to create_customized_op. Must not be
//            null.
//
// Returns the number of pointers written to `outputs`.
int Graph::customized(std::vector<DTensor const *> _inputs,
                      DTensor **outputs,
                      mirage::threadblock::Graph const *bgraph) {
  // bgraph is dereferenced below; fail loudly instead of invoking undefined
  // behavior on a null pointer.
  assert(bgraph != nullptr);

  std::vector<DTensor> inputs;
  inputs.reserve(_inputs.size());
  for (DTensor const *t : _inputs) {
    inputs.push_back(t == nullptr ? DTensor::EMPTY_TENSOR : *t);
  }

  KNOperator *op = create_customized_op(inputs, *bgraph);
  assert(op != nullptr);
  operators.push_back(op);

  size_t const num_outputs = op->output_tensors.size();
  for (size_t i = 0; i < num_outputs; i++) {
    outputs[i] = &op->output_tensors[i];
  }
  // The interface returns int; make the narrowing from size_t explicit.
  return static_cast<int>(num_outputs);
}
mirage/src/transpiler/transpile.cc
mirage/src/kernel/chunk.cc
chunk算子用于将给定张量沿指定维度进行切分。注意:这里似乎只描述了chunk的行为,具体实现可能是由Transpiler代码生成器翻译成CUDA代码。
算子创建
cpp
// Validates the arguments and constructs a KNChunkOp for `input`.
//
// Returns nullptr when any of the following holds (checked in this order):
//   - `dim` is outside [0, input.num_dims) or `chunk_size` is not positive;
//   - input.dim[dim] is not divisible by `chunk_size`;
//   - can_allocate(input) reports false.
// Otherwise returns a newly allocated KNChunkOp; this function does not add
// it to the graph's operator list.
KNOperator *
Graph::create_chunk_op(DTensor const &input, int chunk_size, int dim) {
  bool const dim_in_range = (0 <= dim) && (dim < input.num_dims);
  if (!dim_in_range || chunk_size < 1) {
    return nullptr;
  }

  // chunk_size is known to be positive here, so the modulo is well defined.
  bool const divides_evenly = (input.dim[dim] % chunk_size) == 0;
  if (!divides_evenly) {
    return nullptr;
  }

  if (!this->can_allocate(input)) {
    return nullptr;
  }

  return new KNChunkOp(this, input, chunk_size, dim);
}
运行chunk算子
cpp
// Creates a chunk operator for `input` via create_chunk_op, appends it to
// this graph's operator list, and returns the operator's output tensors.
//
// Both a null operator (rejected arguments) and an operator with no outputs
// are treated as programming errors and trip an assertion.
std::vector<DTensor>
Graph::chunk(DTensor const &input, int chunk_size, int dim) {
  KNOperator *const chunk_op = create_chunk_op(input, chunk_size, dim);
  assert(chunk_op != nullptr);

  operators.push_back(chunk_op);

  auto const &results = chunk_op->output_tensors;
  assert(results.size() > 0);
  return results;
}
创建chunk算子并将其加入算子列表operators,然后返回该算子的输出张量(output_tensors);从这段代码看,此处只是构建计算图,并没有真正执行该算子