头文件
cpp
// Standard Library includes
#include <fstream>
#include <iostream>
#include <sstream>
#include <vector>
// CUTLASS includes needed for half-precision GEMM kernel
#include "cutlass/core_io.h"
#include "cutlass/cutlass.h"
#include "cutlass/gemm/device/gemm.h"
#include "cutlass/layout/matrix.h"
//
// CUTLASS utility includes
//
// Defines operator<<() to write TensorView objects to std::ostream
#include "cutlass/util/tensor_view_io.h"
// Defines cutlass::HostTensor<>
#include "cutlass/util/host_tensor.h"
// Defines cutlass::half_t
#include "cutlass/numeric_types.h"
// Defines device_memory::copy_device_to_device()
#include "cutlass/util/device_memory.h"
// Defines cutlass::reference::device::TensorFillRandomGaussian()
#include "cutlass/util/reference/device/tensor_fill.h"
// Defines cutlass::reference::host::TensorEquals()
#include "cutlass/util/reference/host/tensor_compare.h"
// Defines cutlass::reference::host::Gemm()
#include "cutlass/util/reference/host/gemm.h"
#pragma warning(disable : 4503)
kernel定义
cpp
/// Define a CUTLASS GEMM template and launch a GEMM kernel.
///
/// Computes C = alpha * A * B + beta * C, where A (M x K), B (K x N) and
/// C (M x N) are column-major half-precision matrices already resident in
/// device memory ("nn" = neither A nor B is transposed).
/// Returns cudaSuccess, or cudaErrorUnknown if CUTLASS reports any failure.
cudaError_t cutlass_hgemm_nn(int M, int N, int K, cutlass::half_t alpha,
cutlass::half_t const *A,
cutlass::layout::ColumnMajor::Stride::Index lda,
cutlass::half_t const *B,
cutlass::layout::ColumnMajor::Stride::Index ldb,
cutlass::half_t beta, cutlass::half_t *C,
cutlass::layout::ColumnMajor::Stride::Index ldc) {
// Define the GEMM operation; unspecified template parameters (accumulator
// type, architecture, tile shapes, ...) take CUTLASS defaults.
using Gemm =
cutlass::gemm::device::Gemm<cutlass::half_t, // ElementA
cutlass::layout::ColumnMajor, // LayoutA
cutlass::half_t, // ElementB
cutlass::layout::ColumnMajor, // LayoutB
cutlass::half_t, // ElementOutput
cutlass::layout::ColumnMajor // LayoutOutput
>;
Gemm gemm_op;
// Launch the kernel. {C, ldc} appears twice because C serves both as the
// source operand (read for the beta * C term) and as the destination.
cutlass::Status status = gemm_op(
{{M, N, K}, {A, lda}, {B, ldb}, {C, ldc}, {C, ldc}, {alpha, beta}});
// Map any CUTLASS failure onto a generic CUDA error code for the caller.
if (status != cutlass::Status::kSuccess) {
return cudaErrorUnknown;
}
return cudaSuccess;
}
这里大部分的流程都和上一篇一模一样,就不再赘述了
cutlass::half_t
这是cutlass封装的half数据类型,对应cuda的__half
cpp
// Defined elsewhere: `half` is an alias for CUDA's native __half type.
typedef __half half;
/// IEEE half-precision floating-point type
/// (CUTLASS's portable FP16 wrapper; 2-byte aligned like __half).
struct alignas(2) half_t {
//
// Data members
//
/// The FP16 value is stored as its raw 16-bit pattern.
uint16_t storage;
//
// Static conversion operators
// (Many static helpers converting int, float, etc. to half — omitted here.)
//
// Methods
// (Basic constructors and copy functions — omitted here.)
// (Many conversions from half_t to other types exist; only one is shown.)
/// Bitcasts to CUDA's half type
CUTLASS_HOST_DEVICE
half to_half() const {
#if defined(__CUDA_ARCH__)
// On the device: reinterpret the stored 16 bits directly as a __half.
return reinterpret_cast<half const &>(storage);
#else
// On the host: copy the bits into a __half_raw via memcpy (the portable
// way to bit-cast without strict-aliasing issues) and build a half.
__half_raw raw;
std::memcpy(&raw.x, &storage, sizeof(raw.x));
return half(raw);
#endif
}
/// Accesses raw internal state (mutable reference to the bit pattern)
CUTLASS_HOST_DEVICE
uint16_t& raw() {
return storage;
}
/// Accesses raw internal state (read-only copy of the bit pattern)
CUTLASS_HOST_DEVICE
uint16_t raw() const {
return storage;
}
};
// User-defined literal: a numeric literal suffixed with _hf becomes a
// half_t, e.g. cutlass::half_t a = 0.0_hf;
CUTLASS_HOST_DEVICE
cutlass::half_t operator""_hf(long double x) {
// Narrow to float first; half_t provides a float constructor.
return cutlass::half_t(float(x));
}
// Integer form of the _hf literal, e.g. 2_hf.
CUTLASS_HOST_DEVICE
cutlass::half_t operator""_hf(unsigned long long int x) {
// NOTE: narrows through int, so very large literals would overflow.
return cutlass::half_t(int(x));
}
Host端配置
cpp
/// Allocates several matrices in GPU device memory, launches a
/// half-precision CUTLASS GEMM kernel, and verifies the result against a
/// host-side reference GEMM.
cudaError_t TestCutlassGemm(int M, int N, int K, cutlass::half_t alpha,
cutlass::half_t beta) {
cudaError_t result;
//
// Construct cutlass::HostTensor<> objects
//
// HostTensor is a CUTLASS helper class that:
// - allocates memory on both the host and the device
// - represents a 2-D tensor (a matrix)
// - supports explicit host <-> device synchronization
//
// The ColumnMajor layout is used throughout.
//
// M-by-K matrix A (element type half_t)
cutlass::HostTensor<cutlass::half_t, cutlass::layout::ColumnMajor> A(
cutlass::MatrixCoord(M, K));
// K-by-N matrix B
cutlass::HostTensor<cutlass::half_t, cutlass::layout::ColumnMajor> B(
cutlass::MatrixCoord(K, N));
// M-by-N matrix holding the CUTLASS result
cutlass::HostTensor<cutlass::half_t, cutlass::layout::ColumnMajor> C_cutlass(
cutlass::MatrixCoord(M, N));
// M-by-N matrix holding the reference result (computed on the CPU)
cutlass::HostTensor<cutlass::half_t, cutlass::layout::ColumnMajor>
C_reference(cutlass::MatrixCoord(M, N));
//
// Initialize the matrices with random data
//
// Fixed seed so every run produces identical results
uint64_t seed = 2080;
// Gaussian distribution parameters
cutlass::half_t mean = 0.0_hf; // mean
cutlass::half_t stddev = 5.0_hf; // standard deviation
// Controls the fractional part of the random values;
// 0 means the random values are truncated to integers.
int bits_less_than_one = 0;
// Fill matrix A on the GPU with Gaussian-distributed random values
cutlass::reference::device::TensorFillRandomGaussian(
A.device_view(), seed, mean, stddev, bits_less_than_one);
// Fill matrix B on the GPU (derived seed so B differs from A)
cutlass::reference::device::TensorFillRandomGaussian(
B.device_view(), seed * 2019, mean, stddev, bits_less_than_one);
// Fill C_cutlass on the GPU
cutlass::reference::device::TensorFillRandomGaussian(
C_cutlass.device_view(), seed * 1993, mean, stddev, bits_less_than_one);
// Copy C_cutlass into C_reference so that, when beta != 0, the reference
// GEMM starts from the same C operand as the CUTLASS kernel.
cutlass::device_memory::copy_device_to_device(
C_reference.device_data(), C_cutlass.device_data(), C_cutlass.capacity());
// Synchronize the device-side data back into the host-side copy
C_reference.sync_host();
//
// Launch the CUTLASS GEMM kernel
//
result = cutlass_hgemm_nn(
M, N, K, // GEMM problem size
alpha, // alpha
A.device_data(), A.stride(0), // device pointer and leading dimension of A
B.device_data(), B.stride(0), // device pointer of B
beta, // beta
C_cutlass.device_data(), C_cutlass.stride(0) // C output
);
if (result != cudaSuccess) {
return result;
}
//
// Verify the result with a CPU reference implementation
//
// A and B were initialized on the device, so copy them back to the host
// for the CPU computation.
A.sync_host();
B.sync_host();
// Copy the CUTLASS result back to the host
C_cutlass.sync_host();
//
// CUTLASS's host-side reference GEMM
//
cutlass::reference::host::Gemm<
cutlass::half_t, // element type of A
cutlass::layout::ColumnMajor, // layout of A
cutlass::half_t, // element type of B
cutlass::layout::ColumnMajor, // layout of B
cutlass::half_t, // output element type
cutlass::layout::ColumnMajor, // output layout
cutlass::half_t, // alpha type
cutlass::half_t // beta type
> gemm_ref;
// Run the CPU GEMM
gemm_ref(
{M, N, K}, // GEMM problem size
alpha, // alpha
A.host_ref(), // A
B.host_ref(), // B
beta, // beta
C_reference.host_ref() // C
);
//
// Compare the CUTLASS result with the CPU reference result.
// NOTE(review): appears to be an exact element-wise comparison — viable
// only because both paths compute in identical FP16 precision; confirm
// TensorEquals semantics against the CUTLASS sources.
//
if (!cutlass::reference::host::TensorEquals(
C_reference.host_view(),
C_cutlass.host_view())) {
char const *filename = "errors_01_cutlass_utilities.csv";
std::cerr << "错误 - CUTLASS GEMM 结果与参考实现不同。"
<< " 已将结果写入文件: "
<< filename << std::endl;
//
// On mismatch, dump both results to a file for inspection
//
std::ofstream file(filename);
// CUTLASS result
file << "\n\nCUTLASS =\n" << C_cutlass.host_view() << std::endl;
// CPU reference result
file << "\n\nReference =\n" << C_reference.host_view() << std::endl;
return cudaErrorUnknown;
}
// Verification passed
return cudaSuccess;
}
cutlass::HostTensor
源码设计
cpp
/// Host-side tensor: owns paired host and device allocations for a tensor.
template <
/// Data type of the elements stored in the tensor (concept: NumericType)
typename Element_,
/// Defines the mapping from logical coordinates to linear memory
/// addresses (concept: Layout)
typename Layout_
>
class HostTensor {
public:
/// Data type of an individual element
using Element = Element_;
/// Layout type mapping logical coordinates to linear memory
using Layout = Layout_;
/// Logical rank of the tensor's index space
static int const kRank = Layout::kRank;
/// Index type
using Index = typename Layout::Index;
/// Long integer index used for pointer offsets
using LongIndex = typename Layout::LongIndex;
/// Coordinate in the tensor's logical space
using TensorCoord = typename Layout::TensorCoord;
/// Stride vector of the layout
using Stride = typename Layout::Stride;
/// Tensor reference to device (GPU) memory
using TensorRef = TensorRef<Element, Layout>;
/// Tensor reference to constant device memory
using ConstTensorRef = typename TensorRef::ConstTensorRef;
/// Tensor view of device memory
using TensorView = TensorView<Element, Layout>;
/// Tensor view of constant device memory
using ConstTensorView = typename TensorView::ConstTensorView;
/// Reference type to an element of the tensor
using Reference = typename TensorRef::Reference;
/// Constant reference type to an element of the tensor
using ConstReference = typename ConstTensorRef::Reference;
private:
// Select the underlying storage unit type
using StorageUnit =
typename platform::conditional_t<
std::is_same_v<Element, bool>,
uint8_t, // avoid std::vector<bool>'s specialized implementation
typename platform::conditional_t<
sizeof_bits<Element>::value % 8 == 0,
Element, // byte-aligned element size: store Element directly
uint8_t // sub-byte types (e.g. fp4 / int4): store raw bytes
>
>;
using StorageContainerCalculator =
cutlass::detail::StorageContainerCalculator<Element, StorageUnit>;
// Number of bits in one container
static constexpr int kContainerTypeNumBits =
StorageContainerCalculator::kContainerTypeNumBits;
// Number of logical elements held by one container
static constexpr int kContainerTypeNumLogicalElements =
StorageContainerCalculator::kContainerTypeNumLogicalElements;
// Number of bytes occupied by one container
static constexpr int kContainerTypeNumBytes =
StorageContainerCalculator::kContainerTypeNumBytes;
// Number of StorageUnits making up one container
static constexpr int kContainerTypeNumStorageUnit =
StorageContainerCalculator::kContainerTypeNumStorageUnit;
//
// Data members
//
/// Extent of the tensor in each logical dimension
TensorCoord extent_;
/// Layout object (maps coordinates to memory offsets)
Layout layout_;
/// Host-side (CPU) memory allocation
std::vector<StorageUnit> host_;
/// Device-side (GPU) memory allocation
device_memory::allocation<StorageUnit> device_;
/// Converts a logical element count into the number of StorageUnits
/// required, rounding up to whole containers.
size_t count_to_container_storage_unit_count(size_t count) {
return (count + kContainerTypeNumLogicalElements - 1)
/ kContainerTypeNumLogicalElements
* kContainerTypeNumStorageUnit;
}
};
这里从Element_到ConstTensorRef的内容,在上一篇我都介绍过,这里就不再重复
cutlass::TensorView
cpp
/// TensorView = TensorRef (pointer + layout) + extent (shape); knowing the
/// shape enables bounds checks, size queries, and sub-views.
template <
/// Element data type
typename Element_,
/// Mapping from logical coordinates to linear memory addresses
/// (e.g. RowMajor, ColumnMajor)
typename Layout_
>
class TensorView : public TensorRef<Element_, Layout_> {
public:
/// Base class
using Base = cutlass::TensorRef<Element_, Layout_>;
/// Mapping function from logical coordinate to internal n-D array
using Layout = Layout_;
/// TensorRef pointing to constant memory
using ConstTensorRef = typename Base::ConstTensorRef;
/// Underlying TensorRef type
using TensorRef = Base;
/// Element data type
using Element = Element_;
/// Reference to an element
using Reference = Element &;
/// Rank from the Layout; e.g. a 3-D tensor has kRank == 3.
static int const kRank = Layout::kRank;
/// Index type
using Index = typename Layout::Index;
/// Long index used for pointer offsets
using LongIndex = typename Layout::LongIndex;
/// template <
/// int Rank_,
/// typename Index_
/// typename LongIndex_
/// >
/// struct Coord
/// TensorCoord is essentially a Coord
using TensorCoord = typename Layout::TensorCoord;
/// Coordinate in storage n-D array
using Stride = typename Layout::Stride;
/// TensorView over constant elements
using ConstTensorView = TensorView<
typename platform::remove_const<Element>::type const,
Layout>;
/// TensorView over non-constant elements
using NonConstTensorView = TensorView<
typename platform::remove_const<Element>::type,
Layout>;
/// Rank must be at least 1.
/// Mathematically a rank-0 tensor is a scalar, but that degenerate case
/// is awkward to express in C++ (it would require heavy template
/// machinery or zero-length-array support).
static_assert(kRank > 0, "Cannot define a zero-rank TensorRef");
private:
/// Extent (size) of the view in each dimension
TensorCoord extent_;
public:
//
// Methods
//
/// Default-constructs an empty TensorView
CUTLASS_HOST_DEVICE
TensorView() { }
/// Constructs a TensorView from raw parts
CUTLASS_HOST_DEVICE
TensorView(
Element *ptr, ///< pointer to the start of the tensor
Layout const &layout, ///< layout object holding stride / mapping function
TensorCoord const &extent ///< size of the tensor in logical coordinate space
):
Base(ptr, layout), extent_(extent) {
}
/// Constructs a TensorView from an existing TensorRef
CUTLASS_HOST_DEVICE
TensorView(
TensorRef const &ref, ///< pointer + layout referencing a tensor
TensorCoord const &extent ///< logical size of the tensor
):
Base(ref), extent_(extent) {
}
/// Converting constructor from a non-const TensorView
CUTLASS_HOST_DEVICE
TensorView(
NonConstTensorView const &view ///< TensorView over non-const data
):
Base(view), extent_(view.extent_) { }
/// Updates the pointer, layout and extent
CUTLASS_HOST_DEVICE
void reset(Element* ptr, Layout const &layout, TensorCoord const &extent) {
Base::reset(ptr, layout);
this->resize(extent);
}
/// Updates only the pointer
CUTLASS_HOST_DEVICE
void reset(Element* ptr) {
Base::reset(ptr);
}
/// Changes the view's extent without touching pointer or layout
CUTLASS_HOST_DEVICE
void resize(TensorCoord const &extent) {
this->extent_ = extent;
}
/// Returns the extent (size along every dimension)
CUTLASS_HOST_DEVICE
TensorCoord const& extent() const { return extent_; }
/// Returns the size of one dimension
CUTLASS_HOST_DEVICE
Index extent(int dim) const { return extent_.at(dim); }
/// Returns the number of logical elements in the tensor
CUTLASS_HOST_DEVICE
LongIndex size() const {
return extent_.product();
}
/// Returns true if the coordinate lies within the tensor's bounds
CUTLASS_HOST_DEVICE
bool contains(TensorCoord const& coord) const {
CUTLASS_PRAGMA_UNROLL
for (int dim = 0; dim < kRank; ++dim) {
if (!(coord[dim] >= 0 && coord[dim] < extent(dim))) {
return false;
}
}
return true;
}
/// Returns a TensorRef pointing to the first element of the tensor
CUTLASS_HOST_DEVICE
TensorRef ref() const {
return TensorRef(this->data(), this->layout());
}
/// Returns a const TensorRef
CUTLASS_HOST_DEVICE
ConstTensorRef const_ref() const {
return ConstTensorRef(this->data(), this->layout());
}
/// Returns a const TensorView
CUTLASS_HOST_DEVICE
ConstTensorView const_view() const {
return ConstTensorView(const_ref(), extent_);
}
/// Returns a sub-view of this tensor
CUTLASS_HOST_DEVICE
TensorView subview(
TensorCoord extent, ///< size of the sub-view
TensorCoord const& location = TensorCoord() ///< origin of the sub-view within this tensor
) const {
// Clamp the requested extent so the sub-view never extends past the
// parent tensor's bounds.
TensorView result(this->ref(), extent.clamp(extent_ - location));
result.add_coord_offset(location);
return result;
}
/// Returns the number of scalar elements needed to store the tensor
CUTLASS_HOST_DEVICE
size_t capacity() const {
return Base::layout().capacity(extent_);
}
/// Returns a TensorView offset by a given amount
CUTLASS_HOST_DEVICE
TensorView operator+(
TensorCoord const& b ///< offset in the logical coordinate space
) const {
TensorView result(*this);
result.add_pointer_offset(this->offset(b));
return result;
}
/// In-place offset
CUTLASS_HOST_DEVICE
TensorView& operator+=(
TensorCoord const& b
) {
this->add_pointer_offset(this->offset(b));
return *this;
}
/// Returns a TensorView offset in the negative direction
CUTLASS_HOST_DEVICE
TensorView operator-(
TensorCoord const& b
) const {
// NOTE(review): declares a TensorRef here, whereas operator+ above uses
// TensorView; returning it relies on converting TensorRef back to
// TensorView — verify this line against the upstream CUTLASS sources.
TensorRef result(*this);
result.add_pointer_offset(-this->offset(b));
return result;
}
/// In-place negative offset
CUTLASS_HOST_DEVICE
TensorView& operator-=(
TensorCoord const& b
) {
this->add_pointer_offset(-this->offset(b));
return *this;
}
};
////////////////////////////////////////////////////////////////////////////////////////////////////
/// Helper that constructs a TensorView with deduced template arguments
/// (pre-C++17-CTAD convenience, analogous to std::make_pair).
template <
typename Element,
typename Layout
>
CUTLASS_HOST_DEVICE TensorView<Element, Layout> make_TensorView(
Element *ptr,
Layout const &layout,
typename Layout::TensorCoord const &extent) {
return TensorView<Element, Layout>(ptr, layout, extent);
}
这里补充一下上一篇省略的Coord类的一些成员
cpp
/// Gets the index of a given Coord element; the dimension is a template
/// parameter, so the access is resolved at compile time (no bounds check).
template <int Dim>
CUTLASS_HOST_DEVICE Index const& at() const {
return idx[Dim];
}
/// Access via index; may limit unrolling potential
CUTLASS_HOST_DEVICE
Index const& at(int dim) const { return idx[dim]; }
为什么要设计 TensorView 这种类?
cpp
TensorRef = 指针 + layout
TensorView = TensorRef + shape
HostTensor = TensorView + 内存管理
TensorRef 只做一件事:
描述内存地址 + layout
它只包含:
cpp
ptr
layout
例如:
cpp
A[i,j] → ptr + layout(i,j)
但是它不知道:
tensor 有多大,它的形状是什么
TensorView = TensorRef + extent
多了一个:shape
例如:A : 128 x 256
有了具体形状,就可以做更多事情
判断越界
获取大小
子tensor
cutlass::device_memory::allocation
cpp
/// device_memory::allocation<T> is simply an alias for cutlass::DeviceAllocation<T>.
template <typename T>
using allocation = cutlass::DeviceAllocation<T>;
所以我们去找一下DeviceAllocation
cpp
/// Owning RAII wrapper around a CUDA device-memory buffer of T elements
/// ("a GPU-side vector"): handles allocation, release, and copies in every
/// direction (H2D, D2H, D2D).
template <typename T>
class DeviceAllocation {
public:
/// CUDA device-memory deleter (custom delete functor for unique_ptr)
struct deleter {
void operator()(T* ptr) {
// Free the CUDA device memory
cudaError_t cuda_error = (cudaFree(ptr));
// If the free failed
if (cuda_error != cudaSuccess) {
// Deleters run from destructors and must not throw, so the
// exception is commented out and the error is silently swallowed.
// throw cuda_exception("cudaFree() failed", cuda_error);
return; // simply return
}
}
};
public:
//
// Data members
//
/// Number of T elements allocated on the current CUDA device
size_t capacity;
/// Smart pointer owning the device memory
platform::unique_ptr<T, deleter> smart_ptr;
public:
//
// Static methods
//
/// Static helper: number of bytes needed to store `elements` elements
static size_t bytes(size_t elements) {
// Sub-byte element types (e.g. 1-bit / 4-bit quantized types)
if (sizeof_bits<T>::value < 8) {
// How many elements fit in one byte
size_t const kElementsPerByte = 8 / sizeof_bits<T>::value;
// NOTE(review): integer division truncates, so an element count that
// is not a multiple of kElementsPerByte rounds DOWN here.
return elements / kElementsPerByte;
}
else {
// Bytes per element
size_t const kBytesPerElement = sizeof_bits<T>::value / 8;
// Total byte count
return elements * kBytesPerElement;
}
}
public:
//
// Methods
//
/// Default constructor: owns no memory
DeviceAllocation() : capacity(0) {}
/// Allocates `_capacity` elements on the current CUDA device
DeviceAllocation(size_t _capacity) :
smart_ptr(device_memory::allocate<T>(_capacity)), // cudaMalloc under the hood
capacity(_capacity) {}
/// Takes ownership of an existing device-memory pointer
DeviceAllocation(T *ptr, size_t _capacity) :
smart_ptr(ptr), // unique_ptr adopts ptr
capacity(_capacity) {}
/// Copy constructor: deep copy (fresh allocation + device-to-device copy)
DeviceAllocation(DeviceAllocation const &p):
smart_ptr(device_memory::allocate<T>(p.capacity)), // new device allocation
capacity(p.capacity) {
// Copy the source object's data into the new allocation
device_memory::copy_device_to_device(
smart_ptr.get(), // destination
p.get(), // source
capacity);
}
/// Move constructor
DeviceAllocation(DeviceAllocation &&p): capacity(0) {
// Swap resources to implement the move (p is left empty)
std::swap(smart_ptr, p.smart_ptr);
std::swap(capacity, p.capacity);
}
/// Destructor
~DeviceAllocation() {
reset(); // frees the device memory
}
/// Returns the device-memory pointer
T* get() const {
return smart_ptr.get();
}
/// Releases ownership (the memory itself is NOT freed)
T* release() {
capacity = 0;
// unique_ptr gives up ownership
return smart_ptr.release();
}
/// Frees the device memory and resets capacity to zero
void reset() {
capacity = 0;
// unique_ptr::reset() invokes cudaFree via the custom deleter
smart_ptr.reset();
}
/// Frees the old memory and allocates a fresh buffer
void reset(size_t _capacity) {
reset(device_memory::allocate<T>(_capacity), _capacity);
}
/// Frees the old memory and adopts a new pointer
void reset(T* _ptr, size_t _capacity) {
smart_ptr.reset(_ptr); // frees old memory and adopts the new pointer
capacity = _capacity;
}
/// Reallocates the buffer and copies over the old data
void reallocate(size_t new_capacity) {
// Allocate the new device buffer
platform::unique_ptr<T, deleter> new_allocation(
device_memory::allocate<T>(new_capacity));
// Copy the old data into the new buffer
device_memory::copy_device_to_device(
new_allocation.get(), // destination
smart_ptr.get(), // source
std::min(new_capacity, capacity)); // never copy past either buffer
// Swap pointers; the old buffer is freed when new_allocation goes out
// of scope
std::swap(smart_ptr, new_allocation);
// Record the new capacity
std::swap(new_capacity, capacity);
}
/// Returns the element count
size_t size() const {
return capacity;
}
/// Returns the occupied byte count
size_t bytes() const {
return bytes(capacity);
}
/// Returns the device pointer (pointer->member style access)
T* operator->() const {
return smart_ptr.get();
}
/// Returns the deleter object
deleter& get_deleter() {
return smart_ptr.get_deleter();
}
/// Returns the const deleter object
const deleter& get_deleter() const {
return smart_ptr.get_deleter();
}
/// Copy assignment: reallocates only when sizes differ, then deep-copies
DeviceAllocation & operator=(DeviceAllocation const &p) {
// If the capacities differ
if (capacity != p.capacity) {
// Reallocate the buffer
smart_ptr.reset(device_memory::allocate<T>(p.capacity));
capacity = p.capacity;
}
// Copy the data device-to-device
device_memory::copy_device_to_device(
smart_ptr.get(),
p.get(),
capacity);
return *this;
}
/// Move assignment (swaps resources with p)
DeviceAllocation & operator=(DeviceAllocation && p) {
std::swap(smart_ptr, p.smart_ptr);
std::swap(capacity, p.capacity);
return *this;
}
/// Copies a full allocation's worth of elements from device memory
void copy_from_device(T const *ptr) const {
copy_from_device(ptr, capacity);
}
/// Copies `elements` elements from device memory into this allocation
void copy_from_device(T const *ptr, size_t elements) const {
device_memory::copy_device_to_device(
get(),
ptr,
elements);
}
/// Copies this allocation's contents to another device pointer
void copy_to_device(T *ptr) const {
copy_to_device(ptr, capacity);
}
/// Copies `elements` elements from this allocation to device memory
void copy_to_device(T *ptr, size_t elements) const {
device_memory::copy_device_to_device(
ptr,
get(),
elements);
}
/// Copies a full allocation's worth from host memory into this allocation
void copy_from_host(T const *ptr) const {
copy_from_host(ptr, capacity);
}
/// Copies `elements` elements from host memory into this allocation.
/// Despite the name, this calls device_memory::copy_to_device — an H2D
/// copy with this allocation as the destination.
void copy_from_host(T const *ptr, size_t elements) const {
device_memory::copy_to_device(
get(),
ptr,
elements);
}
/// Copies this allocation's contents to host memory
void copy_to_host(T *ptr) const {
copy_to_host(ptr, capacity);
}
/// Copies `elements` elements from this allocation to host memory (D2H)
void copy_to_host(T *ptr, size_t elements) const {
device_memory::copy_to_host(
ptr,
get(),
elements);
}
};
可以把它理解为GPU 版 vector
里面的一些功能仔细拆解,就能回到最原始的cuda内存管理里
cutlass::platform::unique_ptr
cpp
/// CUTLASS's portable re-declaration of std::unique_ptr (unique-ownership
/// smart pointer with a customizable deleter); treat it as the standard one.
template <class T, class Deleter = default_delete<T> >
class unique_ptr;
直接认为是标准库的unique_ptr即可
cutlass::device_memory::allocate
cpp
/// Allocates device memory for `count` elements of T via cudaMalloc.
/// Throws cuda_exception if the allocation fails.
template <typename T>
T* allocate(size_t count = 1) {
T* ptr = 0;
// sizeof_bits supports sub-byte types; NOTE(review): for sub-byte T the
// integer division truncates when count * bits is not a multiple of 8.
size_t bytes = count * sizeof_bits<T>::value / 8;
cudaError_t cuda_error = cudaMalloc((void**)&ptr, bytes);
if (cuda_error != cudaSuccess) {
// Optional trace output for failed allocations
#if (CUTLASS_DEBUG_TRACE_LEVEL > 0)
std::ostringstream os;
os << "cutlass::device_memory::allocate: cudaMalloc failed: bytes=" << bytes;
CUTLASS_TRACE_HOST(os.str());
#endif
throw cuda_exception("Failed to allocate memory", cuda_error);
}
// Optional trace output for successful allocations (higher trace level)
#if (CUTLASS_DEBUG_TRACE_LEVEL > 1)
else {
std::ostringstream os;
os << "cutlass::device_memory::allocate: Successful cudaMalloc: bytes=" << bytes;
CUTLASS_TRACE_HOST(os.str());
}
#endif
return ptr;
}
说白了就是调用cudaMalloc分配内存
cutlass::device_memory::copy_***
cpp
/// Host -> device copy of `count` elements
template <typename T>
void copy_to_device(T* dst, T const* src, size_t count = 1) {
copy(dst, src, count, cudaMemcpyHostToDevice);
}
/// Device -> host copy of `count` elements
template <typename T>
void copy_to_host(T* dst, T const* src, size_t count = 1) {
copy(dst, src, count, cudaMemcpyDeviceToHost);
}
/// Device -> device copy of `count` elements
template <typename T>
void copy_device_to_device(T* dst, T const* src, size_t count = 1) {
copy(dst, src, count, cudaMemcpyDeviceToDevice);
}
/// Host -> host copy of `count` elements
template <typename T>
void copy_host_to_host(T* dst, T const* src, size_t count = 1) {
copy(dst, src, count, cudaMemcpyHostToHost);
}
相信你们猜也能猜出来copy具体是干了什么
cpp
/// Copies `count` elements of T between host/device buffers via cudaMemcpy;
/// throws cuda_exception with a detailed message on failure.
template <typename T>
void copy(T* dst, T const* src, size_t count, cudaMemcpyKind kind) {
// sizeof_bits supports sub-byte element types
size_t bytes = count * sizeof_bits<T>::value / 8;
// For sub-byte types, a small nonzero count can round down to 0 bytes;
// copy at least one byte so the data is not silently dropped.
if (bytes == 0 && count > 0) {
bytes = 1;
}
cudaError_t cuda_error = (cudaMemcpy(dst, src, bytes, kind));
if (cuda_error != cudaSuccess) {
// Build a detailed error message: both pointers, sizes, and the kind
std::ostringstream os;
os << "cutlass::device_memory::copy: cudaMemcpy() failed: "
<< "dst=" << dst << ", src=" << src
<< ", bytes=" << bytes << ", count=" << count;
if (kind == cudaMemcpyHostToDevice) {
os << ", kind=cudaMemcpyHostToDevice";
}
else if (kind == cudaMemcpyDeviceToHost) {
os << ", kind=cudaMemcpyDeviceToHost";
}
else if (kind == cudaMemcpyDeviceToDevice) {
os << ", kind=cudaMemcpyDeviceToDevice";
}
else if (kind == cudaMemcpyHostToHost) {
os << ", kind=cudaMemcpyHostToHost";
}
else if (kind == cudaMemcpyDefault) {
os << ", kind=cudaMemcpyDefault";
}
else {
os << ", kind=Unknown";
}
os << ", error: " << cudaGetErrorString(cuda_error);
throw cuda_exception(os.str().c_str(), cuda_error);
}
}
就是调用了cudaMemcpy
构造函数
cpp
/// Default constructor: empty tensor, no allocations
HostTensor() {}
/// Constructs a tensor given an extent. Assumes a packed layout
HostTensor(
TensorCoord const &extent,
bool device_backed = true ///< if false, only host memory is allocated
) {
this->reset(extent, Layout::packed(extent), device_backed);
}
/// Constructs a tensor given an extent and layout
HostTensor(
TensorCoord const &extent,
Layout const &layout,
bool device_backed = true
) {
this->reset(extent, layout, device_backed);
}
~HostTensor() { }
/// Clears the HostTensor allocation to size/capacity = 0
void reset() {
extent_ = TensorCoord();
// Layout::packed(): the canonical packed layout for the (now empty) extent
layout_ = Layout::packed(extent_);
host_.clear();
device_.reset();
}
这里的Layout::packed在第一篇文章的cutlass::layout::ColumnMajor里有相关介绍
数据访问接口
cpp
/// Returns a pointer to the host-side data
Element * host_data() {
return reinterpret_cast<Element *>(host_.data());
}
/// Returns a host data pointer offset by `ptr_element_offset` elements
Element * host_data_ptr_offset(LongIndex ptr_element_offset) {
return &ReferenceFactory<Element>::get(host_data(), ptr_element_offset);
}
/// Returns a reference to an element in host memory
Reference host_data(LongIndex idx) {
return ReferenceFactory<Element>::get(host_data(), idx);
}
/// Returns a const pointer to the host-side data
Element const * host_data() const {
return reinterpret_cast<Element const *>(host_.data());
}
/// Returns a const host data pointer offset by `ptr_element_offset` elements
Element const * host_data_ptr_offset(LongIndex ptr_element_offset) const {
return &ReferenceFactory<Element>::get(host_data(), ptr_element_offset);
}
/// Returns a const reference to an element in host memory
ConstReference host_data(LongIndex idx) const {
return ReferenceFactory<Element const>::get(host_data(), idx);
}
/// Returns a pointer to the device (GPU) data
Element * device_data() {
return reinterpret_cast<Element *>(device_.get());
}
/// Returns a const pointer to the device (GPU) data
Element const * device_data() const {
return reinterpret_cast<Element const *>(device_.get());
}
/// Returns a device data pointer offset by `ptr_element_offset` elements
Element * device_data_ptr_offset(LongIndex ptr_element_offset) {
return &ReferenceFactory<Element>::get(device_data(), ptr_element_offset);
}
/// Returns a const device data pointer offset by `ptr_element_offset` elements
Element const * device_data_ptr_offset(LongIndex ptr_element_offset) const {
return &ReferenceFactory<Element>::get(device_data(), ptr_element_offset);
}
/// Returns a TensorRef referencing the host data
TensorRef host_ref(LongIndex ptr_element_offset=0) {
return TensorRef(host_data_ptr_offset(ptr_element_offset), layout_);
}
/// Returns a const TensorRef referencing the host data
ConstTensorRef host_ref(LongIndex ptr_element_offset=0) const {
return ConstTensorRef(host_data_ptr_offset(ptr_element_offset), layout_);
}
/// Returns a TensorRef referencing the device data
TensorRef device_ref(LongIndex ptr_element_offset=0) {
return TensorRef(device_data_ptr_offset(ptr_element_offset), layout_);
}
/// Returns a const TensorRef referencing the device data.
/// NOTE(review): constructs TensorRef rather than ConstTensorRef (compare
/// with the const host_ref overload above) — verify this line against the
/// upstream CUTLASS sources.
ConstTensorRef device_ref(LongIndex ptr_element_offset=0) const {
return TensorRef(device_data_ptr_offset(ptr_element_offset), layout_);
}
/// Returns a TensorView (reference + shape) of the host data
TensorView host_view(LongIndex ptr_element_offset=0) {
return TensorView(host_data_ptr_offset(ptr_element_offset), layout_, extent_);
}
/// Returns a const TensorView of the host data
ConstTensorView host_view(LongIndex ptr_element_offset=0) const {
return ConstTensorView(host_data_ptr_offset(ptr_element_offset), layout_, extent_);
}
/// Returns a TensorView of the device data
TensorView device_view(LongIndex ptr_element_offset=0) {
return TensorView(device_data_ptr_offset(ptr_element_offset), layout_, extent_);
}
/// Returns a const TensorView of the device data
ConstTensorView device_view(LongIndex ptr_element_offset=0) const {
return ConstTensorView(device_data_ptr_offset(ptr_element_offset), layout_, extent_);
}
这一段代码其实提供了 三层访问接口:
cpp
HostTensor
│
├── host_data() → Element*
├── host_ref() → TensorRef
└── host_view() → TensorView
device同理
数据在host和device移动
cpp
/// Returns true if device memory is allocated
bool device_backed() const {
return (device_.get() == nullptr) ? false : true;
}
/// Copies data from device to host (no-op when there is no device backing);
/// the element count comes from the device allocation's size.
void sync_host() {
if (device_backed()) {
device_memory::copy_to_host(
host_.data(), device_.get(), device_.size());
}
}
/// Copies data from host to device (no-op when there is no device backing);
/// the element count comes from the host vector's size.
void sync_device() {
if (device_backed()) {
device_memory::copy_to_device(
device_.get(), host_.data(), host_.size());
}
}