头文件
cpp
// Standard Library includes
#include <fstream>
#include <iostream>
#include <sstream>
#include <vector>
// CUTLASS includes needed for half-precision GEMM kernel
#include "cutlass/core_io.h"
#include "cutlass/cutlass.h"
#include "cutlass/gemm/device/gemm.h"
#include "cutlass/layout/matrix.h"
//
// CUTLASS utility includes
//
// Defines operator<<() to write TensorView objects to std::ostream
#include "cutlass/util/tensor_view_io.h"
// Defines cutlass::HostTensor<>
#include "cutlass/util/host_tensor.h"
// Defines cutlass::half_t
#include "cutlass/numeric_types.h"
// Defines device_memory::copy_device_to_device()
#include "cutlass/util/device_memory.h"
// Defines cutlass::reference::device::TensorFillRandomGaussian()
#include "cutlass/util/reference/device/tensor_fill.h"
// Defines cutlass::reference::host::TensorEquals()
#include "cutlass/util/reference/host/tensor_compare.h"
// Defines cutlass::reference::host::Gemm()
#include "cutlass/util/reference/host/gemm.h"
#pragma warning(disable : 4503)
kernel定义
cpp
/// Define a CUTLASS GEMM template and launch a GEMM kernel.
///
/// Computes C = alpha * A * B + beta * C, where A (M x K), B (K x N) and
/// C (M x N) are column-major half-precision matrices already resident in
/// device memory ("nn" = neither A nor B is transposed).
/// Returns cudaSuccess, or cudaErrorUnknown if CUTLASS reports any failure.
cudaError_t cutlass_hgemm_nn(int M, int N, int K, cutlass::half_t alpha,
cutlass::half_t const *A,
cutlass::layout::ColumnMajor::Stride::Index lda,
cutlass::half_t const *B,
cutlass::layout::ColumnMajor::Stride::Index ldb,
cutlass::half_t beta, cutlass::half_t *C,
cutlass::layout::ColumnMajor::Stride::Index ldc) {
// Define the GEMM operation; unspecified template parameters (accumulator
// type, architecture, tile shapes, ...) take CUTLASS defaults.
using Gemm =
cutlass::gemm::device::Gemm<cutlass::half_t, // ElementA
cutlass::layout::ColumnMajor, // LayoutA
cutlass::half_t, // ElementB
cutlass::layout::ColumnMajor, // LayoutB
cutlass::half_t, // ElementOutput
cutlass::layout::ColumnMajor // LayoutOutput
>;
Gemm gemm_op;
// Launch the kernel. {C, ldc} appears twice because C serves both as the
// source operand (read for the beta * C term) and as the destination.
cutlass::Status status = gemm_op(
{{M, N, K}, {A, lda}, {B, ldb}, {C, ldc}, {C, ldc}, {alpha, beta}});
// Map any CUTLASS failure onto a generic CUDA error code for the caller.
if (status != cutlass::Status::kSuccess) {
return cudaErrorUnknown;
}
return cudaSuccess;
}
这里大部分的流程都和上一篇一模一样,就不再赘述了
cutlass::half_t
这是cutlass封装的half数据类型,对应cuda的__half
cpp
// Defined elsewhere: `half` is an alias for CUDA's native __half type.
typedef __half half;
/// IEEE half-precision floating-point type
/// (CUTLASS's portable FP16 wrapper; 2-byte aligned like __half).
struct alignas(2) half_t {
//
// Data members
//
/// The FP16 value is stored as its raw 16-bit pattern.
uint16_t storage;
//
// Static conversion operators
// (Many static helpers converting int, float, etc. to half — omitted here.)
//
// Methods
// (Basic constructors and copy functions — omitted here.)
// (Many conversions from half_t to other types exist; only one is shown.)
/// Bitcasts to CUDA's half type
CUTLASS_HOST_DEVICE
half to_half() const {
#if defined(__CUDA_ARCH__)
// On the device: reinterpret the stored 16 bits directly as a __half.
return reinterpret_cast<half const &>(storage);
#else
// On the host: copy the bits into a __half_raw via memcpy (the portable
// way to bit-cast without strict-aliasing issues) and build a half.
__half_raw raw;
std::memcpy(&raw.x, &storage, sizeof(raw.x));
return half(raw);
#endif
}
/// Accesses raw internal state (mutable reference to the bit pattern)
CUTLASS_HOST_DEVICE
uint16_t& raw() {
return storage;
}
/// Accesses raw internal state (read-only copy of the bit pattern)
CUTLASS_HOST_DEVICE
uint16_t raw() const {
return storage;
}
};
// User-defined literal: a numeric literal suffixed with _hf becomes a
// half_t, e.g. cutlass::half_t a = 0.0_hf;
CUTLASS_HOST_DEVICE
cutlass::half_t operator""_hf(long double x) {
// Narrow to float first; half_t provides a float constructor.
return cutlass::half_t(float(x));
}
// Integer form of the _hf literal, e.g. 2_hf.
CUTLASS_HOST_DEVICE
cutlass::half_t operator""_hf(unsigned long long int x) {
// NOTE: narrows through int, so very large literals would overflow.
return cutlass::half_t(int(x));
}
Host端配置
cpp
/// Allocates several matrices in GPU device memory, launches a
/// half-precision CUTLASS GEMM kernel, and verifies the result against a
/// host-side reference GEMM.
cudaError_t TestCutlassGemm(int M, int N, int K, cutlass::half_t alpha,
cutlass::half_t beta) {
cudaError_t result;
//
// Construct cutlass::HostTensor<> objects
//
// HostTensor is a CUTLASS helper class that:
// - allocates memory on both the host and the device
// - represents a 2-D tensor (a matrix)
// - supports explicit host <-> device synchronization
//
// The ColumnMajor layout is used throughout.
//
// M-by-K matrix A (element type half_t)
cutlass::HostTensor<cutlass::half_t, cutlass::layout::ColumnMajor> A(
cutlass::MatrixCoord(M, K));
// K-by-N matrix B
cutlass::HostTensor<cutlass::half_t, cutlass::layout::ColumnMajor> B(
cutlass::MatrixCoord(K, N));
// M-by-N matrix holding the CUTLASS result
cutlass::HostTensor<cutlass::half_t, cutlass::layout::ColumnMajor> C_cutlass(
cutlass::MatrixCoord(M, N));
// M-by-N matrix holding the reference result (computed on the CPU)
cutlass::HostTensor<cutlass::half_t, cutlass::layout::ColumnMajor>
C_reference(cutlass::MatrixCoord(M, N));
//
// Initialize the matrices with random data
//
// Fixed seed so every run produces identical results
uint64_t seed = 2080;
// Gaussian distribution parameters
cutlass::half_t mean = 0.0_hf; // mean
cutlass::half_t stddev = 5.0_hf; // standard deviation
// Controls the fractional part of the random values;
// 0 means the random values are truncated to integers.
int bits_less_than_one = 0;
// Fill matrix A on the GPU with Gaussian-distributed random values
cutlass::reference::device::TensorFillRandomGaussian(
A.device_view(), seed, mean, stddev, bits_less_than_one);
// Fill matrix B on the GPU (derived seed so B differs from A)
cutlass::reference::device::TensorFillRandomGaussian(
B.device_view(), seed * 2019, mean, stddev, bits_less_than_one);
// Fill C_cutlass on the GPU
cutlass::reference::device::TensorFillRandomGaussian(
C_cutlass.device_view(), seed * 1993, mean, stddev, bits_less_than_one);
// Copy C_cutlass into C_reference so that, when beta != 0, the reference
// GEMM starts from the same C operand as the CUTLASS kernel.
cutlass::device_memory::copy_device_to_device(
C_reference.device_data(), C_cutlass.device_data(), C_cutlass.capacity());
// Synchronize the device-side data back into the host-side copy
C_reference.sync_host();
//
// Launch the CUTLASS GEMM kernel
//
result = cutlass_hgemm_nn(
M, N, K, // GEMM problem size
alpha, // alpha
A.device_data(), A.stride(0), // device pointer and leading dimension of A
B.device_data(), B.stride(0), // device pointer of B
beta, // beta
C_cutlass.device_data(), C_cutlass.stride(0) // C output
);
if (result != cudaSuccess) {
return result;
}
//
// Verify the result with a CPU reference implementation
//
// A and B were initialized on the device, so copy them back to the host
// for the CPU computation.
A.sync_host();
B.sync_host();
// Copy the CUTLASS result back to the host
C_cutlass.sync_host();
//
// CUTLASS's host-side reference GEMM
//
cutlass::reference::host::Gemm<
cutlass::half_t, // element type of A
cutlass::layout::ColumnMajor, // layout of A
cutlass::half_t, // element type of B
cutlass::layout::ColumnMajor, // layout of B
cutlass::half_t, // output element type
cutlass::layout::ColumnMajor, // output layout
cutlass::half_t, // alpha type
cutlass::half_t // beta type
> gemm_ref;
// Run the CPU GEMM
gemm_ref(
{M, N, K}, // GEMM problem size
alpha, // alpha
A.host_ref(), // A
B.host_ref(), // B
beta, // beta
C_reference.host_ref() // C
);
//
// Compare the CUTLASS result with the CPU reference result.
// NOTE(review): appears to be an exact element-wise comparison — viable
// only because both paths compute in identical FP16 precision; confirm
// TensorEquals semantics against the CUTLASS sources.
//
if (!cutlass::reference::host::TensorEquals(
C_reference.host_view(),
C_cutlass.host_view())) {
char const *filename = "errors_01_cutlass_utilities.csv";
std::cerr << "错误 - CUTLASS GEMM 结果与参考实现不同。"
<< " 已将结果写入文件: "
<< filename << std::endl;
//
// On mismatch, dump both results to a file for inspection
//
std::ofstream file(filename);
// CUTLASS result
file << "\n\nCUTLASS =\n" << C_cutlass.host_view() << std::endl;
// CPU reference result
file << "\n\nReference =\n" << C_reference.host_view() << std::endl;
return cudaErrorUnknown;
}
// Verification passed
return cudaSuccess;
}
cutlass::HostTensor
源码设计
cpp
/// Host-side tensor: owns paired host and device allocations for a tensor.
template <
/// Data type of the elements stored in the tensor (concept: NumericType)
typename Element_,
/// Defines the mapping from logical coordinates to linear memory
/// addresses (concept: Layout)
typename Layout_
>
class HostTensor {
public:
/// Data type of an individual element
using Element = Element_;
/// Layout type mapping logical coordinates to linear memory
using Layout = Layout_;
/// Logical rank of the tensor's index space
static int const kRank = Layout::kRank;
/// Index type
using Index = typename Layout::Index;
/// Long integer index used for pointer offsets
using LongIndex = typename Layout::LongIndex;
/// Coordinate in the tensor's logical space
using TensorCoord = typename Layout::TensorCoord;
/// Stride vector of the layout
using Stride = typename Layout::Stride;
/// Tensor reference to device (GPU) memory
using TensorRef = TensorRef<Element, Layout>;
/// Tensor reference to constant device memory
using ConstTensorRef = typename TensorRef::ConstTensorRef;
/// Tensor view of device memory
using TensorView = TensorView<Element, Layout>;
/// Tensor view of constant device memory
using ConstTensorView = typename TensorView::ConstTensorView;
/// Reference type to an element of the tensor
using Reference = typename TensorRef::Reference;
/// Constant reference type to an element of the tensor
using ConstReference = typename ConstTensorRef::Reference;
private:
// Select the underlying storage unit type
using StorageUnit =
typename platform::conditional_t<
std::is_same_v<Element, bool>,
uint8_t, // avoid std::vector<bool>'s specialized implementation
typename platform::conditional_t<
sizeof_bits<Element>::value % 8 == 0,
Element, // byte-aligned element size: store Element directly
uint8_t // sub-byte types (e.g. fp4 / int4): store raw bytes
>
>;
using StorageContainerCalculator =
cutlass::detail::StorageContainerCalculator<Element, StorageUnit>;
// Number of bits in one container
static constexpr int kContainerTypeNumBits =
StorageContainerCalculator::kContainerTypeNumBits;
// Number of logical elements held by one container
static constexpr int kContainerTypeNumLogicalElements =
StorageContainerCalculator::kContainerTypeNumLogicalElements;
// Number of bytes occupied by one container
static constexpr int kContainerTypeNumBytes =
StorageContainerCalculator::kContainerTypeNumBytes;
// Number of StorageUnits making up one container
static constexpr int kContainerTypeNumStorageUnit =
StorageContainerCalculator::kContainerTypeNumStorageUnit;
//
// Data members
//
/// Extent of the tensor in each logical dimension
TensorCoord extent_;
/// Layout object (maps coordinates to memory offsets)
Layout layout_;
/// Host-side (CPU) memory allocation
std::vector<StorageUnit> host_;
/// Device-side (GPU) memory allocation
device_memory::allocation<StorageUnit> device_;
/// Converts a logical element count into the number of StorageUnits
/// required, rounding up to whole containers.
size_t count_to_container_storage_unit_count(size_t count) {
return (count + kContainerTypeNumLogicalElements - 1)
/ kContainerTypeNumLogicalElements
* kContainerTypeNumStorageUnit;
}
};
这里从Element_到ConstTensorRef的内容,在上一篇我都介绍过,这里就不再重复
cutlass::TensorView
cpp
/// TensorView = TensorRef (pointer + layout) + extent (shape); knowing the
/// shape enables bounds checks, size queries, and sub-views.
template <
/// Element data type
typename Element_,
/// Mapping from logical coordinates to linear memory addresses
/// (e.g. RowMajor, ColumnMajor)
typename Layout_
>
class TensorView : public TensorRef<Element_, Layout_> {
public:
/// Base class
using Base = cutlass::TensorRef<Element_, Layout_>;
/// Mapping function from logical coordinate to internal n-D array
using Layout = Layout_;
/// TensorRef pointing to constant memory
using ConstTensorRef = typename Base::ConstTensorRef;
/// Underlying TensorRef type
using TensorRef = Base;
/// Element data type
using Element = Element_;
/// Reference to an element
using Reference = Element &;
/// Rank from the Layout; e.g. a 3-D tensor has kRank == 3.
static int const kRank = Layout::kRank;
/// Index type
using Index = typename Layout::Index;
/// Long index used for pointer offsets
using LongIndex = typename Layout::LongIndex;
/// template <
/// int Rank_,
/// typename Index_
/// typename LongIndex_
/// >
/// struct Coord
/// TensorCoord is essentially a Coord
using TensorCoord = typename Layout::TensorCoord;
/// Coordinate in storage n-D array
using Stride = typename Layout::Stride;
/// TensorView over constant elements
using ConstTensorView = TensorView<
typename platform::remove_const<Element>::type const,
Layout>;
/// TensorView over non-constant elements
using NonConstTensorView = TensorView<
typename platform::remove_const<Element>::type,
Layout>;
/// Rank must be at least 1.
/// Mathematically a rank-0 tensor is a scalar, but that degenerate case
/// is awkward to express in C++ (it would require heavy template
/// machinery or zero-length-array support).
static_assert(kRank > 0, "Cannot define a zero-rank TensorRef");
private:
/// Extent (size) of the view in each dimension
TensorCoord extent_;
public:
//
// Methods
//
/// Default-constructs an empty TensorView
CUTLASS_HOST_DEVICE
TensorView() { }
/// Constructs a TensorView from raw parts
CUTLASS_HOST_DEVICE
TensorView(
Element *ptr, ///< pointer to the start of the tensor
Layout const &layout, ///< layout object holding stride / mapping function
TensorCoord const &extent ///< size of the tensor in logical coordinate space
):
Base(ptr, layout), extent_(extent) {
}
/// Constructs a TensorView from an existing TensorRef
CUTLASS_HOST_DEVICE
TensorView(
TensorRef const &ref, ///< pointer + layout referencing a tensor
TensorCoord const &extent ///< logical size of the tensor
):
Base(ref), extent_(extent) {
}
/// Converting constructor from a non-const TensorView
CUTLASS_HOST_DEVICE
TensorView(
NonConstTensorView const &view ///< TensorView over non-const data
):
Base(view), extent_(view.extent_) { }
/// Updates the pointer, layout and extent
CUTLASS_HOST_DEVICE
void reset(Element* ptr, Layout const &layout, TensorCoord const &extent) {
Base::reset(ptr, layout);
this->resize(extent);
}
/// Updates only the pointer
CUTLASS_HOST_DEVICE
void reset(Element* ptr) {
Base::reset(ptr);
}
/// Changes the view's extent without touching pointer or layout
CUTLASS_HOST_DEVICE
void resize(TensorCoord const &extent) {
this->extent_ = extent;
}
/// Returns the extent (size along every dimension)
CUTLASS_HOST_DEVICE
TensorCoord const& extent() const { return extent_; }
/// Returns the size of one dimension
CUTLASS_HOST_DEVICE
Index extent(int dim) const { return extent_.at(dim); }
/// Returns the number of logical elements in the tensor
CUTLASS_HOST_DEVICE
LongIndex size() const {
return extent_.product();
}
/// Returns true if the coordinate lies within the tensor's bounds
CUTLASS_HOST_DEVICE
bool contains(TensorCoord const& coord) const {
CUTLASS_PRAGMA_UNROLL
for (int dim = 0; dim < kRank; ++dim) {
if (!(coord[dim] >= 0 && coord[dim] < extent(dim))) {
return false;
}
}
return true;
}
/// Returns a TensorRef pointing to the first element of the tensor
CUTLASS_HOST_DEVICE
TensorRef ref() const {
return TensorRef(this->data(), this->layout());
}
/// Returns a const TensorRef
CUTLASS_HOST_DEVICE
ConstTensorRef const_ref() const {
return ConstTensorRef(this->data(), this->layout());
}
/// Returns a const TensorView
CUTLASS_HOST_DEVICE
ConstTensorView const_view() const {
return ConstTensorView(const_ref(), extent_);
}
/// Returns a sub-view of this tensor
CUTLASS_HOST_DEVICE
TensorView subview(
TensorCoord extent, ///< size of the sub-view
TensorCoord const& location = TensorCoord() ///< origin of the sub-view within this tensor
) const {
// Clamp the requested extent so the sub-view never extends past the
// parent tensor's bounds.
TensorView result(this->ref(), extent.clamp(extent_ - location));
result.add_coord_offset(location);
return result;
}
/// Returns the number of scalar elements needed to store the tensor
CUTLASS_HOST_DEVICE
size_t capacity() const {
return Base::layout().capacity(extent_);
}
/// Returns a TensorView offset by a given amount
CUTLASS_HOST_DEVICE
TensorView operator+(
TensorCoord const& b ///< offset in the logical coordinate space
) const {
TensorView result(*this);
result.add_pointer_offset(this->offset(b));
return result;
}
/// In-place offset
CUTLASS_HOST_DEVICE
TensorView& operator+=(
TensorCoord const& b
) {
this->add_pointer_offset(this->offset(b));
return *this;
}
/// Returns a TensorView offset in the negative direction
CUTLASS_HOST_DEVICE
TensorView operator-(
TensorCoord const& b
) const {
// NOTE(review): declares a TensorRef here, whereas operator+ above uses
// TensorView; returning it relies on converting TensorRef back to
// TensorView — verify this line against the upstream CUTLASS sources.
TensorRef result(*this);
result.add_pointer_offset(-this->offset(b));
return result;
}
/// In-place negative offset
CUTLASS_HOST_DEVICE
TensorView& operator-=(
TensorCoord const& b
) {
this->add_pointer_offset(-this->offset(b));
return *this;
}
};
////////////////////////////////////////////////////////////////////////////////////////////////////
/// Helper that constructs a TensorView with deduced template arguments
/// (pre-C++17-CTAD convenience, analogous to std::make_pair).
template <
typename Element,
typename Layout
>
CUTLASS_HOST_DEVICE TensorView<Element, Layout> make_TensorView(
Element *ptr,
Layout const &layout,
typename Layout::TensorCoord const &extent) {
return TensorView<Element, Layout>(ptr, layout, extent);
}
这里补充一下上一篇省略的Coord类的一些成员
cpp
/// Gets the index of a given Coord element; the dimension is a template
/// parameter, so the access is resolved at compile time (no bounds check).
template <int Dim>
CUTLASS_HOST_DEVICE Index const& at() const {
return idx[Dim];
}
/// Access via index; may limit unrolling potential
CUTLASS_HOST_DEVICE
Index const& at(int dim) const { return idx[dim]; }
为什么要设计 TensorView 这种类?
cpp
TensorRef = 指针 + layout
TensorView = TensorRef + shape
HostTensor = TensorView + 内存管理
TensorRef 只做一件事:
描述内存地址 + layout
它只包含:
cpp
ptr
layout
例如:
cpp
A[i,j] → ptr + layout(i,j)
但是它不知道:
tensor 有多大,它的形状是什么
TensorView = TensorRef + extent
多了一个:shape
例如:A : 128 x 256
有了具体形状,就可以做更多事情
判断越界
获取大小
子tensor
cutlass::device_memory::allocation
cpp
/// device_memory::allocation<T> is simply an alias for cutlass::DeviceAllocation<T>.
template <typename T>
using allocation = cutlass::DeviceAllocation<T>;
所以我们去找一下DeviceAllocation
cpp
/// Owning RAII wrapper around a CUDA device-memory buffer of T elements
/// ("a GPU-side vector"): handles allocation, release, and copies in every
/// direction (H2D, D2H, D2D).
template <typename T>
class DeviceAllocation {
public:
/// CUDA device-memory deleter (custom delete functor for unique_ptr)
struct deleter {
void operator()(T* ptr) {
// Free the CUDA device memory
cudaError_t cuda_error = (cudaFree(ptr));
// If the free failed
if (cuda_error != cudaSuccess) {
// Deleters run from destructors and must not throw, so the
// exception is commented out and the error is silently swallowed.
// throw cuda_exception("cudaFree() failed", cuda_error);
return; // simply return
}
}
};
public:
//
// Data members
//
/// Number of T elements allocated on the current CUDA device
size_t capacity;
/// Smart pointer owning the device memory
platform::unique_ptr<T, deleter> smart_ptr;
public:
//
// Static methods
//
/// Static helper: number of bytes needed to store `elements` elements
static size_t bytes(size_t elements) {
// Sub-byte element types (e.g. 1-bit / 4-bit quantized types)
if (sizeof_bits<T>::value < 8) {
// How many elements fit in one byte
size_t const kElementsPerByte = 8 / sizeof_bits<T>::value;
// NOTE(review): integer division truncates, so an element count that
// is not a multiple of kElementsPerByte rounds DOWN here.
return elements / kElementsPerByte;
}
else {
// Bytes per element
size_t const kBytesPerElement = sizeof_bits<T>::value / 8;
// Total byte count
return elements * kBytesPerElement;
}
}
public:
//
// Methods
//
/// Default constructor: owns no memory
DeviceAllocation() : capacity(0) {}
/// Allocates `_capacity` elements on the current CUDA device
DeviceAllocation(size_t _capacity) :
smart_ptr(device_memory::allocate<T>(_capacity)), // cudaMalloc under the hood
capacity(_capacity) {}
/// Takes ownership of an existing device-memory pointer
DeviceAllocation(T *ptr, size_t _capacity) :
smart_ptr(ptr), // unique_ptr adopts ptr
capacity(_capacity) {}
/// Copy constructor: deep copy (fresh allocation + device-to-device copy)
DeviceAllocation(DeviceAllocation const &p):
smart_ptr(device_memory::allocate<T>(p.capacity)), // new device allocation
capacity(p.capacity) {
// Copy the source object's data into the new allocation
device_memory::copy_device_to_device(
smart_ptr.get(), // destination
p.get(), // source
capacity);
}
/// Move constructor
DeviceAllocation(DeviceAllocation &&p): capacity(0) {
// Swap resources to implement the move (p is left empty)
std::swap(smart_ptr, p.smart_ptr);
std::swap(capacity, p.capacity);
}
/// Destructor
~DeviceAllocation() {
reset(); // frees the device memory
}
/// Returns the device-memory pointer
T* get() const {
return smart_ptr.get();
}
/// Releases ownership (the memory itself is NOT freed)
T* release() {
capacity = 0;
// unique_ptr gives up ownership
return smart_ptr.release();
}
/// Frees the device memory and resets capacity to zero
void reset() {
capacity = 0;
// unique_ptr::reset() invokes cudaFree via the custom deleter
smart_ptr.reset();
}
/// Frees the old memory and allocates a fresh buffer
void reset(size_t _capacity) {
reset(device_memory::allocate<T>(_capacity), _capacity);
}
/// Frees the old memory and adopts a new pointer
void reset(T* _ptr, size_t _capacity) {
smart_ptr.reset(_ptr); // frees old memory and adopts the new pointer
capacity = _capacity;
}
/// Reallocates the buffer and copies over the old data
void reallocate(size_t new_capacity) {
// Allocate the new device buffer
platform::unique_ptr<T, deleter> new_allocation(
device_memory::allocate<T>(new_capacity));
// Copy the old data into the new buffer
device_memory::copy_device_to_device(
new_allocation.get(), // destination
smart_ptr.get(), // source
std::min(new_capacity, capacity)); // never copy past either buffer
// Swap pointers; the old buffer is freed when new_allocation goes out
// of scope
std::swap(smart_ptr, new_allocation);
// Record the new capacity
std::swap(new_capacity, capacity);
}
/// Returns the element count
size_t size() const {
return capacity;
}
/// Returns the occupied byte count
size_t bytes() const {
return bytes(capacity);
}
/// Returns the device pointer (pointer->member style access)
T* operator->() const {
return smart_ptr.get();
}
/// Returns the deleter object
deleter& get_deleter() {
return smart_ptr.get_deleter();
}
/// Returns the const deleter object
const deleter& get_deleter() const {
return smart_ptr.get_deleter();
}
/// Copy assignment: reallocates only when sizes differ, then deep-copies
DeviceAllocation & operator=(DeviceAllocation const &p) {
// If the capacities differ
if (capacity != p.capacity) {
// Reallocate the buffer
smart_ptr.reset(device_memory::allocate<T>(p.capacity));
capacity = p.capacity;
}
// Copy the data device-to-device
device_memory::copy_device_to_device(
smart_ptr.get(),
p.get(),
capacity);
return *this;
}
/// Move assignment (swaps resources with p)
DeviceAllocation & operator=(DeviceAllocation && p) {
std::swap(smart_ptr, p.smart_ptr);
std::swap(capacity, p.capacity);
return *this;
}
/// Copies a full allocation's worth of elements from device memory
void copy_from_device(T const *ptr) const {
copy_from_device(ptr, capacity);
}
/// Copies `elements` elements from device memory into this allocation
void copy_from_device(T const *ptr, size_t elements) const {
device_memory::copy_device_to_device(
get(),
ptr,
elements);
}
/// Copies this allocation's contents to another device pointer
void copy_to_device(T *ptr) const {
copy_to_device(ptr, capacity);
}
/// Copies `elements` elements from this allocation to device memory
void copy_to_device(T *ptr, size_t elements) const {
device_memory::copy_device_to_device(
ptr,
get(),
elements);
}
/// Copies a full allocation's worth from host memory into this allocation
void copy_from_host(T const *ptr) const {
copy_from_host(ptr, capacity);
}
/// Copies `elements` elements from host memory into this allocation.
/// Despite the name, this calls device_memory::copy_to_device — an H2D
/// copy with this allocation as the destination.
void copy_from_host(T const *ptr, size_t elements) const {
device_memory::copy_to_device(
get(),
ptr,
elements);
}
/// Copies this allocation's contents to host memory
void copy_to_host(T *ptr) const {
copy_to_host(ptr, capacity);
}
/// Copies `elements` elements from this allocation to host memory (D2H)
void copy_to_host(T *ptr, size_t elements) const {
device_memory::copy_to_host(
ptr,
get(),
elements);
}
};
可以把它理解为GPU 版 vector
里面的一些功能仔细拆解,就能回到最原始的cuda内存管理里
cutlass::platform::unique_ptr
cpp
/// CUTLASS's portable re-declaration of std::unique_ptr (unique-ownership
/// smart pointer with a customizable deleter); treat it as the standard one.
template <class T, class Deleter = default_delete<T> >
class unique_ptr;
直接认为是标准库的unique_ptr即可
cutlass::device_memory::allocate
cpp
/// Allocates device memory for `count` elements of T via cudaMalloc.
/// Throws cuda_exception if the allocation fails.
template <typename T>
T* allocate(size_t count = 1) {
T* ptr = 0;
// sizeof_bits supports sub-byte types; NOTE(review): for sub-byte T the
// integer division truncates when count * bits is not a multiple of 8.
size_t bytes = count * sizeof_bits<T>::value / 8;
cudaError_t cuda_error = cudaMalloc((void**)&ptr, bytes);
if (cuda_error != cudaSuccess) {
// Optional trace output for failed allocations
#if (CUTLASS_DEBUG_TRACE_LEVEL > 0)
std::ostringstream os;
os << "cutlass::device_memory::allocate: cudaMalloc failed: bytes=" << bytes;
CUTLASS_TRACE_HOST(os.str());
#endif
throw cuda_exception("Failed to allocate memory", cuda_error);
}
// Optional trace output for successful allocations (higher trace level)
#if (CUTLASS_DEBUG_TRACE_LEVEL > 1)
else {
std::ostringstream os;
os << "cutlass::device_memory::allocate: Successful cudaMalloc: bytes=" << bytes;
CUTLASS_TRACE_HOST(os.str());
}
#endif
return ptr;
}
说白了就是调用cudaMalloc分配内存
cutlass::device_memory::copy_***
cpp
/// Host -> device copy of `count` elements
template <typename T>
void copy_to_device(T* dst, T const* src, size_t count = 1) {
copy(dst, src, count, cudaMemcpyHostToDevice);
}
/// Device -> host copy of `count` elements
template <typename T>
void copy_to_host(T* dst, T const* src, size_t count = 1) {
copy(dst, src, count, cudaMemcpyDeviceToHost);
}
/// Device -> device copy of `count` elements
template <typename T>
void copy_device_to_device(T* dst, T const* src, size_t count = 1) {
copy(dst, src, count, cudaMemcpyDeviceToDevice);
}
/// Host -> host copy of `count` elements
template <typename T>
void copy_host_to_host(T* dst, T const* src, size_t count = 1) {
copy(dst, src, count, cudaMemcpyHostToHost);
}
相信你们猜也能猜出来copy具体是干了什么
cpp
/// Copies `count` elements of T between host/device buffers via cudaMemcpy;
/// throws cuda_exception with a detailed message on failure.
template <typename T>
void copy(T* dst, T const* src, size_t count, cudaMemcpyKind kind) {
// sizeof_bits supports sub-byte element types
size_t bytes = count * sizeof_bits<T>::value / 8;
// For sub-byte types, a small nonzero count can round down to 0 bytes;
// copy at least one byte so the data is not silently dropped.
if (bytes == 0 && count > 0) {
bytes = 1;
}
cudaError_t cuda_error = (cudaMemcpy(dst, src, bytes, kind));
if (cuda_error != cudaSuccess) {
// Build a detailed error message: both pointers, sizes, and the kind
std::ostringstream os;
os << "cutlass::device_memory::copy: cudaMemcpy() failed: "
<< "dst=" << dst << ", src=" << src
<< ", bytes=" << bytes << ", count=" << count;
if (kind == cudaMemcpyHostToDevice) {
os << ", kind=cudaMemcpyHostToDevice";
}
else if (kind == cudaMemcpyDeviceToHost) {
os << ", kind=cudaMemcpyDeviceToHost";
}
else if (kind == cudaMemcpyDeviceToDevice) {
os << ", kind=cudaMemcpyDeviceToDevice";
}
else if (kind == cudaMemcpyHostToHost) {
os << ", kind=cudaMemcpyHostToHost";
}
else if (kind == cudaMemcpyDefault) {
os << ", kind=cudaMemcpyDefault";
}
else {
os << ", kind=Unknown";
}
os << ", error: " << cudaGetErrorString(cuda_error);
throw cuda_exception(os.str().c_str(), cuda_error);
}
}
就是调用了cudaMemcpy
构造函数
cpp
/// Default constructor: empty tensor, no allocations
HostTensor() {}
/// Constructs a tensor given an extent. Assumes a packed layout
HostTensor(
TensorCoord const &extent,
bool device_backed = true ///< if false, only host memory is allocated
) {
this->reset(extent, Layout::packed(extent), device_backed);
}
/// Constructs a tensor given an extent and layout
HostTensor(
TensorCoord const &extent,
Layout const &layout,
bool device_backed = true
) {
this->reset(extent, layout, device_backed);
}
~HostTensor() { }
/// Clears the HostTensor allocation to size/capacity = 0
void reset() {
extent_ = TensorCoord();
// Layout::packed(): the canonical packed layout for the (now empty) extent
layout_ = Layout::packed(extent_);
host_.clear();
device_.reset();
}
这里的Layout::packed在第一篇文章的cutlass::layout::ColumnMajor里有相关介绍
数据访问接口
cpp
/// Returns a pointer to the host-side data
Element * host_data() {
return reinterpret_cast<Element *>(host_.data());
}
/// Returns a host data pointer offset by `ptr_element_offset` elements
Element * host_data_ptr_offset(LongIndex ptr_element_offset) {
return &ReferenceFactory<Element>::get(host_data(), ptr_element_offset);
}
/// Returns a reference to an element in host memory
Reference host_data(LongIndex idx) {
return ReferenceFactory<Element>::get(host_data(), idx);
}
/// Returns a const pointer to the host-side data
Element const * host_data() const {
return reinterpret_cast<Element const *>(host_.data());
}
/// Returns a const host data pointer offset by `ptr_element_offset` elements
Element const * host_data_ptr_offset(LongIndex ptr_element_offset) const {
return &ReferenceFactory<Element>::get(host_data(), ptr_element_offset);
}
/// Returns a const reference to an element in host memory
ConstReference host_data(LongIndex idx) const {
return ReferenceFactory<Element const>::get(host_data(), idx);
}
/// Returns a pointer to the device (GPU) data
Element * device_data() {
return reinterpret_cast<Element *>(device_.get());
}
/// Returns a const pointer to the device (GPU) data
Element const * device_data() const {
return reinterpret_cast<Element const *>(device_.get());
}
/// Returns a device data pointer offset by `ptr_element_offset` elements
Element * device_data_ptr_offset(LongIndex ptr_element_offset) {
return &ReferenceFactory<Element>::get(device_data(), ptr_element_offset);
}
/// Returns a const device data pointer offset by `ptr_element_offset` elements
Element const * device_data_ptr_offset(LongIndex ptr_element_offset) const {
return &ReferenceFactory<Element>::get(device_data(), ptr_element_offset);
}
/// Returns a TensorRef referencing the host data
TensorRef host_ref(LongIndex ptr_element_offset=0) {
return TensorRef(host_data_ptr_offset(ptr_element_offset), layout_);
}
/// Returns a const TensorRef referencing the host data
ConstTensorRef host_ref(LongIndex ptr_element_offset=0) const {
return ConstTensorRef(host_data_ptr_offset(ptr_element_offset), layout_);
}
/// Returns a TensorRef referencing the device data
TensorRef device_ref(LongIndex ptr_element_offset=0) {
return TensorRef(device_data_ptr_offset(ptr_element_offset), layout_);
}
/// Returns a const TensorRef referencing the device data.
/// NOTE(review): constructs TensorRef rather than ConstTensorRef (compare
/// with the const host_ref overload above) — verify this line against the
/// upstream CUTLASS sources.
ConstTensorRef device_ref(LongIndex ptr_element_offset=0) const {
return TensorRef(device_data_ptr_offset(ptr_element_offset), layout_);
}
/// Returns a TensorView (reference + shape) of the host data
TensorView host_view(LongIndex ptr_element_offset=0) {
return TensorView(host_data_ptr_offset(ptr_element_offset), layout_, extent_);
}
/// Returns a const TensorView of the host data
ConstTensorView host_view(LongIndex ptr_element_offset=0) const {
return ConstTensorView(host_data_ptr_offset(ptr_element_offset), layout_, extent_);
}
/// Returns a TensorView of the device data
TensorView device_view(LongIndex ptr_element_offset=0) {
return TensorView(device_data_ptr_offset(ptr_element_offset), layout_, extent_);
}
/// Returns a const TensorView of the device data
ConstTensorView device_view(LongIndex ptr_element_offset=0) const {
return ConstTensorView(device_data_ptr_offset(ptr_element_offset), layout_, extent_);
}
这一段代码其实提供了 三层访问接口:
cpp
HostTensor
│
├── host_data() → Element*
├── host_ref() → TensorRef
└── host_view() → TensorView
device同理
数据在host和device移动
cpp
/// Returns true if device memory is allocated
bool device_backed() const {
return (device_.get() == nullptr) ? false : true;
}
/// Copies data from device to host (no-op when there is no device backing);
/// the element count comes from the device allocation's size.
void sync_host() {
if (device_backed()) {
device_memory::copy_to_host(
host_.data(), device_.get(), device_.size());
}
}
/// Copies data from host to device (no-op when there is no device backing);
/// the element count comes from the host vector's size.
void sync_device() {
if (device_backed()) {
device_memory::copy_to_device(
device_.get(), host_.data(), host_.size());
}
}