【Cute Study Notes】print_tensor error

In practice, using print_tensor in cute to print a tensor whose element type is half simply fails to compile:

```cpp
#include <cuda.h>
#include <cuda_fp16.h> // raw half type (also pulled in transitively by the cute headers)
#include <stdlib.h>
#include <cute/tensor.hpp>

/*
    In cute, a Tensor is mostly something you decompose and recombine; those
    operations are transformations of the Layout (a purely logical view of how
    the data is organized), and the underlying storage generally stays untouched.
    Tensor = Layout + storage
*/

// nvcc tensor.cu -arch=sm_89 -std=c++17 -I ../cutlass/include -I ../cutlass/tools/util/include --expt-relaxed-constexpr -cudart shared --cudadevrt none  -DDEBUG

using namespace cute;
using namespace std;

#define PRINT(name, content) \
    print(name);             \
    print(" : ");            \
    print(content);          \
    print("\n");

#define PRINTTENSOR(name, content) \
    print(name);                   \
    print(" : ");                  \
    print_tensor(content);         \
    print("\n");

template<typename T>
__global__ void handle_global_tensor(T *pointer)
{
    // 4x6 row-major view over the global-memory buffer
    auto gshape = make_shape(Int<4>{}, Int<6>{});
    auto gstride = make_stride(Int<6>{}, Int<1>{});
    auto gtensor = make_tensor(make_gmem_ptr(pointer), make_layout(gshape, gstride));
    PRINTTENSOR("global tensor", gtensor);
}

int main()
{
    // register tensor
    // handle_register_tensor<<<1, 1>>>();

    // global memory tensor

    using T = half;

    T *pointer;
    int size = 4 * 6;
    cudaMalloc(&pointer, size * sizeof(T));
    T *cpointer = (T *)malloc(size * sizeof(T));
    for (int i = 0; i < size; i++)
    {
        cpointer[i] = (T)i;
    }
    cudaMemcpy(pointer, cpointer, size * sizeof(T), cudaMemcpyHostToDevice); // sizeof(T), not sizeof(int): half is 2 bytes
    handle_global_tensor<T><<<1, 1>>>(pointer);
    cudaDeviceSynchronize();
    return 0;
}
```
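
The error is a compile-time failure coming out of the printing path, not from the tensor itself. My reading (worth verifying against your CUTLASS version) is that print_tensor ultimately calls cute's print/pretty_print overloads on each element; those overloads cover the built-in arithmetic types and cute's own numeric types such as cute::half_t, but not CUDA's raw __half, so overload resolution fails. A minimal sketch of that boundary, under this assumption and with a hypothetical kernel name touch_tensor:

```cpp
#include <cuda.h>
#include <cuda_fp16.h>
#include <cute/tensor.hpp>

// A tensor of raw __half is fine to build and index; only printing it breaks.
template <typename T>
__global__ void touch_tensor(T *p)
{
    auto t = cute::make_tensor(cute::make_gmem_ptr(p),
                               cute::make_layout(cute::make_shape(cute::Int<4>{}, cute::Int<6>{})));
    t(0, 0) = t(1, 1);         // compiles for T = half: the Tensor itself is type-agnostic
    // cute::print_tensor(t);  // for T = half, this is the line that breaks the build
}

int main()
{
    half *p;
    cudaMalloc(&p, 24 * sizeof(half));
    touch_tensor<half><<<1, 1>>>(p);
    cudaDeviceSynchronize();
    cudaFree(p);
    return 0;
}
```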

If the element type is int or float instead, the tensor prints successfully:

```cpp
#include <cuda.h>
#include <stdlib.h>
#include <cute/tensor.hpp>

/*
    In cute, a Tensor is mostly something you decompose and recombine; those
    operations are transformations of the Layout (a purely logical view of how
    the data is organized), and the underlying storage generally stays untouched.
    Tensor = Layout + storage
*/

// nvcc tensor.cu -arch=sm_89 -std=c++17 -I ../cutlass/include -I ../cutlass/tools/util/include --expt-relaxed-constexpr -cudart shared --cudadevrt none  -DDEBUG

using namespace cute;
using namespace std;

#define PRINT(name, content) \
    print(name);             \
    print(" : ");            \
    print(content);          \
    print("\n");

#define PRINTTENSOR(name, content) \
    print(name);                   \
    print(" : ");                  \
    print_tensor(content);         \
    print("\n");

template<typename T>
__global__ void handle_global_tensor(T *pointer)
{
    auto gshape = make_shape(Int<4>{}, Int<6>{});
    auto gstride = make_stride(Int<6>{}, Int<1>{});
    auto gtensor = make_tensor(make_gmem_ptr(pointer), make_layout(gshape, gstride));
    PRINTTENSOR("global tensor", gtensor);
}

int main()
{
    using T = float;

    T *pointer;
    int size = 4 * 6;
    cudaMalloc(&pointer, size * sizeof(T));
    T *cpointer = (T *)malloc(size * sizeof(T));
    for (int i = 0; i < size; i++)
    {
        cpointer[i] = (T)i;
    }
    cudaMemcpy(pointer, cpointer, size * sizeof(T), cudaMemcpyHostToDevice); // sizeof(T), not sizeof(int)
    handle_global_tensor<T><<<1, 1>>>(pointer);
    cudaDeviceSynchronize();
    return 0;
}
```
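
That int and float work fits the overload story above: they are ordinary arithmetic types that cute's printf-based print helpers already handle. A side note that follows from print_tensor being host/device (a sketch; verify on your CUTLASS version): for such types you can inspect a layout directly from host code, without any kernel launch:

```cpp
#include <cute/tensor.hpp>

int main()
{
    float data[24];
    for (int i = 0; i < 24; ++i)
        data[i] = (float)i;

    // The same 4x6 row-major view as in the kernel, but over host memory
    auto t = cute::make_tensor(&data[0],
                               cute::make_layout(cute::make_shape(cute::Int<4>{}, cute::Int<6>{}),
                                                 cute::make_stride(cute::Int<6>{}, cute::Int<1>{})));
    cute::print_tensor(t);
    return 0;
}
```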

The pleasant surprise: you cannot use half directly, but with cute::half_t instead, the tensor prints fine:

```cpp
#include <cuda.h>
#include <stdlib.h>
#include <cute/tensor.hpp>

/*
    In cute, a Tensor is mostly something you decompose and recombine; those
    operations are transformations of the Layout (a purely logical view of how
    the data is organized), and the underlying storage generally stays untouched.
    Tensor = Layout + storage
*/

// nvcc tensor.cu -arch=sm_89 -std=c++17 -I ../cutlass/include -I ../cutlass/tools/util/include --expt-relaxed-constexpr -cudart shared --cudadevrt none  -DDEBUG

using namespace cute;
using namespace std;

#define PRINT(name, content) \
    print(name);             \
    print(" : ");            \
    print(content);          \
    print("\n");

#define PRINTTENSOR(name, content) \
    print(name);                   \
    print(" : ");                  \
    print_tensor(content);         \
    print("\n");

template<typename T>
__global__ void handle_global_tensor(T *pointer)
{
    auto gshape = make_shape(Int<4>{}, Int<6>{});
    auto gstride = make_stride(Int<6>{}, Int<1>{});
    auto gtensor = make_tensor(make_gmem_ptr(pointer), make_layout(gshape, gstride));
    PRINTTENSOR("global tensor", gtensor);
}

int main()
{
    using T = cute::half_t;
    // using T = half;

    T *pointer;
    int size = 4 * 6;
    cudaMalloc(&pointer, size * sizeof(T));
    T *cpointer = (T *)malloc(size * sizeof(T));
    for (int i = 0; i < size; i++)
    {
        cpointer[i] = (T)i; // explicit cast, matching the earlier listings; half_t's converting constructors may be explicit
    }
    cudaMemcpy(pointer, cpointer, size * sizeof(T), cudaMemcpyHostToDevice);
    handle_global_tensor<T><<<1, 1>>>(pointer);
    cudaDeviceSynchronize();
    return 0;
}
```
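
If the buffer you want to inspect already exists as raw __half (for example, produced by another library), there is no need to copy it into a half_t array. cute::half_t is a 16-bit wrapper with the same storage size as __half, so viewing the pointer through a cast is a common trick; the reinterpret_cast below is my assumption of a safe reinterpretation, so verify it on your CUTLASS version. A sketch reusing handle_global_tensor from the listing above:

```cpp
int main()
{
    half *pointer; // data that already lives as raw __half
    int size = 4 * 6;
    cudaMalloc(&pointer, size * sizeof(half));

    half *cpointer = (half *)malloc(size * sizeof(half));
    for (int i = 0; i < size; i++)
        cpointer[i] = __float2half((float)i);
    cudaMemcpy(pointer, cpointer, size * sizeof(half), cudaMemcpyHostToDevice);

    // View the same buffer as cute::half_t just for printing
    handle_global_tensor<cute::half_t><<<1, 1>>>(
        reinterpret_cast<cute::half_t *>(pointer));
    cudaDeviceSynchronize();

    free(cpointer);
    cudaFree(pointer);
    return 0;
}
```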