ubuntu 22.04 cuda12.x 上 cutensor 1.6.2 版本环境搭建

ubuntu 22.04 cuda12.x 运行 cutensor 1.6.2 sample

1.6.2 是较旧的 cuTENSOR 版本,但 NVIDIA 对新的 CUDA 平台做了持续的兼容支持(archive 中带有按 CUDA 大版本划分的 lib 目录),故可以在 CUDA SDK 12 上继续使用 cuTENSOR 1.6.2。

1,下载libcutensor 1.6.2

下载 cutensor 1.6.2 for all Linux and all cuda:

https://developer.nvidia.com/cutensor/1.6.2/downloads

wget https://developer.download.nvidia.com/compute/cutensor/redist/libcutensor/linux-x86_64/libcutensor-linux-x86_64-1.6.2.3-archive.tar.xz

tar xf libcutensor-linux-x86_64-1.6.2.3-archive.tar.xz

ls libcutensor-linux-x86_64-1.6.2.3-archive/lib/

10.2/ 11/ 11.0/ 12/

2,运行示例

由于 cuTENSOR 2.x 中的 API 有不兼容的改写,例如 1.x 的 cutensorInit(&handle) 已被移除(2.x 改用 cutensorCreate 等新接口);

故需要使用旧的 CUDALibrarySamples中的代码运行示例,例如:

Makefile:

bash 复制代码
# Root of the extracted cuTENSOR 1.6.2 archive.
CUTENSOR_ROOT := /home/hipper/cutensor_ex/libcutensor-linux-x86_64-1.6.2.3
# lib/12 contains the cuTENSOR binaries built against CUDA 12.x.
CXX_FLAGS=-std=c++11 -I${CUTENSOR_ROOT}/include -L${CUTENSOR_ROOT}/lib/12 -lcutensor -lcudart

# 'all' and 'clean' produce no files of those names — mark them phony so
# stray files named "all"/"clean" cannot shadow the targets.
.PHONY: all clean

all:
	nvcc einsum.cu -o  einsum ${CXX_FLAGS}
	nvcc contraction.cu -o  contraction ${CXX_FLAGS}
	nvcc contraction_simple.cu -o  contraction_simple ${CXX_FLAGS}
	nvcc contraction_autotuning.cu -o  contraction_autotuning ${CXX_FLAGS}
	nvcc elementwise_binary.cu -o  elementwise_binary ${CXX_FLAGS}
	nvcc elementwise_permute.cu -o  elementwise_permute ${CXX_FLAGS}
	nvcc elementwise_trinary.cu -o  elementwise_trinary ${CXX_FLAGS}
	nvcc reduction.cu -o  reduction ${CXX_FLAGS}

# Fix: 'einsum' is built by 'all' but was missing from the clean list.
clean:
	rm -f einsum contraction contraction_simple contraction_autotuning elementwise_binary elementwise_permute elementwise_trinary reduction

contraction_simple.cu

cpp 复制代码
#include <stdlib.h>
#include <stdio.h>

#include <unordered_map>
#include <vector>

#include <cuda_runtime.h>
#include <cutensor.h>

/* Check a cuTENSOR status; on failure print the message and return the
 * status from the enclosing function.
 * Wrapped in do { } while (0) so the expansion is a single statement:
 * the original bare "{ ... };" form left a stray empty statement and
 * breaks when used as the body of an if/else without braces. */
#define HANDLE_ERROR(x)                                               \
do { const auto err = x;                                              \
  if( err != CUTENSOR_STATUS_SUCCESS )                                \
  { printf("Error: %s\n", cutensorGetErrorString(err)); return err; } \
} while (0)

/* Check a CUDA runtime status; on failure print the message and return
 * the error code from the enclosing function.
 * do { } while (0) makes the macro a single statement (safe in if/else),
 * unlike the original "{ ... };" form. */
#define HANDLE_CUDA_ERROR(x)                                      \
do { const auto err = x;                                          \
  if( err != cudaSuccess )                                        \
  { printf("Error: %s\n", cudaGetErrorString(err)); return err; } \
} while (0)

/* This routine computes the tensor contraction \f[ D = alpha * A * B + beta * C \f] using the staged-API */
/* This routine computes the tensor contraction \f[ D = alpha * A * B + beta * C \f]
 * using the cuTENSOR 1.x staged API (descriptor -> find -> workspace -> plan -> run).
 *
 * alpha/beta already point to host scalars of the compute type; A/B/C/D are
 * device pointers described by descA..descD with mode lists modeA..modeD.
 * The contraction is enqueued asynchronously on `stream`; the caller must
 * synchronize before reading D.
 * Returns CUTENSOR_STATUS_SUCCESS, a cuTENSOR error, or (via the error
 * macro) a CUDA error code from an intermediate call. */
cutensorStatus_t cutensorContractionSimple(const cutensorHandle_t* handle,
                                           const void* alpha, const void *A, const cutensorTensorDescriptor_t* descA, const int32_t modeA[],
                                                              const void *B, const cutensorTensorDescriptor_t* descB, const int32_t modeB[],
                                           const void* beta,  const void *C, const cutensorTensorDescriptor_t* descC, const int32_t modeC[],
                                                                    void *D, const cutensorTensorDescriptor_t* descD, const int32_t modeD[],
                                           cutensorComputeType_t typeCompute, cutensorAlgo_t algo, cutensorWorksizePreference_t workPref,
                                           cudaStream_t stream)
{
    /**********************************************
     * Retrieve the memory alignment for each tensor
     **********************************************/

     uint32_t alignmentRequirementA;
     HANDLE_ERROR(cutensorGetAlignmentRequirement(handle,
                  A, descA, &alignmentRequirementA));

     uint32_t alignmentRequirementB;
     HANDLE_ERROR(cutensorGetAlignmentRequirement(handle,
                  B, descB, &alignmentRequirementB));

     uint32_t alignmentRequirementC;
     HANDLE_ERROR(cutensorGetAlignmentRequirement(handle,
                  C, descC, &alignmentRequirementC));

     uint32_t alignmentRequirementD;
     HANDLE_ERROR(cutensorGetAlignmentRequirement(handle,
                  D, descD, &alignmentRequirementD));

    /*******************************
     * Create Contraction Descriptor
     *******************************/

    cutensorContractionDescriptor_t desc;
    HANDLE_ERROR(cutensorInitContractionDescriptor(handle,
                 &desc,
                 descA, modeA, alignmentRequirementA,
                 descB, modeB, alignmentRequirementB,
                 descC, modeC, alignmentRequirementC,
                 descD, modeD, alignmentRequirementD,
                 typeCompute));

    /**************************
    * Set the algorithm to use
    ***************************/

    cutensorContractionFind_t find;
    HANDLE_ERROR(cutensorInitContractionFind(
                 handle, &find,
                 algo));

    /**********************
     * Query workspace
     **********************/

    size_t worksize = 0;
    HANDLE_ERROR(cutensorContractionGetWorkspaceSize(handle,
                 &desc,
                 &find,
                 workPref, &worksize));

    void *work = nullptr;
    if (worksize > 0)
    {
        /* Workspace is optional: if the allocation fails, fall back to
         * running without a workspace instead of aborting. */
        if(cudaSuccess != cudaMalloc(&work, worksize))
        {
            work = nullptr;
            worksize = 0;
        }
    }

    /**************************
     * Create Contraction Plan
     **************************/

    cutensorContractionPlan_t plan;
    HANDLE_ERROR(cutensorInitContractionPlan(handle,
                 &plan,
                 &desc,
                 &find,
                 worksize));

    /**********************
     * Run
     **********************/

    /* BUG FIX: alpha/beta are already pointers to the scalars. The original
     * passed (void*)&alpha — the address of the pointer parameter itself —
     * so cuTENSOR interpreted the pointer bytes as the scalar value. */
    const cutensorStatus_t status = cutensorContraction(handle,
                 &plan,
                 alpha, A, B,
                 beta,  C, D,
                 work, worksize, stream);

    /* Release the workspace on both success and failure (the original
     * leaked it on every call). cudaFree implicitly synchronizes, so the
     * buffer is not torn out from under the running kernel. */
    if (work != nullptr)
        cudaFree(work);

    HANDLE_ERROR(status);

    return CUTENSOR_STATUS_SUCCESS;
}


/* Driver: builds random single-precision tensors on the host, uploads them,
 * and runs C_{m,u,n,v} = alpha * A_{m,h,k,n} B_{u,k,v,h} + beta * C_{m,u,n,v}
 * through cutensorContractionSimple. Returns 0 on success, a nonzero
 * cuTENSOR/CUDA status (via the error macros) or -1 on host-allocation
 * failure otherwise. */
int main()
{
    typedef float floatTypeA;
    typedef float floatTypeB;
    typedef float floatTypeC;
    typedef float floatTypeCompute;

    cudaDataType_t typeA = CUDA_R_32F;
    cudaDataType_t typeB = CUDA_R_32F;
    cudaDataType_t typeC = CUDA_R_32F;
    cutensorComputeType_t typeCompute = CUTENSOR_COMPUTE_32F;

    floatTypeCompute alpha = (floatTypeCompute) 1.1f;
    floatTypeCompute beta  = (floatTypeCompute) 0.f;

    /**********************
     * Computing: C_{m,u,n,v} = alpha * A_{m,h,k,n} B_{u,k,v,h} + beta * C_{m,u,n,v}
     **********************/

    std::vector<int> modeC{'m','u','n','v'};
    std::vector<int> modeA{'m','h','k','n'};
    std::vector<int> modeB{'u','k','v','h'};
    int nmodeA = modeA.size();
    int nmodeB = modeB.size();
    int nmodeC = modeC.size();

    /* Extent (size) of each mode label. */
    std::unordered_map<int, int64_t> extent;
    extent['m'] = 96;
    extent['n'] = 96;
    extent['u'] = 96;
    extent['v'] = 64;
    extent['h'] = 64;
    extent['k'] = 64;

    /* 2 * product of all extents = multiply-add count of the contraction. */
    double gflops = (2.0 * extent['m'] * extent['n'] * extent['u'] * extent['v'] * extent['k'] * extent['h']) /1e9;
    (void) gflops;  // kept for reference; not printed by this sample
    std::vector<int64_t> extentC;
    for (auto mode : modeC)
        extentC.push_back(extent[mode]);
    std::vector<int64_t> extentA;
    for (auto mode : modeA)
        extentA.push_back(extent[mode]);
    std::vector<int64_t> extentB;
    for (auto mode : modeB)
        extentB.push_back(extent[mode]);

    /**********************
     * Allocating data
     **********************/

    size_t elementsA = 1;
    for (auto mode : modeA)
        elementsA *= extent[mode];
    size_t elementsB = 1;
    for (auto mode : modeB)
        elementsB *= extent[mode];
    size_t elementsC = 1;
    for (auto mode : modeC)
        elementsC *= extent[mode];

    size_t sizeA = sizeof(floatTypeA) * elementsA;
    size_t sizeB = sizeof(floatTypeB) * elementsB;
    size_t sizeC = sizeof(floatTypeC) * elementsC;
    printf("Total memory: %.2f GiB\n", (sizeA + sizeB + sizeC)/1024./1024./1024);

    void *A_d, *B_d, *C_d;
    HANDLE_CUDA_ERROR(cudaMalloc((void**) &A_d, sizeA));
    HANDLE_CUDA_ERROR(cudaMalloc((void**) &B_d, sizeB));
    HANDLE_CUDA_ERROR(cudaMalloc((void**) &C_d, sizeC));

    floatTypeA *A = (floatTypeA*) malloc(sizeof(floatTypeA) * elementsA);
    floatTypeB *B = (floatTypeB*) malloc(sizeof(floatTypeB) * elementsB);
    floatTypeC *C = (floatTypeC*) malloc(sizeof(floatTypeC) * elementsC);

    if (A == NULL || B == NULL || C == NULL)
    {
        printf("Error: Host allocation of A, B, or C.\n");
        /* Release whatever was allocated before bailing out. */
        free(A); free(B); free(C);
        cudaFree(A_d); cudaFree(B_d); cudaFree(C_d);
        return -1;
    }

    /*******************
     * Initialize data
     *******************/

    /* size_t indices match the unsigned element counts (the original
     * compared int64_t against size_t). */
    for (size_t i = 0; i < elementsA; i++)
        A[i] = (((float) rand())/RAND_MAX - 0.5)*100;
    for (size_t i = 0; i < elementsB; i++)
        B[i] = (((float) rand())/RAND_MAX - 0.5)*100;
    for (size_t i = 0; i < elementsC; i++)
        C[i] = (((float) rand())/RAND_MAX - 0.5)*100;

    HANDLE_CUDA_ERROR(cudaMemcpy(A_d, A, sizeA, cudaMemcpyHostToDevice));
    HANDLE_CUDA_ERROR(cudaMemcpy(B_d, B, sizeB, cudaMemcpyHostToDevice));
    HANDLE_CUDA_ERROR(cudaMemcpy(C_d, C, sizeC, cudaMemcpyHostToDevice));

    /*************************
     * cuTENSOR
     *************************/

    cutensorHandle_t handle;
    HANDLE_ERROR(cutensorInit(&handle));

    /**********************
     * Create Tensor Descriptors
     **********************/

    cutensorTensorDescriptor_t descA;
    HANDLE_ERROR(cutensorInitTensorDescriptor(&handle,
                 &descA,
                 nmodeA,
                 extentA.data(),
                 NULL /* stride */,
                 typeA, CUTENSOR_OP_IDENTITY));

    cutensorTensorDescriptor_t descB;
    HANDLE_ERROR(cutensorInitTensorDescriptor(&handle,
                 &descB,
                 nmodeB,
                 extentB.data(),
                 NULL /* stride */,
                 typeB, CUTENSOR_OP_IDENTITY));

    cutensorTensorDescriptor_t descC;
    HANDLE_ERROR(cutensorInitTensorDescriptor(&handle,
                 &descC,
                 nmodeC,
                 extentC.data(),
                 NULL /* stride */,
                 typeC, CUTENSOR_OP_IDENTITY));

    /* C is used both as the accumulator input and the output (C == D). */
    HANDLE_ERROR(cutensorContractionSimple(&handle,
                 (void*)&alpha, A_d, &descA, modeA.data(),
                                B_d, &descB, modeB.data(),
                 (void*)&beta,  C_d, &descC, modeC.data(),
                                C_d, &descC, modeC.data(),
                 typeCompute, CUTENSOR_ALGO_DEFAULT,
                 CUTENSOR_WORKSPACE_RECOMMENDED, 0 /* stream */));

    /* The contraction is asynchronous; synchronize so execution errors
     * surface before we report success (the original never synced). */
    HANDLE_CUDA_ERROR(cudaDeviceSynchronize());

    /* Release host and device memory (the original leaked all six buffers). */
    free(A); free(B); free(C);
    HANDLE_CUDA_ERROR(cudaFree(A_d));
    HANDLE_CUDA_ERROR(cudaFree(B_d));
    HANDLE_CUDA_ERROR(cudaFree(C_d));

    return 0;
}

运行:

export LD_LIBRARY_PATH=/home/hipper/cutensor_ex/libcutensor-linux-x86_64-1.6.2.3/lib/12:$LD_LIBRARY_PATH

相关推荐
MicroTech20252 小时前
突破量子数据加载瓶颈,MLGO微算法科技推出面向大规模量子计算的分治态制备技术
科技·算法·量子计算
MicroTech20254 小时前
突破虚时演化非酉限制:MLGO微算法科技发布可在现有量子计算机运行的变分量子模拟技术
科技·算法·量子计算
Allen_LVyingbo5 小时前
量子计算Dirac Notation基本教学—从零基础到读懂量子信息论文(下)
开发语言·人工智能·python·数学建模·量子计算
AEIC学术交流中心7 小时前
【快速EI检索 | SPIE出版】2026年量子计算与人工智能国际学术会议(ICQCAI 2026)
人工智能·量子计算
Allen_LVyingbo8 小时前
量子计算Dirac Notation基本教学—从零基础到读懂量子信息论文(上)
开发语言·数据结构·架构·健康医疗·量子计算
Liudef062 天前
后量子密码学(PQC)深度解析:算法原理、标准进展与软件开发行业的影响
算法·密码学·量子计算
阿钱真强道4 天前
01 飞腾 S5000C 服务器环境搭建实战:PyTorch + CUDA + RTX 4090D 安装与验证
pytorch·cuda·aarch64·深度学习环境搭建·飞腾服务器·s5000c·rtx4090d
陈天伟教授4 天前
小白快速进阶- AI辅助编码
人工智能·神经网络·机器学习·量子计算
李乾文5 天前
量子力学 20 海森堡绘景运动方程
量子力学·海森堡绘景
Eloudy6 天前
偏迹(Partial Trace)的定义和数学物理意义
量子计算·量子力学