CUDA(C)磁态蒙特卡洛和传输矩阵多GPU并行计算分析

🎯要点

  1. 使用英伟达GPU、大都会和并行回火算法模拟蒙特卡洛。
  2. 使用兰佐斯算法计算传输矩阵特征值。
  3. 使用 Suzuki-Trotter 公式归一化量子无序系统。
  4. 算法模型特征:多CUDA线程,多GPU和多任务式并行计算。

🍁磁态分析角度

  • Python和MATLAB自旋玻璃投资组合神经网络广义方程
  • Python和C++及MATLAB低温磁态机器学习模型

🍪语言内容分比


🍇CUDA矩阵乘法二维块平铺实现

从数学上讲,给定一个广义矩阵乘法运算 D = A B + C D=A B+C D=AB+C,其中 D ∈ R m × n , A ∈ R m × k , B ∈ R k × n , C ∈ R m × n D \in R ^{m \times n}, A \in R ^{m \times k}, B \in R ^{k \times n}, C \in R ^{m \times n} D∈Rm×n,A∈Rm×k,B∈Rk×n,C∈Rm×n,矩阵可以分成更小的矩阵。
A = A 1 , 1 d b m × d b k A 1 , 2 d b m × d b k ... A 1 , k d m m × d b k A 2 , 1 d m m × d b k A 2 , 2 d b m × d b k ⋯ A 2 , k / d b k d b m × d b k ⋮ ⋮ ⋱ ⋮ A m / d m m , 1 d m b × d b k A m / d m m , 2 d b m × d b k ⋯ A m / d b m , k / d b k d b m × d b k A=\left\\begin{array}{cccc} A_{1,1}\^{d_{b m} \\times d_{b k}} \& A_{1,2}\^{d_{b m} \\times d_{b k}} \& \\ldots \& A_{1, k}\^{d_{m m} \\times d_{b k}} \\\\ A_{2,1}\^{d_{m m} \\times d_{b k}} \& A_{2,2}\^{d_{b m} \\times d_{b k}} \& \\cdots \& A_{2, k / d_{b k}}\^{d_{b m} \\times d_{b k}} \\\\ \\vdots \& \\vdots \& \\ddots \& \\vdots \\\\ A_{m / d_{m m}, 1}\^{d_{m b} \\times d_{b k}} \& A_{m / d_{m m}, 2}\^{d_{b m} \\times d_{b k}} \& \\cdots \& A_{m / d_{b m}, k / d_{b k}}\^{d_{b m} \\times d_{b k}} \\end{array}\\right A= A1,1dbm×dbkA2,1dmm×dbk⋮Am/dmm,1dmb×dbkA1,2dbm×dbkA2,2dbm×dbk⋮Am/dmm,2dbm×dbk...⋯⋱⋯A1,kdmm×dbkA2,k/dbkdbm×dbk⋮Am/dbm,k/dbkdbm×dbk

B = B 1 , 1 d b k × d b n B 1 , 2 d b k × d b n ... B 1 , n / d b n d b k × d b n B 2 , 1 d b k × d b n B 2 , 2 d b k × d b n ... B 2 , n / d b n d b k × d b n ⋮ ⋮ ⋱ ⋮ B k / d b k , 1 d b k d b n B k / d b k , 2 d b k × d b n ⋯ B k / d b k , n / d b n d b k × d b n B=\left\\begin{array}{cccc} B_{1,1}\^{d_{b k} \\times d_{b n}} \& B_{1,2}\^{d_{b k} \\times d_{b n}} \& \\ldots \& B_{1, n / d_{b n}}\^{d_{b k} \\times d_{b n}} \\\\ B_{2,1}\^{d_{b k} \\times d_{b n}} \& B_{2,2}\^{d_{b k} \\times d_{b n}} \& \\ldots \& B_{2, n / d_{b n}}\^{d_{b k} \\times d_{b n}} \\\\ \\vdots \& \\vdots \& \\ddots \& \\vdots \\\\ B_{k / d_{b k}, 1}\^{d_{b k} d_{b n}} \& B_{k / d_{b k}, 2}\^{d_{b k} \\times d_{b n}} \& \\cdots \& B_{k / d_{b k}, n / d_{b n}}\^{d_{b k} \\times d_{b n}} \\end{array}\\right B= B1,1dbk×dbnB2,1dbk×dbn⋮Bk/dbk,1dbkdbnB1,2dbk×dbnB2,2dbk×dbn⋮Bk/dbk,2dbk×dbn......⋱⋯B1,n/dbndbk×dbnB2,n/dbndbk×dbn⋮Bk/dbk,n/dbndbk×dbn

C = C 1 , 1 d b m × d b n C 1 , 2 d b m × d b n ... C 1 , n / d b n d b m × d b n C 2 , 1 d b m × d b n C 2 , 2 d b m × d b n ... C 2 , n / d b n d b m × d m n ⋮ ⋮ ⋱ ⋮ C m / d b m , 1 d b m × d l n C m / d b m , 2 d b n × d b n ⋯ C m / d b m , n / d b n d b m × d b n C=\left\\begin{array}{cccc} C_{1,1}\^{d_{b m} \\times d_{b n}} \& C_{1,2}\^{d_{b m} \\times d_{b n}} \& \\ldots \& C_{1, n / d_{b n}}\^{d_{b m} \\times d_{b n}} \\\\ C_{2,1}\^{d_{b m} \\times d_{b n}} \& C_{2,2}\^{d_{b m} \\times d_{b n}} \& \\ldots \& C_{2, n / d_{b n}}\^{d_{b m} \\times d_{m n}} \\\\ \\vdots \& \\vdots \& \\ddots \& \\vdots \\\\ C_{m / d_{b m}, 1}\^{d_{b m} \\times d_{l n}} \& C_{m / d_{b m}, 2}\^{d_{b n} \\times d_{b n}} \& \\cdots \& C_{m / d_{b m}, n / d_{b n}}\^{d_{b m} \\times d_{b n}} \\end{array}\\right C= C1,1dbm×dbnC2,1dbm×dbn⋮Cm/dbm,1dbm×dlnC1,2dbm×dbnC2,2dbm×dbn⋮Cm/dbm,2dbn×dbn......⋱⋯C1,n/dbndbm×dbnC2,n/dbndbm×dmn⋮Cm/dbm,n/dbndbm×dbn

D = D 1 , 1 d b m × d b n D 1 , 2 d b m × d b n ... D 1 , n / d b n d b m × d b n D 2 , 1 d b m × d b n D 2 , 2 d b m × d b n ... D 2 , n / d b n d b m × d m n ⋮ ⋮ ⋱ ⋮ D m / d b m , 1 d b m × d b n D m / d b m , 2 d b m × d b n ⋯ D m / d b m , n / d b n d b n × d l m D=\left\\begin{array}{cccc} D_{1,1}\^{d_{b m} \\times d_{b n}} \& D_{1,2}\^{d_{b m} \\times d_{b n}} \& \\ldots \& D_{1, n / d_{b n}}\^{d_{b m} \\times d_{b n}} \\\\ D_{2,1}\^{d_{b m} \\times d_{b n}} \& D_{2,2}\^{d_{b m} \\times d_{b n}} \& \\ldots \& D_{2, n / d_{b n}}\^{d_{b m} \\times d_{m n}} \\\\ \\vdots \& \\vdots \& \\ddots \& \\vdots \\\\ D_{m / d_{b m}, 1}\^{d_{b m} \\times d_{b n}} \& D_{m / d_{b m}, 2}\^{d_{b m} \\times d_{b n}} \& \\cdots \& D_{m / d_{b m}, n / d_{b n}}\^{d_{b n} \\times d_{l_m}} \\end{array}\\right D= D1,1dbm×dbnD2,1dbm×dbn⋮Dm/dbm,1dbm×dbnD1,2dbm×dbnD2,2dbm×dbn⋮Dm/dbm,2dbm×dbn......⋱⋯D1,n/dbndbm×dbnD2,n/dbndbm×dmn⋮Dm/dbm,n/dbndbn×dlm

代码实现如下:

C 复制代码
template <typename T, size_t BLOCK_TILE_SIZE_X, size_t BLOCK_TILE_SIZE_Y,
          size_t BLOCK_TILE_SIZE_K, size_t NUM_THREADS, size_t BLOCK_TILE_SKEW_SIZE_X = 0U, size_t BLOCK_TILE_SKEW_SIZE_K = 0U>
__device__ void load_data_to_shared_memory(T const* A, size_t lda,
                                           T const* B, size_t ldb,
                                           T A_thread_block_tile[BLOCK_TILE_SIZE_Y][BLOCK_TILE_SIZE_K + BLOCK_TILE_SKEW_SIZE_K],
                                           T B_thread_block_tile[BLOCK_TILE_SIZE_K][BLOCK_TILE_SIZE_X + BLOCK_TILE_SKEW_SIZE_X],
                                           size_t thread_block_tile_idx,
                                           size_t thread_linear_idx,
                                           size_t m, size_t n,
                                           size_t k)
{

#pragma unroll
    for (size_t load_idx{0U};
         load_idx <
         (BLOCK_TILE_SIZE_Y * BLOCK_TILE_SIZE_K + NUM_THREADS - 1U) /
             NUM_THREADS;
         ++load_idx)
    {
        size_t const A_thread_block_tile_row_idx{
            (thread_linear_idx + load_idx * NUM_THREADS) /
            BLOCK_TILE_SIZE_K};
        size_t const A_thread_block_tile_col_idx{
            (thread_linear_idx + load_idx * NUM_THREADS) %
            BLOCK_TILE_SIZE_K};
        size_t const A_row_idx{blockIdx.y * BLOCK_TILE_SIZE_Y +
                               A_thread_block_tile_row_idx};
        size_t const A_col_idx{thread_block_tile_idx * BLOCK_TILE_SIZE_K +
                               A_thread_block_tile_col_idx};

        T val{static_cast<T>(0)};
        if (A_row_idx < m && A_col_idx < k)
        {
            val = A[A_row_idx * lda + A_col_idx];
        }

        static_assert(BLOCK_TILE_SIZE_K * BLOCK_TILE_SIZE_Y % NUM_THREADS ==
                      0U);

        A_thread_block_tile[A_thread_block_tile_row_idx]
                           [A_thread_block_tile_col_idx] = val;
    }

#pragma unroll
    for (size_t load_idx{0U};
         load_idx <
         (BLOCK_TILE_SIZE_K * BLOCK_TILE_SIZE_X + NUM_THREADS - 1U) /
             NUM_THREADS;
         ++load_idx)
    {
        size_t const B_thread_block_tile_row_idx{
            (thread_linear_idx + load_idx * NUM_THREADS) /
            BLOCK_TILE_SIZE_X};
        size_t const B_thread_block_tile_col_idx{
            (thread_linear_idx + load_idx * NUM_THREADS) %
            BLOCK_TILE_SIZE_X};
        size_t const B_row_idx{thread_block_tile_idx * BLOCK_TILE_SIZE_K +
                               B_thread_block_tile_row_idx};
        size_t const B_col_idx{blockIdx.x * BLOCK_TILE_SIZE_X +
                               B_thread_block_tile_col_idx};

        T val{static_cast<T>(0)};
        if (B_row_idx < k && B_col_idx < n)
        {
            val = B[B_row_idx * ldb + B_col_idx];
        }

        static_assert(BLOCK_TILE_SIZE_X * BLOCK_TILE_SIZE_K % NUM_THREADS ==
                      0U);

        B_thread_block_tile[B_thread_block_tile_row_idx]
                           [B_thread_block_tile_col_idx] = val;
    }
}

template <typename T, size_t BLOCK_TILE_SIZE_X, size_t BLOCK_TILE_SIZE_Y,
          size_t BLOCK_TILE_SIZE_K>
__global__ void gemm_v02(size_t m, size_t n, size_t k, T alpha, T const* A,
                         size_t lda, T const* B, size_t ldb, T beta, T* C,
                         size_t ldc)
{

    constexpr size_t NUM_THREADS{BLOCK_TILE_SIZE_X * BLOCK_TILE_SIZE_Y};
    size_t const thread_linear_idx{threadIdx.y * blockDim.x + threadIdx.x};

    size_t const C_col_idx{blockIdx.x * blockDim.x + threadIdx.x};
    size_t const C_row_idx{blockIdx.y * blockDim.y + threadIdx.y};

    __shared__ T A_thread_block_tile[BLOCK_TILE_SIZE_Y][BLOCK_TILE_SIZE_K];
    __shared__ T B_thread_block_tile[BLOCK_TILE_SIZE_K][BLOCK_TILE_SIZE_X];

    size_t const num_thread_block_tiles{(k + BLOCK_TILE_SIZE_K - 1) /
                                        BLOCK_TILE_SIZE_K};

    T sum{static_cast<T>(0)};
    for (size_t thread_block_tile_idx{0U};
         thread_block_tile_idx < num_thread_block_tiles;
         ++thread_block_tile_idx)
    {
        load_data_to_shared_memory<T, BLOCK_TILE_SIZE_X, BLOCK_TILE_SIZE_Y,
                                   BLOCK_TILE_SIZE_K, NUM_THREADS>(
            A, lda, B, ldb, A_thread_block_tile, B_thread_block_tile,
            thread_block_tile_idx, thread_linear_idx, m, n, k);
        __syncthreads();

#pragma unroll
        for (size_t k_i{0U}; k_i < BLOCK_TILE_SIZE_K; ++k_i)
        {

            sum += A_thread_block_tile[threadIdx.y][k_i] *
                   B_thread_block_tile[k_i][threadIdx.x];
        }
        __syncthreads();
    }
    if (C_row_idx < m && C_col_idx < n)
    {
        C[C_row_idx * ldc + C_col_idx] =
            alpha * sum + beta * C[C_row_idx * ldc + C_col_idx];
    }
}

template <typename T>
void launch_gemm_kernel_v02(size_t m, size_t n, size_t k, T const* alpha,
                            T const* A, size_t lda, T const* B, size_t ldb,
                            T const* beta, T* C, size_t ldc,
                            cudaStream_t stream)
{

    constexpr unsigned int BLOCK_TILE_SIZE_X{32U};
    constexpr unsigned int BLOCK_TILE_SIZE_Y{32U};
    constexpr unsigned int BLOCK_TILE_SIZE_K{32U};
    constexpr unsigned int NUM_THREADS{BLOCK_TILE_SIZE_X * BLOCK_TILE_SIZE_Y};
    static_assert(BLOCK_TILE_SIZE_K * BLOCK_TILE_SIZE_Y % NUM_THREADS == 0U);
    static_assert(BLOCK_TILE_SIZE_X * BLOCK_TILE_SIZE_K % NUM_THREADS == 0U);
    dim3 const block_dim{BLOCK_TILE_SIZE_X, BLOCK_TILE_SIZE_Y, 1U};
    dim3 const grid_dim{
        (static_cast<unsigned int>(n) + block_dim.x - 1U) / block_dim.x,
        (static_cast<unsigned int>(m) + block_dim.y - 1U) / block_dim.y, 1U};
    gemm_v02<T, BLOCK_TILE_SIZE_X, BLOCK_TILE_SIZE_Y, BLOCK_TILE_SIZE_K>
        <<<grid_dim, block_dim, 0U, stream>>>(m, n, k, *alpha, A, lda, B, ldb,
                                              *beta, C, ldc);
    CHECK_LAST_CUDA_ERROR();
}

此 FP32 广义矩阵乘法实现的性能在 NVIDIA GeForce RTX 3090 GPU 上达到 2.66 TFLOPS,比之前的实现好很多。不过,它距离 GPU 的理论峰值性能还相差甚远。

使用二维块平铺和一维线程平铺实现

我们首先将矩阵 B B B 的数据从共享内存缓存到寄存器中。每个具有块线程索引 ( t m , t n ) \left(t_m, t_n\right) (tm,tn) 的线程,其中 t m ∈ 1 , d t m / d t m t_m \in\left1, d_{t m} / d_{t m}\\right tm∈1,dtm/dtm 和 t n ∈ 1 , d m m t_n \in\left1, d_{m m}\\right tn∈1,dmm,现在负责计算小矩阵的 d t m d_{t m} dtm 个元素,其中 d t m d_{t m} dtm 是线程块大小。
( D b m , b m d m × d m ) t m t m + d m , t n = ( ∑ b k = 1 k / d k A b m , b k d m × d ∗ B b k , b n d k × d m + C b m , b n d m × d m ) t m t m + d m , t n = ∑ b k = 1 k / d m k ( ∑ t k = 1 d m ( A b m , b k d m × d k k ) t m t m + d m , t k ( B b k , b n d m × d m ) t k , t n ) + ( C b m , b n d m × d m ) t m t m + d m , t n \begin{aligned} &\begin{aligned} & \left(D_{b_m, b_m}^{d_m \times d_m}\right){t_m t_m+d_m, t_n}=\left(\sum{b_k=1}^{k / d_k} A_{b_m, b_k}^{d_m \times d *} B_{b_k, b_n}^{d_k \times d_m}+C_{b_m, b_n}^{d_m \times d_m}\right){t_m t_m+d_m, t_n} \end{aligned}\\ &\begin{aligned} & =\sum{b_k=1}^{k / d_{m k}}\left(\sum_{t_k=1}^{d_m}\left(A_{b_m, b_k}^{d_m \times d_{k_k}}\right){t_m t_m+d_m, t_k}\left(B{b_k, b_n}^{d_m \times d_m}\right){t_k, t_n}\right)+\left(C{b_m, b_n}^{d_m \times d_m}\right)_{t_m t_m+d_m, t_n} \end{aligned} \end{aligned} (Dbm,bmdm×dm)tmtm+dm,tn= bk=1∑k/dkAbm,bkdm×d∗Bbk,bndk×dm+Cbm,bndm×dm tmtm+dm,tn=bk=1∑k/dmk(tk=1∑dm(Abm,bkdm×dkk)tmtm+dm,tk(Bbk,bndm×dm)tk,tn)+(Cbm,bndm×dm)tmtm+dm,tn

C 复制代码
template <typename T, size_t BLOCK_TILE_SIZE_X, size_t BLOCK_TILE_SIZE_Y,
          size_t BLOCK_TILE_SIZE_K, size_t THREAD_TILE_SIZE_Y>
__global__ void gemm_v03(size_t m, size_t n, size_t k, T alpha, T const* A,
                         size_t lda, T const* B, size_t ldb, T beta, T* C,
                         size_t ldc)
{

    constexpr size_t NUM_THREADS{BLOCK_TILE_SIZE_X * BLOCK_TILE_SIZE_Y /
                                 THREAD_TILE_SIZE_Y};
    size_t const thread_linear_idx{threadIdx.y * blockDim.x + threadIdx.x};

    __shared__ T A_thread_block_tile[BLOCK_TILE_SIZE_Y][BLOCK_TILE_SIZE_K];
    __shared__ T B_thread_block_tile[BLOCK_TILE_SIZE_K][BLOCK_TILE_SIZE_X];

    size_t const num_thread_block_tiles{(k + BLOCK_TILE_SIZE_K - 1) /
                                        BLOCK_TILE_SIZE_K};

    T C_thread_results[THREAD_TILE_SIZE_Y] = {static_cast<T>(0)};

    for (size_t thread_block_tile_idx{0U};
         thread_block_tile_idx < num_thread_block_tiles;
         ++thread_block_tile_idx)
    {
        load_data_to_shared_memory<T, BLOCK_TILE_SIZE_X, BLOCK_TILE_SIZE_Y,
                                   BLOCK_TILE_SIZE_K, NUM_THREADS>(
            A, lda, B, ldb, A_thread_block_tile, B_thread_block_tile,
            thread_block_tile_idx, thread_linear_idx, m, n, k);
        __syncthreads();

#pragma unroll
        for (size_t k_i{0U}; k_i < BLOCK_TILE_SIZE_K; ++k_i)
        {
            size_t const B_thread_block_tile_row_idx{k_i};

            T const B_val{
                B_thread_block_tile[B_thread_block_tile_row_idx]
                                   [thread_linear_idx % BLOCK_TILE_SIZE_X]};
#pragma unroll
            for (size_t thread_tile_row_idx{0U};
                 thread_tile_row_idx < THREAD_TILE_SIZE_Y;
                 ++thread_tile_row_idx)
            {
                size_t const A_thread_block_tile_row_idx{
                    thread_linear_idx / BLOCK_TILE_SIZE_X * THREAD_TILE_SIZE_Y +
                    thread_tile_row_idx};
                size_t const A_thread_block_tile_col_idx{k_i};
                T const A_val{A_thread_block_tile[A_thread_block_tile_row_idx]
                                                 [A_thread_block_tile_col_idx]};
                C_thread_results[thread_tile_row_idx] += A_val * B_val;
            }
        }
        __syncthreads();
    }

#pragma unroll
    for (size_t thread_tile_row_idx{0U};
         thread_tile_row_idx < THREAD_TILE_SIZE_Y; ++thread_tile_row_idx)
    {
        size_t const C_row_idx{blockIdx.y * BLOCK_TILE_SIZE_Y +
                               thread_linear_idx / BLOCK_TILE_SIZE_X *
                                   THREAD_TILE_SIZE_Y +
                               thread_tile_row_idx};
        size_t const C_col_idx{blockIdx.x * BLOCK_TILE_SIZE_X +
                               thread_linear_idx % BLOCK_TILE_SIZE_X};
        if (C_row_idx < m && C_col_idx < n)
        {
            C[C_row_idx * ldc + C_col_idx] =
                alpha * C_thread_results[thread_tile_row_idx] +
                beta * C[C_row_idx * ldc + C_col_idx];
        }
    }
}

template <typename T>
void launch_gemm_kernel_v03(size_t m, size_t n, size_t k, T const* alpha,
                            T const* A, size_t lda, T const* B, size_t ldb,
                            T const* beta, T* C, size_t ldc,
                            cudaStream_t stream)
{

    constexpr unsigned int BLOCK_TILE_SIZE_X{64U};
    constexpr unsigned int BLOCK_TILE_SIZE_Y{64U};
    constexpr unsigned int BLOCK_TILE_SIZE_K{8U};

    constexpr unsigned int THREAD_TILE_SIZE_Y{8U};
    constexpr unsigned int NUM_THREADS_PER_BLOCK{
        BLOCK_TILE_SIZE_X * BLOCK_TILE_SIZE_Y / THREAD_TILE_SIZE_Y};
    static_assert(BLOCK_TILE_SIZE_Y % THREAD_TILE_SIZE_Y == 0U);
    static_assert(NUM_THREADS_PER_BLOCK % BLOCK_TILE_SIZE_K == 0U);
    static_assert(NUM_THREADS_PER_BLOCK % BLOCK_TILE_SIZE_X == 0U);
    dim3 const block_dim{NUM_THREADS_PER_BLOCK, 1U, 1U};
    dim3 const grid_dim{
        (static_cast<unsigned int>(n) + BLOCK_TILE_SIZE_X - 1U) /
            BLOCK_TILE_SIZE_X,
        (static_cast<unsigned int>(m) + BLOCK_TILE_SIZE_Y - 1U) /
            BLOCK_TILE_SIZE_Y,
        1U};
    gemm_v03<T, BLOCK_TILE_SIZE_X, BLOCK_TILE_SIZE_Y, BLOCK_TILE_SIZE_K,
             THREAD_TILE_SIZE_Y><<<grid_dim, block_dim, 0U, stream>>>(
        m, n, k, *alpha, A, lda, B, ldb, *beta, C, ldc);
    CHECK_LAST_CUDA_ERROR();
}

👉参阅、更新:计算思维 | 亚图跨际

相关推荐
云道轩5 天前
腾讯云上有性能比较强的英伟达GPU
人工智能·云计算·腾讯云·英伟达gpu
2401_841495644 个月前
【强化学习】REINFORCE 算法
人工智能·python·算法·强化学习·reinforce·策略梯度·蒙特卡洛
大千AI助手8 个月前
Gibbs采样:全面解析马尔可夫链蒙特卡洛的核心算法
贝叶斯·mcmc·蒙特卡洛·条件概率·马尔科夫链·gibbs采样·gibbs sampling
大千AI助手1 年前
蒙特卡洛方法:随机抽样的艺术与科学
人工智能·机器学习·贝叶斯·概率·蒙特卡洛·随机
蟹堡王首席大厨2 年前
数模方法论-蒙特卡洛法
python·算法·数学建模·matlab·蒙特卡洛
亚图跨际2 年前
Python和MATLAB及R平均意见得分导图
文本语音·语音视频得分·后验分布·英伟达gpu·秩统计·置信区间·相似性评估
随风飘摇的土木狗2 年前
【MATLAB第98期】基于MATLAB的MonteCarlo蒙特卡罗结合kriging克里金代理模型的全局敏感性分析模型(有目标函数)
matlab·代理模型·全局敏感性分析·全局·蒙特卡洛·kriging·克里金
可编程芯片开发2 年前
基于蒙特卡洛的电力系统可靠性分析matlab仿真,对比EDNS和LOLP
matlab·电力系统·蒙特卡洛·可靠性分析·edns·lolp
亚图跨际2 年前
CUDA Cpp正电子发射断层扫描仪校准和图像重建—蒙特卡洛3D伊辛模型
c++·cuda·并行计算·蒙特卡洛