leetGPU

  1. Vector Addition:
cuda
#include <cstdio>
#include <cuda_runtime.h>

// Element-wise sum: C[i] = A[i] + B[i] for every i in [0, N).
// Launch with a 1D grid supplying at least N threads.
__global__ void vector_add(const float* A, const float* B, float* C, int N) {
    int i = blockDim.x * blockIdx.x + threadIdx.x;
    if (i >= N) return;  // threads past the end of the array do nothing
    C[i] = A[i] + B[i];
}

// Host launcher for vector_add.
// A, B, C are device pointers (i.e. pointers to memory on the GPU);
// N is the element count. Blocks until the kernel has finished.
extern "C" void solve(const float* A, const float* B, float* C, int N) {
    // Guard: a zero-block launch is cudaErrorInvalidConfiguration.
    if (N <= 0) return;

    const int threadsPerBlock = 256;
    const int blocksPerGrid = (N + threadsPerBlock - 1) / threadsPerBlock;  // ceil-div

    vector_add<<<blocksPerGrid, threadsPerBlock>>>(A, B, C, N);

    // Kernel launches do not return errors directly; surface bad launch
    // configurations immediately instead of failing silently later.
    cudaError_t err = cudaGetLastError();
    if (err != cudaSuccess) {
        fprintf(stderr, "vector_add launch failed: %s\n", cudaGetErrorString(err));
    }
    cudaDeviceSynchronize();
}
  2. Matrix Multiplication:
cuda
#include <cuda_runtime.h>

/// Naive GEMM kernel: C = A * B.
/// A is M x N, B is N x K, C is M x K, all stored row-major.
/// Launch with a 2D grid; each thread produces exactly one element of C.
__global__ void matrix_multiplication_kernel(const float* A, const float* B, float* C, int M, int N,
                                             int K) {
    const int r = blockDim.y * blockIdx.y + threadIdx.y;  // output row
    const int c = blockDim.x * blockIdx.x + threadIdx.x;  // output column

    if (r >= M || c >= K) return;  // thread falls outside the output matrix

    // Dot product of row r of A with column c of B along the shared dim N.
    float acc = 0.0f;
    for (int t = 0; t < N; ++t) {
        acc += A[r * N + t] * B[t * K + c];
    }
    C[r * K + c] = acc;
}

/// Host launcher for the naive GEMM kernel.
/// A (M x N), B (N x K), C (M x K) are already device pointers, so no
/// cudaMalloc/cudaMemcpy is needed here.
extern "C" void solve(const float* A, const float* B, float* C, int M, int N, int K) {
    // 16x16 = 256 threads per block (a multiple of the 32-wide warp).
    const dim3 block(16, 16);

    // Enough blocks to tile the whole M x K output: x covers columns (K),
    // y covers rows (M); ceil-division handles sizes not divisible by 16.
    const dim3 grid((K + block.x - 1) / block.x,
                    (M + block.y - 1) / block.y);

    matrix_multiplication_kernel<<<grid, block>>>(A, B, C, M, N, K);

    // Block until the kernel has finished.
    cudaDeviceSynchronize();
}
  3. Matrix Transpose:
cuda
#include <cuda_runtime.h>

// Out-of-place transpose: output[c][r] = input[r][c].
// input is rows x cols (row-major); output is cols x rows (row-major).
__global__ void matrix_transpose_kernel(const float* input, float* output, int rows, int cols) {
    const int r = blockDim.y * blockIdx.y + threadIdx.y;
    const int c = blockDim.x * blockIdx.x + threadIdx.x;
    if (r >= rows || c >= cols) return;
    output[c * rows + r] = input[r * cols + c];
}

// Host launcher for the transpose kernel.
// input, output are device pointers; input is rows x cols, row-major.
extern "C" void solve(const float* input, float* output, int rows, int cols) {
    const dim3 block(16, 16);
    // x dimension walks columns, y walks rows; ceil-div covers ragged edges.
    const dim3 grid((cols + block.x - 1) / block.x,
                    (rows + block.y - 1) / block.y);

    matrix_transpose_kernel<<<grid, block>>>(input, output, rows, cols);
    cudaDeviceSynchronize();
}
  4. Image Color Inversion:
cuda
#include <cuda_runtime.h>

// Inverts the R, G, B channels of a 4-bytes-per-pixel image in place;
// the fourth byte (alpha) is left with its original value.
// One thread handles one pixel.
__global__ void invert_kernel(unsigned char* image, int width, int height) {
    const int pixel = blockDim.x * blockIdx.x + threadIdx.x;
    if (pixel >= width * height) return;

    const int base = pixel * 4;               // byte offset of this pixel
    image[base + 0] = 255 - image[base + 0];  // R
    image[base + 1] = 255 - image[base + 1];  // G
    image[base + 2] = 255 - image[base + 2];  // B
    // image[base + 3] (alpha) intentionally unchanged
}
// Host launcher for the inversion kernel.
// image is a device pointer to width*height pixels, 4 bytes each.
extern "C" void solve(unsigned char* image, int width, int height) {
    const int block = 256;
    const int pixels = width * height;
    const int grid = (pixels + block - 1) / block;  // one thread per pixel

    invert_kernel<<<grid, block>>>(image, width, height);
    cudaDeviceSynchronize();
}
  5. Matrix Addition:
cuda
#include <cuda_runtime.h>

// Element-wise addition of two N x N matrices treated as flat arrays
// of N*N floats: C[i] = A[i] + B[i].
__global__ void matrix_add(const float* A, const float* B, float* C, int N) {
    const int i = blockDim.x * blockIdx.x + threadIdx.x;
    const int total = N * N;  // NOTE(review): assumes N*N fits in int — confirm problem bounds
    if (i < total) {
        C[i] = A[i] + B[i];
    }
}

// Host launcher for matrix_add.
// A, B, C are device pointers to N*N floats each.
extern "C" void solve(const float* A, const float* B, float* C, int N) {
    const int block = 256;
    const int elems = N * N;
    const int grid = (elems + block - 1) / block;  // ceil-div over all elements

    matrix_add<<<grid, block>>>(A, B, C, N);
    cudaDeviceSynchronize();
}
  6. 1D Convolution:
cuda
#include <cuda_runtime.h>

// Valid-mode 1D cross-correlation: output[i] = sum_j input[i + j] * kernel[j].
// The output has input_size - kernel_size + 1 elements; one thread per element.
__global__ void convolution_1d_kernel(const float* input, const float* kernel, float* output,
                                      int input_size, int kernel_size) {
    const int out = blockDim.x * blockIdx.x + threadIdx.x;
    const int out_size = input_size - kernel_size + 1;
    if (out >= out_size) return;

    // Slide the kernel over the window starting at `out`.
    float acc = 0.0f;
    for (int j = 0; j < kernel_size; ++j) {
        acc += input[out + j] * kernel[j];
    }
    output[out] = acc;
}

// Host launcher for the 1D convolution kernel.
// input, kernel, output are device pointers; the valid output has
// input_size - kernel_size + 1 elements, one thread per output element.
extern "C" void solve(const float* input, const float* kernel, float* output, int input_size,
                      int kernel_size) {
    const int out_size = input_size - kernel_size + 1;
    const int block = 256;
    const int grid = (out_size + block - 1) / block;

    convolution_1d_kernel<<<grid, block>>>(input, kernel, output, input_size, kernel_size);
    cudaDeviceSynchronize();
}
  7. Array Reversal:
cuda
#include <cuda_runtime.h>

// In-place array reversal: thread idx swaps element idx with element N-1-idx.
// Only the first N/2 threads do work, so each pair is swapped exactly once
// (the middle element of an odd-length array stays put).
__global__ void reverse_array(float* input, int N) {
    int idx = blockIdx.x * blockDim.x + threadIdx.x;
    if (idx < N / 2) {
        // BUG FIX: the temporary was declared `int`, which truncated the
        // fractional part of every swapped element. It must be float.
        float t = input[idx];
        input[idx] = input[N - idx - 1];
        input[N - idx - 1] = t;
    }
}

// Host launcher for in-place reversal. input is a device pointer to N floats.
// N threads are launched; the kernel itself restricts work to the first N/2.
extern "C" void solve(float* input, int N) {
    const int block = 256;
    const int grid = (N + block - 1) / block;

    reverse_array<<<grid, block>>>(input, N);
    cudaDeviceSynchronize();
}
  8. (to be continued)
相关推荐
我星期八休息1 小时前
Linux系统编程—基础IO
linux·运维·服务器·c语言·c++·人工智能·算法
池塘的蜗牛2 小时前
A Low-Complexity Method for FFT-based OFDM Sensing
算法
故事和你912 小时前
洛谷-【图论2-1】树5
开发语言·数据结构·c++·算法·动态规划·图论
咖啡里的茶i3 小时前
视觉显著目标的自适应分割与动态网格生成算法研究
人工智能·算法·目标跟踪
paeamecium3 小时前
【PAT甲级真题】- String Subtraction (20)
数据结构·c++·算法·pat考试·pat
YL200404263 小时前
047从前序与中序遍历序列构造二叉树
算法·leetcode
极梦网络无忧3 小时前
password_hash
算法·哈希算法
计算机安禾3 小时前
【c++面向对象编程】第25篇:仿函数(函数对象):重载operator()
开发语言·c++·算法
周末也要写八哥4 小时前
在C++中使用预定义宏
开发语言·c++·算法