leetGPU

  1. 加粗样式:
cuda 复制代码
#include <cuda_runtime.h>

__global__ void vector_add(const float* A, const float* B, float* C, int N) {
    int idx = blockIdx.x * blockDim.x + threadIdx.x;
    if(idx < N) {
        C[idx] = A[idx] + B[idx];
    }
}

// A, B, C are device pointers (i.e. pointers to memory on the GPU)
extern "C" void solve(const float* A, const float* B, float* C, int N) {
    int threadsPerBlock = 256;
    int blocksPerGrid = (N + threadsPerBlock - 1) / threadsPerBlock;

    vector_add<<<blocksPerGrid, threadsPerBlock>>>(A, B, C, N);
    cudaDeviceSynchronize();
}
  1. 2:
cuda 复制代码
#include <cuda_runtime.h>

/// 矩阵乘法 kernel(naive 实现)
/// A: M × N,  B: N × K,  C: M × K,  全部行主序存储
/// 每个线程计算 C 的一个元素
__global__ void matrix_multiplication_kernel(const float* A, const float* B, float* C, int M, int N,
                                             int K) {
    // 当前线程负责 C[row][col]
    int row = blockIdx.y * blockDim.y + threadIdx.y;
    int col = blockIdx.x * blockDim.x + threadIdx.x;

    // 边界检查
    if (row < M && col < K) {
        float sum = 0.0f;

        // 沿共享维度 N 做内积: A 的第 row 行 × B 的第 col 列
        for (int n = 0; n < N; n++) {
            sum += A[row * N + n] * B[n * K + col];
        }

        C[row * K + col] = sum;
    }
}

/// solve 函数
/// A, B, C 已经是 GPU 上的指针,无需 cudaMalloc/cudaMemcpy
extern "C" void solve(const float* A, const float* B, float* C, int M, int N, int K) {
    // 2D block: 16×16 = 256 线程(32 的倍数,occupancy 友好)
    dim3 threadsPerBlock(16, 16);

    // 2D grid: 按 K(宽)和 M(高)覆盖整个输出矩阵
    dim3 blocksPerGrid((K + threadsPerBlock.x - 1) / threadsPerBlock.x,
                       (M + threadsPerBlock.y - 1) / threadsPerBlock.y);

    // Launch kernel
    matrix_multiplication_kernel<<<blocksPerGrid, threadsPerBlock>>>(A, B, C, M, N, K);

    // 等待 GPU 执行完毕
    cudaDeviceSynchronize();
}
  1. 3:
cuda 复制代码
#include <cuda_runtime.h>

__global__ void matrix_transpose_kernel(const float* input, float* output, int rows, int cols) {
    int row = blockIdx.y * blockDim.y + threadIdx.y;
    int col = blockIdx.x * blockDim.x + threadIdx.x;
    if(row < rows && col < cols) {
        output[col * rows + row] = input[row *cols + col];
    }
}

// input, output are device pointers (i.e. pointers to memory on the GPU)
extern "C" void solve(const float* input, float* output, int rows, int cols) {
    dim3 threadsPerBlock(16, 16);
    dim3 blocksPerGrid((cols + threadsPerBlock.x - 1) / threadsPerBlock.x,
                       (rows + threadsPerBlock.y - 1) / threadsPerBlock.y);

    matrix_transpose_kernel<<<blocksPerGrid, threadsPerBlock>>>(input, output, rows, cols);
    cudaDeviceSynchronize();
}
  1. 4:
cuda 复制代码
#include <cuda_runtime.h>

__global__ void invert_kernel(unsigned char* image, int width, int height) {
    int idx = blockIdx.x * blockDim.x +threadIdx.x;
    if(idx < width * height) {
        uchar4 * pixel = (uchar4 *)(image + idx * 4);
        uchar4 p = *pixel;
        p.x = 255 - p.x;
        p.y = 255 - p.y;
        p.z = 255 - p.z;
        *pixel = p;
        
    }
}
// image_input, image_output are device pointers (i.e. pointers to memory on the GPU)
extern "C" void solve(unsigned char* image, int width, int height) {
    int threadsPerBlock = 256;
    int blocksPerGrid = (width * height + threadsPerBlock - 1) / threadsPerBlock;

    invert_kernel<<<blocksPerGrid, threadsPerBlock>>>(image, width, height);
    cudaDeviceSynchronize();
}
  1. 5:
cuda 复制代码
#include <cuda_runtime.h>

__global__ void matrix_add(const float* A, const float* B, float* C, int N) {
    // int row = blockIdx.y * blockDim.y + threadIdx.y;
    int idx = blockIdx.x * blockDim.x + threadIdx.x;
    if(idx < N * N) {
        C[idx] = (A[idx] + B[idx]);
    }
}

// A, B, C are device pointers (i.e. pointers to memory on the GPU)
extern "C" void solve(const float* A, const float* B, float* C, int N) {
    int threadsPerBlock = 256;
    int blocksPerGrid = (N * N + threadsPerBlock - 1) / threadsPerBlock;

    matrix_add<<<blocksPerGrid, threadsPerBlock>>>(A, B, C, N);
    cudaDeviceSynchronize();
}
  1. 6:
cuda 复制代码
#include <cuda_runtime.h>

__global__ void convolution_1d_kernel(const float* input, const float* kernel, float* output,
                                      int input_size, int kernel_size) {
                                        int idx1 = blockIdx.x * blockDim.x + threadIdx.x;
                                        if(idx1 <  input_size - kernel_size + 1 ) {
                                            float t = 0.0f;
                                            for(int idx2 = 0; idx2 <= kernel_size - 1; idx2 ++){
                                                t += input[idx1 + idx2] * kernel[idx2];
                                            }
                                            output[idx1] = t;

                                        } 
                                      }

// input, kernel, output are device pointers (i.e. pointers to memory on the GPU)
extern "C" void solve(const float* input, const float* kernel, float* output, int input_size,
                      int kernel_size) {
    int output_size = input_size - kernel_size + 1;
    int threadsPerBlock = 256;
    int blocksPerGrid = (output_size + threadsPerBlock - 1) / threadsPerBlock;

    convolution_1d_kernel<<<blocksPerGrid, threadsPerBlock>>>(input, kernel, output, input_size,
                                                              kernel_size);
    cudaDeviceSynchronize();
}
  1. 7:
cuda 复制代码
#include <cuda_runtime.h>

__global__ void reverse_array(float* input, int N) {
    int idx = blockIdx.x * blockDim.x + threadIdx.x;
    if(idx < N / 2) {
        int t = input[idx];
        input[idx] = input[N - idx - 1];
        input[N - idx - 1] = t;
    }
}

// input is device pointer
extern "C" void solve(float* input, int N) {
    int threadsPerBlock = 256;
    int blocksPerGrid = (N + threadsPerBlock - 1) / threadsPerBlock;

    reverse_array<<<blocksPerGrid, threadsPerBlock>>>(input, N);
    cudaDeviceSynchronize();
}
  1. 8
相关推荐
To_OC4 小时前
LC 994 腐烂的橘子:人人都说是 BFS 入门题,我却写了三遍才过
javascript·算法·leetcode
金銀銅鐵8 小时前
[Python] 扩展欧几里得算法
python·数学·算法
To_OC10 小时前
LC 200 岛屿数量:经典 DFS 入门题,我第一次写居然连方向都搞错了
javascript·算法·leetcode
To_OC1 天前
LC 128 最长连续序列:别上来就排序,O (n) 解法才是这题的灵魂
javascript·算法·leetcode
05Kevin2 天前
lk每日冒险题--数据结构6.27
算法
To_OC2 天前
从一次栈溢出报错说起,我把递归彻底扒明白了
javascript·算法·程序员
千纸鹤安安2 天前
千问Qwen-AgentWorld来了:一个语言模型搞定七大Agent场景,GPT-5.4都输了
算法
七牛开发者2 天前
MCP 到底是什么?为什么 Agent 都想接上它
算法·aigc·agent