There's no going back to VS2010 — VS2015 it is!
Thanks to CSDN blogger ouliten for popularizing cuDNN; my first cuDNN program was copied from his!
Reference: cuda-NCCL笔记(3)-- 分布式训练LeNet_分布式训练中,nccl梯度反传更新的原理-CSDN博客
Let me show off a bit first:

Training LeNet on handwritten digits with cuDNN is blazingly fast. The GPU is a real workhorse!
After training twice, the score on 10,000 test images is 98.1%. Not bad — whenever my training reaches that level, I'm satisfied.

ouliten's original version is for Linux and is written for 2 threads, i.e. 2 GPUs.
I ported it to Windows 10 Pro, VS2015 C++, an NVIDIA GTX 1060 (6 GB), CUDA 9.0, cuDNN 7.1.4.
This teacher ouliten is really good!
cuDNN worked on the first try; about a week of effort was enough. People can be lazy — one delay turns into a year, and that's me: I put off Python for ten years. This year is my personal "year one" of AI; I've set the goal and everything else has to make way for it.
My Python + PyTorch version (also a first for me) of CIFAR-10 with residual training also hit a new high of 84%.
It's been a trek through every hardship. AI is not for the faint of heart — you need a big enough vow to stand the cold.
Using .cu files for the first time really takes getting used to.
But I've written (well, copied) LeNet a dozen-plus times, so I know it inside out — that's why I could move this fast.
As Stephen Chow would put it: hundreds of times, I know it by heart!
OK, first the MNIST data loading, using the class below:
// load MNIST
#include <iostream>
#include <fstream>
#include <stdexcept>
#include <random>
#include <algorithm>
#include <cstdint>
#include <cstdio>   // FILE, fopen_s, fread
#include <string>
#include <vector>
struct MNISTSamlpe
{
MNISTSamlpe(const std::vector<float>& image_, const unsigned char &label_) :image(image_), label(label_) {}
std::vector<float> image;//28*28
unsigned char label;
};
class MNISTDataset {
public:
MNISTDataset(const std::string &image_file, const std::string &label_file) {
load_images(image_file);
load_labels(label_file);
if (images.size() != labels.size()) {
throw std::runtime_error("Number of images and labels mismatch");
}
indices.resize(images.size());
for (size_t i = 0; i<indices.size(); i++)
indices[i] = i;
}
//// Randomly shuffle the sample order (currently unused)
//void shuffle(unsigned seed = 42) {
// std::shuffle(indices.begin(), indices.end(), std::default_random_engine(seed));
// current_idx = 0;
//}
// Get the next batch of samples
std::vector<MNISTSamlpe> next_batch(size_t batch_size) {
std::vector<MNISTSamlpe> batch;
batch.reserve(batch_size);
for (size_t i = 0; i<batch_size; i++) {
if (current_idx >= indices.size()) current_idx = 0;
size_t idx = indices[current_idx++];
batch.emplace_back(MNISTSamlpe{ images[idx], labels[idx] });
}
return batch;
}
size_t size() const { return images.size(); }
private:
std::vector<std::vector<float>> images;
std::vector<unsigned char> labels;
std::vector<size_t> indices;
size_t current_idx = 0;
void load_images(const std::string &path) {
FILE* mnist_file = NULL;
int err = fopen_s(&mnist_file, path.c_str(), "rb"); // use the path passed in (same file as before)
if (mnist_file == NULL)
{
// std::cout << "load data from your file err..." << std::endl;
return;
}
unsigned char image_buffer[784]; // buffer for one 28x28 image
int head_info[1000]; // file header; not used for training
fread(head_info, 1, 16, mnist_file); // skip the 16-byte header
int num = 60000; int rows = 28; int cols = 28;
images.resize(num, std::vector<float>(rows*cols));
for (int i = 0; i < num; i++)
{
fread(image_buffer, 1, 784, mnist_file);
for (int j = 0; j < 28; j++)
{
for (int k = 0; k < 28; k++)
{
int index = j * 28 + k;
images[i][index] = image_buffer[index] / 255.0f; // scale pixels to [0,1]
}
}
}
//std::ifstream file(path, std::ios::binary);
//if (!file) throw std::runtime_error("Cannot open image file");
/* uint32_t magic, num, rows, cols;
file.read((char *)&magic, 4);
file.read((char*)&num, 4);
file.read((char*)&rows, 4);
file.read((char*)&cols, 4);
magic = __builtin_bswap32(magic);
num = __builtin_bswap32(num);
rows = __builtin_bswap32(rows);
cols = __builtin_bswap32(cols);
if (magic != 2051) throw std::runtime_error("Invalid MNIST image file");*/
//int num = 60000; int rows = 28; int cols = 28;
//images.resize(num, std::vector<float>(rows*cols));
///*images.resize(num, std::vector<float>(rows*cols));*/
//for (uint32_t i = 0; i<num; i++) {
// for (uint32_t j = 0; j<rows*cols; j++) {
// unsigned char pixel;
// file.read((char*)&pixel, 1);
// images[i][j] = pixel / 255.0f;
// }
//}
fclose(mnist_file);
mnist_file = NULL;
}
void load_labels(const std::string &path) {
FILE* mnist_file = NULL;
int err = fopen_s(&mnist_file, path.c_str(), "rb"); // use the path passed in (same file as before)
unsigned char label;
if (mnist_file == NULL)
{
// std::cout << "load label from your file err..." << std::endl;
return;
}
int head_info[1000]; // file header; not used for training
fread(head_info, 1, 8, mnist_file); // skip the 8-byte header
int num = 60000;
labels.resize(num);
for (int i = 0; i < num; i++)
{
fread((char*)&label, sizeof(label), 1, mnist_file);
labels[i] = label;
}
fclose(mnist_file);
mnist_file = NULL;
/* std::ifstream file(path, std::ios::binary);
if (!file) throw std::runtime_error("Cannot open label file");
uint32_t magic, num;
file.read((char*)&magic, 4);
file.read((char*)&num, 4);
magic = __builtin_bswap32(magic);
num = __builtin_bswap32(num);
if (magic != 2049) throw std::runtime_error("Invalid MNIST label file");*/
/* labels.resize(num);
for (uint32_t i = 0; i<num; i++) {
unsigned char lbl;
file.read((char*)&lbl, 1);
labels[i] = lbl;
}*/
}
};
Since I know this dataset very well, I switched to my own loading routine — otherwise, when something breaks, I wouldn't even know where.
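Before moving on, a quick sanity check of the loader (a minimal sketch, not part of the final program; it assumes the MNIST files sit at the same hard-coded c:\ paths used above):
MNISTDataset ds{ "c:\\train-images.idx3-ubyte", "c:\\train-labels.idx1-ubyte" };
std::cout << "samples: " << ds.size() << std::endl; // expect 60000
auto batch = ds.next_batch(10); // grab 10 samples
std::cout << "first label: " << (int)batch[0].label << std::endl;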
Next, cuDNN enters the stage. Here is the groundwork for LeNet:
#include <vector>
#include <memory>
#include <iostream>
#include <stdio.h>
#include <ctime>    // time(NULL), used to seed the weight initialization
#include <cudnn.h>
#include <cublas_v2.h>
#include "cuda_runtime.h"
#include "device_launch_parameters.h"
bool InitCUDA() {
int count;
cudaGetDeviceCount(&count);
if (count == 0)
{
fprintf(stderr, "There is no device.\n");
return false;
}
int i;
for (i = 0; i < count; i++)
{
cudaDeviceProp prop;
if (cudaGetDeviceProperties(&prop, i) == cudaSuccess) {
if (prop.major >= 1)
{
break;
}
}
}
if (i == count)
{
fprintf(stderr, "There is no device supporting CUDA 1.x.\n");
return false;
}
cudaSetDevice(i);
return true;
}
__global__ void add(int *a, int *b, int *c, int N) {
int index = blockIdx.x * blockDim.x + threadIdx.x;
if (index < N) {
c[index] = a[index] + b[index];
}
}
// The above is just a basic CUDA sanity test.
// Everything below is for LeNet.
void error_handling(cudaError_t res) {
if (res != cudaSuccess) {
std::cout << "error! " << cudaGetErrorString(res) << std::endl;
exit(EXIT_FAILURE);
}
}
void error_handling(cudnnStatus_t status) {
if (status != CUDNN_STATUS_SUCCESS) {
std::cout << "error! " << cudnnGetErrorString(status) << std::endl;
exit(EXIT_FAILURE);
}
}
void error_handling(cublasStatus_t status) {
if (status != CUBLAS_STATUS_SUCCESS) {
std::cout << "error! " << (status) << std::endl;
exit(EXIT_FAILURE);
}
}
// Initialize array p with uniform random numbers in [a, b].
__global__ void init_uniform(float* p, int n, unsigned seed, float a = -0.05f, float b = 0.05f) {
int i = blockIdx.x * blockDim.x + threadIdx.x;
if (i < n) {
// simple integer-hash pseudo-random generator (good enough for teaching purposes)
unsigned s = seed ^ (i * 747796405u + 2891336453u);
s ^= s >> 17; s *= 0xed5ad4bbU; s ^= s >> 11; s *= 0xac4c1b51U; s ^= s >> 15; s *= 0x31848babU; s ^= s >> 14;
float r = (s & 0x00FFFFFF) / float(0x01000000); // [0,1)
p[i] = a + (b - a) * r;
}
}
// Zero a tensor. Commonly used to reset gradients.
__global__ void zero_kernel(float* p, int n) {
int i = blockIdx.x * blockDim.x + threadIdx.x;
if (i < n) {
p[i] = 0.0f;
}
}
// Add the per-channel bias to a convolution / fully connected output.
// Idea: work out which channel c the element at idx belongs to, then add b[c]
// (bias is per channel: one value shared by the whole channel).
__global__ void add_bias_nchw(float* y, const float* b, int N, int C, int H, int W) {// b has C elements
int i = blockIdx.x * blockDim.x + threadIdx.x;
int total = N * C * H * W;
if (i < total) {
int c = (i / (H * W)) % C;// which channel this element belongs to
y[i] += b[c];
}
}
// Softmax forward + cross-entropy loss (single sample): logits -> prob, writes the loss.
// Standard softmax + CE, with a max shift to avoid exponent overflow.
__global__ void softmax_forward_loss(const float* logits, const int* label, float* prob, float* loss, int num_classes) {
// single-sample simplification
// 1) subtract the max to avoid overflow
float mx = logits[0];
for (int i = 1; i < num_classes; i++)
mx = fmaxf(mx, logits[i]);
float sum = 0.f;
for (int i = 0; i < num_classes; i++) {
float e = expf(logits[i] - mx);
prob[i] = e;
sum += e;
}
for (int i = 0; i < num_classes; i++)
prob[i] /= sum;
int y = *label;
float l = -logf(fmaxf(prob[y], 1e-12f));// the loss
*loss = l;
}
// Backward of softmax + CE: dlogits (dLoss/dlogits) = prob - onehot (derivation left to the reader)
__global__ void softmax_ce_backward(const float* prob, const int* label, float* dlogits, int num_classes) {
int i = blockIdx.x * blockDim.x + threadIdx.x;
if (i < num_classes) {
int y = *label;
dlogits[i] = prob[i] - (i == y ? 1.f : 0.f);// see the softmax gradient formula
}
}
// softmax + cross entropy loss (batch)
// logits: [batch_size, num_classes]
// labels: [batch_size]
// prob: [batch_size, num_classes]
// loss: [batch_size] (per-sample loss; the caller can average it)
__global__ void softmax_forward_loss_batch(
const float* logits, const unsigned* labels,
float* prob, float* loss,
int batch_size, int num_classes)
{
int b = blockIdx.x; // each block handles one sample
if (b >= batch_size) return;
const float* logit_row = logits + b * num_classes;
float* prob_row = prob + b * num_classes;
// 1) find the max to avoid overflow
float mx = logit_row[0];
for (int i = 1; i < num_classes; i++)
mx = fmaxf(mx, logit_row[i]);
// 2) exp & sum
float sum = 0.f;
for (int i = 0; i < num_classes; i++) {
float e = expf(logit_row[i] - mx);
prob_row[i] = e;
sum += e;
}
// 3) normalize (softmax)
for (int i = 0; i < num_classes; i++)
prob_row[i] /= sum;
// 4) cross entropy loss
unsigned y = labels[b];
float l = -logf(fmaxf(prob_row[y], 1e-12f));
loss[b] = l;
}
// backward: dlogits = prob - onehot
// prob: [batch_size, num_classes]
// labels: [batch_size]
// dlogits: [batch_size, num_classes]
__global__ void softmax_ce_backward_batch(
const float* prob, const unsigned* labels,
float* dlogits,
int batch_size, int num_classes)
{
int b = blockIdx.x; // sample index
int i = threadIdx.x; // class index
if (b >= batch_size || i >= num_classes) return;
int y = labels[b];
const float* prob_row = prob + b * num_classes;
float* dlogit_row = dlogits + b * num_classes;
dlogit_row[i] = prob_row[i] - (i == y ? 1.f : 0.f);
}
// SGD parameter update: W -= lr * dW
__global__ void sgd_update(float* W, const float* dW, float lr, int n) {
int i = blockIdx.x * blockDim.x + threadIdx.x;
if (i < n) {
W[i] -= lr * dW[i];
}
}
// Bias gradient of a convolution layer.
// Sum the output gradient dy over N, H, W to get one bias gradient per channel. **One thread per channel.**
// dy: gradient from the layer above, shape (N, C, H, W)
// Mathematically, db[c] = sum of dy over the N, H, W dimensions for channel c
__global__ void reduce_bias_grad(const float* dy, float* db, int N, int C, int H, int W) {
int c = blockIdx.x * blockDim.x + threadIdx.x;
if (c >= C) return;
float s = 0.f;
for (int n = 0; n < N; n++) {// every sample n in the batch
const float* p = dy + (n * C + c) * H * W;// start of sample n, channel c
for (int i = 0; i < H * W; i++)// every spatial position in the channel
s += p[i];// accumulate into s
}
db[c] = s;
}
// Flatten: copy a feature map (N,C,H,W) into an (N, C*H*W) row-major buffer (simple copy).
// Simplified here for N = 1, so it is just a copy.
__global__ void nchw_to_nxk(const float* x, float* y, int N, int C, int H, int W) {
// N = 1 simplification
int k = C * H * W;
int i = blockIdx.x * blockDim.x + threadIdx.x;
if (i < k) y[i] = x[i];
}
// Un-flatten; mainly useful for debugging or in the backward pass.
__global__ void nxk_to_nchw(const float* x, float* y, int N, int C, int H, int W) {
int k = C * H * W;
int i = blockIdx.x * blockDim.x + threadIdx.x;
if (i < k) y[i] = x[i];
}
// Initialize a ones vector on the GPU.
__global__ void init_ones_kernel(float* data, int n) {
int idx = blockIdx.x*blockDim.x + threadIdx.x;
if (idx < n) data[idx] = 1.0f;
}
// Scale a buffer by a constant factor.
__global__ void scale_kernel(float* data, int n, float factor) {
int idx = blockIdx.x * blockDim.x + threadIdx.x;
if (idx < n) {
data[idx] *= factor;
}
}
// The original author pasted the cuDNN prototype below for reference. It is already declared in cudnn.h, so this is informational only.
/* Tensor Bias addition : C = alpha * A + beta * C */
cudnnStatus_t
cudnnAddTensor(cudnnHandle_t handle,
const void *alpha,
const cudnnTensorDescriptor_t aDesc,
const void *A,
const void *beta,
const cudnnTensorDescriptor_t cDesc,
void *C);
//// Class-based implementation starts here
class Layer {// abstract layer interface
public:
// Forward convention: the caller passes a device pointer holding the input; the output is stored inside the layer.
virtual void forward(float* input) = 0;
// Backward follows the same convention with the roles reversed: the argument is the gradient of the output.
virtual void backward(float* grad_output) = 0;
virtual float* get_output() = 0;
virtual float* get_grad_input() = 0;
virtual void update(float lr) = 0;
virtual~Layer() {}
};
class Conv2D :public Layer {
public:
Conv2D(cudnnHandle_t &handle, int batch, int in_channels, int out_channels, int in_h, int in_w, int kernel_size, int stride = 1, int padding = 0) :
_handle(handle), _batch(batch), _in_channels(in_channels), _out_channels(out_channels), _in_h(in_h), _in_w(in_w), _kernel_size(kernel_size), _stride(stride), _padding(padding)
{
// create descriptors
error_handling(cudnnCreateTensorDescriptor(&_input_desc));// input
error_handling(cudnnCreateTensorDescriptor(&_output_desc));// output
error_handling(cudnnCreateTensorDescriptor(&_bias_desc));// bias
error_handling(cudnnCreateFilterDescriptor(&_filter_desc));// filter
error_handling(cudnnCreateConvolutionDescriptor(&_conv_desc));// convolution
// configure descriptors
error_handling(cudnnSetTensor4dDescriptor(_input_desc, CUDNN_TENSOR_NCHW, CUDNN_DATA_FLOAT, _batch, _in_channels, _in_h, _in_w));
error_handling(cudnnSetTensor4dDescriptor(_bias_desc, CUDNN_TENSOR_NCHW, CUDNN_DATA_FLOAT, 1, out_channels, 1, 1));
error_handling(cudnnSetFilter4dDescriptor(_filter_desc, CUDNN_DATA_FLOAT, CUDNN_TENSOR_NCHW, _out_channels, _in_channels, _kernel_size, _kernel_size));
error_handling(cudnnSetConvolution2dDescriptor(_conv_desc, padding, padding, stride, stride, 1, 1, CUDNN_CROSS_CORRELATION, CUDNN_DATA_FLOAT));
error_handling(cudnnGetConvolution2dForwardOutputDim(_conv_desc, _input_desc, _filter_desc, &_batch, &_out_channels, &_out_h, &_out_w));// query the output dimensions
error_handling(cudnnSetTensor4dDescriptor(_output_desc, CUDNN_TENSOR_NCHW, CUDNN_DATA_FLOAT, _batch, _out_channels, _out_h, _out_w));
// allocate GPU memory
www = (float*)malloc(sizeof(float)*_out_channels* _in_channels*_kernel_size*_kernel_size);// host copy of the weights (should also be zeroed)
error_handling(cudaMalloc(&_weight, sizeof(float)*_out_channels* _in_channels*_kernel_size*_kernel_size));
error_handling(cudaMalloc(&_bias, sizeof(float) * _out_channels));
error_handling(cudaMalloc(&_grad_weight, sizeof(float) * _out_channels * _in_channels * _kernel_size * _kernel_size));// gradients have the same shape as the parameters
error_handling(cudaMalloc(&_grad_bias, sizeof(float) * _out_channels));
error_handling(cudaMalloc(&_output, sizeof(float) * _batch * _out_channels * _out_h * _out_w));
error_handling(cudaMalloc(&_grad_input, sizeof(float) * _batch * _in_channels * _in_h * _in_w));
// initialize the parameters
int w_size = _out_channels * _in_channels * _kernel_size * _kernel_size;
int b_size = _out_channels;
init_uniform << <(w_size + 255) / 256, 256 >> > (_weight, w_size, time(NULL), -0.05f, 0.05f);
init_uniform << <(b_size + 255) / 256, 256 >> > (_bias, b_size, time(NULL) + 1, -0.05f, 0.05f);
zero_kernel << <(w_size + 255) / 256, 256 >> > (_grad_weight, w_size);
zero_kernel << <(_batch * _out_channels * _out_h * _out_w + 255) / 256, 256 >> > (_output, _batch * _out_channels * _out_h * _out_w);
zero_kernel << <(_batch * _in_channels * _in_h * _in_w + 255) / 256, 256 >> > (_grad_input, _batch * _in_channels * _in_h * _in_w);
cudaDeviceSynchronize();
// TODO: checking the results can wait
// pick algorithms
cudnnConvolutionFwdAlgoPerf_t fwdPerf[8]; int retFwd = 0;
cudnnGetConvolutionForwardAlgorithm_v7(_handle, _input_desc, _filter_desc, _conv_desc, _output_desc, 8, &retFwd, fwdPerf);
_fwd_algo = fwdPerf[0].algo;
cudnnConvolutionBwdFilterAlgoPerf_t bwdFiltPerf[8]; int retBF = 0;
cudnnConvolutionBwdDataAlgoPerf_t bwdDataPerf[8]; int retBD = 0;
cudnnGetConvolutionBackwardFilterAlgorithm_v7(_handle, _input_desc, _output_desc, _conv_desc, _filter_desc, 8, &retBF, bwdFiltPerf);
_bwd_filter_algo = bwdFiltPerf[0].algo;
cudnnGetConvolutionBackwardDataAlgorithm_v7(_handle, _filter_desc, _output_desc, _conv_desc, _input_desc, 8, &retBD, bwdDataPerf);
_bwd_data_algo = bwdDataPerf[0].algo;
// allocate the workspaces
cudnnGetConvolutionForwardWorkspaceSize(_handle, _input_desc, _filter_desc, _conv_desc, _output_desc, _fwd_algo, &_fwd_ws_size);
cudnnGetConvolutionBackwardFilterWorkspaceSize(_handle, _input_desc, _output_desc, _conv_desc, _filter_desc, _bwd_filter_algo, &_bwd_filter_ws_size);
cudnnGetConvolutionBackwardDataWorkspaceSize(_handle, _filter_desc, _output_desc, _conv_desc, _input_desc, _bwd_data_algo, &_bwd_data_ws_size);
if (_fwd_ws_size > 0)
cudaMalloc(&_fwd_ws, _fwd_ws_size);
else _fwd_ws = nullptr;
if (_bwd_filter_ws_size > 0)
cudaMalloc(&_bwd_filter_ws, _bwd_filter_ws_size);
else _bwd_filter_ws = nullptr;
if (_bwd_data_ws_size > 0)
cudaMalloc(&_bwd_data_ws, _bwd_data_ws_size);
else _bwd_data_ws = nullptr;
}
void forward(float* input)override {
_input = input;
const float alpha = 1.0f, beta = 0.0f;
// forward convolution
cudnnConvolutionForward(_handle, &alpha,
_input_desc, _input,
_filter_desc, _weight,
_conv_desc, _fwd_algo, _fwd_ws, _fwd_ws_size, &beta,
_output_desc, _output);
// add the bias
cudnnAddTensor(_handle, &alpha, _bias_desc, _bias, &alpha, _output_desc, _output);// note: both scaling factors are alpha here, see the API docs
}
void backward(float* grad_output)override {
const float alpha = 1.0f, beta = 0.0f;
// grad_bias
cudnnConvolutionBackwardBias(_handle, &alpha, _output_desc, grad_output, &beta, _bias_desc, _grad_bias);
// grad_weight
cudnnConvolutionBackwardFilter(_handle, &alpha, _input_desc, _input, _output_desc, grad_output,
_conv_desc, _bwd_filter_algo, _bwd_filter_ws, _bwd_filter_ws_size, &beta, _filter_desc, _grad_weight);
// grad_input: this one is handed to the previous layer
cudnnConvolutionBackwardData(_handle, &alpha, _filter_desc, _weight, _output_desc, grad_output,
_conv_desc, _bwd_data_algo, _bwd_data_ws, _bwd_data_ws_size, &beta, _input_desc, _grad_input);
}
float* get_output() override { return _output; }
float* get_grad_input() override { return _grad_input; }
float* getwgh() { return www; }// host copy of the weights; only meaningful after update() has run
void update(float lr) override {
int w_size = _out_channels * _in_channels * _kernel_size * _kernel_size;
int b_size = _out_channels;
// plain SGD
sgd_update << <(w_size + 255) / 256, 256 >> > (_weight, _grad_weight, lr, w_size);
sgd_update << <(b_size + 255) / 256, 256 >> > (_bias, _grad_bias, lr, b_size);
cudaDeviceSynchronize();
// ---- copy the weights back to the host ----
cudaMemcpy(www, _weight, _out_channels* _in_channels*_kernel_size*_kernel_size * sizeof(float), cudaMemcpyDeviceToHost);// could be used to save/load the model and skip retraining
}
~Conv2D() {
cudaFree(_weight);
cudaFree(_bias);
cudaFree(_grad_weight);
cudaFree(_grad_bias);
cudaFree(_output);
cudaFree(_grad_input);
if (_fwd_ws) cudaFree(_fwd_ws);
if (_bwd_filter_ws) cudaFree(_bwd_filter_ws);
if (_bwd_data_ws) cudaFree(_bwd_data_ws);
cudnnDestroyTensorDescriptor(_input_desc);
cudnnDestroyTensorDescriptor(_output_desc);
cudnnDestroyTensorDescriptor(_bias_desc);
cudnnDestroyFilterDescriptor(_filter_desc);
cudnnDestroyConvolutionDescriptor(_conv_desc);
}
private:
int _in_channels, _out_channels, _kernel_size, _stride, _padding, _batch;// kernel parameters (the kernel is square for now)
int _in_h, _in_w, _out_h, _out_w;// input / output shapes
float* _weight, *_bias;// kernel weights and bias
float* _input, *_output, *_grad_input;// input x, output, and the gradient passed back to the previous layer
float* _grad_weight, *_grad_bias;// gradients of the weights and of the bias
cudnnHandle_t& _handle;// reference to an external handle, so the whole program creates only one
cudnnTensorDescriptor_t _input_desc, _output_desc, _bias_desc;// tensor descriptors: input, output, bias
cudnnFilterDescriptor_t _filter_desc;// filter descriptor
cudnnConvolutionDescriptor_t _conv_desc;// convolution descriptor
cudnnConvolutionFwdAlgo_t _fwd_algo;// forward algorithm
cudnnConvolutionBwdFilterAlgo_t _bwd_filter_algo;// backward algorithm w.r.t. the filter
cudnnConvolutionBwdDataAlgo_t _bwd_data_algo;// backward algorithm w.r.t. the input
size_t _fwd_ws_size, _bwd_filter_ws_size, _bwd_data_ws_size;// workspace sizes
void* _fwd_ws = nullptr, *_bwd_filter_ws = nullptr, *_bwd_data_ws = nullptr;// workspace pointers
float * www;// host-side copy of the weights
};
class ReLU :public Layer {
public:
ReLU(cudnnHandle_t& handle, int n, int c, int h, int w) :_handle(handle), _n(n), _c(c), _h(h), _w(w) {
// create descriptors
cudnnCreateTensorDescriptor(&_input_desc);
cudnnCreateActivationDescriptor(&_act_desc);
// configure descriptors
cudnnSetTensor4dDescriptor(_input_desc, CUDNN_TENSOR_NCHW, CUDNN_DATA_FLOAT, _n, _c, _h, _w);
cudnnSetActivationDescriptor(_act_desc, CUDNN_ACTIVATION_RELU, CUDNN_PROPAGATE_NAN, 0.0f);
// allocate GPU memory
cudaMalloc(&_output, sizeof(float) * _n * _c * _h * _w);
cudaMalloc(&_grad_input, sizeof(float) * _n * _c * _h * _w);
}
void forward(float* input)override {
_input = input;
const float alpha = 1.0f, beta = 0.0f;
// run the activation; the result goes into _output
cudnnActivationForward(_handle, _act_desc, &alpha,
_input_desc, _input, &beta,
_input_desc, _output);
}
void backward(float* grad_output)override {
// compute the backward gradient; the result goes into _grad_input
const float alpha = 1.0f, beta = 0.0f;
cudnnActivationBackward(_handle, _act_desc, &alpha,
_input_desc, _output, // yDesc, y (forward output)
_input_desc, grad_output, // dyDesc, dy (gradient from the layer above)
_input_desc, _input, // xDesc, x (forward input)
&beta,
_input_desc, _grad_input); // dxDesc, dx
}
float* get_output()override { return _output; }
float* get_grad_input()override { return _grad_input; }
void update(float lr) override {}
~ReLU() {
cudaFree(_output);
cudaFree(_grad_input);
cudnnDestroyTensorDescriptor(_input_desc);
cudnnDestroyActivationDescriptor(_act_desc);
}
private:
float* _input;
float* _output;
float* _grad_input;
int _n, _c, _h, _w;
cudnnHandle_t& _handle;
cudnnTensorDescriptor_t _input_desc;// descriptor for the input (also used for the output)
cudnnActivationDescriptor_t _act_desc;// activation descriptor
};
class MaxPool2D :public Layer {
public:
MaxPool2D(cudnnHandle_t &handle, int n, int c, int h, int w, int ph, int pw, int padding, int stride) :
_handle(handle), _n(n), _c(c), _h(h), _w(w), _ph(ph), _pw(pw), _padding(padding), _stride(stride) {
// create descriptors
cudnnCreateTensorDescriptor(&_input_desc);
cudnnCreateTensorDescriptor(&_output_desc);
cudnnCreatePoolingDescriptor(&_pool_desc);
// configure descriptors
cudnnSetPooling2dDescriptor(_pool_desc, CUDNN_POOLING_MAX, CUDNN_PROPAGATE_NAN, _ph, _pw, _padding, _padding, _stride, _stride);
cudnnSetTensor4dDescriptor(_input_desc, CUDNN_TENSOR_NCHW, CUDNN_DATA_FLOAT, _n, _c, _h, _w);
cudnnGetPooling2dForwardOutputDim(_pool_desc, _input_desc, &_n, &_c, &_out_h, &_out_w);
cudnnSetTensor4dDescriptor(_output_desc, CUDNN_TENSOR_NCHW, CUDNN_DATA_FLOAT, _n, _c, _out_h, _out_w);
// allocate GPU memory
cudaMalloc(&_output, sizeof(float)*_n*_c*_out_h*_out_w);
cudaMalloc(&_grad_input, sizeof(float)*_n*_c*_h*_w);
}
void forward(float *input) override {
_input = input;
const float alpha = 1.0f, beta = 0.0f;
cudnnPoolingForward(_handle, _pool_desc, &alpha,
_input_desc, _input, &beta,
_output_desc, _output);
}
void backward(float *grad_output) override {
const float alpha = 1.0f, beta = 0.0f;
cudnnPoolingBackward(_handle, _pool_desc, &alpha, _output_desc, _output, _output_desc, grad_output, _input_desc, _input, &beta, _input_desc, _grad_input);
}
float *get_output()override { return _output; }
float *get_grad_input()override { return _grad_input; }
void update(float lr)override {}
~MaxPool2D() {
cudaFree(_output);
cudaFree(_grad_input);
cudnnDestroyTensorDescriptor(_input_desc);
cudnnDestroyTensorDescriptor(_output_desc);
cudnnDestroyPoolingDescriptor(_pool_desc);
}
private:
int _n, _c, _h, _w, _ph, _pw, _padding, _stride;// input dimensions, pooling window, padding and stride
int _out_h, _out_w;// output dimensions
float *_input, *_output, *_grad_input;
cudnnHandle_t &_handle;
cudnnTensorDescriptor_t _input_desc, _output_desc;// input / output tensor descriptors
cudnnPoolingDescriptor_t _pool_desc;// pooling descriptor
};
class Linear : public Layer {
public:
Linear(cublasHandle_t &handle_, int batch_, int in_f, int out_f)
: handle(handle_), batch(batch_), in_features(in_f), out_features(out_f)
{
www = (float*)malloc(sizeof(float) * in_features * out_features);// host copy of the weights (should also be zeroed)
// parameters and output buffers
cudaMalloc(&weight, sizeof(float) * in_features * out_features);
cudaMalloc(&bias, sizeof(float) * out_features);
cudaMalloc(&grad_weight, sizeof(float) * in_features * out_features);
cudaMalloc(&grad_bias, sizeof(float) * out_features);
cudaMalloc(&output, sizeof(float) * batch * out_features);
cudaMalloc(&grad_input, sizeof(float) * batch * in_features);
cudaMalloc(&ones, sizeof(float) * batch);
// initialize the parameters (the seed is simply fixed to 1 here; compare with the pre-class version)
init_uniform << <(in_features*out_features + 255) / 256, 256 >> > (weight, in_features*out_features, 1, -0.05f, 0.05f);
init_uniform << <(out_features + 255) / 256, 256 >> > (bias, out_features, 1, -0.05f, 0.05f);
// initialize the ones vector (used for the bias gradient)
init_ones_kernel << <(batch + 255) / 256, 256 >> >(ones, batch);
cudaDeviceSynchronize();
}
void forward(float* input_) override {
//input: [batch, in_features]
// weight: [in_features, out_features]
// bias: [out_features]
// output: [batch, out_features]
input = input_;
const float alpha = 1.0f, beta = 0.0f;
// output = input * weight
cublasSgemm(handle, CUBLAS_OP_N, CUBLAS_OP_N,
out_features, batch, in_features,
&alpha,
weight, out_features,
input, in_features,
&beta,
output, out_features);
// add bias:
const float beta2 = 1.0f;
for (int i = 0; i < batch; i++)
cublasSaxpy(handle, out_features, &alpha, bias, 1, output + i*out_features, 1);
}
void backward(float* grad_output) override {
//input [batch, in_features]
//grad_output [batch, out_features]
//grad_weight [in_features, out_features]
const float alpha = 1.0f, beta = 0.0f;
// grad_weight = input^T * grad_output
cublasSgemm(handle, CUBLAS_OP_N, CUBLAS_OP_T,
out_features, in_features, batch,
&alpha,
grad_output, out_features,
input, in_features,
&beta,
grad_weight, out_features);
// weight: [in_features, out_features]
//grad_output [batch, out_features]
//grad_input [batch, in_features]
// grad_input = grad_output * weight^T
cublasSgemm(handle, CUBLAS_OP_T, CUBLAS_OP_N,
in_features, batch, out_features,
&alpha,
weight, out_features,
grad_output, out_features,
&beta,
grad_input, in_features);
// grad_bias = grad_output^T * ones (sum grad_output over the batch dimension)
// grad_output is row-major [batch, out_features]; viewed column-major with ld = out_features
// it has out_features rows and batch columns, so m = out_features and n = batch here.
cublasSgemv(handle, CUBLAS_OP_N,
out_features, batch,
&alpha,
grad_output, out_features,
ones, 1,
&beta,
grad_bias, 1);
}
float* get_output() override { return output; }
float* get_grad_input() override { return grad_input; }
void update(float lr) override {
const float alpha = -lr;
// weight -= lr * grad_weight
cublasSaxpy(handle, in_features*out_features, &alpha, grad_weight, 1, weight, 1);
// bias -= lr * grad_bias
cublasSaxpy(handle, out_features, &alpha, grad_bias, 1, bias, 1);
cudaDeviceSynchronize();// make sure the update has finished before copying back
cudaMemcpy(www, weight, sizeof(float) * in_features * out_features, cudaMemcpyDeviceToHost);// host copy of the weights (bias not copied yet)
}
float* getwgh() { return www; }// host copy of the weights; only meaningful after update() has run
~Linear() {
cudaFree(weight);
cudaFree(bias);
cudaFree(grad_weight);
cudaFree(grad_bias);
cudaFree(output);
cudaFree(grad_input);
cudaFree(ones);
// delete www;
}
private:
cublasHandle_t &handle;
int in_features, out_features, batch;
float *weight, *bias;
float *grad_weight, *grad_bias;
float *input, *output, *grad_input;
float *ones;
float * www;
};
cublasStatus_t cublasSaxpy(
cublasHandle_t handle,
int n,
const float *alpha,
const float *x, int incx,
float *y, int incy);
void checkCuda(cudaError_t res) {
if (res != cudaSuccess) {
std::cerr << "CUDA Error: " << cudaGetErrorString(res) << std::endl;
exit(EXIT_FAILURE);
}
}
Step three: on top of the cuDNN API groundwork above, build the LeNet network:
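Before the class itself, here is the shape trace implied by the constructor arguments below (just a reading aid derived from the code, not extra code in the program):
// input:              [batch, 1, 28, 28]
// Conv2D 1->6, k=5:   [batch, 6, 24, 24]   (28 - 5 + 1 = 24)
// MaxPool 2x2, s=2:   [batch, 6, 12, 12]
// Conv2D 6->16, k=5:  [batch, 16, 8, 8]    (12 - 5 + 1 = 8)
// MaxPool 2x2, s=2:   [batch, 16, 4, 4]    -> flattened to 16*4*4 = 256
// Linear 256->120 + ReLU, Linear 120->84 + ReLU, Linear 84->10 (logits for the 10 digits)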
class LeNet :public Layer {
public:
LeNet(cublasHandle_t &cublas_, cudnnHandle_t &cudnn_, int batch_) :cublas(cublas_), cudnn(cudnn_), batch(batch_) {
layers.emplace_back(std::make_shared<Conv2D>(cudnn, batch, 1, 6, 28, 28, 5));// 0, has weights
layers.emplace_back(std::make_shared<ReLU>(cudnn, batch, 6, 24, 24));
layers.emplace_back(std::make_shared<MaxPool2D>(cudnn, batch, 6, 24, 24, 2, 2, 0, 2));
layers.emplace_back(std::make_shared<Conv2D>(cudnn, batch, 6, 16, 12, 12, 5));// 3, has weights
layers.emplace_back(std::make_shared<ReLU>(cudnn, batch, 16, 8, 8));
layers.emplace_back(std::make_shared<MaxPool2D>(cudnn, batch, 16, 8, 8, 2, 2, 0, 2));
layers.emplace_back(std::make_shared<Linear>(cublas, batch, 16 * 4 * 4, 120));// 6, has weights
layers.emplace_back(std::make_shared<ReLU>(cudnn, batch, 120, 1, 1));
layers.emplace_back(std::make_shared<Linear>(cublas, batch, 120, 84));// 8, has weights
layers.emplace_back(std::make_shared<ReLU>(cudnn, batch, 84, 1, 1));
layers.emplace_back(std::make_shared<Linear>(cublas, batch, 84, 10));// 10, has weights
cudaMalloc(&output, batch * 10 * sizeof(float));
cudaMalloc(&grad_input, batch * 1 * 28 * 28 * sizeof(float));
}
void forward(float *input_)override {
input = input_;
for (const auto &l : layers) {
l->forward(input);
input = l->get_output();
}
cudaMemcpy(output, input, sizeof(float)*batch * 10, cudaMemcpyDeviceToDevice);
}
void backward(float *grad_output)override {
float* grad = grad_output;
for (int i = layers.size() - 1; i >= 0; i--) {
layers[i]->backward(grad);
grad = layers[i]->get_grad_input();
}
cudaMemcpy(grad_input, grad, sizeof(float)*batch * 1 * 28 * 28, cudaMemcpyDeviceToDevice);
}
float* get_output() override { return output; }
float* get_grad_input() override { return grad_input; }
void update(float lr) {
for (const auto &l : layers) {
l->update(lr);
}
}
/*void getWghL1() {
layersWgh1= layers[0]->getwgh();
}
void getWghL2() {
layersWgh2 = layers[3]->getwgh();
}
void getWghL3() {
layersWgh3 = layers[6]->getwgh();
}
void getWghL4() {
layersWgh4 = layers[8]->getwgh();
}
void getWghL5() {
layersWgh5 = layers[10]->getwgh();
}*/
~LeNet() {
cudaFree(output);
cudaFree(grad_input);
}
//float* get_w_grad() {
// return nullptr;
//}
//float* get_b_grad() {
// return nullptr;
//}
//int get_w_grad_size() {
// return 0;
//}
//int get_b_grad_size() {
// return 0;
//}
//void allreduce_grads(ncclComm_t comm, int num_gpus, cudaStream_t stream) {// synchronizes gradients across GPUs in distributed training (unused here)
// for (const auto &layer : layers) {
// int w_size = layer->get_w_grad_size();
// if (w_size) {// this layer has weight gradients
// float *w_grad = layer->get_w_grad();// fetch the already-computed gradient
// ncclAllReduce(w_grad, w_grad, w_size, ncclFloat, ncclSum, comm, stream);
// // normalize by the number of GPUs
// scale_kernel << <(w_size + 255) / 256, 256 >> >(w_grad, w_size, 1.0 / num_gpus);
// }
// int b_size = layer->get_b_grad_size();
// if (b_size) {
// float *b_grad = layer->get_b_grad();// fetch the already-computed gradient
// ncclAllReduce(b_grad, b_grad, b_size, ncclFloat, ncclSum, comm, stream);
// // normalize by the number of GPUs
// scale_kernel << <(b_size + 255) / 256, 256 >> >(b_grad, b_size, 1.0 / num_gpus);
// }
// }
// cudaStreamSynchronize(stream);
//}
private:
cublasHandle_t &cublas;
cudnnHandle_t &cudnn;
int batch;
float *input, *output, *grad_input;
std::vector<std::shared_ptr<Layer>> layers;
float * layersWgh1; float * layersWgh2; float * layersWgh3; float * layersWgh4; float * layersWgh5;
};
Impressive — the C++ polymorphism here (virtual functions on a common Layer base, not really "overloading") is used masterfully! I tried three times to rewrite it the clumsy way and failed. It's concise and efficient, so I'll leave it alone!
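For anyone as rusty as I was, here is a stripped-down sketch of that pattern (hypothetical Double/AddOne layers, just an illustration, not part of the program): each concrete layer overrides the virtual forward, and the network only ever holds base-class pointers, exactly like the layers vector in LeNet above.
#include <cstdio>
#include <memory>
#include <vector>
struct TinyLayer { // plays the role of the Layer base class
virtual float forward(float x) = 0; // virtual call: resolved to the concrete layer at run time
virtual ~TinyLayer() {}
};
struct Double : TinyLayer { float forward(float x) override { return 2.0f * x; } };
struct AddOne : TinyLayer { float forward(float x) override { return x + 1.0f; } };
int main() {
std::vector<std::shared_ptr<TinyLayer>> net{ std::make_shared<Double>(), std::make_shared<AddOne>() };
float v = 3.0f;
for (const auto& l : net) v = l->forward(v); // same chaining loop as LeNet::forward
std::printf("%f\n", v); // prints 7.000000
}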
Step four: training:
//#include<thread>
void train(int epochs, int batch_size, float lr, LeNet &net, int rank, cudaStream_t stream, MNISTDataset &Dataset) {
float *d_inputs;
unsigned *d_labels;
// network output logits, softmax probabilities, per-sample loss, and the gradient of the logits (fed into LeNet's backward pass)
float *d_logits, *d_prob, *loss, *dlogits;
int num_classes = 10;
cudaMalloc(&d_inputs, batch_size * 28 * 28 * sizeof(float));
cudaMalloc(&d_labels, batch_size*sizeof(unsigned));
cudaMalloc(&d_logits, batch_size*num_classes*sizeof(float));
cudaMalloc(&d_prob, batch_size*num_classes*sizeof(float));
cudaMalloc(&loss, batch_size*sizeof(float));
cudaMalloc(&dlogits, batch_size*num_classes*sizeof(float));
std::vector<float> h_loss(batch_size);
for (int epoch = 0; epoch<epochs; epoch++) {
auto batch = Dataset.next_batch(batch_size);
std::vector<unsigned> h_labels(batch_size);
for (int batch_idx = 0; batch_idx<batch_size; batch_idx++) {
cudaMemcpy(d_inputs + batch_idx * 28 * 28, batch[batch_idx].image.data(), 28 * 28 * sizeof(float), cudaMemcpyHostToDevice);
h_labels[batch_idx] = batch[batch_idx].label;
}
cudaMemcpy(d_labels, h_labels.data(),
batch_size*sizeof(unsigned), cudaMemcpyHostToDevice);
net.forward(d_inputs);
cudaMemcpy(d_logits, net.get_output(), batch_size*num_classes*sizeof(float), cudaMemcpyDeviceToDevice);
softmax_forward_loss_batch << <batch_size, 1 >> >(d_logits, d_labels, d_prob, loss, batch_size, num_classes);
softmax_ce_backward_batch << <batch_size, num_classes >> >(d_prob, d_labels, dlogits, batch_size, num_classes);
net.backward(dlogits);
// net.allreduce_grads(comm, nDev, stream);
cudaStreamSynchronize(stream);
net.update(lr);
// ---- report progress ----
cudaMemcpy(h_loss.data(), loss, batch_size*sizeof(float), cudaMemcpyDeviceToHost);
float avg_loss = 0.f;
for (float l : h_loss) avg_loss += l;
avg_loss /= batch_size;
if (epoch % 1 == 0) { // print every iteration
std::cout << "[Rank " << rank << "] Epoch " << epoch
<< " Avg Loss = " << avg_loss
<< " (first label=" << h_labels[0] << ")"
<< std::endl;
}
}
cudaFree(d_inputs);
cudaFree(d_labels);
cudaFree(d_logits);
cudaFree(d_prob);
cudaFree(loss);
cudaFree(dlogits);
}
Step five: test on 10,000 images:
int last[10000];
void test( LeNet &net, cudaStream_t stream, MNISTDataset &Dataset) {// evaluate on 10000 images
float *d_inputs;
unsigned *d_labels;
for (int x = 0; x<10000; x++)
{
last[x] = 0;
}
// logits, softmax probabilities, loss, and the logit gradient (dlogits is allocated but not needed for testing)
float *d_logits, *d_prob, *loss, *dlogits;
int num_classes = 10;
cudaMalloc(&d_inputs, 1 * 28 * 28 * sizeof(float));
cudaMalloc(&d_labels, 1*sizeof(unsigned));
cudaMalloc(&d_logits, 1*10*sizeof(float));
cudaMalloc(&d_prob, 1*10*sizeof(float));
cudaMalloc(&loss, 1*sizeof(float));
cudaMalloc(&dlogits, 1*10*sizeof(float));
std::vector<float> h_loss(1); std::vector<float> h_prob(10);
for (int epoch = 0; epoch<10000; epoch++) {
auto batch = Dataset.next_batch(1);
std::vector<unsigned> h_labels(1);// one image and one label at a time; note these 10000 samples come from the same training-set loader, since main only loads the train files
for (int batch_idx = 0; batch_idx<1; batch_idx++) {
cudaMemcpy(d_inputs + batch_idx * 28 * 28, batch[batch_idx].image.data(), 28 * 28 * sizeof(float), cudaMemcpyHostToDevice);
h_labels[batch_idx] = batch[batch_idx].label;
}
cudaMemcpy(d_labels, h_labels.data(),
1*sizeof(unsigned), cudaMemcpyHostToDevice);
net.forward(d_inputs);
cudaMemcpy(d_logits, net.get_output(), 1*10*sizeof(float), cudaMemcpyDeviceToDevice);
softmax_forward_loss_batch << <1, 1 >> >(d_logits, d_labels, d_prob, loss, 1, num_classes);
cudaStreamSynchronize(stream);
// ---- report per-sample results ----
cudaMemcpy(h_loss.data(), loss, 1*sizeof(float), cudaMemcpyDeviceToHost);
cudaMemcpy(h_prob.data(), d_prob, 1 * 10 * sizeof(float), cudaMemcpyDeviceToHost);
int ans = 0;
for (int i = 1; i < 10; i++)
if (h_prob[i] > h_prob[ans]) ans = i;// index of the largest probability
if (h_labels[0] == ans)
last[epoch] = 1;
// the prediction counts as correct when the argmax matches the label,
// otherwise it counts as a miss
/* float avg_loss = 0.f;
for (float l : h_loss) avg_loss += l;
avg_loss /= 1;*/
if (epoch % 1 == 0) { // print every sample
std::cout << "Count " << epoch
<< " loss = " << h_loss[0]
<< " max prob = " << h_prob[ans]
<< " predicted label = " << ans
<< " (true label = " << h_labels[0] << ")"
<< std::endl;
}
}
cudaFree(d_inputs);
cudaFree(d_labels);
cudaFree(d_logits);
cudaFree(d_prob);
cudaFree(loss);
cudaFree(dlogits);
int P = 100;
int 得分数组[9];// per-block correct counts
int he = 9100;// score the last 900 predictions, in 9 blocks of 100
float junzhi = 0;
for (int n = 0; n < 9; n++)
{
int hh = 0;
for (int i = 0; i < P; i++)
{
hh += last[he + i];
}
he += 100;
junzhi += hh;
得分数组[n] = hh;
}
junzhi = junzhi / (float)9;// average correct per 100 = accuracy (percent) over the tail of the run
std::cout << "accuracy over the last 900 samples: " << junzhi << "%" << std::endl;
}
Finally, the main function:
int main() {
MNISTDataset Datasets = MNISTDataset{ "c:\\train-images.idx3-ubyte", "c:\\train-labels.idx1-ubyte" };
// so I dropped the threads, and ouliten's MNIST loader too
int dev = 0;
cudaSetDevice(dev);
// 创建流
cudaStream_t stream;
checkCuda(cudaStreamCreate(&stream));
cublasHandle_t cublas;
cudnnHandle_t cudnn;
cublasCreate(&cublas);// a handle is bound to the current context; one handle cannot run on different GPUs
cudnnCreate(&cudnn);
/*int batch_size = 8;*/
int batch_size = 10;
float lr = 0.018f;
LeNet LeNet_net(cublas, cudnn, batch_size);
train(6000, batch_size, lr, LeNet_net, 0, stream, Datasets);
test(LeNet_net,stream, Datasets);
// train and test a second time, to see whether the score improves
train(6000, batch_size, lr, LeNet_net, 0, stream, Datasets);
test(LeNet_net, stream, Datasets);
cublasDestroy(cublas);
cudnnDestroy(cudnn);
cudaStreamDestroy(stream);
}
One thing to clarify: I trained twice, which is what I call two epochs, while the `epoch` loop inside train() runs 6000 times. Those are not real epochs; with a batch size of 10, 6000 (iterations) × 10 (batch_size) = 60,000, which is exactly one pass over the 60,000 training images.
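A minimal sketch of that bookkeeping, using the size() accessor already defined on MNISTDataset (this would sit inside main, where Datasets is in scope):
size_t iters_per_pass = Datasets.size() / batch_size; // 60000 / 10 = 6000
// so train(6000, batch_size, ...) is exactly one full pass over the training set,
// and calling it twice, as main does, gives two epochs in the usual sense.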