卷积神经网络实现手写数字识别

一、实验介绍

手写数字识别是一个典型的图像分类问题,在日常生活中已经被广泛地应用。本实验基于mnist数据集,通过搭建卷积神经网络完成手写数字识别模型的训练,并通过训练集与自制手写数字图片对模型性能进行评估。

二、设计过程

1. 环境搭建

  • 安装Python、Pytorch等必要的开发环境。

  • 创建一个新的Python项目,用于存放实验代码和数据。

2. 数据加载与处理

  • 从mnist数据集中下载训练和测试数据。

    Mnist是一个手写字体的数字图像数据集,包含数字0-9共计70000张图像,其中训练集60000张测试集10000张,每张图像的大小为28*28。数据集地址:http://yann.lecun.com/exdb/mnist/。下载来的数据是784维度的数组,并不是图像数据,所以首先我们要把下载来的数据转化成图像。本文采用torchvision内部封装好的数据集部分torchvision.datasets来下载mnist

    部分代码如下,代码文件:mnist_save.py

python 复制代码
import torch
import torchvision
import os
from torchvision import transforms
from torchvision.datasets import mnist
import torch.utils.data as data
import numpy as np
from PIL import Image
import matplotlib.pyplot as plt
import itertools

# 将mnist保存为图像
def save():
    os.makedirs('mnist/train', exist_ok=True)
    os.makedirs('mnist/test', exist_ok=True)
    for i in range(10):
        os.makedirs('mnist/train/' + str(i), exist_ok=True)
        os.makedirs('mnist/test/' + str(i), exist_ok=True)
    # 保存训练集
    for i, item in enumerate(train_loader):
        img, label = item
        img = img[0].cpu().numpy()
        array = (img.reshape((28, 28)) * 255).astype(np.uint8)
        img = Image.fromarray(array, 'L')
        label = label.cpu().numpy()[0]
        img_path = 'mnist/train/' + str(label) + '/' + str(i) + '.jpg'
        print(img_path)
        img.save(img_path)

    # 保存测试集
    for i, item in enumerate(test_loader):
        img, label = item
        img = img[0].cpu().numpy()
        array = (img.reshape((28, 28)) * 255).astype(np.uint8)
        img = Image.fromarray(array, 'L')
        label = label.cpu().numpy()[0]
        img_path = 'mnist/test/' + str(label) + '/' + str(i) + '.jpg'
        print(img_path)
        img.save(img_path)


# 查看部分mnist图像
def show():
    plt.figure(figsize=(16, 9))
    for i, item in enumerate(itertools.islice(train_loader,2,12)):
        plt.subplot(2, 5, i+1)
        img,label= item
        img = img[0].cpu().numpy()
        array = (img.reshape((28, 28)) * 255).astype(np.uint8)
        img = Image.fromarray(array, 'L')
        label = label.cpu().numpy()[0]
        plt.imshow(img, cmap=plt.get_cmap('gray'))
    plt.show()

if __name__ == '__main__':
    train_data = mnist.MNIST('mnist', train=True, transform=transforms.ToTensor(), download=True)
    test_data = mnist.MNIST('mnist', train=False, transform=transforms.ToTensor(), download=True)
    train_loader = data.DataLoader(dataset=train_data, batch_size=1, shuffle=True)
    test_loader = data.DataLoader(dataset=test_data, batch_size=1, shuffle=True)
    train_total = train_loader.__len__()
    test_total = test_loader.__len__()
    labels = train_data.targets
    print(train_data.targets)
    print(train_total, test_total)
    dataiter = iter(train_data)
    print(train_data)
    images, labs = dataiter.__next__()
    print(type(images), type(labs))
    print(images.shape, labs)
    save()
    show() 
  • 使用pytorch自带的dataset为基类,进行重构数据加载部分。
python 复制代码
class MnistData(Dataset):
    def __init__(self, data, transform=None, target_transform=None):
        super(Dataset, self).__init__()
        self.transform = transform
        self.target_transform = target_transform
        self.samples = data

    def __len__(self):
        return len(self.samples)

    def __getitem__(self, index):
        img_path, target = self.samples[index]
        img = Image.open(img_path).convert('RGB')
        if self.transform is not None:
            img = self.transform(img)
        if self.target_transform is not None:
            target = self.target_transform(target)

        return img, target

3. 构建网络模型

数据流向和代码实现如下:CNN.py

python 复制代码
class CnnNet(nn.Module):
    def __init__(self, classes=10):
        super(CnnNet, self).__init__()
        # 分类数
        self.classes = classes
        # 第一层卷积,输入:bs*3*28*28 输出:bs*16*14*14
        self.conv1 = nn.Sequential(
            # 卷积 输入:bs*3*28*28  输出:bs*16*28*28
            nn.Conv2d(in_channels=3, out_channels=16, kernel_size=3, stride=1, padding=1),
            # 归一化
            nn.BatchNorm2d(16),
            # 激活函数
            nn.ReLU(),
            # 最大池化:输入:bs*16*28*28  输出:bs*16*14*14
            nn.MaxPool2d(2)
        )
        # 第二层卷积,输入:bs*16*14*14 输出:bs*64*7*7
        self.conv2 = nn.Sequential(
            # 卷积 输入:bs*16*14*14  输出:bs*32*14*14
            nn.Conv2d(in_channels=16, out_channels=32, kernel_size=3, stride=1, padding=1),
            # 归一化
            nn.BatchNorm2d(32),
            # 激活函数
            nn.ReLU(),
            # 最大池化:输入:bs*16*14*14  输出:bs*32*7*7
            nn.MaxPool2d(2)
        )
        # 第三层卷积,输入:bs*32*7*7 输出:bs*64*3*3
        self.conv3 = nn.Sequential(
            # 卷积 输入:bs*32*7*7  输出:bs*64*3*3
            nn.Conv2d(in_channels=32, out_channels=64, kernel_size=3, stride=1, padding=1),
            # 归一化
            nn.BatchNorm2d(64),
            # 激活函数
            nn.ReLU(),
            # 最大池化:输入:bs*32*7*7 输出:bs*64*3*3
            nn.MaxPool2d(2)
        )
        # 自适应池化,将bs*64*3*3映射为bs*64*1*1
        self.advpool = nn.AdaptiveAvgPool2d((1, 1))
        # 全连接层
        self.fc = nn.Linear(64, self.classes)

    def forward(self, x):
        x = self.conv1(x)
        x = self.conv2(x)
        x = self.conv3(x)
        x = self.advpool(x)
        # 需要将多维度的值展平为一维,送入linear中,但是需要保持batchsize的维度
        # 例如2*64*1*1 变成2*64
        out = x.view(x.size(0), -1)
        out = self.fc(out)
        return out


if __name__ == '__main__':
    # 测试数据
    x = torch.rand((2, 3, 28, 28))
    cnn = CnnNet(classes=10)
    print(cnn)
    out = cnn(x)
    print(out)
    device = torch.device('cuda' if torch.cuda.is_available() else 'cpu')
    cnn = cnn.to(device)
    # 网络模型的数据流程及参数信息
    summary(cnn, (3, 28, 28))

4. 模型训练(可视化)

为了使模型达到较好效果,我们需要对传入的参数进行调整。

对epoch进行调参

EPOCH 8 18 36 50 81
loss 0.0019 0.0010 0.0009 0.0008 0.0007
train_acc 0.9861 0.9938 0.9951 0.9958 0.9965
val_acc 0.9855 0.9468 0.9897 0.9903 0.9915

选择epoch=50

对bs进行调参

BS 8 16 32 64 128
lose 0.0458 0.0325 0.0008 0.0007 0.0114
train_acc 0.9153 0.8879 0.9958 0.9969 0.7039
val_acc 0.9747 0.9595 0.9903 0.9880 0.8622

选择bs=64

对lr进行调参

LR 0.0001 0.0003 0.0005 0.001 0.01
loss 0.1531 0.0569 0.0311 0.0308 0.0004
train_acc 0.8887 0.9423 0.9517 0.9644 0.9969
val_acc 0.893 0.9454 0.9597 0.9648 0.9880

选择lr=0.01

python 复制代码
import torch
import torch.nn as nn
from torch.utils.data import DataLoader
from torchvision import transforms
import random
import os
from mnist_dataset import MnistData
from CNN import CnnNet
import time
os.environ["KMP_DUPLICATE_LIB_OK"] = "TRUE"
import matplotlib.pyplot as plt


# 将数据集划分训练集和验证集
def split_data(files):
    """
    :param files:
    :return:
    """
    random.shuffle(files)
    # 计算比例系数,分割数据训练集和验证集
    ratio = 0.9
    offset = int(len(files) * ratio)
    train_data = files[:offset]
    val_data = files[offset:]
    return train_data, val_data


# 训练
def train(model, loss_func, optimizer, checkpoints, epoch):
    print('Train......................')
    best_acc = 0
    best_epoch = 0
    train_losses = []
    train_accs = []
    val_losses = []
    val_accs = []

    for epoch in range(1, epoch + 1):
        # 设置计时器,计算每个epoch的用时
        start_time = time.time()
        model.train()  # 保证每一个batch都能进入model.train()的模式
        # 记录每个epoch的loss和acc
        train_loss, train_acc, val_loss, val_acc = 0, 0, 0, 0
        for i, (inputs, labels) in enumerate(train_data):
            # print(batch_size)
            # print(i, inputs, labels)
            inputs = inputs.to(device)
            labels = labels.to(device)
            # 预测输出
            outputs = model(inputs)
            # 计算损失
            loss = loss_func(outputs, labels)
            # print(outputs)
            # 因为梯度是累加的,需要清零
            optimizer.zero_grad()
            # 反向传播
            loss.backward()
            # 优化器
            optimizer.step()
            # 计算准确率
            output = nn.functional.softmax(outputs, dim=1)
            pred = torch.argmax(output, dim=1)
            acc = torch.sum(pred == labels)
            train_loss += loss.item()
            train_acc += acc.item()
        # 验证集进行验证
        model.eval()
        with torch.no_grad():
            for i, (inputs, labels) in enumerate(val_data):
                inputs = inputs.to(device)
                labels = labels.to(device)
                # 预测输出
                outputs = model(inputs)
                # 计算损失
                loss = loss_func(outputs, labels)
                # 计算准确率
                output = nn.functional.softmax(outputs, dim=1)
                pred = torch.argmax(output, dim=1)
                # print(pred,'================')
                # print(pred==labels,'=====----------======')
                acc = torch.sum(pred == labels)
                # acc = calculat_acc(outputs, labels)
                val_loss += loss.item()
                val_acc += acc.item()

                # 计算每个epoch的训练损失和精度
            train_loss_epoch = train_loss / train_data_size
            train_acc_epoch = train_acc / train_data_size
            # 计算每个epoch的验证集损失和精度
            val_loss_epoch = val_loss / val_data_size
            val_acc_epoch = val_acc / val_data_size
            end_time = time.time()
            print(
                'epoch:{} | time:{:.4f} | train_loss:{:.4f} | train_acc:{:.4f} | eval_loss:{:.4f} | val_acc:{:.4f}'.format(
                    epoch, end_time - start_time, train_loss_epoch, train_acc_epoch, val_loss_epoch, val_acc_epoch))

            # 记录损失和准确率
            train_losses.append(train_loss_epoch)
            train_accs.append(train_acc_epoch)
            val_losses.append(val_loss_epoch)
            val_accs.append(val_acc_epoch)
        print(
            'epoch:{} | time:{:.4f} | train_loss:{:.4f} | train_acc:{:.4f} | eval_loss:{:.4f} | val_acc:{:.4f}'.format(
                epoch,
                end_time - start_time,
                train_loss_epoch,
                train_acc_epoch,
                val_loss_epoch,
                val_acc_epoch))

        # 记录验证集上准确率最高的模型
        best_model_path = checkpoints + "/" + 'best_model' + '.pth'
        if val_acc_epoch >= best_acc:
            best_acc = val_acc_epoch
            best_epoch = epoch
            torch.save(model, best_model_path)
        print('Best Accuracy for Validation :{:.4f} at epoch {:d}'.format(best_acc, best_epoch))
        # 每迭代50次保存一次模型
        # if epoch % 50 == 0:
        #     model_name = '/epoch_' + str(epoch) + '.pt'
        #     torch.save(model, checkpoints + model_name)
    # 保存最后的模型
    torch.save(model, checkpoints + '/last.pt')

 # 绘制损失和准确率曲线
    plt.figure(figsize=(12, 5))
    plt.subplot(1, 2, 1)
    plt.plot(train_losses, label='Train Loss')
    plt.plot(val_losses, label='Validation Loss')
    plt.title('Loss per Epoch')
    plt.xlabel('Epoch')
    plt.ylabel('Loss')
    plt.legend()

    plt.subplot(1, 2, 2)
    plt.plot(train_accs, label='Train Accuracy')
    plt.plot(val_accs, label='Validation Accuracy')
    plt.title('Accuracy per Epoch')
    plt.xlabel('Epoch')
    plt.ylabel('Accuracy')
    plt.legend()

    plt.tight_layout()
    plt.show()


if __name__ == '__main__':
    # batchsize
    bs = 32
    # learning rate
    lr = 0.01
    # epoch
    epoch = 50
    # checkpoints,模型保存路径
    checkpoints = 'checkpoints'
    os.makedirs(checkpoints, exist_ok=True)
    transform = transforms.Compose([
        transforms.ToTensor()
    ])

    # 训练0-9的数据
    labels = [0, 1, 2, 3, 4, 5, 6, 7, 8, 9]
    base_dir = 'mnist/train'
    imgs = []
    # 获取数据
    for label in labels:
        label_dir = os.path.join(base_dir, str(label))
        images = os.listdir(label_dir)
        for img in images:
            img_path = os.path.join(label_dir, img)
            imgs.append((img_path, label))
    print(len(imgs))
    # 将训练数据拆分成训练集和验证集
    trains, vals = split_data(imgs)
    # 加载训练集数据
    train_dataset = MnistData(trains, transform=transform)
    train_data = DataLoader(train_dataset, batch_size=bs, shuffle=True, num_workers=0)
    # 加载验证集数据
    val_dataset = MnistData(vals, transform=transform)
    val_data = DataLoader(val_dataset, batch_size=bs, shuffle=True, num_workers=0)
    train_data_size = train_dataset.__len__()
    val_data_size = val_dataset.__len__()
    print(train_dataset.__len__())
    print(val_dataset.__len__())

    # 加载模型
    model = CnnNet(classes=10)
    # GPU是否可用,如果可用,则使用GPU
    device = torch.device('cuda' if torch.cuda.is_available() else 'cpu')
    model.to(device)
    # 损失函数
    loss_func = nn.CrossEntropyLoss()
    # 优化器,使用SGD,可换Adam
    optimizer = torch.optim.SGD(model.parameters(), lr=lr, weight_decay=1e-3)
    # 训练
    train(model, loss_func, optimizer, checkpoints, epoch)

5. 识别数字的准确率

python 复制代码
import torch
import torch.nn as nn
from torchvision import transforms
from PIL import Image
import os
import json

# 手写数字预测+特征提取

# 预测类
class Pred:
    def __init__(self):
        self.labels = [0, 1, 2, 3, 4, 5, 6, 7, 8, 9]
        model_path = 'checkpoints/best_model.pth'
        # 加载模型
        self.model = torch.load(model_path)
        self.device = torch.device('gpu' if torch.cuda.is_available() else 'cpu')
        self.model = self.model.to(self.device)

    # 预测
    def predict(self, img_path):
        transform = transforms.Compose([
            transforms.ToTensor()
        ])
        img = Image.open(img_path).convert('RGB')
        img = transform(img)
        img = img.view(1, 3, 28, 28).to(self.device)
        output = self.model(img)
        output = torch.softmax(output, dim=1)
        # 每个预测值的概率
        probability = output.cpu().detach().numpy()[0]
        # 找出最大概率值的索引
        output = torch.argmax(output, dim=1)
        index = output.cpu().numpy()[0]
        # 预测结果
        pred = self.labels[index]
        return pred


if __name__ == '__main__':
    test_dir = 'mnist/test'
    pred = Pred()

    for label in range(10):
        test_dir_path = os.path.join(test_dir,str(label))
        total_img = 0
        correct_prediction = 0
        for file in os.listdir(test_dir_path):
            file_path = os.path.join(test_dir_path,file)

            correct_number = label
            pred_num = pred.predict(file_path)
            if pred_num == correct_number:
                correct_prediction += 1
            total_img += 1
        accuracy = (correct_prediction/total_img)*100
        print(f"Number {label} 's accuracy is {accuracy:3f}%")
    print("done")
相关推荐
MarkHD17 分钟前
第十一天 线性代数基础
线性代数·决策树·机器学习
打羽毛球吗️20 分钟前
机器学习中的两种主要思路:数据驱动与模型驱动
人工智能·机器学习
好喜欢吃红柚子37 分钟前
万字长文解读空间、通道注意力机制机制和超详细代码逐行分析(SE,CBAM,SGE,CA,ECA,TA)
人工智能·pytorch·python·计算机视觉·cnn
小馒头学python41 分钟前
机器学习是什么?AIGC又是什么?机器学习与AIGC未来科技的双引擎
人工智能·python·机器学习
正义的彬彬侠1 小时前
《XGBoost算法的原理推导》12-14决策树复杂度的正则化项 公式解析
人工智能·决策树·机器学习·集成学习·boosting·xgboost
羊小猪~~1 小时前
神经网络基础--什么是正向传播??什么是方向传播??
人工智能·pytorch·python·深度学习·神经网络·算法·机器学习
软工菜鸡2 小时前
预训练语言模型BERT——PaddleNLP中的预训练模型
大数据·人工智能·深度学习·算法·语言模型·自然语言处理·bert
正义的彬彬侠2 小时前
【scikit-learn 1.2版本后】sklearn.datasets中load_boston报错 使用 fetch_openml 函数来加载波士顿房价
python·机器学习·sklearn
资源补给站2 小时前
论文2—《基于柔顺控制的智能神经导航手术机器人系统设计》文献阅读分析报告
机器学习·机器人·手术机器人
哔哩哔哩技术3 小时前
B站S赛直播中的关键事件识别与应用
深度学习