卷积神经网络实现手写数字识别

一、实验介绍

手写数字识别是一个典型的图像分类问题,在日常生活中已经被广泛地应用。本实验基于mnist数据集,通过搭建卷积神经网络完成手写数字识别模型的训练,并通过训练集与自制手写数字图片对模型性能进行评估。

二、设计过程

1. 环境搭建

  • 安装Python、Pytorch等必要的开发环境。

  • 创建一个新的Python项目,用于存放实验代码和数据。

2. 数据加载与处理

  • 从mnist数据集中下载训练和测试数据。

    Mnist是一个手写字体的数字图像数据集,包含数字0-9共计70000张图像,其中训练集60000张测试集10000张,每张图像的大小为28*28。数据集地址:http://yann.lecun.com/exdb/mnist/。下载来的数据是784维度的数组,并不是图像数据,所以首先我们要把下载来的数据转化成图像。本文采用torchvision内部封装好的数据集部分torchvision.datasets来下载mnist

    部分代码如下,代码文件:mnist_save.py

python 复制代码
import torch
import torchvision
import os
from torchvision import transforms
from torchvision.datasets import mnist
import torch.utils.data as data
import numpy as np
from PIL import Image
import matplotlib.pyplot as plt
import itertools

# 将mnist保存为图像
def save():
    os.makedirs('mnist/train', exist_ok=True)
    os.makedirs('mnist/test', exist_ok=True)
    for i in range(10):
        os.makedirs('mnist/train/' + str(i), exist_ok=True)
        os.makedirs('mnist/test/' + str(i), exist_ok=True)
    # 保存训练集
    for i, item in enumerate(train_loader):
        img, label = item
        img = img[0].cpu().numpy()
        array = (img.reshape((28, 28)) * 255).astype(np.uint8)
        img = Image.fromarray(array, 'L')
        label = label.cpu().numpy()[0]
        img_path = 'mnist/train/' + str(label) + '/' + str(i) + '.jpg'
        print(img_path)
        img.save(img_path)

    # 保存测试集
    for i, item in enumerate(test_loader):
        img, label = item
        img = img[0].cpu().numpy()
        array = (img.reshape((28, 28)) * 255).astype(np.uint8)
        img = Image.fromarray(array, 'L')
        label = label.cpu().numpy()[0]
        img_path = 'mnist/test/' + str(label) + '/' + str(i) + '.jpg'
        print(img_path)
        img.save(img_path)


# 查看部分mnist图像
def show():
    plt.figure(figsize=(16, 9))
    for i, item in enumerate(itertools.islice(train_loader,2,12)):
        plt.subplot(2, 5, i+1)
        img,label= item
        img = img[0].cpu().numpy()
        array = (img.reshape((28, 28)) * 255).astype(np.uint8)
        img = Image.fromarray(array, 'L')
        label = label.cpu().numpy()[0]
        plt.imshow(img, cmap=plt.get_cmap('gray'))
    plt.show()

if __name__ == '__main__':
    train_data = mnist.MNIST('mnist', train=True, transform=transforms.ToTensor(), download=True)
    test_data = mnist.MNIST('mnist', train=False, transform=transforms.ToTensor(), download=True)
    train_loader = data.DataLoader(dataset=train_data, batch_size=1, shuffle=True)
    test_loader = data.DataLoader(dataset=test_data, batch_size=1, shuffle=True)
    train_total = train_loader.__len__()
    test_total = test_loader.__len__()
    labels = train_data.targets
    print(train_data.targets)
    print(train_total, test_total)
    dataiter = iter(train_data)
    print(train_data)
    images, labs = dataiter.__next__()
    print(type(images), type(labs))
    print(images.shape, labs)
    save()
    show() 
  • 使用pytorch自带的dataset为基类,进行重构数据加载部分。
python 复制代码
class MnistData(Dataset):
    def __init__(self, data, transform=None, target_transform=None):
        super(Dataset, self).__init__()
        self.transform = transform
        self.target_transform = target_transform
        self.samples = data

    def __len__(self):
        return len(self.samples)

    def __getitem__(self, index):
        img_path, target = self.samples[index]
        img = Image.open(img_path).convert('RGB')
        if self.transform is not None:
            img = self.transform(img)
        if self.target_transform is not None:
            target = self.target_transform(target)

        return img, target

3. 构建网络模型

数据流向和代码实现如下:CNN.py

python 复制代码
class CnnNet(nn.Module):
    def __init__(self, classes=10):
        super(CnnNet, self).__init__()
        # 分类数
        self.classes = classes
        # 第一层卷积,输入:bs*3*28*28 输出:bs*16*14*14
        self.conv1 = nn.Sequential(
            # 卷积 输入:bs*3*28*28  输出:bs*16*28*28
            nn.Conv2d(in_channels=3, out_channels=16, kernel_size=3, stride=1, padding=1),
            # 归一化
            nn.BatchNorm2d(16),
            # 激活函数
            nn.ReLU(),
            # 最大池化:输入:bs*16*28*28  输出:bs*16*14*14
            nn.MaxPool2d(2)
        )
        # 第二层卷积,输入:bs*16*14*14 输出:bs*64*7*7
        self.conv2 = nn.Sequential(
            # 卷积 输入:bs*16*14*14  输出:bs*32*14*14
            nn.Conv2d(in_channels=16, out_channels=32, kernel_size=3, stride=1, padding=1),
            # 归一化
            nn.BatchNorm2d(32),
            # 激活函数
            nn.ReLU(),
            # 最大池化:输入:bs*16*14*14  输出:bs*32*7*7
            nn.MaxPool2d(2)
        )
        # 第三层卷积,输入:bs*32*7*7 输出:bs*64*3*3
        self.conv3 = nn.Sequential(
            # 卷积 输入:bs*32*7*7  输出:bs*64*3*3
            nn.Conv2d(in_channels=32, out_channels=64, kernel_size=3, stride=1, padding=1),
            # 归一化
            nn.BatchNorm2d(64),
            # 激活函数
            nn.ReLU(),
            # 最大池化:输入:bs*32*7*7 输出:bs*64*3*3
            nn.MaxPool2d(2)
        )
        # 自适应池化,将bs*64*3*3映射为bs*64*1*1
        self.advpool = nn.AdaptiveAvgPool2d((1, 1))
        # 全连接层
        self.fc = nn.Linear(64, self.classes)

    def forward(self, x):
        x = self.conv1(x)
        x = self.conv2(x)
        x = self.conv3(x)
        x = self.advpool(x)
        # 需要将多维度的值展平为一维,送入linear中,但是需要保持batchsize的维度
        # 例如2*64*1*1 变成2*64
        out = x.view(x.size(0), -1)
        out = self.fc(out)
        return out


if __name__ == '__main__':
    # 测试数据
    x = torch.rand((2, 3, 28, 28))
    cnn = CnnNet(classes=10)
    print(cnn)
    out = cnn(x)
    print(out)
    device = torch.device('cuda' if torch.cuda.is_available() else 'cpu')
    cnn = cnn.to(device)
    # 网络模型的数据流程及参数信息
    summary(cnn, (3, 28, 28))

4. 模型训练(可视化)

为了使模型达到较好效果,我们需要对传入的参数进行调整。

对epoch进行调参

EPOCH 8 18 36 50 81
loss 0.0019 0.0010 0.0009 0.0008 0.0007
train_acc 0.9861 0.9938 0.9951 0.9958 0.9965
val_acc 0.9855 0.9468 0.9897 0.9903 0.9915

选择epoch=50

对bs进行调参

BS 8 16 32 64 128
lose 0.0458 0.0325 0.0008 0.0007 0.0114
train_acc 0.9153 0.8879 0.9958 0.9969 0.7039
val_acc 0.9747 0.9595 0.9903 0.9880 0.8622

选择bs=64

对lr进行调参

LR 0.0001 0.0003 0.0005 0.001 0.01
loss 0.1531 0.0569 0.0311 0.0308 0.0004
train_acc 0.8887 0.9423 0.9517 0.9644 0.9969
val_acc 0.893 0.9454 0.9597 0.9648 0.9880

选择lr=0.01

python 复制代码
import torch
import torch.nn as nn
from torch.utils.data import DataLoader
from torchvision import transforms
import random
import os
from mnist_dataset import MnistData
from CNN import CnnNet
import time
os.environ["KMP_DUPLICATE_LIB_OK"] = "TRUE"
import matplotlib.pyplot as plt


# 将数据集划分训练集和验证集
def split_data(files):
    """
    :param files:
    :return:
    """
    random.shuffle(files)
    # 计算比例系数,分割数据训练集和验证集
    ratio = 0.9
    offset = int(len(files) * ratio)
    train_data = files[:offset]
    val_data = files[offset:]
    return train_data, val_data


# 训练
def train(model, loss_func, optimizer, checkpoints, epoch):
    print('Train......................')
    best_acc = 0
    best_epoch = 0
    train_losses = []
    train_accs = []
    val_losses = []
    val_accs = []

    for epoch in range(1, epoch + 1):
        # 设置计时器,计算每个epoch的用时
        start_time = time.time()
        model.train()  # 保证每一个batch都能进入model.train()的模式
        # 记录每个epoch的loss和acc
        train_loss, train_acc, val_loss, val_acc = 0, 0, 0, 0
        for i, (inputs, labels) in enumerate(train_data):
            # print(batch_size)
            # print(i, inputs, labels)
            inputs = inputs.to(device)
            labels = labels.to(device)
            # 预测输出
            outputs = model(inputs)
            # 计算损失
            loss = loss_func(outputs, labels)
            # print(outputs)
            # 因为梯度是累加的,需要清零
            optimizer.zero_grad()
            # 反向传播
            loss.backward()
            # 优化器
            optimizer.step()
            # 计算准确率
            output = nn.functional.softmax(outputs, dim=1)
            pred = torch.argmax(output, dim=1)
            acc = torch.sum(pred == labels)
            train_loss += loss.item()
            train_acc += acc.item()
        # 验证集进行验证
        model.eval()
        with torch.no_grad():
            for i, (inputs, labels) in enumerate(val_data):
                inputs = inputs.to(device)
                labels = labels.to(device)
                # 预测输出
                outputs = model(inputs)
                # 计算损失
                loss = loss_func(outputs, labels)
                # 计算准确率
                output = nn.functional.softmax(outputs, dim=1)
                pred = torch.argmax(output, dim=1)
                # print(pred,'================')
                # print(pred==labels,'=====----------======')
                acc = torch.sum(pred == labels)
                # acc = calculat_acc(outputs, labels)
                val_loss += loss.item()
                val_acc += acc.item()

                # 计算每个epoch的训练损失和精度
            train_loss_epoch = train_loss / train_data_size
            train_acc_epoch = train_acc / train_data_size
            # 计算每个epoch的验证集损失和精度
            val_loss_epoch = val_loss / val_data_size
            val_acc_epoch = val_acc / val_data_size
            end_time = time.time()
            print(
                'epoch:{} | time:{:.4f} | train_loss:{:.4f} | train_acc:{:.4f} | eval_loss:{:.4f} | val_acc:{:.4f}'.format(
                    epoch, end_time - start_time, train_loss_epoch, train_acc_epoch, val_loss_epoch, val_acc_epoch))

            # 记录损失和准确率
            train_losses.append(train_loss_epoch)
            train_accs.append(train_acc_epoch)
            val_losses.append(val_loss_epoch)
            val_accs.append(val_acc_epoch)
        print(
            'epoch:{} | time:{:.4f} | train_loss:{:.4f} | train_acc:{:.4f} | eval_loss:{:.4f} | val_acc:{:.4f}'.format(
                epoch,
                end_time - start_time,
                train_loss_epoch,
                train_acc_epoch,
                val_loss_epoch,
                val_acc_epoch))

        # 记录验证集上准确率最高的模型
        best_model_path = checkpoints + "/" + 'best_model' + '.pth'
        if val_acc_epoch >= best_acc:
            best_acc = val_acc_epoch
            best_epoch = epoch
            torch.save(model, best_model_path)
        print('Best Accuracy for Validation :{:.4f} at epoch {:d}'.format(best_acc, best_epoch))
        # 每迭代50次保存一次模型
        # if epoch % 50 == 0:
        #     model_name = '/epoch_' + str(epoch) + '.pt'
        #     torch.save(model, checkpoints + model_name)
    # 保存最后的模型
    torch.save(model, checkpoints + '/last.pt')

 # 绘制损失和准确率曲线
    plt.figure(figsize=(12, 5))
    plt.subplot(1, 2, 1)
    plt.plot(train_losses, label='Train Loss')
    plt.plot(val_losses, label='Validation Loss')
    plt.title('Loss per Epoch')
    plt.xlabel('Epoch')
    plt.ylabel('Loss')
    plt.legend()

    plt.subplot(1, 2, 2)
    plt.plot(train_accs, label='Train Accuracy')
    plt.plot(val_accs, label='Validation Accuracy')
    plt.title('Accuracy per Epoch')
    plt.xlabel('Epoch')
    plt.ylabel('Accuracy')
    plt.legend()

    plt.tight_layout()
    plt.show()


if __name__ == '__main__':
    # batchsize
    bs = 32
    # learning rate
    lr = 0.01
    # epoch
    epoch = 50
    # checkpoints,模型保存路径
    checkpoints = 'checkpoints'
    os.makedirs(checkpoints, exist_ok=True)
    transform = transforms.Compose([
        transforms.ToTensor()
    ])

    # 训练0-9的数据
    labels = [0, 1, 2, 3, 4, 5, 6, 7, 8, 9]
    base_dir = 'mnist/train'
    imgs = []
    # 获取数据
    for label in labels:
        label_dir = os.path.join(base_dir, str(label))
        images = os.listdir(label_dir)
        for img in images:
            img_path = os.path.join(label_dir, img)
            imgs.append((img_path, label))
    print(len(imgs))
    # 将训练数据拆分成训练集和验证集
    trains, vals = split_data(imgs)
    # 加载训练集数据
    train_dataset = MnistData(trains, transform=transform)
    train_data = DataLoader(train_dataset, batch_size=bs, shuffle=True, num_workers=0)
    # 加载验证集数据
    val_dataset = MnistData(vals, transform=transform)
    val_data = DataLoader(val_dataset, batch_size=bs, shuffle=True, num_workers=0)
    train_data_size = train_dataset.__len__()
    val_data_size = val_dataset.__len__()
    print(train_dataset.__len__())
    print(val_dataset.__len__())

    # 加载模型
    model = CnnNet(classes=10)
    # GPU是否可用,如果可用,则使用GPU
    device = torch.device('cuda' if torch.cuda.is_available() else 'cpu')
    model.to(device)
    # 损失函数
    loss_func = nn.CrossEntropyLoss()
    # 优化器,使用SGD,可换Adam
    optimizer = torch.optim.SGD(model.parameters(), lr=lr, weight_decay=1e-3)
    # 训练
    train(model, loss_func, optimizer, checkpoints, epoch)

5. 识别数字的准确率

python 复制代码
import torch
import torch.nn as nn
from torchvision import transforms
from PIL import Image
import os
import json

# 手写数字预测+特征提取

# 预测类
class Pred:
    def __init__(self):
        self.labels = [0, 1, 2, 3, 4, 5, 6, 7, 8, 9]
        model_path = 'checkpoints/best_model.pth'
        # 加载模型
        self.model = torch.load(model_path)
        self.device = torch.device('gpu' if torch.cuda.is_available() else 'cpu')
        self.model = self.model.to(self.device)

    # 预测
    def predict(self, img_path):
        transform = transforms.Compose([
            transforms.ToTensor()
        ])
        img = Image.open(img_path).convert('RGB')
        img = transform(img)
        img = img.view(1, 3, 28, 28).to(self.device)
        output = self.model(img)
        output = torch.softmax(output, dim=1)
        # 每个预测值的概率
        probability = output.cpu().detach().numpy()[0]
        # 找出最大概率值的索引
        output = torch.argmax(output, dim=1)
        index = output.cpu().numpy()[0]
        # 预测结果
        pred = self.labels[index]
        return pred


if __name__ == '__main__':
    test_dir = 'mnist/test'
    pred = Pred()

    for label in range(10):
        test_dir_path = os.path.join(test_dir,str(label))
        total_img = 0
        correct_prediction = 0
        for file in os.listdir(test_dir_path):
            file_path = os.path.join(test_dir_path,file)

            correct_number = label
            pred_num = pred.predict(file_path)
            if pred_num == correct_number:
                correct_prediction += 1
            total_img += 1
        accuracy = (correct_prediction/total_img)*100
        print(f"Number {label} 's accuracy is {accuracy:3f}%")
    print("done")
相关推荐
蓝婷儿31 分钟前
Python 机器学习核心入门与实战进阶 Day 3 - 决策树 & 随机森林模型实战
人工智能·python·机器学习
大千AI助手34 分钟前
PageRank:互联网的马尔可夫链平衡态
人工智能·机器学习·贝叶斯·mc·pagerank·条件概率·马尔科夫链
我就是全世界1 小时前
TensorRT-LLM:大模型推理加速的核心技术与实践优势
人工智能·机器学习·性能优化·大模型·tensorrt-llm
.30-06Springfield1 小时前
决策树(Decision tree)算法详解(ID3、C4.5、CART)
人工智能·python·算法·决策树·机器学习
强哥之神3 小时前
英伟达发布 Llama Nemotron Nano 4B:专为边缘 AI 和科研任务优化的高效开源推理模型
人工智能·深度学习·语言模型·架构·llm·transformer·边缘计算
陈敬雷-充电了么-CEO兼CTO4 小时前
大模型技术原理 - 基于Transformer的预训练语言模型
人工智能·深度学习·语言模型·自然语言处理·chatgpt·aigc·transformer
旷世奇才李先生5 小时前
Pillow 安装使用教程
深度学习·microsoft·pillow
acstdm8 小时前
DAY 48 CBAM注意力
人工智能·深度学习·机器学习
澪-sl8 小时前
基于CNN的人脸关键点检测
人工智能·深度学习·神经网络·计算机视觉·cnn·视觉检测·卷积神经网络
羊小猪~~9 小时前
数据库学习笔记(十七)--触发器的使用
数据库·人工智能·后端·sql·深度学习·mysql·考研