卷积神经网络实现手写数字识别

一、实验介绍

手写数字识别是一个典型的图像分类问题,在日常生活中已经被广泛地应用。本实验基于mnist数据集,通过搭建卷积神经网络完成手写数字识别模型的训练,并通过训练集与自制手写数字图片对模型性能进行评估。

二、设计过程

1. 环境搭建

  • 安装Python、Pytorch等必要的开发环境。

  • 创建一个新的Python项目,用于存放实验代码和数据。

2. 数据加载与处理

  • 从mnist数据集中下载训练和测试数据。

    Mnist是一个手写字体的数字图像数据集,包含数字0-9共计70000张图像,其中训练集60000张测试集10000张,每张图像的大小为28*28。数据集地址:http://yann.lecun.com/exdb/mnist/。下载来的数据是784维度的数组,并不是图像数据,所以首先我们要把下载来的数据转化成图像。本文采用torchvision内部封装好的数据集部分torchvision.datasets来下载mnist

    部分代码如下,代码文件:mnist_save.py

python 复制代码
import torch
import torchvision
import os
from torchvision import transforms
from torchvision.datasets import mnist
import torch.utils.data as data
import numpy as np
from PIL import Image
import matplotlib.pyplot as plt
import itertools

# 将mnist保存为图像
def save():
    os.makedirs('mnist/train', exist_ok=True)
    os.makedirs('mnist/test', exist_ok=True)
    for i in range(10):
        os.makedirs('mnist/train/' + str(i), exist_ok=True)
        os.makedirs('mnist/test/' + str(i), exist_ok=True)
    # 保存训练集
    for i, item in enumerate(train_loader):
        img, label = item
        img = img[0].cpu().numpy()
        array = (img.reshape((28, 28)) * 255).astype(np.uint8)
        img = Image.fromarray(array, 'L')
        label = label.cpu().numpy()[0]
        img_path = 'mnist/train/' + str(label) + '/' + str(i) + '.jpg'
        print(img_path)
        img.save(img_path)

    # 保存测试集
    for i, item in enumerate(test_loader):
        img, label = item
        img = img[0].cpu().numpy()
        array = (img.reshape((28, 28)) * 255).astype(np.uint8)
        img = Image.fromarray(array, 'L')
        label = label.cpu().numpy()[0]
        img_path = 'mnist/test/' + str(label) + '/' + str(i) + '.jpg'
        print(img_path)
        img.save(img_path)


# 查看部分mnist图像
def show():
    plt.figure(figsize=(16, 9))
    for i, item in enumerate(itertools.islice(train_loader,2,12)):
        plt.subplot(2, 5, i+1)
        img,label= item
        img = img[0].cpu().numpy()
        array = (img.reshape((28, 28)) * 255).astype(np.uint8)
        img = Image.fromarray(array, 'L')
        label = label.cpu().numpy()[0]
        plt.imshow(img, cmap=plt.get_cmap('gray'))
    plt.show()

if __name__ == '__main__':
    train_data = mnist.MNIST('mnist', train=True, transform=transforms.ToTensor(), download=True)
    test_data = mnist.MNIST('mnist', train=False, transform=transforms.ToTensor(), download=True)
    train_loader = data.DataLoader(dataset=train_data, batch_size=1, shuffle=True)
    test_loader = data.DataLoader(dataset=test_data, batch_size=1, shuffle=True)
    train_total = train_loader.__len__()
    test_total = test_loader.__len__()
    labels = train_data.targets
    print(train_data.targets)
    print(train_total, test_total)
    dataiter = iter(train_data)
    print(train_data)
    images, labs = dataiter.__next__()
    print(type(images), type(labs))
    print(images.shape, labs)
    save()
    show() 
  • 使用pytorch自带的dataset为基类,进行重构数据加载部分。
python 复制代码
class MnistData(Dataset):
    def __init__(self, data, transform=None, target_transform=None):
        super(Dataset, self).__init__()
        self.transform = transform
        self.target_transform = target_transform
        self.samples = data

    def __len__(self):
        return len(self.samples)

    def __getitem__(self, index):
        img_path, target = self.samples[index]
        img = Image.open(img_path).convert('RGB')
        if self.transform is not None:
            img = self.transform(img)
        if self.target_transform is not None:
            target = self.target_transform(target)

        return img, target

3. 构建网络模型

数据流向和代码实现如下:CNN.py

python 复制代码
class CnnNet(nn.Module):
    def __init__(self, classes=10):
        super(CnnNet, self).__init__()
        # 分类数
        self.classes = classes
        # 第一层卷积,输入:bs*3*28*28 输出:bs*16*14*14
        self.conv1 = nn.Sequential(
            # 卷积 输入:bs*3*28*28  输出:bs*16*28*28
            nn.Conv2d(in_channels=3, out_channels=16, kernel_size=3, stride=1, padding=1),
            # 归一化
            nn.BatchNorm2d(16),
            # 激活函数
            nn.ReLU(),
            # 最大池化:输入:bs*16*28*28  输出:bs*16*14*14
            nn.MaxPool2d(2)
        )
        # 第二层卷积,输入:bs*16*14*14 输出:bs*64*7*7
        self.conv2 = nn.Sequential(
            # 卷积 输入:bs*16*14*14  输出:bs*32*14*14
            nn.Conv2d(in_channels=16, out_channels=32, kernel_size=3, stride=1, padding=1),
            # 归一化
            nn.BatchNorm2d(32),
            # 激活函数
            nn.ReLU(),
            # 最大池化:输入:bs*16*14*14  输出:bs*32*7*7
            nn.MaxPool2d(2)
        )
        # 第三层卷积,输入:bs*32*7*7 输出:bs*64*3*3
        self.conv3 = nn.Sequential(
            # 卷积 输入:bs*32*7*7  输出:bs*64*3*3
            nn.Conv2d(in_channels=32, out_channels=64, kernel_size=3, stride=1, padding=1),
            # 归一化
            nn.BatchNorm2d(64),
            # 激活函数
            nn.ReLU(),
            # 最大池化:输入:bs*32*7*7 输出:bs*64*3*3
            nn.MaxPool2d(2)
        )
        # 自适应池化,将bs*64*3*3映射为bs*64*1*1
        self.advpool = nn.AdaptiveAvgPool2d((1, 1))
        # 全连接层
        self.fc = nn.Linear(64, self.classes)

    def forward(self, x):
        x = self.conv1(x)
        x = self.conv2(x)
        x = self.conv3(x)
        x = self.advpool(x)
        # 需要将多维度的值展平为一维,送入linear中,但是需要保持batchsize的维度
        # 例如2*64*1*1 变成2*64
        out = x.view(x.size(0), -1)
        out = self.fc(out)
        return out


if __name__ == '__main__':
    # 测试数据
    x = torch.rand((2, 3, 28, 28))
    cnn = CnnNet(classes=10)
    print(cnn)
    out = cnn(x)
    print(out)
    device = torch.device('cuda' if torch.cuda.is_available() else 'cpu')
    cnn = cnn.to(device)
    # 网络模型的数据流程及参数信息
    summary(cnn, (3, 28, 28))

4. 模型训练(可视化)

为了使模型达到较好效果,我们需要对传入的参数进行调整。

对epoch进行调参

EPOCH 8 18 36 50 81
loss 0.0019 0.0010 0.0009 0.0008 0.0007
train_acc 0.9861 0.9938 0.9951 0.9958 0.9965
val_acc 0.9855 0.9468 0.9897 0.9903 0.9915

选择epoch=50

对bs进行调参

BS 8 16 32 64 128
lose 0.0458 0.0325 0.0008 0.0007 0.0114
train_acc 0.9153 0.8879 0.9958 0.9969 0.7039
val_acc 0.9747 0.9595 0.9903 0.9880 0.8622

选择bs=64

对lr进行调参

LR 0.0001 0.0003 0.0005 0.001 0.01
loss 0.1531 0.0569 0.0311 0.0308 0.0004
train_acc 0.8887 0.9423 0.9517 0.9644 0.9969
val_acc 0.893 0.9454 0.9597 0.9648 0.9880

选择lr=0.01

python 复制代码
import torch
import torch.nn as nn
from torch.utils.data import DataLoader
from torchvision import transforms
import random
import os
from mnist_dataset import MnistData
from CNN import CnnNet
import time
os.environ["KMP_DUPLICATE_LIB_OK"] = "TRUE"
import matplotlib.pyplot as plt


# 将数据集划分训练集和验证集
def split_data(files):
    """
    :param files:
    :return:
    """
    random.shuffle(files)
    # 计算比例系数,分割数据训练集和验证集
    ratio = 0.9
    offset = int(len(files) * ratio)
    train_data = files[:offset]
    val_data = files[offset:]
    return train_data, val_data


# 训练
def train(model, loss_func, optimizer, checkpoints, epoch):
    print('Train......................')
    best_acc = 0
    best_epoch = 0
    train_losses = []
    train_accs = []
    val_losses = []
    val_accs = []

    for epoch in range(1, epoch + 1):
        # 设置计时器,计算每个epoch的用时
        start_time = time.time()
        model.train()  # 保证每一个batch都能进入model.train()的模式
        # 记录每个epoch的loss和acc
        train_loss, train_acc, val_loss, val_acc = 0, 0, 0, 0
        for i, (inputs, labels) in enumerate(train_data):
            # print(batch_size)
            # print(i, inputs, labels)
            inputs = inputs.to(device)
            labels = labels.to(device)
            # 预测输出
            outputs = model(inputs)
            # 计算损失
            loss = loss_func(outputs, labels)
            # print(outputs)
            # 因为梯度是累加的,需要清零
            optimizer.zero_grad()
            # 反向传播
            loss.backward()
            # 优化器
            optimizer.step()
            # 计算准确率
            output = nn.functional.softmax(outputs, dim=1)
            pred = torch.argmax(output, dim=1)
            acc = torch.sum(pred == labels)
            train_loss += loss.item()
            train_acc += acc.item()
        # 验证集进行验证
        model.eval()
        with torch.no_grad():
            for i, (inputs, labels) in enumerate(val_data):
                inputs = inputs.to(device)
                labels = labels.to(device)
                # 预测输出
                outputs = model(inputs)
                # 计算损失
                loss = loss_func(outputs, labels)
                # 计算准确率
                output = nn.functional.softmax(outputs, dim=1)
                pred = torch.argmax(output, dim=1)
                # print(pred,'================')
                # print(pred==labels,'=====----------======')
                acc = torch.sum(pred == labels)
                # acc = calculat_acc(outputs, labels)
                val_loss += loss.item()
                val_acc += acc.item()

                # 计算每个epoch的训练损失和精度
            train_loss_epoch = train_loss / train_data_size
            train_acc_epoch = train_acc / train_data_size
            # 计算每个epoch的验证集损失和精度
            val_loss_epoch = val_loss / val_data_size
            val_acc_epoch = val_acc / val_data_size
            end_time = time.time()
            print(
                'epoch:{} | time:{:.4f} | train_loss:{:.4f} | train_acc:{:.4f} | eval_loss:{:.4f} | val_acc:{:.4f}'.format(
                    epoch, end_time - start_time, train_loss_epoch, train_acc_epoch, val_loss_epoch, val_acc_epoch))

            # 记录损失和准确率
            train_losses.append(train_loss_epoch)
            train_accs.append(train_acc_epoch)
            val_losses.append(val_loss_epoch)
            val_accs.append(val_acc_epoch)
        print(
            'epoch:{} | time:{:.4f} | train_loss:{:.4f} | train_acc:{:.4f} | eval_loss:{:.4f} | val_acc:{:.4f}'.format(
                epoch,
                end_time - start_time,
                train_loss_epoch,
                train_acc_epoch,
                val_loss_epoch,
                val_acc_epoch))

        # 记录验证集上准确率最高的模型
        best_model_path = checkpoints + "/" + 'best_model' + '.pth'
        if val_acc_epoch >= best_acc:
            best_acc = val_acc_epoch
            best_epoch = epoch
            torch.save(model, best_model_path)
        print('Best Accuracy for Validation :{:.4f} at epoch {:d}'.format(best_acc, best_epoch))
        # 每迭代50次保存一次模型
        # if epoch % 50 == 0:
        #     model_name = '/epoch_' + str(epoch) + '.pt'
        #     torch.save(model, checkpoints + model_name)
    # 保存最后的模型
    torch.save(model, checkpoints + '/last.pt')

 # 绘制损失和准确率曲线
    plt.figure(figsize=(12, 5))
    plt.subplot(1, 2, 1)
    plt.plot(train_losses, label='Train Loss')
    plt.plot(val_losses, label='Validation Loss')
    plt.title('Loss per Epoch')
    plt.xlabel('Epoch')
    plt.ylabel('Loss')
    plt.legend()

    plt.subplot(1, 2, 2)
    plt.plot(train_accs, label='Train Accuracy')
    plt.plot(val_accs, label='Validation Accuracy')
    plt.title('Accuracy per Epoch')
    plt.xlabel('Epoch')
    plt.ylabel('Accuracy')
    plt.legend()

    plt.tight_layout()
    plt.show()


if __name__ == '__main__':
    # batchsize
    bs = 32
    # learning rate
    lr = 0.01
    # epoch
    epoch = 50
    # checkpoints,模型保存路径
    checkpoints = 'checkpoints'
    os.makedirs(checkpoints, exist_ok=True)
    transform = transforms.Compose([
        transforms.ToTensor()
    ])

    # 训练0-9的数据
    labels = [0, 1, 2, 3, 4, 5, 6, 7, 8, 9]
    base_dir = 'mnist/train'
    imgs = []
    # 获取数据
    for label in labels:
        label_dir = os.path.join(base_dir, str(label))
        images = os.listdir(label_dir)
        for img in images:
            img_path = os.path.join(label_dir, img)
            imgs.append((img_path, label))
    print(len(imgs))
    # 将训练数据拆分成训练集和验证集
    trains, vals = split_data(imgs)
    # 加载训练集数据
    train_dataset = MnistData(trains, transform=transform)
    train_data = DataLoader(train_dataset, batch_size=bs, shuffle=True, num_workers=0)
    # 加载验证集数据
    val_dataset = MnistData(vals, transform=transform)
    val_data = DataLoader(val_dataset, batch_size=bs, shuffle=True, num_workers=0)
    train_data_size = train_dataset.__len__()
    val_data_size = val_dataset.__len__()
    print(train_dataset.__len__())
    print(val_dataset.__len__())

    # 加载模型
    model = CnnNet(classes=10)
    # GPU是否可用,如果可用,则使用GPU
    device = torch.device('cuda' if torch.cuda.is_available() else 'cpu')
    model.to(device)
    # 损失函数
    loss_func = nn.CrossEntropyLoss()
    # 优化器,使用SGD,可换Adam
    optimizer = torch.optim.SGD(model.parameters(), lr=lr, weight_decay=1e-3)
    # 训练
    train(model, loss_func, optimizer, checkpoints, epoch)

5. 识别数字的准确率

python 复制代码
import torch
import torch.nn as nn
from torchvision import transforms
from PIL import Image
import os
import json

# 手写数字预测+特征提取

# 预测类
class Pred:
    def __init__(self):
        self.labels = [0, 1, 2, 3, 4, 5, 6, 7, 8, 9]
        model_path = 'checkpoints/best_model.pth'
        # 加载模型
        self.model = torch.load(model_path)
        self.device = torch.device('gpu' if torch.cuda.is_available() else 'cpu')
        self.model = self.model.to(self.device)

    # 预测
    def predict(self, img_path):
        transform = transforms.Compose([
            transforms.ToTensor()
        ])
        img = Image.open(img_path).convert('RGB')
        img = transform(img)
        img = img.view(1, 3, 28, 28).to(self.device)
        output = self.model(img)
        output = torch.softmax(output, dim=1)
        # 每个预测值的概率
        probability = output.cpu().detach().numpy()[0]
        # 找出最大概率值的索引
        output = torch.argmax(output, dim=1)
        index = output.cpu().numpy()[0]
        # 预测结果
        pred = self.labels[index]
        return pred


if __name__ == '__main__':
    test_dir = 'mnist/test'
    pred = Pred()

    for label in range(10):
        test_dir_path = os.path.join(test_dir,str(label))
        total_img = 0
        correct_prediction = 0
        for file in os.listdir(test_dir_path):
            file_path = os.path.join(test_dir_path,file)

            correct_number = label
            pred_num = pred.predict(file_path)
            if pred_num == correct_number:
                correct_prediction += 1
            total_img += 1
        accuracy = (correct_prediction/total_img)*100
        print(f"Number {label} 's accuracy is {accuracy:3f}%")
    print("done")
相关推荐
NLP小讲堂38 分钟前
LLaMA Factory 深度调参
人工智能·机器学习
不懂嵌入式44 分钟前
基于深度学习的水果识别系统设计
人工智能·深度学习
Alessio Micheli1 小时前
基于几何布朗运动的股价预测模型构建与分析
线性代数·机器学习·概率论
vlln1 小时前
适应性神经树:当深度学习遇上决策树的“生长法则”
人工智能·深度学习·算法·决策树·机器学习
奋斗者1号1 小时前
机器学习之决策树与决策森林:机器学习中的强大工具
人工智能·决策树·机器学习
xiangzhihong82 小时前
LegoGPT,卡内基梅隆大学推出的乐高积木设计模型
机器学习
小L爱科研2 小时前
4.7/Q1,GBD数据库最新文章解读
数据库·机器学习·数据分析·回归·健康医疗
zkmall2 小时前
推荐算法工程化:ZKmall模板商城的B2C 商城的用户分层推荐策略
算法·机器学习·推荐算法
龙湾开发3 小时前
轻量级高性能推理引擎MNN 学习笔记 02.MNN主要API
人工智能·笔记·学习·机器学习·mnn
CopyLower3 小时前
Java与AI技术结合:从机器学习到生成式AI的实践
java·人工智能·机器学习