Li Mu's Dive into Deep Learning: Kaggle Leaf Classification with ResNet18 (No Pretraining), a Python Implementation

Preface

Before attempting this leaf-classification task, the author had only finished the chapter on ResNet and residual networks and had not yet read the later material on data augmentation. As a result, the first attempt, which simply trained a plain ResNet18 on the raw training data, performed very poorly: the Kaggle submission scored only about 20% accuracy. This article still uses a hand-written ResNet18 without any pretraining, but adds some data augmentation, and with a relatively small number of epochs reaches about 80% accuracy on Kaggle.

The dataset is the Classify Leaves dataset that Li Mu uses on Kaggle.

Link: https://www.kaggle.com/competitions/classify-leaves

1. Data Processing

First, read in the training data from train.csv, use sklearn's LabelEncoder and train_test_split to map the text labels to integer ids, and then randomly hold out 10% of the rows as validation data.

python
import pandas as pd
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import LabelEncoder


df = pd.read_csv('./kaggle_classify_leaves_dataset/train.csv')

label_encoder = LabelEncoder()  # encoder that maps the text labels to integer ids
df['label'] = label_encoder.fit_transform(df['label'])
train_df, valid_df = train_test_split(df, test_size=0.1, stratify=df['label'])
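
As a quick optional check, the number of classes produced by the encoder should match the output dimension of the final fully connected layer defined later (176 for this dataset):

python
print(len(label_encoder.classes_))   # 176 distinct leaf species
print(len(train_df), len(valid_df))  # roughly a 90/10 split of the training rows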

2. Dataset

The key part of the Dataset is data augmentation; whether or not augmentation is used makes a very large difference to the result. For the training images, this article applies random crops, random flips, random color jitter (brightness, contrast, saturation, hue), and normalization.

When I first saw data augmentation written this way, I wondered why it does not actually enlarge the training set. The answer is that the augmentation operations defined in ImageDataset are only applied when train_iter actually pulls the data out batch by batch. So although every epoch uses the same train_iter over the same set of images, the augmentation operations are random, and the images seen in different epochs are therefore effectively different. Simply increasing the number of epochs thus increases the diversity of the images the model trains on; a small check of this appears right after the DataLoader code below.

python
import os

import torch
from PIL import Image
from torch.utils.data import Dataset, DataLoader
import torchvision.transforms as transforms


class ImageDataset(Dataset):  # Dataset that returns an image and, except in test mode, its label
    def __init__(self, dataframe, img_dir, mode=None):
        self.dataframe = dataframe
        self.img_dir = img_dir
        if mode == 'train':
            preprocess = transforms.Compose([
                        transforms.Resize(256),
                        transforms.RandomCrop(224),
                        transforms.RandomHorizontalFlip(),
                        transforms.RandomVerticalFlip(),
                        transforms.ColorJitter(brightness=0.2, contrast=0.2, saturation=0.2, hue=0.1),  # random color jitter
                        transforms.ToTensor(),
                        transforms.Normalize(mean=[0.485, 0.456, 0.406], std=[0.229, 0.224, 0.225])  # normalize with ImageNet statistics
                        ])
        else:  # 'val' and 'test' share the same deterministic transform
            preprocess = transforms.Compose([
                        transforms.Resize(256),
                        transforms.CenterCrop(224),  # center crop
                        transforms.ToTensor(),
                        transforms.Normalize(mean=[0.485, 0.456, 0.406], std=[0.229, 0.224, 0.225])
                        ])
        self.transform = preprocess
        self.mode = mode

    def __len__(self):
        return len(self.dataframe)

    def __getitem__(self, idx):
        img_name = os.path.join(self.img_dir, self.dataframe.iloc[idx, 0])  # build the image path
        img = Image.open(img_name)  # open the image
        if self.transform:
            img = self.transform(img)
        if self.mode == 'test':
            return img

        label = self.dataframe.iloc[idx, 1]
        label = torch.tensor(label, dtype=torch.long)

        return img, label


img_dir = './kaggle_classify_leaves_dataset/'  # dataset root; image paths in the csv are relative to it
train_data = ImageDataset(train_df, img_dir, 'train')
valid_data = ImageDataset(valid_df, img_dir, 'val')



# hyperparameters
batch_size, lr, num_epochs = 128, 0.001, 30
train_iter = DataLoader(train_data, batch_size=batch_size, shuffle=True) 
valid_iter = DataLoader(valid_data, batch_size=batch_size, shuffle=True)
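
To see that the augmentation really happens at load time rather than when the Dataset is constructed, fetch the same index twice and compare the tensors. This is only a small illustrative check, assuming the code above has already been run:

python
img_a, _ = train_data[0]
img_b, _ = train_data[0]
print(torch.equal(img_a, img_b))  # almost always False: each fetch applies a fresh random crop/flip/jitter

img_c, _ = valid_data[0]
img_d, _ = valid_data[0]
print(torch.equal(img_c, img_d))  # True: the validation transform is deterministic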

3. Complete Code

Since the model and training parts that follow use exactly what Li Mu teaches in the book, the complete code is given here directly.

python
import pandas as pd
import torch
from torch import nn
import os
from PIL import Image
import torchvision.transforms as transforms
from torch.utils.data import Dataset, DataLoader
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import LabelEncoder
from d2l import torch as d2l
from torch.nn import functional as F

class Residual(nn.Module):  # residual block: two 3x3 convolutions plus an optional 1x1 convolution on the shortcut
    def __init__(self, in_channels, num_channels,
                 use_1x1conv=False, strides=1):
        super().__init__()
        self.conv1 = nn.Conv2d(in_channels, num_channels, 
                               kernel_size=3, padding=1, stride=strides)
        self.conv2 = nn.Conv2d(num_channels, num_channels, 
                               kernel_size=3, padding=1)
        
        if use_1x1conv:
            self.conv3 = nn.Conv2d(in_channels, num_channels, kernel_size=1, stride=strides)
        else:
            self.conv3 = None
            
        self.bn1 = nn.BatchNorm2d(num_channels)
        self.bn2 = nn.BatchNorm2d(num_channels)
        
    def forward(self, X):
        Y = F.relu(self.bn1(self.conv1(X)))
        Y = self.bn2(self.conv2(Y))
        if self.conv3:
            X = self.conv3(X)
        Y += X
        return F.relu(Y)
    

def resnet_block(in_channels, num_channels, num_residuals, first_block=False):
    blk = []
    for i in range(num_residuals):
        if i == 0 and not first_block:  # first block of a stage downsamples with stride 2 and changes the channel count
            blk.append(Residual(in_channels, num_channels, True, strides=2))
        else:
            blk.append(Residual(num_channels, num_channels))
            
    return blk

class ImageDataset(Dataset):  # Dataset that returns an image and, except in test mode, its label
    def __init__(self, dataframe, img_dir, mode=None):
        self.dataframe = dataframe
        self.img_dir = img_dir
        if mode == 'train':
            preprocess = transforms.Compose([
                        transforms.Resize(256),
                        transforms.RandomCrop(224),
                        transforms.RandomHorizontalFlip(),
                        transforms.RandomVerticalFlip(),
                        transforms.ColorJitter(brightness=0.2, contrast=0.2, saturation=0.2, hue=0.1),  # random color jitter
                        transforms.ToTensor(),
                        transforms.Normalize(mean=[0.485, 0.456, 0.406], std=[0.229, 0.224, 0.225])  # normalize with ImageNet statistics
                        ])
        else:  # 'val' and 'test' share the same deterministic transform
            preprocess = transforms.Compose([
                        transforms.Resize(256),
                        transforms.CenterCrop(224),  # center crop
                        transforms.ToTensor(),
                        transforms.Normalize(mean=[0.485, 0.456, 0.406], std=[0.229, 0.224, 0.225])
                        ])
        self.transform = preprocess
        self.mode = mode

    def __len__(self):
        return len(self.dataframe)

    def __getitem__(self, idx):
        img_name = os.path.join(self.img_dir, self.dataframe.iloc[idx, 0])  # build the image path
        img = Image.open(img_name)  # open the image
        if self.transform:
            img = self.transform(img)
        if self.mode == 'test':
            return img

        label = self.dataframe.iloc[idx, 1]
        label = torch.tensor(label, dtype=torch.long)

        return img, label
    

img_dir = './kaggle_classify_leaves_dataset/'
df = pd.read_csv('./kaggle_classify_leaves_dataset/train.csv')
label_encoder = LabelEncoder()  # encoder that maps the text labels to integer ids
df['label'] = label_encoder.fit_transform(df['label'])


train_df, valid_df = train_test_split(df, test_size=0.1, stratify=df['label'])

train_data = ImageDataset(train_df, img_dir, 'train')
valid_data = ImageDataset(valid_df, img_dir, 'val')



# hyperparameters
batch_size, lr, num_epochs = 128, 0.001, 30
train_iter = DataLoader(train_data, batch_size=batch_size, shuffle=True) 
valid_iter = DataLoader(valid_data, batch_size=batch_size, shuffle=True) 


b1 = nn.Sequential( # stem: output has 64 channels; height and width are halved twice
    nn.Conv2d(3, 64, kernel_size=7, stride=2, padding=3),
    nn.BatchNorm2d(64), nn.ReLU(), nn.MaxPool2d(kernel_size=3, stride=2, padding=1))

b2 = nn.Sequential(*resnet_block(64, 64, 2, True)) # the stem already reduced H and W to 1/4, so this stage does not downsample further
b3 = nn.Sequential(*resnet_block(64, 128, 2))
b4 = nn.Sequential(*resnet_block(128, 256, 2))
b5 = nn.Sequential(*resnet_block(256, 512, 2))

net = nn.Sequential(b1, b2, b3, b4, b5, nn.AdaptiveAvgPool2d((1,1)), # (1,512,1,1); after Flatten it becomes (1,512)
                    nn.Flatten(), nn.Linear(512, 176))
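
# Optional sanity check: push a dummy batch through the network and print the
# output shape of every top-level block (with a 224x224 input the feature map
# shrinks to 7x7 before global average pooling).
X = torch.rand(1, 3, 224, 224)
for layer in net:
    X = layer(X)
    print(layer.__class__.__name__, 'output shape:', X.shape)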

def evaluate_accuracy_gpu(net, data_iter, device=None):
    # evaluate the accuracy of the model on a dataset using the GPU
    if isinstance(net, nn.Module):
        net.eval() # switch to evaluation mode
        if not device: # if no device was specified,
            device = next(iter(net.parameters())).device # use the device of the net's first parameter
    metric = d2l.Accumulator(2)  # number of correct predictions, number of predictions

    with torch.no_grad():
        for X, y in data_iter:
            # move the data to the device before evaluating
            if isinstance(X, list):
                X = [x.to(device) for x in X]
            else:
                X = X.to(device)
            y = y.to(device)
            metric.add(d2l.accuracy(net(X), y), y.numel())
    return metric[0] / metric[1]


def train_ch6(net, train_iter, test_iter, num_epochs, lr, device):
     
    # initialize the weights of convolutional and fully connected layers with Xavier initialization
    def init_weights(m): 
        if type(m) == nn.Linear or type(m) == nn.Conv2d:
            nn.init.xavier_uniform_(m.weight)
    net.apply(init_weights)
    net.to(device) # move the net to the device as well
    print('training on ',device)
    optimizer = torch.optim.Adam(net.parameters(), lr=lr)
    loss = nn.CrossEntropyLoss() # the default reduction='mean' returns a scalar loss averaged over the batch
        
    for epoch in range(num_epochs):
        metric = d2l.Accumulator(3)  # sum of training loss, sum of correct predictions, number of examples
        net.train()
        for i, (X, y) in enumerate(train_iter):
            optimizer.zero_grad()
            X ,y = X.to(device), y.to(device)
            y_hat = net(X)
            l = loss(y_hat, y)
            l.backward()
            optimizer.step()
            with torch.no_grad():
                metric.add(l*X.shape[0], d2l.accuracy(y_hat, y), X.shape[0])
        train_l = metric[0] / metric[2]
        train_acc = metric[1] / metric[2]
        test_acc = evaluate_accuracy_gpu(net, test_iter)
        print(f'epoch {epoch} finished, loss {train_l:.3f}, train acc {train_acc:.3f}, test acc {test_acc:.3f}')
        
    print(f'final loss {train_l:.3f}, train acc {train_acc:.3f}, '
          f'test acc {test_acc:.3f}')


# train the model
train_ch6(net, train_iter, valid_iter, num_epochs, lr, torch.device('cuda'))

4. Submission Code

python
df_test = pd.read_csv('./kaggle_classify_leaves_dataset/test.csv')


test_data = ImageDataset(df_test, img_dir, 'test')
test_loader = DataLoader(test_data, batch_size=128, shuffle=False)
net.eval()
all_preds = []

with torch.no_grad():
    for imgs in test_loader:
        imgs = imgs.to(torch.device('cuda'))  # make sure the batch is on the GPU
        outputs = net(imgs)
        _, preds = torch.max(outputs, 1)  # take the class with the highest score
        all_preds.extend(preds.cpu().numpy())  # collect the predictions into a list

df_test['label'] = pd.Series(all_preds)
df_test['label'] = label_encoder.inverse_transform(df_test['label'])
submission = pd.concat([df_test['image'], df_test['label']], axis = 1)
submission.to_csv("./kaggle_classify_leaves_dataset/submission.csv", index=False)

5. Final Results

Due to time constraints this article trains for only 30 epochs, which gives a validation accuracy close to 80%; around 100 epochs is recommended for better results.

Kaggle result after 30 epochs
