Li Mu's "Dive into Deep Learning": Kaggle leaf classification with ResNet18 (no pretraining), a Python implementation

Preface

Before attempting this leaf-classification task, I had only finished the chapter on ResNet (residual networks) and had not yet read the later material on data augmentation. As a result, the first run, which trained a plain ResNet18 directly on the raw data, performed very poorly: the Kaggle submission scored only about 20% accuracy. This article still uses a hand-written ResNet18 without pretraining, but adds some data augmentation, which brings the Kaggle accuracy to about 80% with relatively few training epochs.

The dataset is the Classify Leaves dataset used by Li Mu in his Kaggle competition.

Link: https://www.kaggle.com/competitions/classify-leaves

1. Data preprocessing

First, read the training data from train.csv. Use sklearn's LabelEncoder and train_test_split to map the text labels to integer ids, and then hold out a random 10% as validation data (stratified by label).

python
import pandas as pd
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import LabelEncoder


df = pd.read_csv('./kaggle_classify_leaves_dataset/train.csv')

label_encoder = LabelEncoder()  # create the encoder that maps the text labels to integer ids
df['label'] = label_encoder.fit_transform(df['label'])
train_df, valid_df = train_test_split(df, test_size=0.1, stratify=df['label'])
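
LabelEncoder keeps the original class names, so the integer predictions can later be mapped back to species names with inverse_transform (this is exactly what the submission step does). As a small sketch, assuming train.csv has been loaded as above:

python
print(len(label_encoder.classes_))              # number of distinct species; should match the 176-way output layer used later
print(label_encoder.inverse_transform([0, 1]))  # the first two class names (LabelEncoder sorts the labels)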

2. The Dataset

The key part is the data augmentation inside the Dataset; whether or not augmentation is used makes a huge difference to the result. For the training images, this article applies random cropping, random flips, random brightness/contrast/color jitter, and normalization.

When I first saw this way of writing augmentation, I was puzzled that it does not literally enlarge the training set. It turns out that the transforms defined in ImageDataset are only applied when train_iter actually pulls a batch of data. In other words, even though every epoch iterates over the same images via train_iter, the augmentation operations are random, so different epochs effectively see different images. Simply increasing the number of epochs therefore increases the diversity of the images the model sees (the short check after the code below illustrates this).

python
class ImageDataset(Dataset):  # dataset that returns images and labels
    def __init__(self, dataframe, img_dir, mode=None):
        self.dataframe = dataframe
        self.img_dir = img_dir
        if mode == 'train':
            preprocess = transforms.Compose([
                        transforms.Resize(256),
                        transforms.RandomCrop(224),
                        transforms.RandomHorizontalFlip(),
                        transforms.RandomVerticalFlip(),
                        transforms.ColorJitter(brightness=0.2, contrast=0.2, saturation=0.2, hue=0.1),  # randomly jitter color
                        transforms.ToTensor(),
                        transforms.Normalize(mean=[0.485, 0.456, 0.406], std=[0.229, 0.224, 0.225])  # normalize with ImageNet statistics
                        ])
        elif mode in ('val', 'test'):  # validation and test share the same deterministic preprocessing
            preprocess = transforms.Compose([
                        transforms.Resize(256),
                        transforms.CenterCrop(224),  # center crop
                        transforms.ToTensor(),
                        transforms.Normalize(mean=[0.485, 0.456, 0.406], std=[0.229, 0.224, 0.225])
                        ])
        else:
            raise ValueError(f"unknown mode: {mode}")
        self.transform = preprocess
        self.mode = mode

    def __len__(self):
        return len(self.dataframe)

    def __getitem__(self, idx):
        img_name = os.path.join(self.img_dir, self.dataframe.iloc[idx, 0])  # build the image path
        img = Image.open(img_name)  # open the image
        if self.transform:
            img = self.transform(img)
        if self.mode == 'test':
            return img  # the test set has no labels

        label = self.dataframe.iloc[idx, 1]
        label = torch.tensor(label, dtype=torch.long)

        return img, label


train_data = ImageDataset(train_df, img_dir, 'train')
valid_data = ImageDataset(valid_df, img_dir, 'val')



# hyperparameters
batch_size, lr, num_epochs = 128, 0.001, 30
train_iter = DataLoader(train_data, batch_size=batch_size, shuffle=True)
valid_iter = DataLoader(valid_data, batch_size=batch_size, shuffle=True)  # shuffling the validation set is harmless but not required
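
To see the on-the-fly behaviour described earlier: reading the same sample twice returns two different tensors, because the random transforms are re-drawn on every access. A minimal sketch, assuming train_data has been built as above (and the imports from the complete code in the next section):

python
img_a, _ = train_data[0]
img_b, _ = train_data[0]
print(torch.allclose(img_a, img_b))  # usually False: the random crop, flips and ColorJitter are re-sampled each time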

3. Complete code

Since the model and training code follow what Li Mu teaches in the book, the complete code is given here directly.

python
import pandas as pd
import torch
from torch import nn
import os
from PIL import Image
import torchvision.transforms as transforms
from torch.utils.data import Dataset, DataLoader
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import LabelEncoder
from d2l import torch as d2l
from torch.nn import functional as F

class Residual(nn.Module):
    def __init__(self, in_channels, num_channels,
                 use_1x1conv=False, strides=1):
        super().__init__()
        self.conv1 = nn.Conv2d(in_channels, num_channels, 
                               kernel_size=3, padding=1, stride=strides)
        self.conv2 = nn.Conv2d(num_channels, num_channels, 
                               kernel_size=3, padding=1)
        
        if use_1x1conv:
            self.conv3 = nn.Conv2d(in_channels, num_channels, kernel_size=1, stride=strides)
        else:
            self.conv3 = None
            
        self.bn1 = nn.BatchNorm2d(num_channels)
        self.bn2 = nn.BatchNorm2d(num_channels)
        
    def forward(self, X):
        Y = F.relu(self.bn1(self.conv1(X)))
        Y = self.bn2(self.conv2(Y))
        if self.conv3:
            X = self.conv3(X)
        Y += X
        return F.relu(Y)
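
# Optional check (illustrative, not part of the original script): with use_1x1conv=True and
# strides=2 a Residual block halves the height/width while changing the channel count.
blk = Residual(3, 6, use_1x1conv=True, strides=2)
print(blk(torch.rand(4, 3, 6, 6)).shape)  # expected: torch.Size([4, 6, 3, 3])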
    

def resnet_block(in_channels, num_channels, num_residuals, first_block=False):
    blk = []
    for i in range(num_residuals):
        if i == 0 and not first_block:  # the first block of a stage (except the very first) halves h/w and changes channels
            blk.append(Residual(in_channels, num_channels, True, strides=2))
        else:
            blk.append(Residual(num_channels, num_channels))
            
    return blk

class ImageDataset(Dataset):  # dataset that returns images and labels
    def __init__(self, dataframe, img_dir, mode=None):
        self.dataframe = dataframe
        self.img_dir = img_dir
        if mode == 'train':
            preprocess = transforms.Compose([
                        transforms.Resize(256),
                        transforms.RandomCrop(224),
                        transforms.RandomHorizontalFlip(),
                        transforms.RandomVerticalFlip(),
                        transforms.ColorJitter(brightness=0.2, contrast=0.2, saturation=0.2, hue=0.1),  # randomly jitter color
                        transforms.ToTensor(),
                        transforms.Normalize(mean=[0.485, 0.456, 0.406], std=[0.229, 0.224, 0.225])  # normalize with ImageNet statistics
                        ])
        elif mode in ('val', 'test'):  # validation and test share the same deterministic preprocessing
            preprocess = transforms.Compose([
                        transforms.Resize(256),
                        transforms.CenterCrop(224),  # center crop
                        transforms.ToTensor(),
                        transforms.Normalize(mean=[0.485, 0.456, 0.406], std=[0.229, 0.224, 0.225])
                        ])
        else:
            raise ValueError(f"unknown mode: {mode}")
        self.transform = preprocess
        self.mode = mode

    def __len__(self):
        return len(self.dataframe)

    def __getitem__(self, idx):
        img_name = os.path.join(self.img_dir, self.dataframe.iloc[idx, 0])  # build the image path
        img = Image.open(img_name)  # open the image
        if self.transform:
            img = self.transform(img)
        if self.mode == 'test':
            return img  # the test set has no labels

        label = self.dataframe.iloc[idx, 1]
        label = torch.tensor(label, dtype=torch.long)

        return img, label
    

img_dir = './kaggle_classify_leaves_dataset/'
df = pd.read_csv('./kaggle_classify_leaves_dataset/train.csv')
label_encoder = LabelEncoder()  # create the encoder that maps the text labels to integer ids
df['label'] = label_encoder.fit_transform(df['label'])


train_df, valid_df = train_test_split(df, test_size=0.1, stratify=df['label'])

train_data = ImageDataset(train_df, img_dir, 'train')
valid_data = ImageDataset(valid_df, img_dir, 'val')



# hyperparameters
batch_size, lr, num_epochs = 128, 0.001, 30
train_iter = DataLoader(train_data, batch_size=batch_size, shuffle=True)
valid_iter = DataLoader(valid_data, batch_size=batch_size, shuffle=True)  # shuffling the validation set is harmless but not required


b1 = nn.Sequential( # stem: the output has 64 channels and the image height/width is halved twice
    nn.Conv2d(3, 64, kernel_size=7, stride=2, padding=3),
    nn.BatchNorm2d(64), nn.ReLU(), nn.MaxPool2d(kernel_size=3, stride=2, padding=1))

b2 = nn.Sequential(*resnet_block(64, 64, 2, True)) # the stem already reduced h/w to 1/4, so this stage does not downsample further
b3 = nn.Sequential(*resnet_block(64, 128, 2))
b4 = nn.Sequential(*resnet_block(128, 256, 2))
b5 = nn.Sequential(*resnet_block(256, 512, 2))

net = nn.Sequential(b1, b2, b3, b4, b5, nn.AdaptiveAvgPool2d((1,1)), # (1, 512, 1, 1); after Flatten it becomes (1, 512)
                    nn.Flatten(), nn.Linear(512, 176))
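
# Optional sanity check (added for illustration, not part of the original script): feed a dummy
# batch through each top-level block to see how the shape evolves; the final output should be
# torch.Size([1, 176]), one logit per leaf class.
X = torch.rand(size=(1, 3, 224, 224))
for layer in net:
    X = layer(X)
    print(layer.__class__.__name__, 'output shape:\t', X.shape)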

def evaluate_accuracy_gpu(net, data_iter, device=None):
    # evaluate the accuracy of a model on a dataset using the GPU
    if isinstance(net, nn.Module):
        net.eval()  # switch to evaluation mode
        if not device:
            device = next(iter(net.parameters())).device  # if no device was given, use the device of the net's parameters
    metric = d2l.Accumulator(2)
    
    with torch.no_grad():
        for X, y in data_iter:
            # move the batch to the evaluation device
            if isinstance(X, list):
                X = [x.to(device) for x in X]
            else:
                X = X.to(device)
            y = y.to(device)
            metric.add(d2l.accuracy(net(X), y), y.numel())
    return metric[0] / metric[1]


def train_ch6(net, train_iter, test_iter, num_epochs, lr, device):
     
    # initialize the weights of conv and fully connected layers with Xavier initialization
    def init_weights(m):
        if type(m) == nn.Linear or type(m) == nn.Conv2d:
            nn.init.xavier_uniform_(m.weight)
    net.apply(init_weights)
    net.to(device)  # move the network to the training device
    print('training on', device)
    optimizer = torch.optim.Adam(net.parameters(), lr=lr)
    loss = nn.CrossEntropyLoss()  # the default reduction='mean' averages the loss over the batch
        
    for epoch in range(num_epochs):
        metric = d2l.Accumulator(3)  # sum of loss, number of correct predictions, number of examples
        net.train()
        for i, (X, y) in enumerate(train_iter):
            optimizer.zero_grad()
            X ,y = X.to(device), y.to(device)
            y_hat = net(X)
            l = loss(y_hat, y)
            l.backward()
            optimizer.step()
            with torch.no_grad():
                metric.add(l*X.shape[0], d2l.accuracy(y_hat, y), X.shape[0])
        train_l = metric[0] / metric[2]
        train_acc = metric[1] / metric[2]
        test_acc = evaluate_accuracy_gpu(net, test_iter)
        print(f'epoch {epoch} finished, loss {train_l:.3f}, train acc {train_acc:.3f}, test acc {test_acc:.3f}')
        
    print(f'final loss {train_l:.3f}, train acc {train_acc:.3f}, '
          f'test acc {test_acc:.3f}')


# train
train_ch6(net, train_iter, valid_iter, num_epochs, lr, torch.device('cuda'))

4. Submission code

python
df_test = pd.read_csv('./kaggle_classify_leaves_dataset/test.csv')


test_data = ImageDataset(df_test, img_dir, 'test')
test_loader = DataLoader(test_data, batch_size=128, shuffle=False)
net.eval()
all_preds = []

with torch.no_grad():
    for imgs in test_loader:
        imgs = imgs.to(torch.device('cuda'))  # make sure the batch is on the GPU
        outputs = net(imgs)
        _, preds = torch.max(outputs, 1)  # take the class with the largest logit
        all_preds.extend(preds.cpu().numpy())  # collect the predictions

df_test['label'] = pd.Series(all_preds)
df_test['label'] = label_encoder.inverse_transform(df_test['label'])  # map integer ids back to species names
submission = pd.concat([df_test['image'], df_test['label']], axis=1)
submission.to_csv("./kaggle_classify_leaves_dataset/submission.csv", index=False)

5. Final results

Due to time constraints this article trains for only 30 epochs, which gives a validation accuracy close to 80%; around 100 epochs is recommended for better results.

Kaggle score after 30 epochs
