使用 PaddlePaddle 框架构建 ViT 用于 CIFAR-10 图像分类
硬件环境:GPU (1 * NVIDIA T4)
运行时间:一个 epoch 大概一分钟
import paddle
import time
import paddle.nn as nn
import paddle.nn.functional as F
import paddle.vision.transforms as transforms
from paddle.io import DataLoader
import numpy as np
import paddle.optimizer.lr as lrScheduler
from paddle.vision.transforms import BaseTransform
import math
from tqdm import tqdm
# Fix RNG seeds for reproducibility (paddle ops and numpy-driven augmentation).
paddle.seed(1024)
np.random.seed(1234)
# Run on GPU (the script assumes a CUDA device is available).
paddle.set_device('gpu')
# AutoTransforms: random data augmentation -- each call applies exactly one
# transform chosen uniformly at random from a user-supplied list.
class AutoTransforms(BaseTransform):
    """Wrapper transform that picks a single random member transform per image."""

    def __init__(self, transforms=None, keys=None):
        super(AutoTransforms, self).__init__(keys)
        self.transforms = transforms

    def _apply_image(self, image):
        # No candidates configured -> identity.
        if self.transforms is None:
            return image
        idx = np.random.randint(0, len(self.transforms))
        return self.transforms[idx](image)
# Per-channel normalization statistics.
# NOTE(review): these are the commonly quoted CIFAR-100 statistics; CIFAR-10's
# are usually ~[0.4914, 0.4822, 0.4465] -- confirm intended.
mean = [0.5071, 0.4867, 0.4408]
std = [0.2675, 0.2565, 0.2761]
# Candidate augmentations: AutoTransforms applies ONE of these per image.
transforms_list= [
    transforms.BrightnessTransform(0.5),  # brightness jitter
    transforms.SaturationTransform(0.5),  # saturation jitter
    transforms.ContrastTransform(0.5),    # contrast jitter
    transforms.HueTransform(0.5),         # hue jitter
    transforms.RandomRotation(15,
                              expand=True,
                              fill=128),  # random rotation (canvas expands, padded with 128)
    transforms.ColorJitter(0.5,0.5,0.5,0.5),
    transforms.Grayscale(3)               # grayscale replicated to 3 channels
]
# Training pipeline: flips + one random augmentation, crop back to 32x32
# (rotation with expand=True can enlarge the image), then normalize.
train_tx = transforms.Compose([
    transforms.RandomHorizontalFlip(),
    AutoTransforms(transforms_list),
    transforms.RandomCrop(32),
    transforms.RandomVerticalFlip(),
    transforms.Transpose(),            # HWC -> CHW
    transforms.Normalize(0.0, 255.0),  # scale pixel values to [0, 1]
    transforms.Normalize(mean, std)    # standardize per channel
])
# Evaluation pipeline: layout change and normalization only, no augmentation.
val_tx = transforms.Compose([
    transforms.Transpose(),
    transforms.Normalize(0.0, 255.0),
    transforms.Normalize(mean, std)
])
# Load CIFAR-10: 50,000 training and 10,000 test images.
cifar10_train = paddle.vision.datasets.Cifar10(mode='train', transform=train_tx, download=True)
cifar10_test = paddle.vision.datasets.Cifar10(mode='test', transform=val_tx, download=True)
print('训练集数量:', len(cifar10_train), '训练集图像尺寸', cifar10_train[0][0].shape)
print('测试集数量:', len(cifar10_test), '测试集图像尺寸', cifar10_test[0][0].shape)
def anti_normalize(image):
    """Undo the normalization for visualization: CHW tensor -> HWC image."""
    tensor = paddle.to_tensor(image)
    # Broadcast the per-channel statistics over the full 3x32x32 image.
    ch_mean = paddle.to_tensor(mean).reshape([3, 1, 1]).expand([3, 32, 32])
    ch_std = paddle.to_tensor(std).reshape([3, 1, 1]).expand([3, 32, 32])
    restored = tensor * ch_std + ch_mean
    return restored.transpose([1, 2, 0])
# The ViT network is composed of Patches, an MLP, multi-head self-attention,
# and Transformer encoder blocks.
# Patches splits the whole image into small tiles so that it can later be
# encoded as a sequence of tokens.
class Patches(paddle.nn.Layer):
    """Split images into flattened, non-overlapping patches via unfold."""

    def __init__(self, patch_size):
        super(Patches, self).__init__()
        self.patch_size = patch_size

    def forward(self, images):
        # unfold with stride == kernel size yields non-overlapping patches:
        # [B, C*ps*ps, num_patches] -> transpose to [B, num_patches, C*ps*ps].
        cols = F.unfold(images, self.patch_size, self.patch_size)
        return cols.transpose([0, 2, 1])
# Feed-forward block: Linear -> GELU -> Dropout -> Linear -> Dropout.
# Expands `feats` to `mlp_hidden` and projects back, so input and output
# feature dimensions are identical.
class Mlp(nn.Layer):
    """Position-wise MLP used inside each Transformer encoder block."""

    def __init__(self, feats, mlp_hidden, dropout=0.1):
        super().__init__()
        self.fc1 = nn.Linear(feats, mlp_hidden)
        self.fc2 = nn.Linear(mlp_hidden, feats)
        self.act = nn.GELU()
        self.dropout = nn.Dropout(dropout)

    def forward(self, x):
        hidden = self.dropout(self.act(self.fc1(x)))
        return self.dropout(self.fc2(hidden))
# Multi-head self-attention.
class MultiHeadSelfAttention(nn.Layer):
    """Multi-head self-attention over a token sequence.

    Args:
        feats: total embedding dimension; must be divisible by `head`.
        head: number of attention heads.
        dropout: dropout applied to the output projection.
        attn_dropout: dropout applied to the attention weights.

    NOTE(review): attention scores are scaled by sqrt(feats) (the full model
    width), not sqrt(feats / head) as in the original Transformer -- confirm
    this is intended.
    """
    def __init__(self, feats, head=8, dropout=0., attn_dropout=0.0):
        super(MultiHeadSelfAttention, self).__init__()
        self.head = head
        self.feats = feats
        self.sqrt_d = self.feats ** 0.5
        # One projection producing Q, K and V concatenated along features.
        self.qkv = nn.Linear(feats,
                             feats * 3)
        self.out = nn.Linear(feats, feats)
        self.dropout = nn.Dropout(dropout)
        self.attn_dropout = nn.Dropout(attn_dropout)

    def transpose_multi_head(self, x):
        # [b, n, feats] -> [b, head, n, feats // head]
        new_shape = x.shape[:-1] + [self.head, self.feats//self.head]
        x = x.reshape(new_shape)
        x = x.transpose([0, 2, 1, 3])
        return x

    def forward(self, x):
        b, n, f = x.shape
        qkv = self.qkv(x).chunk(3, -1)
        q, k, v = map(self.transpose_multi_head, qkv)
        # Scaled dot-product attention weights: [b, head, n, n].
        attn = F.softmax(paddle.einsum("bhif, bhjf->bhij", q, k) / self.sqrt_d, axis=-1)
        attn = self.attn_dropout(attn)
        # Weighted sum of values; the output order "bihf" already places the
        # sequence axis before the head axis, so no extra transpose is needed.
        attn = paddle.einsum("bhij, bhjf->bihf", attn, v)
        # flatten(2) merges (head, feats//head) back into feats, then project.
        out = self.dropout(self.out(attn.flatten(2)))
        return out
# One Transformer encoder block: LayerNorm, multi-head self-attention and an
# MLP, each wrapped in a residual connection (pre-norm layout).
class TransformerEncoder(nn.Layer):
    """Pre-norm Transformer encoder: x + MSA(LN(x)), then out + MLP(LN(out)).

    Args:
        feats: token embedding dimension.
        mlp_hidden: hidden dimension of the feed-forward sub-block.
        head: number of attention heads.
        dropout: dropout rate for the MSA output projection and the MLP.
        attn_dropout: dropout rate on the attention weights.
    """
    def __init__(self, feats, mlp_hidden, head=8, dropout=0., attn_dropout=0.):
        super(TransformerEncoder, self).__init__()
        self.layer1 = nn.LayerNorm(feats)
        self.msa = MultiHeadSelfAttention(feats, head=head, dropout=dropout, attn_dropout=attn_dropout)
        self.layer2 = nn.LayerNorm(feats)
        # BUG FIX: `dropout` was previously not forwarded to the MLP, so the
        # MLP silently used its own default (0.1) whatever was configured.
        self.mlp = Mlp(feats, mlp_hidden, dropout=dropout)

    def forward(self, x):
        out = self.msa(self.layer1(x)) + x      # attention + residual
        out = self.mlp(self.layer2(out)) + out  # feed-forward + residual
        return out
# Full ViT model: Patches + linear embedding + TransformerEncoder stack +
# classification head.
class ViT(nn.Layer):
    """Vision Transformer for small images.

    Args:
        in_c: number of input channels.
        num_classes: number of output classes.
        img_size: input image side length.
        patch: number of patches per side (NOT the patch pixel size); each
            patch is (img_size // patch) pixels square.
        dropout / attn_dropout: dropout rates for the MSA output / attention maps.
        num_layers: number of TransformerEncoder blocks.
        hidden: token embedding dimension.
        mlp_hidden: hidden dimension of each encoder's MLP.
        head: number of attention heads.
        is_cls_token: prepend a learnable [CLS] token used for classification.
    """
    def __init__(self, in_c=3, num_classes=10, img_size=32, patch=8, dropout=0., attn_dropout=0.0, num_layers=7, hidden=384, mlp_hidden=384*4, head=8, is_cls_token=True):
        super(ViT, self).__init__()
        self.patch = patch
        self.is_cls_token = is_cls_token
        # Pixel side length of each square patch.
        self.patch_size = img_size // self.patch
        self.patches = Patches(self.patch_size)
        # Flattened feature size of one patch (3 channels hard-coded here;
        # NOTE(review): `in_c` is not used -- confirm 3-channel input assumption).
        f = (img_size // self.patch) ** 2 * 3
        num_tokens = (self.patch ** 2) + 1 if self.is_cls_token else (self.patch ** 2)
        # Linear patch embedding into the hidden dimension.
        self.emb = nn.Linear(f, hidden)
        # Learnable classification token, randomly initialized.
        self.cls_token = paddle.create_parameter(
            shape=[1, 1, hidden],
            dtype='float32',
            default_initializer=nn.initializer.Assign(paddle.randn([1, 1, hidden]))
        ) if is_cls_token else None
        # Learnable positional embedding for every token (including [CLS]).
        self.pos_embedding = paddle.create_parameter(
            shape=[1, num_tokens, hidden],
            dtype='float32',
            default_initializer=nn.initializer.Assign(paddle.randn([1, num_tokens, hidden]))
        )
        encoder_list = [TransformerEncoder(hidden, mlp_hidden=mlp_hidden, dropout=dropout, attn_dropout=attn_dropout, head=head) for _ in range(num_layers)]
        self.encoder = nn.Sequential(*encoder_list)
        # Classification head applied to the [CLS] token (or the token mean).
        self.fc = nn.Sequential(
            nn.LayerNorm(hidden),
            nn.Linear(hidden, num_classes)
        )

    def forward(self, x):
        # Split into patches and embed: [b, num_patches, hidden].
        out = self.patches(x)
        out = self.emb(out)
        if self.is_cls_token:
            # Prepend the [CLS] token, tiled across the batch.
            out = paddle.concat([self.cls_token.tile([out.shape[0], 1, 1]), out], axis=1)
        out = out + self.pos_embedding
        out = self.encoder(out)
        # Classify from the [CLS] token, or from the mean over all tokens.
        if self.is_cls_token:
            out = out[:, 0]
        else:
            out = out.mean(1)
        out = self.fc(out)
        return out
# Loss and LR schedule: label-smoothing cross-entropy, plus LinearWarmup +
# CosineAnnealingDecay for a warmup-then-cosine learning-rate schedule.
# Label smoothing is a regularizer that softens one-hot targets to improve
# generalization.
class LabelSmoothingCrossEntropyLoss(nn.Layer):
    """Cross-entropy with label smoothing.

    Args:
        classes: number of classes.
        smoothing: probability mass spread uniformly over the wrong classes.
        dim: axis of the class logits.
    """
    def __init__(self, classes, smoothing=0.0, dim=-1):
        super(LabelSmoothingCrossEntropyLoss, self).__init__()
        self.confidence = 1.0 - smoothing  # mass kept on the true class
        self.smoothing = smoothing
        self.cls = classes
        self.dim = dim

    def forward(self, pred, target):
        pred = F.log_softmax(pred, axis=self.dim)
        with paddle.no_grad():
            # Build the smoothed target distribution: smoothing/(C-1) on every
            # class, then `confidence` written at the true class index.
            true_dist = paddle.ones_like(pred)
            true_dist.fill_(self.smoothing / (self.cls - 1))
            true_dist.put_along_axis_(target.unsqueeze(1), self.confidence, 1)
        # Mean over the batch of the smoothed cross-entropy.
        return paddle.mean(paddle.sum(-true_dist * pred, axis=self.dim))
def get_scheduler(epochs, warmup_epochs, learning_rate):
    """Build a cosine-annealing LR schedule preceded by a linear warmup."""
    cosine = lrScheduler.CosineAnnealingDecay(
        learning_rate=learning_rate, T_max=epochs, eta_min=1e-5, verbose=False)
    # Linear ramp from 1e-5 up to `learning_rate` over `warmup_epochs` steps,
    # then hand over to the cosine decay.
    return lrScheduler.LinearWarmup(
        cosine, warmup_epochs, 1e-5, learning_rate, last_epoch=-1, verbose=False)
# Build the model. Note: mlp_hidden=384 here (not the 4x expansion default of
# the ViT class), with fairly aggressive dropout=0.5 / attn_dropout=0.1.
Model = ViT(in_c=3, num_classes=10, img_size=32, patch=8, dropout=0.5, attn_dropout=0.1, num_layers=7, hidden=384, head=12, mlp_hidden=384, is_cls_token=True)
# Print the model structure summary.
paddle.summary(Model, (1, 3, 32, 32))
# Training hyperparameters, optimizer, loss, LR schedule and data iterators.
EPOCHS = 100        # total training epochs
BATCH_SIZE = 128    # training batch size
NUM_CLASSES = 10    # number of classes
WARMUP_EPOCHS = 5   # linear warmup length, in epochs
LR = 1e-3           # peak learning rate
# Warmup + cosine-decay learning-rate scheduler.
scheduler = get_scheduler(epochs=EPOCHS, warmup_epochs=WARMUP_EPOCHS, learning_rate=LR)
# Adam optimizer with weight decay.
optim = paddle.optimizer.Adam(learning_rate=scheduler, parameters=Model.parameters(), weight_decay=5e-5)
# Label-smoothing loss.
criterion = LabelSmoothingCrossEntropyLoss(NUM_CLASSES, smoothing=0.1)
# Shuffled training loader.
train_loader = DataLoader(cifar10_train, batch_size=BATCH_SIZE, shuffle=True, num_workers=0, drop_last=False)
# Deterministic test loader with larger batches (inference only).
test_loader = DataLoader(cifar10_test, batch_size=BATCH_SIZE * 16, shuffle=False, num_workers=0, drop_last=False)
# train_epoch: one training pass over the data, reporting the learning rate,
# loss, and running training accuracy on a progress bar.
def train_epoch(model, epoch, interval=20):
    """Run one training epoch over `train_loader`.

    Uses the module-level `criterion`, `optim` and `scheduler`. The progress
    bar is only displayed every `interval` epochs.
    """
    # BUG FIX: validation() switches the model to eval mode and never restores
    # train mode, which would disable dropout for every epoch after the first.
    model.train()
    acc_num = 0        # correctly predicted samples so far
    total_samples = 0  # samples processed so far
    nb = len(train_loader)
    pbar = tqdm(enumerate(train_loader), total=nb, colour='red',
                disable=((epoch + 1) % interval != 0))
    pbar.set_description(f'EPOCH: {epoch:3d}')
    for _, data in pbar:
        # Each batch is (features, labels).
        x_data = data[0]
        y_data = data[1]
        predicts = model(x_data)
        loss = criterion(predicts, y_data)
        loss_item = loss.item()
        # Accumulate the running accuracy over the whole epoch.
        acc_num += paddle.sum(predicts.argmax(1) == y_data).item()
        total_samples += y_data.shape[0]
        total_acc = acc_num / total_samples
        current_lr = optim.get_lr()
        loss.backward()
        pbar.set_postfix(train_loss=f'{loss_item:5f}', train_acc=f'{total_acc:5f}', train_lr=f'{current_lr:5f}')
        optim.step()
        optim.clear_grad()
    # Advance the LR schedule once per epoch: both the warmup length and the
    # cosine period (T_max=EPOCHS) are expressed in epochs, not batches.
    scheduler.step()
# validation: evaluates the model on the test set and reports its accuracy.
@paddle.no_grad()
def validation(model, epoch, interval=20):
    """Evaluate on `test_loader`, showing per-batch and cumulative accuracy.

    The progress bar is only displayed every `interval` epochs.
    """
    model.eval()
    correct = 0  # correctly predicted samples so far
    seen = 0     # samples processed so far
    bar = tqdm(enumerate(test_loader), total=len(test_loader), colour='green',
               disable=((epoch + 1) % interval != 0))
    bar.set_description(f'EVAL')
    for _, batch in bar:
        images, labels = batch[0], batch[1]
        logits = model(images)
        correct += paddle.sum(logits.argmax(1) == labels).item()
        seen += labels.shape[0]
        per_batch = paddle.metric.accuracy(logits, labels.unsqueeze(1)).item()
        bar.set_postfix(eval_batch_acc=f'{per_batch:4f}', total_acc=f'{correct / seen:4f}')
# Training driver: progress is printed every 20 epochs (the `interval`
# default of train_epoch/validation); weights are saved every 50 epochs.
start = time.time()
print(start)
for epoch in range(EPOCHS):
    train_epoch(Model, epoch)
    validation(Model, epoch)
    # Periodic checkpoint, named by the 1-based epoch number.
    if (epoch + 1) % 50 == 0:
        paddle.save(Model.state_dict(), str(epoch + 1) + '.pdparams')
# Final weights used for the evaluation below.
paddle.save(Model.state_dict(), 'finished.pdparams')
end = time.time()
print('Training Cost ', (end-start) / 60, 'minutes')
# Final evaluation: reload the saved weights and report Top-1 / Top-5 accuracy
# on the test set.
state_dict = paddle.load('finished.pdparams')  # load trained weights
Model.set_state_dict(state_dict)
Model.eval()
top1_num = 0       # sample-weighted count of Top-1 correct predictions
top5_num = 0       # sample-weighted count of Top-5 correct predictions
total_samples = 0
nb = len(test_loader)
pbar = enumerate(test_loader)
pbar = tqdm(pbar, total=nb, colour='green')
pbar.set_description(f'EVAL')
with paddle.no_grad():
    for _, (_, data) in enumerate(pbar):
        x_data = data[0]
        y_data = data[1]
        predicts = Model(x_data)
        total_samples += y_data.shape[0]
        # paddle.metric.accuracy returns a batch-mean rate; multiplying by the
        # batch size converts it to a count so the running totals are exact.
        top1_num += paddle.metric.accuracy(predicts, y_data.unsqueeze(1), k=1).item() * y_data.shape[0]
        top5_num += paddle.metric.accuracy(predicts, y_data.unsqueeze(1), k=5).item() * y_data.shape[0]
        TOP1 = top1_num / total_samples
        TOP5 = top5_num / total_samples
        pbar.set_postfix(TOP1=f'{TOP1:4f}', TOP5=f'{TOP5:4f}')
预测结果:TOP1=0.800800, TOP5=0.963500