Today's task is to build the VGG-16 model by hand with PyTorch. The key points are:
- How to save model parameters to disk and load them back
- The use of nn.Sequential(), nn.BatchNorm2d(), and nn.MaxPool2d()
- Model tuning
1. Dataset and DataLoader Construction
As with the earlier datasets, the images are organized into folders whose names serve as the labels. Calling ImageFolder from torchvision.datasets automatically wraps such a directory into a Dataset.
python
from torchvision import datasets

total_data = datasets.ImageFolder("./48-data/", transform=train_transforms)
total_data.class_to_idx
# {'Angelina Jolie': 0, 'Brad Pitt': 1, 'Denzel Washington': 2, 'Hugh Jackman': 3}
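The call above assumes a train_transforms pipeline has already been defined. A minimal sketch of what it might look like (the 224x224 size matches the tensor shapes printed later; the normalization statistics are the usual ImageNet values and are an assumption here):

python
from torchvision import transforms

# Hypothetical transform pipeline; adjust sizes/statistics to your data
train_transforms = transforms.Compose([
    transforms.Resize([224, 224]),   # resize every image to 224x224
    transforms.ToTensor(),           # HWC uint8 -> CHW float in [0, 1]
    transforms.Normalize(mean=[0.485, 0.456, 0.406],
                         std=[0.229, 0.224, 0.225]),  # ImageNet statistics (assumed)
])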
Path.glob('*') can be used to iterate over all subdirectories and recover the label names:
python
import pathlib

# Collect the label names from the subdirectory names
data_dir = pathlib.Path('./48-data/')
data_paths = list(data_dir.glob('*'))
classNames = [path.name for path in data_paths]  # path.name avoids hard-coding the "\\" separator
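Since ImageFolder already records the class names, the same list can also be read straight from the dataset object (a small convenience, not in the original write-up):

python
classNames = total_data.classes  # sorted list of subdirectory names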
torch.utils.data.random_split() splits the data into training, validation, and test sets:
python
from torch.utils.data import DataLoader, random_split

# Compute the split sizes according to a 7:2:1 (train : test : val) ratio
total_size = len(total_data)
train_size = int(0.7 * total_size)
val_size = int(0.1 * total_size)
test_size = total_size - train_size - val_size  # the remainder becomes the test set

# Split the dataset with random_split
train_dataset, val_dataset, test_dataset = random_split(total_data, [train_size, val_size, test_size])

# Define the batch size
batch_size = 32

# Create the DataLoaders
train_loader = DataLoader(train_dataset, batch_size=batch_size, shuffle=True, num_workers=1)
val_loader = DataLoader(val_dataset, batch_size=batch_size, shuffle=True, num_workers=1)
test_loader = DataLoader(test_dataset, batch_size=batch_size, shuffle=True, num_workers=1)
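random_split shuffles indices randomly, so each run produces a different split by default. If reproducibility matters, a fixed generator can be passed (an optional tweak, not in the original code):

python
import torch

generator = torch.Generator().manual_seed(42)  # fixed seed so the split is repeatable
train_dataset, val_dataset, test_dataset = random_split(
    total_data, [train_size, val_size, test_size], generator=generator)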
Check the tensor shapes coming out of the DataLoader:
python
for item in train_loader:
    x, y = item
    print(x.shape, y.shape)
    break
# torch.Size([32, 3, 224, 224]) torch.Size([32])
2. Building the Model
Batch Normalization
What it does:
- Normalization: gives the activations zero mean and unit variance, which speeds up training and improves the stability of the network.
- Learnable shift and scale: after normalization, the learnable parameters γ and β are applied, so the network can still recover the representational power of the data.
- Mitigates vanishing/exploding gradients: in deep networks, batch normalization effectively reduces gradient vanishing and explosion.
Typical placement:
Batch normalization is usually placed after the convolution layer and before the activation function, so the convolution output is normalized before it is activated, which helps the activations converge faster.
Max Pooling
What it does:
- Downsampling: reduces the spatial size of the feature map (typically over 2x2 regions) by keeping the maximum value in each region, preserving the dominant features while shrinking the map.
- Less computation: lowers the computational cost and parameter count of the following layers.
- Keeps the salient information: retaining the strongest responses makes the network more robust to small translations.
Typical placement:
Max pooling is usually placed after the activation function, and therefore after batch normalization as well. The order is convolution, batch normalization, activation, then pooling, which feeds higher-level features to the next block.
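Putting the two notes above together, one convolution block in this ordering looks like the following (the channel counts here are arbitrary, purely for illustration):

python
import torch.nn as nn

# One block: Conv -> BatchNorm -> ReLU -> MaxPool
block = nn.Sequential(
    nn.Conv2d(in_channels=3, out_channels=64, kernel_size=3, padding=1),  # convolution
    nn.BatchNorm2d(num_features=64),   # normalize before the activation
    nn.ReLU(inplace=True),             # activation
    nn.MaxPool2d(kernel_size=2),       # downsample after the activation
)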
A first draft of the model is shown below. It contains a lot of repetitive code, which can be cleaned up with nn.Sequential().
python
import torch.nn as nn
import torch.nn.functional as F

class VGG16(nn.Module):
    def __init__(self, num_class):
        super().__init__()
        # Note: the real VGG-16 stacks 3x3 convolutions; this simplified version uses single 1x1 convolutions per block
        self.conv1 = nn.Conv2d(in_channels=3, out_channels=64, kernel_size=1)
        self.bn1 = nn.BatchNorm2d(num_features=64)
        self.pool1 = nn.MaxPool2d(kernel_size=2)
        self.conv2 = nn.Conv2d(in_channels=64, out_channels=128, kernel_size=1)
        self.bn2 = nn.BatchNorm2d(num_features=128)
        self.pool2 = nn.MaxPool2d(kernel_size=2)
        self.conv3 = nn.Conv2d(in_channels=128, out_channels=256, kernel_size=1)
        self.bn3 = nn.BatchNorm2d(num_features=256)
        self.pool3 = nn.MaxPool2d(kernel_size=2)
        self.conv4 = nn.Conv2d(in_channels=256, out_channels=512, kernel_size=1)
        self.bn4 = nn.BatchNorm2d(num_features=512)
        self.pool4 = nn.MaxPool2d(kernel_size=2)
        self.conv5 = nn.Conv2d(in_channels=512, out_channels=1024, kernel_size=1)
        self.bn5 = nn.BatchNorm2d(num_features=1024)
        self.pool5 = nn.MaxPool2d(kernel_size=2)
        self.avgpool = nn.AdaptiveAvgPool2d(output_size=1)
        self.fc1 = nn.Linear(in_features=1024, out_features=4096)
        self.fc2 = nn.Linear(in_features=4096, out_features=4096)
        self.fc3 = nn.Linear(in_features=4096, out_features=1000)
        self.fc4 = nn.Linear(in_features=1000, out_features=num_class)  # was hard-coded to 10, which ignored num_class
        self.dp1 = nn.Dropout(0.3)

    def forward(self, x):
        x = self.pool1(F.relu(self.bn1(self.conv1(x))))
        x = self.pool2(F.relu(self.bn2(self.conv2(x))))
        x = self.pool3(F.relu(self.bn3(self.conv3(x))))
        x = self.pool4(F.relu(self.bn4(self.conv4(x))))
        x = self.pool5(F.relu(self.bn5(self.conv5(x))))
        x = self.avgpool(x)
        x = x.permute(0, 2, 3, 1)          # move channels to the last dim so Linear sees in_features=1024
        x = F.relu(self.fc1(x))
        x = self.dp1(F.relu(self.fc2(x)))
        x = self.dp1(F.relu(self.fc3(x)))
        x = self.fc4(x)                    # return raw logits; no ReLU on the final layer
        x = x.flatten(start_dim=1)
        return x
The refactored code using nn.Sequential():
python
class VGG16(nn.Module):
    def __init__(self, num_class=10):
        super().__init__()
        # Convolutional part
        self.features = nn.Sequential(
            nn.Conv2d(in_channels=3, out_channels=64, kernel_size=1),
            nn.BatchNorm2d(num_features=64),
            nn.ReLU(inplace=True),
            nn.MaxPool2d(kernel_size=2),

            nn.Conv2d(in_channels=64, out_channels=128, kernel_size=1),
            nn.BatchNorm2d(num_features=128),
            nn.ReLU(inplace=True),
            nn.MaxPool2d(kernel_size=2),

            nn.Conv2d(in_channels=128, out_channels=256, kernel_size=1),
            nn.BatchNorm2d(num_features=256),
            nn.ReLU(inplace=True),
            nn.MaxPool2d(kernel_size=2),

            nn.Conv2d(in_channels=256, out_channels=512, kernel_size=1),
            nn.BatchNorm2d(num_features=512),
            nn.ReLU(inplace=True),
            nn.MaxPool2d(kernel_size=2),

            nn.Conv2d(in_channels=512, out_channels=1024, kernel_size=1),
            nn.BatchNorm2d(num_features=1024),
            nn.ReLU(inplace=True),
            nn.MaxPool2d(kernel_size=2),

            nn.AdaptiveAvgPool2d(output_size=1)
        )
        # Fully connected part
        self.classifier = nn.Sequential(
            nn.Flatten(),  # flatten before the fully connected layers
            nn.Linear(in_features=1024, out_features=4096),
            nn.ReLU(inplace=True),
            nn.Dropout(0.3),
            nn.Linear(in_features=4096, out_features=4096),
            nn.ReLU(inplace=True),
            nn.Dropout(0.3),
            nn.Linear(in_features=4096, out_features=1000),
            nn.ReLU(inplace=True),
            nn.Linear(in_features=1000, out_features=num_class)  # number of output classes
        )

    def forward(self, x):
        x = self.features(x)    # convolutional part
        x = self.classifier(x)  # fully connected part
        return x
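A quick sanity check of the refactored model with a dummy input (a sketch; the 224x224 input size assumes the transform from earlier, and num_class=17 matches the training code below):

python
import torch

model = VGG16(num_class=17)
dummy = torch.randn(1, 3, 224, 224)  # one fake RGB image
print(model(dummy).shape)            # expected: torch.Size([1, 17])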
3. Model Tuning
The main training loop is as follows:
python
epochs = 20
train_loss = []
train_acc = []
test_loss = []
test_acc = []

model = VGG16(num_class=17).to(device)
loss_fn = nn.CrossEntropyLoss()  # loss function
learn_rate = 1e-2                # learning rate
opt = torch.optim.SGD(model.parameters(), lr=learn_rate)
scheduler = torch.optim.lr_scheduler.StepLR(opt, step_size=5, gamma=0.5)

for epoch in range(epochs):
    model.train()
    epoch_train_acc, epoch_train_loss = train(train_loader, model, loss_fn, opt, scheduler)

    model.eval()
    epoch_test_acc, epoch_test_loss = test(test_loader, model, loss_fn)

    train_acc.append(epoch_train_acc)
    train_loss.append(epoch_train_loss)
    test_acc.append(epoch_test_acc)
    test_loss.append(epoch_test_loss)

    template = ('Epoch:{:2d}, Train_acc:{:.1f}%, Train_loss:{:.3f}, Test_acc:{:.1f}%, Test_loss:{:.3f}')
    print(template.format(epoch+1, epoch_train_acc*100, epoch_train_loss, epoch_test_acc*100, epoch_test_loss))
print('Done')
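The loop above relies on train() and test() helpers that are defined elsewhere. A minimal sketch of what they are assumed to do, each returning (accuracy, loss) averaged over the loader:

python
def train(dataloader, model, loss_fn, optimizer, scheduler=None):
    size = len(dataloader.dataset)
    num_batches = len(dataloader)
    total_loss, correct = 0.0, 0
    for x, y in dataloader:
        x, y = x.to(device), y.to(device)
        pred = model(x)
        loss = loss_fn(pred, y)
        optimizer.zero_grad()
        loss.backward()
        optimizer.step()
        total_loss += loss.item()
        correct += (pred.argmax(1) == y).sum().item()
    if scheduler is not None:
        scheduler.step()  # decay the learning rate once per epoch
    return correct / size, total_loss / num_batches


def test(dataloader, model, loss_fn):
    size = len(dataloader.dataset)
    num_batches = len(dataloader)
    total_loss, correct = 0.0, 0
    with torch.no_grad():
        for x, y in dataloader:
            x, y = x.to(device), y.to(device)
            pred = model(x)
            total_loss += loss_fn(pred, y).item()
            correct += (pred.argmax(1) == y).sum().item()
    return correct / size, total_loss / num_batches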
The results of this initial training run are as follows:
Now add a validation set and early stopping, and only evaluate on the test set at the very end of training. Early stopping guarantees the saved model is never worse on the validation set than the best one seen so far. Intermediate model parameters are saved to disk, and before training starts we check whether previously trained weights already exist and can be reused.
python
import os

epochs = 50
train_loss, train_acc = [], []
val_loss, val_acc = [], []

model = VGG16(num_class=17).to(device)

# Loss function, optimizer, and learning-rate scheduler
loss_fn = nn.CrossEntropyLoss()
learn_rate = 1e-2
opt = torch.optim.SGD(model.parameters(), lr=learn_rate)
scheduler = torch.optim.lr_scheduler.StepLR(opt, step_size=5, gamma=0.5)

# Early-stopping and checkpointing configuration
early_stop_patience = 5
early_stop_counter = 0
min_val_loss = float('inf')
best_model_path = 'best_model.pth'

# Resume from previously saved weights if they exist
if os.path.exists(best_model_path):
    model.load_state_dict(torch.load(best_model_path, map_location=device))
    print("Loaded pre-trained model weights.")
else:
    print("No pre-trained model found. Training from scratch.")

for epoch in range(epochs):
    # Training
    model.train()
    epoch_train_acc, epoch_train_loss = train(train_loader, model, loss_fn, opt, scheduler)

    # Validation
    model.eval()
    with torch.no_grad():
        epoch_val_acc, epoch_val_loss = test(val_loader, model, loss_fn)

    # Record the results of this epoch
    train_acc.append(epoch_train_acc)
    train_loss.append(epoch_train_loss)
    val_acc.append(epoch_val_acc)
    val_loss.append(epoch_val_loss)

    # Print the training and validation results
    print(f'Epoch:{epoch+1:2d}, Train_acc:{epoch_train_acc*100:.1f}%, Train_loss:{epoch_train_loss:.3f}, '
          f'Val_acc:{epoch_val_acc*100:.1f}%, Val_loss:{epoch_val_loss:.3f}')

    # Early stopping and checkpointing
    if epoch_val_loss < min_val_loss:
        min_val_loss = epoch_val_loss
        early_stop_counter = 0
        torch.save(model.state_dict(), best_model_path)  # keep the best weights so far
    else:
        early_stop_counter += 1
        if early_stop_counter >= early_stop_patience:
            print(f"Early stopping triggered at epoch {epoch+1}.")
            break

# Load the best model and evaluate it on the test set
model.load_state_dict(torch.load(best_model_path, map_location=device))
model.eval()
with torch.no_grad():
    epoch_test_acc, epoch_test_loss = test(test_loader, model, loss_fn)
print("Test_acc: {:.1f}%, Test_loss: {:.3f}".format(epoch_test_acc*100, epoch_test_loss))
print('Training and evaluation complete.')
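Saving only state_dict() is enough to restore the weights. If training is meant to be resumed across runs, it can also help to save the optimizer and scheduler state; a sketch of such a richer checkpoint (not part of the original code):

python
# Save a full checkpoint (weights + optimizer + scheduler + bookkeeping)
checkpoint = {
    'epoch': epoch,
    'model_state': model.state_dict(),
    'optimizer_state': opt.state_dict(),
    'scheduler_state': scheduler.state_dict(),
    'min_val_loss': min_val_loss,
}
torch.save(checkpoint, 'checkpoint.pth')

# Restore it later
checkpoint = torch.load('checkpoint.pth', map_location=device)
model.load_state_dict(checkpoint['model_state'])
opt.load_state_dict(checkpoint['optimizer_state'])
scheduler.load_state_dict(checkpoint['scheduler_state'])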
After these changes and about 60 epochs of training, the accuracy on the test set reaches 21.4%.
4. Test Demo
Use the model to predict a single local image:
python
from PIL import Image
import matplotlib.pyplot as plt

classes = list(total_data.class_to_idx)
model.load_state_dict(torch.load(best_model_path, map_location=device))

def predict_one_image(image_path, model, transform, classes):
    test_img = Image.open(image_path).convert('RGB')
    plt.imshow(test_img)  # show the image being predicted

    test_img = transform(test_img)
    img = test_img.to(device).unsqueeze(0)  # add the batch dimension

    model.eval()
    with torch.no_grad():
        output = model(img)
    _, pred = torch.max(output, 1)
    pred_class = classes[pred.item()]
    print(f'Predicted class: {pred_class}')

# Predict one image from the training set
predict_one_image(image_path='./48-data/Brad Pitt/002_cc1b9701.jpg',
                  model=model,
                  transform=train_transforms,
                  classes=classes)
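If a confidence score is wanted along with the label, the logits can be passed through softmax (a small extension, not in the original demo):

python
# Inside predict_one_image, after `output = model(img)`:
probs = torch.softmax(output, dim=1)  # convert logits to probabilities
conf, pred = torch.max(probs, 1)
print(f'Predicted class: {classes[pred.item()]} (confidence {conf.item():.1%})')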