Today's task is to build the VGG-16 model by hand with PyTorch. The key points are:
- How to save model parameters to disk and load them back
- The use of nn.Sequential(), nn.BatchNorm2d(), and nn.MaxPool2d()
- Model tuning
1. Dataset and DataLoader Construction
As with the earlier datasets, the images are organized into folders whose names serve as the labels. Calling ImageFolder from torchvision.datasets automatically wraps such a directory into a Dataset.
python
from torchvision import datasets

total_data = datasets.ImageFolder("./48-data/", transform=train_transforms)
total_data.class_to_idx
# {'Angelina Jolie': 0, 'Brad Pitt': 1, 'Denzel Washington': 2, 'Hugh Jackman': 3}
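The call above assumes a train_transforms pipeline has already been defined. A minimal sketch of what it might look like (the 224x224 size matches the tensor shapes printed later; the normalization statistics are the usual ImageNet values and are an assumption here):

python
from torchvision import transforms

# Hypothetical transform pipeline; adjust sizes/statistics to your data
train_transforms = transforms.Compose([
    transforms.Resize([224, 224]),   # resize every image to 224x224
    transforms.ToTensor(),           # HWC uint8 -> CHW float in [0, 1]
    transforms.Normalize(mean=[0.485, 0.456, 0.406],
                         std=[0.229, 0.224, 0.225]),  # ImageNet statistics (assumed)
])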
Path.glob('*') can be used to iterate over all subdirectories and recover the label names:
python
import pathlib

# Collect the label names from the subdirectory names
data_dir = pathlib.Path('./48-data/')
data_paths = list(data_dir.glob('*'))
classNames = [path.name for path in data_paths]  # path.name avoids hard-coding the "\\" separator
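Since ImageFolder already records the class names, the same list can also be read straight from the dataset object (a small convenience, not in the original write-up):

python
classNames = total_data.classes  # sorted list of subdirectory names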
torch.utils.data.random_split() splits the data into training, validation, and test sets:
python
from torch.utils.data import DataLoader, random_split

# Compute the split sizes according to a 7:2:1 (train : test : val) ratio
total_size = len(total_data)
train_size = int(0.7 * total_size)
val_size = int(0.1 * total_size)
test_size = total_size - train_size - val_size  # the remainder becomes the test set

# Split the dataset with random_split
train_dataset, val_dataset, test_dataset = random_split(total_data, [train_size, val_size, test_size])

# Define the batch size
batch_size = 32

# Create the DataLoaders
train_loader = DataLoader(train_dataset, batch_size=batch_size, shuffle=True, num_workers=1)
val_loader = DataLoader(val_dataset, batch_size=batch_size, shuffle=True, num_workers=1)
test_loader = DataLoader(test_dataset, batch_size=batch_size, shuffle=True, num_workers=1)
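random_split shuffles indices randomly, so each run produces a different split by default. If reproducibility matters, a fixed generator can be passed (an optional tweak, not in the original code):

python
import torch

generator = torch.Generator().manual_seed(42)  # fixed seed so the split is repeatable
train_dataset, val_dataset, test_dataset = random_split(
    total_data, [train_size, val_size, test_size], generator=generator)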
Check the tensor shapes coming out of the DataLoader:
python
for item in train_loader:
    x, y = item
    print(x.shape, y.shape)
    break
# torch.Size([32, 3, 224, 224]) torch.Size([32])
2. Building the Model
Batch Normalization
What it does:
- Normalization: gives the activations zero mean and unit variance, which speeds up training and improves the stability of the network.
- Learnable shift and scale: after normalization, the learnable parameters γ and β are applied, so the network can still recover the representational power of the data.
- Mitigates vanishing/exploding gradients: in deep networks, batch normalization effectively reduces gradient vanishing and explosion.
Typical placement:
Batch normalization is usually placed after the convolution layer and before the activation function, so the convolution output is normalized before it is activated, which helps the activations converge faster.
Max Pooling
What it does:
- Downsampling: reduces the spatial size of the feature map (typically over 2x2 regions) by keeping the maximum value in each region, preserving the dominant features while shrinking the map.
- Less computation: lowers the computational cost and parameter count of the following layers.
- Keeps the salient information: retaining the strongest responses makes the network more robust to small translations.
Typical placement:
Max pooling is usually placed after the activation function, and therefore after batch normalization as well. The order is convolution, batch normalization, activation, then pooling, which feeds higher-level features to the next block.
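Putting the two notes above together, one convolution block in this ordering looks like the following (the channel counts here are arbitrary, purely for illustration):

python
import torch.nn as nn

# One block: Conv -> BatchNorm -> ReLU -> MaxPool
block = nn.Sequential(
    nn.Conv2d(in_channels=3, out_channels=64, kernel_size=3, padding=1),  # convolution
    nn.BatchNorm2d(num_features=64),   # normalize before the activation
    nn.ReLU(inplace=True),             # activation
    nn.MaxPool2d(kernel_size=2),       # downsample after the activation
)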
A first draft of the model is shown below. It contains a lot of repetitive code, which can be cleaned up with nn.Sequential().
python
import torch.nn as nn
import torch.nn.functional as F

class VGG16(nn.Module):
    def __init__(self, num_class):
        super().__init__()
        # Note: the real VGG-16 stacks 3x3 convolutions; this simplified version uses single 1x1 convolutions per block
        self.conv1 = nn.Conv2d(in_channels=3, out_channels=64, kernel_size=1)
        self.bn1 = nn.BatchNorm2d(num_features=64)
        self.pool1 = nn.MaxPool2d(kernel_size=2)
        self.conv2 = nn.Conv2d(in_channels=64, out_channels=128, kernel_size=1)
        self.bn2 = nn.BatchNorm2d(num_features=128)
        self.pool2 = nn.MaxPool2d(kernel_size=2)
        self.conv3 = nn.Conv2d(in_channels=128, out_channels=256, kernel_size=1)
        self.bn3 = nn.BatchNorm2d(num_features=256)
        self.pool3 = nn.MaxPool2d(kernel_size=2)
        self.conv4 = nn.Conv2d(in_channels=256, out_channels=512, kernel_size=1)
        self.bn4 = nn.BatchNorm2d(num_features=512)
        self.pool4 = nn.MaxPool2d(kernel_size=2)
        self.conv5 = nn.Conv2d(in_channels=512, out_channels=1024, kernel_size=1)
        self.bn5 = nn.BatchNorm2d(num_features=1024)
        self.pool5 = nn.MaxPool2d(kernel_size=2)
        self.avgpool = nn.AdaptiveAvgPool2d(output_size=1)
        self.fc1 = nn.Linear(in_features=1024, out_features=4096)
        self.fc2 = nn.Linear(in_features=4096, out_features=4096)
        self.fc3 = nn.Linear(in_features=4096, out_features=1000)
        self.fc4 = nn.Linear(in_features=1000, out_features=num_class)  # was hard-coded to 10, which ignored num_class
        self.dp1 = nn.Dropout(0.3)

    def forward(self, x):
        x = self.pool1(F.relu(self.bn1(self.conv1(x))))
        x = self.pool2(F.relu(self.bn2(self.conv2(x))))
        x = self.pool3(F.relu(self.bn3(self.conv3(x))))
        x = self.pool4(F.relu(self.bn4(self.conv4(x))))
        x = self.pool5(F.relu(self.bn5(self.conv5(x))))
        x = self.avgpool(x)
        x = x.permute(0, 2, 3, 1)          # move channels to the last dim so Linear sees in_features=1024
        x = F.relu(self.fc1(x))
        x = self.dp1(F.relu(self.fc2(x)))
        x = self.dp1(F.relu(self.fc3(x)))
        x = self.fc4(x)                    # return raw logits; no ReLU on the final layer
        x = x.flatten(start_dim=1)
        return x
The refactored code using nn.Sequential():
python
class VGG16(nn.Module):
    def __init__(self, num_class=10):
        super().__init__()
        # Convolutional part
        self.features = nn.Sequential(
            nn.Conv2d(in_channels=3, out_channels=64, kernel_size=1),
            nn.BatchNorm2d(num_features=64),
            nn.ReLU(inplace=True),
            nn.MaxPool2d(kernel_size=2),

            nn.Conv2d(in_channels=64, out_channels=128, kernel_size=1),
            nn.BatchNorm2d(num_features=128),
            nn.ReLU(inplace=True),
            nn.MaxPool2d(kernel_size=2),

            nn.Conv2d(in_channels=128, out_channels=256, kernel_size=1),
            nn.BatchNorm2d(num_features=256),
            nn.ReLU(inplace=True),
            nn.MaxPool2d(kernel_size=2),

            nn.Conv2d(in_channels=256, out_channels=512, kernel_size=1),
            nn.BatchNorm2d(num_features=512),
            nn.ReLU(inplace=True),
            nn.MaxPool2d(kernel_size=2),

            nn.Conv2d(in_channels=512, out_channels=1024, kernel_size=1),
            nn.BatchNorm2d(num_features=1024),
            nn.ReLU(inplace=True),
            nn.MaxPool2d(kernel_size=2),

            nn.AdaptiveAvgPool2d(output_size=1)
        )
        # Fully connected part
        self.classifier = nn.Sequential(
            nn.Flatten(),  # flatten before the fully connected layers
            nn.Linear(in_features=1024, out_features=4096),
            nn.ReLU(inplace=True),
            nn.Dropout(0.3),
            nn.Linear(in_features=4096, out_features=4096),
            nn.ReLU(inplace=True),
            nn.Dropout(0.3),
            nn.Linear(in_features=4096, out_features=1000),
            nn.ReLU(inplace=True),
            nn.Linear(in_features=1000, out_features=num_class)  # number of output classes
        )

    def forward(self, x):
        x = self.features(x)    # convolutional part
        x = self.classifier(x)  # fully connected part
        return x
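A quick sanity check of the refactored model with a dummy input (a sketch; the 224x224 input size assumes the transform from earlier, and num_class=17 matches the training code below):

python
import torch

model = VGG16(num_class=17)
dummy = torch.randn(1, 3, 224, 224)  # one fake RGB image
print(model(dummy).shape)            # expected: torch.Size([1, 17])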
3. Model Tuning
The main training loop is as follows:
python
epochs = 20
train_loss = []
train_acc = []
test_loss = []
test_acc = []

model = VGG16(num_class=17).to(device)
loss_fn = nn.CrossEntropyLoss()  # loss function
learn_rate = 1e-2                # learning rate
opt = torch.optim.SGD(model.parameters(), lr=learn_rate)
scheduler = torch.optim.lr_scheduler.StepLR(opt, step_size=5, gamma=0.5)

for epoch in range(epochs):
    model.train()
    epoch_train_acc, epoch_train_loss = train(train_loader, model, loss_fn, opt, scheduler)

    model.eval()
    epoch_test_acc, epoch_test_loss = test(test_loader, model, loss_fn)

    train_acc.append(epoch_train_acc)
    train_loss.append(epoch_train_loss)
    test_acc.append(epoch_test_acc)
    test_loss.append(epoch_test_loss)

    template = ('Epoch:{:2d}, Train_acc:{:.1f}%, Train_loss:{:.3f}, Test_acc:{:.1f}%, Test_loss:{:.3f}')
    print(template.format(epoch+1, epoch_train_acc*100, epoch_train_loss, epoch_test_acc*100, epoch_test_loss))
print('Done')
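The loop above relies on train() and test() helpers that are defined elsewhere. A minimal sketch of what they are assumed to do, each returning (accuracy, loss) averaged over the loader:

python
def train(dataloader, model, loss_fn, optimizer, scheduler=None):
    size = len(dataloader.dataset)
    num_batches = len(dataloader)
    total_loss, correct = 0.0, 0
    for x, y in dataloader:
        x, y = x.to(device), y.to(device)
        pred = model(x)
        loss = loss_fn(pred, y)
        optimizer.zero_grad()
        loss.backward()
        optimizer.step()
        total_loss += loss.item()
        correct += (pred.argmax(1) == y).sum().item()
    if scheduler is not None:
        scheduler.step()  # decay the learning rate once per epoch
    return correct / size, total_loss / num_batches


def test(dataloader, model, loss_fn):
    size = len(dataloader.dataset)
    num_batches = len(dataloader)
    total_loss, correct = 0.0, 0
    with torch.no_grad():
        for x, y in dataloader:
            x, y = x.to(device), y.to(device)
            pred = model(x)
            total_loss += loss_fn(pred, y).item()
            correct += (pred.argmax(1) == y).sum().item()
    return correct / size, total_loss / num_batches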
The results of this initial training run are as follows:
Now add a validation set and early stopping, and only evaluate on the test set at the very end of training. Early stopping guarantees the saved model is never worse on the validation set than the best one seen so far. Intermediate model parameters are saved to disk, and before training starts we check whether previously trained weights already exist and can be reused.
python
import os

epochs = 50
train_loss, train_acc = [], []
val_loss, val_acc = [], []

model = VGG16(num_class=17).to(device)

# Loss function, optimizer, and learning-rate scheduler
loss_fn = nn.CrossEntropyLoss()
learn_rate = 1e-2
opt = torch.optim.SGD(model.parameters(), lr=learn_rate)
scheduler = torch.optim.lr_scheduler.StepLR(opt, step_size=5, gamma=0.5)

# Early-stopping and checkpointing configuration
early_stop_patience = 5
early_stop_counter = 0
min_val_loss = float('inf')
best_model_path = 'best_model.pth'

# Resume from previously saved weights if they exist
if os.path.exists(best_model_path):
    model.load_state_dict(torch.load(best_model_path, map_location=device))
    print("Loaded pre-trained model weights.")
else:
    print("No pre-trained model found. Training from scratch.")

for epoch in range(epochs):
    # Training
    model.train()
    epoch_train_acc, epoch_train_loss = train(train_loader, model, loss_fn, opt, scheduler)

    # Validation
    model.eval()
    with torch.no_grad():
        epoch_val_acc, epoch_val_loss = test(val_loader, model, loss_fn)

    # Record the results of this epoch
    train_acc.append(epoch_train_acc)
    train_loss.append(epoch_train_loss)
    val_acc.append(epoch_val_acc)
    val_loss.append(epoch_val_loss)

    # Print the training and validation results
    print(f'Epoch:{epoch+1:2d}, Train_acc:{epoch_train_acc*100:.1f}%, Train_loss:{epoch_train_loss:.3f}, '
          f'Val_acc:{epoch_val_acc*100:.1f}%, Val_loss:{epoch_val_loss:.3f}')

    # Early stopping and checkpointing
    if epoch_val_loss < min_val_loss:
        min_val_loss = epoch_val_loss
        early_stop_counter = 0
        torch.save(model.state_dict(), best_model_path)  # keep the best weights so far
    else:
        early_stop_counter += 1
        if early_stop_counter >= early_stop_patience:
            print(f"Early stopping triggered at epoch {epoch+1}.")
            break

# Load the best model and evaluate it on the test set
model.load_state_dict(torch.load(best_model_path, map_location=device))
model.eval()
with torch.no_grad():
    epoch_test_acc, epoch_test_loss = test(test_loader, model, loss_fn)
print("Test_acc: {:.1f}%, Test_loss: {:.3f}".format(epoch_test_acc*100, epoch_test_loss))
print('Training and evaluation complete.')
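Saving only state_dict() is enough to restore the weights. If training is meant to be resumed across runs, it can also help to save the optimizer and scheduler state; a sketch of such a richer checkpoint (not part of the original code):

python
# Save a full checkpoint (weights + optimizer + scheduler + bookkeeping)
checkpoint = {
    'epoch': epoch,
    'model_state': model.state_dict(),
    'optimizer_state': opt.state_dict(),
    'scheduler_state': scheduler.state_dict(),
    'min_val_loss': min_val_loss,
}
torch.save(checkpoint, 'checkpoint.pth')

# Restore it later
checkpoint = torch.load('checkpoint.pth', map_location=device)
model.load_state_dict(checkpoint['model_state'])
opt.load_state_dict(checkpoint['optimizer_state'])
scheduler.load_state_dict(checkpoint['scheduler_state'])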
After these changes and about 60 epochs of training, the accuracy on the test set reaches 21.4%.
4. Test Demo
Use the model to predict a single local image:
python
from PIL import Image
import matplotlib.pyplot as plt

classes = list(total_data.class_to_idx)
model.load_state_dict(torch.load(best_model_path, map_location=device))

def predict_one_image(image_path, model, transform, classes):
    test_img = Image.open(image_path).convert('RGB')
    plt.imshow(test_img)  # show the image being predicted

    test_img = transform(test_img)
    img = test_img.to(device).unsqueeze(0)  # add the batch dimension

    model.eval()
    with torch.no_grad():
        output = model(img)
    _, pred = torch.max(output, 1)
    pred_class = classes[pred.item()]
    print(f'Predicted class: {pred_class}')

# Predict one image from the training set
predict_one_image(image_path='./48-data/Brad Pitt/002_cc1b9701.jpg',
                  model=model,
                  transform=train_transforms,
                  classes=classes)
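If a confidence score is wanted along with the label, the logits can be passed through softmax (a small extension, not in the original demo):

python
# Inside predict_one_image, after `output = model(img)`:
probs = torch.softmax(output, dim=1)  # convert logits to probabilities
conf, pred = torch.max(probs, 1)
print(f'Predicted class: {classes[pred.item()]} (confidence {conf.item():.1%})')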