# ===== yolo.py: YOLOv1 model definition =====
import torch
import torch.nn as nn
class YOLOv1(nn.Module):
    """Simplified YOLO v1 detector.

    forward() returns a (batch, S, S, B*5 + num_classes) tensor; each grid
    cell holds B boxes laid out as [x, y, w, h, conf] followed by the class
    scores, all squashed into (0, 1) by a sigmoid.
    """

    def __init__(self, num_classes=1, S=7, B=2):
        super(YOLOv1, self).__init__()
        self.S = S
        self.B = B
        self.num_classes = num_classes
        # Feature extractor.  Note: AdaptiveAvgPool2d is intentionally NOT
        # part of this Sequential; spatial compression happens in forward()
        # right before the FC head.
        self.features = nn.Sequential(
            nn.Conv2d(3, 64, 7, 2, 3), nn.LeakyReLU(0.1), nn.MaxPool2d(2),
            nn.Conv2d(64, 192, 3, 1, 1), nn.LeakyReLU(0.1), nn.MaxPool2d(2),
            nn.Conv2d(192, 128, 1), nn.LeakyReLU(0.1),
            nn.Conv2d(128, 256, 3, 1, 1), nn.LeakyReLU(0.1),
            nn.Conv2d(256, 256, 3, 1, 1), nn.LeakyReLU(0.1),
            nn.Conv2d(256, 512, 3, 1, 1), nn.LeakyReLU(0.1), nn.MaxPool2d(2),
            nn.Conv2d(512, 256, 1), nn.LeakyReLU(0.1),
            nn.Conv2d(256, 512, 3, 1, 1), nn.LeakyReLU(0.1),
            nn.Conv2d(512, 256, 1), nn.LeakyReLU(0.1),
            nn.Conv2d(256, 512, 3, 1, 1), nn.LeakyReLU(0.1),
            nn.Conv2d(512, 256, 1), nn.LeakyReLU(0.1),
            nn.Conv2d(256, 512, 3, 1, 1), nn.LeakyReLU(0.1),
            nn.Conv2d(512, 256, 1), nn.LeakyReLU(0.1),
            nn.Conv2d(256, 512, 3, 1, 1), nn.LeakyReLU(0.1),
            nn.MaxPool2d(2),
            nn.Conv2d(512, 512, 1), nn.LeakyReLU(0.1),
            nn.Conv2d(512, 1024, 3, 1, 1), nn.LeakyReLU(0.1),
            nn.Conv2d(1024, 512, 1), nn.LeakyReLU(0.1),
            nn.Conv2d(512, 1024, 3, 1, 1), nn.LeakyReLU(0.1),
            nn.Conv2d(1024, 1024, 3, 1, 1), nn.LeakyReLU(0.1),
            nn.Conv2d(1024, 1024, 3, 2, 1), nn.LeakyReLU(0.1),
        )
        # FC head.  Input is 1024 (the globally pooled 1x1x1024 feature),
        # not 1024*7*7 -- this keeps the checkpoint around 20 MB instead of
        # 900+ MB.
        self.fc = nn.Sequential(
            nn.Flatten(),
            nn.Linear(1024, 4096),
            nn.LeakyReLU(0.1),
            nn.Linear(4096, S * S * (B * 5 + num_classes))
        )

    def forward(self, x):
        x = self.features(x)
        # Compress [N, 1024, H, W] -> [N, 1024, 1, 1] before the FC head.
        x = nn.functional.adaptive_avg_pool2d(x, (1, 1))
        x = self.fc(x)
        # Reshape to [N, S, S, B*5 + num_classes].
        out = x.view(x.size(0), self.S, self.S, self.B * 5 + self.num_classes)
        # BUG FIX: the original applied sigmoid through three in-place slice
        # assignments whose boundaries (:B*4, B*4:B*5, B*5:) do not match the
        # per-box [x, y, w, h, conf] layout assumed by the loss (box b's
        # confidence lives at index b*5 + 4, not at B*4 + b).  Since those
        # three slices jointly covered every channel anyway, applying sigmoid
        # to the whole tensor is numerically equivalent, matches the real
        # layout, and avoids in-place writes on an autograd tensor.
        return torch.sigmoid(out)
# ===== dataset.py: YOLODataset data loading =====
import torch
from torch.utils.data import Dataset
import cv2
import os
import numpy as np
class YOLODataset(Dataset):
    """Detection dataset pairing images with YOLO-format txt labels.

    Each label line is "cls x_center y_center w h" with all coordinates
    normalized to [0, 1] relative to the original image.  Images are
    letterboxed to img_size x img_size and targets are encoded on an
    S x S grid as [objectness, x_offset, y_offset, w, h, one-hot classes].
    """

    def __init__(self, img_dir, label_dir, S=7, img_size=448, augment=True):
        self.img_dir = img_dir
        self.label_dir = label_dir
        self.S = S
        self.img_size = img_size
        self.augment = augment
        # Collect image files (case-insensitive extension match).
        self.img_files = [f for f in os.listdir(img_dir)
                          if f.lower().endswith(('.jpg', '.png', '.jpeg'))]

    def __len__(self):
        return len(self.img_files)

    def __getitem__(self, idx):
        # 1. Read the image.
        img_path = os.path.join(self.img_dir, self.img_files[idx])
        img = cv2.imread(img_path)
        if img is None:
            # Fail with a clear message instead of an opaque AttributeError
            # on img.shape below.
            raise FileNotFoundError(f"Failed to read image: {img_path}")
        h, w = img.shape[:2]  # original height/width
        # 2. Read labels (still in the original normalized coordinates).
        # BUG FIX: the original built the label path via chained
        # str.replace('.jpg', '.txt').replace('.png', '.txt'), which silently
        # produced a wrong path for '.jpeg' files even though __init__
        # accepts that extension.  splitext handles every case.
        stem = os.path.splitext(self.img_files[idx])[0]
        label_path = os.path.join(self.label_dir, stem + '.txt')
        boxes = []
        if os.path.exists(label_path):
            with open(label_path, 'r') as f:
                for line in f.readlines():
                    parts = line.strip().split()
                    if len(parts) >= 5:
                        cls = int(parts[0])
                        x_center, y_center, w_box, h_box = map(float, parts[1:5])
                        boxes.append([cls, x_center, y_center, w_box, h_box])
        # 3. Letterbox: scale while preserving aspect ratio, then pad.
        scale = min(self.img_size / w, self.img_size / h)
        new_w, new_h = int(w * scale), int(h * scale)
        img = cv2.resize(img, (new_w, new_h), interpolation=cv2.INTER_LINEAR)
        # Grey canvas (114 is the conventional YOLO padding value; 0 for black).
        padded_img = np.full((self.img_size, self.img_size, 3), 114, dtype=np.uint8)
        # Integer division keeps the paste offsets usable as array indices.
        pad_x = (self.img_size - new_w) // 2
        pad_y = (self.img_size - new_h) // 2
        padded_img[pad_y:pad_y + new_h, pad_x:pad_x + new_w] = img
        # 4. Remap the labels into the letterboxed frame.
        for box in boxes:
            # box: [cls, x_center, y_center, w, h]
            # Scale centers into the resized image, shift by the padding,
            # then renormalize by the final canvas size.
            box[1] = (box[1] * w * scale + pad_x) / self.img_size
            box[2] = (box[2] * h * scale + pad_y) / self.img_size
            # Width/height are unaffected by padding, only by the scale.
            box[3] = box[3] * w * scale / self.img_size
            box[4] = box[4] * h * scale / self.img_size
        # 5. Augmentation: random horizontal flip (padding is symmetric, so
        # mirroring the normalized x center is exact).
        if self.augment and np.random.rand() > 0.5:
            padded_img = cv2.flip(padded_img, 1)
            for box in boxes:
                box[1] = 1.0 - box[1]
        # 6. HWC uint8 -> CHW float tensor in [0, 1].
        img_tensor = torch.from_numpy(padded_img).permute(2, 0, 1).float() / 255.0
        # 7. Build the S x S YOLO target grid.
        num_classes = 1
        target = torch.zeros((self.S, self.S, 5 + num_classes))
        for box in boxes:
            cls, x_center, y_center, w_box, h_box = box
            cls = int(cls)
            grid_x = int(x_center * self.S)
            grid_y = int(y_center * self.S)
            if grid_x < self.S and grid_y < self.S:
                # One object per cell: the first box claims the cell.
                if target[grid_y, grid_x, 0] == 0:
                    target[grid_y, grid_x, 0] = 1
                    # Center offset relative to the cell's top-left corner.
                    x_offset = x_center * self.S - grid_x
                    y_offset = y_center * self.S - grid_y
                    target[grid_y, grid_x, 1:5] = torch.tensor([x_offset, y_offset, w_box, h_box])
                    target[grid_y, grid_x, 5 + cls] = 1
        return img_tensor, target
# ===== train.py: loss definition and training loop =====
import torch
import torch.nn as nn
import torch.optim as optim
from torch.utils.data import DataLoader
from yolo import YOLOv1
from dataset import YOLODataset
# ==========================================
# 1. YOLO loss class (logic kept as before)
# ==========================================
class YOLOLoss(nn.Module):
    """Sum-squared-error YOLO v1 style loss.

    Expects predictions of shape (N, S, S, B*5 + num_classes), where each
    cell is B * [x, y, w, h, conf] followed by class scores, and targets of
    shape (N, S, S, 5 + num_classes) laid out as
    [objectness, x, y, w, h, one-hot classes].
    """

    def __init__(self, S=7, B=2, num_classes=1, lambda_coord=5, lambda_noobj=0.5):
        super(YOLOLoss, self).__init__()
        self.S = S
        self.B = B
        self.num_classes = num_classes
        self.lambda_coord = lambda_coord  # weight on coordinate errors
        self.lambda_noobj = lambda_noobj  # down-weight for empty-cell confidence

    def forward(self, preds, targets):
        """Return the batch-mean loss (coordinate + confidence + class terms)."""
        batch_size = preds.size(0)
        total_loss = 0
        for b in range(batch_size):
            pred = preds[b]
            target = targets[b]
            for i in range(self.S):
                for j in range(self.S):
                    cell_target = target[i, j]
                    has_obj = cell_target[0] > 0.5
                    pred_class = pred[i, j, self.B * 5:]
                    if has_obj:
                        # BUG FIX: the dataset stores the class as a one-hot
                        # vector at target indices [5:], so the class index is
                        # the argmax of that slice.  The original read
                        # int(cell_target[5]) -- the one-hot VALUE (always 1
                        # for class 0) -- which mislabeled every object.
                        target_class = int(torch.argmax(cell_target[5:]).item())
                        # Only the first predicted box is made responsible.
                        bbox_idx = 0
                        pred_x = pred[i, j, bbox_idx * 5]
                        pred_y = pred[i, j, bbox_idx * 5 + 1]
                        pred_w = pred[i, j, bbox_idx * 5 + 2]
                        pred_h = pred[i, j, bbox_idx * 5 + 3]
                        pred_conf = pred[i, j, bbox_idx * 5 + 4]
                        target_x = cell_target[1]
                        target_y = cell_target[2]
                        target_w = cell_target[3]
                        target_h = cell_target[4]
                        coord_loss = (pred_x - target_x) ** 2 + (pred_y - target_y) ** 2
                        coord_loss += (pred_w - target_w) ** 2 + (pred_h - target_h) ** 2
                        coord_loss *= self.lambda_coord
                        # Confidence target is 1 where an object exists.
                        conf_loss = (pred_conf - 1.0) ** 2
                        class_loss = 0
                        for c in range(self.num_classes):
                            label = 1.0 if c == target_class else 0.0
                            class_loss += (pred_class[c] - label) ** 2
                        total_loss += coord_loss + conf_loss + class_loss
                    else:
                        # Empty cell: push every box's confidence toward 0.
                        for b_idx in range(self.B):
                            pred_conf = pred[i, j, b_idx * 5 + 4]
                            total_loss += (pred_conf ** 2) * self.lambda_noobj
        return total_loss / batch_size
# ==========================================
# 2. Training entry point (best-checkpoint saving logic reworked)
# ==========================================
def train():
    """Train YOLOv1 on the local dataset, checkpointing the best-val-loss weights."""
    device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
    print(f"🚀 使用设备: {device}")

    model = YOLOv1(num_classes=1, S=7, B=2).to(device)
    criterion = YOLOLoss(S=7, B=2, num_classes=1)
    # Small learning rate to keep the loss from collapsing to 0.
    optimizer = optim.SGD(model.parameters(), lr=0.0001, momentum=0.9)

    loaders = {
        "train": DataLoader(
            YOLODataset('./dataset/images/train', './dataset/labels/train', S=7, augment=True),
            batch_size=4, shuffle=True, num_workers=0),
        "val": DataLoader(
            YOLODataset('./dataset/images/val', './dataset/labels/val', S=7, augment=False),
            batch_size=4, shuffle=False, num_workers=0),
    }

    # Track the best model seen so far on the validation set.
    best_val_loss = float('inf')
    best_epoch = 0
    print("🔥 开始训练...")
    # More epochs to compensate for the smaller learning rate.
    for epoch in range(100):
        # --- Training pass ---
        model.train()
        running = 0.0
        for imgs, targets in loaders["train"]:
            imgs, targets = imgs.to(device), targets.to(device)
            optimizer.zero_grad()
            loss = criterion(model(imgs), targets)
            loss.backward()
            optimizer.step()
            running += loss.item()
        avg_train_loss = running / len(loaders["train"])

        # --- Validation pass ---
        model.eval()
        running = 0.0
        with torch.no_grad():
            for imgs, targets in loaders["val"]:
                imgs, targets = imgs.to(device), targets.to(device)
                running += criterion(model(imgs), targets).item()
        avg_val_loss = running / len(loaders["val"])

        print(f"Epoch [{epoch + 1}/100] | Train Loss: {avg_train_loss:.4f} | Val Loss: {avg_val_loss:.4f}")

        # Save only when the validation loss improves; state_dict() stores
        # weights alone (no model structure / Python environment).
        if avg_val_loss < best_val_loss:
            best_val_loss = avg_val_loss
            best_epoch = epoch + 1
            torch.save(model.state_dict(), "yolov1_best.pth")
            print(f"✨ 发现更好的模型!验证集 Loss 降至 {avg_val_loss:.4f},已保存为 yolov1_best.pth")

    print(f"🏁 训练结束!最优模型出现在第 {best_epoch} 轮,最低验证集 Loss 为 {best_val_loss:.4f}")


if __name__ == "__main__":
    train()