# Image Classification in Practice: CIFAR10 / ImageNet
## 1. CIFAR10 Classification
### 1.1 Dataset Preparation and Preprocessing
```python
import torch
from torch.utils.data import DataLoader
from torchvision import datasets, transforms

# Data augmentation strategy for training
train_transform = transforms.Compose([
    transforms.RandomCrop(32, padding=4),
    transforms.RandomHorizontalFlip(),
    transforms.ToTensor(),
    transforms.Normalize((0.4914, 0.4822, 0.4465), (0.2023, 0.1994, 0.2010))
])
test_transform = transforms.Compose([
    transforms.ToTensor(),
    transforms.Normalize((0.4914, 0.4822, 0.4465), (0.2023, 0.1994, 0.2010))
])

# Load the datasets
train_dataset = datasets.CIFAR10(
    root='./data',
    train=True,
    download=True,
    transform=train_transform
)
test_dataset = datasets.CIFAR10(
    root='./data',
    train=False,
    download=True,
    transform=test_transform
)

# Create the data loaders
train_loader = DataLoader(train_dataset, batch_size=128, shuffle=True, num_workers=4)
test_loader = DataLoader(test_dataset, batch_size=100, shuffle=False, num_workers=2)
```
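Before training, a one-batch sanity check confirms that the augmentations and loader produce the expected shapes and value range (a quick throwaway snippet, not part of the pipeline itself):

```python
# Pull a single batch and inspect it
images, labels = next(iter(train_loader))
print(images.shape)   # torch.Size([128, 3, 32, 32])
print(labels.shape)   # torch.Size([128])
print(images.min().item(), images.max().item())  # roughly -2.4 .. 2.8 after normalization
```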
#### 1.1.1 Class Distribution Visualization
```mermaid
pie
    title CIFAR10 class distribution (% of samples)
    "Airplane" : 10
    "Automobile" : 10
    "Bird" : 10
    "Cat" : 10
    "Deer" : 10
    "Dog" : 10
    "Frog" : 10
    "Horse" : 10
    "Ship" : 10
    "Truck" : 10
```
### 1.2 Model Architecture (ResNet-18 as an Example)
```python
import torch.nn as nn
import torch.nn.functional as F

class BasicBlock(nn.Module):
    expansion = 1

    def __init__(self, in_planes, planes, stride=1):
        super(BasicBlock, self).__init__()
        self.conv1 = nn.Conv2d(
            in_planes, planes, kernel_size=3, stride=stride, padding=1, bias=False)
        self.bn1 = nn.BatchNorm2d(planes)
        self.conv2 = nn.Conv2d(planes, planes, kernel_size=3,
                               stride=1, padding=1, bias=False)
        self.bn2 = nn.BatchNorm2d(planes)
        # Projection shortcut when the spatial size or channel count changes
        self.shortcut = nn.Sequential()
        if stride != 1 or in_planes != self.expansion * planes:
            self.shortcut = nn.Sequential(
                nn.Conv2d(in_planes, self.expansion * planes,
                          kernel_size=1, stride=stride, bias=False),
                nn.BatchNorm2d(self.expansion * planes)
            )

    def forward(self, x):
        out = F.relu(self.bn1(self.conv1(x)))
        out = self.bn2(self.conv2(out))
        out += self.shortcut(x)
        out = F.relu(out)
        return out

class ResNet(nn.Module):
    def __init__(self, block, num_blocks, num_classes=10):
        super(ResNet, self).__init__()
        self.in_planes = 64
        # 3x3 stem without max-pooling: the usual CIFAR variant of ResNet
        self.conv1 = nn.Conv2d(3, 64, kernel_size=3,
                               stride=1, padding=1, bias=False)
        self.bn1 = nn.BatchNorm2d(64)
        self.layer1 = self._make_layer(block, 64, num_blocks[0], stride=1)
        self.layer2 = self._make_layer(block, 128, num_blocks[1], stride=2)
        self.layer3 = self._make_layer(block, 256, num_blocks[2], stride=2)
        self.layer4 = self._make_layer(block, 512, num_blocks[3], stride=2)
        self.linear = nn.Linear(512 * block.expansion, num_classes)

    def _make_layer(self, block, planes, num_blocks, stride):
        strides = [stride] + [1] * (num_blocks - 1)
        layers = []
        for stride in strides:
            layers.append(block(self.in_planes, planes, stride))
            self.in_planes = planes * block.expansion
        return nn.Sequential(*layers)

    def forward(self, x):
        out = F.relu(self.bn1(self.conv1(x)))
        out = self.layer1(out)
        out = self.layer2(out)
        out = self.layer3(out)
        out = self.layer4(out)
        out = F.adaptive_avg_pool2d(out, (1, 1))
        out = out.view(out.size(0), -1)
        out = self.linear(out)
        return out

model = ResNet(BasicBlock, [2, 2, 2, 2])  # ResNet-18: two BasicBlocks per stage
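```

A quick shape and parameter-count check is cheap insurance before committing to a long run (a throwaway sketch, not part of the training code):

```python
model.eval()
with torch.no_grad():
    logits = model(torch.randn(2, 3, 32, 32))  # dummy CIFAR-sized batch
print(logits.shape)  # torch.Size([2, 10])
n_params = sum(p.numel() for p in model.parameters())
print(f'{n_params / 1e6:.1f}M parameters')  # ~11.2M, consistent with the table below
```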
### 1.3 Training Configuration and Results
```python
device = torch.device('cuda' if torch.cuda.is_available() else 'cpu')
model = model.to(device)

optimizer = torch.optim.SGD(model.parameters(), lr=0.1, momentum=0.9, weight_decay=5e-4)
scheduler = torch.optim.lr_scheduler.CosineAnnealingLR(optimizer, T_max=200)
criterion = nn.CrossEntropyLoss()

# Training loop
for epoch in range(200):
    model.train()
    for inputs, targets in train_loader:
        inputs, targets = inputs.to(device), targets.to(device)
        optimizer.zero_grad()
        outputs = model(inputs)
        loss = criterion(outputs, targets)
        loss.backward()
        optimizer.step()
    scheduler.step()

    # Evaluate on the test set
    model.eval()
    correct = 0
    total = 0
    with torch.no_grad():
        for inputs, targets in test_loader:
            inputs, targets = inputs.to(device), targets.to(device)
            outputs = model(inputs)
            _, predicted = outputs.max(1)
            total += targets.size(0)
            correct += predicted.eq(targets).sum().item()
    print(f'Epoch {epoch+1} | Test Acc: {100.*correct/total:.2f}%')
```
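The loop above only prints accuracy; in practice you usually also want to keep the best checkpoint. A minimal addition (the `best_acc` variable and file name are our own choices):

```python
# At the end of each epoch, after computing `correct` / `total`:
acc = 100. * correct / total
if acc > best_acc:  # initialize best_acc = 0.0 before the loop
    best_acc = acc
    torch.save({'epoch': epoch,
                'model_state': model.state_dict(),
                'optimizer_state': optimizer.state_dict(),
                'acc': acc},
               'cifar10_resnet18_best.pth')
```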
#### 1.3.1 Performance Comparison
| Model | Params | Accuracy | Training time (1×V100) |
|---|---|---|---|
| ResNet-18 | 11M | 94.5% | 25 min |
| ResNet-50 | 25M | 95.2% | 45 min |
| EfficientNet-B0 | 5M | 95.8% | 35 min |
## 2. Large-Scale ImageNet Classification
### 2.1 Data Preparation and Distributed Loading
```python
from torchvision.datasets import ImageNet
from torch.utils.data.distributed import DistributedSampler

# Stronger data augmentation than for CIFAR10
train_transform = transforms.Compose([
    transforms.RandomResizedCrop(224),
    transforms.RandomHorizontalFlip(),
    transforms.ColorJitter(brightness=0.4, contrast=0.4, saturation=0.4),
    transforms.ToTensor(),
    transforms.Normalize(mean=[0.485, 0.456, 0.406],
                         std=[0.229, 0.224, 0.225])
])

# Distributed data loading
train_dataset = ImageNet(root='/path/to/imagenet', split='train', transform=train_transform)
train_sampler = DistributedSampler(train_dataset)
train_loader = DataLoader(
    train_dataset,
    batch_size=256,          # per-GPU batch size
    sampler=train_sampler,
    num_workers=8,
    pin_memory=True
)
```
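`DistributedSampler` assumes the process group is already initialized, and its `set_epoch` must be called once per epoch so that shuffling differs between epochs. A minimal sketch of the surrounding DDP boilerplate, assuming the job is launched with `torchrun` (which sets the `RANK`/`LOCAL_RANK` environment variables):

```python
import os
import torch.distributed as dist
from torch.nn.parallel import DistributedDataParallel as DDP

dist.init_process_group(backend='nccl')  # reads rank/world size from torchrun's env vars
local_rank = int(os.environ['LOCAL_RANK'])
torch.cuda.set_device(local_rank)

model = model.cuda(local_rank)
model = DDP(model, device_ids=[local_rank])

for epoch in range(90):                  # e.g. 90 epochs
    train_sampler.set_epoch(epoch)       # reshuffle differently each epoch
    for inputs, targets in train_loader:
        ...  # forward/backward as usual; DDP averages gradients across ranks
```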
### 2.2 Efficient Training Strategies
#### 2.2.1 Mixed-Precision Training
```python
from torch.cuda.amp import GradScaler, autocast

scaler = GradScaler()
for inputs, targets in train_loader:
    inputs = inputs.cuda(non_blocking=True)
    targets = targets.cuda(non_blocking=True)
    optimizer.zero_grad()
    with autocast():                       # run the forward pass in float16 where safe
        outputs = model(inputs)
        loss = criterion(outputs, targets)
    scaler.scale(loss).backward()          # scale the loss to avoid float16 underflow
    scaler.step(optimizer)                 # unscales gradients, then steps
    scaler.update()
```
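If you combine AMP with gradient clipping (see the FAQ on NaN losses below), the gradients must be unscaled first; otherwise the clipping threshold is applied to the scaled values. The two extra lines, replacing the last three scaler calls above (the `max_norm=1.0` value is just an example):

```python
scaler.scale(loss).backward()
scaler.unscale_(optimizer)                 # restore true gradient magnitudes
torch.nn.utils.clip_grad_norm_(model.parameters(), max_norm=1.0)
scaler.step(optimizer)                     # skips the step if gradients are inf/NaN
scaler.update()
```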
#### 2.2.2 Learning-Rate Schedule
```python
# Linear warm-up followed by cosine annealing; step this scheduler once per epoch
warmup_epochs = 5
scheduler = torch.optim.lr_scheduler.SequentialLR(
    optimizer,
    [
        torch.optim.lr_scheduler.LinearLR(
            optimizer, start_factor=0.01, total_iters=warmup_epochs),
        torch.optim.lr_scheduler.CosineAnnealingLR(
            optimizer, T_max=100 - warmup_epochs)
    ],
    milestones=[warmup_epochs]
)
```
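Since the scheduler is stepped once per epoch, it is cheap to dry-run it against a throwaway optimizer and confirm the warm-up/annealing shape before launching a long job:

```python
import torch

params = [torch.nn.Parameter(torch.zeros(1))]
opt = torch.optim.SGD(params, lr=0.1)
sched = torch.optim.lr_scheduler.SequentialLR(
    opt,
    [torch.optim.lr_scheduler.LinearLR(opt, start_factor=0.01, total_iters=5),
     torch.optim.lr_scheduler.CosineAnnealingLR(opt, T_max=95)],
    milestones=[5]
)
lrs = []
for _ in range(100):
    lrs.append(opt.param_groups[0]['lr'])
    opt.step()    # schedulers expect an optimizer step before each scheduler step
    sched.step()
print(lrs[0], max(lrs), lrs[-1])  # ~0.001 -> 0.1 -> ~0
```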
### 2.3 Validation and Metrics
```python
def validate(model, val_loader):
    # AverageMeter tracks a running average; a minimal implementation is given below
    top1 = AverageMeter('Acc@1', ':6.2f')
    top5 = AverageMeter('Acc@5', ':6.2f')
    model.eval()
    with torch.no_grad():
        for inputs, targets in val_loader:
            inputs = inputs.cuda()
            targets = targets.cuda()
            outputs = model(inputs)
            acc1, acc5 = accuracy(outputs, targets, topk=(1, 5))
            top1.update(acc1[0], inputs.size(0))
            top5.update(acc5[0], inputs.size(0))
    print(f' * Acc@1 {top1.avg:.3f} Acc@5 {top5.avg:.3f}')
    return top1.avg

def accuracy(output, target, topk=(1,)):
    maxk = max(topk)
    batch_size = target.size(0)
    # Indices of the maxk highest logits per sample, transposed to (maxk, batch)
    _, pred = output.topk(maxk, 1, True, True)
    pred = pred.t()
    correct = pred.eq(target.view(1, -1).expand_as(pred))
    res = []
    for k in topk:
        correct_k = correct[:k].reshape(-1).float().sum(0)
        res.append(correct_k.mul_(100.0 / batch_size))
    return res
```
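`AverageMeter` is not defined in the snippet above; it is the helper class from the official PyTorch ImageNet example. A minimal compatible version:

```python
class AverageMeter:
    """Tracks the current value and the running average of a metric."""

    def __init__(self, name, fmt=':f'):
        self.name = name
        self.fmt = fmt
        self.val = 0
        self.avg = 0
        self.sum = 0
        self.count = 0

    def update(self, val, n=1):
        self.val = val
        self.sum += val * n
        self.count += n
        self.avg = self.sum / self.count
```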
#### 2.3.1 Mainstream Model Performance
| Model | Top-1 Acc | Params | Training epochs | Hardware |
|---|---|---|---|---|
| ResNet-50 | 76.5% | 25M | 90 | 4×V100 |
| EfficientNet-B4 | 82.9% | 19M | 350 | 8×TPUv3 |
| ViT-Base | 85.2% | 86M | 300 | 16×TPUv3 |
## 3. Deployment and Optimization
### 3.1 TorchScript Export
```python
# Export to a deployable format
model = model.eval()
example_input = torch.rand(1, 3, 224, 224)
traced_model = torch.jit.trace(model, example_input)
traced_model.save("imagenet_model.pt")
```
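Loading the exported artifact requires no Python class definitions, which is the main point of tracing. A quick round-trip check:

```python
loaded = torch.jit.load("imagenet_model.pt")
loaded.eval()
with torch.no_grad():
    out = loaded(torch.rand(1, 3, 224, 224))
print(out.shape)  # torch.Size([1, 1000]) for an ImageNet head
```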
### 3.2 Quantization for Speed
```python
# Dynamic quantization: weights stored as int8, activations quantized on the fly.
# Note: dynamic quantization only covers a few layer types such as nn.Linear;
# convolutions require post-training static quantization or QAT instead.
quantized_model = torch.quantization.quantize_dynamic(
    model,
    {nn.Linear},
    dtype=torch.qint8
)
# Check how much accuracy the quantized model retains
# (dynamically quantized models run on CPU, so feed CPU tensors here)
validate(quantized_model, val_loader)
```
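A simple way to see what quantization buys is to compare serialized model sizes (the file names here are arbitrary):

```python
import os

torch.save(model.state_dict(), 'fp32.pth')
torch.save(quantized_model.state_dict(), 'int8.pth')
print(f"fp32: {os.path.getsize('fp32.pth') / 1e6:.1f} MB")
print(f"int8: {os.path.getsize('int8.pth') / 1e6:.1f} MB")
# For a conv-heavy model like ResNet-50, dynamic quantization only shrinks the
# final fc layer; large savings require static quantization of the conv layers.
```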
### 3.3 Serving Architecture
```mermaid
graph TD
    A[Client request] --> B[Load balancer]
    B --> C[Model server 1]
    B --> D[Model server 2]
    B --> E[Model server 3]
    C --> F[Result aggregation]
    D --> F
    E --> F
    F --> G[Return prediction]
    style A fill:#9f9,stroke:#333
    style G fill:#f99,stroke:#333
```
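Each model server in the diagram can be as small as one HTTP endpoint wrapping the traced model. A minimal sketch using FastAPI (our choice for illustration; TorchServe or Triton are common alternatives), assuming the `imagenet_model.pt` file from section 3.1:

```python
import io

import torch
from fastapi import FastAPI, UploadFile
from PIL import Image
from torchvision import transforms

app = FastAPI()
model = torch.jit.load("imagenet_model.pt").eval()
preprocess = transforms.Compose([
    transforms.Resize(256),
    transforms.CenterCrop(224),
    transforms.ToTensor(),
    transforms.Normalize(mean=[0.485, 0.456, 0.406],
                         std=[0.229, 0.224, 0.225]),
])

@app.post("/predict")
async def predict(file: UploadFile):
    # Decode the uploaded image, preprocess, and run a single forward pass
    image = Image.open(io.BytesIO(await file.read())).convert("RGB")
    batch = preprocess(image).unsqueeze(0)       # (1, 3, 224, 224)
    with torch.no_grad():
        probs = model(batch).softmax(dim=1)
    top5 = probs.topk(5)
    return {"classes": top5.indices[0].tolist(),
            "probs": top5.values[0].tolist()}
```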
## Appendix: Key Formulas

Cross-entropy loss:

$$L = -\sum_{i=1}^{N} y_i \log(p_i)$$

Cosine learning-rate schedule:

$$\eta_t = \eta_{\min} + \frac{1}{2}\left(\eta_{\max} - \eta_{\min}\right)\left(1 + \cos\left(\frac{T_{cur}}{T_{max}}\pi\right)\right)$$

Top-k accuracy:

$$\text{Top-}k = \frac{1}{N} \sum_{i=1}^{N} \mathbb{I}\left(\text{true label}_i \in \text{top-}k\ \text{predictions}_i\right)$$
## Frequently Asked Questions

**Q: How do I handle class imbalance?**
- Use a weighted sampler (see the sketch below)
- Reweight the loss per class
- Use Focal Loss
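A sketch of the first two options, assuming a dataset that exposes integer labels as `.targets` the way torchvision's CIFAR10 does (CIFAR10 itself is balanced, so this matters for imbalanced datasets with the same interface):

```python
from collections import Counter
from torch.utils.data import DataLoader, WeightedRandomSampler

# Sample inversely to class frequency so rare classes appear as often as common ones
counts = Counter(train_dataset.targets)
weights = [1.0 / counts[label] for label in train_dataset.targets]
sampler = WeightedRandomSampler(weights, num_samples=len(weights), replacement=True)
# A sampler replaces shuffle=True
train_loader = DataLoader(train_dataset, batch_size=128, sampler=sampler, num_workers=4)

# Alternatively, reweight the loss itself
class_weights = torch.tensor([1.0 / counts[c] for c in range(len(counts))])
criterion = nn.CrossEntropyLoss(weight=class_weights)
```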
**Q: What should I do when the loss becomes NaN during training?**
- Check data normalization
- Lower the learning rate
- Add gradient clipping (with AMP, unscale the gradients first; see section 2.2.1)
- Check weight initialization

**Q: How do I choose a data augmentation strategy?**
- Small datasets: stronger augmentation (CutMix, AutoAugment)
- Large datasets: moderate augmentation (random crop, horizontal flip)
- Domain-specific augmentation (e.g. elastic deformation for medical images)
**Best-practice summary:**
- CIFAR10 is well suited to rapid prototyping
- ImageNet-scale training calls for distributed data loading and mixed precision
- Model compression is critical for deployment
- Pretrained weights speed up convergence considerably
```python
# Fine-tuning from a pretrained model
from torchvision.models import resnet50

model = resnet50(weights='IMAGENET1K_V2')
# Replace the classification head (set num_classes to your task's class count)
model.fc = nn.Linear(model.fc.in_features, num_classes)
```
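When the target dataset is small, a common recipe (not part of the snippet above) is to freeze the pretrained backbone first and train only the new head:

```python
for param in model.parameters():
    param.requires_grad = False          # freeze the pretrained backbone...
model.fc = nn.Linear(model.fc.in_features, num_classes)  # ...the new head is trainable by default
optimizer = torch.optim.SGD(model.fc.parameters(), lr=0.01, momentum=0.9)
```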
The complete code and pretrained models for this tutorial are open-sourced on GitHub; stars and contributions are welcome! ⭐️ [project link]