# PyTorch: Detailed Knowledge Summary

## 1. PyTorch Fundamentals

### 1.1 Tensors

- Tensors are the fundamental data structure in PyTorch
- Similar to multi-dimensional arrays, with GPU acceleration support
- Common operations:

```python
import torch

# Create tensors (float dtype so that .mean() below is valid)
x = torch.tensor([1.0, 2.0, 3.0])
y = torch.zeros(2, 3)
z = torch.randn(3, 4)

# Tensor operations
a = x + y               # broadcasting: (3,) + (2, 3) -> (2, 3)
b = torch.matmul(y, z)  # matrix multiplication: (2, 3) @ (3, 4) -> (2, 4)
c = x.mean()            # mean of all elements
```
### 1.2 Automatic Differentiation (Autograd)

- PyTorch's core mechanism for automatically computing gradients
- Implements backpropagation through a dynamic computation graph
- Example:

```python
x = torch.ones(2, 2, requires_grad=True)
y = x + 2
z = y * y
z.backward(torch.ones_like(z))  # non-scalar output needs an explicit gradient argument
print(x.grad)  # dz/dx = 2 * (x + 2) = 6 at x = 1
```
## 2. Building Neural Networks

### 2.1 nn.Module

- Base class for all neural networks in PyTorch
- Holds the layer definitions and the forward pass
- Example:

```python
import torch.nn as nn

class Net(nn.Module):
    def __init__(self):
        super(Net, self).__init__()
        self.fc1 = nn.Linear(784, 128)
        self.fc2 = nn.Linear(128, 10)
        self.relu = nn.ReLU()

    def forward(self, x):
        x = self.relu(self.fc1(x))
        x = self.fc2(x)
        return x
```
### 2.2 Common Layers

- Linear: fully connected layer
- Conv2d: 2D convolution layer
- MaxPool2d: max pooling layer
- BatchNorm2d: batch normalization
- Dropout: regularization against overfitting (all five are combined in the sketch below)
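A minimal sketch combining these layers into a small image classifier; the input size, channel counts, and dropout rate are illustrative assumptions, not values from the original notes:

```python
import torch
import torch.nn as nn

class SmallCNN(nn.Module):
    def __init__(self, num_classes=10):
        super().__init__()
        self.features = nn.Sequential(
            nn.Conv2d(3, 16, kernel_size=3, padding=1),  # 2D convolution
            nn.BatchNorm2d(16),                          # batch normalization
            nn.ReLU(),
            nn.MaxPool2d(2),                             # max pooling, halves H and W
            nn.Dropout(0.25),                            # regularization
        )
        self.classifier = nn.Linear(16 * 16 * 16, num_classes)  # fully connected

    def forward(self, x):
        x = self.features(x)   # (N, 3, 32, 32) -> (N, 16, 16, 16)
        x = x.flatten(1)       # keep the batch dimension, flatten the rest
        return self.classifier(x)

# quick shape check: (2, 3, 32, 32) -> (2, 10)
out = SmallCNN()(torch.randn(2, 3, 32, 32))
```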
## 3. Data Handling

### 3.1 Dataset and DataLoader

- Dataset: defines the dataset
- DataLoader: loads data in batches
- Example:

```python
from torch.utils.data import Dataset, DataLoader

class CustomDataset(Dataset):
    def __init__(self, data, labels):
        self.data = data
        self.labels = labels

    def __len__(self):
        return len(self.data)

    def __getitem__(self, idx):
        return self.data[idx], self.labels[idx]

# Create the data loader (data, labels: your tensors or arrays)
dataset = CustomDataset(data, labels)
dataloader = DataLoader(dataset, batch_size=32, shuffle=True)
```
## 4. Model Training

### 4.1 Optimizers and Loss Functions

- Common optimizers: SGD, Adam
- Common loss functions: CrossEntropyLoss, MSELoss
- Example:

```python
criterion = nn.CrossEntropyLoss()
optimizer = torch.optim.Adam(model.parameters(), lr=0.001)

# Training loop
for epoch in range(num_epochs):
    for data, labels in dataloader:
        optimizer.zero_grad()
        outputs = model(data)
        loss = criterion(outputs, labels)
        loss.backward()
        optimizer.step()
```
### 4.2 Saving and Loading Models

```python
# Save the model weights
torch.save(model.state_dict(), 'model.pth')

# Load the model weights
model.load_state_dict(torch.load('model.pth'))
```
## 5. GPU Acceleration

### 5.1 Device Management

```python
device = torch.device('cuda' if torch.cuda.is_available() else 'cpu')
model = model.to(device)
data = data.to(device)
```
## 6. Advanced Features

### 6.1 Distributed Training

- DistributedDataParallel: multi-process, multi-GPU training (setup sketched below)
- DataParallel: simple single-process data parallelism
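A minimal DistributedDataParallel setup sketch, assuming one process per GPU launched with `torchrun` (which sets the RANK/LOCAL_RANK/WORLD_SIZE environment variables); `Net` is the model from section 2.1:

```python
import os
import torch
import torch.distributed as dist
from torch.nn.parallel import DistributedDataParallel as DDP

dist.init_process_group(backend='nccl')       # NCCL backend for GPU training
local_rank = int(os.environ['LOCAL_RANK'])
torch.cuda.set_device(local_rank)

model = Net().cuda(local_rank)
model = DDP(model, device_ids=[local_rank])   # gradients sync across processes

# ...run the usual training loop; pair with DistributedSampler in the DataLoader...
dist.destroy_process_group()
```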
### 6.2 TorchScript

- Converts PyTorch models into an optimizable, serializable format
- Supports deployment in production environments (see the sketch below)
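A minimal sketch of both points, scripting the `Net` model from section 2.1 and reloading it as a standalone artifact:

```python
scripted = torch.jit.script(Net())           # compile the model to TorchScript
scripted.save('net_scripted.pt')             # self-contained, no Python source needed

loaded = torch.jit.load('net_scripted.pt')   # e.g., inside a serving process
output = loaded(torch.randn(1, 784))
```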
### 6.3 Model Quantization

- Reduces model size and inference latency
- Supports dynamic and static quantization (dynamic sketched below)
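A minimal dynamic-quantization sketch (in recent PyTorch releases the same API also lives under `torch.ao.quantization`):

```python
import torch.quantization

# Dynamic quantization: weights stored as int8, activations quantized at runtime.
quantized_model = torch.quantization.quantize_dynamic(
    Net(),          # Net from section 2.1; dynamic quantization targets CPU inference
    {nn.Linear},    # module types to quantize
    dtype=torch.qint8,
)
```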
## 7. Debugging and Optimization

### 7.1 Memory Optimization

- Use del to release tensors that are no longer needed (see the sketch below)
- Use torch.no_grad() to avoid building the autograd graph when gradients aren't needed
- Use gradient accumulation to handle large effective batch sizes (see section 10.1)
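A short sketch of the first two points (the tensor names are illustrative):

```python
with torch.no_grad():        # inference: no autograd graph is stored
    features = model(data)

stats = features.mean()
del features                 # drop the reference so the allocator can reuse the memory
torch.cuda.empty_cache()     # release cached GPU blocks back to the driver
```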
### 7.2 Performance Profiling

- torch.autograd.profiler (see the sketch below)
- nvprof for GPU-level profiling
- Memory-leak detection
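A minimal sketch using `torch.autograd.profiler` as listed above:

```python
import torch.autograd.profiler as profiler

with profiler.profile(record_shapes=True) as prof:
    model(torch.randn(32, 784))  # the workload to profile

# operators sorted by total CPU time
print(prof.key_averages().table(sort_by='cpu_time_total', row_limit=10))
```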
## 8. Best Practices

### 8.1 Code Conventions

- Use nn.Sequential to organize layers (see the sketch below)
- Use nn.ModuleList and nn.ParameterList where appropriate, so submodules and parameters are registered correctly
- Handle the batch dimension consistently
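A short sketch of the first two points; nn.ModuleList (unlike a plain Python list) registers its contents so they appear in `model.parameters()`:

```python
# nn.Sequential: a fixed pipeline of layers
block = nn.Sequential(
    nn.Linear(784, 128),
    nn.ReLU(),
    nn.Linear(128, 10),
)

# nn.ModuleList: layers used with custom control flow in forward()
class DeepNet(nn.Module):
    def __init__(self, num_layers=4, dim=128):
        super().__init__()
        self.layers = nn.ModuleList(nn.Linear(dim, dim) for _ in range(num_layers))

    def forward(self, x):
        for layer in self.layers:
            x = torch.relu(layer(x))
        return x
```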
### 8.2 Training Techniques

- Learning-rate scheduling (see section 10.2)
- Early stopping (see section 9.3)
- Model ensembling (see section 9.7)
- Cross-validation (see the sketch after this list)
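A minimal k-fold cross-validation sketch, assuming scikit-learn is available for the fold split; `dataset` and `Net` come from earlier sections:

```python
from sklearn.model_selection import KFold
from torch.utils.data import DataLoader, Subset

kfold = KFold(n_splits=5, shuffle=True, random_state=42)
for fold, (train_idx, val_idx) in enumerate(kfold.split(range(len(dataset)))):
    train_loader = DataLoader(Subset(dataset, train_idx), batch_size=32, shuffle=True)
    val_loader = DataLoader(Subset(dataset, val_idx), batch_size=32)
    model = Net()  # a fresh model per fold
    # ...train on train_loader, then evaluate on val_loader...
```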
### 8.3 Deployment Considerations

- Model export (ONNX), sketched below
- Serving deployment
- Mobile deployment
- Edge-device deployment
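A minimal ONNX export sketch for the `Net` model from section 2.1 (the file and tensor names are illustrative):

```python
model = Net().eval()
dummy_input = torch.randn(1, 784)  # fixes the input signature for tracing
torch.onnx.export(
    model, dummy_input, 'model.onnx',
    input_names=['input'], output_names=['output'],
    dynamic_axes={'input': {0: 'batch'}, 'output': {0: 'batch'}},  # variable batch size
)
```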
## 9. Common Problems and Solutions

### 9.1 Data Preprocessing

```python
# Image preprocessing
from torchvision import transforms

transform = transforms.Compose([
    transforms.Resize((224, 224)),
    transforms.ToTensor(),
    transforms.Normalize(mean=[0.485, 0.456, 0.406],
                         std=[0.229, 0.224, 0.225]),
])

# Text preprocessing
from torch.nn.utils.rnn import pad_sequence

def text_preprocess(text_list, vocab):
    # Convert each text to a list of token indices
    indices = [[vocab[word] for word in text.split()] for text in text_list]
    # Pad the sequences to equal length
    padded = pad_sequence([torch.tensor(x) for x in indices], batch_first=True)
    return padded
```
### 9.2 Model Evaluation

```python
def evaluate_model(model, test_loader, criterion, device):
    model.eval()
    total_loss = 0
    correct = 0
    total = 0
    with torch.no_grad():
        for data, target in test_loader:
            data, target = data.to(device), target.to(device)
            output = model(data)
            total_loss += criterion(output, target).item()
            pred = output.argmax(dim=1, keepdim=True)
            correct += pred.eq(target.view_as(pred)).sum().item()
            total += target.size(0)
    avg_loss = total_loss / len(test_loader)
    accuracy = 100. * correct / total
    return avg_loss, accuracy
```
### 9.3 Early Stopping Implementation

```python
class EarlyStopping:
    def __init__(self, patience=7, min_delta=0):
        self.patience = patience
        self.min_delta = min_delta
        self.counter = 0
        self.best_loss = None
        self.early_stop = False

    def __call__(self, val_loss):
        if self.best_loss is None:
            self.best_loss = val_loss
        elif val_loss > self.best_loss - self.min_delta:
            # no sufficient improvement this epoch
            self.counter += 1
            if self.counter >= self.patience:
                self.early_stop = True
        else:
            self.best_loss = val_loss
            self.counter = 0
```
### 9.4 Training Monitoring

```python
import matplotlib.pyplot as plt

class TrainingMonitor:
    def __init__(self):
        self.history = {
            'train_loss': [],
            'val_loss': [],
            'accuracy': [],
        }

    def update(self, metrics):
        for k, v in metrics.items():
            self.history[k].append(v)

    def plot_metrics(self):
        epochs = range(1, len(self.history['train_loss']) + 1)
        plt.figure(figsize=(12, 4))

        plt.subplot(1, 2, 1)
        plt.plot(epochs, self.history['train_loss'], 'b-', label='Training Loss')
        plt.plot(epochs, self.history['val_loss'], 'r-', label='Validation Loss')
        plt.title('Training and Validation Loss')
        plt.xlabel('Epochs')
        plt.ylabel('Loss')
        plt.legend()

        plt.subplot(1, 2, 2)
        plt.plot(epochs, self.history['accuracy'], 'g-', label='Accuracy')
        plt.title('Model Accuracy')
        plt.xlabel('Epochs')
        plt.ylabel('Accuracy')
        plt.legend()

        plt.tight_layout()
        plt.show()
```
### 9.5 Handling Common Errors

```python
# 1. CUDA out-of-memory
try:
    # large-batch forward pass
    output = model(large_input)
except RuntimeError as e:
    if "out of memory" in str(e):
        # free cached blocks
        torch.cuda.empty_cache()
        # retry in smaller chunks along the batch dimension
        outputs = [model(chunk) for chunk in large_input.split(2)]
        output = torch.cat(outputs)

# 2. Gradient explosion: clip gradients before optimizer.step()
torch.nn.utils.clip_grad_norm_(model.parameters(), max_norm=1.0)

# 3. Model-parallelization error handling
if torch.cuda.device_count() > 1:
    try:
        model = nn.DataParallel(model)
    except RuntimeError as e:
        print(f"Parallelization failed: {e}")
        # fall back to a single GPU
        model = model.to(device)
```
### 9.6 Performance Optimization Tips

```python
# 1. Mixed-precision training
from torch.cuda.amp import autocast, GradScaler

scaler = GradScaler()
for data, targets in train_loader:
    optimizer.zero_grad()
    with autocast():
        output = model(data)
        loss = criterion(output, targets)
    scaler.scale(loss).backward()
    scaler.step(optimizer)
    scaler.update()

# 2. Data-loading optimization
train_loader = DataLoader(
    dataset,
    batch_size=32,
    shuffle=True,
    num_workers=4,    # multi-process loading
    pin_memory=True,  # page-locked memory speeds up host-to-GPU copies
)

# 3. Inference optimization
model.eval()
with torch.no_grad():
    # compile the model with torch.jit.trace
    traced_model = torch.jit.trace(model, torch.randn(1, 3, 224, 224))
    output = traced_model(input_data)
```
### 9.7 Practical Application Examples

```python
import math
import numpy as np
import torch.nn.functional as F

# 1. Transfer learning
from torchvision.models import resnet50

def create_transfer_model(num_classes):
    model = resnet50(pretrained=True)  # newer torchvision: resnet50(weights='DEFAULT')
    # Freeze the pretrained parameters
    for param in model.parameters():
        param.requires_grad = False
    # Replace the final layer
    model.fc = nn.Linear(model.fc.in_features, num_classes)
    return model

# 2. Model ensembling
class EnsembleModel(nn.Module):
    def __init__(self, models):
        super().__init__()
        self.models = nn.ModuleList(models)

    def forward(self, x):
        outputs = [model(x) for model in self.models]
        return torch.stack(outputs).mean(0)

# 3. Custom loss function
class FocalLoss(nn.Module):
    def __init__(self, alpha=1, gamma=2):
        super().__init__()
        self.alpha = alpha
        self.gamma = gamma

    def forward(self, inputs, targets):
        ce_loss = F.cross_entropy(inputs, targets, reduction='none')
        pt = torch.exp(-ce_loss)  # probability assigned to the true class
        focal_loss = self.alpha * (1 - pt) ** self.gamma * ce_loss
        return focal_loss.mean()

# 4. Self-attention
class SelfAttention(nn.Module):
    def __init__(self, dim):
        super().__init__()
        self.query = nn.Linear(dim, dim)
        self.key = nn.Linear(dim, dim)
        self.value = nn.Linear(dim, dim)

    def forward(self, x):
        q = self.query(x)
        k = self.key(x)
        v = self.value(x)
        # scaled dot-product attention
        scores = torch.matmul(q, k.transpose(-2, -1)) / math.sqrt(q.size(-1))
        attention = F.softmax(scores, dim=-1)
        return torch.matmul(attention, v)

# 5. GAN
class Generator(nn.Module):
    def __init__(self, latent_dim, img_shape):
        super().__init__()
        self.model = nn.Sequential(
            nn.Linear(latent_dim, 128),
            nn.LeakyReLU(0.2),
            nn.Linear(128, 256),
            nn.BatchNorm1d(256),
            nn.LeakyReLU(0.2),
            nn.Linear(256, int(np.prod(img_shape))),
            nn.Tanh(),
        )
        self.img_shape = img_shape

    def forward(self, z):
        img = self.model(z)
        return img.view(img.size(0), *self.img_shape)

class Discriminator(nn.Module):
    def __init__(self, img_shape):
        super().__init__()
        self.model = nn.Sequential(
            nn.Linear(int(np.prod(img_shape)), 256),
            nn.LeakyReLU(0.2),
            nn.Linear(256, 128),
            nn.LeakyReLU(0.2),
            nn.Linear(128, 1),
            nn.Sigmoid(),
        )

    def forward(self, img):
        img_flat = img.view(img.size(0), -1)
        return self.model(img_flat)
```
## 10. Advanced Training Techniques

### 10.1 Gradient Accumulation

```python
accumulation_steps = 4  # accumulate gradients over 4 mini-batches

optimizer.zero_grad()
for i, (data, target) in enumerate(train_loader):
    output = model(data)
    # scale the loss so the accumulated gradient matches the large-batch gradient
    loss = criterion(output, target) / accumulation_steps
    loss.backward()
    if (i + 1) % accumulation_steps == 0:
        optimizer.step()
        optimizer.zero_grad()
```
### 10.2 Learning-Rate Scheduling

```python
# 1. Cosine annealing
scheduler = torch.optim.lr_scheduler.CosineAnnealingLR(
    optimizer, T_max=200)

# 2. Cosine annealing with warm restarts
scheduler = torch.optim.lr_scheduler.CosineAnnealingWarmRestarts(
    optimizer, T_0=50, T_mult=2)

# 3. One-cycle learning rate
scheduler = torch.optim.lr_scheduler.OneCycleLR(
    optimizer,
    max_lr=0.1,
    steps_per_epoch=len(train_loader),
    epochs=num_epochs,
)

# call scheduler.step() once per epoch (for OneCycleLR: once per batch)
```
### 10.3 Knowledge Distillation

```python
class DistillationLoss(nn.Module):
    def __init__(self, alpha=0.5, temperature=2.0):
        super().__init__()
        self.alpha = alpha
        self.T = temperature

    def forward(self, student_outputs, teacher_outputs, targets):
        # Hard-target loss (ground-truth labels)
        hard_loss = F.cross_entropy(student_outputs, targets)
        # Soft-target loss (match the teacher's temperature-softened distribution)
        soft_loss = nn.KLDivLoss(reduction='batchmean')(
            F.log_softmax(student_outputs / self.T, dim=1),
            F.softmax(teacher_outputs / self.T, dim=1)
        ) * (self.T * self.T)
        return self.alpha * hard_loss + (1 - self.alpha) * soft_loss
```
### 10.4 Adversarial Training

```python
def fgsm_attack(data, epsilon, data_grad):
    # Take the sign of the input gradient
    sign_data_grad = data_grad.sign()
    # Create the perturbed sample
    perturbed_data = data + epsilon * sign_data_grad
    # Clamp to keep values in the [0, 1] range
    perturbed_data = torch.clamp(perturbed_data, 0, 1)
    return perturbed_data

def train_with_adversarial(model, train_loader, optimizer, epsilon):
    for data, target in train_loader:
        # Standard pass; track gradients w.r.t. the input so data.grad is populated
        data.requires_grad = True
        optimizer.zero_grad()
        output = model(data)
        loss = F.cross_entropy(output, target)
        loss.backward()
        # Generate adversarial samples from the input gradient
        perturbed_data = fgsm_attack(data.detach(), epsilon, data.grad.data)
        # Adversarial pass
        output = model(perturbed_data)
        loss = F.cross_entropy(output, target)
        loss.backward()
        optimizer.step()
```
### 10.5 Half-Precision Training

```python
# 1. Automatic mixed-precision training
scaler = torch.cuda.amp.GradScaler()
optimizer = torch.optim.Adam(model.parameters())

for data, target in train_loader:
    optimizer.zero_grad()
    # automatically cast eligible ops to half precision
    with torch.cuda.amp.autocast():
        output = model(data)
        loss = criterion(output, target)
    # scale the loss to prevent gradient underflow
    scaler.scale(loss).backward()
    scaler.step(optimizer)
    scaler.update()

# 2. Manual conversion to half precision
model.half()  # convert model parameters to FP16
for data, target in train_loader:
    data = data.half()  # convert inputs to FP16
    output = model(data)
    loss = criterion(output, target)
```