Accelerate_deepspeed使用

执行完 `accelerate config` 之后，会生成一个默认的 YAML 配置文件。

Python 示例代码：
from accelerate import Accelerator
import torch
import torch.nn as nn
import torch.optim as optim
from torch.utils.data import DataLoader
from torchvision import datasets, transforms, models

# Create the Accelerator and take the device it exposes for this process
accelerator = Accelerator()
device = accelerator.device

# Normalization constants shared by the training and evaluation pipelines
CIFAR10_MEAN = (0.4914, 0.4822, 0.4465)
CIFAR10_STD = (0.2023, 0.1994, 0.2010)

# Training preprocessing: random augmentation followed by normalization
transform = transforms.Compose([
    transforms.RandomHorizontalFlip(),
    transforms.RandomCrop(32, padding=4),
    transforms.ToTensor(),
    transforms.Normalize(CIFAR10_MEAN, CIFAR10_STD),
])

# Evaluation preprocessing: no random flips/crops, so test metrics are
# reproducible and measured on the unmodified images
test_transform = transforms.Compose([
    transforms.ToTensor(),
    transforms.Normalize(CIFAR10_MEAN, CIFAR10_STD),
])

# Load the datasets. The main process goes first so that only one process
# downloads into ./data; the others then find the files already present
# instead of racing on the same download.
with accelerator.main_process_first():
    train_dataset = datasets.CIFAR10(root='./data', train=True, download=True, transform=transform)
    test_dataset = datasets.CIFAR10(root='./data', train=False, download=True, transform=test_transform)

train_loader = DataLoader(train_dataset, batch_size=2048, shuffle=True, num_workers=4)
test_loader = DataLoader(test_dataset, batch_size=2048, shuffle=False, num_workers=4)

# Build the model, then move it onto this process's device
model = models.resnet18(num_classes=10)
model = model.to(device)
print(f'initial model device: {next(model.parameters()).device}')

# Loss function and optimizer
criterion = nn.CrossEntropyLoss()
optimizer = optim.SGD(
    model.parameters(),
    lr=0.1,
    momentum=0.9,
    weight_decay=5e-4,
)

# Hand the model, optimizer and both data loaders to accelerate for wrapping
model, optimizer, train_loader, test_loader = accelerator.prepare(
    model,
    optimizer,
    train_loader,
    test_loader,
)

# Report where the prepared model and every training batch ended up
print(f"Model is on device: {next(model.parameters()).device}")
for batch_idx, (inputs, targets) in enumerate(train_loader):
    # A single print with a newline separator emits the same two lines
    print(
        f"batch_{batch_idx} inputs are on device: {inputs.device}",
        f"batch_{batch_idx} targets are on device: {targets.device}",
        sep="\n",
    )
    

# Training function
def train(epoch):
    """Run one pass over ``train_loader``, updating ``model`` in place.

    Uses the module-level ``model``, ``optimizer``, ``criterion``,
    ``train_loader`` and ``accelerator``.

    Args:
        epoch: Epoch index; only used to label the log output.
    """
    model.train()
    # The object returned by accelerator.prepare() may be a wrapper without a
    # ``.device`` attribute (a plain nn.Module has none), so read the device
    # from a parameter, the same way the placement checks in this script do.
    model_device = next(model.parameters()).device
    for batch_idx, (inputs, targets) in enumerate(train_loader):
        # Printed on every process so per-process placement is visible
        print(f'{epoch}:{batch_idx}', accelerator.is_main_process, batch_idx, inputs.device, targets.device, model_device)
        optimizer.zero_grad()
        outputs = model(inputs)
        loss = criterion(outputs, targets)
        # Backward goes through the accelerator rather than loss.backward()
        accelerator.backward(loss)
        optimizer.step()
        if batch_idx % 200 == 0:
            print(f'{accelerator.is_main_process}, Train Epoch: {epoch} [{batch_idx * len(inputs)}/{len(train_loader.dataset)} ({100. * batch_idx / len(train_loader):.0f}%)]\tLoss: {loss.item():.6f}')

# Evaluation function
def test():
    """Evaluate ``model`` on ``test_loader`` and print loss and accuracy.

    Uses the module-level ``model``, ``criterion``, ``test_loader`` and
    ``accelerator``. Outputs and labels from all processes are collected
    with ``accelerator.gather_for_metrics`` so that every process computes
    the same totals; the summary line is printed by the main process only.
    """
    model.eval()
    test_loss = 0.0
    correct = 0
    total = 0
    with torch.no_grad():
        for inputs, targets in test_loader:
            outputs = model(inputs)
            # gather_for_metrics collects the batch from every process and
            # drops the samples that were duplicated to pad the final batch,
            # so each test sample is counted exactly once.
            outputs, targets = accelerator.gather_for_metrics((outputs, targets))
            batch_size = targets.size(0)
            # criterion returns the batch mean; weight it by the batch size
            # so that dividing by `total` below yields a per-sample average.
            test_loss += criterion(outputs, targets).item() * batch_size
            correct += (outputs.argmax(dim=1) == targets).sum().item()
            total += batch_size
    # Guard against an empty loader instead of dividing by zero
    denominator = max(total, 1)
    test_loss /= denominator
    accuracy = 100. * correct / denominator
    if accelerator.is_main_process:
        print(f'Test set: Average loss: {test_loss:.4f}, Accuracy: {correct}/{total} ({accuracy:.0f}%)')

# Main loop: each epoch is one training pass followed by one evaluation pass
NUM_EPOCHS = 3
for epoch in range(NUM_EPOCHS):
    train(epoch)
    test()

print("Training completed.")
相关推荐
数据牧羊人的成长笔记1 天前
认识深度学习_PyTorch入门+神经网络基础+卷积神经网络+迁移学习+生成对抗网络_GAN+CNN目标检测+循环神经网络与NLP
pytorch·深度学习·神经网络
kishu_iOS&AI1 天前
NLP —— 文本预处理
人工智能·pytorch·python·自然语言处理
AI技术增长1 天前
Pytorch图像去噪实战(一):从0复现DnCNN并解决训练不收敛问题(附完整工程+踩坑总结)
人工智能·pytorch·python
bst@微胖子1 天前
PyTorch深度学习框架之基于CNN的手机价格分类任务
pytorch·深度学习·cnn
山顶夕景1 天前
【Agent】Openclaw架构(Gateway|subagent|工具过滤|Sandbox)
大模型·llm·agent·智能体·openclaw
Westward-sun.1 天前
YOLOv5 最新版从零配置环境到训练自己的数据集
人工智能·pytorch·深度学习·yolo
xiezhr1 天前
别被AI吓到了,一文看懂AI到底是啥?
人工智能·llm·openai
Irissgwe1 天前
LangChain之聊天模型核心能力(二)
人工智能·langchain·llm·langgraph
隔壁大炮1 天前
CNN图像分类案例
人工智能·pytorch·python·深度学习·算法·分类·cnn
山顶夕景2 天前
【Agent】Claude code架构和源码粗读分析
大模型·llm·agent·线程·通信协议