sagemaker中使用pytorch框架的DLC训练和部署cifar图像分类任务

参考资料

获取训练数据

py 复制代码
# s3://zhaojiew-sagemaker/data/cifar10/cifar-10-python.tar.gz
import torch
import torchvision
import torchvision.transforms as transforms

def _get_transform():
    return transforms.Compose(
        [transforms.ToTensor(),
         transforms.Normalize((0.5, 0.5, 0.5), (0.5, 0.5, 0.5))])

# 这里加载数据用的路径是/tmp/pytorch-example/cifar-10-data实际下载了tar.gz文件到本地/tmp目录,后续training也要放入tar.gz文件路径
def get_train_data_loader(data_dir='/tmp/pytorch/cifar-10-data'):
    transform=_get_transform()
    trainset=torchvision.datasets.CIFAR10(root=data_dir, train=True,
                                            download=True, transform=transform)
    return torch.utils.data.DataLoader(trainset, batch_size=4,
                                       shuffle=True, num_workers=2)


def get_test_data_loader(data_dir='/tmp/pytorch/cifar-10-data'):
    transform=_get_transform()
    testset=torchvision.datasets.CIFAR10(root=data_dir, train=False,
                                           download=True, transform=transform)
    return torch.utils.data.DataLoader(testset, batch_size=4,
                                       shuffle=False, num_workers=2)

trainloader=get_train_data_loader('/tmp/pytorch-example/cifar-10-data')
testloader=get_test_data_loader('/tmp/pytorch-example/cifar-10-data')

显示加载的数据

py 复制代码
import numpy as np
import torchvision, torch
import matplotlib.pyplot as plt

def imshow(img):
    img = img / 2 + 0.5  # unnormalize
    npimg = img.numpy()
    plt.imshow(np.transpose(npimg, (1, 2, 0)))

# get some random training images
dataiter = iter(trainloader)
images, labels = next(dataiter)

# show images
imshow(torchvision.utils.make_grid(images))

# print labels
classes = ("plane", "car", "bird", "cat", "deer", "dog", "frog", "horse", "ship", "truck")
print(" ".join("%9s" % classes[labels[j]] for j in range(4)))

训练和推理脚本

脚本同时用来进行训练和推理任务,推理部分的实现为model_fn,没有实现input_fn等函数

py 复制代码
import ast
import argparse
import logging

import os

import torch
import torch.distributed as dist
import torch.nn as nn
import torch.nn.parallel
import torch.optim
import torch.utils.data
import torch.utils.data.distributed
import torchvision
import torchvision.models
import torchvision.transforms as transforms
import torch.nn.functional as F

logger=logging.getLogger(__name__)
logger.setLevel(logging.DEBUG)

classes=('plane', 'car', 'bird', 'cat', 'deer', 'dog', 'frog', 'horse', 'ship', 'truck')


# https://github.com/pytorch/tutorials/blob/master/beginner_source/blitz/cifar10_tutorial.py#L118
class Net(nn.Module):
    def __init__(self):
        super(Net, self).__init__()
        self.conv1=nn.Conv2d(3, 6, 5)
        self.pool=nn.MaxPool2d(2, 2)
        self.conv2=nn.Conv2d(6, 16, 5)
        self.fc1=nn.Linear(16 * 5 * 5, 120)
        self.fc2=nn.Linear(120, 84)
        self.fc3=nn.Linear(84, 10)

    def forward(self, x):
        x=self.pool(F.relu(self.conv1(x)))
        x=self.pool(F.relu(self.conv2(x)))
        x=x.view(-1, 16 * 5 * 5)
        x=F.relu(self.fc1(x))
        x=F.relu(self.fc2(x))
        x=self.fc3(x)
        return x


def _train(args):
    is_distributed=len(args.hosts) > 1 and args.dist_backend is not None
    logger.debug("Distributed training - {}".format(is_distributed))

    if is_distributed:
        # Initialize the distributed environment.
        world_size=len(args.hosts)
        os.environ['WORLD_SIZE']=str(world_size)
        host_rank=args.hosts.index(args.current_host)
        dist.init_process_group(backend=args.dist_backend, rank=host_rank, world_size=world_size)
        logger.info(
            'Initialized the distributed environment: \'{}\' backend on {} nodes. '.format(
                args.dist_backend,
                dist.get_world_size()) + 'Current host rank is {}. Using cuda: {}. Number of gpus: {}'.format(
                dist.get_rank(), torch.cuda.is_available(), args.num_gpus))

    device='cuda' if torch.cuda.is_available() else 'cpu'
    logger.info("Device Type: {}".format(device))

    logger.info("Loading Cifar10 dataset")
    transform=transforms.Compose(
        [transforms.ToTensor(),
         transforms.Normalize((0.5, 0.5, 0.5), (0.5, 0.5, 0.5))])
    trainset=torchvision.datasets.CIFAR10(root=args.data_dir, train=True,
                                            download=False, transform=transform)
    train_loader=torch.utils.data.DataLoader(trainset, batch_size=args.batch_size,
                                               shuffle=True, num_workers=args.workers)

    testset=torchvision.datasets.CIFAR10(root=args.data_dir, train=False,
                                           download=False, transform=transform)
    test_loader=torch.utils.data.DataLoader(testset, batch_size=args.batch_size,
                                              shuffle=False, num_workers=args.workers)

    logger.info("Model loaded")
    model=Net()

    if torch.cuda.device_count() > 1:
        logger.info("Gpu count: {}".format(torch.cuda.device_count()))
        model=nn.DataParallel(model)

    model=model.to(device)

    criterion=nn.CrossEntropyLoss().to(device)
    optimizer=torch.optim.SGD(model.parameters(), lr=args.lr, momentum=args.momentum)

    for epoch in range(0, args.epochs):
        running_loss=0.0
        for i, data in enumerate(train_loader):
            # get the inputs
            inputs, labels=data
            inputs, labels=inputs.to(device), labels.to(device)

            # zero the parameter gradients
            optimizer.zero_grad()

            # forward + backward + optimize
            outputs=model(inputs)
            loss=criterion(outputs, labels)
            loss.backward()
            optimizer.step()

            # print statistics
            running_loss += loss.item()
            if i % 2000 == 1999:  # print every 2000 mini-batches
                print('[%d, %5d] loss: %.3f' %
                      (epoch + 1, i + 1, running_loss / 2000))
                running_loss=0.0
    print('Finished Training')
    return _save_model(model, args.model_dir)


def _save_model(model, model_dir):
    logger.info("Saving the model.")
    path=os.path.join(model_dir, 'model.pth')
    # recommended way from http://pytorch.org/docs/master/notes/serialization.html
    torch.save(model.cpu().state_dict(), path)


def model_fn(model_dir):
    logger.info('model_fn triggered, starting to load model...')
    device="cuda" if torch.cuda.is_available() else "cpu"
    model=Net()
    if torch.cuda.device_count() > 1:
        logger.info("Gpu count: {}".format(torch.cuda.device_count()))
        model=nn.DataParallel(model)

    with open(os.path.join(model_dir, 'model.pth'), 'rb') as f:
        model.load_state_dict(torch.load(f))
    return model.to(device)


if __name__ == '__main__':
    parser=argparse.ArgumentParser()

    parser.add_argument('--workers', type=int, default=2, metavar='W',
                        help='number of data loading workers (default: 2)')
    parser.add_argument('--epochs', type=int, default=2, metavar='E',
                        help='number of total epochs to run (default: 2)')
    parser.add_argument('--batch-size', type=int, default=4, metavar='BS',
                        help='batch size (default: 4)')
    parser.add_argument('--lr', type=float, default=0.001, metavar='LR',
                        help='initial learning rate (default: 0.001)')
    parser.add_argument('--momentum', type=float, default=0.9, metavar='M', help='momentum (default: 0.9)')
    parser.add_argument('--dist-backend', type=str, default='gloo', help='distributed backend (default: gloo)')

    # The parameters below retrieve their default values from SageMaker environment variables, which are
    # instantiated by the SageMaker containers framework.
    # https://github.com/aws/sagemaker-containers#how-a-script-is-executed-inside-the-container
    parser.add_argument('--hosts', type=str, default=ast.literal_eval(os.environ['SM_HOSTS']))
    parser.add_argument('--current-host', type=str, default=os.environ['SM_CURRENT_HOST'])
    parser.add_argument('--model-dir', type=str, default=os.environ['SM_MODEL_DIR'])
    parser.add_argument('--data-dir', type=str, default=os.environ['SM_CHANNEL_TRAINING'])
    parser.add_argument('--num-gpus', type=int, default=os.environ['SM_NUM_GPUS'])

    _train(parser.parse_args())

模型训练

提前获取pytorch镜像

  • 托管的DLC中内置了training toolkit和inference toolkit,因此只需要按照规范提供训练和推理脚本即可
py 复制代码
from sagemaker import get_execution_role

role=get_execution_role()

from sagemaker import image_uris
image_uri_inference = image_uris.retrieve(framework='pytorch',region='cn-north-1',version='1.8.0',py_version='py3',image_scope='inference', instance_type='ml.c5.4xlarge')
image_uri_train = image_uris.retrieve(framework='pytorch',region='cn-north-1',version='1.8.0',py_version='py3',image_scope='training', instance_type='ml.c5.4xlarge')
print(image_uri_inference)
print(image_uri_train)

创建Estimator

py 复制代码
from sagemaker.estimator import Estimator

# 超参数实际上会作为训练脚本的参数传入,可以通过argparse进行解析
hyperparameters = {
    'epochs': 1,
}

# 使用通用的Estimator,
estimator=Estimator(
    image_uri=image_uri_train, # 这里可以使用托管镜像或基于托管的扩展镜像
    role=role,
    instance_count=1,
    instance_type='ml.p3.2xlarge',
    hyperparameters=hyperparameters,
    source_dir="src",
    entry_point="cifar10.py"
    # model_uri="s3://zhaojiew-sagemaker/model/cifar10-pytorch/" # 如果有pre-trained的模型可以使用此参数导入

)
# 在本地测试训练任务,实际上是通过docker-compose运行
#estimator.fit('file:///tmp/pytorch-example/cifar-10-data')
# 提交train任务
estimator.fit('s3://zhaojiew-tmp/cifar-10-data/',)

也可以使用PyTorch的Estimator

py 复制代码
from sagemaker.pytorch.estimator import PyTorch
# 也可以使用PyTorch
pytorch_estimator = PyTorch(
    entry_point='cifar10.py',
    instance_type='ml.p3.2xlarge',
    instance_count=1,
    role=role,
    framework_version='1.8.0',
    py_version='py3',
    hyperparameters=hyperparameters
)
pytorch_estimator.fit('s3://zhaojiew-tmp/cifar-10-data/')

最终存储的模型位置为

复制代码
model_location = 's3://sagemaker-cn-north-1-xxxxxxx/pytorch-training-2024-11-19-09-56-55-508/output/model.tar.gz'

模型部署

实际上可以直接基于estimator进行部署,但是这里导入模型将两个阶段分开

python 复制代码
from sagemaker.pytorch.model import PyTorchModel

pytorch_model = PyTorchModel(
    # 指定模型所在位置
    model_data=model_location,
    role=role,
    image_uri=image_uri_inference,
    entry_point='cifar10.py', # 如果指定了推理脚本会打包为source.tar.gz并和model.tar.gz合并成一个tar文件
    source_dir="src" # 指定代码所在目录
)
pytorch_predictor = pytorch_model.deploy(instance_type='ml.m5.xlarge', initial_instance_count=1)

也可以使用更通用的Model

py 复制代码
from sagemaker.model import Model

model = Model(
    # # 指定模型所在位置
    model_data=model_location,
    image_uri=image_uri_inference,
    role=role,
    entry_point="cifar10.py",
    source_dir="src"
)

model_predictor=model.deploy(1, "ml.m5.xlarge")

模型调用

如果predictor丢失,可以通过如下方法重建

py 复制代码
from sagemaker.predictor import Predictor
from sagemaker.serializers import NumpySerializer
from sagemaker.deserializers import NumpyDeserializer

model_predictor = Predictor(
    endpoint_name="pytorch-inference-2024-11-19-14-19-49-678"
)
model_predictor.serializer = NumpySerializer()
model_predictor.deserializer = NumpyDeserializer()

使用测试集测试

py 复制代码
# get some test images
dataiter = iter(testloader)
images, labels = next(dataiter)

# print images
imshow(torchvision.utils.make_grid(images))
print("GroundTruth: ", " ".join("%4s" % classes[labels[j]] for j in range(4)))

outputs = model_predictor.predict(images.numpy())
_, predicted = torch.max(torch.from_numpy(np.array(outputs)), 1)

print("Predicted: ", " ".join("%4s" % classes[predicted[j]] for j in range(4)))

由于模型部署后仅仅是在机器学习实例上启动容器,因此也可以在本地测试,例如以下docker-compose文件

yaml 复制代码
networks:
  sagemaker-local:
    name: sagemaker-local
services:
  localendpoint:
    command: serve # 也可以忽略,默认为serve
    container_name: localendpoint
    environment:
    - AWS_REGION=cn-north-1
    - SAGEMAKER_PROGRAM=cifar10.py
    - S3_ENDPOINT_URL=https://s3.cn-north-1.amazonaws.com.cn
    - SAGEMAKER_SUBMIT_DIRECTORY=/opt/ml/model/code
    image: 727897471807.dkr.ecr.cn-north-1.amazonaws.com.cn/pytorch-inference:1.8.0-cpu-py3
    ports:
    - 8080:8080
    networks:
      sagemaker-local:
    volumes:
    - ./src/cifar10.py:/opt/ml/model/code/cifar10.py
    - ./model/model.pth:/opt/ml/model/model.pth
version: '2.3'

但是这只能测试推理服务器能够正常启动,实际调用由于无法使用boto3和sagemaker sdk,可能需要手动封装http请求

python 复制代码
import numpy as np
import torch
import requests
from io import BytesIO

buffer = BytesIO()
np.save(buffer, images.numpy(), allow_pickle=False)
payload = buffer.getvalue()

local_url = "http://localhost:8080/invocations"
try:
    response = requests.post(
        local_url,
        data=payload,
        headers={
            'Content-Type': 'application/x-npy'
        }
    )
    response.raise_for_status()
    result = np.frombuffer(response.content, dtype=np.float32)
    print(result)
except Exception as e:
    print(f"发生错误: {e}")
相关推荐
逛逛GitHub28 分钟前
飞书多维表“独立”了!功能强大的超出想象。
人工智能·github·产品
机器之心41 分钟前
刚刚,DeepSeek-R1论文登上Nature封面,通讯作者梁文锋
人工智能·openai
aneasystone本尊3 小时前
学习 Chat2Graph 的知识库服务
人工智能
IT_陈寒3 小时前
Redis 性能翻倍的 7 个冷门技巧,第 5 个大多数人都不知道!
前端·人工智能·后端
飞哥数智坊14 小时前
GPT-5-Codex 发布,Codex 正在取代 Claude
人工智能·ai编程
倔强青铜三14 小时前
苦练Python第46天:文件写入与上下文管理器
人工智能·python·面试
虫无涯15 小时前
Dify Agent + AntV 实战:从 0 到 1 打造数据可视化解决方案
人工智能
Dm_dotnet17 小时前
公益站Agent Router注册送200刀额度竟然是真的
人工智能
算家计算17 小时前
7B参数拿下30个世界第一!Hunyuan-MT-7B本地部署教程:腾讯混元开源业界首个翻译集成模型
人工智能·开源
机器之心17 小时前
LLM开源2.0大洗牌:60个出局,39个上桌,AI Coding疯魔,TensorFlow已死
人工智能·openai