sagemaker中使用pytorch框架的DLC训练和部署cifar图像分类任务

参考资料

获取训练数据

py 复制代码
# s3://zhaojiew-sagemaker/data/cifar10/cifar-10-python.tar.gz
import torch
import torchvision
import torchvision.transforms as transforms

def _get_transform():
    return transforms.Compose(
        [transforms.ToTensor(),
         transforms.Normalize((0.5, 0.5, 0.5), (0.5, 0.5, 0.5))])

# 这里加载数据用的路径是/tmp/pytorch-example/cifar-10-data实际下载了tar.gz文件到本地/tmp目录,后续training也要放入tar.gz文件路径
def get_train_data_loader(data_dir='/tmp/pytorch/cifar-10-data'):
    transform=_get_transform()
    trainset=torchvision.datasets.CIFAR10(root=data_dir, train=True,
                                            download=True, transform=transform)
    return torch.utils.data.DataLoader(trainset, batch_size=4,
                                       shuffle=True, num_workers=2)


def get_test_data_loader(data_dir='/tmp/pytorch/cifar-10-data'):
    transform=_get_transform()
    testset=torchvision.datasets.CIFAR10(root=data_dir, train=False,
                                           download=True, transform=transform)
    return torch.utils.data.DataLoader(testset, batch_size=4,
                                       shuffle=False, num_workers=2)

trainloader=get_train_data_loader('/tmp/pytorch-example/cifar-10-data')
testloader=get_test_data_loader('/tmp/pytorch-example/cifar-10-data')

显示加载的数据

py 复制代码
import numpy as np
import torchvision, torch
import matplotlib.pyplot as plt

def imshow(img):
    img = img / 2 + 0.5  # unnormalize
    npimg = img.numpy()
    plt.imshow(np.transpose(npimg, (1, 2, 0)))

# get some random training images
dataiter = iter(trainloader)
images, labels = next(dataiter)

# show images
imshow(torchvision.utils.make_grid(images))

# print labels
classes = ("plane", "car", "bird", "cat", "deer", "dog", "frog", "horse", "ship", "truck")
print(" ".join("%9s" % classes[labels[j]] for j in range(4)))

训练和推理脚本

脚本同时用来进行训练和推理任务,推理部分的实现为model_fn,没有实现input_fn等函数

py 复制代码
import ast
import argparse
import logging

import os

import torch
import torch.distributed as dist
import torch.nn as nn
import torch.nn.parallel
import torch.optim
import torch.utils.data
import torch.utils.data.distributed
import torchvision
import torchvision.models
import torchvision.transforms as transforms
import torch.nn.functional as F

logger=logging.getLogger(__name__)
logger.setLevel(logging.DEBUG)

classes=('plane', 'car', 'bird', 'cat', 'deer', 'dog', 'frog', 'horse', 'ship', 'truck')


# https://github.com/pytorch/tutorials/blob/master/beginner_source/blitz/cifar10_tutorial.py#L118
class Net(nn.Module):
    def __init__(self):
        super(Net, self).__init__()
        self.conv1=nn.Conv2d(3, 6, 5)
        self.pool=nn.MaxPool2d(2, 2)
        self.conv2=nn.Conv2d(6, 16, 5)
        self.fc1=nn.Linear(16 * 5 * 5, 120)
        self.fc2=nn.Linear(120, 84)
        self.fc3=nn.Linear(84, 10)

    def forward(self, x):
        x=self.pool(F.relu(self.conv1(x)))
        x=self.pool(F.relu(self.conv2(x)))
        x=x.view(-1, 16 * 5 * 5)
        x=F.relu(self.fc1(x))
        x=F.relu(self.fc2(x))
        x=self.fc3(x)
        return x


def _train(args):
    is_distributed=len(args.hosts) > 1 and args.dist_backend is not None
    logger.debug("Distributed training - {}".format(is_distributed))

    if is_distributed:
        # Initialize the distributed environment.
        world_size=len(args.hosts)
        os.environ['WORLD_SIZE']=str(world_size)
        host_rank=args.hosts.index(args.current_host)
        dist.init_process_group(backend=args.dist_backend, rank=host_rank, world_size=world_size)
        logger.info(
            'Initialized the distributed environment: \'{}\' backend on {} nodes. '.format(
                args.dist_backend,
                dist.get_world_size()) + 'Current host rank is {}. Using cuda: {}. Number of gpus: {}'.format(
                dist.get_rank(), torch.cuda.is_available(), args.num_gpus))

    device='cuda' if torch.cuda.is_available() else 'cpu'
    logger.info("Device Type: {}".format(device))

    logger.info("Loading Cifar10 dataset")
    transform=transforms.Compose(
        [transforms.ToTensor(),
         transforms.Normalize((0.5, 0.5, 0.5), (0.5, 0.5, 0.5))])
    trainset=torchvision.datasets.CIFAR10(root=args.data_dir, train=True,
                                            download=False, transform=transform)
    train_loader=torch.utils.data.DataLoader(trainset, batch_size=args.batch_size,
                                               shuffle=True, num_workers=args.workers)

    testset=torchvision.datasets.CIFAR10(root=args.data_dir, train=False,
                                           download=False, transform=transform)
    test_loader=torch.utils.data.DataLoader(testset, batch_size=args.batch_size,
                                              shuffle=False, num_workers=args.workers)

    logger.info("Model loaded")
    model=Net()

    if torch.cuda.device_count() > 1:
        logger.info("Gpu count: {}".format(torch.cuda.device_count()))
        model=nn.DataParallel(model)

    model=model.to(device)

    criterion=nn.CrossEntropyLoss().to(device)
    optimizer=torch.optim.SGD(model.parameters(), lr=args.lr, momentum=args.momentum)

    for epoch in range(0, args.epochs):
        running_loss=0.0
        for i, data in enumerate(train_loader):
            # get the inputs
            inputs, labels=data
            inputs, labels=inputs.to(device), labels.to(device)

            # zero the parameter gradients
            optimizer.zero_grad()

            # forward + backward + optimize
            outputs=model(inputs)
            loss=criterion(outputs, labels)
            loss.backward()
            optimizer.step()

            # print statistics
            running_loss += loss.item()
            if i % 2000 == 1999:  # print every 2000 mini-batches
                print('[%d, %5d] loss: %.3f' %
                      (epoch + 1, i + 1, running_loss / 2000))
                running_loss=0.0
    print('Finished Training')
    return _save_model(model, args.model_dir)


def _save_model(model, model_dir):
    logger.info("Saving the model.")
    path=os.path.join(model_dir, 'model.pth')
    # recommended way from http://pytorch.org/docs/master/notes/serialization.html
    torch.save(model.cpu().state_dict(), path)


def model_fn(model_dir):
    logger.info('model_fn triggered, starting to load model...')
    device="cuda" if torch.cuda.is_available() else "cpu"
    model=Net()
    if torch.cuda.device_count() > 1:
        logger.info("Gpu count: {}".format(torch.cuda.device_count()))
        model=nn.DataParallel(model)

    with open(os.path.join(model_dir, 'model.pth'), 'rb') as f:
        model.load_state_dict(torch.load(f))
    return model.to(device)


if __name__ == '__main__':
    parser=argparse.ArgumentParser()

    parser.add_argument('--workers', type=int, default=2, metavar='W',
                        help='number of data loading workers (default: 2)')
    parser.add_argument('--epochs', type=int, default=2, metavar='E',
                        help='number of total epochs to run (default: 2)')
    parser.add_argument('--batch-size', type=int, default=4, metavar='BS',
                        help='batch size (default: 4)')
    parser.add_argument('--lr', type=float, default=0.001, metavar='LR',
                        help='initial learning rate (default: 0.001)')
    parser.add_argument('--momentum', type=float, default=0.9, metavar='M', help='momentum (default: 0.9)')
    parser.add_argument('--dist-backend', type=str, default='gloo', help='distributed backend (default: gloo)')

    # The parameters below retrieve their default values from SageMaker environment variables, which are
    # instantiated by the SageMaker containers framework.
    # https://github.com/aws/sagemaker-containers#how-a-script-is-executed-inside-the-container
    parser.add_argument('--hosts', type=str, default=ast.literal_eval(os.environ['SM_HOSTS']))
    parser.add_argument('--current-host', type=str, default=os.environ['SM_CURRENT_HOST'])
    parser.add_argument('--model-dir', type=str, default=os.environ['SM_MODEL_DIR'])
    parser.add_argument('--data-dir', type=str, default=os.environ['SM_CHANNEL_TRAINING'])
    parser.add_argument('--num-gpus', type=int, default=os.environ['SM_NUM_GPUS'])

    _train(parser.parse_args())

模型训练

提前获取pytorch镜像

  • 托管的DLC中内置了training toolkit和inference toolkit,因此只需要按照规范提供训练和推理脚本即可
py 复制代码
from sagemaker import get_execution_role

role=get_execution_role()

from sagemaker import image_uris
image_uri_inference = image_uris.retrieve(framework='pytorch',region='cn-north-1',version='1.8.0',py_version='py3',image_scope='inference', instance_type='ml.c5.4xlarge')
image_uri_train = image_uris.retrieve(framework='pytorch',region='cn-north-1',version='1.8.0',py_version='py3',image_scope='training', instance_type='ml.c5.4xlarge')
print(image_uri_inference)
print(image_uri_train)

创建Estimator

py 复制代码
from sagemaker.estimator import Estimator

# 超参数实际上会作为训练脚本的参数传入,可以通过argparse进行解析
hyperparameters = {
    'epochs': 1,
}

# 使用通用的Estimator,
estimator=Estimator(
    image_uri=image_uri_train, # 这里可以使用托管镜像或基于托管的扩展镜像
    role=role,
    instance_count=1,
    instance_type='ml.p3.2xlarge',
    hyperparameters=hyperparameters,
    source_dir="src",
    entry_point="cifar10.py"
    # model_uri="s3://zhaojiew-sagemaker/model/cifar10-pytorch/" # 如果有pre-trained的模型可以使用此参数导入

)
# 在本地测试训练任务,实际上是通过docker-compose运行
#estimator.fit('file:///tmp/pytorch-example/cifar-10-data')
# 提交train任务
estimator.fit('s3://zhaojiew-tmp/cifar-10-data/',)

也可以使用PyTorch的Estimator

py 复制代码
from sagemaker.pytorch.estimator import PyTorch
# 也可以使用PyTorch
pytorch_estimator = PyTorch(
    entry_point='cifar10.py',
    instance_type='ml.p3.2xlarge',
    instance_count=1,
    role=role,
    framework_version='1.8.0',
    py_version='py3',
    hyperparameters=hyperparameters
)
pytorch_estimator.fit('s3://zhaojiew-tmp/cifar-10-data/')

最终存储的模型位置为

复制代码
model_location = 's3://sagemaker-cn-north-1-xxxxxxx/pytorch-training-2024-11-19-09-56-55-508/output/model.tar.gz'

模型部署

实际上可以直接基于estimator进行部署,但是这里导入模型将两个阶段分开

python 复制代码
from sagemaker.pytorch.model import PyTorchModel

pytorch_model = PyTorchModel(
    # 指定模型所在位置
    model_data=model_location,
    role=role,
    image_uri=image_uri_inference,
    entry_point='cifar10.py', # 如果指定了推理脚本会打包为source.tar.gz并和model.tar.gz合并成一个tar文件
    source_dir="src" # 指定代码所在目录
)
pytorch_predictor = pytorch_model.deploy(instance_type='ml.m5.xlarge', initial_instance_count=1)

也可以使用更通用的Model

py 复制代码
from sagemaker.model import Model

model = Model(
    # # 指定模型所在位置
    model_data=model_location,
    image_uri=image_uri_inference,
    role=role,
    entry_point="cifar10.py",
    source_dir="src"
)

model_predictor=model.deploy(1, "ml.m5.xlarge")

模型调用

如果predictor丢失,可以通过如下方法重建

py 复制代码
from sagemaker.predictor import Predictor
from sagemaker.serializers import NumpySerializer
from sagemaker.deserializers import NumpyDeserializer

model_predictor = Predictor(
    endpoint_name="pytorch-inference-2024-11-19-14-19-49-678"
)
model_predictor.serializer = NumpySerializer()
model_predictor.deserializer = NumpyDeserializer()

使用测试集测试

py 复制代码
# get some test images
dataiter = iter(testloader)
images, labels = next(dataiter)

# print images
imshow(torchvision.utils.make_grid(images))
print("GroundTruth: ", " ".join("%4s" % classes[labels[j]] for j in range(4)))

outputs = model_predictor.predict(images.numpy())
_, predicted = torch.max(torch.from_numpy(np.array(outputs)), 1)

print("Predicted: ", " ".join("%4s" % classes[predicted[j]] for j in range(4)))

由于模型部署后仅仅是在机器学习实例上启动容器,因此也可以在本地测试,例如以下docker-compose文件

yaml 复制代码
networks:
  sagemaker-local:
    name: sagemaker-local
services:
  localendpoint:
    command: serve # 也可以忽略,默认为serve
    container_name: localendpoint
    environment:
    - AWS_REGION=cn-north-1
    - SAGEMAKER_PROGRAM=cifar10.py
    - S3_ENDPOINT_URL=https://s3.cn-north-1.amazonaws.com.cn
    - SAGEMAKER_SUBMIT_DIRECTORY=/opt/ml/model/code
    image: 727897471807.dkr.ecr.cn-north-1.amazonaws.com.cn/pytorch-inference:1.8.0-cpu-py3
    ports:
    - 8080:8080
    networks:
      sagemaker-local:
    volumes:
    - ./src/cifar10.py:/opt/ml/model/code/cifar10.py
    - ./model/model.pth:/opt/ml/model/model.pth
version: '2.3'

但是这只能测试推理服务器能够正常启动,实际调用由于无法使用boto3和sagemaker sdk,可能需要手动封装http请求

python 复制代码
import numpy as np
import torch
import requests
from io import BytesIO

buffer = BytesIO()
np.save(buffer, images.numpy(), allow_pickle=False)
payload = buffer.getvalue()

local_url = "http://localhost:8080/invocations"
try:
    response = requests.post(
        local_url,
        data=payload,
        headers={
            'Content-Type': 'application/x-npy'
        }
    )
    response.raise_for_status()
    result = np.frombuffer(response.content, dtype=np.float32)
    print(result)
except Exception as e:
    print(f"发生错误: {e}")
相关推荐
我是发哥哈5 分钟前
主流AI框架生产环境性能对比:5大关键维度深度评测
大数据·人工智能·学习·机器学习·ai·chatgpt·ai-native
隔壁大炮6 分钟前
Day07-RNN介绍
人工智能·pytorch·rnn·深度学习·神经网络·算法·numpy
羑悻的小杀马特12 分钟前
零成本搞定!异地访问 OpenClaw 最简方案:SSH 端口映射组网!
运维·服务器·人工智能·docker·自动化·ssh·openclaw
雷帝木木13 分钟前
Python 并发编程的高级技巧与性能优化
人工智能·python·深度学习·机器学习
ai大模型中转api测评16 分钟前
OpenAI再次定义生产力!GPT-image-2发布:当AI绘图学会思考
人工智能·gpt·自动化·api
jinanwuhuaguo16 分钟前
OpenClaw协议霸权——从 MCP 标准到意图封建化的政治经济学(第十八篇)
android·人工智能·kotlin·拓扑学·openclaw
belldeep20 分钟前
基于深度学习的中医药系统 与《本草纲目》结合应用
人工智能·深度学习·ai·中医药
GAMC20 分钟前
从 “凭感觉写代码” 到 “按规范做开发”:OpenSpec 让 AI 编程回归工程化
前端·人工智能
薛定猫AI23 分钟前
【技术干货】Claude Code 终端编程实战:从零搭建 Windows 高效 AI 开发环境
人工智能·windows
AI医影跨模态组学28 分钟前
Radiology子刊 暨南大学附属第一医院等团队:基于肿瘤和内脏脂肪组织CT特征的深度学习模型用于预测浆膜浸润性胃癌根治术后腹膜转移风险
人工智能·深度学习·论文·医学·医学影像