sagemaker中使用pytorch框架的DLC训练和部署cifar图像分类任务

参考资料

获取训练数据

py 复制代码
# s3://zhaojiew-sagemaker/data/cifar10/cifar-10-python.tar.gz
import torch
import torchvision
import torchvision.transforms as transforms

def _get_transform():
    return transforms.Compose(
        [transforms.ToTensor(),
         transforms.Normalize((0.5, 0.5, 0.5), (0.5, 0.5, 0.5))])

# 这里加载数据用的路径是/tmp/pytorch-example/cifar-10-data实际下载了tar.gz文件到本地/tmp目录,后续training也要放入tar.gz文件路径
def get_train_data_loader(data_dir='/tmp/pytorch/cifar-10-data'):
    transform=_get_transform()
    trainset=torchvision.datasets.CIFAR10(root=data_dir, train=True,
                                            download=True, transform=transform)
    return torch.utils.data.DataLoader(trainset, batch_size=4,
                                       shuffle=True, num_workers=2)


def get_test_data_loader(data_dir='/tmp/pytorch/cifar-10-data'):
    transform=_get_transform()
    testset=torchvision.datasets.CIFAR10(root=data_dir, train=False,
                                           download=True, transform=transform)
    return torch.utils.data.DataLoader(testset, batch_size=4,
                                       shuffle=False, num_workers=2)

trainloader=get_train_data_loader('/tmp/pytorch-example/cifar-10-data')
testloader=get_test_data_loader('/tmp/pytorch-example/cifar-10-data')

显示加载的数据

py 复制代码
import numpy as np
import torchvision, torch
import matplotlib.pyplot as plt

def imshow(img):
    img = img / 2 + 0.5  # unnormalize
    npimg = img.numpy()
    plt.imshow(np.transpose(npimg, (1, 2, 0)))

# get some random training images
dataiter = iter(trainloader)
images, labels = next(dataiter)

# show images
imshow(torchvision.utils.make_grid(images))

# print labels
classes = ("plane", "car", "bird", "cat", "deer", "dog", "frog", "horse", "ship", "truck")
print(" ".join("%9s" % classes[labels[j]] for j in range(4)))

训练和推理脚本

脚本同时用来进行训练和推理任务,推理部分的实现为model_fn,没有实现input_fn等函数

py 复制代码
import ast
import argparse
import logging

import os

import torch
import torch.distributed as dist
import torch.nn as nn
import torch.nn.parallel
import torch.optim
import torch.utils.data
import torch.utils.data.distributed
import torchvision
import torchvision.models
import torchvision.transforms as transforms
import torch.nn.functional as F

logger=logging.getLogger(__name__)
logger.setLevel(logging.DEBUG)

classes=('plane', 'car', 'bird', 'cat', 'deer', 'dog', 'frog', 'horse', 'ship', 'truck')


# https://github.com/pytorch/tutorials/blob/master/beginner_source/blitz/cifar10_tutorial.py#L118
class Net(nn.Module):
    def __init__(self):
        super(Net, self).__init__()
        self.conv1=nn.Conv2d(3, 6, 5)
        self.pool=nn.MaxPool2d(2, 2)
        self.conv2=nn.Conv2d(6, 16, 5)
        self.fc1=nn.Linear(16 * 5 * 5, 120)
        self.fc2=nn.Linear(120, 84)
        self.fc3=nn.Linear(84, 10)

    def forward(self, x):
        x=self.pool(F.relu(self.conv1(x)))
        x=self.pool(F.relu(self.conv2(x)))
        x=x.view(-1, 16 * 5 * 5)
        x=F.relu(self.fc1(x))
        x=F.relu(self.fc2(x))
        x=self.fc3(x)
        return x


def _train(args):
    is_distributed=len(args.hosts) > 1 and args.dist_backend is not None
    logger.debug("Distributed training - {}".format(is_distributed))

    if is_distributed:
        # Initialize the distributed environment.
        world_size=len(args.hosts)
        os.environ['WORLD_SIZE']=str(world_size)
        host_rank=args.hosts.index(args.current_host)
        dist.init_process_group(backend=args.dist_backend, rank=host_rank, world_size=world_size)
        logger.info(
            'Initialized the distributed environment: \'{}\' backend on {} nodes. '.format(
                args.dist_backend,
                dist.get_world_size()) + 'Current host rank is {}. Using cuda: {}. Number of gpus: {}'.format(
                dist.get_rank(), torch.cuda.is_available(), args.num_gpus))

    device='cuda' if torch.cuda.is_available() else 'cpu'
    logger.info("Device Type: {}".format(device))

    logger.info("Loading Cifar10 dataset")
    transform=transforms.Compose(
        [transforms.ToTensor(),
         transforms.Normalize((0.5, 0.5, 0.5), (0.5, 0.5, 0.5))])
    trainset=torchvision.datasets.CIFAR10(root=args.data_dir, train=True,
                                            download=False, transform=transform)
    train_loader=torch.utils.data.DataLoader(trainset, batch_size=args.batch_size,
                                               shuffle=True, num_workers=args.workers)

    testset=torchvision.datasets.CIFAR10(root=args.data_dir, train=False,
                                           download=False, transform=transform)
    test_loader=torch.utils.data.DataLoader(testset, batch_size=args.batch_size,
                                              shuffle=False, num_workers=args.workers)

    logger.info("Model loaded")
    model=Net()

    if torch.cuda.device_count() > 1:
        logger.info("Gpu count: {}".format(torch.cuda.device_count()))
        model=nn.DataParallel(model)

    model=model.to(device)

    criterion=nn.CrossEntropyLoss().to(device)
    optimizer=torch.optim.SGD(model.parameters(), lr=args.lr, momentum=args.momentum)

    for epoch in range(0, args.epochs):
        running_loss=0.0
        for i, data in enumerate(train_loader):
            # get the inputs
            inputs, labels=data
            inputs, labels=inputs.to(device), labels.to(device)

            # zero the parameter gradients
            optimizer.zero_grad()

            # forward + backward + optimize
            outputs=model(inputs)
            loss=criterion(outputs, labels)
            loss.backward()
            optimizer.step()

            # print statistics
            running_loss += loss.item()
            if i % 2000 == 1999:  # print every 2000 mini-batches
                print('[%d, %5d] loss: %.3f' %
                      (epoch + 1, i + 1, running_loss / 2000))
                running_loss=0.0
    print('Finished Training')
    return _save_model(model, args.model_dir)


def _save_model(model, model_dir):
    logger.info("Saving the model.")
    path=os.path.join(model_dir, 'model.pth')
    # recommended way from http://pytorch.org/docs/master/notes/serialization.html
    torch.save(model.cpu().state_dict(), path)


def model_fn(model_dir):
    logger.info('model_fn triggered, starting to load model...')
    device="cuda" if torch.cuda.is_available() else "cpu"
    model=Net()
    if torch.cuda.device_count() > 1:
        logger.info("Gpu count: {}".format(torch.cuda.device_count()))
        model=nn.DataParallel(model)

    with open(os.path.join(model_dir, 'model.pth'), 'rb') as f:
        model.load_state_dict(torch.load(f))
    return model.to(device)


if __name__ == '__main__':
    parser=argparse.ArgumentParser()

    parser.add_argument('--workers', type=int, default=2, metavar='W',
                        help='number of data loading workers (default: 2)')
    parser.add_argument('--epochs', type=int, default=2, metavar='E',
                        help='number of total epochs to run (default: 2)')
    parser.add_argument('--batch-size', type=int, default=4, metavar='BS',
                        help='batch size (default: 4)')
    parser.add_argument('--lr', type=float, default=0.001, metavar='LR',
                        help='initial learning rate (default: 0.001)')
    parser.add_argument('--momentum', type=float, default=0.9, metavar='M', help='momentum (default: 0.9)')
    parser.add_argument('--dist-backend', type=str, default='gloo', help='distributed backend (default: gloo)')

    # The parameters below retrieve their default values from SageMaker environment variables, which are
    # instantiated by the SageMaker containers framework.
    # https://github.com/aws/sagemaker-containers#how-a-script-is-executed-inside-the-container
    parser.add_argument('--hosts', type=str, default=ast.literal_eval(os.environ['SM_HOSTS']))
    parser.add_argument('--current-host', type=str, default=os.environ['SM_CURRENT_HOST'])
    parser.add_argument('--model-dir', type=str, default=os.environ['SM_MODEL_DIR'])
    parser.add_argument('--data-dir', type=str, default=os.environ['SM_CHANNEL_TRAINING'])
    parser.add_argument('--num-gpus', type=int, default=os.environ['SM_NUM_GPUS'])

    _train(parser.parse_args())

模型训练

提前获取pytorch镜像

  • 托管的DLC中内置了training toolkit和inference toolkit,因此只需要按照规范提供训练和推理脚本即可
py 复制代码
from sagemaker import get_execution_role

role=get_execution_role()

from sagemaker import image_uris
image_uri_inference = image_uris.retrieve(framework='pytorch',region='cn-north-1',version='1.8.0',py_version='py3',image_scope='inference', instance_type='ml.c5.4xlarge')
image_uri_train = image_uris.retrieve(framework='pytorch',region='cn-north-1',version='1.8.0',py_version='py3',image_scope='training', instance_type='ml.c5.4xlarge')
print(image_uri_inference)
print(image_uri_train)

创建Estimator

py 复制代码
from sagemaker.estimator import Estimator

# 超参数实际上会作为训练脚本的参数传入,可以通过argparse进行解析
hyperparameters = {
    'epochs': 1,
}

# 使用通用的Estimator,
estimator=Estimator(
    image_uri=image_uri_train, # 这里可以使用托管镜像或基于托管的扩展镜像
    role=role,
    instance_count=1,
    instance_type='ml.p3.2xlarge',
    hyperparameters=hyperparameters,
    source_dir="src",
    entry_point="cifar10.py"
    # model_uri="s3://zhaojiew-sagemaker/model/cifar10-pytorch/" # 如果有pre-trained的模型可以使用此参数导入

)
# 在本地测试训练任务,实际上是通过docker-compose运行
#estimator.fit('file:///tmp/pytorch-example/cifar-10-data')
# 提交train任务
estimator.fit('s3://zhaojiew-tmp/cifar-10-data/',)

也可以使用PyTorch的Estimator

py 复制代码
from sagemaker.pytorch.estimator import PyTorch
# 也可以使用PyTorch
pytorch_estimator = PyTorch(
    entry_point='cifar10.py',
    instance_type='ml.p3.2xlarge',
    instance_count=1,
    role=role,
    framework_version='1.8.0',
    py_version='py3',
    hyperparameters=hyperparameters
)
pytorch_estimator.fit('s3://zhaojiew-tmp/cifar-10-data/')

最终存储的模型位置为

复制代码
model_location = 's3://sagemaker-cn-north-1-xxxxxxx/pytorch-training-2024-11-19-09-56-55-508/output/model.tar.gz'

模型部署

实际上可以直接基于estimator进行部署,但是这里导入模型将两个阶段分开

python 复制代码
from sagemaker.pytorch.model import PyTorchModel

pytorch_model = PyTorchModel(
    # 指定模型所在位置
    model_data=model_location,
    role=role,
    image_uri=image_uri_inference,
    entry_point='cifar10.py', # 如果指定了推理脚本会打包为source.tar.gz并和model.tar.gz合并成一个tar文件
    source_dir="src" # 指定代码所在目录
)
pytorch_predictor = pytorch_model.deploy(instance_type='ml.m5.xlarge', initial_instance_count=1)

也可以使用更通用的Model

py 复制代码
from sagemaker.model import Model

model = Model(
    # # 指定模型所在位置
    model_data=model_location,
    image_uri=image_uri_inference,
    role=role,
    entry_point="cifar10.py",
    source_dir="src"
)

model_predictor=model.deploy(1, "ml.m5.xlarge")

模型调用

如果predictor丢失,可以通过如下方法重建

py 复制代码
from sagemaker.predictor import Predictor
from sagemaker.serializers import NumpySerializer
from sagemaker.deserializers import NumpyDeserializer

model_predictor = Predictor(
    endpoint_name="pytorch-inference-2024-11-19-14-19-49-678"
)
model_predictor.serializer = NumpySerializer()
model_predictor.deserializer = NumpyDeserializer()

使用测试集测试

py 复制代码
# get some test images
dataiter = iter(testloader)
images, labels = next(dataiter)

# print images
imshow(torchvision.utils.make_grid(images))
print("GroundTruth: ", " ".join("%4s" % classes[labels[j]] for j in range(4)))

outputs = model_predictor.predict(images.numpy())
_, predicted = torch.max(torch.from_numpy(np.array(outputs)), 1)

print("Predicted: ", " ".join("%4s" % classes[predicted[j]] for j in range(4)))

由于模型部署后仅仅是在机器学习实例上启动容器,因此也可以在本地测试,例如以下docker-compose文件

yaml 复制代码
networks:
  sagemaker-local:
    name: sagemaker-local
services:
  localendpoint:
    command: serve # 也可以忽略,默认为serve
    container_name: localendpoint
    environment:
    - AWS_REGION=cn-north-1
    - SAGEMAKER_PROGRAM=cifar10.py
    - S3_ENDPOINT_URL=https://s3.cn-north-1.amazonaws.com.cn
    - SAGEMAKER_SUBMIT_DIRECTORY=/opt/ml/model/code
    image: 727897471807.dkr.ecr.cn-north-1.amazonaws.com.cn/pytorch-inference:1.8.0-cpu-py3
    ports:
    - 8080:8080
    networks:
      sagemaker-local:
    volumes:
    - ./src/cifar10.py:/opt/ml/model/code/cifar10.py
    - ./model/model.pth:/opt/ml/model/model.pth
version: '2.3'

但是这只能测试推理服务器能够正常启动,实际调用由于无法使用boto3和sagemaker sdk,可能需要手动封装http请求

python 复制代码
import numpy as np
import torch
import requests
from io import BytesIO

buffer = BytesIO()
np.save(buffer, images.numpy(), allow_pickle=False)
payload = buffer.getvalue()

local_url = "http://localhost:8080/invocations"
try:
    response = requests.post(
        local_url,
        data=payload,
        headers={
            'Content-Type': 'application/x-npy'
        }
    )
    response.raise_for_status()
    result = np.frombuffer(response.content, dtype=np.float32)
    print(result)
except Exception as e:
    print(f"发生错误: {e}")
相关推荐
一点一木4 小时前
深度体验TRAE SOLO移动端7天:作为独立开发者,我把工作流揣进了兜里
前端·人工智能·trae
Lee川4 小时前
mini-cursor 揭秘:从 Tool 定义到 Agent 循环的完整实现
前端·人工智能·后端
weelinking5 小时前
【产品】00_产品经理用Claude实现产品系列介绍
数据库·人工智能·sql·数据挖掘·github·产品经理
Agent产品评测局5 小时前
制造业模具管理AI系统,主流产品能力对比详解:2026年智能制造选型深度洞察
人工智能·ai·chatgpt·制造
研华科技Advantech6 小时前
如何用一套实训设备,打通工业AI预测性维护技术全流程?
人工智能
Lab_AI6 小时前
AI for Science: MaXFlow AI Agent+ 报告体验双升级,让AI智能体更高效易用!
人工智能·ai for science·ai agent·ai智能体
李坤6 小时前
让 Codex 和 Claude 互相 Review:告别手动复制
人工智能·openai·claude
南屹川6 小时前
【API设计】GraphQL实战:从REST到GraphQL的演进
人工智能
KJ_BioMed6 小时前
当计算生物学遇上生成式AI:从头设计生物分子的“新范式”初探
人工智能·从头设计·生命科学·生物医药·科研干货·科晶生物
明月醉窗台6 小时前
深度学习(17)YOLO训练中的超参数详解
人工智能·深度学习·yolo