使用猴子补丁(monkey patch)对 PyTorch 的分布式接口进行插桩

训练脚本(pytorch_dis_gpu.py):

python 复制代码
from torchvision.datasets import MNIST
from torchvision.transforms import ToTensor
from torch import nn
import torch
import torch.distributed as dist
from torch.nn.parallel import DistributedDataParallel as DDP
from torch.utils.data.distributed import DistributedSampler
from torch.utils.data import DataLoader
import torch.nn.functional as F
import os
import distributed_patch

# Set the NCCL logging environment variables.
# Currently disabled: the assignments below sit inside a bare string literal,
# so none of them execute. Remove the surrounding quotes to enable them.
# NOTE(review): NCCL's documented variable for redirecting debug output is
# NCCL_DEBUG_FILE; NCCL_LOG_FILE does not appear to be recognised — confirm
# against the NCCL documentation before enabling.
'''
os.environ["NCCL_DEBUG"] = "INFO"
os.environ["NCCL_DEBUG_SUBSYS"] = "ALL"  # 或者 COLL
os.environ["NCCL_LOG_FILE"] = "nccl_log.txt"

# 运行 PyTorch 分布式代码
'''




class Net(nn.Module):
    """Fully connected classifier: flattens each sample, then 784 -> 128 -> 64 -> 10."""

    def __init__(self):
        super().__init__()
        self.flatten = nn.Flatten()

        # Build Linear/ReLU pairs from consecutive layer widths, in order, so the
        # sub-module indices inside ``seq`` (0, 2, 4 for the Linear layers) and
        # therefore the state_dict keys stay the same.
        widths = (28 * 28, 128, 64, 10)
        stack = []
        for fan_in, fan_out in zip(widths, widths[1:]):
            stack.append(nn.Linear(fan_in, fan_out))
            stack.append(nn.ReLU())
        # The output layer has no activation: drop the trailing ReLU.
        self.seq = nn.Sequential(*stack[:-1])

    def forward(self, x):
        """Return the 10 raw class scores for each sample in ``x``."""
        flat = self.flatten(x)
        return self.seq(flat)


def main():
    """Train ``Net`` on MNIST with DistributedDataParallel, one process per GPU.

    Meant to be launched with ``torchrun``, which supplies the rendezvous
    environment variables read by ``init_process_group`` as well as
    ``LOCAL_RANK``. Training resumes from ``checkpoint.pth`` when that file
    exists; rank 0 evaluates and rewrites the checkpoint after every epoch.
    """
    dist.init_process_group(backend='nccl')  # [collective] all processes rendezvous with the master

    rank = dist.get_rank()
    world_size = dist.get_world_size()

    # Pick the GPU from the node-local rank: the global rank only matches a GPU
    # index on a single node. Fall back to the global rank when LOCAL_RANK is
    # not set (i.e. not launched through torchrun).
    local_rank = int(os.environ.get('LOCAL_RANK', rank))
    torch.cuda.set_device(local_rank)
    device_name = f'cuda:{local_rank}'

    # Every process loads the checkpoint itself. It was saved from a CUDA
    # device, so map it to CPU explicitly instead of landing on cuda:0.
    # Only a missing file means "start from scratch"; any other failure (for
    # example a corrupt file) is raised rather than silently discarded and
    # later overwritten.
    try:
        checkpoint = torch.load('checkpoint.pth', map_location='cpu')
    except FileNotFoundError:
        checkpoint = None

    model = Net().to(device_name)
    if checkpoint and rank == 0:  # only rank 0 restores the weights ...
        model.load_state_dict(checkpoint['model'])

    # ... because the DDP constructor broadcasts rank 0's parameters to all ranks. [collective]
    model = DDP(model, device_ids=[local_rank])

    # Parameters are identical on every rank, so the optimizers start out identical too.
    optimizer = torch.optim.Adam(model.parameters(), lr=0.001)
    if checkpoint:
        optimizer.load_state_dict(checkpoint['optimizer'])  # each process restores its own copy

    # Let the first process on each node download MNIST before the others
    # touch ./data; concurrent downloads into one directory can corrupt files.
    if local_rank != 0:
        dist.barrier()
    train_dataset = MNIST(root='./data', download=True, transform=ToTensor(), train=True)
    val_dataset = MNIST(root='./data', download=True, transform=ToTensor(), train=False)
    if local_rank == 0:
        dist.barrier()

    sampler = DistributedSampler(train_dataset)  # assigns each process its own subset
    train_dataloader = DataLoader(train_dataset, batch_size=32, sampler=sampler, persistent_workers=True, num_workers=2)
    val_dataloader = DataLoader(val_dataset, batch_size=32, shuffle=True, persistent_workers=True, num_workers=2)

    for epoch in range(20):
        # Stores the epoch that seeds this epoch's shuffle. Each rank does this
        # locally; it is not a collective call.
        sampler.set_epoch(epoch)

        # Holds the loss of the last training batch; defined up front so the
        # reduce below is valid even if this rank's loader yields nothing.
        last_loss = torch.zeros((), device=device_name)

        model.train()
        for x, y in train_dataloader:
            x, y = x.to(device_name), y.to(device_name)
            # DDP forward. Buffer sync from rank 0 only matters for modules
            # that own buffers; Net has none.
            pred_y = model(x)
            loss = F.cross_entropy(pred_y, y)
            optimizer.zero_grad()
            loss.backward()  # [collective] DDP all-reduces (averages) the gradients across ranks
            optimizer.step()
            # Detach so the in-place reduce below does not touch the autograd graph.
            last_loss = loss.detach()

        dist.reduce(last_loss, dst=0)  # [collective] rank 0 receives the sum of every rank's last-batch loss

        if rank == 0:
            # Mean over ranks of the final batch loss (not a whole-epoch average).
            train_avg_loss = last_loss.item() / world_size

            # evaluate on the unwrapped module, in eval mode; model.train()
            # at the top of the next epoch switches it back.
            raw_model = model.module
            raw_model.eval()
            val_loss = 0
            with torch.no_grad():
                for x, y in val_dataloader:
                    x, y = x.to(device_name), y.to(device_name)
                    batch_loss = F.cross_entropy(raw_model(x), y)
                    val_loss += batch_loss.item()
            val_avg_loss = val_loss / len(val_dataloader)
            print(f'train_loss:{train_avg_loss} val_loss:{val_avg_loss}')

            # checkpoint: write to a temporary name, then rename over the real
            # file so a crash mid-write never leaves a truncated checkpoint.pth.
            torch.save({'model': model.module.state_dict(), 'optimizer': optimizer.state_dict()}, '.checkpoint.pth')
            os.replace('.checkpoint.pth', 'checkpoint.pth')

        dist.barrier()  # [collective] wait until rank 0 has finished eval and checkpointing

    # Release the process group's communicator resources before exiting.
    dist.destroy_process_group()



# Script entry point: start training only when this file is executed directly.
if __name__ == '__main__':
    main()

# Example launch command (a single process on this node):
# torchrun --nproc_per_node 1 pytorch_dis_gpu.py

插桩脚本(distributed_patch.py,由训练脚本 import):

python 复制代码
import torch.distributed as dist

# Keep references to the unpatched torch.distributed callables, keyed by their
# attribute name, so the instrumented wrappers can delegate to them.
original_functions = {
    api_name: getattr(dist, api_name)
    for api_name in (
        "init_process_group",
        "all_reduce",
        "reduce",
        "broadcast",
        "barrier",
        "get_rank",
        "get_world_size",
    )
}

# Instrumented wrappers
def patched_init_process_group(*args, **kwargs):
    """Print a trace line, then delegate to the saved ``init_process_group``."""
    delegate = original_functions["init_process_group"]
    print("[distributed] init_process_group called")
    return delegate(*args, **kwargs)

def patched_all_reduce(*args, **kwargs):
    """Print a trace line, then delegate to the saved ``all_reduce``.

    Arguments are forwarded untouched (positional and keyword) instead of
    re-declaring the wrapped function's parameter list, so the wrapper accepts
    exactly what the installed PyTorch version accepts and applies that
    version's own defaults. This matches the pass-through style of the
    ``barrier`` / ``get_rank`` wrappers.

    Returns:
        Whatever the original ``all_reduce`` returns for the given arguments.
    """
    print("[distributed] all_reduce called")
    return original_functions["all_reduce"](*args, **kwargs)

def patched_reduce(*args, **kwargs):
    """Print a trace line, then delegate to the saved ``reduce``.

    Arguments are forwarded untouched (positional and keyword). A hand-copied
    parameter list would require ``dst`` and reject any keyword the installed
    PyTorch version adds to ``reduce`` (for example ``group_dst`` in newer
    releases); a transparent pass-through keeps the wrapper valid across
    versions and leaves argument validation to the original function.

    Returns:
        Whatever the original ``reduce`` returns for the given arguments.
    """
    print("[distributed] reduce called")
    return original_functions["reduce"](*args, **kwargs)

def patched_broadcast(*args, **kwargs):
    """Print a trace line, then delegate to the saved ``broadcast``.

    Arguments are forwarded untouched (positional and keyword). A hand-copied
    parameter list would require ``src`` and reject any keyword the installed
    PyTorch version adds to ``broadcast`` (for example ``group_src`` in newer
    releases); a transparent pass-through keeps the wrapper valid across
    versions and leaves argument validation to the original function.

    Returns:
        Whatever the original ``broadcast`` returns for the given arguments.
    """
    print("[distributed] broadcast called")
    return original_functions["broadcast"](*args, **kwargs)

def patched_barrier(*args, **kwargs):
    """Print a trace line, then delegate to the saved ``barrier``."""
    delegate = original_functions["barrier"]
    print("[distributed] barrier called")
    return delegate(*args, **kwargs)

def patched_get_rank(*args, **kwargs):
    """Print a trace line, then delegate to the saved ``get_rank``."""
    delegate = original_functions["get_rank"]
    print("[distributed] get_rank called")
    return delegate(*args, **kwargs)

def patched_get_world_size(*args, **kwargs):
    """Print a trace line, then delegate to the saved ``get_world_size``."""
    delegate = original_functions["get_world_size"]
    print("[distributed] get_world_size called")
    return delegate(*args, **kwargs)

# Replace the torch.distributed entry points with their instrumented versions.
# Only lookups made through the ``dist`` module attribute after this point see
# the wrappers.
for _api_name, _wrapper in (
    ("init_process_group", patched_init_process_group),
    ("all_reduce", patched_all_reduce),
    ("reduce", patched_reduce),
    ("broadcast", patched_broadcast),
    ("barrier", patched_barrier),
    ("get_rank", patched_get_rank),
    ("get_world_size", patched_get_world_size),
):
    setattr(dist, _api_name, _wrapper)
相关推荐
吾日三省吾码几秒前
Python 脚本:自动化你的日常任务
数据库·python·自动化
是店小二呀1 分钟前
AI前沿:资本狂潮下的技术暗战:巨头博弈、开源革命与生态重构
人工智能·重构·开源
snowfoootball33 分钟前
基于 Ollama DeepSeek、Dify RAG 和 Fay 框架的高考咨询 AI 交互系统项目方案
前端·人工智能·后端·python·深度学习·高考
云和数据.ChenGuang42 分钟前
机器学习之回归算法
人工智能·机器学习·回归
odoo中国1 小时前
深度学习 Deep Learning 第15章 表示学习
人工智能·深度学习·学习·表示学习
橙色小博1 小时前
长短期记忆神经网络(LSTM)基础学习与实例:预测序列的未来
人工智能·python·深度学习·神经网络·lstm
SsummerC1 小时前
【leetcode100】每日温度
数据结构·python·leetcode
深蓝学院1 小时前
闭环SOTA!北航DiffAD:基于扩散模型实现端到端自动驾驶「多任务闭环统一」
人工智能·机器学习·自动驾驶
jimmyleeee1 小时前
人工智能基础知识笔记七:随机变量的几种分布
人工智能·笔记·概率论
仙人掌_lz1 小时前
机器学习ML极简指南
人工智能·python·算法·机器学习·面试·强化学习