Table of Contents
- [1 Data Preparation](#1-data-preparation)
- [2 Training Code](#2-training-code)
- [3 Launch Scripts](#3-launch-scripts)
- [4 Output Logs](#4-output-logs)
This post uses two machines on the same LAN, each with four GPUs, of which only two per machine are used for the experiment. The environments are installed with conda; use the same environment on both machines where possible. In my tests, closely matching versions still worked, but large version gaps did not, so keep the environments as consistent as you can. All the code for this series: see.
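To compare the two conda environments quickly, a small script along these lines can be run on each node and its output diffed. This is only a sanity-check sketch (the filename is arbitrary); it just prints the versions of the packages this series actually imports.
python
# env_check.py -- print the versions relevant to this series; run on every node and diff the output.
# Sketch only; nothing here is specific to a particular cluster.
import torch
import torchvision
import torchmetrics
import lightning

print("torch         :", torch.__version__)
print("torch cuda    :", torch.version.cuda)
print("cuda available:", torch.cuda.is_available())
print("gpu count     :", torch.cuda.device_count())
print("torchvision   :", torchvision.__version__)
print("torchmetrics  :", torchmetrics.__version__)
print("lightning     :", lightning.__version__)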
1 Data Preparation
MNIST is used here as a simple example. The experiment machines sit on an internal network, so the data has to be downloaded in advance:
python
import os
from torchvision.datasets import MNIST
import torch

# Download into a mnist_data folder next to this script
download_path = os.path.join(os.path.dirname(os.path.abspath(__file__)), "mnist_data")
os.makedirs(download_path, exist_ok=True)

print(f"Downloading the MNIST dataset to {download_path}...")
# Download the training and test sets
train_dataset = MNIST(download_path, train=True, download=True)
test_dataset = MNIST(download_path, train=False, download=True)

print("MNIST download complete.")
print(f"Location: {download_path}")
print("File list:")
for root, dirs, files in os.walk(download_path):
    for file in files:
        print(os.path.join(root, file))
After the download finishes, the data sits under the current directory. For simplicity, just put an identical copy of the data on each machine; later on, a shared filesystem such as NFS is worth trying instead. A quick consistency check is sketched below.
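Until a shared filesystem is in place, the copies can easily drift apart. A minimal sketch for checking that both nodes hold the same files, assuming torchvision's default `./mnist_data/MNIST/raw` layout produced by the download script above: print one hash per file on each node and compare the output.
python
# check_mnist_copy.py -- print an MD5 digest for every raw MNIST file so the
# output can be compared across nodes. Sketch only; the path assumes the
# default torchvision layout used by the download script above.
import hashlib
import os

data_root = "./mnist_data/MNIST/raw"
for name in sorted(os.listdir(data_root)):
    path = os.path.join(data_root, name)
    with open(path, "rb") as f:
        digest = hashlib.md5(f.read()).hexdigest()
    print(f"{digest}  {name}")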
2 Training Code
Straight to the code:
python
# Imports
import os
import torch
import torch.nn as nn
import torch.nn.functional as F
import torch.optim as optim
from torch.utils.data import DataLoader, Dataset
from torchvision import datasets, transforms
from lightning.fabric import Fabric, seed_everything
from torchmetrics.classification import Accuracy
import time
import argparse  # used to receive command-line arguments
# A simple convolutional network (unchanged from the single-node version)
class SimpleConvNet(nn.Module):
    def __init__(self):
        super(SimpleConvNet, self).__init__()
        self.conv1 = nn.Conv2d(1, 32, 3, 1)
        self.conv2 = nn.Conv2d(32, 64, 3, 1)
        self.dropout1 = nn.Dropout(0.25)
        self.dropout2 = nn.Dropout(0.5)
        # Input 28x28 -> (28-3+1)/1 = 26 -> (26-3+1)/1 = 24 -> 24/2 = 12
        # 64 * 12 * 12 = 9216
        self.fc1 = nn.Linear(9216, 128)
        self.fc2 = nn.Linear(128, 10)

    def forward(self, x):
        x = self.conv1(x)
        x = F.relu(x)
        x = self.conv2(x)
        x = F.relu(x)
        x = F.max_pool2d(x, 2)
        x = self.dropout1(x)
        x = torch.flatten(x, 1)
        x = self.fc1(x)
        x = F.relu(x)
        x = self.dropout2(x)
        x = self.fc2(x)
        return F.log_softmax(x, dim=1)
# Train for one epoch (unchanged)
def train_epoch(fabric: Fabric, model: nn.Module, train_loader: DataLoader, optimizer: optim.Optimizer, epoch: int):
    model.train()
    # Under DDP, the loader returned by fabric.setup_dataloaders already uses a DistributedSampler
    for batch_idx, (data, target) in enumerate(train_loader):
        optimizer.zero_grad()
        output = model(data)
        loss = F.nll_loss(output, target)
        fabric.backward(loss)
        optimizer.step()
        if batch_idx % 100 == 0 and fabric.is_global_zero:  # only global rank 0 prints progress
            # Note: len(data) is the per-device batch size; the processed count must be scaled by world_size
            processed_samples = batch_idx * len(data) * fabric.world_size + len(data) * (fabric.global_rank + 1)
            total_samples = len(train_loader.dataset)
            percent = 100. * (batch_idx * fabric.world_size * len(data)) / total_samples  # rough progress estimate
            fabric.print(f'Train Epoch: {epoch} [{processed_samples}/{total_samples} '
                         f'({percent:.0f}%)]\tLoss: {loss.item():.6f}')
# Evaluate on the test set (unchanged)
def test_epoch(fabric: Fabric, model: nn.Module, test_loader: DataLoader):
    model.eval()
    test_acc = Accuracy(task="multiclass", num_classes=10).to(fabric.device)
    total_loss_tensor = torch.tensor(0.0, device=fabric.device)
    with torch.no_grad():
        for data, target in test_loader:
            output = model(data)
            batch_loss = F.nll_loss(output, target, reduction='sum')
            total_loss_tensor += batch_loss
            test_acc.update(output, target)  # each rank updates its own metric state

    # Aggregate the loss across all processes
    gathered_losses = fabric.all_gather(total_loss_tensor)  # gather every rank's summed loss
    # With fabric >= 2.1 the metric object could be gathered directly instead:
    # gathered_acc = fabric.all_gather(test_acc)  # gather metric state (or use compute + all_reduce)

    # Compute the global average loss and accuracy (only rank 0 computes and prints the final result)
    if fabric.is_global_zero:
        avg_loss = gathered_losses.sum() / len(test_loader.dataset)
        # For the Accuracy metric, compute() should be called by every process so states are aggregated
        # (or aggregate manually with fabric.all_reduce)
        # fabric.barrier()  # make sure every process has finished update()
        final_acc = test_acc.compute()  # compute() handles the sync internally (recent TorchMetrics/Fabric)
        total_samples = len(test_loader.dataset)
        correct_samples = final_acc * total_samples
        fabric.print(f'\nTest set: average loss: {avg_loss:.4f}, '
                     f'accuracy: {final_acc*100:.0f}% ({int(correct_samples)}/{total_samples})\n')
    else:
        # The other ranks also call compute() so they can take part in the synchronization
        # (whether this is required depends on the metric implementation)
        test_acc.compute()

    # Every rank resets the metric
    test_acc.reset()
    fabric.barrier()  # make sure all processes finished testing and printing before moving on
# Main routine
def main(args):
    # --- Hyperparameters ---
    batch_size_per_device = 64
    epochs = 5
    lr = 1.0
    seed = 42
    num_workers_per_loader = 2

    # --- Fabric initialization ---
    # Key change: pass num_nodes.
    # `devices` now means the number of devices on each node.
    fabric = Fabric(accelerator="cuda",
                    devices=args.gpus_per_node,  # GPUs per node
                    num_nodes=args.num_nodes,    # total number of nodes
                    strategy="ddp",
                    precision='16-mixed')

    # Key point: fabric.launch() does not spawn processes here; it assumes they were
    # started by an external launcher such as torchrun, and mainly sets up the current
    # process (device placement, environment, etc.).
    fabric.launch()

    # --- Reproducibility ---
    # Give every process a different but deterministic seed
    seed_everything(seed + fabric.global_rank)

    # --- Data preparation ---
    # Key point: every node must be able to reach this path; a shared filesystem is ideal.
    # Without one, make sure the data already exists at this path on every node before launching.
    # data_path = "/path/to/shared/mnist_data"  # example: shared path
    data_path = "./mnist_data"  # relative path; assumes the same directory layout on every node
    fabric.print(f"Global rank {fabric.global_rank}: using data path: {data_path}")

    transform = transforms.Compose([
        transforms.ToTensor(),
        transforms.Normalize((0.1307,), (0.3081,))
    ])
    # --- Data download (only on global rank 0) ---
    if fabric.is_global_zero:
        fabric.print(f"Global rank {fabric.global_rank}: checking/downloading the MNIST dataset to {data_path}...")
        # Make sure the directory exists
        os.makedirs(data_path, exist_ok=True)
        datasets.MNIST(data_path, train=True, download=True)
        datasets.MNIST(data_path, train=False, download=True)
        fabric.print(f"Global rank {fabric.global_rank}: dataset downloaded or already present.")

    # --- Synchronization point ---
    # Key point: the other ranks wait here until rank 0 has finished downloading
    fabric.barrier()
    fabric.print(f"Global rank {fabric.global_rank}: proceeding to load the dataset...")

    # --- Load the datasets (all processes) ---
    train_dataset = datasets.MNIST(
        data_path, train=True, download=False, transform=transform  # download=False avoids races
    )
    test_dataset = datasets.MNIST(
        data_path, train=False, download=False, transform=transform
    )

    # --- Create the data loaders ---
    # fabric.setup_dataloaders will take care of the DistributedSampler
    train_loader = DataLoader(
        train_dataset,
        batch_size=batch_size_per_device,
        num_workers=num_workers_per_loader,
        # shuffle=True,  # not needed under DDP; the DistributedSampler handles shuffling
        persistent_workers=True if num_workers_per_loader > 0 else False,
        pin_memory=True  # usually recommended
    )
    test_loader = DataLoader(
        test_dataset,
        batch_size=batch_size_per_device * 2,
        num_workers=num_workers_per_loader,
        shuffle=False,
        persistent_workers=True if num_workers_per_loader > 0 else False,
        pin_memory=True
    )

    fabric.print(f"Process {fabric.global_rank}/{fabric.world_size} (node rank: {fabric.node_rank}, local rank: {fabric.local_rank}) using device: {fabric.device}")
    fabric.print(f"Training set size: {len(train_dataset)}")
    fabric.print(f"Test set size: {len(test_dataset)}")
    fabric.print(f"Per-device batch size: {batch_size_per_device}")
    fabric.print(f"Effective global batch size: {batch_size_per_device * fabric.world_size}")
    # --- Let Fabric set up the model, optimizer, and data loaders ---
    # Note: setup_dataloaders automatically adds a DistributedSampler
    train_loader, test_loader = fabric.setup_dataloaders(train_loader, test_loader)

    # Instantiate the model and optimizer
    model = SimpleConvNet()

    # SyncBatchNorm: if the model has BatchNorm layers and world_size > 1,
    # converting them before fabric.setup is recommended
    if fabric.world_size > 1:
        fabric.print("world_size > 1: converting BatchNorm layers to SyncBatchNorm (if the model has any)")
        # Only convert when the model actually contains BatchNorm layers,
        # otherwise the conversion is pointless or may fail
        has_batchnorm = any(isinstance(m, (nn.BatchNorm1d, nn.BatchNorm2d, nn.BatchNorm3d)) for m in model.modules())
        if has_batchnorm:
            model = torch.nn.SyncBatchNorm.convert_sync_batchnorm(model)
            fabric.print("Converted to SyncBatchNorm")
        else:
            fabric.print("No BatchNorm layers found in the model; skipping SyncBatchNorm conversion")

    optimizer = optim.Adadelta(model.parameters(), lr=lr)
    # setup moves the model to the device and wraps it for DDP
    model, optimizer = fabric.setup(model, optimizer)

    # --- Training loop ---
    fabric.print(f"\nStarting training on {fabric.strategy.num_nodes} nodes, {fabric.world_size} GPUs in total...")
    start_time = time.time()
    for epoch in range(1, epochs + 1):
        epoch_start_time = time.time()
        # Set the DistributedSampler epoch so each epoch gets a different shuffle.
        # fabric.setup_dataloaders may handle this internally, but being explicit is safer.
        if hasattr(train_loader.sampler, 'set_epoch'):
            train_loader.sampler.set_epoch(epoch)

        train_epoch(fabric, model, train_loader, optimizer, epoch)
        fabric.barrier()  # make sure every process has finished the training epoch
        test_epoch(fabric, model, test_loader)  # test_epoch already ends with a barrier

        # Only rank 0 prints the epoch time
        if fabric.is_global_zero:
            epoch_time = time.time() - epoch_start_time
            fabric.print(f"Epoch {epoch} finished in {epoch_time:.2f} s.")
        fabric.barrier()  # let rank 0 finish printing before the next epoch

    # Only rank 0 prints the total time
    if fabric.is_global_zero:
        total_time = time.time() - start_time
        fabric.print(f"\nTraining finished in {total_time:.2f} s.")

    # --- Save the model (only global rank 0 writes the file) ---
    save_path = "mnist_fabric_multinode_model.pt"
    state = {"model": model}  # fabric.save extracts the state_dict automatically
    fabric.save(save_path, state)
    if fabric.is_global_zero:
        fabric.print(f"\nModel state saved to {save_path} (saved by global rank 0)")

    fabric.barrier()  # make sure saving has finished before any process exits
    fabric.print(f"Global rank {fabric.global_rank}: training and saving complete.")
    # Fabric/the launcher handles cleanup automatically on exit
# Script entry point
if __name__ == "__main__":
    parser = argparse.ArgumentParser(description='Lightning Fabric MNIST Multi-Node Example')
    # Arguments needed for multi-node, multi-GPU training
    parser.add_argument('--num_nodes', type=int, default=2,  # at least 2 nodes by default
                        help='Number of nodes for distributed training')
    parser.add_argument('--gpus_per_node', type=int, default=1,  # 1 GPU per node by default
                        help='Number of GPUs per node')
    # Other arguments such as batch_size, epochs, lr, data_path could be added here
    # parser.add_argument('--data_path', type=str, default='./mnist_data', help='Path to dataset')
    args = parser.parse_args()

    # Note: a local CUDA check is of limited value here, since the script runs on several machines
    # and the launcher (torchrun) is responsible for checking resources,
    # but it is kept as a quick local sanity check.
    if not torch.cuda.is_available():
        print("Warning: no CUDA device detected locally. Assuming the target nodes have CUDA devices.")
    else:
        available_gpus = torch.cuda.device_count()
        print(f"Detected {available_gpus} local GPUs. The script will run with --gpus_per_node={args.gpus_per_node}.")
        if available_gpus < args.gpus_per_node:
            print(f"Warning: the number of local GPUs ({available_gpus}) is less than --gpus_per_node ({args.gpus_per_node}). Make sure the target nodes have enough GPUs.")

    main(args)
This script just needs to be identical on both machines.
3 Launch Scripts
On machine 1 (node_rank=0):
bash
export MASTER_ADDR=10.238.247.11   # IP of the master node (either machine's IP works, as long as both nodes use the same one)
export MASTER_PORT=12355           # an unused port
export NCCL_SOCKET_IFNAME=enp1s0f0 # <-- change this: network interface used for inter-node communication
export NCCL_DEBUG=INFO             # (optional) NCCL debug output
GPUS_PER_NODE=2                    # number of GPUs to use on each node
NNODES=2                           # total number of nodes
torchrun \
--nproc_per_node=$GPUS_PER_NODE \
--nnodes=$NNODES \
--node_rank=0 \
--master_addr=$MASTER_ADDR \
--master_port=$MASTER_PORT \
multi_node_fabric.py --num_nodes $NNODES --gpus_per_node $GPUS_PER_NODE
On machine 2 (node_rank=1):
bash
export MASTER_ADDR=10.238.247.11   # IP of the master node (same value as on node 0)
export MASTER_PORT=12355
export NCCL_SOCKET_IFNAME=enp1s0f0 # <-- change this
export NCCL_DEBUG=INFO
GPUS_PER_NODE=2                    # GPUs per node, same as node 0
NNODES=2                           # total number of nodes, same as node 0
torchrun \
--nproc_per_node=$GPUS_PER_NODE \
--nnodes=$NNODES \
--node_rank=1 \
--master_addr=$MASTER_ADDR \
--master_port=$MASTER_PORT \
multi_node_fabric.py --num_nodes $NNODES --gpus_per_node $GPUS_PER_NODE
Run the scripts on both machines: start node_rank 0 first, then node_rank 1.
Because `devices` is passed only as a count, the first two GPUs on each node are used by default; setting `devices=[2,3]` instead selects the last two, as sketched below.
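For example, a sketch of the changed Fabric construction when the last two GPUs on each node should be used instead of the first two (everything else in the training script stays the same; `num_nodes=2` here simply mirrors the launch scripts above):
python
# Sketch: select GPUs 2 and 3 on each node by index instead of by count.
from lightning.fabric import Fabric

fabric = Fabric(accelerator="cuda",
                devices=[2, 3],      # explicit GPU indices instead of a count
                num_nodes=2,
                strategy="ddp",
                precision='16-mixed')
fabric.launch()
Exporting `CUDA_VISIBLE_DEVICES=2,3` in the launch script before `torchrun` achieves the same selection while keeping `devices` as a plain count.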
4 Output Logs
Machine 1 (node_rank=0):
(f) tl@ai-X785-G30:~/fabric_multi_gpu$ sh run.sh
W0328 12:45:58.078000 673264 site-packages/torch/distributed/run.py:793]
W0328 12:45:58.078000 673264 site-packages/torch/distributed/run.py:793] *****************************************
W0328 12:45:58.078000 673264 site-packages/torch/distributed/run.py:793] Setting OMP_NUM_THREADS environment variable for each process to be 1 in default, to avoid your system being overloaded, please further tune the variable for optimal performance in your application as needed.
W0328 12:45:58.078000 673264 site-packages/torch/distributed/run.py:793] *****************************************
Detected 4 local GPUs. The script will run with --gpus_per_node=2.
Detected 4 local GPUs. The script will run with --gpus_per_node=2.
Using 16-bit Automatic Mixed Precision (AMP)
Initializing distributed: GLOBAL_RANK: 1, MEMBER: 2/4
[rank: 1] Seed set to 43
Initializing distributed: GLOBAL_RANK: 0, MEMBER: 1/4
----------------------------------------------------------------------------------------------------
distributed_backend=nccl
All distributed processes registered. Starting with 4 processes
----------------------------------------------------------------------------------------------------
[rank: 0] Seed set to 42
Global rank 0: using data path: ./mnist_data
Global rank 0: checking/downloading the MNIST dataset to ./mnist_data...
Global rank 0: dataset downloaded or already present.
ai-X785-G30:675556:675556 [0] NCCL INFO NCCL_SOCKET_IFNAME set by environment to enp1s0f0
ai-X785-G30:675556:675556 [0] NCCL INFO Bootstrap : Using enp1s0f0:10.238.247.11<0>
ai-X785-G30:675556:675556 [0] NCCL INFO NET/Plugin: No plugin found (libnccl-net.so)
ai-X785-G30:675556:675556 [0] NCCL INFO NET/Plugin: Plugin load returned 2 : libnccl-net.so: cannot open shared object file: No such file or directory : when loading libnccl-net.so
ai-X785-G30:675556:675556 [0] NCCL INFO NET/Plugin: Using internal network plugin.
ai-X785-G30:675556:675556 [0] NCCL INFO cudaDriverVersion 12040
NCCL version 2.21.5+cuda11.8
ai-X785-G30:675557:675557 [1] NCCL INFO cudaDriverVersion 12040
ai-X785-G30:675557:675557 [1] NCCL INFO NCCL_SOCKET_IFNAME set by environment to enp1s0f0
ai-X785-G30:675557:675557 [1] NCCL INFO Bootstrap : Using enp1s0f0:10.238.247.11<0>
ai-X785-G30:675557:675557 [1] NCCL INFO NET/Plugin: No plugin found (libnccl-net.so)
ai-X785-G30:675557:675557 [1] NCCL INFO NET/Plugin: Plugin load returned 2 : libnccl-net.so: cannot open shared object file: No such file or directory : when loading libnccl-net.so
ai-X785-G30:675557:675557 [1] NCCL INFO NET/Plugin: Using internal network plugin.
ai-X785-G30:675556:675582 [0] NCCL INFO Failed to open libibverbs.so[.1]
ai-X785-G30:675556:675582 [0] NCCL INFO NCCL_SOCKET_IFNAME set by environment to enp1s0f0
ai-X785-G30:675556:675582 [0] NCCL INFO NET/Socket : Using [0]enp1s0f0:10.238.247.11<0>
ai-X785-G30:675556:675582 [0] NCCL INFO Using non-device net plugin version 0
ai-X785-G30:675556:675582 [0] NCCL INFO Using network Socket
ai-X785-G30:675557:675583 [1] NCCL INFO Failed to open libibverbs.so[.1]
ai-X785-G30:675557:675583 [1] NCCL INFO NCCL_SOCKET_IFNAME set by environment to enp1s0f0
ai-X785-G30:675557:675583 [1] NCCL INFO NET/Socket : Using [0]enp1s0f0:10.238.247.11<0>
ai-X785-G30:675557:675583 [1] NCCL INFO Using non-device net plugin version 0
ai-X785-G30:675557:675583 [1] NCCL INFO Using network Socket
ai-X785-G30:675557:675583 [1] NCCL INFO ncclCommInitRank comm 0x94f8a50 rank 1 nranks 4 cudaDev 1 nvmlDev 1 busId db000 commId 0xcd5f8c8c67dfa1ad - Init START
ai-X785-G30:675556:675582 [0] NCCL INFO ncclCommInitRank comm 0x97169b0 rank 0 nranks 4 cudaDev 0 nvmlDev 0 busId da000 commId 0xcd5f8c8c67dfa1ad - Init START
ai-X785-G30:675557:675583 [1] NCCL INFO Setting affinity for GPU 1 to ff,fff00000
ai-X785-G30:675556:675582 [0] NCCL INFO Setting affinity for GPU 0 to ff,fff00000
ai-X785-G30:675557:675583 [1] NCCL INFO comm 0x94f8a50 rank 1 nRanks 4 nNodes 2 localRanks 2 localRank 1 MNNVL 0
ai-X785-G30:675557:675583 [1] NCCL INFO Trees [0] -1/-1/-1->1->0 [1] -1/-1/-1->1->0
ai-X785-G30:675556:675582 [0] NCCL INFO comm 0x97169b0 rank 0 nRanks 4 nNodes 2 localRanks 2 localRank 0 MNNVL 0
ai-X785-G30:675557:675583 [1] NCCL INFO P2P Chunksize set to 131072
ai-X785-G30:675556:675582 [0] NCCL INFO Channel 00/02 : 0 1 2 3
ai-X785-G30:675556:675582 [0] NCCL INFO Channel 01/02 : 0 1 2 3
ai-X785-G30:675556:675582 [0] NCCL INFO Trees [0] 1/2/-1->0->-1 [1] 1/-1/-1->0->2
ai-X785-G30:675556:675582 [0] NCCL INFO P2P Chunksize set to 131072
ai-X785-G30:675557:675583 [1] NCCL INFO Channel 00/0 : 1[1] -> 2[0] [send] via NET/Socket/0
ai-X785-G30:675557:675583 [1] NCCL INFO Channel 01/0 : 1[1] -> 2[0] [send] via NET/Socket/0
ai-X785-G30:675556:675582 [0] NCCL INFO Channel 00/0 : 3[1] -> 0[0] [receive] via NET/Socket/0
ai-X785-G30:675556:675582 [0] NCCL INFO Channel 01/0 : 3[1] -> 0[0] [receive] via NET/Socket/0
ai-X785-G30:675556:675582 [0] NCCL INFO Channel 00/0 : 0[0] -> 1[1] via P2P/CUMEM
ai-X785-G30:675556:675582 [0] NCCL INFO Channel 01/0 : 0[0] -> 1[1] via P2P/CUMEM
ai-X785-G30:675557:675583 [1] NCCL INFO Connected all rings
ai-X785-G30:675557:675583 [1] NCCL INFO Channel 00/0 : 1[1] -> 0[0] via P2P/CUMEM
ai-X785-G30:675556:675582 [0] NCCL INFO Connected all rings
ai-X785-G30:675557:675583 [1] NCCL INFO Channel 01/0 : 1[1] -> 0[0] via P2P/CUMEM
ai-X785-G30:675556:675582 [0] NCCL INFO Channel 00/0 : 2[0] -> 0[0] [receive] via NET/Socket/0
ai-X785-G30:675556:675582 [0] NCCL INFO Channel 01/0 : 2[0] -> 0[0] [receive] via NET/Socket/0
ai-X785-G30:675556:675582 [0] NCCL INFO Channel 00/0 : 0[0] -> 2[0] [send] via NET/Socket/0
ai-X785-G30:675556:675582 [0] NCCL INFO Channel 01/0 : 0[0] -> 2[0] [send] via NET/Socket/0
ai-X785-G30:675556:675582 [0] NCCL INFO Connected all trees
ai-X785-G30:675557:675583 [1] NCCL INFO Connected all trees
ai-X785-G30:675557:675583 [1] NCCL INFO threadThresholds 8/8/64 | 32/8/64 | 512 | 512
ai-X785-G30:675557:675583 [1] NCCL INFO 2 coll channels, 2 collnet channels, 0 nvls channels, 2 p2p channels, 2 p2p channels per peer
ai-X785-G30:675556:675582 [0] NCCL INFO threadThresholds 8/8/64 | 32/8/64 | 512 | 512
ai-X785-G30:675556:675582 [0] NCCL INFO 2 coll channels, 2 collnet channels, 0 nvls channels, 2 p2p channels, 2 p2p channels per peer
ai-X785-G30:675556:675582 [0] NCCL INFO TUNER/Plugin: Plugin load returned 2 : libnccl-net.so: cannot open shared object file: No such file or directory : when loading libnccl-tuner.so
ai-X785-G30:675557:675583 [1] NCCL INFO TUNER/Plugin: Plugin load returned 2 : libnccl-net.so: cannot open shared object file: No such file or directory : when loading libnccl-tuner.so
ai-X785-G30:675556:675582 [0] NCCL INFO TUNER/Plugin: Using internal tuner plugin.
ai-X785-G30:675557:675583 [1] NCCL INFO TUNER/Plugin: Using internal tuner plugin.
ai-X785-G30:675556:675582 [0] NCCL INFO ncclCommInitRank comm 0x97169b0 rank 0 nranks 4 cudaDev 0 nvmlDev 0 busId da000 commId 0xcd5f8c8c67dfa1ad - Init COMPLETE
ai-X785-G30:675557:675583 [1] NCCL INFO ncclCommInitRank comm 0x94f8a50 rank 1 nranks 4 cudaDev 1 nvmlDev 1 busId db000 commId 0xcd5f8c8c67dfa1ad - Init COMPLETE
Global rank 0: proceeding to load the dataset...
Process 0/4 (node rank: 0, local rank: 0) using device: cuda:0
Training set size: 60000
Test set size: 10000
Per-device batch size: 64
Effective global batch size: 256
world_size > 1: converting BatchNorm layers to SyncBatchNorm (if the model has any)
No BatchNorm layers found in the model; skipping SyncBatchNorm conversion
Starting training on 2 nodes, 4 GPUs in total...
Train Epoch: 1 [64/60000 (0%)] Loss: 2.330017
Train Epoch: 1 [25664/60000 (43%)] Loss: 0.335563
Train Epoch: 1 [51264/60000 (85%)] Loss: 0.223983
Test set: average loss: 0.0730, accuracy: 98% (9778/10000)
Epoch 1 finished in 16.33 s.
Train Epoch: 2 [64/60000 (0%)] Loss: 0.362449
Train Epoch: 2 [25664/60000 (43%)] Loss: 0.035336
Train Epoch: 2 [51264/60000 (85%)] Loss: 0.198408
Test set: average loss: 0.0465, accuracy: 98% (9845/10000)
Epoch 2 finished in 15.81 s.
Train Epoch: 3 [64/60000 (0%)] Loss: 0.244205
Train Epoch: 3 [25664/60000 (43%)] Loss: 0.154580
Train Epoch: 3 [51264/60000 (85%)] Loss: 0.108419
Test set: average loss: 0.0422, accuracy: 99% (9859/10000)
Epoch 3 finished in 15.81 s.
Train Epoch: 4 [64/60000 (0%)] Loss: 0.268536
Train Epoch: 4 [25664/60000 (43%)] Loss: 0.141208
Train Epoch: 4 [51264/60000 (85%)] Loss: 0.203330
Test set: average loss: 0.0321, accuracy: 99% (9899/10000)
Epoch 4 finished in 15.82 s.
Train Epoch: 5 [64/60000 (0%)] Loss: 0.246365
Train Epoch: 5 [25664/60000 (43%)] Loss: 0.092331
Train Epoch: 5 [51264/60000 (85%)] Loss: 0.146846
Test set: average loss: 0.0330, accuracy: 99% (9880/10000)
Epoch 5 finished in 15.81 s.
Training finished in 79.58 s.
Model state saved to mnist_fabric_multinode_model.pt (saved by global rank 0)
Global rank 0: training and saving complete.
ai-X785-G30:675557:675626 [1] NCCL INFO [Service thread] Connection closed by localRank 1
ai-X785-G30:675556:675627 [0] NCCL INFO [Service thread] Connection closed by localRank 0
ai-X785-G30:675557:683161 [1] NCCL INFO comm 0x94f8a50 rank 1 nranks 4 cudaDev 1 busId db000 - Abort COMPLETE
ai-X785-G30:675556:683162 [0] NCCL INFO comm 0x97169b0 rank 0 nranks 4 cudaDev 0 busId da000 - Abort COMPLETE
Machine 2 (node_rank=1):
(f) tl@ai-X785-G30:~/fabric_multi_gpu$ sh run.sh
W0328 12:52:04.835000 2976049 site-packages/torch/distributed/run.py:793]
W0328 12:52:04.835000 2976049 site-packages/torch/distributed/run.py:793] *****************************************
W0328 12:52:04.835000 2976049 site-packages/torch/distributed/run.py:793] Setting OMP_NUM_THREADS environment variable for each process to be 1 in default, to avoid your system being overloaded, please further tune the variable for optimal performance in your application as needed.
W0328 12:52:04.835000 2976049 site-packages/torch/distributed/run.py:793] *****************************************
Detected 4 local GPUs. The script will run with --gpus_per_node=2.
Initializing distributed: GLOBAL_RANK: 3, MEMBER: 4/4
Detected 4 local GPUs. The script will run with --gpus_per_node=2.
Initializing distributed: GLOBAL_RANK: 2, MEMBER: 3/4
[rank: 3] Seed set to 45
[rank: 2] Seed set to 44
Global rank 2: using data path: ./mnist_data
ai-X785-G30:2976427:2976427 [1] NCCL INFO cudaDriverVersion 12000
ai-X785-G30:2976427:2976427 [1] NCCL INFO NCCL_SOCKET_IFNAME set by environment to enp1s0f0
ai-X785-G30:2976427:2976427 [1] NCCL INFO Bootstrap : Using enp1s0f0:10.238.247.6<0>
ai-X785-G30:2976427:2976427 [1] NCCL INFO NET/Plugin: No plugin found (libnccl-net.so)
ai-X785-G30:2976427:2976427 [1] NCCL INFO NET/Plugin: Plugin load returned 2 : libnccl-net.so: cannot open shared object file: No such file or directory : when loading libnccl-net.so
ai-X785-G30:2976427:2976427 [1] NCCL INFO NET/Plugin: Using internal network plugin.
ai-X785-G30:2976426:2976426 [0] NCCL INFO cudaDriverVersion 12000
ai-X785-G30:2976426:2976426 [0] NCCL INFO NCCL_SOCKET_IFNAME set by environment to enp1s0f0
ai-X785-G30:2976426:2976426 [0] NCCL INFO Bootstrap : Using enp1s0f0:10.238.247.6<0>
ai-X785-G30:2976426:2976426 [0] NCCL INFO NET/Plugin: No plugin found (libnccl-net.so)
ai-X785-G30:2976426:2976426 [0] NCCL INFO NET/Plugin: Plugin load returned 2 : libnccl-net.so: cannot open shared object file: No such file or directory : when loading libnccl-net.so
ai-X785-G30:2976426:2976426 [0] NCCL INFO NET/Plugin: Using internal network plugin.
ai-X785-G30:2976427:2976877 [1] NCCL INFO Failed to open libibverbs.so[.1]
ai-X785-G30:2976427:2976877 [1] NCCL INFO NCCL_SOCKET_IFNAME set by environment to enp1s0f0
ai-X785-G30:2976427:2976877 [1] NCCL INFO NET/Socket : Using [0]enp1s0f0:10.238.247.6<0>
ai-X785-G30:2976427:2976877 [1] NCCL INFO Using non-device net plugin version 0
ai-X785-G30:2976427:2976877 [1] NCCL INFO Using network Socket
ai-X785-G30:2976426:2976878 [0] NCCL INFO Failed to open libibverbs.so[.1]
ai-X785-G30:2976426:2976878 [0] NCCL INFO NCCL_SOCKET_IFNAME set by environment to enp1s0f0
ai-X785-G30:2976426:2976878 [0] NCCL INFO NET/Socket : Using [0]enp1s0f0:10.238.247.6<0>
ai-X785-G30:2976426:2976878 [0] NCCL INFO Using non-device net plugin version 0
ai-X785-G30:2976426:2976878 [0] NCCL INFO Using network Socket
ai-X785-G30:2976427:2976877 [1] NCCL INFO ncclCommInitRank comm 0x9fa7520 rank 3 nranks 4 cudaDev 1 nvmlDev 1 busId db000 commId 0xcd5f8c8c67dfa1ad - Init START
ai-X785-G30:2976426:2976878 [0] NCCL INFO ncclCommInitRank comm 0x87e1fb0 rank 2 nranks 4 cudaDev 0 nvmlDev 0 busId da000 commId 0xcd5f8c8c67dfa1ad - Init START
ai-X785-G30:2976426:2976878 [0] NCCL INFO Setting affinity for GPU 0 to ff,fff00000
ai-X785-G30:2976427:2976877 [1] NCCL INFO Setting affinity for GPU 1 to ff,fff00000
ai-X785-G30:2976427:2976877 [1] NCCL INFO comm 0x9fa7520 rank 3 nRanks 4 nNodes 2 localRanks 2 localRank 1 MNNVL 0
ai-X785-G30:2976427:2976877 [1] NCCL INFO Trees [0] -1/-1/-1->3->2 [1] -1/-1/-1->3->2
ai-X785-G30:2976427:2976877 [1] NCCL INFO P2P Chunksize set to 131072
ai-X785-G30:2976426:2976878 [0] NCCL INFO comm 0x87e1fb0 rank 2 nRanks 4 nNodes 2 localRanks 2 localRank 0 MNNVL 0
ai-X785-G30:2976426:2976878 [0] NCCL INFO Trees [0] 3/-1/-1->2->0 [1] 3/0/-1->2->-1
ai-X785-G30:2976426:2976878 [0] NCCL INFO P2P Chunksize set to 131072
ai-X785-G30:2976426:2976878 [0] NCCL INFO Channel 00/0 : 1[1] -> 2[0] [receive] via NET/Socket/0
ai-X785-G30:2976426:2976878 [0] NCCL INFO Channel 01/0 : 1[1] -> 2[0] [receive] via NET/Socket/0
ai-X785-G30:2976426:2976878 [0] NCCL INFO Channel 00/0 : 2[0] -> 3[1] via P2P/CUMEM
ai-X785-G30:2976427:2976877 [1] NCCL INFO Channel 00/0 : 3[1] -> 0[0] [send] via NET/Socket/0
ai-X785-G30:2976426:2976878 [0] NCCL INFO Channel 01/0 : 2[0] -> 3[1] via P2P/CUMEM
ai-X785-G30:2976427:2976877 [1] NCCL INFO Channel 01/0 : 3[1] -> 0[0] [send] via NET/Socket/0
ai-X785-G30:2976426:2976878 [0] NCCL INFO Connected all rings
ai-X785-G30:2976426:2976878 [0] NCCL INFO Channel 00/0 : 0[0] -> 2[0] [receive] via NET/Socket/0
ai-X785-G30:2976426:2976878 [0] NCCL INFO Channel 01/0 : 0[0] -> 2[0] [receive] via NET/Socket/0
ai-X785-G30:2976426:2976878 [0] NCCL INFO Channel 00/0 : 2[0] -> 0[0] [send] via NET/Socket/0
ai-X785-G30:2976426:2976878 [0] NCCL INFO Channel 01/0 : 2[0] -> 0[0] [send] via NET/Socket/0
ai-X785-G30:2976427:2976877 [1] NCCL INFO Connected all rings
ai-X785-G30:2976427:2976877 [1] NCCL INFO Channel 00/0 : 3[1] -> 2[0] via P2P/CUMEM
ai-X785-G30:2976427:2976877 [1] NCCL INFO Channel 01/0 : 3[1] -> 2[0] via P2P/CUMEM
ai-X785-G30:2976426:2976878 [0] NCCL INFO Connected all trees
ai-X785-G30:2976427:2976877 [1] NCCL INFO Connected all trees
ai-X785-G30:2976426:2976878 [0] NCCL INFO threadThresholds 8/8/64 | 32/8/64 | 512 | 512
ai-X785-G30:2976426:2976878 [0] NCCL INFO 2 coll channels, 2 collnet channels, 0 nvls channels, 2 p2p channels, 2 p2p channels per peer
ai-X785-G30:2976427:2976877 [1] NCCL INFO threadThresholds 8/8/64 | 32/8/64 | 512 | 512
ai-X785-G30:2976427:2976877 [1] NCCL INFO 2 coll channels, 2 collnet channels, 0 nvls channels, 2 p2p channels, 2 p2p channels per peer
ai-X785-G30:2976426:2976878 [0] NCCL INFO TUNER/Plugin: Plugin load returned 2 : libnccl-net.so: cannot open shared object file: No such file or directory : when loading libnccl-tuner.so
ai-X785-G30:2976426:2976878 [0] NCCL INFO TUNER/Plugin: Using internal tuner plugin.
ai-X785-G30:2976426:2976878 [0] NCCL INFO ncclCommInitRank comm 0x87e1fb0 rank 2 nranks 4 cudaDev 0 nvmlDev 0 busId da000 commId 0xcd5f8c8c67dfa1ad - Init COMPLETE
ai-X785-G30:2976427:2976877 [1] NCCL INFO TUNER/Plugin: Plugin load returned 2 : libnccl-net.so: cannot open shared object file: No such file or directory : when loading libnccl-tuner.so
ai-X785-G30:2976427:2976877 [1] NCCL INFO TUNER/Plugin: Using internal tuner plugin.
ai-X785-G30:2976427:2976877 [1] NCCL INFO ncclCommInitRank comm 0x9fa7520 rank 3 nranks 4 cudaDev 1 nvmlDev 1 busId db000 commId 0xcd5f8c8c67dfa1ad - Init COMPLETE
Global rank 2: proceeding to load the dataset...
Process 2/4 (node rank: 1, local rank: 0) using device: cuda:0
Training set size: 60000
Test set size: 10000
Per-device batch size: 64
Effective global batch size: 256
world_size > 1: converting BatchNorm layers to SyncBatchNorm (if the model has any)
No BatchNorm layers found in the model; skipping SyncBatchNorm conversion
Starting training on 2 nodes, 4 GPUs in total...
Global rank 2: training and saving complete.
ai-X785-G30:2976427:2976879 [1] NCCL INFO [Service thread] Connection closed by localRank 1
ai-X785-G30:2976426:2976881 [0] NCCL INFO [Service thread] Connection closed by localRank 0
ai-X785-G30:2976427:2979638 [1] NCCL INFO comm 0x9fa7520 rank 3 nranks 4 cudaDev 1 busId db000 - Abort COMPLETE
ai-X785-G30:2976426:2979639 [0] NCCL INFO comm 0x87e1fb0 rank 2 nranks 4 cudaDev 0 busId da000 - Abort COMPLETE
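After the run, global rank 0 has written mnist_fabric_multinode_model.pt, as the log above shows. A minimal sketch of loading it back for single-device inference, assuming the training script is saved as multi_node_fabric.py (as in the launch commands above) so that SimpleConvNet can be imported:
python
# load_checkpoint.py -- reload the checkpoint written by fabric.save() for inference.
# Sketch only; fabric.load restores the weights of the objects passed via `state` in place.
from lightning.fabric import Fabric
from multi_node_fabric import SimpleConvNet  # assumes the training script's filename

fabric = Fabric(accelerator="auto", devices=1)  # single process; no DDP needed for inference
fabric.launch()

model = SimpleConvNet()
state = {"model": model}
fabric.load("mnist_fabric_multinode_model.pt", state)

model = fabric.setup(model)  # move the restored model to the selected device
model.eval()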