LLMOps-第十一周
文章目录
大模型分布式训练
大模型分布式训练
环境部署
此次环境是在 AutoDL 官方网站上创建的实例中搭建的,使用的基础镜像为 PyTorch/2.8.0/3.12(ubuntu22.04)/12.8
bash
# Show the installed GPUs together with the driver and CUDA versions
nvidia-smi
# Wed Oct 22 15:52:53 2025
# +-----------------------------------------------------------------------------------------+
# | NVIDIA-SMI 580.76.05 Driver Version: 580.76.05 CUDA Version: 13.0 |
# +-----------------------------------------+------------------------+----------------------+
# | GPU Name Persistence-M | Bus-Id Disp.A | Volatile Uncorr. ECC |
# | Fan Temp Perf Pwr:Usage/Cap | Memory-Usage | GPU-Util Compute M. |
# | | | MIG M. |
# |=========================================+========================+======================|
# | 0 NVIDIA GeForce RTX 5090 On | 00000000:A8:00.0 Off | N/A |
# | 43% 31C P8 6W / 575W | 0MiB / 32607MiB | 0% Default |
# | | | N/A |
# +-----------------------------------------+------------------------+----------------------+
# | 1 NVIDIA GeForce RTX 5090 On | 00000000:B8:00.0 Off | N/A |
# | 42% 33C P8 5W / 575W | 0MiB / 32607MiB | 0% Default |
# | | | N/A |
# +-----------------------------------------+------------------------+----------------------+
# +-----------------------------------------------------------------------------------------+
# | Processes: |
# | GPU GI CI PID Type Process name GPU Memory |
# | ID ID Usage |
# |=========================================================================================|
# | No running processes found |
# +-----------------------------------------------------------------------------------------+
# Refresh the apt package index
apt update
# Install the helper tools used in this walkthrough
apt install git git-lfs iproute2 jq tree -y
# Back up ~/.bashrc under a timestamped name before modifying it
cp -rp ~/.bashrc ~/.bashrc."$(date +"%F_%H-%M-%S")"
# Enlarge the shell history size
sed -i "s/HISTSIZE=1000/HISTSIZE=999999999/" ~/.bashrc
# Append the convenience aliases to the end of ~/.bashrc. Appending (instead of
# inserting after a fixed line number) also works when the file is shorter than
# expected, and the quoted heredoc keeps the inner quotes of the date format.
# The leading blank line protects against a missing trailing newline in the file.
cat >> ~/.bashrc <<'EOF'

alias grep='grep --color'
alias ll='ls -lhrt'
alias vi='vim'
alias datebak='date +"%F_%H-%M-%S"'
EOF
# Reload the configuration; the absolute path works from any working directory
source ~/.bashrc
# Packages for the PyTorch training examples
pip install \
  torch \
  datasets \
  tokenizers \
  transformers
# Packages for the DeepSpeed training examples
pip install \
  torch \
  deepspeed \
  datasets \
  transformers
# Upgrade the huggingface_hub client to the latest available release
pip install --upgrade huggingface_hub
# Looking in indexes: https://repo.huaweicloud.com/repository/pypi/simple
# Requirement already satisfied: huggingface_hub in /root/miniconda3/lib/python3.12/site-packages (0.35.3)
# Requirement already satisfied: filelock in /root/miniconda3/lib/python3.12/site-packages (from huggingface_hub) (3.18.0)
# Requirement already satisfied: fsspec>=2023.5.0 in /root/miniconda3/lib/python3.12/site-packages (from huggingface_hub) (2025.7.0)
# Requirement already satisfied: packaging>=20.9 in /root/miniconda3/lib/python3.12/site-packages (from huggingface_hub) (23.2)
# Requirement already satisfied: pyyaml>=5.1 in /root/miniconda3/lib/python3.12/site-packages (from huggingface_hub) (6.0.2)
# Requirement already satisfied: requests in /root/miniconda3/lib/python3.12/site-packages (from huggingface_hub) (2.32.5)
# Requirement already satisfied: tqdm>=4.42.1 in /root/miniconda3/lib/python3.12/site-packages (from huggingface_hub) (4.67.1)
# Requirement already satisfied: typing-extensions>=3.7.4.3 in /root/miniconda3/lib/python3.12/site-packages (from huggingface_hub) (4.14.1)
# Requirement already satisfied: hf-xet<2.0.0,>=1.1.3 in /root/miniconda3/lib/python3.12/site-packages (from huggingface_hub) (1.1.10)
# Requirement already satisfied: charset_normalizer<4,>=2 in /root/miniconda3/lib/python3.12/site-packages (from requests->huggingface_hub) (2.0.4)
# Requirement already satisfied: idna<4,>=2.5 in /root/miniconda3/lib/python3.12/site-packages (from requests->huggingface_hub) (3.7)
# Requirement already satisfied: urllib3<3,>=1.21.1 in /root/miniconda3/lib/python3.12/site-packages (from requests->huggingface_hub) (2.1.0)
# Requirement already satisfied: certifi>=2017.4.17 in /root/miniconda3/lib/python3.12/site-packages (from requests->huggingface_hub) (2024.2.2)
# WARNING: Running pip as the 'root' user can result in broken permissions and conflicting behaviour with the system package manager. It is recommended to use a virtual environment instead: https://pip.pypa.io/warnings/venv
# Point Hugging Face downloads at the hf-mirror.com endpoint for this shell
export HF_ENDPOINT=https://hf-mirror.com
# Persist the setting for future shells. The grep guard skips the append when the
# exact line is already present, so re-running this step does not add duplicates.
grep -qxF 'export HF_ENDPOINT=https://hf-mirror.com' ~/.bashrc \
  || echo 'export HF_ENDPOINT=https://hf-mirror.com' >> ~/.bashrc
# Create the dataset cache directory (option placed before the operand)
mkdir -p ~/.cache/huggingface/datasets
# Download the wikitext dataset into the cache directory.
# NOTE: the CLI reports that 'huggingface-cli download' and --resume-download are
# deprecated in favour of 'hf download'; the alternative form is kept below.
# hf download --repo-type dataset --resume-download wikitext --local-dir ~/.cache/huggingface/datasets/wikitext
huggingface-cli download --repo-type dataset --resume-download wikitext --local-dir ~/.cache/huggingface/datasets/wikitext
# ⚠️ Warning: 'huggingface-cli download' is deprecated. Use 'hf download' instead.
# /root/miniconda3/lib/python3.12/site-packages/huggingface_hub/file_download.py:945: FutureWarning: `resume_download` is deprecated and will be removed in version 1.0.0. Downloads always resume when possible. If you want to force a new download, use `force_download=True`.
# warnings.warn(
# Fetching 16 files: 0%| | 0/16 [00:00<?, ?it/s]Downloading 'wikitext-103-v1/train-00000-of-00002.parquet' to '/root/.cache/huggingface/datasets/wikitext/.cache/huggingface/download/wikitext-103-v1/U-Bqi1ccYPaG7kHCF5eNyOm2leU=.c2ecca8c3250e79518e45d125f3a9a757d8014f6b2d8435c602be87c1f79ec3b.incomplete'
# Downloading 'wikitext-103-raw-v1/train-00000-of-00002.parquet' to '/root/.cache/huggingface/datasets/wikitext/.cache/huggingface/download/wikitext-103-raw-v1/U-Bqi1ccYPaG7kHCF5eNyOm2leU=.74da360f23826045b3e6ac6375411fdb15f003030aa74f2596ed08b857cb9212.incomplete'
# Downloading 'wikitext-103-raw-v1/validation-00000-of-00001.parquet' to '/root/.cache/huggingface/datasets/wikitext/.cache/huggingface/download/wikitext-103-raw-v1/JcQKHXJQlY5NyliaxdyIOjWMSUk=.204929b7ff9d6184953f867dedb860e40aa69c078fc1e54b3baaa8fb28511c4c.incomplete'
# Downloading 'wikitext-103-v1/test-00000-of-00001.parquet' to '/root/.cache/huggingface/datasets/wikitext/.cache/huggingface/download/wikitext-103-v1/iln7FAK8ScOKT0OL8tHV8680tg8=.abdfc9f83b1103b502924072460d4c92f277c9b49c313cef3e48cfcf7428e125.incomplete'
# Downloading 'wikitext-103-raw-v1/train-00001-of-00002.parquet' to '/root/.cache/huggingface/datasets/wikitext/.cache/huggingface/download/wikitext-103-raw-v1/4PAW2js2aor0N0glxWM5DEwzF3M=.ba090ac30dbf5461e8dcbdd1a1b8e6f3cf9c2c756d64f0c1220450acd514f720.incomplete' | 0.00/156M [00:00<?, ?B/s]
# Downloading '.gitattributes' to '/root/.cache/huggingface/datasets/wikitext/.cache/huggingface/download/wPaCkH-WbT7GsmxMKKrNZTV4nSM=.957b2579c6ef20995a09efd9a17f8fd90606f5ed.incomplete'0002.parquet: 7%|██████▌ | 10.5M/156M [00:02<00:34, 4.21MB/s]
# .gitattributes: 1.17kB [00:00, 753kB/s] | 0.00/157M [00:00<?, ?B/s]
# Download complete. Moving file to /root/.cache/huggingface/datasets/wikitext/.gitattributes
# Fetching 16 files: 6%|███████ | 1/16 [00:13<03:18, 13.24s/itDownloading 'wikitext-103-v1/train-00001-of-00002.parquet' to '/root/.cache/huggingface/datasets/wikitext/.cache/huggingface/download/wikitext-103-v1/4PAW2js2aor0N0glxWM5DEwzF3M=.720f2503551f33c25bb822aad74d699fee4d5331a7373d0c262f1bfb01354fcf.incomplete' | 21.0M/156M [00:04<00:28, 4.79MB/s]
# train-00001-of-00002.parquet: 7%|██████▍ | 10.5M/157M [00:02<00:37, 3.95MB/sDownloading 'README.md' to '/root/.cache/huggingface/datasets/wikitext/.cache/huggingface/download/Xn7B-BWUGOee2Y6hCZtEhtFu4BE=.2a4fec2bc8df76c9d4da1c8e8865b625eb221c76.incomplete'
# README.md: 10.5kB [00:00, 1.64MB/s]████████████▉ | 21.0M/157M [00:03<00:20, 6.64MB/s]
# Download complete. Moving file to /root/.cache/huggingface/datasets/wikitext/README.md
# Fetching 16 files: 12%|██████████████▏ | 2/16 [00:15<01:36, 6.86s/itDownloading 'wikitext-103-v1/validation-00000-of-00001.parquet' to '/root/.cache/huggingface/datasets/wikitext/.cache/huggingface/download/wikitext-103-v1/JcQKHXJQlY5NyliaxdyIOjWMSUk=.a586125adab06f115018c43507ac267ea70850ce6218cbb96e08bb3b4db0899b.incomplete'
# train-00001-of-00002.parquet: 27%|█████████████████████████▉ | 41.9M/157M [00:05<00:13, 8.47MB/sDownloading 'wikitext-103-raw-v1/test-00000-of-00001.parquet' to '/root/.cache/huggingface/datasets/wikitext/.cache/huggingface/download/wikitext-103-raw-v1/iln7FAK8ScOKT0OL8tHV8680tg8=.5f1bea067869d04849c0f975a2b29c4ff47d867f484f5010ea5e861eab246d91.incomplete' | 41.9M/156M [00:08<00:22, 5.00MB/s]
# validation-00000-of-00001.parquet: 100%|██████████████████████████████████████████████████████████████████████████████████████████████| 657k/657k [00:00<00:00, 984kB/s]
# Download complete. Moving file to /root/.cache/huggingface/datasets/wikitext/wikitext-103-raw-v1/validation-00000-of-00001.parquet | 115M/157M [00:10<00:04, 9.39MB/s]
# Downloading 'wikitext-2-raw-v1/test-00000-of-00001.parquet' to '/root/.cache/huggingface/datasets/wikitext/.cache/huggingface/download/wikitext-2-raw-v1/iln7FAK8ScOKT0OL8tHV8680tg8=.5f1bea067869d04849c0f975a2b29c4ff47d867f484f5010ea5e861eab246d91.incomplete' | 52.4M/156M [00:10<00:20, 4.99MB/s]
# test-00000-of-00001.parquet: 100%|███████████████████████████████████████████████████████████████████████████████████████████████████| 733k/733k [00:00<00:00, 1.94MB/s]
# Download complete. Moving file to /root/.cache/huggingface/datasets/wikitext/wikitext-2-raw-v1/test-00000-of-00001.parquet
# Downloading 'wikitext-2-raw-v1/train-00000-of-00001.parquet' to '/root/.cache/huggingface/datasets/wikitext/.cache/huggingface/download/wikitext-2-raw-v1/jhdVX7QxSai5xYQbu6FpvD3SSxM=.e83889baabc497075506f91975be5fac0d45c5290b6b20582c8cd1e853d0c9f7.incomplete'███████████████████████████████████████████| 733k/733k [00:00<00:00, 1.95MB/s]
# test-00000-of-00001.parquet: 100%|████████████████████████████████████████████████████████████████████████████████████████████████████| 722k/722k [00:00<00:00, 743kB/s]
# Download complete. Moving file to /root/.cache/huggingface/datasets/wikitext/wikitext-103-v1/test-00000-of-00001.parquet
# Downloading 'wikitext-2-raw-v1/validation-00000-of-00001.parquet' to '/root/.cache/huggingface/datasets/wikitext/.cache/huggingface/download/wikitext-2-raw-v1/JcQKHXJQlY5NyliaxdyIOjWMSUk=.204929b7ff9d6184953f867dedb860e40aa69c078fc1e54b3baaa8fb28511c4c.incomplete' | 62.9M/156M [00:13<00:20, 4.52MB/s]
# train-00000-of-00002.parquet: 100%|██████████████████████████████████████████████████████████████████████████████████████████████████| 157M/157M [00:14<00:00, 11.2MB/s]
# Download complete. Moving file to /root/.cache/huggingface/datasets/wikitext/wikitext-103-raw-v1/train-00000-of-00002.parquet████████| 157M/157M [00:14<00:00, 10.7MB/s]
# validation-00000-of-00001.parquet: 100%|█████████████████████████████████████████████████████████████████████████████████████████████| 657k/657k [00:00<00:00, 1.94MB/s]
# Download complete. Moving file to /root/.cache/huggingface/datasets/wikitext/wikitext-2-raw-v1/validation-00000-of-00001.parquet
# train-00000-of-00001.parquet: 100%|████████████████████████████████████████████████████████████████████████████████████████████████| 6.36M/6.36M [00:01<00:00, 3.37MB/s]
# Download complete. Moving file to /root/.cache/huggingface/datasets/wikitext/wikitext-2-raw-v1/train-00000-of-00001.parquet | 0.00/657k [00:00<?, ?B/s]
# Downloading 'wikitext-2-v1/test-00000-of-00001.parquet' to '/root/.cache/huggingface/datasets/wikitext/.cache/huggingface/download/wikitext-2-v1/iln7FAK8ScOKT0OL8tHV8680tg8=.e6b3913da714b63a60a571698b20ff15441fb015783ea1b5285f707d4f2f00a9.incomplete'██████████████████████████████████████████████████| 6.36M/6.36M [00:01<00:00, 3.38MB/s]
# Downloading 'wikitext-2-v1/train-00000-of-00001.parquet' to '/root/.cache/huggingface/datasets/wikitext/.cache/huggingface/download/wikitext-2-v1/jhdVX7QxSai5xYQbu6FpvD3SSxM=.dfc27e4360c639dc1fba1e403bfffd53af4a5c75d5363b5724d49bf12d07cce6.incomplete'
# test-00000-of-00001.parquet: 100%|███████████████████████████████████████████████████████████████████████████████████████████████████| 685k/685k [00:00<00:00, 26.7MB/s]
# Download complete. Moving file to /root/.cache/huggingface/datasets/wikitext/wikitext-2-v1/test-00000-of-00001.parquet | 0.00/685k [00:00<?, ?B/s]
# Downloading 'wikitext-2-v1/validation-00000-of-00001.parquet' to '/root/.cache/huggingface/datasets/wikitext/.cache/huggingface/download/wikitext-2-v1/JcQKHXJQlY5NyliaxdyIOjWMSUk=.717de9a0c1c0b0b1dfdd8f1e6ad8a30ece618bbde81f5da8207277547d324215.incomplete'
# validation-00000-of-00001.parquet: 100%|█████████████████████████████████████████████████████████████████████████████████████████████| 618k/618k [00:00<00:00, 10.3MB/s]
# Download complete. Moving file to /root/.cache/huggingface/datasets/wikitext/wikitext-2-v1/validation-00000-of-00001.parquet
# train-00000-of-00001.parquet: 100%|████████████████████████████████████████████████████████████████████████████████████████████████| 6.07M/6.07M [00:00<00:00, 8.36MB/s]
# Download complete. Moving file to /root/.cache/huggingface/datasets/wikitext/wikitext-2-v1/train-00000-of-00001.parquet████████████| 6.07M/6.07M [00:00<00:00, 8.44MB/s]
# validation-00000-of-00001.parquet: 100%|█████████████████████████████████████████████████████████████████████████████████████████████| 655k/655k [00:00<00:00, 1.62MB/s]
# Download complete. Moving file to /root/.cache/huggingface/datasets/wikitext/wikitext-103-v1/validation-00000-of-00001.parquet███████| 655k/655k [00:00<00:00, 1.63MB/s]
# train-00001-of-00002.parquet: 100%|██████████████████████████████████████████████████████████████████████████████████████████████████| 157M/157M [00:15<00:00, 9.87MB/s]
# Download complete. Moving file to /root/.cache/huggingface/datasets/wikitext/wikitext-103-raw-v1/train-00001-of-00002.parquet | 126M/157M [00:14<00:03, 10.3MB/s]
# test-00000-of-00001.parquet: 100%|████████████████████████████████████████████████████████████████████████████████████████████████████| 733k/733k [00:00<00:00, 797kB/s]
# Download complete. Moving file to /root/.cache/huggingface/datasets/wikitext/wikitext-103-raw-v1/test-00000-of-00001.parquet██████████| 733k/733k [00:00<00:00, 799kB/s]
# train-00000-of-00002.parquet: 100%|██████████████████████████████████████████████████████████████████████████████████████████████████| 156M/156M [00:26<00:00, 5.79MB/s]
# Download complete. Moving file to /root/.cache/huggingface/datasets/wikitext/wikitext-103-v1/train-00000-of-00002.parquet | 115M/156M [00:07<00:02, 16.3MB/s]
# train-00001-of-00002.parquet: 100%|██████████████████████████████████████████████████████████████████████████████████████████████████| 156M/156M [00:09<00:00, 17.1MB/s]
# Download complete. Moving file to /root/.cache/huggingface/datasets/wikitext/wikitext-103-v1/train-00001-of-00002.parquet████████████| 156M/156M [00:09<00:00, 21.2MB/s]
# Fetching 16 files: 100%|████████████████████████████████████████████████████████████████████████████████████████████████████████████████| 16/16 [00:37<00:00, 2.37s/it]
# /root/.cache/huggingface/datasets/wikitext
基于ddp
单机多GPU训练-DDP
bash
# Environment for the DDP run on this node
export CUDA_DEVICE_ORDER=PCI_BUS_ID
export TOKENIZERS_PARALLELISM=false
export WANDB_DISABLED=1
# Disable NCCL peer-to-peer and InfiniBand transports
export NCCL_P2P_DISABLE=1
export NCCL_IB_DISABLE=1
export TORCH_NCCL_BLOCKING_WAIT=1
# Print NCCL's informational log lines during the run
export NCCL_DEBUG=INFO
export PYTORCH_NO_IPV6=1
# To restrict the run to specific GPUs (this host exposes indices 0 and 1):
# export CUDA_VISIBLE_DEVICES=0,1
# Show the resulting settings (grep -E replaces the obsolescent egrep)
env | grep -E "CUDA|NCCL|WANDB|TOKEN"
# NCCL_P2P_DISABLE=1
# WANDB_DISABLED=1
# NCCL_DEBUG=INFO
# TORCH_NCCL_BLOCKING_WAIT=1
# TOKENIZERS_PARALLELISM=false
# CUDA_DEVICE_ORDER=PCI_BUS_ID
# NCCL_IB_DISABLE=1
# Launch training with two worker processes on this node. The && keeps torchrun
# from starting in the wrong directory if the cd fails.
cd /data/llm-in-practise/LLM_Distributed_Trainning/PyTorch/ddp_basics/ \
  && torchrun --nproc_per_node=2 ddp_gpt_wikitext2.py
# 2025-10-22 16:47:01,296 - INFO - 加载 WikiText 数据集...
# 2025-10-22 16:47:14,538 - INFO - 运行信息 is_distributed=True, rank=0, world_size=2, local_rank=0
# 2025-10-22 16:47:14,538 - INFO - 使用设备: cuda:0
# 2025-10-22 16:47:14,538 - INFO - 加载 WikiText 数据集...
# test-00000-of-00001.parquet: 100%|███████████████████████████████████████████████████████████████████████████████████████████████████| 733k/733k [00:00<00:00, 1.07MB/s]
# train-00000-of-00001.parquet: 100%|████████████████████████████████████████████████████████████████████████████████████████████████| 6.36M/6.36M [00:00<00:00, 9.38MB/s]
# validation-00000-of-00001.parquet: 100%|█████████████████████████████████████████████████████████████████████████████████████████████| 657k/657k [00:00<00:00, 41.3MB/s]
# Generating test split: 100%|█████████████████████████████████████████████████████████████████████████████████████████████| 4358/4358 [00:00<00:00, 272634.45 examples/s]
# Generating train split: 100%|██████████████████████████████████████████████████████████████████████████████████████████| 36718/36718 [00:00<00:00, 375998.75 examples/s]
# Generating validation split: 100%|███████████████████████████████████████████████████████████████████████████████████████| 3760/3760 [00:00<00:00, 299507.80 examples/s]
# 2025-10-22 16:47:42,316 - INFO - 加载 23767 条非空文本
# 2025-10-22 16:47:43,565 - INFO - 加载 23767 条非空文本
# 2025-10-22 16:47:43,796 - INFO - 编码文本...
# 2025-10-22 16:47:44,870 - INFO - BERT 分词器词汇大小: 30522
# 2025-10-22 16:47:44,870 - INFO - 编码文本...
# 2025-10-22 16:48:01,158 - INFO - 总 token 数=2301731, 块数=8991
# 2025-10-22 16:48:02,445 - INFO - 总 token 数=2301731, 块数=8991
# autodl-container-fa4746a7fc-c6053b3e:2888:2888 [0] NCCL INFO Bootstrap: Using eth0:172.17.0.10<0>
# autodl-container-fa4746a7fc-c6053b3e:2888:2888 [0] NCCL INFO cudaDriverVersion 13000
# autodl-container-fa4746a7fc-c6053b3e:2888:2888 [0] NCCL INFO NCCL version 2.27.3+cuda12.9
# autodl-container-fa4746a7fc-c6053b3e:2888:2888 [0] NCCL INFO Comm config Blocking set to 1
# autodl-container-fa4746a7fc-c6053b3e:2889:2889 [1] NCCL INFO cudaDriverVersion 13000
# autodl-container-fa4746a7fc-c6053b3e:2889:2889 [1] NCCL INFO Bootstrap: Using eth0:172.17.0.10<0>
# autodl-container-fa4746a7fc-c6053b3e:2889:2889 [1] NCCL INFO NCCL version 2.27.3+cuda12.9
# autodl-container-fa4746a7fc-c6053b3e:2889:2889 [1] NCCL INFO Comm config Blocking set to 1
# autodl-container-fa4746a7fc-c6053b3e:2888:3130 [0] NCCL INFO NET/Plugin: Could not find: libnccl-net.so.
# autodl-container-fa4746a7fc-c6053b3e:2888:3130 [0] NCCL INFO NCCL_IB_DISABLE set by environment to 1.
# autodl-container-fa4746a7fc-c6053b3e:2888:3130 [0] NCCL INFO NET/Socket : Using [0]eth0:172.17.0.10<0>
# autodl-container-fa4746a7fc-c6053b3e:2888:3130 [0] NCCL INFO Initialized NET plugin Socket
# autodl-container-fa4746a7fc-c6053b3e:2888:3130 [0] NCCL INFO Assigned NET plugin Socket to comm
# autodl-container-fa4746a7fc-c6053b3e:2888:3130 [0] NCCL INFO Using network Socket
# autodl-container-fa4746a7fc-c6053b3e:2888:3130 [0] NCCL INFO ncclCommInitRankConfig comm 0xcc1d4c0 rank 0 nranks 2 cudaDev 0 nvmlDev 0 busId a8000 commId 0x34d31444f1f50488 - Init START
# autodl-container-fa4746a7fc-c6053b3e:2889:3131 [1] NCCL INFO NET/Plugin: Could not find: libnccl-net.so.
# autodl-container-fa4746a7fc-c6053b3e:2889:3131 [1] NCCL INFO NCCL_IB_DISABLE set by environment to 1.
# autodl-container-fa4746a7fc-c6053b3e:2889:3131 [1] NCCL INFO NET/Socket : Using [0]eth0:172.17.0.10<0>
# autodl-container-fa4746a7fc-c6053b3e:2889:3131 [1] NCCL INFO Initialized NET plugin Socket
# autodl-container-fa4746a7fc-c6053b3e:2889:3131 [1] NCCL INFO Assigned NET plugin Socket to comm
# autodl-container-fa4746a7fc-c6053b3e:2889:3131 [1] NCCL INFO Using network Socket
# autodl-container-fa4746a7fc-c6053b3e:2889:3131 [1] NCCL INFO ncclCommInitRankConfig comm 0xd1fee90 rank 1 nranks 2 cudaDev 1 nvmlDev 1 busId b8000 commId 0x34d31444f1f50488 - Init START
# autodl-container-fa4746a7fc-c6053b3e:2889:3131 [1] NCCL INFO RAS client listening socket at ::1<28028>
# autodl-container-fa4746a7fc-c6053b3e:2888:3130 [0] NCCL INFO RAS client listening socket at ::1<28028>
# autodl-container-fa4746a7fc-c6053b3e:2888:3130 [0] NCCL INFO Bootstrap timings total 0.003660 (create 0.000024, send 0.000083, recv 0.003176, ring 0.000014, delay 0.000000)
# autodl-container-fa4746a7fc-c6053b3e:2889:3131 [1] NCCL INFO Bootstrap timings total 0.000615 (create 0.000019, send 0.000069, recv 0.000213, ring 0.000012, delay 0.000000)
# autodl-container-fa4746a7fc-c6053b3e:2889:3131 [1] NCCL INFO NCCL_P2P_DISABLE set by environment to 1
# autodl-container-fa4746a7fc-c6053b3e:2888:3130 [0] NCCL INFO NCCL_P2P_DISABLE set by environment to 1
# autodl-container-fa4746a7fc-c6053b3e:2889:3131 [1] NCCL INFO Setting affinity for GPU 1 to 52-103,156-207
# autodl-container-fa4746a7fc-c6053b3e:2888:3130 [0] NCCL INFO Setting affinity for GPU 0 to 52-103,156-207
# autodl-container-fa4746a7fc-c6053b3e:2888:3130 [0] NCCL INFO comm 0xcc1d4c0 rank 0 nRanks 2 nNodes 1 localRanks 2 localRank 0 MNNVL 0
# autodl-container-fa4746a7fc-c6053b3e:2889:3131 [1] NCCL INFO comm 0xd1fee90 rank 1 nRanks 2 nNodes 1 localRanks 2 localRank 1 MNNVL 0
# autodl-container-fa4746a7fc-c6053b3e:2888:3130 [0] NCCL INFO Channel 00/04 : 0 1
# autodl-container-fa4746a7fc-c6053b3e:2889:3131 [1] NCCL INFO Trees [0] -1/-1/-1->1->0 [1] 0/-1/-1->1->-1 [2] -1/-1/-1->1->0 [3] 0/-1/-1->1->-1
# autodl-container-fa4746a7fc-c6053b3e:2888:3130 [0] NCCL INFO Channel 01/04 : 0 1
# autodl-container-fa4746a7fc-c6053b3e:2889:3131 [1] NCCL INFO P2P Chunksize set to 131072
# autodl-container-fa4746a7fc-c6053b3e:2888:3130 [0] NCCL INFO Channel 02/04 : 0 1
# autodl-container-fa4746a7fc-c6053b3e:2888:3130 [0] NCCL INFO Channel 03/04 : 0 1
# autodl-container-fa4746a7fc-c6053b3e:2888:3130 [0] NCCL INFO Trees [0] 1/-1/-1->0->-1 [1] -1/-1/-1->0->1 [2] 1/-1/-1->0->-1 [3] -1/-1/-1->0->1
# autodl-container-fa4746a7fc-c6053b3e:2888:3130 [0] NCCL INFO P2P Chunksize set to 131072
# autodl-container-fa4746a7fc-c6053b3e:2889:3131 [1] NCCL INFO PROFILER/Plugin: Could not find: libnccl-profiler.so.
# autodl-container-fa4746a7fc-c6053b3e:2888:3130 [0] NCCL INFO PROFILER/Plugin: Could not find: libnccl-profiler.so.
# autodl-container-fa4746a7fc-c6053b3e:2888:3130 [0] NCCL INFO Check P2P Type isAllDirectP2p 0 directMode 0
# autodl-container-fa4746a7fc-c6053b3e:2889:3135 [1] NCCL INFO [Proxy Service] Device 1 CPU core 86
# autodl-container-fa4746a7fc-c6053b3e:2888:3134 [0] NCCL INFO [Proxy Service] Device 0 CPU core 73
# autodl-container-fa4746a7fc-c6053b3e:2889:3137 [1] NCCL INFO [Proxy Service UDS] Device 1 CPU core 192
# autodl-container-fa4746a7fc-c6053b3e:2888:3136 [0] NCCL INFO [Proxy Service UDS] Device 0 CPU core 178
# autodl-container-fa4746a7fc-c6053b3e:2889:3131 [1] NCCL INFO threadThresholds 8/8/64 | 16/8/64 | 512 | 512
# autodl-container-fa4746a7fc-c6053b3e:2889:3131 [1] NCCL INFO 4 coll channels, 4 collnet channels, 0 nvls channels, 4 p2p channels, 2 p2p channels per peer
# autodl-container-fa4746a7fc-c6053b3e:2888:3130 [0] NCCL INFO threadThresholds 8/8/64 | 16/8/64 | 512 | 512
# autodl-container-fa4746a7fc-c6053b3e:2888:3130 [0] NCCL INFO 4 coll channels, 4 collnet channels, 0 nvls channels, 4 p2p channels, 2 p2p channels per peer
# autodl-container-fa4746a7fc-c6053b3e:2888:3130 [0] NCCL INFO CC Off, workFifoBytes 1048576
# autodl-container-fa4746a7fc-c6053b3e:2889:3131 [1] NCCL INFO TUNER/Plugin: Could not find: libnccl-tuner.so. Using internal tuner plugin.
# autodl-container-fa4746a7fc-c6053b3e:2888:3130 [0] NCCL INFO TUNER/Plugin: Could not find: libnccl-tuner.so. Using internal tuner plugin.
# autodl-container-fa4746a7fc-c6053b3e:2889:3131 [1] NCCL INFO ncclCommInitRankConfig comm 0xd1fee90 rank 1 nranks 2 cudaDev 1 nvmlDev 1 busId b8000 commId 0x34d31444f1f50488 - Init COMPLETE
# autodl-container-fa4746a7fc-c6053b3e:2888:3130 [0] NCCL INFO ncclCommInitRankConfig comm 0xcc1d4c0 rank 0 nranks 2 cudaDev 0 nvmlDev 0 busId a8000 commId 0x34d31444f1f50488 - Init COMPLETE
# autodl-container-fa4746a7fc-c6053b3e:2889:3131 [1] NCCL INFO Init timings - ncclCommInitRankConfig: rank 1 nranks 2 total 0.26 (kernels 0.22, alloc 0.00, bootstrap 0.00, allgathers 0.00, topo 0.03, graphs 0.00, connections 0.00, rest 0.00)
# autodl-container-fa4746a7fc-c6053b3e:2888:3130 [0] NCCL INFO Init timings - ncclCommInitRankConfig: rank 0 nranks 2 total 0.26 (kernels 0.22, alloc 0.00, bootstrap 0.00, allgathers 0.00, topo 0.03, graphs 0.00, connections 0.00, rest 0.00)
# autodl-container-fa4746a7fc-c6053b3e:2889:3139 [1] NCCL INFO Channel 00 : 1[1] -> 0[0] via SHM/direct/direct
# autodl-container-fa4746a7fc-c6053b3e:2889:3139 [1] NCCL INFO Channel 01 : 1[1] -> 0[0] via SHM/direct/direct
# autodl-container-fa4746a7fc-c6053b3e:2889:3139 [1] NCCL INFO Channel 02 : 1[1] -> 0[0] via SHM/direct/direct
# autodl-container-fa4746a7fc-c6053b3e:2889:3139 [1] NCCL INFO Channel 03 : 1[1] -> 0[0] via SHM/direct/direct
# autodl-container-fa4746a7fc-c6053b3e:2888:3138 [0] NCCL INFO Channel 00 : 0[0] -> 1[1] via SHM/direct/direct
# autodl-container-fa4746a7fc-c6053b3e:2888:3138 [0] NCCL INFO Channel 01 : 0[0] -> 1[1] via SHM/direct/direct
# autodl-container-fa4746a7fc-c6053b3e:2888:3138 [0] NCCL INFO Channel 02 : 0[0] -> 1[1] via SHM/direct/direct
# autodl-container-fa4746a7fc-c6053b3e:2888:3138 [0] NCCL INFO Channel 03 : 0[0] -> 1[1] via SHM/direct/direct
# autodl-container-fa4746a7fc-c6053b3e:2889:3139 [1] NCCL INFO Connected all rings, use ring PXN 0 GDR 1
# autodl-container-fa4746a7fc-c6053b3e:2888:3138 [0] NCCL INFO Connected all rings, use ring PXN 0 GDR 1
# 2025-10-22 16:48:06,419 - INFO - Epoch 1/3, Batch 50, Avg Loss (last 50): 7.7746
# 2025-10-22 16:48:08,860 - INFO - Epoch 1/3, Batch 100, Avg Loss (last 50): 7.2401
# 2025-10-22 16:48:11,303 - INFO - Epoch 1/3, Batch 150, Avg Loss (last 50): 7.2485
# 2025-10-22 16:48:13,741 - INFO - Epoch 1/3, Batch 200, Avg Loss (last 50): 7.2272
# 2025-10-22 16:48:16,182 - INFO - Epoch 1/3, Batch 250, Avg Loss (last 50): 7.2314
# 2025-10-22 16:48:17,742 - INFO - Epoch 1/3, Avg Loss: 7.3336
# 2025-10-22 16:48:20,515 - INFO - Epoch 2/3, Batch 50, Avg Loss (last 50): 7.2289
# 2025-10-22 16:48:22,945 - INFO - Epoch 2/3, Batch 100, Avg Loss (last 50): 7.2085
# 2025-10-22 16:48:25,380 - INFO - Epoch 2/3, Batch 150, Avg Loss (last 50): 7.2225
# 2025-10-22 16:48:27,819 - INFO - Epoch 2/3, Batch 200, Avg Loss (last 50): 7.2439
# 2025-10-22 16:48:30,259 - INFO - Epoch 2/3, Batch 250, Avg Loss (last 50): 7.2389
# 2025-10-22 16:48:31,837 - INFO - Epoch 2/3, Avg Loss: 7.2261
# 2025-10-22 16:48:34,663 - INFO - Epoch 3/3, Batch 50, Avg Loss (last 50): 7.2155
# 2025-10-22 16:48:37,126 - INFO - Epoch 3/3, Batch 100, Avg Loss (last 50): 7.2041
# 2025-10-22 16:48:39,588 - INFO - Epoch 3/3, Batch 150, Avg Loss (last 50): 7.1370
# 2025-10-22 16:48:42,053 - INFO - Epoch 3/3, Batch 200, Avg Loss (last 50): 7.0875
# 2025-10-22 16:48:44,518 - INFO - Epoch 3/3, Batch 250, Avg Loss (last 50): 7.0187
# 2025-10-22 16:48:46,086 - INFO - Epoch 3/3, Avg Loss: 7.1105
# 2025-10-22 16:48:46,593 - INFO - 最终模型已保存至 models/final_model.pth
# autodl-container-fa4746a7fc-c6053b3e:2889:2889 [1] NCCL INFO comm 0xd1fee90 rank 1 nranks 2 cudaDev 1 busId b8000 - Destroy COMPLETE
# autodl-container-fa4746a7fc-c6053b3e:2888:2888 [0] NCCL INFO comm 0xcc1d4c0 rank 0 nranks 2 cudaDev 0 busId a8000 - Destroy COMPLETE
# Verify the results: list the saved per-epoch checkpoints and the final model
tree checkpoints/ models/
# checkpoints/
# ├── model_epoch_1.pth
# ├── model_epoch_2.pth
# └── model_epoch_3.pth
# models/
# └── final_model.pth
# 0 directories, 1 file
多机多GPU训练-DDP
多机多GPU训练主机说明
主机01: 172.17.0.4 GPU: NVIDIA GeForce RTX 5090*2
主机02: 172.17.0.7 GPU: NVIDIA GeForce RTX 5090*2
bash
# Run on every node: switch to the example directory
cd /data/llm-in-practise/LLM_Distributed_Trainning/PyTorch/ddp_basics
# Set on every node: the network interface used for distributed communication
# when a host has several NICs. It must be a reachable interface (e.g. eth0 or
# ens3). Defaults to eth0, the interface NCCL reports in the logs of this
# walkthrough; export NCCL_SOCKET_IFNAME beforehand to use a different one.
export NCCL_SOCKET_IFNAME="${NCCL_SOCKET_IFNAME:-eth0}"
# Show the resulting settings (grep -E replaces the obsolescent egrep)
env | grep -E "CUDA|NCCL|WANDB|TOKEN"
# NCCL_P2P_DISABLE=1
# WANDB_DISABLED=1
# NCCL_DEBUG=INFO
# TORCH_NCCL_BLOCKING_WAIT=1
# TOKENIZERS_PARALLELISM=false
# CUDA_DEVICE_ORDER=PCI_BUS_ID
# NCCL_IB_DISABLE=1
# Confirm that all nodes run the same software versions.
# Run this on every node at the same time and compare the output.
python - <<'PY'
import torch

# Print the torch, NCCL and CUDA versions plus the number of visible GPUs.
print("torch", torch.__version__)
print("nccl version:", torch.cuda.nccl.version() if hasattr(torch.cuda, 'nccl') else "n/a")
print("cuda:", torch.version.cuda)
print("gpu count", torch.cuda.device_count())
PY
# torch 2.8.0+cu128
# nccl version: (2, 27, 3)
# cuda: 12.8
# gpu count 2
# Launch the training job: 2 nodes x 2 processes per node.
# Both nodes point at the same master address/port; only --node_rank differs.
# On host 01 (172.17.0.4):
torchrun \
  --nnodes=2 \
  --nproc_per_node=2 \
  --node_rank=0 \
  --master_addr=172.17.0.4 \
  --master_port=29500 \
  ddp_gpt_wikitext2.py --epochs 3 --batch_size 8
# On host 02 (172.17.0.7):
torchrun \
  --nnodes=2 \
  --nproc_per_node=2 \
  --node_rank=1 \
  --master_addr=172.17.0.4 \
  --master_port=29500 \
  ddp_gpt_wikitext2.py --epochs 3 --batch_size 8
# 主机01-172.17.0.4-运行结果:
# 2025-10-23 13:07:59,919 - INFO - 运行信息 is_distributed=True, rank=0, world_size=4, local_rank=0
# 2025-10-23 13:07:59,920 - INFO - 使用设备: cuda:0
# 2025-10-23 13:07:59,920 - INFO - 加载 WikiText 数据集...
# 2025-10-23 13:07:59,937 - INFO - 加载 WikiText 数据集...
# 2025-10-23 13:08:28,730 - INFO - 加载 23767 条非空文本
# 2025-10-23 13:08:29,867 - INFO - BERT 分词器词汇大小: 30522
# 2025-10-23 13:08:29,867 - INFO - 编码文本...
# 2025-10-23 13:08:44,002 - INFO - 加载 23767 条非空文本
# 2025-10-23 13:08:45,115 - INFO - 编码文本...
# 2025-10-23 13:08:47,272 - INFO - 总 token 数=2301731, 块数=8991
# autodl-container-fa4746a7fc-5df31a9e:1559:1559 [0] NCCL INFO Bootstrap: Using eth0:172.17.0.4<0>
# autodl-container-fa4746a7fc-5df31a9e:1559:1559 [0] NCCL INFO cudaDriverVersion 13000
# autodl-container-fa4746a7fc-5df31a9e:1559:1559 [0] NCCL INFO NCCL version 2.27.3+cuda12.9
# autodl-container-fa4746a7fc-5df31a9e:1559:1559 [0] NCCL INFO Comm config Blocking set to 1
# autodl-container-fa4746a7fc-5df31a9e:1559:1743 [0] NCCL INFO NET/Plugin: Could not find: libnccl-net.so.
# autodl-container-fa4746a7fc-5df31a9e:1559:1743 [0] NCCL INFO NCCL_IB_DISABLE set by environment to 1.
# autodl-container-fa4746a7fc-5df31a9e:1559:1743 [0] NCCL INFO NET/Socket : Using [0]eth0:172.17.0.4<0>
# autodl-container-fa4746a7fc-5df31a9e:1559:1743 [0] NCCL INFO Initialized NET plugin Socket
# autodl-container-fa4746a7fc-5df31a9e:1559:1743 [0] NCCL INFO Assigned NET plugin Socket to comm
# autodl-container-fa4746a7fc-5df31a9e:1559:1743 [0] NCCL INFO Using network Socket
# autodl-container-fa4746a7fc-5df31a9e:1559:1743 [0] NCCL INFO ncclCommInitRankConfig comm 0xe29a3c0 rank 0 nranks 4 cudaDev 0 nvmlDev 0 busId 38000 commId 0x992409a6fbc4ff39 - Init START
# 2025-10-23 13:09:02,490 - INFO - 总 token 数=2301731, 块数=8991
# autodl-container-fa4746a7fc-5df31a9e:1560:1560 [1] NCCL INFO cudaDriverVersion 13000
# autodl-container-fa4746a7fc-5df31a9e:1560:1560 [1] NCCL INFO Bootstrap: Using eth0:172.17.0.4<0>
# autodl-container-fa4746a7fc-5df31a9e:1560:1560 [1] NCCL INFO NCCL version 2.27.3+cuda12.9
# autodl-container-fa4746a7fc-5df31a9e:1560:1560 [1] NCCL INFO Comm config Blocking set to 1
# autodl-container-fa4746a7fc-5df31a9e:1560:1799 [1] NCCL INFO NET/Plugin: Could not find: libnccl-net.so.
# autodl-container-fa4746a7fc-5df31a9e:1560:1799 [1] NCCL INFO NCCL_IB_DISABLE set by environment to 1.
# autodl-container-fa4746a7fc-5df31a9e:1560:1799 [1] NCCL INFO NET/Socket : Using [0]eth0:172.17.0.4<0>
# autodl-container-fa4746a7fc-5df31a9e:1560:1799 [1] NCCL INFO Initialized NET plugin Socket
# autodl-container-fa4746a7fc-5df31a9e:1560:1799 [1] NCCL INFO Assigned NET plugin Socket to comm
# autodl-container-fa4746a7fc-5df31a9e:1560:1799 [1] NCCL INFO Using network Socket
# autodl-container-fa4746a7fc-5df31a9e:1560:1799 [1] NCCL INFO ncclCommInitRankConfig comm 0xdc97d50 rank 1 nranks 4 cudaDev 1 nvmlDev 1 busId a8000 commId 0x992409a6fbc4ff39 - Init START
# autodl-container-fa4746a7fc-5df31a9e:1560:1799 [1] NCCL INFO RAS client listening socket at ::1<28028>
# autodl-container-fa4746a7fc-5df31a9e:1559:1743 [0] NCCL INFO RAS client listening socket at ::1<28028>
# autodl-container-fa4746a7fc-5df31a9e:1560:1799 [1] NCCL INFO Bootstrap timings total 0.000968 (create 0.000024, send 0.000080, recv 0.000352, ring 0.000179, delay 0.000000)
# autodl-container-fa4746a7fc-5df31a9e:1559:1743 [0] NCCL INFO Bootstrap timings total 15.229525 (create 0.000022, send 0.000084, recv 15.228832, ring 0.000167, delay 0.000000)
# autodl-container-fa4746a7fc-5df31a9e:1559:1743 [0] NCCL INFO NCCL_P2P_DISABLE set by environment to 1
# autodl-container-fa4746a7fc-5df31a9e:1560:1799 [1] NCCL INFO NCCL_P2P_DISABLE set by environment to 1
# autodl-container-fa4746a7fc-5df31a9e:1560:1799 [1] NCCL INFO Setting affinity for GPU 1 to 52-103,156-207
# autodl-container-fa4746a7fc-5df31a9e:1559:1743 [0] NCCL INFO Setting affinity for GPU 0 to 0-51,104-155
# autodl-container-fa4746a7fc-5df31a9e:1560:1799 [1] NCCL INFO comm 0xdc97d50 rank 1 nRanks 4 nNodes 2 localRanks 2 localRank 1 MNNVL 0
# autodl-container-fa4746a7fc-5df31a9e:1560:1799 [1] NCCL INFO Trees [0] -1/-1/-1->1->0 [1] -1/-1/-1->1->0
# autodl-container-fa4746a7fc-5df31a9e:1560:1799 [1] NCCL INFO P2P Chunksize set to 131072
# autodl-container-fa4746a7fc-5df31a9e:1559:1743 [0] NCCL INFO comm 0xe29a3c0 rank 0 nRanks 4 nNodes 2 localRanks 2 localRank 0 MNNVL 0
# autodl-container-fa4746a7fc-5df31a9e:1559:1743 [0] NCCL INFO Channel 00/02 : 0 1 2 3
# autodl-container-fa4746a7fc-5df31a9e:1559:1743 [0] NCCL INFO Channel 01/02 : 0 1 2 3
# autodl-container-fa4746a7fc-5df31a9e:1559:1743 [0] NCCL INFO Trees [0] 1/2/-1->0->-1 [1] 1/-1/-1->0->2
# autodl-container-fa4746a7fc-5df31a9e:1559:1743 [0] NCCL INFO P2P Chunksize set to 131072
# autodl-container-fa4746a7fc-5df31a9e:1560:1799 [1] NCCL INFO PROFILER/Plugin: Could not find: libnccl-profiler.so.
# autodl-container-fa4746a7fc-5df31a9e:1560:1803 [1] NCCL INFO [Proxy Service UDS] Device 1 CPU core 97
# autodl-container-fa4746a7fc-5df31a9e:1560:1802 [1] NCCL INFO [Proxy Service] Device 1 CPU core 196
# autodl-container-fa4746a7fc-5df31a9e:1560:1799 [1] NCCL INFO threadThresholds 8/8/64 | 32/8/64 | 512 | 512
# autodl-container-fa4746a7fc-5df31a9e:1560:1799 [1] NCCL INFO 2 coll channels, 2 collnet channels, 0 nvls channels, 2 p2p channels, 2 p2p channels per peer
# autodl-container-fa4746a7fc-5df31a9e:1559:1743 [0] NCCL INFO PROFILER/Plugin: Could not find: libnccl-profiler.so.
# autodl-container-fa4746a7fc-5df31a9e:1559:1743 [0] NCCL INFO Check P2P Type isAllDirectP2p 0 directMode 0
# autodl-container-fa4746a7fc-5df31a9e:1559:1804 [0] NCCL INFO [Proxy Service] Device 0 CPU core 2
# autodl-container-fa4746a7fc-5df31a9e:1559:1805 [0] NCCL INFO [Proxy Service UDS] Device 0 CPU core 3
# autodl-container-fa4746a7fc-5df31a9e:1559:1743 [0] NCCL INFO threadThresholds 8/8/64 | 32/8/64 | 512 | 512
# autodl-container-fa4746a7fc-5df31a9e:1559:1743 [0] NCCL INFO 2 coll channels, 2 collnet channels, 0 nvls channels, 2 p2p channels, 2 p2p channels per peer
# autodl-container-fa4746a7fc-5df31a9e:1559:1743 [0] NCCL INFO CC Off, workFifoBytes 1048576
# autodl-container-fa4746a7fc-5df31a9e:1560:1799 [1] NCCL INFO TUNER/Plugin: Could not find: libnccl-tuner.so. Using internal tuner plugin.
# autodl-container-fa4746a7fc-5df31a9e:1560:1799 [1] NCCL INFO ncclCommInitRankConfig comm 0xdc97d50 rank 1 nranks 4 cudaDev 1 nvmlDev 1 busId a8000 commId 0x992409a6fbc4ff39 - Init COMPLETE
# autodl-container-fa4746a7fc-5df31a9e:1560:1799 [1] NCCL INFO Init timings - ncclCommInitRankConfig: rank 1 nranks 4 total 0.26 (kernels 0.22, alloc 0.00, bootstrap 0.00, allgathers 0.01, topo 0.02, graphs 0.00, connections 0.00, rest 0.00)
# autodl-container-fa4746a7fc-5df31a9e:1559:1743 [0] NCCL INFO TUNER/Plugin: Could not find: libnccl-tuner.so. Using internal tuner plugin.
# autodl-container-fa4746a7fc-5df31a9e:1559:1743 [0] NCCL INFO ncclCommInitRankConfig comm 0xe29a3c0 rank 0 nranks 4 cudaDev 0 nvmlDev 0 busId 38000 commId 0x992409a6fbc4ff39 - Init COMPLETE
# autodl-container-fa4746a7fc-5df31a9e:1559:1743 [0] NCCL INFO Init timings - ncclCommInitRankConfig: rank 0 nranks 4 total 15.48 (kernels 0.21, alloc 0.00, bootstrap 15.23, allgathers 0.01, topo 0.02, graphs 0.00, connections 0.00, rest 0.00)
# autodl-container-fa4746a7fc-5df31a9e:1560:1808 [1] NCCL INFO [Proxy Progress] Device 1 CPU core 200
# autodl-container-fa4746a7fc-5df31a9e:1560:1806 [1] NCCL INFO Channel 00/0 : 1[1] -> 2[0] [send] via NET/Socket/0
# autodl-container-fa4746a7fc-5df31a9e:1560:1806 [1] NCCL INFO Channel 01/0 : 1[1] -> 2[0] [send] via NET/Socket/0
# autodl-container-fa4746a7fc-5df31a9e:1559:1809 [0] NCCL INFO [Proxy Progress] Device 0 CPU core 5
# autodl-container-fa4746a7fc-5df31a9e:1559:1807 [0] NCCL INFO Channel 00/0 : 3[1] -> 0[0] [receive] via NET/Socket/0
# autodl-container-fa4746a7fc-5df31a9e:1559:1807 [0] NCCL INFO Channel 01/0 : 3[1] -> 0[0] [receive] via NET/Socket/0
# autodl-container-fa4746a7fc-5df31a9e:1559:1807 [0] NCCL INFO Channel 00 : 0[0] -> 1[1] via SHM/direct/direct
# autodl-container-fa4746a7fc-5df31a9e:1559:1807 [0] NCCL INFO Channel 01 : 0[0] -> 1[1] via SHM/direct/direct
# autodl-container-fa4746a7fc-5df31a9e:1560:1806 [1] NCCL INFO Connected all rings, use ring PXN 0 GDR 0
# autodl-container-fa4746a7fc-5df31a9e:1559:1807 [0] NCCL INFO Connected all rings, use ring PXN 0 GDR 0
# autodl-container-fa4746a7fc-5df31a9e:1560:1846 [1] NCCL INFO Channel 00 : 1[1] -> 0[0] via SHM/direct/direct
# autodl-container-fa4746a7fc-5df31a9e:1560:1846 [1] NCCL INFO Channel 01 : 1[1] -> 0[0] via SHM/direct/direct
# autodl-container-fa4746a7fc-5df31a9e:1559:1849 [0] NCCL INFO Channel 00/0 : 2[0] -> 0[0] [receive] via NET/Socket/0
# autodl-container-fa4746a7fc-5df31a9e:1559:1849 [0] NCCL INFO Channel 01/0 : 2[0] -> 0[0] [receive] via NET/Socket/0
# autodl-container-fa4746a7fc-5df31a9e:1559:1849 [0] NCCL INFO Channel 00/0 : 0[0] -> 2[0] [send] via NET/Socket/0
# autodl-container-fa4746a7fc-5df31a9e:1559:1849 [0] NCCL INFO Channel 01/0 : 0[0] -> 2[0] [send] via NET/Socket/0
# autodl-container-fa4746a7fc-5df31a9e:1559:1849 [0] NCCL INFO Connected all trees
# autodl-container-fa4746a7fc-5df31a9e:1560:1846 [1] NCCL INFO Connected all trees
# 2025-10-23 13:09:11,339 - INFO - Epoch 1/3, Batch 50, Avg Loss (last 50): 7.7722
# 2025-10-23 13:09:18,488 - INFO - Epoch 1/3, Batch 100, Avg Loss (last 50): 7.2346
# 2025-10-23 13:09:25,559 - INFO - Epoch 1/3, Batch 150, Avg Loss (last 50): 7.2517
# 2025-10-23 13:09:32,703 - INFO - Epoch 1/3, Batch 200, Avg Loss (last 50): 7.2352
# 2025-10-23 13:09:39,800 - INFO - Epoch 1/3, Batch 250, Avg Loss (last 50): 7.2328
# 2025-10-23 13:09:44,240 - INFO - Epoch 1/3, Avg Loss: 7.3363
# 2025-10-23 13:09:51,764 - INFO - Epoch 2/3, Batch 50, Avg Loss (last 50): 7.2402
# 2025-10-23 13:09:58,860 - INFO - Epoch 2/3, Batch 100, Avg Loss (last 50): 7.2281
# 2025-10-23 13:10:06,043 - INFO - Epoch 2/3, Batch 150, Avg Loss (last 50): 7.2145
# 2025-10-23 13:10:13,153 - INFO - Epoch 2/3, Batch 200, Avg Loss (last 50): 7.2229
# 2025-10-23 13:10:20,313 - INFO - Epoch 2/3, Batch 250, Avg Loss (last 50): 7.1556
# 2025-10-23 13:10:24,764 - INFO - Epoch 2/3, Avg Loss: 7.1967
# 2025-10-23 13:10:32,221 - INFO - Epoch 3/3, Batch 50, Avg Loss (last 50): 7.0414
# 2025-10-23 13:10:39,346 - INFO - Epoch 3/3, Batch 100, Avg Loss (last 50): 6.9660
# 2025-10-23 13:10:46,495 - INFO - Epoch 3/3, Batch 150, Avg Loss (last 50): 6.9307
# 2025-10-23 13:10:53,631 - INFO - Epoch 3/3, Batch 200, Avg Loss (last 50): 6.8737
# 2025-10-23 13:11:00,737 - INFO - Epoch 3/3, Batch 250, Avg Loss (last 50): 6.8102
# 2025-10-23 13:11:05,250 - INFO - Epoch 3/3, Avg Loss: 6.9075
# 2025-10-23 13:11:05,748 - INFO - 最终模型已保存至 models/final_model.pth
# autodl-container-fa4746a7fc-5df31a9e:1560:1560 [1] NCCL INFO comm 0xdc97d50 rank 1 nranks 4 cudaDev 1 busId a8000 - Destroy COMPLETE
# autodl-container-fa4746a7fc-5df31a9e:1559:1559 [0] NCCL INFO comm 0xe29a3c0 rank 0 nranks 4 cudaDev 0 busId 38000 - Destroy COMPLETE
# 主机02-172.17.0.7-运行结果:
# 2025-10-23 13:07:59,934 - INFO - 加载 WikiText 数据集...
# 2025-10-23 13:07:59,944 - INFO - 加载 WikiText 数据集...
# 2025-10-23 13:08:37,917 - INFO - 加载 23767 条非空文本
# 2025-10-23 13:08:40,075 - INFO - 编码文本...
# 2025-10-23 13:08:42,511 - INFO - 加载 23767 条非空文本
# 2025-10-23 13:08:43,673 - INFO - 编码文本...
# 2025-10-23 13:08:57,442 - INFO - 总 token 数=2301731, 块数=8991
# autodl-container-fa4746a7fc-c6053b3e:1505:1505 [0] NCCL INFO cudaDriverVersion 13000
# autodl-container-fa4746a7fc-c6053b3e:1505:1505 [0] NCCL INFO Bootstrap: Using eth0:172.17.0.7<0>
# autodl-container-fa4746a7fc-c6053b3e:1505:1505 [0] NCCL INFO NCCL version 2.27.3+cuda12.9
# autodl-container-fa4746a7fc-c6053b3e:1505:1505 [0] NCCL INFO Comm config Blocking set to 1
# autodl-container-fa4746a7fc-c6053b3e:1505:1697 [0] NCCL INFO NET/Plugin: Could not find: libnccl-net.so.
# autodl-container-fa4746a7fc-c6053b3e:1505:1697 [0] NCCL INFO NCCL_IB_DISABLE set by environment to 1.
# autodl-container-fa4746a7fc-c6053b3e:1505:1697 [0] NCCL INFO NET/Socket : Using [0]eth0:172.17.0.7<0>
# autodl-container-fa4746a7fc-c6053b3e:1505:1697 [0] NCCL INFO Initialized NET plugin Socket
# autodl-container-fa4746a7fc-c6053b3e:1505:1697 [0] NCCL INFO Assigned NET plugin Socket to comm
# autodl-container-fa4746a7fc-c6053b3e:1505:1697 [0] NCCL INFO Using network Socket
# autodl-container-fa4746a7fc-c6053b3e:1505:1697 [0] NCCL INFO ncclCommInitRankConfig comm 0xe2e5a00 rank 2 nranks 4 cudaDev 0 nvmlDev 0 busId 27000 commId 0x992409a6fbc4ff39 - Init START
# 2025-10-23 13:09:01,055 - INFO - 总 token 数=2301731, 块数=8991
# autodl-container-fa4746a7fc-c6053b3e:1506:1506 [1] NCCL INFO cudaDriverVersion 13000
# autodl-container-fa4746a7fc-c6053b3e:1506:1506 [1] NCCL INFO Bootstrap: Using eth0:172.17.0.7<0>
# autodl-container-fa4746a7fc-c6053b3e:1506:1506 [1] NCCL INFO NCCL version 2.27.3+cuda12.9
# autodl-container-fa4746a7fc-c6053b3e:1506:1506 [1] NCCL INFO Comm config Blocking set to 1
# autodl-container-fa4746a7fc-c6053b3e:1506:1749 [1] NCCL INFO NET/Plugin: Could not find: libnccl-net.so.
# autodl-container-fa4746a7fc-c6053b3e:1506:1749 [1] NCCL INFO NCCL_IB_DISABLE set by environment to 1.
# autodl-container-fa4746a7fc-c6053b3e:1506:1749 [1] NCCL INFO NET/Socket : Using [0]eth0:172.17.0.7<0>
# autodl-container-fa4746a7fc-c6053b3e:1506:1749 [1] NCCL INFO Initialized NET plugin Socket
# autodl-container-fa4746a7fc-c6053b3e:1506:1749 [1] NCCL INFO Assigned NET plugin Socket to comm
# autodl-container-fa4746a7fc-c6053b3e:1506:1749 [1] NCCL INFO Using network Socket
# autodl-container-fa4746a7fc-c6053b3e:1506:1749 [1] NCCL INFO ncclCommInitRankConfig comm 0xcec7350 rank 3 nranks 4 cudaDev 1 nvmlDev 1 busId c8000 commId 0x992409a6fbc4ff39 - Init START
# autodl-container-fa4746a7fc-c6053b3e:1506:1749 [1] NCCL INFO RAS client listening socket at ::1<28028>
# autodl-container-fa4746a7fc-c6053b3e:1505:1697 [0] NCCL INFO RAS client listening socket at ::1<28028>
# autodl-container-fa4746a7fc-c6053b3e:1505:1697 [0] NCCL INFO Bootstrap timings total 5.049287 (create 0.000024, send 0.000141, recv 3.615925, ring 0.000091, delay 0.000000)
# autodl-container-fa4746a7fc-c6053b3e:1506:1749 [1] NCCL INFO Bootstrap timings total 1.433739 (create 0.000024, send 0.000144, recv 0.000577, ring 1.432652, delay 0.000000)
# autodl-container-fa4746a7fc-c6053b3e:1506:1749 [1] NCCL INFO NCCL_P2P_DISABLE set by environment to 1
# autodl-container-fa4746a7fc-c6053b3e:1505:1697 [0] NCCL INFO NCCL_P2P_DISABLE set by environment to 1
# autodl-container-fa4746a7fc-c6053b3e:1505:1697 [0] NCCL INFO Setting affinity for GPU 0 to 0-51,104-155
# autodl-container-fa4746a7fc-c6053b3e:1506:1749 [1] NCCL INFO Setting affinity for GPU 1 to 52-103,156-207
# autodl-container-fa4746a7fc-c6053b3e:1506:1749 [1] NCCL INFO comm 0xcec7350 rank 3 nRanks 4 nNodes 2 localRanks 2 localRank 1 MNNVL 0
# autodl-container-fa4746a7fc-c6053b3e:1505:1697 [0] NCCL INFO comm 0xe2e5a00 rank 2 nRanks 4 nNodes 2 localRanks 2 localRank 0 MNNVL 0
# autodl-container-fa4746a7fc-c6053b3e:1506:1749 [1] NCCL INFO Trees [0] -1/-1/-1->3->2 [1] -1/-1/-1->3->2
# autodl-container-fa4746a7fc-c6053b3e:1506:1749 [1] NCCL INFO P2P Chunksize set to 131072
# autodl-container-fa4746a7fc-c6053b3e:1505:1697 [0] NCCL INFO Trees [0] 3/-1/-1->2->0 [1] 3/0/-1->2->-1
# autodl-container-fa4746a7fc-c6053b3e:1505:1697 [0] NCCL INFO P2P Chunksize set to 131072
# autodl-container-fa4746a7fc-c6053b3e:1506:1749 [1] NCCL INFO PROFILER/Plugin: Could not find: libnccl-profiler.so.
# autodl-container-fa4746a7fc-c6053b3e:1506:1752 [1] NCCL INFO [Proxy Service] Device 1 CPU core 175
# autodl-container-fa4746a7fc-c6053b3e:1506:1753 [1] NCCL INFO [Proxy Service UDS] Device 1 CPU core 176
# autodl-container-fa4746a7fc-c6053b3e:1505:1697 [0] NCCL INFO PROFILER/Plugin: Could not find: libnccl-profiler.so.
# autodl-container-fa4746a7fc-c6053b3e:1505:1754 [0] NCCL INFO [Proxy Service] Device 0 CPU core 124
# autodl-container-fa4746a7fc-c6053b3e:1505:1755 [0] NCCL INFO [Proxy Service UDS] Device 0 CPU core 125
# autodl-container-fa4746a7fc-c6053b3e:1506:1749 [1] NCCL INFO threadThresholds 8/8/64 | 32/8/64 | 512 | 512
# autodl-container-fa4746a7fc-c6053b3e:1506:1749 [1] NCCL INFO 2 coll channels, 2 collnet channels, 0 nvls channels, 2 p2p channels, 2 p2p channels per peer
# autodl-container-fa4746a7fc-c6053b3e:1505:1697 [0] NCCL INFO threadThresholds 8/8/64 | 32/8/64 | 512 | 512
# autodl-container-fa4746a7fc-c6053b3e:1505:1697 [0] NCCL INFO 2 coll channels, 2 collnet channels, 0 nvls channels, 2 p2p channels, 2 p2p channels per peer
# autodl-container-fa4746a7fc-c6053b3e:1506:1749 [1] NCCL INFO TUNER/Plugin: Could not find: libnccl-tuner.so. Using internal tuner plugin.
# autodl-container-fa4746a7fc-c6053b3e:1505:1697 [0] NCCL INFO TUNER/Plugin: Could not find: libnccl-tuner.so. Using internal tuner plugin.
# autodl-container-fa4746a7fc-c6053b3e:1506:1749 [1] NCCL INFO ncclCommInitRankConfig comm 0xcec7350 rank 3 nranks 4 cudaDev 1 nvmlDev 1 busId c8000 commId 0x992409a6fbc4ff39 - Init COMPLETE
# autodl-container-fa4746a7fc-c6053b3e:1505:1697 [0] NCCL INFO ncclCommInitRankConfig comm 0xe2e5a00 rank 2 nranks 4 cudaDev 0 nvmlDev 0 busId 27000 commId 0x992409a6fbc4ff39 - Init COMPLETE
# autodl-container-fa4746a7fc-c6053b3e:1506:1749 [1] NCCL INFO Init timings - ncclCommInitRankConfig: rank 3 nranks 4 total 1.68 (kernels 0.21, alloc 0.00, bootstrap 1.43, allgathers 0.00, topo 0.03, graphs 0.00, connections 0.00, rest 0.00)
# autodl-container-fa4746a7fc-c6053b3e:1505:1697 [0] NCCL INFO Init timings - ncclCommInitRankConfig: rank 2 nranks 4 total 5.30 (kernels 0.21, alloc 0.00, bootstrap 5.05, allgathers 0.00, topo 0.03, graphs 0.00, connections 0.00, rest 0.00)
# autodl-container-fa4746a7fc-c6053b3e:1505:1758 [0] NCCL INFO [Proxy Progress] Device 0 CPU core 126
# autodl-container-fa4746a7fc-c6053b3e:1505:1757 [0] NCCL INFO Channel 00/0 : 1[1] -> 2[0] [receive] via NET/Socket/0
# autodl-container-fa4746a7fc-c6053b3e:1505:1757 [0] NCCL INFO Channel 01/0 : 1[1] -> 2[0] [receive] via NET/Socket/0
# autodl-container-fa4746a7fc-c6053b3e:1505:1757 [0] NCCL INFO Channel 00 : 2[0] -> 3[1] via SHM/direct/direct
# autodl-container-fa4746a7fc-c6053b3e:1505:1757 [0] NCCL INFO Channel 01 : 2[0] -> 3[1] via SHM/direct/direct
# autodl-container-fa4746a7fc-c6053b3e:1506:1759 [1] NCCL INFO [Proxy Progress] Device 1 CPU core 177
# autodl-container-fa4746a7fc-c6053b3e:1506:1756 [1] NCCL INFO Channel 00/0 : 3[1] -> 0[0] [send] via NET/Socket/0
# autodl-container-fa4746a7fc-c6053b3e:1506:1756 [1] NCCL INFO Channel 01/0 : 3[1] -> 0[0] [send] via NET/Socket/0
# autodl-container-fa4746a7fc-c6053b3e:1505:1757 [0] NCCL INFO Connected all rings, use ring PXN 0 GDR 0
# autodl-container-fa4746a7fc-c6053b3e:1506:1756 [1] NCCL INFO Connected all rings, use ring PXN 0 GDR 0
# autodl-container-fa4746a7fc-c6053b3e:1505:1796 [0] NCCL INFO Channel 00/0 : 0[0] -> 2[0] [receive] via NET/Socket/0
# autodl-container-fa4746a7fc-c6053b3e:1505:1796 [0] NCCL INFO Channel 01/0 : 0[0] -> 2[0] [receive] via NET/Socket/0
# autodl-container-fa4746a7fc-c6053b3e:1505:1796 [0] NCCL INFO Channel 00/0 : 2[0] -> 0[0] [send] via NET/Socket/0
# autodl-container-fa4746a7fc-c6053b3e:1505:1796 [0] NCCL INFO Channel 01/0 : 2[0] -> 0[0] [send] via NET/Socket/0
# autodl-container-fa4746a7fc-c6053b3e:1506:1799 [1] NCCL INFO Channel 00 : 3[1] -> 2[0] via SHM/direct/direct
# autodl-container-fa4746a7fc-c6053b3e:1506:1799 [1] NCCL INFO Channel 01 : 3[1] -> 2[0] via SHM/direct/direct
# autodl-container-fa4746a7fc-c6053b3e:1505:1796 [0] NCCL INFO Connected all trees
# autodl-container-fa4746a7fc-c6053b3e:1506:1799 [1] NCCL INFO Connected all trees
# autodl-container-fa4746a7fc-c6053b3e:1505:1505 [0] NCCL INFO comm 0xe2e5a00 rank 2 nranks 4 cudaDev 0 busId 27000 - Destroy COMPLETE
# autodl-container-fa4746a7fc-c6053b3e:1506:1506 [1] NCCL INFO comm 0xcec7350 rank 3 nranks 4 cudaDev 1 busId c8000 - Destroy COMPLETE
# Verify the results on the master host (172.17.0.4).
# List the per-epoch checkpoints and the final model.
tree checkpoints/ models/
# checkpoints/
# ├── model_epoch_1.pth
# ├── model_epoch_2.pth
# └── model_epoch_3.pth
# models/
# └── final_model.pth
# 0 directories, 1 file
# Show size and modification time of the final model.
ls -l -h -r -t models/final_model.pth
# -rw-r--r-- 1 root root 253M Oct 23 13:11 models/final_model.pth
基于fsdp及fsdp2
单机多GPU-FSDP
bash
# FSDP
# Enter the FSDP example directory, then start the training run with
# 2 processes on this host.
cd /data/llm-in-practise/LLM_Distributed_Trainning/PyTorch/fsdp_basics/
torchrun \
  --nproc_per_node=2 \
  fsdp_gpt_wikitext2.py
# 2025-10-22 16:55:15,237 - INFO - 加载 WikiText 数据集...
# 2025-10-22 16:55:20,201 - INFO - 运行信息 is_distributed=True, rank=0, world_size=2, local_rank=0
# 2025-10-22 16:55:20,201 - INFO - 使用设备: cuda:0
# 2025-10-22 16:55:20,201 - INFO - 加载 WikiText 数据集...
# 2025-10-22 16:55:41,894 - INFO - 加载 23767 条非空文本
# 2025-10-22 16:55:42,816 - INFO - BERT 分词器词汇大小: 30522
# 2025-10-22 16:55:42,816 - INFO - 编码文本...
# 2025-10-22 16:55:47,018 - INFO - 加载 23767 条非空文本
# 2025-10-22 16:55:47,528 - INFO - 编码文本...
# 2025-10-22 16:56:00,197 - INFO - 总 token 数=2301731, 块数=8991
# 2025-10-22 16:56:00,919 - INFO - 使用 FSDP 封装模型...
# autodl-container-fa4746a7fc-c6053b3e:3507:3507 [0] NCCL INFO Bootstrap: Using eth0:172.17.0.10<0>
# autodl-container-fa4746a7fc-c6053b3e:3507:3507 [0] NCCL INFO cudaDriverVersion 13000
# autodl-container-fa4746a7fc-c6053b3e:3507:3507 [0] NCCL INFO NCCL version 2.27.3+cuda12.9
# autodl-container-fa4746a7fc-c6053b3e:3507:3507 [0] NCCL INFO Comm config Blocking set to 1
# autodl-container-fa4746a7fc-c6053b3e:3507:3689 [0] NCCL INFO NET/Plugin: Could not find: libnccl-net.so.
# autodl-container-fa4746a7fc-c6053b3e:3507:3689 [0] NCCL INFO NCCL_IB_DISABLE set by environment to 1.
# autodl-container-fa4746a7fc-c6053b3e:3507:3689 [0] NCCL INFO NET/Socket : Using [0]eth0:172.17.0.10<0>
# autodl-container-fa4746a7fc-c6053b3e:3507:3689 [0] NCCL INFO Initialized NET plugin Socket
# autodl-container-fa4746a7fc-c6053b3e:3507:3689 [0] NCCL INFO Assigned NET plugin Socket to comm
# autodl-container-fa4746a7fc-c6053b3e:3507:3689 [0] NCCL INFO Using network Socket
# autodl-container-fa4746a7fc-c6053b3e:3507:3689 [0] NCCL INFO ncclCommInitRankConfig comm 0xe479e00 rank 0 nranks 2 cudaDev 0 nvmlDev 0 busId a8000 commId 0x68d4ad999a7bed4d - Init START
# 2025-10-22 16:56:04,830 - INFO - 总 token 数=2301731, 块数=8991
# 2025-10-22 16:56:05,559 - INFO - 使用 FSDP 封装模型...
# autodl-container-fa4746a7fc-c6053b3e:3508:3508 [1] NCCL INFO cudaDriverVersion 13000
# autodl-container-fa4746a7fc-c6053b3e:3508:3508 [1] NCCL INFO Bootstrap: Using eth0:172.17.0.10<0>
# autodl-container-fa4746a7fc-c6053b3e:3508:3508 [1] NCCL INFO NCCL version 2.27.3+cuda12.9
# autodl-container-fa4746a7fc-c6053b3e:3508:3508 [1] NCCL INFO Comm config Blocking set to 1
# autodl-container-fa4746a7fc-c6053b3e:3508:3741 [1] NCCL INFO NET/Plugin: Could not find: libnccl-net.so.
# autodl-container-fa4746a7fc-c6053b3e:3508:3741 [1] NCCL INFO NCCL_IB_DISABLE set by environment to 1.
# autodl-container-fa4746a7fc-c6053b3e:3508:3741 [1] NCCL INFO NET/Socket : Using [0]eth0:172.17.0.10<0>
# autodl-container-fa4746a7fc-c6053b3e:3508:3741 [1] NCCL INFO Initialized NET plugin Socket
# autodl-container-fa4746a7fc-c6053b3e:3508:3741 [1] NCCL INFO Assigned NET plugin Socket to comm
# autodl-container-fa4746a7fc-c6053b3e:3508:3741 [1] NCCL INFO Using network Socket
# autodl-container-fa4746a7fc-c6053b3e:3508:3741 [1] NCCL INFO ncclCommInitRankConfig comm 0xdc29d00 rank 1 nranks 2 cudaDev 1 nvmlDev 1 busId b8000 commId 0x68d4ad999a7bed4d - Init START
# autodl-container-fa4746a7fc-c6053b3e:3508:3741 [1] NCCL INFO RAS client listening socket at ::1<28028>
# autodl-container-fa4746a7fc-c6053b3e:3507:3689 [0] NCCL INFO RAS client listening socket at ::1<28028>
# autodl-container-fa4746a7fc-c6053b3e:3508:3741 [1] NCCL INFO Bootstrap timings total 0.001013 (create 0.000025, send 0.000085, recv 0.000377, ring 0.000221, delay 0.000000)
# autodl-container-fa4746a7fc-c6053b3e:3507:3689 [0] NCCL INFO Bootstrap timings total 4.625294 (create 0.000024, send 0.000083, recv 4.624596, ring 0.000021, delay 0.000000)
# autodl-container-fa4746a7fc-c6053b3e:3507:3689 [0] NCCL INFO NCCL_P2P_DISABLE set by environment to 1
# autodl-container-fa4746a7fc-c6053b3e:3508:3741 [1] NCCL INFO NCCL_P2P_DISABLE set by environment to 1
# autodl-container-fa4746a7fc-c6053b3e:3508:3741 [1] NCCL INFO Setting affinity for GPU 1 to 52-103,156-207
# autodl-container-fa4746a7fc-c6053b3e:3507:3689 [0] NCCL INFO Setting affinity for GPU 0 to 52-103,156-207
# autodl-container-fa4746a7fc-c6053b3e:3508:3741 [1] NCCL INFO comm 0xdc29d00 rank 1 nRanks 2 nNodes 1 localRanks 2 localRank 1 MNNVL 0
# autodl-container-fa4746a7fc-c6053b3e:3508:3741 [1] NCCL INFO Trees [0] -1/-1/-1->1->0 [1] 0/-1/-1->1->-1 [2] -1/-1/-1->1->0 [3] 0/-1/-1->1->-1
# autodl-container-fa4746a7fc-c6053b3e:3508:3741 [1] NCCL INFO P2P Chunksize set to 131072
# autodl-container-fa4746a7fc-c6053b3e:3507:3689 [0] NCCL INFO comm 0xe479e00 rank 0 nRanks 2 nNodes 1 localRanks 2 localRank 0 MNNVL 0
# autodl-container-fa4746a7fc-c6053b3e:3507:3689 [0] NCCL INFO Channel 00/04 : 0 1
# autodl-container-fa4746a7fc-c6053b3e:3507:3689 [0] NCCL INFO Channel 01/04 : 0 1
# autodl-container-fa4746a7fc-c6053b3e:3507:3689 [0] NCCL INFO Channel 02/04 : 0 1
# autodl-container-fa4746a7fc-c6053b3e:3507:3689 [0] NCCL INFO Channel 03/04 : 0 1
# autodl-container-fa4746a7fc-c6053b3e:3507:3689 [0] NCCL INFO Trees [0] 1/-1/-1->0->-1 [1] -1/-1/-1->0->1 [2] 1/-1/-1->0->-1 [3] -1/-1/-1->0->1
# autodl-container-fa4746a7fc-c6053b3e:3507:3689 [0] NCCL INFO P2P Chunksize set to 131072
# autodl-container-fa4746a7fc-c6053b3e:3508:3741 [1] NCCL INFO PROFILER/Plugin: Could not find: libnccl-profiler.so.
# autodl-container-fa4746a7fc-c6053b3e:3508:3744 [1] NCCL INFO [Proxy Service] Device 1 CPU core 77
# autodl-container-fa4746a7fc-c6053b3e:3508:3745 [1] NCCL INFO [Proxy Service UDS] Device 1 CPU core 182
# autodl-container-fa4746a7fc-c6053b3e:3507:3689 [0] NCCL INFO PROFILER/Plugin: Could not find: libnccl-profiler.so.
# autodl-container-fa4746a7fc-c6053b3e:3507:3689 [0] NCCL INFO Check P2P Type isAllDirectP2p 0 directMode 0
# autodl-container-fa4746a7fc-c6053b3e:3507:3746 [0] NCCL INFO [Proxy Service] Device 0 CPU core 162
# autodl-container-fa4746a7fc-c6053b3e:3507:3747 [0] NCCL INFO [Proxy Service UDS] Device 0 CPU core 167
# autodl-container-fa4746a7fc-c6053b3e:3508:3741 [1] NCCL INFO threadThresholds 8/8/64 | 16/8/64 | 512 | 512
# autodl-container-fa4746a7fc-c6053b3e:3508:3741 [1] NCCL INFO 4 coll channels, 4 collnet channels, 0 nvls channels, 4 p2p channels, 2 p2p channels per peer
# autodl-container-fa4746a7fc-c6053b3e:3507:3689 [0] NCCL INFO threadThresholds 8/8/64 | 16/8/64 | 512 | 512
# autodl-container-fa4746a7fc-c6053b3e:3507:3689 [0] NCCL INFO 4 coll channels, 4 collnet channels, 0 nvls channels, 4 p2p channels, 2 p2p channels per peer
# autodl-container-fa4746a7fc-c6053b3e:3507:3689 [0] NCCL INFO CC Off, workFifoBytes 1048576
# autodl-container-fa4746a7fc-c6053b3e:3508:3741 [1] NCCL INFO TUNER/Plugin: Could not find: libnccl-tuner.so. Using internal tuner plugin.
# autodl-container-fa4746a7fc-c6053b3e:3508:3741 [1] NCCL INFO ncclCommInitRankConfig comm 0xdc29d00 rank 1 nranks 2 cudaDev 1 nvmlDev 1 busId b8000 commId 0x68d4ad999a7bed4d - Init COMPLETE
# autodl-container-fa4746a7fc-c6053b3e:3508:3741 [1] NCCL INFO Init timings - ncclCommInitRankConfig: rank 1 nranks 2 total 0.23 (kernels 0.22, alloc 0.00, bootstrap 0.00, allgathers 0.00, topo 0.01, graphs 0.00, connections 0.00, rest 0.00)
# autodl-container-fa4746a7fc-c6053b3e:3507:3689 [0] NCCL INFO TUNER/Plugin: Could not find: libnccl-tuner.so. Using internal tuner plugin.
# autodl-container-fa4746a7fc-c6053b3e:3507:3689 [0] NCCL INFO ncclCommInitRankConfig comm 0xe479e00 rank 0 nranks 2 cudaDev 0 nvmlDev 0 busId a8000 commId 0x68d4ad999a7bed4d - Init COMPLETE
# autodl-container-fa4746a7fc-c6053b3e:3507:3689 [0] NCCL INFO Init timings - ncclCommInitRankConfig: rank 0 nranks 2 total 4.87 (kernels 0.23, alloc 0.00, bootstrap 4.63, allgathers 0.00, topo 0.01, graphs 0.00, connections 0.00, rest 0.00)
# autodl-container-fa4746a7fc-c6053b3e:3508:3749 [1] NCCL INFO Channel 00 : 1[1] -> 0[0] via SHM/direct/direct
# autodl-container-fa4746a7fc-c6053b3e:3507:3748 [0] NCCL INFO Channel 00 : 0[0] -> 1[1] via SHM/direct/direct
# autodl-container-fa4746a7fc-c6053b3e:3508:3749 [1] NCCL INFO Channel 01 : 1[1] -> 0[0] via SHM/direct/direct
# autodl-container-fa4746a7fc-c6053b3e:3507:3748 [0] NCCL INFO Channel 01 : 0[0] -> 1[1] via SHM/direct/direct
# autodl-container-fa4746a7fc-c6053b3e:3508:3749 [1] NCCL INFO Channel 02 : 1[1] -> 0[0] via SHM/direct/direct
# autodl-container-fa4746a7fc-c6053b3e:3507:3748 [0] NCCL INFO Channel 02 : 0[0] -> 1[1] via SHM/direct/direct
# autodl-container-fa4746a7fc-c6053b3e:3508:3749 [1] NCCL INFO Channel 03 : 1[1] -> 0[0] via SHM/direct/direct
# autodl-container-fa4746a7fc-c6053b3e:3507:3748 [0] NCCL INFO Channel 03 : 0[0] -> 1[1] via SHM/direct/direct
# autodl-container-fa4746a7fc-c6053b3e:3507:3748 [0] NCCL INFO Connected all rings, use ring PXN 0 GDR 1
# autodl-container-fa4746a7fc-c6053b3e:3508:3749 [1] NCCL INFO Connected all rings, use ring PXN 0 GDR 1
# 2025-10-22 16:56:05,967 - INFO - FSDP 模型封装完成。
# 2025-10-22 16:56:05,969 - INFO - FSDP 模型封装完成。
# 2025-10-22 16:56:09,071 - INFO - Epoch 1/3, Batch 50, Avg Loss (last 50): 7.7938
# 2025-10-22 16:56:12,038 - INFO - Epoch 1/3, Batch 100, Avg Loss (last 50): 7.3063
# 2025-10-22 16:56:14,983 - INFO - Epoch 1/3, Batch 150, Avg Loss (last 50): 7.2569
# 2025-10-22 16:56:17,923 - INFO - Epoch 1/3, Batch 200, Avg Loss (last 50): 7.2394
# 2025-10-22 16:56:20,876 - INFO - Epoch 1/3, Batch 250, Avg Loss (last 50): 7.2538
# 2025-10-22 16:56:23,861 - INFO - Epoch 1/3, Batch 300, Avg Loss (last 50): 7.2550
# 2025-10-22 16:56:26,831 - INFO - Epoch 1/3, Batch 350, Avg Loss (last 50): 7.2300
# 2025-10-22 16:56:29,792 - INFO - Epoch 1/3, Batch 400, Avg Loss (last 50): 7.2300
# 2025-10-22 16:56:32,736 - INFO - Epoch 1/3, Batch 450, Avg Loss (last 50): 7.2269
# 2025-10-22 16:56:35,706 - INFO - Epoch 1/3, Batch 500, Avg Loss (last 50): 7.2363
# 2025-10-22 16:56:38,706 - INFO - Epoch 1/3, Batch 550, Avg Loss (last 50): 7.2538
# 2025-10-22 16:56:39,458 - INFO - Epoch 1/3, Avg Loss: 7.2968
# 2025-10-22 16:56:42,946 - INFO - Epoch 2/3, Batch 50, Avg Loss (last 50): 7.2325
# 2025-10-22 16:56:45,943 - INFO - Epoch 2/3, Batch 100, Avg Loss (last 50): 7.2244
# 2025-10-22 16:56:48,935 - INFO - Epoch 2/3, Batch 150, Avg Loss (last 50): 7.2100
# 2025-10-22 16:56:51,953 - INFO - Epoch 2/3, Batch 200, Avg Loss (last 50): 7.2006
# 2025-10-22 16:56:54,965 - INFO - Epoch 2/3, Batch 250, Avg Loss (last 50): 7.1856
# 2025-10-22 16:56:57,955 - INFO - Epoch 2/3, Batch 300, Avg Loss (last 50): 7.1169
# 2025-10-22 16:57:00,970 - INFO - Epoch 2/3, Batch 350, Avg Loss (last 50): 7.1000
# 2025-10-22 16:57:03,974 - INFO - Epoch 2/3, Batch 400, Avg Loss (last 50): 7.0331
# 2025-10-22 16:57:06,939 - INFO - Epoch 2/3, Batch 450, Avg Loss (last 50): 6.9931
# 2025-10-22 16:57:09,927 - INFO - Epoch 2/3, Batch 500, Avg Loss (last 50): 6.9413
# 2025-10-22 16:57:12,907 - INFO - Epoch 2/3, Batch 550, Avg Loss (last 50): 6.8650
# 2025-10-22 16:57:13,668 - INFO - Epoch 2/3, Avg Loss: 7.0949
# 2025-10-22 16:57:17,052 - INFO - Epoch 3/3, Batch 50, Avg Loss (last 50): 6.8044
# 2025-10-22 16:57:20,027 - INFO - Epoch 3/3, Batch 100, Avg Loss (last 50): 6.8025
# 2025-10-22 16:57:22,999 - INFO - Epoch 3/3, Batch 150, Avg Loss (last 50): 6.7800
# 2025-10-22 16:57:25,971 - INFO - Epoch 3/3, Batch 200, Avg Loss (last 50): 6.7281
# 2025-10-22 16:57:28,933 - INFO - Epoch 3/3, Batch 250, Avg Loss (last 50): 6.7156
# 2025-10-22 16:57:31,895 - INFO - Epoch 3/3, Batch 300, Avg Loss (last 50): 6.6531
# 2025-10-22 16:57:34,851 - INFO - Epoch 3/3, Batch 350, Avg Loss (last 50): 6.6712
# 2025-10-22 16:57:37,807 - INFO - Epoch 3/3, Batch 400, Avg Loss (last 50): 6.6419
# 2025-10-22 16:57:40,804 - INFO - Epoch 3/3, Batch 450, Avg Loss (last 50): 6.6150
# 2025-10-22 16:57:43,820 - INFO - Epoch 3/3, Batch 500, Avg Loss (last 50): 6.6256
# 2025-10-22 16:57:46,857 - INFO - Epoch 3/3, Batch 550, Avg Loss (last 50): 6.5675
# 2025-10-22 16:57:47,648 - INFO - Epoch 3/3, Avg Loss: 6.6889
# 2025-10-22 16:57:48,464 - INFO - 最终模型已保存至 models/final_model.pth
# autodl-container-fa4746a7fc-c6053b3e:3508:3508 [1] NCCL INFO comm 0xdc29d00 rank 1 nranks 2 cudaDev 1 busId b8000 - Destroy COMPLETE
# autodl-container-fa4746a7fc-c6053b3e:3507:3507 [0] NCCL INFO comm 0xe479e00 rank 0 nranks 2 cudaDev 0 busId a8000 - Destroy COMPLETE
# Verify the artifacts written by the FSDP run.
# `ll` is the alias for `ls -lhrt` that was added to ~/.bashrc during environment setup.
ll
# total 40K
# -rw-r--r-- 1 root root 16K Oct 22 16:17 fsdp_gpt_wikitext2.py
# -rw-r--r-- 1 root root 15K Oct 22 16:17 fsdp2_gpt_wikitext2.py
# -rw-r--r-- 1 root root 6.9K Oct 22 16:17 README.md
# drwxr-xr-x 2 root root 97 Oct 22 16:57 checkpoints
# drwxr-xr-x 2 root root 37 Oct 22 16:57 models
# List the per-epoch checkpoints and the final model saved by the run.
tree checkpoints/ models/
# checkpoints/
# ├── model_epoch_1.pth
# ├── model_epoch_2.pth
# └── model_epoch_3.pth
# models/
# └── final_model.pth
# 0 directories, 1 file
# FSDP2
# Check that `fully_shard` (the FSDP2 entry point) can be imported from the installed PyTorch.
python -c "from torch.distributed.fsdp import fully_shard; print('fully_shard imported successfully')"
# fully_shard imported successfully
# Run the training command: single node, two worker processes.
cd /data/llm-in-practise/LLM_Distributed_Trainning/PyTorch/fsdp_basics/
torchrun --nproc_per_node=2 fsdp2_gpt_wikitext2.py
# 2025-10-22 17:02:17,661 - INFO - 使用设备: cuda:0
# 2025-10-22 17:02:17,661 - INFO - 加载 WikiText 数据集...
# 2025-10-22 17:02:25,860 - INFO - 加载 WikiText 数据集...
# 2025-10-22 17:02:40,625 - INFO - 加载 23767 条非空文本
# 2025-10-22 17:02:41,631 - INFO - 编码文本...
# 2025-10-22 17:02:59,058 - INFO - 总 token 数=2301731, 块数=8991
# 2025-10-22 17:02:59,772 - INFO - 使用 FSDP2 (fully_shard) 封装模型...
# 2025-10-22 17:02:59,806 - INFO - FSDP2 模型封装完成。
# autodl-container-fa4746a7fc-c6053b3e:4070:4070 [0] NCCL INFO Bootstrap: Using eth0:172.17.0.10<0>
# autodl-container-fa4746a7fc-c6053b3e:4070:4070 [0] NCCL INFO cudaDriverVersion 13000
# autodl-container-fa4746a7fc-c6053b3e:4070:4070 [0] NCCL INFO NCCL version 2.27.3+cuda12.9
# autodl-container-fa4746a7fc-c6053b3e:4070:4070 [0] NCCL INFO Comm config Blocking set to 1
# autodl-container-fa4746a7fc-c6053b3e:4070:4265 [0] NCCL INFO NET/Plugin: Could not find: libnccl-net.so.
# autodl-container-fa4746a7fc-c6053b3e:4070:4265 [0] NCCL INFO NCCL_IB_DISABLE set by environment to 1.
# autodl-container-fa4746a7fc-c6053b3e:4070:4265 [0] NCCL INFO NET/Socket : Using [0]eth0:172.17.0.10<0>
# autodl-container-fa4746a7fc-c6053b3e:4070:4265 [0] NCCL INFO Initialized NET plugin Socket
# autodl-container-fa4746a7fc-c6053b3e:4070:4265 [0] NCCL INFO Assigned NET plugin Socket to comm
# autodl-container-fa4746a7fc-c6053b3e:4070:4265 [0] NCCL INFO Using network Socket
# autodl-container-fa4746a7fc-c6053b3e:4070:4265 [0] NCCL INFO ncclCommInitRankConfig comm 0x10f1a820 rank 0 nranks 2 cudaDev 0 nvmlDev 0 busId a8000 commId 0x1e3aac31d716dff - Init START
# 2025-10-22 17:03:16,557 - INFO - 加载 23767 条非空文本
# 2025-10-22 17:03:17,191 - INFO - 编码文本...
# 2025-10-22 17:03:34,440 - INFO - 总 token 数=2301731, 块数=8991
# 2025-10-22 17:03:35,174 - INFO - 使用 FSDP2 (fully_shard) 封装模型...
# 2025-10-22 17:03:35,207 - INFO - FSDP2 模型封装完成。
# autodl-container-fa4746a7fc-c6053b3e:4071:4071 [1] NCCL INFO cudaDriverVersion 13000
# autodl-container-fa4746a7fc-c6053b3e:4071:4071 [1] NCCL INFO Bootstrap: Using eth0:172.17.0.10<0>
# autodl-container-fa4746a7fc-c6053b3e:4071:4071 [1] NCCL INFO NCCL version 2.27.3+cuda12.9
# autodl-container-fa4746a7fc-c6053b3e:4071:4071 [1] NCCL INFO Comm config Blocking set to 1
# autodl-container-fa4746a7fc-c6053b3e:4071:4350 [1] NCCL INFO NET/Plugin: Could not find: libnccl-net.so.
# autodl-container-fa4746a7fc-c6053b3e:4071:4350 [1] NCCL INFO NCCL_IB_DISABLE set by environment to 1.
# autodl-container-fa4746a7fc-c6053b3e:4071:4350 [1] NCCL INFO NET/Socket : Using [0]eth0:172.17.0.10<0>
# autodl-container-fa4746a7fc-c6053b3e:4071:4350 [1] NCCL INFO Initialized NET plugin Socket
# autodl-container-fa4746a7fc-c6053b3e:4071:4350 [1] NCCL INFO Assigned NET plugin Socket to comm
# autodl-container-fa4746a7fc-c6053b3e:4071:4350 [1] NCCL INFO Using network Socket
# autodl-container-fa4746a7fc-c6053b3e:4071:4350 [1] NCCL INFO ncclCommInitRankConfig comm 0xfc333a0 rank 1 nranks 2 cudaDev 1 nvmlDev 1 busId b8000 commId 0x1e3aac31d716dff - Init START
# autodl-container-fa4746a7fc-c6053b3e:4071:4350 [1] NCCL INFO RAS client listening socket at ::1<28028>
# autodl-container-fa4746a7fc-c6053b3e:4070:4265 [0] NCCL INFO RAS client listening socket at ::1<28028>
# autodl-container-fa4746a7fc-c6053b3e:4071:4350 [1] NCCL INFO Bootstrap timings total 0.000837 (create 0.000023, send 0.000082, recv 0.000376, ring 0.000056, delay 0.000000)
# autodl-container-fa4746a7fc-c6053b3e:4070:4265 [0] NCCL INFO Bootstrap timings total 35.378572 (create 0.000021, send 0.000088, recv 35.378043, ring 0.000015, delay 0.000000)
# autodl-container-fa4746a7fc-c6053b3e:4071:4350 [1] NCCL INFO NCCL_P2P_DISABLE set by environment to 1
# autodl-container-fa4746a7fc-c6053b3e:4070:4265 [0] NCCL INFO NCCL_P2P_DISABLE set by environment to 1
# autodl-container-fa4746a7fc-c6053b3e:4071:4350 [1] NCCL INFO Setting affinity for GPU 1 to 52-103,156-207
# autodl-container-fa4746a7fc-c6053b3e:4070:4265 [0] NCCL INFO Setting affinity for GPU 0 to 52-103,156-207
# autodl-container-fa4746a7fc-c6053b3e:4070:4265 [0] NCCL INFO comm 0x10f1a820 rank 0 nRanks 2 nNodes 1 localRanks 2 localRank 0 MNNVL 0
# autodl-container-fa4746a7fc-c6053b3e:4071:4350 [1] NCCL INFO comm 0xfc333a0 rank 1 nRanks 2 nNodes 1 localRanks 2 localRank 1 MNNVL 0
# autodl-container-fa4746a7fc-c6053b3e:4070:4265 [0] NCCL INFO Channel 00/04 : 0 1
# autodl-container-fa4746a7fc-c6053b3e:4071:4350 [1] NCCL INFO Trees [0] -1/-1/-1->1->0 [1] 0/-1/-1->1->-1 [2] -1/-1/-1->1->0 [3] 0/-1/-1->1->-1
# autodl-container-fa4746a7fc-c6053b3e:4070:4265 [0] NCCL INFO Channel 01/04 : 0 1
# autodl-container-fa4746a7fc-c6053b3e:4071:4350 [1] NCCL INFO P2P Chunksize set to 131072
# autodl-container-fa4746a7fc-c6053b3e:4070:4265 [0] NCCL INFO Channel 02/04 : 0 1
# autodl-container-fa4746a7fc-c6053b3e:4070:4265 [0] NCCL INFO Channel 03/04 : 0 1
# autodl-container-fa4746a7fc-c6053b3e:4070:4265 [0] NCCL INFO Trees [0] 1/-1/-1->0->-1 [1] -1/-1/-1->0->1 [2] 1/-1/-1->0->-1 [3] -1/-1/-1->0->1
# autodl-container-fa4746a7fc-c6053b3e:4070:4265 [0] NCCL INFO P2P Chunksize set to 131072
# autodl-container-fa4746a7fc-c6053b3e:4071:4350 [1] NCCL INFO PROFILER/Plugin: Could not find: libnccl-profiler.so.
# autodl-container-fa4746a7fc-c6053b3e:4070:4265 [0] NCCL INFO PROFILER/Plugin: Could not find: libnccl-profiler.so.
# autodl-container-fa4746a7fc-c6053b3e:4070:4265 [0] NCCL INFO Check P2P Type isAllDirectP2p 0 directMode 0
# autodl-container-fa4746a7fc-c6053b3e:4071:4353 [1] NCCL INFO [Proxy Service] Device 1 CPU core 184
# autodl-container-fa4746a7fc-c6053b3e:4071:4354 [1] NCCL INFO [Proxy Service UDS] Device 1 CPU core 88
# autodl-container-fa4746a7fc-c6053b3e:4070:4355 [0] NCCL INFO [Proxy Service] Device 0 CPU core 91
# autodl-container-fa4746a7fc-c6053b3e:4070:4356 [0] NCCL INFO [Proxy Service UDS] Device 0 CPU core 93
# autodl-container-fa4746a7fc-c6053b3e:4071:4350 [1] NCCL INFO threadThresholds 8/8/64 | 16/8/64 | 512 | 512
# autodl-container-fa4746a7fc-c6053b3e:4071:4350 [1] NCCL INFO 4 coll channels, 4 collnet channels, 0 nvls channels, 4 p2p channels, 2 p2p channels per peer
# autodl-container-fa4746a7fc-c6053b3e:4070:4265 [0] NCCL INFO threadThresholds 8/8/64 | 16/8/64 | 512 | 512
# autodl-container-fa4746a7fc-c6053b3e:4070:4265 [0] NCCL INFO 4 coll channels, 4 collnet channels, 0 nvls channels, 4 p2p channels, 2 p2p channels per peer
# autodl-container-fa4746a7fc-c6053b3e:4070:4265 [0] NCCL INFO CC Off, workFifoBytes 1048576
# autodl-container-fa4746a7fc-c6053b3e:4071:4350 [1] NCCL INFO TUNER/Plugin: Could not find: libnccl-tuner.so. Using internal tuner plugin.
# autodl-container-fa4746a7fc-c6053b3e:4071:4350 [1] NCCL INFO ncclCommInitRankConfig comm 0xfc333a0 rank 1 nranks 2 cudaDev 1 nvmlDev 1 busId b8000 commId 0x1e3aac31d716dff - Init COMPLETE
# autodl-container-fa4746a7fc-c6053b3e:4071:4350 [1] NCCL INFO Init timings - ncclCommInitRankConfig: rank 1 nranks 2 total 0.24 (kernels 0.21, alloc 0.00, bootstrap 0.00, allgathers 0.01, topo 0.00, graphs 0.00, connections 0.01, rest 0.00)
# autodl-container-fa4746a7fc-c6053b3e:4070:4265 [0] NCCL INFO TUNER/Plugin: Could not find: libnccl-tuner.so. Using internal tuner plugin.
# autodl-container-fa4746a7fc-c6053b3e:4070:4265 [0] NCCL INFO ncclCommInitRankConfig comm 0x10f1a820 rank 0 nranks 2 cudaDev 0 nvmlDev 0 busId a8000 commId 0x1e3aac31d716dff - Init COMPLETE
# autodl-container-fa4746a7fc-c6053b3e:4070:4265 [0] NCCL INFO Init timings - ncclCommInitRankConfig: rank 0 nranks 2 total 35.64 (kernels 0.24, alloc 0.00, bootstrap 35.38, allgathers 0.01, topo 0.01, graphs 0.00, connections 0.02, rest 0.00)
# autodl-container-fa4746a7fc-c6053b3e:4070:4358 [0] NCCL INFO Channel 00 : 0[0] -> 1[1] via SHM/direct/direct
# autodl-container-fa4746a7fc-c6053b3e:4071:4357 [1] NCCL INFO Channel 00 : 1[1] -> 0[0] via SHM/direct/direct
# autodl-container-fa4746a7fc-c6053b3e:4070:4358 [0] NCCL INFO Channel 01 : 0[0] -> 1[1] via SHM/direct/direct
# autodl-container-fa4746a7fc-c6053b3e:4071:4357 [1] NCCL INFO Channel 01 : 1[1] -> 0[0] via SHM/direct/direct
# autodl-container-fa4746a7fc-c6053b3e:4070:4358 [0] NCCL INFO Channel 02 : 0[0] -> 1[1] via SHM/direct/direct
# autodl-container-fa4746a7fc-c6053b3e:4071:4357 [1] NCCL INFO Channel 02 : 1[1] -> 0[0] via SHM/direct/direct
# autodl-container-fa4746a7fc-c6053b3e:4070:4358 [0] NCCL INFO Channel 03 : 0[0] -> 1[1] via SHM/direct/direct
# autodl-container-fa4746a7fc-c6053b3e:4071:4357 [1] NCCL INFO Channel 03 : 1[1] -> 0[0] via SHM/direct/direct
# autodl-container-fa4746a7fc-c6053b3e:4070:4358 [0] NCCL INFO Connected all rings, use ring PXN 0 GDR 1
# autodl-container-fa4746a7fc-c6053b3e:4071:4357 [1] NCCL INFO Connected all rings, use ring PXN 0 GDR 1
# 2025-10-22 17:03:38,314 - INFO - Epoch 1, Batch 50, Avg Loss (last 50): 7.7917
# 2025-10-22 17:03:40,704 - INFO - Epoch 1, Batch 100, Avg Loss (last 50): 7.3070
# 2025-10-22 17:03:43,102 - INFO - Epoch 1, Batch 150, Avg Loss (last 50): 7.2569
# 2025-10-22 17:03:45,494 - INFO - Epoch 1, Batch 200, Avg Loss (last 50): 7.2384
# 2025-10-22 17:03:47,888 - INFO - Epoch 1, Batch 250, Avg Loss (last 50): 7.2531
# 2025-10-22 17:03:50,262 - INFO - Epoch 1, Batch 300, Avg Loss (last 50): 7.2549
# 2025-10-22 17:03:52,603 - INFO - Epoch 1, Batch 350, Avg Loss (last 50): 7.2292
# 2025-10-22 17:03:54,911 - INFO - Epoch 1, Batch 400, Avg Loss (last 50): 7.2296
# 2025-10-22 17:03:57,297 - INFO - Epoch 1, Batch 450, Avg Loss (last 50): 7.2252
# 2025-10-22 17:03:59,683 - INFO - Epoch 1, Batch 500, Avg Loss (last 50): 7.2378
# 2025-10-22 17:04:02,062 - INFO - Epoch 1, Batch 550, Avg Loss (last 50): 7.2525
# 2025-10-22 17:04:02,653 - INFO - Epoch 1/3, Avg Loss: 7.2963
# 2025-10-22 17:04:05,391 - INFO - Epoch 2, Batch 50, Avg Loss (last 50): 7.2299
# 2025-10-22 17:04:07,784 - INFO - Epoch 2, Batch 100, Avg Loss (last 50): 7.2239
# 2025-10-22 17:04:10,181 - INFO - Epoch 2, Batch 150, Avg Loss (last 50): 7.2082
# 2025-10-22 17:04:12,562 - INFO - Epoch 2, Batch 200, Avg Loss (last 50): 7.1917
# 2025-10-22 17:04:14,929 - INFO - Epoch 2, Batch 250, Avg Loss (last 50): 7.1660
# 2025-10-22 17:04:17,296 - INFO - Epoch 2, Batch 300, Avg Loss (last 50): 7.0947
# 2025-10-22 17:04:19,665 - INFO - Epoch 2, Batch 350, Avg Loss (last 50): 7.0839
# 2025-10-22 17:04:22,032 - INFO - Epoch 2, Batch 400, Avg Loss (last 50): 7.0218
# 2025-10-22 17:04:24,401 - INFO - Epoch 2, Batch 450, Avg Loss (last 50): 6.9839
# 2025-10-22 17:04:26,786 - INFO - Epoch 2, Batch 500, Avg Loss (last 50): 6.9318
# 2025-10-22 17:04:29,156 - INFO - Epoch 2, Batch 550, Avg Loss (last 50): 6.8569
# 2025-10-22 17:04:29,748 - INFO - Epoch 2/3, Avg Loss: 7.0851
# 2025-10-22 17:04:32,629 - INFO - Epoch 3, Batch 50, Avg Loss (last 50): 6.8014
# 2025-10-22 17:04:35,002 - INFO - Epoch 3, Batch 100, Avg Loss (last 50): 6.7958
# 2025-10-22 17:04:37,369 - INFO - Epoch 3, Batch 150, Avg Loss (last 50): 6.7744
# 2025-10-22 17:04:39,746 - INFO - Epoch 3, Batch 200, Avg Loss (last 50): 6.7146
# 2025-10-22 17:04:42,116 - INFO - Epoch 3, Batch 250, Avg Loss (last 50): 6.7023
# 2025-10-22 17:04:44,483 - INFO - Epoch 3, Batch 300, Avg Loss (last 50): 6.6466
# 2025-10-22 17:04:46,857 - INFO - Epoch 3, Batch 350, Avg Loss (last 50): 6.6782
# 2025-10-22 17:04:49,224 - INFO - Epoch 3, Batch 400, Avg Loss (last 50): 6.6452
# 2025-10-22 17:04:51,598 - INFO - Epoch 3, Batch 450, Avg Loss (last 50): 6.6077
# 2025-10-22 17:04:53,965 - INFO - Epoch 3, Batch 500, Avg Loss (last 50): 6.6327
# 2025-10-22 17:04:56,338 - INFO - Epoch 3, Batch 550, Avg Loss (last 50): 6.5596
# 2025-10-22 17:04:56,939 - INFO - Epoch 3/3, Avg Loss: 6.6850
# 2025-10-22 17:04:57,841 - INFO - 最终模型已保存至 models/final_model.pth
# autodl-container-fa4746a7fc-c6053b3e:4071:4071 [1] NCCL INFO comm 0xfc333a0 rank 1 nranks 2 cudaDev 1 busId b8000 - Destroy COMPLETE
# autodl-container-fa4746a7fc-c6053b3e:4070:4070 [0] NCCL INFO comm 0x10f1a820 rank 0 nranks 2 cudaDev 0 busId a8000 - Destroy COMPLETE
# Verify the artifacts written by the FSDP2 run.
# `ll` is the alias for `ls -lhrt` that was added to ~/.bashrc during environment setup.
# NOTE(review): checkpoints_fsdp/ and models_fsdp/ in the listing below (timestamped 16:57)
# appear to be the earlier FSDP run's outputs, renamed before the FSDP2 run so they were
# not overwritten; that rename step is not shown in this walkthrough - confirm.
ll
# total 40K
# -rw-r--r-- 1 root root 16K Oct 22 16:17 fsdp_gpt_wikitext2.py
# -rw-r--r-- 1 root root 15K Oct 22 16:17 fsdp2_gpt_wikitext2.py
# -rw-r--r-- 1 root root 6.9K Oct 22 16:17 README.md
# drwxr-xr-x 2 root root 97 Oct 22 16:57 checkpoints_fsdp
# drwxr-xr-x 2 root root 37 Oct 22 16:57 models_fsdp
# drwxr-xr-x 2 root root 97 Oct 22 17:04 checkpoints
# drwxr-xr-x 2 root root 37 Oct 22 17:04 models
# List the per-epoch checkpoints and the final model saved by the FSDP2 run.
tree checkpoints/ models/
# checkpoints/
# ├── model_epoch_1.pth
# ├── model_epoch_2.pth
# └── model_epoch_3.pth
# models/
# └── final_model.pth
# 0 directories, 1 file
多机多GPU训练-FSDP
多机多GPU训练主机说明
主机01: 172.17.0.4 GPU: NVIDIA GeForce RTX 5090*2
主机02: 172.17.0.7 GPU: NVIDIA GeForce RTX 5090*2
bash
# On every host: change into the FSDP examples directory.
cd /data/llm-in-practise/LLM_Distributed_Trainning/PyTorch/fsdp_basics/
# Multi-node environment variables and related settings are the same as for DDP.
# FSDP
# Run the training command; each host runs its own invocation.
# Host 01 (172.17.0.4), node rank 0:
torchrun \
  --nnodes=2 \
  --nproc_per_node=1 \
  --node_rank=0 \
  --master_addr=172.17.0.4 \
  --master_port=29500 \
  fsdp_gpt_wikitext2.py \
  --epochs 3 \
  --batch_size 8
# Host 02 (172.17.0.7), node rank 1:
torchrun \
  --nnodes=2 \
  --nproc_per_node=1 \
  --node_rank=1 \
  --master_addr=172.17.0.4 \
  --master_port=29500 \
  fsdp_gpt_wikitext2.py \
  --epochs 3 \
  --batch_size 8
# 主机01-172.17.0.4-运行结果:
# 2025-10-23 13:20:40,447 - INFO - 运行信息 is_distributed=True, rank=0, world_size=2, local_rank=0
# 2025-10-23 13:20:40,447 - INFO - 使用设备: cuda:0
# 2025-10-23 13:20:40,448 - INFO - 加载 WikiText 数据集...
# 2025-10-23 13:21:06,889 - INFO - 加载 23767 条非空文本
# 2025-10-23 13:21:07,525 - INFO - BERT 分词器词汇大小: 30522
# 2025-10-23 13:21:07,525 - INFO - 编码文本...
# 2025-10-23 13:21:24,869 - INFO - 总 token 数=2301731, 块数=8991
# 2025-10-23 13:21:25,602 - INFO - 使用 FSDP 封装模型...
# autodl-container-fa4746a7fc-5df31a9e:2264:2264 [0] NCCL INFO Bootstrap: Using eth0:172.17.0.4<0>
# autodl-container-fa4746a7fc-5df31a9e:2264:2264 [0] NCCL INFO cudaDriverVersion 13000
# autodl-container-fa4746a7fc-5df31a9e:2264:2264 [0] NCCL INFO NCCL version 2.27.3+cuda12.9
# autodl-container-fa4746a7fc-5df31a9e:2264:2264 [0] NCCL INFO Comm config Blocking set to 1
# autodl-container-fa4746a7fc-5df31a9e:2264:2398 [0] NCCL INFO NET/Plugin: Could not find: libnccl-net.so.
# autodl-container-fa4746a7fc-5df31a9e:2264:2398 [0] NCCL INFO NCCL_IB_DISABLE set by environment to 1.
# autodl-container-fa4746a7fc-5df31a9e:2264:2398 [0] NCCL INFO NET/Socket : Using [0]eth0:172.17.0.4<0>
# autodl-container-fa4746a7fc-5df31a9e:2264:2398 [0] NCCL INFO Initialized NET plugin Socket
# autodl-container-fa4746a7fc-5df31a9e:2264:2398 [0] NCCL INFO Assigned NET plugin Socket to comm
# autodl-container-fa4746a7fc-5df31a9e:2264:2398 [0] NCCL INFO Using network Socket
# autodl-container-fa4746a7fc-5df31a9e:2264:2398 [0] NCCL INFO ncclCommInitRankConfig comm 0xed84b10 rank 0 nranks 2 cudaDev 0 nvmlDev 0 busId 38000 commId 0x5a7c37a18f63b868 - Init START
# autodl-container-fa4746a7fc-5df31a9e:2264:2398 [0] NCCL INFO RAS client listening socket at ::1<28028>
# autodl-container-fa4746a7fc-5df31a9e:2264:2398 [0] NCCL INFO Bootstrap timings total 18.889796 (create 0.000029, send 0.000088, recv 18.889230, ring 0.000024, delay 0.000000)
# autodl-container-fa4746a7fc-5df31a9e:2264:2398 [0] NCCL INFO NCCL_P2P_DISABLE set by environment to 1
# autodl-container-fa4746a7fc-5df31a9e:2264:2398 [0] NCCL INFO Setting affinity for GPU 0 to 0-51,104-155
# autodl-container-fa4746a7fc-5df31a9e:2264:2398 [0] NCCL INFO comm 0xed84b10 rank 0 nRanks 2 nNodes 2 localRanks 1 localRank 0 MNNVL 0
# autodl-container-fa4746a7fc-5df31a9e:2264:2398 [0] NCCL INFO Channel 00/02 : 0 1
# autodl-container-fa4746a7fc-5df31a9e:2264:2398 [0] NCCL INFO Channel 01/02 : 0 1
# autodl-container-fa4746a7fc-5df31a9e:2264:2398 [0] NCCL INFO Trees [0] 1/-1/-1->0->-1 [1] -1/-1/-1->0->1
# autodl-container-fa4746a7fc-5df31a9e:2264:2398 [0] NCCL INFO P2P Chunksize set to 131072
# autodl-container-fa4746a7fc-5df31a9e:2264:2398 [0] NCCL INFO PROFILER/Plugin: Could not find: libnccl-profiler.so.
# autodl-container-fa4746a7fc-5df31a9e:2264:2398 [0] NCCL INFO Check P2P Type isAllDirectP2p 0 directMode 0
# autodl-container-fa4746a7fc-5df31a9e:2264:2406 [0] NCCL INFO [Proxy Service] Device 0 CPU core 11
# autodl-container-fa4746a7fc-5df31a9e:2264:2407 [0] NCCL INFO [Proxy Service UDS] Device 0 CPU core 12
# autodl-container-fa4746a7fc-5df31a9e:2264:2398 [0] NCCL INFO threadThresholds 8/8/64 | 16/8/64 | 512 | 512
# autodl-container-fa4746a7fc-5df31a9e:2264:2398 [0] NCCL INFO 2 coll channels, 2 collnet channels, 0 nvls channels, 2 p2p channels, 2 p2p channels per peer
# autodl-container-fa4746a7fc-5df31a9e:2264:2398 [0] NCCL INFO CC Off, workFifoBytes 1048576
# autodl-container-fa4746a7fc-5df31a9e:2264:2398 [0] NCCL INFO TUNER/Plugin: Could not find: libnccl-tuner.so. Using internal tuner plugin.
# autodl-container-fa4746a7fc-5df31a9e:2264:2398 [0] NCCL INFO ncclCommInitRankConfig comm 0xed84b10 rank 0 nranks 2 cudaDev 0 nvmlDev 0 busId 38000 commId 0x5a7c37a18f63b868 - Init COMPLETE
# autodl-container-fa4746a7fc-5df31a9e:2264:2398 [0] NCCL INFO Init timings - ncclCommInitRankConfig: rank 0 nranks 2 total 19.13 (kernels 0.22, alloc 0.00, bootstrap 18.89, allgathers 0.00, topo 0.00, graphs 0.00, connections 0.02, rest 0.00)
# autodl-container-fa4746a7fc-5df31a9e:2264:2409 [0] NCCL INFO [Proxy Progress] Device 0 CPU core 13
# autodl-container-fa4746a7fc-5df31a9e:2264:2408 [0] NCCL INFO Channel 00/0 : 1[0] -> 0[0] [receive] via NET/Socket/0
# autodl-container-fa4746a7fc-5df31a9e:2264:2408 [0] NCCL INFO Channel 01/0 : 1[0] -> 0[0] [receive] via NET/Socket/0
# autodl-container-fa4746a7fc-5df31a9e:2264:2408 [0] NCCL INFO Channel 00/0 : 0[0] -> 1[0] [send] via NET/Socket/0
# autodl-container-fa4746a7fc-5df31a9e:2264:2408 [0] NCCL INFO Channel 01/0 : 0[0] -> 1[0] [send] via NET/Socket/0
# autodl-container-fa4746a7fc-5df31a9e:2264:2408 [0] NCCL INFO Connected all rings, use ring PXN 0 GDR 0
# 2025-10-23 13:21:45,004 - INFO - FSDP 模型封装完成。
# 2025-10-23 13:21:48,517 - INFO - Epoch 1/3, Batch 50, Avg Loss (last 50): 7.8050
# 2025-10-23 13:21:51,659 - INFO - Epoch 1/3, Batch 100, Avg Loss (last 50): 7.3037
# 2025-10-23 13:21:54,773 - INFO - Epoch 1/3, Batch 150, Avg Loss (last 50): 7.2556
# 2025-10-23 13:21:57,932 - INFO - Epoch 1/3, Batch 200, Avg Loss (last 50): 7.2400
# 2025-10-23 13:22:01,060 - INFO - Epoch 1/3, Batch 250, Avg Loss (last 50): 7.2538
# 2025-10-23 13:22:04,165 - INFO - Epoch 1/3, Batch 300, Avg Loss (last 50): 7.2538
# 2025-10-23 13:22:07,287 - INFO - Epoch 1/3, Batch 350, Avg Loss (last 50): 7.2287
# 2025-10-23 13:22:10,399 - INFO - Epoch 1/3, Batch 400, Avg Loss (last 50): 7.2313
# 2025-10-23 13:22:13,536 - INFO - Epoch 1/3, Batch 450, Avg Loss (last 50): 7.2263
# 2025-10-23 13:22:16,637 - INFO - Epoch 1/3, Batch 500, Avg Loss (last 50): 7.2375
# 2025-10-23 13:22:19,744 - INFO - Epoch 1/3, Batch 550, Avg Loss (last 50): 7.2531
# 2025-10-23 13:22:20,520 - INFO - Epoch 1/3, Avg Loss: 7.2974
# 2025-10-23 13:22:24,008 - INFO - Epoch 2/3, Batch 50, Avg Loss (last 50): 7.2325
# 2025-10-23 13:22:27,139 - INFO - Epoch 2/3, Batch 100, Avg Loss (last 50): 7.2181
# 2025-10-23 13:22:30,275 - INFO - Epoch 2/3, Batch 150, Avg Loss (last 50): 7.1544
# 2025-10-23 13:22:33,383 - INFO - Epoch 2/3, Batch 200, Avg Loss (last 50): 7.0994
# 2025-10-23 13:22:36,543 - INFO - Epoch 2/3, Batch 250, Avg Loss (last 50): 7.0738
# 2025-10-23 13:22:39,699 - INFO - Epoch 2/3, Batch 300, Avg Loss (last 50): 7.0125
# 2025-10-23 13:22:42,818 - INFO - Epoch 2/3, Batch 350, Avg Loss (last 50): 6.9938
# 2025-10-23 13:22:45,932 - INFO - Epoch 2/3, Batch 400, Avg Loss (last 50): 6.9306
# 2025-10-23 13:22:49,048 - INFO - Epoch 2/3, Batch 450, Avg Loss (last 50): 6.8931
# 2025-10-23 13:22:52,160 - INFO - Epoch 2/3, Batch 500, Avg Loss (last 50): 6.8469
# 2025-10-23 13:22:55,279 - INFO - Epoch 2/3, Batch 550, Avg Loss (last 50): 6.7831
# 2025-10-23 13:22:56,086 - INFO - Epoch 2/3, Avg Loss: 7.0163
# 2025-10-23 13:22:59,681 - INFO - Epoch 3/3, Batch 50, Avg Loss (last 50): 6.7237
# 2025-10-23 13:23:02,814 - INFO - Epoch 3/3, Batch 100, Avg Loss (last 50): 6.7056
# 2025-10-23 13:23:05,974 - INFO - Epoch 3/3, Batch 150, Avg Loss (last 50): 6.6969
# 2025-10-23 13:23:09,108 - INFO - Epoch 3/3, Batch 200, Avg Loss (last 50): 6.6544
# 2025-10-23 13:23:12,244 - INFO - Epoch 3/3, Batch 250, Avg Loss (last 50): 6.6487
# 2025-10-23 13:23:15,375 - INFO - Epoch 3/3, Batch 300, Avg Loss (last 50): 6.5981
# 2025-10-23 13:23:18,496 - INFO - Epoch 3/3, Batch 350, Avg Loss (last 50): 6.6181
# 2025-10-23 13:23:21,621 - INFO - Epoch 3/3, Batch 400, Avg Loss (last 50): 6.5862
# 2025-10-23 13:23:24,750 - INFO - Epoch 3/3, Batch 450, Avg Loss (last 50): 6.5469
# 2025-10-23 13:23:27,934 - INFO - Epoch 3/3, Batch 500, Avg Loss (last 50): 6.5938
# 2025-10-23 13:23:31,117 - INFO - Epoch 3/3, Batch 550, Avg Loss (last 50): 6.5081
# 2025-10-23 13:23:31,920 - INFO - Epoch 3/3, Avg Loss: 6.6231
# 2025-10-23 13:23:32,802 - INFO - 最终模型已保存至 models/final_model.pth
# autodl-container-fa4746a7fc-5df31a9e:2264:2264 [0] NCCL INFO comm 0xed84b10 rank 0 nranks 2 cudaDev 0 busId 38000 - Destroy COMPLETE
# 主机02-172.17.0.7-运行结果:
# 2025-10-23 13:20:40,467 - INFO - 加载 WikiText 数据集...
# 2025-10-23 13:21:25,979 - INFO - 加载 23767 条非空文本
# 2025-10-23 13:21:26,522 - INFO - 编码文本...
# 2025-10-23 13:21:43,768 - INFO - 总 token 数=2301731, 块数=8991
# 2025-10-23 13:21:44,492 - INFO - 使用 FSDP 封装模型...
# autodl-container-fa4746a7fc-c6053b3e:2206:2206 [0] NCCL INFO cudaDriverVersion 13000
# autodl-container-fa4746a7fc-c6053b3e:2206:2206 [0] NCCL INFO Bootstrap: Using eth0:172.17.0.7<0>
# autodl-container-fa4746a7fc-c6053b3e:2206:2206 [0] NCCL INFO NCCL version 2.27.3+cuda12.9
# autodl-container-fa4746a7fc-c6053b3e:2206:2206 [0] NCCL INFO Comm config Blocking set to 1
# autodl-container-fa4746a7fc-c6053b3e:2206:2346 [0] NCCL INFO NET/Plugin: Could not find: libnccl-net.so.
# autodl-container-fa4746a7fc-c6053b3e:2206:2346 [0] NCCL INFO NCCL_IB_DISABLE set by environment to 1.
# autodl-container-fa4746a7fc-c6053b3e:2206:2346 [0] NCCL INFO NET/Socket : Using [0]eth0:172.17.0.7<0>
# autodl-container-fa4746a7fc-c6053b3e:2206:2346 [0] NCCL INFO Initialized NET plugin Socket
# autodl-container-fa4746a7fc-c6053b3e:2206:2346 [0] NCCL INFO Assigned NET plugin Socket to comm
# autodl-container-fa4746a7fc-c6053b3e:2206:2346 [0] NCCL INFO Using network Socket
# autodl-container-fa4746a7fc-c6053b3e:2206:2346 [0] NCCL INFO ncclCommInitRankConfig comm 0xec90940 rank 1 nranks 2 cudaDev 0 nvmlDev 0 busId 27000 commId 0x5a7c37a18f63b868 - Init START
# autodl-container-fa4746a7fc-c6053b3e:2206:2346 [0] NCCL INFO RAS client listening socket at ::1<28028>
# autodl-container-fa4746a7fc-c6053b3e:2206:2346 [0] NCCL INFO Bootstrap timings total 0.001206 (create 0.000025, send 0.000156, recv 0.000277, ring 0.000105, delay 0.000000)
# autodl-container-fa4746a7fc-c6053b3e:2206:2346 [0] NCCL INFO NCCL_P2P_DISABLE set by environment to 1
# autodl-container-fa4746a7fc-c6053b3e:2206:2346 [0] NCCL INFO Setting affinity for GPU 0 to 0-51,104-155
# autodl-container-fa4746a7fc-c6053b3e:2206:2346 [0] NCCL INFO comm 0xec90940 rank 1 nRanks 2 nNodes 2 localRanks 1 localRank 0 MNNVL 0
# autodl-container-fa4746a7fc-c6053b3e:2206:2346 [0] NCCL INFO Trees [0] -1/-1/-1->1->0 [1] 0/-1/-1->1->-1
# autodl-container-fa4746a7fc-c6053b3e:2206:2346 [0] NCCL INFO P2P Chunksize set to 131072
# autodl-container-fa4746a7fc-c6053b3e:2206:2346 [0] NCCL INFO PROFILER/Plugin: Could not find: libnccl-profiler.so.
# autodl-container-fa4746a7fc-c6053b3e:2206:2348 [0] NCCL INFO [Proxy Service] Device 0 CPU core 2
# autodl-container-fa4746a7fc-c6053b3e:2206:2349 [0] NCCL INFO [Proxy Service UDS] Device 0 CPU core 3
# autodl-container-fa4746a7fc-c6053b3e:2206:2346 [0] NCCL INFO threadThresholds 8/8/64 | 16/8/64 | 512 | 512
# autodl-container-fa4746a7fc-c6053b3e:2206:2346 [0] NCCL INFO 2 coll channels, 2 collnet channels, 0 nvls channels, 2 p2p channels, 2 p2p channels per peer
# autodl-container-fa4746a7fc-c6053b3e:2206:2346 [0] NCCL INFO TUNER/Plugin: Could not find: libnccl-tuner.so. Using internal tuner plugin.
# autodl-container-fa4746a7fc-c6053b3e:2206:2346 [0] NCCL INFO ncclCommInitRankConfig comm 0xec90940 rank 1 nranks 2 cudaDev 0 nvmlDev 0 busId 27000 commId 0x5a7c37a18f63b868 - Init COMPLETE
# autodl-container-fa4746a7fc-c6053b3e:2206:2346 [0] NCCL INFO Init timings - ncclCommInitRankConfig: rank 1 nranks 2 total 0.23 (kernels 0.21, alloc 0.00, bootstrap 0.00, allgathers 0.00, topo 0.00, graphs 0.00, connections 0.00, rest 0.00)
# autodl-container-fa4746a7fc-c6053b3e:2206:2351 [0] NCCL INFO [Proxy Progress] Device 0 CPU core 4
# autodl-container-fa4746a7fc-c6053b3e:2206:2350 [0] NCCL INFO Channel 00/0 : 0[0] -> 1[0] [receive] via NET/Socket/0
# autodl-container-fa4746a7fc-c6053b3e:2206:2350 [0] NCCL INFO Channel 01/0 : 0[0] -> 1[0] [receive] via NET/Socket/0
# autodl-container-fa4746a7fc-c6053b3e:2206:2350 [0] NCCL INFO Channel 00/0 : 1[0] -> 0[0] [send] via NET/Socket/0
# autodl-container-fa4746a7fc-c6053b3e:2206:2350 [0] NCCL INFO Channel 01/0 : 1[0] -> 0[0] [send] via NET/Socket/0
# autodl-container-fa4746a7fc-c6053b3e:2206:2350 [0] NCCL INFO Connected all rings, use ring PXN 0 GDR 0
# 2025-10-23 13:21:45,006 - INFO - FSDP 模型封装完成。
# autodl-container-fa4746a7fc-c6053b3e:2206:2206 [0] NCCL INFO comm 0xec90940 rank 1 nranks 2 cudaDev 0 busId 27000 - Destroy COMPLETE
# Verify on the master host (172.17.0.4).
tree checkpoints/ models/
# checkpoints/
# ├── model_epoch_1.pth
# ├── model_epoch_2.pth
# └── model_epoch_3.pth
# models/
# └── final_model.pth
# 0 directories, 1 file
# Show the size and modification time of the final model file.
ls -lhrt models/final_model.pth
# -rw-r--r-- 1 root root 342M Oct 23 13:23 models/final_model.pth
# FSDP2
# Run the training command; each host runs its own invocation.
# Host 01 (172.17.0.4), node rank 0:
torchrun \
  --nnodes=2 \
  --nproc_per_node=2 \
  --node_rank=0 \
  --master_addr=172.17.0.4 \
  --master_port=29500 \
  fsdp2_gpt_wikitext2.py \
  --epochs 3 \
  --batch_size 8
# Host 02 (172.17.0.7), node rank 1:
torchrun \
  --nnodes=2 \
  --nproc_per_node=2 \
  --node_rank=1 \
  --master_addr=172.17.0.4 \
  --master_port=29500 \
  fsdp2_gpt_wikitext2.py \
  --epochs 3 \
  --batch_size 8
# 主机01-172.17.0.4-运行结果:
# 2025-10-23 13:27:27,357 - INFO - 使用设备: cuda:0
# 2025-10-23 13:27:27,357 - INFO - 加载 WikiText 数据集...
# 2025-10-23 13:27:32,413 - INFO - 加载 WikiText 数据集...
# 2025-10-23 13:27:50,516 - INFO - 加载 23767 条非空文本
# 2025-10-23 13:27:51,099 - INFO - 编码文本...
# 2025-10-23 13:27:52,064 - INFO - 加载 23767 条非空文本
# 2025-10-23 13:27:52,601 - INFO - 编码文本...
# 2025-10-23 13:28:08,494 - INFO - 总 token 数=2301731, 块数=8991
# 2025-10-23 13:28:09,214 - INFO - 使用 FSDP2 (fully_shard) 封装模型...
# 2025-10-23 13:28:09,249 - INFO - FSDP2 模型封装完成。
# autodl-container-fa4746a7fc-5df31a9e:2663:2663 [0] NCCL INFO Bootstrap: Using eth0:172.17.0.4<0>
# autodl-container-fa4746a7fc-5df31a9e:2663:2663 [0] NCCL INFO cudaDriverVersion 13000
# autodl-container-fa4746a7fc-5df31a9e:2663:2663 [0] NCCL INFO NCCL version 2.27.3+cuda12.9
# autodl-container-fa4746a7fc-5df31a9e:2663:2663 [0] NCCL INFO Comm config Blocking set to 1
# autodl-container-fa4746a7fc-5df31a9e:2663:2860 [0] NCCL INFO NET/Plugin: Could not find: libnccl-net.so.
# autodl-container-fa4746a7fc-5df31a9e:2663:2860 [0] NCCL INFO NCCL_IB_DISABLE set by environment to 1.
# autodl-container-fa4746a7fc-5df31a9e:2663:2860 [0] NCCL INFO NET/Socket : Using [0]eth0:172.17.0.4<0>
# autodl-container-fa4746a7fc-5df31a9e:2663:2860 [0] NCCL INFO Initialized NET plugin Socket
# autodl-container-fa4746a7fc-5df31a9e:2663:2860 [0] NCCL INFO Assigned NET plugin Socket to comm
# autodl-container-fa4746a7fc-5df31a9e:2663:2860 [0] NCCL INFO Using network Socket
# autodl-container-fa4746a7fc-5df31a9e:2663:2860 [0] NCCL INFO ncclCommInitRankConfig comm 0x10b2c2c0 rank 0 nranks 4 cudaDev 0 nvmlDev 0 busId 38000 commId 0x7c7efefc7aae1dd3 - Init START
# 2025-10-23 13:28:09,998 - INFO - 总 token 数=2301731, 块数=8991
# 2025-10-23 13:28:10,715 - INFO - 使用 FSDP2 (fully_shard) 封装模型...
# 2025-10-23 13:28:10,748 - INFO - FSDP2 模型封装完成。
# autodl-container-fa4746a7fc-5df31a9e:2664:2664 [1] NCCL INFO cudaDriverVersion 13000
# autodl-container-fa4746a7fc-5df31a9e:2664:2664 [1] NCCL INFO Bootstrap: Using eth0:172.17.0.4<0>
# autodl-container-fa4746a7fc-5df31a9e:2664:2664 [1] NCCL INFO NCCL version 2.27.3+cuda12.9
# autodl-container-fa4746a7fc-5df31a9e:2664:2664 [1] NCCL INFO Comm config Blocking set to 1
# autodl-container-fa4746a7fc-5df31a9e:2664:2929 [1] NCCL INFO NET/Plugin: Could not find: libnccl-net.so.
# autodl-container-fa4746a7fc-5df31a9e:2664:2929 [1] NCCL INFO NCCL_IB_DISABLE set by environment to 1.
# autodl-container-fa4746a7fc-5df31a9e:2664:2929 [1] NCCL INFO NET/Socket : Using [0]eth0:172.17.0.4<0>
# autodl-container-fa4746a7fc-5df31a9e:2664:2929 [1] NCCL INFO Initialized NET plugin Socket
# autodl-container-fa4746a7fc-5df31a9e:2664:2929 [1] NCCL INFO Assigned NET plugin Socket to comm
# autodl-container-fa4746a7fc-5df31a9e:2664:2929 [1] NCCL INFO Using network Socket
# autodl-container-fa4746a7fc-5df31a9e:2664:2929 [1] NCCL INFO ncclCommInitRankConfig comm 0xe3cd810 rank 1 nranks 4 cudaDev 1 nvmlDev 1 busId a8000 commId 0x7c7efefc7aae1dd3 - Init START
# autodl-container-fa4746a7fc-5df31a9e:2664:2929 [1] NCCL INFO RAS client listening socket at ::1<28028>
# autodl-container-fa4746a7fc-5df31a9e:2663:2860 [0] NCCL INFO RAS client listening socket at ::1<28028>
# autodl-container-fa4746a7fc-5df31a9e:2663:2860 [0] NCCL INFO Bootstrap timings total 23.533898 (create 0.000021, send 0.000081, recv 1.458636, ring 0.000086, delay 0.000000)
# autodl-container-fa4746a7fc-5df31a9e:2664:2929 [1] NCCL INFO Bootstrap timings total 22.075780 (create 0.000025, send 0.000080, recv 14.921014, ring 7.154198, delay 0.000000)
# autodl-container-fa4746a7fc-5df31a9e:2664:2929 [1] NCCL INFO NCCL_P2P_DISABLE set by environment to 1
# autodl-container-fa4746a7fc-5df31a9e:2663:2860 [0] NCCL INFO NCCL_P2P_DISABLE set by environment to 1
# autodl-container-fa4746a7fc-5df31a9e:2664:2929 [1] NCCL INFO Setting affinity for GPU 1 to 52-103,156-207
# autodl-container-fa4746a7fc-5df31a9e:2663:2860 [0] NCCL INFO Setting affinity for GPU 0 to 0-51,104-155
# autodl-container-fa4746a7fc-5df31a9e:2664:2929 [1] NCCL INFO comm 0xe3cd810 rank 1 nRanks 4 nNodes 2 localRanks 2 localRank 1 MNNVL 0
# autodl-container-fa4746a7fc-5df31a9e:2664:2929 [1] NCCL INFO Trees [0] -1/-1/-1->1->0 [1] -1/-1/-1->1->0
# autodl-container-fa4746a7fc-5df31a9e:2663:2860 [0] NCCL INFO comm 0x10b2c2c0 rank 0 nRanks 4 nNodes 2 localRanks 2 localRank 0 MNNVL 0
# autodl-container-fa4746a7fc-5df31a9e:2664:2929 [1] NCCL INFO P2P Chunksize set to 131072
# autodl-container-fa4746a7fc-5df31a9e:2663:2860 [0] NCCL INFO Channel 00/02 : 0 1 2 3
# autodl-container-fa4746a7fc-5df31a9e:2663:2860 [0] NCCL INFO Channel 01/02 : 0 1 2 3
# autodl-container-fa4746a7fc-5df31a9e:2663:2860 [0] NCCL INFO Trees [0] 1/2/-1->0->-1 [1] 1/-1/-1->0->2
# autodl-container-fa4746a7fc-5df31a9e:2663:2860 [0] NCCL INFO P2P Chunksize set to 131072
# autodl-container-fa4746a7fc-5df31a9e:2664:2929 [1] NCCL INFO PROFILER/Plugin: Could not find: libnccl-profiler.so.
# autodl-container-fa4746a7fc-5df31a9e:2664:2941 [1] NCCL INFO [Proxy Service UDS] Device 1 CPU core 76
# autodl-container-fa4746a7fc-5df31a9e:2664:2940 [1] NCCL INFO [Proxy Service] Device 1 CPU core 75
# autodl-container-fa4746a7fc-5df31a9e:2663:2860 [0] NCCL INFO PROFILER/Plugin: Could not find: libnccl-profiler.so.
# autodl-container-fa4746a7fc-5df31a9e:2663:2860 [0] NCCL INFO Check P2P Type isAllDirectP2p 0 directMode 0
# autodl-container-fa4746a7fc-5df31a9e:2663:2942 [0] NCCL INFO [Proxy Service] Device 0 CPU core 0
# autodl-container-fa4746a7fc-5df31a9e:2663:2943 [0] NCCL INFO [Proxy Service UDS] Device 0 CPU core 128
# autodl-container-fa4746a7fc-5df31a9e:2664:2929 [1] NCCL INFO threadThresholds 8/8/64 | 32/8/64 | 512 | 512
# autodl-container-fa4746a7fc-5df31a9e:2664:2929 [1] NCCL INFO 2 coll channels, 2 collnet channels, 0 nvls channels, 2 p2p channels, 2 p2p channels per peer
# autodl-container-fa4746a7fc-5df31a9e:2663:2860 [0] NCCL INFO threadThresholds 8/8/64 | 32/8/64 | 512 | 512
# autodl-container-fa4746a7fc-5df31a9e:2663:2860 [0] NCCL INFO 2 coll channels, 2 collnet channels, 0 nvls channels, 2 p2p channels, 2 p2p channels per peer
# autodl-container-fa4746a7fc-5df31a9e:2663:2860 [0] NCCL INFO CC Off, workFifoBytes 1048576
# autodl-container-fa4746a7fc-5df31a9e:2664:2929 [1] NCCL INFO TUNER/Plugin: Could not find: libnccl-tuner.so. Using internal tuner plugin.
# autodl-container-fa4746a7fc-5df31a9e:2664:2929 [1] NCCL INFO ncclCommInitRankConfig comm 0xe3cd810 rank 1 nranks 4 cudaDev 1 nvmlDev 1 busId a8000 commId 0x7c7efefc7aae1dd3 - Init COMPLETE
# autodl-container-fa4746a7fc-5df31a9e:2664:2929 [1] NCCL INFO Init timings - ncclCommInitRankConfig: rank 1 nranks 4 total 22.33 (kernels 0.21, alloc 0.00, bootstrap 22.08, allgathers 0.01, topo 0.01, graphs 0.00, connections 0.03, rest 0.00)
# autodl-container-fa4746a7fc-5df31a9e:2663:2860 [0] NCCL INFO TUNER/Plugin: Could not find: libnccl-tuner.so. Using internal tuner plugin.
# autodl-container-fa4746a7fc-5df31a9e:2663:2860 [0] NCCL INFO ncclCommInitRankConfig comm 0x10b2c2c0 rank 0 nranks 4 cudaDev 0 nvmlDev 0 busId 38000 commId 0x7c7efefc7aae1dd3 - Init COMPLETE
# autodl-container-fa4746a7fc-5df31a9e:2663:2860 [0] NCCL INFO Init timings - ncclCommInitRankConfig: rank 0 nranks 4 total 23.79 (kernels 0.21, alloc 0.00, bootstrap 23.53, allgathers 0.00, topo 0.01, graphs 0.00, connections 0.03, rest 0.00)
# autodl-container-fa4746a7fc-5df31a9e:2663:2946 [0] NCCL INFO [Proxy Progress] Device 0 CPU core 17
# autodl-container-fa4746a7fc-5df31a9e:2663:2945 [0] NCCL INFO Channel 00/0 : 3[1] -> 0[0] [receive] via NET/Socket/0
# autodl-container-fa4746a7fc-5df31a9e:2663:2945 [0] NCCL INFO Channel 01/0 : 3[1] -> 0[0] [receive] via NET/Socket/0
# autodl-container-fa4746a7fc-5df31a9e:2663:2945 [0] NCCL INFO Channel 00 : 0[0] -> 1[1] via SHM/direct/direct
# autodl-container-fa4746a7fc-5df31a9e:2663:2945 [0] NCCL INFO Channel 01 : 0[0] -> 1[1] via SHM/direct/direct
# autodl-container-fa4746a7fc-5df31a9e:2664:2947 [1] NCCL INFO [Proxy Progress] Device 1 CPU core 77
# autodl-container-fa4746a7fc-5df31a9e:2664:2944 [1] NCCL INFO Channel 00/0 : 1[1] -> 2[0] [send] via NET/Socket/0
# autodl-container-fa4746a7fc-5df31a9e:2664:2944 [1] NCCL INFO Channel 01/0 : 1[1] -> 2[0] [send] via NET/Socket/0
# autodl-container-fa4746a7fc-5df31a9e:2663:2945 [0] NCCL INFO Connected all rings, use ring PXN 0 GDR 0
# autodl-container-fa4746a7fc-5df31a9e:2664:2944 [1] NCCL INFO Connected all rings, use ring PXN 0 GDR 0
# 2025-10-23 13:28:40,901 - INFO - Epoch 1, Batch 50, Avg Loss (last 50): 7.7729
# 2025-10-23 13:28:48,065 - INFO - Epoch 1, Batch 100, Avg Loss (last 50): 7.2346
# 2025-10-23 13:28:55,181 - INFO - Epoch 1, Batch 150, Avg Loss (last 50): 7.2515
# 2025-10-23 13:29:02,305 - INFO - Epoch 1, Batch 200, Avg Loss (last 50): 7.2347
# 2025-10-23 13:29:09,448 - INFO - Epoch 1, Batch 250, Avg Loss (last 50): 7.2319
# 2025-10-23 13:29:13,894 - INFO - Epoch 1/3, Avg Loss: 7.3361
# 2025-10-23 13:29:21,747 - INFO - Epoch 2, Batch 50, Avg Loss (last 50): 7.2398
# 2025-10-23 13:29:28,935 - INFO - Epoch 2, Batch 100, Avg Loss (last 50): 7.2281
# 2025-10-23 13:29:36,173 - INFO - Epoch 2, Batch 150, Avg Loss (last 50): 7.2154
# 2025-10-23 13:29:43,502 - INFO - Epoch 2, Batch 200, Avg Loss (last 50): 7.2471
# 2025-10-23 13:29:50,980 - INFO - Epoch 2, Batch 250, Avg Loss (last 50): 7.2384
# 2025-10-23 13:29:55,626 - INFO - Epoch 2/3, Avg Loss: 7.2290
# 2025-10-23 13:30:03,604 - INFO - Epoch 3, Batch 50, Avg Loss (last 50): 7.1685
# 2025-10-23 13:30:10,855 - INFO - Epoch 3, Batch 100, Avg Loss (last 50): 7.0924
# 2025-10-23 13:30:18,113 - INFO - Epoch 3, Batch 150, Avg Loss (last 50): 7.0307
# 2025-10-23 13:30:25,369 - INFO - Epoch 3, Batch 200, Avg Loss (last 50): 6.9945
# 2025-10-23 13:30:32,601 - INFO - Epoch 3, Batch 250, Avg Loss (last 50): 6.8960
# 2025-10-23 13:30:37,206 - INFO - Epoch 3/3, Avg Loss: 7.0173
# 2025-10-23 13:30:38,256 - INFO - 最终模型已保存至 models/final_model.pth
# autodl-container-fa4746a7fc-5df31a9e:2664:2664 [1] NCCL INFO comm 0xe3cd810 rank 1 nranks 4 cudaDev 1 busId a8000 - Destroy COMPLETE
# autodl-container-fa4746a7fc-5df31a9e:2663:2663 [0] NCCL INFO comm 0x10b2c2c0 rank 0 nranks 4 cudaDev 0 busId 38000 - Destroy COMPLETE
# 主机02-172.17.0.7-运行结果:
# 2025-10-23 13:27:40,725 - INFO - 加载 WikiText 数据集...
# 2025-10-23 13:27:40,732 - INFO - 加载 WikiText 数据集...
# 2025-10-23 13:28:07,117 - INFO - 加载 23767 条非空文本
# 2025-10-23 13:28:07,666 - INFO - 编码文本...
# 2025-10-23 13:28:13,864 - INFO - 加载 23767 条非空文本
# 2025-10-23 13:28:14,665 - INFO - 编码文本...
# 2025-10-23 13:28:24,881 - INFO - 总 token 数=2301731, 块数=8991
# 2025-10-23 13:28:25,603 - INFO - 使用 FSDP2 (fully_shard) 封装模型...
# 2025-10-23 13:28:25,636 - INFO - FSDP2 模型封装完成。
# autodl-container-fa4746a7fc-c6053b3e:2598:2598 [0] NCCL INFO cudaDriverVersion 13000
# autodl-container-fa4746a7fc-c6053b3e:2598:2598 [0] NCCL INFO Bootstrap: Using eth0:172.17.0.7<0>
# autodl-container-fa4746a7fc-c6053b3e:2598:2598 [0] NCCL INFO NCCL version 2.27.3+cuda12.9
# autodl-container-fa4746a7fc-c6053b3e:2598:2598 [0] NCCL INFO Comm config Blocking set to 1
# autodl-container-fa4746a7fc-c6053b3e:2598:2802 [0] NCCL INFO NET/Plugin: Could not find: libnccl-net.so.
# autodl-container-fa4746a7fc-c6053b3e:2598:2802 [0] NCCL INFO NCCL_IB_DISABLE set by environment to 1.
# autodl-container-fa4746a7fc-c6053b3e:2598:2802 [0] NCCL INFO NET/Socket : Using [0]eth0:172.17.0.7<0>
# autodl-container-fa4746a7fc-c6053b3e:2598:2802 [0] NCCL INFO Initialized NET plugin Socket
# autodl-container-fa4746a7fc-c6053b3e:2598:2802 [0] NCCL INFO Assigned NET plugin Socket to comm
# autodl-container-fa4746a7fc-c6053b3e:2598:2802 [0] NCCL INFO Using network Socket
# autodl-container-fa4746a7fc-c6053b3e:2598:2802 [0] NCCL INFO ncclCommInitRankConfig comm 0x105acf90 rank 2 nranks 4 cudaDev 0 nvmlDev 0 busId 27000 commId 0x7c7efefc7aae1dd3 - Init START
# 2025-10-23 13:28:32,050 - INFO - 总 token 数=2301731, 块数=8991
# 2025-10-23 13:28:32,787 - INFO - 使用 FSDP2 (fully_shard) 封装模型...
# 2025-10-23 13:28:32,821 - INFO - FSDP2 模型封装完成。
# autodl-container-fa4746a7fc-c6053b3e:2599:2599 [1] NCCL INFO cudaDriverVersion 13000
# autodl-container-fa4746a7fc-c6053b3e:2599:2599 [1] NCCL INFO Bootstrap: Using eth0:172.17.0.7<0>
# autodl-container-fa4746a7fc-c6053b3e:2599:2599 [1] NCCL INFO NCCL version 2.27.3+cuda12.9
# autodl-container-fa4746a7fc-c6053b3e:2599:2599 [1] NCCL INFO Comm config Blocking set to 1
# autodl-container-fa4746a7fc-c6053b3e:2599:2871 [1] NCCL INFO NET/Plugin: Could not find: libnccl-net.so.
# autodl-container-fa4746a7fc-c6053b3e:2599:2871 [1] NCCL INFO NCCL_IB_DISABLE set by environment to 1.
# autodl-container-fa4746a7fc-c6053b3e:2599:2871 [1] NCCL INFO NET/Socket : Using [0]eth0:172.17.0.7<0>
# autodl-container-fa4746a7fc-c6053b3e:2599:2871 [1] NCCL INFO Initialized NET plugin Socket
# autodl-container-fa4746a7fc-c6053b3e:2599:2871 [1] NCCL INFO Assigned NET plugin Socket to comm
# autodl-container-fa4746a7fc-c6053b3e:2599:2871 [1] NCCL INFO Using network Socket
# autodl-container-fa4746a7fc-c6053b3e:2599:2871 [1] NCCL INFO ncclCommInitRankConfig comm 0x111449f0 rank 3 nranks 4 cudaDev 1 nvmlDev 1 busId c8000 commId 0x7c7efefc7aae1dd3 - Init START
# autodl-container-fa4746a7fc-c6053b3e:2598:2802 [0] NCCL INFO RAS client listening socket at ::1<28028>
# autodl-container-fa4746a7fc-c6053b3e:2599:2871 [1] NCCL INFO RAS client listening socket at ::1<28028>
# autodl-container-fa4746a7fc-c6053b3e:2599:2871 [1] NCCL INFO Bootstrap timings total 0.001214 (create 0.000025, send 0.000145, recv 0.000571, ring 0.000181, delay 0.000000)
# autodl-container-fa4746a7fc-c6053b3e:2598:2802 [0] NCCL INFO Bootstrap timings total 7.155062 (create 0.000025, send 0.000155, recv 7.154197, ring 0.000261, delay 0.000000)
# autodl-container-fa4746a7fc-c6053b3e:2599:2871 [1] NCCL INFO NCCL_P2P_DISABLE set by environment to 1
# autodl-container-fa4746a7fc-c6053b3e:2598:2802 [0] NCCL INFO NCCL_P2P_DISABLE set by environment to 1
# autodl-container-fa4746a7fc-c6053b3e:2599:2871 [1] NCCL INFO Setting affinity for GPU 1 to 52-103,156-207
# autodl-container-fa4746a7fc-c6053b3e:2598:2802 [0] NCCL INFO Setting affinity for GPU 0 to 0-51,104-155
# autodl-container-fa4746a7fc-c6053b3e:2599:2871 [1] NCCL INFO comm 0x111449f0 rank 3 nRanks 4 nNodes 2 localRanks 2 localRank 1 MNNVL 0
# autodl-container-fa4746a7fc-c6053b3e:2599:2871 [1] NCCL INFO Trees [0] -1/-1/-1->3->2 [1] -1/-1/-1->3->2
# autodl-container-fa4746a7fc-c6053b3e:2599:2871 [1] NCCL INFO P2P Chunksize set to 131072
# autodl-container-fa4746a7fc-c6053b3e:2598:2802 [0] NCCL INFO comm 0x105acf90 rank 2 nRanks 4 nNodes 2 localRanks 2 localRank 0 MNNVL 0
# autodl-container-fa4746a7fc-c6053b3e:2598:2802 [0] NCCL INFO Trees [0] 3/-1/-1->2->0 [1] 3/0/-1->2->-1
# autodl-container-fa4746a7fc-c6053b3e:2598:2802 [0] NCCL INFO P2P Chunksize set to 131072
# autodl-container-fa4746a7fc-c6053b3e:2599:2871 [1] NCCL INFO PROFILER/Plugin: Could not find: libnccl-profiler.so.
# autodl-container-fa4746a7fc-c6053b3e:2599:2874 [1] NCCL INFO [Proxy Service] Device 1 CPU core 103
# autodl-container-fa4746a7fc-c6053b3e:2599:2875 [1] NCCL INFO [Proxy Service UDS] Device 1 CPU core 201
# autodl-container-fa4746a7fc-c6053b3e:2598:2802 [0] NCCL INFO PROFILER/Plugin: Could not find: libnccl-profiler.so.
# autodl-container-fa4746a7fc-c6053b3e:2599:2871 [1] NCCL INFO threadThresholds 8/8/64 | 32/8/64 | 512 | 512
# autodl-container-fa4746a7fc-c6053b3e:2599:2871 [1] NCCL INFO 2 coll channels, 2 collnet channels, 0 nvls channels, 2 p2p channels, 2 p2p channels per peer
# autodl-container-fa4746a7fc-c6053b3e:2598:2876 [0] NCCL INFO [Proxy Service] Device 0 CPU core 2
# autodl-container-fa4746a7fc-c6053b3e:2598:2877 [0] NCCL INFO [Proxy Service UDS] Device 0 CPU core 116
# autodl-container-fa4746a7fc-c6053b3e:2598:2802 [0] NCCL INFO threadThresholds 8/8/64 | 32/8/64 | 512 | 512
# autodl-container-fa4746a7fc-c6053b3e:2598:2802 [0] NCCL INFO 2 coll channels, 2 collnet channels, 0 nvls channels, 2 p2p channels, 2 p2p channels per peer
# autodl-container-fa4746a7fc-c6053b3e:2599:2871 [1] NCCL INFO TUNER/Plugin: Could not find: libnccl-tuner.so. Using internal tuner plugin.
# autodl-container-fa4746a7fc-c6053b3e:2599:2871 [1] NCCL INFO ncclCommInitRankConfig comm 0x111449f0 rank 3 nranks 4 cudaDev 1 nvmlDev 1 busId c8000 commId 0x7c7efefc7aae1dd3 - Init COMPLETE
# autodl-container-fa4746a7fc-c6053b3e:2599:2871 [1] NCCL INFO Init timings - ncclCommInitRankConfig: rank 3 nranks 4 total 0.26 (kernels 0.21, alloc 0.00, bootstrap 0.00, allgathers 0.01, topo 0.01, graphs 0.00, connections 0.01, rest 0.02)
# autodl-container-fa4746a7fc-c6053b3e:2598:2802 [0] NCCL INFO TUNER/Plugin: Could not find: libnccl-tuner.so. Using internal tuner plugin.
# autodl-container-fa4746a7fc-c6053b3e:2598:2802 [0] NCCL INFO ncclCommInitRankConfig comm 0x105acf90 rank 2 nranks 4 cudaDev 0 nvmlDev 0 busId 27000 commId 0x7c7efefc7aae1dd3 - Init COMPLETE
# autodl-container-fa4746a7fc-c6053b3e:2598:2802 [0] NCCL INFO Init timings - ncclCommInitRankConfig: rank 2 nranks 4 total 7.42 (kernels 0.22, alloc 0.00, bootstrap 7.16, allgathers 0.00, topo 0.01, graphs 0.00, connections 0.03, rest 0.00)
# autodl-container-fa4746a7fc-c6053b3e:2598:2880 [0] NCCL INFO [Proxy Progress] Device 0 CPU core 37
# autodl-container-fa4746a7fc-c6053b3e:2598:2879 [0] NCCL INFO Channel 00/0 : 1[1] -> 2[0] [receive] via NET/Socket/0
# autodl-container-fa4746a7fc-c6053b3e:2598:2879 [0] NCCL INFO Channel 01/0 : 1[1] -> 2[0] [receive] via NET/Socket/0
# autodl-container-fa4746a7fc-c6053b3e:2599:2881 [1] NCCL INFO [Proxy Progress] Device 1 CPU core 157
# autodl-container-fa4746a7fc-c6053b3e:2599:2878 [1] NCCL INFO Channel 00/0 : 3[1] -> 0[0] [send] via NET/Socket/0
# autodl-container-fa4746a7fc-c6053b3e:2599:2878 [1] NCCL INFO Channel 01/0 : 3[1] -> 0[0] [send] via NET/Socket/0
# autodl-container-fa4746a7fc-c6053b3e:2598:2879 [0] NCCL INFO Channel 00 : 2[0] -> 3[1] via SHM/direct/direct
# autodl-container-fa4746a7fc-c6053b3e:2598:2879 [0] NCCL INFO Channel 01 : 2[0] -> 3[1] via SHM/direct/direct
# autodl-container-fa4746a7fc-c6053b3e:2598:2879 [0] NCCL INFO Connected all rings, use ring PXN 0 GDR 0
# autodl-container-fa4746a7fc-c6053b3e:2599:2878 [1] NCCL INFO Connected all rings, use ring PXN 0 GDR 0
# autodl-container-fa4746a7fc-c6053b3e:2598:2598 [0] NCCL INFO comm 0x105acf90 rank 2 nranks 4 cudaDev 0 busId 27000 - Destroy COMPLETE
# autodl-container-fa4746a7fc-c6053b3e:2599:2599 [1] NCCL INFO comm 0x111449f0 rank 3 nranks 4 cudaDev 1 busId c8000 - Destroy COMPLETE
# Inspect the host's network connections and GPU state while training runs.
# List TCP sockets with numeric addresses and keep the lines containing 29500.
ss --tcp --numeric | grep 29500
# ESTAB 0 0 [::ffff:172.17.0.7]:33328 [::ffff:172.17.0.4]:29500
# ESTAB 0 0 [::ffff:172.17.0.7]:52112 [::ffff:172.17.0.4]:29500
# ESTAB 0 0 [::ffff:172.17.0.7]:52128 [::ffff:172.17.0.4]:29500
# Watch network traffic on the host.
iftop
# Snapshot of GPU status (sample output follows).
nvidia-smi
# Thu Oct 23 13:30:03 2025
# +-----------------------------------------------------------------------------------------+
# | NVIDIA-SMI 580.76.05 Driver Version: 580.76.05 CUDA Version: 13.0 |
# +-----------------------------------------+------------------------+----------------------+
# | GPU Name Persistence-M | Bus-Id Disp.A | Volatile Uncorr. ECC |
# | Fan Temp Perf Pwr:Usage/Cap | Memory-Usage | GPU-Util Compute M. |
# | | | MIG M. |
# |=========================================+========================+======================|
# | 0 NVIDIA GeForce RTX 5090 On | 00000000:27:00.0 Off | N/A |
# | 42% 50C P1 180W / 575W | 3909MiB / 32607MiB | 99% Default |
# | | | N/A |
# +-----------------------------------------+------------------------+----------------------+
# | 1 NVIDIA GeForce RTX 5090 On | 00000000:C8:00.0 Off | N/A |
# | 41% 44C P1 178W / 575W | 3903MiB / 32607MiB | 99% Default |
# | | | N/A |
# +-----------------------------------------+------------------------+----------------------+
# +-----------------------------------------------------------------------------------------+
# | Processes: |
# | GPU GI CI PID Type Process name GPU Memory |
# | ID ID Usage |
# |=========================================================================================|
# | 0 N/A N/A 2598 C /root/miniconda3/bin/python 3900MiB |
# | 1 N/A N/A 2599 C /root/miniconda3/bin/python 3894MiB |
# +-----------------------------------------------------------------------------------------+
# Verify on the master host (172.17.0.4): list the per-epoch checkpoints and
# the final model written by the run.
tree checkpoints/ models/
# checkpoints/
# ├── model_epoch_1.pth
# ├── model_epoch_2.pth
# └── model_epoch_3.pth
# models/
# └── final_model.pth
# 0 directories, 1 file
# NOTE(review): the summary line above does not match the four files listed;
# confirm against a fresh run.
# Long listing with human-readable size for the final model file.
ls -l -h -r -t models/final_model.pth
# -rw-r--r-- 1 root root 342M Oct 23 13:30 models/final_model.pth
基于DeepSpeed
单机多GPU-DeepSpeed
DeepSpeed-Stage-ZeRO-1
bash
cd /data/llm-in-practise/LLM_Distributed_Trainning/DeepSpeed/DeepSpeed-GPTLike-ZeRO-1
# Select the GPUs through the launcher itself. The deepspeed launcher ignores
# CUDA_VISIBLE_DEVICES whenever --include/--exclude/--num_gpus/--num_nodes is
# passed, so the previous `export CUDA_VISIBLE_DEVICES=0,1` combined with
# `--num_gpus 2` had no effect. The captured log below was produced by that
# earlier invocation and therefore still shows the "ignoring it" warning.
# Run training on local GPUs 0 and 1.
deepspeed --include localhost:0,1 DeepSpeed-GPTLike-ZeRO-1.py
# [2025-10-23 11:58:46,489] [WARNING] [runner.py:232:fetch_hostfile] Unable to find hostfile, will proceed with training with local resources only.
# Detected VISIBLE_DEVICES=0,1 but ignoring it because one or several of --include/--exclude/--num_gpus/--num_nodes cl args were used. If you want to use CUDA_VISIBLE_DEVICES don't pass any of these arguments to deepspeed.
# [2025-10-23 11:58:46,489] [INFO] [runner.py:630:main] cmd = /root/miniconda3/bin/python -u -m deepspeed.launcher.launch --world_info=eyJsb2NhbGhvc3QiOiBbMCwgMV19 --master_addr=127.0.0.1 --master_port=29500 --enable_each_rank_log=None --log_level=info DeepSpeed-GPTLike-ZeRO-1.py
# [2025-10-23 11:58:50,667] [INFO] [launch.py:155:main] 0 NCCL_P2P_DISABLE=1
# [2025-10-23 11:58:50,667] [INFO] [launch.py:155:main] 0 NCCL_DEBUG=INFO
# [2025-10-23 11:58:50,667] [INFO] [launch.py:155:main] 0 TORCH_NCCL_BLOCKING_WAIT=1
# [2025-10-23 11:58:50,667] [INFO] [launch.py:155:main] 0 NCCL_IB_DISABLE=1
# [2025-10-23 11:58:50,667] [INFO] [launch.py:162:main] WORLD INFO DICT: {'localhost': [0, 1]}
# [2025-10-23 11:58:50,667] [INFO] [launch.py:168:main] nnodes=1, num_local_procs=2, node_rank=0
# [2025-10-23 11:58:50,667] [INFO] [launch.py:179:main] global_rank_mapping=defaultdict(<class 'list'>, {'localhost': [0, 1]})
# [2025-10-23 11:58:50,667] [INFO] [launch.py:180:main] dist_world_size=2
# [2025-10-23 11:58:50,667] [INFO] [launch.py:184:main] Setting CUDA_VISIBLE_DEVICES=0,1
# [2025-10-23 11:58:50,668] [INFO] [launch.py:272:main] process 8016 spawned with command: ['/root/miniconda3/bin/python', '-u', 'DeepSpeed-GPTLike-ZeRO-1.py', '--local_rank=0']
# [2025-10-23 11:58:50,668] [INFO] [launch.py:272:main] process 8017 spawned with command: ['/root/miniconda3/bin/python', '-u', 'DeepSpeed-GPTLike-ZeRO-1.py', '--local_rank=1']
# 2025-10-23 11:58:55,505 - INFO - 从 ds_config.json 读取到 train_micro_batch_size_per_gpu=16,将用于 DataLoader.batch_size(覆盖命令行 --batch_size)
# 2025-10-23 11:58:55,530 - INFO - 从 ds_config.json 读取到 train_micro_batch_size_per_gpu=16,将用于 DataLoader.batch_size(覆盖命令行 --batch_size)
# autodl-container-fa4746a7fc-c6053b3e:8016:8016 [0] NCCL INFO Bootstrap: Using eth0:172.17.0.4<0>
# autodl-container-fa4746a7fc-c6053b3e:8016:8016 [0] NCCL INFO cudaDriverVersion 13000
# autodl-container-fa4746a7fc-c6053b3e:8016:8016 [0] NCCL INFO NCCL version 2.27.3+cuda12.9
# autodl-container-fa4746a7fc-c6053b3e:8017:8017 [1] NCCL INFO cudaDriverVersion 13000
# autodl-container-fa4746a7fc-c6053b3e:8017:8017 [1] NCCL INFO Bootstrap: Using eth0:172.17.0.4<0>
# autodl-container-fa4746a7fc-c6053b3e:8017:8017 [1] NCCL INFO NCCL version 2.27.3+cuda12.9
# autodl-container-fa4746a7fc-c6053b3e:8017:8017 [1] NCCL INFO Comm config Blocking set to 1
# autodl-container-fa4746a7fc-c6053b3e:8016:8016 [0] NCCL INFO Comm config Blocking set to 1
# autodl-container-fa4746a7fc-c6053b3e:8017:8484 [1] NCCL INFO NET/Plugin: Could not find: libnccl-net.so.
# autodl-container-fa4746a7fc-c6053b3e:8017:8484 [1] NCCL INFO NCCL_IB_DISABLE set by environment to 1.
# autodl-container-fa4746a7fc-c6053b3e:8017:8484 [1] NCCL INFO NET/Socket : Using [0]eth0:172.17.0.4<0>
# autodl-container-fa4746a7fc-c6053b3e:8017:8484 [1] NCCL INFO Initialized NET plugin Socket
# autodl-container-fa4746a7fc-c6053b3e:8017:8484 [1] NCCL INFO Assigned NET plugin Socket to comm
# autodl-container-fa4746a7fc-c6053b3e:8017:8484 [1] NCCL INFO Using network Socket
# autodl-container-fa4746a7fc-c6053b3e:8017:8484 [1] NCCL INFO ncclCommInitRankConfig comm 0xd0bd2e0 rank 1 nranks 2 cudaDev 1 nvmlDev 1 busId b8000 commId 0x51e42c3ce21fa4ca - Init START
# autodl-container-fa4746a7fc-c6053b3e:8016:8485 [0] NCCL INFO NET/Plugin: Could not find: libnccl-net.so.
# autodl-container-fa4746a7fc-c6053b3e:8016:8485 [0] NCCL INFO NCCL_IB_DISABLE set by environment to 1.
# autodl-container-fa4746a7fc-c6053b3e:8016:8485 [0] NCCL INFO NET/Socket : Using [0]eth0:172.17.0.4<0>
# autodl-container-fa4746a7fc-c6053b3e:8016:8485 [0] NCCL INFO Initialized NET plugin Socket
# autodl-container-fa4746a7fc-c6053b3e:8016:8485 [0] NCCL INFO Assigned NET plugin Socket to comm
# autodl-container-fa4746a7fc-c6053b3e:8016:8485 [0] NCCL INFO Using network Socket
# autodl-container-fa4746a7fc-c6053b3e:8016:8485 [0] NCCL INFO ncclCommInitRankConfig comm 0xe1a8fb0 rank 0 nranks 2 cudaDev 0 nvmlDev 0 busId 38000 commId 0x51e42c3ce21fa4ca - Init START
# autodl-container-fa4746a7fc-c6053b3e:8016:8485 [0] NCCL INFO RAS client listening socket at ::1<28028>
# autodl-container-fa4746a7fc-c6053b3e:8017:8484 [1] NCCL INFO RAS client listening socket at ::1<28028>
# autodl-container-fa4746a7fc-c6053b3e:8016:8485 [0] NCCL INFO Bootstrap timings total 0.000678 (create 0.000018, send 0.000069, recv 0.000290, ring 0.000019, delay 0.000000)
# autodl-container-fa4746a7fc-c6053b3e:8017:8484 [1] NCCL INFO Bootstrap timings total 0.077216 (create 0.000023, send 0.000081, recv 0.037039, ring 0.000014, delay 0.000000)
# autodl-container-fa4746a7fc-c6053b3e:8016:8485 [0] NCCL INFO NCCL_P2P_DISABLE set by environment to 1
# autodl-container-fa4746a7fc-c6053b3e:8017:8484 [1] NCCL INFO NCCL_P2P_DISABLE set by environment to 1
# autodl-container-fa4746a7fc-c6053b3e:8016:8485 [0] NCCL INFO Setting affinity for GPU 0 to 0-51,104-155
# autodl-container-fa4746a7fc-c6053b3e:8017:8484 [1] NCCL INFO Setting affinity for GPU 1 to 52-103,156-207
# autodl-container-fa4746a7fc-c6053b3e:8017:8484 [1] NCCL INFO comm 0xd0bd2e0 rank 1 nRanks 2 nNodes 1 localRanks 2 localRank 1 MNNVL 0
# autodl-container-fa4746a7fc-c6053b3e:8017:8484 [1] NCCL INFO Trees [0] -1/-1/-1->1->0 [1] -1/-1/-1->1->0
# autodl-container-fa4746a7fc-c6053b3e:8017:8484 [1] NCCL INFO P2P Chunksize set to 131072
# autodl-container-fa4746a7fc-c6053b3e:8016:8485 [0] NCCL INFO comm 0xe1a8fb0 rank 0 nRanks 2 nNodes 1 localRanks 2 localRank 0 MNNVL 0
# autodl-container-fa4746a7fc-c6053b3e:8016:8485 [0] NCCL INFO Channel 00/02 : 0 1
# autodl-container-fa4746a7fc-c6053b3e:8016:8485 [0] NCCL INFO Channel 01/02 : 0 1
# autodl-container-fa4746a7fc-c6053b3e:8016:8485 [0] NCCL INFO Trees [0] 1/-1/-1->0->-1 [1] 1/-1/-1->0->-1
# autodl-container-fa4746a7fc-c6053b3e:8016:8485 [0] NCCL INFO P2P Chunksize set to 131072
# autodl-container-fa4746a7fc-c6053b3e:8017:8484 [1] NCCL INFO PROFILER/Plugin: Could not find: libnccl-profiler.so.
# autodl-container-fa4746a7fc-c6053b3e:8017:8488 [1] NCCL INFO [Proxy Service] Device 1 CPU core 169
# autodl-container-fa4746a7fc-c6053b3e:8017:8489 [1] NCCL INFO [Proxy Service UDS] Device 1 CPU core 66
# autodl-container-fa4746a7fc-c6053b3e:8016:8485 [0] NCCL INFO PROFILER/Plugin: Could not find: libnccl-profiler.so.
# autodl-container-fa4746a7fc-c6053b3e:8016:8485 [0] NCCL INFO Check P2P Type isAllDirectP2p 0 directMode 0
# autodl-container-fa4746a7fc-c6053b3e:8017:8484 [1] NCCL INFO threadThresholds 8/8/64 | 16/8/64 | 512 | 512
# autodl-container-fa4746a7fc-c6053b3e:8017:8484 [1] NCCL INFO 2 coll channels, 2 collnet channels, 0 nvls channels, 2 p2p channels, 2 p2p channels per peer
# autodl-container-fa4746a7fc-c6053b3e:8016:8490 [0] NCCL INFO [Proxy Service] Device 0 CPU core 105
# autodl-container-fa4746a7fc-c6053b3e:8016:8491 [0] NCCL INFO [Proxy Service UDS] Device 0 CPU core 107
# autodl-container-fa4746a7fc-c6053b3e:8016:8485 [0] NCCL INFO threadThresholds 8/8/64 | 16/8/64 | 512 | 512
# autodl-container-fa4746a7fc-c6053b3e:8016:8485 [0] NCCL INFO 2 coll channels, 2 collnet channels, 0 nvls channels, 2 p2p channels, 2 p2p channels per peer
# autodl-container-fa4746a7fc-c6053b3e:8016:8485 [0] NCCL INFO CC Off, workFifoBytes 1048576
# autodl-container-fa4746a7fc-c6053b3e:8017:8484 [1] NCCL INFO TUNER/Plugin: Could not find: libnccl-tuner.so. Using internal tuner plugin.
# autodl-container-fa4746a7fc-c6053b3e:8017:8484 [1] NCCL INFO ncclCommInitRankConfig comm 0xd0bd2e0 rank 1 nranks 2 cudaDev 1 nvmlDev 1 busId b8000 commId 0x51e42c3ce21fa4ca - Init COMPLETE
# autodl-container-fa4746a7fc-c6053b3e:8017:8484 [1] NCCL INFO Init timings - ncclCommInitRankConfig: rank 1 nranks 2 total 0.30 (kernels 0.22, alloc 0.00, bootstrap 0.08, allgathers 0.00, topo 0.00, graphs 0.00, connections 0.00, rest 0.00)
# autodl-container-fa4746a7fc-c6053b3e:8016:8485 [0] NCCL INFO TUNER/Plugin: Could not find: libnccl-tuner.so. Using internal tuner plugin.
# autodl-container-fa4746a7fc-c6053b3e:8016:8485 [0] NCCL INFO ncclCommInitRankConfig comm 0xe1a8fb0 rank 0 nranks 2 cudaDev 0 nvmlDev 0 busId 38000 commId 0x51e42c3ce21fa4ca - Init COMPLETE
# autodl-container-fa4746a7fc-c6053b3e:8016:8485 [0] NCCL INFO Init timings - ncclCommInitRankConfig: rank 0 nranks 2 total 0.27 (kernels 0.22, alloc 0.00, bootstrap 0.00, allgathers 0.04, topo 0.00, graphs 0.00, connections 0.00, rest 0.00)
# 2025-10-23 11:59:06,127 - INFO - 加载 WikiText 数据集...
# 2025-10-23 11:59:06,165 - INFO - DeepSpeed 环境:rank=0, world_size=2
# 2025-10-23 11:59:06,165 - INFO - 使用设备: cuda:0
# 2025-10-23 11:59:06,165 - INFO - 加载 WikiText 数据集...
# 2025-10-23 11:59:40,602 - INFO - 加载 23767 条非空文本
# 2025-10-23 11:59:41,087 - INFO - BERT 分词器词汇大小: 30522
# 2025-10-23 11:59:41,087 - INFO - 编码文本...
# 2025-10-23 11:59:44,013 - INFO - 加载 23767 条非空文本
# 2025-10-23 11:59:44,551 - INFO - 编码文本...
# 2025-10-23 11:59:58,110 - INFO - 总 token 数=2301731, 块数=8991
# autodl-container-fa4746a7fc-c6053b3e:8016:8016 [0] NCCL INFO Comm config Blocking set to 1
# 2025-10-23 12:00:01,572 - INFO - 总 token 数=2301731, 块数=8991
# autodl-container-fa4746a7fc-c6053b3e:8017:8017 [1] NCCL INFO Comm config Blocking set to 1
# autodl-container-fa4746a7fc-c6053b3e:8016:8565 [0] NCCL INFO Assigned NET plugin Socket to comm
# autodl-container-fa4746a7fc-c6053b3e:8016:8565 [0] NCCL INFO Using network Socket
# autodl-container-fa4746a7fc-c6053b3e:8017:8618 [1] NCCL INFO Assigned NET plugin Socket to comm
# autodl-container-fa4746a7fc-c6053b3e:8017:8618 [1] NCCL INFO Using network Socket
# autodl-container-fa4746a7fc-c6053b3e:8016:8565 [0] NCCL INFO ncclCommSplit comm 0x1056cfb0 rank 0 nranks 2 cudaDev 0 nvmlDev 0 busId 38000 parent 0xe1a8fb0 splitCount 1 color 2130503744 key 0- Init START
# autodl-container-fa4746a7fc-c6053b3e:8017:8618 [1] NCCL INFO ncclCommSplit comm 0xffc3470 rank 1 nranks 2 cudaDev 1 nvmlDev 1 busId b8000 parent 0xd0bd2e0 splitCount 1 color 2130503744 key 1- Init START
# autodl-container-fa4746a7fc-c6053b3e:8016:8565 [0] NCCL INFO Setting affinity for GPU 0 to 0-51,104-155
# autodl-container-fa4746a7fc-c6053b3e:8017:8618 [1] NCCL INFO Setting affinity for GPU 1 to 52-103,156-207
# autodl-container-fa4746a7fc-c6053b3e:8017:8618 [1] NCCL INFO comm 0xffc3470 rank 1 nRanks 2 nNodes 1 localRanks 2 localRank 1 MNNVL 0
# autodl-container-fa4746a7fc-c6053b3e:8016:8565 [0] NCCL INFO comm 0x1056cfb0 rank 0 nRanks 2 nNodes 1 localRanks 2 localRank 0 MNNVL 0
# autodl-container-fa4746a7fc-c6053b3e:8017:8618 [1] NCCL INFO Trees [0] -1/-1/-1->1->0 [1] -1/-1/-1->1->0
# autodl-container-fa4746a7fc-c6053b3e:8017:8618 [1] NCCL INFO P2P Chunksize set to 131072
# autodl-container-fa4746a7fc-c6053b3e:8016:8565 [0] NCCL INFO Channel 00/02 : 0 1
# autodl-container-fa4746a7fc-c6053b3e:8016:8565 [0] NCCL INFO Channel 01/02 : 0 1
# autodl-container-fa4746a7fc-c6053b3e:8016:8565 [0] NCCL INFO Trees [0] 1/-1/-1->0->-1 [1] 1/-1/-1->0->-1
# autodl-container-fa4746a7fc-c6053b3e:8016:8565 [0] NCCL INFO P2P Chunksize set to 131072
# autodl-container-fa4746a7fc-c6053b3e:8016:8565 [0] NCCL INFO Check P2P Type isAllDirectP2p 0 directMode 0
# autodl-container-fa4746a7fc-c6053b3e:8017:8619 [1] NCCL INFO [Proxy Service] Device 1 CPU core 66
# autodl-container-fa4746a7fc-c6053b3e:8016:8622 [0] NCCL INFO [Proxy Service UDS] Device 0 CPU core 106
# autodl-container-fa4746a7fc-c6053b3e:8017:8621 [1] NCCL INFO [Proxy Service UDS] Device 1 CPU core 171
# autodl-container-fa4746a7fc-c6053b3e:8016:8620 [0] NCCL INFO [Proxy Service] Device 0 CPU core 105
# autodl-container-fa4746a7fc-c6053b3e:8017:8618 [1] NCCL INFO threadThresholds 8/8/64 | 16/8/64 | 512 | 512
# autodl-container-fa4746a7fc-c6053b3e:8017:8618 [1] NCCL INFO 2 coll channels, 2 collnet channels, 0 nvls channels, 2 p2p channels, 2 p2p channels per peer
# autodl-container-fa4746a7fc-c6053b3e:8016:8565 [0] NCCL INFO threadThresholds 8/8/64 | 16/8/64 | 512 | 512
# autodl-container-fa4746a7fc-c6053b3e:8016:8565 [0] NCCL INFO 2 coll channels, 2 collnet channels, 0 nvls channels, 2 p2p channels, 2 p2p channels per peer
# autodl-container-fa4746a7fc-c6053b3e:8016:8565 [0] NCCL INFO CC Off, workFifoBytes 1048576
# autodl-container-fa4746a7fc-c6053b3e:8017:8618 [1] NCCL INFO ncclCommSplit comm 0xffc3470 rank 1 nranks 2 cudaDev 1 nvmlDev 1 busId b8000 parent 0xd0bd2e0 splitCount 1 color 2130503744 key 1 - Init COMPLETE
# autodl-container-fa4746a7fc-c6053b3e:8016:8565 [0] NCCL INFO ncclCommSplit comm 0x1056cfb0 rank 0 nranks 2 cudaDev 0 nvmlDev 0 busId 38000 parent 0xe1a8fb0 splitCount 1 color 2130503744 key 0 - Init COMPLETE
# autodl-container-fa4746a7fc-c6053b3e:8016:8565 [0] NCCL INFO Init timings - ncclCommSplit: rank 0 nranks 2 total 3.49 (kernels 0.00, alloc 0.00, bootstrap 0.00, allgathers 0.00, topo 0.01, graphs 0.00, connections 0.00, rest 3.48)
# autodl-container-fa4746a7fc-c6053b3e:8017:8618 [1] NCCL INFO Init timings - ncclCommSplit: rank 1 nranks 2 total 0.01 (kernels 0.00, alloc 0.00, bootstrap 0.00, allgathers 0.00, topo 0.01, graphs 0.00, connections 0.00, rest 0.00)
# autodl-container-fa4746a7fc-c6053b3e:8017:8623 [1] NCCL INFO Channel 00 : 1[1] -> 0[0] via SHM/direct/direct
# autodl-container-fa4746a7fc-c6053b3e:8016:8624 [0] NCCL INFO Channel 00 : 0[0] -> 1[1] via SHM/direct/direct
# autodl-container-fa4746a7fc-c6053b3e:8017:8623 [1] NCCL INFO Channel 01 : 1[1] -> 0[0] via SHM/direct/direct
# autodl-container-fa4746a7fc-c6053b3e:8016:8624 [0] NCCL INFO Channel 01 : 0[0] -> 1[1] via SHM/direct/direct
# autodl-container-fa4746a7fc-c6053b3e:8017:8623 [1] NCCL INFO Connected all rings, use ring PXN 0 GDR 1
# autodl-container-fa4746a7fc-c6053b3e:8016:8624 [0] NCCL INFO Connected all rings, use ring PXN 0 GDR 1
# [2025-10-23 12:00:02,519] [WARNING] [lr_schedules.py:686:get_lr] Attempting to get learning rate from scheduler before it has started
# [2025-10-23 12:00:02,918] [WARNING] [lr_schedules.py:686:get_lr] Attempting to get learning rate from scheduler before it has started
# 2025-10-23 12:00:05,076 - INFO - Epoch 1/3, Batch 50, Avg Loss (last 50): 9.1718
# 2025-10-23 12:00:07,062 - INFO - Epoch 1/3, Batch 100, Avg Loss (last 50): 7.3743
# 2025-10-23 12:00:09,063 - INFO - Epoch 1/3, Batch 150, Avg Loss (last 50): 7.2468
# 2025-10-23 12:00:11,008 - INFO - Epoch 1/3, Batch 200, Avg Loss (last 50): 7.2195
# 2025-10-23 12:00:12,991 - INFO - Epoch 1/3, Batch 250, Avg Loss (last 50): 7.2265
# autodl-container-fa4746a7fc-c6053b3e:8017:8665 [1] NCCL INFO Channel 00 : 1[1] -> 0[0] via SHM/direct/direct
# autodl-container-fa4746a7fc-c6053b3e:8017:8665 [1] NCCL INFO Channel 01 : 1[1] -> 0[0] via SHM/direct/direct
# 2025-10-23 12:00:14,266 - INFO - Epoch 1/3, Avg Loss: 7.6032
# autodl-container-fa4746a7fc-c6053b3e:8016:8666 [0] NCCL INFO Channel 00 : 0[0] -> 1[1] via SHM/direct/direct
# autodl-container-fa4746a7fc-c6053b3e:8016:8666 [0] NCCL INFO Channel 01 : 0[0] -> 1[1] via SHM/direct/direct
# autodl-container-fa4746a7fc-c6053b3e:8017:8665 [1] NCCL INFO Connected all rings, use ring PXN 0 GDR 1
# autodl-container-fa4746a7fc-c6053b3e:8016:8666 [0] NCCL INFO Connected all rings, use ring PXN 0 GDR 1
# 2025-10-23 12:00:14,997 - INFO - 已保存检查点: checkpoints/epoch1
# 2025-10-23 12:00:16,823 - INFO - Epoch 2/3, Batch 50, Avg Loss (last 50): 7.2264
# 2025-10-23 12:00:18,754 - INFO - Epoch 2/3, Batch 100, Avg Loss (last 50): 7.2084
# 2025-10-23 12:00:20,750 - INFO - Epoch 2/3, Batch 150, Avg Loss (last 50): 7.2237
# 2025-10-23 12:00:22,719 - INFO - Epoch 2/3, Batch 200, Avg Loss (last 50): 7.2451
# 2025-10-23 12:00:24,694 - INFO - Epoch 2/3, Batch 250, Avg Loss (last 50): 7.2405
# 2025-10-23 12:00:25,991 - INFO - Epoch 2/3, Avg Loss: 7.2265
# 2025-10-23 12:00:26,668 - INFO - 已保存检查点: checkpoints/epoch2
# 2025-10-23 12:00:28,335 - INFO - Epoch 3/3, Batch 50, Avg Loss (last 50): 7.2171
# 2025-10-23 12:00:30,258 - INFO - Epoch 3/3, Batch 100, Avg Loss (last 50): 7.2249
# 2025-10-23 12:00:32,237 - INFO - Epoch 3/3, Batch 150, Avg Loss (last 50): 7.2176
# 2025-10-23 12:00:34,197 - INFO - Epoch 3/3, Batch 200, Avg Loss (last 50): 7.2316
# 2025-10-23 12:00:36,114 - INFO - Epoch 3/3, Batch 250, Avg Loss (last 50): 7.2280
# 2025-10-23 12:00:37,365 - INFO - Epoch 3/3, Avg Loss: 7.2213
# 2025-10-23 12:00:38,011 - INFO - 已保存检查点: checkpoints/epoch3
# 2025-10-23 12:00:38,188 - INFO - 最终模型已保存至 models/final_model.pth
# autodl-container-fa4746a7fc-c6053b3e:8016:8016 [0] NCCL INFO comm 0x1056cfb0 rank 0 nranks 2 cudaDev 0 busId 38000 - Destroy COMPLETE
# autodl-container-fa4746a7fc-c6053b3e:8017:8017 [1] NCCL INFO comm 0xffc3470 rank 1 nranks 2 cudaDev 1 busId b8000 - Destroy COMPLETE
# autodl-container-fa4746a7fc-c6053b3e:8016:8016 [0] NCCL INFO comm 0xe1a8fb0 rank 0 nranks 2 cudaDev 0 busId 38000 - Destroy COMPLETE
# 2025-10-23 12:00:38,818 - INFO - 已成功销毁 torch.distributed 进程组 (destroy_process_group)
# autodl-container-fa4746a7fc-c6053b3e:8017:8017 [1] NCCL INFO comm 0xd0bd2e0 rank 1 nranks 2 cudaDev 1 busId b8000 - Destroy COMPLETE
# [2025-10-23 12:00:40,676] [INFO] [launch.py:367:main] Process 8017 exits successfully.
# [2025-10-23 12:00:40,677] [INFO] [launch.py:367:main] Process 8016 exits successfully.
# 验证
tree ./checkpoints/ ./models/
# ./checkpoints/
# ├── epoch1
# │ ├── mp_rank_00_model_states.pt
# │ ├── zero_pp_rank_0_mp_rank_00_optim_states.pt
# │ └── zero_pp_rank_1_mp_rank_00_optim_states.pt
# ├── epoch2
# │ ├── mp_rank_00_model_states.pt
# │ ├── zero_pp_rank_0_mp_rank_00_optim_states.pt
# │ └── zero_pp_rank_1_mp_rank_00_optim_states.pt
# ├── epoch3
# │ ├── mp_rank_00_model_states.pt
# │ ├── zero_pp_rank_0_mp_rank_00_optim_states.pt
# │ └── zero_pp_rank_1_mp_rank_00_optim_states.pt
# ├── latest
# └── zero_to_fp32.py
# ./models/
# └── final_model.pth
# 0 directories, 1 file
DeepSpeed-Stage-ZeRO-2
bash
cd /data/hooper/llm-in-practise/LLM_Distributed_Trainning/DeepSpeed/DeepSpeed-GPTLike-ZeRO-2
deepspeed --num_gpus 2 DeepSpeed-GPTLike-ZeRO-2.py
# [2025-10-23 11:50:51,449] [WARNING] [runner.py:232:fetch_hostfile] Unable to find hostfile, will proceed with training with local resources only.
# Detected VISIBLE_DEVICES=0,1 but ignoring it because one or several of --include/--exclude/--num_gpus/--num_nodes cl args were used. If you want to use CUDA_VISIBLE_DEVICES don't pass any of these arguments to deepspeed.
# [2025-10-23 11:50:51,449] [INFO] [runner.py:630:main] cmd = /root/miniconda3/bin/python -u -m deepspeed.launcher.launch --world_info=eyJsb2NhbGhvc3QiOiBbMCwgMV19 --master_addr=127.0.0.1 --master_port=29500 --enable_each_rank_log=None --log_level=info DeepSpeed-GPTLike-ZeRO-2.py
# [2025-10-23 11:50:55,738] [INFO] [launch.py:155:main] 0 NCCL_P2P_DISABLE=1
# [2025-10-23 11:50:55,738] [INFO] [launch.py:155:main] 0 NCCL_DEBUG=INFO
# [2025-10-23 11:50:55,739] [INFO] [launch.py:155:main] 0 TORCH_NCCL_BLOCKING_WAIT=1
# [2025-10-23 11:50:55,739] [INFO] [launch.py:155:main] 0 NCCL_IB_DISABLE=1
# [2025-10-23 11:50:55,739] [INFO] [launch.py:162:main] WORLD INFO DICT: {'localhost': [0, 1]}
# [2025-10-23 11:50:55,739] [INFO] [launch.py:168:main] nnodes=1, num_local_procs=2, node_rank=0
# [2025-10-23 11:50:55,739] [INFO] [launch.py:179:main] global_rank_mapping=defaultdict(<class 'list'>, {'localhost': [0, 1]})
# [2025-10-23 11:50:55,739] [INFO] [launch.py:180:main] dist_world_size=2
# [2025-10-23 11:50:55,739] [INFO] [launch.py:184:main] Setting CUDA_VISIBLE_DEVICES=0,1
# [2025-10-23 11:50:55,739] [INFO] [launch.py:272:main] process 6803 spawned with command: ['/root/miniconda3/bin/python', '-u', 'DeepSpeed-GPTLike-ZeRO-2.py', '--local_rank=0']
# [2025-10-23 11:50:55,739] [INFO] [launch.py:272:main] process 6804 spawned with command: ['/root/miniconda3/bin/python', '-u', 'DeepSpeed-GPTLike-ZeRO-2.py', '--local_rank=1']
# 2025-10-23 11:51:00,415 - INFO - DataLoader batch_size=16
# 2025-10-23 11:51:00,499 - INFO - DataLoader batch_size=16
# autodl-container-fa4746a7fc-c6053b3e:6803:6803 [0] NCCL INFO Bootstrap: Using eth0:172.17.0.4<0>
# autodl-container-fa4746a7fc-c6053b3e:6803:6803 [0] NCCL INFO cudaDriverVersion 13000
# autodl-container-fa4746a7fc-c6053b3e:6803:6803 [0] NCCL INFO NCCL version 2.27.3+cuda12.9
# autodl-container-fa4746a7fc-c6053b3e:6804:6804 [1] NCCL INFO cudaDriverVersion 13000
# autodl-container-fa4746a7fc-c6053b3e:6804:6804 [1] NCCL INFO Bootstrap: Using eth0:172.17.0.4<0>
# autodl-container-fa4746a7fc-c6053b3e:6804:6804 [1] NCCL INFO NCCL version 2.27.3+cuda12.9
# autodl-container-fa4746a7fc-c6053b3e:6804:6804 [1] NCCL INFO Comm config Blocking set to 1
# autodl-container-fa4746a7fc-c6053b3e:6804:7269 [1] NCCL INFO NET/Plugin: Could not find: libnccl-net.so.
# autodl-container-fa4746a7fc-c6053b3e:6804:7269 [1] NCCL INFO NCCL_IB_DISABLE set by environment to 1.
# autodl-container-fa4746a7fc-c6053b3e:6804:7269 [1] NCCL INFO NET/Socket : Using [0]eth0:172.17.0.4<0>
# autodl-container-fa4746a7fc-c6053b3e:6804:7269 [1] NCCL INFO Initialized NET plugin Socket
# autodl-container-fa4746a7fc-c6053b3e:6804:7269 [1] NCCL INFO Assigned NET plugin Socket to comm
# autodl-container-fa4746a7fc-c6053b3e:6804:7269 [1] NCCL INFO Using network Socket
# autodl-container-fa4746a7fc-c6053b3e:6804:7269 [1] NCCL INFO ncclCommInitRankConfig comm 0xe180b20 rank 1 nranks 2 cudaDev 1 nvmlDev 1 busId b8000 commId 0x69c7c100a5d1cc - Init START
# autodl-container-fa4746a7fc-c6053b3e:6803:6803 [0] NCCL INFO Comm config Blocking set to 1
# autodl-container-fa4746a7fc-c6053b3e:6803:7270 [0] NCCL INFO NET/Plugin: Could not find: libnccl-net.so.
# autodl-container-fa4746a7fc-c6053b3e:6803:7270 [0] NCCL INFO NCCL_IB_DISABLE set by environment to 1.
# autodl-container-fa4746a7fc-c6053b3e:6803:7270 [0] NCCL INFO NET/Socket : Using [0]eth0:172.17.0.4<0>
# autodl-container-fa4746a7fc-c6053b3e:6803:7270 [0] NCCL INFO Initialized NET plugin Socket
# autodl-container-fa4746a7fc-c6053b3e:6803:7270 [0] NCCL INFO Assigned NET plugin Socket to comm
# autodl-container-fa4746a7fc-c6053b3e:6803:7270 [0] NCCL INFO Using network Socket
# autodl-container-fa4746a7fc-c6053b3e:6803:7270 [0] NCCL INFO ncclCommInitRankConfig comm 0xc96dd40 rank 0 nranks 2 cudaDev 0 nvmlDev 0 busId 38000 commId 0x69c7c100a5d1cc - Init START
# autodl-container-fa4746a7fc-c6053b3e:6803:7270 [0] NCCL INFO RAS client listening socket at ::1<28028>
# autodl-container-fa4746a7fc-c6053b3e:6804:7269 [1] NCCL INFO RAS client listening socket at ::1<28028>
# autodl-container-fa4746a7fc-c6053b3e:6803:7270 [0] NCCL INFO Bootstrap timings total 0.000665 (create 0.000022, send 0.000077, recv 0.000245, ring 0.000026, delay 0.000000)
# autodl-container-fa4746a7fc-c6053b3e:6804:7269 [1] NCCL INFO Bootstrap timings total 0.234759 (create 0.000022, send 0.000079, recv 0.216063, ring 0.000015, delay 0.000000)
# autodl-container-fa4746a7fc-c6053b3e:6804:7269 [1] NCCL INFO NCCL_P2P_DISABLE set by environment to 1
# autodl-container-fa4746a7fc-c6053b3e:6803:7270 [0] NCCL INFO NCCL_P2P_DISABLE set by environment to 1
# autodl-container-fa4746a7fc-c6053b3e:6803:7270 [0] NCCL INFO Setting affinity for GPU 0 to 0-51,104-155
# autodl-container-fa4746a7fc-c6053b3e:6804:7269 [1] NCCL INFO Setting affinity for GPU 1 to 52-103,156-207
# autodl-container-fa4746a7fc-c6053b3e:6804:7269 [1] NCCL INFO comm 0xe180b20 rank 1 nRanks 2 nNodes 1 localRanks 2 localRank 1 MNNVL 0
# autodl-container-fa4746a7fc-c6053b3e:6804:7269 [1] NCCL INFO Trees [0] -1/-1/-1->1->0 [1] -1/-1/-1->1->0
# autodl-container-fa4746a7fc-c6053b3e:6804:7269 [1] NCCL INFO P2P Chunksize set to 131072
# autodl-container-fa4746a7fc-c6053b3e:6803:7270 [0] NCCL INFO comm 0xc96dd40 rank 0 nRanks 2 nNodes 1 localRanks 2 localRank 0 MNNVL 0
# autodl-container-fa4746a7fc-c6053b3e:6803:7270 [0] NCCL INFO Channel 00/02 : 0 1
# autodl-container-fa4746a7fc-c6053b3e:6803:7270 [0] NCCL INFO Channel 01/02 : 0 1
# autodl-container-fa4746a7fc-c6053b3e:6803:7270 [0] NCCL INFO Trees [0] 1/-1/-1->0->-1 [1] 1/-1/-1->0->-1
# autodl-container-fa4746a7fc-c6053b3e:6803:7270 [0] NCCL INFO P2P Chunksize set to 131072
# autodl-container-fa4746a7fc-c6053b3e:6804:7269 [1] NCCL INFO PROFILER/Plugin: Could not find: libnccl-profiler.so.
# autodl-container-fa4746a7fc-c6053b3e:6804:7273 [1] NCCL INFO [Proxy Service] Device 1 CPU core 196
# autodl-container-fa4746a7fc-c6053b3e:6804:7274 [1] NCCL INFO [Proxy Service UDS] Device 1 CPU core 95
# autodl-container-fa4746a7fc-c6053b3e:6803:7270 [0] NCCL INFO PROFILER/Plugin: Could not find: libnccl-profiler.so.
# autodl-container-fa4746a7fc-c6053b3e:6803:7270 [0] NCCL INFO Check P2P Type isAllDirectP2p 0 directMode 0
# autodl-container-fa4746a7fc-c6053b3e:6803:7275 [0] NCCL INFO [Proxy Service] Device 0 CPU core 13
# autodl-container-fa4746a7fc-c6053b3e:6803:7276 [0] NCCL INFO [Proxy Service UDS] Device 0 CPU core 8
# autodl-container-fa4746a7fc-c6053b3e:6804:7269 [1] NCCL INFO threadThresholds 8/8/64 | 16/8/64 | 512 | 512
# autodl-container-fa4746a7fc-c6053b3e:6804:7269 [1] NCCL INFO 2 coll channels, 2 collnet channels, 0 nvls channels, 2 p2p channels, 2 p2p channels per peer
# autodl-container-fa4746a7fc-c6053b3e:6803:7270 [0] NCCL INFO threadThresholds 8/8/64 | 16/8/64 | 512 | 512
# autodl-container-fa4746a7fc-c6053b3e:6803:7270 [0] NCCL INFO 2 coll channels, 2 collnet channels, 0 nvls channels, 2 p2p channels, 2 p2p channels per peer
# autodl-container-fa4746a7fc-c6053b3e:6803:7270 [0] NCCL INFO CC Off, workFifoBytes 1048576
# autodl-container-fa4746a7fc-c6053b3e:6804:7269 [1] NCCL INFO TUNER/Plugin: Could not find: libnccl-tuner.so. Using internal tuner plugin.
# autodl-container-fa4746a7fc-c6053b3e:6804:7269 [1] NCCL INFO ncclCommInitRankConfig comm 0xe180b20 rank 1 nranks 2 cudaDev 1 nvmlDev 1 busId b8000 commId 0x69c7c100a5d1cc - Init COMPLETE
# autodl-container-fa4746a7fc-c6053b3e:6804:7269 [1] NCCL INFO Init timings - ncclCommInitRankConfig: rank 1 nranks 2 total 0.47 (kernels 0.22, alloc 0.00, bootstrap 0.23, allgathers 0.00, topo 0.01, graphs 0.00, connections 0.00, rest 0.00)
# autodl-container-fa4746a7fc-c6053b3e:6803:7270 [0] NCCL INFO TUNER/Plugin: Could not find: libnccl-tuner.so. Using internal tuner plugin.
# autodl-container-fa4746a7fc-c6053b3e:6803:7270 [0] NCCL INFO ncclCommInitRankConfig comm 0xc96dd40 rank 0 nranks 2 cudaDev 0 nvmlDev 0 busId 38000 commId 0x69c7c100a5d1cc - Init COMPLETE
# autodl-container-fa4746a7fc-c6053b3e:6803:7270 [0] NCCL INFO Init timings - ncclCommInitRankConfig: rank 0 nranks 2 total 0.24 (kernels 0.21, alloc 0.00, bootstrap 0.00, allgathers 0.02, topo 0.00, graphs 0.00, connections 0.00, rest 0.00)
# 2025-10-23 11:51:09,112 - INFO - 加载 WikiText 数据集...
# 2025-10-23 11:51:09,124 - INFO - DeepSpeed 环境:rank=0, world_size=2
# 2025-10-23 11:51:09,124 - INFO - 使用设备: cuda:0
# 2025-10-23 11:51:09,124 - INFO - 加载 WikiText 数据集...
# 2025-10-23 11:51:26,535 - INFO - 加载 23767 条非空文本
# 2025-10-23 11:51:27,012 - INFO - 编码文本...
# 2025-10-23 11:51:42,063 - INFO - 加载 23767 条非空文本
# 2025-10-23 11:51:42,528 - INFO - BERT 分词器词汇大小: 30522
# 2025-10-23 11:51:42,528 - INFO - 编码文本...
# 2025-10-23 11:51:44,008 - INFO - 总 token 数=2301731, 块数=8991
# autodl-container-fa4746a7fc-c6053b3e:6804:6804 [1] NCCL INFO Comm config Blocking set to 1
# 2025-10-23 11:51:59,479 - INFO - 总 token 数=2301731, 块数=8991
# autodl-container-fa4746a7fc-c6053b3e:6803:6803 [0] NCCL INFO Comm config Blocking set to 1
# autodl-container-fa4746a7fc-c6053b3e:6803:7402 [0] NCCL INFO Assigned NET plugin Socket to comm
# autodl-container-fa4746a7fc-c6053b3e:6804:7344 [1] NCCL INFO Assigned NET plugin Socket to comm
# autodl-container-fa4746a7fc-c6053b3e:6803:7402 [0] NCCL INFO Using network Socket
# autodl-container-fa4746a7fc-c6053b3e:6804:7344 [1] NCCL INFO Using network Socket
# autodl-container-fa4746a7fc-c6053b3e:6804:7344 [1] NCCL INFO ncclCommSplit comm 0x11077430 rank 1 nranks 2 cudaDev 1 nvmlDev 1 busId b8000 parent 0xe180b20 splitCount 1 color 2130503744 key 1- Init START
# autodl-container-fa4746a7fc-c6053b3e:6803:7402 [0] NCCL INFO ncclCommSplit comm 0xfdb1e40 rank 0 nranks 2 cudaDev 0 nvmlDev 0 busId 38000 parent 0xc96dd40 splitCount 1 color 2130503744 key 0- Init START
# autodl-container-fa4746a7fc-c6053b3e:6804:7344 [1] NCCL INFO Setting affinity for GPU 1 to 52-103,156-207
# autodl-container-fa4746a7fc-c6053b3e:6803:7402 [0] NCCL INFO Setting affinity for GPU 0 to 0-51,104-155
# autodl-container-fa4746a7fc-c6053b3e:6804:7344 [1] NCCL INFO comm 0x11077430 rank 1 nRanks 2 nNodes 1 localRanks 2 localRank 1 MNNVL 0
# autodl-container-fa4746a7fc-c6053b3e:6804:7344 [1] NCCL INFO Trees [0] -1/-1/-1->1->0 [1] -1/-1/-1->1->0
# autodl-container-fa4746a7fc-c6053b3e:6804:7344 [1] NCCL INFO P2P Chunksize set to 131072
# autodl-container-fa4746a7fc-c6053b3e:6803:7402 [0] NCCL INFO comm 0xfdb1e40 rank 0 nRanks 2 nNodes 1 localRanks 2 localRank 0 MNNVL 0
# autodl-container-fa4746a7fc-c6053b3e:6803:7402 [0] NCCL INFO Channel 00/02 : 0 1
# autodl-container-fa4746a7fc-c6053b3e:6803:7402 [0] NCCL INFO Channel 01/02 : 0 1
# autodl-container-fa4746a7fc-c6053b3e:6803:7402 [0] NCCL INFO Trees [0] 1/-1/-1->0->-1 [1] 1/-1/-1->0->-1
# autodl-container-fa4746a7fc-c6053b3e:6803:7402 [0] NCCL INFO P2P Chunksize set to 131072
# autodl-container-fa4746a7fc-c6053b3e:6803:7402 [0] NCCL INFO Check P2P Type isAllDirectP2p 0 directMode 0
# autodl-container-fa4746a7fc-c6053b3e:6804:7403 [1] NCCL INFO [Proxy Service] Device 1 CPU core 167
# autodl-container-fa4746a7fc-c6053b3e:6804:7404 [1] NCCL INFO [Proxy Service UDS] Device 1 CPU core 67
# autodl-container-fa4746a7fc-c6053b3e:6803:7405 [0] NCCL INFO [Proxy Service] Device 0 CPU core 106
# autodl-container-fa4746a7fc-c6053b3e:6803:7406 [0] NCCL INFO [Proxy Service UDS] Device 0 CPU core 107
# autodl-container-fa4746a7fc-c6053b3e:6804:7344 [1] NCCL INFO threadThresholds 8/8/64 | 16/8/64 | 512 | 512
# autodl-container-fa4746a7fc-c6053b3e:6804:7344 [1] NCCL INFO 2 coll channels, 2 collnet channels, 0 nvls channels, 2 p2p channels, 2 p2p channels per peer
# autodl-container-fa4746a7fc-c6053b3e:6803:7402 [0] NCCL INFO threadThresholds 8/8/64 | 16/8/64 | 512 | 512
# autodl-container-fa4746a7fc-c6053b3e:6803:7402 [0] NCCL INFO 2 coll channels, 2 collnet channels, 0 nvls channels, 2 p2p channels, 2 p2p channels per peer
# autodl-container-fa4746a7fc-c6053b3e:6803:7402 [0] NCCL INFO CC Off, workFifoBytes 1048576
# autodl-container-fa4746a7fc-c6053b3e:6804:7344 [1] NCCL INFO ncclCommSplit comm 0x11077430 rank 1 nranks 2 cudaDev 1 nvmlDev 1 busId b8000 parent 0xe180b20 splitCount 1 color 2130503744 key 1 - Init COMPLETE
# autodl-container-fa4746a7fc-c6053b3e:6804:7344 [1] NCCL INFO Init timings - ncclCommSplit: rank 1 nranks 2 total 15.48 (kernels 0.00, alloc 0.00, bootstrap 0.00, allgathers 0.00, topo 0.01, graphs 0.00, connections 0.00, rest 15.46)
# autodl-container-fa4746a7fc-c6053b3e:6803:7402 [0] NCCL INFO ncclCommSplit comm 0xfdb1e40 rank 0 nranks 2 cudaDev 0 nvmlDev 0 busId 38000 parent 0xc96dd40 splitCount 1 color 2130503744 key 0 - Init COMPLETE
# autodl-container-fa4746a7fc-c6053b3e:6803:7402 [0] NCCL INFO Init timings - ncclCommSplit: rank 0 nranks 2 total 0.02 (kernels 0.00, alloc 0.00, bootstrap 0.00, allgathers 0.00, topo 0.01, graphs 0.00, connections 0.00, rest 0.00)
# autodl-container-fa4746a7fc-c6053b3e:6803:7407 [0] NCCL INFO Channel 00 : 0[0] -> 1[1] via SHM/direct/direct
# autodl-container-fa4746a7fc-c6053b3e:6804:7408 [1] NCCL INFO Channel 00 : 1[1] -> 0[0] via SHM/direct/direct
# autodl-container-fa4746a7fc-c6053b3e:6803:7407 [0] NCCL INFO Channel 01 : 0[0] -> 1[1] via SHM/direct/direct
# autodl-container-fa4746a7fc-c6053b3e:6804:7408 [1] NCCL INFO Channel 01 : 1[1] -> 0[0] via SHM/direct/direct
# autodl-container-fa4746a7fc-c6053b3e:6803:7407 [0] NCCL INFO Connected all rings, use ring PXN 0 GDR 1
# autodl-container-fa4746a7fc-c6053b3e:6804:7408 [1] NCCL INFO Connected all rings, use ring PXN 0 GDR 1
# [2025-10-23 11:52:00,416] [WARNING] [lr_schedules.py:686:get_lr] Attempting to get learning rate from scheduler before it has started
# [2025-10-23 11:52:00,801] [WARNING] [lr_schedules.py:686:get_lr] Attempting to get learning rate from scheduler before it has started
# 2025-10-23 11:52:03,643 - INFO - Epoch 1/3, Batch 50, Loss=7.9805
# 2025-10-23 11:52:06,020 - INFO - Epoch 1/3, Batch 100, Loss=7.2539
# 2025-10-23 11:52:08,425 - INFO - Epoch 1/3, Batch 150, Loss=7.3320
# 2025-10-23 11:52:10,820 - INFO - Epoch 1/3, Batch 200, Loss=7.2773
# 2025-10-23 11:52:13,216 - INFO - Epoch 1/3, Batch 250, Loss=7.2695
# 2025-10-23 11:52:14,726 - INFO - Epoch 1/3, Avg Loss=7.6175
# autodl-container-fa4746a7fc-c6053b3e:6803:7449 [0] NCCL INFO Channel 00 : 0[0] -> 1[1] via SHM/direct/direct
# autodl-container-fa4746a7fc-c6053b3e:6804:7450 [1] NCCL INFO Channel 00 : 1[1] -> 0[0] via SHM/direct/direct
# autodl-container-fa4746a7fc-c6053b3e:6803:7449 [0] NCCL INFO Channel 01 : 0[0] -> 1[1] via SHM/direct/direct
# autodl-container-fa4746a7fc-c6053b3e:6804:7450 [1] NCCL INFO Channel 01 : 1[1] -> 0[0] via SHM/direct/direct
# autodl-container-fa4746a7fc-c6053b3e:6803:7449 [0] NCCL INFO Connected all rings, use ring PXN 0 GDR 1
# autodl-container-fa4746a7fc-c6053b3e:6804:7450 [1] NCCL INFO Connected all rings, use ring PXN 0 GDR 1
# 2025-10-23 11:52:15,470 - INFO - 已保存检查点: checkpoints/epoch1
# 2025-10-23 11:52:17,851 - INFO - Epoch 2/3, Batch 50, Loss=7.2305
# 2025-10-23 11:52:20,298 - INFO - Epoch 2/3, Batch 100, Loss=7.2500
# 2025-10-23 11:52:22,745 - INFO - Epoch 2/3, Batch 150, Loss=7.2578
# 2025-10-23 11:52:25,190 - INFO - Epoch 2/3, Batch 200, Loss=7.2500
# 2025-10-23 11:52:27,624 - INFO - Epoch 2/3, Batch 250, Loss=7.2305
# 2025-10-23 11:52:29,221 - INFO - Epoch 2/3, Avg Loss=7.2386
# 2025-10-23 11:52:29,905 - INFO - 已保存检查点: checkpoints/epoch2
# 2025-10-23 11:52:32,344 - INFO - Epoch 3/3, Batch 50, Loss=7.3242
# 2025-10-23 11:52:34,828 - INFO - Epoch 3/3, Batch 100, Loss=7.2812
# 2025-10-23 11:52:37,296 - INFO - Epoch 3/3, Batch 150, Loss=7.1641
# 2025-10-23 11:52:39,688 - INFO - Epoch 3/3, Batch 200, Loss=7.2383
# 2025-10-23 11:52:42,041 - INFO - Epoch 3/3, Batch 250, Loss=7.1172
# 2025-10-23 11:52:43,625 - INFO - Epoch 3/3, Avg Loss=7.2300
# 2025-10-23 11:52:44,326 - INFO - 已保存检查点: checkpoints/epoch3
# 2025-10-23 11:52:44,484 - INFO - 最终模型已保存至 models/final_model.pth
# autodl-container-fa4746a7fc-c6053b3e:6804:6804 [1] NCCL INFO comm 0x11077430 rank 1 nranks 2 cudaDev 1 busId b8000 - Destroy COMPLETE
# autodl-container-fa4746a7fc-c6053b3e:6803:6803 [0] NCCL INFO comm 0xfdb1e40 rank 0 nranks 2 cudaDev 0 busId 38000 - Destroy COMPLETE
# autodl-container-fa4746a7fc-c6053b3e:6803:6803 [0] NCCL INFO comm 0xc96dd40 rank 0 nranks 2 cudaDev 0 busId 38000 - Destroy COMPLETE
# 2025-10-23 11:52:45,300 - INFO - 已成功销毁 torch.distributed 进程组
# autodl-container-fa4746a7fc-c6053b3e:6804:6804 [1] NCCL INFO comm 0xe180b20 rank 1 nranks 2 cudaDev 1 busId b8000 - Destroy COMPLETE
# [2025-10-23 11:52:47,749] [INFO] [launch.py:367:main] Process 6804 exits successfully.
# [2025-10-23 11:52:47,749] [INFO] [launch.py:367:main] Process 6803 exits successfully.
# 验证
ll
# total 16K
# -rw-r--r-- 1 root root 929 Oct 23 11:12 ds_config.json
# -rw-r--r-- 1 root root 11K Oct 23 11:12 DeepSpeed-GPTLike-ZeRO-2.py
# drwxr-xr-x 5 root root 109 Oct 23 11:52 checkpoints
# drwxr-xr-x 2 root root 37 Oct 23 11:52 models
tree ./checkpoints/ ./models/
# ./checkpoints/
# ├── epoch1
# │ ├── mp_rank_00_model_states.pt
# │ ├── zero_pp_rank_0_mp_rank_00_optim_states.pt
# │ └── zero_pp_rank_1_mp_rank_00_optim_states.pt
# ├── epoch2
# │ ├── mp_rank_00_model_states.pt
# │ ├── zero_pp_rank_0_mp_rank_00_optim_states.pt
# │ └── zero_pp_rank_1_mp_rank_00_optim_states.pt
# ├── epoch3
# │ ├── mp_rank_00_model_states.pt
# │ ├── zero_pp_rank_0_mp_rank_00_optim_states.pt
# │ └── zero_pp_rank_1_mp_rank_00_optim_states.pt
# ├── latest
# └── zero_to_fp32.py
# ./models/
# └── final_model.pth
# 0 directories, 1 file
DeepSpeed-Stage-ZeRO-3
bash
cd /data/llm-in-practise/LLM_Distributed_Trainning/DeepSpeed/DeepSpeed-GPTLike-ZeRO-3
deepspeed --num_gpus 2 DeepSpeed-GPTLike-ZeRO-3.py
# [2025-10-23 12:04:15,741] [WARNING] [runner.py:232:fetch_hostfile] Unable to find hostfile, will proceed with training with local resources only.
# Detected VISIBLE_DEVICES=0,1 but ignoring it because one or several of --include/--exclude/--num_gpus/--num_nodes cl args were used. If you want to use CUDA_VISIBLE_DEVICES don't pass any of these arguments to deepspeed.
# [2025-10-23 12:04:15,741] [INFO] [runner.py:630:main] cmd = /root/miniconda3/bin/python -u -m deepspeed.launcher.launch --world_info=eyJsb2NhbGhvc3QiOiBbMCwgMV19 --master_addr=127.0.0.1 --master_port=29500 --enable_each_rank_log=None --log_level=info DeepSpeed-GPTLike-ZeRO-3.py
# [2025-10-23 12:04:19,968] [INFO] [launch.py:155:main] 0 NCCL_P2P_DISABLE=1
# [2025-10-23 12:04:19,968] [INFO] [launch.py:155:main] 0 NCCL_DEBUG=INFO
# [2025-10-23 12:04:19,968] [INFO] [launch.py:155:main] 0 TORCH_NCCL_BLOCKING_WAIT=1
# [2025-10-23 12:04:19,968] [INFO] [launch.py:155:main] 0 NCCL_IB_DISABLE=1
# [2025-10-23 12:04:19,968] [INFO] [launch.py:162:main] WORLD INFO DICT: {'localhost': [0, 1]}
# [2025-10-23 12:04:19,968] [INFO] [launch.py:168:main] nnodes=1, num_local_procs=2, node_rank=0
# [2025-10-23 12:04:19,968] [INFO] [launch.py:179:main] global_rank_mapping=defaultdict(<class 'list'>, {'localhost': [0, 1]})
# [2025-10-23 12:04:19,968] [INFO] [launch.py:180:main] dist_world_size=2
# [2025-10-23 12:04:19,968] [INFO] [launch.py:184:main] Setting CUDA_VISIBLE_DEVICES=0,1
# [2025-10-23 12:04:19,969] [INFO] [launch.py:272:main] process 9169 spawned with command: ['/root/miniconda3/bin/python', '-u', 'DeepSpeed-GPTLike-ZeRO-3.py', '--local_rank=0']
# [2025-10-23 12:04:19,969] [INFO] [launch.py:272:main] process 9170 spawned with command: ['/root/miniconda3/bin/python', '-u', 'DeepSpeed-GPTLike-ZeRO-3.py', '--local_rank=1']
# 2025-10-23 12:04:24,778 - INFO - 使用 ds_config.json 的 train_micro_batch_size_per_gpu=16 作为 DataLoader 批次大小
# 2025-10-23 12:04:24,918 - INFO - 使用 ds_config.json 的 train_micro_batch_size_per_gpu=16 作为 DataLoader 批次大小
# autodl-container-fa4746a7fc-c6053b3e:9169:9169 [0] NCCL INFO Bootstrap: Using eth0:172.17.0.4<0>
# autodl-container-fa4746a7fc-c6053b3e:9169:9169 [0] NCCL INFO cudaDriverVersion 13000
# autodl-container-fa4746a7fc-c6053b3e:9169:9169 [0] NCCL INFO NCCL version 2.27.3+cuda12.9
# autodl-container-fa4746a7fc-c6053b3e:9170:9170 [1] NCCL INFO cudaDriverVersion 13000
# autodl-container-fa4746a7fc-c6053b3e:9170:9170 [1] NCCL INFO Bootstrap: Using eth0:172.17.0.4<0>
# autodl-container-fa4746a7fc-c6053b3e:9170:9170 [1] NCCL INFO NCCL version 2.27.3+cuda12.9
# autodl-container-fa4746a7fc-c6053b3e:9170:9170 [1] NCCL INFO Comm config Blocking set to 1
# autodl-container-fa4746a7fc-c6053b3e:9169:9169 [0] NCCL INFO Comm config Blocking set to 1
# autodl-container-fa4746a7fc-c6053b3e:9170:9500 [1] NCCL INFO NET/Plugin: Could not find: libnccl-net.so.
# autodl-container-fa4746a7fc-c6053b3e:9170:9500 [1] NCCL INFO NCCL_IB_DISABLE set by environment to 1.
# autodl-container-fa4746a7fc-c6053b3e:9170:9500 [1] NCCL INFO NET/Socket : Using [0]eth0:172.17.0.4<0>
# autodl-container-fa4746a7fc-c6053b3e:9170:9500 [1] NCCL INFO Initialized NET plugin Socket
# autodl-container-fa4746a7fc-c6053b3e:9170:9500 [1] NCCL INFO Assigned NET plugin Socket to comm
# autodl-container-fa4746a7fc-c6053b3e:9170:9500 [1] NCCL INFO Using network Socket
# autodl-container-fa4746a7fc-c6053b3e:9170:9500 [1] NCCL INFO ncclCommInitRankConfig comm 0xdec0520 rank 1 nranks 2 cudaDev 1 nvmlDev 1 busId b8000 commId 0xa71c620706e73a6e - Init START
# autodl-container-fa4746a7fc-c6053b3e:9169:9501 [0] NCCL INFO NET/Plugin: Could not find: libnccl-net.so.
# autodl-container-fa4746a7fc-c6053b3e:9169:9501 [0] NCCL INFO NCCL_IB_DISABLE set by environment to 1.
# autodl-container-fa4746a7fc-c6053b3e:9169:9501 [0] NCCL INFO NET/Socket : Using [0]eth0:172.17.0.4<0>
# autodl-container-fa4746a7fc-c6053b3e:9169:9501 [0] NCCL INFO Initialized NET plugin Socket
# autodl-container-fa4746a7fc-c6053b3e:9169:9501 [0] NCCL INFO Assigned NET plugin Socket to comm
# autodl-container-fa4746a7fc-c6053b3e:9169:9501 [0] NCCL INFO Using network Socket
# autodl-container-fa4746a7fc-c6053b3e:9169:9501 [0] NCCL INFO ncclCommInitRankConfig comm 0xcd8ab60 rank 0 nranks 2 cudaDev 0 nvmlDev 0 busId 38000 commId 0xa71c620706e73a6e - Init START
# autodl-container-fa4746a7fc-c6053b3e:9169:9501 [0] NCCL INFO RAS client listening socket at ::1<28028>
# autodl-container-fa4746a7fc-c6053b3e:9170:9500 [1] NCCL INFO RAS client listening socket at ::1<28028>
# autodl-container-fa4746a7fc-c6053b3e:9169:9501 [0] NCCL INFO Bootstrap timings total 0.000602 (create 0.000017, send 0.000069, recv 0.000186, ring 0.000026, delay 0.000000)
# autodl-container-fa4746a7fc-c6053b3e:9170:9500 [1] NCCL INFO Bootstrap timings total 0.044899 (create 0.000026, send 0.000091, recv 0.023563, ring 0.000013, delay 0.000000)
# autodl-container-fa4746a7fc-c6053b3e:9169:9501 [0] NCCL INFO NCCL_P2P_DISABLE set by environment to 1
# autodl-container-fa4746a7fc-c6053b3e:9170:9500 [1] NCCL INFO NCCL_P2P_DISABLE set by environment to 1
# autodl-container-fa4746a7fc-c6053b3e:9170:9500 [1] NCCL INFO Setting affinity for GPU 1 to 52-103,156-207
# autodl-container-fa4746a7fc-c6053b3e:9169:9501 [0] NCCL INFO Setting affinity for GPU 0 to 0-51,104-155
# autodl-container-fa4746a7fc-c6053b3e:9170:9500 [1] NCCL INFO comm 0xdec0520 rank 1 nRanks 2 nNodes 1 localRanks 2 localRank 1 MNNVL 0
# autodl-container-fa4746a7fc-c6053b3e:9170:9500 [1] NCCL INFO Trees [0] -1/-1/-1->1->0 [1] -1/-1/-1->1->0
# autodl-container-fa4746a7fc-c6053b3e:9170:9500 [1] NCCL INFO P2P Chunksize set to 131072
# autodl-container-fa4746a7fc-c6053b3e:9169:9501 [0] NCCL INFO comm 0xcd8ab60 rank 0 nRanks 2 nNodes 1 localRanks 2 localRank 0 MNNVL 0
# autodl-container-fa4746a7fc-c6053b3e:9169:9501 [0] NCCL INFO Channel 00/02 : 0 1
# autodl-container-fa4746a7fc-c6053b3e:9169:9501 [0] NCCL INFO Channel 01/02 : 0 1
# autodl-container-fa4746a7fc-c6053b3e:9169:9501 [0] NCCL INFO Trees [0] 1/-1/-1->0->-1 [1] 1/-1/-1->0->-1
# autodl-container-fa4746a7fc-c6053b3e:9169:9501 [0] NCCL INFO P2P Chunksize set to 131072
# autodl-container-fa4746a7fc-c6053b3e:9170:9500 [1] NCCL INFO PROFILER/Plugin: Could not find: libnccl-profiler.so.
# autodl-container-fa4746a7fc-c6053b3e:9170:9504 [1] NCCL INFO [Proxy Service] Device 1 CPU core 84
# autodl-container-fa4746a7fc-c6053b3e:9170:9505 [1] NCCL INFO [Proxy Service UDS] Device 1 CPU core 189
# autodl-container-fa4746a7fc-c6053b3e:9169:9501 [0] NCCL INFO PROFILER/Plugin: Could not find: libnccl-profiler.so.
# autodl-container-fa4746a7fc-c6053b3e:9169:9501 [0] NCCL INFO Check P2P Type isAllDirectP2p 0 directMode 0
# autodl-container-fa4746a7fc-c6053b3e:9169:9507 [0] NCCL INFO [Proxy Service UDS] Device 0 CPU core 2
# autodl-container-fa4746a7fc-c6053b3e:9170:9500 [1] NCCL INFO threadThresholds 8/8/64 | 16/8/64 | 512 | 512
# autodl-container-fa4746a7fc-c6053b3e:9170:9500 [1] NCCL INFO 2 coll channels, 2 collnet channels, 0 nvls channels, 2 p2p channels, 2 p2p channels per peer
# autodl-container-fa4746a7fc-c6053b3e:9169:9506 [0] NCCL INFO [Proxy Service] Device 0 CPU core 1
# autodl-container-fa4746a7fc-c6053b3e:9169:9501 [0] NCCL INFO threadThresholds 8/8/64 | 16/8/64 | 512 | 512
# autodl-container-fa4746a7fc-c6053b3e:9169:9501 [0] NCCL INFO 2 coll channels, 2 collnet channels, 0 nvls channels, 2 p2p channels, 2 p2p channels per peer
# autodl-container-fa4746a7fc-c6053b3e:9169:9501 [0] NCCL INFO CC Off, workFifoBytes 1048576
# autodl-container-fa4746a7fc-c6053b3e:9170:9500 [1] NCCL INFO TUNER/Plugin: Could not find: libnccl-tuner.so. Using internal tuner plugin.
# autodl-container-fa4746a7fc-c6053b3e:9170:9500 [1] NCCL INFO ncclCommInitRankConfig comm 0xdec0520 rank 1 nranks 2 cudaDev 1 nvmlDev 1 busId b8000 commId 0xa71c620706e73a6e - Init COMPLETE
# autodl-container-fa4746a7fc-c6053b3e:9170:9500 [1] NCCL INFO Init timings - ncclCommInitRankConfig: rank 1 nranks 2 total 0.27 (kernels 0.22, alloc 0.00, bootstrap 0.04, allgathers 0.00, topo 0.00, graphs 0.00, connections 0.00, rest 0.00)
# autodl-container-fa4746a7fc-c6053b3e:9169:9501 [0] NCCL INFO TUNER/Plugin: Could not find: libnccl-tuner.so. Using internal tuner plugin.
# autodl-container-fa4746a7fc-c6053b3e:9169:9501 [0] NCCL INFO ncclCommInitRankConfig comm 0xcd8ab60 rank 0 nranks 2 cudaDev 0 nvmlDev 0 busId 38000 commId 0xa71c620706e73a6e - Init COMPLETE
# autodl-container-fa4746a7fc-c6053b3e:9169:9501 [0] NCCL INFO Init timings - ncclCommInitRankConfig: rank 0 nranks 2 total 0.25 (kernels 0.22, alloc 0.00, bootstrap 0.00, allgathers 0.02, topo 0.00, graphs 0.00, connections 0.00, rest 0.00)
# 2025-10-23 12:04:25,941 - INFO - 加载 WikiText-2 数据集...
# 2025-10-23 12:04:25,957 - INFO - 使用设备: cuda, rank=0/2
# 2025-10-23 12:04:25,957 - INFO - 加载 WikiText-2 数据集...
# 2025-10-23 12:04:55,367 - INFO - 加载 23767 条非空文本
# 2025-10-23 12:04:55,857 - INFO - 编码文本...
# 2025-10-23 12:04:56,909 - INFO - 加载 23767 条非空文本
# 2025-10-23 12:04:57,389 - INFO - BERT 分词器词汇大小: 30522
# 2025-10-23 12:04:57,389 - INFO - 编码文本...
# 2025-10-23 12:05:12,728 - INFO - 总 token 数=2301731, 块数=8991
# autodl-container-fa4746a7fc-c6053b3e:9170:9170 [1] NCCL INFO Comm config Blocking set to 1
# 2025-10-23 12:05:14,528 - INFO - 总 token 数=2301731, 块数=8991
# autodl-container-fa4746a7fc-c6053b3e:9169:9169 [0] NCCL INFO Comm config Blocking set to 1
# autodl-container-fa4746a7fc-c6053b3e:9170:9717 [1] NCCL INFO Assigned NET plugin Socket to comm
# autodl-container-fa4746a7fc-c6053b3e:9170:9717 [1] NCCL INFO Using network Socket
# autodl-container-fa4746a7fc-c6053b3e:9169:9769 [0] NCCL INFO Assigned NET plugin Socket to comm
# autodl-container-fa4746a7fc-c6053b3e:9169:9769 [0] NCCL INFO Using network Socket
# autodl-container-fa4746a7fc-c6053b3e:9170:9717 [1] NCCL INFO ncclCommSplit comm 0xfcd5de0 rank 1 nranks 2 cudaDev 1 nvmlDev 1 busId b8000 parent 0xdec0520 splitCount 1 color 2130503744 key 1- Init START
# autodl-container-fa4746a7fc-c6053b3e:9169:9769 [0] NCCL INFO ncclCommSplit comm 0x101c13a0 rank 0 nranks 2 cudaDev 0 nvmlDev 0 busId 38000 parent 0xcd8ab60 splitCount 1 color 2130503744 key 0- Init START
# autodl-container-fa4746a7fc-c6053b3e:9170:9717 [1] NCCL INFO Setting affinity for GPU 1 to 52-103,156-207
# autodl-container-fa4746a7fc-c6053b3e:9169:9769 [0] NCCL INFO Setting affinity for GPU 0 to 0-51,104-155
# autodl-container-fa4746a7fc-c6053b3e:9170:9717 [1] NCCL INFO comm 0xfcd5de0 rank 1 nRanks 2 nNodes 1 localRanks 2 localRank 1 MNNVL 0
# autodl-container-fa4746a7fc-c6053b3e:9170:9717 [1] NCCL INFO Trees [0] -1/-1/-1->1->0 [1] -1/-1/-1->1->0
# autodl-container-fa4746a7fc-c6053b3e:9170:9717 [1] NCCL INFO P2P Chunksize set to 131072
# autodl-container-fa4746a7fc-c6053b3e:9169:9769 [0] NCCL INFO comm 0x101c13a0 rank 0 nRanks 2 nNodes 1 localRanks 2 localRank 0 MNNVL 0
# autodl-container-fa4746a7fc-c6053b3e:9169:9769 [0] NCCL INFO Channel 00/02 : 0 1
# autodl-container-fa4746a7fc-c6053b3e:9169:9769 [0] NCCL INFO Channel 01/02 : 0 1
# autodl-container-fa4746a7fc-c6053b3e:9169:9769 [0] NCCL INFO Trees [0] 1/-1/-1->0->-1 [1] 1/-1/-1->0->-1
# autodl-container-fa4746a7fc-c6053b3e:9169:9769 [0] NCCL INFO P2P Chunksize set to 131072
# autodl-container-fa4746a7fc-c6053b3e:9169:9769 [0] NCCL INFO Check P2P Type isAllDirectP2p 0 directMode 0
# autodl-container-fa4746a7fc-c6053b3e:9170:9770 [1] NCCL INFO [Proxy Service] Device 1 CPU core 72
# autodl-container-fa4746a7fc-c6053b3e:9170:9771 [1] NCCL INFO [Proxy Service UDS] Device 1 CPU core 73
# autodl-container-fa4746a7fc-c6053b3e:9169:9772 [0] NCCL INFO [Proxy Service] Device 0 CPU core 1
# autodl-container-fa4746a7fc-c6053b3e:9169:9773 [0] NCCL INFO [Proxy Service UDS] Device 0 CPU core 107
# autodl-container-fa4746a7fc-c6053b3e:9170:9717 [1] NCCL INFO threadThresholds 8/8/64 | 16/8/64 | 512 | 512
# autodl-container-fa4746a7fc-c6053b3e:9170:9717 [1] NCCL INFO 2 coll channels, 2 collnet channels, 0 nvls channels, 2 p2p channels, 2 p2p channels per peer
# autodl-container-fa4746a7fc-c6053b3e:9169:9769 [0] NCCL INFO threadThresholds 8/8/64 | 16/8/64 | 512 | 512
# autodl-container-fa4746a7fc-c6053b3e:9169:9769 [0] NCCL INFO 2 coll channels, 2 collnet channels, 0 nvls channels, 2 p2p channels, 2 p2p channels per peer
# autodl-container-fa4746a7fc-c6053b3e:9169:9769 [0] NCCL INFO CC Off, workFifoBytes 1048576
# autodl-container-fa4746a7fc-c6053b3e:9170:9717 [1] NCCL INFO ncclCommSplit comm 0xfcd5de0 rank 1 nranks 2 cudaDev 1 nvmlDev 1 busId b8000 parent 0xdec0520 splitCount 1 color 2130503744 key 1 - Init COMPLETE
# autodl-container-fa4746a7fc-c6053b3e:9170:9717 [1] NCCL INFO Init timings - ncclCommSplit: rank 1 nranks 2 total 1.68 (kernels 0.00, alloc 0.00, bootstrap 0.00, allgathers 0.00, topo 0.01, graphs 0.00, connections 0.00, rest 1.67)
# autodl-container-fa4746a7fc-c6053b3e:9169:9769 [0] NCCL INFO ncclCommSplit comm 0x101c13a0 rank 0 nranks 2 cudaDev 0 nvmlDev 0 busId 38000 parent 0xcd8ab60 splitCount 1 color 2130503744 key 0 - Init COMPLETE
# autodl-container-fa4746a7fc-c6053b3e:9169:9769 [0] NCCL INFO Init timings - ncclCommSplit: rank 0 nranks 2 total 0.02 (kernels 0.00, alloc 0.00, bootstrap 0.00, allgathers 0.00, topo 0.01, graphs 0.00, connections 0.00, rest 0.00)
# autodl-container-fa4746a7fc-c6053b3e:9170:9775 [1] NCCL INFO Channel 00 : 1[1] -> 0[0] via SHM/direct/direct
# autodl-container-fa4746a7fc-c6053b3e:9169:9774 [0] NCCL INFO Channel 00 : 0[0] -> 1[1] via SHM/direct/direct
# autodl-container-fa4746a7fc-c6053b3e:9170:9775 [1] NCCL INFO Channel 01 : 1[1] -> 0[0] via SHM/direct/direct
# autodl-container-fa4746a7fc-c6053b3e:9169:9774 [0] NCCL INFO Channel 01 : 0[0] -> 1[1] via SHM/direct/direct
# autodl-container-fa4746a7fc-c6053b3e:9170:9775 [1] NCCL INFO Connected all rings, use ring PXN 0 GDR 1
# autodl-container-fa4746a7fc-c6053b3e:9169:9774 [0] NCCL INFO Connected all rings, use ring PXN 0 GDR 1
# autodl-container-fa4746a7fc-c6053b3e:9170:9776 [1] NCCL INFO Channel 00 : 1[1] -> 0[0] via SHM/direct/direct
# autodl-container-fa4746a7fc-c6053b3e:9170:9776 [1] NCCL INFO Channel 01 : 1[1] -> 0[0] via SHM/direct/direct
# autodl-container-fa4746a7fc-c6053b3e:9169:9777 [0] NCCL INFO Channel 00 : 0[0] -> 1[1] via SHM/direct/direct
# autodl-container-fa4746a7fc-c6053b3e:9169:9777 [0] NCCL INFO Channel 01 : 0[0] -> 1[1] via SHM/direct/direct
# autodl-container-fa4746a7fc-c6053b3e:9170:9776 [1] NCCL INFO Connected all rings, use ring PXN 0 GDR 1
# autodl-container-fa4746a7fc-c6053b3e:9169:9777 [0] NCCL INFO Connected all rings, use ring PXN 0 GDR 1
# Parameter Offload - Persistent parameters statistics: param_count = 50, numel = 61440
# [2025-10-23 12:05:16,991] [WARNING] [lr_schedules.py:686:get_lr] Attempting to get learning rate from scheduler before it has started
# [2025-10-23 12:05:17,103] [WARNING] [lr_schedules.py:686:get_lr] Attempting to get learning rate from scheduler before it has started
# 2025-10-23 12:05:22,602 - INFO - Epoch 1/3, Batch 50, Avg Loss (last 50): 9.1557
# 2025-10-23 12:05:27,497 - INFO - Epoch 1/3, Batch 100, Avg Loss (last 50): 7.3699
# 2025-10-23 12:05:32,369 - INFO - Epoch 1/3, Batch 150, Avg Loss (last 50): 7.2472
# 2025-10-23 12:05:37,248 - INFO - Epoch 1/3, Batch 200, Avg Loss (last 50): 7.2198
# 2025-10-23 12:05:42,125 - INFO - Epoch 1/3, Batch 250, Avg Loss (last 50): 7.2265
# 2025-10-23 12:05:45,208 - INFO - Epoch 1/3, Avg Loss: 7.5998
# 2025-10-23 12:05:45,758 - INFO - 已保存检查点: checkpoints/epoch1
# 2025-10-23 12:05:50,618 - INFO - Epoch 2/3, Batch 50, Avg Loss (last 50): 7.2263
# 2025-10-23 12:05:55,401 - INFO - Epoch 2/3, Batch 100, Avg Loss (last 50): 7.2088
# 2025-10-23 12:06:00,296 - INFO - Epoch 2/3, Batch 150, Avg Loss (last 50): 7.2231
# 2025-10-23 12:06:05,201 - INFO - Epoch 2/3, Batch 200, Avg Loss (last 50): 7.2453
# 2025-10-23 12:06:10,082 - INFO - Epoch 2/3, Batch 250, Avg Loss (last 50): 7.2406
# 2025-10-23 12:06:13,199 - INFO - Epoch 2/3, Avg Loss: 7.2266
# 2025-10-23 12:06:13,742 - INFO - 已保存检查点: checkpoints/epoch2
# 2025-10-23 12:06:18,581 - INFO - Epoch 3/3, Batch 50, Avg Loss (last 50): 7.2171
# 2025-10-23 12:06:23,486 - INFO - Epoch 3/3, Batch 100, Avg Loss (last 50): 7.2245
# 2025-10-23 12:06:28,274 - INFO - Epoch 3/3, Batch 150, Avg Loss (last 50): 7.2177
# 2025-10-23 12:06:33,176 - INFO - Epoch 3/3, Batch 200, Avg Loss (last 50): 7.2316
# 2025-10-23 12:06:38,088 - INFO - Epoch 3/3, Batch 250, Avg Loss (last 50): 7.2280
# 2025-10-23 12:06:41,207 - INFO - Epoch 3/3, Avg Loss: 7.2212
# 2025-10-23 12:06:41,752 - INFO - 已保存检查点: checkpoints/epoch3
# 2025-10-23 12:06:41,756 - INFO - 最终模型已保存至 models/final_model.pth
# autodl-container-fa4746a7fc-c6053b3e:9169:9506 [0] NCCL INFO [Service thread] Connection closed by localRank 1
# autodl-container-fa4746a7fc-c6053b3e:9169:9772 [0] NCCL INFO [Service thread] Connection closed by localRank 1
# [2025-10-23 12:06:43,981] [INFO] [launch.py:367:main] Process 9169 exits successfully.
# [2025-10-23 12:06:43,982] [INFO] [launch.py:367:main] Process 9170 exits successfully.
# Verify: list the per-epoch checkpoint directories and the final model file written by the run above
tree ./checkpoints/ ./models/
# ./checkpoints/
# ├── epoch1
# │ ├── zero_pp_rank_0_mp_rank_00_model_states.pt
# │ ├── zero_pp_rank_0_mp_rank_00_optim_states.pt
# │ ├── zero_pp_rank_1_mp_rank_00_model_states.pt
# │ └── zero_pp_rank_1_mp_rank_00_optim_states.pt
# ├── epoch2
# │ ├── zero_pp_rank_0_mp_rank_00_model_states.pt
# │ ├── zero_pp_rank_0_mp_rank_00_optim_states.pt
# │ ├── zero_pp_rank_1_mp_rank_00_model_states.pt
# │ └── zero_pp_rank_1_mp_rank_00_optim_states.pt
# ├── epoch3
# │ ├── zero_pp_rank_0_mp_rank_00_model_states.pt
# │ ├── zero_pp_rank_0_mp_rank_00_optim_states.pt
# │ ├── zero_pp_rank_1_mp_rank_00_model_states.pt
# │ └── zero_pp_rank_1_mp_rank_00_optim_states.pt
# ├── latest
# └── zero_to_fp32.py
# ./models/
# └── final_model.pth
# 0 directories, 1 file
多机多GPU训练-DeepSpeed
多机多GPU训练主机说明
主机01: 172.17.0.4 GPU: NVIDIA GeForce RTX 5090*2
主机02: 172.17.0.7 GPU: NVIDIA GeForce RTX 5090*2
bash
# Multi-node environment variables and related settings are the same as in the DDP section.
# Install pdsh (the DeepSpeed runner launches the workers through it, see the
# "cmd = pdsh ..." line in the log below).
apt update
apt install pdsh -y
# Set up passwordless SSH between the nodes.
# Generate an RSA key pair with an empty passphrase; skip if one already exists
# so a re-run neither prompts for overwrite nor invalidates a distributed key.
[[ -f ~/.ssh/id_rsa ]] || ssh-keygen -t rsa -b 4096 -f ~/.ssh/id_rsa -N ""
# Copy the public key to both nodes (172.17.0.4 and 172.17.0.7).
ssh-copy-id -i ~/.ssh/id_rsa.pub 172.17.0.4
ssh-copy-id -i ~/.ssh/id_rsa.pub 172.17.0.7
# Write the per-user SSH environment file. The delimiter is quoted so the
# contents are written literally, with no shell expansion.
tee ~/.ssh/environment <<'EOF'
CUDA_DEVICE_ORDER=PCI_BUS_ID
TOKENIZERS_PARALLELISM=false
WANDB_DISABLED=1
NCCL_P2P_DISABLE=1
NCCL_IB_DISABLE=1
TORCH_NCCL_BLOCKING_WAIT=1
NCCL_DEBUG=INFO
PYTORCH_NO_IPV6=1
CUDA_VISIBLE_DEVICES=0,1
NCCL_SOCKET_IFNAME=eth0
HF_ENDPOINT=https://hf-mirror.com
EOF
# Enable PermitUserEnvironment in sshd_config, keeping a timestamped backup
# of the original file next to it.
sed -i".$(date +"%F_%H-%M-%S")" "s/#PermitUserEnvironment no/PermitUserEnvironment yes/" /etc/ssh/sshd_config
# Reload sshd so the new setting takes effect.
systemctl reload sshd
# Check that key-based login works (password auth is disabled for the test)
# and that the CUDA/NCCL/HF variables show up in the remote environment.
ssh -o PasswordAuthentication=no 172.17.0.4 env | grep -E "CUDA|NCCL|HF"
ssh -o PasswordAuthentication=no 172.17.0.7 env | grep -E "CUDA|NCCL|HF"
# DeepSpeed ZeRO-1 multi-node run
# Run the following on host 01 (172.17.0.4):
# Create the hostfile, one "<address> slots=<n>" line per node.
printf '%s\n' \
  '172.17.0.4 slots=2' \
  '172.17.0.7 slots=2' \
  | tee hostfile
# Start training with the DeepSpeed launcher.
deepspeed \
  --hostfile hostfile \
  --num_nodes=2 \
  --num_gpus=2 \
  --node_rank=0 \
  --master_addr=172.17.0.4 \
  --master_port=29500 \
  DeepSpeed-GPTLike-ZeRO-1.py
# [2025-10-23 14:49:20,554] [INFO] [multinode_runner.py:85:get_cmd] Running on the following workers: 172.17.0.4,172.17.0.7
# [2025-10-23 14:49:20,554] [INFO] [runner.py:630:main] cmd = pdsh -S -f 1024 -w 172.17.0.4,172.17.0.7 export NCCL_P2P_DISABLE=1; export NCCL_SOCKET_IFNAME=eth0; export NCCL_DEBUG=INFO; export NCCL_IB_DISABLE=1; export PYTHONPATH=/data/llm-in-practise/LLM_Distributed_Trainning/DeepSpeed/DeepSpeed-GPTLike-ZeRO-1; cd /data/llm-in-practise/LLM_Distributed_Trainning/DeepSpeed/DeepSpeed-GPTLike-ZeRO-1; /root/miniconda3/bin/python -u -m deepspeed.launcher.launch --world_info=eyIxNzIuMTcuMC40IjogWzAsIDFdLCAiMTcyLjE3LjAuNyI6IFswLCAxXX0= --node_rank=%n --master_addr=172.17.0.4 --master_port=29500 --enable_each_rank_log=None DeepSpeed-GPTLike-ZeRO-1.py
# 172.17.0.4: [2025-10-23 14:49:24,946] [INFO] [launch.py:155:main] 0 NCCL_P2P_DISABLE=1
# 172.17.0.4: [2025-10-23 14:49:24,946] [INFO] [launch.py:155:main] 0 NCCL_SOCKET_IFNAME=eth0
# 172.17.0.4: [2025-10-23 14:49:24,946] [INFO] [launch.py:155:main] 0 NCCL_DEBUG=INFO
# 172.17.0.4: [2025-10-23 14:49:24,946] [INFO] [launch.py:155:main] 0 TORCH_NCCL_BLOCKING_WAIT=1
# 172.17.0.4: [2025-10-23 14:49:24,946] [INFO] [launch.py:155:main] 0 NCCL_IB_DISABLE=1
# 172.17.0.4: [2025-10-23 14:49:24,946] [INFO] [launch.py:162:main] WORLD INFO DICT: {'172.17.0.4': [0, 1], '172.17.0.7': [0, 1]}
# 172.17.0.4: [2025-10-23 14:49:24,946] [INFO] [launch.py:168:main] nnodes=2, num_local_procs=2, node_rank=0
# 172.17.0.4: [2025-10-23 14:49:24,946] [INFO] [launch.py:179:main] global_rank_mapping=defaultdict(<class 'list'>, {'172.17.0.4': [0, 1], '172.17.0.7': [2, 3]})
# 172.17.0.4: [2025-10-23 14:49:24,946] [INFO] [launch.py:180:main] dist_world_size=4
# 172.17.0.4: [2025-10-23 14:49:24,946] [INFO] [launch.py:184:main] Setting CUDA_VISIBLE_DEVICES=0,1
# 172.17.0.4: [2025-10-23 14:49:24,947] [INFO] [launch.py:272:main] process 1345 spawned with command: ['/root/miniconda3/bin/python', '-u', 'DeepSpeed-GPTLike-ZeRO-1.py', '--local_rank=0']
# 172.17.0.4: [2025-10-23 14:49:24,947] [INFO] [launch.py:272:main] process 1346 spawned with command: ['/root/miniconda3/bin/python', '-u', 'DeepSpeed-GPTLike-ZeRO-1.py', '--local_rank=1']
# 172.17.0.7: [2025-10-23 14:49:25,309] [INFO] [launch.py:155:main] 1 NCCL_P2P_DISABLE=1
# 172.17.0.7: [2025-10-23 14:49:25,309] [INFO] [launch.py:155:main] 1 NCCL_SOCKET_IFNAME=eth0
# 172.17.0.7: [2025-10-23 14:49:25,309] [INFO] [launch.py:155:main] 1 NCCL_DEBUG=INFO
# 172.17.0.7: [2025-10-23 14:49:25,309] [INFO] [launch.py:155:main] 1 TORCH_NCCL_BLOCKING_WAIT=1
# 172.17.0.7: [2025-10-23 14:49:25,309] [INFO] [launch.py:155:main] 1 NCCL_IB_DISABLE=1
# 172.17.0.7: [2025-10-23 14:49:25,309] [INFO] [launch.py:162:main] WORLD INFO DICT: {'172.17.0.4': [0, 1], '172.17.0.7': [0, 1]}
# 172.17.0.7: [2025-10-23 14:49:25,309] [INFO] [launch.py:168:main] nnodes=2, num_local_procs=2, node_rank=1
# 172.17.0.7: [2025-10-23 14:49:25,309] [INFO] [launch.py:179:main] global_rank_mapping=defaultdict(<class 'list'>, {'172.17.0.4': [0, 1], '172.17.0.7': [2, 3]})
# 172.17.0.7: [2025-10-23 14:49:25,309] [INFO] [launch.py:180:main] dist_world_size=4
# 172.17.0.7: [2025-10-23 14:49:25,309] [INFO] [launch.py:184:main] Setting CUDA_VISIBLE_DEVICES=0,1
# 172.17.0.7: [2025-10-23 14:49:25,310] [INFO] [launch.py:272:main] process 865 spawned with command: ['/root/miniconda3/bin/python', '-u', 'DeepSpeed-GPTLike-ZeRO-1.py', '--local_rank=0']
# 172.17.0.7: [2025-10-23 14:49:25,310] [INFO] [launch.py:272:main] process 866 spawned with command: ['/root/miniconda3/bin/python', '-u', 'DeepSpeed-GPTLike-ZeRO-1.py', '--local_rank=1']
# 172.17.0.4: 2025-10-23 14:49:29,796 - INFO - 从 ds_config.json 读取到 train_micro_batch_size_per_gpu=16,将用于 DataLoader.batch_size(覆盖命令行 --batch_size)
# 172.17.0.4: 2025-10-23 14:49:29,890 - INFO - 从 ds_config.json 读取到 train_micro_batch_size_per_gpu=16,将用于 DataLoader.batch_size(覆盖命令行 --batch_size)
# 172.17.0.7: 2025-10-23 14:49:30,091 - INFO - 从 ds_config.json 读取到 train_micro_batch_size_per_gpu=16,将用于 DataLoader.batch_size(覆盖命令行 --batch_size)
# 172.17.0.7: 2025-10-23 14:49:30,296 - INFO - 从 ds_config.json 读取到 train_micro_batch_size_per_gpu=16,将用于 DataLoader.batch_size(覆盖命令行 --batch_size)
# 172.17.0.4: autodl-container-fa4746a7fc-5df31a9e:1345:1345 [0] NCCL INFO NCCL_SOCKET_IFNAME set by environment to eth0
# 172.17.0.4: autodl-container-fa4746a7fc-5df31a9e:1345:1345 [0] NCCL INFO Bootstrap: Using eth0:172.17.0.4<0>
# 172.17.0.4: autodl-container-fa4746a7fc-5df31a9e:1345:1345 [0] NCCL INFO cudaDriverVersion 13000
# 172.17.0.4: autodl-container-fa4746a7fc-5df31a9e:1345:1345 [0] NCCL INFO NCCL version 2.27.3+cuda12.9
# 172.17.0.4: autodl-container-fa4746a7fc-5df31a9e:1346:1346 [1] NCCL INFO cudaDriverVersion 13000
# 172.17.0.4: autodl-container-fa4746a7fc-5df31a9e:1346:1346 [1] NCCL INFO NCCL_SOCKET_IFNAME set by environment to eth0
# 172.17.0.4: autodl-container-fa4746a7fc-5df31a9e:1346:1346 [1] NCCL INFO Bootstrap: Using eth0:172.17.0.4<0>
# 172.17.0.4: autodl-container-fa4746a7fc-5df31a9e:1346:1346 [1] NCCL INFO NCCL version 2.27.3+cuda12.9
# 172.17.0.7: autodl-container-fa4746a7fc-c6053b3e:865:865 [0] NCCL INFO cudaDriverVersion 13000
# 172.17.0.4: autodl-container-fa4746a7fc-5df31a9e:1346:1346 [1] NCCL INFO Comm config Blocking set to 1
# 172.17.0.7: autodl-container-fa4746a7fc-c6053b3e:865:865 [0] NCCL INFO NCCL_SOCKET_IFNAME set by environment to eth0
# 172.17.0.7: autodl-container-fa4746a7fc-c6053b3e:865:865 [0] NCCL INFO Bootstrap: Using eth0:172.17.0.7<0>
# 172.17.0.7: autodl-container-fa4746a7fc-c6053b3e:865:865 [0] NCCL INFO NCCL version 2.27.3+cuda12.9
# 172.17.0.4: autodl-container-fa4746a7fc-5df31a9e:1346:1873 [1] NCCL INFO NET/Plugin: Could not find: libnccl-net.so.
# 172.17.0.4: autodl-container-fa4746a7fc-5df31a9e:1346:1873 [1] NCCL INFO NCCL_IB_DISABLE set by environment to 1.
# 172.17.0.4: autodl-container-fa4746a7fc-5df31a9e:1346:1873 [1] NCCL INFO NCCL_SOCKET_IFNAME set by environment to eth0
# 172.17.0.4: autodl-container-fa4746a7fc-5df31a9e:1346:1873 [1] NCCL INFO NET/Socket : Using [0]eth0:172.17.0.4<0>
# 172.17.0.4: autodl-container-fa4746a7fc-5df31a9e:1346:1873 [1] NCCL INFO Initialized NET plugin Socket
# 172.17.0.4: autodl-container-fa4746a7fc-5df31a9e:1346:1873 [1] NCCL INFO Assigned NET plugin Socket to comm
# 172.17.0.4: autodl-container-fa4746a7fc-5df31a9e:1346:1873 [1] NCCL INFO Using network Socket
# 172.17.0.4: autodl-container-fa4746a7fc-5df31a9e:1346:1873 [1] NCCL INFO ncclCommInitRankConfig comm 0xe22f3e0 rank 1 nranks 4 cudaDev 1 nvmlDev 1 busId a8000 commId 0xf5679ad6d7bb0aa5 - Init START
# 172.17.0.7: autodl-container-fa4746a7fc-c6053b3e:865:865 [0] NCCL INFO Comm config Blocking set to 1
# 172.17.0.4: autodl-container-fa4746a7fc-5df31a9e:1345:1345 [0] NCCL INFO Comm config Blocking set to 1
# 172.17.0.7: autodl-container-fa4746a7fc-c6053b3e:866:866 [1] NCCL INFO cudaDriverVersion 13000
# 172.17.0.7: autodl-container-fa4746a7fc-c6053b3e:866:866 [1] NCCL INFO NCCL_SOCKET_IFNAME set by environment to eth0
# 172.17.0.7: autodl-container-fa4746a7fc-c6053b3e:866:866 [1] NCCL INFO Bootstrap: Using eth0:172.17.0.7<0>
# 172.17.0.7: autodl-container-fa4746a7fc-c6053b3e:866:866 [1] NCCL INFO NCCL version 2.27.3+cuda12.9
# 172.17.0.7: autodl-container-fa4746a7fc-c6053b3e:866:866 [1] NCCL INFO Comm config Blocking set to 1
# 172.17.0.7: autodl-container-fa4746a7fc-c6053b3e:865:1387 [0] NCCL INFO NET/Plugin: Could not find: libnccl-net.so.
# 172.17.0.7: autodl-container-fa4746a7fc-c6053b3e:865:1387 [0] NCCL INFO NCCL_IB_DISABLE set by environment to 1.
# 172.17.0.7: autodl-container-fa4746a7fc-c6053b3e:865:1387 [0] NCCL INFO NCCL_SOCKET_IFNAME set by environment to eth0
# 172.17.0.7: autodl-container-fa4746a7fc-c6053b3e:865:1387 [0] NCCL INFO NET/Socket : Using [0]eth0:172.17.0.7<0>
# 172.17.0.7: autodl-container-fa4746a7fc-c6053b3e:865:1387 [0] NCCL INFO Initialized NET plugin Socket
# 172.17.0.7: autodl-container-fa4746a7fc-c6053b3e:865:1387 [0] NCCL INFO Assigned NET plugin Socket to comm
# 172.17.0.7: autodl-container-fa4746a7fc-c6053b3e:865:1387 [0] NCCL INFO Using network Socket
# 172.17.0.7: autodl-container-fa4746a7fc-c6053b3e:865:1387 [0] NCCL INFO ncclCommInitRankConfig comm 0xc696350 rank 2 nranks 4 cudaDev 0 nvmlDev 0 busId 27000 commId 0xf5679ad6d7bb0aa5 - Init START
# 172.17.0.4: autodl-container-fa4746a7fc-5df31a9e:1345:1874 [0] NCCL INFO NET/Plugin: Could not find: libnccl-net.so.
# 172.17.0.4: autodl-container-fa4746a7fc-5df31a9e:1345:1874 [0] NCCL INFO NCCL_IB_DISABLE set by environment to 1.
# 172.17.0.4: autodl-container-fa4746a7fc-5df31a9e:1345:1874 [0] NCCL INFO NCCL_SOCKET_IFNAME set by environment to eth0
# 172.17.0.4: autodl-container-fa4746a7fc-5df31a9e:1345:1874 [0] NCCL INFO NET/Socket : Using [0]eth0:172.17.0.4<0>
# 172.17.0.4: autodl-container-fa4746a7fc-5df31a9e:1345:1874 [0] NCCL INFO Initialized NET plugin Socket
# 172.17.0.4: autodl-container-fa4746a7fc-5df31a9e:1345:1874 [0] NCCL INFO Assigned NET plugin Socket to comm
# 172.17.0.4: autodl-container-fa4746a7fc-5df31a9e:1345:1874 [0] NCCL INFO Using network Socket
# 172.17.0.4: autodl-container-fa4746a7fc-5df31a9e:1345:1874 [0] NCCL INFO ncclCommInitRankConfig comm 0xe4c5920 rank 0 nranks 4 cudaDev 0 nvmlDev 0 busId 38000 commId 0xf5679ad6d7bb0aa5 - Init START
# 172.17.0.4: autodl-container-fa4746a7fc-5df31a9e:1346:1873 [1] NCCL INFO RAS client listening socket at ::1<28028>
# 172.17.0.7: autodl-container-fa4746a7fc-c6053b3e:866:1388 [1] NCCL INFO NET/Plugin: Could not find: libnccl-net.so.
# 172.17.0.7: autodl-container-fa4746a7fc-c6053b3e:866:1388 [1] NCCL INFO NCCL_IB_DISABLE set by environment to 1.
# 172.17.0.7: autodl-container-fa4746a7fc-c6053b3e:866:1388 [1] NCCL INFO NCCL_SOCKET_IFNAME set by environment to eth0
# 172.17.0.7: autodl-container-fa4746a7fc-c6053b3e:866:1388 [1] NCCL INFO NET/Socket : Using [0]eth0:172.17.0.7<0>
# 172.17.0.7: autodl-container-fa4746a7fc-c6053b3e:866:1388 [1] NCCL INFO Initialized NET plugin Socket
# 172.17.0.7: autodl-container-fa4746a7fc-c6053b3e:866:1388 [1] NCCL INFO Assigned NET plugin Socket to comm
# 172.17.0.7: autodl-container-fa4746a7fc-c6053b3e:866:1388 [1] NCCL INFO Using network Socket
# 172.17.0.7: autodl-container-fa4746a7fc-c6053b3e:866:1388 [1] NCCL INFO ncclCommInitRankConfig comm 0xe296820 rank 3 nranks 4 cudaDev 1 nvmlDev 1 busId c8000 commId 0xf5679ad6d7bb0aa5 - Init START
# 172.17.0.7: autodl-container-fa4746a7fc-c6053b3e:865:1387 [0] NCCL INFO RAS client listening socket at ::1<28028>
# 172.17.0.7: autodl-container-fa4746a7fc-c6053b3e:866:1388 [1] NCCL INFO RAS client listening socket at ::1<28028>
# 172.17.0.4: autodl-container-fa4746a7fc-5df31a9e:1345:1874 [0] NCCL INFO RAS client listening socket at ::1<28028>
# 172.17.0.4: autodl-container-fa4746a7fc-5df31a9e:1345:1874 [0] NCCL INFO Bootstrap timings total 0.004980 (create 0.000019, send 0.000074, recv 0.000114, ring 0.000069, delay 0.000000)
# 172.17.0.7: autodl-container-fa4746a7fc-c6053b3e:866:1388 [1] NCCL INFO Bootstrap timings total 0.013699 (create 0.000018, send 0.000105, recv 0.000387, ring 0.000113, delay 0.000000)
# 172.17.0.7: autodl-container-fa4746a7fc-c6053b3e:865:1387 [0] NCCL INFO Bootstrap timings total 0.048091 (create 0.000027, send 0.000131, recv 0.032747, ring 0.000133, delay 0.000000)
# 172.17.0.4: autodl-container-fa4746a7fc-5df31a9e:1346:1873 [1] NCCL INFO Bootstrap timings total 0.330419 (create 0.000030, send 0.000116, recv 0.276658, ring 0.004386, delay 0.000000)
# 172.17.0.7: autodl-container-fa4746a7fc-c6053b3e:866:1388 [1] NCCL INFO NCCL_P2P_DISABLE set by environment to 1
# 172.17.0.7: autodl-container-fa4746a7fc-c6053b3e:865:1387 [0] NCCL INFO NCCL_P2P_DISABLE set by environment to 1
# 172.17.0.4: autodl-container-fa4746a7fc-5df31a9e:1345:1874 [0] NCCL INFO NCCL_P2P_DISABLE set by environment to 1
# 172.17.0.4: autodl-container-fa4746a7fc-5df31a9e:1346:1873 [1] NCCL INFO NCCL_P2P_DISABLE set by environment to 1
# 172.17.0.7: autodl-container-fa4746a7fc-c6053b3e:865:1387 [0] NCCL INFO Setting affinity for GPU 0 to 0-51,104-155
# 172.17.0.7: autodl-container-fa4746a7fc-c6053b3e:866:1388 [1] NCCL INFO Setting affinity for GPU 1 to 52-103,156-207
# 172.17.0.4: autodl-container-fa4746a7fc-5df31a9e:1345:1874 [0] NCCL INFO Setting affinity for GPU 0 to 0-51,104-155
# 172.17.0.4: autodl-container-fa4746a7fc-5df31a9e:1346:1873 [1] NCCL INFO Setting affinity for GPU 1 to 52-103,156-207
# 172.17.0.7: autodl-container-fa4746a7fc-c6053b3e:866:1388 [1] NCCL INFO comm 0xe296820 rank 3 nRanks 4 nNodes 2 localRanks 2 localRank 1 MNNVL 0
# 172.17.0.7: autodl-container-fa4746a7fc-c6053b3e:866:1388 [1] NCCL INFO Trees [0] -1/-1/-1->3->2 [1] -1/-1/-1->3->2
# 172.17.0.4: autodl-container-fa4746a7fc-5df31a9e:1346:1873 [1] NCCL INFO comm 0xe22f3e0 rank 1 nRanks 4 nNodes 2 localRanks 2 localRank 1 MNNVL 0
# 172.17.0.7: autodl-container-fa4746a7fc-c6053b3e:866:1388 [1] NCCL INFO P2P Chunksize set to 131072
# 172.17.0.4: autodl-container-fa4746a7fc-5df31a9e:1345:1874 [0] NCCL INFO comm 0xe4c5920 rank 0 nRanks 4 nNodes 2 localRanks 2 localRank 0 MNNVL 0
# 172.17.0.7: autodl-container-fa4746a7fc-c6053b3e:865:1387 [0] NCCL INFO comm 0xc696350 rank 2 nRanks 4 nNodes 2 localRanks 2 localRank 0 MNNVL 0
# 172.17.0.4: autodl-container-fa4746a7fc-5df31a9e:1346:1873 [1] NCCL INFO Trees [0] -1/-1/-1->1->0 [1] -1/-1/-1->1->0
# 172.17.0.4: autodl-container-fa4746a7fc-5df31a9e:1346:1873 [1] NCCL INFO P2P Chunksize set to 131072
# 172.17.0.4: autodl-container-fa4746a7fc-5df31a9e:1345:1874 [0] NCCL INFO Channel 00/02 : 0 1 2 3
# 172.17.0.4: autodl-container-fa4746a7fc-5df31a9e:1345:1874 [0] NCCL INFO Channel 01/02 : 0 1 2 3
# 172.17.0.4: autodl-container-fa4746a7fc-5df31a9e:1345:1874 [0] NCCL INFO Trees [0] 1/2/-1->0->-1 [1] 1/-1/-1->0->2
# 172.17.0.7: autodl-container-fa4746a7fc-c6053b3e:865:1387 [0] NCCL INFO Trees [0] 3/-1/-1->2->0 [1] 3/0/-1->2->-1
# 172.17.0.7: autodl-container-fa4746a7fc-c6053b3e:865:1387 [0] NCCL INFO P2P Chunksize set to 131072
# 172.17.0.4: autodl-container-fa4746a7fc-5df31a9e:1345:1874 [0] NCCL INFO P2P Chunksize set to 131072
# 172.17.0.7: autodl-container-fa4746a7fc-c6053b3e:866:1388 [1] NCCL INFO PROFILER/Plugin: Could not find: libnccl-profiler.so.
# 172.17.0.4: autodl-container-fa4746a7fc-5df31a9e:1346:1873 [1] NCCL INFO PROFILER/Plugin: Could not find: libnccl-profiler.so.
# 172.17.0.4: autodl-container-fa4746a7fc-5df31a9e:1345:1874 [0] NCCL INFO PROFILER/Plugin: Could not find: libnccl-profiler.so.
# 172.17.0.4: autodl-container-fa4746a7fc-5df31a9e:1345:1874 [0] NCCL INFO Check P2P Type isAllDirectP2p 0 directMode 0
# 172.17.0.7: autodl-container-fa4746a7fc-c6053b3e:865:1387 [0] NCCL INFO PROFILER/Plugin: Could not find: libnccl-profiler.so.
# 172.17.0.7: autodl-container-fa4746a7fc-c6053b3e:866:1391 [1] NCCL INFO [Proxy Service] Device 1 CPU core 158
# 172.17.0.7: autodl-container-fa4746a7fc-c6053b3e:866:1392 [1] NCCL INFO [Proxy Service UDS] Device 1 CPU core 67
# 172.17.0.4: autodl-container-fa4746a7fc-5df31a9e:1345:1878 [0] NCCL INFO [Proxy Service] Device 0 CPU core 38
# 172.17.0.4: autodl-container-fa4746a7fc-5df31a9e:1346:1877 [1] NCCL INFO [Proxy Service] Device 1 CPU core 61
# 172.17.0.4: autodl-container-fa4746a7fc-5df31a9e:1346:1879 [1] NCCL INFO [Proxy Service UDS] Device 1 CPU core 67
# 172.17.0.7: autodl-container-fa4746a7fc-c6053b3e:865:1394 [0] NCCL INFO [Proxy Service UDS] Device 0 CPU core 144
# 172.17.0.7: autodl-container-fa4746a7fc-c6053b3e:865:1393 [0] NCCL INFO [Proxy Service] Device 0 CPU core 30
# 172.17.0.4: autodl-container-fa4746a7fc-5df31a9e:1345:1880 [0] NCCL INFO [Proxy Service UDS] Device 0 CPU core 144
# 172.17.0.4: autodl-container-fa4746a7fc-5df31a9e:1346:1873 [1] NCCL INFO threadThresholds 8/8/64 | 32/8/64 | 512 | 512
# 172.17.0.4: autodl-container-fa4746a7fc-5df31a9e:1346:1873 [1] NCCL INFO 2 coll channels, 2 collnet channels, 0 nvls channels, 2 p2p channels, 2 p2p channels per peer
# 172.17.0.4: autodl-container-fa4746a7fc-5df31a9e:1345:1874 [0] NCCL INFO threadThresholds 8/8/64 | 32/8/64 | 512 | 512
# 172.17.0.4: autodl-container-fa4746a7fc-5df31a9e:1345:1874 [0] NCCL INFO 2 coll channels, 2 collnet channels, 0 nvls channels, 2 p2p channels, 2 p2p channels per peer
# 172.17.0.7: autodl-container-fa4746a7fc-c6053b3e:866:1388 [1] NCCL INFO threadThresholds 8/8/64 | 32/8/64 | 512 | 512
# 172.17.0.7: autodl-container-fa4746a7fc-c6053b3e:866:1388 [1] NCCL INFO 2 coll channels, 2 collnet channels, 0 nvls channels, 2 p2p channels, 2 p2p channels per peer
# 172.17.0.7: autodl-container-fa4746a7fc-c6053b3e:865:1387 [0] NCCL INFO threadThresholds 8/8/64 | 32/8/64 | 512 | 512
# 172.17.0.7: autodl-container-fa4746a7fc-c6053b3e:865:1387 [0] NCCL INFO 2 coll channels, 2 collnet channels, 0 nvls channels, 2 p2p channels, 2 p2p channels per peer
# 172.17.0.4: autodl-container-fa4746a7fc-5df31a9e:1345:1874 [0] NCCL INFO CC Off, workFifoBytes 1048576
# 172.17.0.4: autodl-container-fa4746a7fc-5df31a9e:1346:1873 [1] NCCL INFO TUNER/Plugin: Could not find: libnccl-tuner.so. Using internal tuner plugin.
# 172.17.0.4: autodl-container-fa4746a7fc-5df31a9e:1345:1874 [0] NCCL INFO TUNER/Plugin: Could not find: libnccl-tuner.so. Using internal tuner plugin.
# 172.17.0.4: autodl-container-fa4746a7fc-5df31a9e:1346:1873 [1] NCCL INFO ncclCommInitRankConfig comm 0xe22f3e0 rank 1 nranks 4 cudaDev 1 nvmlDev 1 busId a8000 commId 0xf5679ad6d7bb0aa5 - Init COMPLETE
# 172.17.0.7: autodl-container-fa4746a7fc-c6053b3e:866:1388 [1] NCCL INFO TUNER/Plugin: Could not find: libnccl-tuner.so. Using internal tuner plugin.
# 172.17.0.4: autodl-container-fa4746a7fc-5df31a9e:1345:1874 [0] NCCL INFO ncclCommInitRankConfig comm 0xe4c5920 rank 0 nranks 4 cudaDev 0 nvmlDev 0 busId 38000 commId 0xf5679ad6d7bb0aa5 - Init COMPLETE
# 172.17.0.7: autodl-container-fa4746a7fc-c6053b3e:865:1387 [0] NCCL INFO TUNER/Plugin: Could not find: libnccl-tuner.so. Using internal tuner plugin.
# 172.17.0.4: autodl-container-fa4746a7fc-5df31a9e:1346:1873 [1] NCCL INFO Init timings - ncclCommInitRankConfig: rank 1 nranks 4 total 0.73 (kernels 0.38, alloc 0.00, bootstrap 0.33, allgathers 0.00, topo 0.01, graphs 0.00, connections 0.00, rest 0.00)
# 172.17.0.7: autodl-container-fa4746a7fc-c6053b3e:866:1388 [1] NCCL INFO ncclCommInitRankConfig comm 0xe296820 rank 3 nranks 4 cudaDev 1 nvmlDev 1 busId c8000 commId 0xf5679ad6d7bb0aa5 - Init COMPLETE
# 172.17.0.4: autodl-container-fa4746a7fc-5df31a9e:1345:1874 [0] NCCL INFO Init timings - ncclCommInitRankConfig: rank 0 nranks 4 total 0.26 (kernels 0.22, alloc 0.00, bootstrap 0.00, allgathers 0.02, topo 0.01, graphs 0.00, connections 0.00, rest 0.00)
# 172.17.0.7: autodl-container-fa4746a7fc-c6053b3e:865:1387 [0] NCCL INFO ncclCommInitRankConfig comm 0xc696350 rank 2 nranks 4 cudaDev 0 nvmlDev 0 busId 27000 commId 0xf5679ad6d7bb0aa5 - Init COMPLETE
# 172.17.0.7: autodl-container-fa4746a7fc-c6053b3e:866:1388 [1] NCCL INFO Init timings - ncclCommInitRankConfig: rank 3 nranks 4 total 0.25 (kernels 0.22, alloc 0.00, bootstrap 0.01, allgathers 0.01, topo 0.01, graphs 0.00, connections 0.00, rest 0.00)
# 172.17.0.7: autodl-container-fa4746a7fc-c6053b3e:865:1387 [0] NCCL INFO Init timings - ncclCommInitRankConfig: rank 2 nranks 4 total 0.29 (kernels 0.22, alloc 0.00, bootstrap 0.05, allgathers 0.01, topo 0.01, graphs 0.00, connections 0.00, rest 0.00)
# 172.17.0.7: 2025-10-23 14:49:44,056 - INFO - 加载 WikiText 数据集...
# 172.17.0.4: 2025-10-23 14:49:44,061 - INFO - 加载 WikiText 数据集...
# 172.17.0.7: 2025-10-23 14:49:44,075 - INFO - 加载 WikiText 数据集...
# 172.17.0.4: 2025-10-23 14:49:44,095 - INFO - DeepSpeed 环境:rank=0, world_size=4
# 172.17.0.4: 2025-10-23 14:49:44,096 - INFO - 使用设备: cuda:0
# 172.17.0.4: 2025-10-23 14:49:44,096 - INFO - 加载 WikiText 数据集...
# 172.17.0.7: 2025-10-23 14:50:05,200 - INFO - 加载 23767 条非空文本
# 172.17.0.4: 2025-10-23 14:50:05,372 - INFO - 加载 23767 条非空文本
# 172.17.0.7: 2025-10-23 14:50:05,720 - INFO - 编码文本...
# 172.17.0.4: 2025-10-23 14:50:05,951 - INFO - BERT 分词器词汇大小: 30522
# 172.17.0.4: 2025-10-23 14:50:05,951 - INFO - 编码文本...
# 172.17.0.4: 2025-10-23 14:50:11,886 - INFO - 加载 23767 条非空文本
# 172.17.0.4: 2025-10-23 14:50:12,408 - INFO - 编码文本...
# 172.17.0.7: 2025-10-23 14:50:16,823 - INFO - 加载 23767 条非空文本
# 172.17.0.7: 2025-10-23 14:50:17,341 - INFO - 编码文本...
# 172.17.0.7: 2025-10-23 14:50:22,983 - INFO - 总 token 数=2301731, 块数=8991
# 172.17.0.4: 2025-10-23 14:50:22,983 - INFO - 总 token 数=2301731, 块数=8991
# 172.17.0.4: autodl-container-fa4746a7fc-5df31a9e:1345:1345 [0] NCCL INFO Comm config Blocking set to 1
# 172.17.0.7: autodl-container-fa4746a7fc-c6053b3e:866:866 [1] NCCL INFO Comm config Blocking set to 1
# 172.17.0.4: 2025-10-23 14:50:29,506 - INFO - 总 token 数=2301731, 块数=8991
# 172.17.0.4: autodl-container-fa4746a7fc-5df31a9e:1346:1346 [1] NCCL INFO Comm config Blocking set to 1
# 172.17.0.7: 2025-10-23 14:50:34,779 - INFO - 总 token 数=2301731, 块数=8991
# 172.17.0.7: autodl-container-fa4746a7fc-c6053b3e:865:865 [0] NCCL INFO Comm config Blocking set to 1
# 172.17.0.4: autodl-container-fa4746a7fc-5df31a9e:1345:2026 [0] NCCL INFO Assigned NET plugin Socket to comm
# 172.17.0.7: autodl-container-fa4746a7fc-c6053b3e:866:1518 [1] NCCL INFO Assigned NET plugin Socket to comm
# 172.17.0.4: autodl-container-fa4746a7fc-5df31a9e:1345:2026 [0] NCCL INFO Using network Socket
# 172.17.0.7: autodl-container-fa4746a7fc-c6053b3e:865:1628 [0] NCCL INFO Assigned NET plugin Socket to comm
# 172.17.0.7: autodl-container-fa4746a7fc-c6053b3e:866:1518 [1] NCCL INFO Using network Socket
# 172.17.0.7: autodl-container-fa4746a7fc-c6053b3e:865:1628 [0] NCCL INFO Using network Socket
# 172.17.0.4: autodl-container-fa4746a7fc-5df31a9e:1346:2134 [1] NCCL INFO Assigned NET plugin Socket to comm
# 172.17.0.4: autodl-container-fa4746a7fc-5df31a9e:1346:2134 [1] NCCL INFO Using network Socket
# 172.17.0.4: autodl-container-fa4746a7fc-5df31a9e:1346:2134 [1] NCCL INFO ncclCommSplit comm 0x187c80a0 rank 1 nranks 4 cudaDev 1 nvmlDev 1 busId a8000 parent 0xe22f3e0 splitCount 1 color 2003953581 key 1- Init START
# 172.17.0.4: autodl-container-fa4746a7fc-5df31a9e:1345:2026 [0] NCCL INFO ncclCommSplit comm 0x11908a10 rank 0 nranks 4 cudaDev 0 nvmlDev 0 busId 38000 parent 0xe4c5920 splitCount 1 color 2003953581 key 0- Init START
# 172.17.0.7: autodl-container-fa4746a7fc-c6053b3e:866:1518 [1] NCCL INFO ncclCommSplit comm 0x18843b70 rank 3 nranks 4 cudaDev 1 nvmlDev 1 busId c8000 parent 0xe296820 splitCount 1 color 2003953581 key 3- Init START
# 172.17.0.7: autodl-container-fa4746a7fc-c6053b3e:865:1628 [0] NCCL INFO ncclCommSplit comm 0xfada8c0 rank 2 nranks 4 cudaDev 0 nvmlDev 0 busId 27000 parent 0xc696350 splitCount 1 color 2003953581 key 2- Init START
# 172.17.0.4: autodl-container-fa4746a7fc-5df31a9e:1345:2026 [0] NCCL INFO Setting affinity for GPU 0 to 0-51,104-155
# 172.17.0.4: autodl-container-fa4746a7fc-5df31a9e:1346:2134 [1] NCCL INFO Setting affinity for GPU 1 to 52-103,156-207
# 172.17.0.7: autodl-container-fa4746a7fc-c6053b3e:866:1518 [1] NCCL INFO Setting affinity for GPU 1 to 52-103,156-207
# 172.17.0.7: autodl-container-fa4746a7fc-c6053b3e:865:1628 [0] NCCL INFO Setting affinity for GPU 0 to 0-51,104-155
# 172.17.0.4: autodl-container-fa4746a7fc-5df31a9e:1346:2134 [1] NCCL INFO comm 0x187c80a0 rank 1 nRanks 4 nNodes 2 localRanks 2 localRank 1 MNNVL 0
# 172.17.0.4: autodl-container-fa4746a7fc-5df31a9e:1345:2026 [0] NCCL INFO comm 0x11908a10 rank 0 nRanks 4 nNodes 2 localRanks 2 localRank 0 MNNVL 0
# 172.17.0.4: autodl-container-fa4746a7fc-5df31a9e:1346:2134 [1] NCCL INFO Trees [0] -1/-1/-1->1->0 [1] -1/-1/-1->1->0
# 172.17.0.7: autodl-container-fa4746a7fc-c6053b3e:866:1518 [1] NCCL INFO comm 0x18843b70 rank 3 nRanks 4 nNodes 2 localRanks 2 localRank 1 MNNVL 0
# 172.17.0.7: autodl-container-fa4746a7fc-c6053b3e:865:1628 [0] NCCL INFO comm 0xfada8c0 rank 2 nRanks 4 nNodes 2 localRanks 2 localRank 0 MNNVL 0
# 172.17.0.4: autodl-container-fa4746a7fc-5df31a9e:1346:2134 [1] NCCL INFO P2P Chunksize set to 131072
# 172.17.0.7: autodl-container-fa4746a7fc-c6053b3e:866:1518 [1] NCCL INFO Trees [0] -1/-1/-1->3->2 [1] -1/-1/-1->3->2
# 172.17.0.7: autodl-container-fa4746a7fc-c6053b3e:866:1518 [1] NCCL INFO P2P Chunksize set to 131072
# 172.17.0.4: autodl-container-fa4746a7fc-5df31a9e:1345:2026 [0] NCCL INFO Channel 00/02 : 0 1 2 3
# 172.17.0.7: autodl-container-fa4746a7fc-c6053b3e:865:1628 [0] NCCL INFO Trees [0] 3/-1/-1->2->0 [1] 3/0/-1->2->-1
# 172.17.0.7: autodl-container-fa4746a7fc-c6053b3e:865:1628 [0] NCCL INFO P2P Chunksize set to 131072
# 172.17.0.4: autodl-container-fa4746a7fc-5df31a9e:1345:2026 [0] NCCL INFO Channel 01/02 : 0 1 2 3
# 172.17.0.4: autodl-container-fa4746a7fc-5df31a9e:1345:2026 [0] NCCL INFO Trees [0] 1/2/-1->0->-1 [1] 1/-1/-1->0->2
# 172.17.0.4: autodl-container-fa4746a7fc-5df31a9e:1345:2026 [0] NCCL INFO P2P Chunksize set to 131072
# 172.17.0.4: autodl-container-fa4746a7fc-5df31a9e:1345:2026 [0] NCCL INFO Check P2P Type isAllDirectP2p 0 directMode 0
# 172.17.0.7: autodl-container-fa4746a7fc-c6053b3e:865:1629 [0] NCCL INFO [Proxy Service] Device 0 CPU core 3
# 172.17.0.7: autodl-container-fa4746a7fc-c6053b3e:865:1631 [0] NCCL INFO [Proxy Service UDS] Device 0 CPU core 128
# 172.17.0.4: autodl-container-fa4746a7fc-5df31a9e:1346:2140 [1] NCCL INFO [Proxy Service UDS] Device 1 CPU core 178
# 172.17.0.4: autodl-container-fa4746a7fc-5df31a9e:1345:2141 [0] NCCL INFO [Proxy Service UDS] Device 0 CPU core 129
# 172.17.0.7: autodl-container-fa4746a7fc-c6053b3e:866:1630 [1] NCCL INFO [Proxy Service] Device 1 CPU core 173
# 172.17.0.7: autodl-container-fa4746a7fc-c6053b3e:866:1632 [1] NCCL INFO [Proxy Service UDS] Device 1 CPU core 176
# 172.17.0.4: autodl-container-fa4746a7fc-5df31a9e:1345:2139 [0] NCCL INFO [Proxy Service] Device 0 CPU core 125
# 172.17.0.4: autodl-container-fa4746a7fc-5df31a9e:1346:2138 [1] NCCL INFO [Proxy Service] Device 1 CPU core 69
# 172.17.0.4: autodl-container-fa4746a7fc-5df31a9e:1345:2026 [0] NCCL INFO threadThresholds 8/8/64 | 32/8/64 | 512 | 512
# 172.17.0.4: autodl-container-fa4746a7fc-5df31a9e:1345:2026 [0] NCCL INFO 2 coll channels, 2 collnet channels, 0 nvls channels, 2 p2p channels, 2 p2p channels per peer
# 172.17.0.7: autodl-container-fa4746a7fc-c6053b3e:865:1628 [0] NCCL INFO threadThresholds 8/8/64 | 32/8/64 | 512 | 512
# 172.17.0.7: autodl-container-fa4746a7fc-c6053b3e:865:1628 [0] NCCL INFO 2 coll channels, 2 collnet channels, 0 nvls channels, 2 p2p channels, 2 p2p channels per peer
# 172.17.0.4: autodl-container-fa4746a7fc-5df31a9e:1345:2026 [0] NCCL INFO CC Off, workFifoBytes 1048576
# 172.17.0.7: autodl-container-fa4746a7fc-c6053b3e:866:1518 [1] NCCL INFO threadThresholds 8/8/64 | 32/8/64 | 512 | 512
# 172.17.0.7: autodl-container-fa4746a7fc-c6053b3e:866:1518 [1] NCCL INFO 2 coll channels, 2 collnet channels, 0 nvls channels, 2 p2p channels, 2 p2p channels per peer
# 172.17.0.4: autodl-container-fa4746a7fc-5df31a9e:1346:2134 [1] NCCL INFO threadThresholds 8/8/64 | 32/8/64 | 512 | 512
# 172.17.0.4: autodl-container-fa4746a7fc-5df31a9e:1346:2134 [1] NCCL INFO 2 coll channels, 2 collnet channels, 0 nvls channels, 2 p2p channels, 2 p2p channels per peer
# 172.17.0.4: autodl-container-fa4746a7fc-5df31a9e:1345:2026 [0] NCCL INFO ncclCommSplit comm 0x11908a10 rank 0 nranks 4 cudaDev 0 nvmlDev 0 busId 38000 parent 0xe4c5920 splitCount 1 color 2003953581 key 0 - Init COMPLETE
# 172.17.0.4: autodl-container-fa4746a7fc-5df31a9e:1345:2026 [0] NCCL INFO Init timings - ncclCommSplit: rank 0 nranks 4 total 11.83 (kernels 0.00, alloc 0.00, bootstrap 0.00, allgathers 0.01, topo 0.00, graphs 0.00, connections 0.00, rest 11.82)
# 172.17.0.4: autodl-container-fa4746a7fc-5df31a9e:1346:2134 [1] NCCL INFO ncclCommSplit comm 0x187c80a0 rank 1 nranks 4 cudaDev 1 nvmlDev 1 busId a8000 parent 0xe22f3e0 splitCount 1 color 2003953581 key 1 - Init COMPLETE
# 172.17.0.4: autodl-container-fa4746a7fc-5df31a9e:1346:2134 [1] NCCL INFO Init timings - ncclCommSplit: rank 1 nranks 4 total 5.32 (kernels 0.00, alloc 0.00, bootstrap 0.00, allgathers 0.01, topo 0.00, graphs 0.00, connections 0.00, rest 5.30)
# 172.17.0.7: autodl-container-fa4746a7fc-c6053b3e:866:1518 [1] NCCL INFO ncclCommSplit comm 0x18843b70 rank 3 nranks 4 cudaDev 1 nvmlDev 1 busId c8000 parent 0xe296820 splitCount 1 color 2003953581 key 3 - Init COMPLETE
# 172.17.0.7: autodl-container-fa4746a7fc-c6053b3e:866:1518 [1] NCCL INFO Init timings - ncclCommSplit: rank 3 nranks 4 total 11.83 (kernels 0.00, alloc 0.00, bootstrap 0.00, allgathers 0.00, topo 0.01, graphs 0.00, connections 0.00, rest 11.81)
# 172.17.0.7: autodl-container-fa4746a7fc-c6053b3e:865:1628 [0] NCCL INFO ncclCommSplit comm 0xfada8c0 rank 2 nranks 4 cudaDev 0 nvmlDev 0 busId 27000 parent 0xc696350 splitCount 1 color 2003953581 key 2 - Init COMPLETE
# 172.17.0.7: autodl-container-fa4746a7fc-c6053b3e:865:1628 [0] NCCL INFO Init timings - ncclCommSplit: rank 2 nranks 4 total 0.02 (kernels 0.00, alloc 0.00, bootstrap 0.00, allgathers 0.00, topo 0.01, graphs 0.00, connections 0.00, rest 0.00)
# 172.17.0.7: autodl-container-fa4746a7fc-c6053b3e:865:1635 [0] NCCL INFO [Proxy Progress] Device 0 CPU core 141
# 172.17.0.7: autodl-container-fa4746a7fc-c6053b3e:865:1633 [0] NCCL INFO Channel 00/0 : 1[1] -> 2[0] [receive] via NET/Socket/0
# 172.17.0.7: autodl-container-fa4746a7fc-c6053b3e:865:1633 [0] NCCL INFO Channel 01/0 : 1[1] -> 2[0] [receive] via NET/Socket/0
# 172.17.0.7: autodl-container-fa4746a7fc-c6053b3e:865:1633 [0] NCCL INFO Channel 00 : 2[0] -> 3[1] via SHM/direct/direct
# 172.17.0.7: autodl-container-fa4746a7fc-c6053b3e:865:1633 [0] NCCL INFO Channel 01 : 2[0] -> 3[1] via SHM/direct/direct
# 172.17.0.4: autodl-container-fa4746a7fc-5df31a9e:1345:2144 [0] NCCL INFO [Proxy Progress] Device 0 CPU core 19
# 172.17.0.4: autodl-container-fa4746a7fc-5df31a9e:1345:2142 [0] NCCL INFO Channel 00/0 : 3[1] -> 0[0] [receive] via NET/Socket/0
# 172.17.0.4: autodl-container-fa4746a7fc-5df31a9e:1345:2142 [0] NCCL INFO Channel 01/0 : 3[1] -> 0[0] [receive] via NET/Socket/0
# 172.17.0.4: autodl-container-fa4746a7fc-5df31a9e:1345:2142 [0] NCCL INFO Channel 00 : 0[0] -> 1[1] via SHM/direct/direct
# 172.17.0.4: autodl-container-fa4746a7fc-5df31a9e:1345:2142 [0] NCCL INFO Channel 01 : 0[0] -> 1[1] via SHM/direct/direct
# 172.17.0.4: autodl-container-fa4746a7fc-5df31a9e:1346:2145 [1] NCCL INFO [Proxy Progress] Device 1 CPU core 78
# 172.17.0.4: autodl-container-fa4746a7fc-5df31a9e:1346:2143 [1] NCCL INFO Channel 00/0 : 1[1] -> 2[0] [send] via NET/Socket/0
# 172.17.0.4: autodl-container-fa4746a7fc-5df31a9e:1346:2143 [1] NCCL INFO Channel 01/0 : 1[1] -> 2[0] [send] via NET/Socket/0
# 172.17.0.7: autodl-container-fa4746a7fc-c6053b3e:866:1636 [1] NCCL INFO [Proxy Progress] Device 1 CPU core 182
# 172.17.0.7: autodl-container-fa4746a7fc-c6053b3e:866:1634 [1] NCCL INFO Channel 00/0 : 3[1] -> 0[0] [send] via NET/Socket/0
# 172.17.0.7: autodl-container-fa4746a7fc-c6053b3e:866:1634 [1] NCCL INFO Channel 01/0 : 3[1] -> 0[0] [send] via NET/Socket/0
# 172.17.0.4: autodl-container-fa4746a7fc-5df31a9e:1346:2143 [1] NCCL INFO Connected all rings, use ring PXN 0 GDR 0
# 172.17.0.4: autodl-container-fa4746a7fc-5df31a9e:1345:2142 [0] NCCL INFO Connected all rings, use ring PXN 0 GDR 0
# 172.17.0.7: autodl-container-fa4746a7fc-c6053b3e:865:1633 [0] NCCL INFO Connected all rings, use ring PXN 0 GDR 0
# 172.17.0.7: autodl-container-fa4746a7fc-c6053b3e:866:1634 [1] NCCL INFO Connected all rings, use ring PXN 0 GDR 0
# 172.17.0.4: [2025-10-23 14:50:35,865] [WARNING] [lr_schedules.py:686:get_lr] Attempting to get learning rate from scheduler before it has started
# 172.17.0.7: [2025-10-23 14:50:36,098] [WARNING] [lr_schedules.py:686:get_lr] Attempting to get learning rate from scheduler before it has started
# 172.17.0.7: [2025-10-23 14:50:36,123] [WARNING] [lr_schedules.py:686:get_lr] Attempting to get learning rate from scheduler before it has started
# 172.17.0.4: [2025-10-23 14:50:36,469] [WARNING] [lr_schedules.py:686:get_lr] Attempting to get learning rate from scheduler before it has started
# 172.17.0.4: 2025-10-23 14:50:40,563 - INFO - Epoch 1/3, Batch 50, Avg Loss (last 50): 9.1440
# 172.17.0.4: 2025-10-23 14:50:44,031 - INFO - Epoch 1/3, Batch 100, Avg Loss (last 50): 7.3683
# 172.17.0.4: 2025-10-23 14:50:46,907 - INFO - Epoch 1/3, Avg Loss: 7.9648
# 172.17.0.4: autodl-container-fa4746a7fc-5df31a9e:1345:2196 [0] NCCL INFO [Proxy Progress] Device 0 CPU core 105
# 172.17.0.4: autodl-container-fa4746a7fc-5df31a9e:1345:2195 [0] NCCL INFO Channel 00/0 : 3[1] -> 0[0] [receive] via NET/Socket/0
# 172.17.0.4: autodl-container-fa4746a7fc-5df31a9e:1345:2195 [0] NCCL INFO Channel 01/0 : 3[1] -> 0[0] [receive] via NET/Socket/0
# 172.17.0.4: autodl-container-fa4746a7fc-5df31a9e:1345:2195 [0] NCCL INFO Channel 00 : 0[0] -> 1[1] via SHM/direct/direct
# 172.17.0.4: autodl-container-fa4746a7fc-5df31a9e:1345:2195 [0] NCCL INFO Channel 01 : 0[0] -> 1[1] via SHM/direct/direct
# 172.17.0.7: autodl-container-fa4746a7fc-c6053b3e:865:1681 [0] NCCL INFO [Proxy Progress] Device 0 CPU core 11
# 172.17.0.7: autodl-container-fa4746a7fc-c6053b3e:865:1679 [0] NCCL INFO Channel 00/0 : 1[1] -> 2[0] [receive] via NET/Socket/0
# 172.17.0.7: autodl-container-fa4746a7fc-c6053b3e:865:1679 [0] NCCL INFO Channel 01/0 : 1[1] -> 2[0] [receive] via NET/Socket/0
# 172.17.0.7: autodl-container-fa4746a7fc-c6053b3e:865:1679 [0] NCCL INFO Channel 00 : 2[0] -> 3[1] via SHM/direct/direct
# 172.17.0.7: autodl-container-fa4746a7fc-c6053b3e:865:1679 [0] NCCL INFO Channel 01 : 2[0] -> 3[1] via SHM/direct/direct
# 172.17.0.4: autodl-container-fa4746a7fc-5df31a9e:1346:2197 [1] NCCL INFO [Proxy Progress] Device 1 CPU core 67
# 172.17.0.4: autodl-container-fa4746a7fc-5df31a9e:1346:2194 [1] NCCL INFO Channel 00/0 : 1[1] -> 2[0] [send] via NET/Socket/0
# 172.17.0.4: autodl-container-fa4746a7fc-5df31a9e:1346:2194 [1] NCCL INFO Channel 01/0 : 1[1] -> 2[0] [send] via NET/Socket/0
# 172.17.0.7: autodl-container-fa4746a7fc-c6053b3e:866:1682 [1] NCCL INFO [Proxy Progress] Device 1 CPU core 158
# 172.17.0.7: autodl-container-fa4746a7fc-c6053b3e:866:1680 [1] NCCL INFO Channel 00/0 : 3[1] -> 0[0] [send] via NET/Socket/0
# 172.17.0.7: autodl-container-fa4746a7fc-c6053b3e:866:1680 [1] NCCL INFO Channel 01/0 : 3[1] -> 0[0] [send] via NET/Socket/0
# 172.17.0.7: autodl-container-fa4746a7fc-c6053b3e:866:1680 [1] NCCL INFO Connected all rings, use ring PXN 0 GDR 0
# 172.17.0.4: autodl-container-fa4746a7fc-5df31a9e:1345:2195 [0] NCCL INFO Connected all rings, use ring PXN 0 GDR 0
# 172.17.0.7: autodl-container-fa4746a7fc-c6053b3e:865:1679 [0] NCCL INFO Connected all rings, use ring PXN 0 GDR 0
# 172.17.0.4: autodl-container-fa4746a7fc-5df31a9e:1346:2194 [1] NCCL INFO Connected all rings, use ring PXN 0 GDR 0
# 172.17.0.4: 2025-10-23 14:50:47,406 - INFO - 已保存检查点: checkpoints/epoch1
# 172.17.0.4: 2025-10-23 14:50:50,996 - INFO - Epoch 2/3, Batch 50, Avg Loss (last 50): 7.2227
# 172.17.0.4: 2025-10-23 14:50:54,467 - INFO - Epoch 2/3, Batch 100, Avg Loss (last 50): 7.2251
# 172.17.0.4: 2025-10-23 14:50:57,325 - INFO - Epoch 2/3, Avg Loss: 7.2233
# 172.17.0.4: 2025-10-23 14:50:57,720 - INFO - 已保存检查点: checkpoints/epoch2
# 172.17.0.4: 2025-10-23 14:51:01,347 - INFO - Epoch 3/3, Batch 50, Avg Loss (last 50): 7.2205
# 172.17.0.4: 2025-10-23 14:51:04,821 - INFO - Epoch 3/3, Batch 100, Avg Loss (last 50): 7.2204
# 172.17.0.4: 2025-10-23 14:51:07,672 - INFO - Epoch 3/3, Avg Loss: 7.2157
# 172.17.0.4: 2025-10-23 14:51:08,115 - INFO - 已保存检查点: checkpoints/epoch3
# 172.17.0.4: 2025-10-23 14:51:08,272 - INFO - 最终模型已保存至 models/final_model.pth
# 172.17.0.4: autodl-container-fa4746a7fc-5df31a9e:1345:1345 [0] NCCL INFO comm 0x11908a10 rank 0 nranks 4 cudaDev 0 busId 38000 - Destroy COMPLETE
# 172.17.0.7: autodl-container-fa4746a7fc-c6053b3e:865:865 [0] NCCL INFO comm 0xfada8c0 rank 2 nranks 4 cudaDev 0 busId 27000 - Destroy COMPLETE
# 172.17.0.7: autodl-container-fa4746a7fc-c6053b3e:866:866 [1] NCCL INFO comm 0x18843b70 rank 3 nranks 4 cudaDev 1 busId c8000 - Destroy COMPLETE
# 172.17.0.4: autodl-container-fa4746a7fc-5df31a9e:1346:1346 [1] NCCL INFO comm 0x187c80a0 rank 1 nranks 4 cudaDev 1 busId a8000 - Destroy COMPLETE
# 172.17.0.4: autodl-container-fa4746a7fc-5df31a9e:1345:1345 [0] NCCL INFO comm 0xe4c5920 rank 0 nranks 4 cudaDev 0 busId 38000 - Destroy COMPLETE
# 172.17.0.4: 2025-10-23 14:51:09,008 - INFO - 已成功销毁 torch.distributed 进程组 (destroy_process_group)
# 172.17.0.7: autodl-container-fa4746a7fc-c6053b3e:865:865 [0] NCCL INFO comm 0xc696350 rank 2 nranks 4 cudaDev 0 busId 27000 - Destroy COMPLETE
# 172.17.0.4: autodl-container-fa4746a7fc-5df31a9e:1346:1346 [1] NCCL INFO comm 0xe22f3e0 rank 1 nranks 4 cudaDev 1 busId a8000 - Destroy COMPLETE
# 172.17.0.7: autodl-container-fa4746a7fc-c6053b3e:866:866 [1] NCCL INFO comm 0xe296820 rank 3 nranks 4 cudaDev 1 busId c8000 - Destroy COMPLETE
# 172.17.0.7: [2025-10-23 14:51:11,319] [INFO] [launch.py:367:main] Process 866 exits successfully.
# 172.17.0.7: [2025-10-23 14:51:11,320] [INFO] [launch.py:367:main] Process 865 exits successfully.
# 172.17.0.4: [2025-10-23 14:51:11,956] [INFO] [launch.py:367:main] Process 1346 exits successfully.
# 172.17.0.4: [2025-10-23 14:51:11,956] [INFO] [launch.py:367:main] Process 1345 exits successfully.
# Verify the results on the master host (172.17.0.4): print the checkpoint
# and model directories as a tree. According to the training log above, this
# host saved checkpoints/epoch1..epoch3 and models/final_model.pth.
# NOTE(review): the NCCL log places ranks 0 and 1 on this host and ranks 2
# and 3 on 172.17.0.7, so only the rank-0/rank-1 optimizer shards are listed
# here; presumably the remaining shards live on 172.17.0.7 - confirm there.
tree checkpoints/ models/
# checkpoints/
# ├── epoch1
# │ ├── mp_rank_00_model_states.pt
# │ ├── zero_pp_rank_0_mp_rank_00_optim_states.pt
# │ └── zero_pp_rank_1_mp_rank_00_optim_states.pt
# ├── epoch2
# │ ├── mp_rank_00_model_states.pt
# │ ├── zero_pp_rank_0_mp_rank_00_optim_states.pt
# │ └── zero_pp_rank_1_mp_rank_00_optim_states.pt
# ├── epoch3
# │ ├── mp_rank_00_model_states.pt
# │ ├── zero_pp_rank_0_mp_rank_00_optim_states.pt
# │ └── zero_pp_rank_1_mp_rank_00_optim_states.pt
# ├── latest
# └── zero_to_fp32.py
# models/
# └── final_model.pth
# 0 directories, 1 file
# Long listing with a human-readable size (-l, -h) to check the size and
# timestamp of the exported final model. -r and -t only influence sort order,
# so they have no effect when a single file is listed.
ls -lhrt models/final_model.pth
# -rw-r--r-- 1 root root 127M Oct 23 14:51 models/final_model.pth
# DeepSpeed-Stage-ZeRO-2
# Launch the ZeRO-2 training script (per its file name) with the DeepSpeed
# multi-node launcher:
#   --hostfile hostfile        node list; the log below shows it resolves to
#                              workers 172.17.0.4 and 172.17.0.7, with local
#                              GPUs [0, 1] on each (dist_world_size=4)
#   --master_addr/--master_port  address and port of the master node
#                              (172.17.0.4:29500), forwarded unchanged to
#                              deepspeed.launcher.launch on every worker
# The launcher fans the command out via pdsh, as shown in the recorded `cmd =`
# line, so the working directory and script must exist on both hosts.
deepspeed --hostfile hostfile --master_addr=172.17.0.4 --master_port=29500 DeepSpeed-GPTLike-ZeRO-2.py
# [2025-10-23 15:00:50,158] [INFO] [multinode_runner.py:85:get_cmd] Running on the following workers: 172.17.0.4,172.17.0.7
# [2025-10-23 15:00:50,158] [INFO] [runner.py:630:main] cmd = pdsh -S -f 1024 -w 172.17.0.4,172.17.0.7 export NCCL_P2P_DISABLE=1; export NCCL_SOCKET_IFNAME=eth0; export NCCL_DEBUG=INFO; export NCCL_IB_DISABLE=1; export PYTHONPATH=/data/llm-in-practise/LLM_Distributed_Trainning/DeepSpeed/DeepSpeed-GPTLike-ZeRO-2; cd /data/llm-in-practise/LLM_Distributed_Trainning/DeepSpeed/DeepSpeed-GPTLike-ZeRO-2; /root/miniconda3/bin/python -u -m deepspeed.launcher.launch --world_info=eyIxNzIuMTcuMC40IjogWzAsIDFdLCAiMTcyLjE3LjAuNyI6IFswLCAxXX0= --node_rank=%n --master_addr=172.17.0.4 --master_port=29500 --enable_each_rank_log=None DeepSpeed-GPTLike-ZeRO-2.py
# 172.17.0.4: [2025-10-23 15:00:54,675] [INFO] [launch.py:155:main] 0 NCCL_P2P_DISABLE=1
# 172.17.0.4: [2025-10-23 15:00:54,675] [INFO] [launch.py:155:main] 0 NCCL_SOCKET_IFNAME=eth0
# 172.17.0.4: [2025-10-23 15:00:54,675] [INFO] [launch.py:155:main] 0 NCCL_DEBUG=INFO
# 172.17.0.4: [2025-10-23 15:00:54,675] [INFO] [launch.py:155:main] 0 TORCH_NCCL_BLOCKING_WAIT=1
# 172.17.0.4: [2025-10-23 15:00:54,675] [INFO] [launch.py:155:main] 0 NCCL_IB_DISABLE=1
# 172.17.0.4: [2025-10-23 15:00:54,675] [INFO] [launch.py:162:main] WORLD INFO DICT: {'172.17.0.4': [0, 1], '172.17.0.7': [0, 1]}
# 172.17.0.4: [2025-10-23 15:00:54,675] [INFO] [launch.py:168:main] nnodes=2, num_local_procs=2, node_rank=0
# 172.17.0.4: [2025-10-23 15:00:54,675] [INFO] [launch.py:179:main] global_rank_mapping=defaultdict(<class 'list'>, {'172.17.0.4': [0, 1], '172.17.0.7': [2, 3]})
# 172.17.0.4: [2025-10-23 15:00:54,675] [INFO] [launch.py:180:main] dist_world_size=4
# 172.17.0.4: [2025-10-23 15:00:54,675] [INFO] [launch.py:184:main] Setting CUDA_VISIBLE_DEVICES=0,1
# 172.17.0.4: [2025-10-23 15:00:54,676] [INFO] [launch.py:272:main] process 2932 spawned with command: ['/root/miniconda3/bin/python', '-u', 'DeepSpeed-GPTLike-ZeRO-2.py', '--local_rank=0']
# 172.17.0.4: [2025-10-23 15:00:54,676] [INFO] [launch.py:272:main] process 2933 spawned with command: ['/root/miniconda3/bin/python', '-u', 'DeepSpeed-GPTLike-ZeRO-2.py', '--local_rank=1']
# 172.17.0.7: [2025-10-23 15:00:54,942] [INFO] [launch.py:155:main] 1 NCCL_P2P_DISABLE=1
# 172.17.0.7: [2025-10-23 15:00:54,943] [INFO] [launch.py:155:main] 1 NCCL_SOCKET_IFNAME=eth0
# 172.17.0.7: [2025-10-23 15:00:54,943] [INFO] [launch.py:155:main] 1 NCCL_DEBUG=INFO
# 172.17.0.7: [2025-10-23 15:00:54,943] [INFO] [launch.py:155:main] 1 TORCH_NCCL_BLOCKING_WAIT=1
# 172.17.0.7: [2025-10-23 15:00:54,943] [INFO] [launch.py:155:main] 1 NCCL_IB_DISABLE=1
# 172.17.0.7: [2025-10-23 15:00:54,943] [INFO] [launch.py:162:main] WORLD INFO DICT: {'172.17.0.4': [0, 1], '172.17.0.7': [0, 1]}
# 172.17.0.7: [2025-10-23 15:00:54,943] [INFO] [launch.py:168:main] nnodes=2, num_local_procs=2, node_rank=1
# 172.17.0.7: [2025-10-23 15:00:54,943] [INFO] [launch.py:179:main] global_rank_mapping=defaultdict(<class 'list'>, {'172.17.0.4': [0, 1], '172.17.0.7': [2, 3]})
# 172.17.0.7: [2025-10-23 15:00:54,943] [INFO] [launch.py:180:main] dist_world_size=4
# 172.17.0.7: [2025-10-23 15:00:54,943] [INFO] [launch.py:184:main] Setting CUDA_VISIBLE_DEVICES=0,1
# 172.17.0.7: [2025-10-23 15:00:54,944] [INFO] [launch.py:272:main] process 2219 spawned with command: ['/root/miniconda3/bin/python', '-u', 'DeepSpeed-GPTLike-ZeRO-2.py', '--local_rank=0']
# 172.17.0.7: [2025-10-23 15:00:54,944] [INFO] [launch.py:272:main] process 2220 spawned with command: ['/root/miniconda3/bin/python', '-u', 'DeepSpeed-GPTLike-ZeRO-2.py', '--local_rank=1']
# 172.17.0.4: 2025-10-23 15:00:59,612 - INFO - DataLoader batch_size=16
# 172.17.0.4: 2025-10-23 15:00:59,773 - INFO - DataLoader batch_size=16
# 172.17.0.7: 2025-10-23 15:00:59,994 - INFO - DataLoader batch_size=16
# 172.17.0.7: 2025-10-23 15:01:00,078 - INFO - DataLoader batch_size=16
# 172.17.0.4: autodl-container-fa4746a7fc-5df31a9e:2932:2932 [0] NCCL INFO NCCL_SOCKET_IFNAME set by environment to eth0
# 172.17.0.4: autodl-container-fa4746a7fc-5df31a9e:2932:2932 [0] NCCL INFO Bootstrap: Using eth0:172.17.0.4<0>
# 172.17.0.4: autodl-container-fa4746a7fc-5df31a9e:2932:2932 [0] NCCL INFO cudaDriverVersion 13000
# 172.17.0.4: autodl-container-fa4746a7fc-5df31a9e:2932:2932 [0] NCCL INFO NCCL version 2.27.3+cuda12.9
# 172.17.0.7: autodl-container-fa4746a7fc-c6053b3e:2219:2219 [0] NCCL INFO cudaDriverVersion 13000
# 172.17.0.7: autodl-container-fa4746a7fc-c6053b3e:2219:2219 [0] NCCL INFO NCCL_SOCKET_IFNAME set by environment to eth0
# 172.17.0.7: autodl-container-fa4746a7fc-c6053b3e:2219:2219 [0] NCCL INFO Bootstrap: Using eth0:172.17.0.7<0>
# 172.17.0.7: autodl-container-fa4746a7fc-c6053b3e:2219:2219 [0] NCCL INFO NCCL version 2.27.3+cuda12.9
# 172.17.0.7: autodl-container-fa4746a7fc-c6053b3e:2219:2219 [0] NCCL INFO Comm config Blocking set to 1
# 172.17.0.7: autodl-container-fa4746a7fc-c6053b3e:2220:2220 [1] NCCL INFO cudaDriverVersion 13000
# 172.17.0.7: autodl-container-fa4746a7fc-c6053b3e:2220:2220 [1] NCCL INFO NCCL_SOCKET_IFNAME set by environment to eth0
# 172.17.0.7: autodl-container-fa4746a7fc-c6053b3e:2220:2220 [1] NCCL INFO Bootstrap: Using eth0:172.17.0.7<0>
# 172.17.0.7: autodl-container-fa4746a7fc-c6053b3e:2220:2220 [1] NCCL INFO NCCL version 2.27.3+cuda12.9
# 172.17.0.7: autodl-container-fa4746a7fc-c6053b3e:2220:2220 [1] NCCL INFO Comm config Blocking set to 1
# 172.17.0.4: autodl-container-fa4746a7fc-5df31a9e:2933:2933 [1] NCCL INFO cudaDriverVersion 13000
# 172.17.0.4: autodl-container-fa4746a7fc-5df31a9e:2933:2933 [1] NCCL INFO NCCL_SOCKET_IFNAME set by environment to eth0
# 172.17.0.4: autodl-container-fa4746a7fc-5df31a9e:2933:2933 [1] NCCL INFO Bootstrap: Using eth0:172.17.0.4<0>
# 172.17.0.4: autodl-container-fa4746a7fc-5df31a9e:2933:2933 [1] NCCL INFO NCCL version 2.27.3+cuda12.9
# 172.17.0.4: autodl-container-fa4746a7fc-5df31a9e:2933:2933 [1] NCCL INFO Comm config Blocking set to 1
# 172.17.0.4: autodl-container-fa4746a7fc-5df31a9e:2932:2932 [0] NCCL INFO Comm config Blocking set to 1
# 172.17.0.7: autodl-container-fa4746a7fc-c6053b3e:2219:2741 [0] NCCL INFO NET/Plugin: Could not find: libnccl-net.so.
# 172.17.0.7: autodl-container-fa4746a7fc-c6053b3e:2219:2741 [0] NCCL INFO NCCL_IB_DISABLE set by environment to 1.
# 172.17.0.7: autodl-container-fa4746a7fc-c6053b3e:2219:2741 [0] NCCL INFO NCCL_SOCKET_IFNAME set by environment to eth0
# 172.17.0.7: autodl-container-fa4746a7fc-c6053b3e:2219:2741 [0] NCCL INFO NET/Socket : Using [0]eth0:172.17.0.7<0>
# 172.17.0.7: autodl-container-fa4746a7fc-c6053b3e:2219:2741 [0] NCCL INFO Initialized NET plugin Socket
# 172.17.0.7: autodl-container-fa4746a7fc-c6053b3e:2219:2741 [0] NCCL INFO Assigned NET plugin Socket to comm
# 172.17.0.7: autodl-container-fa4746a7fc-c6053b3e:2219:2741 [0] NCCL INFO Using network Socket
# 172.17.0.7: autodl-container-fa4746a7fc-c6053b3e:2219:2741 [0] NCCL INFO ncclCommInitRankConfig comm 0xc8760f0 rank 2 nranks 4 cudaDev 0 nvmlDev 0 busId 27000 commId 0xf7dd7ba1ea1267a4 - Init START
# 172.17.0.7: autodl-container-fa4746a7fc-c6053b3e:2220:2742 [1] NCCL INFO NET/Plugin: Could not find: libnccl-net.so.
# 172.17.0.7: autodl-container-fa4746a7fc-c6053b3e:2220:2742 [1] NCCL INFO NCCL_IB_DISABLE set by environment to 1.
# 172.17.0.7: autodl-container-fa4746a7fc-c6053b3e:2220:2742 [1] NCCL INFO NCCL_SOCKET_IFNAME set by environment to eth0
# 172.17.0.7: autodl-container-fa4746a7fc-c6053b3e:2220:2742 [1] NCCL INFO NET/Socket : Using [0]eth0:172.17.0.7<0>
# 172.17.0.7: autodl-container-fa4746a7fc-c6053b3e:2220:2742 [1] NCCL INFO Initialized NET plugin Socket
# 172.17.0.7: autodl-container-fa4746a7fc-c6053b3e:2220:2742 [1] NCCL INFO Assigned NET plugin Socket to comm
# 172.17.0.7: autodl-container-fa4746a7fc-c6053b3e:2220:2742 [1] NCCL INFO Using network Socket
# 172.17.0.7: autodl-container-fa4746a7fc-c6053b3e:2220:2742 [1] NCCL INFO ncclCommInitRankConfig comm 0xd0df8d0 rank 3 nranks 4 cudaDev 1 nvmlDev 1 busId c8000 commId 0xf7dd7ba1ea1267a4 - Init START
# 172.17.0.4: autodl-container-fa4746a7fc-5df31a9e:2933:3460 [1] NCCL INFO NET/Plugin: Could not find: libnccl-net.so.
# 172.17.0.4: autodl-container-fa4746a7fc-5df31a9e:2933:3460 [1] NCCL INFO NCCL_IB_DISABLE set by environment to 1.
# 172.17.0.4: autodl-container-fa4746a7fc-5df31a9e:2933:3460 [1] NCCL INFO NCCL_SOCKET_IFNAME set by environment to eth0
# 172.17.0.4: autodl-container-fa4746a7fc-5df31a9e:2933:3460 [1] NCCL INFO NET/Socket : Using [0]eth0:172.17.0.4<0>
# 172.17.0.4: autodl-container-fa4746a7fc-5df31a9e:2933:3460 [1] NCCL INFO Initialized NET plugin Socket
# 172.17.0.4: autodl-container-fa4746a7fc-5df31a9e:2933:3460 [1] NCCL INFO Assigned NET plugin Socket to comm
# 172.17.0.4: autodl-container-fa4746a7fc-5df31a9e:2933:3460 [1] NCCL INFO Using network Socket
# 172.17.0.4: autodl-container-fa4746a7fc-5df31a9e:2933:3460 [1] NCCL INFO ncclCommInitRankConfig comm 0xcfb55a0 rank 1 nranks 4 cudaDev 1 nvmlDev 1 busId a8000 commId 0xf7dd7ba1ea1267a4 - Init START
# 172.17.0.7: autodl-container-fa4746a7fc-c6053b3e:2219:2741 [0] NCCL INFO RAS client listening socket at ::1<28028>
# 172.17.0.4: autodl-container-fa4746a7fc-5df31a9e:2932:3461 [0] NCCL INFO NET/Plugin: Could not find: libnccl-net.so.
# 172.17.0.4: autodl-container-fa4746a7fc-5df31a9e:2932:3461 [0] NCCL INFO NCCL_IB_DISABLE set by environment to 1.
# 172.17.0.4: autodl-container-fa4746a7fc-5df31a9e:2932:3461 [0] NCCL INFO NCCL_SOCKET_IFNAME set by environment to eth0
# 172.17.0.4: autodl-container-fa4746a7fc-5df31a9e:2932:3461 [0] NCCL INFO NET/Socket : Using [0]eth0:172.17.0.4<0>
# 172.17.0.4: autodl-container-fa4746a7fc-5df31a9e:2932:3461 [0] NCCL INFO Initialized NET plugin Socket
# 172.17.0.4: autodl-container-fa4746a7fc-5df31a9e:2932:3461 [0] NCCL INFO Assigned NET plugin Socket to comm
# 172.17.0.4: autodl-container-fa4746a7fc-5df31a9e:2932:3461 [0] NCCL INFO Using network Socket
# 172.17.0.4: autodl-container-fa4746a7fc-5df31a9e:2932:3461 [0] NCCL INFO ncclCommInitRankConfig comm 0xe2481c0 rank 0 nranks 4 cudaDev 0 nvmlDev 0 busId 38000 commId 0xf7dd7ba1ea1267a4 - Init START
# 172.17.0.4: autodl-container-fa4746a7fc-5df31a9e:2932:3461 [0] NCCL INFO RAS client listening socket at ::1<28028>
# 172.17.0.7: autodl-container-fa4746a7fc-c6053b3e:2220:2742 [1] NCCL INFO RAS client listening socket at ::1<28028>
# 172.17.0.4: autodl-container-fa4746a7fc-5df31a9e:2933:3460 [1] NCCL INFO RAS client listening socket at ::1<28028>
# 172.17.0.4: autodl-container-fa4746a7fc-5df31a9e:2932:3461 [0] NCCL INFO Bootstrap timings total 0.000615 (create 0.000017, send 0.000064, recv 0.000172, ring 0.000106, delay 0.000000)
# 172.17.0.7: autodl-container-fa4746a7fc-c6053b3e:2219:2741 [0] NCCL INFO Bootstrap timings total 0.073540 (create 0.000026, send 0.000133, recv 0.010590, ring 0.019064, delay 0.000000)
# 172.17.0.4: autodl-container-fa4746a7fc-5df31a9e:2933:3460 [1] NCCL INFO Bootstrap timings total 0.034820 (create 0.000019, send 0.000070, recv 0.000087, ring 0.000072, delay 0.000000)
# 172.17.0.7: autodl-container-fa4746a7fc-c6053b3e:2220:2742 [1] NCCL INFO Bootstrap timings total 0.071995 (create 0.000019, send 0.000100, recv 0.050070, ring 0.000113, delay 0.000000)
# 172.17.0.7: autodl-container-fa4746a7fc-c6053b3e:2220:2742 [1] NCCL INFO NCCL_P2P_DISABLE set by environment to 1
# 172.17.0.4: autodl-container-fa4746a7fc-5df31a9e:2932:3461 [0] NCCL INFO NCCL_P2P_DISABLE set by environment to 1
# 172.17.0.4: autodl-container-fa4746a7fc-5df31a9e:2933:3460 [1] NCCL INFO NCCL_P2P_DISABLE set by environment to 1
# 172.17.0.7: autodl-container-fa4746a7fc-c6053b3e:2219:2741 [0] NCCL INFO NCCL_P2P_DISABLE set by environment to 1
# 172.17.0.7: autodl-container-fa4746a7fc-c6053b3e:2219:2741 [0] NCCL INFO Setting affinity for GPU 0 to 0-51,104-155
# 172.17.0.7: autodl-container-fa4746a7fc-c6053b3e:2220:2742 [1] NCCL INFO Setting affinity for GPU 1 to 52-103,156-207
# 172.17.0.4: autodl-container-fa4746a7fc-5df31a9e:2933:3460 [1] NCCL INFO Setting affinity for GPU 1 to 52-103,156-207
# 172.17.0.4: autodl-container-fa4746a7fc-5df31a9e:2932:3461 [0] NCCL INFO Setting affinity for GPU 0 to 0-51,104-155
# 172.17.0.4: autodl-container-fa4746a7fc-5df31a9e:2933:3460 [1] NCCL INFO comm 0xcfb55a0 rank 1 nRanks 4 nNodes 2 localRanks 2 localRank 1 MNNVL 0
# 172.17.0.4: autodl-container-fa4746a7fc-5df31a9e:2933:3460 [1] NCCL INFO Trees [0] -1/-1/-1->1->0 [1] -1/-1/-1->1->0
# 172.17.0.4: autodl-container-fa4746a7fc-5df31a9e:2933:3460 [1] NCCL INFO P2P Chunksize set to 131072
# 172.17.0.7: autodl-container-fa4746a7fc-c6053b3e:2220:2742 [1] NCCL INFO comm 0xd0df8d0 rank 3 nRanks 4 nNodes 2 localRanks 2 localRank 1 MNNVL 0
# 172.17.0.7: autodl-container-fa4746a7fc-c6053b3e:2220:2742 [1] NCCL INFO Trees [0] -1/-1/-1->3->2 [1] -1/-1/-1->3->2
# 172.17.0.7: autodl-container-fa4746a7fc-c6053b3e:2220:2742 [1] NCCL INFO P2P Chunksize set to 131072
# 172.17.0.4: autodl-container-fa4746a7fc-5df31a9e:2932:3461 [0] NCCL INFO comm 0xe2481c0 rank 0 nRanks 4 nNodes 2 localRanks 2 localRank 0 MNNVL 0
# 172.17.0.7: autodl-container-fa4746a7fc-c6053b3e:2219:2741 [0] NCCL INFO comm 0xc8760f0 rank 2 nRanks 4 nNodes 2 localRanks 2 localRank 0 MNNVL 0
# 172.17.0.4: autodl-container-fa4746a7fc-5df31a9e:2932:3461 [0] NCCL INFO Channel 00/02 : 0 1 2 3
# 172.17.0.4: autodl-container-fa4746a7fc-5df31a9e:2932:3461 [0] NCCL INFO Channel 01/02 : 0 1 2 3
# 172.17.0.4: autodl-container-fa4746a7fc-5df31a9e:2932:3461 [0] NCCL INFO Trees [0] 1/2/-1->0->-1 [1] 1/-1/-1->0->2
# 172.17.0.4: autodl-container-fa4746a7fc-5df31a9e:2932:3461 [0] NCCL INFO P2P Chunksize set to 131072
# 172.17.0.7: autodl-container-fa4746a7fc-c6053b3e:2219:2741 [0] NCCL INFO Trees [0] 3/-1/-1->2->0 [1] 3/0/-1->2->-1
# 172.17.0.4: autodl-container-fa4746a7fc-5df31a9e:2933:3460 [1] NCCL INFO PROFILER/Plugin: Could not find: libnccl-profiler.so.
# 172.17.0.7: autodl-container-fa4746a7fc-c6053b3e:2219:2741 [0] NCCL INFO P2P Chunksize set to 131072
# 172.17.0.7: autodl-container-fa4746a7fc-c6053b3e:2220:2742 [1] NCCL INFO PROFILER/Plugin: Could not find: libnccl-profiler.so.
# 172.17.0.7: autodl-container-fa4746a7fc-c6053b3e:2220:2745 [1] NCCL INFO [Proxy Service] Device 1 CPU core 58
# 172.17.0.7: autodl-container-fa4746a7fc-c6053b3e:2219:2741 [0] NCCL INFO PROFILER/Plugin: Could not find: libnccl-profiler.so.
# 172.17.0.7: autodl-container-fa4746a7fc-c6053b3e:2220:2746 [1] NCCL INFO [Proxy Service UDS] Device 1 CPU core 167
# 172.17.0.4: autodl-container-fa4746a7fc-5df31a9e:2932:3461 [0] NCCL INFO PROFILER/Plugin: Could not find: libnccl-profiler.so.
# 172.17.0.4: autodl-container-fa4746a7fc-5df31a9e:2932:3461 [0] NCCL INFO Check P2P Type isAllDirectP2p 0 directMode 0
# 172.17.0.4: autodl-container-fa4746a7fc-5df31a9e:2933:3464 [1] NCCL INFO [Proxy Service] Device 1 CPU core 92
# 172.17.0.4: autodl-container-fa4746a7fc-5df31a9e:2933:3465 [1] NCCL INFO [Proxy Service UDS] Device 1 CPU core 197
# 172.17.0.7: autodl-container-fa4746a7fc-c6053b3e:2219:2747 [0] NCCL INFO [Proxy Service] Device 0 CPU core 107
# 172.17.0.7: autodl-container-fa4746a7fc-c6053b3e:2219:2748 [0] NCCL INFO [Proxy Service UDS] Device 0 CPU core 5
# 172.17.0.4: autodl-container-fa4746a7fc-5df31a9e:2932:3467 [0] NCCL INFO [Proxy Service UDS] Device 0 CPU core 109
# 172.17.0.4: autodl-container-fa4746a7fc-5df31a9e:2932:3466 [0] NCCL INFO [Proxy Service] Device 0 CPU core 108
# 172.17.0.7: autodl-container-fa4746a7fc-c6053b3e:2220:2742 [1] NCCL INFO threadThresholds 8/8/64 | 32/8/64 | 512 | 512
# 172.17.0.7: autodl-container-fa4746a7fc-c6053b3e:2220:2742 [1] NCCL INFO 2 coll channels, 2 collnet channels, 0 nvls channels, 2 p2p channels, 2 p2p channels per peer
# 172.17.0.4: autodl-container-fa4746a7fc-5df31a9e:2932:3461 [0] NCCL INFO threadThresholds 8/8/64 | 32/8/64 | 512 | 512
# 172.17.0.4: autodl-container-fa4746a7fc-5df31a9e:2932:3461 [0] NCCL INFO 2 coll channels, 2 collnet channels, 0 nvls channels, 2 p2p channels, 2 p2p channels per peer
# 172.17.0.7: autodl-container-fa4746a7fc-c6053b3e:2219:2741 [0] NCCL INFO threadThresholds 8/8/64 | 32/8/64 | 512 | 512
# 172.17.0.7: autodl-container-fa4746a7fc-c6053b3e:2219:2741 [0] NCCL INFO 2 coll channels, 2 collnet channels, 0 nvls channels, 2 p2p channels, 2 p2p channels per peer
# 172.17.0.4: autodl-container-fa4746a7fc-5df31a9e:2933:3460 [1] NCCL INFO threadThresholds 8/8/64 | 32/8/64 | 512 | 512
# 172.17.0.4: autodl-container-fa4746a7fc-5df31a9e:2933:3460 [1] NCCL INFO 2 coll channels, 2 collnet channels, 0 nvls channels, 2 p2p channels, 2 p2p channels per peer
# 172.17.0.4: autodl-container-fa4746a7fc-5df31a9e:2932:3461 [0] NCCL INFO CC Off, workFifoBytes 1048576
# 172.17.0.4: autodl-container-fa4746a7fc-5df31a9e:2933:3460 [1] NCCL INFO TUNER/Plugin: Could not find: libnccl-tuner.so. Using internal tuner plugin.
# 172.17.0.4: autodl-container-fa4746a7fc-5df31a9e:2933:3460 [1] NCCL INFO ncclCommInitRankConfig comm 0xcfb55a0 rank 1 nranks 4 cudaDev 1 nvmlDev 1 busId a8000 commId 0xf7dd7ba1ea1267a4 - Init COMPLETE
# 172.17.0.4: autodl-container-fa4746a7fc-5df31a9e:2933:3460 [1] NCCL INFO Init timings - ncclCommInitRankConfig: rank 1 nranks 4 total 0.28 (kernels 0.22, alloc 0.00, bootstrap 0.03, allgathers 0.01, topo 0.01, graphs 0.00, connections 0.00, rest 0.00)
# 172.17.0.7: autodl-container-fa4746a7fc-c6053b3e:2220:2742 [1] NCCL INFO TUNER/Plugin: Could not find: libnccl-tuner.so. Using internal tuner plugin.
# 172.17.0.7: autodl-container-fa4746a7fc-c6053b3e:2220:2742 [1] NCCL INFO ncclCommInitRankConfig comm 0xd0df8d0 rank 3 nranks 4 cudaDev 1 nvmlDev 1 busId c8000 commId 0xf7dd7ba1ea1267a4 - Init COMPLETE
# 172.17.0.7: autodl-container-fa4746a7fc-c6053b3e:2220:2742 [1] NCCL INFO Init timings - ncclCommInitRankConfig: rank 3 nranks 4 total 0.30 (kernels 0.22, alloc 0.00, bootstrap 0.07, allgathers 0.00, topo 0.01, graphs 0.00, connections 0.00, rest 0.00)
# 172.17.0.4: autodl-container-fa4746a7fc-5df31a9e:2932:3461 [0] NCCL INFO TUNER/Plugin: Could not find: libnccl-tuner.so. Using internal tuner plugin.
# 172.17.0.7: autodl-container-fa4746a7fc-c6053b3e:2219:2741 [0] NCCL INFO TUNER/Plugin: Could not find: libnccl-tuner.so. Using internal tuner plugin.
# 172.17.0.7: autodl-container-fa4746a7fc-c6053b3e:2219:2741 [0] NCCL INFO ncclCommInitRankConfig comm 0xc8760f0 rank 2 nranks 4 cudaDev 0 nvmlDev 0 busId 27000 commId 0xf7dd7ba1ea1267a4 - Init COMPLETE
# 172.17.0.4: autodl-container-fa4746a7fc-5df31a9e:2932:3461 [0] NCCL INFO ncclCommInitRankConfig comm 0xe2481c0 rank 0 nranks 4 cudaDev 0 nvmlDev 0 busId 38000 commId 0xf7dd7ba1ea1267a4 - Init COMPLETE
# 172.17.0.7: autodl-container-fa4746a7fc-c6053b3e:2219:2741 [0] NCCL INFO Init timings - ncclCommInitRankConfig: rank 2 nranks 4 total 0.32 (kernels 0.22, alloc 0.00, bootstrap 0.07, allgathers 0.01, topo 0.01, graphs 0.00, connections 0.00, rest 0.00)
# 172.17.0.4: autodl-container-fa4746a7fc-5df31a9e:2932:3461 [0] NCCL INFO Init timings - ncclCommInitRankConfig: rank 0 nranks 4 total 0.26 (kernels 0.22, alloc 0.00, bootstrap 0.00, allgathers 0.02, topo 0.01, graphs 0.00, connections 0.00, rest 0.00)
# 172.17.0.4: 2025-10-23 15:01:06,149 - INFO - 加载 WikiText 数据集...
# 172.17.0.7: 2025-10-23 15:01:06,152 - INFO - 加载 WikiText 数据集...
# 172.17.0.7: 2025-10-23 15:01:06,174 - INFO - 加载 WikiText 数据集...
# 172.17.0.4: 2025-10-23 15:01:06,196 - INFO - DeepSpeed 环境:rank=0, world_size=4
# 172.17.0.4: 2025-10-23 15:01:06,197 - INFO - 使用设备: cuda:0
# 172.17.0.4: 2025-10-23 15:01:06,197 - INFO - 加载 WikiText 数据集...
# 172.17.0.7: 2025-10-23 15:01:37,279 - INFO - 加载 23767 条非空文本
# 172.17.0.7: 2025-10-23 15:01:37,805 - INFO - 编码文本...
# 172.17.0.4: 2025-10-23 15:01:38,285 - INFO - 加载 23767 条非空文本
# 172.17.0.4: 2025-10-23 15:01:38,795 - INFO - BERT 分词器词汇大小: 30522
# 172.17.0.4: 2025-10-23 15:01:38,796 - INFO - 编码文本...
# 172.17.0.4: 2025-10-23 15:01:40,206 - INFO - 加载 23767 条非空文本
# 172.17.0.4: 2025-10-23 15:01:40,746 - INFO - 编码文本...
# 172.17.0.7: 2025-10-23 15:01:44,123 - INFO - 加载 23767 条非空文本
# 172.17.0.7: 2025-10-23 15:01:44,897 - INFO - 编码文本...
# 172.17.0.7: 2025-10-23 15:01:54,672 - INFO - 总 token 数=2301731, 块数=8991
# 172.17.0.7: autodl-container-fa4746a7fc-c6053b3e:2220:2220 [1] NCCL INFO Comm config Blocking set to 1
# 172.17.0.4: 2025-10-23 15:01:56,824 - INFO - 总 token 数=2301731, 块数=8991
# 172.17.0.4: autodl-container-fa4746a7fc-5df31a9e:2932:2932 [0] NCCL INFO Comm config Blocking set to 1
# 172.17.0.4: 2025-10-23 15:01:57,793 - INFO - 总 token 数=2301731, 块数=8991
# 172.17.0.4: autodl-container-fa4746a7fc-5df31a9e:2933:2933 [1] NCCL INFO Comm config Blocking set to 1
# 172.17.0.7: 2025-10-23 15:02:02,224 - INFO - 总 token 数=2301731, 块数=8991
# 172.17.0.7: autodl-container-fa4746a7fc-c6053b3e:2219:2219 [0] NCCL INFO Comm config Blocking set to 1
# 172.17.0.7: autodl-container-fa4746a7fc-c6053b3e:2219:2989 [0] NCCL INFO Assigned NET plugin Socket to comm
# 172.17.0.7: autodl-container-fa4746a7fc-c6053b3e:2219:2989 [0] NCCL INFO Using network Socket
# 172.17.0.7: autodl-container-fa4746a7fc-c6053b3e:2220:2883 [1] NCCL INFO Assigned NET plugin Socket to comm
# 172.17.0.7: autodl-container-fa4746a7fc-c6053b3e:2220:2883 [1] NCCL INFO Using network Socket
# 172.17.0.4: autodl-container-fa4746a7fc-5df31a9e:2933:3703 [1] NCCL INFO Assigned NET plugin Socket to comm
# 172.17.0.4: autodl-container-fa4746a7fc-5df31a9e:2933:3703 [1] NCCL INFO Using network Socket
# 172.17.0.4: autodl-container-fa4746a7fc-5df31a9e:2932:3599 [0] NCCL INFO Assigned NET plugin Socket to comm
# 172.17.0.4: autodl-container-fa4746a7fc-5df31a9e:2932:3599 [0] NCCL INFO Using network Socket
# 172.17.0.4: autodl-container-fa4746a7fc-5df31a9e:2933:3703 [1] NCCL INFO ncclCommSplit comm 0xfec2210 rank 1 nranks 4 cudaDev 1 nvmlDev 1 busId a8000 parent 0xcfb55a0 splitCount 1 color 2003953581 key 1- Init START
# 172.17.0.7: autodl-container-fa4746a7fc-c6053b3e:2220:2883 [1] NCCL INFO ncclCommSplit comm 0xffe2020 rank 3 nranks 4 cudaDev 1 nvmlDev 1 busId c8000 parent 0xd0df8d0 splitCount 1 color 2003953581 key 3- Init START
# 172.17.0.7: autodl-container-fa4746a7fc-c6053b3e:2219:2989 [0] NCCL INFO ncclCommSplit comm 0xfcb8990 rank 2 nranks 4 cudaDev 0 nvmlDev 0 busId 27000 parent 0xc8760f0 splitCount 1 color 2003953581 key 2- Init START
# 172.17.0.4: autodl-container-fa4746a7fc-5df31a9e:2932:3599 [0] NCCL INFO ncclCommSplit comm 0x1168e3b0 rank 0 nranks 4 cudaDev 0 nvmlDev 0 busId 38000 parent 0xe2481c0 splitCount 1 color 2003953581 key 0- Init START
# 172.17.0.4: autodl-container-fa4746a7fc-5df31a9e:2933:3703 [1] NCCL INFO Setting affinity for GPU 1 to 52-103,156-207
# 172.17.0.4: autodl-container-fa4746a7fc-5df31a9e:2932:3599 [0] NCCL INFO Setting affinity for GPU 0 to 0-51,104-155
# 172.17.0.7: autodl-container-fa4746a7fc-c6053b3e:2220:2883 [1] NCCL INFO Setting affinity for GPU 1 to 52-103,156-207
# 172.17.0.7: autodl-container-fa4746a7fc-c6053b3e:2219:2989 [0] NCCL INFO Setting affinity for GPU 0 to 0-51,104-155
# 172.17.0.7: autodl-container-fa4746a7fc-c6053b3e:2220:2883 [1] NCCL INFO comm 0xffe2020 rank 3 nRanks 4 nNodes 2 localRanks 2 localRank 1 MNNVL 0
# 172.17.0.4: autodl-container-fa4746a7fc-5df31a9e:2933:3703 [1] NCCL INFO comm 0xfec2210 rank 1 nRanks 4 nNodes 2 localRanks 2 localRank 1 MNNVL 0
# 172.17.0.7: autodl-container-fa4746a7fc-c6053b3e:22