PyTorch offers two ways to train on multiple GPUs: DataParallel and DistributedDataParallel. With DataParallel, the resulting gradient is the same as when running on a single card: the gradients computed for the individual samples are summed. With DistributedDataParallel, each card computes its gradient independently and the gradients of the different cards are then averaged.
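Conceptually the difference comes down to summing versus averaging the per-replica gradients. The CPU-only sketch below is my own illustration (it uses neither wrapper, and all names in it are made up): it computes the gradient of each sample by hand and combines the two results both ways.

```python
import torch
import torch.nn as nn

# Pretend each of the two samples lives on its own replica/GPU.
torch.manual_seed(0)
model = nn.Linear(2, 3, bias=False)
samples = torch.rand(2, 2)
labels = torch.tensor([[1., 0., 0.], [0., 1., 0.]])

per_replica_grads = []
for x, y in zip(samples, labels):
    model.zero_grad()
    (model(x.unsqueeze(0)) * y).sum().backward()
    per_replica_grads.append(model.weight.grad.clone())

# DataParallel-style combination: sum of the per-replica gradients.
print('sum:', per_replica_grads[0] + per_replica_grads[1])
# DistributedDataParallel-style combination: average over the replicas.
print('mean:', (per_replica_grads[0] + per_replica_grads[1]) / 2)
```

With a two-sample batch split over two replicas, the averaged result is exactly the summed result divided by the world size.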
The multi-GPU experiments below verify this:
DataParallel
```python
import torch
import os
import torch.nn as nn


def main():
    model = nn.Linear(2, 3).cuda()
    # replicate the model on GPU 0 and GPU 1; the batch is split between them
    model = torch.nn.DataParallel(model, device_ids=[0, 1])
    input = torch.rand(2, 2)
    # one-hot labels: sample 0 selects output 0, sample 1 selects output 1
    labels = torch.tensor([[1, 0, 0], [0, 1, 0]]).cuda()
    (model(input) * labels).sum().backward()
    print('input', input)
    print([p.grad for p in model.parameters()])


if __name__ == "__main__":
    main()
```
Run `CUDA_VISIBLE_DEVICES=0,1 python t.py`.
In the output, each of the two samples produces its own gradient, and that gradient equals the sample's value: the one-hot labels place sample 0's gradient in weight row 0 and sample 1's in row 1. DataParallel sums the gradients computed on the two GPUs back into the parameters of the wrapped model on cuda:0.
```bash
input tensor([[0.4362, 0.4574],
[0.2052, 0.2362]])
[tensor([[0.4363, 0.4573],
[0.2052, 0.2362],
[0.0000, 0.0000]], device='cuda:0'), tensor([1., 1., 0.], device='cuda:0')]
```
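As a hedged cross-check (my own follow-up, not part of the script above): for `nn.Linear`, the gradient of `(model(input) * labels).sum()` with respect to the weight is `labels.T @ input`, so the one-hot labels put each input sample into the matching weight row. Re-running the same loss on a single GPU, without DataParallel, should reproduce the gradient printed above.

```python
import torch
import torch.nn as nn

model = nn.Linear(2, 3).cuda()
# input values copied from the run above; any 2x2 batch behaves the same way
input = torch.tensor([[0.4362, 0.4574], [0.2052, 0.2362]]).cuda()
labels = torch.tensor([[1., 0., 0.], [0., 1., 0.]]).cuda()

(model(input) * labels).sum().backward()
print(model.weight.grad)  # rows 0 and 1 equal the two input samples, row 2 stays zero
print(model.bias.grad)    # tensor([1., 1., 0.])

# analytic form of the same weight gradient
print(labels.t() @ input)
```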
DistributedDataParallel
```python
import torch
import os
import torch.distributed as dist
import torch.multiprocessing as mp
import torch.nn as nn
import torch.optim as optim
from torch.nn.parallel import DistributedDataParallel as DDP


def example(rank, world_size):
    # create default process group
    dist.init_process_group("gloo", rank=rank, world_size=world_size)
    # create local model
    model = nn.Linear(2, 3).to(rank)
    print('model param', 'rank', rank, [p for p in model.parameters()])
    # construct DDP model (DDP broadcasts rank 0's parameters to every rank)
    ddp_model = DDP(model, device_ids=[rank])
    print('ddp model param', 'rank', rank, [p for p in ddp_model.parameters()])
    optimizer = optim.SGD(ddp_model.parameters(), lr=0.001)
    # forward pass: each rank gets its own single-sample batch
    input = torch.randn(1, 2).to(rank)
    outputs = ddp_model(input)
    # one-hot label that selects output `rank` for this rank's sample
    labels = torch.randn(1, 3).to(rank) * 0
    labels[0, rank] = 1
    # backward pass: DDP averages the gradients across ranks
    (outputs * labels).sum().backward()
    print('rank', rank, 'grad', [p.grad for p in ddp_model.parameters()])
    print('rank', rank, 'input', input, 'outputs', outputs)
    print('rank', rank, 'labels', labels)
    # update parameters
    optimizer.step()


def main():
    world_size = 2
    mp.spawn(example,
             args=(world_size,),
             nprocs=world_size,
             join=True)


if __name__ == "__main__":
    # Environment variables which need to be
    # set when using c10d's default "env"
    # initialization mode.
    os.environ["MASTER_ADDR"] = "localhost"
    os.environ["MASTER_PORT"] = "29504"
    main()
```
Run `CUDA_VISIBLE_DEVICES=0,1 python t1.py`.
In the output, each rank computes the gradient of its own sample (again equal to the sample's value), and the gradient every rank ends up with is the average of the per-GPU gradients.
```bash
model param rank 0 [Parameter containing:
tensor([[-0.4819, 0.0253],
[ 0.0858, 0.2256],
[ 0.5614, 0.2702]], device='cuda:0', requires_grad=True), Parameter containing:
tensor([-0.0090, 0.4461, -0.3493], device='cuda:0', requires_grad=True)]
model param rank 1 [Parameter containing:
tensor([[-0.3737, 0.3062],
[ 0.6450, 0.2930],
[-0.2422, 0.2089]], device='cuda:1', requires_grad=True), Parameter containing:
tensor([-0.5868, 0.2106, -0.4461], device='cuda:1', requires_grad=True)]
ddp model param rank 1 [Parameter containing:
tensor([[-0.4819, 0.0253],
[ 0.0858, 0.2256],
[ 0.5614, 0.2702]], device='cuda:1', requires_grad=True), Parameter containing:
tensor([-0.0090, 0.4461, -0.3493], device='cuda:1', requires_grad=True)]
ddp model param rank 0 [Parameter containing:
tensor([[-0.4819, 0.0253],
[ 0.0858, 0.2256],
[ 0.5614, 0.2702]], device='cuda:0', requires_grad=True), Parameter containing:
tensor([-0.0090, 0.4461, -0.3493], device='cuda:0', requires_grad=True)]
rank 1 grad [tensor([[ 0.2605, 0.1631],
[-0.0934, -0.5308],
[ 0.0000, 0.0000]], device='cuda:1'), tensor([0.5000, 0.5000, 0.0000], device='cuda:1')]
rank 0 grad [tensor([[ 0.2605, 0.1631],
[-0.0934, -0.5308],
[ 0.0000, 0.0000]], device='cuda:0'), tensor([0.5000, 0.5000, 0.0000], device='cuda:0')]
rank 1 input tensor([[-0.1868, -1.0617]], device='cuda:1') outputs tensor([[ 0.0542, 0.1906, -0.7411]], device='cuda:1',
grad_fn=<AddmmBackward0>)
rank 0 input tensor([[0.5209, 0.3261]], device='cuda:0') outputs tensor([[-0.2518, 0.5644, 0.0314]], device='cuda:0',
grad_fn=<AddmmBackward0>)
rank 1 labels tensor([[-0., 1., -0.]], device='cuda:1')
rank 0 labels tensor([[1., 0., -0.]], device='cuda:0')
```
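A small arithmetic check on the printed values (my own addition, numbers copied from the run above): each rank's local gradient for weight row `rank` is its own input sample, and DDP's all-reduce divides the summed gradients by the world size, which is exactly what the `grad` lines show on both ranks.

```python
import torch

# inputs copied from the DDP run above
input_rank0 = torch.tensor([0.5209, 0.3261])
input_rank1 = torch.tensor([-0.1868, -1.0617])
world_size = 2

print(input_rank0 / world_size)  # ~ tensor([ 0.2605,  0.1631])  -> weight row 0 gradient
print(input_rank1 / world_size)  # ~ tensor([-0.0934, -0.5308])  -> weight row 1 gradient
```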