摘要
HCCL(Huawei Collective Communication Library)是CANN生态中基于NPU硬件的高性能集合通信库,为单机多卡及多机多卡间的数据并行、模型并行提供通信方案。本文将系统介绍HCCL的通信原语、编程模型、优化技巧及实践案例。
一、HCCL通信库概述
1.1 项目介绍
项目地址 :https://atomgit.com/cann/hccl
HCCL是华为集合通信库,专为NPU集群优化设计,提供:
- 点对点通信:Send、Recv
- 集合通信:Broadcast、Reduce、AllReduce等
- 高性能:基于RDMA、硬件加速
- 易用接口:与主流ML框架无缝集成
1.2 HCCL生态组件
HCCL通信栈
├── HCCL API # 用户接口层
├── HCCS # 通信服务层
│ ├── hcomm # 通信基础库
│ ├── shmem # 共享内存通信
│ └── hixl # 传输层协议
└── Driver # 硬件驱动层
相关项目链接:
- hcomm :https://atomgit.com/cann/hcomm
- shmem :https://atomgit.com/cann/shmem
- hixl :https://atomgit.com/cann/hixl
1.3 应用场景
分布式训练场景
├── 数据并行 # 数据分片到多卡
├── 模型并行 # 模型分片到多卡
├── 流水线并行 # 层级分片到多卡
└── 混合并行 # 组合多种并行策略
二、HCCL基础编程
2.1 初始化通信环境
c
#include "hccl/hccl.h"
// 初始化HCCL通信域
// Initialize the HCCL communication domain for this process.
// Reads RANK_ID / RANK_SIZE from the environment (set by the launch script)
// and creates the global communicator.
// Returns HCCL_SUCCESS on success, an HCCL error code otherwise.
hcclResult_t init_hccl_comm() {
    // 1. Verify a device context exists for this process.
    int32_t device_id = 0;
    aclError ret = aclrtGetDevice(&device_id);
    if (ret != ACL_ERROR_NONE) {
        fprintf(stderr, "Failed to get device (acl error %d)\n", (int)ret);
        return HCCL_ERROR_INTERNAL;
    }

    // 2. Parse rank information with full validation; the original used
    //    atoi(), which silently turns malformed input into 0.
    const char* rank_id_str = getenv("RANK_ID");
    const char* rank_size_str = getenv("RANK_SIZE");
    if (!rank_id_str || !rank_size_str) {
        fprintf(stderr, "RANK_ID or RANK_SIZE not set\n");
        return HCCL_ERROR_INTERNAL;
    }
    char* end = NULL;
    errno = 0;
    unsigned long rank_id = strtoul(rank_id_str, &end, 10);
    if (errno != 0 || end == rank_id_str || *end != '\0') {
        fprintf(stderr, "Invalid RANK_ID: %s\n", rank_id_str);
        return HCCL_ERROR_INTERNAL;
    }
    errno = 0;
    unsigned long rank_size = strtoul(rank_size_str, &end, 10);
    if (errno != 0 || end == rank_size_str || *end != '\0' ||
        rank_size == 0 || rank_id >= rank_size) {
        fprintf(stderr, "Invalid RANK_SIZE (or RANK_ID >= RANK_SIZE): %s\n",
                rank_size_str);
        return HCCL_ERROR_INTERNAL;
    }

    // 3. Create the global communicator.
    //    NOTE(review): argument order follows the original snippet — confirm
    //    against the installed hccl.h; the shipped API also involves a
    //    root-info handle exchanged out of band.
    HcclComm comm;
    hcclResult_t hccl_ret = hcclCommInitRootInfo(
        &comm,                  // out: communicator handle
        (uint32_t)rank_id,      // this process's rank
        (uint32_t)rank_size     // total number of ranks
    );
    if (hccl_ret != HCCL_SUCCESS) {
        fprintf(stderr, "Failed to initialize HCCL comm: %d\n", hccl_ret);
        return hccl_ret;
    }
    printf("HCCL initialized: rank=%lu/%lu\n", rank_id, rank_size);
    return HCCL_SUCCESS;
}
2.2 Python接口初始化
python
# 使用PyTorch NPU初始化HCCL
import torch
import torch_npu
import torch.distributed as dist
def init_hccl():
    """Initialize torch.distributed with the HCCL backend.

    Reads LOCAL_RANK from the environment (set by the launcher), binds this
    process to that NPU, then joins the default process group.

    Returns:
        (rank, world_size) of this process in the global group.
    """
    import os  # fix: the original used os.environ without importing os

    # Bind the device BEFORE creating the process group.
    local_rank = int(os.environ.get("LOCAL_RANK", 0))
    torch.npu.set_device(local_rank)

    # 'env://' reads MASTER_ADDR / MASTER_PORT / RANK / WORLD_SIZE.
    dist.init_process_group(
        backend='hccl',        # HCCL backend for NPUs
        init_method='env://'   # environment-variable rendezvous
    )

    rank = dist.get_rank()
    world_size = dist.get_world_size()
    print(f"Rank {rank}/{world_size} initialized")
    return rank, world_size
2.3 通信域管理
c
// Example of managing multiple communication domains.
typedef struct {
    HcclComm world_comm;   // global communicator (all ranks)
    HcclComm local_comm;   // intra-node communicator (devices on one host)
    HcclComm node_comm;    // inter-node communicator
                           // NOTE(review): never initialized by
                           // create_comm_context() — confirm intent before
                           // destroying it unconditionally.
    uint32_t rank;         // global rank of this process
    uint32_t local_rank;   // rank within the node
    uint32_t world_size;   // total number of ranks
} CommContext;
// 创建分层通信域
// Build a two-level (global + intra-node) communicator hierarchy.
// ctx->rank and ctx->world_size must be filled in by the caller.
// Fix vs. original: the result of each init call is now checked; on failure
// the context is left partially initialized and should still be passed to
// destroy_comm_context().
hcclResult_t create_comm_context(CommContext* ctx) {
    // 1. Global communicator across all ranks.
    hcclResult_t ret = hcclCommInitRootInfo(&ctx->world_comm, ctx->rank, ctx->world_size);
    if (ret != HCCL_SUCCESS) {
        return ret;
    }

    // 2. Intra-node communicator (NPUs on the same host).
    //    NOTE(review): assumes exactly 8 devices per node — confirm for the
    //    target cluster topology.
    const uint32_t devices_per_node = 8;
    uint32_t local_rank = ctx->rank % devices_per_node;
    ret = hcclCommInitComm(
        &ctx->local_comm,
        devices_per_node,       // size of the local group
        local_rank,
        ctx->world_comm,
        HCCL_GROUP_LOCAL_COMM   // group identifier
    );
    if (ret != HCCL_SUCCESS) {
        return ret;
    }

    // 3. Record the node-local rank.
    ctx->local_rank = local_rank;
    return HCCL_SUCCESS;
}
// 销毁通信域
void destroy_comm_context(CommContext* ctx) {
hcclCommDestroy(ctx->world_comm);
hcclCommDestroy(ctx->local_comm);
hcclCommDestroy(ctx->node_comm);
}
三、集合通信原语
3.1 Broadcast广播
c
// 从root rank广播数据到所有rank
// Broadcast `count` floats from `root` to every rank in `comm`.
// Fixes vs. original: the error paths no longer leak the stream and both
// device buffers (goto cleanup), allocation/stream results are checked,
// and rank is printed with %u (it is uint32_t).
hcclResult_t broadcast_example(HcclComm comm, uint32_t rank, uint32_t root) {
    const size_t count = 1024;
    float* send_buffer = NULL;
    float* recv_buffer = NULL;
    aclrtStream stream = NULL;
    hcclResult_t ret = HCCL_ERROR_INTERNAL;

    // NOTE(review): aclrtMalloc returns *device* memory; the direct host
    // reads/writes below assume unified addressing — on discrete devices,
    // stage data through aclrtMemcpy instead.
    if (aclrtMalloc((void**)&send_buffer, count * sizeof(float), ACL_MEM_MALLOC_HUGE_FIRST) != ACL_ERROR_NONE) {
        goto cleanup;
    }
    if (aclrtMalloc((void**)&recv_buffer, count * sizeof(float), ACL_MEM_MALLOC_HUGE_FIRST) != ACL_ERROR_NONE) {
        goto cleanup;
    }

    // Only the root provides meaningful payload.
    if (rank == root) {
        for (size_t i = 0; i < count; ++i) {
            send_buffer[i] = (float)i;
        }
    }

    if (aclrtCreateStream(&stream) != ACL_ERROR_NONE) {
        goto cleanup;
    }

    // Collective call: every rank must invoke it with the same
    // count/type/root.
    ret = hcclBroadcast(
        send_buffer,          // send buffer (used by root)
        recv_buffer,          // receive buffer (all ranks)
        count,                // element count
        HCCL_DATA_TYPE_FP32,  // element type
        root,                 // broadcast root
        comm,                 // communicator
        stream                // execution stream
    );
    if (ret != HCCL_SUCCESS) {
        printf("Broadcast failed: %d\n", ret);
        goto cleanup;
    }

    // The collective is asynchronous; wait before reading the result.
    aclrtSynchronizeStream(stream);

    // Spot-check: every rank should now hold the root's data.
    printf("Rank %u received: ", rank);
    for (int i = 0; i < 5; ++i) {
        printf("%.1f ", recv_buffer[i]);
    }
    printf("...\n");
    ret = HCCL_SUCCESS;

cleanup:
    if (stream) {
        aclrtDestroyStream(stream);
    }
    if (recv_buffer) {
        aclrtFree(recv_buffer);
    }
    if (send_buffer) {
        aclrtFree(send_buffer);
    }
    return ret;
}
3.2 AllReduce归约
c
// AllReduce:所有rank执行归约操作并广播结果
// AllReduce: every rank contributes `count` floats; all ranks receive the
// element-wise SUM. Fixes vs. original: the error path no longer leaks the
// stream and device buffer, and rank is printed with %u (it is uint32_t).
hcclResult_t allreduce_example(HcclComm comm, uint32_t rank) {
    const size_t count = 1024;
    float* buffer = NULL;
    aclrtStream stream = NULL;
    hcclResult_t ret = HCCL_ERROR_INTERNAL;

    if (aclrtMalloc((void**)&buffer, count * sizeof(float), ACL_MEM_MALLOC_HUGE_FIRST) != ACL_ERROR_NONE) {
        goto cleanup;
    }

    // Each rank contributes its own rank id, so every slot of the reduced
    // result equals 0 + 1 + ... + (world_size - 1).
    for (size_t i = 0; i < count; ++i) {
        buffer[i] = (float)rank;
    }

    if (aclrtCreateStream(&stream) != ACL_ERROR_NONE) {
        goto cleanup;
    }

    printf("Rank %u: Before AllReduce, sum = %.1f\n", rank,
           buffer[0] * count);

    // In-place reduction: input and output may be the same buffer.
    ret = hcclAllReduce(
        buffer,               // input buffer
        buffer,               // output buffer (in-place)
        count,
        HCCL_DATA_TYPE_FP32,
        HCCL_SUM,             // reduction op: SUM, PROD, MIN, MAX
        comm,
        stream
    );
    if (ret != HCCL_SUCCESS) {
        goto cleanup;
    }
    aclrtSynchronizeStream(stream);

    // All ranks should print identical values here.
    printf("Rank %u: After AllReduce Sum, sum = %.1f\n", rank,
           buffer[0] * count);
    ret = HCCL_SUCCESS;

cleanup:
    if (stream) {
        aclrtDestroyStream(stream);
    }
    if (buffer) {
        aclrtFree(buffer);
    }
    return ret;
}
3.3 ReduceScatter归约散射
c
// ReduceScatter:归约后按维度分发
// ReduceScatter: element-wise reduce across ranks, then scatter so each rank
// keeps total_count / world_size elements (e.g. 4 ranks x 1024 -> 256 each).
// Fixes vs. original: early returns no longer leak the stream/buffers,
// world_size is validated before the division, and rank uses %u.
hcclResult_t reducescatter_example(HcclComm comm, uint32_t rank, uint32_t world_size) {
    const size_t total_count = 1024;
    float* send_buffer = NULL;
    float* recv_buffer = NULL;
    aclrtStream stream = NULL;
    hcclResult_t ret = HCCL_ERROR_INTERNAL;

    // Guard the split: every rank must receive an equal, non-zero share.
    if (world_size == 0 || total_count % world_size != 0) {
        return HCCL_ERROR_INTERNAL;
    }
    const size_t count_per_rank = total_count / world_size;

    if (aclrtMalloc((void**)&send_buffer, total_count * sizeof(float), ACL_MEM_MALLOC_HUGE_FIRST) != ACL_ERROR_NONE) {
        goto cleanup;
    }
    if (aclrtMalloc((void**)&recv_buffer, count_per_rank * sizeof(float), ACL_MEM_MALLOC_HUGE_FIRST) != ACL_ERROR_NONE) {
        goto cleanup;
    }

    // Rank-distinct data so the reduced output is easy to verify.
    for (size_t i = 0; i < total_count; ++i) {
        send_buffer[i] = (float)(rank * total_count + i);
    }

    if (aclrtCreateStream(&stream) != ACL_ERROR_NONE) {
        goto cleanup;
    }
    printf("Rank %u: Before ReduceScatter, send[0]=%.1f\n", rank, send_buffer[0]);

    // Note: the count argument is the *per-rank output* size,
    // not the total input size.
    ret = hcclReduceScatter(
        send_buffer,
        recv_buffer,
        count_per_rank,
        HCCL_DATA_TYPE_FP32,
        HCCL_SUM,
        comm,
        stream
    );
    if (ret != HCCL_SUCCESS) {
        goto cleanup;
    }
    aclrtSynchronizeStream(stream);
    printf("Rank %u: After ReduceScatter, recv[0]=%.1f\n", rank, recv_buffer[0]);
    ret = HCCL_SUCCESS;

cleanup:
    if (stream) {
        aclrtDestroyStream(stream);
    }
    if (recv_buffer) {
        aclrtFree(recv_buffer);
    }
    if (send_buffer) {
        aclrtFree(send_buffer);
    }
    return ret;
}
3.4 AllGather收集
c
// AllGather:收集所有rank的数据并分发
// AllGather: every rank contributes count_per_rank elements and receives the
// rank-ordered concatenation of all contributions.
// Fixes vs. original: early returns no longer leak the stream and buffers,
// and rank is printed with %u (it is uint32_t).
hcclResult_t allgather_example(HcclComm comm, uint32_t rank, uint32_t world_size) {
    const size_t count_per_rank = 256;
    const size_t total_count = count_per_rank * world_size;
    float* send_buffer = NULL;
    float* recv_buffer = NULL;
    aclrtStream stream = NULL;
    hcclResult_t ret = HCCL_ERROR_INTERNAL;

    if (aclrtMalloc((void**)&send_buffer, count_per_rank * sizeof(float), ACL_MEM_MALLOC_HUGE_FIRST) != ACL_ERROR_NONE) {
        goto cleanup;
    }
    if (aclrtMalloc((void**)&recv_buffer, total_count * sizeof(float), ACL_MEM_MALLOC_HUGE_FIRST) != ACL_ERROR_NONE) {
        goto cleanup;
    }

    // Rank-specific payload so each slice of the gathered result is distinct.
    for (size_t i = 0; i < count_per_rank; ++i) {
        send_buffer[i] = (float)(rank * 100 + i);
    }

    if (aclrtCreateStream(&stream) != ACL_ERROR_NONE) {
        goto cleanup;
    }
    printf("Rank %u: Before AllGather, send[0]=%.1f\n", rank, send_buffer[0]);

    ret = hcclAllGather(
        send_buffer,
        recv_buffer,
        count_per_rank,       // per-rank element count
        HCCL_DATA_TYPE_FP32,
        comm,
        stream
    );
    if (ret != HCCL_SUCCESS) {
        goto cleanup;
    }
    aclrtSynchronizeStream(stream);

    // recv_buffer now holds world_size slices, ordered by source rank.
    printf("Rank %u: After AllGather, recv = ", rank);
    for (uint32_t i = 0; i < world_size; ++i) {
        printf("%.1f ", recv_buffer[i * count_per_rank]);
    }
    printf("\n");
    ret = HCCL_SUCCESS;

cleanup:
    if (stream) {
        aclrtDestroyStream(stream);
    }
    if (recv_buffer) {
        aclrtFree(recv_buffer);
    }
    if (send_buffer) {
        aclrtFree(send_buffer);
    }
    return ret;
}
四、点对点通信
4.1 Send/Recv操作
c
// Point-to-point message descriptor (illustrative; the sendrecv example
// below passes ranks and tag directly instead of using this struct).
typedef struct {
    int tag;          // message tag for matching send/recv pairs
    int source_rank;  // sender's rank
    int dest_rank;    // receiver's rank
} MessageInfo;
// Point-to-point transfer: rank 0 sends `count` floats to rank 1.
// Fixes vs. original: send/recv results are checked and propagated, and
// rank 0 now synchronizes its stream before the buffer is freed — the send
// is asynchronous, so freeing immediately risked use-after-free.
hcclResult_t sendrecv_example(HcclComm comm, uint32_t rank) {
    const size_t count = 1024;
    float* buffer = NULL;
    aclrtStream stream = NULL;
    hcclResult_t ret = HCCL_SUCCESS;

    if (aclrtMalloc((void**)&buffer, count * sizeof(float), ACL_MEM_MALLOC_HUGE_FIRST) != ACL_ERROR_NONE) {
        return HCCL_ERROR_INTERNAL;
    }
    if (aclrtCreateStream(&stream) != ACL_ERROR_NONE) {
        aclrtFree(buffer);
        return HCCL_ERROR_INTERNAL;
    }

    if (rank == 0) {
        // Rank 0: fill the payload, then send to rank 1.
        for (size_t i = 0; i < count; ++i) {
            buffer[i] = (float)i;
        }
        ret = hcclSend(
            buffer,               // send buffer
            count,
            HCCL_DATA_TYPE_FP32,
            1,                    // destination rank
            comm,
            stream,
            0                     // tag
        );
        if (ret == HCCL_SUCCESS) {
            // Drain the async send before the buffer is freed below.
            aclrtSynchronizeStream(stream);
            printf("Rank 0: Sent %zu elements to rank 1\n", count);
        }
    } else if (rank == 1) {
        // Rank 1: receive the matching message from rank 0.
        ret = hcclRecv(
            buffer,               // receive buffer
            count,
            HCCL_DATA_TYPE_FP32,
            0,                    // source rank
            comm,
            stream,
            0                     // tag
        );
        if (ret == HCCL_SUCCESS) {
            aclrtSynchronizeStream(stream);
            printf("Rank 1: Received %zu elements from rank 0\n", count);
            printf("First 5 values: %.1f %.1f %.1f %.1f %.1f\n",
                   buffer[0], buffer[1], buffer[2], buffer[3], buffer[4]);
        }
    }

    aclrtDestroyStream(stream);
    aclrtFree(buffer);
    return ret;
}
4.2 异步通信
c
// Context for overlapping communication with computation via two streams.
typedef struct {
    aclrtStream compute_stream;  // stream for compute kernels
    aclrtStream comm_stream;     // stream dedicated to HCCL collectives
    HcclComm comm;               // communicator used on comm_stream
    uint32_t rank;               // this process's rank
} AsyncCommContext;
// Overlap communication and computation using two streams: the AllReduce is
// enqueued on comm_stream while independent kernels could run on
// compute_stream, then both streams are joined.
// Fixes vs. original: allocation and AllReduce results are checked, and
// buffers are released on every exit path.
// NOTE(review): buffer1 is reduced without being initialized — acceptable
// for a scheduling demo, but real code must fill it first.
hcclResult_t async_comm_example(AsyncCommContext* ctx) {
    const size_t count = 1024 * 1024; // 1M floats
    float* buffer1 = NULL;
    float* buffer2 = NULL;
    hcclResult_t ret = HCCL_ERROR_INTERNAL;

    if (aclrtMalloc((void**)&buffer1, count * sizeof(float), ACL_MEM_MALLOC_HUGE_FIRST) != ACL_ERROR_NONE) {
        goto cleanup;
    }
    if (aclrtMalloc((void**)&buffer2, count * sizeof(float), ACL_MEM_MALLOC_HUGE_FIRST) != ACL_ERROR_NONE) {
        goto cleanup;
    }

    // Kick off the AllReduce on the dedicated communication stream...
    ret = hcclAllReduce(
        buffer1,
        buffer1,
        count,
        HCCL_DATA_TYPE_FP32,
        HCCL_SUM,
        ctx->comm,
        ctx->comm_stream
    );
    if (ret != HCCL_SUCCESS) {
        goto cleanup;
    }

    // ...while independent kernels can be enqueued on compute_stream here.
    // ... other computation ...

    // Join both streams before anyone touches the results.
    aclrtSynchronizeStream(ctx->comm_stream);
    aclrtSynchronizeStream(ctx->compute_stream);
    ret = HCCL_SUCCESS;

cleanup:
    if (buffer2) {
        aclrtFree(buffer2);
    }
    if (buffer1) {
        aclrtFree(buffer1);
    }
    return ret;
}
五、PyTorch分布式训练实践
5.1 数据并行训练
python
# data_parallel_training.py
import os
import torch
import torch.nn as nn
import torch.optim as optim
from torch.nn.parallel import DistributedDataParallel as DDP
import torch_npu
class SimpleModel(nn.Module):
    """Three-layer MLP classifier: 784 -> 1024 -> 512 -> 10.

    Inputs of any shape are flattened to (batch, 784) in forward().
    Layer attribute names are unchanged, so state_dict keys stay stable.
    """

    def __init__(self):
        super().__init__()
        self.fc1 = nn.Linear(784, 1024)
        self.fc2 = nn.Linear(1024, 512)
        self.fc3 = nn.Linear(512, 10)
        self.relu = nn.ReLU()
        self.dropout = nn.Dropout(0.5)

    def forward(self, x):
        # Flatten everything after the batch dimension.
        out = x.view(x.size(0), -1)
        # Two hidden layers share the same ReLU + dropout treatment.
        for layer in (self.fc1, self.fc2):
            out = self.dropout(self.relu(layer(out)))
        return self.fc3(out)
def setup_distributed():
    """Bind this process to its NPU and join the HCCL process group.

    Returns:
        (local_rank, world_size) read from the launcher's environment.
    """
    env = os.environ
    local_rank = int(env.get("LOCAL_RANK", 0))
    world_size = int(env.get("WORLD_SIZE", 1))

    # The device must be selected before the process group is created.
    torch.npu.set_device(local_rank)

    torch.distributed.init_process_group(backend='hccl', init_method='env://')
    return local_rank, world_size
def cleanup_distributed():
    """Tear down the default process group created by setup_distributed()."""
    torch.distributed.destroy_process_group()
def train_epoch(model, dataloader, optimizer, criterion, rank):
    """Run one training epoch and return the mean loss over all batches.

    `rank` is used as the NPU device index, so callers must pass the
    node-local rank.
    """
    model.train()
    device = f"npu:{rank}"
    running_loss = 0.0

    for batch_idx, (inputs, labels) in enumerate(dataloader):
        # Move the batch to this process's NPU.
        inputs = inputs.to(device)
        labels = labels.to(device)

        # Forward + backward; DDP synchronizes gradients during backward().
        optimizer.zero_grad()
        loss = criterion(model(inputs), labels)
        loss.backward()
        optimizer.step()

        running_loss += loss.item()
        if batch_idx % 100 == 0:
            print(f"Rank {rank}, Batch {batch_idx}, Loss: {loss.item():.4f}")

    return running_loss / len(dataloader)
def main():
    """Entry point: set up DDP on NPU, train 10 epochs on MNIST, tear down."""
    # setup_distributed() returns the *node-local* rank (the NPU index),
    # not the global rank.
    local_rank, world_size = setup_distributed()
    # Fix vs. original: the DistributedSampler must shard by GLOBAL rank.
    # Feeding it the local rank makes different nodes train on the same data.
    global_rank = torch.distributed.get_rank()

    # Model lives on this process's NPU, wrapped for gradient sync.
    model = SimpleModel().to(f"npu:{local_rank}")
    model = DDP(model, device_ids=[local_rank])

    # Optimizer and loss.
    optimizer = optim.SGD(model.parameters(), lr=0.01, momentum=0.9)
    criterion = nn.CrossEntropyLoss()

    # Each rank loads a disjoint shard of the dataset.
    from torch.utils.data import DataLoader, DistributedSampler
    from torchvision import datasets, transforms
    transform = transforms.Compose([
        transforms.ToTensor(),
        transforms.Normalize((0.1307,), (0.3081,))
    ])
    dataset = datasets.MNIST('./data', train=True, download=True, transform=transform)
    sampler = DistributedSampler(dataset, num_replicas=world_size, rank=global_rank)
    dataloader = DataLoader(dataset, batch_size=128, sampler=sampler)

    # Training loop.
    for epoch in range(10):
        if global_rank == 0:
            print(f"\nEpoch {epoch + 1}/10")
        # Reseed the sampler so every rank shuffles consistently per epoch.
        sampler.set_epoch(epoch)
        avg_loss = train_epoch(model, dataloader, optimizer, criterion, local_rank)
        if global_rank == 0:
            print(f"Average loss: {avg_loss:.4f}")

    # Tear down the process group.
    cleanup_distributed()
if __name__ == "__main__":
    main()
5.2 启动脚本
bash
#!/bin/bash
# launch_training.sh — launch distributed training on this node.
#
# Fix vs. original: torch.distributed.launch is deprecated; torchrun is the
# supported launcher and exports LOCAL_RANK, which the training script reads
# from the environment. Variable expansions are also quoted.

# Cluster topology
NODE_COUNT=2            # total number of nodes
NPUS_PER_NODE=8         # devices per node
MASTER_ADDR="node1"     # rendezvous host
MASTER_PORT=29500       # rendezvous port

# Rank of this node (falls back to 0 outside SLURM)
NODE_RANK=${SLURM_NODEID:-0}

# Start one worker process per device on this node.
torchrun \
    --nproc_per_node="$NPUS_PER_NODE" \
    --nnodes="$NODE_COUNT" \
    --node_rank="$NODE_RANK" \
    --master_addr="$MASTER_ADDR" \
    --master_port="$MASTER_PORT" \
    data_parallel_training.py
5.3 梯度累积
python
# gradient_accumulation.py
def train_with_gradient_accumulation(
    model, dataloader, optimizer, criterion,
    rank, accumulation_steps=4
):
    """Train one epoch, stepping the optimizer every `accumulation_steps`
    micro-batches.

    Fix vs. original: when the number of batches is not a multiple of
    accumulation_steps, the leftover accumulated gradients are now flushed
    with a final optimizer step instead of being silently dropped.
    """
    model.train()
    optimizer.zero_grad()
    device = f"npu:{rank}"
    pending = False  # gradients accumulated but not yet applied

    for i, (data, target) in enumerate(dataloader):
        data = data.to(device)
        target = target.to(device)

        output = model(data)
        # Scale so the accumulated gradient matches a full-batch average.
        loss = criterion(output, target) / accumulation_steps

        # NOTE(review): with DDP, wrap the non-final backwards in
        # model.no_sync() to avoid an all-reduce per micro-batch.
        loss.backward()
        pending = True

        if (i + 1) % accumulation_steps == 0:
            # Gradient sync has completed by the time step() runs.
            optimizer.step()
            optimizer.zero_grad()
            pending = False
            print(f"Rank {rank}, Step {i + 1}, Updated")

    # Flush gradients from any trailing partial accumulation window.
    if pending:
        optimizer.step()
        optimizer.zero_grad()
5.4 混合精度训练
python
# mixed_precision_training.py
from torch_npu.amp import autocast, GradScaler
def train_mixed_precision(model, dataloader, optimizer, rank):
    """Mixed-precision training loop using NPU AMP.

    NOTE(review): `criterion` is a free variable — it must be defined in
    the enclosing module scope before this function is called; consider
    passing it as a parameter.
    """
    model.train()
    # GradScaler dynamically scales the loss to avoid fp16 underflow.
    scaler = GradScaler()
    for data, target in dataloader:
        data = data.to(f"npu:{rank}")
        target = target.to(f"npu:{rank}")
        optimizer.zero_grad()
        # autocast selects fp16/fp32 per operator automatically.
        with autocast():
            output = model(data)
            loss = criterion(output, target)
        # Backward on the scaled loss; step() unscales before updating,
        # and update() adapts the scale factor for the next iteration.
        scaler.scale(loss).backward()
        scaler.step(optimizer)
        scaler.update()
六、通信优化技巧
6.1 梯度压缩
python
# gradient_compression.py
# NOTE(review): torch.distributed.algorithms.gradient_compression is not a
# public module in stock PyTorch — treat this snippet as illustrative of a
# compression-wrapper API and verify against the library actually in use
# (e.g. the PowerSGD DDP communication hooks in
# torch.distributed.algorithms.ddp_comm_hooks).
import torch
from torch.distributed.algorithms.gradient_compression import (
    CompressModel,
    SparsityPolicy
)
# Compression policy: Top-K sparsification retaining 10% of gradient entries.
compression_policy = SparsityPolicy(
    compression_type="topk",  # Top-K sparsification
    sparse_ratio=0.9          # keep 10% of gradients
)
# Wrap the model so gradients are compressed before communication.
# `model` is assumed to exist in the enclosing scope.
compressed_model = CompressModel(
    model=model,
    compression_policy=compression_policy
)
# Standard training flow; `dataloader`, `criterion`, and `optimizer` are
# also assumed to exist in the enclosing scope.
for data, target in dataloader:
    optimizer.zero_grad()
    output = compressed_model(data)
    loss = criterion(output, target)
    loss.backward()
    # Gradients are compressed before the optimizer consumes them.
    compressed_model.compress_gradients()
    optimizer.step()
6.2 通信与计算重叠
python
# overlap_comm_compute.py
def training_with_overlap(model, dataloader, optimizer, rank):
    """Overlap gradient communication with computation.

    NOTE(review): `criterion` is a free variable from the enclosing scope.
    With DDP, backward() already launches gradient all-reduce asynchronously
    per bucket; the no_grad block below only achieves overlap if its work
    executes while that communication is still in flight — confirm with a
    profiler trace.
    """
    model.train()
    for data, target in dataloader:
        data = data.to(f"npu:{rank}")
        target = target.to(f"npu:{rank}")
        # Forward pass
        output = model(data)
        loss = criterion(output, target)
        # Backward pass (DDP issues async all-reduce per gradient bucket)
        loss.backward()
        # Gradient-free side computation while gradient sync is in flight.
        with torch.no_grad():
            _ = torch.sum(output).item()
        # Parameter update (implicitly waits for gradient sync to finish).
        optimizer.step()
        optimizer.zero_grad()
6.3 分层AllReduce
c
// Hierarchical AllReduce: reduce within each node first to cut
// cross-node traffic.
typedef struct {
    HcclComm world_comm;   // global communicator
    HcclComm local_comm;   // intra-node communicator over the on-node fabric
                           // (original comment said "NVLink", which is
                           // NVIDIA-specific; presumably HCCS on Ascend NPUs
                           // — confirm)
    uint32_t local_rank;   // rank within the node
    uint32_t node_rank;    // index of this node
} HierarchicalComm;
// Two-level AllReduce: reduce inside each node, then across nodes, then
// broadcast the final result within each node.
// Fixes vs. original: every collective's result is checked and the stream is
// destroyed on all exit paths.
//
// NOTE(review): step 2 enters hcclAllReduce on world_comm from the
// local_rank==0 ranks only. A collective must be entered by *every* rank of
// its communicator, so as written this deadlocks unless world_comm is
// actually a leaders-only communicator — the context needs a dedicated
// cross-node comm (one rank per node). Flagged rather than silently changed.
hcclResult_t hierarchical_allreduce(
    HierarchicalComm* hcomm,
    float* buffer,
    size_t count
) {
    aclrtStream stream = NULL;
    hcclResult_t ret = HCCL_ERROR_INTERNAL;
    if (aclrtCreateStream(&stream) != ACL_ERROR_NONE) {
        return HCCL_ERROR_INTERNAL;
    }

    // Step 1: intra-node reduce over the high-bandwidth on-node fabric.
    ret = hcclAllReduce(
        buffer, buffer, count,
        HCCL_DATA_TYPE_FP32, HCCL_SUM,
        hcomm->local_comm, stream
    );
    if (ret != HCCL_SUCCESS) {
        goto cleanup;
    }
    aclrtSynchronizeStream(stream);

    // Step 2: cross-node reduce, executed by one leader per node
    // (smaller volume over the slower inter-node links).
    if (hcomm->local_rank == 0) {
        ret = hcclAllReduce(
            buffer, buffer, count,
            HCCL_DATA_TYPE_FP32, HCCL_SUM,
            hcomm->world_comm, stream
        );
        if (ret != HCCL_SUCCESS) {
            goto cleanup;
        }
        aclrtSynchronizeStream(stream);
    }

    // Step 3: fan the final result back out inside the node.
    ret = hcclBroadcast(
        buffer, buffer, count,
        HCCL_DATA_TYPE_FP32,
        0,  // node-local root
        hcomm->local_comm, stream
    );
    if (ret != HCCL_SUCCESS) {
        goto cleanup;
    }
    aclrtSynchronizeStream(stream);
    ret = HCCL_SUCCESS;

cleanup:
    aclrtDestroyStream(stream);
    return ret;
}
七、性能分析与调优
7.1 通信性能分析
python
# profile_communication.py
import torch
import torch.distributed as dist
import time
def profile_allreduce(rank, size=1024*1024, warmup=10, iters=100):
    """Measure AllReduce latency and bandwidth on the default process group.

    Fix vs. original: the timed loop no longer calls tensor.clone(), which
    allocated and copied a fresh device buffer every iteration and polluted
    the measurement with allocation/copy time. The tensor's values don't
    matter for timing, so the same buffer is reduced repeatedly.

    Returns:
        Average AllReduce time per call, in milliseconds.
    """
    # One reusable device buffer.
    tensor = torch.randn(size, dtype=torch.float32).npu()

    # Warmup establishes communicators and steady-state device clocks.
    for _ in range(warmup):
        dist.all_reduce(tensor)
    torch.npu.synchronize()

    # Timed section.
    start = time.time()
    for _ in range(iters):
        dist.all_reduce(tensor)
    torch.npu.synchronize()
    end = time.time()

    # Statistics. This is the simple payload/time figure; a ring AllReduce
    # actually moves 2*(n-1)/n of the payload per rank.
    avg_time_ms = (end - start) / iters * 1000
    data_size_mb = size * 4 / (1024 * 1024)  # float32 = 4 bytes
    bandwidth_gb_s = data_size_mb / (avg_time_ms / 1000) / 1024
    if rank == 0:
        print(f"AllReduce Performance:")
        print(f"  Data size: {data_size_mb:.2f} MB")
        print(f"  Avg time: {avg_time_ms:.3f} ms")
        print(f"  Bandwidth: {bandwidth_gb_s:.2f} GB/s")
    return avg_time_ms
7.2 通信热点分析
python
# analyze_comm_hotspots.py
from torch.profiler import profile, record_function, ProfilerActivity
def profile_training_step(model, data, target, rank):
    """Profile one training step to locate communication hotspots.

    NOTE(review): `criterion` and `optimizer` are free variables from the
    enclosing scope. ProfilerActivity.NPU and the "npu_time_total" sort key
    are provided by torch_npu, not stock PyTorch — confirm the installed
    version exposes them.
    """
    with profile(
        activities=[ProfilerActivity.CPU, ProfilerActivity.NPU],
        record_shapes=True,      # capture operator input shapes
        profile_memory=True,     # track allocator events
        with_stack=True          # record Python stacks for attribution
    ) as prof:
        with record_function("training_step"):
            output = model(data)
            loss = criterion(output, target)
            loss.backward()
            optimizer.step()
    # Only rank 0 writes the trace/table to avoid clobbering files.
    if rank == 0:
        prof.export_chrome_trace("trace.json")
        print(prof.key_averages().table(sort_by="npu_time_total", row_limit=10))
八、总结
HCCL作为CANN生态的高性能通信库,为分布式训练提供了强大的通信能力。通过掌握集合通信原语、通信优化技巧和实践方法,开发者可以构建高效的分布式训练系统,充分发挥多卡NPU集群的计算能力。
参考链接
- CANN组织:https://atomgit.com/cann
- HCCL项目:https://atomgit.com/cann/hccl
- hcomm基础库:https://atomgit.com/cann/hcomm
- shmem通信库:https://atomgit.com/cann/shmem