本文基于CANN开源社区的多个仓库进行应用案例讲解
CANN组织地址:https://atomgit.com/cann
atc仓库地址:https://atomgit.com/cann/atc
runtime仓库地址:https://atomgit.com/cann/runtime
前言
性能优化是深度学习模型部署的关键环节。CANN提供了完整的性能分析工具链,包括profiling工具、性能数据采集、可视化分析等功能。
本文将展示如何使用CANN的性能分析工具定位性能瓶颈,并通过各种优化手段提升模型推理和训练性能。
Profiling基础
1. 启用性能分析
python
import torch
import torch_npu
from torch_npu.profiler import Profile
class PerformanceProfiler:
    """Profile a model on the NPU with torch_npu's profiler and export a Chrome trace."""

    def __init__(self, output_path='./profiling_data'):
        # Directory that trace.json is written into; must exist before export.
        self.output_path = output_path

    def profile_model(self, model, input_data, warmup_steps=10, profile_steps=20):
        """Warm up the model, then run profiled inference steps.

        Returns the profiler object so callers can query key_averages().

        Fix: the original referenced ``ProfilerActivity`` without importing it
        (the top-level import only brought in ``Profile``), raising NameError.
        Both names are imported locally from torch_npu.profiler here.
        """
        from torch_npu.profiler import profile, ProfilerActivity

        model.eval()
        # Warmup excludes one-time costs (graph build, cache fill) from the trace.
        print(f"预热 {warmup_steps} 步...")
        with torch.no_grad():
            for _ in range(warmup_steps):
                _ = model(input_data)
        print(f"开始profiling {profile_steps} 步...")
        with profile(
            activities=[ProfilerActivity.CPU, ProfilerActivity.NPU],
            record_shapes=True,
            profile_memory=True,
            with_stack=True
        ) as prof:
            with torch.no_grad():
                for _ in range(profile_steps):
                    _ = model(input_data)
        # The trace can be loaded in chrome://tracing or Perfetto.
        prof.export_chrome_trace(f"{self.output_path}/trace.json")
        return prof

    def print_summary(self, prof):
        """Print the top 20 ops sorted by total NPU time."""
        print("\n=== 性能摘要 ===")
        print(prof.key_averages().table(
            sort_by="npu_time_total",
            row_limit=20
        ))

    def analyze_memory(self, prof):
        """Print the top 10 ops sorted by NPU memory usage."""
        print("\n=== 内存分析 ===")
        print(prof.key_averages().table(
            sort_by="npu_memory_usage",
            row_limit=10
        ))
# Usage example
import torchvision.models as models

# Build the model and a sample batch on the NPU
model = models.resnet50().npu()
input_data = torch.randn(32, 3, 224, 224).npu()

# Run the profiler (10 warmup steps + 20 profiled steps by default)
profiler = PerformanceProfiler()
prof = profiler.profile_model(model, input_data)

# Inspect the results: op-level timing table and memory table
profiler.print_summary(prof)
profiler.analyze_memory(prof)
2. 算子级别分析
python
class OperatorProfiler:
    """Record per-module call counts and I/O shapes via forward hooks."""

    def __init__(self):
        # name -> {'count', 'total_time', 'input_shape', 'output_shape'}
        self.op_stats = {}

    def profile_operators(self, model, input_data, iterations=100):
        """Run *iterations* forward passes and collect per-leaf-module stats.

        Fix: removed an unused ``import time`` (nothing here measures time;
        the 'total_time' field is kept for interface compatibility but stays 0).
        The hook argument no longer shadows the builtin ``input``.
        """
        model.eval()
        hooks = []

        def make_hook(name):
            # Bind *name* per module; the hook fires after each forward call.
            def hook(module, inputs, output):
                if name not in self.op_stats:
                    self.op_stats[name] = {
                        'count': 0,
                        'total_time': 0,
                        'input_shape': [],
                        'output_shape': []
                    }
                # Record the most recent call's shapes.
                if isinstance(inputs, tuple):
                    self.op_stats[name]['input_shape'] = [
                        list(t.shape) if hasattr(t, 'shape') else None
                        for t in inputs
                    ]
                if hasattr(output, 'shape'):
                    self.op_stats[name]['output_shape'] = list(output.shape)
                self.op_stats[name]['count'] += 1
            return hook

        # Only leaf modules correspond to actual operators.
        for name, module in model.named_modules():
            if len(list(module.children())) == 0:
                hooks.append(module.register_forward_hook(make_hook(name)))

        with torch.no_grad():
            for _ in range(iterations):
                _ = model(input_data)

        # Always detach the hooks so the model is left unmodified.
        for hook in hooks:
            hook.remove()
        return self.op_stats

    def print_operator_stats(self):
        """Print a table of modules sorted by call count (descending)."""
        print("\n=== 算子统计 ===")
        print(f"{'算子名称':<50} {'调用次数':<10} {'输入形状':<30} {'输出形状':<30}")
        print("-" * 120)
        for name, stats in sorted(self.op_stats.items(),
                                  key=lambda x: x[1]['count'],
                                  reverse=True):
            input_shape = str(stats['input_shape'])[:28]
            output_shape = str(stats['output_shape'])[:28]
            print(f"{name:<50} {stats['count']:<10} {input_shape:<30} {output_shape:<30}")
# Usage example: hook-based per-operator statistics for the model above
op_profiler = OperatorProfiler()
op_stats = op_profiler.profile_operators(model, input_data)
op_profiler.print_operator_stats()
性能指标监控
1. 实时性能监控
python
import time
import psutil
import torch
import torch_npu
class PerformanceMonitor:
    """Collect and report latency, throughput, NPU utilization and memory metrics."""

    def __init__(self):
        # Each key accumulates one entry per call of the matching method.
        self.metrics = {
            'latency': [],
            'throughput': [],
            'npu_utilization': [],
            'memory_usage': []
        }

    @staticmethod
    def _latency_stats(latencies):
        """Summarize *latencies* (ms) into mean/min/max/p50/p95/p99.

        Fix: the original sorted the list once per percentile (three times in
        measure_latency and three more in print_report); sort once here.
        """
        ordered = sorted(latencies)
        n = len(ordered)
        return {
            'mean': sum(ordered) / n,
            'min': ordered[0],
            'max': ordered[-1],
            'p50': ordered[n // 2],
            'p95': ordered[int(n * 0.95)],
            'p99': ordered[int(n * 0.99)]
        }

    def measure_latency(self, model, input_data, iterations=100):
        """Measure per-call latency in ms, synchronizing the NPU around each call."""
        model.eval()
        latencies = []
        # Warmup so compilation/caching does not pollute the measurement.
        with torch.no_grad():
            for _ in range(10):
                _ = model(input_data)
        with torch.no_grad():
            for _ in range(iterations):
                torch.npu.synchronize()  # drain previously queued work
                start = time.time()
                _ = model(input_data)
                torch.npu.synchronize()  # wait for this call to finish
                end = time.time()
                latencies.append((end - start) * 1000)  # seconds -> ms
        self.metrics['latency'] = latencies
        return self._latency_stats(latencies)

    def measure_throughput(self, model, batch_size, input_shape, duration=10):
        """Measure samples/sec by looping inference for about *duration* seconds."""
        model.eval()
        input_data = torch.randn(batch_size, *input_shape).npu()
        total_samples = 0
        start_time = time.time()
        with torch.no_grad():
            while time.time() - start_time < duration:
                _ = model(input_data)
                total_samples += batch_size
        elapsed = time.time() - start_time
        throughput = total_samples / elapsed
        self.metrics['throughput'].append(throughput)
        return throughput

    def get_npu_utilization(self):
        """Record and return the current device's utilization."""
        npu_id = torch.npu.current_device()
        utilization = torch.npu.utilization(npu_id)
        self.metrics['npu_utilization'].append(utilization)
        return utilization

    def get_memory_usage(self):
        """Record and return allocated/reserved/free NPU memory in GB.

        'free' is the slack inside the caching allocator's reserved pool,
        not device-wide free memory.
        """
        npu_id = torch.npu.current_device()
        allocated = torch.npu.memory_allocated(npu_id) / 1024**3  # GB
        reserved = torch.npu.memory_reserved(npu_id) / 1024**3  # GB
        memory_info = {
            'allocated': allocated,
            'reserved': reserved,
            'free': reserved - allocated
        }
        self.metrics['memory_usage'].append(memory_info)
        return memory_info

    def print_report(self):
        """Print everything recorded so far."""
        print("\n=== 性能报告 ===")
        if self.metrics['latency']:
            stats = self._latency_stats(self.metrics['latency'])
            print(f"\n延迟统计 (ms):")
            print(f" 平均: {stats['mean']:.2f}")
            print(f" 最小: {stats['min']:.2f}")
            print(f" 最大: {stats['max']:.2f}")
            print(f" P50: {stats['p50']:.2f}")
            print(f" P95: {stats['p95']:.2f}")
            print(f" P99: {stats['p99']:.2f}")
        if self.metrics['throughput']:
            print(f"\n吞吐量: {self.metrics['throughput'][-1]:.2f} samples/sec")
        if self.metrics['memory_usage']:
            mem = self.metrics['memory_usage'][-1]
            print(f"\n内存使用:")
            print(f" 已分配: {mem['allocated']:.2f} GB")
            print(f" 已保留: {mem['reserved']:.2f} GB")
            print(f" 空闲: {mem['free']:.2f} GB")
# Usage example
monitor = PerformanceMonitor()

# Latency (per-call wall time with NPU synchronization)
latency_stats = monitor.measure_latency(model, input_data)
print(f"延迟统计: {latency_stats}")

# Throughput (samples processed per second over a fixed duration)
throughput = monitor.measure_throughput(model, batch_size=32, input_shape=(3, 224, 224))
print(f"吞吐量: {throughput:.2f} samples/sec")

# Current NPU memory usage
memory_info = monitor.get_memory_usage()
print(f"内存使用: {memory_info}")

# Full report of everything recorded above
monitor.print_report()
2. 批量大小优化
python
class BatchSizeOptimizer:
    """Sweep batch sizes geometrically and report the highest-throughput one."""

    def __init__(self, model, input_shape):
        self.model = model
        # Per-sample shape, e.g. (3, 224, 224); the batch dim is prepended later.
        self.input_shape = input_shape
        self.monitor = PerformanceMonitor()

    def find_optimal_batch_size(self, min_batch=1, max_batch=256, step=2):
        """Try batch sizes min_batch, min_batch*step, ... up to max_batch.

        Stops early on OOM. Returns the best result dict, or None if no
        batch size could be measured.

        Fix: the original allocated an ``input_data`` tensor per iteration
        that was never used (measure_throughput builds its own input),
        wasting device memory and inflating the memory reading.
        """
        results = []
        batch_size = min_batch
        while batch_size <= max_batch:
            try:
                print(f"\n测试批量大小: {batch_size}")
                throughput = self.monitor.measure_throughput(
                    self.model,
                    batch_size,
                    self.input_shape,
                    duration=5
                )
                memory_info = self.monitor.get_memory_usage()
                results.append({
                    'batch_size': batch_size,
                    'throughput': throughput,
                    'memory_allocated': memory_info['allocated']
                })
                print(f" 吞吐量: {throughput:.2f} samples/sec")
                print(f" 内存: {memory_info['allocated']:.2f} GB")
                batch_size *= step
            except RuntimeError as e:
                if "out of memory" in str(e):
                    # OOM marks the upper bound of the sweep; stop cleanly.
                    print(f" 批量大小 {batch_size} 超出内存限制")
                    break
                else:
                    # Preserve the original traceback for non-OOM errors.
                    raise
        # Pick the measured configuration with the best throughput.
        if results:
            optimal = max(results, key=lambda x: x['throughput'])
            print(f"\n=== 最优批量大小 ===")
            print(f"批量大小: {optimal['batch_size']}")
            print(f"吞吐量: {optimal['throughput']:.2f} samples/sec")
            print(f"内存使用: {optimal['memory_allocated']:.2f} GB")
            return optimal
        return None
# Usage example: sweep batch sizes (1, 2, 4, ... up to 256) for best throughput
optimizer = BatchSizeOptimizer(model, input_shape=(3, 224, 224))
optimal_config = optimizer.find_optimal_batch_size(min_batch=1, max_batch=256)
模型优化技术
1. 算子融合
python
import torch
import torch_npu
class OperatorFusion:
    """Fold BatchNorm statistics into the preceding convolution (inference only)."""

    @staticmethod
    def fuse_conv_bn(conv, bn):
        """Return a Conv2d equivalent to bn(conv(x)), using bn's running stats.

        Valid for inference only: it uses running_mean/running_var, not batch
        statistics, so the modules should be in eval mode.
        """
        conv_weight = conv.weight
        # Fix: create the fallback zero bias on the conv weight's device/dtype;
        # the original built it on the CPU, which breaks NPU/FP16 models.
        if conv.bias is not None:
            conv_bias = conv.bias
        else:
            conv_bias = torch.zeros(
                conv.out_channels,
                device=conv_weight.device,
                dtype=conv_weight.dtype
            )
        # y = (Wx + c - mean) / std * gamma + beta
        #   = (gamma/std) * Wx + (c - mean) * gamma/std + beta
        with torch.no_grad():
            bn_std = torch.sqrt(bn.running_var + bn.eps)
            scale = bn.weight / bn_std
            fused_weight = conv_weight * scale.reshape(-1, 1, 1, 1)
            fused_bias = (conv_bias - bn.running_mean) * scale + bn.bias
        fused_conv = torch.nn.Conv2d(
            conv.in_channels,
            conv.out_channels,
            conv.kernel_size,
            conv.stride,
            conv.padding,
            conv.dilation,
            conv.groups,
            bias=True,
            padding_mode=conv.padding_mode  # fix: was silently dropped
        )
        fused_conv.weight.data = fused_weight
        fused_conv.bias.data = fused_bias
        return fused_conv

    @staticmethod
    def fuse_model(model):
        """Fuse every Conv2d directly followed by BatchNorm2d inside nn.Sequential.

        Mutates *model* in place (the BN slot becomes Identity) and returns it.
        Note: only direct children of Sequential containers are matched, so
        e.g. the conv/bn pairs inside torchvision Bottleneck blocks are skipped.
        """
        modules_to_fuse = []
        for name, module in model.named_modules():
            if isinstance(module, torch.nn.Sequential):
                # Look for adjacent Conv2d -> BatchNorm2d pairs.
                for i in range(len(module) - 1):
                    if isinstance(module[i], torch.nn.Conv2d) and \
                       isinstance(module[i+1], torch.nn.BatchNorm2d):
                        modules_to_fuse.append((name, i))
        for module_name, idx in modules_to_fuse:
            parent = dict(model.named_modules())[module_name]
            fused = OperatorFusion.fuse_conv_bn(parent[idx], parent[idx + 1])
            # Replace in place; Identity keeps subsequent indices valid.
            parent[idx] = fused
            parent[idx + 1] = torch.nn.Identity()
        return model
# Usage example
# Before fusion
model_original = models.resnet50().npu()
input_data = torch.randn(32, 3, 224, 224).npu()
monitor = PerformanceMonitor()
latency_before = monitor.measure_latency(model_original, input_data)
print(f"融合前延迟: {latency_before['mean']:.2f} ms")

# After fusion
# NOTE(review): fuse_model mutates the model in place, so model_original and
# model_fused are the same object after this call.
model_fused = OperatorFusion.fuse_model(model_original)
latency_after = monitor.measure_latency(model_fused, input_data)
print(f"融合后延迟: {latency_after['mean']:.2f} ms")
print(f"加速比: {latency_before['mean'] / latency_after['mean']:.2f}x")
2. 混合精度训练
python
class MixedPrecisionTrainer:
    """Train an FP16 model with static loss scaling.

    The model is converted to half precision at construction time; the loss
    is multiplied by ``loss_scale`` before backward and every gradient is
    divided by the same factor before the optimizer step.
    """

    def __init__(self, model, optimizer, loss_scale=128.0):
        self.optimizer = optimizer
        self.loss_scale = loss_scale
        # Convert all parameters and buffers to FP16 (in place).
        self.model = model.half()

    def train_step(self, inputs, targets):
        """Run one scaled forward/backward/step; return the unscaled loss value."""
        logits = self.model(inputs.half())
        loss = torch.nn.functional.cross_entropy(logits, targets)
        self.optimizer.zero_grad()
        # Scale up before backward so small gradients survive FP16 underflow.
        (loss * self.loss_scale).backward()
        # Undo the scaling on every gradient before updating.
        for param in self.model.parameters():
            if param.grad is not None:
                param.grad.data = param.grad.data / self.loss_scale
        self.optimizer.step()
        return loss.item()

    def train_epoch(self, dataloader):
        """Iterate the dataloader once; print a running average every 100 batches."""
        self.model.train()
        total_loss = 0
        for batch_idx, (inputs, targets) in enumerate(dataloader):
            loss = self.train_step(inputs.npu(), targets.npu())
            total_loss += loss
            if (batch_idx + 1) % 100 == 0:
                avg_loss = total_loss / (batch_idx + 1)
                print(f"Batch {batch_idx + 1}, Loss: {avg_loss:.4f}")
        return total_loss / len(dataloader)
# Usage example
model = models.resnet50().npu()
optimizer = torch.optim.SGD(model.parameters(), lr=0.01)
trainer = MixedPrecisionTrainer(model, optimizer)

# With a real dataloader:
# for epoch in range(10):
#     loss = trainer.train_epoch(dataloader)
#     print(f"Epoch {epoch + 1}, Loss: {loss:.4f}")
3. 动态shape优化
python
class DynamicShapeOptimizer:
    """Cache a traced + inference-optimized copy of a model per input shape."""

    def __init__(self, model):
        self.model = model
        # str(shape) -> optimized TorchScript module
        self.shape_cache = {}

    def optimize_for_shape(self, input_shape):
        """Trace and optimize the model for *input_shape*, reusing the cache."""
        key = str(input_shape)
        cached = self.shape_cache.get(key)
        if cached is not None:
            return cached
        # First time for this shape: trace with a dummy input of that shape.
        example = torch.randn(*input_shape).npu()
        traced = torch.jit.trace(self.model, example)
        traced = torch.jit.optimize_for_inference(traced)
        self.shape_cache[key] = traced
        return traced

    def infer(self, input_data):
        """Run inference through the shape-specialized model."""
        runner = self.optimize_for_shape(tuple(input_data.shape))
        with torch.no_grad():
            output = runner(input_data)
        return output
# Usage example
model = models.resnet50().npu()
optimizer = DynamicShapeOptimizer(model)

# Inputs with different shapes; each shape is traced once, then cached
shapes = [
    (1, 3, 224, 224),
    (8, 3, 224, 224),
    (32, 3, 224, 224)
]
for shape in shapes:
    input_data = torch.randn(*shape).npu()
    # The first call per shape pays the trace/optimize cost
    start = time.time()
    output = optimizer.infer(input_data)
    elapsed = time.time() - start
    print(f"Shape {shape}: {elapsed*1000:.2f} ms")
内存优化
1. 梯度累积
python
class GradientAccumulation:
    """Accumulate gradients over several micro-batches before each optimizer step."""

    def __init__(self, model, optimizer, accumulation_steps=4):
        self.model = model
        self.optimizer = optimizer
        # Number of micro-batches folded into one parameter update.
        self.accumulation_steps = accumulation_steps

    def train_step(self, inputs, targets, step):
        """One micro-batch: forward, scaled backward, step every N batches.

        Returns the unscaled loss value for this micro-batch.
        """
        outputs = self.model(inputs)
        loss = torch.nn.functional.cross_entropy(outputs, targets)
        # Scale so the accumulated gradient equals the mean over the window.
        loss = loss / self.accumulation_steps
        loss.backward()
        if (step + 1) % self.accumulation_steps == 0:
            self.optimizer.step()
            self.optimizer.zero_grad()
        return loss.item() * self.accumulation_steps

    def train_epoch(self, dataloader):
        """Train one epoch; returns the mean micro-batch loss.

        Fix: if the batch count is not a multiple of accumulation_steps, the
        original never stepped the tail gradients, silently leaking them into
        the next epoch's first update. Flush any pending gradients here.
        """
        self.model.train()
        total_loss = 0
        num_batches = 0
        for batch_idx, (inputs, targets) in enumerate(dataloader):
            inputs = inputs.npu()
            targets = targets.npu()
            loss = self.train_step(inputs, targets, batch_idx)
            total_loss += loss
            num_batches = batch_idx + 1
        if num_batches % self.accumulation_steps != 0:
            # Apply the partially accumulated tail window.
            self.optimizer.step()
            self.optimizer.zero_grad()
        return total_loss / len(dataloader)
# Usage example
model = models.resnet50().npu()
optimizer = torch.optim.SGD(model.parameters(), lr=0.01)
# Gradient accumulation: effective batch size is 4x the dataloader batch size
trainer = GradientAccumulation(model, optimizer, accumulation_steps=4)
2. 检查点技术
python
from torch.utils.checkpoint import checkpoint
class CheckpointModel(torch.nn.Module):
    """Wrap a ResNet-style model so its stages run under activation checkpointing.

    Intermediate activations of layer1..layer4 are not stored; they are
    recomputed during backward, trading compute for peak memory.
    """

    def __init__(self, original_model):
        super().__init__()
        self.model = original_model

    def forward(self, x):
        if hasattr(self.model, 'layer1'):
            # Fix: pass use_reentrant=False — the reentrant default is
            # deprecated (warns on recent PyTorch) and fails when the input
            # does not require grad.
            x = checkpoint(self.model.layer1, x, use_reentrant=False)
            x = checkpoint(self.model.layer2, x, use_reentrant=False)
            x = checkpoint(self.model.layer3, x, use_reentrant=False)
            x = checkpoint(self.model.layer4, x, use_reentrant=False)
            x = self.model.avgpool(x)
            x = torch.flatten(x, 1)
            x = self.model.fc(x)
        else:
            # Models without the expected stage layout run unchanged.
            x = self.model(x)
        return x
# Usage example
original_model = models.resnet50().npu()
checkpoint_model = CheckpointModel(original_model).npu()

# Compare peak memory of one forward pass
input_data = torch.randn(32, 3, 224, 224).npu()

# Original model
torch.npu.reset_peak_memory_stats()
_ = original_model(input_data)
original_memory = torch.npu.max_memory_allocated() / 1024**3
# Checkpointed model
torch.npu.reset_peak_memory_stats()
_ = checkpoint_model(input_data)
checkpoint_memory = torch.npu.max_memory_allocated() / 1024**3
print(f"原始模型内存: {original_memory:.2f} GB")
print(f"检查点模型内存: {checkpoint_memory:.2f} GB")
print(f"节省: {(1 - checkpoint_memory/original_memory)*100:.1f}%")
分布式性能优化
1. 通信优化
python
import torch.distributed as dist
class CommunicationOptimizer:
    """Gradient-communication helpers for data-parallel training."""

    def __init__(self, model):
        self.model = model

    def overlap_communication_computation(self, inputs, targets):
        """Forward/backward, then launch one async all-reduce per gradient and wait."""
        loss = torch.nn.functional.cross_entropy(self.model(inputs), targets)
        loss.backward()
        # Kick off all reductions without blocking between them.
        pending = [
            dist.all_reduce(param.grad, async_op=True)
            for param in self.model.parameters()
            if param.grad is not None
        ]
        # Block until every reduction has completed.
        for work in pending:
            work.wait()
        return loss.item()

    def gradient_compression(self, threshold=0.01):
        """Zero out small gradients and all-reduce the sparsified result."""
        for param in self.model.parameters():
            grad = param.grad
            if grad is None:
                continue
            # Keep only entries whose magnitude exceeds the threshold.
            keep = torch.abs(grad) > threshold
            compressed_grad = grad * keep
            dist.all_reduce(compressed_grad)
            param.grad = compressed_grad
# Usage example (requires an initialized distributed environment)
# dist.init_process_group(backend='hccl')
# model = models.resnet50().npu()
# optimizer = CommunicationOptimizer(model)
2. 流水线并行
python
class PipelineParallel:
    """Split a ResNet-style model into stages and run micro-batches stage by stage.

    NOTE: this is a didactic, single-device simulation of a pipeline schedule —
    stages run sequentially over all micro-batches.
    """

    def __init__(self, model, num_stages=4):
        self.num_stages = num_stages
        self.stages = self.split_model(model)

    def split_model(self, model):
        """Partition *model* into four sequential stages (expects ResNet layout)."""
        if not hasattr(model, 'layer1'):
            # Unknown layout: no stages are produced.
            return []
        head = torch.nn.Sequential(
            model.conv1,
            model.bn1,
            model.relu,
            model.maxpool,
            model.layer1
        )
        tail = torch.nn.Sequential(
            model.layer4,
            model.avgpool,
            torch.nn.Flatten(),
            model.fc
        )
        return [head, model.layer2, model.layer3, tail]

    def forward_pipeline(self, inputs, micro_batch_size):
        """Push each micro-batch through every stage; concatenate final outputs."""
        buffers = list(torch.split(inputs, micro_batch_size))
        outputs = []
        last = len(self.stages) - 1
        for stage_idx, stage in enumerate(self.stages):
            # Advance every micro-batch by one stage.
            buffers = [stage(chunk) for chunk in buffers]
            if stage_idx == last:
                outputs = buffers
        return torch.cat(outputs, dim=0)
# Usage example
model = models.resnet50().npu()
pipeline = PipelineParallel(model, num_stages=4)
input_data = torch.randn(32, 3, 224, 224).npu()
# 32 samples processed as 4 micro-batches of 8
output = pipeline.forward_pipeline(input_data, micro_batch_size=8)
推理优化
1. 模型量化
python
import torch.quantization as quantization
class ModelQuantizer:
    """INT8 quantization helpers: dynamic (weight-only) and static post-training."""

    def __init__(self, model):
        self.model = model

    def quantize_dynamic(self):
        """Weight-only dynamic INT8 quantization of Linear/Conv2d modules."""
        target_types = {torch.nn.Linear, torch.nn.Conv2d}
        return quantization.quantize_dynamic(
            self.model,
            target_types,
            dtype=torch.qint8
        )

    def quantize_static(self, calibration_data):
        """Static quantization (weights + activations): calibrate, then convert."""
        self.model.eval()
        self.model.qconfig = quantization.get_default_qconfig('fbgemm')
        prepared = quantization.prepare(self.model)
        # Feed calibration batches so observers record activation ranges.
        with torch.no_grad():
            for sample in calibration_data:
                prepared(sample)
        return quantization.convert(prepared)

    def compare_performance(self, original_model, quantized_model, test_data):
        """Print a latency/memory comparison between the two models."""
        monitor = PerformanceMonitor()
        latency_original = monitor.measure_latency(original_model, test_data)
        memory_original = monitor.get_memory_usage()
        latency_quantized = monitor.measure_latency(quantized_model, test_data)
        memory_quantized = monitor.get_memory_usage()
        print("\n=== 量化对比 ===")
        print(f"原始模型延迟: {latency_original['mean']:.2f} ms")
        print(f"量化模型延迟: {latency_quantized['mean']:.2f} ms")
        print(f"加速比: {latency_original['mean'] / latency_quantized['mean']:.2f}x")
        print(f"\n原始模型内存: {memory_original['allocated']:.2f} GB")
        print(f"量化模型内存: {memory_quantized['allocated']:.2f} GB")
        print(f"内存节省: {(1 - memory_quantized['allocated']/memory_original['allocated'])*100:.1f}%")
# Usage example
model = models.resnet50().npu()
quantizer = ModelQuantizer(model)

# Dynamic (weight-only) quantization
quantized_model = quantizer.quantize_dynamic()

# Compare latency and memory between the two models
test_data = torch.randn(32, 3, 224, 224).npu()
quantizer.compare_performance(model, quantized_model, test_data)
2. 模型剪枝
python
import torch.nn.utils.prune as prune
class ModelPruner:
    """Weight-pruning utilities: unstructured, structured, and iterative."""

    def __init__(self, model):
        self.model = model

    def prune_unstructured(self, amount=0.3):
        """Globally prune the smallest-|w| entries across Conv2d/Linear weights."""
        targets = [
            (module, 'weight')
            for module in self.model.modules()
            if isinstance(module, (torch.nn.Conv2d, torch.nn.Linear))
        ]
        prune.global_unstructured(
            targets,
            pruning_method=prune.L1Unstructured,
            amount=amount
        )
        # Bake the masks into the weights and drop the reparameterization.
        for module, param_name in targets:
            prune.remove(module, param_name)
        return self.model

    def prune_structured(self, amount=0.3):
        """L2-norm channel pruning on every Conv2d's output channels (dim=0)."""
        for module in self.model.modules():
            if not isinstance(module, torch.nn.Conv2d):
                continue
            prune.ln_structured(
                module,
                name='weight',
                amount=amount,
                n=2,
                dim=0
            )
            prune.remove(module, 'weight')
        return self.model

    def iterative_pruning(self, dataloader, epochs=10, prune_rate=0.2):
        """Alternate one training epoch with a gradually increasing pruning amount."""
        for epoch in range(epochs):
            self.train_epoch(dataloader)
            # Ramp linearly toward prune_rate over the epochs.
            current_amount = prune_rate * (epoch + 1) / epochs
            self.prune_unstructured(amount=current_amount)
            print(f"Epoch {epoch + 1}, 剪枝率: {current_amount*100:.1f}%")
        return self.model

    def train_epoch(self, dataloader):
        """Placeholder training loop (intentionally a no-op)."""
        self.model.train()
        pass

    def calculate_sparsity(self):
        """Print and return the fraction of exactly-zero parameters."""
        total_params = 0
        zero_params = 0
        for param in self.model.parameters():
            total_params += param.numel()
            zero_params += int((param == 0).sum())
        sparsity = zero_params / total_params
        print(f"\n模型稀疏度: {sparsity*100:.2f}%")
        print(f"总参数: {total_params:,}")
        print(f"零参数: {zero_params:,}")
        return sparsity
# Usage example
model = models.resnet50().npu()
pruner = ModelPruner(model)

# Sparsity before pruning
print("剪枝前:")
pruner.calculate_sparsity()

# Globally prune 30% of Conv/Linear weights
pruned_model = pruner.prune_unstructured(amount=0.3)

# Sparsity after pruning
print("\n剪枝后:")
pruner.calculate_sparsity()
3. 知识蒸馏
python
class KnowledgeDistillation:
    """Distill a frozen teacher into a student (soft KL term + hard CE term)."""

    def __init__(self, teacher_model, student_model, temperature=3.0, alpha=0.7):
        self.teacher = teacher_model
        self.student = student_model
        self.temperature = temperature  # softens both distributions
        self.alpha = alpha              # weight of the soft (teacher) term
        self.teacher.eval()

    def distillation_loss(self, student_logits, teacher_logits, labels):
        """Return alpha * T^2 * KL(student_T || teacher_T) + (1 - alpha) * CE."""
        F = torch.nn.functional
        T = self.temperature
        # T^2 keeps the soft-term gradient magnitude independent of T.
        soft_loss = F.kl_div(
            F.log_softmax(student_logits / T, dim=1),
            F.softmax(teacher_logits / T, dim=1),
            reduction='batchmean'
        ) * (T ** 2)
        hard_loss = F.cross_entropy(student_logits, labels)
        return self.alpha * soft_loss + (1 - self.alpha) * hard_loss

    def train_step(self, inputs, labels, optimizer):
        """One distillation update on the student; returns the loss value."""
        with torch.no_grad():
            teacher_logits = self.teacher(inputs)  # teacher stays frozen
        student_logits = self.student(inputs)
        loss = self.distillation_loss(student_logits, teacher_logits, labels)
        optimizer.zero_grad()
        loss.backward()
        optimizer.step()
        return loss.item()

    def train_epoch(self, dataloader, optimizer):
        """Train the student for one epoch; returns the mean batch loss."""
        self.student.train()
        total_loss = 0
        for batch_idx, (inputs, labels) in enumerate(dataloader):
            total_loss += self.train_step(inputs.npu(), labels.npu(), optimizer)
            if (batch_idx + 1) % 100 == 0:
                avg_loss = total_loss / (batch_idx + 1)
                print(f"Batch {batch_idx + 1}, Loss: {avg_loss:.4f}")
        return total_loss / len(dataloader)
# Usage example: distill ResNet-50 (teacher) into ResNet-18 (student)
teacher = models.resnet50(pretrained=True).npu()
student = models.resnet18().npu()
distiller = KnowledgeDistillation(teacher, student)
optimizer = torch.optim.SGD(student.parameters(), lr=0.01)

# Training loop:
# for epoch in range(10):
#     loss = distiller.train_epoch(dataloader, optimizer)
#     print(f"Epoch {epoch + 1}, Loss: {loss:.4f}")
端到端优化案例
1. ResNet推理优化
python
class OptimizedResNetInference:
    """ResNet-50 inference pipeline with stacked optimizations.

    Applies, in order: Conv+BN fusion, FP16 conversion, NPU placement, and
    TorchScript tracing + inference-only graph optimization.  The order
    matters: fusion must precede tracing so the traced graph contains the
    fused convolutions.
    """

    def __init__(self, model_path):
        # Load FP32 weights from disk
        self.model = models.resnet50()
        self.model.load_state_dict(torch.load(model_path))
        self.model.eval()
        # Optimization 1: fold BatchNorm into preceding convolutions
        self.model = OperatorFusion.fuse_model(self.model)
        # Optimization 2: half precision
        self.model = self.model.half()
        # Optimization 3: move to the NPU
        self.model = self.model.npu()
        # Optimization 4: JIT trace + inference-only graph optimization.
        # NOTE(review): the trace is captured at batch size 1; presumably the
        # traced graph still accepts other batch sizes via the dynamic batch
        # dimension — confirm before relying on batch_infer.
        dummy_input = torch.randn(1, 3, 224, 224).half().npu()
        self.model = torch.jit.trace(self.model, dummy_input)
        self.model = torch.jit.optimize_for_inference(self.model)

    def preprocess(self, image):
        """Standard ImageNet preprocessing: resize, center-crop, normalize."""
        from torchvision import transforms
        transform = transforms.Compose([
            transforms.Resize(256),
            transforms.CenterCrop(224),
            transforms.ToTensor(),
            transforms.Normalize(
                mean=[0.485, 0.456, 0.406],
                std=[0.229, 0.224, 0.225]
            )
        ])
        return transform(image)

    def infer(self, image):
        """Classify one PIL image; returns a 1-D tensor of class probabilities."""
        # Preprocess and add the batch dimension
        input_tensor = self.preprocess(image).unsqueeze(0).half().npu()
        with torch.no_grad():
            output = self.model(input_tensor)
        # Softmax over the single sample's logits
        probabilities = torch.nn.functional.softmax(output[0], dim=0)
        return probabilities

    def batch_infer(self, images, batch_size=32):
        """Classify a list of PIL images in fixed-size batches.

        Returns a list of per-image probability vectors (numpy arrays).
        """
        results = []
        for i in range(0, len(images), batch_size):
            batch_images = images[i:i+batch_size]
            # Preprocess and stack into a single NPU batch
            batch_tensors = torch.stack([
                self.preprocess(img) for img in batch_images
            ]).half().npu()
            with torch.no_grad():
                outputs = self.model(batch_tensors)
            # Row-wise softmax, then move results to CPU for the caller
            probabilities = torch.nn.functional.softmax(outputs, dim=1)
            results.extend(probabilities.cpu().numpy())
        return results
# Usage example
from PIL import Image

# Build the optimized inference pipeline from a checkpoint file
inferencer = OptimizedResNetInference('resnet50.pth')

# Single-image inference
image = Image.open('test.jpg')
probs = inferencer.infer(image)
print(f"Top-1 概率: {probs.max().item():.4f}")

# Batch inference over many images
images = [Image.open(f'test_{i}.jpg') for i in range(100)]
results = inferencer.batch_infer(images, batch_size=32)
print(f"处理了 {len(results)} 张图像")
2. 性能基准测试
python
class PerformanceBenchmark:
    """Benchmark one or several models across batch sizes."""

    def __init__(self):
        # batch_size -> metrics dict from the most recent benchmark_model() run
        self.results = {}

    def benchmark_model(self, model, input_shape, batch_sizes=(1, 8, 16, 32, 64)):
        """Measure latency/throughput/memory for each batch size; stops on OOM.

        Fix: the default was a mutable list (shared across calls); use a tuple.
        """
        model.eval()
        monitor = PerformanceMonitor()
        print(f"\n=== 基准测试: {model.__class__.__name__} ===")
        print(f"输入形状: {input_shape}")
        for batch_size in batch_sizes:
            try:
                input_data = torch.randn(batch_size, *input_shape).npu()
                latency_stats = monitor.measure_latency(model, input_data)
                throughput = monitor.measure_throughput(
                    model,
                    batch_size,
                    input_shape,
                    duration=5
                )
                memory_info = monitor.get_memory_usage()
                self.results[batch_size] = {
                    'latency_mean': latency_stats['mean'],
                    'latency_p99': latency_stats['p99'],
                    'throughput': throughput,
                    'memory': memory_info['allocated']
                }
                print(f"\nBatch Size: {batch_size}")
                print(f" 延迟 (mean): {latency_stats['mean']:.2f} ms")
                print(f" 延迟 (p99): {latency_stats['p99']:.2f} ms")
                print(f" 吞吐量: {throughput:.2f} samples/sec")
                print(f" 内存: {memory_info['allocated']:.2f} GB")
            except RuntimeError as e:
                if "out of memory" in str(e):
                    # OOM ends the sweep — larger batches will also fail.
                    print(f"\nBatch Size {batch_size}: 内存不足")
                    break
                else:
                    # Preserve the original traceback for non-OOM errors.
                    raise

    def compare_models(self, models_dict, input_shape, batch_size=32):
        """Print a latency/throughput/memory table for several models."""
        print(f"\n=== 模型对比 (Batch Size: {batch_size}) ===")
        print(f"{'模型':<20} {'延迟(ms)':<15} {'吞吐量(samples/s)':<20} {'内存(GB)':<15}")
        print("-" * 70)
        for name, model in models_dict.items():
            model.eval()
            monitor = PerformanceMonitor()
            input_data = torch.randn(batch_size, *input_shape).npu()
            latency = monitor.measure_latency(model, input_data)
            throughput = monitor.measure_throughput(model, batch_size, input_shape, duration=5)
            memory = monitor.get_memory_usage()
            print(f"{name:<20} {latency['mean']:<15.2f} {throughput:<20.2f} {memory['allocated']:<15.2f}")
# Usage example
benchmark = PerformanceBenchmark()

# Single-model benchmark across the default batch sizes
model = models.resnet50().npu()
benchmark.benchmark_model(model, input_shape=(3, 224, 224))

# Multi-model comparison at one fixed batch size
models_dict = {
    'ResNet18': models.resnet18().npu(),
    'ResNet50': models.resnet50().npu(),
    'ResNet101': models.resnet101().npu()
}
benchmark.compare_models(models_dict, input_shape=(3, 224, 224), batch_size=32)
总结
CANN性能分析与优化要点:
- Profiling工具:性能数据采集和分析
- 性能监控:延迟、吞吐量、资源使用
- 模型优化:算子融合、混合精度、动态shape
- 内存优化:梯度累积、检查点技术
- 分布式优化:通信优化、流水线并行
- 推理优化:量化、剪枝、知识蒸馏
- 端到端优化:综合应用各种优化技术
通过CANN的atc和runtime工具,可以系统地分析和优化模型性能,实现高效的AI应用部署。
相关链接
atc仓库地址:https://atomgit.com/cann/atc
runtime仓库地址:https://atomgit.com/cann/runtime
CANN组织地址:https://atomgit.com/cann