本文基于CANN开源社区的多个仓库进行应用案例讲解
CANN组织地址:https://atomgit.com/cann
atc仓库地址:https://atomgit.com/cann/atc
runtime仓库地址:https://atomgit.com/cann/runtime
前言
性能优化是深度学习模型部署的关键环节。CANN提供了完整的性能分析工具链,包括profiling工具、性能数据采集、可视化分析等功能。
本文将展示如何使用CANN的性能分析工具定位性能瓶颈,并通过各种优化手段提升模型推理和训练性能。
Profiling基础
1. 启用性能分析
python
import torch
import torch_npu
from torch_npu.profiler import Profile
class PerformanceProfiler:
    """Profile a model on the NPU with torch_npu's profiler and export a Chrome trace."""

    def __init__(self, output_path='./profiling_data'):
        # Directory that trace.json is written into; must exist before export.
        self.output_path = output_path

    def profile_model(self, model, input_data, warmup_steps=10, profile_steps=20):
        """Warm up the model, then run profiled inference steps.

        Returns the profiler object so callers can query key_averages().

        Fix: the original referenced ``ProfilerActivity`` without importing it
        (the top-level import only brought in ``Profile``), raising NameError.
        Both names are imported locally from torch_npu.profiler here.
        """
        from torch_npu.profiler import profile, ProfilerActivity

        model.eval()
        # Warmup excludes one-time costs (graph build, cache fill) from the trace.
        print(f"预热 {warmup_steps} 步...")
        with torch.no_grad():
            for _ in range(warmup_steps):
                _ = model(input_data)
        print(f"开始profiling {profile_steps} 步...")
        with profile(
            activities=[ProfilerActivity.CPU, ProfilerActivity.NPU],
            record_shapes=True,
            profile_memory=True,
            with_stack=True
        ) as prof:
            with torch.no_grad():
                for _ in range(profile_steps):
                    _ = model(input_data)
        # The trace can be loaded in chrome://tracing or Perfetto.
        prof.export_chrome_trace(f"{self.output_path}/trace.json")
        return prof

    def print_summary(self, prof):
        """Print the top 20 ops sorted by total NPU time."""
        print("\n=== 性能摘要 ===")
        print(prof.key_averages().table(
            sort_by="npu_time_total",
            row_limit=20
        ))

    def analyze_memory(self, prof):
        """Print the top 10 ops sorted by NPU memory usage."""
        print("\n=== 内存分析 ===")
        print(prof.key_averages().table(
            sort_by="npu_memory_usage",
            row_limit=10
        ))
# Usage example
import torchvision.models as models

# Build the model and a sample batch on the NPU
model = models.resnet50().npu()
input_data = torch.randn(32, 3, 224, 224).npu()

# Run the profiler (10 warmup steps + 20 profiled steps by default)
profiler = PerformanceProfiler()
prof = profiler.profile_model(model, input_data)

# Inspect the results: op-level timing table and memory table
profiler.print_summary(prof)
profiler.analyze_memory(prof)
2. 算子级别分析
python
class OperatorProfiler:
    """Record per-module call counts and I/O shapes via forward hooks."""

    def __init__(self):
        # name -> {'count', 'total_time', 'input_shape', 'output_shape'}
        self.op_stats = {}

    def profile_operators(self, model, input_data, iterations=100):
        """Run *iterations* forward passes and collect per-leaf-module stats.

        Fix: removed an unused ``import time`` (nothing here measures time;
        the 'total_time' field is kept for interface compatibility but stays 0).
        The hook argument no longer shadows the builtin ``input``.
        """
        model.eval()
        hooks = []

        def make_hook(name):
            # Bind *name* per module; the hook fires after each forward call.
            def hook(module, inputs, output):
                if name not in self.op_stats:
                    self.op_stats[name] = {
                        'count': 0,
                        'total_time': 0,
                        'input_shape': [],
                        'output_shape': []
                    }
                # Record the most recent call's shapes.
                if isinstance(inputs, tuple):
                    self.op_stats[name]['input_shape'] = [
                        list(t.shape) if hasattr(t, 'shape') else None
                        for t in inputs
                    ]
                if hasattr(output, 'shape'):
                    self.op_stats[name]['output_shape'] = list(output.shape)
                self.op_stats[name]['count'] += 1
            return hook

        # Only leaf modules correspond to actual operators.
        for name, module in model.named_modules():
            if len(list(module.children())) == 0:
                hooks.append(module.register_forward_hook(make_hook(name)))

        with torch.no_grad():
            for _ in range(iterations):
                _ = model(input_data)

        # Always detach the hooks so the model is left unmodified.
        for hook in hooks:
            hook.remove()
        return self.op_stats

    def print_operator_stats(self):
        """Print a table of modules sorted by call count (descending)."""
        print("\n=== 算子统计 ===")
        print(f"{'算子名称':<50} {'调用次数':<10} {'输入形状':<30} {'输出形状':<30}")
        print("-" * 120)
        for name, stats in sorted(self.op_stats.items(),
                                  key=lambda x: x[1]['count'],
                                  reverse=True):
            input_shape = str(stats['input_shape'])[:28]
            output_shape = str(stats['output_shape'])[:28]
            print(f"{name:<50} {stats['count']:<10} {input_shape:<30} {output_shape:<30}")
# Usage example: hook-based per-operator statistics for the model above
op_profiler = OperatorProfiler()
op_stats = op_profiler.profile_operators(model, input_data)
op_profiler.print_operator_stats()
性能指标监控
1. 实时性能监控
python
import time
import psutil
import torch
import torch_npu
class PerformanceMonitor:
    """Collect and report latency, throughput, NPU utilization and memory metrics."""

    def __init__(self):
        # Each key accumulates one entry per call of the matching method.
        self.metrics = {
            'latency': [],
            'throughput': [],
            'npu_utilization': [],
            'memory_usage': []
        }

    @staticmethod
    def _latency_stats(latencies):
        """Summarize *latencies* (ms) into mean/min/max/p50/p95/p99.

        Fix: the original sorted the list once per percentile (three times in
        measure_latency and three more in print_report); sort once here.
        """
        ordered = sorted(latencies)
        n = len(ordered)
        return {
            'mean': sum(ordered) / n,
            'min': ordered[0],
            'max': ordered[-1],
            'p50': ordered[n // 2],
            'p95': ordered[int(n * 0.95)],
            'p99': ordered[int(n * 0.99)]
        }

    def measure_latency(self, model, input_data, iterations=100):
        """Measure per-call latency in ms, synchronizing the NPU around each call."""
        model.eval()
        latencies = []
        # Warmup so compilation/caching does not pollute the measurement.
        with torch.no_grad():
            for _ in range(10):
                _ = model(input_data)
        with torch.no_grad():
            for _ in range(iterations):
                torch.npu.synchronize()  # drain previously queued work
                start = time.time()
                _ = model(input_data)
                torch.npu.synchronize()  # wait for this call to finish
                end = time.time()
                latencies.append((end - start) * 1000)  # seconds -> ms
        self.metrics['latency'] = latencies
        return self._latency_stats(latencies)

    def measure_throughput(self, model, batch_size, input_shape, duration=10):
        """Measure samples/sec by looping inference for about *duration* seconds."""
        model.eval()
        input_data = torch.randn(batch_size, *input_shape).npu()
        total_samples = 0
        start_time = time.time()
        with torch.no_grad():
            while time.time() - start_time < duration:
                _ = model(input_data)
                total_samples += batch_size
        elapsed = time.time() - start_time
        throughput = total_samples / elapsed
        self.metrics['throughput'].append(throughput)
        return throughput

    def get_npu_utilization(self):
        """Record and return the current device's utilization."""
        npu_id = torch.npu.current_device()
        utilization = torch.npu.utilization(npu_id)
        self.metrics['npu_utilization'].append(utilization)
        return utilization

    def get_memory_usage(self):
        """Record and return allocated/reserved/free NPU memory in GB.

        'free' is the slack inside the caching allocator's reserved pool,
        not device-wide free memory.
        """
        npu_id = torch.npu.current_device()
        allocated = torch.npu.memory_allocated(npu_id) / 1024**3  # GB
        reserved = torch.npu.memory_reserved(npu_id) / 1024**3  # GB
        memory_info = {
            'allocated': allocated,
            'reserved': reserved,
            'free': reserved - allocated
        }
        self.metrics['memory_usage'].append(memory_info)
        return memory_info

    def print_report(self):
        """Print everything recorded so far."""
        print("\n=== 性能报告 ===")
        if self.metrics['latency']:
            stats = self._latency_stats(self.metrics['latency'])
            print(f"\n延迟统计 (ms):")
            print(f" 平均: {stats['mean']:.2f}")
            print(f" 最小: {stats['min']:.2f}")
            print(f" 最大: {stats['max']:.2f}")
            print(f" P50: {stats['p50']:.2f}")
            print(f" P95: {stats['p95']:.2f}")
            print(f" P99: {stats['p99']:.2f}")
        if self.metrics['throughput']:
            print(f"\n吞吐量: {self.metrics['throughput'][-1]:.2f} samples/sec")
        if self.metrics['memory_usage']:
            mem = self.metrics['memory_usage'][-1]
            print(f"\n内存使用:")
            print(f" 已分配: {mem['allocated']:.2f} GB")
            print(f" 已保留: {mem['reserved']:.2f} GB")
            print(f" 空闲: {mem['free']:.2f} GB")
# Usage example
monitor = PerformanceMonitor()

# Latency (per-call wall time with NPU synchronization)
latency_stats = monitor.measure_latency(model, input_data)
print(f"延迟统计: {latency_stats}")

# Throughput (samples processed per second over a fixed duration)
throughput = monitor.measure_throughput(model, batch_size=32, input_shape=(3, 224, 224))
print(f"吞吐量: {throughput:.2f} samples/sec")

# Current NPU memory usage
memory_info = monitor.get_memory_usage()
print(f"内存使用: {memory_info}")

# Full report of everything recorded above
monitor.print_report()
2. 批量大小优化
python
class BatchSizeOptimizer:
    """Sweep batch sizes geometrically and report the highest-throughput one."""

    def __init__(self, model, input_shape):
        self.model = model
        # Per-sample shape, e.g. (3, 224, 224); the batch dim is prepended later.
        self.input_shape = input_shape
        self.monitor = PerformanceMonitor()

    def find_optimal_batch_size(self, min_batch=1, max_batch=256, step=2):
        """Try batch sizes min_batch, min_batch*step, ... up to max_batch.

        Stops early on OOM. Returns the best result dict, or None if no
        batch size could be measured.

        Fix: the original allocated an ``input_data`` tensor per iteration
        that was never used (measure_throughput builds its own input),
        wasting device memory and inflating the memory reading.
        """
        results = []
        batch_size = min_batch
        while batch_size <= max_batch:
            try:
                print(f"\n测试批量大小: {batch_size}")
                throughput = self.monitor.measure_throughput(
                    self.model,
                    batch_size,
                    self.input_shape,
                    duration=5
                )
                memory_info = self.monitor.get_memory_usage()
                results.append({
                    'batch_size': batch_size,
                    'throughput': throughput,
                    'memory_allocated': memory_info['allocated']
                })
                print(f" 吞吐量: {throughput:.2f} samples/sec")
                print(f" 内存: {memory_info['allocated']:.2f} GB")
                batch_size *= step
            except RuntimeError as e:
                if "out of memory" in str(e):
                    # OOM marks the upper bound of the sweep; stop cleanly.
                    print(f" 批量大小 {batch_size} 超出内存限制")
                    break
                else:
                    # Preserve the original traceback for non-OOM errors.
                    raise
        # Pick the measured configuration with the best throughput.
        if results:
            optimal = max(results, key=lambda x: x['throughput'])
            print(f"\n=== 最优批量大小 ===")
            print(f"批量大小: {optimal['batch_size']}")
            print(f"吞吐量: {optimal['throughput']:.2f} samples/sec")
            print(f"内存使用: {optimal['memory_allocated']:.2f} GB")
            return optimal
        return None
# Usage example: sweep batch sizes (1, 2, 4, ... up to 256) for best throughput
optimizer = BatchSizeOptimizer(model, input_shape=(3, 224, 224))
optimal_config = optimizer.find_optimal_batch_size(min_batch=1, max_batch=256)
模型优化技术
1. 算子融合
python
import torch
import torch_npu
class OperatorFusion:
    """Fold BatchNorm statistics into the preceding convolution (inference only)."""

    @staticmethod
    def fuse_conv_bn(conv, bn):
        """Return a Conv2d equivalent to bn(conv(x)), using bn's running stats.

        Valid for inference only: it uses running_mean/running_var, not batch
        statistics, so the modules should be in eval mode.
        """
        conv_weight = conv.weight
        # Fix: create the fallback zero bias on the conv weight's device/dtype;
        # the original built it on the CPU, which breaks NPU/FP16 models.
        if conv.bias is not None:
            conv_bias = conv.bias
        else:
            conv_bias = torch.zeros(
                conv.out_channels,
                device=conv_weight.device,
                dtype=conv_weight.dtype
            )
        # y = (Wx + c - mean) / std * gamma + beta
        #   = (gamma/std) * Wx + (c - mean) * gamma/std + beta
        with torch.no_grad():
            bn_std = torch.sqrt(bn.running_var + bn.eps)
            scale = bn.weight / bn_std
            fused_weight = conv_weight * scale.reshape(-1, 1, 1, 1)
            fused_bias = (conv_bias - bn.running_mean) * scale + bn.bias
        fused_conv = torch.nn.Conv2d(
            conv.in_channels,
            conv.out_channels,
            conv.kernel_size,
            conv.stride,
            conv.padding,
            conv.dilation,
            conv.groups,
            bias=True,
            padding_mode=conv.padding_mode  # fix: was silently dropped
        )
        fused_conv.weight.data = fused_weight
        fused_conv.bias.data = fused_bias
        return fused_conv

    @staticmethod
    def fuse_model(model):
        """Fuse every Conv2d directly followed by BatchNorm2d inside nn.Sequential.

        Mutates *model* in place (the BN slot becomes Identity) and returns it.
        Note: only direct children of Sequential containers are matched, so
        e.g. the conv/bn pairs inside torchvision Bottleneck blocks are skipped.
        """
        modules_to_fuse = []
        for name, module in model.named_modules():
            if isinstance(module, torch.nn.Sequential):
                # Look for adjacent Conv2d -> BatchNorm2d pairs.
                for i in range(len(module) - 1):
                    if isinstance(module[i], torch.nn.Conv2d) and \
                       isinstance(module[i+1], torch.nn.BatchNorm2d):
                        modules_to_fuse.append((name, i))
        for module_name, idx in modules_to_fuse:
            parent = dict(model.named_modules())[module_name]
            fused = OperatorFusion.fuse_conv_bn(parent[idx], parent[idx + 1])
            # Replace in place; Identity keeps subsequent indices valid.
            parent[idx] = fused
            parent[idx + 1] = torch.nn.Identity()
        return model
# Usage example
# Before fusion
model_original = models.resnet50().npu()
input_data = torch.randn(32, 3, 224, 224).npu()
monitor = PerformanceMonitor()
latency_before = monitor.measure_latency(model_original, input_data)
print(f"融合前延迟: {latency_before['mean']:.2f} ms")

# After fusion
# NOTE(review): fuse_model mutates the model in place, so model_original and
# model_fused are the same object after this call.
model_fused = OperatorFusion.fuse_model(model_original)
latency_after = monitor.measure_latency(model_fused, input_data)
print(f"融合后延迟: {latency_after['mean']:.2f} ms")
print(f"加速比: {latency_before['mean'] / latency_after['mean']:.2f}x")
2. 混合精度训练
python
class MixedPrecisionTrainer:
    """Train an FP16 model with static loss scaling.

    The model is converted to half precision at construction time; the loss
    is multiplied by ``loss_scale`` before backward and every gradient is
    divided by the same factor before the optimizer step.
    """

    def __init__(self, model, optimizer, loss_scale=128.0):
        self.optimizer = optimizer
        self.loss_scale = loss_scale
        # Convert all parameters and buffers to FP16 (in place).
        self.model = model.half()

    def train_step(self, inputs, targets):
        """Run one scaled forward/backward/step; return the unscaled loss value."""
        logits = self.model(inputs.half())
        loss = torch.nn.functional.cross_entropy(logits, targets)
        self.optimizer.zero_grad()
        # Scale up before backward so small gradients survive FP16 underflow.
        (loss * self.loss_scale).backward()
        # Undo the scaling on every gradient before updating.
        for param in self.model.parameters():
            if param.grad is not None:
                param.grad.data = param.grad.data / self.loss_scale
        self.optimizer.step()
        return loss.item()

    def train_epoch(self, dataloader):
        """Iterate the dataloader once; print a running average every 100 batches."""
        self.model.train()
        total_loss = 0
        for batch_idx, (inputs, targets) in enumerate(dataloader):
            loss = self.train_step(inputs.npu(), targets.npu())
            total_loss += loss
            if (batch_idx + 1) % 100 == 0:
                avg_loss = total_loss / (batch_idx + 1)
                print(f"Batch {batch_idx + 1}, Loss: {avg_loss:.4f}")
        return total_loss / len(dataloader)
# Usage example
model = models.resnet50().npu()
optimizer = torch.optim.SGD(model.parameters(), lr=0.01)
trainer = MixedPrecisionTrainer(model, optimizer)

# With a real dataloader:
# for epoch in range(10):
#     loss = trainer.train_epoch(dataloader)
#     print(f"Epoch {epoch + 1}, Loss: {loss:.4f}")
3. 动态shape优化
python
class DynamicShapeOptimizer:
    """Cache a traced + inference-optimized copy of a model per input shape."""

    def __init__(self, model):
        self.model = model
        # str(shape) -> optimized TorchScript module
        self.shape_cache = {}

    def optimize_for_shape(self, input_shape):
        """Trace and optimize the model for *input_shape*, reusing the cache."""
        key = str(input_shape)
        cached = self.shape_cache.get(key)
        if cached is not None:
            return cached
        # First time for this shape: trace with a dummy input of that shape.
        example = torch.randn(*input_shape).npu()
        traced = torch.jit.trace(self.model, example)
        traced = torch.jit.optimize_for_inference(traced)
        self.shape_cache[key] = traced
        return traced

    def infer(self, input_data):
        """Run inference through the shape-specialized model."""
        runner = self.optimize_for_shape(tuple(input_data.shape))
        with torch.no_grad():
            output = runner(input_data)
        return output
# Usage example
model = models.resnet50().npu()
optimizer = DynamicShapeOptimizer(model)

# Inputs with different shapes; each shape is traced once, then cached
shapes = [
    (1, 3, 224, 224),
    (8, 3, 224, 224),
    (32, 3, 224, 224)
]
for shape in shapes:
    input_data = torch.randn(*shape).npu()
    # The first call per shape pays the trace/optimize cost
    start = time.time()
    output = optimizer.infer(input_data)
    elapsed = time.time() - start
    print(f"Shape {shape}: {elapsed*1000:.2f} ms")
内存优化
1. 梯度累积
python
class GradientAccumulation:
    """Accumulate gradients over several micro-batches before each optimizer step."""

    def __init__(self, model, optimizer, accumulation_steps=4):
        self.model = model
        self.optimizer = optimizer
        # Number of micro-batches folded into one parameter update.
        self.accumulation_steps = accumulation_steps

    def train_step(self, inputs, targets, step):
        """One micro-batch: forward, scaled backward, step every N batches.

        Returns the unscaled loss value for this micro-batch.
        """
        outputs = self.model(inputs)
        loss = torch.nn.functional.cross_entropy(outputs, targets)
        # Scale so the accumulated gradient equals the mean over the window.
        loss = loss / self.accumulation_steps
        loss.backward()
        if (step + 1) % self.accumulation_steps == 0:
            self.optimizer.step()
            self.optimizer.zero_grad()
        return loss.item() * self.accumulation_steps

    def train_epoch(self, dataloader):
        """Train one epoch; returns the mean micro-batch loss.

        Fix: if the batch count is not a multiple of accumulation_steps, the
        original never stepped the tail gradients, silently leaking them into
        the next epoch's first update. Flush any pending gradients here.
        """
        self.model.train()
        total_loss = 0
        num_batches = 0
        for batch_idx, (inputs, targets) in enumerate(dataloader):
            inputs = inputs.npu()
            targets = targets.npu()
            loss = self.train_step(inputs, targets, batch_idx)
            total_loss += loss
            num_batches = batch_idx + 1
        if num_batches % self.accumulation_steps != 0:
            # Apply the partially accumulated tail window.
            self.optimizer.step()
            self.optimizer.zero_grad()
        return total_loss / len(dataloader)
# Usage example
model = models.resnet50().npu()
optimizer = torch.optim.SGD(model.parameters(), lr=0.01)
# Gradient accumulation: effective batch size is 4x the dataloader batch size
trainer = GradientAccumulation(model, optimizer, accumulation_steps=4)
2. 检查点技术
python
from torch.utils.checkpoint import checkpoint
class CheckpointModel(torch.nn.Module):
    """Wrap a ResNet-style model so its stages run under activation checkpointing.

    Intermediate activations of layer1..layer4 are not stored; they are
    recomputed during backward, trading compute for peak memory.
    """

    def __init__(self, original_model):
        super().__init__()
        self.model = original_model

    def forward(self, x):
        if hasattr(self.model, 'layer1'):
            # Fix: pass use_reentrant=False — the reentrant default is
            # deprecated (warns on recent PyTorch) and fails when the input
            # does not require grad.
            x = checkpoint(self.model.layer1, x, use_reentrant=False)
            x = checkpoint(self.model.layer2, x, use_reentrant=False)
            x = checkpoint(self.model.layer3, x, use_reentrant=False)
            x = checkpoint(self.model.layer4, x, use_reentrant=False)
            x = self.model.avgpool(x)
            x = torch.flatten(x, 1)
            x = self.model.fc(x)
        else:
            # Models without the expected stage layout run unchanged.
            x = self.model(x)
        return x
# Usage example
original_model = models.resnet50().npu()
checkpoint_model = CheckpointModel(original_model).npu()

# Compare peak memory of one forward pass
input_data = torch.randn(32, 3, 224, 224).npu()

# Original model
torch.npu.reset_peak_memory_stats()
_ = original_model(input_data)
original_memory = torch.npu.max_memory_allocated() / 1024**3
# Checkpointed model
torch.npu.reset_peak_memory_stats()
_ = checkpoint_model(input_data)
checkpoint_memory = torch.npu.max_memory_allocated() / 1024**3
print(f"原始模型内存: {original_memory:.2f} GB")
print(f"检查点模型内存: {checkpoint_memory:.2f} GB")
print(f"节省: {(1 - checkpoint_memory/original_memory)*100:.1f}%")
分布式性能优化
1. 通信优化
python
import torch.distributed as dist
class CommunicationOptimizer:
    """Gradient-communication helpers for data-parallel training."""

    def __init__(self, model):
        self.model = model

    def overlap_communication_computation(self, inputs, targets):
        """Forward/backward, then launch one async all-reduce per gradient and wait."""
        loss = torch.nn.functional.cross_entropy(self.model(inputs), targets)
        loss.backward()
        # Kick off all reductions without blocking between them.
        pending = [
            dist.all_reduce(param.grad, async_op=True)
            for param in self.model.parameters()
            if param.grad is not None
        ]
        # Block until every reduction has completed.
        for work in pending:
            work.wait()
        return loss.item()

    def gradient_compression(self, threshold=0.01):
        """Zero out small gradients and all-reduce the sparsified result."""
        for param in self.model.parameters():
            grad = param.grad
            if grad is None:
                continue
            # Keep only entries whose magnitude exceeds the threshold.
            keep = torch.abs(grad) > threshold
            compressed_grad = grad * keep
            dist.all_reduce(compressed_grad)
            param.grad = compressed_grad
# Usage example (requires an initialized distributed environment)
# dist.init_process_group(backend='hccl')
# model = models.resnet50().npu()
# optimizer = CommunicationOptimizer(model)
2. 流水线并行
python
class PipelineParallel:
    """Split a ResNet-style model into stages and run micro-batches stage by stage.

    NOTE: this is a didactic, single-device simulation of a pipeline schedule —
    stages run sequentially over all micro-batches.
    """

    def __init__(self, model, num_stages=4):
        self.num_stages = num_stages
        self.stages = self.split_model(model)

    def split_model(self, model):
        """Partition *model* into four sequential stages (expects ResNet layout)."""
        if not hasattr(model, 'layer1'):
            # Unknown layout: no stages are produced.
            return []
        head = torch.nn.Sequential(
            model.conv1,
            model.bn1,
            model.relu,
            model.maxpool,
            model.layer1
        )
        tail = torch.nn.Sequential(
            model.layer4,
            model.avgpool,
            torch.nn.Flatten(),
            model.fc
        )
        return [head, model.layer2, model.layer3, tail]

    def forward_pipeline(self, inputs, micro_batch_size):
        """Push each micro-batch through every stage; concatenate final outputs."""
        buffers = list(torch.split(inputs, micro_batch_size))
        outputs = []
        last = len(self.stages) - 1
        for stage_idx, stage in enumerate(self.stages):
            # Advance every micro-batch by one stage.
            buffers = [stage(chunk) for chunk in buffers]
            if stage_idx == last:
                outputs = buffers
        return torch.cat(outputs, dim=0)
# Usage example
model = models.resnet50().npu()
pipeline = PipelineParallel(model, num_stages=4)
input_data = torch.randn(32, 3, 224, 224).npu()
# 32 samples processed as 4 micro-batches of 8
output = pipeline.forward_pipeline(input_data, micro_batch_size=8)
推理优化
1. 模型量化
python
import torch.quantization as quantization
class ModelQuantizer:
    """INT8 quantization helpers: dynamic (weight-only) and static post-training."""

    def __init__(self, model):
        self.model = model

    def quantize_dynamic(self):
        """Weight-only dynamic INT8 quantization of Linear/Conv2d modules."""
        target_types = {torch.nn.Linear, torch.nn.Conv2d}
        return quantization.quantize_dynamic(
            self.model,
            target_types,
            dtype=torch.qint8
        )

    def quantize_static(self, calibration_data):
        """Static quantization (weights + activations): calibrate, then convert."""
        self.model.eval()
        self.model.qconfig = quantization.get_default_qconfig('fbgemm')
        prepared = quantization.prepare(self.model)
        # Feed calibration batches so observers record activation ranges.
        with torch.no_grad():
            for sample in calibration_data:
                prepared(sample)
        return quantization.convert(prepared)

    def compare_performance(self, original_model, quantized_model, test_data):
        """Print a latency/memory comparison between the two models."""
        monitor = PerformanceMonitor()
        latency_original = monitor.measure_latency(original_model, test_data)
        memory_original = monitor.get_memory_usage()
        latency_quantized = monitor.measure_latency(quantized_model, test_data)
        memory_quantized = monitor.get_memory_usage()
        print("\n=== 量化对比 ===")
        print(f"原始模型延迟: {latency_original['mean']:.2f} ms")
        print(f"量化模型延迟: {latency_quantized['mean']:.2f} ms")
        print(f"加速比: {latency_original['mean'] / latency_quantized['mean']:.2f}x")
        print(f"\n原始模型内存: {memory_original['allocated']:.2f} GB")
        print(f"量化模型内存: {memory_quantized['allocated']:.2f} GB")
        print(f"内存节省: {(1 - memory_quantized['allocated']/memory_original['allocated'])*100:.1f}%")
# Usage example
model = models.resnet50().npu()
quantizer = ModelQuantizer(model)

# Dynamic (weight-only) quantization
quantized_model = quantizer.quantize_dynamic()

# Compare latency and memory between the two models
test_data = torch.randn(32, 3, 224, 224).npu()
quantizer.compare_performance(model, quantized_model, test_data)
2. 模型剪枝
python
import torch.nn.utils.prune as prune
class ModelPruner:
    """Weight-pruning utilities: unstructured, structured, and iterative."""

    def __init__(self, model):
        self.model = model

    def prune_unstructured(self, amount=0.3):
        """Globally prune the smallest-|w| entries across Conv2d/Linear weights."""
        targets = [
            (module, 'weight')
            for module in self.model.modules()
            if isinstance(module, (torch.nn.Conv2d, torch.nn.Linear))
        ]
        prune.global_unstructured(
            targets,
            pruning_method=prune.L1Unstructured,
            amount=amount
        )
        # Bake the masks into the weights and drop the reparameterization.
        for module, param_name in targets:
            prune.remove(module, param_name)
        return self.model

    def prune_structured(self, amount=0.3):
        """L2-norm channel pruning on every Conv2d's output channels (dim=0)."""
        for module in self.model.modules():
            if not isinstance(module, torch.nn.Conv2d):
                continue
            prune.ln_structured(
                module,
                name='weight',
                amount=amount,
                n=2,
                dim=0
            )
            prune.remove(module, 'weight')
        return self.model

    def iterative_pruning(self, dataloader, epochs=10, prune_rate=0.2):
        """Alternate one training epoch with a gradually increasing pruning amount."""
        for epoch in range(epochs):
            self.train_epoch(dataloader)
            # Ramp linearly toward prune_rate over the epochs.
            current_amount = prune_rate * (epoch + 1) / epochs
            self.prune_unstructured(amount=current_amount)
            print(f"Epoch {epoch + 1}, 剪枝率: {current_amount*100:.1f}%")
        return self.model

    def train_epoch(self, dataloader):
        """Placeholder training loop (intentionally a no-op)."""
        self.model.train()
        pass

    def calculate_sparsity(self):
        """Print and return the fraction of exactly-zero parameters."""
        total_params = 0
        zero_params = 0
        for param in self.model.parameters():
            total_params += param.numel()
            zero_params += int((param == 0).sum())
        sparsity = zero_params / total_params
        print(f"\n模型稀疏度: {sparsity*100:.2f}%")
        print(f"总参数: {total_params:,}")
        print(f"零参数: {zero_params:,}")
        return sparsity
# Usage example
model = models.resnet50().npu()
pruner = ModelPruner(model)

# Sparsity before pruning
print("剪枝前:")
pruner.calculate_sparsity()

# Globally prune 30% of Conv/Linear weights
pruned_model = pruner.prune_unstructured(amount=0.3)

# Sparsity after pruning
print("\n剪枝后:")
pruner.calculate_sparsity()
3. 知识蒸馏
python
class KnowledgeDistillation:
    """Distill a frozen teacher into a student (soft KL term + hard CE term)."""

    def __init__(self, teacher_model, student_model, temperature=3.0, alpha=0.7):
        self.teacher = teacher_model
        self.student = student_model
        self.temperature = temperature  # softens both distributions
        self.alpha = alpha              # weight of the soft (teacher) term
        self.teacher.eval()

    def distillation_loss(self, student_logits, teacher_logits, labels):
        """Return alpha * T^2 * KL(student_T || teacher_T) + (1 - alpha) * CE."""
        F = torch.nn.functional
        T = self.temperature
        # T^2 keeps the soft-term gradient magnitude independent of T.
        soft_loss = F.kl_div(
            F.log_softmax(student_logits / T, dim=1),
            F.softmax(teacher_logits / T, dim=1),
            reduction='batchmean'
        ) * (T ** 2)
        hard_loss = F.cross_entropy(student_logits, labels)
        return self.alpha * soft_loss + (1 - self.alpha) * hard_loss

    def train_step(self, inputs, labels, optimizer):
        """One distillation update on the student; returns the loss value."""
        with torch.no_grad():
            teacher_logits = self.teacher(inputs)  # teacher stays frozen
        student_logits = self.student(inputs)
        loss = self.distillation_loss(student_logits, teacher_logits, labels)
        optimizer.zero_grad()
        loss.backward()
        optimizer.step()
        return loss.item()

    def train_epoch(self, dataloader, optimizer):
        """Train the student for one epoch; returns the mean batch loss."""
        self.student.train()
        total_loss = 0
        for batch_idx, (inputs, labels) in enumerate(dataloader):
            total_loss += self.train_step(inputs.npu(), labels.npu(), optimizer)
            if (batch_idx + 1) % 100 == 0:
                avg_loss = total_loss / (batch_idx + 1)
                print(f"Batch {batch_idx + 1}, Loss: {avg_loss:.4f}")
        return total_loss / len(dataloader)
# Usage example: distill ResNet-50 (teacher) into ResNet-18 (student)
teacher = models.resnet50(pretrained=True).npu()
student = models.resnet18().npu()
distiller = KnowledgeDistillation(teacher, student)
optimizer = torch.optim.SGD(student.parameters(), lr=0.01)

# Training loop:
# for epoch in range(10):
#     loss = distiller.train_epoch(dataloader, optimizer)
#     print(f"Epoch {epoch + 1}, Loss: {loss:.4f}")
端到端优化案例
1. ResNet推理优化
python
class OptimizedResNetInference:
    """ResNet-50 inference pipeline with stacked optimizations.

    Applies, in order: Conv+BN fusion, FP16 conversion, NPU placement, and
    TorchScript tracing + inference-only graph optimization.  The order
    matters: fusion must precede tracing so the traced graph contains the
    fused convolutions.
    """

    def __init__(self, model_path):
        # Load FP32 weights from disk
        self.model = models.resnet50()
        self.model.load_state_dict(torch.load(model_path))
        self.model.eval()
        # Optimization 1: fold BatchNorm into preceding convolutions
        self.model = OperatorFusion.fuse_model(self.model)
        # Optimization 2: half precision
        self.model = self.model.half()
        # Optimization 3: move to the NPU
        self.model = self.model.npu()
        # Optimization 4: JIT trace + inference-only graph optimization.
        # NOTE(review): the trace is captured at batch size 1; presumably the
        # traced graph still accepts other batch sizes via the dynamic batch
        # dimension — confirm before relying on batch_infer.
        dummy_input = torch.randn(1, 3, 224, 224).half().npu()
        self.model = torch.jit.trace(self.model, dummy_input)
        self.model = torch.jit.optimize_for_inference(self.model)

    def preprocess(self, image):
        """Standard ImageNet preprocessing: resize, center-crop, normalize."""
        from torchvision import transforms
        transform = transforms.Compose([
            transforms.Resize(256),
            transforms.CenterCrop(224),
            transforms.ToTensor(),
            transforms.Normalize(
                mean=[0.485, 0.456, 0.406],
                std=[0.229, 0.224, 0.225]
            )
        ])
        return transform(image)

    def infer(self, image):
        """Classify one PIL image; returns a 1-D tensor of class probabilities."""
        # Preprocess and add the batch dimension
        input_tensor = self.preprocess(image).unsqueeze(0).half().npu()
        with torch.no_grad():
            output = self.model(input_tensor)
        # Softmax over the single sample's logits
        probabilities = torch.nn.functional.softmax(output[0], dim=0)
        return probabilities

    def batch_infer(self, images, batch_size=32):
        """Classify a list of PIL images in fixed-size batches.

        Returns a list of per-image probability vectors (numpy arrays).
        """
        results = []
        for i in range(0, len(images), batch_size):
            batch_images = images[i:i+batch_size]
            # Preprocess and stack into a single NPU batch
            batch_tensors = torch.stack([
                self.preprocess(img) for img in batch_images
            ]).half().npu()
            with torch.no_grad():
                outputs = self.model(batch_tensors)
            # Row-wise softmax, then move results to CPU for the caller
            probabilities = torch.nn.functional.softmax(outputs, dim=1)
            results.extend(probabilities.cpu().numpy())
        return results
# Usage example
from PIL import Image

# Build the optimized inference pipeline from a checkpoint file
inferencer = OptimizedResNetInference('resnet50.pth')

# Single-image inference
image = Image.open('test.jpg')
probs = inferencer.infer(image)
print(f"Top-1 概率: {probs.max().item():.4f}")

# Batch inference over many images
images = [Image.open(f'test_{i}.jpg') for i in range(100)]
results = inferencer.batch_infer(images, batch_size=32)
print(f"处理了 {len(results)} 张图像")
2. 性能基准测试
python
class PerformanceBenchmark:
    """Benchmark one or several models across batch sizes."""

    def __init__(self):
        # batch_size -> metrics dict from the most recent benchmark_model() run
        self.results = {}

    def benchmark_model(self, model, input_shape, batch_sizes=(1, 8, 16, 32, 64)):
        """Measure latency/throughput/memory for each batch size; stops on OOM.

        Fix: the default was a mutable list (shared across calls); use a tuple.
        """
        model.eval()
        monitor = PerformanceMonitor()
        print(f"\n=== 基准测试: {model.__class__.__name__} ===")
        print(f"输入形状: {input_shape}")
        for batch_size in batch_sizes:
            try:
                input_data = torch.randn(batch_size, *input_shape).npu()
                latency_stats = monitor.measure_latency(model, input_data)
                throughput = monitor.measure_throughput(
                    model,
                    batch_size,
                    input_shape,
                    duration=5
                )
                memory_info = monitor.get_memory_usage()
                self.results[batch_size] = {
                    'latency_mean': latency_stats['mean'],
                    'latency_p99': latency_stats['p99'],
                    'throughput': throughput,
                    'memory': memory_info['allocated']
                }
                print(f"\nBatch Size: {batch_size}")
                print(f" 延迟 (mean): {latency_stats['mean']:.2f} ms")
                print(f" 延迟 (p99): {latency_stats['p99']:.2f} ms")
                print(f" 吞吐量: {throughput:.2f} samples/sec")
                print(f" 内存: {memory_info['allocated']:.2f} GB")
            except RuntimeError as e:
                if "out of memory" in str(e):
                    # OOM ends the sweep — larger batches will also fail.
                    print(f"\nBatch Size {batch_size}: 内存不足")
                    break
                else:
                    # Preserve the original traceback for non-OOM errors.
                    raise

    def compare_models(self, models_dict, input_shape, batch_size=32):
        """Print a latency/throughput/memory table for several models."""
        print(f"\n=== 模型对比 (Batch Size: {batch_size}) ===")
        print(f"{'模型':<20} {'延迟(ms)':<15} {'吞吐量(samples/s)':<20} {'内存(GB)':<15}")
        print("-" * 70)
        for name, model in models_dict.items():
            model.eval()
            monitor = PerformanceMonitor()
            input_data = torch.randn(batch_size, *input_shape).npu()
            latency = monitor.measure_latency(model, input_data)
            throughput = monitor.measure_throughput(model, batch_size, input_shape, duration=5)
            memory = monitor.get_memory_usage()
            print(f"{name:<20} {latency['mean']:<15.2f} {throughput:<20.2f} {memory['allocated']:<15.2f}")
# Usage example
benchmark = PerformanceBenchmark()

# Single-model benchmark across the default batch sizes
model = models.resnet50().npu()
benchmark.benchmark_model(model, input_shape=(3, 224, 224))

# Multi-model comparison at one fixed batch size
models_dict = {
    'ResNet18': models.resnet18().npu(),
    'ResNet50': models.resnet50().npu(),
    'ResNet101': models.resnet101().npu()
}
benchmark.compare_models(models_dict, input_shape=(3, 224, 224), batch_size=32)
总结
CANN性能分析与优化要点:
- Profiling工具:性能数据采集和分析
- 性能监控:延迟、吞吐量、资源使用
- 模型优化:算子融合、混合精度、动态shape
- 内存优化:梯度累积、检查点技术
- 分布式优化:通信优化、流水线并行
- 推理优化:量化、剪枝、知识蒸馏
- 端到端优化:综合应用各种优化技术
通过CANN的atc和runtime工具,可以系统地分析和优化模型性能,实现高效的AI应用部署。
相关链接
atc仓库地址:https://atomgit.com/cann/atc
runtime仓库地址:https://atomgit.com/cann/runtime
CANN组织地址:https://atomgit.com/cann