CANN 日志系统:调试与性能分析的日志艺术

一、CANN 日志基础

1.1 日志级别

复制代码
CANN 日志级别 (从低到高):
  DEBUG    → 调试信息,开发阶段使用
  INFO     → 一般信息,默认级别
  WARNING  → 警告信息,不影响运行
  ERROR    → 错误信息,功能受损
  FATAL    → 致命错误,程序终止

1.2 基础配置

bash 复制代码
# Environment variable configuration
# NOTE(review): the level list earlier in this document orders the levels
# DEBUG < INFO < WARNING < ERROR < FATAL, so the value 3 may not correspond to
# INFO -- confirm the numeric mapping against the CANN documentation.
export ASCEND_LOG_LEVEL=3          # INFO level
export ASCEND_LOG_PATH=/var/log/ascend
export ASCEND_LOG_FILE=ascend.log
export ASCEND_LOG_FILE_SIZE=100     # MB
export ASCEND_LOG_FILE_NUM=10       # number of files to keep

# Follow the log at runtime (the path is ASCEND_LOG_PATH/ASCEND_LOG_FILE from above)
tail -f /var/log/ascend/ascend.log

1.3 Python 日志接口

python 复制代码
import logging

# Formatter: timestamp, level, logger name, then the message text.
formatter = logging.Formatter(
    fmt='[%(asctime)s] [%(levelname)s] [%(name)s] %(message)s',
    datefmt='%Y-%m-%d %H:%M:%S',
)

# Stream handler that renders records with the formatter above.
handler = logging.StreamHandler()
handler.setFormatter(formatter)

# The logger named 'ascend' emits INFO and above through that handler.
logger = logging.getLogger('ascend')
logger.setLevel(logging.INFO)
logger.addHandler(handler)

# Example messages at three severities.
logger.info("模型加载开始")
logger.warning("显存使用率超过 80%")
logger.error("模型转换失败")

二、自定义日志

2.1 推理日志装饰器

python 复制代码
import functools
import time
import logging

def log_inference(func):
    """Decorator that logs the start, latency and outcome of an inference call.

    Records go to the ``'inference'`` logger: one INFO record before the call,
    then either an INFO record with the latency and the input shape on
    success, or an ERROR record with the latency and the error text on
    failure. The wrapped function's return value is passed through and its
    exceptions are re-raised unchanged.

    The reported input shape is the ``shape`` attribute of the first
    positional or keyword argument that has one, or ``'N/A'`` when none does.
    The wrapper therefore also works for calls with no positional arguments.

    Args:
        func: The callable to wrap.

    Returns:
        The wrapping callable, carrying ``func``'s metadata via
        ``functools.wraps``.
    """
    def _input_shape(args, kwargs):
        """Return the shape of the first argument exposing ``shape``, else 'N/A'."""
        for value in (*args, *kwargs.values()):
            if hasattr(value, 'shape'):
                return value.shape
        return 'N/A'

    @functools.wraps(func)
    def wrapper(*args, **kwargs):
        logger = logging.getLogger('inference')

        # perf_counter is monotonic, so the measured latency cannot be skewed
        # by wall-clock adjustments.
        start_time = time.perf_counter()
        logger.info(f"开始推理: {func.__name__}")

        # Only the call itself is guarded, so a problem while building the
        # success message is never reported as an inference failure.
        try:
            result = func(*args, **kwargs)
        except Exception as e:
            latency = time.perf_counter() - start_time
            logger.error(
                f"推理失败: {func.__name__} "
                f"耗时={latency:.3f}s "
                f"错误={str(e)}"
            )
            raise

        latency = time.perf_counter() - start_time
        logger.info(
            f"推理完成: {func.__name__} "
            f"耗时={latency:.3f}s "
            f"输入形状={_input_shape(args, kwargs)}"
        )
        return result

    return wrapper

# Usage example
@log_inference
def predict(model, input_data):
    """Call ``model`` on ``input_data`` and return what it produces."""
    output = model(input_data)
    return output

2.2 性能追踪日志

python 复制代码
import json
import logging
import time


class PerformanceTracer:
    """Append performance measurements to a JSONL file and to a logger.

    Each measurement becomes one JSON object per line in ``log_file`` and one
    INFO record on the ``'performance'`` logger.

    Args:
        log_file: Path of the JSONL file that records are appended to.
    """

    def __init__(self, log_file='performance.jsonl'):
        self.log_file = log_file
        self.logger = logging.getLogger('performance')

    def trace(self, operation_name):
        """Return a ``TraceContext`` bound to this tracer for ``operation_name``."""
        return TraceContext(self, operation_name)

    def record(self, operation_name, duration, metadata=None):
        """Write one measurement.

        Args:
            operation_name: Label stored under the ``'operation'`` key.
            duration: Elapsed time in seconds; stored in milliseconds under
                ``'duration_ms'``.
            metadata: Optional JSON-serializable mapping; an empty dict is
                stored when it is ``None`` or empty.

        Raises:
            TypeError: If ``metadata`` cannot be serialized to JSON.
            OSError: If the log file cannot be opened or written.
        """
        entry = {
            'timestamp': time.time(),
            'operation': operation_name,
            'duration_ms': duration * 1000,
            'metadata': metadata or {},
        }

        # Serialize first so a serialization error never touches the file.
        line = json.dumps(entry) + '\n'

        # Explicit encoding keeps the file format independent of the locale.
        with open(self.log_file, 'a', encoding='utf-8') as f:
            f.write(line)

        # Mirror the measurement to the logger.
        self.logger.info(
            f"性能: {operation_name} "
            f"耗时={duration*1000:.2f}ms "
            f"元数据={metadata}"
        )

class TraceContext:
    """Context manager that times a ``with`` block and reports it to a tracer.

    On exit it calls ``tracer.record(operation_name, duration, metadata)``,
    where ``duration`` is the elapsed time in seconds measured with
    ``time.time()`` and ``metadata`` holds an ``'error'`` entry when the block
    raised.
    """

    def __init__(self, tracer, operation_name):
        self.start_time = None  # filled in when the block is entered
        self.operation_name = operation_name
        self.tracer = tracer

    def __enter__(self):
        self.start_time = time.time()
        return self

    def __exit__(self, exc_type, exc_val, exc_tb):
        elapsed = time.time() - self.start_time
        details = {'error': str(exc_val)} if exc_type else {}
        self.tracer.record(self.operation_name, elapsed, details)
        # Returning False lets an exception raised in the block propagate.
        return False

# Usage example
# NOTE(review): load_model, preprocess, postprocess and raw_data are not
# defined in this snippet; they are assumed to be supplied by the application.
tracer = PerformanceTracer()

# Each stage runs inside its own trace so it is recorded under its own name.
with tracer.trace("model_load"):
    model = load_model("model.om")

with tracer.trace("preprocess"):
    input_data = preprocess(raw_data)

with tracer.trace("inference"):
    output = model(input_data)

with tracer.trace("postprocess"):
    result = postprocess(output)

三、分布式日志收集

3.1 日志聚合

python 复制代码
import logging
import logging.handlers
import socket

import json


class DistributedLogHandler(logging.Handler):
    """Logging handler that ships each record as one JSON line over TCP.

    A TCP connection to ``(host, port)`` is opened when the handler is
    created and reused for every record. Each record is sent as a UTF-8 JSON
    object terminated by a newline, so line-oriented receivers (such as the
    ``json_lines`` codec used by the Logstash pipeline in this document) can
    split the stream into events.

    Args:
        host: Address of the central log server.
        port: TCP port of the central log server.
        node_id: Identifier of this node, included in every entry.

    Raises:
        OSError: If the connection cannot be established.
    """

    def __init__(self, host, port, node_id):
        super().__init__()
        self.host = host
        self.port = port
        self.node_id = node_id
        self.socket = socket.socket(socket.AF_INET, socket.SOCK_STREAM)
        try:
            self.socket.connect((host, port))
        except OSError:
            # Do not leak the descriptor when the server is unreachable.
            self.socket.close()
            raise

    def emit(self, record):
        """Serialize ``record`` and send it to the central server.

        ``timestamp`` is ``record.created``, i.e. epoch seconds as a float.
        Failures are routed through ``handleError`` so logging never raises
        into the application.
        """
        try:
            log_entry = {
                'node_id': self.node_id,
                'timestamp': record.created,
                'level': record.levelname,
                'message': self.format(record),
                'module': record.module,
                'function': record.funcName,
                'line': record.lineno
            }

            # The trailing newline delimits entries on the stream; sendall
            # retries until the whole payload is written, unlike send.
            payload = json.dumps(log_entry).encode('utf-8') + b'\n'
            self.socket.sendall(payload)

        except Exception:
            self.handleError(record)

    def close(self):
        """Close the TCP connection, then run the base handler cleanup."""
        try:
            self.socket.close()
        finally:
            super().close()

# Configure distributed logging
def setup_distributed_logging(node_id, log_server='192.168.1.100', port=5000):
    """Attach console and distributed handlers to the root logger.

    Both handlers use format strings containing ``%(node_id)s``. Log records
    have no such attribute by default, so each handler gets a filter that
    stamps ``node_id`` onto every record before it is formatted; without it
    every record would fail to format.

    Args:
        node_id: Identifier of this node, shown in every formatted line.
        log_server: Address of the central log server.
        port: TCP port of the central log server.

    Returns:
        The root logger, set to INFO, with both handlers attached.
    """

    def _add_node_id(record):
        """Filter that injects ``node_id`` into the record and keeps it."""
        record.node_id = node_id
        return True

    logger = logging.getLogger()
    logger.setLevel(logging.INFO)

    # Console output. The filter is attached to the handler (not the logger)
    # so it also applies to records propagated from child loggers.
    console_handler = logging.StreamHandler()
    console_handler.addFilter(_add_node_id)
    console_handler.setFormatter(logging.Formatter(
        '[%(asctime)s] [%(node_id)s] [%(levelname)s] %(message)s'
    ))
    logger.addHandler(console_handler)

    # Distributed output
    distributed_handler = DistributedLogHandler(log_server, port, node_id)
    distributed_handler.addFilter(_add_node_id)
    distributed_handler.setFormatter(logging.Formatter(
        '[%(asctime)s] [%(node_id)s] [%(levelname)s] [%(module)s:%(funcName)s:%(lineno)d] %(message)s'
    ))
    logger.addHandler(distributed_handler)

    return logger

3.2 ELK 日志栈

yaml 复制代码
# docker-compose-elk.yml
# Elasticsearch, Logstash and Kibana services, all pinned to version 8.10.0.
version: '3.8'
services:
  elasticsearch:
    image: docker.elastic.co/elasticsearch/elasticsearch:8.10.0
    environment:
      - discovery.type=single-node
      # NOTE(review): security is switched off here; presumably intended for
      # local testing only -- confirm before deploying.
      - xpack.security.enabled=false
    ports:
      - "9200:9200"
    volumes:
      # Named volume (declared at the bottom of this file) for the data directory.
      - esdata:/usr/share/elasticsearch/data

  logstash:
    image: docker.elastic.co/logstash/logstash:8.10.0
    volumes:
      # Mounts the local logstash.conf as the pipeline definition.
      - ./logstash.conf:/usr/share/logstash/pipeline/logstash.conf
    ports:
      # Same port number as the tcp input in logstash.conf.
      - "5000:5000"
    depends_on:
      - elasticsearch

  kibana:
    image: docker.elastic.co/kibana/kibana:8.10.0
    ports:
      - "5601:5601"
    depends_on:
      - elasticsearch

volumes:
  esdata:
ruby 复制代码
# logstash.conf
input {
  # Newline-delimited JSON events over TCP.
  tcp {
    port => 5000
    codec => json_lines
  }
}

filter {
  # Tag every event with the originating service.
  mutate {
    add_field => { "service" => "cann-inference" }
  }

  # Parse the event timestamp. The Python senders in this document emit
  # time.time() / LogRecord.created, i.e. epoch *seconds* (float), which is
  # the UNIX format; UNIX_MS would misread them as milliseconds.
  date {
    match => [ "timestamp", "UNIX" ]
  }
}

output {
  # One index per day.
  elasticsearch {
    hosts => ["elasticsearch:9200"]
    index => "cann-logs-%{+YYYY.MM.dd}"
  }
}

四、基于日志的性能分析

4.1 日志分析脚本

python 复制代码
import json
import pandas as pd
from collections import defaultdict

class LogAnalyzer:
    """Load a JSONL performance log and report per-operation statistics.

    The file is parsed once at construction. Lines that are not valid JSON,
    or that are valid JSON but not objects, are skipped. Reports use the
    ``'operation'`` and ``'duration_ms'`` keys of each record.

    Args:
        log_file: Path of the JSONL file to analyze.

    Raises:
        OSError: If ``log_file`` cannot be opened.
    """

    def __init__(self, log_file):
        self.log_file = log_file
        self.records = self._parse_log()

    def _parse_log(self):
        """Return the list of JSON objects found in the log file."""
        records = []
        with open(self.log_file, 'r', encoding='utf-8') as f:
            for line in f:
                try:
                    record = json.loads(line.strip())
                except json.JSONDecodeError:
                    continue
                # Only JSON objects map onto named DataFrame columns.
                if isinstance(record, dict):
                    records.append(record)
        return records

    def _performance_frame(self):
        """Return the records as a DataFrame, or None if required columns are missing."""
        df = pd.DataFrame(self.records)
        if 'duration_ms' not in df.columns or 'operation' not in df.columns:
            return None
        return df

    def performance_summary(self):
        """Print and return count/mean/std/min/max/P50/P95/P99 per operation.

        Returns:
            A DataFrame indexed by operation, or ``None`` (after printing a
            notice) when the log holds no performance data.
        """
        df = self._performance_frame()

        if df is None:
            print("无性能数据")
            return

        summary = df.groupby('operation')['duration_ms'].agg([
            'count', 'mean', 'std', 'min', 'max',
            lambda x: x.quantile(0.5),
            lambda x: x.quantile(0.95),
            lambda x: x.quantile(0.99)
        ])

        summary.columns = [
            '次数', '平均(ms)', '标准差', '最小(ms)', '最大(ms)',
            'P50(ms)', 'P95(ms)', 'P99(ms)'
        ]

        print("\n性能摘要:")
        print(summary.to_string())

        return summary

    def bottleneck_analysis(self):
        """Print operations by total time and the set covering 80% of it.

        Operations are sorted by total duration. The bottleneck set is the
        shortest prefix of that ordering whose cumulative share reaches 80%.
        Each printed line shows the operation's own share and the cumulative
        share separately.

        Returns:
            A list of ``(operation, total_ms, share_pct, cumulative_pct)``
            tuples for the bottleneck set; empty when there is no performance
            data or the total duration is not positive.
        """
        df = self._performance_frame()

        if df is None:
            return []

        # Per-operation statistics, largest total first.
        operation_stats = df.groupby('operation')['duration_ms'].agg(['mean', 'sum', 'count'])
        operation_stats = operation_stats.sort_values('sum', ascending=False)

        print("\n瓶颈分析 (按总耗时排序):")
        print(operation_stats.to_string())

        total_time = float(operation_stats['sum'].sum())
        if total_time <= 0:
            # Shares are undefined without a positive total.
            return []

        cumulative = 0.0
        bottlenecks = []

        for op, op_total in operation_stats['sum'].items():
            op_total = float(op_total)
            cumulative += op_total
            share = op_total / total_time * 100
            cumulative_pct = cumulative / total_time * 100
            bottlenecks.append((op, op_total, share, cumulative_pct))

            if cumulative_pct >= 80:  # 80/20 rule
                break

        print(f"\n前 {len(bottlenecks)} 个操作占总耗时 80%:")
        for op, total, share, cumulative_pct in bottlenecks:
            print(f"  {op}: {total:.2f}ms ({share:.1f}%, 累计 {cumulative_pct:.1f}%)")

        return bottlenecks

# Usage example: parse performance.jsonl, then print the summary and the
# bottleneck report.
analyzer = LogAnalyzer('performance.jsonl')
analyzer.performance_summary()
analyzer.bottleneck_analysis()

4.2 实时监控

python 复制代码
import json
import threading
from collections import defaultdict


class RealtimeLogMonitor:
    """Tail a JSONL performance log in a background thread and keep metrics.

    Only lines appended after the monitor starts are read. For every record
    carrying ``'duration_ms'`` the value is stored under the record's
    ``'operation'`` (``'unknown'`` when absent), keeping the most recent
    ``max_samples`` values per operation.

    Args:
        log_file: Path of the JSONL file to follow; it must exist when the
            monitoring thread starts.
        max_samples: Number of most recent durations kept per operation.
        poll_interval: Seconds to wait before re-checking the file when no
            new data is available.
    """

    def __init__(self, log_file, max_samples=1000, poll_interval=0.1):
        self.log_file = log_file
        self.max_samples = max_samples
        self.poll_interval = poll_interval
        self.metrics = defaultdict(list)
        self.lock = threading.Lock()
        self.thread = None
        self._stop_event = threading.Event()

    def start(self):
        """Start the monitoring thread (daemon, so it never blocks exit)."""
        self._stop_event.clear()
        self.thread = threading.Thread(target=self._monitor_loop, daemon=True)
        self.thread.start()

    def stop(self, timeout=None):
        """Ask the monitoring thread to finish and wait up to ``timeout`` seconds."""
        self._stop_event.set()
        if self.thread is not None:
            self.thread.join(timeout)

    def _monitor_loop(self):
        """Follow the file from its current end until ``stop`` is called."""
        with open(self.log_file, 'r', encoding='utf-8') as f:
            f.seek(0, 2)  # jump to the end of the file

            pending = ''
            while not self._stop_event.is_set():
                chunk = f.readline()
                if not chunk:
                    # Doubles as the poll delay and wakes immediately on stop.
                    self._stop_event.wait(self.poll_interval)
                    continue

                pending += chunk
                if not pending.endswith('\n'):
                    # The writer is mid-line; wait for the rest instead of
                    # parsing (and discarding) a truncated record.
                    continue

                line, pending = pending, ''
                self._consume_line(line)

    def _consume_line(self, line):
        """Parse one complete line and record it if it is a JSON object."""
        try:
            record = json.loads(line.strip())
        except json.JSONDecodeError:
            return
        if isinstance(record, dict):
            self._process_record(record)

    def _process_record(self, record):
        """Store the duration of ``record`` under its operation name."""
        if 'duration_ms' not in record:
            return

        op = record.get('operation', 'unknown')
        with self.lock:
            samples = self.metrics[op]
            samples.append(record['duration_ms'])

            # Trim in place so only the newest max_samples values remain.
            overflow = len(samples) - self.max_samples
            if overflow > 0:
                del samples[:overflow]

    def get_current_metrics(self):
        """Return ``{operation: {'count', 'avg', 'p99'}}`` for the stored samples."""
        with self.lock:
            metrics = {}
            for op, durations in self.metrics.items():
                if durations:
                    ordered = sorted(durations)
                    metrics[op] = {
                        'count': len(ordered),
                        'avg': sum(durations) / len(durations),
                        'p99': ordered[int(len(ordered) * 0.99)]
                    }
            return metrics

# Usage example
monitor = RealtimeLogMonitor('performance.jsonl')
monitor.start()

# Fetch the metrics every 10 seconds and print them as JSON.
# This loop has no exit condition of its own.
while True:
    time.sleep(10)
    metrics = monitor.get_current_metrics()
    print(json.dumps(metrics, indent=2))

五、常见问题

| 问题 | 原因 | 解决方案 |
| --- | --- | --- |
| 日志文件过大 | 日志级别太低 | 调整为 INFO 或 WARNING |
| 日志缺失 | 未正确配置路径 | 检查 ASCEND_LOG_PATH |
| 性能日志格式错误 | JSON 解析失败 | 检查日志格式 |
| 分布式日志延迟 | 网络问题 | 增加缓冲区、异步发送 |
| 日志文件损坏 | 异常终止 | 使用日志轮转 |

相关仓库

相关推荐
FlyWIHTSKY1 小时前
Next中引入 Ant Design (antd)的配置
开发语言·前端·javascript
JAVA学习通1 小时前
《大营销平台系统设计实现》 - 营销服务 第9节:模板模式串联抽奖规则
服务器·前端·javascript
阿正的梦工坊1 小时前
【Typescript】10-条件类型与-infer
前端·javascript·typescript
GuWenyue2 小时前
我被 React 性能问题逼疯了,直到学会这 4 个优化技巧
前端
babe小鑫2 小时前
数学专业学数据分析的价值
数据挖掘·数据分析
窗边的anini2 小时前
那个因为 vibecoding 差点搞砸约会的女孩,被 TRAE SOLO 救了
前端·人工智能·程序员
用户713874229002 小时前
OAuth 2.0 client_id深度解析:从规范到安全实践
前端
ZC跨境爬虫2 小时前
跟着 MDN 学CSS day_8:(盒模型完全解)
前端·javascript·css·ui·交互
Cache技术分享2 小时前
415. Java 文件操作基础 - 精准读取压缩诗集:从二进制文件中高效提取指定十四行诗
前端·后端