一、CANN 日志基础
1.1 日志级别
复制代码
CANN 日志级别 (从低到高):
DEBUG → 调试信息,开发阶段使用
INFO → 一般信息,默认级别
WARNING → 警告信息,不影响运行
ERROR → 错误信息,功能受损
FATAL → 致命错误,程序终止
1.2 基础配置
bash
复制代码
# Logging configuration through environment variables.
# NOTE(review): confirm these variable names and the numeric level mapping
# against the CANN documentation for the installed version — public CANN docs
# appear to use ASCEND_GLOBAL_LOG_LEVEL with a different numbering; TODO verify.
export ASCEND_LOG_LEVEL=3 # INFO level
export ASCEND_LOG_PATH=/var/log/ascend
export ASCEND_LOG_FILE=ascend.log
export ASCEND_LOG_FILE_SIZE=100 # MB
export ASCEND_LOG_FILE_NUM=10 # number of files to keep
# Follow the log file while the workload is running
tail -f /var/log/ascend/ascend.log
1.3 Python 日志接口
python
复制代码
import logging
# Record layout: "[timestamp] [LEVEL] [logger name] message"
formatter = logging.Formatter(
    fmt='[%(asctime)s] [%(levelname)s] [%(name)s] %(message)s',
    datefmt='%Y-%m-%d %H:%M:%S',
)

# Stream handler that renders records with the formatter above
handler = logging.StreamHandler()
handler.setFormatter(formatter)

# Logger named 'ascend'; emit INFO and above through the handler
logger = logging.getLogger('ascend')
logger.setLevel(logging.INFO)
logger.addHandler(handler)

# Example log calls
logger.info("模型加载开始")
logger.warning("显存使用率超过 80%")
logger.error("模型转换失败")
二、自定义日志
2.1 推理日志装饰器
python
复制代码
import functools
import time
import logging
def log_inference(func):
    """Decorator that logs the start, latency and outcome of an inference call.

    Messages go to the ``inference`` logger: one INFO record before the call,
    then either an INFO record (success) or an ERROR record (failure). The
    wrapped function's return value is passed through and its exceptions are
    re-raised unchanged.

    Args:
        func: Callable to wrap.

    Returns:
        The wrapping callable, carrying ``func``'s metadata via
        ``functools.wraps``.
    """
    @functools.wraps(func)
    def wrapper(*args, **kwargs):
        logger = logging.getLogger('inference')
        # perf_counter is monotonic, so the latency cannot be skewed by
        # wall-clock adjustments.
        start_time = time.perf_counter()
        logger.info(f"开始推理: {func.__name__}")
        # Only the wrapped call lives inside the try: a problem while building
        # the success message must not be reported as an inference failure.
        try:
            result = func(*args, **kwargs)
        except Exception as e:
            latency = time.perf_counter() - start_time
            logger.error(
                f"推理失败: {func.__name__} "
                f"耗时={latency:.3f}s "
                f"错误={str(e)}"
            )
            raise
        latency = time.perf_counter() - start_time
        # Report the shape of the first argument (positional, then keyword)
        # that has one. This also works for calls with no positional
        # arguments, where indexing args[0] would raise IndexError.
        input_shape = next(
            (arg.shape for arg in (*args, *kwargs.values())
             if hasattr(arg, 'shape')),
            'N/A',
        )
        logger.info(
            f"推理完成: {func.__name__} "
            f"耗时={latency:.3f}s "
            f"输入形状={input_shape}"
        )
        return result
    return wrapper
# Example usage
@log_inference
def predict(model, input_data):
    """Call ``model`` on ``input_data`` and return whatever it produces."""
    output = model(input_data)
    return output
2.2 性能追踪日志
python
复制代码
import json  # required by record(); not imported anywhere before this snippet


class PerformanceTracer:
    """Collects operation timings in a JSON Lines file and the ``performance`` logger."""

    def __init__(self, log_file='performance.jsonl'):
        """Create a tracer.

        Args:
            log_file: Path of the JSON Lines file that ``record`` appends to.
        """
        self.log_file = log_file
        self.logger = logging.getLogger('performance')

    def trace(self, operation_name):
        """Return a ``TraceContext`` for ``operation_name`` bound to this tracer."""
        return TraceContext(self, operation_name)

    def record(self, operation_name, duration, metadata=None):
        """Append one timing record to the JSONL file and log it at INFO level.

        Args:
            operation_name: Label stored under the ``operation`` key.
            duration: Elapsed time in seconds; stored and logged in milliseconds.
            metadata: Optional JSON-serializable dict; an empty dict is used
                when omitted.

        Raises:
            OSError: If the log file cannot be opened for appending.
            TypeError: If ``metadata`` contains values ``json.dumps`` cannot encode.
        """
        # Normalize once so the file and the log line show the same value.
        metadata = metadata or {}
        duration_ms = duration * 1000
        record = {
            'timestamp': time.time(),
            'operation': operation_name,
            'duration_ms': duration_ms,
            'metadata': metadata,
        }
        # One JSON object per line; explicit encoding keeps the file
        # independent of the platform's locale.
        with open(self.log_file, 'a', encoding='utf-8') as f:
            f.write(json.dumps(record) + '\n')
        # Mirror the record to the logger
        self.logger.info(
            f"性能: {operation_name} "
            f"耗时={duration_ms:.2f}ms "
            f"元数据={metadata}"
        )
class TraceContext:
    """Context manager that times its ``with`` body and reports it to a tracer."""

    def __init__(self, tracer, operation_name):
        """Remember the tracer and the operation label; timing starts on entry."""
        self.start_time = None
        self.operation_name = operation_name
        self.tracer = tracer

    def __enter__(self):
        """Start the clock and return this context."""
        self.start_time = time.time()
        return self

    def __exit__(self, exc_type, exc_val, exc_tb):
        """Record the elapsed time, attaching the error text if the body raised."""
        elapsed = time.time() - self.start_time
        details = {'error': str(exc_val)} if exc_type else {}
        self.tracer.record(self.operation_name, elapsed, details)
        # False tells Python not to suppress an exception raised in the body.
        return False
# Example usage: time each pipeline stage with its own trace context
tracer = PerformanceTracer()

with tracer.trace("model_load"):
    model = load_model("model.om")

with tracer.trace("preprocess"):
    input_data = preprocess(raw_data)

with tracer.trace("inference"):
    output = model(input_data)

with tracer.trace("postprocess"):
    result = postprocess(output)
三、分布式日志收集
3.1 日志聚合
python
复制代码
import logging
import logging.handlers
import socket
import json  # required by emit(); missing from this snippet's imports


class DistributedLogHandler(logging.Handler):
    """Logging handler that ships every record as one JSON line over TCP.

    Each payload carries the node id, the record's creation time (seconds
    since the epoch), level, formatted message and source location.
    """

    def __init__(self, host, port, node_id):
        """Connect to the log server.

        Args:
            host: Log server address.
            port: Log server TCP port.
            node_id: Identifier of this node, added to every payload.

        Raises:
            OSError: If the connection cannot be established.
        """
        super().__init__()
        self.host = host
        self.port = port
        self.node_id = node_id
        self.socket = socket.socket(socket.AF_INET, socket.SOCK_STREAM)
        try:
            self.socket.connect((host, port))
        except OSError:
            # Do not leak the descriptor when the server is unreachable.
            self.socket.close()
            raise

    def emit(self, record):
        """Serialize ``record`` and send it to the log server.

        Failures are routed to ``handleError`` so that logging never raises
        into application code.
        """
        try:
            # Formatters attached to this handler may use %(node_id)s, which is
            # not a standard LogRecord attribute; provide it unless the caller
            # already did (e.g. through ``extra``).
            if not hasattr(record, 'node_id'):
                record.node_id = self.node_id
            log_entry = {
                'node_id': self.node_id,
                'timestamp': record.created,
                'level': record.levelname,
                'message': self.format(record),
                'module': record.module,
                'function': record.funcName,
                'line': record.lineno
            }
            # The trailing newline delimits records on the stream; line-based
            # receivers (such as a json_lines codec) split on it.
            message = (json.dumps(log_entry) + '\n').encode('utf-8')
            # sendall() keeps writing until the whole buffer is sent, whereas
            # send() may transmit only part of it.
            self.socket.sendall(message)
        except Exception:
            self.handleError(record)

    def close(self):
        """Close the TCP connection, then run the base handler cleanup."""
        try:
            self.socket.close()
        finally:
            super().close()
# Configure distributed logging
def setup_distributed_logging(node_id, log_server='192.168.1.100', port=5000):
    """Attach a console handler and a remote handler to the root logger.

    Args:
        node_id: Identifier of this node; rendered by ``%(node_id)s`` in both
            handler formats.
        log_server: Address of the central log server.
        port: TCP port of the central log server.

    Returns:
        The root logger, with its level set to INFO.

    Raises:
        OSError: If the connection to the log server fails. The root logger
            is left untouched in that case.
    """
    def tag_node(record):
        """Handler filter that exposes ``node_id`` on the record.

        ``node_id`` is not a standard LogRecord attribute, so without this
        the ``%(node_id)s`` placeholder below cannot be formatted.
        """
        record.node_id = node_id
        return True

    # Connect first: if the server is unreachable, fail before the root
    # logger has been modified.
    distributed_handler = DistributedLogHandler(log_server, port, node_id)
    distributed_handler.addFilter(tag_node)
    distributed_handler.setFormatter(logging.Formatter(
        '[%(asctime)s] [%(node_id)s] [%(levelname)s] [%(module)s:%(funcName)s:%(lineno)d] %(message)s'
    ))

    # Console output
    console_handler = logging.StreamHandler()
    console_handler.addFilter(tag_node)
    console_handler.setFormatter(logging.Formatter(
        '[%(asctime)s] [%(node_id)s] [%(levelname)s] %(message)s'
    ))

    logger = logging.getLogger()
    logger.setLevel(logging.INFO)
    logger.addHandler(console_handler)
    logger.addHandler(distributed_handler)
    return logger
3.2 ELK 日志栈
yaml
复制代码
# docker-compose-elk.yml
# Single-node Elasticsearch + Logstash + Kibana stack.
version: '3.8'

services:
  elasticsearch:
    image: docker.elastic.co/elasticsearch/elasticsearch:8.10.0
    environment:
      - discovery.type=single-node
      - xpack.security.enabled=false
    ports:
      - "9200:9200"
    volumes:
      - esdata:/usr/share/elasticsearch/data

  logstash:
    image: docker.elastic.co/logstash/logstash:8.10.0
    volumes:
      # Pipeline definition mounted from the host
      - ./logstash.conf:/usr/share/logstash/pipeline/logstash.conf
    ports:
      - "5000:5000"
    depends_on:
      - elasticsearch

  kibana:
    image: docker.elastic.co/kibana/kibana:8.10.0
    ports:
      - "5601:5601"
    depends_on:
      - elasticsearch

volumes:
  esdata:
ruby
复制代码
# logstash.conf
input {
  # Newline-delimited JSON over TCP
  tcp {
    port => 5000
    codec => json_lines
  }
}
filter {
  # Tag every event with the service name
  mutate {
    add_field => { "service" => "cann-inference" }
  }
  # Parse the sender's timestamp. The Python senders emit seconds since the
  # epoch (LogRecord.created / time.time()), so the pattern is UNIX rather
  # than UNIX_MS, which expects milliseconds.
  date {
    match => [ "timestamp", "UNIX" ]
  }
}
output {
  elasticsearch {
    hosts => ["elasticsearch:9200"]
    index => "cann-logs-%{+YYYY.MM.dd}"
  }
}
四、基于日志的性能分析
4.1 日志分析脚本
python
复制代码
import json
import pandas as pd
from collections import defaultdict
class LogAnalyzer:
    """Offline analysis of a JSON Lines performance log.

    The file is parsed once in the constructor; records are JSON objects that
    are expected to carry ``operation`` and ``duration_ms`` keys.
    """

    def __init__(self, log_file):
        """Parse ``log_file`` and keep its records in ``self.records``.

        Raises:
            OSError: If the file cannot be opened.
        """
        self.log_file = log_file
        self.records = self._parse_log()

    def _parse_log(self):
        """Return the JSON objects found in the log, one per line.

        Blank lines, lines that are not valid JSON and JSON values that are
        not objects (numbers, lists, ...) are skipped, since none of them can
        become a row of the analysis table.
        """
        records = []
        # errors='replace' keeps a single corrupted byte from aborting the parse.
        with open(self.log_file, 'r', encoding='utf-8', errors='replace') as f:
            for line in f:
                line = line.strip()
                if not line:
                    continue
                try:
                    record = json.loads(line)
                except json.JSONDecodeError:
                    continue
                if isinstance(record, dict):
                    records.append(record)
        return records

    def _performance_frame(self):
        """Return the timing records as a DataFrame, or None if there are none.

        Rows without ``duration_ms`` are dropped; a missing ``operation`` is
        replaced by ``'unknown'`` so that grouping cannot fail on it.
        """
        df = pd.DataFrame(self.records)
        if 'duration_ms' not in df.columns:
            return None
        df = df.dropna(subset=['duration_ms'])
        if df.empty:
            return None
        if 'operation' in df.columns:
            operation = df['operation'].fillna('unknown')
        else:
            operation = 'unknown'
        return df.assign(operation=operation)

    def performance_summary(self):
        """Print and return per-operation latency statistics.

        Returns:
            A DataFrame indexed by operation with count, mean, standard
            deviation, min, max and the 50th/95th/99th percentiles, or None
            when the log holds no timing data.
        """
        df = self._performance_frame()
        if df is None:
            print("无性能数据")
            return None
        summary = df.groupby('operation')['duration_ms'].agg([
            'count', 'mean', 'std', 'min', 'max',
            lambda x: x.quantile(0.5),
            lambda x: x.quantile(0.95),
            lambda x: x.quantile(0.99)
        ])
        summary.columns = [
            '次数', '平均(ms)', '标准差', '最小(ms)', '最大(ms)',
            'P50(ms)', 'P95(ms)', 'P99(ms)'
        ]
        print("\n性能摘要:")
        print(summary.to_string())
        return summary

    def bottleneck_analysis(self):
        """Print operations by total time and identify the dominant ones.

        Operations are taken in descending order of total duration until
        their cumulative share reaches 80% of the overall time.

        Returns:
            A list of ``(operation, total_ms, cumulative_percent)`` tuples;
            empty when there is no timing data or the total time is zero.
        """
        df = self._performance_frame()
        if df is None:
            return []
        # Per-operation statistics, largest total first
        operation_stats = (
            df.groupby('operation')['duration_ms']
            .agg(['mean', 'sum', 'count'])
            .sort_values('sum', ascending=False)
        )
        print("\n瓶颈分析 (按总耗时排序):")
        print(operation_stats.to_string())
        total_time = float(operation_stats['sum'].sum())
        if total_time <= 0:
            # No measurable time: percentages would be undefined.
            return []
        # Accumulate until the 80% threshold is reached (80/20 rule)
        bottlenecks = []
        cumulative = 0.0
        for op, op_total in operation_stats['sum'].items():
            cumulative += float(op_total)
            percentage = cumulative / total_time * 100
            bottlenecks.append((op, float(op_total), percentage))
            if percentage >= 80:
                break
        print(f"\n前 {len(bottlenecks)} 个操作占总耗时 80%:")
        for op, total, pct in bottlenecks:
            print(f" {op}: {total:.2f}ms ({pct:.1f}%)")
        return bottlenecks
# Example usage: parse the JSONL log once, then print both reports
analyzer = LogAnalyzer('performance.jsonl')
analyzer.performance_summary()
analyzer.bottleneck_analysis()
4.2 实时监控
python
复制代码
import json
import threading  # Lock / Event / Thread; not imported anywhere in the original snippets
from collections import defaultdict, deque


class RealtimeLogMonitor:
    """Follows a JSON Lines performance log in a background thread.

    Keeps the most recent ``max_samples`` durations per operation and exposes
    aggregate metrics through ``get_current_metrics``.
    """

    def __init__(self, log_file, max_samples=1000):
        """Create a monitor.

        Args:
            log_file: Path of the JSON Lines file to follow.
            max_samples: Number of most recent durations kept per operation.
        """
        self.log_file = log_file
        self.max_samples = max_samples
        # A bounded deque discards the oldest sample automatically.
        self.metrics = defaultdict(lambda: deque(maxlen=max_samples))
        self.lock = threading.Lock()
        self.thread = None
        self._stop_event = threading.Event()

    def start(self):
        """Start the background monitoring thread (no-op if already running)."""
        if self.thread is not None and self.thread.is_alive():
            return
        self._stop_event.clear()
        self.thread = threading.Thread(target=self._monitor_loop, daemon=True)
        self.thread.start()

    def stop(self, timeout=None):
        """Ask the monitoring thread to exit and wait up to ``timeout`` seconds."""
        self._stop_event.set()
        if self.thread is not None:
            self.thread.join(timeout)

    def _monitor_loop(self):
        """Read lines appended to the log until ``stop`` is called."""
        with open(self.log_file, 'r', encoding='utf-8', errors='replace') as f:
            f.seek(0, 2)  # jump to the end of the file: only new lines matter
            pending = ''
            while not self._stop_event.is_set():
                chunk = f.readline()
                if not chunk:
                    # Nothing new yet; wait() doubles as an interruptible sleep.
                    self._stop_event.wait(0.1)
                    continue
                pending += chunk
                if not pending.endswith('\n'):
                    # The writer is in the middle of a line; keep the fragment
                    # and parse it once the rest has arrived.
                    continue
                line, pending = pending, ''
                self._handle_line(line)

    def _handle_line(self, line):
        """Parse one complete log line and feed it to ``_process_record``.

        Blank lines and lines that are not valid JSON are ignored.
        """
        line = line.strip()
        if not line:
            return
        try:
            record = json.loads(line)
        except json.JSONDecodeError:
            return
        self._process_record(record)

    def _process_record(self, record):
        """Store the duration of ``record`` under its operation name.

        Values that are not JSON objects, or that carry no ``duration_ms``,
        are ignored. A missing ``operation`` is filed under ``'unknown'``.
        """
        if not isinstance(record, dict) or 'duration_ms' not in record:
            return
        op = record.get('operation', 'unknown')
        with self.lock:
            self.metrics[op].append(record['duration_ms'])

    def get_current_metrics(self):
        """Return ``{operation: {'count', 'avg', 'p99'}}`` for the kept samples."""
        with self.lock:
            # Copy under the lock so the statistics see a consistent snapshot.
            snapshot = {op: list(samples) for op, samples in self.metrics.items()}
        metrics = {}
        for op, durations in snapshot.items():
            if durations:
                ordered = sorted(durations)
                metrics[op] = {
                    'count': len(ordered),
                    'avg': sum(ordered) / len(ordered),
                    'p99': ordered[int(len(ordered) * 0.99)]
                }
        return metrics
# Example usage
monitor = RealtimeLogMonitor('performance.jsonl')
monitor.start()

# Print the aggregated metrics every 10 seconds
while True:
    time.sleep(10)
    metrics = monitor.get_current_metrics()
    print(json.dumps(metrics, indent=2))
五、常见问题
| 问题 | 原因 | 解决方案 |
| --- | --- | --- |
| 日志文件过大 | 日志级别太低 | 调整为 INFO 或 WARNING |
| 日志缺失 | 未正确配置路径 | 检查 ASCEND_LOG_PATH |
| 性能日志格式错误 | JSON 解析失败 | 检查日志格式 |
| 分布式日志延迟 | 网络问题 | 增加缓冲区、异步发送 |
| 日志文件损坏 | 异常终止 | 使用日志轮转 |
相关仓库