CANN算子开发调试实战：从“Segmentation Fault”到定位根因的完整流程

写Ascend C算子最怕的不是编译失败——编译失败有明确的错误信息。最怕的是运行时Segmentation Fault，什么都没告诉你，NPU直接挂了。没有堆栈、没有日志、只有一行"Killed"。

这篇整理了算子开发中常见的运行时错误、调试方法、以及定位根因的完整流程。

运行时错误的分类

在算子开发中，我们通常会遇到以下几类运行时错误：

内存错误
- Segmentation Fault：通常由越界访问或空指针引起。
- Bus Error：通常由未对齐访问引起。
- 内存泄漏：UB（Unified Buffer）未释放、pipe未关闭等导致。
计算错误
- NaN/Inf输出：除零、溢出、数据未初始化等原因导致。
- 精度偏差：类型转换不当、浮点误差累积导致。
- 结果全零：UB数据还没搬过来就开始计算。
调度错误
- 算子注册失败：算子名重复、参数不匹配。
- Kernel Launch失败：block_dim超限、内存不足。
- Shape不匹配：tiling参数配置错误。
同步错误
- 死锁：缺少 pipe_barrier 导致流水线卡死。
- 数据竞争：双缓冲交换时机错误。
- 结果未就绪：DMA异步搬运但没等待完成就读取。

调试工具一：printf调试法（最实用）

Ascend C支持printf，但有特定限制：它主要在CPU模拟模式下使用；在NPU上运行时，printf的输出会重定向到 /var/log/npu/slog/host-0/ 目录下的日志文件中；且printf会影响性能，调试完记得删掉。

cpp 复制代码

// debug_printf.cpp - 用printf调试Ascend C kernel
#include "kernel_operator.h"
using namespace AscendC;

// Sample kernel that demonstrates printf-based debugging: it copies `size`
// half values in from global memory, doubles each one with a scalar loop,
// copies the result back out, and prints diagnostics at every stage.
class DebugKernel {
public:
    // Binds the global input/output buffers, reserves one local buffer of
    // `size` half elements for each direction, and logs the arguments.
    __aicore__ inline void Init(GM_ADDR input, GM_ADDR output, int32_t size) {
        this->size = size;

        input_gm.SetGlobalBuffer((__gm__ half*)input, size);
        output_gm.SetGlobalBuffer((__gm__ half*)output, size);

        const auto buffer_bytes = size * sizeof(half);
        pipe.InitBuffer(in_ub, buffer_bytes);
        pipe.InitBuffer(out_ub, buffer_bytes);

        // Debug tip 1: print the incoming parameters.
        printf("[Init] size=%d, input=%p, output=%p\n", size, input, output);
    }

    // Runs copy-in -> scalar multiply by 2 -> copy-out, printing as it goes.
    __aicore__ inline void Process() {
        LocalTensor<half> src = in_ub.Get<half>();
        LocalTensor<half> dst = out_ub.Get<half>();

        // Copy the input into the local buffer.
        DataCopy(src, input_gm, size);
        pipe_barrier();

        // Debug tip 2: dump the first (up to) 10 values that were copied in.
        printf("[After DataCopy] First 10 values:\n");
        const int32_t preview_count = (size < 10) ? size : 10;
        for (int idx = 0; idx < preview_count; ++idx) {
            // printf has no format for half, so widen to float before printing.
            printf("  in[%d] = %f\n", idx, (float)src.GetValue(idx));
        }

        // Element-wise compute.
        for (int idx = 0; idx < size; ++idx) {
            half val = src.GetValue(idx);
            // Debug tip 3: print intermediate values for the first 5 elements only.
            if (idx < 5) {
                printf("  Processing: in[%d]=%f\n", idx, (float)val);
            }
            dst.SetValue(idx, val * (half)2.0);
        }

        // Copy the result back to global memory.
        DataCopy(output_gm, dst, size);
        pipe_barrier();

        // Debug tip 4: print a completion message.
        printf("[Process] Done. Processed %d elements.\n", size);
    }

private:
    GlobalTensor<half> input_gm, output_gm;  // global-memory views set up in Init
    TBuf<UB> in_ub, out_ub;                  // local staging buffers, `size` halves each
    int32_t size;                            // element count passed to Init
    TPipe pipe;                              // owner used to initialise the two buffers
};

调试工具二：CPU模拟模式

CPU模拟模式允许我们在CPU上运行kernel，从而可以使用gdb等传统工具进行单步调试。

bash 复制代码

# Build the kernel for CPU simulation mode
# NOTE(review): confirm these atc options against the installed CANN toolkit documentation
atc --singleop \
    --kernel=debug_kernel.cpp \
    --output=debug_kernel_cpu.so \
    --socVersion=Ascend910 \
    --simulate_mode=cpu  # ★ Key option: CPU simulation mode

# Run the test script
python3 test_kernel.py

对应的测试脚本 test_kernel.py 示例：

python 复制代码

import torch
import numpy as np

# Prepare the test data: five float16 inputs and a zero-filled output buffer
input_data = np.array([1.0, 2.0, 3.0, 4.0, 5.0], dtype=np.float16)
output_data = np.zeros(5, dtype=np.float16)

# Load the CPU-simulation build of the kernel and run it
# NOTE(review): `cann` is not imported anywhere in this snippet — confirm which
# module provides AscendCKernel and import it before running.
kernel = cann.AscendCKernel("debug_kernel_cpu.so")
kernel.run(input_data, output_data, len(input_data))

# Show the input next to the output so the result can be checked by eye
print(f"Input:  {input_data}")
print(f"Output: {output_data}")

CPU模拟模式的优劣势：

优势：可以用gdb单步调试；printf正常输出到终端；地址检查工具可用（如valgrind, ASan）；崩溃时有完整的堆栈信息。
劣势：不能测试NPU特有的行为（比如Cube Unit的tile对齐要求）；性能数据无参考价值；某些NPU指令在CPU上没有对应实现。

使用gdb调试的方法：

bash 复制代码

gdb --args python3 test_kernel.py
(gdb) break debug_kernel.cpp:45
(gdb) run
(gdb) print size
(gdb) step
(gdb) continue

调试工具三：NPU日志系统（slog）

昇腾NPU拥有完善的日志系统，可以通过分析日志来定位问题。

python 复制代码

import subprocess
import os

def configure_npu_logging(level="WARNING"):
    """
    Configure the NPU log level through environment variables and npu-smi.

    Args:
        level: Level name, case-insensitive: DEBUG/INFO/WARNING/ERROR/NULL.
            The numeric codes 0-4 (as int or str) are accepted as well.

    Raises:
        ValueError: if ``level`` is neither a known level name nor a code.
    """
    # ASCEND_GLOBAL_LOG_LEVEL takes a numeric code, not the level name.
    level_codes = {
        "DEBUG": "0",
        "INFO": "1",
        "WARNING": "2",
        "ERROR": "3",
        "NULL": "4",
    }
    key = str(level).strip().upper()
    if key in level_codes:
        code = level_codes[key]
    elif key in level_codes.values():
        code = key
    else:
        raise ValueError(
            f"unknown NPU log level {level!r}; expected one of "
            f"{', '.join(level_codes)} or a code 0-4"
        )

    # Controlled through environment variables. These only affect the current
    # process and the children it spawns afterwards.
    os.environ["ASCEND_SLOG_PRINT_TO_STDOUT"] = "1"  # also print logs to the terminal
    os.environ["ASCEND_GLOBAL_LOG_LEVEL"] = code

    # Alternatively, configure through npu-smi. This is best-effort: the
    # environment variables above are already set, so a missing or failing
    # npu-smi must not abort the caller.
    # NOTE(review): confirm this npu-smi invocation against the installed driver's documentation.
    try:
        result = subprocess.run(
            ["npu-smi", "info", "-t", "log", "-l", level],
            capture_output=True, text=True
        )
    except OSError as exc:
        print(f"npu-smi 调用失败（{exc}），仅通过环境变量设置日志级别")
    else:
        if result.returncode != 0:
            print(f"npu-smi 返回非零状态码 {result.returncode}: {result.stderr.strip()}")

    print(f"日志级别已设为: {level}")

def get_recent_npu_logs(lines=50, log_dir="/var/log/npu/slog/host-0/device-0/"):
    """
    Return the last ``lines`` lines of the most recently modified NPU log file.

    Args:
        lines: Maximum number of trailing lines to return.
        log_dir: Directory that holds the log files.

    Returns:
        The tail of the newest log file as a single string, or the
        "没有找到NPU日志" message when the directory is missing, holds no
        regular files, or the newest file cannot be read.
    """
    from collections import deque

    not_found = "没有找到NPU日志"

    try:
        entries = os.listdir(log_dir)
    except OSError:
        # Missing or unreadable directory: report it the same way as "no logs".
        return not_found

    # Only regular files can be log files; skip any subdirectories.
    candidates = [os.path.join(log_dir, name) for name in entries]
    log_files = [path for path in candidates if os.path.isfile(path)]
    if not log_files:
        return not_found

    try:
        # Pick the newest file by modification time; a reverse filename sort
        # is only correct when names happen to sort chronologically.
        latest_log = max(log_files, key=os.path.getmtime)
        if lines <= 0:
            return ""
        # A bounded deque keeps only the last `lines` lines while streaming,
        # so a large log is never held in memory in full.
        with open(latest_log, "r", errors="replace") as handle:
            return "".join(deque(handle, maxlen=lines))
    except OSError:
        return not_found

def analyze_npu_error(log_content):
    """
    Summarise the errors and warnings in NPU log text and print likely causes.

    Args:
        log_content: Log text; it is split on newlines and every line that
            contains 'ERROR' counts as an error, otherwise every line that
            contains 'WARNING' counts as a warning.

    Returns:
        None. The summary, the 10 most recent error lines and any matched
        suggestions are printed to stdout.
    """
    errors = []
    warnings = []

    for line in log_content.split('\n'):
        if 'ERROR' in line:
            errors.append(line)
        elif 'WARNING' in line:
            warnings.append(line)

    print(f"发现 {len(errors)} 个错误, {len(warnings)} 个警告")

    if errors:
        print("\n错误列表：")
        for err in errors[-10:]:  # the 10 most recent errors
            print(f"  {err}")

    # Known error patterns mapped to a suggested fix.
    error_patterns = {
        "out of memory": "显存不足，减小batch size或tiling参数",
        "invalid address": "地址越界，检查数组索引和指针计算",
        "kernel launch failed": "kernel启动失败，检查block_dim和共享内存",
        "timeout": "执行超时，可能死锁，检查pipe_barrier",
        "ECC": "硬件ECC错误，可能是内存硬件故障",
    }

    # Lowercase both sides so matching is case-insensitive. Lowercasing only
    # the log line would make an upper-case pattern such as "ECC" unmatchable.
    lowered_errors = [err.lower() for err in errors]
    for pattern, suggestion in error_patterns.items():
        needle = pattern.lower()
        # Print each suggestion at most once, however many lines match it.
        if any(needle in err for err in lowered_errors):
            print(f"\n 可能原因: {suggestion}")

调试工具四：AddressSanitizer（ASan）

ASan是检测内存越界、use-after-free、内存泄漏的神器。

bash 复制代码

# Enable ASan at compile time
export CXX=g++
export CXXFLAGS="-fsanitize=address -fno-omit-frame-pointer -g"
export LDFLAGS="-fsanitize=address"

# Build the CPU-simulation kernel with the sanitizer flags passed to both compile and link
# NOTE(review): confirm these atc options against the installed CANN toolkit documentation
atc --singleop \
    --kernel=debug_kernel.cpp \
    --output=debug_kernel_asan.so \
    --socVersion=Ascend910 \
    --simulate_mode=cpu \
    --extra_cflags="-fsanitize=address -g" \
    --extra_ldflags="-fsanitize=address"

# Run
python3 test_kernel.py

如果有内存问题，ASan会输出非常详细的报告，例如：

text 复制代码

=================================================================
==12345==ERROR: AddressSanitizer: heap-buffer-overflow on address 0x602000000028
READ of size 2 at 0x602000000028 thread T0
    #0 0x7f8a1b2c3d4e in DebugKernel::Process() debug_kernel.cpp:35
    #1 0x7f8a1b2c4d5e in main test_kernel.py:42
0x602000000028 is located 0 bytes to the right of 16-byte region [0x602000000018,0x602000000028)