目录
1. 概述
1.1 宕机问题分类
| 类型 | 表现 | 分析工具 |
|---|---|---|
| 用户态崩溃 | Segmentation fault, abort | GDB + core dump |
| 内核崩溃 | Kernel panic, oops | kdump, crash |
| 死锁/卡死 | 进程无响应 | GDB attach, strace |
| 内存泄漏 | 系统逐渐变慢 | valgrind, gdb |
| 看门狗复位 | 系统重启 | 硬件日志, kdump |
1.2 调试原则
- 保护现场 - 第一时间保存日志和转储文件
- 复现问题 - 建立可靠的复现步骤
- 二分定位 - 通过版本/条件二分缩小范围
- 假设验证 - 提出假设并验证
2. 环境准备
2.1 交叉编译工具链
bash
# 设置环境变量
export CROSS_COMPILE=arm-linux-gnueabihf-
export ARCH=arm
# 验证工具链
${CROSS_COMPILE}gcc --version
${CROSS_COMPILE}gdb --version
2.2 目标板配置
bash
# 启用核心转储
ulimit -c unlimited
echo "/var/core/core.%e.%p.%t" > /proc/sys/kernel/core_pattern
# 创建核心转储目录
mkdir -p /var/core
chmod 777 /var/core
# 验证配置
cat /proc/sys/kernel/core_pattern
ulimit -a | grep core
2.3 编译选项
Makefile配置:
makefile
# 调试版本
CFLAGS += -g -O0 -fno-omit-frame-pointer
CFLAGS += -fno-inline -fno-strict-aliasing
# 发布版本(保留调试符号)
CFLAGS += -g -O2 -fno-omit-frame-pointer
# 地址消毒器(检测内存错误)
CFLAGS += -fsanitize=address -fno-omit-frame-pointer
LDFLAGS += -fsanitize=address
2.4 分离调试符号
bash
# 编译时分离调试符号
${CROSS_COMPILE}objcopy --only-keep-debug myapp myapp.debug
${CROSS_COMPILE}strip --strip-debug --strip-unneeded myapp
${CROSS_COMPILE}objcopy --add-gnu-debuglink=myapp.debug myapp
# 目标板运行精简版本,开发机用debug文件分析
3. GDB启动方式
3.1 本地调试
bash
# 直接启动程序
gdb ./myapp
# 带参数启动
gdb --args ./myapp -c config.ini -d
# 附加到运行中的进程
gdb -p <PID>
# 分析核心转储
gdb ./myapp core.1234
3.2 交叉调试(gdbserver)
目标板:
bash
# 启动gdbserver监听端口
gdbserver :1234 ./myapp
# 附加到已运行的进程
gdbserver :1234 --attach <PID>
# 使用串口
gdbserver /dev/ttyS0 ./myapp
开发机:
bash
# 启动交叉GDB
arm-linux-gnueabihf-gdb ./myapp
# GDB内部连接
(gdb) target remote 192.168.1.100:1234
# 或串口连接
(gdb) target remote /dev/ttyUSB0
3.3 GDB启动配置文件
~/.gdbinit:
gdb
# 基本设置
set pagination off
set confirm off
set print array on
set print array-indexes on
set print pretty on
set print object on
set print static-members on
set print vtbl on
set print demangle on
# 历史记录
set history save on
set history size 10000
# 反汇编风格
set disassembly-flavor intel
# 自动加载安全路径
add-auto-load-safe-path /
# 自定义命令
# plist HEAD - print every node of a singly linked list.
# HEAD is a pointer expression; the walk follows each node's "next" member
# and stops when the pointer evaluates to zero.
define plist
  if $argc != 1
    # Without this guard "$arg0" is left unsubstituted and the command
    # fails with a confusing expression error.
    printf "Usage: plist <head-pointer>\n"
  else
    set $node = $arg0
    while $node
      print *$node
      set $node = $node->next
    end
  end
end
document plist
Print every node of a singly linked list.
Usage: plist HEAD
HEAD is a pointer to the first node; each node must have a "next" member.
end
# 显示提示
echo GDB initialized successfully.\n
4. 核心转储分析
4.1 获取核心转储
目标板配置:
bash
#!/bin/sh
# setup_coredump.sh - enable core dumps on the target board.
#
# Must be run as root because it writes under /proc/sys/kernel.
# The ulimit change below applies only to the shell executing this script
# and to processes started from it; set the same limit in whatever shell or
# init script launches the application.

CORE_DIR=/var/core

# Create the dump directory before core_pattern points at it: the kernel
# does not create missing directories, so a dump to a missing path is lost.
# Mode 1777 (like /tmp) lets any process write its dump while the sticky bit
# stops users from deleting or replacing each other's files.
if ! mkdir -p "$CORE_DIR" || ! chmod 1777 "$CORE_DIR"; then
  echo "setup_coredump: cannot prepare $CORE_DIR" >&2
  exit 1
fi

# Remove the core file size limit for this shell and its children.
ulimit -c unlimited

# Core file name template:
#   %e - executable name
#   %p - process ID
#   %t - timestamp (seconds since the epoch)
#   %s - number of the signal that caused the dump
#   %h - host name
if ! echo "$CORE_DIR/core.%e.%p.%s.%t" > /proc/sys/kernel/core_pattern; then
  echo "setup_coredump: cannot write core_pattern (are you root?)" >&2
  exit 1
fi

# core_pipe_limit is NOT a size limit. It only applies when core_pattern
# pipes dumps to a helper program ("|/path/to/helper"), where it caps how
# many crashing processes are handled in parallel; 0 means no cap.
echo 0 > /proc/sys/kernel/core_pipe_limit
4.2 传输核心转储
bash
# 从目标板下载
scp root@192.168.1.100:/var/core/core.myapp.* ./
# 如果文件过大,先压缩
ssh root@192.168.1.100 "gzip /var/core/core.myapp.*"
scp root@192.168.1.100:/var/core/core.myapp.*.gz ./
gunzip core.myapp.*.gz
4.3 分析核心转储
bash
# 启动GDB分析
arm-linux-gnueabihf-gdb -e ./myapp -s ./myapp.debug -c core.myapp.1234.11.1699999999
# GDB内部命令
(gdb) set solib-search-path ./lib:/opt/sdk/sysroot/lib
(gdb) file ./myapp
(gdb) core-file core.myapp.1234.11.1699999999
核心分析步骤:
gdb
# 1. 查看崩溃信息
(gdb) print $_siginfo # 导致崩溃的信号详情(info signals 只会列出GDB的信号处理表)
(gdb) info target
# 2. 查看崩溃时的寄存器
(gdb) info registers
(gdb) info all-registers
# 3. 查看调用栈
(gdb) bt
(gdb) bt full
(gdb) info frame
(gdb) info args
(gdb) info locals
# 4. 查看崩溃位置的代码
(gdb) list
(gdb) disassemble
# 5. 查看内存内容
(gdb) x/20x $pc
(gdb) x/20i $pc
(gdb) x/s string_ptr
4.4 线程信息分析
gdb
# 查看所有线程
(gdb) info threads
# 查看所有线程的调用栈
(gdb) thread apply all bt
(gdb) thread apply all bt full
# 切换到特定线程
(gdb) thread 3
(gdb) bt
# 查看特定线程的寄存器
(gdb) thread 3
(gdb) info registers
5. 运行时调试
5.1 启动与运行控制
gdb
# 启动程序
(gdb) run
(gdb) run arg1 arg2
# 单步执行
(gdb) step # 进入函数
(gdb) next # 跳过函数
(gdb) stepi # 单条指令
(gdb) nexti # 单条指令跳过函数
# 继续执行
(gdb) continue
(gdb) continue 5 # 当前断点再忽略4次,第5次命中时才停下
# 执行到指定位置
(gdb) until
(gdb) until 100 # 执行到第100行
(gdb) advance main.c:100
# 跳出当前函数
(gdb) finish
# 反向执行(需要先用record开启执行记录)
(gdb) reverse-step
(gdb) reverse-next
(gdb) reverse-continue
5.2 断点管理
gdb
# 设置断点
(gdb) break main
(gdb) break main.c:100
(gdb) break main.c:100 if x > 10
(gdb) break *0x400500
# 设置条件断点
(gdb) break process_data if data == NULL
(gdb) condition 1 count > 100
# 设置临时断点
(gdb) tbreak main.c:200
# 设置硬件断点
(gdb) hbreak *0x400500
# 设置观察点(数据断点)
(gdb) watch global_var
(gdb) rwatch global_var # 读观察点
(gdb) awatch global_var # 读写观察点
# 查看断点
(gdb) info breakpoints
(gdb) info watchpoints
# 禁用/启用断点
(gdb) disable 1
(gdb) enable 1
(gdb) enable once 1
# 删除断点
(gdb) delete 1
(gdb) delete
(gdb) clear main.c:100
# 断点命令
(gdb) commands 1
> printf "x = %d\n", x
> continue
> end
5.3 查看变量与内存
gdb
# 打印变量
(gdb) print var
(gdb) print *ptr
(gdb) print arr[0]@10 # 打印数组前10个元素
(gdb) print/x var # 十六进制格式
(gdb) print/t var # 二进制格式
(gdb) print/c var # 字符格式
# 打印结构体
(gdb) print *struct_ptr
(gdb) print struct_var.member
(gdb) ptype struct_name # 查看结构体定义
# 打印字符串
(gdb) print str
(gdb) print str[0]@50 # 打印前50个字符
# 查看内存
(gdb) x/20x 0x400000 # 20个十六进制字
(gdb) x/20i $pc # 20条指令
(gdb) x/s string_ptr # 字符串
(gdb) x/20c buffer # 20个字符
# 自动显示
(gdb) display var
(gdb) display/x $pc
(gdb) info display
(gdb) undisplay 1
# 查看类型信息
(gdb) whatis var
(gdb) ptype struct_name
(gdb) info types
5.4 函数调用与修改
gdb
# 调用函数
(gdb) call func(1, 2)
(gdb) call printf("Hello\n")
# 修改变量
(gdb) set variable var = 10
(gdb) set *ptr = 0
(gdb) set {int}0x400000 = 100
# 修改内存
(gdb) set {char[10]}buffer = "hello"
# 强制返回
(gdb) return
(gdb) return 10
6. 内核转储分析
6.1 配置kdump
内核配置:
CONFIG_CRASH_DUMP=y
CONFIG_PROC_VMCORE=y
CONFIG_KEXEC=y
CONFIG_DEBUG_INFO=y
CONFIG_MAGIC_SYSRQ=y
启动参数:
bash
# /boot/cmdline.txt 或 grub配置
crashkernel=128M@256M
用户态工具:
bash
# 安装kexec-tools
apt-get install kexec-tools
# 加载崩溃内核
kexec -p /boot/vmlinuz-crash --append="root=/dev/mmcblk0p2"
# 触发崩溃测试
echo c > /proc/sysrq-trigger
6.2 分析vmcore
bash
# 使用crash工具
crash vmlinux vmcore
# crash内部命令
crash> sys
crash> bt
crash> bt -a
crash> ps
crash> files
crash> vm
crash> log
crash> dis -r <address>
6.3 内核Oops分析
bash
# 解析Oops日志
./scripts/decodecode < oops.log
# 或使用addr2line
arm-linux-gnueabihf-addr2line -e vmlinux -f 0xc0101234
7. 多线程调试
7.1 线程信息查看
gdb
# 查看所有线程
(gdb) info threads
Id Target Id Frame
1 Thread 0x7ffff7fc0740 (LWP 1234) "myapp" 0x00007ffff7dd5f7d in pthread_join () from /lib/libpthread.so.0
2 Thread 0x7ffff6bc0700 (LWP 1235) "myapp" worker_thread () at worker.c:100
* 3 Thread 0x7ffff63bf700 (LWP 1236) "myapp" 0x00007ffff7dd8ed5 in pthread_cond_wait () from /lib/libpthread.so.0
# 查看线程详细信息
(gdb) info thread 2
# 切换线程
(gdb) thread 2
# 查看所有线程调用栈
(gdb) thread apply all bt
(gdb) thread apply all bt full
# 查看特定线程组
(gdb) thread apply 1-3 bt
7.2 线程断点
gdb
# 只在特定线程触发断点
(gdb) break main.c:100 thread 2
(gdb) break main.c:100 thread 2 if x > 10
# 设置线程特定观察点
(gdb) watch global_var thread 1
7.3 线程同步调试
gdb
# 查看互斥锁状态
(gdb) print *mutex
(gdb) print mutex->__data.__owner
# 查看条件变量
(gdb) print *cond
(gdb) print cond->__data
# 查看线程属性
(gdb) print *attr
# 检测死锁
(gdb) thread apply all bt
# 查找所有线程是否都在等待锁
7.4 线程调度分析
gdb
# 查看线程调度策略(policy/param 必须是被调试程序中已有的变量; GDB的$便利变量没有内存地址,不能写成 &$policy)
(gdb) call pthread_getschedparam(pthread_self(), &policy, &param)
(gdb) print policy
(gdb) print param
# 查看CPU亲和性
(gdb) call pthread_getaffinity_np(pthread_self(), sizeof(cpu_set_t), &cpuset)
8. 常见宕机场景分析
8.1 段错误(Segmentation Fault)
典型原因:
- 空指针解引用
- 数组越界
- 栈溢出
- 释放后使用
- 未初始化指针
调试步骤:
gdb
# 分析core dump
(gdb) bt
#0 0x0000000000400abc in process_data (data=0x0) at main.c:50
#1 0x0000000000400bcd in main () at main.c:100
# 查看崩溃位置
(gdb) frame 0
(gdb) list
45 int process_data(Data *data) {
46 // ...
47 // 缺少对 data == NULL 的检查, 应补上:
48 // if (data == NULL)
49 // return -1;
50 return data->value; // 崩溃位置
51 }
# 查看变量
(gdb) print data
$1 = (Data *) 0x0
# 查看寄存器
(gdb) info registers
rdi 0x0 0 # 空指针(第一个参数data)
# 反汇编分析
(gdb) disassemble
0x0000000000400abc <+0>: mov (%rdi),%rax # 解引用空指针
8.2 内存越界
调试方法:
gdb
# 使用观察点检测
(gdb) watch array[100]
(gdb) continue
# 检查内存布局
(gdb) info proc mappings
(gdb) x/100x array
# 注意: GDB 没有 "set check memory" 命令(只有 set check type / set check range)
# 运行时越界检测请使用下面的 AddressSanitizer 或 Valgrind
AddressSanitizer检测:
bash
# 编译时启用ASan
gcc -fsanitize=address -fno-omit-frame-pointer -g -o myapp myapp.c
# 运行时配置
export ASAN_OPTIONS=halt_on_error=1:detect_leaks=1:abort_on_error=1
./myapp
# ASan会自动报告:
# ==1234==ERROR: AddressSanitizer: heap-buffer-overflow
# WRITE of size 4 at 0x602000000050 thread T0
8.3 栈溢出
检测方法:
gdb
# 查看栈指针
(gdb) info registers $sp
(gdb) print $sp
# 查看栈帧
(gdb) info frame
(gdb) info stack
# 查看栈内存
(gdb) x/100x $sp
# 检查栈保护
(gdb) info functions __stack_chk_fail
预防措施:
c
// 编译选项
-fstack-protector-all // 全部函数启用栈保护
-fstack-check // 运行时栈检查
// 代码规范
// 1. 避免大数组局部变量
// 2. 避免深层递归
// 3. 使用ulimit设置栈大小
8.4 死锁
检测步骤:
gdb
# 附加到卡死的进程
gdb -p <PID>
# 查看所有线程
(gdb) info threads
(gdb) thread apply all bt
# 分析等待关系
# 线程1:
#0 __lll_lock_wait () at pthread_mutex_lock.c
#1 pthread_mutex_lock (mutex=0x601000) at pthread_mutex_lock.c
# 线程2:
#0 __lll_lock_wait () at pthread_mutex_lock.c
#1 pthread_mutex_lock (mutex=0x601010) at pthread_mutex_lock.c
# 查看锁的持有者
(gdb) print *mutex
$1 = {__data = {__lock = 2, __count = 1, __owner = 1235, ...}}
# owner = 1235 表示线程1235持有该锁
死锁检测脚本:
python
# deadlock_detect.py
import gdb
class DeadlockDetector(gdb.Command):
    """Report threads that wait on each other's pthread mutexes.

    Usage: detect-deadlock

    Every thread whose stack contains a pthread mutex lock call is treated
    as waiting for the thread recorded in that mutex's __data.__owner field.
    Any cycle in the resulting wait-for graph is printed as a possible
    deadlock. Reading the "mutex" argument requires debug information for
    the C library; threads whose mutex cannot be read are skipped.
    """

    # Frame names whose "mutex" argument identifies the lock being acquired.
    _LOCK_FUNCTIONS = (
        "pthread_mutex_lock",
        "__pthread_mutex_lock",
        "___pthread_mutex_lock",
    )

    def __init__(self):
        super().__init__("detect-deadlock", gdb.COMMAND_USER)

    def invoke(self, arg, from_tty):
        """Build the wait-for graph over all threads and print its cycles."""
        previous = gdb.selected_thread()
        waiting = {}  # LWP id of a waiting thread -> LWP id of the mutex owner
        numbers = {}  # LWP id -> GDB thread number, used for readable output
        try:
            for thread in gdb.selected_inferior().threads():
                # NOTE(review): ptid[1] is assumed to be the kernel LWP id,
                # the same id glibc stores in __owner - confirm on the target.
                lwp = thread.ptid[1]
                numbers[lwp] = thread.num
                owner = self._mutex_owner(thread)
                if owner:  # 0 means the mutex is not held
                    waiting[lwp] = owner
        finally:
            # Inspecting a thread requires selecting it; put the user's
            # selection back when done.
            if previous is not None and previous.is_valid():
                previous.switch()

        cycles = self._find_cycles(waiting)
        for cycle in cycles:
            chain = " -> ".join(
                self._label(lwp, numbers) for lwp in cycle + cycle[:1]
            )
            print(f"Possible deadlock: {chain}")
        if not cycles:
            print("No circular mutex wait found")
        print("Deadlock analysis complete")

    def _mutex_owner(self, thread):
        """Return the owner id of the mutex `thread` is locking, or None."""
        try:
            thread.switch()
            frame = gdb.newest_frame()
            while frame is not None:
                if frame.name() in self._LOCK_FUNCTIONS:
                    mutex = frame.read_var("mutex")
                    return int(mutex.dereference()["__data"]["__owner"])
                frame = frame.older()
        except (gdb.error, ValueError):
            # No debug info for the argument, unreadable memory, or an
            # unwinding failure: treat the thread as not waiting.
            return None
        return None

    @staticmethod
    def _find_cycles(waiting):
        """Return each cycle of the waiter -> owner mapping as a list of ids."""
        cycles = []
        reported = set()
        for start in waiting:
            if start in reported:
                continue
            path = []
            node = start
            while node in waiting and node not in path:
                path.append(node)
                node = waiting[node]
            if node in path:
                cycle = path[path.index(node):]
                # A chain that merely leads into an already reported cycle
                # must not report that cycle a second time.
                if not reported.intersection(cycle):
                    cycles.append(cycle)
                reported.update(cycle)
        return cycles

    @staticmethod
    def _label(lwp, numbers):
        """Format an LWP id together with its GDB thread number if known."""
        if lwp in numbers:
            return f"thread {numbers[lwp]} (LWP {lwp})"
        return f"LWP {lwp}"


DeadlockDetector()
8.5 内存泄漏
GDB检测:
gdb
# 断点malloc/free
(gdb) break malloc
(gdb) commands
> silent
> printf "malloc(%lu)\n", $rdi # 入口处只能取到参数(x86-64: $rdi, ARM: $r0); 返回值需finish后再查看
> continue
> end
(gdb) break free
(gdb) commands
> silent
> printf "free(%p)\n", $rdi
> continue
> end
Valgrind检测:
bash
# 运行valgrind
valgrind --leak-check=full \
--show-leak-kinds=all \
--track-origins=yes \
--log-file=valgrind.log \
./myapp
# 输出示例
# ==1234== 100 bytes in 1 blocks are definitely lost
# ==1234== at 0x4C2AB80: malloc (in /usr/lib/valgrind)
# ==1234== by 0x400544: main (main.c:10)
9. 高级调试技巧
9.1 反向调试
gdb
# 启用记录
(gdb) record
(gdb) record full
# 运行程序
(gdb) continue
# 反向执行
(gdb) reverse-continue
(gdb) reverse-step
(gdb) reverse-next
(gdb) reverse-stepi
# 查看执行日志
(gdb) info record
# 停止记录
(gdb) record stop
9.2 Python脚本扩展
python
# custom_commands.py
import gdb
class DumpMemory(gdb.Command):
    """Dump memory region to file: dump-mem <start> <size> <filename>"""

    def __init__(self):
        super().__init__("dump-mem", gdb.COMMAND_USER)

    @staticmethod
    def _to_int(text):
        """Convert an argument to int.

        Plain integer literals (decimal, 0x..., 0o..., 0b...) are parsed
        directly; anything else is evaluated as a GDB expression so that
        values such as $sp or &buffer can be used.
        """
        try:
            return int(text, 0)
        except ValueError:
            return int(gdb.parse_and_eval(text))

    def invoke(self, arg, from_tty):
        """Read <size> bytes at <start> from the inferior and write them out.

        Raises gdb.GdbError (reported by GDB as a plain error message, without
        a Python traceback) when an argument cannot be evaluated, the memory
        cannot be read, or the file cannot be written.
        """
        args = gdb.string_to_argv(arg)
        if len(args) != 3:
            print("Usage: dump-mem <start> <size> <filename>")
            return

        try:
            start = self._to_int(args[0])
            size = self._to_int(args[1])
        except gdb.error as exc:
            raise gdb.GdbError(f"dump-mem: bad argument: {exc}")
        if size <= 0:
            raise gdb.GdbError("dump-mem: size must be positive")
        filename = args[2]

        # Read first so that a failed read does not leave an empty file behind.
        try:
            mem = gdb.selected_inferior().read_memory(start, size)
        except gdb.MemoryError as exc:
            raise gdb.GdbError(f"dump-mem: cannot read memory: {exc}")

        try:
            with open(filename, 'wb') as f:
                f.write(mem)
        except OSError as exc:
            raise gdb.GdbError(f"dump-mem: cannot write {filename}: {exc}")

        print(f"Dumped {size} bytes from 0x{start:x} to {filename}")


DumpMemory()
class BacktraceAll(gdb.Command):
    """Print detailed backtrace for all threads"""

    # The class docstring above is what GDB shows for "help bt-all".

    def __init__(self):
        # Register the command under the user-defined command category.
        gdb.Command.__init__(self, "bt-all", gdb.COMMAND_USER)

    def invoke(self, arg, from_tty):
        """Run the built-in per-thread full backtrace; arguments are ignored."""
        command = "thread apply all bt full"
        gdb.execute(command)


BacktraceAll()
加载脚本:
gdb
(gdb) source custom_commands.py
(gdb) dump-mem 0x400000 1024 mem.bin
9.3 远程调试优化
gdb
# 减少数据传输
set remotebreak 0
set remotelogfile /dev/null
# 只读段直接从本地文件读取,减少从目标板传输的数据
set trust-readonly-sections on
# 允许从任意路径自动加载脚本(注意: 这是放开而不是禁用自动加载,仅在可信环境使用)
set auto-load safe-path /
# 使用共享库搜索路径
set solib-search-path /opt/sdk/sysroot/lib:/opt/sdk/sysroot/usr/lib
set sysroot /opt/sdk/sysroot
9.4 条件断点优化
gdb
# 复杂条件断点会很慢,改用计数器
(gdb) break main.c:100
(gdb) ignore 1 999
(gdb) continue
# 第1000次触发断点
# 使用断点命令加速
(gdb) break main.c:100
(gdb) commands
> if x > 1000
> printf "x = %d\n", x
> end
> continue
> end
9.5 内联函数调试
gdb
# 显示内联函数
(gdb) set print inline on
# 在内联函数设置断点
(gdb) break inline_func if $pc != 0
# 查看内联调用链
(gdb) backtrace
10. 调试脚本与自动化
10.1 自动化分析脚本
bash
#!/bin/bash
# auto_analyze.sh - write a text crash report for a core dump.
#
# Usage:       auto_analyze.sh <core-file> <binary> [debug-file]
# Environment: GDB - debugger to run (default: arm-linux-gnueabihf-gdb)
# Output:      analysis_<core file name>.txt in the current directory
#
# NOTE: the paths are embedded in GDB command strings, so they must not
# contain whitespace.

set -u

CORE_FILE=${1:-}
BINARY=${2:-}
DEBUG_FILE=${3:-}

if [[ -z "$CORE_FILE" || -z "$BINARY" ]]; then
  echo "Usage: $0 <core-file> <binary> [debug-file]" >&2
  exit 1
fi

# Fail early with a clear message instead of producing a report that only
# contains GDB's "No such file" errors.
for required in "$CORE_FILE" "$BINARY" ${DEBUG_FILE:+"$DEBUG_FILE"}; do
  if [[ ! -r "$required" ]]; then
    echo "Error: cannot read '$required'" >&2
    exit 1
  fi
done

GDB=${GDB:-arm-linux-gnueabihf-gdb}
if ! command -v "$GDB" > /dev/null 2>&1; then
  echo "Error: '$GDB' not found in PATH" >&2
  exit 1
fi

# Report header
REPORT="analysis_$(basename -- "$CORE_FILE").txt"
{
  echo "=== Crash Analysis Report ==="
  echo "Date: $(date)"
  echo "Core: $CORE_FILE"
  echo "Binary: $BINARY"
  echo ""
} > "$REPORT"

# Build the GDB command list as an array so the optional debug-file step
# can be added without relying on word splitting.
gdb_cmds=(-ex "file $BINARY")
if [[ -n "$DEBUG_FILE" ]]; then
  gdb_cmds+=(-ex "add-symbol-file $DEBUG_FILE")
fi
gdb_cmds+=(
  -ex "core-file $CORE_FILE"
  -ex "set pagination off"
  -ex 'echo === Signal Info ===\n'
  # $_siginfo describes the signal that killed the process; "info signals"
  # would only list GDB's signal handling table. Single quotes keep the
  # shell from expanding "$_".
  -ex 'print $_siginfo'
  -ex 'echo \n=== Registers ===\n'
  -ex "info registers"
  -ex 'echo \n=== Backtrace ===\n'
  -ex "bt full"
  -ex 'echo \n=== All Threads ===\n'
  -ex "thread apply all bt"
  -ex 'echo \n=== Memory Map ===\n'
  -ex "info proc mappings"
  -ex 'echo \n=== Disassembly ===\n'
  -ex "disassemble"
)

# GDB's exit status is deliberately not checked: a single failing command
# (for example "disassemble" with no symbol at the PC) still leaves a
# useful report.
"$GDB" -batch "${gdb_cmds[@]}" >> "$REPORT" 2>&1

echo "Analysis complete: $REPORT"
10.2 GDB批处理脚本
gdb
# analyze.gdb - batch script that writes a crash report to analysis.txt
set pagination off
set confirm off

# Start logging before anything is loaded so that the summary printed by
# core-file ("Program terminated with signal ...") ends up in the report.
# Output is appended to an existing analysis.txt; use
# "set logging overwrite on" before enabling logging to replace it instead.
set logging file analysis.txt
# GDB 12 and later prefer "set logging enabled on"; the short form below is
# kept because older cross toolchains only understand it.
set logging on

# Load symbols
file ./myapp
add-symbol-file ./myapp.debug

# Load the core dump
core-file ./core.1234

# Analysis
echo === Crash Analysis ===\n
echo \n=== Signal ===\n
# $_siginfo holds the details of the signal that killed the process;
# "info signals" would only list GDB's signal handling table.
print $_siginfo
echo \n=== Registers ===\n
info all-registers
echo \n=== Backtrace ===\n
bt full
echo \n=== Thread Info ===\n
info threads
thread apply all bt full
echo \n=== Frame Info ===\n
info frame
info args
info locals
echo \n=== Memory at PC ===\n
x/20i $pc
x/20x $sp

# Stop logging and exit
set logging off
quit
执行:
bash
arm-linux-gnueabihf-gdb -x analyze.gdb
10.3 持续监控脚本
bash
#!/bin/bash
# monitor_crash.sh - watch the core dump directory and analyse each new dump.
#
# For every finished core file in WATCH_DIR this writes a GDB backtrace
# report to ANALYZED_DIR, mails a notification and compresses the dump.
# Requires inotifywait (inotify-tools), the cross GDB and mail.

WATCH_DIR="/var/core"
ANALYZED_DIR="/var/core/analyzed"
BINARY="/usr/bin/myapp"
DEBUG_FILE="/opt/debug/myapp.debug"

mkdir -p "$ANALYZED_DIR" || exit 1

# close_write is reported once the writer has closed the file, so the dump
# is complete when the event arrives; watching "create" and sleeping would
# race with large dumps. --format '%f' prints only the file name, which
# keeps names containing spaces intact.
inotifywait -m -e close_write -e moved_to --format '%f' "$WATCH_DIR" |
while IFS= read -r file; do
  case "$file" in
    # gzip below writes core.*.gz into the watched directory; without this
    # arm every dump would be "analysed" and mailed a second time.
    core.*.gz) continue ;;
    core.*) ;;
    *) continue ;;
  esac

  CORE="$WATCH_DIR/$file"
  REPORT="$ANALYZED_DIR/${file}.txt"
  [[ -f "$CORE" ]] || continue

  echo "Detected new core dump: $file"

  # Analyse; stderr is captured too so GDB errors are visible in the report.
  arm-linux-gnueabihf-gdb -batch \
    -ex "file $BINARY" \
    -ex "add-symbol-file $DEBUG_FILE" \
    -ex "core-file $CORE" \
    -ex "bt full" \
    -ex "thread apply all bt" \
    > "$REPORT" 2>&1

  # Send the notification
  echo "Crash analyzed: $REPORT" | mail -s "Crash Report" admin@example.com

  # Compress the core dump
  gzip -- "$CORE"
done
10.4 GDB Dashboard配置
python
# ~/.gdbinit.d/dashboard.py
# GDB Dashboard - 可视化调试界面
import gdb
class Dashboard:
    """Print registers, stack words, source and locals at every stop.

    Creating an instance hooks it into GDB's stop and continue events.
    Set `enabled` to False to silence the output without disconnecting.
    """

    # Registers to show, chosen by the prefix of the frame's architecture
    # name. "aarch64" must be tested before the shorter "arm" prefix.
    _REGISTERS_BY_ARCH = (
        ("aarch64", ["x%d" % i for i in range(31)] + ["sp", "pc"]),
        ("arm", ["r%d" % i for i in range(13)] + ["sp", "lr", "pc"]),
    )
    # Used for x86-64 and for any architecture not listed above.
    _DEFAULT_REGISTERS = [
        'rax', 'rbx', 'rcx', 'rdx', 'rsi', 'rdi', 'rbp', 'rsp',
        'r8', 'r9', 'r10', 'r11', 'r12', 'r13', 'r14', 'r15', 'rip',
    ]
    # Number of stack words printed by show_stack().
    _STACK_WORDS = 8

    def __init__(self):
        self.enabled = True
        gdb.events.stop.connect(self.refresh)
        gdb.events.cont.connect(self.clear)

    def refresh(self, event):
        """Stop-event handler: print all panels between two separator lines."""
        if not self.enabled:
            return
        print("\n" + "="*80)
        self.show_registers()
        self.show_stack()
        self.show_code()
        self.show_locals()
        print("="*80 + "\n")

    def clear(self, event):
        """Continue-event handler; intentionally does nothing."""
        pass

    def _register_names(self):
        """Return the register list matching the selected frame's architecture."""
        try:
            arch = gdb.selected_frame().architecture().name()
        except gdb.error:
            return self._DEFAULT_REGISTERS
        for prefix, names in self._REGISTERS_BY_ARCH:
            if arch.startswith(prefix):
                return names
        return self._DEFAULT_REGISTERS

    def show_registers(self):
        """Print each known register as a zero-padded unsigned hex value."""
        print("\n[Registers]")
        for reg in self._register_names():
            try:
                val = gdb.parse_and_eval(f'${reg}')
                if val.type.code == gdb.TYPE_CODE_VOID:
                    # The name is not a register on this target.
                    continue
                width = val.type.sizeof
                # Mask to the register width: registers may have a signed
                # type, and a negative int would format as "0x-...".
                raw = int(val) & ((1 << (8 * width)) - 1)
            except gdb.error:
                continue
            digits = 2 * width
            print(f" {reg:4s} = 0x{raw:0{digits}x}")

    def show_stack(self):
        """Print the first words of the stack, starting at the stack pointer."""
        print("\n[Stack]")
        try:
            # NOTE(review): "unsigned long" is assumed to be pointer-sized on
            # the target (true for 32- and 64-bit Linux ABIs) - confirm.
            word_type = gdb.lookup_type('unsigned long')
            digits = 2 * word_type.sizeof
            sp = gdb.parse_and_eval('$sp')
            base = int(sp)
            words = sp.cast(word_type.pointer())
            for i in range(self._STACK_WORDS):
                addr = base + i * word_type.sizeof
                val = int(words[i])
                print(f" 0x{addr:0{digits}x} : 0x{val:0{digits}x}")
        except gdb.error as exc:
            # Includes gdb.MemoryError for unreadable stack memory.
            print(f" <unavailable: {exc}>")

    def show_code(self):
        """Print the source lines around the current location."""
        print("\n[Code]")
        try:
            # to_string=True captures the output instead of printing it, so
            # the captured text has to be printed explicitly.
            print(gdb.execute('list', to_string=True), end='')
        except gdb.error as exc:
            print(f" <unavailable: {exc}>")

    def show_locals(self):
        """Print the local variables of the selected frame."""
        print("\n[Locals]")
        try:
            print(gdb.execute('info locals', to_string=True), end='')
        except gdb.error as exc:
            print(f" <unavailable: {exc}>")


dashboard = Dashboard()
附录
A. GDB常用命令速查表
| 命令 | 简写 | 说明 |
|---|---|---|
| break | b | 设置断点 |
| run | r | 运行程序 |
| continue | c | 继续执行 |
| next | n | 单步(跳过函数) |
| step | s | 单步(进入函数) |
| finish | fin | 执行到函数返回 |
| backtrace | bt | 显示调用栈 |
| print | p | 打印变量 |
| list | l | 显示源码 |
| info | i | 查看信息 |
| thread | t | 线程操作 |
| watch | wa | 设置观察点 |
| x | - | 查看内存 |
| disassemble | dis | 反汇编 |
| quit | q | 退出 |
B. 信号与错误对应表
| 信号 | 编号 | 常见原因 |
|---|---|---|
| SIGSEGV | 11 | 段错误,非法内存访问 |
| SIGABRT | 6 | abort()调用,assert失败 |
| SIGBUS | 7 | 总线错误,内存对齐 |
| SIGFPE | 8 | 浮点异常,除零 |
| SIGILL | 4 | 非法指令 |
| SIGPIPE | 13 | 管道破裂 |
| SIGTERM | 15 | 终止请求 |
| SIGKILL | 9 | 强制终止 |
C. 参考资源
- GDB官方文档: https://sourceware.org/gdb/documentation/
- GDB Dashboard: https://github.com/cyrus-and/gdb-dashboard
- Crash工具: https://github.com/crash-utility/crash
- Valgrind: https://valgrind.org/
- AddressSanitizer: https://github.com/google/sanitizers
文档版本: 1.0
最后更新: 2026-05-10