SignalAnrTracer才是真正意义上的ANR监控,当应用发生ANR后,system_server进程会发送SIGQUIT信号来通知相关进程来dump堆栈,SignalAnrTracer使用sigaction方法注册signalHandler来监听SIGQUIT信号来达到监听ANR的目的。
一、ANR流程
ANR有以下几种类型:
- InputDispatching Timeout:5秒内无法响应屏幕触摸事件或键盘输入事件。
- BroadcastQueue Timeout:在执行前台广播(BroadcastReceiver)的onReceive()函数时10秒没有处理完成,后台为60秒。
- Service Timeout:前台服务20秒内,后台服务在200秒内没有执行完毕。
- ContentProvider Timeout:ContentProvider的publish在10s内没进行完。
这些类型的ANR最后都会走到appNotResponding
(以下源码基于Android10,只分析相关流程)
java
//ProcessRecord.java
/**
 * Excerpt of ProcessRecord#appNotResponding (Android 10), trimmed to the stack-dump flow.
 * Builds the ordered pid lists for the dump and passes them to
 * ActivityManagerService.dumpStackTraces. The declarations of firstPids, lastPids,
 * nativeProcs and processCpuTracker are omitted from this excerpt.
 */
void appNotResponding(String activityShortComponentName, ApplicationInfo aInfo,
        String parentShortComponentName, WindowProcessController parentProcess,
        boolean aboveSystem, String annotation) {
    // (1) The process that hit the ANR is queued first.
    // Dump thread traces as quickly as we can, starting with "interesting" processes.
    firstPids.add(pid);

    // Don't dump other PIDs if it's a background ANR
    if (!isSilentAnr()) {
        int parentPid = pid;
        if (parentProcess != null && parentProcess.getPid() > 0) {
            parentPid = parentProcess.getPid();
        }
        // (2) Add the parent process, if it is a different process.
        if (parentPid != pid) firstPids.add(parentPid);
        // (3) Add the system process (MY_PID) unless it is already covered.
        if (MY_PID != pid && MY_PID != parentPid) firstPids.add(MY_PID);

        for (int i = getLruProcessList().size() - 1; i >= 0; i--) {
            ProcessRecord r = getLruProcessList().get(i);
            if (r != null && r.thread != null) {
                int myPid = r.pid;
                if (myPid > 0 && myPid != pid && myPid != parentPid && myPid != MY_PID) {
                    // (4) Persistent and activity-like processes also go into firstPids.
                    if (r.isPersistent()) {
                        firstPids.add(myPid);
                        if (DEBUG_ANR) Slog.i(TAG, "Adding persistent proc: " + r);
                    } else if (r.treatLikeActivity) {
                        firstPids.add(myPid);
                        if (DEBUG_ANR) Slog.i(TAG, "Adding likely IME: " + r);
                    } else {
                        // (5) Everything else is only a candidate in lastPids (the article
                        // describes these as the high-CPU-usage processes).
                        lastPids.put(myPid, Boolean.TRUE);
                        if (DEBUG_ANR) Slog.i(TAG, "Adding ANR proc: " + r);
                    }
                }
            }
        }
    }
    // The block above must end here: the dump below has to run for silent ANRs as well,
    // which is why its arguments still test isSilentAnr().

    // (6) Resolve the fixed set of native processes to pids.
    int[] pids = nativeProcs == null ? null : Process.getPidsForCommands(nativeProcs);
    ArrayList<Integer> nativePids = null;
    if (pids != null) {
        nativePids = new ArrayList<>(pids.length);
        for (int i : pids) {
            nativePids.add(i);
        }
    }

    // (7) Dump the stacks of the collected processes.
    // For background ANRs, don't pass the ProcessCpuTracker to
    // avoid spending 1/2 second collecting stats to rank lastPids.
    File tracesFile = ActivityManagerService.dumpStackTraces(firstPids,
            (isSilentAnr()) ? null : processCpuTracker, (isSilentAnr()) ? null : lastPids,
            nativePids);
}
首先搜集发生ANR的进程,再搜集其它进程,按照顺序dump,也就是说发生ANR的进程是第一个被dump的。
java
//ActivityManagerService.java
// Excerpt of ActivityManagerService.dumpStackTraces: forwards to the overload that takes the
// trace file path. The setup of tracesFile and extraPids is omitted from this excerpt.
public static File dumpStackTraces(ArrayList<Integer> firstPids,
ProcessCpuTracker processCpuTracker, SparseArray<Boolean> lastPids,
ArrayList<Integer> nativePids) {
// Dump in the order firstPids, nativePids, extraPids.
dumpStackTraces(tracesFile.getAbsolutePath(), firstPids, nativePids, extraPids);
}
//然后又经过一系列调用
ActivityManagerService#dumpStackTraces
-> dumpJavaTracesTombstoned
-> Debug#dumpJavaBacktraceToFileTimeout
-> android_os_Debug#android_os_Debug_dumpJavaBacktraceToFileTimeout
-> dumpTraces (这里的dump_type是kDebuggerdJavaBacktrace)
-> debuggerd_client#dump_backtrace_to_file_timeout
-> debuggerd_trigger_dump
cpp
//path: /system/core/debuggerd/client/debuggerd_client.cpp
// Excerpt of debuggerd_trigger_dump: only the signal-sending step is shown; the rest of the
// function body is omitted from this excerpt.
bool debuggerd_trigger_dump(pid_t tid, DebuggerdDumpType dump_type, unsigned int timeout_ms,
unique_fd output_fd) {
// Send the dump signal to the target; give up if it cannot be delivered.
if (!send_signal(tid, dump_type)) {
return false;
}
}
// Queues the dump-request signal for `pid`.
// Returns true when sigqueue() succeeds; logs and returns false otherwise.
static bool send_signal(pid_t pid, const DebuggerdDumpType dump_type) {
    // Java backtraces are requested with SIGQUIT, every other dump type with DEBUGGER_SIGNAL.
    int signo = DEBUGGER_SIGNAL;
    if (dump_type == kDebuggerdJavaBacktrace) {
        signo = SIGQUIT;
    }

    // The payload is 1 for a native backtrace request and 0 for everything else.
    sigval payload;
    if (dump_type == kDebuggerdNativeBacktrace) {
        payload.sival_int = 1;
    } else {
        payload.sival_int = 0;
    }

    // sigqueue delivers the signal (with its payload) to the process that must dump.
    if (sigqueue(pid, signo, payload) == 0) {
        return true;
    }
    PLOG(ERROR) << "libdebuggerd_client: failed to send signal to pid " << pid;
    return false;
}
出于安全考虑,进程之间是相互隔离的,因此需要使用IPC通信方式,通过sigqueue向需要dump堆栈信息的进程发送SIGQUIT信号。
SignalCatcher
ActivityManagerService会请求Zygote fork进程,最终会通过Runtime 创建SignalCatcher线程。
cc
//path: /art/runtime/runtime.cc
// Excerpt of Runtime::Init showing only the BlockSignals() call; everything else is omitted.
bool Runtime::Init(RuntimeArgumentMap&& runtime_options_in) {
// Per the surrounding article, this call is what leaves SIGQUIT blocked by default.
BlockSignals();
}
Android默认把SIGQUIT设置成了BLOCKED,每个应用进程都会有一个SignalCatcher线程,专门处理SIGQUIT等信号。
cc
//path: /art/runtime/signal_catcher.cc
// Excerpt of the signal-catcher thread loop: waits for SIGQUIT / SIGUSR1 and dispatches each
// to its handler until ShouldHalt() reports true. The setup of self, runtime and
// signal_catcher is omitted from this excerpt.
void* SignalCatcher::Run(void* arg) {
// Set up mask with signals we want to handle.
SignalSet signals;
// Signals this thread waits for.
signals.Add(SIGQUIT);
signals.Add(SIGUSR1);
while (true) {
// Wait until one of the signals in the set arrives.
int signal_number = signal_catcher->WaitForSignal(self, signals);
if (signal_catcher->ShouldHalt()) {
runtime->DetachCurrentThread();
return nullptr;
}
switch (signal_number) {
case SIGQUIT:
// Handle the SIGQUIT.
signal_catcher->HandleSigQuit();
break;
case SIGUSR1:
signal_catcher->HandleSigUsr1();
break;
default:
LOG(ERROR) << "Unexpected signal %d" << signal_number;
break;
}
}
}
WaitForSignal最终调用了Linux系统提供的sigwait来监听被阻塞的信号:
cc
// Puts the thread into the kWaitingInMainSignalCatcherLoop state and waits for one of
// `signals`; returns the number of the signal that arrived.
int SignalCatcher::WaitForSignal(Thread* self, SignalSet& signals) {
    // The state change lasts for the whole wait and is undone when this scope ends.
    ScopedThreadStateChange tsc(self, kWaitingInMainSignalCatcherLoop);

    // Signals for sigwait() must be blocked but not ignored. SIGQUIT and friends are
    // blocked for all threads, so that condition holds; when the signal arrives we
    // simply wake up and no signal handler is invoked.
    return signals.Wait();
}
二、SIGQUIT信号监控
SignalAnrTracer在onAlive时调用nativeInitSignalAnrDetective方法进入MatrixTracer.cc
cc
// JNI entry point (excerpt): constructs the global AnrDumper from the two trace paths.
// The conversion of the jstring arguments into anrTracePathChar / printTracePathChar is
// omitted from this excerpt.
static void nativeInitSignalAnrDetective(JNIEnv *env, jclass, jstring anrTracePath, jstring printTracePath) {
// Construct the AnrDumper in place.
sAnrDumper.emplace(anrTracePathChar, printTracePathChar);
}
AnrDumper
cc
// AnrDumper derives from SignalHandler; only its constructor declaration is shown here.
class AnrDumper : public SignalHandler {
public:
// Takes two C strings, named for the ANR trace file and the print-trace file.
AnrDumper(const char* anrTraceFile, const char* printTraceFile);
}
AnrDumper继承自SignalHandler,先看下SignalHandler的构造函数:
cc
// Constructing a SignalHandler installs the signal handler immediately.
SignalHandler::SignalHandler() {
installHandlersLocked();
}
// Registers signalHandler for TARGET_SIG through sigaction().
// Returns false when sigaction() reports failure (-1), true otherwise.
bool SignalHandler::installHandlersLocked() {
    struct sigaction action{};
    action.sa_flags = SA_ONSTACK | SA_SIGINFO | SA_RESTART;
    // SA_SIGINFO is set, so the three-argument handler slot is the one to fill in.
    action.sa_sigaction = signalHandler;

    // The previously installed action is not requested (third argument is nullptr).
    return sigaction(TARGET_SIG, &action, nullptr) != -1;
}
这里之所以不采用SignalCatcher线程那种用sigwait监听被阻塞信号的方式,是因为当多个线程同时用这种方式监听同一个信号时,最终由哪一个线程收到信号是不确定的。
所以这里采用了另一种方式: 创建我们自己的signalHandler,通过Linux提供的sigaction方法注册signal handler进行异步监听。
接着看下AnrDumper的构造函数:
cc
// Excerpt of the AnrDumper constructor; the initialization of sigSet / old_sigSet is omitted.
AnrDumper::AnrDumper(const char* anrTraceFile, const char* printTraceFile) {
// UNBLOCK (SIG_UNBLOCK) the signals in sigSet via pthread_sigmask so that they can reach
// our signalHandler; per the surrounding article sigSet holds SIGQUIT. The previous mask
// is saved in old_sigSet.
pthread_sigmask(SIG_UNBLOCK, &sigSet , &old_sigSet);
}
AnrDumper在初始化的时候,通过pthread_sigmask将SIGQUIT设置成UNBLOCK(解除阻塞),否则SIGQUIT只会被系统的SignalCatcher线程通过sigwait取走,我们设置的signalHandler就监听不到SIGQUIT信号。
三、handleSignal
handleSignal方法收到SIGQUIT信号后,根据该信号是否由当前进程自己发出,分别新建线程执行siUserCallback(是当前进程发出)或anrCallback(不是当前进程发出)。
cc
// Signal handler body. For SIGQUIT it starts a detached worker thread:
//   - anrCallback    when the signal was not sent by this process (possible ANR);
//   - siUserCallback when this process sent the signal itself.
// Any other signal is ignored.
//
// sig  - number of the delivered signal
// info - siginfo of the delivered signal; the sender pid is read from it
// uc   - ucontext pointer, unused here
void AnrDumper::handleSignal(int sig, const siginfo_t *info, void *uc) {
    if (sig != SIGQUIT) {
        return;
    }

    // The sender pid is read from raw padding slots 3 and 4 of siginfo rather than from a
    // named field. NOTE(review): presumably this covers layout differences between
    // platforms; confirm before changing.
    const int myPid = getpid();
    const bool fromMySelf =
            info != nullptr && (info->_si_pad[3] == myPid || info->_si_pad[4] == myPid);

    void *(*entry)(void *) = nullptr;
    if (!fromMySelf) {
        // Not sent by this process: go on to confirm whether an ANR really happened.
        entry = anrCallback;
    } else {
        // Sent by this process: may be a SIGQUIT that is unrelated to an ANR.
        entry = siUserCallback;
    }

    pthread_t thd;
    // Detach only a thread that was actually created; on failure `thd` is left
    // uninitialized and must not be passed to pthread_detach.
    if (pthread_create(&thd, nullptr, entry, nullptr) == 0) {
        pthread_detach(thd);
    }
}
anrCallback
如果不是当前进程发出的SIGQUIT信号,就要进一步确认是否的确发生了anr
cc
// Thread entry used when SIGQUIT was not sent by this process. Runs three steps in order:
// notify the Java layer, optionally hook the trace write, then forward the signal.
static void *anrCallback(void* arg) {
// Calls up into the Java layer (SignalAnrTracer.onANRDumped).
anrDumpCallback();
if (strlen(mAnrTraceFile) > 0) {
// A trace file path is configured: hook the write path so the ANR trace is captured.
hookAnrTraceWrite(false);
}
// Forward the signal so that the system's own ANR handling resumes.
sendSigToSignalCatcher();
return nullptr;
}
onANRDumped
anrDumpCallback调到了Java层SignalAnrTracer的onANRDumped方法
java
/**
 * Called from native code after a SIGQUIT was received. Runs the ANR confirmation on a
 * separate thread and waits for it for at most {@code anrReportTimeout} milliseconds.
 */
private synchronized static void onANRDumped() {
    final CountDownLatch finished = new CountDownLatch(1);
    final Runnable confirmTask = new Runnable() {
        @Override
        public void run() {
            // Confirm whether an ANR really happened, then release the waiting caller.
            confirmRealAnr(true);
            finished.countDown();
        }
    };
    final Thread worker = new Thread(confirmTask, ANR_DUMP_THREAD_NAME);
    worker.start();
    try {
        finished.await(anrReportTimeout, TimeUnit.MILLISECONDS);
    } catch (InterruptedException ignored) {
        // Interruption is deliberately ignored.
    }
}
confirmRealAnr
confirmRealAnr主要根据主线程是否卡住 和进程是否有NOT_RESPONDING标记来确定是否发生了ANR
java
/**
 * Decides whether the received signal corresponds to a real ANR. If the main thread is
 * blocked the ANR is reported right away; otherwise a background thread polls the process
 * error state.
 *
 * @param isSigQuit passed through to the log line, to report() and to the polling loop
 */
private static void confirmRealAnr(final boolean isSigQuit) {
    MatrixLog.i(TAG, "confirmRealAnr, isSigQuit = " + isSigQuit);

    // Fast path: a blocked main thread is reported immediately.
    if (isMainThreadBlocked()) {
        report(false, isSigQuit);
        return;
    }

    // Otherwise poll the process error state on its own thread.
    final Runnable poller = new Runnable() {
        @Override
        public void run() {
            checkErrorStateCycle(isSigQuit);
        }
    };
    new Thread(poller, CHECK_ANR_STATE_THREAD_NAME).start();
}
isMainThreadBlocked
isMainThreadBlocked通过反射获取消息队列MessageQueue的队首消息mMessages,取其预期执行时间when与当前时间比较:如果该消息超过阈值仍未被执行,就说明主线程卡住了。
java
/**
 * Checks whether the message at the head of the main looper's queue is overdue.
 *
 * @return true when the head message's scheduled time minus the current uptime is below the
 *         threshold (foreground or background); false when there is no head message, its
 *         {@code when} is 0, or any exception occurs
 */
@RequiresApi(api = Build.VERSION_CODES.M)
private static boolean isMainThreadBlocked() {
    try {
        // Read the head of the main MessageQueue ("mMessages") through reflection.
        MessageQueue mainQueue = Looper.getMainLooper().getQueue();
        Field headField = mainQueue.getClass().getDeclaredField("mMessages");
        headField.setAccessible(true);
        final Message head = (Message) headField.get(mainQueue);
        if (head == null) {
            MatrixLog.i(TAG, "mMessage is null");
            return false;
        }

        anrMessageString = head.toString();
        MatrixLog.i(TAG, "anrMessageString = " + anrMessageString);

        // Time at which the head message was scheduled to run.
        long scheduledAt = head.getWhen();
        if (scheduledAt == 0) {
            return false;
        }

        // Negative when the message should already have run.
        long delta = scheduledAt - SystemClock.uptimeMillis();
        anrMessageWhen = delta;

        // Separate thresholds apply while in the foreground and in the background.
        long threshold = currentForeground ? FOREGROUND_MSG_THRESHOLD : BACKGROUND_MSG_THRESHOLD;
        return delta < threshold;
    } catch (Exception e) {
        return false;
    }
}
checkErrorStateCycle
为了防止SIGQUIT信号不是由本应用ANR导致发出的,需要判断当前进程是否有NOT_RESPONDING标记。这里创建了一个线程在20s内每隔500ms检测一次。
java
/**
 * Polls checkErrorState() up to CHECK_ERROR_STATE_COUNT times, sleeping
 * CHECK_ERROR_STATE_INTERVAL between attempts. Reports the ANR and stops on the first
 * positive result; also stops after logging if anything is thrown.
 *
 * @param isSigQuit passed through to report()
 */
private static void checkErrorStateCycle(boolean isSigQuit) {
    for (int attempt = 0; attempt < CHECK_ERROR_STATE_COUNT; attempt++) {
        try {
            // Has this process been flagged NOT_RESPONDING yet?
            if (checkErrorState()) {
                report(true, isSigQuit);
                return;
            }
            Thread.sleep(CHECK_ERROR_STATE_INTERVAL);
        } catch (Throwable t) {
            MatrixLog.e(TAG, "checkErrorStateCycle error, e : " + t.getMessage());
            return;
        }
    }
}
java
/**
 * Looks through the processes the ActivityManager reports as being in an error state.
 *
 * @return true when an entry with this process's pid is NOT_RESPONDING; false when the list
 *         is null, when a NOT_RESPONDING entry with a different uid is seen first, when no
 *         matching entry exists, or when anything is thrown
 */
private static boolean checkErrorState() {
    try {
        MatrixLog.i(TAG, "[checkErrorState] start");
        Application app = sApplication;
        if (app == null) {
            app = Matrix.with().getApplication();
        }
        ActivityManager activityManager =
                (ActivityManager) app.getSystemService(Context.ACTIVITY_SERVICE);
        List<ActivityManager.ProcessErrorStateInfo> errorStates =
                activityManager.getProcessesInErrorState();
        if (errorStates == null) {
            MatrixLog.i(TAG, "[checkErrorState] procs == null");
            return false;
        }

        for (ActivityManager.ProcessErrorStateInfo info : errorStates) {
            MatrixLog.i(TAG, "[checkErrorState] found Error State proccessName = %s, proc.condition = %d", info.processName, info.condition);
            final boolean notResponding =
                    info.condition == ActivityManager.ProcessErrorStateInfo.NOT_RESPONDING;

            // A NOT_RESPONDING entry under a different uid: the signal was probably caused
            // by another app's ANR.
            if (notResponding && info.uid != android.os.Process.myUid()) {
                MatrixLog.i(TAG, "maybe received other apps ANR signal");
                return false;
            }

            // Only an entry that is both this process and NOT_RESPONDING counts.
            if (notResponding && info.pid == android.os.Process.myPid()) {
                MatrixLog.i(TAG, "error sate longMsg = %s", info.longMsg);
                return true;
            }
        }
        return false;
    } catch (Throwable t) {
        MatrixLog.e(TAG, "[checkErrorState] error : %s", t.getMessage());
    }
    return false;
}
siUserCallback
如果我们想要知道应用当前的所有线程的状态,我们就可以主动发送一个 SIGQUIT 信号给 SignalCatcher 线程,这样也可以通过 hook 拿到对应的 dump 文件,发送信号可以通过 syscall(SYS_tgkill, myPid, gSignalCatcherTid, SIGQUIT)方法发送。
四、hookAnrTraceWrite
SignalCatcher线程写Trace是通过socket的write方法,hook这里的write就能拿到系统dump的ANR Trace内容。
cc
// Registers the hooks used to capture the trace written during a dump: one on the call that
// opens the trace destination (connect or open) and one on write, then refreshes xhook.
// Does nothing below API 19, for a self-sent signal that is not our own print-trace request,
// or when hooks were already installed.
//
// isSiUser - true when called for a SIGQUIT sent by this process itself
void hookAnrTraceWrite(bool isSiUser) {
    const int apiLevel = getApiLevel();
    if (apiLevel < 19) {
        return;
    }
    if (!fromMyPrintTrace && isSiUser) {
        return;
    }
    if (isHooking) {
        return;
    }
    isHooking = true;

    // The destination is opened with connect() from libcutils on API 27+ and with open()
    // from libart below that.
    if (apiLevel >= 27) {
        xhook_grouped_register(HOOK_REQUEST_GROUPID_ANR_DUMP_TRACE, ".*libcutils\\.so$",
                               "connect", (void *) my_connect, (void **) (&original_connect));
    } else {
        xhook_grouped_register(HOOK_REQUEST_GROUPID_ANR_DUMP_TRACE, ".*libart\\.so$",
                               "open", (void *) my_open, (void **) (&original_open));
    }

    // The library whose write() must be hooked also depends on the API level.
    const char *writeLibPattern = ".*libart\\.so$";
    if (apiLevel >= 30 || apiLevel == 25 || apiLevel == 24) {
        writeLibPattern = ".*libc\\.so$";
    } else if (apiLevel == 29) {
        writeLibPattern = ".*libbase\\.so$";
    }
    xhook_grouped_register(HOOK_REQUEST_GROUPID_ANR_DUMP_TRACE, writeLibPattern,
                           "write", (void *) my_write, (void **) (&original_write));

    xhook_refresh(true);
}
hook点因API Level而不同:需要处理的是SignalCatcher线程connect/open之后的第一次write,而需要hook的write所在的so在不同Android版本上也不一样,需要分别处理。
my_connect/my_open
dump trace流程在write前,系统会先用connect方法连接一个path为"/dev/socket/tombstoned_java_trace"的socket,拦截connect获取SignalCatcher的线程id,并将isTraceWrite置为true。my_open方法逻辑一样。
cc
int my_connect(int __fd, const struct sockaddr* __addr, socklen_t __addr_length) {
if (__addr!= nullptr) {
//比较path相同
if (strcmp(__addr->sa_data, HOOK_CONNECT_PATH) == 0) {
//获取SignalCatcher的线程id
signalCatcherTid = gettid();
//isTraceWrite置为true
isTraceWrite = true;
}
}
//不改变原有逻辑
return original_connect(__fd, __addr, __addr_length);
}
my_write
只处理connect这个socket后,相同线程(即SignalCatcher线程)的第一次write,拦截write方法将ANR Trace写入指定文件路径。
cc
// Replacement for write(): copies the first buffer written by the recorded thread after the
// hooked connect into the configured trace file, fires the matching callback, and then
// always forwards to the original write.
ssize_t my_write(int fd, const void* const buf, size_t count) {
// isTraceWrite is armed by the hooked connect; the tid check restricts this to that thread.
if(isTraceWrite && gettid() == signalCatcherTid) {
// Clear the flag so that only one write is intercepted.
isTraceWrite = false;
// Clear the recorded tid for the same reason.
signalCatcherTid = 0;
if (buf != nullptr) {
std::string targetFilePath;
if (fromMyPrintTrace) {
targetFilePath = printTracePathString;
} else {
targetFilePath = anrTracePathString;
}
if (!targetFilePath.empty()) {
char *content = (char *) buf;
// Write the content to the chosen file.
writeAnr(content, targetFilePath);
if(!fromMyPrintTrace) {
anrDumpTraceCallback();
} else {
printTraceCallback();
}
fromMyPrintTrace = false;
}
}
}
// Keep the original write behavior.
return original_write(fd, buf, count);
}
五、sendSigToSignalCatcher
我们设置了signalHandler抢了系统的SIGQUIT,SignalCatcher线程的sigwait就收不到信号了,系统原有的dump等流程就无法完成了,所以需要转发一个SIGQUIT保证系统逻辑不变。
cc
// Re-sends SIGQUIT to the SignalCatcher thread of this process so the system's own handling
// still runs after our handler has consumed the signal.
static void sendSigToSignalCatcher() {
    // Per the original note, the tid is found by scanning this process's /proc entries.
    const int catcherTid = getSignalCatcherThreadId();
    const pid_t self = getpid();
    // tgkill targets exactly that thread within this process.
    syscall(SYS_tgkill, self, catcherTid, SIGQUIT);
}
小结
SignalAnrTracer主要功能:
1、使用sigaction方法注册signal handler进行异步监听SIGQUIT信号。
2、通过pthread_sigmask将SIGQUIT设置成UNBLOCK(解除阻塞),使我们设置的signalHandler能监听到SIGQUIT信号。
3、收到SIGQUIT 信号后,根据主线程是否卡住和进程是否有NOT_RESPONDING标记来进一步确定是否发生了ANR。
4、hook anr写入,只处理connect这个socket后,相同线程(即SignalCatcher线程)的第一次write,拦截write方法将ANR Trace写入指定文件路径。
5、转发SIGQUIT信号给SignalCatcher线程,以完成系统流程。