首先以 service 处理超时逻辑来分析
1)service timeout调用到anr 的逻辑:
/frameworks/base/services/core/java/com/android/server/am/ActiveServices.java
5827 void scheduleServiceTimeoutLocked(ProcessRecord proc) {
5828 if (proc.mServices.numberOfExecutingServices() == 0 || proc.getThread() == null) {
5829 return;
5830 }
// 发送延时消息 SERVICE_TIMEOUT_MSG 来触发超时处理:前台服务延时 SERVICE_TIMEOUT,后台服务延时 SERVICE_BACKGROUND_TIMEOUT
5831 Message msg = mAm.mHandler.obtainMessage(
5832 ActivityManagerService.SERVICE_TIMEOUT_MSG);
5833 msg.obj = proc;
5834 mAm.mHandler.sendMessageDelayed(msg, proc.mServices.shouldExecServicesFg()
5835 ? SERVICE_TIMEOUT : SERVICE_BACKGROUND_TIMEOUT);
5836 }
service 走超时的逻辑:
/frameworks/base/services/core/java/com/android/server/am/ActivityManagerService.java
1685 @Override
1686 public void handleMessage(Message msg) {
1687 switch (msg.what) {
1688 case GC_BACKGROUND_PROCESSES_MSG: {
1689 synchronized (ActivityManagerService.this) {
1690 mAppProfiler.performAppGcsIfAppropriateLocked();
1691 }
1692 } break;
1693 case SERVICE_TIMEOUT_MSG: {
1694 mServices.serviceTimeout((ProcessRecord) msg.obj);
1695 } break;
/frameworks/base/services/core/java/com/android/server/am/ActiveServices.java
5712 void serviceTimeout(ProcessRecord proc) {
5713 String anrMessage = null;
5714 synchronized(mAm) {
5715 if (proc.isDebugging()) {
5716 // The app's being debugged, ignore timeout.
5717 return;
5718 }
5719 final ProcessServiceRecord psr = proc.mServices;
// 如果没有在执行的service ,则直接return
5720 if (psr.numberOfExecutingServices() == 0 || proc.getThread() == null) {
5721 return;
5722 }
5723 final long now = SystemClock.uptimeMillis();
// 计算超时临界时间点 maxTime(当前时间减去超时时长):executingStart 早于该时间点的 service 即已超时
5724 final long maxTime = now -
5725 (psr.shouldExecServicesFg() ? SERVICE_TIMEOUT : SERVICE_BACKGROUND_TIMEOUT);
5726 ServiceRecord timeout = null;
5727 long nextTime = 0;
5728 for (int i = psr.numberOfExecutingServices() - 1; i >= 0; i--) {
5729 ServiceRecord sr = psr.getExecutingServiceAt(i);
// 找到运行超时的service
5730 if (sr.executingStart < maxTime) {
5731 timeout = sr;
5732 break;
5733 }
5734 if (sr.executingStart > nextTime) {
5735 nextTime = sr.executingStart;
5736 }
5737 }
// 找到了超时的 service,并且该进程还在 lru 列表中,才会走 anr 流程;否则走 else 分支重新发送超时消息
5738 if (timeout != null && mAm.mProcessList.isInLruListLOSP(proc)) {
// 会打印下列的log
5739 Slog.w(TAG, "Timeout executing service: " + timeout);
5740 StringWriter sw = new StringWriter();
5741 PrintWriter pw = new FastPrintWriter(sw, false, 1024);
5742 pw.println(timeout);
5743 timeout.dump(pw, " ");
5744 pw.close();
// 保存上一次 anr 的 dump 信息,在 dump ams 的时候会打印
5745 mLastAnrDump = sw.toString();
5746 mAm.mHandler.removeCallbacks(mLastAnrDumpClearer);
// 超过 2 个小时,则去移除anr 的信息
5747 mAm.mHandler.postDelayed(mLastAnrDumpClearer, LAST_ANR_LIFETIME_DURATION_MSECS);
// 设置anr 的消息
5748 anrMessage = "executing service " + timeout.shortInstanceName;
5749 } else {
5750 Message msg = mAm.mHandler.obtainMessage(
5751 ActivityManagerService.SERVICE_TIMEOUT_MSG);
5752 msg.obj = proc;
5753 mAm.mHandler.sendMessageAtTime(msg, psr.shouldExecServicesFg()
5754 ? (nextTime+SERVICE_TIMEOUT) : (nextTime + SERVICE_BACKGROUND_TIMEOUT));
5755 }
5756 }
5757
// anr 消息不为空,走 AnrHelper 的 appNotResponding 方法
5758 if (anrMessage != null) {
5759 mAm.mAnrHelper.appNotResponding(proc, anrMessage);
5760 }
5761 }
// anr 消息不为空,走 AnrHelper 的 appNotResponding 方法
这里分析 a14 的代码,a14 增加了打印trace 的log
/frameworks/base/services/core/java/com/android/server/am/AnrHelper.java
// 这里变化为 TimeoutRecord 了
111 void appNotResponding(ProcessRecord anrProcess, TimeoutRecord timeoutRecord) {
112 appNotResponding(anrProcess, null /* activityShortComponentName */, null /* aInfo */,
113 null /* parentShortComponentName */, null /* parentProcess */,
114 false /* aboveSystem */, timeoutRecord, /*isContinuousAnr*/ false);
115 }
117 void appNotResponding(ProcessRecord anrProcess, String activityShortComponentName,
118 ApplicationInfo aInfo, String parentShortComponentName,
119 WindowProcessController parentProcess, boolean aboveSystem,
120 TimeoutRecord timeoutRecord, boolean isContinuousAnr) {
121 try {
// 1-1)增加了打印trace 的日志
122 timeoutRecord.mLatencyTracker.appNotRespondingStarted();
// 获取到anr 的进程号pid
123 final int incomingPid = anrProcess.mPid;
124 timeoutRecord.mLatencyTracker.waitingOnAnrRecordLockStarted();
125 synchronized (mAnrRecords) {
126 timeoutRecord.mLatencyTracker.waitingOnAnrRecordLockEnded();
// 如果 pid 为 0(极端情况,例如 zygote 无响应、没有返回该进程的 pid),则跳过,不处理这次 anr
127 if (incomingPid == 0) {
128 // Extreme corner case such as zygote is no response
129 // to return pid for the process.
130 Slog.i(TAG, "Skip zero pid ANR, process=" + anrProcess.processName);
131 return;
132 }
// 如果该 pid 的 anr 正在被处理(等于 mProcessingPid),则跳过这次重复的 anr
133 if (mProcessingPid == incomingPid) {
134 Slog.i(TAG,
135 "Skip duplicated ANR, pid=" + incomingPid + " "
136 + timeoutRecord.mReason);
137 return;
138 }
// 将 anr 的 pid 增加到 mTempDumpedPids set 中;如果增加不成功,表示该 pid 已经在预 dump 中,直接跳过
139 if (!mTempDumpedPids.add(incomingPid)) {
140 Slog.i(TAG,
141 "Skip ANR being predumped, pid=" + incomingPid + " "
142 + timeoutRecord.mReason);
143 return;
144 }
145 for (int i = mAnrRecords.size() - 1; i >= 0; i--) {
146 if (mAnrRecords.get(i).mPid == incomingPid) {
147 Slog.i(TAG,
148 "Skip queued ANR, pid=" + incomingPid + " "
149 + timeoutRecord.mReason);
150 return;
151 }
152 }
153 // We dump the main process as soon as we can on a different thread,
154 // this is done as the main process's dump can go stale in a few hundred
155 // milliseconds and the average full ANR dump takes a few seconds.
156 timeoutRecord.mLatencyTracker.earlyDumpRequestSubmittedWithSize(
157 mTempDumpedPids.size());
// 1-2)通过线程池去创建tempanr 的路径:StackTracesDumpHelper.dumpStackTracesTempFile;并且去通知tombstone dump出java 的调用栈
158 Future<File> firstPidDumpPromise = mEarlyDumpExecutor.submit(() -> {
159 // the class AnrLatencyTracker is not generally thread safe but the values
160 // recorded/touched by the Temporary dump thread(s) are all volatile/atomic.
161 File tracesFile = StackTracesDumpHelper.dumpStackTracesTempFile(incomingPid,
162 timeoutRecord.mLatencyTracker);
163 mTempDumpedPids.remove(incomingPid);
164 return tracesFile;
165 });
166
167 timeoutRecord.mLatencyTracker.anrRecordPlacingOnQueueWithSize(mAnrRecords.size());
// 创建 AnrRecord 对象,将其保存到 mAnrRecords 中
168 mAnrRecords.add(new AnrRecord(anrProcess, activityShortComponentName, aInfo,
169 parentShortComponentName, parentProcess, aboveSystem, timeoutRecord,
170 isContinuousAnr, firstPidDumpPromise));
171 }
// 1-3)开启线程去处理保存的 AnrRecord:startAnrConsumerIfNeeded
172 startAnrConsumerIfNeeded();
173 } finally {
174 timeoutRecord.mLatencyTracker.appNotRespondingEnded();
175 }
176
177 }
1-1)增加了打印trace 的日志
打印的trace 为 "AnrHelper#appNotResponding()"
timeoutRecord.mLatencyTracker.appNotRespondingStarted();
timeoutRecord.mLatencyTracker.appNotRespondingEnded();
/frameworks/base/core/java/com/android/internal/os/anr/AnrLatencyTracker.java
137 /** Records the start of AnrHelper#appNotResponding. */
138 public void appNotRespondingStarted() {
139 mAppNotRespondingStartUptime = getUptimeMillis();
140 Trace.traceBegin(TRACE_TAG_ACTIVITY_MANAGER,
141 "AnrHelper#appNotResponding()");
142 }
143
144 /** Records the end of AnrHelper#appNotResponding. */
145 public void appNotRespondingEnded() {
146 Trace.traceEnd(TRACE_TAG_ACTIVITY_MANAGER);
147 }
1-2)通过线程池去创建tempanr 的路径:StackTracesDumpHelper.dumpStackTracesTempFile;并且去通知tombstone dump出java 的调用栈
/frameworks/base/services/core/java/com/android/server/am/StackTracesDumpHelper.java
363 public static File dumpStackTracesTempFile(int pid, AnrLatencyTracker latencyTracker) {
364 try {
// 这里也是通过trace 记录创建file 的耗时:"dumpStackTracesTempFile"
365 if (latencyTracker != null) {
366 latencyTracker.dumpStackTracesTempFileStarted();
367 }
368
369 File tmpTracesFile;
370 try {
// ANR_TEMP_FILE_PREFIX 为:temp_anr_,ANR_TRACE_DIR 是路径为 /data/anr/
371 tmpTracesFile = File.createTempFile(ANR_TEMP_FILE_PREFIX, ".txt",
372 new File(ANR_TRACE_DIR));
// 会打印下列的日志,为:
// D ActivityManager: created ANR temporary file:/data/anr/temp_anr_5190759843177188281.txt
373 Slog.d(TAG, "created ANR temporary file:" + tmpTracesFile.getAbsolutePath());
374 } catch (IOException e) {
375 Slog.w(TAG, "Exception creating temporary ANR dump file:", e);
376 if (latencyTracker != null) {
377 latencyTracker.dumpStackTracesTempFileCreationFailed();
378 }
379 return null;
380 }
381
// 也会打印下列的log,表示收集哪个进程的
382 Slog.i(TAG, "Collecting stacks for pid " + pid + " into temporary file "
383 + tmpTracesFile.getName());
384 if (latencyTracker != null) {
385 latencyTracker.dumpingPidStarted(pid);
386 }
// 与tombstone 通信保存java 调用栈,设置超时的时间为 10 秒:dumpJavaTracesTombstoned
387 final long timeTaken = dumpJavaTracesTombstoned(pid, tmpTracesFile.getAbsolutePath(),
388 TEMP_DUMP_TIME_LIMIT);
389 if (latencyTracker != null) {
390 latencyTracker.dumpingPidEnded();
391 }
// 如果保存java 调用栈时间超过10秒,会打印下列的日志
392 if (TEMP_DUMP_TIME_LIMIT <= timeTaken) {
393 Slog.e(TAG, "Aborted stack trace dump (current primary pid=" + pid
394 + "); deadline exceeded.");
395 if (latencyTracker != null) {
396 latencyTracker.dumpStackTracesTempFileTimedOut();
397 }
398 }
399 if (DEBUG_ANR) {
400 Slog.d(TAG, "Done with primary pid " + pid + " in " + timeTaken + "ms"
401 + " dumped into temporary file " + tmpTracesFile.getName());
402 }
403 return tmpTracesFile;
404 } finally {
405 if (latencyTracker != null) {
406 latencyTracker.dumpStackTracesTempFileEnded();
407 }
408 }
409 }
// 与tombstone 通信保存java 调用栈,设置超时的时间为 10 秒:dumpJavaTracesTombstoned
547 private static long dumpJavaTracesTombstoned(int pid, String fileName, long timeoutMs) {
548 final long timeStart = SystemClock.elapsedRealtime();
// 先向文件写入以 "----- dumping pid: " 开头的头部信息,并返回头部的长度 headerSize
549 int headerSize = writeUptimeStartHeaderForPid(pid, fileName);
// 通过dumpJavaBacktraceToFileTimeout 去保存调用栈
550 boolean javaSuccess = Debug.dumpJavaBacktraceToFileTimeout(pid, fileName,
551 (int) (timeoutMs / 1000));
552 if (javaSuccess) {
553 // Check that something is in the file, actually. Try-catch should not be necessary,
554 // but better safe than sorry.
555 try {
556 long size = new File(fileName).length();
557 if ((size - headerSize) < JAVA_DUMP_MINIMUM_SIZE) {
558 Slog.w(TAG, "Successfully created Java ANR file is empty!");
559 javaSuccess = false;
560 }
561 } catch (Exception e) {
562 Slog.w(TAG, "Unable to get ANR file size", e);
563 javaSuccess = false;
564 }
565 }
// 保存java 调用栈不成功,则去获取到native 层的调用栈
566 if (!javaSuccess) {
567 Slog.w(TAG, "Dumping Java threads failed, initiating native stack dump.");
568 if (!Debug.dumpNativeBacktraceToFileTimeout(pid, fileName,
569 (NATIVE_DUMP_TIMEOUT_MS / 1000))) {
570 Slog.w(TAG, "Native stack dump failed!");
571 }
572 }
573
574 return SystemClock.elapsedRealtime() - timeStart;
575 }
// 通过dumpJavaBacktraceToFileTimeout 去保存调用栈
/android-14.0.0_r2/xref/frameworks/base/core/jni/android_os_Debug.cpp
779 static jboolean android_os_Debug_dumpJavaBacktraceToFileTimeout(JNIEnv* env, jobject clazz,
780 jint pid, jstring fileName, jint timeoutSecs) {
// 传入的是打印java 的调用栈:kDebuggerdJavaBacktrace
781 const bool ret = dumpTraces(env, pid, fileName, timeoutSecs, kDebuggerdJavaBacktrace);
782 return ret ? JNI_TRUE : JNI_FALSE;
783 }
757 static bool dumpTraces(JNIEnv* env, jint pid, jstring fileName, jint timeoutSecs,
758 DebuggerdDumpType dumpType) {
759 const ScopedUtfChars fileNameChars(env, fileName);
760 if (fileNameChars.c_str() == nullptr) {
761 return false;
762 }
763
// 打开这个文件,返回fd 文件描述符,给客户端使用
764 android::base::unique_fd fd(open(fileNameChars.c_str(),
765 O_CREAT | O_WRONLY | O_NOFOLLOW | O_CLOEXEC | O_APPEND,
766 0666));
767 if (fd < 0) {
768 PLOG(ERROR) << "Can't open " << fileNameChars.c_str();
769 return false;
770 }
771
// 调用 dump_backtrace_to_file_timeout 方法
772 int res = dump_backtrace_to_file_timeout(pid, dumpType, timeoutSecs, fd);
773 if (fdatasync(fd.get()) != 0) {
774 PLOG(ERROR) << "Failed flushing trace.";
775 }
776 return res == 0;
777 }
/xref/system/core/debuggerd/client/debuggerd_client.cpp
// 另外一篇博客有分析:这里会与 tombstoned 守护进程(即文中所说的 tombstone)进行 socket 通信,由其配合完成 dump
304 int dump_backtrace_to_file_timeout(pid_t tid, DebuggerdDumpType dump_type, int timeout_secs,
305 int fd) {
306 android::base::unique_fd copy(dup(fd));
307 if (copy == -1) {
308 return -1;
309 }
310
311 // debuggerd_trigger_dump results in every thread in the process being interrupted
312 // by a signal, so we need to fetch the wchan data before calling that.
313 std::string wchan_data = get_wchan_data(fd, tid);
314
315 int timeout_ms = timeout_secs > 0 ? timeout_secs * 1000 : 0;
316 int ret = debuggerd_trigger_dump(tid, dump_type, timeout_ms, std::move(copy)) ? 0 : -1;
317
318 // Dump wchan data, since only privileged processes (CAP_SYS_ADMIN) can read
319 // kernel stack traces (/proc/*/stack).
320 if (!WriteStringToFd(wchan_data, fd)) {
321 LOG(WARNING) << TAG "Failed to dump wchan data for pid: " << tid;
322 }
323
324 return ret;
325 }
326
1-3)开启线程去处理保存的 AnrRecord:startAnrConsumerIfNeeded
/frameworks/base/services/core/java/com/android/server/am/AnrHelper.java
179 private void startAnrConsumerIfNeeded() {
// 如果为false,则开启AnrConsumerThread 线程
180 if (mRunning.compareAndSet(false, true)) {
181 new AnrConsumerThread().start();
182 }
183 }
199 private class AnrConsumerThread extends Thread {
200 AnrConsumerThread() {
201 super("AnrConsumer");
202 }
203
// 获取到下一个 AnrRecord 对象
204 private AnrRecord next() {
205 synchronized (mAnrRecords) {
206 if (mAnrRecords.isEmpty()) {
207 return null;
208 }
209 final AnrRecord record = mAnrRecords.remove(0);
210 mProcessingPid = record.mPid;
211 record.mTimeoutRecord.mLatencyTracker.anrRecordsQueueSizeWhenPopped(
212 mAnrRecords.size());
213 return record;
214 }
215 }
216
217 @Override
218 public void run() {
219 AnrRecord r;
220 while ((r = next()) != null) {
221 scheduleBinderHeavyHitterAutoSamplerIfNecessary();
222 final int currentPid = r.mApp.mPid;
223 if (currentPid != r.mPid) {
224 // The process may have restarted or died.
225 Slog.i(TAG, "Skip ANR with mismatched pid=" + r.mPid + ", current pid="
226 + currentPid);
227 continue;
228 }
229 final long startTime = SystemClock.uptimeMillis();
// mTimestamp 为创建 AnrRecord 的时间
232 final long reportLatency = startTime - r.mTimestamp;
// 如果创建AnrRecord 到处理anr的时间耗时超过 10秒,则onlyDumpSelf 为true。一般都是为false的
233 final boolean onlyDumpSelf = reportLatency > EXPIRED_REPORT_TIME_MS;
// 调用AnrRecord 的 appNotResponding 方法
234 r.appNotResponding(onlyDumpSelf);
235 final long endTime = SystemClock.uptimeMillis();
// 会打印下列log,可以判断 onlyDumpSelf 是否为true
236 Slog.d(TAG, "Completed ANR of " + r.mApp.processName + " in "
237 + (endTime - startTime) + "ms, latency " + reportLatency
238 + (onlyDumpSelf ? "ms (expired, only dump ANR app)" : "ms"));
239 }
240
241 mRunning.set(false);
242 synchronized (mAnrRecords) {
243 mProcessingPid = -1;
244 // The race should be unlikely to happen. Just to make sure we don't miss.
245 if (!mAnrRecords.isEmpty()) {
246 startAnrConsumerIfNeeded();
247 }
248 }
249 }
250
251 }
// 调用AnrRecord 的 appNotResponding 方法,该方法是在线程中处理的
296 void appNotResponding(boolean onlyDumpSelf) {
297 try {
298 mTimeoutRecord.mLatencyTracker.anrProcessingStarted();
299 mApp.mErrorState.appNotResponding(mActivityShortComponentName, mAppInfo,
300 mParentShortComponentName, mParentProcess, mAboveSystem,
301 mTimeoutRecord, mAuxiliaryTaskExecutor, onlyDumpSelf,
302 mIsContinuousAnr, mFirstPidFilePromise);
303 } finally {
304 mTimeoutRecord.mLatencyTracker.anrProcessingEnded();
305 }
306 }
307 }
/frameworks/base/services/core/java/com/android/server/am/ProcessErrorStateRecord.java
289 void appNotResponding(String activityShortComponentName, ApplicationInfo aInfo,
290 String parentShortComponentName, WindowProcessController parentProcess,
291 boolean aboveSystem, TimeoutRecord timeoutRecord,
292 ExecutorService auxiliaryTaskExecutor, boolean onlyDumpSelf,
293 boolean isContinuousAnr, Future<File> firstPidFilePromise) {
// 获取到anr 的message
294 String annotation = timeoutRecord.mReason;
295 AnrLatencyTracker latencyTracker = timeoutRecord.mLatencyTracker;
296 Future<?> updateCpuStatsNowFirstCall = null;
297
298 ArrayList<Integer> firstPids = new ArrayList<>(5);
299 SparseBooleanArray lastPids = new SparseBooleanArray(20);
300
// 只有设置了 controller 的情况下才会执行这里传入的 kill 回调;一般没有设置,所以这里不会走
301 mApp.getWindowProcessController().appEarlyNotResponding(annotation, () -> {
302 latencyTracker.waitingOnAMSLockStarted();
303 synchronized (mService) {
304 latencyTracker.waitingOnAMSLockEnded();
305 // Store annotation here as instance below races with this killLocked.
306 setAnrAnnotation(annotation);
307 mApp.killLocked("anr", ApplicationExitInfo.REASON_ANR, true);
308 }
309 });
310
311 long anrTime = SystemClock.uptimeMillis();
312
// 返回值为true 的,去更新cpu 的状态
313 if (isMonitorCpuUsage()) {
314 updateCpuStatsNowFirstCall = auxiliaryTaskExecutor.submit(
315 () -> {
316 latencyTracker.updateCpuStatsNowCalled();
317 mService.updateCpuStatsNow();
318 latencyTracker.updateCpuStatsNowReturned();
319 });
320
321 }
323 final boolean isSilentAnr;
// 获取到anr 的进程号
324 final int pid = mApp.getPid();
325 final UUID errorId;
326 latencyTracker.waitingOnAMSLockStarted();
327 synchronized (mService) {
328 latencyTracker.waitingOnAMSLockEnded();
329 // Store annotation here as instance above will not be hit on all paths.
330 setAnrAnnotation(annotation);
331
332 Counter.logIncrement("stability_anr.value_total_anrs");
// 看是否跳过anr,这里如果进程被kill 或者anr,或者shutdown,则return
333 if (skipAnrLocked(annotation)) {
334 latencyTracker.anrSkippedProcessErrorStateRecordAppNotResponding();
335 Counter.logIncrement("stability_anr.value_skipped_anrs");
336 return;
337 }
338 // In case we come through here for the same app before completing
339 // this one, mark as anring now so we will bail out.
340 latencyTracker.waitingOnProcLockStarted();
341 synchronized (mProcLock) {
342 latencyTracker.waitingOnProcLockEnded();
// 设置无响应状态
343 setNotResponding(true);
344 }
345
// 会打印 am_anr 的日志
// I am_anr : [0,4607,com.android.phone,952647245,executing service com.android.phone/.TelephonyDebugService]
346 // Log the ANR to the event log.
347 EventLog.writeEvent(EventLogTags.AM_ANR, mApp.userId, pid, mApp.processName,
348 mApp.info.flags, annotation);
349
350 if (mService.mTraceErrorLogger != null
351 && mService.mTraceErrorLogger.isAddErrorIdEnabled()) {
352 errorId = mService.mTraceErrorLogger.generateErrorId();
353 mService.mTraceErrorLogger.addProcessInfoAndErrorIdToTrace(
354 mApp.processName, pid, errorId);
355 mService.mTraceErrorLogger.addSubjectToTrace(annotation, errorId);
356 } else {
357 errorId = null;
358 }
359
360 // This atom is only logged with the purpose of triggering Perfetto and the logging
361 // needs to happen as close as possible to the time when the ANR is detected.
362 // Also, it needs to be logged after adding the error id to the trace, to make sure
363 // the error id is present in the trace when the Perfetto trace is captured.
364 FrameworkStatsLog.write(FrameworkStatsLog.ANR_OCCURRED_PROCESSING_STARTED,
365 mApp.processName);
366
367 // Dump thread traces as quickly as we can, starting with "interesting" processes.
// 将需要dump 调用栈的保存到 firstPids 中,将anr 的pid 保存
368 firstPids.add(pid);
369
// 1-3-1)判断是否需要SilentAnr
374 isSilentAnr = isSilentAnr();
375 if (!isSilentAnr && !onlyDumpSelf) {
// 设置父亲进程号为anr 的进程pid
376 int parentPid = pid;
// 只有父进程不为空且 pid 有效时才使用父进程的 pid;service 超时场景传入的 parentProcess 为 null,所以不会进入该分支
377 if (parentProcess != null && parentProcess.getPid() > 0) {
378 parentPid = parentProcess.getPid();
379 }
380 if (parentPid != pid) firstPids.add(parentPid);
381
// 增加系统进程,所以这里会dump系统进程
382 if (MY_PID != pid && MY_PID != parentPid) firstPids.add(MY_PID);
383
384 final int ppid = parentPid;
// 依据lru从大到小开始遍历所有进程
385 mService.mProcessList.forEachLruProcessesLOSP(false, r -> {
386 if (r != null && r.getThread() != null) {
387 int myPid = r.getPid();
// pid 有效,并且不是 anr 进程、不是父进程、也不是系统进程(system_server)
388 if (myPid > 0 && myPid != pid && myPid != ppid && myPid != MY_PID) {
// 保存常驻进程
389 if (r.isPersistent()) {
390 firstPids.add(myPid);
391 if (DEBUG_ANR) Slog.i(TAG, "Adding persistent proc: " + r);
// 是否有flag:BIND_TREAT_LIKE_ACTIVITY
392 } else if (r.mServices.isTreatedLikeActivity()) {
393 firstPids.add(myPid);
394 if (DEBUG_ANR) Slog.i(TAG, "Adding likely IME: " + r);
395 } else {
// 其他进程保存在 lastPids
396 lastPids.put(myPid, true);
397 if (DEBUG_ANR) Slog.i(TAG, "Adding ANR proc: " + r);
398 }
399 }
400 }
401 });
402 }
403 }
// 获取到anr 进程的内存信息
404 // Build memory headers for the ANRing process.
405 String memoryHeaders = buildMemoryHeadersFor(pid);
406
407 // Get critical event log before logging the ANR so that it doesn't occur in the log.
408 latencyTracker.criticalEventLogStarted();
// 打印 CriticalEventLog 信息
409 final String criticalEventLog =
410 CriticalEventLog.getInstance().logLinesForTraceFile(
411 mApp.getProcessClassEnum(), mApp.processName, mApp.uid);
412 latencyTracker.criticalEventLogEnded();
413 CriticalEventLog.getInstance().logAnr(annotation, mApp.getProcessClassEnum(),
414 mApp.processName, mApp.uid, mApp.mPid);
415
416 // Log the ANR to the main log.
// 打印anr 的log 日志
417 StringBuilder info = new StringBuilder();
418 info.setLength(0);
// 进程名字
419 info.append("ANR in ").append(mApp.processName);
420 if (activityShortComponentName != null) {
421 info.append(" (").append(activityShortComponentName).append(")");
422 }
423 info.append("\n");
424 info.append("PID: ").append(pid).append("\n");
425 if (annotation != null) {
426 info.append("Reason: ").append(annotation).append("\n");
427 }
428 if (parentShortComponentName != null
429 && parentShortComponentName.equals(activityShortComponentName)) {
430 info.append("Parent: ").append(parentShortComponentName).append("\n");
431 }
432 if (errorId != null) {
433 info.append("ErrorId: ").append(errorId.toString()).append("\n");
434 }
// 是否被冻结
435 info.append("Frozen: ").append(mApp.mOptRecord.isFrozen()).append("\n");
// 下列 anrController 为空
441 AnrController anrController = mService.mActivityTaskManager.getAnrController(aInfo);
442 long anrDialogDelayMs = 0;
443 if (anrController != null) {
...
451 }
452
453 StringBuilder report = new StringBuilder();
454
455 latencyTracker.currentPsiStateCalled();
456 String currentPsiState = ResourcePressureUtil.currentPsiState();
457 latencyTracker.currentPsiStateReturned();
458 report.append(currentPsiState);
459 ProcessCpuTracker processCpuTracker = new ProcessCpuTracker(true);
460
// 异步收集要dump native 层的pid
463 Future<ArrayList<Integer>> nativePidsFuture =
464 auxiliaryTaskExecutor.submit(
465 () -> {
466 latencyTracker.nativePidCollectionStarted();
467 // don't dump native PIDs for background ANRs unless
468 // it is the process of interest
469 String[] nativeProcs = null;
470 boolean isSystemApp = mApp.info.isSystemApp() || mApp.info.isSystemExt();
// 如果不是系统 app,或者是 SilentAnr,或者只 dump 自身(onlyDumpSelf),则只判断 anr 进程本身是否是感兴趣的进程
// 1-3-2)判断anr进程是否是感兴趣的进程
473 if (!isSystemApp || isSilentAnr || onlyDumpSelf) {
474 for (int i = 0; i < NATIVE_STACKS_OF_INTEREST.length; i++) {
475 if (NATIVE_STACKS_OF_INTEREST[i].equals(mApp.processName)) {
476 nativeProcs = new String[] { mApp.processName };
477 break;
478 }
479 }
480 } else {
481 nativeProcs = NATIVE_STACKS_OF_INTEREST;
482 }
483
// 如果是系统app 的话,会dumpnative感兴趣的进程
// native层获取到感兴趣的进程的pid 号
484 int[] pids = nativeProcs == null
485 ? null : Process.getPidsForCommands(nativeProcs);
486 ArrayList<Integer> nativePids = null;
487
488 if (pids != null) {
489 nativePids = new ArrayList<>(pids.length);
490 for (int i : pids) {
491 nativePids.add(i);
492 }
493 }
494 latencyTracker.nativePidCollectionEnded();
495 return nativePids;
496 });
497
500 StringWriter tracesFileException = new StringWriter();
501 // To hold the start and end offset to the ANR trace file respectively.
502 final AtomicLong firstPidEndOffset = new AtomicLong(-1);
// 1-3-3)调用 dumpStackTraces方法
503 File tracesFile = StackTracesDumpHelper.dumpStackTraces(firstPids,
504 isSilentAnr ? null : processCpuTracker, isSilentAnr ? null : lastPids,
505 nativePidsFuture, tracesFileException, firstPidEndOffset, annotation,
506 criticalEventLog, memoryHeaders, auxiliaryTaskExecutor, firstPidFilePromise,
507 latencyTracker);
508
509 if (isMonitorCpuUsage()) {
510 // Wait for the first call to finish
511 try {
512 updateCpuStatsNowFirstCall.get();
513 } catch (ExecutionException e) {
514 Slog.w(TAG, "Failed to update the CPU stats", e.getCause());
515 } catch (InterruptedException e) {
516 Slog.w(TAG, "Interrupted while updating the CPU stats", e);
517 }
518 mService.updateCpuStatsNow();
519 mService.mAppProfiler.printCurrentCpuState(report, anrTime);
520 info.append(processCpuTracker.printCurrentLoad());
521 info.append(report);
522 }
523 report.append(tracesFileException.getBuffer());
524
525 info.append(processCpuTracker.printCurrentState(anrTime));
526
527 Slog.e(TAG, info.toString());
// 1-3-1)判断是否需要SilentAnr
如果 isSilentAnr 返回为true,则需要满足下列2个条件都是为false的
746 @VisibleForTesting
747 @GuardedBy("mService")
748 boolean isSilentAnr() {
749 return !getShowBackground() && !isInterestingForBackgroundTraces();
750 }
// getShowBackground 方法
一般是为false
713 private boolean getShowBackground() {
714 final ContentResolver resolver = mService.mContext.getContentResolver();
715 return Settings.Secure.getIntForUser(resolver,
716 Settings.Secure.ANR_SHOW_BACKGROUND,
717 0,
718 resolver.getUserId()) != 0;
719 }
// isInterestingForBackgroundTraces方法,此时phone进程是不满足下列条件的,返回false
即如果是systemui或者有显示activity、OverlayUi或者有前台服务 isInterestingToUserLocked,则不会走 isSilentAnr 流程
692 @GuardedBy("mService")
693 private boolean isInterestingForBackgroundTraces() {
694 // The system_server is always considered interesting.
// 如果当前进程是系统进程,则为true
695 if (mApp.getPid() == MY_PID) {
696 return true;
697 }
698
699 // A package is considered interesting if any of the following is true :
700 //
701 // - It's displaying an activity.
702 // - It's the SystemUI.
703 // - It has an overlay or a top UI visible.
704 //
705 // NOTE: The check whether a given ProcessRecord belongs to the systemui
706 // process is a bit of a kludge, but the same pattern seems repeated at
707 // several places in the system server.
// 如果是systemui或者有显示activity、OverlayUi或者有前台服务 isInterestingToUserLocked
708 return mApp.isInterestingToUserLocked()
709 || (mApp.info != null && "com.android.systemui".equals(mApp.info.packageName))
710 || (mApp.mState.hasTopUi() || mApp.mState.hasOverlayUi());
711 }
// 1-3-2)判断anr进程是否是感兴趣的进程
/frameworks/base/services/core/java/com/android/server/Watchdog.java
113 // Which native processes to dump into dropbox's stack traces
114 public static final String[] NATIVE_STACKS_OF_INTEREST = new String[] {
115 "/system/bin/audioserver",
116 "/system/bin/cameraserver",
117 "/system/bin/drmserver",
118 "/system/bin/keystore2",
119 "/system/bin/mediadrmserver",
120 "/system/bin/mediaserver",
121 "/system/bin/netd",
122 "/system/bin/sdcard",
123 "/system/bin/surfaceflinger",
124 "/system/bin/vold",
125 "media.extractor", // system/bin/mediaextractor
126 "media.metrics", // system/bin/mediametrics
127 "media.codec", // vendor/bin/hw/android.hardware.media.omx@1.0-service
128 "media.swcodec", // /apex/com.android.media.swcodec/bin/mediaswcodec
129 "media.transcoding", // Media transcoding service
130 "com.android.bluetooth", // Bluetooth service
131 "/apex/com.android.os.statsd/bin/statsd", // Stats daemon
132 };
// 1-3-3)调用 dumpStackTraces方法
/frameworks/base/services/core/java/com/android/server/am/StackTracesDumpHelper.java
118 /* package */ static File dumpStackTraces(ArrayList<Integer> firstPids,
119 ProcessCpuTracker processCpuTracker, SparseBooleanArray lastPids,
120 Future<ArrayList<Integer>> nativePidsFuture, StringWriter logExceptionCreatingFile,
121 AtomicLong firstPidEndOffset, String subject, String criticalEventSection,
122 String memoryHeaders, @NonNull Executor auxiliaryTaskExecutor,
123 Future<File> firstPidFilePromise, AnrLatencyTracker latencyTracker) {
124 try {
125
126 if (latencyTracker != null) {
127 latencyTracker.dumpStackTracesStarted();
128 }
129
130 Slog.i(TAG, "dumpStackTraces pids=" + lastPids);
131
132 // Measure CPU usage as soon as we're called in order to get a realistic sampling
133 // of the top users at the time of the request.
134 Supplier<ArrayList<Integer>> extraPidsSupplier = processCpuTracker != null
135 ? () -> getExtraPids(processCpuTracker, lastPids, latencyTracker) : null;
136 Future<ArrayList<Integer>> extraPidsFuture = null;
137 if (extraPidsSupplier != null) {
138 extraPidsFuture =
139 CompletableFuture.supplyAsync(extraPidsSupplier, auxiliaryTaskExecutor);
140 }
141
142 final File tracesDir = new File(ANR_TRACE_DIR);
143
144 // NOTE: We should consider creating the file in native code atomically once we've
145 // gotten rid of the old scheme of dumping and lot of the code that deals with paths
146 // can be removed.
147 File tracesFile;
148 try {
// 创建anr 的文件 createAnrDumpFile
149 tracesFile = createAnrDumpFile(tracesDir);
150 } catch (IOException e) {
151 Slog.w(TAG, "Exception creating ANR dump file:", e);
152 if (logExceptionCreatingFile != null) {
153 logExceptionCreatingFile.append(
154 "----- Exception creating ANR dump file -----\n");
155 e.printStackTrace(new PrintWriter(logExceptionCreatingFile));
156 }
157 if (latencyTracker != null) {
158 latencyTracker.anrSkippedDumpStackTraces();
159 }
160 return null;
161 }
162
// 向 anr 文件写入 Subject(anr 原因)、memoryHeaders 内存信息和 criticalEventSection 信息
163 if (subject != null || criticalEventSection != null || memoryHeaders != null) {
164 appendtoANRFile(tracesFile.getAbsolutePath(),
165 (subject != null ? "Subject: " + subject + "\n" : "")
166 + (memoryHeaders != null ? memoryHeaders + "\n\n" : "")
167 + (criticalEventSection != null ? criticalEventSection : ""));
168 }
169
// 去dump 调用栈 dumpStackTraces
170 long firstPidEndPos = dumpStackTraces(
171 tracesFile.getAbsolutePath(), firstPids, nativePidsFuture,
172 extraPidsFuture, firstPidFilePromise, latencyTracker);
173 if (firstPidEndOffset != null) {
174 firstPidEndOffset.set(firstPidEndPos);
175 }
176 // Each set of ANR traces is written to a separate file and dumpstate will process
177 // all such files and add them to a captured bug report if they're recent enough.
178 maybePruneOldTraces(tracesDir);
179
180 return tracesFile;
181 } finally {
182 if (latencyTracker != null) {
183 latencyTracker.dumpStackTracesEnded();
184 }
185 }
186 }
// 创建anr 的文件 createAnrDumpFile
449 private static synchronized File createAnrDumpFile(File tracesDir) throws IOException {
450 final String formattedDate = ANR_FILE_DATE_FORMAT.format(new Date());
// 在 /data/anr 路径下创建文件,文件名为 ANR_FILE_PREFIX + 时间(代码中没有再拼接后缀),例如:anr_2024-08-08-16-56-08-405
451 final File anrFile = new File(tracesDir, ANR_FILE_PREFIX + formattedDate);
452
453 if (anrFile.createNewFile()) {
454 FileUtils.setPermissions(anrFile.getAbsolutePath(), 0600, -1, -1); // -rw-------
455 return anrFile;
456 } else {
457 throw new IOException("Unable to create ANR dump file: createNewFile failed");
458 }
459 }
// 去dump 调用栈 dumpStackTraces
191 public static long dumpStackTraces(String tracesFile,
192 ArrayList<Integer> firstPids, Future<ArrayList<Integer>> nativePidsFuture,
193 Future<ArrayList<Integer>> extraPidsFuture, Future<File> firstPidFilePromise,
194 AnrLatencyTracker latencyTracker) {
195
// 会打印对应的anr文件名
196 Slog.i(TAG, "Dumping to " + tracesFile);
197
// 整个 dump 的总时间预算为 20 秒(还要乘以 Build.HW_TIMEOUT_MULTIPLIER),后面每 dump 一个进程都会扣减耗时
203 long remainingTime = 20 * 1000 * Build.HW_TIMEOUT_MULTIPLIER;
204
205 // As applications are usually interested with the ANR stack traces, but we can't share
206 // with them the stack traces other than their own stacks. So after the very first PID is
207 // dumped, remember the current file size.
208 long firstPidEnd = -1;
209
210 // Was the first pid copied from the temporary file that was created in the predump phase?
211 boolean firstPidTempDumpCopied = false;
212
213 // First copy the first pid's dump from the temporary file it was dumped into earlier,
214 // The first pid should always exist in firstPids but we check the size just in case.
215 if (firstPidFilePromise != null && firstPids != null && firstPids.size() > 0) {
// 获取到 anr 进程的 pid(firstPids 的第一个元素)
216 final int primaryPid = firstPids.get(0);
217 final long start = SystemClock.elapsedRealtime();
// temp dump文件保存anr 的dump信息
218 firstPidTempDumpCopied = copyFirstPidTempDump(tracesFile, firstPidFilePromise,
219 remainingTime, latencyTracker);
220 final long timeTaken = SystemClock.elapsedRealtime() - start;
221 remainingTime -= timeTaken;
222 if (remainingTime <= 0) {
223 Slog.e(TAG, "Aborting stack trace dump (currently copying primary pid" + primaryPid
224 + "); deadline exceeded.");
225 return firstPidEnd;
226 }
227 // We don't copy ANR traces from the system_server intentionally.
228 if (firstPidTempDumpCopied && primaryPid != ActivityManagerService.MY_PID) {
229 firstPidEnd = new File(tracesFile).length();
230 }
231 // Append the Durations/latency comma separated array after the first PID.
232 if (firstPidTempDumpCopied && latencyTracker != null) {
233 appendtoANRFile(tracesFile,
234 latencyTracker.dumpAsCommaSeparatedArrayWithHeader());
235 }
236 }
237 // Next collect all of the stacks of the most important pids.
238 if (firstPids != null) {
239 if (latencyTracker != null) {
240 latencyTracker.dumpingFirstPidsStarted();
241 }
242
243 int num = firstPids.size();
// firstPidTempDumpCopied 为true,从第二个pid开始dump
244 for (int i = firstPidTempDumpCopied ? 1 : 0; i < num; i++) {
245 final int pid = firstPids.get(i);
246 // We don't copy ANR traces from the system_server intentionally.
// firstPid 表示:当前是第一个 pid(i == 0),并且该 pid 不是 system_server(MY_PID)
247 final boolean firstPid = i == 0 && ActivityManagerService.MY_PID != pid;
// 会打印下列的pid 号
248 Slog.i(TAG, "Collecting stacks for pid " + pid);
// 遍历 firstPids 中的 pid,去 dump java 栈
249 final long timeTaken = dumpJavaTracesTombstoned(pid, tracesFile, remainingTime,
250 latencyTracker);
251 remainingTime -= timeTaken;
252 if (remainingTime <= 0) {
253 Slog.e(TAG, "Aborting stack trace dump (current firstPid=" + pid
254 + "); deadline exceeded.");
255 return firstPidEnd;
256 }
257
258 if (firstPid) {
259 firstPidEnd = new File(tracesFile).length();
260 // Full latency dump
261 if (latencyTracker != null) {
262 appendtoANRFile(tracesFile,
263 latencyTracker.dumpAsCommaSeparatedArrayWithHeader());
264 }
265 }
266 if (DEBUG_ANR) {
267 Slog.d(TAG, "Done with pid " + firstPids.get(i) + " in " + timeTaken + "ms");
268 }
269 }
270 if (latencyTracker != null) {
271 latencyTracker.dumpingFirstPidsEnded();
272 }
273 }
274
275 // Next collect the stacks of the native pids
276 ArrayList<Integer> nativePids = collectPids(nativePidsFuture, "native pids");
277
// 开始dump native进程
278 Slog.i(TAG, "dumpStackTraces nativepids=" + nativePids);
279
280 if (nativePids != null) {
281 if (latencyTracker != null) {
282 latencyTracker.dumpingNativePidsStarted();
283 }
284 for (int pid : nativePids) {
285 Slog.i(TAG, "Collecting stacks for native pid " + pid);
// nativeDumpTimeoutMs native进程最多时长为 2秒
286 final long nativeDumpTimeoutMs = Math.min(NATIVE_DUMP_TIMEOUT_MS, remainingTime);
287
288 if (latencyTracker != null) {
289 latencyTracker.dumpingPidStarted(pid);
290 }
291 final long start = SystemClock.elapsedRealtime();
// 通过下列方法dumpNativeBacktraceToFileTimeout
292 Debug.dumpNativeBacktraceToFileTimeout(
293 pid, tracesFile, (int) (nativeDumpTimeoutMs / 1000));
294 final long timeTaken = SystemClock.elapsedRealtime() - start;
295 if (latencyTracker != null) {
296 latencyTracker.dumpingPidEnded();
297 }
298 remainingTime -= timeTaken;
299 if (remainingTime <= 0) {
300 Slog.e(TAG, "Aborting stack trace dump (current native pid=" + pid
301 + "); deadline exceeded.");
302 return firstPidEnd;
303 }
304
305 if (DEBUG_ANR) {
306 Slog.d(TAG, "Done with native pid " + pid + " in " + timeTaken + "ms");
307 }
308 }
309 if (latencyTracker != null) {
310 latencyTracker.dumpingNativePidsEnded();
311 }
312 }
313
314 // Lastly, dump stacks for all extra PIDs from the CPU tracker.
315 ArrayList<Integer> extraPids = collectPids(extraPidsFuture, "extra pids");
316
317 if (extraPidsFuture != null) {
318 try {
319 extraPids = extraPidsFuture.get();
320 } catch (ExecutionException e) {
321 Slog.w(TAG, "Failed to collect extra pids", e.getCause());
322 } catch (InterruptedException e) {
323 Slog.w(TAG, "Interrupted while collecting extra pids", e);
324 }
325 }
326 Slog.i(TAG, "dumpStackTraces extraPids=" + extraPids);
327
328 if (extraPids != null) {
329 if (latencyTracker != null) {
330 latencyTracker.dumpingExtraPidsStarted();
331 }
332 for (int pid : extraPids) {
333 Slog.i(TAG, "Collecting stacks for extra pid " + pid);
334 final long timeTaken = dumpJavaTracesTombstoned(pid, tracesFile, remainingTime,
335 latencyTracker);
336 remainingTime -= timeTaken;
337 if (remainingTime <= 0) {
338 Slog.e(TAG, "Aborting stack trace dump (current extra pid=" + pid
339 + "); deadline exceeded.");
340 return firstPidEnd;
341 }
342
343 if (DEBUG_ANR) {
344 Slog.d(TAG, "Done with extra pid " + pid + " in " + timeTaken + "ms");
345 }
346 }
347 if (latencyTracker != null) {
348 latencyTracker.dumpingExtraPidsEnded();
349 }
350 }
351 // Append the dumping footer with the current uptime
352 appendtoANRFile(tracesFile, "----- dumping ended at " + SystemClock.uptimeMillis() + "\n");
353 Slog.i(TAG, "Done dumping");
354
355 return firstPidEnd;
356 }
// 1-3-4)如果是满足issilentanr ,则会去kill 进程
/frameworks/base/services/core/java/com/android/server/am/ProcessErrorStateRecord.java
628 synchronized (mService) {
629 // mBatteryStatsService can be null if the AMS is constructed with injector only. This
630 // will only happen in tests.
631 if (mService.mBatteryStatsService != null) {
632 mService.mBatteryStatsService.noteProcessAnr(mApp.processName, mApp.uid);
633 }
634
635 if (isSilentAnr() && !mApp.isDebugging()) {
// 这里去kill 进程了,然后return 出去了
636 mApp.killLocked("bg anr", ApplicationExitInfo.REASON_ANR, true);
637 return;
638 }
// 1-3-5)显示anr 弹框
如果满足 isSilentAnr,进程在上面已被 kill 并 return,不会弹框;如果不满足 isSilentAnr,则会继续往下走去弹框
647 // mUiHandler can be null if the AMS is constructed with injector only. This will only
648 // happen in tests.
649 if (mService.mUiHandler != null) {
650 // Bring up the infamous App Not Responding dialog
651 Message msg = Message.obtain();
652 msg.what = ActivityManagerService.SHOW_NOT_RESPONDING_UI_MSG;
653 msg.obj = new AppNotRespondingDialog.Data(mApp, aInfo, aboveSystem,
654 isContinuousAnr);
655
656 mService.mUiHandler.sendMessageDelayed(msg, anrDialogDelayMs);
657 }
658 }
659 }
不是所有 ANR 都会 kill 进程:只有满足 isSilentAnr(没有界面、没有前台 service、不是 systemui)时,才会直接 kill 进程,并打印 bg anr
/frameworks/base/services/core/java/com/android/server/am/ActivityManagerService.java
1714 final class UiHandler extends Handler {
1715 public UiHandler() {
1716 super(com.android.server.UiThread.get().getLooper(), null, true);
1717 }
1718
1719 @Override
1720 public void handleMessage(Message msg) {
1721 switch (msg.what) {
1722 case SHOW_ERROR_UI_MSG: {
1723 mAppErrors.handleShowAppErrorUi(msg);
1724 ensureBootCompleted();
1725 } break;
1726 case SHOW_NOT_RESPONDING_UI_MSG: {
1727 mAppErrors.handleShowAnrUi(msg);
1728 ensureBootCompleted();
1729 } break;
/frameworks/base/services/core/java/com/android/server/am/AppErrors.java
1084 void handleShowAnrUi(Message msg) {
1085 List<VersionedPackage> packageList = null;
1086 boolean doKill = false;
1087 AppNotRespondingDialog.Data data = (AppNotRespondingDialog.Data) msg.obj;
1088 final ProcessRecord proc = data.proc;
1089 if (proc == null) {
1090 Slog.e(TAG, "handleShowAnrUi: proc is null");
1091 return;
1092 }
1093 synchronized (mProcLock) {
1094 final ProcessErrorStateRecord errState = proc.mErrorState;
1095 errState.setAnrData(data);
1096 if (!proc.isPersistent()) {
1097 packageList = proc.getPackageListWithVersionCode();
1098 }
...
1105
1106 boolean showBackground = Settings.Secure.getIntForUser(mContext.getContentResolver(),
1107 Settings.Secure.ANR_SHOW_BACKGROUND, 0,
1108 mService.mUserController.getCurrentUserId()) != 0;
// canShowErrorDialogs 表示是否可以显示 anr 的弹框,一般是 true 的
1109 if (mService.mAtmInternal.canShowErrorDialogs() || showBackground) {
1110 AnrController anrController = errState.getDialogController().getAnrController();
1111 if (anrController == null) {
// 调用下列方法去显示anr dialog
1112 errState.getDialogController().showAnrDialogs(data);
1113 } else {
1114 String packageName = proc.info.packageName;
.....
1130 } else {
1131 MetricsLogger.action(mContext, MetricsProto.MetricsEvent.ACTION_APP_ANR,
1132 AppNotRespondingDialog.CANT_SHOW);
1133 // Just kill the app if there is no dialog to be shown.
1134 doKill = true;
1135 }
1136 }
// 如果不满足显示anr dialog,则会去kill 进程
1137 if (doKill) {
1138 mService.killAppAtUsersRequest(proc);
1139 }
1140 // Notify PackageWatchdog without the lock held
1141 if (packageList != null) {
1142 mPackageWatchdog.onPackageFailure(packageList,
1143 PackageWatchdog.FAILURE_REASON_APP_NOT_RESPONDING);
1144 }
1145 }
/frameworks/base/services/core/java/com/android/server/am/ErrorDialogController.java
187 @GuardedBy("mProcLock")
188 void showAnrDialogs(AppNotRespondingDialog.Data data) {
189 List<Context> contexts = getDisplayContexts(
190 mApp.mErrorState.isSilentAnr() /* lastUsedOnly */);
191 mAnrDialogs = new ArrayList<>();
192 for (int i = contexts.size() - 1; i >= 0; i--) {
193 final Context c = contexts.get(i);
// 增加 AppNotRespondingDialog 到 mAnrDialogs 中
194 mAnrDialogs.add(new AppNotRespondingDialog(mService, c, data));
195 }
// 遍历所有的dialog 去显示dialog
196 scheduleForAllDialogs(mAnrDialogs, Dialog::show);
197 }
/frameworks/base/services/core/java/com/android/server/am/AppNotRespondingDialog.java
// 创建对应的dialog 显示
103 @Override
104 protected void onCreate(Bundle savedInstanceState) {
105 super.onCreate(savedInstanceState);
106 final FrameLayout frame = findViewById(android.R.id.custom);
107 final Context context = getContext();
108 LayoutInflater.from(context).inflate(
109 com.android.internal.R.layout.app_anr_dialog, frame, true);
110
111 final TextView report = findViewById(com.android.internal.R.id.aerr_report);
112 report.setOnClickListener(this);
113 final boolean hasReceiver = mProc.mErrorState.getErrorReportReceiver() != null;
114 report.setVisibility(hasReceiver ? View.VISIBLE : View.GONE);
115 final TextView close = findViewById(com.android.internal.R.id.aerr_close);
116 close.setOnClickListener(this);
117 final TextView wait = findViewById(com.android.internal.R.id.aerr_wait);
118 wait.setOnClickListener(this);
119
120 findViewById(com.android.internal.R.id.customPanel).setVisibility(View.VISIBLE);
121 }
// 点击事件的逻辑:
123 @Override
124 public void onClick(View v) {
125 switch (v.getId()) {
126 case com.android.internal.R.id.aerr_report:
127 mHandler.obtainMessage(WAIT_AND_REPORT).sendToTarget();
128 break;
129 case com.android.internal.R.id.aerr_close:
130 mHandler.obtainMessage(FORCE_CLOSE).sendToTarget();
131 break;
132 case com.android.internal.R.id.aerr_wait:
133 mHandler.obtainMessage(WAIT).sendToTarget();
134 break;
135 default:
136 break;
137 }
138 }
140 private final Handler mHandler = new Handler() {
141 public void handleMessage(Message msg) {
142 Intent appErrorIntent = null;
143
144 MetricsLogger.action(getContext(), MetricsProto.MetricsEvent.ACTION_APP_ANR,
145 msg.what);
146
147 switch (msg.what) {
// 如果是强制退出,则会去kill anr 进程
148 case FORCE_CLOSE:
149 // Kill the application.
150 mService.killAppAtUsersRequest(mProc);
151 break;
// 如果是等待,则通过 setNotResponding 将未响应状态置为 false
152 case WAIT_AND_REPORT:
153 case WAIT:
154 // Continue waiting for the application.
155 synchronized (mService) {
156 ProcessRecord app = mProc;
157 final ProcessErrorStateRecord errState = app.mErrorState;
158
159 if (msg.what == WAIT_AND_REPORT) {
160 appErrorIntent = mService.mAppErrors.createAppErrorIntentLOSP(app,
161 System.currentTimeMillis(), null);
162 }
163
164 synchronized (mService.mProcLock) {
165 errState.setNotResponding(false);
166 // We're not clearing the ANR report here, in case we'd need to report
167 // it again when the ANR dialog shows again.
168 // errState.setNotRespondingReport(null);
169 errState.getDialogController().clearAnrDialogs();
170 }
171 mService.mServices.scheduleServiceTimeoutLocked(app);
172 if (mData.isContinuousAnr) {
173 // If the app remains unresponsive, show the dialog again after a delay.
174 mService.mInternal.rescheduleAnrDialog(mData);
175 }
176 }
177 break;
178 }
179
180 if (appErrorIntent != null) {
181 try {
182 getContext().startActivity(appErrorIntent);
183 } catch (ActivityNotFoundException e) {
184 Slog.w(TAG, "bug report receiver dissappeared", e);
185 }
186 }
187
188 dismiss();
189 }
190 };
二、ams 的dump 信息
// anr in
S00E75D 08-13 08:12:18.901 1167 5209 E ActivityManager: ANR in com.android.phone
S00E75D 08-13 08:12:18.901 1167 5209 E ActivityManager: PID: 4607
S00E75D 08-13 08:12:18.901 1167 5209 E ActivityManager: Reason: executing service com.android.phone/.TelephonyDebugService
S00E75D 08-13 08:12:18.901 1167 5209 E ActivityManager: ErrorId: 2dbe8520-e3d7-437b-924c-1b3e7d22e818
// 是否被冻结 Frozen
S00E75D 08-13 08:12:18.901 1167 5209 E ActivityManager: Frozen: false
S00E75D 08-13 08:12:18.901 1167 5209 E ActivityManager: Load: 0.0 / 0.0 / 0.0
// 打印出memory
S00E75D 08-13 08:12:18.901 1167 5209 E ActivityManager: ----- Output from /proc/pressure/memory -----
S00E75D 08-13 08:12:18.901 1167 5209 E ActivityManager: some avg10=2.23 avg60=3.14 avg300=1.30 total=5403081
S00E75D 08-13 08:12:18.901 1167 5209 E ActivityManager: full avg10=0.43 avg60=0.93 avg300=0.42 total=2152537
S00E75D 08-13 08:12:18.901 1167 5209 E ActivityManager: ----- End output from /proc/pressure/memory -----
// 打印出cpu 使用
S00E75D 08-13 08:12:18.901 1167 5209 E ActivityManager: ----- Output from /proc/pressure/cpu -----
S00E75D 08-13 08:12:18.901 1167 5209 E ActivityManager: some avg10=89.93 avg60=74.57 avg300=29.49 total=109557494
S00E75D 08-13 08:12:18.901 1167 5209 E ActivityManager: full avg10=0.00 avg60=0.00 avg300=0.00 total=0
S00E75D 08-13 08:12:18.901 1167 5209 E ActivityManager: ----- End output from /proc/pressure/cpu -----
//打印出io 压力
S00E75D 08-13 08:12:18.901 1167 5209 E ActivityManager: ----- Output from /proc/pressure/io -----
S00E75D 08-13 08:12:18.901 1167 5209 E ActivityManager: some avg10=11.15 avg60=5.55 avg300=2.14 total=9347178
S00E75D 08-13 08:12:18.901 1167 5209 E ActivityManager: full avg10=0.00 avg60=0.11 avg300=0.19 total=1706551
S00E75D 08-13 08:12:18.901 1167 5209 E ActivityManager: ----- End output from /proc/pressure/io -----
// cpu 的使用率
S00E75D 08-13 08:12:18.901 1167 5209 E ActivityManager:
S00E75D 08-13 08:12:18.901 1167 5209 E ActivityManager: CPU usage from 24122ms to -651ms ago (2024-08-13 08:11:53.438 to 2024-08-13 08:12:18.211):
// 各个进程占用的CPU的详细情况
S00E75D 08-13 08:12:18.901 1167 5209 E ActivityManager: 52% 689/surfaceflinger: 27% user + 25% kernel / faults: 7946 minor
S00E75D 08-13 08:12:18.901 1167 5209 E ActivityManager: 46% 1167/system_server: 35% user + 11% kernel / faults: 12735 minor 34 major
S00E75D 08-13 08:12:18.901 1167 5209 E ActivityManager: 34% 3569/com.google.android.setupwizard: 23% user + 11% kernel / faults: 10880 minor 66 major
S00E75D 08-13 08:12:18.901 1167 5209 E ActivityManager: 24% 2240/com.google.android.gms: 21% user + 3.3% kernel / faults: 17595 minor 90 major
S00E75D 08-13 08:12:18.901 1167 5209 E ActivityManager: 17% 1582/com.android.systemui: 14% user + 2.9% kernel / faults: 21078 minor 67 major
S00E75D 08-13 08:12:18.901 1167 5209 E ActivityManager: 15% 611/android.hardware.graphics.composer@2.4-service: 2.9% user + 12% kernel / faults: 1787 minor
S00E75D 08-13 08:12:18.901 1167 5209 E ActivityManager: 13% 1983/com.google.android.gms.persistent: 11% user + 1.8% kernel / faults: 13063 minor 114 major
S00E75D 08-13 08:12:18.901 1167 5209 E ActivityManager: 5.6% 63/kworker/u8:3-loop10: 0% user + 5.6% kernel
S00E75D 08-13 08:12:18.901 1167 5209 E ActivityManager: 4.9% 4524/com.google.android.dialer: 3.8% user + 1% kernel / faults: 6492 minor 5 major
S00E75D 08-13 08:12:18.901 1167 5209 E ActivityManager: 4.6% 210/logd: 1.7% user + 2.8% kernel / faults: 3381 minor 4 major
S00E75D 08-13 08:12:18.901 1167 5209 E ActivityManager: 100% TOTAL: 50% user + 48% kernel + 1.6% softirq
S00E75D 08-13 08:12:18.901 1167 5209 E ActivityManager: CPU usage from 262368ms to 262368ms ago (1970-01-01 02:00:00.000 to 1970-01-01 02:00:00.000) with 0% awake:
S00E75D 08-13 08:12:18.901 1167 5209 E ActivityManager: 0% TOTAL: 0% user + 0% kernel
名词解释:
a. user:用户态,kernel:内核态
b. faults:内存缺页,minor------轻微的,major------重度,需要从磁盘拿数据
c. iowait:IO使用(等待)占比
d. irq:硬中断,softirq:软中断
- iowait占比很高,意味着有很大可能,是io耗时导致ANR,具体进一步查看有没有进程faults major比较多。
- 单进程CPU的负载并不是以100%为上限,而是有几个核,就有百分之几百,如4核上限为400%。
https://segmentfault.com/a/1190000040142277
三.anr 的dump 信息
tmp_anr的信息会合并到 trace 的anr 的文件中
// 通过 ProcfsMemoryUtil.java 的 readMemorySnapshotFromProcfs 方法
即 cat "/proc/" + pid + "/status", STATUS_KEYS
Subject: Broadcast of Intent { act=android.intent.action.SIM_STATE_CHANGED flg=0x15000010 cmp=com.android.phone/.vvm.VvmSimStateTracker (has extras) }
RssHwmKb: 145712
RssKb: 117552
RssAnonKb: 30596
RssShmemKb: 1364
VmSwapKb: 16416
发送sig_quit 信号给java虚拟机,dump java 调用栈。执行方法SignalCatcher::HandleSigQuit
----- pid 1522 at 2024-08-08 16:56:08.323947793+0800 -----
Cmd line: com.android.phone
Build fingerprint: 'Hisense/HLTE245E/HLTE245E:14/UP1A.231005.007/HLTE245E_240705:user/release-keys'
ABI: 'arm'
// 编译类型,不是debug 类型
Build type: optimized
suspend all histogram: Sum: 24.240ms 99% C.I. 5us-12354.560us Avg: 563.720us Max: 14107us
//表示有 62 个线程
DALVIK THREADS (62):
线程调用栈
"main" prio=5 tid=1 Runnable
| group="main" sCount=0 ucsCount=0 flags=0 obj=0x71c088c8 self=0xb2f7be00
| sysTid=1522 nice=0 cgrp=foreground sched=0/0 handle=0xb330b46c
| state=R schedstat=( 12743884467 27625874696 17544 ) utm=996 stm=277 core=0 HZ=100
| stack=0xbe071000-0xbe073000 stackSize=8188KB
| held mutexes= "mutator lock"(shared held)
at com.android.internal.telephony.uicc.IccUtils.hexCharToInt(IccUtils.java:434)
at com.android.internal.telephony.uicc.IccUtils.hexStringToBytes(IccUtils.java:463)
"Signal Catcher" daemon prio=10 tid=6
第一行:"main" prio=5 tid=1 Runnable
- main:线程名,如果有daemon这个参数,就是守护线程,例如Signal Catcher线程
- prio:Java 线程优先级(取值 1~10),值越大,优先级越高(注意与 nice 值相反)
- tid:线程内部id
- Runnable:线程状态
第二行:group="main" sCount=0 ucsCount=0 flags=0 obj=0x71c088c8 self=0xb2f7be00
- group:线程所属的线程组
- sCount:线程被挂起的次数
- ucsCount:由用户代码(例如调试器)请求的线程挂起次数(旧版本 trace 中对应字段为 dsCount,即用于调试的线程挂起次数)
- obj:当前线程关联的线程java对象
- self:当前线程的地址
第三行:sysTid=1522 nice=0 cgrp=foreground sched=0/0 handle=0xb330b46c
- sysTid:线程真正意义上的tid
- nice:调度优先级, nice的值越小优先级越高,-20的优先级已经很高了 。
- cgrp:进程所属的进程调度组
- sched:调度策略
- handle:线程的 pthread 句柄(pthread_t)地址
第四行:state=R schedstat=( 12743884467 27625874696 17544 ) utm=996 stm=277 core=0 HZ=100
- state:线程状态 Runnable
- schedstat:CPU调度时间统计
- utm/stm:用户态/内核态的CPU时间
- core:该线程最后运行所在的核心
- HZ:时钟频率
第五行:stack=0xbe071000-0xbe073000 stackSize=8188KB
- stack:线程栈的地址区间
- stackSize:栈的大小
第六行:held mutexes= "mutator lock"(shared held)
- held mutexes:持有锁的类型,包含独占锁exclusive和共享锁shared两种
Cpu调度的时间:schedstat=( 12743884467 27625874696 17544 )
括号里有3个值,分别为Running、Runnable、Switch,代表CPU时间片轮转机制中的3个值:
- Running:CPU运行的时间,单位为ns
- Runnable:RQ队列的等待时间,单位为ns
- Switch:CPU调度切换的次数
utm和stm:
- utm:该线程在用户态所执行的时间,单位为jiffies,默认为10ms
- stm:该线程在内核态所执行的时间,单位为jiffies,默认为10ms
所以CPU在内核态和用户态运行的时间为:996 * 10 + 277 * 10 = 12730ms,CPU运行的时间为 12743884467ns,也就是约 12744ms,大致等于utm + stm的时间,也就是schedstat的第一个参数。
所以从线程的调用栈中,我们能得到发生ANR的时候,线程所属的状态,以及当下CPU的一个运转情况,尤其是线程的状态。
线程的状态:
1)Runnable:此状态可能在等待操作系统的其他资源
2)Waiting:当前主线程调用了wait方法,需要等待另一个线程调用notify来唤醒
3)TimedWaiting:调用了带超时时间的 wait 方法(如 wait(timeout)),超时后会自动唤醒;而 Waiting 没有超时时间,可能一直无法被唤醒而一直处于等待状态
4)Native:正在执行JNI本地函数
5)Blocked:当一个线程尝试获取一个对象锁,但是被另外一个线程持有了,该线程为Blocked阻塞状态。当该线程获取到锁,则变为Runnable状态
6)Sleeping:调用了sleep方法
nice调度优先级:
- nice 的取值范围为 -20 到 19 差不多对应了 Android API 的线程优先级的取值范围。
- nice 的值越大,进程的优先级就越低,获得 CPU 调用的机会越少,nice 值越小,进程的优先级则越高,获得 CPU 调用的机会越多。
- 一个 nice 值为 -20 的进程优先级最高,nice 值为 19 的进程优先级最低。
- 父进程 fork 出来的子进程继承了父进程的优先级。
安卓线程优先级(android.os.Process 中的常量,对应的是 nice 值,而不是 trace 中的 prio):
//应用程序线程的标准优先级
public static final int THREAD_PRIORITY_DEFAULT = 0;
//最低可用优先级,仅针对那些真的不想在发生任何其他事情的情况下运行的任务。
public static final int THREAD_PRIORITY_LOWEST = 19;
//标准后台优先级,优先级略低于正常优先级,它对用户界面的影响非常小。
public static final int THREAD_PRIORITY_BACKGROUND = 10;
//用户正在进行交互的 UI 线程优先级。当用户在界面之间进行切换时,系统将自动调整应用线程的优先级。该优先级应用程序不能自己设置(也就是代码中不能设置)。
public static final int THREAD_PRIORITY_FOREGROUND = -2;
//系统显示线程的优先级,该优先级应用也不能自己设置。
public static final int THREAD_PRIORITY_DISPLAY = -4;
//最重要的显示线程的优先级,用于合成屏幕和检索输入事件。应用程序不能更改为此优先级。
public static final int THREAD_PRIORITY_URGENT_DISPLAY = -8;
//视频线程的标准优先级。应用程序不能更改为此优先级。
public static final int THREAD_PRIORITY_VIDEO = -10;
//音频线程的标准优先级。应用程序不能更改为此优先级。
public static final int THREAD_PRIORITY_AUDIO = -16;
//最重要的音频线程的标准优先级。应用程序不能更改为此优先级。
public static final int THREAD_PRIORITY_URGENT_AUDIO = -19;
https://blog.csdn.net/u011578734/article/details/110549238
https://brands.cnblogs.com/tencentcloud/p/6216
四.anr 的问题分析
ANR 问题主要有两种原因:应用自身的问题 和 系统异常导致的问题 。在分析 ANR 问题的时候,最主要的就是要确定是哪个原因导致的
ANR 问题一般的分析步骤如下:
-
分析 trace.txt :查看是否有明显的异常,比如死锁、SystemServer 异常持锁等
- 死锁堆栈 : 观察 Trace 堆栈,确认是否有明显问题,如主线程是否与其他线程发生死锁,如果是进程内部发生了死锁,只需找到与当前线程死锁的线程,问题即可解决
- 业务堆栈 : 观察 Trace 堆栈,发现当前主线程堆栈正在执行业务逻辑,这时候需要分析对应的代码,查看是否真的有问题
- 重要 :如果业务方觉得这里没有问题,需要进一步分析,因为 Trace 堆栈可能并不是真正耗时的地方,需要结合其他信息一起分析
- IPC Block 堆栈 : 观察通过 Trace 堆栈,发现主线程堆栈是在跨进程(Binder)通信,这时候可以根据其他 Log、Binder Info 等信息,来分析 IPC 信息
- 大部分 IPC 都是在跟 SystemServer,如果没有 BinderInfo,可以搜索对应的接口关键字,在 SystemServer 进程查找是否有相关的堆栈
- 重要 :如果业务方觉得这里没有问题,需要进一步分析,因为 Trace 堆栈可能并不是真正耗时的地方,需要结合其他信息一起分析
- 系统堆栈 : 通过观察 Trace,发现当前堆栈只是简单的系统堆栈,比如 NativePollOnce,这时需要搞清楚是否发生过严重耗时,并做进一步的分析定位
- 重要 :大部分比较难分析的 ANR 基本上应用主线程堆栈都是 NativePollOnce 这个状态,之所以出现这种状态,可能有下面几个原因
- 确实没有消息在处理,可能进程被冻结,或者 No Focused Window 这种 ANR
- 刚好处理完某一个耗时消息,系统抓堆栈的时候,已经晚了,耗时的状态没有抓到
- 线程调度的原因,主线程没有机会执行
- 重要 :大部分比较难分析的 ANR 基本上应用主线程堆栈都是 NativePollOnce 这个状态,之所以出现这种状态,可能有下面几个原因
-
分析 Event Log :看具体的 ANR 时间(搜索 am_anr ),看看是否跟 ANR log 能够对上,以确定 ANR Log 是否有效,如果 ANR Log 有效,分析 ANR Log,提取有用信息:pid、tid、死锁等,遇到 ANR 问题,摆在我们面前的 trace 是不是第一案发现场,如果 ANR 发生的输出的信息很多,当时的 CPU 和 I/O 资源比较紧张,那么这段日志输出的时间点可能会延迟 10 秒到 20 秒都有可能,所以我们有时候需要提高警惕。不过正常情况下,EventLog 中的 am_anr 的输出时间是最早的,也是最接近 ANR 时间的 (提取有效信息到单独文件中)
-
分析 Android Log :看 MainLog(Android Log) 或者 SystemLog 查看 ANR 详细信息(搜索 ANR in) ,提取有效的信息 (提取有效信息到单独文件中)
- 发生 ANR 的时间
- 打印 ANR 日志的进程
- 发生 ANR 的进程
- 发生 ANR 的原因
- CPU 负载
- Memory 负载
- CPU 使用统计时间段
- 各进程的 CPU 使用率
- 总的 CPU 使用率
- 缺页次数 fault
- xxx minor 表示高速缓存中的缺页次数,可以理解为进程在做内存访问
- xxx major 表示内存的缺页次数,可以理解为进程在做 IO 操作
- CPU 使用汇总
-
配合 Main Log(Android Log) 和 EventLog 把 CPU 开始和结束的时间点内的所有有用信息提取出来到一个文件中,搜索的主要关键字:pid、进程名、WindowManager、ActivityManager
- 收集关键操作场景,比如解锁、安装应用、亮灭屏、应用启动等
- 收集异常和系统关键 Log
- 系统变慢 :比如 Slow operation、Slow dispatch、Slow delivery、dvm_lock_sample、binder_sample
- 进程变化 :am_kill、am_proc_died、lowmemorykiller、ANR、应用启动关系等
- 系统信息 :cpu info、meminfo、binder info(是否满了) 、iowait (是否过高)
- 消息监控 :ANR 前的 ANR Message 打印,Block Message 信息,应用自己代码执行逻辑推断出的 Message 耗时等
- 收集 ANR 进程的所有关键线程的运行情况、线程优先级等
- 根据第四步提取出来的关键信息文件,进一步理出系统当时的情况、状态(推荐 vscode 或者 notepad++,有线索就全局搜索),比如
- 是处于低内存频繁杀进程?
- 重启第一次解锁系统繁忙
- 还是短时间内多个应用启动系统繁忙
- 还是应用自己的逻辑等待?
- 针对不同的 ANR 类型,提取不同的信息
-
不行就加 Log 复现
ANR 问题的一些实例: