appNotResponding
//com.android.server.am.AppErrors.java/* app: 当前发生ANR的进程* activity: 发生ANR的界面* parent: 发生ANR的界面的上一级界面* aboveSystem: * annotation: 发生ANR的原因*/
final void appNotResponding(ProcessRecord app, ActivityRecord activity,ActivityRecord parent, boolean aboveSystem, final String annotation) {// 构建firstPids和lastPids数组// firstPids: 用于保存ANR进程及其父进程,system_server进程和persistent的进程// lastPids: 用于保存除firstPids外的其他进程ArrayList<Integer> firstPids = new ArrayList<Integer>(5);SparseArray<Boolean> lastPids = new SparseArray<Boolean>(20);if (mService.mController != null) {try {// 0 == continue, -1 = kill process immediatelyint res = mService.mController.appEarlyNotResponding(app.processName, app.pid, annotation);if (res < 0 && app.pid != MY_PID) {app.kill("anr", true);}} catch (RemoteException e) {mService.mController = null;Watchdog.getInstance().setActivityController(null);}}long anrTime = SystemClock.uptimeMillis();if (ActivityManagerService.MONITOR_CPU_USAGE) {// 更新CPU使用信息mService.updateCpuStatsNow();}// Unless configured otherwise, swallow ANRs in background processes & kill the process.// 如果配置了ANR_SHOW_BACKGROUND(anr_show_background)值为非空,则会弹出一个对话框,否则静默killboolean showBackground = Settings.Secure.getInt(mContext.getContentResolver(),Settings.Secure.ANR_SHOW_BACKGROUND, 0) != 0;boolean isSilentANR;// 跳过下面这些场景下的ANRsynchronized (mService) {// PowerManager.reboot() can block for a long time, so ignore ANRs while shutting down.//正在关机的if (mService.mShuttingDown) {Slog.i(TAG, "During shutdown skipping ANR: " + app + " " + annotation);return;//已经有一个ANR弹出框时} else if (app.notResponding) {Slog.i(TAG, "Skipping duplicate ANR: " + app + " " + annotation);return;//正在处理crash的} else if (app.crashing) {Slog.i(TAG, "Crashing app skipping ANR: " + app + " " + annotation);return;//被AMS kill的} else if (app.killedByAm) {Slog.i(TAG, "App already killed by AM skipping ANR: " + app + " " + annotation);return;// 进程已经被kill} else if (app.killed) {Slog.i(TAG, "Skipping died app ANR: " + app + " " + annotation);return;}// In case we come through here for the same app before completing// this one, mark as anring now so we will bail out.//防止同一个进程重复dump信息app.notResponding = true;//1.Log the ANR to the event log. 事件记录到eventlog中EventLog.writeEvent(EventLogTags.AM_ANR, app.userId, app.pid,app.processName, app.info.flags, annotation);// Dump thread traces as quickly as we can, starting with "interesting" processes.// 将当前进程添加到firstPidsfirstPids.add(app.pid);// 2.Don't dump other PIDs if it's a background ANR//showBackground为false(不显示后台ANR的dialog)isSilentANR = !showBackground && !isInterestingForBackgroundTraces(app);if (!isSilentANR) {int parentPid = app.pid;if (parent != null && parent.app != null && parent.app.pid > 0) {parentPid = parent.app.pid;}if (parentPid != app.pid) firstPids.add(parentPid);// MY_PID为system_server的PID,将system_server进程添加到firstPidsif (MY_PID != app.pid && MY_PID != parentPid) firstPids.add(MY_PID);for (int i = mService.mLruProcesses.size() - 1; i >= 0; i--) {ProcessRecord r = mService.mLruProcesses.get(i);if (r != null && r.thread != null) {int pid = r.pid;if (pid > 0 && pid != app.pid && pid != parentPid && pid != MY_PID) {// 将persistent进程添加到firstPidsif (r.persistent) {firstPids.add(pid);if (DEBUG_ANR) Slog.i(TAG, "Adding persistent proc: " + r);} else if (r.treatLikeActivity) {firstPids.add(pid);if (DEBUG_ANR) Slog.i(TAG, "Adding likely IME: " + r);} else {//其他进程添加到lastPidslastPids.put(pid, Boolean.TRUE);if (DEBUG_ANR) Slog.i(TAG, "Adding ANR proc: " + r);}}}}}}// Log the ANR to the main log.// 把ANR信息输出到main logStringBuilder info = new StringBuilder();info.setLength(0);info.append("ANR in ").append(app.processName);if (activity != null && activity.shortComponentName != null) {info.append(" (").append(activity.shortComponentName).append(")");}info.append("\n");info.append("PID: ").append(app.pid).append("\n");if (annotation != null) {info.append("Reason: ").append(annotation).append("\n");}if (parent != null && parent != activity) {info.append("Parent: ").append(parent.shortComponentName).append("\n");}ProcessCpuTracker processCpuTracker = new ProcessCpuTracker(true);// don't dump native PIDs for background ANRs unless it is the process of interestString[] nativeProcs = null;if (isSilentANR) {for (int i = 0; i < NATIVE_STACKS_OF_INTEREST.length; i++) {if (NATIVE_STACKS_OF_INTEREST[i].equals(app.processName)) {nativeProcs = new String[] { app.processName };break;}}} else {nativeProcs = NATIVE_STACKS_OF_INTEREST;}// pids为NATIVE_STACKS_OF_INTEREST中定义的几个进程int[] pids = nativeProcs == null ? null : Process.getPidsForCommands(nativeProcs);ArrayList<Integer> nativePids = null;if (pids != null) {nativePids = new ArrayList<Integer>(pids.length);for (int i : pids) {nativePids.add(i);}}// For background ANRs, don't pass the ProcessCpuTracker to// avoid spending 1/2 second collecting stats to rank lastPids.
// 输出traces信息,后面细说File tracesFile = ActivityManagerService.dumpStackTraces(true, firstPids,(isSilentANR) ? null : processCpuTracker,(isSilentANR) ? null : lastPids,nativePids);String cpuInfo = null;
// cpu信息统计,各个进程的cpu使用情况if (ActivityManagerService.MONITOR_CPU_USAGE) {mService.updateCpuStatsNow();synchronized (mService.mProcessCpuTracker) {// 记录ANR之前的cpu使用情况(CPU usage from 38980ms to 0ms ago)cpuInfo = mService.mProcessCpuTracker.printCurrentState(anrTime);}info.append(processCpuTracker.printCurrentLoad());info.append(cpuInfo);}// 记录从anr时间开始的cpu使用情况(CPU usage from 72ms to 465ms later)info.append(processCpuTracker.printCurrentState(anrTime));// 输出info信息,包含ANR的Reason、CPU负载信息以及使用率Slog.e(TAG, info.toString());if (tracesFile == null) {// There is no trace file, so dump (only) the alleged culprit's threads to the log
// 如果trace为空,则发送singal 3到发送ANR的进程,相当于adb shell kill -3 pidProcess.sendSignal(app.pid, Process.SIGNAL_QUIT);}// 将traces文件和CPU使用率信息保存到dropbox,即data/system/dropboxStatsLog.write(StatsLog.ANR_OCCURRED, app.uid, app.processName,activity == null ? "unknown": activity.shortComponentName, annotation,(app.info != null) ? (app.info.isInstantApp()? StatsLog.ANROCCURRED__IS_INSTANT_APP__TRUE: StatsLog.ANROCCURRED__IS_INSTANT_APP__FALSE): StatsLog.ANROCCURRED__IS_INSTANT_APP__UNAVAILABLE,app != null ? (app.isInterestingToUserLocked()? StatsLog.ANROCCURRED__FOREGROUND_STATE__FOREGROUND: StatsLog.ANROCCURRED__FOREGROUND_STATE__BACKGROUND): StatsLog.ANROCCURRED__FOREGROUND_STATE__UNKNOWN);mService.addErrorToDropBox("anr", app, app.processName, activity, parent, annotation,cpuInfo, tracesFile, null);if (mService.mController != null) {try {// 0 == show dialog, 1 = keep waiting, -1 = kill process immediatelyint res = mService.mController.appNotResponding(app.processName, app.pid, info.toString());if (res != 0) {if (res < 0 && app.pid != MY_PID) {app.kill("anr", true);} else {synchronized (mService) {mService.mServices.scheduleServiceTimeoutLocked(app);}}return;}} catch (RemoteException e) {mService.mController = null;Watchdog.getInstance().setActivityController(null);}}synchronized (mService) {mService.mBatteryStatsService.noteProcessAnr(app.processName, app.uid);
// 后台进程ANR,则直接killif (isSilentANR) {app.kill("bg anr", true);return;}// Set the app's notResponding state, and look up the errorReportReceivermakeAppNotRespondingLocked(app,activity != null ? activity.shortComponentName : null,annotation != null ? "ANR " + annotation : "ANR",info.toString());// Bring up the infamous App Not Responding dialogMessage msg = Message.obtain();msg.what = ActivityManagerService.SHOW_NOT_RESPONDING_UI_MSG;msg.obj = new AppNotRespondingDialog.Data(app, activity, aboveSystem);mService.mUiHandler.sendMessage(msg);}
}
- 输出ANR Reason信息到Events log中,关键字是am_anr,一般都是用这个信息来确定Anr的发生时间
- 如果是后台的anr,即isSilentANR值为true,那么不显示ANR对话框,直接kill掉ANR所在进程
- 输出ANR 信息到main log中,关键字是ANR in,含有Cpu负载信息,一般都是用这个看ANR发生时候的其他进程的资源占用情况
- 调用dumpStackTraces去生成trace文件,这个trace文件有哪些进程呢?
- firstPids队列:第一个是ANR进程,第二个是system_server,剩余是所有persistent进程;
- Native队列:是指/system/bin/目录的mediaserver,sdcard 以及surfaceflinger进程; NATIVE_STACKS_OF_INTEREST
- lastPids队列: 是指mLruProcesses中的不属于firstPids的所有进程。
- 将traces文件和 CPU使用情况信息保存到dropbox,即data/system/dropbox目录
- 弹窗告知用户
- am_anr关键字比ANR in要先打印,所以更接近ANR的实际发生时间
dumpStackTraces
//com.android.server.am.ActivityManagerService.java
public static File dumpStackTraces(boolean clearTraces, ArrayList<Integer> firstPids,ProcessCpuTracker processCpuTracker, SparseArray<Boolean> lastPids,ArrayList<Integer> nativePids) {ArrayList<Integer> extraPids = null;// Measure CPU usage as soon as we're called in order to get a realistic sampling// of the top users at the time of the request.// 一旦调用此方法就马上计算CPU使用率以在请求的时候获取top用户的实际采样if (processCpuTracker != null) {processCpuTracker.init();try {Thread.sleep(200);} catch (InterruptedException ignored) {}// 测量CPU使用情况processCpuTracker.update();// We'll take the stack crawls of just the top apps using CPU.final int N = processCpuTracker.countWorkingStats();extraPids = new ArrayList<>();// 从lastPids中选取CPU使用率top 5的进程,晚些时机输出这些进程的stacksfor (int i = 0; i < N && extraPids.size() < 5; i++) {ProcessCpuTracker.Stats stats = processCpuTracker.getWorkingStats(i);if (lastPids.indexOfKey(stats.pid) >= 0) {if (DEBUG_ANR) Slog.d(TAG, "Collecting stacks for extra pid " + stats.pid);extraPids.add(stats.pid);} else if (DEBUG_ANR) {Slog.d(TAG, "Skipping next CPU consuming process, not a java proc: "+ stats.pid);}}}boolean useTombstonedForJavaTraces = false;File tracesFile;//如果dalvik.vm.stack-trace-dir没有配置,trace就写到全局文件data/anr/traces.txt中final String tracesDirProp = SystemProperties.get("dalvik.vm.stack-trace-dir", "");if (tracesDirProp.isEmpty()) {// When dalvik.vm.stack-trace-dir is not set, we are using the "old" trace// dumping scheme. All traces are written to a global trace file (usually// "/data/anr/traces.txt") so the code below must take care to unlink and recreate// the file if requested.//// This mode of operation will be removed in the near future.String globalTracesPath = SystemProperties.get("dalvik.vm.stack-trace-file", null);if (globalTracesPath.isEmpty()) {Slog.w(TAG, "dumpStackTraces: no trace path configured");return null;}tracesFile = new File(globalTracesPath);try {//删除旧的,创建新的if (clearTraces && tracesFile.exists()) {tracesFile.delete();}tracesFile.createNewFile();//注意这里的权限设置 8.0以前的版本是没有的FileUtils.setPermissions(globalTracesPath, 0666, -1, -1); // -rw-rw-rw-} catch (IOException e) {Slog.w(TAG, "Unable to prepare ANR traces file: " + tracesFile, e);return null;}} else {File tracesDir = new File(tracesDirProp);// When dalvik.vm.stack-trace-dir is set, we use the "new" trace dumping scheme.// Each set of ANR traces is written to a separate file and dumpstate will process// all such files and add them to a captured bug report if they're recent enough.maybePruneOldTraces(tracesDir);// NOTE: We should consider creating the file in native code atomically once we've// gotten rid of the old scheme of dumping and lot of the code that deals with paths// can be removed.//以当前时间命名的trace文件 yyyy-MM-dd-HH-mm-ss-SSStracesFile = createAnrDumpFile(tracesDir);if (tracesFile == null) {return null;}useTombstonedForJavaTraces = true;}dumpStackTraces(tracesFile.getAbsolutePath(), firstPids, nativePids, extraPids,useTombstonedForJavaTraces);return tracesFile;
}
总结一下上面代码,这段代码并没有真正的执行dump操作,只是确定了trace文件路径,如果”dalvik.vm.stack-trace-dir”这个属性没有配置,就使用旧的dump策略,trace信息写入到全局trace文件/data/anr/traces.txt中,删除旧的traces文件,创建新的traces文件;如果”dalvik.vm.stack-trace-dir”属性被配置了,就创建格式为anr_yyyy-MM-dd-HH-mm-ss-SSS的文件。
private static void dumpStackTraces(String tracesFile, ArrayList<Integer> firstPids,ArrayList<Integer> nativePids, ArrayList<Integer> extraPids,boolean useTombstonedForJavaTraces) {// We don't need any sort of inotify based monitoring when we're dumping traces via// tombstoned. Data is piped to an "intercept" FD installed in tombstoned so we're in full// control of all writes to the file in question.final DumpStackFileObserver observer;if (useTombstonedForJavaTraces) {observer = null;} else {// Use a FileObserver to detect when traces finish writing.// The order of traces is considered important to maintain for legibility.observer = new DumpStackFileObserver(tracesFile);}// We must complete all stack dumps within 20 seconds.// 所有的dump操作需要在20秒之内完成long remainingTime = 20 * 1000;try {if (observer != null) {observer.startWatching();}// 第一步收集firstPids中进程的trace,超时会自动放弃// First collect all of the stacks of the most important pids.if (firstPids != null) {int num = firstPids.size();for (int i = 0; i < num; i++) {if (DEBUG_ANR) Slog.d(TAG, "Collecting stacks for pid "+ firstPids.get(i));final long timeTaken;if (useTombstonedForJavaTraces) {timeTaken = dumpJavaTracesTombstoned(firstPids.get(i), tracesFile, remainingTime);} else {timeTaken = observer.dumpWithTimeout(firstPids.get(i), remainingTime);}remainingTime -= timeTaken;if (remainingTime <= 0) {Slog.e(TAG, "Aborting stack trace dump (current firstPid=" + firstPids.get(i) +"); deadline exceeded.");return;}if (DEBUG_ANR) {Slog.d(TAG, "Done with pid " + firstPids.get(i) + " in " + timeTaken + "ms");}}}// 第二步收集nativePids中进程的trace,超时会自动放弃// Next collect the stacks of the native pidsif (nativePids != null) {for (int pid : nativePids) {if (DEBUG_ANR) Slog.d(TAG, "Collecting stacks for native pid " + pid);final long nativeDumpTimeoutMs = Math.min(NATIVE_DUMP_TIMEOUT_MS, remainingTime);final long start = SystemClock.elapsedRealtime();Debug.dumpNativeBacktraceToFileTimeout(pid, tracesFile, (int) (nativeDumpTimeoutMs / 1000));final long timeTaken = SystemClock.elapsedRealtime() - start;remainingTime -= timeTaken;if (remainingTime <= 0) {Slog.e(TAG, "Aborting stack trace dump (current native pid=" + pid +"); deadline exceeded.");return;}if (DEBUG_ANR) {Slog.d(TAG, "Done with native pid " + pid + " in " + timeTaken + "ms");}}}//第三步收集extraPids中进程的trace,超时会自动放弃,这个代码和第一步一样的// Lastly, dump stacks for all extra PIDs from the CPU tracker.if (extraPids != null) {for (int pid : extraPids) {if (DEBUG_ANR) Slog.d(TAG, "Collecting stacks for extra pid " + pid);final long timeTaken;if (useTombstonedForJavaTraces) {timeTaken = dumpJavaTracesTombstoned(pid, tracesFile, remainingTime);} else {timeTaken = observer.dumpWithTimeout(pid, remainingTime);}remainingTime -= timeTaken;if (remainingTime <= 0) {Slog.e(TAG, "Aborting stack trace dump (current extra pid=" + pid +"); deadline exceeded.");return;}if (DEBUG_ANR) {Slog.d(TAG, "Done with extra pid " + pid + " in " + timeTaken + "ms");}}}} finally {if (observer != null) {observer.stopWatching();}}
}
总结下以上逻辑,firstPids和extraPids中的进程dump trace有两种方式,看useTombstonedForJavaTraces的取值;
如果useTombstonedForJavaTraces为真,采用dumpJavaTracesTombstoned获取trace
如果useTombstonedForJavaTraces为假,采用DumpStackFileObserver#dumpWithTimeout获取trace
对于nativePids中的进程,只能通过 Debug.dumpNativeBacktraceToFileTimeout来获取trace。
1. 利用am_anr来确定ANR的发生时间真的准确吗?
不一定准确,可能由于系统资源比较紧张造成一定程度的滞后,十多秒都可能,此时需要结合eventlog进一步分析。
2. ANR中经常出现没有trace,或者无效trace是怎么回事?
从appNotResponding方法中,我们看到有不少return的地方,在dumpStackTraces方法,也会有return出现,比如这么多进程的trace需要在20秒之内完成,在系统资源紧张的时候,能不能完成,是非常没有保障的;
3.有一些ANR问题需要binder对端的trace,通过上面的分析看,大概率是没有对端的trace的,除非binder call到system_server或者比较重要的几个native进程里面;
4.发生ANR的时候,线程并没有暂停掉,有可能等到我们去抓这个进程的trace的时候,当前主线程耗时的方法都已经完成,恢复正常了,比如经常出现下面nativePollOnce状态的堆栈。
5. 我们需要通过源码可以发现,当发生的是后台进程ANR,会少收集很多的信息,比如CPU使用率,CPU占比较高的5个应用,NATIVE进程情况。当然同时更不会有弹窗出现,而是直接kill掉进程。但是这些情况也是我们必须要关心的,毕竟进程活着对于应用本身来说才有更多的可能。