微信Android客户端的ANR监控方案

微信公众号,WeMobileDev 2021年7月19日发布的 微信Android客户端的ANR监控方案

该方案的所有代码已经在Matrix(https://github.com/Tencent/matrix)中开源,这篇文章将详细讲解源码实现

当应用发生ANR之后,系统会收集许多进程,来dump堆栈,从而生成ANR Trace文件,收集的第一个,也是一定会被收集到的进程,就是发生ANR的进程,接着系统开始向这些应用进程发送SIGQUIT信号,应用进程收到SIGQUIT后开始dump堆栈。来简单画个示意图:

1.SignalAnrTracer onAlive方法里调用nativeInitSignalAnrDetective方法监听SIGQUIT信号

public class SignalAnrTracer extends Tracer {//region 参数private static final String TAG = "SignalAnrTracer";//检测anr线程名字//监控到SIGQUIT后,我们在20秒内(20秒是ANR dump的timeout时间)不断轮询自己是否有NOT_RESPONDING flag//一旦发现有这个flag,那么马上就可以认定发生了一次ANR。private static final String CHECK_ANR_STATE_THREAD_NAME = "Check-ANR-State-Thread";//检测NOT_RESPONDING flag间隔时间private static final int CHECK_ERROR_STATE_INTERVAL = 500;//dump最长时间20sprivate static final int ANR_DUMP_MAX_TIME = 20000;//检测error次数private static final int CHECK_ERROR_STATE_COUNT =ANR_DUMP_MAX_TIME / CHECK_ERROR_STATE_INTERVAL;//前台消息,超时2s的时候,说明卡住了private static final long FOREGROUND_MSG_THRESHOLD = -2000;//后台消息,超时2s的时候,说明卡住了private static final long BACKGROUND_MSG_THRESHOLD = -10000;//是否hasInstancepublic static boolean hasInstance = false;//是否是前台状态private static boolean currentForeground = false;//anr trace 文件路径private static String sAnrTraceFilePath = "";//    这个Hook Trace的方案,不仅仅可以用来查ANR问题,任何时候我们都可以手动向自己发送一个SIGQUIT信号,
//    从而hook到当时的Trace。Trace的内容对于我们排查线程死锁,线程异常,耗电等问题都非常有帮助。//打印trace 文件路径 ,自己触发的private static String sPrintTraceFilePath = "";//监听private static SignalAnrDetectedListener sSignalAnrDetectedListener;//sApplicationprivate static Application sApplication;//是否初始化了private static boolean hasInit = false;//anr发生时间,负值private static long anrMessageWhen = 0L;//anr发生时主线程处理的消息private static String anrMessageString = "";//endregionstatic {//加载trace-canary libSystem.loadLibrary("trace-canary");}//region 构造函数public SignalAnrTracer(TraceConfig traceConfig) {hasInstance = true;sAnrTraceFilePath = traceConfig.anrTraceFilePath;sPrintTraceFilePath = traceConfig.printTraceFilePath;}public SignalAnrTracer(Application application) {hasInstance = true;sApplication = application;}public SignalAnrTracer(Application application, String anrTraceFilePath, String printTraceFilePath) {hasInstance = true;sAnrTraceFilePath = anrTraceFilePath;sPrintTraceFilePath = printTraceFilePath;sApplication = application;}//endregion/*** AnrDumper.cc里 handleSignal*/@RequiresApi(api = Build.VERSION_CODES.M)@Keepprivate static void onANRDumped() {//是否是前台currentForeground = AppForegroundUtil.isInterestingToUser();//是否是主线程堵塞了,需要reportboolean needReport = isMainThreadBlocked();//有两种情况,主线程消息已经堵住了,或者开启一个线程检测状态 NOT_RESPONDING//需要reportif (needReport) {report(false);} else {
//            监控到SIGQUIT后,我们在20秒内(20秒是ANR dump的timeout时间)不断轮询自己是否有NOT_RESPONDING flag
//            ,一旦发现有这个flag,那么马上就可以认定发生了一次ANR。new Thread(new Runnable() {@Overridepublic void run() {//开启了一个线程检查checkErrorStateCycle();}}, CHECK_ANR_STATE_THREAD_NAME).start();}}@Keepprivate static void onANRDumpTrace() {try {MatrixUtil.printFileByLine(TAG, sAnrTraceFilePath);} catch (Throwable t) {MatrixLog.e(TAG, "onANRDumpTrace error: %s", t.getMessage());}}//endregion@Keepprivate static void onPrintTrace() {try {MatrixUtil.printFileByLine(TAG, sPrintTraceFilePath);} catch (Throwable t) {MatrixLog.e(TAG, "onPrintTrace error: %s", t.getMessage());}}/*** @param fromProcessErrorState false代表主线程阻塞了*/private static void report(boolean fromProcessErrorState) {try {String stackTrace = Utils.getMainThreadJavaStackTrace();if (sSignalAnrDetectedListener != null) {sSignalAnrDetectedListener.onAnrDetected(stackTrace, anrMessageString, anrMessageWhen, fromProcessErrorState);return;}TracePlugin plugin = Matrix.with().getPluginByClass(TracePlugin.class);if (null == plugin) {return;}String scene = AppMethodBeat.getVisibleScene();JSONObject jsonObject = new JSONObject();jsonObject = DeviceUtil.getDeviceInfo(jsonObject, Matrix.with().getApplication());jsonObject.put(SharePluginInfo.ISSUE_STACK_TYPE, Constants.Type.SIGNAL_ANR);jsonObject.put(SharePluginInfo.ISSUE_SCENE, scene);jsonObject.put(SharePluginInfo.ISSUE_THREAD_STACK, stackTrace);jsonObject.put(SharePluginInfo.ISSUE_PROCESS_FOREGROUND, currentForeground);Issue issue = new Issue();issue.setTag(SharePluginInfo.TAG_PLUGIN_EVIL_METHOD);issue.setContent(jsonObject);plugin.onDetectIssue(issue);MatrixLog.e(TAG, "happens real ANR : %s ", jsonObject.toString());} catch (JSONException e) {MatrixLog.e(TAG, "[JSONException error: %s", e);}}//通过消息时间,来判断是否到超出阈值@RequiresApi(api = Build.VERSION_CODES.M)private static boolean isMainThreadBlocked() {try {MessageQueue mainQueue = Looper.getMainLooper().getQueue();Field field = mainQueue.getClass().getDeclaredField("mMessages");field.setAccessible(true);final Message mMessage = (Message) 
field.get(mainQueue);if (mMessage != null) {anrMessageString = mMessage.toString();long when = mMessage.getWhen();if (when == 0) {return false;}long time = when - SystemClock.uptimeMillis();anrMessageWhen = time;long timeThreshold = BACKGROUND_MSG_THRESHOLD;if (currentForeground) {timeThreshold = FOREGROUND_MSG_THRESHOLD;}return time < timeThreshold;}} catch (Exception e) {return false;}return false;}private static void checkErrorStateCycle() {int checkErrorStateCount = 0;//开启一个循环检测while (checkErrorStateCount < CHECK_ERROR_STATE_COUNT) {try {checkErrorStateCount++;boolean myAnr = checkErrorState();if (myAnr) {report(true);break;}Thread.sleep(CHECK_ERROR_STATE_INTERVAL);} catch (Throwable t) {MatrixLog.e(TAG, "checkErrorStateCycle error, e : " + t.getMessage());break;}}}//用来判断anr发生了
//    在ANR弹窗前,会执行到makeAppNotRespondingLocked方法中,在这里会给发生ANR进程标记一个NOT_RESPONDING的flag。
//    而这个flag我们可以通过ActivityManager来获取:private static boolean checkErrorState() {try {Application application =sApplication == null ? Matrix.with().getApplication() : sApplication;ActivityManager am = (ActivityManager) application.getSystemService(Context.ACTIVITY_SERVICE);//从ActivityManager 获取ProcessErrorStateInfoList<ActivityManager.ProcessErrorStateInfo> procs = am.getProcessesInErrorState();if (procs == null) return false;for (ActivityManager.ProcessErrorStateInfo proc : procs) {MatrixLog.i(TAG, "[checkErrorState] found Error State proccessName = %s, proc.condition = %d", proc.processName, proc.condition);if (proc.uid != android.os.Process.myUid()&& proc.condition == ActivityManager.ProcessErrorStateInfo.NOT_RESPONDING) {MatrixLog.i(TAG, "maybe received other apps ANR signal");}if (proc.pid != android.os.Process.myPid()) continue;if (proc.condition != ActivityManager.ProcessErrorStateInfo.NOT_RESPONDING) {continue;}//只有是自己进程,并且是NOT_RESPONDING的时候,才返回truereturn true;}return false;} catch (Throwable t) {MatrixLog.e(TAG, "[checkErrorState] error : %s", t.getMessage());}return false;}//okpublic static void printTrace() {if (!hasInstance) {MatrixLog.e(TAG, "SignalAnrTracer has not been initialize");return;}if (sPrintTraceFilePath.equals("")) {MatrixLog.e(TAG, "PrintTraceFilePath has not been set");return;}nativePrintTrace();}private static native void nativeInitSignalAnrDetective(String anrPrintTraceFilePath, String printTraceFilePath);private static native void nativeFreeSignalAnrDetective();private static native void nativePrintTrace();@Overrideprotected void onAlive() {super.onAlive();if (!hasInit) {//调用native方法启动监听nativeInitSignalAnrDetective(sAnrTraceFilePath, sPrintTraceFilePath);//主要用来判断是否是前台AppForegroundUtil.INSTANCE.init();hasInit = true;}}@Overrideprotected void onDead() {super.onDead();//free anr检测nativeFreeSignalAnrDetective();}public void setSignalAnrDetectedListener(SignalAnrDetectedListener listener) {sSignalAnrDetectedListener = listener;}public 
interface SignalAnrDetectedListener {void onAnrDetected(String stackTrace, String mMessageString, long mMessageWhen, boolean fromProcessErrorState);}
}

2.MatrixTracer.cc

2.1 JNI_OnLoad初始化,双向绑定函数

2.2 nativeInitSignalAnrDetective,开启检测,真正检测的地方在AnrDumper.cc

2.3 AnrDumper.cc 里handleSignal里调用MatrixTracer anrDumpCallback ,表示anr可能发生了,通知SignalAnrTracer检测ui线程是否block或者状态为NOT_RESPONDING。并调用hookAnrTraceWrite方法,开启hook,为了找到write trace的点

2.4 my_connect、my_open用于识别Signal Catcher线程开始连接socket(或打开trace文件)的时机,记录该线程id并打上标记,以便捕获其后的第一次write调用

2.5 my_write是我们的write方法


#define PROP_VALUE_MAX  92                      //用于求getApiLevel
#define PROP_SDK_NAME "ro.build.version.sdk"    //用于求getApiLevel
#define HOOK_CONNECT_PATH "/dev/socket/tombstoned_java_trace"   //socket文件地址
#define HOOK_OPEN_PATH "/data/anr/traces.txt"                   //socket文件地址using namespace MatrixTracer;static std::optional<AnrDumper> sAnrDumper; //AnrDumper,是自定义的SignalHandler
static bool isTraceWrite = false;           //isTraceWrite my_connect my_open设置为true,my_write设置为false
static bool fromMyPrintTrace = false;       //fromMyPrintTrace 是否是自己想打的
static bool isHooking = false;              //是否hooking,unHookAnrTraceWrite设置为false
static std::string anrTracePathstring;      //新的anrTracePathstring,系统用的
static std::string printTracePathstring;    //新的printTracePathstring,我自己想打印的时候用的
static int signalCatcherTid;                //signalCatcherTid的线程id//一个结构体,用来保存java层 类,方法地址
static struct StacktraceJNI {jclass AnrDetective;                    //SignalAnrTracerjclass ThreadPriorityDetective;jmethodID AnrDetector_onANRDumped;      //SignalAnrTracer 里的jmethodID AnrDetector_onANRDumpTrace;   //SignalAnrTracer 里的jmethodID AnrDetector_onPrintTrace;     //SignalAnrTracer 里的jmethodID ThreadPriorityDetective_onMainThreadPriorityModified;jmethodID ThreadPriorityDetective_onMainThreadTimerSlackModified;
} gJ;//region MainThreadPriorityModified相关的东西
int (*original_setpriority)(int __which, id_t __who, int __priority);int my_setpriority(int __which, id_t __who, int __priority) {if (__priority <= 0) {return original_setpriority(__which, __who, __priority);}if (__who == 0 && getpid() == gettid()) {JNIEnv *env = JniInvocation::getEnv();env->CallStaticVoidMethod(gJ.ThreadPriorityDetective,gJ.ThreadPriorityDetective_onMainThreadPriorityModified,__priority);} else if (__who == getpid()) {JNIEnv *env = JniInvocation::getEnv();env->CallStaticVoidMethod(gJ.ThreadPriorityDetective,gJ.ThreadPriorityDetective_onMainThreadPriorityModified,__priority);}return original_setpriority(__which, __who, __priority);
}int (*original_prctl)(int option, unsigned long arg2, unsigned long arg3,unsigned long arg4, unsigned long arg5);int my_prctl(int option, unsigned long arg2, unsigned long arg3,unsigned long arg4, unsigned long arg5) {if (option == PR_SET_TIMERSLACK) {if (gettid() == getpid() && arg2 > 50000) {JNIEnv *env = JniInvocation::getEnv();env->CallStaticVoidMethod(gJ.ThreadPriorityDetective,gJ.ThreadPriorityDetective_onMainThreadTimerSlackModified,arg2);}}return original_prctl(option, arg2, arg3, arg4, arg5);
}
//endregion/**** @param content 内容* @param filePath 文件地址*/
void writeAnr(const std::string &content, const std::string &filePath) {//unhook writeunHookAnrTraceWrite();std::stringstream stringStream(content);std::string to;std::ofstream outfile;outfile.open(filePath);outfile << content;
}//region my_connect  original_connect
int (*original_connect)(int __fd, const struct sockaddr *__addr, socklen_t __addr_length);int my_connect(int __fd, const struct sockaddr *__addr, socklen_t __addr_length) {if (__addr != nullptr) {//hook connect方法,检测sockaddr地址是否为HOOK_CONNECT_PATH,表明是signal检测线程if (strcmp(__addr->sa_data, HOOK_CONNECT_PATH) == 0) {//设置signal检测线程idsignalCatcherTid = gettid();//标记开始打印isTraceWrite = true;}}return original_connect(__fd, __addr, __addr_length);
}
//endregion
//region my_open / original_open
int (*original_open)(const char *pathname, int flags, mode_t mode);int my_open(const char *pathname, int flags, mode_t mode) {if (pathname != nullptr) {//hook connect方法,检测sockaddr地址是否为HOOK_OPEN_PATH,表明是signal检测线程if (strcmp(pathname, HOOK_OPEN_PATH) == 0) {//设置signal检测线程idsignalCatcherTid = gettid();//标记开始打印isTraceWrite = true;}}return original_open(pathname, flags, mode);
}
//endregion
//region original_write / my_write
ssize_t (*original_write)(int fd, const void *const __pass_object_size0 buf, size_t count);ssize_t my_write(int fd, const void *const buf, size_t count) {//如果标记为isTraceWrite为true,第一个signalCatcher线程,write调用即为打印trace的地方if (isTraceWrite && gettid() == signalCatcherTid) {isTraceWrite = false;signalCatcherTid = 0;if (buf != nullptr) {std::string targetFilePath;if (fromMyPrintTrace) {targetFilePath = printTracePathstring;} else {targetFilePath = anrTracePathstring;}if (!targetFilePath.empty()) {char *content = (char *) buf;writeAnr(content, targetFilePath);if (!fromMyPrintTrace) {anrDumpTraceCallback();} else {printTraceCallback();}fromMyPrintTrace = false;}}}return original_write(fd, buf, count);
}
//endregion
// Calls Java onANRDumped; reached from handleSignal in AnrDumper.cc through anrCallback.
bool anrDumpCallback() {JNIEnv *env = JniInvocation::getEnv();if (!env) return false;env->CallStaticVoidMethod(gJ.AnrDetective, gJ.AnrDetector_onANRDumped);return true;
}//调用java的onANRDumpTrace,my_write里调用
bool anrDumpTraceCallback() {JNIEnv *env = JniInvocation::getEnv();if (!env) return false;env->CallStaticVoidMethod(gJ.AnrDetective, gJ.AnrDetector_onANRDumpTrace);return true;
}//调用java的onPrintTrace,my_write里调用
bool printTraceCallback() {JNIEnv *env = JniInvocation::getEnv();if (!env) return false;env->CallStaticVoidMethod(gJ.AnrDetective, gJ.AnrDetector_onPrintTrace);return true;
}//ok
int getApiLevel() {char buf[PROP_VALUE_MAX];int len = __system_property_get(PROP_SDK_NAME, buf);if (len <= 0)return 0;return atoi(buf);
}/*** @param isSiUser true为自己的进程* AnrDumper.cc 里handleSignal里调用anrCallback方法,或者调用siUserCallback,然后调用这个hookAnrTraceWrite回调*/
void hookAnrTraceWrite(bool isSiUser) {int apiLevel = getApiLevel();if (apiLevel < 19) {return;}//isSiUser为true,表示自己进程发的时候是通过kill发的,此处不符合逻辑,返回if (!fromMyPrintTrace && isSiUser) {return;}if (isHooking) {return;}isHooking = true;if (apiLevel >= 27) {void *libcutils_info = xhook_elf_open("/system/lib64/libcutils.so");if (!libcutils_info) {libcutils_info = xhook_elf_open("/system/lib/libcutils.so");}xhook_hook_symbol(libcutils_info, "connect", (void *) my_connect,(void **) (&original_connect));} else {void *libart_info = xhook_elf_open("libart.so");xhook_hook_symbol(libart_info, "open", (void *) my_open, (void **) (&original_open));}if (apiLevel >= 30 || apiLevel == 25 || apiLevel == 24) {void *libc_info = xhook_elf_open("libc.so");xhook_hook_symbol(libc_info, "write", (void *) my_write, (void **) (&original_write));} else if (apiLevel == 29) {void *libbase_info = xhook_elf_open("/system/lib64/libbase.so");if (!libbase_info) {libbase_info = xhook_elf_open("/system/lib/libbase.so");}xhook_hook_symbol(libbase_info, "write", (void *) my_write, (void **) (&original_write));xhook_elf_close(libbase_info);} else {void *libart_info = xhook_elf_open("libart.so");xhook_hook_symbol(libart_info, "write", (void *) my_write, (void **) (&original_write));}
}//unhook
void unHookAnrTraceWrite() {int apiLevel = getApiLevel();if (apiLevel >= 27) {void *libcutils_info = xhook_elf_open("/system/lib64/libcutils.so");xhook_hook_symbol(libcutils_info, "connect", (void *) original_connect, nullptr);} else {void *libart_info = xhook_elf_open("libart.so");xhook_hook_symbol(libart_info, "open", (void *) original_connect, nullptr);}if (apiLevel >= 30 || apiLevel == 25 || apiLevel == 24) {void *libc_info = xhook_elf_open("libc.so");xhook_hook_symbol(libc_info, "write", (void *) original_write, nullptr);} else if (apiLevel == 29) {void *libbase_info = xhook_elf_open("/system/lib64/libbase.so");xhook_hook_symbol(libbase_info, "write", (void *) original_write, nullptr);} else {void *libart_info = xhook_elf_open("libart.so");xhook_hook_symbol(libart_info, "write", (void *) original_write, nullptr);}isHooking = false;
}//初始化,开启检测Signalanr检测,真正检测的地方在AnrDumper.cc
/**
 * JNI entry: stores both trace file paths and creates the AnrDumper that listens for SIGQUIT.
 *
 * @param anrTracePath   file that receives traces of real ANRs (null is treated as "")
 * @param printTracePath file that receives traces requested via nativePrintTrace (null is treated as "")
 */
static void
nativeInitSignalAnrDetective(JNIEnv *env, jclass, jstring anrTracePath, jstring printTracePath) {
    // Copies a Java string into a std::string and releases the JNI buffer again.
    auto toStdString = [env](jstring text) -> std::string {
        if (text == nullptr) {
            return std::string();
        }
        const char *chars = env->GetStringUTFChars(text, nullptr);
        if (chars == nullptr) {
            return std::string();
        }
        std::string copy(chars);
        env->ReleaseStringUTFChars(text, chars);
        return copy;
    };

    // Drop any previous dumper before the strings it may point into are replaced.
    sAnrDumper.reset();
    anrTracePathstring = toStdString(anrTracePath);
    printTracePathstring = toStdString(printTracePath);

    // Hand over pointers into the file-level strings: they stay valid after this call returns,
    // unlike the JNI buffers released above.
    sAnrDumper.emplace(anrTracePathstring.c_str(), printTracePathstring.c_str(), anrDumpCallback);
}
// Releases the SIGQUIT detector.
/** JNI entry: destroys the AnrDumper created by nativeInitSignalAnrDetective, if any. */
static void nativeFreeSignalAnrDetective(JNIEnv *env, jclass) {
    sAnrDumper.reset();
}
//region main-thread priority detection
/** JNI entry: registers the setpriority and prctl hooks for every ".so" and refreshes xhook. */
static void nativeInitMainThreadPriorityDetective(JNIEnv *env, jclass) {
    void **savedSetPriority = (void **) (&original_setpriority);
    void **savedPrctl = (void **) (&original_prctl);

    xhook_register(".*\\.so$", "setpriority", (void *) my_setpriority, savedSetPriority);
    xhook_register(".*\\.so$", "prctl", (void *) my_prctl, savedPrctl);
    xhook_refresh(true);
}
//endregion
// Prints a trace on demand by sending SIGQUIT to the own process.
static void nativePrintTrace() {fromMyPrintTrace = true;kill(getpid(), SIGQUIT);
}template<typename T, std::size_t sz>//todo
static inline constexpr std::size_t NELEM(const T(&)[sz]) { return sz; }//todo//JNINativeMethod 数组 anr相关的
// Table handed to RegisterNatives: Java method name, JNI signature, native entry point.
static const JNINativeMethod ANR_METHODS[] = {
        {
                "nativeInitSignalAnrDetective",
                "(Ljava/lang/String;Ljava/lang/String;)V",
                (void *) nativeInitSignalAnrDetective
        },
        {
                "nativeFreeSignalAnrDetective",
                "()V",
                (void *) nativeFreeSignalAnrDetective
        },
        {
                "nativePrintTrace",
                "()V",
                (void *) nativePrintTrace
        },
};
// Native methods registered on ThreadPriorityTracer.
// Table handed to RegisterNatives: Java method name, JNI signature, native entry point.
static const JNINativeMethod THREAD_PRIORITY_METHODS[] = {
        {
                "nativeInitMainThreadPriorityDetective",
                "()V",
                (void *) nativeInitMainThreadPriorityDetective
        },
};
// JNI_OnLoad: prepares the JNI state used by this library.
/**
 * Library entry point: caches the Java classes and static method ids in gJ and registers the
 * native methods of SignalAnrTracer and ThreadPriorityTracer.
 *
 * @return JNI_VERSION_1_6 on success, -1 when any lookup or registration fails
 */
JNIEXPORT jint JNICALL JNI_OnLoad(JavaVM *vm, void *) {
    JniInvocation::init(vm);

    JNIEnv *env;
    if (vm->GetEnv(reinterpret_cast<void **>(&env), JNI_VERSION_1_6) != JNI_OK)
        return -1;

    // SignalAnrTracer: class, callbacks, native methods.
    jclass anrDetectiveCls = env->FindClass("com/tencent/matrix/trace/tracer/SignalAnrTracer");
    if (!anrDetectiveCls)
        return -1;
    gJ.AnrDetective = static_cast<jclass>(env->NewGlobalRef(anrDetectiveCls));

    gJ.AnrDetector_onANRDumped =
            env->GetStaticMethodID(anrDetectiveCls, "onANRDumped", "()V");
    gJ.AnrDetector_onANRDumpTrace =
            env->GetStaticMethodID(anrDetectiveCls, "onANRDumpTrace", "()V");
    gJ.AnrDetector_onPrintTrace =
            env->GetStaticMethodID(anrDetectiveCls, "onPrintTrace", "()V");
    // A missing method would otherwise surface much later, when a callback is invoked with a null id.
    if (!gJ.AnrDetector_onANRDumped || !gJ.AnrDetector_onANRDumpTrace ||
        !gJ.AnrDetector_onPrintTrace)
        return -1;

    if (env->RegisterNatives(anrDetectiveCls, ANR_METHODS, static_cast<jint>(NELEM(ANR_METHODS))) != 0)
        return -1;

    env->DeleteLocalRef(anrDetectiveCls);

    // ThreadPriorityTracer: class, callbacks, native methods.
    jclass threadPriorityDetectiveCls = env->FindClass("com/tencent/matrix/trace/tracer/ThreadPriorityTracer");
    if (!threadPriorityDetectiveCls)
        return -1;
    gJ.ThreadPriorityDetective = static_cast<jclass>(env->NewGlobalRef(threadPriorityDetectiveCls));

    gJ.ThreadPriorityDetective_onMainThreadPriorityModified =
            env->GetStaticMethodID(threadPriorityDetectiveCls, "onMainThreadPriorityModified",
                                   "(I)V");
    gJ.ThreadPriorityDetective_onMainThreadTimerSlackModified =
            env->GetStaticMethodID(threadPriorityDetectiveCls, "onMainThreadTimerSlackModified",
                                   "(J)V");
    if (!gJ.ThreadPriorityDetective_onMainThreadPriorityModified ||
        !gJ.ThreadPriorityDetective_onMainThreadTimerSlackModified)
        return -1;

    if (env->RegisterNatives(threadPriorityDetectiveCls, THREAD_PRIORITY_METHODS,
                             static_cast<jint>(NELEM(THREAD_PRIORITY_METHODS))) != 0)
        return -1;

    env->DeleteLocalRef(threadPriorityDetectiveCls);

    return JNI_VERSION_1_6;
}   // JNI_OnLoad

3.AnrDumper.h 定义AnrDumper,继承SignalHandler

namespace MatrixTracer {class AnrDumper : public SignalHandler {public://定义回调方法using DumpCallbackFunction = std::function<bool()>;AnrDumper(const char* anrTraceFile, const char* printTraceFile, DumpCallbackFunction&& callback);//&&引用。这个功能是C++的补充,常用在函数传参(C中一般用指针)、临时变量引用等。virtual ~AnrDumper();private://处理signal地方Result handleSignal(int sig, const siginfo_t *info, void *uc) final;const DumpCallbackFunction mCallback;
};
}   // namespace MatrixTracer#endif  // LAGDETECTOR_LAG_DETECTOR_MAIN_CPP_ANRDUMPER_H_

4.AnrDumper.cc handleSignal方法监听SIGQUIT信号,并根据信号来自其他进程还是自己进程,分别调用anrCallback或者siUserCallback。

4.1 anr是system_server进程发来的SIGQUIT,anrCallback代表可能发生了anr,之后会调用anrDumpCallback,让SignalAnrTracer检测ui线程是否block或者状态为NOT_RESPONDING


#define SIGNAL_CATCHER_THREAD_NAME "Signal Catcher"
#define SIGNAL_CATCHER_THREAD_SIGBLK 0x1000 //得到SignalCatcherThreadId,todo 没看明白
#define O_WRONLY 00000001
#define O_CREAT 00000100
#define O_TRUNC 00001000namespace MatrixTracer {static sigset_t old_sigSet;const char *mAnrTraceFile;const char *mPrintTraceFile;//建立了Signal Handler之后,我们发现在同时有sigwait和signal handler的情况下,
// 信号没有走到我们的signal handler而是依然被系统的Signal Catcher线程捕获到了,这是什么原因呢?
//
//原来是Android默认把SIGQUIT设置成了BLOCKED,所以只会响应sigwait而不会进入到我们设置的handler方法中。
// 我们通过pthread_sigmask或者sigprocmask把SIGQUIT设置为UNBLOCK,那么再次收到SIGQUIT时,就一定会进入到我们的handler方法中。需要这样设置:AnrDumper::AnrDumper(const char *anrTraceFile, const char *printTraceFile,AnrDumper::DumpCallbackFunction &&callback) : mCallback(callback) {// must unblocked SIGQUIT, otherwise the signal handler can not capture SIGQUIT// 必须unblock,否则signal handler无法接收到信号,而是由signal_cahcher线程中的sigwait接收信号,走一般的ANR流程mAnrTraceFile = anrTraceFile;mPrintTraceFile = printTraceFile;sigset_t sigSet;sigemptyset(&sigSet);sigaddset(&sigSet, SIGQUIT);pthread_sigmask(SIG_UNBLOCK, &sigSet, &old_sigSet);}//得到SignalCatcherThreadId,todo 没看明白static int getSignalCatcherThreadId() {char taskDirPath[128];DIR *taskDir;long long sigblk;int signalCatcherTid = -1;int firstSignalCatcherTid = -1;snprintf(taskDirPath, sizeof(taskDirPath), "/proc/%d/task", getpid());if ((taskDir = opendir(taskDirPath)) == nullptr) {return -1;}struct dirent *dent;pid_t tid;while ((dent = readdir(taskDir)) != nullptr) {tid = atoi(dent->d_name);if (tid <= 0) {continue;}char threadName[1024];char commFilePath[1024];snprintf(commFilePath, sizeof(commFilePath), "/proc/%d/task/%d/comm", getpid(), tid);Support::readFileAsString(commFilePath, threadName, sizeof(threadName));if (strncmp(SIGNAL_CATCHER_THREAD_NAME, threadName,sizeof(SIGNAL_CATCHER_THREAD_NAME) - 1) != 0) {continue;}if (firstSignalCatcherTid == -1) {firstSignalCatcherTid = tid;}sigblk = 0;char taskPath[128];snprintf(taskPath, sizeof(taskPath), "/proc/%d/status", tid);ScopedFileDescriptor fd(open(taskPath, O_RDONLY, 0));LineReader lr(fd.get());const char *line;size_t len;while (lr.getNextLine(&line, &len)) {if (1 == sscanf(line, "SigBlk: %" SCNx64, &sigblk)) {break;}lr.popLine(len);}if (SIGNAL_CATCHER_THREAD_SIGBLK != sigblk) {continue;}signalCatcherTid = tid;break;}closedir(taskDir);if (signalCatcherTid == -1) {signalCatcherTid = firstSignalCatcherTid;}return signalCatcherTid;}//我们通过Signal Handler抢到了SIGQUIT后,原本的Signal Catcher线程中的sigwait就不再能收到SIGQUIT了,
// 原本的dump堆栈的逻辑就无法完成了,我们为了ANR的整个逻辑和流程跟原来完全一致,需要在Signal Handler里面重新向Signal Catcher线程发送一个SIGQUIT:static void sendSigToSignalCatcher() {//遍历/proc/[pid]目录,找到SignalCatcher线程的tidint tid = getSignalCatcherThreadId();syscall(SYS_tgkill, getpid(), tid, SIGQUIT);}//SIGQUIT发生了,其他进程发来的,anr是system_server进程发来的消息,不是自己进程发来的static void *anrCallback(void *arg) {//anr可能发生了,通知SignalAnrTracer检测ui线程是否block或者状态为NOT_RESPONDINGanrDumpCallback();if (strlen(mAnrTraceFile) > 0) {//开始hook write sockethookAnrTraceWrite(false);}//转发SIGQUITsendSigToSignalCatcher();return nullptr;}//SIGQUIT发生了,自己进程发来的,不是anrstatic void *siUserCallback(void *arg) {//这里没有调用anrDumpCallback,因为是自己触发的if (strlen(mPrintTraceFile) > 0) {//开始hook write sockethookAnrTraceWrite(true);}//转发SIGQUITsendSigToSignalCatcher();return nullptr;}//另外,Signal Handler回调的第二个参数siginfo_t,也包含了一些有用的信息,该结构体的第三个字段si_code表示该信号被
// 发送的方法,SI_USER表示信号是通过kill发送的,SI_QUEUE表示信号是通过sigqueue发送的。但在Android的ANR流程中,
// 高版本使用的是sigqueue发送的信号,某些低版本使用的是kill发送的信号,并不统一。
//
//而第五个字段(极少数机型上是第四个字段)si_pid表示的是发送该信号的进程的pid,这里适用几乎所有Android版本和机型的
// 一个条件是:如果发送信号的进程是自己的进程,那么一定不是一个ANR。可以通过这个条件排除自己发送SIGQUIT,
// 而导致误报的情况。SignalHandler::Result AnrDumper::handleSignal(int sig, const siginfo_t *info, void *uc) {// Only process SIGQUIT, which indicates an ANR.if (sig != SIGQUIT) return NOT_HANDLED;//Got An ANRint fromPid1 = info->_si_pad[3];int fromPid2 = info->_si_pad[4];int myPid = getpid();pthread_t thd;if (fromPid1 != myPid && fromPid2 != myPid) {//一个条件是:如果发送信号的进程是自己的进程,那么一定不是一个ANR。可以通过这个条件排除自己发送SIGQUIT,pthread_create(&thd, nullptr, anrCallback, nullptr);} else {//自己的进程pthread_create(&thd, nullptr, siUserCallback, nullptr);}pthread_detach(thd);return HANDLED_NO_RETRIGGER;}//没用到static void *anr_trace_callback(void *args) {anrDumpTraceCallback();return nullptr;}//没用到static void *print_trace_callback(void *args) {printTraceCallback();return nullptr;}AnrDumper::~AnrDumper() {pthread_sigmask(SIG_SETMASK, &old_sigSet, nullptr);}}   // namespace MatrixTracer

5.我们的SignalHandler类

5.1 signalHandler方法是收到信号时的入口回调

5.2 handleSignal处理信号

#ifndef LAGDETECTOR_LAG_DETECTOR_MAIN_CPP_SIGNALHANDLER_H_
#define LAGDETECTOR_LAG_DETECTOR_MAIN_CPP_SIGNALHANDLER_H_

namespace MatrixTracer {

/**
 * Base class for objects that want to receive a process signal.
 * Subclasses implement handleSignal(); copying is disabled.
 */
class SignalHandler {
public:
    SignalHandler();
    virtual ~SignalHandler();

protected:
    // Outcome reported by handleSignal().
    enum Result {
        NOT_HANDLED = 0, HANDLED, HANDLED_NO_RETRIGGER
    };

    // Called for every delivered signal; implemented by subclasses.
    virtual Result handleSignal(int sig, const siginfo_t *info, void *uc) = 0;

private:
    // Raw handler installed with sigaction.
    static void signalHandler(int sig, siginfo_t *info, void *uc);

    static bool installHandlersLocked();

    SignalHandler(const SignalHandler &) = delete;              // no copy construction
    SignalHandler &operator=(const SignalHandler &) = delete;   // no copy assignment
};

}   // namespace MatrixTracer

#endif  // LAGDETECTOR_LAG_DETECTOR_MAIN_CPP_SIGNALHANDLER_H_

6.SignalHandler.cc

6.1 installHandlersLocked 通过sigaction方法建立一个Signal Handler,并将sa_sigaction设置为我们的signalHandler方法的地址

6.2 signalHandler 信号处理的地方,转发给各SignalHandler的handleSignal

//线程名字,todo,得到SignalCatcherThreadId,todo 没看明白
#define SIGNAL_CATCHER_THREAD_NAME "Signal Catcher"
//退出线程标记,todo,得到SignalCatcherThreadId,todo 没看明白
#define SIGNAL_CATCHER_THREAD_SIGBLK 0x1000namespace MatrixTracer {
//信号const int TARGET_SIG = SIGQUIT;//3
//使用sigaction方法注册signal handler进行异步监听,sOldHandlers是保存老的sigactionstruct sigaction sOldHandlers;//todobool sHandlerInstalled = false;// The global signal handler stack. This is needed because there may exist
// multiple SignalHandler instances in a process. Each will have itself
// registered in this stack.static std::vector<SignalHandler *> *sHandlerStack = nullptr;//todo
// C++11中新增了<mutex>,它是C++标准程序库中的一个头文件,定义了C++11标准中的一些互斥访问的类与方法等。其中std::mutex就是lock、unlock。std::lock_guard与std::mutex配合使用,把锁放到lock_guard中时,mutex自动上锁,lock_guard析构时,同时把mutex解锁。mutex又称互斥量。static std::mutex sHandlerStackMutex;//todostatic bool sStackInstalled = false;
// InstallAlternateStackLocked will store the newly installed stack in new_stack
// and (if it exists) the previously installed stack in old_stack.static stack_t sOldStack;//todostatic stack_t sNewStack;//todostatic void installAlternateStackLocked() {//todoif (sStackInstalled)return;//重置memset(&sOldStack, 0, sizeof(sOldStack));memset(&sNewStack, 0, sizeof(sNewStack));static constexpr unsigned kSigStackSize = std::max(16384, SIGSTKSZ);//取到老的sOldStackif (sigaltstack(nullptr, &sOldStack) == -1 || !sOldStack.ss_sp ||sOldStack.ss_size < kSigStackSize) {sNewStack.ss_sp = calloc(1, kSigStackSize);sNewStack.ss_size = kSigStackSize;//设置新的sNewStackif (sigaltstack(&sNewStack, nullptr) == -1) {free(sNewStack.ss_sp);return;}}sStackInstalled = true;ALOGV("Alternative stack installed.");}// Runs before crashing: normal context.
//    我们通过可以sigaction方法,建立一个Signal Handler:okbool SignalHandler::installHandlersLocked() {if (sHandlerInstalled) {return false;}// Fail if unable to store all the old handlers.//取到老的sOldHandlersif (sigaction(TARGET_SIG, nullptr, &sOldHandlers) == -1) {return false;}struct sigaction sa{};//sigaction结构体sa.sa_sigaction = signalHandler;//方法地址,收到信号的地方sa.sa_flags = SA_ONSTACK | SA_SIGINFO | SA_RESTART;//我们通过可以sigaction方法,建立一个Signal Handlerif (sigaction(TARGET_SIG, &sa, nullptr) == -1) {//sigaction方法,将sa设置为Signal HandlerALOGV("Signal handler cannot be installed");// At this point it is impractical to back out changes, and so failure to// install a signal is intentionally ignored.}sHandlerInstalled = true;ALOGV("Signal handler installed.");return true;}//todostatic void installDefaultHandler(int sig) {// Android L+ expose signal and sigaction symbols that override the system// ones. There is a bug in these functions where a request to set the handler// to SIG_DFL is ignored. In that case, an infinite loop is entered as the// signal is repeatedly sent to breakpad's signal handler.// To work around this, directly call the system's sigaction.struct sigaction sa;memset(&sa, 0, sizeof(sa));sigemptyset(&sa.sa_mask);sa.sa_handler = SIG_DFL;sa.sa_flags = SA_RESTART;sigaction(sig, &sa, nullptr);}// This function runs in a compromised context: see the top of the file.
// Runs on the crashing thread.static void restoreHandlersLocked() {//todoif (!sHandlerInstalled)return;//将老的sOldHandlers重新sigaction上if (sigaction(TARGET_SIG, &sOldHandlers, nullptr) == -1) {//todoinstallDefaultHandler(TARGET_SIG);}sHandlerInstalled = false;ALOGV("Signal handler restored.");}static void restoreAlternateStackLocked() {//todoif (!sStackInstalled)return;stack_t current_stack;if (sigaltstack(nullptr, &current_stack) == -1)return;// Only restore the old_stack if the current alternative stack is the one// installed by the call to InstallAlternateStackLocked.if (current_stack.ss_sp == sNewStack.ss_sp) {if (sOldStack.ss_sp) {if (sigaltstack(&sOldStack, nullptr) == -1)return;} else {stack_t disable_stack;disable_stack.ss_flags = SS_DISABLE;if (sigaltstack(&disable_stack, nullptr) == -1)return;}}free(sNewStack.ss_sp);sStackInstalled = false;}// This function runs in a compromised context: see the top of the file.
// Runs on the crashing thread.
// 发生信号处理的地方,转发给各sHandlerStack的handleSignal okvoid SignalHandler::signalHandler(int sig, siginfo_t *info, void *uc) {ALOGV("Entered signal handler.");
// All the exception signals are blocked at this point.std::unique_lock<std::mutex> lock(sHandlerStackMutex);for (auto it = sHandlerStack->rbegin(); it != sHandlerStack->rend(); ++it) {(*it)->handleSignal(sig, info, uc);}lock.unlock();}SignalHandler::SignalHandler() {//上锁,todostd::lock_guard<std::mutex> lock(sHandlerStackMutex);//建一个sHandlerStackif (!sHandlerStack)sHandlerStack = new std::vector<SignalHandler *>;//todoinstallAlternateStackLocked();//todoinstallHandlersLocked();//将自己放进去sHandlerStack->push_back(this);}SignalHandler::~SignalHandler() {std::lock_guard<std::mutex> lock(sHandlerStackMutex);auto it = std::find(sHandlerStack->begin(), sHandlerStack->end(), this);sHandlerStack->erase(it);if (sHandlerStack->empty()) {delete sHandlerStack;sHandlerStack = nullptr;restoreAlternateStackLocked();restoreHandlersLocked();}}}   // namespace MatrixTracer

微信Android客户端的ANR监控方案相关推荐

  1. 微信Android客户端的卡顿监控方案

    2021.8.1  Matrix 2.0 TraceCanary新增了以下功能 微信Android客户端的卡顿监控方案 https://mp.weixin.qq.com/s/3dubi2GVW_rVF ...

  2. 微信Android客户端架构演进之路

    去年7月,笔者在InfoQ举办的ArchSummit深圳2014的架构师峰会上,分享了微信Android客户端的架构演进史.可以说,这是一个典型的Android应用在从小到大的成长过程中的" ...

  3. 微信Android客户端中表情雨效果的实现

    微信android客户端中的表情雨效果在聊天中生动活泼,具体会出现特殊效果的词有恭喜发财.年年有余.想你了.生日快乐.么么哒等等,随着节日来临会有更新词库及图片内容出现.具体效果如下图: 现就其实现过 ...

  4. 微信Android客户端架构演进及其对开发流程的影响

    微信Android客户端架构演进及其对开发流程的影响 http://www.infoq.com/cn/presentations/android-client-architecture-evoluti ...

  5. 微信 Android 视频编码爬过的那些坑

    [编者按]Android 视频相关的开发,大概一直是整个 Android 生态.以及 Android API 中,最为分裂以及兼容性问题最为突出的一部分,本文从视频编码器的选择和如何对摄像头输出的 Y ...

  6. 抖音、美团等大厂千万级用户的Android客户端架构演进之路—

    在移动开发中,对开发者来说不同的人具有不同的能力.就像读一本书一样,一千个读者,有一千个哈姆雷特.但不管怎样,只要你是个软件开发者你就必须学习windows或Linux等操作系统的运行原理.Andro ...

  7. android手机活跃度,微信Android机型活跃度曝光,这个结果你满意吗?

    原标题:微信Android机型活跃度曝光,这个结果你满意吗? 在本周的第二届前端开发者大会上,由腾讯微信工程师公布了微信Android客户端机型前十的发布图,说的也是某款手机的存量跟用户活跃度的统计, ...

  8. 人人车Android客户端架构演进实录

    前言 对于大多数创业公司而言, 初版开发时采用的简单架构,在历经数次快速迭代后,已经成为了一个"大泥球"(源于Brian Footer和Joseph Yonder的论文<大泥 ...

  9. 微信android手机中点击大图片会自动放大图片

    自己使用的是微信Android客户端,使用img标签的src属性将图片设置好了以后,在微信中调试,点击图片竟然放大,自己没写放大图片的方法,也没有调用wx.previewImage()方法,最后查找, ...

最新文章

  1. 多重背包单调队列优化思路_多重背包之单调队列优化理论性总结
  2. php如何设定隐藏四位号码,PHP问题:php手机号码中间四位如何隐藏?
  3. python 语言-Python的语言特点
  4. todolist实现删除的功能_coc-todolist: nvim/vim 的 todolist/task 管理插件
  5. 【渝粤教育】国家开放大学2018年春季 4996T水土保持技术 参考试题
  6. 第三次作业:“我去图书馆”公众号用户体验分析
  7. “入圈”高端大获成功!小米10至尊版上市首月霸榜京东、天猫5000元以上档销量第一...
  8. eclipse中怎么复制错误提示
  9. Map<String,Object> map=new HashMap<String,Object>详解
  10. 闲话: 恭喜园子里的MVP一下, 同时问所有奋斗在技术领域的兄弟过节好~
  11. apply族函数应用指南
  12. 医疗保健行业未来发展的5大趋势预测
  13. 30分钟学会正则表达式
  14. WINDOWSXP全面优化
  15. 小镇青年程序员的逆袭人生:从差点回老家到荔枝技术骨干
  16. mysql 5.7 在线ddl
  17. 十几年稳坐“大哥”位,搞Java的程序员就是这么“牛x”!
  18. 【青少年编程】【三级】接苹果
  19. 基于Java代码自动提交Spark任务
  20. 扫雷游戏软件测试,软件测试

热门文章

  1. 设计模式-请假流程-责任链模式
  2. HTML5期末大作业:个人空间相册网页设计 (6页) HTML+CSS+JavaScript
  3. linux的iso镜像文件,linux系统安装iso文件方法
  4. OSI七层模型的功能以及设备
  5. bootstrap 详细教程笔记
  6. ssh 反向代理连接内网服务器并配置开机自启动(解决autossh无法开机自启动)
  7. 简洁好用的jquery 焦点图插件:Basic jQuery Slider
  8. python作图设置背景颜色_如何在matplotlib中设置绘图的外部背景颜色
  9. 每天半小时,一周带你手速大幅提升——几大打字练习网站测评
  10. van-icon自定义图标的引入