cat /proc/pid/limits 查看最大打开文件Max open files

ls -l /proc/pid/fd 查看打开文件

cat /proc/sys/kernel/threads-max 查看线程最大创建数目

转自:https://www.jianshu.com/p/1f9cff12b84f

备份防丢。

在正式解释什么是fd泄露之前,先看看三份log,是否有眼熟而不知所措的感觉?结合公司同事的深入研究,总结了多种实际案例,才有了这篇文章,以后遇到FD泄露问题再也不慌了。

log 1: Could not read input channel file descriptors from parcel

06-22 20:34:43.035 10037 20556 20556 E AndroidRuntime: FATAL EXCEPTION: main
06-22 20:34:43.035 10037 20556 20556 E AndroidRuntime: Process: com.miui.weather2, PID: 20556
06-22 20:34:43.035 10037 20556 20556 E AndroidRuntime: java.lang.RuntimeException: Could not read input channel file descriptors from parcel.
06-22 20:34:43.035 10037 20556 20556 E AndroidRuntime:     at android.view.InputChannel.nativeReadFromParcel(Native Method)
06-22 20:34:43.035 10037 20556 20556 E AndroidRuntime:     at android.view.InputChannel.readFromParcel(InputChannel.java:148)
06-22 20:34:43.035 10037 20556 20556 E AndroidRuntime:     at android.view.InputChannel$1.createFromParcel(InputChannel.java:39)
06-22 20:34:43.035 10037 20556 20556 E AndroidRuntime:     at android.view.InputChannel$1.createFromParcel(InputChannel.java:37)
06-22 20:34:43.035 10037 20556 20556 E AndroidRuntime:     at com.android.internal.view.InputBindResult.<init>(InputBindResult.java:68)
06-22 20:34:43.035 10037 20556 20556 E AndroidRuntime:     at com.android.internal.view.InputBindResult$1.createFromParcel(InputBindResult.java:112)
06-22 20:34:43.035 10037 20556 20556 E AndroidRuntime:     at com.android.internal.view.InputBindResult$1.createFromParcel(InputBindResult.java:110)
06-22 20:34:43.035 10037 20556 20556 E AndroidRuntime:     at com.android.internal.view.IInputMethodManager$Stub$Proxy.startInputOrWindowGainedFocus(IInputMethodManager.java:723)
06-22 20:34:43.035 10037 20556 20556 E AndroidRuntime:     at android.view.inputmethod.InputMethodManager.startInputInner(InputMethodManager.java:1295)
06-22 20:34:43.035 10037 20556 20556 E AndroidRuntime:     at android.view.inputmethod.InputMethodManager.onPostWindowFocus(InputMethodManager.java:1543)
06-22 20:34:43.035 10037 20556 20556 E AndroidRuntime:     at android.view.ViewRootImpl$ViewRootHandler.handleMessage(ViewRootImpl.java:4069)
06-22 20:34:43.035 10037 20556 20556 E AndroidRuntime:     at android.os.Handler.dispatchMessage(Handler.java:106)
06-22 20:34:43.035 10037 20556 20556 E AndroidRuntime:     at android.os.Looper.loop(Looper.java:171)
06-22 20:34:43.035 10037 20556 20556 E AndroidRuntime:     at android.app.ActivityThread.main(ActivityThread.java:6642)
06-22 20:34:43.035 10037 20556 20556 E AndroidRuntime:     at java.lang.reflect.Method.invoke(Native Method)
06-22 20:34:43.035 10037 20556 20556 E AndroidRuntime:     at com.android.internal.os.RuntimeInit$MethodAndArgsCaller.run(RuntimeInit.java:518)
06-22 20:34:43.035 10037 20556 20556 E AndroidRuntime:     at com.android.internal.os.ZygoteInit.main(ZygoteInit.java:873)

log 2:Could not allocate JNI Env

06-22 11:59:30.335 2308 2308 E AndroidRuntime: FATAL EXCEPTION: main
06-22 11:59:30.335 2308 2308 E AndroidRuntime: Process: com.xiaomi.bluetooth, PID: 2308
06-22 11:59:30.335 2308 2308 E AndroidRuntime: java.lang.OutOfMemoryError: Could not allocate JNI Env
06-22 11:59:30.335 2308 2308 E AndroidRuntime: at java.lang.Thread.nativeCreate(Native Method)
06-22 11:59:30.335 2308 2308 E AndroidRuntime: at java.lang.Thread.start(Thread.java:730)
06-22 11:59:30.335 2308 2308 E AndroidRuntime: at com.android.bluetooth.ble.c.dk(SynchronizedGattCallback.java:54)
06-22 11:59:30.335 2308 2308 E AndroidRuntime: at com.android.bluetooth.ble.m.dk(GattPeripheral.java:97)
06-22 11:59:30.335 2308 2308 E AndroidRuntime: at com.android.bluetooth.ble.m.eN(GattPeripheral.java:227)
06-22 11:59:30.335 2308 2308 E AndroidRuntime: at com.android.bluetooth.ble.m.eq(GattPeripheral.java:221)
06-22 11:59:30.335 2308 2308 E AndroidRuntime: at com.android.bluetooth.ble.z.run(PeripheralConnectionManager.java:462)
06-22 11:59:30.335 2308 2308 E AndroidRuntime: at android.os.Handler.handleCallback(Handler.java:754)
06-22 11:59:30.335 2308 2308 E AndroidRuntime: at android.os.Handler.dispatchMessage(Handler.java:95)
06-22 11:59:30.335 2308 2308 E AndroidRuntime: at android.os.Looper.loop(Looper.java:160)
06-22 11:59:30.335 2308 2308 E AndroidRuntime: at android.app.ActivityThread.main(ActivityThread.java:6202)
06-22 11:59:30.335 2308 2308 E AndroidRuntime: at java.lang.reflect.Method.invoke(Native Method)
06-22 11:59:30.335 2308 2308 E AndroidRuntime: at com.android.internal.os.ZygoteInit$MethodAndArgsCaller.run(ZygoteInit.java:874)
06-22 11:59:30.335 2308 2308 E AndroidRuntime: at com.android.internal.os.ZygoteInit.main(ZygoteInit.java:764)

log 3:unable to open database file (code 14)

android.database.sqlite.SQLiteCantOpenDatabaseException: unable to open database file (code 14)at android.database.sqlite.SQLiteConnection.nativeExecuteForChangedRowCount(Native Method)at android.database.sqlite.SQLiteConnection.executeForChangedRowCount(SQLiteConnection.java:735)at android.database.sqlite.SQLiteSession.executeForChangedRowCount(SQLiteSession.java:754)at android.database.sqlite.SQLiteStatement.executeUpdateDelete(SQLiteStatement.java:64)at android.database.sqlite.SQLiteDatabase.updateWithOnConflict(SQLiteDatabase.java:1653)at android.database.sqlite.SQLiteDatabase.update(SQLiteDatabase.java:1599)at com.android.providers.telephony.TelephonyProvider.update(TelephonyProvider.java:2704)at android.content.ContentProvider$Transport.update(ContentProvider.java:357)at android.content.ContentResolver.update(ContentResolver.java:1688)at com.android.internal.telephony.SubscriptionController.setCarrierText(SubscriptionController.java:1202)at com.android.internal.telephony.SubscriptionControllerInjectorBase$DatabaseHandler.handleMessage(SubscriptionControllerInjectorBase.java:201)at android.os.Handler.dispatchMessage(Handler.java:106)at android.os.Looper.loop(Looper.java:164)at android.os.HandlerThread.run(HandlerThread.java:65)

相信大多数人都觉得这类问题不好解决,更有人觉得这种问题直接加个try catch就行,结果下个版本接着上报。因为上面的Log基本上都不是案发现场,正式开始之前需要先补一波FD泄露的基础知识。在Android进程系列第一篇---进程基础中的2.3小节也有简单介绍。

一、FD的相关概念

概念:fd的全称是File descriptor(文件描述符),在linux OS里,一切都可以抽象成文件,比如普通的文件、目录、块设备、字符设备、socket、管道等等。当调用一些系统调用(如open/socket等)时,内核会返回一个fd(就是一个数字)给你,然后就可以通过这个fd对相应的文件进行操作,比如读、写。

1、FD从何而来?

我们说fd是一个数字,那么这个数字是怎么来的?内核为每个进程维护了一个数组,数组下标就是fd,数组元素存储的是对该文件的描述。进程结构体task_struct中有一个files指针,维护着该进程所有打开的文件信息:

struct task_struct {
.../* Open file information: */struct files_struct     *files;
...
}
/*
* Open file table structure
*/
struct files_struct {/** read mostly part*/atomic_t count;bool resize_in_progress;wait_queue_head_t resize_wait;struct fdtable __rcu *fdt;struct fdtable fdtab;/** written part on a separate cache line in SMP*/spinlock_t file_lock ____cacheline_aligned_in_smp;unsigned int next_fd;unsigned long close_on_exec_init[1];unsigned long open_fds_init[1];unsigned long full_fds_bits_init[1];struct file __rcu * fd_array[NR_OPEN_DEFAULT];
};

files_struct中维护一个fdtable,fdtable里的fd是一个以文件描述符为下标的指针数组,每个元素指向一个file结构体,file结构体保存的就是已打开文件的信息。

struct fdtable {unsigned int max_fds;struct file __rcu **fd;      /* current fd array */unsigned long *close_on_exec;unsigned long *open_fds;unsigned long *full_fds_bits;struct rcu_head rcu;
};

2、阈值

linux默认对每个进程最大能打开的fd的个数是1024(软限制是1024,硬限制是4096),你可以通过/proc/$pid/limits查看Max open files:

fd阈值查看

当一个进程打开的文件数超过这个软限制值1024将无法再打开文件了。所以就报出各种问题,和OOM问题一样,crash堆栈有可能只是压死骆驼的最后一根稻草,并不是真实案发现场。所以用完fd后需要close关闭这个fd,那么这个fd对应的数字就被系统回收了,下一次的open才会被重新利用。

软限制和硬限制的区别
硬限制是软限制的上限:任何进程都可以在任何时候降低自己的硬限制,但硬限制只能由超级用户提高
软限制是内核实际执行的限制,任何进程都可以将软限制设置为任意小于等于该进程硬限制的值

如果觉得fd不够用了,也可以用下面方式调整.

getrlimit(RLIMIT_NOFILE, &rlim);
setrlimit(RLIMIT_NOFILE, &rlim);
ulimit -n 2048
android.system.Os.getrlimit(OsConstants.RLIMIT_NOFILE);

e.g.:

void modifyfdlimit() {rlimit fdLimit;fdLimit.rlim_cur = 30000;fdLimit.rlim_max = 30000;if (-1 == setrlimit(RLIMIT_NOFILE, & fdLimit)) {printf("Set max fd open count fai. /nl");char cmdBuffer[ 64];sprintf(cmdBuffer, "ulimit -n %d", 30000);if (-1 == system(cmdBuffer)) {printf("%s failed. /n", cmdBuffer);exit(0);}if (-1 == getrlimit(RLIMIT_NOFILE, & fdLimit)){printf("Ulimit fd number failed.");exit(0);}}
}

3、打开的fd查看

使用ls -la /proc/$pid/fd查看

nitrogen:/ # pidof  system_server
1956
nitrogen:/ # ls -la /proc/1956/f
fd/      fdinfo/
nitrogen:/ # ls -la /proc/1956/fd
total 0
dr-x------ 2 system system  0 2018-11-02 10:57 .
dr-xr-xr-x 9 system system  0 2018-11-02 10:57 ..
lrwx------ 1 system system 64 2018-11-02 12:26 0 -> /dev/null
lrwx------ 1 system system 64 2018-11-02 12:26 1 -> /dev/null
lr-x------ 1 system system 64 2018-11-02 12:26 10 -> /system/framework/QPerformance.jar
lrwx------ 1 system system 64 2018-11-02 12:26 100 -> anon_inode:[timerfd]
lrwx------ 1 system system 64 2018-11-02 12:26 101 -> anon_inode:[timerfd]
lrwx------ 1 system system 64 2018-11-02 12:26 102 -> anon_inode:[timerfd]
lrwx------ 1 system system 64 2018-11-02 12:26 103 -> anon_inode:[timerfd]
lrwx------ 1 system system 64 2018-11-02 12:26 104 -> anon_inode:[timerfd]
lrwx------ 1 system system 64 2018-11-02 12:26 105 -> anon_inode:[eventpoll]
lr-x------ 1 system system 64 2018-11-02 12:26 106 -> anon_inode:inotify
lr-x------ 1 system system 64 2018-11-02 12:26 107 -> pipe:[36681]
l-wx------ 1 system system 64 2018-11-02 12:26 108 -> pipe:[36681]
lrwx------ 1 system system 64 2018-11-02 12:26 109 -> anon_inode:[eventfd]
lr-x------ 1 system system 64 2018-11-02 12:26 11 -> /system/framework/core-oj.jar
lrwx------ 1 system system 64 2018-11-02 12:26 110 -> anon_inode:[eventpoll]
lrwx------ 1 system system 64 2018-11-02 12:26 111 -> socket:[35371]
lrwx------ 1 system system 64 2018-11-02 12:26 112 -> socket:[35372]
lr-x------ 1 system system 64 2018-11-02 12:26 113 -> /system/media/theme/defau

二、Fd Leak案例

现在看几种FD泄露问题的案例,FD泄露问题的特点是:

  1. 同一个问题可能出现不同堆栈, 比较隐晦
  2. Fd泄漏时内存可能不会出现不足,就算触发GC也不一定能够回收已经创建的文件句柄

日志关键字:
ashmem_create_region failed for 'indirect ref table': Too many open files
"Too many open files"
"Could not allocate JNI Env"
"Could not allocate dup blob fd"
"Could not read input channel file descriptors from parcel"
"pthread_create"
"InputChannel is not initialized"
"Could not open input channel pair"
当你看到上面几种crash的堆栈之后,就需要往fd泄露的方向上去思考了

1、Resource相关

使用输入输出流之后没有关闭就可能会出问题,例如FileInputStream,FileOutputStream,FileReader,FileWriter 等,因为每打开一个文件都需要占用一个fd。一些输入流也提供了基于fd的构造方法

174    public FileInputStream(FileDescriptor fdObj) {
175        this(fdObj, false /* isFdOwner */);
176    }
177

下面是一种泄露案例

frameworks/base/services/core/java/com/android/server/pm/ResmonWhitelistPackage.java
10final class ResmonWhitelistPackage {
11    private final File mSystemDir;
12    private final File mWhitelistFile;
13
14    final ArrayList<String> mPackages = new ArrayList<String>();
15
16    ResmonWhitelistPackage() {
17        mSystemDir = new File("/system/", "etc");
18        mWhitelistFile = new File(mSystemDir, "resmonwhitelist.txt");
19    }
20
21    void readList() {....
25        try {
26            /// M: Clear white list record before update it
27            mPackages.clear();
28            BufferedReader br = new BufferedReader(new FileReader(mWhitelistFile));
29            String line = br.readLine();
30            while (line != null) {
31                mPackages.add(line);
32                line = br.readLine();
33            }
34            br.close();
35        } catch (IOException e) {
36            //Log.e(PackageManagerService.TAG, "IO Exception happened while reading resmon whitelist");
37            e.printStackTrace();
38        }
39    }
40}

br.close()并不在finally语句中,如果readLine()抛出IOException,close就不会被执行,流(以及它占用的fd)就可能没有被关闭。如果代码想写得风骚一点,也有办法。从 Java 7 build 105 版本开始,Java 7 的编译器和运行环境支持新的 try-with-resources 语句,称为 ARM 块(Automatic Resource Management,自动资源管理)。

private static void customBufferStreamCopy(File source, File target) {try (InputStream fis = new FileInputStream(source);OutputStream fos = new FileOutputStream(target)){byte[] buf = new byte[8192];int i;while ((i = fis.read(buf)) != -1) {fos.write(buf, 0, i);}}catch (Exception e) {e.printStackTrace();}
}

代码清晰,且不会发生泄露。

2、HandlerThread相关

使用HandlerThread不小心也会发生fd泄露,看看这个案例

2.1、现象

systemui总是crash,发生问题系统版本Android O

2.2、初步分析

pid: 18465, tid: 32737, name: async_sensor >>> com.android.systemui <<<
signal 5 (SIGTRAP), code -32763 (PTRACE_EVENT_STOP), fault addr 0x3e800007fe1
x0 fffffffffffffffc x1 000000735df1ec38 x2 0000000000000010 x3 00000000ffffffff
x4 0000000000000000 x5 0000000000000008 x6 0000007428971000 x7 0000000000bb3876
x8 0000000000000016 x9 7fffffffffffffff x10 000000000000000c x11 0000000000000000
x12 000000735df1ed38 x13 000000005b20a831 x14 002cd0c4e58dc31b x15 0000fdf7aa690a91
x16 00000074245e7498 x17 000000742453bd00 x18 0000000000000004 x19 000000735df1f588
x20 000000738727b708 x21 00000000ffffffff x22 000000735df1f588 x23 000000738727b660
x24 0000000000000028 x25 000000000000000c x26 0000000014a000b0 x27 0000007385815300
x28 00000000710507c8 x29 000000735df1ebe0 x30 000000742453bd38
sp 000000735df1ebc0 pc 00000074245866dc pstate 0000000060000000
v0 00000000000000000000000000000000 v1 00000000000000000000000000000001
v2 00000000000000002065766974616e3c v3 00000000000000000000000000000000
v4 00000000000000008020080200000000 v5 00000000000000004000000000000000
v6 00000000000000000000000000000000 v7 00000000000000008020080280200802
v8 00000000000000000000000000000000 v9 00000000000000000000000000000000
v10 00000000000000000000000000000000 v11 00000000000000000000000000000000
v12 00000000000000000000000000000000 v13 00000000000000000000000000000000
v14 00000000000000000000000000000000 v15 00000000000000000000000000000000
v16 40100401401004014010040140100401 v17 a0080000a00a0000a800aa0040404000
v18 80200800000000008020080200000000 v19 000000000000000000000000ebad8083
v20 000000000000000000000000ebad8084 v21 000000000000000000000000ebad8085
v22 000000000000000000000000ebad8086 v23 000000000000000000000000ebad8087
v24 000000000000000000000000ebad8088 v25 000000000000000000000000ebad8089
v26 000000000000000000000000ebad808a v27 000000000000000000000000ebad808b
v28 000000000000000000000000ebad808c v29 000000000000000000000000ebad808d
v30 000000000000000000000000ebad808e v31 00000000000000000000000041e00000
fpsr 00000013 fpcr 00000000backtrace:
#00 pc 000000000006a6dc /system/lib64/libc.so (__epoll_pwait+8)
#01 pc 000000000001fd34 /system/lib64/libc.so (epoll_pwait+52)
#02 pc 0000000000015d08 /system/lib64/libutils.so (android::Looper::pollInner(int)+144)
#03 pc 0000000000015bf0 /system/lib64/libutils.so (android::Looper::pollOnce(int, int*, int*, void**)+108)
#04 pc 0000000000111bac /system/lib64/libandroid_runtime.so (android::android_os_MessageQueue_nativePollOnce(_JNIEnv*, _jobject*, long, int)+44)
#05 pc 0000000000c005cc /system/framework/arm64/boot-framework.oat (offset 0x9cb000) (android.app.NativeActivity.onWindowFocusChangedNative [DEDUPED]+140)
#06 pc 0000000001773f00 /system/framework/arm64/boot-framework.oat (offset 0x9cb000) (android.os.MessageQueue.next+192)

乍看是处理消息的时候挂了?继续查看log发现

06-15 22:00:33.921 1000 2155 2335 E Parcel : fcntl(F_DUPFD_CLOEXEC) failed in Parcel::read, i is 0, fds[i] is -1, fd_count is 2, error: Too many open files
06-15 22:00:33.921 1000 2155 2335 E Surface : dequeueBuffer: IGraphicBufferProducer::requestBuffer failed: -22
06-15 22:00:33.921 1000 2155 2335 I Adreno : DequeueBuffer: dequeueBuffer failed
06-15 22:00:33.921 1000 2155 2335 E Parcel : fcntl(F_DUPFD_CLOEXEC) failed in Parcel::read, i is 0, fds[i] is -1, fd_count is 2, error: Too many open files
06-15 22:00:33.921 1000 2155 2335 E Surface : dequeueBuffer: IGraphicBufferProducer::requestBuffer failed: -22
06-15 22:00:33.921 1000 2155 2335 I Adreno : DequeueBuffer: dequeueBuffer failed
06-15 22:00:33.921 1000 2155 2335 E Parcel : fcntl(F_DUPFD_CLOEXEC) failed in Parcel::read, i is 0, fds[i] is -1, fd_count is 2, error: Too many open files
06-15 22:00:33.921 1000 2155 2335 E Surface : dequeueBuffer: IGraphicBufferProducer::requestBuffer failed: -22
06-15 22:00:33.921 1000 2155 2335 I Adreno : DequeueBuffer: dequeueBuffer failed
06-15 22:00:33.922 1000 2155 2335 E Parcel : fcntl(F_DUPFD_CLOEXEC) failed in Parcel::read, i is 0, fds[i] is -1, fd_count is 2, error: Too many open files
06-15 22:00:33.922 1000 2155 2335 E Surface : dequeueBuffer: IGraphicBufferProducer::requestBuffer failed: -22
06-15 22:00:33.922 1000 2155 2335 I Adreno : DequeueBuffer: dequeueBuffer failed
06-15 22:00:33.922 1000 2155 2335 E OpenGLRenderer: GL error: GL_INVALID_OPERATION
06-15 22:00:33.922 1000 2155 2335 F OpenGLRenderer: glCopyTexSubImage2D error! GL_INVALID_OPERATION (0x502

状态栏open fd超过1024, 看log有很多上面这种log,这个是真实案发现场吗?

2.3、深入分析

O上发生NE时会将fd信息打印到tombstone文件中,看fd信息确实已经满了,多为anon_inode:[eventfd]和anon_inode:dmabuf,

backtrace:
#00 pc 000000000006a6dc /system/lib64/libc.so (__epoll_pwait+8)
#01 pc 000000000001fd34 /system/lib64/libc.so (epoll_pwait+52)
#02 pc 0000000000015d08 /system/lib64/libutils.so (android::Looper::pollInner(int)+144)
#03 pc 0000000000015bf0 /system/lib64/libutils.so (android::Looper::pollOnce(int, int*, int*, void**)+108)
#04 pc 0000000000111bac /system/lib64/libandroid_runtime.so (android::android_os_MessageQueue_nativePollOnce(_JNIEnv*, _jobject*, long, int)+44)
#05 pc 0000000000c005cc /system/framework/arm64/boot-framework.oat (offset 0x9cb000) (android.app.NativeActivity.onWindowFocusChangedNative [DEDUPED]+140)
#06 pc 0000000001773f00 /system/framework/arm64/boot-framework.oat (offset 0x9cb000) (android.os.MessageQueue.next+192)
....
fd 556: anon_inode:[eventpoll]
fd 557: anon_inode:[eventpoll]
fd 558: anon_inode:[eventfd]
fd 559: anon_inode:[eventpoll]
fd 560: anon_inode:[eventfd]
fd 561: anon_inode:[eventpoll]
fd 562: anon_inode:[eventfd]
fd 563: anon_inode:[eventfd]
fd 564: anon_inode:[eventpoll]
fd 565: anon_inode:[eventfd]
fd 566: anon_inode:[eventpoll]
fd 567: /dev/ashmem
..... //省略千行
fd 1022: anon_inode:dmabuf
fd 1023: socket:[3549620]

通过trace分析,还有一个关键的异常log,看到systemui进程有很多个async_sensor线程,为什么这个线程这么多呢?

pid: 11019, tid: 2301, name: async_sensor >>> com.android.systemui <<<
pid: 11019, tid: 2431, name: async_sensor >>> com.android.systemui <<<
pid: 11019, tid: 2522, name: async_sensor >>> com.android.systemui <<<
pid: 11019, tid: 2542, name: async_sensor >>> com.android.systemui <<<
pid: 11019, tid: 2600, name: async_sensor >>> com.android.systemui <<<
.....//省略若干
pid: 11019, tid: 5693, name: async_sensor >>> com.android.systemui <<<

搜查代码async_sensor是什么?发现async_sensor是个HandlerThread

image.png

看在DozeFactory中,有new AsyncSensorManager 的地方:

image.png

继续查看assembleMachine方法在哪里调用的,DozeService中有调用 assembleMachine的地方

image.png

回头再结合log发现,DozeService被频繁地启动,看来一步步地接近真相了。这个问题看来是锁屏同事造成的。

isTest=false, canDoze=true, userId=0
06-15 21:48:11.888 1000 1313 1384 I DreamController: Stopping dream: name=ComponentInfo{com.android.systemui/com.android.keyguard.doze.DozeService}, isTest=false, canDoze=true, userId=0
06-15 21:48:13.337 1000 1313 1384 I DreamController: Starting dream: name=ComponentInfo{com.android.systemui/com.android.keyguard.doze.DozeService}, isTest=false, canDoze=true, userId=0
06-15 21:48:24.519 1000 1313 1384 I DreamController: Stopping dream: name=ComponentInfo{com.android.systemui/com.android.keyguard.doze.DozeService}, isTest=false, canDoze=true, userId=0
06-15 21:48:33.577 1000 1313 1384 I DreamController: Starting dream: name=ComponentInfo{com.android.systemui/com.android.keyguard.doze.DozeService}, isTest=false, canDoze=true, userId=0
06-15 21:48:50.640 1000 1313 1384 I DreamController: Stopping dream: name=ComponentInfo{com.android.systemui/com.android.keyguard.doze.DozeService}, isTest=false, canDoze=true, userId=0
06-15 21:48:56.540 1000 1313 1384 I DreamController: Starting dream: name=ComponentInfo{com.android.systemui/com.android.keyguard.doze.DozeService}, isTest=false, canDoze=true, userId=0
06-15 21:49:28.240 1000 1313 1384 I DreamController: Stopping dream: name=ComponentInfo{com.android.systemui/com.android.keyguard.doze.DozeService}, isTest=false, canDoze=true, userId=0
06-15 21:49:29.207 1000 1313 1384 I DreamController: Starting dream: name=ComponentInfo{com.android.systemui/com.android.keyguard.doze.DozeService}, isTest=false, canDoze=true, userId=0
06-15 21:49:30.206 1000 1313 1384 I DreamController: Stopping dream: name=ComponentInfo{com.android.systemui/com.android.keyguard.doze.DozeService}, isTest=false, canDoze=true, userId=0
06-15 21:49:33.332 1000 1313 1384 I DreamController: Starting dream: name=ComponentInfo{com.android.systemui/com.android.keyguard.doze.DozeService}, isTest=false, canDoze=true, userId=0
06-15 21:49:37.435 1000 1313 1384 I DreamController: Stopping dream: name=ComponentInfo{com.android.systemui/com.android.keyguard.doze.DozeService}, isTest=false, canDoze=true, userId=0
06-15 21:49:50.529 1000 1313 1384 I DreamController: Starting dream: name=ComponentInfo{com.android.systemui/com.android.keyguard.doze.DozeService}, isTest=false, canDoze=true, userId=0
06-15 21:50:20.010 1000 1313 1384 I DreamController: Stopping dream: name=ComponentInfo{com.android.systemui/com.android.keyguard.doze.DozeService}, isTest=false, canDoze=true, userId=0
06-15 21:50:30.148 1000 1313 1384 I DreamController: Starting dream: name=ComponentInfo{com.android.systemui/com.android.keyguard.doze.DozeService}, isTest=false, canDoze=true, userId=0
......

2.4、修复方案

最终转给锁屏同事,将此修复

image.png

所以本问题的Root Cause就是频繁地启动了DozeService,创建了大量的HandlerThread导致fd泄露,那么为什么HandlerThread和fd泄露有关系呢?跟踪源码发现HandlerThread创建会引起Looper的创建,每一个Looper在创建的时候会打开两个fd,一个是eventfd,另外一个是epoll fd(mEpollFd),这个和tombstone文件中打印的anon_inode:[eventfd]、anon_inode:[eventpoll]也对上了。

image.png

总结:分析这种泄露问题的时候,如果不知道每个HandlerThread(Looper)会打开两个fd这一基本知识,那么这个问题就比较难以分析。

3、Thread.start相关

线程启动的时候,可能也会有fd泄露的风险,不过这种错误不太容易犯,如果你真的在一个循环中创建1024个线程,那么立刻见效,程序死掉。

image.png

trace1

    java.lang.OutOfMemoryError: Could not allocate JNI Envat java.lang.Thread.nativeCreate(Native Method)at java.lang.Thread.start(Thread.java:729)at com.android.server.wifi.WifiNative.startHal(WifiNative.java:1639)at com.android.server.wifi.WifiStateMachine.setupDriverForSoftAp(WifiStateMachine.java:3970)at com.android.server.wifi.WifiStateMachine.-wrap9(WifiStateMachine.java)at com.android.server.wifi.WifiStateMachine$InitialState.processMessage(WifiStateMachine.java:4480)at com.android.internal.util.StateMachine$SmHandler.processMsg(StateMachine.java:980)at com.android.internal.util.StateMachine$SmHandler.handleMessage(StateMachine.java:799)at android.os.Handler.dispatchMessage(Handler.java:102)at android.os.Looper.loop(Looper.java:163)at android.os.HandlerThread.run(HandlerThread.java:61)

trace2

java.lang.OutOfMemoryError: pthread_create (1040KB stack) failed: Try again
at java.lang.Thread.nativeCreate(Native Method)
at java.lang.Thread.start(Thread.java:733)
at com.tencent.mm.sdk.f.b$a.start(SourceFile:61)
at com.tencent.mm.am.a.bU(SourceFile:60)
at com.tencent.mm.ui.MMAppMgr$8.tC(SourceFile:315)
at com.tencent.mm.sdk.platformtools.am.handleMessage(SourceFile:69)
at com.tencent.mm.sdk.platformtools.aj.handleMessage(SourceFile:173)
at com.tencent.mm.sdk.platformtools.aj.dispatchMessage(SourceFile:128)
at android.os.Looper.loop(Looper.java:176)
at android.app.ActivityThread.main(ActivityThread.java:6701)
at java.lang.reflect.Method.invoke(Native Method)
at com.android.internal.os.Zygote$MethodAndArgsCaller.run(Zygote.java:246)
at com.android.internal.os.ZygoteInit.main(ZygoteInit.java:783)

4、InputChannel 相关

InputChannel也可能出现fd泄露问题,如下:

image.png

4.1 在Activity中不断弹Dialog

public class Main2Activity extends Activity {@Overrideprotected void onCreate(Bundle savedInstanceState) {super.onCreate(savedInstanceState);setContentView(R.layout.activity_main2);}public void onClick(View view) {for (int i = 0; i < 1024; i++) {AlertDialog.Builder builder = new AlertDialog.Builder(this);builder.setTitle("fd").setIcon(R.drawable.ic_launcher_background).create();builder.show();}}
}

不过一会,这个App就死了,报出了下面的问题

11-02 17:38:22.263 9351-9351/com.example.wangjing.rebootdemo E/AndroidRuntime: FATAL EXCEPTION: mainProcess: com.example.wangjing.rebootdemo, PID: 9351
java.lang.IllegalStateException: Could not execute method for android:onClickat android.view.View$DeclaredOnClickListener.onClick(View.java:5391)at android.view.View.performClick(View.java:6311)at android.view.View$PerformClick.run(View.java:24833)at android.os.Handler.handleCallback(Handler.java:794)at android.os.Handler.dispatchMessage(Handler.java:99)at android.os.Looper.loop(Looper.java:173)at android.app.ActivityThread.main(ActivityThread.java:6653)at java.lang.reflect.Method.invoke(Native Method)at com.android.internal.os.RuntimeInit$MethodAndArgsCaller.run(RuntimeInit.java:547)at com.android.internal.os.ZygoteInit.main(ZygoteInit.java:821)
Caused by: java.lang.reflect.InvocationTargetExceptionat java.lang.reflect.Method.invoke(Native Method)at android.view.View$DeclaredOnClickListener.onClick(View.java:5386)at android.view.View.performClick(View.java:6311) at android.view.View$PerformClick.run(View.java:24833) at android.os.Handler.handleCallback(Handler.java:794) at android.os.Handler.dispatchMessage(Handler.java:99) at android.os.Looper.loop(Looper.java:173) at android.app.ActivityThread.main(ActivityThread.java:6653) at java.lang.reflect.Method.invoke(Native Method) at com.android.internal.os.RuntimeInit$MethodAndArgsCaller.run(RuntimeInit.java:547) at com.android.internal.os.ZygoteInit.main(ZygoteInit.java:821)
Caused by: java.lang.RuntimeException: Could not read input channel file descriptors from parcel.at android.view.InputChannel.nativeReadFromParcel(Native Method)at android.view.InputChannel.readFromParcel(InputChannel.java:148)at android.view.IWindowSession$Stub$Proxy.addToDisplay(IWindowSession.java:804)at android.view.ViewRootImpl.setView(ViewRootImpl.java:770)at android.view.WindowManagerGlobal.addView(WindowManagerGlobal.java:356)at android.view.WindowManagerImpl.addView(WindowManagerImpl.java:94)at android.app.Dialog.show(Dialog.java:330)at android.app.AlertDialog$Builder.show(AlertDialog.java:1114)at com.example.wangjing.rebootdemo.Main2Activity.onClick(Main2Activity.java:28)at java.lang.reflect.Method.invoke(Native Method) at android.view.View$DeclaredOnClickListener.onClick(View.java:5386) at android.view.View.performClick(View.java:6311) at android.view.View$PerformClick.run(View.java:24833) at android.os.Handler.handleCallback(Handler.java:794) at android.os.Handler.dispatchMessage(Handler.java:99) at android.os.Looper.loop(Looper.java:173) at android.app.ActivityThread.main(ActivityThread.java:6653) at java.lang.reflect.Method.invoke(Native Method) at com.android.internal.os.RuntimeInit$MethodAndArgsCaller.run(RuntimeInit.java:547) at com.android.internal.os.ZygoteInit.main(ZygoteInit.java:821)

查看一下这个进程的fd信息,很多的socket,正常情况下,这些东西是没有的

jason:/ # ps -ef |grep "wangjing"
u0_a161      13134  9462 3 17:43:12 ?     00:00:04 com.example.wangjing.rebootdemo
root         13385 13380 3 17:45:26 pts/1 00:00:00 grep wangjing
jason:/ # ls -la  proc/13134/fd/
total 0
dr-x------ 2 u0_a161 u0_a161  0 2018-11-02 17:43 .
dr-xr-xr-x 9 u0_a161 u0_a161  0 2018-11-02 17:43 ..
lrwx------ 1 u0_a161 u0_a161 64 2018-11-02 17:45 0 -> /dev/null
lrwx------ 1 u0_a161 u0_a161 64 2018-11-02 17:45 1 -> /dev/null
lr-x------ 1 u0_a161 u0_a161 64 2018-11-02 17:45 10 -> /system/framework/com.nxp.nfc.nq.jar
lrwx------ 1 u0_a161 u0_a161 64 2018-11-02 17:45 100 -> socket:[17760179]
lrwx------ 1 u0_a161 u0_a161 64 2018-11-02 17:45 101 -> socket:[17753761]
lrwx------ 1 u0_a161 u0_a161 64 2018-11-02 17:45 102 -> socket:[17773237]
lrwx------ 1 u0_a161 u0_a161 64 2018-11-02 17:45 103 -> socket:[17760182]
lrwx------ 1 u0_a161 u0_a161 64 2018-11-02 17:45 104 -> socket:[17760184]
lrwx------ 1 u0_a161 u0_a161 64 2018-11-02 17:45 105 -> socket:[17773239]
lrwx------ 1 u0_a161 u0_a161 64 2018-11-02 17:45 106 -> socket:[17776657]
lrwx------ 1 u0_a161 u0_a161 64 2018-11-02 17:45 107 -> socket:[17774959]
lrwx------ 1 u0_a161 u0_a161 64 2018-11-02 17:45 108 -> socket:[17776659]
......

5、Bitmap 相关

bitmap也是需要fd的,如下图,没有关闭,就可能引发fd泄露。

image.png

trace1

java.lang.RuntimeException: Could not allocate dup blob fd.at android.graphics.Bitmap.nativeCreateFromParcel(Native Method)at android.graphics.Bitmap.access$100(Bitmap.java:36)at android.graphics.Bitmap$1.createFromParcel(Bitmap.java:1528)at android.graphics.Bitmap$1.createFromParcel(Bitmap.java:1520)at android.widget.RemoteViews$BitmapCache.<init>(RemoteViews.java:954)at android.widget.RemoteViews.<init>(RemoteViews.java:1820)at android.widget.RemoteViews.<init>(RemoteViews.java:1812)at android.widget.RemoteViews.clone(RemoteViews.java:1905)at android.app.Notification.cloneInto(Notification.java:1534)at android.app.Notification.clone(Notification.java:1508)at android.service.notification.StatusBarNotification.clone(StatusBarNotification.java:161)at com.android.server.notification.NotificationManagerService$NotificationListeners.notifyPostedLocked(NotificationManagerService.java:3557)at com.android.server.notification.NotificationManagerService$8.run(NotificationManagerService.java:2337)at android.os.Handler.handleCallback(Handler.java:815)at android.os.Handler.dispatchMessage(Handler.java:104)at android.os.Looper.loop(Looper.java:207)at com.android.server.SystemServer.run(SystemServer.java:410)at com.android.server.SystemServer.main(SystemServer.java:255)at java.lang.reflect.Method.invoke(Native Method)at com.android.internal.os.ZygoteInit$MethodAndArgsCaller.run(ZygoteInit.java:933)at com.android.internal.os.ZygoteInit.main(ZygoteInit.java:782)

三、总结

通过上面的五个案例,总结常见的fd泄露的情景,一般出现下面的log,就需要怀疑是否有fd泄露的情况。

"Too many open files"
"Could not allocate JNI Env"
"Could not allocate dup blob fd"
"Could not read input channel file descriptors from parcel"
"pthread_create"
"InputChannel is not initialized"
"Could not open input channel pair"

  • 大批量的打开“anon_inode:[eventpoll]” 和 "pipe" 或者 "anon_inode:[eventfd]", 超过100个eventpoll, 通常情况下是开启了太多的HandlerThread/Looper/MessageQueue, 线程忘记关闭, 或者looper 没有释放. 可以抓取hprof 进行快速分析

  • 对于system server, 如果有大批量的socket 打开, 可能是因为Input Channel 没有关闭, 此类同样抓取hprof, 查看system server 中WindowState 的情况.

  • 大量的打开“/dev/ashmem”, 如果是Content provider, 或者其他app, 很可能是打开数据库没有关闭, 或者数据库链接频繁打开忘记关闭. 这个时候查看这个进程的maps, cat /proc/pid/maps, 即可看到这个ashmem 的name, 然后进一步可知道在哪里泄露.

3.1、容易复现

1.查看fd信息adb shell ls -a -l /proc/<pid>/fd ,lsof

2.查看进程线程信息:ps -t <pid>,或者抓进程trace, kill -3 <pid>

3.抓取hprof定位资源使用情况

3.2、难复现

1.对于应用自身fd泄漏导致的JE,可以自定义Thread.UncaughtExceptionHandler,在应用crash的时候通过readlink的方式读取/proc/self/fd下的信息,这样问题再次发生的时候就可以获取到fd信息

2.O之后NE的Tombstone文件中有open files,可以查看打开的fd信息

3.抓取进程的ps信息或者trace信息

4.如果是inputchannel类型的,有可能是窗口类型的,因此可以查看window情况,dumpsys window

作者:LooperJing
链接:https://www.jianshu.com/p/1f9cff12b84f
来源:简书
著作权归作者所有。商业转载请联系作者获得授权,非商业转载请注明出处。

应用与系统稳定性第三篇---FD泄露问题漫谈相关推荐

  1. 应用与系统稳定性第五篇---Watchdog原理和问题分析

    前面已经这个系列已经更新了4篇,死机重启问题分析中,Watchdog问题最为常见,今天接着写一写Watchdog问题的分析套路以及工作原理. 应用与系统稳定性第一篇---ANR问题分析的一般套路 应用 ...

  2. Noah Mt4跟单系统制作第三篇 Mt4TradeApi获取报价篇

    Noah Mt4跟单系统制作第三篇 Mt4TradeApi获取报价篇 using System; using Mt4TradeApi;namespace Demo {class Program{sta ...

  3. Agv、Rgv 车辆控制调度系统开发第三篇

    Agv.Rgv 车辆控制调度系统开发第三篇地图编辑器 Agv.Rgv 车辆控制调度系统开发第三篇地图编辑器 Agv.Rgv 车辆控制调度系统开发第三篇地图编辑器 前言 一.GOJS 二.使用步骤 1. ...

  4. java dofinalize_应用与系统稳定性第六篇---JVM垃圾回收之finalize执行时引起timed out 闪退分析...

    一.背景 java.util.concurrent.TimeoutException: android.content.res.AssetManager$AssetInputStream.finali ...

  5. linux标准IO实验,Linux系统编程(第三篇) 标准IO.pdf

    第三章:标准I/O 目标: 本章旨在向学员介绍Linux系统 时间:3 学时 I/O相关函数的使用: 1)掌握I/O相关函数的特点及使 教学方法:讲授PPT 用方法 2 )了解I/O与系统调用相关的函 ...

  6. python应用:爬虫框架Scrapy系统学习第三篇——初识scrapy

    scrapy的最通用的爬虫流程:UR2IM U:URL R2:Request 以及 Response I:Item M:More URL 在scrapy shell中打开服务器一个网页 cmd中执行: ...

  7. 稳定性领导者 阿里云获得信通院多项系统稳定性认证

    4 月 26 日,由中国信息通信研究院(以下简称"中国信通院")主办的首届云系统稳定性大会在京召开.会上,中国信通院公布了系统稳定性领域的最新评估结果,混沌工程先锋实践者优秀案例评 ...

  8. Agv、Rgv 车辆控制调度系统开发第四篇

    Agv.Rgv 车辆控制调度系统开发第四篇 车辆调度模拟器 前言 一.车辆模拟器是什么? 二.如何做模拟器 1.动作仿真模拟器 2.完全仿真模拟器 总结 下期预告 系列文章链接 其他文章 新篇章 前言 ...

  9. linux小红帽系统装打印机驱动,Linux系统详解 第三篇:红帽RHEL的安装

    Linux系统详解 第二篇:红帽RHEL的安装 前言: 本系列文章取材广泛,有来自于互联网的,有来自教科书的,有来自自己的笔记的,也有来自自己对Linux的经验积累的.此系列的文章都是经过长时间的整理 ...

最新文章

  1. jQuery——获取当前索引值
  2. 隔空操作之简单的模拟三种行为
  3. ECS服务器指定实例规格最佳推荐
  4. Win7局域网打印机共享设置(详细图文流程)
  5. tmux 如何自定义背景颜色 | How does the tmux color palette work?
  6. 前端学习(3113):react-hello-类式组件
  7. There is no row in position 0
  8. 团队管理---如何管理好团队
  9. jsp显示服务器路径下的图片,jsp 从服务器获取图片路径
  10. 思达BI软件StyleIntelligence实例教程—柱状数据对比分析图
  11. 脸部识别算法_面部识别技术是种族主义者吗? 先进算法的解释
  12. 大数据技术生态体系(截图)
  13. numpy的choose 函数实现条件筛选
  14. Android Wi-Fi源码分析之wpa_supplicant初始化(三):wpa_supplicant_add_iface函数分析
  15. 依存分析:中文依存句法分析简介
  16. 省市县行政区划代码sql及源地址
  17. App、H5、PC应用多端开发框架Flutter 2发布
  18. Android开发实用小技巧九——内嵌WebView的使用(内置浏览器)
  19. 一道十分经典的intern面试题(String字符串)让你彻底搞懂intern方法
  20. C++ 学习笔记之(19) new、delete表达式、RTTI(运行时类型识别)、枚举、类成员指针、嵌套类、局部类、位域、volatile、extern C

热门文章

  1. 【桌游】微信小程序——线下桌游预约
  2. 《沈工智校》技术支持
  3. Wind数据个性化定制抓取
  4. TypeScript中any与unknown的区别
  5. 微型计算机的ALU部件包括在( )之中,微机原理第二章复习题(附答案)期末考试题...
  6. Xms Xmx PermSize MaxPermSize的含义
  7. Java正则表达式提取字符的方法实例
  8. 数据结构——图的定义和实现
  9. java 集合封装树形结构
  10. unity讲解(入门)