Binder 线程池溢出问题

Android Framework 系列教程：yuandaimaahao.github.io/AndroidFram...

视频教程、源码、答疑服务与进入 Framework 技术交流群请联系微信 zzh0838

线程池溢出情景分析

客户端发现服务端线程用完了的情景：

c 复制代码

/*
 * binder_transaction() - excerpt showing only the synchronous (non-reply,
 * non-oneway) path. Elided parts of the original function are marked
 * with "//......".
 *
 * In this path the sender queues its own completion work, records that a
 * reply is expected, pushes the transaction onto its transaction stack and
 * hands the transaction to the target via binder_proc_transaction().
 */
static void binder_transaction(struct binder_proc *proc,
			       struct binder_thread *thread,
			       struct binder_transaction_data *tr, int reply,
			       binder_size_t extra_buffers_size)
{
	//......
	if (reply) {
		//......
	} else if (!(t->flags & TF_ONE_WAY)) {
		BUG_ON(t->buffer->async_transaction != 0);
		binder_inner_proc_lock(proc);

		// Queue tcomplete as deferred work on the calling (sending) thread.
		// Author's note: this work item is handled later, once the reply
		// to this call has come back.
		binder_enqueue_deferred_thread_work_ilocked(thread, tcomplete);
		// Bookkeeping: mark that a reply is expected and push t on top of
		// the sender's transaction stack (the old top becomes from_parent).
		t->need_reply = 1;
		t->from_parent = thread->transaction_stack;
		thread->transaction_stack = t;
		binder_inner_proc_unlock(proc);
		// Queue t->work for the target and wake it up. A false return
		// means the target process/thread is dead, so undo the push above.
		if (!binder_proc_transaction(t, target_proc, target_thread)) {
			binder_inner_proc_lock(proc);
			binder_pop_transaction_ilocked(thread, t);
			binder_inner_proc_unlock(proc);
			goto err_dead_proc_or_thread;
		}
	} else {
		//......
	}
	// Drop the temporary references held on the target objects.
	if (target_thread)
		binder_thread_dec_tmpref(target_thread);
	binder_proc_dec_tmpref(target_proc);
	if (target_node)
		binder_dec_node_tmpref(target_node);
	/*
	 * write barrier to synchronize with initialization
	 * of log entry
	 */
	smp_wmb();
	WRITE_ONCE(e->debug_id_done, t_debug_id);
	return;

	//......
}

binder_proc_transaction 的实现如下：

c 复制代码

/*
 * binder_proc_transaction() - deliver a transaction to the target process.
 * @t:      transaction to deliver
 * @proc:   target process
 * @thread: target thread, or NULL to let this function pick one
 *
 * Chooses where to queue t->work (a specific thread's todo list, the
 * process-wide proc->todo list, or the node's async_todo list) and then
 * wakes the receiver unless the work was parked as pending async.
 *
 * Return: false if the target process or thread is dead, true otherwise.
 *
 * The "taken here" notes below trace the case this article discusses:
 * a synchronous call with thread == NULL and no idle thread in the target.
 */
static bool binder_proc_transaction(struct binder_transaction *t,
				    struct binder_proc *proc,
				    struct binder_thread *thread)
{
	struct binder_node *node = t->buffer->target_node;
	struct binder_priority node_prio;
	bool oneway = !!(t->flags & TF_ONE_WAY);
	bool pending_async = false;

	BUG_ON(!node);
	binder_node_lock(node);
	node_prio.prio = node->min_priority;
	node_prio.sched_policy = node->sched_policy;

	if (oneway) {
		// Oneway calls never name a target thread. If the node already
		// has an async transaction in flight, this one must wait behind it.
		BUG_ON(thread);
		if (node->has_async_transaction) {
			pending_async = true;
		} else {
			node->has_async_transaction = true;
		}
	}

	binder_inner_proc_lock(proc);

	// Bail out if the target process is dead, or a target thread was given
	// and that thread is dead.
	if (proc->is_dead || (thread && thread->is_dead)) {
		binder_inner_proc_unlock(proc);
		binder_node_unlock(node);
		return false;
	}

	// thread is NULL
	// pending_async is false
	if (!thread && !pending_async) // taken here
	// Pick the first entry of target_proc's waiting_threads list as the
	// target thread.
    // If there is no idle (waiting) thread, this returns NULL.
		thread = binder_select_thread_ilocked(proc);

	if (thread) {
		binder_transaction_priority(thread->task, t, node_prio,
					    node->inherit_rt);
		// Queue the transaction on the chosen thread's todo list.
		binder_enqueue_thread_work_ilocked(thread, &t->work);
	} else if (!pending_async) {// synchronous call with no idle thread: taken here
        // Park the work on the process-wide proc->todo list.
		binder_enqueue_work_ilocked(&t->work, &proc->todo);
	} else {
		// Pending async work waits on the node's async_todo list.
		binder_enqueue_work_ilocked(&t->work, &node->async_todo);
	}

	if (!pending_async) // taken here
		// Wake the receiving side (thread is still NULL in this scenario).
		binder_wakeup_thread_ilocked(proc, thread, !oneway /* sync */);

	binder_inner_proc_unlock(proc);
	binder_node_unlock(node);

	return true;
}

//从 target_proc 的 waiting_threads 链表中选择第一个作为 target_thread
static struct binder_thread *
binder_select_thread_ilocked(struct binder_proc *proc)
{
	struct binder_thread *thread;

	assert_spin_locked(&proc->inner_lock);
	thread = list_first_entry_or_null(&proc->waiting_threads,
					  struct binder_thread,
					  waiting_thread_node);

	if (thread)
		list_del_init(&thread->waiting_thread_node);

	return thread;
}


/*
 * binder_enqueue_thread_work_ilocked() - queue work on a specific thread.
 * @thread: thread that should process the work
 * @work:   work item to append to thread->todo
 *
 * Appends the work item to the thread's todo list and sets
 * thread->process_todo so the thread knows it has work to handle.
 */
static void
binder_enqueue_thread_work_ilocked(struct binder_thread *thread,
				   struct binder_work *work)
{
	struct list_head *queue = &thread->todo;

	binder_enqueue_work_ilocked(work, queue);
	thread->process_todo = true;
}

static void
binder_enqueue_work_ilocked(struct binder_work *work,
			   struct list_head *target_list)
{
	BUG_ON(target_list == NULL);
	BUG_ON(work->entry.next && !list_empty(&work->entry));
	list_add_tail(&work->entry, target_list);
}

/*
 * binder_wakeup_thread_ilocked() - wake the receiving side of a transaction.
 * @proc:   target process; its inner_lock must be held (asserted)
 * @thread: specific thread to wake, or NULL if none was selected
 * @sync:   true for a synchronous transaction; selects the _sync wake-up
 *          variant
 *
 * When a thread was selected, only that thread's wait queue is woken.
 * Otherwise the request is forwarded to
 * binder_wakeup_poll_threads_ilocked(), which looks for polling threads.
 *
 * In the thread-pool-exhaustion scenario discussed in this article no idle
 * thread could be selected, so @thread is NULL and the fallback is taken.
 */
static void binder_wakeup_thread_ilocked(struct binder_proc *proc,
					 struct binder_thread *thread,
					 bool sync)
{
	assert_spin_locked(&proc->inner_lock);

	if (thread) { /* not taken in this scenario: thread is NULL */
		if (sync)
			wake_up_interruptible_sync(&thread->wait);
		else
			wake_up_interruptible(&thread->wait);
		return;
	}

	/* Taken in this scenario: no specific thread to wake. */
	binder_wakeup_poll_threads_ilocked(proc, sync);
}

/*
 * binder_wakeup_poll_threads_ilocked() - wake polling threads of a process.
 * @proc: process whose threads tree is walked
 * @sync: selects wake_up_interruptible_sync() over wake_up_interruptible()
 *
 * Walks every thread in proc->threads and wakes those that have
 * BINDER_LOOPER_STATE_POLL set and are available for process-level work.
 * Threads that do not match (for example threads busy with a transaction)
 * are skipped, so this may wake nobody at all.
 *
 * NOTE(review): per the kernel wait-queue API the _sync variant is a
 * scheduling hint and does not block the caller until the woken thread
 * runs - confirm against the kernel documentation.
 */
static void binder_wakeup_poll_threads_ilocked(struct binder_proc *proc,
					       bool sync)
{
	struct rb_node *node = rb_first(&proc->threads);

	while (node != NULL) {
		struct binder_thread *candidate =
			rb_entry(node, struct binder_thread, rb_node);

		if ((candidate->looper & BINDER_LOOPER_STATE_POLL) &&
		    binder_available_for_proc_work_ilocked(candidate)) {
			if (sync)
				wake_up_interruptible_sync(&candidate->wait);
			else
				wake_up_interruptible(&candidate->wait);
		}

		node = rb_next(node);
	}
}

服务端线程的响应：

c 复制代码

// Once a server thread finishes its current job it loops around and reads
// from the driver again, which lands here.
//
// binder_thread_read() - excerpt; elided parts are marked "//......" and
// the tail of the loop body is not shown. The "taken here" notes trace the
// case where work was parked on proc->todo because no thread was idle.
static int binder_thread_read(struct binder_proc *proc,
			      struct binder_thread *thread,
			      binder_uintptr_t binder_buffer, size_t size,
			      binder_size_t *consumed, int non_block)
{
	
    //......
retry:
	binder_inner_proc_lock(proc);
    // Returns true in this scenario: the thread has no transaction stack
    // and an empty todo list, so it may take process-level work.
	wait_for_proc_work = binder_available_for_proc_work_ilocked(thread);
	binder_inner_proc_unlock(proc);

	thread->looper |= BINDER_LOOPER_STATE_WAITING;

	//......

	if (non_block) {
		if (!binder_has_work(thread, wait_for_proc_work))
			ret = -EAGAIN;
	} else {
        // Does not actually sleep in this scenario (work is already
        // pending), so execution continues below.
		ret = binder_wait_for_work(thread, wait_for_proc_work);
	}

	thread->looper &= ~BINDER_LOOPER_STATE_WAITING;

	if (ret)
		return ret;

    //......

    while (1) {
		uint32_t cmd;
		struct binder_transaction_data_secctx tr;
		struct binder_transaction_data *trd = &tr.transaction_data;
		struct binder_work *w = NULL;
		struct list_head *list = NULL;
		struct binder_transaction *t = NULL;
		struct binder_thread *t_from;
		size_t trsize = sizeof(*trd);

		binder_inner_proc_lock(proc);
		// The thread's own todo list takes precedence over proc->todo.
		if (!binder_worklist_empty_ilocked(&thread->todo))
			list = &thread->todo;
		else if (!binder_worklist_empty_ilocked(&proc->todo) &&
			   wait_for_proc_work) // this branch is taken here
			list = &proc->todo; // use the process-wide todo list
		else {
			binder_inner_proc_unlock(proc);

			/* no data added */
			if (ptr - buffer == 4 && !thread->looper_need_return)
				goto retry;
			break;
		}

		// Stop if the remaining output space cannot hold another entry.
		if (end - ptr < sizeof(tr) + 4) {
			binder_inner_proc_unlock(proc);
			break;
		}
		w = binder_dequeue_work_head_ilocked(list); // take work items one by one from the head
		if (binder_worklist_empty_ilocked(&thread->todo))
			thread->process_todo = false;
		
}

static bool binder_available_for_proc_work_ilocked(struct binder_thread *thread)
{   
	return !thread->transaction_stack &&
		binder_worklist_empty_ilocked(&thread->todo) &&
		(thread->looper & (BINDER_LOOPER_STATE_ENTERED |
				   BINDER_LOOPER_STATE_REGISTERED));
}

/*
 * binder_wait_for_work() - sleep until the thread has work to do.
 * @thread:       the calling binder thread
 * @do_proc_work: whether process-level work counts as work for this
 *                thread; if true the thread also lists itself on
 *                proc->waiting_threads while it sleeps
 *
 * Return: 0 once work is available, -ERESTARTSYS if a signal is pending
 *         after waking up.
 */
static int binder_wait_for_work(struct binder_thread *thread,
				bool do_proc_work)
{
	DEFINE_WAIT(wait);
	struct binder_proc *proc = thread->proc;
	int ret = 0;

	freezer_do_not_count();
	binder_inner_proc_lock(proc);
	for (;;) {
		prepare_to_wait(&thread->wait, &wait, TASK_INTERRUPTIBLE);
        // If work is already pending this is true and we break out
        // immediately without ever sleeping.
		if (binder_has_work_ilocked(thread, do_proc_work))
			break;
		// Advertise this thread as idle so a sender can select it.
		if (do_proc_work)
			list_add(&thread->waiting_thread_node,
				 &proc->waiting_threads);
		// Drop the lock while sleeping; retake it after waking up.
		binder_inner_proc_unlock(proc);
		schedule();
		binder_inner_proc_lock(proc);
		// No longer waiting: remove ourselves from the idle list.
		list_del_init(&thread->waiting_thread_node);
		if (signal_pending(current)) {
			ret = -ERESTARTSYS;
			break;
		}
	}
    // Clean up the wait entry and continue.
	finish_wait(&thread->wait, &wait);
	binder_inner_proc_unlock(proc);
	freezer_count();

	return ret;
}

线程池溢出 Debug 技巧

binder 线程达到上限的典型情景是：app 向 service 发起请求的频率过高，而 service 端对所有业务的执行都加了锁，导致 service 端用于接收、处理 binder 事件的线程全部卡住。当线程池（默认 16 个线程）耗尽之后，就无法再处理新的请求。如果这时 app 的主线程再调用该 service 提供的方法，就很容易出现 ANR。

我们可以通过 /sys/kernel/debug/binder 这个目录下的文件来查看所有进程的使用 binder 的状况来确定是否有线程溢出的情况出现:

一般是通过 transactions 这个文件来查看 binder 运行状况：

bash 复制代码

binder transactions:
proc 2890
context binder
  buffer 45136: 0000000000000000 size 4:0:0 delivered
proc 2197
context hwbinder
  buffer 29820: 0000000000000000 size 4:0:0 delivered

第一行是 proc 2890 表示进程的 pid，第二行表示 binder 类型，接下来就是 buffer，用于表示内核中的 binder_buffer，如果这个 buffer 在程序运行过程中越来越多，那么就有内存泄漏的可能存在，不过这种情况很少，应用程序都是通过 libbinder 这个库来使用 binder 驱动的，在一次 ipc 调用结束后，库本身会执行 binder_buffer 的清理操作，这种系统中的公用库一般都非常稳定，出 bug 的几率很小。

bash 复制代码

proc 1523
context hwbinder
  thread 1542: l 02 need_return 0 tr 0
    incoming transaction 126150: 0000000000000000 from 1744:1838 to 1523:1542 code 4 flags 10 pri 1:89 r1 node 234 size 44:0 data 0000000000000000
  buffer 126150: 0000000000000000 size 44:0:0 active

还有一种情况是 proc 下面会出现 thread，如果 thread 特别多，那么就可能存在线程溢出的情况。为了进一步确认是否已经占满，dump 该进程获得其墓碑文件，就基本能看出来线程的情况。

解决线程池耗尽的问题，一般是从 app 端去限制请求的频率，如果不能拿到 app 源码，我们也可以修改 Framework 源码来限制特定的 IPC 远程调用的访问频率。

Binder 线程池溢出问题

线程池溢出情景分析

线程池溢出 Debug 技巧

参考资料