Re: [BUG] sched: leaf_cfs_rq_list use after free

From: Peter Zijlstra
Date: Mon Mar 14 2016 - 07:21:19 EST


On Sat, Mar 12, 2016 at 06:42:57PM +0900, Kazuki Yamaguchi wrote:

> 2e91fa7 cgroup: keep zombies associated with their original cgroups

So the below hackery yields:

[ 192.814857] ------------[ cut here ]------------
[ 192.820025] WARNING: CPU: 38 PID: 3539 at ../kernel/sched/fair.c:288 enqueue_entity+0x90d/0xa10()
[ 192.829930] Modules linked in:
[ 192.833346] CPU: 38 PID: 3539 Comm: test Not tainted 4.5.0-rc7-01136-g89456cf-dirty #346
[ 192.842377] Hardware name: Intel Corporation S2600GZ/S2600GZ, BIOS SE5C600.86B.02.02.0002.122320131210 12/23/2013
[ 192.853832] 0000000000000000 ffff880423d87b08 ffffffff81684a55 0000000000000000
[ 192.862136] ffffffff81f36e63 ffff880423d87b40 ffffffff810d7366 ffff880827432c00
[ 192.870437] ffff880827505b80 000000000000024a 0000000000000001 0000000000000000
[ 192.878744] Call Trace:
[ 192.881480] [<ffffffff81684a55>] dump_stack+0x67/0x92
[ 192.887224] [<ffffffff810d7366>] warn_slowpath_common+0x86/0xc0
[ 192.893930] [<ffffffff810d745a>] warn_slowpath_null+0x1a/0x20
[ 192.900431] [<ffffffff8111bd6d>] enqueue_entity+0x90d/0xa10
[ 192.906751] [<ffffffff811168cd>] ? select_task_rq_fair+0x48d/0x790
[ 192.913748] [<ffffffff8111bec9>] enqueue_task_fair+0x59/0x8c0
[ 192.920254] [<ffffffff8112d83d>] ? __lock_is_held+0x4d/0x70
[ 192.926572] [<ffffffff8112d83d>] ? __lock_is_held+0x4d/0x70
[ 192.932895] [<ffffffff811081bc>] activate_task+0x5c/0xb0
[ 192.938923] [<ffffffff8110874e>] ttwu_do_activate.constprop.104+0x3e/0x90
[ 192.946601] [<ffffffff81109669>] try_to_wake_up+0x1f9/0x620
[ 192.952919] [<ffffffff81109b32>] default_wake_function+0x12/0x20
[ 192.959717] [<ffffffff810da021>] child_wait_callback+0x51/0x60
[ 192.966326] [<ffffffff81125c02>] __wake_up_common+0x52/0x90
[ 192.972634] [<ffffffff811261f5>] __wake_up_sync_key+0x45/0x60
[ 192.979146] [<ffffffff810dccd6>] __wake_up_parent+0x26/0x30
[ 192.985468] [<ffffffff810ea09b>] do_notify_parent+0x30b/0x550
[ 192.991980] [<ffffffff810e9edd>] ? do_notify_parent+0x14d/0x550
[ 192.998684] [<ffffffff810e476a>] ? __ptrace_unlink+0xba/0x110
[ 193.005196] [<ffffffff810e3a18>] __ptrace_detach.part.12+0x88/0xd0
[ 193.012183] [<ffffffff810e4897>] exit_ptrace+0x87/0xd0
[ 193.018015] [<ffffffff810dc9ab>] do_exit+0xabb/0xca0
[ 193.023663] [<ffffffff810dcc1e>] do_group_exit+0x4e/0xc0
[ 193.029680] [<ffffffff810dcca4>] SyS_exit_group+0x14/0x20
[ 193.035823] [<ffffffff81b1d965>] entry_SYSCALL_64_fastpath+0x18/0xa8
[ 193.043013] ---[ end trace 8c92cd8599d0fd71 ]---


Which shows that the patch in question is indeed full of fail. The
additional hackery below (the new cpu_cgroup_exit) does indeed fix the fail.

But given that this is true for cpu, it is also very much true for perf.

So I would suggest TJ revert that patch and queue the revert for stable.

It is clearly broken, because cgroup_exit() is called from preemptible
context, so _obviously_ we can (and clearly will) schedule after that,
which is somewhat hard if the cgroup we supposedly belong to has been
torn to shreds in the meantime.
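
Modulo the exact reproduction details, the ordering the hackery below makes
visible is something like this (call chains abbreviated, see the splat above):

  /* cgroup teardown starts pulling the group's cfs_rq off the leaf list: */
  unregister_fair_sched_group(tg)
    list_del_leaf_cfs_rq(tg->cfs_rq[cpu]);	/* on_list = -1 with the hack below */

  /* a task that has passed cgroup_exit() but, since 2e91fa7, is still
   * associated with the dying cgroup gets woken and enqueued: */
  try_to_wake_up()
    enqueue_task_fair()
      enqueue_entity()
        list_add_leaf_cfs_rq(cfs_rq);		/* the WARN_ON() below fires here */

  /* tg then gets freed while its cfs_rq sits on leaf_cfs_rq_list again,
   * and the next walk of that list is the use-after-free. */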

---
kernel/sched/core.c | 17 +++++++++++++++++
kernel/sched/fair.c | 13 +++++++++----
2 files changed, 26 insertions(+), 4 deletions(-)

diff --git a/kernel/sched/core.c b/kernel/sched/core.c
index 1c82ded..3892a48 100644
--- a/kernel/sched/core.c
+++ b/kernel/sched/core.c
@@ -7640,6 +7640,9 @@ void sched_move_task(struct task_struct *tsk)
tg = container_of(task_css_check(tsk, cpu_cgrp_id, true),
struct task_group, css);
tg = autogroup_task_group(tsk, tg);
+ if (tsk->flags & PF_EXITING) /* we're dying, tg could be about to vanish */
+ tg = &root_task_group;
+
tsk->sched_task_group = tg;

#ifdef CONFIG_FAIR_GROUP_SCHED
@@ -8112,6 +8115,19 @@ static void cpu_cgroup_attach(struct cgroup_taskset *tset)
sched_move_task(task);
}

+static void cpu_cgroup_exit(struct task_struct *task)
+{
+ /*
+ * cgroup_exit() is called in the copy_process() failure path.
+ * Ignore this case since the task hasn't run yet; this avoids
+ * trying to poke a half-freed task state from generic code.
+ */
+ if (!(task->flags & PF_EXITING))
+ return;
+
+ sched_move_task(task);
+}
+
#ifdef CONFIG_FAIR_GROUP_SCHED
static int cpu_shares_write_u64(struct cgroup_subsys_state *css,
struct cftype *cftype, u64 shareval)
@@ -8443,6 +8459,7 @@ struct cgroup_subsys cpu_cgrp_subsys = {
.fork = cpu_cgroup_fork,
.can_attach = cpu_cgroup_can_attach,
.attach = cpu_cgroup_attach,
+ .exit = cpu_cgroup_exit,
.legacy_cftypes = cpu_files,
.early_init = 1,
};
diff --git a/kernel/sched/fair.c b/kernel/sched/fair.c
index 3313052..163b829 100644
--- a/kernel/sched/fair.c
+++ b/kernel/sched/fair.c
@@ -285,7 +285,9 @@ static inline struct cfs_rq *group_cfs_rq(struct sched_entity *grp)

static inline void list_add_leaf_cfs_rq(struct cfs_rq *cfs_rq)
{
- if (!cfs_rq->on_list) {
+ WARN_ON(cfs_rq->on_list == -1);
+
+ if (cfs_rq->on_list == 0) {
/*
* Ensure we either appear before our parent (if already
* enqueued) or force our parent to appear after us when it is
@@ -302,15 +304,17 @@ static inline void list_add_leaf_cfs_rq(struct cfs_rq *cfs_rq)
}

cfs_rq->on_list = 1;
+ trace_printk("%p\n", cfs_rq);
}
}

static inline void list_del_leaf_cfs_rq(struct cfs_rq *cfs_rq)
{
- if (cfs_rq->on_list) {
+ if (cfs_rq->on_list == 1) {
list_del_rcu(&cfs_rq->leaf_cfs_rq_list);
- cfs_rq->on_list = 0;
+ trace_printk("%p\n", cfs_rq);
}
+ cfs_rq->on_list = -1;
}

/* Iterate thr' all leaf cfs_rq's on a runqueue */
@@ -8356,9 +8360,10 @@ void unregister_fair_sched_group(struct task_group *tg)
/*
* Only empty task groups can be destroyed; so we can speculatively
* check on_list without danger of it being re-added.
- */
+ *
if (!tg->cfs_rq[cpu]->on_list)
continue;
+ */

rq = cpu_rq(cpu);