[PATCH] kthreads: Fix startup synchronization boot crash

From: Ingo Molnar
Date: Tue Sep 01 2009 - 07:36:34 EST


-tip testing found this bootup crash:

[ 0.025010] Checking 'hlt' instruction... OK.
[ 0.032017] calling spawn_ksoftirqd+0x0/0x48 @ 1
[ 0.033024] BUG: unable to handle kernel NULL pointer dereference at (null)
[ 0.033994] IP: [<c102b00d>] try_to_wake_up+0x2f/0x174
[ 0.033994] *pde = 00000000
[ 0.033994] Oops: 0000 [#1] DEBUG_PAGEALLOC
[ 0.033994] last sysfs file:
[ 0.033994] Modules linked in:
[ 0.033994]
[ 0.033994] Pid: 1, comm: swapper Not tainted (2.6.31-rc6-00024-g23386d6-dirty #9074)
[ 0.033994] EIP: 0060:[<c102b00d>] EFLAGS: 00010046 CPU: 0
[ 0.033994] EIP is at try_to_wake_up+0x2f/0x174
[ 0.033994] EAX: 0935b29c EBX: 0000000f ECX: 00000000 EDX: 00000000
[ 0.033994] ESI: 00000000 EDI: 00000000 EBP: f7051edc ESP: f7051eb8
[ 0.033994] DS: 007b ES: 007b FS: 0000 GS: 00e0 SS: 0068
[ 0.033994] Process swapper (pid: 1, ti=f7050000 task=f7048000 task.ti=f7050000)
[ 0.033994] Stack:
[ 0.033994] 00000246 c1045384 00000000 c1b8800c 00000246 0935b29c c1a3b088 00000000
[ 0.033994] <0> 00000000 f7051ee8 c102b1c7 0935b29c f7051f40 c104538e c1034aed 00000000
[ 0.033994] <0> c1a30e5f 00000000 00000001 dead4ead ffffffff ffffffff c1eb1e04 00000000
[ 0.033994] Call Trace:
[ 0.033994] [<c1045384>] ? kthread_create+0x63/0xdb
[ 0.033994] [<c102b1c7>] ? wake_up_process+0x1b/0x2e
[ 0.033994] [<c104538e>] ? kthread_create+0x6d/0xdb
[ 0.033994] [<c1034aed>] ? ksoftirqd+0x0/0xb8
[ 0.033994] [<c1048dc8>] ? ktime_get_ts+0x4e/0x64
[ 0.033994] [<c1d7446b>] ? cpu_callback+0x3e/0x94
[ 0.033994] [<c1034aed>] ? ksoftirqd+0x0/0xb8
[ 0.033994] [<c1d41f3c>] ? spawn_ksoftirqd+0x22/0x48
[ 0.033994] [<c100113c>] ? _stext+0x54/0x135
[ 0.033994] [<c1d41f1a>] ? spawn_ksoftirqd+0x0/0x48
[ 0.033994] [<c1002e75>] ? restore_all_notrace+0x0/0x18
[ 0.033994] [<c1055ce9>] ? trace_hardirqs_on_caller+0xb9/0xf2
[ 0.033994] [<c13294ec>] ? trace_hardirqs_on_thunk+0xc/0x10
[ 0.033994] [<c1002e75>] ? restore_all_notrace+0x0/0x18
[ 0.033994] [<c1d302ab>] ? kernel_init+0x0/0xec
[ 0.033994] [<c1d302ed>] ? kernel_init+0x42/0xec
[ 0.033994] [<c1d302ab>] ? kernel_init+0x0/0xec
[ 0.033994] [<c10037a7>] ? kernel_thread_helper+0x7/0x58

Which is caused by kthreadd_task being NULL.

The modification of that variable is protected by the BKL, but
the _ordering_ of the initial task (which becomes the idle
thread of CPU0) and the init task (which is spawned by the
initial task) is not synchronized.

So we can occasionally end up init running sooner than
rest_init(), and the ksoftirqd creation failing in
kthread_create() due to the NULL kthreadd_task value.

Add a completion to serialize this - made dependent on the
kthreadd_task pointer value. (which will serialize fine right
now as both are accessed via the BKL.)

(I think this code could be cleaned up further to have less
open-coded serialization, the fix here is the minimal change to
fix the regression.)

I think a side-effect of this recent commit might have opened
that race:

cdd140b: kthreads: simplify the startup synchronization

But it needs certain timing sequences to trigger.

c: Oleg Nesterov <oleg@xxxxxxxxxx>
Cc: Andrew Morton <akpm@xxxxxxxxxxxxxxxxxxxx>
Cc: Linus Torvalds <torvalds@xxxxxxxxxxxxxxxxxxxx>
Cc: <stable@xxxxxxxxxx>
Signed-off-by: Ingo Molnar <mingo@xxxxxxx>
---
include/linux/kthread.h | 1 +
init/main.c | 2 ++
kernel/kthread.c | 5 +++++
3 files changed, 8 insertions(+), 0 deletions(-)

diff --git a/include/linux/kthread.h b/include/linux/kthread.h
index aabc8a1..1ca19fa 100644
--- a/include/linux/kthread.h
+++ b/include/linux/kthread.h
@@ -33,5 +33,6 @@ int kthread_should_stop(void);

int kthreadd(void *unused);
extern struct task_struct *kthreadd_task;
+extern struct completion kthreadd_task_init_done;

#endif /* _LINUX_KTHREAD_H */
diff --git a/init/main.c b/init/main.c
index 11f4f14..6c1b10b 100644
--- a/init/main.c
+++ b/init/main.c
@@ -455,6 +455,8 @@ static noinline void __init_refok rest_init(void)
numa_default_policy();
pid = kernel_thread(kthreadd, NULL, CLONE_FS | CLONE_FILES);
kthreadd_task = find_task_by_pid_ns(pid, &init_pid_ns);
+ complete(&kthreadd_task_init_done);
+
unlock_kernel();

/*
diff --git a/kernel/kthread.c b/kernel/kthread.c
index eb8751a..6ec4643 100644
--- a/kernel/kthread.c
+++ b/kernel/kthread.c
@@ -20,7 +20,9 @@

static DEFINE_SPINLOCK(kthread_create_lock);
static LIST_HEAD(kthread_create_list);
+
struct task_struct *kthreadd_task;
+DECLARE_COMPLETION(kthreadd_task_init_done);

struct kthread_create_info
{
@@ -129,6 +131,9 @@ struct task_struct *kthread_create(int (*threadfn)(void *data),
list_add_tail(&create.list, &kthread_create_list);
spin_unlock(&kthread_create_lock);

+ if (unlikely(!kthreadd_task))
+ wait_for_completion(&kthreadd_task_init_done);
+
wake_up_process(kthreadd_task);
wait_for_completion(&create.done);

--
To unsubscribe from this list: send the line "unsubscribe linux-kernel" in
the body of a message to majordomo@xxxxxxxxxxxxxxx
More majordomo info at http://vger.kernel.org/majordomo-info.html
Please read the FAQ at http://www.tux.org/lkml/