[patch] fixed both processes in D state and the /proc/ oopses [Re: [patch] Fixed the race that was o

Andrea Arcangeli (andrea@e-mind.com)
Wed, 27 Jan 1999 12:41:28 +0100 (CET)


On Wed, 27 Jan 1999, Andrea Arcangeli wrote:

> On Wed, 27 Jan 1999, Andrea Arcangeli wrote:
>
> > The other problem, the one that was hurting everybody, is that mmget()
> > was increasing the counter unconditionally. It could increase it even
> > when it was already 0. So we could have an mm with count 1 (increased
> > by /proc), but
>
> Before somebody notices this: my fix is not the right one (probably due
> to lack of sleep ;). Incidentally it worked _fine_ because the race is so
> fast that the slab cache had no time to reuse the mm_struct (the one
> where I hold the mm_lock even though it has just been deallocated) for
> another task.
>
> I am fixing it right this time. The fix is to make the tsk->mm =
> &init_mm assignment and the mmput() atomic.
>
> A real fix is coming soon...

The good news is that, with the help of Tom Holroyd (who applied my
semaphore-deadlock-detector patch and sent me back the report), I now know
that the many processes in D state that everybody noticed are caused by
the mmget/mmput race too (because the down(&mm->mmap_sem) was done on an
mm_struct that had just gone away some time before, I think). So this
patch of mine will fix both the mmget/mmput races and the
processes-stuck-in-D-state problem.
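
To see the race more explicitly, here is roughly the bad interleaving (a
simplified sketch of the 2.2.0 code paths, illustrative only, not the
literal source):

	/* CPU1: the /proc reader (ps, free, ...) samples the pointer
	   first ... */
	struct mm_struct * mm = tsk->mm;  /* still the old, live mm */

	/* CPU0: do_exit() -> __exit_mm() tears it down meanwhile */
	tsk->mm = &init_mm;
	mmput(mm);			/* count 1 -> 0: mm_struct freed */

	/* CPU1: ... and takes its reference too late */
	mmget(mm);			/* old code: blind count++, 0 -> 1,
					   on memory the slab can reuse */
	down(&mm->mmap_sem);		/* sleeps forever -> D state, or
					   Oopses once the slab reuses it */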

Here is the trace of the deadlocked process, generated by my semaphore
deadlock detector hack (trace produced by Tom). You can see the /proc read
(entSys -> sys_read -> array_read -> get_stat) blocked in down() on the
mmap_sem of an mm_struct that has already gone away:

[__down_failed+100/184]
[do_bottom_half+124/192]
[generate_oops+0/64]
[do_exit+952/992]
[sprintf+4560/10652]
[get_process_array+96/160]
[get_stat+56/928]
[d_alloc+64/448]
[proc_root_lookup+140/480]
[cached_lookup+28/160]
[do_follow_link+184/224]
[lookup_dentry+524/704]
[get_process_array+104/160]
[array_read+268/608]
[sys_read+256/320]
[entSys+168/192]

Here is the program I wrote last night to reproduce the race. On a clean
2.2.0 it produces an Oops here within two seconds at most:

----------------------------------------
#!/bin/sh
# Copyright (C) 1999 Andrea Arcangeli
# crash-2.2.0.sh

while :; do
free &>/dev/null &
ps &>/dev/null &
done
----------------------------------------
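
The loop keeps forking short-lived free and ps processes that read the
/proc files of tasks that are exiting at the same moment, so it hammers
exactly the window described above.

The idea of the fix is to turn mm->count back into a plain int serialized
by a new mm_lock spinlock, and to make mmget() fail instead of
resurrecting an mm whose count has already reached zero. In short (this is
just the new semantics condensed out of the diff that follows; callers
must hold mm_lock and check the return value):

	int mmget(struct mm_struct * mm)
	{
		int retval = 0;

		if (mm->count)		/* mm still alive? */
		{
			mm->count++;
			retval = 1;
		}

		return retval;	/* 0: the mm is already being destroyed */
	}

	/* typical /proc-side usage (see fs/proc/array.c below) */
	spin_lock(&mm_lock);
	if (tsk->mm && tsk->mm != &init_mm)
		if (!mmget(mm = tsk->mm))
			mm = NULL;	/* lost the race: don't touch it */
	spin_unlock(&mm_lock);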

And here is the _real_ fix against clean 2.2.0:

Index: arch/i386/kernel/ldt.c
===================================================================
RCS file: /var/cvs/linux/arch/i386/kernel/ldt.c,v
retrieving revision 1.1.1.1
diff -u -r1.1.1.1 ldt.c
--- ldt.c 1999/01/18 01:28:57 1.1.1.1
+++ linux/arch/i386/kernel/ldt.c 1999/01/27 03:01:36
@@ -83,7 +83,7 @@
set_ldt_desc(i, ldt, LDT_ENTRIES);
current->tss.ldt = _LDT(i);
load_ldt(i);
- if (atomic_read(&mm->count) > 1)
+ if (mm->count > 1)
printk(KERN_WARNING
"LDT allocated for cloned task!\n");
} else {
Index: fs/binfmt_elf.c
===================================================================
RCS file: /var/cvs/linux/fs/binfmt_elf.c,v
retrieving revision 1.1.1.1
diff -u -r1.1.1.1 binfmt_elf.c
--- binfmt_elf.c 1999/01/18 01:26:49 1.1.1.1
+++ linux/fs/binfmt_elf.c 1999/01/27 03:01:36
@@ -1067,7 +1067,7 @@

if (!current->dumpable ||
limit < ELF_EXEC_PAGESIZE ||
- atomic_read(&current->mm->count) != 1)
+ current->mm->count != 1)
return 0;
current->dumpable = 0;

Index: fs/exec.c
===================================================================
RCS file: /var/cvs/linux/fs/exec.c,v
retrieving revision 1.1.1.2
diff -u -r1.1.1.2 exec.c
--- exec.c 1999/01/23 16:28:07 1.1.1.2
+++ linux/fs/exec.c 1999/01/27 11:14:17
@@ -378,7 +378,7 @@
struct mm_struct * mm, * old_mm;
int retval, nr;

- if (atomic_read(&current->mm->count) == 1) {
+ if (current->mm->count == 1) {
flush_cache_mm(current->mm);
mm_release();
release_segments(current->mm);
@@ -409,7 +409,9 @@
activate_context(current);
up(&mm->mmap_sem);
mm_release();
+ spin_lock(&mm_lock);
mmput(old_mm);
+ spin_unlock(&mm_lock);
return 0;

/*
Index: fs/proc/array.c
===================================================================
RCS file: /var/cvs/linux/fs/proc/array.c,v
retrieving revision 1.1.1.4
diff -u -r1.1.1.4 array.c
--- array.c 1999/01/26 18:30:44 1.1.1.4
+++ linux/fs/proc/array.c 1999/01/27 11:18:17
@@ -43,6 +43,8 @@
* <Alan.Cox@linux.org>
*
* Andi Kleen : Race Fixes.
+ *
+ * Andrea Arcangeli : do_exit/mmput/mmget race fix on the Race Fixes ;).
*
*/

@@ -399,8 +401,11 @@

read_lock(&tasklist_lock);
tsk = find_task_by_pid(pid);
+ spin_lock(&mm_lock);
if (tsk && tsk->mm && tsk->mm != &init_mm)
- mmget(mm = tsk->mm);
+ if (!mmget(mm = tsk->mm))
+ mm = NULL;
+ spin_unlock(&mm_lock);
read_unlock(&tasklist_lock);
if (mm != NULL)
down(&mm->mmap_sem);
@@ -410,7 +415,9 @@
static void release_mm(struct mm_struct *mm)
{
up(&mm->mmap_sem);
+ spin_lock(&mm_lock);
mmput(mm);
+ spin_unlock(&mm_lock);
}

static unsigned long get_phys_addr(struct mm_struct *mm, unsigned long ptr)
@@ -504,17 +511,9 @@
return res;
}

-/*
- * These bracket the sleeping functions..
- */
-extern void scheduling_functions_start_here(void);
-extern void scheduling_functions_end_here(void);
-#define first_sched ((unsigned long) scheduling_functions_start_here)
-#define last_sched ((unsigned long) scheduling_functions_end_here)
-
static unsigned long get_wchan(struct task_struct *p)
{
- if (!p || p == current || p->state == TASK_RUNNING)
+ if (!p || real_tsk(p) == current || p->state == TASK_RUNNING)
return 0;
#if defined(__i386__)
{
@@ -522,7 +521,7 @@
unsigned long stack_page;
int count = 0;

- stack_page = (unsigned long)p;
+ stack_page = (unsigned long)real_tsk(p);
esp = p->tss.esp;
if (!stack_page || esp < stack_page || esp >= 8188+stack_page)
return 0;
@@ -564,7 +563,7 @@
unsigned long stack_page;
int count = 0;

- stack_page = (unsigned long)p;
+ stack_page = (unsigned long)real_tsk(p);
fp = ((struct switch_stack *)p->tss.ksp)->a6;
do {
if (fp < stack_page+sizeof(struct task_struct) ||
@@ -585,7 +584,7 @@
unsigned long stack_page;
int count = 0;

- stack_page = 4096 + (unsigned long)p;
+ stack_page = 4096 + (unsigned long)real_tsk(p);
fp = get_css_fp (&p->tss);
do {
if (fp < stack_page || fp > 4092+stack_page)
@@ -599,7 +598,7 @@
#elif defined (__sparc__)
{
unsigned long pc, fp, bias = 0;
- unsigned long task_base = (unsigned long) p;
+ unsigned long task_base = (unsigned long) real_tsk(p);
struct reg_window *rw;
int count = 0;

@@ -624,8 +623,8 @@
}

#if defined(__i386__)
-# define KSTK_EIP(tsk) (((unsigned long *)(4096+(unsigned long)(tsk)))[1019])
-# define KSTK_ESP(tsk) (((unsigned long *)(4096+(unsigned long)(tsk)))[1022])
+# define KSTK_EIP(tsk) (((unsigned long *)(4096+(unsigned long)real_tsk(tsk)))[1019])
+# define KSTK_ESP(tsk) (((unsigned long *)(4096+(unsigned long)real_tsk(tsk)))[1022])
#elif defined(__alpha__)
/*
* See arch/alpha/kernel/ptrace.c for details.
@@ -633,11 +632,11 @@
# define PT_REG(reg) (PAGE_SIZE - sizeof(struct pt_regs) \
+ (long)&((struct pt_regs *)0)->reg)
# define KSTK_EIP(tsk) \
- (*(unsigned long *)(PT_REG(pc) + PAGE_SIZE + (unsigned long)(tsk)))
+ (*(unsigned long *)(PT_REG(pc) + PAGE_SIZE + (unsigned long)real_tsk(tsk)))
# define KSTK_ESP(tsk) ((tsk) == current ? rdusp() : (tsk)->tss.usp)
#elif defined(CONFIG_ARM)
-# define KSTK_EIP(tsk) (((unsigned long *)(4096+(unsigned long)(tsk)))[1022])
-# define KSTK_ESP(tsk) (((unsigned long *)(4096+(unsigned long)(tsk)))[1020])
+# define KSTK_EIP(tsk) (((unsigned long *)(4096+(unsigned long)real_tsk(tsk)))[1022])
+# define KSTK_ESP(tsk) (((unsigned long *)(4096+(unsigned long)real_tsk(tsk)))[1020])
#elif defined(__mc68000__)
#define KSTK_EIP(tsk) \
({ \
@@ -646,7 +645,7 @@
MAP_NR((tsk)->tss.esp0) < max_mapnr) \
eip = ((struct pt_regs *) (tsk)->tss.esp0)->pc; \
eip; })
-#define KSTK_ESP(tsk) ((tsk) == current ? rdusp() : (tsk)->tss.usp)
+#define KSTK_ESP(tsk) (real_tsk(tsk) == current ? rdusp() : (tsk)->tss.usp)
#elif defined(__powerpc__)
#define KSTK_EIP(tsk) ((tsk)->tss.regs->nip)
#define KSTK_ESP(tsk) ((tsk)->tss.regs->gpr[1])
@@ -849,14 +848,24 @@
cap_t(p->cap_effective));
}

-static struct task_struct *grab_task(int pid)
+static struct task_struct *grab_task(int pid, struct task_struct *dst)
{
struct task_struct *tsk = current;
if (pid != tsk->pid) {
read_lock(&tasklist_lock);
tsk = find_task_by_pid(pid);
- if (tsk && tsk->mm && tsk->mm != &init_mm)
- mmget(tsk->mm);
+ if (tsk) {
+ spin_lock(&mm_lock);
+ if (tsk->mm && tsk->mm != &init_mm)
+ if (!mmget(tsk->mm))
+ tsk = NULL;
+ spin_unlock(&mm_lock);
+ if (tsk)
+ {
+ memcpy(dst, tsk, sizeof(struct task_struct));
+ tsk = dst;
+ }
+ }
read_unlock(&tasklist_lock);
}
return tsk;
@@ -864,16 +873,18 @@

static void release_task(struct task_struct *tsk)
{
+ spin_lock(&mm_lock);
if (tsk != current && tsk->mm && tsk->mm != &init_mm)
mmput(tsk->mm);
+ spin_unlock(&mm_lock);
}

static int get_status(int pid, char * buffer)
{
char * orig = buffer;
- struct task_struct *tsk;
+ struct task_struct *tsk, mytask;

- tsk = grab_task(pid);
+ tsk = grab_task(pid, &mytask);
if (!tsk)
return 0;
buffer = task_name(tsk, buffer);
@@ -887,7 +898,7 @@

static int get_stat(int pid, char * buffer)
{
- struct task_struct *tsk;
+ struct task_struct *tsk, mytask;
unsigned long vsize, eip, esp, wchan;
long priority, nice;
int tty_pgrp;
@@ -895,7 +906,7 @@
char state;
int res;

- tsk = grab_task(pid);
+ tsk = grab_task(pid, &mytask);
if (!tsk)
return 0;
state = *get_task_state(tsk);
@@ -1149,7 +1160,7 @@
goto getlen_out;

/* Check whether the mmaps could change if we sleep */
- volatile_task = (p != current || atomic_read(&p->mm->count) > 1);
+ volatile_task = (p != current || p->mm->count > 1);

/* decode f_pos */
lineno = *ppos >> MAPS_LINE_SHIFT;
@@ -1250,10 +1261,10 @@
#ifdef __SMP__
static int get_pidcpu(int pid, char * buffer)
{
- struct task_struct * tsk;
+ struct task_struct * tsk, mytask;
int i, len;

- tsk = grab_task(pid);
+ tsk = grab_task(pid, &mytask);
if (!tsk)
return 0;

Index: include/asm-i386/pgtable.h
===================================================================
RCS file: /var/cvs/linux/include/asm-i386/pgtable.h,v
retrieving revision 1.1.1.1
diff -u -r1.1.1.1 pgtable.h
--- pgtable.h 1999/01/18 01:27:15 1.1.1.1
+++ linux/include/asm-i386/pgtable.h 1999/01/27 03:01:36
@@ -101,7 +101,7 @@
static inline void flush_tlb_current_task(void)
{
/* just one copy of this mm? */
- if (atomic_read(&current->mm->count) == 1)
+ if (current->mm->count == 1)
local_flush_tlb(); /* and that's us, so.. */
else
smp_flush_tlb();
@@ -113,7 +113,7 @@

static inline void flush_tlb_mm(struct mm_struct * mm)
{
- if (mm == current->mm && atomic_read(&mm->count) == 1)
+ if (mm == current->mm && mm->count == 1)
local_flush_tlb();
else
smp_flush_tlb();
@@ -122,7 +122,7 @@
static inline void flush_tlb_page(struct vm_area_struct * vma,
unsigned long va)
{
- if (vma->vm_mm == current->mm && atomic_read(&current->mm->count) == 1)
+ if (vma->vm_mm == current->mm && current->mm->count == 1)
__flush_tlb_one(va);
else
smp_flush_tlb();
Index: include/linux/sched.h
===================================================================
RCS file: /var/cvs/linux/include/linux/sched.h,v
retrieving revision 1.1.1.3
diff -u -r1.1.1.3 sched.h
--- sched.h 1999/01/23 16:29:58 1.1.1.3
+++ linux/include/linux/sched.h 1999/01/27 11:18:42
@@ -114,6 +114,15 @@
extern rwlock_t tasklist_lock;
extern spinlock_t scheduler_lock;

+/*
+ * These bracket the sleeping functions..
+ */
+extern void scheduling_functions_start_here(void);
+extern void scheduling_functions_end_here(void);
+#define first_sched ((unsigned long) scheduling_functions_start_here)
+#define last_sched ((unsigned long) scheduling_functions_end_here)
+#define real_tsk(tsk) (*(tsk)->tarray_ptr)
+
extern void sched_init(void);
extern void show_state(void);
extern void trap_init(void);
@@ -164,7 +173,7 @@
struct vm_area_struct *mmap_avl; /* tree of VMAs */
struct vm_area_struct *mmap_cache; /* last find_vma result */
pgd_t * pgd;
- atomic_t count;
+ int count;
int map_count; /* number of VMAs */
struct semaphore mmap_sem;
unsigned long context;
@@ -184,7 +193,7 @@
#define INIT_MM { \
&init_mmap, NULL, NULL, \
swapper_pg_dir, \
- ATOMIC_INIT(1), 1, \
+ 1, 1, \
MUTEX, \
0, \
0, 0, 0, 0, \
@@ -391,6 +400,7 @@

extern struct task_struct **tarray_freelist;
extern spinlock_t taskslot_lock;
+extern spinlock_t mm_lock;

extern __inline__ void add_free_taskslot(struct task_struct **t)
{
@@ -609,11 +619,8 @@
* Routines for handling mm_structs
*/
extern struct mm_struct * mm_alloc(void);
-static inline void mmget(struct mm_struct * mm)
-{
- atomic_inc(&mm->count);
-}
-extern void mmput(struct mm_struct *);
+extern int FASTCALL(mmget(struct mm_struct *));
+extern void FASTCALL(mmput(struct mm_struct *));
/* Remove the current tasks stale references to the old mm_struct */
extern void mm_release(void);

Index: kernel/exit.c
===================================================================
RCS file: /var/cvs/linux/kernel/exit.c,v
retrieving revision 1.1.1.2
diff -u -r1.1.1.2 exit.c
--- exit.c 1999/01/23 16:30:27 1.1.1.2
+++ linux/kernel/exit.c 1999/01/27 11:19:46
@@ -248,8 +248,10 @@

static inline void __exit_mm(struct task_struct * tsk)
{
- struct mm_struct * mm = tsk->mm;
+ struct mm_struct * mm;

+ spin_lock(&mm_lock);
+ mm = tsk->mm;
/* Set us up to use the kernel mm state */
if (mm != &init_mm) {
flush_cache_mm(mm);
@@ -261,6 +263,7 @@
mm_release();
mmput(mm);
}
+ spin_unlock(&mm_lock);
}

void exit_mm(struct task_struct *tsk)
Index: kernel/fork.c
===================================================================
RCS file: /var/cvs/linux/kernel/fork.c,v
retrieving revision 1.1.1.3
diff -u -r1.1.1.3 fork.c
--- fork.c 1999/01/23 16:30:27 1.1.1.3
+++ linux/kernel/fork.c 1999/01/27 11:20:02
@@ -10,6 +10,11 @@
* Fork is rather simple, once you get the hang of it, but the memory
* management can be a bitch. See 'mm/mm.c': 'copy_page_tables()'
*/
+/*
+ * Fixed a race between mmget() and mmput(): added the mm_lock spinlock in
+ * the task_struct to serialize accesses to the tsk->mm field.
+ * Copyright (C) 1999 Andrea Arcangeli
+ */

#include <linux/malloc.h>
#include <linux/init.h>
@@ -39,6 +44,7 @@

struct task_struct **tarray_freelist = NULL;
spinlock_t taskslot_lock = SPIN_LOCK_UNLOCKED;
+spinlock_t mm_lock = SPIN_LOCK_UNLOCKED;

/* UID task count cache, to prevent walking entire process list every
* single fork() operation.
@@ -261,7 +267,7 @@
if (mm) {
*mm = *current->mm;
init_new_context(mm);
- atomic_set(&mm->count, 1);
+ mm->count = 1;
mm->map_count = 0;
mm->def_flags = 0;
mm->mmap_sem = MUTEX_LOCKED;
@@ -303,12 +309,25 @@
}
}

+int mmget(struct mm_struct * mm)
+{
+ int retval = 0;
+
+ if (mm->count)
+ {
+ mm->count++;
+ retval = 1;
+ }
+
+ return retval;
+}
+
/*
* Decrement the use count and release all resources for an mm.
*/
void mmput(struct mm_struct *mm)
{
- if (atomic_dec_and_test(&mm->count)) {
+ if (!--mm->count) {
release_segments(mm);
exit_mmap(mm);
free_page_tables(mm);
@@ -322,7 +341,9 @@
int retval;

if (clone_flags & CLONE_VM) {
+ spin_lock(&mm_lock);
mmget(current->mm);
+ spin_unlock(&mm_lock);
/*
* Set up the LDT descriptor for the clone task.
*/
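
A note on the fs/proc/array.c part: grab_task() now memcpy()s the whole
task_struct into a buffer on the caller's stack, so /proc keeps formatting
a private snapshot even if the real task exits under it, and the new
real_tsk() macro (i.e. *(tsk)->tarray_ptr) maps the snapshot back to the
live task where that is really needed (the stack walks in get_wchan() and
the KSTK_EIP/KSTK_ESP macros). Condensed from the diff above:

	struct task_struct mytask;	/* snapshot on the caller stack */
	struct task_struct * tsk;

	tsk = grab_task(pid, &mytask);	/* may copy *tsk into mytask */
	if (tsk) {
		/* safe: from here on we only read our private copy ... */

		/* ... except the kernel stack page, which only the live
		   task owns: */
		stack_page = (unsigned long) real_tsk(tsk);
	}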

Tom, could you confirm for me that this also fixes the popular
processes-in-D-state problem?

Andrea Arcangeli
