Re: Multiple oom_reaper BUGs: unmap_page_range racing with exit_mmap

From: Tetsuo Handa
Date: Wed Dec 06 2017 - 06:37:29 EST


David Rientjes wrote:
> On Wed, 6 Dec 2017, Tetsuo Handa wrote:
> > Also, I don't know what exit_mmap() is doing but I think that there is a
> > possibility that the OOM reaper tries to reclaim mlocked pages as soon as
> > exit_mmap() cleared VM_LOCKED flag by calling munlock_vma_pages_all().
> >
> > if (mm->locked_vm) {
> > vma = mm->mmap;
> > while (vma) {
> > if (vma->vm_flags & VM_LOCKED)
> > munlock_vma_pages_all(vma);
> > vma = vma->vm_next;
> > }
> > }
> >
>
> Yes, that looks possible as well, although the problem I have reported can
> happen with or without mlock. Did you find this by code inspection or
> have you experienced runtime problems with it?

By code inspection.

>
> I think this argues to do MMF_REAPING-style behavior at the beginning of
> exit_mmap() and avoid reaping all together once we have reached that
> point. There are no more users of the mm and we are in the process of
> tearing it down, I'm not sure that the oom reaper should be in the
> business with trying to interfere with that. Or are there actual bug
> reports where an oom victim gets wedged while in exit_mmap() prior to
> releasing its memory?

If our assumption is that the OOM reaper can reclaim the majority of the OOM
victim's memory via the victim's ->signal->oom_mm, what would be wrong with
simply reverting 212925802454672e ("mm: oom: let oom_reap_task and
exit_mmap run concurrently") and replacing mmgrab()/mmdrop_async() with
mmget()/mmput_async(), so that the OOM reaper no longer needs to worry
about tricky __mmput() behavior (like the one shown below)?

----------
kernel/fork.c | 17 ++++++++++++-----
mm/mmap.c | 17 -----------------
mm/oom_kill.c | 21 +++++++--------------
3 files changed, 19 insertions(+), 36 deletions(-)

diff --git a/kernel/fork.c b/kernel/fork.c
index 432eadf..018a857 100644
--- a/kernel/fork.c
+++ b/kernel/fork.c
@@ -394,12 +394,18 @@ static inline void free_signal_struct(struct signal_struct *sig)
{
taskstats_tgid_free(sig);
sched_autogroup_exit(sig);
- /*
- * __mmdrop is not safe to call from softirq context on x86 due to
- * pgd_dtor so postpone it to the async context
- */
- if (sig->oom_mm)
+ if (sig->oom_mm) {
+#ifdef CONFIG_MMU
+ mmput_async(sig->oom_mm);
+#else
+ /*
+ * There might be architectures where calling __mmdrop() from
+ * softirq context is not safe. Thus, postpone it to the async
+ * context.
+ */
mmdrop_async(sig->oom_mm);
+#endif
+ }
kmem_cache_free(signal_cachep, sig);
}

@@ -931,6 +937,7 @@ static inline void __mmput(struct mm_struct *mm)
}
if (mm->binfmt)
module_put(mm->binfmt->module);
+ set_bit(MMF_OOM_SKIP, &mm->flags);
mmdrop(mm);
}

diff --git a/mm/mmap.c b/mm/mmap.c
index a4d5468..fafaf06 100644
--- a/mm/mmap.c
+++ b/mm/mmap.c
@@ -3019,23 +3019,6 @@ void exit_mmap(struct mm_struct *mm)
/* Use -1 here to ensure all VMAs in the mm are unmapped */
unmap_vmas(&tlb, vma, 0, -1);

- set_bit(MMF_OOM_SKIP, &mm->flags);
- if (unlikely(tsk_is_oom_victim(current))) {
- /*
- * Wait for oom_reap_task() to stop working on this
- * mm. Because MMF_OOM_SKIP is already set before
- * calling down_read(), oom_reap_task() will not run
- * on this "mm" post up_write().
- *
- * tsk_is_oom_victim() cannot be set from under us
- * either because current->mm is already set to NULL
- * under task_lock before calling mmput and oom_mm is
- * set not NULL by the OOM killer only if current->mm
- * is found not NULL while holding the task_lock.
- */
- down_write(&mm->mmap_sem);
- up_write(&mm->mmap_sem);
- }
free_pgtables(&tlb, vma, FIRST_USER_ADDRESS, USER_PGTABLES_CEILING);
tlb_finish_mmu(&tlb, 0, -1);

diff --git a/mm/oom_kill.c b/mm/oom_kill.c
index c957be3..eb2a005 100644
--- a/mm/oom_kill.c
+++ b/mm/oom_kill.c
@@ -528,18 +528,6 @@ static bool __oom_reap_task_mm(struct task_struct *tsk, struct mm_struct *mm)
goto unlock_oom;
}

- /*
- * MMF_OOM_SKIP is set by exit_mmap when the OOM reaper can't
- * work on the mm anymore. The check for MMF_OOM_SKIP must run
- * under mmap_sem for reading because it serializes against the
- * down_write();up_write() cycle in exit_mmap().
- */
- if (test_bit(MMF_OOM_SKIP, &mm->flags)) {
- up_read(&mm->mmap_sem);
- trace_skip_task_reaping(tsk->pid);
- goto unlock_oom;
- }
-
trace_start_task_reaping(tsk->pid);

/*
@@ -683,8 +671,13 @@ static void mark_oom_victim(struct task_struct *tsk)
return;

/* oom_mm is bound to the signal struct life time. */
- if (!cmpxchg(&tsk->signal->oom_mm, NULL, mm))
- mmgrab(tsk->signal->oom_mm);
+ if (!cmpxchg(&tsk->signal->oom_mm, NULL, mm)) {
+#ifdef CONFIG_MMU
+ mmget(mm);
+#else
+ mmgrab(mm);
+#endif
+ }

/*
* Make sure that the task is woken up from uninterruptible sleep
----------

If holding the address space while so many victims' mm are waiting for
the OOM reaper to reclaim is considered dangerous, I think we can try direct
OOM reaping (like the untested patch shown below) in order to reclaim on a
first-blocked first-reaped basis (and also serve as a mitigation for the race
caused by removing oom_lock from the OOM reaper).

----------
include/linux/mm_types.h | 3 ++
include/linux/sched.h | 3 --
mm/oom_kill.c | 132 ++++++++++++++---------------------------------
3 files changed, 43 insertions(+), 95 deletions(-)

diff --git a/include/linux/mm_types.h b/include/linux/mm_types.h
index cfd0ac4..068119b 100644
--- a/include/linux/mm_types.h
+++ b/include/linux/mm_types.h
@@ -501,6 +501,9 @@ struct mm_struct {
atomic_long_t hugetlb_usage;
#endif
struct work_struct async_put_work;
+#ifdef CONFIG_MMU
+ unsigned long oom_reap_started;
+#endif

#if IS_ENABLED(CONFIG_HMM)
/* HMM needs to track a few things per mm */
diff --git a/include/linux/sched.h b/include/linux/sched.h
index a2709d2..d63b599 100644
--- a/include/linux/sched.h
+++ b/include/linux/sched.h
@@ -1081,9 +1081,6 @@ struct task_struct {
unsigned long task_state_change;
#endif
int pagefault_disabled;
-#ifdef CONFIG_MMU
- struct task_struct *oom_reaper_list;
-#endif
#ifdef CONFIG_VMAP_STACK
struct vm_struct *stack_vm_area;
#endif
diff --git a/mm/oom_kill.c b/mm/oom_kill.c
index 3b0d0fe..a9f8bae 100644
--- a/mm/oom_kill.c
+++ b/mm/oom_kill.c
@@ -309,9 +309,14 @@ static enum oom_constraint constrained_alloc(struct oom_control *oc)
return CONSTRAINT_NONE;
}

+#ifdef CONFIG_MMU
+static void oom_reap_task_mm(struct task_struct *tsk, struct mm_struct *mm);
+#endif
+
static int oom_evaluate_task(struct task_struct *task, void *arg)
{
struct oom_control *oc = arg;
+ struct mm_struct *mm;
unsigned long points;

if (oom_unkillable_task(task, NULL, oc->nodemask))
@@ -324,7 +329,8 @@ static int oom_evaluate_task(struct task_struct *task, void *arg)
* any memory is quite low.
*/
if (!is_sysrq_oom(oc) && tsk_is_oom_victim(task)) {
- if (test_bit(MMF_OOM_SKIP, &task->signal->oom_mm->flags))
+ mm = task->signal->oom_mm;
+ if (test_bit(MMF_OOM_SKIP, &mm->flags))
goto next;
goto abort;
}
@@ -357,6 +363,15 @@ static int oom_evaluate_task(struct task_struct *task, void *arg)
if (oc->chosen)
put_task_struct(oc->chosen);
oc->chosen = (void *)-1UL;
+#ifdef CONFIG_MMU
+ get_task_struct(task);
+ if (!is_memcg_oom(oc))
+ rcu_read_unlock();
+ oom_reap_task_mm(task, mm);
+ put_task_struct(task);
+ if (!is_memcg_oom(oc))
+ rcu_read_lock();
+#endif
return 1;
}

@@ -474,23 +489,14 @@ bool process_shares_mm(struct task_struct *p, struct mm_struct *mm)
return false;
}

-
#ifdef CONFIG_MMU
-/*
- * OOM Reaper kernel thread which tries to reap the memory used by the OOM
- * victim (if that is possible) to help the OOM killer to move on.
- */
-static struct task_struct *oom_reaper_th;
-static DECLARE_WAIT_QUEUE_HEAD(oom_reaper_wait);
-static struct task_struct *oom_reaper_list;
-static DEFINE_SPINLOCK(oom_reaper_lock);
-
static bool __oom_reap_task_mm(struct task_struct *tsk, struct mm_struct *mm)
{
struct mmu_gather tlb;
struct vm_area_struct *vma;
bool ret = true;

+ trace_wake_reaper(tsk->pid);
if (!down_read_trylock(&mm->mmap_sem)) {
ret = false;
trace_skip_task_reaping(tsk->pid);
@@ -507,8 +513,8 @@ static bool __oom_reap_task_mm(struct task_struct *tsk, struct mm_struct *mm)
* mmu_notifier_invalidate_range_{start,end} around unmap_page_range
*/
if (mm_has_notifiers(mm)) {
+ ret = false;
up_read(&mm->mmap_sem);
- schedule_timeout_idle(HZ);
goto unlock_oom;
}

@@ -567,82 +573,22 @@ static bool __oom_reap_task_mm(struct task_struct *tsk, struct mm_struct *mm)
return ret;
}

-#define MAX_OOM_REAP_RETRIES 10
-static void oom_reap_task(struct task_struct *tsk)
-{
- int attempts = 0;
- struct mm_struct *mm = tsk->signal->oom_mm;
-
- /* Retry the down_read_trylock(mmap_sem) a few times */
- while (attempts++ < MAX_OOM_REAP_RETRIES && !__oom_reap_task_mm(tsk, mm))
- schedule_timeout_idle(HZ/10);
-
- if (attempts <= MAX_OOM_REAP_RETRIES)
- goto done;
-
-
- pr_info("oom_reaper: unable to reap pid:%d (%s)\n",
- task_pid_nr(tsk), tsk->comm);
- debug_show_all_locks();
-
-done:
- tsk->oom_reaper_list = NULL;
-
- /*
- * Hide this mm from OOM killer because it has been either reaped or
- * somebody can't call up_write(mmap_sem).
- */
- set_bit(MMF_OOM_SKIP, &mm->flags);
-
- /* Drop a reference taken by wake_oom_reaper */
- put_task_struct(tsk);
-}
-
-static int oom_reaper(void *unused)
+static void oom_reap_task_mm(struct task_struct *tsk, struct mm_struct *mm)
{
- while (true) {
- struct task_struct *tsk = NULL;
-
- wait_event_freezable(oom_reaper_wait, oom_reaper_list != NULL);
- spin_lock(&oom_reaper_lock);
- if (oom_reaper_list != NULL) {
- tsk = oom_reaper_list;
- oom_reaper_list = tsk->oom_reaper_list;
+ if (__oom_reap_task_mm(tsk, mm))
+ /* Hide this mm from OOM killer because we reaped it. */
+ set_bit(MMF_OOM_SKIP, &mm->flags);
+ else if (!mm->oom_reap_started)
+ mm->oom_reap_started = jiffies;
+ else if (time_after(jiffies, mm->oom_reap_started + HZ)) {
+ if (!mm_has_notifiers(mm)) {
+ pr_info("oom_reaper: unable to reap pid:%d (%s)\n",
+ task_pid_nr(tsk), tsk->comm);
+ debug_show_all_locks();
}
- spin_unlock(&oom_reaper_lock);
-
- if (tsk)
- oom_reap_task(tsk);
+ /* Hide this mm from OOM killer because we can't reap. */
+ set_bit(MMF_OOM_SKIP, &mm->flags);
}
-
- return 0;
-}
-
-static void wake_oom_reaper(struct task_struct *tsk)
-{
- /* tsk is already queued? */
- if (tsk == oom_reaper_list || tsk->oom_reaper_list)
- return;
-
- get_task_struct(tsk);
-
- spin_lock(&oom_reaper_lock);
- tsk->oom_reaper_list = oom_reaper_list;
- oom_reaper_list = tsk;
- spin_unlock(&oom_reaper_lock);
- trace_wake_reaper(tsk->pid);
- wake_up(&oom_reaper_wait);
-}
-
-static int __init oom_init(void)
-{
- oom_reaper_th = kthread_run(oom_reaper, NULL, "oom_reaper");
- return 0;
-}
-subsys_initcall(oom_init)
-#else
-static inline void wake_oom_reaper(struct task_struct *tsk)
-{
}
#endif /* CONFIG_MMU */

@@ -825,7 +771,6 @@ static void oom_kill_process(struct oom_control *oc, const char *message)
unsigned int victim_points = 0;
static DEFINE_RATELIMIT_STATE(oom_rs, DEFAULT_RATELIMIT_INTERVAL,
DEFAULT_RATELIMIT_BURST);
- bool can_oom_reap = true;

/*
* If the task is already exiting, don't alarm the sysadmin or kill
@@ -835,7 +780,6 @@ static void oom_kill_process(struct oom_control *oc, const char *message)
task_lock(p);
if (task_will_free_mem(p)) {
mark_oom_victim(p);
- wake_oom_reaper(p);
task_unlock(p);
put_task_struct(p);
return;
@@ -924,7 +868,6 @@ static void oom_kill_process(struct oom_control *oc, const char *message)
if (same_thread_group(p, victim))
continue;
if (is_global_init(p)) {
- can_oom_reap = false;
set_bit(MMF_OOM_SKIP, &mm->flags);
pr_info("oom killer %d (%s) has mm pinned by %d (%s)\n",
task_pid_nr(victim), victim->comm,
@@ -941,9 +884,15 @@ static void oom_kill_process(struct oom_control *oc, const char *message)
}
rcu_read_unlock();

- if (can_oom_reap)
- wake_oom_reaper(victim);
-
+#ifdef CONFIG_MMU
+ /*
+ * sysctl_oom_kill_allocating_task case could get stuck because
+ * select_bad_process() which calls oom_reap_task_mm() is not called.
+ */
+ if (sysctl_oom_kill_allocating_task &&
+ !test_bit(MMF_OOM_SKIP, &mm->flags))
+ oom_reap_task_mm(victim, mm);
+#endif
mmdrop(mm);
put_task_struct(victim);
}
@@ -1019,7 +968,6 @@ bool out_of_memory(struct oom_control *oc)
*/
if (task_will_free_mem(current)) {
mark_oom_victim(current);
- wake_oom_reaper(current);
return true;
}

----------