Re: [patch 2/2] mm, oom: fix race when specifying a thread as theoom origin

From: Michal Hocko
Date: Thu Nov 08 2012 - 10:51:04 EST


On Thu 08-11-12 01:27:00, David Rientjes wrote:
> test_set_oom_score_adj() and compare_swap_oom_score_adj() are used to
> specify that current should be killed first if an oom condition occurs in
> between the two calls.
>
> The usage is
>
> short oom_score_adj = test_set_oom_score_adj(OOM_SCORE_ADJ_MAX);
> ...
> compare_swap_oom_score_adj(OOM_SCORE_ADJ_MAX, oom_score_adj);
>
> to store the thread's oom_score_adj, temporarily change it to the maximum
> score possible, and then restore the old value if it is still the same.
>
> This happens to still be racy, however, if the user writes
> OOM_SCORE_ADJ_MAX to /proc/pid/oom_score_adj in between the two calls.
> The compare_swap_oom_score_adj() will then incorrectly reset the old
> value prior to the write of OOM_SCORE_ADJ_MAX.
>
> To fix this, introduce a new oom_flags_t member in struct signal_struct
> that will be used for per-thread oom killer flags. KSM and swapoff can
> now use a bit in this member to specify that threads should be killed
> first in oom conditions without playing around with oom_score_adj.
>
> This also allows the correct oom_score_adj to always be shown when
> reading /proc/pid/oom_score.
>
> Signed-off-by: David Rientjes <rientjes@xxxxxxxxxx>

I didn't like the previous playing with the oom_score_adj and what you
propose looks much nicer.
Maybe s/oom_task_origin/task_oom_origin/ would be a better fit with
{set,clear}_current_oom_origin but I do not care much.

Reviewed-by: Michal Hocko <mhocko@xxxxxxx>

Thanks!
> ---
> include/linux/oom.h | 19 +++++++++++++++++--
> include/linux/sched.h | 1 +
> include/linux/types.h | 1 +
> mm/ksm.c | 7 ++-----
> mm/oom_kill.c | 49 +++++++------------------------------------------
> mm/swapfile.c | 5 ++---
> 6 files changed, 30 insertions(+), 52 deletions(-)
>
> diff --git a/include/linux/oom.h b/include/linux/oom.h
> --- a/include/linux/oom.h
> +++ b/include/linux/oom.h
> @@ -29,8 +29,23 @@ enum oom_scan_t {
> OOM_SCAN_SELECT, /* always select this thread first */
> };
>
> -extern void compare_swap_oom_score_adj(short old_val, short new_val);
> -extern short test_set_oom_score_adj(short new_val);
> +/* Thread is the potential origin of an oom condition; kill first on oom */
> +#define OOM_FLAG_ORIGIN ((__force oom_flags_t)0x1)
> +
> +static inline void set_current_oom_origin(void)
> +{
> + current->signal->oom_flags |= OOM_FLAG_ORIGIN;
> +}
> +
> +static inline void clear_current_oom_origin(void)
> +{
> + current->signal->oom_flags &= ~OOM_FLAG_ORIGIN;
> +}
> +
> +static inline bool oom_task_origin(const struct task_struct *p)
> +{
> + return !!(p->signal->oom_flags & OOM_FLAG_ORIGIN);
> +}
>
> extern unsigned long oom_badness(struct task_struct *p,
> struct mem_cgroup *memcg, const nodemask_t *nodemask,
> diff --git a/include/linux/sched.h b/include/linux/sched.h
> --- a/include/linux/sched.h
> +++ b/include/linux/sched.h
> @@ -631,6 +631,7 @@ struct signal_struct {
> struct rw_semaphore group_rwsem;
> #endif
>
> + oom_flags_t oom_flags;
> short oom_score_adj; /* OOM kill score adjustment */
> short oom_score_adj_min; /* OOM kill score adjustment min value.
> * Only settable by CAP_SYS_RESOURCE. */
> diff --git a/include/linux/types.h b/include/linux/types.h
> --- a/include/linux/types.h
> +++ b/include/linux/types.h
> @@ -156,6 +156,7 @@ typedef u32 dma_addr_t;
> #endif
> typedef unsigned __bitwise__ gfp_t;
> typedef unsigned __bitwise__ fmode_t;
> +typedef unsigned __bitwise__ oom_flags_t;
>
> #ifdef CONFIG_PHYS_ADDR_T_64BIT
> typedef u64 phys_addr_t;
> diff --git a/mm/ksm.c b/mm/ksm.c
> --- a/mm/ksm.c
> +++ b/mm/ksm.c
> @@ -1929,12 +1929,9 @@ static ssize_t run_store(struct kobject *kobj, struct kobj_attribute *attr,
> if (ksm_run != flags) {
> ksm_run = flags;
> if (flags & KSM_RUN_UNMERGE) {
> - short oom_score_adj;
> -
> - oom_score_adj = test_set_oom_score_adj(OOM_SCORE_ADJ_MAX);
> + set_current_oom_origin();
> err = unmerge_and_remove_all_rmap_items();
> - compare_swap_oom_score_adj(OOM_SCORE_ADJ_MAX,
> - oom_score_adj);
> + clear_current_oom_origin();
> if (err) {
> ksm_run = KSM_RUN_STOP;
> count = err;
> diff --git a/mm/oom_kill.c b/mm/oom_kill.c
> --- a/mm/oom_kill.c
> +++ b/mm/oom_kill.c
> @@ -44,48 +44,6 @@ int sysctl_oom_kill_allocating_task;
> int sysctl_oom_dump_tasks = 1;
> static DEFINE_SPINLOCK(zone_scan_lock);
>
> -/*
> - * compare_swap_oom_score_adj() - compare and swap current's oom_score_adj
> - * @old_val: old oom_score_adj for compare
> - * @new_val: new oom_score_adj for swap
> - *
> - * Sets the oom_score_adj value for current to @new_val iff its present value is
> - * @old_val. Usually used to reinstate a previous value to prevent racing with
> - * userspacing tuning the value in the interim.
> - */
> -void compare_swap_oom_score_adj(short old_val, short new_val)
> -{
> - struct sighand_struct *sighand = current->sighand;
> -
> - spin_lock_irq(&sighand->siglock);
> - if (current->signal->oom_score_adj == old_val)
> - current->signal->oom_score_adj = new_val;
> - trace_oom_score_adj_update(current);
> - spin_unlock_irq(&sighand->siglock);
> -}
> -
> -/**
> - * test_set_oom_score_adj() - set current's oom_score_adj and return old value
> - * @new_val: new oom_score_adj value
> - *
> - * Sets the oom_score_adj value for current to @new_val with proper
> - * synchronization and returns the old value. Usually used to temporarily
> - * set a value, save the old value in the caller, and then reinstate it later.
> - */
> -short test_set_oom_score_adj(short new_val)
> -{
> - struct sighand_struct *sighand = current->sighand;
> - int old_val;
> -
> - spin_lock_irq(&sighand->siglock);
> - old_val = current->signal->oom_score_adj;
> - current->signal->oom_score_adj = new_val;
> - trace_oom_score_adj_update(current);
> - spin_unlock_irq(&sighand->siglock);
> -
> - return old_val;
> -}
> -
> #ifdef CONFIG_NUMA
> /**
> * has_intersects_mems_allowed() - check task eligiblity for kill
> @@ -310,6 +268,13 @@ enum oom_scan_t oom_scan_process_thread(struct task_struct *task,
> if (!task->mm)
> return OOM_SCAN_CONTINUE;
>
> + /*
> + * If task is allocating a lot of memory and has been marked to be
> + * killed first if it triggers an oom, then select it.
> + */
> + if (oom_task_origin(task))
> + return OOM_SCAN_SELECT;
> +
> if (task->flags & PF_EXITING) {
> /*
> * If task is current and is in the process of releasing memory,
> diff --git a/mm/swapfile.c b/mm/swapfile.c
> --- a/mm/swapfile.c
> +++ b/mm/swapfile.c
> @@ -1484,7 +1484,6 @@ SYSCALL_DEFINE1(swapoff, const char __user *, specialfile)
> struct address_space *mapping;
> struct inode *inode;
> struct filename *pathname;
> - short oom_score_adj;
> int i, type, prev;
> int err;
>
> @@ -1544,9 +1543,9 @@ SYSCALL_DEFINE1(swapoff, const char __user *, specialfile)
> p->flags &= ~SWP_WRITEOK;
> spin_unlock(&swap_lock);
>
> - oom_score_adj = test_set_oom_score_adj(OOM_SCORE_ADJ_MAX);
> + set_current_oom_origin();
> err = try_to_unuse(type, false, 0); /* force all pages to be unused */
> - compare_swap_oom_score_adj(OOM_SCORE_ADJ_MAX, oom_score_adj);
> + clear_current_oom_origin();
>
> if (err) {
> /*

--
Michal Hocko
SUSE Labs
--
To unsubscribe from this list: send the line "unsubscribe linux-kernel" in
the body of a message to majordomo@xxxxxxxxxxxxxxx
More majordomo info at http://vger.kernel.org/majordomo-info.html
Please read the FAQ at http://www.tux.org/lkml/