Re: [patch -mm v2] mm: introduce oom_adj_child

From: David Rientjes
Date: Fri Jul 31 2009 - 15:38:41 EST


On Fri, 31 Jul 2009, KOSAKI Motohiro wrote:

> diff --git a/fs/proc/base.c b/fs/proc/base.c
> index 3ce5ae9..c64499e 100644
> --- a/fs/proc/base.c
> +++ b/fs/proc/base.c
> @@ -1008,7 +1008,7 @@ static ssize_t oom_adjust_read(struct file *file, char __user *buf,
> return -ESRCH;
> task_lock(task);
> if (task->mm)
> - oom_adjust = task->mm->oom_adj;
> + oom_adjust = task->signal->oom_adj;
> else
> oom_adjust = OOM_DISABLE;
> task_unlock(task);

This may display a /proc/pid/oom_adj that is radically different from
task->mm->oom_adj_cached without userspace knowing about it, and you can't
simply display task->mm->oom_adj_cached here because it gets reset on every
write to /proc/pid/oom_adj.

> @@ -1046,12 +1046,13 @@ static ssize_t oom_adjust_write(struct file *file, const char __user *buf,
> put_task_struct(task);
> return -EINVAL;
> }
> - if (oom_adjust < task->mm->oom_adj && !capable(CAP_SYS_RESOURCE)) {
> + if (oom_adjust < task->signal->oom_adj && !capable(CAP_SYS_RESOURCE)) {
> task_unlock(task);
> put_task_struct(task);
> return -EACCES;
> }
> - task->mm->oom_adj = oom_adjust;
> + task->signal->oom_adj = oom_adjust;
> + task->mm->oom_adj_cached = OOM_CACHE_DEFAULT;
> task_unlock(task);
> put_task_struct(task);
> if (end - buffer == 0)
> diff --git a/include/linux/mm_types.h b/include/linux/mm_types.h
> index 7acc843..f93f97f 100644
> --- a/include/linux/mm_types.h
> +++ b/include/linux/mm_types.h
> @@ -240,7 +240,8 @@ struct mm_struct {
>
> unsigned long saved_auxv[AT_VECTOR_SIZE]; /* for /proc/PID/auxv */
>
> - s8 oom_adj; /* OOM kill score adjustment (bit shift) */
> + s8 oom_adj_cached; /* mirror from signal_struct->oom_adj.
> + in vfork case, multiple processes use the same mm. */
>
> cpumask_t cpu_vm_mask;
>
> diff --git a/include/linux/oom.h b/include/linux/oom.h
> index a7979ba..a219480 100644
> --- a/include/linux/oom.h
> +++ b/include/linux/oom.h
> @@ -3,6 +3,7 @@
>
> /* /proc/<pid>/oom_adj set to -17 protects from the oom-killer */
> #define OOM_DISABLE (-17)
> +#define OOM_CACHE_DEFAULT (15)
> /* inclusive */
> #define OOM_ADJUST_MIN (-16)
> #define OOM_ADJUST_MAX 15
> diff --git a/include/linux/sched.h b/include/linux/sched.h
> index 3ab08e4..e10b12b 100644
> --- a/include/linux/sched.h
> +++ b/include/linux/sched.h
> @@ -629,6 +629,8 @@ struct signal_struct {
> unsigned audit_tty;
> struct tty_audit_buf *tty_audit_buf;
> #endif
> +
> + s8 oom_adj; /* OOM kill score adjustment (bit shift) */
> };
>
> /* Context switch must be unlocked if interrupts are to be enabled */

I don't believe oom_adj is an appropriate use of signal_struct, sorry.

> diff --git a/kernel/exit.c b/kernel/exit.c
> index 869dc22..c741a45 100644
> --- a/kernel/exit.c
> +++ b/kernel/exit.c
> @@ -688,6 +689,7 @@ static void exit_mm(struct task_struct * tsk)
> enter_lazy_tlb(mm, current);
> /* We don't want this task to be frozen prematurely */
> clear_freeze_flag(tsk);
> + mm->oom_adj_cached = OOM_CACHE_DEFAULT;
> task_unlock(tsk);
> mm_update_next_owner(mm);
> mmput(mm);

This is similar to an early proposal that wanted to keep an array of
oom_adj values for tasks attached to the mm in mm_struct. The problem is
that you're obviously losing information about all threads attached to the
mm any time one of the threads exits or writes to /proc/pid/oom_adj. That
information can only be regenerated with a tasklist scan.

> diff --git a/kernel/fork.c b/kernel/fork.c
> index 9b42695..b7cb474 100644
> --- a/kernel/fork.c
> +++ b/kernel/fork.c
> @@ -426,6 +427,7 @@ static struct mm_struct * mm_init(struct mm_struct * mm, struct task_struct *p)
> init_rwsem(&mm->mmap_sem);
> INIT_LIST_HEAD(&mm->mmlist);
> mm->flags = (current->mm) ? current->mm->flags : default_dump_filter;
> + mm->oom_adj_cached = OOM_CACHE_DEFAULT;
> mm->core_state = NULL;
> mm->nr_ptes = 0;
> set_mm_counter(mm, file_rss, 0);
> diff --git a/mm/oom_kill.c b/mm/oom_kill.c
> index 175a67a..eae2d78 100644
> --- a/mm/oom_kill.c
> +++ b/mm/oom_kill.c
> @@ -58,7 +58,7 @@ unsigned long badness(struct task_struct *p, unsigned long uptime)
> unsigned long points, cpu_time, run_time;
> struct mm_struct *mm;
> struct task_struct *child;
> - int oom_adj;
> + s8 oom_adj;
>
> task_lock(p);
> mm = p->mm;
> @@ -66,7 +66,10 @@ unsigned long badness(struct task_struct *p, unsigned long uptime)
> task_unlock(p);
> return 0;
> }
> - oom_adj = mm->oom_adj;
> +
> + if (mm->oom_adj_cached < p->signal->oom_adj)
> + mm->oom_adj_cached = p->signal->oom_adj;

This conditional will never be true since mm->oom_adj_cached is
initialized to 15, which is the largest value that p->signal->oom_adj
may ever hold, so mm->oom_adj_cached never gets changed from
OOM_CACHE_DEFAULT.

Thus, this patch doesn't even work, and you probably would have noticed
that if you'd checked /proc/pid/oom_score for any pid.

Even if mm->oom_adj_cached _was_ properly updated here,
/proc/pid/oom_score would be out of sync with more negative oom_adj values
for threads sharing the mm_struct since it calls badness() for only a
single thread.

> + oom_adj = mm->oom_adj_cached;
> if (oom_adj == OOM_DISABLE) {
> task_unlock(p);
> return 0;
> @@ -350,7 +354,7 @@ static int oom_kill_task(struct task_struct *p)
>
> task_lock(p);
> mm = p->mm;
> - if (!mm || mm->oom_adj == OOM_DISABLE) {
> + if (!mm || p->signal->oom_adj == OOM_DISABLE) {
> task_unlock(p);
> return 1;
> }
--
To unsubscribe from this list: send the line "unsubscribe linux-kernel" in
the body of a message to majordomo@xxxxxxxxxxxxxxx
More majordomo info at http://vger.kernel.org/majordomo-info.html
Please read the FAQ at http://www.tux.org/lkml/