Re: [RFC][PATCH 2/3] memcg: oom notifier

From: Kirill A. Shutemov
Date: Thu Mar 11 2010 - 09:47:11 EST


On Thu, Mar 11, 2010 at 9:57 AM, KAMEZAWA Hiroyuki
<kamezawa.hiroyu@xxxxxxxxxxxxxx> wrote:
> From: KAMEZAWA Hiroyuki <kamezawa.hiroyu@xxxxxxxxxxxxxx>
>
> Considering containers and other resource management software in userland,
> event notification of OOM in memcg should be implemented.
> memcg already has a "threshold" notifier which uses eventfd, so we can
> make use of it for OOM notification.
>
> This patch adds an OOM notification eventfd callback for memcg. The usage
> is very similar to the threshold notifier, but the control file is
> memory.oom_control and no argument other than the eventfd is required.
>
>        % cgroup_event_notifier /cgroup/A/memory.oom_control dummy
>        (About cgroup_event_notifier, see Documentation/cgroup/)
>
> TODO:
>  - add a knob to disable oom-kill under a memcg.
>  - add read/write function to oom_control
>
> Changelog: 20100309
>  - split from threshold functions. use list rather than array.
>  - moved all to inside of mutex.
> Changelog: 20100304
>  - renewed implementation.
>
> Signed-off-by: KAMEZAWA Hiroyuki <kamezawa.hiroyu@xxxxxxxxxxxxxx>

Looks great! Two remarks below.

Reviewed-by: Kirill A. Shutemov <kirill@xxxxxxxxxxxxx>

> ---
>  Documentation/cgroups/memory.txt |   20 +++++++
>  mm/memcontrol.c                  |  105 ++++++++++++++++++++++++++++++++++++---
>  2 files changed, 116 insertions(+), 9 deletions(-)
>
> Index: mmotm-2.6.34-Mar9/mm/memcontrol.c
> ===================================================================
> --- mmotm-2.6.34-Mar9.orig/mm/memcontrol.c
> +++ mmotm-2.6.34-Mar9/mm/memcontrol.c
> @@ -149,6 +149,7 @@ struct mem_cgroup_threshold {
>         u64 threshold;
>  };
>
> +/* For threshold */
>  struct mem_cgroup_threshold_ary {
>         /* An array index points to threshold just below usage. */
>         atomic_t current_threshold;
> @@ -157,8 +158,14 @@ struct mem_cgroup_threshold_ary {
>         /* Array of thresholds */
>         struct mem_cgroup_threshold entries[0];
>  };
> +/* for OOM */
> +struct mem_cgroup_eventfd_list {
> +       struct list_head list;
> +       struct eventfd_ctx *eventfd;
> +};
>
>  static void mem_cgroup_threshold(struct mem_cgroup *mem);
> +static void mem_cgroup_oom_notify(struct mem_cgroup *mem);
>
>  /*
>   * The memory controller data structure. The memory controller controls both
> @@ -220,6 +227,9 @@ struct mem_cgroup {
>         /* thresholds for mem+swap usage. RCU-protected */
>         struct mem_cgroup_threshold_ary *memsw_thresholds;
>
> +       /* For oom notifier event fd */
> +       struct list_head oom_notify;
> +
>         /*
>          * Should we move charges of a task when a task is moved into this
>          * mem_cgroup ? And what type of charges should we move ?
> @@ -282,9 +292,12 @@ enum charge_type {
>  /* for encoding cft->private value on file */
>  #define _MEM                   (0)
>  #define _MEMSWAP               (1)
> +#define _OOM_TYPE              (2)
>  #define MEMFILE_PRIVATE(x, val)        (((x) << 16) | (val))
>  #define MEMFILE_TYPE(val)      (((val) >> 16) & 0xffff)
>  #define MEMFILE_ATTR(val)      ((val) & 0xffff)
> +/* Used for OOM notifier */
> +#define OOM_CONTROL            (0)
>
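Side note for readers following the cft->private encoding above: the new
oom_control file packs its type into the upper 16 bits of the private
value. A quick standalone illustration (values copied from this hunk):

    #define _OOM_TYPE               (2)
    #define OOM_CONTROL             (0)
    #define MEMFILE_PRIVATE(x, val) (((x) << 16) | (val))
    #define MEMFILE_TYPE(val)       (((val) >> 16) & 0xffff)
    #define MEMFILE_ATTR(val)       ((val) & 0xffff)

    /* memory.oom_control's private value is
     * MEMFILE_PRIVATE(_OOM_TYPE, OOM_CONTROL) == (2 << 16) | 0 == 0x20000,
     * so MEMFILE_TYPE(0x20000) == 2 (_OOM_TYPE, checked by the BUG_ON()s
     * in the register/unregister handlers below) and
     * MEMFILE_ATTR(0x20000) == 0 (OOM_CONTROL). */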
>  /*
>   * Reclaim flags for mem_cgroup_hierarchical_reclaim
> @@ -1351,6 +1364,8 @@ bool mem_cgroup_handle_oom(struct mem_cg
>          */
>         if (!locked)
>                 prepare_to_wait(&memcg_oom_waitq, &owait.wait, TASK_KILLABLE);
> +       else
> +               mem_cgroup_oom_notify(mem);
>         mutex_unlock(&memcg_oom_mutex);
>
>         if (locked)
> @@ -3398,8 +3413,22 @@ static int compare_thresholds(const void
>         return _a->threshold - _b->threshold;
>  }
>
> -static int mem_cgroup_register_event(struct cgroup *cgrp, struct cftype *cft,
> -               struct eventfd_ctx *eventfd, const char *args)
> +static int mem_cgroup_oom_notify_cb(struct mem_cgroup *mem, void *data)
> +{
> +       struct mem_cgroup_eventfd_list *ev;
> +
> +       list_for_each_entry(ev, &mem->oom_notify, list)
> +               eventfd_signal(ev->eventfd, 1);
> +       return 0;
> +}
> +
> +static void mem_cgroup_oom_notify(struct mem_cgroup *mem)
> +{
> +       mem_cgroup_walk_tree(mem, NULL, mem_cgroup_oom_notify_cb);
> +}
> +
> +static int mem_cgroup_usage_register_event(struct cgroup *cgrp,
> +       struct cftype *cft, struct eventfd_ctx *eventfd, const char *args)
>  {
>         struct mem_cgroup *memcg = mem_cgroup_from_cont(cgrp);
>         struct mem_cgroup_threshold_ary *thresholds, *thresholds_new;
> @@ -3483,8 +3512,8 @@ unlock:
>         return ret;
>  }
>
> -static int mem_cgroup_unregister_event(struct cgroup *cgrp, struct cftype *cft,
> -               struct eventfd_ctx *eventfd)
> +static int mem_cgroup_usage_unregister_event(struct cgroup *cgrp,
> +       struct cftype *cft, struct eventfd_ctx *eventfd)
>  {
>         struct mem_cgroup *memcg = mem_cgroup_from_cont(cgrp);
>         struct mem_cgroup_threshold_ary *thresholds, *thresholds_new;
> @@ -3568,13 +3597,66 @@ unlock:
>         return ret;
>  }
>
> +static int mem_cgroup_oom_register_event(struct cgroup *cgrp,
> +       struct cftype *cft, struct eventfd_ctx *eventfd, const char *args)
> +{
> +       struct mem_cgroup *memcg = mem_cgroup_from_cont(cgrp);
> +       struct mem_cgroup_eventfd_list *event;
> +       int type = MEMFILE_TYPE(cft->private);
> +       int ret = -ENOMEM;
> +
> +       BUG_ON(type != _OOM_TYPE);
> +
> +       mutex_lock(&memcg_oom_mutex);
> +
> +       /* Allocate memory for new array of thresholds */

Irrelevant comment, copied over from the threshold code?

> +       event = kmalloc(sizeof(*event), GFP_KERNEL);
> +       if (!event)
> +               goto unlock;
> +       /* Add new threshold */

Ditto.

> +       event->eventfd = eventfd;
> +       list_add(&event->list, &memcg->oom_notify);
> +
> +       /* already in OOM ? */
> +       if (atomic_read(&memcg->oom_lock))
> +               eventfd_signal(eventfd, 1);
> +       ret = 0;
> +unlock:
> +       mutex_unlock(&memcg_oom_mutex);
> +
> +       return ret;
> +}
> +
> +static int mem_cgroup_oom_unregister_event(struct cgroup *cgrp,
> +       struct cftype *cft, struct eventfd_ctx *eventfd)
> +{
> +       struct mem_cgroup *mem = mem_cgroup_from_cont(cgrp);
> +       struct mem_cgroup_eventfd_list *ev, *tmp;
> +       int type = MEMFILE_TYPE(cft->private);
> +
> +       BUG_ON(type != _OOM_TYPE);
> +
> +       mutex_lock(&memcg_oom_mutex);
> +
> +       list_for_each_entry_safe(ev, tmp, &mem->oom_notify, list) {
> +               if (ev->eventfd == eventfd) {
> +                       list_del(&ev->list);
> +                       kfree(ev);
> +               }
> +       }
> +
> +       mutex_unlock(&memcg_oom_mutex);
> +
> +       return 0;
> +}
> +
>  static struct cftype mem_cgroup_files[] = {
>         {
>                 .name = "usage_in_bytes",
>                 .private = MEMFILE_PRIVATE(_MEM, RES_USAGE),
>                 .read_u64 = mem_cgroup_read,
> -               .register_event = mem_cgroup_register_event,
> -               .unregister_event = mem_cgroup_unregister_event,
> +               .register_event = mem_cgroup_usage_register_event,
> +               .unregister_event = mem_cgroup_usage_unregister_event,
>         },
>         {
>                 .name = "max_usage_in_bytes",
> @@ -3623,6 +3705,12 @@ static struct cftype mem_cgroup_files[
>                 .read_u64 = mem_cgroup_move_charge_read,
>                 .write_u64 = mem_cgroup_move_charge_write,
>         },
> +       {
> +               .name = "oom_control",
> +               .register_event = mem_cgroup_oom_register_event,
> +               .unregister_event = mem_cgroup_oom_unregister_event,
> +               .private = MEMFILE_PRIVATE(_OOM_TYPE, OOM_CONTROL),
> +       },
>  };
>
>  #ifdef CONFIG_CGROUP_MEM_RES_CTLR_SWAP
> @@ -3631,8 +3719,8 @@ static struct cftype memsw_cgroup_files[
>                 .name = "memsw.usage_in_bytes",
>                 .private = MEMFILE_PRIVATE(_MEMSWAP, RES_USAGE),
>                 .read_u64 = mem_cgroup_read,
> -               .register_event = mem_cgroup_register_event,
> -               .unregister_event = mem_cgroup_unregister_event,
> +               .register_event = mem_cgroup_usage_register_event,
> +               .unregister_event = mem_cgroup_usage_unregister_event,
>         },
>         {
>                 .name = "memsw.max_usage_in_bytes",
> @@ -3876,6 +3964,7 @@ mem_cgroup_create(struct cgroup_subsys *
>         }
>         mem->last_scanned_child = 0;
>         spin_lock_init(&mem->reclaim_param_lock);
> +       INIT_LIST_HEAD(&mem->oom_notify);
>
>         if (parent)
>                 mem->swappiness = get_swappiness(parent);
> Index: mmotm-2.6.34-Mar9/Documentation/cgroups/memory.txt
> ===================================================================
> --- mmotm-2.6.34-Mar9.orig/Documentation/cgroups/memory.txt
> +++ mmotm-2.6.34-Mar9/Documentation/cgroups/memory.txt
> @@ -184,6 +184,9 @@ limits on the root cgroup.
>
>  Note2: When panic_on_oom is set to "2", the whole system will panic.
>
> +When an OOM event notifier is registered, an event will be delivered on OOM.
> +(See the oom_control section.)
> +
>  2. Locking
>
>  The memory controller uses the following hierarchy
> @@ -488,7 +491,22 @@ threshold in any direction.
>
>  It's applicable for root and non-root cgroup.
>
> -10. TODO
> +10. OOM Control
> +
> +The memory controller implements an OOM notifier using the cgroup
> +notification API (see cgroups.txt). It allows registering multiple OOM
> +notification deliveries and getting a notification when an OOM happens.
> +
> +To register a notifier, an application needs to:
> + - create an eventfd using eventfd(2)
> + - open the memory.oom_control file
> + - write a string like "<event_fd> <fd of memory.oom_control>" to
> +   cgroup.event_control
> +
> +The application will be notified through the eventfd when an OOM happens.
> +OOM notification doesn't work for the root cgroup.
> +
> +
> +11. TODO
>
>  1. Add support for accounting huge pages (as a separate controller)
>  2. Make per-cgroup scanner reclaim not-shared pages first
>
>
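By the way, for anyone who wants to try this without the
cgroup_event_notifier helper, here is a minimal userspace sketch of the
registration sequence the documentation hunk describes. Untested against
this patch; the cgroup path /cgroup/A and the abbreviated error handling
are mine:

    /* Register for memcg OOM notification via eventfd, then block. */
    #include <sys/eventfd.h>
    #include <fcntl.h>
    #include <stdint.h>
    #include <stdio.h>
    #include <string.h>
    #include <unistd.h>

    int main(void)
    {
            char buf[64];
            uint64_t cnt;
            int efd = eventfd(0, 0);        /* step 1: eventfd(2) */
            int oom = open("/cgroup/A/memory.oom_control", O_RDONLY);
            int ctl = open("/cgroup/A/cgroup.event_control", O_WRONLY);

            if (efd < 0 || oom < 0 || ctl < 0)
                    return 1;

            /* step 3: "<event_fd> <fd of memory.oom_control>" */
            snprintf(buf, sizeof(buf), "%d %d", efd, oom);
            if (write(ctl, buf, strlen(buf)) < 0)
                    return 1;

            /* read(2) blocks until the kernel signals an OOM in /cgroup/A */
            if (read(efd, &cnt, sizeof(cnt)) == sizeof(cnt))
                    printf("OOM event received (count=%llu)\n",
                           (unsigned long long)cnt);
            return 0;
    }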