Re: [PATCH v4 4/4] memcg: implement memory thresholds

From: Kirill A. Shutemov
Date: Sun Dec 27 2009 - 22:23:57 EST


On Mon, Dec 28, 2009 at 4:43 AM, KAMEZAWA Hiroyuki
<kamezawa.hiroyu@xxxxxxxxxxxxxx> wrote:
> On Sun, 27 Dec 2009 04:09:02 +0200
> "Kirill A. Shutemov" <kirill@xxxxxxxxxxxxx> wrote:
>
>> It allows to register multiple memory and memsw thresholds and gets
>> notifications when it crosses.
>>
>> To register a threshold application need:
>> - create an eventfd;
>> - open memory.usage_in_bytes or memory.memsw.usage_in_bytes;
>> - write string like "<event_fd> <memory.usage_in_bytes> <threshold>" to
>>   cgroup.event_control.
>>
>> Application will be notified through eventfd when memory usage crosses
>> threshold in any direction.
>>
>> It's applicable for root and non-root cgroup.
>>
>> It uses stats to track memory usage, simmilar to soft limits. It checks
>> if we need to send event to userspace on every 100 page in/out. I guess
>> it's good compromise between performance and accuracy of thresholds.
>>
>> Signed-off-by: Kirill A. Shutemov <kirill@xxxxxxxxxxxxx>
>> ---
>>  Documentation/cgroups/memory.txt |   19 +++-
>>  mm/memcontrol.c                  |  275 ++++++++++++++++++++++++++++++++++++++
>>  2 files changed, 293 insertions(+), 1 deletions(-)
>>
>> diff --git a/Documentation/cgroups/memory.txt b/Documentation/cgroups/memory.txt
>> index b871f25..195af07 100644
>> --- a/Documentation/cgroups/memory.txt
>> +++ b/Documentation/cgroups/memory.txt
>> @@ -414,7 +414,24 @@ NOTE1: Soft limits take effect over a long period of time, since they involve
>> ÂNOTE2: It is recommended to set the soft limit always below the hard limit,
>> Â Â Â Â otherwise the hard limit will take precedence.
>>
>> -8. TODO
>> +8. Memory thresholds
>> +
>> +Memory controler implements memory thresholds using cgroups notification
>> +API (see cgroups.txt). It allows to register multiple memory and memsw
>> +thresholds and gets notifications when it crosses.
>> +
>> +To register a threshold application need:
>> + - create an eventfd using eventfd(2);
>> + - open memory.usage_in_bytes or memory.memsw.usage_in_bytes;
>> + - write string like "<event_fd> <memory.usage_in_bytes> <threshold>" to
>> +   cgroup.event_control.
>> +
>> +Application will be notified through eventfd when memory usage crosses
>> +threshold in any direction.
>> +
>> +It's applicable for root and non-root cgroup.
>> +
>> +9. TODO
>>
>> Â1. Add support for accounting huge pages (as a separate controller)
>> Â2. Make per-cgroup scanner reclaim not-shared pages first
>> diff --git a/mm/memcontrol.c b/mm/memcontrol.c
>> index 36eb7af..3a0a6a1 100644
>> --- a/mm/memcontrol.c
>> +++ b/mm/memcontrol.c
>> @@ -6,6 +6,10 @@
>> Â * Copyright 2007 OpenVZ SWsoft Inc
>> Â * Author: Pavel Emelianov <xemul@xxxxxxxxxx>
>> Â *
>> + * Memory thresholds
>> + * Copyright (C) 2009 Nokia Corporation
>> + * Author: Kirill A. Shutemov
>> + *
>> Â * This program is free software; you can redistribute it and/or modify
>> Â * it under the terms of the GNU General Public License as published by
>> Â * the Free Software Foundation; either version 2 of the License, or
>> @@ -39,6 +43,8 @@
>> Â#include <linux/mm_inline.h>
>> Â#include <linux/page_cgroup.h>
>> Â#include <linux/cpu.h>
>> +#include <linux/eventfd.h>
>> +#include <linux/sort.h>
>> Â#include "internal.h"
>>
>> Â#include <asm/uaccess.h>
>> @@ -56,6 +62,7 @@ static int really_do_swap_account __initdata = 1; /* for remember boot option*/
>> Â#endif
>>
>> Â#define SOFTLIMIT_EVENTS_THRESH (1000)
>> +#define THRESHOLDS_EVENTS_THRESH (100)
>>
>> Â/*
>> Â * Statistics for memory cgroup.
>> @@ -72,6 +79,8 @@ enum mem_cgroup_stat_index {
>> Â Â Â MEM_CGROUP_STAT_SWAPOUT, /* # of pages, swapped out */
>> Â Â Â MEM_CGROUP_STAT_SOFTLIMIT, /* decrements on each page in/out.
>> Â Â Â Â Â Â Â Â Â Â Â Â Â Â Â Â Â Â Â used by soft limit implementation */
>> + Â Â MEM_CGROUP_STAT_THRESHOLDS, /* decrements on each page in/out.
>> + Â Â Â Â Â Â Â Â Â Â Â Â Â Â Â Â Â Â used by threshold implementation */
>>
>> Â Â Â MEM_CGROUP_STAT_NSTATS,
>> Â};
>> @@ -182,6 +191,20 @@ struct mem_cgroup_tree {
>>
>> Âstatic struct mem_cgroup_tree soft_limit_tree __read_mostly;
>>
>> +struct mem_cgroup_threshold {
>> + Â Â struct eventfd_ctx *eventfd;
>> + Â Â u64 threshold;
>> +};
>> +
>> +struct mem_cgroup_threshold_ary {
>> + Â Â unsigned int size;
>> + Â Â atomic_t cur;
>> + Â Â struct mem_cgroup_threshold entries[0];
>> +};
>> +
> Why "array" is a choice here ? IOW, why not list ?

We need to be able to walk the thresholds in both directions for this to
be fast. AFAIK, that is impossible with an RCU-protected list.

> How many waiters are expected as usual workload ?

The array of thresholds is read once per 100 page in/out events on each CPU.
Write access happens only when registering a new threshold.

>> +static bool mem_cgroup_threshold_check(struct mem_cgroup* mem);
>> +static void mem_cgroup_threshold(struct mem_cgroup* mem);
>> +
>> Â/*
>> Â * The memory controller data structure. The memory controller controls both
>> Â * page cache and RSS per cgroup. We would eventually like to provide
>> @@ -233,6 +256,15 @@ struct mem_cgroup {
>> Â Â Â /* set when res.limit == memsw.limit */
>>    bool      Âmemsw_is_minimum;
>>
>> + Â Â /* protect arrays of thresholds */
>> + Â Â struct mutex thresholds_lock;
>> +
>> + Â Â /* thresholds for memory usage. RCU-protected */
>> + Â Â struct mem_cgroup_threshold_ary *thresholds;
>> +
>> + Â Â /* thresholds for mem+swap usage. RCU-protected */
>> + Â Â struct mem_cgroup_threshold_ary *memsw_thresholds;
>> +
>> Â Â Â /*
>> Â Â Â Â* statistics. This must be placed at the end of memcg.
>> Â Â Â Â*/
>> @@ -525,6 +557,8 @@ static void mem_cgroup_charge_statistics(struct mem_cgroup *mem,
>> Â Â Â Â Â Â Â __mem_cgroup_stat_add_safe(cpustat,
>> Â Â Â Â Â Â Â Â Â Â Â Â Â Â Â MEM_CGROUP_STAT_PGPGOUT_COUNT, 1);
>> Â Â Â __mem_cgroup_stat_add_safe(cpustat, MEM_CGROUP_STAT_SOFTLIMIT, -1);
>> + Â Â __mem_cgroup_stat_add_safe(cpustat, MEM_CGROUP_STAT_THRESHOLDS, -1);
>> +
>> Â Â Â put_cpu();
>> Â}
>>
>> @@ -1510,6 +1544,8 @@ charged:
>> Â Â Â if (mem_cgroup_soft_limit_check(mem))
>> Â Â Â Â Â Â Â mem_cgroup_update_tree(mem, page);
>> Âdone:
>> + Â Â if (mem_cgroup_threshold_check(mem))
>> + Â Â Â Â Â Â mem_cgroup_threshold(mem);
>> Â Â Â return 0;
>> Ânomem:
>> Â Â Â css_put(&mem->css);
>> @@ -2075,6 +2111,8 @@ __mem_cgroup_uncharge_common(struct page *page, enum charge_type ctype)
>>
>> Â Â Â if (mem_cgroup_soft_limit_check(mem))
>> Â Â Â Â Â Â Â mem_cgroup_update_tree(mem, page);
>> + Â Â if (mem_cgroup_threshold_check(mem))
>> + Â Â Â Â Â Â mem_cgroup_threshold(mem);
>> Â Â Â /* at swapout, this memcg will be accessed to record to swap */
>> Â Â Â if (ctype != MEM_CGROUP_CHARGE_TYPE_SWAPOUT)
>> Â Â Â Â Â Â Â css_put(&mem->css);
>> @@ -3071,12 +3109,246 @@ static int mem_cgroup_swappiness_write(struct cgroup *cgrp, struct cftype *cft,
>> Â Â Â return 0;
>> Â}
>>
>> +static bool mem_cgroup_threshold_check(struct mem_cgroup *mem)
>> +{
>> + Â Â bool ret = false;
>> + Â Â int cpu;
>> + Â Â s64 val;
>> + Â Â struct mem_cgroup_stat_cpu *cpustat;
>> +
>> + Â Â cpu = get_cpu();
>> + Â Â cpustat = &mem->stat.cpustat[cpu];
>> + Â Â val = __mem_cgroup_stat_read_local(cpustat, MEM_CGROUP_STAT_THRESHOLDS);
>> + Â Â if (unlikely(val < 0)) {
>> + Â Â Â Â Â Â __mem_cgroup_stat_set(cpustat, MEM_CGROUP_STAT_THRESHOLDS,
>> + Â Â Â Â Â Â Â Â Â Â Â Â Â Â THRESHOLDS_EVENTS_THRESH);
>> + Â Â Â Â Â Â ret = true;
>> + Â Â }
>> + Â Â put_cpu();
>> + Â Â return ret;
>> +}
>> +
>> +static void __mem_cgroup_threshold(struct mem_cgroup *memcg, bool swap)
>> +{
>> + Â Â struct mem_cgroup_threshold_ary *thresholds;
>> + Â Â u64 usage = mem_cgroup_usage(memcg, swap);
>> + Â Â int i, cur;
>> +
>> + Â Â rcu_read_lock();
>> + Â Â if (!swap) {
>> + Â Â Â Â Â Â thresholds = rcu_dereference(memcg->thresholds);
>> + Â Â } else {
>> + Â Â Â Â Â Â thresholds = rcu_dereference(memcg->memsw_thresholds);
>> + Â Â }
>> +
>> + Â Â if (!thresholds)
>> + Â Â Â Â Â Â goto unlock;
>> +
>> + Â Â cur = atomic_read(&thresholds->cur);
>> +
>> + Â Â /* Check if a threshold crossed in any direction */
>> +
>> + Â Â for(i = cur; i >= 0 &&
>> + Â Â Â Â Â Â unlikely(thresholds->entries[i].threshold > usage); i--) {
>> + Â Â Â Â Â Â atomic_dec(&thresholds->cur);
>> + Â Â Â Â Â Â eventfd_signal(thresholds->entries[i].eventfd, 1);
>> + Â Â }
>> +
>> + Â Â for(i = cur + 1; i < thresholds->size &&
>> + Â Â Â Â Â Â unlikely(thresholds->entries[i].threshold <= usage); i++) {
>> + Â Â Â Â Â Â atomic_inc(&thresholds->cur);
>> + Â Â Â Â Â Â eventfd_signal(thresholds->entries[i].eventfd, 1);
>> + Â Â }
>> +unlock:
>> + Â Â rcu_read_unlock();
>> +}
>> +
>> +static void mem_cgroup_threshold(struct mem_cgroup *memcg)
>> +{
>> + Â Â __mem_cgroup_threshold(memcg, false);
>> + Â Â if (do_swap_account)
>> + Â Â Â Â Â Â __mem_cgroup_threshold(memcg, true);
>> +}
>> +
>> +static int compare_thresholds(const void *a, const void *b)
>> +{
>> + Â Â const struct mem_cgroup_threshold *_a = a;
>> + Â Â const struct mem_cgroup_threshold *_b = b;
>> +
>> + Â Â return _a->threshold - _b->threshold;
>> +}
>> +
>> +static int mem_cgroup_register_event(struct cgroup *cgrp, struct cftype *cft,
>> + Â Â Â Â Â Â struct eventfd_ctx *eventfd, const char *args)
>> +{
>> + Â Â struct mem_cgroup *memcg = mem_cgroup_from_cont(cgrp);
>> + Â Â struct mem_cgroup_threshold_ary *thresholds, *thresholds_new;
>> + Â Â int type = MEMFILE_TYPE(cft->private);
>> + Â Â u64 threshold, usage;
>> + Â Â int size;
>> + Â Â int i, ret;
>> +
>> + Â Â ret = res_counter_memparse_write_strategy(args, &threshold);
>> + Â Â if (ret)
>> + Â Â Â Â Â Â return ret;
>> +
>> + Â Â mutex_lock(&memcg->thresholds_lock);
>> + Â Â if (type == _MEM)
>> + Â Â Â Â Â Â thresholds = memcg->thresholds;
>> + Â Â else if (type == _MEMSWAP)
>> + Â Â Â Â Â Â thresholds = memcg->memsw_thresholds;
>> + Â Â else
>> + Â Â Â Â Â Â BUG();
>> +
>> + Â Â usage = mem_cgroup_usage(memcg, type == _MEMSWAP);
>> +
>> + Â Â /* Check if a threshold crossed before adding a new one */
>> + Â Â if (thresholds)
>> + Â Â Â Â Â Â __mem_cgroup_threshold(memcg, type == _MEMSWAP);
>> +
>> + Â Â if (thresholds)
>> + Â Â Â Â Â Â size = thresholds->size + 1;
>> + Â Â else
>> + Â Â Â Â Â Â size = 1;
>> +
>> + Â Â /* Allocate memory for new array of thresholds */
>> + Â Â thresholds_new = kmalloc(sizeof(*thresholds_new) +
>> + Â Â Â Â Â Â Â Â Â Â size * sizeof(struct mem_cgroup_threshold),
>> + Â Â Â Â Â Â Â Â Â Â GFP_KERNEL);
>> + Â Â if (!thresholds_new) {
>> + Â Â Â Â Â Â ret = -ENOMEM;
>> + Â Â Â Â Â Â goto unlock;
>> + Â Â }
>> + Â Â thresholds_new->size = size;
>> +
>> + Â Â /* Copy thresholds (if any) to new array */
>> + Â Â if (thresholds)
>> + Â Â Â Â Â Â memcpy(thresholds_new->entries, thresholds->entries,
>> + Â Â Â Â Â Â Â Â Â Â Â Â Â Â thresholds->size *
>> + Â Â Â Â Â Â Â Â Â Â Â Â Â Â sizeof(struct mem_cgroup_threshold));
>> + Â Â /* Add new threshold */
>> + Â Â thresholds_new->entries[size - 1].eventfd = eventfd;
>> + Â Â thresholds_new->entries[size - 1].threshold = threshold;
>> +
>> + Â Â /* Sort thresholds. Registering of new threshold isn't time-critical */
>> + Â Â sort(thresholds_new->entries, size,
>> + Â Â Â Â Â Â Â Â Â Â sizeof(struct mem_cgroup_threshold),
>> + Â Â Â Â Â Â Â Â Â Â compare_thresholds, NULL);
>> +
>> + Â Â /* Find current threshold */
>> + Â Â atomic_set(&thresholds_new->cur, -1);
>> + Â Â for(i = 0; i < size; i++) {
>> + Â Â Â Â Â Â if (thresholds_new->entries[i].threshold < usage)
>> + Â Â Â Â Â Â Â Â Â Â atomic_inc(&thresholds_new->cur);
>> + Â Â }
>> +
>> + Â Â /*
>> + Â Â Â* We need to increment refcnt to be sure that all thresholds
>> + Â Â Â* will be unregistered before calling __mem_cgroup_free()
>> + Â Â Â*/
>> + Â Â mem_cgroup_get(memcg);
>> +
>> + Â Â if (type == _MEM)
>> + Â Â Â Â Â Â rcu_assign_pointer(memcg->thresholds, thresholds_new);
>> + Â Â else
>> + Â Â Â Â Â Â rcu_assign_pointer(memcg->memsw_thresholds, thresholds_new);
>> +
>> + Â Â synchronize_rcu();
>
> Could you add explanation when you use synchronize_rcu() ?

synchronize_rcu() is used before freeing the old array of thresholds, to be
sure that nobody is still using it.

>> + Â Â kfree(thresholds);
>
> Can't this be freed by RCU instead of synchronize_rcu() ?

Yes, it can. But I don't think that (un)registering of thresholds is
time critical.
I think my variant is cleaner.

>> +unlock:
>> + Â Â mutex_unlock(&memcg->thresholds_lock);
>> +
>> + Â Â return ret;
>> +}
>> +
>> +static int mem_cgroup_unregister_event(struct cgroup *cgrp, struct cftype *cft,
>> + Â Â Â Â Â Â struct eventfd_ctx *eventfd)
>> +{
>> + Â Â struct mem_cgroup *memcg = mem_cgroup_from_cont(cgrp);
>> + Â Â struct mem_cgroup_threshold_ary *thresholds, *thresholds_new;
>> + Â Â int type = MEMFILE_TYPE(cft->private);
>> + Â Â u64 usage;
>> + Â Â int size = 0;
>> + Â Â int i, j, ret;
>> +
>> + Â Â mutex_lock(&memcg->thresholds_lock);
>> + Â Â if (type == _MEM)
>> + Â Â Â Â Â Â thresholds = memcg->thresholds;
>> + Â Â else if (type == _MEMSWAP)
>> + Â Â Â Â Â Â thresholds = memcg->memsw_thresholds;
>> + Â Â else
>> + Â Â Â Â Â Â BUG();
>> +
>> + Â Â /*
>> + Â Â Â* Something went wrong if we trying to unregister a threshold
>> + Â Â Â* if we don't have thresholds
>> + Â Â Â*/
>> + Â Â BUG_ON(!thresholds);
>> +
>> + Â Â usage = mem_cgroup_usage(memcg, type == _MEMSWAP);
>> +
>> + Â Â /* Check if a threshold crossed before removing */
>> + Â Â __mem_cgroup_threshold(memcg, type == _MEMSWAP);
>> +
>> + Â Â /* Calculate new number of threshold */
>> + Â Â for(i = 0; i < thresholds->size; i++) {
>> + Â Â Â Â Â Â if (thresholds->entries[i].eventfd != eventfd)
>> + Â Â Â Â Â Â Â Â Â Â size++;
>> + Â Â }
>> +
>> + Â Â /* Set thresholds array to NULL if we don't have thresholds */
>> + Â Â if (!size) {
>> + Â Â Â Â Â Â thresholds_new = NULL;
>> + Â Â Â Â Â Â goto assign;
>> + Â Â }
>> +
>> + Â Â /* Allocate memory for new array of thresholds */
>> + Â Â thresholds_new = kmalloc(sizeof(*thresholds_new) +
>> + Â Â Â Â Â Â Â Â Â Â size * sizeof(struct mem_cgroup_threshold),
>> + Â Â Â Â Â Â Â Â Â Â GFP_KERNEL);
>> + Â Â if (!thresholds_new) {
>> + Â Â Â Â Â Â ret = -ENOMEM;
>> + Â Â Â Â Â Â goto unlock;
>> + Â Â }
>> + Â Â thresholds_new->size = size;
>> +
>> + Â Â /* Copy thresholds and find current threshold */
>> + Â Â atomic_set(&thresholds_new->cur, -1);
>> + Â Â for(i = 0, j = 0; i < thresholds->size; i++) {
>> + Â Â Â Â Â Â if (thresholds->entries[i].eventfd == eventfd)
>> + Â Â Â Â Â Â Â Â Â Â continue;
>> +
>> + Â Â Â Â Â Â thresholds_new->entries[j] = thresholds->entries[i];
>> + Â Â Â Â Â Â if (thresholds_new->entries[j].threshold < usage)
>> + Â Â Â Â Â Â Â Â Â Â atomic_inc(&thresholds_new->cur);
> It's better to do atomic set after loop.

We would need one more counter to do this. Would you prefer that?

>> + Â Â Â Â Â Â j++;
>> + Â Â }
>
> Hmm..is this "copy array" usual coding style for handling eventfd ?

Since we store only a pointer to struct eventfd_ctx, I don't see a problem.

>> +
>> +assign:
>> + Â Â if (type == _MEM)
>> + Â Â Â Â Â Â rcu_assign_pointer(memcg->thresholds, thresholds_new);
>> + Â Â else
>> + Â Â Â Â Â Â rcu_assign_pointer(memcg->memsw_thresholds, thresholds_new);
>> +
>> + Â Â synchronize_rcu();
>> +
>> + Â Â for(i = 0; i < thresholds->size - size; i++)
>> + Â Â Â Â Â Â mem_cgroup_put(memcg);
>> +
>> + Â Â kfree(thresholds);
>> +unlock:
>> + Â Â mutex_unlock(&memcg->thresholds_lock);
>> +
>> + Â Â return ret;
>> +}
>>
>> Âstatic struct cftype mem_cgroup_files[] = {
>> Â Â Â {
>> Â Â Â Â Â Â Â .name = "usage_in_bytes",
>> Â Â Â Â Â Â Â .private = MEMFILE_PRIVATE(_MEM, RES_USAGE),
>> Â Â Â Â Â Â Â .read_u64 = mem_cgroup_read,
>> + Â Â Â Â Â Â .register_event = mem_cgroup_register_event,
>> + Â Â Â Â Â Â .unregister_event = mem_cgroup_unregister_event,
>> Â Â Â },
>> Â Â Â {
>> Â Â Â Â Â Â Â .name = "max_usage_in_bytes",
>> @@ -3128,6 +3400,8 @@ static struct cftype memsw_cgroup_files[] = {
>> Â Â Â Â Â Â Â .name = "memsw.usage_in_bytes",
>> Â Â Â Â Â Â Â .private = MEMFILE_PRIVATE(_MEMSWAP, RES_USAGE),
>> Â Â Â Â Â Â Â .read_u64 = mem_cgroup_read,
>> + Â Â Â Â Â Â .register_event = mem_cgroup_register_event,
>> + Â Â Â Â Â Â .unregister_event = mem_cgroup_unregister_event,
>> Â Â Â },
>> Â Â Â {
>> Â Â Â Â Â Â Â .name = "memsw.max_usage_in_bytes",
>> @@ -3367,6 +3641,7 @@ mem_cgroup_create(struct cgroup_subsys *ss, struct cgroup *cont)
>> Â Â Â if (parent)
>> Â Â Â Â Â Â Â mem->swappiness = get_swappiness(parent);
>> Â Â Â atomic_set(&mem->refcnt, 1);
>> + Â Â mutex_init(&mem->thresholds_lock);
>> Â Â Â return &mem->css;
>> Âfree_out:
>> Â Â Â __mem_cgroup_free(mem);
>> --
>> 1.6.5.7
>>
--
To unsubscribe from this list: send the line "unsubscribe linux-kernel" in
the body of a message to majordomo@xxxxxxxxxxxxxxx
More majordomo info at http://vger.kernel.org/majordomo-info.html
Please read the FAQ at http://www.tux.org/lkml/