Re: [patch 1/3] mm: memcontrol: lockless page counters

From: Vladimir Davydov
Date: Fri Sep 26 2014 - 06:32:44 EST


Hi Johannes,

On Wed, Sep 24, 2014 at 11:43:08AM -0400, Johannes Weiner wrote:
> Memory is internally accounted in bytes, using spinlock-protected
> 64-bit counters, even though the smallest accounting delta is a page.
> The counter interface is also convoluted and does too many things.
>
> Introduce a new lockless word-sized page counter API, then change all
> memory accounting over to it and remove the old one. The translation
> from and to bytes then only happens when interfacing with userspace.
>
> Aside from the locking costs, this gets rid of the icky unsigned long
> long types in the very heart of memcg, which is great for 32 bit and
> also makes the code a lot more readable.
>
> Signed-off-by: Johannes Weiner <hannes@xxxxxxxxxxx>

It looks much better to me. A few comments below, mostly nit picking.

[...]
> diff --git a/mm/memcontrol.c b/mm/memcontrol.c
> index c2c75262a209..52c24119be69 100644
> --- a/mm/memcontrol.c
> +++ b/mm/memcontrol.c
[...]
> @@ -1490,12 +1495,23 @@ int mem_cgroup_inactive_anon_is_low(struct lruvec *lruvec)
> */
> static unsigned long mem_cgroup_margin(struct mem_cgroup *memcg)
> {
> - unsigned long long margin;
> + unsigned long margin = 0;
> + unsigned long count;
> + unsigned long limit;
>
> - margin = res_counter_margin(&memcg->res);
> - if (do_swap_account)
> - margin = min(margin, res_counter_margin(&memcg->memsw));
> - return margin >> PAGE_SHIFT;
> + count = page_counter_read(&memcg->memory);
> + limit = ACCESS_ONCE(memcg->memory.limit);
> + if (count < limit)

Nit: IMO this looks unwieldier and less readable than
res_counter_margin. And two lines below we repeat this.

> + margin = limit - count;
> +
> + if (do_swap_account) {
> + count = page_counter_read(&memcg->memsw);
> + limit = ACCESS_ONCE(memcg->memsw.limit);
> + if (count < limit)
> + margin = min(margin, limit - count);
> + }
> +
> + return margin;
> }
>
> int mem_cgroup_swappiness(struct mem_cgroup *memcg)
[...]
> @@ -1685,30 +1698,19 @@ static int mem_cgroup_count_children(struct mem_cgroup *memcg)
> }
>
> /*
> - * Return the memory (and swap, if configured) limit for a memcg.
> + * Return the memory (and swap, if configured) maximum consumption for a memcg.

Nit: Why did you change the comment? Now it doesn't seem to be relevant.

> */
> -static u64 mem_cgroup_get_limit(struct mem_cgroup *memcg)
> +static unsigned long mem_cgroup_get_limit(struct mem_cgroup *memcg)
> {
> - u64 limit;
> + unsigned long limit;
>
> - limit = res_counter_read_u64(&memcg->res, RES_LIMIT);
> -
> - /*
> - * Do not consider swap space if we cannot swap due to swappiness
> - */
> + limit = memcg->memory.limit;
> if (mem_cgroup_swappiness(memcg)) {
> - u64 memsw;
> + unsigned long memsw_limit;
>
> - limit += total_swap_pages << PAGE_SHIFT;
> - memsw = res_counter_read_u64(&memcg->memsw, RES_LIMIT);
> -
> - /*
> - * If memsw is finite and limits the amount of swap space
> - * available to this memcg, return that limit.
> - */
> - limit = min(limit, memsw);
> + memsw_limit = memcg->memsw.limit;
> + limit = min(limit + total_swap_pages, memsw_limit);
> }
> -
> return limit;
> }
>
[...]
> @@ -4114,25 +4109,27 @@ out:
> }
>
> static int memcg_activate_kmem(struct mem_cgroup *memcg,
> - unsigned long long limit)
> + unsigned long nr_pages)
> {
> int ret;
>
> mutex_lock(&activate_kmem_mutex);
> - ret = __memcg_activate_kmem(memcg, limit);
> + ret = __memcg_activate_kmem(memcg, nr_pages);
> mutex_unlock(&activate_kmem_mutex);
> return ret;
> }
>
> static int memcg_update_kmem_limit(struct mem_cgroup *memcg,
> - unsigned long long val)
> + unsigned long limit)
> {
> int ret;
>
> + mutex_lock(&memcg_limit_mutex);
> if (!memcg_kmem_is_active(memcg))
> - ret = memcg_activate_kmem(memcg, val);
> + ret = memcg_activate_kmem(memcg, limit);

I think we can now get rid of the activate_kmem_mutex, but this should
be done separately of course.

> else
> - ret = res_counter_set_limit(&memcg->kmem, val);
> + ret = page_counter_limit(&memcg->kmem, limit);
> + mutex_unlock(&memcg_limit_mutex);
> return ret;
> }
>
[...]
> @@ -5570,10 +5530,10 @@ static void mem_cgroup_css_reset(struct cgroup_subsys_state *css)
> {
> struct mem_cgroup *memcg = mem_cgroup_from_css(css);
>
> - mem_cgroup_resize_limit(memcg, ULLONG_MAX);
> - mem_cgroup_resize_memsw_limit(memcg, ULLONG_MAX);
> - memcg_update_kmem_limit(memcg, ULLONG_MAX);
> - res_counter_set_soft_limit(&memcg->res, ULLONG_MAX);
> + mem_cgroup_resize_limit(memcg, PAGE_COUNTER_MAX);
> + mem_cgroup_resize_memsw_limit(memcg, PAGE_COUNTER_MAX);
> + memcg_update_kmem_limit(memcg, PAGE_COUNTER_MAX);

I think we should do it only if memcg_kmem_is_active, but that's a
different story.

> + memcg->soft_limit = 0;
> }
>
> #ifdef CONFIG_MMU
[...]
> diff --git a/mm/page_counter.c b/mm/page_counter.c
> new file mode 100644
> index 000000000000..51c45921b8d1
> --- /dev/null
> +++ b/mm/page_counter.c
> @@ -0,0 +1,191 @@
> +/*
> + * Lockless hierarchical page accounting & limiting
> + *
> + * Copyright (C) 2014 Red Hat, Inc., Johannes Weiner
> + */
> +#include <linux/page_counter.h>
> +#include <linux/atomic.h>
> +
> +/**
> + * page_counter_cancel - take pages out of the local counter
> + * @counter: counter
> + * @nr_pages: number of pages to cancel
> + *
> + * Returns whether there are remaining pages in the counter.
> + */
> +int page_counter_cancel(struct page_counter *counter, unsigned long nr_pages)
> +{
> + long new;
> +
> + new = atomic_long_sub_return(nr_pages, &counter->count);
> +
> + if (WARN_ON_ONCE(new < 0))
> + atomic_long_add(nr_pages, &counter->count);
> +
> + return new > 0;
> +}
> +
> +/**
> + * page_counter_charge - hierarchically charge pages
> + * @counter: counter
> + * @nr_pages: number of pages to charge
> + *
> + * NOTE: This may exceed the configured counter limits.
> + */
> +void page_counter_charge(struct page_counter *counter, unsigned long nr_pages)
> +{
> + struct page_counter *c;
> +
> + for (c = counter; c; c = c->parent) {
> + long new;
> +
> + new = atomic_long_add_return(nr_pages, &c->count);
> + /*
> + * This is racy, but with the per-cpu caches on top
> + * it's just a ballpark metric anyway; and with lazy
> + * cache reclaim, the majority of workloads peg the
> + * watermark to the group limit soon after launch.
> + */
> + if (new > c->watermark)
> + c->watermark = new;
> + }
> +}
> +
> +/**
> + * page_counter_try_charge - try to hierarchically charge pages
> + * @counter: counter
> + * @nr_pages: number of pages to charge
> + * @fail: points first counter to hit its limit, if any
> + *
> + * Returns 0 on success, or -ENOMEM and @fail if the counter or one of
> + * its ancestors has hit its limit.
> + */
> +int page_counter_try_charge(struct page_counter *counter,
> + unsigned long nr_pages,
> + struct page_counter **fail)
> +{
> + struct page_counter *c;
> +
> + for (c = counter; c; c = c->parent) {
> + long new;
> + /*
> + * Charge speculatively to avoid an expensive CAS. If
> + * a bigger charge fails, it might falsely lock out a
> + * racing smaller charge and send it into reclaim
> + * eraly, but the error is limited to the difference

Nit: s/eraly/early

> + * between the two sizes, which is less than 2M/4M in
> + * case of a THP locking out a regular page charge.
> + */
> + new = atomic_long_add_return(nr_pages, &c->count);
> + if (new > c->limit) {
> + atomic_long_sub(nr_pages, &c->count);
> + /*
> + * This is racy, but the failcnt is only a
> + * ballpark metric anyway.
> + */

I still don't think that the failcnt is completely useless. As I
mentioned previously, it can be used to check whether the workload is
behaving badly due to memcg limits or for some other reason. And I don't
see why it couldn't be atomic. This isn't a show stopper though.

> + c->failcnt++;
> + *fail = c;
> + goto failed;
> + }
> + /*
> + * This is racy, but with the per-cpu caches on top
> + * it's just a ballpark metric anyway; and with lazy
> + * cache reclaim, the majority of workloads peg the
> + * watermark to the group limit soon after launch.

Not for kmem, I think.

> + */
> + if (new > c->watermark)
> + c->watermark = new;
> + }
> + return 0;
> +
> +failed:
> + for (c = counter; c != *fail; c = c->parent)
> + page_counter_cancel(c, nr_pages);
> +
> + return -ENOMEM;
> +}
> +
> +/**
> + * page_counter_uncharge - hierarchically uncharge pages
> + * @counter: counter
> + * @nr_pages: number of pages to uncharge
> + *
> + * Returns whether there are remaining charges in @counter.
> + */
> +int page_counter_uncharge(struct page_counter *counter, unsigned long nr_pages)
> +{
> + struct page_counter *c;
> + int ret = 1;
> +
> + for (c = counter; c; c = c->parent) {
> + int remainder;
> +
> + remainder = page_counter_cancel(c, nr_pages);
> + if (c == counter && !remainder)
> + ret = 0;
> + }
> +
> + return ret;
> +}
> +
> +/**
> + * page_counter_limit - limit the number of pages allowed
> + * @counter: counter
> + * @limit: limit to set
> + *
> + * Returns 0 on success, -EBUSY if the current number of pages on the
> + * counter already exceeds the specified limit.
> + *
> + * The caller must serialize invocations on the same counter.
> + */
> +int page_counter_limit(struct page_counter *counter, unsigned long limit)
> +{
> + for (;;) {
> + unsigned long old;
> + long count;
> +
> + count = atomic_long_read(&counter->count);
> +
> + old = xchg(&counter->limit, limit);

Why do you use xchg here?

> +
> + if (atomic_long_read(&counter->count) != count) {
> + counter->limit = old;
> + continue;
> + }
> +
> + if (count > limit) {
> + counter->limit = old;
> + return -EBUSY;
> + }

I have a suspicion that this can race with page_counter_try_charge.
Look, access to c->limit is not marked as volatile in try_charge so the
compiler is allowed to issue read only once, in the very beginning of
the try_charge function. Then try_charge may succeed after the limit was
actually updated to a smaller value.

Strictly speaking, using ACCESS_ONCE in try_charge wouldn't be enough
AFAIU. There must be memory barriers here and there.

> +
> + return 0;
> + }
> +}
> +
> +/**
> + * page_counter_memparse - memparse() for page counter limits
> + * @buf: string to parse
> + * @nr_pages: returns the result in number of pages
> + *
> + * Returns -EINVAL, or 0 and @nr_pages on success. @nr_pages will be
> + * limited to %PAGE_COUNTER_MAX.
> + */
> +int page_counter_memparse(const char *buf, unsigned long *nr_pages)
> +{
> + char unlimited[] = "-1";
> + char *end;
> + u64 bytes;
> +
> + if (!strncmp(buf, unlimited, sizeof(unlimited))) {
> + *nr_pages = PAGE_COUNTER_MAX;
> + return 0;
> + }
> +
> + bytes = memparse(buf, &end);
> + if (*end != '\0')
> + return -EINVAL;
> +
> + *nr_pages = min(bytes / PAGE_SIZE, (u64)PAGE_COUNTER_MAX);
> +
> + return 0;
> +}
> diff --git a/net/ipv4/tcp_memcontrol.c b/net/ipv4/tcp_memcontrol.c
> index 1d191357bf88..272327134a1b 100644
> --- a/net/ipv4/tcp_memcontrol.c
> +++ b/net/ipv4/tcp_memcontrol.c
[...]
> @@ -126,43 +134,36 @@ static ssize_t tcp_cgroup_write(struct kernfs_open_file *of,
> return ret ?: nbytes;
> }
>
> -static u64 tcp_read_stat(struct mem_cgroup *memcg, int type, u64 default_val)
> -{
> - struct cg_proto *cg_proto;
> -
> - cg_proto = tcp_prot.proto_cgroup(memcg);
> - if (!cg_proto)
> - return default_val;
> -
> - return res_counter_read_u64(&cg_proto->memory_allocated, type);
> -}
> -
> -static u64 tcp_read_usage(struct mem_cgroup *memcg)
> -{
> - struct cg_proto *cg_proto;
> -
> - cg_proto = tcp_prot.proto_cgroup(memcg);
> - if (!cg_proto)
> - return atomic_long_read(&tcp_memory_allocated) << PAGE_SHIFT;
> -
> - return res_counter_read_u64(&cg_proto->memory_allocated, RES_USAGE);
> -}
> -
> static u64 tcp_cgroup_read(struct cgroup_subsys_state *css, struct cftype *cft)
> {
> struct mem_cgroup *memcg = mem_cgroup_from_css(css);
> + struct cg_proto *cg_proto = tcp_prot.proto_cgroup(memcg);
> u64 val;
>
> switch (cft->private) {
> case RES_LIMIT:
> - val = tcp_read_stat(memcg, RES_LIMIT, RES_COUNTER_MAX);
> + if (!cg_proto)
> + return PAGE_COUNTER_MAX;

For compatibility it must be ULLONG_MAX.

> + val = cg_proto->memory_allocated.limit;
> + val *= PAGE_SIZE;
> break;
> case RES_USAGE:
> - val = tcp_read_usage(memcg);
> + if (!cg_proto)
> + val = atomic_long_read(&tcp_memory_allocated);
> + else
> + val = page_counter_read(&cg_proto->memory_allocated);
> + val *= PAGE_SIZE;
> break;
> case RES_FAILCNT:
> + if (!cg_proto)
> + return 0;
> + val = cg_proto->memory_allocated.failcnt;
> + break;
> case RES_MAX_USAGE:
> - val = tcp_read_stat(memcg, cft->private, 0);
> + if (!cg_proto)
> + return 0;
> + val = cg_proto->memory_allocated.watermark;
> + val *= PAGE_SIZE;
> break;
> default:
> BUG();
[...]

Thanks,
Vladimir
--
To unsubscribe from this list: send the line "unsubscribe linux-kernel" in
the body of a message to majordomo@xxxxxxxxxxxxxxx
More majordomo info at http://vger.kernel.org/majordomo-info.html
Please read the FAQ at http://www.tux.org/lkml/