Re: [PATCH] memcg: remove mem_cgroup_reclaimable check from soft reclaim

From: Vladimir Davydov
Date: Wed Oct 22 2014 - 02:41:00 EST


On Tue, Oct 21, 2014 at 02:22:39PM -0400, Johannes Weiner wrote:
> On Tue, Oct 21, 2014 at 05:15:50PM +0400, Vladimir Davydov wrote:
> > mem_cgroup_reclaimable() checks whether a cgroup has reclaimable pages
> > on *any* NUMA node. However, the only place where it's called is
> > mem_cgroup_soft_reclaim(), which tries to reclaim memory from a
> > *specific* zone. So the way how it's used is incorrect - it will return
> > true even if the cgroup doesn't have pages on the zone we're scanning.
> >
> > I think we can get rid of this check completely, because
> > mem_cgroup_shrink_node_zone(), which is called by
> > mem_cgroup_soft_reclaim() if mem_cgroup_reclaimable() returns true, is
> > equivalent to shrink_lruvec(), which exits almost immediately if the
> > lruvec passed to it is empty. So there's no need to optimize anything
> > here. Besides, we don't have such a check in the general scan path
> > (shrink_zone) either.
> >
> > Signed-off-by: Vladimir Davydov <vdavydov@xxxxxxxxxxxxx>
>
> Acked-by: Johannes Weiner <hannes@xxxxxxxxxxx>
>
> How about this on top?
>
> ---
>
> From 27bd24b00433d9f6c8d60ba2b13dbff158b06c13 Mon Sep 17 00:00:00 2001
> From: Johannes Weiner <hannes@xxxxxxxxxxx>
> Date: Tue, 21 Oct 2014 09:53:54 -0400
> Subject: [patch] mm: memcontrol: do not filter reclaimable nodes in NUMA
> round-robin
>
> The round-robin node reclaim currently tries to include only nodes
> that have memory of the memcg in question, which is quite elaborate.
>
> Just use plain round-robin over the nodes that are allowed by the
> task's cpuset, which are the most likely to contain that memcg's
> memory. But even if zones without memcg memory are encountered,
> direct reclaim will skip over them without too much hassle.
>
> Signed-off-by: Johannes Weiner <hannes@xxxxxxxxxxx>

Totally agree.

Acked-by: Vladimir Davydov <vdavydov@xxxxxxxxxxxxx>

> ---
> mm/memcontrol.c | 97 +++------------------------------------------------------
> 1 file changed, 5 insertions(+), 92 deletions(-)
>
> diff --git a/mm/memcontrol.c b/mm/memcontrol.c
> index d353d9e1fdca..293db8234179 100644
> --- a/mm/memcontrol.c
> +++ b/mm/memcontrol.c
> @@ -54,6 +54,7 @@
> #include <linux/page_cgroup.h>
> #include <linux/cpu.h>
> #include <linux/oom.h>
> +#include <linux/cpuset.h>
> #include <linux/lockdep.h>
> #include <linux/file.h>
> #include "internal.h"
> @@ -129,12 +130,10 @@ static const char * const mem_cgroup_lru_names[] = {
> enum mem_cgroup_events_target {
> MEM_CGROUP_TARGET_THRESH,
> MEM_CGROUP_TARGET_SOFTLIMIT,
> - MEM_CGROUP_TARGET_NUMAINFO,
> MEM_CGROUP_NTARGETS,
> };
> #define THRESHOLDS_EVENTS_TARGET 128
> #define SOFTLIMIT_EVENTS_TARGET 1024
> -#define NUMAINFO_EVENTS_TARGET 1024
>
> struct mem_cgroup_stat_cpu {
> long count[MEM_CGROUP_STAT_NSTATS];
> @@ -352,11 +351,6 @@ struct mem_cgroup {
> #endif
>
> int last_scanned_node;
> -#if MAX_NUMNODES > 1
> - nodemask_t scan_nodes;
> - atomic_t numainfo_events;
> - atomic_t numainfo_updating;
> -#endif
>
> /* List of events which userspace want to receive */
> struct list_head event_list;
> @@ -965,9 +959,6 @@ static bool mem_cgroup_event_ratelimit(struct mem_cgroup *memcg,
> case MEM_CGROUP_TARGET_SOFTLIMIT:
> next = val + SOFTLIMIT_EVENTS_TARGET;
> break;
> - case MEM_CGROUP_TARGET_NUMAINFO:
> - next = val + NUMAINFO_EVENTS_TARGET;
> - break;
> default:
> break;
> }
> @@ -986,22 +977,10 @@ static void memcg_check_events(struct mem_cgroup *memcg, struct page *page)
> /* threshold event is triggered in finer grain than soft limit */
> if (unlikely(mem_cgroup_event_ratelimit(memcg,
> MEM_CGROUP_TARGET_THRESH))) {
> - bool do_softlimit;
> - bool do_numainfo __maybe_unused;
> -
> - do_softlimit = mem_cgroup_event_ratelimit(memcg,
> - MEM_CGROUP_TARGET_SOFTLIMIT);
> -#if MAX_NUMNODES > 1
> - do_numainfo = mem_cgroup_event_ratelimit(memcg,
> - MEM_CGROUP_TARGET_NUMAINFO);
> -#endif
> mem_cgroup_threshold(memcg);
> - if (unlikely(do_softlimit))
> + if (mem_cgroup_event_ratelimit(memcg,
> + MEM_CGROUP_TARGET_SOFTLIMIT))
> mem_cgroup_update_tree(memcg, page);
> -#if MAX_NUMNODES > 1
> - if (unlikely(do_numainfo))
> - atomic_inc(&memcg->numainfo_events);
> -#endif
> }
> }
>
> @@ -1708,61 +1687,7 @@ static void mem_cgroup_out_of_memory(struct mem_cgroup *memcg, gfp_t gfp_mask,
> NULL, "Memory cgroup out of memory");
> }
>
> -/**
> - * test_mem_cgroup_node_reclaimable
> - * @memcg: the target memcg
> - * @nid: the node ID to be checked.
> - * @noswap : specify true here if the user wants flle only information.
> - *
> - * This function returns whether the specified memcg contains any
> - * reclaimable pages on a node. Returns true if there are any reclaimable
> - * pages in the node.
> - */
> -static bool test_mem_cgroup_node_reclaimable(struct mem_cgroup *memcg,
> - int nid, bool noswap)
> -{
> - if (mem_cgroup_node_nr_lru_pages(memcg, nid, LRU_ALL_FILE))
> - return true;
> - if (noswap || !total_swap_pages)
> - return false;
> - if (mem_cgroup_node_nr_lru_pages(memcg, nid, LRU_ALL_ANON))
> - return true;
> - return false;
> -
> -}
> #if MAX_NUMNODES > 1
> -
> -/*
> - * Always updating the nodemask is not very good - even if we have an empty
> - * list or the wrong list here, we can start from some node and traverse all
> - * nodes based on the zonelist. So update the list loosely once per 10 secs.
> - *
> - */
> -static void mem_cgroup_may_update_nodemask(struct mem_cgroup *memcg)
> -{
> - int nid;
> - /*
> - * numainfo_events > 0 means there was at least NUMAINFO_EVENTS_TARGET
> - * pagein/pageout changes since the last update.
> - */
> - if (!atomic_read(&memcg->numainfo_events))
> - return;
> - if (atomic_inc_return(&memcg->numainfo_updating) > 1)
> - return;
> -
> - /* make a nodemask where this memcg uses memory from */
> - memcg->scan_nodes = node_states[N_MEMORY];
> -
> - for_each_node_mask(nid, node_states[N_MEMORY]) {
> -
> - if (!test_mem_cgroup_node_reclaimable(memcg, nid, false))
> - node_clear(nid, memcg->scan_nodes);
> - }
> -
> - atomic_set(&memcg->numainfo_events, 0);
> - atomic_set(&memcg->numainfo_updating, 0);
> -}
> -
> /*
> * Selecting a node where we start reclaim from. Because what we need is just
> * reducing usage counter, start from anywhere is O,K. Considering
> @@ -1779,21 +1704,9 @@ int mem_cgroup_select_victim_node(struct mem_cgroup *memcg)
> {
> int node;
>
> - mem_cgroup_may_update_nodemask(memcg);
> - node = memcg->last_scanned_node;
> -
> - node = next_node(node, memcg->scan_nodes);
> + node = next_node(memcg->last_scanned_node, cpuset_current_mems_allowed);
> if (node == MAX_NUMNODES)
> - node = first_node(memcg->scan_nodes);
> - /*
> - * We call this when we hit limit, not when pages are added to LRU.
> - * No LRU may hold pages because all pages are UNEVICTABLE or
> - * memcg is too small and all pages are not on LRU. In that case,
> - * we use curret node.
> - */
> - if (unlikely(node == MAX_NUMNODES))
> - node = numa_node_id();
> -
> + node = first_node(cpuset_current_mems_allowed);
> memcg->last_scanned_node = node;
> return node;
> }
> --
> 2.1.2
--
To unsubscribe from this list: send the line "unsubscribe linux-kernel" in
the body of a message to majordomo@xxxxxxxxxxxxxxx
More majordomo info at http://vger.kernel.org/majordomo-info.html
Please read the FAQ at http://www.tux.org/lkml/