Re: [PATCH 2/2] mm: mempolicy: Interleave policy for tiered memory nodes

From: Gregory Price
Date: Thu Sep 28 2023 - 14:27:00 EST


On Wed, Sep 27, 2023 at 03:20:02PM +0530, Ravi Jonnalagadda wrote:
> From: Srinivasulu Thanneeru <sthanneeru@xxxxxxxxxx>
>
> Existing interleave policy spreads out pages evenly across a set of
> specified nodes, i.e. 1:1 interleave. Upcoming tiered memory systems
> have CPU-less memory nodes with different peak bandwidth and
> latency-bandwidth characteristics. In such systems, we will want to
> use the additional bandwidth provided by lowtier memory for
> bandwidth-intensive applications. However, the default 1:1 interleave
> can lead to suboptimal bandwidth distribution.
>
> Introduce an interleave policy for multi-tiers that is based on
> interleave weights, where pages are assigned from nodes of the tier
> based on the tier weight.
>
> For instance, 50:30:20 are the weights of tiers 0, 1, and 3, which
> leads to a 50%/30%/20% traffic breakdown across the three tiers.
>
> Signed-off-by: Srinivasulu Thanneeru <sthanneeru@xxxxxxxxxx>
> Co-authored-by: Ravi Jonnalagadda <ravis.opensrc@xxxxxxxxxx>
> ---
> include/linux/memory-tiers.h | 25 +++++++-
> include/linux/sched.h | 2 +
> mm/memory-tiers.c | 31 ++--------
> mm/mempolicy.c | 107 +++++++++++++++++++++++++++++++++--
> 4 files changed, 132 insertions(+), 33 deletions(-)
>
> diff --git a/include/linux/memory-tiers.h b/include/linux/memory-tiers.h
> index c62d286749d0..74be39cb56c4 100644
> --- a/include/linux/memory-tiers.h
> +++ b/include/linux/memory-tiers.h
> @@ -2,6 +2,7 @@
> #ifndef _LINUX_MEMORY_TIERS_H
> #define _LINUX_MEMORY_TIERS_H
>
> +#include <linux/device.h>
> #include <linux/types.h>
> #include <linux/nodemask.h>
> #include <linux/kref.h>
> @@ -21,7 +22,27 @@
>
> #define MAX_TIER_INTERLEAVE_WEIGHT 100
>
> -struct memory_tier;
> +struct memory_tier {
> + /* hierarchy of memory tiers */
> + struct list_head list;
> + /* list of all memory types part of this tier */
> + struct list_head memory_types;
> + /*
> + * By default all tiers will have weight as 1, which means they
> + * follow default standard allocation.
> + */
> + unsigned short interleave_weight;
> + /*
> + * start value of abstract distance. memory tier maps
> + * an abstract distance range,
> + * adistance_start .. adistance_start + MEMTIER_CHUNK_SIZE
> + */
> + int adistance_start;
> + struct device dev;
> + /* All the nodes that are part of all the lower memory tiers. */
> + nodemask_t lower_tier_mask;
> +};
> +

I would make this change in a separate pull-ahead patch, to make review
of the actual functional changes easier.

> struct memory_dev_type {
> /* list of memory types that are part of same tier as this type */
> struct list_head tier_sibiling;
> @@ -38,6 +59,8 @@ struct memory_dev_type *alloc_memory_type(int adistance);
> void put_memory_type(struct memory_dev_type *memtype);
> void init_node_memory_type(int node, struct memory_dev_type *default_type);
> void clear_node_memory_type(int node, struct memory_dev_type *memtype);
> +struct memory_tier *node_get_memory_tier(int node);
> +nodemask_t get_memtier_nodemask(struct memory_tier *memtier);
> #ifdef CONFIG_MIGRATION
> int next_demotion_node(int node);
> void node_get_allowed_targets(pg_data_t *pgdat, nodemask_t *targets);
> diff --git a/include/linux/sched.h b/include/linux/sched.h
> index 77f01ac385f7..07ea837c3afb 100644
> --- a/include/linux/sched.h
> +++ b/include/linux/sched.h
> @@ -1252,7 +1252,9 @@ struct task_struct {
> /* Protected by alloc_lock: */
> struct mempolicy *mempolicy;
> short il_prev;
> + unsigned short il_count;
> short pref_node_fork;
> + unsigned int current_node;
> #endif
> #ifdef CONFIG_NUMA_BALANCING
> int numa_scan_seq;
> diff --git a/mm/memory-tiers.c b/mm/memory-tiers.c
> index 7e06c9e0fa41..5e2ddc9f994a 100644
> --- a/mm/memory-tiers.c
> +++ b/mm/memory-tiers.c
> @@ -8,27 +8,6 @@
>
> #include "internal.h"
>
> -struct memory_tier {
> - /* hierarchy of memory tiers */
> - struct list_head list;
> - /* list of all memory types part of this tier */
> - struct list_head memory_types;
> - /*
> - * By default all tiers will have weight as 1, which means they
> - * follow default standard allocation.
> - */
> - unsigned short interleave_weight;
> - /*
> - * start value of abstract distance. memory tier maps
> - * an abstract distance range,
> - * adistance_start .. adistance_start + MEMTIER_CHUNK_SIZE
> - */
> - int adistance_start;
> - struct device dev;
> - /* All the nodes that are part of all the lower memory tiers. */
> - nodemask_t lower_tier_mask;
> -};
> -

See above

> struct demotion_nodes {
> nodemask_t preferred;
> };
> @@ -115,7 +94,7 @@ static inline struct memory_tier *to_memory_tier(struct device *device)
> return container_of(device, struct memory_tier, dev);
> }
>
> -static __always_inline nodemask_t get_memtier_nodemask(struct memory_tier *memtier)
> +nodemask_t get_memtier_nodemask(struct memory_tier *memtier)
> {
> nodemask_t nodes = NODE_MASK_NONE;
> struct memory_dev_type *memtype;
> @@ -264,7 +243,7 @@ static struct memory_tier *find_create_memory_tier(struct memory_dev_type *memty
> return memtier;
> }
>
> -static struct memory_tier *__node_get_memory_tier(int node)
> +struct memory_tier *node_get_memory_tier(int node)
> {
> pg_data_t *pgdat;
>
> @@ -380,7 +359,7 @@ static void disable_all_demotion_targets(void)
> * We are holding memory_tier_lock, it is safe
> * to access pgda->memtier.
> */
> - memtier = __node_get_memory_tier(node);
> + memtier = node_get_memory_tier(node);
> if (memtier)
> memtier->lower_tier_mask = NODE_MASK_NONE;
> }
> @@ -417,7 +396,7 @@ static void establish_demotion_targets(void)
> best_distance = -1;
> nd = &node_demotion[node];
>
> - memtier = __node_get_memory_tier(node);
> + memtier = node_get_memory_tier(node);
> if (!memtier || list_is_last(&memtier->list, &memory_tiers))
> continue;
> /*
> @@ -562,7 +541,7 @@ static bool clear_node_memory_tier(int node)
> * This also enables us to free the destroyed memory tier
> * with kfree instead of kfree_rcu
> */
> - memtier = __node_get_memory_tier(node);
> + memtier = node_get_memory_tier(node);
> if (memtier) {
> struct memory_dev_type *memtype;
>

See above

> diff --git a/mm/mempolicy.c b/mm/mempolicy.c
> index 42b5567e3773..4f80c6ee1176 100644
> --- a/mm/mempolicy.c
> +++ b/mm/mempolicy.c
> @@ -100,6 +100,8 @@
> #include <linux/ctype.h>
> #include <linux/mm_inline.h>
> #include <linux/mmu_notifier.h>
> +#include <linux/memory-tiers.h>
> +#include <linux/nodemask.h>
> #include <linux/printk.h>
> #include <linux/swapops.h>
>
> @@ -882,8 +884,11 @@ static long do_set_mempolicy(unsigned short mode, unsigned short flags,
>
> old = current->mempolicy;
> current->mempolicy = new;
> - if (new && new->mode == MPOL_INTERLEAVE)
> + if (new && new->mode == MPOL_INTERLEAVE) {
> current->il_prev = MAX_NUMNODES-1;
> + current->il_count = 0;
> + current->current_node = MAX_NUMNODES;
> + }
> task_unlock(current);
> mpol_put(old);
> ret = 0;
> @@ -1899,13 +1904,76 @@ static int policy_node(gfp_t gfp, struct mempolicy *policy, int nd)
> return nd;
> }
>

Should this be a change to MPOL_INTERLEAVE, or should this be a new
memory policy: MPOL_TIERING?

For the sake of retaining the existing behavior of MPOL_INTERLEAVE as-is,
and to avoid introducing bugs into it, maybe creating MPOL_TIERING is
preferable?

That would allow the rest of mempolicy to be changed cleanly if the
intent is to override mempolicy with memtier behavior.
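
As a sketch (MPOL_TIERING is of course just a placeholder name), the new
mode would simply be another entry in the uapi mode enum:

	/* include/uapi/linux/mempolicy.h */
	enum {
		MPOL_DEFAULT,
		MPOL_PREFERRED,
		MPOL_BIND,
		MPOL_INTERLEAVE,
		MPOL_LOCAL,
		MPOL_PREFERRED_MANY,
		MPOL_TIERING,	/* hypothetical: tier-weighted interleave */
		MPOL_MAX,	/* always last member of enum */
	};

and interleave_nodes()/offset_il_node() could then branch on pol->mode
rather than changing MPOL_INTERLEAVE semantics for every existing user.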

> +/* Return interleave weight of node from tier's weight */
> +static unsigned short node_interleave_weight(int nid, nodemask_t pol_nodemask)
> +{
> + struct memory_tier *memtier;
> + nodemask_t tier_nodes, tier_and_pol;
> + unsigned short avrg_weight = 0;
> + int node, nnodes, reminder;
> +
> + memtier = node_get_memory_tier(nid);
> +
> + if (!memtier)
> + return 0;
> +
> + tier_nodes = get_memtier_nodemask(memtier);
> + nodes_and(tier_and_pol, tier_nodes, pol_nodemask);
> + nnodes = nodes_weight(tier_and_pol);
> + if (!nnodes)
> + return 0;
> +
> + avrg_weight = memtier->interleave_weight / nnodes;
> + /* Set minimum weight of node as 1 so that at least one page
> + * is allocated.
> + */
> + if (!avrg_weight)
> + return 1;
> +
> + reminder = memtier->interleave_weight % nnodes;
> + if (reminder) {
> + for_each_node_mask(node, tier_and_pol) {
> + /* Increment target node's weight by 1, if it falls
> + * within remaining weightage 'reminder'.
> + */
> + if (node == nid) {
> + if (reminder > 0)
> + avrg_weight = avrg_weight + 1;
> + break;
> + }
> + reminder--;
> + }
> + }
> + return avrg_weight;
> +}
> +

With this, there are now 3 components with node masks that have to be
cross-referenced to figure out what is both preferred and what is
allowed: Mempolicy, Cgroups, MemTier.

Is abstracting nodes with memtiers actually preferable to abstracting
devices with NUMA nodes and simply creating a new memory policy with the
weights described directly in the policy?
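
Purely illustrative (the weights field below is made up), I'm picturing
something along the lines of:

	struct mempolicy {
		...
		nodemask_t nodes;
		/* hypothetical: per-node weights owned by the policy itself */
		unsigned char il_weights[MAX_NUMNODES];
	};

so that the allocation path only ever consults the task's own policy,
instead of resolving node -> memtier -> tier weight on every page.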

> /* Do dynamic interleaving for a process */
> static unsigned interleave_nodes(struct mempolicy *policy)
> {
> unsigned next;
> struct task_struct *me = current;
> + unsigned short node_weight = 0;
>
> - next = next_node_in(me->il_prev, policy->nodes);
> + /* select current node or next node from nodelist based on
> + * available tier interleave weight.
> + */
> + if (me->current_node == MAX_NUMNODES)
> + next = next_node_in(me->il_prev, policy->nodes);
> + else
> + next = me->current_node;
> + node_weight = node_interleave_weight(next, policy->nodes);
> + if (!node_weight)
> + goto set_il_prev;
> + if (me->il_count < node_weight) {

What happens if a node's weight changes to something less than il_count in
the middle of operation? Is il_count reset when node weights change?

Since the memtier weights are global and can change at any time, this can't
possibly work as intended.
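
For example, if the weight drops from 10 to 3 while il_count is already 5,
neither "il_count < node_weight" nor "il_count == node_weight" is ever true
again, so current_node stays pinned to the same node and il_count never
resets.  At a minimum something like the following (sketch only) seems
necessary before the existing check:

	if (me->il_count >= node_weight) {
		/* weight shrank underneath us; start over on the next node */
		me->current_node = MAX_NUMNODES;
		me->il_count = 0;
	}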

> + me->il_count++;
> + me->current_node = next;
> + if (me->il_count == node_weight) {
> + me->current_node = MAX_NUMNODES;
> + me->il_count = 0;
> + }
> + }
> +
> +set_il_prev:
> if (next < MAX_NUMNODES)
> me->il_prev = next;
> return next;
> @@ -1966,9 +2034,10 @@ unsigned int mempolicy_slab_node(void)
> static unsigned offset_il_node(struct mempolicy *pol, unsigned long n)
> {
> nodemask_t nodemask = pol->nodes;
> - unsigned int target, nnodes;
> - int i;
> - int nid;
> + unsigned int target, nnodes, vnnodes = 0;
> + unsigned short node_weight = 0;
> + int nid, vtarget, i;
> +
> /*
> * The barrier will stabilize the nodemask in a register or on
> * the stack so that it will stop changing under the code.
> @@ -1981,7 +2050,33 @@ static unsigned offset_il_node(struct mempolicy *pol, unsigned long n)
> nnodes = nodes_weight(nodemask);
> if (!nnodes)
> return numa_node_id();
> - target = (unsigned int)n % nnodes;
> +
> + /*
> + * Calculate the virtual target for @n in a nodelist that is scaled
> + * with interleave weights....
> + */
> + for_each_node_mask(nid, nodemask) {
> + node_weight = node_interleave_weight(nid, nodemask);
> + if (!node_weight)
> + continue;
> + vnnodes += node_weight;
> + }

Should this be precomputed? This seems like a considerable amount of work
in the page allocation hot path, especially on systems with many NUMA nodes.
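
For example (sketch only, the total_il_weight field is made up), the total
could be cached when the policy is created or rebound rather than recomputed
per allocation:

	/* in mpol_new()/mpol_rebind_policy(), not in offset_il_node() */
	pol->total_il_weight = 0;
	for_each_node_mask(nid, pol->nodes)
		pol->total_il_weight += node_interleave_weight(nid, pol->nodes);

offset_il_node() would then only need "n % pol->total_il_weight" plus a
single walk to map the virtual index back to a physical node.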

> + if (!vnnodes)
> + return numa_node_id();
> + vtarget = (int)((unsigned int)n % vnnodes);
> +
> + /* ...then map it back to the physical nodelist */
> + target = 0;
> + for_each_node_mask(nid, nodemask) {
> + node_weight = node_interleave_weight(nid, nodemask);
> + if (!node_weight)
> + continue;
> + vtarget -= node_weight;
> + if (vtarget < 0)
> + break;
> + target++;
> + }

See above - now every node's weight is computed a second time on the same
allocation path.

> +
> nid = first_node(nodemask);
> for (i = 0; i < target; i++)
> nid = next_node(nid, nodemask);
> --
> 2.39.3
>