[PATCH 2/2] mm: mempolicy: Interleave policy for tiered memory nodes

From: Ravi Jonnalagadda
Date: Wed Sep 27 2023 - 05:51:05 EST


From: Srinivasulu Thanneeru <sthanneeru@xxxxxxxxxx>

The existing interleave policy spreads pages out evenly across a set of
specified nodes, i.e. 1:1 interleave. Upcoming tiered memory systems
have CPU-less memory nodes with different peak bandwidth and latency
characteristics. On such systems, we want to use the additional
bandwidth provided by low-tier memory for bandwidth-intensive
applications, but the default 1:1 interleave can lead to a suboptimal
bandwidth distribution.

Introduce an interleave policy for multi-tier systems that is based on
per-tier interleave weights: pages are allocated from the nodes of a
tier in proportion to that tier's weight.

For instance, weights of 50:30:20 for tiers 0, 1, and 2 lead to a
50%/30%/20% traffic breakdown across the three tiers.
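
To illustrate the intended distribution, here is a small standalone
userspace sketch (not part of this patch; the tier count, weights and
page count are made-up example values). It simulates the weighted
round-robin across three tiers with weights 50:30:20 and prints each
tier's share of 1000 page allocations:

/* Illustrative userspace simulation of weighted tier interleave. */
#include <stdio.h>

#define NTIERS		3
#define NR_PAGES	1000

int main(void)
{
	/* Hypothetical per-tier interleave weights (50:30:20). */
	const unsigned short weight[NTIERS] = { 50, 30, 20 };
	unsigned long hits[NTIERS] = { 0 };
	unsigned int tier = 0, used = 0;

	/* Allocate from a tier until its weight is used up, then advance. */
	for (int page = 0; page < NR_PAGES; page++) {
		hits[tier]++;
		if (++used >= weight[tier]) {
			used = 0;
			tier = (tier + 1) % NTIERS;
		}
	}

	/* Prints 50.0%, 30.0% and 20.0% for tiers 0, 1 and 2. */
	for (int t = 0; t < NTIERS; t++)
		printf("tier %d: %lu pages (%.1f%%)\n",
		       t, hits[t], hits[t] * 100.0 / NR_PAGES);
	return 0;
}

In the patch itself, a tier's weight is additionally split evenly among
the tier's nodes that are present in the policy nodemask (see
node_interleave_weight() below), so the per-tier shares above hold
regardless of how many nodes each tier contains.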

Signed-off-by: Srinivasulu Thanneeru <sthanneeru@xxxxxxxxxx>
Co-authored-by: Ravi Jonnalagadda <ravis.opensrc@xxxxxxxxxx>
---
include/linux/memory-tiers.h | 25 +++++++-
include/linux/sched.h | 2 +
mm/memory-tiers.c | 31 ++--------
mm/mempolicy.c | 107 +++++++++++++++++++++++++++++++++--
4 files changed, 132 insertions(+), 33 deletions(-)

diff --git a/include/linux/memory-tiers.h b/include/linux/memory-tiers.h
index c62d286749d0..74be39cb56c4 100644
--- a/include/linux/memory-tiers.h
+++ b/include/linux/memory-tiers.h
@@ -2,6 +2,7 @@
#ifndef _LINUX_MEMORY_TIERS_H
#define _LINUX_MEMORY_TIERS_H

+#include <linux/device.h>
#include <linux/types.h>
#include <linux/nodemask.h>
#include <linux/kref.h>
@@ -21,7 +22,27 @@

#define MAX_TIER_INTERLEAVE_WEIGHT 100

-struct memory_tier;
+struct memory_tier {
+ /* hierarchy of memory tiers */
+ struct list_head list;
+ /* list of all memory types part of this tier */
+ struct list_head memory_types;
+ /*
+ * By default all tiers will have weight as 1, which means they
+ * follow default standard allocation.
+ */
+ unsigned short interleave_weight;
+ /*
+ * start value of abstract distance. memory tier maps
+ * an abstract distance range,
+ * adistance_start .. adistance_start + MEMTIER_CHUNK_SIZE
+ */
+ int adistance_start;
+ struct device dev;
+ /* All the nodes that are part of all the lower memory tiers. */
+ nodemask_t lower_tier_mask;
+};
+
struct memory_dev_type {
/* list of memory types that are part of same tier as this type */
struct list_head tier_sibiling;
@@ -38,6 +59,8 @@ struct memory_dev_type *alloc_memory_type(int adistance);
void put_memory_type(struct memory_dev_type *memtype);
void init_node_memory_type(int node, struct memory_dev_type *default_type);
void clear_node_memory_type(int node, struct memory_dev_type *memtype);
+struct memory_tier *node_get_memory_tier(int node);
+nodemask_t get_memtier_nodemask(struct memory_tier *memtier);
#ifdef CONFIG_MIGRATION
int next_demotion_node(int node);
void node_get_allowed_targets(pg_data_t *pgdat, nodemask_t *targets);
diff --git a/include/linux/sched.h b/include/linux/sched.h
index 77f01ac385f7..07ea837c3afb 100644
--- a/include/linux/sched.h
+++ b/include/linux/sched.h
@@ -1252,7 +1252,9 @@ struct task_struct {
/* Protected by alloc_lock: */
struct mempolicy *mempolicy;
short il_prev;
+ unsigned short il_count;
short pref_node_fork;
+ unsigned int current_node;
#endif
#ifdef CONFIG_NUMA_BALANCING
int numa_scan_seq;
diff --git a/mm/memory-tiers.c b/mm/memory-tiers.c
index 7e06c9e0fa41..5e2ddc9f994a 100644
--- a/mm/memory-tiers.c
+++ b/mm/memory-tiers.c
@@ -8,27 +8,6 @@

#include "internal.h"

-struct memory_tier {
- /* hierarchy of memory tiers */
- struct list_head list;
- /* list of all memory types part of this tier */
- struct list_head memory_types;
- /*
- * By default all tiers will have weight as 1, which means they
- * follow default standard allocation.
- */
- unsigned short interleave_weight;
- /*
- * start value of abstract distance. memory tier maps
- * an abstract distance range,
- * adistance_start .. adistance_start + MEMTIER_CHUNK_SIZE
- */
- int adistance_start;
- struct device dev;
- /* All the nodes that are part of all the lower memory tiers. */
- nodemask_t lower_tier_mask;
-};
-
struct demotion_nodes {
nodemask_t preferred;
};
@@ -115,7 +94,7 @@ static inline struct memory_tier *to_memory_tier(struct device *device)
return container_of(device, struct memory_tier, dev);
}

-static __always_inline nodemask_t get_memtier_nodemask(struct memory_tier *memtier)
+nodemask_t get_memtier_nodemask(struct memory_tier *memtier)
{
nodemask_t nodes = NODE_MASK_NONE;
struct memory_dev_type *memtype;
@@ -264,7 +243,7 @@ static struct memory_tier *find_create_memory_tier(struct memory_dev_type *memty
return memtier;
}

-static struct memory_tier *__node_get_memory_tier(int node)
+struct memory_tier *node_get_memory_tier(int node)
{
pg_data_t *pgdat;

@@ -380,7 +359,7 @@ static void disable_all_demotion_targets(void)
* We are holding memory_tier_lock, it is safe
* to access pgda->memtier.
*/
- memtier = __node_get_memory_tier(node);
+ memtier = node_get_memory_tier(node);
if (memtier)
memtier->lower_tier_mask = NODE_MASK_NONE;
}
@@ -417,7 +396,7 @@ static void establish_demotion_targets(void)
best_distance = -1;
nd = &node_demotion[node];

- memtier = __node_get_memory_tier(node);
+ memtier = node_get_memory_tier(node);
if (!memtier || list_is_last(&memtier->list, &memory_tiers))
continue;
/*
@@ -562,7 +541,7 @@ static bool clear_node_memory_tier(int node)
* This also enables us to free the destroyed memory tier
* with kfree instead of kfree_rcu
*/
- memtier = __node_get_memory_tier(node);
+ memtier = node_get_memory_tier(node);
if (memtier) {
struct memory_dev_type *memtype;

diff --git a/mm/mempolicy.c b/mm/mempolicy.c
index 42b5567e3773..4f80c6ee1176 100644
--- a/mm/mempolicy.c
+++ b/mm/mempolicy.c
@@ -100,6 +100,8 @@
#include <linux/ctype.h>
#include <linux/mm_inline.h>
#include <linux/mmu_notifier.h>
+#include <linux/memory-tiers.h>
+#include <linux/nodemask.h>
#include <linux/printk.h>
#include <linux/swapops.h>

@@ -882,8 +884,11 @@ static long do_set_mempolicy(unsigned short mode, unsigned short flags,

old = current->mempolicy;
current->mempolicy = new;
- if (new && new->mode == MPOL_INTERLEAVE)
+ if (new && new->mode == MPOL_INTERLEAVE) {
current->il_prev = MAX_NUMNODES-1;
+ current->il_count = 0;
+ current->current_node = MAX_NUMNODES;
+ }
task_unlock(current);
mpol_put(old);
ret = 0;
@@ -1899,13 +1904,76 @@ static int policy_node(gfp_t gfp, struct mempolicy *policy, int nd)
return nd;
}

+/* Derive a node's interleave weight from its tier's weight */
+static unsigned short node_interleave_weight(int nid, nodemask_t pol_nodemask)
+{
+ struct memory_tier *memtier;
+ nodemask_t tier_nodes, tier_and_pol;
+ unsigned short avrg_weight = 0;
+ int node, nnodes, remainder;
+
+ memtier = node_get_memory_tier(nid);
+
+ if (!memtier)
+ return 0;
+
+ tier_nodes = get_memtier_nodemask(memtier);
+ nodes_and(tier_and_pol, tier_nodes, pol_nodemask);
+ nnodes = nodes_weight(tier_and_pol);
+ if (!nnodes)
+ return 0;
+
+ avrg_weight = memtier->interleave_weight / nnodes;
+ /* Set the minimum node weight to 1 so that at least one page
+ * is allocated from the node.
+ */
+ if (!avrg_weight)
+ return 1;
+
+ remainder = memtier->interleave_weight % nnodes;
+ if (remainder) {
+ for_each_node_mask(node, tier_and_pol) {
+ /* Increment the target node's weight by 1 if it falls
+ * within the remaining weight 'remainder'.
+ */
+ if (node == nid) {
+ if (remainder > 0)
+ avrg_weight++;
+ break;
+ }
+ remainder--;
+ }
+ }
+ return avrg_weight;
+}
+
/* Do dynamic interleaving for a process */
static unsigned interleave_nodes(struct mempolicy *policy)
{
unsigned next;
struct task_struct *me = current;
+ unsigned short node_weight = 0;

- next = next_node_in(me->il_prev, policy->nodes);
+ /* Select either the current node or the next node from the nodelist,
+ * based on the available tier interleave weight.
+ */
+ if (me->current_node == MAX_NUMNODES)
+ next = next_node_in(me->il_prev, policy->nodes);
+ else
+ next = me->current_node;
+ node_weight = node_interleave_weight(next, policy->nodes);
+ if (!node_weight)
+ goto set_il_prev;
+ if (me->il_count < node_weight) {
+ me->il_count++;
+ me->current_node = next;
+ if (me->il_count == node_weight) {
+ me->current_node = MAX_NUMNODES;
+ me->il_count = 0;
+ }
+ }
+
+set_il_prev:
if (next < MAX_NUMNODES)
me->il_prev = next;
return next;
@@ -1966,9 +2034,10 @@ unsigned int mempolicy_slab_node(void)
static unsigned offset_il_node(struct mempolicy *pol, unsigned long n)
{
nodemask_t nodemask = pol->nodes;
- unsigned int target, nnodes;
- int i;
- int nid;
+ unsigned int target, nnodes, vnnodes = 0;
+ unsigned short node_weight = 0;
+ int nid, vtarget, i;
+
/*
* The barrier will stabilize the nodemask in a register or on
* the stack so that it will stop changing under the code.
@@ -1981,7 +2050,33 @@ static unsigned offset_il_node(struct mempolicy *pol, unsigned long n)
nnodes = nodes_weight(nodemask);
if (!nnodes)
return numa_node_id();
- target = (unsigned int)n % nnodes;
+
+ /*
+ * Calculate the virtual target for @n in a nodelist that is scaled
+ * with interleave weights....
+ */
+ for_each_node_mask(nid, nodemask) {
+ node_weight = node_interleave_weight(nid, nodemask);
+ if (!node_weight)
+ continue;
+ vnnodes += node_weight;
+ }
+ if (!vnnodes)
+ return numa_node_id();
+ vtarget = (int)((unsigned int)n % vnnodes);
+
+ /* ...then map it back to the physical nodelist */
+ target = 0;
+ for_each_node_mask(nid, nodemask) {
+ node_weight = node_interleave_weight(nid, nodemask);
+ if (!node_weight)
+ continue;
+ vtarget -= node_weight;
+ if (vtarget < 0)
+ break;
+ target++;
+ }
+
nid = first_node(nodemask);
for (i = 0; i < target; i++)
nid = next_node(nid, nodemask);
--
2.39.3