[RFC PATCH v2 4/4] mm/mempolicy: implement a weighted-interleave

From: Gregory Price
Date: Mon Oct 02 2023 - 20:22:28 EST


The weighted-interleave mempolicy implements weights per-node
which are used to distribute memory while interleaving.

For example:
nodes: 0,1,2
weights: 5,3,2

Over 10 consecutive allocations, the following nodes will be selected:
[0,0,0,0,0,1,1,1,2,2]

In this example there is a 50%/30%/20% distribution of memory across
the enabled nodes.

If a node is enabled, the minimum weight is expected to be 0. If an
enabled node ends up with a weight of 0 (as can happen if weights
are being recalculated due to a cgroup mask update), a minimum
of 1 is applied during the interleave mechanism.

Signed-off-by: Gregory Price <gregory.price@xxxxxxxxxxxx>
---
include/linux/mempolicy.h | 6 +
include/uapi/linux/mempolicy.h | 6 +
mm/mempolicy.c | 261 ++++++++++++++++++++++++++++++++-
3 files changed, 269 insertions(+), 4 deletions(-)

diff --git a/include/linux/mempolicy.h b/include/linux/mempolicy.h
index 8f918488c61c..8763e536d4a2 100644
--- a/include/linux/mempolicy.h
+++ b/include/linux/mempolicy.h
@@ -54,6 +54,12 @@ struct mempolicy {
int weight;
int count;
} pil;
+ /* weighted interleave */
+ struct {
+ unsigned int il_weight;
+ unsigned char cur_weight;
+ unsigned char weights[MAX_NUMNODES];
+ } wil;
};

union {
diff --git a/include/uapi/linux/mempolicy.h b/include/uapi/linux/mempolicy.h
index 41c35f404c5e..913ca9bf9af7 100644
--- a/include/uapi/linux/mempolicy.h
+++ b/include/uapi/linux/mempolicy.h
@@ -25,6 +25,7 @@ enum {
MPOL_PREFERRED_MANY,
MPOL_LEGACY, /* set_mempolicy limited to above modes */
MPOL_PREFERRED_INTERLEAVE,
+ MPOL_WEIGHTED_INTERLEAVE,
MPOL_MAX, /* always last member of enum */
};

@@ -58,6 +59,11 @@ struct mempolicy_args {
unsigned long weight; /* get and set */
unsigned long next_node; /* get only */
} pil;
+ /* Weighted interleave */
+ struct {
+ unsigned long next_node; /* get only */
+ unsigned char *weights; /* get and set */
+ } wil;
};
};

diff --git a/mm/mempolicy.c b/mm/mempolicy.c
index 6374312cef5f..92be74d4c431 100644
--- a/mm/mempolicy.c
+++ b/mm/mempolicy.c
@@ -195,11 +195,43 @@ static void mpol_relative_nodemask(nodemask_t *ret, const nodemask_t *orig,
nodes_onto(*ret, tmp, *rel);
}

+static void mpol_recalculate_weights(struct mempolicy *pol)
+{
+ unsigned int il_weight = 0;
+ int node;
+
+ /* Recalculate weights to ensure minimum node weight */
+ for (node = 0; node < MAX_NUMNODES; node++) {
+ if (!node_isset(node, pol->nodes) && pol->wil.weights[node]) {
+ /* If node is not set, weight should be 0 */
+ pol->wil.weights[node] = 0;
+ } else if (!pol->wil.weights[node]) {
+ /* If node is set, weight should be minimum of 1 */
+ pol->wil.weights[node] = 1;
+ pol->wil.il_weight += 1;
+ il_weight += 1;
+ } else {
+ /* Otherwise, keep the existing weight */
+ il_weight += pol->wil.weights[node];
+ }
+ }
+ pol->wil.il_weight = il_weight;
+ /*
+ * It's possible an allocation has been occurring at this point
+ * force it to go to the next node, since we just changed weights
+ */
+ pol->wil.cur_weight = 0;
+}
+
static int mpol_new_nodemask(struct mempolicy *pol, const nodemask_t *nodes)
{
if (nodes_empty(*nodes))
return -EINVAL;
pol->nodes = *nodes;
+
+ if (pol->mode == MPOL_WEIGHTED_INTERLEAVE)
+ mpol_recalculate_weights(pol);
+
return 0;
}

@@ -334,6 +366,10 @@ static void mpol_rebind_nodemask(struct mempolicy *pol, const nodemask_t *nodes)
tmp = *nodes;

pol->nodes = tmp;
+
+ /* After a change to the nodemask, weights must be recalculated */
+ if (pol->mode == MPOL_WEIGHTED_INTERLEAVE)
+ mpol_recalculate_weights(pol);
}

static void mpol_rebind_preferred(struct mempolicy *pol,
@@ -403,6 +439,10 @@ static const struct mempolicy_operations mpol_ops[MPOL_MAX] = {
.create = mpol_new_nodemask,
.rebind = mpol_rebind_nodemask,
},
+ [MPOL_WEIGHTED_INTERLEAVE] = {
+ .create = mpol_new_nodemask,
+ .rebind = mpol_rebind_nodemask,
+ },
[MPOL_PREFERRED] = {
.create = mpol_new_preferred,
.rebind = mpol_rebind_preferred,
@@ -878,8 +918,10 @@ static long replace_mempolicy(struct mempolicy *new, nodemask_t *nodes)
old = current->mempolicy;
current->mempolicy = new;
if (new && (new->mode == MPOL_INTERLEAVE ||
- new->mode == MPOL_PREFERRED_INTERLEAVE))
+ new->mode == MPOL_PREFERRED_INTERLEAVE ||
+ new->mode == MPOL_WEIGHTED_INTERLEAVE))
current->il_prev = MAX_NUMNODES-1;
+
out:
task_unlock(current);
mpol_put(old);
@@ -921,6 +963,7 @@ static void get_policy_nodemask(struct mempolicy *p, nodemask_t *nodes)
case MPOL_BIND:
case MPOL_INTERLEAVE:
case MPOL_PREFERRED_INTERLEAVE:
+ case MPOL_WEIGHTED_INTERLEAVE:
case MPOL_PREFERRED:
case MPOL_PREFERRED_MANY:
*nodes = p->nodes;
@@ -1632,6 +1675,56 @@ static long do_set_preferred_interleave(struct mempolicy_args *args,
return 0;
}

+static long do_set_weighted_interleave(struct mempolicy_args *args,
+ struct mempolicy *new,
+ nodemask_t *nodes)
+{
+ unsigned char weight;
+ unsigned char *weights;
+ int node;
+ int ret = 0;
+
+ /* Weighted interleave cannot be done with no nodemask */
+ if (nodes_empty(*nodes))
+ return -EINVAL;
+
+ /* Weighted interleave requires a set of weights */
+ if (!args->wil.weights)
+ return -EINVAL;
+
+ weights = kmalloc(MAX_NUMNODES, GFP_KERNEL);
+ if (!weights)
+ return -ENOMEM;
+
+ ret = copy_from_user(weights, args->wil.weights, MAX_NUMNODES);
+ if (ret) {
+ ret = -EFAULT;
+ goto weights_out;
+ }
+
+ new->wil.cur_weight = 0;
+ new->wil.il_weight = 0;
+ memset(new->wil.weights, 0, sizeof(new->wil.weights));
+
+ /* Weights for set nodes cannot be 0 */
+ node = first_node(*nodes);
+ while (node != MAX_NUMNODES) {
+ weight = weights[node];
+ if (!weight) {
+ ret = -EINVAL;
+ goto weights_out;
+ }
+ /* policy creation initializes total to nr_nodes, adjust it */
+ new->wil.il_weight += weight;
+ new->wil.weights[node] = weight;
+ node = next_node(node, *nodes);
+ }
+
+weights_out:
+ kfree(weights);
+ return ret;
+}
+
static long do_set_mempolicy2(struct mempolicy_args *args)
{
struct mempolicy *new = NULL;
@@ -1656,6 +1749,9 @@ static long do_set_mempolicy2(struct mempolicy_args *args)
case MPOL_PREFERRED_INTERLEAVE:
err = do_set_preferred_interleave(args, new, &nodes);
break;
+ case MPOL_WEIGHTED_INTERLEAVE:
+ err = do_set_weighted_interleave(args, new, &nodes);
+ break;
default:
BUG();
}
@@ -1799,6 +1895,12 @@ static long do_get_mempolicy2(struct mempolicy_args *kargs)
kargs->pil.weight = pol->pil.weight;
rc = 0;
break;
+ case MPOL_WEIGHTED_INTERLEAVE:
+ kargs->wil.next_node = next_node_in(current->il_prev,
+ pol->nodes);
+ rc = copy_to_user(kargs->wil.weights, pol->wil.weights,
+ MAX_NUMNODES);
+ break;
default:
BUG();
}
@@ -2160,6 +2262,27 @@ static unsigned int preferred_interleave_nodes(struct mempolicy *policy)
return next;
}

+static unsigned int weighted_interleave_nodes(struct mempolicy *policy)
+{
+ unsigned int next;
+ unsigned char next_weight;
+ struct task_struct *me = current;
+
+ /* When weight reaches 0, we're on a new node, reset the weight */
+ next = next_node_in(me->il_prev, policy->nodes);
+ if (!policy->wil.cur_weight) {
+ /* If the node is set, at least 1 allocation is required */
+ next_weight = policy->wil.weights[next];
+ policy->wil.cur_weight = next_weight ? next_weight : 1;
+ }
+
+ policy->wil.cur_weight--;
+ if (next < MAX_NUMNODES && !policy->wil.cur_weight)
+ me->il_prev = next;
+
+ return next;
+}
+
/* Do dynamic interleaving for a process */
static unsigned interleave_nodes(struct mempolicy *policy)
{
@@ -2168,6 +2291,8 @@ static unsigned interleave_nodes(struct mempolicy *policy)

if (policy->mode == MPOL_PREFERRED_INTERLEAVE)
return preferred_interleave_nodes(policy);
+ else if (policy->mode == MPOL_WEIGHTED_INTERLEAVE)
+ return weighted_interleave_nodes(policy);

next = next_node_in(me->il_prev, policy->nodes);
if (next < MAX_NUMNODES)
@@ -2197,6 +2322,7 @@ unsigned int mempolicy_slab_node(void)

case MPOL_INTERLEAVE:
case MPOL_PREFERRED_INTERLEAVE:
+ case MPOL_WEIGHTED_INTERLEAVE:
return interleave_nodes(policy);

case MPOL_BIND:
@@ -2273,6 +2399,40 @@ static unsigned int offset_pil_node(struct mempolicy *pol, unsigned long n)
return nid;
}

+static unsigned int offset_wil_node(struct mempolicy *pol, unsigned long n)
+{
+ nodemask_t nodemask = pol->nodes;
+ unsigned int target, nnodes;
+ unsigned char weight;
+ int nid;
+
+ /*
+ * The barrier will stabilize the nodemask in a register or on
+ * the stack so that it will stop changing under the code.
+ *
+ * Between first_node() and next_node(), pol->nodes could be changed
+ * by other threads. So we put pol->nodes in a local stack.
+ */
+ barrier();
+
+ nnodes = nodes_weight(nodemask);
+ if (!nnodes)
+ return numa_node_id();
+ target = (unsigned int)n % pol->wil.il_weight;
+ nid = first_node(nodemask);
+ while (target) {
+ weight = pol->wil.weights[nid];
+ /* If weights are being recaculated, revert to interleave */
+ if (!weight)
+ weight = 1;
+ if (target < weight)
+ break;
+ target -= weight;
+ nid = next_node_in(nid, nodemask);
+ }
+ return nid;
+}
+
/*
* Do static interleaving for a VMA with known offset @n. Returns the n'th
* node in pol->nodes (starting from n=0), wrapping around if n exceeds the
@@ -2287,6 +2447,8 @@ static unsigned offset_il_node(struct mempolicy *pol, unsigned long n)

if (pol->mode == MPOL_PREFERRED_INTERLEAVE)
return offset_pil_node(pol, n);
+ else if (pol->mode == MPOL_WEIGHTED_INTERLEAVE)
+ return offset_wil_node(pol, n);

nodemask = pol->nodes;

@@ -2358,7 +2520,8 @@ int huge_node(struct vm_area_struct *vma, unsigned long addr, gfp_t gfp_flags,
mode = (*mpol)->mode;

if (unlikely(mode == MPOL_INTERLEAVE) ||
- unlikely(mode == MPOL_PREFERRED_INTERLEAVE)) {
+ unlikely(mode == MPOL_PREFERRED_INTERLEAVE) ||
+ unlikely(mode == MPOL_WEIGHTED_INTERLEAVE)) {
nid = interleave_nid(*mpol, vma, addr,
huge_page_shift(hstate_vma(vma)));
} else {
@@ -2400,6 +2563,7 @@ bool init_nodemask_of_mempolicy(nodemask_t *mask)
case MPOL_BIND:
case MPOL_INTERLEAVE:
case MPOL_PREFERRED_INTERLEAVE:
+ case MPOL_WEIGHTED_INTERLEAVE:
*mask = mempolicy->nodes;
break;

@@ -2511,7 +2675,8 @@ struct folio *vma_alloc_folio(gfp_t gfp, int order, struct vm_area_struct *vma,
pol = get_vma_policy(vma, addr);

if (pol->mode == MPOL_INTERLEAVE ||
- pol->mode == MPOL_PREFERRED_INTERLEAVE) {
+ pol->mode == MPOL_PREFERRED_INTERLEAVE ||
+ pol->mode == MPOL_WEIGHTED_INTERLEAVE) {
struct page *page;
unsigned nid;

@@ -2614,7 +2779,8 @@ struct page *alloc_pages(gfp_t gfp, unsigned order)
* nor system default_policy
*/
if (pol->mode == MPOL_INTERLEAVE ||
- pol->mode == MPOL_PREFERRED_INTERLEAVE)
+ pol->mode == MPOL_PREFERRED_INTERLEAVE ||
+ pol->mode == MPOL_WEIGHTED_INTERLEAVE)
page = alloc_page_interleave(gfp, order, interleave_nodes(pol));
else if (pol->mode == MPOL_PREFERRED_MANY)
page = alloc_pages_preferred_many(gfp, order,
@@ -2737,6 +2903,84 @@ static unsigned long alloc_pages_bulk_array_pil(gfp_t gfp,
return allocated;
}

+static unsigned long alloc_pages_bulk_array_weighted_interleave(gfp_t gfp,
+ struct mempolicy *pol, unsigned long nr_pages,
+ struct page **page_array)
+{
+ struct task_struct *me = current;
+ unsigned long total_allocated = 0;
+ unsigned long nr_allocated;
+ unsigned long rounds;
+ unsigned long node_pages, delta;
+ unsigned char weight;
+ int nnodes, node, prev_node;
+ int i;
+
+ nnodes = nodes_weight(pol->nodes);
+ /* Continue allocating from most recent node and adjust the nr_pages */
+ if (pol->wil.cur_weight) {
+ node = next_node_in(me->il_prev, pol->nodes);
+ node_pages = pol->wil.cur_weight;
+ nr_allocated = __alloc_pages_bulk(gfp, node, NULL, node_pages,
+ NULL, page_array);
+ page_array += nr_allocated;
+ total_allocated += nr_allocated;
+ /* if that's all the pages, no need to interleave */
+ if (nr_pages <= pol->wil.cur_weight) {
+ pol->wil.cur_weight -= nr_pages;
+ return total_allocated;
+ }
+ /* Otherwise we adjust nr_pages down, and continue from there */
+ nr_pages -= pol->wil.cur_weight;
+ pol->wil.cur_weight = 0;
+ prev_node = node;
+ }
+
+ /* Now we can continue allocating from this point */
+ rounds = nr_pages / pol->wil.il_weight;
+ delta = nr_pages % pol->wil.il_weight;
+ for (i = 0; i < nnodes; i++) {
+ node = next_node_in(prev_node, pol->nodes);
+ weight = pol->wil.weights[node];
+ node_pages = weight * rounds;
+ if (delta) {
+ if (delta > weight) {
+ node_pages += weight;
+ delta -= weight;
+ } else {
+ node_pages += delta;
+ delta = 0;
+ }
+ }
+ /* We may not make it all the way around */
+ if (!node_pages)
+ break;
+ nr_allocated = __alloc_pages_bulk(gfp, node, NULL, node_pages,
+ NULL, page_array);
+ page_array += nr_allocated;
+ total_allocated += nr_allocated;
+ prev_node = node;
+ }
+
+ /*
+ * Finally, we need to update me->il_prev and pol->wil.cur_weight
+ * if there were overflow pages, but not equivalent to the node
+ * weight, set the cur_weight to node_weight - delta and the
+ * me->il_prev to the previous node. Otherwise if it was perfect
+ * we can simply set il_prev to node and cur_weight to 0
+ */
+ delta %= weight;
+ if (node_pages) {
+ me->il_prev = prev_node;
+ pol->wil.cur_weight = pol->wil.weights[node] - node_pages;
+ } else {
+ me->il_prev = node;
+ pol->wil.cur_weight = 0;
+ }
+
+ return total_allocated;
+}
+
static unsigned long alloc_pages_bulk_array_preferred_many(gfp_t gfp, int nid,
struct mempolicy *pol, unsigned long nr_pages,
struct page **page_array)
@@ -2779,6 +3023,11 @@ unsigned long alloc_pages_bulk_array_mempolicy(gfp_t gfp,
return alloc_pages_bulk_array_pil(gfp, pol, nr_pages,
page_array);

+ if (pol->mode == MPOL_WEIGHTED_INTERLEAVE)
+ return alloc_pages_bulk_array_weighted_interleave(gfp, pol,
+ nr_pages,
+ page_array);
+
if (pol->mode == MPOL_PREFERRED_MANY)
return alloc_pages_bulk_array_preferred_many(gfp,
numa_node_id(), pol, nr_pages, page_array);
@@ -2852,6 +3101,7 @@ bool __mpol_equal(struct mempolicy *a, struct mempolicy *b)
case MPOL_BIND:
case MPOL_INTERLEAVE:
case MPOL_PREFERRED_INTERLEAVE:
+ case MPOL_WEIGHTED_INTERLEAVE:
case MPOL_PREFERRED:
case MPOL_PREFERRED_MANY:
return !!nodes_equal(a->nodes, b->nodes);
@@ -2989,6 +3239,7 @@ int mpol_misplaced(struct page *page, struct vm_area_struct *vma, unsigned long
switch (pol->mode) {
case MPOL_INTERLEAVE:
case MPOL_PREFERRED_INTERLEAVE:
+ case MPOL_WEIGHTED_INTERLEAVE:
pgoff = vma->vm_pgoff;
pgoff += (addr - vma->vm_start) >> PAGE_SHIFT;
polnid = offset_il_node(pol, pgoff);
@@ -3377,6 +3628,7 @@ static const char * const policy_modes[] =
[MPOL_BIND] = "bind",
[MPOL_INTERLEAVE] = "interleave",
[MPOL_PREFERRED_INTERLEAVE] = "preferred interleave",
+ [MPOL_WEIGHTED_INTERLEAVE] = "weighted interleave",
[MPOL_LOCAL] = "local",
[MPOL_PREFERRED_MANY] = "prefer (many)",
};
@@ -3548,6 +3800,7 @@ void mpol_to_str(char *buffer, int maxlen, struct mempolicy *pol)
case MPOL_BIND:
case MPOL_INTERLEAVE:
case MPOL_PREFERRED_INTERLEAVE:
+ case MPOL_WEIGHTED_INTERLEAVE:
nodes = pol->nodes;
break;
default:
--
2.39.1