[PATCH v6 12/12] mm/mempolicy: extend mempolicy2 and mbind2 to support weighted interleave

From: Gregory Price
Date: Wed Jan 03 2024 - 17:47:39 EST


Extend mempolicy2 and mbind2 to support weighted interleave, and
demonstrate the extensibility of the mpol_param structure.

To support weighted interleave we add interleave weight fields to the
following structures:

Kernel Internal: (include/linux/mempolicy.h)
struct mempolicy {
/* task-local weights to apply to weighted interleave */
u8 weights[MAX_NUMNODES];
}
struct mempolicy_param {
/* Optional: interleave weights for MPOL_WEIGHTED_INTERLEAVE */
u8 *il_weights; /* of size MAX_NUMNODES */
}

UAPI: (/include/uapi/linux/mempolicy.h)
struct mpol_param {
/* Optional: interleave weights for MPOL_WEIGHTED_INTERLEAVE */
__aligned_u64 il_weights; /* pointer to a u8 buffer of size pol_maxnodes */
}

The minimum weight of a node is always 1. If the user desires 0
allocations on a node, the node should be removed from the nodemask.

If the user does not provide weights (il_weights == NULL), global
weights will be used during allocation. Changes made to global weights
will be reflected in future allocations.

If the user provides weights and a weight is set to 0, the weight for
that node will be initialized to the global value.

If a user provides weights and a node is not set in the node mask,
the weight for that node will be set to the globally defined weight.
This is so a reasonable default value can be expected if the nodemask
changes (e.g. cgroups causes a migration or mems_allowed change).

Local weights are never updated when a global weight is updated.

Examples:

global weights: [4,4,2,2]
Set: Nodes-0,1,2,3 Weights: NULL
[global weights] are used.

Set: Nodes-0,1,2,3 Weights: [1,2,3,4]
local_weights = [1,2,3,4]

Set: Nodes-0,2 Weights: [2,0,2,0]
local_weights = [2,4,2,2]

Basic logic during allocation is as follows:

weight = pol->wil.weights[node]
/* if no local weight, use sysfs weight */
if (!weight)
    weight = iw_table[node]
/* if no sysfs weight, use system default */
if (!weight)
    weight = default_iw_table[node]

To simplify creation and duplication of mempolicies, the weights are
added as an array directly within struct mempolicy. This allows the
existing logic in __mpol_dup to copy the weights without additional
allocations:

if (old == current->mempolicy) {
task_lock(current);
*new = *old;
task_unlock(current);
} else
*new = *old;

Suggested-by: Rakie Kim <rakie.kim@xxxxxx>
Suggested-by: Hyeongtak Ji <hyeongtak.ji@xxxxxx>
Suggested-by: Honggyu Kim <honggyu.kim@xxxxxx>
Suggested-by: Vinicius Tavares Petrucci <vtavarespetr@xxxxxxxxxx>
Suggested-by: Huang Ying <ying.huang@xxxxxxxxx>
Signed-off-by: Gregory Price <gregory.price@xxxxxxxxxxxx>
Co-developed-by: Rakie Kim <rakie.kim@xxxxxx>
Signed-off-by: Rakie Kim <rakie.kim@xxxxxx>
Co-developed-by: Hyeongtak Ji <hyeongtak.ji@xxxxxx>
Signed-off-by: Hyeongtak Ji <hyeongtak.ji@xxxxxx>
Co-developed-by: Honggyu Kim <honggyu.kim@xxxxxx>
Signed-off-by: Honggyu Kim <honggyu.kim@xxxxxx>
Co-developed-by: Vinicius Tavares Petrucci <vtavarespetr@xxxxxxxxxx>
Signed-off-by: Vinicius Tavares Petrucci <vtavarespetr@xxxxxxxxxx>
---
.../admin-guide/mm/numa_memory_policy.rst | 12 ++
include/linux/mempolicy.h | 2 +
include/uapi/linux/mempolicy.h | 1 +
mm/mempolicy.c | 134 ++++++++++++++++--
4 files changed, 141 insertions(+), 8 deletions(-)

diff --git a/Documentation/admin-guide/mm/numa_memory_policy.rst b/Documentation/admin-guide/mm/numa_memory_policy.rst
index 66a778d58899..620b54ff2cef 100644
--- a/Documentation/admin-guide/mm/numa_memory_policy.rst
+++ b/Documentation/admin-guide/mm/numa_memory_policy.rst
@@ -254,11 +254,22 @@ MPOL_WEIGHTED_INTERLEAVE
This mode operates the same as MPOL_INTERLEAVE, except that
interleaving behavior is executed based on weights set in
/sys/kernel/mm/mempolicy/weighted_interleave/
+ when configured to utilize global weights, or based on task-local
+ weights configured with set_mempolicy2(2) or mbind2(2).

Weighted interleave allocates pages on nodes according to a
weight. For example if nodes [0,1] are weighted [5,2], 5 pages
will be allocated on node0 for every 2 pages allocated on node1.

+ When utilizing task-local weights, if a node is not set in the
+ nodemask, or its weight was set to 0, the local weight will be
+ set to the system default. Updates to system default weights
+ will not be reflected in local weights.
+
+ The minimum weight for a node set in the policy nodemask is
+ always 1. If no allocations are desired on a node, the node
+ should be removed from the nodemask.
+
NUMA memory policy supports the following optional mode flags:

MPOL_F_STATIC_NODES
@@ -514,6 +525,7 @@ Extended Mempolicy Arguments::
__s32 home_node; /* mbind2: set home node */
__u64 pol_maxnodes;
__aligned_u64 pol_nodes; /* nodemask pointer */
+ __aligned_u64 il_weights; /* u8 buf of size pol_maxnodes */
};

The extended mempolicy argument structure is defined to allow the mempolicy
diff --git a/include/linux/mempolicy.h b/include/linux/mempolicy.h
index e6795e2d0cc2..9854790a9aac 100644
--- a/include/linux/mempolicy.h
+++ b/include/linux/mempolicy.h
@@ -58,6 +58,7 @@ struct mempolicy {
/* Weighted interleave settings */
struct {
u8 cur_weight;
+ u8 weights[MAX_NUMNODES];
u8 scratch_weights[MAX_NUMNODES]; /* Used to avoid allocations */
} wil;
};
@@ -71,6 +72,7 @@ struct mempolicy_param {
unsigned short mode_flags; /* policy mode flags */
int home_node; /* mbind: use MPOL_MF_HOME_NODE */
nodemask_t *policy_nodes; /* get/set/mbind */
+ u8 *il_weights; /* for mode MPOL_WEIGHTED_INTERLEAVE */
};

/*
diff --git a/include/uapi/linux/mempolicy.h b/include/uapi/linux/mempolicy.h
index 7c7c384479fc..06e0fc2bb29b 100644
--- a/include/uapi/linux/mempolicy.h
+++ b/include/uapi/linux/mempolicy.h
@@ -34,6 +34,7 @@ struct mpol_param {
__u16 pol_maxnodes;
__u8 resv[6];
__aligned_u64 pol_nodes;
+ __aligned_u64 il_weights; /* size: pol_maxnodes * sizeof(__u8) */
};

/* Flags for set_mempolicy */
diff --git a/mm/mempolicy.c b/mm/mempolicy.c
index 53301e173c90..78e7614e0cd4 100644
--- a/mm/mempolicy.c
+++ b/mm/mempolicy.c
@@ -286,6 +286,7 @@ static struct mempolicy *mpol_new(struct mempolicy_param *param)
unsigned short mode = param->mode;
unsigned short flags = param->mode_flags;
nodemask_t *nodes = param->policy_nodes;
+ int node;

if (mode == MPOL_DEFAULT) {
if (nodes && !nodes_empty(*nodes))
@@ -323,6 +324,23 @@ static struct mempolicy *mpol_new(struct mempolicy_param *param)
policy->flags = flags;
policy->home_node = param->home_node;
policy->wil.cur_weight = 0;
+ memset(policy->wil.weights, 0, MAX_NUMNODES);
+
+ /* If user provides weights, ensure all weights are set to something */
+ if (policy->mode == MPOL_WEIGHTED_INTERLEAVE && param->il_weights) {
+ for (node = 0; node < MAX_NUMNODES; node++) {
+ u8 weight = 0;
+
+ if (node_isset(node, *nodes))
+ weight = param->il_weights[node];
+ /* If a user sets a weight to 0, use global default */
+ if (!weight)
+ weight = iw_table[node];
+ if (!weight)
+ weight = default_iw_table[node];
+ policy->wil.weights[node] = weight;
+ }
+ }

return policy;
}
@@ -952,6 +970,26 @@ static void do_get_mempolicy_nodemask(struct mempolicy *pol, nodemask_t *nmask)
}
}

+static void do_get_mempolicy_il_weights(struct mempolicy *pol,
+ u8 weights[MAX_NUMNODES])
+{
+ int i = 0;
+
+ if (pol->mode != MPOL_WEIGHTED_INTERLEAVE) {
+ memset(weights, 0, MAX_NUMNODES);
+ return;
+ }
+ for (i = 0; i < MAX_NUMNODES; i++) {
+ u8 weight = pol->wil.weights[i];
+
+ if (!weight)
+ weight = iw_table[i];
+ if (!weight)
+ weight = default_iw_table[i];
+ weights[i] = weight;
+ }
+}
+
/* Retrieve NUMA policy for a VMA assocated with a given address */
static long do_get_vma_mempolicy(unsigned long addr, int *addr_node,
struct mempolicy_param *param)
@@ -985,6 +1023,9 @@ static long do_get_vma_mempolicy(unsigned long addr, int *addr_node,
if (param->policy_nodes)
do_get_mempolicy_nodemask(pol, param->policy_nodes);

+ if (param->il_weights)
+ do_get_mempolicy_il_weights(pol, param->il_weights);
+
if (pol != &default_policy) {
mpol_put(pol);
mpol_cond_put(pol);
@@ -1012,6 +1053,9 @@ static long do_get_task_mempolicy(struct mempolicy_param *param, int *pol_node)
if (param->policy_nodes)
do_get_mempolicy_nodemask(pol, param->policy_nodes);

+ if (param->il_weights)
+ do_get_mempolicy_il_weights(pol, param->il_weights);
+
return 0;
}

@@ -1620,6 +1664,8 @@ SYSCALL_DEFINE5(mbind2, unsigned long, start, unsigned long, len,
struct mempolicy_param mparam;
nodemask_t policy_nodes;
unsigned long __user *nodes_ptr;
+ u8 *weights = NULL;
+ u8 __user *weights_ptr;
int err;

if (!start || !len)
@@ -1652,7 +1698,27 @@ SYSCALL_DEFINE5(mbind2, unsigned long, start, unsigned long, len,
return err;
mparam.policy_nodes = &policy_nodes;

- return do_mbind(untagged_addr(start), len, &mparam, flags);
+ if (kparam.mode == MPOL_WEIGHTED_INTERLEAVE) {
+ weights_ptr = u64_to_user_ptr(kparam.il_weights);
+ if (weights_ptr) {
+ weights = kzalloc(MAX_NUMNODES,
+ GFP_KERNEL | __GFP_NORETRY);
+ if (!weights)
+ return -ENOMEM;
+ err = copy_struct_from_user(weights,
+ MAX_NUMNODES,
+ weights_ptr,
+ kparam.pol_maxnodes);
+ if (err)
+ goto leave_weights;
+ }
+ }
+ mparam.il_weights = weights;
+
+ err = do_mbind(untagged_addr(start), len, &mparam, flags);
+leave_weights:
+ kfree(weights);
+ return err;
}

/* Set the process memory policy */
@@ -1696,6 +1762,8 @@ SYSCALL_DEFINE3(set_mempolicy2, struct mpol_param __user *, uparam,
int err;
nodemask_t policy_nodemask;
unsigned long __user *nodes_ptr;
+ u8 *weights = NULL;
+ u8 __user *weights_ptr;

if (flags)
return -EINVAL;
@@ -1721,7 +1789,24 @@ SYSCALL_DEFINE3(set_mempolicy2, struct mpol_param __user *, uparam,
} else
mparam.policy_nodes = NULL;

- return do_set_mempolicy(&mparam);
+ if (kparam.mode == MPOL_WEIGHTED_INTERLEAVE && kparam.il_weights) {
+ weights = kzalloc(MAX_NUMNODES, GFP_KERNEL | __GFP_NORETRY);
+ if (!weights)
+ return -ENOMEM;
+ weights_ptr = u64_to_user_ptr(kparam.il_weights);
+ err = copy_struct_from_user(weights,
+ MAX_NUMNODES,
+ weights_ptr,
+ kparam.pol_maxnodes);
+ if (err)
+ goto leave_weights;
+ }
+ mparam.il_weights = weights;
+
+ err = do_set_mempolicy(&mparam);
+leave_weights:
+ kfree(weights);
+ return err;
}

static int kernel_migrate_pages(pid_t pid, unsigned long maxnode,
@@ -1924,6 +2009,8 @@ SYSCALL_DEFINE4(get_mempolicy2, struct mpol_param __user *, uparam, size_t, usiz
int err;
nodemask_t policy_nodemask;
unsigned long __user *nodes_ptr;
+ u8 __user *weights_ptr;
+ u8 *weights = NULL;

if (flags & ~(MPOL_F_ADDR))
return -EINVAL;
@@ -1935,6 +2022,13 @@ SYSCALL_DEFINE4(get_mempolicy2, struct mpol_param __user *, uparam, size_t, usiz
if (err)
return -EINVAL;

+ if (kparam.il_weights) {
+ weights = kzalloc(MAX_NUMNODES, GFP_KERNEL | __GFP_NORETRY);
+ if (!weights)
+ return -ENOMEM;
+ }
+ mparam.il_weights = weights;
+
mparam.policy_nodes = kparam.pol_nodes ? &policy_nodemask : NULL;
if (flags & MPOL_F_ADDR)
err = do_get_vma_mempolicy(untagged_addr(addr), NULL, &mparam);
@@ -1942,7 +2036,7 @@ SYSCALL_DEFINE4(get_mempolicy2, struct mpol_param __user *, uparam, size_t, usiz
err = do_get_task_mempolicy(&mparam, NULL);

if (err)
- return err;
+ goto leave_weights;

kparam.mode = mparam.mode;
kparam.mode_flags = mparam.mode_flags;
@@ -1952,10 +2046,21 @@ SYSCALL_DEFINE4(get_mempolicy2, struct mpol_param __user *, uparam, size_t, usiz
err = copy_nodes_to_user(nodes_ptr, kparam.pol_maxnodes,
mparam.policy_nodes);
if (err)
- return err;
+ goto leave_weights;
+ }
+
+ if (kparam.mode == MPOL_WEIGHTED_INTERLEAVE && kparam.il_weights) {
+ weights_ptr = u64_to_user_ptr(kparam.il_weights);
+ if (copy_to_user(weights_ptr, weights, kparam.pol_maxnodes)) {
+ err = -EFAULT;
+ goto leave_weights;
+ }
}

- return copy_to_user(uparam, &kparam, usize) ? -EFAULT : 0;
+ err = copy_to_user(uparam, &kparam, usize) ? -EFAULT : 0;
+leave_weights:
+ kfree(weights);
+ return err;
}

bool vma_migratable(struct vm_area_struct *vma)
@@ -2077,8 +2182,10 @@ static unsigned int weighted_interleave_nodes(struct mempolicy *policy)
return next;

if (!policy->wil.cur_weight) {
- u8 next_weight = iw_table[next];
+ u8 next_weight = policy->wil.weights[next];

+ if (!next_weight)
+ next_weight = iw_table[next];
if (!next_weight)
next_weight = default_iw_table[next];
policy->wil.cur_weight = next_weight;
@@ -2175,8 +2282,10 @@ static unsigned int read_once_interleave_weights(struct mempolicy *pol,
/* Similar issue to read_once_policy_nodemask */
barrier();
for_each_node_mask(nid, *mask) {
- u8 weight = iw_table[nid];
+ u8 weight = pol->wil.weights[nid];

+ if (!weight)
+ weight = iw_table[nid];
if (!weight)
weight = default_iw_table[nid];
weight_total += weight;
@@ -3115,21 +3224,28 @@ void mpol_shared_policy_init(struct shared_policy *sp, struct mempolicy *mpol)
if (mpol) {
struct sp_node *sn;
struct mempolicy *npol;
+ u8 *weights = NULL;
NODEMASK_SCRATCH(scratch);

if (!scratch)
goto put_mpol;

+ weights = kzalloc(MAX_NUMNODES, GFP_KERNEL | __GFP_NORETRY);
+ if (!weights)
+ goto free_scratch;
+ memcpy(weights, mpol->wil.weights, sizeof(weights));
+
memset(&mparam, 0, sizeof(mparam));
mparam.mode = mpol->mode;
mparam.mode_flags = mpol->flags;
mparam.policy_nodes = &mpol->w.user_nodemask;
mparam.home_node = NUMA_NO_NODE;
+ mparam.il_weights = weights;

/* contextualize the tmpfs mount point mempolicy to this file */
npol = mpol_new(&mparam);
if (IS_ERR(npol))
- goto free_scratch; /* no valid nodemask intersection */
+ goto free_weights; /* no valid nodemask intersection */

task_lock(current);
ret = mpol_set_nodemask(npol, &mpol->w.user_nodemask, scratch);
@@ -3143,6 +3259,8 @@ void mpol_shared_policy_init(struct shared_policy *sp, struct mempolicy *mpol)
sp_insert(sp, sn);
put_npol:
mpol_put(npol); /* drop initial ref on file's npol */
+free_weights:
+ kfree(weights);
free_scratch:
NODEMASK_SCRATCH_FREE(scratch);
put_mpol:
--
2.39.1