[PATCH v2 01/11] mm/mempolicy: implement the sysfs-based weighted_interleave interface

From: Gregory Price
Date: Sat Dec 09 2023 - 02:00:05 EST


From: Rakie Kim <rakie.kim@xxxxxx>

This patch provides a way to set interleave weights through sysfs at
/sys/kernel/mm/mempolicy/weighted_interleave/nodeN/weight.

The sysfs structure is designed as follows.

$ tree /sys/kernel/mm/mempolicy/
/sys/kernel/mm/mempolicy/ [1]
├── possible_nodes [2]
└── weighted_interleave [3]
    ├── node0 [4]
    │   └── weight [5]
    └── node1
        └── weight

Each file above can be explained as follows.

[1] mm/mempolicy: configuration interface for mempolicy subsystem

[2] possible_nodes: list of possible nodes

An informational interface which may be used across multiple memory
policy configurations. It lists the `possible` nodes for which
configuration may be required. A `possible` node is one which has
been reserved by the kernel at boot, but may or may not be online.

For example, the weighted_interleave policy generates a nodeN/
directory for each possible node N.

[3] weighted_interleave/: config interface for weighted interleave policy

[4] weighted_interleave/nodeN/: possible node configurations

[5] weighted_interleave/nodeN/weight: weight for nodeN

Signed-off-by: Rakie Kim <rakie.kim@xxxxxx>
Signed-off-by: Honggyu Kim <honggyu.kim@xxxxxx>
Co-developed-by: Gregory Price <gregory.price@xxxxxxxxxxxx>
Signed-off-by: Gregory Price <gregory.price@xxxxxxxxxxxx>
Co-developed-by: Hyeongtak Ji <hyeongtak.ji@xxxxxx>
Signed-off-by: Hyeongtak Ji <hyeongtak.ji@xxxxxx>
---
.../ABI/testing/sysfs-kernel-mm-mempolicy | 18 ++
...fs-kernel-mm-mempolicy-weighted-interleave | 21 +++
mm/mempolicy.c | 169 ++++++++++++++++++
3 files changed, 208 insertions(+)
create mode 100644 Documentation/ABI/testing/sysfs-kernel-mm-mempolicy
create mode 100644 Documentation/ABI/testing/sysfs-kernel-mm-mempolicy-weighted-interleave

diff --git a/Documentation/ABI/testing/sysfs-kernel-mm-mempolicy b/Documentation/ABI/testing/sysfs-kernel-mm-mempolicy
new file mode 100644
index 000000000000..445377dfd232
--- /dev/null
+++ b/Documentation/ABI/testing/sysfs-kernel-mm-mempolicy
@@ -0,0 +1,18 @@
+What: /sys/kernel/mm/mempolicy/
+Date: December 2023
+Contact: Linux memory management mailing list <linux-mm@xxxxxxxxx>
+Description: Interface for Mempolicy
+
+What: /sys/kernel/mm/mempolicy/possible_nodes
+Date: December 2023
+Contact: Linux memory management mailing list <linux-mm@xxxxxxxxx>
+Description: The numa nodes which are possible to come online
+
+ A possible numa node is one which has been reserved by the
+ system at boot, but may or may not be online at runtime.
+
+ Example output:
+
+ ========= ========================================
+ "0,1,2,3" nodes 0-3 are possibly online or offline
+ ========= ========================================
diff --git a/Documentation/ABI/testing/sysfs-kernel-mm-mempolicy-weighted-interleave b/Documentation/ABI/testing/sysfs-kernel-mm-mempolicy-weighted-interleave
new file mode 100644
index 000000000000..7c19a606725f
--- /dev/null
+++ b/Documentation/ABI/testing/sysfs-kernel-mm-mempolicy-weighted-interleave
@@ -0,0 +1,21 @@
+What: /sys/kernel/mm/mempolicy/weighted_interleave/
+Date: December 2023
+Contact: Linux memory management mailing list <linux-mm@xxxxxxxxx>
+Description: Configuration Interface for the Weighted Interleave policy
+
+What: /sys/kernel/mm/mempolicy/weighted_interleave/nodeN/
+ /sys/kernel/mm/mempolicy/weighted_interleave/nodeN/weight
+Date: December 2023
+Contact: Linux memory management mailing list <linux-mm@xxxxxxxxx>
+Description: Weight configuration interface for nodeN
+
+ The interleave weight for a memory node (N). These weights are
+ utilized by processes which have set their mempolicy to
+ MPOL_WEIGHTED_INTERLEAVE and have opted into global weights by
+ omitting a task-local weight array.
+
+ These weights only affect new allocations, and changes at runtime
+ will not cause migrations on already allocated pages.
+
+ Minimum weight: 1
+ Maximum weight: 255
diff --git a/mm/mempolicy.c b/mm/mempolicy.c
index 10a590ee1c89..28dfae195beb 100644
--- a/mm/mempolicy.c
+++ b/mm/mempolicy.c
@@ -131,6 +131,8 @@ static struct mempolicy default_policy = {

static struct mempolicy preferred_node_policy[MAX_NUMNODES];

+/*
+ * Global per-node interleave weights, indexed by node id. The sysfs store
+ * handler accepts values 1-255, so keep the elements explicitly unsigned:
+ * plain char may be signed, which would turn weights above 127 negative.
+ */
+static u8 iw_table[MAX_NUMNODES];
+
/**
* numa_nearest_node - Find nearest node by state
* @node: Node id to start the search
@@ -3067,3 +3069,170 @@ void mpol_to_str(char *buffer, int maxlen, struct mempolicy *pol)
p += scnprintf(p, buffer + maxlen - p, ":%*pbl",
nodemask_pr_args(&nodes));
}
+
+/* Per-node sysfs object backing a weighted_interleave/nodeN/ directory. */
+struct iw_node_info {
+ struct kobject kobj; /* embedded kobject; owner recovered via container_of() */
+ int nid; /* node id, used as the index into iw_table */
+};
+
+/*
+ * node_weight_show - sysfs read handler for weighted_interleave/nodeN/weight
+ * @kobj: kobject embedded in the node's struct iw_node_info
+ * @attr: attribute being read (unused)
+ * @buf: sysfs output buffer
+ *
+ * Emits the node's current interleave weight as a decimal number followed by
+ * a newline.
+ *
+ * Return: the value returned by sysfs_emit().
+ */
+static ssize_t node_weight_show(struct kobject *kobj,
+				struct kobj_attribute *attr, char *buf)
+{
+	struct iw_node_info *node_info;
+
+	node_info = container_of(kobj, struct iw_node_info, kobj);
+
+	/*
+	 * Weights span 1-255. Force an unsigned 8-bit read so that values
+	 * above 127 never print as negative numbers if the table's element
+	 * type is signed on this architecture.
+	 */
+	return sysfs_emit(buf, "%u\n", (u8)iw_table[node_info->nid]);
+}
+
+/*
+ * node_weight_store - sysfs write handler for weighted_interleave/nodeN/weight
+ * @kobj: kobject embedded in the node's struct iw_node_info
+ * @attr: attribute being written (unused)
+ * @buf: user-supplied text
+ * @count: number of bytes in @buf
+ *
+ * Parses @buf as an unsigned 8-bit integer (base 0, i.e. chosen by
+ * kstrtou8() from the prefix) and records it as the node's weight.
+ * Unparsable input and a weight of zero are both rejected.
+ *
+ * Return: @count on success, -EINVAL on invalid input.
+ */
+static ssize_t node_weight_store(struct kobject *kobj,
+				 struct kobj_attribute *attr,
+				 const char *buf, size_t count)
+{
+	struct iw_node_info *info;
+	u8 val = 0;
+
+	info = container_of(kobj, struct iw_node_info, kobj);
+
+	if (kstrtou8(buf, 0, &val))
+		return -EINVAL;
+	if (val == 0)
+		return -EINVAL;
+
+	iw_table[info->nid] = val;
+	return count;
+}
+
+/* The "weight" file: mode 0664, backed by the show/store handlers above. */
+static struct kobj_attribute node_weight =
+	__ATTR(weight, 0664, node_weight_show, node_weight_store);
+
+/* Attributes attached to every per-node kobject. */
+static struct attribute *dst_node_attrs[] = {
+	&node_weight.attr,
+	NULL,
+};
+
+/* Never modified after build time, so keep it const like mempolicy_attr_group. */
+static const struct attribute_group dst_node_attr_group = {
+	.attrs = dst_node_attrs,
+};
+
+/* NULL-terminated group list referenced by the per-node ktype. */
+static const struct attribute_group *dst_node_attr_groups[] = {
+	&dst_node_attr_group,
+	NULL,
+};
+
+/* Free the iw_node_info embedding @kobj once its last reference is dropped. */
+static void iw_node_release(struct kobject *kobj)
+{
+	kfree(container_of(kobj, struct iw_node_info, kobj));
+}
+
+/*
+ * ktype for the per-node kobjects. A release callback is required: without
+ * one the kzalloc()ed iw_node_info is never freed when the final
+ * kobject_put() happens.
+ */
+static const struct kobj_type dst_node_kobj_ktype = {
+	.release = iw_node_release,
+	.sysfs_ops = &kobj_sysfs_ops,
+	.default_groups = dst_node_attr_groups,
+};
+
+/*
+ * add_weight_node - create the sysfs directory for one node
+ * @nid: node id the directory represents
+ * @src_kobj: parent kobject under which "node<nid>" is created
+ *
+ * Allocates a struct iw_node_info for @nid and registers its kobject under
+ * @src_kobj. On failure the kobject reference is dropped, which frees the
+ * allocation through the ktype's release callback.
+ *
+ * Return: 0 on success, -ENOMEM if the allocation fails, otherwise the error
+ * returned by kobject_add().
+ */
+static int add_weight_node(int nid, struct kobject *src_kobj)
+{
+	struct iw_node_info *node_info;
+	int ret;
+
+	node_info = kzalloc(sizeof(*node_info), GFP_KERNEL);
+	if (!node_info)
+		return -ENOMEM;
+	node_info->nid = nid;
+
+	kobject_init(&node_info->kobj, &dst_node_kobj_ktype);
+	ret = kobject_add(&node_info->kobj, src_kobj, "node%d", nid);
+	if (ret) {
+		pr_err("kobject_add error [node%d]: %d\n", nid, ret);
+		/* Initialised kobjects must be torn down with kobject_put(). */
+		kobject_put(&node_info->kobj);
+	}
+	return ret;
+}
+
+/*
+ * add_weighted_interleave_group - create weighted_interleave/ and its nodes
+ * @root_kobj: parent kobject under which "weighted_interleave" is created
+ *
+ * Creates the "weighted_interleave" directory and one nodeN/ child for each
+ * node in the N_POSSIBLE state, stopping at the first failure.
+ *
+ * Return: 0 on success, -ENOMEM if the directory cannot be created, or the
+ * error from add_weight_node() for the node that failed.
+ */
+static int add_weighted_interleave_group(struct kobject *root_kobj)
+{
+	struct kobject *wi_kobj;
+	int nid;
+	int err = 0;	/* stays 0 if the loop body never runs */
+
+	wi_kobj = kobject_create_and_add("weighted_interleave", root_kobj);
+	if (!wi_kobj) {
+		pr_err("failed to create node kobject\n");
+		return -ENOMEM;
+	}
+
+	for_each_node_state(nid, N_POSSIBLE) {
+		err = add_weight_node(nid, wi_kobj);
+		if (err) {
+			pr_err("failed to add sysfs [node%d]\n", nid);
+			break;
+		}
+	}
+
+	/*
+	 * NOTE(review): node kobjects registered before the failure are not
+	 * removed here; only the reference on the directory itself is dropped.
+	 */
+	if (err)
+		kobject_put(wi_kobj);
+
+	/* Propagate the failure instead of reporting success unconditionally. */
+	return err;
+}
+
+/*
+ * possible_nodes_show - sysfs read handler for mempolicy/possible_nodes
+ * @kobj: kobject the attribute belongs to (unused)
+ * @attr: attribute being read (unused)
+ * @buf: sysfs output buffer
+ *
+ * Writes the ids of all nodes in the N_POSSIBLE state as a comma-separated
+ * list terminated by a newline, e.g. "0,1,2,3\n".
+ *
+ * Return: the accumulated length reported by sysfs_emit_at().
+ */
+static ssize_t possible_nodes_show(struct kobject *kobj,
+				   struct kobj_attribute *attr, char *buf)
+{
+	bool first = true;
+	int pos = 0;
+	int nid;
+
+	for_each_node_state(nid, N_POSSIBLE) {
+		/* A separator goes before every entry except the first. */
+		if (!first)
+			pos += sysfs_emit_at(buf, pos, ",");
+		pos += sysfs_emit_at(buf, pos, "%d", nid);
+		first = false;
+	}
+	pos += sysfs_emit_at(buf, pos, "\n");
+
+	return pos;
+}
+
+/* Read-only "possible_nodes" file, served by possible_nodes_show(). */
+static struct kobj_attribute possible_nodes_attr = __ATTR_RO(possible_nodes);
+
+/* NULL-terminated list of attributes in the mempolicy/ directory. */
+static struct attribute *mempolicy_attrs[] = {
+	&possible_nodes_attr.attr,
+	NULL,
+};
+
+/*
+ * Only .attrs is set. This is a struct initializer, not an array, so it takes
+ * no NULL terminator; remaining members are zero-initialised implicitly.
+ */
+static const struct attribute_group mempolicy_attr_group = {
+	.attrs = mempolicy_attrs,
+};
+
+/*
+ * Release callback for kobjects using mempolicy_kobj_ktype: frees the
+ * kobject's own memory. Only valid for kobjects that were allocated on
+ * their own (not embedded at a non-zero offset in a larger structure).
+ */
+static void mempolicy_kobj_release(struct kobject *kobj)
+{
+ kfree(kobj);
+}
+
+/* ktype used for the root "mempolicy" kobject created at init time. */
+static const struct kobj_type mempolicy_kobj_ktype = {
+ .release = mempolicy_kobj_release,
+ .sysfs_ops = &kobj_sysfs_ops,
+};
+
+/*
+ * mempolicy_sysfs_init - build the mempolicy sysfs hierarchy
+ *
+ * Sets every entry of iw_table to 1, then creates the "mempolicy" kobject
+ * under mm_kobj, attaches the mempolicy attribute group, and adds the
+ * weighted_interleave group. Any failure after the root kobject has been
+ * initialised drops the reference taken by kobject_init().
+ *
+ * Return: 0 on success or a negative errno.
+ */
+static int __init mempolicy_sysfs_init(void)
+{
+	struct kobject *root;
+	int ret;
+
+	/* iw_table elements are one byte wide, so this gives each node weight 1. */
+	memset(&iw_table, 1, sizeof(iw_table));
+
+	root = kzalloc(sizeof(*root), GFP_KERNEL);
+	if (!root)
+		return -ENOMEM;
+
+	kobject_init(root, &mempolicy_kobj_ktype);
+
+	ret = kobject_add(root, mm_kobj, "mempolicy");
+	if (ret) {
+		pr_err("failed to add kobject to the system\n");
+		goto err_put;
+	}
+
+	ret = sysfs_create_group(root, &mempolicy_attr_group);
+	if (ret) {
+		pr_err("failed to register mempolicy group\n");
+		goto err_put;
+	}
+
+	ret = add_weighted_interleave_group(root);
+	if (ret)
+		goto err_put;
+
+	return 0;
+
+err_put:
+	kobject_put(root);
+	return ret;
+}
+late_initcall(mempolicy_sysfs_init);
--
2.39.1