[RFC/POC 2/2] sched/numa: Adds simple prctl for setting task's preferred node affinity.

From: chris hyser
Date: Fri Dec 15 2023 - 19:19:15 EST


EXPERIMENTAL - NOT INTENDED FOR SUBMISSION

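Adds a simple prctl() interface, PR_PREFERRED_NID, to the preferred node
affinity test code. arg2 selects the sub-command (PR_PREFERRED_NID_GET or
PR_PREFERRED_NID_SET), arg3 is the node id for SET, arg4 is the target pid
(0 means the calling task), and arg5 is a pointer to an int that receives
the current value for GET.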

Signed-off-by: Chris Hyser <chris.hyser@xxxxxxxxxx>
---
include/uapi/linux/prctl.h | 9 ++++++
kernel/sys.c | 66 ++++++++++++++++++++++++++++++++++++++
2 files changed, 75 insertions(+)

diff --git a/include/uapi/linux/prctl.h b/include/uapi/linux/prctl.h
index 370ed14b1ae0..6c8f6c0156d8 100644
--- a/include/uapi/linux/prctl.h
+++ b/include/uapi/linux/prctl.h
@@ -293,6 +293,15 @@ struct prctl_mm_map {

#define PR_GET_AUXV 0x41555856

+/*
+ * This is experimental and placed out of order to keep surrounding context
+ * the same in the presence of new prctls. Thus the patch should just apply.
+ */
+#define PR_PREFERRED_NID 101	/* arg2: sub-command below, arg4: pid (0 = caller) */
+# define PR_PREFERRED_NID_GET 0	/* write the task's forced node id to the int at arg5 */
+# define PR_PREFERRED_NID_SET 1	/* store arg3 as the task's forced preferred node id */
+# define PR_PREFERRED_NID_CMD_MAX 2	/* sub-commands >= this value get -ERANGE */
+
#define PR_SET_MEMORY_MERGE 67
#define PR_GET_MEMORY_MERGE 68

diff --git a/kernel/sys.c b/kernel/sys.c
index 420d9cb9cc8e..6dca12da6ade 100644
--- a/kernel/sys.c
+++ b/kernel/sys.c
@@ -2406,6 +2406,67 @@ static inline int prctl_set_mdwe(unsigned long bits, unsigned long arg3,
return 0;
}

+#ifdef CONFIG_NUMA_BALANCING
+
+/* NOTE(review): ad-hoc prototype; presumably belongs in a header shared with the definition so the two cannot drift - confirm. */
+void sched_setnuma(struct task_struct *p, int node);
+
+/*
+ * prctl_chg_pref_nid - back end of prctl(PR_PREFERRED_NID, ...)
+ * @cmd:   PR_PREFERRED_NID_GET or PR_PREFERRED_NID_SET (prctl arg2)
+ * @nid:   node id to force for SET, or -1 (NUMA_NO_NODE); ignored by GET (arg3)
+ * @pid:   target task, looked up with find_task_by_vpid(); 0 means current (arg4)
+ * @uaddr: user address of an int that receives the forced node id for GET (arg5)
+ *
+ * Returns 0 on success, -ERANGE for an unknown @cmd, -ESRCH if @pid does not
+ * resolve to a task, -EPERM if the ptrace access check fails, -EINVAL for a
+ * misaligned @uaddr (GET) or an invalid @nid (SET), or the put_user() error.
+ */
+static int prctl_chg_pref_nid(unsigned long cmd, int nid, pid_t pid, unsigned long uaddr)
+{
+	struct task_struct *task;
+	int err = 0;
+
+	if (cmd >= PR_PREFERRED_NID_CMD_MAX)
+		return -ERANGE;
+
+	if (pid == 0) {
+		/* The caller itself: no lookup, hence no RCU section needed. */
+		task = current;
+		get_task_struct(task);
+	} else {
+		/* Pin the task before leaving the RCU read-side section. */
+		rcu_read_lock();
+		task = find_task_by_vpid(pid);
+		if (task)
+			get_task_struct(task);
+		rcu_read_unlock();
+		if (!task)
+			return -ESRCH;
+	}
+
+	/*
+	 * Gate both sub-commands on the regular ptrace_may_access() check.
+	 * NOTE(review): SET modifies the target yet only READ-level access is
+	 * required here; consider PTRACE_MODE_ATTACH_REALCREDS for SET - confirm.
+	 */
+	if (!ptrace_may_access(task, PTRACE_MODE_READ_REALCREDS)) {
+		err = -EPERM;
+		goto out;
+	}
+
+	switch (cmd) {
+	case PR_PREFERRED_NID_GET:
+		/* Reject destinations that are not int-aligned. */
+		if (uaddr & (sizeof(int) - 1)) {
+			err = -EINVAL;
+			break;
+		}
+		err = put_user(task->numa_preferred_nid_force, (int __user *)uaddr);
+		break;
+
+	case PR_PREFERRED_NID_SET:
+		/*
+		 * Node ids may be sparse, so the *number* of possible nodes is
+		 * not an upper bound on valid ids: bounds-check the id and test
+		 * it against the possible-node mask. -1 (NUMA_NO_NODE) is still
+		 * accepted, as before.
+		 */
+		if (nid != NUMA_NO_NODE &&
+		    (nid < 0 || nid >= MAX_NUMNODES || !node_possible(nid))) {
+			/* Reachable from userspace: keep it out of the default log. */
+			pr_debug("prctl_chg_pref_nid: %d error\n", nid);
+			err = -EINVAL;
+			break;
+		}
+
+		task->numa_preferred_nid_force = nid;
+		sched_setnuma(task, nid);
+		break;
+	}
+
+out:
+	put_task_struct(task);
+	return err;
+}
+#endif /* CONFIG_NUMA_BALANCING */
+
static inline int prctl_get_mdwe(unsigned long arg2, unsigned long arg3,
unsigned long arg4, unsigned long arg5)
{
@@ -2698,6 +2759,11 @@ SYSCALL_DEFINE5(prctl, int, option, unsigned long, arg2, unsigned long, arg3,
case PR_SCHED_CORE:
error = sched_core_share_pid(arg2, arg3, arg4, arg5);
break;
+#endif
+#ifdef CONFIG_NUMA_BALANCING
+ case PR_PREFERRED_NID:
+ error = prctl_chg_pref_nid(arg2, arg3, arg4, arg5);
+ break;
#endif
case PR_SET_MDWE:
error = prctl_set_mdwe(arg2, arg3, arg4, arg5);
--
2.39.3