[RFC PATCH 07/11] mm/mempolicy: add task mempolicy syscall variants

From: Gregory Price
Date: Wed Nov 22 2023 - 16:12:58 EST


Add system calls that allow one task to view or change another task's
mempolicy settings. Traditionally, a task's mempolicy could only be
changed by the task itself. This causes problems when a task is
migrated between cgroups whose cpusets differ.

Attempts were made to allow policy nodemasks to be shifted via a flag
(MPOL_F_RELATIVE_NODES), but this is not foolproof.

Additionally, as new policies emerge (like weighted interleave), it
may be necessary to allow not just the policy to be changed, but
individual attributes of the policy (such as a node weight) in
response to other system events - such as memory hotplug.

If pid is 0, each new syscall behaves the same as its original
counterpart and operates on the calling task; for any other pid, the
caller must have CAP_SYS_NICE.

Syscalls in this patch:
sys_set_task_mempolicy
sys_get_task_mempolicy
sys_set_task_mempolicy_home_node
sys_task_mbind

Signed-off-by: Gregory Price <gregory.price@xxxxxxxxxxxx>
---
arch/x86/entry/syscalls/syscall_32.tbl | 4 +
arch/x86/entry/syscalls/syscall_64.tbl | 4 +
include/linux/syscalls.h | 14 +++
include/uapi/asm-generic/unistd.h | 10 ++-
include/uapi/linux/mempolicy.h | 10 +++
mm/mempolicy.c | 119 +++++++++++++++++++++++++
6 files changed, 160 insertions(+), 1 deletion(-)

diff --git a/arch/x86/entry/syscalls/syscall_32.tbl b/arch/x86/entry/syscalls/syscall_32.tbl
index c8fac5205803..358bd91d7461 100644
--- a/arch/x86/entry/syscalls/syscall_32.tbl
+++ b/arch/x86/entry/syscalls/syscall_32.tbl
@@ -461,3 +461,7 @@
454 i386 futex_wake sys_futex_wake
455 i386 futex_wait sys_futex_wait
456 i386 futex_requeue sys_futex_requeue
+457 i386 set_task_mempolicy sys_set_task_mempolicy
+458 i386 get_task_mempolicy sys_get_task_mempolicy
+459 i386 set_task_mempolicy_home_node sys_set_task_mempolicy_home_node
+460 i386 task_mbind sys_task_mbind
diff --git a/arch/x86/entry/syscalls/syscall_64.tbl b/arch/x86/entry/syscalls/syscall_64.tbl
index 8cb8bf68721c..c83b0c5c1ff9 100644
--- a/arch/x86/entry/syscalls/syscall_64.tbl
+++ b/arch/x86/entry/syscalls/syscall_64.tbl
@@ -378,6 +378,10 @@
454 common futex_wake sys_futex_wake
455 common futex_wait sys_futex_wait
456 common futex_requeue sys_futex_requeue
+457 common set_task_mempolicy sys_set_task_mempolicy
+458 common get_task_mempolicy sys_get_task_mempolicy
+459 common set_task_mempolicy_home_node sys_set_task_mempolicy_home_node
+460 common task_mbind sys_task_mbind

#
# Due to a historical design error, certain syscalls are numbered differently
diff --git a/include/linux/syscalls.h b/include/linux/syscalls.h
index fd9d12de7e92..fd1a8863b5c1 100644
--- a/include/linux/syscalls.h
+++ b/include/linux/syscalls.h
@@ -816,12 +816,21 @@ asmlinkage long sys_mbind(unsigned long start, unsigned long len,
const unsigned long __user *nmask,
unsigned long maxnode,
unsigned flags);
+asmlinkage long sys_task_mbind(const struct mbind_args __user *uargs,
+ size_t usize);
asmlinkage long sys_get_mempolicy(int __user *policy,
unsigned long __user *nmask,
unsigned long maxnode,
unsigned long addr, unsigned long flags);
asmlinkage long sys_set_mempolicy(int mode, const unsigned long __user *nmask,
unsigned long maxnode);
+asmlinkage long sys_get_task_mempolicy(pid_t pid, int __user *policy,
+ unsigned long __user *nmask,
+ unsigned long maxnode,
+ unsigned long addr, unsigned long flags);
+asmlinkage long sys_set_task_mempolicy(pid_t pid, int mode,
+ const unsigned long __user *nmask,
+ unsigned long maxnode);
asmlinkage long sys_migrate_pages(pid_t pid, unsigned long maxnode,
const unsigned long __user *from,
const unsigned long __user *to);
@@ -945,6 +954,11 @@ asmlinkage long sys_memfd_secret(unsigned int flags);
asmlinkage long sys_set_mempolicy_home_node(unsigned long start, unsigned long len,
unsigned long home_node,
unsigned long flags);
+asmlinkage long sys_set_task_mempolicy_home_node(pid_t pid,
+ unsigned long start,
+ unsigned long len,
+ unsigned long home_node,
+ unsigned long flags);
asmlinkage long sys_cachestat(unsigned int fd,
struct cachestat_range __user *cstat_range,
struct cachestat __user *cstat, unsigned int flags);
diff --git a/include/uapi/asm-generic/unistd.h b/include/uapi/asm-generic/unistd.h
index 756b013fb832..f179715f1d59 100644
--- a/include/uapi/asm-generic/unistd.h
+++ b/include/uapi/asm-generic/unistd.h
@@ -828,9 +828,17 @@ __SYSCALL(__NR_futex_wake, sys_futex_wake)
__SYSCALL(__NR_futex_wait, sys_futex_wait)
#define __NR_futex_requeue 456
__SYSCALL(__NR_futex_requeue, sys_futex_requeue)
+#define __NR_set_task_mempolicy 457
+__SYSCALL(__NR_set_task_mempolicy, sys_set_task_mempolicy)
+#define __NR_get_task_mempolicy 458
+__SYSCALL(__NR_get_task_mempolicy, sys_get_task_mempolicy)
+#define __NR_set_task_mempolicy_home_node 459
+__SYSCALL(__NR_set_task_mempolicy_home_node, sys_set_task_mempolicy_home_node)
+#define __NR_task_mbind 460
+__SYSCALL(__NR_task_mbind, sys_task_mbind)

#undef __NR_syscalls
-#define __NR_syscalls 457
+#define __NR_syscalls 461

/*
* 32 bit systems traditionally used different
diff --git a/include/uapi/linux/mempolicy.h b/include/uapi/linux/mempolicy.h
index a8963f7ef4c2..c29cfb25db29 100644
--- a/include/uapi/linux/mempolicy.h
+++ b/include/uapi/linux/mempolicy.h
@@ -26,6 +26,16 @@ enum {
MPOL_MAX, /* always last member of enum */
};

+struct mbind_args { /* task_mbind() argument block; NOTE(review): native-width fields, 32-bit compat layout unconfirmed */
+ pid_t pid; /* target task; 0 selects the calling task */
+ unsigned long start; /* passed to kernel_mbind() as mbind's start */
+ unsigned long len; /* passed to kernel_mbind() as mbind's len */
+ unsigned long mode; /* passed to kernel_mbind() as mbind's mode */
+ unsigned long *nmask; /* userspace pointer; passed to kernel_mbind() as mbind's nmask */
+ unsigned long maxnode; /* passed to kernel_mbind() as mbind's maxnode */
+ unsigned int flags; /* passed to kernel_mbind() as mbind's flags */
+};
+
/* Flags for set_mempolicy */
#define MPOL_F_STATIC_NODES (1 << 15)
#define MPOL_F_RELATIVE_NODES (1 << 14)
diff --git a/mm/mempolicy.c b/mm/mempolicy.c
index 3d2171ac4098..fb295ade8ad7 100644
--- a/mm/mempolicy.c
+++ b/mm/mempolicy.c
@@ -1654,6 +1654,32 @@ SYSCALL_DEFINE4(set_mempolicy_home_node, unsigned long, start, unsigned long, le
return __set_mempolicy_home_node(current, start, len, home_node, flags);
}

+/* Variant of set_mempolicy_home_node() that acts on @pid (0 = caller). */
+SYSCALL_DEFINE5(set_task_mempolicy_home_node, pid_t, pid, unsigned long, start,
+		unsigned long, len, unsigned long, home_node,
+		unsigned long, flags)
+{
+	struct task_struct *target;
+	int ret;
+
+	/* Only a non-zero pid (i.e. not the caller) needs CAP_SYS_NICE. */
+	if (pid && !capable(CAP_SYS_NICE))
+		return -EPERM;
+
+	/* Look up the task and take a reference before dropping RCU. */
+	rcu_read_lock();
+	target = pid ? find_task_by_vpid(pid) : current;
+	if (target)
+		get_task_struct(target);
+	rcu_read_unlock();
+	if (!target)
+		return -ESRCH;
+
+	ret = __set_mempolicy_home_node(target, start, len, home_node, flags);
+	put_task_struct(target);
+	return ret;
+}
+
SYSCALL_DEFINE6(mbind, unsigned long, start, unsigned long, len,
unsigned long, mode, const unsigned long __user *, nmask,
unsigned long, maxnode, unsigned int, flags)
@@ -1661,6 +1687,48 @@ SYSCALL_DEFINE6(mbind, unsigned long, start, unsigned long, len,
return kernel_mbind(current, start, len, mode, nmask, maxnode, flags);
}

+/*
+ * Copy struct mbind_args from userspace and apply kernel_mbind() to the task
+ * named by its pid field (0 = current; any other pid needs CAP_SYS_NICE).
+ */
+static long kernel_task_mbind(const struct mbind_args __user *uargs,
+				size_t usize)
+{
+	struct mbind_args kargs;
+	struct task_struct *task;
+	int err;
+
+	/* Cap usize first: copy_struct_from_user() scans any excess bytes. */
+	if (usize > PAGE_SIZE)
+		return -E2BIG;
+	if (usize < sizeof(kargs))
+		return -EINVAL;
+	err = copy_struct_from_user(&kargs, sizeof(kargs), uargs, usize);
+	if (err)
+		return err;
+	if (kargs.pid && !capable(CAP_SYS_NICE))
+		return -EPERM;
+
+	rcu_read_lock();
+	task = kargs.pid ? find_task_by_vpid(kargs.pid) : current;
+	if (task)
+		get_task_struct(task);	/* pin before RCU unlock */
+	rcu_read_unlock();
+	if (!task)
+		return -ESRCH;
+
+	err = kernel_mbind(task, kargs.start, kargs.len, kargs.mode,
+			   kargs.nmask, kargs.maxnode, kargs.flags);
+	put_task_struct(task);
+	return err;
+}
+
+SYSCALL_DEFINE2(task_mbind, const struct mbind_args __user *, args,
+ size_t, size)
+{
+ return kernel_task_mbind(args, size); /* thin entry point: the helper copies and checks args */
+}
+
/* Set the process memory policy */
static long kernel_set_mempolicy(struct task_struct *task, int mode,
const unsigned long __user *nmask,
@@ -1688,6 +1756,31 @@ SYSCALL_DEFINE3(set_mempolicy, int, mode, const unsigned long __user *, nmask,
return kernel_set_mempolicy(current, mode, nmask, maxnode);
}

+/* Variant of set_mempolicy() that acts on the task @pid (0 = caller). */
+SYSCALL_DEFINE4(set_task_mempolicy, pid_t, pid, int, mode,
+		const unsigned long __user *, nmask, unsigned long, maxnode)
+{
+	struct task_struct *target;
+	int ret;
+
+	/* Only a non-zero pid (i.e. not the caller) needs CAP_SYS_NICE. */
+	if (pid && !capable(CAP_SYS_NICE))
+		return -EPERM;
+
+	/* Look up the task and take a reference before dropping RCU. */
+	rcu_read_lock();
+	target = pid ? find_task_by_vpid(pid) : current;
+	if (target)
+		get_task_struct(target);
+	rcu_read_unlock();
+	if (!target)
+		return -ESRCH;
+
+	ret = kernel_set_mempolicy(target, mode, nmask, maxnode);
+	put_task_struct(target);
+	return ret;
+}
+
static int kernel_migrate_pages(pid_t pid, unsigned long maxnode,
const unsigned long __user *old_nodes,
const unsigned long __user *new_nodes)
@@ -1821,6 +1914,32 @@ SYSCALL_DEFINE5(get_mempolicy, int __user *, policy,
flags);
}

+/* Variant of get_mempolicy() that queries the task @pid (0 = caller). */
+SYSCALL_DEFINE6(get_task_mempolicy, pid_t, pid, int __user *, policy,
+		unsigned long __user *, nmask, unsigned long, maxnode,
+		unsigned long, addr, unsigned long, flags)
+{
+	struct task_struct *target;
+	int ret;
+
+	/* Only a non-zero pid (i.e. not the caller) needs CAP_SYS_NICE. */
+	if (pid && !capable(CAP_SYS_NICE))
+		return -EPERM;
+
+	/* Look up the task and take a reference before dropping RCU. */
+	rcu_read_lock();
+	target = pid ? find_task_by_vpid(pid) : current;
+	if (target)
+		get_task_struct(target);
+	rcu_read_unlock();
+	if (!target)
+		return -ESRCH;
+
+	ret = kernel_get_mempolicy(target, policy, nmask, maxnode, addr, flags);
+	put_task_struct(target);
+	return ret;
+}
+
bool vma_migratable(struct vm_area_struct *vma)
{
if (vma->vm_flags & (VM_IO | VM_PFNMAP))
--
2.39.1