[RFC PATCH v0.1 4/9] sched/umcg: implement core UMCG API

From: Peter Oskolkov
Date: Thu May 20 2021 - 14:36:47 EST


Implement version 1 of core UMCG API (wait/wake/swap).

As has been outlined in
https://lore.kernel.org/lkml/20200722234538.166697-1-posk@xxxxxxx/,
efficient and synchronous on-CPU context switching is key
to enabling two broad use cases: in-process M:N userspace scheduling
and fast X-process RPCs for security wrappers.

High-level design considerations/approaches used:
- wait & wake can race with each other;
- offload as much work as possible to libumcg in tools/lib/umcg,
  specifically:
  - most state changes, e.g. RUNNABLE <=> RUNNING, are done in
    the userspace (libumcg);
  - retries are offloaded to the userspace.

This implementation does not yet handle timeouts in sys_umcg_wait
and sys_umcg_swap; timeout support will be added in version 2.

Signed-off-by: Peter Oskolkov <posk@xxxxxxxxxx>
---
include/linux/sched.h | 7 +-
kernel/sched/core.c | 3 +
kernel/sched/umcg.c | 237 ++++++++++++++++++++++++++++++++++++++++--
kernel/sched/umcg.h | 42 ++++++++
4 files changed, 282 insertions(+), 7 deletions(-)
create mode 100644 kernel/sched/umcg.h

diff --git a/include/linux/sched.h b/include/linux/sched.h
index c7e7d50e2fdc..fc4b8775f514 100644
--- a/include/linux/sched.h
+++ b/include/linux/sched.h
@@ -66,6 +66,7 @@ struct sighand_struct;
struct signal_struct;
struct task_delay_info;
struct task_group;
+struct umcg_task_data;

/*
* Task state bitmask. NOTE! These bits are also
@@ -778,6 +779,10 @@ struct task_struct {
struct mm_struct *mm;
struct mm_struct *active_mm;

+#ifdef CONFIG_UMCG
+ struct umcg_task_data __rcu *umcg_task_data;
+#endif
+
/* Per-thread vma caching: */
struct vmacache vmacache;

@@ -1022,7 +1027,7 @@ struct task_struct {
u64 parent_exec_id;
u64 self_exec_id;

- /* Protection against (de-)allocation: mm, files, fs, tty, keyrings, mems_allowed, mempolicy: */
+ /* Protection against (de-)allocation: mm, files, fs, tty, keyrings, mems_allowed, mempolicy, umcg: */
spinlock_t alloc_lock;

/* Protection of the PI data structures: */
diff --git a/kernel/sched/core.c b/kernel/sched/core.c
index 88506bc2617f..462104f13c28 100644
--- a/kernel/sched/core.c
+++ b/kernel/sched/core.c
@@ -3964,6 +3964,9 @@ static void __sched_fork(unsigned long clone_flags, struct task_struct *p)
p->wake_entry.u_flags = CSD_TYPE_TTWU;
p->migration_pending = NULL;
#endif
+#ifdef CONFIG_UMCG
+ rcu_assign_pointer(p->umcg_task_data, NULL);
+#endif
}

DEFINE_STATIC_KEY_FALSE(sched_numa_balancing);
diff --git a/kernel/sched/umcg.c b/kernel/sched/umcg.c
index b8195cfdb76a..2d718433c773 100644
--- a/kernel/sched/umcg.c
+++ b/kernel/sched/umcg.c
@@ -4,11 +4,23 @@
* User Managed Concurrency Groups (UMCG).
*/

+#include <linux/freezer.h>
#include <linux/syscalls.h>
#include <linux/types.h>
#include <linux/uaccess.h>
#include <linux/umcg.h>

+#include "sched.h"
+#include "umcg.h"
+
+/*
+ * Check whether @requested is a supported UMCG API version.
+ *
+ * Returns 0 if @requested is supported (only version 1 is), 1 otherwise.
+ */
+static int __api_version(u32 requested)
+{
+ return requested == 1 ? 0 : 1;
+}
+
/**
* sys_umcg_api_version - query UMCG API versions that are supported.
* @api_version: Requested API version.
@@ -26,7 +38,52 @@
*/
SYSCALL_DEFINE2(umcg_api_version, u32, api_version, u32, flags)
{
- return -ENOSYS;
+ /* Any non-zero @flags value is rejected. */
+ if (flags)
+ return -EINVAL;
+
+ /* 0 if @api_version is supported, 1 otherwise. */
+ return __api_version(api_version);
+}
+
+/*
+ * Read the u32 located at the very start of the userspace @ut into *@state.
+ *
+ * Returns the result of get_user().
+ */
+static int get_state(struct umcg_task __user *ut, u32 *state)
+{
+ u32 __user *uaddr = (u32 __user *)ut;
+
+ return get_user(*state, uaddr);
+}
+
+/*
+ * Store @state into the u32 located at the very start of the userspace @ut.
+ *
+ * Returns the result of put_user().
+ */
+static int put_state(struct umcg_task __user *ut, u32 state)
+{
+ u32 __user *uaddr = (u32 __user *)ut;
+
+ return put_user(state, uaddr);
+}
+
+/*
+ * Register current as a core UMCG task described by the userspace @umcg_task.
+ *
+ * The state word at the start of @umcg_task must be UMCG_TASK_NONE on entry;
+ * it is set to UMCG_TASK_RUNNING before the kernel-side data is published in
+ * current->umcg_task_data.
+ *
+ * Returns 0 on success, -EFAULT if @umcg_task cannot be read or written,
+ * -EINVAL if the state word is not UMCG_TASK_NONE, or -ENOMEM if the
+ * kernel-side data cannot be allocated.
+ */
+static int register_core_task(u32 api_version, struct umcg_task __user *umcg_task)
+{
+ struct umcg_task_data *utd;
+ u32 state;
+
+ if (get_state(umcg_task, &state))
+ return -EFAULT;
+
+ if (state != UMCG_TASK_NONE)
+ return -EINVAL;
+
+ utd = kzalloc(sizeof(*utd), GFP_KERNEL);
+ if (!utd)
+ return -ENOMEM;
+
+ utd->self = current;
+ utd->umcg_task = umcg_task;
+ utd->task_type = UMCG_TT_CORE;
+ utd->api_version = api_version;
+
+ /* Nothing has been published yet, so a failed write only needs a kfree(). */
+ if (put_state(umcg_task, UMCG_TASK_RUNNING)) {
+ kfree(utd);
+ return -EFAULT;
+ }
+
+ task_lock(current);
+ rcu_assign_pointer(current->umcg_task_data, utd);
+ task_unlock(current);
+
+ return 0;
}

/**
@@ -54,7 +111,20 @@ SYSCALL_DEFINE2(umcg_api_version, u32, api_version, u32, flags)
SYSCALL_DEFINE4(umcg_register_task, u32, api_version, u32, flags, u32, group_id,
struct umcg_task __user *, umcg_task)
{
- return -ENOSYS;
+ /* Only API versions accepted by __api_version() may be used. */
+ if (__api_version(api_version) != 0)
+ return -EOPNOTSUPP;
+
+ /* A userspace descriptor is mandatory. */
+ if (!umcg_task)
+ return -EINVAL;
+
+ /* Reject a task that already has UMCG data attached. */
+ if (rcu_access_pointer(current->umcg_task_data))
+ return -EINVAL;
+
+ /*
+ * Core-task registration is the only request handled here, and it
+ * must be made with group_id == UMCG_NOID.
+ */
+ if (flags != UMCG_REGISTER_CORE_TASK || group_id != UMCG_NOID)
+ return -EINVAL;
+
+ return register_core_task(api_version, umcg_task);
}

/**
@@ -67,7 +137,75 @@ SYSCALL_DEFINE4(umcg_register_task, u32, api_version, u32, flags, u32, group_id,
*/
SYSCALL_DEFINE1(umcg_unregister_task, u32, flags)
{
- return -ENOSYS;
+ struct umcg_task_data *utd;
+
+ /* Any non-zero @flags value is rejected. */
+ if (flags)
+ return -EINVAL;
+
+ rcu_read_lock();
+ utd = rcu_dereference(current->umcg_task_data);
+ if (!utd) {
+ /* current is not registered. */
+ rcu_read_unlock();
+ return -EINVAL;
+ }
+
+ /* Unpublish the data so that new RCU readers no longer find it. */
+ task_lock(current);
+ rcu_assign_pointer(current->umcg_task_data, NULL);
+ task_unlock(current);
+ rcu_read_unlock();
+
+ /* Wait for readers that may still hold the old pointer, then free. */
+ synchronize_rcu();
+ kfree(utd);
+
+ return 0;
+}
+
+/*
+ * Wake @next (passing WF_CURRENT_CPU to try_to_wake_up()) and put current
+ * to sleep until it is woken up or a signal arrives.
+ *
+ * Returns 0 on a normal wakeup, -EINVAL if current has no UMCG data,
+ * -EAGAIN if @next could not be woken, or -EINTR if a signal is pending
+ * once current runs again.
+ */
+static int do_context_switch(struct task_struct *next)
+{
+ struct umcg_task_data *utd = rcu_access_pointer(current->umcg_task_data);
+
+ if (!utd)
+ return -EINVAL;
+
+ /*
+ * It is important to set_current_state(TASK_INTERRUPTIBLE) before
+ * waking @next, as @next may immediately try to wake current back
+ * (e.g. current is a server, @next is a worker that immediately
+ * blocks or waits), and this next wakeup must not be lost.
+ */
+ set_current_state(TASK_INTERRUPTIBLE);
+
+ WRITE_ONCE(utd->in_wait, true);
+
+ if (!try_to_wake_up(next, TASK_NORMAL, WF_CURRENT_CPU)) {
+ /*
+ * current is not going to sleep after all: undo the wait
+ * setup so that it does not keep running in
+ * TASK_INTERRUPTIBLE with in_wait still set.
+ */
+ __set_current_state(TASK_RUNNING);
+ WRITE_ONCE(utd->in_wait, false);
+ return -EAGAIN;
+ }
+
+ freezable_schedule();
+
+ WRITE_ONCE(utd->in_wait, false);
+
+ if (signal_pending(current))
+ return -EINTR;
+
+ return 0;
+}
+
+/*
+ * Put current to sleep in TASK_INTERRUPTIBLE until it is woken up.
+ *
+ * Returns 0 on wakeup, -EINVAL if current has no UMCG data, or -EINTR if a
+ * signal is pending once current runs again.
+ */
+static int do_wait(void)
+{
+ struct umcg_task_data *utd = rcu_access_pointer(current->umcg_task_data);
+
+ if (!utd)
+ return -EINVAL;
+
+ /*
+ * Advertise that current is waiting: umcg_wake() and umcg_swap()
+ * return -EAGAIN to the waker while this flag is false.
+ */
+ WRITE_ONCE(utd->in_wait, true);
+
+ set_current_state(TASK_INTERRUPTIBLE);
+ freezable_schedule();
+
+ /* Running again: no longer waiting. */
+ WRITE_ONCE(utd->in_wait, false);
+
+ if (signal_pending(current))
+ return -EINTR;
+
+ return 0;
}

/**
@@ -90,7 +228,23 @@ SYSCALL_DEFINE1(umcg_unregister_task, u32, flags)
SYSCALL_DEFINE2(umcg_wait, u32, flags,
const struct __kernel_timespec __user *, timeout)
{
- return -ENOSYS;
+ bool registered;
+
+ /* Any non-zero @flags value is rejected. */
+ if (flags)
+ return -EINVAL;
+
+ /* A non-NULL @timeout is answered with -EOPNOTSUPP. */
+ if (timeout)
+ return -EOPNOTSUPP;
+
+ /* Only tasks that have UMCG data attached may wait. */
+ rcu_read_lock();
+ registered = rcu_dereference(current->umcg_task_data) != NULL;
+ rcu_read_unlock();
+
+ if (!registered)
+ return -EINVAL;
+
+ return do_wait();
}

/**
@@ -110,7 +264,39 @@ SYSCALL_DEFINE2(umcg_wait, u32, flags,
*/
SYSCALL_DEFINE2(umcg_wake, u32, flags, u32, next_tid)
{
- return -ENOSYS;
+ struct umcg_task_data *next_utd;
+ struct task_struct *next;
+ int ret = -EINVAL;
+
+ if (!next_tid)
+ return -EINVAL;
+ if (flags)
+ return -EINVAL;
+
+ /* Takes a reference on @next; it is dropped at "out" on every path. */
+ next = find_get_task_by_vpid(next_tid);
+ if (!next)
+ return -ESRCH;
+
+ rcu_read_lock();
+ next_utd = rcu_dereference(next->umcg_task_data);
+ if (!next_utd)
+ goto out;
+
+ /* @next is not marked as waiting: report -EAGAIN to the caller. */
+ if (!READ_ONCE(next_utd->in_wait)) {
+ ret = -EAGAIN;
+ goto out;
+ }
+
+ /* A zero return from wake_up_process() is reported as -EAGAIN. */
+ ret = wake_up_process(next) ? 0 : -EAGAIN;
+
+out:
+ rcu_read_unlock();
+ put_task_struct(next);
+ return ret;
}

/**
@@ -139,5 +325,44 @@ SYSCALL_DEFINE2(umcg_wake, u32, flags, u32, next_tid)
SYSCALL_DEFINE4(umcg_swap, u32, wake_flags, u32, next_tid, u32, wait_flags,
const struct __kernel_timespec __user *, timeout)
{
- return -ENOSYS;
+ struct umcg_task_data *curr_utd;
+ struct umcg_task_data *next_utd;
+ struct task_struct *next = NULL;
+ int ret = -EINVAL;
+
+ rcu_read_lock();
+ curr_utd = rcu_dereference(current->umcg_task_data);
+
+ if (!next_tid || wake_flags || wait_flags || !curr_utd)
+ goto out;
+
+ /* A non-NULL @timeout is answered with -EOPNOTSUPP. */
+ if (timeout) {
+ ret = -EOPNOTSUPP;
+ goto out;
+ }
+
+ /* Takes a reference on @next that must be dropped on every path. */
+ next = find_get_task_by_vpid(next_tid);
+ if (!next) {
+ ret = -ESRCH;
+ goto out;
+ }
+
+ next_utd = rcu_dereference(next->umcg_task_data);
+ if (!next_utd) {
+ ret = -EINVAL;
+ goto out;
+ }
+
+ /* @next is not marked as waiting: report -EAGAIN to the caller. */
+ if (!READ_ONCE(next_utd->in_wait)) {
+ ret = -EAGAIN;
+ goto out;
+ }
+
+ rcu_read_unlock();
+
+ /* The reference taken above keeps @next valid outside the RCU section. */
+ ret = do_context_switch(next);
+ put_task_struct(next);
+ return ret;
+
+out:
+ rcu_read_unlock();
+ if (next)
+ put_task_struct(next);
+ return ret;
}
diff --git a/kernel/sched/umcg.h b/kernel/sched/umcg.h
new file mode 100644
index 000000000000..6791d570f622
--- /dev/null
+++ b/kernel/sched/umcg.h
@@ -0,0 +1,42 @@
+/* SPDX-License-Identifier: GPL-2.0 */
+
+#ifndef _KERNEL_SCHED_UMCG_H
+#define _KERNEL_SCHED_UMCG_H
+
+#ifdef CONFIG_UMCG
+
+#include <linux/sched.h>
+#include <linux/umcg.h>
+
+/* Kind of UMCG task; stored in umcg_task_data.task_type. */
+enum umcg_task_type {
+ UMCG_TT_CORE = 1,
+ UMCG_TT_SERVER = 2,
+ UMCG_TT_WORKER = 3
+};
+
+/*
+ * Kernel-side per-task UMCG data, referenced through the RCU-protected
+ * task_struct.umcg_task_data pointer.
+ */
+struct umcg_task_data {
+ /* umcg_task != NULL. Never changes. */
+ struct umcg_task __user *umcg_task;
+
+ /* The task that owns this umcg_task_data. Never changes. */
+ struct task_struct *self;
+
+ /* Core task, server, or worker. Never changes. */
+ enum umcg_task_type task_type;
+
+ /*
+ * The API version used to register this task. If this is a
+ * worker or a server, must equal group->api_version.
+ *
+ * Never changes.
+ */
+ u32 api_version;
+
+ /*
+ * Used by wait/wake routines to handle races. Written only by current
+ * (with WRITE_ONCE()); wakers read it with READ_ONCE().
+ */
+ bool in_wait;
+};
+
+#endif /* CONFIG_UMCG */
+#endif /* _KERNEL_SCHED_UMCG_H */
--
2.31.1.818.g46aad6cb9e-goog