Re: [PATCH v5 08/24] sched: Introduce per memory space current virtual cpu id

From: Peter Zijlstra
Date: Wed Nov 09 2022 - 05:19:45 EST


On Tue, Nov 08, 2022 at 03:07:42PM -0500, Mathieu Desnoyers wrote:
> On 2022-11-08 08:04, Peter Zijlstra wrote:
> > On Thu, Nov 03, 2022 at 04:03:43PM -0400, Mathieu Desnoyers wrote:
> >
> > > The credit goes to Paul Turner (Google) for the vcpu_id idea. This
> > > feature is implemented based on the discussions with Paul Turner and
> > > Peter Oskolkov (Google), but I took the liberty to implement scheduler
> > > fast-path optimizations and my own NUMA-awareness scheme. The rumor has
> > > it that Google have been running a rseq vcpu_id extension internally at
> > > Google in production for a year. The tcmalloc source code indeed has
> > > comments hinting at a vcpu_id prototype extension to the rseq system
> > > call [1].
> >
> > Re NUMA thing -- that means that on a 512 node system a single threaded
> > task can still observe 512 separate vcpu-ids, right?
>
> Yes, that's correct.

So,.. I've been thinking. How about we expose _two_ vcpu-ids :-)

One is the original, system-wide min(nr_threads, nr_cpus), and the other
is a per-node vcpu-id: not like you did, but more along the lines of
min(nr_threads, nr_cpus_per_node).

That way userspace can either use:

- rseq::cpu_id -- as it does today

- rseq::vcpu_id -- (when -1, see rseq::cpu_id) the original vcpu_id
proposal, which is typically good enough when your
application is not NUMA aware and relies on NUMA
balancing (most every application out there)

- rseq::node_id *and*
rseq::vcpu_node_id -- Combined they give both node locality
and a *dense* space within the node.

This then lets the application pick whatever is best.
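
Purely for illustration (pool_slot() and nr_pools_per_node are made up
here, and however the thread gets at its registered struct rseq is up to
the runtime), a per-node pool allocator could then pick its slot with
something like:

	#include <stdint.h>
	#include <linux/rseq.h>		/* struct rseq */

	/*
	 * Pick a slot in a per-node array of pools; rs points at this
	 * thread's registered struct rseq.  A real allocator would read
	 * these fields inside an rseq critical section, plain loads here
	 * can race with migration.
	 */
	static inline uint32_t pool_slot(const volatile struct rseq *rs,
					 uint32_t nr_pools_per_node)
	{
		uint32_t vcpu = rs->vm_vcpu_node_id;

		if ((int32_t)vcpu < 0)
			return rs->cpu_id;	/* not provided, see rseq::cpu_id */

		/* node local, and dense within the node */
		return rs->node_id * nr_pools_per_node + vcpu;
	}

That way the single threaded task from the 512 node example above should
end up with vcpu_node_id 0 on whichever node it lands, and the dense
per-node space keeps nr_pools_per_node down to
min(nr_threads, nr_cpus_per_node).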

Note that using node-affine memory allocations by default is a *very*
bad idea since it wrecks NUMA balancing; you really should only use this
if your application is fully NUMA aware and knows what it is doing.


---
include/linux/mm.h | 15 ++--
include/linux/mm_types.h | 23 +-----
include/linux/sched.h | 1
include/uapi/linux/rseq.h | 1
kernel/fork.c | 1
kernel/rseq.c | 9 ++
kernel/sched/core.c | 18 ++---
kernel/sched/sched.h | 157 +++++++++-------------------------------------
8 files changed, 66 insertions(+), 159 deletions(-)

--- a/include/linux/mm.h
+++ b/include/linux/mm.h
@@ -3484,6 +3484,10 @@ static inline int task_mm_vcpu_id(struct
{
return t->mm_vcpu;
}
+static inline int task_mm_vcpu_node_id(struct task_struct *t)
+{
+ return t->mm_vcpu_node;
+}
#else
static inline void sched_vcpu_before_execve(struct task_struct *t) { }
static inline void sched_vcpu_after_execve(struct task_struct *t) { }
@@ -3491,12 +3495,11 @@ static inline void sched_vcpu_fork(struc
static inline void sched_vcpu_exit_signals(struct task_struct *t) { }
static inline int task_mm_vcpu_id(struct task_struct *t)
{
- /*
- * Use the processor id as a fall-back when the mm vcpu feature is
- * disabled. This provides functional per-cpu data structure accesses
- * in user-space, althrough it won't provide the memory usage benefits.
- */
- return raw_smp_processor_id();
+ return -1; /* userspace should use cpu_id */
+}
+static inline int task_mm_vcpu_node_id(struct task_struct *t)
+{
+ return -1;
}
#endif

--- a/include/linux/mm_types.h
+++ b/include/linux/mm_types.h
@@ -568,7 +568,7 @@ struct mm_struct {
* bitmap words as they were being concurrently updated by the
* updaters.
*/
- spinlock_t vcpu_lock;
+ raw_spinlock_t vcpu_lock;
#endif
#ifdef CONFIG_MMU
atomic_long_t pgtables_bytes; /* PTE page table pages */
@@ -875,28 +875,15 @@ static inline unsigned int mm_vcpumask_s

#if defined(CONFIG_SCHED_MM_VCPU) && defined(CONFIG_NUMA)
/*
- * Layout of node vcpumasks:
- * - node_alloc vcpumask: cpumask tracking which vcpu_id were
- * allocated (across nodes) in this
- * memory space.
* - node vcpumask[nr_node_ids]: per-node cpumask tracking which vcpu_id
* were allocated in this memory space.
*/
-static inline cpumask_t *mm_node_alloc_vcpumask(struct mm_struct *mm)
+static inline cpumask_t *mm_node_vcpumask(struct mm_struct *mm, unsigned int node)
{
unsigned long vcpu_bitmap = (unsigned long)mm_vcpumask(mm);

/* Skip mm_vcpumask */
vcpu_bitmap += cpumask_size();
- return (struct cpumask *)vcpu_bitmap;
-}
-
-static inline cpumask_t *mm_node_vcpumask(struct mm_struct *mm, unsigned int node)
-{
- unsigned long vcpu_bitmap = (unsigned long)mm_node_alloc_vcpumask(mm);
-
- /* Skip node alloc vcpumask */
- vcpu_bitmap += cpumask_size();
vcpu_bitmap += node * cpumask_size();
return (struct cpumask *)vcpu_bitmap;
}
@@ -907,21 +894,21 @@ static inline void mm_init_node_vcpumask

if (num_possible_nodes() == 1)
return;
- cpumask_clear(mm_node_alloc_vcpumask(mm));
+
for (node = 0; node < nr_node_ids; node++)
cpumask_clear(mm_node_vcpumask(mm, node));
}

static inline void mm_init_vcpu_lock(struct mm_struct *mm)
{
- spin_lock_init(&mm->vcpu_lock);
+ raw_spin_lock_init(&mm->vcpu_lock);
}

static inline unsigned int mm_node_vcpumask_size(void)
{
if (num_possible_nodes() == 1)
return 0;
- return (nr_node_ids + 1) * cpumask_size();
+ return nr_node_ids * cpumask_size();
}
#else
static inline void mm_init_node_vcpumask(struct mm_struct *mm) { }
--- a/include/linux/sched.h
+++ b/include/linux/sched.h
@@ -1316,6 +1316,7 @@ struct task_struct {

#ifdef CONFIG_SCHED_MM_VCPU
int mm_vcpu; /* Current vcpu in mm */
+ int mm_vcpu_node; /* Current vcpu for this node */
int mm_vcpu_active; /* Whether vcpu bitmap is active */
#endif

--- a/include/uapi/linux/rseq.h
+++ b/include/uapi/linux/rseq.h
@@ -147,6 +147,7 @@ struct rseq {
* (allocated uniquely within a memory space).
*/
__u32 vm_vcpu_id;
+ __u32 vm_vcpu_node_id;

/*
* Flexible array member at end of structure, after last feature field.
--- a/kernel/fork.c
+++ b/kernel/fork.c
@@ -1049,6 +1049,7 @@ static struct task_struct *dup_task_stru

#ifdef CONFIG_SCHED_MM_VCPU
tsk->mm_vcpu = -1;
+ tsk->mm_vcpu_node = -1;
tsk->mm_vcpu_active = 0;
#endif
return tsk;
--- a/kernel/rseq.c
+++ b/kernel/rseq.c
@@ -91,6 +91,7 @@ static int rseq_update_cpu_node_id(struc
u32 cpu_id = raw_smp_processor_id();
u32 node_id = cpu_to_node(cpu_id);
u32 vm_vcpu_id = task_mm_vcpu_id(t);
+ u32 vm_vcpu_node_id = task_mm_vcpu_node_id(t);

WARN_ON_ONCE((int) vm_vcpu_id < 0);
if (!user_write_access_begin(rseq, t->rseq_len))
@@ -99,6 +100,7 @@ static int rseq_update_cpu_node_id(struc
unsafe_put_user(cpu_id, &rseq->cpu_id, efault_end);
unsafe_put_user(node_id, &rseq->node_id, efault_end);
unsafe_put_user(vm_vcpu_id, &rseq->vm_vcpu_id, efault_end);
+ unsafe_put_user(vm_vcpu_node_id, &rseq->vm_vcpu_node_id, efault_end);
/*
* Additional feature fields added after ORIG_RSEQ_SIZE
* need to be conditionally updated only if
@@ -116,8 +118,8 @@ static int rseq_update_cpu_node_id(struc

static int rseq_reset_rseq_cpu_node_id(struct task_struct *t)
{
- u32 cpu_id_start = 0, cpu_id = RSEQ_CPU_ID_UNINITIALIZED, node_id = 0,
- vm_vcpu_id = 0;
+ u32 cpu_id_start = 0, cpu_id = RSEQ_CPU_ID_UNINITIALIZED;
+ u32 node_id = 0, vm_vcpu_id = -1, vm_vcpu_node_id = -1;

/*
* Reset cpu_id_start to its initial state (0).
@@ -141,6 +143,9 @@ static int rseq_reset_rseq_cpu_node_id(s
*/
if (put_user(vm_vcpu_id, &t->rseq->vm_vcpu_id))
return -EFAULT;
+
+ if (put_user(vm_vcpu_node_id, &t->rseq->vm_vcpu_node_id))
+ return -EFAULT;
/*
* Additional feature fields added after ORIG_RSEQ_SIZE
* need to be conditionally reset only if
--- a/kernel/sched/core.c
+++ b/kernel/sched/core.c
@@ -11278,15 +11278,13 @@ void call_trace_sched_update_nr_running(
#ifdef CONFIG_SCHED_MM_VCPU
void sched_vcpu_exit_signals(struct task_struct *t)
{
- struct mm_struct *mm = t->mm;
unsigned long flags;

- if (!mm)
+ if (!t->mm)
return;
+
local_irq_save(flags);
- mm_vcpu_put(mm, t->mm_vcpu);
- t->mm_vcpu = -1;
- t->mm_vcpu_active = 0;
+ mm_vcpu_put(t);
local_irq_restore(flags);
}

@@ -11297,10 +11295,9 @@ void sched_vcpu_before_execve(struct tas

if (!mm)
return;
+
local_irq_save(flags);
- mm_vcpu_put(mm, t->mm_vcpu);
- t->mm_vcpu = -1;
- t->mm_vcpu_active = 0;
+ mm_vcpu_put(t);
local_irq_restore(flags);
}

@@ -11312,9 +11309,9 @@ void sched_vcpu_after_execve(struct task
WARN_ON_ONCE((t->flags & PF_KTHREAD) || !t->mm);

local_irq_save(flags);
- t->mm_vcpu = mm_vcpu_get(mm);
- t->mm_vcpu_active = 1;
+ mm_vcpu_get(t);
local_irq_restore(flags);
+
rseq_set_notify_resume(t);
}

@@ -11322,6 +11319,7 @@ void sched_vcpu_fork(struct task_struct
{
WARN_ON_ONCE((t->flags & PF_KTHREAD) || !t->mm);
t->mm_vcpu = -1;
+ t->mm_vcpu_node = -1;
t->mm_vcpu_active = 1;
}
#endif
--- a/kernel/sched/sched.h
+++ b/kernel/sched/sched.h
@@ -3262,147 +3262,57 @@ static inline void update_current_exec_r
}

#ifdef CONFIG_SCHED_MM_VCPU
-static inline int __mm_vcpu_get_single_node(struct mm_struct *mm)
+static inline int __mm_vcpu_get(struct cpumask *cpumask)
{
- struct cpumask *cpumask;
int vcpu;

- cpumask = mm_vcpumask(mm);
vcpu = cpumask_first_zero(cpumask);
- if (vcpu >= nr_cpu_ids)
+ if (WARN_ON_ONCE(vcpu >= nr_cpu_ids))
return -1;
+
__cpumask_set_cpu(vcpu, cpumask);
return vcpu;
}

-#ifdef CONFIG_NUMA
-static inline bool mm_node_vcpumask_test_cpu(struct mm_struct *mm, int vcpu_id)
-{
- if (num_possible_nodes() == 1)
- return true;
- return cpumask_test_cpu(vcpu_id, mm_node_vcpumask(mm, numa_node_id()));
-}

-static inline int __mm_vcpu_get(struct mm_struct *mm)
+static inline void __mm_vcpu_put(struct task_struct *p, bool clear_active)
{
- struct cpumask *cpumask = mm_vcpumask(mm),
- *node_cpumask = mm_node_vcpumask(mm, numa_node_id()),
- *node_alloc_cpumask = mm_node_alloc_vcpumask(mm);
- unsigned int node;
- int vcpu;
-
- if (num_possible_nodes() == 1)
- return __mm_vcpu_get_single_node(mm);
-
- /*
- * Try to reserve lowest available vcpu number within those already
- * reserved for this NUMA node.
- */
- vcpu = cpumask_first_andnot(node_cpumask, cpumask);
- if (vcpu >= nr_cpu_ids)
- goto alloc_numa;
- __cpumask_set_cpu(vcpu, cpumask);
- goto end;
+ struct mm_struct *mm = p->mm;
+
+ lockdep_assert_irqs_disabled();
+ WARN_ON_ONCE(p->mm_vcpu < 0);

-alloc_numa:
- /*
- * Try to reserve lowest available vcpu number within those not already
- * allocated for numa nodes.
- */
- vcpu = cpumask_first_notandnot(node_alloc_cpumask, cpumask);
- if (vcpu >= nr_cpu_ids)
- goto numa_update;
- __cpumask_set_cpu(vcpu, cpumask);
- __cpumask_set_cpu(vcpu, node_cpumask);
- __cpumask_set_cpu(vcpu, node_alloc_cpumask);
- goto end;
-
-numa_update:
- /*
- * NUMA node id configuration changed for at least one CPU in the system.
- * We need to steal a currently unused vcpu_id from an overprovisioned
- * node for our current node. Userspace must handle the fact that the
- * node id associated with this vcpu_id may change due to node ID
- * reconfiguration.
- *
- * Count how many possible cpus are attached to each (other) node id,
- * and compare this with the per-mm node vcpumask cpu count. Find one
- * which has too many cpus in its mask to steal from.
- */
- for (node = 0; node < nr_node_ids; node++) {
- struct cpumask *iter_cpumask;
-
- if (node == numa_node_id())
- continue;
- iter_cpumask = mm_node_vcpumask(mm, node);
- if (nr_cpus_node(node) < cpumask_weight(iter_cpumask)) {
- /* Try to steal from this node. */
- vcpu = cpumask_first_andnot(iter_cpumask, cpumask);
- if (vcpu >= nr_cpu_ids)
- goto steal_fail;
- __cpumask_set_cpu(vcpu, cpumask);
- __cpumask_clear_cpu(vcpu, iter_cpumask);
- __cpumask_set_cpu(vcpu, node_cpumask);
- goto end;
- }
- }
+ raw_spin_lock(&mm->vcpu_lock);

-steal_fail:
- /*
- * Our attempt at gracefully stealing a vcpu_id from another
- * overprovisioned NUMA node failed. Fallback to grabbing the first
- * available vcpu_id.
- */
- vcpu = cpumask_first_zero(cpumask);
- if (vcpu >= nr_cpu_ids)
- return -1;
- __cpumask_set_cpu(vcpu, cpumask);
- /* Steal vcpu from its numa node mask. */
- for (node = 0; node < nr_node_ids; node++) {
- struct cpumask *iter_cpumask;
-
- if (node == numa_node_id())
- continue;
- iter_cpumask = mm_node_vcpumask(mm, node);
- if (cpumask_test_cpu(vcpu, iter_cpumask)) {
- __cpumask_clear_cpu(vcpu, iter_cpumask);
- break;
- }
- }
- __cpumask_set_cpu(vcpu, node_cpumask);
-end:
- return vcpu;
-}
+ __cpumask_clear_cpu(p->mm_vcpu, mm_vcpumask(mm));
+ p->mm_vcpu = -1;
+#ifdef CONFIG_NUMA
+ __cpumask_clear_cpu(p->mm_vcpu_node, mm_node_vcpumask(mm, task_node(p)));
+ p->mm_vcpu_node = -1;
+#endif
+ if (clear_active)
+ p->mm_vcpu_active = 0;

-#else
-static inline bool mm_node_vcpumask_test_cpu(struct mm_struct *mm, int vcpu_id)
-{
- return true;
-}
-static inline int __mm_vcpu_get(struct mm_struct *mm)
-{
- return __mm_vcpu_get_single_node(mm);
+ raw_spin_unlock(&mm->vcpu_lock);
}
-#endif

-static inline void mm_vcpu_put(struct mm_struct *mm, int vcpu)
+static inline void mm_vcpu_put(struct task_struct *p)
{
- lockdep_assert_irqs_disabled();
- if (vcpu < 0)
- return;
- spin_lock(&mm->vcpu_lock);
- __cpumask_clear_cpu(vcpu, mm_vcpumask(mm));
- spin_unlock(&mm->vcpu_lock);
+ __mm_vcpu_put(p, true);
}

-static inline int mm_vcpu_get(struct mm_struct *mm)
+static inline void mm_vcpu_get(struct task_struct *p)
{
- int ret;
+ struct mm_struct *mm = p->mm;

lockdep_assert_irqs_disabled();
- spin_lock(&mm->vcpu_lock);
- ret = __mm_vcpu_get(mm);
- spin_unlock(&mm->vcpu_lock);
+ WARN_ON_ONCE(p->mm_vcpu >= 0);
+
+ raw_spin_lock(&mm->vcpu_lock);
+ p->mm_vcpu = __mm_vcpu_get(mm_vcpumask(mm));
+#ifdef CONFIG_NUMA
+ p->mm_vcpu_node = __mm_vcpu_get(mm_node_vcpumask(mm, task_node(p)));
+#endif
+ p->mm_vcpu_active = 1;
+ raw_spin_unlock(&mm->vcpu_lock);
- return ret;
}

@@ -3414,15 +3324,16 @@ static inline void switch_mm_vcpu(struct
* Context switch between threads in same mm, hand over
* the mm_vcpu from prev to next.
*/
- next->mm_vcpu = prev->mm_vcpu;
- prev->mm_vcpu = -1;
+ swap(next->mm_vcpu, prev->mm_vcpu);
+#ifdef CONFIG_NUMA
+ swap(next->mm_vcpu_node, prev->mm_vcpu_node);
+#endif
return;
}
- mm_vcpu_put(prev->mm, prev->mm_vcpu);
- prev->mm_vcpu = -1;
+ __mm_vcpu_put(prev, false);
}
if (next->mm_vcpu_active)
- next->mm_vcpu = mm_vcpu_get(next->mm);
+ mm_vcpu_get(next);
}

#else