[RFC PATCH] sched: Rate limit migrations

From: Mathieu Desnoyers
Date: Tue Apr 11 2023 - 17:41:29 EST


This WIP patch rate-limits migrations: each task may be migrated at most
32 times per 10 ms window.

The specific migration count and window size can be changed with the
following defines in kernel/sched/sched.h:

- SCHED_MIGRATION_WINDOW_NS
- SCHED_MIGRATION_LIMIT

Testing is welcome, especially to see whether it helps with Aaron's
migration-heavy workload with respect to the rseq concurrency ID
performance regression.

Link: https://lore.kernel.org/lkml/20230327080502.GA570847@ziqianlu-desk2/
Signed-off-by: Mathieu Desnoyers <mathieu.desnoyers@xxxxxxxxxxxx>
Cc: Aaron Lu <aaron.lu@xxxxxxxxx>
Cc: Peter Zijlstra <peterz@xxxxxxxxxxxxx>
Cc: Olivier Dion <odion@xxxxxxxxxxxx>
Cc: michael.christie@xxxxxxxxxx
---
include/linux/sched.h | 9 +++++++++
kernel/fork.c | 3 +++
kernel/sched/core.c | 42 ++++++++++++++++++++++++++++++++++++++----
kernel/sched/sched.h | 7 +++++++
4 files changed, 57 insertions(+), 4 deletions(-)

diff --git a/include/linux/sched.h b/include/linux/sched.h
index 48d48b2c73a5..bfd5e268900c 100644
--- a/include/linux/sched.h
+++ b/include/linux/sched.h
@@ -1316,6 +1316,15 @@ struct task_struct {
int last_mm_cid; /* Most recent cid in mm */
int mm_cid_active; /* Whether cid bitmap is active */
#endif
+ /*
+ * Keep track of last migration time to compare sched_clock
+ * locally from a single CPU perspective.
+ */
+ u64 last_migration_time;
+ /* Time slice used in current migration window. */
+ u64 migration_window_time_slice;
+ /* Number of migrations in current migration window. */
+ u32 migration_count;

struct tlbflush_unmap_batch tlb_ubc;

diff --git a/kernel/fork.c b/kernel/fork.c
index 3832bea713c4..791792a218f6 100644
--- a/kernel/fork.c
+++ b/kernel/fork.c
@@ -1061,6 +1061,9 @@ static struct task_struct *dup_task_struct(struct task_struct *orig, int node)
tsk->last_mm_cid = -1;
tsk->mm_cid_active = 0;
#endif
+ tsk->last_migration_time = 0;
+ tsk->migration_window_time_slice = 0;
+ tsk->migration_count = 0;
return tsk;

free_stack:
diff --git a/kernel/sched/core.c b/kernel/sched/core.c
index 2aac6f14f21c..a530727b11f3 100644
--- a/kernel/sched/core.c
+++ b/kernel/sched/core.c
@@ -2087,6 +2087,7 @@ void activate_task(struct rq *rq, struct task_struct *p, int flags)
if (task_on_rq_migrating(p)) {
flags |= ENQUEUE_MIGRATED;
sched_mm_cid_migrate_to(rq, p);
+ p->last_migration_time = sched_clock();
}

enqueue_task(rq, p, flags);
@@ -3547,17 +3548,47 @@ static int select_fallback_rq(int cpu, struct task_struct *p)
return dest_cpu;
}

+/* True if @p may migrate: window expired or migration budget remains. */
+static inline
+bool migration_allowed(struct task_struct *p, u64 current_time)
+{
+	u64 elapsed = current_time - p->last_migration_time;
+
+	if (p->migration_count < SCHED_MIGRATION_LIMIT)
+		return true;
+	return elapsed + p->migration_window_time_slice > SCHED_MIGRATION_WINDOW_NS;
+}
+
+/* Account a migration of @p; one that ends a window opens the next one. */
+static inline
+void migration_add_delta_to_slice(struct task_struct *p, u64 current_time)
+{
+	u64 delta = current_time - p->last_migration_time;
+
+	if (delta + p->migration_window_time_slice > SCHED_MIGRATION_WINDOW_NS) {
+		p->migration_window_time_slice = 0;
+		p->migration_count = 1;	/* Count this migration in the new window. */
+	} else {
+		p->migration_window_time_slice += delta;
+		p->migration_count++;
+	}
+}
+
/*
* The caller (fork, wakeup) owns p->pi_lock, ->cpus_ptr is stable.
*/
static inline
-int select_task_rq(struct task_struct *p, int cpu, int wake_flags)
+int select_task_rq(struct task_struct *p, int prev_cpu, int wake_flags)
{
+ u64 current_time = sched_clock();
+ int cpu = prev_cpu;
+
lockdep_assert_held(&p->pi_lock);

- if (p->nr_cpus_allowed > 1 && !is_migration_disabled(p))
- cpu = p->sched_class->select_task_rq(p, cpu, wake_flags);
- else
+ if (p->nr_cpus_allowed > 1 && !is_migration_disabled(p)) {
+ if (migration_allowed(p, current_time))
+ cpu = p->sched_class->select_task_rq(p, cpu, wake_flags);
+ } else
cpu = cpumask_any(p->cpus_ptr);

/*
@@ -3573,6 +3604,9 @@ int select_task_rq(struct task_struct *p, int cpu, int wake_flags)
if (unlikely(!is_cpu_allowed(p, cpu)))
cpu = select_fallback_rq(task_cpu(p), p);

+ if (prev_cpu != cpu)
+ migration_add_delta_to_slice(p, current_time);
+
return cpu;
}

diff --git a/kernel/sched/sched.h b/kernel/sched/sched.h
index 64220134fb45..e52cc38f10fc 100644
--- a/kernel/sched/sched.h
+++ b/kernel/sched/sched.h
@@ -104,6 +104,13 @@ struct cpuidle_state;
#define TASK_ON_RQ_QUEUED 1
#define TASK_ON_RQ_MIGRATING 2

+/*
+ * A task can be migrated at most SCHED_MIGRATION_LIMIT times per
+ * sched-migration window of SCHED_MIGRATION_WINDOW_NS nanoseconds.
+ */
+#define SCHED_MIGRATION_WINDOW_NS (10ULL * 1000000) /* 10 ms */
+#define SCHED_MIGRATION_LIMIT 32
+
extern __read_mostly int scheduler_running;

extern unsigned long calc_load_update;
--
2.25.1