[RFC PATCH 44/86] sched: voluntary preemption

From: Ankur Arora
Date: Tue Nov 07 2023 - 17:04:38 EST


The no preemption model allows running to completion in kernel context.
For voluntary preemption, allow preemption by higher scheduling
classes.

To do this, resched_curr() now takes a parameter that specifies whether
the resched is on behalf of a scheduler class above that of the
runqueue's current task, and reschedules eagerly if so.

Also define the scheduler feature PREEMPT_PRIORITY, which can be used to
toggle the voluntary preemption model at runtime.

TODO: Both RT and deadline work, but I'm almost certainly not doing all
the right things for either.

Signed-off-by: Ankur Arora <ankur.a.arora@xxxxxxxxxx>
---
kernel/Kconfig.preempt | 19 ++++++-------------
kernel/sched/core.c | 28 +++++++++++++++++-----------
kernel/sched/core_sched.c | 2 +-
kernel/sched/deadline.c | 22 +++++++++++-----------
kernel/sched/fair.c | 18 +++++++++---------
kernel/sched/features.h | 5 +++++
kernel/sched/idle.c | 2 +-
kernel/sched/rt.c | 26 +++++++++++++-------------
kernel/sched/sched.h | 2 +-
9 files changed, 64 insertions(+), 60 deletions(-)

diff --git a/kernel/Kconfig.preempt b/kernel/Kconfig.preempt
index 074fe5e253b5..e16114b679e3 100644
--- a/kernel/Kconfig.preempt
+++ b/kernel/Kconfig.preempt
@@ -20,23 +20,16 @@ config PREEMPT_NONE
at runtime.

config PREEMPT_VOLUNTARY
- bool "Voluntary Kernel Preemption (Desktop)"
+ bool "Voluntary Kernel Preemption"
depends on !ARCH_NO_PREEMPT
select PREEMPTION
help
- This option reduces the latency of the kernel by adding more
- "explicit preemption points" to the kernel code. These new
- preemption points have been selected to reduce the maximum
- latency of rescheduling, providing faster application reactions,
- at the cost of slightly lower throughput.
+ This option reduces the latency of the kernel by allowing
+ processes in higher scheduling policy classes to preempt ones
+ lower down.

- This allows reaction to interactive events by allowing a
- low priority process to voluntarily preempt itself even if it
- is in kernel mode executing a system call. This allows
- applications to run more 'smoothly' even when the system is
- under load.
-
- Select this if you are building a kernel for a desktop system.
+ Higher priority processes in the same scheduling policy class
+ do not preempt others in the same class.

config PREEMPT
bool "Preemptible Kernel (Low-Latency Desktop)"
diff --git a/kernel/sched/core.c b/kernel/sched/core.c
index 2a50a64255c6..3fa78e8afb7d 100644
--- a/kernel/sched/core.c
+++ b/kernel/sched/core.c
@@ -256,7 +256,7 @@ void sched_core_dequeue(struct rq *rq, struct task_struct *p, int flags)
*/
if (!(flags & DEQUEUE_SAVE) && rq->nr_running == 1 &&
rq->core->core_forceidle_count && rq->curr == rq->idle)
- resched_curr(rq);
+ resched_curr(rq, false);
}

static int sched_task_is_throttled(struct task_struct *p, int cpu)
@@ -1074,9 +1074,12 @@ void __resched_curr(struct rq *rq, resched_t rs)
*
* - in userspace: run to completion semantics are only for kernel tasks
*
- * Otherwise (regardless of priority), run to completion.
+ * - running under voluntary preemption (sched_feat(PREEMPT_PRIORITY))
+ * and a task from a sched_class above wants the CPU
+ *
+ * Otherwise, run to completion.
*/
-void resched_curr(struct rq *rq)
+void resched_curr(struct rq *rq, bool above)
{
resched_t rs = RESCHED_lazy;
int context;
@@ -1112,6 +1115,9 @@ void resched_curr(struct rq *rq)
goto resched;
}

+ if (sched_feat(PREEMPT_PRIORITY) && above)
+ rs = RESCHED_eager;
+
resched:
__resched_curr(rq, rs);
}
@@ -1123,7 +1129,7 @@ void resched_cpu(int cpu)

raw_spin_rq_lock_irqsave(rq, flags);
if (cpu_online(cpu) || cpu == smp_processor_id())
- resched_curr(rq);
+ resched_curr(rq, true);
raw_spin_rq_unlock_irqrestore(rq, flags);
}

@@ -2277,7 +2283,7 @@ void check_preempt_curr(struct rq *rq, struct task_struct *p, int flags)
if (p->sched_class == rq->curr->sched_class)
rq->curr->sched_class->check_preempt_curr(rq, p, flags);
else if (sched_class_above(p->sched_class, rq->curr->sched_class))
- resched_curr(rq);
+ resched_curr(rq, true);

/*
* A queue event has occurred, and we're going to schedule. In
@@ -2764,7 +2770,7 @@ int push_cpu_stop(void *arg)
deactivate_task(rq, p, 0);
set_task_cpu(p, lowest_rq->cpu);
activate_task(lowest_rq, p, 0);
- resched_curr(lowest_rq);
+ resched_curr(lowest_rq, true);
}

double_unlock_balance(rq, lowest_rq);
@@ -3999,7 +4005,7 @@ void wake_up_if_idle(int cpu)
if (is_idle_task(rcu_dereference(rq->curr))) {
guard(rq_lock_irqsave)(rq);
if (is_idle_task(rq->curr))
- resched_curr(rq);
+ resched_curr(rq, true);
}
}

@@ -6333,7 +6339,7 @@ pick_next_task(struct rq *rq, struct task_struct *prev, struct rq_flags *rf)
continue;
}

- resched_curr(rq_i);
+ resched_curr(rq_i, false);
}

out_set_next:
@@ -6388,7 +6394,7 @@ static bool try_steal_cookie(int this, int that)
set_task_cpu(p, this);
activate_task(dst, p, 0);

- resched_curr(dst);
+ resched_curr(dst, false);

success = true;
break;
@@ -8743,7 +8749,7 @@ int __sched yield_to(struct task_struct *p, bool preempt)
* fairness.
*/
if (preempt && rq != p_rq)
- resched_curr(p_rq);
+ resched_curr(p_rq, true);
}

out_unlock:
@@ -10300,7 +10306,7 @@ void sched_move_task(struct task_struct *tsk)
* throttled one but it's still the running task. Trigger a
* resched to make sure that task can still run.
*/
- resched_curr(rq);
+ resched_curr(rq, true);
}

unlock:
diff --git a/kernel/sched/core_sched.c b/kernel/sched/core_sched.c
index a57fd8f27498..32f234f2a210 100644
--- a/kernel/sched/core_sched.c
+++ b/kernel/sched/core_sched.c
@@ -89,7 +89,7 @@ static unsigned long sched_core_update_cookie(struct task_struct *p,
* next scheduling edge, rather than always forcing a reschedule here.
*/
if (task_on_cpu(rq, p))
- resched_curr(rq);
+ resched_curr(rq, false);

task_rq_unlock(rq, p, &rf);

diff --git a/kernel/sched/deadline.c b/kernel/sched/deadline.c
index e6815c3bd2f0..ecb47b5e9588 100644
--- a/kernel/sched/deadline.c
+++ b/kernel/sched/deadline.c
@@ -1177,7 +1177,7 @@ static enum hrtimer_restart dl_task_timer(struct hrtimer *timer)
if (dl_task(rq->curr))
check_preempt_curr_dl(rq, p, 0);
else
- resched_curr(rq);
+ resched_curr(rq, false);

#ifdef CONFIG_SMP
/*
@@ -1367,7 +1367,7 @@ static void update_curr_dl(struct rq *rq)
enqueue_task_dl(rq, curr, ENQUEUE_REPLENISH);

if (!is_leftmost(curr, &rq->dl))
- resched_curr(rq);
+ resched_curr(rq, false);
}

/*
@@ -1914,7 +1914,7 @@ static void check_preempt_equal_dl(struct rq *rq, struct task_struct *p)
cpudl_find(&rq->rd->cpudl, p, NULL))
return;

- resched_curr(rq);
+ resched_curr(rq, false);
}

static int balance_dl(struct rq *rq, struct task_struct *p, struct rq_flags *rf)
@@ -1943,7 +1943,7 @@ static void check_preempt_curr_dl(struct rq *rq, struct task_struct *p,
int flags)
{
if (dl_entity_preempt(&p->dl, &rq->curr->dl)) {
- resched_curr(rq);
+ resched_curr(rq, false);
return;
}

@@ -2307,7 +2307,7 @@ static int push_dl_task(struct rq *rq)
if (dl_task(rq->curr) &&
dl_time_before(next_task->dl.deadline, rq->curr->dl.deadline) &&
rq->curr->nr_cpus_allowed > 1) {
- resched_curr(rq);
+ resched_curr(rq, false);
return 0;
}

@@ -2353,7 +2353,7 @@ static int push_dl_task(struct rq *rq)
activate_task(later_rq, next_task, 0);
ret = 1;

- resched_curr(later_rq);
+ resched_curr(later_rq, false);

double_unlock_balance(rq, later_rq);

@@ -2457,7 +2457,7 @@ static void pull_dl_task(struct rq *this_rq)
}

if (resched)
- resched_curr(this_rq);
+ resched_curr(this_rq, false);
}

/*
@@ -2654,7 +2654,7 @@ static void switched_to_dl(struct rq *rq, struct task_struct *p)
if (dl_task(rq->curr))
check_preempt_curr_dl(rq, p, 0);
else
- resched_curr(rq);
+ resched_curr(rq, false);
} else {
update_dl_rq_load_avg(rq_clock_pelt(rq), rq, 0);
}
@@ -2687,7 +2687,7 @@ static void prio_changed_dl(struct rq *rq, struct task_struct *p,
* runqueue.
*/
if (dl_time_before(rq->dl.earliest_dl.curr, p->dl.deadline))
- resched_curr(rq);
+ resched_curr(rq, false);
} else {
/*
* Current may not be deadline in case p was throttled but we
@@ -2697,14 +2697,14 @@ static void prio_changed_dl(struct rq *rq, struct task_struct *p,
*/
if (!dl_task(rq->curr) ||
dl_time_before(p->dl.deadline, rq->curr->dl.deadline))
- resched_curr(rq);
+ resched_curr(rq, false);
}
#else
/*
* We don't know if p has a earlier or later deadline, so let's blindly
* set a (maybe not needed) rescheduling point.
*/
- resched_curr(rq);
+ resched_curr(rq, false);
#endif
}

diff --git a/kernel/sched/fair.c b/kernel/sched/fair.c
index fe7e5e9b2207..448fe36e7bbb 100644
--- a/kernel/sched/fair.c
+++ b/kernel/sched/fair.c
@@ -1046,7 +1046,7 @@ static void update_deadline(struct cfs_rq *cfs_rq,
if (tick && test_tsk_thread_flag(rq->curr, TIF_NEED_RESCHED_LAZY))
__resched_curr(rq, RESCHED_eager);
else
- resched_curr(rq);
+ resched_curr(rq, false);

clear_buddies(cfs_rq, se);
}
@@ -5337,7 +5337,7 @@ entity_tick(struct cfs_rq *cfs_rq, struct sched_entity *curr, int queued)
* validating it and just reschedule.
*/
if (queued) {
- resched_curr(rq_of(cfs_rq));
+ resched_curr(rq_of(cfs_rq), false);
return;
}
/*
@@ -5483,7 +5483,7 @@ static void __account_cfs_rq_runtime(struct cfs_rq *cfs_rq, u64 delta_exec)
* hierarchy can be throttled
*/
if (!assign_cfs_rq_runtime(cfs_rq) && likely(cfs_rq->curr))
- resched_curr(rq_of(cfs_rq));
+ resched_curr(rq_of(cfs_rq), false);
}

static __always_inline
@@ -5743,7 +5743,7 @@ void unthrottle_cfs_rq(struct cfs_rq *cfs_rq)

/* Determine whether we need to wake up potentially idle CPU: */
if (rq->curr == rq->idle && rq->cfs.nr_running)
- resched_curr(rq);
+ resched_curr(rq, false);
}

#ifdef CONFIG_SMP
@@ -6448,7 +6448,7 @@ static void hrtick_start_fair(struct rq *rq, struct task_struct *p)

if (delta < 0) {
if (task_current(rq, p))
- resched_curr(rq);
+ resched_curr(rq, false);
return;
}
hrtick_start(rq, delta);
@@ -8143,7 +8143,7 @@ static void check_preempt_wakeup(struct rq *rq, struct task_struct *p, int wake_
return;

preempt:
- resched_curr(rq);
+ resched_curr(rq, false);
}

#ifdef CONFIG_SMP
@@ -12294,7 +12294,7 @@ static inline void task_tick_core(struct rq *rq, struct task_struct *curr)
*/
if (rq->core->core_forceidle_count && rq->cfs.nr_running == 1 &&
__entity_slice_used(&curr->se, MIN_NR_TASKS_DURING_FORCEIDLE))
- resched_curr(rq);
+ resched_curr(rq, false);
}

/*
@@ -12459,7 +12459,7 @@ prio_changed_fair(struct rq *rq, struct task_struct *p, int oldprio)
*/
if (task_current(rq, p)) {
if (p->prio > oldprio)
- resched_curr(rq);
+ resched_curr(rq, false);
} else
check_preempt_curr(rq, p, 0);
}
@@ -12561,7 +12561,7 @@ static void switched_to_fair(struct rq *rq, struct task_struct *p)
* if we can still preempt the current task.
*/
if (task_current(rq, p))
- resched_curr(rq);
+ resched_curr(rq, false);
else
check_preempt_curr(rq, p, 0);
}
diff --git a/kernel/sched/features.h b/kernel/sched/features.h
index 9b4c2967b2b7..9bf30732b03f 100644
--- a/kernel/sched/features.h
+++ b/kernel/sched/features.h
@@ -92,6 +92,11 @@ SCHED_FEAT(HZ_BW, true)

#if defined(CONFIG_PREEMPT)
SCHED_FEAT(FORCE_PREEMPT, true)
+SCHED_FEAT(PREEMPT_PRIORITY, true)
+#elif defined(CONFIG_PREEMPT_VOLUNTARY)
+SCHED_FEAT(FORCE_PREEMPT, false)
+SCHED_FEAT(PREEMPT_PRIORITY, true)
#else
SCHED_FEAT(FORCE_PREEMPT, false)
+SCHED_FEAT(PREEMPT_PRIORITY, false)
#endif
diff --git a/kernel/sched/idle.c b/kernel/sched/idle.c
index eacd204e2879..3ef039869be9 100644
--- a/kernel/sched/idle.c
+++ b/kernel/sched/idle.c
@@ -403,7 +403,7 @@ balance_idle(struct rq *rq, struct task_struct *prev, struct rq_flags *rf)
*/
static void check_preempt_curr_idle(struct rq *rq, struct task_struct *p, int flags)
{
- resched_curr(rq);
+ resched_curr(rq, true);
}

static void put_prev_task_idle(struct rq *rq, struct task_struct *prev)
diff --git a/kernel/sched/rt.c b/kernel/sched/rt.c
index 5fdb93f1b87e..8d87e42d30d8 100644
--- a/kernel/sched/rt.c
+++ b/kernel/sched/rt.c
@@ -589,7 +589,7 @@ static void sched_rt_rq_enqueue(struct rt_rq *rt_rq)
enqueue_rt_entity(rt_se, 0);

if (rt_rq->highest_prio.curr < curr->prio)
- resched_curr(rq);
+ resched_curr(rq, false);
}
}

@@ -682,7 +682,7 @@ static inline void sched_rt_rq_enqueue(struct rt_rq *rt_rq)
return;

enqueue_top_rt_rq(rt_rq);
- resched_curr(rq);
+ resched_curr(rq, false);
}

static inline void sched_rt_rq_dequeue(struct rt_rq *rt_rq)
@@ -1076,7 +1076,7 @@ static void update_curr_rt(struct rq *rq)
rt_rq->rt_time += delta_exec;
exceeded = sched_rt_runtime_exceeded(rt_rq);
if (exceeded)
- resched_curr(rq);
+ resched_curr(rq, false);
raw_spin_unlock(&rt_rq->rt_runtime_lock);
if (exceeded)
do_start_rt_bandwidth(sched_rt_bandwidth(rt_rq));
@@ -1691,7 +1691,7 @@ static void check_preempt_equal_prio(struct rq *rq, struct task_struct *p)
* to try and push the current task away:
*/
requeue_task_rt(rq, p, 1);
- resched_curr(rq);
+ resched_curr(rq, false);
}

static int balance_rt(struct rq *rq, struct task_struct *p, struct rq_flags *rf)
@@ -1718,7 +1718,7 @@ static int balance_rt(struct rq *rq, struct task_struct *p, struct rq_flags *rf)
static void check_preempt_curr_rt(struct rq *rq, struct task_struct *p, int flags)
{
if (p->prio < rq->curr->prio) {
- resched_curr(rq);
+ resched_curr(rq, false);
return;
}

@@ -2074,7 +2074,7 @@ static int push_rt_task(struct rq *rq, bool pull)
* just reschedule current.
*/
if (unlikely(next_task->prio < rq->curr->prio)) {
- resched_curr(rq);
+ resched_curr(rq, false);
return 0;
}

@@ -2162,7 +2162,7 @@ static int push_rt_task(struct rq *rq, bool pull)
deactivate_task(rq, next_task, 0);
set_task_cpu(next_task, lowest_rq->cpu);
activate_task(lowest_rq, next_task, 0);
- resched_curr(lowest_rq);
+ resched_curr(lowest_rq, false);
ret = 1;

double_unlock_balance(rq, lowest_rq);
@@ -2456,7 +2456,7 @@ static void pull_rt_task(struct rq *this_rq)
}

if (resched)
- resched_curr(this_rq);
+ resched_curr(this_rq, false);
}

/*
@@ -2555,7 +2555,7 @@ static void switched_to_rt(struct rq *rq, struct task_struct *p)
rt_queue_push_tasks(rq);
#endif /* CONFIG_SMP */
if (p->prio < rq->curr->prio && cpu_online(cpu_of(rq)))
- resched_curr(rq);
+ resched_curr(rq, false);
}
}

@@ -2583,11 +2583,11 @@ prio_changed_rt(struct rq *rq, struct task_struct *p, int oldprio)
* then reschedule.
*/
if (p->prio > rq->rt.highest_prio.curr)
- resched_curr(rq);
+ resched_curr(rq, false);
#else
/* For UP simply resched on drop of prio */
if (oldprio < p->prio)
- resched_curr(rq);
+ resched_curr(rq, false);
#endif /* CONFIG_SMP */
} else {
/*
@@ -2596,7 +2596,7 @@ prio_changed_rt(struct rq *rq, struct task_struct *p, int oldprio)
* then reschedule.
*/
if (p->prio < rq->curr->prio)
- resched_curr(rq);
+ resched_curr(rq, false);
}
}

@@ -2668,7 +2668,7 @@ static void task_tick_rt(struct rq *rq, struct task_struct *p, int queued)
if (test_tsk_thread_flag(rq->curr, TIF_NEED_RESCHED_LAZY))
__resched_curr(rq, RESCHED_eager);
else
- resched_curr(rq);
+ resched_curr(rq, false);

return;
}
diff --git a/kernel/sched/sched.h b/kernel/sched/sched.h
index e29a8897f573..9a745dd7482f 100644
--- a/kernel/sched/sched.h
+++ b/kernel/sched/sched.h
@@ -2435,7 +2435,7 @@ extern void init_sched_fair_class(void);
extern void reweight_task(struct task_struct *p, int prio);

extern void __resched_curr(struct rq *rq, resched_t rs);
-extern void resched_curr(struct rq *rq);
+extern void resched_curr(struct rq *rq, bool above);
extern void resched_cpu(int cpu);

extern struct rt_bandwidth def_rt_bandwidth;
--
2.31.1