[PATCH 01/30] preempt: introduce CONFIG_PREEMPT_AUTO

From: Ankur Arora
Date: Tue Feb 13 2024 - 00:59:03 EST


PREEMPT_AUTO adds a new scheduling model which, like PREEMPT_DYNAMIC,
allows dynamic switching between a none/voluntary/full preemption
model. However, unlike PREEMPT_DYNAMIC, it doesn't use explicit
preemption points for the voluntary models.

It works by depending on CONFIG_PREEMPTION (and thus PREEMPT_COUNT),
allowing the scheduler to always know when it is safe to preempt
for all three preemption models.

In addition, it uses an additional need-resched bit
(TIF_NEED_RESCHED_LAZY) which, with TIF_NEED_RESCHED allows the
scheduler to express two kinds of rescheduling intent: schedule at
the earliest opportunity (the usual TIF_NEED_RESCHED semantics), or
express a need for rescheduling while allowing the task on the
runqueue to run to timeslice completion (TIF_NEED_RESCHED_LAZY).

Based on the preemption model in use, the scheduler chooses
need-resched in the following manner:

TIF_NEED_RESCHED TIF_NEED_RESCHED_LAZY

none never always [*]
voluntary higher sched class other tasks [*]
full always never

[*] when preempting idle, or for kernel tasks that are 'urgent' in
some way (ex. resched_cpu() used as an RCU hammer), we use
TIF_NEED_RESCHED.

As mentioned above, the other part is when preemption happens -- when
are the need-resched flags checked:

exit-to-user ret-to-kernel preempt_count()
NEED_RESCHED_LAZY Y N N
NEED_RESCHED Y Y Y

Exposed under CONFIG_EXPERT for now.

Cc: Peter Ziljstra <peterz@xxxxxxxxxxxxx>
Cc: Jonathan Corbet <corbet@xxxxxxx>
Originally-by: Thomas Gleixner <tglx@xxxxxxxxxxxxx>
Link: https://lore.kernel.org/lkml/87jzshhexi.ffs@tglx/
Signed-off-by: Ankur Arora <ankur.a.arora@xxxxxxxxxx>
---
.../admin-guide/kernel-parameters.txt | 1 +
include/linux/thread_info.h | 8 ++++
init/Makefile | 1 +
kernel/Kconfig.preempt | 37 +++++++++++++++++--
4 files changed, 44 insertions(+), 3 deletions(-)

diff --git a/Documentation/admin-guide/kernel-parameters.txt b/Documentation/admin-guide/kernel-parameters.txt
index 31b3a25680d0..5d2bd21f98e1 100644
--- a/Documentation/admin-guide/kernel-parameters.txt
+++ b/Documentation/admin-guide/kernel-parameters.txt
@@ -4662,6 +4662,7 @@

preempt= [KNL]
Select preemption mode if you have CONFIG_PREEMPT_DYNAMIC
+ or CONFIG_PREEMPT_AUTO.
none - Limited to cond_resched() calls
voluntary - Limited to cond_resched() and might_sleep() calls
full - Any section that isn't explicitly preempt disabled
diff --git a/include/linux/thread_info.h b/include/linux/thread_info.h
index 9ea0b28068f4..7b1d9185aac6 100644
--- a/include/linux/thread_info.h
+++ b/include/linux/thread_info.h
@@ -59,6 +59,14 @@ enum syscall_work_bit {

#include <asm/thread_info.h>

+/*
+ * Fall back to the default behaviour if we don't have CONFIG_PREEMPT_AUTO.
+ */
+#ifndef CONFIG_PREEMPT_AUTO
+#define TIF_NEED_RESCHED_LAZY TIF_NEED_RESCHED
+#define _TIF_NEED_RESCHED_LAZY _TIF_NEED_RESCHED
+#endif
+
#ifdef __KERNEL__

#ifndef arch_set_restart_data
diff --git a/init/Makefile b/init/Makefile
index cbac576c57d6..da1dba3116dc 100644
--- a/init/Makefile
+++ b/init/Makefile
@@ -27,6 +27,7 @@ smp-flag-$(CONFIG_SMP) := SMP
preempt-flag-$(CONFIG_PREEMPT_BUILD) := PREEMPT
preempt-flag-$(CONFIG_PREEMPT_DYNAMIC) := PREEMPT_DYNAMIC
preempt-flag-$(CONFIG_PREEMPT_RT) := PREEMPT_RT
+preempt-flag-$(CONFIG_PREEMPT_AUTO) := PREEMPT_AUTO

build-version = $(or $(KBUILD_BUILD_VERSION), $(build-version-auto))
build-timestamp = $(or $(KBUILD_BUILD_TIMESTAMP), $(build-timestamp-auto))
diff --git a/kernel/Kconfig.preempt b/kernel/Kconfig.preempt
index c2f1fd95a821..fe83040ad755 100644
--- a/kernel/Kconfig.preempt
+++ b/kernel/Kconfig.preempt
@@ -11,13 +11,17 @@ config PREEMPT_BUILD
select PREEMPTION
select UNINLINE_SPIN_UNLOCK if !ARCH_INLINE_SPIN_UNLOCK

+config HAVE_PREEMPT_AUTO
+ bool
+
choice
prompt "Preemption Model"
default PREEMPT_NONE

config PREEMPT_NONE
bool "No Forced Preemption (Server)"
- select PREEMPT_NONE_BUILD if !PREEMPT_DYNAMIC
+ select PREEMPT_NONE_BUILD if (!PREEMPT_DYNAMIC && !PREEMPT_AUTO)
+
help
This is the traditional Linux preemption model, geared towards
throughput. It will still provide good latencies most of the
@@ -32,7 +36,7 @@ config PREEMPT_NONE
config PREEMPT_VOLUNTARY
bool "Voluntary Kernel Preemption (Desktop)"
depends on !ARCH_NO_PREEMPT
- select PREEMPT_VOLUNTARY_BUILD if !PREEMPT_DYNAMIC
+ select PREEMPT_VOLUNTARY_BUILD if (!PREEMPT_DYNAMIC && !PREEMPT_AUTO)
help
This option reduces the latency of the kernel by adding more
"explicit preemption points" to the kernel code. These new
@@ -95,7 +99,7 @@ config PREEMPTION

config PREEMPT_DYNAMIC
bool "Preemption behaviour defined on boot"
- depends on HAVE_PREEMPT_DYNAMIC && !PREEMPT_RT
+ depends on HAVE_PREEMPT_DYNAMIC && !PREEMPT_RT && !PREEMPT_AUTO
select JUMP_LABEL if HAVE_PREEMPT_DYNAMIC_KEY
select PREEMPT_BUILD
default y if HAVE_PREEMPT_DYNAMIC_CALL
@@ -115,6 +119,33 @@ config PREEMPT_DYNAMIC
Interesting if you want the same pre-built kernel should be used for
both Server and Desktop workloads.

+config PREEMPT_AUTO
+ bool "Scheduler controlled preemption model"
+ depends on EXPERT && HAVE_PREEMPT_AUTO && !ARCH_NO_PREEMPT
+ select PREEMPT_BUILD
+ help
+ This option allows to define the preemption model on the kernel
+ command line parameter and thus override the default preemption
+ model selected during compile time.
+
+ However, note that the compile time choice of preemption model
+ might impact other kernel options like the specific RCU model.
+
+ This feature makes the latency of the kernel configurable by
+ allowing the scheduler to choose when to preempt based on
+ the preemption policy in effect. It does this without needing
+ voluntary preemption points.
+
+ With PREEMPT_NONE: the scheduler allows a task (executing in
+ user or kernel context) to run to completion, at least until
+ its current tick expires.
+
+ With PREEMPT_VOLUNTARY: similar to PREEMPT_NONE, but the scheduler
+ will also preempt for higher priority class of processes but not
+ lower.
+
+ With PREEMPT: the scheduler preempts at the earliest opportunity.
+
config SCHED_CORE
bool "Core Scheduling for SMT"
depends on SCHED_SMT
--
2.31.1