[PATCH RFC 1/1] x86/paravirt: introduce param to disable pv sched_clock

From: Dongli Zhang
Date: Wed Oct 18 2023 - 18:17:53 EST


As mentioned in the Linux kernel documentation, "sched_clock() is used for
scheduling and timestamping". While there is a default native
implementation, many paravirtualizations provide their own implementations.

KVM uses kvm_sched_clock_read() as its sched_clock, and there is no way to
disable only KVM's sched_clock: the "no-kvmclock" parameter disables all
paravirtualized kvmclock features.

94 static inline void kvm_sched_clock_init(bool stable)
95 {
96 if (!stable)
97 clear_sched_clock_stable();
98 kvm_sched_clock_offset = kvm_clock_read();
99 paravirt_set_sched_clock(kvm_sched_clock_read);
100
101 pr_info("kvm-clock: using sched offset of %llu cycles",
102 kvm_sched_clock_offset);
103
104 BUILD_BUG_ON(sizeof(kvm_sched_clock_offset) >
105 sizeof(((struct pvclock_vcpu_time_info *)NULL)->system_time));
106 }

There is a known issue that kvmclock may drift during vCPU hotplug [1].
Although a temporary fix is available [2], we may still need a way to
disable pv sched_clock. Nowadays, the TSC is more stable and has less
performance overhead than kvmclock.

This patch proposes introducing a global param, "no_pv_sched_clock", to
disable pv sched_clock for all paravirtualizations.

Please suggest and comment if other options are better:

1. Global param (this RFC patch).

2. A kvmclock-specific param (e.g., like "no-vmw-sched-clock" for VMware).

Actually, I prefer this 2nd option.

3. Enforce native sched_clock only when TSC is invariant (hyper-v method).

4. Remove and cleanup pv sched_clock, and always use pv_sched_clock() for
all (suggested by Peter Zijlstra in [3]). Some paravirtualizations may
want to keep the pv sched_clock.

Introducing a param may be easier to backport to older kernel versions.

References:
[1] https://lore.kernel.org/all/20230926230649.67852-1-dongli.zhang@xxxxxxxxxx/
[2] https://lore.kernel.org/all/20231018195638.1898375-1-seanjc@xxxxxxxxxx/
[3] https://lore.kernel.org/all/20231002211651.GA3774@xxxxxxxxxxxxxxxxxxxxxxxxxxxxxxx/

Thank you very much for the suggestion!

Signed-off-by: Dongli Zhang <dongli.zhang@xxxxxxxxxx>
---
arch/x86/include/asm/paravirt.h | 2 +-
arch/x86/kernel/kvmclock.c | 12 +++++++-----
arch/x86/kernel/paravirt.c | 18 +++++++++++++++++-
3 files changed, 25 insertions(+), 7 deletions(-)

diff --git a/arch/x86/include/asm/paravirt.h b/arch/x86/include/asm/paravirt.h
index 6c8ff12140ae..f36edf608b6b 100644
--- a/arch/x86/include/asm/paravirt.h
+++ b/arch/x86/include/asm/paravirt.h
@@ -24,7 +24,7 @@ u64 dummy_sched_clock(void);
DECLARE_STATIC_CALL(pv_steal_clock, dummy_steal_clock);
DECLARE_STATIC_CALL(pv_sched_clock, dummy_sched_clock);

-void paravirt_set_sched_clock(u64 (*func)(void));
+int paravirt_set_sched_clock(u64 (*func)(void));

static __always_inline u64 paravirt_sched_clock(void)
{
diff --git a/arch/x86/kernel/kvmclock.c b/arch/x86/kernel/kvmclock.c
index fb8f52149be9..0b8bf5677d44 100644
--- a/arch/x86/kernel/kvmclock.c
+++ b/arch/x86/kernel/kvmclock.c
@@ -93,13 +93,15 @@ static noinstr u64 kvm_sched_clock_read(void)

static inline void kvm_sched_clock_init(bool stable)
{
- if (!stable)
- clear_sched_clock_stable();
kvm_sched_clock_offset = kvm_clock_read();
- paravirt_set_sched_clock(kvm_sched_clock_read);

- pr_info("kvm-clock: using sched offset of %llu cycles",
- kvm_sched_clock_offset);
+ if (!paravirt_set_sched_clock(kvm_sched_clock_read)) {
+ if (!stable)
+ clear_sched_clock_stable();
+
+ pr_info("kvm-clock: using sched offset of %llu cycles",
+ kvm_sched_clock_offset);
+ }

BUILD_BUG_ON(sizeof(kvm_sched_clock_offset) >
sizeof(((struct pvclock_vcpu_time_info *)NULL)->system_time));
diff --git a/arch/x86/kernel/paravirt.c b/arch/x86/kernel/paravirt.c
index 97f1436c1a20..2cfef94317b0 100644
--- a/arch/x86/kernel/paravirt.c
+++ b/arch/x86/kernel/paravirt.c
@@ -118,9 +118,25 @@ static u64 native_steal_clock(int cpu)
DEFINE_STATIC_CALL(pv_steal_clock, native_steal_clock);
DEFINE_STATIC_CALL(pv_sched_clock, native_sched_clock);

-void paravirt_set_sched_clock(u64 (*func)(void))
+static bool no_pv_sched_clock;
+
+static int __init parse_no_pv_sched_clock(char *arg)
+{
+ no_pv_sched_clock = true;
+ return 0;
+}
+early_param("no_pv_sched_clock", parse_no_pv_sched_clock);
+
+int paravirt_set_sched_clock(u64 (*func)(void))
{
+ if (no_pv_sched_clock) {
+ pr_info("sched_clock: not configurable\n");
+ return -EPERM;
+ }
+
static_call_update(pv_sched_clock, func);
+
+ return 0;
}

/* These are in entry.S */
--
2.34.1