[PATCH RFC 1/1] KVM: x86: add param to update master clock periodically

From: Dongli Zhang
Date: Tue Sep 26 2023 - 19:50:41 EST


This is to minimize the kvmclock drift during CPU hotplug (or when the
master clock and pvclock_vcpu_time_info are updated). The drift is
because kvmclock and raw monotonic (tsc) use different
equation/mult/shift to calculate that how many nanoseconds (given the tsc
as input) has passed.

The calculation of the kvmclock is based on the pvclock_vcpu_time_info
provided by the host side.

struct pvclock_vcpu_time_info {
u32 version;
u32 pad0;
u64 tsc_timestamp; --> by host raw monotonic
u64 system_time; --> by host raw monotonic
u32 tsc_to_system_mul; --> by host KVM
s8 tsc_shift; --> by host KVM
u8 flags;
u8 pad[2];
} __attribute__((__packed__));

To calculate the current guest kvmclock:

1. Obtain the tsc = rdtsc() of guest.

2. If shift < 0:
tmp = tsc >> tsc_shift
if shift > 0:
tmp = tsc << tsc_shift

3. The kvmclock value will be: (tmp * tsc_to_system_mul) >> 32

Therefore, the current kvmclock will be either:

(rdtsc() >> tsc_shift) * tsc_to_system_mul >> 32

... or ...

(rdtsc() << tsc_shift) * tsc_to_system_mul >> 32

The 'tsc_to_system_mul' and 'tsc_shift' are calculated by the host KVM.

When the master clock is actively used, the 'tsc_timestamp' and
'system_time' are derived from the host raw monotonic time, which is
calculated based on the 'mult' and 'shift' of clocksource_tsc:

elapsed_time = (tsc * mult) >> shift

Since kvmclock and raw monotonic (clocksource_tsc) use different
equation/mult/shift to convert the tsc to nanosecond, there may be clock
drift issue during CPU hotplug (when the master clock is updated).

1. The guest boots and all vcpus have the same 'pvclock_vcpu_time_info'
(suppose the master clock is used).

2. Since the master clock is never updated, the periodic kvmclock_sync_work
does not update the values in 'pvclock_vcpu_time_info'.

3. Suppose a very long period has passed (e.g., 30-day).

4. The user adds another vcpu. Both master clock and
'pvclock_vcpu_time_info' are updated, based on the raw monotonic.

(Ideally, we expect the update is based on 'tsc_to_system_mul' and
'tsc_shift' from kvmclock).


Because kvmclock and raw monotonic (clocksource_tsc) use different
equation/mult/shift to convert the tsc to nanosecond, there will be drift
between:

(1) kvmclock based on current rdtsc and old 'pvclock_vcpu_time_info'
(2) kvmclock based on current rdtsc and new 'pvclock_vcpu_time_info'

According to the test, there is a drift of 4502145ns between (1) and (2)
after about 40 hours. The longer the time, the large the drift.

This is to add a module param to allow the user to configure for how often
to refresh the master clock, in order to reduce the kvmclock drift based on
user requirement (e.g., every 5-min to every day). The more often that the
master clock is refreshed, the smaller the kvmclock drift during the vcpu
hotplug.

Cc: Joe Jin <joe.jin@xxxxxxxxxx>
Signed-off-by: Dongli Zhang <dongli.zhang@xxxxxxxxxx>
---
Other options are to update the masterclock in:
- kvmclock_sync_work, or
- pvclock_gtod_notify()

arch/x86/include/asm/kvm_host.h | 1 +
arch/x86/kvm/x86.c | 34 +++++++++++++++++++++++++++++++++
2 files changed, 35 insertions(+)

diff --git a/arch/x86/include/asm/kvm_host.h b/arch/x86/include/asm/kvm_host.h
index 17715cb8731d..57409dce5d73 100644
--- a/arch/x86/include/asm/kvm_host.h
+++ b/arch/x86/include/asm/kvm_host.h
@@ -1331,6 +1331,7 @@ struct kvm_arch {
u64 master_cycle_now;
struct delayed_work kvmclock_update_work;
struct delayed_work kvmclock_sync_work;
+ struct delayed_work masterclock_sync_work;

struct kvm_xen_hvm_config xen_hvm_config;

diff --git a/arch/x86/kvm/x86.c b/arch/x86/kvm/x86.c
index 9f18b06bbda6..0b71dc3785eb 100644
--- a/arch/x86/kvm/x86.c
+++ b/arch/x86/kvm/x86.c
@@ -157,6 +157,9 @@ module_param(min_timer_period_us, uint, S_IRUGO | S_IWUSR);
static bool __read_mostly kvmclock_periodic_sync = true;
module_param(kvmclock_periodic_sync, bool, S_IRUGO);

+unsigned int __read_mostly masterclock_sync_period;
+module_param(masterclock_sync_period, uint, 0444);
+
/* tsc tolerance in parts per million - default to 1/2 of the NTP threshold */
static u32 __read_mostly tsc_tolerance_ppm = 250;
module_param(tsc_tolerance_ppm, uint, S_IRUGO | S_IWUSR);
@@ -3298,6 +3301,31 @@ static void kvmclock_sync_fn(struct work_struct *work)
KVMCLOCK_SYNC_PERIOD);
}

+static void masterclock_sync_fn(struct work_struct *work)
+{
+ unsigned long i;
+ struct delayed_work *dwork = to_delayed_work(work);
+ struct kvm_arch *ka = container_of(dwork, struct kvm_arch,
+ masterclock_sync_work);
+ struct kvm *kvm = container_of(ka, struct kvm, arch);
+ struct kvm_vcpu *vcpu;
+
+ if (!masterclock_sync_period)
+ return;
+
+ kvm_for_each_vcpu(i, vcpu, kvm) {
+ /*
+ * It is not required to kick the vcpu because it is not
+ * expected to update the master clock immediately.
+ */
+ kvm_make_request(KVM_REQ_MASTERCLOCK_UPDATE, vcpu);
+ }
+
+ schedule_delayed_work(&ka->masterclock_sync_work,
+ masterclock_sync_period * HZ);
+}
+
+
/* These helpers are safe iff @msr is known to be an MCx bank MSR. */
static bool is_mci_control_msr(u32 msr)
{
@@ -11970,6 +11998,10 @@ void kvm_arch_vcpu_postcreate(struct kvm_vcpu *vcpu)
if (kvmclock_periodic_sync && vcpu->vcpu_idx == 0)
schedule_delayed_work(&kvm->arch.kvmclock_sync_work,
KVMCLOCK_SYNC_PERIOD);
+
+ if (masterclock_sync_period && vcpu->vcpu_idx == 0)
+ schedule_delayed_work(&kvm->arch.masterclock_sync_work,
+ masterclock_sync_period * HZ);
}

void kvm_arch_vcpu_destroy(struct kvm_vcpu *vcpu)
@@ -12344,6 +12376,7 @@ int kvm_arch_init_vm(struct kvm *kvm, unsigned long type)

INIT_DELAYED_WORK(&kvm->arch.kvmclock_update_work, kvmclock_update_fn);
INIT_DELAYED_WORK(&kvm->arch.kvmclock_sync_work, kvmclock_sync_fn);
+ INIT_DELAYED_WORK(&kvm->arch.masterclock_sync_work, masterclock_sync_fn);

kvm_apicv_init(kvm);
kvm_hv_init_vm(kvm);
@@ -12383,6 +12416,7 @@ static void kvm_unload_vcpu_mmus(struct kvm *kvm)

void kvm_arch_sync_events(struct kvm *kvm)
{
+ cancel_delayed_work_sync(&kvm->arch.masterclock_sync_work);
cancel_delayed_work_sync(&kvm->arch.kvmclock_sync_work);
cancel_delayed_work_sync(&kvm->arch.kvmclock_update_work);
kvm_free_pit(kvm);
--
2.34.1