Re: [PATCH v2 07/10] KVM: X86: Move write_l1_tsc_offset() logic to common code and rename it

From: Stamatis, Ilias
Date: Wed May 19 2021 - 07:45:29 EST


On Wed, 2021-05-19 at 00:05 +0000, Sean Christopherson wrote:
> On Wed, May 12, 2021, Ilias Stamatis wrote:
> > The write_l1_tsc_offset() callback has a misleading name. It does not
> > set L1's TSC offset, it rather updates the current TSC offset which
> > might be different if a nested guest is executing. Additionally, both
> > the vmx and svm implementations use the same logic for calculating the
> > current TSC before writing it to hardware.
>
> I don't disagree, but the current name as the advantage of clarifying (well,
> hinting) that the offset is L1's offset. That hint is lost in this refactoring.
> Maybe rename "u64 offset" to "u64 l1_tsc_offset"?
>
> > This patch renames the function and moves the common logic to the
>
> Use imperative mood instead of "This patch. From
> Documentation/process/submitting-patches.rst:
>
> Describe your changes in imperative mood, e.g. "make xyzzy do frotz"
> instead of "[This patch] makes xyzzy do frotz" or "[I] changed xyzzy
> to do frotz", as if you are giving orders to the codebase to change
> its behaviour.
>
> > caller. The vmx/svm-specific code now merely sets the given offset to
> > the corresponding hardware structure.
> >
> > Signed-off-by: Ilias Stamatis <ilstam@xxxxxxxxxx>
> > ---
> > arch/x86/include/asm/kvm-x86-ops.h | 2 +-
> > arch/x86/include/asm/kvm_host.h | 3 +--
> > arch/x86/kvm/svm/svm.c | 21 ++++-----------------
> > arch/x86/kvm/vmx/vmx.c | 23 +++--------------------
> > arch/x86/kvm/x86.c | 17 ++++++++++++++++-
> > 5 files changed, 25 insertions(+), 41 deletions(-)
> >
> > diff --git a/arch/x86/include/asm/kvm-x86-ops.h b/arch/x86/include/asm/kvm-x86-ops.h
> > index 2063616fba1c..029c9615378f 100644
> > --- a/arch/x86/include/asm/kvm-x86-ops.h
> > +++ b/arch/x86/include/asm/kvm-x86-ops.h
> > @@ -89,7 +89,7 @@ KVM_X86_OP(load_mmu_pgd)
> > KVM_X86_OP_NULL(has_wbinvd_exit)
> > KVM_X86_OP(get_l2_tsc_offset)
> > KVM_X86_OP(get_l2_tsc_multiplier)
> > -KVM_X86_OP(write_l1_tsc_offset)
> > +KVM_X86_OP(write_tsc_offset)
> > KVM_X86_OP(get_exit_info)
> > KVM_X86_OP(check_intercept)
> > KVM_X86_OP(handle_exit_irqoff)
> > diff --git a/arch/x86/include/asm/kvm_host.h b/arch/x86/include/asm/kvm_host.h
> > index 57a25d8e8b0f..61cf201c001a 100644
> > --- a/arch/x86/include/asm/kvm_host.h
> > +++ b/arch/x86/include/asm/kvm_host.h
> > @@ -1307,8 +1307,7 @@ struct kvm_x86_ops {
> >
> > u64 (*get_l2_tsc_offset)(struct kvm_vcpu *vcpu);
> > u64 (*get_l2_tsc_multiplier)(struct kvm_vcpu *vcpu);
> > - /* Returns actual tsc_offset set in active VMCS */
> > - u64 (*write_l1_tsc_offset)(struct kvm_vcpu *vcpu, u64 offset);
> > + void (*write_tsc_offset)(struct kvm_vcpu *vcpu, u64 offset);
> >
> > /*
> > * Retrieve somewhat arbitrary exit information. Intended to be used
> > diff --git a/arch/x86/kvm/svm/svm.c b/arch/x86/kvm/svm/svm.c
> > index 679b2fc1a3f9..b18f60463073 100644
> > --- a/arch/x86/kvm/svm/svm.c
> > +++ b/arch/x86/kvm/svm/svm.c
> > @@ -1094,26 +1094,13 @@ static u64 svm_get_l2_tsc_multiplier(struct kvm_vcpu *vcpu)
> > return kvm_default_tsc_scaling_ratio;
> > }
> >
> > -static u64 svm_write_l1_tsc_offset(struct kvm_vcpu *vcpu, u64 offset)
> > +static void svm_write_tsc_offset(struct kvm_vcpu *vcpu, u64 offset)
> > {
> > struct vcpu_svm *svm = to_svm(vcpu);
> > - u64 g_tsc_offset = 0;
> > -
> > - if (is_guest_mode(vcpu)) {
> > - /* Write L1's TSC offset. */
> > - g_tsc_offset = svm->vmcb->control.tsc_offset -
> > - svm->vmcb01.ptr->control.tsc_offset;
> > - svm->vmcb01.ptr->control.tsc_offset = offset;
> > - }
> > -
> > - trace_kvm_write_tsc_offset(vcpu->vcpu_id,
> > - svm->vmcb->control.tsc_offset - g_tsc_offset,
> > - offset);
> > -
> > - svm->vmcb->control.tsc_offset = offset + g_tsc_offset;
> >
> > + svm->vmcb01.ptr->control.tsc_offset = vcpu->arch.l1_tsc_offset;
> > + svm->vmcb->control.tsc_offset = offset;
> > vmcb_mark_dirty(svm->vmcb, VMCB_INTERCEPTS);
> > - return svm->vmcb->control.tsc_offset;
> > }
> >
> > /* Evaluate instruction intercepts that depend on guest CPUID features. */
> > @@ -4540,7 +4527,7 @@ static struct kvm_x86_ops svm_x86_ops __initdata = {
> >
> > .get_l2_tsc_offset = svm_get_l2_tsc_offset,
> > .get_l2_tsc_multiplier = svm_get_l2_tsc_multiplier,
> > - .write_l1_tsc_offset = svm_write_l1_tsc_offset,
> > + .write_tsc_offset = svm_write_tsc_offset,
> >
> > .load_mmu_pgd = svm_load_mmu_pgd,
> >
> > diff --git a/arch/x86/kvm/vmx/vmx.c b/arch/x86/kvm/vmx/vmx.c
> > index 575e13bddda8..3c4eb14a1e86 100644
> > --- a/arch/x86/kvm/vmx/vmx.c
> > +++ b/arch/x86/kvm/vmx/vmx.c
> > @@ -1810,26 +1810,9 @@ static u64 vmx_get_l2_tsc_multiplier(struct kvm_vcpu *vcpu)
> > return multiplier;
> > }
> >
> > -static u64 vmx_write_l1_tsc_offset(struct kvm_vcpu *vcpu, u64 offset)
> > +static void vmx_write_tsc_offset(struct kvm_vcpu *vcpu, u64 offset)
> > {
> > - struct vmcs12 *vmcs12 = get_vmcs12(vcpu);
> > - u64 g_tsc_offset = 0;
> > -
> > - /*
> > - * We're here if L1 chose not to trap WRMSR to TSC. According
> > - * to the spec, this should set L1's TSC; The offset that L1
> > - * set for L2 remains unchanged, and still needs to be added
> > - * to the newly set TSC to get L2's TSC.
> > - */
> > - if (is_guest_mode(vcpu) &&
> > - (vmcs12->cpu_based_vm_exec_control & CPU_BASED_USE_TSC_OFFSETTING))
> > - g_tsc_offset = vmcs12->tsc_offset;
> > -
> > - trace_kvm_write_tsc_offset(vcpu->vcpu_id,
> > - vcpu->arch.tsc_offset - g_tsc_offset,
> > - offset);
> > - vmcs_write64(TSC_OFFSET, offset + g_tsc_offset);
> > - return offset + g_tsc_offset;
> > + vmcs_write64(TSC_OFFSET, offset);
> > }
> >
> > /*
> > @@ -7725,7 +7708,7 @@ static struct kvm_x86_ops vmx_x86_ops __initdata = {
> >
> > .get_l2_tsc_offset = vmx_get_l2_tsc_offset,
> > .get_l2_tsc_multiplier = vmx_get_l2_tsc_multiplier,
> > - .write_l1_tsc_offset = vmx_write_l1_tsc_offset,
> > + .write_tsc_offset = vmx_write_tsc_offset,
> >
> > .load_mmu_pgd = vmx_load_mmu_pgd,
> >
> > diff --git a/arch/x86/kvm/x86.c b/arch/x86/kvm/x86.c
> > index 1db6cfc2079f..f3ba1be4d5b9 100644
> > --- a/arch/x86/kvm/x86.c
> > +++ b/arch/x86/kvm/x86.c
> > @@ -2377,8 +2377,23 @@ EXPORT_SYMBOL_GPL(kvm_set_02_tsc_multiplier);
> >
> > static void kvm_vcpu_write_tsc_offset(struct kvm_vcpu *vcpu, u64 offset)
> > {
> > + trace_kvm_write_tsc_offset(vcpu->vcpu_id,
> > + vcpu->arch.l1_tsc_offset,
> > + offset);
> > +
> > vcpu->arch.l1_tsc_offset = offset;
> > - vcpu->arch.tsc_offset = static_call(kvm_x86_write_l1_tsc_offset)(vcpu, offset);
> > + vcpu->arch.tsc_offset = offset;
> > +
> > + if (is_guest_mode(vcpu)) {
>
> Unnecessary curly braces.

Really? We are supposed to have a 6-lines body without brackets? I'm not
opposing, I'm just surprised that that's the coding standard.

>
> > + /*
> > + * We're here if L1 chose not to trap WRMSR to TSC and
> > + * according to the spec this should set L1's TSC (as opposed
> > + * to setting L1's offset for L2).
> > + */
>
> While we're shuffling code, can we improve this comment? It works for the WRMSR
> case, but makes no sense in the context of host TSC adjustments. It's not at all
> clear to me that it's even correct or relevant in those cases.
>

Do you suggest removing it completely or how do you want it to be? I don't
mind deleting it.

> > + kvm_set_02_tsc_offset(vcpu);
>
> I really dislike that kvm_set_02_tsc_offset() consumes a bunch of variables
> _and_ sets arch.tsc_offset, e.g. it's not at all obvious that moving this call
> around will break L2. Even more bizarre is that arch.tsc_offset is conditionally
> consumed. Oh, and kvm_set_02_tsc_offset() is not idempotent since it can do a
> RMW on arch.tsc_offset.
>
> The below static_call() dependency doesn't appear to be easily solved, but we
> can at least strongly hint that vcpu->arch.tsc_offset is set. For everything
> else, I think we can clean things up by doing this (with the vendor calls
> providing the L2 variables directly):
>
> void kvm_set_l2_tsc_offset(struct kvm_vcpu *vcpu, u64 l2_offset,
> u64 l2_multiplier)
> {
> u64 l1_offset = vcpu->arch.l1_tsc_offset;
>
> if (l2_multiplier != kvm_default_tsc_scaling_ratio)
> l2_offset += mul_s64_u64_shr((s64)l1_tsc_offset, l2_multiplier,
> kvm_tsc_scaling_ratio_frac_bits);
>
> vcpu->arch.tsc_offset = l2_offset;
> }
> EXPORT_SYMBOL_GPL(kvm_get_l2_tsc_offset);
>
> static void kvm_vcpu_write_tsc_offset(struct kvm_vcpu *vcpu, u64 offset)
> {
> trace_kvm_write_tsc_offset(vcpu->vcpu_id,
> vcpu->arch.l1_tsc_offset,
> offset);
>
> vcpu->arch.l1_tsc_offset = offset;
>
> if (is_guest_mode(vcpu))
> kvm_set_l2_tsc_offset(vcpu,
> static_call(kvm_x86_get_l2_tsc_offset)(vcpu),
> static_call(kvm_x86_get_l2_tsc_multiplier)(vcpu));
> else
> vcpu->arch.tsc_offset = offset;
>
> static_call(kvm_x86_write_tsc_offset)(vcpu, vcpu->arch.tsc_offset);
> }
>

OK, I will change it to what you suggest above.

>
> An alternative would be to explicitly track L1 and L2, and _never_ track the
> current TSC values. I.e. always compute the correct value on the fly. I think
> the only hot path is the TSC deadline timer, and AFAICT that always runs with
> L1's timer. Speaking of which, at the end of this series, vmx_set_hv_timer()
> uses L1's TSC but the current scaling ratio; that can't be right.
>

You are right, good catch, I will add this to patch 2.

I think let's leave the vcpu->arch.tsc_scaling_ratio variable as is for now.

> > + }
> > +
> > + static_call(kvm_x86_write_tsc_offset)(vcpu, vcpu->arch.tsc_offset);
> > }
> >
> > static inline bool kvm_check_tsc_unstable(void)
> > --
> > 2.17.1
> >