[PATCH RFC 3/7] kvm: x86: XSAVE state and XFD MSRs context switch

From: Jing Liu
Date: Sun Feb 07 2021 - 01:58:48 EST


XFD allows the kernel to enable a feature state in XCR0 and to
receive a #NM trap when a task uses instructions accessing that state.
The kernel defines "struct fpu.state_mask" to indicate the saved xstate
and to interact with the XFD hardware when needed via a simple
conversion. Once a dynamic feature is detected, "state_mask" is expanded
and "state_ptr" is dynamically allocated to hold the whole state.
Meanwhile, once the state is no longer in its INIT state, the
corresponding XFD bit should not be armed anymore.

In KVM, "guest_fpu" serves any guest task running on this vcpu across
vmexit and vmenter. We provide a pre-allocated guest_fpu buffer and a
complete "guest_fpu.state_mask" to avoid detecting each dynamic feature
for each vcpu task. Meanwhile, to ensure that guest state is
saved/restored correctly by xsaves/xrstors, set IA32_XFD to zero during
vmexit and vmenter.

For "current->thread.fpu", since host and guest probably have different
state and mask, it also needs to be switched to the right context when
the fpu is loaded and put.

Signed-off-by: Jing Liu <jing2.liu@xxxxxxxxxxxxxxx>
---
arch/x86/include/asm/kvm_host.h | 3 ++
arch/x86/kernel/fpu/init.c | 1 +
arch/x86/kernel/fpu/xstate.c | 2 +
arch/x86/kvm/vmx/vmx.c | 76 +++++++++++++++++++++++++++++++++
arch/x86/kvm/vmx/vmx.h | 1 +
arch/x86/kvm/x86.c | 69 +++++++++++++++++++++++++-----
6 files changed, 141 insertions(+), 11 deletions(-)

diff --git a/arch/x86/include/asm/kvm_host.h b/arch/x86/include/asm/kvm_host.h
index 7e5f33a0d0e2..6dedf3d22659 100644
--- a/arch/x86/include/asm/kvm_host.h
+++ b/arch/x86/include/asm/kvm_host.h
@@ -1203,6 +1203,9 @@ struct kvm_x86_ops {
struct x86_exception *exception);
void (*handle_exit_irqoff)(struct kvm_vcpu *vcpu);

+ void (*xfd_load)(struct kvm_vcpu *vcpu);
+ void (*xfd_put)(struct kvm_vcpu *vcpu);
+
void (*request_immediate_exit)(struct kvm_vcpu *vcpu);

void (*sched_in)(struct kvm_vcpu *kvm, int cpu);
diff --git a/arch/x86/kernel/fpu/init.c b/arch/x86/kernel/fpu/init.c
index 7e0c68043ce3..fbb761fc13ec 100644
--- a/arch/x86/kernel/fpu/init.c
+++ b/arch/x86/kernel/fpu/init.c
@@ -145,6 +145,7 @@ EXPORT_SYMBOL_GPL(fpu_kernel_xstate_min_size);
* can be dynamically expanded to include some states up to this size.
*/
unsigned int fpu_kernel_xstate_max_size;
+EXPORT_SYMBOL_GPL(fpu_kernel_xstate_max_size);

/* Get alignment of the TYPE. */
#define TYPE_ALIGN(TYPE) offsetof(struct { char x; TYPE test; }, test)
diff --git a/arch/x86/kernel/fpu/xstate.c b/arch/x86/kernel/fpu/xstate.c
index 080f3be9a5e6..9c471a0364e2 100644
--- a/arch/x86/kernel/fpu/xstate.c
+++ b/arch/x86/kernel/fpu/xstate.c
@@ -77,12 +77,14 @@ static struct xfeature_capflag_info xfeature_capflags[] __initdata = {
* XSAVE buffer, both supervisor and user xstates.
*/
u64 xfeatures_mask_all __read_mostly;
+EXPORT_SYMBOL_GPL(xfeatures_mask_all);

/*
* This represents user xstates, a subset of xfeatures_mask_all, saved in a
* dynamic kernel XSAVE buffer.
*/
u64 xfeatures_mask_user_dynamic __read_mostly;
+EXPORT_SYMBOL_GPL(xfeatures_mask_user_dynamic);

static unsigned int xstate_offsets[XFEATURE_MAX] = { [ 0 ... XFEATURE_MAX - 1] = -1};
static unsigned int xstate_sizes[XFEATURE_MAX] = { [ 0 ... XFEATURE_MAX - 1] = -1};
diff --git a/arch/x86/kvm/vmx/vmx.c b/arch/x86/kvm/vmx/vmx.c
index 7fa54e78c45c..be3cc0f3ec6d 100644
--- a/arch/x86/kvm/vmx/vmx.c
+++ b/arch/x86/kvm/vmx/vmx.c
@@ -1167,6 +1167,75 @@ static void pt_guest_exit(struct vcpu_vmx *vmx)
wrmsrl(MSR_IA32_RTIT_CTL, vmx->pt_desc.host.ctl);
}

+static void vmx_xfd_load(struct kvm_vcpu *vcpu)
+{
+ struct vcpu_vmx *vmx = to_vmx(vcpu);
+
+ if (guest_cpuid_has(vcpu, X86_FEATURE_XFD)) {
+ vmx->host_ia32_xfd = xfirstuse_not_detected(vcpu->arch.user_fpu);
+ /*
+ * Keep IA32_XFD at zero while in the hypervisor.
+ * The guest value is only restored in kvm_x86_ops.run.
+ */
+ if (vmx->host_ia32_xfd)
+ wrmsrl(MSR_IA32_XFD, 0);
+ }
+}
+
+static void vmx_xfd_put(struct kvm_vcpu *vcpu)
+{
+ struct vcpu_vmx *vmx = to_vmx(vcpu);
+
+ if (guest_cpuid_has(vcpu, X86_FEATURE_XFD)) {
+ /* Restore host IA32_XFD, kept as zero in hypervisor. */
+ if (vmx->host_ia32_xfd)
+ wrmsrl(MSR_IA32_XFD, vmx->host_ia32_xfd);
+ /* User (qemu) IA32_XFD_ERR should be zero. */
+ if (vmx->msr_ia32_xfd_err)
+ wrmsrl(MSR_IA32_XFD_ERR, 0);
+ }
+}
+
+/* Load the guest XFD MSRs before entering the guest. */
+static void xfd_guest_enter(struct vcpu_vmx *vmx)
+{
+ if (guest_cpuid_has(&vmx->vcpu, X86_FEATURE_XFD)) {
+ if (vmx->msr_ia32_xfd)
+ wrmsrl(MSR_IA32_XFD, vmx->msr_ia32_xfd);
+ /*
+ * Skip the rdmsr here since IA32_XFD_ERR is zero in
+ * most cases. The rare exception is a vmenter that
+ * follows a vmexit with a non-zero MSR_IA32_XFD_ERR
+ * which has not changed in the interval between
+ * the two.
+ *
+ * So simply load the non-zero guest value.
+ */
+ if (vmx->msr_ia32_xfd_err)
+ wrmsrl(MSR_IA32_XFD_ERR, vmx->msr_ia32_xfd_err);
+ }
+}
+
+/*
+ * Save the guest XFD MSRs on vmexit since the registers may be changed
+ * once control is transferred out of KVM, e.g. on preemption.
+ */
+static void xfd_guest_exit(struct vcpu_vmx *vmx)
+{
+ if (guest_cpuid_has(&vmx->vcpu, X86_FEATURE_XFD)) {
+ rdmsrl(MSR_IA32_XFD, vmx->msr_ia32_xfd);
+ rdmsrl(MSR_IA32_XFD_ERR, vmx->msr_ia32_xfd_err);
+ /*
+ * Clear MSR_IA32_XFD so that the guest fpu context is
+ * correctly protected in the hypervisor.
+ * No need to reset MSR_IA32_XFD_ERR in the hypervisor since
+ * it has no impact on others.
+ */
+ if (vmx->msr_ia32_xfd)
+ wrmsrl(MSR_IA32_XFD, 0);
+ }
+}
+
void vmx_set_host_fs_gs(struct vmcs_host_state *host, u16 fs_sel, u16 gs_sel,
unsigned long fs_base, unsigned long gs_base)
{
@@ -6735,6 +6804,8 @@ static fastpath_t vmx_vcpu_run(struct kvm_vcpu *vcpu)

kvm_load_guest_xsave_state(vcpu);

+ xfd_guest_enter(vmx);
+
pt_guest_enter(vmx);

atomic_switch_perf_msrs(vmx);
@@ -6804,6 +6875,8 @@ static fastpath_t vmx_vcpu_run(struct kvm_vcpu *vcpu)

pt_guest_exit(vmx);

+ xfd_guest_exit(vmx);
+
kvm_load_host_xsave_state(vcpu);

vmx->nested.nested_run_pending = 0;
@@ -7644,6 +7717,9 @@ static struct kvm_x86_ops vmx_x86_ops __initdata = {
.vcpu_load = vmx_vcpu_load,
.vcpu_put = vmx_vcpu_put,

+ .xfd_load = vmx_xfd_load,
+ .xfd_put = vmx_xfd_put,
+
.update_exception_bitmap = update_exception_bitmap,
.get_msr_feature = vmx_get_msr_feature,
.get_msr = vmx_get_msr,
diff --git a/arch/x86/kvm/vmx/vmx.h b/arch/x86/kvm/vmx/vmx.h
index d487f5a53a08..9a9ea37a29b1 100644
--- a/arch/x86/kvm/vmx/vmx.h
+++ b/arch/x86/kvm/vmx/vmx.h
@@ -288,6 +288,7 @@ struct vcpu_vmx {
} shadow_msr_intercept;

/* eXtended Feature Disabling (XFD) MSRs */
+ u64 host_ia32_xfd;
u64 msr_ia32_xfd;
u64 msr_ia32_xfd_err;
};
diff --git a/arch/x86/kvm/x86.c b/arch/x86/kvm/x86.c
index 9ca8b1e58afa..15908bc65d1c 100644
--- a/arch/x86/kvm/x86.c
+++ b/arch/x86/kvm/x86.c
@@ -9220,22 +9220,44 @@ static int complete_emulated_mmio(struct kvm_vcpu *vcpu)

static void kvm_save_current_fpu(struct fpu *fpu)
{
- struct fpu *src_fpu = &current->thread.fpu;
+ struct fpu *cur_fpu = &current->thread.fpu;

+ fpu->state_ptr = cur_fpu->state_ptr;
+ fpu->state_mask = cur_fpu->state_mask;
/*
* If the target FPU state is not resident in the CPU registers, just
* memcpy() from current, else save CPU state directly to the target.
*/
if (test_thread_flag(TIF_NEED_FPU_LOAD)) {
- memcpy(&fpu->state, &src_fpu->state,
- fpu_kernel_xstate_min_size);
+ /*
+ * No need to copy if a dynamic feature is in use, because
+ * both fpus then simply point to the same, most recent state.
+ */
+ if (!cur_fpu->state_ptr)
+ memcpy(&fpu->state, &cur_fpu->state,
+ fpu_kernel_xstate_min_size);
} else {
- if (fpu->state_mask != src_fpu->state_mask)
- fpu->state_mask = src_fpu->state_mask;
copy_fpregs_to_fpstate(fpu);
}
}

+/*
+ * Swap the fpu context to the next fpu role.
+ *
+ * The "current" fpu plays two roles: user context and guest context.
+ * Swap the "current" fpu to the next role so that dynamic state
+ * buffers are handled correctly, e.g. in the preemption case.
+ */
+static void kvm_load_next_fpu(struct fpu *next_fpu, u64 mask)
+{
+ struct fpu *cur_fpu = &current->thread.fpu;
+
+ cur_fpu->state_ptr = next_fpu->state_ptr;
+ cur_fpu->state_mask = next_fpu->state_mask;
+
+ __copy_kernel_to_fpregs(__xstate(next_fpu), mask);
+}
+
/* Swap (qemu) user FPU context for the guest FPU context. */
static void kvm_load_guest_fpu(struct kvm_vcpu *vcpu)
{
@@ -9243,9 +9265,11 @@ static void kvm_load_guest_fpu(struct kvm_vcpu *vcpu)

kvm_save_current_fpu(vcpu->arch.user_fpu);

+ if (static_cpu_has(X86_FEATURE_XFD) && kvm_x86_ops.xfd_load)
+ kvm_x86_ops.xfd_load(vcpu);
+
/* PKRU is separately restored in kvm_x86_ops.run. */
- __copy_kernel_to_fpregs(&vcpu->arch.guest_fpu->state,
- ~XFEATURE_MASK_PKRU);
+ kvm_load_next_fpu(vcpu->arch.guest_fpu, ~XFEATURE_MASK_PKRU);

fpregs_mark_activate();
fpregs_unlock();
@@ -9260,7 +9284,10 @@ static void kvm_put_guest_fpu(struct kvm_vcpu *vcpu)

kvm_save_current_fpu(vcpu->arch.guest_fpu);

- copy_kernel_to_fpregs(vcpu->arch.user_fpu);
+ if (static_cpu_has(X86_FEATURE_XFD) && kvm_x86_ops.xfd_put)
+ kvm_x86_ops.xfd_put(vcpu);
+
+ kvm_load_next_fpu(vcpu->arch.user_fpu, -1);

fpregs_mark_activate();
fpregs_unlock();
@@ -9840,11 +9867,13 @@ static int sync_regs(struct kvm_vcpu *vcpu)

static void fx_init(struct kvm_vcpu *vcpu)
{
+ struct xregs_state *xsave;
+
+ xsave = __xsave(vcpu->arch.guest_fpu);
fpstate_init(vcpu->arch.guest_fpu);
if (boot_cpu_has(X86_FEATURE_XSAVES))
- vcpu->arch.guest_fpu->state.xsave.header.xcomp_bv =
+ xsave->header.xcomp_bv =
host_xcr0 | XSTATE_COMPACTION_ENABLED;
-
/*
* Ensure guest xcr0 is valid for loading
*/
@@ -9920,6 +9949,21 @@ int kvm_arch_vcpu_create(struct kvm_vcpu *vcpu)
pr_err("kvm: failed to allocate vcpu's fpu\n");
goto free_user_fpu;
}
+
+ vcpu->arch.guest_fpu->state_mask = xfeatures_mask_all &
+ ~xfeatures_mask_user_dynamic;
+
+ /* If there are dynamic features, allocate the full context. */
+ if (xfeatures_mask_user_dynamic) {
+ vcpu->arch.guest_fpu->state_ptr =
+ kmalloc(fpu_kernel_xstate_max_size, GFP_KERNEL);
+ if (!vcpu->arch.guest_fpu->state_ptr)
+ goto free_guest_fpu;
+
+ vcpu->arch.guest_fpu->state_mask |=
+ xfeatures_mask_user_dynamic;
+ }
+
fx_init(vcpu);

vcpu->arch.maxphyaddr = cpuid_query_maxphyaddr(vcpu);
@@ -9936,7 +9980,7 @@ int kvm_arch_vcpu_create(struct kvm_vcpu *vcpu)

r = kvm_x86_ops.vcpu_create(vcpu);
if (r)
- goto free_guest_fpu;
+ goto free_guest_fpu_exp;

vcpu->arch.arch_capabilities = kvm_get_arch_capabilities();
vcpu->arch.msr_platform_info = MSR_PLATFORM_INFO_CPUID_FAULT;
@@ -9947,6 +9991,8 @@ int kvm_arch_vcpu_create(struct kvm_vcpu *vcpu)
vcpu_put(vcpu);
return 0;

+free_guest_fpu_exp:
+ kfree(vcpu->arch.guest_fpu->state_ptr);
free_guest_fpu:
kmem_cache_free(x86_fpu_cache, vcpu->arch.guest_fpu);
free_user_fpu:
@@ -10002,6 +10048,7 @@ void kvm_arch_vcpu_destroy(struct kvm_vcpu *vcpu)
kmem_cache_free(x86_emulator_cache, vcpu->arch.emulate_ctxt);
free_cpumask_var(vcpu->arch.wbinvd_dirty_mask);
kmem_cache_free(x86_fpu_cache, vcpu->arch.user_fpu);
+ kfree(vcpu->arch.guest_fpu->state_ptr);
kmem_cache_free(x86_fpu_cache, vcpu->arch.guest_fpu);

kvm_hv_vcpu_uninit(vcpu);
--
2.18.4