[RFC 6/6] x86/kvm: use enlightened VMCS when running on Hyper-V

From: Vitaly Kuznetsov
Date: Mon Jan 15 2018 - 12:32:10 EST


Early prototype.

When running nested KVM on Hyper-V it's possible to use the so-called
'Enlightened VMCS' and do normal memory reads/writes instead of
executing VMWRITE/VMREAD instructions. Tests show that this speeds up
a tight CPUID loop almost 3 times:

Before:
./cpuid_tight
20459

After:
./cpuid_tight
7698

checkpatch.pl errors/warnings and 32-bit brokenness are known issues.

Main RFC questions I have are:
- Do we want to have this per L2 VM or per L1 host?
- How can we achieve zero overhead for non-Hyper-V deployments? Use static
keys? But this will only work if we decide to do eVMCS per host.
- Can we do better than a big switch in evmcs_read()/evmcs_write()? And
preferably without the 'case' defines, which checkpatch.pl hates.

Signed-off-by: Vitaly Kuznetsov <vkuznets@xxxxxxxxxx>
---
arch/x86/kvm/vmx.c | 595 ++++++++++++++++++++++++++++++++++++++++++++++++++++-
1 file changed, 593 insertions(+), 2 deletions(-)

diff --git a/arch/x86/kvm/vmx.c b/arch/x86/kvm/vmx.c
index efff9d035543..dfdfd15c3d60 100644
--- a/arch/x86/kvm/vmx.c
+++ b/arch/x86/kvm/vmx.c
@@ -51,6 +51,7 @@
#include <asm/irq_remapping.h>
#include <asm/mmu_context.h>
#include <asm/intel_pt.h>
+#include <asm/mshyperv.h>

#include "trace.h"
#include "pmu.h"
@@ -198,6 +199,9 @@ extern const ulong vmx_return;

#define NR_AUTOLOAD_MSRS 8

+static bool __read_mostly enlightened_vmcs = true;
+module_param(enlightened_vmcs, bool, 0444);
+
struct vmcs {
u32 revision_id;
u32 abort;
@@ -1498,11 +1502,22 @@ static inline void loaded_vmcs_init(struct loaded_vmcs *loaded_vmcs)
loaded_vmcs->launched = 0;
}

+static inline void vmcs_load_enlightened(u64 phys_addr) /* make phys_addr this CPU's current enlightened VMCS */
+{
+ int cpu = smp_processor_id(); /* index of the running CPU's entry in hv_vp_assist_page[] */
+
+ hv_vp_assist_page[cpu]->current_nested_vmcs = phys_addr; /* evmcs_read()/evmcs_write() look the VMCS up through this field */
+ hv_vp_assist_page[cpu]->enlighten_vmentry = 1; /* NOTE(review): presumably asks Hyper-V to use the eVMCS on VM entry - confirm against the TLFS */
+}
+
static void vmcs_load(struct vmcs *vmcs)
{
u64 phys_addr = __pa(vmcs);
u8 error;

+ if (enlightened_vmcs)
+ return vmcs_load_enlightened(phys_addr);
+
asm volatile (__ex(ASM_VMX_VMPTRLD_RAX) "; setna %0"
: "=qm"(error) : "a"(&phys_addr), "m"(phys_addr)
: "cc", "memory");
@@ -1620,6 +1635,514 @@ static inline void ept_sync_context(u64 eptp)
ept_sync_global();
}

+/*
+ * Enlightened VMCSv1 doesn't support these:
+ * POSTED_INTR_NV = 0x00000002,
+ * GUEST_INTR_STATUS = 0x00000810,
+ * GUEST_PML_INDEX = 0x00000812,
+ * IO_BITMAP_A_HIGH = 0x00002001,
+ * IO_BITMAP_B_HIGH = 0x00002003,
+ * MSR_BITMAP_HIGH = 0x00002005,
+ * VM_EXIT_MSR_STORE_ADDR_HIGH = 0x00002007,
+ * VM_EXIT_MSR_LOAD_ADDR_HIGH = 0x00002009,
+ * VM_ENTRY_MSR_LOAD_ADDR_HIGH = 0x0000200b,
+ * PML_ADDRESS = 0x0000200e,
+ * PML_ADDRESS_HIGH = 0x0000200f,
+ * TSC_OFFSET_HIGH = 0x00002011,
+ * VIRTUAL_APIC_PAGE_ADDR_HIGH = 0x00002013,
+ * APIC_ACCESS_ADDR = 0x00002014,
+ * APIC_ACCESS_ADDR_HIGH = 0x00002015,
+ * POSTED_INTR_DESC_ADDR = 0x00002016,
+ * POSTED_INTR_DESC_ADDR_HIGH = 0x00002017,
+ * VM_FUNCTION_CONTROL = 0x00002018,
+ * VM_FUNCTION_CONTROL_HIGH = 0x00002019,
+ * EPT_POINTER_HIGH = 0x0000201b,
+ * EOI_EXIT_BITMAP0 = 0x0000201c,
+ * EOI_EXIT_BITMAP0_HIGH = 0x0000201d,
+ * EOI_EXIT_BITMAP1 = 0x0000201e,
+ * EOI_EXIT_BITMAP1_HIGH = 0x0000201f,
+ * EOI_EXIT_BITMAP2 = 0x00002020,
+ * EOI_EXIT_BITMAP2_HIGH = 0x00002021,
+ * EOI_EXIT_BITMAP3 = 0x00002022,
+ * EOI_EXIT_BITMAP3_HIGH = 0x00002023,
+ * EPTP_LIST_ADDRESS = 0x00002024,
+ * EPTP_LIST_ADDRESS_HIGH = 0x00002025,
+ * VMREAD_BITMAP = 0x00002026,
+ * VMWRITE_BITMAP = 0x00002028,
+ * XSS_EXIT_BITMAP_HIGH = 0x0000202D,
+ * TSC_MULTIPLIER = 0x00002032,
+ * TSC_MULTIPLIER_HIGH = 0x00002033,
+ * GUEST_PHYSICAL_ADDRESS_HIGH = 0x00002401,
+ * VMCS_LINK_POINTER_HIGH = 0x00002801,
+ * GUEST_IA32_DEBUGCTL_HIGH = 0x00002803,
+ * GUEST_IA32_PAT_HIGH = 0x00002805,
+ * GUEST_IA32_EFER_HIGH = 0x00002807,
+ * GUEST_IA32_PERF_GLOBAL_CTRL = 0x00002808,
+ * GUEST_IA32_PERF_GLOBAL_CTRL_HIGH= 0x00002809,
+ * GUEST_PDPTR0_HIGH = 0x0000280b,
+ * GUEST_PDPTR1_HIGH = 0x0000280d,
+ * GUEST_PDPTR2_HIGH = 0x0000280f,
+ * GUEST_PDPTR3_HIGH = 0x00002811,
+ * GUEST_BNDCFGS_HIGH = 0x00002813,
+ * GUEST_IA32_RTIT_CTL = 0x00002814,
+ * GUEST_IA32_RTIT_CTL_HIGH = 0x00002815,
+ * HOST_IA32_PAT_HIGH = 0x00002c01,
+ * HOST_IA32_EFER_HIGH = 0x00002c03,
+ * HOST_IA32_PERF_GLOBAL_CTRL = 0x00002c04,
+ * HOST_IA32_PERF_GLOBAL_CTRL_HIGH = 0x00002c05,
+ * VM_EXIT_MSR_STORE_COUNT = 0x0000400e,
+ * VM_EXIT_MSR_LOAD_COUNT = 0x00004010,
+ * VM_ENTRY_MSR_LOAD_COUNT = 0x00004014,
+ * PLE_GAP = 0x00004020,
+ * PLE_WINDOW = 0x00004022,
+ * VMX_PREEMPTION_TIMER_VALUE = 0x0000482E,
+ */
+
+#define evmcs_write_field(field, efield, mask) /* one 'case' arm of the switch in evmcs_write() */ \
+ case field: \
+ evmcs->efield = value; /* 'evmcs' and 'value' are taken from the expansion site */ \
+ evmcs->hv_clean_fields &= ~(mask); /* clear the given bit(s) so the field is no longer flagged clean */ \
+ break;
+
+#define evmcs_read_field(field, efield) /* one 'case' arm of the switch in evmcs_read() */ \
+ case field: \
+ return evmcs->efield; /* 'evmcs' is taken from the expansion site */
+
+static void evmcs_write(unsigned long field, u64 value)
+{
+ int cpu = smp_processor_id();
+ struct hv_enlightened_vmcs *evmcs =
+ __va(hv_vp_assist_page[cpu]->current_nested_vmcs);
+
+ switch (field) {
+ /* 64 bit fields */
+ evmcs_write_field(GUEST_RIP, guest_rip,
+ HV_VMX_ENLIGHTENED_CLEAN_FIELD_NONE);
+ evmcs_write_field(GUEST_RSP, guest_rsp,
+ HV_VMX_ENLIGHTENED_CLEAN_FIELD_GUEST_BASIC);
+ evmcs_write_field(GUEST_RFLAGS, guest_rflags,
+ HV_VMX_ENLIGHTENED_CLEAN_FIELD_GUEST_BASIC);
+ evmcs_write_field(HOST_IA32_PAT, host_ia32_pat,
+ HV_VMX_ENLIGHTENED_CLEAN_FIELD_HOST_GRP1);
+ evmcs_write_field(HOST_IA32_EFER, host_ia32_efer,
+ HV_VMX_ENLIGHTENED_CLEAN_FIELD_HOST_GRP1);
+ evmcs_write_field(HOST_CR0, host_cr0,
+ HV_VMX_ENLIGHTENED_CLEAN_FIELD_HOST_GRP1);
+ evmcs_write_field(HOST_CR3, host_cr3,
+ HV_VMX_ENLIGHTENED_CLEAN_FIELD_HOST_GRP1);
+ evmcs_write_field(HOST_CR4, host_cr4,
+ HV_VMX_ENLIGHTENED_CLEAN_FIELD_HOST_GRP1);
+ evmcs_write_field(HOST_IA32_SYSENTER_ESP,
+ host_ia32_sysenter_esp,
+ HV_VMX_ENLIGHTENED_CLEAN_FIELD_HOST_GRP1);
+ evmcs_write_field(HOST_IA32_SYSENTER_EIP,
+ host_ia32_sysenter_eip,
+ HV_VMX_ENLIGHTENED_CLEAN_FIELD_HOST_GRP1);
+ evmcs_write_field(HOST_RIP, host_rip,
+ HV_VMX_ENLIGHTENED_CLEAN_FIELD_HOST_GRP1);
+ evmcs_write_field(IO_BITMAP_A, io_bitmap_a,
+ HV_VMX_ENLIGHTENED_CLEAN_FIELD_IO_BITMAP);
+ evmcs_write_field(IO_BITMAP_B, io_bitmap_b,
+ HV_VMX_ENLIGHTENED_CLEAN_FIELD_IO_BITMAP);
+ evmcs_write_field(MSR_BITMAP, msr_bitmap,
+ HV_VMX_ENLIGHTENED_CLEAN_FIELD_MSR_BITMAP);
+ evmcs_write_field(GUEST_ES_BASE, guest_es_base,
+ HV_VMX_ENLIGHTENED_CLEAN_FIELD_GUEST_GRP2);
+ evmcs_write_field(GUEST_CS_BASE, guest_cs_base,
+ HV_VMX_ENLIGHTENED_CLEAN_FIELD_GUEST_GRP2);
+ evmcs_write_field(GUEST_SS_BASE, guest_ss_base,
+ HV_VMX_ENLIGHTENED_CLEAN_FIELD_GUEST_GRP2);
+ evmcs_write_field(GUEST_DS_BASE, guest_ds_base,
+ HV_VMX_ENLIGHTENED_CLEAN_FIELD_GUEST_GRP2);
+ evmcs_write_field(GUEST_FS_BASE, guest_fs_base,
+ HV_VMX_ENLIGHTENED_CLEAN_FIELD_GUEST_GRP2);
+ evmcs_write_field(GUEST_GS_BASE, guest_gs_base,
+ HV_VMX_ENLIGHTENED_CLEAN_FIELD_GUEST_GRP2);
+ evmcs_write_field(GUEST_LDTR_BASE, guest_ldtr_base,
+ HV_VMX_ENLIGHTENED_CLEAN_FIELD_GUEST_GRP2);
+ evmcs_write_field(GUEST_TR_BASE, guest_tr_base,
+ HV_VMX_ENLIGHTENED_CLEAN_FIELD_GUEST_GRP2);
+ evmcs_write_field(GUEST_GDTR_BASE, guest_gdtr_base,
+ HV_VMX_ENLIGHTENED_CLEAN_FIELD_GUEST_GRP2);
+ evmcs_write_field(GUEST_IDTR_BASE, guest_idtr_base,
+ HV_VMX_ENLIGHTENED_CLEAN_FIELD_GUEST_GRP2);
+ evmcs_write_field(TSC_OFFSET, tsc_offset,
+ HV_VMX_ENLIGHTENED_CLEAN_FIELD_CONTROL_GRP2);
+ evmcs_write_field(VIRTUAL_APIC_PAGE_ADDR,
+ virtual_apic_page_addr,
+ HV_VMX_ENLIGHTENED_CLEAN_FIELD_CONTROL_GRP2);
+ evmcs_write_field(VMCS_LINK_POINTER, vmcs_link_pointer,
+ HV_VMX_ENLIGHTENED_CLEAN_FIELD_GUEST_GRP1);
+ evmcs_write_field(GUEST_IA32_DEBUGCTL, guest_ia32_debugctl,
+ HV_VMX_ENLIGHTENED_CLEAN_FIELD_GUEST_GRP1);
+ evmcs_write_field(GUEST_IA32_PAT, guest_ia32_pat,
+ HV_VMX_ENLIGHTENED_CLEAN_FIELD_GUEST_GRP1);
+ evmcs_write_field(GUEST_IA32_EFER, guest_ia32_efer,
+ HV_VMX_ENLIGHTENED_CLEAN_FIELD_GUEST_GRP1);
+ evmcs_write_field(GUEST_PDPTR0, guest_pdptr0,
+ HV_VMX_ENLIGHTENED_CLEAN_FIELD_GUEST_GRP1);
+ evmcs_write_field(GUEST_PDPTR1, guest_pdptr1,
+ HV_VMX_ENLIGHTENED_CLEAN_FIELD_GUEST_GRP1);
+ evmcs_write_field(GUEST_PDPTR2, guest_pdptr2,
+ HV_VMX_ENLIGHTENED_CLEAN_FIELD_GUEST_GRP1);
+ evmcs_write_field(GUEST_PDPTR3, guest_pdptr3,
+ HV_VMX_ENLIGHTENED_CLEAN_FIELD_GUEST_GRP1);
+ evmcs_write_field(GUEST_PENDING_DBG_EXCEPTIONS,
+ guest_pending_dbg_exceptions,
+ HV_VMX_ENLIGHTENED_CLEAN_FIELD_GUEST_GRP1);
+ evmcs_write_field(GUEST_SYSENTER_ESP, guest_sysenter_esp,
+ HV_VMX_ENLIGHTENED_CLEAN_FIELD_GUEST_GRP1);
+ evmcs_write_field(GUEST_SYSENTER_EIP, guest_sysenter_eip,
+ HV_VMX_ENLIGHTENED_CLEAN_FIELD_GUEST_GRP1);
+ evmcs_write_field(CR0_GUEST_HOST_MASK, cr0_guest_host_mask,
+ HV_VMX_ENLIGHTENED_CLEAN_FIELD_CRDR);
+ evmcs_write_field(CR4_GUEST_HOST_MASK, cr4_guest_host_mask,
+ HV_VMX_ENLIGHTENED_CLEAN_FIELD_CRDR);
+ evmcs_write_field(CR0_READ_SHADOW, cr0_read_shadow,
+ HV_VMX_ENLIGHTENED_CLEAN_FIELD_CRDR);
+ evmcs_write_field(CR4_READ_SHADOW, cr4_read_shadow,
+ HV_VMX_ENLIGHTENED_CLEAN_FIELD_CRDR);
+ evmcs_write_field(GUEST_CR0, guest_cr0,
+ HV_VMX_ENLIGHTENED_CLEAN_FIELD_CRDR);
+ evmcs_write_field(GUEST_CR3, guest_cr3,
+ HV_VMX_ENLIGHTENED_CLEAN_FIELD_CRDR);
+ evmcs_write_field(GUEST_CR4, guest_cr4,
+ HV_VMX_ENLIGHTENED_CLEAN_FIELD_CRDR);
+ evmcs_write_field(GUEST_DR7, guest_dr7,
+ HV_VMX_ENLIGHTENED_CLEAN_FIELD_CRDR);
+ evmcs_write_field(HOST_FS_BASE, host_fs_base,
+ HV_VMX_ENLIGHTENED_CLEAN_FIELD_HOST_POINTER);
+ evmcs_write_field(HOST_GS_BASE, host_gs_base,
+ HV_VMX_ENLIGHTENED_CLEAN_FIELD_HOST_POINTER);
+ evmcs_write_field(HOST_TR_BASE, host_tr_base,
+ HV_VMX_ENLIGHTENED_CLEAN_FIELD_HOST_POINTER);
+ evmcs_write_field(HOST_GDTR_BASE, host_gdtr_base,
+ HV_VMX_ENLIGHTENED_CLEAN_FIELD_HOST_POINTER);
+ evmcs_write_field(HOST_IDTR_BASE, host_idtr_base,
+ HV_VMX_ENLIGHTENED_CLEAN_FIELD_HOST_POINTER);
+ evmcs_write_field(HOST_RSP, host_rsp,
+ HV_VMX_ENLIGHTENED_CLEAN_FIELD_HOST_POINTER);
+ evmcs_write_field(EPT_POINTER, ept_pointer,
+ HV_VMX_ENLIGHTENED_CLEAN_FIELD_CONTROL_XLAT);
+ evmcs_write_field(GUEST_BNDCFGS, guest_bndcfgs,
+ HV_VMX_ENLIGHTENED_CLEAN_FIELD_GUEST_GRP1);
+ evmcs_write_field(XSS_EXIT_BITMAP, xss_exit_bitmap,
+ HV_VMX_ENLIGHTENED_CLEAN_FIELD_CONTROL_GRP2);
+ /* no mask defined in the spec */
+ evmcs_write_field(VM_EXIT_MSR_STORE_ADDR,
+ vm_exit_msr_store_addr, 0xffff);
+ evmcs_write_field(VM_EXIT_MSR_LOAD_ADDR, vm_exit_msr_load_addr,
+ 0xffff);
+ evmcs_write_field(VM_ENTRY_MSR_LOAD_ADDR,
+ vm_entry_msr_load_addr, 0xffff);
+ evmcs_write_field(CR3_TARGET_VALUE0, cr3_target_value0, 0xffff);
+ evmcs_write_field(CR3_TARGET_VALUE1, cr3_target_value1, 0xffff);
+ evmcs_write_field(CR3_TARGET_VALUE2, cr3_target_value2, 0xffff);
+ evmcs_write_field(CR3_TARGET_VALUE3, cr3_target_value3, 0xffff);
+
+ /* 32 bit fields */
+ evmcs_write_field(TPR_THRESHOLD, tpr_threshold,
+ HV_VMX_ENLIGHTENED_CLEAN_FIELD_NONE);
+ evmcs_write_field(GUEST_INTERRUPTIBILITY_INFO,
+ guest_interruptibility_info,
+ HV_VMX_ENLIGHTENED_CLEAN_FIELD_GUEST_BASIC);
+ evmcs_write_field(CPU_BASED_VM_EXEC_CONTROL,
+ cpu_based_vm_exec_control,
+ HV_VMX_ENLIGHTENED_CLEAN_FIELD_CONTROL_PROC);
+ evmcs_write_field(EXCEPTION_BITMAP, exception_bitmap,
+ HV_VMX_ENLIGHTENED_CLEAN_FIELD_CONTROL_EXCPN);
+ evmcs_write_field(VM_ENTRY_CONTROLS, vm_entry_controls,
+ HV_VMX_ENLIGHTENED_CLEAN_FIELD_CONTROL_ENTRY);
+ evmcs_write_field(VM_ENTRY_INTR_INFO_FIELD,
+ vm_entry_intr_info_field,
+ HV_VMX_ENLIGHTENED_CLEAN_FIELD_CONTROL_EVENT);
+ evmcs_write_field(VM_ENTRY_EXCEPTION_ERROR_CODE,
+ vm_entry_exception_error_code,
+ HV_VMX_ENLIGHTENED_CLEAN_FIELD_CONTROL_EVENT);
+ evmcs_write_field(VM_ENTRY_INSTRUCTION_LEN,
+ vm_entry_instruction_len,
+ HV_VMX_ENLIGHTENED_CLEAN_FIELD_CONTROL_EVENT);
+ evmcs_write_field(HOST_IA32_SYSENTER_CS, host_ia32_sysenter_cs,
+ HV_VMX_ENLIGHTENED_CLEAN_FIELD_HOST_GRP1);
+ evmcs_write_field(PIN_BASED_VM_EXEC_CONTROL,
+ pin_based_vm_exec_control,
+ HV_VMX_ENLIGHTENED_CLEAN_FIELD_CONTROL_GRP1);
+ evmcs_write_field(VM_EXIT_CONTROLS, vm_exit_controls,
+ HV_VMX_ENLIGHTENED_CLEAN_FIELD_CONTROL_GRP1);
+ evmcs_write_field(SECONDARY_VM_EXEC_CONTROL,
+ secondary_vm_exec_control,
+ HV_VMX_ENLIGHTENED_CLEAN_FIELD_CONTROL_GRP1);
+ evmcs_write_field(GUEST_ES_LIMIT, guest_es_limit,
+ HV_VMX_ENLIGHTENED_CLEAN_FIELD_GUEST_GRP2);
+ evmcs_write_field(GUEST_CS_LIMIT, guest_cs_limit,
+ HV_VMX_ENLIGHTENED_CLEAN_FIELD_GUEST_GRP2);
+ evmcs_write_field(GUEST_SS_LIMIT, guest_ss_limit,
+ HV_VMX_ENLIGHTENED_CLEAN_FIELD_GUEST_GRP2);
+ evmcs_write_field(GUEST_DS_LIMIT, guest_ds_limit,
+ HV_VMX_ENLIGHTENED_CLEAN_FIELD_GUEST_GRP2);
+ evmcs_write_field(GUEST_FS_LIMIT, guest_fs_limit,
+ HV_VMX_ENLIGHTENED_CLEAN_FIELD_GUEST_GRP2);
+ evmcs_write_field(GUEST_GS_LIMIT, guest_gs_limit,
+ HV_VMX_ENLIGHTENED_CLEAN_FIELD_GUEST_GRP2);
+ evmcs_write_field(GUEST_LDTR_LIMIT, guest_ldtr_limit,
+ HV_VMX_ENLIGHTENED_CLEAN_FIELD_GUEST_GRP2);
+ evmcs_write_field(GUEST_TR_LIMIT, guest_tr_limit,
+ HV_VMX_ENLIGHTENED_CLEAN_FIELD_GUEST_GRP2);
+ evmcs_write_field(GUEST_GDTR_LIMIT, guest_gdtr_limit,
+ HV_VMX_ENLIGHTENED_CLEAN_FIELD_GUEST_GRP2);
+ evmcs_write_field(GUEST_IDTR_LIMIT, guest_idtr_limit,
+ HV_VMX_ENLIGHTENED_CLEAN_FIELD_GUEST_GRP2);
+ evmcs_write_field(GUEST_ES_AR_BYTES, guest_es_ar_bytes,
+ HV_VMX_ENLIGHTENED_CLEAN_FIELD_GUEST_GRP2);
+ evmcs_write_field(GUEST_CS_AR_BYTES, guest_cs_ar_bytes,
+ HV_VMX_ENLIGHTENED_CLEAN_FIELD_GUEST_GRP2);
+ evmcs_write_field(GUEST_SS_AR_BYTES, guest_ss_ar_bytes,
+ HV_VMX_ENLIGHTENED_CLEAN_FIELD_GUEST_GRP2);
+ evmcs_write_field(GUEST_DS_AR_BYTES, guest_ds_ar_bytes,
+ HV_VMX_ENLIGHTENED_CLEAN_FIELD_GUEST_GRP2);
+ evmcs_write_field(GUEST_FS_AR_BYTES, guest_fs_ar_bytes,
+ HV_VMX_ENLIGHTENED_CLEAN_FIELD_GUEST_GRP2);
+ evmcs_write_field(GUEST_GS_AR_BYTES, guest_gs_ar_bytes,
+ HV_VMX_ENLIGHTENED_CLEAN_FIELD_GUEST_GRP2);
+ evmcs_write_field(GUEST_LDTR_AR_BYTES, guest_ldtr_ar_bytes,
+ HV_VMX_ENLIGHTENED_CLEAN_FIELD_GUEST_GRP2);
+ evmcs_write_field(GUEST_TR_AR_BYTES, guest_tr_ar_bytes,
+ HV_VMX_ENLIGHTENED_CLEAN_FIELD_GUEST_GRP2);
+ evmcs_write_field(GUEST_ACTIVITY_STATE, guest_activity_state,
+ HV_VMX_ENLIGHTENED_CLEAN_FIELD_GUEST_GRP1);
+ evmcs_write_field(GUEST_SYSENTER_CS, guest_sysenter_cs,
+ HV_VMX_ENLIGHTENED_CLEAN_FIELD_GUEST_GRP1);
+ /* no mask defined in the spec */
+ evmcs_write_field(PAGE_FAULT_ERROR_CODE_MASK,
+ page_fault_error_code_mask, 0xffff);
+ evmcs_write_field(PAGE_FAULT_ERROR_CODE_MATCH,
+ page_fault_error_code_match, 0xffff);
+ evmcs_write_field(CR3_TARGET_COUNT, cr3_target_count,
+ 0xffff);
+ evmcs_write_field(VM_EXIT_MSR_STORE_COUNT,
+ vm_exit_msr_store_count, 0xffff);
+ evmcs_write_field(VM_EXIT_MSR_LOAD_COUNT,
+ vm_exit_msr_load_count, 0xffff);
+ evmcs_write_field(VM_ENTRY_MSR_LOAD_COUNT,
+ vm_entry_msr_load_count, 0xffff);
+
+ /* 16 bit fields */
+ evmcs_write_field(HOST_ES_SELECTOR, host_es_selector,
+ HV_VMX_ENLIGHTENED_CLEAN_FIELD_HOST_GRP1);
+ evmcs_write_field(HOST_CS_SELECTOR, host_cs_selector,
+ HV_VMX_ENLIGHTENED_CLEAN_FIELD_HOST_GRP1);
+ evmcs_write_field(HOST_SS_SELECTOR, host_ss_selector,
+ HV_VMX_ENLIGHTENED_CLEAN_FIELD_HOST_GRP1);
+ evmcs_write_field(HOST_DS_SELECTOR, host_ds_selector,
+ HV_VMX_ENLIGHTENED_CLEAN_FIELD_HOST_GRP1);
+ evmcs_write_field(HOST_FS_SELECTOR, host_fs_selector,
+ HV_VMX_ENLIGHTENED_CLEAN_FIELD_HOST_GRP1);
+ evmcs_write_field(HOST_GS_SELECTOR, host_gs_selector,
+ HV_VMX_ENLIGHTENED_CLEAN_FIELD_HOST_GRP1);
+ evmcs_write_field(HOST_TR_SELECTOR, host_tr_selector,
+ HV_VMX_ENLIGHTENED_CLEAN_FIELD_HOST_GRP1);
+ evmcs_write_field(GUEST_ES_SELECTOR, guest_es_selector,
+ HV_VMX_ENLIGHTENED_CLEAN_FIELD_GUEST_GRP2);
+ evmcs_write_field(GUEST_CS_SELECTOR, guest_cs_selector,
+ HV_VMX_ENLIGHTENED_CLEAN_FIELD_GUEST_GRP2);
+ evmcs_write_field(GUEST_SS_SELECTOR, guest_ss_selector,
+ HV_VMX_ENLIGHTENED_CLEAN_FIELD_GUEST_GRP2);
+ evmcs_write_field(GUEST_DS_SELECTOR, guest_ds_selector,
+ HV_VMX_ENLIGHTENED_CLEAN_FIELD_GUEST_GRP2);
+ evmcs_write_field(GUEST_FS_SELECTOR, guest_fs_selector,
+ HV_VMX_ENLIGHTENED_CLEAN_FIELD_GUEST_GRP2);
+ evmcs_write_field(GUEST_GS_SELECTOR, guest_gs_selector,
+ HV_VMX_ENLIGHTENED_CLEAN_FIELD_GUEST_GRP2);
+ evmcs_write_field(GUEST_LDTR_SELECTOR, guest_ldtr_selector,
+ HV_VMX_ENLIGHTENED_CLEAN_FIELD_GUEST_GRP2);
+ evmcs_write_field(GUEST_TR_SELECTOR, guest_tr_selector,
+ HV_VMX_ENLIGHTENED_CLEAN_FIELD_GUEST_GRP2);
+ evmcs_write_field(VIRTUAL_PROCESSOR_ID, virtual_processor_id,
+ HV_VMX_ENLIGHTENED_CLEAN_FIELD_CONTROL_XLAT);
+ default:
+ pr_err("VMX: no EVMCS support write:0x%lx\n", field);
+ }
+}
+
+static u64 evmcs_read(unsigned long field)
+{
+ int cpu = smp_processor_id();
+ struct hv_enlightened_vmcs *evmcs =
+ __va(hv_vp_assist_page[cpu]->current_nested_vmcs);
+
+ switch (field) {
+ /* 64 bit fields */
+ evmcs_read_field(GUEST_RIP, guest_rip);
+ evmcs_read_field(GUEST_RSP, guest_rsp);
+ evmcs_read_field(GUEST_RFLAGS, guest_rflags);
+ evmcs_read_field(HOST_IA32_PAT, host_ia32_pat);
+ evmcs_read_field(HOST_IA32_EFER, host_ia32_efer);
+ evmcs_read_field(HOST_CR0, host_cr0);
+ evmcs_read_field(HOST_CR3, host_cr3);
+ evmcs_read_field(HOST_CR4, host_cr4);
+ evmcs_read_field(HOST_IA32_SYSENTER_ESP,
+ host_ia32_sysenter_esp);
+ evmcs_read_field(HOST_IA32_SYSENTER_EIP,
+ host_ia32_sysenter_eip);
+ evmcs_read_field(HOST_RIP, host_rip);
+ evmcs_read_field(IO_BITMAP_A, io_bitmap_a);
+ evmcs_read_field(IO_BITMAP_B, io_bitmap_b);
+ evmcs_read_field(MSR_BITMAP, msr_bitmap);
+ evmcs_read_field(GUEST_ES_BASE, guest_es_base);
+ evmcs_read_field(GUEST_CS_BASE, guest_cs_base);
+ evmcs_read_field(GUEST_SS_BASE, guest_ss_base);
+ evmcs_read_field(GUEST_DS_BASE, guest_ds_base);
+ evmcs_read_field(GUEST_FS_BASE, guest_fs_base);
+ evmcs_read_field(GUEST_GS_BASE, guest_gs_base);
+ evmcs_read_field(GUEST_LDTR_BASE, guest_ldtr_base);
+ evmcs_read_field(GUEST_TR_BASE, guest_tr_base);
+ evmcs_read_field(GUEST_GDTR_BASE, guest_gdtr_base);
+ evmcs_read_field(GUEST_IDTR_BASE, guest_idtr_base);
+ evmcs_read_field(TSC_OFFSET, tsc_offset);
+ evmcs_read_field(VIRTUAL_APIC_PAGE_ADDR,
+ virtual_apic_page_addr);
+ evmcs_read_field(VMCS_LINK_POINTER, vmcs_link_pointer);
+ evmcs_read_field(GUEST_IA32_DEBUGCTL, guest_ia32_debugctl);
+ evmcs_read_field(GUEST_IA32_PAT, guest_ia32_pat);
+ evmcs_read_field(GUEST_IA32_EFER, guest_ia32_efer);
+ evmcs_read_field(GUEST_PDPTR0, guest_pdptr0);
+ evmcs_read_field(GUEST_PDPTR1, guest_pdptr1);
+ evmcs_read_field(GUEST_PDPTR2, guest_pdptr2);
+ evmcs_read_field(GUEST_PDPTR3, guest_pdptr3);
+ evmcs_read_field(GUEST_PENDING_DBG_EXCEPTIONS,
+ guest_pending_dbg_exceptions);
+ evmcs_read_field(GUEST_SYSENTER_ESP, guest_sysenter_esp);
+ evmcs_read_field(GUEST_SYSENTER_EIP, guest_sysenter_eip);
+ evmcs_read_field(CR0_GUEST_HOST_MASK, cr0_guest_host_mask);
+ evmcs_read_field(CR4_GUEST_HOST_MASK, cr4_guest_host_mask);
+ evmcs_read_field(CR0_READ_SHADOW, cr0_read_shadow);
+ evmcs_read_field(CR4_READ_SHADOW, cr4_read_shadow);
+ evmcs_read_field(GUEST_CR0, guest_cr0);
+ evmcs_read_field(GUEST_CR3, guest_cr3);
+ evmcs_read_field(GUEST_CR4, guest_cr4);
+ evmcs_read_field(GUEST_DR7, guest_dr7);
+ evmcs_read_field(HOST_FS_BASE, host_fs_base);
+ evmcs_read_field(HOST_GS_BASE, host_gs_base);
+ evmcs_read_field(HOST_TR_BASE, host_tr_base);
+ evmcs_read_field(HOST_GDTR_BASE, host_gdtr_base);
+ evmcs_read_field(HOST_IDTR_BASE, host_idtr_base);
+ evmcs_read_field(HOST_RSP, host_rsp);
+ evmcs_read_field(EPT_POINTER, ept_pointer);
+ evmcs_read_field(GUEST_BNDCFGS, guest_bndcfgs);
+ evmcs_read_field(XSS_EXIT_BITMAP, xss_exit_bitmap);
+ evmcs_read_field(GUEST_PHYSICAL_ADDRESS,
+ guest_physical_address);
+ evmcs_read_field(EXIT_QUALIFICATION, exit_qualification);
+ /*
+ * Not implemented in KVM:
+ * evmcs_read_field(0x00006402, exit_io_instruction_ecx);
+ * evmcs_read_field(0x00006404, exit_io_instruction_esi);
+ * evmcs_read_field(0x00006406, exit_io_instruction_edi);
+ * evmcs_read_field(0x00006408, exit_io_instruction_eip);
+ */
+ evmcs_read_field(GUEST_LINEAR_ADDRESS, guest_linear_address);
+
+ /* no mask defined in the spec */
+ evmcs_read_field(VM_EXIT_MSR_STORE_ADDR,
+ vm_exit_msr_store_addr);
+ evmcs_read_field(VM_EXIT_MSR_LOAD_ADDR, vm_exit_msr_load_addr);
+ evmcs_read_field(VM_ENTRY_MSR_LOAD_ADDR,
+ vm_entry_msr_load_addr);
+ evmcs_read_field(CR3_TARGET_VALUE0, cr3_target_value0);
+ evmcs_read_field(CR3_TARGET_VALUE1, cr3_target_value1);
+ evmcs_read_field(CR3_TARGET_VALUE2, cr3_target_value2);
+ evmcs_read_field(CR3_TARGET_VALUE3, cr3_target_value3);
+
+ /* 32 bit fields */
+ evmcs_read_field(TPR_THRESHOLD, tpr_threshold);
+ evmcs_read_field(GUEST_INTERRUPTIBILITY_INFO,
+ guest_interruptibility_info);
+ evmcs_read_field(CPU_BASED_VM_EXEC_CONTROL,
+ cpu_based_vm_exec_control);
+ evmcs_read_field(EXCEPTION_BITMAP, exception_bitmap);
+ evmcs_read_field(VM_ENTRY_CONTROLS, vm_entry_controls);
+ evmcs_read_field(VM_ENTRY_INTR_INFO_FIELD,
+ vm_entry_intr_info_field);
+ evmcs_read_field(VM_ENTRY_EXCEPTION_ERROR_CODE,
+ vm_entry_exception_error_code);
+ evmcs_read_field(VM_ENTRY_INSTRUCTION_LEN,
+ vm_entry_instruction_len);
+ evmcs_read_field(HOST_IA32_SYSENTER_CS, host_ia32_sysenter_cs);
+ evmcs_read_field(PIN_BASED_VM_EXEC_CONTROL,
+ pin_based_vm_exec_control);
+ evmcs_read_field(VM_EXIT_CONTROLS, vm_exit_controls);
+ evmcs_read_field(SECONDARY_VM_EXEC_CONTROL,
+ secondary_vm_exec_control);
+ evmcs_read_field(GUEST_ES_LIMIT, guest_es_limit);
+ evmcs_read_field(GUEST_CS_LIMIT, guest_cs_limit);
+ evmcs_read_field(GUEST_SS_LIMIT, guest_ss_limit);
+ evmcs_read_field(GUEST_DS_LIMIT, guest_ds_limit);
+ evmcs_read_field(GUEST_FS_LIMIT, guest_fs_limit);
+ evmcs_read_field(GUEST_GS_LIMIT, guest_gs_limit);
+ evmcs_read_field(GUEST_LDTR_LIMIT, guest_ldtr_limit);
+ evmcs_read_field(GUEST_TR_LIMIT, guest_tr_limit);
+ evmcs_read_field(GUEST_GDTR_LIMIT, guest_gdtr_limit);
+ evmcs_read_field(GUEST_IDTR_LIMIT, guest_idtr_limit);
+ evmcs_read_field(GUEST_ES_AR_BYTES, guest_es_ar_bytes);
+ evmcs_read_field(GUEST_CS_AR_BYTES, guest_cs_ar_bytes);
+ evmcs_read_field(GUEST_SS_AR_BYTES, guest_ss_ar_bytes);
+ evmcs_read_field(GUEST_DS_AR_BYTES, guest_ds_ar_bytes);
+ evmcs_read_field(GUEST_FS_AR_BYTES, guest_fs_ar_bytes);
+ evmcs_read_field(GUEST_GS_AR_BYTES, guest_gs_ar_bytes);
+ evmcs_read_field(GUEST_LDTR_AR_BYTES, guest_ldtr_ar_bytes);
+ evmcs_read_field(GUEST_TR_AR_BYTES, guest_tr_ar_bytes);
+ evmcs_read_field(GUEST_ACTIVITY_STATE, guest_activity_state);
+ evmcs_read_field(GUEST_SYSENTER_CS, guest_sysenter_cs);
+ evmcs_read_field(VM_INSTRUCTION_ERROR, vm_instruction_error);
+ evmcs_read_field(VM_EXIT_REASON, vm_exit_reason);
+ evmcs_read_field(VM_EXIT_INTR_INFO, vm_exit_intr_info);
+ evmcs_read_field(VM_EXIT_INTR_ERROR_CODE,
+ vm_exit_intr_error_code);
+ evmcs_read_field(IDT_VECTORING_INFO_FIELD,
+ idt_vectoring_info_field);
+ evmcs_read_field(IDT_VECTORING_ERROR_CODE,
+ idt_vectoring_error_code);
+ evmcs_read_field(VM_EXIT_INSTRUCTION_LEN,
+ vm_exit_instruction_len);
+ evmcs_read_field(VMX_INSTRUCTION_INFO, vmx_instruction_info);
+ /* no mask defined in the spec */
+ evmcs_read_field(PAGE_FAULT_ERROR_CODE_MASK,
+ page_fault_error_code_mask);
+ evmcs_read_field(PAGE_FAULT_ERROR_CODE_MATCH,
+ page_fault_error_code_match);
+ evmcs_read_field(CR3_TARGET_COUNT, cr3_target_count);
+ evmcs_read_field(VM_EXIT_MSR_STORE_COUNT,
+ vm_exit_msr_store_count);
+ evmcs_read_field(VM_EXIT_MSR_LOAD_COUNT,
+ vm_exit_msr_load_count);
+ evmcs_read_field(VM_ENTRY_MSR_LOAD_COUNT,
+ vm_entry_msr_load_count);
+
+ /* 16 bit fields */
+ evmcs_read_field(HOST_ES_SELECTOR, host_es_selector);
+ evmcs_read_field(HOST_CS_SELECTOR, host_cs_selector);
+ evmcs_read_field(HOST_SS_SELECTOR, host_ss_selector);
+ evmcs_read_field(HOST_DS_SELECTOR, host_ds_selector);
+ evmcs_read_field(HOST_FS_SELECTOR, host_fs_selector);
+ evmcs_read_field(HOST_GS_SELECTOR, host_gs_selector);
+ evmcs_read_field(HOST_TR_SELECTOR, host_tr_selector);
+ evmcs_read_field(GUEST_ES_SELECTOR, guest_es_selector);
+ evmcs_read_field(GUEST_CS_SELECTOR, guest_cs_selector);
+ evmcs_read_field(GUEST_SS_SELECTOR, guest_ss_selector);
+ evmcs_read_field(GUEST_DS_SELECTOR, guest_ds_selector);
+ evmcs_read_field(GUEST_FS_SELECTOR, guest_fs_selector);
+ evmcs_read_field(GUEST_GS_SELECTOR, guest_gs_selector);
+ evmcs_read_field(GUEST_LDTR_SELECTOR, guest_ldtr_selector);
+ evmcs_read_field(GUEST_TR_SELECTOR, guest_tr_selector);
+ evmcs_read_field(VIRTUAL_PROCESSOR_ID, virtual_processor_id);
+
+ default:
+ pr_err("VMX: no EVMCS support read:0x%lx\n", field);
+ }
+
+ return 0;
+}
+
static __always_inline void vmcs_check16(unsigned long field)
{
BUILD_BUG_ON_MSG(__builtin_constant_p(field) && ((field) & 0x6001) == 0x2000,
@@ -1676,18 +2199,24 @@ static __always_inline unsigned long __vmcs_readl(unsigned long field)
static __always_inline u16 vmcs_read16(unsigned long field)
{
vmcs_check16(field);
+ if (enlightened_vmcs)
+ return evmcs_read(field);
return __vmcs_readl(field);
}

static __always_inline u32 vmcs_read32(unsigned long field)
{
vmcs_check32(field);
+ if (enlightened_vmcs)
+ return evmcs_read(field);
return __vmcs_readl(field);
}

static __always_inline u64 vmcs_read64(unsigned long field)
{
vmcs_check64(field);
+ if (enlightened_vmcs)
+ return evmcs_read(field);
#ifdef CONFIG_X86_64
return __vmcs_readl(field);
#else
@@ -1698,6 +2227,8 @@ static __always_inline u64 vmcs_read64(unsigned long field)
static __always_inline unsigned long vmcs_readl(unsigned long field)
{
vmcs_checkl(field);
+ if (enlightened_vmcs)
+ return evmcs_read(field);
return __vmcs_readl(field);
}

@@ -1721,18 +2252,27 @@ static __always_inline void __vmcs_writel(unsigned long field, unsigned long val
static __always_inline void vmcs_write16(unsigned long field, u16 value)
{
vmcs_check16(field);
+ if (enlightened_vmcs)
+ return evmcs_write(field, value);
+
__vmcs_writel(field, value);
}

static __always_inline void vmcs_write32(unsigned long field, u32 value)
{
vmcs_check32(field);
+ if (enlightened_vmcs)
+ return evmcs_write(field, value);
+
__vmcs_writel(field, value);
}

static __always_inline void vmcs_write64(unsigned long field, u64 value)
{
vmcs_check64(field);
+ if (enlightened_vmcs)
+ return evmcs_write(field, value);
+
__vmcs_writel(field, value);
#ifndef CONFIG_X86_64
asm volatile ("");
@@ -1743,6 +2283,9 @@ static __always_inline void vmcs_write64(unsigned long field, u64 value)
static __always_inline void vmcs_writel(unsigned long field, unsigned long value)
{
vmcs_checkl(field);
+ if (enlightened_vmcs)
+ return evmcs_write(field, value);
+
__vmcs_writel(field, value);
}

@@ -1750,6 +2293,9 @@ static __always_inline void vmcs_clear_bits(unsigned long field, u32 mask)
{
BUILD_BUG_ON_MSG(__builtin_constant_p(field) && ((field) & 0x6000) == 0x2000,
"vmcs_clear_bits does not support 64-bit fields");
+ if (enlightened_vmcs)
+ return evmcs_write(field, evmcs_read(field) & ~mask);
+
__vmcs_writel(field, __vmcs_readl(field) & ~mask);
}

@@ -1757,6 +2303,9 @@ static __always_inline void vmcs_set_bits(unsigned long field, u32 mask)
{
BUILD_BUG_ON_MSG(__builtin_constant_p(field) && ((field) & 0x6000) == 0x2000,
"vmcs_set_bits does not support 64-bit fields");
+ if (enlightened_vmcs)
+ return evmcs_write(field, evmcs_read(field) | mask);
+
__vmcs_writel(field, __vmcs_readl(field) | mask);
}

@@ -3891,7 +4440,11 @@ static __init int setup_vmcs_config(struct vmcs_config *vmcs_conf)
vmcs_conf->size = vmx_msr_high & 0x1fff;
vmcs_conf->order = get_order(vmcs_conf->size);
vmcs_conf->basic_cap = vmx_msr_high & ~0x1fff;
- vmcs_conf->revision_id = vmx_msr_low;
+
+ if (enlightened_vmcs)
+ vmcs_conf->revision_id = ms_hyperv.nested_features & 0xff;
+ else
+ vmcs_conf->revision_id = vmx_msr_low;

vmcs_conf->pin_based_exec_ctrl = _pin_based_exec_control;
vmcs_conf->cpu_based_exec_ctrl = _cpu_based_exec_control;
@@ -9520,6 +10073,7 @@ static void __noclone vmx_vcpu_run(struct kvm_vcpu *vcpu)
{
struct vcpu_vmx *vmx = to_vmx(vcpu);
unsigned long cr3, cr4;
+ struct hv_enlightened_vmcs *evmcs = NULL;

/* Record the guest's net vcpu time for enforced NMI injections. */
if (unlikely(!enable_vnmi &&
@@ -9581,6 +10135,17 @@ static void __noclone vmx_vcpu_run(struct kvm_vcpu *vcpu)
vmx_arm_hv_timer(vcpu);

vmx->__launched = vmx->loaded_vmcs->launched;
+
+ if (enlightened_vmcs) {
+ int cpu = smp_processor_id();
+
+ evmcs = __va(hv_vp_assist_page[cpu]->current_nested_vmcs);
+
+ /* Crude hack: put RSP-32 into the enlightened VMCS host_rsp field */
+ asm volatile ("mov %%rsp, (%%rax); sub $32, (%%rax)" : :
+ "a"(&evmcs->host_rsp));
+ vmx->host_rsp = evmcs->host_rsp;
+ }
asm(
/* Store host registers */
"push %%" _ASM_DX "; push %%" _ASM_BP ";"
@@ -9686,6 +10251,10 @@ static void __noclone vmx_vcpu_run(struct kvm_vcpu *vcpu)
#endif
);

+ /* All fields are CLEAN */
+ if (evmcs)
+ evmcs->hv_clean_fields |= HV_VMX_ENLIGHTENED_CLEAN_FIELD_ALL;
+
if (have_spec_ctrl) {
rdmsrl(MSR_IA32_SPEC_CTRL, vmx->spec_ctrl);
if (vmx->spec_ctrl)
@@ -12463,7 +13032,29 @@ static struct kvm_x86_ops vmx_x86_ops __ro_after_init = {

static int __init vmx_init(void)
{
- int r = kvm_init(&vmx_x86_ops, sizeof(struct vcpu_vmx),
+ int r;
+
+#ifdef CONFIG_HYPERVISOR_GUEST
+ if (enlightened_vmcs &&
+ ms_hyperv.hints & HV_X64_ENLIGHTENED_VMCS_RECOMMENDED) {
+ int cpu;
+
+ /* check that we have assist pages on all CPUs */
+ for_each_online_cpu(cpu) {
+ if (!hv_vp_assist_page[cpu]) {
+ enlightened_vmcs = false;
+ break;
+ }
+ }
+
+ if (enlightened_vmcs)
+ pr_info("VMX: using Hyper-V Enlightened VMCS\n");
+ } else {
+ enlightened_vmcs = false;
+ }
+#endif
+
+ r = kvm_init(&vmx_x86_ops, sizeof(struct vcpu_vmx),
__alignof__(struct vcpu_vmx), THIS_MODULE);
if (r)
return r;
--
2.14.3