Re: [PATCH v2] kvm: nVMX: Introduce KVM_CAP_STATE

From: Jim Mattson
Date: Mon Apr 09 2018 - 14:30:26 EST


On Mon, Apr 9, 2018 at 1:37 AM, KarimAllah Ahmed <karahmed@xxxxxxxxx> wrote:
> From: Jim Mattson <jmattson@xxxxxxxxxx>
>
> For nested virtualization L0 KVM is managing a bit of state for L2 guests;
> this state cannot be captured through the currently available IOCTLs. In
> fact the state captured through all of these IOCTLs is usually a mix of L1
> and L2 state. It is also dependent on whether the L2 guest was running at
> the moment when the process was interrupted to save its state.
>
> With this capability, there are two new vcpu ioctls: KVM_GET_STATE and
> KVM_SET_STATE. These can be used for saving and restoring a VM that is
> in VMX operation.
>
> Cc: Paolo Bonzini <pbonzini@xxxxxxxxxx>
> Cc: Radim Krčmář <rkrcmar@xxxxxxxxxx>
> Cc: Thomas Gleixner <tglx@xxxxxxxxxxxxx>
> Cc: Ingo Molnar <mingo@xxxxxxxxxx>
> Cc: H. Peter Anvin <hpa@xxxxxxxxx>
> Cc: x86@xxxxxxxxxx
> Cc: kvm@xxxxxxxxxxxxxxx
> Cc: linux-kernel@xxxxxxxxxxxxxxx
> Signed-off-by: Jim Mattson <jmattson@xxxxxxxxxx>
> [karahmed@ - rename structs and functions and make them ready for AMD and
> address previous comments.
> - rebase & a bit of refactoring.
> - Merge 7/8 and 8/8 into one patch.
> - Force a VMExit from L2 after reading the kvm_state to avoid
> mixed state between L1 and L2 on resurrecting the instance. ]
> Signed-off-by: KarimAllah Ahmed <karahmed@xxxxxxxxx>

First, let me say "thank you" for picking this up!

> ---
> v1 -> v2:
> - rename structs and functions and make them ready for AMD and address
> previous comments.
> - rebase & a bit of refactoring.
> - Merge 7/8 and 8/8 into one patch.
> - Force a VMExit from L2 after reading the kvm_state to avoid mixed state
> between L1 and L2 on resurrecting the instance.
> ---
> Documentation/virtual/kvm/api.txt | 46 ++++++++++
> arch/x86/include/asm/kvm_host.h | 7 ++
> arch/x86/include/uapi/asm/kvm.h | 38 ++++++++
> arch/x86/kvm/vmx.c | 189 +++++++++++++++++++++++++++++++++++++-
> arch/x86/kvm/x86.c | 21 +++++
> include/uapi/linux/kvm.h | 5 +
> 6 files changed, 302 insertions(+), 4 deletions(-)
>
> diff --git a/Documentation/virtual/kvm/api.txt b/Documentation/virtual/kvm/api.txt
> index d6b3ff5..3ed56df 100644
> --- a/Documentation/virtual/kvm/api.txt
> +++ b/Documentation/virtual/kvm/api.txt
> @@ -3516,6 +3516,52 @@ Returns: 0 on success; -1 on error
> This ioctl can be used to unregister the guest memory region registered
> with KVM_MEMORY_ENCRYPT_REG_REGION ioctl above.
>
> +4.112 KVM_GET_STATE

More specifically, KVM_GET_NESTED_STATE?

> +
> +Capability: KVM_CAP_STATE

KVM_CAP_NESTED_STATE?

> +Architectures: x86
> +Type: vcpu ioctl
> +Parameters: struct kvm_state (in/out)
> +Returns: 0 on success, -1 on error
> +Errors:
> + E2BIG: the data size exceeds the value of 'size' specified by
> + the user (the size required will be written into size).
> +
> +struct kvm_state {
> + __u16 flags;
> + __u16 format;
> + __u32 size;
> + union {
> + struct kvm_vmx_state vmx;
> + struct kvm_svm_state svm;
> + __u8 pad[120];
> + };
> + __u8 data[0];
> +};
> +
> +This ioctl copies the vcpu's kvm_state struct from the kernel to userspace.
> +
> +4.113 KVM_SET_STATE

KVM_SET_NESTED_STATE?

> +
> +Capability: KVM_CAP_STATE

KVM_CAP_NESTED_STATE?

> +Architectures: x86
> +Type: vcpu ioctl
> +Parameters: struct kvm_state (in)
> +Returns: 0 on success, -1 on error
> +
> +struct kvm_state {
> + __u16 flags;
> + __u16 format;
> + __u32 size;
> + union {
> + struct kvm_vmx_state vmx;
> + struct kvm_svm_state svm;
> + __u8 pad[120];
> + };
> + __u8 data[0];
> +};
> +
> +This copies the vcpu's kvm_state struct from userspace to the kernel.
>
> 5. The kvm_run structure
> ------------------------
> diff --git a/arch/x86/include/asm/kvm_host.h b/arch/x86/include/asm/kvm_host.h
> index fad4d46..902db9e 100644
> --- a/arch/x86/include/asm/kvm_host.h
> +++ b/arch/x86/include/asm/kvm_host.h
> @@ -73,6 +73,7 @@
> #define KVM_REQ_HV_RESET KVM_ARCH_REQ(20)
> #define KVM_REQ_HV_EXIT KVM_ARCH_REQ(21)
> #define KVM_REQ_HV_STIMER KVM_ARCH_REQ(22)
> +#define KVM_REQ_GET_VMCS12_PAGES KVM_ARCH_REQ(23)
>
> #define CR0_RESERVED_BITS \
> (~(unsigned long)(X86_CR0_PE | X86_CR0_MP | X86_CR0_EM | X86_CR0_TS \
> @@ -1090,6 +1091,12 @@ struct kvm_x86_ops {
>
> void (*setup_mce)(struct kvm_vcpu *vcpu);
>
> + int (*get_state)(struct kvm_vcpu *vcpu,

get_nested_state

> + struct kvm_state __user *user_kvm_state);
> + int (*set_state)(struct kvm_vcpu *vcpu,

set_nested_state

> + struct kvm_state __user *user_kvm_state);
> + void (*get_vmcs12_pages)(struct kvm_vcpu *vcpu);
> +
> int (*smi_allowed)(struct kvm_vcpu *vcpu);
> int (*pre_enter_smm)(struct kvm_vcpu *vcpu, char *smstate);
> int (*pre_leave_smm)(struct kvm_vcpu *vcpu, u64 smbase);
> diff --git a/arch/x86/include/uapi/asm/kvm.h b/arch/x86/include/uapi/asm/kvm.h
> index f3a9604..1d1cd26 100644
> --- a/arch/x86/include/uapi/asm/kvm.h
> +++ b/arch/x86/include/uapi/asm/kvm.h
> @@ -361,4 +361,42 @@ struct kvm_sync_regs {
> #define KVM_X86_QUIRK_LINT0_REENABLED (1 << 0)
> #define KVM_X86_QUIRK_CD_NW_CLEARED (1 << 1)
>
> +#define KVM_STATE_GUEST_MODE 0x00000001
> +#define KVM_STATE_RUN_PENDING 0x00000002
> +#define KVM_STATE_GIF 0x00000004
> +
> +struct kvm_vmx_state {
> + __u64 vmxon_pa;
> + __u64 vmcs_pa;
> +};
> +
> +struct kvm_svm_state {
> + __u64 hsave_pa;
> + __u64 vmcb_pa;
> +};
> +
> +/* for KVM_CAP_STATE */
> +struct kvm_state {
> + /* KVM_STATE_* flags */
> + __u16 flags;
> +
> + /* 0 for VMX, 1 for SVM. */
> + __u16 format;
> +
> + /* 128 for SVM, 128 + VMCS size for VMX. */
> + __u32 size;
> +
> + union {
> + /* VMXON, VMCS */
> + struct kvm_vmx_state vmx;
> + /* HSAVE_PA, VMCB */
> + struct kvm_svm_state svm;
> +
> + /* Pad the union to 120 bytes. */
> + __u8 pad[120];
> + };
> +
> + __u8 data[0];
> +};
> +
> #endif /* _ASM_X86_KVM_H */
> diff --git a/arch/x86/kvm/vmx.c b/arch/x86/kvm/vmx.c
> index 14655df..4d830f7 100644
> --- a/arch/x86/kvm/vmx.c
> +++ b/arch/x86/kvm/vmx.c
> @@ -10056,10 +10056,10 @@ static void vmx_inject_page_fault_nested(struct kvm_vcpu *vcpu,
> static inline bool nested_vmx_prepare_msr_bitmap(struct kvm_vcpu *vcpu,
> struct vmcs12 *vmcs12);
>
> -static void nested_get_vmcs12_pages(struct kvm_vcpu *vcpu,
> - struct vmcs12 *vmcs12)
> +static void nested_get_vmcs12_pages(struct kvm_vcpu *vcpu)
> {
> struct vcpu_vmx *vmx = to_vmx(vcpu);
> + struct vmcs12 *vmcs12 = get_vmcs12(vcpu);
>
> if (nested_cpu_has2(vmcs12, SECONDARY_EXEC_VIRTUALIZE_APIC_ACCESSES)) {
> if (vmcs12->apic_access_addr != vmx->nested.apic_access_mapping.gfn << PAGE_SHIFT) {
> @@ -11101,8 +11101,6 @@ static int enter_vmx_non_root_mode(struct kvm_vcpu *vcpu, bool from_vmentry)
> return 1;
> }
>
> - nested_get_vmcs12_pages(vcpu, vmcs12);
> -
> msr_entry_idx = nested_vmx_load_msr(vcpu,
> vmcs12->vm_entry_msr_load_addr,
> vmcs12->vm_entry_msr_load_count);
> @@ -11200,6 +11198,8 @@ static int nested_vmx_run(struct kvm_vcpu *vcpu, bool launch)
> if (ret)
> return ret;
>
> + nested_get_vmcs12_pages(vcpu);
> +
> /*
> * If we're entering a halted L2 vcpu and the L2 vcpu won't be woken
> * by event injection, halt vcpu.
> @@ -12259,6 +12259,183 @@ static int enable_smi_window(struct kvm_vcpu *vcpu)
> return 0;
> }
>
> +static int get_vmcs_cache(struct kvm_vcpu *vcpu,
> + struct kvm_state __user *user_kvm_state)
> +{
> + struct vcpu_vmx *vmx = to_vmx(vcpu);
> + struct vmcs12 *vmcs12 = get_vmcs12(vcpu);
> +
> + /*
> + * When running L2, the authoritative vmcs12 state is in the
> + * vmcs02. When running L1, the authoritative vmcs12 state is
> + * in the shadow vmcs linked to vmcs01, unless
> + * sync_shadow_vmcs is set, in which case, the authoritative
> + * vmcs12 state is in the vmcs12 already.
> + */
> + if (is_guest_mode(vcpu))
> + sync_vmcs12(vcpu, vmcs12);
> + else if (enable_shadow_vmcs && !vmx->nested.sync_shadow_vmcs)
> + copy_shadow_to_vmcs12(vmx);
> +
> + if (copy_to_user(user_kvm_state->data, vmcs12, sizeof(*vmcs12)))
> + return -EFAULT;
> +
> + /*
> + * Force a nested exit that guarantees that any state capture
> + * afterwards by any IOCTLs (MSRs, etc) will not capture a mix of L1
> + * and L2 state.
> + *
> + * One example where that would lead to an issue is the TSC DEADLINE
> + * MSR vs the guest TSC. If the L2 guest is running, the guest TSC will
> + * be the L2 TSC while the TSC deadline MSR will contain the L1 TSC
> + * deadline MSR. That would lead to a very large (and wrong) "expire"
> + * diff when LAPIC is initialized during instance restore (i.e. the
> + * instance will appear to have hanged!).
> + */
> + if (is_guest_mode(vcpu))
> + nested_vmx_vmexit(vcpu, -1, 0, 0);

Injecting a fake VM-exit on restore is as bad as injecting a fake
VM-exit on save, and I don't think this is a good approach.

> +
> + return 0;
> +}
> +
> +static int get_vmx_state(struct kvm_vcpu *vcpu,
> + struct kvm_state __user *user_kvm_state)
> +{
> + u32 user_data_size;
> + struct vcpu_vmx *vmx = to_vmx(vcpu);
> + struct kvm_state kvm_state = {
> + .flags = 0,
> + .format = 0,
> + .size = sizeof(kvm_state),
> + .vmx.vmxon_pa = -1ull,
> + .vmx.vmcs_pa = -1ull,
> + };
> +
> + if (copy_from_user(&user_data_size, &user_kvm_state->size,
> + sizeof(user_data_size)))
> + return -EFAULT;
> +
> + if (nested_vmx_allowed(vcpu) && vmx->nested.vmxon) {
> + kvm_state.vmx.vmxon_pa = vmx->nested.vmxon_ptr;
> + kvm_state.vmx.vmcs_pa = vmx->nested.current_vmptr;
> +
> + if (vmx->nested.current_vmptr != -1ull)
> + kvm_state.size += VMCS12_SIZE;
> +
> + if (is_guest_mode(vcpu)) {
> + kvm_state.flags |= KVM_STATE_GUEST_MODE;
> +
> + if (vmx->nested.nested_run_pending)
> + kvm_state.flags |= KVM_STATE_RUN_PENDING;

IIRC, when I initially posted this set of changes, I neglected to
include the one that set nested_run_pending before prepare_vmcs02(),
and so this bit isn't actually tracked correctly for save/restore at
the moment.

> + }
> + }
> +
> + if (user_data_size < kvm_state.size) {
> + if (copy_to_user(&user_kvm_state->size, &kvm_state.size,
> + sizeof(kvm_state.size)))
> + return -EFAULT;
> + return -E2BIG;
> + }
> +
> + if (copy_to_user(user_kvm_state, &kvm_state, sizeof(kvm_state)))
> + return -EFAULT;
> +
> + if (vmx->nested.current_vmptr == -1ull)
> + return 0;
> +
> + return get_vmcs_cache(vcpu, user_kvm_state);
> +}
> +
> +static int set_vmcs_cache(struct kvm_vcpu *vcpu,
> + struct kvm_state __user *user_kvm_state,
> + struct kvm_state *kvm_state)
> +
> +{
> + struct vcpu_vmx *vmx = to_vmx(vcpu);
> + struct vmcs12 *vmcs12 = get_vmcs12(vcpu);
> + u32 exit_qual;
> + int ret;
> +
> + if ((kvm_state->size < (sizeof(*vmcs12) + sizeof(*kvm_state))) ||
> + kvm_state->vmx.vmcs_pa == kvm_state->vmx.vmxon_pa ||
> + !page_address_valid(vcpu, kvm_state->vmx.vmcs_pa))
> + return -EINVAL;
> +
> + if (copy_from_user(vmcs12, user_kvm_state->data, sizeof(*vmcs12)))
> + return -EFAULT;
> +
> + if (vmcs12->revision_id != VMCS12_REVISION)
> + return -EINVAL;
> +
> + set_current_vmptr(vmx, kvm_state->vmx.vmcs_pa);
> +
> + if (!(kvm_state->flags & KVM_STATE_GUEST_MODE))
> + return 0;
> +
> + if (check_vmentry_prereqs(vcpu, vmcs12) ||
> + check_vmentry_postreqs(vcpu, vmcs12, &exit_qual))
> + return -EINVAL;
> +
> + ret = enter_vmx_non_root_mode(vcpu, true);
> + if (ret)
> + return ret;
> +
> + /*
> + * This request will result in a call to
> + * nested_get_vmcs12_pages before the next VM-entry.
> + */
> + kvm_make_request(KVM_REQ_GET_VMCS12_PAGES, vcpu);
> +
> + vmx->nested.nested_run_pending = 1;
> +
> + return 0;
> +}
> +
> +static int set_vmx_state(struct kvm_vcpu *vcpu,
> + struct kvm_state __user *user_kvm_state)
> +{
> + struct vcpu_vmx *vmx = to_vmx(vcpu);
> + struct kvm_state kvm_state;
> + int ret;
> +
> + if (copy_from_user(&kvm_state, user_kvm_state, sizeof(kvm_state)))
> + return -EFAULT;
> +
> + if (kvm_state.size < sizeof(kvm_state))
> + return -EINVAL;
> +
> + if (kvm_state.format != 0)
> + return -EINVAL;
> +
> + if (kvm_state.flags &
> + ~(KVM_STATE_RUN_PENDING | KVM_STATE_GUEST_MODE))
> + return -EINVAL;
> +
> + if (!nested_vmx_allowed(vcpu))
> + return kvm_state.vmx.vmxon_pa == -1ull ? 0 : -EINVAL;
> +
> + vmx_leave_nested(vcpu);
> +
> + vmx->nested.nested_run_pending =
> + !!(kvm_state.flags & KVM_STATE_RUN_PENDING);
> +
> + if (kvm_state.vmx.vmxon_pa == -1ull)
> + return 0;
> +
> + if (!page_address_valid(vcpu, kvm_state.vmx.vmxon_pa))
> + return -EINVAL;
> +
> + vmx->nested.vmxon_ptr = kvm_state.vmx.vmxon_pa;
> + ret = enter_vmx_operation(vcpu);
> + if (ret)
> + return ret;
> +
> + if (kvm_state.vmx.vmcs_pa == -1ull)
> + return 0;
> +
> + return set_vmcs_cache(vcpu, user_kvm_state, &kvm_state);
> +}
> +
> static struct kvm_x86_ops vmx_x86_ops __ro_after_init = {
> .cpu_has_kvm_support = cpu_has_kvm_support,
> .disabled_by_bios = vmx_disabled_by_bios,
> @@ -12387,6 +12564,10 @@ static struct kvm_x86_ops vmx_x86_ops __ro_after_init = {
>
> .setup_mce = vmx_setup_mce,
>
> + .get_state = get_vmx_state,
> + .set_state = set_vmx_state,
> + .get_vmcs12_pages = nested_get_vmcs12_pages,
> +
> .smi_allowed = vmx_smi_allowed,
> .pre_enter_smm = vmx_pre_enter_smm,
> .pre_leave_smm = vmx_pre_leave_smm,
> diff --git a/arch/x86/kvm/x86.c b/arch/x86/kvm/x86.c
> index 963cdb9..1ab7cc5 100644
> --- a/arch/x86/kvm/x86.c
> +++ b/arch/x86/kvm/x86.c
> @@ -2873,6 +2873,9 @@ int kvm_vm_ioctl_check_extension(struct kvm *kvm, long ext)
> case KVM_CAP_X2APIC_API:
> r = KVM_X2APIC_API_VALID_FLAGS;
> break;
> + case KVM_CAP_STATE:
> + r = !!kvm_x86_ops->get_state;
> + break;
> default:
> r = 0;
> break;
> @@ -3892,6 +3895,22 @@ long kvm_arch_vcpu_ioctl(struct file *filp,
> r = kvm_vcpu_ioctl_enable_cap(vcpu, &cap);
> break;
> }
> + case KVM_GET_STATE: {
> + struct kvm_state __user *user_kvm_state = argp;
> +
> + r = -EINVAL;
> + if (kvm_x86_ops->get_state)
> + r = kvm_x86_ops->get_state(vcpu, user_kvm_state);
> + break;
> + }
> + case KVM_SET_STATE: {
> + struct kvm_state __user *user_kvm_state = argp;
> +
> + r = -EINVAL;
> + if (kvm_x86_ops->set_state)
> + r = kvm_x86_ops->set_state(vcpu, user_kvm_state);
> + break;
> + }
> default:
> r = -EINVAL;
> }
> @@ -7051,6 +7070,8 @@ static int vcpu_enter_guest(struct kvm_vcpu *vcpu)
> bool req_immediate_exit = false;
>
> if (kvm_request_pending(vcpu)) {
> + if (kvm_check_request(KVM_REQ_GET_VMCS12_PAGES, vcpu))
> + kvm_x86_ops->get_vmcs12_pages(vcpu);
> if (kvm_check_request(KVM_REQ_MMU_RELOAD, vcpu))
> kvm_mmu_unload(vcpu);
> if (kvm_check_request(KVM_REQ_MIGRATE_TIMER, vcpu))
> diff --git a/include/uapi/linux/kvm.h b/include/uapi/linux/kvm.h
> index 4e1d7f5..4c170ff 100644
> --- a/include/uapi/linux/kvm.h
> +++ b/include/uapi/linux/kvm.h
> @@ -949,6 +949,7 @@ struct kvm_ppc_resize_hpt {
> #define KVM_CAP_PPC_GET_CPU_CHAR 151
> #define KVM_CAP_S390_BPB 152
> #define KVM_CAP_GET_MSR_FEATURES 153
> +#define KVM_CAP_STATE 154
>
> #ifdef KVM_CAP_IRQ_ROUTING
>
> @@ -1380,6 +1381,10 @@ struct kvm_s390_ucas_mapping {
> /* Memory Encryption Commands */
> #define KVM_MEMORY_ENCRYPT_OP _IOWR(KVMIO, 0xba, unsigned long)
>
> +/* Available with KVM_CAP_STATE */
> +#define KVM_GET_STATE _IOWR(KVMIO, 0xbb, struct kvm_vmx_state)
> +#define KVM_SET_STATE _IOW(KVMIO, 0xbc, struct kvm_vmx_state)
> +
> struct kvm_enc_region {
> __u64 addr;
> __u64 size;
> --
> 2.7.4
>