Re: [PATCH v11 26/35] KVM: SEV: Support SEV-SNP AP Creation NAE event

From: Jacob Xu
Date: Fri Jan 05 2024 - 17:08:50 EST


On Sat, Dec 30, 2023 at 9:32 AM Michael Roth <michael.roth@xxxxxxx> wrote:
>
> From: Tom Lendacky <thomas.lendacky@xxxxxxx>
>
> Add support for the SEV-SNP AP Creation NAE event. This allows SEV-SNP
> guests to alter the register state of the APs on their own. This allows
> the guest a way of simulating INIT-SIPI.
>
> A new event, KVM_REQ_UPDATE_PROTECTED_GUEST_STATE, is created and used
> so as to avoid updating the VMSA pointer while the vCPU is running.
>
> For CREATE
> The guest supplies the GPA of the VMSA to be used for the vCPU with
> the specified APIC ID. The GPA is saved in the svm struct of the
> target vCPU, the KVM_REQ_UPDATE_PROTECTED_GUEST_STATE event is added
> to the vCPU and then the vCPU is kicked.
>
> For CREATE_ON_INIT:
> The guest supplies the GPA of the VMSA to be used for the vCPU with
> the specified APIC ID the next time an INIT is performed. The GPA is
> saved in the svm struct of the target vCPU.
>
> For DESTROY:
> The guest indicates it wishes to stop the vCPU. The GPA is cleared
> from the svm struct, the KVM_REQ_UPDATE_PROTECTED_GUEST_STATE event is
> added to vCPU and then the vCPU is kicked.
>
> The KVM_REQ_UPDATE_PROTECTED_GUEST_STATE event handler will be invoked
> as a result of the event or as a result of an INIT. The handler sets the
> vCPU to the KVM_MP_STATE_UNINITIALIZED state, so that any errors will
> leave the vCPU as not runnable. Any previous VMSA pages that were
> installed as part of an SEV-SNP AP Creation NAE event are un-pinned. If
> a new VMSA is to be installed, the VMSA guest page is pinned and set as
> the VMSA in the vCPU VMCB and the vCPU state is set to
> KVM_MP_STATE_RUNNABLE. If a new VMSA is not to be installed, the VMSA is
> cleared in the vCPU VMCB and the vCPU state is left as
> KVM_MP_STATE_UNINITIALIZED to prevent it from being run.
>
> Signed-off-by: Tom Lendacky <thomas.lendacky@xxxxxxx>
> Signed-off-by: Brijesh Singh <brijesh.singh@xxxxxxx>
> Signed-off-by: Ashish Kalra <ashish.kalra@xxxxxxx>
> [mdr: add handling for gmem]
> Signed-off-by: Michael Roth <michael.roth@xxxxxxx>
> ---
> arch/x86/include/asm/kvm_host.h | 1 +
> arch/x86/include/asm/svm.h | 5 +
> arch/x86/kvm/svm/sev.c | 219 ++++++++++++++++++++++++++++++++
> arch/x86/kvm/svm/svm.c | 3 +
> arch/x86/kvm/svm/svm.h | 8 +-
> arch/x86/kvm/x86.c | 11 ++
> 6 files changed, 246 insertions(+), 1 deletion(-)
>
> diff --git a/arch/x86/include/asm/kvm_host.h b/arch/x86/include/asm/kvm_host.h
> index 3fdcbb1da856..9e45402e51bc 100644
> --- a/arch/x86/include/asm/kvm_host.h
> +++ b/arch/x86/include/asm/kvm_host.h
> @@ -121,6 +121,7 @@
> KVM_ARCH_REQ_FLAGS(31, KVM_REQUEST_WAIT | KVM_REQUEST_NO_WAKEUP)
> #define KVM_REQ_HV_TLB_FLUSH \
> KVM_ARCH_REQ_FLAGS(32, KVM_REQUEST_WAIT | KVM_REQUEST_NO_WAKEUP)
> +#define KVM_REQ_UPDATE_PROTECTED_GUEST_STATE KVM_ARCH_REQ(34)
>
> #define CR0_RESERVED_BITS \
> (~(unsigned long)(X86_CR0_PE | X86_CR0_MP | X86_CR0_EM | X86_CR0_TS \
> diff --git a/arch/x86/include/asm/svm.h b/arch/x86/include/asm/svm.h
> index ba8ce15b27d7..4b73cf5e9de0 100644
> --- a/arch/x86/include/asm/svm.h
> +++ b/arch/x86/include/asm/svm.h
> @@ -287,6 +287,11 @@ static_assert((X2AVIC_MAX_PHYSICAL_ID & AVIC_PHYSICAL_MAX_INDEX_MASK) == X2AVIC_
>
> #define SVM_SEV_FEAT_DEBUG_SWAP BIT(5)
> #define SVM_SEV_FEAT_SNP_ACTIVE BIT(0)
> +#define SVM_SEV_FEAT_RESTRICTED_INJECTION BIT(3)
> +#define SVM_SEV_FEAT_ALTERNATE_INJECTION BIT(4)
> +#define SVM_SEV_FEAT_INT_INJ_MODES \
> + (SVM_SEV_FEAT_RESTRICTED_INJECTION | \
> + SVM_SEV_FEAT_ALTERNATE_INJECTION)
>
> struct vmcb_seg {
> u16 selector;
> diff --git a/arch/x86/kvm/svm/sev.c b/arch/x86/kvm/svm/sev.c
> index 996b5a668938..3bb89c4df5d6 100644
> --- a/arch/x86/kvm/svm/sev.c
> +++ b/arch/x86/kvm/svm/sev.c
> @@ -652,6 +652,7 @@ static int sev_launch_update_data(struct kvm *kvm, struct kvm_sev_cmd *argp)
>
> static int sev_es_sync_vmsa(struct vcpu_svm *svm)
> {
> + struct kvm_sev_info *sev = &to_kvm_svm(svm->vcpu.kvm)->sev_info;
> struct sev_es_save_area *save = svm->sev_es.vmsa;
>
> /* Check some debug related fields before encrypting the VMSA */
> @@ -700,6 +701,12 @@ static int sev_es_sync_vmsa(struct vcpu_svm *svm)
> if (sev_snp_guest(svm->vcpu.kvm))
> save->sev_features |= SVM_SEV_FEAT_SNP_ACTIVE;
>
> + /*
> + * Save the VMSA synced SEV features. For now, they are the same for
> + * all vCPUs, so just save each time.
> + */
> + sev->sev_features = save->sev_features;
> +
> pr_debug("Virtual Machine Save Area (VMSA):\n");
> print_hex_dump_debug("", DUMP_PREFIX_NONE, 16, 1, save, sizeof(*save), false);
>
> @@ -3082,6 +3089,11 @@ static int sev_es_validate_vmgexit(struct vcpu_svm *svm)
> if (!kvm_ghcb_sw_scratch_is_valid(svm))
> goto vmgexit_err;
> break;
> + case SVM_VMGEXIT_AP_CREATION:
> + if (lower_32_bits(control->exit_info_1) != SVM_VMGEXIT_AP_DESTROY)
> + if (!kvm_ghcb_rax_is_valid(svm))
> + goto vmgexit_err;
> + break;
> case SVM_VMGEXIT_NMI_COMPLETE:
> case SVM_VMGEXIT_AP_HLT_LOOP:
> case SVM_VMGEXIT_AP_JUMP_TABLE:
> @@ -3322,6 +3334,202 @@ static int snp_complete_psc(struct kvm_vcpu *vcpu)
> return 1; /* resume guest */
> }
>
> +static int __sev_snp_update_protected_guest_state(struct kvm_vcpu *vcpu)
> +{
> + struct vcpu_svm *svm = to_svm(vcpu);
> + hpa_t cur_pa;
> +
> + WARN_ON(!mutex_is_locked(&svm->sev_es.snp_vmsa_mutex));
> +
> + /* Save off the current VMSA PA for later checks */
> + cur_pa = svm->sev_es.vmsa_pa;
> +
> + /* Mark the vCPU as offline and not runnable */
> + vcpu->arch.pv.pv_unhalted = false;
> + vcpu->arch.mp_state = KVM_MP_STATE_HALTED;
> +
> + /* Clear use of the VMSA */
> + svm->sev_es.vmsa_pa = INVALID_PAGE;
> + svm->vmcb->control.vmsa_pa = INVALID_PAGE;
> +
> + /*
> + * sev->sev_es.vmsa holds the virtual address of the VMSA initially
> + * allocated by the host. If the guest specified a new a VMSA via
> + * AP_CREATION, it will have been pinned to avoid future issues
> + * with things like page migration support. Make sure to un-pin it
> + * before switching to a newer guest-specified VMSA.
> + */
> + if (cur_pa != __pa(svm->sev_es.vmsa) && VALID_PAGE(cur_pa))
> + kvm_release_pfn_dirty(__phys_to_pfn(cur_pa));
> +
> + if (VALID_PAGE(svm->sev_es.snp_vmsa_gpa)) {
> + gfn_t gfn = gpa_to_gfn(svm->sev_es.snp_vmsa_gpa);
> + struct kvm_memory_slot *slot;
> + kvm_pfn_t pfn;
> +
> + slot = gfn_to_memslot(vcpu->kvm, gfn);
> + if (!slot)
> + return -EINVAL;
> +
> + /*
> + * The new VMSA will be private memory guest memory, so
> + * retrieve the PFN from the gmem backend, and leave the ref
> + * count of the associated folio elevated to ensure it won't
> + * ever be migrated.
> + */
> + if (kvm_gmem_get_pfn(vcpu->kvm, slot, gfn, &pfn, NULL))
> + return -EINVAL;
> +
> + /* Use the new VMSA */
> + svm->sev_es.vmsa_pa = pfn_to_hpa(pfn);
> + svm->vmcb->control.vmsa_pa = svm->sev_es.vmsa_pa;
> +
> + /* Mark the vCPU as runnable */
> + vcpu->arch.pv.pv_unhalted = false;
> + vcpu->arch.mp_state = KVM_MP_STATE_RUNNABLE;
> +
> + svm->sev_es.snp_vmsa_gpa = INVALID_PAGE;
> + }
> +
> + /*
> + * When replacing the VMSA during SEV-SNP AP creation,
> + * mark the VMCB dirty so that full state is always reloaded.
> + */
> + vmcb_mark_all_dirty(svm->vmcb);
> +
> + return 0;
> +}
> +
> +/*
> + * Invoked as part of svm_vcpu_reset() processing of an init event.
> + */
> +void sev_snp_init_protected_guest_state(struct kvm_vcpu *vcpu)
> +{
> + struct vcpu_svm *svm = to_svm(vcpu);
> + int ret;
> +
> + if (!sev_snp_guest(vcpu->kvm))
> + return;
> +
> + mutex_lock(&svm->sev_es.snp_vmsa_mutex);
> +
> + if (!svm->sev_es.snp_ap_create)
> + goto unlock;
> +
> + svm->sev_es.snp_ap_create = false;
> +
> + ret = __sev_snp_update_protected_guest_state(vcpu);
> + if (ret)
> + vcpu_unimpl(vcpu, "snp: AP state update on init failed\n");
> +
> +unlock:
> + mutex_unlock(&svm->sev_es.snp_vmsa_mutex);
> +}
> +
> +static int sev_snp_ap_creation(struct vcpu_svm *svm)
> +{
> + struct kvm_sev_info *sev = &to_kvm_svm(svm->vcpu.kvm)->sev_info;
> + struct kvm_vcpu *vcpu = &svm->vcpu;
> + struct kvm_vcpu *target_vcpu;
> + struct vcpu_svm *target_svm;
> + unsigned int request;
> + unsigned int apic_id;
> + bool kick;
> + int ret;
> +
> + request = lower_32_bits(svm->vmcb->control.exit_info_1);
> + apic_id = upper_32_bits(svm->vmcb->control.exit_info_1);
> +
> + /* Validate the APIC ID */
> + target_vcpu = kvm_get_vcpu_by_id(vcpu->kvm, apic_id);
> + if (!target_vcpu) {
> + vcpu_unimpl(vcpu, "vmgexit: invalid AP APIC ID [%#x] from guest\n",
> + apic_id);
> + return -EINVAL;
> + }
> +
> + ret = 0;
> +
> + target_svm = to_svm(target_vcpu);
> +
> + /*
> + * The target vCPU is valid, so the vCPU will be kicked unless the
> + * request is for CREATE_ON_INIT. For any errors at this stage, the
> + * kick will place the vCPU in an non-runnable state.
> + */
> + kick = true;
> +
> + mutex_lock(&target_svm->sev_es.snp_vmsa_mutex);
> +
> + target_svm->sev_es.snp_vmsa_gpa = INVALID_PAGE;
> + target_svm->sev_es.snp_ap_create = true;
> +
> + /* Interrupt injection mode shouldn't change for AP creation */
> + if (request < SVM_VMGEXIT_AP_DESTROY) {
> + u64 sev_features;
> +
> + sev_features = vcpu->arch.regs[VCPU_REGS_RAX];
> + sev_features ^= sev->sev_features;
> + if (sev_features & SVM_SEV_FEAT_INT_INJ_MODES) {
> + vcpu_unimpl(vcpu, "vmgexit: invalid AP injection mode [%#lx] from guest\n",
> + vcpu->arch.regs[VCPU_REGS_RAX]);
> + ret = -EINVAL;
> + goto out;
> + }
> + }
> +
> + switch (request) {
> + case SVM_VMGEXIT_AP_CREATE_ON_INIT:
> + kick = false;
> + fallthrough;
> + case SVM_VMGEXIT_AP_CREATE:
> + if (!page_address_valid(vcpu, svm->vmcb->control.exit_info_2)) {
> + vcpu_unimpl(vcpu, "vmgexit: invalid AP VMSA address [%#llx] from guest\n",
> + svm->vmcb->control.exit_info_2);
> + ret = -EINVAL;
> + goto out;
> + }
> +
> + /*
> + * Malicious guest can RMPADJUST a large page into VMSA which
> + * will hit the SNP erratum where the CPU will incorrectly signal
> + * an RMP violation #PF if a hugepage collides with the RMP entry
> + * of VMSA page, reject the AP CREATE request if VMSA address from
> + * guest is 2M aligned.
> + */
> + if (IS_ALIGNED(svm->vmcb->control.exit_info_2, PMD_SIZE)) {
> + vcpu_unimpl(vcpu,
> + "vmgexit: AP VMSA address [%llx] from guest is unsafe as it is 2M aligned\n",
> + svm->vmcb->control.exit_info_2);
> + ret = -EINVAL;
> + goto out;
> + }
> +
> + target_svm->sev_es.snp_vmsa_gpa = svm->vmcb->control.exit_info_2;
> + break;
> + case SVM_VMGEXIT_AP_DESTROY:
> + break;
> + default:
> + vcpu_unimpl(vcpu, "vmgexit: invalid AP creation request [%#x] from guest\n",
> + request);
> + ret = -EINVAL;
> + break;
> + }
> +
> +out:
> + if (kick) {
> + if (target_vcpu->arch.mp_state == KVM_MP_STATE_UNINITIALIZED)
> + target_vcpu->arch.mp_state = KVM_MP_STATE_RUNNABLE;
> +
> + kvm_make_request(KVM_REQ_UPDATE_PROTECTED_GUEST_STATE, target_vcpu);

I think we should switch the order of these two statements for
setting mp_state and for making the request for
KVM_REQ_UPDATE_PROTECTED_GUEST_STATE.
There is a race condition I observed when booting with SVSM where:
1. BSP sets target vcpu to KVM_MP_STATE_RUNNABLE
2. AP thread within the loop of arch/x86/kvm.c:vcpu_run() checks
vm_vcpu_running()
3. AP enters the guest without having updated the VMSA state from
KVM_REQ_UPDATE_PROTECTED_GUEST_STATE

This results in the AP executing on a bad RIP and then crashing.
If we set the request first, then we avoid the race condition.

> + kvm_vcpu_kick(target_vcpu);
> + }
> +
> + mutex_unlock(&target_svm->sev_es.snp_vmsa_mutex);
> +
> + return ret;
> +}
> +
> static int sev_handle_vmgexit_msr_protocol(struct vcpu_svm *svm)
> {
> struct vmcb_control_area *control = &svm->vmcb->control;
> @@ -3565,6 +3773,15 @@ int sev_handle_vmgexit(struct kvm_vcpu *vcpu)
> vcpu->run->vmgexit.psc.shared_gpa = svm->sev_es.sw_scratch;
> vcpu->arch.complete_userspace_io = snp_complete_psc;
> break;
> + case SVM_VMGEXIT_AP_CREATION:
> + ret = sev_snp_ap_creation(svm);
> + if (ret) {
> + ghcb_set_sw_exit_info_1(svm->sev_es.ghcb, 2);
> + ghcb_set_sw_exit_info_2(svm->sev_es.ghcb, GHCB_ERR_INVALID_INPUT);
> + }
> +
> + ret = 1;
> + break;
> case SVM_VMGEXIT_UNSUPPORTED_EVENT:
> vcpu_unimpl(vcpu,
> "vmgexit: unsupported event - exit_info_1=%#llx, exit_info_2=%#llx\n",
> @@ -3731,6 +3948,8 @@ void sev_es_vcpu_reset(struct vcpu_svm *svm)
> set_ghcb_msr(svm, GHCB_MSR_SEV_INFO(GHCB_VERSION_MAX,
> GHCB_VERSION_MIN,
> sev_enc_bit));
> +
> + mutex_init(&svm->sev_es.snp_vmsa_mutex);
> }
>
> void sev_es_prepare_switch_to_guest(struct sev_es_save_area *hostsa)
> diff --git a/arch/x86/kvm/svm/svm.c b/arch/x86/kvm/svm/svm.c
> index da49e4981d75..240518f8d6c7 100644
> --- a/arch/x86/kvm/svm/svm.c
> +++ b/arch/x86/kvm/svm/svm.c
> @@ -1398,6 +1398,9 @@ static void svm_vcpu_reset(struct kvm_vcpu *vcpu, bool init_event)
> svm->spec_ctrl = 0;
> svm->virt_spec_ctrl = 0;
>
> + if (init_event)
> + sev_snp_init_protected_guest_state(vcpu);
> +
> init_vmcb(vcpu);
>
> if (!init_event)
> diff --git a/arch/x86/kvm/svm/svm.h b/arch/x86/kvm/svm/svm.h
> index 4ef41f4d4ee6..d953ae41c619 100644
> --- a/arch/x86/kvm/svm/svm.h
> +++ b/arch/x86/kvm/svm/svm.h
> @@ -96,6 +96,7 @@ struct kvm_sev_info {
> atomic_t migration_in_progress;
> u64 snp_init_flags;
> void *snp_context; /* SNP guest context page */
> + u64 sev_features; /* Features set at VMSA creation */
> };
>
> struct kvm_svm {
> @@ -214,6 +215,10 @@ struct vcpu_sev_es_state {
> bool ghcb_sa_free;
>
> u64 ghcb_registered_gpa;
> +
> + struct mutex snp_vmsa_mutex; /* Used to handle concurrent updates of VMSA. */
> + gpa_t snp_vmsa_gpa;
> + bool snp_ap_create;
> };
>
> struct vcpu_svm {
> @@ -689,7 +694,7 @@ void avic_refresh_virtual_apic_mode(struct kvm_vcpu *vcpu);
> #define GHCB_VERSION_MAX 2ULL
> #define GHCB_VERSION_MIN 1ULL
>
> -#define GHCB_HV_FT_SUPPORTED GHCB_HV_FT_SNP
> +#define GHCB_HV_FT_SUPPORTED (GHCB_HV_FT_SNP | GHCB_HV_FT_SNP_AP_CREATION)
>
> extern unsigned int max_sev_asid;
>
> @@ -719,6 +724,7 @@ void sev_es_prepare_switch_to_guest(struct sev_es_save_area *hostsa);
> void sev_es_unmap_ghcb(struct vcpu_svm *svm);
> struct page *snp_safe_alloc_page(struct kvm_vcpu *vcpu);
> void handle_rmp_page_fault(struct kvm_vcpu *vcpu, gpa_t gpa, u64 error_code);
> +void sev_snp_init_protected_guest_state(struct kvm_vcpu *vcpu);
>
> /* vmenter.S */
>
> diff --git a/arch/x86/kvm/x86.c b/arch/x86/kvm/x86.c
> index 87b78d63e81d..df9ec357d538 100644
> --- a/arch/x86/kvm/x86.c
> +++ b/arch/x86/kvm/x86.c
> @@ -10858,6 +10858,14 @@ static int vcpu_enter_guest(struct kvm_vcpu *vcpu)
>
> if (kvm_check_request(KVM_REQ_UPDATE_CPU_DIRTY_LOGGING, vcpu))
> static_call(kvm_x86_update_cpu_dirty_logging)(vcpu);
> +
> + if (kvm_check_request(KVM_REQ_UPDATE_PROTECTED_GUEST_STATE, vcpu)) {
> + kvm_vcpu_reset(vcpu, true);
> + if (vcpu->arch.mp_state != KVM_MP_STATE_RUNNABLE) {
> + r = 1;
> + goto out;
> + }
> + }
> }
>
> if (kvm_check_request(KVM_REQ_EVENT, vcpu) || req_int_win ||
> @@ -13072,6 +13080,9 @@ static inline bool kvm_vcpu_has_events(struct kvm_vcpu *vcpu)
> if (kvm_test_request(KVM_REQ_PMI, vcpu))
> return true;
>
> + if (kvm_test_request(KVM_REQ_UPDATE_PROTECTED_GUEST_STATE, vcpu))
> + return true;
> +
> if (kvm_arch_interrupt_allowed(vcpu) &&
> (kvm_cpu_has_interrupt(vcpu) ||
> kvm_guest_apic_has_interrupt(vcpu)))
> --
> 2.25.1
>
>