Re: [PATCH v19 062/130] KVM: x86/tdp_mmu: Support TDX private mapping for TDP MMU

From: Chao Gao
Date: Wed Mar 27 2024 - 10:36:22 EST


>--- a/arch/x86/kvm/mmu/mmu.c
>+++ b/arch/x86/kvm/mmu/mmu.c
>@@ -3717,7 +3717,12 @@ static int mmu_alloc_direct_roots(struct kvm_vcpu *vcpu)
> goto out_unlock;
>
> if (tdp_mmu_enabled) {
>- root = kvm_tdp_mmu_get_vcpu_root_hpa(vcpu);
>+ if (kvm_gfn_shared_mask(vcpu->kvm) &&
>+ !VALID_PAGE(mmu->private_root_hpa)) {
>+ root = kvm_tdp_mmu_get_vcpu_root_hpa(vcpu, true);
>+ mmu->private_root_hpa = root;

just
mmu->private_root_hpa =
kvm_tdp_mmu_get_vcpu_root_hpa(vcpu, true);
>+ }
>+ root = kvm_tdp_mmu_get_vcpu_root_hpa(vcpu, false);
> mmu->root.hpa = root;

ditto

> } else if (shadow_root_level >= PT64_ROOT_4LEVEL) {
> root = mmu_alloc_root(vcpu, 0, 0, shadow_root_level);
>@@ -4627,7 +4632,7 @@ int kvm_tdp_page_fault(struct kvm_vcpu *vcpu, struct kvm_page_fault *fault)
> if (kvm_mmu_honors_guest_mtrrs(vcpu->kvm)) {
> for ( ; fault->max_level > PG_LEVEL_4K; --fault->max_level) {
> int page_num = KVM_PAGES_PER_HPAGE(fault->max_level);
>- gfn_t base = gfn_round_for_level(fault->gfn,
>+ gfn_t base = gfn_round_for_level(gpa_to_gfn(fault->addr),
> fault->max_level);

..

>
> if (kvm_mtrr_check_gfn_range_consistency(vcpu, base, page_num))
>@@ -4662,6 +4667,7 @@ int kvm_mmu_map_tdp_page(struct kvm_vcpu *vcpu, gpa_t gpa, u64 error_code,
> };
>
> WARN_ON_ONCE(!vcpu->arch.mmu->root_role.direct);
>+ fault.gfn = gpa_to_gfn(fault.addr) & ~kvm_gfn_shared_mask(vcpu->kvm);

Could you clarify when shared bits need to be masked out or kept? shared bits
are masked out here but kept in the hunk right above and ..

>+++ b/arch/x86/kvm/mmu/tdp_iter.h
>@@ -91,7 +91,7 @@ struct tdp_iter {
> tdp_ptep_t pt_path[PT64_ROOT_MAX_LEVEL];
> /* A pointer to the current SPTE */
> tdp_ptep_t sptep;
>- /* The lowest GFN mapped by the current SPTE */
>+ /* The lowest GFN (shared bits included) mapped by the current SPTE */
> gfn_t gfn;

. in @gfn of tdp_iter.

> /* The level of the root page given to the iterator */
> int root_level;


>
>-hpa_t kvm_tdp_mmu_get_vcpu_root_hpa(struct kvm_vcpu *vcpu)
>+static struct kvm_mmu_page *kvm_tdp_mmu_get_vcpu_root(struct kvm_vcpu *vcpu,

Maybe fold it into its sole caller.

>+ bool private)
> {
> union kvm_mmu_page_role role = vcpu->arch.mmu->root_role;
> struct kvm *kvm = vcpu->kvm;
>@@ -221,6 +225,8 @@ hpa_t kvm_tdp_mmu_get_vcpu_root_hpa(struct kvm_vcpu *vcpu)
> * Check for an existing root before allocating a new one. Note, the
> * role check prevents consuming an invalid root.
> */
>+ if (private)
>+ kvm_mmu_page_role_set_private(&role);
> for_each_tdp_mmu_root(kvm, root, kvm_mmu_role_as_id(role)) {
> if (root->role.word == role.word &&
> kvm_tdp_mmu_get_root(root))
>@@ -244,12 +250,17 @@ hpa_t kvm_tdp_mmu_get_vcpu_root_hpa(struct kvm_vcpu *vcpu)
> spin_unlock(&kvm->arch.tdp_mmu_pages_lock);
>
> out:
>- return __pa(root->spt);
>+ return root;
>+}
>+
>+hpa_t kvm_tdp_mmu_get_vcpu_root_hpa(struct kvm_vcpu *vcpu, bool private)
>+{
>+ return __pa(kvm_tdp_mmu_get_vcpu_root(vcpu, private)->spt);
> }
>
> static void handle_changed_spte(struct kvm *kvm, int as_id, gfn_t gfn,
>- u64 old_spte, u64 new_spte, int level,
>- bool shared);
>+ u64 old_spte, u64 new_spte,
>+ union kvm_mmu_page_role role, bool shared);
>
> static void tdp_account_mmu_page(struct kvm *kvm, struct kvm_mmu_page *sp)
> {
>@@ -376,12 +387,78 @@ static void handle_removed_pt(struct kvm *kvm, tdp_ptep_t pt, bool shared)
> REMOVED_SPTE, level);
> }
> handle_changed_spte(kvm, kvm_mmu_page_as_id(sp), gfn,
>- old_spte, REMOVED_SPTE, level, shared);
>+ old_spte, REMOVED_SPTE, sp->role,
>+ shared);
>+ }
>+
>+ if (is_private_sp(sp) &&
>+ WARN_ON(static_call(kvm_x86_free_private_spt)(kvm, sp->gfn, sp->role.level,

WARN_ON_ONCE()?

>+ kvm_mmu_private_spt(sp)))) {
>+ /*
>+ * Failed to unlink Secure EPT page and there is nothing to do
>+ * further. Intentionally leak the page to prevent the kernel
>+ * from accessing the encrypted page.
>+ */
>+ kvm_mmu_init_private_spt(sp, NULL);
> }
>
> call_rcu(&sp->rcu_head, tdp_mmu_free_sp_rcu_callback);
> }
>

> rcu_read_lock();
>
> for_each_tdp_pte_min_level(iter, root, PG_LEVEL_4K, start, end) {
>@@ -960,10 +1158,26 @@ static int tdp_mmu_map_handle_target_level(struct kvm_vcpu *vcpu,
>
> if (unlikely(!fault->slot))
> new_spte = make_mmio_spte(vcpu, iter->gfn, ACC_ALL);
>- else
>- wrprot = make_spte(vcpu, sp, fault->slot, ACC_ALL, iter->gfn,
>- fault->pfn, iter->old_spte, fault->prefetch, true,
>- fault->map_writable, &new_spte);
>+ else {
>+ unsigned long pte_access = ACC_ALL;
>+ gfn_t gfn = iter->gfn;
>+
>+ if (kvm_gfn_shared_mask(vcpu->kvm)) {
>+ if (fault->is_private)
>+ gfn |= kvm_gfn_shared_mask(vcpu->kvm);

this is an open-coded kvm_gfn_to_shared().

I don't get why a spte is installed for a shared gfn when fault->is_private
is true. could you elaborate?

>+ else
>+ /*
>+ * TDX shared GPAs are no executable, enforce
>+ * this for the SDV.
>+ */

what do you mean by the SDV?

>+ pte_access &= ~ACC_EXEC_MASK;
>+ }
>+
>+ wrprot = make_spte(vcpu, sp, fault->slot, pte_access, gfn,
>+ fault->pfn, iter->old_spte,
>+ fault->prefetch, true, fault->map_writable,
>+ &new_spte);
>+ }
>
> if (new_spte == iter->old_spte)
> ret = RET_PF_SPURIOUS;
>@@ -1041,6 +1255,8 @@ int kvm_tdp_mmu_map(struct kvm_vcpu *vcpu, struct kvm_page_fault *fault)
> struct kvm *kvm = vcpu->kvm;
> struct tdp_iter iter;
> struct kvm_mmu_page *sp;
>+ gfn_t raw_gfn;
>+ bool is_private = fault->is_private && kvm_gfn_shared_mask(kvm);
> int ret = RET_PF_RETRY;
>
> kvm_mmu_hugepage_adjust(vcpu, fault);
>@@ -1049,7 +1265,17 @@ int kvm_tdp_mmu_map(struct kvm_vcpu *vcpu, struct kvm_page_fault *fault)
>
> rcu_read_lock();
>
>- tdp_mmu_for_each_pte(iter, mmu, fault->gfn, fault->gfn + 1) {
>+ raw_gfn = gpa_to_gfn(fault->addr);
>+
>+ if (is_error_noslot_pfn(fault->pfn) ||
>+ !kvm_pfn_to_refcounted_page(fault->pfn)) {
>+ if (is_private) {
>+ rcu_read_unlock();
>+ return -EFAULT;

This needs a comment. why this check is necessary? does this imply some
kernel bugs?