Re: [PATCH v7 046/102] KVM: x86/tdp_mmu: Support TDX private mapping for TDP MMU

From: Isaku Yamahata
Date: Tue Jul 26 2022 - 19:40:17 EST


On Fri, Jul 08, 2022 at 03:44:05PM +1200,
Kai Huang <kai.huang@xxxxxxxxx> wrote:

> > +static int kvm_faultin_pfn_private_mapped(struct kvm_vcpu *vcpu,
> > + struct kvm_page_fault *fault)
> > +{
> > + hva_t hva = gfn_to_hva_memslot(fault->slot, fault->gfn);
> > + struct page *page[1];
> > +
> > + fault->map_writable = false;
> > + fault->pfn = KVM_PFN_ERR_FAULT;
> > + if (hva == KVM_HVA_ERR_RO_BAD || hva == KVM_HVA_ERR_BAD)
> > + return RET_PF_CONTINUE;
> > +
> > + /* TDX allows only RWX. Read-only isn't supported. */
> > + WARN_ON_ONCE(!fault->write);
> > + if (pin_user_pages_fast(hva, 1, FOLL_WRITE, page) != 1)
> > + return RET_PF_INVALID;
> > +
> > + fault->map_writable = true;
> > + fault->pfn = page_to_pfn(page[0]);
> > + return RET_PF_CONTINUE;
> > +}
> > +
> > static int kvm_faultin_pfn(struct kvm_vcpu *vcpu, struct kvm_page_fault *fault)
> > {
> > struct kvm_memory_slot *slot = fault->slot;
> > @@ -4058,6 +4094,9 @@ static int kvm_faultin_pfn(struct kvm_vcpu *vcpu, struct kvm_page_fault *fault)
> > return RET_PF_EMULATE;
> > }
> >
> > + if (fault->is_private)
> > + return kvm_faultin_pfn_private_mapped(vcpu, fault);
> > +
> > async = false;
> > fault->pfn = __gfn_to_pfn_memslot(slot, fault->gfn, false, &async,
> > fault->write, &fault->map_writable,
> > @@ -4110,6 +4149,17 @@ static bool is_page_fault_stale(struct kvm_vcpu *vcpu,
> > mmu_notifier_retry_hva(vcpu->kvm, mmu_seq, fault->hva);
> > }
> >
> > +void kvm_mmu_release_fault(struct kvm *kvm, struct kvm_page_fault *fault, int r)
> > +{
> > + if (is_error_noslot_pfn(fault->pfn) || kvm_is_reserved_pfn(fault->pfn))
> > + return;
> > +
> > + if (fault->is_private)
> > + put_page(pfn_to_page(fault->pfn));
> > + else
> > + kvm_release_pfn_clean(fault->pfn);
> > +}
>
> What's the purpose of 'int r'? Is it even used?

Removed 'r' because it is unused.
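The helper then becomes roughly the following (a sketch of the revised version; the matching declaration in mmu_internal.h drops the 'r' parameter as well):

  void kvm_mmu_release_fault(struct kvm *kvm, struct kvm_page_fault *fault)
  {
          if (is_error_noslot_pfn(fault->pfn) || kvm_is_reserved_pfn(fault->pfn))
                  return;

          if (fault->is_private)
                  put_page(pfn_to_page(fault->pfn));
          else
                  kvm_release_pfn_clean(fault->pfn);
  }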


> > static int direct_page_fault(struct kvm_vcpu *vcpu, struct kvm_page_fault *fault)
> > {
> > bool is_tdp_mmu_fault = is_tdp_mmu(vcpu->arch.mmu);
> > @@ -4117,7 +4167,7 @@ static int direct_page_fault(struct kvm_vcpu *vcpu, struct kvm_page_fault *fault
> > unsigned long mmu_seq;
> > int r;
> >
> > - fault->gfn = fault->addr >> PAGE_SHIFT;
> > + fault->gfn = gpa_to_gfn(fault->addr) & ~kvm_gfn_shared_mask(vcpu->kvm);
> > fault->slot = kvm_vcpu_gfn_to_memslot(vcpu, fault->gfn);
>
> Where is fault->is_private set? Shouldn't it be set here?

kvm_mmu_do_page_fault() sets it, and no, it shouldn't be set here because is_private
is constant for the whole fault: it is an input to the fault handler, whereas gfn and
slot are working variables.
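That is, the struct initializer in kvm_mmu_do_page_fault() already fills it in together
with the other constant inputs, as in the mmu_internal.h hunk quoted below (elided here
to the relevant lines):

  struct kvm_page_fault fault = {
          ...
          .is_tdp = likely(vcpu->arch.mmu->page_fault == kvm_tdp_page_fault),
          .nx_huge_page_workaround_enabled = is_nx_huge_page_enabled(),
          .is_private = kvm_is_private_gpa(vcpu->kvm, cr2_or_gpa),
          ...
  };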


> > }
> >
> > if (flush)
> > @@ -6023,6 +6079,11 @@ void kvm_mmu_slot_remove_write_access(struct kvm *kvm,
> > write_unlock(&kvm->mmu_lock);
> > }
> >
> > + /*
> > + * For now this can only happen for non-TD VM, because TD private
> > + * mapping doesn't support write protection. kvm_tdp_mmu_wrprot_slot()
> > + * will give a WARN() if it hits for TD.
> > + */
>
> Unless I am mistaken, 'kvm_tdp_mmu_wrprot_slot() will give a WARN() if it hits
> for TD" is done in your later patch "KVM: x86/tdp_mmu: Ignore unsupported mmu
> operation on private GFNs". Why putting comment here?
>
> Please move this comment to that patch, and I think you can put that patch
> before this patch.
>
> And this problem happens repeatedly in this series. Could you check the entire
> series?

Split that out into a separate patch.


> > diff --git a/arch/x86/kvm/mmu/mmu_internal.h b/arch/x86/kvm/mmu/mmu_internal.h
> > index 9f3a6bea60a3..d3b30d62aca0 100644
> > --- a/arch/x86/kvm/mmu/mmu_internal.h
> > +++ b/arch/x86/kvm/mmu/mmu_internal.h
> > @@ -6,6 +6,8 @@
> > #include <linux/kvm_host.h>
> > #include <asm/kvm_host.h>
> >
> > +#include "mmu.h"
> > +
> > #undef MMU_DEBUG
> >
> > #ifdef MMU_DEBUG
> > @@ -164,11 +166,30 @@ static inline void kvm_mmu_alloc_private_sp(
> > WARN_ON_ONCE(!sp->private_sp);
> > }
> >
> > +static inline int kvm_alloc_private_sp_for_split(
> > + struct kvm_mmu_page *sp, gfp_t gfp)
> > +{
> > + gfp &= ~__GFP_ZERO;
> > + sp->private_sp = (void*)__get_free_page(gfp);
> > + if (!sp->private_sp)
> > + return -ENOMEM;
> > + return 0;
> > +}
>
> What does "for_split" mean? Why do we need it?

It is for splitting a large page into smaller ones, following tdp_mmu_alloc_sp_for_split().
We can defer introducing this function until large page support is added.
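For reference, a rough sketch of how the split allocation path could use it once large
page support lands (illustrative only; the 'need_private' condition, the goto labels and
the exact caller are placeholders, not the final code):

  static struct kvm_mmu_page *__tdp_mmu_alloc_sp_for_split(gfp_t gfp, bool need_private)
  {
          struct kvm_mmu_page *sp;

          gfp |= __GFP_ZERO;

          sp = kmem_cache_alloc(mmu_page_header_cache, gfp);
          if (!sp)
                  return NULL;

          sp->spt = (void *)__get_free_page(gfp);
          if (!sp->spt)
                  goto err_spt;

          /* A private root also needs a page backing the protected (Secure EPT) table. */
          if (need_private && kvm_alloc_private_sp_for_split(sp, gfp))
                  goto err_private;

          return sp;

  err_private:
          free_page((unsigned long)sp->spt);
  err_spt:
          kmem_cache_free(mmu_page_header_cache, sp);
          return NULL;
  }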


> > +
> > static inline void kvm_mmu_free_private_sp(struct kvm_mmu_page *sp)
> > {
> > if (sp->private_sp != KVM_MMU_PRIVATE_SP_ROOT)
> > free_page((unsigned long)sp->private_sp);
> > }
> > +
> > +static inline gfn_t kvm_gfn_for_root(struct kvm *kvm, struct kvm_mmu_page *root,
> > + gfn_t gfn)
> > +{
> > + if (is_private_sp(root))
> > + return kvm_gfn_private(kvm, gfn);
> > + else
> > + return kvm_gfn_shared(kvm, gfn);
> > +}
> > #else
> > static inline bool is_private_sp(struct kvm_mmu_page *sp)
> > {
> > @@ -194,11 +215,25 @@ static inline void kvm_mmu_alloc_private_sp(
> > {
> > }
> >
> > +static inline int kvm_alloc_private_sp_for_split(
> > + struct kvm_mmu_page *sp, gfp_t gfp)
> > +{
> > + return -ENOMEM;
> > +}
> > +
> > static inline void kvm_mmu_free_private_sp(struct kvm_mmu_page *sp)
> > {
> > }
> > +
> > +static inline gfn_t kvm_gfn_for_root(struct kvm *kvm, struct kvm_mmu_page *root,
> > + gfn_t gfn)
> > +{
> > + return gfn;
> > +}
> > #endif
> >
> > +void kvm_mmu_release_fault(struct kvm *kvm, struct kvm_page_fault *fault, int r);
> > +
> > static inline bool kvm_mmu_page_ad_need_write_protect(struct kvm_mmu_page *sp)
> > {
> > /*
> > @@ -246,6 +281,7 @@ struct kvm_page_fault {
> > /* Derived from mmu and global state. */
> > const bool is_tdp;
> > const bool nx_huge_page_workaround_enabled;
> > + const bool is_private;
> >
> > /*
> > * Whether a >4KB mapping can be created or is forbidden due to NX
> > @@ -327,6 +363,7 @@ static inline int kvm_mmu_do_page_fault(struct kvm_vcpu *vcpu, gpa_t cr2_or_gpa,
> > .prefetch = prefetch,
> > .is_tdp = likely(vcpu->arch.mmu->page_fault == kvm_tdp_page_fault),
> > .nx_huge_page_workaround_enabled = is_nx_huge_page_enabled(),
> > + .is_private = kvm_is_private_gpa(vcpu->kvm, cr2_or_gpa),
>
> I guess putting this chunk and setting up fault->gfn together would be clearer?

is_private is an input to the KVM page fault handler; gfn is a working variable used
while resolving the fault.

> > static void handle_changed_spte(struct kvm *kvm, int as_id, gfn_t gfn,
> > - u64 old_spte, u64 new_spte, int level,
> > - bool shared)
> > + bool private_spte, u64 old_spte, u64 new_spte,
> > + int level, bool shared)
> > {
> > - __handle_changed_spte(kvm, as_id, gfn, old_spte, new_spte, level,
> > - shared);
> > + __handle_changed_spte(kvm, as_id, gfn, private_spte,
> > + old_spte, new_spte, level, shared);
> > handle_changed_spte_acc_track(old_spte, new_spte, level);
> > handle_changed_spte_dirty_log(kvm, as_id, gfn, old_spte,
> > new_spte, level);
> > @@ -640,6 +714,8 @@ static inline int tdp_mmu_set_spte_atomic(struct kvm *kvm,
> > struct tdp_iter *iter,
> > u64 new_spte)
> > {
> > + bool freeze_spte = iter->is_private && !is_removed_spte(new_spte);
> > + u64 tmp_spte = freeze_spte ? REMOVED_SPTE : new_spte;
>
> Perhaps I am missing something. Could you add comments to explain the logic?

Added a comment:
+ /*
+  * For a conventional page table, the update flow is
+  * - update the SPTE with an atomic operation
+  * - handle the changed SPTE. __handle_changed_spte()
+  * NOTE: __handle_changed_spte() (and the functions it calls) must be safe
+  * against concurrent updates. Zapping an SPTE is the exception; see
+  * tdp_mmu_zap_spte_atomic().
+  *
+  * For a private page table, callbacks are needed to propagate the SPTE
+  * change into the protected page table. To atomically update both the SPTE
+  * and the protected page table via the callbacks, freeze the SPTE:
+  * - Freeze the SPTE. Set the entry to REMOVED_SPTE.
+  * - Trigger the callbacks for the protected page table. __handle_changed_spte()
+  * - Unfreeze the SPTE. Set the entry to new_spte.
+  */
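In code form, the freeze flow described in the comment would look roughly like this
(an illustrative sketch only; lockdep assertions are omitted and the final write helper
is assumed to be a plain WRITE_ONCE()):

  static inline int tdp_mmu_set_spte_atomic(struct kvm *kvm,
                                            struct tdp_iter *iter,
                                            u64 new_spte)
  {
          u64 *sptep = rcu_dereference(iter->sptep);
          bool freeze_spte = iter->is_private && !is_removed_spte(new_spte);
          u64 tmp_spte = freeze_spte ? REMOVED_SPTE : new_spte;

          /* Atomically install either the final SPTE or the frozen marker. */
          if (!try_cmpxchg64(sptep, &iter->old_spte, tmp_spte))
                  return -EBUSY;

          /* Propagate the change, including the protected page table callbacks. */
          __handle_changed_spte(kvm, iter->as_id, iter->gfn, iter->is_private,
                                iter->old_spte, new_spte, iter->level, true);

          /* Unfreeze: publish the real value once the callbacks are done. */
          if (freeze_spte)
                  WRITE_ONCE(*sptep, new_spte);

          return 0;
  }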


> > @@ -1067,6 +1163,12 @@ void kvm_tdp_mmu_invalidate_all_roots(struct kvm *kvm)
> >
> > lockdep_assert_held_write(&kvm->mmu_lock);
> > list_for_each_entry(root, &kvm->arch.tdp_mmu_roots, link) {
> > + /*
> > + * Skip private root since private page table
> > + * is only torn down when VM is destroyed.
> > + */
> > + if (is_private_sp(root))
> > + continue;
> > if (!root->role.invalid &&
> > !WARN_ON_ONCE(!kvm_tdp_mmu_get_root(root))) {
> > root->role.invalid = true;
> > @@ -1087,14 +1189,22 @@ static int tdp_mmu_map_handle_target_level(struct kvm_vcpu *vcpu,
> > u64 new_spte;
> > int ret = RET_PF_FIXED;
> > bool wrprot = false;
> > + unsigned long pte_access = ACC_ALL;
> > + gfn_t gfn_unalias = iter->gfn & ~kvm_gfn_shared_mask(vcpu->kvm);
>
> Here looks the iter->gfn still contains the shared bits. It is not consistent
> with above.
>
> Can you put some words into the changelog explaining exactly what GFN will you
> put to iterator?
>
> Or can you even split out this part as a separate patch?

I think you mean the zap_leafs function above. It zaps a GPA range regardless of alias
(i.e. modulo the shared bit).
This function, by contrast, resolves a KVM page fault, which means the GPA includes the
shared bit.
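To make the distinction concrete, the two GFN forms look roughly like this (assuming the
kvm_gfn_private()/kvm_gfn_shared() helpers simply clear/set the shared bit, as used
elsewhere in the series):

  /* Fault path: iter->gfn still carries the shared bit from the faulting GPA. */
  gfn_unalias = iter->gfn & ~kvm_gfn_shared_mask(vcpu->kvm);

  /* Walk path: kvm_gfn_for_root() applies the alias that matches the root. */
  gfn_for_root = is_private_sp(root) ? kvm_gfn_private(kvm, gfn)
                                     : kvm_gfn_shared(kvm, gfn);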

here is the updated patch.