Re: [PATCH v7 046/102] KVM: x86/tdp_mmu: Support TDX private mapping for TDP MMU

From: Isaku Yamahata
Date: Tue Jul 26 2022 - 19:40:17 EST


On Fri, Jul 08, 2022 at 03:44:05PM +1200,
Kai Huang <kai.huang@xxxxxxxxx> wrote:

> > +static int kvm_faultin_pfn_private_mapped(struct kvm_vcpu *vcpu,
> > + struct kvm_page_fault *fault)
> > +{
> > + hva_t hva = gfn_to_hva_memslot(fault->slot, fault->gfn);
> > + struct page *page[1];
> > +
> > + fault->map_writable = false;
> > + fault->pfn = KVM_PFN_ERR_FAULT;
> > + if (hva == KVM_HVA_ERR_RO_BAD || hva == KVM_HVA_ERR_BAD)
> > + return RET_PF_CONTINUE;
> > +
> > + /* TDX allows only RWX. Read-only isn't supported. */
> > + WARN_ON_ONCE(!fault->write);
> > + if (pin_user_pages_fast(hva, 1, FOLL_WRITE, page) != 1)
> > + return RET_PF_INVALID;
> > +
> > + fault->map_writable = true;
> > + fault->pfn = page_to_pfn(page[0]);
> > + return RET_PF_CONTINUE;
> > +}
> > +
> > static int kvm_faultin_pfn(struct kvm_vcpu *vcpu, struct kvm_page_fault *fault)
> > {
> > struct kvm_memory_slot *slot = fault->slot;
> > @@ -4058,6 +4094,9 @@ static int kvm_faultin_pfn(struct kvm_vcpu *vcpu, struct kvm_page_fault *fault)
> > return RET_PF_EMULATE;
> > }
> >
> > + if (fault->is_private)
> > + return kvm_faultin_pfn_private_mapped(vcpu, fault);
> > +
> > async = false;
> > fault->pfn = __gfn_to_pfn_memslot(slot, fault->gfn, false, &async,
> > fault->write, &fault->map_writable,
> > @@ -4110,6 +4149,17 @@ static bool is_page_fault_stale(struct kvm_vcpu *vcpu,
> > mmu_notifier_retry_hva(vcpu->kvm, mmu_seq, fault->hva);
> > }
> >
> > +void kvm_mmu_release_fault(struct kvm *kvm, struct kvm_page_fault *fault, int r)
> > +{
> > + if (is_error_noslot_pfn(fault->pfn) || kvm_is_reserved_pfn(fault->pfn))
> > + return;
> > +
> > + if (fault->is_private)
> > + put_page(pfn_to_page(fault->pfn));
> > + else
> > + kvm_release_pfn_clean(fault->pfn);
> > +}
>
> What's the purpose of 'int r'? Is it even used?

Removed 'r' because it is unused.
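The helper then becomes roughly the following (a sketch of the revised version; the matching declaration in mmu_internal.h drops the 'r' parameter as well):

  void kvm_mmu_release_fault(struct kvm *kvm, struct kvm_page_fault *fault)
  {
          if (is_error_noslot_pfn(fault->pfn) || kvm_is_reserved_pfn(fault->pfn))
                  return;

          if (fault->is_private)
                  put_page(pfn_to_page(fault->pfn));
          else
                  kvm_release_pfn_clean(fault->pfn);
  }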


> > static int direct_page_fault(struct kvm_vcpu *vcpu, struct kvm_page_fault *fault)
> > {
> > bool is_tdp_mmu_fault = is_tdp_mmu(vcpu->arch.mmu);
> > @@ -4117,7 +4167,7 @@ static int direct_page_fault(struct kvm_vcpu *vcpu, struct kvm_page_fault *fault
> > unsigned long mmu_seq;
> > int r;
> >
> > - fault->gfn = fault->addr >> PAGE_SHIFT;
> > + fault->gfn = gpa_to_gfn(fault->addr) & ~kvm_gfn_shared_mask(vcpu->kvm);
> > fault->slot = kvm_vcpu_gfn_to_memslot(vcpu, fault->gfn);
>
> Where is fault->is_private set? Shouldn't it be set here?

kvm_mmu_do_page_fault() sets it, and no, it shouldn't be set here because is_private
is constant for the whole fault: it is an input to the fault handler, whereas gfn and
slot are working variables.
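That is, the struct initializer in kvm_mmu_do_page_fault() already fills it in together
with the other constant inputs, as in the mmu_internal.h hunk quoted below (elided here
to the relevant lines):

  struct kvm_page_fault fault = {
          ...
          .is_tdp = likely(vcpu->arch.mmu->page_fault == kvm_tdp_page_fault),
          .nx_huge_page_workaround_enabled = is_nx_huge_page_enabled(),
          .is_private = kvm_is_private_gpa(vcpu->kvm, cr2_or_gpa),
          ...
  };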


> > }
> >
> > if (flush)
> > @@ -6023,6 +6079,11 @@ void kvm_mmu_slot_remove_write_access(struct kvm *kvm,
> > write_unlock(&kvm->mmu_lock);
> > }
> >
> > + /*
> > + * For now this can only happen for non-TD VM, because TD private
> > + * mapping doesn't support write protection. kvm_tdp_mmu_wrprot_slot()
> > + * will give a WARN() if it hits for TD.
> > + */
>
> Unless I am mistaken, 'kvm_tdp_mmu_wrprot_slot() will give a WARN() if it hits
> for TD" is done in your later patch "KVM: x86/tdp_mmu: Ignore unsupported mmu
> operation on private GFNs". Why putting comment here?
>
> Please move this comment to that patch, and I think you can put that patch
> before this patch.
>
> And this problem happens repeatedly in this series. Could you check the entire
> series?

Split that out into a separate patch.


> > diff --git a/arch/x86/kvm/mmu/mmu_internal.h b/arch/x86/kvm/mmu/mmu_internal.h
> > index 9f3a6bea60a3..d3b30d62aca0 100644
> > --- a/arch/x86/kvm/mmu/mmu_internal.h
> > +++ b/arch/x86/kvm/mmu/mmu_internal.h
> > @@ -6,6 +6,8 @@
> > #include <linux/kvm_host.h>
> > #include <asm/kvm_host.h>
> >
> > +#include "mmu.h"
> > +
> > #undef MMU_DEBUG
> >
> > #ifdef MMU_DEBUG
> > @@ -164,11 +166,30 @@ static inline void kvm_mmu_alloc_private_sp(
> > WARN_ON_ONCE(!sp->private_sp);
> > }
> >
> > +static inline int kvm_alloc_private_sp_for_split(
> > + struct kvm_mmu_page *sp, gfp_t gfp)
> > +{
> > + gfp &= ~__GFP_ZERO;
> > + sp->private_sp = (void*)__get_free_page(gfp);
> > + if (!sp->private_sp)
> > + return -ENOMEM;
> > + return 0;
> > +}
>
> What does "for_split" mean? Why do we need it?

It is for splitting a large page into smaller ones, following tdp_mmu_alloc_sp_for_split().
We can defer introducing this function until large page support is added.
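For reference, a rough sketch of how the split allocation path could use it once large
page support lands (illustrative only; the 'need_private' condition, the goto labels and
the exact caller are placeholders, not the final code):

  static struct kvm_mmu_page *__tdp_mmu_alloc_sp_for_split(gfp_t gfp, bool need_private)
  {
          struct kvm_mmu_page *sp;

          gfp |= __GFP_ZERO;

          sp = kmem_cache_alloc(mmu_page_header_cache, gfp);
          if (!sp)
                  return NULL;

          sp->spt = (void *)__get_free_page(gfp);
          if (!sp->spt)
                  goto err_spt;

          /* A private root also needs a page backing the protected (Secure EPT) table. */
          if (need_private && kvm_alloc_private_sp_for_split(sp, gfp))
                  goto err_private;

          return sp;

  err_private:
          free_page((unsigned long)sp->spt);
  err_spt:
          kmem_cache_free(mmu_page_header_cache, sp);
          return NULL;
  }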


> > +
> > static inline void kvm_mmu_free_private_sp(struct kvm_mmu_page *sp)
> > {
> > if (sp->private_sp != KVM_MMU_PRIVATE_SP_ROOT)
> > free_page((unsigned long)sp->private_sp);
> > }
> > +
> > +static inline gfn_t kvm_gfn_for_root(struct kvm *kvm, struct kvm_mmu_page *root,
> > + gfn_t gfn)
> > +{
> > + if (is_private_sp(root))
> > + return kvm_gfn_private(kvm, gfn);
> > + else
> > + return kvm_gfn_shared(kvm, gfn);
> > +}
> > #else
> > static inline bool is_private_sp(struct kvm_mmu_page *sp)
> > {
> > @@ -194,11 +215,25 @@ static inline void kvm_mmu_alloc_private_sp(
> > {
> > }
> >
> > +static inline int kvm_alloc_private_sp_for_split(
> > + struct kvm_mmu_page *sp, gfp_t gfp)
> > +{
> > + return -ENOMEM;
> > +}
> > +
> > static inline void kvm_mmu_free_private_sp(struct kvm_mmu_page *sp)
> > {
> > }
> > +
> > +static inline gfn_t kvm_gfn_for_root(struct kvm *kvm, struct kvm_mmu_page *root,
> > + gfn_t gfn)
> > +{
> > + return gfn;
> > +}
> > #endif
> >
> > +void kvm_mmu_release_fault(struct kvm *kvm, struct kvm_page_fault *fault, int r);
> > +
> > static inline bool kvm_mmu_page_ad_need_write_protect(struct kvm_mmu_page *sp)
> > {
> > /*
> > @@ -246,6 +281,7 @@ struct kvm_page_fault {
> > /* Derived from mmu and global state. */
> > const bool is_tdp;
> > const bool nx_huge_page_workaround_enabled;
> > + const bool is_private;
> >
> > /*
> > * Whether a >4KB mapping can be created or is forbidden due to NX
> > @@ -327,6 +363,7 @@ static inline int kvm_mmu_do_page_fault(struct kvm_vcpu *vcpu, gpa_t cr2_or_gpa,
> > .prefetch = prefetch,
> > .is_tdp = likely(vcpu->arch.mmu->page_fault == kvm_tdp_page_fault),
> > .nx_huge_page_workaround_enabled = is_nx_huge_page_enabled(),
> > + .is_private = kvm_is_private_gpa(vcpu->kvm, cr2_or_gpa),
>
> I guess putting this chunk and setting up fault->gfn together would be clearer?

is_private is an input to the KVM page fault handler; gfn is a working variable used
while resolving the fault.

> > static void handle_changed_spte(struct kvm *kvm, int as_id, gfn_t gfn,
> > - u64 old_spte, u64 new_spte, int level,
> > - bool shared)
> > + bool private_spte, u64 old_spte, u64 new_spte,
> > + int level, bool shared)
> > {
> > - __handle_changed_spte(kvm, as_id, gfn, old_spte, new_spte, level,
> > - shared);
> > + __handle_changed_spte(kvm, as_id, gfn, private_spte,
> > + old_spte, new_spte, level, shared);
> > handle_changed_spte_acc_track(old_spte, new_spte, level);
> > handle_changed_spte_dirty_log(kvm, as_id, gfn, old_spte,
> > new_spte, level);
> > @@ -640,6 +714,8 @@ static inline int tdp_mmu_set_spte_atomic(struct kvm *kvm,
> > struct tdp_iter *iter,
> > u64 new_spte)
> > {
> > + bool freeze_spte = iter->is_private && !is_removed_spte(new_spte);
> > + u64 tmp_spte = freeze_spte ? REMOVED_SPTE : new_spte;
>
> Perhaps I am missing something. Could you add comments to explain the logic?

Added a comment:
+ /*
+  * For a conventional page table, the update flow is
+  * - update the SPTE with an atomic operation
+  * - handle the changed SPTE. __handle_changed_spte()
+  * NOTE: __handle_changed_spte() (and the functions it calls) must be safe
+  * against concurrent updates. Zapping an SPTE is the exception; see
+  * tdp_mmu_zap_spte_atomic().
+  *
+  * For a private page table, callbacks are needed to propagate the SPTE
+  * change into the protected page table. To atomically update both the SPTE
+  * and the protected page table via the callbacks, freeze the SPTE:
+  * - Freeze the SPTE. Set the entry to REMOVED_SPTE.
+  * - Trigger the callbacks for the protected page table. __handle_changed_spte()
+  * - Unfreeze the SPTE. Set the entry to new_spte.
+  */
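In code form, the freeze flow described in the comment would look roughly like this
(an illustrative sketch only; lockdep assertions are omitted and the final write helper
is assumed to be a plain WRITE_ONCE()):

  static inline int tdp_mmu_set_spte_atomic(struct kvm *kvm,
                                            struct tdp_iter *iter,
                                            u64 new_spte)
  {
          u64 *sptep = rcu_dereference(iter->sptep);
          bool freeze_spte = iter->is_private && !is_removed_spte(new_spte);
          u64 tmp_spte = freeze_spte ? REMOVED_SPTE : new_spte;

          /* Atomically install either the final SPTE or the frozen marker. */
          if (!try_cmpxchg64(sptep, &iter->old_spte, tmp_spte))
                  return -EBUSY;

          /* Propagate the change, including the protected page table callbacks. */
          __handle_changed_spte(kvm, iter->as_id, iter->gfn, iter->is_private,
                                iter->old_spte, new_spte, iter->level, true);

          /* Unfreeze: publish the real value once the callbacks are done. */
          if (freeze_spte)
                  WRITE_ONCE(*sptep, new_spte);

          return 0;
  }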


> > @@ -1067,6 +1163,12 @@ void kvm_tdp_mmu_invalidate_all_roots(struct kvm *kvm)
> >
> > lockdep_assert_held_write(&kvm->mmu_lock);
> > list_for_each_entry(root, &kvm->arch.tdp_mmu_roots, link) {
> > + /*
> > + * Skip private root since private page table
> > + * is only torn down when VM is destroyed.
> > + */
> > + if (is_private_sp(root))
> > + continue;
> > if (!root->role.invalid &&
> > !WARN_ON_ONCE(!kvm_tdp_mmu_get_root(root))) {
> > root->role.invalid = true;
> > @@ -1087,14 +1189,22 @@ static int tdp_mmu_map_handle_target_level(struct kvm_vcpu *vcpu,
> > u64 new_spte;
> > int ret = RET_PF_FIXED;
> > bool wrprot = false;
> > + unsigned long pte_access = ACC_ALL;
> > + gfn_t gfn_unalias = iter->gfn & ~kvm_gfn_shared_mask(vcpu->kvm);
>
> Here looks the iter->gfn still contains the shared bits. It is not consistent
> with above.
>
> Can you put some words into the changelog explaining exactly what GFN will you
> put to iterator?
>
> Or can you even split out this part as a separate patch?

I think you mean the zap_leafs function above. It zaps a GPA range regardless of alias
(i.e. modulo the shared bit).
This function, by contrast, resolves a KVM page fault, which means the GPA includes the
shared bit.
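To make the distinction concrete, the two GFN forms look roughly like this (assuming the
kvm_gfn_private()/kvm_gfn_shared() helpers simply clear/set the shared bit, as used
elsewhere in the series):

  /* Fault path: iter->gfn still carries the shared bit from the faulting GPA. */
  gfn_unalias = iter->gfn & ~kvm_gfn_shared_mask(vcpu->kvm);

  /* Walk path: kvm_gfn_for_root() applies the alias that matches the root. */
  gfn_for_root = is_private_sp(root) ? kvm_gfn_private(kvm, gfn)
                                     : kvm_gfn_shared(kvm, gfn);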

here is the updated patch.