Re: [PATCH v7 5/8] KVM: x86/mmu: Don't pass FOLL_GET to __kvm_follow_pfn

From: David Stevens
Date: Thu Aug 24 2023 - 04:05:32 EST


On Wed, Jul 5, 2023 at 7:25 PM Yu Zhang <yu.c.zhang@xxxxxxxxxxxxxxx> wrote:
>
> On Tue, Jul 04, 2023 at 04:50:50PM +0900, David Stevens wrote:
> > From: David Stevens <stevensd@xxxxxxxxxxxx>
> >
> > Stop passing FOLL_GET to __kvm_follow_pfn. This allows the host to map
> > memory into the guest that is backed by un-refcounted struct pages - for
> > example, higher order non-compound pages allocated by the amdgpu driver
> > via ttm_pool_alloc_page.
> >
> > The bulk of this change is tracking the is_refcounted_page flag so that
> > non-refcounted pages don't trigger page_count() == 0 warnings. This is
> > done by storing the flag in an unused bit in the sptes.
> >
> > Signed-off-by: David Stevens <stevensd@xxxxxxxxxxxx>
> > ---
> > arch/x86/kvm/mmu/mmu.c          | 44 +++++++++++++++++++++------------
> > arch/x86/kvm/mmu/mmu_internal.h |  1 +
> > arch/x86/kvm/mmu/paging_tmpl.h  |  9 ++++---
> > arch/x86/kvm/mmu/spte.c         |  4 ++-
> > arch/x86/kvm/mmu/spte.h         | 12 ++++++++-
> > arch/x86/kvm/mmu/tdp_mmu.c      | 22 ++++++++++-------
> > 6 files changed, 62 insertions(+), 30 deletions(-)
> >
> > diff --git a/arch/x86/kvm/mmu/mmu.c b/arch/x86/kvm/mmu/mmu.c
> > index e44ab512c3a1..b1607e314497 100644
> > --- a/arch/x86/kvm/mmu/mmu.c
> > +++ b/arch/x86/kvm/mmu/mmu.c
> > @@ -553,12 +553,14 @@ static bool mmu_spte_update(u64 *sptep, u64 new_spte)
> >
> > if (is_accessed_spte(old_spte) && !is_accessed_spte(new_spte)) {
> > flush = true;
> > - kvm_set_pfn_accessed(spte_to_pfn(old_spte));
> > + if (is_refcounted_page_pte(old_spte))
> > + kvm_set_page_accessed(pfn_to_page(spte_to_pfn(old_spte)));
> > }
> >
> > if (is_dirty_spte(old_spte) && !is_dirty_spte(new_spte)) {
> > flush = true;
> > - kvm_set_pfn_dirty(spte_to_pfn(old_spte));
> > + if (is_refcounted_page_pte(old_spte))
> > + kvm_set_page_dirty(pfn_to_page(spte_to_pfn(old_spte)));
> > }
> >
> > return flush;
> > @@ -596,14 +598,18 @@ static u64 mmu_spte_clear_track_bits(struct kvm *kvm, u64 *sptep)
> > * before they are reclaimed. Sanity check that, if the pfn is backed
> > * by a refcounted page, the refcount is elevated.
> > */
> > - page = kvm_pfn_to_refcounted_page(pfn);
> > - WARN_ON(page && !page_count(page));
> > + if (is_refcounted_page_pte(old_spte)) {
> > + page = kvm_pfn_to_refcounted_page(pfn);
> > + WARN_ON(!page || !page_count(page));
> > + }
> >
> > - if (is_accessed_spte(old_spte))
> > - kvm_set_pfn_accessed(pfn);
> > + if (is_refcounted_page_pte(old_spte)) {
> > + if (is_accessed_spte(old_spte))
> > + kvm_set_page_accessed(pfn_to_page(pfn));
> >
> > - if (is_dirty_spte(old_spte))
> > - kvm_set_pfn_dirty(pfn);
> > + if (is_dirty_spte(old_spte))
> > + kvm_set_page_dirty(pfn_to_page(pfn));
> > + }
> >
> > return old_spte;
> > }
> > @@ -639,8 +645,8 @@ static bool mmu_spte_age(u64 *sptep)
> > * Capture the dirty status of the page, so that it doesn't get
> > * lost when the SPTE is marked for access tracking.
> > */
> > - if (is_writable_pte(spte))
> > - kvm_set_pfn_dirty(spte_to_pfn(spte));
> > + if (is_writable_pte(spte) && is_refcounted_page_pte(spte))
> > + kvm_set_page_dirty(pfn_to_page(spte_to_pfn(spte)));
> >
> > spte = mark_spte_for_access_track(spte);
> > mmu_spte_update_no_track(sptep, spte);
> > @@ -1278,8 +1284,8 @@ static bool spte_wrprot_for_clear_dirty(u64 *sptep)
> > {
> > bool was_writable = test_and_clear_bit(PT_WRITABLE_SHIFT,
> > (unsigned long *)sptep);
> > - if (was_writable && !spte_ad_enabled(*sptep))
> > - kvm_set_pfn_dirty(spte_to_pfn(*sptep));
> > + if (was_writable && !spte_ad_enabled(*sptep) && is_refcounted_page_pte(*sptep))
> > + kvm_set_page_dirty(pfn_to_page(spte_to_pfn(*sptep)));
> >
> > return was_writable;
> > }
> > @@ -2937,6 +2943,7 @@ static int mmu_set_spte(struct kvm_vcpu *vcpu, struct kvm_memory_slot *slot,
> > bool host_writable = !fault || fault->map_writable;
> > bool prefetch = !fault || fault->prefetch;
> > bool write_fault = fault && fault->write;
> > + bool is_refcounted = !fault || fault->is_refcounted_page;
> >
> > pgprintk("%s: spte %llx write_fault %d gfn %llx\n", __func__,
> > *sptep, write_fault, gfn);
> > @@ -2969,7 +2976,7 @@ static int mmu_set_spte(struct kvm_vcpu *vcpu, struct kvm_memory_slot *slot,
> > }
> >
> > wrprot = make_spte(vcpu, sp, slot, pte_access, gfn, pfn, *sptep, prefetch,
> > - true, host_writable, &spte);
> > + true, host_writable, is_refcounted, &spte);
> >
> > if (*sptep == spte) {
> > ret = RET_PF_SPURIOUS;
> > @@ -4299,8 +4306,9 @@ static int __kvm_faultin_pfn(struct kvm_vcpu *vcpu, struct kvm_page_fault *fault
> > struct kvm_follow_pfn foll = {
> > .slot = slot,
> > .gfn = fault->gfn,
> > - .flags = FOLL_GET | (fault->write ? FOLL_WRITE : 0),
> > + .flags = fault->write ? FOLL_WRITE : 0,
> > .allow_write_mapping = true,
> > + .guarded_by_mmu_notifier = true,
> > };
> >
> > /*
> > @@ -4317,6 +4325,7 @@ static int __kvm_faultin_pfn(struct kvm_vcpu *vcpu, struct kvm_page_fault *fault
> > fault->slot = NULL;
> > fault->pfn = KVM_PFN_NOSLOT;
> > fault->map_writable = false;
> > + fault->is_refcounted_page = false;
> > return RET_PF_CONTINUE;
> > }
> > /*
> > @@ -4366,6 +4375,7 @@ static int __kvm_faultin_pfn(struct kvm_vcpu *vcpu, struct kvm_page_fault *fault
> > success:
> > fault->hva = foll.hva;
> > fault->map_writable = foll.writable;
> > + fault->is_refcounted_page = foll.is_refcounted_page;
> > return RET_PF_CONTINUE;
> > }
> >
> > @@ -4451,7 +4461,8 @@ static int direct_page_fault(struct kvm_vcpu *vcpu, struct kvm_page_fault *fault
> >
> > out_unlock:
> > write_unlock(&vcpu->kvm->mmu_lock);
> > - kvm_release_pfn_clean(fault->pfn);
> > + if (fault->is_refcounted_page)
> > + kvm_set_page_accessed(pfn_to_page(fault->pfn));
> > return r;
> > }
> >
> > @@ -4529,7 +4540,8 @@ static int kvm_tdp_mmu_page_fault(struct kvm_vcpu *vcpu,
> >
> > out_unlock:
> > read_unlock(&vcpu->kvm->mmu_lock);
> > - kvm_release_pfn_clean(fault->pfn);
>
> Yet kvm_release_pfn() can still be triggered for gfns mapped via
> kvm_vcpu_map(). What if the guest uses a non-refcounted page for one of
> those (e.g., as a vmcs12)? Although I believe this is not going to
> happen in the real world...

kvm_vcpu_map still uses gfn_to_pfn, which eventually passes FOLL_GET
to __kvm_follow_pfn. So if a guest tries to use a non-refcounted page
that way (e.g. as its vmcs12), kvm_vcpu_map will fail and the guest
will probably crash. It won't trigger any bugs in the host, though.
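
To make that concrete, a gfn_to_pfn-based user ends up doing roughly the
following (a hand-waved sketch, not the actual kvm_vcpu_map code; the
helper name, the exact __kvm_follow_pfn return convention, and the error
check are just my shorthand here):

static int example_map_gfn(struct kvm_memory_slot *slot, gfn_t gfn)
{
        struct kvm_follow_pfn foll = {
                .slot = slot,
                .gfn = gfn,
                /*
                 * FOLL_GET: the mapping outlives mmu_lock and isn't covered
                 * by the mmu notifier, so a real page reference is required.
                 */
                .flags = FOLL_GET | FOLL_WRITE,
        };
        kvm_pfn_t pfn = __kvm_follow_pfn(&foll);

        /*
         * A pfn that isn't backed by a refcounted struct page can't satisfy
         * FOLL_GET, so the caller just sees an error pfn and gives up. The
         * guest doesn't get its mapping, but nothing in the host breaks.
         */
        if (is_error_noslot_pfn(pfn))
                return -EINVAL;

        /* ... kmap()/memremap() the pfn, as kvm_vcpu_map() does ... */
        return 0;
}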

It is unfortunate that the guest will be able to use certain types of
memory for some purposes but not for others. However, while it is
theoretically fixable, it's an unreasonable amount of work for
something that, as you say, nobody really cares about in practice [1].
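
As for why the host side stays safe either way: the is_refcounted_page_pte()
gating in the mmu.c hunks above is just a test of a spare software-available
SPTE bit that gets set when fault->is_refcounted_page is true. Roughly
(illustrative only; the real definition is in the spte.h hunk that isn't
quoted here, and the bit number below is made up):

/* Bit number picked for illustration, not necessarily what spte.h uses. */
#define SPTE_MMU_PAGE_REFCOUNTED        BIT_ULL(59)

static inline bool is_refcounted_page_pte(u64 spte)
{
        return !!(spte & SPTE_MMU_PAGE_REFCOUNTED);
}

With that, the paths that used to call kvm_set_pfn_accessed()/
kvm_set_pfn_dirty() unconditionally only touch the struct page when KVM
knows one actually exists, which is what keeps the page_count() == 0
warnings from firing on things like the amdgpu allocations mentioned in
the commit message.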

[1] https://lore.kernel.org/all/ZBEEQtmtNPaEqU1i@xxxxxxxxxx/

-David