Re: [PATCH v7 2/8] KVM: Introduce __kvm_follow_pfn function

From: Zhi Wang
Date: Tue Jul 11 2023 - 13:37:44 EST


On Wed, 5 Jul 2023 18:08:17 +0900
David Stevens <stevensd@xxxxxxxxxxxx> wrote:

> On Wed, Jul 5, 2023 at 5:47 PM Zhi Wang <zhi.wang.linux@xxxxxxxxx> wrote:
> >
> > On Tue, 4 Jul 2023 16:50:47 +0900
> > David Stevens <stevensd@xxxxxxxxxxxx> wrote:
> >
> > > From: David Stevens <stevensd@xxxxxxxxxxxx>
> > >
> > > Introduce __kvm_follow_pfn, which will replace __gfn_to_pfn_memslot.
> > > __kvm_follow_pfn refactors the old API's arguments into a struct and,
> > > where possible, combines the boolean arguments into a single flags
> > > argument.
> > >
> > > Signed-off-by: David Stevens <stevensd@xxxxxxxxxxxx>
> > > ---
> > > include/linux/kvm_host.h | 16 ++++
> > > virt/kvm/kvm_main.c | 171 ++++++++++++++++++++++-----------------
> > > virt/kvm/kvm_mm.h | 3 +-
> > > virt/kvm/pfncache.c | 8 +-
> > > 4 files changed, 122 insertions(+), 76 deletions(-)
> > >
> > > diff --git a/include/linux/kvm_host.h b/include/linux/kvm_host.h
> > > index 9d3ac7720da9..ef2763c2b12e 100644
> > > --- a/include/linux/kvm_host.h
> > > +++ b/include/linux/kvm_host.h
> > > @@ -97,6 +97,7 @@
> > > #define KVM_PFN_ERR_HWPOISON (KVM_PFN_ERR_MASK + 1)
> > > #define KVM_PFN_ERR_RO_FAULT (KVM_PFN_ERR_MASK + 2)
> > > #define KVM_PFN_ERR_SIGPENDING (KVM_PFN_ERR_MASK + 3)
> > > +#define KVM_PFN_ERR_NEEDS_IO (KVM_PFN_ERR_MASK + 4)
> > >
> > > /*
> > > * error pfns indicate that the gfn is in slot but faild to
> > > @@ -1156,6 +1157,21 @@ unsigned long gfn_to_hva_memslot_prot(struct kvm_memory_slot *slot, gfn_t gfn,
> > > void kvm_release_page_clean(struct page *page);
> > > void kvm_release_page_dirty(struct page *page);
> > >
> > > +struct kvm_follow_pfn {
> > > + const struct kvm_memory_slot *slot;
> > > + gfn_t gfn;
> > > + unsigned int flags;
> > > + bool atomic;
> > > + /* Allow a read fault to create a writeable mapping. */
> > > + bool allow_write_mapping;
> > > +
> > > + /* Outputs of __kvm_follow_pfn */
> > > + hva_t hva;
> > > + bool writable;
> > > +};
> > > +
> > > +kvm_pfn_t __kvm_follow_pfn(struct kvm_follow_pfn *foll);
> > > +
> > > kvm_pfn_t gfn_to_pfn(struct kvm *kvm, gfn_t gfn);
> > > kvm_pfn_t gfn_to_pfn_prot(struct kvm *kvm, gfn_t gfn, bool write_fault,
> > > bool *writable);
> > > diff --git a/virt/kvm/kvm_main.c b/virt/kvm/kvm_main.c
> > > index 371bd783ff2b..b13f22861d2f 100644
> > > --- a/virt/kvm/kvm_main.c
> > > +++ b/virt/kvm/kvm_main.c
> > > @@ -2486,24 +2486,22 @@ static inline int check_user_page_hwpoison(unsigned long addr)
> > > * true indicates success, otherwise false is returned. It's also the
> > > * only part that runs if we can in atomic context.
> > > */
> > > -static bool hva_to_pfn_fast(unsigned long addr, bool write_fault,
> > > - bool *writable, kvm_pfn_t *pfn)
> > > +static bool hva_to_pfn_fast(struct kvm_follow_pfn *foll, kvm_pfn_t *pfn)
> > > {
> > > struct page *page[1];
> > > + bool write_fault = foll->flags & FOLL_WRITE;
> > >
> > > /*
> > > * Fast pin a writable pfn only if it is a write fault request
> > > * or the caller allows to map a writable pfn for a read fault
> > > * request.
> > > */
> > > - if (!(write_fault || writable))
> > > + if (!(write_fault || foll->allow_write_mapping))
> > > return false;
> > >
> > > - if (get_user_page_fast_only(addr, FOLL_WRITE, page)) {
> > > + if (get_user_page_fast_only(foll->hva, FOLL_WRITE, page)) {
> > > *pfn = page_to_pfn(page[0]);
> > > -
> > > - if (writable)
> > > - *writable = true;
> > > + foll->writable = foll->allow_write_mapping;
> > > return true;
> > > }
> > >
> > > @@ -2514,35 +2512,26 @@ static bool hva_to_pfn_fast(unsigned long addr, bool write_fault,
> > > * The slow path to get the pfn of the specified host virtual address,
> > > * 1 indicates success, -errno is returned if error is detected.
> > > */
> > > -static int hva_to_pfn_slow(unsigned long addr, bool *async, bool write_fault,
> > > - bool interruptible, bool *writable, kvm_pfn_t *pfn)
> > > +static int hva_to_pfn_slow(struct kvm_follow_pfn *foll, kvm_pfn_t *pfn)
> > > {
> > > - unsigned int flags = FOLL_HWPOISON;
> > > + unsigned int flags = FOLL_HWPOISON | FOLL_GET | foll->flags;
> > > struct page *page;
> > > int npages;
> > >
> > > might_sleep();
> > >
> > > - if (writable)
> > > - *writable = write_fault;
> > > -
> > > - if (write_fault)
> > > - flags |= FOLL_WRITE;
> > > - if (async)
> > > - flags |= FOLL_NOWAIT;
> > > - if (interruptible)
> > > - flags |= FOLL_INTERRUPTIBLE;
> > > -
> > > - npages = get_user_pages_unlocked(addr, 1, &page, flags);
> > > + npages = get_user_pages_unlocked(foll->hva, 1, &page, flags);
> > > if (npages != 1)
> > > return npages;
> > >
> > > + foll->writable = (foll->flags & FOLL_WRITE) && foll->allow_write_mapping;
> > > +
> > > /* map read fault as writable if possible */
> > > - if (unlikely(!write_fault) && writable) {
> > > + if (unlikely(!foll->writable) && foll->allow_write_mapping) {
> > > struct page *wpage;
> > >
> > > - if (get_user_page_fast_only(addr, FOLL_WRITE, &wpage)) {
> > > - *writable = true;
> > > + if (get_user_page_fast_only(foll->hva, FOLL_WRITE, &wpage)) {
> > > + foll->writable = true;
> > > put_page(page);
> > > page = wpage;
> > > }
> > > @@ -2572,23 +2561,23 @@ static int kvm_try_get_pfn(kvm_pfn_t pfn)
> > > return get_page_unless_zero(page);
> > > }
> > >
> > > -static int hva_to_pfn_remapped(struct vm_area_struct *vma,
> > > - unsigned long addr, bool write_fault,
> > > - bool *writable, kvm_pfn_t *p_pfn)
> > > +static int hva_to_pfn_remapped(struct vm_area_struct *vma, struct kvm_follow_pfn *foll,
> > > + kvm_pfn_t *p_pfn)
> > > {
> > > kvm_pfn_t pfn;
> > > pte_t *ptep;
> > > spinlock_t *ptl;
> > > + bool write_fault = foll->flags & FOLL_WRITE;
> > > int r;
> > >
> > > - r = follow_pte(vma->vm_mm, addr, &ptep, &ptl);
> > > + r = follow_pte(vma->vm_mm, foll->hva, &ptep, &ptl);
> > > if (r) {
> > > /*
> > > * get_user_pages fails for VM_IO and VM_PFNMAP vmas and does
> > > * not call the fault handler, so do it here.
> > > */
> > > bool unlocked = false;
> > > - r = fixup_user_fault(current->mm, addr,
> > > + r = fixup_user_fault(current->mm, foll->hva,
> > > (write_fault ? FAULT_FLAG_WRITE : 0),
> > > &unlocked);
> > > if (unlocked)
> > > @@ -2596,7 +2585,7 @@ static int hva_to_pfn_remapped(struct vm_area_struct *vma,
> > > if (r)
> > > return r;
> > >
> > > - r = follow_pte(vma->vm_mm, addr, &ptep, &ptl);
> > > + r = follow_pte(vma->vm_mm, foll->hva, &ptep, &ptl);
> > > if (r)
> > > return r;
> > > }
> > > @@ -2606,8 +2595,7 @@ static int hva_to_pfn_remapped(struct vm_area_struct *vma,
> > > goto out;
> > > }
> > >
> > > - if (writable)
> > > - *writable = pte_write(*ptep);
> > > + foll->writable = pte_write(*ptep) && foll->allow_write_mapping;
> > > pfn = pte_pfn(*ptep);
> > >
> > > /*
> > > @@ -2652,24 +2640,22 @@ static int hva_to_pfn_remapped(struct vm_area_struct *vma,
> > > * 2): @write_fault = false && @writable, @writable will tell the caller
> > > * whether the mapping is writable.
> > > */
> > > -kvm_pfn_t hva_to_pfn(unsigned long addr, bool atomic, bool interruptible,
> > > - bool *async, bool write_fault, bool *writable)
> > > +kvm_pfn_t hva_to_pfn(struct kvm_follow_pfn *foll)
> > > {
> > > struct vm_area_struct *vma;
> > > kvm_pfn_t pfn;
> > > int npages, r;
> > >
> > > /* we can do it either atomically or asynchronously, not both */
> > > - BUG_ON(atomic && async);
> > > + BUG_ON(foll->atomic && (foll->flags & FOLL_NOWAIT));
> > >
> > > - if (hva_to_pfn_fast(addr, write_fault, writable, &pfn))
> > > + if (hva_to_pfn_fast(foll, &pfn))
> > > return pfn;
> > >
> > > - if (atomic)
> > > + if (foll->atomic)
> > > return KVM_PFN_ERR_FAULT;
> > >
> > > - npages = hva_to_pfn_slow(addr, async, write_fault, interruptible,
> > > - writable, &pfn);
> > > + npages = hva_to_pfn_slow(foll, &pfn);
> > > if (npages == 1)
> > > return pfn;
> > > if (npages == -EINTR)
> > > @@ -2677,83 +2663,122 @@ kvm_pfn_t hva_to_pfn(unsigned long addr, bool atomic, bool interruptible,
> > >
> > > mmap_read_lock(current->mm);
> > > if (npages == -EHWPOISON ||
> > > - (!async && check_user_page_hwpoison(addr))) {
> > > + (!(foll->flags & FOLL_NOWAIT) && check_user_page_hwpoison(foll->hva))) {
> > > pfn = KVM_PFN_ERR_HWPOISON;
> > > goto exit;
> > > }
> > >
> > > retry:
> > > - vma = vma_lookup(current->mm, addr);
> > > + vma = vma_lookup(current->mm, foll->hva);
> > >
> > > if (vma == NULL)
> > > pfn = KVM_PFN_ERR_FAULT;
> > > else if (vma->vm_flags & (VM_IO | VM_PFNMAP)) {
> > > - r = hva_to_pfn_remapped(vma, addr, write_fault, writable, &pfn);
> > > + r = hva_to_pfn_remapped(vma, foll, &pfn);
> > > if (r == -EAGAIN)
> > > goto retry;
> > > if (r < 0)
> > > pfn = KVM_PFN_ERR_FAULT;
> > > } else {
> > > - if (async && vma_is_valid(vma, write_fault))
> > > - *async = true;
> > > - pfn = KVM_PFN_ERR_FAULT;
> > > + if ((foll->flags & FOLL_NOWAIT) &&
> > > + vma_is_valid(vma, foll->flags & FOLL_WRITE))
> > > + pfn = KVM_PFN_ERR_NEEDS_IO;
> > > + else
> > > + pfn = KVM_PFN_ERR_FAULT;
> > > }
> > > exit:
> > > mmap_read_unlock(current->mm);
> > > return pfn;
> > > }
> > >
> > > -kvm_pfn_t __gfn_to_pfn_memslot(const struct kvm_memory_slot *slot, gfn_t gfn,
> > > - bool atomic, bool interruptible, bool *async,
> > > - bool write_fault, bool *writable, hva_t *hva)
> > > +kvm_pfn_t __kvm_follow_pfn(struct kvm_follow_pfn *foll)
> > > {
> > > - unsigned long addr = __gfn_to_hva_many(slot, gfn, NULL, write_fault);
> > > -
> > > - if (hva)
> > > - *hva = addr;
> > > + foll->hva = __gfn_to_hva_many(foll->slot, foll->gfn, NULL,
> > > + foll->flags & FOLL_WRITE);
> > >
> > > - if (addr == KVM_HVA_ERR_RO_BAD) {
> > > - if (writable)
> > > - *writable = false;
> > > + if (foll->hva == KVM_HVA_ERR_RO_BAD)
> > > return KVM_PFN_ERR_RO_FAULT;
> > > - }
> > >
> >
> > Can you explain why updating foll->writable = false (previously *writable
> > = false) is omitted here?
> >
> > In the caller where the struct kvm_follow_pfn is initialized, e.g.
> > __gfn_to_pfn_memslot()/gfn_to_pfn_prot(), .writable is not initialized.
> > IIUC, they expect __kvm_follow_pfn() to update it and return .writable
> > to the upper caller.
> >
> > As it is one of the outputs, it would be better to initialize it either
> > in the caller or to update it in __kvm_follow_pfn(). Otherwise,
> > __gfn_to_pfn_memslot()/gfn_to_pfn_prot() may return uninitialized stack
> > data to the caller via bool *writable. That doesn't sound nice.
>
> Entries omitted from an initializer are initialized to zero, so
> .writable does get initialized in all of the patches in this series
> via designated initializers. That said, you're right that explicitly
> setting it to false is a good idea, in case someone someday adds a
> caller that doesn't use an initializer when declaring its
> kvm_follow_pfn.
>

Nice trick, and nice to know that. :) Agreed on improving readability and
guarding against a future caller that skips the initializer.
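
For the archives, a minimal sketch of the C guarantee being relied on (the
initializer values here are made up for illustration; the struct and fields
are the ones introduced by this patch):

	struct kvm_follow_pfn foll = {
		.slot = slot,
		.gfn = gfn,
		.flags = FOLL_WRITE,
		/*
		 * .hva and .writable are omitted, so the C standard
		 * guarantees they are zero-initialized, i.e.
		 * .writable == false on entry.
		 */
	};

	struct kvm_follow_pfn bad_foll;	/* no initializer at all */
	/*
	 * bad_foll.writable now holds indeterminate stack data; only
	 * an explicit store in the caller or in __kvm_follow_pfn()
	 * makes it safe to read.
	 */

An explicit .writable = false (or a memset) covering the non-initializer
case would close that hole.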

> -David