Re: [RFC PATCH v11 12/29] KVM: Add KVM_CREATE_GUEST_MEMFD ioctl() for guest-specific backing memory

From: Isaku Yamahata
Date: Fri Jul 21 2023 - 18:27:14 EST


On Fri, Jul 21, 2023 at 02:13:14PM +0800,
Yuan Yao <yuan.yao@xxxxxxxxxxxxxxx> wrote:

> On Tue, Jul 18, 2023 at 04:44:55PM -0700, Sean Christopherson wrote:
> > TODO
> >
> > Cc: Fuad Tabba <tabba@xxxxxxxxxx>
> > Cc: Vishal Annapurve <vannapurve@xxxxxxxxxx>
> > Cc: Ackerley Tng <ackerleytng@xxxxxxxxxx>
> > Cc: Jarkko Sakkinen <jarkko@xxxxxxxxxx>
> > Cc: Maciej Szmigiero <mail@xxxxxxxxxxxxxxxxxxxxx>
> > Cc: Vlastimil Babka <vbabka@xxxxxxx>
> > Cc: David Hildenbrand <david@xxxxxxxxxx>
> > Cc: Quentin Perret <qperret@xxxxxxxxxx>
> > Cc: Michael Roth <michael.roth@xxxxxxx>
> > Cc: Wang <wei.w.wang@xxxxxxxxx>
> > Cc: Liam Merwick <liam.merwick@xxxxxxxxxx>
> > Cc: Isaku Yamahata <isaku.yamahata@xxxxxxxxx>
> > Co-developed-by: Kirill A. Shutemov <kirill.shutemov@xxxxxxxxxxxxxxx>
> > Signed-off-by: Kirill A. Shutemov <kirill.shutemov@xxxxxxxxxxxxxxx>
> > Co-developed-by: Yu Zhang <yu.c.zhang@xxxxxxxxxxxxxxx>
> > Signed-off-by: Yu Zhang <yu.c.zhang@xxxxxxxxxxxxxxx>
> > Co-developed-by: Chao Peng <chao.p.peng@xxxxxxxxxxxxxxx>
> > Signed-off-by: Chao Peng <chao.p.peng@xxxxxxxxxxxxxxx>
> > Co-developed-by: Ackerley Tng <ackerleytng@xxxxxxxxxx>
> > Signed-off-by: Ackerley Tng <ackerleytng@xxxxxxxxxx>
> > Signed-off-by: Sean Christopherson <seanjc@xxxxxxxxxx>
> > ---
> > include/linux/kvm_host.h | 48 +++
> > include/uapi/linux/kvm.h | 14 +-
> > include/uapi/linux/magic.h | 1 +
> > virt/kvm/Kconfig | 4 +
> > virt/kvm/Makefile.kvm | 1 +
> > virt/kvm/guest_mem.c | 591 +++++++++++++++++++++++++++++++++++++
> > virt/kvm/kvm_main.c | 58 +++-
> > virt/kvm/kvm_mm.h | 38 +++
> > 8 files changed, 750 insertions(+), 5 deletions(-)
> > create mode 100644 virt/kvm/guest_mem.c
> >
> > diff --git a/include/linux/kvm_host.h b/include/linux/kvm_host.h
> > index 97db63da6227..0d1e2ee8ae7a 100644
> > --- a/include/linux/kvm_host.h
> > +++ b/include/linux/kvm_host.h
> > @@ -592,8 +592,20 @@ struct kvm_memory_slot {
> > u32 flags;
> > short id;
> > u16 as_id;
> > +
> > +#ifdef CONFIG_KVM_PRIVATE_MEM
> > + struct {
> > + struct file __rcu *file;
> > + pgoff_t pgoff;
> > + } gmem;
> > +#endif
> > };
> >
> > +static inline bool kvm_slot_can_be_private(const struct kvm_memory_slot *slot)
> > +{
> > + return slot && (slot->flags & KVM_MEM_PRIVATE);
> > +}
> > +
> > static inline bool kvm_slot_dirty_track_enabled(const struct kvm_memory_slot *slot)
> > {
> > return slot->flags & KVM_MEM_LOG_DIRTY_PAGES;
> > @@ -688,6 +700,17 @@ static inline int kvm_arch_vcpu_memslots_id(struct kvm_vcpu *vcpu)
> > }
> > #endif
> >
> > +/*
> > + * Arch code must define kvm_arch_has_private_mem if support for private memory
> > + * is enabled.
> > + */
> > +#if !defined(kvm_arch_has_private_mem) && !IS_ENABLED(CONFIG_KVM_PRIVATE_MEM)
> > +static inline bool kvm_arch_has_private_mem(struct kvm *kvm)
> > +{
> > + return false;
> > +}
> > +#endif
> > +
> > struct kvm_memslots {
> > u64 generation;
> > atomic_long_t last_used_slot;
> > @@ -1380,6 +1403,7 @@ void *kvm_mmu_memory_cache_alloc(struct kvm_mmu_memory_cache *mc);
> > void kvm_mmu_invalidate_begin(struct kvm *kvm);
> > void kvm_mmu_invalidate_range_add(struct kvm *kvm, gfn_t start, gfn_t end);
> > void kvm_mmu_invalidate_end(struct kvm *kvm);
> > +bool kvm_mmu_unmap_gfn_range(struct kvm *kvm, struct kvm_gfn_range *range);
> >
> > long kvm_arch_dev_ioctl(struct file *filp,
> > unsigned int ioctl, unsigned long arg);
> > @@ -2313,6 +2337,30 @@ static inline unsigned long kvm_get_memory_attributes(struct kvm *kvm, gfn_t gfn
> >
> > bool kvm_arch_post_set_memory_attributes(struct kvm *kvm,
> > struct kvm_gfn_range *range);
> > +
> > +static inline bool kvm_mem_is_private(struct kvm *kvm, gfn_t gfn)
> > +{
> > + return IS_ENABLED(CONFIG_KVM_PRIVATE_MEM) &&
> > + kvm_get_memory_attributes(kvm, gfn) & KVM_MEMORY_ATTRIBUTE_PRIVATE;
> > +}
> > +#else
> > +static inline bool kvm_mem_is_private(struct kvm *kvm, gfn_t gfn)
> > +{
> > + return false;
> > +}
> > #endif /* CONFIG_KVM_GENERIC_MEMORY_ATTRIBUTES */
> >
> > +#ifdef CONFIG_KVM_PRIVATE_MEM
> > +int kvm_gmem_get_pfn(struct kvm *kvm, struct kvm_memory_slot *slot,
> > + gfn_t gfn, kvm_pfn_t *pfn, int *max_order);
> > +#else
> > +static inline int kvm_gmem_get_pfn(struct kvm *kvm,
> > + struct kvm_memory_slot *slot, gfn_t gfn,
> > + kvm_pfn_t *pfn, int *max_order)
> > +{
> > + KVM_BUG_ON(1, kvm);
> > + return -EIO;
> > +}
> > +#endif /* CONFIG_KVM_PRIVATE_MEM */
> > +
> > #endif
> > diff --git a/include/uapi/linux/kvm.h b/include/uapi/linux/kvm.h
> > index f065c57db327..9b344fc98598 100644
> > --- a/include/uapi/linux/kvm.h
> > +++ b/include/uapi/linux/kvm.h
> > @@ -102,7 +102,10 @@ struct kvm_userspace_memory_region2 {
> > __u64 guest_phys_addr;
> > __u64 memory_size;
> > __u64 userspace_addr;
> > - __u64 pad[16];
> > + __u64 gmem_offset;
> > + __u32 gmem_fd;
> > + __u32 pad1;
> > + __u64 pad2[14];
> > };
> >
> > /*
> > @@ -112,6 +115,7 @@ struct kvm_userspace_memory_region2 {
> > */
> > #define KVM_MEM_LOG_DIRTY_PAGES (1UL << 0)
> > #define KVM_MEM_READONLY (1UL << 1)
> > +#define KVM_MEM_PRIVATE (1UL << 2)
> >
> > /* for KVM_IRQ_LINE */
> > struct kvm_irq_level {
> > @@ -2284,4 +2288,12 @@ struct kvm_memory_attributes {
> >
> > #define KVM_MEMORY_ATTRIBUTE_PRIVATE (1ULL << 3)
> >
> > +#define KVM_CREATE_GUEST_MEMFD _IOWR(KVMIO, 0xd4, struct kvm_create_guest_memfd)
> > +
> > +struct kvm_create_guest_memfd {
> > + __u64 size;
> > + __u64 flags;
> > + __u64 reserved[6];
> > +};
> > +
> > #endif /* __LINUX_KVM_H */
> > diff --git a/include/uapi/linux/magic.h b/include/uapi/linux/magic.h
> > index 6325d1d0e90f..15041aa7d9ae 100644
> > --- a/include/uapi/linux/magic.h
> > +++ b/include/uapi/linux/magic.h
> > @@ -101,5 +101,6 @@
> > #define DMA_BUF_MAGIC 0x444d4142 /* "DMAB" */
> > #define DEVMEM_MAGIC 0x454d444d /* "DMEM" */
> > #define SECRETMEM_MAGIC 0x5345434d /* "SECM" */
> > +#define GUEST_MEMORY_MAGIC 0x474d454d /* "GMEM" */
> >
> > #endif /* __LINUX_MAGIC_H__ */
> > diff --git a/virt/kvm/Kconfig b/virt/kvm/Kconfig
> > index 8375bc49f97d..3ee3205e0b39 100644
> > --- a/virt/kvm/Kconfig
> > +++ b/virt/kvm/Kconfig
> > @@ -103,3 +103,7 @@ config KVM_GENERIC_MMU_NOTIFIER
> > config KVM_GENERIC_MEMORY_ATTRIBUTES
> > select KVM_GENERIC_MMU_NOTIFIER
> > bool
> > +
> > +config KVM_PRIVATE_MEM
> > + select XARRAY_MULTI
> > + bool
> > diff --git a/virt/kvm/Makefile.kvm b/virt/kvm/Makefile.kvm
> > index 2c27d5d0c367..a5a61bbe7f4c 100644
> > --- a/virt/kvm/Makefile.kvm
> > +++ b/virt/kvm/Makefile.kvm
> > @@ -12,3 +12,4 @@ kvm-$(CONFIG_KVM_ASYNC_PF) += $(KVM)/async_pf.o
> > kvm-$(CONFIG_HAVE_KVM_IRQ_ROUTING) += $(KVM)/irqchip.o
> > kvm-$(CONFIG_HAVE_KVM_DIRTY_RING) += $(KVM)/dirty_ring.o
> > kvm-$(CONFIG_HAVE_KVM_PFNCACHE) += $(KVM)/pfncache.o
> > +kvm-$(CONFIG_KVM_PRIVATE_MEM) += $(KVM)/guest_mem.o
> > diff --git a/virt/kvm/guest_mem.c b/virt/kvm/guest_mem.c
> > new file mode 100644
> > index 000000000000..1b705fd63fa8
> > --- /dev/null
> > +++ b/virt/kvm/guest_mem.c
> > @@ -0,0 +1,591 @@
> > +// SPDX-License-Identifier: GPL-2.0
> > +#include <linux/backing-dev.h>
> > +#include <linux/falloc.h>
> > +#include <linux/kvm_host.h>
> > +#include <linux/pagemap.h>
> > +#include <linux/pseudo_fs.h>
> > +
> > +#include <uapi/linux/magic.h>
> > +
> > +#include "kvm_mm.h"
> > +
> > +static struct vfsmount *kvm_gmem_mnt;
> > +
> > +struct kvm_gmem {
> > + struct kvm *kvm;
> > + struct xarray bindings;
> > + struct list_head entry;
> > +};
> > +
> > +static struct folio *kvm_gmem_get_folio(struct file *file, pgoff_t index)
> > +{
> > + struct folio *folio;
> > +
> > + /* TODO: Support huge pages. */
> > + folio = filemap_grab_folio(file->f_mapping, index);
> > + if (!folio)
> > + return NULL;
> > +
> > + /*
> > + * Use the up-to-date flag to track whether or not the memory has been
> > + * zeroed before being handed off to the guest. There is no backing
> > + * storage for the memory, so the folio will remain up-to-date until
> > + * it's removed.
> > + *
> > + * TODO: Skip clearing pages when trusted firmware will do it when
> > + * assigning memory to the guest.
> > + */
> > + if (!folio_test_uptodate(folio)) {
> > + unsigned long nr_pages = folio_nr_pages(folio);
> > + unsigned long i;
> > +
> > + for (i = 0; i < nr_pages; i++)
> > + clear_highpage(folio_page(folio, i));
> > +
> > + folio_mark_uptodate(folio);
> > + }
> > +
> > + /*
> > + * Ignore accessed, referenced, and dirty flags. The memory is
> > + * unevictable and there is no storage to write back to.
> > + */
> > + return folio;
> > +}
> > +
> > +static void kvm_gmem_invalidate_begin(struct kvm_gmem *gmem, pgoff_t start,
> > + pgoff_t end)
> > +{
> > + struct kvm_memory_slot *slot;
> > + struct kvm *kvm = gmem->kvm;
> > + unsigned long index;
> > + bool flush = false;
> > +
> > + KVM_MMU_LOCK(kvm);
> > +
> > + kvm_mmu_invalidate_begin(kvm);
> > +
> > + xa_for_each_range(&gmem->bindings, index, slot, start, end - 1) {
> > + pgoff_t pgoff = slot->gmem.pgoff;
> > +
> > + struct kvm_gfn_range gfn_range = {
> > + .start = slot->base_gfn + max(pgoff, start) - pgoff,
> > + .end = slot->base_gfn + min(pgoff + slot->npages, end) - pgoff,
> > + .slot = slot,
> > + .may_block = true,
> > + };
> > +
> > + flush |= kvm_mmu_unmap_gfn_range(kvm, &gfn_range);
> > + }
> > +
> > + if (flush)
> > + kvm_flush_remote_tlbs(kvm);
> > +
> > + KVM_MMU_UNLOCK(kvm);
> > +}
> > +
> > +static void kvm_gmem_invalidate_end(struct kvm_gmem *gmem, pgoff_t start,
> > + pgoff_t end)
> > +{
> > + struct kvm *kvm = gmem->kvm;
> > +
> > + KVM_MMU_LOCK(kvm);
> > + if (xa_find(&gmem->bindings, &start, end - 1, XA_PRESENT))
> > + kvm_mmu_invalidate_end(kvm);
> > + KVM_MMU_UNLOCK(kvm);
> > +}
> > +
> > +static long kvm_gmem_punch_hole(struct inode *inode, loff_t offset, loff_t len)
> > +{
> > + struct list_head *gmem_list = &inode->i_mapping->private_list;
> > + pgoff_t start = offset >> PAGE_SHIFT;
> > + pgoff_t end = (offset + len) >> PAGE_SHIFT;
> > + struct kvm_gmem *gmem;
> > +
> > + /*
> > + * Bindings must be stable across invalidation to ensure the start+end
> > + * are balanced.
> > + */
> > + filemap_invalidate_lock(inode->i_mapping);
> > +
> > + list_for_each_entry(gmem, gmem_list, entry)
> > + kvm_gmem_invalidate_begin(gmem, start, end);
> > +
> > + truncate_inode_pages_range(inode->i_mapping, offset, offset + len - 1);
> > +
> > + list_for_each_entry(gmem, gmem_list, entry)
> > + kvm_gmem_invalidate_end(gmem, start, end);
> > +
> > + filemap_invalidate_unlock(inode->i_mapping);
> > +
> > + return 0;
> > +}
> > +
> > +static long kvm_gmem_allocate(struct inode *inode, loff_t offset, loff_t len)
> > +{
> > + struct address_space *mapping = inode->i_mapping;
> > + pgoff_t start, index, end;
> > + int r;
> > +
> > + /* Dedicated guest is immutable by default. */
> > + if (offset + len > i_size_read(inode))
> > + return -EINVAL;
> > +
> > + filemap_invalidate_lock_shared(mapping);
> > +
> > + start = offset >> PAGE_SHIFT;
> > + end = (offset + len) >> PAGE_SHIFT;
> > +
> > + r = 0;
> > + for (index = start; index < end; ) {
> > + struct folio *folio;
> > +
> > + if (signal_pending(current)) {
> > + r = -EINTR;
> > + break;
> > + }
> > +
> > + folio = kvm_gmem_get_folio(inode, index);
> > + if (!folio) {
> > + r = -ENOMEM;
> > + break;
> > + }
> > +
> > + index = folio_next_index(folio);
> > +
> > + folio_unlock(folio);
> > + folio_put(folio);
> > +
> > + /* 64-bit only, wrapping the index should be impossible. */
> > + if (WARN_ON_ONCE(!index))
> > + break;
> > +
> > + cond_resched();
> > + }
> > +
> > + filemap_invalidate_unlock_shared(mapping);
> > +
> > + return r;
> > +}
> > +
> > +static long kvm_gmem_fallocate(struct file *file, int mode, loff_t offset,
> > + loff_t len)
> > +{
> > + int ret;
> > +
> > + if (!(mode & FALLOC_FL_KEEP_SIZE))
> > + return -EOPNOTSUPP;
> > +
> > + if (mode & ~(FALLOC_FL_KEEP_SIZE | FALLOC_FL_PUNCH_HOLE))
> > + return -EOPNOTSUPP;
> > +
> > + if (!PAGE_ALIGNED(offset) || !PAGE_ALIGNED(len))
> > + return -EINVAL;
> > +
> > + if (mode & FALLOC_FL_PUNCH_HOLE)
> > + ret = kvm_gmem_punch_hole(file_inode(file), offset, len);
> > + else
> > + ret = kvm_gmem_allocate(file_inode(file), offset, len);
> > +
> > + if (!ret)
> > + file_modified(file);
> > + return ret;
> > +}
> > +
> > +static int kvm_gmem_release(struct inode *inode, struct file *file)
> > +{
> > + struct kvm_gmem *gmem = file->private_data;
> > + struct kvm_memory_slot *slot;
> > + struct kvm *kvm = gmem->kvm;
> > + unsigned long index;
> > +
> > + filemap_invalidate_lock(inode->i_mapping);
> > +
> > + /*
> > + * Prevent concurrent attempts to *unbind* a memslot. This is the last
> > + * reference to the file and thus no new bindings can be created, but
> > + * dereferencing the slot for existing bindings needs to be protected
> > + * against memslot updates, specifically so that unbind doesn't race
> > + * and free the memslot (kvm_gmem_get_file() will return NULL).
> > + */
> > + mutex_lock(&kvm->slots_lock);
> > +
> > + xa_for_each(&gmem->bindings, index, slot)
> > + rcu_assign_pointer(slot->gmem.file, NULL);
> > +
> > + synchronize_rcu();
> > +
> > + /*
> > + * All in-flight operations are gone and new bindings can be created.
> > + * Zap all SPTEs pointed at by this file. Do not free the backing
> > + * memory, as its lifetime is associated with the inode, not the file.
> > + */
> > + kvm_gmem_invalidate_begin(gmem, 0, -1ul);
> > + kvm_gmem_invalidate_end(gmem, 0, -1ul);
> > +
> > + mutex_unlock(&kvm->slots_lock);
> > +
> > + list_del(&gmem->entry);
> > +
> > + filemap_invalidate_unlock(inode->i_mapping);
> > +
> > + xa_destroy(&gmem->bindings);
> > + kfree(gmem);
> > +
> > + kvm_put_kvm(kvm);
> > +
> > + return 0;
> > +}
> > +
> > +static struct file *kvm_gmem_get_file(struct kvm_memory_slot *slot)
> > +{
> > + struct file *file;
> > +
> > + rcu_read_lock();
> > +
> > + file = rcu_dereference(slot->gmem.file);
> > + if (file && !get_file_rcu(file))
> > + file = NULL;
> > +
> > + rcu_read_unlock();
> > +
> > + return file;
> > +}
> > +
> > +static const struct file_operations kvm_gmem_fops = {
> > + .open = generic_file_open,
> > + .release = kvm_gmem_release,
> > + .fallocate = kvm_gmem_fallocate,
> > +};
> > +
> > +static int kvm_gmem_migrate_folio(struct address_space *mapping,
> > + struct folio *dst, struct folio *src,
> > + enum migrate_mode mode)
> > +{
> > + WARN_ON_ONCE(1);
> > + return -EINVAL;
> > +}
> > +
> > +static int kvm_gmem_error_page(struct address_space *mapping, struct page *page)
> > +{
> > + struct list_head *gmem_list = &mapping->private_list;
> > + struct kvm_memory_slot *slot;
> > + struct kvm_gmem *gmem;
> > + unsigned long index;
> > + pgoff_t start, end;
> > + gfn_t gfn;
> > +
> > + filemap_invalidate_lock_shared(mapping);
> > +
> > + start = page->index;
> > + end = start + thp_nr_pages(page);
> > +
> > + list_for_each_entry(gmem, gmem_list, entry) {
> > + xa_for_each_range(&gmem->bindings, index, slot, start, end - 1) {
> > + for (gfn = start; gfn < end; gfn++) {
>
> Why is the start/end range used as a gfn here?
>
> The page->index is an offset within the inode's page cache mapping (the
> gmem address space); IIUC, the gfn calculation should follow the same
> approach as in kvm_gmem_invalidate_begin().

Also, instead of sending the signal multiple times, we can utilize the lsb argument.
Something like this?

diff --git a/virt/kvm/guest_mem.c b/virt/kvm/guest_mem.c
index a14eaac9dbad..8072ac901855 100644
--- a/virt/kvm/guest_mem.c
+++ b/virt/kvm/guest_mem.c
@@ -349,20 +349,35 @@ static int kvm_gmem_error_page(struct address_space *mapping, struct page *page)
struct kvm_gmem *gmem;
unsigned long index;
pgoff_t start, end;
- gfn_t gfn;
+ unsigned int order;
+ int nr_pages;
+ gfn_t gfn, gfn_end;

filemap_invalidate_lock_shared(mapping);

start = page->index;
end = start + thp_nr_pages(page);
+ nr_pages = thp_nr_pages(page);
+ order = thp_order(page);

list_for_each_entry(gmem, gmem_list, entry) {
xa_for_each_range(&gmem->bindings, index, slot, start, end - 1) {
- for (gfn = start; gfn < end; gfn++) {
- if (WARN_ON_ONCE(gfn < slot->base_gfn ||
- gfn >= slot->base_gfn + slot->npages))
- continue;
+ gfn = slot->base_gfn + page->index - slot->gmem.pgoff;

+ if (page->index + nr_pages <= slot->gmem.pgoff + slot->npages &&
+ !(gfn & ~((1ULL << order) - 1))) {
+ /*
+ * FIXME: Tell userspace that the *private*
+ * memory encountered an error.
+ */
+ send_sig_mceerr(BUS_MCEERR_AR,
+ (void __user *)gfn_to_hva_memslot(slot, gfn),
+ order, current);
+ break;
+ }
+
+ gfn_end = min(gfn + nr_pages, slot->base_gfn + slot->npages);
+ for (; gfn < gfn_end; gfn++) {
/*
* FIXME: Tell userspace that the *private*
* memory encountered an error.

--
Isaku Yamahata <isaku.yamahata@xxxxxxxxx>