Re: [RFCv2 14/16] KVM: Handle protected memory in __kvm_map_gfn()/__kvm_unmap_gfn()

From: Edgecombe, Rick P
Date: Wed Oct 21 2020 - 14:50:45 EST


On Tue, 2020-10-20 at 09:18 +0300, Kirill A. Shutemov wrote:
> We cannot access protected pages directly. Use ioremap() to
> create a temporary mapping of the page. The mapping is destroyed
> on __kvm_unmap_gfn().
>
> The new interface gfn_to_pfn_memslot_protected() is used to detect if
> the page is protected.
>
> ioremap_cache_force() is a hack to bypass IORES_MAP_SYSTEM_RAM check
> in
> the x86 ioremap code. We need a better solution.
>
> Signed-off-by: Kirill A. Shutemov <kirill.shutemov@xxxxxxxxxxxxxxx>
> ---
> arch/powerpc/kvm/book3s_64_mmu_hv.c | 2 +-
> arch/powerpc/kvm/book3s_64_mmu_radix.c | 2 +-
> arch/x86/include/asm/io.h | 2 +
> arch/x86/include/asm/pgtable_types.h | 1 +
> arch/x86/kvm/mmu/mmu.c | 6 ++-
> arch/x86/mm/ioremap.c | 16 ++++++--
> include/linux/kvm_host.h | 3 +-
> include/linux/kvm_types.h | 1 +
> virt/kvm/kvm_main.c | 52 +++++++++++++++++++-----
> --
> 9 files changed, 63 insertions(+), 22 deletions(-)
>
> diff --git a/arch/powerpc/kvm/book3s_64_mmu_hv.c
> b/arch/powerpc/kvm/book3s_64_mmu_hv.c
> index 38ea396a23d6..8e06cd3f759c 100644
> --- a/arch/powerpc/kvm/book3s_64_mmu_hv.c
> +++ b/arch/powerpc/kvm/book3s_64_mmu_hv.c
> @@ -590,7 +590,7 @@ int kvmppc_book3s_hv_page_fault(struct kvm_vcpu
> *vcpu,
> } else {
> /* Call KVM generic code to do the slow-path check */
> pfn = __gfn_to_pfn_memslot(memslot, gfn, false, NULL,
> - writing, &write_ok);
> + writing, &write_ok, NULL);
> if (is_error_noslot_pfn(pfn))
> return -EFAULT;
> page = NULL;
> diff --git a/arch/powerpc/kvm/book3s_64_mmu_radix.c
> b/arch/powerpc/kvm/book3s_64_mmu_radix.c
> index 22a677b18695..6fd4e3f9b66a 100644
> --- a/arch/powerpc/kvm/book3s_64_mmu_radix.c
> +++ b/arch/powerpc/kvm/book3s_64_mmu_radix.c
> @@ -822,7 +822,7 @@ int kvmppc_book3s_instantiate_page(struct
> kvm_vcpu *vcpu,
>
> /* Call KVM generic code to do the slow-path check */
> pfn = __gfn_to_pfn_memslot(memslot, gfn, false, NULL,
> - writing, upgrade_p);
> + writing, upgrade_p, NULL);
> if (is_error_noslot_pfn(pfn))
> return -EFAULT;
> page = NULL;
> diff --git a/arch/x86/include/asm/io.h b/arch/x86/include/asm/io.h
> index c58d52fd7bf2..a3e1bfad1026 100644
> --- a/arch/x86/include/asm/io.h
> +++ b/arch/x86/include/asm/io.h
> @@ -184,6 +184,8 @@ extern void __iomem *ioremap_uc(resource_size_t
> offset, unsigned long size);
> #define ioremap_uc ioremap_uc
> extern void __iomem *ioremap_cache(resource_size_t offset, unsigned
> long size);
> #define ioremap_cache ioremap_cache
> +extern void __iomem *ioremap_cache_force(resource_size_t offset,
> unsigned long size);
> +#define ioremap_cache_force ioremap_cache_force
> extern void __iomem *ioremap_prot(resource_size_t offset, unsigned
> long size, unsigned long prot_val);
> #define ioremap_prot ioremap_prot
> extern void __iomem *ioremap_encrypted(resource_size_t phys_addr,
> unsigned long size);
> diff --git a/arch/x86/include/asm/pgtable_types.h
> b/arch/x86/include/asm/pgtable_types.h
> index 816b31c68550..4c16a9583786 100644
> --- a/arch/x86/include/asm/pgtable_types.h
> +++ b/arch/x86/include/asm/pgtable_types.h
> @@ -147,6 +147,7 @@ enum page_cache_mode {
> _PAGE_CACHE_MODE_UC = 3,
> _PAGE_CACHE_MODE_WT = 4,
> _PAGE_CACHE_MODE_WP = 5,
> + _PAGE_CACHE_MODE_WB_FORCE = 6,
>
> _PAGE_CACHE_MODE_NUM = 8
> };
> diff --git a/arch/x86/kvm/mmu/mmu.c b/arch/x86/kvm/mmu/mmu.c
> index 71aa3da2a0b7..162cb285b87b 100644
> --- a/arch/x86/kvm/mmu/mmu.c
> +++ b/arch/x86/kvm/mmu/mmu.c
> @@ -4058,7 +4058,8 @@ static bool try_async_pf(struct kvm_vcpu *vcpu,
> bool prefault, gfn_t gfn,
> }
>
> async = false;
> - *pfn = __gfn_to_pfn_memslot(slot, gfn, false, &async, write,
> writable);
> + *pfn = __gfn_to_pfn_memslot(slot, gfn, false, &async, write,
> writable,
> + NULL);
> if (!async)
> return false; /* *pfn has correct page already */
>
> @@ -4072,7 +4073,8 @@ static bool try_async_pf(struct kvm_vcpu *vcpu,
> bool prefault, gfn_t gfn,
> return true;
> }
>
> - *pfn = __gfn_to_pfn_memslot(slot, gfn, false, NULL, write,
> writable);
> + *pfn = __gfn_to_pfn_memslot(slot, gfn, false, NULL, write,
> writable,
> + NULL);
> return false;
> }
>
> diff --git a/arch/x86/mm/ioremap.c b/arch/x86/mm/ioremap.c
> index 9e5ccc56f8e0..4409785e294c 100644
> --- a/arch/x86/mm/ioremap.c
> +++ b/arch/x86/mm/ioremap.c
> @@ -202,9 +202,12 @@ __ioremap_caller(resource_size_t phys_addr,
> unsigned long size,
> __ioremap_check_mem(phys_addr, size, &io_desc);
>
> /*
> - * Don't allow anybody to remap normal RAM that we're using..
> + * Don't allow anybody to remap normal RAM that we're using,
> unless
> + * _PAGE_CACHE_MODE_WB_FORCE is used.
> */
> - if (io_desc.flags & IORES_MAP_SYSTEM_RAM) {
> + if (pcm == _PAGE_CACHE_MODE_WB_FORCE) {
> + pcm = _PAGE_CACHE_MODE_WB;
> + } else if (io_desc.flags & IORES_MAP_SYSTEM_RAM) {
> WARN_ONCE(1, "ioremap on RAM at %pa - %pa\n",
> &phys_addr, &last_addr);
> return NULL;
> @@ -419,6 +422,13 @@ void __iomem *ioremap_cache(resource_size_t
> phys_addr, unsigned long size)
> }
> EXPORT_SYMBOL(ioremap_cache);
>
> +void __iomem *ioremap_cache_force(resource_size_t phys_addr,
> unsigned long size)
> +{
> + return __ioremap_caller(phys_addr, size,
> _PAGE_CACHE_MODE_WB_FORCE,
> + __builtin_return_address(0), false);
> +}
> +EXPORT_SYMBOL(ioremap_cache_force);
> +
> void __iomem *ioremap_prot(resource_size_t phys_addr, unsigned long
> size,
> unsigned long prot_val)
> {
> @@ -467,7 +477,7 @@ void iounmap(volatile void __iomem *addr)
> p = find_vm_area((void __force *)addr);
>
> if (!p) {
> - printk(KERN_ERR "iounmap: bad address %p\n", addr);
> + printk(KERN_ERR "iounmap: bad address %px\n", addr);

Unintentional?

> dump_stack();
> return;
> }
> diff --git a/include/linux/kvm_host.h b/include/linux/kvm_host.h
> index 6655e8da4555..0d5f3885747b 100644
> --- a/include/linux/kvm_host.h
> +++ b/include/linux/kvm_host.h
> @@ -238,6 +238,7 @@ struct kvm_host_map {
> void *hva;
> kvm_pfn_t pfn;
> kvm_pfn_t gfn;
> + bool protected;
> };
>
> /*
> @@ -725,7 +726,7 @@ kvm_pfn_t gfn_to_pfn_memslot(struct
> kvm_memory_slot *slot, gfn_t gfn);
> kvm_pfn_t gfn_to_pfn_memslot_atomic(struct kvm_memory_slot *slot,
> gfn_t gfn);
> kvm_pfn_t __gfn_to_pfn_memslot(struct kvm_memory_slot *slot, gfn_t
> gfn,
> bool atomic, bool *async, bool
> write_fault,
> - bool *writable);
> + bool *writable, bool *protected);
>
> void kvm_release_pfn_clean(kvm_pfn_t pfn);
> void kvm_release_pfn_dirty(kvm_pfn_t pfn);
> diff --git a/include/linux/kvm_types.h b/include/linux/kvm_types.h
> index a7580f69dda0..0a8c6426b4f4 100644
> --- a/include/linux/kvm_types.h
> +++ b/include/linux/kvm_types.h
> @@ -58,6 +58,7 @@ struct gfn_to_pfn_cache {
> gfn_t gfn;
> kvm_pfn_t pfn;
> bool dirty;
> + bool protected;
> };
>
> #ifdef KVM_ARCH_NR_OBJS_PER_MEMORY_CACHE
> diff --git a/virt/kvm/kvm_main.c b/virt/kvm/kvm_main.c
> index 9b569b78874a..7c2c764c28c5 100644
> --- a/virt/kvm/kvm_main.c
> +++ b/virt/kvm/kvm_main.c
> @@ -1852,9 +1852,10 @@ static bool hva_to_pfn_fast(unsigned long
> addr, bool write_fault,
> * 1 indicates success, -errno is returned if error is detected.
> */
> static int hva_to_pfn_slow(unsigned long addr, bool *async, bool
> write_fault,
> - bool *writable, kvm_pfn_t *pfn)
> + bool *writable, bool *protected, kvm_pfn_t
> *pfn)
> {
> unsigned int flags = FOLL_HWPOISON | FOLL_KVM;
> + struct vm_area_struct *vma;
> struct page *page;
> int npages = 0;
>
> @@ -1868,9 +1869,15 @@ static int hva_to_pfn_slow(unsigned long addr,
> bool *async, bool write_fault,
> if (async)
> flags |= FOLL_NOWAIT;
>
> - npages = get_user_pages_unlocked(addr, 1, &page, flags);
> - if (npages != 1)
> + mmap_read_lock(current->mm);
> + npages = get_user_pages(addr, 1, flags, &page, &vma);
> + if (npages != 1) {
> + mmap_read_unlock(current->mm);
> return npages;
> + }
> + if (protected)
> + *protected = vma_is_kvm_protected(vma);
> + mmap_read_unlock(current->mm);
>
> /* map read fault as writable if possible */
> if (unlikely(!write_fault) && writable) {
> @@ -1961,7 +1968,7 @@ static int hva_to_pfn_remapped(struct
> vm_area_struct *vma,
> * whether the mapping is writable.
> */
> static kvm_pfn_t hva_to_pfn(unsigned long addr, bool atomic, bool
> *async,
> - bool write_fault, bool *writable)
> + bool write_fault, bool *writable, bool
> *protected)
> {
> struct vm_area_struct *vma;
> kvm_pfn_t pfn = 0;
> @@ -1976,7 +1983,8 @@ static kvm_pfn_t hva_to_pfn(unsigned long addr,
> bool atomic, bool *async,
> if (atomic)
> return KVM_PFN_ERR_FAULT;
>
> - npages = hva_to_pfn_slow(addr, async, write_fault, writable,
> &pfn);
> + npages = hva_to_pfn_slow(addr, async, write_fault, writable,
> protected,
> + &pfn);
> if (npages == 1)
> return pfn;
>
> @@ -2010,7 +2018,7 @@ static kvm_pfn_t hva_to_pfn(unsigned long addr,
> bool atomic, bool *async,
>
> kvm_pfn_t __gfn_to_pfn_memslot(struct kvm_memory_slot *slot, gfn_t
> gfn,
> bool atomic, bool *async, bool
> write_fault,
> - bool *writable)
> + bool *writable, bool *protected)
> {
> unsigned long addr = __gfn_to_hva_many(slot, gfn, NULL,
> write_fault);
>
> @@ -2033,7 +2041,7 @@ kvm_pfn_t __gfn_to_pfn_memslot(struct
> kvm_memory_slot *slot, gfn_t gfn,
> }
>
> return hva_to_pfn(addr, atomic, async, write_fault,
> - writable);
> + writable, protected);
> }
> EXPORT_SYMBOL_GPL(__gfn_to_pfn_memslot);
>
> @@ -2041,19 +2049,26 @@ kvm_pfn_t gfn_to_pfn_prot(struct kvm *kvm,
> gfn_t gfn, bool write_fault,
> bool *writable)
> {
> return __gfn_to_pfn_memslot(gfn_to_memslot(kvm, gfn), gfn,
> false, NULL,
> - write_fault, writable);
> + write_fault, writable, NULL);
> }
> EXPORT_SYMBOL_GPL(gfn_to_pfn_prot);
>
> kvm_pfn_t gfn_to_pfn_memslot(struct kvm_memory_slot *slot, gfn_t
> gfn)
> {
> - return __gfn_to_pfn_memslot(slot, gfn, false, NULL, true,
> NULL);
> + return __gfn_to_pfn_memslot(slot, gfn, false, NULL, true, NULL,
> NULL);
> }
> EXPORT_SYMBOL_GPL(gfn_to_pfn_memslot);
>
> +static kvm_pfn_t gfn_to_pfn_memslot_protected(struct kvm_memory_slot
> *slot,
> + gfn_t gfn, bool
> *protected)
> +{
> + return __gfn_to_pfn_memslot(slot, gfn, false, NULL, true, NULL,
> + protected);
> +}
> +
> kvm_pfn_t gfn_to_pfn_memslot_atomic(struct kvm_memory_slot *slot,
> gfn_t gfn)
> {
> - return __gfn_to_pfn_memslot(slot, gfn, true, NULL, true, NULL);
> + return __gfn_to_pfn_memslot(slot, gfn, true, NULL, true, NULL,
> NULL);
> }
> EXPORT_SYMBOL_GPL(gfn_to_pfn_memslot_atomic);
>
> @@ -2134,7 +2149,7 @@ static void kvm_cache_gfn_to_pfn(struct
> kvm_memory_slot *slot, gfn_t gfn,
> {
> kvm_release_pfn(cache->pfn, cache->dirty, cache);
>
> - cache->pfn = gfn_to_pfn_memslot(slot, gfn);
> + cache->pfn = gfn_to_pfn_memslot_protected(slot, gfn, &cache-
> >protected);
> cache->gfn = gfn;
> cache->dirty = false;
> cache->generation = gen;
> @@ -2149,6 +2164,7 @@ static int __kvm_map_gfn(struct kvm_memslots
> *slots, gfn_t gfn,
> void *hva = NULL;
> struct page *page = KVM_UNMAPPED_PAGE;
> struct kvm_memory_slot *slot = __gfn_to_memslot(slots, gfn);
> + bool protected;
> u64 gen = slots->generation;
>
> if (!map)
> @@ -2162,15 +2178,20 @@ static int __kvm_map_gfn(struct kvm_memslots
> *slots, gfn_t gfn,
> kvm_cache_gfn_to_pfn(slot, gfn, cache, gen);
> }
> pfn = cache->pfn;
> + protected = cache->protected;
> } else {
> if (atomic)
> return -EAGAIN;
> - pfn = gfn_to_pfn_memslot(slot, gfn);
> + pfn = gfn_to_pfn_memslot_protected(slot, gfn,
> &protected);
> }
> if (is_error_noslot_pfn(pfn))
> return -EINVAL;
>
> - if (pfn_valid(pfn)) {
> + if (protected) {
> + if (atomic)
> + return -EAGAIN;
> + hva = ioremap_cache_force(pfn_to_hpa(pfn), PAGE_SIZE);
> + } else if (pfn_valid(pfn)) {
> page = pfn_to_page(pfn);
> if (atomic)
> hva = kmap_atomic(page);

I think the page could have got unmapped since the gup via the
hypercall on another CPU. It could be an avenue for the guest to crash
the host.

> @@ -2191,6 +2212,7 @@ static int __kvm_map_gfn(struct kvm_memslots
> *slots, gfn_t gfn,
> map->hva = hva;
> map->pfn = pfn;
> map->gfn = gfn;
> + map->protected = protected;
>
> return 0;
> }
> @@ -2221,7 +2243,9 @@ static void __kvm_unmap_gfn(struct
> kvm_memory_slot *memslot,
> if (!map->hva)
> return;
>
> - if (map->page != KVM_UNMAPPED_PAGE) {
> + if (map->protected) {
> + iounmap(map->hva);
> + } else if (map->page != KVM_UNMAPPED_PAGE) {
> if (atomic)
> kunmap_atomic(map->hva);
> else