[PATCH v1 09/16] kvm: arm/arm64: Delay stage2 page table allocation

From: Suzuki K Poulose
Date: Tue Jan 09 2018 - 14:05:25 EST


We allocate the entry level page tables for stage2 when the
VM is created. This doesn't give us the flexibility of configuring
the physical address space size for a VM. In order to allow
the VM to choose the required size, we delay the allocation of
stage2 entry level tables until we really try to map something.

This could be either when the VM creates a memory range or when
it tries to map a device memory. So we add in a hook to these
two places to make sure the tables are allocated. We use
kvm->slots_lock to serialize the allocation entry point, since
we add hooks to the arch specific call back with the mutex held.

Cc: Marc Zyngier <marc.zyngier@xxxxxxx>
Cc: Christoffer Dall <cdall@xxxxxxxxxx>
Signed-off-by: Suzuki K Poulose <suzuki.poulose@xxxxxxx>
---
virt/kvm/arm/arm.c | 18 ++++++----------
virt/kvm/arm/mmu.c | 61 +++++++++++++++++++++++++++++++++++++++++++++---------
2 files changed, 57 insertions(+), 22 deletions(-)

diff --git a/virt/kvm/arm/arm.c b/virt/kvm/arm/arm.c
index 19b720ddedce..d06f00566664 100644
--- a/virt/kvm/arm/arm.c
+++ b/virt/kvm/arm/arm.c
@@ -127,13 +127,13 @@ int kvm_arch_init_vm(struct kvm *kvm, unsigned long type)
for_each_possible_cpu(cpu)
*per_cpu_ptr(kvm->arch.last_vcpu_ran, cpu) = -1;

- ret = kvm_alloc_stage2_pgd(kvm);
- if (ret)
- goto out_fail_alloc;
-
ret = create_hyp_mappings(kvm, kvm + 1, PAGE_HYP);
- if (ret)
- goto out_free_stage2_pgd;
+ if (ret) {
+ free_percpu(kvm->arch.last_vcpu_ran);
+ kvm->arch.last_vcpu_ran = NULL;
+ return ret;
+ }
+

kvm_vgic_early_init(kvm);

@@ -145,12 +145,6 @@ int kvm_arch_init_vm(struct kvm *kvm, unsigned long type)
kvm_vgic_get_max_vcpus() : KVM_MAX_VCPUS;

return ret;
-out_free_stage2_pgd:
- kvm_free_stage2_pgd(kvm);
-out_fail_alloc:
- free_percpu(kvm->arch.last_vcpu_ran);
- kvm->arch.last_vcpu_ran = NULL;
- return ret;
}

bool kvm_arch_has_vcpu_debugfs(void)
diff --git a/virt/kvm/arm/mmu.c b/virt/kvm/arm/mmu.c
index c94c61ac38b9..257f2a8ccfc7 100644
--- a/virt/kvm/arm/mmu.c
+++ b/virt/kvm/arm/mmu.c
@@ -1011,15 +1011,39 @@ static int stage2_pmdp_test_and_clear_young(pmd_t *pmd)
return stage2_ptep_test_and_clear_young((pte_t *)pmd);
}

-/**
- * kvm_phys_addr_ioremap - map a device range to guest IPA
- *
- * @kvm: The KVM pointer
- * @guest_ipa: The IPA at which to insert the mapping
- * @pa: The physical address of the device
- * @size: The size of the mapping
+/*
+ * Finalise the stage2 page table layout. Must be called with kvm->slots_lock
+ * held.
*/
-int kvm_phys_addr_ioremap(struct kvm *kvm, phys_addr_t guest_ipa,
+static int __kvm_init_stage2_table(struct kvm *kvm)
+{
+ /* Double check if somebody has already allocated it */
+ if (likely(kvm->arch.pgd))
+ return 0;
+ return kvm_alloc_stage2_pgd(kvm);
+}
+
+static int kvm_init_stage2_table(struct kvm *kvm)
+{
+ int rc;
+
+ /*
+ * Once allocated, the stage2 entry level tables are only
+ * freed when the KVM instance is destroyed. So, if we see
+ * something valid here, that guarantees that we have
+ * done the one time allocation and it is something valid
+ * and won't go away until the last reference to the KVM
+ * is gone.
+ */
+ if (likely(kvm->arch.pgd))
+ return 0;
+ mutex_lock(&kvm->slots_lock);
+ rc = __kvm_init_stage2_table(kvm);
+ mutex_unlock(&kvm->slots_lock);
+ return rc;
+}
+
+static int __kvm_phys_addr_ioremap(struct kvm *kvm, phys_addr_t guest_ipa,
phys_addr_t pa, unsigned long size, bool writable)
{
phys_addr_t addr, end;
@@ -1055,6 +1079,23 @@ int kvm_phys_addr_ioremap(struct kvm *kvm, phys_addr_t guest_ipa,
return ret;
}

+/**
+ * kvm_phys_addr_ioremap - map a device range to guest IPA.
+ * Acquires kvm->slots_lock for making sure that the stage2 is initialized.
+ *
+ * @kvm: The KVM pointer
+ * @guest_ipa: The IPA at which to insert the mapping
+ * @pa: The physical address of the device
+ * @size: The size of the mapping
+ */
+int kvm_phys_addr_ioremap(struct kvm *kvm, phys_addr_t guest_ipa,
+ phys_addr_t pa, unsigned long size, bool writable)
+{
+ if (unlikely(kvm_init_stage2_table(kvm)))
+ return -ENOMEM;
+ return __kvm_phys_addr_ioremap(kvm, guest_ipa, pa, size, writable);
+}
+
static bool transparent_hugepage_adjust(kvm_pfn_t *pfnp, phys_addr_t *ipap)
{
kvm_pfn_t pfn = *pfnp;
@@ -1912,7 +1953,7 @@ int kvm_arch_prepare_memory_region(struct kvm *kvm,
goto out;
}

- ret = kvm_phys_addr_ioremap(kvm, gpa, pa,
+ ret = __kvm_phys_addr_ioremap(kvm, gpa, pa,
vm_end - vm_start,
writable);
if (ret)
@@ -1943,7 +1984,7 @@ void kvm_arch_free_memslot(struct kvm *kvm, struct kvm_memory_slot *free,
int kvm_arch_create_memslot(struct kvm *kvm, struct kvm_memory_slot *slot,
unsigned long npages)
{
- return 0;
+ return __kvm_init_stage2_table(kvm);
}

void kvm_arch_memslots_updated(struct kvm *kvm, struct kvm_memslots *slots)
--
2.13.6