[PATCH] KVM: x86/mmu: fix potential races when walking host page table

From: Mingwei Zhang
Date: Thu Apr 28 2022 - 23:18:20 EST


Implement a KVM function for walking host page table in x86 architecture
and stop using lookup_address_in_mm(). lookup_address_in_mm() is basically
lookup_address_in_pgd() in mm. This function suffers from several issues:

- No use of READ_ONCE(*). This allows multiple dereferences of the same
page table entry. The resulting TOCTOU problem may cause the kernel to
incorrectly treat a newly generated leaf entry as a non-leaf one and
dereference its content by using its pfn value.

- Incorrect information returned. lookup_address_in_mm() returns a pte_t
pointer and level regardless of the 'presentness' of the entry, i.e.,
even if a PXE entry is 'non-present', as long as it is not 'none', the
function still returns its level. In comparison, KVM needs the level
information of only 'present' entries. This is a clear bug and may cause
KVM to incorrectly regard a non-present PXE entry as a present large page
mapping.

lookup_address_in_mm() and its relevant functions are generally helpful
only for walking kernel addresses that have mostly static mappings and no
page table tear down would happen. Patching this function does not help
other callers, since its return value: a PTE pointer, is NEVER safe to
dereference after the function returns.

Cc: Sean Christopherson <seanjc@xxxxxxxxxx>
Cc: Paolo Bonzini <pbonzini@xxxxxxxxxx>

Signed-off-by: Mingwei Zhang <mizhang@xxxxxxxxxx>
---
arch/x86/kvm/mmu/mmu.c | 8 +----
arch/x86/kvm/x86.c | 70 ++++++++++++++++++++++++++++++++++++++++++
arch/x86/kvm/x86.h | 2 ++
3 files changed, 73 insertions(+), 7 deletions(-)

diff --git a/arch/x86/kvm/mmu/mmu.c b/arch/x86/kvm/mmu/mmu.c
index 904f0faff2186..6db195e5eae94 100644
--- a/arch/x86/kvm/mmu/mmu.c
+++ b/arch/x86/kvm/mmu/mmu.c
@@ -2822,8 +2822,6 @@ static int host_pfn_mapping_level(struct kvm *kvm, gfn_t gfn, kvm_pfn_t pfn,
const struct kvm_memory_slot *slot)
{
unsigned long hva;
- pte_t *pte;
- int level;

if (!PageCompound(pfn_to_page(pfn)) && !kvm_is_zone_device_pfn(pfn))
return PG_LEVEL_4K;
@@ -2838,11 +2836,7 @@ static int host_pfn_mapping_level(struct kvm *kvm, gfn_t gfn, kvm_pfn_t pfn,
*/
hva = __gfn_to_hva_memslot(slot, gfn);

- pte = lookup_address_in_mm(kvm->mm, hva, &level);
- if (unlikely(!pte))
- return PG_LEVEL_4K;
-
- return level;
+ return kvm_lookup_address_level_in_mm(kvm, hva);
}

int kvm_mmu_max_mapping_level(struct kvm *kvm,
diff --git a/arch/x86/kvm/x86.c b/arch/x86/kvm/x86.c
index 951d0a78ccdae..61406efe4ea7f 100644
--- a/arch/x86/kvm/x86.c
+++ b/arch/x86/kvm/x86.c
@@ -13044,6 +13044,76 @@ int kvm_sev_es_string_io(struct kvm_vcpu *vcpu, unsigned int size,
}
EXPORT_SYMBOL_GPL(kvm_sev_es_string_io);

+/*
+ * Lookup the valid mapping level for a virtual address in the current mm.
+ * Return the level of the mapping if a present one exists. Otherwise, always
+ * return PG_LEVEL_NONE.
+ *
+ * Note: the information retrieved may be stale. Use it with caution.
+ */
+int kvm_lookup_address_level_in_mm(struct kvm *kvm, unsigned long address)
+{
+ pgd_t *pgdp, pgd;
+ p4d_t *p4dp, p4d;
+ pud_t *pudp, pud;
+ pmd_t *pmdp, pmd;
+ pte_t *ptep, pte;
+ unsigned long flags;
+ int level = PG_LEVEL_NONE;
+
+ /* Disable IRQs to prevent any tear down of page tables. */
+ local_irq_save(flags);
+
+ pgdp = pgd_offset(kvm->mm, address);
+ pgd = READ_ONCE(*pgdp);
+ if (pgd_none(pgd))
+ goto out;
+
+ p4dp = p4d_offset(pgdp, address);
+ p4d = READ_ONCE(*p4dp);
+ if (p4d_none(p4d) || !p4d_present(p4d))
+ goto out;
+
+ if (p4d_large(p4d)) {
+ level = PG_LEVEL_512G;
+ goto out;
+ }
+
+ pudp = pud_offset(p4dp, address);
+ pud = READ_ONCE(*pudp);
+ if (pud_none(pud) || !pud_present(pud))
+ goto out;
+
+ if (pud_large(pud)) {
+ level = PG_LEVEL_1G;
+ goto out;
+ }
+
+ pmdp = pmd_offset(pudp, address);
+ pmd = READ_ONCE(*pmdp);
+ if (pmd_none(pmd) || !pmd_present(pmd))
+ goto out;
+
+ if (pmd_large(pmd)) {
+ level = PG_LEVEL_2M;
+ goto out;
+ }
+
+ ptep = pte_offset_map(&pmd, address);
+ pte = ptep_get(ptep);
+ if (pte_present(pte)) {
+ pte_unmap(ptep);
+ level = PG_LEVEL_4K;
+ goto out;
+ }
+ pte_unmap(ptep);
+
+out:
+ local_irq_restore(flags);
+ return level;
+}
+EXPORT_SYMBOL_GPL(kvm_lookup_address_level_in_mm);
+
EXPORT_TRACEPOINT_SYMBOL_GPL(kvm_entry);
EXPORT_TRACEPOINT_SYMBOL_GPL(kvm_exit);
EXPORT_TRACEPOINT_SYMBOL_GPL(kvm_fast_mmio);
diff --git a/arch/x86/kvm/x86.h b/arch/x86/kvm/x86.h
index 588792f003345..f1cdcc8483bd0 100644
--- a/arch/x86/kvm/x86.h
+++ b/arch/x86/kvm/x86.h
@@ -454,4 +454,6 @@ int kvm_sev_es_string_io(struct kvm_vcpu *vcpu, unsigned int size,
unsigned int port, void *data, unsigned int count,
int in);

+int kvm_lookup_address_level_in_mm(struct kvm *kvm, unsigned long address);
+
#endif

base-commit: 2a39d8b39bffdaf1a4223d0d22f07baee154c8f3
--
2.36.0.464.gb9c8b46e94-goog