Re: [syzbot] [kvm?] WARNING in clear_dirty_gfn_range

From: David Matlack
Date: Fri Mar 15 2024 - 14:29:43 EST


On 2024-03-15 11:07 AM, David Matlack wrote:
> On Tue, Mar 12, 2024 at 4:34 PM syzbot
> <syzbot+900d58a45dcaab9e4821@xxxxxxxxxxxxxxxxxxxxxxxxx> wrote:
> >
> > ------------[ cut here ]------------
> > WARNING: CPU: 1 PID: 5165 at arch/x86/kvm/mmu/tdp_mmu.c:1526 clear_dirty_gfn_range+0x3d6/0x540 arch/x86/kvm/mmu/tdp_mmu.c:1526
> > Modules linked in:
> > CPU: 1 PID: 5165 Comm: syz-executor417 Not tainted 6.8.0-syzkaller-01185-g855684c7d938 #0
> > Hardware name: QEMU Standard PC (Q35 + ICH9, 2009), BIOS 1.16.2-debian-1.16.2-1 04/01/2014
> > RIP: 0010:clear_dirty_gfn_range+0x3d6/0x540 arch/x86/kvm/mmu/tdp_mmu.c:1526
> > Call Trace:
> > <TASK>
> > kvm_tdp_mmu_clear_dirty_slot+0x24f/0x2e0 arch/x86/kvm/mmu/tdp_mmu.c:1557
> > kvm_mmu_slot_leaf_clear_dirty+0x38b/0x490 arch/x86/kvm/mmu/mmu.c:6783
> > kvm_mmu_slot_apply_flags arch/x86/kvm/x86.c:12962 [inline]
> > kvm_arch_commit_memory_region+0x299/0x490 arch/x86/kvm/x86.c:13031
> > kvm_commit_memory_region arch/x86/kvm/../../../virt/kvm/kvm_main.c:1751 [inline]
> > kvm_set_memslot+0x4d3/0x13e0 arch/x86/kvm/../../../virt/kvm/kvm_main.c:1994
> > __kvm_set_memory_region arch/x86/kvm/../../../virt/kvm/kvm_main.c:2129 [inline]
> > __kvm_set_memory_region+0xdbc/0x1520 arch/x86/kvm/../../../virt/kvm/kvm_main.c:2020
> > kvm_set_memory_region arch/x86/kvm/../../../virt/kvm/kvm_main.c:2150 [inline]
> > kvm_vm_ioctl_set_memory_region arch/x86/kvm/../../../virt/kvm/kvm_main.c:2162 [inline]
> > kvm_vm_ioctl+0x151c/0x3e20 arch/x86/kvm/../../../virt/kvm/kvm_main.c:5152
>
> The reproducer uses nested virtualization to launch an L2 with EPT
> disabled. This creates a TDP MMU root with role.guest_mode=1, which
> triggers the WARN_ON() in clear_dirty_gfn_range() because
> kvm_mmu_page_ad_need_write_protect() returns true whenever PML is
> enabled and the shadow page has role.guest_mode=1, i.e. such SPTEs
> must be write-protected instead of having their D-bits cleared.
>
> If I'm reading prepare_vmcs02_constant_state() correctly, we always
> disable PML when running in L2. So when enable_pml=1 and L2 runs with
> EPT disabled, writes made by L2 are never reflected in the dirty log.
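
For reference, the vmcs02 setup I'm referring to above looks roughly
like this (paraphrased from prepare_vmcs02_constant_state() from
memory, so treat the exact lines as approximate):

	/*
	 * PML is never enabled in hardware for L2; the PML fields in
	 * vmcs02 are only given benign placeholder values.
	 */
	if (enable_pml) {
		vmcs_write64(PML_ADDRESS, 0);
		vmcs_write16(GUEST_PML_INDEX, -1);
	}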

+Vipin

I believe this was introduced by 6.4 commit 5982a5392663 ("KVM: x86/mmu:
Use kvm_ad_enabled() to determine if TDP MMU SPTEs need wrprot").
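
For context, kvm_mmu_page_ad_need_write_protect() boils down to the
following (paraphrasing mmu_internal.h from memory, so the exact code
may differ slightly):

	static inline bool kvm_mmu_page_ad_need_write_protect(struct kvm_mmu_page *sp)
	{
		/*
		 * PML can't be relied on for pages used to run L2, so
		 * dirty logging has to fall back to write protection.
		 */
		return kvm_x86_ops.cpu_dirty_log_size && sp->role.guest_mode;
	}

So with PML enabled, SPTEs under a guest_mode=1 root are created with
the wrprot-only A/D policy, but since 5982a5392663,
clear_dirty_gfn_range() consults only kvm_ad_enabled() when deciding
whether to clear D-bits or the W-bit, which is exactly the mismatch
the KVM_MMU_WARN_ON() catches.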

I see two options to fix:

1. Allow PML to be enabled when L2 is running with EPT disabled.
2. Fix the TDP MMU logic to write-protect role.guest_mode=1 SPTEs.

(1.) sounds more complicated and will require more testing. (2.) is
quite simple since an entire TDP MMU tree is either guest_mode=0 or
guest_mode=1.

Example fix (fixes syzbot repro but otherwise untested):

diff --git a/arch/x86/kvm/mmu/tdp_mmu.c b/arch/x86/kvm/mmu/tdp_mmu.c
index 6ae19b4ee5b1..eb6fb8d9c00c 100644
--- a/arch/x86/kvm/mmu/tdp_mmu.c
+++ b/arch/x86/kvm/mmu/tdp_mmu.c
@@ -1498,6 +1498,24 @@ void kvm_tdp_mmu_try_split_huge_pages(struct kvm *kvm,
 	}
 }
 
+static inline u64 tdp_mmu_dirty_bit(struct kvm_mmu_page *root, bool force_wrprot)
+{
+	if (force_wrprot || kvm_mmu_page_ad_need_write_protect(root) || !kvm_ad_enabled())
+		return PT_WRITABLE_MASK;
+
+	return shadow_dirty_mask;
+}
+
+static inline bool tdp_mmu_dirty_bit_invalid_for_spte(struct tdp_iter *iter, u64 dbit)
+{
+	/*
+	 * The decision of whether to clear the D-bit or W-bit is made based on
+	 * the root, as all TDP MMU SPTEs within a root should require the same
+	 * modifications. This check ensures that is actually the case.
+	 */
+	return dbit == shadow_dirty_mask && spte_ad_need_write_protect(iter->old_spte);
+}
+
 /*
  * Clear the dirty status of all the SPTEs mapping GFNs in the memslot. If
  * AD bits are enabled, this will involve clearing the dirty bit on each SPTE.
@@ -1508,7 +1526,7 @@ void kvm_tdp_mmu_try_split_huge_pages(struct kvm *kvm,
 static bool clear_dirty_gfn_range(struct kvm *kvm, struct kvm_mmu_page *root,
 				  gfn_t start, gfn_t end)
 {
-	u64 dbit = kvm_ad_enabled() ? shadow_dirty_mask : PT_WRITABLE_MASK;
+	u64 dbit = tdp_mmu_dirty_bit(root, false);
 	struct tdp_iter iter;
 	bool spte_set = false;
 
@@ -1523,8 +1541,7 @@ static bool clear_dirty_gfn_range(struct kvm *kvm, struct kvm_mmu_page *root,
 		if (tdp_mmu_iter_cond_resched(kvm, &iter, false, true))
 			continue;
 
-		KVM_MMU_WARN_ON(kvm_ad_enabled() &&
-				spte_ad_need_write_protect(iter.old_spte));
+		KVM_MMU_WARN_ON(tdp_mmu_dirty_bit_invalid_for_spte(&iter, dbit));
 
 		if (!(iter.old_spte & dbit))
 			continue;
@@ -1570,8 +1587,7 @@ bool kvm_tdp_mmu_clear_dirty_slot(struct kvm *kvm,
 static void clear_dirty_pt_masked(struct kvm *kvm, struct kvm_mmu_page *root,
 				  gfn_t gfn, unsigned long mask, bool wrprot)
 {
-	u64 dbit = (wrprot || !kvm_ad_enabled()) ? PT_WRITABLE_MASK :
-						   shadow_dirty_mask;
+	u64 dbit = tdp_mmu_dirty_bit(root, wrprot);
 	struct tdp_iter iter;
 
 	lockdep_assert_held_write(&kvm->mmu_lock);
@@ -1583,8 +1599,7 @@ static void clear_dirty_pt_masked(struct kvm *kvm, struct kvm_mmu_page *root,
 		if (!mask)
 			break;
 
-		KVM_MMU_WARN_ON(kvm_ad_enabled() &&
-				spte_ad_need_write_protect(iter.old_spte));
+		KVM_MMU_WARN_ON(tdp_mmu_dirty_bit_invalid_for_spte(&iter, dbit));
 
 		if (iter.level > PG_LEVEL_4K ||
 		    !(mask & (1UL << (iter.gfn - gfn))))
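
To spell out the intended behavior of the helper (my reading of the
patch above; l1_root/l2_root are just illustrative names):

	/* guest_mode=1 root (the syzbot case, PML enabled): write-protect. */
	tdp_mmu_dirty_bit(l2_root, false);	/* == PT_WRITABLE_MASK */

	/* guest_mode=0 root with EPT A/D bits: keep clearing D-bits. */
	tdp_mmu_dirty_bit(l1_root, false);	/* == shadow_dirty_mask */

And the reworked KVM_MMU_WARN_ON() only fires if a wrprot-only SPTE
shows up under a root for which we decided to clear D-bits, i.e. if
SPTEs within a single root ever disagree with the root's policy.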