Re: [PATCH 28/28] KVM: x86/mmu: Defer TLB flush to caller when freeing TDP MMU shadow pages

From: Ben Gardon
Date: Tue Nov 23 2021 - 15:12:33 EST


On Fri, Nov 19, 2021 at 8:51 PM Sean Christopherson <seanjc@xxxxxxxxxx> wrote:
>
> Defer TLB flushes to the caller when freeing TDP MMU shadow pages instead
> of immediately flushing. Because the shadow pages are freed in an RCU
> callback, so long as at least one CPU holds RCU, all CPUs are protected.
> For vCPUs running in the guest, i.e. consuming TLB entries, KVM only
> needs to ensure the caller services the pending TLB flush before dropping
> its RCU protections. I.e. use the caller's RCU as a proxy for all vCPUs
> running in the guest.
>
> Deferring the flushes allows batching them, e.g. when installing a 1gb
> hugepage and zapping a pile of SPs, and, when zapping an entire root,
> allows skipping the flush entirely (because no flush is needed in that
> case).
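
For readers following along, the caller-side contract this creates is
roughly the sketch below (for_each_sp_to_zap() and zap_one_sp() are
hypothetical names for illustration, not code from this patch; only
rcu_read_lock()/rcu_read_unlock() and kvm_flush_remote_tlbs() are real
APIs): zap under RCU, then service any pending flush before dropping
RCU, because dropping RCU is what lets the grace period end and the
zapped SPs actually be freed.

	bool flush = false;

	rcu_read_lock();

	/* Zapping queues SPs for freeing via call_rcu(); with this patch
	 * it no longer flushes TLBs itself. */
	for_each_sp_to_zap(kvm, sp)		/* hypothetical iterator */
		flush |= zap_one_sp(kvm, sp);	/* hypothetical helper */

	/* Service the deferred flush before dropping RCU protection. */
	if (flush)
		kvm_flush_remote_tlbs(kvm);

	rcu_read_unlock();
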
>
> Signed-off-by: Sean Christopherson <seanjc@xxxxxxxxxx>

Reviewed-by: Ben Gardon <bgardon@xxxxxxxxxx>

> ---
> arch/x86/kvm/mmu/mmu.c | 12 ++++++++++++
> arch/x86/kvm/mmu/tdp_iter.h | 7 +++----
> arch/x86/kvm/mmu/tdp_mmu.c | 23 +++++++++++------------
> 3 files changed, 26 insertions(+), 16 deletions(-)
>
> diff --git a/arch/x86/kvm/mmu/mmu.c b/arch/x86/kvm/mmu/mmu.c
> index ef689b8bab12..7aab9737dffa 100644
> --- a/arch/x86/kvm/mmu/mmu.c
> +++ b/arch/x86/kvm/mmu/mmu.c
> @@ -6237,6 +6237,12 @@ static void kvm_recover_nx_lpages(struct kvm *kvm)
> rcu_idx = srcu_read_lock(&kvm->srcu);
> write_lock(&kvm->mmu_lock);
>
> + /*
> + * Zapping TDP MMU shadow pages, including the remote TLB flush, must
> + * be done under RCU protection, because the pages are freed via RCU callback.
> + */
> + rcu_read_lock();
> +
> ratio = READ_ONCE(nx_huge_pages_recovery_ratio);
> to_zap = ratio ? DIV_ROUND_UP(nx_lpage_splits, ratio) : 0;
> for ( ; to_zap; --to_zap) {
> @@ -6261,12 +6267,18 @@ static void kvm_recover_nx_lpages(struct kvm *kvm)
>
> if (need_resched() || rwlock_needbreak(&kvm->mmu_lock)) {
> kvm_mmu_remote_flush_or_zap(kvm, &invalid_list, flush);
> + rcu_read_unlock();
> +
> cond_resched_rwlock_write(&kvm->mmu_lock);
> flush = false;
> +
> + rcu_read_lock();
> }
> }
> kvm_mmu_remote_flush_or_zap(kvm, &invalid_list, flush);
>
> + rcu_read_unlock();
> +
> write_unlock(&kvm->mmu_lock);
> srcu_read_unlock(&kvm->srcu, rcu_idx);
> }
> diff --git a/arch/x86/kvm/mmu/tdp_iter.h b/arch/x86/kvm/mmu/tdp_iter.h
> index 0693f1fdb81e..0299703fc844 100644
> --- a/arch/x86/kvm/mmu/tdp_iter.h
> +++ b/arch/x86/kvm/mmu/tdp_iter.h
> @@ -9,10 +9,9 @@
>
> /*
> * TDP MMU SPTEs are RCU protected to allow paging structures (non-leaf SPTEs)
> - * to be zapped while holding mmu_lock for read. Holding RCU isn't required for
> - * correctness if mmu_lock is held for write, but plumbing "struct kvm" down to
> - * the lower depths of the TDP MMU just to make lockdep happy is a nightmare,
> - * so all accesses to SPTEs are done under RCU protection.
> + * to be zapped while holding mmu_lock for read, and to allow TLB flushes to be
> + * batched without having to collect the list of zapped SPs. Flows that can
> + * remove SPs must service pending TLB flushes prior to dropping RCU protection.
> */
> static inline u64 kvm_tdp_mmu_read_spte(tdp_ptep_t sptep)
> {
> diff --git a/arch/x86/kvm/mmu/tdp_mmu.c b/arch/x86/kvm/mmu/tdp_mmu.c
> index 55c16680b927..62cb357b1dff 100644
> --- a/arch/x86/kvm/mmu/tdp_mmu.c
> +++ b/arch/x86/kvm/mmu/tdp_mmu.c
> @@ -433,9 +433,6 @@ static void handle_removed_tdp_mmu_page(struct kvm *kvm, tdp_ptep_t pt,
> shared);
> }
>
> - kvm_flush_remote_tlbs_with_address(kvm, base_gfn,
> - KVM_PAGES_PER_HPAGE(level + 1));
> -
> call_rcu(&sp->rcu_head, tdp_mmu_free_sp_rcu_callback);
> }
>
> @@ -815,21 +812,14 @@ static void tdp_mmu_zap_root(struct kvm *kvm, struct kvm_mmu_page *root,
>
> bool kvm_tdp_mmu_zap_sp(struct kvm *kvm, struct kvm_mmu_page *sp)
> {
> - u64 old_spte;
> + u64 old_spte = kvm_tdp_mmu_read_spte(sp->ptep);
>
> - rcu_read_lock();
> -
> - old_spte = kvm_tdp_mmu_read_spte(sp->ptep);
> - if (WARN_ON_ONCE(!is_shadow_present_pte(old_spte))) {
> - rcu_read_unlock();
> + if (WARN_ON_ONCE(!is_shadow_present_pte(old_spte)))
> return false;
> - }
>
> __tdp_mmu_set_spte(kvm, kvm_mmu_page_as_id(sp), sp->ptep, old_spte, 0,
> sp->gfn, sp->role.level + 1, true, true);
>
> - rcu_read_unlock();
> -
> return true;
> }
>
> @@ -871,6 +861,11 @@ static bool tdp_mmu_zap_leafs(struct kvm *kvm, struct kvm_mmu_page *root,
> }
>
> rcu_read_unlock();
> +
> + /*
> + * Because this flow zaps _only_ leaf SPTEs, the caller doesn't need
> + * to provide RCU protection as no 'struct kvm_mmu_page' will be freed.
> + */
> return flush;
> }
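
To make the asymmetry concrete: because nothing is handed to call_rcu()
on this path, a caller of a leaf-only zap has no ordering requirement
between its TLB flush and rcu_read_unlock(), e.g. (made-up helper name,
not code from this series):

	/* No 'struct kvm_mmu_page' is freed, so the flush can be issued
	 * outside of any RCU read-side critical section. */
	if (zap_leaf_sptes_only(kvm, gfn_start, gfn_end))	/* hypothetical */
		kvm_flush_remote_tlbs(kvm);
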
>
> @@ -1011,6 +1006,10 @@ static int tdp_mmu_map_handle_target_level(struct kvm_vcpu *vcpu,
> ret = RET_PF_SPURIOUS;
> else if (!tdp_mmu_set_spte_atomic(vcpu->kvm, iter, new_spte))
> return RET_PF_RETRY;
> + else if (is_shadow_present_pte(iter->old_spte) &&
> + !is_last_spte(iter->old_spte, iter->level))
> + kvm_flush_remote_tlbs_with_address(vcpu->kvm, sp->gfn,
> + KVM_PAGES_PER_HPAGE(iter->level + 1));
>
> /*
> * If the page fault was caused by a write but the page is write
> --
> 2.34.0.rc2.393.gf8c9666880-goog
>