Re: [RFC PATCH v4 13/16] KVM: x86/tdp_mmu: Try to merge pages into a large page

From: Isaku Yamahata
Date: Mon Aug 14 2023 - 16:36:20 EST


On Tue, Jul 25, 2023 at 03:23:59PM -0700,
isaku.yamahata@xxxxxxxxx wrote:

> From: Isaku Yamahata <isaku.yamahata@xxxxxxxxx>
>
> When a large page is passed to the KVM page fault handler and some of sub
> pages are already populated, try to merge sub pages into a large page.
> This situation can happen when the guest converts small pages into shared
> and convert it back into private.
>
> When a large page is passed to KVM mmu page fault handler and the spte
> corresponding to the page is non-leaf (one or more of sub pages are already
> populated at lower page level), the current kvm mmu zaps non-leaf spte at a
> large page level, and populate a leaf spte at that level. Thus small pages
> are converted into a large page. However, it doesn't work for TDX because
> zapping and re-populating results in zeroing page content. Instead,
> populate all small pages and merge them into a large page.
>
> Merging pages into a large page can fail when some sub pages are accepted
> and some are not. In such case, with the assumption that guest tries to
> accept at large page size for performance when possible, don't try to be
> smart to identify which page is still pending, map all pages at lower page
> level, and let vcpu re-execute.
>
> Signed-off-by: Isaku Yamahata <isaku.yamahata@xxxxxxxxx>
> ---
> arch/x86/include/asm/kvm-x86-ops.h | 2 +
> arch/x86/include/asm/kvm_host.h | 4 +
> arch/x86/kvm/mmu/tdp_iter.c | 37 +++++--
> arch/x86/kvm/mmu/tdp_iter.h | 2 +
> arch/x86/kvm/mmu/tdp_mmu.c | 163 ++++++++++++++++++++++++++++-
> 5 files changed, 198 insertions(+), 10 deletions(-)
>
> diff --git a/arch/x86/kvm/mmu/tdp_mmu.c b/arch/x86/kvm/mmu/tdp_mmu.c
> index c3963002722c..612fcaac600d 100644
> --- a/arch/x86/kvm/mmu/tdp_mmu.c
> +++ b/arch/x86/kvm/mmu/tdp_mmu.c
> @@ -1242,6 +1242,167 @@ void kvm_tdp_mmu_invalidate_all_roots(struct kvm *kvm, bool skip_private)
> rcu_read_unlock();
> }
>
> +static void tdp_mmu_iter_step_side(int i, struct tdp_iter *iter)
> +{
> + /*
> + * if i = SPTE_ENT_PER_PAGE - 1, tdp_iter_step_side() results
> + * in reading the entry beyond the last entry.
> + */
> + if (i < SPTE_ENT_PER_PAGE)
> + tdp_iter_step_side(iter);
> +}
> +
> +static int tdp_mmu_merge_private_spt(struct kvm_vcpu *vcpu,
> + struct kvm_page_fault *fault,
> + struct tdp_iter *iter, u64 new_spte)
> +{
> + u64 *sptep = rcu_dereference(iter->sptep);
> + struct kvm_mmu_page *child_sp;
> + struct kvm *kvm = vcpu->kvm;
> + struct tdp_iter child_iter;
> + bool ret_pf_retry = false;
> + int level = iter->level;
> + gfn_t gfn = iter->gfn;
> + u64 old_spte = *sptep;
> + tdp_ptep_t child_pt;
> + u64 child_spte;
> + int ret = 0;
> + int i;
> +
> + /*
> + * TDX KVM supports only 2MB large page. It's not supported to merge
> + * 2MB pages into 1GB page at the moment.
> + */
> + WARN_ON_ONCE(fault->goal_level != PG_LEVEL_2M);
> + WARN_ON_ONCE(iter->level != PG_LEVEL_2M);
> + WARN_ON_ONCE(!is_large_pte(new_spte));
> +
> + /* Freeze the spte to prevent other threads from working spte. */
> + if (!try_cmpxchg64(sptep, &iter->old_spte, REMOVED_SPTE))
> + return -EBUSY;
> +
> + /*
> + * Step down to the child spte. Because tdp_iter_next() assumes the
> + * parent spte isn't freezed, do it manually.
> + */
> + child_pt = spte_to_child_pt(iter->old_spte, iter->level);
> + child_sp = sptep_to_sp(child_pt);
> + WARN_ON_ONCE(child_sp->role.level != PG_LEVEL_4K);
> + WARN_ON_ONCE(!kvm_mmu_page_role_is_private(child_sp->role));
> +
> + /* Don't modify iter as the caller will use iter after this function. */
> + child_iter = *iter;
> + /* Adjust the target gfn to the head gfn of the large page. */
> + child_iter.next_last_level_gfn &= -KVM_PAGES_PER_HPAGE(level);
> + tdp_iter_step_down(&child_iter, child_pt);
> +
> + /*
> + * All child pages are required to be populated for merging them into a
> + * large page. Populate all child spte.
> + */
> + for (i = 0; i < SPTE_ENT_PER_PAGE; i++, tdp_mmu_iter_step_side(i, &child_iter)) {
> + WARN_ON_ONCE(child_iter.level != PG_LEVEL_4K);
> + if (is_shadow_present_pte(child_iter.old_spte)) {
> + /* TODO: relocate page for huge page. */
> + if (WARN_ON_ONCE(spte_to_pfn(child_iter.old_spte) !=
> + spte_to_pfn(new_spte) + i)) {
> + ret = -EAGAIN;
> + ret_pf_retry = true;
> + }
> + /*
> + * When SEPT_VE_DISABLE=true and the page state is
> + * pending, this case can happen. Just resume the vcpu
> + * again with the expectation for other vcpu to accept
> + * this page.
> + */
> + if (child_iter.gfn == fault->gfn) {
> + ret = -EAGAIN;
> + ret_pf_retry = true;
> + break;
> + }
> + continue;
> + }
> +
> + WARN_ON_ONCE(spte_to_pfn(child_iter.old_spte) != spte_to_pfn(new_spte) + i);
> + child_spte = make_huge_page_split_spte(kvm, new_spte, child_sp->role, i);
> + /*
> + * Because other thread may have started to operate on this spte
> + * before freezing the parent spte, Use atomic version to
> + * prevent race.
> + */
> + ret = tdp_mmu_set_spte_atomic(vcpu->kvm, &child_iter, child_spte);
> + if (ret == -EBUSY || ret == -EAGAIN)
> + /*
> + * There was a race condition. Populate remaining 4K
> + * spte to resolve fault->gfn to guarantee the forward
> + * progress.
> + */
> + ret_pf_retry = true;
> + else if (ret)
> + goto out;
> +
> + }
> + if (ret_pf_retry)
> + goto out;
> +
> + /* Prevent the Secure-EPT entry from being used. */
> + ret = static_call(kvm_x86_zap_private_spte)(kvm, gfn, level);
> + if (ret)
> + goto out;
> + kvm_flush_remote_tlbs_range(kvm, gfn & KVM_HPAGE_GFN_MASK(level),
> + KVM_PAGES_PER_HPAGE(level));
> +
> + /* Merge pages into a large page. */
> + ret = static_call(kvm_x86_merge_private_spt)(kvm, gfn, level,
> + kvm_mmu_private_spt(child_sp));
> + /*
> + * Failed to merge pages because some pages are accepted and some are
> + * pending. Since the child page was mapped above, let vcpu run.
> + */
> + if (ret) {
> + if (static_call(kvm_x86_unzap_private_spte)(kvm, gfn, level))
> + old_spte = SHADOW_NONPRESENT_VALUE |
> + (spte_to_pfn(old_spte) << PAGE_SHIFT) |
> + PT_PAGE_SIZE_MASK;
> + goto out;
> + }
> +
> + /* Unfreeze spte. */
> + __kvm_tdp_mmu_write_spte(sptep, new_spte);
> +
> + /*
> + * Free unused child sp. Secure-EPT page was already freed at TDX level
> + * by kvm_x86_merge_private_spt().
> + */
> + tdp_unaccount_mmu_page(kvm, child_sp);
> + tdp_mmu_free_sp(child_sp);
> + return -EAGAIN;
> +
> +out:
> + __kvm_tdp_mmu_write_spte(sptep, old_spte);
> + return ret;
> +}
> +
> +static int __tdp_mmu_map_handle_target_level(struct kvm_vcpu *vcpu,
> + struct kvm_page_fault *fault,
> + struct tdp_iter *iter, u64 new_spte)
> +{
> + /*
> + * The private page has smaller-size pages. For example, the child
> + * pages was converted from shared to page, and now it can be mapped as
> + * a large page. Try to merge small pages into a large page.
> + */
> + if (fault->slot &&
> + kvm_gfn_shared_mask(vcpu->kvm) &&
> + iter->level > PG_LEVEL_4K &&
> + kvm_is_private_gpa(vcpu->kvm, fault->addr) &&
> + is_shadow_present_pte(iter->old_spte) &&
> + !is_large_pte(iter->old_spte))
> + return tdp_mmu_merge_private_spt(vcpu, fault, iter, new_spte);
> +
> + return tdp_mmu_set_spte_atomic(vcpu->kvm, iter, new_spte);
> +}
> +
> /*
> * Installs a last-level SPTE to handle a TDP page fault.
> * (NPT/EPT violation/misconfiguration)
> @@ -1276,7 +1437,7 @@ static int tdp_mmu_map_handle_target_level(struct kvm_vcpu *vcpu,
>
> if (new_spte == iter->old_spte)
> ret = RET_PF_SPURIOUS;
> - else if (tdp_mmu_set_spte_atomic(vcpu->kvm, iter, new_spte))
> + else if (__tdp_mmu_map_handle_target_level(vcpu, fault, iter, new_spte))
> return RET_PF_RETRY;
> else if (is_shadow_present_pte(iter->old_spte) &&
> !is_last_spte(iter->old_spte, iter->level))
> --
> 2.25.1
>


I missed the race condition and had a wrong WARN_ON_ONCE(). I think it's hard
to hit it because
- In most cases, we hit if (is_shadow_present_pte()) because map it with large
page, split the page on mapgpa(to-shared), merge the page on
mapgpa(to-private-again).
We need independent mapgpa sequence on a different GPA, but within same 2M
range.

- To hit removed case, we need a race with 2 vcpus in addition to the above.

Anyway this will be included in the next respin.

diff --git a/arch/x86/kvm/mmu/tdp_mmu.c b/arch/x86/kvm/mmu/tdp_mmu.c
index 70051dd863a8..4ccfbd04fb27 100644
--- a/arch/x86/kvm/mmu/tdp_mmu.c
+++ b/arch/x86/kvm/mmu/tdp_mmu.c
@@ -1306,6 +1306,13 @@ static int tdp_mmu_merge_private_spt(struct kvm_vcpu *vcpu,
*/
for (i = 0; i < SPTE_ENT_PER_PAGE; i = tdp_mmu_iter_step_side(i, &child_iter)) {
WARN_ON_ONCE(child_iter.level != PG_LEVEL_4K);
+
+ if (is_removed_spte(child_iter.old_spte)) {
+ ret = -EAGAIN;
+ ret_pf_retry = true;
+ continue;
+ }
+
if (is_shadow_present_pte(child_iter.old_spte)) {
/* TODO: relocate page for huge page. */
if (WARN_ON_ONCE(spte_to_pfn(child_iter.old_spte) !=
@@ -1327,7 +1334,6 @@ static int tdp_mmu_merge_private_spt(struct kvm_vcpu *vcpu,
continue;
}

- WARN_ON_ONCE(spte_to_pfn(child_iter.old_spte) != spte_to_pfn(new_spte) + i);
child_spte = make_huge_page_split_spte(kvm, new_spte, child_sp->role, i);
/*
* Because other thread may have started to operate on this spte
--
2.25.1

--
Isaku Yamahata <isaku.yamahata@xxxxxxxxx>