On Tue, Nov 21, 2023 at 05:57:28PM +0800,
Binbin Wu <binbin.wu@xxxxxxxxxxxxxxx> wrote:

diff --git a/arch/x86/kvm/mmu/tdp_mmu.c b/arch/x86/kvm/mmu/tdp_mmu.c
index 7873e9ee82ad..a209a67decae 100644
--- a/arch/x86/kvm/mmu/tdp_mmu.c
+++ b/arch/x86/kvm/mmu/tdp_mmu.c
@@ -964,6 +964,14 @@ bool kvm_tdp_mmu_zap_sp(struct kvm *kvm, struct kvm_mmu_page *sp)
 	return true;
 }
+
+static struct kvm_mmu_page *tdp_mmu_alloc_sp_for_split(struct kvm *kvm,
+						       struct tdp_iter *iter,
+						       bool shared);
+
+static int tdp_mmu_split_huge_page(struct kvm *kvm, struct tdp_iter *iter,
+				   struct kvm_mmu_page *sp, bool shared);
+
 /*
  * If can_yield is true, will release the MMU lock and reschedule if the
  * scheduler needs the CPU or there is contention on the MMU lock. If this
@@ -975,13 +983,15 @@ static bool tdp_mmu_zap_leafs(struct kvm *kvm, struct kvm_mmu_page *root,
 			      gfn_t start, gfn_t end, bool can_yield, bool flush,
 			      bool zap_private)
 {
+	bool is_private = is_private_sp(root);
+	struct kvm_mmu_page *split_sp = NULL;
 	struct tdp_iter iter;

 	end = min(end, tdp_mmu_max_gfn_exclusive());

 	lockdep_assert_held_write(&kvm->mmu_lock);

-	WARN_ON_ONCE(zap_private && !is_private_sp(root));
+	WARN_ON_ONCE(zap_private && !is_private);
 	if (!zap_private && is_private_sp(root))

Can use is_private instead of is_private_sp(root) here as well.

I'll update it.

 		return false;
@@ -1006,12 +1016,66 @@ static bool tdp_mmu_zap_leafs(struct kvm *kvm, struct kvm_mmu_page *root,
 		    !is_last_spte(iter.old_spte, iter.level))
 			continue;

+		if (is_private && kvm_gfn_shared_mask(kvm) &&
+		    is_large_pte(iter.old_spte)) {
+			gfn_t gfn = iter.gfn & ~kvm_gfn_shared_mask(kvm);
+			gfn_t mask = KVM_PAGES_PER_HPAGE(iter.level) - 1;
+			struct kvm_memory_slot *slot;
+			struct kvm_mmu_page *sp;
+
+			slot = gfn_to_memslot(kvm, gfn);
+			if (kvm_hugepage_test_mixed(slot, gfn, iter.level) ||
+			    (gfn & mask) < start ||
+			    end < (gfn & mask) + KVM_PAGES_PER_HPAGE(iter.level)) {
+				WARN_ON_ONCE(!can_yield);
+				if (split_sp) {
+					sp = split_sp;
+					split_sp = NULL;
+					sp->role = tdp_iter_child_role(&iter);
+				} else {
+					WARN_ON(iter.yielded);
+					if (flush && can_yield) {
+						kvm_flush_remote_tlbs(kvm);
+						flush = false;
+					}

Is it necessary to do the flush here?

Because tdp_mmu_alloc_sp_for_split() may unlock mmu_lock and block.
While blocking, other threads may operate on the KVM MMU and get confused
by stale TLB entries that have not yet been flushed.
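
To make the blocking point concrete, here is an abbreviated sketch of the
shape of tdp_mmu_alloc_sp_for_split(), modeled on the upstream eager-splitting
helper (the GFP details are from upstream, not from this patch). The slow
path drops mmu_lock around a blocking allocation, which is why any pending
TLB flush has to happen before calling it:

static struct kvm_mmu_page *tdp_mmu_alloc_sp_for_split(struct kvm *kvm,
						       struct tdp_iter *iter,
						       bool shared)
{
	struct kvm_mmu_page *sp;

	/* Fast path: non-blocking allocation while still under mmu_lock. */
	sp = __tdp_mmu_alloc_sp_for_split(GFP_NOWAIT | __GFP_ACCOUNT);
	if (sp)
		return sp;

	/*
	 * Slow path: drop RCU and mmu_lock so the allocation may block.
	 * Other threads can now walk and modify the MMU, so stale TLB
	 * entries must already have been flushed by the caller.
	 */
	rcu_read_unlock();
	if (shared)
		read_unlock(&kvm->mmu_lock);
	else
		write_unlock(&kvm->mmu_lock);

	iter->yielded = true;
	sp = __tdp_mmu_alloc_sp_for_split(GFP_KERNEL_ACCOUNT);

	if (shared)
		read_lock(&kvm->mmu_lock);
	else
		write_lock(&kvm->mmu_lock);
	rcu_read_lock();

	return sp;
}
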
+					sp = tdp_mmu_alloc_sp_for_split(kvm, &iter, false);
+					if (iter.yielded) {
+						split_sp = sp;
+						continue;
+					}
+				}
+				KVM_BUG_ON(!sp, kvm);
+
+				tdp_mmu_init_sp(sp, iter.sptep, iter.gfn);
+				if (tdp_mmu_split_huge_page(kvm, &iter, sp, false)) {
+					kvm_flush_remote_tlbs(kvm);
+					flush = false;

Why does it need to flush the TLB immediately if tdp_mmu_split_huge_page()
fails?

Hmm, we don't need it. When breaking up a page table, we need to do the TLB
flush before issuing TDH.MEM.PAGE.DEMOTE(), not after it. Will remove those
two lines.

Also, when the KVM MMU write lock is held, it seems tdp_mmu_split_huge_page()
will not fail.

This can happen with TDX_OPERAND_BUSY on the secure-EPT tree lock, contended
by other vcpus' TDH.VP.ENTER(). TDH.VP.ENTER() can take an exclusive lock of
the secure-EPT.
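
To spell out where that failure comes from: the split ends up in the TDX
backend as a TDH.MEM.PAGE.DEMOTE seamcall, which can lose the race for the
secure-EPT lock. A rough, hypothetical sketch of that mapping, with names
modeled on the TDX patch series rather than taken from this exact revision:

/*
 * Hypothetical sketch: how the TDX backend can turn a contended
 * secure-EPT lock into a split failure.  tdh_mem_page_demote() and the
 * error-code names are modeled on the TDX patches, not verified here.
 */
static int tdx_sept_split_private_spt(struct kvm *kvm, gfn_t gfn,
				      enum pg_level level, void *private_spt)
{
	struct kvm_tdx *kvm_tdx = to_kvm_tdx(kvm);
	gpa_t gpa = gfn_to_gpa(gfn);
	hpa_t hpa = __pa(private_spt);
	struct tdx_module_output out;
	u64 err;

	err = tdh_mem_page_demote(kvm_tdx->tdr_pa, gpa,
				  pg_level_to_tdx_sept_level(level), hpa, &out);
	/* A concurrent TDH.VP.ENTER() may hold the secure-EPT lock. */
	if (unlikely(err == (TDX_OPERAND_BUSY | TDX_OPERAND_ID_SEPT)))
		return -EAGAIN;
	if (KVM_BUG_ON(err, kvm)) {
		pr_tdx_error(TDH_MEM_PAGE_DEMOTE, err, &out);
		return -EIO;
	}
	return 0;
}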

But let's assume this condition can be triggered. Since sp is a local
variable, it will lose its value after the continue, and split_sp is also
NULL, so the next iteration will try to allocate a new sp. Memory leakage
here?

Nice catch. I'll add split_sp = sp;
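
So, if I read the plan correctly, the failure path would become something
like the sketch below (the split_sp = sp line is the fix being discussed;
the rest is from the patch, with the flush lines already dropped):

			tdp_mmu_init_sp(sp, iter.sptep, iter.gfn);
			if (tdp_mmu_split_huge_page(kvm, &iter, sp, false)) {
				/*
				 * The demote was contended; keep the
				 * still-unlinked sp in split_sp so the retry
				 * of this gfn reuses it instead of leaking it.
				 */
				split_sp = sp;
				/* force retry on this gfn. */
				iter.yielded = true;
			} else
				flush = true;
			continue;
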
+					/* force retry on this gfn. */
+					iter.yielded = true;
+				} else
+					flush = true;
+				continue;
+			}
+		}
+
 		tdp_mmu_iter_set_spte(kvm, &iter, SHADOW_NONPRESENT_VALUE);
 		flush = true;
 	}

 	rcu_read_unlock();

+	if (split_sp) {
+		WARN_ON(!can_yield);
+		if (flush) {
+			kvm_flush_remote_tlbs(kvm);
+			flush = false;
+		}

Same here, why do we need to do the flush here?
Can we delay it till the caller does the flush?

No. Because we unlock mmu_lock and may block when freeing memory.

But I don't find that it may block during freeing memory.
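
For reference, upstream tdp_mmu_free_sp() is essentially the following
(quoted from my reading of upstream; this series may additionally free the
private page table here), which is what the disagreement above turns on,
i.e. whether anything in it can actually block:

static void tdp_mmu_free_sp(struct kvm_mmu_page *sp)
{
	free_page((unsigned long)sp->spt);
	kmem_cache_free(mmu_page_header_cache, sp);
}
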
+
+		write_unlock(&kvm->mmu_lock);
+		tdp_mmu_free_sp(split_sp);
+		write_lock(&kvm->mmu_lock);
+	}
+
 	/*
 	 * Because this flow zaps _only_ leaf SPTEs, the caller doesn't need
 	 * to provide RCU protection as no 'struct kvm_mmu_page' will be freed.
@@ -1606,8 +1670,6 @@ static struct kvm_mmu_page *tdp_mmu_alloc_sp_for_split(struct kvm *kvm,
 	KVM_BUG_ON(kvm_mmu_page_role_is_private(role) !=
 		   is_private_sptep(iter->sptep), kvm);

-	/* TODO: Large page isn't supported for private SPTE yet. */
-	KVM_BUG_ON(kvm_mmu_page_role_is_private(role), kvm);

 	/*
 	 * Since we are allocating while under the MMU lock we have to be