[PATCH 11/24] kvm: x86/mmu: Put TDP MMU PT walks in RCU read-critical section

From: Ben Gardon
Date: Tue Jan 12 2021 - 13:13:37 EST


In order to enable concurrent modifications to the paging structures in
the TDP MMU, threads must be able to safely remove pages of page table
memory while other threads are traversing the same memory. To ensure
threads do not access PT memory after it is freed, protect PT memory
with RCU.

Reviewed-by: Peter Feiner <pfeiner@xxxxxxxxxx>

Signed-off-by: Ben Gardon <bgardon@xxxxxxxxxx>
---
arch/x86/kvm/mmu/tdp_mmu.c | 53 ++++++++++++++++++++++++++++++++++++--
1 file changed, 51 insertions(+), 2 deletions(-)

diff --git a/arch/x86/kvm/mmu/tdp_mmu.c b/arch/x86/kvm/mmu/tdp_mmu.c
index e8f35cd46b4c..662907d374b3 100644
--- a/arch/x86/kvm/mmu/tdp_mmu.c
+++ b/arch/x86/kvm/mmu/tdp_mmu.c
@@ -458,11 +458,14 @@ static inline void tdp_mmu_set_spte_no_dirty_log(struct kvm *kvm,
* Return true if this function yielded, the TLBs were flushed, and the
* iterator's traversal was reset. Return false if a yield was not needed.
*/
-static bool tdp_mmu_iter_flush_cond_resched(struct kvm *kvm, struct tdp_iter *iter)
+static bool tdp_mmu_iter_flush_cond_resched(struct kvm *kvm,
+ struct tdp_iter *iter)
{
if (need_resched() || spin_needbreak(&kvm->mmu_lock)) {
kvm_flush_remote_tlbs(kvm);
+ rcu_read_unlock();
cond_resched_lock(&kvm->mmu_lock);
+ rcu_read_lock();
tdp_iter_refresh_walk(iter);
return true;
} else
@@ -483,7 +486,9 @@ static bool tdp_mmu_iter_flush_cond_resched(struct kvm *kvm, struct tdp_iter *it
static bool tdp_mmu_iter_cond_resched(struct kvm *kvm, struct tdp_iter *iter)
{
if (need_resched() || spin_needbreak(&kvm->mmu_lock)) {
+ rcu_read_unlock();
cond_resched_lock(&kvm->mmu_lock);
+ rcu_read_lock();
tdp_iter_refresh_walk(iter);
return true;
} else
@@ -508,6 +513,8 @@ static bool zap_gfn_range(struct kvm *kvm, struct kvm_mmu_page *root,
gfn_t last_goal_gfn = start;
bool flush_needed = false;

+ rcu_read_lock();
+
tdp_root_for_each_pte(iter, root, start, end) {
/* Ensure forward progress has been made before yielding. */
if (can_yield && iter.goal_gfn != last_goal_gfn &&
@@ -538,6 +545,8 @@ static bool zap_gfn_range(struct kvm *kvm, struct kvm_mmu_page *root,
tdp_mmu_set_spte(kvm, &iter, 0);
flush_needed = true;
}
+
+ rcu_read_unlock();
return flush_needed;
}

@@ -650,6 +659,9 @@ int kvm_tdp_mmu_map(struct kvm_vcpu *vcpu, gpa_t gpa, u32 error_code,
huge_page_disallowed, &req_level);

trace_kvm_mmu_spte_requested(gpa, level, pfn);
+
+ rcu_read_lock();
+
tdp_mmu_for_each_pte(iter, mmu, gfn, gfn + 1) {
if (nx_huge_page_workaround_enabled)
disallowed_hugepage_adjust(iter.old_spte, gfn,
@@ -693,11 +705,14 @@ int kvm_tdp_mmu_map(struct kvm_vcpu *vcpu, gpa_t gpa, u32 error_code,
}
}

- if (WARN_ON(iter.level != level))
+ if (WARN_ON(iter.level != level)) {
+ rcu_read_unlock();
return RET_PF_RETRY;
+ }

ret = tdp_mmu_map_handle_target_level(vcpu, write, map_writable, &iter,
pfn, prefault);
+ rcu_read_unlock();

return ret;
}
@@ -768,6 +783,8 @@ static int age_gfn_range(struct kvm *kvm, struct kvm_memory_slot *slot,
int young = 0;
u64 new_spte = 0;

+ rcu_read_lock();
+
tdp_root_for_each_leaf_pte(iter, root, start, end) {
/*
* If we have a non-accessed entry we don't need to change the
@@ -799,6 +816,8 @@ static int age_gfn_range(struct kvm *kvm, struct kvm_memory_slot *slot,
trace_kvm_age_page(iter.gfn, iter.level, slot, young);
}

+ rcu_read_unlock();
+
return young;
}

@@ -844,6 +863,8 @@ static int set_tdp_spte(struct kvm *kvm, struct kvm_memory_slot *slot,
u64 new_spte;
int need_flush = 0;

+ rcu_read_lock();
+
WARN_ON(pte_huge(*ptep));

new_pfn = pte_pfn(*ptep);
@@ -872,6 +893,8 @@ static int set_tdp_spte(struct kvm *kvm, struct kvm_memory_slot *slot,
if (need_flush)
kvm_flush_remote_tlbs_with_address(kvm, gfn, 1);

+ rcu_read_unlock();
+
return 0;
}

@@ -896,6 +919,8 @@ static bool wrprot_gfn_range(struct kvm *kvm, struct kvm_mmu_page *root,
gfn_t last_goal_gfn = start;
bool spte_set = false;

+ rcu_read_lock();
+
BUG_ON(min_level > KVM_MAX_HUGEPAGE_LEVEL);

for_each_tdp_pte_min_level(iter, root->spt, root->role.level,
@@ -924,6 +949,8 @@ static bool wrprot_gfn_range(struct kvm *kvm, struct kvm_mmu_page *root,
tdp_mmu_set_spte_no_dirty_log(kvm, &iter, new_spte);
spte_set = true;
}
+
+ rcu_read_unlock();
return spte_set;
}

@@ -966,6 +993,8 @@ static bool clear_dirty_gfn_range(struct kvm *kvm, struct kvm_mmu_page *root,
gfn_t last_goal_gfn = start;
bool spte_set = false;

+ rcu_read_lock();
+
tdp_root_for_each_leaf_pte(iter, root, start, end) {
/* Ensure forward progress has been made before yielding. */
if (iter.goal_gfn != last_goal_gfn &&
@@ -994,6 +1023,8 @@ static bool clear_dirty_gfn_range(struct kvm *kvm, struct kvm_mmu_page *root,
tdp_mmu_set_spte_no_dirty_log(kvm, &iter, new_spte);
spte_set = true;
}
+
+ rcu_read_unlock();
return spte_set;
}

@@ -1035,6 +1066,8 @@ static void clear_dirty_pt_masked(struct kvm *kvm, struct kvm_mmu_page *root,
struct tdp_iter iter;
u64 new_spte;

+ rcu_read_lock();
+
tdp_root_for_each_leaf_pte(iter, root, gfn + __ffs(mask),
gfn + BITS_PER_LONG) {
if (!mask)
@@ -1060,6 +1093,8 @@ static void clear_dirty_pt_masked(struct kvm *kvm, struct kvm_mmu_page *root,

mask &= ~(1UL << (iter.gfn - gfn));
}
+
+ rcu_read_unlock();
}

/*
@@ -1100,6 +1135,8 @@ static bool set_dirty_gfn_range(struct kvm *kvm, struct kvm_mmu_page *root,
gfn_t last_goal_gfn = start;
bool spte_set = false;

+ rcu_read_lock();
+
tdp_root_for_each_pte(iter, root, start, end) {
/* Ensure forward progress has been made before yielding. */
if (iter.goal_gfn != last_goal_gfn &&
@@ -1125,6 +1162,7 @@ static bool set_dirty_gfn_range(struct kvm *kvm, struct kvm_mmu_page *root,
spte_set = true;
}

+ rcu_read_unlock();
return spte_set;
}

@@ -1163,6 +1201,8 @@ static void zap_collapsible_spte_range(struct kvm *kvm,
gfn_t last_goal_gfn = start;
bool spte_set = false;

+ rcu_read_lock();
+
tdp_root_for_each_pte(iter, root, start, end) {
/* Ensure forward progress has been made before yielding. */
if (iter.goal_gfn != last_goal_gfn &&
@@ -1190,6 +1230,7 @@ static void zap_collapsible_spte_range(struct kvm *kvm,
spte_set = true;
}

+ rcu_read_unlock();
if (spte_set)
kvm_flush_remote_tlbs(kvm);
}
@@ -1226,6 +1267,8 @@ static bool write_protect_gfn(struct kvm *kvm, struct kvm_mmu_page *root,
u64 new_spte;
bool spte_set = false;

+ rcu_read_lock();
+
tdp_root_for_each_leaf_pte(iter, root, gfn, gfn + 1) {
if (!is_writable_pte(iter.old_spte))
break;
@@ -1237,6 +1280,8 @@ static bool write_protect_gfn(struct kvm *kvm, struct kvm_mmu_page *root,
spte_set = true;
}

+ rcu_read_unlock();
+
return spte_set;
}

@@ -1277,10 +1322,14 @@ int kvm_tdp_mmu_get_walk(struct kvm_vcpu *vcpu, u64 addr, u64 *sptes,

*root_level = vcpu->arch.mmu->shadow_root_level;

+ rcu_read_lock();
+
tdp_mmu_for_each_pte(iter, mmu, gfn, gfn + 1) {
leaf = iter.level;
sptes[leaf] = iter.old_spte;
}

+ rcu_read_unlock();
+
return leaf;
}
--
2.30.0.284.gd98b1dd5eaa7-goog