[PATCH] kvm: x86: move srcu lock out of kvm_vcpu_check_block

From: Jon Kohler
Date: Wed Apr 28 2021 - 13:40:39 EST


When using halt polling on x86, srcu_read_lock + srcu_read_unlock
overhead [1] is high in bursty workloads, with each showing up as ~8% of
samples in a 60-second flame graph.

kvm_vcpu_block calls kvm_vcpu_check_block for both halt polling and
normal blocking. kvm_vcpu_check_block takes srcu_read_lock on kvm->srcu.
This locking was added in 50c28f21d0 to support fast CR3 switching; it
was questioned [2] at the time but went in as-is, so the lock is taken
even for non-nested guests. The lock only appears to be needed in nested
situations, where kvm_vcpu_check_block eventually calls kvm_vcpu_running
and, from there, vmx_check_nested_events. That call is hidden behind
is_guest_mode() and therefore does not apply to non-nested workloads.
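
For reference, the two call paths that reach check_events look roughly
like this (a condensed sketch, not an exact code excerpt):

    vcpu_run()                        /* kvm->srcu already held via the vcpu ioctl path */
      -> kvm_vcpu_running()
           -> kvm_x86_ops.nested_ops->check_events()   /* only if is_guest_mode() */

    kvm_vcpu_block()                  /* halt polling and normal blocking */
      -> kvm_vcpu_check_block()       /* currently takes kvm->srcu unconditionally */
           -> kvm_arch_vcpu_runnable()
                -> kvm_vcpu_running()
                     -> kvm_x86_ops.nested_ops->check_events()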

To improve performance, move the kvm->srcu locking from
kvm_vcpu_check_block into kvm_vcpu_running and wrap it directly around
the check_events call. Also add a hint so callers can tell
kvm_vcpu_running whether or not to acquire srcu, which is useful when
the lock may already be held. With this in place, we see a roughly 5%
latency improvement in an internal benchmark [3] and no further impact
from this lock on non-nested workloads.
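
In condensed form, the two call sites after this change (paraphrased
from the diff below, braces and surrounding context omitted):

    /* vcpu_run(): kvm->srcu is already held, so skip re-acquiring it. */
    if (kvm_vcpu_running(vcpu, false))
        r = vcpu_enter_guest(vcpu);

    /* kvm_arch_vcpu_runnable(), reached from kvm_vcpu_check_block()
     * during halt polling/blocking: kvm->srcu is not held here, so ask
     * kvm_vcpu_running() to acquire it around check_events.
     */
    return kvm_vcpu_running(vcpu, true) || kvm_vcpu_has_events(vcpu);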

[1] perf top output under a heavy workload:

  Overhead  Shared Object  Symbol
     9.24%  [kernel]       [k] __srcu_read_lock
     7.48%  [kernel]       [k] __srcu_read_unlock

[2] Locking originally discussed here
https://patchwork.kernel.org/project/kvm/patch/20180612225244.71856-9-junaids@xxxxxxxxxx/

[3] Internal benchmark details
Fixed-rate 100 GBytes/second of 1MB random-read IO run against the
internal in-memory read cache of Nutanix AOS, with 16 threads on a
22 vCPU CentOS 7.9 VM. Average latency: ~120us before, ~113us after.

Fixes: 50c28f21d0 ("kvm: x86: Use fast CR3 switch for nested VMX")
Signed-off-by: Jon Kohler <jon@xxxxxxxxxxx>
Reviewed-by: Bijan Mottahedeh <bijan.mottahedeh@xxxxxxxxxxx>
Reviewed-by: Raphael Norwitz <raphael.norwitz@xxxxxxxxxxx>
Cc: Junaid Shahid <junaids@xxxxxxxxxx>
---
arch/x86/kvm/x86.c | 24 +++++++++++++++++++-----
virt/kvm/kvm_main.c | 21 +++++++--------------
2 files changed, 26 insertions(+), 19 deletions(-)

diff --git a/arch/x86/kvm/x86.c b/arch/x86/kvm/x86.c
index efc7a82ab140..354f690cc982 100644
--- a/arch/x86/kvm/x86.c
+++ b/arch/x86/kvm/x86.c
@@ -9273,10 +9273,24 @@ static inline int vcpu_block(struct kvm *kvm, struct kvm_vcpu *vcpu)
 	return 1;
 }
 
-static inline bool kvm_vcpu_running(struct kvm_vcpu *vcpu)
+static inline bool kvm_vcpu_running(struct kvm_vcpu *vcpu, bool acquire_srcu)
 {
-	if (is_guest_mode(vcpu))
-		kvm_x86_ops.nested_ops->check_events(vcpu);
+	if (is_guest_mode(vcpu)) {
+		if (acquire_srcu) {
+			/*
+			 * We need to lock because check_events could call
+			 * nested_vmx_vmexit() which might need to resolve a
+			 * valid memslot. We will have this lock only when
+			 * called from vcpu_run but not when called from
+			 * kvm_vcpu_check_block > kvm_arch_vcpu_runnable.
+			 */
+			int idx = srcu_read_lock(&vcpu->kvm->srcu);
+			kvm_x86_ops.nested_ops->check_events(vcpu);
+			srcu_read_unlock(&vcpu->kvm->srcu, idx);
+		} else {
+			kvm_x86_ops.nested_ops->check_events(vcpu);
+		}
+	}
 
 	return (vcpu->arch.mp_state == KVM_MP_STATE_RUNNABLE &&
 		!vcpu->arch.apf.halted);
@@ -9291,7 +9305,7 @@ static int vcpu_run(struct kvm_vcpu *vcpu)
 	vcpu->arch.l1tf_flush_l1d = true;
 
 	for (;;) {
-		if (kvm_vcpu_running(vcpu)) {
+		if (kvm_vcpu_running(vcpu, false)) {
 			r = vcpu_enter_guest(vcpu);
 		} else {
 			r = vcpu_block(kvm, vcpu);
@@ -10999,7 +11013,7 @@ static inline bool kvm_vcpu_has_events(struct kvm_vcpu *vcpu)
 
 int kvm_arch_vcpu_runnable(struct kvm_vcpu *vcpu)
 {
-	return kvm_vcpu_running(vcpu) || kvm_vcpu_has_events(vcpu);
+	return kvm_vcpu_running(vcpu, true) || kvm_vcpu_has_events(vcpu);
 }
 
 bool kvm_arch_dy_runnable(struct kvm_vcpu *vcpu)
diff --git a/virt/kvm/kvm_main.c b/virt/kvm/kvm_main.c
index 383df23514b9..05e29aed35b5 100644
--- a/virt/kvm/kvm_main.c
+++ b/virt/kvm/kvm_main.c
@@ -2783,22 +2783,15 @@ static void shrink_halt_poll_ns(struct kvm_vcpu *vcpu)
 
 static int kvm_vcpu_check_block(struct kvm_vcpu *vcpu)
 {
-	int ret = -EINTR;
-	int idx = srcu_read_lock(&vcpu->kvm->srcu);
-
 	if (kvm_arch_vcpu_runnable(vcpu)) {
 		kvm_make_request(KVM_REQ_UNHALT, vcpu);
-		goto out;
+		return -EINTR;
 	}
-	if (kvm_cpu_has_pending_timer(vcpu))
-		goto out;
-	if (signal_pending(current))
-		goto out;
 
-	ret = 0;
-out:
-	srcu_read_unlock(&vcpu->kvm->srcu, idx);
-	return ret;
+	if (kvm_cpu_has_pending_timer(vcpu) || signal_pending(current))
+		return -EINTR;
+
+	return 0;
 }
 
 static inline void
--
2.24.3 (Apple Git-128)