[RFC PATCH] KVM: race-free exit from KVM_RUN without POSIX signals

From: Paolo Bonzini
Date: Wed Feb 08 2017 - 06:18:21 EST


The purpose of the KVM_SET_SIGNAL_MASK API is to let userspace "kick"
a VCPU out of KVM_RUN through a POSIX signal. A signal is attached
to a dummy signal handler; by blocking the signal outside KVM_RUN and
unblocking it inside, this possible race is closed:

VCPU thread service thread
--------------------------------------------------------------
check flag
set flag
raise signal
(signal handler does nothing)
KVM_RUN

However, one issue with KVM_SET_SIGNAL_MASK is that it has to take
tsk->sighand->siglock on every KVM_RUN. This lock is often on a
remote NUMA node, because it is on the node of a thread's creator.
Taking this lock can be very expensive if there are many userspace
exits (as is the case for SMP Windows VMs without Hyper-V reference
time counter).

As an alternative, we can put the flag directly in kvm_run so that
KVM can see it:

VCPU thread service thread
--------------------------------------------------------------
raise signal
signal handler
set run->immediate_exit
KVM_RUN
check run->immediate_exit

Signed-off-by: Paolo Bonzini <pbonzini@xxxxxxxxxx>
---
Documentation/virtual/kvm/api.txt | 13 ++++++++++++-
include/uapi/linux/kvm.h | 1 +
virt/kvm/kvm_main.c | 14 +++++++++++---
3 files changed, 24 insertions(+), 4 deletions(-)

diff --git a/Documentation/virtual/kvm/api.txt b/Documentation/virtual/kvm/api.txt
index e4f2cdcf78eb..925b1b6be073 100644
--- a/Documentation/virtual/kvm/api.txt
+++ b/Documentation/virtual/kvm/api.txt
@@ -3389,7 +3389,18 @@ struct kvm_run {
Request that KVM_RUN return when it becomes possible to inject external
interrupts into the guest. Useful in conjunction with KVM_INTERRUPT.

- __u8 padding1[7];
+ __u8 immediate_exit;
+
+This field is polled once when KVM_RUN starts; if non-zero, KVM_RUN
+clears the field and exits immediately, returning -EINTR. In the common
+scenario where a signal is used to "kick" a VCPU out of KVM_RUN, this
+field can be used as an alternative to KVM_SET_SIGNAL_MASK: instead of
+blocking the signal outside KVM_RUN, the signal handler should set
+run->immediate_exit to a nonzero value.
+
+This field is only available if KVM_CAP_IMMEDIATE_EXIT is.
+
+ __u8 padding1[6];

/* out */
__u32 exit_reason;
diff --git a/include/uapi/linux/kvm.h b/include/uapi/linux/kvm.h
index 7964b970b9ad..0aa43b1e5b1d 100644
--- a/include/uapi/linux/kvm.h
+++ b/include/uapi/linux/kvm.h
@@ -881,6 +881,7 @@ struct kvm_ppc_resize_hpt {
#define KVM_CAP_SPAPR_RESIZE_HPT 133
#define KVM_CAP_PPC_MMU_RADIX 134
#define KVM_CAP_PPC_MMU_HASH_V3 135
+#define KVM_CAP_IMMEDIATE_EXIT 136

#ifdef KVM_CAP_IRQ_ROUTING

diff --git a/virt/kvm/kvm_main.c b/virt/kvm/kvm_main.c
index 482612b4e496..5abb8df00db1 100644
--- a/virt/kvm/kvm_main.c
+++ b/virt/kvm/kvm_main.c
@@ -2550,7 +2550,8 @@ static long kvm_vcpu_ioctl(struct file *filp,
if (r)
return r;
switch (ioctl) {
- case KVM_RUN:
+ case KVM_RUN: {
+ struct kvm_run *run;
r = -EINVAL;
if (arg)
goto out;
@@ -2564,9 +2565,15 @@ static long kvm_vcpu_ioctl(struct file *filp,
synchronize_rcu();
put_pid(oldpid);
}
- r = kvm_arch_vcpu_ioctl_run(vcpu, vcpu->run);
- trace_kvm_userspace_exit(vcpu->run->exit_reason, r);
+ run = vcpu->run;
+ if (run->immediate_exit) {
+ WRITE_ONCE(run->immediate_exit, 0);
+ return -EINTR;
+ }
+ r = kvm_arch_vcpu_ioctl_run(vcpu, run);
+ trace_kvm_userspace_exit(run->exit_reason, r);
break;
+ }
case KVM_GET_REGS: {
struct kvm_regs *kvm_regs;

@@ -2927,6 +2934,7 @@ static long kvm_vm_ioctl_check_extension_generic(struct kvm *kvm, long arg)
#endif
case KVM_CAP_IOEVENTFD_ANY_LENGTH:
case KVM_CAP_CHECK_EXTENSION_VM:
+ case KVM_CAP_IMMEDIATE_EXIT:
return 1;
#ifdef CONFIG_HAVE_KVM_IRQ_ROUTING
case KVM_CAP_IRQ_ROUTING:
--
1.8.3.1