Re: [PATCH v7 11/11] KVM: xen: allow vcpu_info content to be 'safely' copied

From: Sean Christopherson
Date: Wed Nov 08 2023 - 15:55:29 EST


On Wed, Nov 08, 2023, David Woodhouse wrote:
> On Tue, 2023-10-31 at 16:58 -0700, Sean Christopherson wrote:
> >
> > > diff --git a/arch/x86/kvm/xen.c b/arch/x86/kvm/xen.c
> > > index aafc794940e4..e645066217bb 100644
> > > --- a/arch/x86/kvm/xen.c
> > > +++ b/arch/x86/kvm/xen.c
> > > @@ -1606,9 +1606,6 @@ int kvm_xen_set_evtchn_fast(struct kvm_xen_evtchn *xe, struct kvm *kvm)
> > >                 WRITE_ONCE(xe->vcpu_idx, vcpu->vcpu_idx);
> > >         }
> > >  
> > > -       if (!vcpu->arch.xen.vcpu_info_cache.active)
> > > -               return -EINVAL;
> > > -
> >
> > Hmm, maybe move this check after the "hard" error checks and explicitly do:
> >
> >                 return -EWOULDBLOCK
> >
> > That way it's much more obvious that this patch is safe.  Alternatively, briefly
> > explain what happens if the cache is invalid in the changelog.
>
>
> Hm, I thought we dropped that? It doesn't want to return -EWOULDBLOCK.
> That would cause a "fast" irqfd delivery to defer to the slow
> (workqueue) path, but then the same thing would happen on that path
> *too*.
>
> I actually think this check needs to be dropped completely. The
> evtchn_pending_sel bit will get set in the *shadow* copy in vcpu-
> >arch.xen.evtchn_pending_sel, and will subsequently be set in the real
> vcpu_info when that is configured again. We do already handle that case
> correctly, I believe:
>
> if (!kvm_gpc_check(gpc, sizeof(struct vcpu_info))) {
> /*
> * Could not access the vcpu_info. Set the bit in-kernel
> * and prod the vCPU to deliver it for itself.
> */
> if (!test_and_set_bit(port_word_bit, &vcpu->arch.xen.evtchn_pending_sel))
> kick_vcpu = true;
> goto out_rcu;
> }
>
> If we leave this check in place as it is, I think we're going to lose
> incoming events (and IPIs) directed at a vCPU while it has its
> vcpu_info temporarily removed.

Oh, dagnabbit, there are two different gfn_to_pfn_cache objects. I assumed "gpc"
was the same thing as vcpu_info_cache.active, i.e. thought the explicit active
check here was just to pre-check the validity before acquiring the lock.

if (!vcpu->arch.xen.vcpu_info_cache.active)
return -EINVAL;

rc = -EWOULDBLOCK;

idx = srcu_read_lock(&kvm->srcu);

read_lock_irqsave(&gpc->lock, flags);
if (!kvm_gpc_check(gpc, PAGE_SIZE)) <== I thought this was the same check
goto out_rcu;

That's why my comment was nonsensical, and also why I was super confused by your
response for a few minutes :-)

Good gravy, this even conditionally switches gpc and makes the out path drop
different locks. That's just mean.

Any objection to splitting out the vcpu_info chunk to a separate function? That'd
eliminate the subtle gpc+lock switch, and provide a convenient and noticeable
location to explain why it's ok for setting the vcpu_info to fail.

E.g.

---
arch/x86/kvm/xen.c | 102 ++++++++++++++++++++++++++-------------------
1 file changed, 58 insertions(+), 44 deletions(-)

diff --git a/arch/x86/kvm/xen.c b/arch/x86/kvm/xen.c
index d65e89e062e4..cd2021ba0ae3 100644
--- a/arch/x86/kvm/xen.c
+++ b/arch/x86/kvm/xen.c
@@ -1621,6 +1621,57 @@ static void kvm_xen_check_poller(struct kvm_vcpu *vcpu, int port)
}
}

+static int kvm_xen_needs_a_name(struct kvm_vcpu *vcpu, int port_word_bit)
+{
+ struct gfn_to_pfn_cache *gpc = &vcpu->arch.xen.vcpu_info_cache;
+ bool kick_vcpu = false;
+ unsigned long flags;
+ int r = 0;
+
+ read_lock_irqsave(&gpc->lock, flags);
+ if (!kvm_gpc_check(gpc, sizeof(struct vcpu_info))) {
+ /*
+ * Could not access the vcpu_info. Set the bit in-kernel and
+ * prod the vCPU to deliver it for itself.
+ */
+ if (!test_and_set_bit(port_word_bit, &vcpu->arch.xen.evtchn_pending_sel))
+ kick_vcpu = true;
+
+ r = -EWOULDBLOCK;
+ goto out;
+ }
+
+ if (IS_ENABLED(CONFIG_64BIT) && vcpu->kvm->arch.xen.long_mode) {
+ struct vcpu_info *vcpu_info = gpc->khva;
+ if (!test_and_set_bit(port_word_bit, &vcpu_info->evtchn_pending_sel)) {
+ WRITE_ONCE(vcpu_info->evtchn_upcall_pending, 1);
+ kick_vcpu = true;
+ }
+ } else {
+ struct compat_vcpu_info *vcpu_info = gpc->khva;
+ if (!test_and_set_bit(port_word_bit,
+ (unsigned long *)&vcpu_info->evtchn_pending_sel)) {
+ WRITE_ONCE(vcpu_info->evtchn_upcall_pending, 1);
+ kick_vcpu = true;
+ }
+ }
+
+ /* For the per-vCPU lapic vector, deliver it as MSI. */
+ if (kick_vcpu && vcpu->arch.xen.upcall_vector) {
+ kvm_xen_inject_vcpu_vector(vcpu);
+ kick_vcpu = false;
+ }
+
+out:
+ read_unlock_irqrestore(&gpc->lock, flags);
+
+ if (kick_vcpu) {
+ kvm_make_request(KVM_REQ_UNBLOCK, vcpu);
+ kvm_vcpu_kick(vcpu);
+ }
+
+ return r;
+}
/*
* The return value from this function is propagated to kvm_set_irq() API,
* so it returns:
@@ -1638,7 +1689,6 @@ int kvm_xen_set_evtchn_fast(struct kvm_xen_evtchn *xe, struct kvm *kvm)
unsigned long *pending_bits, *mask_bits;
unsigned long flags;
int port_word_bit;
- bool kick_vcpu = false;
int vcpu_idx, idx, rc;

vcpu_idx = READ_ONCE(xe->vcpu_idx);
@@ -1660,7 +1710,7 @@ int kvm_xen_set_evtchn_fast(struct kvm_xen_evtchn *xe, struct kvm *kvm)

read_lock_irqsave(&gpc->lock, flags);
if (!kvm_gpc_check(gpc, PAGE_SIZE))
- goto out_rcu;
+ goto out_unlock;

if (IS_ENABLED(CONFIG_64BIT) && kvm->arch.xen.long_mode) {
struct shared_info *shinfo = gpc->khva;
@@ -1688,52 +1738,16 @@ int kvm_xen_set_evtchn_fast(struct kvm_xen_evtchn *xe, struct kvm *kvm)
kvm_xen_check_poller(vcpu, xe->port);
} else {
rc = 1; /* Delivered to the bitmap in shared_info. */
- /* Now switch to the vCPU's vcpu_info to set the index and pending_sel */
- read_unlock_irqrestore(&gpc->lock, flags);
- gpc = &vcpu->arch.xen.vcpu_info_cache;
-
- read_lock_irqsave(&gpc->lock, flags);
- if (!kvm_gpc_check(gpc, sizeof(struct vcpu_info))) {
- /*
- * Could not access the vcpu_info. Set the bit in-kernel
- * and prod the vCPU to deliver it for itself.
- */
- if (!test_and_set_bit(port_word_bit, &vcpu->arch.xen.evtchn_pending_sel))
- kick_vcpu = true;
- goto out_rcu;
- }
-
- if (IS_ENABLED(CONFIG_64BIT) && kvm->arch.xen.long_mode) {
- struct vcpu_info *vcpu_info = gpc->khva;
- if (!test_and_set_bit(port_word_bit, &vcpu_info->evtchn_pending_sel)) {
- WRITE_ONCE(vcpu_info->evtchn_upcall_pending, 1);
- kick_vcpu = true;
- }
- } else {
- struct compat_vcpu_info *vcpu_info = gpc->khva;
- if (!test_and_set_bit(port_word_bit,
- (unsigned long *)&vcpu_info->evtchn_pending_sel)) {
- WRITE_ONCE(vcpu_info->evtchn_upcall_pending, 1);
- kick_vcpu = true;
- }
- }
-
- /* For the per-vCPU lapic vector, deliver it as MSI. */
- if (kick_vcpu && vcpu->arch.xen.upcall_vector) {
- kvm_xen_inject_vcpu_vector(vcpu);
- kick_vcpu = false;
- }
}

- out_rcu:
+ out_unlock:
read_unlock_irqrestore(&gpc->lock, flags);
+
+ /* Paul to add comment explaining why it's ok kvm_xen_needs_a_name() fails */
+ if (rc == 1)
+ rc = kvm_xen_needs_a_name(vcpu, port_word_bit);
+
srcu_read_unlock(&kvm->srcu, idx);
-
- if (kick_vcpu) {
- kvm_make_request(KVM_REQ_UNBLOCK, vcpu);
- kvm_vcpu_kick(vcpu);
- }
-
return rc;
}


base-commit: 0f34262cf6fab8163456c7740c7ee1888a8af93c
--