Re: [PATCH 03/16] KVM: x86: set gfn-to-pfn cache length consistently with VM word size

From: David Woodhouse
Date: Wed Nov 16 2022 - 17:50:13 EST


On Mon, 2022-11-14 at 16:16 -0800, David Woodhouse wrote:
>
> I'm playing with using a second GPC for the overrun onto the second
> page. Debating if it's already too ugly to live before I even fix up
> the actual copying part...

Well it certainly didn't get any *prettier*. Utterly untested other
than building it, so it's certainly going to be broken, but as an
illustration.

I can't see a sane way to get the two pages vmapped consecutively,
given that they might be IOMEM. So I can't see how to make a single GPC
do this "nicely", and I think we have to declare that the runstate area
is the only case that actually needs this, then do it this way as a
special case... even though it's fugly?

diff --git a/arch/x86/include/asm/kvm_host.h b/arch/x86/include/asm/kvm_host.h
index 81114a376c4e..3fc08f416aa3 100644
--- a/arch/x86/include/asm/kvm_host.h
+++ b/arch/x86/include/asm/kvm_host.h
@@ -647,6 +647,7 @@ struct kvm_vcpu_xen {
struct gfn_to_pfn_cache vcpu_info_cache;
struct gfn_to_pfn_cache vcpu_time_info_cache;
struct gfn_to_pfn_cache runstate_cache;
+ struct gfn_to_pfn_cache runstate2_cache;
u64 last_steal;
u64 runstate_entry_time;
u64 runstate_times[4];
diff --git a/arch/x86/kvm/xen.c b/arch/x86/kvm/xen.c
index 4b8e9628fbf5..14ba45b541bf 100644
--- a/arch/x86/kvm/xen.c
+++ b/arch/x86/kvm/xen.c
@@ -198,38 +198,101 @@ static void kvm_xen_update_runstate(struct kvm_vcpu *v, int state)
vx->runstate_entry_time = now;
}

+/*
+ * The guest region is arbitrarily aligned, and could be split across
+ * two pages.
+ *
+ * d1: Pointer to kernel map of first byte of region.
+ * d2: Pointer to kernel map of first byte of second page.
+ * l1: length of first range [ == PAGE_SIZE - (d1 & ~PAGE_MASK) ]
+ * src: Source pointer.
+ * len: Source length to be copied.
+ * dst_ofs: Destination offset within the guest region.
+ */
+static inline void memcpy_to_runstate(void *d1, void *d2, size_t l1,
+ void *src, size_t len, size_t dst_ofs)
+{
+ size_t copylen;
+
+ if (dst_ofs < l1) {
+ copylen = min(l1 - dst_ofs, len);
+ memcpy(d1 + dst_ofs, src, copylen);
+ if (copylen == len)
+ return;
+
+ src += copylen;
+ dst_ofs += copylen;
+ len -= copylen;
+ }
+
+ BUG_ON(dst_ofs < l1);
+ memcpy(d2 + dst_ofs - l1, src, len);
+}
+
void kvm_xen_update_runstate_guest(struct kvm_vcpu *v, int state)
{
struct kvm_vcpu_xen *vx = &v->arch.xen;
- struct gfn_to_pfn_cache *gpc = &vx->runstate_cache;
- uint64_t *user_times;
+ struct gfn_to_pfn_cache *gpc1 = &vx->runstate_cache;
+ struct gfn_to_pfn_cache *gpc2 = &vx->runstate2_cache;
unsigned long flags;
- size_t user_len;
- int *user_state;
+ size_t user_len, user_len1, user_len2;
+ size_t times_ofs;
+ u8 *update_bit;

kvm_xen_update_runstate(v, state);

- if (!vx->runstate_cache.active)
+ if (!gpc1->active)
return;

- if (IS_ENABLED(CONFIG_64BIT) && v->kvm->arch.xen.long_mode)
+ if (IS_ENABLED(CONFIG_64BIT) && v->kvm->arch.xen.long_mode) {
user_len = sizeof(struct vcpu_runstate_info);
- else
+ times_ofs = offsetof(struct vcpu_runstate_info,
+ state_entry_time);
+ } else {
user_len = sizeof(struct compat_vcpu_runstate_info);
+ times_ofs = offsetof(struct compat_vcpu_runstate_info,
+ state_entry_time);
+ }

- read_lock_irqsave(&gpc->lock, flags);
- while (!kvm_gfn_to_pfn_cache_check(v->kvm, gpc, gpc->gpa,
- user_len)) {
- read_unlock_irqrestore(&gpc->lock, flags);
+ if ((gpc1->gpa & ~PAGE_MASK) + user_len >= PAGE_SIZE) {
+ user_len1 = PAGE_SIZE - (gpc1->gpa & ~PAGE_MASK);
+ user_len2 = user_len - user_len1;
+ } else {
+ user_len1 = user_len;
+ user_len2 = 0;
+ }
+ BUG_ON(user_len1 + user_len2 != user_len);
+
+ retry:
+ read_lock_irqsave(&gpc1->lock, flags);
+ while (!kvm_gfn_to_pfn_cache_check(v->kvm, gpc1, gpc1->gpa,
+ user_len1)) {
+ read_unlock_irqrestore(&gpc1->lock, flags);

/* When invoked from kvm_sched_out() we cannot sleep */
if (state == RUNSTATE_runnable)
return;

- if (kvm_gfn_to_pfn_cache_refresh(v->kvm, gpc, gpc->gpa, user_len))
+ if (kvm_gfn_to_pfn_cache_refresh(v->kvm, gpc1, gpc1->gpa, user_len1))
return;

- read_lock_irqsave(&gpc->lock, flags);
+ read_lock_irqsave(&gpc1->lock, flags);
+ }
+ if (user_len2) {
+ read_lock(&gpc2->lock);
+ if (!kvm_gfn_to_pfn_cache_check(v->kvm, gpc2, gpc2->gpa, user_len2)) {
+ read_unlock(&gpc2->lock);
+ read_unlock_irqrestore(&gpc1->lock, flags);
+
+ if (state == RUNSTATE_runnable)
+ return;
+
+ if (kvm_gfn_to_pfn_cache_refresh(v->kvm, gpc2,
+ gpc2->gpa, user_len2))
+ return;
+
+ goto retry;
+ }
}

/*
@@ -252,25 +315,23 @@ void kvm_xen_update_runstate_guest(struct kvm_vcpu *v, int state)
offsetof(struct compat_vcpu_runstate_info, time) + 4);
#endif

- user_state = gpc->khva;
-
- if (IS_ENABLED(CONFIG_64BIT) && v->kvm->arch.xen.long_mode)
- user_times = gpc->khva + offsetof(struct vcpu_runstate_info,
- state_entry_time);
- else
- user_times = gpc->khva + offsetof(struct compat_vcpu_runstate_info,
- state_entry_time);
-
/*
- * First write the updated state_entry_time at the appropriate
- * location determined by 'offset'.
+ * The XEN_RUNSTATE_UPDATE bit is the top bit of the state_entry_time
+ * field. We need to set it (and write-barrier) before the rest.
*/
BUILD_BUG_ON(sizeof_field(struct vcpu_runstate_info, state_entry_time) !=
- sizeof(user_times[0]));
+ sizeof(uint64_t));
BUILD_BUG_ON(sizeof_field(struct compat_vcpu_runstate_info, state_entry_time) !=
- sizeof(user_times[0]));
+ sizeof(uint64_t));
+ BUILD_BUG_ON((XEN_RUNSTATE_UPDATE >> 56) != 0x80);

- user_times[0] = vx->runstate_entry_time | XEN_RUNSTATE_UPDATE;
+ if (user_len1 >= times_ofs + sizeof(uint64_t))
+ update_bit = ((u8 *)gpc1->khva) + times_ofs + sizeof(u64) - 1;
+ else
+ update_bit = ((u8 *)gpc2->khva) + times_ofs + sizeof(u64) - 1 -
+ user_len1;
+
+ *update_bit |= (XEN_RUNSTATE_UPDATE >> 56);
smp_wmb();

/*
@@ -284,7 +345,9 @@ void kvm_xen_update_runstate_guest(struct kvm_vcpu *v, int state)
BUILD_BUG_ON(sizeof_field(struct compat_vcpu_runstate_info, state) !=
sizeof(vx->current_runstate));

- *user_state = vx->current_runstate;
+ memcpy_to_runstate(gpc1->khva, gpc2->khva, user_len1,
+ &vx->current_runstate, sizeof(vx->current_runstate),
+ offsetof(struct vcpu_runstate_info, state));

/*
* Write the actual runstate times immediately after the
@@ -299,19 +362,28 @@ void kvm_xen_update_runstate_guest(struct kvm_vcpu *v, int state)
 	BUILD_BUG_ON(sizeof_field(struct vcpu_runstate_info, time) !=
 	     sizeof(vx->runstate_times));
 
-	memcpy(user_times + 1, vx->runstate_times, sizeof(vx->runstate_times));
-	smp_wmb();
+	memcpy_to_runstate(gpc1->khva, gpc2->khva, user_len1,
+			   vx->runstate_times, sizeof(vx->runstate_times),
+			   times_ofs + sizeof(u64));
 
+	memcpy_to_runstate(gpc1->khva, gpc2->khva, user_len1,
+			   &vx->runstate_entry_time, sizeof(vx->runstate_entry_time) - 1,
+			   times_ofs);
+	smp_wmb();
 	/*
 	 * Finally, clear the XEN_RUNSTATE_UPDATE bit in the guest's
 	 * runstate_entry_time field.
 	 */
-	user_times[0] &= ~XEN_RUNSTATE_UPDATE;
+	*update_bit = vx->runstate_entry_time >> 56;
 	smp_wmb();
 
-	read_unlock_irqrestore(&gpc->lock, flags);
+	/*
+	 * gpc2->lock was taken with plain read_lock() (under gpc1's
+	 * irqsave section), so it must be released with read_unlock(),
+	 * not read_unlock_irqrestore() — restoring IRQ flags here would
	 * be both premature (gpc1->lock is still held) and mismatched.
+	 */
+	if (user_len2)
+		read_unlock(&gpc2->lock);
+	read_unlock_irqrestore(&gpc1->lock, flags);
 
-	mark_page_dirty_in_slot(v->kvm, gpc->memslot, gpc->gpa >> PAGE_SHIFT);
+	mark_page_dirty_in_slot(v->kvm, gpc1->memslot, gpc1->gpa >> PAGE_SHIFT);
+	if (user_len2)
+		mark_page_dirty_in_slot(v->kvm, gpc2->memslot, gpc2->gpa >> PAGE_SHIFT);
 }

static void kvm_xen_inject_vcpu_vector(struct kvm_vcpu *v)
@@ -584,23 +656,52 @@ int kvm_xen_vcpu_set_attr(struct kvm_vcpu *vcpu, struct kvm_xen_vcpu_attr *data)
kvm_make_request(KVM_REQ_CLOCK_UPDATE, vcpu);
break;

- case KVM_XEN_VCPU_ATTR_TYPE_RUNSTATE_ADDR:
+ case KVM_XEN_VCPU_ATTR_TYPE_RUNSTATE_ADDR: {
+ size_t sz;
+
if (!sched_info_on()) {
r = -EOPNOTSUPP;
break;
}
if (data->u.gpa == GPA_INVALID) {
+ r = 0;
+ deactivate_out:
kvm_gpc_deactivate(vcpu->kvm,
&vcpu->arch.xen.runstate_cache);
- r = 0;
+ deactivate2_out:
+ kvm_gpc_deactivate(vcpu->kvm,
+ &vcpu->arch.xen.runstate2_cache);
break;
}

- r = kvm_gpc_activate(vcpu->kvm, &vcpu->arch.xen.runstate_cache,
- NULL, KVM_HOST_USES_PFN, data->u.gpa,
- sizeof(struct vcpu_runstate_info));
+ if (IS_ENABLED(CONFIG_64BIT) && vcpu->kvm->arch.xen.long_mode)
+ sz = sizeof(struct vcpu_runstate_info);
+ else
+ sz = sizeof(struct compat_vcpu_runstate_info);
+
+ /* Handle structures which cross a page boundary by using two GPCs */
+ if ((data->u.gpa & ~PAGE_MASK) + sz <= PAGE_SIZE) {
+ r = kvm_gpc_activate(vcpu->kvm, &vcpu->arch.xen.runstate_cache,
+ NULL, KVM_HOST_USES_PFN, data->u.gpa,
+ sizeof(struct vcpu_runstate_info));
+ goto deactivate2_out;
+ } else {
+ /* Map the end of the first page... */
+ r = kvm_gpc_activate(vcpu->kvm, &vcpu->arch.xen.runstate_cache,
+ NULL, KVM_HOST_USES_PFN, data->u.gpa,
+ PAGE_SIZE - (data->u.gpa & ~PAGE_MASK));
+ if (r)
+ goto deactivate2_out;
+ /* ... and the start of the second. */
+ sz -= PAGE_SIZE - (data->u.gpa & ~PAGE_MASK);
+ r = kvm_gpc_activate(vcpu->kvm, &vcpu->arch.xen.runstate2_cache,
+ NULL, KVM_HOST_USES_PFN,
+ (data->u.gpa + PAGE_SIZE) & PAGE_MASK, sz);
+ if (r)
+ goto deactivate_out;
+ }
break;
-
+ }
case KVM_XEN_VCPU_ATTR_TYPE_RUNSTATE_CURRENT:
if (!sched_info_on()) {
r = -EOPNOTSUPP;
@@ -1834,6 +1935,7 @@ void kvm_xen_init_vcpu(struct kvm_vcpu *vcpu)
timer_setup(&vcpu->arch.xen.poll_timer, cancel_evtchn_poll, 0);

kvm_gpc_init(&vcpu->arch.xen.runstate_cache);
+ kvm_gpc_init(&vcpu->arch.xen.runstate2_cache);
kvm_gpc_init(&vcpu->arch.xen.vcpu_info_cache);
kvm_gpc_init(&vcpu->arch.xen.vcpu_time_info_cache);
}
@@ -1844,6 +1946,7 @@ void kvm_xen_destroy_vcpu(struct kvm_vcpu *vcpu)
kvm_xen_stop_timer(vcpu);

kvm_gpc_deactivate(vcpu->kvm, &vcpu->arch.xen.runstate_cache);
+ kvm_gpc_deactivate(vcpu->kvm, &vcpu->arch.xen.runstate2_cache);
kvm_gpc_deactivate(vcpu->kvm, &vcpu->arch.xen.vcpu_info_cache);
kvm_gpc_deactivate(vcpu->kvm, &vcpu->arch.xen.vcpu_time_info_cache);

Attachment: smime.p7s
Description: S/MIME cryptographic signature