[PATCH] Updates to Xen hypercall preemption

From: Per Bilse
Date: Wed Jun 21 2023 - 11:19:33 EST


Some Xen hypercalls issued by dom0 guests may run for many 10s of
seconds, potentially causing watchdog timeouts and other problems.
It's rare for this to happen, but it does in extreme circumstances,
for instance when shutting down VMs with very large memory allocations
(> 0.5 - 1TB). These hypercalls are preemptible, but the fixes in the
kernel to ensure preemption have fallen into a state of disrepair, and
are currently ineffective. This patch brings things up to date by way of:

1) Update general feature selection from XEN_PV to XEN_DOM0.
The issue is unique to dom0 Xen guests, but isn't unique to PV dom0s,
and will occur in future PVH dom0s. XEN_DOM0 depends on either PV or PVH,
as well as the appropriate details for dom0.

2) Update specific feature selection from !PREEMPTION to !PREEMPT.
The following table shows the relationship between different preemption
features and their indicators/selectors (Y = "=Y", N = "is not set",
. = absent):

| np-s | np-d | vp-s | vp-d | fp-s | fp-d
CONFIG_PREEMPT_DYNAMIC N Y N Y N Y
CONFIG_PREEMPTION . Y . Y Y Y
CONFIG_PREEMPT N N N N Y Y
CONFIG_PREEMPT_VOLUNTARY N N Y Y N N
CONFIG_PREEMPT_NONE Y Y N N N N

Unless PREEMPT is set, we need to enable the fixes.

3) Update flag access from __this_cpu_XXX() to raw_cpu_XXX().
The long-running hypercalls are flagged by way of a per-cpu variable
which is set before and cleared after the relevant calls. This elicits
a warning "BUG: using __this_cpu_write() in preemptible [00000000] code",
but xen_pv_evtchn_do_upcall() deals specifically with this. For
consistency, flag testing is also updated, and the code is simplified
and tidied accordingly.

4) Update irqentry_exit_cond_resched() to raw_irqentry_exit_cond_resched().
The code will call irqentry_exit_cond_resched() if the flag (as noted
above) is set, but the dynamic preemption feature will livepatch that
function to a no-op unless full preemption is selected. The code is
therefore updated to call raw_irqentry_exit_cond_resched().

Signed-off-by: Per Bilse <per.bilse@xxxxxxxxxx>
---
arch/x86/entry/common.c | 34 +++++++++++++---------------------
drivers/xen/privcmd.c | 12 ++++++------
include/xen/xen-ops.h | 14 +++++++-------
3 files changed, 26 insertions(+), 34 deletions(-)

diff --git a/arch/x86/entry/common.c b/arch/x86/entry/common.c
index 6c2826417b33..19e8609a7a5a 100644
--- a/arch/x86/entry/common.c
+++ b/arch/x86/entry/common.c
@@ -20,7 +20,7 @@
#include <linux/syscalls.h>
#include <linux/uaccess.h>

-#ifdef CONFIG_XEN_PV
+#ifdef CONFIG_XEN_DOM0
#include <xen/xen-ops.h>
#include <xen/events.h>
#endif
@@ -252,17 +252,17 @@ SYSCALL_DEFINE0(ni_syscall)
return -ENOSYS;
}

-#ifdef CONFIG_XEN_PV
-#ifndef CONFIG_PREEMPTION
+#ifdef CONFIG_XEN_DOM0
+#ifndef CONFIG_PREEMPT
/*
* Some hypercalls issued by the toolstack can take many 10s of
* seconds. Allow tasks running hypercalls via the privcmd driver to
* be voluntarily preempted even if full kernel preemption is
* disabled.
*
- * Such preemptible hypercalls are bracketed by
- * xen_preemptible_hcall_begin() and xen_preemptible_hcall_end()
- * calls.
+ * Such preemptible hypercalls are flagged by
+ * xen_preemptible_hcall_set(true/false), and status is
+ * returned by xen_preemptible_hcall_get().
*/
DEFINE_PER_CPU(bool, xen_in_preemptible_hcall);
EXPORT_SYMBOL_GPL(xen_in_preemptible_hcall);
@@ -271,21 +271,15 @@ EXPORT_SYMBOL_GPL(xen_in_preemptible_hcall);
* In case of scheduling the flag must be cleared and restored after
* returning from schedule as the task might move to a different CPU.
*/
-static __always_inline bool get_and_clear_inhcall(void)
+static __always_inline bool get_and_unset_hcall(void)
{
- bool inhcall = __this_cpu_read(xen_in_preemptible_hcall);
+ bool inhcall = xen_preemptible_hcall_get();

- __this_cpu_write(xen_in_preemptible_hcall, false);
+ xen_preemptible_hcall_set(false);
return inhcall;
}
-
-static __always_inline void restore_inhcall(bool inhcall)
-{
- __this_cpu_write(xen_in_preemptible_hcall, inhcall);
-}
#else
-static __always_inline bool get_and_clear_inhcall(void) { return false; }
-static __always_inline void restore_inhcall(bool inhcall) { }
+static __always_inline bool get_and_unset_hcall(void) { return false; }
#endif

static void __xen_pv_evtchn_do_upcall(struct pt_regs *regs)
@@ -302,16 +296,14 @@ static void __xen_pv_evtchn_do_upcall(struct pt_regs *regs)
__visible noinstr void xen_pv_evtchn_do_upcall(struct pt_regs *regs)
{
irqentry_state_t state = irqentry_enter(regs);
- bool inhcall;

instrumentation_begin();
run_sysvec_on_irqstack_cond(__xen_pv_evtchn_do_upcall, regs);

- inhcall = get_and_clear_inhcall();
- if (inhcall && !WARN_ON_ONCE(state.exit_rcu)) {
- irqentry_exit_cond_resched();
+ if (get_and_unset_hcall() && !WARN_ON_ONCE(state.exit_rcu)) {
+ raw_irqentry_exit_cond_resched();
instrumentation_end();
- restore_inhcall(inhcall);
+ xen_preemptible_hcall_set(true);
} else {
instrumentation_end();
irqentry_exit(regs, state);
diff --git a/drivers/xen/privcmd.c b/drivers/xen/privcmd.c
index e2f580e30a86..78c91227d2a5 100644
--- a/drivers/xen/privcmd.c
+++ b/drivers/xen/privcmd.c
@@ -77,12 +77,12 @@ static long privcmd_ioctl_hypercall(struct file *file, void __user *udata)
if (copy_from_user(&hypercall, udata, sizeof(hypercall)))
return -EFAULT;

- xen_preemptible_hcall_begin();
+ xen_preemptible_hcall_set(true);
ret = privcmd_call(hypercall.op,
hypercall.arg[0], hypercall.arg[1],
hypercall.arg[2], hypercall.arg[3],
hypercall.arg[4]);
- xen_preemptible_hcall_end();
+ xen_preemptible_hcall_set(false);

return ret;
}
@@ -688,9 +688,9 @@ static long privcmd_ioctl_dm_op(struct file *file, void __user *udata)
xbufs[i].size = kbufs[i].size;
}

- xen_preemptible_hcall_begin();
+ xen_preemptible_hcall_set(true);
rc = HYPERVISOR_dm_op(kdata.dom, kdata.num, xbufs);
- xen_preemptible_hcall_end();
+ xen_preemptible_hcall_set(false);

out:
unlock_pages(pages, pinned);
@@ -790,9 +790,9 @@ static long privcmd_ioctl_mmap_resource(struct file *file,
xdata.nr_frames = kdata.num;
set_xen_guest_handle(xdata.frame_list, pfns);

- xen_preemptible_hcall_begin();
+ xen_preemptible_hcall_set(true);
rc = HYPERVISOR_memory_op(XENMEM_acquire_resource, &xdata);
- xen_preemptible_hcall_end();
+ xen_preemptible_hcall_set(false);

if (rc)
goto out;
diff --git a/include/xen/xen-ops.h b/include/xen/xen-ops.h
index 47f11bec5e90..c5b06405f355 100644
--- a/include/xen/xen-ops.h
+++ b/include/xen/xen-ops.h
@@ -194,24 +194,24 @@ bool xen_running_on_version_or_later(unsigned int major, unsigned int minor);
void xen_efi_runtime_setup(void);


-#if defined(CONFIG_XEN_PV) && !defined(CONFIG_PREEMPTION)
+#if defined(CONFIG_XEN_DOM0) && !defined(CONFIG_PREEMPT)

DECLARE_PER_CPU(bool, xen_in_preemptible_hcall);

-static inline void xen_preemptible_hcall_begin(void)
+static inline void xen_preemptible_hcall_set(bool status)
{
- __this_cpu_write(xen_in_preemptible_hcall, true);
+ raw_cpu_write(xen_in_preemptible_hcall, status);
}

-static inline void xen_preemptible_hcall_end(void)
+static inline bool xen_preemptible_hcall_get(void)
{
- __this_cpu_write(xen_in_preemptible_hcall, false);
+ return raw_cpu_read(xen_in_preemptible_hcall);
}

#else

-static inline void xen_preemptible_hcall_begin(void) { }
-static inline void xen_preemptible_hcall_end(void) { }
+static inline void xen_preemptible_hcall_set(bool status) { }
+static inline bool xen_preemptible_hcall_get(void) { return false; }

#endif /* CONFIG_XEN_PV && !CONFIG_PREEMPTION */

--
2.31.1