Re: [PATCH v2] xen: get rid of paravirt op adjust_exception_frame

From: Andy Lutomirski
Date: Tue Aug 01 2017 - 15:45:45 EST


On Tue, Aug 1, 2017 at 3:39 AM, Juergen Gross <jgross@xxxxxxxx> wrote:
> When running as Xen pv-guest the exception frame on the stack contains
> %r11 and %rcx additional to the other data pushed by the processor.
>
> Instead of having a paravirt op being called for each exception type
> prepend the Xen specific code to each exception entry. When running as
> Xen pv-guest just use the exception entry with prepended instructions,
> otherwise use the entry without the Xen specific code.
>
> Signed-off-by: Juergen Gross <jgross@xxxxxxxx>
> ---
> arch/x86/entry/entry_64.S | 22 ++------------
> arch/x86/entry/entry_64_compat.S | 1 -
> arch/x86/include/asm/desc.h | 16 ++++++++++
> arch/x86/include/asm/paravirt.h | 5 ---
> arch/x86/include/asm/paravirt_types.h | 4 ---
> arch/x86/include/asm/proto.h | 3 ++
> arch/x86/include/asm/traps.h | 51 +++++++++++++++++++++++++++++--
> arch/x86/kernel/asm-offsets_64.c | 1 -
> arch/x86/kernel/paravirt.c | 3 --
> arch/x86/kernel/traps.c | 57 ++++++++++++++++++-----------------
> arch/x86/xen/enlighten_pv.c | 16 +++++-----
> arch/x86/xen/irq.c | 3 --
> arch/x86/xen/xen-asm_64.S | 45 ++++++++++++++++++++++++---
> arch/x86/xen/xen-ops.h | 1 -
> 14 files changed, 147 insertions(+), 81 deletions(-)
>
> diff --git a/arch/x86/entry/entry_64.S b/arch/x86/entry/entry_64.S
> index a9a8027a6c0e..602bcf68a32c 100644
> --- a/arch/x86/entry/entry_64.S
> +++ b/arch/x86/entry/entry_64.S
> @@ -745,7 +745,6 @@ ENTRY(\sym)
> .endif
>
> ASM_CLAC
> - PARAVIRT_ADJUST_EXCEPTION_FRAME
>
> .ifeq \has_error_code
> pushq $-1 /* ORIG_RAX: no syscall to restart */
> @@ -901,7 +900,7 @@ ENTRY(do_softirq_own_stack)
> END(do_softirq_own_stack)
>
> #ifdef CONFIG_XEN
> -idtentry xen_hypervisor_callback xen_do_hypervisor_callback has_error_code=0
> +idtentry hypervisor_callback xen_do_hypervisor_callback has_error_code=0
>
> /*
> * A note on the "critical region" in our callback handler.
> @@ -967,8 +966,6 @@ ENTRY(xen_failsafe_callback)
> movq 8(%rsp), %r11
> addq $0x30, %rsp
> pushq $0 /* RIP */
> - pushq %r11
> - pushq %rcx
> jmp general_protection
> 1: /* Segment mismatch => Category 1 (Bad segment). Retry the IRET. */
> movq (%rsp), %rcx
> @@ -997,9 +994,8 @@ idtentry int3 do_int3 has_error_code=0 paranoid=1 shift_ist=DEBUG_STACK
> idtentry stack_segment do_stack_segment has_error_code=1
>
> #ifdef CONFIG_XEN
> -idtentry xen_debug do_debug has_error_code=0
> -idtentry xen_int3 do_int3 has_error_code=0
> -idtentry xen_stack_segment do_stack_segment has_error_code=1
> +idtentry xendebug do_debug has_error_code=0
> +idtentry xenint3 do_int3 has_error_code=0
> #endif
>
> idtentry general_protection do_general_protection has_error_code=1
> @@ -1161,18 +1157,6 @@ END(error_exit)
> /* Runs on exception stack */
> ENTRY(nmi)
> /*
> - * Fix up the exception frame if we're on Xen.
> - * PARAVIRT_ADJUST_EXCEPTION_FRAME is guaranteed to push at most
> - * one value to the stack on native, so it may clobber the rdx
> - * scratch slot, but it won't clobber any of the important
> - * slots past it.
> - *
> - * Xen is a different story, because the Xen frame itself overlaps
> - * the "NMI executing" variable.
> - */

Based on Andrew Cooper's email, it sounds like this function is just
straight-up broken on Xen PV. Maybe change the comment to "XXX:
broken on Xen PV" or similar.

> +#if defined(CONFIG_X86_64) && defined(CONFIG_XEN_PV)
> +#define pv_trap_entry(name) (void *)(xen_pv_domain() ? xen_ ## name : name)
> +#else
> +#define pv_trap_entry(name) (void *)(name)
> +#endif
> +

Seems reasonable to me.

> #ifdef CONFIG_X86_64
>
> static inline void pack_gate(gate_desc *gate, unsigned type, unsigned long func,
> @@ -482,6 +490,14 @@ static inline void _set_gate(int gate, unsigned type, void *addr,
> 0, 0, __KERNEL_CS); \
> } while (0)
>
> +#define set_intr_gate_pv(n, addr) \
> + do { \
> + set_intr_gate_notrace(n, pv_trap_entry(addr)); \
> + _trace_set_gate(n, GATE_INTERRUPT, \
> + pv_trap_entry(trace_##addr), \
> + 0, 0, __KERNEL_CS); \
> + } while (0)

Any reason this can't be set_intr_gate((n), pv_trap_entry(addr))? Or
does that get preprocessed wrong?

> diff --git a/arch/x86/include/asm/traps.h b/arch/x86/include/asm/traps.h
> index 01fd0a7f48cd..e2ae5f3b9c2c 100644
> --- a/arch/x86/include/asm/traps.h
> +++ b/arch/x86/include/asm/traps.h
> @@ -13,9 +13,8 @@ asmlinkage void divide_error(void);
> asmlinkage void debug(void);
> asmlinkage void nmi(void);
> asmlinkage void int3(void);
> -asmlinkage void xen_debug(void);
> -asmlinkage void xen_int3(void);
> -asmlinkage void xen_stack_segment(void);
> +asmlinkage void xendebug(void);
> +asmlinkage void xenint3(void);

What are xendebug and xenint3 for? Are they because they don't use IST on Xen?

> __visible struct pv_cpu_ops pv_cpu_ops = {
> diff --git a/arch/x86/kernel/traps.c b/arch/x86/kernel/traps.c
> index bf54309b85da..a79a97a46a59 100644
> --- a/arch/x86/kernel/traps.c
> +++ b/arch/x86/kernel/traps.c
> @@ -946,15 +946,15 @@ void __init early_trap_init(void)
> * early stage. DEBUG_STACK will be equipped after cpu_init() in
> * trap_init().
> *
> - * We don't need to set trace_idt_table like set_intr_gate(),
> + * We don't need to set trace_idt_table like set_intr_gate_pv(),
> * since we don't have trace_debug and it will be reset to
> * 'debug' in trap_init() by set_intr_gate_ist().
> */
> set_intr_gate_notrace(X86_TRAP_DB, debug);

This is confusing IMO. Maybe just drop the comment change? But how
does this work at all on Xen?

> - set_intr_gate(X86_TRAP_OLD_MF, coprocessor_segment_overrun);
> - set_intr_gate(X86_TRAP_TS, invalid_TSS);
> - set_intr_gate(X86_TRAP_NP, segment_not_present);
> - set_intr_gate(X86_TRAP_SS, stack_segment);
> - set_intr_gate(X86_TRAP_GP, general_protection);
> - set_intr_gate(X86_TRAP_SPURIOUS, spurious_interrupt_bug);
> - set_intr_gate(X86_TRAP_MF, coprocessor_error);
> - set_intr_gate(X86_TRAP_AC, alignment_check);
> + set_intr_gate_pv(X86_TRAP_OLD_MF, coprocessor_segment_overrun);
> + set_intr_gate_pv(X86_TRAP_TS, invalid_TSS);
> + set_intr_gate_pv(X86_TRAP_NP, segment_not_present);
> + set_intr_gate_pv(X86_TRAP_SS, stack_segment);
> + set_intr_gate_pv(X86_TRAP_GP, general_protection);
> + set_intr_gate_pv(X86_TRAP_SPURIOUS, spurious_interrupt_bug);
> + set_intr_gate_pv(X86_TRAP_MF, coprocessor_error);
> + set_intr_gate_pv(X86_TRAP_AC, alignment_check);

Hmm. I'm okay with this, but I'm wondering whether it might be nice
to try to have a pv op that changes what IDT writes do and remaps the
function pointer. Like...

> diff --git a/arch/x86/xen/enlighten_pv.c b/arch/x86/xen/enlighten_pv.c
> index 811e4ddb3f37..7e107142bc4f 100644
> --- a/arch/x86/xen/enlighten_pv.c
> +++ b/arch/x86/xen/enlighten_pv.c
> @@ -598,24 +598,22 @@ static int cvt_gate_to_trap(int vector, const gate_desc *val,
> * so we should never see them. Warn if
> * there's an unexpected IST-using fault handler.
> */
> - if (addr == (unsigned long)debug)
> - addr = (unsigned long)xen_debug;
> - else if (addr == (unsigned long)int3)
> - addr = (unsigned long)xen_int3;
> - else if (addr == (unsigned long)stack_segment)
> - addr = (unsigned long)xen_stack_segment;
> - else if (addr == (unsigned long)double_fault) {
> + if (addr == (unsigned long)xen_debug)
> + addr = (unsigned long)xen_xendebug;
> + else if (addr == (unsigned long)xen_int3)
> + addr = (unsigned long)xen_xenint3;
> + else if (addr == (unsigned long)xen_double_fault) {
> /* Don't need to handle these */
> return 0;

...this? Can't this list just be extended to handle all of the pv
entries instead of using the macro magic?

> diff --git a/arch/x86/xen/xen-asm_64.S b/arch/x86/xen/xen-asm_64.S
> index c3df43141e70..f72ff71cc897 100644
> --- a/arch/x86/xen/xen-asm_64.S
> +++ b/arch/x86/xen/xen-asm_64.S
> @@ -22,11 +22,46 @@
>
> #include "xen-asm.h"
>
> -ENTRY(xen_adjust_exception_frame)
> - mov 8+0(%rsp), %rcx
> - mov 8+8(%rsp), %r11
> - ret $16
> -ENDPROC(xen_adjust_exception_frame)
> +.macro xen_pv_trap name
> +ENTRY(xen_\name)
> + pop %rcx
> + pop %r11
> + jmp \name
> +END(xen_\name)
> +.endm
> +
> +xen_pv_trap divide_error
> +xen_pv_trap debug
> +xen_pv_trap xendebug
> +xen_pv_trap int3
> +xen_pv_trap xenint3
> +xen_pv_trap nmi
> +xen_pv_trap overflow
> +xen_pv_trap bounds
> +xen_pv_trap invalid_op
> +xen_pv_trap device_not_available
> +xen_pv_trap double_fault
> +xen_pv_trap coprocessor_segment_overrun
> +xen_pv_trap invalid_TSS
> +xen_pv_trap segment_not_present
> +xen_pv_trap stack_segment
> +xen_pv_trap general_protection
> +xen_pv_trap page_fault
> +xen_pv_trap async_page_fault
> +xen_pv_trap spurious_interrupt_bug
> +xen_pv_trap coprocessor_error
> +xen_pv_trap alignment_check
> +#ifdef CONFIG_X86_MCE
> +xen_pv_trap machine_check
> +#endif /* CONFIG_X86_MCE */
> +xen_pv_trap simd_coprocessor_error
> +#ifdef CONFIG_IA32_EMULATION
> +xen_pv_trap entry_INT80_compat
> +#endif
> +#ifdef CONFIG_TRACING
> +xen_pv_trap trace_page_fault
> +#endif
> +xen_pv_trap hypervisor_callback

I like this.

Also, IMO it would be nice to fully finish the job. Remaining steps are:

1. Unsuck the SYSCALL entries on Xen PV.
2. Unsuck the SYENTER entry on Xen PV.
3. Make a xen_nmi that's actually correct (should be trivial)

#1 is here:

https://git.kernel.org/pub/scm/linux/kernel/git/luto/linux.git/commit/?h=x86/entry_syscall&id=14fee348e3e86c994400d68085217d1232a637d6

Can you test it and, if you like it, either add it to your series or
ack it? If you have extra spare cycles, you could try to do #2 and
#3, too :) For #2, it might actually make sense to rig up the Xen
sysenter entry to redirect to entry_SYSCALL_compat or even an entirely
new entry point -- SYSENTER is really weird.

--Andy