Re: [RFC PATCH] x86 NMI-safe INT3 and Page Fault (v3)

From: Jeremy Fitzhardinge
Date: Wed Apr 16 2008 - 13:58:01 EST


Mathieu Desnoyers wrote:
"This way lies madness. Don't go there."

It is a large amount of... stuff. Does this immediate values thing really make that big an improvement?

Index: linux-2.6-lttng/include/linux/hardirq.h
===================================================================
--- linux-2.6-lttng.orig/include/linux/hardirq.h 2008-04-16 11:25:18.000000000 -0400
+++ linux-2.6-lttng/include/linux/hardirq.h 2008-04-16 11:29:30.000000000 -0400
@@ -22,10 +22,13 @@
* PREEMPT_MASK: 0x000000ff
* SOFTIRQ_MASK: 0x0000ff00
* HARDIRQ_MASK: 0x0fff0000
+ * HARDNMI_MASK: 0x40000000
*/
#define PREEMPT_BITS 8
#define SOFTIRQ_BITS 8
+#define HARDNMI_BITS 1
+
#ifndef HARDIRQ_BITS
#define HARDIRQ_BITS 12
@@ -45,16 +48,19 @@
#define PREEMPT_SHIFT 0
#define SOFTIRQ_SHIFT (PREEMPT_SHIFT + PREEMPT_BITS)
#define HARDIRQ_SHIFT (SOFTIRQ_SHIFT + SOFTIRQ_BITS)
+#define HARDNMI_SHIFT (30)

Why at 30, rather than HARDIRQ_SHIFT+HARDIRQ_BITS?
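I.e., something along these lines (just an untested sketch):

	#define HARDNMI_SHIFT	(HARDIRQ_SHIFT + HARDIRQ_BITS)

If the answer is "to stay clear of PREEMPT_ACTIVE", then deriving the value
(or at least a comment saying so) would be better than a bare 30.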

#define __IRQ_MASK(x) ((1UL << (x))-1)
#define PREEMPT_MASK (__IRQ_MASK(PREEMPT_BITS) << PREEMPT_SHIFT)
#define SOFTIRQ_MASK (__IRQ_MASK(SOFTIRQ_BITS) << SOFTIRQ_SHIFT)
#define HARDIRQ_MASK (__IRQ_MASK(HARDIRQ_BITS) << HARDIRQ_SHIFT)
+#define HARDNMI_MASK (__IRQ_MASK(HARDNMI_BITS) << HARDNMI_SHIFT)
#define PREEMPT_OFFSET (1UL << PREEMPT_SHIFT)
#define SOFTIRQ_OFFSET (1UL << SOFTIRQ_SHIFT)
#define HARDIRQ_OFFSET (1UL << HARDIRQ_SHIFT)
+#define HARDNMI_OFFSET (1UL << HARDNMI_SHIFT)
#if PREEMPT_ACTIVE < (1 << (HARDIRQ_SHIFT + HARDIRQ_BITS))
#error PREEMPT_ACTIVE is too low!
@@ -63,6 +69,7 @@
#define hardirq_count() (preempt_count() & HARDIRQ_MASK)
#define softirq_count() (preempt_count() & SOFTIRQ_MASK)
#define irq_count() (preempt_count() & (HARDIRQ_MASK | SOFTIRQ_MASK))
+#define hardnmi_count() (preempt_count() & HARDNMI_MASK)
/*
* Are we doing bottom half or hardware interrupt processing?
@@ -71,6 +78,7 @@
#define in_irq() (hardirq_count())
#define in_softirq() (softirq_count())
#define in_interrupt() (irq_count())
+#define in_nmi() (hardnmi_count())
/*
* Are we running in atomic context? WARNING: this macro cannot
@@ -159,7 +167,19 @@ extern void irq_enter(void);
*/
extern void irq_exit(void);
-#define nmi_enter() do { lockdep_off(); __irq_enter(); } while (0)
-#define nmi_exit() do { __irq_exit(); lockdep_on(); } while (0)
+#define nmi_enter() \
+ do { \
+ lockdep_off(); \
+ BUG_ON(hardnmi_count()); \
+ add_preempt_count(HARDNMI_OFFSET); \
+ __irq_enter(); \
+ } while (0)
+
+#define nmi_exit() \
+ do { \
+ __irq_exit(); \
+ sub_preempt_count(HARDNMI_OFFSET); \
+ lockdep_on(); \
+ } while (0)
#endif /* LINUX_HARDIRQ_H */
Index: linux-2.6-lttng/arch/x86/kernel/entry_32.S
===================================================================
--- linux-2.6-lttng.orig/arch/x86/kernel/entry_32.S 2008-04-16 11:25:18.000000000 -0400
+++ linux-2.6-lttng/arch/x86/kernel/entry_32.S 2008-04-16 12:06:30.000000000 -0400
@@ -79,7 +79,6 @@ VM_MASK = 0x00020000
#define preempt_stop(clobbers) DISABLE_INTERRUPTS(clobbers); TRACE_IRQS_OFF
#else
#define preempt_stop(clobbers)
-#define resume_kernel restore_nocheck
#endif
.macro TRACE_IRQS_IRET
@@ -265,6 +264,8 @@ END(ret_from_exception)
#ifdef CONFIG_PREEMPT
ENTRY(resume_kernel)
DISABLE_INTERRUPTS(CLBR_ANY)
+ testl $0x40000000,TI_preempt_count(%ebp) # nested over NMI ?
+ jnz return_to_nmi
cmpl $0,TI_preempt_count(%ebp) # non-zero preempt_count ?
jnz restore_nocheck
need_resched:
@@ -276,6 +277,12 @@ need_resched:
call preempt_schedule_irq
jmp need_resched
END(resume_kernel)
+#else
+ENTRY(resume_kernel)
+ testl $0x40000000,TI_preempt_count(%ebp) # nested over NMI ?

HARDNMI_MASK?
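I.e., roughly (assuming the constant can be made visible to the asm side,
via asm-offsets.c or an __ASSEMBLY__-safe definition):

	testl $HARDNMI_MASK,TI_preempt_count(%ebp)	# nested over NMI ?
	jnz return_to_nmi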

+ jnz return_to_nmi
+ jmp restore_nocheck
+END(resume_kernel)
#endif
CFI_ENDPROC
@@ -411,6 +418,22 @@ restore_nocheck_notrace:
CFI_ADJUST_CFA_OFFSET -4
irq_return:
INTERRUPT_RETURN
+return_to_nmi:
+ testl $X86_EFLAGS_TF, PT_EFLAGS(%esp)
+ jnz restore_nocheck /*
+ * If single-stepping an NMI handler,
+ * use the normal iret path instead of
+ * the popf/lret because lret would be
+ * single-stepped. It should not
+ * happen : it will reactivate NMIs
+ * prematurely.
+ */
+ TRACE_IRQS_IRET
+ RESTORE_REGS
+ addl $4, %esp # skip orig_eax/error_code
+ CFI_ADJUST_CFA_OFFSET -4
+ INTERRUPT_RETURN_NMI_SAFE
+
.section .fixup,"ax"
iret_exc:
pushl $0 # no error code
Index: linux-2.6-lttng/arch/x86/kernel/entry_64.S
===================================================================
--- linux-2.6-lttng.orig/arch/x86/kernel/entry_64.S 2008-04-16 11:25:18.000000000 -0400
+++ linux-2.6-lttng/arch/x86/kernel/entry_64.S 2008-04-16 12:06:31.000000000 -0400
@@ -581,12 +581,27 @@ retint_restore_args: /* return to kernel
* The iretq could re-enable interrupts:
*/
TRACE_IRQS_IRETQ
+ testl $0x40000000,threadinfo_preempt_count(%rcx) /* Nested over NMI ? */

HARDNMI_MASK? (ditto below)

+ jnz return_to_nmi
restore_args:
RESTORE_ARGS 0,8,0
irq_return:
INTERRUPT_RETURN
+return_to_nmi: /*
+ * If single-stepping an NMI handler,
+ * use the normal iret path instead of
+ * the popf/lret because lret would be
+ * single-stepped. It should not
+ * happen : it will reactivate NMIs
+ * prematurely.
+ */
+ bt $8,EFLAGS-ARGOFFSET(%rsp) /* trap flag? */

test[bwl] is a bit more usual; then you can use X86_EFLAGS_TF.
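Something like this, say (assuming X86_EFLAGS_TF is visible here, as it
already is in entry_32.S):

	testl $X86_EFLAGS_TF,EFLAGS-ARGOFFSET(%rsp)	/* trap flag? */
	jnz restore_args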

+ jc restore_args
+ RESTORE_ARGS 0,8,0
+ INTERRUPT_RETURN_NMI_SAFE
+
.section __ex_table, "a"
.quad irq_return, bad_iret
.previous
@@ -802,6 +817,10 @@ END(spurious_interrupt)
.macro paranoidexit trace=1
/* ebx: no swapgs flag */
paranoid_exit\trace:
+ GET_THREAD_INFO(%rcx)
+ testl $0x40000000,threadinfo_preempt_count(%rcx) /* Nested over NMI ? */
+ jnz paranoid_return_to_nmi\trace
+paranoid_exit_no_nmi\trace:
testl %ebx,%ebx /* swapgs needed? */
jnz paranoid_restore\trace
testl $3,CS(%rsp)
@@ -814,6 +833,18 @@ paranoid_swapgs\trace:
paranoid_restore\trace:
RESTORE_ALL 8
jmp irq_return
+paranoid_return_to_nmi\trace: /*
+ * If single-stepping an NMI handler,
+ * use the normal iret path instead of
+ * the popf/lret because lret would be
+ * single-stepped. It should not
+ * happen : it will reactivate NMIs
+ * prematurely.
+ */
+ bt $8,EFLAGS-0(%rsp) /* trap flag? */
+ jc paranoid_exit_no_nmi\trace
+ RESTORE_ALL 8
+ INTERRUPT_RETURN_NMI_SAFE

Does this need to be repeated verbatim?
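If not, the comment plus the TF check could be pulled into a small helper,
something like this rough, untested sketch (name invented):

	.macro RETURN_TO_NMI eflags_off, fallback
	/*
	 * If single-stepping an NMI handler, take the normal iret path
	 * instead: the popf/lret would itself be single-stepped and
	 * would reactivate NMIs prematurely.
	 */
	bt	$8, \eflags_off(%rsp)		/* trap flag? */
	jc	\fallback
	.endm

with each site keeping its own RESTORE_* + INTERRUPT_RETURN_NMI_SAFE tail,
since those differ.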

paranoid_userspace\trace:
GET_THREAD_INFO(%rcx)
movl threadinfo_flags(%rcx),%ebx
Index: linux-2.6-lttng/include/asm-x86/irqflags.h
===================================================================
--- linux-2.6-lttng.orig/include/asm-x86/irqflags.h 2008-04-16 11:25:18.000000000 -0400
+++ linux-2.6-lttng/include/asm-x86/irqflags.h 2008-04-16 11:29:30.000000000 -0400
@@ -138,12 +138,73 @@ static inline unsigned long __raw_local_
#ifdef CONFIG_X86_64
#define INTERRUPT_RETURN iretq
+
+/*
+ * Only returns from a trap or exception to a NMI context (intra-privilege
+ * level near return) to the same SS and CS segments. Should be used
+ * upon trap or exception return when nested over a NMI context so no iret is
+ * issued. It takes care of modifying the eflags, rsp and returning to the
+ * previous function.
+ *
+ * The stack, at that point, looks like :
+ *
+ * 0(rsp) RIP
+ * 8(rsp) CS
+ * 16(rsp) EFLAGS
+ * 24(rsp) RSP
+ * 32(rsp) SS
+ *
+ * Upon execution :
+ * Copy EIP to the top of the return stack
+ * Update top of return stack address
+ * Pop eflags into the eflags register
+ * Make the return stack current
+ * Near return (popping the return address from the return stack)
+ */
+#define INTERRUPT_RETURN_NMI_SAFE pushq %rax; \
+ pushq %rbx; \
+ movq 40(%rsp), %rax; \
+ movq 16(%rsp), %rbx; \

Use X+16(%rsp) notation here, so that the offsets correspond to the comment above.
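I.e., same code, just spelled so the offsets match the layout comment:

	movq 24+16(%rsp), %rax;		/* saved RSP; +16 for the two pushes */ \
	movq 0+16(%rsp), %rbx;		/* saved RIP */ \

(and likewise 24+16(%rsp) for the store of the updated RSP below.)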

+ subq $8, %rax; \
+ movq %rbx, (%rax); \
+ movq %rax, 40(%rsp); \
+ popq %rbx; \
+ popq %rax; \
+ addq $16, %rsp; \
+ popfq; \
+ movq (%rsp), %rsp; \
+ ret; \

How about something like

pushq %rax
mov %rsp, %rax /* old stack */
mov 24+8(%rax), %rsp /* switch stacks */
pushq 0+8(%rax) /* push return rip */
pushq 16+8(%rax) /* push return rflags */
movq (%rax), %rax /* restore %rax */
popfq /* restore flags */
ret /* restore rip */


+
#define ENABLE_INTERRUPTS_SYSCALL_RET \
movq %gs:pda_oldrsp, %rsp; \
swapgs; \
sysretq;
#else
#define INTERRUPT_RETURN iret
+
+/*
+ * Protected mode only, no V8086. Implies that protected mode must
+ * be entered before NMIs or MCEs are enabled. Only returns from a trap or
+ * exception to a NMI context (intra-privilege level far return). Should be used
+ * upon trap or exception return when nested over a NMI context so no iret is
+ * issued.
+ *
+ * The stack, at that point, looks like :
+ *
+ * 0(esp) EIP
+ * 4(esp) CS
+ * 8(esp) EFLAGS
+ *
+ * Upon execution :
+ * Copy the stack eflags to top of stack
+ * Pop eflags into the eflags register
+ * Far return: pop EIP and CS into their register, and additionally pop EFLAGS.
+ */
+#define INTERRUPT_RETURN_NMI_SAFE pushl 8(%esp); \
+ popfl; \
+ .byte 0xCA; \
+ .word 4;

Why not "lret $4"?

J
--
To unsubscribe from this list: send the line "unsubscribe linux-kernel" in
the body of a message to majordomo@xxxxxxxxxxxxxxx
More majordomo info at http://vger.kernel.org/majordomo-info.html
Please read the FAQ at http://www.tux.org/lkml/