[RFC/PATCH v2 v2 5/6] x86/entry/32: Migrate to C exit path and rework vm86 exit hack

From: Andy Lutomirski
Date: Thu Jul 09 2015 - 22:18:42 EST


This removes the hybrid asm-and-C implementation of exit work.

This patch modifies a giant hack. vm86 used to fiddle with
TIF_NOTIFY_RESUME and fix itself up in the exit asm. The hack was
messy and completely incorrect: it broke vm86 if the syscall slow
path was being used.

Rework the hack. We now forcibly exit vm86 mode on return to
userspace if we're delivering a signal (this is needed to deliver
the signal correctly) or if a new TIF_EXIT_VM86 flag is set. The
TIF_NOTIFY_RESUME hack is changed to use TIF_EXIT_VM86 instead.

This makes prepare_exit_to_usermode a bit slower on CONFIG_VM86=y
kernels. People shouldn't use such kernels if they care about
sanity, security, or performance.

Brian Gerst is planning to further rework vm86 mode to leave pt_regs
where it belongs. That will allow us to revert the
pt_regs_to_thread_info slowdown the stack switching parts of this
code; instead we can just exit normally, as vm86 won't have a
special stack layout any more.

Before this change, the entry_from_vm86 test failed under strace.
Now it passes.

Signed-off-by: Andy Lutomirski <luto@xxxxxxxxxx>
---
arch/x86/entry/common.c | 56 ++++++++++++++++++++++++++-
arch/x86/entry/entry_32.S | 79 ++++++--------------------------------
arch/x86/include/asm/thread_info.h | 2 +
arch/x86/kernel/vm86_32.c | 6 +--
4 files changed, 69 insertions(+), 74 deletions(-)

diff --git a/arch/x86/entry/common.c b/arch/x86/entry/common.c
index febc53086a69..aeaf7d64be0f 100644
--- a/arch/x86/entry/common.c
+++ b/arch/x86/entry/common.c
@@ -240,10 +240,51 @@ void syscall_trace_leave(struct pt_regs *regs)

static struct thread_info *pt_regs_to_thread_info(struct pt_regs *regs)
{
+#ifdef CONFIG_VM86
+ /*
+ * In VM86 mode, pt_regs isn't in a well-defined place on the
+ * stack. Skip the optimization entirely.
+ */
+ return current_thread_info();
+#else
unsigned long top_of_stack =
(unsigned long)(regs + 1) + TOP_OF_KERNEL_STACK_PADDING;
return (struct thread_info *)(top_of_stack - THREAD_SIZE);
+#endif
+}
+
+#ifdef CONFIG_VM86
+static void __noreturn exit_vm86_immediately(struct pt_regs *regs)
+{
+ /*
+ * VM86 sometimes needs to exit back to normal user mode
+ * (unsurprisingly) and its hack of resetting the stack and
+ * jumping into the exit asm isn't always usable (also
+ * unsurprisingly). Instead, we land in this abomination.
+ *
+ * While I can't defend this code as being anything other
+ * than awful, at least it's more or less self-contained,
+ * and it's less awful and much less buggy than the even
+ * worse hack it replaces. --Andy
+ */
+ struct pt_regs *regs32;
+
+ clear_tsk_thread_flag(current, TIF_EXIT_VM86);
+ regs32 = save_v86_state((struct kernel_vm86_regs *)regs);
+ local_irq_disable();
+ __asm__ __volatile__(
+ "movl %0,%%esp\n\t"
+ "movl %1,%%ebp\n\t"
+ "jmp resume_userspace"
+ : : "r" (regs32), "r" (current_thread_info()));
+
+ /*
+ * We don't get here. Instead we restart
+ * prepare_exit_to_usermode via resume_userspace.
+ */
+ unreachable();
}
+#endif

/* Called with IRQs disabled. */
__visible void prepare_exit_to_usermode(struct pt_regs *regs)
@@ -264,12 +305,18 @@ __visible void prepare_exit_to_usermode(struct pt_regs *regs)
READ_ONCE(pt_regs_to_thread_info(regs)->flags);

if (!(cached_flags & (_TIF_SIGPENDING | _TIF_NOTIFY_RESUME |
- _TIF_UPROBE | _TIF_NEED_RESCHED)))
+ _TIF_UPROBE | _TIF_NEED_RESCHED |
+ _TIF_EXIT_VM86)))
break;

/* We have work to do. */
local_irq_enable();

+#ifdef CONFIG_VM86
+ if (cached_flags & _TIF_EXIT_VM86)
+ exit_vm86_immediately(regs);
+#endif
+
if (cached_flags & _TIF_NEED_RESCHED)
schedule();

@@ -277,8 +324,13 @@ __visible void prepare_exit_to_usermode(struct pt_regs *regs)
uprobe_notify_resume(regs);

/* deal with pending signal delivery */
- if (cached_flags & _TIF_SIGPENDING)
+ if (cached_flags & _TIF_SIGPENDING) {
+#ifdef CONFIG_VM86
+ if (v8086_mode(regs))
+ exit_vm86_immediately(regs);
+#endif
do_signal(regs);
+ }

if (cached_flags & _TIF_NOTIFY_RESUME) {
clear_thread_flag(TIF_NOTIFY_RESUME);
diff --git a/arch/x86/entry/entry_32.S b/arch/x86/entry/entry_32.S
index 66ff9c4055d7..b2909bf8cf70 100644
--- a/arch/x86/entry/entry_32.S
+++ b/arch/x86/entry/entry_32.S
@@ -256,14 +256,10 @@ ret_from_intr:

ENTRY(resume_userspace)
LOCKDEP_SYS_EXIT
- DISABLE_INTERRUPTS(CLBR_ANY) # make sure we don't miss an interrupt
- # setting need_resched or sigpending
- # between sampling and the iret
+ DISABLE_INTERRUPTS(CLBR_ANY)
TRACE_IRQS_OFF
- movl TI_flags(%ebp), %ecx
- andl $_TIF_WORK_MASK, %ecx # is there any work to be done on
- # int/exception return?
- jne work_pending
+ movl %esp, %eax
+ call prepare_exit_to_usermode
jmp restore_all
END(ret_from_exception)

@@ -341,7 +337,7 @@ sysenter_after_call:
TRACE_IRQS_OFF
movl TI_flags(%ebp), %ecx
testl $_TIF_ALLWORK_MASK, %ecx
- jnz syscall_exit_work
+ jnz syscall_exit_work_irqs_off
sysenter_exit:
/* if something modifies registers it must also disable sysexit */
movl PT_EIP(%esp), %edx
@@ -377,13 +373,7 @@ syscall_after_call:
movl %eax, PT_EAX(%esp) # store the return value
syscall_exit:
LOCKDEP_SYS_EXIT
- DISABLE_INTERRUPTS(CLBR_ANY) # make sure we don't miss an interrupt
- # setting need_resched or sigpending
- # between sampling and the iret
- TRACE_IRQS_OFF
- movl TI_flags(%ebp), %ecx
- testl $_TIF_ALLWORK_MASK, %ecx # current->work
- jnz syscall_exit_work
+ jmp syscall_exit_work

restore_all:
TRACE_IRQS_IRET
@@ -460,52 +450,6 @@ ldt_ss:
#endif
ENDPROC(entry_INT80_32)

- # perform work that needs to be done immediately before resumption
- ALIGN
-work_pending:
- testb $_TIF_NEED_RESCHED, %cl
- jz work_notifysig
-work_resched:
- call schedule
- LOCKDEP_SYS_EXIT
- DISABLE_INTERRUPTS(CLBR_ANY) # make sure we don't miss an interrupt
- # setting need_resched or sigpending
- # between sampling and the iret
- TRACE_IRQS_OFF
- movl TI_flags(%ebp), %ecx
- andl $_TIF_WORK_MASK, %ecx # is there any work to be done other
- # than syscall tracing?
- jz restore_all
- testb $_TIF_NEED_RESCHED, %cl
- jnz work_resched
-
-work_notifysig: # deal with pending signals and
- # notify-resume requests
-#ifdef CONFIG_VM86
- testl $X86_EFLAGS_VM, PT_EFLAGS(%esp)
- movl %esp, %eax
- jnz work_notifysig_v86 # special case for v86
-1:
-#else
- movl %esp, %eax
-#endif
- TRACE_IRQS_ON
- ENABLE_INTERRUPTS(CLBR_NONE)
- xorl %edx, %edx
- call do_notify_resume
- jmp resume_userspace
-
-#ifdef CONFIG_VM86
- ALIGN
-work_notifysig_v86:
- pushl %ecx # save ti_flags for do_notify_resume
- call save_v86_state # %eax contains pt_regs pointer
- popl %ecx
- movl %eax, %esp
- jmp 1b
-#endif
-END(work_pending)
-
# perform syscall exit tracing
ALIGN
syscall_trace_entry:
@@ -520,15 +464,14 @@ END(syscall_trace_entry)

# perform syscall exit tracing
ALIGN
-syscall_exit_work:
- testl $_TIF_WORK_SYSCALL_EXIT, %ecx
- jz work_pending
+syscall_exit_work_irqs_off:
TRACE_IRQS_ON
- ENABLE_INTERRUPTS(CLBR_ANY) # could let syscall_trace_leave() call
- # schedule() instead
+ ENABLE_INTERRUPTS(CLBR_ANY)
+
+syscall_exit_work:
movl %esp, %eax
- call syscall_trace_leave
- jmp resume_userspace
+ call syscall_return_slowpath
+ jmp restore_all
END(syscall_exit_work)

syscall_fault:
diff --git a/arch/x86/include/asm/thread_info.h b/arch/x86/include/asm/thread_info.h
index 225ee545e1a0..5a60392ce70e 100644
--- a/arch/x86/include/asm/thread_info.h
+++ b/arch/x86/include/asm/thread_info.h
@@ -95,6 +95,7 @@ struct thread_info {
#define TIF_SYSCALL_EMU 6 /* syscall emulation active */
#define TIF_SYSCALL_AUDIT 7 /* syscall auditing active */
#define TIF_SECCOMP 8 /* secure computing */
+#define TIF_EXIT_VM86 9 /* deferred vm86 exit */
#define TIF_USER_RETURN_NOTIFY 11 /* notify kernel of userspace return */
#define TIF_UPROBE 12 /* breakpointed or singlestepping */
#define TIF_NOTSC 16 /* TSC is not accessible in userland */
@@ -119,6 +120,7 @@ struct thread_info {
#define _TIF_SYSCALL_EMU (1 << TIF_SYSCALL_EMU)
#define _TIF_SYSCALL_AUDIT (1 << TIF_SYSCALL_AUDIT)
#define _TIF_SECCOMP (1 << TIF_SECCOMP)
+#define _TIF_EXIT_VM86 (1 << TIF_EXIT_VM86)
#define _TIF_USER_RETURN_NOTIFY (1 << TIF_USER_RETURN_NOTIFY)
#define _TIF_UPROBE (1 << TIF_UPROBE)
#define _TIF_NOTSC (1 << TIF_NOTSC)
diff --git a/arch/x86/kernel/vm86_32.c b/arch/x86/kernel/vm86_32.c
index fc9db6ef2a95..46dcef7046b6 100644
--- a/arch/x86/kernel/vm86_32.c
+++ b/arch/x86/kernel/vm86_32.c
@@ -549,11 +549,9 @@ int handle_vm86_trap(struct kernel_vm86_regs *regs, long error_code, int trapno)
{
if (VMPI.is_vm86pus) {
if ((trapno == 3) || (trapno == 1)) {
+ /* Queue up a return to normal userspace. */
KVM86->regs32->ax = VM86_TRAP + (trapno << 8);
- /* setting this flag forces the code in entry_32.S to
- the path where we call save_v86_state() and change
- the stack pointer to KVM86->regs32 */
- set_thread_flag(TIF_NOTIFY_RESUME);
+ set_thread_flag(TIF_EXIT_VM86);
return 0;
}
do_int(regs, trapno, (unsigned char __user *) (regs->pt.ss << 4), SP(regs));
--
2.4.3

--
To unsubscribe from this list: send the line "unsubscribe linux-kernel" in
the body of a message to majordomo@xxxxxxxxxxxxxxx
More majordomo info at http://vger.kernel.org/majordomo-info.html
Please read the FAQ at http://www.tux.org/lkml/