[RFC] de-asmify the x86-64 system call slowpath

From: Linus Torvalds
Date: Sun Jan 26 2014 - 17:28:38 EST


The x86-64 (and 32-bit, for that matter) system call slowpaths are all
in C, but the *selection* of which slow-path to take is a mixture of
complicated assembler ("sysret_check -> sysret_careful ->
sysret_signal -> sysret_audit -> int_check_syscall_exit_work" etc), and
oddly named and placed C code ("schedule_user" vs
"__audit_syscall_exit" vs "do_notify_resume").

This attached patch tries to take the "do_notify_resume()" approach,
renaming it to something sane ("syscall_exit_slowpath") and calling
out to *all* the different slow cases from that one place, instead of
having some cases hardcoded in asm, and some in C. And instead of
hardcoding which cases result in an "iretq" and which cases result in a
faster sysret case, it's now simply a return value from that
syscall_exit_slowpath() function, so it's very natural and easy to say
"taking a signal will force us to do the slow iretq case, but we can
do the task exit work and still do the sysret".

I've marked this as an RFC, because I didn't bother trying to clean up
the 32-bit code similarly (no test-cases, and trust me, if you get
this wrong, it will fail spectacularly but in very subtle and
hard-to-debug ways), and I also didn't bother with the slow cases in
the "iretq" path, so that path still has the odd asm cases and calls
the old (now legacy) do_notify_resume() path.

But this is actually tested, and seems to work (including limited
testing with strace, gdb etc), and while it adds a few more lines than
it removes, the removed lines are mostly asm, and added lines are C
(and part of them are the temporary still-extant do_notify_resume()
wrapper). In particular, it should be fairly straightforward to take
this as a starting point, removing the extant
do_notify_resume/schedule/etc cases one by one, and get rid of more
asm code and finally the wrapper.

Comments? This was obviously brought on by my frustration with the
currently nasty do_notify_resume() always returning to iret for the
task_work case, and PeterZ's patch that fixed that, but made the asm
mess even *worse*.

Linus
arch/x86/kernel/entry_64.S | 51 ++++++++++++++--------------------------------
arch/x86/kernel/signal.c | 37 ++++++++++++++++++++++++++++-----
2 files changed, 47 insertions(+), 41 deletions(-)

diff --git a/arch/x86/kernel/entry_64.S b/arch/x86/kernel/entry_64.S
index 1e96c3628bf2..15b5953da958 100644
--- a/arch/x86/kernel/entry_64.S
+++ b/arch/x86/kernel/entry_64.S
@@ -658,30 +658,24 @@ sysret_check:
/* Handle reschedules */
/* edx: work, edi: workmask */
sysret_careful:
- bt $TIF_NEED_RESCHED,%edx
- jnc sysret_signal
TRACE_IRQS_ON
ENABLE_INTERRUPTS(CLBR_NONE)
- pushq_cfi %rdi
- SCHEDULE_USER
- popq_cfi %rdi
- jmp sysret_check
+ SAVE_REST
+ FIXUP_TOP_OF_STACK %r11
+ movq %rsp,%rdi
+ movl %edx,%esi
+ call syscall_exit_slowpath
+ movl $_TIF_ALLWORK_MASK & ~_TIF_SYSCALL_AUDIT,%edi
+ testl %eax,%eax
+ jne sysret_using_iret
+ addq $REST_SKIP,%rsp
+ jmp sysret_check;

- /* Handle a signal */
-sysret_signal:
- TRACE_IRQS_ON
- ENABLE_INTERRUPTS(CLBR_NONE)
-#ifdef CONFIG_AUDITSYSCALL
- bt $TIF_SYSCALL_AUDIT,%edx
- jc sysret_audit
-#endif
- /*
- * We have a signal, or exit tracing or single-step.
- * These all wind up with the iret return path anyway,
- * so just join that path right now.
- */
- FIXUP_TOP_OF_STACK %r11, -ARGOFFSET
- jmp int_check_syscall_exit_work
+sysret_using_iret:
+ RESTORE_REST
+ DISABLE_INTERRUPTS(CLBR_NONE)
+ TRACE_IRQS_OFF
+ jmp int_with_check

badsys:
movq $-ENOSYS,RAX-ARGOFFSET(%rsp)
@@ -703,20 +697,6 @@ auditsys:
call __audit_syscall_entry
LOAD_ARGS 0 /* reload call-clobbered registers */
jmp system_call_fastpath
-
- /*
- * Return fast path for syscall audit. Call __audit_syscall_exit()
- * directly and then jump back to the fast path with TIF_SYSCALL_AUDIT
- * masked off.
- */
-sysret_audit:
- movq RAX-ARGOFFSET(%rsp),%rsi /* second arg, syscall return value */
- cmpq $-MAX_ERRNO,%rsi /* is it < -MAX_ERRNO? */
- setbe %al /* 1 if so, 0 if not */
- movzbl %al,%edi /* zero-extend that into %edi */
- call __audit_syscall_exit
- movl $(_TIF_ALLWORK_MASK & ~_TIF_SYSCALL_AUDIT),%edi
- jmp sysret_check
#endif /* CONFIG_AUDITSYSCALL */

/* Do syscall tracing */
@@ -786,7 +766,6 @@ int_careful:
int_very_careful:
TRACE_IRQS_ON
ENABLE_INTERRUPTS(CLBR_NONE)
-int_check_syscall_exit_work:
SAVE_REST
/* Check for syscall exit trace */
testl $_TIF_WORK_SYSCALL_EXIT,%edx
diff --git a/arch/x86/kernel/signal.c b/arch/x86/kernel/signal.c
index 9e5de6813e1f..25d02f6a65f1 100644
--- a/arch/x86/kernel/signal.c
+++ b/arch/x86/kernel/signal.c
@@ -725,13 +725,30 @@ static void do_signal(struct pt_regs *regs)
}

/*
- * notification of userspace execution resumption
- * - triggered by the TIF_WORK_MASK flags
+ * This is the system call exit slowpath, called if any of
+ * the thread info flags are set that cause us to not just
+ * return using a 'sysret' instruction.
+ *
+ * Return zero if the user register state hasn't changed,
+ * and a 'sysret' is still ok. Return nonzero if we need
+ * to go to the slow 'iret' path due to possible register
+ * changes (signals, ptrace, whatever).
*/
-__visible void
-do_notify_resume(struct pt_regs *regs, void *unused, __u32 thread_info_flags)
+__visible int syscall_exit_slowpath(struct pt_regs *regs, u32 thread_info_flags)
{
+ int retval = 0;
+
user_exit();
+ if (thread_info_flags & _TIF_NEED_RESCHED)
+ schedule();
+
+ if (thread_info_flags & _TIF_WORK_SYSCALL_EXIT)
+ syscall_trace_leave(regs);
+
+#ifdef CONFIG_AUDITSYSCALL
+ if (thread_info_flags & _TIF_SYSCALL_AUDIT)
+ __audit_syscall_exit(regs->ax, regs->ax < -MAX_ERRNO);
+#endif

#ifdef CONFIG_X86_MCE
/* notify userspace of pending MCEs */
@@ -743,8 +760,10 @@ do_notify_resume(struct pt_regs *regs, void *unused, __u32 thread_info_flags)
uprobe_notify_resume(regs);

/* deal with pending signal delivery */
- if (thread_info_flags & _TIF_SIGPENDING)
+ if (thread_info_flags & _TIF_SIGPENDING) {
do_signal(regs);
+ retval = 1;
+ }

if (thread_info_flags & _TIF_NOTIFY_RESUME) {
clear_thread_flag(TIF_NOTIFY_RESUME);
@@ -754,8 +773,16 @@ do_notify_resume(struct pt_regs *regs, void *unused, __u32 thread_info_flags)
fire_user_return_notifiers();

user_enter();
+ return retval;
}

+/* Get rid of this old interface some day.. */
+__visible void do_notify_resume(struct pt_regs *regs, void *unused, u32 thread_info_flags)
+{
+ syscall_exit_slowpath(regs, thread_info_flags & ~(_TIF_NEED_RESCHED|_TIF_SYSCALL_AUDIT));
+}
+
+
void signal_fault(struct pt_regs *regs, void __user *frame, char *where)
{
struct task_struct *me = current;