[PATCH v2 5/6] x86: Rewrite ret_from_fork() in C

From: Peter Zijlstra
Date: Thu Jun 22 2023 - 10:50:44 EST


From: Brian Gerst <brgerst@xxxxxxxxx>

When kCFI is enabled, special handling is needed for the indirect call
to the kernel thread function. Rewrite the ret_from_fork() function in
C so that the compiler can properly handle the indirect call.

Suggested-by: Peter Zijlstra (Intel) <peterz@xxxxxxxxxxxxx>
Signed-off-by: Brian Gerst <brgerst@xxxxxxxxx>
Signed-off-by: Peter Zijlstra (Intel) <peterz@xxxxxxxxxxxxx>
Link: https://lkml.kernel.org/r/20230622120750.5549-3-brgerst@xxxxxxxxx
---
arch/x86/entry/entry_32.S | 30 +++++++-----------------------
arch/x86/entry/entry_64.S | 35 ++++++++++-------------------------
arch/x86/include/asm/switch_to.h | 4 +++-
arch/x86/kernel/process.c | 22 +++++++++++++++++++++-
4 files changed, 41 insertions(+), 50 deletions(-)

--- a/arch/x86/entry/entry_32.S
+++ b/arch/x86/entry/entry_32.S
@@ -727,37 +727,21 @@ SYM_CODE_END(__switch_to_asm)
* edi: kernel thread arg
*/
.pushsection .text, "ax"
-SYM_CODE_START(ret_from_fork)
+SYM_CODE_START(ret_from_fork_asm)
/* return address for the stack unwinder */
pushl $.Lsyscall_32_done
FRAME_BEGIN

- pushl %eax
- call schedule_tail
+ /* prev already in EAX */
+ movl %esp, %edx /* regs */
+ movl %ebx, %ecx /* fn */
+ pushl %edi /* fn_arg */
+ call ret_from_fork
addl $4, %esp

- testl %ebx, %ebx
- jnz 1f /* kernel threads are uncommon */
-
-2:
- /* When we fork, we trace the syscall return in the child, too. */
- movl %esp, %eax
- call syscall_exit_to_user_mode
-
FRAME_END
RET
-
- /* kernel thread */
-1: movl %edi, %eax
- CALL_NOSPEC ebx
- /*
- * A kernel thread is allowed to return here after successfully
- * calling kernel_execve(). Exit to userspace to complete the execve()
- * syscall.
- */
- movl $0, PT_EAX(%esp)
- jmp 2b
-SYM_CODE_END(ret_from_fork)
+SYM_CODE_END(ret_from_fork_asm)
.popsection

SYM_ENTRY(__begin_SYSENTER_singlestep_region, SYM_L_GLOBAL, SYM_A_NONE)
--- a/arch/x86/entry/entry_64.S
+++ b/arch/x86/entry/entry_64.S
@@ -284,36 +284,21 @@ SYM_FUNC_END(__switch_to_asm)
* r12: kernel thread arg
*/
.pushsection .text, "ax"
- __FUNC_ALIGN
-SYM_CODE_START_NOALIGN(ret_from_fork)
+SYM_CODE_START(ret_from_fork_asm)
UNWIND_HINT_END_OF_STACK
ANNOTATE_NOENDBR // copy_thread
CALL_DEPTH_ACCOUNT
- movq %rax, %rdi
- call schedule_tail /* rdi: 'prev' task parameter */

- testq %rbx, %rbx /* from kernel_thread? */
- jnz 1f /* kernel threads are uncommon */
+ /* return address for the stack unwinder */
+ pushq $swapgs_restore_regs_and_return_to_usermode
+ UNWIND_HINT_FUNC

-2:
- UNWIND_HINT_REGS
- movq %rsp, %rdi
- call syscall_exit_to_user_mode /* returns with IRQs disabled */
- jmp swapgs_restore_regs_and_return_to_usermode
-
-1:
- /* kernel thread */
- UNWIND_HINT_END_OF_STACK
- movq %r12, %rdi
- CALL_NOSPEC rbx
- /*
- * A kernel thread is allowed to return here after successfully
- * calling kernel_execve(). Exit to userspace to complete the execve()
- * syscall.
- */
- movq $0, RAX(%rsp)
- jmp 2b
-SYM_CODE_END(ret_from_fork)
+ movq %rax, %rdi /* prev */
+ movq %rsp, %rsi /* regs */
+ movq %rbx, %rdx /* fn */
+ movq %r12, %rcx /* fn_arg */
+ jmp ret_from_fork
+SYM_CODE_END(ret_from_fork_asm)
.popsection

.macro DEBUG_ENTRY_ASSERT_IRQS_OFF
--- a/arch/x86/include/asm/switch_to.h
+++ b/arch/x86/include/asm/switch_to.h
@@ -12,7 +12,9 @@ struct task_struct *__switch_to_asm(stru
__visible struct task_struct *__switch_to(struct task_struct *prev,
struct task_struct *next);

-asmlinkage void ret_from_fork(void);
+asmlinkage void ret_from_fork_asm(void);
+__visible void ret_from_fork(struct task_struct *prev, struct pt_regs *regs,
+ int (*fn)(void *), void *fn_arg);

/*
* This is the structure pointed to by thread.sp for an inactive task. The
--- a/arch/x86/kernel/process.c
+++ b/arch/x86/kernel/process.c
@@ -28,6 +28,7 @@
#include <linux/static_call.h>
#include <trace/events/power.h>
#include <linux/hw_breakpoint.h>
+#include <linux/entry-common.h>
#include <asm/cpu.h>
#include <asm/apic.h>
#include <linux/uaccess.h>
@@ -134,6 +135,25 @@ static int set_new_tls(struct task_struc
return do_set_thread_area_64(p, ARCH_SET_FS, tls);
}

+__visible void ret_from_fork(struct task_struct *prev, struct pt_regs *regs,
+ int (*fn)(void *), void *fn_arg)
+{
+ schedule_tail(prev);
+
+ /* Is this a kernel thread? */
+ if (unlikely(fn)) {
+ fn(fn_arg);
+ /*
+ * A kernel thread is allowed to return here after successfully
+ * calling kernel_execve(). Exit to userspace to complete the
+ * execve() syscall.
+ */
+ regs->ax = 0;
+ }
+
+ syscall_exit_to_user_mode(regs);
+}
+
int copy_thread(struct task_struct *p, const struct kernel_clone_args *args)
{
unsigned long clone_flags = args->flags;
@@ -149,7 +169,7 @@ int copy_thread(struct task_struct *p, c
frame = &fork_frame->frame;

frame->bp = encode_frame_pointer(childregs);
- frame->ret_addr = (unsigned long) ret_from_fork;
+ frame->ret_addr = (unsigned long) ret_from_fork_asm;
p->thread.sp = (unsigned long) fork_frame;
p->thread.io_bitmap = NULL;
p->thread.iopl_warn = 0;