[PATCH RFC 2/5] x86,fpu: delay FPU register loading until switch to userspace

From: riel
Date: Sat Oct 01 2016 - 16:50:58 EST


From: Rik van Riel <riel@xxxxxxxxxx>

Delay the loading of FPU registers until a process switches back to
userspace. This allows us to skip FPU saving & restoring for kernel
threads, the idle task, and tasks that are spinning in kernel space.
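
In rough outline, the new flow looks like this (a simplified sketch of the
code in this patch, not the literal implementation):

        /* switching away from the old task: save its registers */
        switch_fpu_prepare(old_fpu, cpu);       /* copy_fpregs_to_fpstate() */

        /* switching to the new task: only mark that a load is needed */
        switch_fpu_finish();                    /* set_thread_flag(TIF_LOAD_FPU) */

        /* only if the task actually returns to userspace: load registers */
        if (unlikely(test_and_clear_thread_flag(TIF_LOAD_FPU)))
                switch_fpu_return();            /* copy_kernel_to_fpregs() */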

It also allows us to avoid repeatedly saving & restoring the userspace FPU
context across back-to-back invocations of kernel_fpu_begin & kernel_fpu_end.
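
For example, code that uses the FPU from a loop in kernel context (the loop
body below is a hypothetical placeholder) now saves the user context at most
once, and the reload happens once on the way back to userspace instead of in
every kernel_fpu_end():

        for (i = 0; i < nr_chunks; i++) {
                kernel_fpu_begin();      /* saves user FPU state at most once */
                do_simd_work(&chunk[i]); /* hypothetical SIMD worker */
                kernel_fpu_end();        /* no copy_kernel_to_fpregs() here;
                                          * only sets TIF_LOAD_FPU */
        }
        /* the user FPU registers are reloaded in prepare_exit_to_usermode() */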

Not overwriting the FPU state of a task unless we need to also allows
us to be lazier about restoring it, in a future patch.

Signed-off-by: Rik van Riel <riel@xxxxxxxxxx>
---
arch/x86/entry/common.c | 4 ++++
arch/x86/include/asm/fpu/api.h | 5 +++++
arch/x86/include/asm/fpu/internal.h | 44 +++++++++----------------------------
arch/x86/include/asm/thread_info.h | 4 +++-
arch/x86/kernel/fpu/core.c | 17 ++++++++------
arch/x86/kernel/process.c | 35 +++++++++++++++++++++++++++++
arch/x86/kernel/process_32.c | 5 ++---
arch/x86/kernel/process_64.c | 5 ++---
8 files changed, 71 insertions(+), 48 deletions(-)

diff --git a/arch/x86/entry/common.c b/arch/x86/entry/common.c
index 1433f6b4607d..a69bbefa3408 100644
--- a/arch/x86/entry/common.c
+++ b/arch/x86/entry/common.c
@@ -27,6 +27,7 @@
#include <asm/vdso.h>
#include <asm/uaccess.h>
#include <asm/cpufeature.h>
+#include <asm/fpu/api.h>

#define CREATE_TRACE_POINTS
#include <trace/events/syscalls.h>
@@ -197,6 +198,9 @@ __visible inline void prepare_exit_to_usermode(struct pt_regs *regs)
if (unlikely(cached_flags & EXIT_TO_USERMODE_LOOP_FLAGS))
exit_to_usermode_loop(regs, cached_flags);

+ if (unlikely(test_and_clear_thread_flag(TIF_LOAD_FPU)))
+ switch_fpu_return();
+
#ifdef CONFIG_COMPAT
/*
* Compat syscalls set TS_COMPAT. Make sure we clear it before
diff --git a/arch/x86/include/asm/fpu/api.h b/arch/x86/include/asm/fpu/api.h
index 1429a7c736db..edd7dc7ae4f7 100644
--- a/arch/x86/include/asm/fpu/api.h
+++ b/arch/x86/include/asm/fpu/api.h
@@ -37,6 +37,11 @@ extern int irq_ts_save(void);
extern void irq_ts_restore(int TS_state);

/*
+ * Set up the userspace FPU context before returning to userspace.
+ */
+extern void switch_fpu_return(void);
+
+/*
* Query the presence of one or more xfeatures. Works on any legacy CPU as well.
*
* If 'feature_name' is set then put a human-readable description of
diff --git a/arch/x86/include/asm/fpu/internal.h b/arch/x86/include/asm/fpu/internal.h
index 79e1cee9f3b0..b5accb35e434 100644
--- a/arch/x86/include/asm/fpu/internal.h
+++ b/arch/x86/include/asm/fpu/internal.h
@@ -19,6 +19,7 @@
#include <asm/fpu/xstate.h>
#include <asm/cpufeature.h>
#include <asm/trace/fpu.h>
+#include <asm/thread_info.h>

/*
* High level FPU state handling functions:
@@ -576,13 +577,17 @@ static inline void fpregs_deactivate(struct fpu *fpu)
/*
* FPU state switching for scheduling.
*
- * This is a two-stage process:
+ * This is a three-stage process:
*
* - switch_fpu_prepare() saves the old state.
* This is done within the context of the old process.
*
- * - switch_fpu_finish() restores the new state
- * and flips CR0.TS as necessary.
+ * - switch_fpu_finish() sets TIF_LOAD_FPU, causing FPU state to
+ * be loaded when the new process returns to userspace.
+ * This is done with current_task pointing to the new process.
+ *
+ * - switch_fpu_return() restores the new state and flips CR0.TS as
+ * necessary. This only runs if the process returns to userspace.
*/
static inline void
switch_fpu_prepare(struct fpu *old_fpu, int cpu)
@@ -605,38 +610,9 @@ switch_fpu_prepare(struct fpu *old_fpu, int cpu)
/*
* Misc helper functions:
*/
-
-/*
- * Set up the userspace FPU context for the new task.
- */
-static inline void switch_fpu_finish(struct fpu *new_fpu)
+static inline void switch_fpu_finish(void)
{
- bool preload;
- /*
- * If the task has used the math, pre-load the FPU on xsave processors
- * or if the past 5 consecutive context-switches used math.
- */
- preload = static_cpu_has(X86_FEATURE_FPU) &&
- new_fpu->fpstate_active &&
- (use_eager_fpu() || new_fpu->counter > 5);
-
- if (preload) {
- prefetch(&new_fpu->state);
- new_fpu->counter++;
- __fpregs_activate(new_fpu);
- trace_x86_fpu_regs_activated(new_fpu);
-
- /* Don't change CR0.TS if we just switch! */
- if (!__this_cpu_read(fpu_active)) {
- __fpregs_activate_hw();
- __this_cpu_write(fpu_active, true);
- }
-
- copy_kernel_to_fpregs(&new_fpu->state);
- } else if (__this_cpu_read(fpu_active)) {
- __this_cpu_write(fpu_active, false);
- __fpregs_deactivate_hw();
- }
+ set_thread_flag(TIF_LOAD_FPU);
}

/*
diff --git a/arch/x86/include/asm/thread_info.h b/arch/x86/include/asm/thread_info.h
index 8b7c8d8e0852..401e9c3e6039 100644
--- a/arch/x86/include/asm/thread_info.h
+++ b/arch/x86/include/asm/thread_info.h
@@ -106,6 +106,7 @@ struct thread_info {
#define TIF_SYSCALL_TRACEPOINT 28 /* syscall tracepoint instrumentation */
#define TIF_ADDR32 29 /* 32-bit address space on 64 bits */
#define TIF_X32 30 /* 32-bit native x86-64 binary */
+#define TIF_LOAD_FPU 31 /* load FPU on return to userspace */

#define _TIF_SYSCALL_TRACE (1 << TIF_SYSCALL_TRACE)
#define _TIF_NOTIFY_RESUME (1 << TIF_NOTIFY_RESUME)
@@ -129,6 +130,7 @@ struct thread_info {
#define _TIF_SYSCALL_TRACEPOINT (1 << TIF_SYSCALL_TRACEPOINT)
#define _TIF_ADDR32 (1 << TIF_ADDR32)
#define _TIF_X32 (1 << TIF_X32)
+#define _TIF_LOAD_FPU (1 << TIF_LOAD_FPU)

/*
* work to do in syscall_trace_enter(). Also includes TIF_NOHZ for
@@ -142,7 +144,7 @@ struct thread_info {
/* work to do on any return to user space */
#define _TIF_ALLWORK_MASK \
((0x0000FFFF & ~_TIF_SECCOMP) | _TIF_SYSCALL_TRACEPOINT | \
- _TIF_NOHZ)
+ _TIF_NOHZ | _TIF_LOAD_FPU)

/* flags to check in __switch_to() */
#define _TIF_WORK_CTXSW \
diff --git a/arch/x86/kernel/fpu/core.c b/arch/x86/kernel/fpu/core.c
index 82cd46584528..c4350f188be1 100644
--- a/arch/x86/kernel/fpu/core.c
+++ b/arch/x86/kernel/fpu/core.c
@@ -118,6 +118,8 @@ void __kernel_fpu_begin(void)

kernel_fpu_disable();

+ this_cpu_write(fpu_fpregs_owner_ctx, NULL);
+
if (fpu->fpregs_active) {
/*
* Ignore return value -- we don't care if reg state
@@ -125,8 +127,10 @@ void __kernel_fpu_begin(void)
*/
copy_fpregs_to_fpstate(fpu);
} else {
- this_cpu_write(fpu_fpregs_owner_ctx, NULL);
- __fpregs_activate_hw();
+ if (!__this_cpu_read(fpu_active)) {
+ __this_cpu_write(fpu_active, true);
+ __fpregs_activate_hw();
+ }
}
}
EXPORT_SYMBOL(__kernel_fpu_begin);
@@ -135,11 +139,10 @@ void __kernel_fpu_end(void)
{
struct fpu *fpu = &current->thread.fpu;

- if (fpu->fpregs_active)
- copy_kernel_to_fpregs(&fpu->state);
- else
- __fpregs_deactivate_hw();
-
+ if (fpu->fpregs_active) {
+ switch_fpu_finish();
+ fpu->fpregs_active = 0;
+ }
kernel_fpu_enable();
}
EXPORT_SYMBOL(__kernel_fpu_end);
diff --git a/arch/x86/kernel/process.c b/arch/x86/kernel/process.c
index 62c0b0ea2ce4..087413be39cf 100644
--- a/arch/x86/kernel/process.c
+++ b/arch/x86/kernel/process.c
@@ -32,6 +32,7 @@
#include <asm/tlbflush.h>
#include <asm/mce.h>
#include <asm/vm86.h>
+#include <asm/fpu/types.h>

/*
* per-CPU TSS segments. Threads are completely 'soft' on Linux,
@@ -191,6 +192,40 @@ int set_tsc_mode(unsigned int val)
return 0;
}

+/*
+ * Set up the userspace FPU context before returning to userspace.
+ */
+void switch_fpu_return(void)
+{
+ struct fpu *fpu = &current->thread.fpu;
+ bool preload;
+ /*
+ * If the task has used the math, pre-load the FPU on xsave processors
+ * or if the past 5 consecutive context-switches used math.
+ */
+ preload = static_cpu_has(X86_FEATURE_FPU) &&
+ fpu->fpstate_active &&
+ (use_eager_fpu() || fpu->counter > 5);
+
+ if (preload) {
+ prefetch(&fpu->state);
+ fpu->counter++;
+ __fpregs_activate(fpu);
+ trace_x86_fpu_regs_activated(fpu);
+
+ /* Don't change CR0.TS if we just switch! */
+ if (!__this_cpu_read(fpu_active)) {
+ __fpregs_activate_hw();
+ __this_cpu_write(fpu_active, true);
+ }
+
+ copy_kernel_to_fpregs(&fpu->state);
+ } else if (__this_cpu_read(fpu_active)) {
+ __this_cpu_write(fpu_active, false);
+ __fpregs_deactivate_hw();
+ }
+}
+
void __switch_to_xtra(struct task_struct *prev_p, struct task_struct *next_p,
struct tss_struct *tss)
{
diff --git a/arch/x86/kernel/process_32.c b/arch/x86/kernel/process_32.c
index 8cd2f42190dc..45e08c14e06d 100644
--- a/arch/x86/kernel/process_32.c
+++ b/arch/x86/kernel/process_32.c
@@ -244,7 +244,6 @@ __switch_to(struct task_struct *prev_p, struct task_struct *next_p)
struct thread_struct *prev = &prev_p->thread,
*next = &next_p->thread;
struct fpu *prev_fpu = &prev->fpu;
- struct fpu *next_fpu = &next->fpu;
int cpu = smp_processor_id();
struct tss_struct *tss = &per_cpu(cpu_tss, cpu);

@@ -309,9 +308,9 @@ __switch_to(struct task_struct *prev_p, struct task_struct *next_p)
if (prev->gs | next->gs)
lazy_load_gs(next->gs);

- switch_fpu_finish(next_fpu);
-
this_cpu_write(current_task, next_p);

+ switch_fpu_finish();
+
return prev_p;
}
diff --git a/arch/x86/kernel/process_64.c b/arch/x86/kernel/process_64.c
index 92b9485a6a18..f3b83b6af6ea 100644
--- a/arch/x86/kernel/process_64.c
+++ b/arch/x86/kernel/process_64.c
@@ -260,7 +260,6 @@ __switch_to(struct task_struct *prev_p, struct task_struct *next_p)
struct thread_struct *prev = &prev_p->thread;
struct thread_struct *next = &next_p->thread;
struct fpu *prev_fpu = &prev->fpu;
- struct fpu *next_fpu = &next->fpu;
int cpu = smp_processor_id();
struct tss_struct *tss = &per_cpu(cpu_tss, cpu);
unsigned prev_fsindex, prev_gsindex;
@@ -415,8 +414,6 @@ __switch_to(struct task_struct *prev_p, struct task_struct *next_p)
prev->gsbase = 0;
prev->gsindex = prev_gsindex;

- switch_fpu_finish(next_fpu);
-
/*
* Switch the PDA and FPU contexts.
*/
@@ -425,6 +422,8 @@ __switch_to(struct task_struct *prev_p, struct task_struct *next_p)
/* Reload esp0 and ss1. This changes current_thread_info(). */
load_sp0(tss, next);

+ switch_fpu_finish();
+
/*
* Now maybe reload the debug registers and handle I/O bitmaps
*/
--
2.7.4