[RFC PATCH] x86/pkeys: update PKRU to enable pkey 0 before XSAVE

From: Aruna Ramakrishna
Date: Thu Mar 14 2024 - 13:29:46 EST


This patch is a workaround for a bug where the PKRU value is not
restored to the init value before the signal handler is invoked.

Problem description:
Let's assume there's a multithreaded application that runs untrusted
user code. Each thread has its stack/code protected by a non-zero pkey,
and the PKRU register is set up such that only that particular non-zero
pkey is enabled. Each thread also sets up an alternate signal stack to
handle signals, which is protected by pkey zero. The pkeys man page
documents that the PKRU will be reset to init_pkru when the signal
handler is invoked, which means that pkey zero access will be enabled.
But this reset happens after the kernel attempts to push fpu state
to the alternate stack, which is not (yet) accessible by the kernel,
which leads to a new SIGSEGV being sent to the application, terminating
it.

Enabling both the non-zero pkey (for the thread) and pkey zero (in
userspace) will not work for us. We cannot have the alt stack writeable
by all - the rationale here is that the code running in that thread
(using a non-zero pkey) is untrusted and should not have access to the
alternate signal stack (that uses pkey zero), to prevent the return address
of a function from being changed. The expectation is that kernel should be
able to set up the alternate signal stack and deliver the signal to the
application even if pkey zero is explicitly disabled by the application
(as documented in the pkeys man page). The signal handler accessibility
should not be dictated by whatever PKRU value the thread sets up.

Solution:
The PKRU register is managed by XSAVE, which means the sigframe contents
must match the register contents - which is not the case here. We want
the sigframe to contain the user-defined PKRU value (so that it is
restored correctly from sigcontext) but the actual register must be
reset to init_pkru so that the alt stack is accessible and the signal
can be delivered to the application. It seems that the proper fix here
would be to remove PKRU from the XSAVE framework and manage it
separately, which is quite complicated. As a workaround, this patch does
something like this:

orig_pkru = rdpkru();
wrpkru(init_pkru & orig_pkru);
xsave_to_user_sigframe();
put_user(pkru_sigframe_addr, orig_pkru)

Note:
I think the use_xsave() check in __update_pkru_in_sigframe() is redundant,
but I'm not 100% sure.

Signed-off-by: Aruna Ramakrishna <aruna.ramakrishna@xxxxxxxxxx>
Helped-by: Dave Kleikamp <dave.kleikamp@xxxxxxxxxx>
Helped-by: Prakash Sangappa <prakash.sangappa@xxxxxxxxxx>
---
arch/x86/include/asm/fpu/xstate.h | 8 ++++
arch/x86/kernel/fpu/internal.h | 7 +---
arch/x86/kernel/fpu/signal.c | 4 +-
arch/x86/kernel/fpu/xstate.c | 13 +++++++
arch/x86/kernel/signal.c | 65 ++++++++++++++++++++++++++++++-
5 files changed, 88 insertions(+), 9 deletions(-)

diff --git a/arch/x86/include/asm/fpu/xstate.h b/arch/x86/include/asm/fpu/xstate.h
index d4427b88ee12..681d4bb5939c 100644
--- a/arch/x86/include/asm/fpu/xstate.h
+++ b/arch/x86/include/asm/fpu/xstate.h
@@ -112,6 +112,14 @@ void xsaves(struct xregs_state *xsave, u64 mask);
void xrstors(struct xregs_state *xsave, u64 mask);

int xfd_enable_feature(u64 xfd_err);
+void *get_xsave_standard_addr(struct xregs_state *xsave, int xfeature_nr);
+bool check_xstate_in_sigframe(struct fxregs_state __user *fxbuf,
+ struct _fpx_sw_bytes *fx_sw);
+
+static __always_inline __pure bool use_xsave(void)
+{
+ return cpu_feature_enabled(X86_FEATURE_XSAVE);
+}

#ifdef CONFIG_X86_64
DECLARE_STATIC_KEY_FALSE(__fpu_state_size_dynamic);
diff --git a/arch/x86/kernel/fpu/internal.h b/arch/x86/kernel/fpu/internal.h
index dbdb31f55fc7..ab838c9625dd 100644
--- a/arch/x86/kernel/fpu/internal.h
+++ b/arch/x86/kernel/fpu/internal.h
@@ -4,12 +4,7 @@

extern struct fpstate init_fpstate;

-/* CPU feature check wrappers */
-static __always_inline __pure bool use_xsave(void)
-{
- return cpu_feature_enabled(X86_FEATURE_XSAVE);
-}
-
+/* CPU feature check wrapper */
static __always_inline __pure bool use_fxsr(void)
{
return cpu_feature_enabled(X86_FEATURE_FXSR);
diff --git a/arch/x86/kernel/fpu/signal.c b/arch/x86/kernel/fpu/signal.c
index 247f2225aa9f..86fcae8ea141 100644
--- a/arch/x86/kernel/fpu/signal.c
+++ b/arch/x86/kernel/fpu/signal.c
@@ -24,8 +24,8 @@
* Check for the presence of extended state information in the
* user fpstate pointer in the sigcontext.
*/
-static inline bool check_xstate_in_sigframe(struct fxregs_state __user *fxbuf,
- struct _fpx_sw_bytes *fx_sw)
+bool check_xstate_in_sigframe(struct fxregs_state __user *fxbuf,
+ struct _fpx_sw_bytes *fx_sw)
{
int min_xstate_size = sizeof(struct fxregs_state) +
sizeof(struct xstate_header);
diff --git a/arch/x86/kernel/fpu/xstate.c b/arch/x86/kernel/fpu/xstate.c
index 117e74c44e75..81de5759a71a 100644
--- a/arch/x86/kernel/fpu/xstate.c
+++ b/arch/x86/kernel/fpu/xstate.c
@@ -941,6 +941,19 @@ static void *__raw_xsave_addr(struct xregs_state *xsave, int xfeature_nr)
return (void *)xsave + xfeature_get_offset(xcomp_bv, xfeature_nr);
}

+/*
+ * Given an xstate feature nr, calculate where in the xsave
+ * buffer the state is. The xsave buffer should be in standard
+ * format, not compacted (e.g. user mode signal frames).
+ */
+void *get_xsave_standard_addr(struct xregs_state *xsave, int xfeature_nr)
+{
+ if (WARN_ON_ONCE(!xfeature_enabled(xfeature_nr)))
+ return NULL;
+
+ return (void *)xsave + xstate_offsets[xfeature_nr];
+}
+
/*
* Given the xsave area and a state inside, this function returns the
* address of the state.
diff --git a/arch/x86/kernel/signal.c b/arch/x86/kernel/signal.c
index 31b6f5dddfc2..b52edf9c29e8 100644
--- a/arch/x86/kernel/signal.c
+++ b/arch/x86/kernel/signal.c
@@ -224,11 +224,52 @@ setup_rt_frame(struct ksignal *ksig, struct pt_regs *regs)
}
}

+static inline int
+__update_pkru_in_sigframe(struct rt_sigframe __user *frame, u32 new_pkru)
+{
+ int err = -EFAULT;
+ struct _fpx_sw_bytes fx_sw;
+ struct pkru_state *pk = NULL;
+ struct sigcontext __user *sc;
+ unsigned long buf;
+
+ if (!cpu_feature_enabled(X86_FEATURE_OSPKE) || !use_xsave())
+ return 0;
+
+ if (!user_read_access_begin(frame, sizeof(*frame)))
+ goto out;
+ sc = (struct sigcontext __user *) &frame->uc.uc_mcontext;
+ unsafe_get_user(buf,
+ (unsigned long __user *)&sc->fpstate,
+ uaccess_end);
+ user_access_end();
+
+ if (unlikely(!check_xstate_in_sigframe((void __user *) buf, &fx_sw)))
+ goto out;
+
+ pk = get_xsave_standard_addr((struct xregs_state __user *) buf,
+ XFEATURE_PKRU);
+ if (!pk || !user_write_access_begin((struct xregs_state __user *) buf,
+ sizeof(struct xregs_state)))
+ goto out;
+ unsafe_put_user(new_pkru, (unsigned int __user *) pk, uaccess_end);
+
+ err = 0;
+
+uaccess_end:
+ user_access_end();
+out:
+ return err;
+}
+
static void
handle_signal(struct ksignal *ksig, struct pt_regs *regs)
{
bool stepping, failed;
struct fpu *fpu = &current->thread.fpu;
+ u32 orig_pkru;
+ u32 init_pkru_snapshot;
+ struct rt_sigframe __user *frame = NULL;

if (v8086_mode(regs))
save_v86_state((struct kernel_vm86_regs *) regs, VM86_SIGNAL);
@@ -264,8 +305,30 @@ handle_signal(struct ksignal *ksig, struct pt_regs *regs)
if (stepping)
user_disable_single_step(current);

+ /*
+ * We must have access to both the current stack and the exception
+ * stack. The exception stack must be accessible by the init PKRU
+ * value. Temporarily set the PKRU to access both. It will be set
+ * properly in fpu__clear_user_states().
+ */
+ orig_pkru = read_pkru();
+ init_pkru_snapshot = pkru_get_init_value();
+ write_pkru(orig_pkru & init_pkru_snapshot);
+
failed = (setup_rt_frame(ksig, regs) < 0);
- if (!failed) {
+ if (failed)
+ write_pkru(orig_pkru);
+ else {
+ /*
+ * Overwrite the PKRU value on the signal frame with the
+ * user-defined value so that it gets restored correctly
+ * from sigcontext.
+ */
+ frame = (struct rt_sigframe __force __user *)(regs->sp);
+ if (__update_pkru_in_sigframe(frame, orig_pkru))
+ pr_info("Failed to reset PKRU value to 0x%x on "
+ "the signal frame\n", orig_pkru);
+
/*
* Clear the direction flag as per the ABI for function entry.
*
--
2.39.3