[RFC PATCH v2 2/3] restartable sequences: x86 ABI

From: Paul Turner
Date: Tue Oct 27 2015 - 19:57:22 EST


From: Paul Turner <pjt@xxxxxxxxxx>

Recall the general ABI is:
The kernel ABI generally consists of:
a) A shared TLS word which exports the current cpu and event-count
b) A shared TLS word which, when non-zero, stores the first post-commit
instruction if a sequence is active. (The kernel observing rips greater
than this takes no action and concludes user-space has completed its
critical section, but not yet cleared this state).
c) An architecture specific way to publish the state read from (a) which
user-space believes to be current.

This patch defines (c) for x86, both x86_64 and i386.

The exact sequence is:
*0. Userspace stores current event+cpu counter values
1. Userspace loads the rip to move to at failure into cx
2. Userspace loads the rip of the instruction following
the critical section into a registered TLS address.
3. Userspace loads the values read at [0] into a known
location.
4. Userspace tests to see whether the current event and
cpu counter values match those stored at 0. Manually
jumping to the address from [1] in the case of a
mismatch.

Note that if we are preempted or otherwise interrupted
then the kernel can also now perform this comparison
and conditionally jump us to [1].
4. Our final instruction bfeore [2] is then our commit.
The critical section is self-terminating. [2] must
also be cleared at this point.

For x86_64:
[3] uses rdx to represent cpu and event counter as a
single 64-bit value.

For i386:
[3] uses ax for cpu and dx for the event_counter.

Both:
Instruction after commit: rseq_state->post_commit_instr
Current event and cpu state: rseq_state->event_and_cpu

An example user-space x86_64 implementation:
__asm__ __volatile__ goto (
"movq $%l[failed], %%rcx\n"
"movq $1f, %[commit_instr]\n"
"cmpq %[start_value], %[current_value]\n"
"jnz %l[failed]\n"
"movq %[to_write], (%[target])\n"
"1: movq $0, %[commit_instr]\n"
: /* no outputs */
: [start_value]"d"(start_value.storage),
[current_value]"m"(__rseq_state),
[to_write]"r"(to_write),
[target]"r"(p),
[commit_instr]"m"(__rseq_state.post_commit_instr)
: "rcx", "memory"
: failed

Signed-off-by: Paul Turner <pjt@xxxxxxxxxx>
---
arch/x86/entry/common.c | 3 +
arch/x86/include/asm/restartable_sequences.h | 18 +++
arch/x86/kernel/Makefile | 2
arch/x86/kernel/restartable_sequences.c | 136 ++++++++++++++++++++++++++
arch/x86/kernel/signal.c | 7 +
kernel/restartable_sequences.c | 10 +-
6 files changed, 173 insertions(+), 3 deletions(-)
create mode 100644 arch/x86/include/asm/restartable_sequences.h
create mode 100644 arch/x86/kernel/restartable_sequences.c

diff --git a/arch/x86/entry/common.c b/arch/x86/entry/common.c
index a89fdbc..e382487 100644
--- a/arch/x86/entry/common.c
+++ b/arch/x86/entry/common.c
@@ -23,6 +23,7 @@
#include <linux/uprobes.h>

#include <asm/desc.h>
+#include <asm/restartable_sequences.h>
#include <asm/traps.h>
#include <asm/vdso.h>
#include <asm/uaccess.h>
@@ -249,6 +250,8 @@ static void exit_to_usermode_loop(struct pt_regs *regs, u32 cached_flags)
if (cached_flags & _TIF_NOTIFY_RESUME) {
clear_thread_flag(TIF_NOTIFY_RESUME);
tracehook_notify_resume(regs);
+ if (rseq_active(current))
+ arch_rseq_update_event_counter(regs);
}

if (cached_flags & _TIF_USER_RETURN_NOTIFY)
diff --git a/arch/x86/include/asm/restartable_sequences.h b/arch/x86/include/asm/restartable_sequences.h
new file mode 100644
index 0000000..75864a7
--- /dev/null
+++ b/arch/x86/include/asm/restartable_sequences.h
@@ -0,0 +1,18 @@
+#ifndef _ASM_X86_RESTARTABLE_SEQUENCES_H
+#define _ASM_X86_RESTARTABLE_SEQUENCES_H
+
+#include <asm/processor.h>
+#include <asm/ptrace.h>
+#include <linux/sched.h>
+
+#ifdef CONFIG_RESTARTABLE_SEQUENCES
+
+void arch_rseq_update_event_counter(struct pt_regs *regs);
+
+#else /* !CONFIG_RESTARTABLE_SEQUENCES */
+
+static inline void arch_rseq_update_event_counter(struct pt_regs *regs) {}
+
+#endif
+
+#endif /* _ASM_X86_RESTARTABLE_SEQUENCES_H */
diff --git a/arch/x86/kernel/Makefile b/arch/x86/kernel/Makefile
index b1b78ff..ee98fb6 100644
--- a/arch/x86/kernel/Makefile
+++ b/arch/x86/kernel/Makefile
@@ -110,6 +110,8 @@ obj-$(CONFIG_EFI) += sysfb_efi.o
obj-$(CONFIG_PERF_EVENTS) += perf_regs.o
obj-$(CONFIG_TRACING) += tracepoint.o

+obj-$(CONFIG_RESTARTABLE_SEQUENCES) += restartable_sequences.o
+
###
# 64 bit specific files
ifeq ($(CONFIG_X86_64),y)
diff --git a/arch/x86/kernel/restartable_sequences.c b/arch/x86/kernel/restartable_sequences.c
new file mode 100644
index 0000000..9f43efd
--- /dev/null
+++ b/arch/x86/kernel/restartable_sequences.c
@@ -0,0 +1,136 @@
+/*
+ * Restartable Sequences: x86 ABI.
+ *
+ * This program is free software; you can redistribute it and/or modify
+ * it under the terms of the GNU General Public License as published by
+ * the Free Software Foundation; either version 2 of the License, or
+ * (at your option) any later version.
+ *
+ * This program is distributed in the hope that it will be useful,
+ * but WITHOUT ANY WARRANTY; without even the implied warranty of
+ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
+ * GNU General Public License for more details.
+ *
+ * You should have received a copy of the GNU General Public License
+ * along with this program; if not, write to the Free Software
+ * Foundation, 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301, USA.
+ *
+ * Copyright (C) 2015, Google, Inc.,
+ * Paul Turner <pjt@xxxxxxxxxx> and Andrew Hunter <ahh@xxxxxxxxxx>
+ *
+ */
+
+#include <linux/sched.h>
+#include <linux/uaccess.h>
+#include <asm/ptrace.h>
+#include <asm/processor-flags.h>
+#include <asm/restartable_sequences.h>
+
+static inline u64 rseq_encode_cpu_and_event_count(int cpu, int event_count)
+{
+ return (u64)(event_count) << 32 | cpu;
+}
+
+static inline int rseq_regs_cpu(struct pt_regs *regs, int is_i386)
+{
+#ifdef CONFIG_64BIT
+ return is_i386 ? regs->ax : regs->dx & 0xFFFF;
+#else
+ return regs->ax;
+#endif
+}
+
+static inline int rseq_regs_event_count(struct pt_regs *regs, int is_i386)
+{
+#ifdef CONFIG_64BIT
+ return is_i386 ? regs->dx : regs->dx >> 32;
+#else
+ return regs->dx;
+#endif
+}
+
+void arch_rseq_update_event_counter(struct pt_regs *regs)
+{
+ struct restartable_sequence_state *rseq_state = &current->rseq_state;
+ int cpu = task_cpu(current);
+ int is_i386 = test_tsk_thread_flag(current, TIF_IA32);
+ int addr_size = is_i386 ? 4 : 8;
+ long post_commit_instr = 0;
+ u64 state_value;
+
+ /*
+ * Note: post_commit_instr must be zero-initialized above for the case
+ * of a 32-bit thread on a 64-bit system.
+ */
+ if (copy_from_user(&post_commit_instr,
+ rseq_state->post_commit_instr_addr, addr_size)) {
+ goto fail;
+ }
+
+ /* Handle potentially being within a critical section. */
+ if (regs->ip < post_commit_instr) {
+ /*
+ * The ABI is relatively compact, with some differences for 32
+ * and 64-bit threads.
+ *
+ * Progress is ordered as follows:
+ * *0. USerspace stores current event+cpu counter values
+ * 1. Userspace loads the rip to move to at failure into cx
+ * 2. Userspace loads the rip of the instruction following
+ * the critical section into a registered TLS address.
+ * 3. Userspace loads the values read at [0] into a known
+ * location.
+ * 4. Userspace tests to see whether the current event and
+ * cpu counter values match those stored at 0. Manually
+ * jumping to the address from [1] in the case of a
+ * mismatch.
+ *
+ * Note that if we are preempted or otherwise interrupted
+ * then the kernel can also now perform this comparison
+ * and conditionally jump us to [1].
+ * 4. Our final instruction bfeore [2] is then our commit.
+ * The critical section is self-terminating. [2] must
+ * also be cleared at this point.
+ *
+ * For x86_64:
+ * [3] uses rdx to represent cpu and event counter as a
+ * single 64-bit value.
+ *
+ * For i386:
+ * [3] uses ax for cpu and dx for the event_counter.
+ *
+ * Both:
+ * Instruction after commit: rseq_state->post_commit_instr
+ * Current event and cpu state: rseq_state->event_and_cpu
+ *
+ */
+ if (rseq_regs_cpu(regs, is_i386) != cpu ||
+ rseq_regs_event_count(regs, is_i386) !=
+ rseq_state->event_counter) {
+ if (clear_user(rseq_state->post_commit_instr_addr,
+ addr_size))
+ goto fail;
+
+ /*
+ * We set this after potentially failing in clear_user
+ * so that the signal arrives at the faulting rip.
+ */
+ regs->ip = regs->cx;
+ }
+ }
+
+ /* Update state. Compute the next expected state value. */
+ state_value = rseq_encode_cpu_and_event_count(cpu,
+ ++rseq_state->event_counter);
+
+ if (put_user(state_value, rseq_state->event_and_cpu))
+ goto fail;
+
+ return;
+fail:
+ /*
+ * User space has made some (invalid) protection change that does not
+ * allow us to safely continue execution. SEGV is the result.
+ */
+ force_sig(SIGSEGV, current);
+}
diff --git a/arch/x86/kernel/signal.c b/arch/x86/kernel/signal.c
index b7ffb7c..58f8813 100644
--- a/arch/x86/kernel/signal.c
+++ b/arch/x86/kernel/signal.c
@@ -30,6 +30,7 @@
#include <asm/fpu/signal.h>
#include <asm/vdso.h>
#include <asm/mce.h>
+#include <asm/restartable_sequences.h>
#include <asm/sighandling.h>
#include <asm/vm86.h>

@@ -615,6 +616,12 @@ setup_rt_frame(struct ksignal *ksig, struct pt_regs *regs)
sigset_t *set = sigmask_to_save();
compat_sigset_t *cset = (compat_sigset_t *) set;

+#ifdef CONFIG_RESTARTABLE_SEQUENCES
+ /* Increment the event counter for the, present, pre-signal frame. */
+ if (rseq_active(current))
+ arch_rseq_update_event_counter(regs);
+#endif
+
/* Set up the stack frame */
if (is_ia32_frame()) {
if (ksig->ka.sa.sa_flags & SA_SIGINFO)
diff --git a/kernel/restartable_sequences.c b/kernel/restartable_sequences.c
index c99a574..5449561 100644
--- a/kernel/restartable_sequences.c
+++ b/kernel/restartable_sequences.c
@@ -24,17 +24,21 @@

#ifdef CONFIG_RESTARTABLE_SEQUENCES

+#include <asm/restartable_sequences.h>
#include <linux/uaccess.h>
#include <linux/preempt.h>
#include <linux/syscalls.h>

static void rseq_sched_in_nop(struct preempt_notifier *pn, int cpu) {}
-static void rseq_sched_out_nop(struct preempt_notifier *pn,
- struct task_struct *next) {}
+static void rseq_sched_out(struct preempt_notifier *pn,
+ struct task_struct *next)
+{
+ set_thread_flag(TIF_NOTIFY_RESUME);
+}

static __read_mostly struct preempt_ops rseq_preempt_ops = {
.sched_in = rseq_sched_in_nop,
- .sched_out = rseq_sched_out_nop,
+ .sched_out = rseq_sched_out,
};

int rseq_configure_current(__user u64 *event_and_cpu,

--
To unsubscribe from this list: send the line "unsubscribe linux-kernel" in
the body of a message to majordomo@xxxxxxxxxxxxxxx
More majordomo info at http://vger.kernel.org/majordomo-info.html
Please read the FAQ at http://www.tux.org/lkml/