Re: [RFC][PATCH] mips: Fix arch_spin_unlock()

From: Will Deacon
Date: Tue Feb 09 2016 - 06:43:27 EST


On Tue, Feb 09, 2016 at 12:23:58PM +0100, Ingo Molnar wrote:
> * Will Deacon <will.deacon@xxxxxxx> wrote:
> > On Wed, Feb 03, 2016 at 01:32:10PM +0000, Will Deacon wrote:
> > > On Wed, Feb 03, 2016 at 09:33:39AM +0100, Ingo Molnar wrote:
> > > > In fact I'd suggest to test this via a quick runtime hack like this in rcupdate.h:
> > > >
> > > > extern int panic_timeout;
> > > >
> > > > ...
> > > >
> > > > if (panic_timeout)
> > > > smp_load_acquire(p);
> > > > else
> > > > typeof(*p) *________p1 = (typeof(*p) *__force)lockless_dereference(p);
> > > >
> > > > (or so)
> > >
> > > So the problem with this is that a LOAD <ctrl> LOAD sequence isn't an
> > > ordering hazard on ARM, so you're potentially at the mercy of the branch
> > > predictor as to whether you get an acquire. That's not to say it won't
> > > be discarded as soon as the conditional is resolved, but it could
> > > screw up the benchmarking.
> > >
> > > I'd be better off doing some runtime patching, but that's not something
> > > I can knock up in a couple of minutes (so I'll add it to my list).
> >
> > ... so I actually got that up and running, believe it or not. Filthy stuff.
>
> Wow!
>
> I tried to implement the simpler solution by hacking rcupdate.h, but got drowned
> in nasty circular header file dependencies and gave up...

I hacked linux/compiler.h, but got similar problems and ended up copying
what I need under a new namespace (posh way of saying I added _WILL to
everything until it built).

> If you are not overly embarrassed by posting hacky patches, mind posting your
> solution?

Ok, since you asked! I'm thoroughly ashamed of the hacks, but maybe it
will keep those spammy Microsoft recruiters at bay ;)

Note that the "trigger" is changing the loglevel, but you need to do this
by echoing to /proc/sysrq-trigger; otherwise the whole thing gets invoked
in irq context, which ends badly. It's also heavily arm64-specific.
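
(For reference: any of the loglevel keys should do, so once the patch
below is applied, something like

  echo 6 > /proc/sysrq-trigger

toggles every READ_ONCE_WILL site each time it runs, because the
patching just swaps the original and replacement instructions.)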

> > The good news is that you're right, and I'm now seeing ~1% difference between
> > the runs with ~0.3% noise for either of them. I still think that's significant,
> > but it's a lot more reassuring than 4%.
>
> hm, so for such marginal effects I think we could improve the testing method a
> bit: we could improve 'perf bench sched messaging' to allow 'steady state
> testing': to not exit+restart all the processes between test iterations, but to
> continuously measure and print out current performance figures.
>
> I.e. every 10 seconds it could print a decaying running average of current
> throughput.
>
> That way you could patch/unpatch the instructions without having to restart the
> tasks. If you still see an effect (in the numbers reported every 10 seconds), then
> that's a guaranteed result.
>
> [ We have such functionality in 'perf bench numa' (the --show-convergence option),
> for similar reasons, to allow runtime monitoring and tweaking of kernel
> parameters. ]

That sounds handy.
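
If anyone wants to prototype it, the decaying average itself is only a
couple of lines. A sketch (not perf code; the names and the 0.9 decay
factor are invented):

#include <stdio.h>

/*
 * Sketch of the steady-state reporter: call once per 10-second
 * interval with the number of operations completed in that interval.
 */
static double decaying_avg;

static void report_interval(unsigned long ops, double secs)
{
	double sample = ops / secs;

	/*
	 * Exponentially decaying running average: each new interval
	 * contributes 10%, older intervals fade out geometrically.
	 */
	if (decaying_avg == 0.0)
		decaying_avg = sample;	/* first interval seeds the average */
	else
		decaying_avg = 0.9 * decaying_avg + 0.1 * sample;

	printf("throughput %.0f ops/s (decaying avg %.0f ops/s)\n",
	       sample, decaying_avg);
}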

Will

--->8

diff --git a/arch/arm64/include/asm/cpufeature.h b/arch/arm64/include/asm/cpufeature.h
index 8f271b83f910..1a1e353983be 100644
--- a/arch/arm64/include/asm/cpufeature.h
+++ b/arch/arm64/include/asm/cpufeature.h
@@ -30,8 +30,8 @@
 #define ARM64_HAS_LSE_ATOMICS			5
 #define ARM64_WORKAROUND_CAVIUM_23154		6
 #define ARM64_WORKAROUND_834220		7
-
-#define ARM64_NCAPS				8
+#define ARM64_RCU_USES_ACQUIRE			8
+#define ARM64_NCAPS				9
 
 #ifndef __ASSEMBLY__

diff --git a/arch/arm64/kernel/alternative.c b/arch/arm64/kernel/alternative.c
index d2ee1b21a10d..edbf188a8541 100644
--- a/arch/arm64/kernel/alternative.c
+++ b/arch/arm64/kernel/alternative.c
@@ -143,6 +143,66 @@ static int __apply_alternatives_multi_stop(void *unused)
 	return 0;
 }
 
+static void __apply_alternatives_rcu(void *alt_region)
+{
+	struct alt_instr *alt;
+	struct alt_region *region = alt_region;
+	u32 *origptr, *replptr;
+
+	for (alt = region->begin; alt < region->end; alt++) {
+		u32 insn, orig_insn;
+		int nr_inst;
+
+		if (alt->cpufeature != ARM64_RCU_USES_ACQUIRE)
+			continue;
+
+		BUG_ON(alt->alt_len != alt->orig_len);
+
+		origptr = ALT_ORIG_PTR(alt);
+		replptr = ALT_REPL_PTR(alt);
+		nr_inst = alt->alt_len / sizeof(insn);
+
+		BUG_ON(nr_inst != 1);
+
+		insn = le32_to_cpu(*replptr);
+		orig_insn = le32_to_cpu(*origptr);
+		*(origptr) = cpu_to_le32(insn);
+		*replptr = cpu_to_le32(orig_insn);
+
+		flush_icache_range((uintptr_t)origptr, (uintptr_t)(origptr + 1));
+		pr_info_ratelimited("%p: 0x%x => 0x%x\n", origptr, orig_insn, insn);
+	}
+}
+
+static int will_patched;
+
+static int __apply_alternatives_rcu_multi_stop(void *unused)
+{
+	struct alt_region region = {
+		.begin	= __alt_instructions,
+		.end	= __alt_instructions_end,
+	};
+
+	/* We always have a CPU 0 at this point (__init) */
+	if (smp_processor_id()) {
+		while (!READ_ONCE(will_patched))
+			cpu_relax();
+		isb();
+	} else {
+		__apply_alternatives_rcu(&region);
+		/* Barriers provided by the cache flushing */
+		WRITE_ONCE(will_patched, 1);
+	}
+
+	return 0;
+}
+
+void apply_alternatives_rcu(void)
+{
+	will_patched = 0;
+	stop_machine(__apply_alternatives_rcu_multi_stop, NULL, cpu_online_mask);
+}
+
 void __init apply_alternatives_all(void)
 {
 	/* better not try code patching on a live SMP system */
diff --git a/arch/arm64/kernel/vmlinux.lds.S b/arch/arm64/kernel/vmlinux.lds.S
index e3928f578891..98c132c3b018 100644
--- a/arch/arm64/kernel/vmlinux.lds.S
+++ b/arch/arm64/kernel/vmlinux.lds.S
@@ -141,7 +141,10 @@ SECTIONS
 
 	PERCPU_SECTION(L1_CACHE_BYTES)
 
-	. = ALIGN(4);
+	__init_end = .;
+	. = ALIGN(PAGE_SIZE);
+
+
 	.altinstructions : {
 		__alt_instructions = .;
 		*(.altinstructions)
@@ -151,8 +154,7 @@ SECTIONS
 		*(.altinstr_replacement)
 	}
 
-	. = ALIGN(PAGE_SIZE);
-	__init_end = .;
+	. = ALIGN(4);
 
 	_data = .;
 	_sdata = .;
diff --git a/drivers/tty/sysrq.c b/drivers/tty/sysrq.c
index e5139402e7f8..3eb7193fcf88 100644
--- a/drivers/tty/sysrq.c
+++ b/drivers/tty/sysrq.c
@@ -80,6 +80,7 @@ static int __init sysrq_always_enabled_setup(char *str)
 
 __setup("sysrq_always_enabled", sysrq_always_enabled_setup);
 
+extern void apply_alternatives_rcu(void);
 
 static void sysrq_handle_loglevel(int key)
 {
@@ -89,6 +90,7 @@ static void sysrq_handle_loglevel(int key)
 	console_loglevel = CONSOLE_LOGLEVEL_DEFAULT;
 	pr_info("Loglevel set to %d\n", i);
 	console_loglevel = i;
+	apply_alternatives_rcu();
 }
 static struct sysrq_key_op sysrq_loglevel_op = {
 	.handler	= sysrq_handle_loglevel,
diff --git a/include/linux/compiler.h b/include/linux/compiler.h
index 00b042c49ccd..75e29d61fedd 100644
--- a/include/linux/compiler.h
+++ b/include/linux/compiler.h
@@ -218,6 +218,61 @@ void __read_once_size(const volatile void *p, void *res, int size)
 	__READ_ONCE_SIZE;
 }
 
+extern void panic(const char *fmt, ...);
+
+#define __stringify_1_will(x...)	#x
+#define __stringify_will(x...)		__stringify_1_will(x)
+
+#define ALTINSTR_ENTRY_WILL(feature)					\
+	" .word 661b - .\n"				/* label */	\
+	" .word 663f - .\n"				/* new instruction */ \
+	" .hword " __stringify_will(feature) "\n"	/* feature bit */ \
+	" .byte 662b-661b\n"				/* source len */ \
+	" .byte 664f-663f\n"				/* replacement len */
+
+#define __ALTERNATIVE_CFG_WILL(oldinstr, newinstr, feature, cfg_enabled) \
+	".if "__stringify_will(cfg_enabled)" == 1\n"			\
+	"661:\n\t"							\
+	oldinstr "\n"							\
+	"662:\n"							\
+	".pushsection .altinstructions,\"a\"\n"				\
+	ALTINSTR_ENTRY_WILL(feature)					\
+	".popsection\n"							\
+	".pushsection .altinstr_replacement, \"a\"\n"			\
+	"663:\n\t"							\
+	newinstr "\n"							\
+	"664:\n\t"							\
+	".popsection\n\t"						\
+	".org . - (664b-663b) + (662b-661b)\n\t"			\
+	".org . - (662b-661b) + (664b-663b)\n"				\
+	".endif\n"
+
+#define _ALTERNATIVE_CFG_WILL(oldinstr, newinstr, feature, cfg, ...)	\
+	__ALTERNATIVE_CFG_WILL(oldinstr, newinstr, feature, 1)
+
+#define ALTERNATIVE_WILL(oldinstr, newinstr, ...)			\
+	_ALTERNATIVE_CFG_WILL(oldinstr, newinstr, __VA_ARGS__, 1)
+
+#define __READ_ONCE_SIZE_WILL						\
+({									\
+	__u64 tmp;							\
+									\
+	switch (size) {							\
+	case 8: asm volatile(						\
+		ALTERNATIVE_WILL("ldr %0, %1", "ldar %0, %1", 8)	\
+		: "=r" (tmp) : "Q" (*(volatile __u64 *)p));		\
+		*(__u64 *)res = tmp; break;				\
+	default:							\
+		panic("that's no pointer, yo");				\
+	}								\
+})
+
+static __always_inline
+void __read_once_size_will(const volatile void *p, void *res, int size)
+{
+	__READ_ONCE_SIZE_WILL;
+}
+
 #ifdef CONFIG_KASAN
 /*
  * This function is not 'inline' because __no_sanitize_address confilcts
@@ -537,10 +592,17 @@ static __always_inline void __write_once_size(volatile void *p, void *res, int s
  * object's lifetime is managed by something other than RCU. That
  * "something other" might be reference counting or simple immortality.
  */
+
+#define READ_ONCE_WILL(x)						\
+({									\
+	union { typeof(x) __val; char __c[1]; } __u;			\
+	__read_once_size_will(&(x), __u.__c, sizeof(x));		\
+	__u.__val;							\
+})
+
 #define lockless_dereference(p) \
 ({ \
-	typeof(p) _________p1 = READ_ONCE(p); \
-	smp_read_barrier_depends(); /* Dependency order vs. p above. */ \
+	typeof(p) _________p1 = READ_ONCE_WILL(p); \
 	(_________p1); \
 })
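
For completeness: at each READ_ONCE_WILL site the net effect is a
runtime toggle between the two instructions named in the alternative,
roughly (hand-written sketch, not disassembly from a real build):

	ldr	x0, [x1]	// unpatched: plain 64-bit load
	ldar	x0, [x1]	// patched: load-acquire

which is what sidesteps the branch-predictor concern above: the
instruction is rewritten in place, so there is no conditional left to
speculate on.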