Re: [PATCH v3 2/2] riscv: Disable misaligned access probe when CONFIG_HAVE_EFFICIENT_UNALIGNED_ACCESS

From: Charlie Jenkins
Date: Fri Feb 16 2024 - 14:24:32 EST


On Wed, Feb 14, 2024 at 09:24:16AM -0500, Evan Green wrote:
> On Fri, Feb 2, 2024 at 6:54 PM Charlie Jenkins <charlie@xxxxxxxxxxxx> wrote:
> >
> > When CONFIG_HAVE_EFFICIENT_UNALIGNED_ACCESS is selected, the cpus can be
> > set to have fast misaligned access without needing to probe.
> >
> > To avoid some ifdefs, move the misaligned access probing code into its
> > own file and make CONFIG_RISCV_MISALIGNED depend on
> > !CONFIG_HAVE_EFFICIENT_UNALIGNED_ACCESS.
> >
> > Signed-off-by: Charlie Jenkins <charlie@xxxxxxxxxxxx>
>
> One nit below, but feel free to add my tag on the next spin:
>
> Reviewed-by: Evan Green <evan@xxxxxxxxxxxx>

Thanks! I will apply your suggestions. I am also going to add support
for selecting the other alignment speeds, so I will leave off the
Reviewed-by tag for now in case you have further suggestions for that.

- Charlie

>
> > ---
> > arch/riscv/Kconfig | 1 +
> > arch/riscv/include/asm/cpufeature.h | 7 +
> > arch/riscv/kernel/Makefile | 3 +
> > arch/riscv/kernel/cpufeature.c | 255 --------------------------
> > arch/riscv/kernel/misaligned_access_speed.c | 265 ++++++++++++++++++++++++++++
> > arch/riscv/kernel/sys_hwprobe.c | 4 +
> > 6 files changed, 280 insertions(+), 255 deletions(-)
> >
> > diff --git a/arch/riscv/Kconfig b/arch/riscv/Kconfig
> > index bffbd869a068..3223d2d08f74 100644
> > --- a/arch/riscv/Kconfig
> > +++ b/arch/riscv/Kconfig
> > @@ -690,6 +690,7 @@ config THREAD_SIZE_ORDER
> > config RISCV_MISALIGNED
> > bool "Support misaligned load/store traps for kernel and userspace"
> > select SYSCTL_ARCH_UNALIGN_ALLOW
> > + depends on !HAVE_EFFICIENT_UNALIGNED_ACCESS
> > default y
> > help
> > Say Y here if you want the kernel to embed support for misaligned
> > diff --git a/arch/riscv/include/asm/cpufeature.h b/arch/riscv/include/asm/cpufeature.h
> > index eb3ac304fc42..44734e5169b1 100644
> > --- a/arch/riscv/include/asm/cpufeature.h
> > +++ b/arch/riscv/include/asm/cpufeature.h
> > @@ -51,6 +51,12 @@ static inline bool check_unaligned_access_emulated(int cpu)
> > static inline void unaligned_emulation_finish(void) {}
> > #endif
> >
> > +#ifdef CONFIG_HAVE_EFFICIENT_UNALIGNED_ACCESS
> > +static __always_inline bool has_fast_misaligned_accesses(void)
> > +{
> > + return true;
> > +}
> > +#else
> > DECLARE_PER_CPU(long, misaligned_access_speed);
> >
> > DECLARE_STATIC_KEY_FALSE(fast_misaligned_access_speed_key);
> > @@ -59,6 +65,7 @@ static __always_inline bool has_fast_misaligned_accesses(void)
> > {
> > return static_branch_likely(&fast_misaligned_access_speed_key);
> > }
> > +#endif
> >
> > unsigned long riscv_get_elf_hwcap(void);
> >
> > diff --git a/arch/riscv/kernel/Makefile b/arch/riscv/kernel/Makefile
> > index f71910718053..ffba5ecf12c2 100644
> > --- a/arch/riscv/kernel/Makefile
> > +++ b/arch/riscv/kernel/Makefile
> > @@ -62,6 +62,9 @@ obj-y += tests/
> > obj-$(CONFIG_MMU) += vdso.o vdso/
> >
> > obj-$(CONFIG_RISCV_MISALIGNED) += traps_misaligned.o
> > +ifneq ($(CONFIG_HAVE_EFFICIENT_UNALIGNED_ACCESS), y)
> > +obj-y += misaligned_access_speed.o
>
> copy-unaligned.o can also go in here (those are the assembly copy
> routines used only by the probing code).
>
> > +endif
> > obj-$(CONFIG_FPU) += fpu.o
> > obj-$(CONFIG_RISCV_ISA_V) += vector.o
> > obj-$(CONFIG_RISCV_ISA_V) += kernel_mode_vector.o
> > diff --git a/arch/riscv/kernel/cpufeature.c b/arch/riscv/kernel/cpufeature.c
> > index 89920f84d0a3..319670af5704 100644
> > --- a/arch/riscv/kernel/cpufeature.c
> > +++ b/arch/riscv/kernel/cpufeature.c
> > @@ -11,7 +11,6 @@
> > #include <linux/cpu.h>
> > #include <linux/cpuhotplug.h>
> > #include <linux/ctype.h>
> > -#include <linux/jump_label.h>
> > #include <linux/log2.h>
> > #include <linux/memory.h>
> > #include <linux/module.h>
> > @@ -21,20 +20,12 @@
> > #include <asm/cacheflush.h>
> > #include <asm/cpufeature.h>
> > #include <asm/hwcap.h>
> > -#include <asm/hwprobe.h>
> > #include <asm/patch.h>
> > #include <asm/processor.h>
> > #include <asm/vector.h>
> >
> > -#include "copy-unaligned.h"
> > -
> > #define NUM_ALPHA_EXTS ('z' - 'a' + 1)
> >
> > -#define MISALIGNED_ACCESS_JIFFIES_LG2 1
> > -#define MISALIGNED_BUFFER_SIZE 0x4000
> > -#define MISALIGNED_BUFFER_ORDER get_order(MISALIGNED_BUFFER_SIZE)
> > -#define MISALIGNED_COPY_SIZE ((MISALIGNED_BUFFER_SIZE / 2) - 0x80)
> > -
> > unsigned long elf_hwcap __read_mostly;
> >
> > /* Host ISA bitmap */
> > @@ -43,11 +34,6 @@ static DECLARE_BITMAP(riscv_isa, RISCV_ISA_EXT_MAX) __read_mostly;
> > /* Per-cpu ISA extensions. */
> > struct riscv_isainfo hart_isa[NR_CPUS];
> >
> > -/* Performance information */
> > -DEFINE_PER_CPU(long, misaligned_access_speed);
> > -
> > -static cpumask_t fast_misaligned_access;
> > -
> > /**
> > * riscv_isa_extension_base() - Get base extension word
> > *
> > @@ -706,247 +692,6 @@ unsigned long riscv_get_elf_hwcap(void)
> > return hwcap;
> > }
> >
> > -static int check_unaligned_access(void *param)
> > -{
> > - int cpu = smp_processor_id();
> > - u64 start_cycles, end_cycles;
> > - u64 word_cycles;
> > - u64 byte_cycles;
> > - int ratio;
> > - unsigned long start_jiffies, now;
> > - struct page *page = param;
> > - void *dst;
> > - void *src;
> > - long speed = RISCV_HWPROBE_MISALIGNED_SLOW;
> > -
> > - if (check_unaligned_access_emulated(cpu))
> > - return 0;
> > -
> > - /* Make an unaligned destination buffer. */
> > - dst = (void *)((unsigned long)page_address(page) | 0x1);
> > - /* Unalign src as well, but differently (off by 1 + 2 = 3). */
> > - src = dst + (MISALIGNED_BUFFER_SIZE / 2);
> > - src += 2;
> > - word_cycles = -1ULL;
> > - /* Do a warmup. */
> > - __riscv_copy_words_unaligned(dst, src, MISALIGNED_COPY_SIZE);
> > - preempt_disable();
> > - start_jiffies = jiffies;
> > - while ((now = jiffies) == start_jiffies)
> > - cpu_relax();
> > -
> > - /*
> > - * For a fixed amount of time, repeatedly try the function, and take
> > - * the best time in cycles as the measurement.
> > - */
> > - while (time_before(jiffies, now + (1 << MISALIGNED_ACCESS_JIFFIES_LG2))) {
> > - start_cycles = get_cycles64();
> > - /* Ensure the CSR read can't reorder WRT to the copy. */
> > - mb();
> > - __riscv_copy_words_unaligned(dst, src, MISALIGNED_COPY_SIZE);
> > - /* Ensure the copy ends before the end time is snapped. */
> > - mb();
> > - end_cycles = get_cycles64();
> > - if ((end_cycles - start_cycles) < word_cycles)
> > - word_cycles = end_cycles - start_cycles;
> > - }
> > -
> > - byte_cycles = -1ULL;
> > - __riscv_copy_bytes_unaligned(dst, src, MISALIGNED_COPY_SIZE);
> > - start_jiffies = jiffies;
> > - while ((now = jiffies) == start_jiffies)
> > - cpu_relax();
> > -
> > - while (time_before(jiffies, now + (1 << MISALIGNED_ACCESS_JIFFIES_LG2))) {
> > - start_cycles = get_cycles64();
> > - mb();
> > - __riscv_copy_bytes_unaligned(dst, src, MISALIGNED_COPY_SIZE);
> > - mb();
> > - end_cycles = get_cycles64();
> > - if ((end_cycles - start_cycles) < byte_cycles)
> > - byte_cycles = end_cycles - start_cycles;
> > - }
> > -
> > - preempt_enable();
> > -
> > - /* Don't divide by zero. */
> > - if (!word_cycles || !byte_cycles) {
> > - pr_warn("cpu%d: rdtime lacks granularity needed to measure unaligned access speed\n",
> > - cpu);
> > -
> > - return 0;
> > - }
> > -
> > - if (word_cycles < byte_cycles)
> > - speed = RISCV_HWPROBE_MISALIGNED_FAST;
> > -
> > - ratio = div_u64((byte_cycles * 100), word_cycles);
> > - pr_info("cpu%d: Ratio of byte access time to unaligned word access is %d.%02d, unaligned accesses are %s\n",
> > - cpu,
> > - ratio / 100,
> > - ratio % 100,
> > - (speed == RISCV_HWPROBE_MISALIGNED_FAST) ? "fast" : "slow");
> > -
> > - per_cpu(misaligned_access_speed, cpu) = speed;
> > -
> > - /*
> > - * Set the value of fast_misaligned_access of a CPU. These operations
> > - * are atomic to avoid race conditions.
> > - */
> > - if (speed == RISCV_HWPROBE_MISALIGNED_FAST)
> > - cpumask_set_cpu(cpu, &fast_misaligned_access);
> > - else
> > - cpumask_clear_cpu(cpu, &fast_misaligned_access);
> > -
> > - return 0;
> > -}
> > -
> > -static void check_unaligned_access_nonboot_cpu(void *param)
> > -{
> > - unsigned int cpu = smp_processor_id();
> > - struct page **pages = param;
> > -
> > - if (smp_processor_id() != 0)
> > - check_unaligned_access(pages[cpu]);
> > -}
> > -
> > -DEFINE_STATIC_KEY_FALSE(fast_misaligned_access_speed_key);
> > -
> > -static void modify_unaligned_access_branches(cpumask_t *mask, int weight)
> > -{
> > - if (cpumask_weight(mask) == weight)
> > - static_branch_enable_cpuslocked(&fast_misaligned_access_speed_key);
> > - else
> > - static_branch_disable_cpuslocked(&fast_misaligned_access_speed_key);
> > -}
> > -
> > -static void set_unaligned_access_static_branches_except_cpu(int cpu)
> > -{
> > - /*
> > - * Same as set_unaligned_access_static_branches, except excludes the
> > - * given CPU from the result. When a CPU is hotplugged into an offline
> > - * state, this function is called before the CPU is set to offline in
> > - * the cpumask, and thus the CPU needs to be explicitly excluded.
> > - */
> > -
> > - cpumask_t fast_except_me;
> > -
> > - cpumask_and(&fast_except_me, &fast_misaligned_access, cpu_online_mask);
> > - cpumask_clear_cpu(cpu, &fast_except_me);
> > -
> > - modify_unaligned_access_branches(&fast_except_me, num_online_cpus() - 1);
> > -}
> > -
> > -static void set_unaligned_access_static_branches(void)
> > -{
> > - /*
> > - * This will be called after check_unaligned_access_all_cpus so the
> > - * result of unaligned access speed for all CPUs will be available.
> > - *
> > - * To avoid the number of online cpus changing between reading
> > - * cpu_online_mask and calling num_online_cpus, cpus_read_lock must be
> > - * held before calling this function.
> > - */
> > -
> > - cpumask_t fast_and_online;
> > -
> > - cpumask_and(&fast_and_online, &fast_misaligned_access, cpu_online_mask);
> > -
> > - modify_unaligned_access_branches(&fast_and_online, num_online_cpus());
> > -}
> > -
> > -static int lock_and_set_unaligned_access_static_branch(void)
> > -{
> > - cpus_read_lock();
> > - set_unaligned_access_static_branches();
> > - cpus_read_unlock();
> > -
> > - return 0;
> > -}
> > -
> > -arch_initcall_sync(lock_and_set_unaligned_access_static_branch);
> > -
> > -static int riscv_online_cpu(unsigned int cpu)
> > -{
> > - static struct page *buf;
> > -
> > - /* We are already set since the last check */
> > - if (per_cpu(misaligned_access_speed, cpu) != RISCV_HWPROBE_MISALIGNED_UNKNOWN)
> > - goto exit;
> > -
> > - buf = alloc_pages(GFP_KERNEL, MISALIGNED_BUFFER_ORDER);
> > - if (!buf) {
> > - pr_warn("Allocation failure, not measuring misaligned performance\n");
> > - return -ENOMEM;
> > - }
> > -
> > - check_unaligned_access(buf);
> > - __free_pages(buf, MISALIGNED_BUFFER_ORDER);
> > -
> > -exit:
> > - set_unaligned_access_static_branches();
> > -
> > - return 0;
> > -}
> > -
> > -static int riscv_offline_cpu(unsigned int cpu)
> > -{
> > - set_unaligned_access_static_branches_except_cpu(cpu);
> > -
> > - return 0;
> > -}
> > -
> > -/* Measure unaligned access on all CPUs present at boot in parallel. */
> > -static int check_unaligned_access_all_cpus(void)
> > -{
> > - unsigned int cpu;
> > - unsigned int cpu_count = num_possible_cpus();
> > - struct page **bufs = kzalloc(cpu_count * sizeof(struct page *),
> > - GFP_KERNEL);
> > -
> > - if (!bufs) {
> > - pr_warn("Allocation failure, not measuring misaligned performance\n");
> > - return 0;
> > - }
> > -
> > - /*
> > - * Allocate separate buffers for each CPU so there's no fighting over
> > - * cache lines.
> > - */
> > - for_each_cpu(cpu, cpu_online_mask) {
> > - bufs[cpu] = alloc_pages(GFP_KERNEL, MISALIGNED_BUFFER_ORDER);
> > - if (!bufs[cpu]) {
> > - pr_warn("Allocation failure, not measuring misaligned performance\n");
> > - goto out;
> > - }
> > - }
> > -
> > - /* Check everybody except 0, who stays behind to tend jiffies. */
> > - on_each_cpu(check_unaligned_access_nonboot_cpu, bufs, 1);
> > -
> > - /* Check core 0. */
> > - smp_call_on_cpu(0, check_unaligned_access, bufs[0], true);
> > -
> > - /*
> > - * Setup hotplug callbacks for any new CPUs that come online or go
> > - * offline.
> > - */
> > - cpuhp_setup_state_nocalls(CPUHP_AP_ONLINE_DYN, "riscv:online",
> > - riscv_online_cpu, riscv_offline_cpu);
> > -
> > -out:
> > - unaligned_emulation_finish();
> > - for_each_cpu(cpu, cpu_online_mask) {
> > - if (bufs[cpu])
> > - __free_pages(bufs[cpu], MISALIGNED_BUFFER_ORDER);
> > - }
> > -
> > - kfree(bufs);
> > - return 0;
> > -}
> > -
> > -arch_initcall(check_unaligned_access_all_cpus);
> > -
> > void riscv_user_isa_enable(void)
> > {
> > if (riscv_cpu_has_extension_unlikely(smp_processor_id(), RISCV_ISA_EXT_ZICBOZ))
> > diff --git a/arch/riscv/kernel/misaligned_access_speed.c b/arch/riscv/kernel/misaligned_access_speed.c
> > new file mode 100644
> > index 000000000000..b725c07dd1af
> > --- /dev/null
> > +++ b/arch/riscv/kernel/misaligned_access_speed.c
> > @@ -0,0 +1,265 @@
> > +// SPDX-License-Identifier: GPL-2.0-only
> > +/*
> > + * Copyright 2024 Rivos Inc.
> > + */
> > +
> > +#include <linux/cpu.h>
> > +#include <linux/cpumask.h>
> > +#include <linux/jump_label.h>
> > +#include <linux/mm.h>
> > +#include <linux/smp.h>
> > +#include <linux/types.h>
> > +#include <asm/cpufeature.h>
> > +#include <asm/hwprobe.h>
> > +
> > +#include "copy-unaligned.h"
> > +
> > +#define MISALIGNED_ACCESS_JIFFIES_LG2 1
> > +#define MISALIGNED_BUFFER_SIZE 0x4000
> > +#define MISALIGNED_BUFFER_ORDER get_order(MISALIGNED_BUFFER_SIZE)
> > +#define MISALIGNED_COPY_SIZE ((MISALIGNED_BUFFER_SIZE / 2) - 0x80)
> > +
> > +DEFINE_PER_CPU(long, misaligned_access_speed);
> > +
> > +static cpumask_t fast_misaligned_access;
> > +
> > +static int check_unaligned_access(void *param)
> > +{
> > + int cpu = smp_processor_id();
> > + u64 start_cycles, end_cycles;
> > + u64 word_cycles;
> > + u64 byte_cycles;
> > + int ratio;
> > + unsigned long start_jiffies, now;
> > + struct page *page = param;
> > + void *dst;
> > + void *src;
> > + long speed = RISCV_HWPROBE_MISALIGNED_SLOW;
> > +
> > + if (check_unaligned_access_emulated(cpu))
> > + return 0;
> > +
> > + /* Make an unaligned destination buffer. */
> > + dst = (void *)((unsigned long)page_address(page) | 0x1);
> > + /* Unalign src as well, but differently (off by 1 + 2 = 3). */
> > + src = dst + (MISALIGNED_BUFFER_SIZE / 2);
> > + src += 2;
> > + word_cycles = -1ULL;
> > + /* Do a warmup. */
> > + __riscv_copy_words_unaligned(dst, src, MISALIGNED_COPY_SIZE);
> > + preempt_disable();
> > + start_jiffies = jiffies;
> > + while ((now = jiffies) == start_jiffies)
> > + cpu_relax();
> > +
> > + /*
> > + * For a fixed amount of time, repeatedly try the function, and take
> > + * the best time in cycles as the measurement.
> > + */
> > + while (time_before(jiffies, now + (1 << MISALIGNED_ACCESS_JIFFIES_LG2))) {
> > + start_cycles = get_cycles64();
> > + /* Ensure the CSR read can't reorder WRT to the copy. */
> > + mb();
> > + __riscv_copy_words_unaligned(dst, src, MISALIGNED_COPY_SIZE);
> > + /* Ensure the copy ends before the end time is snapped. */
> > + mb();
> > + end_cycles = get_cycles64();
> > + if ((end_cycles - start_cycles) < word_cycles)
> > + word_cycles = end_cycles - start_cycles;
> > + }
> > +
> > + byte_cycles = -1ULL;
> > + __riscv_copy_bytes_unaligned(dst, src, MISALIGNED_COPY_SIZE);
> > + start_jiffies = jiffies;
> > + while ((now = jiffies) == start_jiffies)
> > + cpu_relax();
> > +
> > + while (time_before(jiffies, now + (1 << MISALIGNED_ACCESS_JIFFIES_LG2))) {
> > + start_cycles = get_cycles64();
> > + mb();
> > + __riscv_copy_bytes_unaligned(dst, src, MISALIGNED_COPY_SIZE);
> > + mb();
> > + end_cycles = get_cycles64();
> > + if ((end_cycles - start_cycles) < byte_cycles)
> > + byte_cycles = end_cycles - start_cycles;
> > + }
> > +
> > + preempt_enable();
> > +
> > + /* Don't divide by zero. */
> > + if (!word_cycles || !byte_cycles) {
> > + pr_warn("cpu%d: rdtime lacks granularity needed to measure unaligned access speed\n",
> > + cpu);
> > +
> > + return 0;
> > + }
> > +
> > + if (word_cycles < byte_cycles)
> > + speed = RISCV_HWPROBE_MISALIGNED_FAST;
> > +
> > + ratio = div_u64((byte_cycles * 100), word_cycles);
> > + pr_info("cpu%d: Ratio of byte access time to unaligned word access is %d.%02d, unaligned accesses are %s\n",
> > + cpu,
> > + ratio / 100,
> > + ratio % 100,
> > + (speed == RISCV_HWPROBE_MISALIGNED_FAST) ? "fast" : "slow");
> > +
> > + per_cpu(misaligned_access_speed, cpu) = speed;
> > +
> > + /*
> > + * Set the value of fast_misaligned_access of a CPU. These operations
> > + * are atomic to avoid race conditions.
> > + */
> > + if (speed == RISCV_HWPROBE_MISALIGNED_FAST)
> > + cpumask_set_cpu(cpu, &fast_misaligned_access);
> > + else
> > + cpumask_clear_cpu(cpu, &fast_misaligned_access);
> > +
> > + return 0;
> > +}
> > +
> > +static void check_unaligned_access_nonboot_cpu(void *param)
> > +{
> > + unsigned int cpu = smp_processor_id();
> > + struct page **pages = param;
> > +
> > + if (smp_processor_id() != 0)
> > + check_unaligned_access(pages[cpu]);
> > +}
> > +
> > +DEFINE_STATIC_KEY_FALSE(fast_misaligned_access_speed_key);
> > +
> > +static void modify_unaligned_access_branches(cpumask_t *mask, int weight)
> > +{
> > + if (cpumask_weight(mask) == weight)
> > + static_branch_enable_cpuslocked(&fast_misaligned_access_speed_key);
> > + else
> > + static_branch_disable_cpuslocked(&fast_misaligned_access_speed_key);
> > +}
> > +
> > +static void set_unaligned_access_static_branches_except_cpu(int cpu)
> > +{
> > + /*
> > + * Same as set_unaligned_access_static_branches, except excludes the
> > + * given CPU from the result. When a CPU is hotplugged into an offline
> > + * state, this function is called before the CPU is set to offline in
> > + * the cpumask, and thus the CPU needs to be explicitly excluded.
> > + */
> > +
> > + cpumask_t fast_except_me;
> > +
> > + cpumask_and(&fast_except_me, &fast_misaligned_access, cpu_online_mask);
> > + cpumask_clear_cpu(cpu, &fast_except_me);
> > +
> > + modify_unaligned_access_branches(&fast_except_me, num_online_cpus() - 1);
> > +}
> > +
> > +static void set_unaligned_access_static_branches(void)
> > +{
> > + /*
> > + * This will be called after check_unaligned_access_all_cpus so the
> > + * result of unaligned access speed for all CPUs will be available.
> > + *
> > + * To avoid the number of online cpus changing between reading
> > + * cpu_online_mask and calling num_online_cpus, cpus_read_lock must be
> > + * held before calling this function.
> > + */
> > +
> > + cpumask_t fast_and_online;
> > +
> > + cpumask_and(&fast_and_online, &fast_misaligned_access, cpu_online_mask);
> > +
> > + modify_unaligned_access_branches(&fast_and_online, num_online_cpus());
> > +}
> > +
> > +static int lock_and_set_unaligned_access_static_branch(void)
> > +{
> > + cpus_read_lock();
> > + set_unaligned_access_static_branches();
> > + cpus_read_unlock();
> > +
> > + return 0;
> > +}
> > +
> > +arch_initcall_sync(lock_and_set_unaligned_access_static_branch);
> > +
> > +static int riscv_online_cpu(unsigned int cpu)
> > +{
> > + static struct page *buf;
> > +
> > + /* We are already set since the last check */
> > + if (per_cpu(misaligned_access_speed, cpu) != RISCV_HWPROBE_MISALIGNED_UNKNOWN)
> > + goto exit;
> > +
> > + buf = alloc_pages(GFP_KERNEL, MISALIGNED_BUFFER_ORDER);
> > + if (!buf) {
> > + pr_warn("Allocation failure, not measuring misaligned performance\n");
> > + return -ENOMEM;
> > + }
> > +
> > + check_unaligned_access(buf);
> > + __free_pages(buf, MISALIGNED_BUFFER_ORDER);
> > +
> > +exit:
> > + set_unaligned_access_static_branches();
> > +
> > + return 0;
> > +}
> > +
> > +static int riscv_offline_cpu(unsigned int cpu)
> > +{
> > + set_unaligned_access_static_branches_except_cpu(cpu);
> > +
> > + return 0;
> > +}
> > +
> > +/* Measure unaligned access on all CPUs present at boot in parallel. */
> > +static int check_unaligned_access_all_cpus(void)
> > +{
> > + unsigned int cpu;
> > + unsigned int cpu_count = num_possible_cpus();
> > + struct page **bufs = kzalloc(cpu_count * sizeof(struct page *),
> > + GFP_KERNEL);
> > +
> > + if (!bufs) {
> > + pr_warn("Allocation failure, not measuring misaligned performance\n");
> > + return 0;
> > + }
> > +
> > + /*
> > + * Allocate separate buffers for each CPU so there's no fighting over
> > + * cache lines.
> > + */
> > + for_each_cpu(cpu, cpu_online_mask) {
> > + bufs[cpu] = alloc_pages(GFP_KERNEL, MISALIGNED_BUFFER_ORDER);
> > + if (!bufs[cpu]) {
> > + pr_warn("Allocation failure, not measuring misaligned performance\n");
> > + goto out;
> > + }
> > + }
> > +
> > + /* Check everybody except 0, who stays behind to tend jiffies. */
> > + on_each_cpu(check_unaligned_access_nonboot_cpu, bufs, 1);
> > +
> > + /* Check core 0. */
> > + smp_call_on_cpu(0, check_unaligned_access, bufs[0], true);
> > +
> > + /*
> > + * Setup hotplug callbacks for any new CPUs that come online or go
> > + * offline.
> > + */
> > + cpuhp_setup_state_nocalls(CPUHP_AP_ONLINE_DYN, "riscv:online",
> > + riscv_online_cpu, riscv_offline_cpu);
> > +
> > +out:
> > + unaligned_emulation_finish();
> > + for_each_cpu(cpu, cpu_online_mask) {
> > + if (bufs[cpu])
> > + __free_pages(bufs[cpu], MISALIGNED_BUFFER_ORDER);
> > + }
> > +
> > + kfree(bufs);
> > + return 0;
> > +}
> > +
> > +arch_initcall(check_unaligned_access_all_cpus);
> > diff --git a/arch/riscv/kernel/sys_hwprobe.c b/arch/riscv/kernel/sys_hwprobe.c
> > index a7c56b41efd2..d9bd24776a3e 100644
> > --- a/arch/riscv/kernel/sys_hwprobe.c
> > +++ b/arch/riscv/kernel/sys_hwprobe.c
> > @@ -149,6 +149,9 @@ static bool hwprobe_ext0_has(const struct cpumask *cpus, unsigned long ext)
> >
> > static u64 hwprobe_misaligned(const struct cpumask *cpus)
> > {
> > +#ifdef CONFIG_HAVE_EFFICIENT_UNALIGNED_ACCESS
> > + return RISCV_HWPROBE_MISALIGNED_FAST;
> > +#else
>
> Ok, two nits. This is a little clunky with the ifdef inside the
> function body. The pattern I see more often is a separate function
> definition in each branch of the ifdef/else. That would work nicely
> here.
>
> > int cpu;
> > u64 perf = -1ULL;
> >
> > @@ -168,6 +171,7 @@ static u64 hwprobe_misaligned(const struct cpumask *cpus)
> > return RISCV_HWPROBE_MISALIGNED_UNKNOWN;
> >
> > return perf;
> > +#endif
> > }
> >
> > static void hwprobe_one_pair(struct riscv_hwprobe *pair,
> >
> > --
> > 2.43.0
> >