[RFC PATCH 1/2] smp: Implement serialized smp_call_function APIs

From: Mathieu Desnoyers
Date: Wed Mar 13 2024 - 16:56:44 EST


Introduce serialized smp_call_function APIs that limit the number of
concurrent smp_call_function IPIs targeting any given CPU to at most
two: one broadcast IPI and one IPI specifically aimed at that CPU.

- smp_call_function_single_serialize holds a per-target-CPU mutex
  while sending the IPI,
- smp_call_function_many_serialize, on_each_cpu_cond_mask_serialize and
  on_each_cpu_mask_serialize call smp_call_function_single_serialize
  when there is only one CPU in the mask; otherwise they grab the
  serialize_ipi_broadcast_lock mutex before using the broadcast IPIs
  (see the illustrative caller sketch below).
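
For illustration, here is a minimal sketch of a kernel-side caller using
the serialized single-CPU variant (bump_remote_counter() and poke_cpu()
are made-up names, not part of this patch). The call has the same
semantics as smp_call_function_single(), but may sleep on the
per-target-CPU mutex, so it must not be used from atomic context:

#include <linux/atomic.h>
#include <linux/smp.h>

/* Illustrative IPI payload: runs on the target CPU. */
static void bump_remote_counter(void *info)
{
	atomic_t *counter = info;

	atomic_inc(counter);
}

/*
 * Illustrative caller: at most one such targeted IPI is in flight per
 * destination CPU, in addition to at most one broadcast IPI.
 */
static int poke_cpu(int cpu, atomic_t *counter)
{
	return smp_call_function_single_serialize(cpu, bump_remote_counter,
						  counter, 1);
}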

Motivations for this new API:

- Prevent CPU-IPI-hammering from the various IPI sources (other than
  membarrier) that rely on the smp_call_function APIs to send IPIs.
- Restore the scalability of membarrier
  MEMBARRIER_CMD_PRIVATE_EXPEDITED_RSEQ targeting a specific CPU with
  MEMBARRIER_CMD_FLAG_CPU, which has been serialized behind a mutex
  since commit 944d5fe50f3f ("sched/membarrier: reduce the ability to
  hammer on sys_membarrier"). This command is used by Google's tcmalloc
  for cross-CPU operations.

My testing on an AMD EPYC 9654 shows that, without commit 944d5fe50f3f,
5 IPI source CPUs are needed to overwhelm one target CPU. Therefore,
permitting two CPUs to concurrently IPI a given target CPU should not
be enough to overwhelm it. [ This should be confirmed by testing on
other hardware. ]

* nr_src=1
Runs OK
top output: %Cpu0 : 35.0 us, 65.0 sy, 0.0 ni, 0.0 id, 0.0 wa, 0.0 hi, 0.0 si, 0.0 st

* nr_src=2
Runs OK
top output: %Cpu0 : 19.1 us, 80.9 sy, 0.0 ni, 0.0 id, 0.0 wa, 0.0 hi, 0.0 si, 0.0 st

* nr_src=3
Runs OK
top output: %Cpu0 : 4.9 us, 94.1 sy, 0.0 ni, 0.0 id, 0.0 wa, 0.0 hi, 1.0 si, 0.0 st

* nr_src=4
Runs OK, but "si" (time spent servicing software interrupts) is getting high:
top output: %Cpu0 : 0.0 us, 20.0 sy, 0.0 ni, 0.0 id, 0.0 wa, 0.0 hi, 80.0 si, 0.0 st

* nr_src=5
"si" is now 100%. watchdog: BUG: soft lockup - CPU#0 stuck for 24s! [membarrier-hamm:5464]
%Cpu0 : 0.0 us, 0.0 sy, 0.0 ni, 0.0 id, 0.0 wa, 0.0 hi,100.0 si, 0.0 st

With this reproducer:

#define _GNU_SOURCE
#include <linux/membarrier.h>	/* Definition of MEMBARRIER_* constants */
#include <sys/syscall.h>	/* Definition of SYS_* constants */
#include <unistd.h>
#include <pthread.h>
#include <sched.h>
#include <stdlib.h>
#include <stdio.h>

#define NR_DEST 1
#define NR_SRC 5

static int
membarrier(int cmd, unsigned int flags, int cpu_id)
{
	return syscall(__NR_membarrier, cmd, flags, cpu_id);
}

void *ipi_dest(void *arg)
{
	int ret;
	cpu_set_t mask;

	CPU_ZERO(&mask);
	CPU_SET(0, &mask);
	ret = sched_setaffinity(0, sizeof(mask), &mask);
	if (ret) {
		perror("sched_setaffinity");
		abort();
	}
	while (1) {
		asm volatile ("rep; nop" : : : "memory");
	}
}

void *ipi_src(void *arg)
{
	int ret;

	while (1) {
		/* Hammer CPU 0 */
		ret = membarrier(MEMBARRIER_CMD_PRIVATE_EXPEDITED_RSEQ,
				 MEMBARRIER_CMD_FLAG_CPU, 0);
		if (ret) {
			perror("membarrier");
			abort();
		}
	}
}

static
void do_membarrier_hammer(void)
{
	int ret, i;
	pthread_t dest_tid[NR_DEST];
	pthread_t src_tid[NR_SRC];

	for (i = 0; i < NR_DEST; i++) {
		ret = pthread_create(&dest_tid[i], NULL,
				     ipi_dest, NULL);
		if (ret)
			abort();
	}
	for (i = 0; i < NR_SRC; i++) {
		ret = pthread_create(&src_tid[i], NULL,
				     ipi_src, NULL);
		if (ret)
			abort();
	}

	for (i = 0; i < NR_DEST; i++) {
		ret = pthread_join(dest_tid[i], NULL);
		if (ret)
			abort();
	}
	for (i = 0; i < NR_SRC; i++) {
		ret = pthread_join(src_tid[i], NULL);
		if (ret)
			abort();
	}
}

int main()
{
	int ret;

	ret = membarrier(MEMBARRIER_CMD_REGISTER_PRIVATE_EXPEDITED_RSEQ, 0, 0);
	if (ret) {
		perror("membarrier");
		return -1;
	}
	do_membarrier_hammer();
	return 0;
}
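
[ For reference, the reproducer is a plain pthread program and can be
built with e.g. "gcc -O2 -pthread"; adjust NR_SRC to vary the number of
IPI source CPUs used in the results above. ]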

Signed-off-by: Mathieu Desnoyers <mathieu.desnoyers@xxxxxxxxxxxx>
Cc: Linus Torvalds <torvalds@xxxxxxxxxxxxxxxxxxx>
Cc: Peter Zijlstra <peterz@xxxxxxxxxxxxx>
Cc: Paul E. McKenney <paulmck@xxxxxxxxxxxxxxxxxx>
Cc: Boqun Feng <boqun.feng@xxxxxxxxx>
Cc: Andrew Hunter <ahh@xxxxxxxxxx>
Cc: Maged Michael <maged.michael@xxxxxxxxx>
Cc: gromer@xxxxxxxxxx
Cc: Avi Kivity <avi@xxxxxxxxxxxx>
Cc: Benjamin Herrenschmidt <benh@xxxxxxxxxxxxxxxxxxx>
Cc: Paul Mackerras <paulus@xxxxxxxxx>
Cc: Michael Ellerman <mpe@xxxxxxxxxxxxxx>
Cc: Peter Oskolkov <posk@xxxxxxxxxx>
---
include/linux/smp.h | 40 +++++++++++++++++
kernel/smp.c | 106 +++++++++++++++++++++++++++++++++++++++-----
2 files changed, 134 insertions(+), 12 deletions(-)

diff --git a/include/linux/smp.h b/include/linux/smp.h
index e87520dc2959..cfc7df87fceb 100644
--- a/include/linux/smp.h
+++ b/include/linux/smp.h
@@ -160,6 +160,16 @@ void smp_call_function_many(const struct cpumask *mask,
int smp_call_function_any(const struct cpumask *mask,
smp_call_func_t func, void *info, int wait);

+int smp_call_function_single_serialize(int cpuid, smp_call_func_t func,
+ void *info, int wait);
+
+void smp_call_function_many_serialize(const struct cpumask *mask,
+ smp_call_func_t func,
+ void *info, bool wait);
+
+void on_each_cpu_cond_mask_serialize(smp_cond_func_t cond_func, smp_call_func_t func,
+ void *info, bool wait, const struct cpumask *mask);
+
void kick_all_cpus_sync(void);
void wake_up_all_idle_cpus(void);

@@ -215,6 +225,28 @@ smp_call_function_any(const struct cpumask *mask, smp_call_func_t func,
return smp_call_function_single(0, func, info, wait);
}

+static inline
+int smp_call_function_single_serialize(int cpuid, smp_call_func_t func,
+ void *info, int wait)
+{
+ return smp_call_function_single(cpuid, func, info, wait);
+}
+
+static inline
+void smp_call_function_many_serialize(const struct cpumask *mask,
+ smp_call_func_t func,
+ void *info, bool wait)
+{
+ return smp_call_function_many(mask, func, info, wait);
+}
+
+static inline
+void on_each_cpu_cond_mask_serialize(smp_cond_func_t cond_func, smp_call_func_t func,
+ void *info, bool wait, const struct cpumask *mask)
+{
+ return on_each_cpu_cond_mask(cond_func, func, info, wait, mask);
+}
+
static inline void kick_all_cpus_sync(void) { }
static inline void wake_up_all_idle_cpus(void) { }

@@ -288,6 +320,14 @@ void smp_setup_processor_id(void);
int smp_call_on_cpu(unsigned int cpu, int (*func)(void *), void *par,
bool phys);

+static inline
+void on_each_cpu_mask_serialize(const struct cpumask *mask,
+ smp_call_func_t func,
+ void *info, bool wait)
+{
+ on_each_cpu_cond_mask_serialize(NULL, func, info, wait, mask);
+}
+
/* SMP core functions */
int smpcfd_prepare_cpu(unsigned int cpu);
int smpcfd_dead_cpu(unsigned int cpu);
diff --git a/kernel/smp.c b/kernel/smp.c
index f085ebcdf9e7..4c58617a2c48 100644
--- a/kernel/smp.c
+++ b/kernel/smp.c
@@ -48,6 +48,10 @@ static DEFINE_PER_CPU_SHARED_ALIGNED(struct llist_head, call_single_queue);

static DEFINE_PER_CPU(atomic_t, trigger_backtrace) = ATOMIC_INIT(1);

+static DEFINE_PER_CPU(struct mutex, serialize_ipi_lock);
+
+static DEFINE_MUTEX(serialize_ipi_broadcast_lock);
+
static void __flush_smp_call_function_queue(bool warn_cpu_offline);

int smpcfd_prepare_cpu(unsigned int cpu)
@@ -102,9 +106,10 @@ void __init call_function_init(void)
{
int i;

- for_each_possible_cpu(i)
+ for_each_possible_cpu(i) {
init_llist_head(&per_cpu(call_single_queue, i));
-
+ mutex_init(&per_cpu(serialize_ipi_lock, i));
+ }
smpcfd_prepare_cpu(smp_processor_id());
}

@@ -590,16 +595,9 @@ void flush_smp_call_function_queue(void)
local_irq_restore(flags);
}

-/*
- * smp_call_function_single - Run a function on a specific CPU
- * @func: The function to run. This must be fast and non-blocking.
- * @info: An arbitrary pointer to pass to the function.
- * @wait: If true, wait until function has completed on other CPUs.
- *
- * Returns 0 on success, else a negative status code.
- */
-int smp_call_function_single(int cpu, smp_call_func_t func, void *info,
- int wait)
+static
+int _smp_call_function_single(int cpu, smp_call_func_t func, void *info,
+ int wait, bool serialize)
{
call_single_data_t *csd;
call_single_data_t csd_stack = {
@@ -608,6 +606,9 @@ int smp_call_function_single(int cpu, smp_call_func_t func, void *info,
int this_cpu;
int err;

+ if (serialize)
+ mutex_lock(&per_cpu(serialize_ipi_lock, cpu));
+
/*
* prevent preemption and reschedule on another processor,
* as well as CPU removal
@@ -651,10 +652,38 @@ int smp_call_function_single(int cpu, smp_call_func_t func, void *info,

put_cpu();

+ if (serialize)
+ mutex_unlock(&per_cpu(serialize_ipi_lock, cpu));
+
return err;
}
+
+/*
+ * smp_call_function_single - Run a function on a specific CPU
+ * @func: The function to run. This must be fast and non-blocking.
+ * @info: An arbitrary pointer to pass to the function.
+ * @wait: If true, wait until function has completed on other CPUs.
+ *
+ * Returns 0 on success, else a negative status code.
+ */
+int smp_call_function_single(int cpu, smp_call_func_t func, void *info,
+ int wait)
+{
+ return _smp_call_function_single(cpu, func, info, wait, false);
+}
EXPORT_SYMBOL(smp_call_function_single);

+/*
+ * Run a function on a specific CPU. Serialize the IPI to that CPU with
+ * a mutex.
+ */
+int smp_call_function_single_serialize(int cpu, smp_call_func_t func, void *info,
+ int wait)
+{
+ return _smp_call_function_single(cpu, func, info, wait, true);
+}
+EXPORT_SYMBOL(smp_call_function_single_serialize);
+
/**
* smp_call_function_single_async() - Run an asynchronous function on a
* specific CPU.
@@ -880,6 +909,30 @@ void smp_call_function_many(const struct cpumask *mask,
}
EXPORT_SYMBOL(smp_call_function_many);

+/* Call with preemption *enabled*. */
+void smp_call_function_many_serialize(const struct cpumask *mask,
+ smp_call_func_t func, void *info, bool wait)
+{
+ unsigned int weight = cpumask_weight(mask);
+ int ret;
+
+ if (!weight)
+ return;
+
+ if (weight == 1) {
+ ret = smp_call_function_single_serialize(cpumask_first(mask), func, info, wait);
+ WARN_ON_ONCE(ret);
+ return;
+ }
+
+ mutex_lock(&serialize_ipi_broadcast_lock);
+ preempt_disable();
+ smp_call_function_many_cond(mask, func, info, wait * SCF_WAIT, NULL);
+ preempt_enable();
+ mutex_unlock(&serialize_ipi_broadcast_lock);
+}
+EXPORT_SYMBOL(smp_call_function_many_serialize);
+
/**
* smp_call_function(): Run a function on all other CPUs.
* @func: The function to run. This must be fast and non-blocking.
@@ -1025,6 +1078,35 @@ void on_each_cpu_cond_mask(smp_cond_func_t cond_func, smp_call_func_t func,
}
EXPORT_SYMBOL(on_each_cpu_cond_mask);

+/* Call with preemption enabled. */
+void on_each_cpu_cond_mask_serialize(smp_cond_func_t cond_func, smp_call_func_t func,
+ void *info, bool wait, const struct cpumask *mask)
+{
+ int cpu, ret;
+ unsigned int scf_flags = SCF_RUN_LOCAL, weight = cpumask_weight(mask);
+
+ if (!weight)
+ return;
+
+ if (weight == 1) {
+ cpu = cpumask_first(mask);
+ if (cond_func && !cond_func(cpu, info))
+ return;
+ ret = smp_call_function_single_serialize(cpu, func, info, wait);
+ WARN_ON_ONCE(ret);
+ }
+
+ if (wait)
+ scf_flags |= SCF_WAIT;
+
+ mutex_lock(&serialize_ipi_broadcast_lock);
+ preempt_disable();
+ smp_call_function_many_cond(mask, func, info, scf_flags, cond_func);
+ preempt_enable();
+ mutex_unlock(&serialize_ipi_broadcast_lock);
+}
+EXPORT_SYMBOL(on_each_cpu_cond_mask_serialize);
+
static void do_nothing(void *unused)
{
}
--
2.39.2