[PATCH v2] irqchip/gic-v3: use dsb(ishst) to synchronize data to smp before issuing ipi

From: Barry Song
Date: Sun Feb 20 2022 - 01:37:15 EST


From: Barry Song <song.bao.hua@xxxxxxxxxxxxx>

dsb(ishst) should be enough here as we only need to guarantee the
visibility of data to other CPUs in smp inner domain before we
send the ipi.

A micro-benchmark is written to verify the performance impact on
kunpeng920 machine with 2 sockets, each socket has 2 dies, and
each die has 24 CPUs, so totally the system has 2 * 2 * 24 = 96
CPUs. ~2% performance improvement can be seen by this benchmark.

The code of benchmark module:

#include <linux/module.h>
#include <linux/timekeeping.h>

volatile int data0 ____cacheline_aligned;
volatile int data1 ____cacheline_aligned;
volatile int data2 ____cacheline_aligned;
volatile int data3 ____cacheline_aligned;
volatile int data4 ____cacheline_aligned;
volatile int data5 ____cacheline_aligned;
volatile int data6 ____cacheline_aligned;

static void ipi_latency_func(void *val)
{
}

static int __init ipi_latency_init(void)
{
ktime_t stime, etime, delta;
int cpu, i;
int start = smp_processor_id();

stime = ktime_get();
for ( i = 0; i < 1000; i++)
for (cpu = 0; cpu < 96; cpu++) {
data0 = data1 = data2 = data3 = data4 = data5 = data6 = cpu;
smp_call_function_single(cpu, ipi_latency_func, NULL, 1);
}
etime = ktime_get();

delta = ktime_sub(etime, stime);

printk("%s ipi from cpu%d to cpu0-95 delta of 1000times:%lld\n",
__func__, start, delta);

return 0;
}
module_init(ipi_latency_init);

static void ipi_latency_exit(void)
{
}
module_exit(ipi_latency_exit);

MODULE_DESCRIPTION("IPI benchmark");
MODULE_LICENSE("GPL");

run the below commands 10 times on both Vanilla and the kernel with this
patch:
# taskset -c 0 insmod test.ko
# rmmod test

The result on vanilla:
ipi_latency_init ipi from cpu0 to cpu0-95 delta of 1000times:126757449
ipi_latency_init ipi from cpu0 to cpu0-95 delta of 1000times:126784249
ipi_latency_init ipi from cpu0 to cpu0-95 delta of 1000times:126177703
ipi_latency_init ipi from cpu0 to cpu0-95 delta of 1000times:127022281
ipi_latency_init ipi from cpu0 to cpu0-95 delta of 1000times:126184883
ipi_latency_init ipi from cpu0 to cpu0-95 delta of 1000times:127374585
ipi_latency_init ipi from cpu0 to cpu0-95 delta of 1000times:125778089
ipi_latency_init ipi from cpu0 to cpu0-95 delta of 1000times:126974441
ipi_latency_init ipi from cpu0 to cpu0-95 delta of 1000times:127357625
ipi_latency_init ipi from cpu0 to cpu0-95 delta of 1000times:126228184

The result on the kernel with this patch:
ipi_latency_init ipi from cpu0 to cpu0-95 delta of 1000times:124467401
ipi_latency_init ipi from cpu0 to cpu0-95 delta of 1000times:123474209
ipi_latency_init ipi from cpu0 to cpu0-95 delta of 1000times:123558497
ipi_latency_init ipi from cpu0 to cpu0-95 delta of 1000times:122993951
ipi_latency_init ipi from cpu0 to cpu0-95 delta of 1000times:122984223
ipi_latency_init ipi from cpu0 to cpu0-95 delta of 1000times:123323609
ipi_latency_init ipi from cpu0 to cpu0-95 delta of 1000times:124507583
ipi_latency_init ipi from cpu0 to cpu0-95 delta of 1000times:123386963
ipi_latency_init ipi from cpu0 to cpu0-95 delta of 1000times:123340664
ipi_latency_init ipi from cpu0 to cpu0-95 delta of 1000times:123285324

Signed-off-by: Barry Song <song.bao.hua@xxxxxxxxxxxxx>
---
-v2: add benchmark and benchmark data in commit log

drivers/irqchip/irq-gic-v3.c | 2 +-
1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/drivers/irqchip/irq-gic-v3.c b/drivers/irqchip/irq-gic-v3.c
index 5e935d97207d..0efe1a9a9f3b 100644
--- a/drivers/irqchip/irq-gic-v3.c
+++ b/drivers/irqchip/irq-gic-v3.c
@@ -1211,7 +1211,7 @@ static void gic_ipi_send_mask(struct irq_data *d, const struct cpumask *mask)
* Ensure that stores to Normal memory are visible to the
* other CPUs before issuing the IPI.
*/
- wmb();
+ dsb(ishst);

for_each_cpu(cpu, mask) {
u64 cluster_id = MPIDR_TO_SGI_CLUSTER_ID(cpu_logical_map(cpu));
--
2.25.1