Re: [PATCH 3/3] IPI: Avoid to use 2 cache lines for one call_single_data

From: Peter Zijlstra
Date: Thu Aug 03 2017 - 04:58:11 EST


On Thu, Aug 03, 2017 at 04:35:21PM +0800, Huang, Ying wrote:
> diff --git a/include/linux/smp.h b/include/linux/smp.h
> index 68123c1fe549..4d3b372d50b0 100644
> --- a/include/linux/smp.h
> +++ b/include/linux/smp.h
> @@ -13,13 +13,22 @@
> #include <linux/init.h>
> #include <linux/llist.h>
>
> +#define CSD_ALIGNMENT (4 * sizeof(void *))
> +
> typedef void (*smp_call_func_t)(void *info);
> struct call_single_data {
> struct llist_node llist;
> smp_call_func_t func;
> void *info;
> unsigned int flags;
> -};
> +} __aligned(CSD_ALIGNMENT);
> +
> +/* To avoid allocate csd across 2 cache lines */
> +static inline void check_alignment_of_csd(void)
> +{
> + BUILD_BUG_ON((CSD_ALIGNMENT & (CSD_ALIGNMENT - 1)) != 0);
> + BUILD_BUG_ON(sizeof(struct call_single_data) > CSD_ALIGNMENT);
> +}
>
> /* total number of cpus in this system (may exceed NR_CPUS) */
> extern unsigned int total_cpus;

Bah, C sucks.. a much larger but possibly nicer patch

---
diff --git a/arch/mips/kernel/smp.c b/arch/mips/kernel/smp.c
index 770d4d1516cb..bd8ba5472bca 100644
--- a/arch/mips/kernel/smp.c
+++ b/arch/mips/kernel/smp.c
@@ -648,12 +648,12 @@ EXPORT_SYMBOL(flush_tlb_one);
#ifdef CONFIG_GENERIC_CLOCKEVENTS_BROADCAST

static DEFINE_PER_CPU(atomic_t, tick_broadcast_count);
-static DEFINE_PER_CPU(struct call_single_data, tick_broadcast_csd);
+static DEFINE_PER_CPU(call_single_data_t, tick_broadcast_csd);

void tick_broadcast(const struct cpumask *mask)
{
atomic_t *count;
- struct call_single_data *csd;
+ call_single_data_t *csd;
int cpu;

for_each_cpu(cpu, mask) {
@@ -674,7 +674,7 @@ static void tick_broadcast_callee(void *info)

static int __init tick_broadcast_init(void)
{
- struct call_single_data *csd;
+ call_single_data_t *csd;
int cpu;

for (cpu = 0; cpu < NR_CPUS; cpu++) {
diff --git a/block/blk-softirq.c b/block/blk-softirq.c
index 87b7df4851bf..07125e7941f4 100644
--- a/block/blk-softirq.c
+++ b/block/blk-softirq.c
@@ -60,7 +60,7 @@ static void trigger_softirq(void *data)
static int raise_blk_irq(int cpu, struct request *rq)
{
if (cpu_online(cpu)) {
- struct call_single_data *data = &rq->csd;
+ call_single_data_t *data = &rq->csd;

data->func = trigger_softirq;
data->info = rq;
diff --git a/drivers/block/null_blk.c b/drivers/block/null_blk.c
index 85c24cace973..81142ce781da 100644
--- a/drivers/block/null_blk.c
+++ b/drivers/block/null_blk.c
@@ -13,7 +13,7 @@
struct nullb_cmd {
struct list_head list;
struct llist_node ll_list;
- struct call_single_data csd;
+ call_single_data_t csd;
struct request *rq;
struct bio *bio;
unsigned int tag;
diff --git a/drivers/cpuidle/coupled.c b/drivers/cpuidle/coupled.c
index 71e586d7df71..e54be79b2084 100644
--- a/drivers/cpuidle/coupled.c
+++ b/drivers/cpuidle/coupled.c
@@ -119,7 +119,7 @@ struct cpuidle_coupled {

#define CPUIDLE_COUPLED_NOT_IDLE (-1)

-static DEFINE_PER_CPU(struct call_single_data, cpuidle_coupled_poke_cb);
+static DEFINE_PER_CPU(call_single_data_t, cpuidle_coupled_poke_cb);

/*
* The cpuidle_coupled_poke_pending mask is used to avoid calling
@@ -339,7 +339,7 @@ static void cpuidle_coupled_handle_poke(void *info)
*/
static void cpuidle_coupled_poke(int cpu)
{
- struct call_single_data *csd = &per_cpu(cpuidle_coupled_poke_cb, cpu);
+ call_single_data_t *csd = &per_cpu(cpuidle_coupled_poke_cb, cpu);

if (!cpumask_test_and_set_cpu(cpu, &cpuidle_coupled_poke_pending))
smp_call_function_single_async(cpu, csd);
@@ -651,7 +651,7 @@ int cpuidle_coupled_register_device(struct cpuidle_device *dev)
{
int cpu;
struct cpuidle_device *other_dev;
- struct call_single_data *csd;
+ call_single_data_t *csd;
struct cpuidle_coupled *coupled;

if (cpumask_empty(&dev->coupled_cpus))
diff --git a/drivers/net/ethernet/cavium/liquidio/lio_main.c b/drivers/net/ethernet/cavium/liquidio/lio_main.c
index 51583ae4b1eb..120b6e537b28 100644
--- a/drivers/net/ethernet/cavium/liquidio/lio_main.c
+++ b/drivers/net/ethernet/cavium/liquidio/lio_main.c
@@ -2468,7 +2468,7 @@ static void liquidio_napi_drv_callback(void *arg)
if (OCTEON_CN23XX_PF(oct) || droq->cpu_id == this_cpu) {
napi_schedule_irqoff(&droq->napi);
} else {
- struct call_single_data *csd = &droq->csd;
+ call_single_data_t *csd = &droq->csd;

csd->func = napi_schedule_wrapper;
csd->info = &droq->napi;
diff --git a/drivers/net/ethernet/cavium/liquidio/octeon_droq.h b/drivers/net/ethernet/cavium/liquidio/octeon_droq.h
index 6efd139b894d..f91bc84d1719 100644
--- a/drivers/net/ethernet/cavium/liquidio/octeon_droq.h
+++ b/drivers/net/ethernet/cavium/liquidio/octeon_droq.h
@@ -328,7 +328,7 @@ struct octeon_droq {

u32 cpu_id;

- struct call_single_data csd;
+ call_single_data_t csd;
};

#define OCT_DROQ_SIZE (sizeof(struct octeon_droq))
diff --git a/include/linux/blkdev.h b/include/linux/blkdev.h
index 25f6a0cb27d3..006fa09a641e 100644
--- a/include/linux/blkdev.h
+++ b/include/linux/blkdev.h
@@ -134,7 +134,7 @@ typedef __u32 __bitwise req_flags_t;
struct request {
struct list_head queuelist;
union {
- struct call_single_data csd;
+ call_single_data_t csd;
u64 fifo_time;
};

diff --git a/include/linux/netdevice.h b/include/linux/netdevice.h
index 779b23595596..6557f320b66e 100644
--- a/include/linux/netdevice.h
+++ b/include/linux/netdevice.h
@@ -2774,7 +2774,7 @@ struct softnet_data {
unsigned int input_queue_head ____cacheline_aligned_in_smp;

/* Elements below can be accessed between CPUs for RPS/RFS */
- struct call_single_data csd ____cacheline_aligned_in_smp;
+ call_single_data_t csd ____cacheline_aligned_in_smp;
struct softnet_data *rps_ipi_next;
unsigned int cpu;
unsigned int input_queue_tail;
diff --git a/include/linux/smp.h b/include/linux/smp.h
index 68123c1fe549..8d817cb80a38 100644
--- a/include/linux/smp.h
+++ b/include/linux/smp.h
@@ -14,13 +14,16 @@
#include <linux/llist.h>

typedef void (*smp_call_func_t)(void *info);
-struct call_single_data {
+struct __call_single_data {
struct llist_node llist;
smp_call_func_t func;
void *info;
unsigned int flags;
};

+typedef struct __call_single_data call_single_data_t
+ __aligned(sizeof(struct __call_single_data));
+
/* total number of cpus in this system (may exceed NR_CPUS) */
extern unsigned int total_cpus;

@@ -48,7 +51,7 @@ void on_each_cpu_cond(bool (*cond_func)(int cpu, void *info),
smp_call_func_t func, void *info, bool wait,
gfp_t gfp_flags);

-int smp_call_function_single_async(int cpu, struct call_single_data *csd);
+int smp_call_function_single_async(int cpu, call_single_data_t *csd);

#ifdef CONFIG_SMP

diff --git a/kernel/sched/sched.h b/kernel/sched/sched.h
index b2f9c9f7c0ee..39f9cc47eb1c 100644
--- a/kernel/sched/sched.h
+++ b/kernel/sched/sched.h
@@ -769,7 +769,7 @@ struct rq {
#ifdef CONFIG_SCHED_HRTICK
#ifdef CONFIG_SMP
int hrtick_csd_pending;
- struct call_single_data hrtick_csd;
+ call_single_data_t hrtick_csd;
#endif
struct hrtimer hrtick_timer;
#endif
diff --git a/kernel/smp.c b/kernel/smp.c
index 3061483cb3ad..1fb84b6db429 100644
--- a/kernel/smp.c
+++ b/kernel/smp.c
@@ -28,7 +28,7 @@ enum {
};

struct call_function_data {
- struct call_single_data __percpu *csd;
+ call_single_data_t __percpu *csd;
cpumask_var_t cpumask;
cpumask_var_t cpumask_ipi;
};
@@ -51,7 +51,7 @@ int smpcfd_prepare_cpu(unsigned int cpu)
free_cpumask_var(cfd->cpumask);
return -ENOMEM;
}
- cfd->csd = alloc_percpu(struct call_single_data);
+ cfd->csd = alloc_percpu(call_single_data_t);
if (!cfd->csd) {
free_cpumask_var(cfd->cpumask);
free_cpumask_var(cfd->cpumask_ipi);
@@ -103,12 +103,12 @@ void __init call_function_init(void)
* previous function call. For multi-cpu calls its even more interesting
* as we'll have to ensure no other cpu is observing our csd.
*/
-static __always_inline void csd_lock_wait(struct call_single_data *csd)
+static __always_inline void csd_lock_wait(call_single_data_t *csd)
{
smp_cond_load_acquire(&csd->flags, !(VAL & CSD_FLAG_LOCK));
}

-static __always_inline void csd_lock(struct call_single_data *csd)
+static __always_inline void csd_lock(call_single_data_t *csd)
{
csd_lock_wait(csd);
csd->flags |= CSD_FLAG_LOCK;
@@ -121,7 +121,7 @@ static __always_inline void csd_lock(struct call_single_data *csd)
smp_wmb();
}

-static __always_inline void csd_unlock(struct call_single_data *csd)
+static __always_inline void csd_unlock(call_single_data_t *csd)
{
WARN_ON(!(csd->flags & CSD_FLAG_LOCK));

@@ -131,14 +131,14 @@ static __always_inline void csd_unlock(struct call_single_data *csd)
smp_store_release(&csd->flags, 0);
}

-static DEFINE_PER_CPU_SHARED_ALIGNED(struct call_single_data, csd_data);
+static DEFINE_PER_CPU_SHARED_ALIGNED(call_single_data_t, csd_data);

/*
* Insert a previously allocated call_single_data element
* for execution on the given CPU. data must already have
* ->func, ->info, and ->flags set.
*/
-static int generic_exec_single(int cpu, struct call_single_data *csd,
+static int generic_exec_single(int cpu, call_single_data_t *csd,
smp_call_func_t func, void *info)
{
if (cpu == smp_processor_id()) {
@@ -210,7 +210,7 @@ static void flush_smp_call_function_queue(bool warn_cpu_offline)
{
struct llist_head *head;
struct llist_node *entry;
- struct call_single_data *csd, *csd_next;
+ call_single_data_t *csd, *csd_next;
static bool warned;

WARN_ON(!irqs_disabled());
@@ -268,8 +268,8 @@ static void flush_smp_call_function_queue(bool warn_cpu_offline)
int smp_call_function_single(int cpu, smp_call_func_t func, void *info,
int wait)
{
- struct call_single_data *csd;
- struct call_single_data csd_stack = { .flags = CSD_FLAG_LOCK | CSD_FLAG_SYNCHRONOUS };
+ call_single_data_t *csd;
+ call_single_data_t csd_stack = { .flags = CSD_FLAG_LOCK | CSD_FLAG_SYNCHRONOUS };
int this_cpu;
int err;

@@ -321,7 +321,7 @@ EXPORT_SYMBOL(smp_call_function_single);
* NOTE: Be careful, there is unfortunately no current debugging facility to
* validate the correctness of this serialization.
*/
-int smp_call_function_single_async(int cpu, struct call_single_data *csd)
+int smp_call_function_single_async(int cpu, call_single_data_t *csd)
{
int err = 0;

@@ -444,7 +444,7 @@ void smp_call_function_many(const struct cpumask *mask,

cpumask_clear(cfd->cpumask_ipi);
for_each_cpu(cpu, cfd->cpumask) {
- struct call_single_data *csd = per_cpu_ptr(cfd->csd, cpu);
+ call_single_data_t *csd = per_cpu_ptr(cfd->csd, cpu);

csd_lock(csd);
if (wait)
@@ -460,7 +460,7 @@ void smp_call_function_many(const struct cpumask *mask,

if (wait) {
for_each_cpu(cpu, cfd->cpumask) {
- struct call_single_data *csd;
+ call_single_data_t *csd;

csd = per_cpu_ptr(cfd->csd, cpu);
csd_lock_wait(csd);
diff --git a/kernel/up.c b/kernel/up.c
index ee81ac9af4ca..42c46bf3e0a5 100644
--- a/kernel/up.c
+++ b/kernel/up.c
@@ -23,7 +23,7 @@ int smp_call_function_single(int cpu, void (*func) (void *info), void *info,
}
EXPORT_SYMBOL(smp_call_function_single);

-int smp_call_function_single_async(int cpu, struct call_single_data *csd)
+int smp_call_function_single_async(int cpu, call_single_data_t *csd)
{
unsigned long flags;