[PATCH RFC tip/core/rcu 1/5] rcu: Reduce overhead of cond_resched() checks for RCU

From: Paul E. McKenney
Date: Fri Jun 20 2014 - 14:33:39 EST


From: "Paul E. McKenney" <paulmck@xxxxxxxxxxxxxxxxxx>

Commit ac1bea85781e (Make cond_resched() report RCU quiescent states)
fixed a problem where a CPU looping in the kernel with but one runnable
task would give RCU CPU stall warnings, even if the in-kernel loop
contained cond_resched() calls. Unfortunately, in so doing, it introduced
performance regressions in Anton Blanchard's will-it-scale "open1" test.
The problem appears to be not so much the increased cond_resched() path
length as an increase in the rate at which grace periods complete, which
increased per-update grace-period overhead.

This commit takes a different approach to fixing this bug, mainly by
avoiding having cond_resched() do an RCU-visible quiescent state unless
there is a grace period that has been in flight for a significant period
of time. This commit also reduces the common-case cond_resched() overhead
to a check of a single per-CPU variable.

Reported-by: Dave Hansen <dave.hansen@xxxxxxxxx>
Signed-off-by: Paul E. McKenney <paulmck@xxxxxxxxxxxxxxxxxx>
Cc: Josh Triplett <josh@xxxxxxxxxxxxxxxx>
Cc: Andi Kleen <ak@xxxxxxxxxxxxxxx>
Cc: Christoph Lameter <cl@xxxxxxxxxx>
Cc: Mike Galbraith <umgwanakikbuti@xxxxxxxxx>
Cc: Eric Dumazet <eric.dumazet@xxxxxxxxx>
---
include/linux/rcupdate.h | 35 ----------------------
include/linux/rcutiny.h | 13 +++++++++
include/linux/rcutree.h | 16 +++++++++++
kernel/rcu/tree.c | 75 ++++++++++++++++++++++++++++++++++++++++++++----
kernel/rcu/tree.h | 6 +++-
kernel/rcu/tree_plugin.h | 2 +-
kernel/rcu/update.c | 18 ------------
7 files changed, 104 insertions(+), 61 deletions(-)

diff --git a/include/linux/rcupdate.h b/include/linux/rcupdate.h
index 5a75d19aa661..d5a3a015c80a 100644
--- a/include/linux/rcupdate.h
+++ b/include/linux/rcupdate.h
@@ -300,41 +300,6 @@ bool __rcu_is_watching(void);
#endif /* #if defined(CONFIG_DEBUG_LOCK_ALLOC) || defined(CONFIG_RCU_TRACE) || defined(CONFIG_SMP) */

/*
- * Hooks for cond_resched() and friends to avoid RCU CPU stall warnings.
- */
-
-#define RCU_COND_RESCHED_LIM 256 /* ms vs. 100s of ms. */
-DECLARE_PER_CPU(int, rcu_cond_resched_count);
-void rcu_resched(void);
-
-/*
- * Is it time to report RCU quiescent states?
- *
- * Note unsynchronized access to rcu_cond_resched_count. Yes, we might
- * increment some random CPU's count, and possibly also load the result from
- * yet another CPU's count. We might even clobber some other CPU's attempt
- * to zero its counter. This is all OK because the goal is not precision,
- * but rather reasonable amortization of rcu_note_context_switch() overhead
- * and extremely high probability of avoiding RCU CPU stall warnings.
- * Note that this function has to be preempted in just the wrong place,
- * many thousands of times in a row, for anything bad to happen.
- */
-static inline bool rcu_should_resched(void)
-{
- return raw_cpu_inc_return(rcu_cond_resched_count) >=
- RCU_COND_RESCHED_LIM;
-}
-
-/*
- * Report quiscent states to RCU if it is time to do so.
- */
-static inline void rcu_cond_resched(void)
-{
- if (unlikely(rcu_should_resched()))
- rcu_resched();
-}
-
-/*
* Infrastructure to implement the synchronize_() primitives in
* TREE_RCU and rcu_barrier_() primitives in TINY_RCU.
*/
diff --git a/include/linux/rcutiny.h b/include/linux/rcutiny.h
index d40a6a451330..ff2ede319890 100644
--- a/include/linux/rcutiny.h
+++ b/include/linux/rcutiny.h
@@ -83,6 +83,19 @@ static inline void rcu_note_context_switch(int cpu)
rcu_sched_qs(cpu);
}

+static inline bool rcu_should_resched(void)
+{
+ return false;
+}
+
+static inline void rcu_cond_resched(void)
+{
+}
+
+static inline void rcu_resched(void)
+{
+}
+
/*
* Take advantage of the fact that there is only one CPU, which
* allows us to ignore virtualization-based context switches.
diff --git a/include/linux/rcutree.h b/include/linux/rcutree.h
index 3e2f5d432743..16780fed7155 100644
--- a/include/linux/rcutree.h
+++ b/include/linux/rcutree.h
@@ -46,6 +46,22 @@ static inline void rcu_virt_note_context_switch(int cpu)
rcu_note_context_switch(cpu);
}

+DECLARE_PER_CPU(int, rcu_cond_resched_mask);
+void rcu_resched(void);
+
+/* Is it time to report RCU quiescent states? */
+static inline bool rcu_should_resched(void)
+{
+ return raw_cpu_read(rcu_cond_resched_mask);
+}
+
+/* Report quiescent states to RCU if it is time to do so. */
+static inline void rcu_cond_resched(void)
+{
+ if (unlikely(rcu_should_resched()))
+ rcu_resched();
+}
+
void synchronize_rcu_bh(void);
void synchronize_sched_expedited(void);
void synchronize_rcu_expedited(void);
diff --git a/kernel/rcu/tree.c b/kernel/rcu/tree.c
index f1ba77363fbb..2cc72ce19ff6 100644
--- a/kernel/rcu/tree.c
+++ b/kernel/rcu/tree.c
@@ -229,6 +229,58 @@ static DEFINE_PER_CPU(struct rcu_dynticks, rcu_dynticks) = {
#endif /* #ifdef CONFIG_NO_HZ_FULL_SYSIDLE */
};

+/*
+ * Hooks for cond_resched() and friends to avoid RCU CPU stall warnings.
+ */
+
+DEFINE_PER_CPU(int, rcu_cond_resched_mask);
+
+/*
+ * Let the RCU core know that this CPU has gone through a cond_resched(),
+ * which is a quiescent state.
+ */
+void rcu_resched(void)
+{
+ unsigned long flags;
+ struct rcu_data *rdp;
+ struct rcu_dynticks *rdtp;
+ int resched_mask;
+ struct rcu_state *rsp;
+
+ local_irq_save(flags);
+
+ /*
+ * Yes, we can lose flag-setting operations. This is OK, because
+ * the flag will be set again after some delay.
+ */
+ resched_mask = raw_cpu_read(rcu_cond_resched_mask);
+ raw_cpu_write(rcu_cond_resched_mask, 0);
+
+ /* Find the flavor that needs a quiescent state. */
+ for_each_rcu_flavor(rsp) {
+ rdp = raw_cpu_ptr(rsp->rda);
+ if (!(resched_mask & rsp->flavor_mask))
+ continue;
+ smp_mb(); /* ->flavor_mask before ->cond_resched_completed. */
+ if (ACCESS_ONCE(rdp->mynode->completed) !=
+ ACCESS_ONCE(rdp->cond_resched_completed))
+ continue;
+
+ /*
+ * Pretend to be momentarily idle for the quiescent state.
+ * This allows the grace-period kthread to record the
+ * quiescent state, with no need for this CPU to do anything
+ * further.
+ */
+ rdtp = this_cpu_ptr(&rcu_dynticks);
+ smp_mb__before_atomic(); /* Earlier stuff before QS. */
+ atomic_add(2, &rdtp->dynticks); /* QS. */
+ smp_mb__after_atomic(); /* Later stuff after QS. */
+ break;
+ }
+ local_irq_restore(flags);
+}
+
static long blimit = 10; /* Maximum callbacks per rcu_do_batch. */
static long qhimark = 10000; /* If this many pending, ignore blimit. */
static long qlowmark = 100; /* Once only this many pending, use blimit. */
@@ -853,6 +905,7 @@ static int rcu_implicit_dynticks_qs(struct rcu_data *rdp,
bool *isidle, unsigned long *maxj)
{
unsigned int curr;
+ int *rcrmp;
unsigned int snap;

curr = (unsigned int)atomic_add_return(0, &rdp->dynticks->dynticks);
@@ -893,13 +946,20 @@ static int rcu_implicit_dynticks_qs(struct rcu_data *rdp,
}

/*
- * There is a possibility that a CPU in adaptive-ticks state
- * might run in the kernel with the scheduling-clock tick disabled
- * for an extended time period. Invoke rcu_kick_nohz_cpu() to
- * force the CPU to restart the scheduling-clock tick in this
- * CPU is in this state.
+ * A CPU running for an extended time within the kernel can
+ * delay RCU grace periods. When the CPU is in NO_HZ_FULL mode,
+ * even context-switching back and forth between a pair of
+ * in-kernel CPU-bound tasks cannot advance grace periods.
+ * So if the grace period is old enough, make the CPU pay attention.
*/
- rcu_kick_nohz_cpu(rdp->cpu);
+ if (ULONG_CMP_GE(jiffies, rdp->rsp->gp_start + 7)) {
+ rcrmp = &per_cpu(rcu_cond_resched_mask, rdp->cpu);
+ ACCESS_ONCE(rdp->cond_resched_completed) =
+ ACCESS_ONCE(rdp->mynode->completed);
+ smp_mb(); /* ->cond_resched_completed before *rcrmp. */
+ ACCESS_ONCE(*rcrmp) =
+ ACCESS_ONCE(*rcrmp) + rdp->rsp->flavor_mask;
+ }

/*
* Alternatively, the CPU might be running in the kernel
@@ -3491,6 +3551,7 @@ static void __init rcu_init_one(struct rcu_state *rsp,
"rcu_node_fqs_1",
"rcu_node_fqs_2",
"rcu_node_fqs_3" }; /* Match MAX_RCU_LVLS */
+ static u8 fl_mask = 0x1;
int cpustride = 1;
int i;
int j;
@@ -3509,6 +3570,8 @@ static void __init rcu_init_one(struct rcu_state *rsp,
for (i = 1; i < rcu_num_lvls; i++)
rsp->level[i] = rsp->level[i - 1] + rsp->levelcnt[i - 1];
rcu_init_levelspread(rsp);
+ rsp->flavor_mask = fl_mask;
+ fl_mask <<= 1;

/* Initialize the elements themselves, starting from the leaves. */

diff --git a/kernel/rcu/tree.h b/kernel/rcu/tree.h
index bf2c1e669691..0f69a79c5b7d 100644
--- a/kernel/rcu/tree.h
+++ b/kernel/rcu/tree.h
@@ -307,6 +307,9 @@ struct rcu_data {
/* 4) reasons this CPU needed to be kicked by force_quiescent_state */
unsigned long dynticks_fqs; /* Kicked due to dynticks idle. */
unsigned long offline_fqs; /* Kicked due to being offline. */
+ unsigned long cond_resched_completed;
+ /* Grace period that needs help */
+ /* from cond_resched(). */

/* 5) __rcu_pending() statistics. */
unsigned long n_rcu_pending; /* rcu_pending() calls since boot. */
@@ -392,6 +395,7 @@ struct rcu_state {
struct rcu_node *level[RCU_NUM_LVLS]; /* Hierarchy levels. */
u32 levelcnt[MAX_RCU_LVLS + 1]; /* # nodes in each level. */
u8 levelspread[RCU_NUM_LVLS]; /* kids/node in each level. */
+ u8 flavor_mask; /* bit in flavor mask. */
struct rcu_data __percpu *rda; /* pointer of percu rcu_data. */
void (*call)(struct rcu_head *head, /* call_rcu() flavor. */
void (*func)(struct rcu_head *head));
@@ -563,7 +567,7 @@ static bool rcu_nocb_need_deferred_wakeup(struct rcu_data *rdp);
static void do_nocb_deferred_wakeup(struct rcu_data *rdp);
static void rcu_boot_init_nocb_percpu_data(struct rcu_data *rdp);
static void rcu_spawn_nocb_kthreads(struct rcu_state *rsp);
-static void rcu_kick_nohz_cpu(int cpu);
+static void __maybe_unused rcu_kick_nohz_cpu(int cpu);
static bool init_nocb_callback_list(struct rcu_data *rdp);
static void rcu_sysidle_enter(struct rcu_dynticks *rdtp, int irq);
static void rcu_sysidle_exit(struct rcu_dynticks *rdtp, int irq);
diff --git a/kernel/rcu/tree_plugin.h b/kernel/rcu/tree_plugin.h
index cbc2c45265e2..02ac0fb186b8 100644
--- a/kernel/rcu/tree_plugin.h
+++ b/kernel/rcu/tree_plugin.h
@@ -2404,7 +2404,7 @@ static bool init_nocb_callback_list(struct rcu_data *rdp)
* if an adaptive-ticks CPU is failing to respond to the current grace
* period and has not be idle from an RCU perspective, kick it.
*/
-static void rcu_kick_nohz_cpu(int cpu)
+static void __maybe_unused rcu_kick_nohz_cpu(int cpu)
{
#ifdef CONFIG_NO_HZ_FULL
if (tick_nohz_full_cpu(cpu))
diff --git a/kernel/rcu/update.c b/kernel/rcu/update.c
index a2aeb4df0f60..d22309cae9f5 100644
--- a/kernel/rcu/update.c
+++ b/kernel/rcu/update.c
@@ -350,21 +350,3 @@ static int __init check_cpu_stall_init(void)
early_initcall(check_cpu_stall_init);

#endif /* #ifdef CONFIG_RCU_STALL_COMMON */
-
-/*
- * Hooks for cond_resched() and friends to avoid RCU CPU stall warnings.
- */
-
-DEFINE_PER_CPU(int, rcu_cond_resched_count);
-
-/*
- * Report a set of RCU quiescent states, for use by cond_resched()
- * and friends. Out of line due to being called infrequently.
- */
-void rcu_resched(void)
-{
- preempt_disable();
- __this_cpu_write(rcu_cond_resched_count, 0);
- rcu_note_context_switch(smp_processor_id());
- preempt_enable();
-}
--
1.8.1.5

--
To unsubscribe from this list: send the line "unsubscribe linux-kernel" in
the body of a message to majordomo@xxxxxxxxxxxxxxx
More majordomo info at http://vger.kernel.org/majordomo-info.html
Please read the FAQ at http://www.tux.org/lkml/