[PATCH RFC] x86/entry: Ask RCU if it needs rcu_irq_{enter,exit}()

From: Paul E. McKenney
Date: Thu Jun 11 2020 - 19:53:08 EST


RCU needs to detect when one of its interrupt handlers interrupted an idle
state, where an idle state is either the idle loop itself or nohz_full
userspace execution. When a CPU has been interrupted from one of these
idle states, RCU can report a quiescent state, helping the current grace
period to come to an end.

However, there are optimized kernel-entry paths where handlers that
would normally be executed in interrupt context are instead executed
directly from the base process context, but with interrupts disabled.
When such a directly-executed handler is followed by softirq processing
(which re-enables interrupts), it is possible that one of RCU's interrupt
handlers will interrupt this softirq processing. This situation can cause
RCU to incorrectly assume that the CPU has passed through a quiescent
state, when in fact the CPU is instead in the midst of softirq processing,
and might well be within an RCU read-side critical section. In that case,
reporting a quiescent state can result in too-short RCU grace periods,
leading to arbitrary memory corruption and a sharp degradation in the
actuarial statistics of your kernel.
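
To make the interleaving concrete, the problematic sequence on such an
optimized entry path looks roughly like this (handle_irq_directly() and
do_softirq_processing() are illustrative placeholders, not real kernel
functions):

	/* Base process context, interrupts disabled. */
	handle_irq_directly();		/* Handler that would normally run
					 * in interrupt context. */
	do_softirq_processing();	/* Re-enables interrupts, so the
					 * scheduler-clock interrupt can fire
					 * here and RCU can wrongly report a
					 * quiescent state, even though this
					 * softirq might be within an RCU
					 * read-side critical section. */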

The fix is for the optimized kernel-entry paths to replace the current
call to __rcu_is_watching() with a call to a new rcu_needs_irq_enter()
function, which returns true iff RCU needs explicit calls to
rcu_irq_enter() and rcu_irq_exit() surrounding the optimized invocation
of the handler. These explicit calls are never required in Tiny RCU,
and in Tree RCU are required only if the CPU might have interrupted
nohz_full userspace execution or the idle loop. There is the usual
precision/overhead tradeoff, with the current implementation majoring
in low common-case overhead.
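
In rough outline (the real usage is the idtentry_enter_cond_rcu() hunk
below, which also handles the careful ordering against lockdep and
tracing), an optimized entry path uses the new helper like this, with
run_handler() standing in for the directly-invoked handler:

	bool rcu_irq = rcu_needs_irq_enter();

	if (rcu_irq)
		rcu_irq_enter();	/* CPU might have been interrupted from
					 * the idle loop or nohz_full userspace. */
	run_handler();			/* Directly-invoked handler, irqs off. */
	if (rcu_irq)
		rcu_irq_exit();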

While in the area, the commit also returns rcu_is_cpu_rrupt_from_idle()
to its original semantics.

This has been subjected only to very light rcutorture testing, so use
appropriate caution. The placement of the new rcu_needs_irq_enter()
is not ideal, but the more natural include/linux/hardirq.h location has
#include issues.

Suggested-by: Thomas Gleixner <tglx@xxxxxxxxxxxxx>
Signed-off-by: Paul E. McKenney <paulmck@xxxxxxxxxx>

diff --git a/arch/x86/entry/common.c b/arch/x86/entry/common.c
index f4d5778..50d002a 100644
--- a/arch/x86/entry/common.c
+++ b/arch/x86/entry/common.c
@@ -517,6 +517,41 @@ SYSCALL_DEFINE0(ni_syscall)
return -ENOSYS;
}

+/*
+ * RCU needs to detect when one of its interrupt handlers interrupted
+ * an idle state, where an idle state is either the idle loop itself or
+ * nohz_full userspace execution. When a CPU has been interrupted from
+ * one of these idle states, RCU can report a quiescent state, helping
+ * the current grace period to come to an end.
+ *
+ * However, there are optimized kernel-entry paths where handlers that
+ * would normally be executed in interrupt context are instead executed
+ * directly from the base process context, but with interrupts disabled.
+ * When such a directly-executed handler is followed by softirq processing
+ * (which re-enables interrupts), it is possible that one of RCU's interrupt
+ * handlers will interrupt this softirq processing. This situation
+ * can cause RCU to incorrectly assume that the CPU has passed through a
+ * quiescent state, when in fact the CPU is instead in the midst of softirq
+ * processing, and might well be within an RCU read-side critical section.
+ * In that case, reporting a quiescent state can result in too-short
+ * RCU grace periods, leading to arbitrary memory corruption and a sharp
+ * degradation in the actuarial statistics of your kernel.
+ *
+ * The fix is for the optimized kernel-entry paths to make use of
+ * this function, which returns true iff RCU needs explicit calls to
+ * rcu_irq_enter() and rcu_irq_exit() surrounding the optimized invocation
+ * of the handler. These explicit calls are never required in Tiny RCU,
+ * and in Tree RCU are required only if the CPU might have interrupted
+ * nohz_full userspace execution or the idle loop. There is the usual
+ * precision/overhead tradeoff, with the current implementation majoring
+ * in low common-case overhead.
+ */
+static __always_inline bool rcu_needs_irq_enter(void)
+{
+	return !IS_ENABLED(CONFIG_TINY_RCU) &&
+	       (context_tracking_enabled_cpu(smp_processor_id()) || is_idle_task(current));
+}
+
/**
* idtentry_enter_cond_rcu - Handle state tracking on idtentry with conditional
* RCU handling
@@ -557,7 +592,7 @@ bool noinstr idtentry_enter_cond_rcu(struct pt_regs *regs)
return false;
}

- if (!__rcu_is_watching()) {
+ if (rcu_needs_irq_enter()) {
/*
* If RCU is not watching then the same careful
* sequence vs. lockdep and tracing is required
diff --git a/include/linux/rcutiny.h b/include/linux/rcutiny.h
index 8512cae..957ed29 100644
--- a/include/linux/rcutiny.h
+++ b/include/linux/rcutiny.h
@@ -87,7 +87,6 @@ static inline void rcu_scheduler_starting(void) { }
static inline void rcu_end_inkernel_boot(void) { }
static inline bool rcu_inkernel_boot_has_ended(void) { return true; }
static inline bool rcu_is_watching(void) { return true; }
-static inline bool __rcu_is_watching(void) { return true; }
static inline void rcu_momentary_dyntick_idle(void) { }
static inline void kfree_rcu_scheduler_running(void) { }
static inline bool rcu_gp_might_be_stalled(void) { return false; }
diff --git a/include/linux/rcutree.h b/include/linux/rcutree.h
index d5cc9d67..e7072a0 100644
--- a/include/linux/rcutree.h
+++ b/include/linux/rcutree.h
@@ -64,7 +64,6 @@ extern int rcu_scheduler_active __read_mostly;
void rcu_end_inkernel_boot(void);
bool rcu_inkernel_boot_has_ended(void);
bool rcu_is_watching(void);
-bool __rcu_is_watching(void);
#ifndef CONFIG_PREEMPTION
void rcu_all_qs(void);
#endif
diff --git a/kernel/rcu/tree.c b/kernel/rcu/tree.c
index c716ead..d833e10 100644
--- a/kernel/rcu/tree.c
+++ b/kernel/rcu/tree.c
@@ -429,30 +429,25 @@ static int rcu_is_cpu_rrupt_from_idle(void)
{
long nesting;

- /*
- * Usually called from the tick; but also used from smp_function_call()
- * for expedited grace periods. This latter can result in running from
- * the idle task, instead of an actual IPI.
- */
+ // Usually called from the tick; but also used from smp_function_call()
+ // for expedited grace periods. This latter can result in running from
+ // the idle task, instead of an actual IPI.
lockdep_assert_irqs_disabled();

- /* Check for counter underflows */
+ // Check for counter underflows
RCU_LOCKDEP_WARN(__this_cpu_read(rcu_data.dynticks_nesting) < 0,
"RCU dynticks_nesting counter underflow!");
RCU_LOCKDEP_WARN(__this_cpu_read(rcu_data.dynticks_nmi_nesting) <= 0,
"RCU dynticks_nmi_nesting counter underflow/zero!");

- /* Are we at first interrupt nesting level? */
+ // Are we at first interrupt nesting level?
nesting = __this_cpu_read(rcu_data.dynticks_nmi_nesting);
- if (nesting > 1)
- return false;
-
- /*
- * If we're not in an interrupt, we must be in the idle task!
- */
+ // If we're not in an interrupt, we must be in the idle task!
WARN_ON_ONCE(!nesting && !is_idle_task(current));
+	if (nesting != 1)
+		return false; // Not in irq or in nested irq.

- /* Does CPU appear to be idle from an RCU standpoint? */
+ // Does CPU appear to be idle from an RCU standpoint?
return __this_cpu_read(rcu_data.dynticks_nesting) == 0;
}

@@ -1058,11 +1053,6 @@ static void rcu_disable_urgency_upon_qs(struct rcu_data *rdp)
}
}

-noinstr bool __rcu_is_watching(void)
-{
- return !rcu_dynticks_curr_cpu_in_eqs();
-}
-
/**
* rcu_is_watching - see if RCU thinks that the current CPU is not idle
*