Re: [tip: locking/core] lockdep: Fix lockdep recursion

From: Paul E. McKenney
Date: Wed Oct 14 2020 - 19:55:57 EST


On Thu, Oct 15, 2020 at 12:39:54AM +0200, Peter Zijlstra wrote:
> On Wed, Oct 14, 2020 at 03:11:52PM -0700, Paul E. McKenney wrote:
> > On Wed, Oct 14, 2020 at 11:53:19PM +0200, Peter Zijlstra wrote:
> > > On Wed, Oct 14, 2020 at 11:34:05AM -0700, Paul E. McKenney wrote:
> > > > commit 7deaa04b02298001426730ed0e6214ac20d1a1c1
> > > > Author: Paul E. McKenney <paulmck@xxxxxxxxxx>
> > > > Date: Tue Oct 13 12:39:23 2020 -0700
> > > >
> > > > rcu: Prevent lockdep-RCU splats on lock acquisition/release
> > > >
> > > > The rcu_cpu_starting() and rcu_report_dead() functions transition the
> > > > current CPU between online and offline state from an RCU perspective.
> > > > Unfortunately, this means that the rcu_cpu_starting() function's lock
> > > > acquisition and the rcu_report_dead() function's lock releases happen
> > > > while the CPU is offline from an RCU perspective, which can result in
> > > > lockdep-RCU splats about using RCU from an offline CPU. In reality,
> > > > aside from the splats, both transitions are safe because a new grace
> > > > period cannot start until these functions release their locks.
> > >
> > > But we call the trace_* crud before we acquire the lock. Are you sure
> > > that's a false-positive?
> >
> > You lost me on this one.
> >
> > I am assuming that you are talking about rcu_cpu_starting(), because
> > that is the one where RCU is not initially watching, that is, the
> > case where tracing before the lock acquisition would be a problem.
> > You cannot be talking about rcu_cpu_starting() itself, because it does
> > not do any tracing before acquiring the lock. But if you are talking
> > about the caller of rcu_cpu_starting(), then that caller should put the
> > rcu_cpu_starting() before the tracing. But that would be the other
> > patch earlier in this thread that was proposing moving the call to
> > rcu_cpu_starting() much earlier in CPU bringup.
> >
> > So what am I missing here?
>
> rcu_cpu_starting();
> raw_spin_lock_irqsave();
> local_irq_save();
> preempt_disable();
> spin_acquire()
> lock_acquire()
> trace_lock_acquire() <--- *whoopsie-doodle*
> /* uses RCU for tracing */
> arch_spin_lock_flags() <--- the actual spinlock

Gah! Idiot here left out the most important part, so good catch!!!
Much easier this way than finding out about it the hard way...

I should have asked myself harder questions earlier today about moving
the counter from the rcu_node structure to the rcu_data structure.

Perhaps something like the following untested patch on top of the
earlier patch?

Thanx, Paul

------------------------------------------------------------------------

diff --git a/kernel/rcu/tree.c b/kernel/rcu/tree.c
index 286dc0a..8b5215e 100644
--- a/kernel/rcu/tree.c
+++ b/kernel/rcu/tree.c
@@ -1159,8 +1159,8 @@ bool rcu_lockdep_current_cpu_online(void)
preempt_disable_notrace();
rdp = this_cpu_ptr(&rcu_data);
rnp = rdp->mynode;
- seq = READ_ONCE(rdp->ofl_seq) & ~0x1;
- if (rdp->grpmask & rcu_rnp_online_cpus(rnp) || seq != READ_ONCE(rdp->ofl_seq))
+ seq = READ_ONCE(rnp->ofl_seq) & ~0x1;
+ if (rdp->grpmask & rcu_rnp_online_cpus(rnp) || seq != READ_ONCE(rnp->ofl_seq))
ret = true;
preempt_enable_notrace();
return ret;
@@ -1982,6 +1982,7 @@ static void rcu_gp_fqs_loop(void)
static void rcu_gp_cleanup(void)
{
int cpu;
+ unsigned long firstseq;
bool needgp = false;
unsigned long gp_duration;
unsigned long new_gp_seq;
@@ -2019,6 +2020,12 @@ static void rcu_gp_cleanup(void)
new_gp_seq = rcu_state.gp_seq;
rcu_seq_end(&new_gp_seq);
rcu_for_each_node_breadth_first(rnp) {
+ smp_mb(); // Pair with barriers used when updating ->ofl_seq to odd values.
+ firstseq = READ_ONCE(rnp->ofl_seq);
+ if (firstseq & 0x1)
+ while (firstseq == smp_load_acquire(&rnp->ofl_seq))
+ schedule_timeout_idle(1); // Can't wake unless RCU is watching.
+ smp_mb(); // Pair with barriers used when updating ->ofl_seq to even values.
raw_spin_lock_irq_rcu_node(rnp);
if (WARN_ON_ONCE(rcu_preempt_blocked_readers_cgp(rnp)))
dump_blkd_tasks(rnp, 10);
@@ -4067,8 +4074,9 @@ void rcu_cpu_starting(unsigned int cpu)

rnp = rdp->mynode;
mask = rdp->grpmask;
- WRITE_ONCE(rdp->ofl_seq, rdp->ofl_seq + 1);
- WARN_ON_ONCE(!(rdp->ofl_seq & 0x1));
+ WRITE_ONCE(rnp->ofl_seq, rnp->ofl_seq + 1);
+ WARN_ON_ONCE(!(rnp->ofl_seq & 0x1));
+ smp_mb(); // Pair with rcu_gp_cleanup()'s ->ofl_seq barrier().
raw_spin_lock_irqsave_rcu_node(rnp, flags);
WRITE_ONCE(rnp->qsmaskinitnext, rnp->qsmaskinitnext | mask);
newcpu = !(rnp->expmaskinitnext & mask);
@@ -4088,8 +4096,9 @@ void rcu_cpu_starting(unsigned int cpu)
} else {
raw_spin_unlock_irqrestore_rcu_node(rnp, flags);
}
- WRITE_ONCE(rdp->ofl_seq, rdp->ofl_seq + 1);
- WARN_ON_ONCE(rdp->ofl_seq & 0x1);
+ smp_mb(); // Pair with rcu_gp_cleanup()'s ->ofl_seq barrier().
+ WRITE_ONCE(rnp->ofl_seq, rnp->ofl_seq + 1);
+ WARN_ON_ONCE(rnp->ofl_seq & 0x1);
smp_mb(); /* Ensure RCU read-side usage follows above initialization. */
}

@@ -4117,8 +4126,9 @@ void rcu_report_dead(unsigned int cpu)

/* Remove outgoing CPU from mask in the leaf rcu_node structure. */
mask = rdp->grpmask;
- WRITE_ONCE(rdp->ofl_seq, rdp->ofl_seq + 1);
- WARN_ON_ONCE(!(rdp->ofl_seq & 0x1));
+ WRITE_ONCE(rnp->ofl_seq, rnp->ofl_seq + 1);
+ WARN_ON_ONCE(!(rnp->ofl_seq & 0x1));
+ smp_mb(); // Pair with rcu_gp_cleanup()'s ->ofl_seq barrier().
raw_spin_lock(&rcu_state.ofl_lock);
raw_spin_lock_irqsave_rcu_node(rnp, flags); /* Enforce GP memory-order guarantee. */
rdp->rcu_ofl_gp_seq = READ_ONCE(rcu_state.gp_seq);
@@ -4131,8 +4141,9 @@ void rcu_report_dead(unsigned int cpu)
WRITE_ONCE(rnp->qsmaskinitnext, rnp->qsmaskinitnext & ~mask);
raw_spin_unlock_irqrestore_rcu_node(rnp, flags);
raw_spin_unlock(&rcu_state.ofl_lock);
- WRITE_ONCE(rdp->ofl_seq, rdp->ofl_seq + 1);
- WARN_ON_ONCE(rdp->ofl_seq & 0x1);
+ smp_mb(); // Pair with rcu_gp_cleanup()'s ->ofl_seq barrier().
+ WRITE_ONCE(rnp->ofl_seq, rnp->ofl_seq + 1);
+ WARN_ON_ONCE(rnp->ofl_seq & 0x1);

rdp->cpu_started = false;
}
diff --git a/kernel/rcu/tree.h b/kernel/rcu/tree.h
index bf0198d..7708ed1 100644
--- a/kernel/rcu/tree.h
+++ b/kernel/rcu/tree.h
@@ -56,6 +56,7 @@ struct rcu_node {
/* Initialized from ->qsmaskinitnext at the */
/* beginning of each grace period. */
unsigned long qsmaskinitnext;
+ unsigned long ofl_seq; /* CPU-hotplug operation sequence count. */
/* Online CPUs for next grace period. */
unsigned long expmask; /* CPUs or groups that need to check in */
/* to allow the current expedited GP */
@@ -250,7 +251,6 @@ struct rcu_data {
unsigned long rcu_onl_gp_seq; /* ->gp_seq at last online. */
short rcu_onl_gp_flags; /* ->gp_flags at last online. */
unsigned long last_fqs_resched; /* Time of last rcu_resched(). */
- unsigned long ofl_seq; /* CPU-hotplug operation sequence count. */

int cpu;
};