[PATCH v5 08/18] rcu: Add per-CB tracing for queuing, flush and invocation.

From: Joel Fernandes (Google)
Date: Thu Sep 01 2022 - 18:18:57 EST


This is very useful for debugging whether call_rcu_lazy() is working
correctly. In the future, BPF tools could also be extended to read the
new information.

The additional tracing is gated by a new CONFIG_RCU_TRACE_CB option,
which is disabled by default.
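
When enabled, the existing rcu_invoke_callback tracepoint gains the new
per-CB fields. Roughly (a sketch of the new format string, not captured
trace output):

  rcu_invoke_callback: <rcuname> rhp=<ptr> func=<func> lazy=<0|1>
    bypass=<0|1> flush_reason=<non-lazy|bypass|bypass-lazy|gpthread|
    rcu_barrier|deoffload|""> queue_exec_latency=<jiffies>
    bypass_slack=<jiffies> queue_flush_latency=<jiffies>

queue_exec_latency is the delay from queuing to invocation, while
bypass_slack and queue_flush_latency are meaningful only for callbacks
that went through the bypass list.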

Signed-off-by: Joel Fernandes (Google) <joel@xxxxxxxxxxxxxxxxx>
---
include/linux/types.h | 44 ++++++++++++++++++++++++
include/trace/events/rcu.h | 69 +++++++++++++++++++++++++++++++++++---
kernel/rcu/Kconfig | 11 ++++++
kernel/rcu/rcu_segcblist.c | 21 ++++++++++++
kernel/rcu/rcu_segcblist.h | 8 +++++
kernel/rcu/tree.c | 46 +++++++++++++++++++++++--
kernel/rcu/tree_nocb.h | 26 +++++++++++++-
7 files changed, 217 insertions(+), 8 deletions(-)
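
(Usage sketch for reviewers, not part of the commit message: assuming a
tree with this series applied, CONFIG_RCU_TRACE=y so the rcu:* events
are built, CONFIG_RCU_LAZY=y, the new CONFIG_RCU_TRACE_CB=y, and tracefs
mounted at /sys/kernel/tracing, the new fields can be inspected with the
usual tooling, e.g.:

  echo 1 > /sys/kernel/tracing/events/rcu/rcu_invoke_callback/enable
  cat /sys/kernel/tracing/trace_pipe

or histogrammed with something like:

  bpftrace -e 'tracepoint:rcu:rcu_invoke_callback
    { @lat_jiffies = hist(args->cb_queue_exec_latency); }'

The field names come from this patch and exist only when
CONFIG_RCU_TRACE_CB is enabled.)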

diff --git a/include/linux/types.h b/include/linux/types.h
index ea8cf60a8a79..47501fdfd3c1 100644
--- a/include/linux/types.h
+++ b/include/linux/types.h
@@ -198,10 +198,51 @@ struct ustat {
char f_fpack[6];
};

+#ifdef CONFIG_RCU_TRACE_CB
+/*
+ * Debug information that a caller can store within a callback_head.
+ * It is expected to provide at least 12 bytes before BUILD_BUG starts
+ * complaining.
+ */
+enum cb_debug_flags {
+ CB_DEBUG_KFREE,
+ CB_DEBUG_LAZY,
+ CB_DEBUG_BYPASS,
+
+ // A new non-lazy CB showed up and we decided to not use
+ // the bypass list for it. So we flushed the old ones.
+ CB_DEBUG_NON_LAZY_FLUSHED,
+
+ // We decided to use the bypass list but had to flush
+ // the old bypass CBs because they got too old or too big.
+ CB_DEBUG_BYPASS_FLUSHED,
+ CB_DEBUG_BYPASS_LAZY_FLUSHED,
+
+ // The GP thread flushed the bypass CBs because they got too old or too big.
+ CB_DEBUG_GPTHREAD_FLUSHED,
+
+ // De-offload from NOCB mode.
+ CB_DEBUG_DEOFFLOAD_FLUSHED,
+
+ // rcu_barrier() flushes lazy/bypass CBs for CB exec ordering.
+ CB_DEBUG_BARRIER_FLUSHED
+};
+
+struct cb_debug_info {
+ // 16-bit jiffies deltas give a range of about 65 seconds at HZ=1000
+ // before wrapping. That's enough for debug.
+ u16 cb_queue_jiff;
+ u16 first_bp_jiff;
+ u16 cb_flush_jiff;
+ enum cb_debug_flags flags:16;
+};
+#endif
+
/**
* struct callback_head - callback structure for use with RCU and task_work
* @next: next update requests in a list
* @func: actual update function to call after the grace period.
+ * @di: debug information about the callback (only with CONFIG_RCU_TRACE_CB).
*
* The struct is aligned to size of pointer. On most architectures it happens
* naturally due ABI requirements, but some architectures (like CRIS) have
@@ -220,6 +261,9 @@ struct ustat {
struct callback_head {
struct callback_head *next;
void (*func)(struct callback_head *head);
+#ifdef CONFIG_RCU_TRACE_CB
+ struct cb_debug_info di;
+#endif
} __attribute__((aligned(sizeof(void *))));
#define rcu_head callback_head

diff --git a/include/trace/events/rcu.h b/include/trace/events/rcu.h
index 90b2fb0292cb..dd6baec6745c 100644
--- a/include/trace/events/rcu.h
+++ b/include/trace/events/rcu.h
@@ -630,24 +630,85 @@ TRACE_EVENT_RCU(rcu_batch_start,
*/
TRACE_EVENT_RCU(rcu_invoke_callback,

- TP_PROTO(const char *rcuname, struct rcu_head *rhp),
+ TP_PROTO(const char *rcuname, struct rcu_head *rhp
+#ifdef CONFIG_RCU_TRACE_CB
+ , unsigned long jiffies_first
+#endif
+ ),

- TP_ARGS(rcuname, rhp),
+ TP_ARGS(rcuname, rhp
+#ifdef CONFIG_RCU_TRACE_CB
+ , jiffies_first
+#endif
+ ),

TP_STRUCT__entry(
__field(const char *, rcuname)
__field(void *, rhp)
__field(void *, func)
+#ifdef CONFIG_RCU_TRACE_CB
+ __field(u16, cb_debug_flags)
+ __field(int, cb_queue_exec_latency)
+ __array(char, bypass_flush_reason, 32)
+ __field(int, cb_bypass_slack)
+ __field(int, cb_queue_flush_latency)
+#endif
),

TP_fast_assign(
__entry->rcuname = rcuname;
__entry->rhp = rhp;
__entry->func = rhp->func;
+#ifdef CONFIG_RCU_TRACE_CB
+ __entry->cb_debug_flags = rhp->di.flags;
+ __entry->cb_queue_exec_latency =
+ (jiffies - jiffies_first) - rhp->di.cb_queue_jiff;
+
+ /* The following 3 fields are valid only for bypass/lazy CBs. */
+ __entry->cb_bypass_slack =
+ (rhp->di.flags & BIT(CB_DEBUG_BYPASS)) ?
+ rhp->di.cb_queue_jiff - rhp->di.first_bp_jiff : 0;
+ __entry->cb_queue_flush_latency =
+ (rhp->di.flags & BIT(CB_DEBUG_BYPASS)) ?
+ rhp->di.cb_flush_jiff - rhp->di.cb_queue_jiff : 0;
+ __entry->bypass_flush_reason[0] = '\0'; /* may not be set below */
+
+ if (__entry->cb_debug_flags & BIT(CB_DEBUG_NON_LAZY_FLUSHED))
+ strcpy(__entry->bypass_flush_reason, "non-lazy");
+ else if (__entry->cb_debug_flags & BIT(CB_DEBUG_BYPASS_FLUSHED))
+ strcpy(__entry->bypass_flush_reason, "bypass");
+ else if (__entry->cb_debug_flags & BIT(CB_DEBUG_BYPASS_LAZY_FLUSHED))
+ strcpy(__entry->bypass_flush_reason, "bypass-lazy");
+ else if (__entry->cb_debug_flags & BIT(CB_DEBUG_GPTHREAD_FLUSHED))
+ strcpy(__entry->bypass_flush_reason, "gpthread");
+ else if (__entry->cb_debug_flags & BIT(CB_DEBUG_BARRIER_FLUSHED))
+ strcpy(__entry->bypass_flush_reason, "rcu_barrier");
+ else if (__entry->cb_debug_flags & BIT(CB_DEBUG_DEOFFLOAD_FLUSHED))
+ strcpy(__entry->bypass_flush_reason, "deoffload");
+#endif
),

- TP_printk("%s rhp=%p func=%ps",
- __entry->rcuname, __entry->rhp, __entry->func)
+ TP_printk("%s rhp=%p func=%ps"
+#ifdef CONFIG_RCU_TRACE_CB
+ " lazy=%d "
+ "bypass=%d "
+ "flush_reason=%s "
+ "queue_exec_latency=%d "
+ "bypass_slack=%d "
+ "queue_flush_latency=%d"
+#endif
+ ,
+ __entry->rcuname, __entry->rhp, __entry->func
+#ifdef CONFIG_RCU_TRACE_CB
+ ,
+ !!(__entry->cb_debug_flags & BIT(CB_DEBUG_LAZY)),
+ !!(__entry->cb_debug_flags & BIT(CB_DEBUG_BYPASS)),
+ __entry->bypass_flush_reason,
+ __entry->cb_queue_exec_latency,
+ __entry->cb_bypass_slack,
+ __entry->cb_queue_flush_latency
+#endif
+ )
);

/*
diff --git a/kernel/rcu/Kconfig b/kernel/rcu/Kconfig
index 3128d01427cb..cd6ff63517f4 100644
--- a/kernel/rcu/Kconfig
+++ b/kernel/rcu/Kconfig
@@ -319,4 +319,15 @@ config RCU_LAZY
To save power, batch RCU callbacks and flush after delay, memory
pressure or callback list growing too big.

+config RCU_TRACE_CB
+ bool "Enable additional callback tracing"
+ depends on RCU_LAZY
+ default n
+ help
+ Enable additional callback tracing for RCU. Currently, the extra
+ tracing is done only for lazy/bypass callbacks; in the future it
+ could be extended to non-lazy callbacks as well. The tracepoint
+ carries detailed information about the reason a callback was flushed
+ and its queuing-to-execution latency, which BPF tools can retrieve.
+
endmenu # "RCU Subsystem"
diff --git a/kernel/rcu/rcu_segcblist.c b/kernel/rcu/rcu_segcblist.c
index 55b50e592986..8f0b024f7bc5 100644
--- a/kernel/rcu/rcu_segcblist.c
+++ b/kernel/rcu/rcu_segcblist.c
@@ -32,6 +32,27 @@ void rcu_cblist_enqueue(struct rcu_cblist *rclp, struct rcu_head *rhp)
WRITE_ONCE(rclp->len, rclp->len + 1);
}

+/*
+ * Set the passed debug flags and the flush time on all CBs in the
+ * unsegmented cblist @rcl. @flush_jiff is the flush time expressed as
+ * a delta from the debug reference jiffies.
+ */
+#ifdef CONFIG_RCU_TRACE_CB
+void rcu_cblist_set_flush(struct rcu_cblist *rcl,
+ enum cb_debug_flags flags,
+ unsigned long flush_jiff)
+{
+ if (!rcl || !rcl->head)
+ return;
+
+ for (struct rcu_head *head = rcl->head;
+ head && head != *(rcl->tail); head = head->next) {
+ head->di.flags |= flags;
+ head->di.cb_flush_jiff = flush_jiff;
+ }
+}
+#endif
+
/*
* Flush the second rcu_cblist structure onto the first one, obliterating
* any contents of the first. If rhp is non-NULL, enqueue it as the sole
diff --git a/kernel/rcu/rcu_segcblist.h b/kernel/rcu/rcu_segcblist.h
index 431cee212467..ac1f34024b84 100644
--- a/kernel/rcu/rcu_segcblist.h
+++ b/kernel/rcu/rcu_segcblist.h
@@ -53,6 +53,14 @@ static inline long rcu_segcblist_n_cbs(struct rcu_segcblist *rsclp)
#endif
}

+#ifdef CONFIG_RCU_TRACE_CB
+void rcu_cblist_set_flush(struct rcu_cblist *rcl,
+ enum cb_debug_flags flags,
+ unsigned long flush_jiff);
+#else
+#define rcu_cblist_set_flush(...)
+#endif
+
static inline void rcu_segcblist_set_flags(struct rcu_segcblist *rsclp,
int flags)
{
diff --git a/kernel/rcu/tree.c b/kernel/rcu/tree.c
index aaced29a0a71..8111d9f37621 100644
--- a/kernel/rcu/tree.c
+++ b/kernel/rcu/tree.c
@@ -2179,6 +2179,12 @@ int rcutree_dead_cpu(unsigned int cpu)
return 0;
}

+static unsigned long cb_debug_jiffies_first;
+
+#if defined(CONFIG_RCU_TRACE_CB) && defined(CONFIG_RCU_LAZY)
+extern unsigned long jiffies_till_flush;
+#endif
+
/*
* Invoke any RCU callbacks that have made it to the end of their grace
* period. Throttle as specified by rdp->blimit.
@@ -2241,7 +2247,12 @@ static void rcu_do_batch(struct rcu_data *rdp)
debug_rcu_head_unqueue(rhp);

rcu_lock_acquire(&rcu_callback_map);
- trace_rcu_invoke_callback(rcu_state.name, rhp);
+
+ trace_rcu_invoke_callback(rcu_state.name, rhp
+#ifdef CONFIG_RCU_TRACE_CB
+ , READ_ONCE(cb_debug_jiffies_first)
+#endif
+ );

f = rhp->func;
WRITE_ONCE(rhp->func, (rcu_callback_t)0L);
@@ -2736,6 +2747,17 @@ __call_rcu_common(struct rcu_head *head, rcu_callback_t func, bool lazy)
struct rcu_data *rdp;
bool was_alldone;

+ /*
+ * For CB debugging, record a starting jiffies reference and make sure
+ * deltas from it do not wrap. The starting reference is used to
+ * calculate 16-bit jiffies deltas. To be safe, reset the reference
+ * once the delta exceeds 15 bits' worth.
+ */
+ if (IS_ENABLED(CONFIG_RCU_TRACE_CB) &&
+ (!READ_ONCE(cb_debug_jiffies_first) ||
+ (jiffies - READ_ONCE(cb_debug_jiffies_first) > (1 << 15))))
+ WRITE_ONCE(cb_debug_jiffies_first, jiffies);
+
/* Misaligned rcu_head! */
WARN_ON_ONCE((unsigned long)head & (sizeof(void *) - 1));

@@ -2771,14 +2793,29 @@ __call_rcu_common(struct rcu_head *head, rcu_callback_t func, bool lazy)

check_cb_ovld(rdp);

- if (__is_kvfree_rcu_offset((unsigned long)func))
+#ifdef CONFIG_RCU_TRACE_CB
+ head->di.flags = 0;
+ WARN_ON_ONCE(jiffies - READ_ONCE(cb_debug_jiffies_first) > 65535);
+ head->di.cb_queue_jiff = (u16)(jiffies - READ_ONCE(cb_debug_jiffies_first));
+#endif
+
+ if (__is_kvfree_rcu_offset((unsigned long)func)) {
trace_rcu_kvfree_callback(rcu_state.name, head,
(unsigned long)func,
rcu_segcblist_n_cbs(&rdp->cblist));
- else
+#ifdef CONFIG_RCU_TRACE_CB
+ head->di.flags |= BIT(CB_DEBUG_KFREE);
+#endif
+ } else {
trace_rcu_callback(rcu_state.name, head,
rcu_segcblist_n_cbs(&rdp->cblist));

+#ifdef CONFIG_RCU_TRACE_CB
+ if (lazy)
+ head->di.flags |= BIT(CB_DEBUG_LAZY);
+#endif
+ }
+
if (rcu_nocb_try_bypass(rdp, head, &was_alldone, flags, lazy))
return; // Enqueued onto ->nocb_bypass, so just leave.
// If no-CBs CPU gets here, rcu_nocb_try_bypass() acquired ->nocb_lock.
@@ -3943,6 +3980,9 @@ static void rcu_barrier_entrain(struct rcu_data *rdp)
rdp->barrier_head.func = rcu_barrier_callback;
debug_rcu_head_queue(&rdp->barrier_head);
rcu_nocb_lock(rdp);
+
+ rcu_cblist_set_flush(&rdp->nocb_bypass, BIT(CB_DEBUG_BARRIER_FLUSHED),
+ (jiffies - READ_ONCE(cb_debug_jiffies_first)));
WARN_ON_ONCE(!rcu_nocb_flush_bypass(rdp, NULL, jiffies, false,
/* wake gp thread */ true));
if (rcu_segcblist_entrain(&rdp->cblist, &rdp->barrier_head)) {
diff --git a/kernel/rcu/tree_nocb.h b/kernel/rcu/tree_nocb.h
index 560ba87911c5..ee5924ba2f3b 100644
--- a/kernel/rcu/tree_nocb.h
+++ b/kernel/rcu/tree_nocb.h
@@ -355,6 +355,11 @@ static bool rcu_nocb_do_flush_bypass(struct rcu_data *rdp, struct rcu_head *rhp,
if (rhp)
rcu_segcblist_inc_len(&rdp->cblist); /* Must precede enqueue. */

+ /* A new CB may be enqueued during the flush; its laziness must match @lazy. */
+#ifdef CONFIG_RCU_TRACE_CB
+ WARN_ON_ONCE(rhp && lazy != !!(rhp->di.flags & BIT(CB_DEBUG_LAZY)));
+#endif
+
/*
* If the new CB requested was a lazy one, queue it onto the main
* ->cblist so we can take advantage of a sooner grace period.
@@ -407,6 +412,10 @@ static void rcu_nocb_try_flush_bypass(struct rcu_data *rdp, unsigned long j)
if (!rcu_rdp_is_offloaded(rdp) ||
!rcu_nocb_bypass_trylock(rdp))
return;
+
+ rcu_cblist_set_flush(&rdp->nocb_bypass, BIT(CB_DEBUG_GPTHREAD_FLUSHED),
+ (j - READ_ONCE(cb_debug_jiffies_first)));
+
WARN_ON_ONCE(!rcu_nocb_do_flush_bypass(rdp, NULL, j, false));
}

@@ -489,8 +498,10 @@ static bool rcu_nocb_try_bypass(struct rcu_data *rdp, struct rcu_head *rhp,
trace_rcu_nocb_wake(rcu_state.name, rdp->cpu,
TPS("FirstQ"));

- WARN_ON_ONCE(!rcu_nocb_flush_bypass(rdp, NULL, j, false, false));
+ rcu_cblist_set_flush(&rdp->nocb_bypass, BIT(CB_DEBUG_NON_LAZY_FLUSHED),
+ (j - READ_ONCE(cb_debug_jiffies_first)));

+ WARN_ON_ONCE(!rcu_nocb_flush_bypass(rdp, NULL, j, false, false));
WARN_ON_ONCE(rcu_cblist_n_cbs(&rdp->nocb_bypass));
return false; // Caller must enqueue the callback.
}
@@ -503,6 +514,10 @@ static bool rcu_nocb_try_bypass(struct rcu_data *rdp, struct rcu_head *rhp,
ncbs >= qhimark) {
rcu_nocb_lock(rdp);

+ rcu_cblist_set_flush(&rdp->nocb_bypass,
+ lazy ? BIT(CB_DEBUG_BYPASS_LAZY_FLUSHED) : BIT(CB_DEBUG_BYPASS_FLUSHED),
+ (j - READ_ONCE(cb_debug_jiffies_first)));
+
if (!rcu_nocb_flush_bypass(rdp, rhp, j, lazy, false)) {
*was_alldone = !rcu_segcblist_pend_cbs(&rdp->cblist);
if (*was_alldone)
@@ -543,6 +558,11 @@ static bool rcu_nocb_try_bypass(struct rcu_data *rdp, struct rcu_head *rhp,
trace_rcu_nocb_wake(rcu_state.name, rdp->cpu, TPS("FirstBQ"));
}

+#ifdef CONFIG_RCU_TRACE_CB
+ rhp->di.flags |= BIT(CB_DEBUG_BYPASS);
+ rhp->di.first_bp_jiff = READ_ONCE(rdp->nocb_bypass_first) - READ_ONCE(cb_debug_jiffies_first);
+#endif
+
rcu_nocb_bypass_unlock(rdp);
smp_mb(); /* Order enqueue before wake. */

@@ -1156,6 +1176,10 @@ static long rcu_nocb_rdp_deoffload(void *arg)
* return false, which means that future calls to rcu_nocb_try_bypass()
* will refuse to put anything into the bypass.
*/
+
+ rcu_cblist_set_flush(&rdp->nocb_bypass, BIT(CB_DEBUG_DEOFFLOAD_FLUSHED),
+ (jiffies - READ_ONCE(cb_debug_jiffies_first)));
+
WARN_ON_ONCE(!rcu_nocb_flush_bypass(rdp, NULL, jiffies, false, false));
/*
* Start with invoking rcu_core() early. This way if the current thread
--
2.37.2.789.g6183377224-goog