[PATCH] ring_buffer: Offload wakeup IPI to housekeeping CPUs

From: Nicolas Saenz Julienne
Date: Wed Jun 02 2021 - 05:33:48 EST


In order to minimize tracing's effect on nohz_full CPUs, offload the
ring-buffer consumer wakeup IPI to one of the housekeeping CPUs.
Systems not using nohz_full will still run the wakeup IPI locally (as
per housekeeping_any_cpu()'s implementation).

This wakeup IPI, although negligible for the vast majority of trace
workloads, may cause unwarranted latencies on systems tracing events on
nohz_full CPUs. This is made worse on PREEMPT_RT kernels, as they defer
the irq_work handling into ksoftirqd, forcing unwarranted context
switches on the otherwise extremely busy CPU.

Note that the local IPI can't be avoided when tracing in NMI context, as
irq_work_queue() is the only mechanism supported in that context.

To illustrate this, when tracing on a nohz_full CPU with PREEMPT_RT=y (plus
a good amount of extra isolation options), I see:
- 50-100 μs latency spikes while tracing without this patch
- 10-14 μs latency spikes while tracing with this patch
- 8-11 μs latency spikes when not tracing at all

Signed-off-by: Nicolas Saenz Julienne <nsaenzju@xxxxxxxxxx>
---
kernel/trace/ring_buffer.c | 24 ++++++++++++++++++------
1 file changed, 18 insertions(+), 6 deletions(-)

diff --git a/kernel/trace/ring_buffer.c b/kernel/trace/ring_buffer.c
index 2c0ee6484990..ce7817861c5e 100644
--- a/kernel/trace/ring_buffer.c
+++ b/kernel/trace/ring_buffer.c
@@ -5,6 +5,7 @@
* Copyright (C) 2008 Steven Rostedt <srostedt@xxxxxxxxxx>
*/
#include <linux/trace_recursion.h>
+#include <linux/sched/isolation.h>
#include <linux/trace_events.h>
#include <linux/ring_buffer.h>
#include <linux/trace_clock.h>
@@ -3052,6 +3053,20 @@ static void rb_commit(struct ring_buffer_per_cpu *cpu_buffer,
rb_end_commit(cpu_buffer);
}

+/*
+ * Queue @work on a housekeeping CPU, except in NMI context, where only the
+ * local irq_work_queue() can be used. Without nohz_full the IPI stays local.
+ *
+ * Note that irq_work supplies its own memory barriers.
+ */
+static __always_inline void rb_irq_work_queue(struct irq_work *work)
+{
+ if (in_nmi())
+ irq_work_queue(work);
+ else
+ irq_work_queue_on(work, housekeeping_any_cpu(HK_FLAG_MISC));
+}
+
static __always_inline void
rb_wakeups(struct trace_buffer *buffer, struct ring_buffer_per_cpu *cpu_buffer)
{
@@ -3061,14 +3076,12 @@ rb_wakeups(struct trace_buffer *buffer, struct ring_buffer_per_cpu *cpu_buffer)

if (buffer->irq_work.waiters_pending) {
buffer->irq_work.waiters_pending = false;
- /* irq_work_queue() supplies it's own memory barriers */
- irq_work_queue(&buffer->irq_work.work);
+ rb_irq_work_queue(&buffer->irq_work.work);
}

if (cpu_buffer->irq_work.waiters_pending) {
cpu_buffer->irq_work.waiters_pending = false;
- /* irq_work_queue() supplies it's own memory barriers */
- irq_work_queue(&cpu_buffer->irq_work.work);
+ rb_irq_work_queue(&cpu_buffer->irq_work.work);
}

if (cpu_buffer->last_pages_touch == local_read(&cpu_buffer->pages_touched))
@@ -3090,8 +3103,7 @@ rb_wakeups(struct trace_buffer *buffer, struct ring_buffer_per_cpu *cpu_buffer)

cpu_buffer->irq_work.wakeup_full = true;
cpu_buffer->irq_work.full_waiters_pending = false;
- /* irq_work_queue() supplies it's own memory barriers */
- irq_work_queue(&cpu_buffer->irq_work.work);
+ rb_irq_work_queue(&cpu_buffer->irq_work.work);
}

#ifdef CONFIG_RING_BUFFER_RECORD_RECURSION
--
2.31.1