[PATCH] tracing: Fix race issue between cpu buffer write and swap

From: Zheng Yejian
Date: Thu Aug 31 2023 - 09:27:49 EST


A warning is triggered in rb_end_commit() at the check:
	if (RB_WARN_ON(cpu_buffer, !local_read(&cpu_buffer->committing)))

WARNING: CPU: 0 PID: 139 at kernel/trace/ring_buffer.c:3142 rb_commit+0x402/0x4a0
Call Trace:
 ring_buffer_unlock_commit+0x42/0x250
 trace_buffer_unlock_commit_regs+0x3b/0x250
 trace_event_buffer_commit+0xe5/0x440
 trace_event_buffer_reserve+0x11c/0x150
 trace_event_raw_event_sched_switch+0x23c/0x2c0
 __traceiter_sched_switch+0x59/0x80
 __schedule+0x72b/0x1580
 schedule+0x92/0x120
 worker_thread+0xa0/0x6f0

It is caused by a race between writing an event into the cpu buffer and
swapping the cpu buffer through the file per_cpu/cpu0/snapshot:

Write on CPU 0                          Swap buffer by per_cpu/cpu0/snapshot on CPU 1
--------                                --------
                                        tracing_snapshot_write()
                                          [...]

ring_buffer_lock_reserve()
  cpu_buffer = buffer->buffers[cpu]; // 1. Suppose find 'cpu_buffer_a';
  [...]
  rb_reserve_next_event()
    [...]

                                          ring_buffer_swap_cpu()
                                            if (local_read(&cpu_buffer_a->committing))
                                              goto out_dec;
                                            if (local_read(&cpu_buffer_b->committing))
                                              goto out_dec;
                                            buffer_a->buffers[cpu] = cpu_buffer_b;
                                            buffer_b->buffers[cpu] = cpu_buffer_a;
                                            // 2. cpu_buffer has been swapped here.

    rb_start_commit(cpu_buffer);
    if (unlikely(READ_ONCE(cpu_buffer->buffer)
                 != buffer)) { // 3. This check passes because 'cpu_buffer->buffer'
      [...]                    //    has not changed here.
      return NULL;
    }
                                            cpu_buffer_b->buffer = buffer_a;
                                            cpu_buffer_a->buffer = buffer_b;
                                            [...]

    // 4. Reserve event from 'cpu_buffer_a'.

ring_buffer_unlock_commit()
  [...]
  cpu_buffer = buffer->buffers[cpu]; // 5. Now find 'cpu_buffer_b' !!!
  rb_commit(cpu_buffer)
    rb_end_commit() // 6. WARN for the wrong 'committing' state !!!
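
The check referenced at step 3 lives in rb_reserve_next_event(). A simplified
sketch of that path (abridged from kernel/trace/ring_buffer.c; the barrier()
and the CONFIG_RING_BUFFER_ALLOW_SWAP guard around the check, as well as the
reservation logic itself, are elided):

```c
static struct ring_buffer_event *
rb_reserve_next_event(struct trace_buffer *buffer,
		      struct ring_buffer_per_cpu *cpu_buffer,
		      unsigned long length)
{
	rb_start_commit(cpu_buffer);	/* marks cpu_buffer as committing */

	/*
	 * Meant to catch a swap that completed before the commit started.
	 * In the race above, ring_buffer_swap_cpu() has already exchanged
	 * buffer->buffers[cpu] (step 2) but not yet cpu_buffer->buffer,
	 * so this check still passes and the event gets reserved from the
	 * stale 'cpu_buffer_a' (step 4).
	 */
	if (unlikely(READ_ONCE(cpu_buffer->buffer) != buffer)) {
		local_dec(&cpu_buffer->committing);
		local_dec(&cpu_buffer->commits);
		return NULL;
	}

	/* ... actual event reservation elided ... */
}
```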

Based on the above analysis, the issue can easily be reproduced with the
following test case:
```bash
#!/bin/bash

dmesg -n 7
sysctl -w kernel.panic_on_warn=1
TR=/sys/kernel/tracing
echo 7 > ${TR}/buffer_size_kb
echo "sched:sched_switch" > ${TR}/set_event
while true; do
	echo 1 > ${TR}/per_cpu/cpu0/snapshot
done &
while true; do
	echo 1 > ${TR}/per_cpu/cpu0/snapshot
done &
while true; do
	echo 1 > ${TR}/per_cpu/cpu0/snapshot
done &
```

To fix it, do the swap via smp_call_function_single() on the target CPU where
the buffer is located. The swap then runs in interrupt context on that CPU
and cannot interleave with a concurrent writer there, so the race above is
avoided.
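
Concretely (matching the diff below), a small helper is executed via IPI on
the target CPU; the comments sketch why running it there closes the race:

```c
/*
 * Runs via smp_call_function_single() in IPI context on iter->cpu_file,
 * i.e. with interrupts disabled on that CPU.
 */
static void tracing_swap_cpu_buffer(void *tr)
{
	update_max_tr_single((struct trace_array *)tr, current,
			     smp_processor_id());
}

/*
 * Call site; wait=1 blocks until the swap has finished on the target
 * CPU. Writer and swap now execute on the same CPU and cannot
 * interleave: either the swap finishes before the writer reaches
 * rb_start_commit(), in which case the step-3 check fails because
 * cpu_buffer->buffer has already been updated, or the writer is
 * mid-commit and ring_buffer_swap_cpu() bails out on its 'committing'
 * checks.
 */
smp_call_function_single(iter->cpu_file, tracing_swap_cpu_buffer,
			 (void *)tr, 1);
```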

Fixes: f1affcaaa861 ("tracing: Add snapshot in the per_cpu trace directories")
Signed-off-by: Zheng Yejian <zhengyejian1@xxxxxxxxxx>
---
kernel/trace/trace.c | 17 ++++++++++++-----
1 file changed, 12 insertions(+), 5 deletions(-)

diff --git a/kernel/trace/trace.c b/kernel/trace/trace.c
index 8e64aaad5361..55bb08aaf392 100644
--- a/kernel/trace/trace.c
+++ b/kernel/trace/trace.c
@@ -7618,6 +7618,11 @@ static int tracing_snapshot_open(struct inode *inode, struct file *file)
 	return ret;
 }
 
+static void tracing_swap_cpu_buffer(void *tr)
+{
+	update_max_tr_single((struct trace_array *)tr, current, smp_processor_id());
+}
+
 static ssize_t
 tracing_snapshot_write(struct file *filp, const char __user *ubuf, size_t cnt,
 		       loff_t *ppos)
@@ -7676,13 +7681,15 @@ tracing_snapshot_write(struct file *filp, const char __user *ubuf, size_t cnt,
 		ret = tracing_alloc_snapshot_instance(tr);
 		if (ret < 0)
 			break;
-		local_irq_disable();
 		/* Now, we're going to swap */
-		if (iter->cpu_file == RING_BUFFER_ALL_CPUS)
+		if (iter->cpu_file == RING_BUFFER_ALL_CPUS) {
+			local_irq_disable();
 			update_max_tr(tr, current, smp_processor_id(), NULL);
-		else
-			update_max_tr_single(tr, current, iter->cpu_file);
-		local_irq_enable();
+			local_irq_enable();
+		} else {
+			smp_call_function_single(iter->cpu_file, tracing_swap_cpu_buffer,
+						 (void *)tr, 1);
+		}
 		break;
 	default:
 		if (tr->allocated_snapshot) {
--
2.25.1