[PATCH 2/2] perf: Fix missing SIGTRAPs due to pending_disable abuse

From: Marco Elver
Date: Fri Sep 23 2022 - 10:43:19 EST


Due to the implementation of how SIGTRAPs are delivered when
perf_event_attr::sigtrap is set, we've noticed 3 issues:

1. Missing SIGTRAP due to a race with event_sched_out() (more
details below).

2. Hardware PMU events being disabled because perf_event_overflow()
returns 1. The only way to re-enable the event is for user space
to first "properly" disable the event and then re-enable it.

3. The inability to automatically disable an event after a
specified number of overflows via PERF_EVENT_IOC_REFRESH (a minimal
user-space sketch follows this list).
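
For context, here is a minimal user-space sketch of the kind of setup
these issues affect. The event choice, period, and sig_data value are
illustrative assumptions, not part of this patch:

  /* Hypothetical example; error handling omitted for brevity. */
  #include <linux/perf_event.h>
  #include <signal.h>
  #include <string.h>
  #include <sys/ioctl.h>
  #include <sys/syscall.h>
  #include <unistd.h>

  static void sigtrap_handler(int sig, siginfo_t *info, void *uc)
  {
          /* attr.sig_data arrives as info->si_perf_data (recent glibc). */
  }

  int main(void)
  {
          struct perf_event_attr attr = {
                  .size           = sizeof(attr),
                  .type           = PERF_TYPE_HARDWARE,
                  .config         = PERF_COUNT_HW_CPU_CYCLES,
                  .sample_period  = 100000,       /* illustrative */
                  .disabled       = 1,
                  .sigtrap        = 1,            /* SIGTRAP on overflow */
                  .remove_on_exec = 1,            /* required with sigtrap */
                  .sig_data       = 0x1234,
          };
          struct sigaction sa;
          volatile unsigned long i;
          int fd;

          memset(&sa, 0, sizeof(sa));
          sa.sa_sigaction = sigtrap_handler;
          sa.sa_flags = SA_SIGINFO;
          sigaction(SIGTRAP, &sa, NULL);

          fd = syscall(SYS_perf_event_open, &attr, 0, -1, -1, 0);

          /*
           * Issue (3): enable and auto-disable after 1 overflow; with the
           * old code this did not work for sigtrap events, because
           * 'event_limit' was hijacked for SIGTRAP delivery.
           */
          ioctl(fd, PERF_EVENT_IOC_REFRESH, 1);

          for (i = 0; i < (1UL << 28); i++)       /* burn cycles */
                  ;
          close(fd);
          return 0;
  }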

The worst of the 3 issues is problem (1), which occurs when a
pending_disable is "consumed" by a racing event_sched_out(), observed as
follows:

CPU0                            | CPU1
--------------------------------+---------------------------
__perf_event_overflow()         |
 perf_event_disable_inatomic()  |
  pending_disable = CPU0        | ...
                                | _perf_event_enable()
                                |  event_function_call()
                                |   task_function_call()
                                |    /* sends IPI to CPU0 */
<IPI>                           | ...
 __perf_event_enable()          +---------------------------
  ctx_resched()
   task_ctx_sched_out()
    ctx_sched_out()
     group_sched_out()
      event_sched_out()
       pending_disable = -1
</IPI>
<IRQ-work>
 perf_pending_event()
  perf_pending_event_disable()
   /* Fails to send SIGTRAP because no pending_disable! */
</IRQ-work>

In the above case, not only is that particular SIGTRAP missed, but so
are all future SIGTRAPs, because 'event_limit' is never reset back to 1.
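
To see why, consider the pre-patch code, abridged from the lines the
hunks below remove: the rearm of 'event_limit' lives in the sigtrap
branch of the IRQ-work handler, which a consumed 'pending_disable'
skips entirely:

  /* __perf_event_overflow(), before this patch (abridged): */
  if (events && atomic_dec_and_test(&event->event_limit)) {  /* 1 -> 0 */
          ret = 1;
          event->pending_kill = POLL_HUP;
          event->pending_addr = data->addr;
          perf_event_disable_inatomic(event);  /* pending_disable = CPU */
  }

  /* perf_pending_event_disable(), before this patch (abridged): */
  int cpu = READ_ONCE(event->pending_disable);

  if (cpu < 0)            /* already consumed by event_sched_out() */
          return;

  if (cpu == smp_processor_id()) {
          WRITE_ONCE(event->pending_disable, -1);
          if (event->attr.sigtrap) {
                  perf_sigtrap(event);
                  atomic_set_release(&event->event_limit, 1); /* rearm: 0 -> 1 */
                  return;
          }
          ...
  }

With 'pending_disable' already reset to -1 by event_sched_out(), the
handler returns at the first check, so neither perf_sigtrap() nor the
rearm ever run, and 'event_limit' stays 0 forever.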

To fix, rework the pending delivery of SIGTRAP via IRQ work by
introducing a separate 'pending_sigtrap'; 'event_limit' and
'pending_disable' are no longer used for its delivery.

During testing, this also revealed several more possible races between
reschedules and pending IRQ work; see code comments for details.

Doing so makes it possible to use 'event_limit' normally (thereby
enabling use of PERF_EVENT_IOC_REFRESH); perf_event_overflow() no longer
returns 1 for SIGTRAP events, so hardware PMUs are no longer disabled by
it; and the race is gone, because event_sched_out() no longer consumes
'pending_disable'.
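
In other words, the delivery path becomes (a sketch distilled from the
hunks below, not additional code):

  __perf_event_overflow():
          event->pending_sigtrap = 1;
          event->pending_addr    = data->addr;
          irq_work_queue(&event->pending);

  perf_pending_event() -> perf_pending_event_sigtrap():
          event OFF          -> consume pending_sigtrap, send no signal
          oncpu != this CPU  -> keep pending_sigtrap; event_sched_in()
                                syncs and requeues the IRQ work
          otherwise          -> consume pending_sigtrap, perf_sigtrap()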

Fixes: 97ba62b27867 ("perf: Add support for SIGTRAP on perf events")
Reported-by: Dmitry Vyukov <dvyukov@xxxxxxxxxx>
Debugged-by: Dmitry Vyukov <dvyukov@xxxxxxxxxx>
Signed-off-by: Marco Elver <elver@xxxxxxxxxx>
---
v2:
* Use irq_work_raw_sync().
---
 include/linux/perf_event.h |  1 +
 kernel/events/core.c       | 64 +++++++++++++++++++++++++++++++-------
 2 files changed, 53 insertions(+), 12 deletions(-)

diff --git a/include/linux/perf_event.h b/include/linux/perf_event.h
index 907b0e3f1318..c119fa7b70d6 100644
--- a/include/linux/perf_event.h
+++ b/include/linux/perf_event.h
@@ -740,6 +740,7 @@ struct perf_event {
 	int				pending_wakeup;
 	int				pending_kill;
 	int				pending_disable;
+	int				pending_sigtrap;
 	unsigned long			pending_addr;	/* SIGTRAP */
 	struct irq_work			pending;

diff --git a/kernel/events/core.c b/kernel/events/core.c
index c37ba0909078..6ba02a1b5c5d 100644
--- a/kernel/events/core.c
+++ b/kernel/events/core.c
@@ -2527,6 +2527,15 @@ event_sched_in(struct perf_event *event,
 	if (event->attr.exclusive)
 		cpuctx->exclusive = 1;
 
+	if (event->pending_sigtrap) {
+		/*
+		 * The task and event might have been moved to another CPU:
+		 * queue another IRQ work. See perf_pending_event_sigtrap().
+		 */
+		irq_work_raw_sync(&event->pending); /* Syncs if pending on other CPU. */
+		irq_work_queue(&event->pending);
+	}
+
 out:
 	perf_pmu_enable(event->pmu);

@@ -6446,6 +6455,37 @@ static void perf_sigtrap(struct perf_event *event)
 		       event->attr.type, event->attr.sig_data);
 }
 
+static void perf_pending_event_sigtrap(struct perf_event *event)
+{
+	if (!event->pending_sigtrap)
+		return;
+
+	/*
+	 * If we're racing with disabling of the event, consume pending_sigtrap
+	 * and don't send the SIGTRAP. This avoids potentially delaying a signal
+	 * indefinitely (oncpu mismatch) until the event is enabled again, which
+	 * could happen after already returning to user space; in that case the
+	 * signal would erroneously become asynchronous.
+	 */
+	if (event->state == PERF_EVENT_STATE_OFF) {
+		event->pending_sigtrap = 0;
+		return;
+	}
+
+	/*
+	 * Only process this pending SIGTRAP if this IRQ work is running on the
+	 * right CPU: the scheduler is able to run before the IRQ work, which
+	 * moved the task to another CPU. In event_sched_in() another IRQ work
+	 * is scheduled, so that the signal is not lost; given the kernel has
+	 * not yet returned to user space, the signal remains synchronous.
+	 */
+	if (READ_ONCE(event->oncpu) != smp_processor_id())
+		return;
+
+	event->pending_sigtrap = 0;
+	perf_sigtrap(event);
+}
+
 static void perf_pending_event_disable(struct perf_event *event)
 {
 	int cpu = READ_ONCE(event->pending_disable);
@@ -6455,13 +6495,6 @@ static void perf_pending_event_disable(struct perf_event *event)

 	if (cpu == smp_processor_id()) {
 		WRITE_ONCE(event->pending_disable, -1);
-
-		if (event->attr.sigtrap) {
-			perf_sigtrap(event);
-			atomic_set_release(&event->event_limit, 1); /* rearm event */
-			return;
-		}
-
 		perf_event_disable_local(event);
 		return;
 	}
@@ -6500,6 +6533,7 @@ static void perf_pending_event(struct irq_work *entry)
 	 * and we won't recurse 'further'.
 	 */
 
+	perf_pending_event_sigtrap(event);
 	perf_pending_event_disable(event);
 
 	if (event->pending_wakeup) {
@@ -9209,11 +9243,20 @@ static int __perf_event_overflow(struct perf_event *event,
 	if (events && atomic_dec_and_test(&event->event_limit)) {
 		ret = 1;
 		event->pending_kill = POLL_HUP;
-		event->pending_addr = data->addr;
-
 		perf_event_disable_inatomic(event);
 	}
 
+	if (event->attr.sigtrap) {
+		/*
+		 * Should not be able to return to user space without processing
+		 * pending_sigtrap (kernel events can overflow multiple times).
+		 */
+		WARN_ON_ONCE(event->pending_sigtrap && event->attr.exclude_kernel);
+		event->pending_sigtrap = 1;
+		event->pending_addr = data->addr;
+		irq_work_queue(&event->pending);
+	}
+
 	READ_ONCE(event->overflow_handler)(event, data, regs);
 
 	if (*perf_event_fasync(event) && event->pending_kill) {
@@ -11557,9 +11600,6 @@ perf_event_alloc(struct perf_event_attr *attr, int cpu,
 	if (parent_event)
 		event->event_caps = parent_event->event_caps;
 
-	if (event->attr.sigtrap)
-		atomic_set(&event->event_limit, 1);
-
 	if (task) {
 		event->attach_state = PERF_ATTACH_TASK;
 		/*
--
2.37.3.998.g577e59143f-goog

