Re: [PATCH] perf: Fix missing SIGTRAPs due to pending_disable abuse

From: Peter Zijlstra
Date: Tue Oct 04 2022 - 13:09:47 EST


On Wed, Sep 28, 2022 at 04:55:46PM +0200, Marco Elver wrote:
> On Wed, Sep 28, 2022 at 12:06PM +0200, Marco Elver wrote:
>
> > My second idea was about introducing something like irq_work_raw_sync().
> > Maybe it's not that crazy if it is actually safe. I expect the case
> > where we need irq_work_raw_sync() to be very, very rare.
>
> The previous irq_work_raw_sync() forgot about irq_work_queue_on(). Alas,
> I might still be missing something obvious, because "it's never that
> easy". ;-)
>
> And for completeness, here is the full perf patch showing what it would
> look like together with irq_work_raw_sync() (consider it v1.5). It's
> already survived some shorter stress tests and fuzzing.
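
For reference, a minimal sketch of what such an irq_work_raw_sync() could
look like (this is an assumption about the shape of the primitive, not the
patch from this thread; irq_work_is_busy() and cpu_relax() are existing
helpers):

#include <linux/irq_work.h>
#include <asm/processor.h>	/* cpu_relax() */

/*
 * Hypothetical sketch only: busy-wait until the irq_work is neither
 * pending nor running. Unlike irq_work_sync(), this would have to be
 * usable with IRQs disabled, which is only sound if the work is
 * guaranteed to run on a different CPU -- otherwise we'd spin forever
 * on work this CPU itself is preventing from running.
 */
static void irq_work_raw_sync(struct irq_work *work)
{
	while (irq_work_is_busy(work))
		cpu_relax();
}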

So.... I don't like it. But I cooked up the below, which _almost_ works :-/

For some reason it sometimes fails with 14999 out of 15000 events
delivered, and I've not yet figured out where it goes sideways. I'm
currently thinking it's that sigtrap clear on OFF.

Still, what do you think of the approach?
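
To spell out the approach before the diff: pending work is tracked with
bits on the event, and the irq_work chases the event to whatever CPU it
is currently scheduled on. Distilled, the handler amounts to the below (a
restatement of the patch that follows, using the pending_* bits it
introduces, not a separate implementation):

static void pending_work(struct irq_work *entry)
{
	struct perf_event *event = container_of(entry, struct perf_event, pending);
	int cpu = READ_ONCE(event->oncpu);

	if (cpu < 0)
		return;		/* not scheduled in; event_sched_in() re-kicks us */

	if (cpu != smp_processor_id()) {
		/* the event moved; follow it if there is still work to do */
		if (event->pending_sigtrap || event->pending_disable)
			irq_work_queue_on(&event->pending, cpu);
		return;
	}

	/* we are on the event's CPU: deliver the pending work locally */
	if (event->pending_sigtrap) {
		event->pending_sigtrap = 0;
		perf_sigtrap(event);
	}
	if (event->pending_disable) {
		event->pending_disable = 0;
		perf_event_disable_local(event);
	}
}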

---
include/linux/perf_event.h | 8 ++--
kernel/events/core.c | 92 +++++++++++++++++++++++++---------------------
2 files changed, 55 insertions(+), 45 deletions(-)

diff --git a/include/linux/perf_event.h b/include/linux/perf_event.h
index ee8b9ecdc03b..c54161719d37 100644
--- a/include/linux/perf_event.h
+++ b/include/linux/perf_event.h
@@ -736,9 +736,11 @@ struct perf_event {
struct fasync_struct *fasync;

/* delayed work for NMIs and such */
- int pending_wakeup;
- int pending_kill;
- int pending_disable;
+ unsigned int pending_wakeup :1;
+ unsigned int pending_disable :1;
+ unsigned int pending_sigtrap :1;
+ unsigned int pending_kill :3;
+
unsigned long pending_addr; /* SIGTRAP */
struct irq_work pending;

diff --git a/kernel/events/core.c b/kernel/events/core.c
index 2621fd24ad26..8e5dbe971d9e 100644
--- a/kernel/events/core.c
+++ b/kernel/events/core.c
@@ -2268,11 +2268,15 @@ event_sched_out(struct perf_event *event,
event->pmu->del(event, 0);
event->oncpu = -1;

- if (READ_ONCE(event->pending_disable) >= 0) {
- WRITE_ONCE(event->pending_disable, -1);
+ if (event->pending_disable) {
+ event->pending_disable = 0;
perf_cgroup_event_disable(event, ctx);
state = PERF_EVENT_STATE_OFF;
}
+
+ if (event->pending_sigtrap && state == PERF_EVENT_STATE_OFF)
+ event->pending_sigtrap = 0;
+
perf_event_set_state(event, state);

if (!is_software_event(event))
@@ -2463,8 +2467,7 @@ EXPORT_SYMBOL_GPL(perf_event_disable);

void perf_event_disable_inatomic(struct perf_event *event)
{
- WRITE_ONCE(event->pending_disable, smp_processor_id());
- /* can fail, see perf_pending_event_disable() */
+ event->pending_disable = 1;
irq_work_queue(&event->pending);
}

@@ -2527,6 +2530,9 @@ event_sched_in(struct perf_event *event,
if (event->attr.exclusive)
cpuctx->exclusive = 1;

+ if (event->pending_disable || event->pending_sigtrap)
+ irq_work_queue(&event->pending);
+
out:
perf_pmu_enable(event->pmu);

@@ -6440,47 +6446,40 @@ static void perf_sigtrap(struct perf_event *event)
event->attr.type, event->attr.sig_data);
}

-static void perf_pending_event_disable(struct perf_event *event)
+/*
+ * Deliver the pending work in-event-context or follow the context.
+ */
+static void __perf_pending_event(struct perf_event *event)
{
- int cpu = READ_ONCE(event->pending_disable);
+ int cpu = READ_ONCE(event->oncpu);

+ /*
+ * If the event isn't running, we're done. event_sched_in() will restart
+ * the irq_work when needed.
+ */
if (cpu < 0)
return;

+ /*
+ * Yay, we hit home and are in the context of the event.
+ */
if (cpu == smp_processor_id()) {
- WRITE_ONCE(event->pending_disable, -1);
-
- if (event->attr.sigtrap) {
+ if (event->pending_sigtrap) {
+ event->pending_sigtrap = 0;
perf_sigtrap(event);
- atomic_set_release(&event->event_limit, 1); /* rearm event */
- return;
}
-
- perf_event_disable_local(event);
- return;
+ if (event->pending_disable) {
+ event->pending_disable = 0;
+ perf_event_disable_local(event);
+ }
}

/*
- * CPU-A CPU-B
- *
- * perf_event_disable_inatomic()
- * @pending_disable = CPU-A;
- * irq_work_queue();
- *
- * sched-out
- * @pending_disable = -1;
- *
- * sched-in
- * perf_event_disable_inatomic()
- * @pending_disable = CPU-B;
- * irq_work_queue(); // FAILS
- *
- * irq_work_run()
- * perf_pending_event()
- *
- * But the event runs on CPU-B and wants disabling there.
+ * Requeue if there's still any pending work left; make sure to follow
+ * where the event went.
*/
- irq_work_queue_on(&event->pending, cpu);
+ if (event->pending_disable || event->pending_sigtrap)
+ irq_work_queue_on(&event->pending, cpu);
}

static void perf_pending_event(struct irq_work *entry)
@@ -6488,19 +6487,23 @@ static void perf_pending_event(struct irq_work *entry)
struct perf_event *event = container_of(entry, struct perf_event, pending);
int rctx;

- rctx = perf_swevent_get_recursion_context();
/*
* If we 'fail' here, that's OK, it means recursion is already disabled
* and we won't recurse 'further'.
*/
+ rctx = perf_swevent_get_recursion_context();

- perf_pending_event_disable(event);
-
+ /*
+ * The wakeup isn't bound to the context of the event -- it can happen
+ * irrespective of where the event is.
+ */
if (event->pending_wakeup) {
event->pending_wakeup = 0;
perf_event_wakeup(event);
}

+ __perf_pending_event(event);
+
if (rctx >= 0)
perf_swevent_put_recursion_context(rctx);
}
@@ -9203,11 +9206,20 @@ static int __perf_event_overflow(struct perf_event *event,
if (events && atomic_dec_and_test(&event->event_limit)) {
ret = 1;
event->pending_kill = POLL_HUP;
- event->pending_addr = data->addr;
-
perf_event_disable_inatomic(event);
}

+ if (event->attr.sigtrap) {
+ /*
+ * Should not be able to return to user space without processing
+ * pending_sigtrap (kernel events can overflow multiple times).
+ */
+ WARN_ON_ONCE(event->pending_sigtrap && event->attr.exclude_kernel);
+ event->pending_sigtrap = 1;
+ event->pending_addr = data->addr;
+ irq_work_queue(&event->pending);
+ }
+
READ_ONCE(event->overflow_handler)(event, data, regs);

if (*perf_event_fasync(event) && event->pending_kill) {
@@ -11528,7 +11540,6 @@ perf_event_alloc(struct perf_event_attr *attr, int cpu,


init_waitqueue_head(&event->waitq);
- event->pending_disable = -1;
init_irq_work(&event->pending, perf_pending_event);

mutex_init(&event->mmap_mutex);
@@ -11551,9 +11562,6 @@ perf_event_alloc(struct perf_event_attr *attr, int cpu,
if (parent_event)
event->event_caps = parent_event->event_caps;

- if (event->attr.sigtrap)
- atomic_set(&event->event_limit, 1);
-
if (task) {
event->attach_state = PERF_ATTACH_TASK;
/*