Re: [PATCH] perf: Fix missing SIGTRAPs due to pending_disable abuse

From: Marco Elver
Date: Tue Oct 04 2022 - 13:35:29 EST


On Tue, 4 Oct 2022 at 19:09, Peter Zijlstra <peterz@xxxxxxxxxxxxx> wrote:
>
> On Wed, Sep 28, 2022 at 04:55:46PM +0200, Marco Elver wrote:
> > On Wed, Sep 28, 2022 at 12:06PM +0200, Marco Elver wrote:
> >
> > > My second idea about introducing something like irq_work_raw_sync().
> > > Maybe it's not that crazy if it is actually safe. I expect this case
> > > where we need the irq_work_raw_sync() to be very very rare.
> >
> > The previous irq_work_raw_sync() forgot about irq_work_queue_on(). Alas,
> > I might still be missing something obvious, because "it's never that
> > easy". ;-)
> >
> > And for completeness, the full perf patch of what it would look like
> > together with irq_work_raw_sync() (consider it v1.5). It's already
> > survived some shorter stress tests and fuzzing.
>
> So.... I don't like it. But I cooked up the below, which _almost_ works :-/
>
> For some raisin it sometimes fails with 14999 out of 15000 events
> delivered and I've not yet figured out where it goes sideways. I'm
> currently thinking it's that sigtrap clear on OFF.
>
> Still, what do you think of the approach?

It looks reasonable, but obviously needs to pass tests. :-)
Also, see comment below (I think you're still turning signals
asynchronous, which we shouldn't do).

> ---
> include/linux/perf_event.h | 8 ++--
> kernel/events/core.c | 92 +++++++++++++++++++++++++---------------------
> 2 files changed, 55 insertions(+), 45 deletions(-)
>
> diff --git a/include/linux/perf_event.h b/include/linux/perf_event.h
> index ee8b9ecdc03b..c54161719d37 100644
> --- a/include/linux/perf_event.h
> +++ b/include/linux/perf_event.h
> @@ -736,9 +736,11 @@ struct perf_event {
> struct fasync_struct *fasync;
>
> /* delayed work for NMIs and such */
> - int pending_wakeup;
> - int pending_kill;
> - int pending_disable;
> + unsigned int pending_wakeup :1;
> + unsigned int pending_disable :1;
> + unsigned int pending_sigtrap :1;
> + unsigned int pending_kill :3;
> +
> unsigned long pending_addr; /* SIGTRAP */
> struct irq_work pending;
>
> diff --git a/kernel/events/core.c b/kernel/events/core.c
> index 2621fd24ad26..8e5dbe971d9e 100644
> --- a/kernel/events/core.c
> +++ b/kernel/events/core.c
> @@ -2268,11 +2268,15 @@ event_sched_out(struct perf_event *event,
> event->pmu->del(event, 0);
> event->oncpu = -1;
>
> - if (READ_ONCE(event->pending_disable) >= 0) {
> - WRITE_ONCE(event->pending_disable, -1);
> + if (event->pending_disable) {
> + event->pending_disable = 0;
> perf_cgroup_event_disable(event, ctx);
> state = PERF_EVENT_STATE_OFF;
> }
> +
> + if (event->pending_sigtrap && state == PERF_EVENT_STATE_OFF)
> + event->pending_sigtrap = 0;
> +
> perf_event_set_state(event, state);
>
> if (!is_software_event(event))
> @@ -2463,8 +2467,7 @@ EXPORT_SYMBOL_GPL(perf_event_disable);
>
> void perf_event_disable_inatomic(struct perf_event *event)
> {
> - WRITE_ONCE(event->pending_disable, smp_processor_id());
> - /* can fail, see perf_pending_event_disable() */
> + event->pending_disable = 1;
> irq_work_queue(&event->pending);
> }
>
> @@ -2527,6 +2530,9 @@ event_sched_in(struct perf_event *event,
> if (event->attr.exclusive)
> cpuctx->exclusive = 1;
>
> + if (event->pending_disable || event->pending_sigtrap)
> + irq_work_queue(&event->pending);
> +
> out:
> perf_pmu_enable(event->pmu);
>
> @@ -6440,47 +6446,40 @@ static void perf_sigtrap(struct perf_event *event)
> event->attr.type, event->attr.sig_data);
> }
>
> -static void perf_pending_event_disable(struct perf_event *event)
> +/*
> + * Deliver the pending work in-event-context or follow the context.
> + */
> +static void __perf_pending_event(struct perf_event *event)
> {
> - int cpu = READ_ONCE(event->pending_disable);
> + int cpu = READ_ONCE(event->oncpu);
>
> + /*
> + * If the event isn't running; we done. event_sched_in() will restart
> + * the irq_work when needed.
> + */
> if (cpu < 0)
> return;
>
> + /*
> + * Yay, we hit home and are in the context of the event.
> + */
> if (cpu == smp_processor_id()) {
> - WRITE_ONCE(event->pending_disable, -1);
> -
> - if (event->attr.sigtrap) {
> + if (event->pending_sigtrap) {
> + event->pending_sigtrap = 0;
> perf_sigtrap(event);
> - atomic_set_release(&event->event_limit, 1); /* rearm event */
> - return;
> }
> -
> - perf_event_disable_local(event);
> - return;
> + if (event->pending_disable) {
> + event->pending_disable = 0;
> + perf_event_disable_local(event);
> + }
> }
>
> /*
> - * CPU-A CPU-B
> - *
> - * perf_event_disable_inatomic()
> - * @pending_disable = CPU-A;
> - * irq_work_queue();
> - *
> - * sched-out
> - * @pending_disable = -1;
> - *
> - * sched-in
> - * perf_event_disable_inatomic()
> - * @pending_disable = CPU-B;
> - * irq_work_queue(); // FAILS
> - *
> - * irq_work_run()
> - * perf_pending_event()
> - *
> - * But the event runs on CPU-B and wants disabling there.
> + * Requeue if there's still any pending work left, make sure to follow
> + * where the event went.
> */
> - irq_work_queue_on(&event->pending, cpu);
> + if (event->pending_disable || event->pending_sigtrap)
> + irq_work_queue_on(&event->pending, cpu);

I considered making the irq_work "chase" the right CPU but it doesn't
work for sigtrap. This will make the signal asynchronous (it should be
synchronous), and the reason why I had to do irq_work_raw_sync().

Thanks,
-- Marco