Re: [PATCH RFC V2 1/4] perf/core: Add aux_pause, aux_resume, aux_start_paused

From: Adrian Hunter
Date: Fri Jan 05 2024 - 07:58:03 EST


On 20/12/23 19:41, Suzuki K Poulose wrote:
> On 08/12/2023 17:24, Adrian Hunter wrote:
>> Hardware traces, such as instruction traces, can produce a vast amount of
>> trace data, so being able to reduce tracing to more specific circumstances
>> can be useful.
>>
>> The ability to pause or resume tracing when another event happens can do
>> that.
>>
>> Add ability for an event to "pause" or "resume" AUX area tracing.
>>
>> Add aux_pause bit to perf_event_attr to indicate that, if the event
>> happens, the associated AUX area tracing should be paused. Ditto
>> aux_resume. Do not allow aux_pause and aux_resume to be set together.
>>
>> Add aux_start_paused bit to perf_event_attr to indicate to an AUX area
>> event that it should start in a "paused" state.
>>
>> Add aux_paused to struct perf_event for AUX area events to keep track of
>> the "paused" state. aux_paused is initialized to aux_start_paused.
>>
>> Add PERF_EF_PAUSE and PERF_EF_RESUME modes for ->stop() and ->start()
>> callbacks. Call as needed, during __perf_event_output(). Add
>> aux_in_pause_resume to struct perf_buffer to prevent races with the NMI
>> handler. Pause/resume in NMI context will miss out if it coincides with
>> another pause/resume.
>>
>> To use aux_pause or aux_resume, an event must be in a group with the AUX
>> area event as the group leader.
>>
>> Example (requires Intel PT and tools patches also):
>>
>>   $ perf record --kcore -e '{intel_pt/aux-start-paused/k,syscalls:sys_enter_newuname/aux-resume/,syscalls:sys_exit_newuname/aux-pause/}' uname
>>   Linux
>>   [ perf record: Woken up 1 times to write data ]
>>   [ perf record: Captured and wrote 0.041 MB perf.data ]
>>   $ perf script --call-trace
>>   uname    5712 [007]    83.855580930: name: 0x7ffd9dcebec0
>>   uname    5712 [007]    83.855582518:  psb offs: 0
>>   uname    5712 [007]    83.855582518:  cbr: 42 freq: 4205 MHz (150%)
>>   uname    5712 [007]    83.855582723: ([kernel.kallsyms])    debug_smp_processor_id
>>   uname    5712 [007]    83.855582723: ([kernel.kallsyms])    __x64_sys_newuname
>>   uname    5712 [007]    83.855582723: ([kernel.kallsyms])        down_read
>>   uname    5712 [007]    83.855582723: ([kernel.kallsyms])            __cond_resched
>>   uname    5712 [007]    83.855582932: ([kernel.kallsyms])            preempt_count_add
>>   uname    5712 [007]    83.855582932: ([kernel.kallsyms])                in_lock_functions
>>   uname    5712 [007]    83.855582932: ([kernel.kallsyms])            preempt_count_sub
>>   uname    5712 [007]    83.855582932: ([kernel.kallsyms])        up_read
>>   uname    5712 [007]    83.855582932: ([kernel.kallsyms])            preempt_count_add
>>   uname    5712 [007]    83.855583348: ([kernel.kallsyms])                in_lock_functions
>>   uname    5712 [007]    83.855583348: ([kernel.kallsyms])            preempt_count_sub
>>   uname    5712 [007]    83.855583348: ([kernel.kallsyms])        _copy_to_user
>>   uname    5712 [007]    83.855583557: ([kernel.kallsyms])    syscall_exit_to_user_mode
>>   uname    5712 [007]    83.855583557: ([kernel.kallsyms])        syscall_exit_work
>>   uname    5712 [007]    83.855583557: ([kernel.kallsyms])            perf_syscall_exit
>>   uname    5712 [007]    83.855583557: ([kernel.kallsyms])                debug_smp_processor_id
>>   uname    5712 [007]    83.855583557: ([kernel.kallsyms])                perf_trace_buf_alloc
>>   uname    5712 [007]    83.855583557: ([kernel.kallsyms])                    perf_swevent_get_recursion_context
>>   uname    5712 [007]    83.855583557: ([kernel.kallsyms])                        debug_smp_processor_id
>>   uname    5712 [007]    83.855583557: ([kernel.kallsyms])                    debug_smp_processor_id
>>   uname    5712 [007]    83.855583557: ([kernel.kallsyms])                perf_tp_event
>>   uname    5712 [007]    83.855583557: ([kernel.kallsyms])                    perf_trace_buf_update
>>   uname    5712 [007]    83.855583557: ([kernel.kallsyms])                        tracing_gen_ctx_irq_test
>>   uname    5712 [007]    83.855583557: ([kernel.kallsyms])                    perf_swevent_event
>>   uname    5712 [007]    83.855583765: ([kernel.kallsyms])                        __perf_event_account_interrupt
>>   uname    5712 [007]    83.855583765: ([kernel.kallsyms])                            __this_cpu_preempt_check
>>   uname    5712 [007]    83.855583765: ([kernel.kallsyms])                        perf_event_output_forward
>>   uname    5712 [007]    83.855583765: ([kernel.kallsyms])                            perf_event_aux_pause
>>   uname    5712 [007]    83.855583765: ([kernel.kallsyms])                                ring_buffer_get
>>   uname    5712 [007]    83.855583765: ([kernel.kallsyms])                                    __rcu_read_lock
>>   uname    5712 [007]    83.855583765: ([kernel.kallsyms])                                    __rcu_read_unlock
>>   uname    5712 [007]    83.855583765: ([kernel.kallsyms])                                pt_event_stop
>>   uname    5712 [007]    83.855583765: ([kernel.kallsyms])                                    debug_smp_processor_id
>>   uname    5712 [007]    83.855583765: ([kernel.kallsyms])                                    debug_smp_processor_id
>>   uname    5712 [007]    83.855583973: ([kernel.kallsyms])                                    native_write_msr
>>   uname    5712 [007]    83.855583973: ([kernel.kallsyms])                                    native_write_msr
>>   uname    5712 [007]    83.855584175: 0x0
>>
>> Signed-off-by: Adrian Hunter <adrian.hunter@xxxxxxxxx>
>> ---
>>   include/linux/perf_event.h      | 15 +++++++
>>   include/uapi/linux/perf_event.h | 11 ++++-
>>   kernel/events/core.c            | 72 +++++++++++++++++++++++++++++++--
>>   kernel/events/internal.h        |  1 +
>>   4 files changed, 95 insertions(+), 4 deletions(-)
>>
>> diff --git a/include/linux/perf_event.h b/include/linux/perf_event.h
>> index e85cd1c0eaf3..252c4aac3b79 100644
>> --- a/include/linux/perf_event.h
>> +++ b/include/linux/perf_event.h
>> @@ -291,6 +291,7 @@ struct perf_event_pmu_context;
>>   #define PERF_PMU_CAP_NO_EXCLUDE            0x0040
>>   #define PERF_PMU_CAP_AUX_OUTPUT            0x0080
>>   #define PERF_PMU_CAP_EXTENDED_HW_TYPE        0x0100
>> +#define PERF_PMU_CAP_AUX_PAUSE            0x0200
>>     struct perf_output_handle;
>>   @@ -363,6 +364,8 @@ struct pmu {
>>   #define PERF_EF_START    0x01        /* start the counter when adding    */
>>   #define PERF_EF_RELOAD    0x02        /* reload the counter when starting */
>>   #define PERF_EF_UPDATE    0x04        /* update the counter when stopping */
>> +#define PERF_EF_PAUSE    0x08        /* AUX area event, pause tracing */
>> +#define PERF_EF_RESUME    0x10        /* AUX area event, resume tracing */
>>         /*
>>        * Adds/Removes a counter to/from the PMU, can be done inside a
>> @@ -402,6 +405,15 @@ struct pmu {
>>        *
>>        * ->start() with PERF_EF_RELOAD will reprogram the counter
>>        *  value, must be preceded by a ->stop() with PERF_EF_UPDATE.
>> +     *
>> +     * ->stop() with PERF_EF_PAUSE will stop as simply as possible. Will not
>> +     * overlap another ->stop() with PERF_EF_PAUSE nor ->start() with
>> +     * PERF_EF_RESUME.
>> +     *
>> +     * ->start() with PERF_EF_RESUME will start as simply as possible but
>> +     * only if the counter is not otherwise stopped. Will not overlap
>> +     * another ->start() with PERF_EF_RESUME nor ->stop() with
>> +     * PERF_EF_PAUSE.
>>        */
>>       void (*start)            (struct perf_event *event, int flags);
>>       void (*stop)            (struct perf_event *event, int flags);
>> @@ -797,6 +809,9 @@ struct perf_event {
>>       /* for aux_output events */
>>       struct perf_event        *aux_event;
>>   +    /* for AUX area events */
>> +    unsigned int            aux_paused;
>> +
>>       void (*destroy)(struct perf_event *);
>>       struct rcu_head            rcu_head;
>>   diff --git a/include/uapi/linux/perf_event.h b/include/uapi/linux/perf_event.h
>> index 39c6a250dd1b..437bc2a8d50c 100644
>> --- a/include/uapi/linux/perf_event.h
>> +++ b/include/uapi/linux/perf_event.h
>> @@ -507,7 +507,16 @@ struct perf_event_attr {
>>       __u16    sample_max_stack;
>>       __u16    __reserved_2;
>>       __u32    aux_sample_size;
>> -    __u32    __reserved_3;
>> +
>> +    union {
>> +        __u32    aux_output_cfg;
>> +        struct {
>> +            __u64    aux_pause        :  1, /* on overflow, pause AUX area tracing */
>
> Did you mean __u32? Otherwise looks good to me.

True, thanks!