Re: [PATCH V16 3/8] drivers: perf: arm_pmuv3: Enable branch stack sampling framework

From: Anshuman Khandual
Date: Sun Jan 28 2024 - 23:43:19 EST

On 1/25/24 19:14, Suzuki K Poulose wrote:
> On 25/01/2024 09:41, Anshuman Khandual wrote:
>> Branch stack sampling support in core perf, i.e. capturing branch records
>> during execution, rides along with normal HW events being scheduled on the
>> PMU. This prepares the ARMV8 PMU framework for branch stack support on
>> PMUs with the required HW implementation.
>>
>> ARMV8 PMU hardware support for branch stack sampling is indicated via a
>> new feature flag called 'has_branch_stack', which can be ascertained via
>> probing. This modifies the current gate in armpmu_event_init(), which
>> unconditionally blocks branch stack sampling based perf events, to instead
>> allow such perf events to be initialized on PMU hardware with the required
>> support.
>>
>> Branch stack sampling is enabled and disabled along with regular PMU
>> events. This adds the required function callbacks, in the
>> armv8pmu_branch_xxx() format, to drive the PMU branch stack hardware when
>> supported, along with fallback stub definitions for these callbacks on
>> PMUs which lack the required support.
>>
>> If a task gets scheduled out, the current branch records get saved in the
>> task's context data, which can later be used to fill in the records upon an
>> event overflow. Hence, we enable PERF_ATTACH_TASK_DATA (an
>> event->attach_state based flag) for perf events requesting branch stacks.
>> But this also requires adding support for the pmu::sched_task() callback
>> to arm_pmu.
>>
>> Cc: Catalin Marinas <catalin.marinas@xxxxxxx>
>> Cc: Will Deacon <will@xxxxxxxxxx>
>> Cc: Mark Rutland <mark.rutland@xxxxxxx>
>> Cc: linux-arm-kernel@xxxxxxxxxxxxxxxxxxx
>> Cc: linux-kernel@xxxxxxxxxxxxxxx
>> Signed-off-by: Anshuman Khandual <anshuman.khandual@xxxxxxx>
>> ---
>> Changes in V16:
>>
>> - Renamed arm_brbe.h as arm_pmuv3_branch.h
>> - Updated the perf_sample_save_brstack() call to pass NULL for its new argument
>>
>>   drivers/perf/arm_pmu.c          |  57 ++++++++++++-
>>   drivers/perf/arm_pmuv3.c        | 141 +++++++++++++++++++++++++++++++-
>>   drivers/perf/arm_pmuv3_branch.h |  50 +++++++++++
>>   include/linux/perf/arm_pmu.h    |  29 ++++++-
>>   include/linux/perf/arm_pmuv3.h  |   1 -
>>   5 files changed, 273 insertions(+), 5 deletions(-)
>>   create mode 100644 drivers/perf/arm_pmuv3_branch.h
>>
>> diff --git a/drivers/perf/arm_pmu.c b/drivers/perf/arm_pmu.c
>> index 8458fe2cebb4..16f488ae7747 100644
>> --- a/drivers/perf/arm_pmu.c
>> +++ b/drivers/perf/arm_pmu.c
>> @@ -317,6 +317,15 @@ armpmu_del(struct perf_event *event, int flags)
>>       struct hw_perf_event *hwc = &event->hw;
>>       int idx = hwc->idx;
>>   +    if (has_branch_stack(event)) {
>> +        WARN_ON_ONCE(!hw_events->brbe_users);
>> +        hw_events->brbe_users--;
>
> ^^ Should we do this only if the event matches the sample type ? Put the other way around, what does brbe_users track ? The number of events that
> "share" the BRBE instance ? Or the number of active events that
> has_branch_stack() ?

It tracks active perf events with has_branch_stack(), irrespective of
whether there is a branch_sample_type match or not, i.e. regardless of
whether the captured branch records can actually be consumed by a given
event.
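
For instance (a hypothetical illustration, not from the patch):

	/*
	 * Two events A and B active on the same PMU:
	 *
	 *	A.attr.branch_sample_type = PERF_SAMPLE_BRANCH_ANY;
	 *	B.attr.branch_sample_type = PERF_SAMPLE_BRANCH_ANY_CALL;
	 *
	 * After both are added, hw_events->brbe_users == 2, but
	 * hw_events->brbe_sample_type holds only the filters of the
	 * event added last, so read_branch_records() will report the
	 * captured records for just one of them.
	 */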

>
>> +        if (!hw_events->brbe_users) {
>> +            hw_events->brbe_context = NULL;
>> +            hw_events->brbe_sample_type = 0;
>> +        }
>> +    }
>> +
>>       armpmu_stop(event, PERF_EF_UPDATE);
>>       hw_events->events[idx] = NULL;
>>       armpmu->clear_event_idx(hw_events, event);
>> @@ -333,6 +342,38 @@ armpmu_add(struct perf_event *event, int flags)
>>       struct hw_perf_event *hwc = &event->hw;
>>       int idx;
>>   +    if (has_branch_stack(event)) {
>> +        /*
>> +         * Reset branch records buffer if a new CPU bound event
>> +         * gets scheduled on a PMU. Otherwise existing branch
>> +         * records present in the buffer might just leak into
>> +         * such events.
>> +         *
>> +         * Also reset current 'hw_events->brbe_context' because
>> +         * any previous task bound event now would have lost an
>> +         * opportunity for continuous branch records.
>> +         */
>> +        if (!event->ctx->task) {
>> +            hw_events->brbe_context = NULL;
>> +            if (armpmu->branch_reset)
>> +                armpmu->branch_reset();
>> +        }
>> +
>> +        /*
>> +         * Reset branch records buffer if a new task event gets
>> +         * scheduled on a PMU which might have existing records.
>> +         * Otherwise older branch records present in the buffer
>> +         * might leak into the new task event.
>> +         */
>> +        if (event->ctx->task && hw_events->brbe_context != event->ctx) {
>> +            hw_events->brbe_context = event->ctx;
>> +            if (armpmu->branch_reset)
>> +                armpmu->branch_reset();
>> +        }
>> +        hw_events->brbe_users++;
>> +        hw_events->brbe_sample_type = event->attr.branch_sample_type;
>> +    }
>> +
>>       /* An event following a process won't be stopped earlier */
>>       if (!cpumask_test_cpu(smp_processor_id(), &armpmu->supported_cpus))
>>           return -ENOENT;
>> @@ -511,13 +552,24 @@ static int armpmu_event_init(struct perf_event *event)
>>           !cpumask_test_cpu(event->cpu, &armpmu->supported_cpus))
>>           return -ENOENT;
>>   -    /* does not support taken branch sampling */
>> -    if (has_branch_stack(event))
>> +    /*
>> +     * Branch stack sampling events are allowed
>> +     * only on PMU which has required support.
>> +     */
>> +    if (has_branch_stack(event) && !armpmu->has_branch_stack)
>>           return -EOPNOTSUPP;
>>         return __hw_perf_event_init(event);
>>   }
>>   +static void armpmu_sched_task(struct perf_event_pmu_context *pmu_ctx, bool sched_in)
>> +{
>> +    struct arm_pmu *armpmu = to_arm_pmu(pmu_ctx->pmu);
>> +
>> +    if (armpmu->sched_task)
>> +        armpmu->sched_task(pmu_ctx, sched_in);
>> +}
>> +
>>   static void armpmu_enable(struct pmu *pmu)
>>   {
>>       struct arm_pmu *armpmu = to_arm_pmu(pmu);
>> @@ -864,6 +916,7 @@ struct arm_pmu *armpmu_alloc(void)
>>       }
>>         pmu->pmu = (struct pmu) {
>> +        .sched_task    = armpmu_sched_task,
>>           .pmu_enable    = armpmu_enable,
>>           .pmu_disable    = armpmu_disable,
>>           .event_init    = armpmu_event_init,
>> diff --git a/drivers/perf/arm_pmuv3.c b/drivers/perf/arm_pmuv3.c
>> index 23fa6c5da82c..9e17764a0929 100644
>> --- a/drivers/perf/arm_pmuv3.c
>> +++ b/drivers/perf/arm_pmuv3.c
>> @@ -26,6 +26,7 @@
>>   #include <linux/nmi.h>
>>     #include <asm/arm_pmuv3.h>
>> +#include "arm_pmuv3_branch.h"
>>     /* ARMv8 Cortex-A53 specific event types. */
>>   #define ARMV8_A53_PERFCTR_PREF_LINEFILL                0xC2
>> @@ -829,14 +830,56 @@ static void armv8pmu_start(struct arm_pmu *cpu_pmu)
>>       armv8pmu_pmcr_write(armv8pmu_pmcr_read() | ARMV8_PMU_PMCR_E);
>>         kvm_vcpu_pmu_resync_el0();
>> +    if (cpu_pmu->has_branch_stack)
>> +        armv8pmu_branch_enable(cpu_pmu);
>
> Is there a reason to do this after kvm_vcpu_pmu_resync_el0() ?
> Ideally, we should start counting as soon as the PMU is on ?

But if the kernel is also being traced, branches from kvm_vcpu_pmu_resync_el0()
might get into the final BRBE samples as well. Placing branch_enable() last
helps avoid such situations. Although the same could be said about the normal
PMU event counters being enabled via armv8pmu_pmcr_write(), so yes,
armv8pmu_branch_enable() could be moved before kvm_vcpu_pmu_resync_el0(),
as in the sketch below.
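
A minimal sketch of that reordering (all helper names as in this patch):

	static void armv8pmu_start(struct arm_pmu *cpu_pmu)
	{
		/* Enable all counters */
		armv8pmu_pmcr_write(armv8pmu_pmcr_read() | ARMV8_PMU_PMCR_E);

		if (cpu_pmu->has_branch_stack)
			armv8pmu_branch_enable(cpu_pmu);

		kvm_vcpu_pmu_resync_el0();
	}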

>
>>   }
>>     static void armv8pmu_stop(struct arm_pmu *cpu_pmu)
>>   {
>> +    if (cpu_pmu->has_branch_stack)
>> +        armv8pmu_branch_disable();
>> +
>>       /* Disable all counters */
>>       armv8pmu_pmcr_write(armv8pmu_pmcr_read() & ~ARMV8_PMU_PMCR_E);
>>   }
>>   +static void read_branch_records(struct pmu_hw_events *cpuc,
>> +                struct perf_event *event,
>> +                struct perf_sample_data *data,
>> +                bool *branch_captured)
>> +{
>> +    /*
>> +     * CPU specific branch records buffer must have been allocated already
>> +     * for the hardware records to be captured and processed further.
>> +     */
>> +    if (WARN_ON(!cpuc->branches))
>> +        return;
>> +
>> +    /*
>> +     * Overflowed event's branch_sample_type does not match the configured
>> +     * branch filters in the BRBE HW. So the captured branch records here
>> +     * cannot be co-related to the overflowed event. Report to the user as
>> +     * if no branch records have been captured, and flush branch records.
>> +     * The same scenario is applicable when the current task context does
>> +     * not match with overflown event.
>> +     */
>> +    if ((cpuc->brbe_sample_type != event->attr.branch_sample_type) ||
>> +        (event->ctx->task && cpuc->brbe_context != event->ctx))
>> +        return;
>> +
>> +    /*
>> +     * Read the branch records from the hardware once after the PMU IRQ
>> +     * has been triggered but subsequently same records can be used for
>> +     * other events that might have been overflowed simultaneously thus
>> +     * saving much CPU cycles.
>> +     */
>> +    if (!*branch_captured) {
>> +        armv8pmu_branch_read(cpuc, event);
>> +        *branch_captured = true;
>> +    }
>> +    perf_sample_save_brstack(data, event, &cpuc->branches->branch_stack, NULL);
>> +}
>> +
>>   static irqreturn_t armv8pmu_handle_irq(struct arm_pmu *cpu_pmu)
>>   {
>>       u32 pmovsr;
>> @@ -844,6 +887,7 @@ static irqreturn_t armv8pmu_handle_irq(struct arm_pmu *cpu_pmu)
>>       struct pmu_hw_events *cpuc = this_cpu_ptr(cpu_pmu->hw_events);
>>       struct pt_regs *regs;
>>       int idx;
>> +    bool branch_captured = false;
>>         /*
>>        * Get and reset the IRQ flags
>> @@ -887,6 +931,13 @@ static irqreturn_t armv8pmu_handle_irq(struct arm_pmu *cpu_pmu)
>>           if (!armpmu_event_set_period(event))
>>               continue;
>>   +        /*
>> +         * PMU IRQ should remain asserted until all branch records
>> +         * are captured and processed into struct perf_sample_data.
>> +         */
>> +        if (has_branch_stack(event) && cpu_pmu->has_branch_stack)
>
> nit: Do we really need the cpu_pmu->has_branch_stack check ? The event
> wouldn't reach here if the PMU doesn't support it ?

This is just a defensive check - a has_branch_stack() event could not
have reached here otherwise, i.e. without the PMU supporting branch
records, since armpmu_event_init() would have rejected it. If the extra
gate is unwanted, the invariant could be made explicit instead, as in
the sketch below.
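
Something like this (sketch only), asserting the invariant rather than
gating on it:

	if (has_branch_stack(event)) {
		WARN_ON_ONCE(!cpu_pmu->has_branch_stack);
		read_branch_records(cpuc, event, &data, &branch_captured);
	}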

>
>> +            read_branch_records(cpuc, event, &data, &branch_captured);
>> +
>>           /*
>>            * Perf event overflow will queue the processing of the event as
>>            * an irq_work which will be taken care of in the handling of
>> @@ -896,6 +947,8 @@ static irqreturn_t armv8pmu_handle_irq(struct arm_pmu *cpu_pmu)
>>               cpu_pmu->disable(event);
>>       }
>>       armv8pmu_start(cpu_pmu);
>> +    if (cpu_pmu->has_branch_stack)
>> +        armv8pmu_branch_reset();
>>         return IRQ_HANDLED;
>>   }
>> @@ -985,6 +1038,24 @@ static int armv8pmu_user_event_idx(struct perf_event *event)
>>       return event->hw.idx;
>>   }
>>   +static void armv8pmu_sched_task(struct perf_event_pmu_context *pmu_ctx, bool sched_in)
>> +{
>> +    struct arm_pmu *armpmu = to_arm_pmu(pmu_ctx->pmu);
>> +    void *task_ctx = pmu_ctx->task_ctx_data;
>> +
>> +    if (armpmu->has_branch_stack) {
>> +        /* Save branch records in task_ctx on sched out */
>> +        if (task_ctx && !sched_in) {
>> +            armv8pmu_branch_save(armpmu, task_ctx);
>> +            return;
>> +        }
>> +
>> +        /* Reset branch records on sched in */
>> +        if (sched_in)
>> +            armv8pmu_branch_reset();
>> +    }
>> +}
>> +
>>   /*
>>    * Add an event filter to a given event.
>>    */
>> @@ -1077,6 +1148,9 @@ static void armv8pmu_reset(void *info)
>>           pmcr |= ARMV8_PMU_PMCR_LP;
>>         armv8pmu_pmcr_write(pmcr);
>> +
>> +    if (cpu_pmu->has_branch_stack)
>> +        armv8pmu_branch_reset();
>>   }
>>     static int __armv8_pmuv3_map_event_id(struct arm_pmu *armpmu,
>> @@ -1114,6 +1188,20 @@ static int __armv8_pmuv3_map_event(struct perf_event *event,
>>         hw_event_id = __armv8_pmuv3_map_event_id(armpmu, event);
>>   +    if (has_branch_stack(event)) {
>> +        if (!armv8pmu_branch_attr_valid(event))
>> +            return -EOPNOTSUPP;
>> +
>> +        /*
>> +         * If a task gets scheduled out, the current branch records
>> +         * get saved in the task's context data, which can be later
>> +         * used to fill in the records upon an event overflow. Let's
>> +         * enable PERF_ATTACH_TASK_DATA in 'event->attach_state' for
>> +         * all branch stack sampling perf events.
>> +         */
>> +        event->attach_state |= PERF_ATTACH_TASK_DATA;
>> +    }
>> +
>>       /*
>>        * CHAIN events only work when paired with an adjacent counter, and it
>>        * never makes sense for a user to open one in isolation, as they'll be
>> @@ -1229,6 +1317,41 @@ static void __armv8pmu_probe_pmu(void *info)
>>           cpu_pmu->reg_pmmir = read_pmmir();
>>       else
>>           cpu_pmu->reg_pmmir = 0;
>> +
>> +    /*
>> +     * BRBE is being probed on a single cpu for a
>> +     * given PMU. The remaining cpus, are assumed
>> +     * to have the exact same BRBE implementation.
>> +     */
>> +    armv8pmu_branch_probe(cpu_pmu);
>> +}
>> +
>> +static int branch_records_alloc(struct arm_pmu *armpmu)
>> +{
>> +    struct branch_records __percpu *records;
>> +    int cpu;
>> +
>> +    records = alloc_percpu_gfp(struct branch_records, GFP_KERNEL);
>> +    if (!records)
>> +        return -ENOMEM;
>> +
>> +    /*
>> +     * percpu memory allocated for 'records' gets completely consumed
>> +     * here, and never required to be freed up later. So permanently
>> +     * losing access to this anchor i.e 'records' is acceptable.
>> +     *
>> +     * Otherwise this allocation handle would have to be saved up for
>> +     * free_percpu() release later if required.
>> +     */
>> +    for_each_possible_cpu(cpu) {
>> +        struct pmu_hw_events *events_cpu;
>> +        struct branch_records *records_cpu;
>> +
>> +        events_cpu = per_cpu_ptr(armpmu->hw_events, cpu);
>> +        records_cpu = per_cpu_ptr(records, cpu);
>> +        events_cpu->branches = records_cpu;
>> +    }
>> +    return 0;
>>   }
>>     static int armv8pmu_probe_pmu(struct arm_pmu *cpu_pmu)
>> @@ -1245,7 +1368,21 @@ static int armv8pmu_probe_pmu(struct arm_pmu *cpu_pmu)
>>       if (ret)
>>           return ret;
>>   -    return probe.present ? 0 : -ENODEV;
>> +    if (!probe.present)
>> +        return -ENODEV;
>> +
>> +    if (cpu_pmu->has_branch_stack) {
>> +        ret = armv8pmu_task_ctx_cache_alloc(cpu_pmu);
>> +        if (ret)
>> +            return ret;
>> +
>> +        ret = branch_records_alloc(cpu_pmu);
>> +        if (ret) {
>> +            armv8pmu_task_ctx_cache_free(cpu_pmu);
>> +            return ret;
>> +        }
>> +    }
>> +    return 0;
>>   }
>>     static void armv8pmu_disable_user_access_ipi(void *unused)
>> @@ -1304,6 +1441,8 @@ static int armv8_pmu_init(struct arm_pmu *cpu_pmu, char *name,
>>       cpu_pmu->set_event_filter    = armv8pmu_set_event_filter;
>>         cpu_pmu->pmu.event_idx        = armv8pmu_user_event_idx;
>> +    cpu_pmu->sched_task        = armv8pmu_sched_task;
>> +    cpu_pmu->branch_reset        = armv8pmu_branch_reset;
>>         cpu_pmu->name            = name;
>>       cpu_pmu->map_event        = map_event;
>> diff --git a/drivers/perf/arm_pmuv3_branch.h b/drivers/perf/arm_pmuv3_branch.h
>> new file mode 100644
>> index 000000000000..609e4d4ccac6
>> --- /dev/null
>> +++ b/drivers/perf/arm_pmuv3_branch.h
>> @@ -0,0 +1,50 @@
>> +/* SPDX-License-Identifier: GPL-2.0-only */
>> +/*
>> + * Branch Record Buffer Extension Helpers.
>> + *
>> + * Copyright (C) 2022-2023 ARM Limited
>> + *
>> + * Author: Anshuman Khandual <anshuman.khandual@xxxxxxx>
>> + */
>> +#include <linux/perf/arm_pmu.h>
>> +
>> +static inline void armv8pmu_branch_reset(void)
>> +{
>> +}
>> +
>> +static inline void armv8pmu_branch_probe(struct arm_pmu *arm_pmu)
>> +{
>> +}
>> +
>> +static inline bool armv8pmu_branch_attr_valid(struct perf_event *event)
>> +{
>> +    WARN_ON_ONCE(!has_branch_stack(event));
>> +    return false;
>> +}
>> +
>> +static inline void armv8pmu_branch_enable(struct arm_pmu *arm_pmu)
>> +{
>> +}
>> +
>> +static inline void armv8pmu_branch_disable(void)
>> +{
>> +}
>> +
>> +static inline void armv8pmu_branch_read(struct pmu_hw_events *cpuc,
>> +                    struct perf_event *event)
>> +{
>> +    WARN_ON_ONCE(!has_branch_stack(event));
>> +}
>> +
>> +static inline void armv8pmu_branch_save(struct arm_pmu *arm_pmu, void *ctx)
>> +{
>> +}
>> +
>> +static inline int armv8pmu_task_ctx_cache_alloc(struct arm_pmu *arm_pmu)
>> +{
>> +    return 0;
>> +}
>> +
>> +static inline void armv8pmu_task_ctx_cache_free(struct arm_pmu *arm_pmu)
>> +{
>> +}
>> diff --git a/include/linux/perf/arm_pmu.h b/include/linux/perf/arm_pmu.h
>> index b3b34f6670cf..8cfcc735c0f7 100644
>> --- a/include/linux/perf/arm_pmu.h
>> +++ b/include/linux/perf/arm_pmu.h
>> @@ -46,6 +46,18 @@ static_assert((PERF_EVENT_FLAG_ARCH & ARMPMU_EVT_63BIT) == ARMPMU_EVT_63BIT);
>>       },                                \
>>   }
>>   +/*
>> + * Maximum branch record entries which could be processed
>> + * for core perf branch stack sampling support, regardless
>> + * of the hardware support available on a given ARM PMU.
>> + */
>> +#define MAX_BRANCH_RECORDS 64
>> +
>> +struct branch_records {
>> +    struct perf_branch_stack    branch_stack;
>> +    struct perf_branch_entry    branch_entries[MAX_BRANCH_RECORDS];
>> +};
>> +
>>   /* The events for a given PMU register set. */
>>   struct pmu_hw_events {
>>       /*
>> @@ -66,6 +78,17 @@ struct pmu_hw_events {
>>       struct arm_pmu        *percpu_pmu;
>>         int irq;
>> +
>> +    struct branch_records    *branches;
>> +
>> +    /* Active context for task events */
>> +    void            *brbe_context;
>> +
>> +    /* Active events requesting branch records */
>
> Please see my comment above.

This is true - brbe_users tracks the number of active perf events
requesting branch records, irrespective of whether their
branch_sample_type filters match each other or not.

>
>> +    unsigned int        brbe_users;
>> +
>> +    /* Active branch sample type filters */
>> +    unsigned long        brbe_sample_type;
>>   };
>>     enum armpmu_attr_groups {
>> @@ -96,8 +119,12 @@ struct arm_pmu {
>>       void        (*stop)(struct arm_pmu *);
>>       void        (*reset)(void *);
>>       int        (*map_event)(struct perf_event *event);
>> +    void        (*sched_task)(struct perf_event_pmu_context *pmu_ctx, bool sched_in);
>> +    void        (*branch_reset)(void);
>>       int        num_events;
>> -    bool        secure_access; /* 32-bit ARM only */
>> +    unsigned int    secure_access    : 1, /* 32-bit ARM only */
>> +            has_branch_stack: 1, /* 64-bit ARM only */
>> +            reserved    : 30;
>>   #define ARMV8_PMUV3_MAX_COMMON_EVENTS        0x40
>>       DECLARE_BITMAP(pmceid_bitmap, ARMV8_PMUV3_MAX_COMMON_EVENTS);
>>   #define ARMV8_PMUV3_EXT_COMMON_EVENT_BASE    0x4000
>> diff --git a/include/linux/perf/arm_pmuv3.h b/include/linux/perf/arm_pmuv3.h
>> index 46377e134d67..c3e7d2cfb737 100644
>> --- a/include/linux/perf/arm_pmuv3.h
>> +++ b/include/linux/perf/arm_pmuv3.h
>> @@ -308,5 +308,4 @@
>>           default: WARN(1, "Invalid PMEV* index\n");    \
>>           }                        \
>>       } while (0)
>> -
>
> nit: Unrelated change ?

Right, will fix and fold.