Re: [PATCH v4 02/44] perf stat: Introduce skippable evsels

From: Liang, Kan
Date: Wed May 03 2023 - 16:57:42 EST




On 2023-05-02 6:38 p.m., Ian Rogers wrote:
> Perf stat with no arguments will use default events and metrics. These
> events may fail to open even with kernel and hypervisor disabled. When
> these fail then the permissions error appears even though they were
> implicitly selected. This is particularly a problem with the automatic
> selection of the TopdownL1 metric group on certain architectures like
> Skylake:
>
> '''
> $ perf stat true
> Error:
> Access to performance monitoring and observability operations is limited.
> Consider adjusting /proc/sys/kernel/perf_event_paranoid setting to open
> access to performance monitoring and observability operations for processes
> without CAP_PERFMON, CAP_SYS_PTRACE or CAP_SYS_ADMIN Linux capability.
> More information can be found at 'Perf events and tool security' document:
> https://www.kernel.org/doc/html/latest/admin-guide/perf-security.html
> perf_event_paranoid setting is 2:
> -1: Allow use of (almost) all events by all users
> Ignore mlock limit after perf_event_mlock_kb without CAP_IPC_LOCK
>> = 0: Disallow raw and ftrace function tracepoint access
>> = 1: Disallow CPU event access
>> = 2: Disallow kernel profiling
> To make the adjusted perf_event_paranoid setting permanent preserve it
> in /etc/sysctl.conf (e.g. kernel.perf_event_paranoid = <setting>)
> '''
>
> This patch adds skippable evsels that when they fail to open won't
> cause termination and will appear as "<not supported>" in output. The
> TopdownL1 events, from the metric group, are marked as skippable. This
> turns the failure above to:
>
> '''
> $ perf stat perf bench internals synthesize
> Computing performance of single threaded perf event synthesis by
> synthesizing events on the perf process itself:
> Average synthesis took: 49.287 usec (+- 0.083 usec)
> Average num. events: 3.000 (+- 0.000)
> Average time per event 16.429 usec
> Average data synthesis took: 49.641 usec (+- 0.085 usec)
> Average num. events: 11.000 (+- 0.000)
> Average time per event 4.513 usec
>
> Performance counter stats for 'perf bench internals synthesize':
>
> 1,222.38 msec task-clock:u # 0.993 CPUs utilized
> 0 context-switches:u # 0.000 /sec
> 0 cpu-migrations:u # 0.000 /sec
> 162 page-faults:u # 132.529 /sec
> 774,445,184 cycles:u # 0.634 GHz (49.61%)
> 1,640,969,811 instructions:u # 2.12 insn per cycle (59.67%)
> 302,052,148 branches:u # 247.102 M/sec (59.69%)
> 1,807,718 branch-misses:u # 0.60% of all branches (59.68%)
> 5,218,927 CPU_CLK_UNHALTED.REF_XCLK:u # 4.269 M/sec
> # 17.3 % tma_frontend_bound
> # 56.4 % tma_retiring
> # nan % tma_backend_bound
> # nan % tma_bad_speculation (60.01%)
> 536,580,469 IDQ_UOPS_NOT_DELIVERED.CORE:u # 438.965 M/sec (60.33%)
> <not supported> INT_MISC.RECOVERY_CYCLES_ANY:u
> 5,223,936 CPU_CLK_UNHALTED.ONE_THREAD_ACTIVE:u # 4.274 M/sec (40.31%)
> 774,127,250 CPU_CLK_UNHALTED.THREAD:u # 633.297 M/sec (50.34%)
> 1,746,579,518 UOPS_RETIRED.RETIRE_SLOTS:u # 1.429 G/sec (50.12%)
> 1,940,625,702 UOPS_ISSUED.ANY:u # 1.588 G/sec (49.70%)
>
> 1.231055525 seconds time elapsed
>
> 0.258327000 seconds user
> 0.965749000 seconds sys
> '''
>
> The event INT_MISC.RECOVERY_CYCLES_ANY:u is skipped as it can't be
> opened with paranoia 2 on Skylake. With a lower paranoia, or as root,
> all events/metrics are computed.
>
> Signed-off-by: Ian Rogers <irogers@xxxxxxxxxx>

Tested-by: Kan Liang <kan.liang@xxxxxxxxxxxxxxx>

Thanks,
Kan

> ---
> tools/perf/builtin-stat.c | 38 +++++++++++++++++++++++++++++---------
> tools/perf/util/evsel.c | 15 +++++++++++++--
> tools/perf/util/evsel.h | 1 +
> 3 files changed, 43 insertions(+), 11 deletions(-)
>
> diff --git a/tools/perf/builtin-stat.c b/tools/perf/builtin-stat.c
> index be9677aa642f..ffb47b166098 100644
> --- a/tools/perf/builtin-stat.c
> +++ b/tools/perf/builtin-stat.c
> @@ -667,6 +667,13 @@ static enum counter_recovery stat_handle_error(struct evsel *counter)
> evsel_list->core.threads->err_thread = -1;
> return COUNTER_RETRY;
> }
> + } else if (counter->skippable) {
> + if (verbose > 0)
> + ui__warning("skipping event %s that kernel failed to open .\n",
> + evsel__name(counter));
> + counter->supported = false;
> + counter->errored = true;
> + return COUNTER_SKIP;
> }
>
> evsel__open_strerror(counter, &target, errno, msg, sizeof(msg));
> @@ -1890,15 +1897,28 @@ static int add_default_attributes(void)
> * caused by exposing latent bugs. This is fixed properly in:
> * https://lore.kernel.org/lkml/bff481ba-e60a-763f-0aa0-3ee53302c480@xxxxxxxxxxxxxxx/
> */
> - if (metricgroup__has_metric("TopdownL1") && !perf_pmu__has_hybrid() &&
> - metricgroup__parse_groups(evsel_list, "TopdownL1",
> - /*metric_no_group=*/false,
> - /*metric_no_merge=*/false,
> - /*metric_no_threshold=*/true,
> - stat_config.user_requested_cpu_list,
> - stat_config.system_wide,
> - &stat_config.metric_events) < 0)
> - return -1;
> + if (metricgroup__has_metric("TopdownL1") && !perf_pmu__has_hybrid()) {
> + struct evlist *metric_evlist = evlist__new();
> + struct evsel *metric_evsel;
> +
> + if (!metric_evlist)
> + return -1;
> +
> + if (metricgroup__parse_groups(metric_evlist, "TopdownL1",
> + /*metric_no_group=*/false,
> + /*metric_no_merge=*/false,
> + /*metric_no_threshold=*/true,
> + stat_config.user_requested_cpu_list,
> + stat_config.system_wide,
> + &stat_config.metric_events) < 0)
> + return -1;
> +
> + evlist__for_each_entry(metric_evlist, metric_evsel) {
> + metric_evsel->skippable = true;
> + }
> + evlist__splice_list_tail(evsel_list, &metric_evlist->core.entries);
> + evlist__delete(metric_evlist);
> + }
>
> /* Platform specific attrs */
> if (evlist__add_default_attrs(evsel_list, default_null_attrs) < 0)
> diff --git a/tools/perf/util/evsel.c b/tools/perf/util/evsel.c
> index 356c07f03be6..1cd04b5998d2 100644
> --- a/tools/perf/util/evsel.c
> +++ b/tools/perf/util/evsel.c
> @@ -290,6 +290,7 @@ void evsel__init(struct evsel *evsel,
> evsel->per_pkg_mask = NULL;
> evsel->collect_stat = false;
> evsel->pmu_name = NULL;
> + evsel->skippable = false;
> }
>
> struct evsel *evsel__new_idx(struct perf_event_attr *attr, int idx)
> @@ -1725,9 +1726,13 @@ static int get_group_fd(struct evsel *evsel, int cpu_map_idx, int thread)
> return -1;
>
> fd = FD(leader, cpu_map_idx, thread);
> - BUG_ON(fd == -1);
> + BUG_ON(fd == -1 && !leader->skippable);
>
> - return fd;
> + /*
> + * When the leader has been skipped, return -2 to distinguish from no
> + * group leader case.
> + */
> + return fd == -1 ? -2 : fd;
> }
>
> static void evsel__remove_fd(struct evsel *pos, int nr_cpus, int nr_threads, int thread_idx)
> @@ -2109,6 +2114,12 @@ static int evsel__open_cpu(struct evsel *evsel, struct perf_cpu_map *cpus,
>
> group_fd = get_group_fd(evsel, idx, thread);
>
> + if (group_fd == -2) {
> + pr_debug("broken group leader for %s\n", evsel->name);
> + err = -EINVAL;
> + goto out_close;
> + }
> +
> test_attr__ready();
>
> /* Debug message used by test scripts */
> diff --git a/tools/perf/util/evsel.h b/tools/perf/util/evsel.h
> index 35805dcdb1b9..bf8f01af1c0b 100644
> --- a/tools/perf/util/evsel.h
> +++ b/tools/perf/util/evsel.h
> @@ -95,6 +95,7 @@ struct evsel {
> bool weak_group;
> bool bpf_counter;
> bool use_config_name;
> + bool skippable;
> int bpf_fd;
> struct bpf_object *bpf_obj;
> struct list_head config_terms;