RE: [RFC PATCH v3 2/6] perf stat: Fork and launch perf record when perf stat needs to get retire latency value for a metric.

From: Wang, Weilin
Date: Tue Mar 12 2024 - 17:52:01 EST




> -----Original Message-----
> From: Namhyung Kim <namhyung@xxxxxxxxxx>
> Sent: Monday, March 11, 2024 11:40 PM
> To: Wang, Weilin <weilin.wang@xxxxxxxxx>
> Cc: Ian Rogers <irogers@xxxxxxxxxx>; Peter Zijlstra <peterz@xxxxxxxxxxxxx>;
> Ingo Molnar <mingo@xxxxxxxxxx>; Arnaldo Carvalho de Melo
> <acme@xxxxxxxxxx>; Alexander Shishkin
> <alexander.shishkin@xxxxxxxxxxxxxxx>; Jiri Olsa <jolsa@xxxxxxxxxx>; Hunter,
> Adrian <adrian.hunter@xxxxxxxxx>; Kan Liang <kan.liang@xxxxxxxxxxxxxxx>;
> linux-perf-users@xxxxxxxxxxxxxxx; linux-kernel@xxxxxxxxxxxxxxx; Taylor, Perry
> <perry.taylor@xxxxxxxxx>; Alt, Samantha <samantha.alt@xxxxxxxxx>; Biggers,
> Caleb <caleb.biggers@xxxxxxxxx>
> Subject: Re: [RFC PATCH v3 2/6] perf stat: Fork and launch perf record when
> perf stat needs to get retire latency value for a metric.
>
> On Fri, Mar 1, 2024 at 4:11 PM <weilin.wang@xxxxxxxxx> wrote:
> >
> > From: Weilin Wang <weilin.wang@xxxxxxxxx>
> >
> > When retire_latency value is used in a metric formula, perf stat would fork a
> > perf record process with "-e" and "-W" options. Perf record will collect
> > required retire_latency values in parallel while perf stat is collecting
> > counting values.
> >
> > At the point of time that perf stat stops counting, it would send sigterm
> signal
> > to perf record process and receiving sampling data back from perf record
> from a
> > pipe. Perf stat will then process the received data to get retire latency data
> > and calculate metric result.
> >
> > Signed-off-by: Weilin Wang <weilin.wang@xxxxxxxxx>
> > ---
> > tools/perf/builtin-stat.c | 179
> +++++++++++++++++++++++++++++++++-
> > tools/perf/util/data.c | 4 +
> > tools/perf/util/data.h | 1 +
> > tools/perf/util/metricgroup.h | 7 ++
> > tools/perf/util/stat.h | 3 +
> > 5 files changed, 191 insertions(+), 3 deletions(-)
> >
> > diff --git a/tools/perf/builtin-stat.c b/tools/perf/builtin-stat.c
> > index 5a3093541cff..3890a579349e 100644
> > --- a/tools/perf/builtin-stat.c
> > +++ b/tools/perf/builtin-stat.c
> > @@ -94,8 +94,13 @@
> > #include <perf/evlist.h>
> > #include <internal/threadmap.h>
> >
> > +#include "util/sample.h"
> > +#include <sys/param.h>
> > +#include <subcmd/run-command.h>
> > +
> > #define DEFAULT_SEPARATOR " "
> > #define FREEZE_ON_SMI_PATH "devices/cpu/freeze_on_smi"
> > +#define PERF_DATA "-"
> >
> > static void print_counters(struct timespec *ts, int argc, const char **argv);
> >
> > @@ -162,7 +167,8 @@ static struct perf_stat_config stat_config = {
> > .ctl_fd = -1,
> > .ctl_fd_ack = -1,
> > .iostat_run = false,
> > - .tpebs_event_size = 0,
> > + .tpebs_event_size = 0,
> > + .tpebs_pid = -1,
> > };
> >
> > static bool cpus_map_matched(struct evsel *a, struct evsel *b)
> > @@ -687,12 +693,163 @@ static enum counter_recovery
> stat_handle_error(struct evsel *counter)
> > return COUNTER_FATAL;
> > }
> >
> > -static int __run_perf_record(void)
> > +static int __run_perf_record(const char **record_argv)
> > {
> > + int i = 0;
> > + struct tpebs_event *e;
> > pr_debug("Prepare perf record for retire_latency\n");
> > +
> > +
> > + record_argv[i++] = "perf";
> > + record_argv[i++] = "record";
> > + record_argv[i++] = "-W";
> > +
> > + if (stat_config.user_requested_cpu_list) {
> > + record_argv[i++] = "-C";
> > + record_argv[i++] = stat_config.user_requested_cpu_list;
> > + }
> > +
> > + if (stat_config.system_wide)
> > + record_argv[i++] = "-a";
> > +
> > + list_for_each_entry(e, &stat_config.tpebs_events, nd) {
> > + record_argv[i++] = "-e";
> > + record_argv[i++] = e->name;
> > + }
> > +
> > + record_argv[i++] = "-o";
> > + record_argv[i++] = PERF_DATA;
> > +
> > return 0;
> > }
> >
> > +static void prepare_run_command(struct child_process *cmd,
> > + const char **argv)
> > +{
> > + memset(cmd, 0, sizeof(*cmd));
> > + cmd->argv = argv;
> > + cmd->out = -1;
> > +}
>
> A new line please.
>
> > +static int prepare_perf_record(struct child_process *cmd)
> > +{
> > + const char **record_argv;
> > +
> > + record_argv = calloc(10 + 2 * stat_config.tpebs_event_size, sizeof(char
> *));
> > + if (!record_argv)
> > + return -1;
> > + __run_perf_record(record_argv);
> > +
> > + prepare_run_command(cmd, record_argv);
> > + return start_command(cmd);
> > +}
> > +
> > +struct perf_script {
> > + struct perf_tool tool;
> > + struct perf_session *session;
> > + struct evswitch evswitch;
> > + struct perf_cpu_map *cpus;
> > + struct perf_thread_map *threads;
> > + int name_width;
> > +};
>
> Do we really need all these?
>
> > +
> > +static void tpebs_data__delete(void)
> > +{
> > + struct tpebs_retire_lat *r, *rtmp;
> > + struct tpebs_event *e, *etmp;
> > + list_for_each_entry_safe(r, rtmp, &stat_config.tpebs_results, nd) {
> > + list_del_init(&r->nd);
> > + free(r);
> > + }
> > + list_for_each_entry_safe(e, etmp, &stat_config.tpebs_events, nd) {
> > + list_del_init(&e->nd);
> > + free(e);
> > + }
> > +}
> > +
> > +static int process_sample_event(struct perf_tool *tool,
> > + union perf_event *event __maybe_unused,
> > + struct perf_sample *sample,
> > + struct evsel *evsel,
> > + struct machine *machine __maybe_unused)
> > +{
> > + struct perf_script *script = container_of(tool, struct perf_script, tool);
> > + int ret = 0;
> > + const char *evname;
> > + struct tpebs_retire_lat *t;
> > +
> > + pr_debug("entering function %s\n ", __func__);
> > + evname = evsel__name(evsel);
> > +
> > + pr_debug("[%03d] ", sample->cpu);
> > + pr_debug("%*s: ", script->name_width, evname ?: "[unknown]");
> > + pr_debug("%16" PRIu16, sample->retire_lat);
> > + pr_debug("\n");
>
> These debug messages can go away.
>
> > +
> > + // Need to handle per core results?
> > + // We are assuming average retire latency value will be used. Save the
> number of
> > + // samples and the sum of retire latency value for each event.
> > + list_for_each_entry(t, &stat_config.tpebs_results, nd) {
> > + if (!strcmp(evname, t->name)) {
> > + t->count += 1;
> > + t->sum += sample->retire_lat;
> > + break;
> > + }
> > + }
> > +
> > + return ret;
> > +}
> > +
> > +static int process_feature_event(struct perf_session *session,
> > + union perf_event *event)
> > +{
> > + if (event->feat.feat_id < HEADER_LAST_FEATURE)
> > + return perf_event__process_feature(session, event);
>
> Just curious. Why is this necessary?

Without this, we cannot successfully process retire_latency values when data is
passed from a PIPE. I'm not sure the real reason though.

>
>
> > + return 0;
> > +}
> > +
> > +static int __cmd_script(struct child_process *cmd __maybe_unused)
>
> __cmd_script() is probably not a good name. Maybe just
> process_tpebs_samples()? Also 'cmd' is actually used.
>
> > +{
> > + int err = 0;
> > + struct perf_session *session;
> > + struct perf_data data = {
> > + .mode = PERF_DATA_MODE_READ,
> > + .path = PERF_DATA,
> > + .fd = cmd->out,
> > + };
> > + struct perf_script script = {
> > + .tool = {
> > + .sample = process_sample_event,
> > + .ordered_events = true,
> > + .ordering_requires_timestamps = true,
>
> I don't think you need an ordered events queue for this.
>
>
> > + .feature = process_feature_event,
> > + .attr = perf_event__process_attr,
> > + },
> > + };
> > + struct tpebs_event *e;
> > +
> > + list_for_each_entry(e, &stat_config.tpebs_events, nd) {
> > + struct tpebs_retire_lat *new = malloc(sizeof(struct
> tpebs_retire_lat));
> > +
> > + if (!new)
> > + return -1;
> > + new->name = strdup(e->name);
> > + new->tpebs_name = strdup(e->tpebs_name);
> > + new->count = 0;
> > + new->sum = 0;
>
> I think you can simply use zalloc() instead.
>
>
> > + list_add_tail(&new->nd, &stat_config.tpebs_results);
> > + }
> > +
> > + kill(cmd->pid, SIGTERM);
> > + session = perf_session__new(&data, &script.tool);
> > + if (IS_ERR(session))
> > + return PTR_ERR(session);
> > + script.session = session;
> > + err = perf_session__process_events(session);
> > + perf_session__delete(session);
> > +
> > + return err;
> > +}
> > +
> > static int __run_perf_stat(int argc, const char **argv, int run_idx)
> > {
> > int interval = stat_config.interval;
> > @@ -709,12 +866,14 @@ static int __run_perf_stat(int argc, const char
> **argv, int run_idx)
> > struct affinity saved_affinity, *affinity = NULL;
> > int err;
> > bool second_pass = false;
> > + struct child_process cmd;
> >
> > //Prepare perf record for sampling event retire_latency before fork and
> prepare workload
> > if (stat_config.tpebs_event_size > 0) {
> > int ret;
> >
> > - ret = __run_perf_record();
> > + pr_debug("perf stat pid = %d\n", getpid());
> > + ret = prepare_perf_record(&cmd);
> > if (ret)
> > return ret;
> > }
> > @@ -924,6 +1083,17 @@ static int __run_perf_stat(int argc, const char
> **argv, int run_idx)
> >
> > t1 = rdclock();
> >
> > + if (stat_config.tpebs_event_size > 0) {
> > + int ret;
> > +
> > + pr_debug("pid = %d\n", getpid());
> > + pr_debug("cmd.pid = %d\n", cmd.pid);
> > +
> > + ret = __cmd_script(&cmd);
> > + close(cmd.out);
> > + pr_debug("%d\n", ret);
>
> Please remove all the unnecessary debug messages.
>
> And you don't need to wait for the target process to finish
> in order to process samples. Actually it'd increase the
> chance that the pipe is full during the execution.

This part is changed to use a separate thread in later commit.

>
>
> > + }
> > +
> > if (stat_config.walltime_run_table)
> > stat_config.walltime_run[run_idx] = t1 - t0;
> >
> > @@ -2761,6 +2931,7 @@ int cmd_stat(int argc, const char **argv)
> > }
> >
> > INIT_LIST_HEAD(&stat_config.tpebs_events);
> > + INIT_LIST_HEAD(&stat_config.tpebs_results);
> >
> > /*
> > * Metric parsing needs to be delayed as metrics may optimize events
> > @@ -2972,5 +3143,7 @@ int cmd_stat(int argc, const char **argv)
> > metricgroup__rblist_exit(&stat_config.metric_events);
> > evlist__close_control(stat_config.ctl_fd, stat_config.ctl_fd_ack,
> &stat_config.ctl_fd_close);
> >
> > + tpebs_data__delete();
> > +
> > return status;
> > }
> > diff --git a/tools/perf/util/data.c b/tools/perf/util/data.c
> > index 08c4bfbd817f..2e2a20fc5c30 100644
> > --- a/tools/perf/util/data.c
> > +++ b/tools/perf/util/data.c
> > @@ -185,6 +185,10 @@ static bool check_pipe(struct perf_data *data)
> > int fd = perf_data__is_read(data) ?
> > STDIN_FILENO : STDOUT_FILENO;
> >
> > + if (data->fd > 0) {
> > + fd = data->fd;
> > + }
>
> I'm not sure if it's needed. The data->path is set and it'll be
> checked in the below.

I think we need to set this fd when we use PIPE instead of a file. This fd is
opened in start_command() in run-command.c. So we need to pass this value
back at execution time.

>
>
> > +
> > if (!data->path) {
> > if (!fstat(fd, &st) && S_ISFIFO(st.st_mode))
> > is_pipe = true;
> > diff --git a/tools/perf/util/data.h b/tools/perf/util/data.h
> > index 110f3ebde30f..720638116ca0 100644
> > --- a/tools/perf/util/data.h
> > +++ b/tools/perf/util/data.h
> > @@ -28,6 +28,7 @@ struct perf_data_file {
> >
> > struct perf_data {
> > const char *path;
> > + int fd;
>
> Can we remove this?
>
>
> > struct perf_data_file file;
> > bool is_pipe;
> > bool is_dir;
> > diff --git a/tools/perf/util/metricgroup.h b/tools/perf/util/metricgroup.h
> > index 7c24ed768ff3..1fa12cc3294e 100644
> > --- a/tools/perf/util/metricgroup.h
> > +++ b/tools/perf/util/metricgroup.h
> > @@ -71,6 +71,13 @@ struct tpebs_event {
> > const char *name;
> > const char *tpebs_name;
> > };
>
> A new line please.
>
> > +struct tpebs_retire_lat {
> > + struct list_head nd;
> > + const char *name;
> > + const char *tpebs_name;
>
> What are the 'name' and 'tpebs_name'?

Sorry about this bad naming! There is a name change from event_name:R to event_name:p
at metric formula parsing time.
The "name" is meant for event_name (added :p for perf record purpose) and "tpebs_name"
is event_name:R (event_name with TPEBS modifier).

I will add some comments here. Please let me know if you have any name suggestions.

>
>
> > + size_t count;
> > + int sum;
> > +};
> >
> > struct metric_event *metricgroup__lookup(struct rblist *metric_events,
> > struct evsel *evsel,
> > diff --git a/tools/perf/util/stat.h b/tools/perf/util/stat.h
> > index b987960df3c5..2387c6289f93 100644
> > --- a/tools/perf/util/stat.h
> > +++ b/tools/perf/util/stat.h
> > @@ -111,6 +111,9 @@ struct perf_stat_config {
> > struct rblist metric_events;
> > struct list_head tpebs_events;
> > size_t tpebs_event_size;
> > + struct list_head tpebs_results;
> > + pid_t tpebs_pid;
> > + int tpebs_pipe;
>
> Did you use these?
>
>
> > int ctl_fd;
> > int ctl_fd_ack;
> > bool ctl_fd_close;
> > --
> > 2.43.0
> >