[PATCH v7 02/12] perf record: implement -f,--mmap-flush=<threshold> option

From: Alexey Budankov
Date: Tue Mar 12 2019 - 01:12:35 EST



Implement the -f,--mmap-flush option, which specifies the minimal size of
a data chunk that is extracted from the mmaped kernel buffer and stored
into a trace. The default option value is 1 byte, which means that every
time the trace writing thread finds some new data in the mmaped buffer,
the data is extracted, possibly compressed and written to the trace.

$ tools/perf/perf record -f 1024 -e cycles -- matrix.gcc
$ tools/perf/perf record --aio -f 1K -e cycles -- matrix.gcc

The option is independent of the -z setting, doesn't vary with the
compression level, and can serve two purposes.

The first purpose is to increase the compression ratio of trace data.
Larger data chunks are compressed more effectively, so the implemented
option allows specifying the data chunk size to compress. Also, in some
cases executing more write syscalls with smaller data sizes can take
longer than executing fewer write syscalls with bigger data sizes due to
syscall overhead, so extracting bigger data chunks, as specified by the
option value, could additionally decrease runtime overhead.

The second purpose is to avoid a self-monitoring live-lock issue in
system-wide (-a) profiling mode. Profiling in system-wide mode with
compression (-a -z) can additionally induce data into the kernel buffers
along with the data from the monitored processes. If the performance data
rate and volume from the monitored processes are high, then trace
streaming and compression activity in the tool is also high. High tool
process activity can lead to a subtle live-lock effect, where compression
of a single new byte from one of the mmaped kernel buffers leads to the
generation of the next single byte in some mmaped buffer. So the perf tool
process ends up in endless self-monitoring.

The implemented sync parameter is the means to force data to be moved
independently of the specified flush threshold value. Regardless of the
provided flush value, the tool needs the capability to unconditionally
drain memory buffers, at least at the end of the collection.

Signed-off-by: Alexey Budankov <alexey.budankov@xxxxxxxxxxxxxxx>
---
tools/perf/Documentation/perf-record.txt | 13 +++++
tools/perf/builtin-record.c | 65 +++++++++++++++++++++---
tools/perf/perf.h | 1 +
tools/perf/util/evlist.c | 6 +--
tools/perf/util/evlist.h | 3 +-
tools/perf/util/mmap.c | 4 +-
tools/perf/util/mmap.h | 3 +-
7 files changed, 83 insertions(+), 12 deletions(-)

diff --git a/tools/perf/Documentation/perf-record.txt b/tools/perf/Documentation/perf-record.txt
index 8f0c2be34848..d1e6c1fd7387 100644
--- a/tools/perf/Documentation/perf-record.txt
+++ b/tools/perf/Documentation/perf-record.txt
@@ -459,6 +459,19 @@ Set affinity mask of trace reading thread according to the policy defined by 'mo
node - thread affinity mask is set to NUMA node cpu mask of the processed mmap buffer
cpu - thread affinity mask is set to cpu of the processed mmap buffer

+-f::
+--mmap-flush=n::
+Specify the minimal number of bytes that is extracted from mmap data pages and stored
+into a trace. The number can be specified using B/K/M/G suffixes. The maximal allowed
+value is a quarter of the size of the mmaped data pages. The default option value is 1 byte,
+which means that every time the trace writing thread finds some new data in the mmaped buffer
+the data is extracted, possibly compressed (-z) and written to a trace. Larger data chunks
+are compressed more effectively than smaller chunks, so extracting larger
+chunks from the mmap data pages is preferable from the perspective of trace size reduction.
+Also, in some cases executing fewer trace write syscalls with bigger data sizes can take
+less time than executing more trace write syscalls with smaller data sizes, thus lowering
+runtime profiling overhead.
+
--all-kernel::
Configure all used events to run in kernel space.

diff --git a/tools/perf/builtin-record.c b/tools/perf/builtin-record.c
index a468d882e74f..736a0f008959 100644
--- a/tools/perf/builtin-record.c
+++ b/tools/perf/builtin-record.c
@@ -334,6 +334,41 @@ static int record__aio_enabled(struct record *rec)
return rec->opts.nr_cblocks > 0;
}

+#define MMAP_FLUSH_DEFAULT 1 /* default flush threshold, in bytes */
+static int record__mmap_flush_parse(const struct option *opt, /* -f/--mmap-flush option callback; always returns 0 */
+ const char *str, /* option argument: a byte count, optionally with a B/K/M/G suffix */
+ int unset) /* non-zero: return early without touching mmap_flush */
+{
+ int flush_max;
+ struct record_opts *opts = (struct record_opts *)opt->value; /* the option is registered with &record.opts as its value */
+ static struct parse_tag tags[] = { /* suffix -> byte multiplier table handed to parse_tag_value() */
+ { .tag = 'B', .mult = 1 },
+ { .tag = 'K', .mult = 1 << 10 },
+ { .tag = 'M', .mult = 1 << 20 },
+ { .tag = 'G', .mult = 1 << 30 },
+ { .tag = 0 }, /* terminator entry */
+ };
+
+ if (unset)
+ return 0;
+
+ if (str) {
+ opts->mmap_flush = parse_tag_value(str, tags);
+ if (opts->mmap_flush == (int)-1) /* presumably parse_tag_value()'s failure value — confirm */
+ opts->mmap_flush = strtol(str, NULL, 0); /* fallback: plain number; base 0 auto-detects decimal/hex/octal */
+ }
+
+ if (!opts->mmap_flush) /* a zero threshold is replaced by the default */
+ opts->mmap_flush = MMAP_FLUSH_DEFAULT;
+
+ flush_max = perf_evlist__mmap_size(opts->mmap_pages); /* NOTE(review): uses mmap_pages as set at parse time — confirm behaviour when -m follows -f */
+ flush_max /= 4; /* upper limit is a quarter of the mmap data size */
+ if (opts->mmap_flush > flush_max) /* clamp requests above the limit */
+ opts->mmap_flush = flush_max;
+
+ return 0;
+}
+
static int process_synthesized_event(struct perf_tool *tool,
union perf_event *event,
struct perf_sample *sample __maybe_unused,
@@ -543,7 +578,8 @@ static int record__mmap_evlist(struct record *rec,
if (perf_evlist__mmap_ex(evlist, opts->mmap_pages,
opts->auxtrace_mmap_pages,
opts->auxtrace_snapshot_mode,
- opts->nr_cblocks, opts->affinity) < 0) {
+ opts->nr_cblocks, opts->affinity,
+ opts->mmap_flush) < 0) {
if (errno == EPERM) {
pr_err("Permission error mapping pages.\n"
"Consider increasing "
@@ -733,7 +769,7 @@ static void record__adjust_affinity(struct record *rec, struct perf_mmap *map)
}

static int record__mmap_read_evlist(struct record *rec, struct perf_evlist *evlist,
- bool overwrite)
+ bool overwrite, bool sync)
{
u64 bytes_written = rec->bytes_written;
int i;
@@ -756,12 +792,19 @@ static int record__mmap_read_evlist(struct record *rec, struct perf_evlist *evli
off = record__aio_get_pos(trace_fd);

for (i = 0; i < evlist->nr_mmaps; i++) {
+ u64 flush = 0;
struct perf_mmap *map = &maps[i];

if (map->base) {
record__adjust_affinity(rec, map);
+ if (sync) {
+ flush = map->flush;
+ map->flush = 1;
+ }
if (!record__aio_enabled(rec)) {
if (perf_mmap__push(map, rec, record__pushfn) != 0) {
+ if (sync)
+ map->flush = flush;
rc = -1;
goto out;
}
@@ -774,10 +817,14 @@ static int record__mmap_read_evlist(struct record *rec, struct perf_evlist *evli
idx = record__aio_sync(map, false);
if (perf_mmap__aio_push(map, rec, idx, record__aio_pushfn, &off) != 0) {
record__aio_set_pos(trace_fd, off);
+ if (sync)
+ map->flush = flush;
rc = -1;
goto out;
}
}
+ if (sync)
+ map->flush = flush;
}

if (map->auxtrace_mmap.base && !rec->opts.auxtrace_snapshot_mode &&
@@ -803,15 +850,15 @@ static int record__mmap_read_evlist(struct record *rec, struct perf_evlist *evli
return rc;
}

-static int record__mmap_read_all(struct record *rec)
+static int record__mmap_read_all(struct record *rec, bool sync)
{
int err;

- err = record__mmap_read_evlist(rec, rec->evlist, false);
+ err = record__mmap_read_evlist(rec, rec->evlist, false, sync);
if (err)
return err;

- return record__mmap_read_evlist(rec, rec->evlist, true);
+ return record__mmap_read_evlist(rec, rec->evlist, true, sync);
}

static void record__init_features(struct record *rec)
@@ -1312,7 +1359,7 @@ static int __cmd_record(struct record *rec, int argc, const char **argv)
if (trigger_is_hit(&switch_output_trigger) || done || draining)
perf_evlist__toggle_bkw_mmap(rec->evlist, BKW_MMAP_DATA_PENDING);

- if (record__mmap_read_all(rec) < 0) {
+ if (record__mmap_read_all(rec, false) < 0) {
trigger_error(&auxtrace_snapshot_trigger);
trigger_error(&switch_output_trigger);
err = -1;
@@ -1413,6 +1460,7 @@ static int __cmd_record(struct record *rec, int argc, const char **argv)
record__synthesize_workload(rec, true);

out_child:
+ record__mmap_read_all(rec, true);
record__aio_mmap_read_sync(rec);

if (forks) {
@@ -1815,6 +1863,7 @@ static struct record record = {
.uses_mmap = true,
.default_per_cpu = true,
},
+ .mmap_flush = MMAP_FLUSH_DEFAULT,
},
.tool = {
.sample = process_sample_event,
@@ -1881,6 +1930,9 @@ static struct option __record_options[] = {
OPT_CALLBACK('m', "mmap-pages", &record.opts, "pages[,pages]",
"number of mmap data pages and AUX area tracing mmap pages",
record__parse_mmap_pages),
+ OPT_CALLBACK('f', "mmap-flush", &record.opts, "bytes",
+ "Minimal number of bytes that is extracted from mmap data pages (default: 1)",
+ record__mmap_flush_parse),
OPT_BOOLEAN(0, "group", &record.opts.group,
"put the counters into a counter group"),
OPT_CALLBACK_NOOPT('g', NULL, &callchain_param,
@@ -2184,6 +2236,7 @@ int cmd_record(int argc, const char **argv)
pr_info("nr_cblocks: %d\n", rec->opts.nr_cblocks);

pr_debug("affinity: %s\n", affinity_tags[rec->opts.affinity]);
+ pr_debug("mmap flush: %d\n", rec->opts.mmap_flush);

err = __cmd_record(&record, argc, argv);
out:
diff --git a/tools/perf/perf.h b/tools/perf/perf.h
index b120e547ddc7..7886cc9771cf 100644
--- a/tools/perf/perf.h
+++ b/tools/perf/perf.h
@@ -85,6 +85,7 @@ struct record_opts {
u64 clockid_res_ns;
int nr_cblocks;
int affinity;
+ int mmap_flush;
};

enum perf_affinity {
diff --git a/tools/perf/util/evlist.c b/tools/perf/util/evlist.c
index ed20f4379956..8858d829983b 100644
--- a/tools/perf/util/evlist.c
+++ b/tools/perf/util/evlist.c
@@ -1037,7 +1037,7 @@ int perf_evlist__parse_mmap_pages(const struct option *opt, const char *str,
*/
int perf_evlist__mmap_ex(struct perf_evlist *evlist, unsigned int pages,
unsigned int auxtrace_pages,
- bool auxtrace_overwrite, int nr_cblocks, int affinity)
+ bool auxtrace_overwrite, int nr_cblocks, int affinity, int flush)
{
struct perf_evsel *evsel;
const struct cpu_map *cpus = evlist->cpus;
@@ -1047,7 +1047,7 @@ int perf_evlist__mmap_ex(struct perf_evlist *evlist, unsigned int pages,
* Its value is decided by evsel's write_backward.
* So &mp should not be passed through const pointer.
*/
- struct mmap_params mp = { .nr_cblocks = nr_cblocks, .affinity = affinity };
+ struct mmap_params mp = { .nr_cblocks = nr_cblocks, .affinity = affinity, .flush = flush };

if (!evlist->mmap)
evlist->mmap = perf_evlist__alloc_mmap(evlist, false);
@@ -1079,7 +1079,7 @@ int perf_evlist__mmap_ex(struct perf_evlist *evlist, unsigned int pages,

int perf_evlist__mmap(struct perf_evlist *evlist, unsigned int pages)
{
- return perf_evlist__mmap_ex(evlist, pages, 0, false, 0, PERF_AFFINITY_SYS);
+ return perf_evlist__mmap_ex(evlist, pages, 0, false, 0, PERF_AFFINITY_SYS, 1);
}

int perf_evlist__create_maps(struct perf_evlist *evlist, struct target *target)
diff --git a/tools/perf/util/evlist.h b/tools/perf/util/evlist.h
index 744906dd4887..edf18811e39f 100644
--- a/tools/perf/util/evlist.h
+++ b/tools/perf/util/evlist.h
@@ -165,7 +165,8 @@ unsigned long perf_event_mlock_kb_in_pages(void);

int perf_evlist__mmap_ex(struct perf_evlist *evlist, unsigned int pages,
unsigned int auxtrace_pages,
- bool auxtrace_overwrite, int nr_cblocks, int affinity);
+ bool auxtrace_overwrite, int nr_cblocks,
+ int affinity, int flush);
int perf_evlist__mmap(struct perf_evlist *evlist, unsigned int pages);
void perf_evlist__munmap(struct perf_evlist *evlist);

diff --git a/tools/perf/util/mmap.c b/tools/perf/util/mmap.c
index cdc7740fc181..ef3d79b2c90b 100644
--- a/tools/perf/util/mmap.c
+++ b/tools/perf/util/mmap.c
@@ -440,6 +440,8 @@ int perf_mmap__mmap(struct perf_mmap *map, struct mmap_params *mp, int fd, int c

perf_mmap__setup_affinity_mask(map, mp);

+ map->flush = mp->flush;
+
if (auxtrace_mmap__mmap(&map->auxtrace_mmap,
&mp->auxtrace_mp, map->base, fd))
return -1;
@@ -492,7 +494,7 @@ static int __perf_mmap__read_init(struct perf_mmap *md)
md->start = md->overwrite ? head : old;
md->end = md->overwrite ? old : head;

- if (md->start == md->end)
+ if ((md->end - md->start) < md->flush)
return -EAGAIN;

size = md->end - md->start;
diff --git a/tools/perf/util/mmap.h b/tools/perf/util/mmap.h
index e566c19b242b..b82f8c2d55c4 100644
--- a/tools/perf/util/mmap.h
+++ b/tools/perf/util/mmap.h
@@ -39,6 +39,7 @@ struct perf_mmap {
} aio;
#endif
cpu_set_t affinity_mask;
+ u64 flush;
};

/*
@@ -70,7 +71,7 @@ enum bkw_mmap_state {
};

struct mmap_params {
- int prot, mask, nr_cblocks, affinity;
+ int prot, mask, nr_cblocks, affinity, flush;
struct auxtrace_mmap_params auxtrace_mp;
};

--
2.20.1