[PATCH v2 4/7] perf: avoid a bounded set of visit_groups_merge iterators

From: Ian Rogers
Date: Wed Jul 24 2019 - 18:38:51 EST


Create a per-CPU array of iterators that is resized when cgroup events
are added. The size of the array reflects the maximum depth of the
cgroups being monitored, plus one entry for system-wide events, even
though not every cgroup in the hierarchy will have events monitored
within it. This approach avoids adding storage cost to struct
perf_event.

Signed-off-by: Ian Rogers <irogers@xxxxxxxxxx>
---
include/linux/perf_event.h | 2 +
kernel/events/core.c | 87 +++++++++++++++++++++++++++++++-------
2 files changed, 73 insertions(+), 16 deletions(-)
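
A note for reviewers: the resizing policy described above is simple
enough that a small, self-contained sketch may help. The userspace C
program below mimics it with realloc() standing in for krealloc(); the
names struct cpu_ctx, grow_iterator_storage() and MIN_ITERATORS are
illustrative stand-ins, not the kernel identifiers used in the diff.

#include <stdio.h>
#include <stdlib.h>

struct event;                      /* stand-in for struct perf_event */

struct cpu_ctx {                   /* stand-in for struct perf_cpu_context */
	struct event **iter_storage;   /* per-CPU iterator array */
	int iter_storage_size;         /* current number of slots */
};

#define MIN_ITERATORS 2            /* per-CPU and any-CPU events */

/* Grow the array to one slot per cgroup level plus one for system-wide. */
static int grow_iterator_storage(struct cpu_ctx *ctx, int cgroup_depth)
{
	int want = cgroup_depth + 1;
	struct event **tmp;

	if (want <= ctx->iter_storage_size)
		return 0;              /* already large enough */

	tmp = realloc(ctx->iter_storage, sizeof(*tmp) * want);
	if (!tmp)
		return -1;             /* keep the old, smaller array */

	ctx->iter_storage = tmp;
	ctx->iter_storage_size = want;
	return 0;
}

int main(void)
{
	struct cpu_ctx ctx = {
		.iter_storage = calloc(MIN_ITERATORS, sizeof(struct event *)),
		.iter_storage_size = MIN_ITERATORS,
	};

	/* Installing a cgroup event three levels deep grows the array to 4. */
	grow_iterator_storage(&ctx, 3);
	printf("iterator slots: %d\n", ctx.iter_storage_size);

	free(ctx.iter_storage);
	return 0;
}

The kernel patch applies the same policy from __perf_install_in_context():
the array only ever grows, and on krealloc() failure the existing storage
is kept and the install fails with -ENOMEM.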

diff --git a/include/linux/perf_event.h b/include/linux/perf_event.h
index e8ad3c590a23..43f90cfa2c39 100644
--- a/include/linux/perf_event.h
+++ b/include/linux/perf_event.h
@@ -802,6 +802,8 @@ struct perf_cpu_context {
#ifdef CONFIG_CGROUP_PERF
struct perf_cgroup *cgrp;
struct list_head cgrp_cpuctx_entry;
+ struct perf_event **visit_groups_merge_iterator_storage;
+ int visit_groups_merge_iterator_storage_size;
#endif

struct list_head sched_cb_entry;
diff --git a/kernel/events/core.c b/kernel/events/core.c
index 4d70df0415b9..2a2188908bed 100644
--- a/kernel/events/core.c
+++ b/kernel/events/core.c
@@ -1709,6 +1709,20 @@ perf_event_groups_next(struct perf_event *event)
return next;
}

+#ifdef CONFIG_CGROUP_PERF
+int perf_event_cgroup_depth(struct perf_event *event)
+{
+ struct cgroup_subsys_state *css;
+ struct perf_cgroup *cgrp = event->cgrp;
+ int depth = 0;
+
+ if (cgrp)
+ for (css = &cgrp->css; css; css = css->parent)
+ depth++;
+ return depth;
+}
+#endif
+
/*
* Iterate through the whole groups tree.
*/
@@ -2590,6 +2604,7 @@ static int __perf_install_in_context(void *info)

#ifdef CONFIG_CGROUP_PERF
if (is_cgroup_event(event)) {
+ int max_iterators;
/*
* If the current cgroup doesn't match the event's
* cgroup, we should not try to schedule it.
@@ -2597,6 +2612,30 @@ static int __perf_install_in_context(void *info)
struct perf_cgroup *cgrp = perf_cgroup_from_task(current, ctx);
reprogram = cgroup_is_descendant(cgrp->css.cgroup,
event->cgrp->css.cgroup);
+
+ /*
+ * Ensure space for the visit_groups_merge iterator storage. With
+ * cgroup profiling there may be an event at each cgroup depth plus
+ * system-wide events.
+ */
+ max_iterators = perf_event_cgroup_depth(event) + 1;
+ if (max_iterators >
+ cpuctx->visit_groups_merge_iterator_storage_size) {
+ struct perf_event **storage =
+ krealloc(cpuctx->visit_groups_merge_iterator_storage,
+ sizeof(struct perf_event *) * max_iterators,
+ GFP_KERNEL);
+ if (storage) {
+ cpuctx->visit_groups_merge_iterator_storage
+ = storage;
+ cpuctx->visit_groups_merge_iterator_storage_size
+ = max_iterators;
+ } else {
+ WARN_ONCE(1, "Unable to increase iterator "
+ "storage for perf events with cgroups");
+ ret = -ENOMEM;
+ }
+ }
}
#endif

@@ -3394,6 +3433,13 @@ static void min_heap_pop_push(struct perf_event_heap *heap,
}
}

+
+/*
+ * Without cgroups, a task context may have both per-CPU and
+ * any-CPU events.
+ */
+#define MIN_VISIT_GROUP_MERGE_ITERATORS 2
+
static int visit_groups_merge(struct perf_event_context *ctx,
struct perf_cpu_context *cpuctx,
struct perf_event_groups *groups,
@@ -3405,22 +3451,25 @@ static int visit_groups_merge(struct perf_event_context *ctx,
{
/*
* A set of iterators, the iterator for the visit is chosen by the
- * group_index. The size of the array is sized such that there is space:
- * - for task contexts per-CPU and any-CPU events can be iterated.
- * - for CPU contexts:
- * - without cgroups, global events can be iterated.
- * - with cgroups, global events can be iterated and 16 sets of cgroup
- * events. Cgroup events may monitor a cgroup at an arbitrary
- * location within the cgroup hierarchy. An iterator is needed for
- * each cgroup with events in the hierarchy. Potentially this is
- * only limited by MAX_PATH.
- */
- struct perf_event *itrs[IS_ENABLED(CONFIG_CGROUP_PERF) ? 17 : 2];
+ * group_index.
+ */
+#ifndef CONFIG_CGROUP_PERF
+ struct perf_event *itrs[MIN_VISIT_GROUP_MERGE_ITERATORS];
struct perf_event_heap heap = {
.storage = itrs,
.num_elements = 0,
- .max_elements = ARRAY_SIZE(itrs)
+ .max_elements = MIN_VISIT_GROUP_MERGE_ITERATORS
+ };
+#else
+ /*
+ * With cgroups, use the iterator storage reserved in the CPU context.
+ */
+ struct perf_event_heap heap = {
+ .storage = cpuctx->visit_groups_merge_iterator_storage,
+ .num_elements = 0,
+ .max_elements = cpuctx->visit_groups_merge_iterator_storage_size
};
+#endif
int ret, cpu = smp_processor_id();

heap.storage[0] = perf_event_groups_first(groups, cpu, NULL);
@@ -3455,9 +3504,8 @@ static int visit_groups_merge(struct perf_event_context *ctx,
heap.num_elements++;
if (heap.num_elements ==
heap.max_elements) {
- WARN_ONCE(
- max_cgroups_with_events_depth,
- "Insufficient iterators for cgroup depth");
+ WARN_ONCE(1,
+ "per-CPU min-heap under sized");
break;
}
}
@@ -10167,7 +10215,14 @@ int perf_pmu_register(struct pmu *pmu, const char *name, int type)
lockdep_set_class(&cpuctx->ctx.lock, &cpuctx_lock);
cpuctx->ctx.pmu = pmu;
cpuctx->online = cpumask_test_cpu(cpu, perf_online_mask);
-
+#ifdef CONFIG_CGROUP_PERF
+ cpuctx->visit_groups_merge_iterator_storage =
+ kmalloc_array(MIN_VISIT_GROUP_MERGE_ITERATORS,
+ sizeof(struct perf_event *),
+ GFP_KERNEL);
+ cpuctx->visit_groups_merge_iterator_storage_size =
+ MIN_VISIT_GROUP_MERGE_ITERATORS;
+#endif
__perf_mux_hrtimer_init(cpuctx, cpu);
}

--
2.22.0.709.g102302147b-goog