[PATCH V2 3/4] perf cgroup: Add new RB tree keys for cgroup

From: kan . liang
Date: Wed May 15 2019 - 17:04:04 EST


From: Kan Liang <kan.liang@xxxxxxxxxxxxxxx>

Current RB tree for pinned/flexible groups doesn't take cgroup into
account. All events on a given CPU will be fed to
pinned/flexible_sched_in(), which relies on perf_cgroup_match() to
filter the events for a specific cgroup. The method has high overhead,
especially in frequent context switch with several events and cgroups
involved.

Add new RB tree keys, cgrp_id and cgrp_group_index, for cgroup.
The unique cgrp_id (the same as the css subsys-unique ID) is used to
indicate a cgroup. Events in the same cgroup have the same cgrp_id.
The cgrp_id is always zero for the non-cgroup case. There is no
functional change for the non-cgroup case.
The cgrp_group_index is used for multiplexing. The rotated events of a
cgroup have the same cgrp_group_index, which equals the (group_index - 1)
of the first rotated event.
The non-cgroup events, e.g. system-wide events, are treated as a special
cgroup. The cgrp_group_index is also updated during multiplexing.

Add percpu pinned/flexible_event in perf_cgroup to track the leftmost
event for a cgroup, which will be used later for fast access to the
event of a given cgroup.
Add percpu rotated_event to track the rotated events of a cgroup.

Add perf_event_groups_first_cgroup() to find the leftmost event for a
given cgroup ID and cgrp_group_index on a given CPU.

Signed-off-by: Kan Liang <kan.liang@xxxxxxxxxxxxxxx>
---
include/linux/perf_event.h | 5 ++
kernel/events/core.c | 217 ++++++++++++++++++++++++++++++++++++++++++---
2 files changed, 210 insertions(+), 12 deletions(-)

diff --git a/include/linux/perf_event.h b/include/linux/perf_event.h
index 3f12937..800bf62 100644
--- a/include/linux/perf_event.h
+++ b/include/linux/perf_event.h
@@ -703,6 +703,8 @@ struct perf_event {

#ifdef CONFIG_CGROUP_PERF
struct perf_cgroup *cgrp; /* cgroup event is attach to */
+ u64 cgrp_id; /* perf cgroup ID */
+ u64 cgrp_group_index;
#endif

struct list_head sb_list;
@@ -837,6 +839,9 @@ struct perf_cgroup {
struct cgroup_subsys_state css;
struct perf_cgroup_info __percpu *info;
int cgrp_event_type;
+ struct perf_event * __percpu *pinned_event;
+ struct perf_event * __percpu *flexible_event;
+ struct perf_event * __percpu *rotated_event;
};

/*
diff --git a/kernel/events/core.c b/kernel/events/core.c
index a3885e68..6891c74 100644
--- a/kernel/events/core.c
+++ b/kernel/events/core.c
@@ -717,6 +717,7 @@ static inline void perf_detach_cgroup(struct perf_event *event)
{
css_put(&event->cgrp->css);
event->cgrp = NULL;
+ event->cgrp_id = 0;
}

static inline int is_cgroup_event(struct perf_event *event)
@@ -961,6 +962,7 @@ static inline int perf_cgroup_connect(int fd, struct perf_event *event,

cgrp = container_of(css, struct perf_cgroup, css);
event->cgrp = cgrp;
+ event->cgrp_id = css->id;

if (event->attr.pinned)
cgrp->cgrp_event_type |= EVENT_CGROUP_PINNED_ONLY;
@@ -1561,6 +1563,9 @@ static void init_event_group(struct perf_event *event)
{
RB_CLEAR_NODE(&event->group_node);
event->group_index = 0;
+#ifdef CONFIG_CGROUP_PERF
+ event->cgrp_group_index = 0;
+#endif
}

/*
@@ -1588,8 +1593,8 @@ static void perf_event_groups_init(struct perf_event_groups *groups)
/*
* Compare function for event groups;
*
- * Implements complex key that first sorts by CPU and then by virtual index
- * which provides ordering when rotating groups for the same CPU.
+ * Implements complex key that sorts by CPU, cgroup index, cgroup ID, and
+ * virtual index which provides ordering when rotating groups for the same CPU.
*/
static bool
perf_event_groups_less(struct perf_event *left, struct perf_event *right)
@@ -1599,6 +1604,18 @@ perf_event_groups_less(struct perf_event *left, struct perf_event *right)
if (left->cpu > right->cpu)
return false;

+#ifdef CONFIG_CGROUP_PERF
+ if (left->cgrp_group_index < right->cgrp_group_index)
+ return true;
+ if (left->cgrp_group_index > right->cgrp_group_index)
+ return false;
+
+ if (left->cgrp_id < right->cgrp_id)
+ return true;
+ if (left->cgrp_id > right->cgrp_id)
+ return false;
+#endif
+
if (left->group_index < right->group_index)
return true;
if (left->group_index > right->group_index)
@@ -1608,13 +1625,14 @@ perf_event_groups_less(struct perf_event *left, struct perf_event *right)
}

/*
- * Insert @event into @groups' tree; using {@event->cpu, ++@groups->index} for
- * key (see perf_event_groups_less). This places it last inside the CPU
+ * Insert @event into @groups' tree; Using
+ * {@event->cpu, @event->cgrp_group_index, @event->cgrp_id, ++@groups->index}
+ * for key (see perf_event_groups_less). This places it last inside the CPU
* subtree.
*/
static void
-perf_event_groups_insert(struct perf_event_groups *groups,
- struct perf_event *event)
+__perf_event_groups_insert(struct perf_event_groups *groups,
+ struct perf_event *event)
{
struct perf_event *node_event;
struct rb_node *parent;
@@ -1639,6 +1657,10 @@ perf_event_groups_insert(struct perf_event_groups *groups,
rb_insert_color(&event->group_node, &groups->tree);
}

+static void
+perf_event_groups_insert(struct perf_event_groups *groups,
+ struct perf_event *event);
+
/*
* Helper function to insert event into the pinned or flexible groups.
*/
@@ -1655,8 +1677,8 @@ add_event_to_groups(struct perf_event *event, struct perf_event_context *ctx)
* Delete a group from a tree.
*/
static void
-perf_event_groups_delete(struct perf_event_groups *groups,
- struct perf_event *event)
+__perf_event_groups_delete(struct perf_event_groups *groups,
+ struct perf_event *event)
{
WARN_ON_ONCE(RB_EMPTY_NODE(&event->group_node) ||
RB_EMPTY_ROOT(&groups->tree));
@@ -1665,6 +1687,10 @@ perf_event_groups_delete(struct perf_event_groups *groups,
init_event_group(event);
}

+static void
+perf_event_groups_delete(struct perf_event_groups *groups,
+ struct perf_event *event);
+
/*
* Helper function to delete event from its groups.
*/
@@ -1717,6 +1743,129 @@ perf_event_groups_next(struct perf_event *event)
return NULL;
}

+#ifdef CONFIG_CGROUP_PERF
+
+static struct perf_event *
+perf_event_groups_first_cgroup(struct perf_event_groups *groups,
+ int cpu, u64 cgrp_group_index, u64 cgrp_id)
+{
+ struct perf_event *node_event = NULL, *match = NULL;
+ struct rb_node *node = groups->tree.rb_node;
+
+ while (node) {
+ node_event = container_of(node, struct perf_event, group_node);
+
+ if (cpu < node_event->cpu) {
+ node = node->rb_left;
+ } else if (cpu > node_event->cpu) {
+ node = node->rb_right;
+ } else {
+ if (cgrp_group_index < node_event->cgrp_group_index)
+ node = node->rb_left;
+ else if (cgrp_group_index > node_event->cgrp_group_index)
+ node = node->rb_right;
+ else {
+
+ if (cgrp_id < node_event->cgrp_id)
+ node = node->rb_left;
+ else if (cgrp_id > node_event->cgrp_id)
+ node = node->rb_right;
+ else {
+ match = node_event;
+ node = node->rb_left;
+ }
+ }
+ }
+ }
+ return match;
+}
+
+static void
+perf_event_groups_insert(struct perf_event_groups *groups,
+ struct perf_event *event)
+{
+ struct perf_event **cgrp_event, **rotated_event;
+
+ __perf_event_groups_insert(groups, event);
+
+ if (is_cgroup_event(event)) {
+ if (event->attr.pinned)
+ cgrp_event = per_cpu_ptr(event->cgrp->pinned_event, event->cpu);
+ else {
+ cgrp_event = per_cpu_ptr(event->cgrp->flexible_event, event->cpu);
+ rotated_event = per_cpu_ptr(event->cgrp->rotated_event, event->cpu);
+
+ /* Add the first rotated event into *rotated_event */
+ if (*cgrp_event && !*rotated_event &&
+ (event->cgrp_group_index > (*cgrp_event)->cgrp_group_index))
+ *rotated_event = event;
+
+ /*
+ * *cgrp_event always point to the unrotated events.
+ * All events have been rotated.
+ * Update *cgrp_event and *rotated_event for next round.
+ */
+ if (!*cgrp_event && *rotated_event) {
+ *cgrp_event = *rotated_event;
+ *rotated_event = NULL;
+ }
+ }
+ /*
+ * Cgroup events for the same cgroup on the same CPU will
+ * always be inserted at the right because of bigger
+ * @groups->index.
+ */
+ if (!*cgrp_event)
+ *cgrp_event = event;
+ }
+}
+
+static void
+perf_event_groups_delete(struct perf_event_groups *groups,
+ struct perf_event *event)
+{
+ struct perf_event **cgrp_event, **rotated_event;
+
+ __perf_event_groups_delete(groups, event);
+
+ if (is_cgroup_event(event)) {
+ if (event->attr.pinned)
+ cgrp_event = per_cpu_ptr(event->cgrp->pinned_event, event->cpu);
+ else {
+ cgrp_event = per_cpu_ptr(event->cgrp->flexible_event, event->cpu);
+ rotated_event = per_cpu_ptr(event->cgrp->rotated_event, event->cpu);
+ if (*rotated_event == event) {
+ *rotated_event = perf_event_groups_first_cgroup(groups, event->cpu,
+ event->cgrp_group_index,
+ event->cgrp_id);
+ }
+ }
+ if (*cgrp_event == event) {
+ *cgrp_event = perf_event_groups_first_cgroup(groups, event->cpu,
+ event->cgrp_group_index,
+ event->cgrp_id);
+ }
+ }
+}
+
+#else /* !CONFIG_CGROUP_PERF */
+
+static void
+perf_event_groups_insert(struct perf_event_groups *groups,
+ struct perf_event *event)
+{
+ __perf_event_groups_insert(groups, event);
+}
+
+static void
+perf_event_groups_delete(struct perf_event_groups *groups,
+ struct perf_event *event)
+{
+ __perf_event_groups_delete(groups, event);
+}
+
+#endif
+
/*
* Iterate through the whole groups tree.
*/
@@ -3757,6 +3906,10 @@ static void perf_adjust_freq_unthr_context(struct perf_event_context *ctx,
*/
static void rotate_ctx(struct perf_event_context *ctx, struct perf_event *event)
{
+#ifdef CONFIG_CGROUP_PERF
+ struct perf_cpu_context *cpuctx = __get_cpu_context(ctx);
+ struct perf_event **rotated_event;
+#endif
/*
* Rotate the first entry last of non-pinned groups. Rotation might be
* disabled by the inheritance code.
@@ -3765,6 +3918,22 @@ static void rotate_ctx(struct perf_event_context *ctx, struct perf_event *event)
return;

perf_event_groups_delete(&ctx->flexible_groups, event);
+
+#ifdef CONFIG_CGROUP_PERF
+
+ /* Rotate cgroups */
+ if (&cpuctx->ctx == ctx) {
+ if (event->cgrp) {
+ rotated_event = per_cpu_ptr(event->cgrp->rotated_event, event->cpu);
+ if (!*rotated_event)
+ event->cgrp_group_index = ctx->flexible_groups.index;
+ else
+ event->cgrp_group_index = (*rotated_event)->cgrp_group_index;
+ } else
+ event->cgrp_group_index = ctx->flexible_groups.index;
+ }
+#endif
+
perf_event_groups_insert(&ctx->flexible_groups, event);
}

@@ -12196,18 +12365,42 @@ perf_cgroup_css_alloc(struct cgroup_subsys_state *parent_css)
return ERR_PTR(-ENOMEM);

jc->info = alloc_percpu(struct perf_cgroup_info);
- if (!jc->info) {
- kfree(jc);
- return ERR_PTR(-ENOMEM);
- }
+ if (!jc->info)
+ goto free_jc;
+
+ jc->pinned_event = alloc_percpu(struct perf_event *);
+ if (!jc->pinned_event)
+ goto free_jc_info;
+
+ jc->flexible_event = alloc_percpu(struct perf_event *);
+ if (!jc->flexible_event)
+ goto free_jc_pinned;
+
+ jc->rotated_event = alloc_percpu(struct perf_event *);
+ if (!jc->rotated_event)
+ goto free_jc_flexible;

return &jc->css;
+
+free_jc_flexible:
+ free_percpu(jc->flexible_event);
+free_jc_pinned:
+ free_percpu(jc->pinned_event);
+free_jc_info:
+ free_percpu(jc->info);
+free_jc:
+ kfree(jc);
+
+ return ERR_PTR(-ENOMEM);
}

static void perf_cgroup_css_free(struct cgroup_subsys_state *css)
{
struct perf_cgroup *jc = container_of(css, struct perf_cgroup, css);

+ free_percpu(jc->pinned_event);
+ free_percpu(jc->flexible_event);
+ free_percpu(jc->rotated_event);
free_percpu(jc->info);
kfree(jc);
}
--
2.7.4