[PATCH v6 1/3] perf/core: use rb trees for pinned/flexible groups

From: Alexey Budankov
Date: Wed Aug 02 2017 - 04:14:04 EST


This patch moves event groups into rb tree sorted by CPU, so that
multiplexing hrtimer interrupt handler would be able skipping to the current
CPU's list and ignore groups allocated for the other CPUs.

New API for manipulating event groups in the trees is implemented as well
as adoption on the API in the current implementation.

Because perf_event_groups_iterate() API provides capability to execute
a callback for every event group in a tree, adoption of the API introduces
some code that packs and unpacks arguments of functions existing in
the implementation as well as adjustments of their calling signatures
e.g. ctx_pinned_sched_in(), ctx_flexible_sched_in() and inherit_task_group().

Signed-off-by: Alexey Budankov <alexey.budankov@xxxxxxxxxxxxxxx>
---
include/linux/perf_event.h | 18 ++-
kernel/events/core.c | 389 +++++++++++++++++++++++++++++++++------------
2 files changed, 306 insertions(+), 101 deletions(-)

diff --git a/include/linux/perf_event.h b/include/linux/perf_event.h
index a3b873f..282f121 100644
--- a/include/linux/perf_event.h
+++ b/include/linux/perf_event.h
@@ -572,6 +572,20 @@ struct perf_event {
*/
struct list_head group_entry;
struct list_head sibling_list;
+ /*
+ * Node on the pinned or flexible tree located at the event context;
+ * the node may be empty in case its event is not directly attached
+ * to the tree but to group_list list of the event directly
+ * attached to the tree;
+ */
+ struct rb_node group_node;
+ /*
+ * List keeps groups allocated for the same cpu;
+ * the list may be empty in case its event is not directly
+ * attached to the tree but to group_list list of the event directly
+ * attached to the tree;
+ */
+ struct list_head group_list;

/*
* We need storage to track the entries in perf_pmu_migrate_context; we
@@ -741,8 +755,8 @@ struct perf_event_context {
struct mutex mutex;

struct list_head active_ctx_list;
- struct list_head pinned_groups;
- struct list_head flexible_groups;
+ struct rb_root pinned_groups;
+ struct rb_root flexible_groups;
struct list_head event_list;
int nr_events;
int nr_active;
diff --git a/kernel/events/core.c b/kernel/events/core.c
index 426c2ff..0a4f619 100644
--- a/kernel/events/core.c
+++ b/kernel/events/core.c
@@ -1466,8 +1466,12 @@ static enum event_type_t get_event_type(struct perf_event *event)
return event_type;
}

-static struct list_head *
-ctx_group_list(struct perf_event *event, struct perf_event_context *ctx)
+/*
+ * Extract pinned or flexible groups from the context
+ * based on event attrs bits;
+ */
+static struct rb_root *
+get_event_groups(struct perf_event *event, struct perf_event_context *ctx)
{
if (event->attr.pinned)
return &ctx->pinned_groups;
@@ -1475,6 +1479,160 @@ ctx_group_list(struct perf_event *event, struct perf_event_context *ctx)
return &ctx->flexible_groups;
}

+static void
+perf_event_groups_insert(struct rb_root *groups,
+ struct perf_event *event);
+
+static void
+perf_event_groups_delete(struct rb_root *groups,
+ struct perf_event *event);
+
+/*
+ * Helper function to insert event into the pinned or
+ * flexible groups;
+ */
+static void
+add_event_to_groups(struct perf_event *event, struct perf_event_context *ctx)
+{
+ struct rb_root *groups;
+
+ groups = get_event_groups(event, ctx);
+ perf_event_groups_insert(groups, event);
+}
+
+/*
+ * Helper function to delete event from its groups;
+ */
+static void
+del_event_from_groups(struct perf_event *event, struct perf_event_context *ctx)
+{
+ struct rb_root *groups;
+
+ groups = get_event_groups(event, ctx);
+ perf_event_groups_delete(groups, event);
+}
+
+/*
+ * Insert a group into a tree using event->cpu as a key. If event->cpu node
+ * is already attached to the tree then the event is added to the attached
+ * group's group_list list.
+ */
+static void
+perf_event_groups_insert(struct rb_root *groups,
+ struct perf_event *event)
+{
+ struct rb_node **node;
+ struct rb_node *parent;
+ struct perf_event *node_event;
+
+ node = &groups->rb_node;
+ parent = *node;
+
+ while (*node) {
+ parent = *node;
+ node_event = container_of(*node,
+ struct perf_event, group_node);
+
+ if (event->cpu < node_event->cpu) {
+ node = &parent->rb_left;
+ } else if (event->cpu > node_event->cpu) {
+ node = &parent->rb_right;
+ } else {
+ list_add_tail(&event->group_entry,
+ &node_event->group_list);
+ return;
+ }
+ }
+
+ list_add_tail(&event->group_entry, &event->group_list);
+
+ rb_link_node(&event->group_node, parent, node);
+ rb_insert_color(&event->group_node, groups);
+}
+
+/*
+ * Delete a group from a tree. If the group is directly attached to the tree
+ * it also detaches all groups on the group's group_list list.
+ */
+static void
+perf_event_groups_delete(struct rb_root *groups,
+ struct perf_event *event)
+{
+ struct perf_event *next;
+
+ list_del_init(&event->group_entry);
+
+ if (!RB_EMPTY_NODE(&event->group_node)) {
+ if (!RB_EMPTY_ROOT(groups)) {
+ if (list_empty(&event->group_list)) {
+ rb_erase(&event->group_node, groups);
+ } else {
+ next = list_first_entry(&event->group_list,
+ struct perf_event, group_entry);
+ list_replace_init(&event->group_list,
+ &next->group_list);
+ rb_replace_node(&event->group_node,
+ &next->group_node, groups);
+ }
+ }
+ RB_CLEAR_NODE(&event->group_node);
+ }
+}
+
+/*
+ * Find group list by a cpu key and rotate it.
+ */
+static void
+perf_event_groups_rotate(struct rb_root *groups, int cpu)
+{
+ struct rb_node *node;
+ struct perf_event *node_event;
+
+ node = groups->rb_node;
+
+ while (node) {
+ node_event = container_of(node,
+ struct perf_event, group_node);
+
+ if (cpu < node_event->cpu) {
+ node = node->rb_left;
+ } else if (cpu > node_event->cpu) {
+ node = node->rb_right;
+ } else {
+ list_rotate_left(&node_event->group_list);
+ break;
+ }
+ }
+}
+
+typedef int(*perf_event_groups_iterate_f)(struct perf_event *, void *);
+
+/*
+ * Iterate event groups and call provided callback for every group in the tree.
+ * Iteration stops if the callback returns non zero.
+ */
+static int
+perf_event_groups_iterate(struct rb_root *groups,
+ perf_event_groups_iterate_f callback, void *data)
+{
+ int ret = 0;
+ struct rb_node *node;
+ struct perf_event *node_event, *event;
+
+ for (node = rb_first(groups); node; node = rb_next(node)) {
+ node_event = container_of(node, struct perf_event, group_node);
+ list_for_each_entry(event, &node_event->group_list,
+ group_entry) {
+ ret = callback(event, data);
+ if (ret) {
+ return ret;
+ }
+ }
+ }
+
+ return ret;
+}
+
/*
* Add a event from the lists for its context.
* Must be called with ctx->mutex and ctx->lock held.
@@ -1493,12 +1651,8 @@ list_add_event(struct perf_event *event, struct perf_event_context *ctx)
* perf_group_detach can, at all times, locate all siblings.
*/
if (event->group_leader == event) {
- struct list_head *list;
-
event->group_caps = event->event_caps;
-
- list = ctx_group_list(event, ctx);
- list_add_tail(&event->group_entry, list);
+ add_event_to_groups(event, ctx);
}

list_update_cgroup_event(event, ctx, true);
@@ -1689,7 +1843,7 @@ list_del_event(struct perf_event *event, struct perf_event_context *ctx)
list_del_rcu(&event->event_entry);

if (event->group_leader == event)
- list_del_init(&event->group_entry);
+ del_event_from_groups(event, ctx);

update_group_times(event);

@@ -1730,22 +1884,22 @@ static void perf_group_detach(struct perf_event *event)
goto out;
}

- if (!list_empty(&event->group_entry))
- list = &event->group_entry;
-
/*
* If this was a group event with sibling events then
* upgrade the siblings to singleton events by adding them
* to whatever list we are on.
*/
list_for_each_entry_safe(sibling, tmp, &event->sibling_list, group_entry) {
- if (list)
- list_move_tail(&sibling->group_entry, list);
sibling->group_leader = sibling;

/* Inherit group flags from the previous leader */
sibling->group_caps = event->group_caps;

+ if (!list_empty(&event->group_entry)) {
+ list_del_init(&sibling->group_entry);
+ add_event_to_groups(sibling, event->ctx);
+ }
+
WARN_ON_ONCE(sibling->ctx != event->ctx);
}

@@ -1869,6 +2023,22 @@ group_sched_out(struct perf_event *group_event,
cpuctx->exclusive = 0;
}

+struct group_sched_params {
+ struct perf_cpu_context *cpuctx;
+ struct perf_event_context *ctx;
+ int can_add_hw;
+};
+
+static int
+group_sched_out_callback(struct perf_event *event, void *data)
+{
+ struct group_sched_params *params = data;
+
+ group_sched_out(event, params->cpuctx, params->ctx);
+
+ return 0;
+}
+
#define DETACH_GROUP 0x01UL

/*
@@ -2712,7 +2882,10 @@ static void ctx_sched_out(struct perf_event_context *ctx,
enum event_type_t event_type)
{
int is_active = ctx->is_active;
- struct perf_event *event;
+ struct group_sched_params params = {
+ .cpuctx = cpuctx,
+ .ctx = ctx
+ };

lockdep_assert_held(&ctx->lock);

@@ -2759,13 +2932,13 @@ static void ctx_sched_out(struct perf_event_context *ctx,

perf_pmu_disable(ctx->pmu);
if (is_active & EVENT_PINNED) {
- list_for_each_entry(event, &ctx->pinned_groups, group_entry)
- group_sched_out(event, cpuctx, ctx);
+ perf_event_groups_iterate(&ctx->pinned_groups,
+ group_sched_out_callback, &params);
}

if (is_active & EVENT_FLEXIBLE) {
- list_for_each_entry(event, &ctx->flexible_groups, group_entry)
- group_sched_out(event, cpuctx, ctx);
+ perf_event_groups_iterate(&ctx->flexible_groups,
+ group_sched_out_callback, &params);
}
perf_pmu_enable(ctx->pmu);
}
@@ -3059,63 +3232,60 @@ static void cpu_ctx_sched_out(struct perf_cpu_context *cpuctx,
ctx_sched_out(&cpuctx->ctx, cpuctx, event_type);
}

-static void
-ctx_pinned_sched_in(struct perf_event_context *ctx,
- struct perf_cpu_context *cpuctx)
+static int
+ctx_pinned_sched_in(struct perf_event *event, void *data)
{
- struct perf_event *event;
+ struct group_sched_params *params = data;

- list_for_each_entry(event, &ctx->pinned_groups, group_entry) {
- if (event->state <= PERF_EVENT_STATE_OFF)
- continue;
- if (!event_filter_match(event))
- continue;
+ if (event->state <= PERF_EVENT_STATE_OFF)
+ return 0;
+ if (!event_filter_match(event))
+ return 0;

- /* may need to reset tstamp_enabled */
- if (is_cgroup_event(event))
- perf_cgroup_mark_enabled(event, ctx);
+ /* may need to reset tstamp_enabled */
+ if (is_cgroup_event(event))
+ perf_cgroup_mark_enabled(event, params->ctx);

- if (group_can_go_on(event, cpuctx, 1))
- group_sched_in(event, cpuctx, ctx);
+ if (group_can_go_on(event, params->cpuctx, 1))
+ group_sched_in(event, params->cpuctx, params->ctx);

- /*
- * If this pinned group hasn't been scheduled,
- * put it in error state.
- */
- if (event->state == PERF_EVENT_STATE_INACTIVE) {
- update_group_times(event);
- event->state = PERF_EVENT_STATE_ERROR;
- }
+ /*
+ * If this pinned group hasn't been scheduled,
+ * put it in error state.
+ */
+ if (event->state == PERF_EVENT_STATE_INACTIVE) {
+ update_group_times(event);
+ event->state = PERF_EVENT_STATE_ERROR;
}
+
+ return 0;
}

-static void
-ctx_flexible_sched_in(struct perf_event_context *ctx,
- struct perf_cpu_context *cpuctx)
+static int
+ctx_flexible_sched_in(struct perf_event *event, void *data)
{
- struct perf_event *event;
- int can_add_hw = 1;
+ struct group_sched_params *params = data;

- list_for_each_entry(event, &ctx->flexible_groups, group_entry) {
- /* Ignore events in OFF or ERROR state */
- if (event->state <= PERF_EVENT_STATE_OFF)
- continue;
- /*
- * Listen to the 'cpu' scheduling filter constraint
- * of events:
- */
- if (!event_filter_match(event))
- continue;
+ /* Ignore events in OFF or ERROR state */
+ if (event->state <= PERF_EVENT_STATE_OFF)
+ return 0;
+ /*
+ * Listen to the 'cpu' scheduling filter constraint
+ * of events:
+ */
+ if (!event_filter_match(event))
+ return 0;

- /* may need to reset tstamp_enabled */
- if (is_cgroup_event(event))
- perf_cgroup_mark_enabled(event, ctx);
+ /* may need to reset tstamp_enabled */
+ if (is_cgroup_event(event))
+ perf_cgroup_mark_enabled(event, params->ctx);

- if (group_can_go_on(event, cpuctx, can_add_hw)) {
- if (group_sched_in(event, cpuctx, ctx))
- can_add_hw = 0;
- }
+ if (group_can_go_on(event, params->cpuctx, params->can_add_hw)) {
+ if (group_sched_in(event, params->cpuctx, params->ctx))
+ params->can_add_hw = 0;
}
+
+ return 0;
}

static void
@@ -3125,7 +3295,10 @@ ctx_sched_in(struct perf_event_context *ctx,
struct task_struct *task)
{
int is_active = ctx->is_active;
- u64 now;
+ struct group_sched_params params = {
+ .cpuctx = cpuctx,
+ .ctx = ctx
+ };

lockdep_assert_held(&ctx->lock);

@@ -3144,7 +3317,7 @@ ctx_sched_in(struct perf_event_context *ctx,

if (is_active & EVENT_TIME) {
/* start ctx time */
- now = perf_clock();
+ u64 now = perf_clock();
ctx->timestamp = now;
perf_cgroup_set_timestamp(task, ctx);
}
@@ -3154,11 +3327,15 @@ ctx_sched_in(struct perf_event_context *ctx,
* in order to give them the best chance of going on.
*/
if (is_active & EVENT_PINNED)
- ctx_pinned_sched_in(ctx, cpuctx);
+ perf_event_groups_iterate(&ctx->pinned_groups,
+ ctx_pinned_sched_in, &params);

/* Then walk through the lower prio flexible groups */
- if (is_active & EVENT_FLEXIBLE)
- ctx_flexible_sched_in(ctx, cpuctx);
+ if (is_active & EVENT_FLEXIBLE) {
+ params.can_add_hw = 1;
+ perf_event_groups_iterate(&ctx->flexible_groups,
+ ctx_flexible_sched_in, &params);
+ }
}

static void cpu_ctx_sched_in(struct perf_cpu_context *cpuctx,
@@ -3189,7 +3366,7 @@ static void perf_event_context_sched_in(struct perf_event_context *ctx,
* However, if task's ctx is not carrying any pinned
* events, no need to flip the cpuctx's events around.
*/
- if (!list_empty(&ctx->pinned_groups))
+ if (!RB_EMPTY_ROOT(&ctx->pinned_groups))
cpu_ctx_sched_out(cpuctx, EVENT_FLEXIBLE);
perf_event_sched_in(cpuctx, ctx, task);
perf_pmu_enable(ctx->pmu);
@@ -3424,8 +3601,12 @@ static void rotate_ctx(struct perf_event_context *ctx)
* Rotate the first entry last of non-pinned groups. Rotation might be
* disabled by the inheritance code.
*/
- if (!ctx->rotate_disable)
- list_rotate_left(&ctx->flexible_groups);
+ if (!ctx->rotate_disable) {
+ int cpu = smp_processor_id();
+
+ perf_event_groups_rotate(&ctx->flexible_groups, -1);
+ perf_event_groups_rotate(&ctx->flexible_groups, cpu);
+ }
}

static int perf_rotate_context(struct perf_cpu_context *cpuctx)
@@ -3764,8 +3945,8 @@ static void __perf_event_init_context(struct perf_event_context *ctx)
raw_spin_lock_init(&ctx->lock);
mutex_init(&ctx->mutex);
INIT_LIST_HEAD(&ctx->active_ctx_list);
- INIT_LIST_HEAD(&ctx->pinned_groups);
- INIT_LIST_HEAD(&ctx->flexible_groups);
+ ctx->pinned_groups = RB_ROOT;
+ ctx->flexible_groups = RB_ROOT;
INIT_LIST_HEAD(&ctx->event_list);
atomic_set(&ctx->refcount, 1);
}
@@ -9372,6 +9553,8 @@ perf_event_alloc(struct perf_event_attr *attr, int cpu,
INIT_LIST_HEAD(&event->group_entry);
INIT_LIST_HEAD(&event->event_entry);
INIT_LIST_HEAD(&event->sibling_list);
+ RB_CLEAR_NODE(&event->group_node);
+ INIT_LIST_HEAD(&event->group_list);
INIT_LIST_HEAD(&event->rb_entry);
INIT_LIST_HEAD(&event->active_entry);
INIT_LIST_HEAD(&event->addr_filters.list);
@@ -10786,6 +10969,14 @@ static int inherit_group(struct perf_event *parent_event,
return 0;
}

+struct inherit_task_group_params {
+ struct task_struct *parent;
+ struct perf_event_context *parent_ctx;
+ struct task_struct *child;
+ int ctxn;
+ int inherited_all;
+};
+
/*
* Creates the child task context and tries to inherit the event-group.
*
@@ -10798,20 +10989,18 @@ static int inherit_group(struct perf_event *parent_event,
* - <0 on error
*/
static int
-inherit_task_group(struct perf_event *event, struct task_struct *parent,
- struct perf_event_context *parent_ctx,
- struct task_struct *child, int ctxn,
- int *inherited_all)
+inherit_task_group(struct perf_event *event, void *data)
{
int ret;
struct perf_event_context *child_ctx;
+ struct inherit_task_group_params *params = data;

if (!event->attr.inherit) {
- *inherited_all = 0;
+ params->inherited_all = 0;
return 0;
}

- child_ctx = child->perf_event_ctxp[ctxn];
+ child_ctx = params->child->perf_event_ctxp[params->ctxn];
if (!child_ctx) {
/*
* This is executed from the parent task context, so
@@ -10819,18 +11008,19 @@ inherit_task_group(struct perf_event *event, struct task_struct *parent,
* First allocate and initialize a context for the
* child.
*/
- child_ctx = alloc_perf_context(parent_ctx->pmu, child);
+ child_ctx = alloc_perf_context(params->parent_ctx->pmu,
+ params->child);
if (!child_ctx)
return -ENOMEM;

- child->perf_event_ctxp[ctxn] = child_ctx;
+ params->child->perf_event_ctxp[params->ctxn] = child_ctx;
}

- ret = inherit_group(event, parent, parent_ctx,
- child, child_ctx);
+ ret = inherit_group(event, params->parent, params->parent_ctx,
+ params->child, child_ctx);

if (ret)
- *inherited_all = 0;
+ params->inherited_all = 0;

return ret;
}
@@ -10842,11 +11032,15 @@ static int perf_event_init_context(struct task_struct *child, int ctxn)
{
struct perf_event_context *child_ctx, *parent_ctx;
struct perf_event_context *cloned_ctx;
- struct perf_event *event;
struct task_struct *parent = current;
- int inherited_all = 1;
unsigned long flags;
int ret = 0;
+ struct inherit_task_group_params params = {
+ .parent = parent,
+ .child = child,
+ .ctxn = ctxn,
+ .inherited_all = 1
+ };

if (likely(!parent->perf_event_ctxp[ctxn]))
return 0;
@@ -10859,6 +11053,8 @@ static int perf_event_init_context(struct task_struct *child, int ctxn)
if (!parent_ctx)
return 0;

+ params.parent_ctx = parent_ctx;
+
/*
* No need to check if parent_ctx != NULL here; since we saw
* it non-NULL earlier, the only reason for it to become NULL
@@ -10876,13 +11072,10 @@ static int perf_event_init_context(struct task_struct *child, int ctxn)
* We dont have to disable NMIs - we are only looking at
* the list, not manipulating it:
*/
- list_for_each_entry(event, &parent_ctx->pinned_groups, group_entry) {
- ret = inherit_task_group(event, parent, parent_ctx,
- child, ctxn, &inherited_all);
- if (ret)
- goto out_unlock;
- }
-
+ ret = perf_event_groups_iterate(&parent_ctx->pinned_groups,
+ inherit_task_group, &params);
+ if (ret)
+ goto out_unlock;
/*
* We can't hold ctx->lock when iterating the ->flexible_group list due
* to allocations, but we need to prevent rotation because
@@ -10892,19 +11085,17 @@ static int perf_event_init_context(struct task_struct *child, int ctxn)
parent_ctx->rotate_disable = 1;
raw_spin_unlock_irqrestore(&parent_ctx->lock, flags);

- list_for_each_entry(event, &parent_ctx->flexible_groups, group_entry) {
- ret = inherit_task_group(event, parent, parent_ctx,
- child, ctxn, &inherited_all);
- if (ret)
- goto out_unlock;
- }
+ ret = perf_event_groups_iterate(&parent_ctx->flexible_groups,
+ inherit_task_group, &params);
+ if (ret)
+ goto out_unlock;

raw_spin_lock_irqsave(&parent_ctx->lock, flags);
parent_ctx->rotate_disable = 0;

child_ctx = child->perf_event_ctxp[ctxn];

- if (child_ctx && inherited_all) {
+ if (child_ctx && params.inherited_all) {
/*
* Mark the child context as a clone of the parent
* context, or of whatever the parent is a clone of.