Re: [PATCH v6 1/3] perf/core: use rb trees for pinned/flexible groups

From: Peter Zijlstra
Date: Thu Aug 03 2017 - 09:00:31 EST


On Wed, Aug 02, 2017 at 11:13:54AM +0300, Alexey Budankov wrote:
> This patch moves event groups into rb tree sorted by CPU, so that
> multiplexing hrtimer interrupt handler would be able skipping to the current
> CPU's list and ignore groups allocated for the other CPUs.
>
> New API for manipulating event groups in the trees is implemented as well
> as adoption on the API in the current implementation.
>
> Because perf_event_groups_iterate() API provides capability to execute
> a callback for every event group in a tree, adoption of the API introduces
> some code that packs and unpacks arguments of functions existing in
> the implementation as well as adjustments of their calling signatures
> e.g. ctx_pinned_sched_in(), ctx_flexible_sched_in() and inherit_task_group().

This does not speak of why we need group_list.

> Signed-off-by: Alexey Budankov <alexey.budankov@xxxxxxxxxxxxxxx>
> ---
> include/linux/perf_event.h | 18 ++-
> kernel/events/core.c | 389 +++++++++++++++++++++++++++++++++------------
> 2 files changed, 306 insertions(+), 101 deletions(-)
>
> diff --git a/include/linux/perf_event.h b/include/linux/perf_event.h
> index a3b873f..282f121 100644
> --- a/include/linux/perf_event.h
> +++ b/include/linux/perf_event.h
> @@ -572,6 +572,20 @@ struct perf_event {
> */
> struct list_head group_entry;
> struct list_head sibling_list;
> + /*
> + * Node on the pinned or flexible tree located at the event context;
> + * the node may be empty in case its event is not directly attached
> + * to the tree but to group_list list of the event directly
> + * attached to the tree;
> + */
> + struct rb_node group_node;
> + /*
> + * List keeps groups allocated for the same cpu;
> + * the list may be empty in case its event is not directly
> + * attached to the tree but to group_list list of the event directly
> + * attached to the tree;
> + */
> + struct list_head group_list;
>
> /*
> * We need storage to track the entries in perf_pmu_migrate_context; we
> @@ -741,8 +755,8 @@ struct perf_event_context {
> struct mutex mutex;
>
> struct list_head active_ctx_list;
> - struct list_head pinned_groups;
> - struct list_head flexible_groups;
> + struct rb_root pinned_groups;
> + struct rb_root flexible_groups;
> struct list_head event_list;
> int nr_events;
> int nr_active;
> diff --git a/kernel/events/core.c b/kernel/events/core.c
> index 426c2ff..0a4f619 100644
> --- a/kernel/events/core.c
> +++ b/kernel/events/core.c
> @@ -1466,8 +1466,12 @@ static enum event_type_t get_event_type(struct perf_event *event)
> return event_type;
> }
>
> -static struct list_head *
> -ctx_group_list(struct perf_event *event, struct perf_event_context *ctx)
> +/*
> + * Extract pinned or flexible groups from the context
> + * based on event attrs bits;
> + */
> +static struct rb_root *
> +get_event_groups(struct perf_event *event, struct perf_event_context *ctx)
> {
> if (event->attr.pinned)
> return &ctx->pinned_groups;
> @@ -1475,6 +1479,160 @@ ctx_group_list(struct perf_event *event, struct perf_event_context *ctx)
> return &ctx->flexible_groups;
> }
>
> +static void
> +perf_event_groups_insert(struct rb_root *groups,
> + struct perf_event *event);
> +
> +static void
> +perf_event_groups_delete(struct rb_root *groups,
> + struct perf_event *event);

Can't we do away with these fwd declarations by simple reordering of the
function definitions?

> +/*
> + * Helper function to insert event into the pinned or
> + * flexible groups;
> + */
> +static void
> +add_event_to_groups(struct perf_event *event, struct perf_event_context *ctx)
> +{
> + struct rb_root *groups;
> +
> + groups = get_event_groups(event, ctx);
> + perf_event_groups_insert(groups, event);
> +}
> +
> +/*
> + * Helper function to delete event from its groups;
> + */
> +static void
> +del_event_from_groups(struct perf_event *event, struct perf_event_context *ctx)
> +{
> + struct rb_root *groups;
> +
> + groups = get_event_groups(event, ctx);
> + perf_event_groups_delete(groups, event);
> +}
> +
> +/*
> + * Insert a group into a tree using event->cpu as a key. If event->cpu node
> + * is already attached to the tree then the event is added to the attached
> + * group's group_list list.
> + */
> +static void
> +perf_event_groups_insert(struct rb_root *groups,
> + struct perf_event *event)
> +{
> + struct rb_node **node;
> + struct rb_node *parent;
> + struct perf_event *node_event;
> +
> + node = &groups->rb_node;
> + parent = *node;
> +
> + while (*node) {
> + parent = *node;
> + node_event = container_of(*node,
> + struct perf_event, group_node);
> +
> + if (event->cpu < node_event->cpu) {
> + node = &parent->rb_left;
> + } else if (event->cpu > node_event->cpu) {
> + node = &parent->rb_right;

I would much prefer you use a comparator like:

static always_inline int
perf_event_less(struct perf_event *left, struct perf_event *right)
{
if (left->cpu < right_cpu)
return 1;

return 0;
}

That way we can add additional order. In specific ARM also wants things
ordered on PMU for their big.LITTLE stuff.

> + } else {
> + list_add_tail(&event->group_entry,
> + &node_event->group_list);
> + return;

Urgh, so this is what you want that list for... why not keep duplicates
in the tree itself and iterate that?

> + }
> + }
> +
> + list_add_tail(&event->group_entry, &event->group_list);
> +
> + rb_link_node(&event->group_node, parent, node);
> + rb_insert_color(&event->group_node, groups);
> +}

> +/*
> + * Find group list by a cpu key and rotate it.
> + */
> +static void
> +perf_event_groups_rotate(struct rb_root *groups, int cpu)
> +{
> + struct rb_node *node;
> + struct perf_event *node_event;
> +
> + node = groups->rb_node;
> +
> + while (node) {
> + node_event = container_of(node,
> + struct perf_event, group_node);
> +
> + if (cpu < node_event->cpu) {
> + node = node->rb_left;
> + } else if (cpu > node_event->cpu) {
> + node = node->rb_right;
> + } else {
> + list_rotate_left(&node_event->group_list);
> + break;
> + }
> + }
> +}

Ah, you worry about how to rotate inside a tree?

You can do that by adding (run)time based ordering, and you'll end up
with a runtime based scheduler.

A trivial variant keeps a simple counter per tree that is incremented
for each rotation. That should end up with the events ordered exactly
like the list. And if you have that comparator like above, expressing
that additional ordering becomes simple ;-)

Something like:

struct group {
u64 vtime;
rb_tree tree;
};

bool event_less(left, right)
{
if (left->cpu < right->cpu)
return true;

if (left->cpu > right_cpu)
return false;

if (left->vtime < right->vtime)
return true;

return false;
}

insert_group(group, event, tail)
{
if (tail)
event->vtime = ++group->vtime;

tree_insert(&group->root, event);
}

Then every time you use insert_group(.tail=1) it goes to the end of that
CPU's 'list'.


The added benefit is that it then becomes fairly simple to improve upon
the RR scheduling, which suffers a bunch of boundary conditions where
the task runtimes mis-align with the rotation window.

> +typedef int(*perf_event_groups_iterate_f)(struct perf_event *, void *);

We already have perf_iterate_f, the only difference appears to be that
this has a return value. Surely these can be unified.

> +/*
> + * Iterate event groups and call provided callback for every group in the tree.
> + * Iteration stops if the callback returns non zero.
> + */
> +static int
> +perf_event_groups_iterate(struct rb_root *groups,
> + perf_event_groups_iterate_f callback, void *data)
> +{
> + int ret = 0;
> + struct rb_node *node;
> + struct perf_event *node_event, *event;

In general we prefer variable definitions to be ordered on line length,
longest first. So the exact opposite of what you have here.

> +
> + for (node = rb_first(groups); node; node = rb_next(node)) {
> + node_event = container_of(node, struct perf_event, group_node);
> + list_for_each_entry(event, &node_event->group_list,
> + group_entry) {
> + ret = callback(event, data);
> + if (ret) {
> + return ret;
> + }
> + }
> + }
> +
> + return ret;
> +}
> +
> /*
> * Add a event from the lists for its context.
> * Must be called with ctx->mutex and ctx->lock held.

> @@ -1869,6 +2023,22 @@ group_sched_out(struct perf_event *group_event,
> cpuctx->exclusive = 0;
> }
>
> +struct group_sched_params {
> + struct perf_cpu_context *cpuctx;
> + struct perf_event_context *ctx;
> + int can_add_hw;
> +};
> +
> +static int
> +group_sched_out_callback(struct perf_event *event, void *data)
> +{
> + struct group_sched_params *params = data;
> +
> + group_sched_out(event, params->cpuctx, params->ctx);
> +
> + return 0;
> +}

Right, C sucks.. or possibly you've chosen the wrong pattern.

So the alternative is something like:

#define for_each_group_event(event, group, cpu, pmu, field) \
for (event = rb_entry_safe(group_first(group, cpu, pmu),\
typeof(*event), field); \
event && event->cpu == cpu && event->pmu == pmu; \
event = rb_entry_safe(rb_next(&event->field), \
typeof(*event), field))


And then you can write things like:

for_each_group_event(event, group, cpu, pmu)
group_sched_out(event, cpuctx, ctx);


> +
> #define DETACH_GROUP 0x01UL
>
> /*
> @@ -2712,7 +2882,10 @@ static void ctx_sched_out(struct perf_event_context *ctx,
> enum event_type_t event_type)
> {
> int is_active = ctx->is_active;
> - struct perf_event *event;
> + struct group_sched_params params = {
> + .cpuctx = cpuctx,
> + .ctx = ctx
> + };
>
> lockdep_assert_held(&ctx->lock);
>
> @@ -2759,13 +2932,13 @@ static void ctx_sched_out(struct perf_event_context *ctx,
>
> perf_pmu_disable(ctx->pmu);
> if (is_active & EVENT_PINNED) {
> - list_for_each_entry(event, &ctx->pinned_groups, group_entry)
> - group_sched_out(event, cpuctx, ctx);
> + perf_event_groups_iterate(&ctx->pinned_groups,
> + group_sched_out_callback, &params);

So here I would expect to not iterate events where event->cpu !=
smp_processor_id() (and ideally not where event->pmu != ctx->pmu).

> }
>
> if (is_active & EVENT_FLEXIBLE) {
> - list_for_each_entry(event, &ctx->flexible_groups, group_entry)
> - group_sched_out(event, cpuctx, ctx);
> + perf_event_groups_iterate(&ctx->flexible_groups,
> + group_sched_out_callback, &params);

Idem.

> }
> perf_pmu_enable(ctx->pmu);
> }


I think the rest of the patch is just plumbing to make the above useful.
Let me know if I missed something of value.