Re: [PATCH 2/5] perf: generic intel uncore support

From: Peter Zijlstra
Date: Sat Mar 31 2012 - 13:25:54 EST


On Wed, 2012-03-28 at 14:43 +0800, Yan, Zheng wrote:
> diff --git a/arch/x86/kernel/cpu/perf_event_intel_uncore.c b/arch/x86/kernel/cpu/perf_event_intel_uncore.c
> new file mode 100644
> index 0000000..d159e3e
> --- /dev/null
> +++ b/arch/x86/kernel/cpu/perf_event_intel_uncore.c
> @@ -0,0 +1,814 @@
> +#include "perf_event_intel_uncore.h"
> +
> +static struct intel_uncore_type *empty_uncore[] = { NULL, };
> +static struct intel_uncore_type **msr_uncores = empty_uncore;
> +
> +/* constraint for box with 2 counters */
> +static struct event_constraint unconstrained_2 =
> + EVENT_CONSTRAINT(0, 0x3, 0);
> +/* constraint for box with 3 counters */
> +static struct event_constraint unconstrained_3 =
> + EVENT_CONSTRAINT(0, 0x7, 0);
> +/* constraint for box with 4 counters */
> +static struct event_constraint unconstrained_4 =
> + EVENT_CONSTRAINT(0, 0xf, 0);
> +/* constraint for box with 8 counters */
> +static struct event_constraint unconstrained_8 =
> + EVENT_CONSTRAINT(0, 0xff, 0);
> +/* constraint for the fixed counters */
> +static struct event_constraint constraint_fixed =
> + EVENT_CONSTRAINT((u64)-1, 1 << UNCORE_PMC_IDX_FIXED, (u64)-1);

Since they're all different, why not have a struct event_constraint
unconstrained member in your struct intel_uncore_pmu and fill it in
whenever you create the pmu?
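
Something like this (completely untested, and I didn't check the exact
__EVENT_CONSTRAINT() arguments, so take it as a sketch only):

struct intel_uncore_pmu {
        ...
        struct event_constraint unconstrained;
};

        /* filled in wherever the pmu gets created, e.g. uncore_type_init() */
        pmus[i].unconstrained = (struct event_constraint)
                __EVENT_CONSTRAINT(0, (1ULL << type->num_counters) - 1,
                                   0, type->num_counters, 0);

That kills all four unconstrained_N variants in one go.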

> +static DEFINE_SPINLOCK(uncore_box_lock);

> +/*
> + * The overflow interrupt is unavailable for SandyBridge-EP, is broken
> + * for SandyBridge. So we use hrtimer to periodically poll the counter
> + */

To avoid overflow and to accumulate into the software u64, right? Not to
actually sample anything.

Might also want to say it is broken for everything else too, since afaik
the uncore PMI has been broken on everything that has an uncore.
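
While at it, the comment could spell out what the timer actually does;
roughly the below is what I'd expect (uncore_perf_event_update(),
box->active_mask, box->events and UNCORE_PMU_HRTIMER_INTERVAL are made-up
names on my side, not necessarily what the patch uses):

/*
 * Poll every active counter well before it can wrap and fold the delta
 * into the 64-bit software count; no sampling is done at all.
 */
static enum hrtimer_restart uncore_pmu_hrtimer(struct hrtimer *hrtimer)
{
        struct intel_uncore_box *box;
        int bit;

        box = container_of(hrtimer, struct intel_uncore_box, hrtimer);

        for_each_set_bit(bit, box->active_mask, UNCORE_PMC_IDX_MAX)
                uncore_perf_event_update(box, box->events[bit]);

        hrtimer_forward_now(hrtimer, ns_to_ktime(UNCORE_PMU_HRTIMER_INTERVAL));
        return HRTIMER_RESTART;
}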


> +static struct intel_uncore_box *
> +__uncore_pmu_find_box(struct intel_uncore_pmu *pmu, int phyid)
> +{
> + struct intel_uncore_box *box;
> + struct hlist_head *head;
> + struct hlist_node *node;
> +
> + head = &pmu->box_hash[phyid % UNCORE_BOX_HASH_SIZE];
> +
> + hlist_for_each_entry_rcu(box, node, head, hlist) {
> + if (box->phy_id == phyid)
> + return box;
> + }
> +
> + return NULL;
> +}
> +
> +static struct intel_uncore_box *
> +uncore_pmu_find_box(struct intel_uncore_pmu *pmu, int phyid)
> +{
> + struct intel_uncore_box *box;
> +
> + rcu_read_lock();
> + box = __uncore_pmu_find_box(pmu, phyid);
> + rcu_read_unlock();
> +
> + return box;
> +}
> +
> +/* caller should hold the uncore_box_lock */
> +static void uncore_pmu_add_box(struct intel_uncore_pmu *pmu,
> + struct intel_uncore_box *box)
> +{
> + struct hlist_head *head;
> +
> + head = &pmu->box_hash[box->phy_id % UNCORE_BOX_HASH_SIZE];
> + hlist_add_head_rcu(&box->hlist, head);
> +}
> +
> +static struct intel_uncore_pmu *uncore_event_to_pmu(struct perf_event *event)
> +{
> + return container_of(event->pmu, struct intel_uncore_pmu, pmu);
> +}
> +
> +static struct intel_uncore_box *uncore_event_to_box(struct perf_event *event)
> +{
> + int phyid = topology_physical_package_id(smp_processor_id());

Who says that this event has anything to do with the current cpu?

> + return uncore_pmu_find_box(uncore_event_to_pmu(event), phyid);
> +}

So why not simply use a per-cpu allocation and have something like:

struct intel_uncore_pmu {
        ...
        struct intel_uncore_box * __percpu box;
};

static inline
struct intel_uncore_box *uncore_event_to_box(struct perf_event *event)
{
        struct intel_uncore_pmu *pmu = uncore_event_to_pmu(event);

        return per_cpu_ptr(pmu->box, event->cpu);
}

And be done with it?

> +static int uncore_collect_events(struct intel_uncore_box *box,
> + struct perf_event *leader, bool dogrp)
> +{
> + struct perf_event *event;
> + int n, max_count;
> +
> + max_count = box->pmu->type->num_counters;
> + if (box->pmu->type->fixed_ctl)
> + max_count++;
> +
> + if (box->n_events >= max_count)
> + return -EINVAL;
> +
> + /*
> + * adding the same events twice to the uncore PMU may cause
> + * general protection fault
> + */

Is that an erratum or a 'feature' of some specific box types, or what?

> + for (n = 0; n < box->n_events; n++) {
> + event = box->event_list[n];
> + if (event->hw.config == leader->hw.config)
> + return -EINVAL;
> + }
> +
> + n = box->n_events;
> + box->event_list[n] = leader;
> + n++;
> + if (!dogrp)
> + return n;
> +
> + list_for_each_entry(event, &leader->sibling_list, group_entry) {
> + if (event->state <= PERF_EVENT_STATE_OFF)
> + continue;
> +
> + if (n >= max_count)
> + return -EINVAL;
> +
> + box->event_list[n] = event;
> + n++;
> + }
> + return n;
> +}
> +
> +static struct event_constraint *
> +uncore_event_constraint(struct intel_uncore_type *type,
> + struct perf_event *event)
> +{
> + struct event_constraint *c;
> +
> + if (event->hw.config == (u64)-1)
> + return &constraint_fixed;
> +
> + if (type->constraints) {
> + for_each_event_constraint(c, type->constraints) {
> + if ((event->hw.config & c->cmask) == c->code)
> + return c;
> + }
> + }
> +
> + if (type->num_counters == 2)
> + return &unconstrained_2;
> + if (type->num_counters == 3)
> + return &unconstrained_3;
> + if (type->num_counters == 4)
> + return &unconstrained_4;
> + if (type->num_counters == 8)
> + return &unconstrained_8;
> +
> + WARN_ON_ONCE(1);
> + return &unconstrained_2;

	return &uncore_event_to_pmu(event)->unconstrained;

seems much saner to me, with the per-pmu unconstrained member suggested
above.

> +}
> +
> +static int uncore_assign_events(struct intel_uncore_box *box,
> + int assign[], int n)
> +{
> + struct event_constraint *c, *constraints[UNCORE_PMC_IDX_MAX];
> + int i, ret, wmin, wmax;
> +
> + for (i = 0, wmin = X86_PMC_IDX_MAX, wmax = 0; i < n; i++) {
> + c = uncore_event_constraint(box->pmu->type,
> + box->event_list[i]);
> + constraints[i] = c;
> + wmin = min(wmin, c->weight);
> + wmax = max(wmax, c->weight);
> + }

No fast path then?

> + ret = perf_assign_events(constraints, n, wmin, wmax, assign);
> + return ret ? -EINVAL : 0;
> +}
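
By fast path I mean something like what x86_schedule_events() does before
falling back to the full constraint solving; untested sketch, to go right
before the perf_assign_events() call:

        unsigned long used_mask[BITS_TO_LONGS(UNCORE_PMC_IDX_MAX)] = { 0 };

        /* try to reuse the counters the events already had */
        for (i = 0; i < n; i++) {
                struct hw_perf_event *hwc = &box->event_list[i]->hw;

                c = constraints[i];
                if (hwc->idx == -1 || !test_bit(hwc->idx, c->idxmsk) ||
                    test_bit(hwc->idx, used_mask))
                        break;

                __set_bit(hwc->idx, used_mask);
                assign[i] = hwc->idx;
        }
        if (i == n)
                return 0;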


> +static void uncore_pmu_event_start(struct perf_event *event, int flags)
> +{
> + struct intel_uncore_box *box = uncore_event_to_box(event);
> +
> + raw_spin_lock(&box->lock);
> + __uncore_pmu_event_start(box, event, flags);
> + raw_spin_unlock(&box->lock);
> +}

> +static void uncore_pmu_event_stop(struct perf_event *event, int flags)
> +{
> + struct intel_uncore_box *box = uncore_event_to_box(event);
> +
> + raw_spin_lock(&box->lock);
> + __uncore_pmu_event_stop(box, event, flags);
> + raw_spin_unlock(&box->lock);
> +}

> +static int uncore_pmu_event_add(struct perf_event *event, int flags)
> +{
> + struct intel_uncore_box *box = uncore_event_to_box(event);
> + struct hw_perf_event *hwc = &event->hw;
> + int assign[UNCORE_PMC_IDX_MAX];
> + int i, n, ret;
> +
> + if (!box)
> + return -ENODEV;
> +
> + raw_spin_lock(&box->lock);

> + raw_spin_unlock(&box->lock);
> + return ret;
> +}
> +
> +static void uncore_pmu_event_del(struct perf_event *event, int flags)
> +{
> + struct intel_uncore_box *box = uncore_event_to_box(event);
> + int i;
> +
> + raw_spin_lock(&box->lock);

> + raw_spin_unlock(&box->lock);
> +}

So what's up with all this box->lock business.. why does that lock
exist?

> +static int __init uncore_pmu_register(struct intel_uncore_pmu *pmu)
> +{
> + int ret;
> +
> + pmu->pmu.attr_groups = pmu->type->attr_groups;
> + pmu->pmu.task_ctx_nr = perf_invalid_context;
> + pmu->pmu.event_init = uncore_pmu_event_init;
> + pmu->pmu.add = uncore_pmu_event_add;
> + pmu->pmu.del = uncore_pmu_event_del;
> + pmu->pmu.start = uncore_pmu_event_start;
> + pmu->pmu.stop = uncore_pmu_event_stop;
> + pmu->pmu.read = uncore_pmu_event_read;

Won't this look better as a C99 struct init? Something like:

	pmu->pmu = (struct pmu) {
		.attr_groups	= pmu->type->attr_groups,
		.task_ctx_nr	= perf_invalid_context,
		.event_init	= uncore_pmu_event_init,
		...
	};

> + if (pmu->type->num_boxes == 1)
> + sprintf(pmu->name, "uncore_%s", pmu->type->name);
> + else
> + sprintf(pmu->name, "uncore_%s%d", pmu->type->name,
> + pmu->pmu_idx);
> +
> + ret = perf_pmu_register(&pmu->pmu, pmu->name, -1);
> + return ret;
> +}


> +static int __init uncore_type_init(struct intel_uncore_type *type)
> +{
> + struct intel_uncore_pmu *pmus;
> + struct attribute_group *events_group;
> + struct attribute **attrs;
> + int i, j;
> +
> + pmus = kzalloc(sizeof(*pmus) * type->num_boxes, GFP_KERNEL);
> + if (!pmus)
> + return -ENOMEM;

Hmm, but if you have a pmu per box, then what do you need that per-pmu
box hash lookup for?

> +
> + for (i = 0; i < type->num_boxes; i++) {
> + pmus[i].func_id = -1;
> + pmus[i].pmu_idx = i;
> + pmus[i].type = type;
> +
> + for (j = 0; j < ARRAY_SIZE(pmus[0].box_hash); j++)
> + INIT_HLIST_HEAD(&pmus[i].box_hash[j]);
> + }
> +
> + if (type->event_descs) {
> + for (i = 0; ; i++) {
> + if (!type->event_descs[i].attr.attr.name)
> + break;
> + }
> +
> + events_group = kzalloc(sizeof(struct attribute *) * (i + 1) +
> + sizeof(*events_group), GFP_KERNEL);
> + if (!events_group)
> + goto fail;
> +
> + attrs = (struct attribute **)(events_group + 1);
> + events_group->name = "events";
> + events_group->attrs = attrs;
> +
> + for (j = 0; j < i; j++)
> + attrs[j] = &type->event_descs[j].attr.attr;
> +
> + type->attr_groups[1] = events_group;
> + }
> + type->pmus = pmus;
> + return 0;
> +fail:
> + uncore_type_exit(type);
> + return -ENOMEM;
> +}
> +


Aside from all this, there's still the problem that you don't place all
events for a particular phys_id onto a single cpu. It doesn't matter which
cpu in that package it is, but all events for the package should go to the
same cpu.

This means that on unplug of that cpu you have to migrate all these events
to another cpu in the same package, etc..
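
Rough sketch of what I mean (uncore_cpumask and uncore_migrate_events()
are made-up names here, and none of this is tested):

static void uncore_event_exit_cpu(int cpu)
{
        int i, phyid = topology_physical_package_id(cpu);
        int target = -1;

        /* only bother if this cpu was the designated reader for its package */
        if (!cpumask_test_and_clear_cpu(cpu, &uncore_cpumask))
                return;

        /* find another online cpu in the same package */
        for_each_online_cpu(i) {
                if (i != cpu && topology_physical_package_id(i) == phyid) {
                        target = i;
                        break;
                }
        }
        if (target < 0)
                return;

        cpumask_set_cpu(target, &uncore_cpumask);
        uncore_migrate_events(cpu, target);     /* move the events over */
}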

I suspect doing this will also allow you to get rid of that box->lock
thing.
