Re: [PATCH v9 12/14] x86: perf: intel_pt: Intel PT PMU driver

From: Alexander Shishkin
Date: Thu Jan 29 2015 - 10:56:53 EST


On 29 January 2015 at 13:59, Peter Zijlstra <peterz@xxxxxxxxxxxxx> wrote:
> I think that logic fails for per-task events that have a cpu set.

True.

>> +static bool exclusive_event_ok(struct perf_event *event,
>> + struct perf_event_context *ctx)
>> +{
>> + struct pmu *pmu = event->pmu;
>> + bool ret = true;
>> +
>> + if (!(pmu->capabilities & PERF_PMU_CAP_EXCLUSIVE))
>> + return true;
>> +
>> + if (!__exclusive_event_ok(event, ctx))
>> + return false;
>> +
>> + if (ctx->task) {
>> + if (event->cpu != -1) {
>> + struct perf_event_context *cpuctx;
>> +
>> + cpuctx = &per_cpu_ptr(pmu->pmu_cpu_context, event->cpu)->ctx;
>> +
>> + mutex_lock(&cpuctx->mutex);
>> + ret = __exclusive_event_ok(event, cpuctx);
>> + mutex_unlock(&cpuctx->mutex);
>
> We're already holding ctx->mutex, this should have made lockdep scream.

As I mentioned offline, cpuctx->ctx.mutex is assigned its own lockdep
class, so lockdep doesn't see this. It is, of course, still a
problem.
But as you pointed out, if we grab the exclusive_cnt for per-task
cpu!=-1 events as well, we don't need to look into other contexts here
at all.

>
>> + }
>> + }
>> +
>> + return ret;
>> +}
>
>
> Would something like this work?
>
> ---
> --- a/include/linux/perf_event.h
> +++ b/include/linux/perf_event.h
> @@ -168,6 +168,7 @@ struct perf_event;
> #define PERF_PMU_CAP_NO_INTERRUPT 0x01
> #define PERF_PMU_CAP_AUX_NO_SG 0x02
> #define PERF_PMU_CAP_AUX_SW_DOUBLEBUF 0x04
> +#define PERF_PMU_CAP_EXCLUSIVE 0x08
>
> /**
> * struct pmu - generic performance monitoring unit
> @@ -188,6 +189,7 @@ struct pmu {
>
> int * __percpu pmu_disable_count;
> struct perf_cpu_context * __percpu pmu_cpu_context;
> + atomic_t exclusive_cnt; /* <0: cpu, >0: tsk */
> int task_ctx_nr;
> int hrtimer_interval_ms;
>
> --- a/kernel/events/core.c
> +++ b/kernel/events/core.c
> @@ -3487,6 +3487,9 @@ static void __free_event(struct perf_eve
> if (event->destroy)
> event->destroy(event);
>
> + if (event->pmu && event->ctx)
> + exclusive_event_release(event);

It looks like the event can already be removed from its context at
this point, so event->ctx will be NULL and the counter would leak —
or am I missing something?
I used event->attach_state & PERF_ATTACH_TASK to see if it's a
per-task counter, which seems reliable, even though it might need
documenting.

> +
> if (event->ctx)
> put_ctx(event->ctx);
>
> @@ -7092,6 +7095,7 @@ int perf_pmu_register(struct pmu *pmu, c
> pmu->event_idx = perf_event_idx_default;
>
> list_add_rcu(&pmu->entry, &pmus);
> + atomic_set(&pmu->exclusive_cnt, 0);
> ret = 0;
> unlock:
> mutex_unlock(&pmus_lock);
> @@ -7537,6 +7541,78 @@ perf_event_set_output(struct perf_event
> return ret;
> }
>
> +static bool exclusive_event_match(struct perf_event *e1, struct perf_event *e2)
> +{
> + if ((e1->pmu->capabilities & PERF_PMU_CAP_EXCLUSIVE) &&
> + (e1->cpu == e2->cpu ||
> + e1->cpu == -1 ||
> + e2->cpu == -1))
> + return true;
> + return false;
> +}
> +
> +static bool __exclusive_event_ok(struct perf_event *event,
> + struct perf_event_context *ctx)
> +{
> + struct perf_event *iter_event;
> +
> + list_for_each_entry(iter_event, &ctx->event_list, event_entry) {
> + if (exclusive_event_match(iter_event, event))
> + return false;
> + }
> +
> + return true;
> +}
> +
> +static bool exclusive_event_ok(struct perf_event *event,
> + struct perf_event_context *ctx)

Then, maybe exclusive_event_installable() is a better name, because
this one only deals with the current context; cpu-wide vs per-task
case is taken care of in perf_event_alloc()/__free_event().

> +{
> + struct pmu *pmu = event->pmu;
> + bool ret;
> +
> + if (!(pmu->capabilities & PERF_PMU_CAP_EXCLUSIVE))
> + return true;
> +
> + /*
> + * exclusive_cnt <0: cpu
> + * >0: tsk
> + */
> + if (ctx->task) {
> + if (!atomic_inc_unless_negative(&pmu->exclusive_cnt))
> + return false;
> + } else {
> + if (!atomic_dec_unless_positive(&pmu->exclusive_cnt))
> + return false;
> + }

So I would like to keep this bit in perf_event_alloc() path and the
reverse in __free_event(),

> +
> + mutex_lock(&ctx->lock);
> + ret = __exclusive_event_ok(event, ctx);
> + mutex_unlock(&ctx->lock);
> +
> + if (!ret) {
> + if (ctx->task)
> + atomic_dec(&pmu->exclusive_cnt);
> + else
> + atomic_inc(&pmu->exclusive_cnt);
> + }

in which case we don't need to undo the counter here, because it will
still go through __free_event() in the error path.

> +
> + return ret;
> +}
> +
> +static void exclusive_event_release(struct perf_event *event)
> +{
> + struct perf_event_context *ctx = event->ctx;
> + struct pmu *pmu = event->pmu;
> +
> + if (!(pmu->capabilities & PERF_PMU_CAP_EXCLUSIVE))
> + return;
> +
> + if (ctx->task)
> + atomic_dec(&pmu->exclusive_cnt);
> + else
> + atomic_inc(&pmu->exclusive_cnt);
> +}
> +
> static void mutex_lock_double(struct mutex *a, struct mutex *b)
> {
> if (b < a)
> @@ -7702,6 +7778,15 @@ SYSCALL_DEFINE5(perf_event_open,
> task = NULL;
> }
>
> + if (pmu->capabilities & PERF_PMU_CAP_EXCLUSIVE) {
> + err = -EBUSY;
> + if (group_leader)
> + goto err_context;
> +
> + if (!exclusive_event_ok(event, ctx))
> + goto err_context;
> + }
> +
> /*
> * Look up the group leader (we will attach this event to it):
> */
> @@ -7903,6 +7988,11 @@ perf_event_create_kernel_counter(struct
> goto err_free;
> }
>
> + if (!exclusive_event_ok(event, ctx)) {
> + err = -EBUSY;
> + goto err_context;
> + }
> +
> WARN_ON_ONCE(ctx->parent_ctx);
> mutex_lock(&ctx->mutex);

So we can call exclusive_event_ok() here without dropping the
ctx::mutex, to make sure we don't race with another event creation.

Regards,
--
Alex
--
To unsubscribe from this list: send the line "unsubscribe linux-kernel" in
the body of a message to majordomo@xxxxxxxxxxxxxxx
More majordomo info at http://vger.kernel.org/majordomo-info.html
Please read the FAQ at http://www.tux.org/lkml/