Re: [PATCH 2/7] perf: Generalize task_function_call()ers

From: Peter Zijlstra
Date: Fri Dec 18 2015 - 04:01:39 EST


On Thu, Dec 17, 2015 at 04:07:32PM +0100, Peter Zijlstra wrote:
> On Thu, Dec 17, 2015 at 04:25:14PM +0200, Alexander Shishkin wrote:
>
> > That aside, why I brought it up in the first place is because the two
> > functions are asymmetric: one is called with irqs disabled and the
> > other -- with ctx::lock held (and not because I'm into bikeshedding or
> > anything like that). Looking at the pair of them sets off my "that's not
> > right" trigger and sends me to the event_function_call()
> > implementation. So in that sense, prepending an extra underscore kind of
> > made sense. Maybe __perf_remove_from_context_{on,off}()?
>
> You are quite right, and I think I've found more problems because of
> this. Let me prod at this some more.

So this then...

This fixes, I think, 3 separate bugs:

- remove_from_context used to clear ->is_active; this is against the
  update rules from ctx_sched_in(), which set ->is_active even though
  there might be !nr_events

- install_in_context did bad things to cpuctx->task_ctx; it would not
  validate that ctx->task == current and could do random things because
  of that

- cpuctx->task_ctx tracking was iffy (see the sketch right below)
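
For reviewers, a stand-alone sketch of the ->is_active / cpuctx->task_ctx
rules that the new event_function() asserts; the structs here are
simplified stand-ins, not the kernel types:

/*
 * Stand-alone sketch (plain C, simplified types -- not the kernel
 * structures) of the state rules asserted in event_function():
 *
 *  - ctx->is_active is set irrespective of the number of events;
 *  - cpuctx->task_ctx is only set when there are actual events on.
 */
#include <assert.h>
#include <stdbool.h>
#include <stddef.h>

struct task_ctx { bool is_active; int nr_events; };
struct cpu_ctx  { struct task_ctx *task_ctx; };

/* Mirrors the WARN_ON_ONCE() pair added in event_function(). */
static void check_task_ctx(struct cpu_ctx *cpuctx, struct task_ctx *ctx)
{
	if (ctx->is_active && ctx->nr_events)
		assert(cpuctx->task_ctx == ctx);	/* events on: must be tracked */
	else
		assert(cpuctx->task_ctx == NULL);	/* no events: must not be tracked */
}

int main(void)
{
	struct task_ctx ctx = { .is_active = true, .nr_events = 0 };
	struct cpu_ctx cpuctx = { .task_ctx = NULL };

	/* Freshly scheduled-in ctx without events: active, but no task_ctx. */
	check_task_ctx(&cpuctx, &ctx);

	/* An event got installed: task_ctx must now point at ctx. */
	ctx.nr_events = 1;
	cpuctx.task_ctx = &ctx;
	check_task_ctx(&cpuctx, &ctx);

	return 0;
}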

It also unifies a lot of the weird and fragile code we had around all
those IPI calls and adds a bunch of assertions.
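
To illustrate the unified calling convention, here is a toy model only:
the event_f typedef and the two dispatch paths match the patch, but the
helper and callback names (model_event_function_call, set_period,
task_running) are made up for the example and there are no real IPIs or
locks in it.

/*
 * Toy model of the single-callback convention: every modifier is an
 * event_f, and event_function_call() either IPIs the task's CPU (where
 * event_function() revalidates state and takes the locks) or, if the
 * task is not running, runs the callback directly under ctx->lock.
 */
#include <stdbool.h>
#include <stdio.h>

struct perf_event { int dummy; };
struct perf_cpu_context { int dummy; };
struct perf_event_context { bool task_running; };

/* Same shape as the event_f typedef introduced by the patch. */
typedef void (*event_f)(struct perf_event *, struct perf_cpu_context *,
			struct perf_event_context *, void *);

/* A callback in the new style, roughly what __perf_event_period() becomes. */
static void set_period(struct perf_event *event, struct perf_cpu_context *cpuctx,
		       struct perf_event_context *ctx, void *info)
{
	unsigned long long value = *(unsigned long long *)info;

	(void)event; (void)cpuctx; (void)ctx;
	printf("sample_period := %llu\n", value);
}

/* Stand-in for event_function_call(); no real IPIs or locking here. */
static void model_event_function_call(struct perf_event *event,
				      struct perf_event_context *ctx,
				      event_f func, void *data)
{
	struct perf_cpu_context cpuctx = { 0 };

	if (ctx->task_running) {
		/* real code: task_function_call(task, event_function, &efs), retried */
		printf("IPI path\n");
	} else {
		/* real code: raw_spin_lock_irq(&ctx->lock); func(...); unlock */
		printf("locked path\n");
	}
	func(event, &cpuctx, ctx, data);
}

int main(void)
{
	struct perf_event event = { 0 };
	struct perf_event_context ctx = { .task_running = false };
	unsigned long long period = 4096;

	model_event_function_call(&event, &ctx, set_period, &period);
	return 0;
}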

It seems to survive a little pounding with 'normal' workloads.

Please have an extra careful look..

Signed-off-by: Peter Zijlstra (Intel) <peterz@xxxxxxxxxxxxx>
---
 include/linux/perf_event.h    |    4
 kernel/events/core.c          |  463 ++++++++++++++++--------------------------
 kernel/events/hw_breakpoint.c |    2
 3 files changed, 180 insertions(+), 289 deletions(-)

--- a/include/linux/perf_event.h
+++ b/include/linux/perf_event.h
@@ -1044,7 +1044,7 @@ extern void perf_swevent_put_recursion_c
extern u64 perf_swevent_set_period(struct perf_event *event);
extern void perf_event_enable(struct perf_event *event);
extern void perf_event_disable(struct perf_event *event);
-extern int __perf_event_disable(void *info);
+extern void perf_event_disable_local(struct perf_event *event);
extern void perf_event_task_tick(void);
#else /* !CONFIG_PERF_EVENTS: */
static inline void *
@@ -1106,7 +1106,7 @@ static inline void perf_swevent_put_recu
static inline u64 perf_swevent_set_period(struct perf_event *event) { return 0; }
static inline void perf_event_enable(struct perf_event *event) { }
static inline void perf_event_disable(struct perf_event *event) { }
-static inline int __perf_event_disable(void *info) { return -1; }
+static inline void perf_event_disable_local(struct perf_event *event) { }
static inline void perf_event_task_tick(void) { }
static inline int perf_event_release_kernel(struct perf_event *event) { return 0; }
#endif
--- a/kernel/events/core.c
+++ b/kernel/events/core.c
@@ -126,21 +126,127 @@ static int cpu_function_call(int cpu, re
return data.ret;
}

-static void event_function_call(struct perf_event *event,
- int (*active)(void *),
- void (*inactive)(void *),
- void *data)
+static inline struct perf_cpu_context *
+__get_cpu_context(struct perf_event_context *ctx)
+{
+ return this_cpu_ptr(ctx->pmu->pmu_cpu_context);
+}
+
+static void perf_ctx_lock(struct perf_cpu_context *cpuctx,
+ struct perf_event_context *ctx)
+{
+ raw_spin_lock(&cpuctx->ctx.lock);
+ if (ctx)
+ raw_spin_lock(&ctx->lock);
+}
+
+static void perf_ctx_unlock(struct perf_cpu_context *cpuctx,
+ struct perf_event_context *ctx)
+{
+ if (ctx)
+ raw_spin_unlock(&ctx->lock);
+ raw_spin_unlock(&cpuctx->ctx.lock);
+}
+
+typedef void (*event_f)(struct perf_event *, struct perf_cpu_context *,
+ struct perf_event_context *, void *);
+
+struct event_function_struct {
+ struct perf_event *event;
+ event_f func;
+ void *data;
+};
+
+static int event_function(void *info)
+{
+ struct event_function_struct *efs = info;
+ struct perf_event *event = efs->event;
+ struct perf_event_context *ctx = event->ctx;
+ struct perf_cpu_context *cpuctx = __get_cpu_context(ctx);
+ struct perf_event_context *task_ctx = NULL;
+
+ WARN_ON_ONCE(!irqs_disabled());
+
+ /*
+ * If this is aimed at a task, check that we hit it.
+ *
+ * This being a smp_call_function() we'll have IRQs disabled, therefore
+ * if this is current, we cannot race with context switches swapping
+ * out contexts, see perf_event_context_sched_out().
+ */
+ if (ctx->task) {
+ /* Wrong task, try again. */
+ if (ctx->task != current)
+ return -EAGAIN;
+
+ task_ctx = ctx;
+ } else
+ WARN_ON_ONCE(&cpuctx->ctx != ctx);
+
+ perf_ctx_lock(cpuctx, task_ctx);
+ /*
+ * Now that we hold locks, double check state. Paranoia pays.
+ */
+ if (task_ctx) {
+ WARN_ON_ONCE(task_ctx->task != current);
+
+ /*
+ * The rules are that:
+ *
+ * * ctx->is_active is set irrespective of the actual number of
+ * events, such that we can know it got scheduled in etc.
+ *
+ * * cpuctx->task_ctx is only set if there were actual events on.
+ *
+ * It can happen that even though we hit the right task,
+ * ->is_active is still false, this is possible when the ctx is
+ * new and the task hasn't scheduled yet, or the ctx on its way
+ * out.
+ */
+ if (task_ctx->is_active && task_ctx->nr_events) {
+ WARN_ON_ONCE(cpuctx->task_ctx != task_ctx);
+ } else {
+ WARN_ON_ONCE(cpuctx->task_ctx);
+ }
+ }
+
+ efs->func(event, cpuctx, ctx, efs->data);
+ perf_ctx_unlock(cpuctx, task_ctx);
+
+ return 0;
+}
+
+static void event_function_local(struct perf_event *event, event_f func, void *data)
+{
+ struct event_function_struct efs = {
+ .event = event,
+ .func = func,
+ .data = data,
+ };
+
+ int ret = event_function(&efs);
+ WARN_ON_ONCE(ret);
+}
+
+static void event_function_call(struct perf_event *event, event_f func, void *data)
{
struct perf_event_context *ctx = event->ctx;
struct task_struct *task = ctx->task;
+ struct perf_cpu_context *cpuctx;
+
+ struct event_function_struct efs = {
+ .event = event,
+ .func = func,
+ .data = data,
+ };

if (!task) {
- cpu_function_call(event->cpu, active, data);
+ cpu_function_call(event->cpu, event_function, &efs);
return;
}

again:
- if (!task_function_call(task, active, data))
+ if (!task_function_call(task, event_function, &efs))
return;

raw_spin_lock_irq(&ctx->lock);
@@ -153,7 +259,8 @@ static void event_function_call(struct p
raw_spin_unlock_irq(&ctx->lock);
goto again;
}
- inactive(data);
+ cpuctx = __get_cpu_context(ctx);
+ func(event, cpuctx, ctx, data);
raw_spin_unlock_irq(&ctx->lock);
}

@@ -368,28 +475,6 @@ static inline u64 perf_event_clock(struc
return event->clock();
}

-static inline struct perf_cpu_context *
-__get_cpu_context(struct perf_event_context *ctx)
-{
- return this_cpu_ptr(ctx->pmu->pmu_cpu_context);
-}
-
-static void perf_ctx_lock(struct perf_cpu_context *cpuctx,
- struct perf_event_context *ctx)
-{
- raw_spin_lock(&cpuctx->ctx.lock);
- if (ctx)
- raw_spin_lock(&ctx->lock);
-}
-
-static void perf_ctx_unlock(struct perf_cpu_context *cpuctx,
- struct perf_event_context *ctx)
-{
- if (ctx)
- raw_spin_unlock(&ctx->lock);
- raw_spin_unlock(&cpuctx->ctx.lock);
-}
-
#ifdef CONFIG_CGROUP_PERF

static inline bool
@@ -1655,47 +1740,18 @@ group_sched_out(struct perf_event *group
cpuctx->exclusive = 0;
}

-struct remove_event {
- struct perf_event *event;
- bool detach_group;
-};
-
-static void ___perf_remove_from_context(void *info)
-{
- struct remove_event *re = info;
- struct perf_event *event = re->event;
- struct perf_event_context *ctx = event->ctx;
-
- if (re->detach_group)
- perf_group_detach(event);
- list_del_event(event, ctx);
-}
-
-/*
- * Cross CPU call to remove a performance event
- *
- * We disable the event on the hardware level first. After that we
- * remove it from the context list.
- */
-static int __perf_remove_from_context(void *info)
+static void
+__perf_remove_from_context(struct perf_event *event, struct perf_cpu_context *cpuctx,
+ struct perf_event_context *ctx, void *info)
{
- struct remove_event *re = info;
- struct perf_event *event = re->event;
- struct perf_event_context *ctx = event->ctx;
- struct perf_cpu_context *cpuctx = __get_cpu_context(ctx);
+ bool detach_group = (unsigned long)info;

- raw_spin_lock(&ctx->lock);
event_sched_out(event, cpuctx, ctx);
- if (re->detach_group)
+ if (detach_group)
perf_group_detach(event);
list_del_event(event, ctx);
- if (!ctx->nr_events && cpuctx->task_ctx == ctx) {
- ctx->is_active = 0;
+ if (!ctx->nr_events && cpuctx->task_ctx == ctx)
cpuctx->task_ctx = NULL;
- }
- raw_spin_unlock(&ctx->lock);
-
- return 0;
}

/*
@@ -1714,70 +1770,33 @@ static int __perf_remove_from_context(vo
static void perf_remove_from_context(struct perf_event *event, bool detach_group)
{
struct perf_event_context *ctx = event->ctx;
- struct remove_event re = {
- .event = event,
- .detach_group = detach_group,
- };

lockdep_assert_held(&ctx->mutex);

event_function_call(event, __perf_remove_from_context,
- ___perf_remove_from_context, &re);
+ (void *)(unsigned long)detach_group);
}

-/*
- * Cross CPU call to disable a performance event
- */
-int __perf_event_disable(void *info)
+static void
+__perf_event_disable(struct perf_event *event, struct perf_cpu_context *cpuctx,
+ struct perf_event_context *ctx, void *info)
{
- struct perf_event *event = info;
- struct perf_event_context *ctx = event->ctx;
- struct perf_cpu_context *cpuctx = __get_cpu_context(ctx);
-
- /*
- * If this is a per-task event, need to check whether this
- * event's task is the current task on this cpu.
- *
- * Can trigger due to concurrent perf_event_context_sched_out()
- * flipping contexts around.
- */
- if (ctx->task && cpuctx->task_ctx != ctx)
- return -EINVAL;
-
- raw_spin_lock(&ctx->lock);
-
- /*
- * If the event is on, turn it off.
- * If it is in error state, leave it in error state.
- */
- if (event->state >= PERF_EVENT_STATE_INACTIVE) {
- update_context_time(ctx);
- update_cgrp_time_from_event(event);
- update_group_times(event);
- if (event == event->group_leader)
- group_sched_out(event, cpuctx, ctx);
- else
- event_sched_out(event, cpuctx, ctx);
- event->state = PERF_EVENT_STATE_OFF;
- }
-
- raw_spin_unlock(&ctx->lock);
+ if (event->state < PERF_EVENT_STATE_INACTIVE)
+ return;

- return 0;
+ update_context_time(ctx);
+ update_cgrp_time_from_event(event);
+ update_group_times(event);
+ if (event == event->group_leader)
+ group_sched_out(event, cpuctx, ctx);
+ else
+ event_sched_out(event, cpuctx, ctx);
+ event->state = PERF_EVENT_STATE_OFF;
}

-void ___perf_event_disable(void *info)
+void perf_event_disable_local(struct perf_event *event)
{
- struct perf_event *event = info;
-
- /*
- * Since we have the lock this context can't be scheduled
- * in, so we can change the state safely.
- */
- if (event->state == PERF_EVENT_STATE_INACTIVE) {
- update_group_times(event);
- event->state = PERF_EVENT_STATE_OFF;
- }
+ event_function_local(event, __perf_event_disable, NULL);
}

/*
@@ -1804,8 +1823,7 @@ static void _perf_event_disable(struct p
}
raw_spin_unlock_irq(&ctx->lock);

- event_function_call(event, __perf_event_disable,
- ___perf_event_disable, event);
+ event_function_call(event, __perf_event_disable, NULL);
}

/*
@@ -2054,62 +2072,30 @@ static void perf_event_sched_in(struct p
if (ctx)
ctx_sched_in(ctx, cpuctx, EVENT_PINNED, task);
cpu_ctx_sched_in(cpuctx, EVENT_FLEXIBLE, task);
- if (ctx)
+ if (ctx) {
ctx_sched_in(ctx, cpuctx, EVENT_FLEXIBLE, task);
+ if (ctx->nr_events)
+ cpuctx->task_ctx = ctx;
+ }
}

-static void ___perf_install_in_context(void *info)
-{
- struct perf_event *event = info;
- struct perf_event_context *ctx = event->ctx;
-
- /*
- * Since the task isn't running, its safe to add the event, us holding
- * the ctx->lock ensures the task won't get scheduled in.
- */
- add_event_to_ctx(event, ctx);
-}
-
-/*
- * Cross CPU call to install and enable a performance event
- *
- * Must be called with ctx->mutex held
- */
-static int __perf_install_in_context(void *info)
+static void perf_resched_context(struct perf_cpu_context *cpuctx)
{
- struct perf_event *event = info;
- struct perf_event_context *ctx = event->ctx;
- struct perf_cpu_context *cpuctx = __get_cpu_context(ctx);
struct perf_event_context *task_ctx = cpuctx->task_ctx;
- struct task_struct *task = current;

- perf_ctx_lock(cpuctx, task_ctx);
perf_pmu_disable(cpuctx->ctx.pmu);
-
- /*
- * If there was an active task_ctx schedule it out.
- */
if (task_ctx)
task_ctx_sched_out(task_ctx);
-
- /*
- * If the context we're installing events in is not the
- * active task_ctx, flip them.
- */
- if (ctx->task && task_ctx != ctx) {
- if (task_ctx)
- raw_spin_unlock(&task_ctx->lock);
- raw_spin_lock(&ctx->lock);
- task_ctx = ctx;
- }
-
- if (task_ctx) {
- cpuctx->task_ctx = task_ctx;
- task = task_ctx->task;
- }
-
cpu_ctx_sched_out(cpuctx, EVENT_ALL);

+ perf_event_sched_in(cpuctx, task_ctx, current);
+ perf_pmu_enable(cpuctx->ctx.pmu);
+}
+
+static void
+__perf_install_in_context(struct perf_event *event, struct perf_cpu_context *cpuctx,
+ struct perf_event_context *ctx, void *info)
+{
update_context_time(ctx);
/*
* update cgrp time only if current cgrp
@@ -2120,15 +2106,8 @@ static int __perf_install_in_context(vo

add_event_to_ctx(event, ctx);

- /*
- * Schedule everything back in
- */
- perf_event_sched_in(cpuctx, task_ctx, task);
-
- perf_pmu_enable(cpuctx->ctx.pmu);
- perf_ctx_unlock(cpuctx, task_ctx);
-
- return 0;
+ if (ctx->is_active)
+ perf_resched_context(cpuctx);
}

/*
@@ -2152,8 +2131,7 @@ perf_install_in_context(struct perf_even
if (event->cpu != -1)
event->cpu = cpu;

- event_function_call(event, __perf_install_in_context,
- ___perf_install_in_context, event);
+ event_function_call(event, __perf_install_in_context, NULL);
}

/*
@@ -2177,88 +2155,33 @@ static void __perf_event_mark_enabled(st
}
}

-/*
- * Cross CPU call to enable a performance event
- */
-static int __perf_event_enable(void *info)
+static void
+__perf_event_enable(struct perf_event *event, struct perf_cpu_context *cpuctx,
+ struct perf_event_context *ctx, void *info)
{
- struct perf_event *event = info;
- struct perf_event_context *ctx = event->ctx;
struct perf_event *leader = event->group_leader;
- struct perf_cpu_context *cpuctx = __get_cpu_context(ctx);
- int err;
-
- /*
- * There's a time window between 'ctx->is_active' check
- * in perf_event_enable function and this place having:
- * - IRQs on
- * - ctx->lock unlocked
- *
- * where the task could be killed and 'ctx' deactivated
- * by perf_event_exit_task.
- */
- if (!ctx->is_active)
- return -EINVAL;
-
- raw_spin_lock(&ctx->lock);
- update_context_time(ctx);

if (event->state >= PERF_EVENT_STATE_INACTIVE)
- goto unlock;
+ return;

- /*
- * set current task's cgroup time reference point
- */
+ update_context_time(ctx);
perf_cgroup_set_timestamp(current, ctx);
-
__perf_event_mark_enabled(event);

if (!event_filter_match(event)) {
if (is_cgroup_event(event))
perf_cgroup_defer_enabled(event);
- goto unlock;
+ return;
}

/*
- * If the event is in a group and isn't the group leader,
- * then don't put it on unless the group is on.
+ * If we're part of a group and the leader isn't active, we won't be either.
*/
if (leader != event && leader->state != PERF_EVENT_STATE_ACTIVE)
- goto unlock;
-
- if (!group_can_go_on(event, cpuctx, 1)) {
- err = -EEXIST;
- } else {
- if (event == leader)
- err = group_sched_in(event, cpuctx, ctx);
- else
- err = event_sched_in(event, cpuctx, ctx);
- }
-
- if (err) {
- /*
- * If this event can't go on and it's part of a
- * group, then the whole group has to come off.
- */
- if (leader != event) {
- group_sched_out(leader, cpuctx, ctx);
- perf_mux_hrtimer_restart(cpuctx);
- }
- if (leader->attr.pinned) {
- update_group_times(leader);
- leader->state = PERF_EVENT_STATE_ERROR;
- }
- }
-
-unlock:
- raw_spin_unlock(&ctx->lock);
-
- return 0;
-}
+ return;

-void ___perf_event_enable(void *info)
-{
- __perf_event_mark_enabled((struct perf_event *)info);
+ if (ctx->is_active)
+ perf_resched_context(cpuctx);
}

/*
@@ -2291,8 +2214,7 @@ static void _perf_event_enable(struct pe
event->state = PERF_EVENT_STATE_OFF;
raw_spin_unlock_irq(&ctx->lock);

- event_function_call(event, __perf_event_enable,
- ___perf_event_enable, event);
+ event_function_call(event, __perf_event_enable, NULL);
}

/*
@@ -2774,9 +2696,6 @@ static void perf_event_context_sched_in(
*/
cpu_ctx_sched_out(cpuctx, EVENT_FLEXIBLE);

- if (ctx->nr_events)
- cpuctx->task_ctx = ctx;
-
perf_event_sched_in(cpuctx, cpuctx->task_ctx, task);

perf_pmu_enable(ctx->pmu);
@@ -4086,36 +4005,13 @@ static void perf_event_for_each(struct p
perf_event_for_each_child(sibling, func);
}

-struct period_event {
- struct perf_event *event;
- u64 value;
-};
-
-static void ___perf_event_period(void *info)
-{
- struct period_event *pe = info;
- struct perf_event *event = pe->event;
- u64 value = pe->value;
-
- if (event->attr.freq) {
- event->attr.sample_freq = value;
- } else {
- event->attr.sample_period = value;
- event->hw.sample_period = value;
- }
-
- local64_set(&event->hw.period_left, 0);
-}
-
-static int __perf_event_period(void *info)
+static void
+__perf_event_period(struct perf_event *event, struct perf_cpu_context *cpuctx,
+ struct perf_event_context *ctx, void *info)
{
- struct period_event *pe = info;
- struct perf_event *event = pe->event;
- struct perf_event_context *ctx = event->ctx;
- u64 value = pe->value;
+ u64 value = *((u64 *)info);
bool active;

- raw_spin_lock(&ctx->lock);
if (event->attr.freq) {
event->attr.sample_freq = value;
} else {
@@ -4125,6 +4021,7 @@ static int __perf_event_period(void *inf

active = (event->state == PERF_EVENT_STATE_ACTIVE);
if (active) {
+ WARN_ON_ONCE(!ctx->is_active);
perf_pmu_disable(ctx->pmu);
event->pmu->stop(event, PERF_EF_UPDATE);
}
@@ -4135,14 +4032,10 @@ static int __perf_event_period(void *inf
event->pmu->start(event, PERF_EF_RELOAD);
perf_pmu_enable(ctx->pmu);
}
- raw_spin_unlock(&ctx->lock);
-
- return 0;
}

static int perf_event_period(struct perf_event *event, u64 __user *arg)
{
- struct period_event pe = { .event = event, };
u64 value;

if (!is_sampling_event(event))
@@ -4157,10 +4050,7 @@ static int perf_event_period(struct perf
if (event->attr.freq && value > sysctl_perf_event_sample_rate)
return -EINVAL;

- pe.value = value;
-
- event_function_call(event, __perf_event_period,
- ___perf_event_period, &pe);
+ event_function_call(event, __perf_event_period, &value);

return 0;
}
@@ -4932,7 +4822,7 @@ static void perf_pending_event(struct ir

if (event->pending_disable) {
event->pending_disable = 0;
- __perf_event_disable(event);
+ perf_event_disable_local(event);
}

if (event->pending_wakeup) {
@@ -9221,13 +9111,14 @@ static void perf_event_init_cpu(int cpu)
#if defined CONFIG_HOTPLUG_CPU || defined CONFIG_KEXEC_CORE
static void __perf_event_exit_context(void *__info)
{
- struct remove_event re = { .detach_group = true };
struct perf_event_context *ctx = __info;
+ struct perf_cpu_context *cpuctx = __get_cpu_context(ctx);
+ struct perf_event *event;

- rcu_read_lock();
- list_for_each_entry_rcu(re.event, &ctx->event_list, event_entry)
- __perf_remove_from_context(&re);
- rcu_read_unlock();
+ raw_spin_lock(&ctx->lock);
+ list_for_each_entry(event, &ctx->event_list, event_entry)
+ __perf_remove_from_context(event, cpuctx, ctx, (void *)(unsigned long)true);
+ raw_spin_unlock(&ctx->lock);
}

static void perf_event_exit_cpu_context(int cpu)
--- a/kernel/events/hw_breakpoint.c
+++ b/kernel/events/hw_breakpoint.c
@@ -444,7 +444,7 @@ int modify_user_hw_breakpoint(struct per
* current task.
*/
if (irqs_disabled() && bp->ctx && bp->ctx->task == current)
- __perf_event_disable(bp);
+ perf_event_disable_local(bp);
else
perf_event_disable(bp);

--