Re: [PATCH v2] perf/core: Fix cgroup events tracking

From: Chengming Zhou
Date: Fri Dec 16 2022 - 06:25:45 EST


Hello, ping :-)


On 2022/12/7 20:40, Chengming Zhou wrote:
> We encounter perf warnings when using cgroup events like:
> ```
> cd /sys/fs/cgroup
> mkdir test
> perf stat -e cycles -a -G test
> ```
>
> WARNING: CPU: 0 PID: 690 at kernel/events/core.c:849 perf_cgroup_switch+0xb2/0xc0
> Call Trace:
> <TASK>
> __schedule+0x4ae/0x9f0
> ? _raw_spin_unlock_irqrestore+0x23/0x40
> ? __cond_resched+0x18/0x20
> preempt_schedule_common+0x2d/0x70
> __cond_resched+0x18/0x20
> wait_for_completion+0x2f/0x160
> ? cpu_stop_queue_work+0x9e/0x130
> affine_move_task+0x18a/0x4f0
>
> WARNING: CPU: 0 PID: 690 at kernel/events/core.c:829 ctx_sched_in+0x1cf/0x1e0
> Call Trace:
> <TASK>
> ? ctx_sched_out+0xb7/0x1b0
> perf_cgroup_switch+0x88/0xc0
> __schedule+0x4ae/0x9f0
> ? _raw_spin_unlock_irqrestore+0x23/0x40
> ? __cond_resched+0x18/0x20
> preempt_schedule_common+0x2d/0x70
> __cond_resched+0x18/0x20
> wait_for_completion+0x2f/0x160
> ? cpu_stop_queue_work+0x9e/0x130
> affine_move_task+0x18a/0x4f0
>
> The above two warnings are not complete here since I removed other
> unimportant information. The problem is caused by the perf cgroup
> events tracking:
>
> CPU0                                    CPU1
> perf_event_open()
>   perf_event_alloc()
>     account_event()
>       account_event_cpu()
>         atomic_inc(perf_cgroup_events)
>                                         __perf_event_task_sched_out()
>                                           if (atomic_read(perf_cgroup_events))
>                                             perf_cgroup_switch()
>                                               // kernel/events/core.c:849
>                                               WARN_ON_ONCE(cpuctx->ctx.nr_cgroups == 0)
>                                               if (READ_ONCE(cpuctx->cgrp) == cgrp) // false
>                                                 return
>                                               perf_ctx_lock()
>                                               ctx_sched_out()
>                                               cpuctx->cgrp = cgrp
>                                               ctx_sched_in()
>                                                 perf_cgroup_set_timestamp()
>                                                   // kernel/events/core.c:829
>                                                   WARN_ON_ONCE(!ctx->nr_cgroups)
>                                               perf_ctx_unlock()
>   perf_install_in_context()
>     cpu_function_call()
>                                             __perf_install_in_context()
>                                               add_event_to_ctx()
>                                                 list_add_event()
>                                                   perf_cgroup_event_enable()
>                                                     ctx->nr_cgroups++
>                                                     cpuctx->cgrp = X
>
> We can see from the above that we wrongly use the percpu atomic
> perf_cgroup_events to check whether we need to call perf_cgroup_switch(),
> which should only be called when we know this CPU has cgroup events enabled.
>
> Commit bd2756811766 ("perf: Rewrite core context handling") changed the
> code to have only one context per CPU, so we can just use cpuctx->cgrp to
> check if this CPU has cgroup events enabled.
>
> So the percpu atomic perf_cgroup_events is no longer needed.
>
> Fixes: bd2756811766 ("perf: Rewrite core context handling")
> Signed-off-by: Chengming Zhou <zhouchengming@xxxxxxxxxxxxx>
> Tested-by: Ravi Bangoria <ravi.bangoria@xxxxxxx>
> ---
> v2:
> - Remove timestamps and fix race graph in commit log per Ravi Bangoria.
> - Add Fixes tag and Tested-by tag.
> ---
> kernel/events/core.c | 42 ++++++++++--------------------------------
> 1 file changed, 10 insertions(+), 32 deletions(-)
>
> diff --git a/kernel/events/core.c b/kernel/events/core.c
> index eacc3702654d..5d97a9f26003 100644
> --- a/kernel/events/core.c
> +++ b/kernel/events/core.c
> @@ -380,7 +380,6 @@ enum event_type_t {
>
> /*
> * perf_sched_events : >0 events exist
> - * perf_cgroup_events: >0 per-cpu cgroup events exist on this cpu
> */
>
> static void perf_sched_delayed(struct work_struct *work);
> @@ -389,7 +388,6 @@ static DECLARE_DELAYED_WORK(perf_sched_work, perf_sched_delayed);
> static DEFINE_MUTEX(perf_sched_mutex);
> static atomic_t perf_sched_count;
>
> -static DEFINE_PER_CPU(atomic_t, perf_cgroup_events);
> static DEFINE_PER_CPU(struct pmu_event_list, pmu_sb_events);
>
> static atomic_t nr_mmap_events __read_mostly;
> @@ -844,9 +842,16 @@ static void perf_cgroup_switch(struct task_struct *task)
> struct perf_cpu_context *cpuctx = this_cpu_ptr(&perf_cpu_context);
> struct perf_cgroup *cgrp;
>
> - cgrp = perf_cgroup_from_task(task, NULL);
> + /*
> + * cpuctx->cgrp is set when the first cgroup event is enabled,
> + * and is cleared when the last cgroup event is disabled.
> + */
> + if (READ_ONCE(cpuctx->cgrp) == NULL)
> + return;
>
> WARN_ON_ONCE(cpuctx->ctx.nr_cgroups == 0);
> +
> + cgrp = perf_cgroup_from_task(task, NULL);
> if (READ_ONCE(cpuctx->cgrp) == cgrp)
> return;
>
> @@ -3631,8 +3636,7 @@ void __perf_event_task_sched_out(struct task_struct *task,
> * to check if we have to switch out PMU state.
> * cgroup event are system-wide mode only
> */
> - if (atomic_read(this_cpu_ptr(&perf_cgroup_events)))
> - perf_cgroup_switch(next);
> + perf_cgroup_switch(next);
> }
>
> static bool perf_less_group_idx(const void *l, const void *r)
> @@ -4974,15 +4978,6 @@ static void unaccount_pmu_sb_event(struct perf_event *event)
> detach_sb_event(event);
> }
>
> -static void unaccount_event_cpu(struct perf_event *event, int cpu)
> -{
> - if (event->parent)
> - return;
> -
> - if (is_cgroup_event(event))
> - atomic_dec(&per_cpu(perf_cgroup_events, cpu));
> -}
> -
> #ifdef CONFIG_NO_HZ_FULL
> static DEFINE_SPINLOCK(nr_freq_lock);
> #endif
> @@ -5048,8 +5043,6 @@ static void unaccount_event(struct perf_event *event)
> schedule_delayed_work(&perf_sched_work, HZ);
> }
>
> - unaccount_event_cpu(event, event->cpu);
> -
> unaccount_pmu_sb_event(event);
> }
>
> @@ -11679,15 +11672,6 @@ static void account_pmu_sb_event(struct perf_event *event)
> attach_sb_event(event);
> }
>
> -static void account_event_cpu(struct perf_event *event, int cpu)
> -{
> - if (event->parent)
> - return;
> -
> - if (is_cgroup_event(event))
> - atomic_inc(&per_cpu(perf_cgroup_events, cpu));
> -}
> -
> /* Freq events need the tick to stay alive (see perf_event_task_tick). */
> static void account_freq_event_nohz(void)
> {
> @@ -11775,8 +11759,6 @@ static void account_event(struct perf_event *event)
> }
> enabled:
>
> - account_event_cpu(event, event->cpu);
> -
> account_pmu_sb_event(event);
> }
>
> @@ -12822,13 +12804,11 @@ static void __perf_pmu_remove(struct perf_event_context *ctx,
>
> perf_event_groups_for_cpu_pmu(event, groups, cpu, pmu) {
> perf_remove_from_context(event, 0);
> - unaccount_event_cpu(event, cpu);
> put_pmu_ctx(event->pmu_ctx);
> list_add(&event->migrate_entry, events);
>
> for_each_sibling_event(sibling, event) {
> perf_remove_from_context(sibling, 0);
> - unaccount_event_cpu(sibling, cpu);
> put_pmu_ctx(sibling->pmu_ctx);
> list_add(&sibling->migrate_entry, events);
> }
> @@ -12847,7 +12827,6 @@ static void __perf_pmu_install_event(struct pmu *pmu,
>
> if (event->state >= PERF_EVENT_STATE_OFF)
> event->state = PERF_EVENT_STATE_INACTIVE;
> - account_event_cpu(event, cpu);
> perf_install_in_context(ctx, event, cpu);
> }
>
> @@ -13742,8 +13721,7 @@ static int __perf_cgroup_move(void *info)
> struct task_struct *task = info;
>
> preempt_disable();
> - if (atomic_read(this_cpu_ptr(&perf_cgroup_events)))
> - perf_cgroup_switch(task);
> + perf_cgroup_switch(task);
> preempt_enable();
>
> return 0;