Re: [PATCH] perf: Rewrite core context handling

From: Peter Zijlstra
Date: Mon Oct 10 2022 - 06:23:51 EST


On Tue, Sep 06, 2022 at 11:20:53AM +0530, Ravi Bangoria wrote:

> This one was simple enough, so I prepared a patch for it. Let
> me know if you see any issues with the diff below.

I've extracted this into a separate patch since it's not strictly required
for correctness and the main patch is already quite large.
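
To spell out what the new iteration buys: after the context rewrite the
pinned/flexible groups are kept in trees keyed by cpu and pmu (among
other things), so all perf_tracepoint events for the current CPU form
one contiguous subtree that perf_event_groups_first()/
perf_event_groups_next() can walk directly, instead of filtering every
event on ctx->event_list. A userspace toy of that lookup (made-up toy_*
names, and a sorted array standing in for the kernel's RB-tree, so a
sketch rather than the real thing):

#include <stdio.h>
#include <stdlib.h>

struct toy_event {
        int cpu;
        int pmu;                /* which PMU the event belongs to */
        const char *name;
};

static int toy_cmp(const void *a, const void *b)
{
        const struct toy_event *x = a, *y = b;

        if (x->cpu != y->cpu)
                return x->cpu - y->cpu;
        return x->pmu - y->pmu;
}

/* Lower bound: first event with this (cpu, pmu) key, NULL if none. */
static struct toy_event *toy_groups_first(struct toy_event *ev, int n,
                                          int cpu, int pmu)
{
        int lo = 0, hi = n;

        while (lo < hi) {
                int mid = lo + (hi - lo) / 2;

                if (ev[mid].cpu < cpu ||
                    (ev[mid].cpu == cpu && ev[mid].pmu < pmu))
                        lo = mid + 1;
                else
                        hi = mid;
        }
        return (lo < n && ev[lo].cpu == cpu && ev[lo].pmu == pmu) ?
                &ev[lo] : NULL;
}

/* Next event with the same key, NULL once we leave the subtree. */
static struct toy_event *toy_groups_next(struct toy_event *ev, int n,
                                         struct toy_event *e)
{
        struct toy_event *next = e + 1;

        return (next < ev + n && next->cpu == e->cpu && next->pmu == e->pmu) ?
                next : NULL;
}

int main(void)
{
        struct toy_event ev[] = {
                { 0, 2, "tp:sched_switch/cpu0" },
                { 1, 1, "hw:cycles/cpu1" },
                { 1, 2, "tp:sched_switch/cpu1" },
                { 1, 2, "tp:sys_enter/cpu1" },
                { 1, 3, "sw:page_faults/cpu1" },
        };
        int n = sizeof(ev) / sizeof(ev[0]);
        struct toy_event *e;

        qsort(ev, n, sizeof(ev[0]), toy_cmp);

        /* Visit only the tracepoint events (pmu == 2) on CPU 1. */
        for (e = toy_groups_first(ev, n, 1, 2); e;
             e = toy_groups_next(ev, n, e))
                printf("deliver to %s\n", e->name);

        return 0;
}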

> ---
> diff --git a/kernel/events/core.c b/kernel/events/core.c
> index 820c56c66b26..e0232e0bb74e 100644
> --- a/kernel/events/core.c
> +++ b/kernel/events/core.c
> @@ -9807,6 +9807,44 @@ static struct pmu perf_swevent = {
>
> #ifdef CONFIG_EVENT_TRACING
>
> +static void tp_perf_event_destroy(struct perf_event *event)
> +{
> +        perf_trace_destroy(event);
> +}
> +
> +static int perf_tp_event_init(struct perf_event *event)
> +{
> +        int err;
> +
> +        if (event->attr.type != PERF_TYPE_TRACEPOINT)
> +                return -ENOENT;
> +
> +        /*
> +         * no branch sampling for tracepoint events
> +         */
> +        if (has_branch_stack(event))
> +                return -EOPNOTSUPP;
> +
> +        err = perf_trace_init(event);
> +        if (err)
> +                return err;
> +
> +        event->destroy = tp_perf_event_destroy;
> +
> +        return 0;
> +}
> +
> +static struct pmu perf_tracepoint = {
> +        .task_ctx_nr    = perf_sw_context,
> +
> +        .event_init     = perf_tp_event_init,
> +        .add            = perf_trace_add,
> +        .del            = perf_trace_del,
> +        .start          = perf_swevent_start,
> +        .stop           = perf_swevent_stop,
> +        .read           = perf_swevent_read,
> +};
> +
> static int perf_tp_filter_match(struct perf_event *event,
>                                 struct perf_sample_data *data)
> {
> @@ -9856,6 +9894,49 @@ void perf_trace_run_bpf_submit(void *raw_data, int size, int rctx,
> }
> EXPORT_SYMBOL_GPL(perf_trace_run_bpf_submit);
>
> +
> +static void __perf_tp_event_target_task(u64 count, void *record,
> +                                        struct pt_regs *regs,
> +                                        struct perf_sample_data *data,
> +                                        struct perf_event *event)
> +{
> +        struct trace_entry *entry = record;
> +
> +        if (event->attr.config != entry->type)
> +                return;
> +        /* Cannot deliver synchronous signal to other task. */
> +        if (event->attr.sigtrap)
> +                return;
> +        if (perf_tp_event_match(event, data, regs))
> +                perf_swevent_event(event, count, data, regs);
> +}
> +
> +static void perf_tp_event_target_task(u64 count, void *record,
> +                                      struct pt_regs *regs,
> +                                      struct perf_sample_data *data,
> +                                      struct perf_event_context *ctx)
> +{
> +        struct perf_event *event, *sibling;
> +
> +        event = perf_event_groups_first(&ctx->pinned_groups, smp_processor_id(),
> +                                        &perf_tracepoint, NULL);
> +        for (; event; event = perf_event_groups_next(event, &perf_tracepoint)) {
> +                __perf_tp_event_target_task(count, record, regs, data, event);
> +                for_each_sibling_event(sibling, event) {
> +                        __perf_tp_event_target_task(count, record, regs, data, sibling);
> +                }
> +        }
> +
> +        event = perf_event_groups_first(&ctx->flexible_groups, smp_processor_id(),
> +                                        &perf_tracepoint, NULL);
> +        for (; event; event = perf_event_groups_next(event, &perf_tracepoint)) {
> +                __perf_tp_event_target_task(count, record, regs, data, event);
> +                for_each_sibling_event(sibling, event) {
> +                        __perf_tp_event_target_task(count, record, regs, data, sibling);
> +                }
> +        }
> +}
> +
> void perf_tp_event(u16 event_type, u64 count, void *record, int entry_size,
>                    struct pt_regs *regs, struct hlist_head *head, int rctx,
>                    struct task_struct *task)
> @@ -9886,29 +9967,15 @@ void perf_tp_event(u16 event_type, u64 count, void *record, int entry_size,
>          */
>         if (task && task != current) {
>                 struct perf_event_context *ctx;
> -               struct trace_entry *entry = record;
>
>                 rcu_read_lock();
>                 ctx = rcu_dereference(task->perf_event_ctxp);
>                 if (!ctx)
>                         goto unlock;
>
> -               // XXX iterate groups instead, we should be able to
> -               // find the subtree for the perf_tracepoint pmu and CPU.
> -
> -               list_for_each_entry_rcu(event, &ctx->event_list, event_entry) {
> -                       if (event->cpu != smp_processor_id())
> -                               continue;
> -                       if (event->attr.type != PERF_TYPE_TRACEPOINT)
> -                               continue;
> -                       if (event->attr.config != entry->type)
> -                               continue;
> -                       /* Cannot deliver synchronous signal to other task. */
> -                       if (event->attr.sigtrap)
> -                               continue;
> -                       if (perf_tp_event_match(event, &data, regs))
> -                               perf_swevent_event(event, count, &data, regs);
> -               }
> +               raw_spin_lock(&ctx->lock);
> +               perf_tp_event_target_task(count, record, regs, &data, ctx);
> +               raw_spin_unlock(&ctx->lock);
> unlock:
>                 rcu_read_unlock();
>         }
> @@ -9917,44 +9984,6 @@ void perf_tp_event(u16 event_type, u64 count, void *record, int entry_size,
> }
> EXPORT_SYMBOL_GPL(perf_tp_event);
>
> -static void tp_perf_event_destroy(struct perf_event *event)
> -{
> -        perf_trace_destroy(event);
> -}
> -
> -static int perf_tp_event_init(struct perf_event *event)
> -{
> -        int err;
> -
> -        if (event->attr.type != PERF_TYPE_TRACEPOINT)
> -                return -ENOENT;
> -
> -        /*
> -         * no branch sampling for tracepoint events
> -         */
> -        if (has_branch_stack(event))
> -                return -EOPNOTSUPP;
> -
> -        err = perf_trace_init(event);
> -        if (err)
> -                return err;
> -
> -        event->destroy = tp_perf_event_destroy;
> -
> -        return 0;
> -}
> -
> -static struct pmu perf_tracepoint = {
> -        .task_ctx_nr    = perf_sw_context,
> -
> -        .event_init     = perf_tp_event_init,
> -        .add            = perf_trace_add,
> -        .del            = perf_trace_del,
> -        .start          = perf_swevent_start,
> -        .stop           = perf_swevent_stop,
> -        .read           = perf_swevent_read,
> -};
> -
> #if defined(CONFIG_KPROBE_EVENTS) || defined(CONFIG_UPROBE_EVENTS)
> /*
>  * Flags in config, used by dynamic PMU kprobe and uprobe
>
> ---
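
FWIW, the path changed above is delivery into another task's context:
the tracepoint fires in current, but the event was opened on a target
pid. A minimal userspace sketch for exercising it (hypothetical helper,
error handling elided; tp_id comes from
/sys/kernel/tracing/events/.../id, and since the walk above keys on
smp_processor_id() the sketch binds the event to an explicit CPU):

#define _GNU_SOURCE
#include <linux/perf_event.h>
#include <sys/syscall.h>
#include <string.h>
#include <unistd.h>

/* Open a tracepoint event on a target pid, bound to one CPU. */
static int open_tp_event(pid_t pid, int cpu, __u64 tp_id)
{
        struct perf_event_attr attr;

        memset(&attr, 0, sizeof(attr));
        attr.type = PERF_TYPE_TRACEPOINT;  /* lands in perf_tp_event_init() */
        attr.size = sizeof(attr);
        attr.config = tp_id;
        attr.sample_period = 1;

        return syscall(__NR_perf_event_open, &attr, pid, cpu, -1, 0);
}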