Re: [PATCH 2/3] perf/ftrace: Fix function trace events

From: Steven Rostedt
Date: Tue Oct 10 2017 - 15:44:06 EST


On Tue, 10 Oct 2017 18:31:26 +0200
Peter Zijlstra <peterz@xxxxxxxxxxxxx> wrote:


> Signed-off-by: Peter Zijlstra (Intel) <peterz@xxxxxxxxxxxxx>
> ---
> kernel/trace/trace_event_perf.c | 68 +++++++++++++++++++++-------------------
> 1 file changed, 36 insertions(+), 32 deletions(-)
>
> --- a/kernel/trace/trace_event_perf.c
> +++ b/kernel/trace/trace_event_perf.c
> @@ -240,27 +240,31 @@ void perf_trace_destroy(struct perf_even
> int perf_trace_add(struct perf_event *p_event, int flags)
> {
> struct trace_event_call *tp_event = p_event->tp_event;
> - struct hlist_head __percpu *pcpu_list;
> - struct hlist_head *list;
>
> - pcpu_list = tp_event->perf_events;
> - if (WARN_ON_ONCE(!pcpu_list))
> - return -EINVAL;

You're going to add a comment here on v2?

Also, this is highly subtle: ftrace's reg() returns non-zero, and all
others return zero. It may be good to add a comment in
include/linux/trace_events.h by the TRACE_REG_PERF_ADD and DEL enums,
stating what is expected when they are passed in.

> + if (!tp_event->class->reg(tp_event, TRACE_REG_PERF_ADD, p_event)) {
> + struct hlist_head __percpu *pcpu_list;
> + struct hlist_head *list;
> +
> + pcpu_list = tp_event->perf_events;
> + if (WARN_ON_ONCE(!pcpu_list))
> + return -EINVAL;
>
> - if (!(flags & PERF_EF_START))
> - p_event->hw.state = PERF_HES_STOPPED;
> + if (!(flags & PERF_EF_START))
> + p_event->hw.state = PERF_HES_STOPPED;
>
> - list = this_cpu_ptr(pcpu_list);
> - hlist_add_head_rcu(&p_event->hlist_entry, list);
> + list = this_cpu_ptr(pcpu_list);
> + hlist_add_head_rcu(&p_event->hlist_entry, list);
> + }
>
> - return tp_event->class->reg(tp_event, TRACE_REG_PERF_ADD, p_event);
> + return 0;
> }
>
> void perf_trace_del(struct perf_event *p_event, int flags)
> {
> struct trace_event_call *tp_event = p_event->tp_event;
> - hlist_del_rcu(&p_event->hlist_entry);
> - tp_event->class->reg(tp_event, TRACE_REG_PERF_DEL, p_event);
> +
> + if (!tp_event->class->reg(tp_event, TRACE_REG_PERF_DEL, p_event))
> + hlist_del_rcu(&p_event->hlist_entry);
> }
>
> void *perf_trace_buf_alloc(int size, struct pt_regs **regs, int *rctxp)
> @@ -306,15 +310,19 @@ static void
> perf_ftrace_function_call(unsigned long ip, unsigned long parent_ip,
> struct ftrace_ops *ops, struct pt_regs *pt_regs)
> {
> + struct hlist_head head = HLIST_HEAD_INIT;
> struct ftrace_entry *entry;
> - struct hlist_head *head;
> + struct perf_event *event;
> struct pt_regs regs;
> int rctx;
>
> - head = this_cpu_ptr(event_function.perf_events);
> - if (hlist_empty(head))
> + event = container_of(ops, struct perf_event, ftrace_ops);
> +
> + if ((unsigned long)event->ftrace_ops.private != smp_processor_id())

Could you also just do:

if ((unsigned long)ops->private != smp_processor_id())

?

And then you could get the event via container_of() after the test.

> return;
>
> + hlist_add_head(&event->hlist_entry, &head);
> +
> #define ENTRY_SIZE (ALIGN(sizeof(struct ftrace_entry) + sizeof(u32), \
> sizeof(u64)) - sizeof(u32))
>
> @@ -330,17 +338,21 @@ perf_ftrace_function_call(unsigned long
> entry->ip = ip;
> entry->parent_ip = parent_ip;
> perf_trace_buf_submit(entry, ENTRY_SIZE, rctx, TRACE_FN,
> - 1, &regs, head, NULL);
> + 1, &regs, &head, NULL);
>
> #undef ENTRY_SIZE
> +
> + hlist_del_init(&event->hlist_entry);

Hmm, is there a better way to do this? It adds and removes the entry at
each function trace.

> }
>
> static int perf_ftrace_function_register(struct perf_event *event)
> {
> struct ftrace_ops *ops = &event->ftrace_ops;
>
> - ops->flags |= FTRACE_OPS_FL_PER_CPU | FTRACE_OPS_FL_RCU;
> - ops->func = perf_ftrace_function_call;
> + ops->flags |= FTRACE_OPS_FL_RCU;

I know this doesn't change the patch, but I don't believe that "|=" is
necessary. "=" should work too. But you can make that a separate
patch, in case it does break something, and we have a nice bisect to
see this was the cause.

> + ops->func = perf_ftrace_function_call;
> + ops->private = (void *)(unsigned long)nr_cpu_ids;
> +
> return register_ftrace_function(ops);
> }
>
> @@ -352,19 +364,11 @@ static int perf_ftrace_function_unregist
> return ret;
> }
>
> -static void perf_ftrace_function_enable(struct perf_event *event)
> -{
> - ftrace_function_local_enable(&event->ftrace_ops);
> -}
> -
> -static void perf_ftrace_function_disable(struct perf_event *event)
> -{
> - ftrace_function_local_disable(&event->ftrace_ops);
> -}
> -
> int perf_ftrace_event_register(struct trace_event_call *call,
> enum trace_reg type, void *data)
> {
> + struct perf_event *event = data;
> +
> switch (type) {
> case TRACE_REG_REGISTER:
> case TRACE_REG_UNREGISTER:
> @@ -377,11 +381,11 @@ int perf_ftrace_event_register(struct tr
> case TRACE_REG_PERF_CLOSE:
> return perf_ftrace_function_unregister(data);
> case TRACE_REG_PERF_ADD:
> - perf_ftrace_function_enable(data);
> - return 0;
> + event->ftrace_ops.private = (void *)(unsigned long)smp_processor_id();

I'm curious, is there a per CPU event, or is this called during
sched_switch or something?

-- Steve

> + return 1;
> case TRACE_REG_PERF_DEL:
> - perf_ftrace_function_disable(data);
> - return 0;
> + event->ftrace_ops.private = (void *)(unsigned long)nr_cpu_ids;
> + return 1;
> }
>
> return -EINVAL;
>