Re: [PATCH] perf/ftrace: Fix repetitious traces when specifying a target task

From: Peter Zijlstra
Date: Tue Oct 10 2017 - 11:22:02 EST


On Tue, Oct 10, 2017 at 03:04:48PM +0200, Peter Zijlstra wrote:
> On Tue, Oct 10, 2017 at 01:33:21PM +0200, Peter Zijlstra wrote:
> > But now you've got me looking at 75e8387685f6, which also looks
> > completely insane.
>
> The reason I instantly stumbled over that patch is that it only
> addresses the ftrace situation and doesn't mention the other _5_ places
> that use this interface. It doesn't explain whether those have the same
> problem and, if not, why their solution doesn't work for ftrace.
>
> So they all (well, syscall and regular tracepoints; I didn't check the
> others) avoid that problem by simply not registering multiple times at
> the tracepoint. Tracepoints use tp_event->perf_refcount and the syscall
> bits use sys_perf_refcount_{enter,exit} for that (a sketch follows below
> the quote).
>
> Doing the same for function trace looks a little something like the
> below (after reverting 75e8387685f6)
>
> Except the below doesn't compile because of
> kernel/trace/trace_events_filter.c, which is where I lost the plot.
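
For reference, the refcount pattern those sites use is nothing more than
guarding the actual (un)registration with a counter. A minimal sketch, not
the exact kernel code (the sketch_* helpers are made up; perf_refcount is
the real field in struct trace_event_call):

	/* register the real callback only on the 0 -> 1 transition */
	static int sketch_perf_open(struct trace_event_call *tp_event)
	{
		if (tp_event->perf_refcount++ > 0)
			return 0;	/* already registered */

		return sketch_register_callback(tp_event); /* hypothetical */
	}

	/* unregister only on the 1 -> 0 transition */
	static void sketch_perf_close(struct trace_event_call *tp_event)
	{
		if (--tp_event->perf_refcount > 0)
			return;		/* other events still attached */

		sketch_unregister_callback(tp_event);	/* hypothetical */
	}

That way the callback is attached to the tracepoint exactly once, no
matter how many perf events hang off it.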

OK, so that filter stuff was the entire reason for this trainwreck :/

Using ftrace_ops filters allows ftrace to patch fewer functions etc., so
that requires an ftrace_ops per event. Still, that then instantly suggests
we fix the whole hlist situation instead of making it worse.
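
To make that concrete, a per-event ftrace_ops with its own filter looks
roughly like this (a sketch only; my_ops/my_func_probe/my_event_init are
made-up names and error checking is omitted):

	static void my_func_probe(unsigned long ip, unsigned long parent_ip,
				  struct ftrace_ops *ops, struct pt_regs *regs)
	{
		/* only ever called for functions matching the filter */
	}

	static struct ftrace_ops my_ops = {
		.func	= my_func_probe,
		.flags	= FTRACE_OPS_FL_RCU,
	};

	static int my_event_init(void)
	{
		/* patch only do_sys_open(), not every traceable function */
		ftrace_set_filter(&my_ops, "do_sys_open",
				  strlen("do_sys_open"), 1);
		return register_ftrace_function(&my_ops);
	}

whereas a single shared ftrace_ops would have to carry the union of all
the events' filters.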

See below; I now have 3 patches: a revert, the below, and one to kill
FTRACE_OPS_FL_PER_CPU.

How's this?

---
 kernel/trace/trace_event_perf.c | 68 +++++++++++++++++++++-------------------
 1 file changed, 36 insertions(+), 32 deletions(-)

--- a/kernel/trace/trace_event_perf.c
+++ b/kernel/trace/trace_event_perf.c
@@ -240,27 +240,31 @@ void perf_trace_destroy(struct perf_even
 int perf_trace_add(struct perf_event *p_event, int flags)
 {
 	struct trace_event_call *tp_event = p_event->tp_event;
-	struct hlist_head __percpu *pcpu_list;
-	struct hlist_head *list;
 
-	pcpu_list = tp_event->perf_events;
-	if (WARN_ON_ONCE(!pcpu_list))
-		return -EINVAL;
+	if (!tp_event->class->reg(tp_event, TRACE_REG_PERF_ADD, p_event)) {
+		struct hlist_head __percpu *pcpu_list;
+		struct hlist_head *list;
+
+		pcpu_list = tp_event->perf_events;
+		if (WARN_ON_ONCE(!pcpu_list))
+			return -EINVAL;
 
-	if (!(flags & PERF_EF_START))
-		p_event->hw.state = PERF_HES_STOPPED;
+		if (!(flags & PERF_EF_START))
+			p_event->hw.state = PERF_HES_STOPPED;
 
-	list = this_cpu_ptr(pcpu_list);
-	hlist_add_head_rcu(&p_event->hlist_entry, list);
+		list = this_cpu_ptr(pcpu_list);
+		hlist_add_head_rcu(&p_event->hlist_entry, list);
+	}
 
-	return tp_event->class->reg(tp_event, TRACE_REG_PERF_ADD, p_event);
+	return 0;
 }
 
 void perf_trace_del(struct perf_event *p_event, int flags)
 {
 	struct trace_event_call *tp_event = p_event->tp_event;
-	hlist_del_rcu(&p_event->hlist_entry);
-	tp_event->class->reg(tp_event, TRACE_REG_PERF_DEL, p_event);
+
+	if (!tp_event->class->reg(tp_event, TRACE_REG_PERF_DEL, p_event))
+		hlist_del_rcu(&p_event->hlist_entry);
 }
 
 void *perf_trace_buf_alloc(int size, struct pt_regs **regs, int *rctxp)
@@ -306,15 +310,19 @@ static void
 perf_ftrace_function_call(unsigned long ip, unsigned long parent_ip,
			  struct ftrace_ops *ops, struct pt_regs *pt_regs)
 {
+	struct hlist_head head = HLIST_HEAD_INIT;
 	struct ftrace_entry *entry;
-	struct hlist_head *head;
+	struct perf_event *event;
 	struct pt_regs regs;
 	int rctx;
 
-	head = this_cpu_ptr(event_function.perf_events);
-	if (hlist_empty(head))
+	event = container_of(ops, struct perf_event, ftrace_ops);
+
+	if (!event->ftrace_ops.private)
 		return;
 
+	hlist_add_head(&event->hlist_entry, &head);
+
 #define ENTRY_SIZE (ALIGN(sizeof(struct ftrace_entry) + sizeof(u32), \
		    sizeof(u64)) - sizeof(u32))

@@ -330,17 +338,21 @@ perf_ftrace_function_call(unsigned long
 	entry->ip = ip;
 	entry->parent_ip = parent_ip;
 	perf_trace_buf_submit(entry, ENTRY_SIZE, rctx, TRACE_FN,
-			      1, &regs, head, NULL);
+			      1, &regs, &head, NULL);
 
 #undef ENTRY_SIZE
+
+	hlist_del_init(&event->hlist_entry);
 }
 
 static int perf_ftrace_function_register(struct perf_event *event)
 {
 	struct ftrace_ops *ops = &event->ftrace_ops;
 
-	ops->flags |= FTRACE_OPS_FL_PER_CPU | FTRACE_OPS_FL_RCU;
-	ops->func = perf_ftrace_function_call;
+	ops->flags = FTRACE_OPS_FL_RCU;
+	ops->func = perf_ftrace_function_call;
+	ops->private = NULL;
+
 	return register_ftrace_function(ops);
 }

@@ -352,19 +364,11 @@ static int perf_ftrace_function_unregist
 	return ret;
 }
 
-static void perf_ftrace_function_enable(struct perf_event *event)
-{
-	ftrace_function_local_enable(&event->ftrace_ops);
-}
-
-static void perf_ftrace_function_disable(struct perf_event *event)
-{
-	ftrace_function_local_disable(&event->ftrace_ops);
-}
-
 int perf_ftrace_event_register(struct trace_event_call *call,
			       enum trace_reg type, void *data)
 {
+	struct perf_event *event = data;
+
 	switch (type) {
 	case TRACE_REG_REGISTER:
 	case TRACE_REG_UNREGISTER:
@@ -377,11 +381,11 @@ int perf_ftrace_event_register(struct tr
 	case TRACE_REG_PERF_CLOSE:
 		return perf_ftrace_function_unregister(data);
 	case TRACE_REG_PERF_ADD:
-		perf_ftrace_function_enable(data);
-		return 0;
+		event->ftrace_ops.private = (void *)1UL;
+		return 1;
 	case TRACE_REG_PERF_DEL:
-		perf_ftrace_function_disable(data);
-		return 0;
+		event->ftrace_ops.private = (void *)0UL;
+		return 1;
 	}
 
 	return -EINVAL;