Re: [PATCH 2/4] perf/ftrace: Fix function trace events

From: Peter Zijlstra
Date: Fri Oct 13 2017 - 03:17:16 EST



Updated version; Steve could you route these 4 patches, they're mostly
kernel/trace/ related.

---
Subject: perf/ftrace: Fix function trace events
From: Peter Zijlstra <peterz@xxxxxxxxxxxxx>
Date: Tue Oct 10 17:15:47 CEST 2017

The function-trace <-> perf interface is a tad messed up. Where all
the other trace <-> perf interfaces use a single trace hook
registration and use per-cpu RCU based hlist to iterate the events,
function-trace actually needs multiple hook registrations in order to
minimize function entry patching when filters are present.

The end result is that we iterate events both on the trace hook and on
the hlist, which results in reporting events multiple times.

Since function-trace cannot use the regular scheme, fix it the other
way around, use singleton hlists.

Signed-off-by: Peter Zijlstra (Intel) <peterz@xxxxxxxxxxxxx>
---
include/linux/trace_events.h | 5 ++
kernel/trace/trace_event_perf.c | 80 ++++++++++++++++++++++++----------------
2 files changed, 54 insertions(+), 31 deletions(-)

--- a/include/linux/trace_events.h
+++ b/include/linux/trace_events.h
@@ -173,6 +173,11 @@ enum trace_reg {
TRACE_REG_PERF_UNREGISTER,
TRACE_REG_PERF_OPEN,
TRACE_REG_PERF_CLOSE,
+ /*
+ * These (ADD/DEL) use a 'boolean' return value, where 1 (true) means a
+ * custom action was taken and the default action is not to be
+ * performed.
+ */
TRACE_REG_PERF_ADD,
TRACE_REG_PERF_DEL,
#endif
--- a/kernel/trace/trace_event_perf.c
+++ b/kernel/trace/trace_event_perf.c
@@ -240,27 +240,41 @@ void perf_trace_destroy(struct perf_even
int perf_trace_add(struct perf_event *p_event, int flags)
{
struct trace_event_call *tp_event = p_event->tp_event;
- struct hlist_head __percpu *pcpu_list;
- struct hlist_head *list;
-
- pcpu_list = tp_event->perf_events;
- if (WARN_ON_ONCE(!pcpu_list))
- return -EINVAL;

if (!(flags & PERF_EF_START))
p_event->hw.state = PERF_HES_STOPPED;

- list = this_cpu_ptr(pcpu_list);
- hlist_add_head_rcu(&p_event->hlist_entry, list);
+ /*
+ * If TRACE_REG_PERF_ADD returns false; no custom action was performed
+ * and we need to take the default action of enqueueing our event on
+ * the right per-cpu hlist.
+ */
+ if (!tp_event->class->reg(tp_event, TRACE_REG_PERF_ADD, p_event)) {
+ struct hlist_head __percpu *pcpu_list;
+ struct hlist_head *list;
+
+ pcpu_list = tp_event->perf_events;
+ if (WARN_ON_ONCE(!pcpu_list))
+ return -EINVAL;
+
+ list = this_cpu_ptr(pcpu_list);
+ hlist_add_head_rcu(&p_event->hlist_entry, list);
+ }

- return tp_event->class->reg(tp_event, TRACE_REG_PERF_ADD, p_event);
+ return 0;
}

void perf_trace_del(struct perf_event *p_event, int flags)
{
struct trace_event_call *tp_event = p_event->tp_event;
- hlist_del_rcu(&p_event->hlist_entry);
- tp_event->class->reg(tp_event, TRACE_REG_PERF_DEL, p_event);
+
+ /*
+ * If TRACE_REG_PERF_DEL returns false; no custom action was performed
+ * and we need to take the default action of dequeueing our event from
+ * the right per-cpu hlist.
+ */
+ if (!tp_event->class->reg(tp_event, TRACE_REG_PERF_DEL, p_event))
+ hlist_del_rcu(&p_event->hlist_entry);
}

void *perf_trace_buf_alloc(int size, struct pt_regs **regs, int *rctxp)
@@ -307,14 +321,24 @@ perf_ftrace_function_call(unsigned long
struct ftrace_ops *ops, struct pt_regs *pt_regs)
{
struct ftrace_entry *entry;
- struct hlist_head *head;
+ struct perf_event *event;
+ struct hlist_head head;
struct pt_regs regs;
int rctx;

- head = this_cpu_ptr(event_function.perf_events);
- if (hlist_empty(head))
+ if ((unsigned long)ops->private != smp_processor_id())
return;

+ event = container_of(ops, struct perf_event, ftrace_ops);
+
+ /*
+ * @event->hlist entry is NULL (per INIT_HLIST_NODE), and all
+ * the perf code does is hlist_for_each_entry_rcu(), so we can
+ * get away with simply setting the @head.first pointer in order
+ * to create a singular list.
+ */
+ head.first = &event->hlist_entry;
+
#define ENTRY_SIZE (ALIGN(sizeof(struct ftrace_entry) + sizeof(u32), \
sizeof(u64)) - sizeof(u32))

@@ -330,7 +354,7 @@ perf_ftrace_function_call(unsigned long
entry->ip = ip;
entry->parent_ip = parent_ip;
perf_trace_buf_submit(entry, ENTRY_SIZE, rctx, TRACE_FN,
- 1, &regs, head, NULL);
+ 1, &regs, &head, NULL);

#undef ENTRY_SIZE
}
@@ -339,8 +363,10 @@ static int perf_ftrace_function_register
{
struct ftrace_ops *ops = &event->ftrace_ops;

- ops->flags |= FTRACE_OPS_FL_PER_CPU | FTRACE_OPS_FL_RCU;
- ops->func = perf_ftrace_function_call;
+ ops->flags |= FTRACE_OPS_FL_RCU;
+ ops->func = perf_ftrace_function_call;
+ ops->private = (void *)(unsigned long)nr_cpu_ids;
+
return register_ftrace_function(ops);
}

@@ -352,19 +378,11 @@ static int perf_ftrace_function_unregist
return ret;
}

-static void perf_ftrace_function_enable(struct perf_event *event)
-{
- ftrace_function_local_enable(&event->ftrace_ops);
-}
-
-static void perf_ftrace_function_disable(struct perf_event *event)
-{
- ftrace_function_local_disable(&event->ftrace_ops);
-}
-
int perf_ftrace_event_register(struct trace_event_call *call,
enum trace_reg type, void *data)
{
+ struct perf_event *event = data;
+
switch (type) {
case TRACE_REG_REGISTER:
case TRACE_REG_UNREGISTER:
@@ -377,11 +395,11 @@ int perf_ftrace_event_register(struct tr
case TRACE_REG_PERF_CLOSE:
return perf_ftrace_function_unregister(data);
case TRACE_REG_PERF_ADD:
- perf_ftrace_function_enable(data);
- return 0;
+ event->ftrace_ops.private = (void *)(unsigned long)smp_processor_id();
+ return 1;
case TRACE_REG_PERF_DEL:
- perf_ftrace_function_disable(data);
- return 0;
+ event->ftrace_ops.private = (void *)(unsigned long)nr_cpu_ids;
+ return 1;
}

return -EINVAL;