[PATCH] eprobes: Remove redundant event type information

From: Steven Rostedt (Google)
Date: Fri Feb 18 2022 - 18:52:38 EST


Currently, the event probes save the type of the event they are attached
to when recording the event. For example:

# echo 'e:switch sched/sched_switch prev_state=$prev_state prev_prio=$prev_prio next_pid=$next_pid next_prio=$next_prio' > dynamic_events
# cat events/eprobes/switch/format

name: switch
ID: 1717
format:
field:unsigned short common_type; offset:0; size:2; signed:0;
field:unsigned char common_flags; offset:2; size:1; signed:0;
field:unsigned char common_preempt_count; offset:3; size:1; signed:0;
field:int common_pid; offset:4; size:4; signed:1;

field:unsigned int __probe_type; offset:8; size:4; signed:0;
field:u64 prev_state; offset:12; size:8; signed:0;
field:u64 prev_prio; offset:20; size:8; signed:0;
field:u64 next_pid; offset:28; size:8; signed:0;
field:u64 next_prio; offset:36; size:8; signed:0;

print fmt: "(%u) prev_state=0x%Lx prev_prio=0x%Lx next_pid=0x%Lx next_prio=0x%Lx", REC->__probe_type, REC->prev_state, REC->prev_prio, REC->next_pid, REC->next_prio

The __probe_type adds 4 bytes to every event.

One of the reasons for creating eprobes is to limit what is traced in an
event to be able to limit what is written into the ring buffer. Having
this redundant 4 bytes to every event takes away from this.

The event that is recorded can be retrieved from the event probe itself,
that is available when the trace is happening. For user space tools, it
could simply read the dynamic_event file to find the event they are for.
So there is really no reason to write this information into the ring
buffer for every event.

Signed-off-by: Steven Rostedt (Google) <rostedt@xxxxxxxxxxx>
---
kernel/trace/trace.h | 1 -
kernel/trace/trace_eprobe.c | 15 +++++++--------
kernel/trace/trace_probe.c | 10 +++++-----
kernel/trace/trace_probe.h | 1 -
4 files changed, 12 insertions(+), 15 deletions(-)

diff --git a/kernel/trace/trace.h b/kernel/trace/trace.h
index 0f5e22238cd2..07d990270e2a 100644
--- a/kernel/trace/trace.h
+++ b/kernel/trace/trace.h
@@ -136,7 +136,6 @@ struct kprobe_trace_entry_head {

struct eprobe_trace_entry_head {
struct trace_entry ent;
- unsigned int type;
};

struct kretprobe_trace_entry_head {
diff --git a/kernel/trace/trace_eprobe.c b/kernel/trace/trace_eprobe.c
index 191db32dec46..02838d47129f 100644
--- a/kernel/trace/trace_eprobe.c
+++ b/kernel/trace/trace_eprobe.c
@@ -250,8 +250,6 @@ static int eprobe_event_define_fields(struct trace_event_call *event_call)
if (WARN_ON_ONCE(!tp))
return -ENOENT;

- DEFINE_FIELD(unsigned int, type, FIELD_STRING_TYPE, 0);
-
return traceprobe_define_arg_fields(event_call, sizeof(field), tp);
}

@@ -270,7 +268,9 @@ print_eprobe_event(struct trace_iterator *iter, int flags,
struct trace_event_call *pevent;
struct trace_event *probed_event;
struct trace_seq *s = &iter->seq;
+ struct trace_eprobe *ep;
struct trace_probe *tp;
+ unsigned int type;

field = (struct eprobe_trace_entry_head *)iter->ent;
tp = trace_probe_primary_from_call(
@@ -278,15 +278,18 @@ print_eprobe_event(struct trace_iterator *iter, int flags,
if (WARN_ON_ONCE(!tp))
goto out;

+ ep = container_of(tp, struct trace_eprobe, tp);
+ type = ep->event->event.type;
+
trace_seq_printf(s, "%s: (", trace_probe_name(tp));

- probed_event = ftrace_find_event(field->type);
+ probed_event = ftrace_find_event(type);
if (probed_event) {
pevent = container_of(probed_event, struct trace_event_call, event);
trace_seq_printf(s, "%s.%s", pevent->class->system,
trace_event_name(pevent));
} else {
- trace_seq_printf(s, "%u", field->type);
+ trace_seq_printf(s, "%u", type);
}

trace_seq_putc(s, ')');
@@ -498,10 +501,6 @@ __eprobe_trace_func(struct eprobe_data *edata, void *rec)
return;

entry = fbuffer.entry = ring_buffer_event_data(fbuffer.event);
- if (edata->ep->event)
- entry->type = edata->ep->event->event.type;
- else
- entry->type = 0;
store_trace_args(&entry[1], &edata->ep->tp, rec, sizeof(*entry), dsize);

trace_event_buffer_commit(&fbuffer);
diff --git a/kernel/trace/trace_probe.c b/kernel/trace/trace_probe.c
index 73d90179b51b..80863c6508e5 100644
--- a/kernel/trace/trace_probe.c
+++ b/kernel/trace/trace_probe.c
@@ -871,15 +871,15 @@ static int __set_print_fmt(struct trace_probe *tp, char *buf, int len,
switch (ptype) {
case PROBE_PRINT_NORMAL:
fmt = "(%lx)";
- arg = "REC->" FIELD_STRING_IP;
+ arg = ", REC->" FIELD_STRING_IP;
break;
case PROBE_PRINT_RETURN:
fmt = "(%lx <- %lx)";
- arg = "REC->" FIELD_STRING_FUNC ", REC->" FIELD_STRING_RETIP;
+ arg = ", REC->" FIELD_STRING_FUNC ", REC->" FIELD_STRING_RETIP;
break;
case PROBE_PRINT_EVENT:
- fmt = "(%u)";
- arg = "REC->" FIELD_STRING_TYPE;
+ fmt = "";
+ arg = "";
break;
default:
WARN_ON_ONCE(1);
@@ -903,7 +903,7 @@ static int __set_print_fmt(struct trace_probe *tp, char *buf, int len,
parg->type->fmt);
}

- pos += snprintf(buf + pos, LEN_OR_ZERO, "\", %s", arg);
+ pos += snprintf(buf + pos, LEN_OR_ZERO, "\"%s", arg);

for (i = 0; i < tp->nr_args; i++) {
parg = tp->args + i;
diff --git a/kernel/trace/trace_probe.h b/kernel/trace/trace_probe.h
index 99e7a5df025e..92cc149af0fd 100644
--- a/kernel/trace/trace_probe.h
+++ b/kernel/trace/trace_probe.h
@@ -38,7 +38,6 @@
#define FIELD_STRING_IP "__probe_ip"
#define FIELD_STRING_RETIP "__probe_ret_ip"
#define FIELD_STRING_FUNC "__probe_func"
-#define FIELD_STRING_TYPE "__probe_type"

#undef DEFINE_FIELD
#define DEFINE_FIELD(type, item, name, is_signed) \
--
2.34.1