Re: [PATCH v9.1 02/11] tracing/probes: Add fprobe events for tracing function entry and exit.

From: Masami Hiramatsu (Google) <mhiramat@xxxxxxxxxx>
Date: Tue May 09 2023 - 11:08:58 EST


On Fri, 5 May 2023 12:12:12 -0400
Steven Rostedt <rostedt@xxxxxxxxxxx> wrote:

> On Tue, 2 May 2023 11:17:36 +0900
> "Masami Hiramatsu (Google)" <mhiramat@xxxxxxxxxx> wrote:
>
> > diff --git a/kernel/trace/trace_fprobe.c b/kernel/trace/trace_fprobe.c
> > new file mode 100644
> > index 000000000000..0049d9ef2402
> > --- /dev/null
> > +++ b/kernel/trace/trace_fprobe.c
> > @@ -0,0 +1,1053 @@
> > +// SPDX-License-Identifier: GPL-2.0
> > +/*
> > + * Fprobe-based tracing events
> > + * Copyright (C) 2022 Google LLC.
> > + */
> > +#define pr_fmt(fmt) "trace_fprobe: " fmt
> > +
> > +#include <linux/fprobe.h>
> > +#include <linux/module.h>
> > +#include <linux/rculist.h>
> > +#include <linux/security.h>
> > +#include <linux/uaccess.h>
> > +
> > +#include "trace_dynevent.h"
> > +#include "trace_probe.h"
> > +#include "trace_probe_kernel.h"
> > +#include "trace_probe_tmpl.h"
> > +
> > +#define FPROBE_EVENT_SYSTEM "fprobes"
> > +#define RETHOOK_MAXACTIVE_MAX 4096
> > +
> > +static int trace_fprobe_create(const char *raw_command);
> > +static int trace_fprobe_show(struct seq_file *m, struct dyn_event *ev);
> > +static int trace_fprobe_release(struct dyn_event *ev);
> > +static bool trace_fprobe_is_busy(struct dyn_event *ev);
> > +static bool trace_fprobe_match(const char *system, const char *event,
> > +			       int argc, const char **argv, struct dyn_event *ev);
> > +
> > +static struct dyn_event_operations trace_fprobe_ops = {
> > +	.create = trace_fprobe_create,
> > +	.show = trace_fprobe_show,
> > +	.is_busy = trace_fprobe_is_busy,
> > +	.free = trace_fprobe_release,
> > +	.match = trace_fprobe_match,
> > +};
> > +
> > +/*
> > + * Fprobe event core functions
> > + */
> > +struct trace_fprobe {
> > +	struct dyn_event	devent;
> > +	struct fprobe		fp;
> > +	const char		*symbol;
> > +	struct trace_probe	tp;
> > +};
> > +
> > +static bool is_trace_fprobe(struct dyn_event *ev)
> > +{
> > +	return ev->ops == &trace_fprobe_ops;
> > +}
> > +
> > +static struct trace_fprobe *to_trace_fprobe(struct dyn_event *ev)
> > +{
> > +	return container_of(ev, struct trace_fprobe, devent);
> > +}
> > +
> > +/**
> > + * for_each_trace_fprobe - iterate over the trace_fprobe list
> > + * @pos: the struct trace_fprobe * for each entry
> > + * @dpos: the struct dyn_event * to use as a loop cursor
> > + */
> > +#define for_each_trace_fprobe(pos, dpos) \
> > +	for_each_dyn_event(dpos)	\
> > +		if (is_trace_fprobe(dpos) && (pos = to_trace_fprobe(dpos)))
> > +
> > +static bool trace_fprobe_is_return(struct trace_fprobe *tf)
> > +{
> > +	return tf->fp.exit_handler != NULL;
> > +}
> > +
> > +static const char *trace_fprobe_symbol(struct trace_fprobe *tf)
> > +{
> > +	return tf->symbol ? tf->symbol : "unknown";
> > +}
> > +
> > +static bool trace_fprobe_is_busy(struct dyn_event *ev)
> > +{
> > +	struct trace_fprobe *tf = to_trace_fprobe(ev);
> > +
> > +	return trace_probe_is_enabled(&tf->tp);
> > +}
> > +
> > +static bool trace_fprobe_match_command_head(struct trace_fprobe *tf,
> > +					    int argc, const char **argv)
> > +{
> > +	char buf[MAX_ARGSTR_LEN + 1];
> > +
> > +	if (!argc)
> > +		return true;
> > +
> > +	snprintf(buf, sizeof(buf), "%s", trace_fprobe_symbol(tf));
> > +	if (strcmp(buf, argv[0]))
> > +		return false;
> > +	argc--; argv++;
> > +
> > +	return trace_probe_match_command_args(&tf->tp, argc, argv);
> > +}
> > +
> > +static bool trace_fprobe_match(const char *system, const char *event,
> > +			       int argc, const char **argv, struct dyn_event *ev)
> > +{
> > +	struct trace_fprobe *tf = to_trace_fprobe(ev);
> > +
> > +	return (event[0] == '\0' ||
> > +		strcmp(trace_probe_name(&tf->tp), event) == 0) &&
> > +	       (!system || strcmp(trace_probe_group_name(&tf->tp), system) == 0) &&
> > +	       trace_fprobe_match_command_head(tf, argc, argv);
>
> The above is really hard to read, and Linus hates these kinds of
> statements. Please break it up (the compiler should do the right thing).
>
> Reversing the tests to return false:
>
> 	if (event[0] != '\0' &&
> 	    strcmp(trace_probe_name(&tf->tp), event) != 0)
> 		return false;
>
> 	if (system && strcmp(trace_probe_group_name(&tf->tp), system) != 0)
> 		return false;
>
> 	return trace_fprobe_match_command_head(tf, argc, argv);
>

OK, I'll clean it up.
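
Something like this (untested, just to confirm the direction):

static bool trace_fprobe_match(const char *system, const char *event,
			       int argc, const char **argv, struct dyn_event *ev)
{
	struct trace_fprobe *tf = to_trace_fprobe(ev);

	if (event[0] != '\0' && strcmp(trace_probe_name(&tf->tp), event))
		return false;

	if (system && strcmp(trace_probe_group_name(&tf->tp), system))
		return false;

	return trace_fprobe_match_command_head(tf, argc, argv);
}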

>
> > +}
> > +
> > +static bool trace_fprobe_is_registered(struct trace_fprobe *tf)
> > +{
> > +	return fprobe_is_registered(&tf->fp);
> > +}
> > +
> > +/* Note that we don't verify it, since the code does not come from user space */
>
> Verify what?
>
> Hmm, I see this is a copy of the comment from both trace_kprobe.c and
> trace_uprobe.c. I think this requires a bit more explanation (and also in
> those locations as well).

Yeah, sorry about this comment. I meant that this fetch process (which runs
in the probe handlers) does not verify the fetch_insn code, because that code
is generated by the parser rather than coming from user space. How about:

/*
 * Note that we don't verify the fetch_insn code, since it does not come
 * from user space.
 */

>
> > +static int
> > +process_fetch_insn(struct fetch_insn *code, void *rec, void *dest,
> > +		   void *base)
> > +{
> > +	struct pt_regs *regs = rec;
> > +	unsigned long val;
> > +
> > +retry:
> > +	/* 1st stage: get value from context */
> > +	switch (code->op) {
> > +	case FETCH_OP_REG:
> > +		val = regs_get_register(regs, code->param);
> > +		break;
> > +	case FETCH_OP_STACK:
> > +		val = regs_get_kernel_stack_nth(regs, code->param);
> > +		break;
> > +	case FETCH_OP_STACKP:
> > +		val = kernel_stack_pointer(regs);
> > +		break;
> > +	case FETCH_OP_RETVAL:
> > +		val = regs_return_value(regs);
> > +		break;
>
>
> > +	case FETCH_OP_IMM:
> > +		val = code->immediate;
> > +		break;
> > +	case FETCH_OP_COMM:
> > +		val = (unsigned long)current->comm;
> > +		break;
> > +	case FETCH_OP_DATA:
> > +		val = (unsigned long)code->data;
> > +		break;
>
> These are new and not part of trace_kprobe.c. Should we have a version that
> the two could share?

Oh, right. There was a series that did exactly that, but I missed picking it up:

https://lore.kernel.org/lkml/20230103223202.7963bfb62b4ae5827e51ee30@xxxxxxxxxx/T/

Let me pick that up.
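
For reference, roughly what I have in mind is a small shared helper, as you
suggest (sketch only; the name and placement follow that series, so treat
them as tentative). It handles the fetch ops that don't depend on pt_regs,
so both trace_kprobe.c and trace_fprobe.c can call it from their
process_fetch_insn():

/* Sketch: shared handling of the context-independent fetch ops. */
static nokprobe_inline int
process_common_fetch_insn(struct fetch_insn *code, unsigned long *val)
{
	switch (code->op) {
	case FETCH_OP_IMM:
		*val = code->immediate;
		break;
	case FETCH_OP_COMM:
		*val = (unsigned long)current->comm;
		break;
	case FETCH_OP_DATA:
		*val = (unsigned long)code->data;
		break;
	default:
		return -EILSEQ;
	}
	return 0;
}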

>
> Probably a helper function that can be called by these two.
>
> > +#ifdef CONFIG_HAVE_FUNCTION_ARG_ACCESS_API
> > +	case FETCH_OP_ARG:
> > +		val = regs_get_kernel_argument(regs, code->param);
> > +		break;
> > +#endif
> > +	case FETCH_NOP_SYMBOL:	/* Ignore a place holder */
> > +		code++;
> > +		goto retry;
> > +	default:
> > +		return -EILSEQ;
> > +	}
> > +	code++;
> > +
> > +	return process_fetch_insn_bottom(code, val, dest, base);
> > +}
> > +NOKPROBE_SYMBOL(process_fetch_insn)
> > +
> > +/* function entry handler */
> > +static nokprobe_inline void
> > +__fentry_trace_func(struct trace_fprobe *tf, unsigned long entry_ip,
> > +		    struct pt_regs *regs,
> > +		    struct trace_event_file *trace_file)
> > +{
> > +	struct fentry_trace_entry_head *entry;
> > +	struct trace_event_call *call = trace_probe_event_call(&tf->tp);
> > +	struct trace_event_buffer fbuffer;
> > +	int dsize;
> > +
> > +	WARN_ON(call != trace_file->event_call);
>
> WARN_ON_ONCE()?
>
> And if you are doing the check, perhaps even:
>
> 	if (WARN_ON_ONCE(call != trace_file->event_call))
> 		return;

OK.
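
So the top of __fentry_trace_func() (and the same check in the exit handler)
would become something like:

	if (WARN_ON_ONCE(call != trace_file->event_call))
		return;

	if (trace_trigger_soft_disabled(trace_file))
		return;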

Thank you!

>
> -- Steve
>
> > +
> > +	if (trace_trigger_soft_disabled(trace_file))
> > +		return;
> > +
> > +	dsize = __get_data_size(&tf->tp, regs);
> > +
> > +	entry = trace_event_buffer_reserve(&fbuffer, trace_file,
> > +					   sizeof(*entry) + tf->tp.size + dsize);
> > +	if (!entry)
> > +		return;
> > +
> > +	fbuffer.regs = regs;
> > +	entry = fbuffer.entry = ring_buffer_event_data(fbuffer.event);
> > +	entry->ip = entry_ip;
> > +	store_trace_args(&entry[1], &tf->tp, regs, sizeof(*entry), dsize);
> > +
> > +	trace_event_buffer_commit(&fbuffer);
> > +}
> > +


--
Masami Hiramatsu (Google) <mhiramat@xxxxxxxxxx>