Re: [PATCH v15 1/4] trace: Add trace any kernel object

From: Jeff Xie
Date: Tue Oct 18 2022 - 04:36:03 EST


Hi steve,

Thank you very much for your patient reply, very happy and very welcome to
rewrite the patch series with fetch instructions.

On Tue, Oct 18, 2022 at 3:58 AM Steven Rostedt <rostedt@xxxxxxxxxxx> wrote:>
> On Fri, 19 Aug 2022 11:27:03 +0800
> Jeff Xie <xiehuan09@xxxxxxxxx> wrote:
>
>
> > index 27febd4ee33e..2c2477dea0f2 100644
> > --- a/kernel/trace/trace.c
> > +++ b/kernel/trace/trace.c
> > @@ -5621,6 +5621,9 @@ static const char readme_msg[] =
> > "\t enable_hist:<system>:<event>\n"
> > "\t disable_hist:<system>:<event>\n"
> > #endif
> > +#ifdef CONFIG_TRACE_OBJECT
> > + "\t objtrace:add:obj[:count][if <filter>]\n"
> > +#endif
> > #ifdef CONFIG_STACKTRACE
> > "\t\t stacktrace\n"
> > #endif
> > @@ -9261,6 +9264,9 @@ static struct trace_array *trace_array_create(const char *name)
> > if (ftrace_allocate_ftrace_ops(tr) < 0)
> > goto out_free_tr;
> >
> > + if (allocate_objtrace_data(tr) < 0)
> > + goto out_free_tr;
>
> When I was rewriting this, I found there's no reason to associate a objdata
> with a tr. It should all be self contained (per event file). The objdata
> can have a tr, but the tr doesn't care about the objdata.

I'm so glad you can rewrite it. okay, you should have removed the
object_exist(),
so the tr doesn't care about the objdata.

>
> > +
> > ftrace_init_trace_array(tr);
> >
> > init_trace_flags_index(tr);
> > @@ -9280,6 +9286,7 @@ static struct trace_array *trace_array_create(const char *name)
> >
> > out_free_tr:
> > ftrace_free_ftrace_ops(tr);
> > + free_objtrace_data(tr);
> > free_trace_buffers(tr);
> > free_cpumask_var(tr->tracing_cpumask);
> > kfree(tr->name);
> > @@ -9373,6 +9380,7 @@ static int __remove_instance(struct trace_array *tr)
> > event_trace_del_tracer(tr);
> > ftrace_clear_pids(tr);
> > ftrace_destroy_function_files(tr);
> > + free_objtrace_data(tr);
> > tracefs_remove(tr->dir);
> > free_percpu(tr->last_func_repeats);
> > free_trace_buffers(tr);
> > @@ -10110,6 +10118,9 @@ __init static int tracer_alloc_buffers(void)
> > goto out_free_savedcmd;
> > }
> >
> > + if (allocate_objtrace_data(&global_trace))
> > + goto out_free_savedcmd;
> > +
> > if (global_trace.buffer_disabled)
> > tracing_off();
> >
> > diff --git a/kernel/trace/trace.h b/kernel/trace/trace.h
> > index 900e75d96c84..8da1ef78a09a 100644
> > --- a/kernel/trace/trace.h
> > +++ b/kernel/trace/trace.h
> > @@ -54,6 +54,7 @@ enum trace_type {
> > TRACE_TIMERLAT,
> > TRACE_RAW_DATA,
> > TRACE_FUNC_REPEATS,
> > + TRACE_OBJECT,
> >
> > __TRACE_LAST_TYPE,
> > };
> > @@ -363,6 +364,9 @@ struct trace_array {
> > cpumask_var_t tracing_cpumask; /* only trace on set CPUs */
> > int ref;
> > int trace_ref;
> > +#ifdef CONFIG_TRACE_OBJECT
> > + struct objtrace_data *obj_data;
> > +#endif
>
> That means the above is not needed.

Yes, thanks.

>
> > #ifdef CONFIG_FUNCTION_TRACER
> > struct ftrace_ops *ops;
> > struct trace_pid_list __rcu *function_pids;
> > @@ -471,6 +475,7 @@ extern void __ftrace_bad_type(void);
> > TRACE_GRAPH_RET); \
> > IF_ASSIGN(var, ent, struct func_repeats_entry, \
> > TRACE_FUNC_REPEATS); \
> > + IF_ASSIGN(var, ent, struct trace_object_entry, TRACE_OBJECT);\
> > __ftrace_bad_type(); \
> > } while (0)
> >
> > @@ -1536,6 +1541,16 @@ static inline int register_trigger_hist_cmd(void) { return 0; }
> > static inline int register_trigger_hist_enable_disable_cmds(void) { return 0; }
> > #endif
> >
> > +#ifdef CONFIG_TRACE_OBJECT
> > +extern int register_trigger_object_cmd(void);
> > +extern int allocate_objtrace_data(struct trace_array *tr);
> > +extern void free_objtrace_data(struct trace_array *tr);
> > +#else
> > +static inline int register_trigger_object_cmd(void) { return 0; }
> > +static inline int allocate_objtrace_data(struct trace_array *tr) { return 0; }
> > +static inline void free_objtrace_data(struct trace_array *tr) { };
> > +#endif
> > +
> > extern int register_trigger_cmds(void);
> > extern void clear_event_triggers(struct trace_array *tr);
> >
> > @@ -1611,6 +1626,12 @@ extern bool event_trigger_check_remove(const char *glob);
> > extern bool event_trigger_empty_param(const char *param);
> > extern int event_trigger_separate_filter(char *param_and_filter, char **param,
> > char **filter, bool param_required);
> > +extern int register_trigger(char *glob,
> > + struct event_trigger_data *data,
> > + struct trace_event_file *file);
> > +extern void unregister_trigger(char *glob,
> > + struct event_trigger_data *test,
> > + struct trace_event_file *file);
> > extern struct event_trigger_data *
> > event_trigger_alloc(struct event_command *cmd_ops,
> > char *cmd,
> > diff --git a/kernel/trace/trace_entries.h b/kernel/trace/trace_entries.h
> > index cd41e863b51c..bb120d9498a9 100644
> > --- a/kernel/trace/trace_entries.h
> > +++ b/kernel/trace/trace_entries.h
> > @@ -401,3 +401,20 @@ FTRACE_ENTRY(timerlat, timerlat_entry,
> > __entry->context,
> > __entry->timer_latency)
> > );
> > +
> > +/*
> > + * trace object entry:
> > + */
> > +FTRACE_ENTRY(object, trace_object_entry,
> > +
> > + TRACE_OBJECT,
> > +
> > + F_STRUCT(
> > + __field( unsigned long, ip )
> > + __field( unsigned long, parent_ip )
> > + __field( unsigned long, object )
> > + ),
> > +
> > + F_printk(" %ps <-- %ps object:%lx\n",
> > + (void *)__entry->ip, (void *)__entry->parent_ip, __entry->object)
> > +);
> > diff --git a/kernel/trace/trace_events_trigger.c b/kernel/trace/trace_events_trigger.c
> > index cb866c3141af..8bad4eb3b997 100644
> > --- a/kernel/trace/trace_events_trigger.c
> > +++ b/kernel/trace/trace_events_trigger.c
> > @@ -560,7 +560,7 @@ void update_cond_flag(struct trace_event_file *file)
> > *
> > * Return: 0 on success, errno otherwise
> > */
> > -static int register_trigger(char *glob,
> > +int register_trigger(char *glob,
> > struct event_trigger_data *data,
> > struct trace_event_file *file)
> > {
> > @@ -605,7 +605,7 @@ static int register_trigger(char *glob,
> > * Usually used directly as the @unreg method in event command
> > * implementations.
> > */
> > -static void unregister_trigger(char *glob,
> > +void unregister_trigger(char *glob,
> > struct event_trigger_data *test,
> > struct trace_event_file *file)
> > {
> > @@ -1976,6 +1976,7 @@ __init int register_trigger_cmds(void)
> > register_trigger_enable_disable_cmds();
> > register_trigger_hist_enable_disable_cmds();
> > register_trigger_hist_cmd();
> > + register_trigger_object_cmd();
> >
> > return 0;
> > }
> > diff --git a/kernel/trace/trace_object.c b/kernel/trace/trace_object.c
> > new file mode 100644
> > index 000000000000..19ec4b1c0186
> > --- /dev/null
> > +++ b/kernel/trace/trace_object.c
> > @@ -0,0 +1,450 @@
> > +// SPDX-License-Identifier: GPL-2.0
> > +/*
> > + * trace the kernel object in the kernel function parameter
> > + * Copyright (C) 2021 Jeff Xie <xiehuan09@xxxxxxxxx>
> > + */
> > +
> > +#define pr_fmt(fmt) "trace_object: " fmt
> > +
> > +#include "trace_output.h"
> > +
> > +#define MAX_TRACED_OBJECT 5
> > +#define OBJTRACE_CMD_LEN 16
> > +#define OBJTRACE_CMD_ADD "add"
> > +static LIST_HEAD(obj_data_head);
> > +static const int max_args_num = 6;
> > +static void exit_trace_object(struct trace_array *tr);
> > +static int init_trace_object(struct trace_array *tr);
> > +
> > +struct object_instance {
> > + void *obj;
> > + struct trace_array *tr;
>
> The instance will need to hold the object, and last value.

Thanks, I got it.

>
> > +};
> > +
> > +/* objtrace private data */
> > +struct objtrace_trigger_data {
> > + struct ftrace_event_field *field;
> > + char objtrace_cmd[OBJTRACE_CMD_LEN];
> > + struct trace_array *tr;
> > +};
> > +
> > +/* objtrace data with fops and objtrace_instances */
> > +struct objtrace_data {
> > + struct list_head head;
> > + struct trace_array *tr;
> > + struct ftrace_ops fops;
> > + int num_traced_obj;
> > + struct object_instance traced_obj[MAX_TRACED_OBJECT];
> > + raw_spinlock_t obj_data_lock;
> > +};
>
> The objtrace_data and the objtrace_trigger_data could be merged into a
> single instance that get's allocated when the trigger is written. The
> object_instance array size can be the "count" passed in.
>
> It will require initializing the entire instance at the time the trigger is
> written into the trigger file.
>

It does look better;-)

> > +
> > +static struct objtrace_data *get_obj_data(struct trace_array *tr)
> > +{
> > + struct objtrace_data *obj_data = NULL;
> > +
> > + list_for_each_entry(obj_data, &obj_data_head, head) {
> > + if (obj_data->tr == tr)
> > + break;
> > + }
> > + return obj_data;
> > +}
>
> I deleted the above function.

Thanks.

>
> > +
> > +static bool object_exist(void *obj, struct trace_array *tr)
> > +{
> > + int i, max;
> > + struct objtrace_data *obj_data;
> > +
> > + obj_data = get_obj_data(tr);
> > + if (!obj_data)
> > + return false;
> > +
> > + max = READ_ONCE(obj_data->num_traced_obj);
> > + smp_rmb();
> > + for (i = 0; i < max; i++) {
> > + if (obj_data->traced_obj[i].obj == obj)
> > + return true;
> > + }
> > + return false;
> > +}
> > +
> > +static bool object_empty(struct trace_array *tr)
> > +{
> > + struct objtrace_data *obj_data;
> > +
> > + obj_data = get_obj_data(tr);
> > + if (!obj_data)
> > + return false;
> > +
> > + return !READ_ONCE(obj_data->num_traced_obj);
> > +}
> > +
> > +static void set_trace_object(void *obj, struct trace_array *tr)
> > +{
> > + unsigned long flags;
> > + struct object_instance *obj_ins;
> > + struct objtrace_data *obj_data;
>
> BTW, please use "upside-down x-mas tree" formatting for variables. That is:

Thanks for your reminder.

>
> struct object_instance *obj_ins;
> struct objtrace_data *obj_data;
> unsigned long flags;
>
> And I have more to comment on about variable declarations.
>
> > +
> > + if (in_nmi())
> > + return;
> > +
> > + if (!obj && object_exist(obj, tr))
>
> If we are checking values, we do not test for object_exist here. Just !obj.
>
> > + return;
> > +
> > + obj_data = get_obj_data(tr);
> > + if (!obj_data)
> > + return;
> > +
> > + /* only this place has write operations */
> > + raw_spin_lock_irqsave(&obj_data->obj_data_lock, flags);
> > + if (READ_ONCE(obj_data->num_traced_obj) == MAX_TRACED_OBJECT) {
>
> No need for the READ_ONCE() in the spin locked area.

Thanks.

>
> > + trace_array_printk_buf(tr->array_buffer.buffer, _THIS_IP_,
> > + "object_pool is full, can't trace object:0x%px\n", obj);
>
> Also, I'm not sure if we have a counter, that we are going to want to print
> anything when it is full. Maybe for debugging?

I was worried that if there is no counter, there needs to be a refill logic,
and the implementation will become more complicated ;-)


> > + goto out;
> > + }
> > + obj_ins = &obj_data->traced_obj[READ_ONCE(obj_data->num_traced_obj)];
> > + obj_ins->obj = obj;
> > + obj_ins->tr = tr;
> > + /* make sure the num_traced_obj update always appears after traced_obj update */
> > + smp_wmb();
> > + obj_data->num_traced_obj++;
> > +out:
> > + raw_spin_unlock_irqrestore(&obj_data->obj_data_lock, flags);
> > +}
> > +
> > +static void submit_trace_object(unsigned long ip, unsigned long parent_ip,
> > + unsigned long object, struct trace_array *tr)
> > +{
> > +
> > + struct trace_buffer *buffer = tr->array_buffer.buffer;
> > + struct ring_buffer_event *event;
> > + struct trace_object_entry *entry;
> > + unsigned int trace_ctx = 0;
> > +
> > + trace_ctx = tracing_gen_ctx();
> > + event = trace_buffer_lock_reserve(buffer, TRACE_OBJECT,
> > + sizeof(*entry), trace_ctx);
> > + if (!event)
> > + return;
> > + entry = ring_buffer_event_data(event);
> > + entry->ip = ip;
> > + entry->parent_ip = parent_ip;
> > + entry->object = object;
> > +
> > + trace_buffer_unlock_commit(tr, buffer, event, trace_ctx);
> > +}
> > +
> > +static void
> > +trace_object_events_call(unsigned long ip, unsigned long parent_ip,
> > + struct ftrace_ops *op, struct ftrace_regs *fregs)
> > +{
> > + struct pt_regs *pt_regs = ftrace_get_regs(fregs);
> > + struct trace_array *tr = op->private;
>
> We do not need the op->private for the tr. instead:
>
> struct obj_data *obj_data;
>
> > + struct trace_array_cpu *data;
> > + unsigned long obj;
> > + long disabled;
> > + int cpu, n;
>
> obj_data = container_of(op, struct obj_data, fops);
>
> tr = obj_data->tr;

learned, thanks.

>
> > +
> > + preempt_disable_notrace();
> > +
> > + cpu = raw_smp_processor_id();
> > + data = per_cpu_ptr(tr->array_buffer.data, cpu);
> > + disabled = atomic_inc_return(&data->disabled);
> > + if (likely(disabled == 1)) {
> > + if (object_empty(tr))
> > + goto out;
> > + for (n = 0; n < max_args_num; n++) {
> > + obj = regs_get_kernel_argument(pt_regs,
>
> As I mentioned, the above is best effort at best. Instead, we should get
> the value and check if it changed. If it did, then we print, otherwise skip
> it. Will need to be under the lock to do this check.

Although this is not my original intention, I am also very happy to
accept your idea.

>
>
> > + if (object_exist((void *)obj, tr))
> > + submit_trace_object(ip, parent_ip, obj, tr);
> > + /* The parameters of a function may match multiple objects */
> > + }
> > + }
> > +out:
> > + atomic_dec(&data->disabled);
> > + preempt_enable_notrace();
> > +}
> > +
> > +static void
> > +trace_object_trigger(struct event_trigger_data *data,
> > + struct trace_buffer *buffer, void *rec,
> > + struct ring_buffer_event *event)
> > +{
> > + struct objtrace_trigger_data *obj_data = data->private_data;
> > + struct trace_array *tr = obj_data->tr;
> > + struct ftrace_event_field *field;
> > + void *obj = NULL;
> > +
> > + field = obj_data->field;
> > + memcpy(&obj, rec + field->offset, sizeof(obj));
> > + set_trace_object(obj, tr);
> > +}
> > +
> > +static void
> > +trace_object_trigger_free(struct event_trigger_data *data)
> > +{
> > + struct objtrace_trigger_data *obj_data = data->private_data;
> > +
> > + if (WARN_ON_ONCE(data->ref <= 0))
> > + return;
> > +
> > + data->ref--;
> > + if (!data->ref) {
> > + exit_trace_object(obj_data->tr);
> > + kfree(data->private_data);
> > + trigger_data_free(data);
> > + }
> > +}
> > +
> > +static void
> > +trace_object_count_trigger(struct event_trigger_data *data,
> > + struct trace_buffer *buffer, void *rec,
> > + struct ring_buffer_event *event)
> > +{
> > + if (!data->count)
> > + return;
> > +
> > + if (data->count != -1)
> > + (data->count)--;
> > +
> > + trace_object_trigger(data, buffer, rec, event);
> > +}
> > +
> > +static int
> > +event_trigger_print(const char *name, struct seq_file *m,
> > + void *data, char *filter_str, void *objtrace_data)
> > +{
> > + long count = (long)data;
> > + struct objtrace_trigger_data *obj_data = objtrace_data;
> > +
> > + seq_puts(m, name);
> > +
> > + seq_printf(m, ":%s", obj_data->objtrace_cmd);
> > + seq_printf(m, ":%s", obj_data->field->name);
> > +
> > + if (count == -1)
> > + seq_puts(m, ":unlimited");
> > + else
> > + seq_printf(m, ":count=%ld", count);
>
> We are not going to have an "unlimited" here. I think the count should be
> the number of objects we can trace.

Thanks, I agree.

>
> > +
> > + if (filter_str)
> > + seq_printf(m, " if %s\n", filter_str);
> > + else
> > + seq_putc(m, '\n');
> > +
> > + return 0;
> > +}
> > +
> > +static int event_object_trigger_init(struct event_trigger_data *data)
> > +{
> > + struct objtrace_trigger_data *obj_data = data->private_data;
> > + int ret;
> > +
> > + if (!data->ref) {
> > + ret = init_trace_object(obj_data->tr);
> > + if (ret)
> > + return ret;
> > + }
> > + data->ref++;
> > + return 0;
> > +}
> > +
> > +static int
> > +trace_object_trigger_print(struct seq_file *m, struct event_trigger_data *data)
> > +{
> > + return event_trigger_print("objtrace", m, (void *)data->count,
> > + data->filter_str, data->private_data);
> > +}
> > +
> > +static struct event_trigger_ops objecttrace_trigger_ops = {
> > + .trigger = trace_object_trigger,
> > + .print = trace_object_trigger_print,
> > + .init = event_object_trigger_init,
> > + .free = trace_object_trigger_free,
> > +};
> > +
> > +static struct event_trigger_ops objecttrace_count_trigger_ops = {
> > + .trigger = trace_object_count_trigger,
> > + .print = trace_object_trigger_print,
> > + .init = event_object_trigger_init,
> > + .free = trace_object_trigger_free,
> > +};
> > +
> > +static struct event_trigger_ops *
> > +objecttrace_get_trigger_ops(char *cmd, char *param)
> > +{
> > + return param ? &objecttrace_count_trigger_ops : &objecttrace_trigger_ops;
> > +}
> > +
> > +static bool field_exist(struct trace_event_file *file,
> > + struct event_command *cmd_ops,
> > + const char *field_name)
> > +{
> > + struct event_trigger_data *data;
> > + struct objtrace_trigger_data *obj_data;
> > +
> > + lockdep_assert_held(&event_mutex);
> > +
> > + list_for_each_entry(data, &file->triggers, list) {
> > + if (data->cmd_ops->trigger_type == cmd_ops->trigger_type) {
> > + obj_data = data->private_data;
> > + if (!strcmp(obj_data->field->name, field_name))
> > + return true;
> > + }
> > + }
> > +
> > + return false;
> > +}
> > +
> > +static int
> > +event_object_trigger_parse(struct event_command *cmd_ops,
> > + struct trace_event_file *file,
> > + char *glob, char *cmd, char *param_and_filter)
> > +{
> > + struct event_trigger_data *trigger_data;
> > + struct objtrace_trigger_data *obj_data;
> > + struct ftrace_event_field *field;
> > + char *objtrace_cmd, *arg;
> > + char *param, *filter;
> > + int ret;
> > + bool remove;
>
> BTW, do not declare more than one variable on the same line. It makes it
> hard to know what is declared, makes diffs harder to know what changed, and
> makes it annoying when the compiler says a variable is unused to delete it.
>
> The above should be:
>
> struct event_trigger_data *trigger_data;
> struct objtrace_trigger_data *obj_data;
> struct ftrace_event_field *field;
> char *objtrace_cmd;
> char *filter;
> char *param;
> char *arg;
> bool remove;
> int ret;

Thanks for your reminder .

> Although, when I changed this to use the fetch instruction, most of it was
> removed anyway (which is why I found the above to be annoying due to all the
> "unused variable" warnings I had).
>
> -- Steve
>
> > +
> > + remove = event_trigger_check_remove(glob);
> > +
> > + /*
> > + * separate the param and the filter:
> > + * objtrace:add:OBJ[:COUNT] [if filter]
> > + */
> > + ret = event_trigger_separate_filter(param_and_filter, &param, &filter, true);
> > + if (ret)
> > + return ret;
> > +
> > + objtrace_cmd = strsep(&param, ":");
> > + if (!objtrace_cmd || strcmp(objtrace_cmd, OBJTRACE_CMD_ADD)) {
> > + pr_err("error objtrace command\n");
> > + return -EINVAL;
> > + }
> > +
> > + arg = strsep(&param, ":");
> > + if (!arg)
> > + return -EINVAL;
> > +
> > + field = trace_find_event_field(file->event_call, arg);
> > + if (!field)
> > + return -EINVAL;
> > +
> > + if (field->size != sizeof(void *)) {
> > + pr_err("the size of the %s should be:%zu\n", field->name, sizeof(void *));
> > + return -EINVAL;
> > + }
> > +
> > + if (remove && !field_exist(file, cmd_ops, field->name))
> > + return -ENOENT;
> > +
> > + obj_data = kzalloc(sizeof(*obj_data), GFP_KERNEL);
> > + if (!obj_data)
> > + return -ENOMEM;
> > +
> > + obj_data->field = field;
> > + obj_data->tr = file->tr;
> > + snprintf(obj_data->objtrace_cmd, OBJTRACE_CMD_LEN, objtrace_cmd);
> > +
> > + trigger_data = event_trigger_alloc(cmd_ops, cmd, param, obj_data);
> > + if (!trigger_data) {
> > + kfree(obj_data);
> > + return -ENOMEM;
> > + }
> > + if (remove) {
> > + event_trigger_unregister(cmd_ops, file, glob+1, trigger_data);
> > + kfree(obj_data);
> > + kfree(trigger_data);
> > + return 0;
> > + }
> > +
> > + ret = event_trigger_parse_num(param, trigger_data);
> > + if (ret)
> > + goto out_free;
> > +
> > + ret = event_trigger_set_filter(cmd_ops, file, filter, trigger_data);
> > + if (ret < 0)
> > + goto out_free;
> > +
> > + ret = event_trigger_register(cmd_ops, file, glob, trigger_data);
> > + if (ret)
> > + goto out_free;
> > +
> > + return ret;
> > +
> >


--
Thanks,
JeffXie