Re: [PATCH] tracing: Add trace_trigger kernel command line option

From: Joel Fernandes
Date: Thu Oct 20 2022 - 10:44:11 EST


Hi Steven,

On Wed, Oct 19, 2022 at 8:01 PM Steven Rostedt <rostedt@xxxxxxxxxxx> wrote:
>
> From: "Steven Rostedt (Google)" <rostedt@xxxxxxxxxxx>
>
> Allow triggers to be enabled at kernel boot up. For example:
>
> trace_trigger="sched_switch.stacktrace if prev_state == 2"
>
> The above will enable the stacktrace trigger on top of the sched_switch
> event and only trigger if its prev_state is 2 (TASK_UNINTERRUPTIBLE). Then
> at boot up, a stacktrace will trigger and be recorded in the tracing ring
> buffer every time the sched_switch happens where the previous state is
> TASK_UNINTERRUPTIBLE.
>
> As this calls into tracepoint logic during very early boot (before
> interrupts are enabled), a check has to be done to see if early boot
> interrupts are still disabled, and if so, avoid any call to RCU
> synchronization, as that will enable interrupts and cause boot up issues.
>
> Signed-off-by: Steven Rostedt (Google) <rostedt@xxxxxxxxxxx>
> ---

rest_init() -> rcu_scheduler_starting() sets rcu_scheduler_active to
RCU_SCHEDULER_INIT.

start_kernel() disables interrupts and does the dance you mentioned,
setting early_boot_irqs_disabled along the way; start_kernel() itself
is called from arch code.
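
For reference, the ordering in init/main.c looks roughly like this
(heavily abridged, definitely not verbatim):

void __init start_kernel(void)		/* called from arch code */
{
	...
	local_irq_disable();
	early_boot_irqs_disabled = true;
	...
	early_boot_irqs_disabled = false;
	local_irq_enable();
	...
	arch_call_rest_init();	/* -> rest_init() -> rcu_scheduler_starting() */
}

rest_init() is the very last thing start_kernel() does, so
rcu_scheduler_active only moves past RCU_SCHEDULER_INACTIVE after all
of that early init has run.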

From your crash, the trace trigger is happening before the early_ boot
flag is set to true. rcu_blocking_is_gp() should return true at the
point your trigger fires, because rcu_scheduler_active is still
RCU_SCHEDULER_INACTIVE.

So I would expect your synchronize_rcu() to not do anything.
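
To spell out what I mean, the gist of synchronize_rcu() in
kernel/rcu/tree.c is (paraphrased, not the exact source):

void synchronize_rcu(void)
{
	...
	if (rcu_blocking_is_gp())
		return;		/* early boot: vacuous GP, no blocking */
	/* otherwise it really waits, expedited or via wait_rcu_gp() */
	...
}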

There are comments in rcu_blocking_is_gp() like so:
* During early boot, any blocking grace-period wait automatically
* implies a grace period.
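
and the check itself boils down to (again paraphrasing, not verbatim):

static int rcu_blocking_is_gp(void)
{
	if (rcu_scheduler_active == RCU_SCHEDULER_INACTIVE)
		return true;	/* a blocking wait implies a GP */
	/* otherwise fall back to looking at the online CPU count etc. */
	...
}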

So it should be returning straight from there. I wonder where/why it's
re-enabling interrupts for you. What does rcu_blocking_is_gp() return
at the time of the crash, if you can print that?
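
rcu_blocking_is_gp() is static to kernel/rcu/tree.c, so if printing its
return value directly is awkward, a throwaway printk right before the
synchronize_rcu() call you're hitting would also help, something like:

	pr_info("trace trigger: early_boot_irqs_disabled=%d irqs_disabled=%d\n",
		early_boot_irqs_disabled, irqs_disabled());

just to confirm what state we're actually in at that point.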

Cheers,

- J


> .../admin-guide/kernel-parameters.txt | 19 ++++++
> include/linux/tracepoint.h | 4 ++
> kernel/trace/trace.c | 3 +-
> kernel/trace/trace_events.c | 63 ++++++++++++++++++-
> kernel/tracepoint.c | 6 ++
> 5 files changed, 92 insertions(+), 3 deletions(-)
>
> diff --git a/Documentation/admin-guide/kernel-parameters.txt b/Documentation/admin-guide/kernel-parameters.txt
> index a465d5242774..ccf91a4bf113 100644
> --- a/Documentation/admin-guide/kernel-parameters.txt
> +++ b/Documentation/admin-guide/kernel-parameters.txt
> @@ -6257,6 +6257,25 @@
> See also Documentation/trace/ftrace.rst "trace options"
> section.
>
> + trace_trigger=[trigger-list]
> + [FTRACE] Add an event trigger on specific events.
> + Set a trigger on top of a specific event, with an optional
> + filter.
> +
> + The format is "trace_trigger=<event>.<trigger>[ if <filter>],..."
> + More than one trigger may be specified, comma delimited.
> +
> + For example:
> +
> + trace_trigger="sched_switch.stacktrace if prev_state == 2"
> +
> + The above will enable the "stacktrace" trigger on the "sched_switch"
> + event but only trigger it if the "prev_state" of the "sched_switch"
> + event is "2" (TASK_UNINTERRUPTIBLE).
> +
> + See also "Event triggers" in Documentation/trace/events.rst
> +
> +
> traceoff_on_warning
> [FTRACE] enable this option to disable tracing when a
> warning is hit. This turns off "tracing_on". Tracing can
> diff --git a/include/linux/tracepoint.h b/include/linux/tracepoint.h
> index 4b33b95eb8be..a5c6b5772897 100644
> --- a/include/linux/tracepoint.h
> +++ b/include/linux/tracepoint.h
> @@ -90,6 +90,10 @@ int unregister_tracepoint_module_notifier(struct notifier_block *nb)
> #ifdef CONFIG_TRACEPOINTS
> static inline void tracepoint_synchronize_unregister(void)
> {
> + /* Early updates do not need synchronization */
> + if (early_boot_irqs_disabled)
> + return;
> +
> synchronize_srcu(&tracepoint_srcu);
> synchronize_rcu();
> }
> diff --git a/kernel/trace/trace.c b/kernel/trace/trace.c
> index 47a44b055a1d..c03fd7037add 100644
> --- a/kernel/trace/trace.c
> +++ b/kernel/trace/trace.c
> @@ -2749,7 +2749,8 @@ void trace_buffered_event_disable(void)
> preempt_enable();
>
> /* Wait for all current users to finish */
> - synchronize_rcu();
> + if (!early_boot_irqs_disabled)
> + synchronize_rcu();
>
> for_each_tracing_cpu(cpu) {
> free_page((unsigned long)per_cpu(trace_buffered_event, cpu));
> diff --git a/kernel/trace/trace_events.c b/kernel/trace/trace_events.c
> index 0356cae0cf74..06554939252c 100644
> --- a/kernel/trace/trace_events.c
> +++ b/kernel/trace/trace_events.c
> @@ -2796,6 +2796,44 @@ trace_create_new_event(struct trace_event_call *call,
> return file;
> }
>
> +#ifdef CONFIG_HIST_TRIGGERS
> +#define MAX_BOOT_TRIGGERS 32
> +
> +static struct boot_triggers {
> + const char *event;
> + char *trigger;
> +} bootup_triggers[MAX_BOOT_TRIGGERS];
> +
> +static char bootup_trigger_buf[COMMAND_LINE_SIZE];
> +static int nr_boot_triggers;
> +
> +static __init int setup_trace_triggers(char *str)
> +{
> + char *trigger;
> + char *buf;
> + int i;
> +
> + strlcpy(bootup_trigger_buf, str, COMMAND_LINE_SIZE);
> + ring_buffer_expanded = true;
> + disable_tracing_selftest("running event triggers");
> +
> + buf = bootup_trigger_buf;
> + for (i = 0; i < MAX_BOOT_TRIGGERS; i++) {
> + trigger = strsep(&buf, ",");
> + if (!trigger)
> + break;
> + bootup_triggers[i].event = strsep(&trigger, ".");
> + bootup_triggers[i].trigger = strsep(&trigger, ".");
> + if (!bootup_triggers[i].trigger)
> + break;
> + }
> +
> + nr_boot_triggers = i;
> + return 1;
> +}
> +__setup("trace_trigger=", setup_trace_triggers);
> +#endif
> +
> /* Add an event to a trace directory */
> static int
> __trace_add_new_event(struct trace_event_call *call, struct trace_array *tr)
> @@ -2822,12 +2860,32 @@ __trace_early_add_new_event(struct trace_event_call *call,
> struct trace_array *tr)
> {
> struct trace_event_file *file;
> + int ret;
> + int i;
>
> file = trace_create_new_event(call, tr);
> if (!file)
> return -ENOMEM;
>
> - return event_define_fields(call);
> + ret = event_define_fields(call);
> + if (ret)
> + return ret;
> +
> +#ifdef CONFIG_HIST_TRIGGERS
> + for (i = 0; i < nr_boot_triggers; i++) {
> + if (strcmp(trace_event_name(call), bootup_triggers[i].event))
> + continue;
> + mutex_lock(&event_mutex);
> + ret = trigger_process_regex(file, bootup_triggers[i].trigger);
> + mutex_unlock(&event_mutex);
> + if (ret)
> + pr_err("Failed to register trigger '%s' on event %s\n",
> + bootup_triggers[i].trigger,
> + bootup_triggers[i].event);
> + }
> +#endif
> +
> + return 0;
> }
>
> struct ftrace_module_file_ops;
> @@ -3726,6 +3784,8 @@ static __init int event_trace_enable(void)
> list_add(&call->list, &ftrace_events);
> }
>
> + register_trigger_cmds();
> +
> /*
> * We need the top trace array to have a working set of trace
> * points at early init, before the debug files and directories
> @@ -3740,7 +3800,6 @@ static __init int event_trace_enable(void)
>
> register_event_cmds();
>
> - register_trigger_cmds();
>
> return 0;
> }
> diff --git a/kernel/tracepoint.c b/kernel/tracepoint.c
> index f23144af5743..f6e4ee1e40b3 100644
> --- a/kernel/tracepoint.c
> +++ b/kernel/tracepoint.c
> @@ -48,6 +48,9 @@ static void tp_rcu_get_state(enum tp_transition_sync sync)
> {
> struct tp_transition_snapshot *snapshot = &tp_transition_snapshot[sync];
>
> + if (early_boot_irqs_disabled)
> + return;
> +
> /* Keep the latest get_state snapshot. */
> snapshot->rcu = get_state_synchronize_rcu();
> snapshot->srcu = start_poll_synchronize_srcu(&tracepoint_srcu);
> @@ -58,6 +61,9 @@ static void tp_rcu_cond_sync(enum tp_transition_sync sync)
> {
> struct tp_transition_snapshot *snapshot = &tp_transition_snapshot[sync];
>
> + if (early_boot_irqs_disabled)
> + return;
> +
> if (!snapshot->ongoing)
> return;
> cond_synchronize_rcu(snapshot->rcu);
> --
> 2.35.1
>