Re: [PATCH 2/5] ftrace: infrastructure for supporting binary record

From: Frédéric Weisbecker
Date: Mon Jan 19 2009 - 14:28:31 EST


2009/1/2 Arnaldo Carvalho de Melo <acme@xxxxxxxxxxxxxxxxxx>:
> On Wed, Dec 31, 2008 at 2:59 AM, Frederic Weisbecker <fweisbec@xxxxxxxxx> wrote:
>> Hi Lai.
>>
>> On Wed, Dec 31, 2008 at 10:56:05AM +0800, Lai Jiangshan wrote:
>>>
>>> Impact: save on memory for tracing
>>>
>>> Current tracers are typically using a struct(like struct ftrace_entry,
>>> struct ctx_switch_entry, struct special_entr etc...)to record a binary
>>> event. These structs can only record a their own kind of events.
>>> A new kind of tracer need a new struct and a lot of code too handle it.
>>>
>>> So we need a generic binary record for events. This infrastructure
>>> is for this purpose.
>>>
>>> Signed-off-by: Lai Jiangshan <laijs@xxxxxxxxxxxxxx>
>>> ---
>>> include/linux/ftrace.h | 3 +
>>> kernel/trace/Kconfig | 6 ++
>>> kernel/trace/Makefile | 1
>>> kernel/trace/trace.c | 57 ++++++++++++++++++++++++++++
>>> kernel/trace/trace.h | 12 +++++
>>> kernel/trace/trace_bprintk.c | 87 +++++++++++++++++++++++++++++++++++++++++++
>>> kernel/trace/trace_output.c | 72 +++++++++++++++++++++++++++++++++++
>>> 7 files changed, 238 insertions(+)
>>> diff --git a/include/linux/ftrace.h b/include/linux/ftrace.h
>>> index 677432b..0ec4752 100644
>>> --- a/include/linux/ftrace.h
>>> +++ b/include/linux/ftrace.h
>>> @@ -181,6 +181,9 @@ extern int ftrace_make_nop(struct module *mod,
>>> */
>>> extern int ftrace_make_call(struct dyn_ftrace *rec, unsigned long addr);
>>>
>>> +#ifdef CONFIG_TRACE_BPRINTK
>>> +extern int trace_vbprintk(unsigned long ip, const char *fmt, va_list args);
>>> +#endif
>>>
>>> /* May be defined in arch */
>>> extern int ftrace_arch_read_dyn_info(char *buf, int size);
>>> diff --git a/kernel/trace/Kconfig b/kernel/trace/Kconfig
>>> index e2a4ff6..887bd89 100644
>>> --- a/kernel/trace/Kconfig
>>> +++ b/kernel/trace/Kconfig
>>> @@ -77,6 +77,12 @@ config FUNCTION_GRAPH_TRACER
>>> This is done by setting the current return address on the current
>>> task structure into a stack of calls.
>>>
>>> +config TRACE_BPRINTK
>>> + bool "Binary printk for tracing"
>>> + default y
>>> + depends on TRACING
>>> + select BINARY_PRINTF
>>> +
>>> config IRQSOFF_TRACER
>>> bool "Interrupts-off Latency Tracer"
>>> default n
>>> diff --git a/kernel/trace/Makefile b/kernel/trace/Makefile
>>> index 31cd5fb..8ca3952 100644
>>> --- a/kernel/trace/Makefile
>>> +++ b/kernel/trace/Makefile
>>> @@ -21,6 +21,7 @@ obj-$(CONFIG_RING_BUFFER) += ring_buffer.o
>>> obj-$(CONFIG_TRACING) += trace.o
>>> obj-$(CONFIG_TRACING) += trace_output.o
>>> obj-$(CONFIG_TRACING) += trace_stat.o
>>> +obj-$(CONFIG_TRACE_BPRINTK) += trace_bprintk.o
>>> obj-$(CONFIG_CONTEXT_SWITCH_TRACER) += trace_sched_switch.o
>>> obj-$(CONFIG_SYSPROF_TRACER) += trace_sysprof.o
>>> obj-$(CONFIG_FUNCTION_TRACER) += trace_functions.o
>>> diff --git a/kernel/trace/trace.c b/kernel/trace/trace.c
>>> index b789c01..6e5c9df 100644
>>> --- a/kernel/trace/trace.c
>>> +++ b/kernel/trace/trace.c
>>> @@ -3027,6 +3027,63 @@ int __ftrace_printk(unsigned long ip, const char *fmt, ...)
>>> }
>>> EXPORT_SYMBOL_GPL(__ftrace_printk);
>>>
>>> +/**
>>> + * trace_vbprintk - write binary msg to tracing buffer
>>> + *
>>> + * Caller must insure @fmt are valid when msg is in tracing buffer.
>>> + */
>>> +int trace_vbprintk(unsigned long ip, const char *fmt, va_list args)
>>> +{
>>> + static DEFINE_SPINLOCK(trace_buf_lock);
>>> + static u32 trace_buf[TRACE_BUF_SIZE];
>>> +
>>> + struct ring_buffer_event *event;
>>> + struct trace_array *tr = &global_trace;
>>> + struct trace_array_cpu *data;
>>> + struct bprintk_entry *entry;
>>> + unsigned long flags, irq_flags;
>>> + int cpu, len = 0, size, pc;
>>> +
>>> + if (tracing_disabled || !trace_bprintk_enable)
>>> + return 0;
>>> +
>>> + pc = preempt_count();
>>> + preempt_disable_notrace();
>>> + cpu = raw_smp_processor_id();
>>> + data = tr->data[cpu];
>>> +
>>> + if (unlikely(atomic_read(&data->disabled)))
>>> + goto out;
>>> +
>>> + spin_lock_irqsave(&trace_buf_lock, flags);
>>> + len = vbin_printf(trace_buf, TRACE_BUF_SIZE, fmt, args);
>>> +
>>> + if (len > TRACE_BUF_SIZE || len < 0)
>>> + goto out_unlock;
>>> +
>>> + size = sizeof(*entry) + sizeof(u32) * len;
>>> + event = ring_buffer_lock_reserve(tr->buffer, size, &irq_flags);
>>> + if (!event)
>>> + goto out_unlock;
>>> + entry = ring_buffer_event_data(event);
>>> + tracing_generic_entry_update(&entry->ent, flags, pc);
>>> + entry->ent.type = TRACE_BPRINTK;
>>> + entry->ip = ip;
>>> + entry->fmt = fmt;
>>> +
>>> + memcpy(entry->buf, trace_buf, sizeof(u32) * len);
>>> + ring_buffer_unlock_commit(tr->buffer, event, irq_flags);
>>> +
>>> +out_unlock:
>>> + spin_unlock_irqrestore(&trace_buf_lock, flags);
>>> +
>>> +out:
>>> + preempt_enable_notrace();
>>> +
>>> + return len;
>>> +}
>>> +EXPORT_SYMBOL_GPL(trace_vbprintk);
>>> +
>>> static int trace_panic_handler(struct notifier_block *this,
>>> unsigned long event, void *unused)
>>> {
>>> diff --git a/kernel/trace/trace.h b/kernel/trace/trace.h
>>> index a8b624c..6885d13 100644
>>> --- a/kernel/trace/trace.h
>>> +++ b/kernel/trace/trace.h
>>> @@ -18,6 +18,7 @@ enum trace_type {
>>> TRACE_WAKE,
>>> TRACE_STACK,
>>> TRACE_PRINT,
>>> + TRACE_BPRINTK,
>>> TRACE_SPECIAL,
>>> TRACE_MMIO_RW,
>>> TRACE_MMIO_MAP,
>>> @@ -120,6 +121,16 @@ struct print_entry {
>>> char buf[];
>>> };
>>>
>>> +struct bprintk_entry {
>>> + struct trace_entry ent;
>>> + unsigned long ip;
>>> + const char *fmt;
>>> + u32 buf[];
>>> +};
>>> +#ifdef CONFIG_TRACE_BPRINTK
>>> +extern int trace_bprintk_enable;
>>> +#endif
>>> +
>>> #define TRACE_OLD_SIZE 88
>>>
>>> struct trace_field_cont {
>>> @@ -262,6 +273,7 @@ extern void __ftrace_bad_type(void);
>>> IF_ASSIGN(var, ent, struct stack_entry, TRACE_STACK); \
>>> IF_ASSIGN(var, ent, struct userstack_entry, TRACE_USER_STACK);\
>>> IF_ASSIGN(var, ent, struct print_entry, TRACE_PRINT); \
>>> + IF_ASSIGN(var, ent, struct bprintk_entry, TRACE_BPRINTK);\
>>> IF_ASSIGN(var, ent, struct special_entry, 0); \
>>> IF_ASSIGN(var, ent, struct trace_mmiotrace_rw, \
>>> TRACE_MMIO_RW); \
>>> diff --git a/kernel/trace/trace_bprintk.c b/kernel/trace/trace_bprintk.c
>>> new file mode 100644
>>> index 0000000..1f8e532
>>> --- /dev/null
>>> +++ b/kernel/trace/trace_bprintk.c
>>> @@ -0,0 +1,87 @@
>>> +/*
>>> + * trace binary printk
>>> + *
>>> + * Copyright (C) 2008 Lai Jiangshan <laijs@xxxxxxxxxxxxxx>
>>> + *
>>> + */
>>> +#include <linux/kernel.h>
>>> +#include <linux/ftrace.h>
>>> +#include <linux/string.h>
>>> +#include <linux/ctype.h>
>>> +#include <linux/list.h>
>>> +#include <linux/mutex.h>
>>> +#include <linux/slab.h>
>>> +#include <linux/module.h>
>>> +#include <linux/seq_file.h>
>>> +#include <linux/fs.h>
>>> +#include <linux/marker.h>
>>> +#include <linux/uaccess.h>
>>> +
>>> +#include "trace.h"
>>> +
>>> +/* binary printk basic */
>>> +static DEFINE_MUTEX(btrace_mutex);
>>
>> Looks like you only need a mutex to protect a counter.
>> Wouldn't it be better to have this counter atomic and then drop
>> out the mutex?
>>
>>> +static int btrace_metadata_count;
>>> +
>>> +static inline void lock_btrace(void)
>>> +{
>>> + mutex_lock(&btrace_mutex);
>>> +}
>>> +
>>> +static inline void unlock_btrace(void)
>>> +{
>>> + mutex_unlock(&btrace_mutex);
>>> +}
>>> +
>>> +static void get_btrace_metadata(void)
>>> +{
>>> + lock_btrace();
>>> + btrace_metadata_count++;
>>> + unlock_btrace();
>>> +}
>>> +
>>> +static void put_btrace_metadata(void)
>>> +{
>>> + lock_btrace();
>>> + btrace_metadata_count--;
>>> + unlock_btrace();
>>> +}
>>> +
>>> +/* events tracer */
>>> +int trace_bprintk_enable;
>>> +
>>> +static void start_bprintk_trace(struct trace_array *tr)
>>> +{
>>> + get_btrace_metadata();
>>> + tracing_reset_online_cpus(tr);
>>> + trace_bprintk_enable = 1;
>>> +}
>>> +
>>> +static void stop_bprintk_trace(struct trace_array *tr)
>>> +{
>>> + trace_bprintk_enable = 0;
>>> + tracing_reset_online_cpus(tr);
>>> + put_btrace_metadata();
>>> +}
>>> +
>>> +static int init_bprintk_trace(struct trace_array *tr)
>>> +{
>>> + start_bprintk_trace(tr);
>>> + return 0;
>>> +}
>>> +
>>> +static struct tracer bprintk_trace __read_mostly =
>>> +{
>>> + .name = "events",
>>> + .init = init_bprintk_trace,
>>> + .reset = stop_bprintk_trace,
>>> + .start = start_bprintk_trace,
>>> + .stop = stop_bprintk_trace,
>>> +};
>>> +
>>> +static __init int init_bprintk(void)
>>> +{
>>> + return register_tracer(&bprintk_trace);
>>> +}
>>> +
>>> +device_initcall(init_bprintk);
>>> diff --git a/kernel/trace/trace_output.c b/kernel/trace/trace_output.c
>>> index df0c25c..ccc9fa7 100644
>>> --- a/kernel/trace/trace_output.c
>>> +++ b/kernel/trace/trace_output.c
>>> @@ -53,6 +53,26 @@ trace_seq_printf(struct trace_seq *s, const char *fmt, ...)
>>> return len;
>>> }
>>>
>>> +static int
>>> +trace_seq_bprintf(struct trace_seq *s, const char *fmt, const u32 *binary)
>>> +{
>>> + int len = (PAGE_SIZE - 1) - s->len;
>>> + int ret;
>>> +
>>> + if (!len)
>>> + return 0;
>>> +
>>> + ret = bstr_printf(s->buffer + s->len, len, fmt, binary);
>>> +
>>> + /* If we can't write it all, don't bother writing anything */
>>> + if (ret >= len)
>>> + return 0;
>>> +
>>> + s->len += ret;
>>> +
>>> + return len;
>>> +}
>>> +
>>> /**
>>> * trace_seq_puts - trace sequence printing of simple string
>>> * @s: trace sequence descriptor
>>> @@ -800,6 +820,57 @@ static struct trace_event trace_print_event = {
>>> .binary = trace_nop_print,
>>> };
>>>
>>> +/* TRACE_BPRINTK */
>>> +static int
>>> +trace_bprintk_print(struct trace_seq *s, struct trace_entry *entry, int flags)
>>> +{
>>> + struct bprintk_entry *field;
>>> +
>>> + trace_assign_type(field, entry);
>>> +
>>> + if (!seq_print_ip_sym(s, field->ip, flags))
>>> + goto partial;
>>> +
>>> + if (!trace_seq_puts(s, ": "))
>>> + goto partial;
>>> +
>>> + if (!trace_seq_bprintf(s, field->fmt, field->buf))
>>> + goto partial;
>>> +
>>> + return 0;
>>> +
>>> + partial:
>>> + return TRACE_TYPE_PARTIAL_LINE;
>>> +}
>>> +
>>> +static int
>>> +trace_bprintk_raw(struct trace_seq *s, struct trace_entry *entry, int flags)
>>> +{
>>> + struct bprintk_entry *field;
>>> +
>>> + trace_assign_type(field, entry);
>>> +
>>> + if (!trace_seq_printf(s, ": %lx : ", field->ip))
>>> + goto partial;
>>> +
>>> + if (!trace_seq_bprintf(s, field->fmt, field->buf))
>>> + goto partial;
>>> +
>>> + return 0;
>>> +
>>> + partial:
>>> + return TRACE_TYPE_PARTIAL_LINE;
>>> +}
>>> +
>>> +static struct trace_event trace_bprintk_event = {
>>> + .type = TRACE_BPRINTK,
>>> + .trace = trace_bprintk_print,
>>> + .latency_trace = trace_bprintk_print,
>>> + .raw = trace_bprintk_raw,
>>> + .hex = trace_nop_print,
>>> + .binary = trace_nop_print,
>>> +};
>>> +
>>> static struct trace_event *events[] __initdata = {
>>> &trace_fn_event,
>>> &trace_ctx_event,
>>> @@ -808,6 +879,7 @@ static struct trace_event *events[] __initdata = {
>>> &trace_stack_event,
>>> &trace_user_stack_event,
>>> &trace_print_event,
>>> + &trace_bprintk_event,
>>> NULL
>>> };
>>>
>>
>>
>> Just for curiosity. Why do you need such a binary tracing?
>> Do you need it because a string output is really too slow for your needs?
>
> warning: I haven't looked at the patch details
>
> But I would love to use something like this to provide the exact
> contents the userspace blktrace utilities want.
>
> - Arnaldo
>


Hi Arnaldo,

Since you talked about binary record for the blk tracer, I just recall
this patch.
Are you sure this infrastructure would cover your needs?

>From your traced site, trace_vbprintk will carry your random typed
datas in a binary-contiguous way.
But actually, its purpose is more about holding binary data transport
to finally print them in a formatted string
output, as usual.

You can compare it with ftrace_printk, since the result is the same.
The difference is in the transport.
ftrace_printk will carry your data as already formatted whereas
trace_vbprintk will carry them as binary values and format them
in the last moment. On most cases, trace_vbprintk would be logically
more lightweight.


But if you have some predefined typed data to export as binary, I
think that can't really help you.
You would prefer to define you own type and then export them as binary
values with your own output helper function.

Hmm?

Steven, Ingo, what are your thoughts about this patch?
--
To unsubscribe from this list: send the line "unsubscribe linux-kernel" in
the body of a message to majordomo@xxxxxxxxxxxxxxx
More majordomo info at http://vger.kernel.org/majordomo-info.html
Please read the FAQ at http://www.tux.org/lkml/