[PATCH] hwlathist: BPF based latency tracing

From: Daniel Wagner
Date: Thu Jun 11 2015 - 03:19:03 EST


BPF offers another way to generate latency histograms. We attach
kprobes at trace_preempt_off and trace_preempt_on and calculate the
time it takes to from seeing the off/on transition.

The first hashmap is used to store the start time stamp. The key is the
CPU id. The second hashmap stores the log2(time diff). To be able to
have a per-cpu histogram the CPU ID is added to the upper
bits of the hash key.

Obviously, adding kprobes to trace_preempt_[on|off] is
asking for problems:

[ 4425.690514] Modules linked in:
[ 4425.690770] CPU: 1 PID: 1803 Comm: hackbench Not tainted 4.1.0-rc7+ #689
[ 4425.691029] Hardware name: QEMU Standard PC (i440FX + PIIX, 1996), BIOS 1.7.5-20140709_153950- 04/01/2014
[ 4425.691029] task: ffff88007c305240 ti: ffff880078fbc000 task.ti: ffff880078fbc000
[ 4425.691029] RIP: 0010:[<ffffffff8115dc37>] [<ffffffff8115dc37>] htab_map_delete_elem+0x1a7/0x240
[ 4425.691029] RSP: 0018:ffff88007d007af8 EFLAGS: 00010082
[ 4425.691029] RAX: ffff880079bfe880 RBX: ffff88007d007dbc RCX: dead000000200200
[ 4425.691029] RDX: ffff880079bfe880 RSI: ffff88007d007dbc RDI: ffff880079bfe8a8
[ 4425.691029] RBP: ffff88007d007b38 R08: 0000000000000000 R09: 0000000000000000
[ 4425.691029] R10: 0000000000000001 R11: 0000000000000008 R12: ffff880079894500
[ 4425.691029] R13: 0000000000000004 R14: ffff880079894578 R15: 0000000000000096
[ 4425.691029] FS: 00007ff5c4d51740(0000) GS:ffff88007d000000(0000) knlGS:0000000000000000
[ 4425.691029] CS: 0010 DS: 0000 ES: 0000 CR0: 0000000080050033
[ 4425.691029] CR2: 0000000000fad118 CR3: 0000000078ed6000 CR4: 00000000001406e0
[ 4425.691029] Stack:
[ 4425.691029] ffff88007d007b28 72a82a9b00000092 ffff880079894500 ffff880079894500
[ 4425.691029] ffff88007d007dbc 0000000000000001 ffff88000004d000 ffffffff81141ca1
[ 4425.691029] ffff88007d007b58 ffffffff8115e2d8 ffffc9000021c358 ffffffff81c19780
[ 4425.691029] Call Trace:
[ 4425.691029] <#DB>
[ 4425.691029] [<ffffffff81141ca1>] ? trace_preempt_on+0x1/0x110
[ 4425.691029] [<ffffffff8115e2d8>] bpf_map_delete_elem+0x28/0x60
[ 4425.691029] [<ffffffff81158e53>] __bpf_prog_run+0xa63/0x1230
[ 4425.691029] [<ffffffff81090895>] ? sched_clock_local+0x25/0x90
[ 4425.691029] [<ffffffff81090b88>] ? sched_clock_cpu+0xa8/0x100
[ 4425.691029] [<ffffffff810acb1c>] ? __lock_acquire+0xaac/0x2280
[ 4425.691029] [<ffffffff81090895>] ? sched_clock_local+0x25/0x90
[ 4425.691029] [<ffffffff8114e44f>] ? trace_call_bpf+0x5f/0x1a0
[ 4425.691029] [<ffffffff8114e497>] trace_call_bpf+0xa7/0x1a0
[ 4425.691029] [<ffffffff8114e44f>] ? trace_call_bpf+0x5f/0x1a0
[ 4425.691029] [<ffffffff8114e758>] kprobe_perf_func+0x28/0x240
[ 4425.691029] [<ffffffff810447c4>] ? skip_prefixes+0x24/0x50
[ 4425.691029] [<ffffffff81141ca1>] ? trace_preempt_on+0x1/0x110
[ 4425.691029] [<ffffffff81150408>] kprobe_dispatcher+0x38/0x60
[ 4425.691029] [<ffffffff81141ca0>] ? time_hardirqs_off+0x110/0x110
[ 4425.691029] [<ffffffff81141ca0>] ? time_hardirqs_off+0x110/0x110
[ 4425.691029] [<ffffffff81045628>] kprobe_int3_handler+0x1b8/0x1e0
[ 4425.691029] [<ffffffff81003683>] do_int3+0x53/0x140
[ 4425.691029] [<ffffffff8111ca18>] ? audit_alloc+0xb8/0x2a0
[ 4425.691029] [<ffffffff81b714f1>] int3+0x41/0x80
[ 4425.691029] [<ffffffff8111ca18>] ? audit_alloc+0xb8/0x2a0
[ 4425.691029] [<ffffffff81b71de6>] ? smp_apic_timer_interrupt+0x46/0x60
[ 4425.691029] [<ffffffff81b71de6>] ? smp_apic_timer_interrupt+0x46/0x60
[ 4425.691029] [<ffffffff810606e9>] ? irq_exit+0x19/0xc0
[ 4425.691029] [<ffffffff81141ca1>] ? trace_preempt_on+0x1/0x110
[ 4425.691029] <<EOE>>
[ 4425.691029] <IRQ> [ 4425.691029] [<ffffffff8108a77b>] ? preempt_count_sub+0xab/0xf0
[ 4425.691029] [<ffffffff810606e9>] irq_exit+0x19/0xc0
[ 4425.691029] [<ffffffff81b71de6>] smp_apic_timer_interrupt+0x46/0x60
[ 4425.691029] [<ffffffff81b70800>] apic_timer_interrupt+0x70/0x80
[ 4425.691029] <EOI>
[ 4425.691029] [<ffffffff810af7e0>] ? lock_release+0x130/0x480
[ 4425.691029] [<ffffffff8111ca3d>] audit_alloc+0xdd/0x2a0
[ 4425.691029] [<ffffffff8111c978>] ? audit_alloc+0x18/0x2a0
[ 4425.691029] [<ffffffff810579d3>] copy_process.part.37+0x753/0x1b10
[ 4425.691029] [<ffffffff8108a77b>] ? preempt_count_sub+0xab/0xf0
[ 4425.691029] [<ffffffff8111cedc>] ? __audit_syscall_entry+0x9c/0xf0
[ 4425.691029] [<ffffffff81058f4b>] do_fork+0xdb/0x7c0
[ 4425.691029] [<ffffffff8111cedc>] ? __audit_syscall_entry+0x9c/0xf0
[ 4425.691029] [<ffffffff8111cedc>] ? __audit_syscall_entry+0x9c/0xf0
[ 4425.691029] [<ffffffff81010dfc>] ? do_audit_syscall_entry+0x6c/0x70
[ 4425.691029] [<ffffffff81011bd3>] ? syscall_trace_enter_phase1+0x103/0x170
[ 4425.691029] [<ffffffff814fb4e8>] ? trace_hardirqs_on_thunk+0x17/0x19
[ 4425.691029] [<ffffffff810596b6>] SyS_clone+0x16/0x20
[ 4425.691029] [<ffffffff81b6f997>] system_call_fastpath+0x12/0x6f
[ 4425.691029] Code: 8b 54 24 70 44 89 e9 83 e8 01 21 f0 48 8d 3c c2 48 89 da e8 3c f7 ff ff 48 85 c0 0f 84 8f 00 00 00 48 8b 10 48 8b 48 08 48 85 d2 <48> 89 11 74 04 48 89 4a 08 48 bb 00 02 20 00 00 00 ad de 48 8d
[ 4425.691029] RIP [<ffffffff8115dc37>] htab_map_delete_elem+0x1a7/0x240
[ 4425.691029] RSP <ffff88007d007af8>
[ 4425.691029] ---[ end trace 42ef1259d92d0b4a ]---
[ 4425.691029] Kernel panic - not syncing: Fatal exception in interrupt
[ 4425.691029] Kernel Offset: disabled
[ 4425.691029] ---[ end Kernel panic - not syncing: Fatal exception in interrupt

Nevertheless, I was able run it for a while before it exploded
and got this:

CPU 0
latency : count distribution
1 -> 1 : 0 | |
2 -> 3 : 0 | |
4 -> 7 : 0 | |
8 -> 15 : 3602 |************************************************* |
16 -> 31 : 22 | |
32 -> 63 : 0 | |
64 -> 127 : 0 | |
128 -> 255 : 0 | |
256 -> 511 : 0 | |
512 -> 1023 : 0 | |
CPU 1
latency : count distribution
1 -> 1 : 0 | |
2 -> 3 : 0 | |
4 -> 7 : 0 | |
8 -> 15 : 7533 |************************************************* |
16 -> 31 : 125 | |
32 -> 63 : 0 | |
64 -> 127 : 0 | |
128 -> 255 : 0 | |
256 -> 511 : 0 | |
512 -> 1023 : 0 | |
CPU 2
latency : count distribution
1 -> 1 : 0 | |
2 -> 3 : 0 | |
4 -> 7 : 0 | |
8 -> 15 : 3044 |************************************************* |
16 -> 31 : 13 | |
32 -> 63 : 0 | |
64 -> 127 : 0 | |
128 -> 255 : 0 | |
256 -> 511 : 0 | |
512 -> 1023 : 0 | |
CPU 3
latency : count distribution
1 -> 1 : 0 | |
2 -> 3 : 0 | |
4 -> 7 : 0 | |
8 -> 15 : 25182 |************************************************* |
16 -> 31 : 1675 |** |
32 -> 63 : 0 | |
64 -> 127 : 0 | |
128 -> 255 : 0 | |
256 -> 511 : 0 | |
512 -> 1023 : 0 | |

The numbers look a bit too nice and low. I suspect something is going
wrong.

All this is based on the tracex3 examples written by
Alexei Starovoitov <ast@xxxxxxxxxxxx>.

Not for inclusion!

Not-signed-off-by: Daniel Wagner <daniel.wagner@xxxxxxxxxxxx>
---
kernel/trace/bpf_trace.c | 2 +
samples/bpf/Makefile | 6 ++-
samples/bpf/bpf_helpers.h | 2 +
samples/bpf/hwlathist_kern.c | 92 ++++++++++++++++++++++++++++++++++++
samples/bpf/hwlathist_user.c | 108 +++++++++++++++++++++++++++++++++++++++++++
5 files changed, 209 insertions(+), 1 deletion(-)
create mode 100644 samples/bpf/hwlathist_kern.c
create mode 100644 samples/bpf/hwlathist_user.c

diff --git a/kernel/trace/bpf_trace.c b/kernel/trace/bpf_trace.c
index 2d56ce5..e4d3e76 100644
--- a/kernel/trace/bpf_trace.c
+++ b/kernel/trace/bpf_trace.c
@@ -172,6 +172,8 @@ static const struct bpf_func_proto *kprobe_prog_func_proto(enum bpf_func_id func
return &bpf_probe_read_proto;
case BPF_FUNC_ktime_get_ns:
return &bpf_ktime_get_ns_proto;
+ case BPF_FUNC_get_smp_processor_id:
+ return &bpf_get_smp_processor_id_proto;

case BPF_FUNC_trace_printk:
/*
diff --git a/samples/bpf/Makefile b/samples/bpf/Makefile
index 8fdbd73..8420648 100644
--- a/samples/bpf/Makefile
+++ b/samples/bpf/Makefile
@@ -10,6 +10,7 @@ hostprogs-y += tracex1
hostprogs-y += tracex2
hostprogs-y += tracex3
hostprogs-y += tracex4
+hostprogs-y += hwlathist

test_verifier-objs := test_verifier.o libbpf.o
test_maps-objs := test_maps.o libbpf.o
@@ -20,6 +21,7 @@ tracex1-objs := bpf_load.o libbpf.o tracex1_user.o
tracex2-objs := bpf_load.o libbpf.o tracex2_user.o
tracex3-objs := bpf_load.o libbpf.o tracex3_user.o
tracex4-objs := bpf_load.o libbpf.o tracex4_user.o
+hwlathist-objs := bpf_load.o libbpf.o hwlathist_user.o

# Tell kbuild to always build the programs
always := $(hostprogs-y)
@@ -30,6 +32,7 @@ always += tracex2_kern.o
always += tracex3_kern.o
always += tracex4_kern.o
always += tcbpf1_kern.o
+always += hwlathist_kern.o

HOSTCFLAGS += -I$(objtree)/usr/include

@@ -40,9 +43,10 @@ HOSTLOADLIBES_tracex1 += -lelf
HOSTLOADLIBES_tracex2 += -lelf
HOSTLOADLIBES_tracex3 += -lelf
HOSTLOADLIBES_tracex4 += -lelf -lrt
+HOSTLOADLIBES_hwlathist += -lelf

# point this to your LLVM backend with bpf support
-LLC=$(srctree)/tools/bpf/llvm/bld/Debug+Asserts/bin/llc
+LLC = /opt/llvm/bin/llc

$(obj)/%.o: $(src)/%.c
clang $(NOSTDINC_FLAGS) $(LINUXINCLUDE) $(EXTRA_CFLAGS) \
diff --git a/samples/bpf/bpf_helpers.h b/samples/bpf/bpf_helpers.h
index f960b5f..c99bdb6 100644
--- a/samples/bpf/bpf_helpers.h
+++ b/samples/bpf/bpf_helpers.h
@@ -21,6 +21,8 @@ static unsigned long long (*bpf_ktime_get_ns)(void) =
(void *) BPF_FUNC_ktime_get_ns;
static int (*bpf_trace_printk)(const char *fmt, int fmt_size, ...) =
(void *) BPF_FUNC_trace_printk;
+static unsigned int (*bpf_get_smp_processor_id)(void) =
+ (void *) BPF_FUNC_get_smp_processor_id;

/* llvm builtin functions that eBPF C program may use to
* emit BPF_LD_ABS and BPF_LD_IND instructions
diff --git a/samples/bpf/hwlathist_kern.c b/samples/bpf/hwlathist_kern.c
new file mode 100644
index 0000000..7a97e7e
--- /dev/null
+++ b/samples/bpf/hwlathist_kern.c
@@ -0,0 +1,92 @@
+/* Copyright (c) 2013-2015 PLUMgrid, http://plumgrid.com
+ * Copyright (c) 2015 BMW Car IT GmbH
+ *
+ * This program is free software; you can redistribute it and/or
+ * modify it under the terms of version 2 of the GNU General Public
+ * License as published by the Free Software Foundation.
+ */
+#include <linux/version.h>
+#include <linux/ptrace.h>
+#include <uapi/linux/bpf.h>
+#include "bpf_helpers.h"
+
+struct bpf_map_def SEC("maps") my_map = {
+ .type = BPF_MAP_TYPE_HASH,
+ .key_size = sizeof(unsigned int),
+ .value_size = sizeof(u64),
+ .max_entries = 1024,
+};
+
+static unsigned int log2(unsigned int v)
+{
+ unsigned int r;
+ unsigned int shift;
+
+ r = (v > 0xFFFF) << 4; v >>= r;
+ shift = (v > 0xFF) << 3; v >>= shift; r |= shift;
+ shift = (v > 0xF) << 2; v >>= shift; r |= shift;
+ shift = (v > 0x3) << 1; v >>= shift; r |= shift;
+ r |= (v >> 1);
+
+ return r;
+}
+
+static unsigned int log2l(unsigned long v)
+{
+ unsigned int hi = v >> 32;
+
+ if (hi)
+ return log2(hi) + 32;
+ else
+ return log2(v);
+}
+
+SEC("kprobe/trace_preempt_off")
+int bpf_prog1(struct pt_regs *ctx)
+{
+ int cpu = bpf_get_smp_processor_id();
+ u64 ts = bpf_ktime_get_ns();
+
+ bpf_map_update_elem(&my_map, &cpu, &ts, BPF_ANY);
+
+ return 0;
+}
+
+struct bpf_map_def SEC("maps") my_lat = {
+ .type = BPF_MAP_TYPE_HASH,
+ .key_size = sizeof(unsigned long long),
+ .value_size = sizeof(long),
+ .max_entries = 1024,
+};
+
+SEC("kprobe/trace_preempt_on")
+int bpf_prog2(struct pt_regs *ctx)
+{
+ long init_val = 1;
+ long *value;
+ u64 *ts;
+ int cpu;
+
+ cpu = bpf_get_smp_processor_id();
+ ts = bpf_map_lookup_elem(&my_map, &cpu);
+ if (!ts)
+ return 0;
+
+ u64 cur_time = bpf_ktime_get_ns();
+ u64 delta = log2l(cur_time - *ts);
+
+ bpf_map_delete_elem(&my_map, &cpu);
+
+ u64 key = (((u64)cpu) << 32) | (log2l(delta) & 0xffffffff);
+ value = bpf_map_lookup_elem(&my_lat, &key);
+ if (value)
+ __sync_fetch_and_add((long *)value, 1);
+ else
+ bpf_map_update_elem(&my_lat, &key, &init_val, BPF_ANY);
+
+ return 0;
+
+}
+
+char _license[] SEC("license") = "GPL";
+u32 _version SEC("version") = LINUX_VERSION_CODE;
diff --git a/samples/bpf/hwlathist_user.c b/samples/bpf/hwlathist_user.c
new file mode 100644
index 0000000..7e19f2c
--- /dev/null
+++ b/samples/bpf/hwlathist_user.c
@@ -0,0 +1,108 @@
+/* Copyright (c) 2013-2015 PLUMgrid, http://plumgrid.com
+ * Copyright (c) 2015 BMW Car IT GmbH
+ *
+ * This program is free software; you can redistribute it and/or
+ * modify it under the terms of version 2 of the GNU General Public
+ * License as published by the Free Software Foundation.
+ */
+#include <stdio.h>
+#include <unistd.h>
+#include <stdlib.h>
+#include <signal.h>
+#include <linux/bpf.h>
+#include "libbpf.h"
+#include "bpf_load.h"
+
+#define MAX_ENTRIES 10
+#define MAX_CPU 8
+#define MAX_STARS 50
+
+static void stars(char *str, long val, long max, int width)
+{
+ int i;
+
+ for (i = 0; i < (width * val / max) - 1 && i < width - 1; i++)
+ str[i] = '*';
+ if (val > max)
+ str[i - 1] = '+';
+ str[i] = '\0';
+}
+
+struct cpu_hist {
+ long data[MAX_ENTRIES];
+ unsigned long max;
+};
+
+static struct cpu_hist cpu_hist[MAX_CPU];
+
+static void get_data(int fd)
+{
+ unsigned long long key, next_key, k;
+ long value;
+ int i, cpu;
+
+ for (i = 0; i < MAX_CPU; i++)
+ cpu_hist[i].max = 0;
+
+ key = 0;
+ while (bpf_get_next_key(fd, &key, &next_key) == 0) {
+ bpf_lookup_elem(fd, &next_key, &value);
+ cpu = next_key >> 32;
+ if (cpu >= MAX_CPU)
+ goto next;
+
+ k = next_key & 0xffffffff;
+ if (k > MAX_ENTRIES)
+ k = MAX_ENTRIES - 1;
+
+ cpu_hist[cpu].data[k] = value;
+ if (value > cpu_hist[cpu].max)
+ cpu_hist[cpu].max = value;
+
+ next:
+ key = next_key;
+ }
+}
+
+static void print_hist(void)
+{
+ char starstr[MAX_STARS];
+ struct cpu_hist *hist;
+ int i, j;
+
+ printf("\033[2J");
+
+ for (j = 0; j < MAX_CPU; j++) {
+ hist = &cpu_hist[j];
+ if (hist->max == 0)
+ continue;
+ printf("CPU %d\n", j);
+ printf(" latency : count distribution\n");
+ for (i = 1; i <= MAX_ENTRIES; i++) {
+ stars(starstr, hist->data[i - 1], hist->max, MAX_STARS);
+ printf("%8ld -> %-8ld : %-8ld |%-*s|\n",
+ (1l << i) >> 1, (1l << i) - 1,
+ hist->data[i - 1], MAX_STARS, starstr);
+ }
+ }
+}
+
+int main(int argc, char **argv)
+{
+ char filename[256];
+
+ snprintf(filename, sizeof(filename), "%s_kern.o", argv[0]);
+
+ if (load_bpf_file(filename)) {
+ printf("%s", bpf_log_buf);
+ return 1;
+ }
+
+ while (1) {
+ get_data(map_fd[1]);
+ print_hist();
+ sleep(5);
+ }
+
+ return 0;
+}
--
2.1.0

--
To unsubscribe from this list: send the line "unsubscribe linux-kernel" in
the body of a message to majordomo@xxxxxxxxxxxxxxx
More majordomo info at http://vger.kernel.org/majordomo-info.html
Please read the FAQ at http://www.tux.org/lkml/