Re: [PATCH 03/44] perf augmented_raw_syscalls: Use a PERCPU_ARRAY map to copy more string bytes

From: Andrii Nakryiko
Date: Tue Apr 02 2019 - 12:45:06 EST


On Tue, Apr 2, 2019 at 9:12 AM Arnaldo Carvalho de Melo <acme@xxxxxxxxxx> wrote:
>
> From: Arnaldo Carvalho de Melo <acme@xxxxxxxxxx>
>
> The previous method, copying to the BPF stack limited us in how many
> bytes we could copy from strings, use a PERCPU_ARRAY map like devised by
> the sysdig guys[1] to copy more bytes:
>
> Before:
>
> # trace --no-inherit -e openat touch `python -c "print "$s" 'a' * 2000"`
> touch: cannot touch 'aaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaa': File name too long
> openat(AT_FDCWD, "/etc/ld.so.cache", O_RDONLY|O_CLOEXEC) = 3
> openat(AT_FDCWD, "/lib64/libc.so.6", O_RDONLY|O_CLOEXEC) = 3
> openat(AT_FDCWD, "/usr/lib/locale/locale-archive", O_RDONLY|O_CLOEXEC) = 3
> openat(AT_FDCWD, "aaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaa", O_CREAT|O_NOCTTY|O_NONBLOCK|O_WRONLY, S_IRUGO|S_IWUGO) = -1 ENAMETOOLONG (File name too long)
> openat(AT_FDCWD, "/usr/share/locale/locale.alias", O_RDONLY|O_CLOEXEC) = 3
> openat(AT_FDCWD, "/usr/share/locale/en_US.UTF-8/LC_MESSAGES/coreutils.mo", O_RDONLY) = -1 ENOENT (No such file or directory)
> openat(AT_FDCWD, "/usr/share/locale/en_US.utf8/LC_MESSAGES/coreutils.mo", O_RDONLY) = -1 ENOENT (No such file or directory)
> <SNIP some openat calls>
> #
>
> After:
>
> [root@quaco acme]# trace --no-inherit -e openat touch `python -c "print "$s" 'a' * 2000"`
> <STRIP what is the same as in the 'before' part>
> openat(AT_FDCWD, "aaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaa", O_CREAT|O_NOCTTY|O_NONBLOC) = -1 ENAMETOOLONG (File name too long)
> <STRIP what is the same as in the 'before' part>
>
> If we leave something like 'perf trace -e string' to trace all syscalls
> with a string, and then do some 'perf top', to get some annotation for
> the augmented_raw_syscalls.o BPF program we get:
>
> â â callq *ffffffffc45576d1 â
> â augmented_args->filename.size = probe_read_str(&augmented_args->filename.value, â
> 0.05 â mov %eax,0x40(%r13)
>
> Looking with pahole, expanding types, asking for hex offsets and sizes,
> and use of BTF type information to see what is at that 0x40 offset from
> %r13:
>
> # pahole -F btf -C augmented_args_filename --expand_types --hex /home/acme/git/perf/tools/perf/examples/bpf/augmented_raw_syscalls.o
> struct augmented_args_filename {
> struct syscall_enter_args {
> long long unsigned int common_tp_fields; /* 0 0x8 */
> long int syscall_nr; /* 0x8 0x8 */
> long unsigned int args[6]; /* 0x10 0x30 */
> } args; /* 0 0x40 */
> /* --- cacheline 1 boundary (64 bytes) --- */
> struct augmented_filename {
> unsigned int size; /* 0x40 0x4 */
> int reserved; /* 0x44 0x4 */
> char value[4096]; /* 0x48 0x1000 */
> } filename; /* 0x40 0x1008 */
>
> /* size: 4168, cachelines: 66, members: 2 */
> /* last cacheline: 8 bytes */
> };
> #
>
> Then looking if PATH_MAX leaves some signature in the tests:
>
> â if (augmented_args->filename.size < sizeof(augmented_args->filename.value)) { â
> â cmp $0xfff,%rdi
>
> 0xfff == 4095
> sizeof(augmented_args->filename.value) == PATH_MAX == 4096
>
> [1] https://sysdig.com/blog/the-art-of-writing-ebpf-programs-a-primer/
>
> Cc: Adrian Hunter <adrian.hunter@xxxxxxxxx>
> Cc: Andrii Nakryiko <andriin@xxxxxx>
> Cc: Daniel Borkmann <borkmann@xxxxxxxxxxxxx>
> Cc: Gianluca Borello <g.borello@xxxxxxxxx>
> Cc: Jesper Dangaard Brouer <brouer@xxxxxxxxxx>
> Cc: Jiri Olsa <jolsa@xxxxxxxxxx>
> Cc: Luis ClÃudio GonÃalves <lclaudio@xxxxxxxxxx>
> cc: Martin Lau <kafai@xxxxxx>
> Cc: Namhyung Kim <namhyung@xxxxxxxxxx>
> Cc: Wang Nan <wangnan0@xxxxxxxxxx>
> Cc: Yonghong Song <yhs@xxxxxx>
> Link: https://lkml.kernel.org/n/tip-76gce2d2ghzq537ubwhjkone@xxxxxxxxxxxxxx
> Signed-off-by: Arnaldo Carvalho de Melo <acme@xxxxxxxxxx>
> ---

Looks good to me. I've been using PERCPU_ARRAY as a sort of heap for
many things that don't fit on BPF stack.

Acked-by: Andrii Nakryiko <andriin@xxxxxx>

> .../examples/bpf/augmented_raw_syscalls.c | 46 +++++++++++--------
> 1 file changed, 28 insertions(+), 18 deletions(-)
>
> diff --git a/tools/perf/examples/bpf/augmented_raw_syscalls.c b/tools/perf/examples/bpf/augmented_raw_syscalls.c
> index 9f8b31ad7a49..2422894a8194 100644
> --- a/tools/perf/examples/bpf/augmented_raw_syscalls.c
> +++ b/tools/perf/examples/bpf/augmented_raw_syscalls.c
> @@ -15,6 +15,7 @@
> */
>
> #include <unistd.h>
> +#include <linux/limits.h>
> #include <pid_filter.h>
>
> /* bpf-output associated map */
> @@ -41,7 +42,7 @@ struct syscall_exit_args {
> struct augmented_filename {
> unsigned int size;
> int reserved;
> - char value[256];
> + char value[PATH_MAX];
> };
>
> /* syscalls where the first arg is a string */
> @@ -119,23 +120,32 @@ struct augmented_filename {
>
> pid_filter(pids_filtered);
>
> +struct augmented_args_filename {
> + struct syscall_enter_args args;
> + struct augmented_filename filename;
> +};
> +
> +bpf_map(augmented_filename_map, PERCPU_ARRAY, int, struct augmented_args_filename, 1);
> +
> SEC("raw_syscalls:sys_enter")
> int sys_enter(struct syscall_enter_args *args)
> {
> - struct {
> - struct syscall_enter_args args;
> - struct augmented_filename filename;
> - } augmented_args;
> - struct syscall *syscall;
> - unsigned int len = sizeof(augmented_args);
> + struct augmented_args_filename *augmented_args;
> + unsigned int len = sizeof(*augmented_args);
> const void *filename_arg = NULL;
> + struct syscall *syscall;
> + int key = 0;
> +
> + augmented_args = bpf_map_lookup_elem(&augmented_filename_map, &key);
> + if (augmented_args == NULL)
> + return 1;
>
> if (pid_filter__has(&pids_filtered, getpid()))
> return 0;
>
> - probe_read(&augmented_args.args, sizeof(augmented_args.args), args);
> + probe_read(&augmented_args->args, sizeof(augmented_args->args), args);
>
> - syscall = bpf_map_lookup_elem(&syscalls, &augmented_args.args.syscall_nr);
> + syscall = bpf_map_lookup_elem(&syscalls, &augmented_args->args.syscall_nr);
> if (syscall == NULL || !syscall->enabled)
> return 0;
> /*
> @@ -191,7 +201,7 @@ int sys_enter(struct syscall_enter_args *args)
> * processor architecture, making the kernel part the same no matter what
> * kernel version or processor architecture it runs on.
> */
> - switch (augmented_args.args.syscall_nr) {
> + switch (augmented_args->args.syscall_nr) {
> case SYS_ACCT:
> case SYS_ADD_KEY:
> case SYS_CHDIR:
> @@ -263,20 +273,20 @@ int sys_enter(struct syscall_enter_args *args)
> }
>
> if (filename_arg != NULL) {
> - augmented_args.filename.reserved = 0;
> - augmented_args.filename.size = probe_read_str(&augmented_args.filename.value,
> - sizeof(augmented_args.filename.value),
> + augmented_args->filename.reserved = 0;
> + augmented_args->filename.size = probe_read_str(&augmented_args->filename.value,
> + sizeof(augmented_args->filename.value),
> filename_arg);
> - if (augmented_args.filename.size < sizeof(augmented_args.filename.value)) {
> - len -= sizeof(augmented_args.filename.value) - augmented_args.filename.size;
> - len &= sizeof(augmented_args.filename.value) - 1;
> + if (augmented_args->filename.size < sizeof(augmented_args->filename.value)) {
> + len -= sizeof(augmented_args->filename.value) - augmented_args->filename.size;
> + len &= sizeof(augmented_args->filename.value) - 1;
> }
> } else {
> - len = sizeof(augmented_args.args);
> + len = sizeof(augmented_args->args);
> }
>
> /* If perf_event_output fails, return non-zero so that it gets recorded unaugmented */
> - return perf_event_output(args, &__augmented_syscalls__, BPF_F_CURRENT_CPU, &augmented_args, len);
> + return perf_event_output(args, &__augmented_syscalls__, BPF_F_CURRENT_CPU, augmented_args, len);
> }
>
> SEC("raw_syscalls:sys_exit")
> --
> 2.20.1
>