Re: [PATCH RFC/RFT] perf bpf skels: Stop using vmlinux.h generated from BTF, use subset of used structs + CO-RE. was Re: BPF skels in perf .Re: [GIT PULL] perf tools changes for v6.4

From: Ian Rogers
Date: Mon May 08 2023 - 17:53:46 EST


On Fri, May 5, 2023 at 9:56 AM Arnaldo Carvalho de Melo <acme@xxxxxxxxxx> wrote:
>
> Em Fri, May 05, 2023 at 10:33:15AM -0300, Arnaldo Carvalho de Melo escreveu:
> > Em Fri, May 05, 2023 at 01:03:14AM +0200, Jiri Olsa escreveu:
> > That with the preserve_access_index isn't needed, we need just the
> > fields that we access in the tools, right?
>
> I'm now doing build test this in many distro containers, without the two
> reverts, i.e. BPF skels continue as opt-out as in my pull request, to
> test build and also for the functionality tests on the tools using such
> bpf skels, see below, no touching of vmlinux nor BTF data during the
> build.
>
> - Arnaldo
>
> From 882adaee50bc27f85374aeb2fbaa5b76bef60d05 Mon Sep 17 00:00:00 2001
> From: Arnaldo Carvalho de Melo <acme@xxxxxxxxxx>
> Date: Thu, 4 May 2023 19:03:51 -0300
> Subject: [PATCH 1/1] perf bpf skels: Stop using vmlinux.h generated from BTF,
> use subset of used structs + CO-RE
>
> Linus reported a build break due to using a vmlinux without a BTF elf
> section to generate the vmlinux.h header with bpftool for use in the BPF
> tools in tools/perf/util/bpf_skel/*.bpf.c.
>
> Instead add a vmlinux.h file with the structs needed with the fields the
> tools need, marking the structs with __attribute__((preserve_access_index)),
> so that libbpf's CO-RE code can fixup the struct field offsets.
>
> In some cases the vmlinux.h file that was being generated by bpftool
> from the kernel BTF information was not needed at all, just including
> linux/bpf.h, sometimes linux/perf_event.h was enough as non-UAPI
> types were not being used.
>
> To keep te patch small, include those UAPI headers from the trimmed down
> vmlinux.h file, that then provides the tools with just the structs and
> the subset of its fields needed for them.
>
> Testing it:
>
> # perf lock contention -b find / > /dev/null
> ^C contended total wait max wait avg wait type caller
>
> 7 53.59 us 10.86 us 7.66 us rwlock:R start_this_handle+0xa0
> 2 30.35 us 21.99 us 15.17 us rwsem:R iterate_dir+0x52
> 1 9.04 us 9.04 us 9.04 us rwlock:W start_this_handle+0x291
> 1 8.73 us 8.73 us 8.73 us spinlock raw_spin_rq_lock_nested+0x1e
> #
> # perf lock contention -abl find / > /dev/null
> ^C contended total wait max wait avg wait address symbol
>
> 1 262.96 ms 262.96 ms 262.96 ms ffff8e67502d0170 (mutex)
> 12 244.24 us 39.91 us 20.35 us ffff8e6af56f8070 mmap_lock (rwsem)
> 7 30.28 us 6.85 us 4.33 us ffff8e6c865f1d40 rq_lock (spinlock)
> 3 7.42 us 4.03 us 2.47 us ffff8e6c864b1d40 rq_lock (spinlock)
> 2 3.72 us 2.19 us 1.86 us ffff8e6c86571d40 rq_lock (spinlock)
> 1 2.42 us 2.42 us 2.42 us ffff8e6c86471d40 rq_lock (spinlock)
> 4 2.11 us 559 ns 527 ns ffffffff9a146c80 rcu_state (spinlock)
> 3 1.45 us 818 ns 482 ns ffff8e674ae8384c (rwlock)
> 1 870 ns 870 ns 870 ns ffff8e68456ee060 (rwlock)
> 1 663 ns 663 ns 663 ns ffff8e6c864f1d40 rq_lock (spinlock)
> 1 573 ns 573 ns 573 ns ffff8e6c86531d40 rq_lock (spinlock)
> 1 472 ns 472 ns 472 ns ffff8e6c86431740 (spinlock)
> 1 397 ns 397 ns 397 ns ffff8e67413a4f04 (spinlock)
> #
> # perf test offcpu
> 95: perf record offcpu profiling tests : Ok
> #
> # perf kwork latency --use-bpf
> Starting trace, Hit <Ctrl+C> to stop and report
> ^C
> Kwork Name | Cpu | Avg delay | Count | Max delay | Max delay start | Max delay end |
> --------------------------------------------------------------------------------------------------------------------------------
> (w)flush_memcg_stats_dwork | 0000 | 1056.212 ms | 2 | 2112.345 ms | 550113.229573 s | 550115.341919 s |
> (w)toggle_allocation_gate | 0000 | 10.144 ms | 62 | 416.389 ms | 550113.453518 s | 550113.869907 s |
> (w)0xffff8e6748e28080 | 0002 | 0.623 ms | 1 | 0.623 ms | 550110.989841 s | 550110.990464 s |
> (w)vmstat_shepherd | 0000 | 0.586 ms | 10 | 2.828 ms | 550111.971536 s | 550111.974364 s |
> (w)vmstat_update | 0007 | 0.363 ms | 5 | 1.634 ms | 550113.222520 s | 550113.224154 s |
> (w)vmstat_update | 0000 | 0.324 ms | 10 | 2.827 ms | 550111.971526 s | 550111.974354 s |
> (w)0xffff8e674c5f4a58 | 0002 | 0.102 ms | 5 | 0.134 ms | 550110.989839 s | 550110.989972 s |
> (w)psi_avgs_work | 0001 | 0.086 ms | 3 | 0.107 ms | 550114.957852 s | 550114.957959 s |
> (w)psi_avgs_work | 0000 | 0.079 ms | 5 | 0.100 ms | 550118.605668 s | 550118.605768 s |
> (w)kfree_rcu_monitor | 0006 | 0.079 ms | 1 | 0.079 ms | 550110.925821 s | 550110.925900 s |
> (w)psi_avgs_work | 0004 | 0.079 ms | 1 | 0.079 ms | 550109.581835 s | 550109.581914 s |
> (w)psi_avgs_work | 0001 | 0.078 ms | 1 | 0.078 ms | 550109.197809 s | 550109.197887 s |
> (w)psi_avgs_work | 0002 | 0.077 ms | 5 | 0.086 ms | 550110.669819 s | 550110.669905 s |
> <SNIP>
> # strace -e bpf -o perf-stat-bpf-counters.output perf stat -e cycles --bpf-counters sleep 1
>
> Performance counter stats for 'sleep 1':
>
> 6,197,983 cycles
>
> 1.003922848 seconds time elapsed
>
> 0.000000000 seconds user
> 0.002032000 seconds sys
>
> # head -7 perf-stat-bpf-counters.output
> bpf(BPF_OBJ_GET, {pathname="/sys/fs/bpf/perf_attr_map", bpf_fd=0, file_flags=0}, 16) = 3
> bpf(BPF_OBJ_GET_INFO_BY_FD, {info={bpf_fd=3, info_len=88, info=0x7ffcead64990}}, 16) = 0
> bpf(BPF_MAP_LOOKUP_ELEM, {map_fd=3, key=0x24129e0, value=0x7ffcead65a48, flags=BPF_ANY}, 32) = 0
> bpf(BPF_LINK_GET_FD_BY_ID, {link_id=1252}, 12) = -1 ENOENT (No such file or directory)
> bpf(BPF_PROG_LOAD, {prog_type=BPF_PROG_TYPE_SOCKET_FILTER, insn_cnt=2, insns=0x7ffcead65780, license="GPL", log_level=0, log_size=0, log_buf=NULL, kern_version=KERNEL_VERSION(0, 0, 0), prog_flags=0, prog_name="", prog_ifindex=0, expected_attach_type=BPF_CGROUP_INET_INGRESS, prog_btf_fd=0, func_info_rec_size=0, func_info=NULL, func_info_cnt=0, line_info_rec_size=0, line_info=NULL, line_info_cnt=0, attach_btf_id=0, attach_prog_fd=0}, 116) = 4
> bpf(BPF_PROG_LOAD, {prog_type=BPF_PROG_TYPE_SOCKET_FILTER, insn_cnt=2, insns=0x7ffcead65920, license="GPL", log_level=0, log_size=0, log_buf=NULL, kern_version=KERNEL_VERSION(0, 0, 0), prog_flags=0, prog_name="", prog_ifindex=0, expected_attach_type=BPF_CGROUP_INET_INGRESS, prog_btf_fd=0, func_info_rec_size=0, func_info=NULL, func_info_cnt=0, line_info_rec_size=0, line_info=NULL, line_info_cnt=0, attach_btf_id=0, attach_prog_fd=0, fd_array=NULL}, 128) = 4
> bpf(BPF_BTF_LOAD, {btf="\237\353\1\0\30\0\0\0\0\0\0\0\20\0\0\0\20\0\0\0\5\0\0\0\1\0\0\0\0\0\0\1"..., btf_log_buf=NULL, btf_size=45, btf_log_size=0, btf_log_level=0}, 28) = 4
> #
>
> Reported-by: Linus Torvalds <torvalds@xxxxxxxxxxxxxxxxxxxx>
> Suggested-by: Andrii Nakryiko <andrii@xxxxxxxxxx>
> Cc: Adrian Hunter <adrian.hunter@xxxxxxxxx>
> Cc: Ian Rogers <irogers@xxxxxxxxxx>
> Cc: Jiri Olsa <jolsa@xxxxxxxxxx>
> Cc: Namhyung Kim <namhyung@xxxxxxxxxx>
> Co-developed-by: Jiri Olsa <jolsa@xxxxxxxxxx>
> Signed-off-by: Arnaldo Carvalho de Melo <acme@xxxxxxxxxx>
> ---
> tools/perf/Makefile.perf | 20 +---
> tools/perf/util/bpf_skel/.gitignore | 1 -
> tools/perf/util/bpf_skel/vmlinux.h | 173 ++++++++++++++++++++++++++++
> 3 files changed, 174 insertions(+), 20 deletions(-)
> create mode 100644 tools/perf/util/bpf_skel/vmlinux.h
>
> diff --git a/tools/perf/Makefile.perf b/tools/perf/Makefile.perf
> index 48aba186ceb50792..61c33d100b2bcc90 100644
> --- a/tools/perf/Makefile.perf
> +++ b/tools/perf/Makefile.perf
> @@ -1063,25 +1063,7 @@ $(BPFTOOL): | $(SKEL_TMP_OUT)
> $(Q)CFLAGS= $(MAKE) -C ../bpf/bpftool \
> OUTPUT=$(SKEL_TMP_OUT)/ bootstrap
>
> -VMLINUX_BTF_PATHS ?= $(if $(O),$(O)/vmlinux) \
> - $(if $(KBUILD_OUTPUT),$(KBUILD_OUTPUT)/vmlinux) \
> - ../../vmlinux \
> - /sys/kernel/btf/vmlinux \
> - /boot/vmlinux-$(shell uname -r)
> -VMLINUX_BTF ?= $(abspath $(firstword $(wildcard $(VMLINUX_BTF_PATHS))))
> -
> -$(SKEL_OUT)/vmlinux.h: $(VMLINUX_BTF) $(BPFTOOL)
> -ifeq ($(VMLINUX_H),)
> - $(QUIET_GEN)$(BPFTOOL) btf dump file $< format c > $@ || \
> - (echo "Failure to generate vmlinux.h needed for the recommended BPF skeleton support." && \
> - echo "To disable this use the build option NO_BPF_SKEL=1." && \
> - echo "Alternatively point at a pre-generated vmlinux.h with VMLINUX_H=<path>." && \
> - false)
> -else
> - $(Q)cp "$(VMLINUX_H)" $@
> -endif
> -
> -$(SKEL_TMP_OUT)/%.bpf.o: util/bpf_skel/%.bpf.c $(LIBBPF) $(SKEL_OUT)/vmlinux.h | $(SKEL_TMP_OUT)
> +$(SKEL_TMP_OUT)/%.bpf.o: util/bpf_skel/%.bpf.c $(LIBBPF) | $(SKEL_TMP_OUT)
> $(QUIET_CLANG)$(CLANG) -g -O2 -target bpf -Wall -Werror $(BPF_INCLUDE) \
> -c $(filter util/bpf_skel/%.bpf.c,$^) -o $@ && $(LLVM_STRIP) -g $@
>
> diff --git a/tools/perf/util/bpf_skel/.gitignore b/tools/perf/util/bpf_skel/.gitignore
> index cd01455e1b53c3d9..7a1c832825de8445 100644
> --- a/tools/perf/util/bpf_skel/.gitignore
> +++ b/tools/perf/util/bpf_skel/.gitignore
> @@ -1,4 +1,3 @@
> # SPDX-License-Identifier: GPL-2.0-only
> .tmp
> *.skel.h
> -vmlinux.h
> diff --git a/tools/perf/util/bpf_skel/vmlinux.h b/tools/perf/util/bpf_skel/vmlinux.h
> new file mode 100644
> index 0000000000000000..449b1ea91fc48143
> --- /dev/null
> +++ b/tools/perf/util/bpf_skel/vmlinux.h
> @@ -0,0 +1,173 @@
> +#ifndef __VMLINUX_H
> +#define __VMLINUX_H
> +
> +#include <linux/bpf.h>
> +#include <linux/types.h>

Is this inclusion tools/include/linux/types.h or
tools/include/uapi/linux/types.h? The former is the norm in the perf
tree:
https://lore.kernel.org/linux-perf-users/CAP-5=fXKi+VAr-_n5tAaJ7Z2fvU7jc5N-CKCjkCAh_01_pzMfA@xxxxxxxxxxxxxx/
and that has the definitions:
typedef uint64_t u64;
typedef int64_t s64;

> +#include <linux/perf_event.h>
> +#include <stdbool.h>
> +
> +// non-UAPI kernel data structures, used in the .bpf.c BPF tool component.
> +
> +// Just the fields used in these tools preserving the access index so that
> +// libbpf can fixup offsets with the ones used in the kernel when loading the
> +// BPF bytecode, if they differ from what is used here.
> +
> +typedef __u8 u8;
> +typedef __u32 u32;
> +typedef __u64 u64;
> +typedef __s64 s64;

which then collide with these two definitions. On my builds this triggers:
error: typedef redefinition with different types ('__u64' (aka
'unsigned long long') vs 'uint64_t' (aka 'unsigned long'))
I'm working around the issue by going back to using a generated vmlinux.h.

Thanks,
Ian

> +
> +typedef int pid_t;
> +
> +enum cgroup_subsys_id {
> + perf_event_cgrp_id = 8,
> +};
> +
> +enum {
> + HI_SOFTIRQ = 0,
> + TIMER_SOFTIRQ,
> + NET_TX_SOFTIRQ,
> + NET_RX_SOFTIRQ,
> + BLOCK_SOFTIRQ,
> + IRQ_POLL_SOFTIRQ,
> + TASKLET_SOFTIRQ,
> + SCHED_SOFTIRQ,
> + HRTIMER_SOFTIRQ,
> + RCU_SOFTIRQ, /* Preferable RCU should always be the last softirq */
> +
> + NR_SOFTIRQS
> +};
> +
> +typedef struct {
> + s64 counter;
> +} __attribute__((preserve_access_index)) atomic64_t;
> +
> +typedef atomic64_t atomic_long_t;
> +
> +struct raw_spinlock {
> + int rawlock;
> +} __attribute__((preserve_access_index));
> +
> +typedef struct raw_spinlock raw_spinlock_t;
> +
> +typedef struct {
> + struct raw_spinlock rlock;
> +} __attribute__((preserve_access_index)) spinlock_t;
> +
> +struct sighand_struct {
> + spinlock_t siglock;
> +} __attribute__((preserve_access_index));
> +
> +struct rw_semaphore {
> + atomic_long_t owner;
> +} __attribute__((preserve_access_index));
> +
> +struct mutex {
> + atomic_long_t owner;
> +} __attribute__((preserve_access_index));
> +
> +struct kernfs_node {
> + u64 id;
> +} __attribute__((preserve_access_index));
> +
> +struct cgroup {
> + struct kernfs_node *kn;
> + int level;
> +} __attribute__((preserve_access_index));
> +
> +struct cgroup_subsys_state {
> + struct cgroup *cgroup;
> +} __attribute__((preserve_access_index));
> +
> +struct css_set {
> + struct cgroup_subsys_state *subsys[13];
> + struct cgroup *dfl_cgrp;
> +} __attribute__((preserve_access_index));
> +
> +struct mm_struct {
> + struct rw_semaphore mmap_lock;
> +} __attribute__((preserve_access_index));
> +
> +struct task_struct {
> + unsigned int flags;
> + struct mm_struct *mm;
> + pid_t pid;
> + pid_t tgid;
> + char comm[16];
> + struct sighand_struct *sighand;
> + struct css_set *cgroups;
> +} __attribute__((preserve_access_index));
> +
> +struct trace_entry {
> + short unsigned int type;
> + unsigned char flags;
> + unsigned char preempt_count;
> + int pid;
> +} __attribute__((preserve_access_index));
> +
> +struct trace_event_raw_irq_handler_entry {
> + struct trace_entry ent;
> + int irq;
> + u32 __data_loc_name;
> + char __data[];
> +} __attribute__((preserve_access_index));
> +
> +struct trace_event_raw_irq_handler_exit {
> + struct trace_entry ent;
> + int irq;
> + int ret;
> + char __data[];
> +} __attribute__((preserve_access_index));
> +
> +struct trace_event_raw_softirq {
> + struct trace_entry ent;
> + unsigned int vec;
> + char __data[];
> +} __attribute__((preserve_access_index));
> +
> +struct trace_event_raw_workqueue_execute_start {
> + struct trace_entry ent;
> + void *work;
> + void *function;
> + char __data[];
> +} __attribute__((preserve_access_index));
> +
> +struct trace_event_raw_workqueue_execute_end {
> + struct trace_entry ent;
> + void *work;
> + void *function;
> + char __data[];
> +} __attribute__((preserve_access_index));
> +
> +struct trace_event_raw_workqueue_activate_work {
> + struct trace_entry ent;
> + void *work;
> + char __data[];
> +} __attribute__((preserve_access_index));
> +
> +struct perf_sample_data {
> + u64 addr;
> + u64 period;
> + union perf_sample_weight weight;
> + u64 txn;
> + union perf_mem_data_src data_src;
> + u64 ip;
> + struct {
> + u32 pid;
> + u32 tid;
> + } tid_entry;
> + u64 time;
> + u64 id;
> + struct {
> + u32 cpu;
> + } cpu_entry;
> + u64 phys_addr;
> + u64 data_page_size;
> + u64 code_page_size;
> +} __attribute__((__aligned__(64))) __attribute__((preserve_access_index));
> +
> +struct bpf_perf_event_data_kern {
> + struct perf_sample_data *data;
> + struct perf_event *event;
> +} __attribute__((preserve_access_index));
> +#endif // __VMLINUX_H
> --
> 2.39.2
>