Re: [RFC PATCH v2 4/5] bpf: Add a OOM policy test

From: Chuyi Zhou
Date: Wed Aug 16 2023 - 08:32:14 EST


Hello,

On 2023/8/16 19:53, Alan Maguire wrote:
On 10/08/2023 09:13, Chuyi Zhou wrote:
This patch adds a test which implements a priority-based policy through
bpf_oom_evaluate_task.

The BPF program, oom_policy.c, compares the cgroup priorities of two
tasks and selects the one with the lower priority. The userspace program
test_oom_policy.c maintains a priority map, using cgroup ids as keys and
priorities as values. We can protect certain cgroups from the OOM killer
by giving them a higher priority.

Signed-off-by: Chuyi Zhou <zhouchuyi@xxxxxxxxxxxxx>
---
.../bpf/prog_tests/test_oom_policy.c | 140 ++++++++++++++++++
.../testing/selftests/bpf/progs/oom_policy.c | 104 +++++++++++++
2 files changed, 244 insertions(+)
create mode 100644 tools/testing/selftests/bpf/prog_tests/test_oom_policy.c
create mode 100644 tools/testing/selftests/bpf/progs/oom_policy.c

diff --git a/tools/testing/selftests/bpf/prog_tests/test_oom_policy.c b/tools/testing/selftests/bpf/prog_tests/test_oom_policy.c
new file mode 100644
index 000000000000..bea61ff22603
--- /dev/null
+++ b/tools/testing/selftests/bpf/prog_tests/test_oom_policy.c
@@ -0,0 +1,140 @@
+// SPDX-License-Identifier: GPL-2.0-only
+#define _GNU_SOURCE
+
+#include <stdio.h>
+#include <fcntl.h>
+#include <unistd.h>
+#include <stdlib.h>
+#include <signal.h>
+#include <sys/stat.h>
+#include <test_progs.h>
+#include <bpf/btf.h>
+#include <bpf/bpf.h>
+
+#include "cgroup_helpers.h"
+#include "oom_policy.skel.h"
+
+static int map_fd;
+static int cg_nr;
+struct {
+ const char *path;
+ int fd;
+ unsigned long long id;
+} cgs[] = {
+ { "/cg1" },
+ { "/cg2" },
+};
+
+
+static struct oom_policy *open_load_oom_policy_skel(void)
+{
+ struct oom_policy *skel;
+ int err;
+
+ skel = oom_policy__open();
+ if (!ASSERT_OK_PTR(skel, "skel_open"))
+ return NULL;
+
+ err = oom_policy__load(skel);
+ if (!ASSERT_OK(err, "skel_load"))
+ goto cleanup;
+
+ return skel;
+
+cleanup:
+ oom_policy__destroy(skel);
+ return NULL;
+}
+
+static void run_memory_consume(unsigned long long consume_size, int idx)
+{
+ char *buf;
+
+ join_parent_cgroup(cgs[idx].path);
+ buf = malloc(consume_size);
+ memset(buf, 0, consume_size);
+ sleep(2);
+ exit(0);
+}
+
+static int set_cgroup_prio(unsigned long long cg_id, int prio)
+{
+ int err;
+
+ err = bpf_map_update_elem(map_fd, &cg_id, &prio, BPF_ANY);
+ ASSERT_EQ(err, 0, "update_map");
+ return err;
+}
+
+static int prepare_cgroup_environment(void)
+{
+ int err;
+
+ err = setup_cgroup_environment();
+ if (err)
+ goto clean_cg_env;
+ for (int i = 0; i < cg_nr; i++) {
+ err = cgs[i].fd = create_and_get_cgroup(cgs[i].path);
+ if (!ASSERT_GE(cgs[i].fd, 0, "cg_create"))
+ goto clean_cg_env;
+ cgs[i].id = get_cgroup_id(cgs[i].path);
+ }
+ return 0;
+clean_cg_env:
+ cleanup_cgroup_environment();
+ return err;
+}
+
+void test_oom_policy(void)
+{
+ struct oom_policy *skel;
+ struct bpf_link *link;
+ int err;
+ int victim_pid;
+ unsigned long long victim_cg_id;
+
+ link = NULL;
+ cg_nr = ARRAY_SIZE(cgs);
+
+ skel = open_load_oom_policy_skel();
+ err = oom_policy__attach(skel);
+ if (!ASSERT_OK(err, "oom_policy__attach"))
+ goto cleanup;
+
+ map_fd = bpf_object__find_map_fd_by_name(skel->obj, "cg_map");
+ if (!ASSERT_GE(map_fd, 0, "find map"))
+ goto cleanup;
+
+ err = prepare_cgroup_environment();
+ if (!ASSERT_EQ(err, 0, "prepare cgroup env"))
+ goto cleanup;
+
+ write_cgroup_file("/", "memory.max", "10M");
+
+ /*
+ * Set a higher priority for cg2 and a lower one for cg1, so the
+ * task under cg1 is selected as the victim (see oom_policy.c).
+ */
+ set_cgroup_prio(cgs[0].id, 10);
+ set_cgroup_prio(cgs[1].id, 50);
+
+ victim_cg_id = cgs[0].id;
+ victim_pid = fork();
+
+ if (victim_pid == 0)
+ run_memory_consume(1024 * 1024 * 4, 0);
+
+ if (fork() == 0)
+ run_memory_consume(1024 * 1024 * 8, 1);
+
+ while (wait(NULL) > 0)
+ ;
+
+ ASSERT_EQ(skel->bss->victim_pid, victim_pid, "victim_pid");
+ ASSERT_EQ(skel->bss->victim_cg_id, victim_cg_id, "victim_cgid");
+ ASSERT_EQ(skel->bss->failed_cnt, 1, "failed_cnt");
+cleanup:
+ bpf_link__destroy(link);
+ oom_policy__destroy(skel);
+ cleanup_cgroup_environment();
+}
diff --git a/tools/testing/selftests/bpf/progs/oom_policy.c b/tools/testing/selftests/bpf/progs/oom_policy.c
new file mode 100644
index 000000000000..fc9efc93914e
--- /dev/null
+++ b/tools/testing/selftests/bpf/progs/oom_policy.c
@@ -0,0 +1,104 @@
+// SPDX-License-Identifier: GPL-2.0-only
+#include <vmlinux.h>
+#include <bpf/bpf_tracing.h>
+#include <bpf/bpf_helpers.h>
+
+char _license[] SEC("license") = "GPL";
+
+struct {
+ __uint(type, BPF_MAP_TYPE_HASH);
+ __type(key, int);
+ __type(value, int);
+ __uint(max_entries, 24);
+} cg_map SEC(".maps");
+
+unsigned int victim_pid;
+u64 victim_cg_id;
+int failed_cnt;
+
+#define EOPNOTSUPP 95
+
+enum {
+ NO_BPF_POLICY,
+ BPF_EVAL_ABORT,
+ BPF_EVAL_NEXT,
+ BPF_EVAL_SELECT,
+};

When I built a kernel using this series and tried building the
associated test for that kernel I saw:

progs/oom_policy.c:22:2: error: redefinition of enumerator 'NO_BPF_POLICY'
NO_BPF_POLICY,
^
/home/opc/src/bpf-next/tools/testing/selftests/bpf/tools/include/vmlinux.h:75894:2:
note: previous definition is here
NO_BPF_POLICY = 0,
^
progs/oom_policy.c:23:2: error: redefinition of enumerator 'BPF_EVAL_ABORT'
BPF_EVAL_ABORT,
^
/home/opc/src/bpf-next/tools/testing/selftests/bpf/tools/include/vmlinux.h:75895:2:
note: previous definition is here
BPF_EVAL_ABORT = 1,
^
progs/oom_policy.c:24:2: error: redefinition of enumerator 'BPF_EVAL_NEXT'
BPF_EVAL_NEXT,
^
/home/opc/src/bpf-next/tools/testing/selftests/bpf/tools/include/vmlinux.h:75896:2:
note: previous definition is here
BPF_EVAL_NEXT = 2,
^
  CLNG-BPF [test_maps] tailcall_bpf2bpf4.bpf.o
progs/oom_policy.c:25:2: error: redefinition of enumerator 'BPF_EVAL_SELECT'
BPF_EVAL_SELECT,
^
/home/opc/src/bpf-next/tools/testing/selftests/bpf/tools/include/vmlinux.h:75897:2:
note: previous definition is here
BPF_EVAL_SELECT = 3,
^
4 errors generated.


So you shouldn't need the enum definition since it already makes it into
vmlinux.h.
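i.e. just dropping the duplicated definition from progs/oom_policy.c
should be enough, something like (untested):

-enum {
-	NO_BPF_POLICY,
-	BPF_EVAL_ABORT,
-	BPF_EVAL_NEXT,
-	BPF_EVAL_SELECT,
-};
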
OK. It seems my vmlinux.h doesn't contain these enums...
I also ran into test failures when I removed the above (and compilation
succeeded):


test_oom_policy:PASS:prepare cgroup env 0 nsec
(cgroup_helpers.c:130: errno: No such file or directory) Opening
/mnt/cgroup-test-work-dir23054//memory.max
set_cgroup_prio:PASS:update_map 0 nsec
set_cgroup_prio:PASS:update_map 0 nsec
test_oom_policy:FAIL:victim_pid unexpected victim_pid: actual 0 !=
expected 23058
test_oom_policy:FAIL:victim_cgid unexpected victim_cgid: actual 0 !=
expected 68
test_oom_policy:FAIL:failed_cnt unexpected failed_cnt: actual 0 !=
expected 1
#154 oom_policy:FAIL
Summary: 1/0 PASSED, 0 SKIPPED, 1 FAILED

So it seems that because my system was using the cgroup v1 memory
controller, the controller could not be used on the v2 hierarchy unless
I rebooted with

systemd.unified_cgroup_hierarchy=1

...on the boot commandline. It would be good to note any such
requirements for this test in the selftests/bpf/README.rst.
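e.g. something along these lines (exact wording is of course up to you):

  The oom_policy test requires the cgroup v2 memory controller. On
  systems where the memory controller is still attached to the cgroup v1
  hierarchy, boot with systemd.unified_cgroup_hierarchy=1 so that it is
  available on the unified (v2) hierarchy.
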
Might also be worth adding

write_cgroup_file("", "cgroup.subtree_control", "+memory");

...to ensure the memory controller is enabled for the root cgroup.
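
For example (untested; write_cgroup_file() and the error handling style
are just what the test already uses), it could go in
prepare_cgroup_environment() right after setup_cgroup_environment():

	err = setup_cgroup_environment();
	if (err)
		goto clean_cg_env;
	/* make sure the memory controller is enabled for the
	 * cgroups created below, otherwise the memcg OOM killer
	 * is never invoked
	 */
	err = write_cgroup_file("", "cgroup.subtree_control", "+memory");
	if (!ASSERT_OK(err, "enable memory controller"))
		goto clean_cg_env;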

At that point the test still failed:

set_cgroup_prio:PASS:update_map 0 nsec
test_oom_policy:FAIL:victim_pid unexpected victim_pid: actual 0 !=
expected 12649
test_oom_policy:FAIL:victim_cgid unexpected victim_cgid: actual 0 !=
expected 9583
test_oom_policy:FAIL:failed_cnt unexpected failed_cnt: actual 0 !=
expected 1
#154 oom_policy:FAIL
Summary: 0/0 PASSED, 0 SKIPPED, 1 FAILED
Successfully unloaded bpf_testmod.ko.


It seems that the OOM killer is not invoked in your environment (you can
check this in dmesg). If the memcg OOM killer is invoked by the test, we
record *victim_pid* and *victim_cgid*, so they would not be zero. I guess
the reason is that the memory controller is not enabled in the cgroup
"/mnt/cgroup-test-work-dir23054/", because I see the error message:

(cgroup_helpers.c:130: errno: No such file or directory) Opening
/mnt/cgroup-test-work-dir23054//memory.max
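
Maybe the test should also detect this situation and skip instead of
failing. A rough sketch (assuming write_cgroup_file() returns non-zero
when the file cannot be opened; test__skip() comes from test_progs):

	/* memory.max only exists here when the v2 memory controller is
	 * available; skip instead of failing when it is not, e.g. when
	 * the controller is still claimed by cgroup v1
	 */
	err = write_cgroup_file("/", "memory.max", "10M");
	if (err) {
		test__skip();
		goto cleanup;
	}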

Thanks for your review and test!

Are there perhaps other implicit configuration assumptions that could
cause this test to fail?

Alan