Re: [RFC PATCH v2 4/5] bpf: Add a OOM policy test

From: Alan Maguire
Date: Wed Aug 16 2023 - 09:50:45 EST


On 16/08/2023 13:31, Chuyi Zhou wrote:
> Hello,
>
> 在 2023/8/16 19:53, Alan Maguire 写道:
>> On 10/08/2023 09:13, Chuyi Zhou wrote:
>>> This patch adds a test which implements a priority-based policy through
>>> bpf_oom_evaluate_task.
>>>
>>> The BPF program, oom_policy.c, compares the cgroup priority of two tasks
>>> and select the lower one. The userspace program test_oom_policy.c
>>> maintains a priority map by using cgroup id as the keys and priority as
>>> the values. We could protect certain cgroups from oom-killer by setting
>>> higher priority.
>>>
>>> Signed-off-by: Chuyi Zhou <zhouchuyi@xxxxxxxxxxxxx>
>>> ---
>>>   .../bpf/prog_tests/test_oom_policy.c          | 140 ++++++++++++++++++
>>>   .../testing/selftests/bpf/progs/oom_policy.c  | 104 +++++++++++++
>>>   2 files changed, 244 insertions(+)
>>>   create mode 100644
>>> tools/testing/selftests/bpf/prog_tests/test_oom_policy.c
>>>   create mode 100644 tools/testing/selftests/bpf/progs/oom_policy.c
>>>
>>> diff --git a/tools/testing/selftests/bpf/prog_tests/test_oom_policy.c
>>> b/tools/testing/selftests/bpf/prog_tests/test_oom_policy.c
>>> new file mode 100644
>>> index 000000000000..bea61ff22603
>>> --- /dev/null
>>> +++ b/tools/testing/selftests/bpf/prog_tests/test_oom_policy.c
>>> @@ -0,0 +1,140 @@
>>> +// SPDX-License-Identifier: GPL-2.0-only
>>> +#define _GNU_SOURCE
>>> +
>>> +#include <stdio.h>
>>> +#include <fcntl.h>
>>> +#include <unistd.h>
>>> +#include <stdlib.h>
>>> +#include <signal.h>
>>> +#include <sys/stat.h>
>>> +#include <test_progs.h>
>>> +#include <bpf/btf.h>
>>> +#include <bpf/bpf.h>
>>> +
>>> +#include "cgroup_helpers.h"
>>> +#include "oom_policy.skel.h"
>>> +
>>> +static int map_fd;
>>> +static int cg_nr;
>>> +struct {
>>> +    const char *path;
>>> +    int fd;
>>> +    unsigned long long id;
>>> +} cgs[] = {
>>> +    { "/cg1" },
>>> +    { "/cg2" },
>>> +};
>>> +
>>> +
>>> +static struct oom_policy *open_load_oom_policy_skel(void)
>>> +{
>>> +    struct oom_policy *skel;
>>> +    int err;
>>> +
>>> +    skel = oom_policy__open();
>>> +    if (!ASSERT_OK_PTR(skel, "skel_open"))
>>> +        return NULL;
>>> +
>>> +    err = oom_policy__load(skel);
>>> +    if (!ASSERT_OK(err, "skel_load"))
>>> +        goto cleanup;
>>> +
>>> +    return skel;
>>> +
>>> +cleanup:
>>> +    oom_policy__destroy(skel);
>>> +    return NULL;
>>> +}
>>> +
>>> +static void run_memory_consume(unsigned long long consume_size, int
>>> idx)
>>> +{
>>> +    char *buf;
>>> +
>>> +    join_parent_cgroup(cgs[idx].path);
>>> +    buf = malloc(consume_size);
>>> +    memset(buf, 0, consume_size);
>>> +    sleep(2);
>>> +    exit(0);
>>> +}
>>> +
>>> +static int set_cgroup_prio(unsigned long long cg_id, int prio)
>>> +{
>>> +    int err;
>>> +
>>> +    err = bpf_map_update_elem(map_fd, &cg_id, &prio, BPF_ANY);
>>> +    ASSERT_EQ(err, 0, "update_map");
>>> +    return err;
>>> +}
>>> +
>>> +static int prepare_cgroup_environment(void)
>>> +{
>>> +    int err;
>>> +
>>> +    err = setup_cgroup_environment();
>>> +    if (err)
>>> +        goto clean_cg_env;
>>> +    for (int i = 0; i < cg_nr; i++) {
>>> +        err = cgs[i].fd = create_and_get_cgroup(cgs[i].path);
>>> +        if (!ASSERT_GE(cgs[i].fd, 0, "cg_create"))
>>> +            goto clean_cg_env;
>>> +        cgs[i].id = get_cgroup_id(cgs[i].path);
>>> +    }
>>> +    return 0;
>>> +clean_cg_env:
>>> +    cleanup_cgroup_environment();
>>> +    return err;
>>> +}
>>> +
>>> +void test_oom_policy(void)
>>> +{
>>> +    struct oom_policy *skel;
>>> +    struct bpf_link *link;
>>> +    int err;
>>> +    int victim_pid;
>>> +    unsigned long long victim_cg_id;
>>> +
>>> +    link = NULL;
>>> +    cg_nr = ARRAY_SIZE(cgs);
>>> +
>>> +    skel = open_load_oom_policy_skel();
>>> +    err = oom_policy__attach(skel);
>>> +    if (!ASSERT_OK(err, "oom_policy__attach"))
>>> +        goto cleanup;
>>> +
>>> +    map_fd = bpf_object__find_map_fd_by_name(skel->obj, "cg_map");
>>> +    if (!ASSERT_GE(map_fd, 0, "find map"))
>>> +        goto cleanup;
>>> +
>>> +    err = prepare_cgroup_environment();
>>> +    if (!ASSERT_EQ(err, 0, "prepare cgroup env"))
>>> +        goto cleanup;
>>> +
>>> +    write_cgroup_file("/", "memory.max", "10M");
>>> +
>>> +    /*
>>> +     * Set higher priority to cg2 and lower to cg1, so we would select
>>> +     * task under cg1 as victim.(see oom_policy.c)
>>> +     */
>>> +    set_cgroup_prio(cgs[0].id, 10);
>>> +    set_cgroup_prio(cgs[1].id, 50);
>>> +
>>> +    victim_cg_id = cgs[0].id;
>>> +    victim_pid = fork();
>>> +
>>> +    if (victim_pid == 0)
>>> +        run_memory_consume(1024 * 1024 * 4, 0);
>>> +
>>> +    if (fork() == 0)
>>> +        run_memory_consume(1024 * 1024 * 8, 1);
>>> +
>>> +    while (wait(NULL) > 0)
>>> +        ;
>>> +
>>> +    ASSERT_EQ(skel->bss->victim_pid, victim_pid, "victim_pid");
>>> +    ASSERT_EQ(skel->bss->victim_cg_id, victim_cg_id, "victim_cgid");
>>> +    ASSERT_EQ(skel->bss->failed_cnt, 1, "failed_cnt");
>>> +cleanup:
>>> +    bpf_link__destroy(link);
>>> +    oom_policy__destroy(skel);
>>> +    cleanup_cgroup_environment();
>>> +}
>>> diff --git a/tools/testing/selftests/bpf/progs/oom_policy.c
>>> b/tools/testing/selftests/bpf/progs/oom_policy.c
>>> new file mode 100644
>>> index 000000000000..fc9efc93914e
>>> --- /dev/null
>>> +++ b/tools/testing/selftests/bpf/progs/oom_policy.c
>>> @@ -0,0 +1,104 @@
>>> +// SPDX-License-Identifier: GPL-2.0-only
>>> +#include <vmlinux.h>
>>> +#include <bpf/bpf_tracing.h>
>>> +#include <bpf/bpf_helpers.h>
>>> +
>>> +char _license[] SEC("license") = "GPL";
>>> +
>>> +struct {
>>> +    __uint(type, BPF_MAP_TYPE_HASH);
>>> +    __type(key, int);
>>> +    __type(value, int);
>>> +    __uint(max_entries, 24);
>>> +} cg_map SEC(".maps");
>>> +
>>> +unsigned int victim_pid;
>>> +u64 victim_cg_id;
>>> +int failed_cnt;
>>> +
>>> +#define    EOPNOTSUPP    95
>>> +
>>> +enum {
>>> +    NO_BPF_POLICY,
>>> +    BPF_EVAL_ABORT,
>>> +    BPF_EVAL_NEXT,
>>> +    BPF_EVAL_SELECT,
>>> +};
>>
>> When I built a kernel using this series and tried building the
>> associated test for that kernel I saw:
>>
>> progs/oom_policy.c:22:2: error: redefinition of enumerator
>> 'NO_BPF_POLICY'
>>          NO_BPF_POLICY,
>>          ^
>> /home/opc/src/bpf-next/tools/testing/selftests/bpf/tools/include/vmlinux.h:75894:2:
>> note: previous definition is here
>>          NO_BPF_POLICY = 0,
>>          ^
>> progs/oom_policy.c:23:2: error: redefinition of enumerator
>> 'BPF_EVAL_ABORT'
>>          BPF_EVAL_ABORT,
>>          ^
>> /home/opc/src/bpf-next/tools/testing/selftests/bpf/tools/include/vmlinux.h:75895:2:
>> note: previous definition is here
>>          BPF_EVAL_ABORT = 1,
>>          ^
>> progs/oom_policy.c:24:2: error: redefinition of enumerator
>> 'BPF_EVAL_NEXT'
>>          BPF_EVAL_NEXT,
>>          ^
>> /home/opc/src/bpf-next/tools/testing/selftests/bpf/tools/include/vmlinux.h:75896:2:
>> note: previous definition is here
>>          BPF_EVAL_NEXT = 2,
>>          ^
>> progs/oom_policy.c:  CLNG-BPF [test_maps] tailcall_bpf2bpf4.bpf.o
>> 25:2: error: redefinition of enumerator 'BPF_EVAL_SELECT'
>>          BPF_EVAL_SELECT,
>>          ^
>> /home/opc/src/bpf-next/tools/testing/selftests/bpf/tools/include/vmlinux.h:75897:2:
>> note: previous definition is here
>>          BPF_EVAL_SELECT = 3,
>>          ^
>> 4 errors generated.
>>
>>
>> So you shouldn't need the enum definition since it already makes it into
>> vmlinux.h.
>> OK. It seems my vmlinux.h doesn't contain these enum...
>> I also ran into test failures when I removed the above (and compilation
>> succeeded):
>>
>>
>> test_oom_policy:PASS:prepare cgroup env 0 nsec
>> (cgroup_helpers.c:130: errno: No such file or directory) Opening
>> /mnt/cgroup-test-work-dir23054//memory.max
>> set_cgroup_prio:PASS:update_map 0 nsec
>> set_cgroup_prio:PASS:update_map 0 nsec
>> test_oom_policy:FAIL:victim_pid unexpected victim_pid: actual 0 !=
>> expected 23058
>> test_oom_policy:FAIL:victim_cgid unexpected victim_cgid: actual 0 !=
>> expected 68
>> test_oom_policy:FAIL:failed_cnt unexpected failed_cnt: actual 0 !=
>> expected 1
>> #154     oom_policy:FAIL
>> Summary: 1/0 PASSED, 0 SKIPPED, 1 FAILED
>>
>> So it seems that because my system was using the cgroupv1 memory
>> controller, it could not be used for v2 unless I rebooted with
>>
>> systemd.unified_cgroup_hierarchy=1
>>
>> ...on the boot commandline. It would be good to note any such
>> requirements for this test in the selftests/bpf/README.rst.
>> Might also be worth adding
>>
>> write_cgroup_file("", "cgroup.subtree_control", "+memory");
>>
>> ...to ensure the memory controller is enabled for the root cgroup.
>>
>> At that point the test still failed:
>>
>> set_cgroup_prio:PASS:update_map 0 nsec
>> test_oom_policy:FAIL:victim_pid unexpected victim_pid: actual 0 !=
>> expected 12649
>> test_oom_policy:FAIL:victim_cgid unexpected victim_cgid: actual 0 !=
>> expected 9583
>> test_oom_policy:FAIL:failed_cnt unexpected failed_cnt: actual 0 !=
>> expected 1
>> #154     oom_policy:FAIL
>> Summary: 0/0 PASSED, 0 SKIPPED, 1 FAILED
>> Successfully unloaded bpf_testmod.ko.
>>
>>
> It seems that OOM is not invoked in your environment(you can check it in
> demsg). If the memcg OOM is invoked by the test, we would record the
> *victim_pid* and *victim_cgid* and they would not be zero. I guess the
> reason is memory_control is not enabled in cgroup
> "/mnt/cgroup-test-work-dir23054/", because I see the error message:
> (cgroup_helpers.c:130: errno: No such file or directory) Opening
>> /mnt/cgroup-test-work-dir23054//memory.max

Right, but after I set up unified cgroup hierarchy and rebooted, that
message disappeared and cgroup setup succeeded, _but_ the test still
failed with 0 victim_pid/cgid. I see nothing OOM-related in dmesg, but
the toplevel cgroupv2 cgroup.controllers file contains:

cpuset cpu io memory hugetlb pids rdma

Is there something else that needs to be done to enable OOM scanning?
I see the oom_reaper process:

root 72 2 0 11:30 ? 00:00:00 [oom_reaper]


This test will need to pass BPF CI, so any assumptions about
configuration need to be ironed out. For example, I think you should
probably have

diff --git a/tools/testing/selftests/bpf/prog_tests/test_oom_policy.c
b/tools/testing/selftests/bpf/prog_tests/test_oom_policy.c
index bea61ff22603..54fdb8a59816 100644
--- a/tools/testing/selftests/bpf/prog_tests/test_oom_policy.c
+++ b/tools/testing/selftests/bpf/prog_tests/test_oom_policy.c
@@ -109,6 +109,7 @@ void test_oom_policy(void)
if (!ASSERT_EQ(err, 0, "prepare cgroup env"))
goto cleanup;

+ write_cgroup_file("/", "cgroup.subtree_control", "+memory");
write_cgroup_file("/", "memory.max", "10M");

/*

...to be safe, since

https://docs.kernel.org/admin-guide/cgroup-v2.html#organizing-processes-and-threads

...says

"No controller is enabled by default. Controllers can be enabled and
disabled by writing to the "cgroup.subtree_control" file:

# echo "+cpu +memory -io" > cgroup.subtree_control

"

Are there any other aspects of configuration like that which might
explain why the test passes for you but fails for me?

Alan