[RFC PATCH 1/5] bpf: Introduce BPF_PROG_TYPE_OOM_POLICY

From: Chuyi Zhou
Date: Thu Jul 27 2023 - 03:44:35 EST


This patch introduces a new program type, BPF_PROG_TYPE_OOM_POLICY.
Programs of this type will be used to select a victim leaf memcg from
the memcg tree when a global OOM is invoked.

The program takes the IDs of two sibling cgroups as input and returns
a comparison result indicating which of the two should be chosen as
the victim.
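
A policy program reads the two candidate cgroup IDs from struct
bpf_oom_ctx and reports its decision by writing ctx->cmp_ret. As a
rough sketch, a policy that ranks siblings by a userspace-supplied
priority could look like the following (the "oom_policy" section
name, the cg_prio map, and reading BPF_OOM_CMP_GREATER as "prefer
cg_id_1 as the victim" are illustrative assumptions, not something
this patch defines):

  /* oom_policy.bpf.c - hypothetical example, not part of this patch */
  #include <linux/bpf.h>
  #include <bpf/bpf_helpers.h>

  /* Victim priority per cgroup, keyed by cgroup ID and filled in by
   * userspace; the cgroup with the higher priority is preferred as
   * the victim.
   */
  struct {
          __uint(type, BPF_MAP_TYPE_HASH);
          __uint(max_entries, 1024);
          __type(key, __u64);
          __type(value, __u32);
  } cg_prio SEC(".maps");

  SEC("oom_policy")
  int select_victim(struct bpf_oom_ctx *ctx)
  {
          __u32 *p1 = bpf_map_lookup_elem(&cg_prio, &ctx->cg_id_1);
          __u32 *p2 = bpf_map_lookup_elem(&cg_prio, &ctx->cg_id_2);

          if (!p1 || !p2 || *p1 == *p2)
                  ctx->cmp_ret = BPF_OOM_CMP_EQUAL;
          else if (*p1 > *p2)
                  ctx->cmp_ret = BPF_OOM_CMP_GREATER;
          else
                  ctx->cmp_ret = BPF_OOM_CMP_LESS;
          return 1;
  }

  char _license[] SEC("license") = "GPL";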

Suggested-by: Abel Wu <wuyun.abel@xxxxxxxxxxxxx>
Signed-off-by: Chuyi Zhou <zhouchuyi@xxxxxxxxxxxxx>
---
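Notes (for reviewers, not for the changelog): attach/detach/query go
through the plain bpf(2) interface, so the existing libbpf wrappers
should be enough once the new enum values land in the uapi header. A
minimal userspace sketch, assuming prog_fd is the fd of a loaded
BPF_PROG_TYPE_OOM_POLICY program:

  #include <bpf/bpf.h>

  /* No target object is involved for this attach type, so the
   * attachable fd is simply 0.
   */
  int err = bpf_prog_attach(prog_fd, 0, BPF_OOM_POLICY, 0);
  ...
  err = bpf_prog_detach2(prog_fd, 0, BPF_OOM_POLICY);
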
include/linux/bpf_oom.h | 22 +++++
include/linux/bpf_types.h | 2 +
include/uapi/linux/bpf.h | 14 ++++
kernel/bpf/syscall.c | 10 +++
mm/oom_kill.c | 168 ++++++++++++++++++++++++++++++++++++++
5 files changed, 216 insertions(+)
create mode 100644 include/linux/bpf_oom.h

diff --git a/include/linux/bpf_oom.h b/include/linux/bpf_oom.h
new file mode 100644
index 000000000000..f4235a83d3bb
--- /dev/null
+++ b/include/linux/bpf_oom.h
@@ -0,0 +1,22 @@
+/* SPDX-License-Identifier: GPL-2.0 */
+
+#ifndef _BPF_OOM_H
+#define _BPF_OOM_H
+
+#include <linux/bpf.h>
+#include <linux/filter.h>
+#include <uapi/linux/bpf.h>
+
+struct bpf_oom_policy {
+ struct bpf_prog_array __rcu *progs;
+};
+
+int oom_policy_prog_attach(const union bpf_attr *attr, struct bpf_prog *prog);
+int oom_policy_prog_detach(const union bpf_attr *attr);
+int oom_policy_prog_query(const union bpf_attr *attr, union bpf_attr __user *uattr);
+
+int __bpf_run_oom_policy(u64 cg_id_1, u64 cg_id_2);
+
+bool bpf_oom_policy_enabled(void);
+
+#endif
diff --git a/include/linux/bpf_types.h b/include/linux/bpf_types.h
index fc0d6f32c687..8ab6009b7dd9 100644
--- a/include/linux/bpf_types.h
+++ b/include/linux/bpf_types.h
@@ -83,6 +83,8 @@ BPF_PROG_TYPE(BPF_PROG_TYPE_SYSCALL, bpf_syscall,
BPF_PROG_TYPE(BPF_PROG_TYPE_NETFILTER, netfilter,
struct bpf_nf_ctx, struct bpf_nf_ctx)
#endif
+BPF_PROG_TYPE(BPF_PROG_TYPE_OOM_POLICY, oom_policy,
+ struct bpf_oom_ctx, struct bpf_oom_ctx)

BPF_MAP_TYPE(BPF_MAP_TYPE_ARRAY, array_map_ops)
BPF_MAP_TYPE(BPF_MAP_TYPE_PERCPU_ARRAY, percpu_array_map_ops)
diff --git a/include/uapi/linux/bpf.h b/include/uapi/linux/bpf.h
index 60a9d59beeab..9da0d61cf703 100644
--- a/include/uapi/linux/bpf.h
+++ b/include/uapi/linux/bpf.h
@@ -987,6 +987,7 @@ enum bpf_prog_type {
BPF_PROG_TYPE_SK_LOOKUP,
BPF_PROG_TYPE_SYSCALL, /* a program that can execute syscalls */
BPF_PROG_TYPE_NETFILTER,
+ BPF_PROG_TYPE_OOM_POLICY,
};

enum bpf_attach_type {
@@ -1036,6 +1037,7 @@ enum bpf_attach_type {
BPF_LSM_CGROUP,
BPF_STRUCT_OPS,
BPF_NETFILTER,
+ BPF_OOM_POLICY,
__MAX_BPF_ATTACH_TYPE
};

@@ -6825,6 +6827,18 @@ struct bpf_cgroup_dev_ctx {
__u32 minor;
};

+enum {
+ BPF_OOM_CMP_EQUAL = (1ULL << 0),
+ BPF_OOM_CMP_GREATER = (1ULL << 1),
+ BPF_OOM_CMP_LESS = (1ULL << 2),
+};
+
+struct bpf_oom_ctx {
+ __u64 cg_id_1;
+ __u64 cg_id_2;
+ __u8 cmp_ret;
+};
+
struct bpf_raw_tracepoint_args {
__u64 args[0];
};
diff --git a/kernel/bpf/syscall.c b/kernel/bpf/syscall.c
index a2aef900519c..fb6fb6294eba 100644
--- a/kernel/bpf/syscall.c
+++ b/kernel/bpf/syscall.c
@@ -5,6 +5,7 @@
#include <linux/bpf-cgroup.h>
#include <linux/bpf_trace.h>
#include <linux/bpf_lirc.h>
+#include <linux/bpf_oom.h>
#include <linux/bpf_verifier.h>
#include <linux/bsearch.h>
#include <linux/btf.h>
@@ -3588,6 +3589,8 @@ attach_type_to_prog_type(enum bpf_attach_type attach_type)
return BPF_PROG_TYPE_XDP;
case BPF_LSM_CGROUP:
return BPF_PROG_TYPE_LSM;
+ case BPF_OOM_POLICY:
+ return BPF_PROG_TYPE_OOM_POLICY;
default:
return BPF_PROG_TYPE_UNSPEC;
}
@@ -3634,6 +3637,9 @@ static int bpf_prog_attach(const union bpf_attr *attr)
case BPF_PROG_TYPE_FLOW_DISSECTOR:
ret = netns_bpf_prog_attach(attr, prog);
break;
+ case BPF_PROG_TYPE_OOM_POLICY:
+ ret = oom_policy_prog_attach(attr, prog);
+ break;
case BPF_PROG_TYPE_CGROUP_DEVICE:
case BPF_PROG_TYPE_CGROUP_SKB:
case BPF_PROG_TYPE_CGROUP_SOCK:
@@ -3676,6 +3682,8 @@ static int bpf_prog_detach(const union bpf_attr *attr)
return lirc_prog_detach(attr);
case BPF_PROG_TYPE_FLOW_DISSECTOR:
return netns_bpf_prog_detach(attr, ptype);
+ case BPF_PROG_TYPE_OOM_POLICY:
+ return oom_policy_prog_detach(attr);
case BPF_PROG_TYPE_CGROUP_DEVICE:
case BPF_PROG_TYPE_CGROUP_SKB:
case BPF_PROG_TYPE_CGROUP_SOCK:
@@ -3733,6 +3741,8 @@ static int bpf_prog_query(const union bpf_attr *attr,
case BPF_FLOW_DISSECTOR:
case BPF_SK_LOOKUP:
return netns_bpf_prog_query(attr, uattr);
+ case BPF_OOM_POLICY:
+ return oom_policy_prog_query(attr, uattr);
case BPF_SK_SKB_STREAM_PARSER:
case BPF_SK_SKB_STREAM_VERDICT:
case BPF_SK_MSG_VERDICT:
diff --git a/mm/oom_kill.c b/mm/oom_kill.c
index 612b5597d3af..01af8adaa16c 100644
--- a/mm/oom_kill.c
+++ b/mm/oom_kill.c
@@ -19,6 +19,7 @@
*/

#include <linux/oom.h>
+#include <linux/bpf_oom.h>
#include <linux/mm.h>
#include <linux/err.h>
#include <linux/gfp.h>
@@ -73,6 +74,9 @@ static inline bool is_memcg_oom(struct oom_control *oc)
return oc->memcg != NULL;
}

+static DEFINE_MUTEX(oom_policy_lock);
+static struct bpf_oom_policy global_oom_policy;
+
#ifdef CONFIG_NUMA
/**
* oom_cpuset_eligible() - check task eligibility for kill
@@ -1258,3 +1262,167 @@ SYSCALL_DEFINE2(process_mrelease, int, pidfd, unsigned int, flags)
return -ENOSYS;
#endif /* CONFIG_MMU */
}
+
+const struct bpf_prog_ops oom_policy_prog_ops = {
+};
+
+static const struct bpf_func_proto *
+oom_policy_func_proto(enum bpf_func_id func_id, const struct bpf_prog *prog)
+{
+ return bpf_base_func_proto(func_id);
+}
+
+static bool oom_policy_is_valid_access(int off, int size,
+ enum bpf_access_type type,
+ const struct bpf_prog *prog,
+ struct bpf_insn_access_aux *info)
+{
+ if (off < 0 || off + size > sizeof(struct bpf_oom_ctx) || off % size)
+ return false;
+
+ switch (off) {
+ case bpf_ctx_range(struct bpf_oom_ctx, cg_id_1):
+ case bpf_ctx_range(struct bpf_oom_ctx, cg_id_2):
+ if (type != BPF_READ)
+ return false;
+ bpf_ctx_record_field_size(info, sizeof(__u64));
+ return bpf_ctx_narrow_access_ok(off, size, sizeof(__u64));
+ case bpf_ctx_range(struct bpf_oom_ctx, cmp_ret):
+ if (type == BPF_READ) {
+ bpf_ctx_record_field_size(info, sizeof(__u8));
+ return bpf_ctx_narrow_access_ok(off, size, sizeof(__u8));
+ } else {
+ return size == sizeof(__u8);
+ }
+ default:
+ return false;
+ }
+}
+
+const struct bpf_verifier_ops oom_policy_verifier_ops = {
+ .get_func_proto = oom_policy_func_proto,
+ .is_valid_access = oom_policy_is_valid_access,
+};
+
+#define BPF_MAX_PROGS 10
+
+int oom_policy_prog_attach(const union bpf_attr *attr, struct bpf_prog *prog)
+{
+ struct bpf_prog_array *old_array;
+ struct bpf_prog_array *new_array;
+ int ret;
+
+ mutex_lock(&oom_policy_lock);
+ old_array = rcu_dereference_protected(global_oom_policy.progs, lockdep_is_held(&oom_policy_lock));
+ if (old_array && bpf_prog_array_length(old_array) >= BPF_MAX_PROGS) {
+ ret = -E2BIG;
+ goto unlock;
+ }
+ ret = bpf_prog_array_copy(old_array, NULL, prog, 0, &new_array);
+ if (ret < 0)
+ goto unlock;
+
+ rcu_assign_pointer(global_oom_policy.progs, new_array);
+ bpf_prog_array_free(old_array);
+
+unlock:
+ mutex_unlock(&oom_policy_lock);
+ return ret;
+}
+
+static int detach_prog(struct bpf_prog *prog)
+{
+ struct bpf_prog_array *old_array;
+ struct bpf_prog_array *new_array;
+ int ret;
+
+ mutex_lock(&oom_policy_lock);
+ old_array = rcu_dereference_protected(global_oom_policy.progs, lockdep_is_held(&oom_policy_lock));
+ ret = bpf_prog_array_copy(old_array, prog, NULL, 0, &new_array);
+
+ if (ret)
+ goto unlock;
+
+ rcu_assign_pointer(global_oom_policy.progs, new_array);
+ bpf_prog_array_free(old_array);
+ bpf_prog_put(prog);
+unlock:
+ mutex_unlock(&oom_policy_lock);
+ return ret;
+}
+
+int oom_policy_prog_detach(const union bpf_attr *attr)
+{
+ struct bpf_prog *prog;
+ int ret;
+
+ if (attr->attach_flags)
+ return -EINVAL;
+
+ prog = bpf_prog_get_type(attr->attach_bpf_fd,
+ BPF_PROG_TYPE_OOM_POLICY);
+ if (IS_ERR(prog))
+ return PTR_ERR(prog);
+
+ ret = detach_prog(prog);
+ bpf_prog_put(prog);
+
+ return ret;
+}
+
+int oom_policy_prog_query(const union bpf_attr *attr, union bpf_attr __user *uattr)
+{
+ __u32 __user *prog_ids = u64_to_user_ptr(attr->query.prog_ids);
+ struct bpf_prog_array *progs;
+ u32 cnt, flags = 0;
+ int ret = 0;
+
+ if (attr->query.query_flags)
+ return -EINVAL;
+
+ mutex_lock(&oom_policy_lock);
+ progs = rcu_dereference_protected(global_oom_policy.progs, lockdep_is_held(&oom_policy_lock));
+ cnt = progs ? bpf_prog_array_length(progs) : 0;
+ if (copy_to_user(&uattr->query.prog_cnt, &cnt, sizeof(cnt))) {
+ ret = -EFAULT;
+ goto unlock;
+ }
+ if (copy_to_user(&uattr->query.attach_flags, &flags, sizeof(flags))) {
+ ret = -EFAULT;
+ goto unlock;
+ }
+ if (attr->query.prog_cnt != 0 && prog_ids && cnt)
+ ret = bpf_prog_array_copy_to_user(progs, prog_ids,
+ attr->query.prog_cnt);
+
+unlock:
+ mutex_unlock(&oom_policy_lock);
+ return ret;
+}
+
+int __bpf_run_oom_policy(u64 cg_id_1, u64 cg_id_2)
+{
+ struct bpf_oom_ctx ctx = {
+ .cg_id_1 = cg_id_1,
+ .cg_id_2 = cg_id_2,
+ .cmp_ret = BPF_OOM_CMP_EQUAL,
+ };
+ rcu_read_lock();
+ bpf_prog_run_array(rcu_dereference(global_oom_policy.progs),
+ &ctx, bpf_prog_run);
+ rcu_read_unlock();
+ return ctx.cmp_ret;
+}
+
+bool bpf_oom_policy_enabled(void)
+{
+ struct bpf_prog_array *prog_array;
+ bool empty = true;
+
+ rcu_read_lock();
+ prog_array = rcu_dereference(global_oom_policy.progs);
+ if (prog_array)
+ empty = bpf_prog_array_is_empty(prog_array);
+ rcu_read_unlock();
+ return !empty;
+}
--
2.20.1