[RFC PATCH v2 1/5] mm, oom: Introduce bpf_oom_evaluate_task

From: Chuyi Zhou
Date: Thu Aug 10 2023 - 04:14:05 EST


This patch adds a new hook, bpf_oom_evaluate_task, to oom_evaluate_task. It
takes oc and the task currently being iterated as parameters and returns a
result indicating which one should be selected. We can use it to bypass the
current logic of oom_evaluate_task and implement customized OOM policies
in the attached BPF programs.

Suggested-by: Michal Hocko <mhocko@xxxxxxxx>
Signed-off-by: Chuyi Zhou <zhouchuyi@xxxxxxxxxxxxx>
---
mm/oom_kill.c | 59 +++++++++++++++++++++++++++++++++++++++++++--------
1 file changed, 50 insertions(+), 9 deletions(-)

diff --git a/mm/oom_kill.c b/mm/oom_kill.c
index 612b5597d3af..255c9ef1d808 100644
--- a/mm/oom_kill.c
+++ b/mm/oom_kill.c
@@ -18,6 +18,7 @@
* kernel subsystems and hints as to where to find out what things do.
*/

+#include <linux/bpf.h>
#include <linux/oom.h>
#include <linux/mm.h>
#include <linux/err.h>
@@ -305,6 +306,27 @@ static enum oom_constraint constrained_alloc(struct oom_control *oc)
return CONSTRAINT_NONE;
}

+enum { /* result codes returned by bpf_oom_evaluate_task() */
+ NO_BPF_POLICY, /* value returned by the default stub; caller continues with its own checks */
+ BPF_EVAL_ABORT, /* caller jumps to abort: stop the search process */
+ BPF_EVAL_NEXT, /* caller jumps to next: ignore this task */
+ BPF_EVAL_SELECT, /* caller jumps to select: pick this task */
+};
+
+__weak noinline int bpf_oom_evaluate_task(struct task_struct *task, struct oom_control *oc)
+{
+ return NO_BPF_POLICY; /* weak default stub: both arguments are ignored, no policy decision is made */
+}
+
+BTF_SET8_START(oom_bpf_fmodret_ids)
+BTF_ID_FLAGS(func, bpf_oom_evaluate_task) /* the only function listed in this ID set */
+BTF_SET8_END(oom_bpf_fmodret_ids)
+
+static const struct btf_kfunc_id_set oom_bpf_fmodret_set = {
+ .owner = THIS_MODULE,
+ .set = &oom_bpf_fmodret_ids, /* passed to register_btf_fmodret_id_set() from oom_init() */
+};
+
static int oom_evaluate_task(struct task_struct *task, void *arg)
{
struct oom_control *oc = arg;
@@ -317,6 +339,26 @@ static int oom_evaluate_task(struct task_struct *task, void *arg)
if (!is_memcg_oom(oc) && !oom_cpuset_eligible(task, oc))
goto next;

+ /*
+ * If task is allocating a lot of memory and has been marked to be
+ * killed first if it triggers an oom, then select it.
+ */
+ if (oom_task_origin(task)) {
+ points = LONG_MAX;
+ goto select;
+ }
+
+ switch (bpf_oom_evaluate_task(task, oc)) {
+ case BPF_EVAL_ABORT:
+ goto abort; /* abort search process */
+ case BPF_EVAL_NEXT:
+ goto next; /* ignore the task */
+ case BPF_EVAL_SELECT:
+ goto select; /* select the task */
+ default:
+ break; /* No BPF policy */
+ }
+
/*
* This task already has access to memory reserves and is being killed.
* Don't allow any other task to have access to the reserves unless
@@ -329,15 +371,6 @@ static int oom_evaluate_task(struct task_struct *task, void *arg)
goto abort;
}

- /*
- * If task is allocating a lot of memory and has been marked to be
- * killed first if it triggers an oom, then select it.
- */
- if (oom_task_origin(task)) {
- points = LONG_MAX;
- goto select;
- }
-
points = oom_badness(task, oc->totalpages);
if (points == LONG_MIN || points < oc->chosen_points)
goto next;
@@ -732,10 +765,18 @@ static struct ctl_table vm_oom_kill_table[] = {

static int __init oom_init(void)
{
+ int __maybe_unused err; /* only referenced when CONFIG_BPF_SYSCALL is set */
oom_reaper_th = kthread_run(oom_reaper, NULL, "oom_reaper");
#ifdef CONFIG_SYSCTL
register_sysctl_init("vm", vm_oom_kill_table);
#endif
+
+#ifdef CONFIG_BPF_SYSCALL
+ err = register_btf_fmodret_id_set(&oom_bpf_fmodret_set);
+ if (err) /* failure is only logged; oom_init() still returns 0 */
+ pr_warn("error while registering oom fmodret entrypoints: %d\n", err);
+#endif
+
return 0;
}
subsys_initcall(oom_init)
--
2.20.1