[PATCH 1/1] RFC: add pidfd_send_signal flag to reclaim mm while killing a process

From: Suren Baghdasaryan
Date: Fri Nov 13 2020 - 12:34:55 EST


When a process is being killed it might be in an uninterruptible sleep,
which leads to an unpredictable delay in reclaiming its memory. In low-memory
situations, when it is important to free up memory quickly, such a delay is
problematic. The kernel solves this problem with the oom-reaper thread, which
performs memory reclaim even when the victim process is not runnable.
Userspace currently lacks such a mechanism; the need for one and potential
solutions have been discussed before (see links below).
This patch provides a mechanism to perform memory reclaim in the context
of the process that sends the SIGKILL signal. The new SYNC_REAP_MM flag for
the pidfd_send_signal syscall can be used only when sending SIGKILL; it
makes the caller synchronously reclaim the part of the victim's memory
that can be easily reclaimed.

1. https://patchwork.kernel.org/cover/10894999
2. https://lwn.net/Articles/787217
3. https://lore.kernel.org/linux-api/CAJuCfpGz1kPM3G1gZH+09Z7aoWKg05QSAMMisJ7H5MdmRrRhNQ@xxxxxxxxxxxxxx

Signed-off-by: Suren Baghdasaryan <surenb@xxxxxxxxxx>
---
include/linux/oom.h | 2 ++
include/linux/signal.h | 7 ++++
kernel/signal.c | 73 ++++++++++++++++++++++++++++++++++++++++--
mm/oom_kill.c | 2 +-
4 files changed, 81 insertions(+), 3 deletions(-)

diff --git a/include/linux/oom.h b/include/linux/oom.h
index 2db9a1432511..9a8dcabdfdf1 100644
--- a/include/linux/oom.h
+++ b/include/linux/oom.h
@@ -111,6 +111,8 @@ bool __oom_reap_task_mm(struct mm_struct *mm);
long oom_badness(struct task_struct *p,
unsigned long totalpages);

+extern bool task_will_free_mem(struct task_struct *task);
+
extern bool out_of_memory(struct oom_control *oc);

extern void exit_oom_victim(void);
diff --git a/include/linux/signal.h b/include/linux/signal.h
index b256f9c65661..5deafc99035d 100644
--- a/include/linux/signal.h
+++ b/include/linux/signal.h
@@ -449,6 +449,13 @@ extern bool unhandled_signal(struct task_struct *tsk, int sig);
(!siginmask(signr, SIG_KERNEL_IGNORE_MASK|SIG_KERNEL_STOP_MASK) && \
(t)->sighand->action[(signr)-1].sa.sa_handler == SIG_DFL)

+/*
+ * Flag values accepted in the flags argument of pidfd_send_signal():
+ *
+ * SYNC_REAP_MM - request that the caller synchronously reclaims the target's
+ *                mm after the signal has been sent. Only accepted together
+ *                with SIGKILL; any other combination is rejected with
+ *                -EINVAL.
+ */
+#define SYNC_REAP_MM 0x1
+
void signals_init(void);

int restore_altstack(const stack_t __user *);
diff --git a/kernel/signal.c b/kernel/signal.c
index ef8f2a28d37c..15d4be5600a3 100644
--- a/kernel/signal.c
+++ b/kernel/signal.c
@@ -46,6 +46,7 @@
#include <linux/livepatch.h>
#include <linux/cgroup.h>
#include <linux/audit.h>
+#include <linux/oom.h>

#define CREATE_TRACE_POINTS
#include <trace/events/signal.h>
@@ -3711,6 +3712,63 @@ static struct pid *pidfd_to_pid(const struct file *file)
return tgid_pidfd_to_pid(file);
}

+/**
+ * reap_mm - synchronously reap the address space of a task being killed
+ * @pid: pid of the task whose mm should be reaped
+ *
+ * Looks up the task for @pid and, if task_will_free_mem() accepts it, reaps
+ * its mm with __oom_reap_task_mm() in the caller's context. MMF_OOM_SKIP is
+ * used as a claim marker: the caller that sets it does the reaping, and an mm
+ * that is already marked, or is already an OOM victim, is left alone.
+ *
+ * Return: 0 if the mm was reaped or is already being reclaimed elsewhere,
+ * -ESRCH if @pid does not resolve to a task, -EBUSY if task_will_free_mem()
+ * rejects the task, -EINTR if a fatal signal interrupted the wait for the
+ * mmap lock, or -EAGAIN if part of the address space could not be reaped
+ * (the caller may retry).
+ */
+static int reap_mm(struct pid *pid)
+{
+	struct task_struct *task;
+	struct mm_struct *mm;
+	int ret = 0;
+
+	/* Takes a task reference that is dropped at release_task. */
+	task = get_pid_task(pid, PIDTYPE_PID);
+	if (!task) {
+		ret = -ESRCH;
+		goto out;
+	}
+
+	/* task_will_free_mem() requires task->mm to be stable. */
+	task_lock(task);
+
+	/* Check if memory can be easily reclaimed */
+	if (!task_will_free_mem(task)) {
+		task_unlock(task);
+		ret = -EBUSY;
+		goto release_task;
+	}
+
+	/*
+	 * Get mm to prevent exit_mmap.
+	 * NOTE(review): relies on task_will_free_mem() returning false when
+	 * task->mm is NULL - confirm against mm/oom_kill.c.
+	 */
+	mm = task->mm;
+	mmget(mm);
+
+	/*
+	 * Claim the mm for reaping. Back off if the OOM killer already picked
+	 * it as a victim or if MMF_OOM_SKIP is already set. The flag is tested
+	 * and set in one atomic step because task_lock() only serializes
+	 * callers that go through the same task, not callers that reach this
+	 * mm through another task sharing it.
+	 */
+	if (unlikely(mm_is_oom_victim(mm)) ||
+	    unlikely(test_and_set_bit(MMF_OOM_SKIP, &mm->flags))) {
+		/* Already being reclaimed */
+		task_unlock(task);
+		goto drop_mm;
+	}
+
+	task_unlock(task);
+
+	/*
+	 * The victim's mmap lock may be held for an unbounded time, so do not
+	 * wait for it uninterruptibly: the caller must stay killable.
+	 */
+	if (mmap_read_lock_killable(mm)) {
+		/* Drop the claim so that a later attempt can reap this mm. */
+		clear_bit(MMF_OOM_SKIP, &mm->flags);
+		ret = -EINTR;
+		goto drop_mm;
+	}
+	if (!__oom_reap_task_mm(mm)) {
+		/* Failed to reap part of the address space. User can retry */
+		ret = -EAGAIN;
+		clear_bit(MMF_OOM_SKIP, &mm->flags);
+	}
+	mmap_read_unlock(mm);
+
+drop_mm:
+	mmput(mm);
+release_task:
+	put_task_struct(task);
+out:
+	return ret;
+}
+
/**
* sys_pidfd_send_signal - Signal a process through a pidfd
* @pidfd: file descriptor of the process
@@ -3737,10 +3795,16 @@ SYSCALL_DEFINE4(pidfd_send_signal, int, pidfd, int, sig,
struct pid *pid;
kernel_siginfo_t kinfo;

- /* Enforce flags be set to 0 until we add an extension. */
- if (flags)
+ /* Enforce only valid flags. */
+ if (flags) {
+ /* Allow SYNC_REAP_MM only with SIGKILL. */
+ if (flags == SYNC_REAP_MM && sig == SIGKILL)
+ goto valid;
+
return -EINVAL;
+ }

+valid:
f = fdget(pidfd);
if (!f.file)
return -EBADF;
@@ -3775,6 +3839,11 @@ SYSCALL_DEFINE4(pidfd_send_signal, int, pidfd, int, sig,
}

ret = kill_pid_info(sig, &kinfo, pid);
+ if (unlikely(ret))
+ goto err;
+
+ if (flags & SYNC_REAP_MM)
+ ret = reap_mm(pid);

err:
fdput(f);
diff --git a/mm/oom_kill.c b/mm/oom_kill.c
index 8b84661a6410..66c90bca25bc 100644
--- a/mm/oom_kill.c
+++ b/mm/oom_kill.c
@@ -808,7 +808,7 @@ static inline bool __task_will_free_mem(struct task_struct *task)
* Caller has to make sure that task->mm is stable (hold task_lock or
* it operates on the current).
*/
-static bool task_will_free_mem(struct task_struct *task)
+bool task_will_free_mem(struct task_struct *task)
{
struct mm_struct *mm = task->mm;
struct task_struct *p;
--
2.29.2.299.gdc1121823c-goog