[PATCH 2/5] mm: Implement process_memwatch syscall

From: Muhammad Usama Anjum
Date: Tue Jul 26 2022 - 12:21:20 EST


This syscall can be used to watch a process's memory and perform
atomic operations which aren't possible through procfs. Two operations
have been implemented: MEMWATCH_SD_GET gets the soft-dirty pages, and
MEMWATCH_SD_CLEAR clears the soft-dirty bit from the dirty pages.
MEMWATCH_SD_NO_REUSED_REGIONS can be specified to ignore the soft-dirty
flags of the VMAs. The GET and CLEAR operations can also be combined
and performed atomically as a single operation.

NAME
process_memwatch - get process's memory information

SYNOPSIS
#include <linux/memwatch.h>  /* Definition of MEMWATCH_* constants */

long process_memwatch(int pidfd, void *start, int len,
                      unsigned int flags, loff_t *vec,
                      int vec_len);

Note: Glibc does not provide a wrapper for this system call;
call it using syscall(2).
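
As an illustration, the raw call could be wrapped as in the sketch
below. The syscall number __NR_process_memwatch is assumed to be
provided by the syscall-table wiring elsewhere in this series;
treat it as a placeholder.

    /* Hedged sketch: thin userspace wrapper around the raw syscall.
     * __NR_process_memwatch is assumed to be defined by the syscall
     * table changes in this series.
     */
    #define _GNU_SOURCE
    #include <sys/syscall.h>
    #include <sys/types.h>
    #include <unistd.h>
    #include <linux/memwatch.h>

    static long process_memwatch(int pidfd, void *start, int len,
                                 unsigned int flags, loff_t *vec,
                                 int vec_len)
    {
            return syscall(__NR_process_memwatch, pidfd, start, len,
                           flags, vec, vec_len);
    }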

DESCRIPTION
The process_memwatch() system call is used to get information
about the memory of a process.

Arguments
pidfd specifies the pidfd of the process whose memory is to be
watched. The calling process must have PTRACE_MODE_ATTACH_FSCREDS
access to the process referred to by the pidfd. It can be zero, in
which case the calling process watches its own memory. The
operation to perform is determined by flags. The start argument
must be a multiple of the system page size. The len argument need
not be a multiple of the page size, but since the information is
returned for whole pages, len is effectively rounded up to the next
multiple of the page size.
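
To watch another process, the caller would typically obtain the
pidfd via pidfd_open(2) first, as in this sketch (target_pid is
illustrative; error handling is minimal):

    /* Sketch: get a pidfd for the target process (Linux 5.3+). */
    #include <sys/syscall.h>
    #include <unistd.h>
    #include <err.h>

    int pidfd = syscall(SYS_pidfd_open, (pid_t)target_pid, 0);

    if (pidfd < 0)
            err(1, "pidfd_open");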

vec is an output array in which the byte offsets of the matched
pages are returned. Each offset is calculated from the start
address. The caller tells the kernel the size of vec by passing the
number of entries in vec_len. The system call returns when the
whole range has been searched or vec is completely filled. Note
that if vec fills up completely, the rest of the range is not
cleared.
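
Since each entry is a byte offset from start, the address of the
i-th reported page can be recovered as in this sketch:

    /* Sketch: translate a returned offset back to a page address. */
    unsigned long page_addr = (unsigned long)start + (unsigned long)vec[i];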

Operations
The flags argument specifies the operation(s) to be performed. The
MEMWATCH_SD_GET and MEMWATCH_SD_CLEAR operations can be used
separately, or combined to perform both atomically as one operation
(see the sketch after this list).

MEMWATCH_SD_GET
Get the offsets of the pages which are soft-dirty.

MEMWATCH_SD_CLEAR
Clear the soft-dirty bit of the pages in the range.

MEMWATCH_SD_NO_REUSED_REGIONS
This optional flag can be specified in combination with
the other flags. When it is set, the VM_SOFTDIRTY flag of
the VMAs is ignored, for performance reasons. Only those
pages which have been written to explicitly are reported
as dirty; newly allocated regions are not returned as
dirty.
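
As a usage sketch (reusing the wrapper from above; region and
region_len are illustrative), reporting and clearing can be
combined so that both happen atomically:

    /* Sketch: atomically report and clear the soft-dirty pages. */
    loff_t vec[1024];
    long n;

    n = process_memwatch(0, region, region_len,
                         MEMWATCH_SD_GET | MEMWATCH_SD_CLEAR,
                         vec, 1024);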

RETURN VALUE
Zero or a positive value is returned on success. A positive value
is the number of dirty-page offsets filled into vec. In the event
of an error (and assuming that process_memwatch() was invoked via
syscall(2)), -1 is returned and errno is set to indicate the
error.

ERRORS
EINVAL Invalid arguments.

ESRCH  Cannot access the process.

EIO    I/O error (the results could not be copied to vec).

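Because the call returns as soon as vec fills up, a caller that
wants to cover the whole range may need to resume the scan. A
hedged sketch, reusing the wrapper and the illustrative names from
above:

    /* Sketch: scan the whole range, resuming whenever vec fills up. */
    long page_size = sysconf(_SC_PAGESIZE);
    char *addr = region;
    char *end = region + region_len;

    while (addr < end) {
            long n = process_memwatch(0, addr, end - addr,
                                      MEMWATCH_SD_GET, vec, 1024);
            if (n < 0) {
                    perror("process_memwatch");  /* errno has the cause */
                    break;
            }
            /* ... consume n offsets, each relative to addr ... */
            if (n < 1024)
                    break;  /* the whole remaining range was searched */
            /* vec filled up: resume after the last reported page */
            addr += vec[n - 1] + page_size;
    }
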
This is based on a patch from Gabriel Krisman Bertazi.

Signed-off-by: Muhammad Usama Anjum <usama.anjum@xxxxxxxxxxxxx>
---
include/uapi/linux/memwatch.h | 12 ++
mm/Makefile | 2 +-
mm/memwatch.c | 285 ++++++++++++++++++++++++++++++++++
3 files changed, 298 insertions(+), 1 deletion(-)
create mode 100644 include/uapi/linux/memwatch.h
create mode 100644 mm/memwatch.c

diff --git a/include/uapi/linux/memwatch.h b/include/uapi/linux/memwatch.h
new file mode 100644
index 000000000000..7e86ffdc10f5
--- /dev/null
+++ b/include/uapi/linux/memwatch.h
@@ -0,0 +1,12 @@
+/* SPDX-License-Identifier: GPL-2.0 WITH Linux-syscall-note */
+
+#ifndef _MEMWATCH_H
+#define _MEMWATCH_H
+
+/* memwatch operations */
+#define MEMWATCH_SD_GET 0x1
+#define MEMWATCH_SD_CLEAR 0x2
+#define MEMWATCH_SD_NO_REUSED_REGIONS 0x4
+
+#endif
+
diff --git a/mm/Makefile b/mm/Makefile
index 8083fa85a348..aa72e4ced1f3 100644
--- a/mm/Makefile
+++ b/mm/Makefile
@@ -37,7 +37,7 @@ CFLAGS_init-mm.o += $(call cc-disable-warning, override-init)
CFLAGS_init-mm.o += $(call cc-disable-warning, initializer-overrides)

mmu-y := nommu.o
-mmu-$(CONFIG_MMU) := highmem.o memory.o mincore.o \
+mmu-$(CONFIG_MMU) := highmem.o memory.o memwatch.o mincore.o \
mlock.o mmap.o mmu_gather.o mprotect.o mremap.o \
msync.o page_vma_mapped.o pagewalk.o \
pgtable-generic.o rmap.o vmalloc.o
diff --git a/mm/memwatch.c b/mm/memwatch.c
new file mode 100644
index 000000000000..9be09bc431d2
--- /dev/null
+++ b/mm/memwatch.c
@@ -0,0 +1,285 @@
+// SPDX-License-Identifier: GPL-2.0
+/*
+ * Copyright 2020 Collabora Ltd.
+ */
+#include <linux/pagewalk.h>
+#include <linux/vmalloc.h>
+#include <linux/syscalls.h>
+#include <asm/tlb.h>
+#include <asm/tlbflush.h>
+#include <linux/sched/mm.h>
+#include <linux/mm_inline.h>
+#include <uapi/linux/memwatch.h>
+#include <uapi/asm-generic/errno-base.h>
+#include <linux/compat.h>
+#include <linux/minmax.h>
+
+#ifdef CONFIG_MEM_SOFT_DIRTY
+#define MEMWATCH_SD_OPS_MASK (MEMWATCH_SD_GET | MEMWATCH_SD_CLEAR | \
+ MEMWATCH_SD_NO_REUSED_REGIONS)
+
+struct memwatch_sd_private {
+ unsigned long start;
+ unsigned int flags;
+ unsigned int index;
+ unsigned int vec_len;
+ unsigned long *vec;
+};
+
+static int memwatch_pmd_entry(pmd_t *pmd, unsigned long addr,
+ unsigned long end, struct mm_walk *walk)
+{
+ struct memwatch_sd_private *p = walk->private;
+ struct vm_area_struct *vma = walk->vma;
+ unsigned long start = addr;
+ spinlock_t *ptl;
+ pte_t *pte;
+ int dirty;
+ bool dirty_vma = (p->flags & MEMWATCH_SD_NO_REUSED_REGIONS) ? 0 :
+ (vma->vm_flags & VM_SOFTDIRTY);
+
+ end = min(end, walk->vma->vm_end);
+ ptl = pmd_trans_huge_lock(pmd, vma);
+ if (ptl) {
+ if (dirty_vma || check_soft_dirty_pmd(vma, addr, pmd, false)) {
+ /*
+ * Break the huge page into normal pages if the operation is to be
+ * performed on only a portion of the huge page, or if the return
+ * buffer cannot hold the complete data. Then process this PMD as
+ * having normal pages.
+ */
+ if (((p->flags & MEMWATCH_SD_CLEAR) && (end - addr < HPAGE_SIZE)) ||
+ ((p->flags & MEMWATCH_SD_GET) &&
+ (p->index + HPAGE_SIZE / PAGE_SIZE > p->vec_len))) {
+ spin_unlock(ptl);
+ split_huge_pmd(vma, pmd, addr);
+ goto process_pages;
+ } else {
+ dirty = check_soft_dirty_pmd(vma, addr, pmd,
+ p->flags & MEMWATCH_SD_CLEAR);
+ if ((p->flags & MEMWATCH_SD_GET) && (dirty_vma || dirty)) {
+ for (; addr != end && p->index < p->vec_len;
+ addr += PAGE_SIZE)
+ p->vec[p->index++] = addr - p->start;
+ }
+ }
+ }
+ spin_unlock(ptl);
+ return 0;
+ }
+
+process_pages:
+ if (pmd_trans_unstable(pmd))
+ return 0;
+
+ pte = pte_offset_map_lock(vma->vm_mm, pmd, addr, &ptl);
+ for (; addr != end; pte++, addr += PAGE_SIZE) {
+ dirty = check_soft_dirty(vma, addr, pte, p->flags & MEMWATCH_SD_CLEAR);
+
+ /* Bound the fill at vec_len entries so that vec cannot overflow. */
+ if ((p->flags & MEMWATCH_SD_GET) && (dirty_vma || dirty) &&
+ p->index < p->vec_len)
+ p->vec[p->index++] = addr - p->start;
+ }
+ pte_unmap_unlock(pte - 1, ptl);
+ cond_resched();
+
+ if (p->flags & MEMWATCH_SD_CLEAR)
+ flush_tlb_mm_range(vma->vm_mm, start, end, PAGE_SHIFT, false);
+
+ return 0;
+}
+
+static int memwatch_pte_hole(unsigned long addr, unsigned long end, int depth,
+ struct mm_walk *walk)
+{
+ struct memwatch_sd_private *p = walk->private;
+ struct vm_area_struct *vma = walk->vma;
+
+ if (p->flags & MEMWATCH_SD_NO_REUSED_REGIONS)
+ return 0;
+
+ if (vma && (vma->vm_flags & VM_SOFTDIRTY) && (p->flags & MEMWATCH_SD_GET)) {
+ for (; addr != end && p->index < p->vec_len; addr += PAGE_SIZE)
+ p->vec[p->index++] = addr - p->start;
+ }
+
+ return 0;
+}
+
+static int memwatch_pre_vma(unsigned long start, unsigned long end, struct mm_walk *walk)
+{
+ struct memwatch_sd_private *p = walk->private;
+ struct vm_area_struct *vma = walk->vma;
+ int ret;
+ unsigned long end_cut = end;
+
+ if (p->flags & MEMWATCH_SD_NO_REUSED_REGIONS)
+ return 0;
+
+ if ((p->flags & MEMWATCH_SD_CLEAR) && (vma->vm_flags & VM_SOFTDIRTY)) {
+ if (vma->vm_start < start) {
+ ret = split_vma(vma->vm_mm, vma, start, 1);
+ if (ret)
+ return ret;
+ }
+
+ if (p->flags & MEMWATCH_SD_GET)
+ end_cut = min(start + p->vec_len * PAGE_SIZE, end);
+
+ if (vma->vm_end > end_cut) {
+ ret = split_vma(vma->vm_mm, vma, end_cut, 0);
+ if (ret)
+ return ret;
+ }
+ }
+
+ return 0;
+}
+
+static void memwatch_post_vma(struct mm_walk *walk)
+{
+ struct memwatch_sd_private *p = walk->private;
+ struct vm_area_struct *vma = walk->vma;
+
+ if (p->flags & MEMWATCH_SD_NO_REUSED_REGIONS)
+ return;
+
+ if ((p->flags & MEMWATCH_SD_CLEAR) && (vma->vm_flags & VM_SOFTDIRTY)) {
+ vma->vm_flags &= ~VM_SOFTDIRTY;
+ vma_set_page_prot(vma);
+ }
+}
+
+static int memwatch_pmd_test_walk(unsigned long start, unsigned long end,
+ struct mm_walk *walk)
+{
+ struct memwatch_sd_private *p = walk->private;
+ struct vm_area_struct *vma = walk->vma;
+
+ /* A negative return stops the walk: the output buffer is full. */
+ if ((p->flags & MEMWATCH_SD_GET) && (p->index == p->vec_len))
+ return -1;
+
+ /* A positive return skips this VMA: PFN mappings are not walked. */
+ if (vma->vm_flags & VM_PFNMAP)
+ return 1;
+
+ return 0;
+}
+
+static const struct mm_walk_ops memwatch_ops = {
+ .test_walk = memwatch_pmd_test_walk,
+ .pre_vma = memwatch_pre_vma,
+ .pmd_entry = memwatch_pmd_entry,
+ .pte_hole = memwatch_pte_hole,
+ .post_vma = memwatch_post_vma,
+};
+
+static long do_process_memwatch(int pidfd, void __user *start_addr, int len,
+ unsigned int flags, loff_t __user *vec, int vec_len)
+{
+ struct memwatch_sd_private watch;
+ struct mmu_notifier_range range;
+ unsigned long start, end;
+ struct task_struct *task;
+ struct mm_struct *mm;
+ unsigned int f_flags;
+ int ret;
+
+ start = (unsigned long)untagged_addr(start_addr);
+ if ((!IS_ALIGNED(start, PAGE_SIZE)) || !access_ok((void __user *)start, len))
+ return -EINVAL;
+
+ if ((flags == 0) || (flags == MEMWATCH_SD_NO_REUSED_REGIONS) ||
+ (flags & ~MEMWATCH_SD_OPS_MASK))
+ return -EINVAL;
+
+ if ((flags & MEMWATCH_SD_GET) && ((vec_len == 0) || (!vec) ||
+ !access_ok(vec, vec_len * sizeof(loff_t))))
+ return -EINVAL;
+
+ end = start + len;
+ watch.start = start;
+ watch.flags = flags;
+ watch.index = 0;
+ watch.vec_len = vec_len;
+
+ if (pidfd) {
+ task = pidfd_get_task(pidfd, &f_flags);
+ if (IS_ERR(task))
+ return PTR_ERR(task);
+ } else {
+ task = current;
+ }
+
+ if (flags & MEMWATCH_SD_GET) {
+ watch.vec = vzalloc(vec_len * sizeof(loff_t));
+ if (!watch.vec) {
+ ret = -ENOMEM;
+ goto put_task;
+ }
+ }
+
+ mm = mm_access(task, PTRACE_MODE_ATTACH_FSCREDS);
+ if (IS_ERR_OR_NULL(mm)) {
+ ret = mm ? PTR_ERR(mm) : -ESRCH;
+ goto free_watch;
+ }
+
+ if (flags & MEMWATCH_SD_CLEAR) {
+ mmap_write_lock(mm);
+
+ mmu_notifier_range_init(&range, MMU_NOTIFY_SOFT_DIRTY, 0, NULL,
+ mm, start, end);
+ mmu_notifier_invalidate_range_start(&range);
+ inc_tlb_flush_pending(mm);
+ } else {
+ mmap_read_lock(mm);
+ }
+
+ ret = walk_page_range(mm, start, end, &memwatch_ops, &watch);
+
+ if (flags & MEMWATCH_SD_CLEAR) {
+ mmu_notifier_invalidate_range_end(&range);
+ dec_tlb_flush_pending(mm);
+
+ mmap_write_unlock(mm);
+ } else {
+ mmap_read_unlock(mm);
+ }
+
+ mmput(mm);
+
+ if (ret < 0)
+ goto free_watch;
+
+ if (flags & MEMWATCH_SD_GET) {
+ ret = copy_to_user(vec, watch.vec, watch.index * sizeof(loff_t));
+ if (ret) {
+ ret = -EIO;
+ goto free_watch;
+ }
+ ret = watch.index;
+ } else {
+ ret = 0;
+ }
+
+free_watch:
+ if (flags & MEMWATCH_SD_GET)
+ vfree(watch.vec);
+put_task:
+ if (pidfd)
+ put_task_struct(task);
+
+ return ret;
+}
+#endif
+
+SYSCALL_DEFINE6(process_memwatch, int, pidfd, void __user *, start,
+ int, len, unsigned int, flags, loff_t __user *, vec, int, vec_len)
+{
+ int ret = -EPERM;
+
+#ifdef CONFIG_MEM_SOFT_DIRTY
+ ret = do_process_memwatch(pidfd, start, len, flags, vec, vec_len);
+#endif
+ return ret;
+}
--
2.30.2