[PATCH] mm: Update do_vmi_align_munmap() return semantics

From: Liam R. Howlett
Date: Fri Jun 30 2023 - 12:06:21 EST


Since do_vmi_align_munmap() will always honor the downgrade request on
success, the callers no longer have to deal with confusing return
codes.

Update do_vmi_align_munmap() to return 0 for success. Clean up the
callers and comments to always expect the lock downgrade to be honored
on the success path. The error path will always leave the lock
untouched.

As part of the cleanup, the wrapper function do_vmi_munmap() and callers
to the wrapper are also updated.

Suggested-by: Linus Torvalds <torvalds@xxxxxxxxxxxxxxxxxxxx>
Link: https://lore.kernel.org/linux-mm/20230629191414.1215929-1-willy@xxxxxxxxxxxxx/
Signed-off-by: Liam R. Howlett <Liam.Howlett@xxxxxxxxxx>
---
mm/mmap.c | 70 ++++++++++++++++++++++++++---------------------------
mm/mremap.c | 26 +++++++++-----------
2 files changed, 46 insertions(+), 50 deletions(-)

diff --git a/mm/mmap.c b/mm/mmap.c
index 141c618847ac..a970542d0055 100644
--- a/mm/mmap.c
+++ b/mm/mmap.c
@@ -193,8 +193,7 @@ SYSCALL_DEFINE1(brk, unsigned long, brk)
struct mm_struct *mm = current->mm;
struct vm_area_struct *brkvma, *next = NULL;
unsigned long min_brk;
- bool populate;
- bool downgraded = false;
+ bool populate = false;
LIST_HEAD(uf);
struct vma_iterator vmi;

@@ -236,13 +235,8 @@ SYSCALL_DEFINE1(brk, unsigned long, brk)
goto success;
}

- /*
- * Always allow shrinking brk.
- * do_vma_munmap() may downgrade mmap_lock to read.
- */
+ /* Always allow shrinking brk. */
if (brk <= mm->brk) {
- int ret;
-
/* Search one past newbrk */
vma_iter_init(&vmi, mm, newbrk);
brkvma = vma_find(&vmi, oldbrk);
@@ -250,19 +244,15 @@ SYSCALL_DEFINE1(brk, unsigned long, brk)
goto out; /* mapping intersects with an existing non-brk vma. */
/*
* mm->brk must be protected by write mmap_lock.
- * do_vma_munmap() may downgrade the lock, so update it
- * before calling do_vma_munmap().
+ * do_vma_munmap() will downgrade the lock on success, so
+ * update it before calling do_vma_munmap().
*/
mm->brk = brk;
- ret = do_vma_munmap(&vmi, brkvma, newbrk, oldbrk, &uf, true);
- if (ret == 1) {
- downgraded = true;
- goto success;
- } else if (!ret)
- goto success;
-
- mm->brk = origbrk;
- goto out;
+ if (do_vma_munmap(&vmi, brkvma, newbrk, oldbrk, &uf, true))
+ goto out;
+
+ mmap_read_unlock(mm);
+ goto success_unlocked;
}

if (check_brk_limits(oldbrk, newbrk - oldbrk))
@@ -283,19 +273,19 @@ SYSCALL_DEFINE1(brk, unsigned long, brk)
goto out;

mm->brk = brk;
+ if (mm->def_flags & VM_LOCKED)
+ populate = true;

success:
- populate = newbrk > oldbrk && (mm->def_flags & VM_LOCKED) != 0;
- if (downgraded)
- mmap_read_unlock(mm);
- else
- mmap_write_unlock(mm);
+ mmap_write_unlock(mm);
+success_unlocked:
userfaultfd_unmap_complete(mm, &uf);
if (populate)
mm_populate(oldbrk, newbrk - oldbrk);
return brk;

out:
+ mm->brk = origbrk;
mmap_write_unlock(mm);
return origbrk;
}
@@ -2428,9 +2418,11 @@ int split_vma(struct vma_iterator *vmi, struct vm_area_struct *vma,
* @start: The aligned start address to munmap.
* @end: The aligned end address to munmap.
* @uf: The userfaultfd list_head
- * @downgrade: Set to true to attempt a write downgrade of the mmap_lock
+ * @downgrade: Set to true to write downgrade the mmap_lock. Downgrade only
+ * happens on success.
*
- * If @downgrade is true, check return code for potential release of the lock.
+ * Return: 0 on success and downgrades the lock if so directed, error and leaves
+ * the lock held otherwise.
*/
static int
do_vmi_align_munmap(struct vma_iterator *vmi, struct vm_area_struct *vma,
@@ -2566,7 +2558,7 @@ do_vmi_align_munmap(struct vma_iterator *vmi, struct vm_area_struct *vma,


validate_mm(mm);
- return downgrade ? 1 : 0;
+ return 0;

clear_tree_failed:
userfaultfd_error:
@@ -2596,7 +2588,8 @@ do_vmi_align_munmap(struct vma_iterator *vmi, struct vm_area_struct *vma,
* to MA_START and sets it up to remove the mapping(s). The @len will be
* aligned and any arch_unmap work will be preformed.
*
- * Returns: -EINVAL on failure, 1 on success and unlock, 0 otherwise.
+ * Return: 0 on success and downgrades the lock if so directed, error and leaves
+ * the lock held otherwise.
*/
int do_vmi_munmap(struct vma_iterator *vmi, struct mm_struct *mm,
unsigned long start, size_t len, struct list_head *uf,
@@ -2617,8 +2610,11 @@ int do_vmi_munmap(struct vma_iterator *vmi, struct mm_struct *mm,

/* Find the first overlapping VMA */
vma = vma_find(vmi, end);
- if (!vma)
+ if (!vma) {
+ if (downgrade)
+ mmap_write_downgrade(mm);
return 0;
+ }

return do_vmi_align_munmap(vmi, vma, mm, start, end, uf, downgrade);
}
@@ -2628,6 +2624,8 @@ int do_vmi_munmap(struct vma_iterator *vmi, struct mm_struct *mm,
* @start: The start address to munmap
* @len: The length to be munmapped.
* @uf: The userfaultfd list_head
+ *
+ * Return: 0 on success, error otherwise.
*/
int do_munmap(struct mm_struct *mm, unsigned long start, size_t len,
struct list_head *uf)
@@ -2900,14 +2898,12 @@ static int __vm_munmap(unsigned long start, size_t len, bool downgrade)

ret = do_vmi_munmap(&vmi, mm, start, len, &uf, downgrade);
/*
- * Returning 1 indicates mmap_lock is downgraded.
- * But 1 is not legal return value of vm_munmap() and munmap(), reset
- * it to 0 before return.
+ * Returning 0 is successful, but the lock status depends on what was
+ * passed in.
*/
- if (ret == 1) {
+ if (!ret && downgrade)
mmap_read_unlock(mm);
- ret = 0;
- } else
+ else
mmap_write_unlock(mm);

userfaultfd_unmap_complete(mm, &uf);
@@ -3019,9 +3015,11 @@ SYSCALL_DEFINE5(remap_file_pages, unsigned long, start, unsigned long, size,
* @uf: The userfaultfd list_head
* @downgrade: Attempt to downgrade or not
*
- * Returns: 0 on success and not downgraded, 1 on success and downgraded.
* unmaps a VMA mapping when the vma iterator is already in position.
* Does not handle alignment.
+ *
+ * Return: 0 on success and downgrades the lock if so directed, error on failure
+ * and will still hold the lock.
*/
int do_vma_munmap(struct vma_iterator *vmi, struct vm_area_struct *vma,
unsigned long start, unsigned long end,
diff --git a/mm/mremap.c b/mm/mremap.c
index fe6b722ae633..0509354bf7e9 100644
--- a/mm/mremap.c
+++ b/mm/mremap.c
@@ -913,7 +913,6 @@ SYSCALL_DEFINE5(mremap, unsigned long, addr, unsigned long, old_len,
struct vm_area_struct *vma;
unsigned long ret = -EINVAL;
bool locked = false;
- bool downgraded = false;
struct vm_userfaultfd_ctx uf = NULL_VM_UFFD_CTX;
LIST_HEAD(uf_unmap_early);
LIST_HEAD(uf_unmap);
@@ -1002,21 +1001,22 @@ SYSCALL_DEFINE5(mremap, unsigned long, addr, unsigned long, old_len,
* downgrades mmap_lock to read if so directed.
*/
if (old_len >= new_len) {
- int retval;
VMA_ITERATOR(vmi, mm, addr + new_len);

- retval = do_vmi_munmap(&vmi, mm, addr + new_len,
- old_len - new_len, &uf_unmap, true);
- /* Returning 1 indicates mmap_lock is downgraded to read. */
- if (retval == 1) {
- downgraded = true;
- } else if (retval < 0 && old_len != new_len) {
- ret = retval;
+ if (old_len == new_len) {
+ ret = addr;
goto out;
}

+ ret = do_vmi_munmap(&vmi, mm, addr + new_len, old_len - new_len,
+ &uf_unmap, true);
+ if (ret)
+ goto out;
+
+ mmap_read_unlock(current->mm);
ret = addr;
- goto out;
+ goto out_unlocked;
+
}

/*
@@ -1101,12 +1101,10 @@ SYSCALL_DEFINE5(mremap, unsigned long, addr, unsigned long, old_len,
out:
if (offset_in_page(ret))
locked = false;
- if (downgraded)
- mmap_read_unlock(current->mm);
- else
- mmap_write_unlock(current->mm);
+ mmap_write_unlock(current->mm);
if (locked && new_len > old_len)
mm_populate(new_addr + old_len, new_len - old_len);
+out_unlocked:
userfaultfd_unmap_complete(mm, &uf_unmap_early);
mremap_userfaultfd_complete(&uf, addr, ret, old_len);
userfaultfd_unmap_complete(mm, &uf_unmap);
--
2.39.2