[PATCH v2] mm: Update do_vmi_align_munmap() return semantics

From: Liam R. Howlett
Date: Thu Jun 29 2023 - 22:28:16 EST


Since do_vmi_align_munmap() will always honor the downgrade request on
the success, the callers no longer have to deal with confusing return
codes. Since all callers that request downgrade actually want the lock
to be dropped, change the downgrade to an unlock request.

Note that the lock still needs to be held in read mode during the page
table clean up to avoid races with a map request.

Update do_vmi_align_munmap() to return 0 for success. Clean up the
callers and comments to always expect the unlock to be honored on the
success path. The error path will always leave the lock untouched.

As part of the cleanup, the wrapper function do_vmi_munmap() and callers
to the wrapper are also updated.

Suggested-by: Linus Torvalds <torvalds@xxxxxxxxxxxxxxxxxxxx>
Link: https://lore.kernel.org/linux-mm/20230629191414.1215929-1-willy@infra=
dead.org/
Signed-off-by: Liam R. Howlett <Liam.Howlett@xxxxxxxxxx>
---
include/linux/mm.h | 4 +-
mm/mmap.c | 94 +++++++++++++++++++++-------------------------
mm/mremap.c | 28 ++++++--------
3 files changed, 57 insertions(+), 69 deletions(-)

diff --git a/include/linux/mm.h b/include/linux/mm.h
index 39aa409e84d5..95dbc073b4d0 100644
--- a/include/linux/mm.h
+++ b/include/linux/mm.h
@@ -3176,7 +3176,7 @@ extern unsigned long do_mmap(struct file *file, unsig=
ned long addr,
unsigned long pgoff, unsigned long *populate, struct list_head *uf);
extern int do_vmi_munmap(struct vma_iterator *vmi, struct mm_struct *mm,
unsigned long start, size_t len, struct list_head *uf,
- bool downgrade);
+ bool unlock);
extern int do_munmap(struct mm_struct *, unsigned long, size_t,
struct list_head *uf);
extern int do_madvise(struct mm_struct *mm, unsigned long start, size_t le=
n_in, int behavior);
@@ -3184,7 +3184,7 @@ extern int do_madvise(struct mm_struct *mm, unsigned =
long start, size_t len_in,
#ifdef CONFIG_MMU
extern int do_vma_munmap(struct vma_iterator *vmi, struct vm_area_struct *=
vma,
unsigned long start, unsigned long end,
- struct list_head *uf, bool downgrade);
+ struct list_head *uf, bool unlock);
extern int __mm_populate(unsigned long addr, unsigned long len,
int ignore_errors);
static inline void mm_populate(unsigned long addr, unsigned long len)
diff --git a/mm/mmap.c b/mm/mmap.c
index 141c618847ac..51e70fa98450 100644
--- a/mm/mmap.c
+++ b/mm/mmap.c
@@ -193,8 +193,7 @@ SYSCALL_DEFINE1(brk, unsigned long, brk)
struct mm_struct *mm =3D current->mm;
struct vm_area_struct *brkvma, *next =3D NULL;
unsigned long min_brk;
- bool populate;
- bool downgraded =3D false;
+ bool populate =3D false;
LIST_HEAD(uf);
struct vma_iterator vmi;
=20
@@ -236,13 +235,8 @@ SYSCALL_DEFINE1(brk, unsigned long, brk)
goto success;
}
=20
- /*
- * Always allow shrinking brk.
- * do_vma_munmap() may downgrade mmap_lock to read.
- */
+ /* Always allow shrinking brk. */
if (brk <=3D mm->brk) {
- int ret;
-
/* Search one past newbrk */
vma_iter_init(&vmi, mm, newbrk);
brkvma =3D vma_find(&vmi, oldbrk);
@@ -250,19 +244,14 @@ SYSCALL_DEFINE1(brk, unsigned long, brk)
goto out; /* mapping intersects with an existing non-brk vma. */
/*
* mm->brk must be protected by write mmap_lock.
- * do_vma_munmap() may downgrade the lock, so update it
+ * do_vma_munmap() will drop the lock on success, so update it
* before calling do_vma_munmap().
*/
mm->brk =3D brk;
- ret =3D do_vma_munmap(&vmi, brkvma, newbrk, oldbrk, &uf, true);
- if (ret =3D=3D 1) {
- downgraded =3D true;
- goto success;
- } else if (!ret)
- goto success;
-
- mm->brk =3D origbrk;
- goto out;
+ if (do_vma_munmap(&vmi, brkvma, newbrk, oldbrk, &uf, true))
+ goto out;
+
+ goto success_unlocked;
}
=20
if (check_brk_limits(oldbrk, newbrk - oldbrk))
@@ -283,19 +272,19 @@ SYSCALL_DEFINE1(brk, unsigned long, brk)
goto out;
=20
mm->brk =3D brk;
+ if (mm->def_flags & VM_LOCKED)
+ populate =3D true;
=20
success:
- populate =3D newbrk > oldbrk && (mm->def_flags & VM_LOCKED) !=3D 0;
- if (downgraded)
- mmap_read_unlock(mm);
- else
- mmap_write_unlock(mm);
+ mmap_write_unlock(mm);
+success_unlocked:
userfaultfd_unmap_complete(mm, &uf);
if (populate)
mm_populate(oldbrk, newbrk - oldbrk);
return brk;
=20
out:
+ mm->brk =3D origbrk;
mmap_write_unlock(mm);
return origbrk;
}
@@ -2428,14 +2417,16 @@ int split_vma(struct vma_iterator *vmi, struct vm_a=
rea_struct *vma,
* @start: The aligned start address to munmap.
* @end: The aligned end address to munmap.
* @uf: The userfaultfd list_head
- * @downgrade: Set to true to attempt a write downgrade of the mmap_lock
+ * @unlock: Set to true to drop the mmap_lock. unlocking only happens on
+ * success.
*
- * If @downgrade is true, check return code for potential release of the l=
ock.
+ * Return: 0 on success and drops the lock if so directed, error and leave=
s the
+ * lock held otherwise.
*/
static int
do_vmi_align_munmap(struct vma_iterator *vmi, struct vm_area_struct *vma,
struct mm_struct *mm, unsigned long start,
- unsigned long end, struct list_head *uf, bool downgrade)
+ unsigned long end, struct list_head *uf, bool unlock)
{
struct vm_area_struct *prev, *next =3D NULL;
struct maple_tree mt_detach;
@@ -2551,22 +2542,24 @@ do_vmi_align_munmap(struct vma_iterator *vmi, struc=
t vm_area_struct *vma,
/* Point of no return */
mm->locked_vm -=3D locked_vm;
mm->map_count -=3D count;
- if (downgrade)
+ if (unlock)
mmap_write_downgrade(mm);
=20
/*
* We can free page tables without write-locking mmap_lock because VMAs
* were isolated before we downgraded mmap_lock.
*/
- unmap_region(mm, &mt_detach, vma, prev, next, start, end, !downgrade);
+ unmap_region(mm, &mt_detach, vma, prev, next, start, end, !unlock);
/* Statistics and freeing VMAs */
mas_set(&mas_detach, start);
remove_mt(mm, &mas_detach);
__mt_destroy(&mt_detach);
+ if (unlock)
+ mmap_read_unlock(mm);
=20
=20
validate_mm(mm);
- return downgrade ? 1 : 0;
+ return 0;
=20
clear_tree_failed:
userfaultfd_error:
@@ -2589,18 +2582,18 @@ do_vmi_align_munmap(struct vma_iterator *vmi, struc=
t vm_area_struct *vma,
* @start: The start address to munmap
* @len: The length of the range to munmap
* @uf: The userfaultfd list_head
- * @downgrade: set to true if the user wants to attempt to write_downgrade=
the
- * mmap_lock
+ * @unlock: set to true if the user wants to drop the mmap_lock on success
*
* This function takes a @mas that is either pointing to the previous VMA =
or set
* to MA_START and sets it up to remove the mapping(s). The @len will be
* aligned and any arch_unmap work will be preformed.
*
- * Returns: -EINVAL on failure, 1 on success and unlock, 0 otherwise.
+ * Return: 0 on success and drops the lock if so directed, error and leave=
s the
+ * lock held otherwise.
*/
int do_vmi_munmap(struct vma_iterator *vmi, struct mm_struct *mm,
unsigned long start, size_t len, struct list_head *uf,
- bool downgrade)
+ bool unlock)
{
unsigned long end;
struct vm_area_struct *vma;
@@ -2617,10 +2610,13 @@ int do_vmi_munmap(struct vma_iterator *vmi, struct =
mm_struct *mm,
=20
/* Find the first overlapping VMA */
vma =3D vma_find(vmi, end);
- if (!vma)
+ if (!vma) {
+ if (unlock)
+ mmap_write_unlock(mm);
return 0;
+ }
=20
- return do_vmi_align_munmap(vmi, vma, mm, start, end, uf, downgrade);
+ return do_vmi_align_munmap(vmi, vma, mm, start, end, uf, unlock);
}
=20
/* do_munmap() - Wrapper function for non-maple tree aware do_munmap() cal=
ls.
@@ -2628,6 +2624,8 @@ int do_vmi_munmap(struct vma_iterator *vmi, struct mm=
_struct *mm,
* @start: The start address to munmap
* @len: The length to be munmapped.
* @uf: The userfaultfd list_head
+ *
+ * Return: 0 on success, error otherwise.
*/
int do_munmap(struct mm_struct *mm, unsigned long start, size_t len,
struct list_head *uf)
@@ -2888,7 +2886,7 @@ unsigned long mmap_region(struct file *file, unsigned=
long addr,
return error;
}
=20
-static int __vm_munmap(unsigned long start, size_t len, bool downgrade)
+static int __vm_munmap(unsigned long start, size_t len, bool unlock)
{
int ret;
struct mm_struct *mm =3D current->mm;
@@ -2898,16 +2896,8 @@ static int __vm_munmap(unsigned long start, size_t l=
en, bool downgrade)
if (mmap_write_lock_killable(mm))
return -EINTR;
=20
- ret =3D do_vmi_munmap(&vmi, mm, start, len, &uf, downgrade);
- /*
- * Returning 1 indicates mmap_lock is downgraded.
- * But 1 is not legal return value of vm_munmap() and munmap(), reset
- * it to 0 before return.
- */
- if (ret =3D=3D 1) {
- mmap_read_unlock(mm);
- ret =3D 0;
- } else
+ ret =3D do_vmi_munmap(&vmi, mm, start, len, &uf, unlock);
+ if (ret || !unlock)
mmap_write_unlock(mm);
=20
userfaultfd_unmap_complete(mm, &uf);
@@ -3017,21 +3007,23 @@ SYSCALL_DEFINE5(remap_file_pages, unsigned long, st=
art, unsigned long, size,
* @start: the start of the address to unmap
* @end: The end of the address to unmap
* @uf: The userfaultfd list_head
- * @downgrade: Attempt to downgrade or not
+ * @unlock: Drop the lock on success
*
- * Returns: 0 on success and not downgraded, 1 on success and downgraded.
* unmaps a VMA mapping when the vma iterator is already in position.
* Does not handle alignment.
+ *
+ * Return: 0 on success drops the lock of so directed, error on failure an=
d will
+ * still hold the lock.
*/
int do_vma_munmap(struct vma_iterator *vmi, struct vm_area_struct *vma,
- unsigned long start, unsigned long end,
- struct list_head *uf, bool downgrade)
+ unsigned long start, unsigned long end, struct list_head *uf,
+ bool unlock)
{
struct mm_struct *mm =3D vma->vm_mm;
int ret;
=20
arch_unmap(mm, start, end);
- ret =3D do_vmi_align_munmap(vmi, vma, mm, start, end, uf, downgrade);
+ ret =3D do_vmi_align_munmap(vmi, vma, mm, start, end, uf, unlock);
validate_mm(mm);
return ret;
}
diff --git a/mm/mremap.c b/mm/mremap.c
index fe6b722ae633..11e06e4ab33b 100644
--- a/mm/mremap.c
+++ b/mm/mremap.c
@@ -715,7 +715,7 @@ static unsigned long move_vma(struct vm_area_struct *vm=
a,
}
=20
vma_iter_init(&vmi, mm, old_addr);
- if (do_vmi_munmap(&vmi, mm, old_addr, old_len, uf_unmap, false) < 0) {
+ if (!do_vmi_munmap(&vmi, mm, old_addr, old_len, uf_unmap, false)) {
/* OOM: unable to split vma, just get accounts right */
if (vm_flags & VM_ACCOUNT && !(flags & MREMAP_DONTUNMAP))
vm_acct_memory(old_len >> PAGE_SHIFT);
@@ -913,7 +913,6 @@ SYSCALL_DEFINE5(mremap, unsigned long, addr, unsigned l=
ong, old_len,
struct vm_area_struct *vma;
unsigned long ret =3D -EINVAL;
bool locked =3D false;
- bool downgraded =3D false;
struct vm_userfaultfd_ctx uf =3D NULL_VM_UFFD_CTX;
LIST_HEAD(uf_unmap_early);
LIST_HEAD(uf_unmap);
@@ -999,24 +998,23 @@ SYSCALL_DEFINE5(mremap, unsigned long, addr, unsigned=
long, old_len,
* Always allow a shrinking remap: that just unmaps
* the unnecessary pages..
* do_vmi_munmap does all the needed commit accounting, and
- * downgrades mmap_lock to read if so directed.
+ * unlocks the mmap_lock if so directed.
*/
if (old_len >=3D new_len) {
- int retval;
VMA_ITERATOR(vmi, mm, addr + new_len);
=20
- retval =3D do_vmi_munmap(&vmi, mm, addr + new_len,
- old_len - new_len, &uf_unmap, true);
- /* Returning 1 indicates mmap_lock is downgraded to read. */
- if (retval =3D=3D 1) {
- downgraded =3D true;
- } else if (retval < 0 && old_len !=3D new_len) {
- ret =3D retval;
+ if (old_len =3D=3D new_len) {
+ ret =3D addr;
goto out;
}
=20
+ ret =3D do_vmi_munmap(&vmi, mm, addr + new_len, old_len - new_len,
+ &uf_unmap, true);
+ if (ret)
+ goto out;
+
ret =3D addr;
- goto out;
+ goto out_unlocked;
}
=20
/*
@@ -1101,12 +1099,10 @@ SYSCALL_DEFINE5(mremap, unsigned long, addr, unsign=
ed long, old_len,
out:
if (offset_in_page(ret))
locked =3D false;
- if (downgraded)
- mmap_read_unlock(current->mm);
- else
- mmap_write_unlock(current->mm);
+ mmap_write_unlock(current->mm);
if (locked && new_len > old_len)
mm_populate(new_addr + old_len, new_len - old_len);
+out_unlocked:
userfaultfd_unmap_complete(mm, &uf_unmap_early);
mremap_userfaultfd_complete(&uf, addr, ret, old_len);
userfaultfd_unmap_complete(mm, &uf_unmap);
--=20
2.39.2


--6yiabyfxthbyq3ua--