maple tree change made it possible for VMA iteration to see same VMA twice due to late vma_merge() failure

From: Jann Horn
Date: Tue Aug 15 2023 - 15:38:29 EST


commit 18b098af2890 ("vma_merge: set vma iterator to correct
position.") added a vma_prev(vmi) call to vma_merge() at a point where
it's still possible to bail out. My understanding is that this moves
the VMA iterator back by one VMA.

If you patch some extra logging into the kernel and inject a fake
out-of-memory error at the vma_iter_prealloc() call in vma_split() (a
real out-of-memory error there is very unlikely to happen in practice,
I think - my understanding is that the kernel will basically kill
every process on the system except for init before it starts failing
GFP_KERNEL allocations that fit within a single slab, unless the
allocation uses GFP_ACCOUNT or stuff like that, which the maple tree
doesn't):

```
diff --git a/fs/userfaultfd.c b/fs/userfaultfd.c
index 7cecd49e078b..a7be4d6a5db6 100644
--- a/fs/userfaultfd.c
+++ b/fs/userfaultfd.c
@@ -1454,9 +1454,16 @@ static int userfaultfd_register(struct
userfaultfd_ctx *ctx,
prev = vma;

ret = 0;
+ if (strcmp(current->comm, "FAILME") == 0)
+ pr_warn("%s: begin vma iteration\n", __func__);
for_each_vma_range(vmi, vma, end) {
cond_resched();

+ if (strcmp(current->comm, "FAILME") == 0) {
+ pr_warn("%s: prev=%px, vma=%px (%016lx-%016lx)\n",
+ __func__, prev, vma, vma->vm_start,
vma->vm_end);
+ }
+
BUG_ON(!vma_can_userfault(vma, vm_flags));
BUG_ON(vma->vm_userfaultfd_ctx.ctx &&
vma->vm_userfaultfd_ctx.ctx != ctx);
@@ -1481,6 +1488,8 @@ static int userfaultfd_register(struct
userfaultfd_ctx *ctx,
vma_policy(vma),
((struct vm_userfaultfd_ctx){ ctx }),
anon_vma_name(vma));
+ if (strcmp(current->comm, "FAILME") == 0)
+ pr_warn("%s: vma_merge returned %px\n", __func__, prev);
if (prev) {
/* vma_merge() invalidated the mas */
vma = prev;
diff --git a/mm/mmap.c b/mm/mmap.c
index 3937479d0e07..fd435c40c43d 100644
--- a/mm/mmap.c
+++ b/mm/mmap.c
@@ -990,7 +990,7 @@ struct vm_area_struct *vma_merge(struct
vma_iterator *vmi, struct mm_struct *mm,
if (err)
return NULL;

- if (vma_iter_prealloc(vmi))
+ if (strcmp(current->comm, "FAILME")==0 || vma_iter_prealloc(vmi))
return NULL;

init_multi_vma_prep(&vp, vma, adjust, remove, remove2);
```

and then you run this reproducer:
```
#define _GNU_SOURCE
#include <err.h>
#include <stdlib.h>
#include <unistd.h>
#include <sys/syscall.h>
#include <sys/prctl.h>
#include <sys/mman.h>
#include <sys/ioctl.h>
#include <linux/userfaultfd.h>

#ifndef UFFD_USER_MODE_ONLY
#define UFFD_USER_MODE_ONLY 1
#endif

#define SYSCHK(x) ({ \
typeof(x) __res = (x); \
if (__res == (typeof(x))-1) \
err(1, "SYSCHK(" #x ")"); \
__res; \
})

int main(void) {
int uffd = SYSCHK(syscall(__NR_userfaultfd, UFFD_USER_MODE_ONLY));

struct uffdio_api api = { .api = UFFD_API, .features = 0 };
SYSCHK(ioctl(uffd, UFFDIO_API, &api));

/* create vma1 */
SYSCHK(mmap((void*)0x100000UL, 0x1000, PROT_READ|PROT_WRITE,
MAP_PRIVATE|MAP_ANONYMOUS|MAP_FIXED_NOREPLACE, -1, 0));

/* set uffd on vma1 */
struct uffdio_register reg1 = {
.range = { .start = 0x100000, .len = 0x1000 },
.mode = UFFDIO_REGISTER_MODE_MISSING
};
SYSCHK(ioctl(uffd, UFFDIO_REGISTER, &reg1));

/* create vma2 */
SYSCHK(mmap((void*)0x101000UL, 0x1000, PROT_READ|PROT_WRITE,
MAP_PRIVATE|MAP_ANONYMOUS|MAP_FIXED_NOREPLACE, -1, 0));

/* tries to merge vma2 into vma1, with injected allocation failure
causing merge failure */
SYSCHK(prctl(PR_SET_NAME, "FAILME"));
struct uffdio_register reg2 = {
.range = { .start = 0x101000, .len = 0x1000 },
.mode = UFFDIO_REGISTER_MODE_MISSING
};
SYSCHK(ioctl(uffd, UFFDIO_REGISTER, &reg2));
SYSCHK(prctl(PR_SET_NAME, "normal"));
}
```

then you'll get this fun log output, showing that the same VMA
(ffff88810c0b5e00) was visited by two iterations of the VMA iteration
loop, and on the second iteration, prev==vma:

[ 326.765586] userfaultfd_register: begin vma iteration
[ 326.766985] userfaultfd_register: prev=ffff88810c0b5ef0,
vma=ffff88810c0b5e00 (0000000000101000-0000000000102000)
[ 326.768786] userfaultfd_register: vma_merge returned 0000000000000000
[ 326.769898] userfaultfd_register: prev=ffff88810c0b5e00,
vma=ffff88810c0b5e00 (0000000000101000-0000000000102000)

I don't know if this can lead to anything bad but it seems pretty
clearly unintended?