Re: [PATCH] allow vma merging with mlock et al.

From: Chris Wright
Date: Mon Feb 28 2005 - 15:35:01 EST


* Hugh Dickins (hugh@xxxxxxxxxxx) wrote:
> On Fri, 25 Feb 2005, Chris Wright wrote:
> >
> > Actually I think it winds up being fine since we don't do merging with
> > mlock. But why not? Patch below remedies that.
>
> > Successive mlock/munlock calls can leave fragmented vmas because they can
> > be split but not merged. Give mlock et al. full vma merging support.
>
> Phew, you followed mprotect, saving me from having to think too deeply
> about the correctness of this (I'm assuming mprotect is perfect ;))

Heh, same assumption here ;-)

> Some remarks then on the three places where you departed from mprotect.
>
> > ===== mm/mlock.c 1.19 vs edited =====
> > --- 1.19/mm/mlock.c	2005-02-11 11:07:35 -08:00
> > +++ edited/mm/mlock.c	2005-02-24 23:53:10 -08:00
> > @@ -7,18 +7,32 @@
> >
> >  #include <linux/mman.h>
> >  #include <linux/mm.h>
> > +#include <linux/mempolicy.h>
> >  #include <linux/syscalls.h>
> >
> >
> > -static int mlock_fixup(struct vm_area_struct * vma,
> > +static int mlock_fixup(struct vm_area_struct *vma, struct vm_area_struct **prev,
> >  	unsigned long start, unsigned long end, unsigned int newflags)
> >  {
> >  	struct mm_struct * mm = vma->vm_mm;
> > +	pgoff_t pgoff;
> >  	int pages;
> >  	int ret = 0;
> >
> > -	if (newflags == vma->vm_flags)
> > +	if (newflags == vma->vm_flags) {
> > +		*prev = vma;
> >  		goto out;
> > +	}
> > +
> > +	pgoff = vma->vm_pgoff + ((start - vma->vm_start) >> PAGE_SHIFT);
> > +	*prev = vma_merge(mm, *prev, start, end, newflags, vma->anon_vma,
> > +			  vma->vm_file, pgoff, vma_policy(vma));
> > +	if (*prev) {
> > +		vma = *prev;
> > +		goto success;
> > +	}
> > +
> > +	*prev = vma;
>
> You've raised that line higher (so do_mlockall's "Ignore errors" works):
> that's an improvement because it saves special-casing errors; I'd like
> you to make the same adjustment to mprotect_fixup, even though it's not
> required there (and delete "Unless it returns an error, " from the comment).
> Let's keep the two in step.

Sure, makes sense.
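
For anyone following along: what makes the ordering matter is the
do_mlockall() caller, which deliberately ignores the return value but
still advances its walk through prev. Roughly (illustrative fragment of
the updated loop, not the whole function):

	/* do_mlockall(): errors are ignored, yet prev must still point
	 * at a valid vma afterwards, even when split_vma() fails inside
	 * mlock_fixup(). */
	for (vma = current->mm->mmap; vma; vma = prev->vm_next) {
		...
		/* Ignore errors */
		mlock_fixup(vma, &prev, vma->vm_start, vma->vm_end, newflags);
	}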

> >  	if (start != vma->vm_start) {
> >  		ret = split_vma(mm, vma, start, 1);
> > @@ -32,6 +46,7 @@ static int mlock_fixup(struct vm_area_st
> >  			goto out;
> >  	}
> >
> > +success:
> >  	/*
> >  	 * vm_flags is protected by the mmap_sem held in write mode.
> >  	 * It's okay if try_to_unmap_one unmaps a page just after we
> > @@ -59,7 +74,7 @@ out:
> >  static int do_mlock(unsigned long start, size_t len, int on)
> >  {
> >  	unsigned long nstart, end, tmp;
> > -	struct vm_area_struct * vma, * next;
> > +	struct vm_area_struct * vma, * prev;
> >  	int error;
> >
> >  	len = PAGE_ALIGN(len);
> > @@ -68,7 +83,7 @@ static int do_mlock(unsigned long start,
> >  		return -EINVAL;
> >  	if (end == start)
> >  		return 0;
> > -	vma = find_vma(current->mm, start);
> > +	vma = find_vma_prev(current->mm, start, &prev);
> >  	if (!vma || vma->vm_start > start)
> >  		return -ENOMEM;
>
> But here sys_mprotect also says:
>
> 	if (start > vma->vm_start)
> 		prev = vma;
>
> Perhaps you've worked your way through vma_merge and convinced yourself
> this is never necessary; that's quite possible, but I'd still be happier
> if you were to add it into your do_mlock: it limits the cases vma_merge
> has to worry about. Or if you feel strongly about it, explain why I'm
> just being silly, and delete it from mprotect too.

No, I think you're right, and it measurably improves (~5%) the worst-case
benchmark I was doing, thanks.
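
Concretely, do_mlock now starts out the same way sys_mprotect does:

	vma = find_vma_prev(current->mm, start, &prev);
	if (!vma || vma->vm_start > start)
		return -ENOMEM;

	/* If start falls strictly inside vma, the region can never merge
	 * backwards across vma's untouched head, so begin the merge walk
	 * at vma itself. */
	if (start > vma->vm_start)
		prev = vma;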

> > @@ -81,18 +96,19 @@ static int do_mlock(unsigned long start,
> >  		if (!on)
> >  			newflags &= ~VM_LOCKED;
> >
> > -		if (vma->vm_end >= end) {
> > -			error = mlock_fixup(vma, nstart, end, newflags);
> > -			break;
> > -		}
> > -
> >  		tmp = vma->vm_end;
> > -		next = vma->vm_next;
> > -		error = mlock_fixup(vma, nstart, tmp, newflags);
> > +		if (tmp > end)
> > +			tmp = end;
> > +		error = mlock_fixup(vma, &prev, nstart, tmp, newflags);
> >  		if (error)
> >  			break;
> >  		nstart = tmp;
> > -		vma = next;
> > +		if (nstart < prev->vm_end)
> > +			nstart = prev->vm_end;
> > +		if (nstart >= end)
> > +			break;
> > +
> > +		vma = prev->vm_next;
> >  		if (!vma || vma->vm_start != nstart) {
> >  			error = -ENOMEM;
> >  			break;
> > @@ -141,7 +157,7 @@ asmlinkage long sys_munlock(unsigned lon
> >
> >  static int do_mlockall(int flags)
> >  {
> > -	struct vm_area_struct * vma;
> > +	struct vm_area_struct * vma, * prev;
> >  	unsigned int def_flags = 0;
> >
> >  	if (flags & MCL_FUTURE)
> > @@ -150,7 +166,7 @@ static int do_mlockall(int flags)
> >  	if (flags == MCL_FUTURE)
> >  		goto out;
> >
> > -	for (vma = current->mm->mmap; vma ; vma = vma->vm_next) {
> > +	for (prev = vma = current->mm->mmap; vma ; vma = vma->vm_next) {
>
> Here prev should be initialized to NULL, rather than the first vma.
> Again, you've probably worked out that it's safe as you've written it,
> but vma_merge does expect prev to be NULL at the beginning.

No, it was a bug.
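
(For reference: vma_merge() picks its merge candidate relative to prev,
along the lines of this abridged fragment from mm/mmap.c of this era:

	if (prev)
		next = prev->vm_next;
	else
		next = mm->mmap;

so a prev that already points at the first vma makes the very first
fixup consider the wrong neighbours.)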

> >  		unsigned int newflags;
> >
> >  		newflags = vma->vm_flags | VM_LOCKED;
> > @@ -158,7 +174,8 @@ static int do_mlockall(int flags)
> >  			newflags &= ~VM_LOCKED;
> >
> >  		/* Ignore errors */
> > -		mlock_fixup(vma, vma->vm_start, vma->vm_end, newflags);
> > +		mlock_fixup(vma, &prev, vma->vm_start, vma->vm_end, newflags);
> > +		vma = prev;
>
> Scrap that "vma = prev;" line, just say "vma = prev->vm_next" in the loop?

Yup, thanks for reviewing. Updated patch below.
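
For the curious, the fragmentation is easy to see from userspace with a
throwaway test like this (hypothetical demo, not part of the patch):
lock every other page of an anonymous mapping, unlock the whole thing,
then count the entries in /proc/self/maps:

	#include <stdlib.h>
	#include <sys/mman.h>
	#include <unistd.h>

	int main(void)
	{
		long page = sysconf(_SC_PAGESIZE);
		char *p;
		int i;

		p = mmap(NULL, 8 * page, PROT_READ | PROT_WRITE,
			 MAP_PRIVATE | MAP_ANONYMOUS, -1, 0);
		if (p == MAP_FAILED)
			return 1;
		for (i = 0; i < 8; i += 2)	/* each mlock splits the vma */
			mlock(p + i * page, page);
		munlock(p, 8 * page);	/* without merging, the splits remain */
		system("wc -l /proc/self/maps");
		return 0;
	}

With the patch applied the final munlock should leave the mapping as a
single vma again, instead of the eight pieces it fragments into today.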

thanks,
-chris
--

Successive mlock/munlock calls can leave fragmented vmas because they can
be split but not merged. Give mlock et al. full vma merging support.
While we're at it, move the *pprev assignment above the first split_vma in
mprotect_fixup, to keep it in step with mlock_fixup (which, for mlockall,
ignores errors yet still needs a valid prev pointer).

Signed-off-by: Chris Wright <chrisw@xxxxxxxx>

===== mm/mlock.c 1.19 vs edited =====
--- 1.19/mm/mlock.c	2005-02-11 11:07:35 -08:00
+++ edited/mm/mlock.c	2005-02-28 11:08:23 -08:00
@@ -7,18 +7,32 @@
 
 #include <linux/mman.h>
 #include <linux/mm.h>
+#include <linux/mempolicy.h>
 #include <linux/syscalls.h>
 
 
-static int mlock_fixup(struct vm_area_struct * vma,
+static int mlock_fixup(struct vm_area_struct *vma, struct vm_area_struct **prev,
 	unsigned long start, unsigned long end, unsigned int newflags)
 {
 	struct mm_struct * mm = vma->vm_mm;
+	pgoff_t pgoff;
 	int pages;
 	int ret = 0;
 
-	if (newflags == vma->vm_flags)
+	if (newflags == vma->vm_flags) {
+		*prev = vma;
 		goto out;
+	}
+
+	pgoff = vma->vm_pgoff + ((start - vma->vm_start) >> PAGE_SHIFT);
+	*prev = vma_merge(mm, *prev, start, end, newflags, vma->anon_vma,
+			  vma->vm_file, pgoff, vma_policy(vma));
+	if (*prev) {
+		vma = *prev;
+		goto success;
+	}
+
+	*prev = vma;
 
 	if (start != vma->vm_start) {
 		ret = split_vma(mm, vma, start, 1);
@@ -32,6 +46,7 @@ static int mlock_fixup(struct vm_area_st
 			goto out;
 	}
 
+success:
 	/*
 	 * vm_flags is protected by the mmap_sem held in write mode.
 	 * It's okay if try_to_unmap_one unmaps a page just after we
@@ -59,7 +74,7 @@ out:
 static int do_mlock(unsigned long start, size_t len, int on)
 {
 	unsigned long nstart, end, tmp;
-	struct vm_area_struct * vma, * next;
+	struct vm_area_struct * vma, * prev;
 	int error;
 
 	len = PAGE_ALIGN(len);
@@ -68,10 +83,13 @@ static int do_mlock(unsigned long start,
 		return -EINVAL;
 	if (end == start)
 		return 0;
-	vma = find_vma(current->mm, start);
+	vma = find_vma_prev(current->mm, start, &prev);
 	if (!vma || vma->vm_start > start)
 		return -ENOMEM;
 
+	if (start > vma->vm_start)
+		prev = vma;
+
 	for (nstart = start ; ; ) {
 		unsigned int newflags;
 
@@ -81,18 +99,19 @@ static int do_mlock(unsigned long start,
 		if (!on)
 			newflags &= ~VM_LOCKED;
 
-		if (vma->vm_end >= end) {
-			error = mlock_fixup(vma, nstart, end, newflags);
-			break;
-		}
-
 		tmp = vma->vm_end;
-		next = vma->vm_next;
-		error = mlock_fixup(vma, nstart, tmp, newflags);
+		if (tmp > end)
+			tmp = end;
+		error = mlock_fixup(vma, &prev, nstart, tmp, newflags);
 		if (error)
 			break;
 		nstart = tmp;
-		vma = next;
+		if (nstart < prev->vm_end)
+			nstart = prev->vm_end;
+		if (nstart >= end)
+			break;
+
+		vma = prev->vm_next;
 		if (!vma || vma->vm_start != nstart) {
 			error = -ENOMEM;
 			break;
@@ -141,7 +160,7 @@ asmlinkage long sys_munlock(unsigned lon
 
 static int do_mlockall(int flags)
 {
-	struct vm_area_struct * vma;
+	struct vm_area_struct * vma, * prev = NULL;
 	unsigned int def_flags = 0;
 
 	if (flags & MCL_FUTURE)
@@ -150,7 +169,7 @@ static int do_mlockall(int flags)
 	if (flags == MCL_FUTURE)
 		goto out;
 
-	for (vma = current->mm->mmap; vma ; vma = vma->vm_next) {
+	for (vma = current->mm->mmap; vma ; vma = prev->vm_next) {
 		unsigned int newflags;
 
 		newflags = vma->vm_flags | VM_LOCKED;
@@ -158,7 +177,7 @@ static int do_mlockall(int flags)
 			newflags &= ~VM_LOCKED;
 
 		/* Ignore errors */
-		mlock_fixup(vma, vma->vm_start, vma->vm_end, newflags);
+		mlock_fixup(vma, &prev, vma->vm_start, vma->vm_end, newflags);
 	}
 out:
 	return 0;
===== mm/mprotect.c 1.40 vs edited =====
--- 1.40/mm/mprotect.c	2004-12-22 01:31:46 -08:00
+++ edited/mm/mprotect.c	2005-02-28 11:09:39 -08:00
@@ -185,16 +185,13 @@ mprotect_fixup(struct vm_area_struct *vm
 		goto success;
 	}
 
+	*pprev = vma;
+
 	if (start != vma->vm_start) {
 		error = split_vma(mm, vma, start, 1);
 		if (error)
 			goto fail;
 	}
-	/*
-	 * Unless it returns an error, this function always sets *pprev to
-	 * the first vma for which vma->vm_end >= end.
-	 */
-	*pprev = vma;
 
 	if (end != vma->vm_end) {
 		error = split_vma(mm, vma, end, 0);