[PATCH] mm: make munlock fast when mlock is canceled by sigkill

From: Hiroaki Wakabayashi
Date: Sat Aug 22 2009 - 06:14:53 EST


This patch completes commit 4779280d1e (mm: make get_user_pages()
interruptible).

Originally, munlock() assumed that all pages in the vma were pinned.

Since that commit, mlock() can be interrupted by SIGKILL etc., so some of
the pages may not be pinned.
When the process is killed by SIGKILL, munlock is called in the exit() path
to unlock the pinned pages in the vma.

However, get_user_pages(write) is used for munlock() there, so pages are
allocated via page faults for an exiting process! This is a problem when
canceling a big mlock.
This patch tries to avoid allocating new pages at munlock().

mlock( big area )
<===== sig kill
do_exit()
->mmput()
-> do_munlock()
-> get_user_pages()
<allocate *never used* memory>
->.....freeing allocated memory.

* Test program
% cat run.sh
#!/bin/sh

./mlock_test 2000000000 &	# start the mlock test in the background
sleep 2				# wait two seconds before killing it
kill -9 $!			# send SIGKILL to the background job
wait				# wait for the killed job to exit

% cat mlock_test.c
#include <stdio.h>
#include <stdlib.h>
#include <string.h>
#include <sys/mman.h>
#include <sys/types.h>
#include <sys/stat.h>
#include <fcntl.h>
#include <errno.h>
#include <time.h>
#include <unistd.h>
#include <sys/time.h>

/*
 * mlock_test - map an anonymous private region, mlock() it, munlock() it
 * and unmap it again, printing how long the lock and unlock steps took.
 *
 * argv[1] (optional): region length in bytes as a decimal number;
 *                     defaults to 50 MiB when absent.
 *
 * Returns 0 on success; exits with EXIT_FAILURE as soon as any step fails.
 */
int main(int argc, char **argv)
{
	size_t length = 50 * 1024 * 1024;
	void *addr;
	time_t timer;

	if (argc >= 2) {
		char *end;
		unsigned long val;

		/* strtoul() reports range errors only through errno. */
		errno = 0;
		val = strtoul(argv[1], &end, 10);
		if (end == argv[1] || *end != '\0' || errno == ERANGE) {
			fprintf(stderr, "invalid length: %s\n", argv[1]);
			exit(EXIT_FAILURE);
		}
		length = val;
	}

	printf("PID = %d\n", (int)getpid());

	addr = mmap(NULL, length, PROT_READ | PROT_WRITE,
		    MAP_PRIVATE | MAP_ANONYMOUS, -1, 0);
	if (addr == MAP_FAILED) {
		/* %zu is the matching conversion for size_t. */
		fprintf(stderr, "mmap failed: %s, length=%zu\n",
			strerror(errno), length);
		exit(EXIT_FAILURE);
	}

	printf("try mlock length=%zu\n", length);
	timer = time(NULL);
	if (mlock(addr, length) < 0) {
		/* time_t has no printf conversion of its own; cast it. */
		fprintf(stderr, "mlock failed: %s, time=%ld[sec]\n",
			strerror(errno), (long)(time(NULL) - timer));
		exit(EXIT_FAILURE);
	}
	printf("mlock succeed, time=%ld[sec]\n\n",
	       (long)(time(NULL) - timer));

	printf("try munlock length=%zu\n", length);
	timer = time(NULL);
	if (munlock(addr, length) < 0) {
		fprintf(stderr, "munlock failed: %s, time=%ld[sec]\n",
			strerror(errno), (long)(time(NULL) - timer));
		exit(EXIT_FAILURE);
	}
	printf("munlock succeed, time=%ld[sec]\n\n",
	       (long)(time(NULL) - timer));

	if (munmap(addr, length) < 0) {
		fprintf(stderr, "munmap failed: %s\n", strerror(errno));
		exit(EXIT_FAILURE);
	}
	return 0;
}

* Executed Result
-- Original executed result
% time ./run.sh

PID = 2678
try mlock length=2000000000
./run.sh: line 6: 2678 Killed ./mlock_test 2000000000
./run.sh 0.00s user 2.59s system 13% cpu 18.781 total
%

-- After applying this patch
% time ./run.sh

PID = 2512
try mlock length=2000000000
./run.sh: line 6: 2512 Killed ./mlock_test 2000000000
./run.sh 0.00s user 1.15s system 45% cpu 2.507 total
%

Signed-off-by: Hiroaki Wakabayashi <primulaelatior@xxxxxxxxx>
---
mm/internal.h | 1 +
mm/memory.c | 9 +++++++--
mm/mlock.c | 35 +++++++++++++++++++----------------
3 files changed, 27 insertions(+), 18 deletions(-)

diff --git a/mm/internal.h b/mm/internal.h
index f290c4d..4ab5b24 100644
--- a/mm/internal.h
+++ b/mm/internal.h
@@ -254,6 +254,7 @@ static inline void
mminit_validate_memmodel_limits(unsigned long *start_pfn,
#define GUP_FLAGS_FORCE 0x2
#define GUP_FLAGS_IGNORE_VMA_PERMISSIONS 0x4
#define GUP_FLAGS_IGNORE_SIGKILL 0x8
+#define GUP_FLAGS_ALLOW_NULL 0x10

int __get_user_pages(struct task_struct *tsk, struct mm_struct *mm,
unsigned long start, int len, int flags,
diff --git a/mm/memory.c b/mm/memory.c
index aede2ce..b41fbf9 100644
--- a/mm/memory.c
+++ b/mm/memory.c
@@ -1217,6 +1217,7 @@ int __get_user_pages(struct task_struct *tsk,
struct mm_struct *mm,
int force = !!(flags & GUP_FLAGS_FORCE);
int ignore = !!(flags & GUP_FLAGS_IGNORE_VMA_PERMISSIONS);
int ignore_sigkill = !!(flags & GUP_FLAGS_IGNORE_SIGKILL);
+ int allow_null = !!(flags & GUP_FLAGS_ALLOW_NULL);

if (nr_pages <= 0)
return 0;
@@ -1312,6 +1313,8 @@ int __get_user_pages(struct task_struct *tsk,
struct mm_struct *mm,
while (!(page = follow_page(vma, start, foll_flags))) {
int ret;

+ if (allow_null)
+ break;
ret = handle_mm_fault(mm, vma, start,
(foll_flags & FOLL_WRITE) ?
FAULT_FLAG_WRITE : 0);
@@ -1351,8 +1354,10 @@ int __get_user_pages(struct task_struct *tsk,
struct mm_struct *mm,
if (pages) {
pages[i] = page;

- flush_anon_page(vma, page, start);
- flush_dcache_page(page);
+ if (page) {
+ flush_anon_page(vma, page, start);
+ flush_dcache_page(page);
+ }
}
if (vmas)
vmas[i] = vma;
diff --git a/mm/mlock.c b/mm/mlock.c
index 45eb650..0f5827b 100644
--- a/mm/mlock.c
+++ b/mm/mlock.c
@@ -178,9 +178,10 @@ static long __mlock_vma_pages_range(struct
vm_area_struct *vma,
*/
if (!mlock)
gup_flags |= GUP_FLAGS_IGNORE_VMA_PERMISSIONS |
- GUP_FLAGS_IGNORE_SIGKILL;
+ GUP_FLAGS_IGNORE_SIGKILL |
+ GUP_FLAGS_ALLOW_NULL;

- if (vma->vm_flags & VM_WRITE)
+ if (mlock && (vma->vm_flags & VM_WRITE))
gup_flags |= GUP_FLAGS_WRITE;

while (nr_pages > 0) {
@@ -220,21 +221,23 @@ static long __mlock_vma_pages_range(struct
vm_area_struct *vma,
for (i = 0; i < ret; i++) {
struct page *page = pages[i];

- lock_page(page);
- /*
- * Because we lock page here and migration is blocked
- * by the elevated reference, we need only check for
- * page truncation (file-cache only).
- */
- if (page->mapping) {
- if (mlock)
- mlock_vma_page(page);
- else
- munlock_vma_page(page);
+ if (page) {
+ lock_page(page);
+ /*
+ * Because we lock page here and migration is
+ * blocked by the elevated reference, we need
+ * only check for page truncation
+ * (file-cache only).
+ */
+ if (page->mapping) {
+ if (mlock)
+ mlock_vma_page(page);
+ else
+ munlock_vma_page(page);
+ }
+ unlock_page(page);
+ put_page(page); /* ref from get_user_pages() */
}
- unlock_page(page);
- put_page(page); /* ref from get_user_pages() */
-
/*
* here we assume that get_user_pages() has given us
* a list of virtually contiguous pages.
--
1.5.6.5
--
To unsubscribe from this list: send the line "unsubscribe linux-kernel" in
the body of a message to majordomo@xxxxxxxxxxxxxxx
More majordomo info at http://vger.kernel.org/majordomo-info.html
Please read the FAQ at http://www.tux.org/lkml/