[PATCH 1/4] madvise: add MADV_SHAREABLE and MADV_UNSHAREABLE calls.

From: Izik Eidus
Date: Wed May 13 2009 - 20:32:25 EST


This patch adds the MADV_SHAREABLE and MADV_UNSHAREABLE madvise calls.
These calls are used to mark vm memory areas with the VM_MERGEABLE flag,
which specifies whether the memory inside the vma is allowed to be dynamically
shared with other memory.

(this is needed for ksm vma scanning support)

Signed-off-by: Izik Eidus <ieidus@xxxxxxxxxx>
---
include/asm-generic/mman.h | 2 +
include/linux/mm.h | 2 +
include/linux/sched.h | 2 +
mm/madvise.c | 116 +++++++++++++++++++++++++++++++++----------
4 files changed, 95 insertions(+), 27 deletions(-)

diff --git a/include/asm-generic/mman.h b/include/asm-generic/mman.h
index 5e3dde2..830295d 100644
--- a/include/asm-generic/mman.h
+++ b/include/asm-generic/mman.h
@@ -34,6 +34,8 @@
#define MADV_REMOVE 9 /* remove these pages & resources */
#define MADV_DONTFORK 10 /* don't inherit across fork */
#define MADV_DOFORK 11 /* do inherit across fork */
+#define MADV_SHAREABLE 12 /* can share identical pages */
+#define MADV_UNSHAREABLE 13 /* can not share identical pages */

/* compatibility flags */
#define MAP_FILE 0
diff --git a/include/linux/mm.h b/include/linux/mm.h
index a0ddfb5..61328a4 100644
--- a/include/linux/mm.h
+++ b/include/linux/mm.h
@@ -106,6 +106,8 @@ extern unsigned int kobjsize(const void *objp);
#define VM_SAO 0x20000000 /* Strong Access Ordering (powerpc) */
#define VM_PFN_AT_MMAP 0x40000000 /* PFNMAP vma that is fully mapped at mmap time */

+#define VM_MERGEABLE 0x80000000 /* Memory may be merged */
+
#ifndef VM_STACK_DEFAULT_FLAGS /* arch can override this */
#define VM_STACK_DEFAULT_FLAGS VM_DATA_DEFAULT_FLAGS
#endif
diff --git a/include/linux/sched.h b/include/linux/sched.h
index b4c38bc..7dc786a 100644
--- a/include/linux/sched.h
+++ b/include/linux/sched.h
@@ -439,6 +439,8 @@ extern int get_dumpable(struct mm_struct *mm);
# define MMF_DUMP_MASK_DEFAULT_ELF 0
#endif

+#define MMF_VM_MERGEABLE 9
+
struct sighand_struct {
atomic_t count;
struct k_sigaction action[_NSIG];
diff --git a/mm/madvise.c b/mm/madvise.c
index b9ce574..bd215ce 100644
--- a/mm/madvise.c
+++ b/mm/madvise.c
@@ -30,36 +30,12 @@ static int madvise_need_mmap_write(int behavior)
}
}

-/*
- * We can potentially split a vm area into separate
- * areas, each area with its own behavior.
- */
-static long madvise_behavior(struct vm_area_struct * vma,
- struct vm_area_struct **prev,
- unsigned long start, unsigned long end, int behavior)
+static int handle_vmas(struct vm_area_struct *vma, struct vm_area_struct **prev,
+ unsigned long start, unsigned long end, int new_flags)
{
struct mm_struct * mm = vma->vm_mm;
- int error = 0;
pgoff_t pgoff;
- int new_flags = vma->vm_flags;
-
- switch (behavior) {
- case MADV_NORMAL:
- new_flags = new_flags & ~VM_RAND_READ & ~VM_SEQ_READ;
- break;
- case MADV_SEQUENTIAL:
- new_flags = (new_flags & ~VM_RAND_READ) | VM_SEQ_READ;
- break;
- case MADV_RANDOM:
- new_flags = (new_flags & ~VM_SEQ_READ) | VM_RAND_READ;
- break;
- case MADV_DONTFORK:
- new_flags |= VM_DONTCOPY;
- break;
- case MADV_DOFORK:
- new_flags &= ~VM_DONTCOPY;
- break;
- }
+ int error = 0;

if (new_flags == vma->vm_flags) {
*prev = vma;
@@ -101,6 +77,37 @@ out:
}

/*
+ * We can potentially split a vm area into separate
+ * areas, each area with its own behavior.
+ */
+static long madvise_behavior(struct vm_area_struct * vma,
+ struct vm_area_struct **prev,
+ unsigned long start, unsigned long end, int behavior)
+{
+ int new_flags = vma->vm_flags;
+
+ switch (behavior) {
+ case MADV_NORMAL:
+ new_flags = new_flags & ~VM_RAND_READ & ~VM_SEQ_READ;
+ break;
+ case MADV_SEQUENTIAL:
+ new_flags = (new_flags & ~VM_RAND_READ) | VM_SEQ_READ;
+ break;
+ case MADV_RANDOM:
+ new_flags = (new_flags & ~VM_SEQ_READ) | VM_RAND_READ;
+ break;
+ case MADV_DONTFORK:
+ new_flags |= VM_DONTCOPY;
+ break;
+ case MADV_DOFORK:
+ new_flags &= ~VM_DONTCOPY;
+ break;
+ }
+
+ return handle_vmas(vma, prev, start, end, new_flags);
+}
+
+/*
* Schedule all required I/O operations. Do not wait for completion.
*/
static long madvise_willneed(struct vm_area_struct * vma,
@@ -208,6 +215,54 @@ static long madvise_remove(struct vm_area_struct *vma,
return error;
}

+/*
+ * Application allows pages to be shared with other pages of identical
+ * content.
+ *
+ */
+static long madvise_shareable(struct vm_area_struct *vma,
+ struct vm_area_struct **prev,
+ unsigned long start, unsigned long end,
+ int behavior)
+{
+ int ret;
+ struct mm_struct *mm;
+
+ switch (behavior) {
+#if defined(CONFIG_KSM) || defined(CONFIG_KSM_MODULE)
+ case MADV_SHAREABLE:
+ ret = handle_vmas(vma, prev, start, end,
+ vma->vm_flags | VM_MERGEABLE);
+
+ if (!ret) {
+ mm = vma->vm_mm;
+ set_bit(MMF_VM_MERGEABLE, &mm->flags);
+ }
+
+ return ret;
+ case MADV_UNSHAREABLE:
+ ret = handle_vmas(vma, prev, start, end,
+ vma->vm_flags & ~VM_MERGEABLE);
+
+ if (!ret) {
+ mm = vma->vm_mm;
+ vma = mm->mmap;
+ while (vma) {
+ if (vma->vm_flags & VM_MERGEABLE)
+ break;
+ vma = vma->vm_next;
+ }
+ if (!vma)
+ clear_bit(MMF_VM_MERGEABLE, &mm->flags);
+ }
+
+ return ret;
+#endif
+ default:
+ return -EINVAL;
+ }
+}
+
static long
madvise_vma(struct vm_area_struct *vma, struct vm_area_struct **prev,
unsigned long start, unsigned long end, int behavior)
@@ -238,6 +293,11 @@ madvise_vma(struct vm_area_struct *vma, struct vm_area_struct **prev,
error = madvise_dontneed(vma, prev, start, end);
break;

+ case MADV_SHAREABLE:
+ case MADV_UNSHAREABLE:
+ error = madvise_shareable(vma, prev, start, end, behavior);
+ break;
+
default:
error = -EINVAL;
break;
@@ -269,6 +329,8 @@ madvise_vma(struct vm_area_struct *vma, struct vm_area_struct **prev,
* so the kernel can free resources associated with it.
* MADV_REMOVE - the application wants to free up the given range of
* pages and associated backing store.
+ * MADV_SHAREABLE - the application agrees that pages in the given
+ * range can be shared w/ other pages of identical content.
*
* return values:
* zero - success
--
1.5.6.5

--
To unsubscribe from this list: send the line "unsubscribe linux-kernel" in
the body of a message to majordomo@xxxxxxxxxxxxxxx
More majordomo info at http://vger.kernel.org/majordomo-info.html
Please read the FAQ at http://www.tux.org/lkml/