[RFC PATCH 1/6] mm: add defer mechanism to ksm to make it more suitable

From: Wenwei Tao
Date: Wed Jun 10 2015 - 02:32:42 EST


I observe that it is unlikely for KSM to merge new pages from an area
that has already been scanned twice on Android mobile devices, so it's
a waste of power to continue to scan these areas in high frequency.
In this patch a defer mechanism is introduced which is borrowed from
page compaction to KSM. This defer mechanism can automatic lower the scan
frequency in the above case.

Signed-off-by: Wenwei Tao <wenweitaowenwei@xxxxxxxxx>
---
mm/ksm.c | 230 ++++++++++++++++++++++++++++++++++++++++++++++++++++++--------
1 file changed, 203 insertions(+), 27 deletions(-)

diff --git a/mm/ksm.c b/mm/ksm.c
index 4162dce..54ffcb2 100644
--- a/mm/ksm.c
+++ b/mm/ksm.c
@@ -104,6 +104,7 @@ struct mm_slot {
struct list_head mm_list;
struct rmap_item *rmap_list;
struct mm_struct *mm;
+ unsigned long seqnr;
};

/**
@@ -117,9 +118,12 @@ struct mm_slot {
*/
struct ksm_scan {
struct mm_slot *mm_slot;
+ struct mm_slot *active_slot;
unsigned long address;
struct rmap_item **rmap_list;
unsigned long seqnr;
+ unsigned long ksm_considered;
+ unsigned int ksm_defer_shift;
};

/**
@@ -182,6 +186,11 @@ struct rmap_item {
#define UNSTABLE_FLAG 0x100 /* is a node of the unstable tree */
#define STABLE_FLAG 0x200 /* is listed from the stable tree */

+#define ACTIVE_SLOT_FLAG 0x100
+#define ACTIVE_SLOT_SEQNR 0x200
+#define KSM_MAX_DEFER_SHIFT 6
+
+
/* The stable and unstable tree heads */
static struct rb_root one_stable_tree[1] = { RB_ROOT };
static struct rb_root one_unstable_tree[1] = { RB_ROOT };
@@ -197,14 +206,22 @@ static DEFINE_HASHTABLE(mm_slots_hash, MM_SLOTS_HASH_BITS);
static struct mm_slot ksm_mm_head = {
.mm_list = LIST_HEAD_INIT(ksm_mm_head.mm_list),
};
+
+static struct mm_slot ksm_mm_active = {
+ .mm_list = LIST_HEAD_INIT(ksm_mm_active.mm_list),
+};
+
static struct ksm_scan ksm_scan = {
.mm_slot = &ksm_mm_head,
+ .active_slot = &ksm_mm_active,
};

static struct kmem_cache *rmap_item_cache;
static struct kmem_cache *stable_node_cache;
static struct kmem_cache *mm_slot_cache;

+static bool ksm_merged_or_unstable;
+
/* The number of nodes in the stable tree */
static unsigned long ksm_pages_shared;

@@ -336,6 +353,23 @@ static void insert_to_mm_slots_hash(struct mm_struct *mm,
hash_add(mm_slots_hash, &mm_slot->link, (unsigned long)mm);
}

+static void move_to_active_list(struct mm_slot *mm_slot)
+{
+ if (mm_slot && !(mm_slot->seqnr & ACTIVE_SLOT_FLAG)) {
+ if (ksm_run & KSM_RUN_UNMERGE && mm_slot->rmap_list)
+ return;
+ if (mm_slot == ksm_scan.mm_slot) {
+ if (ksm_scan.active_slot == &ksm_mm_active)
+ return;
+ ksm_scan.mm_slot = list_entry(mm_slot->mm_list.next,
+ struct mm_slot, mm_list);
+ }
+ list_move_tail(&mm_slot->mm_list,
+ &ksm_scan.active_slot->mm_list);
+ mm_slot->seqnr |= (ACTIVE_SLOT_FLAG | ACTIVE_SLOT_SEQNR);
+ }
+}
+
/*
* ksmd, and unmerge_and_remove_all_rmap_items(), must not touch an mm's
* page tables after it has passed through ksm_exit() - which, if necessary,
@@ -772,6 +806,15 @@ static int unmerge_and_remove_all_rmap_items(void)
int err = 0;

spin_lock(&ksm_mmlist_lock);
+ mm_slot = list_entry(ksm_mm_active.mm_list.next,
+ struct mm_slot, mm_list);
+ while (mm_slot != &ksm_mm_active) {
+ list_move_tail(&mm_slot->mm_list, &ksm_mm_head.mm_list);
+ mm_slot->seqnr &= ~(ACTIVE_SLOT_FLAG | ACTIVE_SLOT_SEQNR);
+ mm_slot = list_entry(ksm_mm_active.mm_list.next,
+ struct mm_slot, mm_list);
+ }
+ ksm_scan.active_slot = &ksm_mm_active;
ksm_scan.mm_slot = list_entry(ksm_mm_head.mm_list.next,
struct mm_slot, mm_list);
spin_unlock(&ksm_mmlist_lock);
@@ -790,8 +833,8 @@ static int unmerge_and_remove_all_rmap_items(void)
if (err)
goto error;
}
-
remove_trailing_rmap_items(mm_slot, &mm_slot->rmap_list);
+ mm_slot->seqnr = 0;

spin_lock(&ksm_mmlist_lock);
ksm_scan.mm_slot = list_entry(mm_slot->mm_list.next,
@@ -806,6 +849,7 @@ static int unmerge_and_remove_all_rmap_items(void)
up_read(&mm->mmap_sem);
mmdrop(mm);
} else {
+ move_to_active_list(mm_slot);
spin_unlock(&ksm_mmlist_lock);
up_read(&mm->mmap_sem);
}
@@ -1401,6 +1445,9 @@ static void stable_tree_append(struct rmap_item *rmap_item,
ksm_pages_sharing++;
else
ksm_pages_shared++;
+
+ if (!ksm_merged_or_unstable)
+ ksm_merged_or_unstable = true;
}

/*
@@ -1468,6 +1515,9 @@ static void cmp_and_merge_page(struct page *page, struct rmap_item *rmap_item)
checksum = calc_checksum(page);
if (rmap_item->oldchecksum != checksum) {
rmap_item->oldchecksum = checksum;
+ if ((rmap_item->address & UNSTABLE_FLAG) &&
+ !ksm_merged_or_unstable)
+ ksm_merged_or_unstable = true;
return;
}

@@ -1504,6 +1554,31 @@ static void cmp_and_merge_page(struct page *page, struct rmap_item *rmap_item)
}
}

+static bool skip_active_slot_vma(struct rmap_item ***rmap_list,
+ struct vm_area_struct *vma, struct mm_slot *mm_slot)
+{
+ unsigned char age;
+ struct rmap_item *rmap_item = **rmap_list;
+
+ age = (unsigned char)(ksm_scan.seqnr - mm_slot->seqnr);
+ if (age > 0)
+ return false;
+ if (!(vma->vm_flags & VM_HUGETLB)) {
+ while (rmap_item && rmap_item->address < vma->vm_end) {
+ if (rmap_item->address < vma->vm_start) {
+ **rmap_list = rmap_item->rmap_list;
+ remove_rmap_item_from_tree(rmap_item);
+ free_rmap_item(rmap_item);
+ rmap_item = **rmap_list;
+ } else {
+ *rmap_list = &rmap_item->rmap_list;
+ rmap_item = rmap_item->rmap_list;
+ }
+ }
+ return true;
+ } else
+ return false;
+}
static struct rmap_item *get_next_rmap_item(struct mm_slot *mm_slot,
struct rmap_item **rmap_list,
unsigned long addr)
@@ -1535,15 +1610,18 @@ static struct rmap_item *get_next_rmap_item(struct mm_slot *mm_slot,
static struct rmap_item *scan_get_next_rmap_item(struct page **page)
{
struct mm_struct *mm;
- struct mm_slot *slot;
+ struct mm_slot *slot, *next_slot;
struct vm_area_struct *vma;
struct rmap_item *rmap_item;
int nid;

- if (list_empty(&ksm_mm_head.mm_list))
+ if (list_empty(&ksm_mm_head.mm_list) &&
+ list_empty(&ksm_mm_active.mm_list))
return NULL;
-
- slot = ksm_scan.mm_slot;
+ if (ksm_scan.active_slot != &ksm_mm_active)
+ slot = ksm_scan.active_slot;
+ else
+ slot = ksm_scan.mm_slot;
if (slot == &ksm_mm_head) {
/*
* A number of pages can hang around indefinitely on per-cpu
@@ -1582,8 +1660,16 @@ static struct rmap_item *scan_get_next_rmap_item(struct page **page)
root_unstable_tree[nid] = RB_ROOT;

spin_lock(&ksm_mmlist_lock);
- slot = list_entry(slot->mm_list.next, struct mm_slot, mm_list);
- ksm_scan.mm_slot = slot;
+ if (unlikely(ksm_scan.seqnr == 0 &&
+ !list_empty(&ksm_mm_active.mm_list))) {
+ slot = list_entry(ksm_mm_active.mm_list.next,
+ struct mm_slot, mm_list);
+ ksm_scan.active_slot = slot;
+ } else {
+ slot = list_entry(slot->mm_list.next,
+ struct mm_slot, mm_list);
+ ksm_scan.mm_slot = slot;
+ }
spin_unlock(&ksm_mmlist_lock);
/*
* Although we tested list_empty() above, a racing __ksm_exit
@@ -1608,7 +1694,8 @@ next_mm:
continue;
if (ksm_scan.address < vma->vm_start)
ksm_scan.address = vma->vm_start;
- if (!vma->anon_vma)
+ if (!vma->anon_vma || (ksm_scan.active_slot == slot &&
+ skip_active_slot_vma(&ksm_scan.rmap_list, vma, slot)))
ksm_scan.address = vma->vm_end;

while (ksm_scan.address < vma->vm_end) {
@@ -1639,6 +1726,9 @@ next_mm:
ksm_scan.address += PAGE_SIZE;
cond_resched();
}
+ if ((slot->seqnr & (ACTIVE_SLOT_FLAG | ACTIVE_SLOT_SEQNR)) ==
+ ACTIVE_SLOT_FLAG && vma->vm_flags & VM_HUGETLB)
+ vma->vm_flags &= ~VM_HUGETLB;
}

if (ksm_test_exit(mm)) {
@@ -1652,8 +1742,25 @@ next_mm:
remove_trailing_rmap_items(slot, ksm_scan.rmap_list);

spin_lock(&ksm_mmlist_lock);
- ksm_scan.mm_slot = list_entry(slot->mm_list.next,
- struct mm_slot, mm_list);
+ slot->seqnr &= ~SEQNR_MASK;
+ slot->seqnr |= (ksm_scan.seqnr & SEQNR_MASK);
+ next_slot = list_entry(slot->mm_list.next,
+ struct mm_slot, mm_list);
+ if (slot == ksm_scan.active_slot) {
+ if (slot->seqnr & ACTIVE_SLOT_SEQNR)
+ slot->seqnr &= ~ACTIVE_SLOT_SEQNR;
+ else {
+ slot->seqnr &= ~ACTIVE_SLOT_FLAG;
+ list_move_tail(&slot->mm_list,
+ &ksm_scan.mm_slot->mm_list);
+ }
+ ksm_scan.active_slot = next_slot;
+ } else
+ ksm_scan.mm_slot = next_slot;
+
+ if (ksm_scan.active_slot == &ksm_mm_active)
+ ksm_scan.active_slot = list_entry(ksm_mm_active.mm_list.next,
+ struct mm_slot, mm_list);
if (ksm_scan.address == 0) {
/*
* We've completed a full scan of all vmas, holding mmap_sem
@@ -1664,6 +1771,9 @@ next_mm:
* or when all VM_MERGEABLE areas have been unmapped (and
* mmap_sem then protects against race with MADV_MERGEABLE).
*/
+ if (ksm_scan.active_slot == slot)
+ ksm_scan.active_slot = list_entry(slot->mm_list.next,
+ struct mm_slot, mm_list);
hash_del(&slot->link);
list_del(&slot->mm_list);
spin_unlock(&ksm_mmlist_lock);
@@ -1678,10 +1788,13 @@ next_mm:
}

/* Repeat until we've completed scanning the whole list */
- slot = ksm_scan.mm_slot;
- if (slot != &ksm_mm_head)
+ if (ksm_scan.active_slot != &ksm_mm_active) {
+ slot = ksm_scan.active_slot;
goto next_mm;
-
+ } else if (ksm_scan.mm_slot != &ksm_mm_head) {
+ slot = ksm_scan.mm_slot;
+ goto next_mm;
+ }
ksm_scan.seqnr++;
return NULL;
}
@@ -1705,9 +1818,40 @@ static void ksm_do_scan(unsigned int scan_npages)
}
}

+/*This is copyed from page compaction*/
+
+static inline void defer_ksm(void)
+{
+ ksm_scan.ksm_considered = 0;
+ if (++ksm_scan.ksm_defer_shift > KSM_MAX_DEFER_SHIFT)
+ ksm_scan.ksm_defer_shift = KSM_MAX_DEFER_SHIFT;
+
+}
+
+static inline bool ksm_defered(void)
+{
+ unsigned long defer_limit = 1UL << ksm_scan.ksm_defer_shift;
+
+ if (++ksm_scan.ksm_considered > defer_limit)
+ ksm_scan.ksm_considered = defer_limit;
+ return ksm_scan.ksm_considered < defer_limit &&
+ list_empty(&ksm_mm_active.mm_list);
+}
+
+static inline void reset_ksm_defer(void)
+{
+ if (ksm_scan.ksm_defer_shift != 0) {
+ ksm_scan.ksm_considered = 0;
+ ksm_scan.ksm_defer_shift = 0;
+ }
+}
+
+
static int ksmd_should_run(void)
{
- return (ksm_run & KSM_RUN_MERGE) && !list_empty(&ksm_mm_head.mm_list);
+ return (ksm_run & KSM_RUN_MERGE) &&
+ !(list_empty(&ksm_mm_head.mm_list) &&
+ list_empty(&ksm_mm_active.mm_list));
}

static int ksm_scan_thread(void *nothing)
@@ -1718,8 +1862,14 @@ static int ksm_scan_thread(void *nothing)
while (!kthread_should_stop()) {
mutex_lock(&ksm_thread_mutex);
wait_while_offlining();
- if (ksmd_should_run())
+ if (ksmd_should_run() && !ksm_defered()) {
+ ksm_merged_or_unstable = false;
ksm_do_scan(ksm_thread_pages_to_scan);
+ if (ksm_merged_or_unstable)
+ reset_ksm_defer();
+ else
+ defer_ksm();
+ }
mutex_unlock(&ksm_thread_mutex);

try_to_freeze();
@@ -1739,6 +1889,8 @@ int ksm_madvise(struct vm_area_struct *vma, unsigned long start,
unsigned long end, int advice, unsigned long *vm_flags)
{
struct mm_struct *mm = vma->vm_mm;
+ unsigned long vma_length =
+ (vma->vm_end - vma->vm_start) >> PAGE_SHIFT;
int err;

switch (advice) {
@@ -1761,8 +1913,19 @@ int ksm_madvise(struct vm_area_struct *vma, unsigned long start,
if (err)
return err;
}
-
- *vm_flags |= VM_MERGEABLE;
+ /*
+ * Since hugetlb vma is not supported by ksm
+ * use VM_HUGETLB to indicate new mergeable vma
+ */
+ *vm_flags |= VM_MERGEABLE | VM_HUGETLB;
+ if (vma_length > ksm_thread_pages_to_scan) {
+ struct mm_slot *mm_slot;
+
+ spin_lock(&ksm_mmlist_lock);
+ mm_slot = get_mm_slot(mm);
+ move_to_active_list(mm_slot);
+ spin_unlock(&ksm_mmlist_lock);
+ }
break;

case MADV_UNMERGEABLE:
@@ -1775,7 +1938,7 @@ int ksm_madvise(struct vm_area_struct *vma, unsigned long start,
return err;
}

- *vm_flags &= ~VM_MERGEABLE;
+ *vm_flags &= ~(VM_MERGEABLE | VM_HUGETLB);
break;
}

@@ -1792,7 +1955,8 @@ int __ksm_enter(struct mm_struct *mm)
return -ENOMEM;

/* Check ksm_run too? Would need tighter locking */
- needs_wakeup = list_empty(&ksm_mm_head.mm_list);
+ needs_wakeup = list_empty(&ksm_mm_head.mm_list) &&
+ list_empty(&ksm_mm_active.mm_list);

spin_lock(&ksm_mmlist_lock);
insert_to_mm_slots_hash(mm, mm_slot);
@@ -1806,10 +1970,8 @@ int __ksm_enter(struct mm_struct *mm)
* scanning cursor, otherwise KSM pages in newly forked mms will be
* missed: then we might as well insert at the end of the list.
*/
- if (ksm_run & KSM_RUN_UNMERGE)
- list_add_tail(&mm_slot->mm_list, &ksm_mm_head.mm_list);
- else
- list_add_tail(&mm_slot->mm_list, &ksm_scan.mm_slot->mm_list);
+ list_add_tail(&mm_slot->mm_list, &ksm_scan.active_slot->mm_list);
+ mm_slot->seqnr |= (ACTIVE_SLOT_FLAG | ACTIVE_SLOT_SEQNR);
spin_unlock(&ksm_mmlist_lock);

set_bit(MMF_VM_MERGEABLE, &mm->flags);
@@ -1823,7 +1985,7 @@ int __ksm_enter(struct mm_struct *mm)

void __ksm_exit(struct mm_struct *mm)
{
- struct mm_slot *mm_slot;
+ struct mm_slot *mm_slot, *current_slot;
int easy_to_free = 0;

/*
@@ -1837,14 +1999,28 @@ void __ksm_exit(struct mm_struct *mm)

spin_lock(&ksm_mmlist_lock);
mm_slot = get_mm_slot(mm);
- if (mm_slot && ksm_scan.mm_slot != mm_slot) {
+ if (ksm_scan.active_slot != &ksm_mm_active)
+ current_slot = ksm_scan.active_slot;
+ else
+ current_slot = ksm_scan.mm_slot;
+ if (mm_slot && mm_slot != current_slot) {
if (!mm_slot->rmap_list) {
hash_del(&mm_slot->link);
list_del(&mm_slot->mm_list);
easy_to_free = 1;
} else {
- list_move(&mm_slot->mm_list,
- &ksm_scan.mm_slot->mm_list);
+ if (mm_slot == ksm_scan.mm_slot)
+ ksm_scan.mm_slot =
+ list_entry(mm_slot->mm_list.next,
+ struct mm_slot, mm_list);
+ if (ksm_run & KSM_RUN_UNMERGE)
+ list_move(&mm_slot->mm_list,
+ &ksm_scan.mm_slot->mm_list);
+ else {
+ list_move(&mm_slot->mm_list,
+ &ksm_scan.active_slot->mm_list);
+ mm_slot->seqnr |= ACTIVE_SLOT_FLAG;
+ }
}
}
spin_unlock(&ksm_mmlist_lock);
--
1.7.9.5

--
To unsubscribe from this list: send the line "unsubscribe linux-kernel" in
the body of a message to majordomo@xxxxxxxxxxxxxxx
More majordomo info at http://vger.kernel.org/majordomo-info.html
Please read the FAQ at http://www.tux.org/lkml/