[PATCH 3/3] vrange: Add page purging logic & SIGBUS trap

From: John Stultz
Date: Fri Mar 14 2014 - 14:34:03 EST


Finally, this patch adds the hooks in the vmscan logic to discard volatile
pages and mark their ptes as purged. With this, volatile pages will be
purged under memory pressure, and their ptes replaced with a special
"purged" swap entry. If the purged pages are accessed before being marked
non-volatile, we catch this and send a SIGBUS.

This is a simplified implementation that uses logic from Minchan's earlier
efforts, so credit to Minchan for his work.

Cc: Andrew Morton <akpm@xxxxxxxxxxxxxxxxxxxx>
Cc: Android Kernel Team <kernel-team@xxxxxxxxxxx>
Cc: Johannes Weiner <hannes@xxxxxxxxxxx>
Cc: Robert Love <rlove@xxxxxxxxxx>
Cc: Mel Gorman <mel@xxxxxxxxx>
Cc: Hugh Dickins <hughd@xxxxxxxxxx>
Cc: Dave Hansen <dave@xxxxxxxx>
Cc: Rik van Riel <riel@xxxxxxxxxx>
Cc: Dmitry Adamushko <dmitry.adamushko@xxxxxxxxx>
Cc: Neil Brown <neilb@xxxxxxx>
Cc: Andrea Arcangeli <aarcange@xxxxxxxxxx>
Cc: Mike Hommey <mh@xxxxxxxxxxxx>
Cc: Taras Glek <tglek@xxxxxxxxxxx>
Cc: Dhaval Giani <dgiani@xxxxxxxxxxx>
Cc: Jan Kara <jack@xxxxxxx>
Cc: KOSAKI Motohiro <kosaki.motohiro@xxxxxxxxx>
Cc: Michel Lespinasse <walken@xxxxxxxxxx>
Cc: Minchan Kim <minchan@xxxxxxxxxx>
Cc: linux-mm@xxxxxxxxx <linux-mm@xxxxxxxxx>
Signed-off-by: John Stultz <john.stultz@xxxxxxxxxx>
---
include/linux/vrange.h | 2 ++
mm/internal.h | 2 --
mm/memory.c | 21 +++++++++++
mm/rmap.c | 5 +++
mm/vmscan.c | 12 +++++++
mm/vrange.c | 97 ++++++++++++++++++++++++++++++++++++++++++++++++++
6 files changed, 137 insertions(+), 2 deletions(-)

diff --git a/include/linux/vrange.h b/include/linux/vrange.h
index c4a1616..b18551f 100644
--- a/include/linux/vrange.h
+++ b/include/linux/vrange.h
@@ -7,6 +7,8 @@
#define VRANGE_NONVOLATILE 0
#define VRANGE_VOLATILE 1

+extern int discard_vpage(struct page *page);
+
static inline swp_entry_t swp_entry_mk_vrange_purged(void)
{
return swp_entry(SWP_VRANGE_PURGED, 0);
diff --git a/mm/internal.h b/mm/internal.h
index 29e1e76..ea66bf9 100644
--- a/mm/internal.h
+++ b/mm/internal.h
@@ -225,10 +225,8 @@ static inline void mlock_migrate_page(struct page *newpage, struct page *page)

extern pmd_t maybe_pmd_mkwrite(pmd_t pmd, struct vm_area_struct *vma);

-#ifdef CONFIG_TRANSPARENT_HUGEPAGE
extern unsigned long vma_address(struct page *page,
struct vm_area_struct *vma);
-#endif
#else /* !CONFIG_MMU */
static inline int mlocked_vma_newpage(struct vm_area_struct *v, struct page *p)
{
diff --git a/mm/memory.c b/mm/memory.c
index 22dfa61..7ea9712 100644
--- a/mm/memory.c
+++ b/mm/memory.c
@@ -60,6 +60,7 @@
#include <linux/migrate.h>
#include <linux/string.h>
#include <linux/dma-debug.h>
+#include <linux/vrange.h>

#include <asm/io.h>
#include <asm/pgalloc.h>
@@ -3643,6 +3644,8 @@ static int handle_pte_fault(struct mm_struct *mm,

entry = *pte;
if (!pte_present(entry)) {
+ swp_entry_t vrange_entry;
+retry:
if (pte_none(entry)) {
if (vma->vm_ops) {
if (likely(vma->vm_ops->fault))
@@ -3652,6 +3655,24 @@ static int handle_pte_fault(struct mm_struct *mm,
return do_anonymous_page(mm, vma, address,
pte, pmd, flags);
}
+
+ vrange_entry = pte_to_swp_entry(entry);
+ if (unlikely(entry_is_vrange_purged(vrange_entry))) {
+ if (vma->vm_flags & VM_VOLATILE)
+ return VM_FAULT_SIGBUS;
+
+ /* zap pte */
+ ptl = pte_lockptr(mm, pmd);
+ spin_lock(ptl);
+ if (unlikely(!pte_same(*pte, entry)))
+ goto unlock;
+ flush_cache_page(vma, address, pte_pfn(*pte));
+ ptep_clear_flush(vma, address, pte);
+ pte_unmap_unlock(pte, ptl);
+ goto retry;
+ }
+
+
if (pte_file(entry))
return do_nonlinear_fault(mm, vma, address,
pte, pmd, flags, entry);
diff --git a/mm/rmap.c b/mm/rmap.c
index d9d4231..2b6f079 100644
--- a/mm/rmap.c
+++ b/mm/rmap.c
@@ -728,6 +728,11 @@ int page_referenced_one(struct page *page, struct vm_area_struct *vma,
referenced++;
}
pte_unmap_unlock(pte, ptl);
+ if (vma->vm_flags & VM_VOLATILE) {
+ pra->mapcount = 0;
+ pra->vm_flags |= VM_VOLATILE;
+ return SWAP_FAIL;
+ }
}

if (referenced) {
diff --git a/mm/vmscan.c b/mm/vmscan.c
index a9c74b4..c5c0ee0 100644
--- a/mm/vmscan.c
+++ b/mm/vmscan.c
@@ -43,6 +43,7 @@
#include <linux/sysctl.h>
#include <linux/oom.h>
#include <linux/prefetch.h>
+#include <linux/vrange.h>

#include <asm/tlbflush.h>
#include <asm/div64.h>
@@ -683,6 +684,7 @@ enum page_references {
PAGEREF_RECLAIM,
PAGEREF_RECLAIM_CLEAN,
PAGEREF_KEEP,
+ PAGEREF_DISCARD,
PAGEREF_ACTIVATE,
};

@@ -703,6 +705,13 @@ static enum page_references page_check_references(struct page *page,
if (vm_flags & VM_LOCKED)
return PAGEREF_RECLAIM;

+ /*
+ * If volatile page is reached on LRU's tail, we discard the
+ * page without considering recycle the page.
+ */
+ if (vm_flags & VM_VOLATILE)
+ return PAGEREF_DISCARD;
+
if (referenced_ptes) {
if (PageSwapBacked(page))
return PAGEREF_ACTIVATE;
@@ -930,6 +939,9 @@ static unsigned long shrink_page_list(struct list_head *page_list,
switch (references) {
case PAGEREF_ACTIVATE:
goto activate_locked;
+ case PAGEREF_DISCARD:
+ if (may_enter_fs && discard_vpage(page) == 0)
+ goto free_it;
case PAGEREF_KEEP:
goto keep_locked;
case PAGEREF_RECLAIM:
diff --git a/mm/vrange.c b/mm/vrange.c
index 844571b..fc9906f 100644
--- a/mm/vrange.c
+++ b/mm/vrange.c
@@ -205,3 +205,100 @@ SYSCALL_DEFINE4(vrange, unsigned long, start,
out:
return ret;
}
+/* Replace @page's pte in @vma with the vrange-purged swap entry marker. */
+static void try_to_discard_one(struct page *page, struct vm_area_struct *vma)
+{
+ struct mm_struct *mm = vma->vm_mm;
+ pte_t *pte;
+ pte_t pteval;
+ spinlock_t *ptl;
+ unsigned long addr;
+ /* The page must be locked on entry (asserted below). */
+ VM_BUG_ON(!PageLocked(page));
+ /* Look up the pte mapping @page in this vma; nothing to do if absent. */
+ addr = vma_address(page, vma);
+ pte = page_check_address(page, mm, addr, &ptl, 0);
+ if (!pte)
+ return;
+ /* Special, mlocked, mixed-map and hugetlb vmas must never get here. */
+ BUG_ON(vma->vm_flags & (VM_SPECIAL|VM_LOCKED|VM_MIXEDMAP|VM_HUGETLB));
+ /* Clear the pte. NOTE(review): pteval is assigned but never read. */
+ flush_cache_page(vma, addr, page_to_pfn(page));
+ pteval = ptep_clear_flush(vma, addr, pte);
+ /* The page no longer counts toward this mm's anon/file page counters. */
+ update_hiwater_rss(mm);
+ if (PageAnon(page))
+ dec_mm_counter(mm, MM_ANONPAGES);
+ else
+ dec_mm_counter(mm, MM_FILEPAGES);
+ /* Remove the rmap entry and release a reference on the page. */
+ page_remove_rmap(page);
+ page_cache_release(page);
+ /* Install the purged marker so a later fault can detect the discard. */
+ set_pte_at(mm, addr, pte,
+ swp_entry_to_pte(swp_entry_mk_vrange_purged()));
+
+ pte_unmap_unlock(pte, ptl);
+ mmu_notifier_invalidate_page(mm, addr);
+
+}
+
+/* Purge anon @page from all VM_VOLATILE vmas; -1 if no anon_vma, else 0. */
+static int try_to_discard_anon_vpage(struct page *page)
+{
+ struct anon_vma *anon_vma;
+ struct anon_vma_chain *avc;
+ pgoff_t pgoff;
+ /* Take the anon_vma read lock; fail if the lookup returns NULL. */
+ anon_vma = page_lock_anon_vma_read(page);
+ if (!anon_vma)
+ return -1;
+ /* Scale the page index by the PAGE_CACHE/PAGE shift difference. */
+ pgoff = page->index << (PAGE_CACHE_SHIFT - PAGE_SHIFT);
+ /*
+ * While iterating this loop, some processes could see a page as
+ * purged while others could see it as not purged, because there is
+ * no global lock between parent and child protecting the vrange
+ * system call during the loop. That is not a problem, because the
+ * page is not a *SHARED* page but a *COW* page, so parent and child
+ * may see different data at any time. The worst case of this race is
+ * that a page was purged but could not be discarded, which causes an
+ * unnecessary page fault, but that is not severe.
+ */
+ anon_vma_interval_tree_foreach(avc, &anon_vma->rb_root, pgoff, pgoff) {
+ struct vm_area_struct *vma = avc->vma;
+ /* Only vmas currently marked volatile are purged. */
+ if (!(vma->vm_flags & VM_VOLATILE))
+ continue;
+ try_to_discard_one(page, vma);
+ }
+ page_unlock_anon_vma_read(anon_vma);
+ return 0;
+}
+
+/* Dispatch by page type: only anonymous pages are handled, others get -1. */
+static int try_to_discard_vpage(struct page *page)
+{
+ if (PageAnon(page))
+ return try_to_discard_anon_vpage(page);
+ return -1;
+}
+
+/* Discard a locked, non-LRU volatile @page: 0 if purged and frozen, else 1. */
+int discard_vpage(struct page *page)
+{
+ VM_BUG_ON(!PageLocked(page));
+ VM_BUG_ON(PageLRU(page));
+ /* try_to_discard_vpage() returns 0 once the mappings were purged. */
+ if (!try_to_discard_vpage(page)) {
+ if (PageSwapCache(page))
+ try_to_free_swap(page);
+ /* NOTE(review): the page is unlocked only on this success path. */
+ if (page_freeze_refs(page, 1)) {
+ unlock_page(page);
+ return 0;
+ }
+ }
+ /* Failure: the page is returned to the caller still locked. */
+ return 1;
+}
--
1.8.3.2

--
To unsubscribe from this list: send the line "unsubscribe linux-kernel" in
the body of a message to majordomo@xxxxxxxxxxxxxxx
More majordomo info at http://vger.kernel.org/majordomo-info.html
Please read the FAQ at http://www.tux.org/lkml/