[PATCH 7/7] mm: Batch unmapping of pages that are in swap cache

From: Tim Chen
Date: Tue May 03 2016 - 17:03:42 EST


We add a new function, __remove_swap_mapping_batch(), that removes
all pages belonging to the same swap partition from the swap cache's
mapping with a single acquisition of the mapping's tree lock.  This
reduces contention on the lock when multiple threads are reclaiming
memory by swapping to the same swap partition.

The handle_pgout_batch() function is updated so that all pages under
the same swap partition are removed from the swap cache together once
they have been paged out.
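
Roughly, the caller side then gathers the pages that reached the
swap-cache-removal stage and hands them to the batch helper in one
call.  A simplified sketch, using the identifiers from the patch
below and omitting the per-page error handling:

        /* collect swap-cache pages of the current batch */
        struct page *umap_pages[SWAP_BATCH];
        short umap_ret[SWAP_BATCH], idx[SWAP_BATCH];
        int i, j = 0;

        for (i = 0; i < batch_size; i++) {
                /* ... try_to_unmap()/pageout() handling as before ... */
                if (PageSwapCache(pages[i])) {
                        idx[j] = i;             /* remember position in ret[] */
                        umap_pages[j++] = pages[i];
                }
        }
        /* drop the whole batch from the swap cache under one tree_lock
         * hold, recording per-page success in umap_ret[] */
        __remove_swap_mapping_batch(umap_pages, true, umap_ret, j);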

Signed-off-by: Tim Chen <tim.c.chen@xxxxxxxxxxxxxxx>
---
 mm/vmscan.c | 426 ++++++++++++++++++++++++++++++++++++++++--------------------
 1 file changed, 286 insertions(+), 140 deletions(-)

diff --git a/mm/vmscan.c b/mm/vmscan.c
index 9fc04e1..5e4b8ce 100644
--- a/mm/vmscan.c
+++ b/mm/vmscan.c
@@ -690,6 +690,103 @@ cannot_free:
 return 0;
 }
 
+/* use this only for swap mapped pages */
+static void __remove_swap_mapping_batch(struct page *pages[],
+     bool reclaimed, short ret[], int nr)
+{
+ unsigned long flags;
+ struct page *page;
+ swp_entry_t swap[SWAP_BATCH];
+ struct address_space *mapping;
+
+ int i, batch_size;
+
+ if (nr <= 0)
+ return;
+
+ while (nr) {
+ mapping = page_mapping(pages[0]);
+ BUG_ON(!mapping);
+
+ batch_size = min(nr, SWAP_BATCH);
+
+ spin_lock_irqsave(&mapping->tree_lock, flags);
+ for (i = 0; i < batch_size; ++i) {
+ page = pages[i];
+
+ BUG_ON(!PageLocked(page));
+ BUG_ON(!PageSwapCache(page));
+ BUG_ON(mapping != page_mapping(page));
+
+ /* stop batching if mapping changes */
+ if (mapping != page_mapping(page)) {
+ batch_size = i;
+ break;
+ }
+ /*
+  * The non racy check for a busy page.
+  *
+  * Must be careful with the order of the tests. When someone has
+  * a ref to the page, it may be possible that they dirty it then
+  * drop the reference. So if PageDirty is tested before page_count
+  * here, then the following race may occur:
+  *
+  * get_user_pages(&page);
+  * [user mapping goes away]
+  * write_to(page);
+  * !PageDirty(page)    [good]
+  * SetPageDirty(page);
+  * put_page(page);
+  * !page_count(page)   [good, discard it]
+  *
+  * [oops, our write_to data is lost]
+  *
+  * Reversing the order of the tests ensures such a situation cannot
+  * escape unnoticed. The smp_rmb is needed to ensure the page->flags
+  * load is not satisfied before that of page->_count.
+  *
+  * Note that if SetPageDirty is always performed via set_page_dirty,
+  * and thus under tree_lock, then this ordering is not required.
+  */
+ if (!page_ref_freeze(page, 2))
+ goto cannot_free;
+ /* note: atomic_cmpxchg in page_freeze_refs provides the smp_rmb */
+ if (unlikely(PageDirty(page))) {
+ page_ref_unfreeze(page, 2);
+ goto cannot_free;
+ }
+
+ swap[i].val = page_private(page);
+ __delete_from_swap_cache(page);
+
+ ret[i] = 1;
+ continue;
+
+cannot_free:
+ ret[i] = 0;
+ }
+ spin_unlock_irqrestore(&mapping->tree_lock, flags);
+
+ /* need to keep irq off for mem_cgroup accounting, don't restore flags yet */
+ local_irq_disable();
+ for (i = 0; i < batch_size; ++i) {
+ if (ret[i]) {
+ page = pages[i];
+ mem_cgroup_swapout(page, swap[i]);
+ }
+ }
+ local_irq_enable();
+
+ for (i = 0; i < batch_size; ++i) {
+ if (ret[i])
+ swapcache_free(swap[i]);
+ }
+ /* advance to next batch */
+ pages += batch_size;
+ ret += batch_size;
+ nr -= batch_size;
+ }
+}
 /*
 * Attempt to detach a locked page from its ->mapping.  If it is dirty or if
 * someone else has a ref on the page, abort and return 0.  If it was
@@ -897,177 +994,226 @@ static void handle_pgout_batch(struct list_head *page_list,
 int nr)
 {
 struct address_space *mapping;
+ struct page *umap_pages[SWAP_BATCH];
 struct page *page;
- int i;
-
- for (i = 0; i < nr; ++i) {
- page = pages[i];
- mapping = page_mapping(page);
+ int i, j, batch_size;
+ short umap_ret[SWAP_BATCH], idx[SWAP_BATCH];
+
+ while (nr) {
+ j = 0;
+ batch_size = min(nr, SWAP_BATCH);
+ mapping = NULL;
+
+ for (i = 0; i < batch_size; ++i) {
+ page = pages[i];
+
+ if (mapping) {
+ if (mapping != page_mapping(page)) {
+ /* mapping change, stop batch here */
+ batch_size = i;
+ break;
+ }
+ } else
+ mapping = page_mapping(page);
 
- /* check outcome of cache addition */
- if (!ret[i]) {
- ret[i] = PG_ACTIVATE_LOCKED;
- continue;
- }
- /*
-  * The page is mapped into the page tables of one or more
-  * processes. Try to unmap it here.
- Â*/
- if (page_mapped(page) && mapping) {
- switch (swap_ret[i] = try_to_unmap(page, lazyfree ?
- (ttu_flags | TTU_BATCH_FLUSH | TTU_LZFREE) :
- (ttu_flags | TTU_BATCH_FLUSH))) {
- case SWAP_FAIL:
+ /* check outcome of cache addition */
+ if (!ret[i]) {
 ret[i] = PG_ACTIVATE_LOCKED;
 continue;
- case SWAP_AGAIN:
- ret[i] = PG_KEEP_LOCKED;
- continue;
- case SWAP_MLOCK:
- ret[i] = PG_MLOCKED;
- continue;
- case SWAP_LZFREE:
- goto lazyfree;
- case SWAP_SUCCESS:
- ; /* try to free the page below */
 }
- }
-
- if (PageDirty(page)) {
 /*
-  * Only kswapd can writeback filesystem pages to
-  * avoid risk of stack overflow but only writeback
-  * if many dirty pages have been encountered.
+  * The page is mapped into the page tables of one or more
+  * processes. Try to unmap it here.
  */
- if (page_is_file_cache(page) &&
- (!current_is_kswapd() ||
-  !test_bit(ZONE_DIRTY, &zone->flags))) {
+ if (page_mapped(page) && mapping) {
+ switch (swap_ret[i] = try_to_unmap(page, lazyfree ?
+ (ttu_flags | TTU_BATCH_FLUSH | TTU_LZFREE) :
+ (ttu_flags | TTU_BATCH_FLUSH))) {
+ case SWAP_FAIL:
+ ret[i] = PG_ACTIVATE_LOCKED;
+ continue;
+ case SWAP_AGAIN:
+ ret[i] = PG_KEEP_LOCKED;
+ continue;
+ case SWAP_MLOCK:
+ ret[i] = PG_MLOCKED;
+ continue;
+ case SWAP_LZFREE:
+ goto lazyfree;
+ case SWAP_SUCCESS:
+ ; /* try to free the page below */
+ }
+ }
+
+ if (PageDirty(page)) {
 /*
-  * Immediately reclaim when written back.
-  * Similar in principal to deactivate_page()
-  * except we already have the page isolated
-  * and know it's dirty
+  * Only kswapd can writeback filesystem pages to
+  * avoid risk of stack overflow but only writeback
+  * if many dirty pages have been encountered.
  */
- inc_zone_page_state(page, NR_VMSCAN_IMMEDIATE);
- SetPageReclaim(page);
-
- ret[i] = PG_KEEP_LOCKED;
- continue;
- }
+ if (page_is_file_cache(page) &&
+ (!current_is_kswapd() ||
+  !test_bit(ZONE_DIRTY, &zone->flags))) {
+ /*
+  * Immediately reclaim when written back.
+  * Similar in principal to deactivate_page()
+  * except we already have the page isolated
+  * and know it's dirty
+  */
+ inc_zone_page_state(page, NR_VMSCAN_IMMEDIATE);
+ SetPageReclaim(page);
 
- if (references == PAGEREF_RECLAIM_CLEAN) {
- ret[i] = PG_KEEP_LOCKED;
- continue;
- }
- if (!may_enter_fs) {
- ret[i] = PG_KEEP_LOCKED;
- continue;
- }
- if (!sc->may_writepage) {
- ret[i] = PG_KEEP_LOCKED;
- continue;
- }
+ ret[i] = PG_KEEP_LOCKED;
+ continue;
+ }
 
- /*
-  * Page is dirty. Flush the TLB if a writable entry
-  * potentially exists to avoid CPU writes after IO
-  * starts and then write it out here.
-  */
- try_to_unmap_flush_dirty();
- switch (pageout(page, mapping, sc)) {
- case PAGE_KEEP:
- ret[i] = PG_KEEP_LOCKED;
- continue;
- case PAGE_ACTIVATE:
- ret[i] = PG_ACTIVATE_LOCKED;
- continue;
- case PAGE_SUCCESS:
- if (PageWriteback(page)) {
- ret[i] = PG_KEEP;
+ if (references == PAGEREF_RECLAIM_CLEAN) {
+ ret[i] = PG_KEEP_LOCKED;
+ continue;
+ }
+ if (!may_enter_fs) {
+ ret[i] = PG_KEEP_LOCKED;
 continue;
 }
- if (PageDirty(page)) {
- ret[i] = PG_KEEP;
+ if (!sc->may_writepage) {
+ ret[i] = PG_KEEP_LOCKED;
 continue;
 }
 
 /*
-  * A synchronous write - probably a ramdisk.  Go
-  * ahead and try to reclaim the page.
+  * Page is dirty. Flush the TLB if a writable entry
+  * potentially exists to avoid CPU writes after IO
+  * starts and then write it out here.
  */
- if (!trylock_page(page)) {
- ret[i] = PG_KEEP;
- continue;
- }
- if (PageDirty(page) || PageWriteback(page)) {
+ try_to_unmap_flush_dirty();
+ switch (pageout(page, mapping, sc)) {
+ case PAGE_KEEP:
 ret[i] = PG_KEEP_LOCKED;
 continue;
+ case PAGE_ACTIVATE:
+ ret[i] = PG_ACTIVATE_LOCKED;
+ continue;
+ case PAGE_SUCCESS:
+ if (PageWriteback(page)) {
+ ret[i] = PG_KEEP;
+ continue;
+ }
+ if (PageDirty(page)) {
+ ret[i] = PG_KEEP;
+ continue;
+ }
+
+ /*
+  * A synchronous write - probably a ramdisk.  Go
+  * ahead and try to reclaim the page.
+  */
+ if (!trylock_page(page)) {
+ ret[i] = PG_KEEP;
+ continue;
+ }
+ if (PageDirty(page) || PageWriteback(page)) {
+ ret[i] = PG_KEEP_LOCKED;
+ continue;
+ }
+ mapping = page_mapping(page);
+ case PAGE_CLEAN:
+ ; /* try to free the page below */
 }
- mapping = page_mapping(page);
- case PAGE_CLEAN:
- ; /* try to free the page below */
 }
- }
 
- /*
-  * If the page has buffers, try to free the buffer mappings
-  * associated with this page. If we succeed we try to free
-  * the page as well.
-  *
-  * We do this even if the page is PageDirty().
-  * try_to_release_page() does not perform I/O, but it is
-  * possible for a page to have PageDirty set, but it is actually
-  * clean (all its buffers are clean).  This happens if the
-  * buffers were written out directly, with submit_bh(). ext3
-  * will do this, as well as the blockdev mapping.
-  * try_to_release_page() will discover that cleanness and will
-  * drop the buffers and mark the page clean - it can be freed.
-  *
-  * Rarely, pages can have buffers and no ->mapping.  These are
-  * the pages which were not successfully invalidated in
-  * truncate_complete_page().  We try to drop those buffers here
-  * and if that worked, and the page is no longer mapped into
-  * process address space (page_count == 1) it can be freed.
-  * Otherwise, leave the page on the LRU so it is swappable.
-  */
- if (page_has_private(page)) {
- if (!try_to_release_page(page, sc->gfp_mask)) {
- ret[i] = PG_ACTIVATE_LOCKED;
+ /*
+  * If the page has buffers, try to free the buffer mappings
+  * associated with this page. If we succeed we try to free
+  * the page as well.
+  *
+  * We do this even if the page is PageDirty().
+  * try_to_release_page() does not perform I/O, but it is
+  * possible for a page to have PageDirty set, but it is actually
+  * clean (all its buffers are clean).  This happens if the
+  * buffers were written out directly, with submit_bh(). ext3
+  * will do this, as well as the blockdev mapping.
+  * try_to_release_page() will discover that cleanness and will
+  * drop the buffers and mark the page clean - it can be freed.
+  *
+  * Rarely, pages can have buffers and no ->mapping.  These are
+  * the pages which were not successfully invalidated in
+  * truncate_complete_page().  We try to drop those buffers here
+  * and if that worked, and the page is no longer mapped into
+  * process address space (page_count == 1) it can be freed.
+  * Otherwise, leave the page on the LRU so it is swappable.
+  */
+ if (page_has_private(page)) {
+ if (!try_to_release_page(page, sc->gfp_mask)) {
+ ret[i] = PG_ACTIVATE_LOCKED;
+ continue;
+ }
+ if (!mapping && page_count(page) == 1) {
+ unlock_page(page);
+ if (put_page_testzero(page)) {
+ ret[i] = PG_FREE;
+ continue;
+ } else {
+ /*
+  * rare race with speculative reference.
+  * the speculative reference will free
+  * this page shortly, so we may
+  * increment nr_reclaimed (and
+  * leave it off the LRU).
+  */
+ ret[i] = PG_SPECULATIVE_REF;
+ continue;
+ }
+ }
+ }
+lazyfree:
+ if (!mapping) {
+ ret[i] = PG_KEEP_LOCKED;
 continue;
 }
- if (!mapping && page_count(page) == 1) {
- unlock_page(page);
- if (put_page_testzero(page)) {
- ret[i] = PG_FREE;
- continue;
- } else {
- /*
-  * rare race with speculative reference.
-  * the speculative reference will free
-  * this page shortly, so we may
-  * increment nr_reclaimed (and
-  * leave it off the LRU).
-  */
- ret[i] = PG_SPECULATIVE_REF;
+ if (!PageSwapCache(page)) {
+ if (!__remove_mapping(mapping, page, true)) {
+ ret[i] = PG_KEEP_LOCKED;
 continue;
 }
+ __ClearPageLocked(page);
+ ret[i] = PG_FREE;
+ continue;
 }
+
+ /* note pages to be unmapped */
+ ret[i] = PG_UNKNOWN;
+ idx[j] = i;
+ umap_pages[j] = page;
+ ++j;
 }
-lazyfree:
- if (!mapping || !__remove_mapping(mapping, page, true)) {
- ret[i] = PG_KEEP_LOCKED;
- continue;
+
+ /* handle remaining pages that need to be unmapped */
+ __remove_swap_mapping_batch(umap_pages, true, umap_ret, j);
+
+ for (i = 0; i < j; ++i) {
+ if (!umap_ret[i]) {
+ /* unmap failed */
+ ret[idx[i]] = PG_KEEP_LOCKED;
+ continue;
+ }
+
+ page = umap_pages[i];
+ /*
+  * At this point, we have no other references and there is
+  * no way to pick any more up (removed from LRU, removed
+  * from pagecache). Can use non-atomic bitops now (and
+  * we obviously don't have to worry about waking up a process
+  * waiting on the page lock, because there are no references.
+  */
+ __ClearPageLocked(page);
+ ret[idx[i]] = PG_FREE;
 }
 
- /*
-  * At this point, we have no other references and there is
-  * no way to pick any more up (removed from LRU, removed
-  * from pagecache). Can use non-atomic bitops now (and
-  * we obviously don't have to worry about waking up a process
-  * waiting on the page lock, because there are no references.
-  */
- __ClearPageLocked(page);
- ret[i] = PG_FREE;
+ /* advance pointers to next batch and remaining page count */
+ nr = nr - batch_size;
+ pages += batch_size;
+ ret += batch_size;
+ swap_ret += batch_size;
 }
 }
 
-- 
2.5.5