[RFC PATCH 04/10] mm/swap: remove cache bypass swapin

From: Kairui Song
Date: Tue Mar 26 2024 - 15:05:16 EST


From: Kairui Song <kasong@xxxxxxxxxxx>

We used to have the cache bypass swapin path for better performance,
but by removing it, more optimization can be applied and have
an even better overall performance and less hackish.

And these optimizations are not easily doable or not doable at all
without this.

This patch simply removes it, and the performance will drop heavily
for simple swapin, things won't get this worse for real workloads
but still observable. Following commits will fix this and archive a
better performance.

Swapout/in 30G zero pages from ZRAM (This mostly measures overhead
of swap path itself, because zero pages are not compressed but simply
recorded in ZRAM, and performance drops more as SWAP device is getting
full):

Test result of sequential swapin/out:

Before (us) After (us)
Swapout: 33619409 33624641
Swapin: 32393771 41614858 (-28.4%)
Swapout (THP): 7817909 7795530
Swapin (THP) : 32452387 41708471 (-28.4%)

Signed-off-by: Kairui Song <kasong@xxxxxxxxxxx>
---
mm/memory.c | 18 ++++-------------
mm/swap.h | 10 +++++-----
mm/swap_state.c | 53 ++++++++++---------------------------------------
mm/swapfile.c | 13 ------------
4 files changed, 19 insertions(+), 75 deletions(-)

diff --git a/mm/memory.c b/mm/memory.c
index dfdb620a9123..357d239ee2f6 100644
--- a/mm/memory.c
+++ b/mm/memory.c
@@ -3932,7 +3932,6 @@ vm_fault_t do_swap_page(struct vm_fault *vmf)
struct page *page;
struct swap_info_struct *si = NULL;
rmap_t rmap_flags = RMAP_NONE;
- bool need_clear_cache = false;
bool exclusive = false;
swp_entry_t entry;
pte_t pte;
@@ -4000,14 +3999,9 @@ vm_fault_t do_swap_page(struct vm_fault *vmf)
if (!folio) {
if (data_race(si->flags & SWP_SYNCHRONOUS_IO) &&
__swap_count(entry) == 1) {
- /* skip swapcache and readahead */
folio = swapin_direct(entry, GFP_HIGHUSER_MOVABLE, vmf);
- if (PTR_ERR(folio) == -EBUSY)
- goto out;
- need_clear_cache = true;
} else {
folio = swapin_readahead(entry, GFP_HIGHUSER_MOVABLE, vmf);
- swapcache = folio;
}

if (!folio) {
@@ -4023,6 +4017,7 @@ vm_fault_t do_swap_page(struct vm_fault *vmf)
goto unlock;
}

+ swapcache = folio;
page = folio_file_page(folio, swp_offset(entry));

/* Had to read the page from swap area: Major fault */
@@ -4187,7 +4182,7 @@ vm_fault_t do_swap_page(struct vm_fault *vmf)
vmf->orig_pte = pte;

/* ksm created a completely new copy */
- if (unlikely(folio != swapcache && swapcache)) {
+ if (unlikely(folio != swapcache)) {
folio_add_new_anon_rmap(folio, vma, vmf->address);
folio_add_lru_vma(folio, vma);
} else {
@@ -4201,7 +4196,7 @@ vm_fault_t do_swap_page(struct vm_fault *vmf)
arch_do_swap_page(vma->vm_mm, vma, vmf->address, pte, vmf->orig_pte);

folio_unlock(folio);
- if (folio != swapcache && swapcache) {
+ if (folio != swapcache) {
/*
* Hold the lock to avoid the swap entry to be reused
* until we take the PT lock for the pte_same() check
@@ -4227,9 +4222,6 @@ vm_fault_t do_swap_page(struct vm_fault *vmf)
if (vmf->pte)
pte_unmap_unlock(vmf->pte, vmf->ptl);
out:
- /* Clear the swap cache pin for direct swapin after PTL unlock */
- if (need_clear_cache)
- swapcache_clear(si, entry);
if (si)
put_swap_device(si);
return ret;
@@ -4240,12 +4232,10 @@ vm_fault_t do_swap_page(struct vm_fault *vmf)
folio_unlock(folio);
out_release:
folio_put(folio);
- if (folio != swapcache && swapcache) {
+ if (folio != swapcache) {
folio_unlock(swapcache);
folio_put(swapcache);
}
- if (need_clear_cache)
- swapcache_clear(si, entry);
if (si)
put_swap_device(si);
return ret;
diff --git a/mm/swap.h b/mm/swap.h
index aee134907a70..ac9573b03432 100644
--- a/mm/swap.h
+++ b/mm/swap.h
@@ -41,7 +41,6 @@ void __delete_from_swap_cache(struct folio *folio,
void delete_from_swap_cache(struct folio *folio);
void clear_shadow_from_swap_cache(int type, unsigned long begin,
unsigned long end);
-void swapcache_clear(struct swap_info_struct *si, swp_entry_t entry);
struct folio *swap_cache_get_folio(swp_entry_t entry,
struct vm_area_struct *vma, unsigned long addr);
struct folio *filemap_get_incore_folio(struct address_space *mapping,
@@ -100,14 +99,15 @@ static inline struct folio *swapin_readahead(swp_entry_t swp, gfp_t gfp_mask,
{
return NULL;
}
-
-static inline int swap_writepage(struct page *p, struct writeback_control *wbc)
+static inline struct folio *swapin_direct(swp_entry_t entry, gfp_t flag,
+ struct vm_fault *vmf);
{
- return 0;
+ return NULL;
}

-static inline void swapcache_clear(struct swap_info_struct *si, swp_entry_t entry)
+static inline int swap_writepage(struct page *p, struct writeback_control *wbc)
{
+ return 0;
}

static inline struct folio *swap_cache_get_folio(swp_entry_t entry,
diff --git a/mm/swap_state.c b/mm/swap_state.c
index 2a9c6bdff5ea..49ef6250f676 100644
--- a/mm/swap_state.c
+++ b/mm/swap_state.c
@@ -880,61 +880,28 @@ static struct folio *swap_vma_readahead(swp_entry_t targ_entry, gfp_t gfp_mask,
}

/**
- * swapin_direct - swap in folios skipping swap cache and readahead
+ * swapin_direct - swap in folios skipping readahead
* @entry: swap entry of this memory
* @gfp_mask: memory allocation flags
* @vmf: fault information
*
- * Returns the struct folio for entry and addr after the swap entry is read
- * in.
+ * Returns the folio for entry after it is read in.
*/
struct folio *swapin_direct(swp_entry_t entry, gfp_t gfp_mask,
struct vm_fault *vmf)
{
- struct vm_area_struct *vma = vmf->vma;
+ struct mempolicy *mpol;
struct folio *folio;
- void *shadow = NULL;
-
- /*
- * Prevent parallel swapin from proceeding with
- * the cache flag. Otherwise, another thread may
- * finish swapin first, free the entry, and swapout
- * reusing the same entry. It's undetectable as
- * pte_same() returns true due to entry reuse.
- */
- if (swapcache_prepare(entry)) {
- /* Relax a bit to prevent rapid repeated page faults */
- schedule_timeout_uninterruptible(1);
- return ERR_PTR(-EBUSY);
- }
-
- /* skip swapcache */
- folio = vma_alloc_folio(GFP_HIGHUSER_MOVABLE, 0,
- vma, vmf->address, false);
- if (folio) {
- __folio_set_locked(folio);
- __folio_set_swapbacked(folio);
-
- if (mem_cgroup_swapin_charge_folio(folio,
- vma->vm_mm, GFP_KERNEL,
- entry)) {
- folio_unlock(folio);
- folio_put(folio);
- return NULL;
- }
- mem_cgroup_swapin_uncharge_swap(entry);
-
- shadow = get_shadow_from_swap_cache(entry);
- if (shadow)
- workingset_refault(folio, shadow);
+ bool page_allocated;
+ pgoff_t ilx;

- folio_add_lru(folio);
+ mpol = get_vma_policy(vmf->vma, vmf->address, 0, &ilx);
+ folio = __read_swap_cache_async(entry, gfp_mask, mpol, ilx,
+ &page_allocated, false);
+ mpol_cond_put(mpol);

- /* To provide entry to swap_read_folio() */
- folio->swap = entry;
+ if (page_allocated)
swap_read_folio(folio, true, NULL);
- folio->private = NULL;
- }

return folio;
}
diff --git a/mm/swapfile.c b/mm/swapfile.c
index 4dd894395a0f..ae8d3aa05df7 100644
--- a/mm/swapfile.c
+++ b/mm/swapfile.c
@@ -3389,19 +3389,6 @@ int swapcache_prepare(swp_entry_t entry)
return __swap_duplicate(entry, SWAP_HAS_CACHE);
}

-void swapcache_clear(struct swap_info_struct *si, swp_entry_t entry)
-{
- struct swap_cluster_info *ci;
- unsigned long offset = swp_offset(entry);
- unsigned char usage;
-
- ci = lock_cluster_or_swap_info(si, offset);
- usage = __swap_entry_free_locked(si, offset, SWAP_HAS_CACHE);
- unlock_cluster_or_swap_info(si, ci);
- if (!usage)
- free_swap_slot(entry);
-}
-
struct swap_info_struct *swp_swap_info(swp_entry_t entry)
{
return swap_type_to_swap_info(swp_type(entry));
--
2.43.0