[PATCH 05/24] mm/swap: move readahead policy checking into swapin_readahead

From: Kairui Song
Date: Sun Nov 19 2023 - 14:48:34 EST


From: Kairui Song <kasong@xxxxxxxxxxx>

This makes swapin_readahead a main entry for swapin pages,
prepare for optimizations in later commits.

This also makes swapoff able to make use of readahead checking
based on entry. Swapping off a 10G ZRAM (lzo-rle) is faster:

Before:
time swapoff /dev/zram0
real 0m12.337s
user 0m0.001s
sys 0m12.329s

After:
time swapoff /dev/zram0
real 0m9.728s
user 0m0.001s
sys 0m9.719s

And what's more, because now swapoff will also make use of no-readahead
swapin helper, this also fixed a bug for no-readahead case (eg. ZRAM):
when a process that swapped out some memory previously was moved to a new
cgroup, and the original cgroup is dead, swapoff the swap device will
make the swapped in pages accounted into the process doing the swapoff
instead of the new cgroup the process was moved to.

This can be easily reproduced by:
- Setup a ramdisk (eg. ZRAM) swap.
- Create memory cgroup A, B and C.
- Spawn process P1 in cgroup A and make it swap out some pages.
- Move process P1 to memory cgroup B.
- Destroy cgroup A.
- Do a swapoff in cgroup C.
- Swapped in pages is accounted into cgroup C.

This patch will fix it make the swapped in pages accounted in cgroup B.

The same bug exists for readahead path too, we'll fix it in later
commits.

Signed-off-by: Kairui Song <kasong@xxxxxxxxxxx>
---
mm/memory.c | 22 +++++++---------------
mm/swap.h | 6 ++----
mm/swap_state.c | 33 ++++++++++++++++++++++++++-------
mm/swapfile.c | 2 +-
4 files changed, 36 insertions(+), 27 deletions(-)

diff --git a/mm/memory.c b/mm/memory.c
index fba4a5229163..f4237a2e3b93 100644
--- a/mm/memory.c
+++ b/mm/memory.c
@@ -3792,6 +3792,7 @@ vm_fault_t do_swap_page(struct vm_fault *vmf)
rmap_t rmap_flags = RMAP_NONE;
bool exclusive = false;
swp_entry_t entry;
+ bool swapcached;
pte_t pte;
vm_fault_t ret = 0;

@@ -3855,22 +3856,13 @@ vm_fault_t do_swap_page(struct vm_fault *vmf)
swapcache = folio;

if (!folio) {
- if (data_race(si->flags & SWP_SYNCHRONOUS_IO) &&
- __swap_count(entry) == 1) {
- /* skip swapcache and readahead */
- page = swapin_no_readahead(entry, GFP_HIGHUSER_MOVABLE,
- vmf);
- if (page)
- folio = page_folio(page);
+ page = swapin_readahead(entry, GFP_HIGHUSER_MOVABLE,
+ vmf, &swapcached);
+ if (page) {
+ folio = page_folio(page);
+ if (swapcached)
+ swapcache = folio;
} else {
- page = swapin_readahead(entry, GFP_HIGHUSER_MOVABLE,
- vmf);
- if (page)
- folio = page_folio(page);
- swapcache = folio;
- }
-
- if (!folio) {
/*
* Back out if somebody else faulted in this pte
* while we released the pte lock.
diff --git a/mm/swap.h b/mm/swap.h
index ea4be4791394..f82d43d7b52a 100644
--- a/mm/swap.h
+++ b/mm/swap.h
@@ -55,9 +55,7 @@ struct page *__read_swap_cache_async(swp_entry_t entry, gfp_t gfp_mask,
struct page *swap_cluster_readahead(swp_entry_t entry, gfp_t flag,
struct mempolicy *mpol, pgoff_t ilx);
struct page *swapin_readahead(swp_entry_t entry, gfp_t flag,
- struct vm_fault *vmf);
-struct page *swapin_no_readahead(swp_entry_t entry, gfp_t flag,
- struct vm_fault *vmf);
+ struct vm_fault *vmf, bool *swapcached);

static inline unsigned int folio_swap_flags(struct folio *folio)
{
@@ -89,7 +87,7 @@ static inline struct page *swap_cluster_readahead(swp_entry_t entry,
}

static inline struct page *swapin_readahead(swp_entry_t swp, gfp_t gfp_mask,
- struct vm_fault *vmf)
+ struct vm_fault *vmf, bool *swapcached)
{
return NULL;
}
diff --git a/mm/swap_state.c b/mm/swap_state.c
index 45dd8b7c195d..fd0047ae324e 100644
--- a/mm/swap_state.c
+++ b/mm/swap_state.c
@@ -316,6 +316,11 @@ void free_pages_and_swap_cache(struct encoded_page **pages, int nr)
release_pages(pages, nr);
}

+static inline bool swap_use_no_readahead(struct swap_info_struct *si, swp_entry_t entry)
+{
+ return data_race(si->flags & SWP_SYNCHRONOUS_IO) && __swap_count(entry) == 1;
+}
+
static inline bool swap_use_vma_readahead(void)
{
return READ_ONCE(enable_vma_readahead) && !atomic_read(&nr_rotate_swap);
@@ -861,8 +866,8 @@ static struct page *swap_vma_readahead(swp_entry_t targ_entry, gfp_t gfp_mask,
* Returns the struct page for entry and addr after the swap entry is read
* in.
*/
-struct page *swapin_no_readahead(swp_entry_t entry, gfp_t gfp_mask,
- struct vm_fault *vmf)
+static struct page *swapin_no_readahead(swp_entry_t entry, gfp_t gfp_mask,
+ struct vm_fault *vmf)
{
struct vm_area_struct *vma = vmf->vma;
struct page *page = NULL;
@@ -904,6 +909,8 @@ struct page *swapin_no_readahead(swp_entry_t entry, gfp_t gfp_mask,
* @entry: swap entry of this memory
* @gfp_mask: memory allocation flags
* @vmf: fault information
+ * @swapcached: pointer to a bool used as indicator if the
+ * page is swapped in through swapcache.
*
* Returns the struct page for entry and addr, after queueing swapin.
*
@@ -912,17 +919,29 @@ struct page *swapin_no_readahead(swp_entry_t entry, gfp_t gfp_mask,
* or vma-based(ie, virtual address based on faulty address) readahead.
*/
struct page *swapin_readahead(swp_entry_t entry, gfp_t gfp_mask,
- struct vm_fault *vmf)
+ struct vm_fault *vmf, bool *swapcached)
{
struct mempolicy *mpol;
- pgoff_t ilx;
struct page *page;
+ pgoff_t ilx;
+ bool cached;

mpol = get_vma_policy(vmf->vma, vmf->address, 0, &ilx);
- page = swap_use_vma_readahead() ?
- swap_vma_readahead(entry, gfp_mask, mpol, ilx, vmf) :
- swap_cluster_readahead(entry, gfp_mask, mpol, ilx);
+ if (swap_use_no_readahead(swp_swap_info(entry), entry)) {
+ page = swapin_no_readahead(entry, gfp_mask, vmf);
+ cached = false;
+ } else if (swap_use_vma_readahead()) {
+ page = swap_vma_readahead(entry, gfp_mask, mpol, ilx, vmf);
+ cached = true;
+ } else {
+ page = swap_cluster_readahead(entry, gfp_mask, mpol, ilx);
+ cached = true;
+ }
mpol_cond_put(mpol);
+
+ if (swapcached)
+ *swapcached = cached;
+
return page;
}

diff --git a/mm/swapfile.c b/mm/swapfile.c
index 756104ebd585..0142bfc71b81 100644
--- a/mm/swapfile.c
+++ b/mm/swapfile.c
@@ -1874,7 +1874,7 @@ static int unuse_pte_range(struct vm_area_struct *vma, pmd_t *pmd,
};

page = swapin_readahead(entry, GFP_HIGHUSER_MOVABLE,
- &vmf);
+ &vmf, NULL);
if (page)
folio = page_folio(page);
}
--
2.42.0