[PATCH 18/24] mm/swap: introduce a helper for non-fault swapin

From: Kairui Song
Date: Sun Nov 19 2023 - 14:50:02 EST


From: Kairui Song <kasong@xxxxxxxxxxx>

There are two places where swapin is not directly caused by a page
fault: shmem swapin is invoked through the shmem mapping, and swapoff
causes swapin by walking the page table. Both used to construct a
pseudo vmfault struct for the swapin function.

Shmem dropped the pseudo vmfault recently in commit ddc1a5cbc05d
("mempolicy: alloc_pages_mpol() for NUMA policy without vma"). The
swapoff path is still using a pseudo vmfault.

Introduce a helper for both of them. This helps save stack usage in the
swapoff path and helps apply a unified swapin cache and readahead policy
check.

This also prepares for follow-up commits.

Signed-off-by: Kairui Song <kasong@xxxxxxxxxxx>
---
mm/shmem.c | 51 ++++++++++++++++---------------------------------
mm/swap.h | 11 +++++++++++
mm/swap_state.c | 38 ++++++++++++++++++++++++++++++++++++
mm/swapfile.c | 23 +++++++++++-----------
4 files changed, 76 insertions(+), 47 deletions(-)

diff --git a/mm/shmem.c b/mm/shmem.c
index f9ce4067c742..81d129aa66d1 100644
--- a/mm/shmem.c
+++ b/mm/shmem.c
@@ -1565,22 +1565,6 @@ static inline struct mempolicy *shmem_get_sbmpol(struct shmem_sb_info *sbinfo)
static struct mempolicy *shmem_get_pgoff_policy(struct shmem_inode_info *info,
pgoff_t index, unsigned int order, pgoff_t *ilx);

-static struct folio *shmem_swapin_cluster(swp_entry_t swap, gfp_t gfp,
- struct shmem_inode_info *info, pgoff_t index)
-{
- struct mempolicy *mpol;
- pgoff_t ilx;
- struct page *page;
-
- mpol = shmem_get_pgoff_policy(info, index, 0, &ilx);
- page = swap_cluster_readahead(swap, gfp, mpol, ilx);
- mpol_cond_put(mpol);
-
- if (!page)
- return NULL;
- return page_folio(page);
-}
-
/*
* Make sure huge_gfp is always more limited than limit_gfp.
* Some of the flags set permissions, while others set limitations.
@@ -1854,9 +1838,12 @@ static int shmem_swapin_folio(struct inode *inode, pgoff_t index,
{
struct address_space *mapping = inode->i_mapping;
struct shmem_inode_info *info = SHMEM_I(inode);
- struct swap_info_struct *si;
+ enum swap_cache_result result;
struct folio *folio = NULL;
+ struct mempolicy *mpol;
+ struct page *page;
swp_entry_t swap;
+ pgoff_t ilx;
int error;

VM_BUG_ON(!*foliop || !xa_is_value(*foliop));
@@ -1866,34 +1853,30 @@ static int shmem_swapin_folio(struct inode *inode, pgoff_t index,
if (is_poisoned_swp_entry(swap))
return -EIO;

- si = get_swap_device(swap);
- if (!si) {
+ mpol = shmem_get_pgoff_policy(info, index, 0, &ilx);
+ page = swapin_page_non_fault(swap, gfp, mpol, ilx, fault_mm, &result);
+ mpol_cond_put(mpol);
+
+ if (PTR_ERR(page) == -EBUSY) {
if (!shmem_confirm_swap(mapping, index, swap))
return -EEXIST;
else
return -EINVAL;
- }
-
- /* Look it up and read it in.. */
- folio = swap_cache_get_folio(swap, NULL, NULL);
- if (!folio) {
- /* Or update major stats only when swapin succeeds?? */
- if (fault_type) {
+ } else if (!page) {
+ error = -ENOMEM;
+ goto failed;
+ } else {
+ folio = page_folio(page);
+ if (fault_type && result != SWAP_CACHE_HIT) {
*fault_type |= VM_FAULT_MAJOR;
count_vm_event(PGMAJFAULT);
count_memcg_event_mm(fault_mm, PGMAJFAULT);
}
- /* Here we actually start the io */
- folio = shmem_swapin_cluster(swap, gfp, info, index);
- if (!folio) {
- error = -ENOMEM;
- goto failed;
- }
}

/* We have to do this with folio locked to prevent races */
folio_lock(folio);
- if (!folio_test_swapcache(folio) ||
+ if ((result != SWAP_CACHE_BYPASS && !folio_test_swapcache(folio)) ||
folio->swap.val != swap.val ||
!shmem_confirm_swap(mapping, index, swap)) {
error = -EEXIST;
@@ -1930,7 +1913,6 @@ static int shmem_swapin_folio(struct inode *inode, pgoff_t index,
delete_from_swap_cache(folio);
folio_mark_dirty(folio);
swap_free(swap);
- put_swap_device(si);

*foliop = folio;
return 0;
@@ -1944,7 +1926,6 @@ static int shmem_swapin_folio(struct inode *inode, pgoff_t index,
folio_unlock(folio);
folio_put(folio);
}
- put_swap_device(si);

return error;
}
diff --git a/mm/swap.h b/mm/swap.h
index da9deb5ba37d..b073c29c9790 100644
--- a/mm/swap.h
+++ b/mm/swap.h
@@ -62,6 +62,10 @@ struct page *swap_cluster_readahead(swp_entry_t entry, gfp_t flag,
struct mempolicy *mpol, pgoff_t ilx);
struct page *swapin_readahead(swp_entry_t entry, gfp_t flag,
struct vm_fault *vmf, enum swap_cache_result *result);
+struct page *swapin_page_non_fault(swp_entry_t entry, gfp_t gfp_mask,
+ struct mempolicy *mpol, pgoff_t ilx,
+ struct mm_struct *mm,
+ enum swap_cache_result *result);

static inline unsigned int folio_swap_flags(struct folio *folio)
{
@@ -103,6 +107,13 @@ static inline struct page *swapin_readahead(swp_entry_t swp, gfp_t gfp_mask,
return NULL;
}

+static inline struct page *swapin_page_non_fault(swp_entry_t entry, gfp_t gfp_mask,
+ struct mempolicy *mpol, pgoff_t ilx, struct mm_struct *mm,
+ enum swap_cache_result *result)
+{
+ return NULL;
+}
+
static inline int swap_writepage(struct page *p, struct writeback_control *wbc)
{
return 0;
diff --git a/mm/swap_state.c b/mm/swap_state.c
index ff8a166603d0..eef66757c615 100644
--- a/mm/swap_state.c
+++ b/mm/swap_state.c
@@ -956,6 +956,44 @@ struct page *swapin_readahead(swp_entry_t entry, gfp_t gfp_mask,
return page;
}

+struct page *swapin_page_non_fault(swp_entry_t entry, gfp_t gfp_mask,
+ struct mempolicy *mpol, pgoff_t ilx,
+ struct mm_struct *mm, enum swap_cache_result *result)
+{
+ enum swap_cache_result cache_result;
+ struct swap_info_struct *si;
+ void *shadow = NULL;
+ struct folio *folio;
+ struct page *page;
+
+ /* Prevent swapoff from happening to us */
+ si = get_swap_device(entry);
+ if (unlikely(!si))
+ return ERR_PTR(-EBUSY);
+
+ folio = swap_cache_get_folio(entry, NULL, &shadow);
+ if (folio) {
+ page = folio_file_page(folio, swp_offset(entry));
+ cache_result = SWAP_CACHE_HIT;
+ goto done;
+ }
+
+ if (swap_use_no_readahead(si, swp_offset(entry))) {
+ page = swapin_no_readahead(entry, gfp_mask, mpol, ilx, mm);
+ if (shadow)
+ workingset_refault(page_folio(page), shadow);
+ cache_result = SWAP_CACHE_BYPASS;
+ } else {
+ page = swap_cluster_readahead(entry, gfp_mask, mpol, ilx);
+ cache_result = SWAP_CACHE_MISS;
+ }
+done:
+ put_swap_device(si);
+ if (result)
+ *result = cache_result;
+ return page;
+}
+
#ifdef CONFIG_SYSFS
static ssize_t vma_ra_enabled_show(struct kobject *kobj,
struct kobj_attribute *attr, char *buf)
diff --git a/mm/swapfile.c b/mm/swapfile.c
index 925ad92486a4..f8c5096fe0f0 100644
--- a/mm/swapfile.c
+++ b/mm/swapfile.c
@@ -1822,20 +1822,15 @@ static int unuse_pte_range(struct vm_area_struct *vma, pmd_t *pmd,

si = swap_info[type];
do {
+ int ret;
+ pte_t ptent;
+ pgoff_t ilx;
+ swp_entry_t entry;
struct page *page;
unsigned long offset;
+ struct mempolicy *mpol;
unsigned char swp_count;
struct folio *folio = NULL;
- swp_entry_t entry;
- int ret;
- pte_t ptent;
-
- struct vm_fault vmf = {
- .vma = vma,
- .address = addr,
- .real_address = addr,
- .pmd = pmd,
- };

if (!pte++) {
pte = pte_offset_map(pmd, addr);
@@ -1855,8 +1850,12 @@ static int unuse_pte_range(struct vm_area_struct *vma, pmd_t *pmd,
offset = swp_offset(entry);
pte_unmap(pte);
pte = NULL;
- page = swapin_readahead(entry, GFP_HIGHUSER_MOVABLE,
- &vmf, NULL);
+
+ mpol = get_vma_policy(vma, addr, 0, &ilx);
+ page = swapin_page_non_fault(entry, GFP_HIGHUSER_MOVABLE,
+ mpol, ilx, vma->vm_mm, NULL);
+ mpol_cond_put(mpol);
+
if (IS_ERR(page))
return PTR_ERR(page);
else if (page)
--
2.42.0