Re: [RFC PATCH 10/10] mm/swap: optimize synchronous swapin

From: Barry Song
Date: Wed Mar 27 2024 - 04:09:33 EST


On Wed, Mar 27, 2024 at 8:06 AM Kairui Song <ryncsn@xxxxxxxxx> wrote:
>
> From: Kairui Song <kasong@xxxxxxxxxxx>
>
> Interestingly, the major performance overhead of synchronous swapin is
> actually from the workingset node update. Synchronous swapin keeps adding
> single folios into an xa_node, so the node no longer qualifies as a shadow
> node and has to be removed from shadow_nodes; then the folio is removed
> again very shortly after, turning the node back into a shadow node, so it
> has to be added back to shadow_nodes.

Hi Kairui,

Thank you for clarifying this. I'm unsure how it relates to SWP_SYNCHRONOUS_IO.
Does this observation apply universally to all instances where
__swap_count(entry) == 1, even on devices not using SYNCHRONOUS_IO?
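
For readers following along, my understanding is that the bouncing described
above comes from workingset_update_node(): inserting a real folio into an
xa_node that currently holds only shadow entries pulls the node off the
shadow_nodes LRU, and deleting that folio again puts the node right back on.
Roughly, as a simplified sketch of mm/workingset.c from memory (not part of
this patch; exact helper names may differ between kernel versions):

void workingset_update_node(struct xa_node *node)
{
	if (node->count && node->count == node->nr_values) {
		/* Node holds only shadow (value) entries: make it reclaimable. */
		if (list_empty(&node->private_list)) {
			list_lru_add_obj(&shadow_nodes, &node->private_list);
			__inc_lruvec_kmem_state(node, WORKINGSET_NODES);
		}
	} else {
		/* A real folio was inserted: take the node off shadow_nodes. */
		if (!list_empty(&node->private_list)) {
			list_lru_del_obj(&shadow_nodes, &node->private_list);
			__dec_lruvec_kmem_state(node, WORKINGSET_NODES);
		}
	}
}

So a path that adds a single folio and deletes it again shortly after pays
for both list_lru operations (plus the LRU lock) on every fault. That cost
seems tied to the quick insert/delete pattern rather than to
SWP_SYNCHRONOUS_IO itself, hence the question above.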


>
> Mark synchronous swapin folios with a special bit in the swap entry
> embedded in folio->swap, as we still have some usable bits there. Skip the
> workingset node update on insertion of such a folio because it will be
> removed very quickly, and the deletion will trigger the update, keeping the
> workingset info eventually consistent.
>
> Test result of sequential swapin/out of 30G zero page on ZRAM:
>
>                  Before (us)    After (us)
> Swapout:         33853883       33886008
> Swapin:          38336519       32465441 (+15.4%)
> Swapout (THP):   6814619        6899938
> Swapin (THP):    38383367       33193479 (+13.6%)
>
> Signed-off-by: Kairui Song <kasong@xxxxxxxxxxx>
> ---
> include/linux/swapops.h |  5 +++-
> mm/filemap.c            | 16 +++++++++---
> mm/memory.c             | 34 ++++++++++++++----------
> mm/swap.h               | 15 +++++++++++
> mm/swap_state.c         | 57 ++++++++++++++++++++++++-----------------
> mm/vmscan.c             |  6 +++++
> mm/workingset.c         |  2 +-
> 7 files changed, 92 insertions(+), 43 deletions(-)
>
> diff --git a/include/linux/swapops.h b/include/linux/swapops.h
> index 48b700ba1d18..ebc0c3e4668d 100644
> --- a/include/linux/swapops.h
> +++ b/include/linux/swapops.h
> @@ -25,7 +25,10 @@
> * swp_entry_t's are *never* stored anywhere in their arch-dependent format.
> */
> #define SWP_TYPE_SHIFT (BITS_PER_XA_VALUE - MAX_SWAPFILES_SHIFT)
> -#define SWP_OFFSET_MASK ((1UL << SWP_TYPE_SHIFT) - 1)
> +#define SWP_CACHE_FLAG_BITS 1
> +#define SWP_CACHE_SYNCHRONOUS BIT(SWP_TYPE_SHIFT - 1)
> +#define SWP_OFFSET_BITS (SWP_TYPE_SHIFT - SWP_CACHE_FLAG_BITS)
> +#define SWP_OFFSET_MASK (BIT(SWP_OFFSET_BITS) - 1)
>
> /*
> * Definitions only for PFN swap entries (see is_pfn_swap_entry()). To
> diff --git a/mm/filemap.c b/mm/filemap.c
> index 5e8e3fd26b8d..ac24cc65d1da 100644
> --- a/mm/filemap.c
> +++ b/mm/filemap.c
> @@ -923,12 +923,20 @@ int __filemap_add_swapcache(struct address_space *mapping, struct folio *folio,
> pgoff_t index, gfp_t gfp, void **shadowp)
> {
> XA_STATE_ORDER(xas, &mapping->i_pages, index, folio_order(folio));
> + bool synchronous = swap_cache_test_synchronous(folio);
> long nr;
> int ret;
>
> VM_BUG_ON_FOLIO(!folio_test_locked(folio), folio);
> VM_BUG_ON_FOLIO(!folio_test_swapcache(folio), folio);
> - mapping_set_update(&xas, mapping);
> +
> + /*
> + * Skip the node update for synchronous folio insertion; it will be
> + * updated on folio deletion very soon, avoiding repeated LRU locking.
> + */
> + if (!synchronous)
> + xas_set_update(&xas, workingset_update_node);
> + xas_set_lru(&xas, &shadow_nodes);
>
> nr = folio_nr_pages(folio);
> folio_ref_add(folio, nr);
> @@ -936,8 +944,10 @@ int __filemap_add_swapcache(struct address_space *mapping, struct folio *folio,
> ret = __filemap_lock_store(&xas, folio, index, gfp, shadowp);
> if (likely(!ret)) {
> mapping->nrpages += nr;
> - __node_stat_mod_folio(folio, NR_FILE_PAGES, nr);
> - __lruvec_stat_mod_folio(folio, NR_SWAPCACHE, nr);
> + if (!synchronous) {
> + __node_stat_mod_folio(folio, NR_FILE_PAGES, nr);
> + __lruvec_stat_mod_folio(folio, NR_SWAPCACHE, nr);
> + }
> xas_unlock_irq(&xas);
> } else {
> folio_put_refs(folio, nr);
> diff --git a/mm/memory.c b/mm/memory.c
> index 774a912eb46d..bb40202b4f29 100644
> --- a/mm/memory.c
> +++ b/mm/memory.c
> @@ -3933,6 +3933,7 @@ vm_fault_t do_swap_page(struct vm_fault *vmf)
> struct swap_info_struct *si = NULL;
> rmap_t rmap_flags = RMAP_NONE;
> bool folio_allocated = false;
> + bool synchronous_io = false;
> bool exclusive = false;
> swp_entry_t entry;
> pte_t pte;
> @@ -4032,18 +4033,19 @@ vm_fault_t do_swap_page(struct vm_fault *vmf)
> if (ret & VM_FAULT_RETRY)
> goto out_release;
>
> - if (swapcache) {
> - /*
> - * Make sure folio_free_swap() or swapoff did not release the
> - * swapcache from under us. The page pin, and pte_same test
> - * below, are not enough to exclude that. Even if it is still
> - * swapcache, we need to check that the page's swap has not
> - * changed.
> - */
> - if (unlikely(!folio_test_swapcache(folio) ||
> - page_swap_entry(page).val != entry.val))
> - goto out_page;
> + /*
> + * Make sure folio_free_swap() or swapoff did not release the
> + * swapcache from under us. The page pin, and pte_same test
> + * below, are not enough to exclude that. Even if it is still
> + * swapcache, we need to check that the page's swap has not
> + * changed.
> + */
> + if (unlikely(!folio_test_swapcache(folio) ||
> + (page_swap_entry(page).val & ~SWP_CACHE_SYNCHRONOUS) != entry.val))
> + goto out_page;
>
> + synchronous_io = swap_cache_test_synchronous(folio);
> + if (!synchronous_io) {
> /*
> * KSM sometimes has to copy on read faults, for example, if
> * page->index of !PageKSM() pages would be nonlinear inside the
> @@ -4105,9 +4107,9 @@ vm_fault_t do_swap_page(struct vm_fault *vmf)
> */
> if (!folio_test_ksm(folio)) {
> exclusive = pte_swp_exclusive(vmf->orig_pte);
> - if (folio != swapcache) {
> + if (synchronous_io || folio != swapcache) {
> /*
> - * We have a fresh page that is not exposed to the
> + * We have a fresh page that is not sharable through the
> * swapcache -> certainly exclusive.
> */
> exclusive = true;
> @@ -4148,7 +4150,9 @@ vm_fault_t do_swap_page(struct vm_fault *vmf)
> * yet.
> */
> swap_free(entry);
> - if (should_try_to_free_swap(folio, vma, vmf->flags))
> + if (synchronous_io)
> + delete_from_swap_cache(folio);
> + else if (should_try_to_free_swap(folio, vma, vmf->flags))
> folio_free_swap(folio);
>
> inc_mm_counter(vma->vm_mm, MM_ANONPAGES);
> @@ -4223,6 +4227,8 @@ vm_fault_t do_swap_page(struct vm_fault *vmf)
> out_nomap:
> if (vmf->pte)
> pte_unmap_unlock(vmf->pte, vmf->ptl);
> + if (synchronous_io)
> + delete_from_swap_cache(folio);
> out_page:
> folio_unlock(folio);
> out_release:
> diff --git a/mm/swap.h b/mm/swap.h
> index bd872b157950..9d106eebddbd 100644
> --- a/mm/swap.h
> +++ b/mm/swap.h
> @@ -31,6 +31,21 @@ extern struct address_space *swapper_spaces[];
> (&swapper_spaces[swp_type(entry)][swp_offset(entry) \
> >> SWAP_ADDRESS_SPACE_SHIFT])
>
> +static inline void swap_cache_mark_synchronous(struct folio *folio)
> +{
> + folio->swap.val |= SWP_CACHE_SYNCHRONOUS;
> +}
> +
> +static inline bool swap_cache_test_synchronous(struct folio *folio)
> +{
> + return folio->swap.val & SWP_CACHE_SYNCHRONOUS;
> +}
> +
> +static inline void swap_cache_clear_synchronous(struct folio *folio)
> +{
> + folio->swap.val &= ~SWP_CACHE_SYNCHRONOUS;
> +}
> +
> void show_swap_cache_info(void);
> bool add_to_swap(struct folio *folio);
> void *get_shadow_from_swap_cache(swp_entry_t entry);
> diff --git a/mm/swap_state.c b/mm/swap_state.c
> index cf178dd1131a..b0b1b5391ac1 100644
> --- a/mm/swap_state.c
> +++ b/mm/swap_state.c
> @@ -86,7 +86,7 @@ void *get_shadow_from_swap_cache(swp_entry_t entry)
> * but sets SwapCache flag and private instead of mapping and index.
> */
> static int add_to_swap_cache(struct folio *folio, swp_entry_t entry,
> - gfp_t gfp, void **shadowp)
> + gfp_t gfp, bool synchronous, void **shadowp)
> {
> struct address_space *address_space = swap_address_space(entry);
> pgoff_t idx = swp_offset(entry);
> @@ -98,11 +98,12 @@ static int add_to_swap_cache(struct folio *folio, swp_entry_t entry,
>
> folio_set_swapcache(folio);
> folio->swap = entry;
> -
> + if (synchronous)
> + swap_cache_mark_synchronous(folio);
> ret = __filemap_add_swapcache(address_space, folio, idx, gfp, shadowp);
> if (ret) {
> - folio_clear_swapcache(folio);
> folio->swap.val = 0;
> + folio_clear_swapcache(folio);
> }
>
> return ret;
> @@ -129,11 +130,13 @@ void __delete_from_swap_cache(struct folio *folio,
> xas_set_order(&xas, idx, folio_order(folio));
> xas_store(&xas, shadow);
>
> - folio->swap.val = 0;
> folio_clear_swapcache(folio);
> address_space->nrpages -= nr;
> - __node_stat_mod_folio(folio, NR_FILE_PAGES, -nr);
> - __lruvec_stat_mod_folio(folio, NR_SWAPCACHE, -nr);
> + if (!swap_cache_test_synchronous(folio)) {
> + __node_stat_mod_folio(folio, NR_FILE_PAGES, -nr);
> + __lruvec_stat_mod_folio(folio, NR_SWAPCACHE, -nr);
> + }
> + folio->swap.val = 0;
> }
>
> /**
> @@ -393,7 +396,7 @@ struct folio *filemap_get_incore_folio(struct address_space *mapping,
> * else or hitting OOM.
> */
> static struct folio *swap_cache_add_or_get(struct folio *folio,
> - swp_entry_t entry, gfp_t gfp_mask)
> + swp_entry_t entry, gfp_t gfp_mask, bool synchronous)
> {
> int ret = 0;
> void *shadow = NULL;
> @@ -403,7 +406,7 @@ static struct folio *swap_cache_add_or_get(struct folio *folio,
> if (folio) {
> __folio_set_locked(folio);
> __folio_set_swapbacked(folio);
> - ret = add_to_swap_cache(folio, entry, gfp_mask, &shadow);
> + ret = add_to_swap_cache(folio, entry, gfp_mask, synchronous, &shadow);
> if (ret)
> __folio_clear_locked(folio);
> }
> @@ -460,7 +463,7 @@ int swap_cache_add_wait(struct folio *folio, swp_entry_t entry, gfp_t gfp)
> struct folio *wait_folio;
>
> for (;;) {
> - ret = add_to_swap_cache(folio, entry, gfp, NULL);
> + ret = add_to_swap_cache(folio, entry, gfp, false, NULL);
> if (ret != -EEXIST)
> break;
> wait_folio = filemap_get_folio(swap_address_space(entry),
> @@ -493,7 +496,7 @@ struct folio *swap_cache_alloc_or_get(swp_entry_t entry, gfp_t gfp_mask,
> /* We are very likely the first user, alloc and try add to the swapcache. */
> folio = (struct folio *)alloc_pages_mpol(gfp_mask, 0, mpol, ilx,
> numa_node_id());
> - swapcache = swap_cache_add_or_get(folio, entry, gfp_mask);
> + swapcache = swap_cache_add_or_get(folio, entry, gfp_mask, false);
> if (swapcache != folio) {
> folio_put(folio);
> goto out_no_alloc;
> @@ -875,21 +878,27 @@ static struct folio *swap_vma_readahead(swp_entry_t targ_entry, gfp_t gfp_mask,
> struct folio *swapin_direct(swp_entry_t entry, gfp_t gfp_mask,
> struct vm_fault *vmf, bool *folio_allocated)
> {
> - struct mempolicy *mpol;
> - struct folio *folio;
> - pgoff_t ilx;
> -
> - mpol = get_vma_policy(vmf->vma, vmf->address, 0, &ilx);
> - folio = swap_cache_alloc_or_get(entry, gfp_mask, mpol, ilx,
> - folio_allocated);
> - mpol_cond_put(mpol);
> -
> - if (*folio_allocated)
> + struct folio *folio = NULL, *swapcache;
> + /* First do a racy check if cache is already loaded. */
> + swapcache = swap_cache_try_get(entry);
> + if (unlikely(swapcache))
> + goto out;
> + folio = vma_alloc_folio(gfp_mask, 0, vmf->vma, vmf->address, false);
> + swapcache = swap_cache_add_or_get(folio, entry, gfp_mask, true);
> + if (!swapcache)
> + goto out_nocache;
> + if (swapcache == folio) {
> swap_read_folio(folio, true, NULL);
> - else if (folio)
> - swap_cache_update_ra(folio, vmf->vma, vmf->address);
> -
> - return folio;
> + *folio_allocated = true;
> + return folio;
> + }
> +out:
> + swap_cache_update_ra(swapcache, vmf->vma, vmf->address);
> +out_nocache:
> + if (folio)
> + folio_put(folio);
> + *folio_allocated = false;
> + return swapcache;
> }
>
> /**
> diff --git a/mm/vmscan.c b/mm/vmscan.c
> index c3db39393428..e71b049fee01 100644
> --- a/mm/vmscan.c
> +++ b/mm/vmscan.c
> @@ -1228,6 +1228,12 @@ static unsigned int shrink_folio_list(struct list_head *folio_list,
> if (!add_to_swap(folio))
> goto activate_locked_split;
> }
> + } else if (swap_cache_test_synchronous(folio)) {
> + /*
> + * We see a folio being swapped in but not activated, either
> + * due to a missing shadow or a too-short lifetime; activate it.
> + */
> + goto activate_locked;
> }
> } else if (folio_test_swapbacked(folio) &&
> folio_test_large(folio)) {
> diff --git a/mm/workingset.c b/mm/workingset.c
> index f2a0ecaf708d..83a0b409be0f 100644
> --- a/mm/workingset.c
> +++ b/mm/workingset.c
> @@ -753,7 +753,7 @@ static enum lru_status shadow_lru_isolate(struct list_head *item,
> */
> if (WARN_ON_ONCE(!node->nr_values))
> goto out_invalid;
> - if (WARN_ON_ONCE(node->count != node->nr_values))
> + if (WARN_ON_ONCE(node->count != node->nr_values && mapping->host != NULL))
> goto out_invalid;
> xa_delete_node(node, workingset_update_node);
> __inc_lruvec_kmem_state(node, WORKINGSET_NODERECLAIM);
> --
> 2.43.0
>
>
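
One more note for anyone reading the swapops.h hunk: if I read the new macros
right, the flag simply steals the highest offset bit just below the swap type.
On a 64-bit build the value stashed in folio->swap would then look roughly
like this (my own sketch of the layout, assuming BITS_PER_XA_VALUE == 63 and
MAX_SWAPFILES_SHIFT == 5, i.e. SWP_TYPE_SHIFT == 58; not taken from the
patch itself):

/*
 *  bits 58..62  swap type
 *  bit  57      SWP_CACHE_SYNCHRONOUS (the new flag, which the patch
 *               appears to set only in folio->swap)
 *  bits  0..56  swap offset
 */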

Thanks
Barry