Re: [PATCH v2 9/9] mm/swap, shmem: use new swapin helper to skip readahead conditionally

From: Huang, Ying
Date: Mon Jan 08 2024 - 21:05:47 EST


Kairui Song <ryncsn@xxxxxxxxx> writes:

> From: Kairui Song <kasong@xxxxxxxxxxx>
>
> Currently, shmem uses cluster readahead for all swap backends. Cluster
> readahead is not a good solution for ramdisk based device (ZRAM) at all.
>
> After switching to the new helper, most benchmarks showed a good result:
>
> - Single file sequence read:
> perf stat --repeat 20 dd if=/tmpfs/test of=/dev/null bs=1M count=8192
> (/tmpfs/test is a zero filled file, using brd as swap, 4G memcg limit)
> Before: 22.248 +- 0.549
> After: 22.021 +- 0.684 (-1.1%)
>
> - Random read stress test:
> fio -name=tmpfs --numjobs=16 --directory=/tmpfs \
> --size=256m --ioengine=mmap --rw=randread --random_distribution=random \
> --time_based --ramp_time=1m --runtime=5m --group_reporting
> (using brd as swap, 2G memcg limit)
>
> Before: 1818MiB/s
> After: 1888MiB/s (+3.85%)
>
> - Zipf biased random read stress test:
> fio -name=tmpfs --numjobs=16 --directory=/tmpfs \
> --size=256m --ioengine=mmap --rw=randread --random_distribution=zipf:1.2 \
> --time_based --ramp_time=1m --runtime=5m --group_reporting
> (using brd as swap, 2G memcg limit)
>
> Before: 31.1GiB/s
> After: 32.3GiB/s (+3.86%)
>
> So cluster readahead doesn't help much even for single sequence read,
> and for random stress test, the performance is better without it.
>
> Considering both memory and swap device will get more fragmented
> slowly, and commonly used ZRAM consumes much more CPU than plain
> ramdisk, false readahead could occur more frequently and waste
> more CPU. Direct SWAP is cheaper, so use the new helper and skip
> read ahead for SWP_SYNCHRONOUS_IO device.

It's good to take advantage of swap_direct (no readahead). I also hopes
we can take advantage of VMA based swapin if shmem is accessed via mmap.
That appears possible.

--
Best Regards,
Huang, Ying

> Signed-off-by: Kairui Song <kasong@xxxxxxxxxxx>
> ---
> mm/shmem.c | 67 +++++++++++++++++++++++++------------------------
> mm/swap.h | 9 -------
> mm/swap_state.c | 11 ++++++--
> 3 files changed, 43 insertions(+), 44 deletions(-)
>
> diff --git a/mm/shmem.c b/mm/shmem.c
> index 9da9f7a0e620..3c0729fe934d 100644
> --- a/mm/shmem.c
> +++ b/mm/shmem.c
> @@ -1564,20 +1564,6 @@ static inline struct mempolicy *shmem_get_sbmpol(struct shmem_sb_info *sbinfo)
> static struct mempolicy *shmem_get_pgoff_policy(struct shmem_inode_info *info,
> pgoff_t index, unsigned int order, pgoff_t *ilx);
>
> -static struct folio *shmem_swapin_cluster(swp_entry_t swap, gfp_t gfp,
> - struct shmem_inode_info *info, pgoff_t index)
> -{
> - struct mempolicy *mpol;
> - pgoff_t ilx;
> - struct folio *folio;
> -
> - mpol = shmem_get_pgoff_policy(info, index, 0, &ilx);
> - folio = swap_cluster_readahead(swap, gfp, mpol, ilx);
> - mpol_cond_put(mpol);
> -
> - return folio;
> -}
> -
> /*
> * Make sure huge_gfp is always more limited than limit_gfp.
> * Some of the flags set permissions, while others set limitations.
> @@ -1851,9 +1837,12 @@ static int shmem_swapin_folio(struct inode *inode, pgoff_t index,
> {
> struct address_space *mapping = inode->i_mapping;
> struct shmem_inode_info *info = SHMEM_I(inode);
> + enum swap_cache_result cache_result;
> struct swap_info_struct *si;
> struct folio *folio = NULL;
> + struct mempolicy *mpol;
> swp_entry_t swap;
> + pgoff_t ilx;
> int error;
>
> VM_BUG_ON(!*foliop || !xa_is_value(*foliop));
> @@ -1871,36 +1860,40 @@ static int shmem_swapin_folio(struct inode *inode, pgoff_t index,
> return -EINVAL;
> }
>
> - /* Look it up and read it in.. */
> - folio = swap_cache_get_folio(swap, NULL, 0, NULL);
> + mpol = shmem_get_pgoff_policy(info, index, 0, &ilx);
> + folio = swapin_entry_mpol(swap, gfp, mpol, ilx, &cache_result);
> + mpol_cond_put(mpol);
> +
> if (!folio) {
> - /* Or update major stats only when swapin succeeds?? */
> + error = -ENOMEM;
> + goto failed;
> + }
> + if (cache_result != SWAP_CACHE_HIT) {
> if (fault_type) {
> *fault_type |= VM_FAULT_MAJOR;
> count_vm_event(PGMAJFAULT);
> count_memcg_event_mm(fault_mm, PGMAJFAULT);
> }
> - /* Here we actually start the io */
> - folio = shmem_swapin_cluster(swap, gfp, info, index);
> - if (!folio) {
> - error = -ENOMEM;
> - goto failed;
> - }
> }
>
> /* We have to do this with folio locked to prevent races */
> folio_lock(folio);
> - if (!folio_test_swapcache(folio) ||
> - folio->swap.val != swap.val ||
> - !shmem_confirm_swap(mapping, index, swap)) {
> + if (cache_result != SWAP_CACHE_BYPASS) {
> + /* With cache bypass, folio is new allocated, sync, and not in cache */
> + if (!folio_test_swapcache(folio) || folio->swap.val != swap.val) {
> + error = -EEXIST;
> + goto unlock;
> + }
> + if (!folio_test_uptodate(folio)) {
> + error = -EIO;
> + goto failed;
> + }
> + folio_wait_writeback(folio);
> + }
> + if (!shmem_confirm_swap(mapping, index, swap)) {
> error = -EEXIST;
> goto unlock;
> }
> - if (!folio_test_uptodate(folio)) {
> - error = -EIO;
> - goto failed;
> - }
> - folio_wait_writeback(folio);
>
> /*
> * Some architectures may have to restore extra metadata to the
> @@ -1908,12 +1901,19 @@ static int shmem_swapin_folio(struct inode *inode, pgoff_t index,
> */
> arch_swap_restore(swap, folio);
>
> - if (shmem_should_replace_folio(folio, gfp)) {
> + /* With cache bypass, folio is new allocated and always respect gfp flags */
> + if (cache_result != SWAP_CACHE_BYPASS && shmem_should_replace_folio(folio, gfp)) {
> error = shmem_replace_folio(&folio, gfp, info, index);
> if (error)
> goto failed;
> }
>
> + /*
> + * The expected value checking below should be enough to ensure
> + * only one up-to-date swapin success. swap_free() is called after
> + * this, so the entry can't be reused. As long as the mapping still
> + * has the old entry value, it's never swapped in or modified.
> + */
> error = shmem_add_to_page_cache(folio, mapping, index,
> swp_to_radix_entry(swap), gfp);
> if (error)
> @@ -1924,7 +1924,8 @@ static int shmem_swapin_folio(struct inode *inode, pgoff_t index,
> if (sgp == SGP_WRITE)
> folio_mark_accessed(folio);
>
> - delete_from_swap_cache(folio);
> + if (cache_result != SWAP_CACHE_BYPASS)
> + delete_from_swap_cache(folio);
> folio_mark_dirty(folio);
> swap_free(swap);
> put_swap_device(si);
> diff --git a/mm/swap.h b/mm/swap.h
> index 8f790a67b948..20f4048c971c 100644
> --- a/mm/swap.h
> +++ b/mm/swap.h
> @@ -57,9 +57,6 @@ void __delete_from_swap_cache(struct folio *folio,
> void delete_from_swap_cache(struct folio *folio);
> void clear_shadow_from_swap_cache(int type, unsigned long begin,
> unsigned long end);
> -struct folio *swap_cache_get_folio(swp_entry_t entry,
> - struct vm_area_struct *vma, unsigned long addr,
> - void **shadowp);
> struct folio *filemap_get_incore_folio(struct address_space *mapping,
> pgoff_t index);
>
> @@ -123,12 +120,6 @@ static inline int swap_writepage(struct page *p, struct writeback_control *wbc)
> return 0;
> }
>
> -static inline struct folio *swap_cache_get_folio(swp_entry_t entry,
> - struct vm_area_struct *vma, unsigned long addr)
> -{
> - return NULL;
> -}
> -
> static inline
> struct folio *filemap_get_incore_folio(struct address_space *mapping,
> pgoff_t index)
> diff --git a/mm/swap_state.c b/mm/swap_state.c
> index 3edf4b63158d..10eec68475dd 100644
> --- a/mm/swap_state.c
> +++ b/mm/swap_state.c
> @@ -318,7 +318,14 @@ void free_pages_and_swap_cache(struct encoded_page **pages, int nr)
>
> static inline bool swap_use_no_readahead(struct swap_info_struct *si, swp_entry_t entry)
> {
> - return data_race(si->flags & SWP_SYNCHRONOUS_IO) && __swap_count(entry) == 1;
> + int count;
> +
> + if (!data_race(si->flags & SWP_SYNCHRONOUS_IO))
> + return false;
> +
> + count = __swap_count(entry);
> +
> + return (count == 1 || count == SWAP_MAP_SHMEM);
> }
>
> static inline bool swap_use_vma_readahead(void)
> @@ -334,7 +341,7 @@ static inline bool swap_use_vma_readahead(void)
> *
> * Caller must lock the swap device or hold a reference to keep it valid.
> */
> -struct folio *swap_cache_get_folio(swp_entry_t entry,
> +static struct folio *swap_cache_get_folio(swp_entry_t entry,
> struct vm_area_struct *vma, unsigned long addr, void **shadowp)
> {
> struct folio *folio;