Re: [PATCH v3 3/3] mm: Batch-zap large anonymous folio PTE mappings

From: Nathan Chancellor
Date: Wed Jul 26 2023 - 12:20:12 EST


Hi Ryan,

On Thu, Jul 20, 2023 at 12:29:55PM +0100, Ryan Roberts wrote:
> This allows batching the rmap removal with folio_remove_rmap_range(),
> which means we avoid spuriously adding a partially unmapped folio to the
> deferred split queue in the common case, which reduces split queue lock
> contention.
>
> Previously each page was removed from the rmap individually with
> page_remove_rmap(). If the first page belonged to a large folio, this
> would cause page_remove_rmap() to conclude that the folio was now
> partially mapped and add the folio to the deferred split queue. But
> subsequent calls would cause the folio to become fully unmapped, meaning
> there is no value to adding it to the split queue.
>
> Signed-off-by: Ryan Roberts <ryan.roberts@xxxxxxx>
> ---
> mm/memory.c | 120 ++++++++++++++++++++++++++++++++++++++++++++++++++++
> 1 file changed, 120 insertions(+)
>
> diff --git a/mm/memory.c b/mm/memory.c
> index 01f39e8144ef..189b1cfd823d 100644
> --- a/mm/memory.c
> +++ b/mm/memory.c
> @@ -1391,6 +1391,94 @@ zap_install_uffd_wp_if_needed(struct vm_area_struct *vma,
> pte_install_uffd_wp_if_needed(vma, addr, pte, pteval);
> }
>
> +static inline unsigned long page_cont_mapped_vaddr(struct page *page,
> + struct page *anchor, unsigned long anchor_vaddr)
> +{
> + unsigned long offset;
> + unsigned long vaddr;
> +
> + offset = (page_to_pfn(page) - page_to_pfn(anchor)) << PAGE_SHIFT;
> + vaddr = anchor_vaddr + offset;
> +
> + if (anchor > page) {
> + if (vaddr > anchor_vaddr)
> + return 0;
> + } else {
> + if (vaddr < anchor_vaddr)
> + return ULONG_MAX;
> + }
> +
> + return vaddr;
> +}
> +
> +static int folio_nr_pages_cont_mapped(struct folio *folio,
> + struct page *page, pte_t *pte,
> + unsigned long addr, unsigned long end)
> +{
> + pte_t ptent;
> + int floops;
> + int i;
> + unsigned long pfn;
> + struct page *folio_end;
> +
> + if (!folio_test_large(folio))
> + return 1;
> +
> + folio_end = &folio->page + folio_nr_pages(folio);
> + end = min(page_cont_mapped_vaddr(folio_end, page, addr), end);
> + floops = (end - addr) >> PAGE_SHIFT;
> + pfn = page_to_pfn(page);
> + pfn++;
> + pte++;
> +
> + for (i = 1; i < floops; i++) {
> + ptent = ptep_get(pte);
> +
> + if (!pte_present(ptent) || pte_pfn(ptent) != pfn)
> + break;
> +
> + pfn++;
> + pte++;
> + }
> +
> + return i;
> +}
> +
> +static unsigned long try_zap_anon_pte_range(struct mmu_gather *tlb,
> + struct vm_area_struct *vma,
> + struct folio *folio,
> + struct page *page, pte_t *pte,
> + unsigned long addr, int nr_pages,
> + struct zap_details *details)
> +{
> + struct mm_struct *mm = tlb->mm;
> + pte_t ptent;
> + bool full;
> + int i;
> +
> + for (i = 0; i < nr_pages;) {
> + ptent = ptep_get_and_clear_full(mm, addr, pte, tlb->fullmm);
> + tlb_remove_tlb_entry(tlb, pte, addr);
> + zap_install_uffd_wp_if_needed(vma, addr, pte, details, ptent);
> + full = __tlb_remove_page(tlb, page, 0);
> +
> + if (unlikely(page_mapcount(page) < 1))
> + print_bad_pte(vma, addr, ptent, page);
> +
> + i++;
> + page++;
> + pte++;
> + addr += PAGE_SIZE;
> +
> + if (unlikely(full))
> + break;
> + }
> +
> + folio_remove_rmap_range(folio, page - i, i, vma);
> +
> + return i;
> +}
> +
> static unsigned long zap_pte_range(struct mmu_gather *tlb,
> struct vm_area_struct *vma, pmd_t *pmd,
> unsigned long addr, unsigned long end,
> @@ -1428,6 +1516,38 @@ static unsigned long zap_pte_range(struct mmu_gather *tlb,
> page = vm_normal_page(vma, addr, ptent);
> if (unlikely(!should_zap_page(details, page)))
> continue;
> +
> + /*
> + * Batch zap large anonymous folio mappings. This allows
> + * batching the rmap removal, which means we avoid
> + * spuriously adding a partially unmapped folio to the
> + * deferred split queue in the common case, which
> + * reduces split queue lock contention.
> + */
> + if (page && PageAnon(page)) {
> + struct folio *folio = page_folio(page);
> + int nr_pages_req, nr_pages;
> +
> + nr_pages_req = folio_nr_pages_cont_mapped(
> + folio, page, pte, addr, end);
> +
> + nr_pages = try_zap_anon_pte_range(tlb, vma,
> + folio, page, pte, addr,
> + nr_pages_req, details);
> +
> + rss[mm_counter(page)] -= nr_pages;
> + nr_pages--;
> + pte += nr_pages;
> + addr += nr_pages << PAGE_SHIFT;
> +
> + if (unlikely(nr_pages < nr_pages_req)) {
> + force_flush = 1;
> + addr += PAGE_SIZE;
> + break;
> + }
> + continue;
> + }
> +
> ptent = ptep_get_and_clear_full(mm, addr, pte,
> tlb->fullmm);
> tlb_remove_tlb_entry(tlb, pte, addr);
> --
> 2.25.1
>

After this change in -next as commit 904d9713b3b0 ("mm: batch-zap large
anonymous folio PTE mappings"), I see the following splats several times
when booting Debian's s390x configuration (which I have mirrored at [1])
in QEMU (bisect log below):

$ qemu-system-s390x \
-display none \
-nodefaults \
-M s390-ccw-virtio \
-kernel arch/s390/boot/bzImage \
-initrd rootfs.cpio \
-m 512m \
-serial mon:stdio
KASLR disabled: CPU has no PRNG
KASLR disabled: CPU has no PRNG
[ 2.502282] Linux version 6.5.0-rc3+ (nathan@dev-arch.thelio-3990X) (s390-linux-gcc (GCC) 13.1.0, GNU ld (GNU Binutils) 2.40) #1 SMP Wed Jul 26 09:14:20 MST 2023
...
[ 3.406011] Freeing initrd memory: 7004K
[ 3.492739] BUG: Bad page state in process modprobe pfn:01b18
[ 3.492909] page:00000000233d9f2f refcount:0 mapcount:1 mapping:0000000000000000 index:0xdb pfn:0x1b18
[ 3.492998] flags: 0xa0004(uptodate|mappedtodisk|swapbacked|zone=0)
[ 3.493195] page_type: 0x0()
[ 3.493457] raw: 00000000000a0004 0000000000000100 0000000000000122 0000000000000000
[ 3.493492] raw: 00000000000000db 0000000000000000 0000000000000000 0000000000000000
[ 3.493525] page dumped because: nonzero mapcount
[ 3.493549] Modules linked in:
[ 3.493719] CPU: 0 PID: 38 Comm: modprobe Not tainted 6.5.0-rc3+ #1
[ 3.493814] Hardware name: QEMU 8561 QEMU (KVM/Linux)
[ 3.493892] Call Trace:
[ 3.494117] [<0000000000add35a>] dump_stack_lvl+0x62/0x88
[ 3.494333] [<00000000003d565a>] bad_page+0x8a/0x130
[ 3.494355] [<00000000003d6728>] free_unref_page_prepare+0x268/0x3d8
[ 3.494375] [<00000000003d9408>] free_unref_page+0x48/0x140
[ 3.494394] [<00000000003ad99c>] unmap_page_range+0x924/0x1388
[ 3.494412] [<00000000003ae54c>] unmap_vmas+0x14c/0x200
[ 3.494429] [<00000000003be2f2>] exit_mmap+0xba/0x3a0
[ 3.494447] [<0000000000147000>] __mmput+0x50/0x180
[ 3.494466] [<0000000000152a00>] do_exit+0x320/0xb40
[ 3.494484] [<0000000000153450>] do_group_exit+0x40/0xb8
[ 3.494502] [<00000000001534f6>] __s390x_sys_exit_group+0x2e/0x30
[ 3.494520] [<0000000000b05080>] __do_syscall+0x1e8/0x210
[ 3.494539] [<0000000000b15970>] system_call+0x70/0x98
[ 3.494663] Disabling lock debugging due to kernel taint
[ 3.494809] BUG: Bad page map in process modprobe pte:01b1831f pmd:1fff9000
[ 3.494833] page:00000000233d9f2f refcount:0 mapcount:0 mapping:0000000000000000 index:0xdb pfn:0x1b18
[ 3.494852] flags: 0xa0004(uptodate|mappedtodisk|swapbacked|zone=0)
[ 3.494866] page_type: 0xffffffff()
[ 3.494882] raw: 00000000000a0004 0000000000000100 0000000000000122 0000000000000000
[ 3.494898] raw: 00000000000000db 0000000000000000 ffffffff00000000 0000000000000000
[ 3.494908] page dumped because: bad pte
[ 3.494923] addr:000002aa1d75c000 vm_flags:08100071 anon_vma:000000001fffc340 mapping:000000000286d6b8 index:db
[ 3.494983] file:busybox fault:shmem_fault mmap:shmem_mmap read_folio:0x0
[ 3.495247] CPU: 0 PID: 38 Comm: modprobe Tainted: G B 6.5.0-rc3+ #1
[ 3.495267] Hardware name: QEMU 8561 QEMU (KVM/Linux)
[ 3.495277] Call Trace:
[ 3.495285] [<0000000000add35a>] dump_stack_lvl+0x62/0x88
[ 3.495307] [<00000000003ab30e>] print_bad_pte+0x176/0x2c8
[ 3.495324] [<00000000003ae098>] unmap_page_range+0x1020/0x1388
[ 3.495341] [<00000000003ae54c>] unmap_vmas+0x14c/0x200
[ 3.495357] [<00000000003be2f2>] exit_mmap+0xba/0x3a0
[ 3.495375] [<0000000000147000>] __mmput+0x50/0x180
[ 3.495394] [<0000000000152a00>] do_exit+0x320/0xb40
[ 3.495411] [<0000000000153450>] do_group_exit+0x40/0xb8
[ 3.495429] [<00000000001534f6>] __s390x_sys_exit_group+0x2e/0x30
[ 3.495447] [<0000000000b05080>] __do_syscall+0x1e8/0x210
[ 3.495465] [<0000000000b15970>] system_call+0x70/0x98
...

The rootfs is available at [2] if it is relevant. I am happy to provide
any additional information or test patches as necessary.

Cheers,
Nathan

[1]: https://github.com/nathanchance/llvm-kernel-testing/blob/79aa4ab2edc595979366c427cb49f477c7f31c68/configs/debian/s390x.config
[2]: https://github.com/ClangBuiltLinux/boot-utils/releases/download/20230707-182910/s390-rootfs.cpio.zst

# bad: [0ba5d07205771c50789fd9063950aa75e7f1183f] Add linux-next specific files for 20230726
# good: [18b44bc5a67275641fb26f2c54ba7eef80ac5950] ovl: Always reevaluate the file signature for IMA
git bisect start '0ba5d07205771c50789fd9063950aa75e7f1183f' '18b44bc5a67275641fb26f2c54ba7eef80ac5950'
# bad: [8fe1b33ece8f8fe1377082e839817886cb8c0f81] Merge branch 'main' of git://git.kernel.org/pub/scm/linux/kernel/git/netdev/net-next.git
git bisect bad 8fe1b33ece8f8fe1377082e839817886cb8c0f81
# bad: [932bd67958459da3dc755b5bea7758e9d951dee5] Merge branch 'ti-next' of git://git.kernel.org/pub/scm/linux/kernel/git/ti/linux.git
git bisect bad 932bd67958459da3dc755b5bea7758e9d951dee5
# bad: [a4abec0a3653fb9dfb3ea6cea2ad1d36f507ca97] Merge branch 'perf-tools-next' of git://git.kernel.org/pub/scm/linux/kernel/git/perf/perf-tools-next.git
git bisect bad a4abec0a3653fb9dfb3ea6cea2ad1d36f507ca97
# bad: [5a52022bde252d090e051077af297dcfeff9fd0d] powerpc/book3s64/radix: add debug message to give more details of vmemmap allocation
git bisect bad 5a52022bde252d090e051077af297dcfeff9fd0d
# good: [671115657ee2403d18cb849061d7245687d9fdc5] mm/pgtable: notes on pte_offset_map[_lock]()
git bisect good 671115657ee2403d18cb849061d7245687d9fdc5
# good: [26c3a4fe0eb027ff00ad42168c8732db0c0b40d7] arm64/smmu: use TLBI ASID when invalidating entire range
git bisect good 26c3a4fe0eb027ff00ad42168c8732db0c0b40d7
# bad: [8585d0b53780f11cad8dad37997369949e3d5043] mm: memcg: use rstat for non-hierarchical stats
git bisect bad 8585d0b53780f11cad8dad37997369949e3d5043
# bad: [9abfe35eb187c3f79af5bb07c2f9815a480c4965] mm/compaction: correct comment of candidate pfn in fast_isolate_freepages
git bisect bad 9abfe35eb187c3f79af5bb07c2f9815a480c4965
# bad: [208f64c37a4e22b25b8037776c5713545eaf54fa] selftests: line buffer test program's stdout
git bisect bad 208f64c37a4e22b25b8037776c5713545eaf54fa
# good: [08356142587c28b86817646ff318317b5237fdeb] mmu_notifiers: rename invalidate_range notifier
git bisect good 08356142587c28b86817646ff318317b5237fdeb
# good: [652555287069f2c0bbbfaf262eb41638f5c87550] mm: allow deferred splitting of arbitrary large anon folios
git bisect good 652555287069f2c0bbbfaf262eb41638f5c87550
# bad: [904d9713b3b0e64329b2f6d159966b5c737444ff] mm: batch-zap large anonymous folio PTE mappings
git bisect bad 904d9713b3b0e64329b2f6d159966b5c737444ff
# good: [9a7c14665a566fbc1adc2c35982898abc1546525] mm: implement folio_remove_rmap_range()
git bisect good 9a7c14665a566fbc1adc2c35982898abc1546525
# first bad commit: [904d9713b3b0e64329b2f6d159966b5c737444ff] mm: batch-zap large anonymous folio PTE mappings