Re: [PATCH v3 9/9] fork: Use __mt_dup() to duplicate maple tree in dup_mmap()

From: Liam R. Howlett
Date: Tue Oct 03 2023 - 14:47:30 EST


* Peng Zhang <zhangpeng.00@xxxxxxxxxxxxx> [230924 23:58]:
> In dup_mmap(), using __mt_dup() to duplicate the old maple tree and then
> directly replacing the entries of VMAs in the new maple tree can result
> in better performance. __mt_dup() uses DFS pre-order to duplicate the
> maple tree, so it is very efficient. The average time complexity of
> duplicating VMAs is reduced from O(n * log(n)) to O(n). The optimization
> effect is proportional to the number of VMAs.

I am not confident in the big O calculations here. Although the addition
of the tree is reduced, adding a VMA still needs to create the nodes
above it - which are a function of n. How did you get O(n * log(n)) for
the existing fork?

I would think your new algorithm is n * log(n/16), while the
previous was n * log(n/16) * f(n). Where f(n) would be something
to do with the decision to split/rebalance in bulk insert mode.

It's certainly a better algorithm to duplicate trees, but I don't think
it is O(n). Can you please explain?

>
> As the entire maple tree is duplicated using __mt_dup(), if dup_mmap()
> fails, there will be a portion of VMAs that have not been duplicated in
> the maple tree. This makes it impossible to unmap all VMAs in exit_mmap().
> To solve this problem, undo_dup_mmap() is introduced to handle the failure
> of dup_mmap(). I have carefully tested the failure path and so far it
> seems there are no issues.
>
> There is a "spawn" in byte-unixbench[1], which can be used to test the
> performance of fork(). I modified it slightly to make it work with
> different number of VMAs.
>
> Below are the test results. By default, there are 21 VMAs. The first row
> shows the number of additional VMAs added on top of the default. The last
> two rows show the number of fork() calls per ten seconds. The test results
> were obtained with CPU binding to avoid scheduler load balancing that
> could cause unstable results. There are still some fluctuations in the
> test results, but at least they are better than the original performance.
>
> Increment of VMAs: 0 100 200 400 800 1600 3200 6400
> next-20230921: 112326 75469 54529 34619 20750 11355 6115 3183
> Apply this: 116505 85971 67121 46080 29722 16665 9050 4805
> +3.72% +13.92% +23.09% +33.11% +43.24% +46.76% +48.00% +50.96%
>
> [1] https://github.com/kdlucas/byte-unixbench/tree/master
>
> Signed-off-by: Peng Zhang <zhangpeng.00@xxxxxxxxxxxxx>
> ---
> include/linux/mm.h | 1 +
> kernel/fork.c | 34 ++++++++++++++++++++----------
> mm/internal.h | 3 ++-
> mm/memory.c | 7 ++++---
> mm/mmap.c | 52 ++++++++++++++++++++++++++++++++++++++++++++--
> 5 files changed, 80 insertions(+), 17 deletions(-)
>
> diff --git a/include/linux/mm.h b/include/linux/mm.h
> index 1f1d0d6b8f20..10c59dc7ffaa 100644
> --- a/include/linux/mm.h
> +++ b/include/linux/mm.h
> @@ -3242,6 +3242,7 @@ extern void unlink_file_vma(struct vm_area_struct *);
> extern struct vm_area_struct *copy_vma(struct vm_area_struct **,
> unsigned long addr, unsigned long len, pgoff_t pgoff,
> bool *need_rmap_locks);
> +extern void undo_dup_mmap(struct mm_struct *mm, struct vm_area_struct *vma_end);
> extern void exit_mmap(struct mm_struct *);
>
> static inline int check_data_rlimit(unsigned long rlim,
> diff --git a/kernel/fork.c b/kernel/fork.c
> index 7ae36c2e7290..2f3d83e89fe6 100644
> --- a/kernel/fork.c
> +++ b/kernel/fork.c
> @@ -650,7 +650,6 @@ static __latent_entropy int dup_mmap(struct mm_struct *mm,
> int retval;
> unsigned long charge = 0;
> LIST_HEAD(uf);
> - VMA_ITERATOR(old_vmi, oldmm, 0);
> VMA_ITERATOR(vmi, mm, 0);
>
> uprobe_start_dup_mmap();
> @@ -678,16 +677,25 @@ static __latent_entropy int dup_mmap(struct mm_struct *mm,
> goto out;
> khugepaged_fork(mm, oldmm);
>
> - retval = vma_iter_bulk_alloc(&vmi, oldmm->map_count);
> - if (retval)
> + /* Use __mt_dup() to efficiently build an identical maple tree. */
> + retval = __mt_dup(&oldmm->mm_mt, &mm->mm_mt, GFP_KERNEL);
> + if (unlikely(retval))
> goto out;
>
> mt_clear_in_rcu(vmi.mas.tree);
> - for_each_vma(old_vmi, mpnt) {
> + for_each_vma(vmi, mpnt) {
> struct file *file;
>
> vma_start_write(mpnt);
> if (mpnt->vm_flags & VM_DONTCOPY) {
> + mas_store_gfp(&vmi.mas, NULL, GFP_KERNEL);
> +
> + /* If failed, undo all completed duplications. */
> + if (unlikely(mas_is_err(&vmi.mas))) {
> + retval = xa_err(vmi.mas.node);
> + goto loop_out;
> + }
> +
> vm_stat_account(mm, mpnt->vm_flags, -vma_pages(mpnt));
> continue;
> }
> @@ -749,9 +757,11 @@ static __latent_entropy int dup_mmap(struct mm_struct *mm,
> if (is_vm_hugetlb_page(tmp))
> hugetlb_dup_vma_private(tmp);
>
> - /* Link the vma into the MT */
> - if (vma_iter_bulk_store(&vmi, tmp))
> - goto fail_nomem_vmi_store;
> + /*
> + * Link the vma into the MT. After using __mt_dup(), memory
> + * allocation is not necessary here, so it cannot fail.
> + */
> + mas_store(&vmi.mas, tmp);
>
> mm->map_count++;
> if (!(tmp->vm_flags & VM_WIPEONFORK))
> @@ -760,15 +770,19 @@ static __latent_entropy int dup_mmap(struct mm_struct *mm,
> if (tmp->vm_ops && tmp->vm_ops->open)
> tmp->vm_ops->open(tmp);
>
> - if (retval)
> + if (retval) {
> + mpnt = vma_next(&vmi);
> goto loop_out;
> + }
> }
> /* a new mm has just been created */
> retval = arch_dup_mmap(oldmm, mm);
> loop_out:
> vma_iter_free(&vmi);
> - if (!retval)
> + if (likely(!retval))
> mt_set_in_rcu(vmi.mas.tree);
> + else
> + undo_dup_mmap(mm, mpnt);
> out:
> mmap_write_unlock(mm);
> flush_tlb_mm(oldmm);
> @@ -778,8 +792,6 @@ static __latent_entropy int dup_mmap(struct mm_struct *mm,
> uprobe_end_dup_mmap();
> return retval;
>
> -fail_nomem_vmi_store:
> - unlink_anon_vmas(tmp);
> fail_nomem_anon_vma_fork:
> mpol_put(vma_policy(tmp));
> fail_nomem_policy:
> diff --git a/mm/internal.h b/mm/internal.h
> index 7a961d12b088..288ec81770cb 100644
> --- a/mm/internal.h
> +++ b/mm/internal.h
> @@ -111,7 +111,8 @@ void folio_activate(struct folio *folio);
>
> void free_pgtables(struct mmu_gather *tlb, struct ma_state *mas,
> struct vm_area_struct *start_vma, unsigned long floor,
> - unsigned long ceiling, bool mm_wr_locked);
> + unsigned long ceiling, unsigned long tree_end,
> + bool mm_wr_locked);
> void pmd_install(struct mm_struct *mm, pmd_t *pmd, pgtable_t *pte);
>
> struct zap_details;
> diff --git a/mm/memory.c b/mm/memory.c
> index 983a40f8ee62..1fd66a0d5838 100644
> --- a/mm/memory.c
> +++ b/mm/memory.c
> @@ -362,7 +362,8 @@ void free_pgd_range(struct mmu_gather *tlb,
>
> void free_pgtables(struct mmu_gather *tlb, struct ma_state *mas,
> struct vm_area_struct *vma, unsigned long floor,
> - unsigned long ceiling, bool mm_wr_locked)
> + unsigned long ceiling, unsigned long tree_end,
> + bool mm_wr_locked)
> {
> do {
> unsigned long addr = vma->vm_start;
> @@ -372,7 +373,7 @@ void free_pgtables(struct mmu_gather *tlb, struct ma_state *mas,
> * Note: USER_PGTABLES_CEILING may be passed as ceiling and may
> * be 0. This will underflow and is okay.
> */
> - next = mas_find(mas, ceiling - 1);
> + next = mas_find(mas, tree_end - 1);
>
> /*
> * Hide vma from rmap and truncate_pagecache before freeing
> @@ -393,7 +394,7 @@ void free_pgtables(struct mmu_gather *tlb, struct ma_state *mas,
> while (next && next->vm_start <= vma->vm_end + PMD_SIZE
> && !is_vm_hugetlb_page(next)) {
> vma = next;
> - next = mas_find(mas, ceiling - 1);
> + next = mas_find(mas, tree_end - 1);
> if (mm_wr_locked)
> vma_start_write(vma);
> unlink_anon_vmas(vma);
> diff --git a/mm/mmap.c b/mm/mmap.c
> index 2ad950f773e4..daed3b423124 100644
> --- a/mm/mmap.c
> +++ b/mm/mmap.c
> @@ -2312,7 +2312,7 @@ static void unmap_region(struct mm_struct *mm, struct ma_state *mas,
> mas_set(mas, mt_start);
> free_pgtables(&tlb, mas, vma, prev ? prev->vm_end : FIRST_USER_ADDRESS,
> next ? next->vm_start : USER_PGTABLES_CEILING,
> - mm_wr_locked);
> + tree_end, mm_wr_locked);
> tlb_finish_mmu(&tlb);
> }
>
> @@ -3178,6 +3178,54 @@ int vm_brk(unsigned long addr, unsigned long len)
> }
> EXPORT_SYMBOL(vm_brk);
>
> +void undo_dup_mmap(struct mm_struct *mm, struct vm_area_struct *vma_end)
> +{
> + unsigned long tree_end;
> + VMA_ITERATOR(vmi, mm, 0);
> + struct vm_area_struct *vma;
> + unsigned long nr_accounted = 0;
> + int count = 0;
> +
> + /*
> + * vma_end points to the first VMA that has not been duplicated. We need
> + * to unmap all VMAs before it.
> + * If vma_end is NULL, it means that all VMAs in the maple tree have
> + * been duplicated, so setting tree_end to 0 will overflow to ULONG_MAX
> + * when using it.
> + */
> + if (vma_end) {
> + tree_end = vma_end->vm_start;
> + if (tree_end == 0)
> + goto destroy;
> + } else
> + tree_end = 0;
> +
> + vma = mas_find(&vmi.mas, tree_end - 1);
> +
> + if (vma) {
> + arch_unmap(mm, vma->vm_start, tree_end);
> + unmap_region(mm, &vmi.mas, vma, NULL, NULL, 0, tree_end,
> + tree_end, true);

next is vma_end, as per your comment above. Using next = vma_end allows
you to avoid adding another argument to free_pgtables().

> +
> + mas_set(&vmi.mas, vma->vm_end);
> + do {
> + if (vma->vm_flags & VM_ACCOUNT)
> + nr_accounted += vma_pages(vma);
> + remove_vma(vma, true);
> + count++;
> + cond_resched();
> + vma = mas_find(&vmi.mas, tree_end - 1);
> + } while (vma != NULL);
> +
> + BUG_ON(count != mm->map_count);
> +
> + vm_unacct_memory(nr_accounted);
> + }
> +
> +destroy:
> + __mt_destroy(&mm->mm_mt);
> +}
> +
> /* Release all mmaps. */
> void exit_mmap(struct mm_struct *mm)
> {
> @@ -3217,7 +3265,7 @@ void exit_mmap(struct mm_struct *mm)
> mt_clear_in_rcu(&mm->mm_mt);
> mas_set(&mas, vma->vm_end);
> free_pgtables(&tlb, &mas, vma, FIRST_USER_ADDRESS,
> - USER_PGTABLES_CEILING, true);
> + USER_PGTABLES_CEILING, USER_PGTABLES_CEILING, true);
> tlb_finish_mmu(&tlb);
>
> /*
> --
> 2.20.1
>