Re: [PATCH v8 1/2] hugetlb: don't delete vma_lock in hugetlb MADV_DONTNEED processing

From: Peter Xu
Date: Thu Nov 10 2022 - 15:57:58 EST


Hi, Mike,

Sorry to be late; another bug took me quite some time.

On Mon, Nov 07, 2022 at 05:19:09PM -0800, Mike Kravetz wrote:
> madvise(MADV_DONTNEED) ends up calling zap_page_range() to clear page
> tables associated with the address range. For hugetlb vmas,
> zap_page_range will call __unmap_hugepage_range_final. However,
> __unmap_hugepage_range_final assumes the passed vma is about to be removed
> and deletes the vma_lock to prevent pmd sharing as the vma is on the way
> out. In the case of madvise(MADV_DONTNEED) the vma remains, but the
> missing vma_lock prevents pmd sharing and could potentially lead to issues
> with truncation/fault races.
>
> This issue was originally reported here [1] as a BUG triggered in
> page_try_dup_anon_rmap. Prior to the introduction of the hugetlb
> vma_lock, __unmap_hugepage_range_final cleared the VM_MAYSHARE flag to
> prevent pmd sharing. Subsequent faults on this vma were confused as
> VM_MAYSHARE indicates a sharable vma, but was not set so page_mapping was
> not set in new pages added to the page table. This resulted in pages that
> appeared anonymous in a VM_SHARED vma and triggered the BUG.
>
> Address issue by:
> - Add a new zap flag ZAP_FLAG_UNMAP to indicate an unmap call from
> unmap_vmas(). This is used to indicate the 'final' unmapping of a vma.
> When called via MADV_DONTNEED, this flag is not set and the vm_lock is
> not deleted.
> - mmu notification is removed from __unmap_hugepage_range to avoid
> duplication, and notification is added to the other calling routine
> (unmap_hugepage_range).
> - notification range in zap_page_range_single is updated to take into
> account the possibility of hugetlb pmd sharing.
> - zap_page_range_single renamed to __zap_page_range_single to be used
> internally within mm/memory.c
> - zap_vma_range() interface created to zap a range within a single vma.
> - madvise_dontneed_single_vma is updated to call zap_vma_range instead of
> zap_page_range as it only operates on a range within a single vma
>
> [1] https://lore.kernel.org/lkml/CAO4mrfdLMXsao9RF4fUE8-Wfde8xmjsKrTNMNC9wjUb6JudD0g@xxxxxxxxxxxxxx/
> Fixes: 90e7e7f5ef3f ("mm: enable MADV_DONTNEED for hugetlb mappings")
> Signed-off-by: Mike Kravetz <mike.kravetz@xxxxxxxxxx>
> Reported-by: Wei Chen <harperchen1110@xxxxxxxxx>
> Cc: <stable@xxxxxxxxxxxxxxx>
> ---
> include/linux/mm.h | 5 +++++
> mm/hugetlb.c | 45 +++++++++++++++++++++++++++------------------
> mm/madvise.c | 4 ++--
> mm/memory.c | 17 +++++++++++++----
> 4 files changed, 47 insertions(+), 24 deletions(-)
>
> diff --git a/include/linux/mm.h b/include/linux/mm.h
> index 978c17df053e..d205bcd9cd2e 100644
> --- a/include/linux/mm.h
> +++ b/include/linux/mm.h
> @@ -1840,6 +1840,8 @@ void zap_vma_ptes(struct vm_area_struct *vma, unsigned long address,
> unsigned long size);
> void zap_page_range(struct vm_area_struct *vma, unsigned long address,
> unsigned long size);
> +void zap_vma_range(struct vm_area_struct *vma, unsigned long address,
> + unsigned long size);
> void unmap_vmas(struct mmu_gather *tlb, struct maple_tree *mt,
> struct vm_area_struct *start_vma, unsigned long start,
> unsigned long end);
> @@ -3464,4 +3466,7 @@ madvise_set_anon_name(struct mm_struct *mm, unsigned long start,
> */
> #define ZAP_FLAG_DROP_MARKER ((__force zap_flags_t) BIT(0))
>
> +/* Set in unmap_vmas() to indicate an unmap call. Only used by hugetlb */
> +#define ZAP_FLAG_UNMAP ((__force zap_flags_t) BIT(1))

It seems this is not set anywhere in the patch?
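
From the commit message I'd assume the intention is for unmap_vmas() to
set it via zap_details, e.g. something like below (completely untested,
just to show what I mean; today unmap_vmas() passes NULL details down):

	struct zap_details details = {
		.zap_flags = ZAP_FLAG_UNMAP,
	};

	do {
		/* let hugetlb see that this is the final unmap of the vma */
		unmap_single_vma(tlb, vma, start_addr, end_addr, &details);
	} while ((vma = mas_find(&mas, end_addr - 1)) != NULL);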

> +
> #endif /* _LINUX_MM_H */
> diff --git a/mm/hugetlb.c b/mm/hugetlb.c
> index ceb47c4e183a..7c8fbce4441e 100644
> --- a/mm/hugetlb.c
> +++ b/mm/hugetlb.c
> @@ -5072,7 +5072,6 @@ static void __unmap_hugepage_range(struct mmu_gather *tlb, struct vm_area_struct
> struct page *page;
> struct hstate *h = hstate_vma(vma);
> unsigned long sz = huge_page_size(h);
> - struct mmu_notifier_range range;
> unsigned long last_addr_mask;
> bool force_flush = false;
>
> @@ -5087,13 +5086,6 @@ static void __unmap_hugepage_range(struct mmu_gather *tlb, struct vm_area_struct
> tlb_change_page_size(tlb, sz);
> tlb_start_vma(tlb, vma);
>
> - /*
> - * If sharing possible, alert mmu notifiers of worst case.
> - */
> - mmu_notifier_range_init(&range, MMU_NOTIFY_UNMAP, 0, vma, mm, start,
> - end);
> - adjust_range_if_pmd_sharing_possible(vma, &range.start, &range.end);
> - mmu_notifier_invalidate_range_start(&range);
> last_addr_mask = hugetlb_mask_last_page(h);
> address = start;
> for (; address < end; address += sz) {
> @@ -5178,7 +5170,6 @@ static void __unmap_hugepage_range(struct mmu_gather *tlb, struct vm_area_struct
> if (ref_page)
> break;
> }
> - mmu_notifier_invalidate_range_end(&range);
> tlb_end_vma(tlb, vma);
>
> /*
> @@ -5203,32 +5194,50 @@ void __unmap_hugepage_range_final(struct mmu_gather *tlb,
> unsigned long end, struct page *ref_page,
> zap_flags_t zap_flags)
> {
> + bool final = zap_flags & ZAP_FLAG_UNMAP;
> +
> hugetlb_vma_lock_write(vma);
> i_mmap_lock_write(vma->vm_file->f_mapping);
>
> __unmap_hugepage_range(tlb, vma, start, end, ref_page, zap_flags);
>
> /*
> - * Unlock and free the vma lock before releasing i_mmap_rwsem. When
> - * the vma_lock is freed, this makes the vma ineligible for pmd
> - * sharing. And, i_mmap_rwsem is required to set up pmd sharing.
> - * This is important as page tables for this unmapped range will
> - * be asynchrously deleted. If the page tables are shared, there
> - * will be issues when accessed by someone else.
> + * When called via zap_vma_range (MADV_DONTNEED), this is not the
> + * final unmap of the vma, and we do not want to delete the vma_lock.
> */
> - __hugetlb_vma_unlock_write_free(vma);
> -
> - i_mmap_unlock_write(vma->vm_file->f_mapping);
> + if (final) {
> + /*
> + * Unlock and free the vma lock before releasing i_mmap_rwsem.
> + * When the vma_lock is freed, this makes the vma ineligible
> + * for pmd sharing. And, i_mmap_rwsem is required to set up
> + * pmd sharing. This is important as page tables for this
> + * unmapped range will be asynchrously deleted. If the page
> + * tables are shared, there will be issues when accessed by
> + * someone else.
> + */
> + __hugetlb_vma_unlock_write_free(vma);
> + i_mmap_unlock_write(vma->vm_file->f_mapping);
> + } else {
> + i_mmap_unlock_write(vma->vm_file->f_mapping);
> + hugetlb_vma_unlock_write(vma);
> + }
> }
>
> void unmap_hugepage_range(struct vm_area_struct *vma, unsigned long start,
> unsigned long end, struct page *ref_page,
> zap_flags_t zap_flags)
> {
> + struct mmu_notifier_range range;
> struct mmu_gather tlb;
>
> + mmu_notifier_range_init(&range, MMU_NOTIFY_UNMAP, 0, vma, vma->vm_mm,

Should this be s/UNMAP/CLEAR/? IIUC the unmap path only happens in
__unmap_hugepage_range_final().
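
I.e., if I'm reading it right, something like (only to illustrate the
suggestion):

	mmu_notifier_range_init(&range, MMU_NOTIFY_CLEAR, 0, vma, vma->vm_mm,
				start, end);

MMU_NOTIFY_CLEAR seems to better match "ptes are cleared but the vma
stays", while MMU_NOTIFY_UNMAP is meant for the range/vma going away.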

> + start, end);
> + adjust_range_if_pmd_sharing_possible(vma, &range.start, &range.end);
> tlb_gather_mmu(&tlb, vma->vm_mm);
> + mmu_notifier_invalidate_range_start(&range);
> +
> __unmap_hugepage_range(&tlb, vma, start, end, ref_page, zap_flags);
> +
> + mmu_notifier_invalidate_range_end(&range);
> tlb_finish_mmu(&tlb);
> }
>
> diff --git a/mm/madvise.c b/mm/madvise.c
> index c7105ec6d08c..9d2625b8029a 100644
> --- a/mm/madvise.c
> +++ b/mm/madvise.c
> @@ -772,7 +772,7 @@ static int madvise_free_single_vma(struct vm_area_struct *vma,
> * Application no longer needs these pages. If the pages are dirty,
> * it's OK to just throw them away. The app will be more careful about
> * data it wants to keep. Be sure to free swap resources too. The
> - * zap_page_range call sets things up for shrink_active_list to actually free
> + * zap_vma_range call sets things up for shrink_active_list to actually free
> * these pages later if no one else has touched them in the meantime,
> * although we could add these pages to a global reuse list for
> * shrink_active_list to pick up before reclaiming other pages.
> @@ -790,7 +790,7 @@ static int madvise_free_single_vma(struct vm_area_struct *vma,
> static long madvise_dontneed_single_vma(struct vm_area_struct *vma,
> unsigned long start, unsigned long end)
> {
> - zap_page_range(vma, start, end - start);
> + zap_vma_range(vma, start, end - start);

I'd rather just call zap_page_range_single() directly with a NULL details
passed in, considering that this is for stable, but no strong opinion.
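
I.e. (sketch only, untested; zap_page_range_single() would need to be made
non-static and declared somewhere like mm/internal.h for this to build):

	static long madvise_dontneed_single_vma(struct vm_area_struct *vma,
					unsigned long start, unsigned long end)
	{
		/* NULL details: zap everything in the range of this vma */
		zap_page_range_single(vma, start, end - start, NULL);
		return 0;
	}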

> return 0;
> }
>
> diff --git a/mm/memory.c b/mm/memory.c
> index 6090124b64f1..af3a4724b464 100644
> --- a/mm/memory.c
> +++ b/mm/memory.c
> @@ -1717,7 +1717,7 @@ void zap_page_range(struct vm_area_struct *vma, unsigned long start,
> }
>
> /**
> - * zap_page_range_single - remove user pages in a given range
> + * __zap_page_range_single - remove user pages in a given range

Same nitpick here: I'd rather keep the name, at least for this patch. But
again, no strong opinion.

> * @vma: vm_area_struct holding the applicable pages
> * @address: starting address of pages to zap
> * @size: number of bytes to zap
> @@ -1725,7 +1725,7 @@ void zap_page_range(struct vm_area_struct *vma, unsigned long start,
> *
> * The range must fit into one VMA.
> */
> -static void zap_page_range_single(struct vm_area_struct *vma, unsigned long address,
> +static void __zap_page_range_single(struct vm_area_struct *vma, unsigned long address,
> unsigned long size, struct zap_details *details)
> {
> struct mmu_notifier_range range;
> @@ -1734,6 +1734,9 @@ static void zap_page_range_single(struct vm_area_struct *vma, unsigned long addr
> lru_add_drain();
> mmu_notifier_range_init(&range, MMU_NOTIFY_CLEAR, 0, vma, vma->vm_mm,
> address, address + size);
> + if (is_vm_hugetlb_page(vma))
> + adjust_range_if_pmd_sharing_possible(vma, &range.start,
> + &range.end);
> tlb_gather_mmu(&tlb, vma->vm_mm);
> update_hiwater_rss(vma->vm_mm);
> mmu_notifier_invalidate_range_start(&range);
> @@ -1742,6 +1745,12 @@ static void zap_page_range_single(struct vm_area_struct *vma, unsigned long addr
> tlb_finish_mmu(&tlb);
> }
>
> +void zap_vma_range(struct vm_area_struct *vma, unsigned long address,
> + unsigned long size)
> +{
> + __zap_page_range_single(vma, address, size, NULL);
> +}
> +
> /**
> * zap_vma_ptes - remove ptes mapping the vma
> * @vma: vm_area_struct holding ptes to be zapped
> @@ -1760,7 +1769,7 @@ void zap_vma_ptes(struct vm_area_struct *vma, unsigned long address,
> !(vma->vm_flags & VM_PFNMAP))
> return;
>
> - zap_page_range_single(vma, address, size, NULL);
> + __zap_page_range_single(vma, address, size, NULL);
> }
> EXPORT_SYMBOL_GPL(zap_vma_ptes);
>
> @@ -3453,7 +3462,7 @@ static void unmap_mapping_range_vma(struct vm_area_struct *vma,
> unsigned long start_addr, unsigned long end_addr,
> struct zap_details *details)
> {
> - zap_page_range_single(vma, start_addr, end_addr - start_addr, details);
> + __zap_page_range_single(vma, start_addr, end_addr - start_addr, details);
> }
>
> static inline void unmap_mapping_range_tree(struct rb_root_cached *root,
> --
> 2.37.3
>
>

--
Peter Xu