Re: [PATCH] mm/khugepaged: skip shmem with armed userfaultfd

From: Yang Shi
Date: Wed Feb 01 2023 - 12:37:02 EST


On Tue, Jan 31, 2023 at 7:42 PM David Stevens <stevensd@xxxxxxxxxxxx> wrote:
>
> From: David Stevens <stevensd@xxxxxxxxxxxx>
>
> Collapsing memory in a vma that has an armed userfaultfd results in
> zero-filling any missing pages, which breaks user-space paging for those
> filled pages. Avoid khugepage bypassing userfaultfd by not collapsing
> pages in shmem reached via scanning a vma with an armed userfaultfd if
> doing so would zero-fill any pages.
>
> Signed-off-by: David Stevens <stevensd@xxxxxxxxxxxx>
> ---
> mm/khugepaged.c | 35 ++++++++++++++++++++++++-----------
> 1 file changed, 24 insertions(+), 11 deletions(-)
>
> diff --git a/mm/khugepaged.c b/mm/khugepaged.c
> index 79be13133322..48e944fb8972 100644
> --- a/mm/khugepaged.c
> +++ b/mm/khugepaged.c
> @@ -1736,8 +1736,8 @@ static int retract_page_tables(struct address_space *mapping, pgoff_t pgoff,
> * + restore gaps in the page cache;
> * + unlock and free huge page;
> */
> -static int collapse_file(struct mm_struct *mm, unsigned long addr,
> - struct file *file, pgoff_t start,
> +static int collapse_file(struct mm_struct *mm, struct vm_area_struct *vma,
> + unsigned long addr, struct file *file, pgoff_t start,
> struct collapse_control *cc)
> {
> struct address_space *mapping = file->f_mapping;
> @@ -1784,6 +1784,9 @@ static int collapse_file(struct mm_struct *mm, unsigned long addr,
> * be able to map it or use it in another way until we unlock it.
> */
>
> + if (is_shmem)
> + mmap_read_lock(mm);

If you release mmap_lock before then reacquire it here, the vma is not
trusted anymore. It is not safe to use the vma anymore.

Since you already read uffd_was_armed before releasing mmap_lock, so
you could pass it directly to collapse_file w/o dereferencing vma
again. The problem may be false positive (not userfaultfd armed
anymore), but it should be fine. Khugepaged could collapse this area
in the next round.

Also +userfaultfd folks.

> +
> xas_set(&xas, start);
> for (index = start; index < end; index++) {
> struct page *page = xas_next(&xas);
> @@ -1792,6 +1795,10 @@ static int collapse_file(struct mm_struct *mm, unsigned long addr,
> VM_BUG_ON(index != xas.xa_index);
> if (is_shmem) {
> if (!page) {
> + if (userfaultfd_armed(vma)) {
> + result = SCAN_EXCEED_NONE_PTE;
> + goto xa_locked;
> + }
> /*
> * Stop if extent has been truncated or
> * hole-punched, and is now completely
> @@ -2095,6 +2102,8 @@ static int collapse_file(struct mm_struct *mm, unsigned long addr,
> hpage->mapping = NULL;
> }
>
> + if (is_shmem)
> + mmap_read_unlock(mm);
> if (hpage)
> unlock_page(hpage);
> out:
> @@ -2108,8 +2117,8 @@ static int collapse_file(struct mm_struct *mm, unsigned long addr,
> return result;
> }
>
> -static int hpage_collapse_scan_file(struct mm_struct *mm, unsigned long addr,
> - struct file *file, pgoff_t start,
> +static int hpage_collapse_scan_file(struct mm_struct *mm, struct vm_area_struct *vma,
> + unsigned long addr, struct file *file, pgoff_t start,
> struct collapse_control *cc)
> {
> struct page *page = NULL;
> @@ -2118,6 +2127,9 @@ static int hpage_collapse_scan_file(struct mm_struct *mm, unsigned long addr,
> int present, swap;
> int node = NUMA_NO_NODE;
> int result = SCAN_SUCCEED;
> + bool uffd_was_armed = userfaultfd_armed(vma);
> +
> + mmap_read_unlock(mm);
>
> present = 0;
> swap = 0;
> @@ -2193,13 +2205,16 @@ static int hpage_collapse_scan_file(struct mm_struct *mm, unsigned long addr,
> }
> rcu_read_unlock();
>
> + if (uffd_was_armed && present < HPAGE_PMD_NR)
> + result = SCAN_EXCEED_SWAP_PTE;
> +
> if (result == SCAN_SUCCEED) {
> if (cc->is_khugepaged &&
> present < HPAGE_PMD_NR - khugepaged_max_ptes_none) {
> result = SCAN_EXCEED_NONE_PTE;
> count_vm_event(THP_SCAN_EXCEED_NONE_PTE);
> } else {
> - result = collapse_file(mm, addr, file, start, cc);
> + result = collapse_file(mm, vma, addr, file, start, cc);
> }
> }
>
> @@ -2207,8 +2222,8 @@ static int hpage_collapse_scan_file(struct mm_struct *mm, unsigned long addr,
> return result;
> }
> #else
> -static int hpage_collapse_scan_file(struct mm_struct *mm, unsigned long addr,
> - struct file *file, pgoff_t start,
> +static int hpage_collapse_scan_file(struct mm_struct *mm, struct vm_area_struct *vma,
> + unsigned long addr, struct file *file, pgoff_t start,
> struct collapse_control *cc)
> {
> BUILD_BUG();
> @@ -2304,8 +2319,7 @@ static unsigned int khugepaged_scan_mm_slot(unsigned int pages, int *result,
> pgoff_t pgoff = linear_page_index(vma,
> khugepaged_scan.address);
>
> - mmap_read_unlock(mm);
> - *result = hpage_collapse_scan_file(mm,
> + *result = hpage_collapse_scan_file(mm, vma,
> khugepaged_scan.address,
> file, pgoff, cc);
> mmap_locked = false;
> @@ -2656,9 +2670,8 @@ int madvise_collapse(struct vm_area_struct *vma, struct vm_area_struct **prev,
> struct file *file = get_file(vma->vm_file);
> pgoff_t pgoff = linear_page_index(vma, addr);
>
> - mmap_read_unlock(mm);
> mmap_locked = false;
> - result = hpage_collapse_scan_file(mm, addr, file, pgoff,
> + result = hpage_collapse_scan_file(mm, vma, addr, file, pgoff,
> cc);
> fput(file);
> } else {
> --
> 2.39.1.456.gfc5497dd1b-goog
>
>