Re: [PATCH v25 2/5] fs/proc/task_mmu: Implement IOCTL to get and optionally clear info about PTEs

From: Andrei Vagin
Date: Tue Jul 18 2023 - 12:08:47 EST


On Tue, Jul 18, 2023 at 1:18 AM Muhammad Usama Anjum
<usama.anjum@xxxxxxxxxxxxx> wrote:
>
> On 7/17/23 10:26 PM, Andrei Vagin wrote:
> > On Thu, Jul 13, 2023 at 3:14 AM Muhammad Usama Anjum
> > <usama.anjum@xxxxxxxxxxxxx> wrote:
> >>
> >> This IOCTL, PAGEMAP_SCAN on pagemap file can be used to get and/or clear
> >> the info about page table entries. The following operations are supported
> >> in this ioctl:
> >> - Get the information if the pages have been written-to (PAGE_IS_WRITTEN),
> >> file mapped (PAGE_IS_FILE), present (PAGE_IS_PRESENT), swapped
> >> (PAGE_IS_SWAPPED) or page has pfn zero (PAGE_IS_PFNZERO).
> >> - Find pages which have been written-to and/or write protect the pages
> >> (atomic PM_SCAN_OP_GET + PM_SCAN_OP_WP)
> >>
> >> This IOCTL can be extended to get information about more PTE bits. The
> >> entire address range passed by user [start, end) is scanned until either
> >> the user provided buffer is full or max_pages have been found.
> >>
> >
> > Reviewed-by: Andrei Vagin <avagin@xxxxxxxxx>
> Thank you.
>

<snip>

> >> +#ifdef CONFIG_HUGETLB_PAGE
> >> +static int pagemap_scan_hugetlb_entry(pte_t *ptep, unsigned long hmask,
> >> + unsigned long start, unsigned long end,
> >> + struct mm_walk *walk)
> >> +{
> >> + unsigned long n_pages = (end - start)/PAGE_SIZE;
> >> + struct pagemap_scan_private *p = walk->private;
> >> + struct vm_area_struct *vma = walk->vma;
> >> + bool is_written, interesting = true;
> >> + struct hstate *h = hstate_vma(vma);
> >> + unsigned long bitmap;
> >> + spinlock_t *ptl;
> >> + int ret = 0;
> >> + pte_t ptent;
> >> +
> >> + if (IS_PM_SCAN_WP(p->flags) && n_pages < HPAGE_SIZE/PAGE_SIZE) {
> >> + p->end_addr = start;
> >> + return -EINVAL;
> >> + }
> >> +
> >> + if (n_pages > p->max_pages - p->found_pages)
> >> + n_pages = p->max_pages - p->found_pages;
> >> +
> >> + if (IS_PM_SCAN_WP(p->flags)) {
> >> + i_mmap_lock_write(vma->vm_file->f_mapping);
> >> + ptl = huge_pte_lock(h, vma->vm_mm, ptep);
> >> + }
> >> +
> >> + ptent = huge_ptep_get(ptep);
> >> + is_written = !is_huge_pte_uffd_wp(ptent);
> >> +
> >> + bitmap = PM_SCAN_FLAGS(is_written, pagemap_scan_is_huge_file(ptent),
> >> + pte_present(ptent), is_swap_pte(ptent),
> >> + pte_present(ptent) && is_zero_pfn(pte_pfn(ptent)));
> >> +
> >> + if (IS_PM_SCAN_GET(p->flags))
> >> + interesting = pagemap_scan_is_interesting_page(bitmap, p);
> >> +
> >> + if (interesting) {
> >> + /*
> >> + * Partial hugetlb page clear isn't supported
> >> + */
> >> + if (is_written && IS_PM_SCAN_WP(p->flags) &&
> >> + n_pages < HPAGE_SIZE/PAGE_SIZE) {
> >> + ret = PM_SCAN_END_WALK;
> >> + p->end_addr = start;
> >> + goto unlock_and_return;
> >> + }
> >> +
> >> + if (IS_PM_SCAN_GET(p->flags))
> >> + ret = pagemap_scan_output(bitmap, p, start, n_pages);
> >> +
> >> + if (IS_PM_SCAN_WP(p->flags) && is_written && ret >= 0) {
> >> + make_uffd_wp_huge_pte(vma, start, ptep, ptent);
> >> + flush_hugetlb_tlb_range(vma, start, end);
> >> + }
> >> + }
> >> +
> >> +unlock_and_return:
> >> + if (IS_PM_SCAN_WP(p->flags)) {
> >> + spin_unlock(ptl);
> >> + i_mmap_unlock_write(vma->vm_file->f_mapping);
> >> + }
> >> +
> >> + return ret;
> >> +}
> >> +#else
> >> +#define pagemap_scan_hugetlb_entry NULL
> >> +#endif
> >> +
> >> +static int pagemap_scan_pte_hole(unsigned long addr, unsigned long end,
> >> + int depth, struct mm_walk *walk)
> >> +{
> >> + unsigned long n_pages = (end - addr)/PAGE_SIZE;
> >> + struct pagemap_scan_private *p = walk->private;
> >> + struct vm_area_struct *vma = walk->vma;
> >> + bool interesting = true;
> >> + unsigned long bitmap;
> >> + int ret = 0;
> >> +
> >> + if (!vma)
> >> + return 0;
> >> +
> >> + bitmap = PM_SCAN_FLAGS(false, false, false, false, false);
> >> +
> >> + if (IS_PM_SCAN_GET(p->flags))
> >> + interesting = pagemap_scan_is_interesting_page(bitmap, p);
> >> +
> >> + if (interesting) {
> >> + if (IS_PM_SCAN_GET(p->flags)) {
> >> + if (n_pages > p->max_pages - p->found_pages)
> >> + n_pages = p->max_pages - p->found_pages;
> >> +
> >> + ret = pagemap_scan_output(bitmap, p, addr, n_pages);
> >> + }
> >> +
> >> + if (IS_PM_SCAN_WP(p->flags) && !ret &&
> >> + uffd_wp_range(vma, addr, end - addr, true) < 0)
> >
> > Why do we need to call uffd_wp_range for holes? Should we call
> > flush_tlb_range after it?

Did you skip this question?

> >
> >> + ret = -EINVAL;
> >> + }
> >> +
> >> + return ret;
> >> +}
> >> +
> >> +static const struct mm_walk_ops pagemap_scan_ops = {
> >> + .test_walk = pagemap_scan_test_walk,
> >> + .pmd_entry = pagemap_scan_pmd_entry,
> >> + .pte_hole = pagemap_scan_pte_hole,
> >> + .hugetlb_entry = pagemap_scan_hugetlb_entry,
> >> +};
> >> +
> >> +static int pagemap_scan_args_valid(struct pm_scan_arg *arg, unsigned long start,
> >> + unsigned long end, struct page_region __user *vec)
> >> +{
> >> + /* Detect illegal size, flags, len and masks */
> >> + if (arg->size != sizeof(struct pm_scan_arg))
> >> + return -EINVAL;
> >> + if (!arg->flags)
> >> + return -EINVAL;
> >> + if (arg->flags & ~PM_SCAN_OPS)
> >> + return -EINVAL;
> >> + if (!(end - start))
> >> + return -EINVAL;
> >> + if ((arg->required_mask | arg->anyof_mask | arg->excluded_mask |
> >> + arg->return_mask) & ~PM_SCAN_BITS_ALL)
> >> + return -EINVAL;
> >> + if (!arg->required_mask && !arg->anyof_mask &&
> >> + !arg->excluded_mask)
> >> + return -EINVAL;
> >> + if (!arg->return_mask)
> >> + return -EINVAL;
> >> +
> >> + /* Validate memory range */
> >> + if (!IS_ALIGNED(start, PAGE_SIZE))
> >> + return -EINVAL;
> >> + if (!access_ok((void __user *)start, end - start))
> >> + return -EFAULT;
> >> +
> >> + if (IS_PM_SCAN_GET(arg->flags)) {
> >> + if (arg->vec_len == 0)
> >> + return -EINVAL;
> >> + if (!vec)
> >> + return -EFAULT;
> >> + if (!access_ok((void __user *)vec,
> >> + arg->vec_len * sizeof(struct page_region)))
> >> + return -EFAULT;
> >> + }
> >> +
> >> + if (IS_PM_SCAN_WP(arg->flags) && !IS_PM_SCAN_GET(arg->flags) &&
> >> + arg->max_pages)
> >> + return -EINVAL;
> >> +
> >> + return 0;
> >> +}
> >> +
> >> +static long do_pagemap_scan(struct mm_struct *mm, unsigned long __arg)
> >> +{
> >> + struct pm_scan_arg __user *uarg = (struct pm_scan_arg __user *)__arg;
> >> + unsigned long long start, end, walk_start, walk_end;
> >> + unsigned long empty_slots, vec_index = 0;
> >> + struct mmu_notifier_range range;
> >> + struct page_region __user *vec;
> >> + struct pagemap_scan_private p;
> >> + struct pm_scan_arg arg;
> >> + int ret = 0;
> >> +
> >> + if (copy_from_user(&arg, uarg, sizeof(arg)))
> >> + return -EFAULT;
> >> +
> >> + start = untagged_addr((unsigned long)arg.start);
> >> + end = untagged_addr((unsigned long)arg.end);
> >> + vec = (struct page_region __user *)untagged_addr((unsigned long)arg.vec);
> >> +
> >> + ret = pagemap_scan_args_valid(&arg, start, end, vec);
> >> + if (ret)
> >> + return ret;
> >> +
> >> + p.max_pages = (arg.max_pages) ? arg.max_pages : ULONG_MAX;
> >> + p.found_pages = 0;
> >> + p.required_mask = arg.required_mask;
> >> + p.anyof_mask = arg.anyof_mask;
> >> + p.excluded_mask = arg.excluded_mask;
> >> + p.return_mask = arg.return_mask;
> >> + p.flags = arg.flags;
> >> + p.flags |= ((p.required_mask | p.anyof_mask | p.excluded_mask) &
> >> + PAGE_IS_WRITTEN) ? PM_SCAN_REQUIRE_UFFD : 0;
> >> + p.cur_buf.start = p.cur_buf.len = p.cur_buf.flags = 0;
> >> + p.vec_buf = NULL;
> >> + p.vec_buf_len = PAGEMAP_WALK_SIZE >> PAGE_SHIFT;
> >> + p.vec_buf_index = 0;
> >> + p.end_addr = 0;
> >> +
> >> + /*
> >> + * Allocate smaller buffer to get output from inside the page walk
> >> + * functions and walk page range in PAGEMAP_WALK_SIZE size chunks. As
> >> + * we want to return output to user in compact form where no two
> >> + * consecutive regions should be continuous and have the same flags.
> >> + * So store the latest element in p.cur_buf between different walks and
> >> + * store the p.cur_buf at the end of the walk to the user buffer.
> >> + */
> >> + if (IS_PM_SCAN_GET(p.flags)) {
> >> + p.vec_buf = kmalloc_array(p.vec_buf_len, sizeof(*p.vec_buf),
> >> + GFP_KERNEL);
> >> + if (!p.vec_buf)
> >> + return -ENOMEM;
> >> + }
> >> +
> >> + /*
> >> + * Protection change for the range is going to happen.
> >> + */
> >> + if (IS_PM_SCAN_WP(p.flags)) {
> >> + mmu_notifier_range_init(&range, MMU_NOTIFY_PROTECTION_VMA, 0,
> >> + mm, start, end);
> >> + mmu_notifier_invalidate_range_start(&range);
> >> + }
> >> +
> >> + walk_start = walk_end = start;
> >> + while (walk_end < end && !ret) {
> >> + if (IS_PM_SCAN_GET(p.flags)) {
> >> + /*
> >> + * All data is copied to cur_buf first. When more data
> >> + * is found, we push cur_buf to vec_buf and copy new
> >> + * data to cur_buf. Subtract 1 from length as the
> >> + * index of cur_buf isn't counted in length.
> >> + */
> >> + empty_slots = arg.vec_len - vec_index;
> >> + p.vec_buf_len = min(p.vec_buf_len, empty_slots - 1);
> >> + }
> >> +
> >
> > I still don't understand why we don't want/need to check for pending signals.
> We haven't added it as other existing code such as mincore() and

It doesn't convince me. There should be reasons to do or not to do
certain things.
We can't say how long this loop can be running, so it is the reason
why we can want
to check pending signals.

> pagemap_read() don't have it either.

I already explained that this case is different, because the size of
the output buffer is
limited for pagemap_read.

> Also mmap_read_lock_killable would return error if there is some critical single pending.\

It isn't completely true. It doesn't return errors in the fast path
when it takes the lock right
away. It checks signals only when it needs to wait for the lock.

>
> >
> >> + ret = mmap_read_lock_killable(mm);
> >> + if (ret)
> >> + goto out;
> >> +
> >> + walk_end = min((walk_start + PAGEMAP_WALK_SIZE) & PAGEMAP_WALK_MASK, end);
> >> +
> >> + ret = walk_page_range(mm, walk_start, walk_end,
> >> + &pagemap_scan_ops, &p);
> >> + mmap_read_unlock(mm);
> >> +
> >> + if (ret == PM_SCAN_FOUND_MAX_PAGES || ret == PM_SCAN_END_WALK)
> >> + arg.start = p.end_addr;
> >
> > nit: this check can be moved out of the loop.
> No, ret could get replaced by error if copy_to_user() fails. So we have to
> do this before that.

If we fail to copy a vector, it is a fatal error and it probably doesn't matter
what end address has been there. It is up to you to leave it here or not.

>
> >
> >> +
> >> + if (ret && ret != PM_SCAN_FOUND_MAX_PAGES &&
> >> + ret != PM_SCAN_END_WALK)
> >> + goto out;
> >> +
> >> + if (p.vec_buf_index) {
> >> + if (copy_to_user(&vec[vec_index], p.vec_buf,
> >> + p.vec_buf_index * sizeof(*p.vec_buf))) {
> >> + /*
> >> + * Return error even though the OP succeeded
> >> + */
> >> + ret = -EFAULT;
> >> + goto out;
> >> + }
> >> + vec_index += p.vec_buf_index;
> >> + p.vec_buf_index = 0;
> >> + }
> >> + walk_start = walk_end;
> >> + }
> >> +
> >> + if (p.cur_buf.len) {
> >> + if (copy_to_user(&vec[vec_index], &p.cur_buf, sizeof(p.cur_buf))) {
> >> + ret = -EFAULT;
> >> + goto out;
> >> + }
> >> + vec_index++;
> >> + }
> >> +
> >> + ret = vec_index;
> >> +
> >> +out:
> >> + if (!p.end_addr)
> >> + arg.start = walk_start;
> >> + if (copy_to_user(&uarg->start, &arg.start, sizeof(arg.start)))
> >> + ret = -EFAULT;
> >> +
> >> + if (IS_PM_SCAN_WP(p.flags))
> >> + mmu_notifier_invalidate_range_end(&range);
> >> +
> >> + kfree(p.vec_buf);
> >> + return ret;
> >> +}
> >> +

Thanks,
Andrei