Re: [PATCH v21 2/5] fs/proc/task_mmu: Implement IOCTL to get and optionally clear info about PTEs

From: Muhammad Usama Anjum
Date: Tue Jun 27 2023 - 05:00:53 EST


Hi Andrei and Michal,

Lets resolve last two points. Please reply below.

On 6/27/23 6:46 AM, Andrei Vagin wrote:
...
>> +#ifdef CONFIG_HUGETLB_PAGE
>> +static int pagemap_scan_hugetlb_entry(pte_t *ptep, unsigned long hmask,
>> + unsigned long start, unsigned long end,
>> + struct mm_walk *walk)
>> +{
>> + unsigned long n_pages = (end - start)/PAGE_SIZE;
>> + struct pagemap_scan_private *p = walk->private;
>> + struct vm_area_struct *vma = walk->vma;
>> + bool is_written, is_interesting = true;
>> + struct hstate *h = hstate_vma(vma);
>> + unsigned long bitmap;
>> + spinlock_t *ptl;
>> + int ret = 0;
>> + pte_t ptent;
>> +
>> + if (IS_PM_SCAN_WP(p->flags) && n_pages < HPAGE_SIZE/PAGE_SIZE)
>> + return -EINVAL;
>> +
>> + if (n_pages > p->max_pages - p->found_pages)
>> + n_pages = p->max_pages - p->found_pages;
>> +
>> + if (IS_PM_SCAN_WP(p->flags)) {
>> + i_mmap_lock_write(vma->vm_file->f_mapping);
>> + ptl = huge_pte_lock(h, vma->vm_mm, ptep);
>> + }
>> +
>> + ptent = huge_ptep_get(ptep);
>> + is_written = !is_huge_pte_uffd_wp(ptent);
>> +
>> + /*
>> + * Partial hugetlb page clear isn't supported
>> + */
>> + if (is_written && IS_PM_SCAN_WP(p->flags) &&
>> + n_pages < HPAGE_SIZE/PAGE_SIZE) {
>
> should it be done only if is_interesting is set?
This can be good optimization. We shouldn't return error before finding if
page is interesting. I'll update.

>
>> + ret = PM_SCAN_END_WALK;
>> + goto unlock_and_return;
>> + }
>> +
>> + bitmap = PM_SCAN_FLAGS(is_written, pagemap_scan_is_huge_file(ptent),
>> + pte_present(ptent), is_swap_pte(ptent),
>> + pte_present(ptent) && is_zero_pfn(pte_pfn(ptent)));
>> +
>> + if (IS_PM_SCAN_GET(p->flags)) {
>> + is_interesting = pagemap_scan_is_interesting_page(bitmap, p);
>> + if (is_interesting)
>> + ret = pagemap_scan_output(bitmap, p, start, n_pages);
>> + }
>> +
>> + if (IS_PM_SCAN_WP(p->flags) && is_written && is_interesting &&
>> + ret >= 0) {
>> + make_uffd_wp_huge_pte(vma, start, ptep, ptent);
>> + flush_hugetlb_tlb_range(vma, start, end);
>> + }
>> +
>> +unlock_and_return:
>> + if (IS_PM_SCAN_WP(p->flags)) {
>> + spin_unlock(ptl);
>> + i_mmap_unlock_write(vma->vm_file->f_mapping);
>> + }
>> +
>> + return ret;
>> +}
...
>> +
>> +static long do_pagemap_scan(struct mm_struct *mm, unsigned long __arg)
>> +{
>> + struct pm_scan_arg __user *uarg = (struct pm_scan_arg __user *)__arg;
>> + unsigned long long start, end, walk_start, walk_end;
>> + unsigned long empty_slots, vec_index = 0;
>> + struct mmu_notifier_range range;
>> + struct page_region __user *vec;
>> + struct pagemap_scan_private p;
>> + struct pm_scan_arg arg;
>> + int ret = 0;
>> +
>> + if (copy_from_user(&arg, uarg, sizeof(arg)))
>> + return -EFAULT;
>> +
>> + start = untagged_addr((unsigned long)arg.start);
>> + vec = (struct page_region *)untagged_addr((unsigned long)arg.vec);
>> +
>> + ret = pagemap_scan_args_valid(&arg, start, vec);
>> + if (ret)
>> + return ret;
>> +
>> + end = start + arg.len;
>> + p.max_pages = (arg.max_pages) ? arg.max_pages : ULONG_MAX;
>> + p.found_pages = 0;
>> + p.required_mask = arg.required_mask;
>> + p.anyof_mask = arg.anyof_mask;
>> + p.excluded_mask = arg.excluded_mask;
>> + p.return_mask = arg.return_mask;
>> + p.flags = arg.flags;
>> + p.flags |= ((p.required_mask | p.anyof_mask | p.excluded_mask) &
>> + PAGE_IS_WRITTEN) ? PM_SCAN_REQUIRE_UFFD : 0;
>> + p.cur_buf.start = p.cur_buf.len = p.cur_buf.flags = 0;
>> + p.vec_buf = NULL;
>> + p.vec_buf_len = PAGEMAP_WALK_SIZE >> PAGE_SHIFT;
>> +
>> + /*
>> + * Allocate smaller buffer to get output from inside the page walk
>> + * functions and walk page range in PAGEMAP_WALK_SIZE size chunks. As
>> + * we want to return output to user in compact form where no two
>> + * consecutive regions should be continuous and have the same flags.
>> + * So store the latest element in p.cur_buf between different walks and
>> + * store the p.cur_buf at the end of the walk to the user buffer.
>> + */
>> + if (IS_PM_SCAN_GET(p.flags)) {
>> + p.vec_buf = kmalloc_array(p.vec_buf_len, sizeof(*p.vec_buf),
>> + GFP_KERNEL);
>> + if (!p.vec_buf)
>> + return -ENOMEM;
>> + }
>> +
>> + if (IS_PM_SCAN_WP(p.flags)) {
>> + mmu_notifier_range_init(&range, MMU_NOTIFY_PROTECTION_VMA, 0,
>> + mm, start, end);
>> + mmu_notifier_invalidate_range_start(&range);
>> + }
>> +
>> + walk_start = walk_end = start;
>> + while (walk_end < end && !ret) {
>> + if (IS_PM_SCAN_GET(p.flags)) {
>> + p.vec_buf_index = 0;
>> +
>> + /*
>> + * All data is copied to cur_buf first. When more data
>> + * is found, we push cur_buf to vec_buf and copy new
>> + * data to cur_buf. Subtract 1 from length as the
>> + * index of cur_buf isn't counted in length.
>> + */
>> + empty_slots = arg.vec_len - vec_index;
>> + p.vec_buf_len = min(p.vec_buf_len, empty_slots - 1);
>> + }
>> +
>> + walk_end = (walk_start + PAGEMAP_WALK_SIZE) & PAGEMAP_WALK_MASK;
>> + if (walk_end > end)
>> + walk_end = end;
>> +
>
> If this loop can run for a long time, we need to interrupt it in case of
> pending signals.
>
> If you think we don't need to do that, pls explain in the commit
> message, so that maintainers don't miss this part and double check that
> everything is alright here.
This can be done. I'll add to the commit message that we are walking over
entire range passed.

>
>> + ret = mmap_read_lock_killable(mm);
>> + if (ret)
>
> If any pages have been handled, we need to report them to user-space. It
> isn't acceptable to return a error in such cases.
This will return error only when task has gotten some serios signal and it
is giong to be killed. In this scenerio, we shouldn't care about returning
gracefully. Why do you think we should return gracefully in this case?

>
> And we need to report an address where it stopped scanning.
> We can do that by adding zero length vector.
I don't want to do multiplexing the ending address in vec. Can we add
end_addr variable in struct pm_scan_arg to always return the ending address?

struct pm_scan_arg {
...
_u64 end_addr;
};


>
>
>> + goto free_data;
>> + ret = walk_page_range(mm, walk_start, walk_end,
>> + &pagemap_scan_ops, &p);
>> + mmap_read_unlock(mm);
>> +
>> + if (ret && ret != PM_SCAN_FOUND_MAX_PAGES &&
>> + ret != PM_SCAN_END_WALK)
>> + goto free_data;
>> +
>> + walk_start = walk_end;
>> + if (IS_PM_SCAN_GET(p.flags) && p.vec_buf_index) {
>> + if (copy_to_user(&vec[vec_index], p.vec_buf,
>> + p.vec_buf_index * sizeof(*p.vec_buf))) {
>> + /*
>> + * Return error even though the OP succeeded
>> + */
>> + ret = -EFAULT;
>> + goto free_data;
>> + }
>> + vec_index += p.vec_buf_index;
>
> Should we set ret to zero here if it is equal PM_SCAN_END_WALK.
No, PM_SCAN_END_WALK is just internal code to stop the page walk and return
immedtitely. When we get this return value, we stop this loop and return to
user with whatever data we have in user buffer.

>
>> + }
>> + }
>> +
>> + if (p.cur_buf.len) {
>> + if (copy_to_user(&vec[vec_index], &p.cur_buf, sizeof(p.cur_buf))) {
>> + ret = -EFAULT;
>> + goto free_data;
>> + }
>> + vec_index++;
>> + }
>> +
>> + ret = vec_index;
>> +
>> +free_data:
>> + if (IS_PM_SCAN_WP(p.flags))
>> + mmu_notifier_invalidate_range_end(&range);
>> +
>> + kfree(p.vec_buf);
>> + return ret;
>> +}
>> +
...

--
BR,
Muhammad Usama Anjum