Re: [PATCH v29 2/6] fs/proc/task_mmu: Implement IOCTL to get and optionally clear info about PTEs

From: Muhammad Usama Anjum
Date: Sun Aug 13 2023 - 04:14:52 EST


On 8/12/23 8:49 PM, Michał Mirosław wrote:
> On Fri, Aug 11, 2023 at 11:08:38PM +0500, Muhammad Usama Anjum wrote:
>> The PAGEMAP_SCAN IOCTL on the pagemap file can be used to get or optionally
>> clear the info about page table entries. The following operations are supported
>> in this IOCTL:
>> - Scan the address range and get the memory ranges matching the provided criteria.
>> This is performed by default when the output buffer is specified.
>
> Nit: This is actually performed always, but you can disable the output part
> by passing {NULL, 0} for the buffer.
I'll update it to:
"This is performed when the output buffer is specified."

>
> [...]
>> --- a/fs/proc/task_mmu.c
>> +++ b/fs/proc/task_mmu.c
>> @@ -19,6 +19,8 @@
>> #include <linux/shmem_fs.h>
>> #include <linux/uaccess.h>
>> #include <linux/pkeys.h>
>> +#include <linux/minmax.h>
>> +#include <linux/overflow.h>
>>
>> #include <asm/elf.h>
>> #include <asm/tlb.h>
>> @@ -1749,11 +1751,682 @@ static int pagemap_release(struct inode *inode, struct file *file)
>> return 0;
>> }
>>
>> +#define PM_SCAN_CATEGORIES (PAGE_IS_WPALLOWED | PAGE_IS_WRITTEN | \
>> + PAGE_IS_FILE | PAGE_IS_PRESENT | \
>> + PAGE_IS_SWAPPED | PAGE_IS_PFNZERO | \
>> + PAGE_IS_HUGE)
>> +#define PM_SCAN_FLAGS (PM_SCAN_WP_MATCHING | PM_SCAN_CHECK_WPASYNC)
>> +
>> +struct pagemap_scan_private {
>> + struct pm_scan_arg arg;
>> + unsigned long masks_of_interest, cur_vma_category;
>> + struct page_region *vec_buf;
>> + unsigned long vec_buf_len, vec_buf_index, found_pages, walk_end_addr;
>> + struct page_region __user *vec_out;
>> +};
> [...]
>> +static unsigned long pagemap_thp_category(pmd_t pmd)
>> +{
>> + unsigned long categories = PAGE_IS_HUGE;
>> +
>> + /*
>> + * THPs don't support file-backed memory. So PAGE_IS_FILE
>> + * hasn't been checked here.
>
> "hasn't been" -> "is not"
> (same for HugeTLB comment)
I'll update.

>
>> +static bool pagemap_scan_push_range(unsigned long categories,
>> + struct pagemap_scan_private *p,
>> + unsigned long addr, unsigned long end)
>> +{
>> + struct page_region *cur_buf = &p->vec_buf[p->vec_buf_index];
>> +
>> + /*
>> + * When there is no output buffer provided at all, the sentinel values
>> + * won't match here. There is no other way for `cur_buf->end` to be
>> + * non-zero other than it being non-empty.
>> + */
>> + if (addr == cur_buf->end && categories == cur_buf->categories) {
>> + cur_buf->end = end;
>> + return true;
>> + }
>> +
>> + if (cur_buf->end) {
>> + if (p->vec_buf_index >= p->vec_buf_len - 1)
>> + return false;
>> +
>> + cur_buf = &p->vec_buf[++p->vec_buf_index];
>> + }
>> +
>> + cur_buf->start = addr;
>> + cur_buf->end = end;
>> + cur_buf->categories = categories;
>> +
>> + return true;
>> +}
>> +
>> +static void pagemap_scan_backout_range(struct pagemap_scan_private *p,
>> + unsigned long addr, unsigned long end)
>> +{
>> + struct page_region *cur_buf = &p->vec_buf[p->vec_buf_index];
>> +
>> + if (cur_buf->start != addr) {
>> + cur_buf->end = addr;
>> + } else {
>> + cur_buf->start = cur_buf->end = 0;
>> + if (p->vec_buf_index > 0)
>> + p->vec_buf_index--;
>
> There is no need to move to the previous index, as if the walk ends at
> this moment, the flush_buffer() code will ignore the empty last range.
Yeah, I'll update.

>
>> + }
>> +
>> + p->found_pages -= (end - addr) / PAGE_SIZE;
>> +}
>> +
>> +static int pagemap_scan_output(unsigned long categories,
>> + struct pagemap_scan_private *p,
>> + unsigned long addr, unsigned long *end)
>> +{
>> + unsigned long n_pages, total_pages;
>> + int ret = 0;
>> +
>> + if (!p->vec_buf)
>> + return 0;
>> +
>> + categories &= p->arg.return_mask;
>> +
>> + n_pages = (*end - addr) / PAGE_SIZE;
>> + if (check_add_overflow(p->found_pages, n_pages, &total_pages) ||
>> + total_pages > p->arg.max_pages) {
>> + size_t n_too_much = total_pages - p->arg.max_pages;
>> + *end -= n_too_much * PAGE_SIZE;
>> + n_pages -= n_too_much;
>> + ret = -ENOSPC;
>> + }
>> +
>> + if (!pagemap_scan_push_range(categories, p, addr, *end)) {
>> + *end = addr;
>> + n_pages = 0;
>> + ret = -ENOSPC;
>> + }
>> +
>> + p->found_pages += n_pages;
>> + if (ret)
>> + p->walk_end_addr = *end;
>> +
>> + return ret;
>> +}
> [...]
>> +static int pagemap_scan_init_bounce_buffer(struct pagemap_scan_private *p)
>> +{
>> + if (!p->arg.vec_len)
>> + return 0;
>
> The removal of `cur_buf` lost the case of empty non-NULL output buffer
> passed in args. That was requesting the walk to stop at first matching
> page (with the address returned in `walk_end`). The push_range() call
> is still checking that, but since neither the buffer nor the sentinel
> values are set, the case is not possible to invoke.
Yeah, this is why I've removed all that logic here. The vec_len is set to 0
and vec_buf to NULL. This handles all the cases.

>
>> + /*
>> + * Allocate a smaller buffer to get output from inside the page
>> + * walk functions and walk the range in PAGEMAP_WALK_SIZE chunks.
>> + */
>
> I think this is no longer true? We can now allocate arbitrary number of
> entries, but should probably have at least 512 to cover one PMD of pages.
> So it would be better to have a constant that holds the number of
> entries in the bounce buffer.
I'll remove the comment. PAGEMAP_WALK_SIZE >> PAGE_SHIFT is a constant
already, just a fancy one.

Altough if we can increase 512 to bigger number, it'll be better in terms
of performance. I'm not sure how much we can increase it.

>
>> + p->vec_buf_len = min_t(size_t, PAGEMAP_WALK_SIZE >> PAGE_SHIFT,
>> + p->arg.vec_len);
>> + p->vec_buf = kmalloc_array(p->vec_buf_len, sizeof(*p->vec_buf),
>> + GFP_KERNEL);
>> + if (!p->vec_buf)
>> + return -ENOMEM;
>> +
>> + p->vec_buf[0].end = 0;
>
> p->vec_buf->start = p->vec_buf->end = 0;
Sure.

>
>> + p->vec_out = (struct page_region __user *)p->arg.vec;
>> +
>> + return 0;
>> +}
>> +
>> +static int pagemap_scan_flush_buffer(struct pagemap_scan_private *p)
>> +{
>> + const struct page_region *buf = p->vec_buf;
>> + int n = (int)p->vec_buf_index;
>
> Why do you need an `int` here (requiring a cast)?
Just looked at it, n and return code of pagemap_scan_flush_buffer() should
be long. Changed.

>
>> + if (p->arg.vec_len == 0)
>> + return 0;
>
> This should be actually `if (!buf)` as this notes that we don't have any
> buffer allocated (due to no output requested).
I'll update. !buf seems more reasonable.

>
>> + if (buf[n].end && buf[n].end != buf[n].start)
>> + n++;
>
> Testing `buf[n].end` is redundant, as the range is nonempty if
> `end != start`.
Sure.

>
>> + if (!n)
>> + return 0;
>> +
>> + if (copy_to_user(p->vec_out, buf, n * sizeof(*buf)))
>> + return -EFAULT;
>> +
>> + p->arg.vec_len -= n;
>> + p->vec_out += n;
>> +
>> + p->vec_buf_index = 0;
>> + p->vec_buf_len = min_t(size_t, p->vec_buf_len, p->arg.vec_len);
>> + p->vec_buf[0].end = 0;
>
> buf->start = buf->end = 0;
Sure.

>
>> + return n;
>> +}
>> +
>> +static long do_pagemap_scan(struct mm_struct *mm, unsigned long uarg)
>> +{
>> + struct mmu_notifier_range range;
>> + struct pagemap_scan_private p = {0};
>> + unsigned long walk_start;
>> + size_t n_ranges_out = 0;
>> + int ret;
>> +
>> + ret = pagemap_scan_get_args(&p.arg, uarg);
>> + if (ret)
>> + return ret;
>> +
>> + p.masks_of_interest = p.arg.category_inverted | p.arg.category_mask |
>> + p.arg.category_anyof_mask | p.arg.return_mask;
>
> `category_inverted` can be left out, because if a set bit it is not also in one
> of the masks, then its value is going to be ignored.
Okay.

>
> [...]
>> + for (walk_start = p.arg.start; walk_start < p.arg.end;
>> + walk_start = p.arg.walk_end) {
>> + int n_out;
>> +
>> + if (fatal_signal_pending(current)) {
>> + ret = -EINTR;
>> + break;
>> + }
>> +
>> + ret = mmap_read_lock_killable(mm);
>> + if (ret)
>> + break;
>> + ret = walk_page_range(mm, walk_start, p.arg.end,
>> + &pagemap_scan_ops, &p);
>> + mmap_read_unlock(mm);
>> +
>> + n_out = pagemap_scan_flush_buffer(&p);
>> + if (n_out < 0)
>> + ret = n_out;
>> + else
>> + n_ranges_out += n_out;
>> +
>> + p.walk_end_addr = p.walk_end_addr ? p.walk_end_addr : p.arg.end;
>
> Why is `p.walk_end_addr` needed? It is not used in the loop code. Shoudn't
> it be `p.arg.walk_end` as used in the `for` loop continuation statement?
It isn't needed for the loop. But we need to note down the ending address
of walk. We can switch to using p.arg.walk_end for better logical reason.
I'll update code.

>
>> + if (ret != -ENOSPC || p.arg.vec_len == 0 ||
>> + p.found_pages == p.arg.max_pages)
>> + break;
>
> Nit: I think you could split this into two or three separate `if (x)
> break;` for easier reading. The `vec_len` and `found_pages` are
> buffer-full tests, so could go along, but `ret != ENOSPC` is checking an
> error condition aborting the scan before it ends.
Can be done.

>
>> + }
>> +
>> + /* ENOSPC signifies early stop (buffer full) from the walk. */
>> + if (!ret || ret == -ENOSPC)
>> + ret = n_ranges_out;
>> +
>> + p.arg.walk_end = p.walk_end_addr ? p.walk_end_addr : walk_start;
>> + if (pagemap_scan_writeback_args(&p.arg, uarg))
>> + ret = -EFAULT;
> [...]
>> --- a/include/uapi/linux/fs.h
>> +++ b/include/uapi/linux/fs.h
> [...]
>> +/*
>> + * struct pm_scan_arg - Pagemap ioctl argument
>> + * @size: Size of the structure
>> + * @flags: Flags for the IOCTL
>> + * @start: Starting address of the region
>> + * @end: Ending address of the region
>> + * @walk_end Address where the scan stopped (written by kernel).
>> + * walk_end == end (tag removed) informs that the scan completed on entire range.
>
> I'm not sure `tag removed` is enough to know what tag was removed.
> Maybe something like "with address tags cleared" would fit?
Okay.

>
> Best Regards
> Michał Mirosław

--
BR,
Muhammad Usama Anjum