Re: [PATCH v2 1/4] hmm: Device exclusive memory access

From: Christoph Hellwig
Date: Fri Feb 19 2021 - 04:49:40 EST


> page = migration_entry_to_page(swpent);
> else if (is_device_private_entry(swpent))
> page = device_private_entry_to_page(swpent);
> + else if (is_device_exclusive_entry(swpent))
> + page = device_exclusive_entry_to_page(swpent);

> page = migration_entry_to_page(swpent);
> else if (is_device_private_entry(swpent))
> page = device_private_entry_to_page(swpent);
> + else if (is_device_exclusive_entry(swpent))
> + page = device_exclusive_entry_to_page(swpent);

> if (is_device_private_entry(entry))
> page = device_private_entry_to_page(entry);
> +
> + if (is_device_exclusive_entry(entry))
> + page = device_exclusive_entry_to_page(entry);

Any chance we can come up with a clever scheme to avoid all this
boilerplate code (and maybe also what it gets compiled to)?

> diff --git a/include/linux/hmm.h b/include/linux/hmm.h
> index 866a0fa104c4..5d28ff6d4d80 100644
> --- a/include/linux/hmm.h
> +++ b/include/linux/hmm.h
> @@ -109,6 +109,10 @@ struct hmm_range {
> */
> int hmm_range_fault(struct hmm_range *range);
>
> +int hmm_exclusive_range(struct mm_struct *mm, unsigned long start,
> + unsigned long end, struct page **pages);
> +vm_fault_t hmm_remove_exclusive_entry(struct vm_fault *vmf);

Can we avoid the hmm naming for new code (we should probably also kill
it off for the existing code)?

> +#define free_swap_and_cache(e) ({(is_migration_entry(e) || is_device_private_entry(e) \
> + || is_device_exclusive_entry(e)); })
> +#define swapcache_prepare(e) ({(is_migration_entry(e) || is_device_private_entry(e) \
> + || is_device_exclusive_entry(e)); })

Can you turn these into properly formatted inline functions? As-is this
becomes pretty unreadable.

> +static inline void make_device_exclusive_entry_read(swp_entry_t *entry)
> +{
> + *entry = swp_entry(SWP_DEVICE_EXCLUSIVE_READ, swp_offset(*entry));
> +}

s/make_device_exclusive_entry_read/mark_device_exclusive_entry_readable/
??

> +
> +static inline swp_entry_t make_device_exclusive_entry(struct page *page, bool write)
> +{
> + return swp_entry(write ? SWP_DEVICE_EXCLUSIVE_WRITE : SWP_DEVICE_EXCLUSIVE_READ,
> + page_to_pfn(page));
> +}

I'd split this into two helpers, which is easier to follow and avoids
the pointlessly overlong lines.

> +static inline bool is_device_exclusive_entry(swp_entry_t entry)
> +{
> + int type = swp_type(entry);
> + return type == SWP_DEVICE_EXCLUSIVE_READ || type == SWP_DEVICE_EXCLUSIVE_WRITE;
> +}

Another overly long line. I also wouldn't bother with the local
variable:

return swp_type(entry) == SWP_DEVICE_EXCLUSIVE_READ ||
swp_type(entry) == SWP_DEVICE_EXCLUSIVE_WRITE;


> +static inline bool is_write_device_exclusive_entry(swp_entry_t entry)
> +{
> + return swp_type(entry) == SWP_DEVICE_EXCLUSIVE_WRITE;
> +}

Or reuse these kind of helpers..

> +
> +static inline unsigned long device_exclusive_entry_to_pfn(swp_entry_t entry)
> +{
> + return swp_offset(entry);
> +}
> +
> +static inline struct page *device_exclusive_entry_to_page(swp_entry_t entry)
> +{
> + return pfn_to_page(swp_offset(entry));
> +}

I'd rather open code these two, and as a prep patch also kill off the
equivalents for the migration and device private entries, which would
actually clean up a lot of the mess mentioned in my first comment above.

> +static int hmm_exclusive_skip(unsigned long start,
> + unsigned long end,
> + __always_unused int depth,
> + struct mm_walk *walk)
> +{
> + struct hmm_exclusive_walk *hmm_exclusive_walk = walk->private;
> + unsigned long addr;
> +
> + for (addr = start; addr < end; addr += PAGE_SIZE)
> + hmm_exclusive_walk->pages[hmm_exclusive_walk->npages++] = NULL;
> +
> + return 0;
> +}

Wouldn't pre-zeroing the array be simpler and more efficient?

> +int hmm_exclusive_range(struct mm_struct *mm, unsigned long start,
> + unsigned long end, struct page **pages)
> +{
> + struct hmm_exclusive_walk hmm_exclusive_walk = { .pages = pages, .npages = 0 };
> + int i;
> +
> + /* Collect and lock candidate pages */
> + walk_page_range(mm, start, end, &hmm_exclusive_walk_ops, &hmm_exclusive_walk);

Please avoid the overly long lines.

But more importantly: Unless I'm missing something obvious this
walk_page_range call just open codes get_user_pages_fast, why can't you
use that?

> +#if defined(CONFIG_ARCH_ENABLE_THP_MIGRATION) || defined(CONFIG_HUGETLB)
> + if (PageTransHuge(page)) {
> + VM_BUG_ON_PAGE(1, page);
> + continue;
> + }
> +#endif

Doesn't PageTransHuge always return false for that case? If not
shouldn't we make sure it does?

> +
> + pte = pte_mkold(mk_pte(page, READ_ONCE(vma->vm_page_prot)));
> + if (pte_swp_soft_dirty(*pvmw.pte))
> + pte = pte_mksoft_dirty(pte);
> +
> + entry = pte_to_swp_entry(*pvmw.pte);
> + if (pte_swp_uffd_wp(*pvmw.pte))
> + pte = pte_mkuffd_wp(pte);
> + else if (is_write_device_exclusive_entry(entry))
> + pte = maybe_mkwrite(pte_mkdirty(pte), vma);
> +
> + set_pte_at(vma->vm_mm, pvmw.address, pvmw.pte, pte);
> +
> + /*
> + * No need to take a page reference as one was already
> + * created when the swap entry was made.
> + */
> + if (PageAnon(page))
> + page_add_anon_rmap(page, vma, pvmw.address, false);
> + else
> + page_add_file_rmap(page, false);
> +
> + if (vma->vm_flags & VM_LOCKED)
> + mlock_vma_page(page);
> +
> + /*
> + * No need to invalidate - it was non-present before. However
> + * secondary CPUs may have mappings that need invalidating.
> + */
> + update_mmu_cache(vma, pvmw.address, pvmw.pte);

It would be nice to split out the body of this loop into a helper.

> + if (!is_device_private_entry(entry) &&
> + !is_device_exclusive_entry(entry))

The normal style for this would be:

if (!is_device_private_entry(entry) &&
!is_device_exclusive_entry(entry))

> - if (!is_device_private_entry(entry))
> + if (!is_device_private_entry(entry) && !is_device_exclusive_entry(entry))

Plase split this into two lines.

> @@ -216,6 +219,7 @@ bool page_vma_mapped_walk(struct page_vma_mapped_walk *pvmw)
> }
> if (!map_pte(pvmw))
> goto next_pte;
> +
> while (1) {
> if (check_pte(pvmw))
> return true;

Spurious whitespace change.

> - if (IS_ENABLED(CONFIG_MIGRATION) && (flags & TTU_MIGRATION) &&
> + if (IS_ENABLED(CONFIG_MIGRATION) && (flags & (TTU_MIGRATION | TTU_EXCLUSIVE)) &&

Please split this into two lines.

> is_zone_device_page(page) && !is_device_private_page(page))
> return true;
>
> @@ -1591,6 +1591,33 @@ static bool try_to_unmap_one(struct page *page, struct vm_area_struct *vma,
> /* We have to invalidate as we cleared the pte */
> mmu_notifier_invalidate_range(mm, address,
> address + PAGE_SIZE);
> + } else if (flags & TTU_EXCLUSIVE) {

try_to_unmap_one has turned into a monster. A little refactoring to
split it into managable pieces would be nice.