Re: linux-next: manual merge of the akpm-current tree with the folio tree

From: Stephen Rothwell
Date: Wed Feb 16 2022 - 04:50:11 EST


Hi all,

On Wed, 16 Feb 2022 17:21:09 +1100 Stephen Rothwell <sfr@xxxxxxxxxxxxxxxx> wrote:
>
> It looks like Andrew now has a new version of Hugh's patches and there
> are quite a few other conflicts as well (see my attempt at mm/gup.c).

I have attached the conflicts I get when I merge the folio tree into
today's linux-next.
--
Cheers,
Stephen Rothwell
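The attachment below is git's combined ("diff --cc") output, captured while the
merge is still in its conflicted state, so the ++<<<<<<< / ++======= /
++>>>>>>> folio/for-next markers are part of the report rather than noise. A
minimal sketch of how such a dump can be reproduced follows; the remote name
"folio" and the next-20220216 tag are illustrative assumptions, not taken from
this report:

    git fetch folio                            # hypothetical remote for the folio tree
    git checkout -b test-merge next-20220216   # assumed tag for that day's linux-next
    git merge folio/for-next                   # stops with conflicts in the files below
    git diff                                   # prints the combined diff, conflict markers included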
diff --cc include/linux/mm.h
index 49692a64d645,68e68d37a3d0..000000000000
--- a/include/linux/mm.h
+++ b/include/linux/mm.h
@@@ -814,8 -829,15 +825,19 @@@ static inline int page_mapcount(struct
return atomic_read(&page->_mapcount) + 1;
}

+ int folio_mapcount(struct folio *folio);
+
#ifdef CONFIG_TRANSPARENT_HUGEPAGE
++<<<<<<< HEAD
+int total_mapcount(struct page *page);
++=======
+ static inline int total_mapcount(struct page *page)
+ {
+ return folio_mapcount(page_folio(page));
+ }
+
+ int page_trans_huge_mapcount(struct page *page);
++>>>>>>> folio/for-next
#else
static inline int total_mapcount(struct page *page)
{
@@@ -1103,7 -1162,32 +1149,36 @@@ static inline bool put_devmap_managed_p
{
return false;
}
++<<<<<<< HEAD
+#endif /* CONFIG_ZONE_DEVICE && CONFIG_FS_DAX */
++=======
+
+ static inline void put_devmap_managed_page(struct page *page)
+ {
+ }
+ #endif /* CONFIG_DEV_PAGEMAP_OPS */
+
+ static inline bool is_device_private_page(const struct page *page)
+ {
+ return IS_ENABLED(CONFIG_DEV_PAGEMAP_OPS) &&
+ IS_ENABLED(CONFIG_DEVICE_PRIVATE) &&
+ is_zone_device_page(page) &&
+ page->pgmap->type == MEMORY_DEVICE_PRIVATE;
+ }
+
+ static inline bool folio_is_device_private(const struct folio *folio)
+ {
+ return is_device_private_page(&folio->page);
+ }
+
+ static inline bool is_pci_p2pdma_page(const struct page *page)
+ {
+ return IS_ENABLED(CONFIG_DEV_PAGEMAP_OPS) &&
+ IS_ENABLED(CONFIG_PCI_P2PDMA) &&
+ is_zone_device_page(page) &&
+ page->pgmap->type == MEMORY_DEVICE_PCI_P2PDMA;
+ }
++>>>>>>> folio/for-next

/* 127: arbitrary random number, small enough to assemble well */
#define folio_ref_zero_or_close_to_overflow(folio) \
diff --cc include/linux/rmap.h
index 73cce292d32c,17230c458341..000000000000
--- a/include/linux/rmap.h
+++ b/include/linux/rmap.h
@@@ -11,9 -11,8 +11,10 @@@
#include <linux/rwsem.h>
#include <linux/memcontrol.h>
#include <linux/highmem.h>
+ #include <linux/pagemap.h>

+#include <linux/refcount.h>
+
/*
* The anon_vma heads a list of private "related" vmas, to scan if
* an anonymous page pointing to this anon_vma needs to be unmapped:
@@@ -240,7 -261,7 +263,11 @@@ unsigned long page_address_in_vma(struc
*/
int folio_mkclean(struct folio *);

++<<<<<<< HEAD
+void remove_migration_ptes(struct page *old, struct page *new, bool locked);
++=======
+ void remove_migration_ptes(struct folio *src, struct folio *dst, bool locked);
++>>>>>>> folio/for-next

/*
* Called by memory-failure.c to kill processes.
diff --cc mm/gup.c
index 4ab43b4fc9bc,57bf69ac8ab4..000000000000
--- a/mm/gup.c
+++ b/mm/gup.c
@@@ -1844,84 -1783,50 +1786,128 @@@ static long check_and_migrate_movable_p
struct page **pages,
unsigned int gup_flags)
{
++<<<<<<< HEAD
+ unsigned long isolation_error_count = 0, i;
+ struct page *prev_head = NULL;
+ LIST_HEAD(movable_page_list);
+ bool drain_allow = true;
+ int ret = 0;
+
+ for (i = 0; i < nr_pages; i++) {
+ struct page *head = compound_head(pages[i]);
+
+ if (head == prev_head)
+ continue;
+ prev_head = head;
+
+ /*
+ * Device private pages will get faulted in during gup so it
+ * shouldn't be possible to see one here.
+ */
+ if (WARN_ON_ONCE(is_device_private_page(head))) {
+ ret = -EFAULT;
+ goto unpin_pages;
++=======
+ unsigned long i;
+ unsigned long isolation_error_count = 0;
+ bool drain_allow = true;
+ LIST_HEAD(movable_page_list);
+ long ret = 0;
+ struct folio *folio, *prev_folio = NULL;
+ struct migration_target_control mtc = {
+ .nid = NUMA_NO_NODE,
+ .gfp_mask = GFP_USER | __GFP_NOWARN,
+ };
+
+ for (i = 0; i < nr_pages; i++) {
+ folio = page_folio(pages[i]);
+ if (folio == prev_folio)
+ continue;
+ prev_folio = folio;
+ /*
+ * If we get a movable page, since we are going to be pinning
+ * these entries, try to move them out if possible.
+ */
+ if (!is_pinnable_page(&folio->page)) {
+ if (folio_test_hugetlb(folio)) {
+ if (!isolate_huge_page(&folio->page,
+ &movable_page_list))
+ isolation_error_count++;
+ } else {
+ if (!folio_test_lru(folio) && drain_allow) {
+ lru_add_drain_all();
+ drain_allow = false;
+ }
+
+ if (folio_isolate_lru(folio)) {
+ isolation_error_count++;
+ continue;
+ }
+ list_add_tail(&folio->lru, &movable_page_list);
+ node_stat_mod_folio(folio,
+ NR_ISOLATED_ANON +
+ folio_is_file_lru(folio),
+ folio_nr_pages(folio));
+ }
++>>>>>>> folio/for-next
+ }
+
+ /*
+ * Device coherent pages are managed by a driver and should not
+ * be pinned indefinitely as it prevents the driver moving the
+ * page. So when trying to pin with FOLL_LONGTERM instead try
+ * to migrate the page out of device memory.
+ */
+ if (is_device_coherent_page(head)) {
+ WARN_ON_ONCE(PageCompound(head));
+
+ /*
+ * Migration will fail if the page is pinned, so convert
+ * the pin on the source page to a normal reference.
+ */
+ if (gup_flags & FOLL_PIN) {
+ get_page(head);
+ unpin_user_page(head);
+ }
+
+ pages[i] = migrate_device_page(head, gup_flags);
+ if (!pages[i]) {
+ ret = -EBUSY;
+ goto unpin_pages;
+ }
+ continue;
}
+
+ if (is_pinnable_page(head))
+ continue;
+
+ /*
+ * Try to move out any movable page before pinning the range.
+ */
+ if (PageHuge(head)) {
+ if (!isolate_huge_page(head, &movable_page_list))
+ isolation_error_count++;
+ continue;
+ }
+
+ if (!PageLRU(head) && drain_allow) {
+ lru_add_drain_all();
+ drain_allow = false;
+ }
+
+ if (isolate_lru_page(head)) {
+ isolation_error_count++;
+ continue;
+ }
+ list_add_tail(&head->lru, &movable_page_list);
+ mod_node_page_state(page_pgdat(head),
+ NR_ISOLATED_ANON + page_is_file_lru(head),
+ thp_nr_pages(head));
}

+ if (!list_empty(&movable_page_list) || isolation_error_count)
+ goto unpin_pages;
+
/*
* If list is empty, and no isolation errors, means that all pages are
* in the correct zone.
diff --cc mm/huge_memory.c
index 09fb65a80e63,f85b04b31bd1..000000000000
--- a/mm/huge_memory.c
+++ b/mm/huge_memory.c
@@@ -2137,6 -2114,8 +2134,11 @@@ void __split_huge_pmd(struct vm_area_st
{
spinlock_t *ptl;
struct mmu_notifier_range range;
++<<<<<<< HEAD
++=======
+ bool do_unlock_folio = false;
+ pmd_t _pmd;
++>>>>>>> folio/for-next

mmu_notifier_range_init(&range, MMU_NOTIFY_CLEAR, 0, vma, vma->vm_mm,
address & HPAGE_PMD_MASK,
@@@ -2155,14 -2134,42 +2157,49 @@@
goto out;
}

-repeat:
if (pmd_trans_huge(*pmd)) {
++<<<<<<< HEAD
+ if (!page)
+ page = pmd_page(*pmd);
++=======
+ if (!folio) {
+ folio = page_folio(pmd_page(*pmd));
+ /*
+ * An anonymous page must be locked, to ensure that a
+ * concurrent reuse_swap_page() sees stable mapcount;
+ * but reuse_swap_page() is not used on shmem or file,
+ * and page lock must not be taken when zap_pmd_range()
+ * calls __split_huge_pmd() while i_mmap_lock is held.
+ */
+ if (folio_test_anon(folio)) {
+ if (unlikely(!folio_trylock(folio))) {
+ folio_get(folio);
+ _pmd = *pmd;
+ spin_unlock(ptl);
+ folio_lock(folio);
+ spin_lock(ptl);
+ if (unlikely(!pmd_same(*pmd, _pmd))) {
+ folio_unlock(folio);
+ folio_put(folio);
+ folio = NULL;
+ goto repeat;
+ }
+ folio_put(folio);
+ }
+ do_unlock_folio = true;
+ }
+ }
++>>>>>>> folio/for-next
} else if (!(pmd_devmap(*pmd) || is_pmd_migration_entry(*pmd)))
goto out;
__split_huge_pmd_locked(vma, pmd, range.start, freeze);
out:
spin_unlock(ptl);
++<<<<<<< HEAD
++=======
+ if (do_unlock_folio)
+ folio_unlock(folio);
++>>>>>>> folio/for-next
/*
* No need to double call mmu_notifier->invalidate_range() callback.
* They are 3 cases to consider inside __split_huge_pmd_locked():
@@@ -2455,28 -2464,52 +2494,77 @@@ static void __split_huge_page(struct pa
}
}

++<<<<<<< HEAD
+int total_mapcount(struct page *page)
+{
+ int i, compound, nr, ret;
+
+ VM_BUG_ON_PAGE(PageTail(page), page);
+
+ if (likely(!PageCompound(page)))
+ return atomic_read(&page->_mapcount) + 1;
+
+ compound = compound_mapcount(page);
+ nr = compound_nr(page);
+ if (PageHuge(page))
+ return compound;
+ ret = compound;
+ for (i = 0; i < nr; i++)
+ ret += atomic_read(&page[i]._mapcount) + 1;
+ /* File pages has compound_mapcount included in _mapcount */
+ if (!PageAnon(page))
+ return ret - compound * nr;
+ if (PageDoubleMap(page))
+ ret -= nr;
+ return ret;
++=======
+ /*
+ * This calculates accurately how many mappings a transparent hugepage
+ * has (unlike page_mapcount() which isn't fully accurate). This full
+ * accuracy is primarily needed to know if copy-on-write faults can
+ * reuse the page and change the mapping to read-write instead of
+ * copying them. At the same time this returns the total_mapcount too.
+ *
+ * The function returns the highest mapcount any one of the subpages
+ * has. If the return value is one, even if different processes are
+ * mapping different subpages of the transparent hugepage, they can
+ * all reuse it, because each process is reusing a different subpage.
+ *
+ * The total_mapcount is instead counting all virtual mappings of the
+ * subpages. If the total_mapcount is equal to "one", it tells the
+ * caller all mappings belong to the same "mm" and in turn the
+ * anon_vma of the transparent hugepage can become the vma->anon_vma
+ * local one as no other process may be mapping any of the subpages.
+ *
+ * It would be more accurate to replace page_mapcount() with
+ * page_trans_huge_mapcount(), however we only use
+ * page_trans_huge_mapcount() in the copy-on-write faults where we
+ * need full accuracy to avoid breaking page pinning, because
+ * page_trans_huge_mapcount() is slower than page_mapcount().
+ */
+ int page_trans_huge_mapcount(struct page *page)
+ {
+ int i, ret;
+
+ /* hugetlbfs shouldn't call it */
+ VM_BUG_ON_PAGE(PageHuge(page), page);
+
+ if (likely(!PageTransCompound(page)))
+ return atomic_read(&page->_mapcount) + 1;
+
+ page = compound_head(page);
+
+ ret = 0;
+ for (i = 0; i < thp_nr_pages(page); i++) {
+ int mapcount = atomic_read(&page[i]._mapcount) + 1;
+ ret = max(ret, mapcount);
+ }
+
+ if (PageDoubleMap(page))
+ ret -= 1;
+
+ return ret + compound_mapcount(page);
++>>>>>>> folio/for-next
}

/* Racy check whether the huge page can be split */
@@@ -3116,9 -3151,6 +3206,12 @@@ void remove_migration_pmd(struct page_v
else
page_add_file_rmap(new, vma, true);
set_pmd_at(mm, mmun_start, pvmw->pmd, pmde);
++<<<<<<< HEAD
+
+ /* No need to invalidate - it was non-present before */
++=======
++>>>>>>> folio/for-next
update_mmu_cache_pmd(vma, address, pvmw->pmd);
+ trace_remove_migration_pmd(address, pmd_val(pmde));
}
#endif
diff --cc mm/internal.h
index 7ed98955c8f4,f0e4dfac0264..000000000000
--- a/mm/internal.h
+++ b/mm/internal.h
@@@ -409,15 -417,22 +417,33 @@@ extern int mlock_future_check(struct mm
* pte mappings of THPs, which cannot be consistently counted: a pte
* mapping of the THP head cannot be distinguished by the page alone.
*/
++<<<<<<< HEAD
+void mlock_page(struct page *page);
+static inline void mlock_vma_page(struct page *page,
++=======
+ void mlock_folio(struct folio *folio);
+ static inline void mlock_vma_folio(struct folio *folio,
++>>>>>>> folio/for-next
struct vm_area_struct *vma, bool compound)
{
/* VM_IO check prevents migration from double-counting during mlock */
if (unlikely((vma->vm_flags & (VM_LOCKED|VM_IO)) == VM_LOCKED) &&
++<<<<<<< HEAD
+ (compound || !PageTransCompound(page)))
+ mlock_page(page);
+}
++=======
+ (compound || !folio_test_large(folio)))
+ mlock_folio(folio);
+ }
+
+ static inline void mlock_vma_page(struct page *page,
+ struct vm_area_struct *vma, bool compound)
+ {
+ mlock_vma_folio(page_folio(page), vma, compound);
+ }
+
++>>>>>>> folio/for-next
void munlock_page(struct page *page);
static inline void munlock_vma_page(struct page *page,
struct vm_area_struct *vma, bool compound)
@@@ -717,9 -745,9 +745,16 @@@ void vunmap_range_noflush(unsigned lon
int numa_migrate_prep(struct page *page, struct vm_area_struct *vma,
unsigned long addr, int page_nid, int *flags);

++<<<<<<< HEAD
+DECLARE_PER_CPU(struct per_cpu_nodestat, boot_nodestats);
+
+void free_zone_device_page(struct page *page);
+struct page *migrate_device_page(struct page *page, unsigned int gup_flags);
++=======
+ /*
+ * mm/gup.c
+ */
+ struct folio *try_grab_folio(struct page *page, int refs, unsigned int flags);
++>>>>>>> folio/for-next

#endif /* __MM_INTERNAL_H */
diff --cc mm/khugepaged.c
index 7d45d463acf5,000825a6e086..000000000000
--- a/mm/khugepaged.c
+++ b/mm/khugepaged.c
@@@ -1823,7 -1835,7 +1824,11 @@@ static void collapse_file(struct mm_str
}

if (page_mapped(page))
++<<<<<<< HEAD
+ try_to_unmap(page, TTU_IGNORE_MLOCK | TTU_BATCH_FLUSH);
++=======
+ try_to_unmap(folio, TTU_IGNORE_MLOCK | TTU_BATCH_FLUSH);
++>>>>>>> folio/for-next

xas_lock_irq(&xas);
xas_set(&xas, index);
diff --cc mm/madvise.c
index ede6affa1350,ae35d72627ef..000000000000
--- a/mm/madvise.c
+++ b/mm/madvise.c
@@@ -554,14 -530,9 +554,20 @@@ static void madvise_cold_page_range(str
tlb_end_vma(tlb, vma);
}

++<<<<<<< HEAD
+static inline bool can_madv_lru_non_huge_vma(struct vm_area_struct *vma)
+{
+ return !(vma->vm_flags & (VM_LOCKED|VM_PFNMAP));
+}
+
+static inline bool can_madv_lru_vma(struct vm_area_struct *vma)
+{
+ return can_madv_lru_non_huge_vma(vma) && !is_vm_hugetlb_page(vma);
++=======
+ static inline bool can_madv_lru_vma(struct vm_area_struct *vma)
+ {
+ return !(vma->vm_flags & (VM_LOCKED|VM_HUGETLB|VM_PFNMAP));
++>>>>>>> folio/for-next
}

static long madvise_cold(struct vm_area_struct *vma,
diff --cc mm/memory-failure.c
index 3e404b06efdc,aa8236848949..000000000000
--- a/mm/memory-failure.c
+++ b/mm/memory-failure.c
@@@ -1405,22 -1413,26 +1407,45 @@@ static bool hwpoison_user_mappings(stru
if (kill)
collect_procs(hpage, &tokill, flags & MF_ACTION_REQUIRED);

++<<<<<<< HEAD
+ if (PageHuge(hpage) && !PageAnon(hpage)) {
+ /*
+ * For hugetlb pages in shared mappings, try_to_unmap
+ * could potentially call huge_pmd_unshare. Because of
+ * this, take semaphore in write mode here and set
+ * TTU_RMAP_LOCKED to indicate we have taken the lock
+ * at this higher level.
+ */
+ mapping = hugetlb_page_mapping_lock_write(hpage);
+ if (mapping) {
+ try_to_unmap(hpage, ttu|TTU_RMAP_LOCKED);
+ i_mmap_unlock_write(mapping);
+ } else
+ pr_info("Memory failure: %#lx: could not lock mapping for mapped huge page\n", pfn);
+ } else {
+ try_to_unmap(hpage, ttu);
++=======
+ if (!PageHuge(hpage)) {
+ try_to_unmap(folio, ttu);
+ } else {
+ if (!PageAnon(hpage)) {
+ /*
+ * For hugetlb pages in shared mappings, try_to_unmap
+ * could potentially call huge_pmd_unshare. Because of
+ * this, take semaphore in write mode here and set
+ * TTU_RMAP_LOCKED to indicate we have taken the lock
+ * at this higher level.
+ */
+ mapping = hugetlb_page_mapping_lock_write(hpage);
+ if (mapping) {
+ try_to_unmap(folio, ttu|TTU_RMAP_LOCKED);
+ i_mmap_unlock_write(mapping);
+ } else
+ pr_info("Memory failure: %#lx: could not lock mapping for mapped huge page\n", pfn);
+ } else {
+ try_to_unmap(folio, ttu);
+ }
++>>>>>>> folio/for-next
}

unmap_success = !page_mapped(hpage);
diff --cc mm/migrate.c
index 88b59f9f8d29,8a2f0a64f703..000000000000
--- a/mm/migrate.c
+++ b/mm/migrate.c
@@@ -251,9 -248,6 +246,12 @@@ static bool remove_migration_pte(struc
}
if (vma->vm_flags & VM_LOCKED)
mlock_page_drain(smp_processor_id());
++<<<<<<< HEAD
+
+ trace_remove_migration_pte(pvmw.address, pte_val(pte),
+ compound_order(new));
++=======
++>>>>>>> folio/for-next

/* No need to invalidate - it was non-present before */
update_mmu_cache(vma, pvmw.address, pvmw.pte);
@@@ -2157,6 -2135,768 +2160,771 @@@ out
#endif /* CONFIG_NUMA_BALANCING */
#endif /* CONFIG_NUMA */

++<<<<<<< HEAD
++=======
+ #ifdef CONFIG_DEVICE_PRIVATE
+ static int migrate_vma_collect_skip(unsigned long start,
+ unsigned long end,
+ struct mm_walk *walk)
+ {
+ struct migrate_vma *migrate = walk->private;
+ unsigned long addr;
+
+ for (addr = start; addr < end; addr += PAGE_SIZE) {
+ migrate->dst[migrate->npages] = 0;
+ migrate->src[migrate->npages++] = 0;
+ }
+
+ return 0;
+ }
+
+ static int migrate_vma_collect_hole(unsigned long start,
+ unsigned long end,
+ __always_unused int depth,
+ struct mm_walk *walk)
+ {
+ struct migrate_vma *migrate = walk->private;
+ unsigned long addr;
+
+ /* Only allow populating anonymous memory. */
+ if (!vma_is_anonymous(walk->vma))
+ return migrate_vma_collect_skip(start, end, walk);
+
+ for (addr = start; addr < end; addr += PAGE_SIZE) {
+ migrate->src[migrate->npages] = MIGRATE_PFN_MIGRATE;
+ migrate->dst[migrate->npages] = 0;
+ migrate->npages++;
+ migrate->cpages++;
+ }
+
+ return 0;
+ }
+
+ static int migrate_vma_collect_pmd(pmd_t *pmdp,
+ unsigned long start,
+ unsigned long end,
+ struct mm_walk *walk)
+ {
+ struct migrate_vma *migrate = walk->private;
+ struct vm_area_struct *vma = walk->vma;
+ struct mm_struct *mm = vma->vm_mm;
+ unsigned long addr = start, unmapped = 0;
+ spinlock_t *ptl;
+ pte_t *ptep;
+
+ again:
+ if (pmd_none(*pmdp))
+ return migrate_vma_collect_hole(start, end, -1, walk);
+
+ if (pmd_trans_huge(*pmdp)) {
+ struct page *page;
+
+ ptl = pmd_lock(mm, pmdp);
+ if (unlikely(!pmd_trans_huge(*pmdp))) {
+ spin_unlock(ptl);
+ goto again;
+ }
+
+ page = pmd_page(*pmdp);
+ if (is_huge_zero_page(page)) {
+ spin_unlock(ptl);
+ split_huge_pmd(vma, pmdp, addr);
+ if (pmd_trans_unstable(pmdp))
+ return migrate_vma_collect_skip(start, end,
+ walk);
+ } else {
+ int ret;
+
+ get_page(page);
+ spin_unlock(ptl);
+ if (unlikely(!trylock_page(page)))
+ return migrate_vma_collect_skip(start, end,
+ walk);
+ ret = split_huge_page(page);
+ unlock_page(page);
+ put_page(page);
+ if (ret)
+ return migrate_vma_collect_skip(start, end,
+ walk);
+ if (pmd_none(*pmdp))
+ return migrate_vma_collect_hole(start, end, -1,
+ walk);
+ }
+ }
+
+ if (unlikely(pmd_bad(*pmdp)))
+ return migrate_vma_collect_skip(start, end, walk);
+
+ ptep = pte_offset_map_lock(mm, pmdp, addr, &ptl);
+ arch_enter_lazy_mmu_mode();
+
+ for (; addr < end; addr += PAGE_SIZE, ptep++) {
+ unsigned long mpfn = 0, pfn;
+ struct page *page;
+ swp_entry_t entry;
+ pte_t pte;
+
+ pte = *ptep;
+
+ if (pte_none(pte)) {
+ if (vma_is_anonymous(vma)) {
+ mpfn = MIGRATE_PFN_MIGRATE;
+ migrate->cpages++;
+ }
+ goto next;
+ }
+
+ if (!pte_present(pte)) {
+ /*
+ * Only care about unaddressable device page special
+ * page table entry. Other special swap entries are not
+ * migratable, and we ignore regular swapped page.
+ */
+ entry = pte_to_swp_entry(pte);
+ if (!is_device_private_entry(entry))
+ goto next;
+
+ page = pfn_swap_entry_to_page(entry);
+ if (!(migrate->flags &
+ MIGRATE_VMA_SELECT_DEVICE_PRIVATE) ||
+ page->pgmap->owner != migrate->pgmap_owner)
+ goto next;
+
+ mpfn = migrate_pfn(page_to_pfn(page)) |
+ MIGRATE_PFN_MIGRATE;
+ if (is_writable_device_private_entry(entry))
+ mpfn |= MIGRATE_PFN_WRITE;
+ } else {
+ if (!(migrate->flags & MIGRATE_VMA_SELECT_SYSTEM))
+ goto next;
+ pfn = pte_pfn(pte);
+ if (is_zero_pfn(pfn)) {
+ mpfn = MIGRATE_PFN_MIGRATE;
+ migrate->cpages++;
+ goto next;
+ }
+ page = vm_normal_page(migrate->vma, addr, pte);
+ mpfn = migrate_pfn(pfn) | MIGRATE_PFN_MIGRATE;
+ mpfn |= pte_write(pte) ? MIGRATE_PFN_WRITE : 0;
+ }
+
+ /* FIXME support THP */
+ if (!page || !page->mapping || PageTransCompound(page)) {
+ mpfn = 0;
+ goto next;
+ }
+
+ /*
+ * By getting a reference on the page we pin it and that blocks
+ * any kind of migration. Side effect is that it "freezes" the
+ * pte.
+ *
+ * We drop this reference after isolating the page from the lru
+ * for non device page (device page are not on the lru and thus
+ * can't be dropped from it).
+ */
+ get_page(page);
+
+ /*
+ * Optimize for the common case where page is only mapped once
+ * in one process. If we can lock the page, then we can safely
+ * set up a special migration page table entry now.
+ */
+ if (trylock_page(page)) {
+ pte_t swp_pte;
+
+ migrate->cpages++;
+ ptep_get_and_clear(mm, addr, ptep);
+
+ /* Setup special migration page table entry */
+ if (mpfn & MIGRATE_PFN_WRITE)
+ entry = make_writable_migration_entry(
+ page_to_pfn(page));
+ else
+ entry = make_readable_migration_entry(
+ page_to_pfn(page));
+ swp_pte = swp_entry_to_pte(entry);
+ if (pte_present(pte)) {
+ if (pte_soft_dirty(pte))
+ swp_pte = pte_swp_mksoft_dirty(swp_pte);
+ if (pte_uffd_wp(pte))
+ swp_pte = pte_swp_mkuffd_wp(swp_pte);
+ } else {
+ if (pte_swp_soft_dirty(pte))
+ swp_pte = pte_swp_mksoft_dirty(swp_pte);
+ if (pte_swp_uffd_wp(pte))
+ swp_pte = pte_swp_mkuffd_wp(swp_pte);
+ }
+ set_pte_at(mm, addr, ptep, swp_pte);
+
+ /*
+ * This is like regular unmap: we remove the rmap and
+ * drop page refcount. Page won't be freed, as we took
+ * a reference just above.
+ */
+ page_remove_rmap(page, vma, false);
+ put_page(page);
+
+ if (pte_present(pte))
+ unmapped++;
+ } else {
+ put_page(page);
+ mpfn = 0;
+ }
+
+ next:
+ migrate->dst[migrate->npages] = 0;
+ migrate->src[migrate->npages++] = mpfn;
+ }
+ arch_leave_lazy_mmu_mode();
+ pte_unmap_unlock(ptep - 1, ptl);
+
+ /* Only flush the TLB if we actually modified any entries */
+ if (unmapped)
+ flush_tlb_range(walk->vma, start, end);
+
+ return 0;
+ }
+
+ static const struct mm_walk_ops migrate_vma_walk_ops = {
+ .pmd_entry = migrate_vma_collect_pmd,
+ .pte_hole = migrate_vma_collect_hole,
+ };
+
+ /*
+ * migrate_vma_collect() - collect pages over a range of virtual addresses
+ * @migrate: migrate struct containing all migration information
+ *
+ * This will walk the CPU page table. For each virtual address backed by a
+ * valid page, it updates the src array and takes a reference on the page, in
+ * order to pin the page until we lock it and unmap it.
+ */
+ static void migrate_vma_collect(struct migrate_vma *migrate)
+ {
+ struct mmu_notifier_range range;
+
+ /*
+ * Note that the pgmap_owner is passed to the mmu notifier callback so
+ * that the registered device driver can skip invalidating device
+ * private page mappings that won't be migrated.
+ */
+ mmu_notifier_range_init_owner(&range, MMU_NOTIFY_MIGRATE, 0,
+ migrate->vma, migrate->vma->vm_mm, migrate->start, migrate->end,
+ migrate->pgmap_owner);
+ mmu_notifier_invalidate_range_start(&range);
+
+ walk_page_range(migrate->vma->vm_mm, migrate->start, migrate->end,
+ &migrate_vma_walk_ops, migrate);
+
+ mmu_notifier_invalidate_range_end(&range);
+ migrate->end = migrate->start + (migrate->npages << PAGE_SHIFT);
+ }
+
+ /*
+ * migrate_vma_check_page() - check if page is pinned or not
+ * @page: struct page to check
+ *
+ * Pinned pages cannot be migrated. This is the same test as in
+ * folio_migrate_mapping(), except that here we allow migration of a
+ * ZONE_DEVICE page.
+ */
+ static bool migrate_vma_check_page(struct page *page)
+ {
+ /*
+ * One extra ref because caller holds an extra reference, either from
+ * isolate_lru_page() for a regular page, or migrate_vma_collect() for
+ * a device page.
+ */
+ int extra = 1;
+
+ /*
+ * FIXME support THP (transparent huge page), it is bit more complex to
+ * check them than regular pages, because they can be mapped with a pmd
+ * or with a pte (split pte mapping).
+ */
+ if (PageCompound(page))
+ return false;
+
+ /* Page from ZONE_DEVICE have one extra reference */
+ if (is_zone_device_page(page))
+ extra++;
+
+ /* For file back page */
+ if (page_mapping(page))
+ extra += 1 + page_has_private(page);
+
+ if ((page_count(page) - extra) > page_mapcount(page))
+ return false;
+
+ return true;
+ }
+
+ /*
+ * migrate_vma_unmap() - replace page mapping with special migration pte entry
+ * @migrate: migrate struct containing all migration information
+ *
+ * Isolate pages from the LRU and replace mappings (CPU page table pte) with a
+ * special migration pte entry and check if it has been pinned. Pinned pages are
+ * restored because we cannot migrate them.
+ *
+ * This is the last step before we call the device driver callback to allocate
+ * destination memory and copy contents of original page over to new page.
+ */
+ static void migrate_vma_unmap(struct migrate_vma *migrate)
+ {
+ const unsigned long npages = migrate->npages;
+ unsigned long i, restore = 0;
+ bool allow_drain = true;
+
+ lru_add_drain();
+
+ for (i = 0; i < npages; i++) {
+ struct page *page = migrate_pfn_to_page(migrate->src[i]);
+ struct folio *folio;
+
+ if (!page)
+ continue;
+
+ /* ZONE_DEVICE pages are not on LRU */
+ if (!is_zone_device_page(page)) {
+ if (!PageLRU(page) && allow_drain) {
+ /* Drain CPU's pagevec */
+ lru_add_drain_all();
+ allow_drain = false;
+ }
+
+ if (isolate_lru_page(page)) {
+ migrate->src[i] &= ~MIGRATE_PFN_MIGRATE;
+ migrate->cpages--;
+ restore++;
+ continue;
+ }
+
+ /* Drop the reference we took in collect */
+ put_page(page);
+ }
+
+ folio = page_folio(page);
+ if (folio_mapped(folio))
+ try_to_migrate(folio, 0);
+
+ if (page_mapped(page) || !migrate_vma_check_page(page)) {
+ if (!is_zone_device_page(page)) {
+ get_page(page);
+ putback_lru_page(page);
+ }
+
+ migrate->src[i] &= ~MIGRATE_PFN_MIGRATE;
+ migrate->cpages--;
+ restore++;
+ continue;
+ }
+ }
+
+ for (i = 0; i < npages && restore; i++) {
+ struct page *page = migrate_pfn_to_page(migrate->src[i]);
+ struct folio *folio;
+
+ if (!page || (migrate->src[i] & MIGRATE_PFN_MIGRATE))
+ continue;
+
+ folio = page_folio(page);
+ remove_migration_ptes(folio, folio, false);
+
+ migrate->src[i] = 0;
+ folio_unlock(folio);
+ folio_put(folio);
+ restore--;
+ }
+ }
+
+ /**
+ * migrate_vma_setup() - prepare to migrate a range of memory
+ * @args: contains the vma, start, and pfns arrays for the migration
+ *
+ * Returns: negative errno on failures, 0 when 0 or more pages were migrated
+ * without an error.
+ *
+ * Prepare to migrate a range of memory virtual address range by collecting all
+ * the pages backing each virtual address in the range, saving them inside the
+ * src array. Then lock those pages and unmap them. Once the pages are locked
+ * and unmapped, check whether each page is pinned or not. Pages that aren't
+ * pinned have the MIGRATE_PFN_MIGRATE flag set (by this function) in the
+ * corresponding src array entry. Then restores any pages that are pinned, by
+ * remapping and unlocking those pages.
+ *
+ * The caller should then allocate destination memory and copy source memory to
+ * it for all those entries (ie with MIGRATE_PFN_VALID and MIGRATE_PFN_MIGRATE
+ * flag set). Once these are allocated and copied, the caller must update each
+ * corresponding entry in the dst array with the pfn value of the destination
+ * page and with MIGRATE_PFN_VALID. Destination pages must be locked via
+ * lock_page().
+ *
+ * Note that the caller does not have to migrate all the pages that are marked
+ * with MIGRATE_PFN_MIGRATE flag in src array unless this is a migration from
+ * device memory to system memory. If the caller cannot migrate a device page
+ * back to system memory, then it must return VM_FAULT_SIGBUS, which has severe
+ * consequences for the userspace process, so it must be avoided if at all
+ * possible.
+ *
+ * For empty entries inside CPU page table (pte_none() or pmd_none() is true) we
+ * do set MIGRATE_PFN_MIGRATE flag inside the corresponding source array thus
+ * allowing the caller to allocate device memory for those unbacked virtual
+ * addresses. For this the caller simply has to allocate device memory and
+ * properly set the destination entry like for regular migration. Note that
+ * this can still fail, and thus inside the device driver you must check if the
+ * migration was successful for those entries after calling migrate_vma_pages(),
+ * just like for regular migration.
+ *
+ * After that, the callers must call migrate_vma_pages() to go over each entry
+ * in the src array that has the MIGRATE_PFN_VALID and MIGRATE_PFN_MIGRATE flag
+ * set. If the corresponding entry in dst array has MIGRATE_PFN_VALID flag set,
+ * then migrate_vma_pages() to migrate struct page information from the source
+ * struct page to the destination struct page. If it fails to migrate the
+ * struct page information, then it clears the MIGRATE_PFN_MIGRATE flag in the
+ * src array.
+ *
+ * At this point all successfully migrated pages have an entry in the src
+ * array with MIGRATE_PFN_VALID and MIGRATE_PFN_MIGRATE flag set and the dst
+ * array entry with MIGRATE_PFN_VALID flag set.
+ *
+ * Once migrate_vma_pages() returns the caller may inspect which pages were
+ * successfully migrated, and which were not. Successfully migrated pages will
+ * have the MIGRATE_PFN_MIGRATE flag set for their src array entry.
+ *
+ * It is safe to update device page table after migrate_vma_pages() because
+ * both destination and source page are still locked, and the mmap_lock is held
+ * in read mode (hence no one can unmap the range being migrated).
+ *
+ * Once the caller is done cleaning up things and updating its page table (if it
+ * chose to do so, this is not an obligation) it finally calls
+ * migrate_vma_finalize() to update the CPU page table to point to new pages
+ * for successfully migrated pages or otherwise restore the CPU page table to
+ * point to the original source pages.
+ */
+ int migrate_vma_setup(struct migrate_vma *args)
+ {
+ long nr_pages = (args->end - args->start) >> PAGE_SHIFT;
+
+ args->start &= PAGE_MASK;
+ args->end &= PAGE_MASK;
+ if (!args->vma || is_vm_hugetlb_page(args->vma) ||
+ (args->vma->vm_flags & VM_SPECIAL) || vma_is_dax(args->vma))
+ return -EINVAL;
+ if (nr_pages <= 0)
+ return -EINVAL;
+ if (args->start < args->vma->vm_start ||
+ args->start >= args->vma->vm_end)
+ return -EINVAL;
+ if (args->end <= args->vma->vm_start || args->end > args->vma->vm_end)
+ return -EINVAL;
+ if (!args->src || !args->dst)
+ return -EINVAL;
+
+ memset(args->src, 0, sizeof(*args->src) * nr_pages);
+ args->cpages = 0;
+ args->npages = 0;
+
+ migrate_vma_collect(args);
+
+ if (args->cpages)
+ migrate_vma_unmap(args);
+
+ /*
+ * At this point pages are locked and unmapped, and thus they have
+ * stable content and can safely be copied to destination memory that
+ * is allocated by the drivers.
+ */
+ return 0;
+
+ }
+ EXPORT_SYMBOL(migrate_vma_setup);
+
+ /*
+ * This code closely matches the code in:
+ * __handle_mm_fault()
+ * handle_pte_fault()
+ * do_anonymous_page()
+ * to map in an anonymous zero page but the struct page will be a ZONE_DEVICE
+ * private page.
+ */
+ static void migrate_vma_insert_page(struct migrate_vma *migrate,
+ unsigned long addr,
+ struct page *page,
+ unsigned long *src)
+ {
+ struct vm_area_struct *vma = migrate->vma;
+ struct mm_struct *mm = vma->vm_mm;
+ bool flush = false;
+ spinlock_t *ptl;
+ pte_t entry;
+ pgd_t *pgdp;
+ p4d_t *p4dp;
+ pud_t *pudp;
+ pmd_t *pmdp;
+ pte_t *ptep;
+
+ /* Only allow populating anonymous memory */
+ if (!vma_is_anonymous(vma))
+ goto abort;
+
+ pgdp = pgd_offset(mm, addr);
+ p4dp = p4d_alloc(mm, pgdp, addr);
+ if (!p4dp)
+ goto abort;
+ pudp = pud_alloc(mm, p4dp, addr);
+ if (!pudp)
+ goto abort;
+ pmdp = pmd_alloc(mm, pudp, addr);
+ if (!pmdp)
+ goto abort;
+
+ if (pmd_trans_huge(*pmdp) || pmd_devmap(*pmdp))
+ goto abort;
+
+ /*
+ * Use pte_alloc() instead of pte_alloc_map(). We can't run
+ * pte_offset_map() on pmds where a huge pmd might be created
+ * from a different thread.
+ *
+ * pte_alloc_map() is safe to use under mmap_write_lock(mm) or when
+ * parallel threads are excluded by other means.
+ *
+ * Here we only have mmap_read_lock(mm).
+ */
+ if (pte_alloc(mm, pmdp))
+ goto abort;
+
+ /* See the comment in pte_alloc_one_map() */
+ if (unlikely(pmd_trans_unstable(pmdp)))
+ goto abort;
+
+ if (unlikely(anon_vma_prepare(vma)))
+ goto abort;
+ if (mem_cgroup_charge(page_folio(page), vma->vm_mm, GFP_KERNEL))
+ goto abort;
+
+ /*
+ * The memory barrier inside __SetPageUptodate makes sure that
+ * preceding stores to the page contents become visible before
+ * the set_pte_at() write.
+ */
+ __SetPageUptodate(page);
+
+ if (is_zone_device_page(page)) {
+ if (is_device_private_page(page)) {
+ swp_entry_t swp_entry;
+
+ if (vma->vm_flags & VM_WRITE)
+ swp_entry = make_writable_device_private_entry(
+ page_to_pfn(page));
+ else
+ swp_entry = make_readable_device_private_entry(
+ page_to_pfn(page));
+ entry = swp_entry_to_pte(swp_entry);
+ } else {
+ /*
+ * For now we only support migrating to un-addressable
+ * device memory.
+ */
+ pr_warn_once("Unsupported ZONE_DEVICE page type.\n");
+ goto abort;
+ }
+ } else {
+ entry = mk_pte(page, vma->vm_page_prot);
+ if (vma->vm_flags & VM_WRITE)
+ entry = pte_mkwrite(pte_mkdirty(entry));
+ }
+
+ ptep = pte_offset_map_lock(mm, pmdp, addr, &ptl);
+
+ if (check_stable_address_space(mm))
+ goto unlock_abort;
+
+ if (pte_present(*ptep)) {
+ unsigned long pfn = pte_pfn(*ptep);
+
+ if (!is_zero_pfn(pfn))
+ goto unlock_abort;
+ flush = true;
+ } else if (!pte_none(*ptep))
+ goto unlock_abort;
+
+ /*
+ * Check for userfaultfd but do not deliver the fault. Instead,
+ * just back off.
+ */
+ if (userfaultfd_missing(vma))
+ goto unlock_abort;
+
+ inc_mm_counter(mm, MM_ANONPAGES);
+ page_add_new_anon_rmap(page, vma, addr, false);
+ if (!is_zone_device_page(page))
+ lru_cache_add_inactive_or_unevictable(page, vma);
+ get_page(page);
+
+ if (flush) {
+ flush_cache_page(vma, addr, pte_pfn(*ptep));
+ ptep_clear_flush_notify(vma, addr, ptep);
+ set_pte_at_notify(mm, addr, ptep, entry);
+ update_mmu_cache(vma, addr, ptep);
+ } else {
+ /* No need to invalidate - it was non-present before */
+ set_pte_at(mm, addr, ptep, entry);
+ update_mmu_cache(vma, addr, ptep);
+ }
+
+ pte_unmap_unlock(ptep, ptl);
+ *src = MIGRATE_PFN_MIGRATE;
+ return;
+
+ unlock_abort:
+ pte_unmap_unlock(ptep, ptl);
+ abort:
+ *src &= ~MIGRATE_PFN_MIGRATE;
+ }
+
+ /**
+ * migrate_vma_pages() - migrate meta-data from src page to dst page
+ * @migrate: migrate struct containing all migration information
+ *
+ * This migrates struct page meta-data from source struct page to destination
+ * struct page. This effectively finishes the migration from source page to the
+ * destination page.
+ */
+ void migrate_vma_pages(struct migrate_vma *migrate)
+ {
+ const unsigned long npages = migrate->npages;
+ const unsigned long start = migrate->start;
+ struct mmu_notifier_range range;
+ unsigned long addr, i;
+ bool notified = false;
+
+ for (i = 0, addr = start; i < npages; addr += PAGE_SIZE, i++) {
+ struct page *newpage = migrate_pfn_to_page(migrate->dst[i]);
+ struct page *page = migrate_pfn_to_page(migrate->src[i]);
+ struct address_space *mapping;
+ int r;
+
+ if (!newpage) {
+ migrate->src[i] &= ~MIGRATE_PFN_MIGRATE;
+ continue;
+ }
+
+ if (!page) {
+ if (!(migrate->src[i] & MIGRATE_PFN_MIGRATE))
+ continue;
+ if (!notified) {
+ notified = true;
+
+ mmu_notifier_range_init_owner(&range,
+ MMU_NOTIFY_MIGRATE, 0, migrate->vma,
+ migrate->vma->vm_mm, addr, migrate->end,
+ migrate->pgmap_owner);
+ mmu_notifier_invalidate_range_start(&range);
+ }
+ migrate_vma_insert_page(migrate, addr, newpage,
+ &migrate->src[i]);
+ continue;
+ }
+
+ mapping = page_mapping(page);
+
+ if (is_zone_device_page(newpage)) {
+ if (is_device_private_page(newpage)) {
+ /*
+ * For now only support private anonymous when
+ * migrating to un-addressable device memory.
+ */
+ if (mapping) {
+ migrate->src[i] &= ~MIGRATE_PFN_MIGRATE;
+ continue;
+ }
+ } else {
+ /*
+ * Other types of ZONE_DEVICE page are not
+ * supported.
+ */
+ migrate->src[i] &= ~MIGRATE_PFN_MIGRATE;
+ continue;
+ }
+ }
+
+ r = migrate_page(mapping, newpage, page, MIGRATE_SYNC_NO_COPY);
+ if (r != MIGRATEPAGE_SUCCESS)
+ migrate->src[i] &= ~MIGRATE_PFN_MIGRATE;
+ }
+
+ /*
+ * No need to double call mmu_notifier->invalidate_range() callback as
+ * the above ptep_clear_flush_notify() inside migrate_vma_insert_page()
+ * did already call it.
+ */
+ if (notified)
+ mmu_notifier_invalidate_range_only_end(&range);
+ }
+ EXPORT_SYMBOL(migrate_vma_pages);
+
+ /**
+ * migrate_vma_finalize() - restore CPU page table entry
+ * @migrate: migrate struct containing all migration information
+ *
+ * This replaces the special migration pte entry with either a mapping to the
+ * new page if migration was successful for that page, or to the original page
+ * otherwise.
+ *
+ * This also unlocks the pages and puts them back on the lru, or drops the extra
+ * refcount, for device pages.
+ */
+ void migrate_vma_finalize(struct migrate_vma *migrate)
+ {
+ const unsigned long npages = migrate->npages;
+ unsigned long i;
+
+ for (i = 0; i < npages; i++) {
+ struct folio *dst, *src;
+ struct page *newpage = migrate_pfn_to_page(migrate->dst[i]);
+ struct page *page = migrate_pfn_to_page(migrate->src[i]);
+
+ if (!page) {
+ if (newpage) {
+ unlock_page(newpage);
+ put_page(newpage);
+ }
+ continue;
+ }
+
+ if (!(migrate->src[i] & MIGRATE_PFN_MIGRATE) || !newpage) {
+ if (newpage) {
+ unlock_page(newpage);
+ put_page(newpage);
+ }
+ newpage = page;
+ }
+
+ src = page_folio(page);
+ dst = page_folio(newpage);
+ remove_migration_ptes(src, dst, false);
+ folio_unlock(src);
+
+ if (is_zone_device_page(page))
+ put_page(page);
+ else
+ putback_lru_page(page);
+
+ if (newpage != page) {
+ unlock_page(newpage);
+ if (is_zone_device_page(newpage))
+ put_page(newpage);
+ else
+ putback_lru_page(newpage);
+ }
+ }
+ }
+ EXPORT_SYMBOL(migrate_vma_finalize);
+ #endif /* CONFIG_DEVICE_PRIVATE */
+
++>>>>>>> folio/for-next
/*
* node_demotion[] example:
*
diff --cc mm/mlock.c
index d28e56529e5b,9858e733c29b..000000000000
--- a/mm/mlock.c
+++ b/mm/mlock.c
@@@ -75,183 -75,183 +75,374 @@@ static struct lruvec *__mlock_page(stru
}
goto out;
}
++<<<<<<< HEAD
+
+ if (PageUnevictable(page)) {
+ if (PageMlocked(page))
+ page->mlock_count++;
+ goto out;
+ }
+
+ del_page_from_lru_list(page, lruvec);
+ ClearPageActive(page);
+ SetPageUnevictable(page);
+ page->mlock_count = !!PageMlocked(page);
+ add_page_to_lru_list(page, lruvec);
+ __count_vm_events(UNEVICTABLE_PGCULLED, thp_nr_pages(page));
+out:
+ SetPageLRU(page);
+ return lruvec;
+}
+
+static struct lruvec *__mlock_new_page(struct page *page, struct lruvec *lruvec)
+{
+ VM_BUG_ON_PAGE(PageLRU(page), page);
+
+ lruvec = folio_lruvec_relock_irq(page_folio(page), lruvec);
+
+ /* As above, this is a little surprising, but possible */
+ if (unlikely(page_evictable(page)))
+ goto out;
+
+ SetPageUnevictable(page);
+ page->mlock_count = !!PageMlocked(page);
+ __count_vm_events(UNEVICTABLE_PGCULLED, thp_nr_pages(page));
+out:
+ add_page_to_lru_list(page, lruvec);
+ SetPageLRU(page);
+ return lruvec;
+}
+
+static struct lruvec *__munlock_page(struct page *page, struct lruvec *lruvec)
+{
+ int nr_pages = thp_nr_pages(page);
+ bool isolated = false;
+
+ if (!TestClearPageLRU(page))
+ goto munlock;
+
+ isolated = true;
+ lruvec = folio_lruvec_relock_irq(page_folio(page), lruvec);
+
+ if (PageUnevictable(page)) {
+ /* Then mlock_count is maintained, but might undercount */
+ if (page->mlock_count)
+ page->mlock_count--;
+ if (page->mlock_count)
+ goto out;
+ }
+ /* else assume that was the last mlock: reclaim will fix it if not */
+
+munlock:
+ if (TestClearPageMlocked(page)) {
+ __mod_zone_page_state(page_zone(page), NR_MLOCK, -nr_pages);
+ if (isolated || !PageUnevictable(page))
+ __count_vm_events(UNEVICTABLE_PGMUNLOCKED, nr_pages);
+ else
+ __count_vm_events(UNEVICTABLE_PGSTRANDED, nr_pages);
+ }
+
+ /* page_evictable() has to be checked *after* clearing Mlocked */
+ if (isolated && PageUnevictable(page) && page_evictable(page)) {
+ del_page_from_lru_list(page, lruvec);
+ ClearPageUnevictable(page);
+ add_page_to_lru_list(page, lruvec);
+ __count_vm_events(UNEVICTABLE_PGRESCUED, nr_pages);
+ }
+out:
+ if (isolated)
+ SetPageLRU(page);
+ return lruvec;
+}
+
+/*
+ * Flags held in the low bits of a struct page pointer on the mlock_pvec.
+ */
+#define LRU_PAGE 0x1
+#define NEW_PAGE 0x2
+static inline struct page *mlock_lru(struct page *page)
+{
+ return (struct page *)((unsigned long)page + LRU_PAGE);
+}
+
+static inline struct page *mlock_new(struct page *page)
+{
+ return (struct page *)((unsigned long)page + NEW_PAGE);
+}
+
+/*
+ * mlock_pagevec() is derived from pagevec_lru_move_fn():
+ * perhaps that can make use of such page pointer flags in future,
+ * but for now just keep it for mlock. We could use three separate
+ * pagevecs instead, but one feels better (munlocking a full pagevec
+ * does not need to drain mlocking pagevecs first).
+ */
+static void mlock_pagevec(struct pagevec *pvec)
+{
+ struct lruvec *lruvec = NULL;
+ unsigned long mlock;
+ struct page *page;
+ int i;
+
+ for (i = 0; i < pagevec_count(pvec); i++) {
+ page = pvec->pages[i];
+ mlock = (unsigned long)page & (LRU_PAGE | NEW_PAGE);
+ page = (struct page *)((unsigned long)page - mlock);
+ pvec->pages[i] = page;
+
+ if (mlock & LRU_PAGE)
+ lruvec = __mlock_page(page, lruvec);
+ else if (mlock & NEW_PAGE)
+ lruvec = __mlock_new_page(page, lruvec);
+ else
+ lruvec = __munlock_page(page, lruvec);
+ }
+
+ if (lruvec)
+ unlock_page_lruvec_irq(lruvec);
+ release_pages(pvec->pages, pvec->nr);
+ pagevec_reinit(pvec);
+}
+
+void mlock_page_drain(int cpu)
+{
+ struct pagevec *pvec;
+
+ pvec = &per_cpu(mlock_pvec, cpu);
+ if (pagevec_count(pvec))
+ mlock_pagevec(pvec);
+}
+
+bool need_mlock_page_drain(int cpu)
+{
+ return pagevec_count(&per_cpu(mlock_pvec, cpu));
+}
+
+/**
+ * mlock_page - mlock a page already on (or temporarily off) LRU
+ * @page: page to be mlocked, either a normal page or a THP head.
+ */
+void mlock_page(struct page *page)
+{
+ struct pagevec *pvec = &get_cpu_var(mlock_pvec);
+
+ if (!TestSetPageMlocked(page)) {
+ int nr_pages = thp_nr_pages(page);
+
+ mod_zone_page_state(page_zone(page), NR_MLOCK, nr_pages);
+ __count_vm_events(UNEVICTABLE_PGMLOCKED, nr_pages);
+ }
+
+ get_page(page);
+ if (!pagevec_add(pvec, mlock_lru(page)) ||
+ PageHead(page) || lru_cache_disabled())
+ mlock_pagevec(pvec);
+ put_cpu_var(mlock_pvec);
+}
+
+/**
+ * mlock_new_page - mlock a newly allocated page not yet on LRU
+ * @page: page to be mlocked, either a normal page or a THP head.
+ */
+void mlock_new_page(struct page *page)
+{
+ struct pagevec *pvec = &get_cpu_var(mlock_pvec);
+ int nr_pages = thp_nr_pages(page);
+
+ SetPageMlocked(page);
+ mod_zone_page_state(page_zone(page), NR_MLOCK, nr_pages);
+ __count_vm_events(UNEVICTABLE_PGMLOCKED, nr_pages);
+
++ get_page(page);
++ if (!pagevec_add(pvec, mlock_new(page)) ||
++ PageHead(page) || lru_cache_disabled())
++=======
+
+ if (PageUnevictable(page)) {
+ if (PageMlocked(page))
+ page->mlock_count++;
+ goto out;
+ }
+
+ del_page_from_lru_list(page, lruvec);
+ ClearPageActive(page);
+ SetPageUnevictable(page);
+ page->mlock_count = !!PageMlocked(page);
+ add_page_to_lru_list(page, lruvec);
+ __count_vm_events(UNEVICTABLE_PGCULLED, thp_nr_pages(page));
+ out:
+ SetPageLRU(page);
+ return lruvec;
+ }
+
+ static struct lruvec *__mlock_new_page(struct page *page, struct lruvec *lruvec)
+ {
+ VM_BUG_ON_PAGE(PageLRU(page), page);
+
+ lruvec = folio_lruvec_relock_irq(page_folio(page), lruvec);
+
+ /* As above, this is a little surprising, but possible */
+ if (unlikely(page_evictable(page)))
+ goto out;
+
+ SetPageUnevictable(page);
+ page->mlock_count = !!PageMlocked(page);
+ __count_vm_events(UNEVICTABLE_PGCULLED, thp_nr_pages(page));
+ out:
+ add_page_to_lru_list(page, lruvec);
+ SetPageLRU(page);
+ return lruvec;
+ }
+
+ static struct lruvec *__munlock_page(struct page *page, struct lruvec *lruvec)
+ {
+ int nr_pages = thp_nr_pages(page);
+ bool isolated = false;
+
+ if (!TestClearPageLRU(page))
+ goto munlock;
+
+ isolated = true;
+ lruvec = folio_lruvec_relock_irq(page_folio(page), lruvec);
+
+ if (PageUnevictable(page)) {
+ /* Then mlock_count is maintained, but might undercount */
+ if (page->mlock_count)
+ page->mlock_count--;
+ if (page->mlock_count)
+ goto out;
+ }
+ /* else assume that was the last mlock: reclaim will fix it if not */
+
+ munlock:
+ if (TestClearPageMlocked(page)) {
+ __mod_zone_page_state(page_zone(page), NR_MLOCK, -nr_pages);
+ if (isolated || !PageUnevictable(page))
+ __count_vm_events(UNEVICTABLE_PGMUNLOCKED, nr_pages);
+ else
+ __count_vm_events(UNEVICTABLE_PGSTRANDED, nr_pages);
+ }
+
+ /* page_evictable() has to be checked *after* clearing Mlocked */
+ if (isolated && PageUnevictable(page) && page_evictable(page)) {
+ del_page_from_lru_list(page, lruvec);
+ ClearPageUnevictable(page);
+ add_page_to_lru_list(page, lruvec);
+ __count_vm_events(UNEVICTABLE_PGRESCUED, nr_pages);
+ }
+ out:
+ if (isolated)
+ SetPageLRU(page);
+ return lruvec;
+ }
+
+ /*
+ * Flags held in the low bits of a struct page pointer on the mlock_pvec.
+ */
+ #define LRU_PAGE 0x1
+ #define NEW_PAGE 0x2
+ static inline struct page *mlock_lru(struct page *page)
+ {
+ return (struct page *)((unsigned long)page + LRU_PAGE);
+ }
+
+ static inline struct page *mlock_new(struct page *page)
+ {
+ return (struct page *)((unsigned long)page + NEW_PAGE);
+ }
+
+ /*
+ * mlock_pagevec() is derived from pagevec_lru_move_fn():
+ * perhaps that can make use of such page pointer flags in future,
+ * but for now just keep it for mlock. We could use three separate
+ * pagevecs instead, but one feels better (munlocking a full pagevec
+ * does not need to drain mlocking pagevecs first).
+ */
+ static void mlock_pagevec(struct pagevec *pvec)
+ {
+ struct lruvec *lruvec = NULL;
+ unsigned long mlock;
+ struct page *page;
+ int i;
+
+ for (i = 0; i < pagevec_count(pvec); i++) {
+ page = pvec->pages[i];
+ mlock = (unsigned long)page & (LRU_PAGE | NEW_PAGE);
+ page = (struct page *)((unsigned long)page - mlock);
+ pvec->pages[i] = page;
+
+ if (mlock & LRU_PAGE)
+ lruvec = __mlock_page(page, lruvec);
+ else if (mlock & NEW_PAGE)
+ lruvec = __mlock_new_page(page, lruvec);
+ else
+ lruvec = __munlock_page(page, lruvec);
+ }
+
+ if (lruvec)
+ unlock_page_lruvec_irq(lruvec);
+ release_pages(pvec->pages, pvec->nr);
+ pagevec_reinit(pvec);
+ }
+
+ void mlock_page_drain(int cpu)
+ {
+ struct pagevec *pvec;
+
+ pvec = &per_cpu(mlock_pvec, cpu);
+ if (pagevec_count(pvec))
+ mlock_pagevec(pvec);
+ }
+
+ bool need_mlock_page_drain(int cpu)
+ {
+ return pagevec_count(&per_cpu(mlock_pvec, cpu));
+ }
+
+ /**
+ * mlock_folio - mlock a folio already on (or temporarily off) LRU
+ * @page: folio to be mlocked.
+ */
+ void mlock_folio(struct folio *folio)
+ {
+ struct pagevec *pvec = &get_cpu_var(mlock_pvec);
+
+ if (!folio_test_set_mlocked(folio)) {
+ int nr_pages = folio_nr_pages(folio);
+
+ zone_stat_mod_folio(folio, NR_MLOCK, nr_pages);
+ __count_vm_events(UNEVICTABLE_PGMLOCKED, nr_pages);
+ }
+
+ folio_get(folio);
+ if (!pagevec_add(pvec, mlock_lru(&folio->page)) ||
+ folio_test_large(folio) || lru_cache_disabled())
++>>>>>>> folio/for-next
+ mlock_pagevec(pvec);
+ put_cpu_var(mlock_pvec);
+ }
+
+ /**
++<<<<<<< HEAD
++ * munlock_page - munlock a page
++ * @page: page to be munlocked, either a normal page or a THP head.
++ */
++void munlock_page(struct page *page)
++{
++ struct pagevec *pvec = &get_cpu_var(mlock_pvec);
++=======
+ * mlock_new_page - mlock a newly allocated page not yet on LRU
+ * @page: page to be mlocked, either a normal page or a THP head.
+ */
+ void mlock_new_page(struct page *page)
+ {
+ struct pagevec *pvec = &get_cpu_var(mlock_pvec);
+ int nr_pages = thp_nr_pages(page);
+
+ SetPageMlocked(page);
+ mod_zone_page_state(page_zone(page), NR_MLOCK, nr_pages);
+ __count_vm_events(UNEVICTABLE_PGMLOCKED, nr_pages);
+
get_page(page);
if (!pagevec_add(pvec, mlock_new(page)) ||
PageHead(page) || lru_cache_disabled())
@@@ -266,6 -266,6 +457,7 @@@
void munlock_page(struct page *page)
{
struct pagevec *pvec = &get_cpu_var(mlock_pvec);
++>>>>>>> folio/for-next

/*
* TestClearPageMlocked(page) must be left to __munlock_page(),
@@@ -296,7 -296,7 +488,11 @@@ static int mlock_pte_range(pmd_t *pmd,
goto out;
page = pmd_page(*pmd);
if (vma->vm_flags & VM_LOCKED)
++<<<<<<< HEAD
+ mlock_page(page);
++=======
+ mlock_folio(page_folio(page));
++>>>>>>> folio/for-next
else
munlock_page(page);
goto out;
@@@ -312,7 -312,7 +508,11 @@@
if (PageTransCompound(page))
continue;
if (vma->vm_flags & VM_LOCKED)
++<<<<<<< HEAD
+ mlock_page(page);
++=======
+ mlock_folio(page_folio(page));
++>>>>>>> folio/for-next
else
munlock_page(page);
}
diff --cc mm/rmap.c
index a13487385820,5470c8de2ec0..000000000000
--- a/mm/rmap.c
+++ b/mm/rmap.c
@@@ -106,10 -104,10 +106,10 @@@ static inline struct anon_vma *anon_vma

static inline void anon_vma_free(struct anon_vma *anon_vma)
{
- VM_BUG_ON(atomic_read(&anon_vma->refcount));
+ VM_BUG_ON(refcount_read(&anon_vma->refcount));

/*
- * Synchronize against page_lock_anon_vma_read() such that
+ * Synchronize against folio_lock_anon_vma_read() such that
* we can safely hold the lock without the anon_vma getting
* freed.
*
@@@ -815,9 -810,9 +812,15 @@@ static bool folio_referenced_one(struc
address = pvmw.address;

if ((vma->vm_flags & VM_LOCKED) &&
++<<<<<<< HEAD
+ (!PageTransCompound(page) || !pvmw.pte)) {
+ /* Restore the mlock which got missed */
+ mlock_vma_page(page, vma, !pvmw.pte);
++=======
+ (!folio_test_large(folio) || !pvmw.pte)) {
+ /* Restore the mlock which got missed */
+ mlock_vma_folio(folio, vma, !pvmw.pte);
++>>>>>>> folio/for-next
page_vma_mapped_walk_done(&pvmw);
pra->vm_flags |= VM_LOCKED;
return false; /* To break the loop */
@@@ -1469,24 -1454,25 +1462,40 @@@ static bool try_to_unmap_one(struct fol

while (page_vma_mapped_walk(&pvmw)) {
/* Unexpected PMD-mapped THP? */
++<<<<<<< HEAD
+ VM_BUG_ON_PAGE(!pvmw.pte, page);
+
+ /*
+ * If the page is in an mlock()d vma, we must not swap it out.
++=======
+ VM_BUG_ON_FOLIO(!pvmw.pte, folio);
+
+ /*
+ * If the folio is in an mlock()d vma, we must not swap it out.
++>>>>>>> folio/for-next
*/
if (!(flags & TTU_IGNORE_MLOCK) &&
(vma->vm_flags & VM_LOCKED)) {
/* Restore the mlock which got missed */
++<<<<<<< HEAD
+ mlock_vma_page(page, vma, false);
++=======
+ mlock_vma_folio(folio, vma, false);
++>>>>>>> folio/for-next
page_vma_mapped_walk_done(&pvmw);
ret = false;
break;
}

++<<<<<<< HEAD
+ subpage = page - page_to_pfn(page) + pte_pfn(*pvmw.pte);
++=======
+ subpage = folio_page(folio,
+ pte_pfn(*pvmw.pte) - folio_pfn(folio));
++>>>>>>> folio/for-next
address = pvmw.address;

- if (PageHuge(page) && !PageAnon(page)) {
+ if (folio_test_hugetlb(folio) && !folio_test_anon(folio)) {
/*
* To call huge_pmd_unshare, i_mmap_rwsem must be
* held in write mode. Caller needs to explicitly
@@@ -1590,31 -1577,8 +1600,36 @@@
}

/* MADV_FREE page check */
++<<<<<<< HEAD
+ if (!PageSwapBacked(page)) {
+ int ref_count, map_count;
+
+ /*
+ * Synchronize with gup_pte_range():
+ * - clear PTE; barrier; read refcount
+ * - inc refcount; barrier; read PTE
+ */
+ smp_mb();
+
+ ref_count = page_count(page);
+ map_count = page_mapcount(page);
+
+ /*
+ * Order reads for page refcount and dirty flag;
+ * see __remove_mapping().
+ */
+ smp_rmb();
+
+ /*
+ * The only page refs must be from the isolation
+ * plus one or more rmap's (dropped by discard:).
+ */
+ if ((ref_count == 1 + map_count) &&
+ !PageDirty(page)) {
++=======
+ if (!folio_test_swapbacked(folio)) {
+ if (!folio_test_dirty(folio)) {
++>>>>>>> folio/for-next
/* Invalidate as we cleared the pte */
mmu_notifier_invalidate_range(mm,
address, address + PAGE_SIZE);
@@@ -1683,10 -1648,10 +1699,17 @@@ discard
*
* See Documentation/vm/mmu_notifier.rst
*/
++<<<<<<< HEAD
+ page_remove_rmap(subpage, vma, PageHuge(page));
+ if (vma->vm_flags & VM_LOCKED)
+ mlock_page_drain(smp_processor_id());
+ put_page(page);
++=======
+ page_remove_rmap(subpage, vma, folio_test_hugetlb(folio));
+ if (vma->vm_flags & VM_LOCKED)
+ mlock_page_drain(smp_processor_id());
+ folio_put(folio);
++>>>>>>> folio/for-next
}

mmu_notifier_invalidate_range_end(&range);
@@@ -1852,8 -1815,8 +1873,13 @@@ static bool try_to_migrate_one(struct f
/* Update high watermark before we lower rss */
update_hiwater_rss(mm);

++<<<<<<< HEAD
+ if (is_device_private_page(page)) {
+ unsigned long pfn = page_to_pfn(page);
++=======
+ if (folio_is_zone_device(folio)) {
+ unsigned long pfn = folio_pfn(folio);
++>>>>>>> folio/for-next
swp_entry_t entry;
pte_t swp_pte;

@@@ -1891,11 -1852,11 +1917,15 @@@
* changed when hugepage migrations to device private
* memory are supported.
*/
++<<<<<<< HEAD
+ subpage = page;
++=======
+ subpage = &folio->page;
++>>>>>>> folio/for-next
} else if (PageHWPoison(subpage)) {
pteval = swp_entry_to_pte(make_hwpoison_entry(subpage));
- if (PageHuge(page)) {
- hugetlb_count_sub(compound_nr(page), mm);
+ if (folio_test_hugetlb(folio)) {
+ hugetlb_count_sub(folio_nr_pages(folio), mm);
set_huge_swap_pte_at(mm, address,
pvmw.pte, pteval,
vma_mmu_pagesize(vma));
@@@ -1963,10 -1922,10 +1993,17 @@@
*
* See Documentation/vm/mmu_notifier.rst
*/
++<<<<<<< HEAD
+ page_remove_rmap(subpage, vma, PageHuge(page));
+ if (vma->vm_flags & VM_LOCKED)
+ mlock_page_drain(smp_processor_id());
+ put_page(page);
++=======
+ page_remove_rmap(subpage, vma, folio_test_hugetlb(folio));
+ if (vma->vm_flags & VM_LOCKED)
+ mlock_page_drain(smp_processor_id());
+ folio_put(folio);
++>>>>>>> folio/for-next
}

mmu_notifier_invalidate_range_end(&range);
@@@ -1999,8 -1958,7 +2036,12 @@@ void try_to_migrate(struct folio *folio
TTU_SYNC)))
return;

++<<<<<<< HEAD
+ if (is_zone_device_page(page) &&
+ (!is_device_private_page(page) && !is_device_coherent_page(page)))
++=======
+ if (folio_is_zone_device(folio) && !folio_is_device_private(folio))
++>>>>>>> folio/for-next
return;

/*
@@@ -2015,9 -1973,9 +2056,13 @@@
rwc.invalid_vma = invalid_migration_vma;

if (flags & TTU_RMAP_LOCKED)
- rmap_walk_locked(page, &rwc);
+ rmap_walk_locked(folio, &rwc);
else
++<<<<<<< HEAD
+ rmap_walk(page, &rwc);
++=======
+ rmap_walk(folio, &rwc);
++>>>>>>> folio/for-next
}

#ifdef CONFIG_DEVICE_PRIVATE
diff --cc mm/vmscan.c
index 5f471c1e279f,7db5d0237333..000000000000
--- a/mm/vmscan.c
+++ b/mm/vmscan.c
@@@ -986,12 -985,23 +986,12 @@@ static inline int is_page_cache_freeabl
* that isolated the page, the page cache and optional buffer
* heads at page->private.
*/
- int page_cache_pins = thp_nr_pages(page);
- return page_count(page) - page_has_private(page) == 1 + page_cache_pins;
+ return folio_ref_count(folio) - folio_test_private(folio) ==
+ 1 + folio_nr_pages(folio);
}

-static int may_write_to_inode(struct inode *inode)
-{
- if (current->flags & PF_SWAPWRITE)
- return 1;
- if (!inode_write_congested(inode))
- return 1;
- if (inode_to_bdi(inode) == current->backing_dev_info)
- return 1;
- return 0;
-}
-
/*
- * We detected a synchronous write error writing a page out. Probably
+ * We detected a synchronous write error writing a folio out. Probably
* -ENOSPC. We need to propagate that into the address_space for a subsequent
* fsync(), msync() or close().
*
@@@ -1191,8 -1201,10 +1191,8 @@@ static pageout_t pageout(struct folio *
}
if (mapping->a_ops->writepage == NULL)
return PAGE_ACTIVATE;
- if (!may_write_to_inode(mapping->host))
- return PAGE_KEEP;

- if (clear_page_dirty_for_io(page)) {
+ if (folio_clear_dirty_for_io(folio)) {
int res;
struct writeback_control wbc = {
.sync_mode = WB_SYNC_NONE,
@@@ -1365,19 -1383,19 +1371,24 @@@ enum page_references
PAGEREF_ACTIVATE,
};

- static enum page_references page_check_references(struct page *page,
+ static enum page_references folio_check_references(struct folio *folio,
struct scan_control *sc)
{
- int referenced_ptes, referenced_page;
+ int referenced_ptes, referenced_folio;
unsigned long vm_flags;

- referenced_ptes = page_referenced(page, 1, sc->target_mem_cgroup,
- &vm_flags);
- referenced_page = TestClearPageReferenced(page);
+ referenced_ptes = folio_referenced(folio, 1, sc->target_mem_cgroup,
+ &vm_flags);
+ referenced_folio = folio_test_clear_referenced(folio);

/*
++<<<<<<< HEAD
+ * The supposedly reclaimable page was found to be in a VM_LOCKED vma.
+ * Let the page, now marked Mlocked, be moved to the unevictable list.
++=======
+ * The supposedly reclaimable folio was found to be in a VM_LOCKED vma.
+ * Let the folio, now marked Mlocked, be moved to the unevictable list.
++>>>>>>> folio/for-next
*/
if (vm_flags & VM_LOCKED)
return PAGEREF_ACTIVATE;
@@@ -1566,8 -1586,10 +1579,15 @@@ retry
* end of the LRU a second time.
*/
mapping = page_mapping(page);
++<<<<<<< HEAD
+ if (writeback && PageReclaim(page))
+ stat->nr_congested++;
++=======
+ if (((dirty || writeback) && mapping &&
+ inode_write_congested(mapping->host)) ||
+ (writeback && PageReclaim(page)))
+ stat->nr_congested += nr_pages;
++>>>>>>> folio/for-next

/*
* If a page at the tail of the LRU is under writeback, there
@@@ -1716,9 -1738,9 +1736,15 @@@
/* Adding to swap updated mapping */
mapping = page_mapping(page);
}
++<<<<<<< HEAD
+ } else if (unlikely(PageTransHuge(page))) {
+ /* Split file/lazyfree THP */
+ if (split_huge_page_to_list(page, page_list))
++=======
+ } else if (PageSwapBacked(page) && PageTransHuge(page)) {
+ /* Split shmem THP */
+ if (split_folio_to_list(folio, page_list))
++>>>>>>> folio/for-next
goto keep_locked;
}
