Re: [RFC PATCH 2/2] mm/khugepaged: Remove compound_pagelist

From: Yang Shi
Date: Fri Sep 29 2023 - 15:07:52 EST


On Tue, Sep 26, 2023 at 3:07 PM Yang Shi <shy828301@xxxxxxxxx> wrote:
>
> On Fri, Sep 22, 2023 at 9:33 PM Vishal Moola (Oracle)
> <vishal.moola@xxxxxxxxx> wrote:
> >
> > Currently, khugepaged builds a compound_pagelist while scanning, which
> > is used to properly account for compound pages. We can now account
> > for a compound page as a singular folio instead, so remove this list.
> >
> > Large folios are guaranteed to have consecutive ptes and addresses, so
> > once the first pte of a large folio is found, skip over the rest.
>
> The address space may map only part of a folio. For example, in the
> extreme case a PMD-sized range may be backed by HPAGE_PMD_NR different
> folios, with each PTE mapping a single subpage from its own folio. So
> assuming that a PTE-mapped folio is mapped consecutively may be wrong.
>
> Please refer to collapse_compound_extreme() in
> tools/testing/selftests/mm/khugepaged.c.
>
> >
> > This helps convert khugepaged to use folios. It removes 3 compound_head
> > calls in __collapse_huge_page_copy_succeeded(), and removes 980 bytes of
> > kernel text.
> >
> > Signed-off-by: Vishal Moola (Oracle) <vishal.moola@xxxxxxxxx>
> > ---
> > mm/khugepaged.c | 76 ++++++++++++-------------------------------------
> > 1 file changed, 18 insertions(+), 58 deletions(-)
> >
> > diff --git a/mm/khugepaged.c b/mm/khugepaged.c
> > index f46a7a7c489f..b6c7d55a8231 100644
> > --- a/mm/khugepaged.c
> > +++ b/mm/khugepaged.c
> > @@ -498,10 +498,9 @@ static void release_pte_page(struct page *page)
> > release_pte_folio(page_folio(page));
> > }
> >
> > -static void release_pte_pages(pte_t *pte, pte_t *_pte,
> > - struct list_head *compound_pagelist)
> > +static void release_pte_folios(pte_t *pte, pte_t *_pte)
> > {
> > - struct folio *folio, *tmp;
> > + struct folio *folio;
> >
> > while (--_pte >= pte) {
> > pte_t pteval = ptep_get(_pte);
> > @@ -514,12 +513,7 @@ static void release_pte_pages(pte_t *pte, pte_t *_pte,
> > continue;
> > folio = pfn_folio(pfn);
> > if (folio_test_large(folio))
> > - continue;
> > - release_pte_folio(folio);
> > - }
> > -
> > - list_for_each_entry_safe(folio, tmp, compound_pagelist, lru) {
> > - list_del(&folio->lru);
> > + _pte -= folio_nr_pages(folio) - 1;
> > release_pte_folio(folio);
> > }
> > }
> > @@ -538,8 +532,7 @@ static bool is_refcount_suitable(struct page *page)
> > static int __collapse_huge_page_isolate(struct vm_area_struct *vma,
> > unsigned long address,
> > pte_t *pte,
> > - struct collapse_control *cc,
> > - struct list_head *compound_pagelist)
> > + struct collapse_control *cc)
> > {
> > struct folio *folio = NULL;
> > pte_t *_pte;
> > @@ -588,19 +581,6 @@ static int __collapse_huge_page_isolate(struct vm_area_struct *vma,
> > }
> > }
> >
> > - if (folio_test_large(folio)) {
> > - struct folio *f;
> > -
> > - /*
> > - * Check if we have dealt with the compound page
> > - * already
> > - */
> > - list_for_each_entry(f, compound_pagelist, lru) {
> > - if (folio == f)
> > - goto next;
> > - }
> > - }
> > -
> > /*
> > * We can do it before isolate_lru_page because the
> > * page can't be freed from under us. NOTE: PG_lock
> > @@ -644,9 +624,6 @@ static int __collapse_huge_page_isolate(struct vm_area_struct *vma,
> > VM_BUG_ON_FOLIO(!folio_test_locked(folio), folio);
> > VM_BUG_ON_FOLIO(folio_test_lru(folio), folio);
> >
> > - if (folio_test_large(folio))
> > - list_add_tail(&folio->lru, compound_pagelist);
> > -next:
> > /*
> > * If collapse was initiated by khugepaged, check that there is
> > * enough young pte to justify collapsing the page
> > @@ -660,6 +637,10 @@ static int __collapse_huge_page_isolate(struct vm_area_struct *vma,
> > if (pte_write(pteval))
> > writable = true;
> >
> > + if (folio_test_large(folio)) {
> > + _pte += folio_nr_pages(folio) - 1;
> > + address += folio_size(folio) - PAGE_SIZE;
> > + }
> > }
> >
> > if (unlikely(!writable)) {
> > @@ -673,7 +654,7 @@ static int __collapse_huge_page_isolate(struct vm_area_struct *vma,
> > return result;
> > }
> > out:
> > - release_pte_pages(pte, _pte, compound_pagelist);
> > + release_pte_folios(pte, _pte);
> > trace_mm_collapse_huge_page_isolate(&folio->page, none_or_zero,
> > referenced, writable, result);
> > return result;
> > @@ -682,11 +663,9 @@ static int __collapse_huge_page_isolate(struct vm_area_struct *vma,
> > static void __collapse_huge_page_copy_succeeded(pte_t *pte,
> > struct vm_area_struct *vma,
> > unsigned long address,
> > - spinlock_t *ptl,
> > - struct list_head *compound_pagelist)
> > + spinlock_t *ptl)
> > {
> > struct page *src_page;
> > - struct page *tmp;
> > pte_t *_pte;
> > pte_t pteval;
> >
> > @@ -706,8 +685,7 @@ static void __collapse_huge_page_copy_succeeded(pte_t *pte,
> > }
> > } else {
> > src_page = pte_page(pteval);
> > - if (!PageCompound(src_page))
> > - release_pte_page(src_page);
> > + release_pte_page(src_page);

This line is problematic too. If I read it correctly, it may cause a
double unlock: the loop visits every mapped subpage of the same folio,
so release_pte_page() is called on that folio multiple times.

> > /*
> > * ptl mostly unnecessary, but preempt has to
> > * be disabled to update the per-cpu stats
> > @@ -720,23 +698,12 @@ static void __collapse_huge_page_copy_succeeded(pte_t *pte,
> > free_page_and_swap_cache(src_page);
> > }
> > }
> > -
> > - list_for_each_entry_safe(src_page, tmp, compound_pagelist, lru) {
> > - list_del(&src_page->lru);
> > - mod_node_page_state(page_pgdat(src_page),
> > - NR_ISOLATED_ANON + page_is_file_lru(src_page),
> > - -compound_nr(src_page));
> > - unlock_page(src_page);
> > - free_swap_cache(src_page);
> > - putback_lru_page(src_page);
> > - }
> > }
> >
> > static void __collapse_huge_page_copy_failed(pte_t *pte,
> > pmd_t *pmd,
> > pmd_t orig_pmd,
> > - struct vm_area_struct *vma,
> > - struct list_head *compound_pagelist)
> > + struct vm_area_struct *vma)
> > {
> > spinlock_t *pmd_ptl;
> >
> > @@ -753,7 +720,7 @@ static void __collapse_huge_page_copy_failed(pte_t *pte,
> > * Release both raw and compound pages isolated
> > * in __collapse_huge_page_isolate.
> > */
> > - release_pte_pages(pte, pte + HPAGE_PMD_NR, compound_pagelist);
> > + release_pte_folios(pte, pte + HPAGE_PMD_NR);
> > }
> >
> > /*
> > @@ -769,7 +736,6 @@ static void __collapse_huge_page_copy_failed(pte_t *pte,
> > * @vma: the original raw pages' virtual memory area
> > * @address: starting address to copy
> > * @ptl: lock on raw pages' PTEs
> > - * @compound_pagelist: list that stores compound pages
> > */
> > static int __collapse_huge_page_copy(pte_t *pte,
> > struct page *page,
> > @@ -777,8 +743,7 @@ static int __collapse_huge_page_copy(pte_t *pte,
> > pmd_t orig_pmd,
> > struct vm_area_struct *vma,
> > unsigned long address,
> > - spinlock_t *ptl,
> > - struct list_head *compound_pagelist)
> > + spinlock_t *ptl)
> > {
> > struct page *src_page;
> > pte_t *_pte;
> > @@ -804,11 +769,9 @@ static int __collapse_huge_page_copy(pte_t *pte,
> > }
> >
> > if (likely(result == SCAN_SUCCEED))
> > - __collapse_huge_page_copy_succeeded(pte, vma, address, ptl,
> > - compound_pagelist);
> > + __collapse_huge_page_copy_succeeded(pte, vma, address, ptl);
> > else
> > - __collapse_huge_page_copy_failed(pte, pmd, orig_pmd, vma,
> > - compound_pagelist);
> > + __collapse_huge_page_copy_failed(pte, pmd, orig_pmd, vma);
> >
> > return result;
> > }
> > @@ -1081,7 +1044,6 @@ static int collapse_huge_page(struct mm_struct *mm, unsigned long address,
> > int referenced, int unmapped,
> > struct collapse_control *cc)
> > {
> > - LIST_HEAD(compound_pagelist);
> > pmd_t *pmd, _pmd;
> > pte_t *pte;
> > pgtable_t pgtable;
> > @@ -1168,8 +1130,7 @@ static int collapse_huge_page(struct mm_struct *mm, unsigned long address,
> >
> > pte = pte_offset_map_lock(mm, &_pmd, address, &pte_ptl);
> > if (pte) {
> > - result = __collapse_huge_page_isolate(vma, address, pte, cc,
> > - &compound_pagelist);
> > + result = __collapse_huge_page_isolate(vma, address, pte, cc);
> > spin_unlock(pte_ptl);
> > } else {
> > result = SCAN_PMD_NULL;
> > @@ -1198,8 +1159,7 @@ static int collapse_huge_page(struct mm_struct *mm, unsigned long address,
> > anon_vma_unlock_write(vma->anon_vma);
> >
> > result = __collapse_huge_page_copy(pte, hpage, pmd, _pmd,
> > - vma, address, pte_ptl,
> > - &compound_pagelist);
> > + vma, address, pte_ptl);
> > pte_unmap(pte);
> > if (unlikely(result != SCAN_SUCCEED))
> > goto out_up_write;
> > --
> > 2.40.1
> >