Re: [PATCH 2/5] mm/khugepaged: Convert hpage_collapse_scan_pmd() to use folios

From: Yang Shi
Date: Tue Oct 17 2023 - 16:42:02 EST


On Mon, Oct 16, 2023 at 1:06 PM Vishal Moola (Oracle)
<vishal.moola@xxxxxxxxx> wrote:
>
> Replaces 5 calls to compound_head(), and removes 1466 bytes of kernel
> text.
>
> Previously, to determine if any pte was shared, the page mapcount
> corresponding exactly to the pte was checked. This gave us a precise
> number of shared ptes. folio_estimated_sharers() instead reads
> the mapcount of the head page, giving us an estimate for tail page ptes.
>
> This means if a tail page's mapcount is greater than its head page's
> mapcount, folio_estimated_sharers() would be underestimating the number of
> shared ptes, and vice versa.
>
> Signed-off-by: Vishal Moola (Oracle) <vishal.moola@xxxxxxxxx>
> ---
> mm/khugepaged.c | 26 ++++++++++++--------------
> 1 file changed, 12 insertions(+), 14 deletions(-)
>
> diff --git a/mm/khugepaged.c b/mm/khugepaged.c
> index 7a552fe16c92..67aac53b31c8 100644
> --- a/mm/khugepaged.c
> +++ b/mm/khugepaged.c
> @@ -1245,7 +1245,7 @@ static int hpage_collapse_scan_pmd(struct mm_struct *mm,
> pte_t *pte, *_pte;
> int result = SCAN_FAIL, referenced = 0;
> int none_or_zero = 0, shared = 0;
> - struct page *page = NULL;
> + struct folio *folio = NULL;
> unsigned long _address;
> spinlock_t *ptl;
> int node = NUMA_NO_NODE, unmapped = 0;
> @@ -1316,13 +1316,13 @@ static int hpage_collapse_scan_pmd(struct mm_struct *mm,
> if (pte_write(pteval))
> writable = true;
>
> - page = vm_normal_page(vma, _address, pteval);
> - if (unlikely(!page) || unlikely(is_zone_device_page(page))) {
> + folio = vm_normal_folio(vma, _address, pteval);
> + if (unlikely(!folio) || unlikely(folio_is_zone_device(folio))) {
> result = SCAN_PAGE_NULL;
> goto out_unmap;
> }
>
> - if (page_mapcount(page) > 1) {
> + if (folio_estimated_sharers(folio) > 1) {

This doesn't look correct. max_ptes_shared is used to cap the number
of shared PTEs we tolerate in the range. IIRC, folio_estimated_sharers()
just reads the mapcount of the head page. If we set max_ptes_shared to
256 and just the head page is shared, the "shared" counter will still
reach 512 and prevent the area from being collapsed, even though just
one PTE is actually shared. This breaks the semantics of
max_ptes_shared.
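
For reference, IIRC the helper is just a one-liner in
include/linux/mm.h (quoting from memory, so double-check the exact
definition there):

	static inline int folio_estimated_sharers(struct folio *folio)
	{
		return page_mapcount(folio_page(folio, 0));
	}

So for a PMD-sized folio whose head page alone has mapcount > 1, every
one of the 512 PTEs mapping the folio passes this check and bumps
"shared".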

> ++shared;
> if (cc->is_khugepaged &&
> shared > khugepaged_max_ptes_shared) {
> @@ -1332,29 +1332,27 @@ static int hpage_collapse_scan_pmd(struct mm_struct *mm,
> }
> }
>
> - page = compound_head(page);
> -
> /*
> * Record which node the original page is from and save this
> * information to cc->node_load[].
> * Khugepaged will allocate hugepage from the node has the max
> * hit record.
> */
> - node = page_to_nid(page);
> + node = folio_nid(folio);
> if (hpage_collapse_scan_abort(node, cc)) {
> result = SCAN_SCAN_ABORT;
> goto out_unmap;
> }
> cc->node_load[node]++;
> - if (!PageLRU(page)) {
> + if (!folio_test_lru(folio)) {
> result = SCAN_PAGE_LRU;
> goto out_unmap;
> }
> - if (PageLocked(page)) {
> + if (folio_test_locked(folio)) {
> result = SCAN_PAGE_LOCK;
> goto out_unmap;
> }
> - if (!PageAnon(page)) {
> + if (!folio_test_anon(folio)) {
> result = SCAN_PAGE_ANON;
> goto out_unmap;
> }
> @@ -1369,7 +1367,7 @@ static int hpage_collapse_scan_pmd(struct mm_struct *mm,
> * has excessive GUP pins (i.e. 512). Anyway the same check
> * will be done again later the risk seems low.
> */
> - if (!is_refcount_suitable(page)) {
> + if (!is_refcount_suitable(&folio->page)) {
> result = SCAN_PAGE_COUNT;
> goto out_unmap;
> }
> @@ -1379,8 +1377,8 @@ static int hpage_collapse_scan_pmd(struct mm_struct *mm,
> * enough young pte to justify collapsing the page
> */
> if (cc->is_khugepaged &&
> - (pte_young(pteval) || page_is_young(page) ||
> - PageReferenced(page) || mmu_notifier_test_young(vma->vm_mm,
> + (pte_young(pteval) || folio_test_young(folio) ||
> + folio_test_referenced(folio) || mmu_notifier_test_young(vma->vm_mm,
> address)))
> referenced++;
> }
> @@ -1402,7 +1400,7 @@ static int hpage_collapse_scan_pmd(struct mm_struct *mm,
> *mmap_locked = false;
> }
> out:
> - trace_mm_khugepaged_scan_pmd(mm, page, writable, referenced,
> + trace_mm_khugepaged_scan_pmd(mm, &folio->page, writable, referenced,
> none_or_zero, result, unmapped);
> return result;
> }
> --
> 2.40.1
>
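
If we want to keep the exact per-PTE semantics, one option (untested
sketch, on top of this patch) is to keep looking up the precise subpage
for the sharing check, then derive the folio from it:

	struct page *page = vm_normal_page(vma, _address, pteval);

	if (unlikely(!page) || unlikely(is_zone_device_page(page))) {
		result = SCAN_PAGE_NULL;
		goto out_unmap;
	}
	folio = page_folio(page);

	/* Exact per-PTE sharing check, as before this patch. */
	if (page_mapcount(page) > 1) {
		++shared;
		if (cc->is_khugepaged &&
		    shared > khugepaged_max_ptes_shared) {
			result = SCAN_EXCEED_SHARED_PTE;
			count_vm_event(THP_SCAN_EXCEED_SHARED_PTE);
			goto out_unmap;
		}
	}

The rest of the loop can then keep using the folio conversions from
this patch.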