[PATCH 1/1] mm/khugepaged: reduce process visible downtime by pre-zeroing hugepage

From: Lance Yang
Date: Fri Mar 08 2024 - 02:49:49 EST


The patch reduces the process visible downtime during hugepage
collapse. This is achieved by pre-zeroing the hugepage before
acquiring mmap_lock(write mode) if nr_pte_none >= 256, without
affecting the efficiency of khugepaged.

On an Intel Core i5 CPU, the process visible downtime during
hugepage collapse is as follows:

| nr_ptes_none | w/o __GFP_ZERO | w/ __GFP_ZERO | Change |
--------------------------------------------------—----------
| 511 | 233us | 95us | -59.21%|
| 384 | 376us | 219us | -41.20%|
| 256 | 421us | 323us | -23.28%|
| 128 | 523us | 507us | -3.06%|

Of course, alloc_charge_hpage() will take longer to run with
the __GFP_ZERO flag.

| Func | w/o __GFP_ZERO | w/ __GFP_ZERO |
|----------------------|----------------|---------------|
| alloc_charge_hpage | 198us | 295us |

But it's not a big deal because it doesn't impact the total
time spent by khugepaged in collapsing a hugepage. In fact,
it would decrease.

Signed-off-by: Lance Yang <ioworker0@xxxxxxxxx>
---
mm/khugepaged.c | 33 +++++++++++++++++++++++++++------
1 file changed, 27 insertions(+), 6 deletions(-)

diff --git a/mm/khugepaged.c b/mm/khugepaged.c
index 38830174608f..a2872596b865 100644
--- a/mm/khugepaged.c
+++ b/mm/khugepaged.c
@@ -88,6 +88,7 @@ static DECLARE_WAIT_QUEUE_HEAD(khugepaged_wait);
static unsigned int khugepaged_max_ptes_none __read_mostly;
static unsigned int khugepaged_max_ptes_swap __read_mostly;
static unsigned int khugepaged_max_ptes_shared __read_mostly;
+static unsigned int khugepaged_min_ptes_none_prezero __read_mostly;

#define MM_SLOTS_HASH_BITS 10
static DEFINE_READ_MOSTLY_HASHTABLE(mm_slots_hash, MM_SLOTS_HASH_BITS);
@@ -96,6 +97,7 @@ static struct kmem_cache *mm_slot_cache __ro_after_init;

struct collapse_control {
bool is_khugepaged;
+ bool alloc_zeroed_hpage;

/* Num pages scanned per node */
u32 node_load[MAX_NUMNODES];
@@ -396,6 +398,7 @@ int __init khugepaged_init(void)
khugepaged_max_ptes_none = HPAGE_PMD_NR - 1;
khugepaged_max_ptes_swap = HPAGE_PMD_NR / 8;
khugepaged_max_ptes_shared = HPAGE_PMD_NR / 2;
+ khugepaged_min_ptes_none_prezero = HPAGE_PMD_NR / 2;

return 0;
}
@@ -782,6 +785,7 @@ static int __collapse_huge_page_copy(pte_t *pte,
struct vm_area_struct *vma,
unsigned long address,
spinlock_t *ptl,
+ struct collapse_control *cc,
struct list_head *compound_pagelist)
{
struct page *src_page;
@@ -797,7 +801,8 @@ static int __collapse_huge_page_copy(pte_t *pte,
_pte++, page++, _address += PAGE_SIZE) {
pteval = ptep_get(_pte);
if (pte_none(pteval) || is_zero_pfn(pte_pfn(pteval))) {
- clear_user_highpage(page, _address);
+ if (!cc->alloc_zeroed_hpage)
+ clear_user_highpage(page, _address);
continue;
}
src_page = pte_page(pteval);
@@ -1067,6 +1072,9 @@ static int alloc_charge_hpage(struct page **hpage, struct mm_struct *mm,
int node = hpage_collapse_find_target_node(cc);
struct folio *folio;

+ if (cc->alloc_zeroed_hpage)
+ gfp |= __GFP_ZERO;
+
if (!hpage_collapse_alloc_folio(&folio, gfp, node, &cc->alloc_nmask)) {
*hpage = NULL;
return SCAN_ALLOC_HUGE_PAGE_FAIL;
@@ -1209,7 +1217,7 @@ static int collapse_huge_page(struct mm_struct *mm, unsigned long address,
anon_vma_unlock_write(vma->anon_vma);

result = __collapse_huge_page_copy(pte, hpage, pmd, _pmd,
- vma, address, pte_ptl,
+ vma, address, pte_ptl, cc,
&compound_pagelist);
pte_unmap(pte);
if (unlikely(result != SCAN_SUCCEED))
@@ -1272,6 +1280,7 @@ static int hpage_collapse_scan_pmd(struct mm_struct *mm,

memset(cc->node_load, 0, sizeof(cc->node_load));
nodes_clear(cc->alloc_nmask);
+ cc->alloc_zeroed_hpage = false;
pte = pte_offset_map_lock(mm, pmd, address, &ptl);
if (!pte) {
result = SCAN_PMD_NULL;
@@ -1408,6 +1417,10 @@ static int hpage_collapse_scan_pmd(struct mm_struct *mm,
out_unmap:
pte_unmap_unlock(pte, ptl);
if (result == SCAN_SUCCEED) {
+ if (cc->is_khugepaged &&
+ none_or_zero >= khugepaged_min_ptes_none_prezero)
+ cc->alloc_zeroed_hpage = true;
+
result = collapse_huge_page(mm, address, referenced,
unmapped, cc);
/* collapse_huge_page will return with the mmap_lock released */
@@ -2054,7 +2067,8 @@ static int collapse_file(struct mm_struct *mm, unsigned long addr,
index = start;
list_for_each_entry(page, &pagelist, lru) {
while (index < page->index) {
- clear_highpage(hpage + (index % HPAGE_PMD_NR));
+ if (!cc->alloc_zeroed_hpage)
+ clear_highpage(hpage + (index % HPAGE_PMD_NR));
index++;
}
if (copy_mc_highpage(hpage + (page->index % HPAGE_PMD_NR), page) > 0) {
@@ -2064,7 +2078,8 @@ static int collapse_file(struct mm_struct *mm, unsigned long addr,
index++;
}
while (index < end) {
- clear_highpage(hpage + (index % HPAGE_PMD_NR));
+ if (!cc->alloc_zeroed_hpage)
+ clear_highpage(hpage + (index % HPAGE_PMD_NR));
index++;
}

@@ -2234,6 +2249,7 @@ static int hpage_collapse_scan_file(struct mm_struct *mm, unsigned long addr,
swap = 0;
memset(cc->node_load, 0, sizeof(cc->node_load));
nodes_clear(cc->alloc_nmask);
+ cc->alloc_zeroed_hpage = false;
rcu_read_lock();
xas_for_each(&xas, page, start + HPAGE_PMD_NR - 1) {
if (xas_retry(&xas, page))
@@ -2305,11 +2321,16 @@ static int hpage_collapse_scan_file(struct mm_struct *mm, unsigned long addr,
rcu_read_unlock();

if (result == SCAN_SUCCEED) {
- if (cc->is_khugepaged &&
- present < HPAGE_PMD_NR - khugepaged_max_ptes_none) {
+ if (!cc->is_khugepaged)
+ result = collapse_file(mm, addr, file, start, cc);
+ else if (present < HPAGE_PMD_NR - khugepaged_max_ptes_none) {
result = SCAN_EXCEED_NONE_PTE;
count_vm_event(THP_SCAN_EXCEED_NONE_PTE);
} else {
+ if (HPAGE_PMD_NR - present >=
+ khugepaged_min_ptes_none_prezero)
+ cc->alloc_zeroed_hpage = true;
+
result = collapse_file(mm, addr, file, start, cc);
}
}
--
2.33.1