[RFC PATCH 20/26] mm: vmscan: use compaction_suitable() check in kswapd

From: Johannes Weiner
Date: Tue Apr 18 2023 - 15:15:38 EST


Kswapd currently bails on higher-order allocations with an open-coded
check for whether it's reclaimed the compaction gap.

compaction_suitable() is the customary interface to coordinate reclaim
with compaction.

Signed-off-by: Johannes Weiner <hannes@xxxxxxxxxxx>
---
mm/vmscan.c | 67 ++++++++++++++++++-----------------------------------
1 file changed, 23 insertions(+), 44 deletions(-)

diff --git a/mm/vmscan.c b/mm/vmscan.c
index ee8c8ca2e7b5..723705b9e4d9 100644
--- a/mm/vmscan.c
+++ b/mm/vmscan.c
@@ -6872,12 +6872,18 @@ static bool pgdat_balanced(pg_data_t *pgdat, int order, int highest_zoneidx)
if (!managed_zone(zone))
continue;

+ /* Allocation can succeed in any zone, done */
if (sysctl_numa_balancing_mode & NUMA_BALANCING_MEMORY_TIERING)
mark = wmark_pages(zone, WMARK_PROMO);
else
mark = high_wmark_pages(zone);
if (zone_watermark_ok_safe(zone, order, mark, highest_zoneidx))
return true;
+
+ /* Allocation can't succeed, but enough order-0 to compact */
+ if (compaction_suitable(zone, order,
+ highest_zoneidx) == COMPACT_CONTINUE)
+ return true;
}

/*
@@ -6968,16 +6974,6 @@ static bool kswapd_shrink_node(pg_data_t *pgdat,
*/
shrink_node(pgdat, sc);

- /*
- * Fragmentation may mean that the system cannot be rebalanced for
- * high-order allocations. If twice the allocation size has been
- * reclaimed then recheck watermarks only at order-0 to prevent
- * excessive reclaim. Assume that a process requested a high-order
- * can direct reclaim/compact.
- */
- if (sc->order && sc->nr_reclaimed >= compact_gap(sc->order))
- sc->order = 0;
-
return sc->nr_scanned >= sc->nr_to_reclaim;
}

@@ -7018,15 +7014,13 @@ clear_reclaim_active(pg_data_t *pgdat, int highest_zoneidx)
* that are eligible for use by the caller until at least one zone is
* balanced.
*
- * Returns the order kswapd finished reclaiming at.
- *
* kswapd scans the zones in the highmem->normal->dma direction. It skips
* zones which have free_pages > high_wmark_pages(zone), but once a zone is
* found to have free_pages <= high_wmark_pages(zone), any page in that zone
* or lower is eligible for reclaim until at least one usable zone is
* balanced.
*/
-static int balance_pgdat(pg_data_t *pgdat, int order, int highest_zoneidx)
+static void balance_pgdat(pg_data_t *pgdat, int order, int highest_zoneidx)
{
int i;
unsigned long nr_soft_reclaimed;
@@ -7226,14 +7220,6 @@ static int balance_pgdat(pg_data_t *pgdat, int order, int highest_zoneidx)
__fs_reclaim_release(_THIS_IP_);
psi_memstall_leave(&pflags);
set_task_reclaim_state(current, NULL);
-
- /*
- * Return the order kswapd stopped reclaiming at as
- * prepare_kswapd_sleep() takes it into account. If another caller
- * entered the allocator slow path while kswapd was awake, order will
- * remain at the higher level.
- */
- return sc.order;
}

/*
@@ -7251,7 +7237,7 @@ static enum zone_type kswapd_highest_zoneidx(pg_data_t *pgdat,
return curr_idx == MAX_NR_ZONES ? prev_highest_zoneidx : curr_idx;
}

-static void kswapd_try_to_sleep(pg_data_t *pgdat, int alloc_order, int reclaim_order,
+static void kswapd_try_to_sleep(pg_data_t *pgdat, int order,
unsigned int highest_zoneidx)
{
long remaining = 0;
@@ -7269,7 +7255,7 @@ static void kswapd_try_to_sleep(pg_data_t *pgdat, int alloc_order, int reclaim_o
* eligible zone balanced that it's also unlikely that compaction will
* succeed.
*/
- if (prepare_kswapd_sleep(pgdat, reclaim_order, highest_zoneidx)) {
+ if (prepare_kswapd_sleep(pgdat, order, highest_zoneidx)) {
/*
* Compaction records what page blocks it recently failed to
* isolate pages from and skips them in the future scanning.
@@ -7282,7 +7268,7 @@ static void kswapd_try_to_sleep(pg_data_t *pgdat, int alloc_order, int reclaim_o
* We have freed the memory, now we should compact it to make
* allocation of the requested order possible.
*/
- wakeup_kcompactd(pgdat, alloc_order, highest_zoneidx);
+ wakeup_kcompactd(pgdat, order, highest_zoneidx);

remaining = schedule_timeout(HZ/10);

@@ -7296,8 +7282,8 @@ static void kswapd_try_to_sleep(pg_data_t *pgdat, int alloc_order, int reclaim_o
kswapd_highest_zoneidx(pgdat,
highest_zoneidx));

- if (READ_ONCE(pgdat->kswapd_order) < reclaim_order)
- WRITE_ONCE(pgdat->kswapd_order, reclaim_order);
+ if (READ_ONCE(pgdat->kswapd_order) < order)
+ WRITE_ONCE(pgdat->kswapd_order, order);
}

finish_wait(&pgdat->kswapd_wait, &wait);
@@ -7308,8 +7294,7 @@ static void kswapd_try_to_sleep(pg_data_t *pgdat, int alloc_order, int reclaim_o
* After a short sleep, check if it was a premature sleep. If not, then
* go fully to sleep until explicitly woken up.
*/
- if (!remaining &&
- prepare_kswapd_sleep(pgdat, reclaim_order, highest_zoneidx)) {
+ if (!remaining && prepare_kswapd_sleep(pgdat, order, highest_zoneidx)) {
trace_mm_vmscan_kswapd_sleep(pgdat->node_id);

/*
@@ -7350,8 +7335,7 @@ static void kswapd_try_to_sleep(pg_data_t *pgdat, int alloc_order, int reclaim_o
*/
static int kswapd(void *p)
{
- unsigned int alloc_order, reclaim_order;
- unsigned int highest_zoneidx = MAX_NR_ZONES - 1;
+ unsigned int order, highest_zoneidx;
pg_data_t *pgdat = (pg_data_t *)p;
struct task_struct *tsk = current;
const struct cpumask *cpumask = cpumask_of_node(pgdat->node_id);
@@ -7374,22 +7358,20 @@ static int kswapd(void *p)
tsk->flags |= PF_MEMALLOC | PF_KSWAPD;
set_freezable();

- WRITE_ONCE(pgdat->kswapd_order, 0);
+ order = 0;
+ highest_zoneidx = MAX_NR_ZONES - 1;
+ WRITE_ONCE(pgdat->kswapd_order, order);
WRITE_ONCE(pgdat->kswapd_highest_zoneidx, MAX_NR_ZONES);
+
atomic_set(&pgdat->nr_writeback_throttled, 0);
+
for ( ; ; ) {
bool ret;

- alloc_order = reclaim_order = READ_ONCE(pgdat->kswapd_order);
- highest_zoneidx = kswapd_highest_zoneidx(pgdat,
- highest_zoneidx);
-
-kswapd_try_sleep:
- kswapd_try_to_sleep(pgdat, alloc_order, reclaim_order,
- highest_zoneidx);
+ kswapd_try_to_sleep(pgdat, order, highest_zoneidx);

/* Read the new order and highest_zoneidx */
- alloc_order = READ_ONCE(pgdat->kswapd_order);
+ order = READ_ONCE(pgdat->kswapd_order);
highest_zoneidx = kswapd_highest_zoneidx(pgdat,
highest_zoneidx);
WRITE_ONCE(pgdat->kswapd_order, 0);
@@ -7415,11 +7397,8 @@ static int kswapd(void *p)
* request (alloc_order).
*/
trace_mm_vmscan_kswapd_wake(pgdat->node_id, highest_zoneidx,
- alloc_order);
- reclaim_order = balance_pgdat(pgdat, alloc_order,
- highest_zoneidx);
- if (reclaim_order < alloc_order)
- goto kswapd_try_sleep;
+ order);
+ balance_pgdat(pgdat, order, highest_zoneidx);
}

tsk->flags &= ~(PF_MEMALLOC | PF_KSWAPD);
--
2.39.2