[PATCH mm-unstable v1 2/4] mm/mglru: try to stop at high watermarks

From: Yu Zhao
Date: Fri Dec 08 2023 - 01:14:25 EST


The initial MGLRU patchset didn't include the memcg LRU support, and
it relied on should_abort_scan(), added by commit f76c83378851 ("mm:
multi-gen LRU: optimize multiple memcgs"), to "backoff to avoid
overshooting their aggregate reclaim target by too much".

Later on when the memcg LRU was added, should_abort_scan() was deemed
unnecessary, and the test results [1] showed no side effects after it
was removed by commit a579086c99ed ("mm: multi-gen LRU: remove
eviction fairness safeguard").

However, that test used memory.reclaim, which sets nr_to_reclaim to
SWAP_CLUSTER_MAX. So it can overshoot only by SWAP_CLUSTER_MAX-1
pages, i.e., from nr_reclaimed=nr_to_reclaim-1 to
nr_reclaimed=nr_to_reclaim+SWAP_CLUSTER_MAX-1. Compared with the batch
size that kswapd sets nr_to_reclaim to, SWAP_CLUSTER_MAX is tiny. Therefore
that test isn't able to reproduce the worst case scenario, i.e.,
kswapd overshooting GBs on large systems and "consuming 100% CPU" (see
the Closes tag).

Bring back a simplified version of should_abort_scan() on top of the
memcg LRU, so that kswapd stops when all eligible zones are above
their respective high watermarks plus a small delta to lower the
chance of KSWAPD_HIGH_WMARK_HIT_QUICKLY. Note that this only applies
to order-0 reclaim, meaning compaction-induced reclaim can still run
wild (which is a different problem).

On Android, launching 55 apps sequentially:
           Before     After      Change
  pgpgin   838377172  802955040  -4%
  pgpgout  38037080   34336300   -10%

[1] https://lore.kernel.org/20221222041905.2431096-1-yuzhao@xxxxxxxxxx/

Fixes: a579086c99ed ("mm: multi-gen LRU: remove eviction fairness safeguard")
Signed-off-by: Yu Zhao <yuzhao@xxxxxxxxxx>
Reported-by: Charan Teja Kalla <quic_charante@xxxxxxxxxxx>
Reported-by: Jaroslav Pulchart <jaroslav.pulchart@xxxxxxxxxxxx>
Closes: https://lore.kernel.org/CAK8fFZ4DY+GtBA40Pm7Nn5xCHy+51w3sfxPqkqpqakSXYyX+Wg@xxxxxxxxxxxxxx/
Tested-by: Jaroslav Pulchart <jaroslav.pulchart@xxxxxxxxxxxx>
Tested-by: Kalesh Singh <kaleshsingh@xxxxxxxxxx>
Cc: stable@xxxxxxxxxxxxxxx
---
mm/vmscan.c | 36 ++++++++++++++++++++++++++++--------
1 file changed, 28 insertions(+), 8 deletions(-)

diff --git a/mm/vmscan.c b/mm/vmscan.c
index e67631c60ac0..10e964cd0efe 100644
--- a/mm/vmscan.c
+++ b/mm/vmscan.c
@@ -4676,20 +4676,41 @@ static long get_nr_to_scan(struct lruvec *lruvec, struct scan_control *sc, bool
return try_to_inc_max_seq(lruvec, max_seq, sc, can_swap, false) ? -1 : 0;
}

-static unsigned long get_nr_to_reclaim(struct scan_control *sc)
+static bool should_abort_scan(struct lruvec *lruvec, struct scan_control *sc)
{
+ int i;
+ enum zone_watermarks mark;
+
/* don't abort memcg reclaim to ensure fairness */
if (!root_reclaim(sc))
- return -1;
+ return false;

- return max(sc->nr_to_reclaim, compact_gap(sc->order));
+ if (sc->nr_reclaimed >= max(sc->nr_to_reclaim, compact_gap(sc->order)))
+ return true;
+
+ /* check the order to exclude compaction-induced reclaim */
+ if (!current_is_kswapd() || sc->order)
+ return false;
+
+ mark = sysctl_numa_balancing_mode & NUMA_BALANCING_MEMORY_TIERING ?
+ WMARK_PROMO : WMARK_HIGH;
+
+ for (i = 0; i <= sc->reclaim_idx; i++) {
+ struct zone *zone = lruvec_pgdat(lruvec)->node_zones + i;
+ unsigned long size = wmark_pages(zone, mark) + MIN_LRU_BATCH;
+
+ if (managed_zone(zone) && !zone_watermark_ok(zone, 0, size, sc->reclaim_idx, 0))
+ return false;
+ }
+
+ /* kswapd should abort if all eligible zones are safe */
+ return true;
}

static bool try_to_shrink_lruvec(struct lruvec *lruvec, struct scan_control *sc)
{
long nr_to_scan;
unsigned long scanned = 0;
- unsigned long nr_to_reclaim = get_nr_to_reclaim(sc);
int swappiness = get_swappiness(lruvec, sc);

/* clean file folios are more likely to exist */
@@ -4711,7 +4732,7 @@ static bool try_to_shrink_lruvec(struct lruvec *lruvec, struct scan_control *sc)
if (scanned >= nr_to_scan)
break;

- if (sc->nr_reclaimed >= nr_to_reclaim)
+ if (should_abort_scan(lruvec, sc))
break;

cond_resched();
@@ -4772,7 +4793,6 @@ static void shrink_many(struct pglist_data *pgdat, struct scan_control *sc)
struct lru_gen_folio *lrugen;
struct mem_cgroup *memcg;
const struct hlist_nulls_node *pos;
- unsigned long nr_to_reclaim = get_nr_to_reclaim(sc);

bin = first_bin = get_random_u32_below(MEMCG_NR_BINS);
restart:
@@ -4805,7 +4825,7 @@ static void shrink_many(struct pglist_data *pgdat, struct scan_control *sc)

rcu_read_lock();

- if (sc->nr_reclaimed >= nr_to_reclaim)
+ if (should_abort_scan(lruvec, sc))
break;
}

@@ -4816,7 +4836,7 @@ static void shrink_many(struct pglist_data *pgdat, struct scan_control *sc)

mem_cgroup_put(memcg);

- if (sc->nr_reclaimed >= nr_to_reclaim)
+ if (!is_a_nulls(pos))
return;

/* restart if raced with lru_gen_rotate_memcg() */
--
2.43.0.472.g3155946c3a-goog