[RFC PATCH v1 09/11] mm: Use kswapd to demote pages when toptier memory is tight

From: Tim Chen
Date: Mon Apr 05 2021 - 14:09:38 EST


Demote pages from memory cgroups that have excess
toptier memory usage when top tier memory is tight.

When free top tier memory in a node falls below the fraction
"toptier_scale_factor/10000" of the node's overall toptier memory, kswapd
reclaims top tier memory from those mem cgroups that have exceeded their
toptier memory soft limit by demoting their top tier pages to a
lower memory tier.

Signed-off-by: Tim Chen <tim.c.chen@xxxxxxxxxxxxxxx>
---
Documentation/admin-guide/sysctl/vm.rst | 12 +++++
include/linux/mmzone.h | 2 +
mm/page_alloc.c | 14 +++++
mm/vmscan.c | 69 ++++++++++++++++++++++++-
4 files changed, 96 insertions(+), 1 deletion(-)

diff --git a/Documentation/admin-guide/sysctl/vm.rst b/Documentation/admin-guide/sysctl/vm.rst
index 9de3847c3469..6b49e2e90953 100644
--- a/Documentation/admin-guide/sysctl/vm.rst
+++ b/Documentation/admin-guide/sysctl/vm.rst
@@ -74,6 +74,7 @@ Currently, these files are in /proc/sys/vm:
- vfs_cache_pressure
- watermark_boost_factor
- watermark_scale_factor
+- toptier_scale_factor
- zone_reclaim_mode


@@ -962,6 +963,17 @@ too small for the allocation bursts occurring in the system. This knob
can then be used to tune kswapd aggressiveness accordingly.


+toptier_scale_factor
+====================
+
+This factor controls when kswapd wakes up to demote pages of those
+cgroups that have exceeded their memory soft limit.
+
+The unit is in fractions of 10,000. The default value of 2000 means
+that if less than 20% of the top tier memory in the node/system is
+free, we will start to demote pages of those memory cgroups
+that have exceeded their soft memory limit.
+
zone_reclaim_mode
=================

diff --git a/include/linux/mmzone.h b/include/linux/mmzone.h
index bbe649c4fdee..4ee0073d255f 100644
--- a/include/linux/mmzone.h
+++ b/include/linux/mmzone.h
@@ -332,12 +332,14 @@ enum zone_watermarks {
WMARK_MIN,
WMARK_LOW,
WMARK_HIGH,
+ WMARK_TOPTIER,
NR_WMARK
};

#define min_wmark_pages(z) (z->_watermark[WMARK_MIN] + z->watermark_boost)
#define low_wmark_pages(z) (z->_watermark[WMARK_LOW] + z->watermark_boost)
#define high_wmark_pages(z) (z->_watermark[WMARK_HIGH] + z->watermark_boost)
+#define toptier_wmark_pages(z) (z->_watermark[WMARK_TOPTIER] + z->watermark_boost)
#define wmark_pages(z, i) (z->_watermark[i] + z->watermark_boost)

struct per_cpu_pages {
diff --git a/mm/page_alloc.c b/mm/page_alloc.c
index 471a2c342c4f..20f3caee60f3 100644
--- a/mm/page_alloc.c
+++ b/mm/page_alloc.c
@@ -7964,6 +7964,20 @@ static void __setup_per_zone_wmarks(void)
zone->_watermark[WMARK_LOW] = min_wmark_pages(zone) + tmp;
zone->_watermark[WMARK_HIGH] = min_wmark_pages(zone) + tmp * 2;

+ tmp = mult_frac(zone_managed_pages(zone),
+ toptier_scale_factor, 10000);
+ /*
+ * Clamp toptier watermark between twice high watermark
+ * and max managed pages.
+ */
+ if (tmp < 2 * zone->_watermark[WMARK_HIGH])
+ tmp = 2 * zone->_watermark[WMARK_HIGH];
+ if (tmp > zone_managed_pages(zone))
+ tmp = zone_managed_pages(zone);
+ zone->_watermark[WMARK_TOPTIER] = tmp;
+
+ zone->watermark_boost = 0;
+
spin_unlock_irqrestore(&zone->lock, flags);
}

diff --git a/mm/vmscan.c b/mm/vmscan.c
index 11bb0c6fa524..270880c8baef 100644
--- a/mm/vmscan.c
+++ b/mm/vmscan.c
@@ -185,6 +185,7 @@ static void set_task_reclaim_state(struct task_struct *task,

static LIST_HEAD(shrinker_list);
static DECLARE_RWSEM(shrinker_rwsem);
+int toptier_scale_factor = 2000;

#ifdef CONFIG_MEMCG
/*
@@ -3624,6 +3625,34 @@ static bool pgdat_balanced(pg_data_t *pgdat, int order, int highest_zoneidx)
return false;
}

+/*
+ * Check whether free memory in this node's ZONE_NORMAL is at or above the
+ * toptier watermark.  The check is skipped (reported as balanced) when the
+ * node is not in N_TOPTIER, has no next demotion node, the request is not
+ * order-0, or classzone_idx is below ZONE_NORMAL.
+ */
+static bool pgdat_toptier_balanced(pg_data_t *pgdat, int order, int classzone_idx)
+{
+	unsigned long mark;
+	struct zone *zone;
+
+	if (!node_state(pgdat->node_id, N_TOPTIER) ||
+	    next_demotion_node(pgdat->node_id) == -1 ||
+	    order > 0 || classzone_idx < ZONE_NORMAL)
+		return true;
+
+	zone = pgdat->node_zones + ZONE_NORMAL;
+	if (!managed_zone(zone))
+		return true;
+
+	/* Never require more free pages than the zone manages. */
+	mark = min(toptier_wmark_pages(zone),
+		   zone_managed_pages(zone));
+
+	/* Balanced only while free pages have not dropped below the mark. */
+	return zone_page_state(zone, NR_FREE_PAGES) >= mark;
+}
+
/* Clear pgdat state for congested, dirty or under writeback. */
static void clear_pgdat_congested(pg_data_t *pgdat)
{
@@ -4049,6 +4078,39 @@ static void kswapd_try_to_sleep(pg_data_t *pgdat, int alloc_order, int reclaim_o
finish_wait(&pgdat->kswapd_wait, &wait);
}

+/*
+ * Run soft limit reclaim with N_TOPTIER on this node when its toptier
+ * watermark is not met (per the patch description, this demotes pages of
+ * cgroups over their toptier soft limit).  Returns true when
+ * prepare_kswapd_sleep() says kswapd can sleep and it is not being stopped.
+ */
+static bool toptier_soft_reclaim(pg_data_t *pgdat,
+				 unsigned int reclaim_order,
+				 unsigned int classzone_idx)
+{
+	unsigned long nr_soft_scanned = 0;
+	struct scan_control sc = {
+		.gfp_mask = GFP_KERNEL,
+		.order = reclaim_order,
+		.may_unmap = 1,
+	};
+
+	if (!node_state(pgdat->node_id, N_TOPTIER) || kthread_should_stop())
+		return false;
+
+	set_task_reclaim_state(current, &sc.reclaim_state);
+
+	/* The toptier watermark is only ever checked for order 0. */
+	if (!pgdat_toptier_balanced(pgdat, 0, classzone_idx))
+		mem_cgroup_soft_limit_reclaim(pgdat, 0, GFP_KERNEL,
+					      &nr_soft_scanned, N_TOPTIER);
+
+	set_task_reclaim_state(current, NULL);
+
+	return prepare_kswapd_sleep(pgdat, reclaim_order, classzone_idx) &&
+	       !kthread_should_stop();
+}
+
/*
* The background pageout daemon, started as a kernel thread
* from the init process.
@@ -4108,6 +4170,10 @@ static int kswapd(void *p)
WRITE_ONCE(pgdat->kswapd_order, 0);
WRITE_ONCE(pgdat->kswapd_highest_zoneidx, MAX_NR_ZONES);

+ if (toptier_soft_reclaim(pgdat, 0,
+ highest_zoneidx))
+ goto kswapd_try_sleep;
+
ret = try_to_freeze();
if (kthread_should_stop())
break;
@@ -4173,7 +4239,8 @@ void wakeup_kswapd(struct zone *zone, gfp_t gfp_flags, int order,

/* Hopeless node, leave it to direct reclaim if possible */
if (pgdat->kswapd_failures >= MAX_RECLAIM_RETRIES ||
- (pgdat_balanced(pgdat, order, highest_zoneidx) &&
+ (pgdat_toptier_balanced(pgdat, 0, highest_zoneidx) &&
+ pgdat_balanced(pgdat, order, highest_zoneidx) &&
!pgdat_watermark_boosted(pgdat, highest_zoneidx))) {
/*
* There may be plenty of free memory available, but it's too
--
2.20.1