[PATCH] memcg: add interface to force disable swap

From: Jianlin Lv
Date: Sat Oct 07 2023 - 09:09:28 EST


From: Jianlin Lv <iecedge@xxxxxxxxx>

Global reclaim will swap even if swappiness is set to 0. In particular
cases, users wish to be able to completely disable swap for specific
processes. One scenario is that if JVM memory pages fall into swap,
performance drops noticeably and GC pauses tend to increase to levels
not tolerable by most applications.
If it is possible to disable swap-out only for specific processes, the
JVM GC pause issue can be addressed while, at the same time, memory
reclaim pressure remains manageable.

This patch adds a "memory.swap_force_disable" control file to support
disabling swap for non-root cgroups. When a process is associated with a
cgroup, 'echo 1 > memory.swap_force_disable' will forbid its anon pages
from being swapped out.
This patch also adds the read and write handlers of the control file.

Signed-off-by: Jianlin Lv <iecedge@xxxxxxxxx>
---
.../admin-guide/cgroup-v1/memory.rst | 15 ++++++++++
include/linux/memcontrol.h | 1 +
include/linux/swap.h | 15 ++++++++++
mm/memcontrol.c | 28 +++++++++++++++++++
mm/vmscan.c | 3 +-
5 files changed, 61 insertions(+), 1 deletion(-)

diff --git a/Documentation/admin-guide/cgroup-v1/memory.rst b/Documentation/admin-guide/cgroup-v1/memory.rst
index ff456871bf4b..be84b98bc6fe 100644
--- a/Documentation/admin-guide/cgroup-v1/memory.rst
+++ b/Documentation/admin-guide/cgroup-v1/memory.rst
@@ -86,6 +86,7 @@ Brief summary of control files.
memory.pressure_level set memory pressure notifications
memory.swappiness set/show swappiness parameter of vmscan
(See sysctl's vm.swappiness)
+ memory.swap_force_disable set/show force disable swap
memory.move_charge_at_immigrate set/show controls of moving charges
This knob is deprecated and shouldn't be
used.
@@ -615,6 +616,20 @@ enforces that 0 swappiness really prevents from any swapping even if
there is a swap storage available. This might lead to memcg OOM killer
if there are no file pages to reclaim.

+swap_force_disable allows a control group to disable swap even if swap
+storage is available. This feature is disabled by default. If you want to
+disable swap for specific processes, swap_force_disable can be set up with
+the following commands::
+
+ # cd /sys/fs/cgroup/memory/
+ # mkdir test
+ # cd test
+ # echo 1 > memory.swap_force_disable
+ # echo <PID> > cgroup.procs
+
+.. note::
+ swap_force_disable only takes effect for non-root cgroups.
+
5.4 failcnt
-----------

diff --git a/include/linux/memcontrol.h b/include/linux/memcontrol.h
index e4e24da16d2c..b26dcb0756c0 100644
--- a/include/linux/memcontrol.h
+++ b/include/linux/memcontrol.h
@@ -246,6 +246,7 @@ struct mem_cgroup {
int under_oom;

int swappiness;
+ int swap_force_disable;
/* OOM-Killer disable */
int oom_kill_disable;

diff --git a/include/linux/swap.h b/include/linux/swap.h
index 493487ed7c38..b202de576984 100644
--- a/include/linux/swap.h
+++ b/include/linux/swap.h
@@ -624,6 +624,21 @@ static inline int mem_cgroup_swappiness(struct mem_cgroup *mem)
}
#endif

+#ifdef CONFIG_MEMCG
+static inline int mem_cgroup_swap_force_disable(struct mem_cgroup *memcg)
+{
+ if (mem_cgroup_disabled() || mem_cgroup_is_root(memcg))
+ return 0;
+
+ return memcg->swap_force_disable;
+}
+#else
+static inline int mem_cgroup_swap_force_disable(struct mem_cgroup *memcg)
+{
+ return 0;
+}
+#endif
+
#if defined(CONFIG_SWAP) && defined(CONFIG_MEMCG) && defined(CONFIG_BLK_CGROUP)
void __folio_throttle_swaprate(struct folio *folio, gfp_t gfp);
static inline void folio_throttle_swaprate(struct folio *folio, gfp_t gfp)
diff --git a/mm/memcontrol.c b/mm/memcontrol.c
index 5b009b233ab8..024750444c79 100644
--- a/mm/memcontrol.c
+++ b/mm/memcontrol.c
@@ -4196,6 +4196,28 @@ static int mem_cgroup_swappiness_write(struct cgroup_subsys_state *css,
return 0;
}

+static u64 mem_cgroup_swap_force_disable_read(struct cgroup_subsys_state *css,
+ struct cftype *cft)
+{
+ struct mem_cgroup *memcg = mem_cgroup_from_css(css);
+
+ return mem_cgroup_swap_force_disable(memcg);
+}
+
+static int mem_cgroup_swap_force_disable_write(struct cgroup_subsys_state *css,
+ struct cftype *cft, u64 val)
+{
+ struct mem_cgroup *memcg = mem_cgroup_from_css(css);
+
+ /* cannot set to root cgroup and only 0 and 1 are allowed */
+ if (mem_cgroup_is_root(memcg) || !((val == 0) || (val == 1)))
+ return -EINVAL;
+
+ memcg->swap_force_disable = val;
+
+ return 0;
+}
+
static void __mem_cgroup_threshold(struct mem_cgroup *memcg, bool swap)
{
struct mem_cgroup_threshold_ary *t;
@@ -5064,6 +5086,11 @@ static struct cftype mem_cgroup_legacy_files[] = {
.read_u64 = mem_cgroup_swappiness_read,
.write_u64 = mem_cgroup_swappiness_write,
},
+ {
+ .name = "swap_force_disable",
+ .read_u64 = mem_cgroup_swap_force_disable_read,
+ .write_u64 = mem_cgroup_swap_force_disable_write,
+ },
{
.name = "move_charge_at_immigrate",
.read_u64 = mem_cgroup_move_charge_read,
@@ -5367,6 +5394,7 @@ mem_cgroup_css_alloc(struct cgroup_subsys_state *parent_css)
page_counter_set_high(&memcg->swap, PAGE_COUNTER_MAX);
if (parent) {
WRITE_ONCE(memcg->swappiness, mem_cgroup_swappiness(parent));
+ WRITE_ONCE(memcg->swap_force_disable, mem_cgroup_swap_force_disable(parent));
WRITE_ONCE(memcg->oom_kill_disable, READ_ONCE(parent->oom_kill_disable));

page_counter_init(&memcg->memory, &parent->memory);
diff --git a/mm/vmscan.c b/mm/vmscan.c
index 6f13394b112e..5fdb4ac07007 100644
--- a/mm/vmscan.c
+++ b/mm/vmscan.c
@@ -3029,6 +3029,7 @@ static void get_scan_count(struct lruvec *lruvec, struct scan_control *sc,
struct mem_cgroup *memcg = lruvec_memcg(lruvec);
unsigned long anon_cost, file_cost, total_cost;
int swappiness = mem_cgroup_swappiness(memcg);
+ int swap_force_disable = mem_cgroup_swap_force_disable(memcg);
u64 fraction[ANON_AND_FILE];
u64 denominator = 0; /* gcc */
enum scan_balance scan_balance;
@@ -3036,7 +3037,7 @@ static void get_scan_count(struct lruvec *lruvec, struct scan_control *sc,
enum lru_list lru;

/* If we have no swap space, do not bother scanning anon folios. */
- if (!sc->may_swap || !can_reclaim_anon_pages(memcg, pgdat->node_id, sc)) {
+ if (!sc->may_swap || !can_reclaim_anon_pages(memcg, pgdat->node_id, sc) || swap_force_disable) {
scan_balance = SCAN_FILE;
goto out;
}
--
2.34.1