Re: Low TCP throughput due to vmpressure with swap enabled

From: Johannes Weiner
Date: Tue Dec 06 2022 - 14:00:27 EST


On Mon, Dec 05, 2022 at 04:50:46PM -0800, Ivan Babrou wrote:
> And now I can see plenty of this:
>
> [ 108.156707][ T5175] socket pressure[2]: 4294673429
> [ 108.157050][ T5175] socket pressure[2]: 4294673429
> [ 108.157301][ T5175] socket pressure[2]: 4294673429
> [ 108.157581][ T5175] socket pressure[2]: 4294673429
> [ 108.157874][ T5175] socket pressure[2]: 4294673429
> [ 108.158254][ T5175] socket pressure[2]: 4294673429
>
> I think the first result below is to blame:
>
> $ rg '.->socket_pressure' mm
> mm/memcontrol.c
> 5280: memcg->socket_pressure = jiffies;
> 7198: memcg->socket_pressure = 0;
> 7201: memcg->socket_pressure = 1;
> 7211: memcg->socket_pressure = 0;
> 7215: memcg->socket_pressure = 1;

Hoo boy, that's a silly mistake indeed. Thanks for tracking it down.

> While we set socket_pressure to either zero or one in
> mem_cgroup_charge_skmem, it is still initialized to jiffies on memcg
> creation. Zero seems like a more appropriate starting point. With that
> change I see it working as expected with no TCP speed bumps. My
> ebpf_exporter program also looks happy and reports zero clamps in my
> brief testing.

Excellent, now this behavior makes sense.

> I also think we should downgrade socket_pressure from "unsigned long"
> to "bool", as it only holds zero and one now.

Sounds good to me!

Attaching the updated patch below. If nobody has any objections, I'll
add a proper changelog, reported-bys, sign-off etc and send it out.

---
include/linux/memcontrol.h | 8 +++---
include/linux/vmpressure.h | 7 ++---
mm/memcontrol.c | 20 +++++++++----
mm/vmpressure.c | 58 ++++++--------------------------------
mm/vmscan.c | 15 +---------
5 files changed, 30 insertions(+), 78 deletions(-)

diff --git a/include/linux/memcontrol.h b/include/linux/memcontrol.h
index e1644a24009c..ef1c388be5b3 100644
--- a/include/linux/memcontrol.h
+++ b/include/linux/memcontrol.h
@@ -283,11 +283,11 @@ struct mem_cgroup {
atomic_long_t memory_events[MEMCG_NR_MEMORY_EVENTS];
atomic_long_t memory_events_local[MEMCG_NR_MEMORY_EVENTS];

- unsigned long socket_pressure;
+ /* Socket memory allocations have failed */
+ bool socket_pressure;

/* Legacy tcp memory accounting */
bool tcpmem_active;
- int tcpmem_pressure;

#ifdef CONFIG_MEMCG_KMEM
int kmemcg_id;
@@ -1701,10 +1701,10 @@ void mem_cgroup_sk_alloc(struct sock *sk);
void mem_cgroup_sk_free(struct sock *sk);
static inline bool mem_cgroup_under_socket_pressure(struct mem_cgroup *memcg)
{
- if (!cgroup_subsys_on_dfl(memory_cgrp_subsys) && memcg->tcpmem_pressure)
+ if (!cgroup_subsys_on_dfl(memory_cgrp_subsys) && memcg->socket_pressure)
return true;
do {
- if (time_before(jiffies, READ_ONCE(memcg->socket_pressure)))
+ if (memcg->socket_pressure)
return true;
} while ((memcg = parent_mem_cgroup(memcg)));
return false;
diff --git a/include/linux/vmpressure.h b/include/linux/vmpressure.h
index 6a2f51ebbfd3..20d93de37a17 100644
--- a/include/linux/vmpressure.h
+++ b/include/linux/vmpressure.h
@@ -11,9 +11,6 @@
#include <linux/eventfd.h>

struct vmpressure {
- unsigned long scanned;
- unsigned long reclaimed;
-
unsigned long tree_scanned;
unsigned long tree_reclaimed;
/* The lock is used to keep the scanned/reclaimed above in sync. */
@@ -30,7 +27,7 @@ struct vmpressure {
struct mem_cgroup;

#ifdef CONFIG_MEMCG
-extern void vmpressure(gfp_t gfp, struct mem_cgroup *memcg, bool tree,
+extern void vmpressure(gfp_t gfp, struct mem_cgroup *memcg,
unsigned long scanned, unsigned long reclaimed);
extern void vmpressure_prio(gfp_t gfp, struct mem_cgroup *memcg, int prio);

@@ -44,7 +41,7 @@ extern int vmpressure_register_event(struct mem_cgroup *memcg,
extern void vmpressure_unregister_event(struct mem_cgroup *memcg,
struct eventfd_ctx *eventfd);
#else
-static inline void vmpressure(gfp_t gfp, struct mem_cgroup *memcg, bool tree,
+static inline void vmpressure(gfp_t gfp, struct mem_cgroup *memcg,
unsigned long scanned, unsigned long reclaimed) {}
static inline void vmpressure_prio(gfp_t gfp, struct mem_cgroup *memcg,
int prio) {}
diff --git a/mm/memcontrol.c b/mm/memcontrol.c
index 2d8549ae1b30..0d4b9dbe775a 100644
--- a/mm/memcontrol.c
+++ b/mm/memcontrol.c
@@ -5277,7 +5277,6 @@ static struct mem_cgroup *mem_cgroup_alloc(void)
vmpressure_init(&memcg->vmpressure);
INIT_LIST_HEAD(&memcg->event_list);
spin_lock_init(&memcg->event_list_lock);
- memcg->socket_pressure = jiffies;
#ifdef CONFIG_MEMCG_KMEM
memcg->kmemcg_id = -1;
INIT_LIST_HEAD(&memcg->objcg_list);
@@ -7195,10 +7194,10 @@ bool mem_cgroup_charge_skmem(struct mem_cgroup *memcg, unsigned int nr_pages,
struct page_counter *fail;

if (page_counter_try_charge(&memcg->tcpmem, nr_pages, &fail)) {
- memcg->tcpmem_pressure = 0;
+ memcg->socket_pressure = false;
return true;
}
- memcg->tcpmem_pressure = 1;
+ memcg->socket_pressure = true;
if (gfp_mask & __GFP_NOFAIL) {
page_counter_charge(&memcg->tcpmem, nr_pages);
return true;
@@ -7206,12 +7205,21 @@ bool mem_cgroup_charge_skmem(struct mem_cgroup *memcg, unsigned int nr_pages,
return false;
}

- if (try_charge(memcg, gfp_mask, nr_pages) == 0) {
- mod_memcg_state(memcg, MEMCG_SOCK, nr_pages);
- return true;
+ if (try_charge(memcg, gfp_mask & ~__GFP_NOFAIL, nr_pages) == 0) {
+ memcg->socket_pressure = false;
+ goto success;
+ }
+ memcg->socket_pressure = true;
+ if (gfp_mask & __GFP_NOFAIL) {
+ try_charge(memcg, gfp_mask, nr_pages);
+ goto success;
}

return false;
+
+success:
+ mod_memcg_state(memcg, MEMCG_SOCK, nr_pages);
+ return true;
}

/**
diff --git a/mm/vmpressure.c b/mm/vmpressure.c
index b52644771cc4..4cec90711cf4 100644
--- a/mm/vmpressure.c
+++ b/mm/vmpressure.c
@@ -219,7 +219,6 @@ static void vmpressure_work_fn(struct work_struct *work)
* vmpressure() - Account memory pressure through scanned/reclaimed ratio
* @gfp: reclaimer's gfp mask
* @memcg: cgroup memory controller handle
- * @tree: legacy subtree mode
* @scanned: number of pages scanned
* @reclaimed: number of pages reclaimed
*
@@ -227,16 +226,9 @@ static void vmpressure_work_fn(struct work_struct *work)
* "instantaneous" memory pressure (scanned/reclaimed ratio). The raw
* pressure index is then further refined and averaged over time.
*
- * If @tree is set, vmpressure is in traditional userspace reporting
- * mode: @memcg is considered the pressure root and userspace is
- * notified of the entire subtree's reclaim efficiency.
- *
- * If @tree is not set, reclaim efficiency is recorded for @memcg, and
- * only in-kernel users are notified.
- *
* This function does not return any value.
*/
-void vmpressure(gfp_t gfp, struct mem_cgroup *memcg, bool tree,
+void vmpressure(gfp_t gfp, struct mem_cgroup *memcg,
unsigned long scanned, unsigned long reclaimed)
{
struct vmpressure *vmpr;
@@ -271,46 +263,14 @@ void vmpressure(gfp_t gfp, struct mem_cgroup *memcg, bool tree,
if (!scanned)
return;

- if (tree) {
- spin_lock(&vmpr->sr_lock);
- scanned = vmpr->tree_scanned += scanned;
- vmpr->tree_reclaimed += reclaimed;
- spin_unlock(&vmpr->sr_lock);
-
- if (scanned < vmpressure_win)
- return;
- schedule_work(&vmpr->work);
- } else {
- enum vmpressure_levels level;
-
- /* For now, no users for root-level efficiency */
- if (!memcg || mem_cgroup_is_root(memcg))
- return;
-
- spin_lock(&vmpr->sr_lock);
- scanned = vmpr->scanned += scanned;
- reclaimed = vmpr->reclaimed += reclaimed;
- if (scanned < vmpressure_win) {
- spin_unlock(&vmpr->sr_lock);
- return;
- }
- vmpr->scanned = vmpr->reclaimed = 0;
- spin_unlock(&vmpr->sr_lock);
+ spin_lock(&vmpr->sr_lock);
+ scanned = vmpr->tree_scanned += scanned;
+ vmpr->tree_reclaimed += reclaimed;
+ spin_unlock(&vmpr->sr_lock);

- level = vmpressure_calc_level(scanned, reclaimed);
-
- if (level > VMPRESSURE_LOW) {
- /*
- * Let the socket buffer allocator know that
- * we are having trouble reclaiming LRU pages.
- *
- * For hysteresis keep the pressure state
- * asserted for a second in which subsequent
- * pressure events can occur.
- */
- WRITE_ONCE(memcg->socket_pressure, jiffies + HZ);
- }
- }
+ if (scanned < vmpressure_win)
+ return;
+ schedule_work(&vmpr->work);
}

/**
@@ -340,7 +300,7 @@ void vmpressure_prio(gfp_t gfp, struct mem_cgroup *memcg, int prio)
* to the vmpressure() basically means that we signal 'critical'
* level.
*/
- vmpressure(gfp, memcg, true, vmpressure_win, 0);
+ vmpressure(gfp, memcg, vmpressure_win, 0);
}

#define MAX_VMPRESSURE_ARGS_LEN (strlen("critical") + strlen("hierarchy") + 2)
diff --git a/mm/vmscan.c b/mm/vmscan.c
index 04d8b88e5216..d348366d58d4 100644
--- a/mm/vmscan.c
+++ b/mm/vmscan.c
@@ -6035,8 +6035,6 @@ static void shrink_node_memcgs(pg_data_t *pgdat, struct scan_control *sc)
memcg = mem_cgroup_iter(target_memcg, NULL, NULL);
do {
struct lruvec *lruvec = mem_cgroup_lruvec(memcg, pgdat);
- unsigned long reclaimed;
- unsigned long scanned;

/*
* This loop can become CPU-bound when target memcgs
@@ -6068,20 +6066,9 @@ static void shrink_node_memcgs(pg_data_t *pgdat, struct scan_control *sc)
memcg_memory_event(memcg, MEMCG_LOW);
}

- reclaimed = sc->nr_reclaimed;
- scanned = sc->nr_scanned;
-
shrink_lruvec(lruvec, sc);
-
shrink_slab(sc->gfp_mask, pgdat->node_id, memcg,
sc->priority);
-
- /* Record the group's reclaim efficiency */
- if (!sc->proactive)
- vmpressure(sc->gfp_mask, memcg, false,
- sc->nr_scanned - scanned,
- sc->nr_reclaimed - reclaimed);
-
} while ((memcg = mem_cgroup_iter(target_memcg, memcg, NULL)));
}

@@ -6111,7 +6098,7 @@ static void shrink_node(pg_data_t *pgdat, struct scan_control *sc)

/* Record the subtree's reclaim efficiency */
if (!sc->proactive)
- vmpressure(sc->gfp_mask, sc->target_mem_cgroup, true,
+ vmpressure(sc->gfp_mask, sc->target_mem_cgroup,
sc->nr_scanned - nr_scanned,
sc->nr_reclaimed - nr_reclaimed);

--
2.38.1