[RFC PATCH 1/2] mm: multi-gen LRU: periodic aging

From: Yuanchu Xie
Date: Wed Dec 14 2022 - 17:52:02 EST


Periodically age MGLRU-enabled lruvecs to turn MGLRU generations into
time-based working set information. This includes an interface to set
the periodic aging interval and a new kthread to perform aging.

memory.periodic_aging: a new root-level-only file in cgroupfs.
Writing a non-zero value to memory.periodic_aging sets the aging interval
(in seconds) and opts into periodic aging; writing 0 stops periodic aging.
kold: a new kthread that ages memcgs based on the set aging interval.

Signed-off-by: Yuanchu Xie <yuanchu@xxxxxxxxxx>
---
include/linux/kold.h | 44 ++++++++++++
include/linux/mmzone.h | 4 +-
mm/Makefile | 3 +
mm/kold.c | 150 +++++++++++++++++++++++++++++++++++++++++
mm/memcontrol.c | 52 ++++++++++++++
mm/vmscan.c | 35 +++++++++-
6 files changed, 286 insertions(+), 2 deletions(-)
create mode 100644 include/linux/kold.h
create mode 100644 mm/kold.c

diff --git a/include/linux/kold.h b/include/linux/kold.h
new file mode 100644
index 000000000000..10b0dbe09a5c
--- /dev/null
+++ b/include/linux/kold.h
@@ -0,0 +1,44 @@
+/* SPDX-License-Identifier: GPL-2.0-or-later
+ *
+ * Periodic aging for multi-gen LRU
+ *
+ * Copyright (C) 2022 Yuanchu Xie <yuanchu@xxxxxxxxxx>
+ */
+#ifndef KOLD_H_
+#define KOLD_H_
+
+#include <linux/memcontrol.h>
+
+/* Counters that kold_get_stats() reports about the kold aging thread. */
+struct kold_stats {
+ /*
+  * Number of times kold was "late": a whole aging interval elapsed
+  * while it was aging, without it ever going to sleep. The counter is
+  * evaluated once per elapsed aging interval.
+  */
+ unsigned int late_count;
+};
+
+#if defined(CONFIG_MEMCG) && defined(CONFIG_LRU_GEN)
+/*
+ * Set the periodic aging interval, in seconds. A non-zero value starts the
+ * kold kthread if it is not running yet; zero stops it.
+ * Returns 0 on success or a negative errno.
+ */
+int kold_set_interval(unsigned int interval);
+
+/* Returns the current aging interval in seconds; 0 means aging is off. */
+unsigned int kold_get_interval(void);
+
+/* Copies kold's counters into @stats. Returns 0 on success. */
+int kold_get_stats(struct kold_stats *stats);
+#else
+/*
+ * mm/kold.o is only built with both CONFIG_MEMCG and CONFIG_LRU_GEN, so
+ * every other configuration gets these stubs. They have to be static
+ * inline: an ordinary function definition in a header is emitted by every
+ * file that includes it and fails the link with duplicate symbols.
+ */
+static inline int kold_set_interval(unsigned int interval)
+{
+	return 0;
+}
+
+static inline unsigned int kold_get_interval(void)
+{
+	return 0;
+}
+
+static inline int kold_get_stats(struct kold_stats *stats)
+{
+	return -1;
+}
+#endif /* CONFIG_MEMCG && CONFIG_LRU_GEN */
+
+/* returns the creation timestamp of the youngest generation */
+unsigned long lru_gen_force_age_lruvec(struct mem_cgroup *memcg, int nid,
+				       unsigned long min_ttl);
+
+#endif /* KOLD_H_ */
diff --git a/include/linux/mmzone.h b/include/linux/mmzone.h
index 5f74891556f3..929c777b826a 100644
--- a/include/linux/mmzone.h
+++ b/include/linux/mmzone.h
@@ -1218,7 +1218,9 @@ typedef struct pglist_data {

#ifdef CONFIG_LRU_GEN
/* kswap mm walk data */
- struct lru_gen_mm_walk mm_walk;
+ struct lru_gen_mm_walk mm_walk;
+ /* kold periodic aging walk data */
+ struct lru_gen_mm_walk kold_mm_walk;
#endif

CACHELINE_PADDING(_pad2_);
diff --git a/mm/Makefile b/mm/Makefile
index 8e105e5b3e29..8bd554a6eb7d 100644
--- a/mm/Makefile
+++ b/mm/Makefile
@@ -98,6 +98,9 @@ obj-$(CONFIG_DEVICE_MIGRATION) += migrate_device.o
obj-$(CONFIG_TRANSPARENT_HUGEPAGE) += huge_memory.o khugepaged.o
obj-$(CONFIG_PAGE_COUNTER) += page_counter.o
obj-$(CONFIG_MEMCG) += memcontrol.o vmpressure.o
+ifdef CONFIG_LRU_GEN
+obj-$(CONFIG_MEMCG) += kold.o
+endif
ifdef CONFIG_SWAP
obj-$(CONFIG_MEMCG) += swap_cgroup.o
endif
diff --git a/mm/kold.c b/mm/kold.c
new file mode 100644
index 000000000000..094574177968
--- /dev/null
+++ b/mm/kold.c
@@ -0,0 +1,150 @@
+// SPDX-License-Identifier: GPL-2.0
+/*
+ * Copyright (C) 2022 Yuanchu Xie <yuanchu@xxxxxxxxxx>
+ */
+#include <linux/stddef.h>
+#include <linux/topology.h>
+#include <linux/cpumask.h>
+#include <linux/mmzone.h>
+#include <linux/nodemask.h>
+#include <linux/sched/mm.h>
+#include <linux/swap.h>
+#include <linux/memcontrol.h>
+#include <linux/err.h>
+#include <linux/jiffies.h>
+#include <linux/sched.h>
+#include <linux/cache.h>
+#include <linux/init.h>
+#include <linux/mutex.h>
+#include <linux/kold.h>
+#include <linux/kthread.h>
+#include <linux/module.h>
+#include <linux/mm_inline.h>
+
+static struct task_struct *kold_thread __read_mostly;
+/* protects kold_thread */
+static DEFINE_MUTEX(kold_mutex);
+
+static unsigned int aging_interval __read_mostly;
+static unsigned int late_count;
+
+/*
+ * Best-effort: restrict the calling task to the CPUs of node @nid.
+ *
+ * Nothing is changed when the node has no online CPU (e.g. a memory-only
+ * node), and a failure of set_cpus_allowed_ptr() is deliberately ignored.
+ *
+ * The node's own mask is passed through instead of being ANDed into an
+ * on-stack struct cpumask, which can take a large amount of kernel stack
+ * when NR_CPUS is big.
+ */
+static void try_move_current_to_node(int nid)
+{
+	const struct cpumask *node_cpus = cpumask_of_node(nid);
+
+	if (cpumask_intersects(node_cpus, cpu_online_mask))
+		set_cpus_allowed_ptr(current, node_cpus);
+}
+
+/*
+ * Main loop of the kold kthread.
+ *
+ * Every pass visits each node that has memory and, on that node, each memcg
+ * handed out by mem_cgroup_iter(), calling lru_gen_force_age_lruvec() with
+ * the current aging interval. The thread then sleeps until the earliest
+ * time at which one of the youngest generations it saw becomes a full
+ * interval old. Returns 0 once kthread_should_stop() reports true.
+ */
+static int kold_run(void *none)
+{
+ int nid;
+ unsigned int flags;
+ /* start of the interval currently being accounted for late_count */
+ unsigned long last_interval_start_time = jiffies;
+ /* whether the thread slept since late_count was last evaluated */
+ bool sleep_since_last_full_scan = false;
+ struct mem_cgroup *memcg;
+ struct reclaim_state reclaim_state = {};
+
+ while (!kthread_should_stop()) {
+ /* aging_interval is in seconds; convert to jiffies */
+ unsigned long interval =
+ (unsigned long)(READ_ONCE(aging_interval)) * HZ;
+ /* upper bound; lowered below if some lruvec needs aging sooner */
+ unsigned long next_wakeup_tick = jiffies + interval;
+ long timeout_ticks;
+
+ /*
+  * NOTE(review): presumably mirrors how reclaim contexts set up
+  * task state before walking page tables - confirm against
+  * vmscan.c.
+  */
+ current->reclaim_state = &reclaim_state;
+ flags = memalloc_noreclaim_save();
+
+ for_each_node_state(nid, N_MEMORY) {
+ pg_data_t *pgdat = NODE_DATA(nid);
+
+ try_move_current_to_node(nid);
+ /* use kold's own per-node walk data, not kswapd's */
+ reclaim_state.mm_walk = &pgdat->kold_mm_walk;
+
+ memcg = mem_cgroup_iter(NULL, NULL, NULL);
+ do {
+ unsigned long young_timestamp =
+ lru_gen_force_age_lruvec(memcg, nid,
+ interval);
+
+ /* wake up when the first youngest generation expires */
+ if (time_before(young_timestamp + interval,
+ next_wakeup_tick)) {
+ next_wakeup_tick = young_timestamp + interval;
+ }
+ } while ((memcg = mem_cgroup_iter(NULL, memcg, NULL)));
+ }
+
+ memalloc_noreclaim_restore(flags);
+ current->reclaim_state = NULL;
+
+ /* late_count stats update */
+ /*
+  * Once a full interval has passed since the accounting start,
+  * advance the start by one interval and count the thread as
+  * late if it never slept in the meantime.
+  */
+ if (time_is_before_jiffies(last_interval_start_time + interval)) {
+ last_interval_start_time += interval;
+ if (!sleep_since_last_full_scan) {
+ WRITE_ONCE(late_count,
+ READ_ONCE(late_count) + 1);
+ }
+ sleep_since_last_full_scan = false;
+ }
+
+ /* sleep until next aging */
+ /*
+  * Signed number of jiffies from now until next_wakeup_tick; zero
+  * or negative means the deadline already passed, so loop again
+  * without sleeping.
+  */
+ timeout_ticks = -(long)(jiffies - next_wakeup_tick);
+ if (timeout_ticks > 0 && timeout_ticks != MAX_SCHEDULE_TIMEOUT) {
+ sleep_since_last_full_scan = true;
+ schedule_timeout_idle(timeout_ticks);
+ }
+ }
+ return 0;
+}
+
+/* Snapshot kold's counters into @stats. Always returns 0. */
+int kold_get_stats(struct kold_stats *stats)
+{
+	unsigned int nr_late = READ_ONCE(late_count);
+
+	stats->late_count = nr_late;
+	return 0;
+}
+
+/* Report the aging interval most recently stored by kold_set_interval(). */
+unsigned int kold_get_interval(void)
+{
+	unsigned int cur_interval = READ_ONCE(aging_interval);
+
+	return cur_interval;
+}
+
+/*
+ * kold_set_interval - set the periodic aging interval and start/stop kold
+ * @interval: aging interval in seconds; 0 turns periodic aging off
+ *
+ * A non-zero @interval creates the kold kthread if it does not exist yet
+ * and wakes it, so that an interval change also applies to a thread that
+ * is in the middle of a sleep computed from the previous interval. A zero
+ * @interval stops the kthread, if any, before the interval is cleared.
+ *
+ * Return: 0 on success, -EOPNOTSUPP if the thread would have to be created
+ * while the multi-gen LRU is disabled, or the error from kthread_create().
+ */
+int kold_set_interval(unsigned int interval)
+{
+	int err = 0;
+
+	mutex_lock(&kold_mutex);
+
+	if (!interval) {
+		if (kold_thread) {
+			kthread_stop(kold_thread);
+			kold_thread = NULL;
+		}
+		WRITE_ONCE(aging_interval, 0);
+		goto unlock;
+	}
+
+	if (!kold_thread) {
+		struct task_struct *thread;
+
+		if (!lru_gen_enabled()) {
+			err = -EOPNOTSUPP;
+			goto unlock;
+		}
+
+		/* keep kold_thread NULL-or-valid; never store an ERR_PTR */
+		thread = kthread_create(kold_run, NULL, "kold");
+		if (IS_ERR(thread)) {
+			err = PTR_ERR(thread);
+			pr_err("kold: kthread_create(kold_run) failed: %d\n",
+			       err);
+			goto unlock;
+		}
+		kold_thread = thread;
+	}
+
+	/* publish the interval before the thread gets to read it */
+	WRITE_ONCE(aging_interval, interval);
+	/*
+	 * Starts a freshly created thread, or cuts short the sleep of a
+	 * running one so it recomputes its wakeup from the new interval.
+	 */
+	wake_up_process(kold_thread);
+
+unlock:
+	mutex_unlock(&kold_mutex);
+	return err;
+}
+
+/*
+ * Nothing is set up at init time: the kold kthread is only created on
+ * demand by kold_set_interval().
+ */
+static int __init kold_init(void)
+{
+ return 0;
+}
+
+module_init(kold_init);
diff --git a/mm/memcontrol.c b/mm/memcontrol.c
index 2d8549ae1b30..7d2fb3fc4580 100644
--- a/mm/memcontrol.c
+++ b/mm/memcontrol.c
@@ -63,6 +63,7 @@
#include <linux/resume_user_mode.h>
#include <linux/psi.h>
#include <linux/seq_buf.h>
+#include <linux/kold.h>
#include "internal.h"
#include <net/sock.h>
#include <net/ip.h>
@@ -6569,6 +6570,49 @@ static ssize_t memory_oom_group_write(struct kernfs_open_file *of,
return nbytes;
}

+#ifdef CONFIG_LRU_GEN
+/*
+ * seq_file show handler for memory.periodic_aging.
+ *
+ * Emits the aging interval and kold's late count, one "name value" pair per
+ * line. Returns 0, or the error from kold_get_stats() with nothing printed.
+ */
+static int memory_periodic_aging_show(struct seq_file *m, void *v)
+{
+	struct kold_stats kstats;
+	unsigned int cur_interval = kold_get_interval();
+	int ret = kold_get_stats(&kstats);
+
+	if (ret)
+		return ret;
+
+	seq_printf(m, "aging_interval %u\n", cur_interval);
+	seq_printf(m, "late_count %u\n", kstats.late_count);
+
+	return 0;
+}
+
+/*
+ * Write handler for memory.periodic_aging.
+ *
+ * Parses @buf as an unsigned integer (base auto-detected) and passes it to
+ * kold_set_interval().
+ *
+ * Returns @nbytes on success, -EOPNOTSUPP when the multi-gen LRU is
+ * disabled, the kstrtouint() error for malformed or out-of-range input, or
+ * the error from kold_set_interval().
+ */
+static ssize_t memory_periodic_aging_write(struct kernfs_open_file *of,
+					    char *buf, size_t nbytes,
+					    loff_t off)
+{
+	unsigned int new_interval;
+	int err;
+
+	if (!lru_gen_enabled())
+		return -EOPNOTSUPP;
+
+	/*
+	 * strstrip() never returns NULL, so its result needs no check; empty
+	 * or non-numeric input is rejected by kstrtouint() itself.
+	 */
+	err = kstrtouint(strstrip(buf), 0, &new_interval);
+	if (err)
+		return err;
+
+	err = kold_set_interval(new_interval);
+	if (err)
+		return err;
+
+	return nbytes;
+}
+#endif /* CONFIG_LRU_GEN */
+
static ssize_t memory_reclaim(struct kernfs_open_file *of, char *buf,
size_t nbytes, loff_t off)
{
@@ -6679,6 +6723,14 @@ static struct cftype memory_files[] = {
.flags = CFTYPE_NS_DELEGATABLE,
.write = memory_reclaim,
},
+#ifdef CONFIG_LRU_GEN
+ {
+ .name = "periodic_aging",
+ .flags = CFTYPE_ONLY_ON_ROOT,
+ .seq_show = memory_periodic_aging_show,
+ .write = memory_periodic_aging_write,
+ },
+#endif
{ } /* terminate */
};

diff --git a/mm/vmscan.c b/mm/vmscan.c
index 04d8b88e5216..0fea21366fc8 100644
--- a/mm/vmscan.c
+++ b/mm/vmscan.c
@@ -54,6 +54,7 @@
#include <linux/shmem_fs.h>
#include <linux/ctype.h>
#include <linux/debugfs.h>
+#include <linux/kold.h>

#include <asm/tlbflush.h>
#include <asm/div64.h>
@@ -5279,8 +5280,10 @@ static void lru_gen_change_state(bool enabled)

if (enabled)
static_branch_enable_cpuslocked(&lru_gen_caps[LRU_GEN_CORE]);
- else
+ else {
static_branch_disable_cpuslocked(&lru_gen_caps[LRU_GEN_CORE]);
+ kold_set_interval(0);
+ }

memcg = mem_cgroup_iter(NULL, NULL, NULL);
do {
@@ -5760,6 +5763,36 @@ static const struct file_operations lru_gen_ro_fops = {
.release = seq_release,
};

+/******************************************************************************
+ * periodic aging (kold)
+ ******************************************************************************/
+
+/*
+ * Age the lruvec of @memcg on node @nid if its youngest generation was
+ * created more than @min_ttl jiffies ago.
+ *
+ * Returns the creation timestamp of whichever generation is the youngest
+ * once the attempt is done.
+ */
+unsigned long lru_gen_force_age_lruvec(struct mem_cgroup *memcg, int nid,
+				       unsigned long min_ttl)
+{
+	struct lruvec *lruvec = get_lruvec(memcg, nid);
+	DEFINE_MAX_SEQ(lruvec);
+	unsigned long born =
+		READ_ONCE(lruvec->lrugen.timestamps[lru_gen_from_seq(max_seq)]);
+	unsigned long newest_seq;
+
+	if (time_is_before_jiffies(born + min_ttl)) {
+		struct scan_control sc = {
+			.gfp_mask = GFP_KERNEL,
+			.reclaim_idx = MAX_NR_ZONES - 1,
+			.may_writepage = true,
+			.may_unmap = true,
+			.may_swap = true,
+		};
+
+		try_to_inc_max_seq(lruvec, max_seq, &sc, true, true);
+	}
+
+	/* max_seq may have moved; look the youngest generation up again */
+	newest_seq = READ_ONCE(lruvec->lrugen.max_seq);
+
+	return READ_ONCE(lruvec->lrugen.timestamps[lru_gen_from_seq(newest_seq)]);
+}
+
/******************************************************************************
* initialization
******************************************************************************/
--
2.39.0.314.g84b9a713c41-goog