[RFC PATCH 1/2] psi: introduce memory.pressure.stat

From: cgel . zte
Date: Sun Jul 31 2022 - 20:42:19 EST


From: cgel <cgel@xxxxxxxxxx>

Currently PSI accounts for all memory stalls in the system but does not
tell why a stall happened. This patch introduces a cgroup knob,
memory.pressure.stat, which breaks the memory stall time down by cause
for all memory events, together with a corresponding proc interface.

For a cgroup, the new memory.pressure.stat file shows:
kswapd: avg10=0.00 avg60=0.00 avg300=0.00 total=0
direct reclaim: avg10=0.00 avg60=0.00 avg300=0.12 total=42356
kcompacted: avg10=0.00 avg60=0.00 avg300=0.00 total=0
direct compact: avg10=0.00 avg60=0.00 avg300=0.00 total=0
cgroup reclaim: avg10=0.00 avg60=0.00 avg300=0.00 total=0
workingset thrashing: avg10=0.00 avg60=0.00 avg300=0.00 total=0

System-wide, a new proc file, pressure/memory_stat, is introduced with
the same format as the cgroup interface.
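
Both files are read like any other pressure interface, for example (the
cgroup path below is only illustrative):

  # cat /proc/pressure/memory_stat
  # cat /sys/fs/cgroup/<group>/memory.pressure.stat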

With this detailed information, if the system stalls mostly in kernel
compaction (the kcompacted line above), for example,
compaction_proactiveness can be raised so that proactive compaction
kicks in earlier.
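
For instance, assuming a kernel that has the vm.compaction_proactiveness
sysctl, it can be raised with something like (the value 30 is only an
example):

  # echo 30 > /proc/sys/vm/compaction_proactiveness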

Signed-off-by: cgel <cgel@xxxxxxxxxx>
---
include/linux/psi.h | 7 +--
include/linux/psi_types.h | 34 +++++++++++++
kernel/cgroup/cgroup.c | 11 ++++
kernel/sched/psi.c | 126 +++++++++++++++++++++++++++++++++++++++++++---
4 files changed, 168 insertions(+), 10 deletions(-)
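
Note: psi_memstall_enter()/psi_memstall_leave() now take the stall
reason as a second argument; the in-tree call sites are presumably
updated in patch 2/2 of this series. A rough sketch of an annotated
caller (not part of this patch):

	unsigned long pflags;

	/* e.g. in the kswapd reclaim path */
	psi_memstall_enter(&pflags, TSK_MEMSTALL_KSWAPD);
	/* ... reclaim pages ... */
	psi_memstall_leave(&pflags, TSK_MEMSTALL_KSWAPD);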

diff --git a/include/linux/psi.h b/include/linux/psi.h
index 7b3de73..163da43 100644
--- a/include/linux/psi.h
+++ b/include/linux/psi.h
@@ -19,10 +19,11 @@ void psi_init(void);
void psi_task_change(struct task_struct *task, int clear, int set);

void psi_memstall_tick(struct task_struct *task, int cpu);
-void psi_memstall_enter(unsigned long *flags);
-void psi_memstall_leave(unsigned long *flags);
+void psi_memstall_enter(unsigned long *flags, int mem_state);
+void psi_memstall_leave(unsigned long *flags, int mem_state);

int psi_show(struct seq_file *s, struct psi_group *group, enum psi_res res);
+int psi_mem_pressure_stat_show(struct seq_file *m, void *v);

#ifdef CONFIG_CGROUPS
int psi_cgroup_alloc(struct cgroup *cgrp);
@@ -41,7 +42,7 @@ __poll_t psi_trigger_poll(void **trigger_ptr, struct file *file,

static inline void psi_init(void) {}

-static inline void psi_memstall_enter(unsigned long *flags) {}
+static inline void psi_memstall_enter(unsigned long *flags, int mem_state) {}
-static inline void psi_memstall_leave(unsigned long *flags) {}
+static inline void psi_memstall_leave(unsigned long *flags, int mem_state) {}

#ifdef CONFIG_CGROUPS
diff --git a/include/linux/psi_types.h b/include/linux/psi_types.h
index 07aaf9b..194ea78 100644
--- a/include/linux/psi_types.h
+++ b/include/linux/psi_types.h
@@ -9,6 +9,8 @@

#ifdef CONFIG_PSI

+#define PSI_MASK(x) ((1UL << (x)) - 1)
+
/* Tracked task states */
enum psi_task_count {
NR_IOWAIT,
@@ -22,6 +24,10 @@ enum psi_task_count {
#define TSK_MEMSTALL (1 << NR_MEMSTALL)
#define TSK_RUNNING (1 << NR_RUNNING)

+#define TSK_COUNT_MASK PSI_MASK(NR_PSI_TASK_COUNTS)
+#define TSK_COUNT_SHIFT 8
+
+
/* Resources that workloads could be stalled on */
enum psi_res {
PSI_IO,
@@ -53,6 +59,27 @@ enum psi_aggregators {
NR_PSI_AGGREGATORS,
};

+/* Causes for mem pressure */
+enum psi_memstall_states {
+ PSI_MEM_KSWAPD,
+ PSI_MEM_DRECLAIM,
+ PSI_MEM_KCOMPACTED,
+ PSI_MEM_DCOMPACT,
+ PSI_MEM_CGROUP,
+ PSI_MEM_SWAP,
+ PSI_MEM_WORKINGSET,
+ PSI_MEM_STATES,
+};
+
+#define TSK_MEMSTALL_SHIFT 8
+#define TSK_MEMSTALL_KSWAPD (1 << (PSI_MEM_KSWAPD + TSK_MEMSTALL_SHIFT))
+#define TSK_MEMSTALL_DRECLAIM (1 << (PSI_MEM_DRECLAIM + TSK_MEMSTALL_SHIFT))
+#define TSK_MEMSTALL_KCOMPACTED (1 << (PSI_MEM_KCOMPACTED + TSK_MEMSTALL_SHIFT))
+#define TSK_MEMSTALL_DCOMPACT (1 << (PSI_MEM_DCOMPACT + TSK_MEMSTALL_SHIFT))
+#define TSK_MEMSTALL_CGROUP (1 << (PSI_MEM_CGROUP + TSK_MEMSTALL_SHIFT))
+#define TSK_MEMSTALL_WORKINGSET (1 << (PSI_MEM_WORKINGSET + TSK_MEMSTALL_SHIFT))
+#define TSK_MEMSTALL_MASK (PSI_MASK(TSK_MEMSTALL_SHIFT) << TSK_COUNT_SHIFT)
+
struct psi_group_cpu {
/* 1st cacheline updated by the scheduler */

@@ -64,9 +91,11 @@ struct psi_group_cpu {

/* Aggregate pressure state derived from the tasks */
u32 state_mask;
+ u32 state_memstall;

/* Period time sampling buckets for each state of interest (ns) */
u32 times[NR_PSI_STATES];
+ u32 times_mem[PSI_MEM_STATES];

/* Time of last task change in this group (rq_clock) */
u64 state_start;
@@ -76,6 +105,7 @@ struct psi_group_cpu {
/* Delta detection against the sampling buckets */
u32 times_prev[NR_PSI_AGGREGATORS][NR_PSI_STATES]
____cacheline_aligned_in_smp;
+ u32 times_mem_prev[PSI_MEM_STATES];
};

/* PSI growth tracking window */
@@ -144,6 +174,10 @@ struct psi_group {
u64 total[NR_PSI_AGGREGATORS][NR_PSI_STATES - 1];
unsigned long avg[NR_PSI_STATES - 1][3];

+ u64 total_mems[PSI_MEM_STATES];
+ unsigned long avg_mems[PSI_MEM_STATES][3];
+ u64 avg_total_mems[PSI_MEM_STATES];
+
/* Monitor work control */
atomic_t poll_scheduled;
struct kthread_worker __rcu *poll_kworker;
diff --git a/kernel/cgroup/cgroup.c b/kernel/cgroup/cgroup.c
index 806fc9d..b50ab92 100644
--- a/kernel/cgroup/cgroup.c
+++ b/kernel/cgroup/cgroup.c
@@ -3613,6 +3613,13 @@ static int cgroup_memory_pressure_show(struct seq_file *seq, void *v)

return psi_show(seq, psi, PSI_MEM);
}
+static int cgroup_memory_pressure_stat_show(struct seq_file *seq, void *v)
+{
+ struct cgroup *cgroup = seq_css(seq)->cgroup;
+ struct psi_group *psi = cgroup->id == 1 ? &psi_system : &cgroup->psi;
+
+ return psi_mem_pressure_stat_show(seq, psi);
+}
static int cgroup_cpu_pressure_show(struct seq_file *seq, void *v)
{
struct cgroup *cgroup = seq_css(seq)->cgroup;
@@ -4930,6 +4937,10 @@ static struct cftype cgroup_base_files[] = {
.poll = cgroup_pressure_poll,
.release = cgroup_pressure_release,
},
+ {
+ .name = "memory.pressure.stat",
+ .seq_show = cgroup_memory_pressure_stat_show,
+ },
{
.name = "cpu.pressure",
.seq_show = cgroup_cpu_pressure_show,
diff --git a/kernel/sched/psi.c b/kernel/sched/psi.c
index 9154e74..072d535 100644
--- a/kernel/sched/psi.c
+++ b/kernel/sched/psi.c
@@ -279,6 +279,35 @@ static void get_recent_times(struct psi_group *group, int cpu,
}
}

+static void get_recent_mem_times(struct psi_group *group, int cpu, u32 *times_mem)
+{
+ struct psi_group_cpu *groupc = per_cpu_ptr(group->pcpu, cpu);
+ u64 now, state_start;
+ enum psi_memstall_states s;
+ unsigned int seq;
+ u32 state_memstall;
+
+ do {
+ seq = read_seqcount_begin(&groupc->seq);
+ now = cpu_clock(cpu);
+ memcpy(times_mem, groupc->times_mem, sizeof(groupc->times_mem));
+ state_memstall = groupc->state_memstall;
+ state_start = groupc->state_start;
+ } while (read_seqcount_retry(&groupc->seq, seq));
+
+ for (s = 0; s < PSI_MEM_STATES; s++) {
+ u32 delta;
+
+ if (state_memstall & (1 << (s + TSK_MEMSTALL_SHIFT)))
+ times_mem[s] += now - state_start;
+
+ delta = times_mem[s] - groupc->times_mem_prev[s];
+ groupc->times_mem_prev[s] = times_mem[s];
+
+ times_mem[s] = delta;
+ }
+}
+
static void calc_avgs(unsigned long avg[3], int missed_periods,
u64 time, u64 period)
{
@@ -304,6 +333,7 @@ static void collect_percpu_times(struct psi_group *group,
u32 *pchanged_states)
{
u64 deltas[NR_PSI_STATES - 1] = { 0, };
+ u64 delta_mems[PSI_MEM_STATES] = { 0, };
unsigned long nonidle_total = 0;
u32 changed_states = 0;
int cpu;
@@ -319,11 +349,16 @@ static void collect_percpu_times(struct psi_group *group,
*/
for_each_possible_cpu(cpu) {
u32 times[NR_PSI_STATES];
+ u32 times_mem[PSI_MEM_STATES];
+
u32 nonidle;
u32 cpu_changed_states;

get_recent_times(group, cpu, aggregator, times,
&cpu_changed_states);
+ if (times[PSI_MEM_SOME])
+ get_recent_mem_times(group, cpu, times_mem);
+
changed_states |= cpu_changed_states;

nonidle = nsecs_to_jiffies(times[PSI_NONIDLE]);
@@ -350,6 +385,10 @@ static void collect_percpu_times(struct psi_group *group,
group->total[aggregator][s] +=
div_u64(deltas[s], max(nonidle_total, 1UL));

+ for (s = 0; s < PSI_MEM_STATES; s++)
+ group->total_mems[s] +=
+ div_u64(delta_mems[s], max(nonidle_total, 1UL));
+
if (pchanged_states)
*pchanged_states = changed_states;
}
@@ -404,6 +443,16 @@ static u64 update_averages(struct psi_group *group, u64 now)
calc_avgs(group->avg[s], missed_periods, sample, period);
}

+ for (s = 0; s < PSI_MEM_STATES; s++) {
+ u32 sample;
+
+ sample = group->total_mems[s] - group->avg_total_mems[s];
+ if (sample > period)
+ sample = period;
+ group->avg_total_mems[s] += sample;
+ calc_avgs(group->avg_mems[s], missed_periods, sample, period);
+ }
+
return avg_next_update;
}

@@ -628,6 +677,7 @@ static void record_times(struct psi_group_cpu *groupc, int cpu,
{
u32 delta;
u64 now;
+ int state_memstall = groupc->state_memstall >> TSK_MEMSTALL_SHIFT;

now = cpu_clock(cpu);
delta = now - groupc->state_start;
@@ -641,6 +691,7 @@ static void record_times(struct psi_group_cpu *groupc, int cpu,

if (groupc->state_mask & (1 << PSI_MEM_SOME)) {
groupc->times[PSI_MEM_SOME] += delta;
+ if (state_memstall) groupc->times_mem[__ffs(state_memstall)] += delta;
if (groupc->state_mask & (1 << PSI_MEM_FULL))
groupc->times[PSI_MEM_FULL] += delta;
else if (memstall_tick) {
@@ -676,7 +727,12 @@ static u32 psi_group_change(struct psi_group *group, int cpu,
unsigned int t, m;
enum psi_states s;
u32 state_mask = 0;
+ u32 state_memstall = 0;

+ if (set & TSK_MEMSTALL)
+ state_memstall = set & TSK_MEMSTALL_MASK;
+ set &= TSK_COUNT_MASK;
+ clear &= TSK_COUNT_MASK;
groupc = per_cpu_ptr(group->pcpu, cpu);

/*
@@ -714,7 +770,7 @@ static u32 psi_group_change(struct psi_group *group, int cpu,
state_mask |= (1 << s);
}
groupc->state_mask = state_mask;
-
+ groupc->state_memstall = state_memstall;
write_seqcount_end(&groupc->seq);

return state_mask;
@@ -810,7 +866,7 @@ void psi_memstall_tick(struct task_struct *task, int cpu)
* Marks the calling task as being stalled due to a lack of memory,
* such as waiting for a refault or performing reclaim.
*/
-void psi_memstall_enter(unsigned long *flags)
+void psi_memstall_enter(unsigned long *flags, int mem_state)
{
struct rq_flags rf;
struct rq *rq;
@@ -829,7 +885,7 @@ void psi_memstall_enter(unsigned long *flags)
rq = this_rq_lock_irq(&rf);

current->flags |= PF_MEMSTALL;
- psi_task_change(current, 0, TSK_MEMSTALL);
+ psi_task_change(current, 0, TSK_MEMSTALL | mem_state);

rq_unlock_irq(rq, &rf);
}
@@ -840,7 +896,7 @@ void psi_memstall_enter(unsigned long *flags)
*
* Marks the calling task as no longer stalled due to lack of memory.
*/
-void psi_memstall_leave(unsigned long *flags)
+void psi_memstall_leave(unsigned long *flags, int mem_state)
{
struct rq_flags rf;
struct rq *rq;
@@ -858,7 +914,7 @@ void psi_memstall_leave(unsigned long *flags)
rq = this_rq_lock_irq(&rf);

current->flags &= ~PF_MEMSTALL;
- psi_task_change(current, TSK_MEMSTALL, 0);
+ psi_task_change(current, TSK_MEMSTALL | mem_state, 0);

rq_unlock_irq(rq, &rf);
}
@@ -974,6 +1030,53 @@ int psi_show(struct seq_file *m, struct psi_group *group, enum psi_res res)
return 0;
}

+static const char * const memstall_text[] = {
+ "kswapd",
+ "direct reclaim",
+ "kcompacted",
+ "direct compact",
+ "cgroup reclaim",
+ "swap",
+ "workingset",
+};
+
+int psi_mem_pressure_stat_show(struct seq_file *m, void *v)
+{
+ int s;
+ u64 now;
+ struct psi_group *group = (v && v != SEQ_START_TOKEN) ? v : &psi_system;
+
+ if (static_branch_likely(&psi_disabled))
+ return -EOPNOTSUPP;
+
+ mutex_lock(&group->avgs_lock);
+ now = sched_clock();
+ collect_percpu_times(group, PSI_AVGS, NULL);
+ if (now >= group->avg_next_update)
+ group->avg_next_update = update_averages(group, now);
+ mutex_unlock(&group->avgs_lock);
+
+ for (s = 0; s < PSI_MEM_STATES; s++) {
+ unsigned long avg[3];
+ u64 total;
+ int w;
+
+ for (w = 0; w < 3; w++)
+ avg[w] = group->avg_mems[s][w];
+
+ total = div_u64(group->total_mems[s], NSEC_PER_USEC);
+
+ seq_printf(m, "%s avg10=%lu.%02lu avg60=%lu.%02lu avg300=%lu.%02lu total=%llu\n",
+ memstall_text[s],
+ LOAD_INT(avg[0]), LOAD_FRAC(avg[0]),
+ LOAD_INT(avg[1]), LOAD_FRAC(avg[1]),
+ LOAD_INT(avg[2]), LOAD_FRAC(avg[2]),
+ total);
+ }
+
+ return 0;
+}
+
static int psi_io_show(struct seq_file *m, void *v)
{
return psi_show(m, &psi_system, PSI_IO);
@@ -998,7 +1101,10 @@ static int psi_memory_open(struct inode *inode, struct file *file)
{
return single_open(file, psi_memory_show, NULL);
}
-
+static int psi_memory_stat_open(struct inode *inode, struct file *file)
+{
+ return single_open(file, psi_mem_pressure_stat_show, NULL);
+}
static int psi_cpu_open(struct inode *inode, struct file *file)
{
return single_open(file, psi_cpu_show, NULL);
@@ -1271,7 +1377,12 @@ static const struct file_operations psi_memory_fops = {
.poll = psi_fop_poll,
.release = psi_fop_release,
};
-
+static const struct file_operations psi_memory_stat_fops = {
+ .open = psi_memory_stat_open,
+ .read = seq_read,
+ .llseek = seq_lseek,
+ .release = psi_fop_release,
+};
static const struct file_operations psi_cpu_fops = {
.open = psi_cpu_open,
.read = seq_read,
@@ -1286,6 +1397,7 @@ static int __init psi_proc_init(void)
proc_mkdir("pressure", NULL);
proc_create("pressure/io", 0, NULL, &psi_io_fops);
proc_create("pressure/memory", 0, NULL, &psi_memory_fops);
+ proc_create("pressure/memory_stat", 0, NULL, &psi_memory_stat_fops);
proc_create("pressure/cpu", 0, NULL, &psi_cpu_fops);
return 0;
}
--
2.15.2