[PATCH v1] coresched/proc: add forceidle report with coresched enabled

From: Aubrey Li
Date: Thu Oct 29 2020 - 10:54:20 EST


When a CPU is running a task with coresched enabled, its sibling will
be forced idle if the sibling does not have a trusted task to run.
Reporting how long each CPU spends forced idle makes it possible to
understand how tasks with different cookies perform throughout the system.

forceidle is added as the last column of each cpu line in /proc/stat:

$ cat /proc/stat
cpu 102034 0 11992 8347016 1046 0 11 0 0 0 991
cpu0 59 0 212 80364 59 0 0 0 0 0 0
cpu1 72057 0 89 9102 0 0 0 0 0 0 90

So forceidle% can be computed by any user space tool, for example:

CPU user% system% iowait% forceidle% idle%
cpu53 24.75 0.00 0.00% 0.99% 74.26%
CPU user% system% iowait% forceidle% idle%
cpu53 25.74 0.00 0.00% 0.99% 73.27%
CPU user% system% iowait% forceidle% idle%
cpu53 24.75 0.00 0.00% 0.99% 74.26%
CPU user% system% iowait% forceidle% idle%
cpu53 25.24 0.00 0.00% 3.88% 70.87%

Signed-off-by: Aubrey Li <aubrey.li@xxxxxxxxxxxxxxx>
---
fs/proc/stat.c | 48 +++++++++++++++++++++++++++++++++++++
include/linux/kernel_stat.h | 1 +
include/linux/tick.h | 2 ++
kernel/time/tick-sched.c | 48 +++++++++++++++++++++++++++++++++++++
kernel/time/tick-sched.h | 3 +++
5 files changed, 102 insertions(+)

diff --git a/fs/proc/stat.c b/fs/proc/stat.c
index 46b3293015fe..b27ccac7b5a4 100644
--- a/fs/proc/stat.c
+++ b/fs/proc/stat.c
@@ -28,7 +28,11 @@ static u64 get_idle_time(struct kernel_cpustat *kcs, int cpu)
u64 idle;

idle = kcs->cpustat[CPUTIME_IDLE];
+#ifdef CONFIG_SCHED_CORE
+ if (cpu_online(cpu) && !nr_iowait_cpu(cpu) && !cpu_rq(cpu)->core->core_forceidle)
+#else
if (cpu_online(cpu) && !nr_iowait_cpu(cpu))
+#endif
idle += arch_idle_time(cpu);
return idle;
}
@@ -43,6 +47,17 @@ static u64 get_iowait_time(struct kernel_cpustat *kcs, int cpu)
return iowait;
}

+#ifdef CONFIG_SCHED_CORE
+/*
+ * Return the cumulative force-idle time of @cpu in nanoseconds.
+ *
+ * Starts from the accounted cpustat value and, while the CPU is online and
+ * its core is currently forced idle, adds the arch idle time on top.
+ * iowait takes precedence, mirroring get_idle_time() and
+ * update_ts_time_stats(), so an idle period with outstanding iowait is
+ * not counted as force idle here.
+ *
+ * NOTE(review): cpu_rq() is provided by kernel/sched/sched.h; this patch
+ * only adds that include to tick-sched.c - confirm it is reachable from
+ * fs/proc/stat.c on architectures that define arch_idle_time.
+ */
+static u64 get_forceidle_time(struct kernel_cpustat *kcs, int cpu)
+{
+	u64 forceidle = kcs->cpustat[CPUTIME_FORCEIDLE];
+
+	if (cpu_online(cpu) && !nr_iowait_cpu(cpu) &&
+	    cpu_rq(cpu)->core->core_forceidle)
+		forceidle += arch_idle_time(cpu);
+
+	return forceidle;
+}
+#endif
#else

static u64 get_idle_time(struct kernel_cpustat *kcs, int cpu)
@@ -77,6 +92,21 @@ static u64 get_iowait_time(struct kernel_cpustat *kcs, int cpu)
return iowait;
}

+static u64 get_forceidle_time(struct kernel_cpustat *kcs, int cpu)
+{
+ u64 forceidle, forceidle_usecs = -1ULL;
+
+ if (cpu_online(cpu))
+ forceidle_usecs = get_cpu_forceidle_time_us(cpu, NULL);
+
+ if (forceidle_usecs == -1ULL)
+ /* !NO_HZ or cpu offline so we can rely on cpustat.forceidle */
+ forceidle = kcs->cpustat[CPUTIME_FORCEIDLE];
+ else
+ forceidle = forceidle_usecs * NSEC_PER_USEC;
+
+ return forceidle;
+}
#endif

static void show_irq_gap(struct seq_file *p, unsigned int gap)
@@ -111,12 +141,18 @@ static int show_stat(struct seq_file *p, void *v)
u64 guest, guest_nice;
u64 sum = 0;
u64 sum_softirq = 0;
+#ifdef CONFIG_SCHED_CORE
+ u64 forceidle;
+#endif
unsigned int per_softirq_sums[NR_SOFTIRQS] = {0};
struct timespec64 boottime;

user = nice = system = idle = iowait =
irq = softirq = steal = 0;
guest = guest_nice = 0;
+#ifdef CONFIG_SCHED_CORE
+ forceidle = 0;
+#endif
getboottime64(&boottime);

for_each_possible_cpu(i) {
@@ -130,6 +166,9 @@ static int show_stat(struct seq_file *p, void *v)
system += cpustat[CPUTIME_SYSTEM];
idle += get_idle_time(&kcpustat, i);
iowait += get_iowait_time(&kcpustat, i);
+#ifdef CONFIG_SCHED_CORE
+ forceidle += get_forceidle_time(&kcpustat, i);
+#endif
irq += cpustat[CPUTIME_IRQ];
softirq += cpustat[CPUTIME_SOFTIRQ];
steal += cpustat[CPUTIME_STEAL];
@@ -157,6 +196,9 @@ static int show_stat(struct seq_file *p, void *v)
seq_put_decimal_ull(p, " ", nsec_to_clock_t(steal));
seq_put_decimal_ull(p, " ", nsec_to_clock_t(guest));
seq_put_decimal_ull(p, " ", nsec_to_clock_t(guest_nice));
+#ifdef CONFIG_SCHED_CORE
+ seq_put_decimal_ull(p, " ", nsec_to_clock_t(forceidle));
+#endif
seq_putc(p, '\n');

for_each_online_cpu(i) {
@@ -171,6 +213,9 @@ static int show_stat(struct seq_file *p, void *v)
system = cpustat[CPUTIME_SYSTEM];
idle = get_idle_time(&kcpustat, i);
iowait = get_iowait_time(&kcpustat, i);
+#ifdef CONFIG_SCHED_CORE
+ forceidle = get_forceidle_time(&kcpustat, i);
+#endif
irq = cpustat[CPUTIME_IRQ];
softirq = cpustat[CPUTIME_SOFTIRQ];
steal = cpustat[CPUTIME_STEAL];
@@ -187,6 +232,9 @@ static int show_stat(struct seq_file *p, void *v)
seq_put_decimal_ull(p, " ", nsec_to_clock_t(steal));
seq_put_decimal_ull(p, " ", nsec_to_clock_t(guest));
seq_put_decimal_ull(p, " ", nsec_to_clock_t(guest_nice));
+#ifdef CONFIG_SCHED_CORE
+ seq_put_decimal_ull(p, " ", nsec_to_clock_t(forceidle));
+#endif
seq_putc(p, '\n');
}
seq_put_decimal_ull(p, "intr ", (unsigned long long)sum);
diff --git a/include/linux/kernel_stat.h b/include/linux/kernel_stat.h
index 89f0745c096d..c7ce4bfe757e 100644
--- a/include/linux/kernel_stat.h
+++ b/include/linux/kernel_stat.h
@@ -25,6 +25,7 @@ enum cpu_usage_stat {
CPUTIME_IRQ,
CPUTIME_IDLE,
CPUTIME_IOWAIT,
+ CPUTIME_FORCEIDLE,
CPUTIME_STEAL,
CPUTIME_GUEST,
CPUTIME_GUEST_NICE,
diff --git a/include/linux/tick.h b/include/linux/tick.h
index 7340613c7eff..7fce78f46930 100644
--- a/include/linux/tick.h
+++ b/include/linux/tick.h
@@ -139,6 +139,7 @@ extern unsigned long tick_nohz_get_idle_calls(void);
extern unsigned long tick_nohz_get_idle_calls_cpu(int cpu);
extern u64 get_cpu_idle_time_us(int cpu, u64 *last_update_time);
extern u64 get_cpu_iowait_time_us(int cpu, u64 *last_update_time);
+extern u64 get_cpu_forceidle_time_us(int cpu, u64 *last_update_time);

static inline void tick_nohz_idle_stop_tick_protected(void)
{
@@ -169,6 +170,7 @@ static inline ktime_t tick_nohz_get_sleep_length(ktime_t *delta_next)
}
static inline u64 get_cpu_idle_time_us(int cpu, u64 *unused) { return -1; }
static inline u64 get_cpu_iowait_time_us(int cpu, u64 *unused) { return -1; }
+static inline u64 get_cpu_forceidle_time_us(int cpu, u64 *unused) { return -1; }

static inline void tick_nohz_idle_stop_tick_protected(void) { }
#endif /* !CONFIG_NO_HZ_COMMON */
diff --git a/kernel/time/tick-sched.c b/kernel/time/tick-sched.c
index 1b734070f028..de94e5bab5a1 100644
--- a/kernel/time/tick-sched.c
+++ b/kernel/time/tick-sched.c
@@ -29,6 +29,7 @@
#include <asm/irq_regs.h>

#include "tick-internal.h"
+#include "../sched/sched.h"

#include <trace/events/timer.h>

@@ -547,6 +548,10 @@ update_ts_time_stats(int cpu, struct tick_sched *ts, ktime_t now, u64 *last_upda
delta = ktime_sub(now, ts->idle_entrytime);
if (nr_iowait_cpu(cpu) > 0)
ts->iowait_sleeptime = ktime_add(ts->iowait_sleeptime, delta);
+#ifdef CONFIG_SCHED_CORE
+ else if (cpu_rq(cpu)->core->core_forceidle)
+ ts->forceidle_sleeptime = ktime_add(ts->forceidle_sleeptime, delta);
+#endif
else
ts->idle_sleeptime = ktime_add(ts->idle_sleeptime, delta);
ts->idle_entrytime = now;
@@ -653,6 +658,49 @@ u64 get_cpu_iowait_time_us(int cpu, u64 *last_update_time)
}
EXPORT_SYMBOL_GPL(get_cpu_iowait_time_us);

+#ifdef CONFIG_SCHED_CORE
+/**
+ * get_cpu_forceidle_time_us - get the total force idle time of a CPU
+ * @cpu: CPU number to query
+ * @last_update_time: variable to store update time in. Do not update
+ * counters if NULL.
+ *
+ * This time is measured via accounting rather than sampling,
+ * and is as accurate as ktime_get() is.
+ *
+ * update_ts_time_stats() charges an idle period to iowait before it
+ * considers force idle, so the read-only path below applies the same
+ * precedence to stay consistent with what the update path would account.
+ *
+ * Return: the cumulative force idle time (since boot) for the given
+ * CPU, in microseconds, or -1 if NOHZ is not enabled.
+ */
+u64 get_cpu_forceidle_time_us(int cpu, u64 *last_update_time)
+{
+	struct tick_sched *ts = &per_cpu(tick_cpu_sched, cpu);
+	ktime_t now, forceidle;
+
+	if (!tick_nohz_active)
+		return -1;
+
+	now = ktime_get();
+	if (last_update_time) {
+		/* Let the update path account the pending delta, then read. */
+		update_ts_time_stats(cpu, ts, now, last_update_time);
+		return ktime_to_us(ts->forceidle_sleeptime);
+	}
+
+	/* Read-only: add the in-flight idle delta without modifying @ts. */
+	forceidle = ts->forceidle_sleeptime;
+	if (ts->idle_active && !nr_iowait_cpu(cpu) &&
+	    cpu_rq(cpu)->core->core_forceidle)
+		forceidle = ktime_add(forceidle,
+				      ktime_sub(now, ts->idle_entrytime));
+
+	return ktime_to_us(forceidle);
+}
+EXPORT_SYMBOL_GPL(get_cpu_forceidle_time_us);
+#endif
+
static void tick_nohz_restart(struct tick_sched *ts, ktime_t now)
{
hrtimer_cancel(&ts->sched_timer);
diff --git a/kernel/time/tick-sched.h b/kernel/time/tick-sched.h
index 4fb06527cf64..4c00c5399055 100644
--- a/kernel/time/tick-sched.h
+++ b/kernel/time/tick-sched.h
@@ -71,6 +71,9 @@ struct tick_sched {
ktime_t idle_exittime;
ktime_t idle_sleeptime;
ktime_t iowait_sleeptime;
+#ifdef CONFIG_SCHED_CORE
+ ktime_t forceidle_sleeptime;
+#endif
unsigned long last_jiffies;
u64 timer_expires;
u64 timer_expires_base;
--
2.17.1