[RFC PATCH 4/7] sched: cpufreq: Remove magic 1.25 headroom from apply_dvfs_headroom()

From: Qais Yousef
Date: Sun Aug 27 2023 - 19:33:01 EST


Instead of the magical 1.25 headroom, use the new approximate_util_avg()
to provide headroom based on the dvfs_update_delay; which is the period
at which the cpufreq governor will send DVFS updates to the hardware.

Add a new percpu dvfs_update_delay that can be cheaply accessed whenever
apply_dvfs_headroom() is called. We expect cpufreq governors that rely
on util to drive its DVFS logic/algorithm to populate these percpu
variables. schedutil is the only such governor at the moment.

Signed-off-by: Qais Yousef (Google) <qyousef@xxxxxxxxxxx>
---
kernel/sched/core.c | 3 ++-
kernel/sched/cpufreq_schedutil.c | 10 +++++++++-
kernel/sched/sched.h | 25 ++++++++++++++-----------
3 files changed, 25 insertions(+), 13 deletions(-)

diff --git a/kernel/sched/core.c b/kernel/sched/core.c
index 602e369753a3..f56eb44745a8 100644
--- a/kernel/sched/core.c
+++ b/kernel/sched/core.c
@@ -116,6 +116,7 @@ EXPORT_TRACEPOINT_SYMBOL_GPL(sched_util_est_se_tp);
EXPORT_TRACEPOINT_SYMBOL_GPL(sched_update_nr_running_tp);

DEFINE_PER_CPU_SHARED_ALIGNED(struct rq, runqueues);
+DEFINE_PER_CPU_SHARED_ALIGNED(u64, dvfs_update_delay);

#ifdef CONFIG_SCHED_DEBUG
/*
@@ -7439,7 +7440,7 @@ unsigned long effective_cpu_util(int cpu, unsigned long util_cfs,
* frequency will be gracefully reduced with the utilization decay.
*/
if (type == FREQUENCY_UTIL) {
- util = apply_dvfs_headroom(util_cfs) + cpu_util_rt(rq);
+ util = apply_dvfs_headroom(util_cfs, cpu) + cpu_util_rt(rq);
util = uclamp_rq_util_with(rq, util, p);
} else {
util = util_cfs + cpu_util_rt(rq);
diff --git a/kernel/sched/cpufreq_schedutil.c b/kernel/sched/cpufreq_schedutil.c
index 0c7565ac31fb..04aa06846f31 100644
--- a/kernel/sched/cpufreq_schedutil.c
+++ b/kernel/sched/cpufreq_schedutil.c
@@ -519,15 +519,21 @@ rate_limit_us_store(struct gov_attr_set *attr_set, const char *buf, size_t count
struct sugov_tunables *tunables = to_sugov_tunables(attr_set);
struct sugov_policy *sg_policy;
unsigned int rate_limit_us;
+ int cpu;

if (kstrtouint(buf, 10, &rate_limit_us))
return -EINVAL;

tunables->rate_limit_us = rate_limit_us;

- list_for_each_entry(sg_policy, &attr_set->policy_list, tunables_hook)
+ list_for_each_entry(sg_policy, &attr_set->policy_list, tunables_hook) {
+
sg_policy->freq_update_delay_ns = rate_limit_us * NSEC_PER_USEC;

+ for_each_cpu(cpu, sg_policy->policy->cpus)
+ per_cpu(dvfs_update_delay, cpu) = rate_limit_us;
+ }
+
return count;
}

@@ -772,6 +778,8 @@ static int sugov_start(struct cpufreq_policy *policy)
memset(sg_cpu, 0, sizeof(*sg_cpu));
sg_cpu->cpu = cpu;
sg_cpu->sg_policy = sg_policy;
+
+ per_cpu(dvfs_update_delay, cpu) = sg_policy->tunables->rate_limit_us;
}

if (policy_is_shared(policy))
diff --git a/kernel/sched/sched.h b/kernel/sched/sched.h
index 2b889ad399de..e06e512af192 100644
--- a/kernel/sched/sched.h
+++ b/kernel/sched/sched.h
@@ -3001,6 +3001,15 @@ unsigned long effective_cpu_util(int cpu, unsigned long util_cfs,
unsigned long approximate_util_avg(unsigned long util, u64 delta);
u64 approximate_runtime(unsigned long util);

+/*
+ * Any governor that relies on util signal to drive DVFS, must populate these
+ * percpu dvfs_update_delay variables.
+ *
+ * It should describe the rate/delay at which the governor sends DVFS freq
+ * update to the hardware in us.
+ */
+DECLARE_PER_CPU_SHARED_ALIGNED(u64, dvfs_update_delay);
+
/*
* DVFS decision are made at discrete points. If CPU stays busy, the util will
* continue to grow, which means it could need to run at a higher frequency
@@ -3010,20 +3019,14 @@ u64 approximate_runtime(unsigned long util);
* to run at adequate performance point.
*
* This function provides enough headroom to provide adequate performance
- * assuming the CPU continues to be busy.
- *
- * At the moment it is a constant multiplication with 1.25.
+ * assuming the CPU continues to be busy. This headroom is based on the
+ * dvfs_update_delay of the cpufreq governor.
*
- * TODO: The headroom should be a function of the delay. 25% is too high
- * especially on powerful systems. For example, if the delay is 500us, it makes
- * more sense to give a small headroom as the next decision point is not far
- * away and will follow the util if it continues to rise. On the other hand if
- * the delay is 10ms, then we need a bigger headroom so the CPU won't struggle
- * at a lower frequency if it never goes to idle until then.
+ * XXX: Should we provide headroom when the util is decaying?
*/
-static inline unsigned long apply_dvfs_headroom(unsigned long util)
+static inline unsigned long apply_dvfs_headroom(unsigned long util, int cpu)
{
- return util + (util >> 2);
+ return approximate_util_avg(util, per_cpu(dvfs_update_delay, cpu));
}

/*
--
2.34.1