[PATCH v4] sched/topology: change behaviour of sysctl sched_energy_aware based on the platform

From: Shrikanth Hegde
Date: Tue Sep 26 2023 - 06:01:44 EST


sysctl sched_energy_aware is available for the admin to disable/enable
energy aware scheduling(EAS). EAS is enabled only if few conditions are
met by the platform. They are, asymmetric CPU capacity, no SMT,
valid cpufreq policy, frequency invariant load tracking etc. A platform may
boot without EAS capability, but could gain such capability at runtime
For example, changing/registering the cpufreq policy.

At present, though platform doesn't support EAS, this sysctl returns 1
and it ends up calling rebuild of sched domain on write to 1 and
NOP when writing to 0. That is confusing and un-necessary.

Desired behavior would be to, have this sysctl to enable/disable the EAS
on supported platform. On Non supported platform write to the sysctl
would return not supported error and read of the sysctl would return
empty. So
sched_energy_aware returns empty - EAS is not possible at this moment
sched_energy_aware returns 0 - EAS is supported but disabled by admin.
sched_energy_aware returns 1 - EAS is supported and enabled.
User can find out the reason why EAS is not possible by checking
info messages.

sched_is_eas_possible return if the platform can do EAS at this moment.
It takes most of the cases into account except one where EM complexity is
too high as the code was bit tricky to separate that.

v3->v4:
valentin suggested it would be better to consider simpler approach that
was mentioned in v2. It is a standard approach to keep the knob visible
but change how read and write are handled. Did that and Refactored the
code to use a common function in build_perf_domains and in sysctl handler.
v2->v3:
Chen Yu and Pierre Gondois both pointed out that if platform becomes
capable of EAS later, this patch was not allowing that to happen.
Addressed that by using a variable to indicate the sysctl change
and re-worded the commit message with desired behaviour,
v1->v2:
Chen Yu had pointed out that this will not destroy the perf domains on
architectures where EAS is supported by changing the sysctl.
[v1] Link: https://lore.kernel.org/lkml/20230829065040.920629-1-sshegde@xxxxxxxxxxxxxxxxxx/
[v2] Link: https://lore.kernel.org/lkml/20230901065249.137242-1-sshegde@xxxxxxxxxxxxxxxxxx/
[v3] Link: https://lore.kernel.org/lkml/20230913114807.665094-1-sshegde@xxxxxxxxxxxxxxxxxx/

Signed-off-by: Shrikanth Hegde <sshegde@xxxxxxxxxxxxxxxxxx>
---
Documentation/admin-guide/sysctl/kernel.rst | 3 +-
kernel/sched/topology.c | 107 +++++++++++++-------
2 files changed, 71 insertions(+), 39 deletions(-)

diff --git a/Documentation/admin-guide/sysctl/kernel.rst b/Documentation/admin-guide/sysctl/kernel.rst
index cf33de56da27..d89ac2bd8dc4 100644
--- a/Documentation/admin-guide/sysctl/kernel.rst
+++ b/Documentation/admin-guide/sysctl/kernel.rst
@@ -1182,7 +1182,8 @@ automatically on platforms where it can run (that is,
platforms with asymmetric CPU topologies and having an Energy
Model available). If your platform happens to meet the
requirements for EAS but you do not want to use it, change
-this value to 0.
+this value to 0. On Non-EAS platforms, write operation fails and
+read doesn't return anything.

task_delayacct
===============
diff --git a/kernel/sched/topology.c b/kernel/sched/topology.c
index a7b50bba7829..839ddc80a5ac 100644
--- a/kernel/sched/topology.c
+++ b/kernel/sched/topology.c
@@ -212,6 +212,64 @@ static unsigned int sysctl_sched_energy_aware = 1;
static DEFINE_MUTEX(sched_energy_mutex);
static bool sched_energy_update;

+extern struct cpufreq_governor schedutil_gov;
+static bool sched_is_eas_possible(const struct cpumask *cpu_mask)
+{
+ int cpu = cpumask_first(cpu_mask);
+ struct cpufreq_policy *policy;
+ struct cpufreq_governor *gov;
+ int i;
+
+ /* EAS is enabled for asymmetric CPU capacity topologies. */
+ if (!per_cpu(sd_asym_cpucapacity, cpu)) {
+ if (sched_debug()) {
+ pr_info("rd %*pbl: Checking EAS, CPUs do not have asymmetric capacities\n",
+ cpumask_pr_args(cpu_mask));
+ }
+ return false;
+ }
+
+ /* EAS definitely does *not* handle SMT */
+ if (sched_smt_active()) {
+ if (sched_debug()) {
+ pr_info("rd %*pbl: Checking EAS, SMT is not supported\n",
+ cpumask_pr_args(cpu_mask));
+ }
+ return false;
+ }
+
+ if (!arch_scale_freq_invariant()) {
+ if (sched_debug()) {
+ pr_info("rd %*pbl: Checking EAS: frequency-invariant load tracking not yet supported",
+ cpumask_pr_args(cpu_mask));
+ }
+ return false;
+ }
+
+ /* Do not attempt EAS if schedutil is not being used. */
+ for_each_cpu(i, cpu_mask) {
+ policy = cpufreq_cpu_get(i);
+ if (!policy) {
+ if (sched_debug()) {
+ pr_info("rd %*pbl: Checking EAS, cpufreq policy not set for CPU: %d",
+ cpumask_pr_args(cpu_mask), i);
+ }
+ return false;
+ }
+ gov = policy->governor;
+ cpufreq_cpu_put(policy);
+ if (gov != &schedutil_gov) {
+ if (sched_debug()) {
+ pr_info("rd %*pbl: Checking EAS, schedutil is mandatory\n",
+ cpumask_pr_args(cpu_mask));
+ }
+ return false;
+ }
+ }
+
+ return true;
+}
+
void rebuild_sched_domains_energy(void)
{
mutex_lock(&sched_energy_mutex);
@@ -231,6 +289,14 @@ static int sched_energy_aware_handler(struct ctl_table *table, int write,
return -EPERM;

ret = proc_dointvec_minmax(table, write, buffer, lenp, ppos);
+ if (!sched_is_eas_possible(cpu_active_mask)) {
+ if (write) {
+ return -EOPNOTSUPP;
+ } else {
+ *lenp = 0;
+ return 0;
+ }
+ }
if (!ret && write) {
state = static_branch_unlikely(&sched_energy_present);
if (state != sysctl_sched_energy_aware)
@@ -370,61 +436,24 @@ static void sched_energy_set(bool has_eas)
*/
#define EM_MAX_COMPLEXITY 2048

-extern struct cpufreq_governor schedutil_gov;
static bool build_perf_domains(const struct cpumask *cpu_map)
{
int i, nr_pd = 0, nr_ps = 0, nr_cpus = cpumask_weight(cpu_map);
struct perf_domain *pd = NULL, *tmp;
int cpu = cpumask_first(cpu_map);
struct root_domain *rd = cpu_rq(cpu)->rd;
- struct cpufreq_policy *policy;
- struct cpufreq_governor *gov;

if (!sysctl_sched_energy_aware)
goto free;

- /* EAS is enabled for asymmetric CPU capacity topologies. */
- if (!per_cpu(sd_asym_cpucapacity, cpu)) {
- if (sched_debug()) {
- pr_info("rd %*pbl: CPUs do not have asymmetric capacities\n",
- cpumask_pr_args(cpu_map));
- }
- goto free;
- }
-
- /* EAS definitely does *not* handle SMT */
- if (sched_smt_active()) {
- pr_warn("rd %*pbl: Disabling EAS, SMT is not supported\n",
- cpumask_pr_args(cpu_map));
- goto free;
- }
-
- if (!arch_scale_freq_invariant()) {
- if (sched_debug()) {
- pr_warn("rd %*pbl: Disabling EAS: frequency-invariant load tracking not yet supported",
- cpumask_pr_args(cpu_map));
- }
+ if (!sched_is_eas_possible(cpu_map))
goto free;
- }

for_each_cpu(i, cpu_map) {
/* Skip already covered CPUs. */
if (find_pd(pd, i))
continue;

- /* Do not attempt EAS if schedutil is not being used. */
- policy = cpufreq_cpu_get(i);
- if (!policy)
- goto free;
- gov = policy->governor;
- cpufreq_cpu_put(policy);
- if (gov != &schedutil_gov) {
- if (rd->pd)
- pr_warn("rd %*pbl: Disabling EAS, schedutil is mandatory\n",
- cpumask_pr_args(cpu_map));
- goto free;
- }
-
/* Create the new pd and add it to the local list. */
tmp = pd_init(i);
if (!tmp)
@@ -458,6 +487,8 @@ static bool build_perf_domains(const struct cpumask *cpu_map)
return !!pd;

free:
+ if (sched_debug())
+ pr_warn("rd %*pbl: Disabling EAS", cpumask_pr_args(cpu_map));
free_pd(pd);
tmp = rd->pd;
rcu_assign_pointer(rd->pd, NULL);
--
2.31.1