[RFC PATCH 2/2] sched/fair: Repurpose cfs_rq_throttled()

From: Valentin Schneider
Date: Thu Nov 30 2023 - 11:13:23 EST


cfs_rq->throttled is now never set, as cfs_rqs are never fully throttled;
instead they stay in limbo while tasks are slowly plucked out of them.

Get rid of cfs_rq->throttled, and repurpose cfs_rq_throttled() to report
the limbo state (cfs_rq->in_throttle_limbo) instead.

Signed-off-by: Valentin Schneider <vschneid@xxxxxxxxxx>
---
kernel/sched/fair.c | 57 ++++++--------------------------------------
kernel/sched/sched.h | 3 +--
2 files changed, 8 insertions(+), 52 deletions(-)

diff --git a/kernel/sched/fair.c b/kernel/sched/fair.c
index 27aee13e7ccd9..fd3a0c388fabd 100644
--- a/kernel/sched/fair.c
+++ b/kernel/sched/fair.c
@@ -5497,7 +5497,7 @@ static void __account_cfs_rq_runtime(struct cfs_rq *cfs_rq, u64 delta_exec)
if (likely(cfs_rq->runtime_remaining > 0))
return;

- if (cfs_rq->throttled || cfs_rq->in_throttle_limbo)
+ if (cfs_rq->in_throttle_limbo)
return;
/*
* if we're unable to extend our runtime we resched so that the active
@@ -5518,7 +5518,7 @@ void account_cfs_rq_runtime(struct cfs_rq *cfs_rq, u64 delta_exec)

static inline int cfs_rq_throttled(struct cfs_rq *cfs_rq)
{
- return cfs_bandwidth_used() && cfs_rq->throttled;
+ return cfs_bandwidth_used() && cfs_rq->in_throttle_limbo;
}

/* check whether cfs_rq, or any parent, is throttled */
@@ -5848,10 +5848,6 @@ void unthrottle_cfs_rq(struct cfs_rq *cfs_rq)

qcfs_rq->h_nr_running += task_delta;
qcfs_rq->idle_h_nr_running += idle_task_delta;
-
- /* end evaluation on encountering a throttled cfs_rq */
- if (cfs_rq_throttled(qcfs_rq))
- goto unthrottle_throttle;
}

for_each_sched_entity(se) {
@@ -5862,10 +5858,6 @@ void unthrottle_cfs_rq(struct cfs_rq *cfs_rq)

qcfs_rq->h_nr_running += task_delta;
qcfs_rq->idle_h_nr_running += idle_task_delta;
-
- /* end evaluation on encountering a throttled cfs_rq */
- if (cfs_rq_throttled(qcfs_rq))
- goto unthrottle_throttle;
}

/* At this point se is NULL and we are at root level*/
@@ -5909,7 +5901,7 @@ static void __cfsb_csd_unthrottle(void *arg)
throttled_csd_list) {
list_del_init(&cursor->throttled_csd_list);

- if (cfs_rq_throttled(cursor) || cursor->in_throttle_limbo)
+ if (cfs_rq_throttled(cursor))
unthrottle_cfs_rq(cursor);
}

@@ -5949,7 +5941,7 @@ static void unthrottle_cfs_rq_async(struct cfs_rq *cfs_rq)
{
lockdep_assert_rq_held(rq_of(cfs_rq));

- if (SCHED_WARN_ON(!(cfs_rq_throttled(cfs_rq) || cfs_rq->in_throttle_limbo) ||
+ if (SCHED_WARN_ON(!cfs_rq_throttled(cfs_rq) ||
cfs_rq->runtime_remaining <= 0))
return;

@@ -5982,7 +5974,7 @@ static bool distribute_cfs_runtime(struct cfs_bandwidth *cfs_b)
* waiting for tasks to exit the kernel. In this case we still
* want to replenish.
*/
- if (!cfs_rq_throttled(cfs_rq) && !cfs_rq->in_throttle_limbo)
+ if (!cfs_rq_throttled(cfs_rq))
goto next;

/* Already queued for async unthrottle */
@@ -6031,7 +6023,7 @@ static bool distribute_cfs_runtime(struct cfs_bandwidth *cfs_b)

list_del_init(&cfs_rq->throttled_csd_list);

- if (cfs_rq_throttled(cfs_rq) || cfs_rq->in_throttle_limbo)
+ if (cfs_rq_throttled(cfs_rq))
unthrottle_cfs_rq(cfs_rq);

rq_unlock_irqrestore(rq, &rf);
@@ -6230,10 +6222,6 @@ static void check_enqueue_throttle(struct cfs_rq *cfs_rq)
if (!cfs_rq->runtime_enabled || cfs_rq->curr)
return;

- /* ensure the group is not already throttled */
- if (cfs_rq_throttled(cfs_rq))
- return;
-
/* update runtime allocation */
account_cfs_rq_runtime(cfs_rq, 0);
if (cfs_rq->runtime_remaining <= 0)
@@ -6266,13 +6254,6 @@ static bool check_cfs_rq_runtime(struct cfs_rq *cfs_rq)
if (likely(!cfs_rq->runtime_enabled || cfs_rq->runtime_remaining > 0))
return false;

- /*
- * it's possible for a throttled entity to be forced into a running
- * state (e.g. set_curr_task), in this case we're finished.
- */
- if (cfs_rq_throttled(cfs_rq))
- return true;
-
return throttle_cfs_rq(cfs_rq);
}

@@ -6705,10 +6686,6 @@ enqueue_task_fair(struct rq *rq, struct task_struct *p, int flags)
if (cfs_rq_is_idle(cfs_rq))
idle_h_nr_running = 1;

- /* end evaluation on encountering a throttled cfs_rq */
- if (cfs_rq_throttled(cfs_rq))
- goto enqueue_throttle;
-
flags = ENQUEUE_WAKEUP;
}

@@ -6724,10 +6701,6 @@ enqueue_task_fair(struct rq *rq, struct task_struct *p, int flags)

if (cfs_rq_is_idle(cfs_rq))
idle_h_nr_running = 1;
-
- /* end evaluation on encountering a throttled cfs_rq */
- if (cfs_rq_throttled(cfs_rq))
- goto enqueue_throttle;
}

/* At this point se is NULL and we are at root level*/
@@ -6750,7 +6723,6 @@ enqueue_task_fair(struct rq *rq, struct task_struct *p, int flags)
if (!task_new)
update_overutilized_status(rq);

-enqueue_throttle:
assert_list_leaf_cfs_rq(rq);

hrtick_update(rq);
@@ -6783,10 +6755,6 @@ static void dequeue_task_fair(struct rq *rq, struct task_struct *p, int flags)
if (cfs_rq_is_idle(cfs_rq))
idle_h_nr_running = 1;

- /* end evaluation on encountering a throttled cfs_rq */
- if (cfs_rq_throttled(cfs_rq))
- goto dequeue_throttle;
-
/* Don't dequeue parent if it has other entities besides us */
if (cfs_rq->load.weight) {
/* Avoid re-evaluating load for this entity: */
@@ -6815,10 +6783,6 @@ static void dequeue_task_fair(struct rq *rq, struct task_struct *p, int flags)
if (cfs_rq_is_idle(cfs_rq))
idle_h_nr_running = 1;

- /* end evaluation on encountering a throttled cfs_rq */
- if (cfs_rq_throttled(cfs_rq))
- goto dequeue_throttle;
-
}

/* At this point se is NULL and we are at root level*/
@@ -6828,7 +6792,6 @@ static void dequeue_task_fair(struct rq *rq, struct task_struct *p, int flags)
if (unlikely(!was_sched_idle && sched_idle_rq(rq)))
rq->next_balance = jiffies;

-dequeue_throttle:
util_est_update(&rq->cfs, p, task_sleep);
hrtick_update(rq);
}
@@ -9582,7 +9545,7 @@ static inline int check_misfit_status(struct rq *rq, struct sched_domain *sd)
* Something like:
*
* { 0 1 2 3 } { 4 5 6 7 }
- * * * * *
+ * * * * *
*
* If we were to balance group-wise we'd place two tasks in the first group and
* two tasks in the second group. Clearly this is undesired as it will overload
@@ -12642,9 +12605,6 @@ static void propagate_entity_cfs_rq(struct sched_entity *se)
{
struct cfs_rq *cfs_rq = cfs_rq_of(se);

- if (cfs_rq_throttled(cfs_rq))
- return;
-
if (!throttled_hierarchy(cfs_rq))
list_add_leaf_cfs_rq(cfs_rq);

@@ -12656,9 +12616,6 @@ static void propagate_entity_cfs_rq(struct sched_entity *se)

update_load_avg(cfs_rq, se, UPDATE_TG);

- if (cfs_rq_throttled(cfs_rq))
- break;
-
if (!throttled_hierarchy(cfs_rq))
list_add_leaf_cfs_rq(cfs_rq);
}
diff --git a/kernel/sched/sched.h b/kernel/sched/sched.h
index be29154d93898..7f1afee52a776 100644
--- a/kernel/sched/sched.h
+++ b/kernel/sched/sched.h
@@ -631,9 +631,8 @@ struct cfs_rq {
u64 throttled_clock_pelt_time;
u64 throttled_clock_self;
u64 throttled_clock_self_time;
- int throttled;
- int throttle_count;
int in_throttle_limbo;
+ int throttle_count;
/* Temp storage for updating the counts during unthrottling */
unsigned int unthrottled_h_nr_running;
unsigned int unthrottled_idle_h_nr_running;
--
2.41.0