[PATCH] sched/rt: Make rt_rq->pushable_tasks updates drive rto_mask

From: Valentin Schneider
Date: Fri Aug 11 2023 - 07:22:35 EST


Sebastian noted that the rto_push_work IRQ work can be queued for a CPU
that has an empty pushable_tasks list, which means nothing useful will be
done in the IPI other than queue the work for the next CPU on the rto_mask.

rto_push_irq_work_func() only operates on tasks in the pushable_tasks list,
but the conditions for that irq_work to be queued (and for a CPU to be
added to the rto_mask) rely on rq_rt->nr_migratory instead.

nr_migratory is increased whenever an RT task entity is enqueued and it has
nr_cpus_allowed > 1. Unlike the pushable_tasks list, nr_migratory includes a
rt_rq's current task. This means a rt_rq can have a migratible current, N
non-migratible queued tasks, and be flagged as overloaded / have its CPU
set in the rto_mask, despite having an empty pushable_tasks list.

Make an rt_rq's overload logic be driven by {enqueue,dequeue}_pushable_task().
Since rt_rq->{rt_nr_migratory,rt_nr_total} become unused, remove them.

Note that the case where the current task is pushed away to make way for a
migration-disabled task remains unchanged: the migration-disabled task has
to be in the pushable_tasks list in the first place, which means it has
nr_cpus_allowed > 1.

Link: http://lore.kernel.org/r/20230801152648._y603AS_@xxxxxxxxxxxxx
Reported-by: Sebastian Andrzej Siewior <bigeasy@xxxxxxxxxxxxx>
Signed-off-by: Valentin Schneider <vschneid@xxxxxxxxxx>
---
This is lightly tested, this looks to be working OK but I don't have nor am
I aware of a test case for RT balancing, I suppose we want something that
asserts we always run the N highest prio tasks for N CPUs, with a small
margin for migrations?
---
kernel/sched/debug.c | 3 --
kernel/sched/rt.c | 70 +++++++-------------------------------------
kernel/sched/sched.h | 2 --
3 files changed, 10 insertions(+), 65 deletions(-)

diff --git a/kernel/sched/debug.c b/kernel/sched/debug.c
index 4c3d0d9f3db63..b4f6fb592a123 100644
--- a/kernel/sched/debug.c
+++ b/kernel/sched/debug.c
@@ -724,9 +724,6 @@ void print_rt_rq(struct seq_file *m, int cpu, struct rt_rq *rt_rq)
SEQ_printf(m, " .%-30s: %Ld.%06ld\n", #x, SPLIT_NS(rt_rq->x))

PU(rt_nr_running);
-#ifdef CONFIG_SMP
- PU(rt_nr_migratory);
-#endif
P(rt_throttled);
PN(rt_time);
PN(rt_runtime);
diff --git a/kernel/sched/rt.c b/kernel/sched/rt.c
index 00e0e50741153..12100f3b6e5f2 100644
--- a/kernel/sched/rt.c
+++ b/kernel/sched/rt.c
@@ -143,7 +143,6 @@ void init_rt_rq(struct rt_rq *rt_rq)
#if defined CONFIG_SMP
rt_rq->highest_prio.curr = MAX_RT_PRIO-1;
rt_rq->highest_prio.next = MAX_RT_PRIO-1;
- rt_rq->rt_nr_migratory = 0;
rt_rq->overloaded = 0;
plist_head_init(&rt_rq->pushable_tasks);
#endif /* CONFIG_SMP */
@@ -358,53 +357,6 @@ static inline void rt_clear_overload(struct rq *rq)
cpumask_clear_cpu(rq->cpu, rq->rd->rto_mask);
}

-static void update_rt_migration(struct rt_rq *rt_rq)
-{
- if (rt_rq->rt_nr_migratory && rt_rq->rt_nr_total > 1) {
- if (!rt_rq->overloaded) {
- rt_set_overload(rq_of_rt_rq(rt_rq));
- rt_rq->overloaded = 1;
- }
- } else if (rt_rq->overloaded) {
- rt_clear_overload(rq_of_rt_rq(rt_rq));
- rt_rq->overloaded = 0;
- }
-}
-
-static void inc_rt_migration(struct sched_rt_entity *rt_se, struct rt_rq *rt_rq)
-{
- struct task_struct *p;
-
- if (!rt_entity_is_task(rt_se))
- return;
-
- p = rt_task_of(rt_se);
- rt_rq = &rq_of_rt_rq(rt_rq)->rt;
-
- rt_rq->rt_nr_total++;
- if (p->nr_cpus_allowed > 1)
- rt_rq->rt_nr_migratory++;
-
- update_rt_migration(rt_rq);
-}
-
-static void dec_rt_migration(struct sched_rt_entity *rt_se, struct rt_rq *rt_rq)
-{
- struct task_struct *p;
-
- if (!rt_entity_is_task(rt_se))
- return;
-
- p = rt_task_of(rt_se);
- rt_rq = &rq_of_rt_rq(rt_rq)->rt;
-
- rt_rq->rt_nr_total--;
- if (p->nr_cpus_allowed > 1)
- rt_rq->rt_nr_migratory--;
-
- update_rt_migration(rt_rq);
-}
-
static inline int has_pushable_tasks(struct rq *rq)
{
return !plist_head_empty(&rq->rt.pushable_tasks);
@@ -438,6 +390,11 @@ static void enqueue_pushable_task(struct rq *rq, struct task_struct *p)
/* Update the highest prio pushable task */
if (p->prio < rq->rt.highest_prio.next)
rq->rt.highest_prio.next = p->prio;
+
+ if (!rq->rt.overloaded) {
+ rt_set_overload(rq);
+ rq->rt.overloaded = 1;
+ }
}

static void dequeue_pushable_task(struct rq *rq, struct task_struct *p)
@@ -451,6 +408,11 @@ static void dequeue_pushable_task(struct rq *rq, struct task_struct *p)
rq->rt.highest_prio.next = p->prio;
} else {
rq->rt.highest_prio.next = MAX_RT_PRIO-1;
+
+ if (rq->rt.overloaded) {
+ rt_clear_overload(rq);
+ rq->rt.overloaded = 0;
+ }
}
}

@@ -464,16 +426,6 @@ static inline void dequeue_pushable_task(struct rq *rq, struct task_struct *p)
{
}

-static inline
-void inc_rt_migration(struct sched_rt_entity *rt_se, struct rt_rq *rt_rq)
-{
-}
-
-static inline
-void dec_rt_migration(struct sched_rt_entity *rt_se, struct rt_rq *rt_rq)
-{
-}
-
static inline void rt_queue_push_tasks(struct rq *rq)
{
}
@@ -1281,7 +1233,6 @@ void inc_rt_tasks(struct sched_rt_entity *rt_se, struct rt_rq *rt_rq)
rt_rq->rr_nr_running += rt_se_rr_nr_running(rt_se);

inc_rt_prio(rt_rq, prio);
- inc_rt_migration(rt_se, rt_rq);
inc_rt_group(rt_se, rt_rq);
}

@@ -1294,7 +1245,6 @@ void dec_rt_tasks(struct sched_rt_entity *rt_se, struct rt_rq *rt_rq)
rt_rq->rr_nr_running -= rt_se_rr_nr_running(rt_se);

dec_rt_prio(rt_rq, rt_se_prio(rt_se));
- dec_rt_migration(rt_se, rt_rq);
dec_rt_group(rt_se, rt_rq);
}

diff --git a/kernel/sched/sched.h b/kernel/sched/sched.h
index 9c5035ca3b06d..bea6a9ec8cde0 100644
--- a/kernel/sched/sched.h
+++ b/kernel/sched/sched.h
@@ -675,8 +675,6 @@ struct rt_rq {
} highest_prio;
#endif
#ifdef CONFIG_SMP
- unsigned int rt_nr_migratory;
- unsigned int rt_nr_total;
int overloaded;
struct plist_head pushable_tasks;

--
2.31.1