Re: [PATCH tip/sched/core v2] sched/rt: Simplify the IPI rt balancing logic

From: Peter Zijlstra
Date: Thu May 04 2017 - 11:33:24 EST


On Mon, Apr 24, 2017 at 11:47:32AM -0400, Steven Rostedt wrote:
> static int rto_next_cpu(struct rq *rq)
> {
> int cpu;
>
> /*
> + * When starting the IPI RT pushing, the rto_cpu is set to nr_cpu_ids
> + * or greater. rt_next_cpu() will simply return the first CPU found in
> + * the rto_mask.
> + *
> + * If rto_next_cpu() is called with rto_cpu less than nr_cpu_ids, it
> + * will return the next CPU found in the rto_mask.
> + *
> + * If there are no more CPUs left in the rto_mask, then a check is made
> + * against rto_loop and rto_loop_next. rto_loop is only updated with
> + * the rto_lock held, but any CPU may increment the rto_loop_next
> + * without any locking.
> */
> +again:
> + if (rq->rd->rto_cpu >= nr_cpu_ids) {
> cpu = cpumask_first(rq->rd->rto_mask);
> + rq->rd->rto_cpu = cpu;
> + /* If cpu is nr_cpu_ids, then there is no overloaded rqs */
> + return cpu;
> }
>
> + cpu = cpumask_next(rq->rd->rto_cpu, rq->rd->rto_mask);
> + rq->rd->rto_cpu = cpu;
>
> + if (cpu < nr_cpu_ids)
> + return cpu;
>
> + if (rq->rd->rto_loop == atomic_read(&rq->rd->rto_loop_next))
> + return cpu;
>
> + rq->rd->rto_loop = atomic_read(&rq->rd->rto_loop_next);
> + goto again;
> +}

I think you want to write that as:

struct root_domain *rd = rq->rd;
int cpu, next;

/* comment */
for (;;) {
if (rd->rto_cpu >= nr_cpu_ids) {
cpu = cpumask_first(rd->rto_mask);
rd->rto_cpu = cpu;
return cpu;
}

cpu = cpumask_next(rd->rto_cpu, rd->rto_mask);
rd->rto_cpu = cpu;

if (cpu < nr_cpu_ids)
break;

// rd->rto_cpu = -1;

/*
* ACQUIRE ensures we see the @rto_mask changes
* made prior to the @next value observed.
*
* Matches WMB in rt_set_overload().
*/
next = atomic_read_acquire(&rd->rto_loop_next);

if (rd->rto_loop == next)
break;

rd->rto_loop = next;
}

return cpu;

And I don't fully understand the whole rto_cpu >= nr_cpu_ids thing;
can't you simply reset the thing to -1 and always use cpumask_next(),
as per the // comment above?

> +static inline bool rto_start_trylock(atomic_t *v)
> +{
> + return !atomic_cmpxchg(v, 0, 1);

Arguably this could be: !atomic_cmpxchg_acquire(v, 0, 1);

> }
>
> +static inline void rto_start_unlock(atomic_t *v)
> +{
> + atomic_set_release(v, 0);
> +}
>

> static void tell_cpu_to_push(struct rq *rq)
> {
> + int cpu = nr_cpu_ids;
>
> + /* Keep the loop going if the IPI is currently active */
> + atomic_inc_return(&rq->rd->rto_loop_next);

Since rt_set_overload() already provides a WMB, we don't need an
ordered primitive here and atomic_inc() is fine.

>
> + /* Only one CPU can initiate a loop at a time */
> + if (!rto_start_trylock(&rq->rd->rto_loop_start))
> return;
>
> + raw_spin_lock(&rq->rd->rto_lock);
> +
> + /*
> + * The rto_cpu is updated under the lock, if it has a valid cpu
> + * then the IPI is still running and will continue due to the
> + * update to loop_next, and nothing needs to be done here.
> + * Otherwise it is finishing up and an ipi needs to be sent.
> + */
> + if (rq->rd->rto_cpu >= nr_cpu_ids)
// if (rq->rd->rto_cpu < 0)

> + cpu = rto_next_cpu(rq);
>
> + raw_spin_unlock(&rq->rd->rto_lock);
> +
> + rto_start_unlock(&rq->rd->rto_loop_start);
> +
> + if (cpu < nr_cpu_ids)
> + irq_work_queue_on(&rq->rd->rto_push_work, cpu);
> }
>
> /* Called from hardirq context */
> +void rto_push_irq_work_func(struct irq_work *work)
> {
> + struct rq *rq;
> int this_cpu;
> int cpu;
>
> + this_cpu = smp_processor_id();
> rq = cpu_rq(this_cpu);

rq = this_rq();

>
> + /*
> + * We do not need to grab the lock to check for has_pushable_tasks.
> + * When it gets updated, a check is made if a push is possible.
> + */
> if (has_pushable_tasks(rq)) {
> raw_spin_lock(&rq->lock);
> + push_rt_tasks(rq);
> raw_spin_unlock(&rq->lock);
> }
>
> + raw_spin_lock(&rq->rd->rto_lock);
>
> + /* Pass the IPI to the next rt overloaded queue */
> + cpu = rto_next_cpu(rq);
>
> + raw_spin_unlock(&rq->rd->rto_lock);
>
> if (cpu >= nr_cpu_ids)
> return;
>
> /* Try the next RT overloaded CPU */
> + irq_work_queue_on(&rq->rd->rto_push_work, cpu);
> }