Re: [PATCH v3] sched: async unthrottling for cfs bandwidth

From: Peter Zijlstra
Date: Fri Nov 25 2022 - 03:57:41 EST


On Tue, Nov 22, 2022 at 11:35:48AM +0100, Peter Zijlstra wrote:
> On Mon, Nov 21, 2022 at 11:37:14AM -0800, Josh Don wrote:
> > Yep, this tradeoff feels "best", but there are some edge cases where
> > this could potentially disrupt fairness. For example, if we have
> > non-trivial W, a lot of cpus to iterate through for dispatching remote
> > unthrottle, and quota is small. Doesn't help that the timer is pinned
> > so that this will continually hit the same cpu.
>
> We could -- if we wanted to -- manually rotate the timer around the
> relevant CPUs. Doing that sanely would require a bit of hrtimer surgery
> though I'm afraid.

Here; something like so should enable us to cycle the bandwidth timer.
Just need to figure out a way to find another CPU or something.
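
For illustration only (none of this is in the patch below): the callback
itself could pick the target CPU and encode it in the new restart value.
The sketch uses a made-up callback name, an arbitrary 100ms period and a
naive cpumask_next() rotation; it is merely the sort of thing
sched_cfs_period_timer() could end up doing once the hrtimer bits exist.

#include <linux/cpumask.h>
#include <linux/hrtimer.h>
#include <linux/ktime.h>
#include <linux/smp.h>

/*
 * Illustration only: with the hrtimer change below applied, returning
 * HRTIMER_RESTART_MIGRATE + cpu asks __run_hrtimer() to re-enqueue the
 * timer on that CPU's clock base instead of the current one.
 */
static enum hrtimer_restart example_period_timer(struct hrtimer *timer)
{
	int cpu = smp_processor_id();
	int next = cpumask_next(cpu, cpu_online_mask);

	if (next >= nr_cpu_ids)
		next = cpumask_first(cpu_online_mask);

	/* Advance to the next period before re-arming. */
	hrtimer_forward_now(timer, ms_to_ktime(100));

	if (next == cpu)
		return HRTIMER_RESTART;		/* nowhere else to go */

	return HRTIMER_RESTART_MIGRATE + next;
}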

---
diff --git a/include/linux/hrtimer.h b/include/linux/hrtimer.h
index 0ee140176f10..f8bd200d678a 100644
--- a/include/linux/hrtimer.h
+++ b/include/linux/hrtimer.h
@@ -63,8 +63,10 @@ enum hrtimer_mode {
  * Return values for the callback function
  */
 enum hrtimer_restart {
-	HRTIMER_NORESTART,	/* Timer is not restarted */
-	HRTIMER_RESTART,	/* Timer must be restarted */
+	HRTIMER_RESTART		= -1,	/* Timer must be restarted */
+	HRTIMER_NORESTART	= 0,	/* Timer is not restarted */
+	HRTIMER_RESTART_MIGRATE	= 1,
+	HRTIMER_RESTART_MIGRATE_MAX = HRTIMER_RESTART_MIGRATE + NR_CPUS,
 };
 
 /*
diff --git a/kernel/time/hrtimer.c b/kernel/time/hrtimer.c
index 3ae661ab6260..e75033f78a19 100644
--- a/kernel/time/hrtimer.c
+++ b/kernel/time/hrtimer.c
@@ -1621,6 +1621,16 @@ bool hrtimer_active(const struct hrtimer *timer)
 }
 EXPORT_SYMBOL_GPL(hrtimer_active);
 
+static void raw_spin_lock_double(raw_spinlock_t *a, raw_spinlock_t *b)
+{
+	if (b < a)
+		swap(a, b);
+
+	raw_spin_lock(a);
+	if (b != a)
+		raw_spin_lock_nested(b, SINGLE_DEPTH_NESTING);
+}
+
 /*
  * The write_seqcount_barrier()s in __run_hrtimer() split the thing into 3
  * distinct sections:
@@ -1644,6 +1654,8 @@ static void __run_hrtimer(struct hrtimer_cpu_base *cpu_base,
 			  struct hrtimer *timer, ktime_t *now,
 			  unsigned long flags) __must_hold(&cpu_base->lock)
 {
+	struct hrtimer_cpu_base *new_cpu_base = cpu_base;
+	struct hrtimer_clock_base *new_base = base;
 	enum hrtimer_restart (*fn)(struct hrtimer *);
 	bool expires_in_hardirq;
 	int restart;
@@ -1686,7 +1698,17 @@ static void __run_hrtimer(struct hrtimer_cpu_base *cpu_base,
 
 	lockdep_hrtimer_exit(expires_in_hardirq);
 	trace_hrtimer_expire_exit(timer);
-	raw_spin_lock_irq(&cpu_base->lock);
+
+	local_irq_disable();
+
+	if (restart >= HRTIMER_RESTART_MIGRATE) {
+		int cpu = restart - HRTIMER_RESTART_MIGRATE;
+		int b = base - cpu_base->clock_base;
+
+		new_cpu_base = &per_cpu(hrtimer_bases, cpu);
+		new_base = &new_cpu_base->clock_base[b];
+	}
+	raw_spin_lock_double(&cpu_base->lock, &new_cpu_base->lock);
 
 	/*
 	 * Note: We clear the running state after enqueue_hrtimer and
@@ -1698,8 +1720,16 @@ static void __run_hrtimer(struct hrtimer_cpu_base *cpu_base,
 	 * for us already.
 	 */
 	if (restart != HRTIMER_NORESTART &&
-	    !(timer->state & HRTIMER_STATE_ENQUEUED))
-		enqueue_hrtimer(timer, base, HRTIMER_MODE_ABS);
+	    !(timer->state & HRTIMER_STATE_ENQUEUED)) {
+
+		if (new_cpu_base != cpu_base) {
+			timer->base = new_base;
+			enqueue_hrtimer(timer, new_base, HRTIMER_MODE_ABS);
+			raw_spin_unlock(&new_cpu_base->lock);
+		} else {
+			enqueue_hrtimer(timer, base, HRTIMER_MODE_ABS);
+		}
+	}
 
 	/*
 	 * Separate the ->running assignment from the ->state assignment.
@@ -2231,12 +2261,8 @@ int hrtimers_dead_cpu(unsigned int scpu)
 	local_irq_disable();
 	old_base = &per_cpu(hrtimer_bases, scpu);
 	new_base = this_cpu_ptr(&hrtimer_bases);
-	/*
-	 * The caller is globally serialized and nobody else
-	 * takes two locks at once, deadlock is not possible.
-	 */
-	raw_spin_lock(&new_base->lock);
-	raw_spin_lock_nested(&old_base->lock, SINGLE_DEPTH_NESTING);
+
+	raw_spin_lock_double(&old_base->lock, &new_base->lock);
 
 	for (i = 0; i < HRTIMER_MAX_CLOCK_BASES; i++) {
 		migrate_hrtimer_list(&old_base->clock_base[i],