Re: [PATCH 2/3] sched: Cleanup bandwidth timers

From: bsegall
Date: Thu Apr 16 2015 - 16:03:52 EST


Peter Zijlstra <peterz@xxxxxxxxxxxxx> writes:

> Roman reported a 3 cpu lockup scenario involving __start_cfs_bandwidth().
>
> The more I look at that code the more I'm convinced it's crack; that
> entire __start_cfs_bandwidth() thing is brain melting. We don't need to
> cancel a timer before starting it, *hrtimer_start*() will happily remove
> the timer for you if it's still enqueued.
>
> Removing that removes a big part of the problem: no more ugly cancel
> loop to get stuck in.
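
For anyone reading along, the whole idiom the cleanup relies on is just
this (a sketch with made-up names, not part of the patch; it mirrors the
new start_bandwidth_timer() below):

	#include <linux/hrtimer.h>	/* hrtimer_*, ktime_t */

	/*
	 * Re-arm a possibly-queued period timer.  No cancel loop required:
	 * hrtimer_start_expires() dequeues a still-enqueued timer itself
	 * before re-queueing it.
	 */
	static void rearm_period_timer(struct hrtimer *timer, ktime_t period)
	{
		/*
		 * Don't forward an active timer, though -- that could
		 * swallow an overrun the handler still has to process.
		 */
		if (!hrtimer_active(timer))
			hrtimer_forward_now(timer, period);

		hrtimer_start_expires(timer, HRTIMER_MODE_ABS_PINNED);
	}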
>
> So now, if I understand things right, the entire reason you have this
> cfs_b->lock guarded ->timer_active nonsense is to make sure we don't
> accidentally loose the timer.
>
> It appears to me that it should be possible to guarantee the same by
> unconditionally (re)starting the timer when !queued. Because regardless
> of what hrtimer::function will return, if we beat it to (re)enqueue the
> timer, it doesn't matter.
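
The caller side then gets correspondingly dumb. Roughly (paraphrasing the
tail of throttle_cfs_rq() from the patch below, wrapped in a made-up helper
name for illustration; kernel/sched/fair.c context assumed):

	static void add_throttled(struct cfs_rq *cfs_rq, struct cfs_bandwidth *cfs_b)
	{
		bool empty;

		raw_spin_lock(&cfs_b->lock);
		empty = list_empty(&cfs_rq->throttled_list);

		/* head-add so a running distribute_cfs_runtime() skips us */
		list_add_rcu(&cfs_rq->throttled_list, &cfs_b->throttled_cfs_rq);

		/*
		 * First throttled rq: unconditionally (re)start the period
		 * timer.  If the handler is about to return HRTIMER_NORESTART
		 * we have already re-enqueued the timer; if it returns
		 * HRTIMER_RESTART we merely moved an already-queued timer.
		 * Either way it can't get lost, so no ->timer_active needed.
		 */
		if (empty)
			start_cfs_bandwidth(cfs_b);

		raw_spin_unlock(&cfs_b->lock);
	}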
>
> Now, because hrtimers don't come with any serialization guarantees we
> must ensure both handler and (re)start loop serialize their access to
> the hrtimer to avoid both trying to forward the timer at the same
> time.
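
IOW both ends do their hrtimer_forward_now() under the same lock, along
these lines (self-contained sketch with stand-in names, matching the
cfs/rt handler hunks further down):

	#include <linux/kernel.h>	/* container_of */
	#include <linux/hrtimer.h>
	#include <linux/spinlock.h>

	struct my_bw {				/* stand-in for cfs_b / rt_b */
		raw_spinlock_t	lock;
		ktime_t		period;
		struct hrtimer	period_timer;
	};

	/* (re)start path: runs under bw->lock whenever new work shows up */
	static void my_bw_start(struct my_bw *bw)
	{
		lockdep_assert_held(&bw->lock);

		if (!hrtimer_active(&bw->period_timer))
			hrtimer_forward_now(&bw->period_timer, bw->period);
		hrtimer_start_expires(&bw->period_timer, HRTIMER_MODE_ABS_PINNED);
	}

	/* handler: forwards the timer under the same lock, so handler and
	 * restarter can never both advance the expiry at the same time. */
	static enum hrtimer_restart my_bw_timer_fn(struct hrtimer *timer)
	{
		struct my_bw *bw = container_of(timer, struct my_bw, period_timer);
		int overrun, idle = 0;

		raw_spin_lock(&bw->lock);
		for (;;) {
			overrun = hrtimer_forward_now(timer, bw->period);
			if (!overrun)
				break;
			/* ... one period's worth of work; may set idle ... */
		}
		raw_spin_unlock(&bw->lock);

		return idle ? HRTIMER_NORESTART : HRTIMER_RESTART;
	}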
>
> Update the rt bandwidth timer to match.
>
> This effectively reverts: 09dc4ab03936 ("sched/fair: Fix
> tg_set_cfs_bandwidth() deadlock on rq->lock").
>
> Cc: Thomas Gleixner <tglx@xxxxxxxxxxxxx>
> Cc: Paul Turner <pjt@xxxxxxxxxx>
> Cc: Ben Segall <bsegall@xxxxxxxxxx>
> Reported-by: Roman Gushchin <klamm@xxxxxxxxxxxxxx>

Reviewed-by: Ben Segall <bsegall@xxxxxxxxxx>

So much nicer. Docs/comment issue only: s/loose/lose/


> Signed-off-by: Peter Zijlstra (Intel) <peterz@xxxxxxxxxxxxx>
> ---
> kernel/sched/core.c | 15 ++++++------
> kernel/sched/fair.c | 59 ++++++++++++---------------------------------------
> kernel/sched/rt.c | 14 +++++-------
> kernel/sched/sched.h | 4 +--
> 4 files changed, 31 insertions(+), 61 deletions(-)
>
> --- a/kernel/sched/core.c
> +++ b/kernel/sched/core.c
> @@ -92,10 +92,13 @@
>
> void start_bandwidth_timer(struct hrtimer *period_timer, ktime_t period)
> {
> - if (hrtimer_active(period_timer))
> - return;
> + /*
> + * Do not forward the expiration time of active timers;
> + * we do not want to loose an overrun.
> + */
> + if (!hrtimer_active(period_timer))
> + hrtimer_forward_now(period_timer, period);
>
> - hrtimer_forward_now(period_timer, period);
> hrtimer_start_expires(period_timer, HRTIMER_MODE_ABS_PINNED);
> }
>
> @@ -8098,10 +8101,8 @@ static int tg_set_cfs_bandwidth(struct t
>
> __refill_cfs_bandwidth_runtime(cfs_b);
> /* restart the period timer (if active) to handle new period expiry */
> - if (runtime_enabled && cfs_b->timer_active) {
> - /* force a reprogram */
> - __start_cfs_bandwidth(cfs_b, true);
> - }
> + if (runtime_enabled)
> + start_cfs_bandwidth(cfs_b);
> raw_spin_unlock_irq(&cfs_b->lock);
>
> for_each_online_cpu(i) {
> --- a/kernel/sched/fair.c
> +++ b/kernel/sched/fair.c
> @@ -3476,16 +3476,7 @@ static int assign_cfs_rq_runtime(struct
> if (cfs_b->quota == RUNTIME_INF)
> amount = min_amount;
> else {
> - /*
> - * If the bandwidth pool has become inactive, then at least one
> - * period must have elapsed since the last consumption.
> - * Refresh the global state and ensure bandwidth timer becomes
> - * active.
> - */
> - if (!cfs_b->timer_active) {
> - __refill_cfs_bandwidth_runtime(cfs_b);
> - __start_cfs_bandwidth(cfs_b, false);
> - }
> + start_cfs_bandwidth(cfs_b);
>
> if (cfs_b->runtime > 0) {
> amount = min(cfs_b->runtime, min_amount);
> @@ -3634,6 +3625,7 @@ static void throttle_cfs_rq(struct cfs_r
> struct cfs_bandwidth *cfs_b = tg_cfs_bandwidth(cfs_rq->tg);
> struct sched_entity *se;
> long task_delta, dequeue = 1;
> + bool empty;
>
> se = cfs_rq->tg->se[cpu_of(rq_of(cfs_rq))];
>
> @@ -3663,13 +3655,21 @@ static void throttle_cfs_rq(struct cfs_r
> cfs_rq->throttled = 1;
> cfs_rq->throttled_clock = rq_clock(rq);
> raw_spin_lock(&cfs_b->lock);
> + empty = list_empty(&cfs_rq->throttled_list);
> +
> /*
> * Add to the _head_ of the list, so that an already-started
> * distribute_cfs_runtime will not see us
> */
> list_add_rcu(&cfs_rq->throttled_list, &cfs_b->throttled_cfs_rq);
> - if (!cfs_b->timer_active)
> - __start_cfs_bandwidth(cfs_b, false);
> +
> + /*
> + * If we're the first throttled task, make sure the bandwidth
> + * timer is running.
> + */
> + if (empty)
> + start_cfs_bandwidth(cfs_b);
> +
> raw_spin_unlock(&cfs_b->lock);
> }
>
> @@ -3784,13 +3784,6 @@ static int do_sched_cfs_period_timer(str
> if (cfs_b->idle && !throttled)
> goto out_deactivate;
>
> - /*
> - * if we have relooped after returning idle once, we need to update our
> - * status as actually running, so that other cpus doing
> - * __start_cfs_bandwidth will stop trying to cancel us.
> - */
> - cfs_b->timer_active = 1;
> -
> __refill_cfs_bandwidth_runtime(cfs_b);
>
> if (!throttled) {
> @@ -3835,7 +3828,6 @@ static int do_sched_cfs_period_timer(str
> return 0;
>
> out_deactivate:
> - cfs_b->timer_active = 0;
> return 1;
> }
>
> @@ -3999,6 +3991,7 @@ static enum hrtimer_restart sched_cfs_sl
> {
> struct cfs_bandwidth *cfs_b =
> container_of(timer, struct cfs_bandwidth, slack_timer);
> +
> do_sched_cfs_slack_timer(cfs_b);
>
> return HRTIMER_NORESTART;
> @@ -4008,15 +4001,12 @@ static enum hrtimer_restart sched_cfs_pe
> {
> struct cfs_bandwidth *cfs_b =
> container_of(timer, struct cfs_bandwidth, period_timer);
> - ktime_t now;
> int overrun;
> int idle = 0;
>
> raw_spin_lock(&cfs_b->lock);
> for (;;) {
> - now = hrtimer_cb_get_time(timer);
> - overrun = hrtimer_forward(timer, now, cfs_b->period);
> -
> + overrun = hrtimer_forward_now(timer, cfs_b->period);
> if (!overrun)
> break;
>
> @@ -4047,27 +4037,8 @@ static void init_cfs_rq_runtime(struct c
> INIT_LIST_HEAD(&cfs_rq->throttled_list);
> }
>
> -/* requires cfs_b->lock, may release to reprogram timer */
> -void __start_cfs_bandwidth(struct cfs_bandwidth *cfs_b, bool force)
> +void start_cfs_bandwidth(struct cfs_bandwidth *cfs_b)
> {
> - /*
> - * The timer may be active because we're trying to set a new bandwidth
> - * period or because we're racing with the tear-down path
> - * (timer_active==0 becomes visible before the hrtimer call-back
> - * terminates). In either case we ensure that it's re-programmed
> - */
> - while (unlikely(hrtimer_active(&cfs_b->period_timer)) &&
> - hrtimer_try_to_cancel(&cfs_b->period_timer) < 0) {
> - /* bounce the lock to allow do_sched_cfs_period_timer to run */
> - raw_spin_unlock(&cfs_b->lock);
> - cpu_relax();
> - raw_spin_lock(&cfs_b->lock);
> - /* if someone else restarted the timer then we're done */
> - if (!force && cfs_b->timer_active)
> - return;
> - }
> -
> - cfs_b->timer_active = 1;
> start_bandwidth_timer(&cfs_b->period_timer, cfs_b->period);
> }
>
> --- a/kernel/sched/rt.c
> +++ b/kernel/sched/rt.c
> @@ -18,19 +18,20 @@ static enum hrtimer_restart sched_rt_per
> {
> struct rt_bandwidth *rt_b =
> container_of(timer, struct rt_bandwidth, rt_period_timer);
> - ktime_t now;
> - int overrun;
> int idle = 0;
> + int overrun;
>
> + raw_spin_lock(&rt_b->rt_runtime_lock);
> for (;;) {
> - now = hrtimer_cb_get_time(timer);
> - overrun = hrtimer_forward(timer, now, rt_b->rt_period);
> -
> + overrun = hrtimer_forward_now(timer, rt_b->rt_period);
> if (!overrun)
> break;
>
> + raw_spin_unlock(&rt_b->rt_runtime_lock);
> idle = do_sched_rt_period_timer(rt_b, overrun);
> + raw_spin_lock(&rt_b->rt_runtime_lock);
> }
> + raw_spin_unlock(&rt_b->rt_runtime_lock);
>
> return idle ? HRTIMER_NORESTART : HRTIMER_RESTART;
> }
> @@ -52,9 +53,6 @@ static void start_rt_bandwidth(struct rt
> if (!rt_bandwidth_enabled() || rt_b->rt_runtime == RUNTIME_INF)
> return;
>
> - if (hrtimer_active(&rt_b->rt_period_timer))
> - return;
> -
> raw_spin_lock(&rt_b->rt_runtime_lock);
> start_bandwidth_timer(&rt_b->rt_period_timer, rt_b->rt_period);
> raw_spin_unlock(&rt_b->rt_runtime_lock);
> --- a/kernel/sched/sched.h
> +++ b/kernel/sched/sched.h
> @@ -215,7 +215,7 @@ struct cfs_bandwidth {
> s64 hierarchical_quota;
> u64 runtime_expires;
>
> - int idle, timer_active;
> + int idle;
> struct hrtimer period_timer, slack_timer;
> struct list_head throttled_cfs_rq;
>
> @@ -306,7 +306,7 @@ extern void init_cfs_bandwidth(struct cf
> extern int sched_group_set_shares(struct task_group *tg, unsigned long shares);
>
> extern void __refill_cfs_bandwidth_runtime(struct cfs_bandwidth *cfs_b);
> -extern void __start_cfs_bandwidth(struct cfs_bandwidth *cfs_b, bool force);
> +extern void start_cfs_bandwidth(struct cfs_bandwidth *cfs_b);
> extern void unthrottle_cfs_rq(struct cfs_rq *cfs_rq);
>
> extern void free_rt_sched_group(struct task_group *tg);