Re: [patch V2 08/10] timer: Implement the hierarchical pull model

From: Peter Zijlstra
Date: Wed Apr 19 2017 - 05:09:29 EST


On Tue, Apr 18, 2017 at 01:11:10PM +0200, Thomas Gleixner wrote:
> +static void __tmigr_handle_remote(struct tmigr_group *group, unsigned int cpu,
> +				   u64 now, unsigned long jif, bool walkup)
> +{
> +	struct timerqueue_node *tmr;
> +	struct tmigr_group *parent;
> +	struct tmigr_event *evt;
> +
> +again:
> +	raw_spin_lock_irq(&group->lock);
> +	/*
> +	 * Handle the group only if @cpu is the migrator or if the group
> +	 * has no migrator. Otherwise the group is active and is handled by
> +	 * its own migrator.
> +	 */
> +	if (group->migrator != cpu && group->migrator != TMIGR_NONE) {
> +		raw_spin_unlock_irq(&group->lock);
> +		return;
> +	}
> +
> +	tmr = timerqueue_getnext(&group->events);
> +	if (tmr && now >= tmr->expires) {
> +		/*
> +		 * Remove the expired entry from the queue and handle
> +		 * it. If this is a leaf group, call the timer poll
> +		 * function for the given cpu. Otherwise handle the group
> +		 * itself. Drop the group lock here in both cases to avoid
> +		 * lock ordering inversions.
> +		 */
> +		evt = container_of(tmr, struct tmigr_event, nextevt);
> +		tmigr_remove_evt(group, evt);
> +
> +		raw_spin_unlock_irq(&group->lock);
> +
> +		/*
> +		 * If the event is a group event, walk down the hierarchy
> +		 * of that group to the CPU leaves. If not, handle the
> +		 * expired timer from the remote CPU.
> +		 */
> +		if (evt->group) {
> +			__tmigr_handle_remote(evt->group, cpu, now, jif, false);

Recursion..

> +		} else {
> +			timer_expire_remote(evt->cpu);
> +			tmigr_update_remote(evt->cpu, now, jif);
> +		}
> +		goto again;
> +	}
> +
> +	/*
> +	 * If @group is not active, queue the next event in the parent
> +	 * group. This is required, because the next event of @group
> +	 * could have been changed by tmigr_update_remote() above.
> +	 */
> +	parent = group->parent;
> +	if (parent && !group->active) {
> +		raw_spin_lock_nested(&parent->lock, parent->level);
> +		tmigr_add_evt(parent, &group->groupevt);
> +		raw_spin_unlock(&parent->lock);
> +	}
> +	raw_spin_unlock_irq(&group->lock);
> +
> +	/* Walk the hierarchy up? */
> +	if (!walkup || !parent)
> +		return;
> +
> +	/* Racy lockless check: See comment in tmigr_handle_remote() */
> +	if (parent->migrator == cpu)
> +		__tmigr_handle_remote(parent, cpu, now, jif, true);

(hopefully) Tail recursion..

> +}
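
FWIW, the tail call at the bottom looks like it converts without much
surgery; entirely untested sketch below, body kept as-is otherwise. The
walk down for group events still recurses, but since that passes
walkup == false those frames at least nest no deeper than the hierarchy:

static void __tmigr_handle_remote(struct tmigr_group *group, unsigned int cpu,
				  u64 now, unsigned long jif, bool walkup)
{
	struct timerqueue_node *tmr;
	struct tmigr_group *parent;
	struct tmigr_event *evt;

	for (;;) {
		raw_spin_lock_irq(&group->lock);

		if (group->migrator != cpu && group->migrator != TMIGR_NONE) {
			raw_spin_unlock_irq(&group->lock);
			return;
		}

		tmr = timerqueue_getnext(&group->events);
		if (tmr && now >= tmr->expires) {
			evt = container_of(tmr, struct tmigr_event, nextevt);
			tmigr_remove_evt(group, evt);

			raw_spin_unlock_irq(&group->lock);

			if (evt->group) {
				/*
				 * Walk down; still recursive, but bounded
				 * by the hierarchy depth.
				 */
				__tmigr_handle_remote(evt->group, cpu, now,
						      jif, false);
			} else {
				timer_expire_remote(evt->cpu);
				tmigr_update_remote(evt->cpu, now, jif);
			}
			/* Reevaluate @group; replaces the 'goto again' */
			continue;
		}

		parent = group->parent;
		if (parent && !group->active) {
			raw_spin_lock_nested(&parent->lock, parent->level);
			tmigr_add_evt(parent, &group->groupevt);
			raw_spin_unlock(&parent->lock);
		}
		raw_spin_unlock_irq(&group->lock);

		/* Walk the hierarchy up? */
		if (!walkup || !parent)
			return;

		/* Racy lockless check: See comment in tmigr_handle_remote() */
		if (parent->migrator != cpu)
			return;

		/* Iterate into the parent instead of tail recursing */
		group = parent;
	}
}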

> +static u64 tmigr_set_cpu_inactive(struct tmigr_group *group,
> +				   struct tmigr_group *child,
> +				   struct tmigr_event *evt,
> +				   unsigned int cpu)
> +{
> +	struct tmigr_group *parent;
> +	u64 nextevt = KTIME_MAX;
> +
> +	raw_spin_lock_nested(&group->lock, group->level);
> +
> +	DBG_BUG_ON(!group->active);
> +
> +	cpumask_clear_cpu(cpu, group->cpus);
> +	group->active--;
> +
> +	/*
> +	 * If @child is not NULL, then this is a recursive invocation to
> +	 * propagate the deactivation of @cpu. If @child has a new
> +	 * migrator, set it active in @group.
> +	 */
> +	if (child && child->migrator != TMIGR_NONE) {
> +		cpumask_set_cpu(child->migrator, group->cpus);
> +		group->active++;
> +	}
> +
> +	/* Add @evt to @group */
> +	tmigr_add_evt(group, evt);
> +
> +	/* If @cpu is not the active migrator, everything is up to date */
> +	if (group->migrator != cpu)
> +		goto done;
> +
> +	/* Update the migrator. */
> +	if (!group->active)
> +		group->migrator = TMIGR_NONE;
> +	else
> +		group->migrator = cpumask_first(group->cpus);
> +
> +	parent = group->parent;
> +	if (parent) {
> +		/*
> +		 * @cpu was the migrator in @group, so it is marked as
> +		 * active in its parent group(s) as well. Propagate the
> +		 * migrator change.
> +		 */
> +		evt = group->active ? NULL : &group->groupevt;
> +		nextevt = tmigr_set_cpu_inactive(parent, group, evt, cpu);

Recursion..

> +	} else {
> +		/*
> +		 * This is the top level of the hierarchy. If @cpu is about
> +		 * to go offline, wake up some random other cpu so it will
> +		 * take over the migrator duty and program its timer
> +		 * properly. Ideally wake the cpu with the closest expiry
> +		 * time, but that's overkill to figure out.
> +		 */
> +		if (!per_cpu(tmigr_cpu, cpu).online) {
> +			cpu = cpumask_any_but(cpu_online_mask, cpu);
> +			smp_send_reschedule(cpu);
> +		}
> +		/*
> +		 * Return the earliest event of the top level group to make
> +		 * sure that it's handled.
> +		 *
> +		 * This could be optimized by keeping track of the last
> +		 * global scheduled event and only arming it on @cpu if the
> +		 * new event is earlier. Not sure if it's worth the
> +		 * complexity.
> +		 */
> +		nextevt = group->groupevt.nextevt.expires;
> +	}
> +done:
> +	raw_spin_unlock(&group->lock);
> +	return nextevt;
> +}

Would it be very onerous to rewrite these into regular loops? That would
save us from having to think (and worry) about blowing our stack.
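
For the deactivate path, something like the below might work (equally
untested). The wrinkle is that each level's lock is held while the parent
is updated, so an iterative version has to remember the lock chain and
release it at the end; spinlocks don't care about release order, and the
depth is bounded by the number of hierarchy levels anyway (the lockdep
subclass argument already caps group->level). TMIGR_MAX_LEVELS is made up
here, use whatever bound the hierarchy setup already has:

static u64 tmigr_set_cpu_inactive(struct tmigr_group *group,
				  struct tmigr_group *child,
				  struct tmigr_event *evt,
				  unsigned int cpu)
{
	/* TMIGR_MAX_LEVELS is hypothetical; depth == number of levels */
	struct tmigr_group *chain[TMIGR_MAX_LEVELS];
	u64 nextevt = KTIME_MAX;
	int i, depth = 0;

	for (;;) {
		raw_spin_lock_nested(&group->lock, group->level);
		chain[depth++] = group;

		DBG_BUG_ON(!group->active);

		cpumask_clear_cpu(cpu, group->cpus);
		group->active--;

		/* A deactivated child may hand its migrator duty up */
		if (child && child->migrator != TMIGR_NONE) {
			cpumask_set_cpu(child->migrator, group->cpus);
			group->active++;
		}

		tmigr_add_evt(group, evt);

		/* If @cpu is not the active migrator, we're done */
		if (group->migrator != cpu)
			break;

		/* Update the migrator */
		if (!group->active)
			group->migrator = TMIGR_NONE;
		else
			group->migrator = cpumask_first(group->cpus);

		if (!group->parent) {
			/* Top level: offline wakeup as in the original */
			if (!per_cpu(tmigr_cpu, cpu).online) {
				cpu = cpumask_any_but(cpu_online_mask, cpu);
				smp_send_reschedule(cpu);
			}
			nextevt = group->groupevt.nextevt.expires;
			break;
		}

		/* Propagate the migrator change to the parent */
		evt = group->active ? NULL : &group->groupevt;
		child = group;
		group = group->parent;
	}

	/* Release the lock chain; order doesn't matter for spinlocks */
	for (i = 0; i < depth; i++)
		raw_spin_unlock(&chain[i]->lock);

	return nextevt;
}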