Re: [Bug report] A variant deadlock issue of CPU hot-unplug operation vs. the CFS bandwidth timer

From: Thomas Gleixner
Date: Tue Nov 07 2023 - 09:57:21 EST


On Tue, Nov 07 2023 at 21:26, Yu Liao wrote:
> The patch [1] from tglx solved the deadlock issue reported by Xiongfeng [2],
> but we recently discovered a variant of that deadlock.
>
> [1] https://lore.kernel.org/all/87h6oqdq0i.ffs@tglx/
> [2] https://lore.kernel.org/all/8e785777-03aa-99e1-d20e-e956f5685be6@xxxxxxxxxx/
>
> (The following description is partially borrowed from tglx's commit log.)
> CPU1                        CPU2                        CPU3
>
> T1 sets cfs_quota
>   starts hrtimer cfs_bandwidth
>   'period_timer'
> T1 is migrated to CPU3
>                             T2 (worker thread) initiates
>                             offlining of CPU1
> Hotplug operation starts
>   ...
> 'period_timer' expires and is
> re-enqueued on CPU1
>   ...
> take_cpu_down()
> CPU1 shuts down and does not
> handle timers anymore. They
> have to be migrated in the
> post dead hotplug steps by
> the control task.
>                             T2 (worker thread) runs the
>                             post dead offline operation
>                                                         T1 holds lockA
>                                                         T1 is scheduled out
>                                                         // throttled by CFS
>                                                         // bandwidth control
>                                                         T1 waits for 'period_timer'
>                                                         to expire
>                             T2 (worker thread) waits for lockA
>
> T1 waits for 'period_timer' forever: the timer is stranded on the dead
> CPU1 and can only be migrated once T2 executes the hrtimer offline
> callback hrtimers_dead_cpu(), but T2 never gets there because it first
> blocks on lockA, which T1 holds. Thus T2 also waits for lockA forever.
>
> We have discovered several different locks playing the role of 'lockA'.
> Here is a real example that occurs on kernel 5.10, where 'lockA' is
> 'kernfs_mutex'.
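
To make the reported cycle concrete, here is a minimal userspace
analogue in C. It is only a sketch of the wait dependency, not of the
kernel code paths: the timer migration that would wake T1 is modeled as
a condition variable which T2 can only signal after taking lockA, and a
short sleep artificially orders the threads so that T1 takes lockA
first.

/*
 * Userspace analogue of the reported cycle (illustrative only).
 * T1 = throttled task holding lockA. T2 = hotplug control task that
 * needs lockA before it reaches the step that would "migrate the
 * timer", modeled here as signaling a condition variable.
 *
 * Build with: cc deadlock_demo.c -o deadlock_demo -lpthread
 * The program hangs forever, which is the point.
 */
#include <pthread.h>
#include <unistd.h>

static pthread_mutex_t lockA = PTHREAD_MUTEX_INITIALIZER;
static pthread_mutex_t wait_lock = PTHREAD_MUTEX_INITIALIZER;
static pthread_cond_t timer_migrated = PTHREAD_COND_INITIALIZER;
static int migrated;	/* set only by T2, after it has taken lockA */

static void *t1(void *unused)
{
	pthread_mutex_lock(&lockA);		/* T1 holds lockA */

	/* T1 is "throttled": it can only continue once T2 signals */
	pthread_mutex_lock(&wait_lock);
	while (!migrated)
		pthread_cond_wait(&timer_migrated, &wait_lock);
	pthread_mutex_unlock(&wait_lock);

	pthread_mutex_unlock(&lockA);
	return NULL;
}

static void *t2(void *unused)
{
	sleep(1);			/* let T1 take lockA first */

	pthread_mutex_lock(&lockA);	/* earlier post dead step: blocks forever */

	/* Never reached: the step that would wake T1 */
	pthread_mutex_lock(&wait_lock);
	migrated = 1;
	pthread_cond_signal(&timer_migrated);
	pthread_mutex_unlock(&wait_lock);

	pthread_mutex_unlock(&lockA);
	return NULL;
}

int main(void)
{
	pthread_t a, b;

	pthread_create(&a, NULL, t1, NULL);
	pthread_create(&b, NULL, t2, NULL);
	pthread_join(a, NULL);	/* never returns: T1 and T2 wait on each other */
	pthread_join(b, NULL);
	return 0;
}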

Bah.

So we can actually migrate the hrtimers away from the outgoing CPU in
the dying callbacks. That's safe because nothing can queue an hrtimer
remotely on the outgoing CPU: all other CPUs are spin-waiting with
interrupts disabled in stop_machine() until the outgoing CPU has marked
itself offline.
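
Spelled out, the dying-step migration relies on two invariants. A
hypothetical helper which asserts them could look like this (name and
placement are assumptions, not part of the patch below):

/*
 * Hypothetical helper, not part of the patch: the invariants that make
 * migrating hrtimers in the dying step safe.
 */
static inline void assert_hrtimers_dying_context(unsigned int dying_cpu)
{
	/* The dying callback runs on the outgoing CPU itself ... */
	WARN_ON_ONCE(smp_processor_id() != dying_cpu);

	/*
	 * ... in stop_machine() context with interrupts disabled, while
	 * all other CPUs spin with interrupts disabled as well, so
	 * nothing can concurrently enqueue a timer on this CPU.
	 */
	lockdep_assert_irqs_disabled();
}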

Survived a quick test, but needs some scrutiny and probably a sanity
check in the post dead stage.
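
Such a post dead sanity check could be a teardown callback along these
lines (a hypothetical sketch; the name and the exact check are
assumptions): by the time the control task runs the post dead steps,
the dead CPU's hrtimer bases must already be empty.

/*
 * Hypothetical post dead sanity check, not part of the patch below:
 * everything must have been migrated in CPUHP_AP_HRTIMERS_DYING.
 */
static int hrtimers_dead_cpu_check(unsigned int scpu)
{
	struct hrtimer_cpu_base *cpu_base = &per_cpu(hrtimer_bases, scpu);

	WARN_ON_ONCE(cpu_online(scpu));
	WARN_ON_ONCE(cpu_base->active_bases);	/* no timers left behind */
	return 0;
}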

Thanks,

tglx
---
--- a/include/linux/cpuhotplug.h
+++ b/include/linux/cpuhotplug.h
@@ -195,6 +195,7 @@ enum cpuhp_state {
 	CPUHP_AP_ARM_CORESIGHT_CTI_STARTING,
 	CPUHP_AP_ARM64_ISNDEP_STARTING,
 	CPUHP_AP_SMPCFD_DYING,
+	CPUHP_AP_HRTIMERS_DYING,
 	CPUHP_AP_X86_TBOOT_DYING,
 	CPUHP_AP_ARM_CACHE_B15_RAC_DYING,
 	CPUHP_AP_ONLINE,
--- a/include/linux/hrtimer.h
+++ b/include/linux/hrtimer.h
@@ -531,9 +531,9 @@ extern void sysrq_timer_list_show(void);
 
 int hrtimers_prepare_cpu(unsigned int cpu);
 #ifdef CONFIG_HOTPLUG_CPU
-int hrtimers_dead_cpu(unsigned int cpu);
+int hrtimers_cpu_dying(unsigned int cpu);
 #else
-#define hrtimers_dead_cpu	NULL
+#define hrtimers_cpu_dying	NULL
 #endif
 
 #endif
--- a/kernel/cpu.c
+++ b/kernel/cpu.c
@@ -2116,7 +2116,7 @@ static struct cpuhp_step cpuhp_hp_states
 	[CPUHP_HRTIMERS_PREPARE] = {
 		.name			= "hrtimers:prepare",
 		.startup.single		= hrtimers_prepare_cpu,
-		.teardown.single	= hrtimers_dead_cpu,
+		.teardown.single	= NULL,
 	},
 	[CPUHP_SMPCFD_PREPARE] = {
 		.name			= "smpcfd:prepare",
@@ -2208,6 +2208,12 @@ static struct cpuhp_step cpuhp_hp_states
 		.startup.single		= NULL,
 		.teardown.single	= smpcfd_dying_cpu,
 	},
+	[CPUHP_AP_HRTIMERS_DYING] = {
+		.name			= "hrtimers:dying",
+		.startup.single		= NULL,
+		.teardown.single	= hrtimers_cpu_dying,
+	},
+
 	/* Entry state on starting. Interrupts enabled from here on. Transient
 	 * state for synchronsization */
 	[CPUHP_AP_ONLINE] = {
--- a/kernel/time/hrtimer.c
+++ b/kernel/time/hrtimer.c
@@ -2219,29 +2219,22 @@ static void migrate_hrtimer_list(struct
 	}
 }
 
-int hrtimers_dead_cpu(unsigned int scpu)
+int hrtimers_cpu_dying(unsigned int dying_cpu)
 {
 	struct hrtimer_cpu_base *old_base, *new_base;
-	int i;
+	int i, ncpu = cpumask_first(cpu_active_mask);
 
-	BUG_ON(cpu_online(scpu));
-	tick_cancel_sched_timer(scpu);
+	tick_cancel_sched_timer(dying_cpu);
+
+	old_base = this_cpu_ptr(&hrtimer_bases);
+	new_base = &per_cpu(hrtimer_bases, ncpu);
 
-	/*
-	 * this BH disable ensures that raise_softirq_irqoff() does
-	 * not wakeup ksoftirqd (and acquire the pi-lock) while
-	 * holding the cpu_base lock
-	 */
-	local_bh_disable();
-	local_irq_disable();
-	old_base = &per_cpu(hrtimer_bases, scpu);
-	new_base = this_cpu_ptr(&hrtimer_bases);
 	/*
 	 * The caller is globally serialized and nobody else
 	 * takes two locks at once, deadlock is not possible.
 	 */
-	raw_spin_lock(&new_base->lock);
-	raw_spin_lock_nested(&old_base->lock, SINGLE_DEPTH_NESTING);
+	raw_spin_lock(&old_base->lock);
+	raw_spin_lock_nested(&new_base->lock, SINGLE_DEPTH_NESTING);
 
 	for (i = 0; i < HRTIMER_MAX_CLOCK_BASES; i++) {
 		migrate_hrtimer_list(&old_base->clock_base[i],
@@ -2252,15 +2245,13 @@ int hrtimers_dead_cpu(unsigned int scpu)
 	 * The migration might have changed the first expiring softirq
 	 * timer on this CPU. Update it.
 	 */
-	hrtimer_update_softirq_timer(new_base, false);
+	__hrtimer_get_next_event(new_base, HRTIMER_ACTIVE_SOFT);
+	/* Tell the other CPU to retrigger the next event */
+	smp_call_function_single(ncpu, retrigger_next_event, NULL, 0);
 
-	raw_spin_unlock(&old_base->lock);
 	raw_spin_unlock(&new_base->lock);
+	raw_spin_unlock(&old_base->lock);
 
-	/* Check, if we got expired work to do */
-	__hrtimer_peek_ahead_timers();
-	local_irq_enable();
-	local_bh_enable();
 	return 0;
 }