Re: timer list corruption in devfreq

From: Mukesh Ojha
Date: Wed Nov 08 2023 - 11:41:39 EST




On 11/8/2023 9:39 PM, Mukesh Ojha wrote:
Hi,

We are facing an issue on 6.1 kernel while using devfreq framework
and looks like the devfreq_monitor_stop()/devfreq_monitor_start is
vulnerable if frequent governor change is being done from user space
in a loop.

echo simple_ondemand > /sys/class/devfreq/1d84000.ufshc/governor
echo performance > /sys/class/devfreq/1d84000.ufshc/governor

Here, we are using ufs device, but could be any device.

Issue is because same instance of timer is being queued from two
places one from devfreq_monitor() and one from devfreq_monitor_start() as
cancel_delayed_work_sync() from devfreq_monitor_stop() was not
able to delete the delayed work time completely due to which
devfreq_monitor() work rearmed the same timer.

But there looks to be issue in the timer framework where
it was initially discussed in [1] and later fixed in [2]
but not sure being whether is it issue in cancel_delayed_work_sync()
where del_timer() inside try_to_grab_pending() need to be replaced
with timer_delete[_sync]() or devfreq_monitor_stop() need to use
this api's and then delete the work.

I meant, timer_shutdown_sync().
Well, i am not sure as these patches[2] are not available on LTS kernel.

-Mukesh

[1]
https://lore.kernel.org/all/20220407161745.7d6754b3@xxxxxxxxxxxxxxxxxx/

[2]
https://lore.kernel.org/all/20221123201306.823305113@xxxxxxxxxxxxx/

In the below ftraces, last two are back to back two timer cancel
operations one from devfreq_monitor_stop() and other from expire_timer().

devfreq_monitor_stop
cancel_delayed_work_sync
__cancel_work_timer
try_to_grab_pending
del_timer()

00|__hlist_del(inline) == list corruption
-000|detach_timer(inline)
-000|expire_timers(inline) ==>
-000|__run_timers(base = 0xFFFFFF8A70F31240)
-001|run_timer_softirq()
-002|__do_softirq()
-003|____do_softirq()
-004|call_on_irq_stack(asm)
-005|do_softirq_own_stack()
-006|invoke_softirq(inline)

kworker/u16:2-5133 [002] 9436.089773: timer_start timer=0xffffff80444f0428 function=__typeid__ZTSFvP10timer_listE_global_addr expires=0x10022da0d now=0x10022d9fe flags=56623106
<idle>-0 [002] 9436.149664: timer_cancel timer=0xffffff80444f0428
<idle>-0 [002] 9436.149666: timer_expire_entry timer=0xffffff80444f0428 now=0x10022da0d function=__typeid__ZTSFvP10timer_listE_global_addr baseclk=0x10022da0d
<idle>-0 [002] 9436.149705: timer_expire_exit timer=0xffffff80444f0428
kworker/u16:6-14217 [003] 9436.149776: timer_start timer=0xffffff80444f0428 function=__typeid__ZTSFvP10timer_listE_global_addr expires=0x10022da1c now=0x10022da0d flags=119537667
vendor.xxxyyy.ha-1593 [006] 9436.182730: timer_init timer=0xffffff80444f0428
vendor.xxxyyy.ha-1593 [006] 9436.182731: timer_start timer=0xffffff80444f0428 function=__typeid__ZTSFvP10timer_listE_global_addr expires=0x10022da24 now=0x10022da15 flags=153092102
vendor.xxxyyy.ha-1593 [004] 9436.188128: timer_cancel timer=0xffffff80444f0428
vendor.xxxyyy.ha-1593 [004] 9436.193456: timer_init timer=0xffffff80444f0428
vendor.xxxyyy.ha-1593 [004] 9436.193458: timer_start timer=0xffffff80444f0428 function=__typeid__ZTSFvP10timer_listE_global_addr expires=0x10022da26 now=0x10022da17 flags=161480708
vendor.xxxyyy.ha-1593 [005] 9436.199602: timer_cancel timer=0xffffff80444f0428
vendor.xxxyyy.ha-1593 [004] 9436.205410: timer_init timer=0xffffff80444f0428
vendor.xxxyyy.ha-1593 [004] 9436.205412: timer_start timer=0xffffff80444f0428 function=__typeid__ZTSFvP10timer_listE_global_addr expires=0x10022da29 now=0x10022da1a flags=174063620
<idle>-0 [003] 9436.209662: timer_cancel timer=0xffffff80444f0428
<idle>-0 [003] 9436.209664: timer_expire_entry timer=0xffffff80444f0428 now=0x10022da1c function=__typeid__ZTSFvP10timer_listE_global_addr baseclk=0x10022da1c
<idle>-0 [003] 9436.209718: timer_expire_exit timer=0xffffff80444f0428
kworker/u16:6-14217 [003] 9436.209863: timer_start timer=0xffffff80444f0428 function=__typeid__ZTSFvP10timer_listE_global_addr expires=0x10022da2b now=0x10022da1c flags=182452227
vendor.xxxyyy.ha-1593 [004] 9436.209888: timer_cancel timer=0xffffff80444f0428
vendor.xxxyyy.ha-1593 [004] 9436.216390: timer_init timer=0xffffff80444f0428
vendor.xxxyyy.ha-1593 [004] 9436.216392: timer_start timer=0xffffff80444f0428 function=__typeid__ZTSFvP10timer_listE_global_addr expires=0x10022da2c now=0x10022da1d flags=186646532
vendor.xxxyyy.ha-1593 [005] 9436.220992: timer_cancel timer=0xffffff80444f0428
xxxyyyTraceManag-7795 [004] 9436.261641: timer_cancel timer=0xffffff80444f0428

<1>[10278.515522] [cpu: 0] Unable to handle kernel paging request at virtual address dead00000000012a

-000|__hlist_del(inline)
-000|detach_timer(inline)
-000|expire_timers(inline)
-000|__run_timers(base = 0xFFFFFF8A70F31240)
-001|run_timer_softirq()
-002|__do_softirq()
-003|____do_softirq()
-004|call_on_irq_stack(asm)
-005|do_softirq_own_stack()
-006|invoke_softirq(inline)
-006|__irq_exit_rcu()
-007|irq_exit_rcu()
-008|__el1_irq(inline)
-008|el1_interrupt(regs = 0xFFFFFFC0410FB680, handler = 0xFFFFFFE5222100F8)
-009|el1h_64_irq_handler(regs = ?)
-010|el1h_64_irq(asm)
-->|exception
-011|arch_local_irq_enable(inline)
-011|raw_spin_rq_unlock_irq(inline)
-011|finish_lock_switch(inline)
-011|finish_task_switch(prev = 0xFFFFFF88D30D8000)
-012|context_switch(inline)
-012|__schedule(:sched_mode = 1091549328)
-013|schedule()
-014|schedule_hrtimeout_range_clock(expires = ?, delta = ?, mode = ?, clock_id = ?)
-015|schedule_hrtimeout_range(expires = 0x0, delta = 0, mode = 1091549520)
-016|poll_schedule_timeout(inline)
-016|do_poll(inline)
-016|do_sys_poll(:ufds = 0x00000072B4397BD8, :nfds = 2, :end_time = 0x0)
-017|__do_sys_ppoll(inline)
-017|__se_sys_ppoll(inline)
-017|__arm64_sys_ppoll(:regs = 0xFFFFFFC0410FBEB0)
-018|__invoke_syscall(inline)
-018|invoke_syscall(:regs = 0xFFFFFFC0410FBEB0, scno = ?, sc_nr = ?, :syscall_table = 0xFFFFFFE5231E0D20)
-019|el0_svc_common(regs = 0xFFFFFFC0410FBEB0, scno = ?, sc_nr = ?, syscall_table = 0xFFFFFFE5231E0D20)
-020|do_el0_svc(regs = 0xFFFFFFC0410FBEB0)
-021|el0_svc(regs = 0xFFFFFFC0410FBEB0)
-022|el0t_64_sync_handler(regs = ?)
-023|el0t_64_sync(asm)
-->|exception
-024|NUX:0x736F8F15FC(asm)
---|end of frame
expire_timers(inline)
expires = 4297461926,
function = 0xFFFFFFE262CD6E18,
base = 0xFFFFFF8975713240
baseclk = 4297461926
fn = 0x0
timer = 0xFFFFFF8044AA5430 ->
( entry = (next = 0xDEAD000000000122, pprev = 0xFFFFFFC008003EA0 -> 0xDEAD000000000122 ->
next = 0x0,pprev = 0x0)),
expires = 4297461926,
function = 0xFFFFFFE262CD6E18, //delayed_work_timer_fn