[PATCH] mm/kmemleak: Add cond_resched() to kmemleak_free_percpu()

From: Waiman Long
Date: Mon Nov 27 2023 - 14:42:13 EST


It was found that on systems with a large number of CPUs, the following
soft lockup splat might sometimes happen:

[ 2656.001617] watchdog: BUG: soft lockup - CPU#364 stuck for 21s! [ksoftirqd/364:2206]
:
[ 2656.141194] RIP: 0010:_raw_spin_unlock_irqrestore+0x3d/0x70
:
[ 2656.241214] Call Trace:
[ 2656.243971] <IRQ>
[ 2656.246237] ? show_trace_log_lvl+0x1c4/0x2df
[ 2656.251152] ? show_trace_log_lvl+0x1c4/0x2df
[ 2656.256066] ? kmemleak_free_percpu+0x11f/0x1f0
[ 2656.261173] ? watchdog_timer_fn+0x379/0x470
[ 2656.265984] ? __pfx_watchdog_timer_fn+0x10/0x10
[ 2656.271179] ? __hrtimer_run_queues+0x5f3/0xd00
[ 2656.276283] ? __pfx___hrtimer_run_queues+0x10/0x10
[ 2656.281783] ? ktime_get_update_offsets_now+0x95/0x2c0
[ 2656.287573] ? ktime_get_update_offsets_now+0xdd/0x2c0
[ 2656.293380] ? hrtimer_interrupt+0x2e9/0x780
[ 2656.298221] ? __sysvec_apic_timer_interrupt+0x184/0x640
[ 2656.304211] ? sysvec_apic_timer_interrupt+0x8e/0xc0
[ 2656.309807] </IRQ>
[ 2656.312169] <TASK>
[ 2656.326110] kmemleak_free_percpu+0x11f/0x1f0
[ 2656.331015] free_percpu.part.0+0x1b/0xe70
[ 2656.335635] free_vfsmnt+0xb9/0x100
[ 2656.339567] rcu_do_batch+0x3c8/0xe30
[ 2656.363693] rcu_core+0x3de/0x5a0
[ 2656.367433] __do_softirq+0x2d0/0x9a8
[ 2656.381119] run_ksoftirqd+0x36/0x60
[ 2656.385145] smpboot_thread_fn+0x556/0x910
[ 2656.394971] kthread+0x2a4/0x350
[ 2656.402826] ret_from_fork+0x29/0x50
[ 2656.406861] </TASK>

Fix this by adding a cond_resched() call in the percpu freeing loop
and by deferring the freeing of percpu kmemleak objects to a workqueue
when kmemleak_free_percpu() is called from a non-task context.

Signed-off-by: Waiman Long <longman@xxxxxxxxxx>
---
mm/kmemleak.c | 59 +++++++++++++++++++++++++++++++++++++++++++++------
1 file changed, 52 insertions(+), 7 deletions(-)

diff --git a/mm/kmemleak.c b/mm/kmemleak.c
index 1eacca03bedd..03385f4a8008 100644
--- a/mm/kmemleak.c
+++ b/mm/kmemleak.c
@@ -168,6 +168,14 @@ struct kmemleak_object {
char comm[TASK_COMM_LEN]; /* executable name */
};

+/*
+ * A percpu pointer whose kmemleak objects are to be freed from a workqueue.
+ */
+struct kmemleak_percpu_addr {
+ struct work_struct work;
+ const void __percpu *ptr;
+};
+
/* flag representing the memory block allocation status */
#define OBJECT_ALLOCATED (1 << 0)
/* flag set after the first reporting of an unreference object */
@@ -1120,23 +1128,60 @@ void __ref kmemleak_free_part(const void *ptr, size_t size)
}
EXPORT_SYMBOL_GPL(kmemleak_free_part);

+static void __kmemleak_free_percpu(const void __percpu *ptr)
+{
+ unsigned int cpu;
+
+ for_each_possible_cpu(cpu) {
+ delete_object_full((unsigned long)per_cpu_ptr(ptr, cpu));
+ if (in_task())
+ cond_resched();
+ }
+}
+
+/*
+ * Work function for deferred freeing of kmemleak objects associated with
+ * a freed percpu memory block.
+ */
+static void kmemleak_free_percpu_workfn(struct work_struct *work)
+{
+ struct kmemleak_percpu_addr *addr;
+
+ addr = container_of(work, struct kmemleak_percpu_addr, work);
+ __kmemleak_free_percpu(addr->ptr);
+ kfree(addr);
+}
+
/**
* kmemleak_free_percpu - unregister a previously registered __percpu object
* @ptr: __percpu pointer to beginning of the object
*
* This function is called from the kernel percpu allocator when an object
- * (memory block) is freed (free_percpu).
+ * (memory block) is freed (free_percpu). Since this function is inherently
+ * slow, especially on systems with a large number of CPUs, defer the actual
+ * removal of kmemleak objects associated with the percpu pointer to a
+ * workqueue when this function is not called from a task context.
*/
void __ref kmemleak_free_percpu(const void __percpu *ptr)
{
- unsigned int cpu;
-
pr_debug("%s(0x%px)\n", __func__, ptr);

- if (kmemleak_free_enabled && ptr && !IS_ERR(ptr))
- for_each_possible_cpu(cpu)
- delete_object_full((unsigned long)per_cpu_ptr(ptr,
- cpu));
+ if (!kmemleak_free_enabled || !ptr || IS_ERR(ptr))
+ return;
+
+ if (!in_task()) {
+ struct kmemleak_percpu_addr *addr;
+
+ addr = kzalloc(sizeof(*addr), GFP_ATOMIC);
+ if (addr) {
+ INIT_WORK(&addr->work, kmemleak_free_percpu_workfn);
+ addr->ptr = ptr;
+ queue_work(system_long_wq, &addr->work);
+ return;
+ }
+ /* Fallback to do direct deletion */
+ }
+ __kmemleak_free_percpu(ptr);
}
EXPORT_SYMBOL_GPL(kmemleak_free_percpu);

--
2.39.3