[PATCH] workqueue: don't skip lockdep wq dependency in cancel_work_sync()

From: Tetsuo Handa
Date: Thu Jul 28 2022 - 08:23:55 EST


As Hillf Danton mentioned in [1],

syzbot should have been able to catch cancel_work_sync() in work context
by checking lockdep_map in __flush_work() for both flush and cancel.

and being unable to report an obvious deadlock scenario like the one shown
below is a regression. From the locking dependency perspective, the sync
version of a cancel request should behave like a flush request, because it
waits for the work to complete if that work has already started executing.
In the module below, test_init() calls cancel_work_sync() while holding a
mutex that work_fn() also takes, so the two can deadlock once work_fn() has
started running.

----------
#include <linux/module.h>
#include <linux/sched.h>
#include <linux/mutex.h>
#include <linux/workqueue.h>

static DEFINE_MUTEX(mutex);

static void work_fn(struct work_struct *work)
{
	/* Give test_init() time to grab "mutex" first. */
	schedule_timeout_uninterruptible(HZ / 5);
	mutex_lock(&mutex);
	mutex_unlock(&mutex);
}
static DECLARE_WORK(work, work_fn);

static int __init test_init(void)
{
	schedule_work(&work);
	/* Let work_fn() start executing before canceling. */
	schedule_timeout_uninterruptible(HZ / 10);
	mutex_lock(&mutex);
	/* Waits for work_fn(), which waits for "mutex" held here: deadlock. */
	cancel_work_sync(&work);
	mutex_unlock(&mutex);
	return -EINVAL;
}
module_init(test_init);
MODULE_LICENSE("GPL");
----------
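
To spell out the dependency cycle lockdep can record once the cancel path
takes the same annotation as the flush path: process_one_work() acquires the
work's lockdep_map around the callback, so running work_fn() records
"work.lockdep_map, then mutex"; with this patch, cancel_work_sync() reaching
flush_work() records "mutex, then work.lockdep_map" in test_init(). Below is
a minimal sketch of the two orderings, reusing the identifiers from the
reproducer above; it is not part of the patch and assumes CONFIG_LOCKDEP so
that work.lockdep_map exists.

----------
/* Worker side: roughly what process_one_work() does around the callback. */
static void worker_side_sketch(void)
{
	lock_map_acquire(&work.lockdep_map);	/* dependency: work.lockdep_map ... */
	work_fn(&work);				/* ... then mutex, taken inside work_fn() */
	lock_map_release(&work.lockdep_map);
}

/* Canceling side: what test_init() does once this patch is applied. */
static void cancel_side_sketch(void)
{
	mutex_lock(&mutex);		/* dependency: mutex ... */
	cancel_work_sync(&work);	/* ... then work.lockdep_map, acquired in
					 * flush_work() */
	mutex_unlock(&mutex);
}

/*
 * The two dependencies form a cycle, so lockdep reports the inversion even
 * when the timing happens not to deadlock.
 */
----------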

Link: https://lkml.kernel.org/r/20220504044800.4966-1-hdanton@xxxxxxxx [1]
Reported-by: Hillf Danton <hdanton@xxxxxxxx>
Fixes: d6e89786bed977f3 ("workqueue: skip lockdep wq dependency in cancel_work_sync()")
Cc: Johannes Berg <johannes.berg@xxxxxxxxx>
Signed-off-by: Tetsuo Handa <penguin-kernel@xxxxxxxxxxxxxxxxxxx>
---
kernel/workqueue.c | 45 ++++++++++++++++++---------------------------
1 file changed, 18 insertions(+), 27 deletions(-)

diff --git a/kernel/workqueue.c b/kernel/workqueue.c
index 1ea50f6be843..e6df688f84db 100644
--- a/kernel/workqueue.c
+++ b/kernel/workqueue.c
@@ -3000,8 +3000,7 @@ void drain_workqueue(struct workqueue_struct *wq)
}
EXPORT_SYMBOL_GPL(drain_workqueue);

-static bool start_flush_work(struct work_struct *work, struct wq_barrier *barr,
- bool from_cancel)
+static bool start_flush_work(struct work_struct *work, struct wq_barrier *barr)
{
struct worker *worker = NULL;
struct worker_pool *pool;
@@ -3043,8 +3042,7 @@ static bool start_flush_work(struct work_struct *work, struct wq_barrier *barr,
* workqueues the deadlock happens when the rescuer stalls, blocking
* forward progress.
*/
- if (!from_cancel &&
- (pwq->wq->saved_max_active == 1 || pwq->wq->rescuer)) {
+ if (pwq->wq->saved_max_active == 1 || pwq->wq->rescuer) {
lock_map_acquire(&pwq->wq->lockdep_map);
lock_map_release(&pwq->wq->lockdep_map);
}
@@ -3056,7 +3054,18 @@ static bool start_flush_work(struct work_struct *work, struct wq_barrier *barr,
return false;
}

-static bool __flush_work(struct work_struct *work, bool from_cancel)
+/**
+ * flush_work - wait for a work to finish executing the last queueing instance
+ * @work: the work to flush
+ *
+ * Wait until @work has finished execution. @work is guaranteed to be idle
+ * on return if it hasn't been requeued since flush started.
+ *
+ * Return:
+ * %true if flush_work() waited for the work to finish execution,
+ * %false if it was already idle.
+ */
+bool flush_work(struct work_struct *work)
{
struct wq_barrier barr;

@@ -3066,12 +3075,10 @@ static bool __flush_work(struct work_struct *work, bool from_cancel)
if (WARN_ON(!work->func))
return false;

- if (!from_cancel) {
- lock_map_acquire(&work->lockdep_map);
- lock_map_release(&work->lockdep_map);
- }
+ lock_map_acquire(&work->lockdep_map);
+ lock_map_release(&work->lockdep_map);

- if (start_flush_work(work, &barr, from_cancel)) {
+ if (start_flush_work(work, &barr)) {
wait_for_completion(&barr.done);
destroy_work_on_stack(&barr.work);
return true;
@@ -3079,22 +3086,6 @@ static bool __flush_work(struct work_struct *work, bool from_cancel)
return false;
}
}
-
-/**
- * flush_work - wait for a work to finish executing the last queueing instance
- * @work: the work to flush
- *
- * Wait until @work has finished execution. @work is guaranteed to be idle
- * on return if it hasn't been requeued since flush started.
- *
- * Return:
- * %true if flush_work() waited for the work to finish execution,
- * %false if it was already idle.
- */
-bool flush_work(struct work_struct *work)
-{
- return __flush_work(work, false);
-}
EXPORT_SYMBOL_GPL(flush_work);

struct cwt_wait {
@@ -3159,7 +3150,7 @@ static bool __cancel_work_timer(struct work_struct *work, bool is_dwork)
* isn't executing.
*/
if (wq_online)
- __flush_work(work, true);
+ flush_work(work);

clear_work_data(work);

--
2.18.4