[PATCH v2 2/4] locking/rtmutex: Submit/resume work explicitly before/after blocking

From: Sebastian Andrzej Siewior
Date: Thu Apr 27 2023 - 07:20:04 EST


schedule() invokes sched_submit_work() before scheduling and
sched_resume_work() afterwards to ensure that queued block requests are
flushed and that the (IO)worker machinery can instantiate new workers if
required. This avoids deadlocks and starvation.
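
For reference, a simplified sketch of that flow, assuming the
sched_submit_work()/sched_resume_work() pair as introduced earlier in this
series (a sketch, not a verbatim copy of kernel/sched/core.c):

  asmlinkage __visible void __sched schedule(void)
  {
          sched_submit_work();    /* e.g. flush plugged block requests and
                                   * let the (io)worker code wake a
                                   * replacement worker if required */
          do {
                  preempt_disable();
                  __schedule(SM_NONE);
                  sched_preempt_enable_no_resched();
          } while (need_resched());
          sched_resume_work();    /* undo the worker bookkeeping on wakeup */
  }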

With rt_mutexes this can lead to a subtle problem:

When a task blocks on an rtmutex, current::pi_blocked_on points to the
rtmutex it blocks on. If one of the functions invoked from
sched_submit/resume_work() then contends on an rtmutex-based lock, that
would corrupt current::pi_blocked_on.
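
For illustration, one affected call chain looks roughly like this
(blk_flush_plug() is just one example of work submitted from
sched_submit_work(); waiter_A/waiter_B are placeholder names):

  rt_mutex_lock(A)                     // slowpath
    task_blocks_on_rt_mutex()
      current->pi_blocked_on = &waiter_A
    schedule()
      sched_submit_work()
        blk_flush_plug()
          spin_lock(B)                 // a sleeping lock on PREEMPT_RT
            task_blocks_on_rt_mutex()
              current->pi_blocked_on = &waiter_B   // clobbers the state of A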

Let rtmutex and the RT lock variants that are based on it invoke
sched_submit/resume_work() explicitly before and after the slowpath, so it
is guaranteed that current::pi_blocked_on cannot be corrupted by blocking
on two locks at once.

This does not apply to the PREEMPT_RT variants of spinlock_t and rwlock_t,
as their scheduling slowpath is separate and cannot invoke the work-related
functions anyway due to potential deadlocks.
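
For comparison, that separate slowpath already schedules without the work
hooks; roughly (simplified from rtlock_slowlock_locked(), elisions marked
with ...):

  static void __sched rtlock_slowlock_locked(struct rt_mutex_base *lock)
  {
          ...
          for (;;) {
                  if (try_to_take_rt_mutex(lock, current, &waiter))
                          break;
                  raw_spin_unlock_irq(&lock->wait_lock);
                  schedule_rtlock();      /* __schedule(SM_RTLOCK_WAIT) loop,
                                           * no sched_submit_work() */
                  raw_spin_lock_irq(&lock->wait_lock);
                  set_current_state(TASK_RTLOCK_WAIT);
          }
          ...
  }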

[ tglx: Make it explicit and symmetric. Massage changelog ]

Fixes: e17ba59b7e8e1 ("locking/rtmutex: Guard regular sleeping locks specific functions")
Reported-by: Crystal Wood <swood@xxxxxxxxxx>
Signed-off-by: Sebastian Andrzej Siewior <bigeasy@xxxxxxxxxxxxx>
Signed-off-by: Thomas Gleixner <tglx@xxxxxxxxxxxxx>
Link: https://lore.kernel.org/4b4ab374d3e24e6ea8df5cadc4297619a6d945af.camel@xxxxxxxxxx
Signed-off-by: Sebastian Andrzej Siewior <bigeasy@xxxxxxxxxxxxx>
---
 kernel/locking/rtmutex.c     | 11 +++++++++--
 kernel/locking/rwbase_rt.c   | 18 ++++++++++++++++--
 kernel/locking/rwsem.c       |  6 ++++++
 kernel/locking/spinlock_rt.c |  3 +++
4 files changed, 34 insertions(+), 4 deletions(-)

diff --git a/kernel/locking/rtmutex.c b/kernel/locking/rtmutex.c
index 728f434de2bbf..aa66a3c5950a7 100644
--- a/kernel/locking/rtmutex.c
+++ b/kernel/locking/rtmutex.c
@@ -1555,7 +1555,7 @@ static int __sched rt_mutex_slowlock_block(struct rt_mutex_base *lock,
                 raw_spin_unlock_irq(&lock->wait_lock);

                 if (!owner || !rtmutex_spin_on_owner(lock, waiter, owner))
-                        schedule();
+                        schedule_rtmutex();

                 raw_spin_lock_irq(&lock->wait_lock);
                 set_current_state(state);
@@ -1584,7 +1584,7 @@ static void __sched rt_mutex_handle_deadlock(int res, int detect_deadlock,
         WARN(1, "rtmutex deadlock detected\n");
         while (1) {
                 set_current_state(TASK_INTERRUPTIBLE);
-                schedule();
+                schedule_rtmutex();
         }
 }

@@ -1679,6 +1679,12 @@ static int __sched rt_mutex_slowlock(struct rt_mutex_base *lock,
         unsigned long flags;
         int ret;

+        /*
+         * The task is about to sleep. Invoke sched_submit_work() before
+         * blocking as that might take locks and corrupt tsk::pi_blocked_on.
+         */
+        sched_submit_work();
+
         /*
          * Technically we could use raw_spin_[un]lock_irq() here, but this can
          * be called in early boot if the cmpxchg() fast path is disabled
@@ -1691,6 +1697,7 @@ static int __sched rt_mutex_slowlock(struct rt_mutex_base *lock,
         ret = __rt_mutex_slowlock_locked(lock, ww_ctx, state);
         raw_spin_unlock_irqrestore(&lock->wait_lock, flags);

+        sched_resume_work();
         return ret;
 }

diff --git a/kernel/locking/rwbase_rt.c b/kernel/locking/rwbase_rt.c
index 25ec0239477c2..945d474f5d27f 100644
--- a/kernel/locking/rwbase_rt.c
+++ b/kernel/locking/rwbase_rt.c
@@ -131,10 +131,21 @@ static int __sched __rwbase_read_lock(struct rwbase_rt *rwb,
 static __always_inline int rwbase_read_lock(struct rwbase_rt *rwb,
                                             unsigned int state)
 {
+        int ret;
+
         if (rwbase_read_trylock(rwb))
                 return 0;

-        return __rwbase_read_lock(rwb, state);
+        /*
+         * The task is about to sleep. For rwsems this submits work as that
+         * might take locks and corrupt tsk::pi_blocked_on. Must be
+         * explicit here because __rwbase_read_lock() cannot invoke
+         * rt_mutex_slowlock(). NOP for rwlocks.
+         */
+        rwbase_sched_submit_work();
+        ret = __rwbase_read_lock(rwb, state);
+        rwbase_sched_resume_work();
+        return ret;
 }

 static void __sched __rwbase_read_unlock(struct rwbase_rt *rwb,
@@ -230,7 +241,10 @@ static int __sched rwbase_write_lock(struct rwbase_rt *rwb,
         struct rt_mutex_base *rtm = &rwb->rtmutex;
         unsigned long flags;

-        /* Take the rtmutex as a first step */
+        /*
+         * Take the rtmutex as a first step. For rwsem this will also
+         * invoke sched_submit_work() to flush IO and workers.
+         */
         if (rwbase_rtmutex_lock_state(rtm, state))
                 return -EINTR;

diff --git a/kernel/locking/rwsem.c b/kernel/locking/rwsem.c
index acb5a50309a18..aca266006ad47 100644
--- a/kernel/locking/rwsem.c
+++ b/kernel/locking/rwsem.c
@@ -1415,6 +1415,12 @@ static inline void __downgrade_write(struct rw_semaphore *sem)
 #define rwbase_rtmutex_lock_state(rtm, state)          \
         __rt_mutex_lock(rtm, state)

+#define rwbase_sched_submit_work()                     \
+        sched_submit_work()
+
+#define rwbase_sched_resume_work()                     \
+        sched_resume_work()
+
 #define rwbase_rtmutex_slowlock_locked(rtm, state)     \
         __rt_mutex_slowlock_locked(rtm, NULL, state)

diff --git a/kernel/locking/spinlock_rt.c b/kernel/locking/spinlock_rt.c
index 48a19ed8486d8..62c4a6866087a 100644
--- a/kernel/locking/spinlock_rt.c
+++ b/kernel/locking/spinlock_rt.c
@@ -159,6 +159,9 @@ rwbase_rtmutex_lock_state(struct rt_mutex_base *rtm, unsigned int state)
         return 0;
 }

+static __always_inline void rwbase_sched_submit_work(void) { }
+static __always_inline void rwbase_sched_resume_work(void) { }
+
 static __always_inline int
 rwbase_rtmutex_slowlock_locked(struct rt_mutex_base *rtm, unsigned int state)
 {
--
2.40.1