[PATCH] locking/rwsem: Handle handoff bit inheritance

From: Waiman Long
Date: Sun Nov 07 2021 - 14:46:02 EST


When a queue head writer sets the handoff bit, it will clear the bit
on its way out if it is killed without having acquired the lock. That
is not the case for a queue head reader. The easy fix is to let the
next waiter inherit the handoff bit, especially if the next one is
also a reader.
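
For illustration, the two out_nolock exit paths before this patch look
roughly like this (a simplified sketch of kernel/locking/rwsem.c, with
locking and error handling trimmed):

	/* Writer exit: clears the handoff bit it set itself */
	if (unlikely(wstate == WRITER_HANDOFF))
		atomic_long_add(-RWSEM_FLAG_HANDOFF, &sem->count);

	/*
	 * Reader exit: the bit is cleared only when no waiters remain,
	 * so a handoff bit set while this reader was queue head can be
	 * left dangling for the next waiter.
	 */
	if (list_empty(&sem->wait_list))
		atomic_long_andnot(RWSEM_FLAG_WAITERS|RWSEM_FLAG_HANDOFF,
				   &sem->count);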

In the case of a writer inheriting the handoff bit, we need to make
sure that the writer sets its wait state correctly to avoid leaving
the handoff bit dangling if it is killed or interrupted before getting
the lock. We also need to make sure that the handoff bit is always
cleared when the wait queue becomes empty, to avoid having the handoff
bit set with no waiter present.
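
Conceptually, the writer-side inheritance added below boils down to
the following check (a sketch of the new logic, not the verbatim
code):

	/*
	 * A first-in-queue writer that finds the handoff bit already set
	 * (e.g. left behind by an exiting reader) adopts it, so that its
	 * own exit path will clear the bit if it never gets the lock.
	 */
	if ((count & RWSEM_FLAG_HANDOFF) && (wstate != WRITER_HANDOFF)) {
		wstate = WRITER_HANDOFF;
		lockevent_inc(rwsem_inherit_handoff);
	}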

A new lock event, rwsem_inherit_handoff, is added to keep track of
this writer handoff bit inheritance.
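
With CONFIG_LOCK_EVENT_COUNTS=y, the new count should be readable from
debugfs like the other rwsem lock events, e.g. (assuming debugfs is
mounted at /sys/kernel/debug):

	# cat /sys/kernel/debug/lock_event_counts/rwsem_inherit_handoff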

Fixes: 4f23dbc1e657 ("locking/rwsem: Implement lock handoff to prevent lock starvation")
Signed-off-by: Waiman Long <longman@xxxxxxxxxx>
---
 kernel/locking/lock_events_list.h |  1 +
 kernel/locking/rwsem.c            | 50 +++++++++++++++++++++++--------
 2 files changed, 38 insertions(+), 13 deletions(-)

diff --git a/kernel/locking/lock_events_list.h b/kernel/locking/lock_events_list.h
index 97fb6f3f840a..0eddfb2e6e74 100644
--- a/kernel/locking/lock_events_list.h
+++ b/kernel/locking/lock_events_list.h
@@ -67,3 +67,4 @@ LOCK_EVENT(rwsem_rlock_handoff)	/* # of read lock handoffs */
 LOCK_EVENT(rwsem_wlock)		/* # of write locks acquired */
 LOCK_EVENT(rwsem_wlock_fail)	/* # of failed write lock acquisitions */
 LOCK_EVENT(rwsem_wlock_handoff)	/* # of write lock handoffs */
+LOCK_EVENT(rwsem_inherit_handoff) /* # of handoff inheritances */
diff --git a/kernel/locking/rwsem.c b/kernel/locking/rwsem.c
index c51387a43265..45373b9e09e3 100644
--- a/kernel/locking/rwsem.c
+++ b/kernel/locking/rwsem.c
@@ -536,7 +536,7 @@ static void rwsem_mark_wake(struct rw_semaphore *sem,
  * bit is set or the lock is acquired with handoff bit cleared.
  */
 static inline bool rwsem_try_write_lock(struct rw_semaphore *sem,
-					enum writer_wait_state wstate)
+					enum writer_wait_state *wstate)
 {
 	long count, new;
 
@@ -546,13 +546,21 @@ static inline bool rwsem_try_write_lock(struct rw_semaphore *sem,
 	do {
 		bool has_handoff = !!(count & RWSEM_FLAG_HANDOFF);
 
-		if (has_handoff && wstate == WRITER_NOT_FIRST)
-			return false;
+		if (has_handoff) {
+			if (*wstate == WRITER_NOT_FIRST)
+				return false;
+
+			/* Inherit a previously set handoff bit */
+			if (*wstate != WRITER_HANDOFF) {
+				*wstate = WRITER_HANDOFF;
+				lockevent_inc(rwsem_inherit_handoff);
+			}
+		}
 
 		new = count;
 
 		if (count & RWSEM_LOCK_MASK) {
-			if (has_handoff || (wstate != WRITER_HANDOFF))
+			if (has_handoff || (*wstate != WRITER_HANDOFF))
 				return false;
 
 			new |= RWSEM_FLAG_HANDOFF;
@@ -1007,6 +1015,10 @@ rwsem_down_read_slowpath(struct rw_semaphore *sem, long count, unsigned int stat
 		atomic_long_andnot(RWSEM_FLAG_WAITERS|RWSEM_FLAG_HANDOFF,
 				   &sem->count);
 	}
+	/*
+	 * If this reader was first in the queue with the handoff bit set
+	 * and the queue non-empty, the next waiter will inherit the bit.
+	 */
 	raw_spin_unlock_irq(&sem->wait_lock);
 	__set_current_state(TASK_RUNNING);
 	lockevent_inc(rwsem_rlock_fail);
@@ -1019,7 +1031,7 @@ rwsem_down_read_slowpath(struct rw_semaphore *sem, long count, unsigned int stat
 static struct rw_semaphore *
 rwsem_down_write_slowpath(struct rw_semaphore *sem, int state)
 {
-	long count;
+	long count, bits_clear;
 	enum writer_wait_state wstate;
 	struct rwsem_waiter waiter;
 	struct rw_semaphore *ret = sem;
@@ -1083,7 +1095,7 @@ rwsem_down_write_slowpath(struct rw_semaphore *sem, int state)
 	/* wait until we successfully acquire the lock */
 	set_current_state(state);
 	for (;;) {
-		if (rwsem_try_write_lock(sem, wstate)) {
+		if (rwsem_try_write_lock(sem, &wstate)) {
 			/* rwsem_try_write_lock() implies ACQUIRE on success */
 			break;
 		}
@@ -1132,12 +1144,21 @@ rwsem_down_write_slowpath(struct rw_semaphore *sem, int state)
 			if (!(count & RWSEM_LOCK_MASK))
 				break;
 
+			if (wstate != WRITER_FIRST)
+				continue;
+
+			if (count & RWSEM_FLAG_HANDOFF) {
+				/* Inherit previously set handoff bit */
+				wstate = WRITER_HANDOFF;
+				lockevent_inc(rwsem_inherit_handoff);
+				continue;
+			}
+
 			/*
 			 * The setting of the handoff bit is deferred
 			 * until rwsem_try_write_lock() is called.
 			 */
-			if ((wstate == WRITER_FIRST) && (rt_task(current) ||
-			    time_after(jiffies, waiter.timeout))) {
+			if (rt_task(current) || time_after(jiffies, waiter.timeout)) {
 				wstate = WRITER_HANDOFF;
 				lockevent_inc(rwsem_wlock_handoff);
 				break;
@@ -1157,13 +1178,16 @@ rwsem_down_write_slowpath(struct rw_semaphore *sem, int state)
 	__set_current_state(TASK_RUNNING);
 	raw_spin_lock_irq(&sem->wait_lock);
 	list_del(&waiter.list);
+	bits_clear = 0;
+	if (list_empty(&sem->wait_list))
+		bits_clear = RWSEM_FLAG_HANDOFF | RWSEM_FLAG_WAITERS;
+	else if (wstate == WRITER_HANDOFF)
+		bits_clear = RWSEM_FLAG_HANDOFF;
 
-	if (unlikely(wstate == WRITER_HANDOFF))
-		atomic_long_add(-RWSEM_FLAG_HANDOFF, &sem->count);
+	if (bits_clear)
+		atomic_long_andnot(bits_clear, &sem->count);
 
-	if (list_empty(&sem->wait_list))
-		atomic_long_andnot(RWSEM_FLAG_WAITERS, &sem->count);
-	else
+	if (!list_empty(&sem->wait_list))
 		rwsem_mark_wake(sem, RWSEM_WAKE_ANY, &wake_q);
 	raw_spin_unlock_irq(&sem->wait_lock);
 	wake_up_q(&wake_q);
--
2.27.0