Re: [RFC] Sleep waiting for an rwsem to be unlocked

From: Suren Baghdasaryan
Date: Mon Jan 15 2024 - 13:44:23 EST


On Tue, Jan 9, 2024 at 9:12 AM Matthew Wilcox <willy@xxxxxxxxxxxxx> wrote:
>
> The problem we're trying to solve is a lock-free walk of
> /proc/$pid/maps. If the process is modifying the VMAs at the same time
> the reader is walking them, it can see garbage. For page faults, we
> handle this by taking the mmap_lock for read and retrying the page fault
> (excluding any further modifications).
>
> We don't want to take that approach for the maps file. The monitoring
> task may have a significantly lower process priority, and so taking
> the mmap_lock for read can block it for a significant period of time.
> The obvious answer is to do some kind of backoff+sleep. But we already
> have a wait queue, so why not use it?
>
> I haven't done the rwbase version; this is just a demonstration of what
> we could do. It's also untested other than by compilation. It might
> well be missing something.

I just posted an RFC for lock-less /proc/$pid/maps reading at [1]. The
rwsem_wait() function proposed by Matthew here would be useful in that
patchset: it could replace the mmap_read_lock()/mmap_read_unlock()
sequence I currently have to use to wait for the mmap_lock writer to
finish.

[1] https://lore.kernel.org/all/20240115183837.205694-1-surenb@xxxxxxxxxx/
>
> Signed-off-by: Matthew Wilcox (Oracle) <willy@xxxxxxxxxxxxx>
> ---
> include/linux/rwsem.h | 6 +++
> kernel/locking/rwsem.c | 104 ++++++++++++++++++++++++++++++++++++++++-
> 2 files changed, 108 insertions(+), 2 deletions(-)
>
> diff --git a/include/linux/rwsem.h b/include/linux/rwsem.h
> index 4f1c18992f76..e7bf9dfc471a 100644
> --- a/include/linux/rwsem.h
> +++ b/include/linux/rwsem.h
> @@ -250,6 +250,12 @@ DEFINE_GUARD_COND(rwsem_write, _try, down_write_trylock(_T))
> */
> extern void downgrade_write(struct rw_semaphore *sem);
>
> +/*
> + * wait for current writer to be finished
> + */
> +void rwsem_wait(struct rw_semaphore *sem);
> +int __must_check rwsem_wait_killable(struct rw_semaphore *sem);
> +
> #ifdef CONFIG_DEBUG_LOCK_ALLOC
> /*
> * nested locking. NOTE: rwsems are not allowed to recurse
> diff --git a/kernel/locking/rwsem.c b/kernel/locking/rwsem.c
> index 2340b6d90ec6..7c8096c5586f 100644
> --- a/kernel/locking/rwsem.c
> +++ b/kernel/locking/rwsem.c
> @@ -332,7 +332,8 @@ EXPORT_SYMBOL(__init_rwsem);
>
> enum rwsem_waiter_type {
> RWSEM_WAITING_FOR_WRITE,
> - RWSEM_WAITING_FOR_READ
> + RWSEM_WAITING_FOR_READ,
> + RWSEM_WAITING_FOR_RELEASE,
> };
>
> struct rwsem_waiter {
> @@ -511,7 +512,8 @@ static void rwsem_mark_wake(struct rw_semaphore *sem,
> if (waiter->type == RWSEM_WAITING_FOR_WRITE)
> continue;
>
> - woken++;
> + if (waiter->type == RWSEM_WAITING_FOR_READ)
> + woken++;
> list_move_tail(&waiter->list, &wlist);
>
> /*
> @@ -1401,6 +1403,67 @@ static inline void __downgrade_write(struct rw_semaphore *sem)
> preempt_enable();
> }
>
> +static inline int __wait_read_common(struct rw_semaphore *sem, int state)
> +{
> + int ret = 0;
> + long adjustment = 0;
> + struct rwsem_waiter waiter;
> + DEFINE_WAKE_Q(wake_q);
> +
> + waiter.task = current;
> + waiter.type = RWSEM_WAITING_FOR_RELEASE;
> + waiter.timeout = jiffies + RWSEM_WAIT_TIMEOUT;
> + waiter.handoff_set = false;
> +
> + preempt_disable();
> + raw_spin_lock_irq(&sem->wait_lock);
> + if (list_empty(&sem->wait_list)) {
> + if (!(atomic_long_read(&sem->count) & RWSEM_WRITER_MASK)) {
> + /* Provide lock ACQUIRE */
> + smp_acquire__after_ctrl_dep();
> + raw_spin_unlock_irq(&sem->wait_lock);
> + goto done;
> + }
> + adjustment = RWSEM_FLAG_WAITERS;
> + }
> + rwsem_add_waiter(sem, &waiter);
> + if (adjustment) {
> + long count = atomic_long_add_return(adjustment, &sem->count);
> + rwsem_cond_wake_waiter(sem, count, &wake_q);
> + }
> + raw_spin_unlock_irq(&sem->wait_lock);
> +
> + if (!wake_q_empty(&wake_q))
> + wake_up_q(&wake_q);
> +
> + for (;;) {
> + set_current_state(state);
> + if (!smp_load_acquire(&waiter.task)) {
> + /* Matches rwsem_mark_wake()'s smp_store_release(). */
> + break;
> + }
> + if (signal_pending_state(state, current)) {
> + raw_spin_lock_irq(&sem->wait_lock);
> + if (waiter.task)
> + goto out_nolock;
> + raw_spin_unlock_irq(&sem->wait_lock);
> + /* Ordered by sem->wait_lock against rwsem_mark_wake(). */
> + break;
> + }
> + schedule_preempt_disabled();
> + }
> +
> + __set_current_state(TASK_RUNNING);
> +done:
> + preempt_enable();
> + return ret;
> +out_nolock:
> + rwsem_del_wake_waiter(sem, &waiter, &wake_q);
> + __set_current_state(TASK_RUNNING);
> + ret = -EINTR;
> + goto done;
> +}
> +
> #else /* !CONFIG_PREEMPT_RT */
>
> #define RT_MUTEX_BUILD_MUTEX
> @@ -1500,6 +1563,11 @@ static inline void __downgrade_write(struct rw_semaphore *sem)
> rwbase_write_downgrade(&sem->rwbase);
> }
>
> +static inline int __wait_read_killable(struct rw_semaphore *sem)
> +{
> + return rwbase_wait_lock(&sem->rwbase, TASK_KILLABLE);
> +}
> +
> /* Debug stubs for the common API */
> #define DEBUG_RWSEMS_WARN_ON(c, sem)
>
> @@ -1643,6 +1711,38 @@ void downgrade_write(struct rw_semaphore *sem)
> }
> EXPORT_SYMBOL(downgrade_write);
>
> +/**
> + * rwsem_wait_killable - Wait for current write lock holder to release lock
> + * @sem: The semaphore to wait on.
> + *
> + * This is equivalent to calling down_read(); up_read() but avoids the
> + * possibility that the thread will be preempted while holding the lock
> + * causing threads that want to take the lock for writes to block. The
> + * intended use case is for lockless readers who notice an inconsistent
> + * state and want to wait for the current writer to finish.
> + */
> +int rwsem_wait_killable(struct rw_semaphore *sem)
> +{
> + might_sleep();
> +
> + rwsem_acquire_read(&sem->dep_map, 0, 0, _RET_IP_);
> + rwsem_release(&sem->dep_map, _RET_IP_);
> +
> + return __wait_read_common(sem, TASK_KILLABLE);
> +}
> +EXPORT_SYMBOL(rwsem_wait_killable);
> +
> +void rwsem_wait(struct rw_semaphore *sem)
> +{
> + might_sleep();
> +
> + rwsem_acquire_read(&sem->dep_map, 0, 0, _RET_IP_);
> + rwsem_release(&sem->dep_map, _RET_IP_);
> +
> + __wait_read_common(sem, TASK_UNINTERRUPTIBLE);
> +}
> +EXPORT_SYMBOL(rwsem_wait);
> +
> #ifdef CONFIG_DEBUG_LOCK_ALLOC
>
> void down_read_nested(struct rw_semaphore *sem, int subclass)
> --
> 2.43.0
>