Re: [PATCH 1/2] sched/wait: abort_exclusive_wait() should pass TASK_NORMAL to wake_up()

From: Peter Zijlstra
Date: Thu Sep 01 2016 - 07:39:31 EST


On Fri, Aug 26, 2016 at 02:45:28PM +0200, Oleg Nesterov wrote:
> Otherwise this logic only works if mode is "compatible" with another
> exclusive waiter.
>
> If some wq has both TASK_INTERRUPTIBLE and TASK_UNINTERRUPTIBLE waiters,
> abort_exclusive_wait() won't wake an uninterruptible waiter.
>
> The main user is __wait_on_bit_lock() and currently it is fine but only
> because TASK_KILLABLE includes TASK_UNINTERRUPTIBLE and we do not have
> lock_page_interruptible() yet.

So mixing INTERRUPTIBLE and UNINTERRUPTIBLE and then not using
TASK_NORMAL for wakeups is a mis-feature/abuse of waitqueues IMO.

That said, people do 'creative' things, so maybe we should add some
debug infrastructure to detect this mismatch.

Something like the below perhaps? It will miss people using the (old)
add_wait_queue() (of which there are plenty :/) but there's nothing quick
we can do about those.

Completely untested..

---
include/linux/wait.h | 13 ++++++++++++-
kernel/sched/wait.c | 27 +++++++++++++++++++++++++++
2 files changed, 39 insertions(+), 1 deletion(-)

diff --git a/include/linux/wait.h b/include/linux/wait.h
index c3ff74d764fa..e99ea720c5f9 100644
--- a/include/linux/wait.h
+++ b/include/linux/wait.h
@@ -39,6 +39,9 @@ struct wait_bit_queue {
struct __wait_queue_head {
spinlock_t lock;
struct list_head task_list;
+#ifdef CONFIG_DEBUG_WAITQUEUE
+ unsigned int state;
+#endif
};
typedef struct __wait_queue_head wait_queue_head_t;

@@ -48,6 +51,13 @@ struct task_struct;
* Macros for declaration and initialisaton of the datatypes
*/

+#ifdef CONFIG_DEBUG_WAITQUEUE
+#define __DEBUG_WAIT_QUEUE_HEAD_INIT(name) \
+ .state = -1,
+#else
+#define __DEBUG_WAIT_QUEUE_HEAD_INIT(name)
+#endif
+
#define __WAITQUEUE_INITIALIZER(name, tsk) { \
.private = tsk, \
.func = default_wake_function, \
@@ -58,7 +68,8 @@ struct task_struct;

#define __WAIT_QUEUE_HEAD_INITIALIZER(name) { \
.lock = __SPIN_LOCK_UNLOCKED(name.lock), \
- .task_list = { &(name).task_list, &(name).task_list } }
+ .task_list = { &(name).task_list, &(name).task_list }, \
+ __DEBUG_WAIT_QUEUE_HEAD_INIT(name) }

#define DECLARE_WAIT_QUEUE_HEAD(name) \
wait_queue_head_t name = __WAIT_QUEUE_HEAD_INITIALIZER(name)
diff --git a/kernel/sched/wait.c b/kernel/sched/wait.c
index f15d6b6a538a..cb71c56c5e76 100644
--- a/kernel/sched/wait.c
+++ b/kernel/sched/wait.c
@@ -16,6 +16,9 @@ void __init_waitqueue_head(wait_queue_head_t *q, const char *name, struct lock_c
spin_lock_init(&q->lock);
lockdep_set_class_and_name(&q->lock, key, name);
INIT_LIST_HEAD(&q->task_list);
+#ifdef CONFIG_DEBUG_WAITQUEUE
+ q->state = -1;
+#endif
}

EXPORT_SYMBOL(__init_waitqueue_head);
@@ -67,6 +70,16 @@ static void __wake_up_common(wait_queue_head_t *q, unsigned int mode,
{
wait_queue_t *curr, *next;

+#ifdef CONFIG_DEBUG_WAITQUEUE
+ if (q->state != -1) {
+ /*
+ * WARN if we have INTERRUPTIBLE and UNINTERRUPTIBLE
+ * waiters and do not use TASK_NORMAL to wake.
+ */
+ WARN_ON_ONCE(q->state != (mode & TASK_NORMAL));
+ }
+#endif
+
list_for_each_entry_safe(curr, next, &q->task_list, task_list) {
unsigned flags = curr->flags;

@@ -156,6 +169,17 @@ void __wake_up_sync(wait_queue_head_t *q, unsigned int mode, int nr_exclusive)
}
EXPORT_SYMBOL_GPL(__wake_up_sync); /* For internal use only */

+static inline void prepare_debug(struct wait_queue_head *q, int state)
+{
+#ifdef CONFIG_DEBUG_WAITQUEUE
+ if (q->state == -1) {
+ q->state = state & TASK_NORMAL;
+ } else {
+ q->state |= state & TASK_NORMAL;
+ }
+#endif
+}
+
/*
* Note: we use "set_current_state()" _after_ the wait-queue add,
* because we need a memory barrier there on SMP, so that any
@@ -178,6 +202,7 @@ prepare_to_wait(wait_queue_head_t *q, wait_queue_t *wait, int state)
if (list_empty(&wait->task_list))
__add_wait_queue(q, wait);
set_current_state(state);
+ prepare_debug(q, state);
spin_unlock_irqrestore(&q->lock, flags);
}
EXPORT_SYMBOL(prepare_to_wait);
@@ -192,6 +217,7 @@ prepare_to_wait_exclusive(wait_queue_head_t *q, wait_queue_t *wait, int state)
if (list_empty(&wait->task_list))
__add_wait_queue_tail(q, wait);
set_current_state(state);
+ prepare_debug(q, state);
spin_unlock_irqrestore(&q->lock, flags);
}
EXPORT_SYMBOL(prepare_to_wait_exclusive);
@@ -214,6 +240,7 @@ long prepare_to_wait_event(wait_queue_head_t *q, wait_queue_t *wait, int state)
__add_wait_queue(q, wait);
}
set_current_state(state);
+ prepare_debug(q, state);
spin_unlock_irqrestore(&q->lock, flags);

return 0;