[PATCH 2/3 v2] futex: avoid double wake up in futex_wake() on -RT

From: Sebastian Andrzej Siewior
Date: Fri Apr 10 2015 - 12:11:45 EST


futex_wake() wakes the waiter while holding the hb->lock. The waiter
does not take the hb->lock and can leave the kernel. However, the next
operation on the same futex will point to the same hb->lock, and
we will see a small dance around the lock, including prio-boosting and
a context switch:

low prio task FUTEX_WAKE on high prio
| ft-1489 [000] ....1.. 81.167501: sys_enter: sys_futex (8049f60, 1, 1, 0, 0, 0)
| ft-1489 [000] dN..311 81.167504: sched_wakeup: pid=1490 prio=94
| ft-1489 [000] d...311 81.167520: sched_switch: prev_pid=1489 prev_prio=120 prev_state=R+ ==> next_pid=1490 next_prio=94
| ft-1490 [000] ....1.. 81.167522: sys_exit: sys_futex = 0

high prio task FUTEX_WAKE on low prio
| ft-1490 [000] ....1.. 81.167528: sys_enter: sys_futex (8049f5c, 1, 1, 0, 0, 0)
| ft-1490 [000] ....1.. 81.167530: sys_exit: sys_futex = 0

high prio task waits (FUTEX_WAIT), hb->lock still owned by low prio task
| ft-1490 [000] ....1.. 81.167534: sys_enter: sys_futex (8049f60, 0, 1, 0, 0, 0)
| ft-1490 [000] d...411 81.167895: sched_pi_setprio: pid=1489 oldprio=120 newprio=94
| ft-1490 [000] d...311 81.167901: sched_switch: prev_pid=1490 prev_prio=94 prev_state=D ==> next_pid=1489 next_prio=94
| ft-1489 [000] d...411 81.167910: sched_wakeup: pid=1490 prio=94
| ft-1489 [000] d...311 81.167912: sched_pi_setprio: pid=1489 oldprio=94 newprio=120
| ft-1489 [000] d...311 81.167915: sched_switch: prev_pid=1489 prev_prio=120 prev_state=R+ ==> next_pid=1490 next_prio=94
| ft-1490 [000] d...3.. 81.167922: sched_switch: prev_pid=1490 prev_prio=94 prev_state=S ==> next_pid=1489 next_prio=120
| ft-1489 [000] ....1.. 81.167924: sys_exit: sys_futex = 1

This patch delays the wakeup of the process until the hb->lock is
dropped, to avoid boosting plus a context switch to obtain the lock.

Signed-off-by: Sebastian Andrzej Siewior <bigeasy@xxxxxxxxxxxxx>
---
v1 -> v2:
- update patch description
- move the comment to __wake_futex()
- move the wakeup before the out_put_key label in futex_wake()

kernel/futex.c | 24 +++++++++++++++++++++---
1 file changed, 21 insertions(+), 3 deletions(-)

diff --git a/kernel/futex.c b/kernel/futex.c
index b38abe3573a8..658f4d05cd6f 100644
--- a/kernel/futex.c
+++ b/kernel/futex.c
@@ -1092,12 +1092,12 @@ static void __unqueue_futex(struct futex_q *q)
* The hash bucket lock must be held when this is called.
* Afterwards, the futex_q must not be accessed.
*/
-static void wake_futex(struct futex_q *q)
+static struct task_struct *__wake_futex(struct futex_q *q)
{
struct task_struct *p = q->task;

if (WARN(q->pi_state || q->rt_waiter, "refusing to wake PI futex\n"))
- return;
+ return NULL;

/*
* We set q->lock_ptr = NULL _before_ we wake up the task. If
@@ -1117,6 +1117,15 @@ static void wake_futex(struct futex_q *q)
*/
smp_wmb();
q->lock_ptr = NULL;
+ return p;
+}
+
+static void wake_futex(struct futex_q *q)
+{
+ struct task_struct *p = __wake_futex(q);
+
+ if (!p)
+ return;

wake_up_state(p, TASK_NORMAL);
put_task_struct(p);
@@ -1228,6 +1237,7 @@ futex_wake(u32 __user *uaddr, unsigned int flags, int nr_wake, u32 bitset)
struct futex_hash_bucket *hb;
struct futex_q *this, *next;
union futex_key key = FUTEX_KEY_INIT;
+ struct task_struct *waiter = NULL;
int ret;

if (!bitset)
@@ -1256,13 +1266,21 @@ futex_wake(u32 __user *uaddr, unsigned int flags, int nr_wake, u32 bitset)
if (!(this->bitset & bitset))
continue;

- wake_futex(this);
+ if (nr_wake == 1)
+ waiter = __wake_futex(this);
+ else
+ wake_futex(this);
if (++ret >= nr_wake)
break;
}
}

spin_unlock(&hb->lock);
+ if (waiter) {
+ wake_up_state(waiter, TASK_NORMAL);
+ put_task_struct(waiter);
+ }
+
out_put_key:
put_futex_key(&key);
out:
--
2.1.4

--
To unsubscribe from this list: send the line "unsubscribe linux-kernel" in
the body of a message to majordomo@xxxxxxxxxxxxxxx
More majordomo info at http://vger.kernel.org/majordomo-info.html
Please read the FAQ at http://www.tux.org/lkml/