Re: [PATCH] blk-wbt: get back the missed wakeup from __wbt_done

From: Jens Axboe
Date: Sat Aug 25 2018 - 11:43:18 EST


On 8/24/18 2:41 PM, Jens Axboe wrote:
> On 8/24/18 2:33 PM, Anchal Agarwal wrote:
>> On Fri, Aug 24, 2018 at 12:50:44PM -0600, Jens Axboe wrote:
>>> On 8/24/18 12:12 PM, Anchal Agarwal wrote:
>>>> That's totally fair. As compared to before the patch it was way too high
>>>> and my test case wasn't even running due to the thundering herd issues and
>>>> queue re-ordering. Anyway, as I also mentioned before, 10 times
>>>> contention is not too bad since it's not really affecting much the number of
>>>> files read in my application. Also, you are right, waking up N tasks seems
>>>> plausible.
>>>
>>> OK, I'm going to take that as a positive response. I'm going to propose
>>> the last patch as the final addition in this round, since it does fix a
>>> gap in the previous. And I do think that we need to wake as many tasks
>>> as can make progress, otherwise we're deliberately running the device at
>>> a lower load than we should.
>>>
>>>> My application is somewhat similar to a database workload. It does use fsync
>>>> internally also. So what it does is it creates files of random sizes with
>>>> random contents. It stores the hash of those files in memory. During the
>>>> test it reads those files back from storage and checks their hashes.
>>>
>>> How many tasks are running for your test?
>>>
>>> --
>>> Jens Axboe
>>>
>>>
>>
>> So there are 128 Concurrent reads/writes happening. Max files written before
>> reads start is 16384 and each file is of size 512KB. Does that answer your
>> question?
>
> Yes it does, thanks. That's not a crazy amount of tasks or threads.
>
>> BTW, I still have to test the last patch you sent, but by looking at the patch
>> I assumed it will work anyway!
>
> Thanks for the vote of confidence, I'd appreciate if you would give it a
> whirl. Your workload seems nastier than what I test with, so would be
> great to have someone else test it too.

This is what I have come up with, this actually survives some torture
testing. We do need to have the wait as a loop, since we can't rely on
just being woken from the ->func handler we set. It also handles the
prep token get race.


diff --git a/block/blk-wbt.c b/block/blk-wbt.c
index 84507d3e9a98..2442b2b141b8 100644
--- a/block/blk-wbt.c
+++ b/block/blk-wbt.c
@@ -123,16 +123,11 @@ static void rwb_wake_all(struct rq_wb *rwb)
}
}

-static void __wbt_done(struct rq_qos *rqos, enum wbt_flags wb_acct)
+static void wbt_rqw_done(struct rq_wb *rwb, struct rq_wait *rqw,
+ enum wbt_flags wb_acct)
{
- struct rq_wb *rwb = RQWB(rqos);
- struct rq_wait *rqw;
int inflight, limit;

- if (!(wb_acct & WBT_TRACKED))
- return;
-
- rqw = get_rq_wait(rwb, wb_acct);
inflight = atomic_dec_return(&rqw->inflight);

/*
@@ -166,8 +161,21 @@ static void __wbt_done(struct rq_qos *rqos, enum wbt_flags wb_acct)
int diff = limit - inflight;

if (!inflight || diff >= rwb->wb_background / 2)
- wake_up(&rqw->wait);
+ wake_up_all(&rqw->wait);
}
+
+}
+
+static void __wbt_done(struct rq_qos *rqos, enum wbt_flags wb_acct)
+{
+ struct rq_wb *rwb = RQWB(rqos);
+ struct rq_wait *rqw;
+
+ if (!(wb_acct & WBT_TRACKED))
+ return;
+
+ rqw = get_rq_wait(rwb, wb_acct);
+ wbt_rqw_done(rwb, rqw, wb_acct);
}

/*
@@ -481,6 +489,33 @@ static inline unsigned int get_limit(struct rq_wb *rwb, unsigned long rw)
return limit;
}

+struct wbt_wait_data {
+ struct wait_queue_entry wq;
+ struct task_struct *curr;
+ struct rq_wb *rwb;
+ struct rq_wait *rqw;
+ unsigned long rw;
+ unsigned long flags;
+};
+
+static int wbt_wake_function(struct wait_queue_entry *curr, unsigned int mode,
+ int wake_flags, void *key)
+{
+ struct wbt_wait_data *data = container_of(curr, struct wbt_wait_data, wq);
+
+ /*
+ * If we fail to get a budget, return -1 to interrupt the wake up
+ * loop in __wake_up_common.
+ */
+ if (!rq_wait_inc_below(data->rqw, get_limit(data->rwb, data->rw)))
+ return -1;
+
+ set_bit(0, &data->flags);
+ wake_up_process(data->curr);
+ list_del_init(&curr->entry);
+ return 1;
+}
+
/*
* Block if we will exceed our limit, or if we are currently waiting for
* the timer to kick off queuing again.
@@ -491,19 +526,42 @@ static void __wbt_wait(struct rq_wb *rwb, enum wbt_flags wb_acct,
__acquires(lock)
{
struct rq_wait *rqw = get_rq_wait(rwb, wb_acct);
- DECLARE_WAITQUEUE(wait, current);
+ struct wbt_wait_data data = {
+ .wq = {
+ .func = wbt_wake_function,
+ .entry = LIST_HEAD_INIT(data.wq.entry),
+ },
+ .curr = current,
+ .rwb = rwb,
+ .rqw = rqw,
+ .rw = rw,
+ };
bool has_sleeper;

has_sleeper = wq_has_sleeper(&rqw->wait);
if (!has_sleeper && rq_wait_inc_below(rqw, get_limit(rwb, rw)))
return;

- add_wait_queue_exclusive(&rqw->wait, &wait);
+ prepare_to_wait_exclusive(&rqw->wait, &data.wq, TASK_UNINTERRUPTIBLE);
do {
- set_current_state(TASK_UNINTERRUPTIBLE);
+ if (test_bit(0, &data.flags))
+ break;

- if (!has_sleeper && rq_wait_inc_below(rqw, get_limit(rwb, rw)))
+ WARN_ON_ONCE(list_empty(&data.wq.entry));
+
+ if (!has_sleeper &&
+ rq_wait_inc_below(rqw, get_limit(rwb, rw))) {
+ finish_wait(&rqw->wait, &data.wq);
+
+ /*
+ * We raced with wbt_wake_function() getting a token,
+ * which means we now have two. Put ours and wake
+ * anyone else potentially waiting for one.
+ */
+ if (test_bit(0, &data.flags))
+ wbt_rqw_done(rwb, rqw, wb_acct);
break;
+ }

if (lock) {
spin_unlock_irq(lock);
@@ -511,11 +569,11 @@ static void __wbt_wait(struct rq_wb *rwb, enum wbt_flags wb_acct,
spin_lock_irq(lock);
} else
io_schedule();
+
has_sleeper = false;
} while (1);

- __set_current_state(TASK_RUNNING);
- remove_wait_queue(&rqw->wait, &wait);
+ finish_wait(&rqw->wait, &data.wq);
}

static inline bool wbt_should_throttle(struct rq_wb *rwb, struct bio *bio)

--
Jens Axboe