[RFC PATCH] block: Fix bio merge induced high I/O latency

From: Mathieu Desnoyers
Date: Sat Jan 17 2009 - 11:27:18 EST


A long standing I/O regression (since 2.6.18, still there today) has hit
Slashdot recently :
http://bugzilla.kernel.org/show_bug.cgi?id=12309
http://it.slashdot.org/article.pl?sid=09/01/15/049201

I've taken a trace reproducing the wrong behavior on my machine and I
think it's getting us somewhere.

LTTng 0.83, kernel 2.6.28
Machine : Intel Xeon E5405 dual quad-core, 16GB ram
(just created a new block-trace.c LTTng probe which is not released yet.
It basically replaces blktrace)


echo 3 > /proc/sys/vm/drop_caches

lttctl -C -w /tmp/trace -o channel.mm.bufnum=8 -o channel.block.bufnum=64 trace

dd if=/dev/zero of=/tmp/newfile bs=1M count=1M
cp -ax music /tmp (copying 1.1GB of mp3)

ls (takes 15 seconds to get the directory listing !)

lttctl -D trace

I looked at the trace (especially at the ls surroundings), and bash is
waiting for a few seconds for I/O in the exec system call (to exec ls).

While this happens, we have dd doing lots and lots of bio_queue. There
is a bio_backmerge after each bio_queue event. This is reasonable,
because dd is writing to a contiguous file.

However, I wonder if this is not the actual problem. We have dd which
has the head request in the elevator request queue. It is progressing
steadily by plugging/unplugging the device periodically and gets its
work done. However, because requests are being dequeued at the same
rate others are being merged, I suspect it stays at the top of the queue
and does not let the other unrelated requests run.

There is a test in the blk-merge.c which makes sure that merged requests
do not get bigger than a certain size. However, if the request is
steadily dequeued, I think this test is not doing anything.


This patch implements a basic test to make sure we never merge more than 128
requests into the same request if it is the "last_merge" request. I have not
been able to trigger the problem again with the fix applied. It might not be in
a perfect state : there may be better solutions to the problem, but I think it
helps pointing out where the culprit lays.

Signed-off-by: Mathieu Desnoyers <mathieu.desnoyers@xxxxxxxxxx>
CC: Jens Axboe <axboe@xxxxxxxxx>
CC: Andrea Arcangeli <andrea@xxxxxxx>
CC: akpm@xxxxxxxxxxxxxxxxxxxx
CC: Ingo Molnar <mingo@xxxxxxx>
CC: Linus Torvalds <torvalds@xxxxxxxxxxxxxxxxxxxx>
---
block/blk-merge.c | 12 +++++++++---
block/elevator.c | 31 ++++++++++++++++++++++++++++---
include/linux/blkdev.h | 1 +
3 files changed, 38 insertions(+), 6 deletions(-)

Index: linux-2.6-lttng/include/linux/blkdev.h
===================================================================
--- linux-2.6-lttng.orig/include/linux/blkdev.h 2009-01-17 09:49:54.000000000 -0500
+++ linux-2.6-lttng/include/linux/blkdev.h 2009-01-17 09:50:29.000000000 -0500
@@ -313,6 +313,7 @@ struct request_queue
*/
struct list_head queue_head;
struct request *last_merge;
+ int nr_cached_merge;
elevator_t *elevator;

/*
Index: linux-2.6-lttng/block/elevator.c
===================================================================
--- linux-2.6-lttng.orig/block/elevator.c 2009-01-17 09:49:54.000000000 -0500
+++ linux-2.6-lttng/block/elevator.c 2009-01-17 11:07:12.000000000 -0500
@@ -255,6 +255,7 @@ int elevator_init(struct request_queue *

INIT_LIST_HEAD(&q->queue_head);
q->last_merge = NULL;
+ q->nr_cached_merge = 0;
q->end_sector = 0;
q->boundary_rq = NULL;

@@ -438,8 +439,10 @@ void elv_dispatch_sort(struct request_qu
struct list_head *entry;
int stop_flags;

- if (q->last_merge == rq)
+ if (q->last_merge == rq) {
q->last_merge = NULL;
+ q->nr_cached_merge = 0;
+ }

elv_rqhash_del(q, rq);

@@ -478,8 +481,10 @@ EXPORT_SYMBOL(elv_dispatch_sort);
*/
void elv_dispatch_add_tail(struct request_queue *q, struct request *rq)
{
- if (q->last_merge == rq)
+ if (q->last_merge == rq) {
q->last_merge = NULL;
+ q->nr_cached_merge = 0;
+ }

elv_rqhash_del(q, rq);

@@ -498,6 +503,16 @@ int elv_merge(struct request_queue *q, s
int ret;

/*
+ * Make sure we don't starve other requests by merging too many cached
+ * requests together.
+ */
+ if (q->nr_cached_merge >= BLKDEV_MAX_RQ) {
+ q->last_merge = NULL;
+ q->nr_cached_merge = 0;
+ return ELEVATOR_NO_MERGE;
+ }
+
+ /*
* First try one-hit cache.
*/
if (q->last_merge) {
@@ -536,6 +551,10 @@ void elv_merged_request(struct request_q
if (type == ELEVATOR_BACK_MERGE)
elv_rqhash_reposition(q, rq);

+ if (q->last_merge != rq)
+ q->nr_cached_merge = 0;
+ else
+ q->nr_cached_merge++;
q->last_merge = rq;
}

@@ -551,6 +570,10 @@ void elv_merge_requests(struct request_q
elv_rqhash_del(q, next);

q->nr_sorted--;
+ if (q->last_merge != rq)
+ q->nr_cached_merge = 0;
+ else
+ q->nr_cached_merge++;
q->last_merge = rq;
}

@@ -626,8 +649,10 @@ void elv_insert(struct request_queue *q,
q->nr_sorted++;
if (rq_mergeable(rq)) {
elv_rqhash_add(q, rq);
- if (!q->last_merge)
+ if (!q->last_merge) {
+ q->nr_cached_merge = 1;
q->last_merge = rq;
+ }
}

/*
Index: linux-2.6-lttng/block/blk-merge.c
===================================================================
--- linux-2.6-lttng.orig/block/blk-merge.c 2009-01-17 09:49:54.000000000 -0500
+++ linux-2.6-lttng/block/blk-merge.c 2009-01-17 09:50:29.000000000 -0500
@@ -231,8 +231,10 @@ static inline int ll_new_hw_segment(stru
if (req->nr_phys_segments + nr_phys_segs > q->max_hw_segments
|| req->nr_phys_segments + nr_phys_segs > q->max_phys_segments) {
req->cmd_flags |= REQ_NOMERGE;
- if (req == q->last_merge)
+ if (req == q->last_merge) {
q->last_merge = NULL;
+ q->nr_cached_merge = 0;
+ }
return 0;
}

@@ -256,8 +258,10 @@ int ll_back_merge_fn(struct request_queu

if (req->nr_sectors + bio_sectors(bio) > max_sectors) {
req->cmd_flags |= REQ_NOMERGE;
- if (req == q->last_merge)
+ if (req == q->last_merge) {
q->last_merge = NULL;
+ q->nr_cached_merge = 0;
+ }
return 0;
}
if (!bio_flagged(req->biotail, BIO_SEG_VALID))
@@ -281,8 +285,10 @@ int ll_front_merge_fn(struct request_que

if (req->nr_sectors + bio_sectors(bio) > max_sectors) {
req->cmd_flags |= REQ_NOMERGE;
- if (req == q->last_merge)
+ if (req == q->last_merge) {
q->last_merge = NULL;
+ q->nr_cached_merge = 0;
+ }
return 0;
}
if (!bio_flagged(bio, BIO_SEG_VALID))

--
Mathieu Desnoyers
OpenPGP key fingerprint: 8CD5 52C3 8E3C 4140 715F BA06 3F25 A8FE 3BAE 9A68
--
To unsubscribe from this list: send the line "unsubscribe linux-kernel" in
the body of a message to majordomo@xxxxxxxxxxxxxxx
More majordomo info at http://vger.kernel.org/majordomo-info.html
Please read the FAQ at http://www.tux.org/lkml/