Re: [PATCH] ide write barriers

From: Jens Axboe (axboe@suse.de)
Date: Wed Feb 05 2003 - 11:33:52 EST


On Wed, Feb 05 2003, Marc-Christian Petersen wrote:
> On Wednesday 05 February 2003 16:18, Jens Axboe wrote:
>
> Hi Jens,
>
> > The attached patch implements write barrier operations in the block
> > layer and for IDE, specifically. The goal is to make the use of write
> > back cache enabled ide drives safe with journalled file systems.
> > Patch is against 2.4.21-pre4-bk as of today, and includes a small patch
> > to enable it on ext3. Chris has a patch for reiserfs as well.
> Could you also please cook up one for 2.4.20? :) Thank you.

Sure, I had that one already. BTW, I discovered that the default I/O
scheduler forgets to honor cmd_flags; it's supposed to break out of its
scan like the noop scheduler does (see the very first hunk in the very
first file). I must have removed that by mistake some time ago... This
applies both to the 2.4.21-pre4 patch posted and to this one.

diff -urN -X /home/axboe/cdrom/exclude /opt/kernel/linux-2.4.20/drivers/block/elevator.c linux/drivers/block/elevator.c
--- /opt/kernel/linux-2.4.20/drivers/block/elevator.c 2002-11-29 00:53:12.000000000 +0100
+++ linux/drivers/block/elevator.c 2002-11-19 07:58:11.000000000 +0100
@@ -156,6 +156,12 @@
         while ((entry = entry->prev) != head) {
                 struct request *__rq = blkdev_entry_to_request(entry);
 
+ /*
+ * we can neither merge nor insert before/with a flush
+ */
+ if (__rq->cmd_flags & RQ_WRITE_ORDERED)
+ break;
+
                 if (__rq->cmd != rw)
                         continue;
                 if (__rq->rq_dev != bh->b_rdev)
diff -urN -X /home/axboe/cdrom/exclude /opt/kernel/linux-2.4.20/drivers/block/ll_rw_blk.c linux/drivers/block/ll_rw_blk.c
--- /opt/kernel/linux-2.4.20/drivers/block/ll_rw_blk.c 2002-11-29 00:53:12.000000000 +0100
+++ linux/drivers/block/ll_rw_blk.c 2002-11-22 13:53:31.000000000 +0100
@@ -240,6 +240,32 @@
 void blk_queue_make_request(request_queue_t * q, make_request_fn * mfn)
 {
         q->make_request_fn = mfn;
+ q->ordered = QUEUE_ORDERED_NONE;
+}
+
+/**
+ * blk_queue_ordered - does this queue support ordered writes
+ * @q: the request queue
+ * @flag: see below
+ *
+ * Description:
+ * For journalled file systems, doing ordered writes on a commit
+ * block instead of explicitly doing wait_on_buffer (which is bad
+ * for performance) can be a big win. Block drivers supporting this
+ * feature should call this function and indicate so.
+ *
+ * SCSI drivers usually need to support ordered tags, while others
+ * may have to do a complete drive cache flush if they are using write
+ * back caching (or not and lying about it)
+ *
+ * With this in mind, the values are
+ * QUEUE_ORDERED_NONE: the default, doesn't support barrier
+ * QUEUE_ORDERED_TAG: supports ordered tags
+ * QUEUE_ORDERED_FLUSH: supports barrier through cache flush
+ **/
+void blk_queue_ordered(request_queue_t *q, int flag)
+{
+ q->ordered = flag;
 }
 
 /**
@@ -432,7 +458,7 @@
 
         si_meminfo(&si);
         megs = si.totalram >> (20 - PAGE_SHIFT);
- nr_requests = 128;
+ nr_requests = 16;
         if (megs < 32)
                 nr_requests /= 2;
         blk_grow_request_list(q, nr_requests);
@@ -517,6 +543,7 @@
                 rq = blkdev_free_rq(&rl->free);
                 list_del(&rq->queue);
                 rl->count--;
+ rq->cmd_flags = 0;
                 rq->rq_status = RQ_ACTIVE;
                 rq->cmd = rw;
                 rq->special = NULL;
@@ -908,12 +935,27 @@
         int rw_ahead, max_sectors, el_ret;
         struct list_head *head, *insert_here;
         int latency;
+ int write_ordered = 0;
         elevator_t *elevator = &q->elevator;
 
+ /* check for barrier requests the device can't handle */
+ if (buffer_ordered_tag(bh))
+ write_ordered = QUEUE_ORDERED_TAG;
+ else if (buffer_ordered_flush(bh))
+ write_ordered = QUEUE_ORDERED_FLUSH;
+
+ if (write_ordered && q->ordered != write_ordered) {
+ if (buffer_ordered_hard(bh)) {
+ set_bit(BH_IO_OPNOTSUPP, &bh->b_state);
+ goto end_io;
+ }
+ write_ordered = 0;
+ }
+
         count = bh->b_size >> 9;
         sector = bh->b_rsector;
 
- rw_ahead = 0; /* normal case; gets changed below for READA */
+ latency = rw_ahead = 0; /* normal case; gets changed below for READA */
         switch (rw) {
                 case READA:
 #if 0 /* bread() misinterprets failed READA attempts as IO errors on SMP */
@@ -922,7 +964,8 @@
                         rw = READ; /* drop into READ */
                 case READ:
                 case WRITE:
- latency = elevator_request_latency(elevator, rw);
+ if (!write_ordered)
+ latency = elevator_request_latency(elevator, rw);
                         break;
                 default:
                         BUG();
@@ -1049,6 +1092,9 @@
         }
 
 /* fill up the request-info, and add it to the queue */
+ if (write_ordered)
+ req->cmd_flags |= RQ_WRITE_ORDERED;
+
         req->elevator_sequence = latency;
         req->cmd = rw;
         req->errors = 0;
@@ -1525,3 +1571,4 @@
 EXPORT_SYMBOL(blk_max_pfn);
 EXPORT_SYMBOL(blk_seg_merge_ok);
 EXPORT_SYMBOL(blk_nohighio);
+EXPORT_SYMBOL(blk_queue_ordered);
diff -urN -X /home/axboe/cdrom/exclude /opt/kernel/linux-2.4.20/drivers/ide/ide.c linux/drivers/ide/ide.c
--- /opt/kernel/linux-2.4.20/drivers/ide/ide.c 2002-11-29 00:53:13.000000000 +0100
+++ linux/drivers/ide/ide.c 2002-11-19 07:58:11.000000000 +0100
@@ -555,6 +555,36 @@
 }
 
 /*
+ * preempt pending requests, and store this cache flush for immediate
+ * execution
+ */
+static struct request *ide_queue_flush_cmd(ide_drive_t *drive, struct request *rq, int post)
+{
+ struct request *flush_rq = &HWGROUP(drive)->wrq;
+
+ list_del(&rq->queue);
+
+ memset(drive->special_buf, 0, sizeof(drive->special_buf));
+
+ ide_init_drive_cmd(flush_rq);
+
+ flush_rq->buffer = drive->special_buf;
+ flush_rq->special = rq;
+
+ flush_rq->buffer[0] = (drive->id->cfs_enable_2 & 0x2400) ? WIN_FLUSH_CACHE_EXT : WIN_FLUSH_CACHE;
+
+ if (post)
+ flush_rq->cmd_flags |= RQ_WRITE_POSTFLUSH;
+ else {
+ drive->doing_barrier = 1;
+ flush_rq->cmd_flags |= RQ_WRITE_PREFLUSH;
+ }
+
+ list_add(&flush_rq->queue, &drive->queue.queue_head);
+ return flush_rq;
+}
+
+/*
  * This is our end_request replacement function.
  */
 void ide_end_request (byte uptodate, ide_hwgroup_t *hwgroup)
@@ -577,9 +607,19 @@
 
         if (!end_that_request_first(rq, uptodate, hwgroup->drive->name)) {
                 add_blkdev_randomness(MAJOR(rq->rq_dev));
- blkdev_dequeue_request(rq);
                 hwgroup->rq = NULL;
- end_that_request_last(rq);
+
+ /*
+ * if this is a write barrier, flush the writecache before
+ * allowing new requests to finish and before signalling
+ * completion of this request
+ */
+ if (rq->cmd_flags & RQ_WRITE_ORDERED)
+ ide_queue_flush_cmd(drive, rq, 1);
+ else {
+ blkdev_dequeue_request(rq);
+ end_that_request_last(rq);
+ }
         }
         spin_unlock_irqrestore(&io_request_lock, flags);
 }
@@ -932,8 +972,36 @@
                 default:
                         break;
         }
+
         spin_lock_irqsave(&io_request_lock, flags);
         blkdev_dequeue_request(rq);
+
+ /*
+ * if a cache flush fails, disable ordered write support
+ */
+ if (rq->cmd_flags & (RQ_WRITE_PREFLUSH | RQ_WRITE_POSTFLUSH)) {
+ struct request *real_rq = rq->special;
+
+ /*
+ * best-effort currently, this ignores the fact that there
+ * may be other barriers currently queued that we can't
+ * honor any more
+ */
+ if (err)
+ blk_queue_ordered(&drive->queue, QUEUE_ORDERED_NONE);
+
+ if (rq->cmd_flags & RQ_WRITE_POSTFLUSH) {
+ drive->doing_barrier = 0;
+ end_that_request_last(real_rq);
+ } else {
+ /*
+ * just indicate that we did the pre flush
+ */
+ real_rq->cmd_flags |= RQ_WRITE_PREFLUSH;
+ list_add(&real_rq->queue, &drive->queue.queue_head);
+ }
+ }
+
         HWGROUP(drive)->rq = NULL;
         end_that_request_last(rq);
         spin_unlock_irqrestore(&io_request_lock, flags);
@@ -947,6 +1015,13 @@
         unsigned long flags;
         byte err = 0;
 
+ if (drive->quiet) {
+ if ((stat & (BUSY_STAT|ERR_STAT)) == ERR_STAT)
+ err = GET_ERR();
+
+ return err;
+ }
+
         __save_flags (flags); /* local CPU only */
         ide__sti(); /* local CPU only */
         printk("%s: %s: status=0x%02x", drive->name, msg, stat);
@@ -1049,9 +1124,14 @@
         struct request *rq;
         byte err;
 
+ if (drive == NULL)
+ return ide_stopped;
+
         err = ide_dump_status(drive, msg, stat);
- if (drive == NULL || (rq = HWGROUP(drive)->rq) == NULL)
+
+ if ((rq = HWGROUP(drive)->rq) == NULL)
                 return ide_stopped;
+
         /* retry only "normal" I/O: */
         if (rq->cmd == IDE_DRIVE_CMD || rq->cmd == IDE_DRIVE_TASK) {
                 rq->errors = 1;
@@ -1454,6 +1534,15 @@
 repeat:
         best = NULL;
         drive = hwgroup->drive;
+
+ /*
+ * drive is doing pre-flush, ordered write, post-flush sequence. even
+ * though that is 3 requests, it must be seen as a single transaction.
+ * we must not preempt this drive until that is complete
+ */
+ if (drive->doing_barrier)
+ return drive;
+
         do {
                 if (!list_empty(&drive->queue.queue_head) && (!drive->sleep || 0 <= (signed long)(jiffies - drive->sleep))) {
                         if (!best
@@ -1583,7 +1672,18 @@
                 if ( drive->queue.plugged ) /* paranoia */
                         printk("%s: Huh? nuking plugged queue\n", drive->name);
 
- rq = hwgroup->rq = blkdev_entry_next_request(&drive->queue.queue_head);
+ rq = blkdev_entry_next_request(&drive->queue.queue_head);
+
+ /*
+ * if rq is a barrier write, issue pre cache flush if not
+ * already done
+ */
+ if ((rq->cmd_flags & RQ_WRITE_ORDERED)
+ && !(rq->cmd_flags & RQ_WRITE_PREFLUSH))
+ rq = ide_queue_flush_cmd(drive, rq, 0);
+
+ hwgroup->rq = rq;
+
                 /*
                  * Some systems have trouble with IDE IRQs arriving while
                  * the driver is still setting things up. So, here we disable
@@ -3868,6 +3968,14 @@
                 drive->dsc_overlap = (drive->next != drive && driver->supports_dsc_overlap);
                 drive->nice1 = 1;
         }
+ if (DRIVER(drive)->flushcache && drive->media == ide_disk) {
+ drive->quiet = 1;
+ if (!DRIVER(drive)->flushcache(drive)) {
+ blk_queue_ordered(&drive->queue, QUEUE_ORDERED_FLUSH);
+ printk("%s: safely enabled flush\n", drive->name);
+ }
+ drive->quiet = 0;
+ }
         drive->revalidate = 1;
         drive->suspend_reset = 0;
 #ifdef CONFIG_PROC_FS
diff -urN -X /home/axboe/cdrom/exclude /opt/kernel/linux-2.4.20/fs/jbd/commit.c linux/fs/jbd/commit.c
--- /opt/kernel/linux-2.4.20/fs/jbd/commit.c 2002-11-29 00:53:15.000000000 +0100
+++ linux/fs/jbd/commit.c 2002-11-22 12:01:29.000000000 +0100
@@ -598,7 +598,15 @@
                 struct buffer_head *bh = jh2bh(descriptor);
                 clear_bit(BH_Dirty, &bh->b_state);
                 bh->b_end_io = journal_end_buffer_io_sync;
+
+ /* if we're on an ide device, setting BH_Ordered_Flush
+ will force a write cache flush before and after the
+ commit block. Otherwise, it'll do nothing. */
+
+ set_bit(BH_Ordered_Flush, &bh->b_state);
                 submit_bh(WRITE, bh);
+ clear_bit(BH_Ordered_Flush, &bh->b_state);
+
                 wait_on_buffer(bh);
                 put_bh(bh); /* One for getblk() */
                 journal_unlock_journal_head(descriptor);
diff -urN -X /home/axboe/cdrom/exclude /opt/kernel/linux-2.4.20/include/linux/blkdev.h linux/include/linux/blkdev.h
--- /opt/kernel/linux-2.4.20/include/linux/blkdev.h 2002-11-29 00:53:15.000000000 +0100
+++ linux/include/linux/blkdev.h 2002-11-26 17:33:57.000000000 +0100
@@ -32,6 +32,7 @@
 
         kdev_t rq_dev;
         int cmd; /* READ or WRITE */
+ unsigned long cmd_flags;
         int errors;
         unsigned long start_time;
         unsigned long sector;
@@ -48,6 +49,10 @@
         request_queue_t *q;
 };
 
+#define RQ_WRITE_ORDERED 1 /* ordered write */
+#define RQ_WRITE_PREFLUSH 2 /* pre-barrier flush */
+#define RQ_WRITE_POSTFLUSH 4 /* post-barrier flush */
+
 #include <linux/elevator.h>
 
 typedef int (merge_request_fn) (request_queue_t *q,
@@ -127,6 +132,10 @@
         char head_active;
 
         unsigned long bounce_pfn;
+ /*
+ * ordered write support
+ */
+ char ordered;
 
         /*
          * Is meant to protect the queue in the future instead of
@@ -140,6 +149,9 @@
         wait_queue_head_t wait_for_requests[2];
 };
 
+#define QUEUE_ORDERED_NONE 0 /* no support */
+#define QUEUE_ORDERED_TAG 1 /* supported by tags (fast) */
+#define QUEUE_ORDERED_FLUSH 2 /* supported by cache flush (ugh!) */
 extern unsigned long blk_max_low_pfn, blk_max_pfn;
 
 #define BLK_BOUNCE_HIGH (blk_max_low_pfn << PAGE_SHIFT)
@@ -209,6 +221,7 @@
 extern void blk_init_queue(request_queue_t *, request_fn_proc *);
 extern void blk_cleanup_queue(request_queue_t *);
 extern void blk_queue_headactive(request_queue_t *, int);
+extern void blk_queue_ordered(request_queue_t *, int);
 extern void blk_queue_make_request(request_queue_t *, make_request_fn *);
 extern void generic_unplug_device(void *);
 extern inline int blk_seg_merge_ok(struct buffer_head *, struct buffer_head *);
diff -urN -X /home/axboe/cdrom/exclude /opt/kernel/linux-2.4.20/include/linux/elevator.h linux/include/linux/elevator.h
--- /opt/kernel/linux-2.4.20/include/linux/elevator.h 2002-11-29 00:53:15.000000000 +0100
+++ linux/include/linux/elevator.h 2002-11-22 13:55:07.000000000 +0100
@@ -93,8 +93,8 @@
 
 #define ELEVATOR_LINUS \
 ((elevator_t) { \
- 2048, /* read passovers */ \
- 8192, /* write passovers */ \
+ 256, /* read passovers */ \
+ 1024, /* write passovers */ \
                                                                         \
         elevator_linus_merge, /* elevator_merge_fn */ \
         elevator_linus_merge_req, /* elevator_merge_req_fn */ \
diff -urN -X /home/axboe/cdrom/exclude /opt/kernel/linux-2.4.20/include/linux/fs.h linux/include/linux/fs.h
--- /opt/kernel/linux-2.4.20/include/linux/fs.h 2002-11-29 00:53:15.000000000 +0100
+++ linux/include/linux/fs.h 2002-11-22 11:30:56.000000000 +0100
@@ -220,6 +220,10 @@
         BH_Wait_IO, /* 1 if we should write out this buffer */
         BH_Launder, /* 1 if we can throttle on this buffer */
         BH_JBD, /* 1 if it has an attached journal_head */
+ BH_Ordered_Tag, /* 1 if this buffer is a ordered write barrier */
+ BH_Ordered_Flush,/* 1 if this buffer is a flush write barrier */
+ BH_Ordered_Hard, /* 1 if barrier required by the caller */
+ BH_IO_OPNOTSUPP,/* 1 if block layer rejected a barrier write */
 
         BH_PrivateStart,/* not a state bit, but the first bit available
                          * for private allocation by other entities
@@ -283,7 +287,10 @@
 #define buffer_new(bh) __buffer_state(bh,New)
 #define buffer_async(bh) __buffer_state(bh,Async)
 #define buffer_launder(bh) __buffer_state(bh,Launder)
-
+#define buffer_ordered_tag(bh) __buffer_state(bh,Ordered_Tag)
+#define buffer_ordered_hard(bh) __buffer_state(bh,Ordered_Hard)
+#define buffer_ordered_flush(bh) __buffer_state(bh,Ordered_Flush)
+
 #define bh_offset(bh) ((unsigned long)(bh)->b_data & ~PAGE_MASK)
 
 extern void set_bh_page(struct buffer_head *bh, struct page *page, unsigned long offset);
diff -urN -X /home/axboe/cdrom/exclude /opt/kernel/linux-2.4.20/include/linux/ide.h linux/include/linux/ide.h
--- /opt/kernel/linux-2.4.20/include/linux/ide.h 2002-11-29 00:53:15.000000000 +0100
+++ linux/include/linux/ide.h 2002-11-26 17:36:30.000000000 +0100
@@ -381,6 +381,8 @@
         unsigned autotune : 2; /* 1=autotune, 2=noautotune, 0=default */
         unsigned remap_0_to_1 : 2; /* 0=remap if ezdrive, 1=remap, 2=noremap */
         unsigned ata_flash : 1; /* 1=present, 0=default */
+ unsigned quiet : 1;
+ unsigned doing_barrier : 1; /* barrier sequence in progress */
         unsigned addressing; /* : 2; 0=28-bit, 1=48-bit, 2=64-bit */
         byte scsi; /* 0=default, 1=skip current ide-subdriver for ide-scsi emulation */
         byte media; /* disk, cdrom, tape, floppy, ... */
@@ -428,6 +430,7 @@
         byte acoustic; /* acoustic management */
         unsigned int failures; /* current failure count */
         unsigned int max_failures; /* maximum allowed failure count */
+ char special_buf[4]; /* IDE_DRIVE_CMD, free use */
 } ide_drive_t;
 
 /*

-- 
Jens Axboe

- To unsubscribe from this list: send the line "unsubscribe linux-kernel" in the body of a message to majordomo@vger.kernel.org More majordomo info at http://vger.kernel.org/majordomo-info.html Please read the FAQ at http://www.tux.org/lkml/



This archive was generated by hypermail 2b29 : Fri Feb 07 2003 - 22:00:17 EST