Re: Nbd is broken

From: Andrea Arcangeli (andrea@suse.de)
Date: Sat Mar 04 2000 - 11:06:16 EST


On Sat, 4 Mar 2000, Pavel Machek wrote:

>@@ -77,6 +77,7 @@
> nbd_dev[dev].refcnt++;
> if (!(nbdev->flags & NBD_INITIALISED)) {
> init_MUTEX(&nbdev->queue_lock);
>+ INIT_LIST_HEAD(&nbdev->queue_head);
> nbdev->flags |= NBD_INITIALISED;
> }
> MOD_INC_USE_COUNT;

I prefer to do all such initialization in the main (module init) path and
not at driver open. That way you don't need the INITIALISED bitflag anymore.
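
A minimal sketch of what I mean (this is the same change the full patch
below makes in nbd_init, assuming the usual 2.3 init_MUTEX and
INIT_LIST_HEAD calls):

        /* do it once at module init instead of at every open */
        for (i = 0; i < MAX_NBD; i++) {
                init_MUTEX(&nbd_dev[i].queue_lock);
                INIT_LIST_HEAD(&nbd_dev[i].queue_head);
        }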

>@@ -218,7 +219,10 @@
>
> down (&lo->queue_lock);
> while (!list_empty(&lo->queue_head)) {
>+ up (&lo->queue_lock);
> req = nbd_read_stat(lo);
>+ down (&lo->queue_lock);
>+
> if (!req)
> goto out;
> #ifdef PARANOIA

If we can't keep the semaphore held during the xmit, we should remove the
first request from lo->queue_head before dropping the semaphore. I don't
see recursion on the lock there, but maybe I'm overlooking something.
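
Roughly the pattern I have in mind, as a sketch only (the blocking socket
read would have to learn to work on an already-detached request, which
nbd_read_stat doesn't do today):

        down (&lo->queue_lock);
        while (!list_empty(&lo->queue_head)) {
                /* detach the oldest pending request while still locked,
                 * so nothing else can touch it once we drop the lock */
                req = blkdev_entry_prev_request(&lo->queue_head);
                list_del(&req->queue);
                up (&lo->queue_lock);

                /* the blocking network I/O then runs unlocked here */

                down (&lo->queue_lock);
        }
        up (&lo->queue_lock);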

>@@ -249,17 +253,22 @@
> {
> struct request *req;
>
>+#ifdef PARANOIA
>+ if (lo->magic != LO_MAGIC) {
>+ printk(KERN_ERR "NBD: nbd_dev[] corrupted: Not enough magic when clearing!\n");
>+ return;
>+ }
>+#endif
>+
> while (!list_empty(&lo->queue_head)) {
> req = blkdev_entry_prev_request(&lo->queue_head);
>+ if (!req)
>+ break;

This is not necessary by design: blkdev_entry_prev_request will never
return NULL, and the list_empty check above already takes care of the
empty-list case.
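
For reference (quoting the 2.3 blkdev.h macros from memory), it is just a
list_entry cast on the previous list node, so on a non-empty list it
always yields a valid pointer:

        #define blkdev_entry_to_request(entry) \
                list_entry((entry), struct request, queue)
        #define blkdev_entry_prev_request(head) \
                blkdev_entry_to_request((head)->prev)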

> while (!QUEUE_EMPTY) {
> req = CURRENT;
>+#ifdef PARANOIA
>+ if (!req)
>+ FAIL("que not empty but no request?");
>+#endif

Same as above: CURRENT can't evaluate to NULL; the QUEUE_EMPTY check is
taking care of it.
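
Again from memory of drivers/block/blk.h: CURRENT is simply the first
entry of the queue list, so it is only meaningful once QUEUE_EMPTY has
been checked:

        #define QUEUE           (BLK_DEFAULT_QUEUE(MAJOR_NR))
        #define QUEUE_EMPTY     list_empty(&QUEUE->queue_head)
        #define CURRENT         blkdev_entry_next_request(&QUEUE->queue_head)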

I also think I know why you probably got into trouble. As Al Viro pointed
out to me, lots of block devices weren't ready to handle merged requests.
Lots of them (the ones with the head active) would be ready already if
end_request weren't so badly designed :(. end_request acts correctly when
there's been an I/O error. If no I/O error happened, then end_request
forgets to advance the ->sector field and to decrease nr_sectors in the
request, and so it corrupts everything unless you add the dirty hack
before calling it.

end_request should really hide the merging transparently. Fixing that is
trivial, but all the block devices that were working around that design
bug would have to be ported to the new semantics, and that's sometimes not
obvious. It's certainly a cleanup to do in the long term IMHO.
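
Until that happens, the hack can at least be packaged in one place; a
sketch with a hypothetical helper name (this is exactly what the patch
below open-codes in nbd_end_request):

        /* advance past the completed chunk before completing it, so the
         * driver never touches ->sector / ->nr_sectors by hand */
        static inline int nbd_complete_chunk(struct request *req, int uptodate)
        {
                if (uptodate) {
                        req->sector += req->current_nr_sectors;
                        req->nr_sectors -= req->current_nr_sectors;
                }
                return end_that_request_first(req, uptodate, "nbd");
        }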

I did this patch for nbd; could you try it out and continue along these
lines if it doesn't fix everything?

Note that the plugging bit is going to avoid merged requests, but since
you drop the lock inside the nbd request_fn you are not assured that there
won't be merged requests. The NBD_PLUGGABLE #define is mainly for trying
out how performance/latency change. If somebody needs an enforced
guarantee that requests are never merged, that should be added as a
bitflag in the blk_dev_struct.
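
Purely as a sketch of that last suggestion (the flags field and the bit
are hypothetical, nothing like them exists in 2.3.49; __make_request
would test the bit before attempting any merge):

        struct blk_dev_struct {
                request_queue_t         request_queue;
                queue_proc              *queue;
                void                    *data;
                unsigned long           flags;          /* hypothetical */
        #define BLKDEV_NOMERGE  0x01                    /* hypothetical */
        };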

--- 2.3.49/drivers/block/nbd.c.~1~ Mon Feb 21 15:17:31 2000
+++ 2.3.49/drivers/block/nbd.c Sat Mar 4 16:55:00 2000
@@ -24,6 +24,7 @@
  * structure with userland
  */
 
+#undef NBD_PLUGGABLE
 #define PARANOIA
 #include <linux/major.h>
 
@@ -62,10 +63,11 @@
 static int requests_out;
 #endif
 
+static void nbd_plug_device(request_queue_t *q, kdev_t dev) { }
+
 static int nbd_open(struct inode *inode, struct file *file)
 {
         int dev;
- struct nbd_device *nbdev;
 
         if (!inode)
                 return -EINVAL;
@@ -73,12 +75,7 @@
         if (dev >= MAX_NBD)
                 return -ENODEV;
 
- nbdev = &nbd_dev[dev];
         nbd_dev[dev].refcnt++;
- if (!(nbdev->flags & NBD_INITIALISED)) {
- init_MUTEX(&nbdev->queue_lock);
- nbdev->flags |= NBD_INITIALISED;
- }
         MOD_INC_USE_COUNT;
         return 0;
 }
@@ -215,6 +212,7 @@
 void nbd_do_it(struct nbd_device *lo)
 {
         struct request *req;
+ int dequeued;
 
         down (&lo->queue_lock);
         while (!list_empty(&lo->queue_head)) {
@@ -237,9 +235,11 @@
                 list_del(&req->queue);
                 up (&lo->queue_lock);
                 
- nbd_end_request(req);
+ dequeued = nbd_end_request(req);
 
                 down (&lo->queue_lock);
+ if (!dequeued)
+ list_add(&req->queue, &lo->queue_head);
         }
  out:
         up (&lo->queue_lock);
@@ -248,6 +248,7 @@
 void nbd_clear_que(struct nbd_device *lo)
 {
         struct request *req;
+ int dequeued;
 
         while (!list_empty(&lo->queue_head)) {
                 req = blkdev_entry_prev_request(&lo->queue_head);
@@ -265,9 +266,11 @@
                 list_del(&req->queue);
                 up(&lo->queue_lock);
 
- nbd_end_request(req);
+ dequeued = nbd_end_request(req);
 
                 down(&lo->queue_lock);
+ if (!dequeued)
+ list_add(&req->queue, &lo->queue_head);
         }
 }
 
@@ -469,12 +472,17 @@
         blksize_size[MAJOR_NR] = nbd_blksizes;
         blk_size[MAJOR_NR] = nbd_sizes;
         blk_init_queue(BLK_DEFAULT_QUEUE(MAJOR_NR), do_nbd_request);
+#ifndef NBD_PLUGGABLE
+ blk_queue_pluggable(BLK_DEFAULT_QUEUE(MAJOR_NR), nbd_plug_device);
+#endif
         blk_queue_headactive(BLK_DEFAULT_QUEUE(MAJOR_NR), 0);
         for (i = 0; i < MAX_NBD; i++) {
                 nbd_dev[i].refcnt = 0;
                 nbd_dev[i].file = NULL;
                 nbd_dev[i].magic = LO_MAGIC;
                 nbd_dev[i].flags = 0;
+ INIT_LIST_HEAD(&nbd_dev[i].queue_head);
+ init_MUTEX(&nbd_dev[i].queue_lock);
                 nbd_blksizes[i] = 1024;
                 nbd_blksize_bits[i] = 10;
                 nbd_bytesizes[i] = 0x7ffffc00; /* 2GB */
--- 2.3.49/include/linux/nbd.h.~1~ Thu Feb 17 14:05:33 2000
+++ 2.3.49/include/linux/nbd.h Sat Mar 4 16:43:55 2000
@@ -30,22 +30,35 @@
 extern int requests_out;
 #endif
 
-static void
+static int
 nbd_end_request(struct request *req)
 {
         unsigned long flags;
+ int ret = 0;
 
 #ifdef PARANOIA
         requests_out++;
 #endif
+ /*
+ * This is a very dirty hack that we have to do to handle
+ * merged requests because end_request stuff is a bit
+ * broken. The fact we have to do this only if there
+ * aren't errors looks even more silly.
+ */
+ if (!req->errors) {
+ req->sector += req->current_nr_sectors;
+ req->nr_sectors -= req->current_nr_sectors;
+ }
+
         spin_lock_irqsave(&io_request_lock, flags);
         if (end_that_request_first( req, !req->errors, "nbd" ))
                 goto out;
+ ret = 1;
         end_that_request_last( req );
 
 out:
         spin_unlock_irqrestore(&io_request_lock, flags);
- return;
+ return ret;
 }
 
 #define MAX_NBD 128
@@ -56,7 +69,6 @@
         int harderror; /* Code of hard error */
 #define NBD_READ_ONLY 0x0001
 #define NBD_WRITE_NOCHK 0x0002
-#define NBD_INITIALISED 0x0004
         struct socket * sock;
         struct file * file; /* If == NULL, device is not ready, yet */
         int magic; /* FIXME: not if debugging is off */

Thanks.

Andrea
