[PATCH] Kiobuf and raw IO fixes for 2.3.50

From: Stephen C. Tweedie (sct@redhat.com)
Date: Thu Mar 09 2000 - 22:39:37 EST


Hi all,

The patch below fixes a number of kiobuf and raw IO bugs in 2.3.50.
Some of these directly affect the functioning of /dev/raw/*; others only
affect the internal kiobuf functions available to other kernel modules.

Fixes include:

 * Fix the error return when querying unbound raw devices, for
   compatibility with existing util-linux "raw -qa".

 * The IO completion used for raw IO on buffer_heads depended on kiobufs
   in a way which simply did not work well when doing IO on an iovec of
   multiple kiobufs. brw_kiovec() now waits for all the required
   buffers rather than waiting on just the last kiobuf in the iovec.

 * brw_kiovec() now returns -EFAULT if it encounters a null page in a
   kiobuf.

 * Removed the redundant page_array field from struct kiobuf

 * Added a number of missing exports in kernel/ksyms.c

 * Export a separate kiobuf_init() function to initialise the internal
   structures of statically allocated kiobufs

 * Allow the ZERO_PAGE and other reserved pages to be mapped by
   map_user_kiobuf()

 * Separate out the locking from map_user_kiobuf: mapping a kiobuf no
   longer automatically locks each page, and an entire kiovec can be
   locked atomically if needed (a rough sketch of the new calling
   sequence follows this list).

 * Pages are no longer locked by the raw character device driver (we
   discussed this before, and it is needed at least for writing from
   demand-zero areas where ZERO_PAGE may be multiply mapped).
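
Below is a rough, purely illustrative sketch of the calling sequence a
driver would use with the revised interfaces. It is not part of the
patch: the device, block numbers and error handling are placeholders,
uaddr/len are assumed to be multiples of the block size, and it assumes
brw_kiovec() keeps its current (rw, nr, iovec, dev, blocks, size)
signature.

#include <linux/fs.h>
#include <linux/iobuf.h>
#include <linux/errno.h>

/* Hypothetical helper: read 'len' bytes from block device 'dev'
   directly into the user buffer at 'uaddr', 'size' bytes per block. */
static int example_read_to_user(kdev_t dev, unsigned long uaddr,
                                size_t len, int size)
{
        struct kiobuf *iobuf;
        unsigned long b[KIO_MAX_SECTORS];
        int blocks, i, err;

        blocks = len / size;
        if (blocks <= 0 || blocks > KIO_MAX_SECTORS)
                return -EINVAL;

        err = alloc_kiovec(1, &iobuf);
        if (err)
                return err;

        /* Mapping now only pins the user pages; it no longer locks them. */
        err = map_user_kiobuf(READ, iobuf, uaddr, len);
        if (err)
                goto out_free;

        /* Locking is now a separate, optional step; the raw driver
           itself no longer bothers (see the #if 0 in raw.c below). */
        err = lock_kiovec(1, &iobuf, 1);
        if (err)
                goto out_unmap;

        for (i = 0; i < blocks; i++)
                b[i] = i;                 /* placeholder block numbers */

        /* brw_kiovec() waits for every buffer it submits and returns
           the byte count transferred, or -EFAULT on a null page. */
        err = brw_kiovec(READ, 1, &iobuf, dev, b, size);
        if (err == (int) len)
                err = 0;                  /* full transfer */
        else if (err >= 0)
                err = -EIO;               /* short transfer */

 out_unmap:
        /* Drops the page references; the unlock is implicit here if
           the kiobuf is still marked locked. */
        unmap_kiobuf(iobuf);
 out_free:
        free_kiovec(1, &iobuf);
        return err;
}

A statically allocated kiobuf would instead be set up once with the
newly exported kiobuf_init() before being mapped and used in the same
way.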

--Stephen

----------------------------------------------------------------
--- linux-2.3.50/drivers/char/raw.c.~1~ Mon Feb 28 15:05:34 2000
+++ linux-2.3.50/drivers/char/raw.c Thu Mar 9 17:49:15 2000
@@ -197,14 +197,17 @@
                         raw_device_bindings[minor] =
                                 bdget(kdev_t_to_nr(MKDEV(rq.block_major, rq.block_minor)));
                 } else {
+ struct block_device *bdev;
                         kdev_t dev;
- if (!raw_device_bindings[minor]) {
- err = -ENODEV;
- break;
+
+ bdev = raw_device_bindings[minor];
+ if (bdev) {
+ dev = to_kdev_t(bdev->bd_dev);
+ rq.block_major = MAJOR(dev);
+ rq.block_minor = MINOR(dev);
+ } else {
+ rq.block_major = rq.block_minor = 0;
                         }
- dev = to_kdev_t(raw_device_bindings[minor]->bd_dev);
- rq.block_major = MAJOR(dev);
- rq.block_minor = MINOR(dev);
                         err = copy_to_user((void *) arg, &rq, sizeof(rq));
                 }
                 break;
@@ -304,7 +307,12 @@
                 err = map_user_kiobuf(rw, iobuf, (unsigned long) buf, iosize);
                 if (err)
                         break;
-
+#if 0
+ err = lock_kiovec(1, &iobuf, 1);
+ if (err)
+ break;
+#endif
+
                 for (i=0; i < blocks; i++)
                         b[i] = blocknr++;
                 
@@ -316,7 +324,7 @@
                         buf += err;
                 }
 
- unmap_kiobuf(iobuf);
+ unmap_kiobuf(iobuf); /* The unlock_kiobuf is implicit here */
 
                 if (err != iosize)
                         break;
--- linux-2.3.50/fs/buffer.c.~1~ Thu Mar 9 17:48:18 2000
+++ linux-2.3.50/fs/buffer.c Thu Mar 9 17:49:15 2000
@@ -1754,10 +1754,10 @@
         mark_buffer_uptodate(bh, uptodate);
 
         kiobuf = bh->b_kiobuf;
- if (!uptodate)
- kiobuf->errno = -EIO;
- if (atomic_dec_and_test(&kiobuf->io_count))
- kiobuf->end_io(kiobuf);
+ unlock_buffer(bh);
+
+ kiobuf = bh->b_kiobuf;
+ end_kio_request(kiobuf, uptodate);
 }
 
 
@@ -1766,8 +1766,7 @@
  * for them to complete. Clean up the buffer_heads afterwards.
  */
 
-static int do_kio(struct kiobuf *kiobuf,
- int rw, int nr, struct buffer_head *bh[], int size)
+static int do_kio(int rw, int nr, struct buffer_head *bh[], int size)
 {
         int iosize;
         int i;
@@ -1778,18 +1777,20 @@
 
         if (rw == WRITE)
                 rw = WRITERAW;
- atomic_add(nr, &kiobuf->io_count);
- kiobuf->errno = 0;
         ll_rw_block(rw, nr, bh);
 
- kiobuf_wait_for_io(kiobuf);
-
- spin_lock(&unused_list_lock);
-
         iosize = 0;
+ spin_lock(&unused_list_lock);
+
         for (i = nr; --i >= 0; ) {
                 iosize += size;
                 tmp = bh[i];
+ if (buffer_locked(tmp)) {
+ spin_unlock(&unused_list_lock);
+ wait_on_buffer(tmp);
+ spin_lock(&unused_list_lock);
+ }
+
                 if (!buffer_uptodate(tmp)) {
                         /* We are traversing bh'es in reverse order so
                            clearing iosize on error calculates the
@@ -1801,11 +1802,7 @@
         
         spin_unlock(&unused_list_lock);
 
- if (iosize)
- return iosize;
- if (kiobuf->errno)
- return kiobuf->errno;
- return -EIO;
+ return iosize;
 }
 
 /*
@@ -1847,8 +1844,6 @@
                 if ((iobuf->offset & (size-1)) ||
                     (iobuf->length & (size-1)))
                         return -EINVAL;
- if (!iobuf->locked)
- panic("brw_kiovec: iobuf not locked for I/O");
                 if (!iobuf->nr_pages)
                         panic("brw_kiovec: iobuf not initialised");
         }
@@ -1861,10 +1856,15 @@
                 iobuf = iovec[i];
                 offset = iobuf->offset;
                 length = iobuf->length;
-
+ iobuf->errno = 0;
+
                 for (pageind = 0; pageind < iobuf->nr_pages; pageind++) {
                         map = iobuf->maplist[pageind];
-
+ if (!map) {
+ err = -EFAULT;
+ goto error;
+ }
+
                         while (length > 0) {
                                 blocknr = b[bufind++];
                                 tmp = get_unused_buffer_head(0);
@@ -1893,11 +1893,13 @@
                                 length -= size;
                                 offset += size;
 
+ atomic_inc(&iobuf->io_count);
+
                                 /*
                                  * Start the IO if we have got too much
                                  */
                                 if (bhind >= KIO_MAX_SECTORS) {
- err = do_kio(iobuf, rw, bhind, bh, size);
+ err = do_kio(rw, bhind, bh, size);
                                         if (err >= 0)
                                                 transferred += err;
                                         else
@@ -1915,7 +1917,7 @@
 
         /* Is there any IO still left to submit? */
         if (bhind) {
- err = do_kio(iobuf, rw, bhind, bh, size);
+ err = do_kio(rw, bhind, bh, size);
                 if (err >= 0)
                         transferred += err;
                 else
--- linux-2.3.50/fs/iobuf.c.~1~ Sun Jan 23 20:34:33 2000
+++ linux-2.3.50/fs/iobuf.c Thu Mar 9 17:49:15 2000
@@ -12,18 +12,21 @@
 
 static kmem_cache_t *kiobuf_cachep;
 
-/*
- * The default IO completion routine for kiobufs: just wake up
- * the kiobuf, nothing more.
- */
 
-void simple_wakeup_kiobuf(struct kiobuf *kiobuf)
+void end_kio_request(struct kiobuf *kiobuf, int uptodate)
 {
- wake_up(&kiobuf->wait_queue);
+ if ((!uptodate) && !kiobuf->errno)
+ kiobuf->errno = -EIO;
+
+ if (atomic_dec_and_test(&kiobuf->io_count)) {
+ if (kiobuf->end_io)
+ kiobuf->end_io(kiobuf);
+ wake_up(&kiobuf->wait_queue);
+ }
 }
 
 
-void __init kiobuf_init(void)
+void __init kiobuf_setup(void)
 {
         kiobuf_cachep = kmem_cache_create("kiobuf",
                                            sizeof(struct kiobuf),
@@ -33,6 +36,13 @@
                 panic("Cannot create kernel iobuf cache\n");
 }
 
+void kiobuf_init(struct kiobuf *iobuf)
+{
+ memset(iobuf, 0, sizeof(*iobuf));
+ init_waitqueue_head(&iobuf->wait_queue);
+ iobuf->array_len = KIO_STATIC_PAGES;
+ iobuf->maplist = iobuf->map_array;
+}
 
 int alloc_kiovec(int nr, struct kiobuf **bufp)
 {
@@ -45,12 +55,7 @@
                         free_kiovec(i, bufp);
                         return -ENOMEM;
                 }
-
- memset(iobuf, 0, sizeof(*iobuf));
- init_waitqueue_head(&iobuf->wait_queue);
- iobuf->end_io = simple_wakeup_kiobuf;
- iobuf->array_len = KIO_STATIC_PAGES;
- iobuf->maplist = iobuf->map_array;
+ kiobuf_init(iobuf);
                 *bufp++ = iobuf;
         }
         
@@ -64,6 +69,8 @@
         
         for (i = 0; i < nr; i++) {
                 iobuf = bufp[i];
+ if (iobuf->locked)
+ unlock_kiovec(1, &iobuf);
                 if (iobuf->array_len > KIO_STATIC_PAGES)
                         kfree (iobuf->maplist);
                 kmem_cache_free(kiobuf_cachep, bufp[i]);
@@ -103,6 +110,9 @@
 {
         struct task_struct *tsk = current;
         DECLARE_WAITQUEUE(wait, tsk);
+
+ if (atomic_read(&kiobuf->io_count) == 0)
+ return;
 
         add_wait_queue(&kiobuf->wait_queue, &wait);
 repeat:
--- linux-2.3.50/include/linux/iobuf.h.~1~ Wed Nov 3 18:41:48 1999
+++ linux-2.3.50/include/linux/iobuf.h Thu Mar 9 17:49:15 2000
@@ -29,6 +29,8 @@
 #define KIO_STATIC_PAGES (KIO_MAX_ATOMIC_IO / (PAGE_SIZE >> 10) + 1)
 #define KIO_MAX_SECTORS (KIO_MAX_ATOMIC_IO * 2)
 
+/* The main kiobuf struct used for all our IO! */
+
 struct kiobuf
 {
         int nr_pages; /* Pages actually referenced */
@@ -46,7 +48,6 @@
         unsigned int locked : 1; /* If set, pages has been locked */
         
         /* Always embed enough struct pages for 64k of IO */
- unsigned long page_array[KIO_STATIC_PAGES];
         struct page * map_array[KIO_STATIC_PAGES];
 
         /* Dynamic state for IO completion: */
@@ -61,10 +62,14 @@
 
 int map_user_kiobuf(int rw, struct kiobuf *, unsigned long va, size_t len);
 void unmap_kiobuf(struct kiobuf *iobuf);
+int lock_kiovec(int nr, struct kiobuf *iovec[], int wait);
+int unlock_kiovec(int nr, struct kiobuf *iovec[]);
 
 /* fs/iobuf.c */
 
-void __init kiobuf_init(void);
+void __init kiobuf_setup(void);
+void kiobuf_init(struct kiobuf *);
+void end_kio_request(struct kiobuf *, int);
 void simple_wakeup_kiobuf(struct kiobuf *);
 int alloc_kiovec(int nr, struct kiobuf **);
 void free_kiovec(int nr, struct kiobuf **);
--- linux-2.3.50/init/main.c.~1~ Mon Feb 28 15:05:36 2000
+++ linux-2.3.50/init/main.c Thu Mar 9 17:49:15 2000
@@ -533,7 +533,7 @@
         vma_init();
         buffer_init(mempages);
         page_cache_init(mempages);
- kiobuf_init();
+ kiobuf_setup();
         signals_init();
         bdev_init();
         inode_init();
--- linux-2.3.50/kernel/ksyms.c.~1~ Thu Mar 9 17:48:18 2000
+++ linux-2.3.50/kernel/ksyms.c Thu Mar 9 17:49:15 2000
@@ -157,11 +157,6 @@
 EXPORT_SYMBOL(mark_buffer_dirty);
 EXPORT_SYMBOL(__mark_buffer_dirty);
 EXPORT_SYMBOL(__mark_inode_dirty);
-EXPORT_SYMBOL(free_kiovec);
-EXPORT_SYMBOL(brw_kiovec);
-EXPORT_SYMBOL(alloc_kiovec);
-EXPORT_SYMBOL(expand_kiobuf);
-EXPORT_SYMBOL(unmap_kiobuf);
 EXPORT_SYMBOL(get_empty_filp);
 EXPORT_SYMBOL(init_private_file);
 EXPORT_SYMBOL(filp_open);
@@ -357,6 +352,18 @@
 EXPORT_SYMBOL(__br_write_lock);
 EXPORT_SYMBOL(__br_write_unlock);
 #endif
+
+/* Kiobufs */
+EXPORT_SYMBOL(kiobuf_init);
+
+EXPORT_SYMBOL(alloc_kiovec);
+EXPORT_SYMBOL(free_kiovec);
+EXPORT_SYMBOL(expand_kiobuf);
+
+EXPORT_SYMBOL(map_user_kiobuf);
+EXPORT_SYMBOL(lock_kiovec);
+EXPORT_SYMBOL(unlock_kiovec);
+EXPORT_SYMBOL(brw_kiovec);
 
 /* autoirq from drivers/net/auto_irq.c */
 EXPORT_SYMBOL(autoirq_setup);
--- linux-2.3.50/mm/memory.c.~1~ Thu Mar 9 17:48:18 2000
+++ linux-2.3.50/mm/memory.c Thu Mar 9 17:49:15 2000
@@ -398,28 +398,25 @@
                         return pte_page(*pte);
         }
         
- printk(KERN_ERR "Missing page in follow_page\n");
         return NULL;
 }
 
 /*
- * Given a physical address, is there a useful struct page pointing to it?
+ * Given a physical address, is there a useful struct page pointing to
+ * it? This may become more complex in the future if we start dealing
+ * with IO-aperture pages in kiobufs.
  */
 
-struct page * get_page_map(struct page *page, unsigned long vaddr)
+static inline struct page * get_page_map(struct page *page)
 {
- if (MAP_NR(vaddr) >= max_mapnr)
- return 0;
- if (page == ZERO_PAGE(vaddr))
- return 0;
- if (PageReserved(page))
+ if (page > (mem_map + max_mapnr))
                 return 0;
         return page;
 }
 
 /*
  * Force in an entire range of pages from the current process's user VA,
- * and pin and lock the pages for IO.
+ * and pin them in physical memory.
  */
 
 #define dprintk(x...)
@@ -430,8 +427,6 @@
         struct mm_struct * mm;
         struct vm_area_struct * vma = 0;
         struct page * map;
- int doublepage = 0;
- int repeat = 0;
         int i;
         
         /* Make sure the iobuf is not already mapped somewhere. */
@@ -447,11 +442,10 @@
         if (err)
                 return err;
 
- repeat:
         down(&mm->mmap_sem);
 
         err = -EFAULT;
- iobuf->locked = 1;
+ iobuf->locked = 0;
         iobuf->offset = va & ~PAGE_MASK;
         iobuf->length = len;
         
@@ -471,16 +465,15 @@
                 spin_lock(&mm->page_table_lock);
                 map = follow_page(ptr);
                 if (!map) {
+ spin_unlock(&mm->page_table_lock);
                         dprintk (KERN_ERR "Missing page in map_user_kiobuf\n");
- goto retry;
+ goto out_unlock;
                 }
- map = get_page_map(map, ptr);
- if (map) {
- if (TryLockPage(map)) {
- goto retry;
- }
+ map = get_page_map(map);
+ if (map)
                         atomic_inc(&map->count);
- }
+ else
+ printk (KERN_INFO "Mapped page missing [%d]\n", i);
                 spin_unlock(&mm->page_table_lock);
                 iobuf->maplist[i] = map;
                 iobuf->nr_pages = ++i;
@@ -497,42 +490,6 @@
         unmap_kiobuf(iobuf);
         dprintk ("map_user_kiobuf: end %d\n", err);
         return err;
-
- retry:
-
- /*
- * Undo the locking so far, wait on the page we got to, and try again.
- */
- spin_unlock(&mm->page_table_lock);
- unmap_kiobuf(iobuf);
- up(&mm->mmap_sem);
-
- /*
- * Did the release also unlock the page we got stuck on?
- */
- if (map) {
- if (!PageLocked(map)) {
- /* If so, we may well have the page mapped twice
- * in the IO address range. Bad news. Of
- * course, it _might_ * just be a coincidence,
- * but if it happens more than * once, chances
- * are we have a double-mapped page. */
- if (++doublepage >= 3) {
- return -EINVAL;
- }
- }
-
- /*
- * Try again...
- */
- wait_on_page(map);
- }
-
- if (++repeat < 16) {
- ptr = va & PAGE_MASK;
- goto repeat;
- }
- return -EAGAIN;
 }
 
 
@@ -548,15 +505,118 @@
         
         for (i = 0; i < iobuf->nr_pages; i++) {
                 map = iobuf->maplist[i];
-
- if (map && iobuf->locked) {
- UnlockPage(map);
+ if (map) {
+ if (iobuf->locked)
+ UnlockPage(map);
                         __free_page(map);
                 }
         }
         
         iobuf->nr_pages = 0;
         iobuf->locked = 0;
+}
+
+
+/*
+ * Lock down all of the pages of a kiovec for IO.
+ *
+ * If any page is mapped twice in the kiovec, we return the error -EINVAL.
+ *
+ * The optional wait parameter causes the lock call to block until all
+ * pages can be locked if set. If wait==0, the lock operation is
+ * aborted if any locked pages are found and -EAGAIN is returned.
+ */
+
+int lock_kiovec(int nr, struct kiobuf *iovec[], int wait)
+{
+ struct kiobuf *iobuf;
+ int i, j;
+ struct page *page, **ppage;
+ int doublepage = 0;
+ int repeat = 0;
+
+ repeat:
+
+ for (i = 0; i < nr; i++) {
+ iobuf = iovec[i];
+
+ if (iobuf->locked)
+ continue;
+ iobuf->locked = 1;
+
+ ppage = iobuf->maplist;
+ for (j = 0; j < iobuf->nr_pages; ppage++, j++) {
+ page = *ppage;
+ if (!page)
+ continue;
+
+ if (TryLockPage(page))
+ goto retry;
+ }
+ }
+
+ return 0;
+
+ retry:
+
+ /*
+ * We couldn't lock one of the pages. Undo the locking so far,
+ * wait on the page we got to, and try again.
+ */
+
+ unlock_kiovec(nr, iovec);
+ if (!wait)
+ return -EAGAIN;
+
+ /*
+ * Did the release also unlock the page we got stuck on?
+ */
+ if (!PageLocked(page)) {
+ /*
+ * If so, we may well have the page mapped twice
+ * in the IO address range. Bad news. Of
+ * course, it _might_ just be a coincidence,
+ * but if it happens more than once, chances
+ * are we have a double-mapped page.
+ */
+ if (++doublepage >= 3)
+ return -EINVAL;
+
+ /* Try again... */
+ wait_on_page(page);
+ }
+
+ if (++repeat < 16)
+ goto repeat;
+ return -EAGAIN;
+}
+
+/*
+ * Unlock all of the pages of a kiovec after IO.
+ */
+
+int unlock_kiovec(int nr, struct kiobuf *iovec[])
+{
+ struct kiobuf *iobuf;
+ int i, j;
+ struct page *page, **ppage;
+
+ for (i = 0; i < nr; i++) {
+ iobuf = iovec[i];
+
+ if (!iobuf->locked)
+ continue;
+ iobuf->locked = 0;
+
+ ppage = iobuf->maplist;
+ for (j = 0; j < iobuf->nr_pages; ppage++, j++) {
+ page = *ppage;
+ if (!page)
+ continue;
+ UnlockPage(page);
+ }
+ }
+ return 0;
 }
 
 static inline void zeromap_pte_range(pte_t * pte, unsigned long address,
