Hi all,
The patch below fixes a number of kiobuf and raw IO bugs in 2.3.50.
Some of these directly affect the functioning of /dev/raw/*; others only
affect the internal kiobuf functions available to other kernel modules.
Fixes include:
* Fix the error return when querying unbound raw devices, for
compatibility with existing util-linux "raw -qa".
* The IO completion used for raw IO on buffer_heads depended on kiobufs
in a way which simply did not work well when doing IO on an iovec of
multiple kiobufs. brw_kiovec() now waits for all the required
buffers rather than waiting on just the last kiobuf in the iovec.
* brw_kiovec() returns -EFAULT if it encounters a null page in a
kiobuf.
* Removed redundant pagelist field from struct kiobuf
* Added a number of missing exports in kernel/ksyms.c
* Export a separate kiobuf_init() function to initialise the internal
structures of statically allocated kiobufs
* Allow the ZERO_PAGE and other reserved pages to be mapped by
map_user_kiobuf()
* Separate out the locking from map_user_kiobuf: mapping a kiobuf no
longer automatically locks each page, and an entire kiovec can be
locked atomically if needed.
* Pages are no longer locked by the raw character device driver (we
discussed this before, and it is needed at least for writing from
demand-zero areas where ZERO_PAGE may be multiply mapped).
--Stephen
----------------------------------------------------------------
--- linux-2.3.50/drivers/char/raw.c.~1~ Mon Feb 28 15:05:34 2000
+++ linux-2.3.50/drivers/char/raw.c Thu Mar 9 17:49:15 2000
@@ -197,14 +197,17 @@
raw_device_bindings[minor] =
bdget(kdev_t_to_nr(MKDEV(rq.block_major, rq.block_minor)));
} else {
+ struct block_device *bdev;
kdev_t dev;
- if (!raw_device_bindings[minor]) {
- err = -ENODEV;
- break;
+
+ bdev = raw_device_bindings[minor];
+ if (bdev) {
+ dev = to_kdev_t(bdev->bd_dev);
+ rq.block_major = MAJOR(dev);
+ rq.block_minor = MINOR(dev);
+ } else {
+ rq.block_major = rq.block_minor = 0;
}
- dev = to_kdev_t(raw_device_bindings[minor]->bd_dev);
- rq.block_major = MAJOR(dev);
- rq.block_minor = MINOR(dev);
err = copy_to_user((void *) arg, &rq, sizeof(rq));
}
break;
@@ -304,7 +307,12 @@
err = map_user_kiobuf(rw, iobuf, (unsigned long) buf, iosize);
if (err)
break;
-
+#if 0
+ err = lock_kiovec(1, &iobuf, 1);
+ if (err)
+ break;
+#endif
+
for (i=0; i < blocks; i++)
b[i] = blocknr++;
@@ -316,7 +324,7 @@
buf += err;
}
- unmap_kiobuf(iobuf);
+ unmap_kiobuf(iobuf); /* The unlock_kiobuf is implicit here */
if (err != iosize)
break;
--- linux-2.3.50/fs/buffer.c.~1~ Thu Mar 9 17:48:18 2000
+++ linux-2.3.50/fs/buffer.c Thu Mar 9 17:49:15 2000
@@ -1754,10 +1754,10 @@
mark_buffer_uptodate(bh, uptodate);
kiobuf = bh->b_kiobuf;
- if (!uptodate)
- kiobuf->errno = -EIO;
- if (atomic_dec_and_test(&kiobuf->io_count))
- kiobuf->end_io(kiobuf);
+ unlock_buffer(bh);
+
+ kiobuf = bh->b_kiobuf;
+ end_kio_request(kiobuf, uptodate);
}
@@ -1766,8 +1766,7 @@
* for them to complete. Clean up the buffer_heads afterwards.
*/
-static int do_kio(struct kiobuf *kiobuf,
- int rw, int nr, struct buffer_head *bh[], int size)
+static int do_kio(int rw, int nr, struct buffer_head *bh[], int size)
{
int iosize;
int i;
@@ -1778,18 +1777,20 @@
if (rw == WRITE)
rw = WRITERAW;
- atomic_add(nr, &kiobuf->io_count);
- kiobuf->errno = 0;
ll_rw_block(rw, nr, bh);
- kiobuf_wait_for_io(kiobuf);
-
- spin_lock(&unused_list_lock);
-
iosize = 0;
+ spin_lock(&unused_list_lock);
+
for (i = nr; --i >= 0; ) {
iosize += size;
tmp = bh[i];
+ if (buffer_locked(tmp)) {
+ spin_unlock(&unused_list_lock);
+ wait_on_buffer(tmp);
+ spin_lock(&unused_list_lock);
+ }
+
if (!buffer_uptodate(tmp)) {
/* We are traversing bh'es in reverse order so
clearing iosize on error calculates the
@@ -1801,11 +1802,7 @@
spin_unlock(&unused_list_lock);
- if (iosize)
- return iosize;
- if (kiobuf->errno)
- return kiobuf->errno;
- return -EIO;
+ return iosize;
}
/*
@@ -1847,8 +1844,6 @@
if ((iobuf->offset & (size-1)) ||
(iobuf->length & (size-1)))
return -EINVAL;
- if (!iobuf->locked)
- panic("brw_kiovec: iobuf not locked for I/O");
if (!iobuf->nr_pages)
panic("brw_kiovec: iobuf not initialised");
}
@@ -1861,10 +1856,15 @@
iobuf = iovec[i];
offset = iobuf->offset;
length = iobuf->length;
-
+ iobuf->errno = 0;
+
for (pageind = 0; pageind < iobuf->nr_pages; pageind++) {
map = iobuf->maplist[pageind];
-
+ if (!map) {
+ err = -EFAULT;
+ goto error;
+ }
+
while (length > 0) {
blocknr = b[bufind++];
tmp = get_unused_buffer_head(0);
@@ -1893,11 +1893,13 @@
length -= size;
offset += size;
+ atomic_inc(&iobuf->io_count);
+
/*
* Start the IO if we have got too much
*/
if (bhind >= KIO_MAX_SECTORS) {
- err = do_kio(iobuf, rw, bhind, bh, size);
+ err = do_kio(rw, bhind, bh, size);
if (err >= 0)
transferred += err;
else
@@ -1915,7 +1917,7 @@
/* Is there any IO still left to submit? */
if (bhind) {
- err = do_kio(iobuf, rw, bhind, bh, size);
+ err = do_kio(rw, bhind, bh, size);
if (err >= 0)
transferred += err;
else
--- linux-2.3.50/fs/iobuf.c.~1~ Sun Jan 23 20:34:33 2000
+++ linux-2.3.50/fs/iobuf.c Thu Mar 9 17:49:15 2000
@@ -12,18 +12,21 @@
static kmem_cache_t *kiobuf_cachep;
-/*
- * The default IO completion routine for kiobufs: just wake up
- * the kiobuf, nothing more.
- */
-void simple_wakeup_kiobuf(struct kiobuf *kiobuf)
+void end_kio_request(struct kiobuf *kiobuf, int uptodate)
{
- wake_up(&kiobuf->wait_queue);
+ if ((!uptodate) && !kiobuf->errno)
+ kiobuf->errno = -EIO;
+
+ if (atomic_dec_and_test(&kiobuf->io_count)) {
+ if (kiobuf->end_io)
+ kiobuf->end_io(kiobuf);
+ wake_up(&kiobuf->wait_queue);
+ }
}
-void __init kiobuf_init(void)
+void __init kiobuf_setup(void)
{
kiobuf_cachep = kmem_cache_create("kiobuf",
sizeof(struct kiobuf),
@@ -33,6 +36,13 @@
panic("Cannot create kernel iobuf cache\n");
}
+void kiobuf_init(struct kiobuf *iobuf)
+{
+ memset(iobuf, 0, sizeof(*iobuf));
+ init_waitqueue_head(&iobuf->wait_queue);
+ iobuf->array_len = KIO_STATIC_PAGES;
+ iobuf->maplist = iobuf->map_array;
+}
int alloc_kiovec(int nr, struct kiobuf **bufp)
{
@@ -45,12 +55,7 @@
free_kiovec(i, bufp);
return -ENOMEM;
}
-
- memset(iobuf, 0, sizeof(*iobuf));
- init_waitqueue_head(&iobuf->wait_queue);
- iobuf->end_io = simple_wakeup_kiobuf;
- iobuf->array_len = KIO_STATIC_PAGES;
- iobuf->maplist = iobuf->map_array;
+ kiobuf_init(iobuf);
*bufp++ = iobuf;
}
@@ -64,6 +69,8 @@
for (i = 0; i < nr; i++) {
iobuf = bufp[i];
+ if (iobuf->locked)
+ unlock_kiovec(1, &iobuf);
if (iobuf->array_len > KIO_STATIC_PAGES)
kfree (iobuf->maplist);
kmem_cache_free(kiobuf_cachep, bufp[i]);
@@ -103,6 +110,9 @@
{
struct task_struct *tsk = current;
DECLARE_WAITQUEUE(wait, tsk);
+
+ if (atomic_read(&kiobuf->io_count) == 0)
+ return;
add_wait_queue(&kiobuf->wait_queue, &wait);
repeat:
--- linux-2.3.50/include/linux/iobuf.h.~1~ Wed Nov 3 18:41:48 1999
+++ linux-2.3.50/include/linux/iobuf.h Thu Mar 9 17:49:15 2000
@@ -29,6 +29,8 @@
#define KIO_STATIC_PAGES (KIO_MAX_ATOMIC_IO / (PAGE_SIZE >> 10) + 1)
#define KIO_MAX_SECTORS (KIO_MAX_ATOMIC_IO * 2)
+/* The main kiobuf struct used for all our IO! */
+
struct kiobuf
{
int nr_pages; /* Pages actually referenced */
@@ -46,7 +48,6 @@
unsigned int locked : 1; /* If set, pages has been locked */
/* Always embed enough struct pages for 64k of IO */
- unsigned long page_array[KIO_STATIC_PAGES];
struct page * map_array[KIO_STATIC_PAGES];
/* Dynamic state for IO completion: */
@@ -61,10 +62,14 @@
int map_user_kiobuf(int rw, struct kiobuf *, unsigned long va, size_t len);
void unmap_kiobuf(struct kiobuf *iobuf);
+int lock_kiovec(int nr, struct kiobuf *iovec[], int wait);
+int unlock_kiovec(int nr, struct kiobuf *iovec[]);
/* fs/iobuf.c */
-void __init kiobuf_init(void);
+void __init kiobuf_setup(void);
+void kiobuf_init(struct kiobuf *);
+void end_kio_request(struct kiobuf *, int);
void simple_wakeup_kiobuf(struct kiobuf *);
int alloc_kiovec(int nr, struct kiobuf **);
void free_kiovec(int nr, struct kiobuf **);
--- linux-2.3.50/init/main.c.~1~ Mon Feb 28 15:05:36 2000
+++ linux-2.3.50/init/main.c Thu Mar 9 17:49:15 2000
@@ -533,7 +533,7 @@
vma_init();
buffer_init(mempages);
page_cache_init(mempages);
- kiobuf_init();
+ kiobuf_setup();
signals_init();
bdev_init();
inode_init();
--- linux-2.3.50/kernel/ksyms.c.~1~ Thu Mar 9 17:48:18 2000
+++ linux-2.3.50/kernel/ksyms.c Thu Mar 9 17:49:15 2000
@@ -157,11 +157,6 @@
EXPORT_SYMBOL(mark_buffer_dirty);
EXPORT_SYMBOL(__mark_buffer_dirty);
EXPORT_SYMBOL(__mark_inode_dirty);
-EXPORT_SYMBOL(free_kiovec);
-EXPORT_SYMBOL(brw_kiovec);
-EXPORT_SYMBOL(alloc_kiovec);
-EXPORT_SYMBOL(expand_kiobuf);
-EXPORT_SYMBOL(unmap_kiobuf);
EXPORT_SYMBOL(get_empty_filp);
EXPORT_SYMBOL(init_private_file);
EXPORT_SYMBOL(filp_open);
@@ -357,6 +352,18 @@
EXPORT_SYMBOL(__br_write_lock);
EXPORT_SYMBOL(__br_write_unlock);
#endif
+
+/* Kiobufs */
+EXPORT_SYMBOL(kiobuf_init);
+
+EXPORT_SYMBOL(alloc_kiovec);
+EXPORT_SYMBOL(free_kiovec);
+EXPORT_SYMBOL(expand_kiobuf);
+
+EXPORT_SYMBOL(map_user_kiobuf);
+EXPORT_SYMBOL(lock_kiovec);
+EXPORT_SYMBOL(unlock_kiovec);
+EXPORT_SYMBOL(brw_kiovec);
/* autoirq from drivers/net/auto_irq.c */
EXPORT_SYMBOL(autoirq_setup);
--- linux-2.3.50/mm/memory.c.~1~ Thu Mar 9 17:48:18 2000
+++ linux-2.3.50/mm/memory.c Thu Mar 9 17:49:15 2000
@@ -398,28 +398,25 @@
return pte_page(*pte);
}
- printk(KERN_ERR "Missing page in follow_page\n");
return NULL;
}
/*
- * Given a physical address, is there a useful struct page pointing to it?
+ * Given a physical address, is there a useful struct page pointing to
+ * it? This may become more complex in the future if we start dealing
+ * with IO-aperture pages in kiobufs.
*/
-struct page * get_page_map(struct page *page, unsigned long vaddr)
+static inline struct page * get_page_map(struct page *page)
{
- if (MAP_NR(vaddr) >= max_mapnr)
- return 0;
- if (page == ZERO_PAGE(vaddr))
- return 0;
- if (PageReserved(page))
+ if (page > (mem_map + max_mapnr))
return 0;
return page;
}
/*
* Force in an entire range of pages from the current process's user VA,
- * and pin and lock the pages for IO.
+ * and pin them in physical memory.
*/
#define dprintk(x...)
@@ -430,8 +427,6 @@
struct mm_struct * mm;
struct vm_area_struct * vma = 0;
struct page * map;
- int doublepage = 0;
- int repeat = 0;
int i;
/* Make sure the iobuf is not already mapped somewhere. */
@@ -447,11 +442,10 @@
if (err)
return err;
- repeat:
down(&mm->mmap_sem);
err = -EFAULT;
- iobuf->locked = 1;
+ iobuf->locked = 0;
iobuf->offset = va & ~PAGE_MASK;
iobuf->length = len;
@@ -471,16 +465,15 @@
spin_lock(&mm->page_table_lock);
map = follow_page(ptr);
if (!map) {
+ spin_unlock(&mm->page_table_lock);
dprintk (KERN_ERR "Missing page in map_user_kiobuf\n");
- goto retry;
+ goto out_unlock;
}
- map = get_page_map(map, ptr);
- if (map) {
- if (TryLockPage(map)) {
- goto retry;
- }
+ map = get_page_map(map);
+ if (map)
atomic_inc(&map->count);
- }
+ else
+ printk (KERN_INFO "Mapped page missing [%d]\n", i);
spin_unlock(&mm->page_table_lock);
iobuf->maplist[i] = map;
iobuf->nr_pages = ++i;
@@ -497,42 +490,6 @@
unmap_kiobuf(iobuf);
dprintk ("map_user_kiobuf: end %d\n", err);
return err;
-
- retry:
-
- /*
- * Undo the locking so far, wait on the page we got to, and try again.
- */
- spin_unlock(&mm->page_table_lock);
- unmap_kiobuf(iobuf);
- up(&mm->mmap_sem);
-
- /*
- * Did the release also unlock the page we got stuck on?
- */
- if (map) {
- if (!PageLocked(map)) {
- /* If so, we may well have the page mapped twice
- * in the IO address range. Bad news. Of
- * course, it _might_ * just be a coincidence,
- * but if it happens more than * once, chances
- * are we have a double-mapped page. */
- if (++doublepage >= 3) {
- return -EINVAL;
- }
- }
-
- /*
- * Try again...
- */
- wait_on_page(map);
- }
-
- if (++repeat < 16) {
- ptr = va & PAGE_MASK;
- goto repeat;
- }
- return -EAGAIN;
}
@@ -548,15 +505,118 @@
for (i = 0; i < iobuf->nr_pages; i++) {
map = iobuf->maplist[i];
-
- if (map && iobuf->locked) {
- UnlockPage(map);
+ if (map) {
+ if (iobuf->locked)
+ UnlockPage(map);
__free_page(map);
}
}
iobuf->nr_pages = 0;
iobuf->locked = 0;
+}
+
+
+/*
+ * Lock down all of the pages of a kiovec for IO.
+ *
+ * If any page is mapped twice in the kiovec, we return the error -EINVAL.
+ *
+ * The optional wait parameter causes the lock call to block until all
+ * pages can be locked if set. If wait==0, the lock operation is
+ * aborted if any locked pages are found and -EAGAIN is returned.
+ */
+
+int lock_kiovec(int nr, struct kiobuf *iovec[], int wait)
+{
+ struct kiobuf *iobuf;
+ int i, j;
+ struct page *page, **ppage;
+ int doublepage = 0;
+ int repeat = 0;
+
+ repeat:
+
+ for (i = 0; i < nr; i++) {
+ iobuf = iovec[i];
+
+ if (iobuf->locked)
+ continue;
+ iobuf->locked = 1;
+
+ ppage = iobuf->maplist;
+ for (j = 0; j < iobuf->nr_pages; ppage++, j++) {
+ page = *ppage;
+ if (!page)
+ continue;
+
+ if (TryLockPage(page))
+ goto retry;
+ }
+ }
+
+ return 0;
+
+ retry:
+
+ /*
+ * We couldn't lock one of the pages. Undo the locking so far,
+ * wait on the page we got to, and try again.
+ */
+
+ unlock_kiovec(nr, iovec);
+ if (!wait)
+ return -EAGAIN;
+
+ /*
+ * Did the release also unlock the page we got stuck on?
+ */
+ if (!PageLocked(page)) {
+ /*
+ * If so, we may well have the page mapped twice
+ * in the IO address range. Bad news. Of
+ * course, it _might_ just be a coincidence,
+ * but if it happens more than once, chances
+ * are we have a double-mapped page.
+ */
+ if (++doublepage >= 3)
+ return -EINVAL;
+
+ /* Try again... */
+ wait_on_page(page);
+ }
+
+ if (++repeat < 16)
+ goto repeat;
+ return -EAGAIN;
+}
+
+/*
+ * Unlock all of the pages of a kiovec after IO.
+ */
+
+int unlock_kiovec(int nr, struct kiobuf *iovec[])
+{
+ struct kiobuf *iobuf;
+ int i, j;
+ struct page *page, **ppage;
+
+ for (i = 0; i < nr; i++) {
+ iobuf = iovec[i];
+
+ if (!iobuf->locked)
+ continue;
+ iobuf->locked = 0;
+
+ ppage = iobuf->maplist;
+ for (j = 0; j < iobuf->nr_pages; ppage++, j++) {
+ page = *ppage;
+ if (!page)
+ continue;
+ UnlockPage(page);
+ }
+ }
+ return 0;
}
static inline void zeromap_pte_range(pte_t * pte, unsigned long address,
-
To unsubscribe from this list: send the line "unsubscribe linux-kernel" in
the body of a message to majordomo@vger.rutgers.edu
Please read the FAQ at http://www.tux.org/lkml/
This archive was generated by hypermail 2b29 : Wed Mar 15 2000 - 21:00:17 EST