Re: [Ext2-devel] [RFC] Adding multiple block allocation

From: Suparna Bhattacharya
Date: Fri Apr 29 2005 - 08:48:46 EST


On Thu, Apr 28, 2005 at 12:14:24PM -0700, Mingming Cao wrote:
> Currently ext3_get_block()/ext3_new_block() only allocate one block at a
> time. To allocate multiple blocks, the caller, for example, ext3 direct
> IO routine, has to invoke ext3_get_block() many times. This is quite
> inefficient for sequential IO workload.
>
> The benefits of a real get_blocks() include:
> 1) Increases the possibility of getting contiguous blocks and reduces the
> possibility of fragmentation due to interleaved allocations from other
> threads (should be good for the non-reservation case).
> 2) Reduces CPU cycles spent in repeated get_block() calls.
> 3) Batches metadata updates and journaling in one shot.
> 4) Could possibly speed up future get_blocks() lookups by caching the last
> mapped blocks in the inode.
>

And here is the patch to make mpage_writepages use get_blocks() for
multiple block lookup/allocation. It performs a radix-tree contiguous
pages lookup, and issues a get_blocks for the range together. It maintains
an mpageio structure to track intermediate mapping state, somewhat
like the DIO code.

It does need some more testing, especially for the block_size < PAGE_SIZE
case. The JFS workaround can be dropped if the JFS get_blocks fix from
Dave Kleikamp is integrated.

Review feedback would be welcome.

Mingming,
Let me know if you have a chance to try this out with your patch.

Regards
Suparna

--
Suparna Bhattacharya (suparna@xxxxxxxxxx)
Linux Technology Center
IBM Software Lab, India


diff -urp -X dontdiff linux-2.6.12-rc3/fs/buffer.c linux-2.6.12-rc3-getblocks/fs/buffer.c
--- linux-2.6.12-rc3/fs/buffer.c 2005-04-21 05:33:15.000000000 +0530
+++ linux-2.6.12-rc3-getblocks/fs/buffer.c 2005-04-22 15:08:33.000000000 +0530
@@ -2514,53 +2514,10 @@ EXPORT_SYMBOL(nobh_commit_write);
* that it tries to operate without attaching bufferheads to
* the page.
*/
-int nobh_writepage(struct page *page, get_block_t *get_block,
- struct writeback_control *wbc)
+int nobh_writepage(struct page *page, get_blocks_t *get_blocks,
+ struct writeback_control *wbc, writepage_t bh_writepage_fn)
{
- struct inode * const inode = page->mapping->host;
- loff_t i_size = i_size_read(inode);
- const pgoff_t end_index = i_size >> PAGE_CACHE_SHIFT;
- unsigned offset;
- void *kaddr;
- int ret;
-
- /* Is the page fully inside i_size? */
- if (page->index < end_index)
- goto out;
-
- /* Is the page fully outside i_size? (truncate in progress) */
- offset = i_size & (PAGE_CACHE_SIZE-1);
- if (page->index >= end_index+1 || !offset) {
- /*
- * The page may have dirty, unmapped buffers. For example,
- * they may have been added in ext3_writepage(). Make them
- * freeable here, so the page does not leak.
- */
-#if 0
- /* Not really sure about this - do we need this ? */
- if (page->mapping->a_ops->invalidatepage)
- page->mapping->a_ops->invalidatepage(page, offset);
-#endif
- unlock_page(page);
- return 0; /* don't care */
- }
-
- /*
- * The page straddles i_size. It must be zeroed out on each and every
- * writepage invocation because it may be mmapped. "A file is mapped
- * in multiples of the page size. For a file that is not a multiple of
- * the page size, the remaining memory is zeroed when mapped, and
- * writes to that region are not written out to the file."
- */
- kaddr = kmap_atomic(page, KM_USER0);
- memset(kaddr + offset, 0, PAGE_CACHE_SIZE - offset);
- flush_dcache_page(page);
- kunmap_atomic(kaddr, KM_USER0);
-out:
- ret = mpage_writepage(page, get_block, wbc);
- if (ret == -EAGAIN)
- ret = __block_write_full_page(inode, page, get_block, wbc);
- return ret;
+ return mpage_writepage(page, get_blocks, wbc, bh_writepage_fn);
}
EXPORT_SYMBOL(nobh_writepage);

diff -urp -X dontdiff linux-2.6.12-rc3/fs/ext2/inode.c linux-2.6.12-rc3-getblocks/fs/ext2/inode.c
--- linux-2.6.12-rc3/fs/ext2/inode.c 2005-04-21 05:33:15.000000000 +0530
+++ linux-2.6.12-rc3-getblocks/fs/ext2/inode.c 2005-04-22 16:30:42.000000000 +0530
@@ -639,12 +639,6 @@ ext2_nobh_prepare_write(struct file *fil
return nobh_prepare_write(page,from,to,ext2_get_block);
}

-static int ext2_nobh_writepage(struct page *page,
- struct writeback_control *wbc)
-{
- return nobh_writepage(page, ext2_get_block, wbc);
-}
-
static sector_t ext2_bmap(struct address_space *mapping, sector_t block)
{
return generic_block_bmap(mapping,block,ext2_get_block);
@@ -662,6 +656,12 @@ ext2_get_blocks(struct inode *inode, sec
return ret;
}

+static int ext2_nobh_writepage(struct page *page,
+ struct writeback_control *wbc)
+{
+ return nobh_writepage(page, ext2_get_blocks, wbc, ext2_writepage);
+}
+
static ssize_t
ext2_direct_IO(int rw, struct kiocb *iocb, const struct iovec *iov,
loff_t offset, unsigned long nr_segs)
@@ -676,7 +676,8 @@ ext2_direct_IO(int rw, struct kiocb *ioc
static int
ext2_writepages(struct address_space *mapping, struct writeback_control *wbc)
{
- return mpage_writepages(mapping, wbc, ext2_get_block);
+ return __mpage_writepages(mapping, wbc, ext2_get_blocks,
+ ext2_writepage);
}

struct address_space_operations ext2_aops = {
diff -urp -X dontdiff linux-2.6.12-rc3/fs/ext3/inode.c linux-2.6.12-rc3-getblocks/fs/ext3/inode.c
--- linux-2.6.12-rc3/fs/ext3/inode.c 2005-04-21 05:33:15.000000000 +0530
+++ linux-2.6.12-rc3-getblocks/fs/ext3/inode.c 2005-04-22 15:08:33.000000000 +0530
@@ -866,10 +866,10 @@ get_block:
return ret;
}

-static int ext3_writepages_get_block(struct inode *inode, sector_t iblock,
- struct buffer_head *bh, int create)
+static int ext3_writepages_get_blocks(struct inode *inode, sector_t iblock,
+ unsigned long max_blocks, struct buffer_head *bh, int create)
{
- return ext3_direct_io_get_blocks(inode, iblock, 1, bh, create);
+ return ext3_direct_io_get_blocks(inode, iblock, max_blocks, bh, create);
}

/*
@@ -1369,11 +1369,11 @@ ext3_writeback_writepages(struct address
return ret;
}

- ret = __mpage_writepages(mapping, wbc, ext3_writepages_get_block,
+ ret = __mpage_writepages(mapping, wbc, ext3_writepages_get_blocks,
ext3_writeback_writepage_helper);

/*
- * Need to reaquire the handle since ext3_writepages_get_block()
+ * Need to reacquire the handle since ext3_writepages_get_blocks()
* can restart the handle
*/
handle = journal_current_handle();
@@ -1402,7 +1402,8 @@ static int ext3_writeback_writepage(stru
}

if (test_opt(inode->i_sb, NOBH))
- ret = nobh_writepage(page, ext3_get_block, wbc);
+ ret = nobh_writepage(page, ext3_writepages_get_blocks, wbc,
+ ext3_writeback_writepage_helper);
else
ret = block_write_full_page(page, ext3_get_block, wbc);

diff -urp -X dontdiff linux-2.6.12-rc3/fs/ext3/super.c linux-2.6.12-rc3-getblocks/fs/ext3/super.c
--- linux-2.6.12-rc3/fs/ext3/super.c 2005-04-21 05:33:15.000000000 +0530
+++ linux-2.6.12-rc3-getblocks/fs/ext3/super.c 2005-04-22 15:08:33.000000000 +0530
@@ -1321,6 +1321,7 @@ static int ext3_fill_super (struct super
sbi->s_resgid = le16_to_cpu(es->s_def_resgid);

set_opt(sbi->s_mount_opt, RESERVATION);
+ set_opt(sbi->s_mount_opt, NOBH); /* temp: set nobh default */

if (!parse_options ((char *) data, sb, &journal_inum, NULL, 0))
goto failed_mount;
@@ -1567,6 +1568,7 @@ static int ext3_fill_super (struct super
printk(KERN_ERR "EXT3-fs: Journal does not support "
"requested data journaling mode\n");
goto failed_mount3;
+ set_opt(sbi->s_mount_opt, NOBH); /* temp: set nobh default */
}
default:
break;
@@ -1584,6 +1586,7 @@ static int ext3_fill_super (struct super
"its supported only with writeback mode\n");
clear_opt(sbi->s_mount_opt, NOBH);
}
+ printk("NOBH option set\n");
}
/*
* The journal_load will have done any necessary log recovery,
diff -urp -X dontdiff linux-2.6.12-rc3/fs/hfs/inode.c linux-2.6.12-rc3-getblocks/fs/hfs/inode.c
--- linux-2.6.12-rc3/fs/hfs/inode.c 2005-04-21 05:33:15.000000000 +0530
+++ linux-2.6.12-rc3-getblocks/fs/hfs/inode.c 2005-04-22 15:08:33.000000000 +0530
@@ -124,7 +124,7 @@ static ssize_t hfs_direct_IO(int rw, str
static int hfs_writepages(struct address_space *mapping,
struct writeback_control *wbc)
{
- return mpage_writepages(mapping, wbc, hfs_get_block);
+ return mpage_writepages(mapping, wbc, hfs_get_blocks);
}

struct address_space_operations hfs_btree_aops = {
diff -urp -X dontdiff linux-2.6.12-rc3/fs/hfsplus/inode.c linux-2.6.12-rc3-getblocks/fs/hfsplus/inode.c
--- linux-2.6.12-rc3/fs/hfsplus/inode.c 2005-04-21 05:33:15.000000000 +0530
+++ linux-2.6.12-rc3-getblocks/fs/hfsplus/inode.c 2005-04-22 15:08:33.000000000 +0530
@@ -121,7 +121,7 @@ static ssize_t hfsplus_direct_IO(int rw,
static int hfsplus_writepages(struct address_space *mapping,
struct writeback_control *wbc)
{
- return mpage_writepages(mapping, wbc, hfsplus_get_block);
+ return mpage_writepages(mapping, wbc, hfsplus_get_blocks);
}

struct address_space_operations hfsplus_btree_aops = {
diff -urp -X dontdiff linux-2.6.12-rc3/fs/jfs/inode.c linux-2.6.12-rc3-getblocks/fs/jfs/inode.c
--- linux-2.6.12-rc3/fs/jfs/inode.c 2005-04-21 05:33:15.000000000 +0530
+++ linux-2.6.12-rc3-getblocks/fs/jfs/inode.c 2005-04-22 16:27:19.000000000 +0530
@@ -267,21 +267,41 @@ jfs_get_blocks(struct inode *ip, sector_
return rc;
}

+static int
+jfs_mpage_get_blocks(struct inode *ip, sector_t lblock, unsigned long
+ max_blocks, struct buffer_head *bh_result, int create)
+{
+ /*
+ * fixme: temporary workaround: return one block at a time until
+ * we figure out why we see exposures with truncate on
+ * allocating multiple blocks in one shot.
+ */
+ return jfs_get_blocks(ip, lblock, 1, bh_result, create);
+}
+
static int jfs_get_block(struct inode *ip, sector_t lblock,
struct buffer_head *bh_result, int create)
{
return jfs_get_blocks(ip, lblock, 1, bh_result, create);
}

+static int jfs_bh_writepage(struct page *page,
+ struct writeback_control *wbc)
+{
+ return block_write_full_page(page, jfs_get_block, wbc);
+}
+
+
static int jfs_writepage(struct page *page, struct writeback_control *wbc)
{
- return nobh_writepage(page, jfs_get_block, wbc);
+ return nobh_writepage(page, jfs_mpage_get_blocks, wbc, jfs_bh_writepage);
}

static int jfs_writepages(struct address_space *mapping,
struct writeback_control *wbc)
{
- return mpage_writepages(mapping, wbc, jfs_get_block);
+ return __mpage_writepages(mapping, wbc, jfs_mpage_get_blocks,
+ jfs_bh_writepage);
}

static int jfs_readpage(struct file *file, struct page *page)
diff -urp -X dontdiff linux-2.6.12-rc3/fs/mpage.c linux-2.6.12-rc3-getblocks/fs/mpage.c
--- linux-2.6.12-rc3/fs/mpage.c 2005-04-21 05:33:15.000000000 +0530
+++ linux-2.6.12-rc3-getblocks/fs/mpage.c 2005-04-22 16:19:14.000000000 +0530
@@ -370,6 +370,67 @@ int mpage_readpage(struct page *page, ge
}
EXPORT_SYMBOL(mpage_readpage);

+struct mpageio {
+ struct bio *bio;
+ struct buffer_head map_bh;
+ unsigned long block_in_file;
+ unsigned long final_block_in_request;
+ sector_t long block_in_bio;
+ int boundary;
+ sector_t boundary_block;
+ struct block_device *boundary_bdev;
+};
+
+/*
+ * Maps as many contiguous disk blocks as it can within the range of
+ * the request, and returns the total number of contiguous mapped
+ * blocks in the mpageio.
+ */
+static unsigned long mpage_get_more_blocks(struct mpageio *mio,
+ struct inode *inode, get_blocks_t get_blocks)
+{
+ struct buffer_head map_bh = {.b_state = 0};
+ unsigned long mio_nblocks = mio->map_bh.b_size >> inode->i_blkbits;
+ unsigned long first_unmapped = mio->block_in_file + mio_nblocks;
+ unsigned long next_contig_block = mio->map_bh.b_blocknr + mio_nblocks;
+
+ while ((first_unmapped < mio->final_block_in_request) &&
+ (mio->map_bh.b_size < PAGE_SIZE)) {
+
+ if (get_blocks(inode, first_unmapped,
+ mio->final_block_in_request - first_unmapped,
+ &map_bh, 1))
+ break;
+ if (mio_nblocks && ((map_bh.b_blocknr != next_contig_block) ||
+ map_bh.b_bdev != mio->map_bh.b_bdev))
+ break;
+
+ if (buffer_new(&map_bh)) {
+ int i = 0;
+ for (; i < map_bh.b_size >> inode->i_blkbits; i++)
+ unmap_underlying_metadata(map_bh.b_bdev,
+ map_bh.b_blocknr + i);
+ }
+
+ if (buffer_boundary(&map_bh)) {
+ mio->boundary = 1;
+ mio->boundary_block = map_bh.b_blocknr;
+ mio->boundary_bdev = map_bh.b_bdev;
+ }
+ if (mio_nblocks == 0) {
+ mio->map_bh.b_bdev = map_bh.b_bdev;
+ mio->map_bh.b_blocknr = map_bh.b_blocknr;
+ }
+
+ mio_nblocks += map_bh.b_size >> inode->i_blkbits;
+ first_unmapped = mio->block_in_file + mio_nblocks;
+ next_contig_block = mio->map_bh.b_blocknr + mio_nblocks;
+ mio->map_bh.b_size += map_bh.b_size;
+ }
+
+ return mio_nblocks;
+}
+
/*
* Writing is not so simple.
*
@@ -386,9 +447,9 @@ EXPORT_SYMBOL(mpage_readpage);
* written, so it can intelligently allocate a suitably-sized BIO. For now,
* just allocate full-size (16-page) BIOs.
*/
-static struct bio *
-__mpage_writepage(struct bio *bio, struct page *page, get_block_t get_block,
- sector_t *last_block_in_bio, int *ret, struct writeback_control *wbc,
+static int
+__mpage_writepage(struct mpageio *mio, struct page *page,
+ get_blocks_t get_blocks, struct writeback_control *wbc,
writepage_t writepage_fn)
{
struct address_space *mapping = page->mapping;
@@ -396,9 +457,8 @@ __mpage_writepage(struct bio *bio, struc
const unsigned blkbits = inode->i_blkbits;
unsigned long end_index;
const unsigned blocks_per_page = PAGE_CACHE_SIZE >> blkbits;
- sector_t last_block;
+ sector_t last_block, blocks_to_skip;
sector_t block_in_file;
- sector_t blocks[MAX_BUF_PER_PAGE];
unsigned page_block;
unsigned first_unmapped = blocks_per_page;
struct block_device *bdev = NULL;
@@ -406,8 +466,10 @@ __mpage_writepage(struct bio *bio, struc
sector_t boundary_block = 0;
struct block_device *boundary_bdev = NULL;
int length;
- struct buffer_head map_bh;
loff_t i_size = i_size_read(inode);
+ struct buffer_head *map_bh = &mio->map_bh;
+ struct bio *bio = mio->bio;
+ int ret = 0;

if (page_has_buffers(page)) {
struct buffer_head *head = page_buffers(page);
@@ -435,10 +497,13 @@ __mpage_writepage(struct bio *bio, struc
if (!buffer_dirty(bh) || !buffer_uptodate(bh))
goto confused;
if (page_block) {
- if (bh->b_blocknr != blocks[page_block-1] + 1)
+ if (bh->b_blocknr != map_bh->b_blocknr
+ + page_block)
goto confused;
+ } else {
+ map_bh->b_blocknr = bh->b_blocknr;
+ map_bh->b_size = PAGE_SIZE;
}
- blocks[page_block++] = bh->b_blocknr;
boundary = buffer_boundary(bh);
if (boundary) {
boundary_block = bh->b_blocknr;
@@ -465,33 +530,30 @@ __mpage_writepage(struct bio *bio, struc
BUG_ON(!PageUptodate(page));
block_in_file = page->index << (PAGE_CACHE_SHIFT - blkbits);
last_block = (i_size - 1) >> blkbits;
- map_bh.b_page = page;
- for (page_block = 0; page_block < blocks_per_page; ) {
-
- map_bh.b_state = 0;
- if (get_block(inode, block_in_file, &map_bh, 1))
- goto confused;
- if (buffer_new(&map_bh))
- unmap_underlying_metadata(map_bh.b_bdev,
- map_bh.b_blocknr);
- if (buffer_boundary(&map_bh)) {
- boundary_block = map_bh.b_blocknr;
- boundary_bdev = map_bh.b_bdev;
- }
- if (page_block) {
- if (map_bh.b_blocknr != blocks[page_block-1] + 1)
- goto confused;
- }
- blocks[page_block++] = map_bh.b_blocknr;
- boundary = buffer_boundary(&map_bh);
- bdev = map_bh.b_bdev;
- if (block_in_file == last_block)
- break;
- block_in_file++;
+ blocks_to_skip = block_in_file - mio->block_in_file;
+ mio->block_in_file = block_in_file;
+ if (blocks_to_skip < (map_bh->b_size >> blkbits)) {
+ map_bh->b_blocknr += blocks_to_skip;
+ map_bh->b_size -= blocks_to_skip << blkbits;
+ } else {
+ map_bh->b_state = 0;
+ map_bh->b_size = 0;
+ if (mio->final_block_in_request > last_block)
+ mio->final_block_in_request = last_block;
+ mpage_get_more_blocks(mio, inode, get_blocks);
}
- BUG_ON(page_block == 0);
+ if (map_bh->b_size < PAGE_SIZE)
+ goto confused;

- first_unmapped = page_block;
+ if (mio->boundary && (mio->boundary_block < map_bh->b_blocknr
+ + blocks_per_page)) {
+ boundary = 1;
+ boundary_block = mio->boundary_block;
+ boundary_bdev = mio->boundary_bdev;
+ }
+
+ bdev = map_bh->b_bdev;
+ first_unmapped = blocks_per_page;

page_is_mapped:
end_index = i_size >> PAGE_CACHE_SHIFT;
@@ -518,12 +580,16 @@ page_is_mapped:
/*
* This page will go to BIO. Do we need to send this BIO off first?
*/
- if (bio && *last_block_in_bio != blocks[0] - 1)
+ if (bio && mio->block_in_bio != map_bh->b_blocknr - 1)
bio = mpage_bio_submit(WRITE, bio);

alloc_new:
if (bio == NULL) {
- bio = mpage_alloc(bdev, blocks[0] << (blkbits - 9),
+ /*
+ * Fixme: bio size can be limited to final_block - block, or
+ * even mio->map_bh.b_size
+ */
+ bio = mpage_alloc(bdev, map_bh->b_blocknr << (blkbits - 9),
bio_get_nr_vecs(bdev), GFP_NOFS|__GFP_HIGH);
if (bio == NULL)
goto confused;
@@ -539,6 +605,9 @@ alloc_new:
bio = mpage_bio_submit(WRITE, bio);
goto alloc_new;
}
+ map_bh->b_blocknr += blocks_per_page;
+ map_bh->b_size -= PAGE_SIZE;
+ mio->block_in_file += blocks_per_page;

/*
* OK, we have our BIO, so we can now mark the buffers clean. Make
@@ -575,7 +644,8 @@ alloc_new:
boundary_block, 1 << blkbits);
}
} else {
- *last_block_in_bio = blocks[blocks_per_page - 1];
+ /* we can pack more pages into the bio, don't submit yet */
+ mio->block_in_bio = map_bh->b_blocknr - 1;
}
goto out;

@@ -584,22 +654,23 @@ confused:
bio = mpage_bio_submit(WRITE, bio);

if (writepage_fn) {
- *ret = (*writepage_fn)(page, wbc);
+ ret = (*writepage_fn)(page, wbc);
} else {
- *ret = -EAGAIN;
+ ret = -EAGAIN;
goto out;
}
/*
* The caller has a ref on the inode, so *mapping is stable
*/
- if (*ret) {
- if (*ret == -ENOSPC)
+ if (ret) {
+ if (ret == -ENOSPC)
set_bit(AS_ENOSPC, &mapping->flags);
else
set_bit(AS_EIO, &mapping->flags);
}
out:
- return bio;
+ mio->bio = bio;
+ return ret;
}

/**
@@ -625,20 +696,21 @@ out:
*/
int
mpage_writepages(struct address_space *mapping,
- struct writeback_control *wbc, get_block_t get_block)
+ struct writeback_control *wbc, get_blocks_t get_blocks)
{
- return __mpage_writepages(mapping, wbc, get_block,
+ return __mpage_writepages(mapping, wbc, get_blocks,
mapping->a_ops->writepage);
}

int
__mpage_writepages(struct address_space *mapping,
- struct writeback_control *wbc, get_block_t get_block,
+ struct writeback_control *wbc, get_blocks_t get_blocks,
writepage_t writepage_fn)
{
struct backing_dev_info *bdi = mapping->backing_dev_info;
+ struct inode *inode = mapping->host;
+ const unsigned blkbits = inode->i_blkbits;
struct bio *bio = NULL;
- sector_t last_block_in_bio = 0;
int ret = 0;
int done = 0;
int (*writepage)(struct page *page, struct writeback_control *wbc);
@@ -648,6 +720,9 @@ __mpage_writepages(struct address_space
pgoff_t end = -1; /* Inclusive */
int scanned = 0;
int is_range = 0;
+ struct mpageio mio = {
+ .bio = NULL
+ };

if (wbc->nonblocking && bdi_write_congested(bdi)) {
wbc->encountered_congestion = 1;
@@ -655,7 +730,7 @@ __mpage_writepages(struct address_space
}

writepage = NULL;
- if (get_block == NULL)
+ if (get_blocks == NULL)
writepage = mapping->a_ops->writepage;

pagevec_init(&pvec, 0);
@@ -672,12 +747,15 @@ __mpage_writepages(struct address_space
scanned = 1;
}
retry:
+ down_read(&inode->i_alloc_sem);
while (!done && (index <= end) &&
- (nr_pages = pagevec_lookup_tag(&pvec, mapping, &index,
- PAGECACHE_TAG_DIRTY,
+ (nr_pages = pagevec_contig_lookup_tag(&pvec, mapping,
+ &index, PAGECACHE_TAG_DIRTY,
min(end - index, (pgoff_t)PAGEVEC_SIZE-1) + 1))) {
unsigned i;

+ mio.final_block_in_request = min(index, end) <<
+ (PAGE_CACHE_SHIFT - blkbits);
scanned = 1;
for (i = 0; i < nr_pages; i++) {
struct page *page = pvec.pages[i];
@@ -702,7 +780,7 @@ retry:
unlock_page(page);
continue;
}
-
+
if (wbc->sync_mode != WB_SYNC_NONE)
wait_on_page_writeback(page);

@@ -723,9 +801,9 @@ retry:
&mapping->flags);
}
} else {
- bio = __mpage_writepage(bio, page, get_block,
- &last_block_in_bio, &ret, wbc,
- writepage_fn);
+ ret = __mpage_writepage(&mio, page, get_blocks,
+ wbc, writepage_fn);
+ bio = mio.bio;
}
if (ret || (--(wbc->nr_to_write) <= 0))
done = 1;
@@ -737,6 +815,9 @@ retry:
pagevec_release(&pvec);
cond_resched();
}
+
+ up_read(&inode->i_alloc_sem);
+
if (!scanned && !done) {
/*
* We hit the last page and there is more work to be done: wrap
@@ -755,17 +836,23 @@ retry:
EXPORT_SYMBOL(mpage_writepages);
EXPORT_SYMBOL(__mpage_writepages);

-int mpage_writepage(struct page *page, get_block_t get_block,
- struct writeback_control *wbc)
+int mpage_writepage(struct page *page, get_blocks_t get_blocks,
+ struct writeback_control *wbc, writepage_t writepage_fn)
{
int ret = 0;
- struct bio *bio;
- sector_t last_block_in_bio = 0;
-
- bio = __mpage_writepage(NULL, page, get_block,
- &last_block_in_bio, &ret, wbc, NULL);
- if (bio)
- mpage_bio_submit(WRITE, bio);
+ struct address_space *mapping = page->mapping;
+ struct inode *inode = mapping->host;
+ const unsigned blkbits = inode->i_blkbits;
+ struct mpageio mio = {
+ .final_block_in_request = (page->index + 1) << (PAGE_CACHE_SHIFT
+ - blkbits)
+ };
+
+ dump_stack();
+ ret = __mpage_writepage(&mio, page, get_blocks,
+ wbc, writepage_fn);
+ if (mio.bio)
+ mpage_bio_submit(WRITE, mio.bio);

return ret;
}
diff -urp -X dontdiff linux-2.6.12-rc3/include/linux/buffer_head.h linux-2.6.12-rc3-getblocks/include/linux/buffer_head.h
--- linux-2.6.12-rc3/include/linux/buffer_head.h 2005-04-21 05:33:16.000000000 +0530
+++ linux-2.6.12-rc3-getblocks/include/linux/buffer_head.h 2005-04-22 15:08:33.000000000 +0530
@@ -203,8 +203,8 @@ int file_fsync(struct file *, struct den
int nobh_prepare_write(struct page*, unsigned, unsigned, get_block_t*);
int nobh_commit_write(struct file *, struct page *, unsigned, unsigned);
int nobh_truncate_page(struct address_space *, loff_t);
-int nobh_writepage(struct page *page, get_block_t *get_block,
- struct writeback_control *wbc);
+int nobh_writepage(struct page *page, get_blocks_t *get_blocks,
+ struct writeback_control *wbc, writepage_t bh_writepage);


/*
diff -urp -X dontdiff linux-2.6.12-rc3/include/linux/fs.h linux-2.6.12-rc3-getblocks/include/linux/fs.h
--- linux-2.6.12-rc3/include/linux/fs.h 2005-04-21 05:33:16.000000000 +0530
+++ linux-2.6.12-rc3-getblocks/include/linux/fs.h 2005-04-22 15:08:33.000000000 +0530
@@ -304,6 +304,8 @@ struct address_space;
struct writeback_control;
struct kiocb;

+typedef int (writepage_t)(struct page *page, struct writeback_control *wbc);
+
struct address_space_operations {
int (*writepage)(struct page *page, struct writeback_control *wbc);
int (*readpage)(struct file *, struct page *);
diff -urp -X dontdiff linux-2.6.12-rc3/include/linux/mpage.h linux-2.6.12-rc3-getblocks/include/linux/mpage.h
--- linux-2.6.12-rc3/include/linux/mpage.h 2005-04-21 05:33:16.000000000 +0530
+++ linux-2.6.12-rc3-getblocks/include/linux/mpage.h 2005-04-22 15:08:33.000000000 +0530
@@ -11,17 +11,16 @@
*/

struct writeback_control;
-typedef int (writepage_t)(struct page *page, struct writeback_control *wbc);

int mpage_readpages(struct address_space *mapping, struct list_head *pages,
unsigned nr_pages, get_block_t get_block);
int mpage_readpage(struct page *page, get_block_t get_block);
int mpage_writepages(struct address_space *mapping,
- struct writeback_control *wbc, get_block_t get_block);
-int mpage_writepage(struct page *page, get_block_t *get_block,
- struct writeback_control *wbc);
+ struct writeback_control *wbc, get_blocks_t get_blocks);
+int mpage_writepage(struct page *page, get_blocks_t *get_blocks,
+ struct writeback_control *wbc, writepage_t writepage);
int __mpage_writepages(struct address_space *mapping,
- struct writeback_control *wbc, get_block_t get_block,
+ struct writeback_control *wbc, get_blocks_t get_blocks,
writepage_t writepage);

static inline int
diff -urp -X dontdiff linux-2.6.12-rc3/include/linux/pagemap.h linux-2.6.12-rc3-getblocks/include/linux/pagemap.h
--- linux-2.6.12-rc3/include/linux/pagemap.h 2005-04-21 05:33:16.000000000 +0530
+++ linux-2.6.12-rc3-getblocks/include/linux/pagemap.h 2005-04-22 15:08:33.000000000 +0530
@@ -73,7 +73,8 @@ extern struct page * find_or_create_page
unsigned find_get_pages(struct address_space *mapping, pgoff_t start,
unsigned int nr_pages, struct page **pages);
unsigned find_get_pages_tag(struct address_space *mapping, pgoff_t *index,
- int tag, unsigned int nr_pages, struct page **pages);
+ int tag, unsigned int nr_pages, struct page **pages,
+ int contig);

/*
* Returns locked page at given index in given cache, creating it if needed.
diff -urp -X dontdiff linux-2.6.12-rc3/include/linux/pagevec.h linux-2.6.12-rc3-getblocks/include/linux/pagevec.h
--- linux-2.6.12-rc3/include/linux/pagevec.h 2005-04-21 05:33:16.000000000 +0530
+++ linux-2.6.12-rc3-getblocks/include/linux/pagevec.h 2005-04-22 15:08:33.000000000 +0530
@@ -28,6 +28,9 @@ unsigned pagevec_lookup(struct pagevec *
unsigned pagevec_lookup_tag(struct pagevec *pvec,
struct address_space *mapping, pgoff_t *index, int tag,
unsigned nr_pages);
+unsigned pagevec_contig_lookup_tag(struct pagevec *pvec,
+ struct address_space *mapping, pgoff_t *index, int tag,
+ unsigned nr_pages);

static inline void pagevec_init(struct pagevec *pvec, int cold)
{
diff -urp -X dontdiff linux-2.6.12-rc3/include/linux/radix-tree.h linux-2.6.12-rc3-getblocks/include/linux/radix-tree.h
--- linux-2.6.12-rc3/include/linux/radix-tree.h 2005-04-21 05:33:16.000000000 +0530
+++ linux-2.6.12-rc3-getblocks/include/linux/radix-tree.h 2005-04-22 15:08:33.000000000 +0530
@@ -59,8 +59,18 @@ void *radix_tree_tag_clear(struct radix_
int radix_tree_tag_get(struct radix_tree_root *root,
unsigned long index, int tag);
unsigned int
-radix_tree_gang_lookup_tag(struct radix_tree_root *root, void **results,
- unsigned long first_index, unsigned int max_items, int tag);
+__radix_tree_gang_lookup_tag(struct radix_tree_root *root, void **results,
+ unsigned long first_index, unsigned int max_items, int tag,
+ int contig);
+
+static inline unsigned int radix_tree_gang_lookup_tag(struct radix_tree_root
+ *root, void **results, unsigned long first_index,
+ unsigned int max_items, int tag)
+{
+ return __radix_tree_gang_lookup_tag(root, results, first_index,
+ max_items, tag, 0);
+}
+
int radix_tree_tagged(struct radix_tree_root *root, int tag);

static inline void radix_tree_preload_end(void)
diff -urp -X dontdiff linux-2.6.12-rc3/lib/radix-tree.c linux-2.6.12-rc3-getblocks/lib/radix-tree.c
--- linux-2.6.12-rc3/lib/radix-tree.c 2005-04-21 05:33:16.000000000 +0530
+++ linux-2.6.12-rc3-getblocks/lib/radix-tree.c 2005-04-22 16:34:29.000000000 +0530
@@ -557,12 +557,13 @@ EXPORT_SYMBOL(radix_tree_gang_lookup);
*/
static unsigned int
__lookup_tag(struct radix_tree_root *root, void **results, unsigned long index,
- unsigned int max_items, unsigned long *next_index, int tag)
+ unsigned int max_items, unsigned long *next_index, int tag, int contig)
{
unsigned int nr_found = 0;
unsigned int shift;
unsigned int height = root->height;
struct radix_tree_node *slot;
+ unsigned long cindex = (contig && (*next_index)) ? *next_index : -1;

shift = (height - 1) * RADIX_TREE_MAP_SHIFT;
slot = root->rnode;
@@ -575,6 +576,11 @@ __lookup_tag(struct radix_tree_root *roo
BUG_ON(slot->slots[i] == NULL);
break;
}
+ if (contig && index >= cindex) {
+ /* break in contiguity */
+ index = 0;
+ goto out;
+ }
index &= ~((1UL << shift) - 1);
index += 1UL << shift;
if (index == 0)
@@ -593,6 +599,10 @@ __lookup_tag(struct radix_tree_root *roo
results[nr_found++] = slot->slots[j];
if (nr_found == max_items)
goto out;
+ } else if (contig && nr_found) {
+ /* break in contiguity */
+ index = 0;
+ goto out;
}
}
}
@@ -618,29 +628,32 @@ out:
* returns the number of items which were placed at *@results.
*/
unsigned int
-radix_tree_gang_lookup_tag(struct radix_tree_root *root, void **results,
- unsigned long first_index, unsigned int max_items, int tag)
+__radix_tree_gang_lookup_tag(struct radix_tree_root *root, void **results,
+ unsigned long first_index, unsigned int max_items, int tag,
+ int contig)
{
const unsigned long max_index = radix_tree_maxindex(root->height);
unsigned long cur_index = first_index;
+ unsigned long next_index = 0; /* Index of next contiguous search */
unsigned int ret = 0;

while (ret < max_items) {
unsigned int nr_found;
- unsigned long next_index; /* Index of next search */

if (cur_index > max_index)
break;
nr_found = __lookup_tag(root, results + ret, cur_index,
- max_items - ret, &next_index, tag);
+ max_items - ret, &next_index, tag, contig);
ret += nr_found;
if (next_index == 0)
break;
cur_index = next_index;
+ if (!nr_found)
+ next_index = 0;
}
return ret;
}
-EXPORT_SYMBOL(radix_tree_gang_lookup_tag);
+EXPORT_SYMBOL(__radix_tree_gang_lookup_tag);

/**
* radix_tree_delete - delete an item from a radix tree
diff -urp -X dontdiff linux-2.6.12-rc3/mm/filemap.c linux-2.6.12-rc3-getblocks/mm/filemap.c
--- linux-2.6.12-rc3/mm/filemap.c 2005-04-21 05:33:16.000000000 +0530
+++ linux-2.6.12-rc3-getblocks/mm/filemap.c 2005-04-22 16:20:30.000000000 +0530
@@ -635,16 +635,19 @@ unsigned find_get_pages(struct address_s
/*
* Like find_get_pages, except we only return pages which are tagged with
* `tag'. We update *index to index the next page for the traversal.
+ * If 'contig' is 1, then we return only pages which are contiguous in the
+ * file.
*/
unsigned find_get_pages_tag(struct address_space *mapping, pgoff_t *index,
- int tag, unsigned int nr_pages, struct page **pages)
+ int tag, unsigned int nr_pages, struct page **pages,
+ int contig)
{
unsigned int i;
unsigned int ret;

read_lock_irq(&mapping->tree_lock);
- ret = radix_tree_gang_lookup_tag(&mapping->page_tree,
- (void **)pages, *index, nr_pages, tag);
+ ret = __radix_tree_gang_lookup_tag(&mapping->page_tree,
+ (void **)pages, *index, nr_pages, tag, contig);
for (i = 0; i < ret; i++)
page_cache_get(pages[i]);
if (ret)
diff -urp -X dontdiff linux-2.6.12-rc3/mm/swap.c linux-2.6.12-rc3-getblocks/mm/swap.c
--- linux-2.6.12-rc3/mm/swap.c 2005-04-21 05:33:16.000000000 +0530
+++ linux-2.6.12-rc3-getblocks/mm/swap.c 2005-04-22 15:08:33.000000000 +0530
@@ -384,7 +384,16 @@ unsigned pagevec_lookup_tag(struct pagev
pgoff_t *index, int tag, unsigned nr_pages)
{
pvec->nr = find_get_pages_tag(mapping, index, tag,
- nr_pages, pvec->pages);
+ nr_pages, pvec->pages, 0);
+ return pagevec_count(pvec);
+}
+
+unsigned int
+pagevec_contig_lookup_tag(struct pagevec *pvec, struct address_space *mapping,
+ pgoff_t *index, int tag, unsigned nr_pages)
+{
+ pvec->nr = find_get_pages_tag(mapping, index, tag,
+ nr_pages, pvec->pages, 1);
return pagevec_count(pvec);
}

-
To unsubscribe from this list: send the line "unsubscribe linux-kernel" in
the body of a message to majordomo@xxxxxxxxxxxxxxx
More majordomo info at http://vger.kernel.org/majordomo-info.html
Please read the FAQ at http://www.tux.org/lkml/