[PATCH, RFC] ext4: Use preallocation when reading from the inode table

From: Theodore Ts'o
Date: Mon Sep 22 2008 - 20:35:37 EST



With modern hard drives, reading 64k takes roughly the same time as
reading a 4k block. So request adjacent inode table blocks to reduce
the time it takes when iterating over directories (especially when doing
this in htree sort order) in a cold cache case. With this patch, the
time it takes to run "git status" on a kernel tree after flushing the
caches via "echo 3 > /proc/sys/vm/drop_caches", is reduced by 21%.

Signed-off-by: "Theodore Ts'o" <tytso@xxxxxxx>
----

Note: this patch could work for ext3 as well. Anyone see any downsides?
A 20% improvement seems too easy... I guess it does increase what we
hold in the buffer cache by a small amount, but once the inodes are
cached, we'll stop needing to do any I/O, and we only try to do the
readahead when we know that we need to do some I/O anyway.

This patch also eliminates ext4_get_inode_block(), since it's a static
function which is only called once, and we needed access to the block
group descriptor, so it simplified the code to move the code into
__ext4_get_inode_loc(). The interesting bits are in the very last hunk
of the patch.


diff --git a/fs/ext4/inode.c b/fs/ext4/inode.c
index eed1265..9f6c10e 100644
--- a/fs/ext4/inode.c
+++ b/fs/ext4/inode.c
@@ -3833,41 +3833,6 @@ out_stop:
ext4_journal_stop(handle);
}

-static ext4_fsblk_t ext4_get_inode_block(struct super_block *sb,
- unsigned long ino, struct ext4_iloc *iloc)
-{
- ext4_group_t block_group;
- unsigned long offset;
- ext4_fsblk_t block;
- struct ext4_group_desc *gdp;
-
- if (!ext4_valid_inum(sb, ino)) {
- /*
- * This error is already checked for in namei.c unless we are
- * looking at an NFS filehandle, in which case no error
- * report is needed
- */
- return 0;
- }
-
- block_group = (ino - 1) / EXT4_INODES_PER_GROUP(sb);
- gdp = ext4_get_group_desc(sb, block_group, NULL);
- if (!gdp)
- return 0;
-
- /*
- * Figure out the offset within the block group inode table
- */
- offset = ((ino - 1) % EXT4_INODES_PER_GROUP(sb)) *
- EXT4_INODE_SIZE(sb);
- block = ext4_inode_table(sb, gdp) +
- (offset >> EXT4_BLOCK_SIZE_BITS(sb));
-
- iloc->block_group = block_group;
- iloc->offset = offset & (EXT4_BLOCK_SIZE(sb) - 1);
- return block;
-}
-
/*
* ext4_get_inode_loc returns with an extra refcount against the inode's
* underlying buffer_head on success. If 'in_mem' is true, we have all
@@ -3877,13 +3842,30 @@ static ext4_fsblk_t ext4_get_inode_block(struct super_block *sb,
static int __ext4_get_inode_loc(struct inode *inode,
struct ext4_iloc *iloc, int in_mem)
{
+ struct ext4_group_desc *gdp;
ext4_fsblk_t block;
struct buffer_head *bh;

- block = ext4_get_inode_block(inode->i_sb, inode->i_ino, iloc);
- if (!block)
+ iloc->bh = 0;
+ if (!ext4_valid_inum(inode->i_sb, inode->i_ino))
return -EIO;

+ iloc->block_group = (inode->i_ino - 1) /
+ EXT4_INODES_PER_GROUP(inode->i_sb);
+ gdp = ext4_get_group_desc(inode->i_sb, iloc->block_group, NULL);
+ if (!gdp)
+ return -EIO;
+
+ /*
+ * Figure out the offset within the block group inode table
+ */
+ iloc->offset = ((inode->i_ino - 1) %
+ EXT4_INODES_PER_GROUP(inode->i_sb)) *
+ EXT4_INODE_SIZE(inode->i_sb);
+ block = ext4_inode_table(inode->i_sb, gdp) +
+ (iloc->offset >> EXT4_BLOCK_SIZE_BITS(inode->i_sb));
+ iloc->offset = iloc->offset & (EXT4_BLOCK_SIZE(inode->i_sb) - 1);
+
bh = sb_getblk(inode->i_sb, block);
if (!bh) {
ext4_error (inode->i_sb, "ext4_get_inode_loc",
@@ -3969,6 +3951,31 @@ static int __ext4_get_inode_loc(struct inode *inode,

make_io:
/*
+ * If we need to do any I/O, try to readahead up to 16
+ * blocks from the inode table.
+ */
+ {
+ ext4_fsblk_t b, end, table;
+ unsigned num;
+
+ table = ext4_inode_table(inode->i_sb, gdp);
+ b = block & ~15;
+ if (table > b)
+ b = table;
+ end = b+16;
+ num = EXT4_INODES_PER_GROUP(inode->i_sb);
+ if (EXT4_HAS_RO_COMPAT_FEATURE(inode->i_sb,
+ EXT4_FEATURE_RO_COMPAT_GDT_CSUM))
+ num -= le16_to_cpu(gdp->bg_itable_unused);
+ table += num / (EXT4_BLOCK_SIZE(inode->i_sb) /
+ EXT4_INODE_SIZE(inode->i_sb));
+ if (end > table)
+ end = table;
+ while (b <= end)
+ sb_breadahead(inode->i_sb, b++);
+ }
+
+ /*
* There are other valid inodes in the buffer, this inode
* has in-inode xattrs, or we don't have this inode in memory.
* Read the block from disk.
--
To unsubscribe from this list: send the line "unsubscribe linux-kernel" in
the body of a message to majordomo@xxxxxxxxxxxxxxx
More majordomo info at http://vger.kernel.org/majordomo-info.html
Please read the FAQ at http://www.tux.org/lkml/