Re: [PATCH] fix fsync/fdatasync races

From: Alexander Viro (viro@math.psu.edu)
Date: Mon Apr 30 2001 - 13:14:17 EST


On Mon, 30 Apr 2001, Marcelo Tosatti wrote:

>
> Hi,
>
> The following patch implements a new super_operations "wait_inode"
> operation on ext2 to fix the generic_osync_inode/fsync/fdatasync race I
> mentioned sometime ago.
>
> We still have to implement the wait_inode operation on _all_ block
> filesystems to make them safe.
>
> Comments?

Just one: we already have two copies of "find the on-disk inode by inumber"
code. No need to introduce the third one.

        If we do it at all, please consider the patch below. It takes the
inode-loading stuff into struct ext2_inode *ext2_get_inode(sb, ino, bhp)
and does corresponding cleanup of ext2_read_inode() and ext2_update_inode().

                                                                Al

diff -urN S4/fs/ext2/inode.c S4-ext2_get_inode/fs/ext2/inode.c
--- S4/fs/ext2/inode.c Sat Apr 28 02:12:56 2001
+++ S4-ext2_get_inode/fs/ext2/inode.c Mon Apr 30 13:44:43 2001
@@ -958,56 +958,60 @@
                 mark_inode_dirty(inode);
 }
 
-void ext2_read_inode (struct inode * inode)
+static struct ext2_inode *ext2_get_inode(struct super_block *sb, ino_t ino,
+ struct buffer_head **p)
 {
         struct buffer_head * bh;
- struct ext2_inode * raw_inode;
         unsigned long block_group;
- unsigned long group_desc;
- unsigned long desc;
         unsigned long block;
         unsigned long offset;
         struct ext2_group_desc * gdp;
 
- if ((inode->i_ino != EXT2_ROOT_INO && inode->i_ino != EXT2_ACL_IDX_INO &&
- inode->i_ino != EXT2_ACL_DATA_INO &&
- inode->i_ino < EXT2_FIRST_INO(inode->i_sb)) ||
- inode->i_ino > le32_to_cpu(inode->i_sb->u.ext2_sb.s_es->s_inodes_count)) {
- ext2_error (inode->i_sb, "ext2_read_inode",
- "bad inode number: %lu", inode->i_ino);
- goto bad_inode;
- }
- block_group = (inode->i_ino - 1) / EXT2_INODES_PER_GROUP(inode->i_sb);
- if (block_group >= inode->i_sb->u.ext2_sb.s_groups_count) {
- ext2_error (inode->i_sb, "ext2_read_inode",
- "group >= groups count");
- goto bad_inode;
- }
- group_desc = block_group >> EXT2_DESC_PER_BLOCK_BITS(inode->i_sb);
- desc = block_group & (EXT2_DESC_PER_BLOCK(inode->i_sb) - 1);
- bh = inode->i_sb->u.ext2_sb.s_group_desc[group_desc];
- if (!bh) {
- ext2_error (inode->i_sb, "ext2_read_inode",
- "Descriptor not loaded");
- goto bad_inode;
- }
-
- gdp = (struct ext2_group_desc *) bh->b_data;
+ *p = NULL;
+ if ((ino != EXT2_ROOT_INO && ino != EXT2_ACL_IDX_INO &&
+ ino != EXT2_ACL_DATA_INO && ino < EXT2_FIRST_INO(sb)) ||
+ ino > le32_to_cpu(sb->u.ext2_sb.s_es->s_inodes_count))
+ goto Einval;
+
+ block_group = (ino - 1) / EXT2_INODES_PER_GROUP(sb);
+ gdp = ext2_get_group_desc(sb, block_group, &bh);
+ if (!gdp)
+ goto Egdp;
         /*
          * Figure out the offset within the block group inode table
          */
- offset = ((inode->i_ino - 1) % EXT2_INODES_PER_GROUP(inode->i_sb)) *
- EXT2_INODE_SIZE(inode->i_sb);
- block = le32_to_cpu(gdp[desc].bg_inode_table) +
- (offset >> EXT2_BLOCK_SIZE_BITS(inode->i_sb));
- if (!(bh = bread (inode->i_dev, block, inode->i_sb->s_blocksize))) {
- ext2_error (inode->i_sb, "ext2_read_inode",
- "unable to read inode block - "
- "inode=%lu, block=%lu", inode->i_ino, block);
+ offset = ((ino - 1) % EXT2_INODES_PER_GROUP(sb))* EXT2_INODE_SIZE(sb);
+ block = le32_to_cpu(gdp->bg_inode_table) +
+ (offset >> EXT2_BLOCK_SIZE_BITS(sb));
+ if (!(bh = bread (sb->s_dev, block, sb->s_blocksize)))
+ goto Eio;
+
+ *p = bh;
+ offset &= (EXT2_BLOCK_SIZE(sb) - 1);
+ return (struct ext2_inode *) (bh->b_data + offset);
+
+Einval:
+ ext2_error (sb, "ext2_get_inode", "bad inode number: %lu", ino);
+ return ERR_PTR(-EINVAL);
+Eio:
+ ext2_error (sb, "ext2_get_inode",
+ "unable to read inode block - inode=%lu, block=%lu",
+ ino, block);
+Egdp:
+ return ERR_PTR(-EIO);
+}
+
+void ext2_read_inode (struct inode * inode)
+{
+ struct ext2_inode * raw_inode;
+ struct ext2_inode_info * ei = &inode->u.ext2_i;
+ ino_t ino = inode->i_ino;
+ struct buffer_head * bh;
+ unsigned long block;
+
+ raw_inode = ext2_get_inode(inode->i_sb, ino, &bh);
+ if (IS_ERR(raw_inode))
                 goto bad_inode;
- }
- offset &= (EXT2_BLOCK_SIZE(inode->i_sb) - 1);
- raw_inode = (struct ext2_inode *) (bh->b_data + offset);
 
         inode->i_mode = le16_to_cpu(raw_inode->i_mode);
         inode->i_uid = (uid_t)le16_to_cpu(raw_inode->i_uid_low);
@@ -1021,13 +1025,13 @@
         inode->i_atime = le32_to_cpu(raw_inode->i_atime);
         inode->i_ctime = le32_to_cpu(raw_inode->i_ctime);
         inode->i_mtime = le32_to_cpu(raw_inode->i_mtime);
- inode->u.ext2_i.i_dtime = le32_to_cpu(raw_inode->i_dtime);
+ ei->i_dtime = le32_to_cpu(raw_inode->i_dtime);
         /* We now have enough fields to check if the inode was active or not.
          * This is needed because nfsd might try to access dead inodes
          * the test is that same one that e2fsck uses
          * NeilBrown 1999oct15
          */
- if (inode->i_nlink == 0 && (inode->i_mode == 0 || inode->u.ext2_i.i_dtime)) {
+ if (inode->i_nlink == 0 && (inode->i_mode == 0 || ei->i_dtime)) {
                 /* this inode is deleted */
                 brelse (bh);
                 goto bad_inode;
@@ -1035,30 +1039,29 @@
         inode->i_blksize = PAGE_SIZE; /* This is the optimal IO size (for stat), not the fs block size */
         inode->i_blocks = le32_to_cpu(raw_inode->i_blocks);
         inode->i_version = ++event;
- inode->u.ext2_i.i_flags = le32_to_cpu(raw_inode->i_flags);
- inode->u.ext2_i.i_faddr = le32_to_cpu(raw_inode->i_faddr);
- inode->u.ext2_i.i_frag_no = raw_inode->i_frag;
- inode->u.ext2_i.i_frag_size = raw_inode->i_fsize;
- inode->u.ext2_i.i_file_acl = le32_to_cpu(raw_inode->i_file_acl);
+ ei->i_flags = le32_to_cpu(raw_inode->i_flags);
+ ei->i_faddr = le32_to_cpu(raw_inode->i_faddr);
+ ei->i_frag_no = raw_inode->i_frag;
+ ei->i_frag_size = raw_inode->i_fsize;
+ ei->i_file_acl = le32_to_cpu(raw_inode->i_file_acl);
         if (S_ISDIR(inode->i_mode))
- inode->u.ext2_i.i_dir_acl = le32_to_cpu(raw_inode->i_dir_acl);
+ ei->i_dir_acl = le32_to_cpu(raw_inode->i_dir_acl);
         else {
- inode->u.ext2_i.i_high_size = le32_to_cpu(raw_inode->i_size_high);
+ ei->i_high_size = le32_to_cpu(raw_inode->i_size_high);
                 inode->i_size |= ((__u64)le32_to_cpu(raw_inode->i_size_high)) << 32;
         }
         inode->i_generation = le32_to_cpu(raw_inode->i_generation);
- inode->u.ext2_i.i_prealloc_count = 0;
- inode->u.ext2_i.i_block_group = block_group;
+ ei->i_prealloc_count = 0;
+ ei->i_block_group = (ino - 1) / EXT2_INODES_PER_GROUP(inode->i_sb);
 
         /*
          * NOTE! The in-memory inode i_data array is in little-endian order
          * even on big-endian machines: we do NOT byteswap the block numbers!
          */
         for (block = 0; block < EXT2_N_BLOCKS; block++)
- inode->u.ext2_i.i_data[block] = raw_inode->i_block[block];
+ ei->i_data[block] = raw_inode->i_block[block];
 
- if (inode->i_ino == EXT2_ACL_IDX_INO ||
- inode->i_ino == EXT2_ACL_DATA_INO)
+ if (ino == EXT2_ACL_IDX_INO || ino == EXT2_ACL_DATA_INO)
                 /* Nothing to do */ ;
         else if (S_ISREG(inode->i_mode)) {
                 inode->i_op = &ext2_file_inode_operations;
@@ -1106,70 +1109,33 @@
 {
         struct buffer_head * bh;
         struct ext2_inode * raw_inode;
- unsigned long block_group;
- unsigned long group_desc;
- unsigned long desc;
         unsigned long block;
- unsigned long offset;
         int err = 0;
- struct ext2_group_desc * gdp;
+ uid_t uid = inode->i_uid;
+ gid_t gid = inode->i_gid;
 
- if ((inode->i_ino != EXT2_ROOT_INO &&
- inode->i_ino < EXT2_FIRST_INO(inode->i_sb)) ||
- inode->i_ino > le32_to_cpu(inode->i_sb->u.ext2_sb.s_es->s_inodes_count)) {
- ext2_error (inode->i_sb, "ext2_write_inode",
- "bad inode number: %lu", inode->i_ino);
- return -EIO;
- }
- block_group = (inode->i_ino - 1) / EXT2_INODES_PER_GROUP(inode->i_sb);
- if (block_group >= inode->i_sb->u.ext2_sb.s_groups_count) {
- ext2_error (inode->i_sb, "ext2_write_inode",
- "group >= groups count");
+ raw_inode = ext2_get_inode(inode->i_sb, inode->i_ino, &bh);
+ if (IS_ERR(bh))
                 return -EIO;
- }
- group_desc = block_group >> EXT2_DESC_PER_BLOCK_BITS(inode->i_sb);
- desc = block_group & (EXT2_DESC_PER_BLOCK(inode->i_sb) - 1);
- bh = inode->i_sb->u.ext2_sb.s_group_desc[group_desc];
- if (!bh) {
- ext2_error (inode->i_sb, "ext2_write_inode",
- "Descriptor not loaded");
- return -EIO;
- }
- gdp = (struct ext2_group_desc *) bh->b_data;
- /*
- * Figure out the offset within the block group inode table
- */
- offset = ((inode->i_ino - 1) % EXT2_INODES_PER_GROUP(inode->i_sb)) *
- EXT2_INODE_SIZE(inode->i_sb);
- block = le32_to_cpu(gdp[desc].bg_inode_table) +
- (offset >> EXT2_BLOCK_SIZE_BITS(inode->i_sb));
- if (!(bh = bread (inode->i_dev, block, inode->i_sb->s_blocksize))) {
- ext2_error (inode->i_sb, "ext2_write_inode",
- "unable to read inode block - "
- "inode=%lu, block=%lu", inode->i_ino, block);
- return -EIO;
- }
- offset &= EXT2_BLOCK_SIZE(inode->i_sb) - 1;
- raw_inode = (struct ext2_inode *) (bh->b_data + offset);
 
         raw_inode->i_mode = cpu_to_le16(inode->i_mode);
         if(!(test_opt(inode->i_sb, NO_UID32))) {
- raw_inode->i_uid_low = cpu_to_le16(low_16_bits(inode->i_uid));
- raw_inode->i_gid_low = cpu_to_le16(low_16_bits(inode->i_gid));
+ raw_inode->i_uid_low = cpu_to_le16(low_16_bits(uid));
+ raw_inode->i_gid_low = cpu_to_le16(low_16_bits(gid));
 /*
  * Fix up interoperability with old kernels. Otherwise, old inodes get
  * re-used with the upper 16 bits of the uid/gid intact
  */
                 if(!inode->u.ext2_i.i_dtime) {
- raw_inode->i_uid_high = cpu_to_le16(high_16_bits(inode->i_uid));
- raw_inode->i_gid_high = cpu_to_le16(high_16_bits(inode->i_gid));
+ raw_inode->i_uid_high = cpu_to_le16(high_16_bits(uid));
+ raw_inode->i_gid_high = cpu_to_le16(high_16_bits(gid));
                 } else {
                         raw_inode->i_uid_high = 0;
                         raw_inode->i_gid_high = 0;
                 }
         } else {
- raw_inode->i_uid_low = cpu_to_le16(fs_high2lowuid(inode->i_uid));
- raw_inode->i_gid_low = cpu_to_le16(fs_high2lowgid(inode->i_gid));
+ raw_inode->i_uid_low = cpu_to_le16(fs_high2lowuid(uid));
+ raw_inode->i_gid_low = cpu_to_le16(fs_high2lowgid(gid));
                 raw_inode->i_uid_high = 0;
                 raw_inode->i_gid_high = 0;
         }
@@ -1218,8 +1184,7 @@
                 ll_rw_block (WRITE, 1, &bh);
                 wait_on_buffer (bh);
                 if (buffer_req(bh) && !buffer_uptodate(bh)) {
- printk ("IO error syncing ext2 inode ["
- "%s:%08lx]\n",
+ printk ("IO error syncing ext2 inode [%s:%08lx]\n",
                                 bdevname(inode->i_dev), inode->i_ino);
                         err = -EIO;
                 }

-
To unsubscribe from this list: send the line "unsubscribe linux-kernel" in
the body of a message to majordomo@vger.kernel.org
More majordomo info at http://vger.kernel.org/majordomo-info.html
Please read the FAQ at http://www.tux.org/lkml/



This archive was generated by hypermail 2b29 : Mon Apr 30 2001 - 21:00:25 EST