[PATCH 7/9] vfs: protect remounting superblock read-only

From: Miklos Szeredi
Date: Tue Oct 05 2010 - 06:33:12 EST


From: Miklos Szeredi <mszeredi@xxxxxxx>

Currently remouting superblock read-only is racy in a major way.

With the per mount read-only infrastructure it is now possible to
prevent most races, which this patch attempts.

Before starting the remount read-only, set MNT_WRITE_HOLD on all
mounts so that writes are held off until the remount either succeeds
or fails.

After this proceed with the remount and if successful mark all mounts
MNT_READONLY and release MNT_WRITE_HOLD. If unsuccessful, just
release MNT_WRITE_HOLD so that write operations can proceed normally.

Careful thought needs to be given to places where mnt_flags are set
such as do_add_mount() and clone_mnt() to ensure that a read-write
mount is not added to a read-only superblock.

Small races still remain because delayed writes due to nlink going to
zero but inode still having a reference are not dealt with by this
patch.

Signed-off-by: Miklos Szeredi <mszeredi@xxxxxxx>
---
fs/internal.h | 3 ++
fs/namespace.c | 70 ++++++++++++++++++++++++++++++++++++++++++++++++++++++++-
fs/super.c | 25 +++++++++++++++++---
3 files changed, 93 insertions(+), 5 deletions(-)

Index: linux-2.6/fs/internal.h
===================================================================
--- linux-2.6.orig/fs/internal.h 2010-10-01 20:40:41.000000000 +0200
+++ linux-2.6/fs/internal.h 2010-10-04 13:02:24.000000000 +0200
@@ -69,6 +69,9 @@ extern void mnt_set_mountpoint(struct vf
extern void release_mounts(struct list_head *);
extern void umount_tree(struct vfsmount *, int, struct list_head *);
extern struct vfsmount *copy_tree(struct vfsmount *, struct dentry *, int);
+extern int sb_prepare_remount_readonly(struct super_block *);
+extern void sb_cancel_remount_readonly(struct super_block *);
+extern void sb_finish_remount_readonly(struct super_block *);

extern void __init mnt_init(void);

Index: linux-2.6/fs/namespace.c
===================================================================
--- linux-2.6.orig/fs/namespace.c 2010-10-04 12:38:35.000000000 +0200
+++ linux-2.6/fs/namespace.c 2010-10-04 13:02:24.000000000 +0200
@@ -422,6 +422,61 @@ void sb_force_remount_readonly(struct su
}
EXPORT_SYMBOL(sb_force_remount_readonly);

+int sb_prepare_remount_readonly(struct super_block *sb)
+{
+ struct vfsmount *mnt;
+ int err = 0;
+
+ br_write_lock(vfsmount_lock);
+ list_for_each_entry(mnt, &sb->s_mounts, mnt_instance) {
+ if (!(mnt->mnt_flags & MNT_READONLY)) {
+ mnt->mnt_flags |= MNT_WRITE_HOLD;
+ smp_mb();
+ if (count_mnt_writers(mnt) > 0) {
+ err = -EBUSY;
+ break;
+ }
+ }
+ }
+ br_write_unlock(vfsmount_lock);
+
+ if (err)
+ sb_cancel_remount_readonly(sb);
+
+ return err;
+}
+
+void sb_cancel_remount_readonly(struct super_block *sb)
+{
+ struct vfsmount *mnt;
+
+ br_write_lock(vfsmount_lock);
+ list_for_each_entry(mnt, &sb->s_mounts, mnt_instance) {
+ if (mnt->mnt_flags & MNT_WRITE_HOLD)
+ mnt->mnt_flags &= ~MNT_WRITE_HOLD;
+ }
+
+ br_write_unlock(vfsmount_lock);
+ wake_up_all(&sb->s_wait_remount_readonly);
+}
+
+void sb_finish_remount_readonly(struct super_block *sb)
+{
+ struct vfsmount *mnt;
+
+ br_write_lock(vfsmount_lock);
+ list_for_each_entry(mnt, &sb->s_mounts, mnt_instance) {
+ if (!(mnt->mnt_flags & MNT_READONLY)) {
+ mnt->mnt_flags |= MNT_READONLY;
+ smp_wmb();
+ mnt->mnt_flags &= ~MNT_WRITE_HOLD;
+ }
+ }
+ sb->s_flags |= MS_RDONLY;
+ br_write_unlock(vfsmount_lock);
+ wake_up_all(&sb->s_wait_remount_readonly);
+}
+
void simple_set_mnt(struct vfsmount *mnt, struct super_block *sb)
{
mnt->mnt_sb = sb;
@@ -611,7 +666,7 @@ static struct vfsmount *clone_mnt(struct
goto out_free;
}

- mnt->mnt_flags = old->mnt_flags & ~MNT_WRITE_HOLD;
+ mnt->mnt_flags = old->mnt_flags & MNT_PROPAGATION_MASK;
atomic_inc(&sb->s_active);
mnt->mnt_sb = sb;
mnt->mnt_root = dget(root);
@@ -639,7 +694,13 @@ static struct vfsmount *clone_mnt(struct
list_add(&mnt->mnt_expire, &old->mnt_expire);
}

+ /*
+ * Add new mount to s_mounts. Do this after fiddling
+ * with propagation flags to prevent races with
+ * remount ro/rw.
+ */
br_write_lock(vfsmount_lock);
+ mnt->mnt_flags |= old->mnt_flags & ~MNT_PROPAGATION_MASK;
list_add_tail(&mnt->mnt_instance, &mnt->mnt_sb->s_mounts);
br_write_unlock(vfsmount_lock);
}
@@ -1804,7 +1865,14 @@ int do_add_mount(struct vfsmount *newmnt
if (S_ISLNK(newmnt->mnt_root->d_inode->i_mode))
goto unlock;

+ /* Locking is necessary to prevent racing with remount r/o */
+ br_read_lock(vfsmount_lock);
+ if (newmnt->mnt_sb->s_flags & MS_RDONLY)
+ mnt_flags |= MNT_READONLY;
+
newmnt->mnt_flags = mnt_flags;
+ br_read_unlock(vfsmount_lock);
+
if ((err = graft_tree(newmnt, path)))
goto unlock;

Index: linux-2.6/fs/super.c
===================================================================
--- linux-2.6.orig/fs/super.c 2010-10-04 12:38:35.000000000 +0200
+++ linux-2.6/fs/super.c 2010-10-04 13:02:24.000000000 +0200
@@ -574,18 +574,31 @@ int do_remount_sb(struct super_block *sb
/* If we are remounting RDONLY and current sb is read/write,
make sure there are no rw files opened */
if (remount_ro) {
- if (force)
+ if (force) {
mark_files_ro(sb);
- else if (!fs_may_remount_ro(sb))
- return -EBUSY;
+ } else {
+ retval = sb_prepare_remount_readonly(sb);
+ if (retval)
+ return retval;
+
+ retval = -EBUSY;
+ if (!fs_may_remount_ro(sb))
+ goto cancel_readonly;
+ }
}

if (sb->s_op->remount_fs) {
retval = sb->s_op->remount_fs(sb, &flags, data);
/* If forced remount, go ahead despite any errors */
- if (retval && !force)
+ if (retval && !force) {
+ if (remount_ro)
+ goto cancel_readonly;
return retval;
+ }
}
+ if (remount_ro && !force)
+ sb_finish_remount_readonly(sb);
+
sb->s_flags = (sb->s_flags & ~MS_RMT_MASK) | (flags & MS_RMT_MASK);

/*
@@ -599,6 +612,10 @@ int do_remount_sb(struct super_block *sb
if (remount_ro && sb->s_bdev)
invalidate_bdev(sb->s_bdev);
return 0;
+
+cancel_readonly:
+ sb_cancel_remount_readonly(sb);
+ return retval;
}

static void do_emergency_remount(struct work_struct *work)

--
--
To unsubscribe from this list: send the line "unsubscribe linux-kernel" in
the body of a message to majordomo@xxxxxxxxxxxxxxx
More majordomo info at http://vger.kernel.org/majordomo-info.html
Please read the FAQ at http://www.tux.org/lkml/