NFS: patch proposal (for kernel 2.2.13?) Please test...

Trond Myklebust (trond.myklebust@fys.uio.no)
30 Aug 1999 18:01:44 +0200


Hi,

The appended patch fixes the problem illustrated by the following
testcase (nfs2 is an NFS client, nfs1 is a different client or the
server).

nfs1> mkdir foo
nfs2> ls -sl foo
total 0
nfs1> rm -rf foo
nfs1> mkdir foo
nfs2> /bin/sh -c "( cd foo ; ps -ef > a ) & ( cd foo ; ps -ef > b )"
/bin/sh: a: Stale NFS file handle
/bin/sh: b: Stale NFS file handle

It is essentially a problem in dcache.c. The code in d_invalidate
tries to protect against somebody unhashing a dentry for a directory
that is in use. This violates the statelessness in NFS, since (as you
can see) anybody on another machine is allowed to mess about with the
directory structure.

In the appended patch I propose the following:

1) In nfs_revalidate_inode(), immediately unhash any stale dentry.

2) In d_invalidate() return OK if the dentry is already unhashed.

3) In nfs_lookup_revalidate() unhash any dentry that is stale
ourselves, thus ensuring a new lookup (even if the dentry is a
directory).

Also reintroduce the requirement that file handles always have to
match in order to pass nfs_revalidate_inode(). This requires a
small fix to the inode invalidation code to protect sockets over
NFS (which was why the d_count > 2 test was originally
introduced).

---- In addition (unrelated to the above testcase) ---

4) Relax the requirements on stale inodes in inode.c. The attempt to
compare inode->i_count to attr->nlink is very prone to races. The
current code can cause quite valid filehandles to be set to zero
if we have a single stale dentry that hasn't yet been dropped.

I also hope the new stale inode code will be a bit more
transparent.

5) For negative dentries, (as several people have suggested) we use
NFS_ATTRTIMEO(dir) to decide how often to check for new entries.
This allows the 'noac' crowd to flood the local network ;-)...

----

Please could people test this. In particular, I'm interested in
hearing from people with NFSroot setups, to hear if the stale inode
code is OK on devices, sockets (/tmp/.X11-unix/X0 et al.), and FIFOs.

Cheers,
Trond

NOTE: there are several remaining warning messages in dir.c and
inode.c that complain if 'inode->i_count > inode->i_nlink'. Please
ignore these messages.

--- fs/dcache.c-orig Mon Apr 26 08:17:56 1999
+++ fs/dcache.c Sun Aug 29 15:07:13 1999
@@ -168,6 +168,11 @@
int d_invalidate(struct dentry * dentry)
{
/*
+ * If it's already been dropped, return OK.
+ */
+ if (&list_empty(dentry->d_hash))
+ return 0;
+ /*
* Check whether to do a partial shrink_dcache
* to get rid of unused child entries.
*/
--- fs/nfs/dir.c-orig Mon Aug 9 21:04:57 1999
+++ fs/nfs/dir.c Sun Aug 29 15:07:13 1999
@@ -462,12 +462,8 @@
goto out_bad;

/* Filehandle matches? */
- if (memcmp(dentry->d_fsdata, &fhandle, sizeof(struct nfs_fh))) {
- if (!list_empty(&dentry->d_subdirs))
- shrink_dcache_parent(dentry);
- if (dentry->d_count < 2)
- goto out_bad;
- }
+ if (memcmp(dentry->d_fsdata, &fhandle, sizeof(struct nfs_fh)))
+ goto out_bad;

/* Ok, remeber that we successfully checked it.. */
nfs_renew_times(dentry);
@@ -476,6 +472,9 @@
out_valid:
return 1;
out_bad:
+ d_drop(dentry);
+ if (!list_empty(&dentry->d_subdirs))
+ shrink_dcache_parent(dentry);
if (dentry->d_parent->d_inode)
nfs_invalidate_dircache(dentry->d_parent->d_inode);
if (inode && S_ISDIR(inode->i_mode))
--- fs/nfs/inode.c-orig Mon Aug 9 21:05:05 1999
+++ fs/nfs/inode.c Sun Aug 29 15:22:29 1999
@@ -492,6 +492,43 @@
}

/*
+ * The following may seem pretty minimal, but the stateless nature
+ * of NFS means that we can't do too much more. Previous attempts to use
+ * fattr->nlink to determine how well the cached state matches the
+ * server suffer from races with stale dentries. You also risk killing
+ * off processes by just doing 'mv file newdir' on the server.
+ *
+ * FIXME: Of course, if 2 exported files have the same fileid (but
+ * different fsid which makes it legal) you're still buggered...
+ * Trond, August 1999.
+ */
+static int
+nfs_inode_is_stale(struct inode *inode, struct nfs_fattr *fattr)
+{
+ int unhashed;
+ int is_stale = 0;
+
+ if (inode->i_mode &&
+ (fattr->mode & S_IFMT) != (inode->i_mode & S_IFMT))
+ is_stale = 1;
+
+ if (is_bad_inode(inode))
+ is_stale = 1;
+
+ /*
+ * Free up unused cached dentries to see if it's wise to unhash
+ * the inode (which we can do if all the dentries have been unhashed).
+ */
+ unhashed = nfs_free_dentries(inode);
+
+ /* Assume we're holding 1 lock on the inode from 'iget' */
+ if (unhashed && inode->i_count == unhashed + 1)
+ is_stale = 1;
+
+ return is_stale;
+}
+
+/*
* This is our own version of iget that looks up inodes by file handle
* instead of inode number. We use this technique instead of using
* the vfs read_inode function because there is no way to pass the
@@ -550,53 +587,26 @@
static struct inode *
__nfs_fhget(struct super_block *sb, struct nfs_fattr *fattr)
{
- struct inode *inode;
- int max_count, stale_inode, unhashed = 0;
+ struct inode *inode = NULL;

-retry:
- inode = iget(sb, fattr->fileid);
- if (!inode)
+ if (!fattr)
goto out_no_inode;
- /* N.B. This should be impossible ... */
- if (inode->i_ino != fattr->fileid)
- goto out_bad_id;

- /*
- * Check for busy inodes, and attempt to get rid of any
- * unused local references. If successful, we release the
- * inode and try again.
- *
- * Note that the busy test uses the values in the fattr,
- * as the inode may have become a different object.
- * (We can probably handle modes changes here, too.)
- */
- stale_inode = inode->i_mode &&
- ((fattr->mode ^ inode->i_mode) & S_IFMT);
- stale_inode |= inode->i_count && inode->i_count == unhashed;
- max_count = S_ISDIR(fattr->mode) ? 1 : fattr->nlink;
- if (stale_inode || inode->i_count > max_count + unhashed) {
- dprintk("__nfs_fhget: inode %ld busy, i_count=%d, i_nlink=%d\n",
- inode->i_ino, inode->i_count, inode->i_nlink);
- unhashed = nfs_free_dentries(inode);
- if (stale_inode || inode->i_count > max_count + unhashed) {
- printk("__nfs_fhget: inode %ld still busy, i_count=%d\n",
- inode->i_ino, inode->i_count);
- if (!list_empty(&inode->i_dentry)) {
- struct dentry *dentry;
- dentry = list_entry(inode->i_dentry.next,
- struct dentry, d_alias);
- printk("__nfs_fhget: killing %s/%s filehandle\n",
- dentry->d_parent->d_name.name,
- dentry->d_name.name);
- memset(dentry->d_fsdata, 0,
- sizeof(struct nfs_fh));
- }
- remove_inode_hash(inode);
- nfs_invalidate_inode(inode);
- unhashed = 0;
- }
+ while (!inode) {
+ inode = iget(sb, fattr->fileid);
+ if (!inode)
+ goto out_no_inode;
+ /* N.B. This should be impossible ... */
+ if (inode->i_ino != fattr->fileid)
+ goto out_bad_id;
+
+ if (!nfs_inode_is_stale(inode,fattr))
+ break;
+
+ remove_inode_hash(inode);
+ nfs_invalidate_inode(inode);
iput(inode);
- goto retry;
+ inode = NULL;
}
nfs_fill_inode(inode, fattr);
dprintk("NFS: __nfs_fhget(%x/%ld ct=%d)\n",
@@ -751,6 +761,8 @@
fh = (u32 *) &fhandle;
dfprintk(PAGECACHE, " %08x%08x%08x%08x%08x%08x%08x%08x\n",
fh[0],fh[1],fh[2],fh[3],fh[4],fh[5],fh[6],fh[7]);
+ if (!IS_ROOT(dentry))
+ d_drop(dentry);
goto out;
}

-
To unsubscribe from this list: send the line "unsubscribe linux-kernel" in
the body of a message to majordomo@vger.rutgers.edu
Please read the FAQ at http://www.tux.org/lkml/