Re: [PATCH v3] fuse: share lookup state between submount and its parent

From: Miklos Szeredi
Date: Thu Oct 19 2023 - 08:40:40 EST


On Wed, Oct 18, 2023 at 3:34 AM Krister Johansen
<kjlx@xxxxxxxxxxxxxxxxxx> wrote:
>
> Fuse submounts do not perform a lookup for the nodeid that they inherit
> from their parent. Instead, the code decrements the nlookup on the
> submount's fuse_inode when it is instantiated, and no forget is
> performed when a submount root is evicted.
>
> Trouble arises when the submount's parent is evicted despite the
> submount itself being in use. In this author's case, the submount was
> in a container and detached from the initial mount namespace via a
> MNT_DETACH operation. When memory pressure triggered the shrinker, the
> inode from the parent was evicted, which triggered enough forgets to
> render the submount's nodeid invalid.
>
> Since submounts should still function, even if their parent goes away,
> solve this problem by sharing refcounted state between the parent and
> its submount. When all of the references on this shared state reach
> zero, it's safe to forget the final lookup of the fuse nodeid.
>
> Signed-off-by: Krister Johansen <kjlx@xxxxxxxxxxxxxxxxxx>
> Cc: stable@xxxxxxxxxxxxxxx
> Fixes: 1866d779d5d2 ("fuse: Allow fuse_fill_super_common() for submounts")
> ---
> fs/fuse/fuse_i.h | 20 +++++++++++
> fs/fuse/inode.c | 88 ++++++++++++++++++++++++++++++++++++++++++++++--
> 2 files changed, 105 insertions(+), 3 deletions(-)
>
> diff --git a/fs/fuse/fuse_i.h b/fs/fuse/fuse_i.h
> index 405252bb51f2..0d1659c5016b 100644
> --- a/fs/fuse/fuse_i.h
> +++ b/fs/fuse/fuse_i.h
> @@ -63,6 +63,24 @@ struct fuse_forget_link {
> struct fuse_forget_link *next;
> };
>
> +/* Submount lookup tracking */
> +struct fuse_submount_lookup {
> + /** Refcount */
> + refcount_t count;
> +
> + /** Unique ID, which identifies the inode between userspace
> + * and kernel */
> + u64 nodeid;
> +
> + /** Number of lookups on this inode */
> + u64 nlookup;

sl->nlookup will always be one. So that can just be implicit and this
field can just go away.

> +
> + /** The request used for sending the FORGET message */
> + struct fuse_forget_link *forget;
> +
> + struct rcu_head rcu;

RCU would be needed if any fields could be accessed from RCU protected
code. But AFAICS there's no such access, so this shouldn't be needed.
Am I missing something?

> +};
> +
> /** FUSE inode */
> struct fuse_inode {
> /** Inode data */
> @@ -158,6 +176,8 @@ struct fuse_inode {
> */
> struct fuse_inode_dax *dax;
> #endif
> + /** Submount specific lookup tracking */
> + struct fuse_submount_lookup *submount_lookup;
> };
>
> /** FUSE inode state bits */
> diff --git a/fs/fuse/inode.c b/fs/fuse/inode.c
> index 444418e240c8..dc1499e2074f 100644
> --- a/fs/fuse/inode.c
> +++ b/fs/fuse/inode.c
> @@ -68,6 +68,24 @@ struct fuse_forget_link *fuse_alloc_forget(void)
> return kzalloc(sizeof(struct fuse_forget_link), GFP_KERNEL_ACCOUNT);
> }
>
> +static struct fuse_submount_lookup *fuse_alloc_submount_lookup(void)
> +{
> + struct fuse_submount_lookup *sl;
> +
> + sl = kzalloc(sizeof(struct fuse_submount_lookup), GFP_KERNEL_ACCOUNT);
> + if (!sl)
> + return NULL;
> + sl->forget = fuse_alloc_forget();
> + if (!sl->forget)
> + goto out_free;
> +
> + return sl;
> +
> +out_free:
> + kfree(sl);
> + return NULL;
> +}
> +
> static struct inode *fuse_alloc_inode(struct super_block *sb)
> {
> struct fuse_inode *fi;
> @@ -113,9 +131,24 @@ static void fuse_free_inode(struct inode *inode)
> kmem_cache_free(fuse_inode_cachep, fi);
> }
>
> +static void fuse_cleanup_submount_lookup(struct fuse_conn *fc,
> + struct fuse_submount_lookup *sl)
> +{
> + if (!refcount_dec_and_test(&sl->count))
> + return;
> +
> + if (sl->nlookup) {
> + fuse_queue_forget(fc, sl->forget, sl->nodeid, sl->nlookup);
> + sl->forget = NULL;
> + }
> + kfree(sl->forget);
> + kfree_rcu(sl, rcu);
> +}
> +
> static void fuse_evict_inode(struct inode *inode)
> {
> struct fuse_inode *fi = get_fuse_inode(inode);
> + struct fuse_submount_lookup *sl = NULL;
>
> /* Will write inode on close/munmap and in all other dirtiers */
> WARN_ON(inode->i_state & I_DIRTY_INODE);
> @@ -132,6 +165,15 @@ static void fuse_evict_inode(struct inode *inode)
> fi->nlookup);
> fi->forget = NULL;
> }
> +
> + spin_lock(&fi->lock);
> + if (fi->submount_lookup) {
> + sl = fi->submount_lookup;
> + fi->submount_lookup = NULL;
> + }
> + spin_unlock(&fi->lock);

I don't think locking is needed. Eviction happens only once and at
that point nobody else should be touching the inode.

> + if (sl)
> + fuse_cleanup_submount_lookup(fc, sl);
> }
> if (S_ISREG(inode->i_mode) && !fuse_is_bad(inode)) {
> WARN_ON(!list_empty(&fi->write_files));
> @@ -332,6 +374,14 @@ void fuse_change_attributes(struct inode *inode, struct fuse_attr *attr,
> fuse_dax_dontcache(inode, attr->flags);
> }
>
> +static void fuse_init_submount_lookup(struct fuse_submount_lookup *sl,
> + u64 nodeid)
> +{
> + sl->nodeid = nodeid;
> + sl->nlookup = 1;
> + refcount_set(&sl->count, 1);
> +}
> +
> static void fuse_init_inode(struct inode *inode, struct fuse_attr *attr,
> struct fuse_conn *fc)
> {
> @@ -395,12 +445,22 @@ struct inode *fuse_iget(struct super_block *sb, u64 nodeid,
> */
> if (fc->auto_submounts && (attr->flags & FUSE_ATTR_SUBMOUNT) &&
> S_ISDIR(attr->mode)) {
> + struct fuse_inode *fi;
> +
> inode = new_inode(sb);
> if (!inode)
> return NULL;
>
> fuse_init_inode(inode, attr, fc);
> - get_fuse_inode(inode)->nodeid = nodeid;
> + fi = get_fuse_inode(inode);
> + fi->nodeid = nodeid;
> + fi->submount_lookup = fuse_alloc_submount_lookup();
> + if (!fi->submount_lookup) {
> + iput(inode);
> + return NULL;
> + }
> + /* Sets nlookup = 1 on fi->submount_lookup->nlookup */
> + fuse_init_submount_lookup(fi->submount_lookup, nodeid);
> inode->i_flags |= S_AUTOMOUNT;
> goto done;
> }
> @@ -423,11 +483,11 @@ struct inode *fuse_iget(struct super_block *sb, u64 nodeid,
> iput(inode);
> goto retry;
> }
> -done:
> fi = get_fuse_inode(inode);
> spin_lock(&fi->lock);
> fi->nlookup++;
> spin_unlock(&fi->lock);
> +done:
> fuse_change_attributes(inode, attr, NULL, attr_valid, attr_version);
>
> return inode;
> @@ -1465,6 +1525,8 @@ static int fuse_fill_super_submount(struct super_block *sb,
> struct super_block *parent_sb = parent_fi->inode.i_sb;
> struct fuse_attr root_attr;
> struct inode *root;
> + struct fuse_submount_lookup *sl;
> + struct fuse_inode *fi;
>
> fuse_sb_defaults(sb);
> fm->sb = sb;
> @@ -1487,12 +1549,32 @@ static int fuse_fill_super_submount(struct super_block *sb,
> * its nlookup should not be incremented. fuse_iget() does
> * that, though, so undo it here.
> */
> - get_fuse_inode(root)->nlookup--;
> + fi = get_fuse_inode(root);
> + fi->nlookup--;
> +
> sb->s_d_op = &fuse_dentry_operations;
> sb->s_root = d_make_root(root);
> if (!sb->s_root)
> return -ENOMEM;
>
> + /*
> + * Grab the parent's submount_lookup pointer and take a
> + * reference on the shared nlookup from the parent. This is to
> + * prevent the last forget for this nodeid from getting
> + * triggered until all users have finished with it.
> + */
> + spin_lock(&parent_fi->lock);

Root has just been allocated, no locking needed.

> + sl = parent_fi->submount_lookup;
> + if (sl) {

WARN_ON(!sl);

Thanks,
Miklos