Re: [PATCH v2 4/7] tracefs: dentry lookup crapectomy

From: Al Viro
Date: Wed Jan 31 2024 - 19:27:48 EST


On Wed, Jan 31, 2024 at 01:49:22PM -0500, Steven Rostedt wrote:

> @@ -329,32 +320,29 @@ static struct dentry *create_file(const char *name, umode_t mode,
>
> ti = get_tracefs(inode);
> ti->flags |= TRACEFS_EVENT_INODE;
> - d_instantiate(dentry, inode);
> +
> + d_add(dentry, inode);
> fsnotify_create(dentry->d_parent->d_inode, dentry);

Seriously? stat(2), have it go away from dcache on memory pressure,
lather, rinse, repeat... Won't *snotify get confused by the stream
of creations of the same thing, with not a removal in sight?

> - return eventfs_end_creating(dentry);
> + return dentry;
> };
>
> /**
> - * create_dir - create a dir in the tracefs filesystem
> + * lookup_dir_entry - look up a dir in the tracefs filesystem
> + * @dentry: the directory to look up
> * @ei: the eventfs_inode that represents the directory to create
> - * @parent: parent dentry for this file.
> *
> - * This function will create a dentry for a directory represented by
> + * This function will look up a dentry for a directory represented by
> * a eventfs_inode.
> */
> -static struct dentry *create_dir(struct eventfs_inode *ei, struct dentry *parent)
> +static struct dentry *lookup_dir_entry(struct dentry *dentry,
> + struct eventfs_inode *pei, struct eventfs_inode *ei)
> {
> struct tracefs_inode *ti;
> - struct dentry *dentry;
> struct inode *inode;
>
> - dentry = eventfs_start_creating(ei->name, parent);
> - if (IS_ERR(dentry))
> - return dentry;
> -
> inode = tracefs_get_inode(dentry->d_sb);
> if (unlikely(!inode))
> - return eventfs_failed_creating(dentry);
> + return ERR_PTR(-ENOMEM);
>
> /* If the user updated the directory's attributes, use them */
> update_inode_attr(dentry, inode, &ei->attr,
> @@ -371,11 +359,14 @@ static struct dentry *create_dir(struct eventfs_inode *ei, struct dentry *parent
> /* Only directories have ti->private set to an ei, not files */
> ti->private = ei;
>
> + dentry->d_fsdata = ei;
> + ei->dentry = dentry; // Remove me!
> +
> inc_nlink(inode);
> - d_instantiate(dentry, inode);
> + d_add(dentry, inode);
> inc_nlink(dentry->d_parent->d_inode);

What will happen when that thing gets evicted from dcache,
gets looked up again, and again, and...?

> fsnotify_mkdir(dentry->d_parent->d_inode, dentry);

Same re snotify confusion...

> - return eventfs_end_creating(dentry);
> + return dentry;
> }
>
> static void free_ei(struct eventfs_inode *ei)
> @@ -425,7 +416,7 @@ void eventfs_set_ei_status_free(struct tracefs_inode *ti, struct dentry *dentry)
> }
>
> /**
> - * create_file_dentry - create a dentry for a file of an eventfs_inode
> + * lookup_file_dentry - create a dentry for a file of an eventfs_inode
> * @ei: the eventfs_inode that the file will be created under
> * @idx: the index into the d_children[] of the @ei
> * @parent: The parent dentry of the created file.
> @@ -438,157 +429,21 @@ void eventfs_set_ei_status_free(struct tracefs_inode *ti, struct dentry *dentry)
> * address located at @e_dentry.
> */
> static struct dentry *
> -create_file_dentry(struct eventfs_inode *ei, int idx,
> - struct dentry *parent, const char *name, umode_t mode, void *data,
> +lookup_file_dentry(struct dentry *dentry,
> + struct eventfs_inode *ei, int idx,
> + umode_t mode, void *data,
> const struct file_operations *fops)
> {
> struct eventfs_attr *attr = NULL;
> struct dentry **e_dentry = &ei->d_children[idx];
> - struct dentry *dentry;
> -
> - WARN_ON_ONCE(!inode_is_locked(parent->d_inode));
>
> - mutex_lock(&eventfs_mutex);
> - if (ei->is_freed) {
> - mutex_unlock(&eventfs_mutex);
> - return NULL;
> - }
> - /* If the e_dentry already has a dentry, use it */
> - if (*e_dentry) {
> - dget(*e_dentry);
> - mutex_unlock(&eventfs_mutex);
> - return *e_dentry;
> - }
> -
> - /* ei->entry_attrs are protected by SRCU */
> if (ei->entry_attrs)
> attr = &ei->entry_attrs[idx];
>
> - mutex_unlock(&eventfs_mutex);
> -
> - dentry = create_file(name, mode, attr, parent, data, fops);
> -
> - mutex_lock(&eventfs_mutex);
> -
> - if (IS_ERR_OR_NULL(dentry)) {
> - /*
> - * When the mutex was released, something else could have
> - * created the dentry for this e_dentry. In which case
> - * use that one.
> - *
> - * If ei->is_freed is set, the e_dentry is currently on its
> - * way to being freed, don't return it. If e_dentry is NULL
> - * it means it was already freed.
> - */
> - if (ei->is_freed) {
> - dentry = NULL;
> - } else {
> - dentry = *e_dentry;
> - dget(dentry);
> - }
> - mutex_unlock(&eventfs_mutex);
> - return dentry;
> - }
> -
> - if (!*e_dentry && !ei->is_freed) {
> - *e_dentry = dentry;
> - dentry->d_fsdata = ei;
> - } else {
> - /*
> - * Should never happen unless we get here due to being freed.
> - * Otherwise it means two dentries exist with the same name.
> - */
> - WARN_ON_ONCE(!ei->is_freed);
> - dentry = NULL;
> - }
> - mutex_unlock(&eventfs_mutex);
> -
> - return dentry;
> -}
> -
> -/**
> - * eventfs_post_create_dir - post create dir routine
> - * @ei: eventfs_inode of recently created dir
> - *
> - * Map the meta-data of files within an eventfs dir to their parent dentry
> - */
> -static void eventfs_post_create_dir(struct eventfs_inode *ei)
> -{
> - struct eventfs_inode *ei_child;
> -
> - lockdep_assert_held(&eventfs_mutex);
> -
> - /* srcu lock already held */
> - /* fill parent-child relation */
> - list_for_each_entry_srcu(ei_child, &ei->children, list,
> - srcu_read_lock_held(&eventfs_srcu)) {
> - ei_child->d_parent = ei->dentry;
> - }
> -}
> -
> -/**
> - * create_dir_dentry - Create a directory dentry for the eventfs_inode
> - * @pei: The eventfs_inode parent of ei.
> - * @ei: The eventfs_inode to create the directory for
> - * @parent: The dentry of the parent of this directory
> - *
> - * This creates and attaches a directory dentry to the eventfs_inode @ei.
> - */
> -static struct dentry *
> -create_dir_dentry(struct eventfs_inode *pei, struct eventfs_inode *ei,
> - struct dentry *parent)
> -{
> - struct dentry *dentry = NULL;
> -
> - WARN_ON_ONCE(!inode_is_locked(parent->d_inode));
> -
> - mutex_lock(&eventfs_mutex);
> - if (pei->is_freed || ei->is_freed) {
> - mutex_unlock(&eventfs_mutex);
> - return NULL;
> - }
> - if (ei->dentry) {
> - /* If the eventfs_inode already has a dentry, use it */
> - dentry = ei->dentry;
> - dget(dentry);
> - mutex_unlock(&eventfs_mutex);
> - return dentry;
> - }
> - mutex_unlock(&eventfs_mutex);
> + dentry->d_fsdata = ei; // NOTE: ei of _parent_
> + lookup_file(dentry, mode, attr, data, fops);
>
> - dentry = create_dir(ei, parent);
> -
> - mutex_lock(&eventfs_mutex);
> -
> - if (IS_ERR_OR_NULL(dentry) && !ei->is_freed) {
> - /*
> - * When the mutex was released, something else could have
> - * created the dentry for this e_dentry. In which case
> - * use that one.
> - *
> - * If ei->is_freed is set, the e_dentry is currently on its
> - * way to being freed.
> - */
> - dentry = ei->dentry;
> - if (dentry)
> - dget(dentry);
> - mutex_unlock(&eventfs_mutex);
> - return dentry;
> - }
> -
> - if (!ei->dentry && !ei->is_freed) {
> - ei->dentry = dentry;
> - eventfs_post_create_dir(ei);
> - dentry->d_fsdata = ei;
> - } else {
> - /*
> - * Should never happen unless we get here due to being freed.
> - * Otherwise it means two dentries exist with the same name.
> - */
> - WARN_ON_ONCE(!ei->is_freed);
> - dentry = NULL;
> - }
> - mutex_unlock(&eventfs_mutex);
> + *e_dentry = dentry; // Remove me
>
> return dentry;
> }
> @@ -607,79 +462,55 @@ static struct dentry *eventfs_root_lookup(struct inode *dir,
> struct dentry *dentry,
> unsigned int flags)
> {
> - const struct file_operations *fops;
> - const struct eventfs_entry *entry;
> struct eventfs_inode *ei_child;
> struct tracefs_inode *ti;
> struct eventfs_inode *ei;
> - struct dentry *ei_dentry = NULL;
> - struct dentry *ret = NULL;
> - struct dentry *d;
> const char *name = dentry->d_name.name;
> - umode_t mode;
> - void *data;
> - int idx;
> - int i;
> - int r;
> + struct dentry *result = NULL;
>
> ti = get_tracefs(dir);
> if (!(ti->flags & TRACEFS_EVENT_INODE))

Can that ever happen? I mean, why set ->i_op to something that
has this for ->lookup() on a directory without TRACEFS_EVENT_INODE in
its inode? It's not as if you ever removed that flag...

> - return NULL;
> -
> - /* Grab srcu to prevent the ei from going away */
> - idx = srcu_read_lock(&eventfs_srcu);
> + return ERR_PTR(-EIO);
>
> - /*
> - * Grab the eventfs_mutex to consistent value from ti->private.
> - * This s
> - */
> mutex_lock(&eventfs_mutex);
> - ei = READ_ONCE(ti->private);
> - if (ei && !ei->is_freed)
> - ei_dentry = READ_ONCE(ei->dentry);
> - mutex_unlock(&eventfs_mutex);
> -
> - if (!ei || !ei_dentry)
> - goto out;
>
> - data = ei->data;
> + ei = ti->private;
> + if (!ei || ei->is_freed)
> + goto enoent;
>
> - list_for_each_entry_srcu(ei_child, &ei->children, list,
> - srcu_read_lock_held(&eventfs_srcu)) {
> + list_for_each_entry(ei_child, &ei->children, list) {
> if (strcmp(ei_child->name, name) != 0)
> continue;
> - ret = simple_lookup(dir, dentry, flags);
> - if (IS_ERR(ret))
> - goto out;
> - d = create_dir_dentry(ei, ei_child, ei_dentry);
> - dput(d);
> + if (ei_child->is_freed)
> + goto enoent;

Out of curiosity - can that happen now? You've got exclusion with
eventfs_remove_rec(), so you shouldn't be able to catch the moment
between setting ->is_freed and removal from the list...

> + lookup_dir_entry(dentry, ei, ei_child);
> goto out;
> }
>
> - for (i = 0; i < ei->nr_entries; i++) {
> - entry = &ei->entries[i];
> - if (strcmp(name, entry->name) == 0) {
> - void *cdata = data;
> - mutex_lock(&eventfs_mutex);
> - /* If ei->is_freed, then the event itself may be too */
> - if (!ei->is_freed)
> - r = entry->callback(name, &mode, &cdata, &fops);
> - else
> - r = -1;
> - mutex_unlock(&eventfs_mutex);
> - if (r <= 0)
> - continue;
> - ret = simple_lookup(dir, dentry, flags);
> - if (IS_ERR(ret))
> - goto out;
> - d = create_file_dentry(ei, i, ei_dentry, name, mode, cdata, fops);
> - dput(d);
> - break;
> - }
> + for (int i = 0; i < ei->nr_entries; i++) {
> + void *data;
> + umode_t mode;
> + const struct file_operations *fops;
> + const struct eventfs_entry *entry = &ei->entries[i];
> +
> + if (strcmp(name, entry->name) != 0)
> + continue;
> +
> + data = ei->data;
> + if (entry->callback(name, &mode, &data, &fops) <= 0)
> + goto enoent;
> +
> + lookup_file_dentry(dentry, ei, i, mode, data, fops);
> + goto out;
> }
> +
> + enoent:
> + /* Don't cache negative lookups, just return an error */
> + result = ERR_PTR(-ENOENT);

Huh? Just return NULL and be done with that - you'll get an
unhashed negative dentry and let the caller turn that into
-ENOENT...

> out:
> - srcu_read_unlock(&eventfs_srcu, idx);
> - return ret;
> + mutex_unlock(&eventfs_mutex);
> + return result;
> }
>
> /*