Re: [RESEND PATCH v2] kernfs: fix dentry unexpected skip

From: Eric W. Biederman
Date: Sat Jun 02 2018 - 13:25:42 EST


"Hatayama, Daisuke" <d.hatayama@xxxxxxxxxxxxxx> writes:

> kernfs_dir_next_pos() overlooks the situation that the dentry
> corresponding to a given pos object has already been inactive. Hence,
> when kernfs_dir_pos() returns the dentry with a hash value larger than
> the original one, kernfs_dir_next_pos() returns the dentry next to the
> one returned by kernfs_dir_pos(). As a result, the dentry returned by
> kernfs_dir_pos() is skipped.
>
> To fix this issue, try to find a next node only when the returned
> object is less than or equal to the original one.
>
> Note that this implementation focuses on getting guarantee that the
> existing nodes are never skipped, not interested in the other nodes
> that are added or removed during the process.
>
> We found this issue during a stress test that repeatedly reads
> /sys/module directory to get a list of the currently loaded kernel
> modules while repeatedly loading and unloading kernel modules
> simultaneously.
>
> v2: Fix the case where nodes with the same hash but with the name
> larger than the original node could still be skipped. Use
> kernfs_sd_compare() to compare kernfs_node objects. Imporove patch
> description.
>

Semantically this looks like the right fix.

The deep bug is that kernfs_dir_pos can always advance the position,
and the code has never looked for or handled that case. AKA just the
rbtree walk is enough to advance the position.

That said your implementation is buggy. It is not safe to call
kernfs_sd_compare on orig as kernfs_put has already been called on orig
and thus orig may already be free.

I suggest moving the kernfs_put for orig into kernfs_fop_readdir,
just before the mutex_unlock calls. That makes your kernfs_sd_compare
safe.

That would then allow moving the code to skip the next entry to also
be vmoed into kernfs_fop_readir.

Perhaps something like this:

diff --git a/fs/kernfs/dir.c b/fs/kernfs/dir.c
index 89d1dc19340b..2d0f71ffb539 100644
--- a/fs/kernfs/dir.c
+++ b/fs/kernfs/dir.c
@@ -1584,13 +1584,12 @@ static int kernfs_dir_fop_release(struct inode *inode, struct file *filp)
return 0;
}

-static struct kernfs_node *kernfs_dir_pos(const void *ns,
+static struct kernfs_node *kernfs_dir_pos(
struct kernfs_node *parent, loff_t hash, struct kernfs_node *pos)
{
if (pos) {
int valid = kernfs_active(pos) &&
pos->parent == parent && hash == pos->hash;
- kernfs_put(pos);
if (!valid)
pos = NULL;
}
@@ -1607,8 +1606,14 @@ static struct kernfs_node *kernfs_dir_pos(const void *ns,
break;
}
}
- /* Skip over entries which are dying/dead or in the wrong namespace */
- while (pos && (!kernfs_active(pos) || pos->ns != ns)) {
+ return pos;
+}
+
+static struct kernfs_node *kernfs_dir_next_pos(
+ struct kernfs_node *parent, ino_t ino, struct kernfs_node *start)
+{
+ struct kernfs_node *pos = kernfs_dir_pos(parent, ino, start);
+ if (pos && (kernfs_sd_compare(pos, start) <= 0)) {
struct rb_node *node = rb_next(&pos->rb);
if (!node)
pos = NULL;
@@ -1618,27 +1623,11 @@ static struct kernfs_node *kernfs_dir_pos(const void *ns,
return pos;
}

-static struct kernfs_node *kernfs_dir_next_pos(const void *ns,
- struct kernfs_node *parent, ino_t ino, struct kernfs_node *pos)
-{
- pos = kernfs_dir_pos(ns, parent, ino, pos);
- if (pos) {
- do {
- struct rb_node *node = rb_next(&pos->rb);
- if (!node)
- pos = NULL;
- else
- pos = rb_to_kn(node);
- } while (pos && (!kernfs_active(pos) || pos->ns != ns));
- }
- return pos;
-}
-
static int kernfs_fop_readdir(struct file *file, struct dir_context *ctx)
{
struct dentry *dentry = file->f_path.dentry;
struct kernfs_node *parent = kernfs_dentry_node(dentry);
- struct kernfs_node *pos = file->private_data;
+ struct kernfs_node *pos, *saved = file->private_data;
const void *ns = NULL;

if (!dir_emit_dots(file, ctx))
@@ -1648,23 +1637,30 @@ static int kernfs_fop_readdir(struct file *file, struct dir_context *ctx)
if (kernfs_ns_enabled(parent))
ns = kernfs_info(dentry->d_sb)->ns;

- for (pos = kernfs_dir_pos(ns, parent, ctx->pos, pos);
+ for (pos = kernfs_dir_pos(parent, ctx->pos, saved);
pos;
- pos = kernfs_dir_next_pos(ns, parent, ctx->pos, pos)) {
+ pos = kernfs_dir_next_pos(parent, ctx->pos, pos)) {
const char *name = pos->name;
unsigned int type = dt_type(pos);
int len = strlen(name);
ino_t ino = pos->id.ino;

- ctx->pos = pos->hash;
- file->private_data = pos;
- kernfs_get(pos);
+ /* Skip entries not fit for userspace */
+ if (!kernfs_active(pos) || !(pos->ns != ns))
+ continue;
+
+ kernfs_put(saved);
+ saved = pos;
+ ctx->pos = saved->hash;
+ file->private_data = saved;
+ kernfs_get(saved);

mutex_unlock(&kernfs_mutex);
if (!dir_emit(ctx, name, len, ino, type))
return 0;
mutex_lock(&kernfs_mutex);
}
+ kernfs_put(saved);
mutex_unlock(&kernfs_mutex);
file->private_data = NULL;
ctx->pos = INT_MAX;