[PATCH] files: rcu free files_struct

From: Eric W. Biederman
Date: Wed Dec 09 2020 - 13:06:23 EST



This makes it safe to deference task->files with just rcu_read_lock
held, removing the need to take task_lock.

Signed-off-by: Eric W. Biederman <ebiederm@xxxxxxxxxxxx>
---

I have cleaned up my patch a little more and made this a little
more explicitly rcu. I took a completley different approach to
documenting the use of rcu_head than we described that seems a little
more robust.

fs/file.c | 53 ++++++++++++++++++++++++++---------------
include/linux/fdtable.h | 1 +
kernel/fork.c | 4 ++--
3 files changed, 37 insertions(+), 21 deletions(-)

diff --git a/fs/file.c b/fs/file.c
index 412033d8cfdf..88064f185560 100644
--- a/fs/file.c
+++ b/fs/file.c
@@ -379,7 +379,7 @@ struct files_struct *dup_fd(struct files_struct *oldf, unsigned int max_fds, int
return NULL;
}

-static struct fdtable *close_files(struct files_struct * files)
+static void close_files(struct files_struct * files)
{
/*
* It is safe to dereference the fd table without RCU or
@@ -397,8 +397,9 @@ static struct fdtable *close_files(struct files_struct * files)
set = fdt->open_fds[j++];
while (set) {
if (set & 1) {
- struct file * file = xchg(&fdt->fd[i], NULL);
+ struct file * file = fdt->fd[i];
if (file) {
+ rcu_assign_pointer(fdt->fd[i], NULL);
filp_close(file, files);
cond_resched();
}
@@ -407,19 +408,35 @@ static struct fdtable *close_files(struct files_struct * files)
set >>= 1;
}
}
+}

- return fdt;
+static void free_files_struct_rcu(struct rcu_head *rcu)
+{
+ struct files_struct *files =
+ container_of(rcu, struct files_struct, fdtab.rcu);
+ struct fdtable *fdt = rcu_dereference_raw(files->fdt);
+
+ /* free the arrays if they are not embedded */
+ if (fdt != &files->fdtab)
+ __free_fdtable(fdt);
+ kmem_cache_free(files_cachep, files);
}

void put_files_struct(struct files_struct *files)
{
if (atomic_dec_and_test(&files->count)) {
- struct fdtable *fdt = close_files(files);
-
- /* free the arrays if they are not embedded */
- if (fdt != &files->fdtab)
- __free_fdtable(fdt);
- kmem_cache_free(files_cachep, files);
+ close_files(files);
+ /*
+ * The rules for using the rcu_head in fdtable:
+ *
+ * If the fdtable is being freed independently
+ * of the files_struct fdtable->rcu is used.
+ *
+ * When the fdtable and files_struct are freed
+ * together the rcu_head from the fdtable embedded in
+ * files_struct is used.
+ */
+ call_rcu(&files->fdtab.rcu, free_files_struct_rcu);
}
}

@@ -822,12 +839,14 @@ EXPORT_SYMBOL(fget_raw);

struct file *fget_task(struct task_struct *task, unsigned int fd)
{
+ struct files_struct *files;
struct file *file = NULL;

- task_lock(task);
- if (task->files)
- file = __fget_files(task->files, fd, 0, 1);
- task_unlock(task);
+ rcu_read_lock();
+ files = rcu_dereference(task->files);
+ if (files)
+ file = __fget_files(files, fd, 0, 1);
+ rcu_read_unlock();

return file;
}
@@ -838,11 +857,9 @@ struct file *task_lookup_fd_rcu(struct task_struct *task, unsigned int fd)
struct files_struct *files;
struct file *file = NULL;

- task_lock(task);
- files = task->files;
+ files = rcu_dereference(task->files);
if (files)
file = files_lookup_fd_rcu(files, fd);
- task_unlock(task);

return file;
}
@@ -854,8 +871,7 @@ struct file *task_lookup_next_fd_rcu(struct task_struct *task, unsigned int *ret
unsigned int fd = *ret_fd;
struct file *file = NULL;

- task_lock(task);
- files = task->files;
+ files = rcu_dereference(task->files);
if (files) {
for (; fd < files_fdtable(files)->max_fds; fd++) {
file = files_lookup_fd_rcu(files, fd);
@@ -863,7 +879,6 @@ struct file *task_lookup_next_fd_rcu(struct task_struct *task, unsigned int *ret
break;
}
}
- task_unlock(task);
*ret_fd = fd;
return file;
}
diff --git a/include/linux/fdtable.h b/include/linux/fdtable.h
index d0e78174874a..37dcface020f 100644
--- a/include/linux/fdtable.h
+++ b/include/linux/fdtable.h
@@ -30,6 +30,7 @@ struct fdtable {
unsigned long *close_on_exec;
unsigned long *open_fds;
unsigned long *full_fds_bits;
+ /* Used for freeing fdtable and any containing files_struct */
struct rcu_head rcu;
};

diff --git a/kernel/fork.c b/kernel/fork.c
index 4d0ae6f827df..eca474a1586a 100644
--- a/kernel/fork.c
+++ b/kernel/fork.c
@@ -2982,7 +2982,7 @@ int ksys_unshare(unsigned long unshare_flags)

if (new_fd) {
fd = current->files;
- current->files = new_fd;
+ rcu_assign_pointer(current->files, new_fd);
new_fd = fd;
}

@@ -3035,7 +3035,7 @@ int unshare_files(void)

old = task->files;
task_lock(task);
- task->files = copy;
+ rcu_assign_pointer(task->files, copy);
task_unlock(task);
put_files_struct(old);
return 0;
--
2.20.1