Re: [patch 1/2] ufd v1 - unsequential O(1) fdmap core

From: Eric Dumazet
Date: Tue Jun 05 2007 - 18:30:31 EST


Ingo Molnar a écrit :

No, I just wanted to demonstrate that one can be pretty nasty in on-lkml replies while being technically correct :-) I think you went a bit overboard in your replies to Davide. Let's move this back into constructive channels, ok? :)

In any case, here is one preliminary patch to show what I had in mind.

I only had time to compile it (it's very late here); it has not even been boot-tested, so don't try it!

[PATCH] reduce max latency of get_unused_fd().

The goal is to scan at most 4096 bytes (32768 bits) of the open_fds bitmap.

Processes that have many file descriptors might have to scan 128 KB of RAM to find a zero bit. That's about 100 us on modern machines.

This patch introduces an array of counters. Each counter gives the number of 'one' bits in a 4 KB section of the open_fds bitmap.

I chose to statically allocate this array of counters: it is very small (64 bytes), so a dynamic allocation would only add complexity.

As a result, the maximum latency is 4 us (the same latency as on x86 when the VM gives you a new page).

Signed-off-by: Eric Dumazet <dada1@xxxxxxxxxxxxx>
---
fs/fcntl.c | 18 +++++++++++++++---
fs/file.c | 6 +++---
fs/open.c | 13 +++++++++++++
include/linux/file.h | 11 +++++++++++
include/linux/fs.h | 2 --
include/linux/init_task.h | 1 +
kernel/fork.c | 1 +
7 files changed, 44 insertions(+), 8 deletions(-)

diff --git a/include/linux/file.h b/include/linux/file.h
index a59001e..ec6b120 100644
--- a/include/linux/file.h
+++ b/include/linux/file.h
@@ -36,6 +36,16 @@ struct fdtable {
};

/*
+ * To avoid big latencies in get_unused_fd(),
+ * we maintain counters of "one" bits in bitmap pages
+ * we define a 'page' here to contain 32768 bits,
+ * so that each counter is an unsigned short
+ * with MAX_NR_OPEN = 2^20, we get 32 counters : 64 bytes
+ */
+#define FDSBITS 32768
+#define MAX_NR_OPEN (1024*1024) /* Absolute upper limit on fd num */
+
+/*
* Open file table structure
*/
struct files_struct {
@@ -50,6 +60,7 @@ struct files_struct {
*/
spinlock_t file_lock ____cacheline_aligned_in_smp;
int next_fd;
+ unsigned short fds_counter[(MAX_NR_OPEN + (FDSBITS - 1)) / FDSBITS];
struct embedded_fd_set close_on_exec_init;
struct embedded_fd_set open_fds_init;
struct file * fd_array[NR_OPEN_DEFAULT];
diff --git a/fs/fcntl.c b/fs/fcntl.c
index 8e382a5..5257ba6 100644
--- a/fs/fcntl.c
+++ b/fs/fcntl.c
@@ -61,6 +61,7 @@ static int locate_fd(struct files_struct
unsigned int start;
int error;
struct fdtable *fdt;
+ unsigned int page_nr;

error = -EINVAL;
if (orig_start >= current->signal->rlim[RLIMIT_NOFILE].rlim_cur)
@@ -77,11 +78,19 @@ repeat:
start = files->next_fd;

newfd = start;
- if (start < fdt->max_fds)
+
+ error = -EMFILE;
+ if (start < fdt->max_fds) {
+ page_nr = start / FDSBITS;
+ while (files->fds_counter[page_nr] == FDSBITS) {
+ page_nr++;
+ start = page_nr * FDSBITS;
+ if (start >= fdt->max_fds)
+ goto out;
+ }
newfd = find_next_zero_bit(fdt->open_fds->fds_bits,
fdt->max_fds, start);
-
- error = -EMFILE;
+ }
if (newfd >= current->signal->rlim[RLIMIT_NOFILE].rlim_cur)
goto out;

@@ -122,6 +131,7 @@ static int dupfd(struct file *file, unsi
/* locate_fd() may have expanded fdtable, load the ptr */
fdt = files_fdtable(files);
FD_SET(fd, fdt->open_fds);
+ files->fds_counter[fd / FDSBITS]++;
FD_CLR(fd, fdt->close_on_exec);
spin_unlock(&files->file_lock);
fd_install(fd, file);
@@ -171,7 +181,9 @@ asmlinkage long sys_dup2(unsigned int ol

rcu_assign_pointer(fdt->fd[newfd], file);
FD_SET(newfd, fdt->open_fds);
+ files->fds_counter[newfd / FDSBITS]++;
FD_CLR(newfd, fdt->close_on_exec);
+
spin_unlock(&files->file_lock);

if (tofree)
diff --git a/fs/file.c b/fs/file.c
index c5575de..7dbd9c5 100644
--- a/fs/file.c
+++ b/fs/file.c
@@ -147,8 +147,8 @@ static struct fdtable * alloc_fdtable(un
nr /= (1024 / sizeof(struct file *));
nr = roundup_pow_of_two(nr + 1);
nr *= (1024 / sizeof(struct file *));
- if (nr > NR_OPEN)
- nr = NR_OPEN;
+ if (nr > MAX_NR_OPEN)
+ nr = MAX_NR_OPEN;

fdt = kmalloc(sizeof(struct fdtable), GFP_KERNEL);
if (!fdt)
@@ -233,7 +233,7 @@ int expand_files(struct files_struct *fi
if (nr < fdt->max_fds)
return 0;
/* Can we expand? */
- if (nr >= NR_OPEN)
+ if (nr >= MAX_NR_OPEN)
return -EMFILE;

/* All good, so we try */
diff --git a/fs/open.c b/fs/open.c
index 0d515d1..340e69b 100644
--- a/fs/open.c
+++ b/fs/open.c
@@ -860,12 +860,23 @@ int get_unused_fd(void)
struct files_struct * files = current->files;
int fd, error;
struct fdtable *fdt;
+ unsigned int page_nr;

error = -EMFILE;
spin_lock(&files->file_lock);

repeat:
fdt = files_fdtable(files);
+ page_nr = files->next_fd / FDSBITS;
+ /*
+ * We can avoid testing big chunks of memory if all bits are set
+ */
+ while (files->fds_counter[page_nr] == FDSBITS) {
+ page_nr++;
+ files->next_fd = page_nr * FDSBITS;
+ if (files->next_fd >= fdt->max_fds)
+ break;
+ }
fd = find_next_zero_bit(fdt->open_fds->fds_bits, fdt->max_fds,
files->next_fd);

@@ -891,6 +902,7 @@ repeat:
}

FD_SET(fd, fdt->open_fds);
+ files->fds_counter[fd / FDSBITS]++;
FD_CLR(fd, fdt->close_on_exec);
files->next_fd = fd + 1;
#if 1
@@ -913,6 +925,7 @@ static void __put_unused_fd(struct files
{
struct fdtable *fdt = files_fdtable(files);
__FD_CLR(fd, fdt->open_fds);
+ files->fds_counter[fd / FDSBITS]--;
if (fd < files->next_fd)
files->next_fd = fd;
}
diff --git a/include/linux/fs.h b/include/linux/fs.h
index b3ae77c..9db2799 100644
--- a/include/linux/fs.h
+++ b/include/linux/fs.h
@@ -20,8 +20,6 @@ #include <linux/ioctl.h>
*/

/* Fixed constants first: */
-#undef NR_OPEN
-#define NR_OPEN (1024*1024) /* Absolute upper limit on fd num */
#define INR_OPEN 1024 /* Initial setting for nfile rlimits */

#define BLOCK_SIZE_BITS 10
diff --git a/include/linux/init_task.h b/include/linux/init_task.h
index 276ccaa..bd24190 100644
--- a/include/linux/init_task.h
+++ b/include/linux/init_task.h
@@ -26,6 +26,7 @@ #define INIT_FILES \
.fdtab = INIT_FDTABLE, \
.file_lock = __SPIN_LOCK_UNLOCKED(init_task.file_lock), \
.next_fd = 0, \
+ .fds_counter = {0}, \
.close_on_exec_init = { { 0, } }, \
.open_fds_init = { { 0, } }, \
.fd_array = { NULL, } \
diff --git a/kernel/fork.c b/kernel/fork.c
index 73ad5cd..f4341c5 100644
--- a/kernel/fork.c
+++ b/kernel/fork.c
@@ -641,6 +641,7 @@ static struct files_struct *alloc_files(

spin_lock_init(&newf->file_lock);
newf->next_fd = 0;
+ memset(newf->fds_counter, 0, sizeof(newf->fds_counter));
fdt = &newf->fdtab;
fdt->max_fds = NR_OPEN_DEFAULT;
fdt->close_on_exec = (fd_set *)&newf->close_on_exec_init;