repost: dynamic fd array patch for 2.1.103

Bill Hawes (whawes@star.net)
Tue, 02 Jun 1998 15:17:20 -0400


This is a multi-part message in MIME format.
--------------337FD1A0EE568B2E614751C8
Content-Type: text/plain; charset=us-ascii
Content-Transfer-Encoding: 7bit

This is my previously-posted dynamic fd array patch in a clean diff
against 2.1.103. It provides substantial kernel memory savings
(typically 120K or more) and should provide faster forking speed as
well.

The patch was primarily intended to save memory and reduce forking
overhead, but some people have expressed an interest in using it to
extend the fd limit beyond the hard-wired 1024 (NR_OPEN) limit. It
should work as well for that, assuming that the other problems with
increasing NR_OPEN have been taken care of (libc issues, etc.). The patch
allows most processes to remain with the default sized fd array while
those that need more can expand to the NR_OPEN limit, so you can
increase NR_OPEN without taking a big hit on memory for every process.

For those interested in experimenting with increasing NR_OPEN, a couple
of things to watch:
(1) The patch uses kmalloc when the array needs to be larger than one
page, so you'll need to have enough contiguous memory to allocate this
way. (Presumably the machines you'll be using this with will have plenty
of memory.)

(2) There's an oddity with bash whereby it uses fd 255 for some internal
purposes, so processes descended from a bash shell will expand to
accommodate this. As you probably don't want all of these to inherit the
maximum size, setting the NR_OPEN_DEFAULT value to 256 is probably
advisable.

The patch (in its default form) is well tested, as I've been using it
here since about 2.1.90 or so with no problems. It's highly recommended
for anyone who wants to save memory or fork faster :-)

Regards,
Bill
--------------337FD1A0EE568B2E614751C8
Content-Type: text/plain; charset=us-ascii; name="fork_files103-patch"
Content-Transfer-Encoding: 7bit
Content-Disposition: inline; filename="fork_files103-patch"

--- linux-2.1.103/include/linux/sched.h.old Thu May 21 14:05:27 1998
+++ linux-2.1.103/include/linux/sched.h Thu May 21 15:07:48 1998
@@ -121,7 +121,7 @@

asmlinkage void schedule(void);

-
+#define NR_OPEN_DEFAULT 32
/*
* Open file table structure
*/
@@ -131,6 +131,7 @@
struct file ** fd; /* current fd array */
fd_set close_on_exec;
fd_set open_fds;
+ struct file * fd_array[NR_OPEN_DEFAULT];
};

#define INIT_FILES { \
@@ -138,7 +139,8 @@
NR_OPEN, \
&init_fd_array[0], \
{ { 0, } }, \
- { { 0, } } \
+ { { 0, } }, \
+ { NULL, } \
}

struct fs_struct {
@@ -586,6 +588,13 @@
mm->count++;
}
extern void mmput(struct mm_struct *);
+
+/*
+ * Routines for handling the fd arrays
+ */
+extern struct file ** alloc_fd_array(int);
+extern int expand_fd_array(struct files_struct *);
+extern void free_fd_array(struct file **, int);

extern int copy_thread(int, unsigned long, unsigned long, struct task_struct *, struct pt_regs *);
extern void flush_thread(void);
--- linux-2.1.103/kernel/fork.c.old Sun May 17 12:19:40 1998
+++ linux-2.1.103/kernel/fork.c Sun May 24 21:28:14 1998
@@ -378,11 +382,74 @@
return __copy_fdset(dst->fds_bits, src->fds_bits);
}

+/*
+ * Allocate an fd array, using get_free_page() if possible.
+ */
+struct file ** alloc_fd_array(int num)
+{
+ struct file **new_fds;
+ int size = num * sizeof(struct file *);
+
+ if (size == PAGE_SIZE)
+ new_fds = (struct file **) __get_free_page(GFP_KERNEL);
+ else
+ new_fds = (struct file **) kmalloc(size, GFP_KERNEL);
+ if (new_fds)
+ memset((void *) new_fds, 0, size);
+ return new_fds;
+}
+
+/*
+ * Expand the fd array in the files_struct.
+ */
+int expand_fd_array(struct files_struct *files)
+{
+ struct file **new_fds;
+ int error, nfds;
+
+ error = -EMFILE;
+ if (files->max_fds >= NR_OPEN)
+ goto out;
+
+ /* Expand to the max in one step */
+ nfds = NR_OPEN;
+
+ error = -ENOMEM;
+ new_fds = alloc_fd_array(nfds);
+ if (!new_fds)
+ goto out;
+
+ /* Copy the existing array and install the new pointer */
+ if (nfds > files->max_fds) {
+ int i;
+ for (i = files->max_fds; i--; )
+ new_fds[i] = files->fd[i];
+ files->fd = new_fds;
+ files->max_fds = nfds;
+ } else {
+ /* Somebody expanded the array while we slept ... */
+ free_fd_array(new_fds, nfds);
+ }
+ error = 0;
+out:
+ return error;
+}
+
+void free_fd_array(struct file **array, int num)
+{
+ int size = num * sizeof(struct file *);
+
+ if (size == PAGE_SIZE)
+ free_page((unsigned long) array);
+ else
+ kfree(array);
+}
+
static int copy_files(unsigned long clone_flags, struct task_struct * tsk)
{
struct files_struct *oldf, *newf;
struct file **old_fds, **new_fds;
- int size, i, error = 0;
+ int nfds, i, error = 0;

/*
* A background process may not have any files ...
@@ -402,24 +469,31 @@
if (!newf)
goto out;

- /*
- * Allocate the fd array, using get_free_page() if possible.
- * Eventually we want to make the array size variable ...
- */
- size = NR_OPEN * sizeof(struct file *);
- if (size == PAGE_SIZE)
- new_fds = (struct file **) __get_free_page(GFP_KERNEL);
- else
- new_fds = (struct file **) kmalloc(size, GFP_KERNEL);
- if (!new_fds)
- goto out_release;
- memset((void *) new_fds, 0, size);
-
newf->count = 1;
- newf->max_fds = NR_OPEN;
- newf->fd = new_fds;
newf->close_on_exec = oldf->close_on_exec;
i = copy_fdset(&newf->open_fds, &oldf->open_fds);
+#if 1
+ /* Do a sanity check ... */
+ if (i > oldf->max_fds)
+ printk("copy_files: pid %d, open files %d exceeds max %d!\n",
+ current->pid, i, oldf->max_fds);
+#endif
+
+ /*
+ * Check whether we need to allocate a larger fd array.
+ * Note: we're not a clone task, so the open count won't
+ * change.
+ */
+ new_fds = &newf->fd_array[0];
+ nfds = NR_OPEN_DEFAULT;
+ if (i > nfds) {
+ nfds = NR_OPEN;
+ new_fds = alloc_fd_array(nfds);
+ if (!new_fds)
+ goto out_release;
+ }
+ newf->max_fds = nfds;
+ newf->fd = new_fds;

old_fds = oldf->fd;
for (; i != 0; i--) {
--- linux-2.1.103/kernel/exit.c.old Sun May 17 12:21:08 1998
+++ linux-2.1.103/kernel/exit.c Thu May 21 15:07:38 1998
@@ -191,12 +191,10 @@
if (!--files->count) {
close_files(files);
/*
- * Free the fd array as appropriate ...
+ * Free the fd array if we expanded it.
*/
- if (NR_OPEN * sizeof(struct file *) == PAGE_SIZE)
- free_page((unsigned long) files->fd);
- else
- kfree(files->fd);
+ if (files->fd != &files->fd_array[0])
+ free_fd_array(files->fd, files->max_fds);
kmem_cache_free(files_cachep, files);
}
}
--- linux-2.1.103/fs/open.c.old Sun May 17 12:21:49 1998
+++ linux-2.1.103/fs/open.c Thu May 21 15:07:20 1998
@@ -689,6 +689,7 @@
struct files_struct * files = current->files;
int fd, error;

+repeat:
error = -EMFILE;
fd = find_first_zero_bit(&files->open_fds, NR_OPEN);
/*
@@ -697,8 +698,15 @@
*/
if (fd >= current->rlim[RLIMIT_NOFILE].rlim_cur)
goto out;
-
- /* Check here for fd > files->max_fds to do dynamic expansion */
+ /*
+ * Check whether we need to expand the fd array.
+ */
+ if (fd >= files->max_fds) {
+ error = expand_fd_array(files);
+ if (!error)
+ goto repeat;
+ goto out;
+ }

FD_SET(fd, &files->open_fds);
FD_CLR(fd, &files->close_on_exec);
--- linux-2.1.103/fs/fcntl.c.old Sun Mar 22 11:30:36 1998
+++ linux-2.1.103/fs/fcntl.c Thu May 21 15:07:24 1998
@@ -20,14 +20,15 @@

extern int sock_fcntl (struct file *, unsigned int cmd, unsigned long arg);

-static inline int dupfd(unsigned int fd, unsigned int arg)
+static inline int dupfd(unsigned int fd, unsigned int in_arg)
{
struct files_struct * files = current->files;
struct file * file;
+ unsigned int arg;
int error;

error = -EINVAL;
- if (arg >= NR_OPEN)
+ if (in_arg >= NR_OPEN)
goto out;

error = -EBADF;
@@ -35,10 +36,21 @@
if (!file)
goto out;

+repeat:
error = -EMFILE;
- arg = find_next_zero_bit(&files->open_fds, NR_OPEN, arg);
+ arg = find_next_zero_bit(&files->open_fds, NR_OPEN, in_arg);
if (arg >= current->rlim[RLIMIT_NOFILE].rlim_cur)
goto out_putf;
+ /*
+ * Check whether we need to expand the fd array.
+ */
+ if (arg >= files->max_fds) {
+ error = expand_fd_array(files);
+ if (!error)
+ goto repeat;
+ goto out_putf;
+ }
+
FD_SET(arg, &files->open_fds);
FD_CLR(arg, &files->close_on_exec);
fd_install(arg, file);
@@ -58,12 +70,12 @@
lock_kernel();
if (!fcheck(oldfd))
goto out;
+ if (newfd >= NR_OPEN)
+ goto out; /* following POSIX.1 6.2.1 */
+
err = newfd;
if (newfd == oldfd)
goto out;
- err = -EBADF;
- if (newfd >= NR_OPEN)
- goto out; /* following POSIX.1 6.2.1 */

sys_close(newfd);
err = dupfd(oldfd, newfd);
@@ -119,6 +131,7 @@
filp = fget(fd);
if (!filp)
goto out;
+
err = 0;
switch (cmd) {
case F_DUPFD:
@@ -159,7 +172,6 @@
err = filp->f_owner.pid;
break;
case F_SETOWN:
- err = 0;
filp->f_owner.pid = arg;
filp->f_owner.uid = current->uid;
filp->f_owner.euid = current->euid;
@@ -168,10 +180,9 @@
break;
default:
/* sockets need a few special fcntls. */
+ err = -EINVAL;
if (S_ISSOCK (filp->f_dentry->d_inode->i_mode))
err = sock_fcntl (filp, cmd, arg);
- else
- err = -EINVAL;
break;
}
fput(filp);
--- linux-2.1.103/fs/proc/array.c.old Sun May 17 12:19:37 1998
+++ linux-2.1.103/fs/proc/array.c Thu May 21 15:07:33 1998
@@ -656,11 +656,14 @@
"Pid:\t%d\n"
"PPid:\t%d\n"
"Uid:\t%d\t%d\t%d\t%d\n"
- "Gid:\t%d\t%d\t%d\t%d\n",
+ "Gid:\t%d\t%d\t%d\t%d\n"
+ "FDSize:\t%d\n",
get_task_state(p),
- p->pid, p->p_pptr->pid,
+ p->pid,
+ p->p_pptr->pid,
p->uid, p->euid, p->suid, p->fsuid,
- p->gid, p->egid, p->sgid, p->fsgid);
+ p->gid, p->egid, p->sgid, p->fsgid,
+ p->files ? p->files->max_fds : 0);
return buffer;
}

@@ -942,7 +945,8 @@
if (end > PGDIR_SIZE)
end = PGDIR_SIZE;
do {
- statm_pte_range(pmd, address, end - address, pages, shared, dirty, total);
+ statm_pte_range(pmd, address, end - address,
+ pages, shared, dirty, total);
address = (address + PMD_SIZE) & PMD_MASK;
pmd++;
} while (address < end);
@@ -952,7 +956,8 @@
int * pages, int * shared, int * dirty, int * total)
{
while (address < end) {
- statm_pmd_range(pgd, address, end - address, pages, shared, dirty, total);
+ statm_pmd_range(pgd, address, end - address,
+ pages, shared, dirty, total);
address = (address + PGDIR_SIZE) & PGDIR_MASK;
pgd++;
}
@@ -960,7 +965,7 @@

static int get_statm(int pid, char * buffer)
{
- struct task_struct *tsk = find_task_by_pid(pid);
+ struct task_struct *tsk;
int size=0, resident=0, share=0, trs=0, lrs=0, drs=0, dt=0;

read_lock(&tasklist_lock);
@@ -975,7 +980,8 @@
pgd_t *pgd = pgd_offset(tsk->mm, vma->vm_start);
int pages = 0, shared = 0, dirty = 0, total = 0;

- statm_pgd_range(pgd, vma->vm_start, vma->vm_end, &pages, &shared, &dirty, &total);
+ statm_pgd_range(pgd, vma->vm_start, vma->vm_end,
+ &pages, &shared, &dirty, &total);
resident += pages;
share += shared;
dt += dirty;

--------------337FD1A0EE568B2E614751C8--

-
To unsubscribe from this list: send the line "unsubscribe linux-kernel" in
the body of a message to majordomo@vger.rutgers.edu