[RFC] 6 sys_poll/sys_select performance patches

From: Manfred Spraul (manfred@colorfullife.com)
Date: Wed Nov 20 2002 - 18:00:08 EST


Attached are 6 patches that try to improve the performance of sys_poll
and sys_select:

- avoid dynamic memory allocations: stack storage is sufficient for most
callers, and faster. A userspace sketch of the pattern follows this list.
- use wait queue wakeup callbacks, and use the info they record to speed up
the 2nd scan for new events.
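For illustration, here is a minimal userspace sketch of the stack-first
allocation idea - all names are invented for the sketch; in the patches the
same trick appears as select_bits_alloc()/select_bits_free() and as the
6-entry internal[] array in struct poll_wqueues:

#include <stdio.h>
#include <stdlib.h>

#define INLINE_BYTES 32

/* serve small requests from a caller-provided stack buffer,
 * fall back to the heap only for large ones */
static void *bits_alloc(size_t size, void *internal)
{
        if (size <= INLINE_BYTES)
                return internal;        /* common case: no allocation */
        return malloc(size);            /* rare case: large request */
}

static void bits_free(void *bits, void *internal)
{
        if (bits != internal)
                free(bits);
}

int main(void)
{
        char ibuf[INLINE_BYTES];
        void *p = bits_alloc(16, ibuf);

        printf("served from the stack: %s\n",
                p == (void *)ibuf ? "yes" : "no");
        bits_free(p, ibuf);
        return 0;
}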

What do you think? Are there any apps/tests/benchmarks that stress
sys_poll or sys_select?
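In case nothing standard exists: a crude loop like the one below at least
exercises the sys_poll entry, copy-in and scan paths. It is illustrative
only, not a real benchmark - with a timeout of 0 the wait queues are never
set up, so the callback changes in the later patches also need a blocking
test:

#include <poll.h>
#include <stdio.h>
#include <unistd.h>
#include <sys/time.h>

#define NFDS  128
#define LOOPS 10000

int main(void)
{
        static struct pollfd pfd[NFDS];
        struct timeval start, stop;
        int i, fd[2];

        for (i = 0; i < NFDS; i++) {
                if (pipe(fd)) {
                        perror("pipe");
                        return 1;
                }
                pfd[i].fd = fd[0];      /* read end stays idle forever */
                pfd[i].events = POLLIN;
        }
        gettimeofday(&start, NULL);
        for (i = 0; i < LOOPS; i++)
                poll(pfd, NFDS, 0);     /* nonblocking scan of all fds */
        gettimeofday(&stop, NULL);
        printf("%ld usec for %d polls of %d fds\n",
                (stop.tv_sec - start.tv_sec) * 1000000L +
                (stop.tv_usec - start.tv_usec), LOOPS, NFDS);
        return 0;
}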

The first 3 patches replace dynamic memory allocations with stack storage.
The 4th and 5th patches use wait queue callbacks for a more efficient 2nd
scan; a userspace sketch of that idea follows this list.
The 6th patch merges common code.
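The callback idea in a nutshell, as a userspace sketch (names invented; the
kernel counterparts in the patches are poll_wake_func() and scan_entries()):
the wakeup callback marks its entry, so the rescan can skip the expensive
->poll call for every descriptor that was not woken.

#include <stdio.h>

struct entry {
        int fd;
        int woken;      /* set by the "wakeup callback" */
};

/* stand-in for poll_wake_func(): runs when fd gets new data */
static void wake(struct entry *e)
{
        e->woken = 1;
}

/* the 2nd scan: only woken entries are rechecked */
static int scan(struct entry *e, int n)
{
        int i, ready = 0;

        for (i = 0; i < n; i++) {
                if (!e[i].woken)
                        continue;       /* skip: no re-poll needed */
                e[i].woken = 0;
                /* here the kernel recheck would call file->f_op->poll() */
                ready++;
        }
        return ready;
}

int main(void)
{
        static struct entry tab[1000];

        tab[42].fd = 42;
        wake(&tab[42]); /* a driver fired its wait queue for fd 42 */
        printf("ready after rescan: %d\n", scan(tab, 1000));
        return 0;
}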

The patches are against 2.5.48 - they boot on my laptop, but that's all
I can guarantee.

--
    Manfred

// $Header$
// Kernel Version:
//  VERSION = 2
//  PATCHLEVEL = 5
//  SUBLEVEL = 48
//  EXTRAVERSION =
--- 2.5/include/linux/poll.h 2002-11-19 23:07:35.000000000 +0100
+++ build-2.5/include/linux/poll.h 2002-11-19 23:12:11.000000000 +0100
@@ -35,10 +35,20 @@
 /*
  * Structures and helpers for sys_poll/sys_poll
  */
+struct poll_table_entry {
+        struct file *filp;
+        wait_queue_t wait;
+        wait_queue_head_t *wait_address;
+};
+
+#define POLL_TABLE_INTERNAL 6
+
 struct poll_wqueues {
         poll_table pt;
-        struct poll_table_page * table;
         int error;
+        int nr;
+        struct poll_table_entry internal[POLL_TABLE_INTERNAL];
+        struct poll_table_page *table;
 };
 
 extern void poll_initwait(struct poll_wqueues *pwq);
--- 2.5/fs/select.c 2002-11-19 23:07:35.000000000 +0100
+++ build-2.5/fs/select.c 2002-11-19 23:10:50.000000000 +0100
@@ -26,12 +26,6 @@
 #define ROUND_UP(x,y) (((x)+(y)-1)/(y))
 #define DEFAULT_POLLMASK (POLLIN | POLLOUT | POLLRDNORM | POLLWRNORM)
 
-struct poll_table_entry {
-        struct file * filp;
-        wait_queue_t wait;
-        wait_queue_head_t * wait_address;
-};
-
 struct poll_table_page {
         struct poll_table_page * next;
         struct poll_table_entry * entry;
@@ -53,18 +47,36 @@
  * as all select/poll functions have to call it to add an entry to the
  * poll table.
  */
-void __pollwait(struct file *filp, wait_queue_head_t *wait_address, poll_table *p);
+static void __pollwait(struct file *filp, wait_queue_head_t *wait_address, poll_table *p);
 
 void poll_initwait(struct poll_wqueues *pwq)
 {
         init_poll_funcptr(&pwq->pt, __pollwait);
         pwq->error = 0;
+        pwq->nr = 0;
         pwq->table = NULL;
 }
 
+/*
+ * Dynamic memory allocation is expensive, avoid it by
+ * saving a few bytes on the stack:
+ * - the poll table contains 6 wait queue entries. This means that no dynamic
+ *   memory allocation is necessary for the wait queues if one of the first
+ *   6 file descriptors has new data.
+ * <manfred@colorfullife.com>
+ */
 void poll_freewait(struct poll_wqueues *pwq)
 {
         struct poll_table_page * p = pwq->table;
+        struct poll_table_entry *entry;
+
+        entry = pwq->internal + pwq->nr;
+        while(pwq->nr > 0) {
+                pwq->nr--;
+                entry--;
+                remove_wait_queue(entry->wait_address,&entry->wait);
+                fput(entry->filp);
+        }
         while (p) {
                 struct poll_table_entry * entry;
                 struct poll_table_page *old;
@@ -77,43 +89,46 @@
                 } while (entry > p->entries);
                 old = p;
                 p = p->next;
-                free_page((unsigned long) old);
+                kfree(old);
         }
 }
 
-void __pollwait(struct file *filp, wait_queue_head_t *wait_address, poll_table *_p)
+static void __pollwait(struct file *filp, wait_queue_head_t *wait_address, poll_table *_p)
 {
         struct poll_wqueues *p = container_of(_p, struct poll_wqueues, pt);
-        struct poll_table_page *table = p->table;
-
-        if (!table || POLL_TABLE_FULL(table)) {
-                struct poll_table_page *new_table;
-
-                new_table = (struct poll_table_page *) __get_free_page(GFP_KERNEL);
-                if (!new_table) {
-                        p->error = -ENOMEM;
-                        __set_current_state(TASK_RUNNING);
-                        return;
+        struct poll_table_entry *entry;
+
+        if(p->nr < POLL_TABLE_INTERNAL) {
+                entry = p->internal+p->nr++;
+        } else {
+                struct poll_table_page *table = p->table;
+
+                if (!table || POLL_TABLE_FULL(table)) {
+                        struct poll_table_page *new_table;
+
+                        new_table = kmalloc(PAGE_SIZE, GFP_KERNEL);
+                        if (!new_table) {
+                                p->error = -ENOMEM;
+                                __set_current_state(TASK_RUNNING);
+                                return;
+                        }
+                        new_table->entry = new_table->entries;
+                        new_table->next = table;
+                        p->table = new_table;
+                        table = new_table;
                 }
-                new_table->entry = new_table->entries;
-                new_table->next = table;
-                p->table = new_table;
-                table = new_table;
+                entry = table->entry;
+                table->entry = entry+1;
         }
 
         /* Add a new entry */
-        {
-                struct poll_table_entry * entry = table->entry;
-                table->entry = entry+1;
-                get_file(filp);
-                entry->filp = filp;
-                entry->wait_address = wait_address;
-                init_waitqueue_entry(&entry->wait, current);
-                add_wait_queue(wait_address,&entry->wait);
-        }
+        get_file(filp);
+        entry->filp = filp;
+        entry->wait_address = wait_address;
+        init_waitqueue_entry(&entry->wait, current);
+        add_wait_queue(wait_address,&entry->wait);
 }
-
 #define __IN(fds, n) (fds->in + n)
 #define __OUT(fds, n) (fds->out + n)
 #define __EX(fds, n) (fds->ex + n)

// $Header$
// Kernel Version:
//  VERSION = 2
//  PATCHLEVEL = 5
//  SUBLEVEL = 48
//  EXTRAVERSION =
--- 2.5/fs/select.c 2002-11-20 19:08:58.000000000 +0100
+++ build-2.5/fs/select.c 2002-11-20 19:00:11.000000000 +0100
@@ -62,6 +62,7 @@
  * - the poll table contains 6 wait queue entries. This means that no dynamic
  *   memory allocation is necessary for the wait queues if one of the first
  *   6 file descriptors has new data.
+ * - sys_select saves 192 bytes on the stack, enough for 256 file descriptors.
  * <manfred@colorfullife.com>
  */
 
@@ -261,14 +262,18 @@
         return retval;
 }
 
-static void *select_bits_alloc(int size)
+#define SELECT_INLINE_BYTES 32
+static inline void *select_bits_alloc(int size, void *internal)
 {
+        if(size <= SELECT_INLINE_BYTES)
+                return internal;
         return kmalloc(6 * size, GFP_KERNEL);
 }
 
-static void select_bits_free(void *bits, int size)
+static inline void select_bits_free(void *bits, void *internal)
 {
-        kfree(bits);
+        if(bits != internal)
+                kfree(bits);
 }
 
 /*
@@ -286,6 +291,7 @@
 sys_select(int n, fd_set *inp, fd_set *outp, fd_set *exp, struct timeval *tvp)
 {
         fd_set_bits fds;
+        char ibuf[6*SELECT_INLINE_BYTES];
         char *bits;
         long timeout;
         int ret, size, max_fdset;
@@ -325,7 +331,7 @@
          */
         ret = -ENOMEM;
         size = FDS_BYTES(n);
-        bits = select_bits_alloc(size);
+        bits = select_bits_alloc(size, ibuf);
         if (!bits)
                 goto out_nofds;
         fds.in = (unsigned long *) bits;
@@ -370,7 +376,7 @@
                 set_fd_set(n, exp, fds.res_ex);
 
 out:
-        select_bits_free(bits, size);
+        select_bits_free(bits, ibuf);
 out_nofds:
         return ret;
 }

// $Header$
// Kernel Version:
//  VERSION = 2
//  PATCHLEVEL = 5
//  SUBLEVEL = 48
//  EXTRAVERSION =
--- 2.5/fs/select.c 2002-11-20 00:32:46.000000000 +0100
+++ build-2.5/fs/select.c 2002-11-20 00:33:46.000000000 +0100
@@ -63,6 +63,7 @@
  *   memory allocation is necessary for the wait queues if one of the first
  *   6 file descriptors has new data.
  * - sys_select saves 192 bytes on the stack, enough for 256 file descriptors.
+ * - sys_poll saves 190 byte of user space buffers on the stack.
  * <manfred@colorfullife.com>
  */
 
@@ -381,7 +382,13 @@
         return ret;
 }
 
-#define POLLFD_PER_PAGE ((PAGE_SIZE) / sizeof(struct pollfd))
+struct poll_list {
+        struct poll_list *next;
+        int len;
+        struct pollfd entries[0];
+};
+
+#define POLLFD_PER_PAGE ((PAGE_SIZE-sizeof(struct poll_list)) / sizeof(struct pollfd))
 
 static void do_pollfd(unsigned int num, struct pollfd * fdpage,
         poll_table ** pwait, int *count)
@@ -415,21 +422,23 @@
         }
 }
 
-static int do_poll(unsigned int nfds, unsigned int nchunks, unsigned int nleft,
-        struct pollfd *fds[], struct poll_wqueues *wait, long timeout)
+static int do_poll(unsigned int nfds, struct poll_list *list,
+        struct poll_wqueues *wait, long timeout)
 {
-        int count;
+        int count = 0;
         poll_table* pt = &wait->pt;
 
+        if (!timeout)
+                pt = NULL;
+
         for (;;) {
-                unsigned int i;
-
+                struct poll_list *walk;
                 set_current_state(TASK_INTERRUPTIBLE);
-                count = 0;
-                for (i=0; i < nchunks; i++)
-                        do_pollfd(POLLFD_PER_PAGE, fds[i], &pt, &count);
-                if (nleft)
-                        do_pollfd(nleft, fds[nchunks], &pt, &count);
+                walk = list;
+                while(walk != NULL) {
+                        do_pollfd( walk->len, walk->entries, &pt, &count);
+                        walk = walk->next;
+                }
                 pt = NULL;
                 if (count || !timeout || signal_pending(current))
                         break;
@@ -442,12 +451,17 @@
         return count;
 }
 
+#define INLINE_POLL_COUNT ((190+sizeof(struct pollfd))/sizeof(struct pollfd))
 asmlinkage long sys_poll(struct pollfd * ufds, unsigned int nfds, long timeout)
 {
-        int i, j, fdcount, err;
-        struct pollfd **fds;
-        struct poll_wqueues table, *wait;
-        int nchunks, nleft;
+        struct poll_wqueues table;
+        int fdcount, err;
+        unsigned int i;
+        struct {
+                struct poll_list head;
+                struct pollfd entries[INLINE_POLL_COUNT];
+        } polldata;
+        struct poll_list *pollwalk;
 
         /* Do a sanity check on nfds ... */
         if (nfds > NR_OPEN)
@@ -462,68 +476,60 @@
         }
 
         poll_initwait(&table);
-        wait = &table;
-        if (!timeout)
-                wait = NULL;
+        polldata.head.next = NULL;
+        polldata.head.len = INLINE_POLL_COUNT;
+        if(nfds <= INLINE_POLL_COUNT)
+                polldata.head.len = nfds;
+        pollwalk = &polldata.head;
+        i = nfds;
 
         err = -ENOMEM;
-        fds = NULL;
-        if (nfds != 0) {
-                fds = (struct pollfd **)kmalloc(
-                        (1 + (nfds - 1) / POLLFD_PER_PAGE) * sizeof(struct pollfd *),
-                        GFP_KERNEL);
-                if (fds == NULL)
-                        goto out;
-        }
-
-        nchunks = 0;
-        nleft = nfds;
-        while (nleft > POLLFD_PER_PAGE) { /* allocate complete PAGE_SIZE chunks */
-                fds[nchunks] = (struct pollfd *)__get_free_page(GFP_KERNEL);
-                if (fds[nchunks] == NULL)
+        goto start;
+        while(i!=0) {
+                struct poll_list *pp;
+                pp = kmalloc(sizeof(struct poll_list)+
+                        sizeof(struct pollfd)*
+                        (i>POLLFD_PER_PAGE?POLLFD_PER_PAGE:i),
+                        GFP_KERNEL);
+                if(pp==NULL)
                         goto out_fds;
-                nchunks++;
-                nleft -= POLLFD_PER_PAGE;
-        }
-        if (nleft) { /* allocate last PAGE_SIZE chunk, only nleft elements used */
-                fds[nchunks] = (struct pollfd *)__get_free_page(GFP_KERNEL);
-                if (fds[nchunks] == NULL)
+                pp->next=NULL;
+                pp->len = (i>POLLFD_PER_PAGE?POLLFD_PER_PAGE:i);
+                pollwalk->next = pp;
+                pollwalk = pp;
+start:
+                if (copy_from_user(pollwalk+1, ufds + nfds-i,
+                                sizeof(struct pollfd)*pollwalk->len)) {
+                        err = -EFAULT;
                         goto out_fds;
+                }
+                i -= pollwalk->len;
         }
-
-        err = -EFAULT;
-        for (i=0; i < nchunks; i++)
-                if (copy_from_user(fds[i], ufds + i*POLLFD_PER_PAGE, PAGE_SIZE))
-                        goto out_fds1;
-        if (nleft) {
-                if (copy_from_user(fds[nchunks], ufds + nchunks*POLLFD_PER_PAGE,
-                                        nleft * sizeof(struct pollfd)))
-                        goto out_fds1;
-        }
-
-        fdcount = do_poll(nfds, nchunks, nleft, fds, wait, timeout);
+        fdcount = do_poll(nfds, &polldata.head, &table, timeout);
 
         /* OK, now copy the revents fields back to user space. */
-        for(i=0; i < nchunks; i++)
-                for (j=0; j < POLLFD_PER_PAGE; j++, ufds++)
-                        __put_user((fds[i] + j)->revents, &ufds->revents);
-        if (nleft)
-                for (j=0; j < nleft; j++, ufds++)
-                        __put_user((fds[nchunks] + j)->revents, &ufds->revents);
-
+        pollwalk = &polldata.head;
+        err = -EFAULT;
+        while(pollwalk != NULL) {
+                struct pollfd *fds = pollwalk->entries;
+                int j;
+
+                for (j=0; j < pollwalk->len; j++, ufds++) {
+                        if(__put_user(fds[j].revents, &ufds->revents))
+                                goto out_fds;
+                }
+                pollwalk = pollwalk->next;
+        }
         err = fdcount;
         if (!fdcount && signal_pending(current))
                 err = -EINTR;
-
-out_fds1:
-        if (nleft)
-                free_page((unsigned long)(fds[nchunks]));
 out_fds:
-        for (i=0; i < nchunks; i++)
-                free_page((unsigned long)(fds[i]));
-        if (nfds != 0)
-                kfree(fds);
-out:
+        pollwalk = polldata.head.next;
+        while(pollwalk!=NULL) {
+                struct poll_list *pp = pollwalk->next;
+                kfree(pollwalk);
+                pollwalk = pp;
+        }
         poll_freewait(&table);
         return err;
 }
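The poll_list layout the 3rd patch introduces is a fixed header plus an
open-ended entry array in one allocation, chained chunk by chunk so nfds is
not limited by a single allocation. A userspace sketch of the layout (the
struct shape is from the patch, everything else is invented):

#include <stdio.h>
#include <stdlib.h>

struct pollfd_x { int fd; short events, revents; };

struct poll_list_x {
        struct poll_list_x *next;
        int len;
        struct pollfd_x entries[0];     /* open-ended array, gcc extension */
};

/* one malloc covers the header and len entries */
static struct poll_list_x *chunk_alloc(int len)
{
        struct poll_list_x *pp = malloc(sizeof(*pp) +
                sizeof(pp->entries[0]) * len);

        if (pp) {
                pp->next = NULL;
                pp->len = len;
        }
        return pp;
}

int main(void)
{
        struct poll_list_x *head = chunk_alloc(4), *walk;

        if (!head)
                return 1;
        head->next = chunk_alloc(2);    /* chain a second chunk */
        for (walk = head; walk; walk = walk->next)
                printf("chunk with room for %d pollfds\n", walk->len);
        while (head) {                  /* free the chain like out_fds does */
                struct poll_list_x *pp = head->next;
                free(head);
                head = pp;
        }
        return 0;
}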

// $Header$
// Kernel Version:
//  VERSION = 2
//  PATCHLEVEL = 5
//  SUBLEVEL = 48
//  EXTRAVERSION =
--- 2.5/include/linux/poll.h 2002-11-20 22:58:18.000000000 +0100
+++ build-2.5/include/linux/poll.h 2002-11-20 22:51:15.000000000 +0100
@@ -37,6 +37,8 @@
  */
 struct poll_table_entry {
         struct file *filp;
+        int woken;
+        void *handle;
         wait_queue_t wait;
         wait_queue_head_t *wait_address;
 };
@@ -47,8 +49,10 @@
         poll_table pt;
         int error;
         int nr;
+        void *handle;
         struct poll_table_entry internal[POLL_TABLE_INTERNAL];
         struct poll_table_page *table;
+        struct poll_table_page *last;
 };
 
 extern void poll_initwait(struct poll_wqueues *pwq);
--- 2.5/fs/select.c 2002-11-20 22:58:18.000000000 +0100
+++ build-2.5/fs/select.c 2002-11-20 22:58:01.000000000 +0100
@@ -55,6 +55,7 @@
         pwq->error = 0;
         pwq->nr = 0;
         pwq->table = NULL;
+        pwq->last = NULL;
 }
 
 /*
  * Dynamic memory allocation is expensive, avoid it by
@@ -95,6 +96,15 @@
         }
 }
 
+static int poll_wake_func(wait_queue_t *wait, unsigned mode, int sync)
+{
+        struct poll_table_entry *entry = container_of(wait, struct poll_table_entry, wait);
+        entry->woken = 1;
+        mb();
+        wake_up_process(wait->task);
+        return 0;
+}
+
 static void __pollwait(struct file *filp, wait_queue_head_t *wait_address, poll_table *_p)
 {
         struct poll_wqueues *p = container_of(_p, struct poll_wqueues, pt);
@@ -103,7 +113,7 @@
         if(p->nr < POLL_TABLE_INTERNAL) {
                 entry = p->internal+p->nr++;
         } else {
-                struct poll_table_page *table = p->table;
+                struct poll_table_page *table = p->last;
 
                 if (!table || POLL_TABLE_FULL(table)) {
                         struct poll_table_page *new_table;
@@ -115,8 +125,12 @@
                                 return;
                         }
                         new_table->entry = new_table->entries;
-                        new_table->next = table;
-                        p->table = new_table;
+                        new_table->next = NULL;
+                        if (table)
+                                table->next = new_table;
+                        else
+                                p->table = new_table;
+                        p->last = new_table;
                         table = new_table;
                 }
                 entry = table->entry;
@@ -127,7 +141,10 @@
         get_file(filp);
         entry->filp = filp;
         entry->wait_address = wait_address;
-        init_waitqueue_entry(&entry->wait, current);
+        entry->woken = 0;
+        entry->handle = p->handle;
+        init_waitqueue_func_entry(&entry->wait, poll_wake_func);
+        entry->wait.task = current;
         add_wait_queue(wait_address,&entry->wait);
 }
 
@@ -189,6 +206,51 @@
 #define POLLOUT_SET (POLLWRBAND | POLLWRNORM | POLLOUT | POLLERR)
 #define POLLEX_SET (POLLPRI)
 
+static int parse_mask(fd_set_bits *fds, unsigned long mask, unsigned long off, unsigned long bit)
+{
+        int retval = 0;
+
+        if ((mask & POLLIN_SET) && ISSET(bit, __IN(fds,off))) {
+                SET(bit, __RES_IN(fds,off));
+                retval++;
+        }
+        if ((mask & POLLOUT_SET) && ISSET(bit, __OUT(fds,off))) {
+                SET(bit, __RES_OUT(fds,off));
+                retval++;
+        }
+        if ((mask & POLLEX_SET) && ISSET(bit, __EX(fds,off))) {
+                SET(bit, __RES_EX(fds,off));
+                retval++;
+        }
+        return retval;
+}
+
+static int scan_entries(fd_set_bits *fds, struct poll_table_entry *entries, int size, int *prev)
+{
+        int i;
+        int retval = 0;
+        for (i=0;i<size;i++) {
+                unsigned long mask;
+                int fd;
+                if (!entries[i].woken)
+                        continue;
+                entries[i].woken = 0;
+                mb();
+                /* drivers are permitted to use multiple waitqueues, but we must
+                 * not double-account that in the return value
+                 */
+                fd = (int)entries[i].handle;
+                if (fd == *prev)
+                        continue;
+                *prev = fd;
+                mask = DEFAULT_POLLMASK;
+                if (entries[i].filp->f_op && entries[i].filp->f_op->poll)
+                        mask = entries[i].filp->f_op->poll(entries[i].filp, NULL);
+                retval += parse_mask(fds, mask, fd/__NFDBITS, BIT(fd));
+        }
+        return retval;
+}
+
 int do_select(int n, fd_set_bits *fds, long *timeout)
 {
         struct poll_wqueues table;
@@ -209,41 +271,35 @@
         if (!__timeout)
                 wait = NULL;
         retval = 0;
-        for (;;) {
-                set_current_state(TASK_INTERRUPTIBLE);
-                for (i = 0 ; i < n; i++) {
-                        unsigned long bit = BIT(i);
-                        unsigned long mask;
-                        struct file *file;
-
-                        off = i / __NFDBITS;
-                        if (!(bit & BITS(fds, off)))
-                                continue;
-                        file = fget(i);
-                        mask = POLLNVAL;
-                        if (file) {
-                                mask = DEFAULT_POLLMASK;
-                                if (file->f_op && file->f_op->poll)
-                                        mask = file->f_op->poll(file, wait);
-                                fput(file);
-                        }
-                        if ((mask & POLLIN_SET) && ISSET(bit, __IN(fds,off))) {
-                                SET(bit, __RES_IN(fds,off));
-                                retval++;
-                                wait = NULL;
-                        }
-                        if ((mask & POLLOUT_SET) && ISSET(bit, __OUT(fds,off))) {
-                                SET(bit, __RES_OUT(fds,off));
-                                retval++;
-                                wait = NULL;
-                        }
-                        if ((mask & POLLEX_SET) && ISSET(bit, __EX(fds,off))) {
-                                SET(bit, __RES_EX(fds,off));
-                                retval++;
-                                wait = NULL;
-                        }
+        /* step one: build the wait table */
+        set_current_state(TASK_INTERRUPTIBLE);
+        for (i = 0 ; i < n; i++) {
+                unsigned long bit = BIT(i);
+                unsigned long mask;
+                struct file *file;
+
+                off = i / __NFDBITS;
+                if (!(bit & BITS(fds, off)))
+                        continue;
+                file = fget(i);
+                mask = POLLNVAL;
+                if (file) {
+                        mask = DEFAULT_POLLMASK;
+                        table.handle = (void*)i;
+                        if (file->f_op && file->f_op->poll)
+                                mask = file->f_op->poll(file, wait);
+                        fput(file);
                 }
-                wait = NULL;
+                retval += parse_mask(fds, mask, off, bit);
+                if (retval)
+                        wait = NULL;
+        }
+        wait = NULL;
+        /* step two: now scan through the wait queues, that's faster
+         * than the bit lookup */
+        for (;;) {
+                struct poll_table_page *pg;
+                int prev;
                 if (retval || !__timeout || signal_pending(current))
                         break;
                 if(table.error) {
@@ -251,6 +307,14 @@
                         break;
                 }
                 __timeout = schedule_timeout(__timeout);
+                set_current_state(TASK_INTERRUPTIBLE);
+                prev = -1;
+                retval += scan_entries(fds, table.internal, table.nr, &prev);
+                pg = table.table;
+                while (pg) {
+                        retval += scan_entries(fds, pg->entries, pg->entry-pg->entries, &prev);
+                        pg = pg->next;
+                }
         }
         current->state = TASK_RUNNING;
 

// $Header$
// Kernel Version:
//  VERSION = 2
//  PATCHLEVEL = 5
//  SUBLEVEL = 48
//  EXTRAVERSION =
--- 2.5/fs/select.c 2002-11-20 22:59:34.000000000 +0100
+++ build-2.5/fs/select.c 2002-11-20 22:59:25.000000000 +0100
@@ -225,7 +225,7 @@
         return retval;
 }
 
-static int scan_entries(fd_set_bits *fds, struct poll_table_entry *entries, int size, int *prev)
+static int scan_select_entries(fd_set_bits *fds, struct poll_table_entry *entries, int size, int *prev)
 {
         int i;
         int retval = 0;
@@ -294,7 +294,6 @@
                 if (retval)
                         wait = NULL;
         }
-        wait = NULL;
         /* step two: now scan through the wait queues, that's faster
          * than the bit lookup */
         for (;;) {
@@ -309,10 +308,10 @@
                 __timeout = schedule_timeout(__timeout);
                 set_current_state(TASK_INTERRUPTIBLE);
                 prev = -1;
-                retval += scan_entries(fds, table.internal, table.nr, &prev);
+                retval = scan_select_entries(fds, table.internal, table.nr, &prev);
                 pg = table.table;
                 while (pg) {
-                        retval += scan_entries(fds, pg->entries, pg->entry-pg->entries, &prev);
+                        retval += scan_select_entries(fds, pg->entries, pg->entry-pg->entries, &prev);
                         pg = pg->next;
                 }
         }
@@ -455,7 +454,7 @@
 #define POLLFD_PER_PAGE ((PAGE_SIZE-sizeof(struct poll_list)) / sizeof(struct pollfd))
 
 static void do_pollfd(unsigned int num, struct pollfd * fdpage,
-        poll_table ** pwait, int *count)
+        struct poll_wqueues **pwait, int *count)
 {
         int i;
 
@@ -471,9 +470,11 @@
                 struct file * file = fget(fd);
                 mask = POLLNVAL;
                 if (file != NULL) {
+                        if (*pwait)
+                                (*pwait)->handle = fdp;
                         mask = DEFAULT_POLLMASK;
                         if (file->f_op && file->f_op->poll)
-                                mask = file->f_op->poll(file, *pwait);
+                                mask = file->f_op->poll(file, &(*pwait)->pt);
                         mask &= fdp->events | POLLERR | POLLHUP;
                         fput(file);
                 }
@@ -486,30 +487,74 @@
         }
 }
 
+static int scan_poll_entries(struct poll_table_entry *entries, int size, void **prev)
+{
+        int retval = 0;
+        int i;
+
+        for (i=0;i<size;i++) {
+                unsigned long mask;
+                struct pollfd *fdp;
+
+                if (!entries[i].woken)
+                        continue;
+                entries[i].woken = 0;
+                mb();
+                /* drivers are permitted to use multiple waitqueues, but we must
+                 * not double-account that in the return value
+                 */
+                fdp = entries[i].handle;
+                if (fdp == *prev)
+                        continue;
+                *prev = fdp;
+                mask = DEFAULT_POLLMASK;
+                if (entries[i].filp->f_op && entries[i].filp->f_op->poll)
+                        mask = entries[i].filp->f_op->poll(entries[i].filp, NULL);
+                mask &= fdp->events | POLLERR | POLLHUP;
+                if (mask) {
+                        retval++;
+                        fdp->revents = mask;
+                }
+        }
+        return retval;
+}
+
 static int do_poll(unsigned int nfds, struct poll_list *list,
         struct poll_wqueues *wait, long timeout)
 {
+        struct poll_list *walk;
         int count = 0;
-        poll_table* pt = &wait->pt;
+        struct poll_wqueues *pt = wait;
 
         if (!timeout)
                 pt = NULL;
-
+
+        set_current_state(TASK_INTERRUPTIBLE);
+        walk = list;
+        while(walk != NULL) {
+                do_pollfd( walk->len, walk->entries, &pt, &count);
+                walk = walk->next;
+        }
+        pt = NULL;
         for (;;) {
-                struct poll_list *walk;
-                set_current_state(TASK_INTERRUPTIBLE);
-                walk = list;
-                while(walk != NULL) {
-                        do_pollfd( walk->len, walk->entries, &pt, &count);
-                        walk = walk->next;
-                }
-                pt = NULL;
+                struct poll_table_page *pg;
+                void *prev;
+
                 if (count || !timeout || signal_pending(current))
                         break;
                 count = wait->error;
                 if (count)
                         break;
                 timeout = schedule_timeout(timeout);
+
+                set_current_state(TASK_INTERRUPTIBLE);
+                prev = NULL;
+                count = scan_poll_entries(wait->internal, wait->nr, &prev);
+                pg = wait->table;
+                while (pg) {
+                        count += scan_poll_entries(pg->entries, pg->entry-pg->entries, &prev);
+                        pg = pg->next;
+                }
         }
         current->state = TASK_RUNNING;
         return count;

// $Header$
// Kernel Version:
//  VERSION = 2
//  PATCHLEVEL = 5
//  SUBLEVEL = 48
//  EXTRAVERSION =
--- 2.5/fs/select.c 2002-11-20 23:25:46.000000000 +0100
+++ build-2.5/fs/select.c 2002-11-20 23:21:51.000000000 +0100
@@ -225,13 +225,16 @@
         return retval;
 }
 
-static int scan_select_entries(fd_set_bits *fds, struct poll_table_entry *entries, int size, int *prev)
+static int scan_entries(struct poll_table_entry *entries, int size, void **prev,
+        int (*actor)(void *priv, void *handle, unsigned long mask), void *priv)
 {
-        int i;
         int retval = 0;
+        int i;
+
         for (i=0;i<size;i++) {
                 unsigned long mask;
-                int fd;
+                void *handle;
+
                 if (!entries[i].woken)
                         continue;
                 entries[i].woken = 0;
@@ -239,18 +242,51 @@
                 /* drivers are permitted to use multiple waitqueues, but we must
                  * not double-account that in the return value
                  */
-                fd = (int)entries[i].handle;
-                if (fd == *prev)
+                handle = entries[i].handle;
+                if (handle == *prev)
                         continue;
-                *prev = fd;
+                *prev = handle;
                 mask = DEFAULT_POLLMASK;
                 if (entries[i].filp->f_op && entries[i].filp->f_op->poll)
                         mask = entries[i].filp->f_op->poll(entries[i].filp, NULL);
-                retval += parse_mask(fds, mask, fd/__NFDBITS, BIT(fd));
+                retval += actor(priv, handle, mask);
         }
         return retval;
 }
 
+static int wait_for_events(struct poll_wqueues *wait, long *timeout,
+        int (*actor)(void *priv, void *handle, unsigned long mask), void *priv)
+{
+        int count = 0;
+        for (;;) {
+                struct poll_table_page *pg;
+                void *prev;
+
+                if (count || !(*timeout) || signal_pending(current))
+                        break;
+                count = wait->error;
+                if (count)
+                        break;
+                *timeout = schedule_timeout(*timeout);
+
+                set_current_state(TASK_INTERRUPTIBLE);
+                prev = NULL;
+                count = scan_entries(wait->internal, wait->nr, &prev, actor, priv);
+                pg = wait->table;
+                while (pg) {
+                        count += scan_entries(pg->entries, pg->entry-pg->entries, &prev, actor, priv);
+                        pg = pg->next;
+                }
+        }
+        return count;
+}
+
+static int select_actor(void *priv, void *handle, unsigned long mask)
+{
+        int fd = (int)handle;
+        return parse_mask(priv, mask, fd/__NFDBITS, BIT(fd));
+}
+
 int do_select(int n, fd_set_bits *fds, long *timeout)
 {
         struct poll_wqueues table;
@@ -294,27 +330,8 @@
                 if (retval)
                         wait = NULL;
         }
-        /* step two: now scan through the wait queues, that's faster
-         * than the bit lookup */
-        for (;;) {
-                struct poll_table_page *pg;
-                int prev;
-                if (retval || !__timeout || signal_pending(current))
-                        break;
-                if(table.error) {
-                        retval = table.error;
-                        break;
-                }
-                __timeout = schedule_timeout(__timeout);
-                set_current_state(TASK_INTERRUPTIBLE);
-                prev = -1;
-                retval = scan_select_entries(fds, table.internal, table.nr, &prev);
-                pg = table.table;
-                while (pg) {
-                        retval += scan_select_entries(fds, pg->entries, pg->entry-pg->entries, &prev);
-                        pg = pg->next;
-                }
-        }
+        if (!retval && __timeout)
+                retval = wait_for_events(&table, &__timeout, select_actor, fds);
         current->state = TASK_RUNNING;
 
         poll_freewait(&table);
@@ -487,36 +504,16 @@
         }
 }
 
-static int scan_poll_entries(struct poll_table_entry *entries, int size, void **prev)
+int poll_actor(void *priv, void *handle, unsigned long mask)
 {
-        int retval = 0;
-        int i;
-
-        for (i=0;i<size;i++) {
-                unsigned long mask;
-                struct pollfd *fdp;
+        struct pollfd *fdp = handle;
 
-                if (!entries[i].woken)
-                        continue;
-                entries[i].woken = 0;
-                mb();
-                /* drivers are permitted to use multiple waitqueues, but we must
-                 * not double-account that in the return value
-                 */
-                fdp = entries[i].handle;
-                if (fdp == *prev)
-                        continue;
-                *prev = fdp;
-                mask = DEFAULT_POLLMASK;
-                if (entries[i].filp->f_op && entries[i].filp->f_op->poll)
-                        mask = entries[i].filp->f_op->poll(entries[i].filp, NULL);
-                mask &= fdp->events | POLLERR | POLLHUP;
-                if (mask) {
-                        retval++;
-                        fdp->revents = mask;
-                }
+        mask &= fdp->events | POLLERR | POLLHUP;
+        if (mask) {
+                fdp->revents = mask;
+                return 1;
         }
-        return retval;
+        return 0;
 }
 
 static int do_poll(unsigned int nfds, struct poll_list *list,
@@ -535,27 +532,8 @@
                 do_pollfd( walk->len, walk->entries, &pt, &count);
                 walk = walk->next;
         }
-        pt = NULL;
-        for (;;) {
-                struct poll_table_page *pg;
-                void *prev;
-
-                if (count || !timeout || signal_pending(current))
-                        break;
-                count = wait->error;
-                if (count)
-                        break;
-                timeout = schedule_timeout(timeout);
-
-                set_current_state(TASK_INTERRUPTIBLE);
-                prev = NULL;
-                count = scan_poll_entries(wait->internal, wait->nr, &prev);
-                pg = wait->table;
-                while (pg) {
-                        count += scan_poll_entries(pg->entries, pg->entry-pg->entries, &prev);
-                        pg = pg->next;
-                }
-        }
+        if (!count && timeout)
+                count = wait_for_events(wait, &timeout, poll_actor, NULL);
         current->state = TASK_RUNNING;
         return count;
 }
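What the 6th patch boils down to: one generic scan routine, parameterized
by an "actor" callback, replaces the separate select and poll rescan loops.
A userspace sketch of the pattern (the actor names are from the patch; the
simplified signatures and data are invented):

#include <stdio.h>

typedef int (*actor_t)(void *priv, void *handle, unsigned long mask);

/* generic loop: the per-caller policy lives in the actor */
static int scan(unsigned long *masks, void **handles, int n,
        actor_t actor, void *priv)
{
        int i, count = 0;

        for (i = 0; i < n; i++)
                count += actor(priv, handles[i], masks[i]);
        return count;
}

/* select-style: just count ready descriptors */
static int select_actor(void *priv, void *handle, unsigned long mask)
{
        return mask != 0;
}

/* poll-style: store the mask through the handle, count if set */
static int poll_actor(void *priv, void *handle, unsigned long mask)
{
        *(unsigned long *)handle = mask;
        return mask != 0;
}

int main(void)
{
        unsigned long masks[2] = { 0, 1 };
        unsigned long revents[2];
        void *handles[2] = { &revents[0], &revents[1] };

        printf("select-style: %d ready\n",
                scan(masks, handles, 2, select_actor, NULL));
        printf("poll-style:   %d ready\n",
                scan(masks, handles, 2, poll_actor, NULL));
        return 0;
}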
