[RFC PATCH 3/3] epoll: add read()/write()/ioctl() operations

From: Nathaniel Yazdani
Date: Sun Feb 02 2014 - 21:24:14 EST


The eventpoll implementation is largely interface-agnostic, aside from the
userspace structure format and epoll_ctl(). Particularly as each field of the
structure is handled independently, replacing usage of epoll_event internally
was straightforward and clarifies the code somewhat. As for epoll_ctl(), its
functionality was moved into the new ep_eventpoll_write() function, and
epoll_ctl() just hands off its work to it. The ep_eventpoll_read() function is
very similar to epoll_wait(), which remains independent but shares the vast
majority of code for minimal redundancy. Finally, ep_eventpoll_ioctl() is a
simple interface to configure a default timeout for read() operations on the
given eventpoll.

Signed-off-by: Nathaniel Yazdani <n1ght.4nd.d4y@xxxxxxxxx>
---
diff --git a/fs/eventpoll.c b/fs/eventpoll.c
index af90312..7f0ce59 100644
--- a/fs/eventpoll.c
+++ b/fs/eventpoll.c
@@ -168,8 +168,11 @@ struct epitem {
/* wakeup_source used when EPOLLWAKEUP is set */
struct wakeup_source __rcu *ws;

- /* The structure that describe the interested events and the source fd */
- struct epoll_event event;
+ /* Interested events */
+ int events;
+
+ /* The userspace identifier for this entry */
+ long long ident;
};

/*
@@ -216,6 +219,9 @@ struct eventpoll {

struct file *file;

+ /* Default timeout */
+ int timeout;
+
/* used to optimize loop detection check */
int visited;
struct list_head visited_list_link;
@@ -251,6 +257,13 @@ struct ep_send_events_data {
struct epoll_event __user *events;
};

+/* Private data handed to ep_send_epes_proc() through ep_scan_ready_list() */
+struct ep_send_epes_data {
+	int max;			/* capacity of @epes, in entries */
+	struct epoll __user *epes;	/* userspace array the entries go to */
+};
+
/*
* Configuration options available inside /proc/sys/fs/epoll/
*/
@@ -795,9 +808,9 @@ static int ep_eventpoll_release(struct inode *inode, struct file *file)

static inline unsigned int ep_item_poll(struct epitem *epi, poll_table *pt)
{
- pt->_key = epi->event.events;
+ pt->_key = epi->events; /* hand the item's interest mask to the poll call */

- return epi->ffd.file->f_op->poll(epi->ffd.file, pt) & epi->event.events;
+ return epi->ffd.file->f_op->poll(epi->ffd.file, pt) & epi->events; /* keep only bits present in the item's mask */
}

static int ep_read_events_proc(struct eventpoll *ep, struct list_head *head,
@@ -881,8 +894,8 @@ static int ep_show_fdinfo(struct seq_file *m, struct file *f)
struct epitem *epi = rb_entry(rbp, struct epitem, rbn);

ret = seq_printf(m, "tfd: %8d events: %8x data: %16llx\n",
- epi->ffd.fd, epi->event.events,
- (long long)epi->event.data);
+ epi->ffd.fd, epi->events,
+ (long long)epi->ident);
if (ret)
break;
}
@@ -892,6 +905,15 @@ static int ep_show_fdinfo(struct seq_file *m, struct file *f)
}
#endif

+static ssize_t ep_eventpoll_write(struct file *file, const char __user *buf,
+ size_t bufsz, loff_t *pos);
+
+static ssize_t ep_eventpoll_read(struct file *file, char __user *buf,
+ size_t bufsz, loff_t *pos);
+
+static long ep_eventpoll_ioctl(struct file *file, unsigned int cmd,
+ unsigned long arg);
+
/* File callbacks that implement the eventpoll file behaviour */
static const struct file_operations eventpoll_fops = {
#ifdef CONFIG_PROC_FS
@@ -899,6 +921,9 @@ static const struct file_operations eventpoll_fops = {
#endif
.release = ep_eventpoll_release,
.poll = ep_eventpoll_poll,
+ .read = ep_eventpoll_read,
+ .write = ep_eventpoll_write,
+ .unlocked_ioctl = ep_eventpoll_ioctl,
.llseek = noop_llseek,
};

@@ -1025,7 +1050,7 @@ static int ep_poll_callback(wait_queue_t *wait, unsigned mode, int sync, void *k
* EPOLLONESHOT bit that disables the descriptor when an event is received,
* until the next EPOLL_CTL_MOD will be issued.
*/
- if (!(epi->event.events & ~EP_PRIVATE_BITS))
+ if (!(epi->events & ~EP_PRIVATE_BITS))
goto out_unlock;

/*
@@ -1034,7 +1059,7 @@ static int ep_poll_callback(wait_queue_t *wait, unsigned mode, int sync, void *k
* callback. We need to be able to handle both cases here, hence the
* test for "key" != NULL before the event match test.
*/
- if (key && !((unsigned long) key & epi->event.events))
+ if (key && !((unsigned long) key & epi->events))
goto out_unlock;

/*
@@ -1264,7 +1289,7 @@ static noinline void ep_destroy_wakeup_source(struct epitem *epi)
/*
* Must be called with "mtx" held.
*/
-static int ep_insert(struct eventpoll *ep, struct epoll_event *event,
+static int ep_insert(struct eventpoll *ep, long long ident, int events,
struct file *tfile, int fd, int full_check)
{
int error, revents, pwake = 0;
@@ -1285,10 +1310,11 @@ static int ep_insert(struct eventpoll *ep, struct epoll_event *event,
INIT_LIST_HEAD(&epi->pwqlist);
epi->ep = ep;
ep_set_ffd(&epi->ffd, tfile, fd);
- epi->event = *event;
+ epi->ident = ident;
+ epi->events = events;
epi->nwait = 0;
epi->next = EP_UNACTIVE_PTR;
- if (epi->event.events & EPOLLWAKEUP) {
+ if (epi->events & EPOLLWAKEUP) {
error = ep_create_wakeup_source(epi);
if (error)
goto error_create_wakeup_source;
@@ -1338,7 +1364,7 @@ static int ep_insert(struct eventpoll *ep, struct epoll_event *event,
spin_lock_irqsave(&ep->lock, flags);

/* If the file is already "ready" we drop it inside the ready list */
- if ((revents & event->events) && !ep_is_linked(&epi->rdllink)) {
+ if ((revents & events) && !ep_is_linked(&epi->rdllink)) {
list_add_tail(&epi->rdllink, &ep->rdllist);
ep_pm_stay_awake(epi);

@@ -1392,7 +1418,7 @@ error_create_wakeup_source:
* Modify the interest event mask by dropping an event if the new mask
* has a match in the current file status. Must be called with "mtx" held.
*/
-static int ep_modify(struct eventpoll *ep, struct epitem *epi, struct epoll_event *event)
+static int ep_modify(struct eventpoll *ep, struct epitem *epi, long long ident, int events)
{
int pwake = 0;
unsigned int revents;
@@ -1405,9 +1431,9 @@ static int ep_modify(struct eventpoll *ep, struct epitem *epi, struct epoll_even
* otherwise we might miss an event that happens between the
* f_op->poll() call and the new event set registering.
*/
- epi->event.events = event->events; /* need barrier below */
- epi->event.data = event->data; /* protected by mtx */
- if (epi->event.events & EPOLLWAKEUP) {
+ epi->events = events; /* need barrier below */
+ epi->ident = ident; /* protected by mtx */
+ if (epi->events & EPOLLWAKEUP) {
if (!ep_has_wakeup_source(epi))
ep_create_wakeup_source(epi);
} else if (ep_has_wakeup_source(epi)) {
@@ -1444,7 +1470,7 @@ static int ep_modify(struct eventpoll *ep, struct epitem *epi, struct epoll_even
* If the item is "hot" and it is not registered inside the ready
* list, push it inside.
*/
- if (revents & event->events) {
+ if (revents & events) {
spin_lock_irq(&ep->lock);
if (!ep_is_linked(&epi->rdllink)) {
list_add_tail(&epi->rdllink, &ep->rdllist);
@@ -1516,16 +1542,16 @@ static int ep_send_events_proc(struct eventpoll *ep, struct list_head *head,
*/
if (revents) {
if (__put_user(revents, &uevent->events) ||
- __put_user(epi->event.data, &uevent->data)) {
+ __put_user(epi->ident, &uevent->data)) {
list_add(&epi->rdllink, head);
ep_pm_stay_awake(epi);
return eventcnt ? eventcnt : -EFAULT;
}
eventcnt++;
uevent++;
- if (epi->event.events & EPOLLONESHOT)
- epi->event.events &= EP_PRIVATE_BITS;
- else if (!(epi->event.events & EPOLLET)) {
+ if (epi->events & EPOLLONESHOT)
+ epi->events &= EP_PRIVATE_BITS;
+ else if (!(epi->events & EPOLLET)) {
/*
* If this file has been added with Level
* Trigger mode, we need to insert back inside
@@ -1546,17 +1572,103 @@ static int ep_send_events_proc(struct eventpoll *ep, struct list_head *head,
return eventcnt;
}

-static int ep_send_events(struct eventpoll *ep,
- struct epoll_event __user *events, int maxevents)
+static int ep_send_events(struct eventpoll *ep, void __user *buf, size_t bufsz)
{
struct ep_send_events_data esed;

- esed.maxevents = maxevents;
- esed.events = events;
+ esed.maxevents = bufsz / sizeof(struct epoll_event); /* whole entries only; a trailing partial entry is ignored */
+ esed.events = buf; /* used as an array of struct epoll_event */

return ep_scan_ready_list(ep, ep_send_events_proc, &esed, 0, false);
}

+/*
+ * ep_send_epes_proc - ep_scan_ready_list() callback that reports ready items
+ *                     to userspace as 'struct epoll' entries.
+ *
+ * Mostly boilerplate code from ep_send_events_proc(), but much cleaner to put
+ * in a separate function.
+ *
+ * @ep:   The eventpoll whose ready list is being scanned.
+ * @head: The list of ready items handed over by ep_scan_ready_list().
+ * @priv: A 'struct ep_send_epes_data' describing the userspace array.
+ *
+ * Returns: The number of entries stored in the userspace array, or -EFAULT if
+ *          the very first store faulted.
+ */
+static int ep_send_epes_proc(struct eventpoll *ep, struct list_head *head,
+			     void *priv)
+{
+	struct ep_send_epes_data *esed = priv;
+	struct epoll __user *uepe = esed->epes;
+	unsigned int revents;
+	struct epitem *epi;
+	struct wakeup_source *ws;
+	poll_table pt;
+	int sent = 0;
+
+	init_poll_funcptr(&pt, NULL);
+
+	/*
+	 * We can loop without lock because we are passed a task private list.
+	 * Items cannot vanish during the loop because ep_scan_ready_list() is
+	 * holding "mtx" during this call.
+	 *
+	 * Only entries actually stored to userspace are counted: an item that
+	 * turns out to have no pending events must neither consume a slot in
+	 * the user array nor be included in the return value.
+	 */
+	while (!list_empty(head) && sent < esed->max) {
+		epi = list_first_entry(head, struct epitem, rdllink);
+
+		/*
+		 * Activate ep->ws before deactivating epi->ws to prevent
+		 * triggering auto-suspend here (in case we reactivate epi->ws
+		 * below).
+		 *
+		 * This could be rearranged to delay the deactivation of epi->ws
+		 * instead, but then epi->ws would temporarily be out of sync
+		 * with ep_is_linked().
+		 */
+		ws = ep_wakeup_source(epi);
+		if (ws) {
+			if (ws->active)
+				__pm_stay_awake(ep->ws);
+			__pm_relax(ws);
+		}
+
+		list_del_init(&epi->rdllink);
+
+		revents = ep_item_poll(epi, &pt);
+
+		/*
+		 * If the event mask intersects the caller-requested one,
+		 * deliver the event to userspace. Again, ep_scan_ready_list()
+		 * is holding "mtx", so no operations coming from userspace
+		 * can change the item.
+		 */
+		if (!revents)
+			continue;
+
+		if (__put_user(revents, &uepe->ep_events) ||
+		    __put_user(epi->ident, &uepe->ep_ident) ||
+		    __put_user(epi->ffd.fd, &uepe->ep_fildes)) {
+			/* Put the item back so the event is not lost */
+			list_add(&epi->rdllink, head);
+			ep_pm_stay_awake(epi);
+			return sent ? sent : -EFAULT;
+		}
+		sent++;
+		uepe++;
+
+		if (epi->events & EPOLLONESHOT) {
+			epi->events &= EP_PRIVATE_BITS;
+		} else if (!(epi->events & EPOLLET)) {
+			/*
+			 * If this file has been added with Level Trigger mode,
+			 * we need to insert it back inside the ready list, so
+			 * that the next read will check the events
+			 * availability again. At this point, no one can insert
+			 * into ep->rdllist besides us. The writers are locked
+			 * out by ep_scan_ready_list() holding "mtx" and the
+			 * poll callback will queue them in ep->ovflist.
+			 */
+			list_add_tail(&epi->rdllink, &ep->rdllist);
+			ep_pm_stay_awake(epi);
+		}
+	}
+
+	return sent;
+}
+
+/*
+ * Scan the ready list of @ep and store as many 'struct epoll' entries as
+ * wholly fit in the @bufsz bytes at @buf; any trailing partial slot is unused.
+ */
+static int ep_send_epes(struct eventpoll *ep, void __user *buf, size_t bufsz)
+{
+	struct ep_send_epes_data esed;
+
+	esed.max = bufsz / sizeof(struct epoll);
+	esed.epes = buf;
+
+	return ep_scan_ready_list(ep, ep_send_epes_proc, &esed, 0, false);
+}
+
static inline struct timespec ep_set_mstimeout(long ms)
{
struct timespec now, ts = {
@@ -1581,12 +1693,14 @@ static inline struct timespec ep_set_mstimeout(long ms)
* while if the @timeout is less than zero, the function will block
* until at least one event has been retrieved (or an error
* occurred).
+ * @sender: Function to call to send ready events to userspace.
*
* Returns: Returns the number of ready events which have been fetched, or an
* error code, in case of error.
*/
-static int ep_poll(struct eventpoll *ep, struct epoll_event __user *events,
- int maxevents, long timeout)
+static int ep_poll(struct eventpoll *ep, void __user *buffer, size_t length,
+ long timeout, int (*sender)(struct eventpoll *,
+ void __user *, size_t))
{
int res = 0, eavail, timed_out = 0;
unsigned long flags;
@@ -1658,7 +1772,7 @@ check_events:
* more luck.
*/
if (!res && eavail &&
- !(res = ep_send_events(ep, events, maxevents)) && !timed_out)
+ !(res = sender(ep, buffer, length)) && !timed_out)
goto fetch_events;

return res;
@@ -1761,6 +1875,213 @@ static void clear_tfile_check_list(void)
INIT_LIST_HEAD(&tfile_check_list);
}

+/*
+ * ep_write_entry - apply a single 'struct epoll' entry to an eventpoll.
+ *
+ * @file: The epoll file being acted upon.
+ * @ep:   The eventpoll behind @file.
+ * @epe:  Kernel copy of the entry; ->ep_events may be adjusted in place.
+ *
+ * A non-zero ->ep_events adds the descriptor, or modifies it if it is already
+ * registered; a zero ->ep_events removes it (removing a descriptor that is
+ * not registered is a no-op).
+ *
+ * Returns: 0 on success or a negative error code. Every lock and reference
+ *          taken here is dropped again before returning.
+ */
+static int ep_write_entry(struct file *file, struct eventpoll *ep,
+			  struct epoll *epe)
+{
+	struct eventpoll *tep = NULL;
+	struct epitem *epi;
+	struct file *target;
+	bool full_check = false;
+	int error;
+
+	/* fget() reports a bad descriptor with NULL, not an error pointer */
+	target = fget(epe->ep_fildes);
+	if (!target)
+		return -EBADF;
+
+	/* The target file descriptor must support poll */
+	error = -EPERM;
+	if (!target->f_op || !target->f_op->poll)
+		goto out_fput;
+
+	/* Check if EPOLLWAKEUP is allowed */
+	if ((epe->ep_events & EPOLLWAKEUP) && !capable(CAP_BLOCK_SUSPEND))
+		epe->ep_events &= ~EPOLLWAKEUP;
+
+	/* We do not permit adding an epoll file descriptor inside itself. */
+	error = -EINVAL;
+	if (target == file)
+		goto out_fput;
+
+	mutex_lock_nested(&ep->mtx, 0);
+
+	/* Try to lookup the file inside our RB tree */
+	epi = ep_find(ep, target, epe->ep_fildes);
+
+	/*
+	 * When we insert an epoll file descriptor inside another epoll file
+	 * descriptor, there is the chance of creating closed loops, which are
+	 * better handled here than in more critical paths. While we are
+	 * checking for loops we also determine the list of files reachable
+	 * and hang them on the tfile_check_list, so we can check that we
+	 * haven't created too many possible wakeup paths.
+	 *
+	 * We do not need to take the global 'epmutex' to ep_insert() when the
+	 * epoll file descriptor is attaching directly to a wakeup source,
+	 * unless the epoll file descriptor is nested. The purpose of taking
+	 * the 'epmutex' on add is to prevent complex topologies such as loops
+	 * and deep wakeup paths from forming in parallel through multiple
+	 * ep_insert() operations.
+	 */
+	if (epe->ep_events && !epi &&
+	    (!list_empty(&file->f_ep_links) || is_file_epoll(target))) {
+		full_check = true;
+		mutex_unlock(&ep->mtx);
+		mutex_lock(&epmutex);
+		if (is_file_epoll(target)) {
+			error = -ELOOP;
+			if (ep_loop_check(ep, target) != 0)
+				goto out_epmutex;
+		} else {
+			list_add(&target->f_tfile_llink, &tfile_check_list);
+		}
+		mutex_lock_nested(&ep->mtx, 0);
+		if (is_file_epoll(target)) {
+			tep = target->private_data;
+			mutex_lock_nested(&tep->mtx, 1);
+		}
+		/*
+		 * "mtx" was dropped above, so the earlier lookup may be stale:
+		 * repeat it now that the lock is held again.
+		 */
+		epi = ep_find(ep, target, epe->ep_fildes);
+	}
+
+	if (epe->ep_events) {
+		epe->ep_events |= POLLERR | POLLHUP;
+		if (!epi)
+			/* add this epoll entry */
+			error = ep_insert(ep, epe->ep_ident, epe->ep_events,
+					  target, epe->ep_fildes, full_check);
+		else
+			/* modify this epoll entry */
+			error = ep_modify(ep, epi, epe->ep_ident,
+					  epe->ep_events);
+	} else if (epi) {
+		/* delete this epoll entry */
+		if (is_file_epoll(target)) {
+			tep = target->private_data;
+			mutex_lock_nested(&tep->mtx, 1);
+		}
+		error = ep_remove(ep, epi);
+	} else {
+		/* nothing registered and nothing requested */
+		error = 0;
+	}
+
+	if (tep)
+		mutex_unlock(&tep->mtx);
+	mutex_unlock(&ep->mtx);
+out_epmutex:
+	if (full_check) {
+		/* tfile_check_list is only populated while epmutex is held */
+		clear_tfile_check_list();
+		mutex_unlock(&epmutex);
+	}
+out_fput:
+	fput(target);
+	return error;
+}
+
+/**
+ * ep_eventpoll_write - Create, remove, or modify events to poll for.
+ *
+ * The epoll file distinguishes between events by file descriptor, but it also
+ * stores a user-defined identifier along with each of them. To add or modify
+ * an event, set ->ep_fildes to the target file descriptor and set ->ep_ident
+ * and ->ep_events to whatever values you wish them to have. To remove an
+ * event, set ->ep_fildes to the relevant file descriptor and clear
+ * ->ep_events.
+ *
+ * @file: The epoll file being acted upon.
+ * @buf: Array of 'struct epoll' entries, to be inserted, modified, or removed
+ *       from the epoll file depending on their contents.
+ * @bufsz: Number of 'struct epoll' entries in buffer times the size of the
+ *         structure. Any excess bytes are ignored.
+ * @pos: Ignored, epoll files behave like character devices.
+ *
+ * Returns: The number of bytes from the userspace buffer successfully
+ *          processed, always a multiple of sizeof(struct epoll). Processing
+ *          stops at the first entry that fails; if that is the very first
+ *          entry, its negative error code is returned instead.
+ */
+static ssize_t ep_eventpoll_write(struct file *file, const char __user *buf,
+				  size_t bufsz, loff_t *pos)
+{
+	struct eventpoll *ep = file->private_data;
+	const struct epoll __user *epes = (const struct epoll __user *)buf;
+	struct epoll epe;
+	size_t num = bufsz / sizeof(struct epoll);	/* Ignore any excess */
+	size_t i;
+	int error = 0;
+
+	if (!access_ok(VERIFY_READ, buf, bufsz))
+		return -EFAULT;
+
+	for (i = 0; i < num; ++i) {
+		if (copy_from_user(&epe, &epes[i], sizeof(epe))) {
+			error = -EFAULT;
+			break;
+		}
+
+		error = ep_write_entry(file, ep, &epe);
+		if (error)
+			break;
+	}
+
+	/* Report partial progress if there was any, the error otherwise */
+	return i ? (ssize_t)(i * sizeof(struct epoll)) : error;
+}
+
+/**
+ * ep_eventpoll_read - Read triggered events from an epoll file, delivered to
+ *                     userspace in 'struct epoll' packets.
+ *
+ * At most as many events as wholly fit within the buffer are returned. The
+ * wait is bounded by the eventpoll's configured ->timeout.
+ *
+ * @file: The epoll file to retrieve events from.
+ * @buf: Preallocated buffer into which the kernel will store epoll entries.
+ * @bufsz: Size of buffer, which ought to be a multiple of the epoll entry
+ *         structure size. If not, the kernel will store as many structs as
+ *         will wholly fit within the provided buffer. A buffer too small for
+ *         one entry, or large enough for more than EP_MAX_EVENTS entries, is
+ *         rejected.
+ * @pos: Ignored, epoll behaves like a character device.
+ *
+ * Returns: The number of triggered epoll entries multiplied by the size of the
+ *          epoll entry structure, -EINVAL for an unreasonable @bufsz, -EFAULT
+ *          for an inaccessible buffer, or the error reported by ep_poll().
+ */
+static ssize_t ep_eventpoll_read(struct file *file, char __user *buf,
+				 size_t bufsz, loff_t *pos)
+{
+	struct eventpoll *ep = file->private_data;
+	size_t max = bufsz / sizeof(struct epoll);
+	int res;
+
+	/* The event buffer must be of a reasonable size */
+	if (max == 0 || max > EP_MAX_EVENTS)
+		return -EINVAL;
+
+	/* Verify that the area passed by the user is writeable */
+	if (!access_ok(VERIFY_WRITE, buf, bufsz))
+		return -EFAULT;
+
+	/* Time to fish for events ... */
+	res = ep_poll(ep, buf, bufsz, ep->timeout, ep_send_epes);
+
+	return res < 0 ? res : (ssize_t)res * sizeof(struct epoll);
+}
+
+/*
+ * ep_eventpoll_ioctl - configure an eventpoll's behavior.
+ *
+ * @file: The epoll file being configured.
+ * @cmd: An EPIOC_* control command.
+ * @arg: A userspace pointer whose type depends on @cmd; both commands handled
+ *       here treat it as a pointer to int.
+ *
+ * Returns: 0 on success, -EFAULT if @arg cannot be accessed, or -ENOTTY for
+ *          a command this file does not implement.
+ */
+static long ep_eventpoll_ioctl(struct file *file, unsigned int cmd,
+			       unsigned long arg)
+{
+	struct eventpoll *ep = file->private_data;
+	int timeout;
+
+	switch (cmd) {
+	case EPIOC_GETTIMEOUT:
+		return put_user(ep->timeout, (int __user *)arg);
+	case EPIOC_SETTIMEOUT:
+		/*
+		 * Fetch into a local first: get_user() zeroes its destination
+		 * on a fault, which must not overwrite the stored timeout.
+		 */
+		if (get_user(timeout, (int __user *)arg))
+			return -EFAULT;
+		ep->timeout = timeout;
+		return 0;
+	default:
+		return -ENOTTY;
+	}
+}
+
/*
* Open an eventpoll file descriptor.
*/
@@ -1775,6 +2096,8 @@ SYSCALL_DEFINE1(epoll_create1, int, flags)

if (flags & ~EPOLL_CLOEXEC)
return -EINVAL;
+ flags |= O_RDWR;
+
/*
* Create the internal data structure ("struct eventpoll").
*/
@@ -1785,19 +2108,19 @@ SYSCALL_DEFINE1(epoll_create1, int, flags)
* Creates all the items needed to setup an eventpoll file. That is,
* a file structure and a free file descriptor.
*/
- fd = get_unused_fd_flags(O_RDWR | (flags & O_CLOEXEC));
+ fd = get_unused_fd_flags(flags);
if (fd < 0) {
error = fd;
goto out_free_ep;
}
- file = anon_inode_getfile("[eventpoll]", &eventpoll_fops, ep,
- O_RDWR | (flags & O_CLOEXEC));
+ file = anon_inode_getfile("[eventpoll]", &eventpoll_fops, ep, flags);
if (IS_ERR(file)) {
error = PTR_ERR(file);
goto out_free_fd;
}
ep->file = file;
fd_install(fd, file);
+ ep->timeout = -1; /* infinite (i.e., no) timeout by default */
return fd;

out_free_fd:
@@ -1823,137 +2146,27 @@ SYSCALL_DEFINE1(epoll_create, int, size)
SYSCALL_DEFINE4(epoll_ctl, int, epfd, int, op, int, fd,
struct epoll_event __user *, event)
{
- int error;
- int full_check = 0;
- struct fd f, tf;
- struct eventpoll *ep;
- struct epitem *epi;
- struct epoll_event epds;
- struct eventpoll *tep = NULL;
-
- error = -EFAULT;
- if (ep_op_has_event(op) &&
- copy_from_user(&epds, event, sizeof(struct epoll_event)))
- goto error_return;
-
- error = -EBADF;
- f = fdget(epfd);
- if (!f.file)
- goto error_return;
-
- /* Get the "struct file *" for the target file */
- tf = fdget(fd);
- if (!tf.file)
- goto error_fput;
-
- /* The target file descriptor must support poll */
- error = -EPERM;
- if (!tf.file->f_op->poll)
- goto error_tgt_fput;
-
- /* Check if EPOLLWAKEUP is allowed */
- ep_take_care_of_epollwakeup(&epds);
-
- /*
- * We have to check that the file structure underneath the file descriptor
- * the user passed to us _is_ an eventpoll file. And also we do not permit
- * adding an epoll file descriptor inside itself.
- */
- error = -EINVAL;
- if (f.file == tf.file || !is_file_epoll(f.file))
- goto error_tgt_fput;
-
- /*
- * At this point it is safe to assume that the "private_data" contains
- * our own data structure.
- */
- ep = f.file->private_data;
-
- /*
- * When we insert an epoll file descriptor, inside another epoll file
- * descriptor, there is the change of creating closed loops, which are
- * better be handled here, than in more critical paths. While we are
- * checking for loops we also determine the list of files reachable
- * and hang them on the tfile_check_list, so we can check that we
- * haven't created too many possible wakeup paths.
- *
- * We do not need to take the global 'epumutex' on EPOLL_CTL_ADD when
- * the epoll file descriptor is attaching directly to a wakeup source,
- * unless the epoll file descriptor is nested. The purpose of taking the
- * 'epmutex' on add is to prevent complex toplogies such as loops and
- * deep wakeup paths from forming in parallel through multiple
- * EPOLL_CTL_ADD operations.
- */
- mutex_lock_nested(&ep->mtx, 0);
- if (op == EPOLL_CTL_ADD) {
- if (!list_empty(&f.file->f_ep_links) ||
- is_file_epoll(tf.file)) {
- full_check = 1;
- mutex_unlock(&ep->mtx);
- mutex_lock(&epmutex);
- if (is_file_epoll(tf.file)) {
- error = -ELOOP;
- if (ep_loop_check(ep, tf.file) != 0) {
- clear_tfile_check_list();
- goto error_tgt_fput;
- }
- } else
- list_add(&tf.file->f_tfile_llink,
- &tfile_check_list);
- mutex_lock_nested(&ep->mtx, 0);
- if (is_file_epoll(tf.file)) {
- tep = tf.file->private_data;
- mutex_lock_nested(&tep->mtx, 1);
- }
- }
- }
+ struct epoll epe = { .ep_fildes = fd };
+ struct file *file = fget(epfd);
+ int err;

- /*
- * Try to lookup the file inside our RB tree, Since we grabbed "mtx"
- * above, we can be sure to be able to use the item looked up by
- * ep_find() till we release the mutex.
- */
- epi = ep_find(ep, tf.file, fd);
+ /* A bad epfd yields no file: return before the common fput() below */
+ if (!file)
+ return -EBADF;
+
+ /* The descriptor must refer to an eventpoll file */
+ err = -EBADF;
+ if (!is_file_epoll(file))
+ goto out;

- error = -EINVAL;
- switch (op) {
- case EPOLL_CTL_ADD:
- if (!epi) {
- epds.events |= POLLERR | POLLHUP;
- error = ep_insert(ep, &epds, tf.file, fd, full_check);
- } else
- error = -EEXIST;
- if (full_check)
- clear_tfile_check_list();
- break;
- case EPOLL_CTL_DEL:
- if (epi)
- error = ep_remove(ep, epi);
- else
- error = -ENOENT;
- break;
- case EPOLL_CTL_MOD:
- if (epi) {
- epds.events |= POLLERR | POLLHUP;
- error = ep_modify(ep, epi, &epds);
- } else
- error = -ENOENT;
- break;
- }
- if (tep != NULL)
- mutex_unlock(&tep->mtx);
- mutex_unlock(&ep->mtx);
-
-error_tgt_fput:
- if (full_check)
- mutex_unlock(&epmutex);
-
- fdput(tf);
-error_fput:
- fdput(f);
-error_return:
-
- return error;
+ /*
+ * Only operations that carry an event read it from userspace; for the
+ * others ->ep_events stays zero from the initializer above, which the
+ * write handler treats as a removal.
+ */
+ err = -EFAULT;
+ if (ep_op_has_event(op) &&
+ (get_user(epe.ep_events, (int *)&event->events) ||
+ get_user(epe.ep_ident, (long long *)&event->data)))
+ goto out;
+
+ /*
+ * NOTE(review): &epe is a kernel stack address, yet ep_eventpoll_write()
+ * checks its buffer with access_ok() and reads it with copy_from_user();
+ * confirm this path can succeed, or share a helper that takes a kernel
+ * copy of the entry.
+ * NOTE(review): @op is only consulted by ep_op_has_event(), so the
+ * previous -EEXIST/-ENOENT distinctions between ADD, MOD and DEL are no
+ * longer reported.
+ */
+ err = ep_eventpoll_write(file, (const char *)&epe,
+ sizeof(struct epoll), NULL);
+ /* Zero bytes processed means the single entry was rejected */
+ if (!err)
+ err = -EBADF;
+out:
+ fput(file);
+ return err < 0 ? err : 0;
}

/*
@@ -1995,7 +2208,8 @@ SYSCALL_DEFINE4(epoll_wait, int, epfd, struct epoll_event __user *, events,
ep = f.file->private_data;

/* Time to fish for events ... */
- error = ep_poll(ep, events, maxevents, timeout);
+ error = ep_poll(ep, events, maxevents * sizeof(struct epoll_event),
+ timeout, ep_send_events);

error_fput:
fdput(f);
--
To unsubscribe from this list: send the line "unsubscribe linux-kernel" in
the body of a message to majordomo@xxxxxxxxxxxxxxx
More majordomo info at http://vger.kernel.org/majordomo-info.html
Please read the FAQ at http://www.tux.org/lkml/