[PATCH] poll/wait/md: allow module to safely support 'poll' on /proc files

From: NeilBrown
Date: Tue Mar 11 2014 - 22:36:58 EST



The md driver currently supports 'poll' on /proc/mdstat.
This is unsafe as if the md-mod module is removed while a 'poll'
or 'select' is outstanding on /proc/mdstat, an oops occurs
when the syscall completes.
poll_freewait() will call remove_wait_queue() on a wait_queue_head_t
which was local to the module which no-longer exists.

This problem is particular to /proc. Most filesystems do not
allow the module to be unloaded while any files are open on it.
/proc only blocks module unloading while a file_operations
call is currently active into the module, not while the file is open.
kernfs has this property too but kernfs allocates a wait_queue_head_t
in its internal data structures so the module doesn't need to provide
one.
(A previous patch to add a similar allocation to procfs was not
accepted).

This patch takes a different approach and allows a module to
disconnect the wait_queue_head_t that was passed to poll_wait()
from all the clients which are waiting on it. Thus after calling
proc_remove_entry("mdstat", NULL);
we simply call
wait_queue_purge(&md_event_waiters);

and then know that it is safe to remove the module.

rcu infrastructure is used to avoid races.
poll_freewait() checks if the purge has happened under rcu_read_lock()
to ensure that it never touches any freed memory. wait_queue_purge()
uses synchronize_rcu() to ensure no poll_freewait() could still be
looking at the wait_queue_head_t.

Reported-by: "majianpeng" <majianpeng@xxxxxxxxx>
Signed-off-by: NeilBrown <neilb@xxxxxxx>

diff --git a/drivers/md/md.c b/drivers/md/md.c
index 4ad5cc4e63e8..e28c9d2a1166 100644
--- a/drivers/md/md.c
+++ b/drivers/md/md.c
@@ -8681,6 +8681,7 @@ static __exit void md_exit(void)
unregister_reboot_notifier(&md_notifier);
unregister_sysctl_table(raid_table_header);
remove_proc_entry("mdstat", NULL);
+ wait_queue_purge(&md_event_waiters);
for_each_mddev(mddev, tmp) {
export_array(mddev);
mddev->hold_active = 0;
diff --git a/fs/eventpoll.c b/fs/eventpoll.c
index af903128891c..a095312d01e2 100644
--- a/fs/eventpoll.c
+++ b/fs/eventpoll.c
@@ -521,7 +521,7 @@ static void ep_remove_wait_queue(struct eppoll_entry *pwq)
/* If it is cleared by POLLFREE, it should be rcu-safe */
whead = rcu_dereference(pwq->whead);
if (whead)
- remove_wait_queue(whead, &pwq->wait);
+ remove_wait_queue_purgeable(whead, &pwq->wait);
rcu_read_unlock();
}

diff --git a/fs/select.c b/fs/select.c
index 467bb1cb3ea5..7c35bcdbd94c 100644
--- a/fs/select.c
+++ b/fs/select.c
@@ -129,7 +129,7 @@ EXPORT_SYMBOL(poll_initwait);

static void free_poll_entry(struct poll_table_entry *entry)
{
- remove_wait_queue(entry->wait_address, &entry->wait);
+ remove_wait_queue_purgeable(entry->wait_address, &entry->wait);
fput(entry->filp);
}

diff --git a/include/linux/wait.h b/include/linux/wait.h
index 559044c79232..18d0d2fbf3bd 100644
--- a/include/linux/wait.h
+++ b/include/linux/wait.h
@@ -106,6 +106,8 @@ static inline int waitqueue_active(wait_queue_head_t *q)
extern void add_wait_queue(wait_queue_head_t *q, wait_queue_t *wait);
extern void add_wait_queue_exclusive(wait_queue_head_t *q, wait_queue_t *wait);
extern void remove_wait_queue(wait_queue_head_t *q, wait_queue_t *wait);
+extern void wait_queue_purge(wait_queue_head_t *q);
+extern void remove_wait_queue_purgeable(wait_queue_head_t *q, wait_queue_t *wait);

static inline void __add_wait_queue(wait_queue_head_t *head, wait_queue_t *new)
{
diff --git a/kernel/sched/wait.c b/kernel/sched/wait.c
index 7d50f794e248..12548730c6ed 100644
--- a/kernel/sched/wait.c
+++ b/kernel/sched/wait.c
@@ -52,6 +52,51 @@ void remove_wait_queue(wait_queue_head_t *q, wait_queue_t *wait)
EXPORT_SYMBOL(remove_wait_queue);


+/**
+ * wait_queue_purge - remove all waiter from a wait_queue
+ * @q: The queue to be purged
+ *
+ * Unlink all pending waiters from the queue.
+ * This can be used prior to freeing a queue providing all waiters are
+ * prepared for queue purging.
+ * Waiters must call remove_wait_queue_puregeable() rather than
+ * remove_wait_queue().
+ *
+ */
+void wait_queue_purge(wait_queue_head_t *q)
+{
+ spin_lock(&q->lock);
+ while (!list_empty(&q->task_list))
+ list_del_init(q->task_list.next);
+ spin_unlock(&q->lock);
+ synchronize_rcu();
+}
+EXPORT_SYMBOL(wait_queue_purge);
+
+/**
+ * remove_wait_queue_puregeable - remove_wait_queue if wait_queue_purge might be used.
+ * @q: the queue, which may already be purged, to remove from
+ * @wait: the waiter to remove
+ *
+ * Remove a waiter from a queue if it hasn't already been purged.
+ * If the queue has already been purged then task_list will be empty.
+ * If it isn't then it is still safe to lock the queue and remove
+ * the task.
+ */
+void remove_wait_queue_purgeable(wait_queue_head_t *q, wait_queue_t *wait)
+{
+ unsigned long flags;
+
+ rcu_read_lock();
+ if (!list_empty(&wait->task_list)) {
+ spin_lock_irqsave(&q->lock, flags);
+ list_del_init(&wait->task_list);
+ spin_unlock_irqrestore(&q->lock, flags);
+ }
+ rcu_read_unlock();
+}
+EXPORT_SYMBOL(remove_wait_queue_purgeable);
+
/*
* The core wakeup function. Non-exclusive wakeups (nr_exclusive == 0) just
* wake everything up. If it's an exclusive wakeup (nr_exclusive == small +ve

Attachment: signature.asc
Description: PGP signature