[KVM PATCH v3 3/3] KVM: Fix races in irqfd using new eventfd_kref_getinterface

From: Gregory Haskins
Date: Mon Jun 22 2009 - 12:06:58 EST


This patch fixes all known races in irqfd, and paves the way to restore
DEASSIGN support. For details of the eventfd races, please see the patch
presumably commited immediately prior to this one.

In a nutshell, we use eventfd_kref_get/put() to properly manage the
lifetime of the underlying eventfd. We also use careful coordination
with our workqueue to ensure that all irqfd objects have terminated
before we allow kvm to shutdown. The logic used for shutdown walks
all open irqfds and releases them. This logic can be generalized in
the future to allow a subset of irqfds to be released, thus allowing
DEASSIGN support.

Signed-off-by: Gregory Haskins <ghaskins@xxxxxxxxxx>
---

virt/kvm/eventfd.c | 144 ++++++++++++++++++++++++++++++++++++++++------------
1 files changed, 110 insertions(+), 34 deletions(-)

diff --git a/virt/kvm/eventfd.c b/virt/kvm/eventfd.c
index 9656027..67985cd 100644
--- a/virt/kvm/eventfd.c
+++ b/virt/kvm/eventfd.c
@@ -28,6 +28,7 @@
#include <linux/file.h>
#include <linux/list.h>
#include <linux/eventfd.h>
+#include <linux/kref.h>

/*
* --------------------------------------------------------------------
@@ -36,26 +37,68 @@
* Credit goes to Avi Kivity for the original idea.
* --------------------------------------------------------------------
*/
+
+enum {
+ irqfd_flags_shutdown,
+};
+
struct _irqfd {
struct kvm *kvm;
+ struct kref *eventfd;
int gsi;
struct list_head list;
poll_table pt;
wait_queue_head_t *wqh;
wait_queue_t wait;
- struct work_struct inject;
+ struct work_struct work;
+ unsigned long flags;
};

static void
-irqfd_inject(struct work_struct *work)
+irqfd_release(struct _irqfd *irqfd)
+{
+ eventfd_kref_put(irqfd->eventfd);
+ kfree(irqfd);
+}
+
+static void
+irqfd_work(struct work_struct *work)
{
- struct _irqfd *irqfd = container_of(work, struct _irqfd, inject);
+ struct _irqfd *irqfd = container_of(work, struct _irqfd, work);
struct kvm *kvm = irqfd->kvm;

- mutex_lock(&kvm->irq_lock);
- kvm_set_irq(kvm, KVM_USERSPACE_IRQ_SOURCE_ID, irqfd->gsi, 1);
- kvm_set_irq(kvm, KVM_USERSPACE_IRQ_SOURCE_ID, irqfd->gsi, 0);
- mutex_unlock(&kvm->irq_lock);
+ if (!test_bit(irqfd_flags_shutdown, &irqfd->flags)) {
+ /* Inject an interrupt */
+ mutex_lock(&kvm->irq_lock);
+ kvm_set_irq(kvm, KVM_USERSPACE_IRQ_SOURCE_ID, irqfd->gsi, 1);
+ kvm_set_irq(kvm, KVM_USERSPACE_IRQ_SOURCE_ID, irqfd->gsi, 0);
+ mutex_unlock(&kvm->irq_lock);
+ } else {
+ /* shutdown the irqfd */
+ struct _irqfd *_irqfd = NULL;
+
+ mutex_lock(&kvm->lock);
+
+ if (!list_empty(&irqfd->list))
+ _irqfd = irqfd;
+
+ if (_irqfd)
+ list_del(&_irqfd->list);
+
+ mutex_unlock(&kvm->lock);
+
+ /*
+ * If the item is not currently on the irqfds list, we know
+ * we are running concurrently with the KVM side trying to
+ * remove this item as well. Since the KVM side should be
+ * holding the reference now, and will block behind a
+ * flush_work(), lets just let them do the release() for us
+ */
+ if (!_irqfd)
+ return;
+
+ irqfd_release(_irqfd);
+ }
}

static int
@@ -65,25 +108,20 @@ irqfd_wakeup(wait_queue_t *wait, unsigned mode, int sync, void *key)
unsigned long flags = (unsigned long)key;

/*
- * Assume we will be called with interrupts disabled
+ * called with interrupts disabled
*/
- if (flags & POLLIN)
- /*
- * Defer the IRQ injection until later since we need to
- * acquire the kvm->lock to do so.
- */
- schedule_work(&irqfd->inject);
-
if (flags & POLLHUP) {
/*
- * for now, just remove ourselves from the list and let
- * the rest dangle. We will fix this up later once
- * the races in eventfd are fixed
+ * ordering is important: shutdown flag must be visible
+ * before we schedule
*/
__remove_wait_queue(irqfd->wqh, &irqfd->wait);
- irqfd->wqh = NULL;
+ set_bit(irqfd_flags_shutdown, &irqfd->flags);
}

+ if (flags & (POLLHUP | POLLIN))
+ schedule_work(&irqfd->work);
+
return 0;
}

@@ -102,6 +140,7 @@ kvm_irqfd(struct kvm *kvm, int fd, int gsi, int flags)
{
struct _irqfd *irqfd;
struct file *file = NULL;
+ struct kref *kref = NULL;
int ret;
unsigned int events;

@@ -112,7 +151,7 @@ kvm_irqfd(struct kvm *kvm, int fd, int gsi, int flags)
irqfd->kvm = kvm;
irqfd->gsi = gsi;
INIT_LIST_HEAD(&irqfd->list);
- INIT_WORK(&irqfd->inject, irqfd_inject);
+ INIT_WORK(&irqfd->work, irqfd_work);

file = eventfd_fget(fd);
if (IS_ERR(file)) {
@@ -133,11 +172,13 @@ kvm_irqfd(struct kvm *kvm, int fd, int gsi, int flags)
list_add_tail(&irqfd->list, &kvm->irqfds);
mutex_unlock(&kvm->lock);

- /*
- * Check if there was an event already queued
- */
- if (events & POLLIN)
- schedule_work(&irqfd->inject);
+ kref = eventfd_kref_get(file);
+ if (IS_ERR(file)) {
+ ret = PTR_ERR(file);
+ goto fail;
+ }
+
+ irqfd->eventfd = kref;

/*
* do not drop the file until the irqfd is fully initialized, otherwise
@@ -145,9 +186,18 @@ kvm_irqfd(struct kvm *kvm, int fd, int gsi, int flags)
*/
fput(file);

+ /*
+ * Check if there was an event already queued
+ */
+ if (events & POLLIN)
+ schedule_work(&irqfd->work);
+
return 0;

fail:
+ if (kref && !IS_ERR(kref))
+ eventfd_kref_put(kref);
+
if (file && !IS_ERR(file))
fput(file);

@@ -161,21 +211,47 @@ kvm_irqfd_init(struct kvm *kvm)
INIT_LIST_HEAD(&kvm->irqfds);
}

+static struct _irqfd *
+irqfd_pop(struct kvm *kvm)
+{
+ struct _irqfd *irqfd = NULL;
+
+ mutex_lock(&kvm->lock);
+
+ if (!list_empty(&kvm->irqfds)) {
+ irqfd = list_first_entry(&kvm->irqfds, struct _irqfd, list);
+ list_del(&irqfd->list);
+ }
+
+ mutex_unlock(&kvm->lock);
+
+ return irqfd;
+}
+
void
kvm_irqfd_release(struct kvm *kvm)
{
- struct _irqfd *irqfd, *tmp;
+ struct _irqfd *irqfd;

- list_for_each_entry_safe(irqfd, tmp, &kvm->irqfds, list) {
- if (irqfd->wqh)
- remove_wait_queue(irqfd->wqh, &irqfd->wait);
+ while ((irqfd = irqfd_pop(kvm))) {

- flush_work(&irqfd->inject);
+ remove_wait_queue(irqfd->wqh, &irqfd->wait);

- mutex_lock(&kvm->lock);
- list_del(&irqfd->list);
- mutex_unlock(&kvm->lock);
+ /*
+ * We guarantee there will be no more notifications after
+ * the remove_wait_queue returns. Now lets make sure we
+ * synchronize behind any outstanding work items before
+ * releasing the resources
+ */
+ flush_work(&irqfd->work);

- kfree(irqfd);
+ irqfd_release(irqfd);
}
+
+ /*
+ * We need to wait in case there are any outstanding work-items
+ * in flight that had already removed themselves from the list
+ * prior to entry to this function
+ */
+ flush_scheduled_work();
}

--
To unsubscribe from this list: send the line "unsubscribe linux-kernel" in
the body of a message to majordomo@xxxxxxxxxxxxxxx
More majordomo info at http://vger.kernel.org/majordomo-info.html
Please read the FAQ at http://www.tux.org/lkml/