[PATCH 4/7] perf: Free aux pages in unmap path

From: Alexander Shishkin
Date: Thu Dec 03 2015 - 05:35:31 EST


Now that we can ensure that no new transactions will start once a ring
buffer's aux area is on its way to being unmapped, and we have the means
to stop the transactions that are already running, we can stop the events
on this ring buffer's event list and then safely free the aux pages and
the corresponding pmu data, since this time the unmap path is guaranteed
to be the last aux reference holder. This partially reverts 57ffc5ca679
("perf: Fix AUX buffer refcounting"), which was made to defer
deallocation that could otherwise happen from an NMI context. That is no
longer the case: the last call to rb_free_aux(), the one that drops the
last AUX reference, has to happen in perf_mmap_close() on that AUX area.

Signed-off-by: Alexander Shishkin <alexander.shishkin@xxxxxxxxxxxxxxx>
---
kernel/events/core.c | 53 ++++++++++++++++++++++++++++++++++++++++++++-
kernel/events/internal.h | 1 -
kernel/events/ring_buffer.c | 37 ++++++++++---------------------
3 files changed, 63 insertions(+), 28 deletions(-)

diff --git a/kernel/events/core.c b/kernel/events/core.c
index 66f835a2df..10fce18710 100644
--- a/kernel/events/core.c
+++ b/kernel/events/core.c
@@ -4630,11 +4630,62 @@ static void perf_mmap_close(struct vm_area_struct *vma)
*/
if (rb_has_aux(rb) && vma->vm_pgoff == rb->aux_pgoff &&
atomic_dec_and_mutex_lock(&rb->aux_mmap_count, &event->mmap_mutex)) {
+ struct perf_event *iter;
+ LIST_HEAD(stop_list);
+ unsigned long flags;
+
+ /*
+ * Stop all aux events that are writing to this here buffer,
+ * so that we can free its aux pages and corresponding pmu
+ * data. Note that after rb::aux_mmap_count dropped to zero,
+ * they won't start any more (see perf_aux_output_begin()).
+ *
+ * Since we can't take ctx::mutex under rb::event_lock, we
+ * need to jump through hoops to get there, namely fish out
+ * all events from rb::event_list onto an on-stack list,
+ * carry out the stopping and splice this on-stack list back
+ * to rb::event_list.
+ * This means that these events will miss wakeups during this
+ * window, but since it's mmap_close, assume the consumer
+ * doesn't care any more.
+ *
+ * Note: list_splice_init_rcu() can't be used here, since it waits
+ * for an RCU sync and rb::event_lock is a spinlock.
+ */
+retry:
+ spin_lock_irqsave(&rb->event_lock, flags);
+ list_for_each_entry_rcu(iter, &rb->event_list, rb_entry) {
+ list_del_rcu(&iter->rb_entry);
+ spin_unlock_irqrestore(&rb->event_lock, flags);
+
+ synchronize_rcu();
+ list_add_tail(&iter->rb_entry, &stop_list);
+
+ goto retry;
+ }
+ spin_unlock_irqrestore(&rb->event_lock, flags);
+
+ mutex_unlock(&event->mmap_mutex);
+
+ list_for_each_entry(iter, &stop_list, rb_entry) {
+ if (!has_aux(iter))
+ continue;
+
+ perf_event_stop(iter);
+ }
+
+ /* and splice it back now that we're done with them */
+ spin_lock_irqsave(&rb->event_lock, flags);
+ list_splice_tail(&stop_list, &rb->event_list);
+ spin_unlock_irqrestore(&rb->event_lock, flags);
+
+ /* now it's safe to free the pages */
atomic_long_sub(rb->aux_nr_pages, &mmap_user->locked_vm);
vma->vm_mm->pinned_vm -= rb->aux_mmap_locked;

+ /* this has to be the last one */
rb_free_aux(rb);
- mutex_unlock(&event->mmap_mutex);
+ WARN_ON_ONCE(atomic_read(&rb->aux_refcount));
}

atomic_dec(&rb->mmap_count);
diff --git a/kernel/events/internal.h b/kernel/events/internal.h
index 2bbad9c127..2b229fdcfc 100644
--- a/kernel/events/internal.h
+++ b/kernel/events/internal.h
@@ -11,7 +11,6 @@
struct ring_buffer {
atomic_t refcount;
struct rcu_head rcu_head;
- struct irq_work irq_work;
#ifdef CONFIG_PERF_USE_VMALLOC
struct work_struct work;
int page_order; /* allocation order */
diff --git a/kernel/events/ring_buffer.c b/kernel/events/ring_buffer.c
index 5709cc222f..6865ac95ca 100644
--- a/kernel/events/ring_buffer.c
+++ b/kernel/events/ring_buffer.c
@@ -221,8 +221,6 @@ void perf_output_end(struct perf_output_handle *handle)
rcu_read_unlock();
}

-static void rb_irq_work(struct irq_work *work);
-
static void
ring_buffer_init(struct ring_buffer *rb, long watermark, int flags)
{
@@ -243,16 +241,6 @@ ring_buffer_init(struct ring_buffer *rb, long watermark, int flags)

INIT_LIST_HEAD(&rb->event_list);
spin_lock_init(&rb->event_lock);
- init_irq_work(&rb->irq_work, rb_irq_work);
-}
-
-static void ring_buffer_put_async(struct ring_buffer *rb)
-{
- if (!atomic_dec_and_test(&rb->refcount))
- return;
-
- rb->rcu_head.next = (void *)rb;
- irq_work_queue(&rb->irq_work);
}

/*
@@ -292,7 +280,7 @@ void *perf_aux_output_begin(struct perf_output_handle *handle,
* the aux buffer is in perf_mmap_close(), about to get free'd.
*/
if (!atomic_read(&rb->aux_mmap_count))
- goto err;
+ goto err_put;

/*
* Nesting is not supported for AUX area, make sure nested
@@ -338,7 +326,7 @@ err_put:
rb_free_aux(rb);

err:
- ring_buffer_put_async(rb);
+ ring_buffer_put(rb);
handle->event = NULL;

return NULL;
@@ -389,7 +377,7 @@ void perf_aux_output_end(struct perf_output_handle *handle, unsigned long size,

local_set(&rb->aux_nest, 0);
rb_free_aux(rb);
- ring_buffer_put_async(rb);
+ ring_buffer_put(rb);
}

/*
@@ -563,6 +551,14 @@ static void __rb_free_aux(struct ring_buffer *rb)
{
int pg;

+ /*
+ * Should never happen, the last reference should be dropped from
+ * perf_mmap_close() path, which first stops aux transactions (which
+ * in turn are the atomic holders of aux_refcount) and then does the
+ * last rb_free_aux().
+ */
+ WARN_ON_ONCE(in_atomic());
+
if (rb->aux_priv) {
rb->free_aux(rb->aux_priv);
rb->free_aux = NULL;
@@ -581,18 +577,7 @@ static void __rb_free_aux(struct ring_buffer *rb)
void rb_free_aux(struct ring_buffer *rb)
{
if (atomic_dec_and_test(&rb->aux_refcount))
- irq_work_queue(&rb->irq_work);
-}
-
-static void rb_irq_work(struct irq_work *work)
-{
- struct ring_buffer *rb = container_of(work, struct ring_buffer, irq_work);
-
- if (!atomic_read(&rb->aux_refcount))
__rb_free_aux(rb);
-
- if (rb->rcu_head.next == (void *)rb)
- call_rcu(&rb->rcu_head, rb_free_rcu);
}

#ifndef CONFIG_PERF_USE_VMALLOC
--
2.6.2

--
To unsubscribe from this list: send the line "unsubscribe linux-kernel" in
the body of a message to majordomo@xxxxxxxxxxxxxxx
More majordomo info at http://vger.kernel.org/majordomo-info.html
Please read the FAQ at http://www.tux.org/lkml/