[RFC][PATCH 1/7] perf/x86/intel: Rework the large PEBS setup code

From: Peter Zijlstra
Date: Fri Jul 08 2016 - 10:05:44 EST


In order to allow optimizing perf_pmu_sched_task() we must ensure
perf_sched_cb_{inc,dec} are no longer called from NMI context; this
means that pmu::{start,stop}() can no longer use them.

Prepare for this by reworking the whole large PEBS setup code.

The current code relied on the cpuc->pebs_enabled state, however since
that reflects the current active state as per pmu::{start,stop}() we
can no longer rely on this.

Introduce two counters: cpuc->n_pebs and cpuc->n_large_pebs which
count the total number of PEBS events and the number of PEBS events
that have FREERUNNING set, resp.. With this we can tell if the current
setup requires a single record interrupt threshold or can use a larger
buffer.

This also improves the code in that it re-enables the large threshold
once the PEBS event that required single record gets removed.

Signed-off-by: Peter Zijlstra (Intel) <peterz@xxxxxxxxxxxxx>
---
arch/x86/events/intel/ds.c | 96 +++++++++++++++++++++++++++----------------
arch/x86/events/perf_event.h | 2
kernel/events/core.c | 4 +
3 files changed, 67 insertions(+), 35 deletions(-)

--- a/arch/x86/events/intel/ds.c
+++ b/arch/x86/events/intel/ds.c
@@ -806,9 +806,45 @@ struct event_constraint *intel_pebs_cons
return &emptyconstraint;
}

-static inline bool pebs_is_enabled(struct cpu_hw_events *cpuc)
+/*
+ * We need the sched_task callback even for per-cpu events when we use
+ * the large interrupt threshold, such that we can provide PID and TID
+ * to PEBS samples.
+ */
+static inline bool pebs_needs_sched_cb(struct cpu_hw_events *cpuc)
{
- return (cpuc->pebs_enabled & ((1ULL << MAX_PEBS_EVENTS) - 1));
+ return cpuc->n_pebs && (cpuc->n_pebs == cpuc->n_large_pebs);
+}
+
+static inline void pebs_update_threshold(struct cpu_hw_events *cpuc)
+{
+ struct debug_store *ds = cpuc->ds;
+ u64 threshold;
+
+ if (cpuc->n_pebs == cpuc->n_large_pebs) {
+ threshold = ds->pebs_absolute_maximum -
+ x86_pmu.max_pebs_events * x86_pmu.pebs_record_size;
+ } else {
+ threshold = ds->pebs_buffer_base + x86_pmu.pebs_record_size;
+ }
+
+ ds->pebs_interrupt_threshold = threshold;
+}
+
+static void intel_pmu_pebs_add(struct perf_event *event)
+{
+ struct cpu_hw_events *cpuc = this_cpu_ptr(&cpu_hw_events);
+ struct hw_perf_event *hwc = &event->hw;
+ bool needs_cb = pebs_needs_sched_cb(cpuc);
+
+ cpuc->n_pebs++;
+ if (hwc->flags & PERF_X86_EVENT_FREERUNNING)
+ cpuc->n_large_pebs++;
+
+ if (!needs_cb && pebs_needs_sched_cb(cpuc))
+ perf_sched_cb_inc(event->ctx->pmu);
+
+ pebs_update_threshold(cpuc);
}

void intel_pmu_pebs_enable(struct perf_event *event)
@@ -816,12 +852,11 @@ void intel_pmu_pebs_enable(struct perf_e
struct cpu_hw_events *cpuc = this_cpu_ptr(&cpu_hw_events);
struct hw_perf_event *hwc = &event->hw;
struct debug_store *ds = cpuc->ds;
- bool first_pebs;
- u64 threshold;
+
+ intel_pmu_pebs_add(event);

hwc->config &= ~ARCH_PERFMON_EVENTSEL_INT;

- first_pebs = !pebs_is_enabled(cpuc);
cpuc->pebs_enabled |= 1ULL << hwc->idx;

if (event->hw.flags & PERF_X86_EVENT_PEBS_LDLAT)
@@ -830,46 +865,38 @@ void intel_pmu_pebs_enable(struct perf_e
cpuc->pebs_enabled |= 1ULL << 63;

/*
- * When the event is constrained enough we can use a larger
- * threshold and run the event with less frequent PMI.
+ * Use auto-reload if possible to save a MSR write in the PMI.
+ * This must be done in pmu::start(), because PERF_EVENT_IOC_PERIOD.
*/
- if (hwc->flags & PERF_X86_EVENT_FREERUNNING) {
- threshold = ds->pebs_absolute_maximum -
- x86_pmu.max_pebs_events * x86_pmu.pebs_record_size;
-
- if (first_pebs)
- perf_sched_cb_inc(event->ctx->pmu);
- } else {
- threshold = ds->pebs_buffer_base + x86_pmu.pebs_record_size;
-
- /*
- * If not all events can use larger buffer,
- * roll back to threshold = 1
- */
- if (!first_pebs &&
- (ds->pebs_interrupt_threshold > threshold))
- perf_sched_cb_dec(event->ctx->pmu);
- }
-
- /* Use auto-reload if possible to save a MSR write in the PMI */
if (hwc->flags & PERF_X86_EVENT_AUTO_RELOAD) {
ds->pebs_event_reset[hwc->idx] =
(u64)(-hwc->sample_period) & x86_pmu.cntval_mask;
}
+}

- if (first_pebs || ds->pebs_interrupt_threshold > threshold)
- ds->pebs_interrupt_threshold = threshold;
+static void intel_pmu_pebs_del(struct perf_event *event)
+{
+ struct cpu_hw_events *cpuc = this_cpu_ptr(&cpu_hw_events);
+ struct hw_perf_event *hwc = &event->hw;
+ bool needs_cb = pebs_needs_sched_cb(cpuc);
+
+ cpuc->n_pebs--;
+ if (hwc->flags & PERF_X86_EVENT_FREERUNNING)
+ cpuc->n_large_pebs--;
+
+ if (needs_cb && !pebs_needs_sched_cb(cpuc))
+ perf_sched_cb_dec(event->ctx->pmu);
+
+ if (cpuc->n_pebs)
+ pebs_update_threshold(cpuc);
}

void intel_pmu_pebs_disable(struct perf_event *event)
{
struct cpu_hw_events *cpuc = this_cpu_ptr(&cpu_hw_events);
struct hw_perf_event *hwc = &event->hw;
- struct debug_store *ds = cpuc->ds;
- bool large_pebs = ds->pebs_interrupt_threshold >
- ds->pebs_buffer_base + x86_pmu.pebs_record_size;

- if (large_pebs)
+ if (cpuc->n_pebs == cpuc->n_large_pebs)
intel_pmu_drain_pebs_buffer();

cpuc->pebs_enabled &= ~(1ULL << hwc->idx);
@@ -879,13 +906,12 @@ void intel_pmu_pebs_disable(struct perf_
else if (event->hw.flags & PERF_X86_EVENT_PEBS_ST)
cpuc->pebs_enabled &= ~(1ULL << 63);

- if (large_pebs && !pebs_is_enabled(cpuc))
- perf_sched_cb_dec(event->ctx->pmu);
-
if (cpuc->enabled)
wrmsrl(MSR_IA32_PEBS_ENABLE, cpuc->pebs_enabled);

hwc->config |= ARCH_PERFMON_EVENTSEL_INT;
+
+ intel_pmu_pebs_del(event);
}

void intel_pmu_pebs_enable_all(void)
--- a/arch/x86/events/perf_event.h
+++ b/arch/x86/events/perf_event.h
@@ -194,6 +194,8 @@ struct cpu_hw_events {
*/
struct debug_store *ds;
u64 pebs_enabled;
+ int n_pebs;
+ int n_large_pebs;

/*
* Intel LBR bits
--- a/kernel/events/core.c
+++ b/kernel/events/core.c
@@ -2791,6 +2791,10 @@ void perf_sched_cb_inc(struct pmu *pmu)
/*
* This function provides the context switch callback to the lower code
* layer. It is invoked ONLY when the context switch callback is enabled.
+ *
+ * This callback is relevant even to per-cpu events; for example multi event
+ * PEBS requires this to provide PID/TID information. This requires we flush
+ * all queued PEBS records before we context switch to a new task.
*/
static void perf_pmu_sched_task(struct task_struct *prev,
struct task_struct *next,