[PATCH V6 4/6] perf, x86: handle multiple records in PEBS buffer

From: Kan Liang
Date: Thu Apr 09 2015 - 12:39:37 EST


From: Yan, Zheng <zheng.z.yan@xxxxxxxxx>

When the PEBS interrupt threshold is larger than one, the PEBS buffer
may include multiple records for each PEBS event. This patch makes
the code first count how many records each PEBS event has, then
output the samples in a batch.

One corner case that needs mentioning is that the PEBS hardware doesn't
deal well with collisions. The records for the events can be collapsed
into a single one, and it's not possible to reconstruct all events that
caused the PEBS record.
Here are the cases which are called collisions:
- PEBS events happen near to each other, so the hardware merges them.
- PEBS events happen near to each other, but they are not merged.
  The GLOBAL_STATUS bit for the first counter is cleared before the
  event for the next counter is generated. Only the first record can
  be treated as a collision.
- Same as case 2, but the GLOBAL_STATUS bit for the first counter is
  not cleared before the event for the next counter is generated. All
  the records are treated as collisions until a record with only one
  PEBS event bit set is found.

GLOBAL_STATUS bits can be set by both PEBS and non-PEBS events. Multiple
non-PEBS bits being set does not count as a collision.

In practice collisions are extremely rare, as long as different PEBS
events are used. The periods are typically very large, so any collision
is unlikely. When a collision happens, we drop the PEBS record.
The only way to get a lot of collisions is to count the same thing
multiple times, but that is not a useful configuration.

Here are some numbers about collisions.
Four frequently occurring events
(cycles:p,instructions:p,branches:p,mem-stores:p) were tested.

Events sampled together                            Collision rate
cycles:p,instructions:p                            0.25%
cycles:p,instructions:p,branches:p                 0.30%
cycles:p,instructions:p,branches:p,mem-stores:p    0.35%

cycles:p,cycles:p                                  98.52%

Signed-off-by: Yan, Zheng <zheng.z.yan@xxxxxxxxx>
Signed-off-by: Kan Liang <kan.liang@xxxxxxxxx>
---
arch/x86/kernel/cpu/perf_event_intel_ds.c | 157 +++++++++++++++++++++++++-----
include/linux/perf_event.h | 4 +
kernel/events/core.c | 6 +-
3 files changed, 137 insertions(+), 30 deletions(-)

diff --git a/arch/x86/kernel/cpu/perf_event_intel_ds.c b/arch/x86/kernel/cpu/perf_event_intel_ds.c
index 3ce7f59..fafbf97 100644
--- a/arch/x86/kernel/cpu/perf_event_intel_ds.c
+++ b/arch/x86/kernel/cpu/perf_event_intel_ds.c
@@ -888,6 +888,9 @@ static void setup_pebs_sample_data(struct perf_event *event,
int fll, fst, dsrc;
int fl = event->hw.flags;

+ if (pebs == NULL)
+ return;
+
sample_type = event->attr.sample_type;
dsrc = sample_type & PERF_SAMPLE_DATA_SRC;

@@ -982,7 +985,7 @@ static void setup_pebs_sample_data(struct perf_event *event,
data->br_stack = &cpuc->lbr_stack;
}

-static void __intel_pmu_pebs_event(struct perf_event *event,
+static void __intel_pmu_pebs_event_core(struct perf_event *event,
struct pt_regs *iregs, void *__pebs)
{
struct perf_sample_data data;
@@ -997,6 +1000,89 @@ static void __intel_pmu_pebs_event(struct perf_event *event,
x86_pmu_stop(event, 0);
}

+/* Clear all non-PEBS bits */
+static u64
+nonpebs_bit_clear(u64 pebs_status)
+{
+ struct cpu_hw_events *cpuc = this_cpu_ptr(&cpu_hw_events);
+ struct perf_event *event;
+ int bit;
+
+ for_each_set_bit(bit, (unsigned long *)&pebs_status, 64) {
+
+ if (bit >= x86_pmu.max_pebs_events)
+ clear_bit(bit, (unsigned long *)&pebs_status);
+ else {
+ event = cpuc->events[bit];
+ WARN_ON_ONCE(!event);
+
+ if (!event->attr.precise_ip)
+ clear_bit(bit, (unsigned long *)&pebs_status);
+ }
+ }
+
+ return pebs_status;
+}
+
+static inline void *
+get_next_pebs_record_by_bit(void *base, void *top, int bit)
+{
+ void *at;
+ u64 pebs_status;
+
+ if (base == NULL)
+ return NULL;
+
+ for (at = base; at < top; at += x86_pmu.pebs_record_size) {
+ struct pebs_record_nhm *p = at;
+
+ if (p->status & (1 << bit)) {
+
+ if (p->status == (1 << bit))
+ return at;
+
+ /* clear non-PEBS bit and re-check */
+ pebs_status = nonpebs_bit_clear(p->status);
+ if (pebs_status == (1 << bit))
+ return at;
+ }
+ }
+ return NULL;
+}
+
+static void __intel_pmu_pebs_event_nhm(struct perf_event *event,
+ struct pt_regs *iregs,
+ void *base, void *top,
+ int bit, int count)
+{
+ struct perf_sample_data data;
+ struct pt_regs regs;
+ int i;
+ void *at = get_next_pebs_record_by_bit(base, top, bit);
+
+ if (!intel_pmu_save_and_restart(event) &&
+ !(event->hw.flags & PERF_X86_EVENT_AUTO_RELOAD))
+ return;
+
+ if (count > 1) {
+ for (i = 0; i < count - 1; i++) {
+ setup_pebs_sample_data(event, iregs, at, &data, &regs);
+ perf_event_output(event, &data, &regs);
+ at += x86_pmu.pebs_record_size;
+ at = get_next_pebs_record_by_bit(at, top, bit);
+ }
+ }
+
+ setup_pebs_sample_data(event, iregs, at, &data, &regs);
+
+ /* all records are processed, handle event overflow now */
+ if (perf_event_overflow(event, &data, &regs)) {
+ x86_pmu_stop(event, 0);
+ return;
+ }
+
+}
+
static void intel_pmu_drain_pebs_core(struct pt_regs *iregs)
{
struct cpu_hw_events *cpuc = this_cpu_ptr(&cpu_hw_events);
@@ -1035,61 +1121,78 @@ static void intel_pmu_drain_pebs_core(struct pt_regs *iregs)
WARN_ONCE(n > 1, "bad leftover pebs %d\n", n);
at += n - 1;

- __intel_pmu_pebs_event(event, iregs, at);
+ __intel_pmu_pebs_event_core(event, iregs, at);
}

static void intel_pmu_drain_pebs_nhm(struct pt_regs *iregs)
{
struct cpu_hw_events *cpuc = this_cpu_ptr(&cpu_hw_events);
struct debug_store *ds = cpuc->ds;
- struct perf_event *event = NULL;
- void *at, *top;
- u64 status = 0;
+ struct perf_event *event;
+ void *base, *at, *top;
int bit;
+ int counts[MAX_PEBS_EVENTS] = {};

if (!x86_pmu.pebs_active)
return;

- at = (struct pebs_record_nhm *)(unsigned long)ds->pebs_buffer_base;
+ base = (struct pebs_record_nhm *)(unsigned long)ds->pebs_buffer_base;
top = (struct pebs_record_nhm *)(unsigned long)ds->pebs_index;

ds->pebs_index = ds->pebs_buffer_base;

- if (unlikely(at > top))
+ if (unlikely(base >= top))
return;

- /*
- * Should not happen, we program the threshold at 1 and do not
- * set a reset value.
- */
- WARN_ONCE(top - at > x86_pmu.max_pebs_events * x86_pmu.pebs_record_size,
- "Unexpected number of pebs records %ld\n",
- (long)(top - at) / x86_pmu.pebs_record_size);
-
- for (; at < top; at += x86_pmu.pebs_record_size) {
+ for (at = base; at < top; at += x86_pmu.pebs_record_size) {
struct pebs_record_nhm *p = at;

for_each_set_bit(bit, (unsigned long *)&p->status,
x86_pmu.max_pebs_events) {
event = cpuc->events[bit];
- if (!test_bit(bit, cpuc->active_mask))
- continue;
-
WARN_ON_ONCE(!event);

- if (!event->attr.precise_ip)
- continue;
+ if (event->attr.precise_ip)
+ break;
+ }

- if (__test_and_set_bit(bit, (unsigned long *)&status))
- continue;
+ if (bit >= x86_pmu.max_pebs_events)
+ continue;
+ if (!test_bit(bit, cpuc->active_mask))
+ continue;
+ /*
+ * The PEBS hardware does not deal well with the situation
+ * when events happen near to each other and multiple bits
+ * are set. But it should happen rarely.
+ *
+ * If these events include one PEBS and multiple non-PEBS
+ * events, it doesn't impact PEBS record. The record will
+ * be handled normally. (slow path)
+ *
+ * If these events include two or more PEBS events, the
+ * records for the events can be collapsed into a single
+ * one, and it's not possible to reconstruct all events
+ * that caused the PEBS record. It's called collision.
+ * If collision happened, the record will be dropped.
+ *
+ */
+ if (p->status != (1 << bit)) {
+ u64 pebs_status;

- break;
+ /* slow path */
+ pebs_status = nonpebs_bit_clear(p->status);
+ if (pebs_status != (1 << bit))
+ continue;
}
+ counts[bit]++;
+ }

- if (!event || bit >= x86_pmu.max_pebs_events)
+ for (bit = 0; bit < x86_pmu.max_pebs_events; bit++) {
+ if (counts[bit] == 0)
continue;
-
- __intel_pmu_pebs_event(event, iregs, at);
+ event = cpuc->events[bit];
+ __intel_pmu_pebs_event_nhm(event, iregs, base,
+ top, bit, counts[bit]);
}
}

diff --git a/include/linux/perf_event.h b/include/linux/perf_event.h
index 61992cf..750007e 100644
--- a/include/linux/perf_event.h
+++ b/include/linux/perf_event.h
@@ -734,6 +734,10 @@ extern int perf_event_overflow(struct perf_event *event,
struct perf_sample_data *data,
struct pt_regs *regs);

+extern void perf_event_output(struct perf_event *event,
+ struct perf_sample_data *data,
+ struct pt_regs *regs);
+
static inline bool is_sampling_event(struct perf_event *event)
{
return event->attr.sample_period != 0;
diff --git a/kernel/events/core.c b/kernel/events/core.c
index 06917d5..a8d0e92 100644
--- a/kernel/events/core.c
+++ b/kernel/events/core.c
@@ -5360,9 +5360,9 @@ void perf_prepare_sample(struct perf_event_header *header,
}
}

-static void perf_event_output(struct perf_event *event,
- struct perf_sample_data *data,
- struct pt_regs *regs)
+void perf_event_output(struct perf_event *event,
+ struct perf_sample_data *data,
+ struct pt_regs *regs)
{
struct perf_output_handle handle;
struct perf_event_header header;
--
1.7.11.7

--
To unsubscribe from this list: send the line "unsubscribe linux-kernel" in
the body of a message to majordomo@xxxxxxxxxxxxxxx
More majordomo info at http://vger.kernel.org/majordomo-info.html
Please read the FAQ at http://www.tux.org/lkml/