[PATCH 1/3] x86, perf: Use a new PMU ack sequence

From: Andi Kleen
Date: Wed Oct 21 2015 - 16:16:40 EST


From: Andi Kleen <ak@xxxxxxxxxxxxxxx>

The SKL PMU code has a problem with LBR freezing. When a counter
overflows while we are already inside the PMI handler, the LBRs are
frozen early and not unfrozen until the next PMI. This means we get
stale LBR information.

Depending on the workload this can happen for a few percent of the
PMIs when sampling cycles in adaptive frequency mode, because the
frequency algorithm regularly drops down to very low periods.

This patch implements a new PMU ack sequence that avoids this problem.
The new sequence is (a condensed sketch of the resulting handler flow
follows the list):

- (counters are already disabled with GLOBAL_CTRL at this point)
There should be no further increments of the counters by later
instructions, and thus no additional PMIs (and no additional freezing).

- Ack the APIC

Unmask the APIC PMI LVT entry so that any later interrupt is delivered
and not lost because the LVT entry is still masked. A lost PMI could
leave the LBRs frozen without the PMI handler ever being entered.

- Ack the PMU counters. This unfreezes the LBRs on Skylake (but not
on earlier CPUs, which rely on DEBUGCTL writes for this).

- Re-enable the counters

The WRMSR will start the counters counting again (and is ordered after
the APIC LVT PMI entry write, since WRMSR is architecturally
serializing). Since the APIC PMI LVT is unmasked, any PMI caused by
these perfmon counters will trigger an NMI (although delivery may be
delayed until after the next IRET).
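
For orientation, here is a condensed sketch of the resulting handler
flow. The helper names are the ones used in the diff below; BTS
handling and the per-bit overflow processing are elided, and the
_sketch suffix on the function name is only for illustration:

static int intel_pmu_handle_irq_sketch(struct pt_regs *regs)
{
        u64 status, orig_status;
        int handled = 0;

        __intel_pmu_disable_all();              /* counters off via GLOBAL_CTRL */

        status = intel_pmu_get_status();        /* snapshot GLOBAL_STATUS */
        orig_status = status;

        /* ... read LBRs and process the overflow bits in 'status' ... */

        apic_write(APIC_LVTPC, APIC_DM_NMI);    /* unmask the APIC PMI LVT */
        intel_pmu_ack_status(orig_status);      /* late ack: unfreezes LBRs on SKL */
        __intel_pmu_enable_all(0, true);        /* restart counting via GLOBAL_CTRL */

        return handled;
}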

One side effect is that the old retry loop is no longer possible, as
the counters stay unacked for the majority of the PMI handler. That is
not a big loss, since "profiling" the PMI handler itself was always a
bit dubious; with the old ack sequence it was still supported.

The new sequence is now used unconditionally on all Intel Core/Atom
CPUs, and the old irq loop check is removed. Instead we rely on the
generic NMI maximum duration check in the NMI code, together with
perf's enforcement of the maximum sampling rate, to stop any runaway
counters. We also assume that any counter can be stopped by setting
a sufficiently large overflow value (sketched below).
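
To illustrate that last point (this is not part of the patch): the
counters count upward and raise a PMI when they wrap, so reprogramming
a counter with the negated period pushes its next overflow far into
the future. This is essentially what x86_perf_event_set_period()
already does; the function name below is hypothetical and cntval_mask
is assumed to be the counter-width mask kept by the core x86 perf code.

static void push_out_next_overflow(int idx, u64 period)
{
        /*
         * Program -period (masked to the counter width) so that another
         * 'period' events must occur before the counter wraps and the
         * next PMI fires.
         */
        u64 val = (u64)(-(s64)period) & x86_pmu.cntval_mask;

        wrmsrl(x86_pmu_event_addr(idx), val);
}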

v2:
Use new ack sequence unconditionally. Remove PMU reset code.

Signed-off-by: Andi Kleen <ak@xxxxxxxxxxxxxxx>
---
arch/x86/kernel/cpu/perf_event.h | 1 -
arch/x86/kernel/cpu/perf_event_intel.c | 82 +++++-----------------------------
2 files changed, 11 insertions(+), 72 deletions(-)

diff --git a/arch/x86/kernel/cpu/perf_event.h b/arch/x86/kernel/cpu/perf_event.h
index 499f533..6388540 100644
--- a/arch/x86/kernel/cpu/perf_event.h
+++ b/arch/x86/kernel/cpu/perf_event.h
@@ -550,7 +550,6 @@ struct x86_pmu {
struct event_constraint *event_constraints;
struct x86_pmu_quirk *quirks;
int perfctr_second_write;
- bool late_ack;
unsigned (*limit_period)(struct perf_event *event, unsigned l);

/*
diff --git a/arch/x86/kernel/cpu/perf_event_intel.c b/arch/x86/kernel/cpu/perf_event_intel.c
index f63360b..68a37b5 100644
--- a/arch/x86/kernel/cpu/perf_event_intel.c
+++ b/arch/x86/kernel/cpu/perf_event_intel.c
@@ -1741,44 +1741,6 @@ int intel_pmu_save_and_restart(struct perf_event *event)
return x86_perf_event_set_period(event);
}

-static void intel_pmu_reset(void)
-{
- struct debug_store *ds = __this_cpu_read(cpu_hw_events.ds);
- unsigned long flags;
- int idx;
-
- if (!x86_pmu.num_counters)
- return;
-
- local_irq_save(flags);
-
- pr_info("clearing PMU state on CPU#%d\n", smp_processor_id());
-
- for (idx = 0; idx < x86_pmu.num_counters; idx++) {
- wrmsrl_safe(x86_pmu_config_addr(idx), 0ull);
- wrmsrl_safe(x86_pmu_event_addr(idx), 0ull);
- }
- for (idx = 0; idx < x86_pmu.num_counters_fixed; idx++)
- wrmsrl_safe(MSR_ARCH_PERFMON_FIXED_CTR0 + idx, 0ull);
-
- if (ds)
- ds->bts_index = ds->bts_buffer_base;
-
- /* Ack all overflows and disable fixed counters */
- if (x86_pmu.version >= 2) {
- intel_pmu_ack_status(intel_pmu_get_status());
- wrmsrl(MSR_CORE_PERF_GLOBAL_CTRL, 0);
- }
-
- /* Reset LBRs and LBR freezing */
- if (x86_pmu.lbr_nr) {
- update_debugctlmsr(get_debugctlmsr() &
- ~(DEBUGCTLMSR_FREEZE_LBRS_ON_PMI|DEBUGCTLMSR_LBR));
- }
-
- local_irq_restore(flags);
-}
-
/*
* This handler is triggered by the local APIC, so the APIC IRQ handling
* rules apply:
@@ -1787,39 +1749,22 @@ static int intel_pmu_handle_irq(struct pt_regs *regs)
{
struct perf_sample_data data;
struct cpu_hw_events *cpuc;
- int bit, loops;
+ int bit;
u64 status;
+ u64 orig_status;
int handled;

cpuc = this_cpu_ptr(&cpu_hw_events);

- /*
- * No known reason to not always do late ACK,
- * but just in case do it opt-in.
- */
- if (!x86_pmu.late_ack)
- apic_write(APIC_LVTPC, APIC_DM_NMI);
__intel_pmu_disable_all();
handled = intel_pmu_drain_bts_buffer();
handled += intel_bts_interrupt();
status = intel_pmu_get_status();
+ orig_status = status;
if (!status)
goto done;

- loops = 0;
-again:
intel_pmu_lbr_read();
- intel_pmu_ack_status(status);
- if (++loops > 100) {
- static bool warned = false;
- if (!warned) {
- WARN(1, "perfevents: irq loop stuck!\n");
- perf_event_print_debug();
- warned = true;
- }
- intel_pmu_reset();
- goto done;
- }

inc_irq_stat(apic_perf_irqs);

@@ -1877,22 +1822,20 @@ again:
x86_pmu_stop(event, 0);
}

- /*
- * Repeat if there is more work to be done:
- */
- status = intel_pmu_get_status();
- if (status)
- goto again;
-
done:
- __intel_pmu_enable_all(0, true);
/*
* Only unmask the NMI after the overflow counters
* have been reset. This avoids spurious NMIs on
* Haswell CPUs.
*/
- if (x86_pmu.late_ack)
- apic_write(APIC_LVTPC, APIC_DM_NMI);
+ apic_write(APIC_LVTPC, APIC_DM_NMI);
+
+ /*
+ * Ack the PMU late. This avoids bogus freezing
+ * on Skylake CPUs.
+ */
+ intel_pmu_ack_status(orig_status);
+ __intel_pmu_enable_all(0, true);
return handled;
}

@@ -3455,7 +3398,6 @@ __init int intel_pmu_init(void)
case 69: /* 22nm Haswell ULT */
case 70: /* 22nm Haswell + GT3e (Intel Iris Pro graphics) */
x86_add_quirk(intel_ht_bug);
- x86_pmu.late_ack = true;
memcpy(hw_cache_event_ids, hsw_hw_cache_event_ids, sizeof(hw_cache_event_ids));
memcpy(hw_cache_extra_regs, hsw_hw_cache_extra_regs, sizeof(hw_cache_extra_regs));

@@ -3480,7 +3422,6 @@ __init int intel_pmu_init(void)
case 86: /* 14nm Broadwell Xeon D */
case 71: /* 14nm Broadwell + GT3e (Intel Iris Pro graphics) */
case 79: /* 14nm Broadwell Server */
- x86_pmu.late_ack = true;
memcpy(hw_cache_event_ids, hsw_hw_cache_event_ids, sizeof(hw_cache_event_ids));
memcpy(hw_cache_extra_regs, hsw_hw_cache_extra_regs, sizeof(hw_cache_extra_regs));

@@ -3513,7 +3454,6 @@ __init int intel_pmu_init(void)

case 78: /* 14nm Skylake Mobile */
case 94: /* 14nm Skylake Desktop */
- x86_pmu.late_ack = true;
memcpy(hw_cache_event_ids, skl_hw_cache_event_ids, sizeof(hw_cache_event_ids));
memcpy(hw_cache_extra_regs, skl_hw_cache_extra_regs, sizeof(hw_cache_extra_regs));
intel_pmu_lbr_init_skl();
--
2.4.3
