Re: [PATCH 8/8] drivers/perf: Add Apple icestorm/firestorm CPU PMU driver

From: Alyssa Rosenzweig
Date: Sun Nov 14 2021 - 08:45:57 EST


> +/* Counters */
> +#define SYS_IMP_APL_PMC0_EL1 sys_reg(3, 2, 15, 0, 0)
> +#define SYS_IMP_APL_PMC1_EL1 sys_reg(3, 2, 15, 1, 0)
> +#define SYS_IMP_APL_PMC2_EL1 sys_reg(3, 2, 15, 2, 0)
> +#define SYS_IMP_APL_PMC3_EL1 sys_reg(3, 2, 15, 3, 0)
> +#define SYS_IMP_APL_PMC4_EL1 sys_reg(3, 2, 15, 4, 0)
> +#define SYS_IMP_APL_PMC5_EL1 sys_reg(3, 2, 15, 5, 0)
> +#define SYS_IMP_APL_PMC6_EL1 sys_reg(3, 2, 15, 6, 0)
> +#define SYS_IMP_APL_PMC7_EL1 sys_reg(3, 2, 15, 7, 0)
> +#define SYS_IMP_APL_PMC8_EL1 sys_reg(3, 2, 15, 9, 0)
> +#define SYS_IMP_APL_PMC9_EL1 sys_reg(3, 2, 15, 10, 0)

Do we know what the gap is? sys_reg(3, 2, 15, 8, 0) is skipped between
PMC7 and PMC8.

> +/*
> + * Description of the events we actually know about, as well as those with
> + * a specific counter affinity. Yes, this is a grand total of two known
> + * counters, and the rest is anybody's guess.
> + *
> + * Not all counters can count all events. Counters #0 and #1 are wired to
> + * count cycles and instructions respectively, and some events have
> + * bizarre mappings (every other counter, or even *one* counter). These
> + * restrictins equally apply to both P and E cores.

s/restrictins/restrictions/

> +/* Low level accessors. No synchronisation. */
> +#define PMU_READ_COUNTER(_idx) \
> + case _idx: return read_sysreg_s(SYS_IMP_APL_PMC## _idx ##_EL1)
> +
> +#define PMU_WRITE_COUNTER(_val, _idx) \
> + case _idx: \
> + write_sysreg_s(_val, SYS_IMP_APL_PMC## _idx ##_EL1); \
> + return
> +
> +static u64 m1_pmu_read_hw_counter(unsigned int index)
> +{
> + switch (index) {
> + PMU_READ_COUNTER(0);
> + PMU_READ_COUNTER(1);
> + PMU_READ_COUNTER(2);
> + PMU_READ_COUNTER(3);
> + PMU_READ_COUNTER(4);
> + PMU_READ_COUNTER(5);
> + PMU_READ_COUNTER(6);
> + PMU_READ_COUNTER(7);
> + PMU_READ_COUNTER(8);
> + PMU_READ_COUNTER(9);
> + }
> +
> + BUG();
> +}
> +
> +static void m1_pmu_write_hw_counter(u64 val, unsigned int index)
> +{
> + switch (index) {
> + PMU_WRITE_COUNTER(val, 0);
> + PMU_WRITE_COUNTER(val, 1);
> + PMU_WRITE_COUNTER(val, 2);
> + PMU_WRITE_COUNTER(val, 3);
> + PMU_WRITE_COUNTER(val, 4);
> + PMU_WRITE_COUNTER(val, 5);
> + PMU_WRITE_COUNTER(val, 6);
> + PMU_WRITE_COUNTER(val, 7);
> + PMU_WRITE_COUNTER(val, 8);
> + PMU_WRITE_COUNTER(val, 9);
> + }
> +
> + BUG();
> +}

Probably cleaner to use a single switch and no macros; the registers
become greppable and the code is shorter too. Caveat: I didn't check
whether it compiles.

static inline u64 m1_pmu_hw_counter(unsigned int index)
{
        switch (index) {
        case 0: return SYS_IMP_APL_PMC0_EL1;
        case 1: return SYS_IMP_APL_PMC1_EL1;
        case 2: return SYS_IMP_APL_PMC2_EL1;
        case 3: return SYS_IMP_APL_PMC3_EL1;
        case 4: return SYS_IMP_APL_PMC4_EL1;
        case 5: return SYS_IMP_APL_PMC5_EL1;
        case 6: return SYS_IMP_APL_PMC6_EL1;
        case 7: return SYS_IMP_APL_PMC7_EL1;
        case 8: return SYS_IMP_APL_PMC8_EL1;
        case 9: return SYS_IMP_APL_PMC9_EL1;
        }

        BUG();
}

static u64 m1_pmu_read_hw_counter(unsigned int index)
{
        return read_sysreg_s(m1_pmu_hw_counter(index));
}

static void m1_pmu_write_hw_counter(u64 val, unsigned int index)
{
        write_sysreg_s(val, m1_pmu_hw_counter(index));
}
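
One wrinkle: read_sysreg_s()/write_sysreg_s() stringify their argument
into the mrs_s/msr_s asm, so they most likely need a compile-time
encoding rather than the result of a function call. If so, the switch
can live in the accessor itself and still stay macro-free, e.g.
(equally untested):

static u64 m1_pmu_read_hw_counter(unsigned int index)
{
        switch (index) {
        case 0: return read_sysreg_s(SYS_IMP_APL_PMC0_EL1);
        case 1: return read_sysreg_s(SYS_IMP_APL_PMC1_EL1);
        case 2: return read_sysreg_s(SYS_IMP_APL_PMC2_EL1);
        case 3: return read_sysreg_s(SYS_IMP_APL_PMC3_EL1);
        case 4: return read_sysreg_s(SYS_IMP_APL_PMC4_EL1);
        case 5: return read_sysreg_s(SYS_IMP_APL_PMC5_EL1);
        case 6: return read_sysreg_s(SYS_IMP_APL_PMC6_EL1);
        case 7: return read_sysreg_s(SYS_IMP_APL_PMC7_EL1);
        case 8: return read_sysreg_s(SYS_IMP_APL_PMC8_EL1);
        case 9: return read_sysreg_s(SYS_IMP_APL_PMC9_EL1);
        }

        BUG();
}

with the write side mirroring it.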

> +static void __m1_pmu_enable_counter(unsigned int index, bool en)
> +{
> + u64 val, bit;
> +
> + switch (index) {
> + case 0 ... 7:
> + bit = BIT(get_bit_offset(index, PMCR0_CNT_ENABLE_0_7));
> + break;
> + case 8 ... 9:
> + bit = BIT(get_bit_offset(index - 8, PMCR0_CNT_ENABLE_8_9));
> + break;
> + default:
> + BUG();
> + }
> +
> + val = read_sysreg_s(SYS_IMP_APL_PMCR0_EL1);
> +
> + if (en)
> + val |= bit;
> + else
> + val &= ~bit;
> +
> + write_sysreg_s(val, SYS_IMP_APL_PMCR0_EL1);
> +}
...
> +static void __m1_pmu_enable_counter_interrupt(unsigned int index, bool en)
> +{
> + u64 val, bit;
> +
> + switch (index) {
> + case 0 ... 7:
> + bit = BIT(get_bit_offset(index, PMCR0_PMI_ENABLE_0_7));
> + break;
> + case 8 ... 9:
> + bit = BIT(get_bit_offset(index - 8, PMCR0_PMI_ENABLE_8_9));
> + break;
> + default:
> + BUG();
> + }
> +
> + val = read_sysreg_s(SYS_IMP_APL_PMCR0_EL1);
> +
> + if (en)
> + val |= bit;
> + else
> + val &= ~bit;
> +
> + write_sysreg_s(val, SYS_IMP_APL_PMCR0_EL1);
> +}

These two helper functions have basically the same logic -- maybe worth combining?
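
Untested sketch (the helper name is mine, and it assumes
get_bit_offset() is happy with a runtime mask):

static void __m1_pmu_mod_pmcr0(unsigned int index, bool en,
                               u64 mask_0_7, u64 mask_8_9)
{
        u64 val, bit;

        switch (index) {
        case 0 ... 7:
                bit = BIT(get_bit_offset(index, mask_0_7));
                break;
        case 8 ... 9:
                bit = BIT(get_bit_offset(index - 8, mask_8_9));
                break;
        default:
                BUG();
        }

        val = read_sysreg_s(SYS_IMP_APL_PMCR0_EL1);

        if (en)
                val |= bit;
        else
                val &= ~bit;

        write_sysreg_s(val, SYS_IMP_APL_PMCR0_EL1);
}

static void __m1_pmu_enable_counter(unsigned int index, bool en)
{
        __m1_pmu_mod_pmcr0(index, en, PMCR0_CNT_ENABLE_0_7,
                           PMCR0_CNT_ENABLE_8_9);
}

static void __m1_pmu_enable_counter_interrupt(unsigned int index, bool en)
{
        __m1_pmu_mod_pmcr0(index, en, PMCR0_PMI_ENABLE_0_7,
                           PMCR0_PMI_ENABLE_8_9);
}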

> +static void m1_pmu_configure_counter(unsigned int index, u8 event,
> + bool user, bool kernel)
> +{
...
> + switch (index) {
> + case 0 ... 1:
> + /* 0 and 1 have fixed events */
> + break;
> + case 2 ... 5:
> + shift = (index - 2) * 8;
> + val = read_sysreg_s(SYS_IMP_APL_PMESR0_EL1);
> + val &= ~((u64)0xff << shift);
> + val |= (u64)event << shift;
> + write_sysreg_s(val, SYS_IMP_APL_PMESR0_EL1);
> + break;
> + case 6 ... 9:
> + shift = (index - 6) * 8;
> + val = read_sysreg_s(SYS_IMP_APL_PMESR1_EL1);
> + val &= ~((u64)0xff << shift);
> + val |= (u64)event << shift;
> + write_sysreg_s(val, SYS_IMP_APL_PMESR1_EL1);
> + break;
> + }
> +}

I'd love an explanation of what's happening here.
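
From the shifts it looks like PMESR0/PMESR1 each hold four 8-bit event
selectors, for counters 2-5 and 6-9 respectively. If that's right, a
comment along these lines in the driver would do:

        /*
         * Counters 0 and 1 count fixed events (cycles and
         * instructions) and have no selector. For the others, PMESR0
         * and PMESR1 hold one 8-bit event number per counter:
         * PMESR0[7:0] selects the event for PMC2, PMESR0[15:8] for
         * PMC3, ... up to PMESR1[31:24] for PMC9.
         */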

> + /*
> + * Place the event on the first free counter that can count
> + * this event.
> + *
> + * We could do a better job if we had a view of all the events
> + * counting on the PMU at any given time, and by placing the
> + * most constraint events first.
> + */

s/constraint/constraining/
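
For what it's worth, the first-free placement that comment describes
could boil down to something like this (hypothetical sketch; the
signature and M1_PMU_NR_COUNTERS are made up here, and it assumes each
event carries a bitmap of the counters able to count it):

static int m1_pmu_get_event_idx(struct pmu_hw_events *cpuc,
                                unsigned long affinity)
{
        int idx;

        /*
         * Scan the counters this event may use, lowest index first,
         * and claim the first one that is still free.
         */
        for_each_set_bit(idx, &affinity, M1_PMU_NR_COUNTERS)
                if (!test_and_set_bit(idx, cpuc->used_mask))
                        return idx;

        return -EAGAIN;
}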

> +static int m1_pmu_device_probe(struct platform_device *pdev)
> +{
> + int ret;
> +
> + ret = arm_pmu_device_probe(pdev, m1_pmu_of_device_ids, NULL);
> + if (!ret) {
> + /*
> + * If probe succeeds, taint the kernel as this is all
> + * undocumented, implementation defined black magic.
> + */
> + add_taint(TAINT_CPU_OUT_OF_SPEC, LOCKDEP_STILL_OK);
> + }
> +
> + return ret;
> +}

What are the implications of this taint? You could say that about every
driver we've written for the M1, but...