[PATCH RFC DNM] perf: Add support for Qualcomm Last-Level Cache Controller PMU

From: Konrad Dybcio
Date: Wed Aug 09 2023 - 16:10:05 EST


Add support for the Qualcomm LLCC (Last-Level Cache Controller) PMU,
which exposes a single event that counts cache read misses.

Based on the vendor driver found in the msm-5.10 downstream kernel.

Signed-off-by: Konrad Dybcio <konrad.dybcio@xxxxxxxxxx>
---
Hi, I've been trying to get this driver upstream by cleaning it up
and adding the necessary perf boilerplate (the original Qualcomm
driver only pokes at the PMU from within the kernel itself) so that
it can be used with the userspace perf tool.

I cannot, however, get it to cooperate. In this iteration, I get a PMU
event registered (though only under a "raw" name - no "x OR y" aliases
like the other PMUs on the system have) as:

llcc_pmu/read_miss/ [Kernel PMU event]
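
(perf list picks this up from sysfs, i.e. from
/sys/bus/event_source/devices/llcc_pmu/events/read_miss, so the
registration side seems to mostly work.)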

but the .read callback is never called when I run:

sudo perf stat -C 0 -a -e llcc_pmu/read_miss/ stress-ng -C 8 -c 8 -m 10

which always reports a count of 0.

If I add --always-kernel, I get:
<not supported> llcc_pmu/read_miss/

So, here I am asking for some help. It's probably missing some small
detail, as usual.
---
drivers/perf/Kconfig | 8 ++
drivers/perf/Makefile | 1 +
drivers/perf/qcom_llcc_pmu.c | 287 +++++++++++++++++++++++++++++++++++++++++++
3 files changed, 296 insertions(+)

diff --git a/drivers/perf/Kconfig b/drivers/perf/Kconfig
index 273d67ecf6d2..31d848c88d8a 100644
--- a/drivers/perf/Kconfig
+++ b/drivers/perf/Kconfig
@@ -155,6 +155,14 @@ config QCOM_L3_PMU
Adds the L3 cache PMU into the perf events subsystem for
monitoring L3 cache events.

+config QCOM_LLCC_PMU
+ tristate "Qualcomm Technologies LLCC PMU"
+ depends on ARCH_QCOM || COMPILE_TEST
+ depends on OF
+ help
+ Support for the last-level cache performance monitor unit found
+ on some Qualcomm Snapdragon SoCs.
+
config THUNDERX2_PMU
tristate "Cavium ThunderX2 SoC PMU UNCORE"
depends on ARCH_THUNDER2 || COMPILE_TEST
diff --git a/drivers/perf/Makefile b/drivers/perf/Makefile
index 16b3ec4db916..eb02574780b5 100644
--- a/drivers/perf/Makefile
+++ b/drivers/perf/Makefile
@@ -12,6 +12,7 @@ obj-$(CONFIG_FSL_IMX9_DDR_PMU) += fsl_imx9_ddr_perf.o
obj-$(CONFIG_HISI_PMU) += hisilicon/
obj-$(CONFIG_QCOM_L2_PMU) += qcom_l2_pmu.o
obj-$(CONFIG_QCOM_L3_PMU) += qcom_l3_pmu.o
+obj-$(CONFIG_QCOM_LLCC_PMU) += qcom_llcc_pmu.o
obj-$(CONFIG_RISCV_PMU) += riscv_pmu.o
obj-$(CONFIG_RISCV_PMU_LEGACY) += riscv_pmu_legacy.o
obj-$(CONFIG_RISCV_PMU_SBI) += riscv_pmu_sbi.o
diff --git a/drivers/perf/qcom_llcc_pmu.c b/drivers/perf/qcom_llcc_pmu.c
new file mode 100644
index 000000000000..db290ae141a7
--- /dev/null
+++ b/drivers/perf/qcom_llcc_pmu.c
@@ -0,0 +1,287 @@
+// SPDX-License-Identifier: GPL-2.0-only
+/*
+ * Copyright (c) 2017-2020, The Linux Foundation. All rights reserved.
+ * Copyright (c) 2023, Linaro Limited
+ */
+
+#include <linux/bitfield.h>
+#include <linux/io.h>
+#include <linux/module.h>
+#include <linux/of_device.h>
+#include <linux/perf_event.h>
+#include <linux/platform_device.h>
+#include <linux/spinlock.h>
+
+struct llcc_pmu {
+ struct pmu pmu;
+ struct hlist_node node;
+ void __iomem *base;
+ raw_spinlock_t lock;
+ u64 *llcc_stats;
+};
+#define to_llcc_pmu(p) (container_of(p, struct llcc_pmu, pmu))
+
+#define LLCC_READ_MISS_EV 0x1000
+
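+/* Raw counter readings are scaled up by 2^3 = 8, matching the downstream driver */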
+#define CNT_SCALING_FACTOR 0x3
+
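+/* MON_CFG has per-CPU enable bits at 0-15 and clear bits at 16-31, hence the limit */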
+#define MAX_NUM_CPUS 16
+
+#define MON_CFG(m) ((m)->base + 0x200)
+ #define MON_CFG_ENABLE(cpu) BIT(cpu)
+ #define MON_CFG_CLEARn(cpu) BIT(16 + (cpu))
+
+#define MON_CNT(m) ((m)->base + 0x220)
+ #define MON_CNT_VAL GENMASK(23, 0)
+#define MON_CNTn(m, cpu) (MON_CNT(m) + 0x4 * (cpu))
+
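+/* Per-CPU count of active events; the monitor is only toggled on 0 <-> 1 transitions */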
+static DEFINE_PER_CPU(unsigned int, users_alive);
+
+static void mon_enable(struct llcc_pmu *llcc_pmu, int cpu)
+{
+ u32 val;
+
+ val = readl_relaxed(MON_CFG(llcc_pmu));
+ val |= MON_CFG_ENABLE(cpu);
+ writel_relaxed(val, MON_CFG(llcc_pmu));
+}
+
+static void mon_disable(struct llcc_pmu *llcc_pmu, int cpu)
+{
+ u32 val;
+
+ val = readl_relaxed(MON_CFG(llcc_pmu));
+ val &= ~MON_CFG_ENABLE(cpu);
+ writel_relaxed(val, MON_CFG(llcc_pmu));
+}
+
+static void mon_clear(struct llcc_pmu *llcc_pmu, int cpu)
+{
+ u32 val;
+
+ val = readl_relaxed(MON_CFG(llcc_pmu));
+
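+ /* Pulse the clear bit to reset this CPU's counter */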
+ val |= MON_CFG_CLEARn(cpu);
+ writel_relaxed(val, MON_CFG(llcc_pmu));
+
+ val &= ~MON_CFG_CLEARn(cpu);
+ writel_relaxed(val, MON_CFG(llcc_pmu));
+}
+
+static int qcom_llcc_event_init(struct perf_event *event)
+{
+ struct llcc_pmu *llcc_pmu = to_llcc_pmu(event->pmu);
+
+ if (event->attr.type != event->pmu->type)
+ return -ENOENT;
+
+ if (event->attach_state & PERF_ATTACH_TASK)
+ return -EINVAL;
+
+ if (is_sampling_event(event)) {
+ dev_dbg(llcc_pmu->pmu.dev, "Per-task counters are unsupported\n");
+ return -EOPNOTSUPP;
+ }
+
+ if (has_branch_stack(event)) {
+ dev_dbg(llcc_pmu->pmu.dev, "Filtering is unsupported\n");
+ return -EINVAL;
+ }
+
+ if (event->cpu < 0) {
+ dev_warn(llcc_pmu->pmu.dev, "Can't provide per-task data!\n");
+ return -EINVAL;
+ }
+
+ return 0;
+}
+
+static void qcom_llcc_event_read(struct perf_event *event)
+{
+ struct llcc_pmu *llcc_pmu = to_llcc_pmu(event->pmu);
+ unsigned long irq_flags;
+ int cpu = event->cpu;
+ u64 readout;
+
+ raw_spin_lock_irqsave(&llcc_pmu->lock, irq_flags);
+
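+ /* Stop the monitor so we read a stable counter value */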
+ mon_disable(llcc_pmu, cpu);
+
+ readout = FIELD_GET(MON_CNT_VAL, readl_relaxed(MON_CNTn(llcc_pmu, cpu)));
+ readout <<= CNT_SCALING_FACTOR;
+
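+ /* The hardware counter is cleared below, so accumulate into the running total */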
+ llcc_pmu->llcc_stats[cpu] += readout;
+
+ mon_clear(llcc_pmu, cpu);
+ mon_enable(llcc_pmu, cpu);
+
+ if (!(event->hw.state & PERF_HES_STOPPED))
+ local64_set(&event->count, llcc_pmu->llcc_stats[cpu]);
+
+ raw_spin_unlock_irqrestore(&llcc_pmu->lock, irq_flags);
+}
+
+static void qcom_llcc_pmu_start(struct perf_event *event, int flags)
+{
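+ /* The counter itself is enabled from ->add(); this just marks the event as running */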
+ if (flags & PERF_EF_RELOAD)
+ WARN_ON(!(event->hw.state & PERF_HES_UPTODATE));
+
+ event->hw.state = 0;
+}
+
+static void qcom_llcc_event_stop(struct perf_event *event, int flags)
+{
+ qcom_llcc_event_read(event);
+ event->hw.state |= PERF_HES_STOPPED | PERF_HES_UPTODATE;
+}
+
+static int qcom_llcc_event_add(struct perf_event *event, int flags)
+{
+ struct llcc_pmu *llcc_pmu = to_llcc_pmu(event->pmu);
+ unsigned int cpu_users;
+
+ raw_spin_lock(&llcc_pmu->lock);
+
+ cpu_users = per_cpu(users_alive, event->cpu);
+ if (!cpu_users)
+ mon_enable(llcc_pmu, event->cpu);
+
+ cpu_users++;
+ per_cpu(users_alive, event->cpu) = cpu_users;
+
+ raw_spin_unlock(&llcc_pmu->lock);
+
+ event->hw.state = PERF_HES_STOPPED | PERF_HES_UPTODATE;
+
+ if (flags & PERF_EF_START)
+ qcom_llcc_pmu_start(event, PERF_EF_RELOAD);
+
+ return 0;
+}
+
+static void qcom_llcc_event_del(struct perf_event *event, int flags)
+{
+ struct llcc_pmu *llcc_pmu = to_llcc_pmu(event->pmu);
+ unsigned int cpu_users;
+
+ raw_spin_lock(&llcc_pmu->lock);
+
+ cpu_users = per_cpu(users_alive, event->cpu);
+ cpu_users--;
+ if (!cpu_users)
+ mon_disable(llcc_pmu, event->cpu);
+
+ per_cpu(users_alive, event->cpu) = cpu_users;
+
+ raw_spin_unlock(&llcc_pmu->lock);
+}
+
+static ssize_t llcc_pmu_event_show(struct device *dev, struct device_attribute *attr, char *page)
+{
+ struct perf_pmu_events_attr *pmu_attr;
+
+ pmu_attr = container_of(attr, struct perf_pmu_events_attr, attr);
+
+ return sysfs_emit(page, "event=0x%04llx\n", pmu_attr->id);
+}
+
+static struct attribute *qcom_llcc_pmu_events[] = {
+ PMU_EVENT_ATTR_ID(read_miss, llcc_pmu_event_show, LLCC_READ_MISS_EV),
+ NULL,
+};
+
+static const struct attribute_group qcom_llcc_pmu_events_group = {
+ .name = "events",
+ .attrs = qcom_llcc_pmu_events,
+};
+
+PMU_FORMAT_ATTR(event, "config:0-15");
+static struct attribute *qcom_llcc_pmu_format_attrs[] = {
+ &format_attr_event.attr,
+ NULL,
+};
+
+static const struct attribute_group qcom_llcc_pmu_format_group = {
+ .name = "format",
+ .attrs = qcom_llcc_pmu_format_attrs,
+};
+
+static const struct attribute_group *qcom_llcc_pmu_attr_groups[] = {
+ &qcom_llcc_pmu_format_group,
+ &qcom_llcc_pmu_events_group,
+ NULL,
+};
+
+static int qcom_llcc_pmu_probe(struct platform_device *pdev)
+{
+ struct llcc_pmu *llcc_pmu;
+ int ret;
+
+ if (num_possible_cpus() > MAX_NUM_CPUS)
+ return dev_err_probe(&pdev->dev, -EINVAL,
+ "LLCC PMU only supports <=%u CPUs\n",
+ MAX_NUM_CPUS);
+
+ llcc_pmu = devm_kzalloc(&pdev->dev, sizeof(*llcc_pmu), GFP_KERNEL);
+ if (!llcc_pmu)
+ return -ENOMEM;
+
+ llcc_pmu->llcc_stats = devm_kcalloc(&pdev->dev, num_possible_cpus(),
+ sizeof(*llcc_pmu->llcc_stats), GFP_KERNEL);
+ if (!llcc_pmu->llcc_stats)
+ return -ENOMEM;
+
+ llcc_pmu->base = devm_platform_ioremap_resource(pdev, 0);
+ if (IS_ERR(llcc_pmu->base))
+ return dev_err_probe(&pdev->dev, PTR_ERR(llcc_pmu->base),
+ "Failed to map LLCC PMU registers\n");
+
+ llcc_pmu->pmu = (struct pmu) {
+ .event_init = qcom_llcc_event_init,
+ .add = qcom_llcc_event_add,
+ .del = qcom_llcc_event_del,
+ .start = qcom_llcc_pmu_start,
+ .stop = qcom_llcc_event_stop,
+ .read = qcom_llcc_event_read,
+
+ .attr_groups = qcom_llcc_pmu_attr_groups,
+ .capabilities = PERF_PMU_CAP_NO_EXCLUDE,
+ .task_ctx_nr = perf_invalid_context,
+
+ .module = THIS_MODULE,
+ };
+
+ raw_spin_lock_init(&llcc_pmu->lock);
+
+ ret = perf_pmu_register(&llcc_pmu->pmu, "llcc_pmu", -1);
+ if (ret)
+ return dev_err_probe(&pdev->dev, ret, "Failed to register LLCC PMU\n");
+
+ return 0;
+}
+
+static const struct of_device_id qcom_llcc_pmu_match_table[] = {
+ { .compatible = "qcom,llcc-pmu-v2" },
+ { }
+};
+
+static struct platform_driver qcom_llcc_pmu_driver = {
+ .probe = qcom_llcc_pmu_probe,
+ .driver = {
+ .name = "qcom-llcc-pmu",
+ .of_match_table = qcom_llcc_pmu_match_table,
+ .suppress_bind_attrs = true,
+ },
+};
+module_platform_driver(qcom_llcc_pmu_driver);
+
+MODULE_DEVICE_TABLE(of, qcom_llcc_pmu_match_table);
+MODULE_DESCRIPTION("QCOM LLCC PMU");
+MODULE_LICENSE("GPL");

---
base-commit: 21ef7b1e17d039053edaeaf41142423810572741
change-id: 20230809-topic-llcc_pmu-c6e9dbc36b12

Best regards,
--
Konrad Dybcio <konrad.dybcio@xxxxxxxxxx>