Re: [PATCH] drivers/perf: riscv_pmu_sbi: add perf_user_access sysctl

From: Palmer Dabbelt
Date: Thu Oct 06 2022 - 22:27:13 EST


On Fri, 26 Aug 2022 08:15:56 PDT (-0700), heiko@xxxxxxxxx wrote:
Add a sysctl similar to the one on arm64 to enable/disable
access to counter CSRs from u-mode on RISC-V.

The default is of course set to disabled keeping the current
state of access - to only the TIME CSR.

Sorry for being slow on this one, but IMO this is the wrong way to go: this was pretty clearly described by the PDFs as a non-optional instruction when we committed to uABI stability, and there's userspace binaries that use these instructions. I know the ISA folks changed their minds about these being in the base, but that doesn't mean we can break userspace.

If you're worried about a side channel from the high resolution timers that makes sense, but we can sort that out without breaking userspace: we just trap the counter accesses and handle them in the kernel with less precision. We'll need to do that in the long run anyway, as there's no way to make sure these are implement. That all applies to the time counter as well.

I'd also argue this should be a prctl(), as that'll allow users to flip it on/off for specific processes. We can make it sticky to deal with the side channels, but at least starting with a per-process flag will let us avoid breaking compatibility for everyone.

Signed-off-by: Heiko Stuebner <heiko@xxxxxxxxx>
---
Documentation/admin-guide/sysctl/kernel.rst | 6 +--
drivers/perf/riscv_pmu_sbi.c | 43 ++++++++++++++++++++-
2 files changed, 44 insertions(+), 5 deletions(-)

diff --git a/Documentation/admin-guide/sysctl/kernel.rst b/Documentation/admin-guide/sysctl/kernel.rst
index ee6572b1edad..efd4bc385e7a 100644
--- a/Documentation/admin-guide/sysctl/kernel.rst
+++ b/Documentation/admin-guide/sysctl/kernel.rst
@@ -894,15 +894,15 @@ enabled, otherwise writing to this file will return ``-EBUSY``.
The default value is 8.


-perf_user_access (arm64 only)
-=================================
+perf_user_access (arm64 and riscv only)
+=======================================

Controls user space access for reading perf event counters. When set to 1,
user space can read performance monitor counter registers directly.

The default value is 0 (access disabled).

-See Documentation/arm64/perf.rst for more information.
+See Documentation/arm64/perf.rst for more information on arm64


pid_max
diff --git a/drivers/perf/riscv_pmu_sbi.c b/drivers/perf/riscv_pmu_sbi.c
index 6f6681bbfd36..7aab8d673357 100644
--- a/drivers/perf/riscv_pmu_sbi.c
+++ b/drivers/perf/riscv_pmu_sbi.c
@@ -41,6 +41,8 @@ static const struct attribute_group *riscv_pmu_attr_groups[] = {
NULL,
};

+static int sysctl_perf_user_access __read_mostly;
+
/*
* RISC-V doesn't have hetergenous harts yet. This need to be part of
* per_cpu in case of harts with different pmu counters
@@ -640,13 +642,22 @@ static irqreturn_t pmu_sbi_ovf_handler(int irq, void *dev)
return IRQ_HANDLED;
}

+/*
+ * Depending on the perf_user_access setting, enable the access
+ * from usermode either for all counters or for TIME csr only.
+ */
+static void riscv_pmu_update_user_access(void *info)
+{
+ csr_write(CSR_SCOUNTEREN, sysctl_perf_user_access ? GENMASK(31, 0) :
+ 0x2);
+}
+
static int pmu_sbi_starting_cpu(unsigned int cpu, struct hlist_node *node)
{
struct riscv_pmu *pmu = hlist_entry_safe(node, struct riscv_pmu, node);
struct cpu_hw_events *cpu_hw_evt = this_cpu_ptr(pmu->hw_events);

- /* Enable the access for TIME csr only from the user mode now */
- csr_write(CSR_SCOUNTEREN, 0x2);
+ riscv_pmu_update_user_access(NULL);

/* Stop all the counters so that they can be enabled from perf */
pmu_sbi_stop_all(pmu);
@@ -785,6 +796,32 @@ static void riscv_pmu_destroy(struct riscv_pmu *pmu)
cpuhp_state_remove_instance(CPUHP_AP_PERF_RISCV_STARTING, &pmu->node);
}

+static int riscv_pmu_proc_user_access_handler(struct ctl_table *table,
+ int write, void *buffer, size_t *lenp, loff_t *ppos)
+{
+ int ret = proc_dointvec_minmax(table, write, buffer, lenp, ppos);
+
+ if (ret || !write)
+ return ret;
+
+ on_each_cpu(riscv_pmu_update_user_access, NULL, 1);
+
+ return 0;
+}
+
+static struct ctl_table sbi_pmu_sysctl_table[] = {
+ {
+ .procname = "perf_user_access",
+ .data = &sysctl_perf_user_access,
+ .maxlen = sizeof(unsigned int),
+ .mode = 0644,
+ .proc_handler = riscv_pmu_proc_user_access_handler,
+ .extra1 = SYSCTL_ZERO,
+ .extra2 = SYSCTL_ONE,
+ },
+ { }
+};
+
static int pmu_sbi_device_probe(struct platform_device *pdev)
{
struct riscv_pmu *pmu = NULL;
@@ -834,6 +871,8 @@ static int pmu_sbi_device_probe(struct platform_device *pdev)
if (ret)
goto out_unregister;

+ register_sysctl("kernel", sbi_pmu_sysctl_table);
+
return 0;

out_unregister: