[PATCH] drm/panthor: Add support for performance counters

From: Adrián Larumbe
Date: Tue Mar 05 2024 - 11:58:49 EST


This adds support for Panthor's HW performance counters and for querying
them from UM through dedicated ioctl()s. The code is inspired by the
existing functionality in the Panfrost driver, with some noteworthy
differences:

- The sample size is reported by the firmware rather than computed by
hand.
- Counter samples are chained in a ring buffer that can be accessed
concurrently, but only from threads within a single context (this is
because of a HW limitation).
- The list of enabled counters must be provided explicitly by UM.
- Rather than allocating the BO that holds the perfcounter values in the
render context's address space, the sample ring buffer is mapped onto
the MCU's VM.
- If more than one thread within the same context requests a dump, the
kernel copies the same sample to every thread that managed to join the
dump queue before the FW finished processing the sample request.
- UM must provide a BO handle for retrieval of the perfcnt values rather
than passing a user virtual address.

Multicontext access to the driver's perfcnt ioctl interface isn't allowed
because enabling a set of counters different from the current one implies
a counter reset, which also disturbs the ring buffer's extraction and
insertion pointers. This is an unfortunate hardware limitation.
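
For illustration, here is a rough userspace sketch of the intended flow.
It is not taken from any real UM driver: it assumes libdrm's drmIoctl()
wrapper, an fd opened on a panthor render node, the uapi header exposed
as <drm/panthor_drm.h>, and a BO created beforehand (e.g. through the
existing BO_CREATE ioctl) whose size covers at least one sample
(hw_size + fw_size as reported by the new PERFCNT_INFO query). Error
handling is trimmed:

  #include <stdint.h>
  #include <string.h>
  #include <xf86drm.h>
  #include <drm/panthor_drm.h>

  static int perfcnt_sample_once(int fd, uint32_t bo_handle)
  {
          struct drm_panthor_perfcnt_info info = {0};
          struct drm_panthor_dev_query query = {
                  .type = DRM_PANTHOR_DEV_QUERY_PERFCNT_INFO,
                  .size = sizeof(info),
                  .pointer = (uintptr_t)&info,
          };
          struct drm_panthor_perfcnt_config cfg = {
                  .ringslots = 4,    /* capped/rounded to a power of two by the kernel */
                  .counterset = 0,   /* set 0 is supported by every counter block */
                  .csg_select = ~0u, /* sample all CSG instances */
                  .fw_enable = ~0u,
                  .csg_enable = ~0u,
                  .csf_enable = ~0u,
                  .shader_enable = ~0u,
                  .tiler_enable = ~0u,
                  .mmu_l2_enable = ~0u,
          };
          struct drm_panthor_perfcnt_dump dump = { .handle = bo_handle };
          int ret;

          /* The BO must be at least info.hw_size + info.fw_size bytes. */
          ret = drmIoctl(fd, DRM_IOCTL_PANTHOR_DEV_QUERY, &query);
          if (ret)
                  return ret;

          /* Non-zero enable masks: this call enables the counters. */
          ret = drmIoctl(fd, DRM_IOCTL_PANTHOR_PERFCNT_CONFIG, &cfg);
          if (ret)
                  return ret;

          /* Trigger a sample and have it copied into the supplied BO. */
          ret = drmIoctl(fd, DRM_IOCTL_PANTHOR_PERFCNT_DUMP, &dump);

          /* All-zero enable masks disable counting and release the interface. */
          memset(&cfg, 0, sizeof(cfg));
          drmIoctl(fd, DRM_IOCTL_PANTHOR_PERFCNT_CONFIG, &cfg);

          return ret;
  }

As described above, only the context that enabled the counters can issue
dumps until it disables them again by passing all-zero enable masks.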

Signed-off-by: Adrián Larumbe <adrian.larumbe@xxxxxxxxxxxxx>
---
drivers/gpu/drm/panthor/Makefile | 3 +-
drivers/gpu/drm/panthor/panthor_device.c | 6 +
drivers/gpu/drm/panthor/panthor_device.h | 6 +
drivers/gpu/drm/panthor/panthor_drv.c | 61 +++
drivers/gpu/drm/panthor/panthor_fw.c | 27 ++
drivers/gpu/drm/panthor/panthor_fw.h | 12 +
drivers/gpu/drm/panthor/panthor_perfcnt.c | 551 ++++++++++++++++++++++
drivers/gpu/drm/panthor/panthor_perfcnt.h | 31 ++
drivers/gpu/drm/panthor/panthor_sched.c | 1 +
include/uapi/drm/panthor_drm.h | 72 +++
10 files changed, 769 insertions(+), 1 deletion(-)
create mode 100644 drivers/gpu/drm/panthor/panthor_perfcnt.c
create mode 100644 drivers/gpu/drm/panthor/panthor_perfcnt.h

diff --git a/drivers/gpu/drm/panthor/Makefile b/drivers/gpu/drm/panthor/Makefile
index 15294719b09c..7f841fd053d4 100644
--- a/drivers/gpu/drm/panthor/Makefile
+++ b/drivers/gpu/drm/panthor/Makefile
@@ -9,6 +9,7 @@ panthor-y := \
panthor_gpu.o \
panthor_heap.o \
panthor_mmu.o \
- panthor_sched.o
+ panthor_sched.o \
+ panthor_perfcnt.o

obj-$(CONFIG_DRM_PANTHOR) += panthor.o
diff --git a/drivers/gpu/drm/panthor/panthor_device.c b/drivers/gpu/drm/panthor/panthor_device.c
index bfe8da4a6e4c..5dfd82891063 100644
--- a/drivers/gpu/drm/panthor/panthor_device.c
+++ b/drivers/gpu/drm/panthor/panthor_device.c
@@ -20,6 +20,7 @@
#include "panthor_mmu.h"
#include "panthor_regs.h"
#include "panthor_sched.h"
+#include "panthor_perfcnt.h"

static int panthor_clk_init(struct panthor_device *ptdev)
{
@@ -78,6 +79,7 @@ void panthor_device_unplug(struct panthor_device *ptdev)
/* Now, try to cleanly shutdown the GPU before the device resources
* get reclaimed.
*/
+ panthor_perfcnt_unplug(ptdev);
panthor_sched_unplug(ptdev);
panthor_fw_unplug(ptdev);
panthor_mmu_unplug(ptdev);
@@ -233,6 +235,10 @@ int panthor_device_init(struct panthor_device *ptdev)
if (ret)
goto err_unplug_fw;

+ ret = panthor_perfcnt_init(ptdev);
+ if (ret)
+ goto err_rpm_put;
+
/* ~3 frames */
pm_runtime_set_autosuspend_delay(ptdev->base.dev, 50);
pm_runtime_use_autosuspend(ptdev->base.dev);
diff --git a/drivers/gpu/drm/panthor/panthor_device.h b/drivers/gpu/drm/panthor/panthor_device.h
index 51c9d61b6796..adf0bd29deb0 100644
--- a/drivers/gpu/drm/panthor/panthor_device.h
+++ b/drivers/gpu/drm/panthor/panthor_device.h
@@ -100,6 +100,9 @@ struct panthor_device {
/** @csif_info: Command stream interface information. */
struct drm_panthor_csif_info csif_info;

+ /** @perfcnt_info: Performance counters interface information. */
+ struct drm_panthor_perfcnt_info perfcnt_info;
+
/** @gpu: GPU management data. */
struct panthor_gpu *gpu;

@@ -127,6 +130,9 @@ struct panthor_device {
struct completion done;
} unplug;

+ /** @perfcnt: Device performance counters data. */
+ struct panthor_perfcnt *perfcnt;
+
/** @reset: Reset related fields. */
struct {
/** @wq: Ordered worqueud used to schedule reset operations. */
diff --git a/drivers/gpu/drm/panthor/panthor_drv.c b/drivers/gpu/drm/panthor/panthor_drv.c
index ff484506229f..6cb9ea0aa553 100644
--- a/drivers/gpu/drm/panthor/panthor_drv.c
+++ b/drivers/gpu/drm/panthor/panthor_drv.c
@@ -27,6 +27,7 @@
#include "panthor_mmu.h"
#include "panthor_regs.h"
#include "panthor_sched.h"
+#include "panthor_perfcnt.h"

/**
* DOC: user <-> kernel object copy helpers.
@@ -164,6 +165,7 @@ panthor_get_uobj_array(const struct drm_panthor_obj_array *in, u32 min_stride,
_Generic(_obj_name, \
PANTHOR_UOBJ_DECL(struct drm_panthor_gpu_info, tiler_present), \
PANTHOR_UOBJ_DECL(struct drm_panthor_csif_info, pad), \
+ PANTHOR_UOBJ_DECL(struct drm_panthor_perfcnt_info, fw_size), \
PANTHOR_UOBJ_DECL(struct drm_panthor_sync_op, timeline_value), \
PANTHOR_UOBJ_DECL(struct drm_panthor_queue_submit, syncs), \
PANTHOR_UOBJ_DECL(struct drm_panthor_queue_create, ringbuf_size), \
@@ -765,6 +767,10 @@ static int panthor_ioctl_dev_query(struct drm_device *ddev, void *data, struct d
args->size = sizeof(ptdev->csif_info);
return 0;

+ case DRM_PANTHOR_DEV_QUERY_PERFCNT_INFO:
+ args->size = sizeof(ptdev->perfcnt_info);
+ return 0;
+
default:
return -EINVAL;
}
@@ -777,6 +783,9 @@ static int panthor_ioctl_dev_query(struct drm_device *ddev, void *data, struct d
case DRM_PANTHOR_DEV_QUERY_CSIF_INFO:
return PANTHOR_UOBJ_SET(args->pointer, args->size, ptdev->csif_info);

+ case DRM_PANTHOR_DEV_QUERY_PERFCNT_INFO:
+ return PANTHOR_UOBJ_SET(args->pointer, args->size, ptdev->perfcnt_info);
+
default:
return -EINVAL;
}
@@ -1245,6 +1254,55 @@ static int panthor_ioctl_vm_get_state(struct drm_device *ddev, void *data,
return 0;
}

+static bool perf_masks_zero(struct drm_panthor_perfcnt_config *req)
+{
+
+ u32 counters_mask = req->csg_select | req->fw_enable |
+ req->csg_enable | req->csf_enable | req->shader_enable |
+ req->tiler_enable | req->mmu_l2_enable;
+
+ return !counters_mask;
+}
+
+static int panthor_ioctl_perfcnt_config(struct drm_device *dev, void *data,
+ struct drm_file *file_priv)
+{
+ struct panthor_file *pfile = file_priv->driver_priv;
+ struct panthor_device *ptdev = pfile->ptdev;
+ struct drm_panthor_perfcnt_config *req = data;
+
+ /*
+ * GLB_PRFCNT_CONFIG.SET_SELECT: This field selects which set of counter
+ * events is used. For those counter blocks that support it, it effectively
+ * chooses between up to four sets of event count inputs to
+ * the same counter block. All counter blocks support counter set 0.
+ */
+ if (req->counterset > 3)
+ return -EINVAL;
+
+ return panthor_perfcnt_config(ptdev, req, pfile, perf_masks_zero(req));
+}
+
+static int panthor_ioctl_perfcnt_dump(struct drm_device *dev, void *data,
+ struct drm_file *file_priv)
+{
+ struct panthor_file *pfile = file_priv->driver_priv;
+ struct panthor_device *ptdev = pfile->ptdev;
+ struct drm_panthor_perfcnt_dump *req = data;
+ struct drm_gem_object *obj;
+ int ret;
+
+ obj = drm_gem_object_lookup(file_priv, req->handle);
+ if (!obj)
+ return -ENOENT;
+
+ ret = panthor_perfcnt_dump(ptdev, obj, file_priv->driver_priv);
+
+ drm_gem_object_put(obj);
+
+ return ret;
+}
+
static int
panthor_open(struct drm_device *ddev, struct drm_file *file)
{
@@ -1290,6 +1348,7 @@ panthor_postclose(struct drm_device *ddev, struct drm_file *file)
{
struct panthor_file *pfile = file->driver_priv;

+ panthor_perfcnt_close(file);
panthor_group_pool_destroy(pfile);
panthor_vm_pool_destroy(pfile);

@@ -1314,6 +1373,8 @@ static const struct drm_ioctl_desc panthor_drm_driver_ioctls[] = {
PANTHOR_IOCTL(TILER_HEAP_CREATE, tiler_heap_create, DRM_RENDER_ALLOW),
PANTHOR_IOCTL(TILER_HEAP_DESTROY, tiler_heap_destroy, DRM_RENDER_ALLOW),
PANTHOR_IOCTL(GROUP_SUBMIT, group_submit, DRM_RENDER_ALLOW),
+ PANTHOR_IOCTL(PERFCNT_CONFIG, perfcnt_config, DRM_RENDER_ALLOW),
+ PANTHOR_IOCTL(PERFCNT_DUMP, perfcnt_dump, DRM_RENDER_ALLOW),
};

static int panthor_mmap(struct file *filp, struct vm_area_struct *vma)
diff --git a/drivers/gpu/drm/panthor/panthor_fw.c b/drivers/gpu/drm/panthor/panthor_fw.c
index 33c87a59834e..7b31bb6c21b9 100644
--- a/drivers/gpu/drm/panthor/panthor_fw.c
+++ b/drivers/gpu/drm/panthor/panthor_fw.c
@@ -23,6 +23,7 @@
#include "panthor_mmu.h"
#include "panthor_regs.h"
#include "panthor_sched.h"
+#include "panthor_perfcnt.h"

#define CSF_FW_NAME "mali_csffw.bin"

@@ -947,6 +948,7 @@ static void panthor_fw_init_global_iface(struct panthor_device *ptdev)
GLB_PING |
GLB_CFG_PROGRESS_TIMER |
GLB_CFG_POWEROFF_TIMER |
+ GLB_PERFCNT_SAMPLE |
GLB_IDLE_EN |
GLB_IDLE;

@@ -975,6 +977,10 @@ static void panthor_job_irq_handler(struct panthor_device *ptdev, u32 status)
return;

panthor_sched_report_fw_events(ptdev, status);
+
+ /* Let the perfcnt layer figure out if there are PERFCNT events to process. */
+ if (status & JOB_INT_GLOBAL_IF)
+ panthor_perfcnt_report_fw_events(ptdev, status);
}
PANTHOR_IRQ_HANDLER(job, JOB, panthor_job_irq_handler);

@@ -1213,6 +1219,26 @@ int panthor_fw_glb_wait_acks(struct panthor_device *ptdev,
req_mask, acked, timeout_ms);
}

+/**
+ * panthor_fw_glb_state_change() - Check for a change of state in global request register flags
+ * @ptdev: Device.
+ * @req_mask: Mask of requests to check change of state for.
+ * @flipped: Pointer to field that's updated with the flipped requests.
+ * If the function returns false, *flipped == 0.
+ *
+ * Return: true on change, false otherwise.
+ */
+bool panthor_fw_glb_state_change(struct panthor_device *ptdev,
+ u32 req_mask, u32 *flipped)
+{
+ struct panthor_fw_global_iface *glb_iface = panthor_fw_get_glb_iface(ptdev);
+ u32 req = READ_ONCE(glb_iface->input->req) & req_mask;
+ u32 ack = READ_ONCE(glb_iface->output->ack) & req_mask;
+
+ *flipped = (req ^ ack);
+ return (*flipped != 0);
+}
+
/**
* panthor_fw_csg_wait_acks() - Wait for command stream group requests to be acknowledged.
* @ptdev: Device.
@@ -1352,6 +1378,7 @@ int panthor_fw_init(struct panthor_device *ptdev)
goto err_unplug_fw;

panthor_fw_init_global_iface(ptdev);
+
return 0;

err_unplug_fw:
diff --git a/drivers/gpu/drm/panthor/panthor_fw.h b/drivers/gpu/drm/panthor/panthor_fw.h
index 22448abde992..682a02118077 100644
--- a/drivers/gpu/drm/panthor/panthor_fw.h
+++ b/drivers/gpu/drm/panthor/panthor_fw.h
@@ -11,6 +11,7 @@ struct panthor_kernel_bo;

#define MAX_CSGS 31
#define MAX_CS_PER_CSG 32
+#define MAX_PERFCNT_BUF_SLOTS 128

struct panthor_fw_ringbuf_input_iface {
u64 insert;
@@ -197,6 +198,8 @@ struct panthor_fw_global_control_iface {
u32 output_va;
u32 group_num;
u32 group_stride;
+#define GLB_PERFCNT_FW_SIZE(x) (((x) >> 16) << 8)
+#define GLB_PERFCNT_HW_SIZE(x) (((x) & GENMASK(15, 0)) << 8)
u32 perfcnt_size;
u32 instr_features;
};
@@ -240,6 +243,8 @@ struct panthor_fw_global_input_iface {
u64 perfcnt_base;
u32 perfcnt_extract;
u32 reserved3[3];
+#define GLB_PERFCNT_CFG_SIZE(x) ((x) & GENMASK(7, 0))
+#define GLB_PERFCNT_CFG_SET(x) ((GENMASK(1, 0) & (x)) << 8)
u32 perfcnt_config;
u32 perfcnt_csg_select;
u32 perfcnt_fw_enable;
@@ -264,6 +269,11 @@ struct panthor_fw_global_output_iface {
u32 doorbell_ack;
u32 reserved2;
u32 halt_status;
+
+#define GLB_PERFCNT_STATUS_FAILED BIT(0)
+#define GLB_PERFCNT_STATUS_POWERON BIT(1)
+#define GLB_PERFCNT_STATUS_POWEROFF BIT(2)
+#define GLB_PERFCNT_STATUS_PROTSESSION BIT(3)
u32 perfcnt_status;
u32 perfcnt_insert;
};
@@ -472,6 +482,8 @@ int panthor_fw_csg_wait_acks(struct panthor_device *ptdev, u32 csg_id, u32 req_m
int panthor_fw_glb_wait_acks(struct panthor_device *ptdev, u32 req_mask, u32 *acked,
u32 timeout_ms);

+bool panthor_fw_glb_state_change(struct panthor_device *ptdev, u32 req_mask, u32 *flipped);
+
void panthor_fw_ring_csg_doorbells(struct panthor_device *ptdev, u32 csg_slot);

struct panthor_kernel_bo *
diff --git a/drivers/gpu/drm/panthor/panthor_perfcnt.c b/drivers/gpu/drm/panthor/panthor_perfcnt.c
new file mode 100644
index 000000000000..e223e44e3f35
--- /dev/null
+++ b/drivers/gpu/drm/panthor/panthor_perfcnt.c
@@ -0,0 +1,551 @@
+// SPDX-License-Identifier: GPL-2.0
+/* Copyright 2023 Collabora Ltd */
+
+#include "linux/mutex.h"
+#include <linux/completion.h>
+#include <linux/iopoll.h>
+#include <linux/iosys-map.h>
+#include <linux/pm_runtime.h>
+#include <linux/slab.h>
+#include <linux/uaccess.h>
+#include <linux/spinlock.h>
+
+#include <drm/drm_file.h>
+#include <drm/drm_gem_shmem_helper.h>
+#include <drm/drm_managed.h>
+#include <drm/panthor_drm.h>
+
+#include "panthor_device.h"
+#include "panthor_gem.h"
+#include "panthor_mmu.h"
+#include "panthor_perfcnt.h"
+#include "panthor_regs.h"
+#include "panthor_gpu.h"
+#include "panthor_fw.h"
+
+#define SAMPLE_TIMEOUT_MS 1000
+#define SAMPLE_HDR_SIZE 12
+#define SAMPLE_BLOCK_SIZE 0x100
+
+#define PERFCNT_OP_AFFECTED \
+ (GLB_PERFCNT_STATUS_POWEROFF | \
+ GLB_PERFCNT_STATUS_POWERON | \
+ GLB_PERFCNT_STATUS_PROTSESSION)
+
+enum perfcnt_status {
+ PERFCNT_STATUS_STARTED,
+ PERFCNT_STATUS_SUCCEEDED,
+ PERFCNT_STATUS_FAILED,
+ PERFCNT_STATUS_OVERFLOW,
+};
+
+struct panthor_perfcnt {
+ struct panthor_device *ptdev;
+ struct panthor_file *user;
+ struct mutex lock;
+
+ struct panthor_kernel_bo *bo;
+ size_t sample_size;
+ u32 ringslots;
+
+ struct workqueue_struct *dumper_wkq;
+ struct work_struct work;
+ atomic_t dump_requested;
+
+ struct list_head dumper_list;
+ wait_queue_head_t wq;
+};
+
+struct panthor_perfcnt_dumper {
+ struct list_head list;
+ struct completion comp;
+ void *user_bo;
+ int last_status;
+};
+
+struct perfcnt_counters {
+ u32 counterset;
+ u32 csg_select;
+ u32 fw_enable;
+ u32 csg_enable;
+ u32 csf_enable;
+ u32 shader_enable;
+ u32 tiler_enable;
+ u32 mmu_l2_enable;
+};
+
+static int panthor_perfcnt_enable_counters(struct panthor_device *ptdev,
+ struct perfcnt_counters *counters)
+{
+ struct panthor_fw_global_iface *glb_iface = panthor_fw_get_glb_iface(ptdev);
+ u32 acked;
+
+ if (glb_iface->input->req & GLB_PERFCNT_EN) {
+ drm_info(&ptdev->base, "Performance counters aren't disabled!\n");
+ return -EBUSY;
+ }
+
+ glb_iface->input->perfcnt_config |= GLB_PERFCNT_CFG_SET(counters->counterset);
+ glb_iface->input->perfcnt_csg_select = counters->csg_select;
+ glb_iface->input->perfcnt_mmu_l2_enable = counters->mmu_l2_enable;
+ glb_iface->input->perfcnt_tiler_enable = counters->tiler_enable;
+ glb_iface->input->perfcnt_shader_enable = counters->shader_enable;
+ glb_iface->input->perfcnt_csf_enable = counters->csf_enable;
+ glb_iface->input->perfcnt_csg_enable = counters->csg_enable;
+ glb_iface->input->perfcnt_fw_enable = counters->fw_enable;
+
+ /* The enabled/disabled state is encoded in the bit's value, not in a change of value */
+ panthor_fw_update_reqs(glb_iface, req, GLB_PERFCNT_EN, GLB_PERFCNT_EN);
+ gpu_write(ptdev, CSF_DOORBELL(CSF_GLB_DOORBELL_ID), 1);
+ return panthor_fw_glb_wait_acks(ptdev, GLB_PERFCNT_EN, &acked, 100);
+}
+
+static int
+panthor_perfcnt_disable_counters(struct panthor_device *ptdev)
+{
+ struct panthor_fw_global_iface *glb_iface = panthor_fw_get_glb_iface(ptdev);
+ u32 acked;
+ int ret;
+
+ if (!(glb_iface->input->req & GLB_PERFCNT_EN)) {
+ drm_info(&ptdev->base, "Performance counters were already disabled\n");
+ return 0;
+ }
+
+ /* The enabled/disabled state is encoded in the bit's value, not in a change of value */
+ panthor_fw_update_reqs(glb_iface, req, 0, GLB_PERFCNT_EN);
+ gpu_write(ptdev, CSF_DOORBELL(CSF_GLB_DOORBELL_ID), 1);
+ ret = panthor_fw_glb_wait_acks(ptdev, GLB_PERFCNT_EN, &acked, 100);
+ if (ret) {
+ drm_err(&ptdev->base, "Could not disable performance counters\n");
+ return ret;
+ }
+
+ glb_iface->input->perfcnt_csg_select = 0;
+ glb_iface->input->perfcnt_mmu_l2_enable = 0;
+ glb_iface->input->perfcnt_tiler_enable = 0;
+ glb_iface->input->perfcnt_shader_enable = 0;
+ glb_iface->input->perfcnt_csf_enable = 0;
+ glb_iface->input->perfcnt_csg_enable = 0;
+ glb_iface->input->perfcnt_fw_enable = 0;
+
+ return 0;
+}
+
+static void perfcnt_copy_sample(struct panthor_device *ptdev,
+ struct panthor_perfcnt *perfcnt,
+ void *bo_va, unsigned int idx)
+{
+ /*
+ * The ring buffer index can be computed this way because the number of
+ * slots is always guaranteed to be a power of 2.
+ */
+ memcpy(bo_va, perfcnt->bo->kmap +
+ ((idx & (perfcnt->ringslots - 1)) * perfcnt->sample_size),
+ perfcnt->sample_size);
+}
+
+static void clear_slot_headers(struct panthor_device *ptdev, u32 ext_idx, u32 ins_idx)
+{
+ struct panthor_perfcnt *perfcnt = ptdev->perfcnt;
+ unsigned int offset;
+ unsigned int i;
+
+ if (WARN_ON(ext_idx >= ins_idx)) {
+ drm_warn(&ptdev->base, "Extraction index is greater or equal than insertion index %u-%u\n",
+ ext_idx, ins_idx);
+ return;
+ }
+
+ drm_dbg(&ptdev->base, "Cleaning perfcnt ring buffer slots %u-%u\n", ext_idx, ins_idx);
+
+ for (i = ext_idx; i < ins_idx; i++) {
+ void *slot = perfcnt->bo->kmap +
+ ((i & (ptdev->perfcnt->ringslots - 1)) * perfcnt->sample_size);
+
+ for (offset = 0; offset < perfcnt->sample_size; offset += SAMPLE_BLOCK_SIZE)
+ memset(slot + offset, 0, SAMPLE_HDR_SIZE);
+ }
+}
+
+static void clean_dumper_list(struct panthor_device *ptdev, unsigned int status)
+{
+ struct panthor_perfcnt *perfcnt = ptdev->perfcnt;
+ struct panthor_fw_global_iface *glb_iface = panthor_fw_get_glb_iface(ptdev);
+ struct panthor_perfcnt_dumper *dumper, *dumper_tmp;
+
+ mutex_lock(&perfcnt->lock);
+ list_for_each_entry_safe(dumper, dumper_tmp, &perfcnt->dumper_list, list) {
+ if (status == PERFCNT_STATUS_SUCCEEDED)
+ perfcnt_copy_sample(ptdev, perfcnt, dumper->user_bo,
+ glb_iface->output->perfcnt_insert - 1);
+ list_del(&dumper->list);
+ INIT_LIST_HEAD(&dumper->list);
+ dumper->last_status = status;
+ complete(&dumper->comp);
+ }
+ mutex_unlock(&perfcnt->lock);
+}
+
+static void perfcnt_process_sample(struct work_struct *work)
+{
+ struct panthor_perfcnt *perfcnt =
+ container_of(work, struct panthor_perfcnt, work);
+ struct panthor_device *ptdev = perfcnt->ptdev;
+ struct panthor_fw_global_iface *glb_iface = panthor_fw_get_glb_iface(ptdev);
+ u32 acked, flipped;
+ int ret;
+
+ if (panthor_fw_glb_state_change(ptdev, GLB_PERFCNT_THRESHOLD, &flipped)) {
+ drm_dbg(&ptdev->base, "Performance counter buffer has reached 50%% capacity\n");
+ panthor_fw_toggle_reqs(glb_iface, req, ack, flipped);
+ gpu_write(ptdev, CSF_DOORBELL(CSF_GLB_DOORBELL_ID), 1);
+ ret = panthor_fw_glb_wait_acks(ptdev, flipped, &acked, 100);
+ if (ret)
+ drm_warn(&ptdev->base, "Resetting Threshold flags failed\n");
+ }
+
+ if (glb_iface->output->perfcnt_status & GLB_PERFCNT_STATUS_FAILED) {
+ drm_err(&ptdev->base, "Perfcounter sampling failed\n");
+ clean_dumper_list(ptdev, PERFCNT_STATUS_FAILED);
+ goto worker_exit;
+ }
+
+ if (panthor_fw_glb_state_change(ptdev, GLB_PERFCNT_OVERFLOW, &flipped)) {
+ drm_info(&ptdev->base, "The performance counter buffer has overflowed. Some samples may have been lost\n");
+ panthor_fw_toggle_reqs(glb_iface, req, ack, flipped);
+ gpu_write(ptdev, CSF_DOORBELL(CSF_GLB_DOORBELL_ID), 1);
+ ret = panthor_fw_glb_wait_acks(ptdev, flipped, &acked, 100);
+ if (ret)
+ drm_err(&ptdev->base, "Resetting Overflow flags failed\n");
+ clean_dumper_list(ptdev, PERFCNT_STATUS_OVERFLOW);
+ goto clear_inc_idx;
+ }
+
+ if (glb_iface->output->perfcnt_status & PERFCNT_OP_AFFECTED)
+ drm_warn(&ptdev->base, "Perfcnt sample operation might have been impacted by a power transition or protected session exec\n");
+
+ clean_dumper_list(ptdev, PERFCNT_STATUS_SUCCEEDED);
+
+clear_inc_idx:
+ clear_slot_headers(ptdev, glb_iface->input->perfcnt_extract,
+ glb_iface->output->perfcnt_insert);
+ /*
+ * The TRM recommends advancing the extraction pointer by one after every
+ * sample operation, but because sample requests are processed sequentially
+ * and samples triggered automatically by the HW are discarded, we simply
+ * set it to the current insertion index.
+ */
+ WRITE_ONCE(glb_iface->input->perfcnt_extract,
+ READ_ONCE(glb_iface->output->perfcnt_insert));
+worker_exit:
+ wake_up_all(&perfcnt->wq);
+}
+
+int panthor_perfcnt_dump(struct panthor_device *ptdev,
+ struct drm_gem_object *obj,
+ struct panthor_file *pfile)
+{
+ struct panthor_fw_global_iface *glb_iface = panthor_fw_get_glb_iface(ptdev);
+ struct panthor_perfcnt *perfcnt = ptdev->perfcnt;
+ struct panthor_perfcnt_dumper dumper;
+ struct iosys_map map;
+
+ int ret;
+
+ mutex_lock(&perfcnt->lock);
+ if (perfcnt->user != pfile) {
+ ret = -EINVAL;
+ goto err_dump;
+ }
+
+ ret = drm_gem_vmap_unlocked(obj, &map);
+ if (ret) {
+ drm_err(&ptdev->base, "Could not map the target BO\n");
+ goto err_dump;
+ }
+
+ dumper.user_bo = map.vaddr;
+ dumper.last_status = PERFCNT_STATUS_STARTED;
+ init_completion(&dumper.comp);
+ list_add_tail(&dumper.list, &perfcnt->dumper_list);
+
+ /* Start the sampling if the list was empty */
+ if (list_is_first(&dumper.list, &perfcnt->dumper_list)) {
+ panthor_fw_toggle_reqs(glb_iface, req, ack, GLB_PERFCNT_SAMPLE);
+ atomic_set(&ptdev->perfcnt->dump_requested, 1);
+ gpu_write(ptdev, CSF_DOORBELL(CSF_GLB_DOORBELL_ID), 1);
+ }
+ mutex_unlock(&perfcnt->lock);
+
+ ret = wait_for_completion_interruptible_timeout(&dumper.comp,
+ msecs_to_jiffies(SAMPLE_TIMEOUT_MS));
+ if (!ret)
+ /* Let's give the worker thread a chance to finish */
+ ret = flush_work(&perfcnt->work);
+
+ if (!ret && !try_wait_for_completion(&dumper.comp)) {
+ mutex_lock(&perfcnt->lock);
+ if (!list_empty(&dumper.list)) {
+ list_del(&dumper.list);
+ if (list_empty(&perfcnt->dumper_list)) {
+ atomic_set(&ptdev->perfcnt->dump_requested, 0);
+ wake_up_all(&perfcnt->wq);
+ }
+ }
+ mutex_unlock(&perfcnt->lock);
+
+ ret = -ETIMEDOUT;
+ } else {
+ WARN_ON(dumper.last_status == PERFCNT_STATUS_STARTED);
+ ret = (dumper.last_status >= PERFCNT_STATUS_FAILED) ? -EIO : 0;
+ }
+
+ drm_gem_vunmap_unlocked(obj, &map);
+
+ return ret;
+
+err_dump:
+ mutex_unlock(&perfcnt->lock);
+ return ret;
+}
+
+
+static int panthor_perfcnt_enable_locked(struct panthor_device *ptdev,
+ struct panthor_file *pfile,
+ struct drm_panthor_perfcnt_config *req)
+{
+
+ unsigned int perfcnt_ringbuf_slots = req->ringslots;
+ struct panthor_perfcnt *perfcnt = ptdev->perfcnt;
+ struct panthor_kernel_bo *bo;
+ int ret;
+
+ if (pfile == perfcnt->user)
+ return 0;
+ else if (perfcnt->user)
+ return -EBUSY;
+
+ if (perfcnt_ringbuf_slots != perfcnt->ringslots) {
+ struct panthor_fw_global_iface *glb_iface = panthor_fw_get_glb_iface(ptdev);
+
+ if (perfcnt->bo) {
+ panthor_kernel_bo_destroy(panthor_fw_vm(ptdev), perfcnt->bo);
+ perfcnt->bo = NULL;
+ }
+
+ if (perfcnt_ringbuf_slots > MAX_PERFCNT_BUF_SLOTS)
+ perfcnt_ringbuf_slots = MAX_PERFCNT_BUF_SLOTS;
+ if (!is_power_of_2(perfcnt_ringbuf_slots))
+ perfcnt_ringbuf_slots = rounddown_pow_of_two(perfcnt_ringbuf_slots);
+
+ /*
+ * Create the perfcnt dump BO. We need to use the FW's VM because the maximum
+ * implementation-defined value of GLB_PRFCNT_JASID is 7, and the way ASes are
+ * assigned to VMs in panthor_vm_active() means we cannot guarantee an AS between
+ * 1 and 7 would be available. An alternative would be implementing some sort of
+ * AS eviction mechanism, or setting one AS aside for perfcnt. However, since the
+ * counters are global, it's simpler to bind the perfcnt ring buffer to the FW AS.
+ */
+ bo = panthor_kernel_bo_create(ptdev, panthor_fw_vm(ptdev),
+ perfcnt->sample_size * perfcnt_ringbuf_slots,
+ DRM_PANTHOR_BO_NO_MMAP,
+ DRM_PANTHOR_VM_BIND_OP_MAP_NOEXEC |
+ DRM_PANTHOR_VM_BIND_OP_MAP_UNCACHED,
+ PANTHOR_VM_KERNEL_AUTO_VA);
+ if (IS_ERR(bo))
+ return PTR_ERR(bo);
+
+ ret = panthor_kernel_bo_vmap(bo);
+ if (ret)
+ goto err_put_bo;
+
+ perfcnt->bo = bo;
+ perfcnt->ringslots = perfcnt_ringbuf_slots;
+ glb_iface->input->perfcnt_base = perfcnt->bo->va_node.start;
+ glb_iface->input->perfcnt_config |= GLB_PERFCNT_CFG_SIZE(perfcnt->ringslots);
+ }
+
+ ret = pm_runtime_get_sync(ptdev->base.dev);
+ if (ret < 0)
+ goto enable_err;
+
+ ret = panthor_perfcnt_disable_counters(ptdev);
+ if (ret)
+ goto enable_err;
+
+ ret = panthor_perfcnt_enable_counters(ptdev,
+ (struct perfcnt_counters *) &req->counterset);
+ if (ret)
+ goto enable_err;
+
+ perfcnt->user = pfile;
+
+ return 0;
+
+enable_err:
+ pm_runtime_put(ptdev->base.dev);
+ panthor_kernel_bo_vunmap(bo);
+err_put_bo:
+ panthor_kernel_bo_destroy(panthor_fw_vm(ptdev), bo);
+ perfcnt->bo = NULL;
+ return ret;
+}
+
+static int panthor_perfcnt_disable_locked(struct panthor_device *ptdev,
+ struct panthor_file *pfile)
+{
+ struct panthor_fw_global_iface *glb_iface = panthor_fw_get_glb_iface(ptdev);
+ struct panthor_perfcnt *perfcnt = ptdev->perfcnt;
+ int ret;
+
+ if (perfcnt->user != pfile)
+ return -EINVAL;
+
+ if (!list_empty(&perfcnt->dumper_list)) {
+ drm_warn(&ptdev->base, "A perfcnt dump is still running, let it finnish\n");
+ mutex_unlock(&perfcnt->lock);
+ ret = wait_event_timeout(perfcnt->wq,
+ list_empty(&perfcnt->dumper_list),
+ msecs_to_jiffies(SAMPLE_TIMEOUT_MS));
+ mutex_lock(&perfcnt->lock);
+ if (!ret)
+ drm_warn(&ptdev->base, "Dump didn't finish, results will be undefined\n");
+ }
+
+ panthor_perfcnt_disable_counters(ptdev);
+ glb_iface->input->perfcnt_extract = 0;
+ perfcnt->user = NULL;
+
+ pm_runtime_mark_last_busy(ptdev->base.dev);
+ pm_runtime_put_autosuspend(ptdev->base.dev);
+
+ return 0;
+}
+
+int panthor_perfcnt_config(struct panthor_device *ptdev,
+ struct drm_panthor_perfcnt_config *req,
+ struct panthor_file *pfile,
+ bool disable)
+{
+ struct panthor_perfcnt *perfcnt = ptdev->perfcnt;
+ int ret;
+
+ mutex_lock(&perfcnt->lock);
+ if (disable)
+ ret = panthor_perfcnt_disable_locked(ptdev, pfile);
+ else
+ ret = panthor_perfcnt_enable_locked(ptdev, pfile, req);
+ mutex_unlock(&perfcnt->lock);
+
+ return ret;
+}
+
+void panthor_perfcnt_close(struct drm_file *file_priv)
+{
+ struct panthor_file *pfile = file_priv->driver_priv;
+ struct panthor_device *ptdev = pfile->ptdev;
+ struct panthor_perfcnt *perfcnt = ptdev->perfcnt;
+
+ pm_runtime_get_sync(ptdev->base.dev);
+
+ mutex_lock(&perfcnt->lock);
+ if (perfcnt->user == pfile)
+ panthor_perfcnt_disable_locked(ptdev, file_priv->driver_priv);
+ mutex_unlock(&perfcnt->lock);
+
+ pm_runtime_mark_last_busy(ptdev->base.dev);
+ pm_runtime_put_autosuspend(ptdev->base.dev);
+}
+
+void panthor_perfcnt_report_fw_events(struct panthor_device *ptdev, u32 status)
+{
+
+ struct panthor_fw_global_iface *glb_iface = panthor_fw_get_glb_iface(ptdev);
+ struct panthor_perfcnt *perfcnt = ptdev->perfcnt;
+ u32 req, ack;
+
+ if (!perfcnt)
+ return;
+
+ req = READ_ONCE(glb_iface->input->req);
+ ack = READ_ONCE(glb_iface->output->ack);
+
+ if ((~(req ^ ack) & GLB_PERFCNT_SAMPLE) &&
+ !panthor_device_reset_is_pending(ptdev)) {
+ if (atomic_cmpxchg(&ptdev->perfcnt->dump_requested, 1, 0))
+ queue_work(perfcnt->dumper_wkq, &perfcnt->work);
+ }
+}
+
+int panthor_perfcnt_init(struct panthor_device *ptdev)
+{
+ struct panthor_fw_global_iface *glb_iface = panthor_fw_get_glb_iface(ptdev);
+ struct panthor_perfcnt *perfcnt;
+ int ret;
+
+ perfcnt = devm_kzalloc(ptdev->base.dev, sizeof(*perfcnt), GFP_KERNEL);
+ if (!perfcnt)
+ return -ENOMEM;
+
+ ptdev->perfcnt_info.fw_size = GLB_PERFCNT_FW_SIZE(glb_iface->control->perfcnt_size);
+ ptdev->perfcnt_info.hw_size = GLB_PERFCNT_HW_SIZE(glb_iface->control->perfcnt_size);
+
+ perfcnt->sample_size = ptdev->perfcnt_info.fw_size + ptdev->perfcnt_info.hw_size;
+ perfcnt->ringslots = 0;
+ perfcnt->bo = NULL;
+
+ perfcnt->dumper_wkq = alloc_workqueue("perfcnt-dumper", WQ_UNBOUND, 0);
+ if (!perfcnt->dumper_wkq) {
+ drm_err(&ptdev->base, "Failed to allocate perfcnt workqueue");
+ return -ENOMEM;
+ }
+ INIT_WORK(&perfcnt->work, perfcnt_process_sample);
+
+ /* Perfcnt configuration */
+ glb_iface->input->perfcnt_config |= GLB_PERFCNT_CFG_SIZE(perfcnt->ringslots);
+ glb_iface->input->perfcnt_as = panthor_vm_as(panthor_fw_vm(ptdev));
+ glb_iface->input->perfcnt_extract = 0;
+
+ /* Start with everything disabled. */
+ ret = panthor_perfcnt_disable_counters(ptdev);
+ if (ret)
+ goto err_dealloc_workqueue;
+
+ INIT_LIST_HEAD(&perfcnt->dumper_list);
+ init_waitqueue_head(&perfcnt->wq);
+ mutex_init(&perfcnt->lock);
+
+ perfcnt->ptdev = ptdev;
+ ptdev->perfcnt = perfcnt;
+
+ drm_info(&ptdev->base,
+ "Perfcnt params: Sample size: %#zx Slots: %u\n",
+ perfcnt->sample_size, perfcnt->ringslots);
+
+ return 0;
+
+err_dealloc_workqueue:
+ destroy_workqueue(perfcnt->dumper_wkq);
+
+ return ret;
+}
+
+void panthor_perfcnt_unplug(struct panthor_device *ptdev)
+{
+ struct panthor_perfcnt *perfcnt = ptdev->perfcnt;
+
+ WARN_ON(perfcnt->user);
+
+ panthor_perfcnt_disable_counters(ptdev);
+
+ cancel_work_sync(&perfcnt->work);
+ destroy_workqueue(perfcnt->dumper_wkq);
+
+ mutex_destroy(&perfcnt->lock);
+
+ if (perfcnt->bo) {
+ panthor_kernel_bo_vunmap(perfcnt->bo);
+ panthor_kernel_bo_destroy(panthor_fw_vm(ptdev), perfcnt->bo);
+ }
+}
diff --git a/drivers/gpu/drm/panthor/panthor_perfcnt.h b/drivers/gpu/drm/panthor/panthor_perfcnt.h
new file mode 100644
index 000000000000..6edcbe256f4a
--- /dev/null
+++ b/drivers/gpu/drm/panthor/panthor_perfcnt.h
@@ -0,0 +1,31 @@
+/* SPDX-License-Identifier: GPL-2.0 */
+/* Copyright 2023 Collabora Ltd */
+#ifndef __PANTHOR_PERFCNT_H__
+#define __PANTHOR_PERFCNT_H__
+
+#include <linux/types.h>
+
+struct panthor_device;
+struct panthor_file;
+struct drm_device;
+struct drm_file;
+struct drm_gem_object;
+struct drm_panthor_perfcnt_config;
+
+int panthor_perfcnt_init(struct panthor_device *ptdev);
+void panthor_perfcnt_unplug(struct panthor_device *ptdev);
+void panthor_perfcnt_close(struct drm_file *file_priv);
+
+int panthor_perfcnt_config(struct panthor_device *ptdev,
+ struct drm_panthor_perfcnt_config *req,
+ struct panthor_file *pfile,
+ bool disable);
+int panthor_perfcnt_dump(struct panthor_device *ptdev,
+ struct drm_gem_object *obj,
+ struct panthor_file *pfile);
+
+void panthor_perfcnt_report_fw_events(struct panthor_device *ptdev,
+ u32 status);
+
+
+#endif
diff --git a/drivers/gpu/drm/panthor/panthor_sched.c b/drivers/gpu/drm/panthor/panthor_sched.c
index 5f7803b6fc48..cbd0ab77a3cd 100644
--- a/drivers/gpu/drm/panthor/panthor_sched.c
+++ b/drivers/gpu/drm/panthor/panthor_sched.c
@@ -31,6 +31,7 @@
#include "panthor_mmu.h"
#include "panthor_regs.h"
#include "panthor_sched.h"
+#include "panthor_perfcnt.h"

/**
* DOC: Scheduler
diff --git a/include/uapi/drm/panthor_drm.h b/include/uapi/drm/panthor_drm.h
index 373df80f41ed..0ca940529be4 100644
--- a/include/uapi/drm/panthor_drm.h
+++ b/include/uapi/drm/panthor_drm.h
@@ -127,6 +127,12 @@ enum drm_panthor_ioctl_id {

/** @DRM_PANTHOR_TILER_HEAP_DESTROY: Destroy a tiler heap. */
DRM_PANTHOR_TILER_HEAP_DESTROY,
+
+ /** @DRM_PANTHOR_PERFCNT_CONFIG: Enable or disable performance counters. */
+ DRM_PANTHOR_PERFCNT_CONFIG,
+
+ /** @DRM_PANTHOR_PERFCNT_DUMP: Sample and retrieve performance counters. */
+ DRM_PANTHOR_PERFCNT_DUMP,
};

/**
@@ -170,6 +176,10 @@ enum drm_panthor_ioctl_id {
DRM_IOCTL_PANTHOR(WR, TILER_HEAP_CREATE, tiler_heap_create)
#define DRM_IOCTL_PANTHOR_TILER_HEAP_DESTROY \
DRM_IOCTL_PANTHOR(WR, TILER_HEAP_DESTROY, tiler_heap_destroy)
+#define DRM_IOCTL_PANTHOR_PERFCNT_CONFIG \
+ DRM_IOCTL_PANTHOR(WR, PERFCNT_CONFIG, perfcnt_config)
+#define DRM_IOCTL_PANTHOR_PERFCNT_DUMP \
+ DRM_IOCTL_PANTHOR(WR, PERFCNT_DUMP, perfcnt_dump)

/**
* DOC: IOCTL arguments
@@ -260,6 +270,9 @@ enum drm_panthor_dev_query_type {

/** @DRM_PANTHOR_DEV_QUERY_CSIF_INFO: Query command-stream interface information. */
DRM_PANTHOR_DEV_QUERY_CSIF_INFO,
+
+ /** @DRM_PANTHOR_DEV_QUERY_PERFCNT_INFO: Query perf counters interface information. */
+ DRM_PANTHOR_DEV_QUERY_PERFCNT_INFO,
};

/**
@@ -377,6 +390,19 @@ struct drm_panthor_csif_info {
__u32 pad;
};

+/**
+ * struct drm_panthor_perfcnt_info - Performance counters interface information
+ *
+ * Structure grouping all queryable information relating to the perfcnt interface.
+ */
+struct drm_panthor_perfcnt_info {
+ /** @hw_size: Size, in bytes, of the HW portion of a perfcnt sample. */
+ __u32 hw_size;
+
+ /** @fw_size: Size, in bytes, of the FW portion of a perfcnt sample. */
+ __u32 fw_size;
+};
+
/**
* struct drm_panthor_dev_query - Arguments passed to DRM_PANTHOR_IOCTL_DEV_QUERY
*/
@@ -938,6 +964,52 @@ struct drm_panthor_tiler_heap_destroy {
__u32 pad;
};

+/**
+ * struct drm_panthor_perfcnt_config - Arguments passed to DRM_IOCTL_PANTHOR_PERFCNT_CONFIG
+ */
+struct drm_panthor_perfcnt_config {
+ /** @ringslots: Size of the perfcnt ring buffer in slot count. */
+ __u32 ringslots;
+
+ /** @counterset: Counter set to select (0-3). */
+ __u32 counterset;
+
+ /** @csg_select: Bitmask of CSG instances enabled for perf counting. */
+ __u32 csg_select;
+
+ /** @fw_enable: Bitmask of FW counters to enable. */
+ __u32 fw_enable;
+
+ /** @csg_enable: Bitmask of CSG counters to enable. */
+ __u32 csg_enable;
+
+ /** @csf_enable: Bitmask of CSF counters to enable. */
+ __u32 csf_enable;
+
+ /** @shader_enable: Bitmask of shader unit counters to enable. */
+ __u32 shader_enable;
+
+ /** @tiler_enable: Bitmask of tiler unit counters to enable. */
+ __u32 tiler_enable;
+
+ /** @mmu_l2_enable: Bitmask of MMU/L2 cache counters to enable. */
+ __u32 mmu_l2_enable;
+
+ /** @pad: Padding field, MBZ. */
+ __u32 pad;
+};
+
+/**
+ * struct drm_panthor_perfcnt_dump - Arguments passed to DRM_IOCTL_PANTHOR_PERFCNT_DUMP
+ */
+struct drm_panthor_perfcnt_dump {
+ /** @handle: Handle of the BO the perfcnt dump will be written into. */
+ __u32 handle;
+
+ /** @pad: Padding field, MBZ. */
+ __u32 pad;
+};
+
#if defined(__cplusplus)
}
#endif

base-commit: e635b7eb7062b464bbd9795308b1a80eac0b01f5
--
2.43.0