Re: [PATCH v2 1/2] perf/arm-cmn: Decouple wp_config registers from filter group number

From: Ilkka Koskinen
Date: Mon Mar 25 2024 - 20:43:09 EST




Hi Robin,

On Fri, 8 Mar 2024, Robin Murphy wrote:
On 2024-03-07 11:09 pm, Ilkka Koskinen wrote:
Previously, wp_config0/2 registers were used for primary match group and
wp_config1/3 registers for secondary match group. In order to support
tertiary match group, this patch decouples the registers and the groups.

Signed-off-by: Ilkka Koskinen <ilkka@xxxxxxxxxxxxxxxxxxxxxx>
---
drivers/perf/arm-cmn.c | 125 ++++++++++++++++++++++++++++++++++-------
1 file changed, 105 insertions(+), 20 deletions(-)

diff --git a/drivers/perf/arm-cmn.c b/drivers/perf/arm-cmn.c
index 7e3aa7e2345f..29d46e0cf1cd 100644
--- a/drivers/perf/arm-cmn.c
+++ b/drivers/perf/arm-cmn.c
@@ -589,6 +589,13 @@ struct arm_cmn_hw_event {
s8 dtc_idx[CMN_MAX_DTCS];
u8 num_dns;
u8 dtm_offset;
+
+ /*
+ * WP config registers are divided to UP and DOWN events. We need to
+ * keep to track only one of them.
+ */
+ DECLARE_BITMAP(wp_cfg, 2 * CMN_MAX_XPS);

What I had in mind was a wp_idx field which works the same way as dtm_idx, i.e. we just store the allocated index per relevant DN, since a single event can never use *both* watchpoints on a single XP. Each index then need only be 0 or 1 since they're already scoped by the watchpoint direction of the base event, thus we should only need one bit per XP.

Ah, I got it now.


+
bool wide_sel;
enum cmn_filter_select filter_sel;
};
@@ -1335,9 +1342,51 @@ static const struct attribute_group *arm_cmn_attr_groups[] = {
NULL
};
-static int arm_cmn_wp_idx(struct perf_event *event)
+static inline unsigned int arm_cmn_get_xp_idx(struct arm_cmn *cmn,
+ struct arm_cmn_node *xp)
{
- return CMN_EVENT_EVENTID(event) + CMN_EVENT_WP_GRP(event);
+ return ((unsigned long) xp - (unsigned long) cmn->xps) / sizeof(struct arm_cmn_node);
+}
+
+static int arm_cmn_find_free_wp_idx(struct arm_cmn *cmn, struct arm_cmn_dtm *dtm,
+ struct perf_event *event)
+{
+ int wp_idx = CMN_EVENT_EVENTID(event);
+
+ if (dtm->wp_event[wp_idx] >= 0)
+ if (dtm->wp_event[++wp_idx] >= 0)
+ return -ENOSPC;
+
+ return wp_idx;
+}
+
+static int arm_cmn_get_assigned_wp_idx(struct arm_cmn *cmn,
+ struct arm_cmn_node *xp,
+ struct perf_event *event,
+ struct arm_cmn_hw_event *hw)
+{
+ int xp_idx = arm_cmn_get_xp_idx(cmn, xp);
+
+ if (test_bit(2 * xp_idx, hw->wp_cfg))
+ return CMN_EVENT_EVENTID(event);
+ else if (test_bit(2 * xp_idx + 1, hw->wp_cfg))
+ return CMN_EVENT_EVENTID(event) + 1;
+
+ dev_err(cmn->dev, "Could't find the assigned wp_cfg\n");
+ return -EINVAL;
+}

...and so for this we would only need more of a mild tweak to the existing design, something like:

static int arm_cmn_get_wp_idx(struct perf_event *event, int pos)
{
struct arm_cmn_hw_event *hw = to_cmn_hw(event);

return CMN_EVENT_EVENTID(event) + test_bit(&hw->wp_idx, pos);
}

Yep, it simplifies it quite a bit.


+
+static void arm_cmn_claim_wp_idx(struct arm_cmn *cmn,
+ struct arm_cmn_dtm *dtm,
+ struct perf_event *event,
+ struct arm_cmn_node *xp,
+ int wp_idx, unsigned int dtc)
+{
+ struct arm_cmn_hw_event *hw = to_cmn_hw(event);
+ int xp_idx = arm_cmn_get_xp_idx(cmn, xp);
+
+ dtm->wp_event[wp_idx] = hw->dtc_idx[dtc];
+ set_bit(2 * xp_idx + (wp_idx & 1), hw->wp_cfg);

This is recalculating way more than it needs to. It's only ever used within for_each_hw_dn(), which already has all the information to hand already - again, look at how hw->dtm_idx is managed. Furthermore I'd also prefer to similarly not conflate management of the per-event state with that of the DTM state (i.e. just have an arm_cmn_set_wp_idx() for updating the event data).

Right, I somehow forgot that hw->dn points to the right type of the node and I can simply use the index from for_each_hw_dn().


}
static u32 arm_cmn_wp_config(struct perf_event *event)
@@ -1519,12 +1568,16 @@ static void arm_cmn_event_start(struct perf_event *event, int flags)
writeq_relaxed(CMN_CC_INIT, cmn->dtc[i].base + CMN_DT_PMCCNTR);
cmn->dtc[i].cc_active = true;
} else if (type == CMN_TYPE_WP) {
- int wp_idx = arm_cmn_wp_idx(event);
u64 val = CMN_EVENT_WP_VAL(event);
u64 mask = CMN_EVENT_WP_MASK(event);
for_each_hw_dn(hw, dn, i) {
void __iomem *base = dn->pmu_base + CMN_DTM_OFFSET(hw->dtm_offset);
+ int wp_idx;
+
+ wp_idx = arm_cmn_get_assigned_wp_idx(cmn, dn, event, hw);
+ if (wp_idx < 0)
+ return;
writeq_relaxed(val, base + CMN_DTM_WPn_VAL(wp_idx));
writeq_relaxed(mask, base + CMN_DTM_WPn_MASK(wp_idx));
@@ -1549,10 +1602,13 @@ static void arm_cmn_event_stop(struct perf_event *event, int flags)
i = hw->dtc_idx[0];
cmn->dtc[i].cc_active = false;
} else if (type == CMN_TYPE_WP) {
- int wp_idx = arm_cmn_wp_idx(event);
-
for_each_hw_dn(hw, dn, i) {
void __iomem *base = dn->pmu_base + CMN_DTM_OFFSET(hw->dtm_offset);
+ int wp_idx;
+
+ wp_idx = arm_cmn_get_assigned_wp_idx(cmn, dn, event, hw);
+ if (wp_idx < 0)
+ continue;
writeq_relaxed(0, base + CMN_DTM_WPn_MASK(wp_idx));
writeq_relaxed(~0ULL, base + CMN_DTM_WPn_VAL(wp_idx));
@@ -1574,8 +1630,20 @@ struct arm_cmn_val {
bool cycles;
};
-static void arm_cmn_val_add_event(struct arm_cmn *cmn, struct arm_cmn_val *val,
- struct perf_event *event)
+static int arm_cmn_val_find_free_wp_config(struct perf_event *event,
+ struct arm_cmn_val *val, int dtm)
+{
+ int wp_idx = CMN_EVENT_EVENTID(event);
+
+ if (val->wp[dtm][wp_idx])
+ if (val->wp[dtm][++wp_idx])
+ return -ENOSPC;
+
+ return wp_idx;
+}
+
+static int arm_cmn_val_add_event(struct arm_cmn *cmn, struct arm_cmn_val *val,
+ struct perf_event *event)

This must never fail - the purpose of val_add_event is to fill in the val structure with the combination of leader and sibling events which have *already* passed their own event_init calls been declared valid as a group. The body of validate_group then does the "what if?" version to test whether the group would remain valid if the *current* event were to be added.

Makes perfectly sense. I fix it.


The trick with the offset combine value relies on direct indexing to work, so I think we need to rejig the structure slightly to track distinct wp_count and wp_combine values (per direction) - that then becomes nicely consistent with the relationship between dtm_count and occupid, too.

I'm not sure if it's necessary but I do get the consistency reason though


{
struct arm_cmn_hw_event *hw = to_cmn_hw(event);
struct arm_cmn_node *dn;
@@ -1583,12 +1651,12 @@ static void arm_cmn_val_add_event(struct arm_cmn *cmn, struct arm_cmn_val *val,
int i;
if (is_software_event(event))
- return;
+ return 0;
type = CMN_EVENT_TYPE(event);
if (type == CMN_TYPE_DTC) {
val->cycles = true;
- return;
+ return 0;
}
for_each_hw_dtc_idx(hw, dtc, idx)
@@ -1605,9 +1673,14 @@ static void arm_cmn_val_add_event(struct arm_cmn *cmn, struct arm_cmn_val *val,
if (type != CMN_TYPE_WP)
continue;
- wp_idx = arm_cmn_wp_idx(event);
+ wp_idx = arm_cmn_val_find_free_wp_config(event, val, dtm);
+ if (wp_idx < 0)
+ return -ENOSPC;
+
val->wp[dtm][wp_idx] = CMN_EVENT_WP_COMBINE(event) + 1;
}
+
+ return 0;
}
static int arm_cmn_validate_group(struct arm_cmn *cmn, struct perf_event *event)
@@ -1629,9 +1702,15 @@ static int arm_cmn_validate_group(struct arm_cmn *cmn, struct perf_event *event)
if (!val)
return -ENOMEM;
- arm_cmn_val_add_event(cmn, val, leader);
- for_each_sibling_event(sibling, leader)
- arm_cmn_val_add_event(cmn, val, sibling);
+ ret = arm_cmn_val_add_event(cmn, val, leader);
+ if (ret)
+ goto done;
+
+ for_each_sibling_event(sibling, leader) {
+ ret = arm_cmn_val_add_event(cmn, val, sibling);
+ if (ret)
+ goto done;
+ }
type = CMN_EVENT_TYPE(event);
if (type == CMN_TYPE_DTC) {
@@ -1656,8 +1735,8 @@ static int arm_cmn_validate_group(struct arm_cmn *cmn, struct perf_event *event)
if (type != CMN_TYPE_WP)
continue;
- wp_idx = arm_cmn_wp_idx(event);
- if (val->wp[dtm][wp_idx])
+ wp_idx = arm_cmn_val_find_free_wp_config(event, val, dtm);
+ if (wp_idx < 0)
goto done;
wp_cmb = val->wp[dtm][wp_idx ^ 1];
@@ -1772,8 +1851,11 @@ static void arm_cmn_event_clear(struct arm_cmn *cmn, struct perf_event *event,
struct arm_cmn_dtm *dtm = &cmn->dtms[hw->dn[i].dtm] + hw->dtm_offset;
unsigned int dtm_idx = arm_cmn_get_index(hw->dtm_idx, i);
- if (type == CMN_TYPE_WP)
- dtm->wp_event[arm_cmn_wp_idx(event)] = -1;
+ if (type == CMN_TYPE_WP) {
+ int wp_idx = arm_cmn_get_assigned_wp_idx(cmn, &hw->dn[i], event, hw);
+
+ dtm->wp_event[wp_idx] = -1;
+ }
if (hw->filter_sel > SEL_NONE)
hw->dn[i].occupid[hw->filter_sel].count--;
@@ -1782,6 +1864,7 @@ static void arm_cmn_event_clear(struct arm_cmn *cmn, struct perf_event *event,
writel_relaxed(dtm->pmu_config_low, dtm->base + CMN_DTM_PMU_CONFIG);
}
memset(hw->dtm_idx, 0, sizeof(hw->dtm_idx));
+ bitmap_zero(hw->wp_cfg, 2 * CMN_MAX_XPS);

Nit: I'd rather do this in terms of sizeof() so it's harder to break in future. And since it's going to end up being a memset() anyway I'd then probably just open-code that rather than mucking about with bytes-to-bits calculations.

I change it.


for_each_hw_dtc_idx(hw, j, idx)
cmn->dtc[j].counters[idx] = NULL;
@@ -1835,10 +1918,11 @@ static int arm_cmn_event_add(struct perf_event *event, int flags)
if (type == CMN_TYPE_XP) {
input_sel = CMN__PMEVCNT0_INPUT_SEL_XP + dtm_idx;
} else if (type == CMN_TYPE_WP) {
- int tmp, wp_idx = arm_cmn_wp_idx(event);
+ int tmp, wp_idx;
u32 cfg = arm_cmn_wp_config(event);
- if (dtm->wp_event[wp_idx] >= 0)
+ wp_idx = arm_cmn_find_free_wp_idx(cmn, dtm, event);
+ if (wp_idx < 0)

TBH I'm not convinced it's even worth factoring out the "allocator" here, since inline it can be as simple as:

int tmp, wp_idx = CMN_EVENT_EVENTID(event);
...
if (dtm->wp_event[wp_idx] && dtm->wp_event[++wp_idx])

(or perhaps follow the same while/if shape as for dtm_idx further up, if you think it's worth being more clear than concise)

I'd rather keep them in their own functions to be more consistent and slghtly clearer.

Cheers, Ilkka


Thanks,
Robin.

goto free_dtms;
tmp = dtm->wp_event[wp_idx ^ 1];
@@ -1847,7 +1931,8 @@ static int arm_cmn_event_add(struct perf_event *event, int flags)
goto free_dtms;
input_sel = CMN__PMEVCNT0_INPUT_SEL_WP + wp_idx;
- dtm->wp_event[wp_idx] = hw->dtc_idx[d];
+
+ arm_cmn_claim_wp_idx(cmn, dtm, event, dn, wp_idx, d);
writel_relaxed(cfg, dtm->base + CMN_DTM_WPn_CONFIG(wp_idx));
} else {
struct arm_cmn_nodeid nid = arm_cmn_nid(cmn, dn->id);