[PATCH v5 5/8] x86/resctrl: Introduce snc_nodes_per_l3_cache

From: Tony Luck
Date: Tue Aug 29 2023 - 19:45:29 EST


Intel Sub-NUMA Cluster mode requires several changes in resctrl
behavior for correct operation.

Add a global integer "snc_nodes_per_l3_cache" that will show how many
SNC nodes share each L3 cache. When this is "1", SNC mode is either
not implemented, or not enabled.

A later patch will detect SNC mode and set snc_nodes_per_l3_cache to
the appropriate value. For now it remains at the default "1" to
indicate SNC mode is not active.

The places where code must take action when SNC is enabled are:
1) The number of logical RMIDs available for use is the number of
physical RMIDs divided by the number of SNC nodes.
2) Likewise the "mon_scale" value must be adjusted for the number
of SNC nodes.
3) When reading an RMID counter, the code must translate the logical
RMID in use to the physical RMID value that must be loaded into
the IA32_QM_EVTSEL MSR.
4) The L3 cache is divided between the SNC nodes. So the value
reported in the resctrl "size" file is adjusted.
5) The "-o mba_MBps" mount option must be disabled in SNC mode
because the monitoring is being done per SNC node, while the
bandwidth allocation is still done at the L3 cache scope.

Signed-off-by: Tony Luck <tony.luck@xxxxxxxxx>
---
arch/x86/kernel/cpu/resctrl/internal.h | 2 ++
arch/x86/kernel/cpu/resctrl/core.c | 7 +++++++
arch/x86/kernel/cpu/resctrl/monitor.c | 16 +++++++++++++---
arch/x86/kernel/cpu/resctrl/rdtgroup.c | 4 ++--
4 files changed, 24 insertions(+), 5 deletions(-)

diff --git a/arch/x86/kernel/cpu/resctrl/internal.h b/arch/x86/kernel/cpu/resctrl/internal.h
index c61fd6709730..326ca6b3688a 100644
--- a/arch/x86/kernel/cpu/resctrl/internal.h
+++ b/arch/x86/kernel/cpu/resctrl/internal.h
@@ -446,6 +446,8 @@ DECLARE_STATIC_KEY_FALSE(rdt_alloc_enable_key);

extern struct dentry *debugfs_resctrl;

+extern int snc_nodes_per_l3_cache;
+
enum resctrl_res_level {
RDT_RESOURCE_L3,
RDT_RESOURCE_L2,
diff --git a/arch/x86/kernel/cpu/resctrl/core.c b/arch/x86/kernel/cpu/resctrl/core.c
index 9fcc264fac6c..ed4f55b3e5e4 100644
--- a/arch/x86/kernel/cpu/resctrl/core.c
+++ b/arch/x86/kernel/cpu/resctrl/core.c
@@ -48,6 +48,13 @@ int max_name_width, max_data_width;
*/
bool rdt_alloc_capable;

+/*
+ * Number of SNC nodes that share each L3 cache.
+ * Default is 1 for systems that do not support
+ * SNC, or have SNC disabled.
+ */
+int snc_nodes_per_l3_cache = 1;
+
static void
mba_wrmsr_intel(struct rdt_domain *d, struct msr_param *m,
struct rdt_resource *r);
diff --git a/arch/x86/kernel/cpu/resctrl/monitor.c b/arch/x86/kernel/cpu/resctrl/monitor.c
index 42262d59ef9b..b6b3fb0f9abe 100644
--- a/arch/x86/kernel/cpu/resctrl/monitor.c
+++ b/arch/x86/kernel/cpu/resctrl/monitor.c
@@ -148,8 +148,18 @@ static inline struct rmid_entry *__rmid_entry(u32 rmid)

static int __rmid_read(u32 rmid, enum resctrl_event_id eventid, u64 *val)
{
+ struct rdt_resource *r = &rdt_resources_all[RDT_RESOURCE_L3].r_resctrl;
+ int cpu = smp_processor_id();
+ int rmid_offset = 0;
u64 msr_val;

+ /*
+ * When SNC mode is on, compute the offset needed to read the
+ * physical RMID counter for the node to which this CPU belongs.
+ */
+ if (snc_nodes_per_l3_cache > 1)
+ rmid_offset = (cpu_to_node(cpu) % snc_nodes_per_l3_cache) * r->num_rmid;
+
/*
* As per the SDM, when IA32_QM_EVTSEL.EvtID (bits 7:0) is configured
* with a valid event code for supported resource type and the bits
@@ -158,7 +168,7 @@ static int __rmid_read(u32 rmid, enum resctrl_event_id eventid, u64 *val)
* IA32_QM_CTR.Error (bit 63) and IA32_QM_CTR.Unavailable (bit 62)
* are error bits.
*/
- wrmsr(MSR_IA32_QM_EVTSEL, eventid, rmid);
+ wrmsr(MSR_IA32_QM_EVTSEL, eventid, rmid + rmid_offset);
rdmsrl(MSR_IA32_QM_CTR, msr_val);

if (msr_val & RMID_VAL_ERROR)
@@ -783,8 +793,8 @@ int __init rdt_get_mon_l3_config(struct rdt_resource *r)
int ret;

resctrl_rmid_realloc_limit = boot_cpu_data.x86_cache_size * 1024;
- hw_res->mon_scale = boot_cpu_data.x86_cache_occ_scale;
- r->num_rmid = boot_cpu_data.x86_cache_max_rmid + 1;
+ hw_res->mon_scale = boot_cpu_data.x86_cache_occ_scale / snc_nodes_per_l3_cache;
+ r->num_rmid = (boot_cpu_data.x86_cache_max_rmid + 1) / snc_nodes_per_l3_cache;
hw_res->mbm_width = MBM_CNTR_WIDTH_BASE;

if (mbm_offset > 0 && mbm_offset <= MBM_CNTR_WIDTH_OFFSET_MAX)
diff --git a/arch/x86/kernel/cpu/resctrl/rdtgroup.c b/arch/x86/kernel/cpu/resctrl/rdtgroup.c
index 5feec2c33544..a8cf6251e506 100644
--- a/arch/x86/kernel/cpu/resctrl/rdtgroup.c
+++ b/arch/x86/kernel/cpu/resctrl/rdtgroup.c
@@ -1367,7 +1367,7 @@ unsigned int rdtgroup_cbm_to_size(struct rdt_resource *r,
}
}

- return size;
+ return size / snc_nodes_per_l3_cache;
}

/**
@@ -2600,7 +2600,7 @@ static int rdt_parse_param(struct fs_context *fc, struct fs_parameter *param)
ctx->enable_cdpl2 = true;
return 0;
case Opt_mba_mbps:
- if (!supports_mba_mbps())
+ if (!supports_mba_mbps() || snc_nodes_per_l3_cache > 1)
return -EINVAL;
ctx->enable_mba_mbps = true;
return 0;
--
2.41.0