Re: [PATCH v4 5/7] x86/resctrl: Determine if Sub-NUMA Cluster is enabled and initialize.

From: Reinette Chatre
Date: Fri Aug 11 2023 - 13:32:41 EST


Hi Tony,

On 7/22/2023 12:07 PM, Tony Luck wrote:
> There isn't a simple hardware enumeration to indicate to software that
> a system is running with Sub-NUMA Cluster enabled.
>
> Compare the number of NUMA nodes with the number of L3 caches to calculate
> the number of Sub-NUMA nodes per L3 cache.
>
> When Sub-NUMA cluster mode is enabled in BIOS setup the RMID counters
> are distributed equally between the SNC nodes within each socket.
>
> E.g. if there are 400 RMID counters, and the system is configured with
> two SNC nodes per socket, then RMID counter 0..199 are used on SNC node
> 0 on the socket, and RMID counter 200..399 on SNC node 1.
>
> A model specific MSR (0xca0) can change the configuration of the RMIDs
> when SNC mode is enabled.
>
> The MSR controls the interpretation of the RMID field in the
> IA32_PQR_ASSOC MSR so that the appropriate hardware counters
> within the SNC node are updated.
>
> To read the RMID counters an offset must be used to get data
> from the physical counter associated with the SNC node. As in
> the example above with 400 RMID counters Linux sees only 200
> counters. No special action is needed to read a counter from
> the first SNC node on a socket. But to read a Linux visible
> counter 50 on the second SNC node the kernel must load 250
> into the QM_EVTSEL MSR.
>
> N.B. this works well for well-behaved NUMA applications that access
> memory predominantly from the local memory node. For applications that
> access memory across multiple nodes it may be necessary for the user
> to read counters for all SNC nodes on a socket and add the values to
> get the actual LLC occupancy or memory bandwidth. Perhaps this isn't
> all that different from applications that span across multiple sockets
> in a legacy system.
>
> The cache allocation feature still provides the same number of
> bits in a mask to control allocation into the L3 cache. But each
> of those ways has its capacity reduced because the cache is divided
> between the SNC nodes. Adjust the value reported in the resctrl
> "size" file accordingly.
>
> Mounting the file system with the "mba_MBps" option is disabled
> when SNC mode is enabled. This is because the measurement of bandwidth
> is per SNC node, while the MBA throttling controls are still at
> the L3 cache scope.
>

I'm counting four logical changes in this changelog. Can they be separate
patches?

> Signed-off-by: Tony Luck <tony.luck@xxxxxxxxx>
> ---
> include/linux/resctrl.h | 2 +
> arch/x86/include/asm/msr-index.h | 1 +
> arch/x86/kernel/cpu/resctrl/internal.h | 2 +
> arch/x86/kernel/cpu/resctrl/core.c | 82 +++++++++++++++++++++++++-
> arch/x86/kernel/cpu/resctrl/monitor.c | 18 +++++-
> arch/x86/kernel/cpu/resctrl/rdtgroup.c | 4 +-
> 6 files changed, 103 insertions(+), 6 deletions(-)
>
> diff --git a/include/linux/resctrl.h b/include/linux/resctrl.h
> index 80a89d171eba..576dc21bd990 100644
> --- a/include/linux/resctrl.h
> +++ b/include/linux/resctrl.h
> @@ -200,6 +200,8 @@ struct rdt_resource {
> bool cdp_capable;
> };
>
> +#define MON_SCOPE_NODE 100
> +

Could you please add a comment to explain what this constant represents
and how it is used?

> /**
> * struct resctrl_schema - configuration abilities of a resource presented to
> * user-space
> diff --git a/arch/x86/include/asm/msr-index.h b/arch/x86/include/asm/msr-index.h
> index 3aedae61af4f..4b624a37d64a 100644
> --- a/arch/x86/include/asm/msr-index.h
> +++ b/arch/x86/include/asm/msr-index.h
> @@ -1087,6 +1087,7 @@
> #define MSR_IA32_QM_CTR 0xc8e
> #define MSR_IA32_PQR_ASSOC 0xc8f
> #define MSR_IA32_L3_CBM_BASE 0xc90
> +#define MSR_RMID_SNC_CONFIG 0xca0
> #define MSR_IA32_L2_CBM_BASE 0xd10
> #define MSR_IA32_MBA_THRTL_BASE 0xd50
>
> diff --git a/arch/x86/kernel/cpu/resctrl/internal.h b/arch/x86/kernel/cpu/resctrl/internal.h
> index 016ef0373c5a..00a330bc5ced 100644
> --- a/arch/x86/kernel/cpu/resctrl/internal.h
> +++ b/arch/x86/kernel/cpu/resctrl/internal.h
> @@ -446,6 +446,8 @@ DECLARE_STATIC_KEY_FALSE(rdt_alloc_enable_key);
>
> extern struct dentry *debugfs_resctrl;
>
> +extern int snc_nodes_per_l3_cache;
> +
> enum resctrl_res_level {
> RDT_RESOURCE_L3,
> RDT_RESOURCE_L2,
> diff --git a/arch/x86/kernel/cpu/resctrl/core.c b/arch/x86/kernel/cpu/resctrl/core.c
> index 0161362b0c3e..1331add347fc 100644
> --- a/arch/x86/kernel/cpu/resctrl/core.c
> +++ b/arch/x86/kernel/cpu/resctrl/core.c
> @@ -16,11 +16,14 @@
>
> #define pr_fmt(fmt) "resctrl: " fmt
>
> +#include <linux/cpu.h>
> #include <linux/slab.h>
> #include <linux/err.h>
> #include <linux/cacheinfo.h>
> #include <linux/cpuhotplug.h>
> +#include <linux/mod_devicetable.h>
>

What does this include provide?

> +#include <asm/cpu_device_id.h>
> #include <asm/intel-family.h>
> #include <asm/resctrl.h>
> #include "internal.h"
> @@ -48,6 +51,13 @@ int max_name_width, max_data_width;
> */
> bool rdt_alloc_capable;
>
> +/*
> + * Number of SNC nodes that share each L3 cache.
> + * Default is 1 for systems that do not support
> + * SNC, or have SNC disabled.
> + */
> +int snc_nodes_per_l3_cache = 1;
> +
> static void
> mba_wrmsr_intel(struct rdt_domain *d, struct msr_param *m,
> struct rdt_resource *r);
> @@ -543,9 +553,16 @@ static void domain_add_cpu_ctrl(int cpu, struct rdt_resource *r)
> }
> }
>
> +static int get_mon_scope_id(int cpu, int scope)
> +{
> + if (scope == MON_SCOPE_NODE)
> + return cpu_to_node(cpu);
> + return get_cpu_cacheinfo_id(cpu, scope);
> +}
> +
> static void domain_add_cpu_mon(int cpu, struct rdt_resource *r)
> {
> - int id = get_cpu_cacheinfo_id(cpu, r->mon_scope);
> + int id = get_mon_scope_id(cpu, r->mon_scope);
> struct list_head *add_pos = NULL;
> struct rdt_hw_mondomain *hw_mondom;
> struct rdt_mondomain *d;
> @@ -692,11 +709,28 @@ static void clear_closid_rmid(int cpu)
> wrmsr(MSR_IA32_PQR_ASSOC, 0, 0);
> }
>
> +static void snc_remap_rmids(int cpu)
> +{
> + u64 val;

No need for tab here.

> +
> + /* Only need to enable once per package */
> + if (cpumask_first(topology_core_cpumask(cpu)) != cpu)
> + return;
> +
> + rdmsrl(MSR_RMID_SNC_CONFIG, val);
> + val &= ~BIT_ULL(0);
> + wrmsrl(MSR_RMID_SNC_CONFIG, val);
> +}

Could you please document snc_remap_rmids()
with information on what the bit in the register means
and what the above function does?

> +
> static int resctrl_online_cpu(unsigned int cpu)
> {
> struct rdt_resource *r;
>
> mutex_lock(&rdtgroup_mutex);
> +
> + if (snc_nodes_per_l3_cache > 1)
> + snc_remap_rmids(cpu);
> +
> for_each_capable_rdt_resource(r)
> domain_add_cpu(cpu, r);
> /* The cpu is set in default rdtgroup after online. */
> @@ -951,11 +985,57 @@ static __init bool get_rdt_resources(void)
> return (rdt_mon_capable || rdt_alloc_capable);
> }
>

I think it will help to add a comment like:
"CPUs that support the model specific MSR_RMID_SNC_CONFIG register."

> +static const struct x86_cpu_id snc_cpu_ids[] __initconst = {
> + X86_MATCH_INTEL_FAM6_MODEL(ICELAKE_X, 0),
> + X86_MATCH_INTEL_FAM6_MODEL(SAPPHIRERAPIDS_X, 0),
> + X86_MATCH_INTEL_FAM6_MODEL(EMERALDRAPIDS_X, 0),
> + {}
> +};
> +
> +/*
> + * There isn't a simple enumeration bit to show whether SNC mode
> + * is enabled. Look at the ratio of number of NUMA nodes to the
> + * number of distinct L3 caches. Take care to skip memory-only nodes.
> + */
> +static __init int get_snc_config(void)
> +{
> + unsigned long *node_caches;
> + int mem_only_nodes = 0;
> + int cpu, node, ret;
> +
> + if (!x86_match_cpu(snc_cpu_ids))
> + return 1;
> +
> + node_caches = kcalloc(BITS_TO_LONGS(nr_node_ids), sizeof(*node_caches), GFP_KERNEL);
> + if (!node_caches)
> + return 1;
> +
> + cpus_read_lock();
> + for_each_node(node) {
> + cpu = cpumask_first(cpumask_of_node(node));
> + if (cpu < nr_cpu_ids)
> + set_bit(get_cpu_cacheinfo_id(cpu, 3), node_caches);
> + else
> + mem_only_nodes++;
> + }
> + cpus_read_unlock();

I am not familiar with the numa code at all so please correct me
where I am wrong. I do see that nr_node_ids is initialized with __init code
so it should be accurate at this point. It looks to me like this initialization
assumes that at least one CPU per node will be online at the time it is run.
It is not clear to me that this assumption would always be true.

> +
> + ret = (nr_node_ids - mem_only_nodes) / bitmap_weight(node_caches, nr_node_ids);
> + kfree(node_caches);
> +
> + if (ret > 1)
> + rdt_resources_all[RDT_RESOURCE_L3].r_resctrl.mon_scope = MON_SCOPE_NODE;
> +
> + return ret;
> +}
> +
> static __init void rdt_init_res_defs_intel(void)
> {
> struct rdt_hw_resource *hw_res;
> struct rdt_resource *r;
>
> + snc_nodes_per_l3_cache = get_snc_config();
> +
> for_each_rdt_resource(r) {
> hw_res = resctrl_to_arch_res(r);
>
> diff --git a/arch/x86/kernel/cpu/resctrl/monitor.c b/arch/x86/kernel/cpu/resctrl/monitor.c
> index 0d9605fccb34..4ca064e62911 100644
> --- a/arch/x86/kernel/cpu/resctrl/monitor.c
> +++ b/arch/x86/kernel/cpu/resctrl/monitor.c
> @@ -148,8 +148,18 @@ static inline struct rmid_entry *__rmid_entry(u32 rmid)
>
> static int __rmid_read(u32 rmid, enum resctrl_event_id eventid, u64 *val)
> {
> + struct rdt_resource *r = &rdt_resources_all[RDT_RESOURCE_L3].r_resctrl;
> + int cpu = get_cpu();

I do not think it is necessary to disable preemption here. Also please note that
James is working on changes to not have this code block. Could just smp_processor_id()
do?

> + int rmid_offset = 0;
> u64 msr_val;
>
> + /*
> + * When SNC mode is on, need to compute the offset to read the
> + * physical RMID counter for the node to which this CPU belongs
> + */
> + if (snc_nodes_per_l3_cache > 1)
> + rmid_offset = (cpu_to_node(cpu) % snc_nodes_per_l3_cache) * r->num_rmid;
> +
> /*
> * As per the SDM, when IA32_QM_EVTSEL.EvtID (bits 7:0) is configured
> * with a valid event code for supported resource type and the bits
> @@ -158,9 +168,11 @@ static int __rmid_read(u32 rmid, enum resctrl_event_id eventid, u64 *val)
> * IA32_QM_CTR.Error (bit 63) and IA32_QM_CTR.Unavailable (bit 62)
> * are error bits.
> */
> - wrmsr(MSR_IA32_QM_EVTSEL, eventid, rmid);
> + wrmsr(MSR_IA32_QM_EVTSEL, eventid, rmid + rmid_offset);
> rdmsrl(MSR_IA32_QM_CTR, msr_val);
>
> + put_cpu();
> +
> if (msr_val & RMID_VAL_ERROR)
> return -EIO;
> if (msr_val & RMID_VAL_UNAVAIL)

Reinette