Re: [PATCH v5 15/24] x86/resctrl: Allow arch to allocate memory needed in resctrl_arch_rmid_read()

From: Reinette Chatre
Date: Wed Aug 09 2023 - 18:37:53 EST


Hi James,

On 7/28/2023 9:42 AM, James Morse wrote:
> Depending on the number of monitors available, Arm's MPAM may need to
> allocate a monitor prior to reading the counter value. Allocating a
> contended resource may involve sleeping.
>
> add_rmid_to_limbo() calls resctrl_arch_rmid_read() for multiple domains,
> so the allocation should be valid for all domains.
>
> __check_limbo() and mon_event_count() each make multiple calls to
> resctrl_arch_rmid_read(); to avoid extra work on contended systems,
> the allocation should remain valid across multiple invocations of
> resctrl_arch_rmid_read().
>
> Add arch hooks for this allocation, which must be called before
> resctrl_arch_rmid_read(). The allocated monitor is passed to
> resctrl_arch_rmid_read(), then freed again afterwards. The allocation
> helper can be called on any CPU, and may sleep.
>
> Tested-by: Shaopeng Tan <tan.shaopeng@xxxxxxxxxxx>
> Signed-off-by: James Morse <james.morse@xxxxxxx>
> ---
> Changes since v3:
> * Expanded comment.
> * Removed stray header include.
> * Reworded commit message.
> * Made ctx a void * instead of an int.
>
> Changes since v4:
> * Used IS_ERR() in more places.
> ---
> arch/x86/include/asm/resctrl.h | 11 ++++++++++
> arch/x86/kernel/cpu/resctrl/ctrlmondata.c | 5 +++++
> arch/x86/kernel/cpu/resctrl/internal.h | 1 +
> arch/x86/kernel/cpu/resctrl/monitor.c | 25 ++++++++++++++++++++---
> include/linux/resctrl.h | 5 ++++-
> 5 files changed, 43 insertions(+), 4 deletions(-)
>
> diff --git a/arch/x86/include/asm/resctrl.h b/arch/x86/include/asm/resctrl.h
> index 66d9e18cdc61..0986b5208d76 100644
> --- a/arch/x86/include/asm/resctrl.h
> +++ b/arch/x86/include/asm/resctrl.h
> @@ -136,6 +136,17 @@ static inline u32 resctrl_arch_rmid_idx_encode(u32 ignored, u32 rmid)
> return rmid;
> }
>
> +/* x86 can always read an rmid, nothing needs allocating */
> +struct rdt_resource;
> +static inline void *resctrl_arch_mon_ctx_alloc(struct rdt_resource *r, int evtid)
> +{
> + might_sleep();
> + return NULL;
> +};
> +
> +static inline void resctrl_arch_mon_ctx_free(struct rdt_resource *r, int evtid,
> + void *ctx) { };
> +
> void resctrl_cpu_detect(struct cpuinfo_x86 *c);
>
> #else
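
To make the expected contract concrete: on an architecture where the
monitors are a contended resource, I imagine the non-stub hooks look
roughly like the sketch below. mpam_monitor_alloc()/mpam_monitor_free()
are invented names for illustration, not the real MPAM driver API:

	void *resctrl_arch_mon_ctx_alloc(struct rdt_resource *r, int evtid)
	{
		void *mon;

		might_sleep();

		/* May block until a hardware monitor becomes free */
		mon = mpam_monitor_alloc(r, evtid);
		if (!mon)
			return ERR_PTR(-ENOSPC);

		return mon;
	}

	void resctrl_arch_mon_ctx_free(struct rdt_resource *r, int evtid,
				       void *ctx)
	{
		mpam_monitor_free(r, evtid, ctx);
	}

That, I assume, is what the IS_ERR() checks at the call sites are for.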
> diff --git a/arch/x86/kernel/cpu/resctrl/ctrlmondata.c b/arch/x86/kernel/cpu/resctrl/ctrlmondata.c
> index bd263b9a0abd..55bad57a7bd5 100644
> --- a/arch/x86/kernel/cpu/resctrl/ctrlmondata.c
> +++ b/arch/x86/kernel/cpu/resctrl/ctrlmondata.c
> @@ -546,6 +546,9 @@ void mon_event_read(struct rmid_read *rr, struct rdt_resource *r,
> rr->d = d;
> rr->val = 0;
> rr->first = first;
> + rr->arch_mon_ctx = resctrl_arch_mon_ctx_alloc(r, evtid);
> + if (IS_ERR(rr->arch_mon_ctx))
> + return;
>
> cpu = cpumask_any_housekeeping(&d->cpu_mask);
>
> @@ -559,6 +562,8 @@ void mon_event_read(struct rmid_read *rr, struct rdt_resource *r,
> smp_call_function_any(&d->cpu_mask, mon_event_count, rr, 1);
> else
> smp_call_on_cpu(cpu, smp_mon_event_count, rr, false);
> +
> + resctrl_arch_mon_ctx_free(r, evtid, rr->arch_mon_ctx);
> }
>
> int rdtgroup_mondata_show(struct seq_file *m, void *arg)
> diff --git a/arch/x86/kernel/cpu/resctrl/internal.h b/arch/x86/kernel/cpu/resctrl/internal.h
> index 7012f42a82ee..45db51280ff4 100644
> --- a/arch/x86/kernel/cpu/resctrl/internal.h
> +++ b/arch/x86/kernel/cpu/resctrl/internal.h
> @@ -136,6 +136,7 @@ struct rmid_read {
> bool first;
> int err;
> u64 val;
> + void *arch_mon_ctx;
> };
>
> extern bool rdt_alloc_capable;
> diff --git a/arch/x86/kernel/cpu/resctrl/monitor.c b/arch/x86/kernel/cpu/resctrl/monitor.c
> index 08e3307863c3..5eed8d0cbf36 100644
> --- a/arch/x86/kernel/cpu/resctrl/monitor.c
> +++ b/arch/x86/kernel/cpu/resctrl/monitor.c
> @@ -275,7 +275,7 @@ static u64 mbm_overflow_count(u64 prev_msr, u64 cur_msr, unsigned int width)
>
> int resctrl_arch_rmid_read(struct rdt_resource *r, struct rdt_domain *d,
> u32 closid, u32 rmid, enum resctrl_event_id eventid,
> - u64 *val)
> + u64 *val, void *ignored)
> {
> struct rdt_hw_resource *hw_res = resctrl_to_arch_res(r);
> struct rdt_hw_domain *hw_dom = resctrl_to_arch_dom(d);
> @@ -342,9 +342,14 @@ void __check_limbo(struct rdt_domain *d, bool force_free)
> u32 idx_limit = resctrl_arch_system_num_rmid_idx();
> struct rmid_entry *entry;
> u32 idx, cur_idx = 1;
> + void *arch_mon_ctx;
> bool rmid_dirty;
> u64 val = 0;
>
> + arch_mon_ctx = resctrl_arch_mon_ctx_alloc(r, QOS_L3_OCCUP_EVENT_ID);
> + if (IS_ERR(arch_mon_ctx))
> + return;
> +
> /*
> * Skip RMID 0 and start from RMID 1 and check all the RMIDs that
> * are marked as busy for occupancy < threshold. If the occupancy
> @@ -358,7 +363,8 @@ void __check_limbo(struct rdt_domain *d, bool force_free)
>
> entry = __rmid_entry(idx);
> if (resctrl_arch_rmid_read(r, d, entry->closid, entry->rmid,
> - QOS_L3_OCCUP_EVENT_ID, &val)) {
> + QOS_L3_OCCUP_EVENT_ID, &val,
> + arch_mon_ctx)) {
> rmid_dirty = true;
> } else {
> rmid_dirty = (val >= resctrl_rmid_realloc_threshold);
> @@ -371,6 +377,8 @@ void __check_limbo(struct rdt_domain *d, bool force_free)
> }
> cur_idx = idx + 1;
> }
> +
> + resctrl_arch_mon_ctx_free(r, QOS_L3_OCCUP_EVENT_ID, arch_mon_ctx);
> }
>
> bool has_busy_rmid(struct rdt_domain *d)
> @@ -544,7 +552,7 @@ static int __mon_event_count(u32 closid, u32 rmid, struct rmid_read *rr)
> }
>
> rr->err = resctrl_arch_rmid_read(rr->r, rr->d, closid, rmid, rr->evtid,
> - &tval);
> + &tval, rr->arch_mon_ctx);
> if (rr->err)
> return rr->err;
>
> @@ -754,11 +762,21 @@ static void mbm_update(struct rdt_resource *r, struct rdt_domain *d,
> if (is_mbm_total_enabled()) {
> rr.evtid = QOS_L3_MBM_TOTAL_EVENT_ID;
> rr.val = 0;
> + rr.arch_mon_ctx = resctrl_arch_mon_ctx_alloc(rr.r, rr.evtid);
> + if (IS_ERR(rr.arch_mon_ctx))
> + return;
> +
> __mon_event_count(closid, rmid, &rr);
> +
> + resctrl_arch_mon_ctx_free(rr.r, rr.evtid, rr.arch_mon_ctx);
> }
> if (is_mbm_local_enabled()) {
> rr.evtid = QOS_L3_MBM_LOCAL_EVENT_ID;
> rr.val = 0;
> + rr.arch_mon_ctx = resctrl_arch_mon_ctx_alloc(rr.r, rr.evtid);
> + if (IS_ERR(rr.arch_mon_ctx))
> + return;
> +
> __mon_event_count(closid, rmid, &rr);
>
> /*
> @@ -768,6 +786,7 @@ static void mbm_update(struct rdt_resource *r, struct rdt_domain *d,
> */
> if (is_mba_sc(NULL))
> mbm_bw_count(closid, rmid, &rr);
> + resctrl_arch_mon_ctx_free(rr.r, rr.evtid, rr.arch_mon_ctx);
> }
> }
>
> diff --git a/include/linux/resctrl.h b/include/linux/resctrl.h
> index f7311102e94c..5e4b4df9610b 100644
> --- a/include/linux/resctrl.h
> +++ b/include/linux/resctrl.h
> @@ -235,6 +235,9 @@ void resctrl_offline_domain(struct rdt_resource *r, struct rdt_domain *d);
> * @rmid: rmid of the counter to read.
> * @eventid: eventid to read, e.g. L3 occupancy.
> * @val: result of the counter read in bytes.
> + * @arch_mon_ctx: An architecture specific value from
> + * resctrl_arch_mon_ctx_alloc(), for MPAM this identifies
> + * the hardware monitor allocated for this read request.
> *
> * Some architectures need to sleep when first programming some of the counters.
> * (specifically: arm64's MPAM cache occupancy counters can return 'not ready'
> @@ -248,7 +251,7 @@ void resctrl_offline_domain(struct rdt_resource *r, struct rdt_domain *d);
> */
> int resctrl_arch_rmid_read(struct rdt_resource *r, struct rdt_domain *d,
> u32 closid, u32 rmid, enum resctrl_event_id eventid,
> - u64 *val);
> + u64 *val, void *arch_mon_ctx);
>
> /**
> * resctrl_arch_rmid_read_context_check() - warn about invalid contexts

Looking at the error paths, all of the failures are silent. The failure
in mon_event_read() could be handled by setting the "err" field of
struct rmid_read; at least then the caller could print an error instead
of displaying a zero count to the user.
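Something like this (an untested sketch; it assumes the arch encodes an
errno into the returned pointer, so PTR_ERR() can recover it):

	rr->arch_mon_ctx = resctrl_arch_mon_ctx_alloc(r, evtid);
	if (IS_ERR(rr->arch_mon_ctx)) {
		rr->err = PTR_ERR(rr->arch_mon_ctx);
		return;
	}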
The other failures are harder to handle, though. Considering how often
these contexts are allocated and freed, why not allocate them once on
driver load (perhaps in struct rdt_hw_domain?) with clear error
handling?
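For instance (a rough sketch; the arch_mon_ctx[] field and
QOS_NUM_EVENTS are invented here for illustration):

	struct rdt_hw_domain {
		struct rdt_domain	d_resctrl;
		u32			*ctrl_val;
		struct arch_mbm_state	*arch_mbm_total;
		struct arch_mbm_state	*arch_mbm_local;
		void			*arch_mon_ctx[QOS_NUM_EVENTS];
	};

with the contexts allocated once when the domain is set up, so any
allocation failure is reported there instead of silently at read time.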

Reinette