Re: [PATCH v7 12/24] x86/resctrl: Add cpumask_any_housekeeping() for limbo/overflow

From: Moger, Babu
Date: Thu Nov 09 2023 - 15:40:13 EST




On 10/25/23 13:03, James Morse wrote:
> The limbo and overflow code picks a CPU to use from the domain's list
> of online CPUs. Work is then scheduled on these CPUs to maintain
> the limbo list and any counters that may overflow.
>
> cpumask_any() may pick a CPU that is marked nohz_full, which will
> either penalise the work that CPU was dedicated to, or delay the
> processing of limbo list or counters that may overflow. Perhaps
> indefinitely. Delaying the overflow handling will skew the bandwidth
> values calculated by mba_sc, which expects to be called once a second.
>
> Add cpumask_any_housekeeping() as a replacement for cpumask_any()
> that prefers housekeeping CPUs. This helper will still return
> a nohz_full CPU if that is the only option. The CPU to use is
> re-evaluated each time the limbo/overflow work runs. This ensures
> the work will move off a nohz_full CPU once a housekeeping CPU is
> available.
>
> Tested-by: Shaopeng Tan <tan.shaopeng@xxxxxxxxxxx>
> Tested-by: Peter Newman <peternewman@xxxxxxxxxx>
> Reviewed-by: Shaopeng Tan <tan.shaopeng@xxxxxxxxxxx>
> Signed-off-by: James Morse <james.morse@xxxxxxx>

Reviewed-by: Babu Moger <babu.moger@xxxxxxx>

> ---
> Changes since v3:
> * typos fixed
>
> Changes since v4:
> * Made temporary variables unsigned
>
> Changes since v5:
> * Restructured cpumask_any_housekeeping() to avoid later churn.
>
> Changes since v6:
> * Update mbm_work_cpu/cqm_work_cpu when rescheduling.
> ---
> arch/x86/kernel/cpu/resctrl/internal.h | 24 ++++++++++++++++++++++++
> arch/x86/kernel/cpu/resctrl/monitor.c | 20 +++++++++++++-------
> 2 files changed, 37 insertions(+), 7 deletions(-)
>
> diff --git a/arch/x86/kernel/cpu/resctrl/internal.h b/arch/x86/kernel/cpu/resctrl/internal.h
> index 521afa016b05..33e24fcc8dd0 100644
> --- a/arch/x86/kernel/cpu/resctrl/internal.h
> +++ b/arch/x86/kernel/cpu/resctrl/internal.h
> @@ -7,6 +7,7 @@
> #include <linux/kernfs.h>
> #include <linux/fs_context.h>
> #include <linux/jump_label.h>
> +#include <linux/tick.h>
>
> #include <asm/resctrl.h>
>
> @@ -56,6 +57,29 @@
> /* Max event bits supported */
> #define MAX_EVT_CONFIG_BITS GENMASK(6, 0)
>
> +/**
> + * cpumask_any_housekeeping() - Choose any CPU in @mask, preferring those that
> + * aren't marked nohz_full
> + * @mask: The mask to pick a CPU from.
> + *
> + * Returns a CPU in @mask. If there are housekeeping CPUs that don't use
> + * nohz_full, these are preferred.
> + */
> +static inline unsigned int cpumask_any_housekeeping(const struct cpumask *mask)
> +{
> + unsigned int cpu, hk_cpu;
> +
> + cpu = cpumask_any(mask);
> + if (!tick_nohz_full_cpu(cpu))
> + return cpu;
> +
> + hk_cpu = cpumask_nth_andnot(0, mask, tick_nohz_full_mask);
> + if (hk_cpu < nr_cpu_ids)
> + cpu = hk_cpu;
> +
> + return cpu;
> +}
> +
> struct rdt_fs_context {
> struct kernfs_fs_context kfc;
> bool enable_cdpl2;
> diff --git a/arch/x86/kernel/cpu/resctrl/monitor.c b/arch/x86/kernel/cpu/resctrl/monitor.c
> index cf512d4d383e..718770aea2af 100644
> --- a/arch/x86/kernel/cpu/resctrl/monitor.c
> +++ b/arch/x86/kernel/cpu/resctrl/monitor.c
> @@ -783,7 +783,6 @@ static void mbm_update(struct rdt_resource *r, struct rdt_domain *d,
> void cqm_handle_limbo(struct work_struct *work)
> {
> unsigned long delay = msecs_to_jiffies(CQM_LIMBOCHECK_INTERVAL);
> - int cpu = smp_processor_id();
> struct rdt_resource *r;
> struct rdt_domain *d;
>
> @@ -794,8 +793,11 @@ void cqm_handle_limbo(struct work_struct *work)
>
> __check_limbo(d, false);
>
> - if (has_busy_rmid(d))
> - schedule_delayed_work_on(cpu, &d->cqm_limbo, delay);
> + if (has_busy_rmid(d)) {
> + d->cqm_work_cpu = cpumask_any_housekeeping(&d->cpu_mask);
> + schedule_delayed_work_on(d->cqm_work_cpu, &d->cqm_limbo,
> + delay);
> + }
>
> mutex_unlock(&rdtgroup_mutex);
> }
> @@ -805,7 +807,7 @@ void cqm_setup_limbo_handler(struct rdt_domain *dom, unsigned long delay_ms)
> unsigned long delay = msecs_to_jiffies(delay_ms);
> int cpu;
>
> - cpu = cpumask_any(&dom->cpu_mask);
> + cpu = cpumask_any_housekeeping(&dom->cpu_mask);
> dom->cqm_work_cpu = cpu;
>
> schedule_delayed_work_on(cpu, &dom->cqm_limbo, delay);
> @@ -815,7 +817,6 @@ void mbm_handle_overflow(struct work_struct *work)
> {
> unsigned long delay = msecs_to_jiffies(MBM_OVERFLOW_INTERVAL);
> struct rdtgroup *prgrp, *crgrp;
> - int cpu = smp_processor_id();
> struct list_head *head;
> struct rdt_resource *r;
> struct rdt_domain *d;
> @@ -839,7 +840,12 @@ void mbm_handle_overflow(struct work_struct *work)
> update_mba_bw(prgrp, d);
> }
>
> - schedule_delayed_work_on(cpu, &d->mbm_over, delay);
> + /*
> + * Re-check for housekeeping CPUs. This allows the overflow handler to
> + * move off a nohz_full CPU quickly.
> + */
> + d->mbm_work_cpu = cpumask_any_housekeeping(&d->cpu_mask);
> + schedule_delayed_work_on(d->mbm_work_cpu, &d->mbm_over, delay);
>
> out_unlock:
> mutex_unlock(&rdtgroup_mutex);
> @@ -852,7 +858,7 @@ void mbm_setup_overflow_handler(struct rdt_domain *dom, unsigned long delay_ms)
>
> if (!static_branch_likely(&rdt_mon_enable_key))
> return;
> - cpu = cpumask_any(&dom->cpu_mask);
> + cpu = cpumask_any_housekeeping(&dom->cpu_mask);
> dom->mbm_work_cpu = cpu;
> schedule_delayed_work_on(cpu, &dom->mbm_over, delay);
> }

--
Thanks
Babu Moger