Re: [PATCH v4 2/2] x86/resctrl: IPI all online CPUs for group updates

From: Reinette Chatre
Date: Tue Dec 06 2022 - 13:58:14 EST


Hi Peter,

On 11/29/2022 3:10 AM, Peter Newman wrote:
> Removing a CTRL_MON or MON group directory moves all tasks to the parent
> group. The rmdir implementation therefore interrupts any running
> tasks which were in the deleted group to update their CLOSID/RMID to
> those of the parent.
>
> The rmdir operation iterates over all tasks in the deleted group while
> read-locking the tasklist_lock to ensure that no newly-created child
> tasks remain in the deleted group.

The above describes the current behavior. This is great context. What
follows in the changelog is a description of different fixes. This is
unexpected because there is no description of a problem with the current
behavior.

Could you please describe the problem with the current implementation? Next
you could state the two possible solutions and then I think the reader would
be ready to parse what is written below.


> Calling task_call_func() to perform
> the updates on every task in the deleted group, similar to the recent
> fix in __rdtgroup_move_task(), would result in a much longer
> tasklist_lock critical section.


I do still think it would help to state that this additional locking
does not help to provide a precise CPU mask, especially since
the next paragraph may be interpreted as saying that a precise CPU mask
is lost by giving up the additional locking.

> To avoid this, stop attempting to construct a precise mask of CPUs
> hosting the moved tasks in rdt_move_group_tasks(). Its callers instead
> perform the PQR_ASSOC MSR update on all online CPUs to ensure all
> affected tasks are notified.
>
> To measure the impact of the rdt_move_group_tasks() implementation
> options, the following command was run in an rdtgroup to produce a
> 1600-task workload:
>
> # mkdir /sys/fs/resctrl/test
> # echo $$ > /sys/fs/resctrl/test/tasks
> # perf bench sched messaging -g 40 -l 100000
>
> Results collected using:
>
> # perf stat rmdir /sys/fs/resctrl/test
>
> CPU: Intel(R) Xeon(R) Platinum P-8136 CPU @ 2.00GHz (112 threads)
>
> Calling task_call_func() on all tasks in the deleted group increased
> task-clock time from 1.54 to 2.35 ms, while the IPI broadcast reduced
> the time to 1.31 ms.

Thank you very much for doing this testing.

>
> Restructuring resctrl groups is assumed to be a rare act of system-level
> reconfiguration by the user, so the impact of additional IPIs resulting
> from this change to a CPU-isolated workload is not a concern.
>
> Signed-off-by: Peter Newman <peternewman@xxxxxxxxxx>
> Reviewed-by: James Morse <james.morse@xxxxxxx>
> ---
> arch/x86/kernel/cpu/resctrl/rdtgroup.c | 52 +++++++-------------------
> 1 file changed, 13 insertions(+), 39 deletions(-)
>
> diff --git a/arch/x86/kernel/cpu/resctrl/rdtgroup.c b/arch/x86/kernel/cpu/resctrl/rdtgroup.c
> index 59b7ffcd53bb..4a3c0b315484 100644
> --- a/arch/x86/kernel/cpu/resctrl/rdtgroup.c
> +++ b/arch/x86/kernel/cpu/resctrl/rdtgroup.c
> @@ -2401,12 +2401,10 @@ static int reset_all_ctrls(struct rdt_resource *r)
> * Move tasks from one to the other group. If @from is NULL, then all tasks
> * in the systems are moved unconditionally (used for teardown).
> *
> - * If @mask is not NULL the cpus on which moved tasks are running are set
> - * in that mask so the update smp function call is restricted to affected
> - * cpus.
> + * Following this operation, the caller should update PQR_ASSOC MSR and per-CPU
> + * storage on all online CPUs.
> */
> -static void rdt_move_group_tasks(struct rdtgroup *from, struct rdtgroup *to,
> - struct cpumask *mask)
> +static void rdt_move_group_tasks(struct rdtgroup *from, struct rdtgroup *to)
> {
> struct task_struct *p, *t;
>
> @@ -2416,16 +2414,6 @@ static void rdt_move_group_tasks(struct rdtgroup *from, struct rdtgroup *to,
> is_rmid_match(t, from)) {
> WRITE_ONCE(t->closid, to->closid);
> WRITE_ONCE(t->rmid, to->mon.rmid);
> -
> - /*
> - * If the task is on a CPU, set the CPU in the mask.
> - * The detection is inaccurate as tasks might move or
> - * schedule before the smp function call takes place.
> - * In such a case the function call is pointless, but
> - * there is no other side effect.
> - */
> - if (IS_ENABLED(CONFIG_SMP) && mask && task_curr(t))
> - cpumask_set_cpu(task_cpu(t), mask);
> }
> }
> read_unlock(&tasklist_lock);
> @@ -2456,7 +2444,7 @@ static void rmdir_all_sub(void)
> struct rdtgroup *rdtgrp, *tmp;
>
> /* Move all tasks to the default resource group */
> - rdt_move_group_tasks(NULL, &rdtgroup_default, NULL);
> + rdt_move_group_tasks(NULL, &rdtgroup_default);
>
> list_for_each_entry_safe(rdtgrp, tmp, &rdt_all_groups, rdtgroup_list) {
> /* Free any child rmids */
> @@ -3115,23 +3103,19 @@ static int rdtgroup_mkdir(struct kernfs_node *parent_kn, const char *name,
> return -EPERM;
> }
>
> -static int rdtgroup_rmdir_mon(struct rdtgroup *rdtgrp, cpumask_var_t tmpmask)
> +static int rdtgroup_rmdir_mon(struct rdtgroup *rdtgrp)
> {
> struct rdtgroup *prdtgrp = rdtgrp->mon.parent;
> int cpu;
>
> /* Give any tasks back to the parent group */
> - rdt_move_group_tasks(rdtgrp, prdtgrp, tmpmask);
> + rdt_move_group_tasks(rdtgrp, prdtgrp);
>
> /* Update per cpu rmid of the moved CPUs first */
> for_each_cpu(cpu, &rdtgrp->cpu_mask)
> per_cpu(pqr_state.default_rmid, cpu) = prdtgrp->mon.rmid;
> - /*
> - * Update the MSR on moved CPUs and CPUs which have moved
> - * task running on them.
> - */
> - cpumask_or(tmpmask, tmpmask, &rdtgrp->cpu_mask);
> - update_closid_rmid(tmpmask, NULL);
> +
> + update_closid_rmid(cpu_online_mask, NULL);
>
> rdtgrp->flags = RDT_DELETED;
> free_rmid(rdtgrp->mon.rmid);
> @@ -3156,12 +3140,12 @@ static int rdtgroup_ctrl_remove(struct rdtgroup *rdtgrp)
> return 0;
> }
>
> -static int rdtgroup_rmdir_ctrl(struct rdtgroup *rdtgrp, cpumask_var_t tmpmask)
> +static int rdtgroup_rmdir_ctrl(struct rdtgroup *rdtgrp)
> {
> int cpu;
>
> /* Give any tasks back to the default group */
> - rdt_move_group_tasks(rdtgrp, &rdtgroup_default, tmpmask);
> + rdt_move_group_tasks(rdtgrp, &rdtgroup_default);
>
> /* Give any CPUs back to the default group */
> cpumask_or(&rdtgroup_default.cpu_mask,
> @@ -3173,12 +3157,7 @@ static int rdtgroup_rmdir_ctrl(struct rdtgroup *rdtgrp, cpumask_var_t tmpmask)
> per_cpu(pqr_state.default_rmid, cpu) = rdtgroup_default.mon.rmid;
> }
>
> - /*
> - * Update the MSR on moved CPUs and CPUs which have moved
> - * task running on them.
> - */
> - cpumask_or(tmpmask, tmpmask, &rdtgrp->cpu_mask);
> - update_closid_rmid(tmpmask, NULL);
> + update_closid_rmid(cpu_online_mask, NULL);
>
> closid_free(rdtgrp->closid);
> free_rmid(rdtgrp->mon.rmid);
> @@ -3197,12 +3176,8 @@ static int rdtgroup_rmdir(struct kernfs_node *kn)
> {
> struct kernfs_node *parent_kn = kn->parent;
> struct rdtgroup *rdtgrp;
> - cpumask_var_t tmpmask;
> int ret = 0;
>
> - if (!zalloc_cpumask_var(&tmpmask, GFP_KERNEL))
> - return -ENOMEM;
> -
> rdtgrp = rdtgroup_kn_lock_live(kn);
> if (!rdtgrp) {
> ret = -EPERM;
> @@ -3222,18 +3197,17 @@ static int rdtgroup_rmdir(struct kernfs_node *kn)
> rdtgrp->mode == RDT_MODE_PSEUDO_LOCKED) {
> ret = rdtgroup_ctrl_remove(rdtgrp);
> } else {
> - ret = rdtgroup_rmdir_ctrl(rdtgrp, tmpmask);
> + ret = rdtgroup_rmdir_ctrl(rdtgrp);
> }
> } else if (rdtgrp->type == RDTMON_GROUP &&
> is_mon_groups(parent_kn, kn->name)) {
> - ret = rdtgroup_rmdir_mon(rdtgrp, tmpmask);
> + ret = rdtgroup_rmdir_mon(rdtgrp);
> } else {
> ret = -EPERM;
> }
>
> out:
> rdtgroup_kn_unlock(kn);
> - free_cpumask_var(tmpmask);
> return ret;
> }
>

The change looks good to me.

Reinette