Re: [PATCH v5 01/24] x86/resctrl: Track the closid with the rmid

From: Reinette Chatre
Date: Wed Aug 09 2023 - 18:32:18 EST


Hi James,

On 7/28/2023 9:42 AM, James Morse wrote:

> diff --git a/arch/x86/kernel/cpu/resctrl/monitor.c b/arch/x86/kernel/cpu/resctrl/monitor.c
> index ded1fc7cb7cb..fa66029de41c 100644
> --- a/arch/x86/kernel/cpu/resctrl/monitor.c
> +++ b/arch/x86/kernel/cpu/resctrl/monitor.c
> @@ -25,6 +25,12 @@
> #include "internal.h"
>
> struct rmid_entry {
> + /*
> + * Some architectures's resctrl_arch_rmid_read() needs the CLOSID value
> + * in order to access the correct monitor. This field provides the
> + * value to list walkers like __check_limbo(). On x86 this is ignored.
> + */
> + u32 closid;
> u32 rmid;
> int busy;
> struct list_head list;

In Documentation/process/maintainer-tip.rst the x86 maintainers ask that
struct members not be documented within the declaration. Could you please
use the kernel-doc format requested there instead?
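
For example, a rough sketch (the short description and the @busy/@list
text are just placeholders you would need to fill in):

/**
 * struct rmid_entry - per-RMID bookkeeping
 * @closid:	The CLOSID for this entry. Some architectures'
 *		resctrl_arch_rmid_read() needs it in order to access the
 *		correct monitor; on x86 it is ignored.
 * @rmid:	The RMID for this entry.
 * @busy:	...
 * @list:	...
 */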

...

> @@ -429,7 +439,7 @@ static int __mon_event_count(u32 rmid, struct rmid_read *rr)
> * __mon_event_count() is compared with the chunks value from the previous
> * invocation. This must be called once per second to maintain values in MBps.
> */
> -static void mbm_bw_count(u32 rmid, struct rmid_read *rr)
> +static void mbm_bw_count(u32 closid, u32 rmid, struct rmid_read *rr)
> {
> struct mbm_state *m = &rr->d->mbm_local[rmid];
> u64 cur_bw, bytes, cur_bytes;
> @@ -459,7 +469,7 @@ void mon_event_count(void *info)
>
> rdtgrp = rr->rgrp;
>
> - ret = __mon_event_count(rdtgrp->mon.rmid, rr);
> + ret = __mon_event_count(rdtgrp->closid, rdtgrp->mon.rmid, rr);
>
> /*
> * For Ctrl groups read data from child monitor groups and
> @@ -470,7 +480,8 @@ void mon_event_count(void *info)
>
> if (rdtgrp->type == RDTCTRL_GROUP) {
> list_for_each_entry(entry, head, mon.crdtgrp_list) {
> - if (__mon_event_count(entry->mon.rmid, rr) == 0)
> + if (__mon_event_count(rdtgrp->closid, entry->mon.rmid,
> + rr) == 0)
> ret = 0;
> }
> }

I understand that the parent and child resource groups should have the same
closid, but that makes me wonder: why use the parent's closid in this hunk,
while later, in the change to mbm_handle_overflow() where the monitor groups
are also traversed, you use the closid from the child resource group?
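
That is, to be consistent with that later hunk I would have expected
something like the below (not saying that is necessarily the right
choice, just pointing out the difference):

		if (__mon_event_count(entry->closid, entry->mon.rmid,
				      rr) == 0)
			ret = 0;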

> @@ -600,7 +611,8 @@ static void update_mba_bw(struct rdtgroup *rgrp, struct rdt_domain *dom_mbm)
> }
> }
>
> -static void mbm_update(struct rdt_resource *r, struct rdt_domain *d, int rmid)
> +static void mbm_update(struct rdt_resource *r, struct rdt_domain *d,
> + u32 closid, u32 rmid)
> {
> struct rmid_read rr;
>
> @@ -615,12 +627,12 @@ static void mbm_update(struct rdt_resource *r, struct rdt_domain *d, int rmid)
> if (is_mbm_total_enabled()) {
> rr.evtid = QOS_L3_MBM_TOTAL_EVENT_ID;
> rr.val = 0;
> - __mon_event_count(rmid, &rr);
> + __mon_event_count(closid, rmid, &rr);
> }
> if (is_mbm_local_enabled()) {
> rr.evtid = QOS_L3_MBM_LOCAL_EVENT_ID;
> rr.val = 0;
> - __mon_event_count(rmid, &rr);
> + __mon_event_count(closid, rmid, &rr);
>
> /*
> * Call the MBA software controller only for the
> @@ -628,7 +640,7 @@ static void mbm_update(struct rdt_resource *r, struct rdt_domain *d, int rmid)
> * the software controller explicitly.
> */
> if (is_mba_sc(NULL))
> - mbm_bw_count(rmid, &rr);
> + mbm_bw_count(closid, rmid, &rr);
> }
> }
>
> @@ -685,11 +697,11 @@ void mbm_handle_overflow(struct work_struct *work)
> d = container_of(work, struct rdt_domain, mbm_over.work);
>
> list_for_each_entry(prgrp, &rdt_all_groups, rdtgroup_list) {
> - mbm_update(r, d, prgrp->mon.rmid);
> + mbm_update(r, d, prgrp->closid, prgrp->mon.rmid);
>
> head = &prgrp->mon.crdtgrp_list;
> list_for_each_entry(crgrp, head, mon.crdtgrp_list)
> - mbm_update(r, d, crgrp->mon.rmid);
> + mbm_update(r, d, crgrp->closid, crgrp->mon.rmid);
>
> if (is_mba_sc(NULL))
> update_mba_bw(prgrp, d);

The hunk above is the one I referred to earlier.

> @@ -732,10 +744,11 @@ static int dom_data_init(struct rdt_resource *r)
> }
>
> /*
> - * RMID 0 is special and is always allocated. It's used for all
> - * tasks that are not monitored.
> + * CLOSID 0 and RMID 0 are special and are always allocated. These are
> + * used for rdtgroup_default control group, which will be setup later.
> + * See rdtgroup_setup_root().
> */
> - entry = __rmid_entry(0);
> + entry = __rmid_entry(0, 0);

There seems to be an ordering issue here: the hardcoded values stand in for
RESCTRL_RESERVED_CLOSID and RESCTRL_RESERVED_RMID, which are only introduced
in the next patch. That may be okay since this code changes in the next
patch ... but the comment is left referring to the constants. Maybe it would
be clearer if the defines were moved into this patch?
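
That is, roughly the below in this patch, using the names from the next
patch:

#define RESCTRL_RESERVED_CLOSID		0
#define RESCTRL_RESERVED_RMID		0

	...

	/*
	 * RESCTRL_RESERVED_CLOSID and RESCTRL_RESERVED_RMID are always
	 * allocated. These are used for the rdtgroup_default control
	 * group, which will be setup later. See rdtgroup_setup_root().
	 */
	entry = __rmid_entry(RESCTRL_RESERVED_CLOSID, RESCTRL_RESERVED_RMID);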

> list_del(&entry->list);
>
> return 0;
> diff --git a/arch/x86/kernel/cpu/resctrl/pseudo_lock.c b/arch/x86/kernel/cpu/resctrl/pseudo_lock.c
> index 458cb7419502..aeadaeb5df9a 100644
> --- a/arch/x86/kernel/cpu/resctrl/pseudo_lock.c
> +++ b/arch/x86/kernel/cpu/resctrl/pseudo_lock.c
> @@ -738,7 +738,7 @@ int rdtgroup_locksetup_enter(struct rdtgroup *rdtgrp)
> * anymore when this group would be used for pseudo-locking. This
> * is safe to call on platforms not capable of monitoring.
> */
> - free_rmid(rdtgrp->mon.rmid);
> + free_rmid(rdtgrp->closid, rdtgrp->mon.rmid);
>
> ret = 0;
> goto out;
> @@ -773,7 +773,7 @@ int rdtgroup_locksetup_exit(struct rdtgroup *rdtgrp)
>
> ret = rdtgroup_locksetup_user_restore(rdtgrp);
> if (ret) {
> - free_rmid(rdtgrp->mon.rmid);
> + free_rmid(rdtgrp->closid, rdtgrp->mon.rmid);
> return ret;
> }
>
> diff --git a/arch/x86/kernel/cpu/resctrl/rdtgroup.c b/arch/x86/kernel/cpu/resctrl/rdtgroup.c
> index 725344048f85..f7fda4fc2c9e 100644
> --- a/arch/x86/kernel/cpu/resctrl/rdtgroup.c
> +++ b/arch/x86/kernel/cpu/resctrl/rdtgroup.c
> @@ -2714,7 +2714,7 @@ static void free_all_child_rdtgrp(struct rdtgroup *rdtgrp)
>
> head = &rdtgrp->mon.crdtgrp_list;
> list_for_each_entry_safe(sentry, stmp, head, mon.crdtgrp_list) {
> - free_rmid(sentry->mon.rmid);
> + free_rmid(sentry->closid, sentry->mon.rmid);
> list_del(&sentry->mon.crdtgrp_list);
>
> if (atomic_read(&sentry->waitcount) != 0)
> @@ -2754,7 +2754,7 @@ static void rmdir_all_sub(void)
> cpumask_or(&rdtgroup_default.cpu_mask,
> &rdtgroup_default.cpu_mask, &rdtgrp->cpu_mask);
>
> - free_rmid(rdtgrp->mon.rmid);
> + free_rmid(rdtgrp->closid, rdtgrp->mon.rmid);
>
> kernfs_remove(rdtgrp->kn);
> list_del(&rdtgrp->rdtgroup_list);
> @@ -3252,7 +3252,7 @@ static int mkdir_rdt_prepare(struct kernfs_node *parent_kn,
> return 0;
>
> out_idfree:
> - free_rmid(rdtgrp->mon.rmid);
> + free_rmid(rdtgrp->closid, rdtgrp->mon.rmid);
> out_destroy:
> kernfs_put(rdtgrp->kn);
> kernfs_remove(rdtgrp->kn);

This does not look right ... as you note in later patches, closid_alloc() is
called _after_ mkdir_rdt_prepare(). Passing rdtgrp->closid to free_rmid() at
this point would thus use an uninitialized value. I know this code is moved
in subsequent patches, so perhaps the patches need to be reordered?
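
To spell out the ordering I am referring to (paraphrasing from memory what
I believe the rdtgroup_mkdir_ctrl_mon() flow looks like, so the details may
be slightly off):

	rdtgroup_mkdir_ctrl_mon()
	    mkdir_rdt_prepare()
	        free_rmid(rdtgrp->closid, rdtgrp->mon.rmid)
	            /* error path: closid not yet initialized */
	    closid_alloc()
	    rdtgrp->closid = closid	/* closid only valid from here */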

> @@ -3266,7 +3266,7 @@ static int mkdir_rdt_prepare(struct kernfs_node *parent_kn,
> static void mkdir_rdt_prepare_clean(struct rdtgroup *rgrp)
> {
> kernfs_remove(rgrp->kn);
> - free_rmid(rgrp->mon.rmid);
> + free_rmid(rgrp->closid, rgrp->mon.rmid);
> rdtgroup_remove(rgrp);
> }
>

This is a related issue to the one above. Looking at how
mkdir_rdt_prepare_clean() is called, right after the closid is freed, this
seems to be a use-after-free of the closid? Another motivation to reorder
the patches?

Reinette