Re: [PATCH 1/3] powerpc/smp: Parse ibm,thread-groups with multiple properties

From: Gautham R Shenoy
Date: Tue Dec 08 2020 - 12:26:48 EST


Hello Srikar,

Thanks for taking a look at the patch.

On Mon, Dec 07, 2020 at 05:40:42PM +0530, Srikar Dronamraju wrote:
> * Gautham R. Shenoy <ego@xxxxxxxxxxxxxxxxxx> [2020-12-04 10:18:45]:
>
> > From: "Gautham R. Shenoy" <ego@xxxxxxxxxxxxxxxxxx>
>
> <snipped>
>
> >
> > static int parse_thread_groups(struct device_node *dn,
> > - struct thread_groups *tg,
> > - unsigned int property)
> > + struct thread_groups_list *tglp)
> > {
> > - int i;
> > - u32 thread_group_array[3 + MAX_THREAD_LIST_SIZE];
> > + int i = 0;
> > + u32 *thread_group_array;
> > u32 *thread_list;
> > size_t total_threads;
> > - int ret;
> > + int ret = 0, count;
> > + unsigned int property_idx = 0;
>
> NIT:
> tglx mentions in one of his recent comments to try keep a reverse fir tree
> ordering of variables where possible.

I suppose you mean moving the longer local variable declarations to to
the top and shorter ones to the bottom. Thanks. Will fix this.


>
> >
> > + count = of_property_count_u32_elems(dn, "ibm,thread-groups");
> > + thread_group_array = kcalloc(count, sizeof(u32), GFP_KERNEL);
> > ret = of_property_read_u32_array(dn, "ibm,thread-groups",
> > - thread_group_array, 3);
> > + thread_group_array, count);
> > if (ret)
> > - return ret;
> > -
> > - tg->property = thread_group_array[0];
> > - tg->nr_groups = thread_group_array[1];
> > - tg->threads_per_group = thread_group_array[2];
> > - if (tg->property != property ||
> > - tg->nr_groups < 1 ||
> > - tg->threads_per_group < 1)
> > - return -ENODATA;
> > + goto out_free;
> >
> > - total_threads = tg->nr_groups * tg->threads_per_group;
> > + while (i < count && property_idx < MAX_THREAD_GROUP_PROPERTIES) {
> > + int j;
> > + struct thread_groups *tg = &tglp->property_tgs[property_idx++];
>
> NIT: same as above.

Ok.
>
> >
> > - ret = of_property_read_u32_array(dn, "ibm,thread-groups",
> > - thread_group_array,
> > - 3 + total_threads);
> > - if (ret)
> > - return ret;
> > + tg->property = thread_group_array[i];
> > + tg->nr_groups = thread_group_array[i + 1];
> > + tg->threads_per_group = thread_group_array[i + 2];
> > + total_threads = tg->nr_groups * tg->threads_per_group;
> > +
> > + thread_list = &thread_group_array[i + 3];
> >
> > - thread_list = &thread_group_array[3];
> > + for (j = 0; j < total_threads; j++)
> > + tg->thread_list[j] = thread_list[j];
> > + i = i + 3 + total_threads;
>
> Can't we simply use memcpy instead?

We could. But this one makes it more explicit.


>
> > + }
> >
> > - for (i = 0 ; i < total_threads; i++)
> > - tg->thread_list[i] = thread_list[i];
> > + tglp->nr_properties = property_idx;
> >
> > - return 0;
> > +out_free:
> > + kfree(thread_group_array);
> > + return ret;
> > }
> >
> > /*
> > @@ -805,24 +827,39 @@ static int get_cpu_thread_group_start(int cpu, struct thread_groups *tg)
> > return -1;
> > }
> >
> > -static int init_cpu_l1_cache_map(int cpu)
> > +static int init_cpu_cache_map(int cpu, unsigned int cache_property)
> >
> > {
> > struct device_node *dn = of_get_cpu_node(cpu, NULL);
> > - struct thread_groups tg = {.property = 0,
> > - .nr_groups = 0,
> > - .threads_per_group = 0};
> > + struct thread_groups *tg = NULL;
> > int first_thread = cpu_first_thread_sibling(cpu);
> > int i, cpu_group_start = -1, err = 0;
> > + cpumask_var_t *mask;
> > + struct thread_groups_list *cpu_tgl = &tgl[cpu];
>
> NIT: same as 1st comment.

Sure, will fix this.

>
> >
> > if (!dn)
> > return -ENODATA;
> >
> > - err = parse_thread_groups(dn, &tg, THREAD_GROUP_SHARE_L1);
> > - if (err)
> > - goto out;
> > + if (!(cache_property == THREAD_GROUP_SHARE_L1))
> > + return -EINVAL;
> >
> > - cpu_group_start = get_cpu_thread_group_start(cpu, &tg);
> > + if (!cpu_tgl->nr_properties) {
> > + err = parse_thread_groups(dn, cpu_tgl);
> > + if (err)
> > + goto out;
> > + }
> > +
> > + for (i = 0; i < cpu_tgl->nr_properties; i++) {
> > + if (cpu_tgl->property_tgs[i].property == cache_property) {
> > + tg = &cpu_tgl->property_tgs[i];
> > + break;
> > + }
> > + }
> > +
> > + if (!tg)
> > + return -EINVAL;
> > +
> > + cpu_group_start = get_cpu_thread_group_start(cpu, tg);
>
> This whole hunk should be moved to a new function and called before
> init_cpu_cache_map. It will simplify the logic to great extent.

I suppose you are referring to the part where we select the correct
tg. Yeah, that can move to a different helper.

>
> >
> > if (unlikely(cpu_group_start == -1)) {
> > WARN_ON_ONCE(1);
> > @@ -830,11 +867,12 @@ static int init_cpu_l1_cache_map(int cpu)
> > goto out;
> > }
> >
> > - zalloc_cpumask_var_node(&per_cpu(cpu_l1_cache_map, cpu),
> > - GFP_KERNEL, cpu_to_node(cpu));
> > + mask = &per_cpu(cpu_l1_cache_map, cpu);
> > +
> > + zalloc_cpumask_var_node(mask, GFP_KERNEL, cpu_to_node(cpu));
> >
>
> This hunk (and the next hunk) should be moved to next patch.
>

The next patch is only about introducing THREAD_GROUP_SHARE_L2. Hence
I put in any other code in this patch, since it seems to be a logical
place to collate whatever we have in a generic form.



> > for (i = first_thread; i < first_thread + threads_per_core; i++) {
> > - int i_group_start = get_cpu_thread_group_start(i, &tg);
> > + int i_group_start = get_cpu_thread_group_start(i, tg);
> >
> > if (unlikely(i_group_start == -1)) {
> > WARN_ON_ONCE(1);
> > @@ -843,7 +881,7 @@ static int init_cpu_l1_cache_map(int cpu)
> > }
> >
> > if (i_group_start == cpu_group_start)
> > - cpumask_set_cpu(i, per_cpu(cpu_l1_cache_map, cpu));
> > + cpumask_set_cpu(i, *mask);
> > }
> >
> > out:
> > @@ -924,7 +962,7 @@ static int init_big_cores(void)
> > int cpu;
> >
> > for_each_possible_cpu(cpu) {
> > - int err = init_cpu_l1_cache_map(cpu);
> > + int err = init_cpu_cache_map(cpu, THREAD_GROUP_SHARE_L1);
> >
> > if (err)
> > return err;
> > --
> > 1.9.4
> >
>
> --
> Thanks and Regards
> Srikar Dronamraju