Re: [PATCH v2 5/5] perf/x86/amd/power: Add AMD accumulated power reporting mechanism

From: Huang Rui
Date: Fri Jan 22 2016 - 03:05:06 EST


On Thu, Jan 21, 2016 at 05:59:58PM +0100, Borislav Petkov wrote:
> On Thu, Jan 21, 2016 at 04:10:40PM +0100, Peter Zijlstra wrote:
> > > > > + cpumask_clear(pmu->mask);
> > > > > + cpumask_clear(pmu->tmp_mask);
> > > > >
> > > > > for (i = 0; i < cores_per_cu; i++)
> > > > > + cpumask_set_cpu(i, pmu->mask);
> > > > >
> > > > > + cpumask_shift_left(pmu->mask, pmu->mask, cu * cores_per_cu);
> > > >
> > > > Couldn't you simply use topology_sibling_cpumask(cpu) instead?
> > > >
> > >
> > > Looks like we couldn't. That's because cores number per cu (compute
> > > unit) is got by CPUID 0x8000001e EBX. That relies on the CPU hardware.
> >
> > Borislav? I thought the AMD compute unit stuff was modeled as the SMT
> > topology.
>
> I would think so too:
>
> smp_num_siblings = ((ebx >> 8) & 3) + 1;
>
> gets set based on that CPUID leaf above. And that value is
> CoresPerComputeUnit which needs to be incremented by 1 to get the actual
> count of cores in a compute unit.
>
> And that participates in the setting of topology_sibling_cpumask() in
> set_cpu_sibling_map().
>
> And that looks correct on my system here:
>
> $ grep -EriIn . /sys/devices/system/cpu/cpu?/topology/* | grep thread_siblings
> /sys/devices/system/cpu/cpu0/topology/thread_siblings:1:03
> /sys/devices/system/cpu/cpu0/topology/thread_siblings_list:1:0-1
> /sys/devices/system/cpu/cpu1/topology/thread_siblings:1:03
> /sys/devices/system/cpu/cpu1/topology/thread_siblings_list:1:0-1
> /sys/devices/system/cpu/cpu2/topology/thread_siblings:1:0c
> /sys/devices/system/cpu/cpu2/topology/thread_siblings_list:1:2-3
> /sys/devices/system/cpu/cpu3/topology/thread_siblings:1:0c
> /sys/devices/system/cpu/cpu3/topology/thread_siblings_list:1:2-3
> /sys/devices/system/cpu/cpu4/topology/thread_siblings:1:30
> /sys/devices/system/cpu/cpu4/topology/thread_siblings_list:1:4-5
> /sys/devices/system/cpu/cpu5/topology/thread_siblings:1:30
> /sys/devices/system/cpu/cpu5/topology/thread_siblings_list:1:4-5
> /sys/devices/system/cpu/cpu6/topology/thread_siblings:1:c0
> /sys/devices/system/cpu/cpu6/topology/thread_siblings_list:1:6-7
> /sys/devices/system/cpu/cpu7/topology/thread_siblings:1:c0
> /sys/devices/system/cpu/cpu7/topology/thread_siblings_list:1:6-7
>
> and when we look at what CPUID reports:
>
> $ cpuid -r | grep -E "^\s+0x8000001e" | awk '{ print $4 }'
> ebx=0x00000100
> ebx=0x00000100
> ebx=0x00000101
> ebx=0x00000101
> ebx=0x00000102
> ebx=0x00000102
> ebx=0x00000103
> ebx=0x00000103
>
> We see that [15:8] is CoresPerComputeUnit which is + 1, so 2 cores per
> compute unit.
>
> And slice [7:0] gives the compute unit (CU) id of each core, so cores 0
> and 1 are CU0, 2 and 3 are CU1 and so on...
>
> So Rui, why do you say you can't use topology_sibling_cpumask()?
>

OK, you're right. Peter, Boris, thanks for your information.
I might need look at topology deeper. :-)

So how about below update:

8<--------------------------------------------------------------------------

diff --git a/arch/x86/kernel/cpu/perf_event_amd_power.c b/arch/x86/kernel/cpu/perf_event_amd_power.c
index 1f31157..d387fe7 100644
--- a/arch/x86/kernel/cpu/perf_event_amd_power.c
+++ b/arch/x86/kernel/cpu/perf_event_amd_power.c
@@ -301,18 +301,12 @@ static struct pmu pmu_class = {
static int power_cpu_exit(int cpu)
{
struct power_pmu *pmu = per_cpu(amd_power_pmu, cpu);
- int i, cu, ret = 0;
+ int ret = 0;
int target = nr_cpumask_bits;

- cu = cpu / cores_per_cu;
-
cpumask_clear(pmu->mask);
- cpumask_clear(pmu->tmp_mask);
-
- for (i = 0; i < cores_per_cu; i++)
- cpumask_set_cpu(i, pmu->mask);

- cpumask_shift_left(pmu->mask, pmu->mask, cu * cores_per_cu);
+ cpumask_copy(pmu->mask, topology_sibling_cpumask(cpu));

cpumask_clear_cpu(cpu, &cpu_mask);
cpumask_clear_cpu(cpu, pmu->mask);
@@ -345,19 +339,12 @@ out:
static int power_cpu_init(int cpu)
{
struct power_pmu *pmu = per_cpu(amd_power_pmu, cpu);
- int i, cu;

if (pmu)
return 0;

- cu = cpu / cores_per_cu;
-
- for (i = 0; i < cores_per_cu; i++)
- cpumask_set_cpu(i, pmu->mask);
-
- cpumask_shift_left(pmu->mask, pmu->mask, cu * cores_per_cu);
-
- if (!cpumask_and(pmu->tmp_mask, pmu->mask, &cpu_mask))
+ if (!cpumask_and(pmu->mask, topology_sibling_cpumask(cpu),
+ &cpu_mask))
cpumask_set_cpu(cpu, &cpu_mask);

return 0;
@@ -454,7 +441,6 @@ static int __init amd_power_pmu_init(void)
{
int i, ret;
u64 tmp;
- cpumask_var_t tmp_mask, res_mask;

if (!x86_match_cpu(cpu_match))
return 0;
@@ -476,27 +462,16 @@ static int __init amd_power_pmu_init(void)
}
max_cu_acc_power = tmp;

- if (!zalloc_cpumask_var(&tmp_mask, GFP_KERNEL))
- return -ENOMEM;
-
- if (!zalloc_cpumask_var(&res_mask, GFP_KERNEL)) {
- ret = -ENOMEM;
- goto out;
- }
-
- for (i = 0; i < cores_per_cu; i++)
- cpumask_set_cpu(i, tmp_mask);
-
cpu_notifier_register_begin();

/*
* Choose the one online core of each compute unit
*/
- for (i = 0; i < cu_num; i++) {
+ for (i = 0; i < boot_cpu_data.x86_max_cores; i += cores_per_cu) {
/* WARN_ON for empty CU masks */
- WARN_ON(!cpumask_and(res_mask, tmp_mask, cpu_online_mask));
- cpumask_set_cpu(cpumask_any(res_mask), &cpu_mask);
- cpumask_shift_left(tmp_mask, tmp_mask, cores_per_cu);
+ WARN_ON(cpumask_empty(topology_sibling_cpumask(i)));
+ cpumask_set_cpu(cpumask_any(topology_sibling_cpumask(i)),
+ &cpu_mask);
}

for_each_present_cpu(i) {
@@ -505,14 +480,14 @@ static int __init amd_power_pmu_init(void)
/* unwind on [0 ... i-1] CPUs */
while (i--)
power_cpu_kfree(i);
- goto out1;
+ goto out;
}
ret = power_cpu_init(i);
if (ret) {
/* unwind on [0 ... i] CPUs */
while (i >= 0)
power_cpu_kfree(i--);
- goto out1;
+ goto out;
}
}

@@ -521,17 +496,13 @@ static int __init amd_power_pmu_init(void)
ret = perf_pmu_register(&pmu_class, "power", -1);
if (WARN_ON(ret)) {
pr_warn("AMD Power PMU registration failed\n");
- goto out1;
+ goto out;
}

pr_info("AMD Power PMU detected, %d compute units\n", cu_num);

-out1:
- cpu_notifier_register_done();
-
- free_cpumask_var(res_mask);
out:
- free_cpumask_var(tmp_mask);
+ cpu_notifier_register_done();

return ret;
}

8<--------------------------------------------------------------------------

BTW, "smp_num_siblings = ((ebx >> 8) & 3) + 1" should not put under
init_amd(), we would better move it to bsp_init_amd(). Because the AMD
"smp_num_siblings" number must be constant.

Thanks,
Rui