Re: [patch 07/11] x86/perf/uncore: Track packages not per cpu data

From: Stephane Eranian
Date: Wed Feb 17 2016 - 16:19:23 EST


On Wed, Feb 17, 2016 at 5:47 AM, Thomas Gleixner <tglx@xxxxxxxxxxxxx> wrote:
> uncore is a per package facility, but the code tries to mimic a per cpu
> facility with completely convoluted constructs.
>
> Simplify the whole machinery by tracking per package information. While at it,
> avoid the kfree/alloc dance when a cpu goes offline and online again. There is
> no point in freeing the box after it was allocated. We just keep proper
> refcounting and the first cpu which comes online in a package does the
> initialization/activation of the box.
>
> Signed-off-by: Thomas Gleixner <tglx@xxxxxxxxxxxxx>
> ---
> arch/x86/kernel/cpu/perf_event_intel_uncore.c | 355 ++++++++++----------------
> arch/x86/kernel/cpu/perf_event_intel_uncore.h | 4
> 2 files changed, 139 insertions(+), 220 deletions(-)
>
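Just to check that I'm reading the new lifecycle correctly: per package the
box is now allocated once on the first CPU_UP_PREPARE and never freed on
offline; it only flips between initialized and uninitialized via the
refcount. A userspace sketch of my understanding (not the actual kernel
code, stdatomic instead of atomic_t):

#include <stdatomic.h>
#include <stdbool.h>
#include <stdio.h>

struct box {
	atomic_int refcnt;
	bool initialized;
};

/* first cpu which comes online in a package initializes the box */
static void cpu_starting(struct box *box, int ncpus)
{
	/*
	 * ncpus > 1 only on the initial bringup, where the first online
	 * cpu of the package accounts for all already online siblings
	 */
	if (atomic_fetch_add(&box->refcnt, ncpus) == 0)
		box->initialized = true;	/* uncore_box_init() */
}

/* last cpu going offline deactivates, but does not free, the box */
static void cpu_dying(struct box *box)
{
	if (atomic_fetch_sub(&box->refcnt, 1) == 1)
		box->initialized = false;	/* uncore_box_exit() */
}

int main(void)
{
	struct box b = { 0 };

	cpu_starting(&b, 2);	/* initial setup: 2 cpus online in the package */
	cpu_dying(&b);
	cpu_dying(&b);		/* last one: box exits, memory stays allocated */
	printf("refcnt=%d initialized=%d\n",
	       atomic_load(&b.refcnt), b.initialized);
	return 0;
}
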
> --- a/arch/x86/kernel/cpu/perf_event_intel_uncore.c
> +++ b/arch/x86/kernel/cpu/perf_event_intel_uncore.c
> @@ -11,7 +11,6 @@ DEFINE_RAW_SPINLOCK(pci2phy_map_lock);
> struct list_head pci2phy_map_head = LIST_HEAD_INIT(pci2phy_map_head);
> struct pci_dev *uncore_extra_pci_dev[UNCORE_SOCKET_MAX][UNCORE_EXTRA_PCI_DEV_MAX];
>
> -static DEFINE_RAW_SPINLOCK(uncore_box_lock);
> /* mask of cpus that collect uncore events */
> static cpumask_t uncore_cpu_mask;
>
> @@ -89,27 +88,7 @@ struct intel_uncore_pmu *uncore_event_to
>
> struct intel_uncore_box *uncore_pmu_to_box(struct intel_uncore_pmu *pmu, int cpu)
> {
> - struct intel_uncore_box *box;
> -
> - box = *per_cpu_ptr(pmu->box, cpu);
> - if (box)
> - return box;
> -
> - raw_spin_lock(&uncore_box_lock);
> - /* Recheck in lock to handle races. */
> - if (*per_cpu_ptr(pmu->box, cpu))
> - goto out;
> - list_for_each_entry(box, &pmu->box_list, list) {
> - if (box->phys_id == topology_physical_package_id(cpu)) {
> - atomic_inc(&box->refcnt);
> - *per_cpu_ptr(pmu->box, cpu) = box;
> - break;
> - }
> - }
> -out:
> - raw_spin_unlock(&uncore_box_lock);
> -
> - return *per_cpu_ptr(pmu->box, cpu);
> + return pmu->boxes[topology_physical_package_id(cpu)];
> }
>
> struct intel_uncore_box *uncore_event_to_box(struct perf_event *event)
> @@ -317,7 +296,6 @@ static struct intel_uncore_box *uncore_a
> raw_spin_lock_init(&box->shared_regs[i].lock);
>
> uncore_pmu_init_hrtimer(box);
> - atomic_set(&box->refcnt, 1);
> box->cpu = -1;
> box->phys_id = -1;
>
> @@ -773,14 +751,24 @@ static void uncore_pmu_unregister(struct
> pmu->registered = false;
> }
>
> +static void uncore_free_boxes(struct intel_uncore_pmu *pmu)
> +{
> + int pkg, maxpkg = topology_max_packages();
> +
> + for (pkg = 0; pkg < maxpkg; pkg++)
> + kfree(pmu->boxes[pkg]);
> + kfree(pmu->boxes);
> +}
> +
> static void __init uncore_type_exit(struct intel_uncore_type *type)
> {
> + struct intel_uncore_pmu *pmu = type->pmus;
> int i;
>
> - if (type->pmus) {
> - for (i = 0; i < type->num_boxes; i++) {
> - uncore_pmu_unregister(&type->pmus[i]);
> - free_percpu(type->pmus[i].box);
> + if (pmu) {
> + for (i = 0; i < type->num_boxes; i++, pmu++) {
> + uncore_pmu_unregister(pmu);
> + uncore_free_boxes(pmu);
> }
> kfree(type->pmus);
> type->pmus = NULL;
> @@ -795,33 +783,34 @@ static void __init uncore_types_exit(str
> uncore_type_exit(*types++);
> }
>
> -static int __init uncore_type_init(struct intel_uncore_type *type)
> +static int __init uncore_type_init(struct intel_uncore_type *type, bool setid)
> {
> struct intel_uncore_pmu *pmus;
> struct attribute_group *attr_group;
> struct attribute **attrs;
> + size_t size;
> int i, j;
>
> pmus = kzalloc(sizeof(*pmus) * type->num_boxes, GFP_KERNEL);
> if (!pmus)
> return -ENOMEM;
>
> - type->pmus = pmus;
> -
> - type->unconstrainted = (struct event_constraint)
> - __EVENT_CONSTRAINT(0, (1ULL << type->num_counters) - 1,
> - 0, type->num_counters, 0, 0);
> + size = topology_max_packages() * sizeof(struct intel_uncore_box *);
>
Let's assume you have a system with 48 cpus with HT on; then you have
2 processor sockets, so topology_max_packages() returning 2 is correct.
But you further assume that topology_physical_package_id() will return
0 or 1 in this case, in other words that physical ids are always
assigned contiguously from 0 to N-1.

Do we know for sure that this is always the case for all Intel systems,
and that no weird BIOS is assigning random numbers for phys_proc_id?
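
If a BIOS is allowed to hand out sparse ids, pmu->boxes[] would be
indexed out of bounds here. One way around that would be a small
phys->logical translation built at init time, along these lines
(purely hypothetical userspace sketch to illustrate the point;
pkg_to_logical() is made up, not something in this patch):

#include <stdio.h>

#define MAX_PHYS_ID 256		/* arbitrary upper bound for the sketch */

static int phys_to_logical[MAX_PHYS_ID];	/* 0 = unassigned */
static int nr_logical_pkgs;

/* hypothetical helper: assign dense logical ids in first-seen order */
static int pkg_to_logical(int phys_id)
{
	if (phys_to_logical[phys_id] == 0)
		phys_to_logical[phys_id] = ++nr_logical_pkgs;
	return phys_to_logical[phys_id] - 1;
}

int main(void)
{
	/* a BIOS handing out sparse/random package ids */
	int ids[] = { 0, 5, 5, 13 };

	for (unsigned int i = 0; i < sizeof(ids) / sizeof(ids[0]); i++)
		printf("phys %d -> logical %d\n", ids[i], pkg_to_logical(ids[i]));
	return 0;
}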


> for (i = 0; i < type->num_boxes; i++) {
> - pmus[i].func_id = -1;
> - pmus[i].pmu_idx = i;
> - pmus[i].type = type;
> - INIT_LIST_HEAD(&pmus[i].box_list);
> - pmus[i].box = alloc_percpu(struct intel_uncore_box *);
> - if (!pmus[i].box)
> + pmus[i].func_id = setid ? i : -1;
> + pmus[i].pmu_idx = i;
> + pmus[i].type = type;
> + pmus[i].boxes = kzalloc(size, GFP_KERNEL);
> + if (!pmus[i].boxes)
> return -ENOMEM;
> }
>
> + type->pmus = pmus;
> + type->unconstrainted = (struct event_constraint)
> + __EVENT_CONSTRAINT(0, (1ULL << type->num_counters) - 1,
> + 0, type->num_counters, 0, 0);
> +
> if (type->event_descs) {
> i = 0;
> while (type->event_descs[i].attr.attr.name)
> @@ -846,12 +835,13 @@ static int __init uncore_type_init(struc
> return 0;
> }
>
> -static int __init uncore_types_init(struct intel_uncore_type **types)
> +static int __init
> +uncore_types_init(struct intel_uncore_type **types, bool setid)
> {
> - int i, ret;
> + int ret;
>
> - for (i = 0; types[i]; i++) {
> - ret = uncore_type_init(types[i]);
> + while (*types) {
> + ret = uncore_type_init(*types++, setid);
> if (ret)
> return ret;
> }
> @@ -863,10 +853,9 @@ static int __init uncore_types_init(stru
> */
> static int uncore_pci_probe(struct pci_dev *pdev, const struct pci_device_id *id)
> {
> + struct intel_uncore_type *type;
> struct intel_uncore_pmu *pmu;
> struct intel_uncore_box *box;
> - struct intel_uncore_type *type;
> - bool first_box = false;
> int phys_id, ret;
>
> phys_id = uncore_pcibus_to_physid(pdev->bus);
> @@ -906,27 +895,24 @@ static int uncore_pci_probe(struct pci_d
> else
> WARN_ON_ONCE(pmu->func_id != pdev->devfn);
>
> + WARN_ON_ONCE(pmu->boxes[phys_id] != NULL);
> +
> + atomic_inc(&box->refcnt);
> box->phys_id = phys_id;
> box->pci_dev = pdev;
> box->pmu = pmu;
> uncore_box_init(box);
> pci_set_drvdata(pdev, box);
>
> - raw_spin_lock(&uncore_box_lock);
> - if (list_empty(&pmu->box_list))
> - first_box = true;
> - list_add_tail(&box->list, &pmu->box_list);
> - raw_spin_unlock(&uncore_box_lock);
> -
> - if (!first_box)
> + pmu->boxes[phys_id] = box;
> + if (atomic_inc_return(&pmu->activeboxes) > 1)
> return 0;
>
> + /* First active box registers the pmu */
> ret = uncore_pmu_register(pmu);
> if (ret) {
> pci_set_drvdata(pdev, NULL);
> - raw_spin_lock(&uncore_box_lock);
> - list_del(&box->list);
> - raw_spin_unlock(&uncore_box_lock);
> + pmu->boxes[phys_id] = NULL;
> uncore_box_exit(box);
> kfree(box);
> }
> @@ -937,8 +923,7 @@ static void uncore_pci_remove(struct pci
> {
> struct intel_uncore_box *box = pci_get_drvdata(pdev);
> struct intel_uncore_pmu *pmu;
> - int i, cpu, phys_id;
> - bool last_box = false;
> + int i, phys_id;
>
> phys_id = uncore_pcibus_to_physid(pdev->bus);
> box = pci_get_drvdata(pdev);
> @@ -958,26 +943,13 @@ static void uncore_pci_remove(struct pci
> return;
>
> pci_set_drvdata(pdev, NULL);
> + pmu->boxes[phys_id] = NULL;
>
> - raw_spin_lock(&uncore_box_lock);
> - list_del(&box->list);
> - if (list_empty(&pmu->box_list))
> - last_box = true;
> - raw_spin_unlock(&uncore_box_lock);
> -
> - for_each_possible_cpu(cpu) {
> - if (*per_cpu_ptr(pmu->box, cpu) == box) {
> - *per_cpu_ptr(pmu->box, cpu) = NULL;
> - atomic_dec(&box->refcnt);
> - }
> - }
> + if (atomic_dec_return(&pmu->activeboxes) == 0)
> + uncore_pmu_unregister(pmu);
>
> - WARN_ON_ONCE(atomic_read(&box->refcnt) != 1);
> uncore_box_exit(box);
> kfree(box);
> -
> - if (last_box)
> - uncore_pmu_unregister(pmu);
> }
>
> static int __init uncore_pci_init(void)
> @@ -1024,7 +996,7 @@ static int __init uncore_pci_init(void)
> if (ret)
> return ret;
>
> - ret = uncore_types_init(uncore_pci_uncores);
> + ret = uncore_types_init(uncore_pci_uncores, false);
> if (ret)
> goto err;
>
> @@ -1053,106 +1025,76 @@ static void __init uncore_pci_exit(void)
> }
> }
>
> -/* CPU hot plug/unplug are serialized by cpu_add_remove_lock mutex */
> -static LIST_HEAD(boxes_to_free);
> -
> -static void uncore_kfree_boxes(void)
> -{
> - struct intel_uncore_box *box;
> -
> - while (!list_empty(&boxes_to_free)) {
> - box = list_entry(boxes_to_free.next,
> - struct intel_uncore_box, list);
> - list_del(&box->list);
> - kfree(box);
> - }
> -}
> -
> static void uncore_cpu_dying(int cpu)
> {
> - struct intel_uncore_type *type;
> + struct intel_uncore_type *type, **types = uncore_msr_uncores;
> struct intel_uncore_pmu *pmu;
> struct intel_uncore_box *box;
> - int i, j;
> + int i, pkg;
>
> - for (i = 0; uncore_msr_uncores[i]; i++) {
> - type = uncore_msr_uncores[i];
> - for (j = 0; j < type->num_boxes; j++) {
> - pmu = &type->pmus[j];
> - box = *per_cpu_ptr(pmu->box, cpu);
> - *per_cpu_ptr(pmu->box, cpu) = NULL;
> - if (box && atomic_dec_and_test(&box->refcnt)) {
> - list_add(&box->list, &boxes_to_free);
> + pkg = topology_physical_package_id(cpu);
> + while (*types) {
> + type = *types++;
> + pmu = type->pmus;
> + for (i = 0; i < type->num_boxes; i++, pmu++) {
> + box = pmu->boxes[pkg];
> + if (box && atomic_dec_return(&box->refcnt) == 0)
> uncore_box_exit(box);
> - }
> }
> }
> }
>
> -static int uncore_cpu_starting(int cpu)
> +static void uncore_cpu_starting(int cpu, bool init)
> {
> - struct intel_uncore_type *type;
> + struct intel_uncore_type *type, **types = uncore_msr_uncores;
> struct intel_uncore_pmu *pmu;
> - struct intel_uncore_box *box, *exist;
> - int i, j, k, phys_id;
> + struct intel_uncore_box *box;
> + int i, pkg, ncpus = 1;
>
> - phys_id = topology_physical_package_id(cpu);
> + if (init) {
> + /*
> + * On init we get the number of online cpus in the package
> + * and set refcount for all of them.
> + */
> + ncpus = cpumask_weight(topology_core_cpumask(cpu));
> + }
>
> - for (i = 0; uncore_msr_uncores[i]; i++) {
> - type = uncore_msr_uncores[i];
> - for (j = 0; j < type->num_boxes; j++) {
> - pmu = &type->pmus[j];
> - box = *per_cpu_ptr(pmu->box, cpu);
> - /* called by uncore_cpu_init? */
> - if (box && box->phys_id >= 0) {
> - uncore_box_init(box);
> + pkg = topology_physical_package_id(cpu);
> + while (*types) {
> + type = *types++;
> + pmu = type->pmus;
> + for (i = 0; i < type->num_boxes; i++, pmu++) {
> + box = pmu->boxes[pkg];
> + if (!box)
> continue;
> - }
> -
> - for_each_online_cpu(k) {
> - exist = *per_cpu_ptr(pmu->box, k);
> - if (exist && exist->phys_id == phys_id) {
> - atomic_inc(&exist->refcnt);
> - *per_cpu_ptr(pmu->box, cpu) = exist;
> - if (box) {
> - list_add(&box->list,
> - &boxes_to_free);
> - box = NULL;
> - }
> - break;
> - }
> - }
> -
> - if (box) {
> - box->phys_id = phys_id;
> + /* The first cpu on a package activates the box */
> + if (atomic_add_return(ncpus, &box->refcnt) == ncpus)
> uncore_box_init(box);
> - }
> }
> }
> - return 0;
> }
>
> -static int uncore_cpu_prepare(int cpu, int phys_id)
> +static int uncore_cpu_prepare(int cpu)
> {
> - struct intel_uncore_type *type;
> + struct intel_uncore_type *type, **types = uncore_msr_uncores;
> struct intel_uncore_pmu *pmu;
> struct intel_uncore_box *box;
> - int i, j;
> -
> - for (i = 0; uncore_msr_uncores[i]; i++) {
> - type = uncore_msr_uncores[i];
> - for (j = 0; j < type->num_boxes; j++) {
> - pmu = &type->pmus[j];
> - if (pmu->func_id < 0)
> - pmu->func_id = j;
> + int i, pkg;
>
> + pkg = topology_physical_package_id(cpu);
> + while (*types) {
> + type = *types++;
> + pmu = type->pmus;
> + for (i = 0; i < type->num_boxes; i++, pmu++) {
> + if (pmu->boxes[pkg])
> + continue;
> + /* First cpu of a package allocates the box */
> box = uncore_alloc_box(type, cpu_to_node(cpu));
> if (!box)
> return -ENOMEM;
> -
> box->pmu = pmu;
> - box->phys_id = phys_id;
> - *per_cpu_ptr(pmu->box, cpu) = box;
> + box->phys_id = pkg;
> + pmu->boxes[pkg] = box;
> }
> }
> return 0;
> @@ -1163,13 +1105,11 @@ static void uncore_change_type_ctx(struc
> {
> struct intel_uncore_pmu *pmu = type->pmus;
> struct intel_uncore_box *box;
> - int i;
> + int i, pkg;
>
> + pkg = topology_physical_package_id(old_cpu < 0 ? new_cpu : old_cpu);
> for (i = 0; i < type->num_boxes; i++, pmu++) {
> - if (old_cpu < 0)
> - box = uncore_pmu_to_box(pmu, new_cpu);
> - else
> - box = uncore_pmu_to_box(pmu, old_cpu);
> + box = pmu->boxes[pkg];
> if (!box)
> continue;
>
> @@ -1199,27 +1139,20 @@ static void uncore_change_context(struct
>
> static void uncore_event_exit_cpu(int cpu)
> {
> - int i, phys_id, target;
> + int target;
>
> - /* if exiting cpu is used for collecting uncore events */
> + /* Check if exiting cpu is used for collecting uncore events */
> if (!cpumask_test_and_clear_cpu(cpu, &uncore_cpu_mask))
> return;
>
> - /* find a new cpu to collect uncore events */
> - phys_id = topology_physical_package_id(cpu);
> - target = -1;
> - for_each_online_cpu(i) {
> - if (i == cpu)
> - continue;
> - if (phys_id == topology_physical_package_id(i)) {
> - target = i;
> - break;
> - }
> - }
> + /* Find a new cpu to collect uncore events */
> + target = cpumask_any_but(topology_core_cpumask(cpu), cpu);
>
> - /* migrate uncore events to the new cpu */
> - if (target >= 0)
> + /* Migrate uncore events to the new target */
> + if (target < nr_cpu_ids)
> cpumask_set_cpu(target, &uncore_cpu_mask);
> + else
> + target = -1;
>
> uncore_change_context(uncore_msr_uncores, cpu, target);
> uncore_change_context(uncore_pci_uncores, cpu, target);
> @@ -1227,13 +1160,15 @@ static void uncore_event_exit_cpu(int cp
>
> static void uncore_event_init_cpu(int cpu)
> {
> - int i, phys_id;
> + int target;
>
> - phys_id = topology_physical_package_id(cpu);
> - for_each_cpu(i, &uncore_cpu_mask) {
> - if (phys_id == topology_physical_package_id(i))
> - return;
> - }
> + /*
> + * Check if there is an online cpu in the package
> + * which collects uncore events already.
> + */
> + target = cpumask_any_and(topology_core_cpumask(cpu), &uncore_cpu_mask);
> + if (target < nr_cpu_ids)
> + return;
>
> cpumask_set_cpu(cpu, &uncore_cpu_mask);
>
> @@ -1246,38 +1181,25 @@ static int uncore_cpu_notifier(struct no
> {
> unsigned int cpu = (long)hcpu;
>
> - /* allocate/free data structure for uncore box */
> switch (action & ~CPU_TASKS_FROZEN) {
> case CPU_UP_PREPARE:
> - return notifier_from_errno(uncore_cpu_prepare(cpu, -1));
> + return notifier_from_errno(uncore_cpu_prepare(cpu));
> +
> case CPU_STARTING:
> - uncore_cpu_starting(cpu);
> + uncore_cpu_starting(cpu, false);
> + case CPU_DOWN_FAILED:
> + uncore_event_init_cpu(cpu);
> break;
> +
> case CPU_UP_CANCELED:
> case CPU_DYING:
> uncore_cpu_dying(cpu);
> break;
> - case CPU_ONLINE:
> - case CPU_DEAD:
> - uncore_kfree_boxes();
> - break;
> - default:
> - break;
> - }
>
> - /* select the cpu that collects uncore events */
> - switch (action & ~CPU_TASKS_FROZEN) {
> - case CPU_DOWN_FAILED:
> - case CPU_STARTING:
> - uncore_event_init_cpu(cpu);
> - break;
> case CPU_DOWN_PREPARE:
> uncore_event_exit_cpu(cpu);
> break;
> - default:
> - break;
> }
> -
> return NOTIFY_OK;
> }
>
> @@ -1359,7 +1281,7 @@ static int __init uncore_cpu_init(void)
> return 0;
> }
>
> - ret = uncore_types_init(uncore_msr_uncores);
> + ret = uncore_types_init(uncore_msr_uncores, true);
> if (ret)
> goto err;
>
> @@ -1375,39 +1297,34 @@ static int __init uncore_cpu_init(void)
>
> static void __init uncore_cpu_setup(void *dummy)
> {
> - uncore_cpu_starting(smp_processor_id());
> + uncore_cpu_starting(smp_processor_id(), true);
> }
>
> +/* Lazy to avoid allocation of a few bytes for the normal case */
> +static __initdata DECLARE_BITMAP(packages, NR_CPUS);
> +
> static int __init uncore_cpumask_init(void)
> {
> - int cpu, ret = 0;
> -
> - cpu_notifier_register_begin();
> + unsigned int cpu;
>
> for_each_online_cpu(cpu) {
> - int i, phys_id = topology_physical_package_id(cpu);
> + unsigned int pkg = topology_physical_package_id(cpu);
> + int ret;
>
> - for_each_cpu(i, &uncore_cpu_mask) {
> - if (phys_id == topology_physical_package_id(i)) {
> - phys_id = -1;
> - break;
> - }
> - }
> - if (phys_id < 0)
> + if (test_and_set_bit(pkg, packages))
> continue;
> -
> - ret = uncore_cpu_prepare(cpu, phys_id);
> + /*
> + * The first online cpu of each package takes the refcounts
> + * for all other online cpus in that package.
> + */
> + ret = uncore_cpu_prepare(cpu);
> if (ret)
> - goto out;
> + return ret;
> uncore_event_init_cpu(cpu);
> + smp_call_function_single(cpu, uncore_cpu_setup, NULL, 1);
> }
> - on_each_cpu(uncore_cpu_setup, NULL, 1);
> -
> __register_cpu_notifier(&uncore_cpu_nb);
> -
> -out:
> - cpu_notifier_register_done();
> - return ret;
> + return 0;
> }
>
> static int __init intel_uncore_init(void)
> @@ -1425,17 +1342,19 @@ static int __init intel_uncore_init(void
> return ret;
> ret = uncore_cpu_init();
> if (ret)
> - goto errpci;
> + goto err;
> +
> + cpu_notifier_register_begin();
> ret = uncore_cpumask_init();
> if (ret)
> - goto errcpu;
> -
> + goto err;
> + cpu_notifier_register_done();
> return 0;
>
> -errcpu:
> +err:
> uncore_types_exit(uncore_msr_uncores);
> -errpci:
> uncore_pci_exit();
> + cpu_notifier_register_done();
> return ret;
> }
> device_initcall(intel_uncore_init);
> --- a/arch/x86/kernel/cpu/perf_event_intel_uncore.h
> +++ b/arch/x86/kernel/cpu/perf_event_intel_uncore.h
> @@ -79,9 +79,9 @@ struct intel_uncore_pmu {
> int pmu_idx;
> int func_id;
> bool registered;
> + atomic_t activeboxes;
> struct intel_uncore_type *type;
> - struct intel_uncore_box ** __percpu box;
> - struct list_head box_list;
> + struct intel_uncore_box **boxes;
> };
>
> struct intel_uncore_extra_reg {
>
>