Re: [PATCH v2 1/3] platform/x86/intel-uncore-freq: Uncore frequency control via TPMI

From: srinivas pandruvada
Date: Thu Apr 20 2023 - 18:07:03 EST


On Thu, 2023-04-20 at 14:25 +0300, Ilpo Järvinen wrote:
> On Tue, 18 Apr 2023, Srinivas Pandruvada wrote:
>
> > Implement support of uncore frequency control via TPMI (Topology
> > Aware
> > Register and PM Capsule Interface). This driver provides
> > functionality similar to
> > the current uncore frequency driver using MSRs.
> >
> > The hardware interface to read/write is basically a substitution for
> > MSRs
> > 0x620 and 0x621. There are specific MMIO offsets and bits to get/set
> > minimum and maximum uncore ratio, similar to MSRs.
> >
> > The scope of the uncore MSRs is package/die. But new generation of
> > CPUs
> > have more granular control at a cluster level. Each package/die can
> > have
> > multiple power domains, which further can have multiple clusters.
> > The
> > TPMI interface allows control at cluster level.
> >
> > The primary use case for uncore sysfs is to set maximum and minimum
> > uncore frequency to reduce power consumption or latency. The
> > current
> > uncore sysfs control is per package/die. This is enough for the
> > majority
> > of users as workload will move to different power domains as it
> > moves
> > between different CPUs.
> >
> > The current uncore sysfs provides controls at package/die level.
> > When
> > user sets maximum/minimum limits, the driver sets the same limits
> > to
> > each cluster.
> >
> > Here number of power domains = number of resources in this aux
> > device.
> > There are offsets and bits to discover number of clusters and
> > offset for
> > each cluster level controls.
> >
> > The TPMI documentation can be downloaded from:
> > https://github.com/intel/tpmi_power_management
> >
> > Signed-off-by: Srinivas Pandruvada
> > <srinivas.pandruvada@xxxxxxxxxxxxxxx>
> > Reviewed-by: Zhang Rui <rui.zhang@xxxxxxxxx>
> > Tested-by: Wendy Wang <wendy.wang@xxxxxxxxx>
> > ---
Submitted update with the suggested changes here.

Thanks,
Srinivas

> > v2
> > - Changed mmio to u8* (Hans)
> > - Not setting pd_info->uncore_base to NULL (Hans)
> > - Handling failure of devm_kcalloc() (Hans)
> > - Merged init/remove to probe/remove functions (Rui)
> > - Log when platform is NULL (Rui)
> >
> >  .../x86/intel/uncore-frequency/Kconfig        |   4 +
> >  .../x86/intel/uncore-frequency/Makefile       |   2 +
> >  .../uncore-frequency/uncore-frequency-tpmi.c  | 338
> > ++++++++++++++++++
> >  3 files changed, 344 insertions(+)
> >  create mode 100644 drivers/platform/x86/intel/uncore-
> > frequency/uncore-frequency-tpmi.c
> >
> > diff --git a/drivers/platform/x86/intel/uncore-frequency/Kconfig
> > b/drivers/platform/x86/intel/uncore-frequency/Kconfig
> > index 21b209124916..a56d55056927 100644
> > --- a/drivers/platform/x86/intel/uncore-frequency/Kconfig
> > +++ b/drivers/platform/x86/intel/uncore-frequency/Kconfig
> > @@ -6,9 +6,13 @@
> >  menu "Intel Uncore Frequency Control"
> >         depends on X86_64 || COMPILE_TEST
> >  
> > +config INTEL_UNCORE_FREQ_CONTROL_TPMI
> > +       tristate
> > +
> >  config INTEL_UNCORE_FREQ_CONTROL
> >         tristate "Intel Uncore frequency control driver"
> >         depends on X86_64
> > +       select INTEL_UNCORE_FREQ_CONTROL_TPMI if INTEL_TPMI
> >         help
> >           This driver allows control of Uncore frequency limits on
> >           supported server platforms.
> > diff --git a/drivers/platform/x86/intel/uncore-frequency/Makefile
> > b/drivers/platform/x86/intel/uncore-frequency/Makefile
> > index e0f7968e8285..08ff57492b28 100644
> > --- a/drivers/platform/x86/intel/uncore-frequency/Makefile
> > +++ b/drivers/platform/x86/intel/uncore-frequency/Makefile
> > @@ -7,3 +7,5 @@ obj-$(CONFIG_INTEL_UNCORE_FREQ_CONTROL) += intel-
> > uncore-frequency.o
> >  intel-uncore-frequency-y               := uncore-frequency.o
> >  obj-$(CONFIG_INTEL_UNCORE_FREQ_CONTROL)        += intel-uncore-
> > frequency-common.o
> >  intel-uncore-frequency-common-y                := uncore-
> > frequency-common.o
> > +obj-$(CONFIG_INTEL_UNCORE_FREQ_CONTROL_TPMI)   += intel-uncore-
> > frequency-tpmi.o
> > +intel-uncore-frequency-tpmi-y          := uncore-frequency-tpmi.o
> > diff --git a/drivers/platform/x86/intel/uncore-frequency/uncore-
> > frequency-tpmi.c b/drivers/platform/x86/intel/uncore-
> > frequency/uncore-frequency-tpmi.c
> > new file mode 100644
> > index 000000000000..5e454e9dd4a7
> > --- /dev/null
> > +++ b/drivers/platform/x86/intel/uncore-frequency/uncore-frequency-
> > tpmi.c
> > @@ -0,0 +1,338 @@
> > +// SPDX-License-Identifier: GPL-2.0-only
> > +/*
> > + * uncore-frequency-tpmi: Uncore frequency scaling using TPMI
> > + *
> > + * Copyright (c) 2023, Intel Corporation.
> > + * All Rights Reserved.
> > + *
> > + * The hardware interface to read/write is basically substitution
> > of
> > + * MSR 0x620 and 0x621.
> > + * There are specific MMIO offset and bits to get/set minimum and
> > + * maximum uncore ratio, similar to MSRs.
> > + * The scope of the uncore MSRs was package scope. But TPMI allows
> > + * new gen CPUs to have multiple uncore controls at uncore-cluster
> > + * level. Each package can have multiple power domains which
> > further
> > + * can have multiple clusters.
> > + * Here number of power domains = number of resources in this aux
> > + * device. There are offsets and bits to discover number of
> > clusters
> > + * and offset for each cluster level controls.
> > + *
> > + */
> > +
> > +#include <linux/auxiliary_bus.h>
> > +#include <linux/bitfield.h>
> > +#include <linux/bits.h>
> > +#include <linux/io.h>
> > +#include <linux/module.h>
> > +#include <linux/intel_tpmi.h>
> > +
> > +#include "uncore-frequency-common.h"
> > +
> > +#define        UNCORE_HEADER_VERSION           1
> > +#define UNCORE_HEADER_INDEX            0
> > +#define UNCORE_FABRIC_CLUSTER_OFFSET   8
> > +
> > +/* status + control + adv_ctl1 + adv_ctl2 */
> > +#define UNCORE_FABRIC_CLUSTER_SIZE     (4 * 8)
> > +
> > +#define UNCORE_STATUS_INDEX            0
> > +#define UNCORE_CONTROL_INDEX           8
> > +
> > +#define UNCORE_FREQ_KHZ_MULTIPLIER     100000
> > +
> > +struct tpmi_uncore_struct;
> > +
> > +/* Information for each cluster */
> > +struct tpmi_uncore_cluster_info {
> > +       u8 __iomem *cluster_base;
> > +       struct uncore_data uncore_data;
> > +       struct tpmi_uncore_struct *uncore_root;
> > +};
> > +
> > +/* Information for each power domain */
> > +struct tpmi_uncore_power_domain_info {
> > +       u8 __iomem *uncore_base;
> > +       int ufs_header_ver;
> > +       int cluster_count;
> > +       struct tpmi_uncore_cluster_info *cluster_infos;
> > +};
> > +
> > +/* Information for all power domains in a package */
> > +struct tpmi_uncore_struct {
> > +       int power_domain_count;
> > +       struct tpmi_uncore_power_domain_info *pd_info;
> > +       struct tpmi_uncore_cluster_info root_cluster;
> > +};
> > +
> > +#define UNCORE_GENMASK_MIN_RATIO       GENMASK_ULL(21, 15)
> > +#define UNCORE_GENMASK_MAX_RATIO       GENMASK_ULL(14, 8)
> > +
> > +/* Helper function to read MMIO offset for max/min control
> > frequency */
> > +static void read_control_freq(struct tpmi_uncore_cluster_info
> > *cluster_info,
> > +                            unsigned int *min, unsigned int *max)
> > +{
> > +       u64 control;
> > +
> > +       control = readq(cluster_info->cluster_base +
> > UNCORE_CONTROL_INDEX);
> > +       *max = FIELD_GET(UNCORE_GENMASK_MAX_RATIO, control) *
> > UNCORE_FREQ_KHZ_MULTIPLIER;
> > +       *min = FIELD_GET(UNCORE_GENMASK_MIN_RATIO, control) *
> > UNCORE_FREQ_KHZ_MULTIPLIER;
> > +}
> > +
> > +#define UNCORE_MAX_RATIO       0x7F
>
> FIELD_MAX(UNCORE_GENMASK_MAX_RATIO) ?
>
> > +
> > +/* Callback for sysfs read for max/min frequencies. Called under
> > mutex locks */
> > +static int uncore_read_control_freq(struct uncore_data *data,
> > unsigned int *min,
> > +                                   unsigned int *max)
> > +{
> > +       struct tpmi_uncore_cluster_info *cluster_info;
> > +       struct tpmi_uncore_struct *uncore_root;
> > +       int i, _min = 0, _max = 0;
> > +
> > +       cluster_info = container_of(data, struct
> > tpmi_uncore_cluster_info, uncore_data);
> > +       uncore_root = cluster_info->uncore_root;
> > +
> > +       *min = UNCORE_MAX_RATIO * UNCORE_FREQ_KHZ_MULTIPLIER;
> > +       *max = 0;
> > +
> > +       /*
> > +        * Get the max/min by looking at each cluster. Get the
> > lowest
> > +        * min and highest max.
> > +        */
> > +       for (i = 0; i < uncore_root->power_domain_count; ++i) {
> > +               int j;
> > +
> > +               for (j = 0; j < uncore_root-
> > >pd_info[i].cluster_count; ++j) {
> > +                       read_control_freq(&uncore_root-
> > >pd_info[i].cluster_infos[j],
> > +                                         &_min, &_max);
> > +                       if (*min > _min)
> > +                               *min = _min;
> > +                       if (*max < _max)
> > +                               *max = _max;
> > +               }
> > +       }
> > +
> > +       return 0;
> > +}
> > +
> > +/* Helper function to write MMIO offset for max/min control
> > frequency */
> > +static void write_control_freq(struct tpmi_uncore_cluster_info
> > *cluster_info, unsigned int input,
> > +                             unsigned int min_max)
> > +{
> > +       u64 control;
> > +
> > +       control = readq(cluster_info->cluster_base +
> > UNCORE_CONTROL_INDEX);
> > +
> > +       if (min_max) {
> > +               control &= ~UNCORE_GENMASK_MAX_RATIO;
> > +               control |= FIELD_PREP(UNCORE_GENMASK_MAX_RATIO,
> > input);
> > +       } else {
> > +               control &= ~UNCORE_GENMASK_MIN_RATIO;
> > +               control |= FIELD_PREP(UNCORE_GENMASK_MIN_RATIO,
> > input);
> > +       }
> > +
> > +       writeq(control, (cluster_info->cluster_base +
> > UNCORE_CONTROL_INDEX));
> > +}
> > +
> > +/* Callback for sysfs write for max/min frequencies. Called under
> > mutex locks */
> > +static int uncore_write_control_freq(struct uncore_data *data,
> > unsigned int input,
> > +                                    unsigned int min_max)
> > +{
> > +       struct tpmi_uncore_cluster_info *cluster_info;
> > +       struct tpmi_uncore_struct *uncore_root;
> > +       int i;
> > +
> > +       input /= UNCORE_FREQ_KHZ_MULTIPLIER;
> > +       if (!input || input > UNCORE_MAX_RATIO)
> > +               return -EINVAL;
> > +
> > +       cluster_info = container_of(data, struct
> > tpmi_uncore_cluster_info, uncore_data);
> > +       uncore_root = cluster_info->uncore_root;
> > +
> > +       /* Update each cluster in a package */
> > +       for (i = 0; i < uncore_root->power_domain_count; ++i) {
> > +               int j;
> > +
> > +               for (j = 0; j < uncore_root-
> > >pd_info[i].cluster_count; ++j)
> > +                       write_control_freq(&uncore_root-
> > >pd_info[i].cluster_infos[j],
> > +                                          input, min_max);
> > +       }
> > +
> > +       return 0;
> > +}
> > +
> > +/* Callback for sysfs read for the current uncore frequency.
> > Called under mutex locks */
> > +static int uncore_read_freq(struct uncore_data *data, unsigned int
> > *freq)
> > +{
> > +       return -ENODATA;
> > +}
> > +
> > +#define UNCORE_GENMASK_VERSION                 GENMASK_ULL(7, 0)
>
> GENMASK makes the name a little bit confusing (what is "uncore genmask"
> ??).
> Either drop it entirely (my preference) or add _MASK to the end
> instead.
>
> > +#define UNCORE_LOCAL_FABRIC_CLUSTER_ID_MASK    GENMASK_ULL(15, 8)
> > +#define
> > UNCORE_CLUSTER_OFF_MASK                        GENMASK_ULL(7, 0)
> > +#define UNCORE_MAX_CLUSTER_PER_DOMAIN          8
> > +
> > +static int uncore_probe(struct auxiliary_device *auxdev, const
> > struct auxiliary_device_id *id)
> > +{
> > +       struct intel_tpmi_plat_info *plat_info;
> > +       struct tpmi_uncore_struct *tpmi_uncore;
> > +       int ret, i, pkg = 0;
> > +       int num_resources;
> > +
> > +       /* Get number of power domains, which is equal to number of
> > resources */
> > +       num_resources = tpmi_get_resource_count(auxdev);
> > +       if (!num_resources)
> > +               return -EINVAL;
> > +
> > +       /* Register callbacks to uncore core */
> > +       ret = uncore_freq_common_init(uncore_read_control_freq,
> > uncore_write_control_freq,
> > +                                     uncore_read_freq);
> > +       if (ret)
> > +               return ret;
> > +
> > +       /* Allocate uncore instance per package */
> > +       tpmi_uncore = devm_kzalloc(&auxdev->dev,
> > sizeof(*tpmi_uncore), GFP_KERNEL);
> > +       if (!tpmi_uncore) {
> > +               ret = -ENOMEM;
> > +               goto err_rem_common;
> > +       }
> > +
> > +       /* Allocate memory for all power domains in a package */
> > +       tpmi_uncore->pd_info = devm_kcalloc(&auxdev->dev,
> > num_resources,
> > +                                           sizeof(*tpmi_uncore-
> > >pd_info),
> > +                                           GFP_KERNEL);
> > +       if (!tpmi_uncore->pd_info) {
> > +               ret = -ENOMEM;
> > +               goto err_rem_common;
> > +       }
> > +
> > +       tpmi_uncore->power_domain_count = num_resources;
> > +
> > +       /* Get the package ID from the TPMI core */
> > +       plat_info = tpmi_get_platform_data(auxdev);
> > +       if (plat_info)
> > +               pkg = plat_info->package_id;
> > +       else
> > +               dev_info(&auxdev->dev, "Platform information is
> > NULL\n");
> > +
> > +       for (i = 0; i < num_resources; ++i) {
> > +               struct tpmi_uncore_power_domain_info *pd_info;
> > +               struct resource *res;
> > +               u64 cluster_offset;
> > +               u8 cluster_mask;
> > +               int mask, j;
> > +               u64 header;
> > +
> > +               res = tpmi_get_resource_at_index(auxdev, i);
> > +               if (!res)
> > +                       continue;
> > +
> > +               pd_info = &tpmi_uncore->pd_info[i];
> > +
> > +               pd_info->uncore_base =
> > devm_ioremap_resource(&auxdev->dev, res);
> > +               if (IS_ERR(pd_info->uncore_base)) {
> > +                       ret = PTR_ERR(pd_info->uncore_base);
> > +                       goto err_rem_common;
> > +               }
> > +
> > +               /* Check for version and skip this resource if
> > there is mismatch */
> > +               header = readq(pd_info->uncore_base);
> > +               pd_info->ufs_header_ver = header &
> > UNCORE_GENMASK_VERSION;
> > +               if (pd_info->ufs_header_ver !=
> > UNCORE_HEADER_VERSION) {
> > +                       dev_info(&auxdev->dev, "Uncore: Unsupported
> > version:%d\n",
> > +                               pd_info->ufs_header_ver);
> > +                       continue;
> > +               }
> > +
> > +               /* Get Cluster ID Mask */
> > +               cluster_mask =
> > FIELD_GET(UNCORE_LOCAL_FABRIC_CLUSTER_ID_MASK, header);
> > +               if (!cluster_mask) {
> > +                       dev_info(&auxdev->dev, "Uncore: Invalid
> > cluster mask:%x\n", cluster_mask);
> > +                       continue;
> > +               }
> > +
> > +               /* Find out number of clusters in this resource */
> > +               mask = 0x01;
> > +               for (j = 0; j < UNCORE_MAX_CLUSTER_PER_DOMAIN; ++j)
> > {
> > +                       if (cluster_mask & mask)
> > +                               pd_info->cluster_count++;
> > +                       mask <<= 1;
> > +               }
>
> pd_info->cluster_count = hweight8(cluster_mask);
>
> ?
>
>