[PATCH v2 1/2] platform/x86: Add support for Uncore frequency control

From: Srinivas Pandruvada
Date: Sun Jan 12 2020 - 22:59:36 EST


Some server users set limits on the uncore frequency using MSR 620H, while
running latency sensitive workloads. Here uncore frequency controls
RING/LLC(last-level cache) clocks.

But MSR control is not always possible from the user space, so this driver
provides a sysfs interface to set max and min frequency limits. This MSR
620H is a die scoped in multi-die system or package scoped in non multi-die
systems.

When this driver is loaded, a new directory is created under
/sys/devices/system/cpu.

For example on a two package Skylake server:
$cd /sys/devices/system/cpu/intel_uncore_frequency

$ls
package_00_die_00 package_01_die_00

$ls package_00_die_00
max_freq_khz min_freq_khz initial_max_freq_khz
initial_min_freq_khz

$grep . *
max_freq_khz:2400000
min_freq_khz:1200000
initial_max_freq_khz:2400000
initial_min_freq_khz:1200000

Here, initial_max_freq_khz and initial_min_freq_khz are read only
attributes to show power up or initial values of max and min frequencies
respectively. Other attributes are read-write, so that users can modify.

Signed-off-by: Srinivas Pandruvada <srinivas.pandruvada@xxxxxxxxxxxxxxx>
---
v2:
- Len Brown suggested to chnage "power_up" to "initial" in the name.
- Split the documentation patch to another patch to merge via different
tree

drivers/platform/x86/Kconfig | 11 +
drivers/platform/x86/Makefile | 1 +
drivers/platform/x86/intel-uncore-frequency.c | 434 ++++++++++++++++++
3 files changed, 446 insertions(+)
create mode 100644 drivers/platform/x86/intel-uncore-frequency.c

diff --git a/drivers/platform/x86/Kconfig b/drivers/platform/x86/Kconfig
index 27d5b40fb717..6013c3b96cfd 100644
--- a/drivers/platform/x86/Kconfig
+++ b/drivers/platform/x86/Kconfig
@@ -1337,6 +1337,17 @@ config PCENGINES_APU2
To compile this driver as a module, choose M here: the module
will be called pcengines-apuv2.

+config INTEL_UNCORE_FREQ_CONTROL
+ tristate "Intel Uncore frequency control driver"
+ depends on X86_64
+ help
+ This driver allows control of uncore frequency limits on
+ supported server platforms.
+ Uncore frequency controls RING/LLC (last-level cache) clocks.
+
+ To compile this driver as a module, choose M here: the module
+ will be called intel-uncore-frequency.
+
source "drivers/platform/x86/intel_speed_select_if/Kconfig"

config SYSTEM76_ACPI
diff --git a/drivers/platform/x86/Makefile b/drivers/platform/x86/Makefile
index 42d85a00be4e..3747b1f07cf1 100644
--- a/drivers/platform/x86/Makefile
+++ b/drivers/platform/x86/Makefile
@@ -105,3 +105,4 @@ obj-$(CONFIG_INTEL_ATOMISP2_PM) += intel_atomisp2_pm.o
obj-$(CONFIG_PCENGINES_APU2) += pcengines-apuv2.o
obj-$(CONFIG_INTEL_SPEED_SELECT_INTERFACE) += intel_speed_select_if/
obj-$(CONFIG_SYSTEM76_ACPI) += system76_acpi.o
+obj-$(CONFIG_INTEL_UNCORE_FREQ_CONTROL) += intel-uncore-frequency.o
diff --git a/drivers/platform/x86/intel-uncore-frequency.c b/drivers/platform/x86/intel-uncore-frequency.c
new file mode 100644
index 000000000000..b62386873d73
--- /dev/null
+++ b/drivers/platform/x86/intel-uncore-frequency.c
@@ -0,0 +1,434 @@
+// SPDX-License-Identifier: GPL-2.0
+/*
+ * Intel Uncore Frequency Setting
+ * Copyright (c) 2019, Intel Corporation.
+ * All rights reserved.
+ *
+ * Provide interface to set MSR 620 at a granularity of per die. On CPU online,
+ * one control CPU is identified per die to read/write limit. This control CPU
+ * is changed, if the CPU state is changed to offline. When the last CPU is
+ * offline in a die then remove the sysfs object for that die.
+ * The majority of actual code is related to sysfs create and read/write
+ * attributes.
+ *
+ * Author: Srinivas Pandruvada <srinivas.pandruvada@xxxxxxxxxxxxxxx>
+ */
+
+#include <linux/cpu.h>
+#include <linux/module.h>
+#include <linux/slab.h>
+#include <linux/suspend.h>
+#include <asm/cpu_device_id.h>
+#include <asm/intel-family.h>
+
+#define MSR_UNCORE_RATIO_LIMIT 0x620
+#define UNCORE_FREQ_KHZ_MULTIPLIER 100000
+
+/**
+ * struct uncore_data - Encapsulate all uncore data
+ * @stored_uncore_data: Last user changed MSR 620 value, which will be restored
+ * on system resume.
+ * @initial_min_freq_khz: Sampled minimum uncore frequency at driver init
+ * @initial_max_freq_khz: Sampled maximum uncore frequency at driver init
+ * @control_cpu: Designated CPU for a die to read/write
+ * @valid: Mark the data valid/invalid
+ *
+ * This structure is used to encapsulate all data related to uncore sysfs
+ * settings for a die/package.
+ */
+struct uncore_data {
+ struct kobject kobj;
+ u64 stored_uncore_data;
+ u32 initial_min_freq_khz;
+ u32 initial_max_freq_khz;
+ int control_cpu;
+ bool valid;
+};
+
+#define to_uncore_data(a) container_of(a, struct uncore_data, kobj)
+
+/* Max instances for uncore data, one for each die */
+static int uncore_max_entries __read_mostly;
+/* Storage for uncore data for all instances */
+static struct uncore_data *uncore_instances;
+/* Root of the all uncore sysfs kobjs */
+struct kobject uncore_root_kobj;
+/* Stores the CPU mask of the target CPUs to use during uncore read/write */
+static cpumask_t uncore_cpu_mask;
+/* CPU online callback register instance */
+static enum cpuhp_state uncore_hp_state __read_mostly;
+/* Mutex to control all mutual exclusions */
+static DEFINE_MUTEX(uncore_lock);
+
+struct uncore_attr {
+ struct attribute attr;
+ ssize_t (*show)(struct kobject *kobj,
+ struct attribute *attr, char *buf);
+ ssize_t (*store)(struct kobject *kobj,
+ struct attribute *attr, const char *c, ssize_t count);
+};
+
+#define define_one_uncore_ro(_name) \
+static struct uncore_attr _name = \
+__ATTR(_name, 0444, show_##_name, NULL)
+
+#define define_one_uncore_rw(_name) \
+static struct uncore_attr _name = \
+__ATTR(_name, 0644, show_##_name, store_##_name)
+
+#define show_uncore_data(member_name) \
+ static ssize_t show_##member_name(struct kobject *kobj, \
+ struct attribute *attr, \
+ char *buf) \
+ { \
+ struct uncore_data *data = to_uncore_data(kobj); \
+ return scnprintf(buf, PAGE_SIZE, "%u\n", \
+ data->member_name); \
+ } \
+ define_one_uncore_ro(member_name)
+
+show_uncore_data(initial_min_freq_khz);
+show_uncore_data(initial_max_freq_khz);
+
+/* Common function to read MSR 0x620 and read min/max */
+static int uncore_read_ratio(struct uncore_data *data, unsigned int *min,
+ unsigned int *max)
+{
+ u64 cap;
+ int ret;
+
+ ret = rdmsrl_on_cpu(data->control_cpu, MSR_UNCORE_RATIO_LIMIT, &cap);
+ if (ret)
+ return ret;
+
+ *max = (cap & 0x7F) * UNCORE_FREQ_KHZ_MULTIPLIER;
+ *min = ((cap & GENMASK(14, 8)) >> 8) * UNCORE_FREQ_KHZ_MULTIPLIER;
+
+ return 0;
+}
+
+/* Common function to set min/max ratios to be used by sysfs callbacks */
+static int uncore_write_ratio(struct uncore_data *data, unsigned int input,
+ int set_max)
+{
+ int ret;
+ u64 cap;
+
+ mutex_lock(&uncore_lock);
+
+ input /= UNCORE_FREQ_KHZ_MULTIPLIER;
+ if (!input || input > 0x7F) {
+ ret = -EINVAL;
+ goto finish_write;
+ }
+
+ rdmsrl(MSR_UNCORE_RATIO_LIMIT, cap);
+ if (set_max) {
+ cap &= ~0x7F;
+ cap |= input;
+ } else {
+ cap &= ~GENMASK(14, 8);
+ cap |= (input << 8);
+ }
+
+ ret = wrmsrl_on_cpu(data->control_cpu, MSR_UNCORE_RATIO_LIMIT, cap);
+ if (ret)
+ goto finish_write;
+
+ data->stored_uncore_data = cap;
+
+finish_write:
+ mutex_unlock(&uncore_lock);
+
+ return ret;
+}
+
+static ssize_t store_min_max_freq_khz(struct kobject *kobj,
+ struct attribute *attr,
+ const char *buf, ssize_t count,
+ int min_max)
+{
+ struct uncore_data *data = to_uncore_data(kobj);
+ unsigned int input;
+
+ if (kstrtouint(buf, 10, &input))
+ return -EINVAL;
+
+ uncore_write_ratio(data, input, min_max);
+
+ return count;
+}
+
+static ssize_t show_min_max_freq_khz(struct kobject *kobj,
+ struct attribute *attr,
+ char *buf, int min_max)
+{
+ struct uncore_data *data = to_uncore_data(kobj);
+ unsigned int min, max;
+ int ret;
+
+ mutex_lock(&uncore_lock);
+ ret = uncore_read_ratio(data, &min, &max);
+ mutex_unlock(&uncore_lock);
+ if (ret)
+ return ret;
+
+ if (min_max)
+ return sprintf(buf, "%u\n", max);
+
+ return sprintf(buf, "%u\n", min);
+}
+
+#define store_uncore_min_max(name, min_max) \
+ static ssize_t store_##name(struct kobject *kobj, \
+ struct attribute *attr, \
+ const char *buf, ssize_t count) \
+ { \
+ \
+ return store_min_max_freq_khz(kobj, attr, buf, count, \
+ min_max); \
+ }
+
+#define show_uncore_min_max(name, min_max) \
+ static ssize_t show_##name(struct kobject *kobj, \
+ struct attribute *attr, char *buf) \
+ { \
+ \
+ return show_min_max_freq_khz(kobj, attr, buf, min_max); \
+ }
+
+store_uncore_min_max(min_freq_khz, 0);
+store_uncore_min_max(max_freq_khz, 1);
+
+show_uncore_min_max(min_freq_khz, 0);
+show_uncore_min_max(max_freq_khz, 1);
+
+define_one_uncore_rw(min_freq_khz);
+define_one_uncore_rw(max_freq_khz);
+
+static struct attribute *uncore_attrs[] = {
+ &initial_min_freq_khz.attr,
+ &initial_max_freq_khz.attr,
+ &max_freq_khz.attr,
+ &min_freq_khz.attr,
+ NULL
+};
+
+static struct kobj_type uncore_ktype = {
+ .sysfs_ops = &kobj_sysfs_ops,
+ .default_attrs = uncore_attrs,
+};
+
+static struct kobj_type uncore_root_ktype = {
+ .sysfs_ops = &kobj_sysfs_ops,
+};
+
+/* Caller provides protection */
+static struct uncore_data *uncore_get_instance(unsigned int cpu)
+{
+ int id = topology_logical_die_id(cpu);
+
+ if (id >= 0 && id < uncore_max_entries)
+ return &uncore_instances[id];
+
+ return NULL;
+}
+
+static void uncore_add_die_entry(int cpu)
+{
+ struct uncore_data *data;
+
+ mutex_lock(&uncore_lock);
+ data = uncore_get_instance(cpu);
+ if (!data) {
+ mutex_unlock(&uncore_lock);
+ return;
+ }
+
+ if (data->valid) {
+ /* control cpu changed */
+ data->control_cpu = cpu;
+ } else {
+ char str[64];
+ int ret;
+
+ memset(data, 0, sizeof(*data));
+ sprintf(str, "package_%02d_die_%02d",
+ topology_physical_package_id(cpu),
+ topology_die_id(cpu));
+
+ uncore_read_ratio(data, &data->initial_min_freq_khz,
+ &data->initial_max_freq_khz);
+
+ ret = kobject_init_and_add(&data->kobj, &uncore_ktype,
+ &uncore_root_kobj, str);
+ if (!ret) {
+ data->control_cpu = cpu;
+ data->valid = true;
+ }
+ }
+ mutex_unlock(&uncore_lock);
+}
+
+/* Last CPU in this die is offline, so remove sysfs entries */
+static void uncore_remove_die_entry(int cpu)
+{
+ struct uncore_data *data;
+
+ mutex_lock(&uncore_lock);
+ data = uncore_get_instance(cpu);
+ if (data) {
+ kobject_put(&data->kobj);
+ data->control_cpu = -1;
+ data->valid = false;
+ }
+ mutex_unlock(&uncore_lock);
+}
+
+static int uncore_event_cpu_online(unsigned int cpu)
+{
+ int target;
+
+ /* Check if there is an online cpu in the package for uncore MSR */
+ target = cpumask_any_and(&uncore_cpu_mask, topology_die_cpumask(cpu));
+ if (target < nr_cpu_ids)
+ return 0;
+
+ /* Use this CPU on this die as a control CPU */
+ cpumask_set_cpu(cpu, &uncore_cpu_mask);
+ uncore_add_die_entry(cpu);
+
+ return 0;
+}
+
+static int uncore_event_cpu_offline(unsigned int cpu)
+{
+ int target;
+
+ /* Check if existing cpu is used for uncore MSRs */
+ if (!cpumask_test_and_clear_cpu(cpu, &uncore_cpu_mask))
+ return 0;
+
+ /* Find a new cpu to set uncore MSR */
+ target = cpumask_any_but(topology_die_cpumask(cpu), cpu);
+
+ if (target < nr_cpu_ids) {
+ cpumask_set_cpu(target, &uncore_cpu_mask);
+ uncore_add_die_entry(target);
+ } else {
+ uncore_remove_die_entry(cpu);
+ }
+
+ return 0;
+}
+
+static int uncore_pm_notify(struct notifier_block *nb, unsigned long mode,
+ void *_unused)
+{
+ int cpu;
+
+ switch (mode) {
+ case PM_POST_HIBERNATION:
+ case PM_POST_RESTORE:
+ case PM_POST_SUSPEND:
+ for_each_cpu(cpu, &uncore_cpu_mask) {
+ struct uncore_data *data;
+ int ret;
+
+ data = uncore_get_instance(cpu);
+ if (!data || !data->valid || !data->stored_uncore_data)
+ continue;
+
+ ret = wrmsrl_on_cpu(cpu, MSR_UNCORE_RATIO_LIMIT,
+ data->stored_uncore_data);
+ if (ret)
+ return ret;
+ }
+ break;
+ default:
+ break;
+ }
+ return 0;
+}
+
+static struct notifier_block uncore_pm_nb = {
+ .notifier_call = uncore_pm_notify,
+};
+
+#define ICPU(model) { X86_VENDOR_INTEL, 6, model, X86_FEATURE_ANY, }
+
+static const struct x86_cpu_id intel_uncore_cpu_ids[] = {
+ ICPU(INTEL_FAM6_BROADWELL_G),
+ ICPU(INTEL_FAM6_BROADWELL_X),
+ ICPU(INTEL_FAM6_BROADWELL_D),
+ ICPU(INTEL_FAM6_SKYLAKE_X),
+ ICPU(INTEL_FAM6_ICELAKE_X),
+ ICPU(INTEL_FAM6_ICELAKE_D),
+ {}
+};
+
+static int __init intel_uncore_init(void)
+{
+ const struct x86_cpu_id *id;
+ int ret;
+
+ id = x86_match_cpu(intel_uncore_cpu_ids);
+ if (!id)
+ return -ENODEV;
+
+ uncore_max_entries = topology_max_packages() *
+ topology_max_die_per_package();
+ uncore_instances = kcalloc(uncore_max_entries,
+ sizeof(*uncore_instances), GFP_KERNEL);
+ if (!uncore_instances)
+ return -ENOMEM;
+
+ ret = kobject_init_and_add(&uncore_root_kobj, &uncore_root_ktype,
+ &cpu_subsys.dev_root->kobj,
+ "intel_uncore_frequency");
+ if (ret)
+ goto err_free;
+
+ ret = cpuhp_setup_state(CPUHP_AP_ONLINE_DYN,
+ "platform/x86/uncore-freq:online",
+ uncore_event_cpu_online,
+ uncore_event_cpu_offline);
+ if (ret < 0)
+ goto err_rem_kobj;
+
+ uncore_hp_state = ret;
+
+ ret = register_pm_notifier(&uncore_pm_nb);
+ if (ret)
+ goto err_rem_state;
+
+ return 0;
+
+err_rem_state:
+ cpuhp_remove_state(uncore_hp_state);
+err_rem_kobj:
+ kobject_put(&uncore_root_kobj);
+err_free:
+ kfree(uncore_instances);
+
+ return ret;
+}
+module_init(intel_uncore_init)
+
+static void __exit intel_uncore_exit(void)
+{
+ int i;
+
+ unregister_pm_notifier(&uncore_pm_nb);
+ cpuhp_remove_state(uncore_hp_state);
+ for (i = 0; i < uncore_max_entries; ++i) {
+ if (uncore_instances[i].valid)
+ kobject_put(&uncore_instances[i].kobj);
+ }
+ kobject_put(&uncore_root_kobj);
+ kfree(uncore_instances);
+}
+module_exit(intel_uncore_exit)
+
+MODULE_LICENSE("GPL v2");
+MODULE_DESCRIPTION("Intel Uncore Frequency Limits Driver");
--
2.24.1