Re: [PATCH 2/2] RAS: Introduce the FRU Memory Poison Manager

From: Yazen Ghannam
Date: Wed Feb 14 2024 - 10:34:26 EST


On 2/14/2024 7:02 AM, Borislav Petkov wrote:
On Tue, Feb 13, 2024 at 09:35:16PM -0600, Yazen Ghannam wrote:
+static unsigned int get_cpu_for_fru_num(unsigned int i)
+{
+ unsigned int cpu = 0;
+
+ /* Should there be more robust error handling if none found? */
+ for_each_online_cpu(cpu) {

You need to block CPU hotplug operations before iterating over online
CPUs: cpus_read_lock/unlock.


Okay.
+ if (topology_physical_package_id(cpu) == i)
+ return cpu;
+ }
+
+ return cpu;
+}

Fold this one into its single callsite.


Ack.
+
+static void init_fmps(void)
+{
+ struct fru_rec *rec;
+ unsigned int i, cpu;
+
+ for_each_fru(i, rec) {
+ cpu = get_cpu_for_fru_num(i);
+ set_fmp_fields(get_fmp(rec), cpu);
+ }
+}
+
+static int get_system_info(void)
+{
+ u8 model = boot_cpu_data.x86_model;

No need for that local var - just use boot_cpu_data.x86_model.


Ack.
+ /* Only load on MI300A systems for now. */
+ if (!(model >= 0x90 && model <= 0x9f))
+ return -ENODEV;
+
+ if (!cpu_feature_enabled(X86_FEATURE_AMD_PPIN)) {
+ pr_debug("PPIN feature not available");
+ return -ENODEV;
+ }
+
+ /* Use CPU Package (Socket) as FRU for MI300 systems. */
+ max_nr_fru = topology_max_packages();
+ if (!max_nr_fru)
+ return -ENODEV;
+
+ if (!max_nr_entries)
+ max_nr_entries = FMPM_DEFAULT_MAX_NR_ENTRIES;
+
+ max_rec_len = sizeof(struct fru_rec);
+ max_rec_len += sizeof(struct cper_fru_poison_desc) * max_nr_entries;
+
+ pr_debug("max_nr_fru=%u max_nr_entries=%u, max_rec_len=%lu",
+ max_nr_fru, max_nr_entries, max_rec_len);
+ return 0;
+}
+
+static void deallocate_records(void)

free_records


Ack.
+{
+ struct fru_rec *rec;
+ int i;
+
+ for_each_fru(i, rec)
+ kfree(rec);
+
+ kfree(fru_records);
+}
+
+static int allocate_records(void)
+{
+ int i, ret = 0;
+
+ fru_records = kcalloc(max_nr_fru, sizeof(struct fru_rec *), GFP_KERNEL);
+ if (!fru_records) {
+ ret = -ENOMEM;
+ goto out;
+ }
+
+ for (i = 0; i < max_nr_fru; i++) {
+ fru_records[i] = kzalloc(max_rec_len, GFP_KERNEL);
+ if (!fru_records[i]) {
+ ret = -ENOMEM;
+ goto out_free;
+ }
+ }
+
+ return ret;
+
+out_free:
+ for (; i >= 0; i--)
+ kfree(fru_records[i]);
+
+ kfree(fru_records);
+out:
+ return ret;
+}
+
+static const struct x86_cpu_id fmpm_cpuids[] = {
+ X86_MATCH_VENDOR_FAM(AMD, 0x19, NULL),

This is why this should depend on AMD in Kconfig.


Okay.

I was also thinking that MODULE_DEVICE_TABLE shouldn't be used. Not all
MI300-based systems will need or can use this module. And it does depend
on specific platform configurations.

So the module should not autoload. Users will need to manually load it if
they know that it's usable on their platform. We can keep the cpuid[] and
model checks just for extra safety.

Thanks,
Yazen