Re: [PATCH 2/2] RAS: Introduce the FRU Memory Poison Manager

From: Borislav Petkov
Date: Wed Feb 14 2024 - 07:02:45 EST


On Tue, Feb 13, 2024 at 09:35:16PM -0600, Yazen Ghannam wrote:
> +static unsigned int get_cpu_for_fru_num(unsigned int i)
> +{
> + unsigned int cpu = 0;
> +
> + /* Should there be more robust error handling if none found? */
> + for_each_online_cpu(cpu) {

You need to block CPU hotplug operations before iterating over online
CPUs: cpus_read_lock/unlock.

> + if (topology_physical_package_id(cpu) == i)
> + return cpu;
> + }
> +
> + return cpu;
> +}

Fold this one into its single callsite.

> +
> +static void init_fmps(void)
> +{
> + struct fru_rec *rec;
> + unsigned int i, cpu;
> +
> + for_each_fru(i, rec) {
> + cpu = get_cpu_for_fru_num(i);
> + set_fmp_fields(get_fmp(rec), cpu);
> + }
> +}
> +
> +static int get_system_info(void)
> +{
> + u8 model = boot_cpu_data.x86_model;

No need for that local var - just use boot_cpu_data.x86_model.

> + /* Only load on MI300A systems for now. */
> + if (!(model >= 0x90 && model <= 0x9f))
> + return -ENODEV;
> +
> + if (!cpu_feature_enabled(X86_FEATURE_AMD_PPIN)) {
> + pr_debug("PPIN feature not available");
> + return -ENODEV;
> + }
> +
> + /* Use CPU Package (Socket) as FRU for MI300 systems. */
> + max_nr_fru = topology_max_packages();
> + if (!max_nr_fru)
> + return -ENODEV;
> +
> + if (!max_nr_entries)
> + max_nr_entries = FMPM_DEFAULT_MAX_NR_ENTRIES;
> +
> + max_rec_len = sizeof(struct fru_rec);
> + max_rec_len += sizeof(struct cper_fru_poison_desc) * max_nr_entries;
> +
> + pr_debug("max_nr_fru=%u max_nr_entries=%u, max_rec_len=%lu",
> + max_nr_fru, max_nr_entries, max_rec_len);
> + return 0;
> +}
> +
> +static void deallocate_records(void)

free_records

> +{
> + struct fru_rec *rec;
> + int i;
> +
> + for_each_fru(i, rec)
> + kfree(rec);
> +
> + kfree(fru_records);
> +}
> +
> +static int allocate_records(void)
> +{
> + int i, ret = 0;
> +
> + fru_records = kcalloc(max_nr_fru, sizeof(struct fru_rec *), GFP_KERNEL);
> + if (!fru_records) {
> + ret = -ENOMEM;
> + goto out;
> + }
> +
> + for (i = 0; i < max_nr_fru; i++) {
> + fru_records[i] = kzalloc(max_rec_len, GFP_KERNEL);
> + if (!fru_records[i]) {
> + ret = -ENOMEM;
> + goto out_free;
> + }
> + }
> +
> + return ret;
> +
> +out_free:
> + for (; i >= 0; i--)
> + kfree(fru_records[i]);
> +
> + kfree(fru_records);
> +out:
> + return ret;
> +}
> +
> +static const struct x86_cpu_id fmpm_cpuids[] = {
> + X86_MATCH_VENDOR_FAM(AMD, 0x19, NULL),

This is why this should depend on AMD in Kconfig.

--
Regards/Gruss,
Boris.

https://people.kernel.org/tglx/notes-about-netiquette