[PATCH v2 2/2] x86/MCE/AMD, EDAC/mce_amd: Support non-uniform MCA bank type enumeration

From: Yazen Ghannam
Date: Thu Dec 16 2021 - 11:29:32 EST


AMD systems currently lay out MCA bank types such that the type of bank
number "i" is either the same across all CPUs or is Reserved/Read-as-Zero.

For example:

Bank # | CPUx | CPUy
0 LS LS
1 RAZ UMC
2 CS CS
3 SMU RAZ

Future AMD systems will lay out MCA bank types such that the type of
bank number "i" may be different across CPUs.

For example:

Bank # | CPUx | CPUy
0 LS LS
1 RAZ UMC
2 CS NBIO
3 SMU RAZ

Change the structures that cache MCA bank types to be per-CPU and update
smca_get_bank_type() to handle this change.

Move some SMCA-specific structures to amd.c from mce.h, since they no
longer need to be global.

Break out the "count" for bank types from struct smca_hwid, since this
should provide a per-CPU count rather than a system-wide count.

Apply the "const" qualifier to the struct smca_hwid_mcatypes array. The
values in this array should not change at runtime.

Signed-off-by: Yazen Ghannam <yazen.ghannam@xxxxxxx>
---
Link:
https://lkml.kernel.org/r/20211203020017.728440-4-yazen.ghannam@xxxxxxx

v1->v2:
* Rework due to dropping patch 1 from v1.

arch/x86/include/asm/mce.h | 18 +-------
arch/x86/kernel/cpu/mce/amd.c | 59 +++++++++++++++----------
drivers/edac/mce_amd.c | 11 +----
drivers/gpu/drm/amd/amdgpu/amdgpu_ras.c | 2 +-
4 files changed, 39 insertions(+), 51 deletions(-)

diff --git a/arch/x86/include/asm/mce.h b/arch/x86/include/asm/mce.h
index 52d2b35cac2d..cc73061e7255 100644
--- a/arch/x86/include/asm/mce.h
+++ b/arch/x86/include/asm/mce.h
@@ -329,22 +329,6 @@ enum smca_bank_types {
N_SMCA_BANK_TYPES
};

-#define HWID_MCATYPE(hwid, mcatype) (((hwid) << 16) | (mcatype))
-
-struct smca_hwid {
- unsigned int bank_type; /* Use with smca_bank_types for easy indexing. */
- u32 hwid_mcatype; /* (hwid,mcatype) tuple */
- u8 count; /* Number of instances. */
-};
-
-struct smca_bank {
- struct smca_hwid *hwid;
- u32 id; /* Value of MCA_IPID[InstanceId]. */
- u8 sysfs_id; /* Value used for sysfs name. */
-};
-
-extern struct smca_bank smca_banks[MAX_NR_BANKS];
-
extern const char *smca_get_long_name(enum smca_bank_types t);
extern bool amd_mce_is_memory_error(struct mce *m);

@@ -352,7 +336,7 @@ extern int mce_threshold_create_device(unsigned int cpu);
extern int mce_threshold_remove_device(unsigned int cpu);

void mce_amd_feature_init(struct cpuinfo_x86 *c);
-enum smca_bank_types smca_get_bank_type(unsigned int bank);
+enum smca_bank_types smca_get_bank_type(unsigned int cpu, unsigned int bank);
#else

static inline int mce_threshold_create_device(unsigned int cpu) { return 0; };
diff --git a/arch/x86/kernel/cpu/mce/amd.c b/arch/x86/kernel/cpu/mce/amd.c
index ffe8ea8bc877..345ccdf329bc 100644
--- a/arch/x86/kernel/cpu/mce/amd.c
+++ b/arch/x86/kernel/cpu/mce/amd.c
@@ -71,6 +71,22 @@ static const char * const smca_umc_block_names[] = {
"misc_umc"
};

+#define HWID_MCATYPE(hwid, mcatype) (((hwid) << 16) | (mcatype))
+
+struct smca_hwid {
+ unsigned int bank_type; /* Use with smca_bank_types for easy indexing. */
+ u32 hwid_mcatype; /* (hwid,mcatype) tuple */
+};
+
+struct smca_bank {
+ const struct smca_hwid *hwid;
+ u32 id; /* Value of MCA_IPID[InstanceId]. */
+ u8 sysfs_id; /* Value used for sysfs name. */
+};
+
+static DEFINE_PER_CPU_READ_MOSTLY(struct smca_bank[MAX_NR_BANKS], smca_banks);
+static DEFINE_PER_CPU_READ_MOSTLY(u8[N_SMCA_BANK_TYPES], smca_bank_counts);
+
struct smca_bank_name {
const char *name; /* Short name for sysfs */
const char *long_name; /* Long name for pretty-printing */
@@ -126,14 +142,14 @@ const char *smca_get_long_name(enum smca_bank_types t)
}
EXPORT_SYMBOL_GPL(smca_get_long_name);

-enum smca_bank_types smca_get_bank_type(unsigned int bank)
+enum smca_bank_types smca_get_bank_type(unsigned int cpu, unsigned int bank)
{
struct smca_bank *b;

if (bank >= MAX_NR_BANKS)
return N_SMCA_BANK_TYPES;

- b = &smca_banks[bank];
+ b = &per_cpu(smca_banks, cpu)[bank];
if (!b->hwid)
return N_SMCA_BANK_TYPES;

@@ -141,7 +157,7 @@ enum smca_bank_types smca_get_bank_type(unsigned int bank)
}
EXPORT_SYMBOL_GPL(smca_get_bank_type);

-static struct smca_hwid smca_hwid_mcatypes[] = {
+static const struct smca_hwid smca_hwid_mcatypes[] = {
/* { bank_type, hwid_mcatype } */

/* Reserved type */
@@ -219,9 +235,6 @@ static struct smca_hwid smca_hwid_mcatypes[] = {
{ SMCA_GMI_PHY, HWID_MCATYPE(0x269, 0x0) },
};

-struct smca_bank smca_banks[MAX_NR_BANKS];
-EXPORT_SYMBOL_GPL(smca_banks);
-
/*
* In SMCA enabled processors, we can have multiple banks for a given IP type.
* So to define a unique name for each bank, we use a temp c-string to append
@@ -277,8 +290,9 @@ static void smca_set_misc_banks_map(unsigned int bank, unsigned int cpu)

static void smca_configure(unsigned int bank, unsigned int cpu)
{
+ u8 *bank_counts = this_cpu_ptr(smca_bank_counts);
+ const struct smca_hwid *s_hwid;
unsigned int i, hwid_mcatype;
- struct smca_hwid *s_hwid;
u32 high, low;
u32 smca_config = MSR_AMD64_SMCA_MCx_CONFIG(bank);

@@ -314,10 +328,6 @@ static void smca_configure(unsigned int bank, unsigned int cpu)

smca_set_misc_banks_map(bank, cpu);

- /* Return early if this bank was already initialized. */
- if (smca_banks[bank].hwid && smca_banks[bank].hwid->hwid_mcatype != 0)
- return;
-
if (rdmsr_safe(MSR_AMD64_SMCA_MCx_IPID(bank), &low, &high)) {
pr_warn("Failed to read MCA_IPID for bank %d\n", bank);
return;
@@ -328,10 +338,11 @@ static void smca_configure(unsigned int bank, unsigned int cpu)

for (i = 0; i < ARRAY_SIZE(smca_hwid_mcatypes); i++) {
s_hwid = &smca_hwid_mcatypes[i];
+
if (hwid_mcatype == s_hwid->hwid_mcatype) {
- smca_banks[bank].hwid = s_hwid;
- smca_banks[bank].id = low;
- smca_banks[bank].sysfs_id = s_hwid->count++;
+ this_cpu_ptr(smca_banks)[bank].hwid = s_hwid;
+ this_cpu_ptr(smca_banks)[bank].id = low;
+ this_cpu_ptr(smca_banks)[bank].sysfs_id = bank_counts[s_hwid->bank_type]++;
break;
}
}
@@ -617,7 +628,7 @@ prepare_threshold_block(unsigned int bank, unsigned int block, u32 addr,

bool amd_filter_mce(struct mce *m)
{
- enum smca_bank_types bank_type = smca_get_bank_type(m->bank);
+ enum smca_bank_types bank_type = smca_get_bank_type(m->extcpu, m->bank);
struct cpuinfo_x86 *c = &boot_cpu_data;

/* See Family 17h Models 10h-2Fh Erratum #1114. */
@@ -655,7 +666,7 @@ static void disable_err_thresholding(struct cpuinfo_x86 *c, unsigned int bank)
} else if (c->x86 == 0x17 &&
(c->x86_model >= 0x10 && c->x86_model <= 0x2F)) {

- if (smca_get_bank_type(bank) != SMCA_IF)
+ if (smca_get_bank_type(smp_processor_id(), bank) != SMCA_IF)
return;

msrs[0] = MSR_AMD64_SMCA_MCx_MISC(bank);
@@ -723,7 +734,7 @@ bool amd_mce_is_memory_error(struct mce *m)
u8 xec = (m->status >> 16) & 0x1f;

if (mce_flags.smca)
- return smca_get_bank_type(m->bank) == SMCA_UMC && xec == 0x0;
+ return smca_get_bank_type(m->extcpu, m->bank) == SMCA_UMC && xec == 0x0;

return m->bank == 4 && xec == 0x8;
}
@@ -1039,7 +1050,7 @@ static struct kobj_type threshold_ktype = {
.release = threshold_block_release,
};

-static const char *get_name(unsigned int bank, struct threshold_block *b)
+static const char *get_name(unsigned int cpu, unsigned int bank, struct threshold_block *b)
{
enum smca_bank_types bank_type;

@@ -1050,7 +1061,7 @@ static const char *get_name(unsigned int bank, struct threshold_block *b)
return th_names[bank];
}

- bank_type = smca_get_bank_type(bank);
+ bank_type = smca_get_bank_type(cpu, bank);
if (bank_type >= N_SMCA_BANK_TYPES)
return NULL;

@@ -1060,12 +1071,12 @@ static const char *get_name(unsigned int bank, struct threshold_block *b)
return NULL;
}

- if (smca_banks[bank].hwid->count == 1)
+ if (per_cpu(smca_bank_counts, cpu)[bank_type] == 1)
return smca_get_name(bank_type);

snprintf(buf_mcatype, MAX_MCATYPE_NAME_LEN,
- "%s_%x", smca_get_name(bank_type),
- smca_banks[bank].sysfs_id);
+ "%s_%u", smca_get_name(bank_type),
+ per_cpu(smca_banks, cpu)[bank].sysfs_id);
return buf_mcatype;
}

@@ -1121,7 +1132,7 @@ static int allocate_threshold_blocks(unsigned int cpu, struct threshold_bank *tb
else
tb->blocks = b;

- err = kobject_init_and_add(&b->kobj, &threshold_ktype, tb->kobj, get_name(bank, b));
+ err = kobject_init_and_add(&b->kobj, &threshold_ktype, tb->kobj, get_name(cpu, bank, b));
if (err)
goto out_free;
recurse:
@@ -1176,7 +1187,7 @@ static int threshold_create_bank(struct threshold_bank **bp, unsigned int cpu,
struct device *dev = this_cpu_read(mce_device);
struct amd_northbridge *nb = NULL;
struct threshold_bank *b = NULL;
- const char *name = get_name(bank, NULL);
+ const char *name = get_name(cpu, bank, NULL);
int err = 0;

if (!dev)
diff --git a/drivers/edac/mce_amd.c b/drivers/edac/mce_amd.c
index cfd3f7ae9251..cc5c63feb26a 100644
--- a/drivers/edac/mce_amd.c
+++ b/drivers/edac/mce_amd.c
@@ -1166,20 +1166,13 @@ static void decode_mc6_mce(struct mce *m)
/* Decode errors according to Scalable MCA specification */
static void decode_smca_error(struct mce *m)
{
- struct smca_hwid *hwid;
- enum smca_bank_types bank_type;
+ enum smca_bank_types bank_type = smca_get_bank_type(m->extcpu, m->bank);
const char *ip_name;
u8 xec = XEC(m->status, xec_mask);

- if (m->bank >= ARRAY_SIZE(smca_banks))
+ if (bank_type >= N_SMCA_BANK_TYPES)
return;

- hwid = smca_banks[m->bank].hwid;
- if (!hwid)
- return;
-
- bank_type = hwid->bank_type;
-
if (bank_type == SMCA_RESERVED) {
pr_emerg(HW_ERR "Bank %d is reserved.\n", m->bank);
return;
diff --git a/drivers/gpu/drm/amd/amdgpu/amdgpu_ras.c b/drivers/gpu/drm/amd/amdgpu/amdgpu_ras.c
index 08133de21fdd..75dad0214dc7 100644
--- a/drivers/gpu/drm/amd/amdgpu/amdgpu_ras.c
+++ b/drivers/gpu/drm/amd/amdgpu/amdgpu_ras.c
@@ -2647,7 +2647,7 @@ static int amdgpu_bad_page_notifier(struct notifier_block *nb,
* and error occurred in DramECC (Extended error code = 0) then only
* process the error, else bail out.
*/
- if (!m || !((smca_get_bank_type(m->bank) == SMCA_UMC_V2) &&
+ if (!m || !((smca_get_bank_type(m->extcpu, m->bank) == SMCA_UMC_V2) &&
(XEC(m->status, 0x3f) == 0x0)))
return NOTIFY_DONE;

--
2.25.1