[PATCH v2 6/6] RAS: EDAC/amd64: Retire all system physical address from HBM3 row

From: Muralidhara M K
Date: Wed Nov 29 2023 - 02:36:17 EST


From: Muralidhara M K <muralidhara.mk@xxxxxxx>

AMD systems have HBM memory embedded within the chip. The entire memory
is managed by the host OS. Error containment needs to be reliable because
HBM memory cannot be replaced.

HBM3 memory has 8 columns in each row, selected by column bits c2, c3 and
c4, which gives 8 possible address combinations in each row.

Identify all of these system physical addresses in an HBM row and retire
each of them to get rid of intermittent or recurrent memory errors.

Signed-off-by: Muralidhara M K <muralidhara.mk@xxxxxxx>
---
Changes:
v1 -> v2 : Rename and modify function amd_umc_retire_column_spa_from_row()

drivers/edac/amd64_edac.c | 3 ++
drivers/ras/amd/atl/umc.c | 77 +++++++++++++++++++++++++++++++++++++++
include/linux/amd-atl.h | 2 +
3 files changed, 82 insertions(+)

diff --git a/drivers/edac/amd64_edac.c b/drivers/edac/amd64_edac.c
index 623f84c53d2d..9872ede7eca9 100644
--- a/drivers/edac/amd64_edac.c
+++ b/drivers/edac/amd64_edac.c
@@ -2831,6 +2831,9 @@ static void decode_umc_error(int node_id, struct mce *m)

error_address_to_page_and_offset(sys_addr, &err);

+ if (pvt->fam == 0x19 && (pvt->model >= 0x90 && pvt->model <= 0x9f))
+ amd_umc_retire_column_spa_from_row(m);
+
log_error:
__log_ecc_error(mci, &err, ecc_type);
}
diff --git a/drivers/ras/amd/atl/umc.c b/drivers/ras/amd/atl/umc.c
index 3533db279cec..de51b666b20e 100644
--- a/drivers/ras/amd/atl/umc.c
+++ b/drivers/ras/amd/atl/umc.c
@@ -255,3 +255,80 @@ int umc_mca_addr_to_sys_addr(struct mce *m, u64 *sys_addr)
return 0;
}
EXPORT_SYMBOL_GPL(umc_mca_addr_to_sys_addr);
+
+/*
+ * High Bandwidth Memory (HBM v3) has a fixed 8 columns per row, selected by
+ * column bits c2, c3 and c4 of the normalized address. The masks below cover
+ * every [c4 c3 c2] combination so all addresses in a row can be retired.
+ */
+#define MAX_COLUMNS_IN_HBM_ROW 8
+
+/* Column bits c2, c3 and c4 in the (64-bit) Normalized Address */
+#define UMC_NA_C2_BIT BIT_ULL(8)
+#define UMC_NA_C3_BIT BIT_ULL(9)
+#define UMC_NA_C4_BIT BIT_ULL(14)
+
+/* Possible combinations of column address masks in a HBM v3 row */
+#define C_1_1_1_MASK (UMC_NA_C4_BIT | UMC_NA_C3_BIT | UMC_NA_C2_BIT)
+#define C_1_1_0_MASK (UMC_NA_C4_BIT | UMC_NA_C3_BIT)
+#define C_1_0_1_MASK (UMC_NA_C4_BIT | UMC_NA_C2_BIT)
+#define C_1_0_0_MASK (UMC_NA_C4_BIT)
+#define C_0_1_1_MASK (UMC_NA_C3_BIT | UMC_NA_C2_BIT)
+#define C_0_1_0_MASK (UMC_NA_C3_BIT)
+#define C_0_0_1_MASK (UMC_NA_C2_BIT)
+/* AND-mask that clears c4, c3 and c2; every other address bit is kept */
+#define C_0_0_0_MASK (~C_1_1_1_MASK)
+
+/*
+ * Translate every column of the HBM v3 row containing the error in @m into a
+ * system physical address.
+ *
+ * @m:   MCA error record; supplies the normalized address and the IDs that
+ *       are handed to norm_to_sys_addr().
+ * @col: output array with room for MAX_COLUMNS_IN_HBM_ROW entries.
+ *
+ * Columns whose translation fails are reported and skipped, so only the
+ * first "return value" entries of @col are valid. Returning the count keeps
+ * the caller from reading slots that were never written.
+ */
+static unsigned int identify_column_spa_from_row(struct mce *m, u64 *col)
+{
+	static const u64 column_masks[MAX_COLUMNS_IN_HBM_ROW] = {
+		0, C_0_0_1_MASK, C_0_1_0_MASK, C_0_1_1_MASK,
+		C_1_0_0_MASK, C_1_0_1_MASK, C_1_1_0_MASK, C_1_1_1_MASK
+	};
+	u8 cs_inst_id = get_cs_inst_id(m);
+	u8 socket_id = get_socket_id(m);
+	u64 norm_addr = get_norm_addr(m);
+	u8 die_id = get_die_id(m);
+	u16 df_acc_id = get_df_acc_id(m);
+	unsigned int column, count = 0;
+	u64 retire_addr;
+
+	/* clear and loop for all possibilities of [c4 c3 c2] */
+	norm_addr &= C_0_0_0_MASK;
+
+	for (column = 0; column < ARRAY_SIZE(column_masks); column++) {
+		retire_addr = norm_addr | column_masks[column];
+
+		/* retire_addr is translated in place; non-zero return is failure */
+		if (norm_to_sys_addr(df_acc_id, socket_id, die_id, cs_inst_id, &retire_addr)) {
+			pr_warn("Failed norm_to_sys_addr for column[%u]\n", column);
+			continue;
+		}
+
+		col[count++] = retire_addr;
+	}
+
+	return count;
+}
+
+/*
+ * Retire the pages backing the HBM v3 row that contains the error in @m.
+ *
+ * Only successfully translated column addresses are considered. Different
+ * columns may translate into the same page, so each page frame is passed to
+ * memory_failure() at most once per call.
+ */
+void amd_umc_retire_column_spa_from_row(struct mce *m)
+{
+	u64 col[MAX_COLUMNS_IN_HBM_ROW];
+	unsigned int nr_cols, i, j;
+
+	pr_info("Identify SPA of all columns from row for MCE Addr:0x%llx\n", m->addr);
+	nr_cols = identify_column_spa_from_row(m, col);
+
+	for (i = 0; i < nr_cols; i++) {
+		/* Skip addresses whose page was already retired in this pass */
+		for (j = 0; j < i; j++) {
+			if (PHYS_PFN(col[j]) == PHYS_PFN(col[i]))
+				break;
+		}
+		if (j < i)
+			continue;
+
+		pr_debug("Retire column spa:0x%llx\n", col[i]);
+		memory_failure(PHYS_PFN(col[i]), 0);
+	}
+}
+EXPORT_SYMBOL(amd_umc_retire_column_spa_from_row);
diff --git a/include/linux/amd-atl.h b/include/linux/amd-atl.h
index c625ea3ab5d0..6cba39be63ca 100644
--- a/include/linux/amd-atl.h
+++ b/include/linux/amd-atl.h
@@ -25,4 +25,6 @@ static inline int amd_umc_mca_addr_to_sys_addr(struct mce *m, u64 *sys_addr)
return umc_mca_addr_to_sys_addr(m, sys_addr);
}

+void amd_umc_retire_column_spa_from_row(struct mce *m);
+
#endif /* _AMD_ATL_H */
--
2.25.1