[PATCH 6/7] EDAC/amd64: Add data fabric instance ID get_inst_id() to pvt->ops

From: Muralidhara M K
Date: Thu Jul 20 2023 - 08:55:29 EST


From: Muralidhara M K <muralidhara.mk@xxxxxxx>

On CPUs, the data fabric ID of an instance is equal to the UMC number.
Since the UMC number and the channel are equal in CPU nodes, the channel
can be used as the data fabric ID of the instance.

A GPU node has 'X' PHYs, each with 'Y' channels.
This results in 'X*Y' instances in the data fabric.
Therefore the data fabric ID of an instance in a GPU node is computed as below:
df_inst_id = PHY index * number of channels per PHY + channel index

Co-developed-by: Naveen Krishna Chatradhi <naveenkrishna.chatradhi@xxxxxxx>
Signed-off-by: Naveen Krishna Chatradhi <naveenkrishna.chatradhi@xxxxxxx>
Signed-off-by: Muralidhara M K <muralidhara.mk@xxxxxxx>
---
drivers/edac/amd64_edac.c | 36 +++++++++++++++++++++++++++++++++++-
drivers/edac/amd64_edac.h | 2 ++
2 files changed, 37 insertions(+), 1 deletion(-)

diff --git a/drivers/edac/amd64_edac.c b/drivers/edac/amd64_edac.c
index 45d8093c117a..74b2b47cc22a 100644
--- a/drivers/edac/amd64_edac.c
+++ b/drivers/edac/amd64_edac.c
@@ -3047,6 +3047,17 @@ static inline void decode_bus_error(int node_id, struct mce *m)
__log_ecc_error(mci, &err, ecc_type);
}

+/*
+ * On CPUs, the data fabric ID of an instance is equal to the UMC number.
+ * Since the UMC number and the channel are equal on CPU nodes, the channel
+ * (err->channel) is returned as the data fabric ID of the instance.
+ */
+static int umc_inst_id(struct mem_ctl_info *mci, struct amd64_pvt *pvt,
+ struct err_info *err)
+{
+ return err->channel;
+}
+
/*
* To find the UMC channel represented by this bank we need to match on its
* instance_id. The instance_id of a bank is held in the lower 32 bits of its
@@ -3071,6 +3082,7 @@ static void decode_umc_error(int node_id, struct mce *m)
struct mem_ctl_info *mci;
struct amd64_pvt *pvt;
struct err_info err;
+ u8 df_inst_id;
u64 sys_addr;

node_id = fixup_node_id(node_id, m);
@@ -3101,8 +3113,9 @@ static void decode_umc_error(int node_id, struct mce *m)
}

pvt->ops->get_err_info(m, &err);
+ df_inst_id = pvt->ops->get_inst_id(mci, pvt, &err);

- if (umc_normaddr_to_sysaddr(m->addr, pvt->mc_node_id, err.channel, &sys_addr)) {
+ if (umc_normaddr_to_sysaddr(m->addr, pvt->mc_node_id, df_inst_id, &sys_addr)) {
err.err_code = ERR_NORM_ADDR;
goto log_error;
}
@@ -3758,6 +3771,25 @@ static int umc_hw_info_get(struct amd64_pvt *pvt)
return 0;
}

+/*
+ * A GPU node has 'X' PHYs, each with 'Y' channels, which results in 'X*Y'
+ * instances in the data fabric. The data fabric ID of an instance is found
+ * from err->csrow (used as the PHY term) and err->channel (the channel term):
+ * df_inst_id = csrow * total channels / mci->nr_csrows + channel
+ * where "total channels" is the sum of csels[].b_cnt over all UMCs.
+ */
+static int gpu_inst_id(struct mem_ctl_info *mci, struct amd64_pvt *pvt,
+ struct err_info *err)
+{
+ int i, channels = 0;
+
+ /* GPU memory channels are fully populated, so sum b_cnt over all UMCs */
+ for_each_umc(i)
+ channels += pvt->csels[i].b_cnt;
+
+ return (err->csrow * channels / mci->nr_csrows) + err->channel;
+}
+
/*
* The CPUs have one channel per UMC, so UMC number is equivalent to a
* channel number. The GPUs have 8 channels per UMC, so the UMC number no
@@ -4015,6 +4047,7 @@ static struct low_ops umc_ops = {
.setup_mci_misc_attrs = umc_setup_mci_misc_attrs,
.dump_misc_regs = umc_dump_misc_regs,
.get_err_info = umc_get_err_info,
+ .get_inst_id = umc_inst_id,
};

static struct low_ops gpu_ops = {
@@ -4023,6 +4056,7 @@ static struct low_ops gpu_ops = {
.setup_mci_misc_attrs = gpu_setup_mci_misc_attrs,
.dump_misc_regs = gpu_dump_misc_regs,
.get_err_info = gpu_get_err_info,
+ .get_inst_id = gpu_inst_id,
};

/* Use Family 16h versions for defaults and adjust as needed below. */
diff --git a/drivers/edac/amd64_edac.h b/drivers/edac/amd64_edac.h
index 5a4e4a59682b..d9e9e62dd4b1 100644
--- a/drivers/edac/amd64_edac.h
+++ b/drivers/edac/amd64_edac.h
@@ -471,6 +471,8 @@ struct low_ops {
void (*setup_mci_misc_attrs)(struct mem_ctl_info *mci);
void (*dump_misc_regs)(struct amd64_pvt *pvt);
void (*get_err_info)(struct mce *m, struct err_info *err);
+ int (*get_inst_id)(struct mem_ctl_info *mci, struct amd64_pvt *pvt,
+ struct err_info *err);
};

int __amd64_read_pci_cfg_dword(struct pci_dev *pdev, int offset,
--
2.25.1