[PATCH 3/4] EDAC/amd64: Add sysfs entry to read FRU poison data

From: Muralidhara M K
Date: Wed Nov 29 2023 - 02:51:14 EST


From: Muralidhara M K <muralidhara.mk@xxxxxxx>

Create a sysfs file for each FRU ID that lists the DRAM MCE addresses
and MCA IPID values stored in ERST non-volatile storage.

The CPER record information can be read at any time while the system is
up through the sysfs entry of a particular node or FRU index.
Example: cat /sys/devices/system/edac/mc/mc<node index>/fmpl

The data in the sysfs entries identifies the list of poisoned addresses
and the FRU index, which can be used to decide on the replaceable
criteria instead of iterating over the kernel logs.

Co-developed-by: Naveen Krishna Chatradhi <naveenkrishna.chatradhi@xxxxxxx>
Signed-off-by: Naveen Krishna Chatradhi <naveenkrishna.chatradhi@xxxxxxx>
Co-developed-by: Sathya Priya Kumar <sathyapriya.k@xxxxxxx>
Signed-off-by: Sathya Priya Kumar <sathyapriya.k@xxxxxxx>
Signed-off-by: Muralidhara M K <muralidhara.mk@xxxxxxx>
---
drivers/edac/amd64_edac.c | 25 +++++++++++
drivers/ras/fmp/fru_mem_poison.c | 77 +++++++++++++++++++++++++++++++-
include/linux/fru_mem_poison.h | 2 +
3 files changed, 102 insertions(+), 2 deletions(-)

diff --git a/drivers/edac/amd64_edac.c b/drivers/edac/amd64_edac.c
index 9872ede7eca9..3790adfa78b5 100644
--- a/drivers/edac/amd64_edac.c
+++ b/drivers/edac/amd64_edac.c
@@ -2,6 +2,7 @@
#include "amd64_edac.h"
#include <asm/amd_nb.h>
#include <linux/amd-atl.h>
+#include <linux/fru_mem_poison.h>

static struct edac_pci_ctl_info *pci_ctl;

@@ -574,6 +575,28 @@ static ssize_t dram_hole_show(struct device *dev, struct device_attribute *mattr
hole_size);
}

+/*
+ * sysfs entry to read FRU (Field Replaceable Unit) memory poison records.
+ *
+ * Emits the poison-record table of the FRU matching this memory
+ * controller's node id. Returns the number of bytes placed in @data, a
+ * negative errno propagated from copy_fmp_data_from_cache(), or -EINVAL
+ * when nothing was produced.
+ */
+static ssize_t fmpl_show(struct device *dev, struct device_attribute *mattr,
+			 char *data)
+{
+	struct mem_ctl_info *mci = to_mci(dev);
+	struct amd64_pvt *pvt = mci->pvt_info;
+	ssize_t ret_len;
+
+	/*
+	 * Format straight into the sysfs output buffer (PAGE_SIZE bytes).
+	 * The previous bounce buffer was leaked on the error path, and a
+	 * negative return value was handed to memcpy() as a length.
+	 */
+	ret_len = copy_fmp_data_from_cache(pvt->mc_node_id, data, PAGE_SIZE);
+	if (ret_len < 0)
+		return ret_len;
+	if (!ret_len)
+		return -EINVAL;
+
+	return ret_len;
+}
+
/*
* update NUM_DBG_ATTRS in case you add new members
*/
@@ -581,6 +604,7 @@ static DEVICE_ATTR(dhar, S_IRUGO, dhar_show, NULL);
static DEVICE_ATTR(dbam, S_IRUGO, dbam0_show, NULL);
static DEVICE_ATTR(topmem, S_IRUGO, top_mem_show, NULL);
static DEVICE_ATTR(topmem2, S_IRUGO, top_mem2_show, NULL);
+static DEVICE_ATTR(fmpl, S_IRUGO, fmpl_show, NULL);
static DEVICE_ATTR_RO(dram_hole);

static struct attribute *dbg_attrs[] = {
@@ -589,6 +613,7 @@ static struct attribute *dbg_attrs[] = {
&dev_attr_topmem.attr,
&dev_attr_topmem2.attr,
&dev_attr_dram_hole.attr,
+ &dev_attr_fmpl.attr,
NULL
};

diff --git a/drivers/ras/fmp/fru_mem_poison.c b/drivers/ras/fmp/fru_mem_poison.c
index c21e736c3ed1..bd85ae527c7f 100644
--- a/drivers/ras/fmp/fru_mem_poison.c
+++ b/drivers/ras/fmp/fru_mem_poison.c
@@ -39,6 +39,11 @@ struct system_fru_poison_info {
struct cper_fru_poison_record *fru_record;
};

+#define REC_HDR() \
+ " FRU_IDX| FRU_ID\t | P_NUM | TIMESTAMP\t\t | MCA_IPID\t | MCA_ADDR\t| SPA\t\t |\n" /* column headings of the fmpl table */
+#define REC_DATA() \
+ " %d\t| 0x%llx| %d\t | %s| 0x%017llx| 0x%013llx | 0x%013llx|\n" /* one row: FRU index, FRU id, poison number, timestamp string, hw_id, addr, system address */
+
#define CPER_CREATOR_FMP \
GUID_INIT(0xcd5c2993, 0xf4b2, 0x41b2, 0xb5, 0xd4, 0xf9, 0xc3, \
0xa0, 0x33, 0x08, 0x75)
@@ -122,15 +127,83 @@ static u64 calc_checksum(struct cper_sec_fru_mem_poisons *fmp)
return checksum;
}

-struct tm get_timestamp(u64 timestamp)
+/*
+ * Break @timestamp down with time64_to_tm() and print it into @tbuf as
+ * "year-month-day hour:min:sec", using at most @t_size bytes.
+ *
+ * Returns whatever scnprintf() reports for the formatted string.
+ */
+ssize_t get_timestamp(u64 timestamp, char *tbuf, ssize_t t_size)
{
struct timespec64 ts;
struct tm tm;

ts.tv_sec = timestamp;
time64_to_tm(ts.tv_sec, 0, &tm);
- return tm;
+
+	/* Year is printed with a 1900 bias and month with a bias of one. */
+	return scnprintf(tbuf, t_size, "%ld-%02d-%02d %02d:%02d:%02d",
+			 tm.tm_year + 1900, tm.tm_mon + 1, tm.tm_mday,
+			 tm.tm_hour, tm.tm_min, tm.tm_sec);
+}
+
+/*
+ * Render the cached poison records of FRU @fru_idx as a text table in @buf.
+ * The table is exported to the amd64_edac module, which provides it via
+ * the sysfs entry /sys/devices/system/edac/mc/mc<sock_indx>/fmpl.
+ *
+ * @fru_idx:  index into sys_fmp_info[]
+ * @buf:      destination buffer
+ * @buf_size: size of @buf in bytes
+ *
+ * Only whole rows are emitted: a row that does not fit is dropped and the
+ * output ends at the last complete row.
+ *
+ * Returns the number of bytes written to @buf (excluding the trailing
+ * NUL), or 0 when the arguments are unusable or no record cache exists
+ * for @fru_idx. fmpl_show() maps a return of 0 to -EINVAL.
+ */
+ssize_t copy_fmp_data_from_cache(int fru_idx, char *buf, ssize_t buf_size)
+{
+	struct cper_fru_poison_data *temp, *base;
+	struct system_fru_poison_info *fru;
+	char t_buf[32];
+	int j, n, p_count;
+	struct mce *m;
+	ssize_t len;
+	u64 sys_addr;
+
+	if (fru_idx < 0 || !buf || buf_size <= 0)
+		return 0;
+
+	/*
+	 * NOTE(review): the number of entries in sys_fmp_info[] is not
+	 * visible here, so the upper bound of @fru_idx cannot be checked.
+	 */
+	fru = sys_fmp_info[fru_idx];
+	if (!fru || !fru->fru_record)
+		return 0;
+
+	/*
+	 * Debug level only: this runs on every read of a world-readable
+	 * sysfs file and must not flood the kernel log.
+	 */
+	pr_debug("FRU_Idx[%d] Record information:\n", fru_idx);
+	pr_debug("Record_ID : 0x%llx\n", fru->recordid);
+	pr_debug("FRU_ID : 0x%llx\n", fru->sys_fru_id);
+
+	p_count = fru->fru_record->fmp.poison_count;
+	pr_debug("FRU Memory poison details under FRU_idx[%d]: %d\n", fru_idx, p_count);
+
+	base = (struct cper_fru_poison_data *)&fru->fru_record->fmp.p_list_off;
+	len = scnprintf(buf, buf_size, REC_HDR());
+
+	for (j = 1; j <= p_count; j++) {
+		/*
+		 * Pointer arithmetic already scales by the element size;
+		 * multiplying by sizeof() again stepped far past the record.
+		 * NOTE(review): indexing stays 1-based as in the original;
+		 * confirm against the record layout that the first entry
+		 * really lives at base + 1 and not at base.
+		 */
+		temp = base + j;
+
+		/*
+		 * NOTE(review): the record is reinterpreted in place as a
+		 * struct mce, exactly as before. If fill_mce_poison_data()
+		 * writes through @m this modifies the cached record; verify
+		 * against that helper.
+		 */
+		m = (struct mce *)temp;
+		fill_mce_poison_data(m, temp, fru_idx);
+
+		if (amd_umc_mca_addr_to_sys_addr(m, &sys_addr)) {
+			pr_warn("normalized address failed for mce addr:0x%llx\n", m->addr);
+			sys_addr = 0;
+		}
+
+		/*
+		 * Stack buffer instead of a kmalloc() per record: the old
+		 * code leaked one buffer per iteration, could use and free
+		 * it twice, and freed an uninitialized pointer when there
+		 * were no records.
+		 */
+		get_timestamp(temp->timestamp, t_buf, sizeof(t_buf));
+
+		pr_debug("poison_number[%d] hw_id:0x%llx addr:0x%llx\n", j, temp->hw_id, temp->addr);
+
+		/* snprintf() reports the untruncated length, so overflow is detectable. */
+		n = snprintf(buf + len, buf_size - len, REC_DATA(),
+			     fru_idx, fru->sys_fru_id, j, t_buf, temp->hw_id,
+			     temp->addr, sys_addr);
+		if (n >= buf_size - len) {
+			/* Row did not fit: cut the output at the last full row. */
+			buf[len] = '\0';
+			pr_warn("%s FMP cache Buffer full!\n", __func__);
+			break;
+		}
+		len += n;
+	}
+
+	/* Bytes actually written, independent of the caller's buffer size. */
+	return len;
}
+EXPORT_SYMBOL(copy_fmp_data_from_cache);

/* Fill initial fmp structure variable during empty record creation */
static int init_fru_poison_fmp_cache(struct system_fru_poison_info *p)
diff --git a/include/linux/fru_mem_poison.h b/include/linux/fru_mem_poison.h
index d3e567c990aa..d2642e1224de 100644
--- a/include/linux/fru_mem_poison.h
+++ b/include/linux/fru_mem_poison.h
@@ -12,4 +12,6 @@

struct system_fru_poison_info **sys_fmp_info;

+ssize_t copy_fmp_data_from_cache(int fru_idx, char *buf, ssize_t buf_size);
+
#endif /* _X86_FMP_H */
--
2.25.1