Re: [PATCH 2/2] RAS: Introduce the FRU Memory Poison Manager

From: Borislav Petkov
Date: Wed Feb 14 2024 - 13:45:07 EST


On Wed, Feb 14, 2024 at 09:56:14AM -0500, Yazen Ghannam wrote:
> > This one needs to go too.
> >
>
> Ack.

Gone:

diff --git a/drivers/ras/amd/fmpm.c b/drivers/ras/amd/fmpm.c
index a67a4b67cf9d..643c36b6dc9c 100644
--- a/drivers/ras/amd/fmpm.c
+++ b/drivers/ras/amd/fmpm.c
@@ -146,11 +146,6 @@ static DEFINE_MUTEX(fmpm_update_mutex);
#define for_each_fru(i, rec) \
for (i = 0; rec = fru_records[i], i < max_nr_fru; i++)

-static inline struct cper_fru_poison_desc *get_fpd(struct fru_rec *rec, u32 entry)
-{
- return &rec->entries[entry];
-}
-
static inline u32 get_fmp_len(struct fru_rec *rec)
{
return rec->sec_desc.section_length - sizeof(struct cper_section_descriptor);
@@ -253,7 +248,9 @@ static bool rec_has_fpd(struct fru_rec *rec, struct cper_fru_poison_desc *fpd)
unsigned int i;

for (i = 0; i < rec->fmp.nr_entries; i++) {
- if (same_fpd(get_fpd(rec, i), fpd)) {
+ struct cper_fru_poison_desc *fpd_i = &rec->entries[i];
+
+ if (same_fpd(fpd_i, fpd)) {
pr_debug("Found duplicate record");
return true;
}
@@ -265,7 +262,7 @@ static bool rec_has_fpd(struct fru_rec *rec, struct cper_fru_poison_desc *fpd)
static void update_fru_record(struct fru_rec *rec, struct mce *m)
{
struct cper_sec_fru_mem_poison *fmp = &rec->fmp;
- struct cper_fru_poison_desc fpd;
+ struct cper_fru_poison_desc fpd, *fpd_dest;
u32 entry = 0;

mutex_lock(&fmpm_update_mutex);
@@ -287,9 +284,10 @@ static void update_fru_record(struct fru_rec *rec, struct mce *m)
goto out_unlock;
}

- entry = fmp->nr_entries;
+ entry = fmp->nr_entries;
+ fpd_dest = &rec->entries[entry];

- memcpy(get_fpd(rec, entry), &fpd, sizeof(struct cper_fru_poison_desc));
+ memcpy(fpd_dest, &fpd, sizeof(struct cper_fru_poison_desc));

fmp->nr_entries = entry + 1;
fmp->validation_bits |= FMP_VALID_LIST_ENTRIES;
@@ -359,11 +357,10 @@ static u32 get_cpu_from_fru_id(u64 fru_id)

static void retire_mem_fmp(struct fru_rec *rec, u32 nr_entries, u32 cpu)
{
- struct cper_fru_poison_desc *fpd;
unsigned int i;

for (i = 0; i < nr_entries; i++) {
- fpd = get_fpd(rec, i);
+ struct cper_fru_poison_desc *fpd = &rec->entries[i];

if (fpd->hw_id_type != FPD_HW_ID_TYPE_MCA_IPID)
continue;


> > /* Use the complement value. */
> > rec->fmp.checksum = -checksum;
> >
> > I'd say.
> >
>
> This was my first thought. Other checksum code in the kernel does
> the (0-X) thing. So I wasn't sure if there's any odd side effects
> of one over the other. And I didn't take the time to dig into it.

I guess it's probably meant to be more expressive? I don't see how

0 - X

and

-X

differ.

And you can always do a before-after and look at the asm:

before:
# drivers/ras/amd/fmpm.c:202: rec->fmp.checksum = 0 - checksum;
#NO_APP
subl %edx, %eax # checksum, tmp100
movl %eax, 200(%rbx) # tmp100, rec_9(D)->fmp.checksum

after:
# drivers/ras/amd/fmpm.c:202: rec->fmp.checksum = -checksum;
#NO_APP
subl %edx, %eax # checksum, tmp100
movl %eax, 200(%rbx) # tmp100, rec_9(D)->fmp.checksum


> > -/* Calculate a new checksum. */
> > -static u32 get_fmp_checksum(struct fru_rec *rec)
>
> I made this a helper because we need to validate the checksum when
> reading records from storage too.

It has a single user, which is why I whacked it. If a new one materializes,
sure, you can carve it out.

Thx.

--
Regards/Gruss,
Boris.

https://people.kernel.org/tglx/notes-about-netiquette