Re: [PATCH V14 02/10] ras: acpi/apei: cper: generic error data entry v3 per ACPI 6.1

From: Baicar, Tyler
Date: Thu Apr 13 2017 - 16:30:35 EST


On 4/12/2017 7:34 AM, Borislav Petkov wrote:
Subject: [PATCH V14 02/10] ras: acpi/apei: cper: generic error data entry v3 per ACPI 6.1
Use a verb in your patch subjects: "Add support for ..." or so.
Hello Boris,

Will do in the next version.

On Tue, Mar 28, 2017 at 01:30:32PM -0600, Tyler Baicar wrote:
Currently when a RAS error is reported it is not timestamped.
What is a RAS error? You mean a hardware error?
Will change to hardware error.

The ACPI 6.1 spec adds the timestamp field to the generic error
data entry v3 structure. The timestamp of when the firmware
generated the error is now being reported.
So what this patch does doesn't have a lot to to do with the Subject?
Please state what the patch does in the Subject.
I'll change the wording to describe it better.

Also, your commit message talks about adding timestamp but the patch
does more. You need to state that too and explain what this patch does
actually.
I'll break this up into two patches as you suggest below so that this only adds the timestamp and the new patch adds the helper defines and usage.
Signed-off-by: Tyler Baicar <tbaicar@xxxxxxxxxxxxxx>
CC: Jonathan (Zhixiong) Zhang <zjzhang@xxxxxxxxxxxxxx>
Reviewed-by: James Morse <james.morse@xxxxxxx>
Reviewed-by: Ard Biesheuvel <ard.biesheuvel@xxxxxxxxxx>
---
drivers/acpi/apei/ghes.c | 9 ++++---
drivers/firmware/efi/cper.c | 63 +++++++++++++++++++++++++++++++++++----------
include/acpi/ghes.h | 22 ++++++++++++++++
3 files changed, 77 insertions(+), 17 deletions(-)

diff --git a/drivers/acpi/apei/ghes.c b/drivers/acpi/apei/ghes.c
index 0241e36..9ddbb93 100644
--- a/drivers/acpi/apei/ghes.c
+++ b/drivers/acpi/apei/ghes.c
@@ -421,7 +421,8 @@ static void ghes_handle_memory_failure(struct acpi_hest_generic_data *gdata, int
int flags = -1;
int sec_sev = ghes_severity(gdata->error_severity);
struct cper_sec_mem_err *mem_err;
- mem_err = (struct cper_sec_mem_err *)(gdata + 1);
+
+ mem_err = acpi_hest_generic_data_payload(gdata);
if (!(mem_err->validation_bits & CPER_MEM_VALID_PA))
return;
@@ -458,7 +459,8 @@ static void ghes_do_proc(struct ghes *ghes,
if (!uuid_le_cmp(*(uuid_le *)gdata->section_type,
CPER_SEC_PLATFORM_MEM)) {
struct cper_sec_mem_err *mem_err;
- mem_err = (struct cper_sec_mem_err *)(gdata+1);
+
+ mem_err = acpi_hest_generic_data_payload(gdata);
ghes_edac_report_mem_error(ghes, sev, mem_err);
arch_apei_report_mem_error(sev, mem_err);
@@ -468,7 +470,8 @@ static void ghes_do_proc(struct ghes *ghes,
else if (!uuid_le_cmp(*(uuid_le *)gdata->section_type,
CPER_SEC_PCIE)) {
struct cper_sec_pcie *pcie_err;
- pcie_err = (struct cper_sec_pcie *)(gdata+1);
+
+ pcie_err = acpi_hest_generic_data_payload(gdata);
if (sev == GHES_SEV_RECOVERABLE &&
sec_sev == GHES_SEV_RECOVERABLE &&
pcie_err->validation_bits & CPER_PCIE_VALID_DEVICE_ID &&
diff --git a/drivers/firmware/efi/cper.c b/drivers/firmware/efi/cper.c
index d425374..8fa4e23 100644
--- a/drivers/firmware/efi/cper.c
+++ b/drivers/firmware/efi/cper.c
@@ -32,6 +32,9 @@
#include <linux/acpi.h>
#include <linux/pci.h>
#include <linux/aer.h>
+#include <linux/printk.h>
+#include <linux/bcd.h>
+#include <acpi/ghes.h>
#define INDENT_SP " "
@@ -386,13 +389,37 @@ static void cper_print_pcie(const char *pfx, const struct cper_sec_pcie *pcie,
pfx, pcie->bridge.secondary_status, pcie->bridge.control);
}
+static void cper_estatus_print_section_v300(const char *pfx,
+ const struct acpi_hest_generic_data_v300 *gdata)
Yuck, acpi_hest_generic_data_v300. Can we make those struct names smaller pls?
And v300 is just silly.

And then align args at opening brace.
Will do.

+{
+ __u8 hour, min, sec, day, mon, year, century, *timestamp;
+
+ if (gdata->validation_bits & ACPI_HEST_GEN_VALID_TIMESTAMP) {
+ timestamp = (__u8 *)&(gdata->time_stamp);
+ sec = bcd2bin(timestamp[0]);
+ min = bcd2bin(timestamp[1]);
+ hour = bcd2bin(timestamp[2]);
+ day = bcd2bin(timestamp[4]);
+ mon = bcd2bin(timestamp[5]);
+ year = bcd2bin(timestamp[6]);
+ century = bcd2bin(timestamp[7]);
Align those vertically on the = sign.
Will do.

+ printk("%stime: %7s %02d%02d-%02d-%02d %02d:%02d:%02d\n", pfx,
+ 0x01 & *(timestamp + 3) ? "precise" : "",
Move that test in a separate if-statement - that printk is unreadable as
it is. Also, the test bit always comes second.
Will do.
century,
+ year, mon, day, hour, min, sec);
+ }
+}
+
static void cper_estatus_print_section(
- const char *pfx, const struct acpi_hest_generic_data *gdata, int sec_no)
+ const char *pfx, struct acpi_hest_generic_data *gdata, int sec_no)
static void
cper_estatus_print_section(const char *pfx, struct acpi_hest_generic_data *gdata,
int sec_no)

looks much better.
All I did here was remove the const, but will do.
{
uuid_le *sec_type = (uuid_le *)gdata->section_type;
__u16 severity;
char newpfx[64];
+ if (acpi_hest_generic_data_version(gdata) >= 3)
Jeez, that macro name is like a one-lined book!

Let's make that "hest_gdata_ver()" or something else shorter.
As Joe mentioned, acpi_hest_generic<foo> is already used throughout this code base. I do not see the value in varying from the preexisting naming style.

+ cper_estatus_print_section_v300(pfx,
+ (const struct acpi_hest_generic_data_v300 *)gdata);
+
severity = gdata->error_severity;
printk("%s""Error %d, type: %s\n", pfx, sec_no,
cper_severity_str(severity));
@@ -403,14 +430,18 @@ static void cper_estatus_print_section(
snprintf(newpfx, sizeof(newpfx), "%s%s", pfx, INDENT_SP);
if (!uuid_le_cmp(*sec_type, CPER_SEC_PROC_GENERIC)) {
- struct cper_sec_proc_generic *proc_err = (void *)(gdata + 1);
+ struct cper_sec_proc_generic *proc_err;
+
+ proc_err = acpi_hest_generic_data_payload(gdata);
This looks like an unrelated change. The payload function addition and the
conversion of the code to use it should be a separate patch. And shorten that
function name too pls.
I'll break this into two patches.

printk("%s""section_type: general processor error\n", newpfx);
if (gdata->error_data_length >= sizeof(*proc_err))
cper_print_proc_generic(newpfx, proc_err);
else
goto err_section_too_small;
} else if (!uuid_le_cmp(*sec_type, CPER_SEC_PLATFORM_MEM)) {
- struct cper_sec_mem_err *mem_err = (void *)(gdata + 1);
+ struct cper_sec_mem_err *mem_err;
+
+ mem_err = acpi_hest_generic_data_payload(gdata);
printk("%s""section_type: memory error\n", newpfx);
if (gdata->error_data_length >=
sizeof(struct cper_sec_mem_err_old))
@@ -419,7 +450,9 @@ static void cper_estatus_print_section(
else
goto err_section_too_small;
} else if (!uuid_le_cmp(*sec_type, CPER_SEC_PCIE)) {
- struct cper_sec_pcie *pcie = (void *)(gdata + 1);
+ struct cper_sec_pcie *pcie;
+
+ pcie = acpi_hest_generic_data_payload(gdata);
printk("%s""section_type: PCIe error\n", newpfx);
if (gdata->error_data_length >= sizeof(*pcie))
cper_print_pcie(newpfx, pcie, gdata);
@@ -438,7 +471,7 @@ void cper_estatus_print(const char *pfx,
const struct acpi_hest_generic_status *estatus)
{
struct acpi_hest_generic_data *gdata;
- unsigned int data_len, gedata_len;
+ unsigned int data_len;
int sec_no = 0;
char newpfx[64];
__u16 severity;
@@ -451,12 +484,13 @@ void cper_estatus_print(const char *pfx,
printk("%s""event severity: %s\n", pfx, cper_severity_str(severity));
data_len = estatus->data_length;
gdata = (struct acpi_hest_generic_data *)(estatus + 1);
+
snprintf(newpfx, sizeof(newpfx), "%s%s", pfx, INDENT_SP);
- while (data_len >= sizeof(*gdata)) {
- gedata_len = gdata->error_data_length;
+
+ while (data_len >= acpi_hest_generic_data_size(gdata)) {
cper_estatus_print_section(newpfx, gdata, sec_no);
- data_len -= gedata_len + sizeof(*gdata);
- gdata = (void *)(gdata + 1) + gedata_len;
+ data_len -= acpi_hest_generic_data_record_size(gdata);
+ gdata = acpi_hest_generic_data_next(gdata);
sec_no++;
}
}
@@ -486,12 +520,13 @@ int cper_estatus_check(const struct acpi_hest_generic_status *estatus)
return rc;
data_len = estatus->data_length;
gdata = (struct acpi_hest_generic_data *)(estatus + 1);
- while (data_len >= sizeof(*gdata)) {
- gedata_len = gdata->error_data_length;
- if (gedata_len > data_len - sizeof(*gdata))
+
+ while (data_len >= acpi_hest_generic_data_size(gdata)) {
+ gedata_len = acpi_hest_generic_data_error_length(gdata);
+ if (gedata_len > data_len - acpi_hest_generic_data_size(gdata))
return -EINVAL;
- data_len -= gedata_len + sizeof(*gdata);
- gdata = (void *)(gdata + 1) + gedata_len;
+ data_len -= gedata_len + acpi_hest_generic_data_size(gdata);
+ gdata = acpi_hest_generic_data_next(gdata);
}
if (data_len)
return -EINVAL;
diff --git a/include/acpi/ghes.h b/include/acpi/ghes.h
index 68f088a..6ae318b 100644
--- a/include/acpi/ghes.h
+++ b/include/acpi/ghes.h
@@ -12,6 +12,18 @@
#define GHES_TO_CLEAR 0x0001
#define GHES_EXITING 0x0002
+#define acpi_hest_generic_data_error_length(gdata) \
+ (((struct acpi_hest_generic_data *)(gdata))->error_data_length)
+#define acpi_hest_generic_data_size(gdata) \
+ ((acpi_hest_generic_data_version(gdata) >= 3) ? \
+ sizeof(struct acpi_hest_generic_data_v300) : \
+ sizeof(struct acpi_hest_generic_data))
+#define acpi_hest_generic_data_record_size(gdata) \
+ (acpi_hest_generic_data_size(gdata) + \
+ acpi_hest_generic_data_error_length(gdata))
+#define acpi_hest_generic_data_next(gdata) \
+ ((void *)(gdata) + acpi_hest_generic_data_record_size(gdata))
This is one unreadable pile of too long names with a clearly redundant
and too long prefix. Please shorten it all.

+
struct ghes {
union {
struct acpi_hest_generic *generic;
@@ -73,3 +85,13 @@ static inline void ghes_edac_unregister(struct ghes *ghes)
{
}
#endif
+
+#define acpi_hest_generic_data_version(gdata) \
+ (gdata->revision >> 8)
+
+static inline void *acpi_hest_generic_data_payload(struct acpi_hest_generic_data *gdata)
Lemme try to shorten it:

static inline void *acpi_hest_get_payload(struct acpi_hest_gdata *d)
{
if (hest_gdata_ver(d) >= 3)
return (void *)(((struct acpi_hest_gdata_v3 *)d) + 1);
else
return d + 1;
}

Now this is much more readable IMO. You can actually see what's going
on. And you still know what the struct names are.

So let's drop all that unnecessary too long prefixing and make the
code readable. That cper thing needs a lot more scrubbing, of course,
but some other day.
I do not agree with this. The struct being passed to this function is already named acpi_hest_generic_data in the existing code and all over this code is named gdata not just d.

Also, these helpers already helped this code be significantly more readable. They were added in version 4 of this series to reduce code duplication and make iterating over the generic data entries readable. https://lkml.org/lkml/2016/10/11/454

Thanks,
Tyler

--
Qualcomm Datacenter Technologies, Inc. as an affiliate of Qualcomm Technologies, Inc.
Qualcomm Technologies, Inc. is a member of the Code Aurora Forum,
a Linux Foundation Collaborative Project.