RE: 回复: [PATCH] ACPI: APEI: move edac_init ahead of ghes platform drv register

From: Justin He
Date: Wed Aug 10 2022 - 02:02:08 EST


Hi Borislav

> -----Original Message-----
> From: Borislav Petkov <bp@xxxxxxxxx>
> Sent: Wednesday, August 10, 2022 2:46 AM
> To: Justin He <Justin.He@xxxxxxx>
> Cc: Kani, Toshi <toshi.kani@xxxxxxx>; Rafael J. Wysocki <rafael@xxxxxxxxxx>;
> Len Brown <lenb@xxxxxxxxxx>; James Morse <James.Morse@xxxxxxx>;
> Tony Luck <tony.luck@xxxxxxxxx>; Mauro Carvalho Chehab
> <mchehab@xxxxxxxxxx>; Robert Richter <rric@xxxxxxxxxx>; Shuai Xue
> <xueshuai@xxxxxxxxxxxxxxxxx>; Jarkko Sakkinen <jarkko@xxxxxxxxxx>; ACPI
> Devel Maling List <linux-acpi@xxxxxxxxxxxxxxx>; Linux Kernel Mailing List
> <linux-kernel@xxxxxxxxxxxxxxx>; open list:EDAC-CORE
> <linux-edac@xxxxxxxxxxxxxxx>
> Subject: Re: 回复: [PATCH] ACPI: APEI: move edac_init ahead of ghes
> platform drv register
>
> On Tue, Aug 09, 2022 at 07:15:43PM +0200, Borislav Petkov wrote:
> > Yes, I think you do. Lemme write something and you can finish it/test
> > it.
>
> Here's something to only show what I'm thinking of. It doesn't build because
> of:
>
> drivers/acpi/apei/ghes.c: In function ‘ghes_do_proc’:
> drivers/acpi/apei/ghes.c:651:25: error: implicit declaration of function
> ‘ghes_edac_report_mem_error’; did you mean
> ‘arch_apei_report_mem_error’? [-Werror=implicit-function-declaration]
> 651 | ghes_edac_report_mem_error(sev,
> mem_err);
>
>
> and that needs more thinking what to do. My idea currently is to do a notifier
> like we do for MCA...

There is still a small gap even if we use notifier for ghes edac report mem error.

The notifier will be registered only when calling module_init() of ghes_edac module.
Then there will be different behavior for ghes_do_proc:
1. if ghes_do_proc is invoked in the acpi_init code path, the notifier has NOT been registered,
ghes_edac_report_mem_err() does nothing.
2.If ghes_do_proc is invoked after module_init(), the notifier is registered, everything is fine.

Is this strange or any other side effects?

--
Cheers,
Justin (Jia He)
>
> And I'm not really happy about returning a list of ghes devices just so
> ghes_edac instances can get their struct device * pointers. Maybe we'll think
> of something better.
>
> But this is the general goal: not call module code from builtin code - i.e., call
> ghes_edac.c code from ghes.c. It works now because ghes_edac is forced
> builtin which is ugly as hell.
>
> Anyway, enough for today...
>
> ---
> diff --git a/drivers/acpi/apei/ghes.c b/drivers/acpi/apei/ghes.c index
> d91ad378c00d..0919317b8313 100644
> --- a/drivers/acpi/apei/ghes.c
> +++ b/drivers/acpi/apei/ghes.c
> @@ -118,6 +118,9 @@ module_param_named(disable, ghes_disable, bool,
> 0); static LIST_HEAD(ghes_hed); static DEFINE_MUTEX(ghes_list_mutex);
>
> +static LIST_HEAD(ghes_devs);
> +static DEFINE_MUTEX(ghes_devs_mutex);
> +
> /*
> * Because the memory area used to transfer hardware error information
> * from BIOS to Linux can be determined only in NMI, IRQ or timer @@
> -1376,7 +1379,11 @@ static int ghes_probe(struct platform_device
> *ghes_dev)
>
> platform_set_drvdata(ghes_dev, ghes);
>
> - ghes_edac_register(ghes, &ghes_dev->dev);
> + ghes->dev = &ghes_dev->dev;
> +
> + mutex_lock(&ghes_devs_mutex);
> + list_add_tail(&ghes->elist, &ghes_devs);
> + mutex_unlock(&ghes_devs_mutex);
>
> /* Handle any pending errors right away */
> spin_lock_irqsave(&ghes_notify_lock_irq, flags); @@ -1440,8 +1447,6
> @@ static int ghes_remove(struct platform_device *ghes_dev)
>
> ghes_fini(ghes);
>
> - ghes_edac_unregister(ghes);
> -
> kfree(ghes);
>
> platform_set_drvdata(ghes_dev, NULL);
> @@ -1497,3 +1502,25 @@ void __init acpi_ghes_init(void)
> else
> pr_info(GHES_PFX "Failed to enable APEI firmware first
> mode.\n"); }
> +
> +/*
> + * Known x86 systems that prefer GHES error reporting:
> + */
> +static struct acpi_platform_list plat_list[] = {
> + {"HPE ", "Server ", 0, ACPI_SIG_FADT, all_versions},
> + { } /* End */
> +};
> +
> +struct list_head *ghes_get_devices(bool force) {
> + int idx = -1;
> +
> + if (IS_ENABLED(CONFIG_X86)) {
> + idx = acpi_match_platform_list(plat_list);
> + if (idx < 0 && !force)
> + return NULL;
> + }
> +
> + return &ghes_devs;
> +}
> +EXPORT_SYMBOL_GPL(ghes_get_devices);
> diff --git a/drivers/edac/Kconfig b/drivers/edac/Kconfig index
> 17562cf1fe97..df45db81858b 100644
> --- a/drivers/edac/Kconfig
> +++ b/drivers/edac/Kconfig
> @@ -53,8 +53,8 @@ config EDAC_DECODE_MCE
> has been initialized.
>
> config EDAC_GHES
> - bool "Output ACPI APEI/GHES BIOS detected errors via EDAC"
> - depends on ACPI_APEI_GHES && (EDAC=y)
> + tristate "Output ACPI APEI/GHES BIOS detected errors via EDAC"
> + depends on ACPI_APEI_GHES
> select UEFI_CPER
> help
> Not all machines support hardware-driven error report. Some of those
> diff --git a/drivers/edac/ghes_edac.c b/drivers/edac/ghes_edac.c index
> c8fa7dcfdbd0..da6d1a9e107d 100644
> --- a/drivers/edac/ghes_edac.c
> +++ b/drivers/edac/ghes_edac.c
> @@ -59,6 +59,8 @@ module_param(force_load, bool, 0);
>
> static bool system_scanned;
>
> +static struct list_head *ghes_devs;
> +
> /* Memory Device - Type 17 of SMBIOS spec */ struct memdev_dmi_entry
> {
> u8 type;
> @@ -376,34 +378,15 @@ void ghes_edac_report_mem_error(int sev, struct
> cper_sec_mem_err *mem_err)
> spin_unlock_irqrestore(&ghes_lock, flags); }
>
> -/*
> - * Known systems that are safe to enable this module.
> - */
> -static struct acpi_platform_list plat_list[] = {
> - {"HPE ", "Server ", 0, ACPI_SIG_FADT, all_versions},
> - { } /* End */
> -};
> -
> -int ghes_edac_register(struct ghes *ghes, struct device *dev)
> +static int ghes_edac_register(struct device *dev)
> {
> bool fake = false;
> struct mem_ctl_info *mci;
> struct ghes_pvt *pvt;
> struct edac_mc_layer layers[1];
> unsigned long flags;
> - int idx = -1;
> int rc = 0;
>
> - if (IS_ENABLED(CONFIG_X86)) {
> - /* Check if safe to enable on this system */
> - idx = acpi_match_platform_list(plat_list);
> - if (!force_load && idx < 0)
> - return -ENODEV;
> - } else {
> - force_load = true;
> - idx = 0;
> - }
> -
> /* finish another registration/unregistration instance first */
> mutex_lock(&ghes_reg_mutex);
>
> @@ -447,7 +430,7 @@ int ghes_edac_register(struct ghes *ghes, struct
> device *dev)
> pr_info("This system has a very crappy BIOS: It doesn't even list the
> DIMMS.\n");
> pr_info("Its SMBIOS info is wrong. It is doubtful that the error report
> would\n");
> pr_info("work on such system. Use this driver with caution\n");
> - } else if (idx < 0) {
> + } else if (force_load) {
> pr_info("This EDAC driver relies on BIOS to enumerate memory and
> get error reports.\n");
> pr_info("Unfortunately, not all BIOSes reflect the memory layout
> correctly.\n");
> pr_info("So, the end result of using this driver varies from vendor to
> vendor.\n"); @@ -517,7 +500,7 @@ int ghes_edac_register(struct ghes *ghes,
> struct device *dev)
> return rc;
> }
>
> -void ghes_edac_unregister(struct ghes *ghes)
> +static void ghes_edac_unregister(struct ghes *ghes)
> {
> struct mem_ctl_info *mci;
> unsigned long flags;
> @@ -551,3 +534,30 @@ void ghes_edac_unregister(struct ghes *ghes)
> unlock:
> mutex_unlock(&ghes_reg_mutex);
> }
> +
> +static int __init ghes_edac_init(void)
> +{
> + struct ghes *g, *g_tmp;
> +
> + ghes_devs = ghes_get_devices(force_load);
> + if (ghes_devs)
> + return -ENODEV;
> +
> + list_for_each_entry_safe(g, g_tmp, ghes_devs, elist) {
> + ghes_edac_register(g->dev);
> + }
> +
> + return 0;
> +}
> +module_init(ghes_edac_init);
> +
> +static void __exit ghes_edac_exit(void) {
> + struct ghes *g, *g_tmp;
> +
> +
> + list_for_each_entry_safe(g, g_tmp, ghes_devs, elist) {
> + ghes_edac_unregister(g);
> + }
> +}
> +module_exit(ghes_edac_exit);
> diff --git a/include/acpi/ghes.h b/include/acpi/ghes.h index
> 34fb3431a8f3..f39b75c3f9c6 100644
> --- a/include/acpi/ghes.h
> +++ b/include/acpi/ghes.h
> @@ -27,6 +27,8 @@ struct ghes {
> struct timer_list timer;
> unsigned int irq;
> };
> + struct device *dev;
> + struct list_head elist;
> };
>
> struct ghes_estatus_node {
> @@ -69,35 +71,13 @@ int ghes_register_vendor_record_notifier(struct
> notifier_block *nb);
> * @nb: pointer to the notifier_block structure of the vendor record handler.
> */
> void ghes_unregister_vendor_record_notifier(struct notifier_block *nb);
> +struct list_head *ghes_get_devices(bool force); #else static inline
> +struct list_head *ghes_get_devices(bool force) { return NULL; }
> #endif
>
> int ghes_estatus_pool_init(int num_ghes);
>
> -/* From drivers/edac/ghes_edac.c */
> -
> -#ifdef CONFIG_EDAC_GHES
> -void ghes_edac_report_mem_error(int sev, struct cper_sec_mem_err
> *mem_err);
> -
> -int ghes_edac_register(struct ghes *ghes, struct device *dev);
> -
> -void ghes_edac_unregister(struct ghes *ghes);
> -
> -#else
> -static inline void ghes_edac_report_mem_error(int sev,
> - struct cper_sec_mem_err *mem_err)
> -{
> -}
> -
> -static inline int ghes_edac_register(struct ghes *ghes, struct device *dev) -{
> - return -ENODEV;
> -}
> -
> -static inline void ghes_edac_unregister(struct ghes *ghes) -{ -} -#endif
> -
> static inline int acpi_hest_get_version(struct acpi_hest_generic_data *gdata)
> {
> return gdata->revision >> 8;
>
> --
> Regards/Gruss,
> Boris.
>
> https://people.kernel.org/tglx/notes-about-netiquette
IMPORTANT NOTICE: The contents of this email and any attachments are confidential and may also be privileged. If you are not the intended recipient, please notify the sender immediately and do not disclose the contents to any other person, use it for any purpose, or store or copy the information in any medium. Thank you.