Re: [PATCH v9 2/2] PCI: hip: Add handling of HiSilicon HIP PCIe controller errors

From: Andy Shevchenko
Date: Mon Jun 15 2020 - 08:00:58 EST


On Mon, Jun 15, 2020 at 11:15:52AM +0100, Shiju Jose wrote:
> From: Yicong Yang <yangyicong@xxxxxxxxxxxxx>
>
> The HiSilicon HIP PCIe controller is capable of handling errors
> on root port and perform port reset separately at each root port.
>
> Add error handling driver for HIP PCIe controller to log
> and report recoverable errors. Perform root port reset and restore
> link status after the recovery.
>
> Following are some of the PCIe controller's recoverable errors
> 1. completion transmission timeout error.
> 2. CRS retry counter over the threshold error.
> 3. ECC 2 bit errors
> 4. AXI bresponse/rresponse errors etc.
>
> The driver placed in the drivers/pci/controller/ because the
> HIP PCIe controller does not use DWC ip.

...

> +#include <linux/acpi.h>
> +#include <acpi/ghes.h>

bits.h ?

> +#include <linux/delay.h>
> +#include <linux/pci.h>
> +#include <linux/platform_device.h>
> +#include <linux/kfifo.h>
> +#include <linux/spinlock.h>

...

> +static guid_t hisi_pcie_sec_type = GUID_INIT(0xB2889FC9, 0xE7D7, 0x4F9D,
> + 0xA8, 0x67, 0xAF, 0x42, 0xE9, 0x8B, 0xE7, 0x72);

Can we have it in more common pattern, i.e.

static guid_t hisi_pcie_sec_type =
GUID_INIT(0xB2889FC9, 0xE7D7, 0x4F9D,
0xA8, 0x67, 0xAF, 0x42, 0xE9, 0x8B, 0xE7, 0x72);
?

...

> +#define HISI_PCIE_CORE_PORT_ID(v) (((v) % 8) << 1)

% -> & ?

...

> +struct hisi_pcie_error_private {
> + struct notifier_block nb;
> + struct platform_device *pdev;

Do you really need platform device? Isn't struct device * enough?

> +};

...

> +static char *hisi_pcie_sub_module_name(u8 id)
> +{
> + switch (id) {
> + case HISI_PCIE_SUB_MODULE_ID_AP: return "AP Layer";
> + case HISI_PCIE_SUB_MODULE_ID_TL: return "TL Layer";
> + case HISI_PCIE_SUB_MODULE_ID_MAC: return "MAC Layer";
> + case HISI_PCIE_SUB_MODULE_ID_DL: return "DL Layer";
> + case HISI_PCIE_SUB_MODULE_ID_SDI: return "SDI Layer";
> + }

match_string() ?

> + return "unknown";

> +}
> +
> +static char *hisi_pcie_error_severity(u8 err_sev)
> +{
> + switch (err_sev) {
> + case HISI_ERR_SEV_RECOVERABLE: return "recoverable";
> + case HISI_ERR_SEV_FATAL: return "fatal";
> + case HISI_ERR_SEV_CORRECTED: return "corrected";
> + case HISI_ERR_SEV_NONE: return "none";
> + }

Ditto?

> + return "unknown";
> +}

...

> + pdev = pci_get_domain_bus_and_slot(domain, busnr, devfn);
> + if (!pdev) {

> + dev_info(device, "Fail to get root port %04x:%02x:%02x.%d device\n",
> + domain, busnr, PCI_SLOT(devfn), PCI_FUNC(devfn));

pci_info() ?

> + return -ENODEV;
> + }

...

> + /*
> + * The initialization time of subordinate devices after
> + * hot reset is no more than 1s, which is required by
> + * the PCI spec v5.0 sec 6.6.1. The time will shorten
> + * if Readiness Notifications mechanisms are used. But
> + * wait 1s here to adapt any conditions.
> + */
> + ssleep(1UL);

It's a huge time out... Can we reduce it somehow?

...

> + for (i = 0; i < HISI_PCIE_ERR_MISC_REGS; i++) {
> + if (edata->val_bits &
> + BIT_ULL(HISI_PCIE_LOCAL_VALID_ERR_MISC + i))

for_each_set_bit() ?

> + dev_info(dev,
> + "ERR_MISC_%d = 0x%x\n", i, edata->err_misc[i]);
> + }

> +
> + /* Recovery for the PCIe controller errors */
> + if (edata->err_severity == HISI_ERR_SEV_RECOVERABLE) {

Perhaps negative conditional?

> + /* try reset PCI port for the error recovery */
> + rc = hisi_pcie_port_do_recovery(pdev, edata->socket_id,
> + HISI_PCIE_PORT_ID(edata->core_id, edata->port_id));
> + if (rc) {
> + dev_info(dev, "fail to do hisi pcie port reset\n");

> + return;

redundant.

> + }
> + }

...

> + const struct hisi_pcie_error_data *error_data =
> + acpi_hest_get_payload(gdata);

One line is better to read.

> + struct platform_device *pdev = priv->pdev;

> + hisi_pcie_handle_error(pdev, error_data);

And how exactly _platform_ device pointer is being used?

...


> + dev_err(&pdev->dev, "%s : ghes_register_event_notifier fail\n",
> + __func__);

Make error message more descriptive that __func__ will not be needed.

...

> + kfree(priv);

Double free?

...

> +static const struct acpi_device_id hisi_pcie_acpi_match[] = {
> + { "HISI0361", 0 },

', 0' part is not necessary to have.

> + { }
> +};

--
With Best Regards,
Andy Shevchenko