Re: [PATCH] EDAC/device: Add sysfs notification for UE,CE count change

From: Adrien Thierry
Date: Fri Sep 29 2023 - 14:41:42 EST


Hi Deepti,

On Mon, Jul 31, 2023 at 03:00:59PM -0700, Deepti Jaggi wrote:
> A daemon running in user space collects information on correctable
> and uncorrectable errors from EDAC driver by reading corresponding
> sysfs entries and takes appropriate action.
> This patch adds support for user space daemon to wait on poll() until
> the sysfs entries for UE count and CE count change and then read updated
> counts instead of continuously monitoring the sysfs entries for
> any changes.
>
> Signed-off-by: Deepti Jaggi <quic_djaggi@xxxxxxxxxxx>
> ---
> drivers/edac/edac_device.c | 16 ++++++++++++++++
> drivers/edac/edac_device.h | 8 ++++++++
> drivers/edac/edac_device_sysfs.c | 20 ++++++++++++++++++++
> 3 files changed, 44 insertions(+)
>
> diff --git a/drivers/edac/edac_device.c b/drivers/edac/edac_device.c
> index 8c4d947fb848..7b7aec4da6b9 100644
> --- a/drivers/edac/edac_device.c
> +++ b/drivers/edac/edac_device.c
> @@ -587,12 +587,20 @@ void edac_device_handle_ce_count(struct edac_device_ctl_info *edac_dev,
> if (instance->nr_blocks > 0) {
> block = instance->blocks + block_nr;
> block->counters.ce_count += count;
> +
> + /* Notify block sysfs attribute change */
> + if (block->kn_ce)
> + sysfs_notify_dirent(block->kn_ce);
> }
>
> /* Propagate the count up the 'totals' tree */
> instance->counters.ce_count += count;
> edac_dev->counters.ce_count += count;
>
> + /* Notify instance sysfs attribute change */
> + if (instance->kn_ce)
> + sysfs_notify_dirent(instance->kn_ce);
> +
> if (edac_device_get_log_ce(edac_dev))
> edac_device_printk(edac_dev, KERN_WARNING,
> "CE: %s instance: %s block: %s count: %d '%s'\n",
> @@ -633,12 +641,20 @@ void edac_device_handle_ue_count(struct edac_device_ctl_info *edac_dev,
> if (instance->nr_blocks > 0) {
> block = instance->blocks + block_nr;
> block->counters.ue_count += count;
> +
> + /* Notify block sysfs attribute change */
> + if (block->kn_ue)
> + sysfs_notify_dirent(block->kn_ue);
> }
>
> /* Propagate the count up the 'totals' tree */
> instance->counters.ue_count += count;
> edac_dev->counters.ue_count += count;
>
> + /* Notify instance sysfs attribute change */
> + if (instance->kn_ue)
> + sysfs_notify_dirent(instance->kn_ue);
> +
> if (edac_device_get_log_ue(edac_dev))
> edac_device_printk(edac_dev, KERN_EMERG,
> "UE: %s instance: %s block: %s count: %d '%s'\n",
> diff --git a/drivers/edac/edac_device.h b/drivers/edac/edac_device.h
> index fc2d2c218064..459514ea549e 100644
> --- a/drivers/edac/edac_device.h
> +++ b/drivers/edac/edac_device.h
> @@ -127,6 +127,10 @@ struct edac_device_block {
>
> /* edac sysfs device control */
> struct kobject kobj;
> +
> + /* kern fs node for block ue_count and ce count attributes*/
> + struct kernfs_node *kn_ue;
> + struct kernfs_node *kn_ce;
> };
>
> /* device instance control structure */
> @@ -141,6 +145,10 @@ struct edac_device_instance {
>
> /* edac sysfs device control */
> struct kobject kobj;
> +
> + /* kern fs node for block ue_count and ce count attributes*/
> + struct kernfs_node *kn_ue;
> + struct kernfs_node *kn_ce;
> };
>
>
> diff --git a/drivers/edac/edac_device_sysfs.c b/drivers/edac/edac_device_sysfs.c
> index 5e7593753799..d1e04a9296c7 100644
> --- a/drivers/edac/edac_device_sysfs.c
> +++ b/drivers/edac/edac_device_sysfs.c
> @@ -562,6 +562,13 @@ static int edac_device_create_block(struct edac_device_ctl_info *edac_dev,
> }
> kobject_uevent(&block->kobj, KOBJ_ADD);
>
> + /*
> + * Save kernfs pointer for ue count and ce count
> + * to notify from any context when attributes change
> + */
> + block->kn_ue = sysfs_get_dirent(block->kobj.sd, "ue_count");
> + block->kn_ce = sysfs_get_dirent(block->kobj.sd, "ce_count");
> +
> return 0;
>
> /* Error unwind stack */
> @@ -594,6 +601,9 @@ static void edac_device_delete_block(struct edac_device_ctl_info *edac_dev,
> }
> }
>
> + block->kn_ue = NULL;
> + block->kn_ce = NULL;
> +

Isn't there a possibility of a race condition here? It seems to me that
between the moment the attribute files are removed with
sysfs_remove_file() a few lines above, and the moment block->kn_ue and
block->kn_ce are set to NULL, sysfs_notify_dirent() can be called from
edac_device_handle_ce_count() with a block->kn_ce that refers to a
deleted file.

> /* unregister this block's kobject, SEE:
> * edac_device_ctrl_block_release() callback operation
> */
> @@ -660,6 +670,13 @@ static int edac_device_create_instance(struct edac_device_ctl_info *edac_dev,
> edac_dbg(4, "Registered instance %d '%s' kobject\n",
> idx, instance->name);
>
> + /*
> + * Save kernfs pointer for ue count and ce count
> + * to notify from any context when attributes change
> + */
> + instance->kn_ue = sysfs_get_dirent(instance->kobj.sd, "ue_count");
> + instance->kn_ce = sysfs_get_dirent(instance->kobj.sd, "ce_count");
> +
> return 0;
>
> /* error unwind stack */
> @@ -682,6 +699,9 @@ static void edac_device_delete_instance(struct edac_device_ctl_info *edac_dev,
>
> instance = &edac_dev->instances[idx];
>
> + instance->kn_ue = NULL;
> + instance->kn_ce = NULL;
> +
> /* unregister all blocks in this instance */
> for (i = 0; i < instance->nr_blocks; i++)
> edac_device_delete_block(edac_dev, &instance->blocks[i]);
> --
> 2.31.1
>

Best,
Adrien