[PATCH v2 2/2] ras: close the race condition with timer

From: Cong Wang
Date: Tue Apr 16 2019 - 17:34:17 EST


cec_timer_fn() is a timer callback which reads ce_arr.array[]
and updates its decay values. Elements can be added to or
removed from this global array in parallel, although the array
itself does not grow or shrink. del_lru_elem_unlocked() uses
FULL_COUNT() as the key to find the right element to remove,
and that value can be modified concurrently by the timer callback.

Fix this by converting the timer to delayed work, as suggested
by Borislav. The work function can then take ce_mutex around
do_spring_cleaning(), which avoids having to introduce a spinlock.

Fixes: 011d82611172 ("RAS: Add a Corrected Errors Collector")
Cc: Tony Luck <tony.luck@xxxxxxxxx>
Cc: Borislav Petkov <bp@xxxxxxxxx>
Cc: Thomas Gleixner <tglx@xxxxxxxxxxxxx>
Signed-off-by: Cong Wang <xiyou.wangcong@xxxxxxxxx>
---
drivers/ras/cec.c | 22 ++++++++++++----------
1 file changed, 12 insertions(+), 10 deletions(-)

diff --git a/drivers/ras/cec.c b/drivers/ras/cec.c
index a4ff54e50673..5c2040a7389d 100644
--- a/drivers/ras/cec.c
+++ b/drivers/ras/cec.c
@@ -2,6 +2,7 @@
#include <linux/mm.h>
#include <linux/gfp.h>
#include <linux/kernel.h>
+#include <linux/workqueue.h>

#include <asm/mce.h>

@@ -131,7 +132,7 @@ static unsigned int count_threshold = COUNT_MASK;
#define CEC_TIMER_DEFAULT_INTERVAL 24 * 60 * 60 /* 24 hrs */
#define CEC_TIMER_MIN_INTERVAL 1 * 60 * 60 /* 1h */
#define CEC_TIMER_MAX_INTERVAL 30 * 24 * 60 * 60 /* one month */
-static struct timer_list cec_timer;
+static struct delayed_work cec_work;
static u64 timer_interval = CEC_TIMER_DEFAULT_INTERVAL;

/*
@@ -160,20 +161,21 @@ static void do_spring_cleaning(struct ce_array *ca)
/*
* @interval in seconds
*/
-static void cec_mod_timer(struct timer_list *t, unsigned long interval)
+static void cec_mod_work(unsigned long interval)
{
unsigned long iv;

- iv = interval * HZ + jiffies;
-
- mod_timer(t, round_jiffies(iv));
+ iv = interval * HZ;
+ mod_delayed_work(system_wq, &cec_work, round_jiffies(iv));
}

-static void cec_timer_fn(struct timer_list *unused)
+static void cec_work_fn(struct work_struct *work)
{
+ mutex_lock(&ce_mutex);
do_spring_cleaning(&ce_arr);
+ mutex_unlock(&ce_mutex);

- cec_mod_timer(&cec_timer, timer_interval);
+ cec_mod_work(timer_interval);
}

/*
@@ -383,7 +385,7 @@ static int decay_interval_set(void *data, u64 val)

timer_interval = val;

- cec_mod_timer(&cec_timer, timer_interval);
+ cec_mod_work(timer_interval);
return 0;
}
DEFINE_DEBUGFS_ATTRIBUTE(decay_interval_ops, u64_get, decay_interval_set, "%lld\n");
@@ -509,8 +511,8 @@ void __init cec_init(void)
if (create_debugfs_nodes())
return;

- timer_setup(&cec_timer, cec_timer_fn, 0);
- cec_mod_timer(&cec_timer, CEC_TIMER_DEFAULT_INTERVAL);
+ INIT_DELAYED_WORK(&cec_work, cec_work_fn);
+ schedule_delayed_work(&cec_work, CEC_TIMER_DEFAULT_INTERVAL);

pr_info("Correctable Errors collector initialized.\n");
}
--
2.20.1