Re: [PATCH 5/5] thermal/core: Move the mutex inside the thermal_zone_device_update() function

From: Daniel Lezcano
Date: Fri Aug 12 2022 - 09:17:58 EST



Hi Marek,

thanks for reporting.

Are you ok to check if the patch I've sent fixes the issue on your platform ?


On 12/08/2022 12:39, Marek Szyprowski wrote:
Hi Daniel,

On 05.08.2022 17:38, Daniel Lezcano wrote:
All the different calls inside the thermal_zone_device_update()
function take the mutex.

The previous changes move the mutex out of the different functions,
like the throttling ops. Now that the mutexes are all at the same
level in the call stack for the thermal_zone_device_update() function,
they can be moved inside this one.

That has the benefit of:

1. Simplify the code by not having a plethora of places where the lock is taken

2. Probably closes more race windows because releasing the lock from
one line to another can give the opportunity to the thermal zone to change
its state in the meantime. For example, the thermal zone can be
enabled right after checking it is disabled.

Signed-off-by: Daniel Lezcano <daniel.lezcano@xxxxxxxxxx>

This patch landed recently in linux next-20220811 as commit ca48ad71717d
("thermal/core: Move the mutex inside the thermal_zone_device_update()
function"). Unfortunately it triggers a warning on Samsung ARM/ARM64
Exynos-based systems during the system suspend/resume cycle:

Restarting tasks ... done.
random: crng reseeded on system resumption
------------[ cut here ]------------
WARNING: CPU: 1 PID: 1374 at drivers/thermal/thermal_core.c:452
thermal_zone_device_is_enabled+0x58/0x5c
Modules linked in: cmac bnep hci_uart btbcm btintel bluetooth s5p_csis
s5p_fimc exynos4_is_common v4l2_fwnode v4l2_async ecdh_generic ecc
s5p_mfc brcmfmac cfg80211 s5p_jpeg videobuf2_dma_contig videobuf2_memops
brcmutil v4l2_mem2mem videobuf2_v4l2 videobuf2_common videodev mc
CPU: 1 PID: 1374 Comm: rtcwake Not tainted 5.18.0-02136-gca48ad71717d #12560
Hardware name: Samsung Exynos (Flattened Device Tree)
 unwind_backtrace from show_stack+0x10/0x14
 show_stack from dump_stack_lvl+0x58/0x70
 dump_stack_lvl from __warn+0x230/0x234
 __warn from warn_slowpath_fmt+0xac/0xb4
 warn_slowpath_fmt from thermal_zone_device_is_enabled+0x58/0x5c
 thermal_zone_device_is_enabled from thermal_pm_notify+0x84/0xe8
 thermal_pm_notify from blocking_notifier_call_chain+0x6c/0x94
 blocking_notifier_call_chain from pm_suspend+0x2e8/0x428
 pm_suspend from state_store+0x68/0xc8
 state_store from kernfs_fop_write_iter+0x108/0x1b0
 kernfs_fop_write_iter from vfs_write+0x268/0x50c
 vfs_write from ksys_write+0x54/0xc8
 ksys_write from ret_fast_syscall+0x0/0x1c
Exception stack(0xf0fa9fa8 to 0xf0fa9ff0)
...
irq event stamp: 49341
hardirqs last  enabled at (49349): [<c019d3f0>] __up_console_sem+0x50/0x60
hardirqs last disabled at (49358): [<c019d3dc>] __up_console_sem+0x3c/0x60
softirqs last  enabled at (49336): [<c0101808>] __do_softirq+0x4c8/0x5e8
softirqs last disabled at (49331): [<c01305b8>] irq_exit+0x1cc/0x200
---[ end trace 0000000000000000 ]---
PM: suspend exit


It looks that either the exynos_thermal driver or the framework has to
be somehow adjusted to avoid the above issue, but I didn't check the
details in the code yet.


---
drivers/thermal/thermal_core.c | 32 +++++--------
drivers/thermal/thermal_core.h | 2 +
drivers/thermal/thermal_helpers.c | 75 ++++++++++++++++++-------------
drivers/thermal/thermal_sysfs.c | 6 ++-
4 files changed, 62 insertions(+), 53 deletions(-)

diff --git a/drivers/thermal/thermal_core.c b/drivers/thermal/thermal_core.c
index 9d554f97e081..60110ac53e23 100644
--- a/drivers/thermal/thermal_core.c
+++ b/drivers/thermal/thermal_core.c
@@ -297,24 +297,18 @@ static void thermal_zone_device_set_polling(struct thermal_zone_device *tz,
static void monitor_thermal_zone(struct thermal_zone_device *tz)
{
- mutex_lock(&tz->lock);
-
if (tz->mode != THERMAL_DEVICE_ENABLED)
thermal_zone_device_set_polling(tz, 0);
else if (tz->passive)
thermal_zone_device_set_polling(tz, tz->passive_delay_jiffies);
else if (tz->polling_delay_jiffies)
thermal_zone_device_set_polling(tz, tz->polling_delay_jiffies);
-
- mutex_unlock(&tz->lock);
}
static void handle_non_critical_trips(struct thermal_zone_device *tz, int trip)
{
- mutex_lock(&tz->lock);
tz->governor ? tz->governor->throttle(tz, trip) :
def_governor->throttle(tz, trip);
- mutex_unlock(&tz->lock);
}
void thermal_zone_device_critical(struct thermal_zone_device *tz)
@@ -382,7 +376,7 @@ static void update_temperature(struct thermal_zone_device *tz)
{
int temp, ret;
- ret = thermal_zone_get_temp(tz, &temp);
+ ret = __thermal_zone_get_temp(tz, &temp);
if (ret) {
if (ret != -EAGAIN)
dev_warn(&tz->device,
@@ -391,10 +385,8 @@ static void update_temperature(struct thermal_zone_device *tz)
return;
}
- mutex_lock(&tz->lock);
tz->last_temperature = tz->temperature;
tz->temperature = temp;
- mutex_unlock(&tz->lock);
trace_thermal_temperature(tz);
@@ -457,15 +449,9 @@ EXPORT_SYMBOL_GPL(thermal_zone_device_disable);
int thermal_zone_device_is_enabled(struct thermal_zone_device *tz)
{
- enum thermal_device_mode mode;
-
- mutex_lock(&tz->lock);
-
- mode = tz->mode;
+ lockdep_assert_held(&tz->lock);
- mutex_unlock(&tz->lock);
-
- return mode == THERMAL_DEVICE_ENABLED;
+ return tz->mode == THERMAL_DEVICE_ENABLED;
}
void thermal_zone_device_update(struct thermal_zone_device *tz,
@@ -473,9 +459,6 @@ void thermal_zone_device_update(struct thermal_zone_device *tz,
{
int count;
- if (!thermal_zone_device_is_enabled(tz))
- return;
-
if (atomic_read(&in_suspend))
return;
@@ -483,9 +466,14 @@ void thermal_zone_device_update(struct thermal_zone_device *tz,
"'get_temp' ops set\n", __func__))
return;
+ mutex_lock(&tz->lock);
+
+ if (!thermal_zone_device_is_enabled(tz))
+ goto out;
+
update_temperature(tz);
- thermal_zone_set_trips(tz);
+ __thermal_zone_set_trips(tz);
tz->notify_event = event;
@@ -493,6 +481,8 @@ void thermal_zone_device_update(struct thermal_zone_device *tz,
handle_thermal_trip(tz, count);
monitor_thermal_zone(tz);
+out:
+ mutex_unlock(&tz->lock);
}
EXPORT_SYMBOL_GPL(thermal_zone_device_update);
diff --git a/drivers/thermal/thermal_core.h b/drivers/thermal/thermal_core.h
index 2241d2dce017..1571917bd3c8 100644
--- a/drivers/thermal/thermal_core.h
+++ b/drivers/thermal/thermal_core.h
@@ -112,6 +112,8 @@ int thermal_build_list_of_policies(char *buf);
/* Helpers */
void thermal_zone_set_trips(struct thermal_zone_device *tz);
+void __thermal_zone_set_trips(struct thermal_zone_device *tz);
+int __thermal_zone_get_temp(struct thermal_zone_device *tz, int *temp);
/* sysfs I/F */
int thermal_zone_create_device_groups(struct thermal_zone_device *, int);
diff --git a/drivers/thermal/thermal_helpers.c b/drivers/thermal/thermal_helpers.c
index 690890f054a3..702c70bdca48 100644
--- a/drivers/thermal/thermal_helpers.c
+++ b/drivers/thermal/thermal_helpers.c
@@ -64,27 +64,17 @@ get_thermal_instance(struct thermal_zone_device *tz,
}
EXPORT_SYMBOL(get_thermal_instance);
-/**
- * thermal_zone_get_temp() - returns the temperature of a thermal zone
- * @tz: a valid pointer to a struct thermal_zone_device
- * @temp: a valid pointer to where to store the resulting temperature.
- *
- * When a valid thermal zone reference is passed, it will fetch its
- * temperature and fill @temp.
- *
- * Return: On success returns 0, an error code otherwise
- */
-int thermal_zone_get_temp(struct thermal_zone_device *tz, int *temp)
+int __thermal_zone_get_temp(struct thermal_zone_device *tz, int *temp)
{
int ret = -EINVAL;
int count;
int crit_temp = INT_MAX;
enum thermal_trip_type type;
+ lockdep_assert_held(&tz->lock);
+
if (!tz || IS_ERR(tz) || !tz->ops->get_temp)
- goto exit;
-
- mutex_lock(&tz->lock);
+ return -EINVAL;
ret = tz->ops->get_temp(tz, temp);
@@ -107,35 +97,42 @@ int thermal_zone_get_temp(struct thermal_zone_device *tz, int *temp)
*temp = tz->emul_temperature;
}
- mutex_unlock(&tz->lock);
-exit:
return ret;
}
-EXPORT_SYMBOL_GPL(thermal_zone_get_temp);
/**
- * thermal_zone_set_trips - Computes the next trip points for the driver
- * @tz: a pointer to a thermal zone device structure
+ * thermal_zone_get_temp() - returns the temperature of a thermal zone
+ * @tz: a valid pointer to a struct thermal_zone_device
+ * @temp: a valid pointer to where to store the resulting temperature.
*
- * The function computes the next temperature boundaries by browsing
- * the trip points. The result is the closer low and high trip points
- * to the current temperature. These values are passed to the backend
- * driver to let it set its own notification mechanism (usually an
- * interrupt).
+ * When a valid thermal zone reference is passed, it will fetch its
+ * temperature and fill @temp.
*
- * It does not return a value
+ * Return: On success returns 0, an error code otherwise
*/
-void thermal_zone_set_trips(struct thermal_zone_device *tz)
+int thermal_zone_get_temp(struct thermal_zone_device *tz, int *temp)
+{
+ int ret;
+
+ mutex_lock(&tz->lock);
+ ret = __thermal_zone_get_temp(tz, temp);
+ mutex_unlock(&tz->lock);
+
+ return ret;
+}
+EXPORT_SYMBOL_GPL(thermal_zone_get_temp);
+
+void __thermal_zone_set_trips(struct thermal_zone_device *tz)
{
int low = -INT_MAX;
int high = INT_MAX;
int trip_temp, hysteresis;
int i, ret;
- mutex_lock(&tz->lock);
-
+ lockdep_assert_held(&tz->lock);
+
if (!tz->ops->set_trips || !tz->ops->get_trip_hyst)
- goto exit;
+ return;
for (i = 0; i < tz->num_trips; i++) {
int trip_low;
@@ -154,7 +151,7 @@ void thermal_zone_set_trips(struct thermal_zone_device *tz)
/* No need to change trip points */
if (tz->prev_low_trip == low && tz->prev_high_trip == high)
- goto exit;
+ return;
tz->prev_low_trip = low;
tz->prev_high_trip = high;
@@ -169,8 +166,24 @@ void thermal_zone_set_trips(struct thermal_zone_device *tz)
ret = tz->ops->set_trips(tz, low, high);
if (ret)
dev_err(&tz->device, "Failed to set trips: %d\n", ret);
+}
-exit:
+/**
+ * thermal_zone_set_trips - Computes the next trip points for the driver
+ * @tz: a pointer to a thermal zone device structure
+ *
+ * The function computes the next temperature boundaries by browsing
+ * the trip points. The result is the closer low and high trip points
+ * to the current temperature. These values are passed to the backend
+ * driver to let it set its own notification mechanism (usually an
+ * interrupt).
+ *
+ * It does not return a value
+ */
+void thermal_zone_set_trips(struct thermal_zone_device *tz)
+{
+ mutex_lock(&tz->lock);
+ __thermal_zone_set_trips(tz);
mutex_unlock(&tz->lock);
}
diff --git a/drivers/thermal/thermal_sysfs.c b/drivers/thermal/thermal_sysfs.c
index 3c513561d346..f094f7cbc455 100644
--- a/drivers/thermal/thermal_sysfs.c
+++ b/drivers/thermal/thermal_sysfs.c
@@ -49,7 +49,11 @@ static ssize_t
mode_show(struct device *dev, struct device_attribute *attr, char *buf)
{
struct thermal_zone_device *tz = to_thermal_zone(dev);
- int enabled = thermal_zone_device_is_enabled(tz);
+ int enabled;
+
+ mutex_lock(&tz->lock);
+ enabled = thermal_zone_device_is_enabled(tz);
+ mutex_unlock(&tz->lock);
return sprintf(buf, "%s\n", enabled ? "enabled" : "disabled");
}

Best regards


--
<http://www.linaro.org/> Linaro.org │ Open source software for ARM SoCs

Follow Linaro: <http://www.facebook.com/pages/Linaro> Facebook |
<http://twitter.com/#!/linaroorg> Twitter |
<http://www.linaro.org/linaro-blog/> Blog