Re: [PATCH] power: suspend: Add suspend timeout handler

From: Greg Kroah-Hartman
Date: Fri Oct 16 2020 - 01:44:01 EST


On Fri, Oct 16, 2020 at 11:51:09AM +0800, Joseph Jang wrote:
> From: josephjang <josephjang@xxxxxxxxxx>

Please use your name as spelled out like you did above in the email
header.

>
> Add suspend timeout handler to prevent device stuck during suspend/
> resume process. Suspend timeout handler will dump disk sleep task
> at first round timeout and trigger kernel panic at second round timeout.
> The default timer for each round is 30 seconds.
>
> Note: Can use following command to simulate suspend hang for testing.
> adb shell echo 1 > /sys/power/pm_hang
> adb shell echo mem > /sys/power/state
> Signed-off-by: josephjang <josephjang@xxxxxxxxxx>

Need a blank line before the signed-off-by: and again, spell your name
the same way.



> ---
> include/linux/console.h | 1 +
> kernel/power/Kconfig | 9 +++
> kernel/power/main.c | 66 ++++++++++++++++
> kernel/power/suspend.c | 162 ++++++++++++++++++++++++++++++++++++++++
> kernel/printk/printk.c | 5 ++
> 5 files changed, 243 insertions(+)
>
> diff --git a/include/linux/console.h b/include/linux/console.h
> index 0670d3491e0e..ac468c602c0b 100644
> --- a/include/linux/console.h
> +++ b/include/linux/console.h
> @@ -192,6 +192,7 @@ static inline void console_sysfs_notify(void)
> { }
> #endif
> extern bool console_suspend_enabled;
> +extern int is_console_suspended(void);

For global functions, how about:
bool console_is_suspended(void);
?

>
> /* Suspend and resume console messages over PM events */
> extern void suspend_console(void);
> diff --git a/kernel/power/Kconfig b/kernel/power/Kconfig
> index a7320f07689d..52b7a181b6d8 100644
> --- a/kernel/power/Kconfig
> +++ b/kernel/power/Kconfig
> @@ -207,6 +207,15 @@ config PM_SLEEP_DEBUG
> def_bool y
> depends on PM_DEBUG && PM_SLEEP
>
> +config PM_SLEEP_MONITOR
> + bool "Linux kernel suspend/resume process monitor"
> + depends on PM_SLEEP
> + help
> + This option will enable suspend/resume monitor to prevent device
> + stuck during suspend/resume process. Suspend timeout handler will
> + dump disk sleep task at first round timeout and trigger kernel panic
> + at second round timeout. The default timer for each round is 30 seconds.

Ouch, are you sure you want to panic?


> +
> config DPM_WATCHDOG
> bool "Device suspend/resume watchdog"
> depends on PM_DEBUG && PSTORE && EXPERT
> diff --git a/kernel/power/main.c b/kernel/power/main.c
> index 40f86ec4ab30..f25b8a47583e 100644
> --- a/kernel/power/main.c
> +++ b/kernel/power/main.c
> @@ -575,6 +575,69 @@ void __pm_pr_dbg(bool defer, const char *fmt, ...)
> static inline void pm_print_times_init(void) {}
> #endif /* CONFIG_PM_SLEEP_DEBUG */
>
> +#ifdef CONFIG_PM_SLEEP_MONITOR
> +/* If set, devices will stuck at suspend for verification */
> +static bool pm_hang_enabled;
> +
> +static int pm_notify_test(struct notifier_block *nb,
> + unsigned long mode, void *_unused)
> +{
> + pr_info("Jump into infinite loop now\n");

Why do you have debugging code still enabled?

> +
> + /* Suspend thread stuck at a loop forever */
> + for (;;)
> + ;
> +

Don't busy spin, that will burn power.


> + pr_info("Fail to stuck at loop\n");

And how can this happen?

> +
> + return 0;
> +}
> +
> +static struct notifier_block pm_notify_nb = {
> + .notifier_call = pm_notify_test,
> +};
> +
> +static ssize_t pm_hang_show(struct kobject *kobj, struct kobj_attribute *attr,
> + char *buf)
> +{
> + return snprintf(buf, 10, "%d\n", pm_hang_enabled);
> +}
> +
> +static ssize_t pm_hang_store(struct kobject *kobj, struct kobj_attribute *attr,
> + const char *buf, size_t n)
> +{
> + unsigned long val;
> + int result;
> +
> + if (kstrtoul(buf, 10, &val))
> + return -EINVAL;
> +
> + if (val > 1)
> + return -EINVAL;
> +
> + pm_hang_enabled = !!val;
> +
> + if (pm_hang_enabled == true) {
> +
> + result = register_pm_notifier(&pm_notify_nb);
> + if (result)
> + pr_warn("Can not register suspend notifier, return %d\n",
> + result);
> +
> + } else {
> +
> + result = unregister_pm_notifier(&pm_notify_nb);
> + if (result)
> + pr_warn("Can not unregister suspend notifier, return %d\n",
> + result);
> + }
> +
> + return n;
> +}
> +
> +power_attr(pm_hang);
> +#endif
> +
> struct kobject *power_kobj;
>
> /**
> @@ -909,6 +972,9 @@ static struct attribute * g[] = {
> &pm_wakeup_irq_attr.attr,
> &pm_debug_messages_attr.attr,
> #endif
> +#ifdef CONFIG_PM_SLEEP_MONITOR
> + &pm_hang_attr.attr,

You added a sysfs file, but no Documentation/ABI/ update? That's not
ok.


> +#endif
> #endif
> #ifdef CONFIG_FREEZER
> &pm_freeze_timeout_attr.attr,
> diff --git a/kernel/power/suspend.c b/kernel/power/suspend.c
> index 8b1bb5ee7e5d..6f2679cfd9d1 100644
> --- a/kernel/power/suspend.c
> +++ b/kernel/power/suspend.c
> @@ -30,6 +30,12 @@
> #include <trace/events/power.h>
> #include <linux/compiler.h>
> #include <linux/moduleparam.h>
> +#ifdef CONFIG_PM_SLEEP_MONITOR
> +#include <linux/sched/debug.h>
> +#include <linux/kthread.h>
> +#include <linux/sched.h>
> +#include <uapi/linux/sched/types.h>
> +#endif

Don't #ifdef include files.

And why the uapi file?

>
> #include "power.h"
>
> @@ -61,6 +67,133 @@ static DECLARE_SWAIT_QUEUE_HEAD(s2idle_wait_head);
> enum s2idle_states __read_mostly s2idle_state;
> static DEFINE_RAW_SPINLOCK(s2idle_lock);
>
> +#ifdef CONFIG_PM_SLEEP_MONITOR
> +/* Suspend monitor thread toggle reason */
> +enum toggle_reason {
> + TOGGLE_NONE,
> + TOGGLE_START,
> + TOGGLE_STOP,
> +};
> +
> +#define SUSPEND_TIMER_TIMEOUT_MS 30000
> +static struct task_struct *ksuspend_mon_tsk;
> +static DECLARE_WAIT_QUEUE_HEAD(power_suspend_waitqueue);
> +static enum toggle_reason suspend_mon_toggle;
> +static DEFINE_MUTEX(suspend_mon_lock);
> +
> +static void start_suspend_mon(void)
> +{
> + mutex_lock(&suspend_mon_lock);
> + suspend_mon_toggle = TOGGLE_START;
> + mutex_unlock(&suspend_mon_lock);

Why do you need a lock for a single integer?

> + wake_up(&power_suspend_waitqueue);
> +}
> +
> +static void stop_suspend_mon(void)
> +{
> + mutex_lock(&suspend_mon_lock);
> + suspend_mon_toggle = TOGGLE_STOP;
> + mutex_unlock(&suspend_mon_lock);
> + wake_up(&power_suspend_waitqueue);
> +}
> +
> +static void suspend_timeout(int timeout_count)
> +{
> + char *null_pointer = NULL;
> +
> + pr_info("Suspend monitor timeout (timer is %d seconds)\n",
> + (SUSPEND_TIMER_TIMEOUT_MS/1000));
> +
> + show_state_filter(TASK_UNINTERRUPTIBLE);
> +
> + if (timeout_count < 2)
> + return;
> +
> + if (is_console_suspended())
> + resume_console();
> +
> + pr_info("Trigger a panic\n");

Again, debugging code?

> +
> + /* Trigger a NULL pointer dereference */
> + *null_pointer = 'a';

Are you sure this will work on all platforms? We do have a panic
function if you really want to do that.

> +
> + /* Should not reach here */
> + pr_err("Trigger panic failed!\n");
> +}
> +
> +static int suspend_monitor_kthread(void *arg)
> +{
> + long err;
> + struct sched_param param = {.sched_priority
> + = MAX_RT_PRIO-1};

Ick, no, call the scheduler functions properly, don't do this "by hand"
ever.

> + static int timeout_count;
> + static long timeout;
> +
> + pr_info("Init ksuspend_mon thread\n");

Again, debugging code :(

> +
> + sched_setscheduler(current, SCHED_FIFO, &param);
> +
> + timeout_count = 0;
> + timeout = MAX_SCHEDULE_TIMEOUT;
> +
> + do {
> + /* Wait suspend timer timeout */
> + err = wait_event_interruptible_timeout(
> + power_suspend_waitqueue,
> + (suspend_mon_toggle != TOGGLE_NONE),
> + timeout);
> +
> + mutex_lock(&suspend_mon_lock);
> + /* suspend monitor state change */
> + if (suspend_mon_toggle != TOGGLE_NONE) {
> + if (suspend_mon_toggle == TOGGLE_START) {
> + timeout = msecs_to_jiffies(
> + SUSPEND_TIMER_TIMEOUT_MS);
> + pr_info("Start suspend monitor\n");
> + } else if (suspend_mon_toggle == TOGGLE_STOP) {
> + timeout = MAX_SCHEDULE_TIMEOUT;
> + timeout_count = 0;
> + pr_info("Stop suspend monitor\n");
> + }
> + suspend_mon_toggle = TOGGLE_NONE;
> + mutex_unlock(&suspend_mon_lock);
> + continue;
> + }
> + mutex_unlock(&suspend_mon_lock);
> +
> + /* suspend monitor event handler */
> + if (err == 0) {
> + timeout_count++;
> + suspend_timeout(timeout_count);
> + } else if (err == -ERESTARTSYS) {
> + pr_info("Exit ksuspend_mon!");
> + break;
> + }
> + } while (1);
> +
> + return 0;
> +}
> +
> +static void init_suspend_monitor_thread(void)
> +{
> + int ret;
> +
> + ksuspend_mon_tsk = kthread_create(suspend_monitor_kthread,
> + NULL, "ksuspend_mon");
> + if (IS_ERR(ksuspend_mon_tsk)) {
> + ret = PTR_ERR(ksuspend_mon_tsk);
> + ksuspend_mon_tsk = NULL;
> + pr_err("Create suspend_monitor_kthread failed! ret = %d\n",
> + ret);
> + return;
> + }
> +
> + suspend_mon_toggle = TOGGLE_NONE;
> + wake_up_process(ksuspend_mon_tsk);
> +
> +}
> +#endif
> +
> /**
> * pm_suspend_default_s2idle - Check if suspend-to-idle is the default suspend.
> *
> @@ -89,6 +222,10 @@ static void s2idle_enter(void)
> {
> trace_suspend_resume(TPS("machine_suspend"), PM_SUSPEND_TO_IDLE, true);
>
> +#ifdef CONFIG_PM_SLEEP_MONITOR
> + stop_suspend_mon();
> +#endif

Do not put #ifdef in .c files, that's not the proper kernel coding
style. Especially for single function calls.

I've stopped here...

greg k-h