Re: [PATCH] watchdog: Prefer use "ref-cycles" for NMI watchdog

From: Song Liu
Date: Fri May 12 2023 - 12:44:10 EST


On Fri, May 12, 2023 at 5:47 AM Peter Zijlstra <peterz@xxxxxxxxxxxxx> wrote:
>
> On Tue, May 09, 2023 at 03:17:00PM -0700, Song Liu wrote:
> > NMI watchdog permanently consumes one hardware counters per CPU on the
> > system. For systems that use many hardware counters, this causes more
> > aggressive time multiplexing of perf events.
> >
> > OTOH, some CPUs (mostly Intel) support "ref-cycles" event, which is rarely
> > used. Try use "ref-cycles" for the watchdog. If the CPU supports it, so
> > that one more hardware counter is available to the user. If the CPU doesn't
> > support "ref-cycles", fall back to "cycles".
> >
> > The downside of this change is that users of "ref-cycles" need to disable
> > nmi_watchdog.
>
> Urgh..
>
> how about something like so instead; then you can use whatever event you
> like...

Configuring this at boot time is not ideal for our use case. Currently, we have
some systems support ref-cycles and some don't. So this is one more kernel
argument we need to make sure to get correctly. This also means we cannot
change this setting without reboot.

Another idea I have is to use sysctl kernel.nmi_watchdog, so we can change
the event after boot. Would this work?

Btw, the limitation here (ref-cycles users need to disable NMI watchdog) comes
from the limitation that the programmable counters cannot do ref-cycles. Is this
something we may change (or already changed)?

Thanks,
Song

>
> ---
> include/linux/nmi.h | 2 ++
> kernel/watchdog.c | 45 ++++++++++++++++++++++++++++++++++++---------
> kernel/watchdog_hld.c | 4 ++--
> 3 files changed, 40 insertions(+), 11 deletions(-)
>
> diff --git a/include/linux/nmi.h b/include/linux/nmi.h
> index 048c0b9aa623..8b6307837346 100644
> --- a/include/linux/nmi.h
> +++ b/include/linux/nmi.h
> @@ -19,6 +19,8 @@ bool is_hardlockup(void);
>
> extern int watchdog_user_enabled;
> extern int nmi_watchdog_user_enabled;
> +extern int nmi_watchdog_type;
> +extern u64 nmi_watchdog_config;
> extern int soft_watchdog_user_enabled;
> extern int watchdog_thresh;
> extern unsigned long watchdog_enabled;
> diff --git a/kernel/watchdog.c b/kernel/watchdog.c
> index 8e61f21e7e33..b3c09e0f96a3 100644
> --- a/kernel/watchdog.c
> +++ b/kernel/watchdog.c
> @@ -40,6 +40,8 @@ static DEFINE_MUTEX(watchdog_mutex);
> unsigned long __read_mostly watchdog_enabled;
> int __read_mostly watchdog_user_enabled = 1;
> int __read_mostly nmi_watchdog_user_enabled = NMI_WATCHDOG_DEFAULT;
> +int __ro_after_init nmi_watchdog_type = PERF_TYPE_HARDWARE;
> +u64 __ro_after_init nmi_watchdog_config = PERF_COUNT_HW_CPU_CYCLES;
> int __read_mostly soft_watchdog_user_enabled = 1;
> int __read_mostly watchdog_thresh = 10;
> static int __read_mostly nmi_watchdog_available;
> @@ -73,15 +75,40 @@ void __init hardlockup_detector_disable(void)
>
> static int __init hardlockup_panic_setup(char *str)
> {
> - if (!strncmp(str, "panic", 5))
> - hardlockup_panic = 1;
> - else if (!strncmp(str, "nopanic", 7))
> - hardlockup_panic = 0;
> - else if (!strncmp(str, "0", 1))
> - nmi_watchdog_user_enabled = 0;
> - else if (!strncmp(str, "1", 1))
> - nmi_watchdog_user_enabled = 1;
> - return 1;
> + int ret = 1;
> +
> + if (!str)
> + return -EINVAL;
> +
> + while (str) {
> + char *next = strchr(str, ',');
> + if (next) {
> + *next = 0;
> + next++;
> + }
> +
> + if (!strcmp(str, "panic"))
> + hardlockup_panic = 1;
> + else if (!strcmp(str, "nopanic"))
> + hardlockup_panic = 0;
> + else if (!strcmp(str, "0"))
> + nmi_watchdog_user_enabled = 0;
> + else if (!strcmp(str, "1"))
> + nmi_watchdog_user_enabled = 1;
> + else if (str[0] == 'r') {
> + str++;
> + ret = kstrtou64(str, 16, &nmi_watchdog_config);
> + if (ret)
> + break;
> + nmi_watchdog_type = PERF_TYPE_RAW;
> + nmi_watchdog_user_enabled = 1;
> + }
> +
> + str = next;
> + }
> +
> + return ret;
> +
> }
> __setup("nmi_watchdog=", hardlockup_panic_setup);
>
> diff --git a/kernel/watchdog_hld.c b/kernel/watchdog_hld.c
> index 247bf0b1582c..27bc15f9a92a 100644
> --- a/kernel/watchdog_hld.c
> +++ b/kernel/watchdog_hld.c
> @@ -99,8 +99,6 @@ static inline bool watchdog_check_timestamp(void)
> #endif
>
> static struct perf_event_attr wd_hw_attr = {
> - .type = PERF_TYPE_HARDWARE,
> - .config = PERF_COUNT_HW_CPU_CYCLES,
> .size = sizeof(struct perf_event_attr),
> .pinned = 1,
> .disabled = 1,
> @@ -170,6 +168,8 @@ static int hardlockup_detector_event_create(void)
> struct perf_event *evt;
>
> wd_attr = &wd_hw_attr;
> + wd_attr->type = nmi_watchdog_type;
> + wd_attr->config = nmi_watchdog_config;
> wd_attr->sample_period = hw_nmi_get_sample_period(watchdog_thresh);
>
> /* Try to register using hardware perf events */