Re: [PATCH] ath9k: rng: escape sleep loop when unregistering

From: Gregory Erwin
Date: Fri Jun 24 2022 - 01:25:59 EST


Hi Jason,

I think you are on the right track, but even with this patch
'ip link set wlan0 down' blocks until the hwrng reader gives up.
The reader can either be userspace (dd, cat, etc) or it can also
be the rng_core module. I can replicate the hang in the two different
situations, so I gathered two stack traces for 'ip' depending on the
reader of hwrng:

-userspace-
$ ip link set wlan0 up
$ dd if=/dev/hwrng count=1 & # blocks until interrupted or receives -EIO
$ ip link set wlan0 down & # blocks until dd exits
$ cat /proc/$(pidof ip)/stack
[<0>] devres_release+0x2b/0x60
[<0>] ath9k_rng_stop+0x27/0x40 [ath9k]
[<0>] ath9k_stop+0x40/0x230 [ath9k]
[<0>] drv_stop+0x34/0x100 [mac80211]
[<0>] ieee80211_do_stop+0x685/0x880 [mac80211]
[<0>] ieee80211_stop+0x41/0x170 [mac80211]
[<0>] __dev_close_many+0x9e/0x110
[<0>] __dev_change_flags+0x1a7/0x250
[<0>] dev_change_flags+0x26/0x60
[<0>] do_setlink+0x32d/0x1220
[<0>] __rtnl_newlink+0x5a2/0x9f0
[<0>] rtnl_newlink+0x47/0x70
[<0>] rtnetlink_rcv_msg+0x143/0x370
[<0>] netlink_rcv_skb+0x55/0x100
[<0>] netlink_unicast+0x243/0x390
[<0>] netlink_sendmsg+0x254/0x4b0
[<0>] sock_sendmsg+0x60/0x70
[<0>] ____sys_sendmsg+0x264/0x2a0
[<0>] ___sys_sendmsg+0x96/0xd0
[<0>] __sys_sendmsg+0x7a/0xd0
[<0>] do_syscall_64+0x5f/0x90
[<0>] entry_SYSCALL_64_after_hwframe+0x44/0xae

-rng_core-
$ ip link set wlan0 up
$ # wait a while, maybe a minute or two
$ ip link set wlan0 down & # blocks until 10s after 'hwrng: no data available'
$ cat /proc/$(pidof ip)/stack
[<0>] kthread_stop+0x65/0x170
[<0>] hwrng_unregister+0xbe/0xe0 [rng_core]
[<0>] devres_release+0x2b/0x60
[<0>] ath9k_rng_stop+0x27/0x40 [ath9k]
[<0>] ath9k_stop+0x40/0x230 [ath9k]
[<0>] drv_stop+0x34/0x100 [mac80211]
[<0>] ieee80211_do_stop+0x685/0x880 [mac80211]
[<0>] ieee80211_stop+0x41/0x170 [mac80211]
[<0>] __dev_close_many+0x9e/0x110
[<0>] __dev_change_flags+0x1a7/0x250
[<0>] dev_change_flags+0x26/0x60
[<0>] do_setlink+0x32d/0x1220
[<0>] __rtnl_newlink+0x5a2/0x9f0
[<0>] rtnl_newlink+0x47/0x70
[<0>] rtnetlink_rcv_msg+0x143/0x370
[<0>] netlink_rcv_skb+0x55/0x100
[<0>] netlink_unicast+0x243/0x390
[<0>] netlink_sendmsg+0x254/0x4b0
[<0>] sock_sendmsg+0x60/0x70
[<0>] ____sys_sendmsg+0x264/0x2a0
[<0>] ___sys_sendmsg+0x96/0xd0
[<0>] __sys_sendmsg+0x7a/0xd0
[<0>] do_syscall_64+0x5f/0x90
[<0>] entry_SYSCALL_64_after_hwframe+0x44/0xae

Regards,
- Greg.



On Thu, Jun 23, 2022 at 6:15 PM Jason A. Donenfeld <Jason@xxxxxxxxx> wrote:
>
> There's currently an almost deadlock when ath9k shuts down if no random
> bytes are available:
>
> Thread A Thread B
> -------------------------------------------------------------------------
> rng_dev_read
> get_current_rng
> kref_get(&current_rng->ref)
> rng_get_data
> ath9k_rng_read
> msleep_interruptible(...)
> ath9k_rng_stop
> devm_hwrng_unregister
> devm_hwrng_release
> hwrng_unregister
> drop_current_rng
> kref_put(&current_rng->ref, cleanup_rng)
> // Does NOT call cleanup_rng here,
> // because of thread A's kref_get.
> wait_for_completion(&rng->cleanup_done);
> // Waits for a really long time...
> // Eventually sleep is over...
> put_rng
> kref_put(&rng->ref, cleanup_rng);
> cleanup_rng
> complete(&rng->cleanup_done);
> return
>
> When thread B doesn't return right away, sleep and other functions that
> are waiting for ath9k_rng_stop to complete time out, and problems ensue.
>
> This commit fixes the problem by using another completion inside of
> ath9k_rng_read so that hwrng_unregister can interrupt it as needed.
>
> Reported-by: Gregory Erwin <gregerwin256@xxxxxxxxx>
> Cc: Toke Høiland-Jørgensen <toke@xxxxxxxxxx>
> Cc: Kalle Valo <kvalo@xxxxxxxxxx>
> Cc: Rui Salvaterra <rsalvaterra@xxxxxxxxx>
> Cc: stable@xxxxxxxxxxxxxxx
> Fixes: fcd09c90c3c5 ("ath9k: use hw_random API instead of directly dumping into random.c")
> Link: https://lore.kernel.org/all/CAO+Okf6ZJC5-nTE_EJUGQtd8JiCkiEHytGgDsFGTEjs0c00giw@xxxxxxxxxxxxxx/
> Link: https://bugs.archlinux.org/task/75138
> Signed-off-by: Jason A. Donenfeld <Jason@xxxxxxxxx>
> ---
> I do not have an ath9k and therefore I can't test this myself. The
> analysis above was done completely statically, with no dynamic tracing
> and just a bug report of symptoms from Gregory. So it might be totally
> wrong. Thus, this patch very much requires Gregory's testing. Please
> don't apply it until we have his `Tested-by` line.
>
> drivers/net/wireless/ath/ath9k/ath9k.h | 1 +
> drivers/net/wireless/ath/ath9k/rng.c | 26 ++++++++++++++++----------
> 2 files changed, 17 insertions(+), 10 deletions(-)
>
> diff --git a/drivers/net/wireless/ath/ath9k/ath9k.h b/drivers/net/wireless/ath/ath9k/ath9k.h
> index 3ccf8cfc6b63..731db7f70e5d 100644
> --- a/drivers/net/wireless/ath/ath9k/ath9k.h
> +++ b/drivers/net/wireless/ath/ath9k/ath9k.h
> @@ -1072,6 +1072,7 @@ struct ath_softc {
>
> #ifdef CONFIG_ATH9K_HWRNG
> struct hwrng rng_ops;
> + struct completion rng_shutdown;
> u32 rng_last;
> char rng_name[sizeof("ath9k_65535")];
> #endif
> diff --git a/drivers/net/wireless/ath/ath9k/rng.c b/drivers/net/wireless/ath/ath9k/rng.c
> index cb5414265a9b..67c6b03a22ac 100644
> --- a/drivers/net/wireless/ath/ath9k/rng.c
> +++ b/drivers/net/wireless/ath/ath9k/rng.c
> @@ -1,5 +1,6 @@
> /*
> * Copyright (c) 2015 Qualcomm Atheros, Inc.
> + * Copyright (C) 2022 Jason A. Donenfeld <Jason@xxxxxxxxx>. All Rights Reserved.
> *
> * Permission to use, copy, modify, and/or distribute this software for any
> * purpose with or without fee is hereby granted, provided that the above
> @@ -52,18 +53,13 @@ static int ath9k_rng_data_read(struct ath_softc *sc, u32 *buf, u32 buf_size)
> return j << 2;
> }
>
> -static u32 ath9k_rng_delay_get(u32 fail_stats)
> +static unsigned long ath9k_rng_delay_get(u32 fail_stats)
> {
> - u32 delay;
> -
> if (fail_stats < 100)
> - delay = 10;
> + return msecs_to_jiffies(10);
> else if (fail_stats < 105)
> - delay = 1000;
> - else
> - delay = 10000;
> -
> - return delay;
> + return msecs_to_jiffies(1000);
> + return msecs_to_jiffies(10000);
> }
>
> static int ath9k_rng_read(struct hwrng *rng, void *buf, size_t max, bool wait)
> @@ -83,7 +79,10 @@ static int ath9k_rng_read(struct hwrng *rng, void *buf, size_t max, bool wait)
> if (!wait || !max || likely(bytes_read) || fail_stats > 110)
> break;
>
> - msleep_interruptible(ath9k_rng_delay_get(++fail_stats));
> + if (wait_for_completion_interruptible_timeout(
> + &sc->rng_shutdown,
> + ath9k_rng_delay_get(++fail_stats)))
> + break;
> }
>
> if (wait && !bytes_read && max)
> @@ -91,6 +90,11 @@ static int ath9k_rng_read(struct hwrng *rng, void *buf, size_t max, bool wait)
> return bytes_read;
> }
>
> +static void ath9k_rng_cleanup(struct hwrng *rng)
> +{
> + complete(&container_of(rng, struct ath_softc, rng_ops)->rng_shutdown);
> +}
> +
> void ath9k_rng_start(struct ath_softc *sc)
> {
> static atomic_t serial = ATOMIC_INIT(0);
> @@ -104,8 +108,10 @@ void ath9k_rng_start(struct ath_softc *sc)
>
> snprintf(sc->rng_name, sizeof(sc->rng_name), "ath9k_%u",
> (atomic_inc_return(&serial) - 1) & U16_MAX);
> + init_completion(&sc->rng_shutdown);
> sc->rng_ops.name = sc->rng_name;
> sc->rng_ops.read = ath9k_rng_read;
> + sc->rng_ops.cleanup = ath9k_rng_cleanup;
> sc->rng_ops.quality = 320;
>
> if (devm_hwrng_register(sc->dev, &sc->rng_ops))
> --
> 2.35.1
>