Re: [PATCH v4 08/10] workqueue: Introduce struct wq_node_nr_active

From: Nathan Chancellor
Date: Tue Jan 30 2024 - 13:00:33 EST


Hi Tejun,

On Mon, Jan 29, 2024 at 08:14:09AM -1000, Tejun Heo wrote:
> >From 91ccc6e7233bb10a9c176aa4cc70d6f432a441a5 Mon Sep 17 00:00:00 2001
> From: Tejun Heo <tj@xxxxxxxxxx>
> Date: Mon, 29 Jan 2024 08:11:24 -1000
>
> Currently, for both percpu and unbound workqueues, max_active applies
> per-cpu, which is a recent change for unbound workqueues. The change for
> unbound workqueues was a significant departure from the previous behavior of
> per-node application. It made some use cases create undesirable number of
> concurrent work items and left no good way of fixing them. To address the
> problem, workqueue is implementing a NUMA node segmented global nr_active
> mechanism, which will be explained further in the next patch.
>
> As a preparation, this patch introduces struct wq_node_nr_active. It's a
> data structured allocated for each workqueue and NUMA node pair and
> currently only tracks the workqueue's number of active work items on the
> node. This is split out from the next patch to make it easier to understand
> and review.
>
> Note that there is an extra wq_node_nr_active allocated for the invalid node
> nr_node_ids which is used to track nr_active for pools which don't have NUMA
> node associated such as the default fallback system-wide pool.
>
> This doesn't cause any behavior changes visible to userland yet. The next
> patch will expand to implement the control mechanism on top.
>
> v4: - Fixed out-of-bound access when freeing per-cpu workqueues.
>
> v3: - Use flexible array for wq->node_nr_active as suggested by Lai.
>
> v2: - wq->max_active now uses WRITE/READ_ONCE() as suggested by Lai.
>
> - Lai pointed out that pwq_tryinc_nr_active() incorrectly dropped
> pwq->max_active check. Restored. As the next patch replaces the
> max_active enforcement mechanism, this doesn't change the end result.
>
> Signed-off-by: Tejun Heo <tj@xxxxxxxxxx>
> Reviewed-by: Lai Jiangshan <jiangshanlai@xxxxxxxxx>
> ---
> kernel/workqueue.c | 142 ++++++++++++++++++++++++++++++++++++++++++---
> 1 file changed, 135 insertions(+), 7 deletions(-)
>
> diff --git a/kernel/workqueue.c b/kernel/workqueue.c
> index b5aba0e5a699..8d465478adb9 100644
> --- a/kernel/workqueue.c
> +++ b/kernel/workqueue.c
> @@ -284,6 +284,16 @@ struct wq_flusher {
>
> struct wq_device;
>
> +/*
> + * Unlike in a per-cpu workqueue where max_active limits its concurrency level
> + * on each CPU, in an unbound workqueue, max_active applies to the whole system.
> + * As sharing a single nr_active across multiple sockets can be very expensive,
> + * the counting and enforcement is per NUMA node.
> + */
> +struct wq_node_nr_active {
> + atomic_t nr; /* per-node nr_active count */
> +};
> +
> /*
> * The externally visible workqueue. It relays the issued work items to
> * the appropriate worker_pool through its pool_workqueues.
> @@ -330,6 +340,7 @@ struct workqueue_struct {
> /* hot fields used during command issue, aligned to cacheline */
> unsigned int flags ____cacheline_aligned; /* WQ: WQ_* flags */
> struct pool_workqueue __percpu __rcu **cpu_pwq; /* I: per-cpu pwqs */
> + struct wq_node_nr_active *node_nr_active[]; /* I: per-node nr_active */
> };
>
> static struct kmem_cache *pwq_cache;
> @@ -1425,6 +1436,31 @@ work_func_t wq_worker_last_func(struct task_struct *task)
> return worker->last_func;
> }
>
> +/**
> + * wq_node_nr_active - Determine wq_node_nr_active to use
> + * @wq: workqueue of interest
> + * @node: NUMA node, can be %NUMA_NO_NODE
> + *
> + * Determine wq_node_nr_active to use for @wq on @node. Returns:
> + *
> + * - %NULL for per-cpu workqueues as they don't need to use shared nr_active.
> + *
> + * - node_nr_active[nr_node_ids] if @node is %NUMA_NO_NODE.
> + *
> + * - Otherwise, node_nr_active[@node].
> + */
> +static struct wq_node_nr_active *wq_node_nr_active(struct workqueue_struct *wq,
> + int node)
> +{
> + if (!(wq->flags & WQ_UNBOUND))
> + return NULL;
> +
> + if (node == NUMA_NO_NODE)
> + node = nr_node_ids;
> +
> + return wq->node_nr_active[node];
> +}
> +
> /**
> * get_pwq - get an extra reference on the specified pool_workqueue
> * @pwq: pool_workqueue to get
> @@ -1506,12 +1542,17 @@ static bool pwq_activate_work(struct pool_workqueue *pwq,
> struct work_struct *work)
> {
> struct worker_pool *pool = pwq->pool;
> + struct wq_node_nr_active *nna;
>
> lockdep_assert_held(&pool->lock);
>
> if (!(*work_data_bits(work) & WORK_STRUCT_INACTIVE))
> return false;
>
> + nna = wq_node_nr_active(pwq->wq, pool->node);
> + if (nna)
> + atomic_inc(&nna->nr);
> +
> pwq->nr_active++;
> __pwq_activate_work(pwq, work);
> return true;
> @@ -1528,14 +1569,18 @@ static bool pwq_tryinc_nr_active(struct pool_workqueue *pwq)
> {
> struct workqueue_struct *wq = pwq->wq;
> struct worker_pool *pool = pwq->pool;
> + struct wq_node_nr_active *nna = wq_node_nr_active(wq, pool->node);
> bool obtained;
>
> lockdep_assert_held(&pool->lock);
>
> obtained = pwq->nr_active < READ_ONCE(wq->max_active);
>
> - if (obtained)
> + if (obtained) {
> pwq->nr_active++;
> + if (nna)
> + atomic_inc(&nna->nr);
> + }
> return obtained;
> }
>
> @@ -1572,10 +1617,26 @@ static bool pwq_activate_first_inactive(struct pool_workqueue *pwq)
> static void pwq_dec_nr_active(struct pool_workqueue *pwq)
> {
> struct worker_pool *pool = pwq->pool;
> + struct wq_node_nr_active *nna = wq_node_nr_active(pwq->wq, pool->node);
>
> lockdep_assert_held(&pool->lock);
>
> + /*
> + * @pwq->nr_active should be decremented for both percpu and unbound
> + * workqueues.
> + */
> pwq->nr_active--;
> +
> + /*
> + * For a percpu workqueue, it's simple. Just need to kick the first
> + * inactive work item on @pwq itself.
> + */
> + if (!nna) {
> + pwq_activate_first_inactive(pwq);
> + return;
> + }
> +
> + atomic_dec(&nna->nr);
> pwq_activate_first_inactive(pwq);
> }
>
> @@ -4039,11 +4100,63 @@ static void wq_free_lockdep(struct workqueue_struct *wq)
> }
> #endif
>
> +static void free_node_nr_active(struct wq_node_nr_active **nna_ar)
> +{
> + int node;
> +
> + for_each_node(node) {
> + kfree(nna_ar[node]);
> + nna_ar[node] = NULL;
> + }
> +
> + kfree(nna_ar[nr_node_ids]);
> + nna_ar[nr_node_ids] = NULL;
> +}
> +
> +static void init_node_nr_active(struct wq_node_nr_active *nna)
> +{
> + atomic_set(&nna->nr, 0);
> +}
> +
> +/*
> + * Each node's nr_active counter will be accessed mostly from its own node and
> + * should be allocated in the node.
> + */
> +static int alloc_node_nr_active(struct wq_node_nr_active **nna_ar)
> +{
> + struct wq_node_nr_active *nna;
> + int node;
> +
> + for_each_node(node) {
> + nna = kzalloc_node(sizeof(*nna), GFP_KERNEL, node);
> + if (!nna)
> + goto err_free;
> + init_node_nr_active(nna);
> + nna_ar[node] = nna;
> + }
> +
> + /* [nr_node_ids] is used as the fallback */
> + nna = kzalloc_node(sizeof(*nna), GFP_KERNEL, NUMA_NO_NODE);
> + if (!nna)
> + goto err_free;
> + init_node_nr_active(nna);
> + nna_ar[nr_node_ids] = nna;
> +
> + return 0;
> +
> +err_free:
> + free_node_nr_active(nna_ar);
> + return -ENOMEM;
> +}
> +
> static void rcu_free_wq(struct rcu_head *rcu)
> {
> struct workqueue_struct *wq =
> container_of(rcu, struct workqueue_struct, rcu);
>
> + if (wq->flags & WQ_UNBOUND)
> + free_node_nr_active(wq->node_nr_active);
> +
> wq_free_lockdep(wq);
> free_percpu(wq->cpu_pwq);
> free_workqueue_attrs(wq->unbound_attrs);
> @@ -4785,7 +4898,8 @@ struct workqueue_struct *alloc_workqueue(const char *fmt,
> {
> va_list args;
> struct workqueue_struct *wq;
> - int len;
> + size_t wq_size;
> + int name_len;
>
> /*
> * Unbound && max_active == 1 used to imply ordered, which is no longer
> @@ -4801,7 +4915,12 @@ struct workqueue_struct *alloc_workqueue(const char *fmt,
> flags |= WQ_UNBOUND;
>
> /* allocate wq and format name */
> - wq = kzalloc(sizeof(*wq), GFP_KERNEL);
> + if (flags & WQ_UNBOUND)
> + wq_size = struct_size(wq, node_nr_active, nr_node_ids + 1);
> + else
> + wq_size = sizeof(*wq);
> +
> + wq = kzalloc(wq_size, GFP_KERNEL);
> if (!wq)
> return NULL;
>
> @@ -4812,11 +4931,12 @@ struct workqueue_struct *alloc_workqueue(const char *fmt,
> }
>
> va_start(args, max_active);
> - len = vsnprintf(wq->name, sizeof(wq->name), fmt, args);
> + name_len = vsnprintf(wq->name, sizeof(wq->name), fmt, args);
> va_end(args);
>
> - if (len >= WQ_NAME_LEN)
> - pr_warn_once("workqueue: name exceeds WQ_NAME_LEN. Truncating to: %s\n", wq->name);
> + if (name_len >= WQ_NAME_LEN)
> + pr_warn_once("workqueue: name exceeds WQ_NAME_LEN. Truncating to: %s\n",
> + wq->name);
>
> max_active = max_active ?: WQ_DFL_ACTIVE;
> max_active = wq_clamp_max_active(max_active, flags, wq->name);
> @@ -4835,8 +4955,13 @@ struct workqueue_struct *alloc_workqueue(const char *fmt,
> wq_init_lockdep(wq);
> INIT_LIST_HEAD(&wq->list);
>
> + if (flags & WQ_UNBOUND) {
> + if (alloc_node_nr_active(wq->node_nr_active) < 0)
> + goto err_unreg_lockdep;
> + }
> +
> if (alloc_and_link_pwqs(wq) < 0)
> - goto err_unreg_lockdep;
> + goto err_free_node_nr_active;
>
> if (wq_online && init_rescuer(wq) < 0)
> goto err_destroy;
> @@ -4861,6 +4986,9 @@ struct workqueue_struct *alloc_workqueue(const char *fmt,
>
> return wq;
>
> +err_free_node_nr_active:
> + if (wq->flags & WQ_UNBOUND)
> + free_node_nr_active(wq->node_nr_active);
> err_unreg_lockdep:
> wq_unregister_lockdep(wq);
> wq_free_lockdep(wq);
> --
> 2.43.0
>

I just bisected a crash that I see when booting several different architectures
in QEMU in -next to this change as commit 5797b1c18919 ("workqueue: Implement
system-wide nr_active enforcement for unbound workqueues"). For example, with
arm64 virtconfig (I also see it with ARCH=arm multi_v7_defconfig and
ARCH=riscv defconfig):

$ make -skj"$(nproc)" ARCH=arm64 CROSS_COMPILE=aarch64-linux- virtconfig Image.gz

$ qemu-system-aarch64 \
-display none \
-nodefaults \
-cpu max,pauth-impdef=true \
-machine virt,gic-version=max,virtualization=true \
-append 'console=ttyAMA0 earlycon' \
-kernel arch/arm64/boot/Image.gz \
-initrd rootfs.cpio \
-m 512m \
-serial mon:stdio
[ 0.000000] Booting Linux on physical CPU 0x0000000000 [0x000f0510]
[ 0.000000] Linux version 6.7.0-09946-g5797b1c18919 (nathan@dev-arch.thelio-3990X) (aarch64-linux-gcc (GCC) 13.2.0, GNU ld (GNU Binutils) 2.41) #1 SMP PREEMPT Tue Jan 30 10:10:57 MST 2024
..
[ 0.000000] Unable to handle kernel paging request at virtual address ffff000021c0b380
[ 0.000000] Mem abort info:
[ 0.000000] ESR = 0x0000000096000006
[ 0.000000] EC = 0x25: DABT (current EL), IL = 32 bits
[ 0.000000] SET = 0, FnV = 0
[ 0.000000] EA = 0, S1PTW = 0
[ 0.000000] FSC = 0x06: level 2 translation fault
[ 0.000000] Data abort info:
[ 0.000000] ISV = 0, ISS = 0x00000006, ISS2 = 0x00000000
[ 0.000000] CM = 0, WnR = 0, TnD = 0, TagAccess = 0
[ 0.000000] GCS = 0, Overlay = 0, DirtyBit = 0, Xs = 0
[ 0.000000] swapper pgtable: 4k pages, 48-bit VAs, pgdp=00000000413b1000
[ 0.000000] [ffff000021c0b380] pgd=180000005fff7003, p4d=180000005fff7003, pud=180000005fff6003, pmd=0000000000000000
[ 0.000000] Internal error: Oops: 0000000096000006 [#1] PREEMPT SMP
[ 0.000000] Modules linked in:
[ 0.000000] CPU: 0 PID: 0 Comm: swapper/0 Not tainted 6.7.0-09946-g5797b1c18919 #1
[ 0.000000] Hardware name: linux,dummy-virt (DT)
[ 0.000000] pstate: 600000c9 (nZCv daIF -PAN -UAO -TCO -DIT -SSBS BTYPE=--)
[ 0.000000] pc : wq_update_node_max_active+0x48/0x1d8
[ 0.000000] lr : apply_wqattrs_commit+0x160/0x180
[ 0.000000] sp : ffffda8d476b3be0
[ 0.000000] x29: ffffda8d476b3be0 x28: ffff000001c0d600 x27: 0000000000000000
[ 0.000000] x26: ffff000001c0d6c0 x25: 0000000000000001 x24: 0000000000000200
[ 0.000000] x23: 00000000ffffffff x22: ffffda8d476b9c40 x21: 0000000000000008
[ 0.000000] x20: ffffda8d476b9a40 x19: ffff000001c0b360 x18: ffff00001feebed0
[ 0.000000] x17: 0000000000c65c70 x16: ffff00001feebb28 x15: fffffc0000070488
[ 0.000000] x14: 0000000000000000 x13: 0000000000000000 x12: ffff00001feebb28
[ 0.000000] x11: 0000000000000001 x10: ffff000001c0b388 x9 : 0000000000000000
[ 0.000000] x8 : 0000000000000000 x7 : 0000000000000000 x6 : ffff000001c0d600
[ 0.000000] x5 : ffff000001c0d600 x4 : ffff000001c0e880 x3 : ffff000001c0d600
[ 0.000000] x2 : ffff000001c0b388 x1 : ffffda8d476b9000 x0 : 0000000003ffffff
[ 0.000000] Call trace:
[ 0.000000] wq_update_node_max_active+0x48/0x1d8
[ 0.000000] apply_wqattrs_commit+0x160/0x180
[ 0.000000] apply_workqueue_attrs_locked+0x50/0x84
[ 0.000000] alloc_workqueue+0x588/0x6c8
[ 0.000000] workqueue_init_early+0x480/0x554
[ 0.000000] start_kernel+0x240/0x5e8
[ 0.000000] __primary_switched+0xb8/0xc0
[ 0.000000] Code: f9418033 d000a081 9100a262 f90037e2 (f8607840)
[ 0.000000] ---[ end trace 0000000000000000 ]---
[ 0.000000] Kernel panic - not syncing: Attempted to kill the idle task!
[ 0.000000] ---[ end Kernel panic - not syncing: Attempted to kill the idle task! ]---

If there is any more information I can provide or patches I can test, I
am more than happy to do so.

Cheers,
Nathan

# bad: [41d66f96d0f15a0a2ad6fa2208f6bac1a66cbd52] Add linux-next specific files for 20240130
# good: [41bccc98fb7931d63d03f326a746ac4d429c1dd3] Linux 6.8-rc2
git bisect start '41d66f96d0f15a0a2ad6fa2208f6bac1a66cbd52' '41bccc98fb7931d63d03f326a746ac4d429c1dd3'
# good: [f3f89885646036e16b325aea597fc9f375f1a56a] Merge branch 'main' of git://git.kernel.org/pub/scm/linux/kernel/git/netdev/net-next.git
git bisect good f3f89885646036e16b325aea597fc9f375f1a56a
# good: [8042d32dd6c3730b0b4c8c9c811e204ed9f5f829] Merge branch 'master' of git://git.kernel.org/pub/scm/linux/kernel/git/tip/tip.git
git bisect good 8042d32dd6c3730b0b4c8c9c811e204ed9f5f829
# bad: [f0edba72fe5ae932877f49796aaef8adf1a2eb8c] Merge branch 'togreg' of git://git.kernel.org/pub/scm/linux/kernel/git/jic23/iio.git
git bisect bad f0edba72fe5ae932877f49796aaef8adf1a2eb8c
# bad: [549632942a27cc3987473f8a8629200bc2ab0734] Merge branch 'next' of git://git.kernel.org/pub/scm/linux/kernel/git/westeri/thunderbolt.git
git bisect bad 549632942a27cc3987473f8a8629200bc2ab0734
# good: [9eb48e8d465d67362dac65963979e65cb2ad7634] Merge branch 'for-next' of git://git.kernel.org/pub/scm/linux/kernel/git/dennis/percpu.git
git bisect good 9eb48e8d465d67362dac65963979e65cb2ad7634
# good: [bc83a87759cabbf6f3366568e44bda088b315204] dt-bindings: usb: dwc3: Add snps,host-vbus-glitches-quirk avoid vbus glitch
git bisect good bc83a87759cabbf6f3366568e44bda088b315204
# bad: [2c0ea2f8eb6140767fae8d01a30ce45dcf4ead85] Merge branch 'for-next' of git://git.kernel.org/pub/scm/linux/kernel/git/tj/wq.git
git bisect bad 2c0ea2f8eb6140767fae8d01a30ce45dcf4ead85
# good: [a045a272d887575da17ad86d6573e82871b50c27] workqueue: Move pwq->max_active to wq->max_active
git bisect good a045a272d887575da17ad86d6573e82871b50c27
# good: [9f66cff212bb3c1cd25996aaa0dfd0c9e9d8baab] workqueue: RCU protect wq->dfl_pwq and implement accessors for it
git bisect good 9f66cff212bb3c1cd25996aaa0dfd0c9e9d8baab
# good: [91ccc6e7233bb10a9c176aa4cc70d6f432a441a5] workqueue: Introduce struct wq_node_nr_active
git bisect good 91ccc6e7233bb10a9c176aa4cc70d6f432a441a5
# bad: [07daa99b7fd7adfffa22180184e39ec124e73013] tools/workqueue/wq_dump.py: Add node_nr/max_active dump
git bisect bad 07daa99b7fd7adfffa22180184e39ec124e73013
# bad: [5797b1c18919cd9c289ded7954383e499f729ce0] workqueue: Implement system-wide nr_active enforcement for unbound workqueues
git bisect bad 5797b1c18919cd9c289ded7954383e499f729ce0
# first bad commit: [5797b1c18919cd9c289ded7954383e499f729ce0] workqueue: Implement system-wide nr_active enforcement for unbound workqueues