[PATCH v3 2/5] workqueue: Add workqueue_unbound_exclude_cpumask() to exclude CPUs from wq_unbound_cpumask

From: Waiman Long
Date: Wed Nov 15 2023 - 12:04:26 EST


When the "isolcpus" boot command line option is used to add a set
of isolated CPUs, those CPUs will be excluded automatically from
wq_unbound_cpumask to avoid running work functions from unbound
workqueues.

Recently cpuset has been extended to allow the creation of partitions
of isolated CPUs dynamically. To make it closer to the "isolcpus"
in functionality, the CPUs in those isolated cpuset partitions should be
excluded from wq_unbound_cpumask as well. This can be done currently by
explicitly writing to the workqueue's cpumask sysfs file after creating
the isolated partitions. However, this process can be error prone.

Ideally, the cpuset code should be allowed to request the workqueue code
to exclude those isolated CPUs from wq_unbound_cpumask so that this
operation can be done automatically and the isolated CPUs will be returned
back to wq_unbound_cpumask after the destructions of the isolated
cpuset partitions.

This patch adds a new workqueue_unbound_exclude_cpumask() function to
enable that. This new function will exclude the specified isolated
CPUs from wq_unbound_cpumask. To be able to restore those isolated
CPUs back after the destruction of isolated cpuset partitions, a new
wq_requested_unbound_cpumask is added to store the user provided unbound
cpumask either from the boot command line options or from writing to
the cpumask sysfs file. This new cpumask provides the basis for CPU
exclusion.

To enable users to understand how the wq_unbound_cpumask is being
modified internally, this patch also exposes the newly introduced
wq_requested_unbound_cpumask as well as a wq_isolated_cpumask to
store the cpumask to be excluded from wq_unbound_cpumask as read-only
sysfs files.

Signed-off-by: Waiman Long <longman@xxxxxxxxxx>
---
include/linux/workqueue.h | 1 +
kernel/workqueue.c | 91 +++++++++++++++++++++++++++++++++++----
2 files changed, 84 insertions(+), 8 deletions(-)

diff --git a/include/linux/workqueue.h b/include/linux/workqueue.h
index cf49b467bd57..b0b9604b76b8 100644
--- a/include/linux/workqueue.h
+++ b/include/linux/workqueue.h
@@ -491,6 +491,7 @@ struct workqueue_attrs *alloc_workqueue_attrs(void);
void free_workqueue_attrs(struct workqueue_attrs *attrs);
int apply_workqueue_attrs(struct workqueue_struct *wq,
const struct workqueue_attrs *attrs);
+extern int workqueue_unbound_exclude_cpumask(cpumask_var_t cpumask);

extern bool queue_work_on(int cpu, struct workqueue_struct *wq,
struct work_struct *work);
diff --git a/kernel/workqueue.c b/kernel/workqueue.c
index 8996cdba8f07..367a93d4a5e4 100644
--- a/kernel/workqueue.c
+++ b/kernel/workqueue.c
@@ -381,6 +381,12 @@ static bool workqueue_freezing; /* PL: have wqs started freezing? */
/* PL&A: allowable cpus for unbound wqs and work items */
static cpumask_var_t wq_unbound_cpumask;

+/* PL: user requested unbound cpumask via sysfs */
+static cpumask_var_t wq_requested_unbound_cpumask;
+
+/* PL: isolated cpumask to be excluded from unbound cpumask */
+static cpumask_var_t wq_isolated_cpumask;
+
/* for further constrain wq_unbound_cpumask by cmdline parameter*/
static struct cpumask wq_cmdline_cpumask __initdata;

@@ -5784,7 +5790,7 @@ void thaw_workqueues(void)
}
#endif /* CONFIG_FREEZER */

-static int __maybe_unused workqueue_apply_unbound_cpumask(const cpumask_var_t unbound_cpumask)
+static int workqueue_apply_unbound_cpumask(const cpumask_var_t unbound_cpumask)
{
LIST_HEAD(ctxs);
int ret = 0;
@@ -5827,6 +5833,44 @@ static int __maybe_unused workqueue_apply_unbound_cpumask(const cpumask_var_t un
return ret;
}

+/**
+ * workqueue_unbound_exclude_cpumask - Exclude given CPUs from unbound cpumask
+ * @exclude_cpumask: the cpumask to be excluded from wq_unbound_cpumask
+ *
+ * This function can be called from cpuset code to provide a set of isolated
+ * CPUs that should be excluded from wq_unbound_cpumask. The caller must hold
+ * either cpus_read_lock or cpus_write_lock.
+ */
+int workqueue_unbound_exclude_cpumask(cpumask_var_t exclude_cpumask)
+{
+ cpumask_var_t cpumask;
+ int ret = 0;
+
+ if (!zalloc_cpumask_var(&cpumask, GFP_KERNEL))
+ return -ENOMEM;
+
+ lockdep_assert_cpus_held();
+ mutex_lock(&wq_pool_mutex);
+
+ /* Save the current isolated cpumask & export it via sysfs */
+ cpumask_copy(wq_isolated_cpumask, exclude_cpumask);
+
+ /*
+ * If the operation fails, it will fall back to
+ * wq_requested_unbound_cpumask which is initially set to
+ * (HK_TYPE_WQ ∩ HK_TYPE_DOMAIN) house keeping mask and rewritten
+ * by any subsequent write to workqueue/cpumask sysfs file.
+ */
+ if (!cpumask_andnot(cpumask, wq_requested_unbound_cpumask, exclude_cpumask))
+ cpumask_copy(cpumask, wq_requested_unbound_cpumask);
+ if (!cpumask_equal(cpumask, wq_unbound_cpumask))
+ ret = workqueue_apply_unbound_cpumask(cpumask);
+
+ mutex_unlock(&wq_pool_mutex);
+ free_cpumask_var(cpumask);
+ return ret;
+}
+
static int parse_affn_scope(const char *val)
{
int i;
@@ -6144,6 +6188,7 @@ static int workqueue_set_unbound_cpumask(cpumask_var_t cpumask)
cpumask_and(cpumask, cpumask, cpu_possible_mask);
if (!cpumask_empty(cpumask)) {
apply_wqattrs_lock();
+ cpumask_copy(wq_requested_unbound_cpumask, cpumask);
if (cpumask_equal(cpumask, wq_unbound_cpumask)) {
ret = 0;
goto out_unlock;
@@ -6158,19 +6203,36 @@ static int workqueue_set_unbound_cpumask(cpumask_var_t cpumask)
return ret;
}

-static ssize_t wq_unbound_cpumask_show(struct device *dev,
- struct device_attribute *attr, char *buf)
+static ssize_t __wq_cpumask_show(struct device *dev,
+ struct device_attribute *attr, char *buf, cpumask_var_t mask)
{
int written;

mutex_lock(&wq_pool_mutex);
- written = scnprintf(buf, PAGE_SIZE, "%*pb\n",
- cpumask_pr_args(wq_unbound_cpumask));
+ written = scnprintf(buf, PAGE_SIZE, "%*pb\n", cpumask_pr_args(mask));
mutex_unlock(&wq_pool_mutex);

return written;
}

+static ssize_t wq_unbound_cpumask_show(struct device *dev,
+ struct device_attribute *attr, char *buf)
+{
+ return __wq_cpumask_show(dev, attr, buf, wq_unbound_cpumask);
+}
+
+static ssize_t wq_requested_cpumask_show(struct device *dev,
+ struct device_attribute *attr, char *buf)
+{
+ return __wq_cpumask_show(dev, attr, buf, wq_requested_unbound_cpumask);
+}
+
+static ssize_t wq_isolated_cpumask_show(struct device *dev,
+ struct device_attribute *attr, char *buf)
+{
+ return __wq_cpumask_show(dev, attr, buf, wq_isolated_cpumask);
+}
+
static ssize_t wq_unbound_cpumask_store(struct device *dev,
struct device_attribute *attr, const char *buf, size_t count)
{
@@ -6188,9 +6250,13 @@ static ssize_t wq_unbound_cpumask_store(struct device *dev,
return ret ? ret : count;
}

-static struct device_attribute wq_sysfs_cpumask_attr =
+static struct device_attribute wq_sysfs_cpumask_attrs[] = {
__ATTR(cpumask, 0644, wq_unbound_cpumask_show,
- wq_unbound_cpumask_store);
+ wq_unbound_cpumask_store),
+ __ATTR(cpumask_requested, 0444, wq_requested_cpumask_show, NULL),
+ __ATTR(cpumask_isolated, 0444, wq_isolated_cpumask_show, NULL),
+ __ATTR_NULL,
+};

static int __init wq_sysfs_init(void)
{
@@ -6203,7 +6269,13 @@ static int __init wq_sysfs_init(void)

dev_root = bus_get_dev_root(&wq_subsys);
if (dev_root) {
- err = device_create_file(dev_root, &wq_sysfs_cpumask_attr);
+ struct device_attribute *attr;
+
+ for (attr = wq_sysfs_cpumask_attrs; attr->attr.name; attr++) {
+ err = device_create_file(dev_root, attr);
+ if (err)
+ break;
+ }
put_device(dev_root);
}
return err;
@@ -6534,11 +6606,14 @@ void __init workqueue_init_early(void)
BUILD_BUG_ON(__alignof__(struct pool_workqueue) < __alignof__(long long));

BUG_ON(!alloc_cpumask_var(&wq_unbound_cpumask, GFP_KERNEL));
+ BUG_ON(!alloc_cpumask_var(&wq_requested_unbound_cpumask, GFP_KERNEL));
+ BUG_ON(!zalloc_cpumask_var(&wq_isolated_cpumask, GFP_KERNEL));
cpumask_copy(wq_unbound_cpumask, housekeeping_cpumask(HK_TYPE_WQ));
cpumask_and(wq_unbound_cpumask, wq_unbound_cpumask, housekeeping_cpumask(HK_TYPE_DOMAIN));

if (!cpumask_empty(&wq_cmdline_cpumask))
cpumask_and(wq_unbound_cpumask, wq_unbound_cpumask, &wq_cmdline_cpumask);
+ cpumask_copy(wq_requested_unbound_cpumask, wq_unbound_cpumask);

pwq_cache = KMEM_CACHE(pool_workqueue, SLAB_PANIC);

--
2.39.3