[PATCH 3/3] workqueue: Add anon workqueue sysfs hierarchy

From: Frederic Weisbecker
Date: Fri Mar 14 2014 - 12:39:47 EST


We call "anon workqueues" the set of unbound workqueues that don't
carry the WQ_SYSFS flag.

They are a problem nowadays because people who work on CPU isolation
(HPC, real time, etc...) want to be able to migrate all the unbound
workqueues away to a single CPU. This control is possible through sysfs
but only with WQ_SYSFS workqueues.

Now we need to deal with the other unbound workqueues. There are two
possible solutions:

1) Implement a sysfs directory for each unbound !WQ_SYSFS workqueue. This
could be done with a specific Kconfig to make sure that these workqueues
won't be considered as a stable ABI. But we all know that all distros
will enable this Kconfig symbol and that a warning in the Kconfig help
text won't protect against anything.

2) Implement a single sysfs directory containing only the cpumask file
to control the affinity of all the !WQ_SYSFS workqueues.

This patch implements the second solution.

Two issues I have seen though:

* This triggers the following warning in apply_workqueue_attrs():

/* creating multiple pwqs breaks ordering guarantee */
if (WARN_ON((wq->flags & __WQ_ORDERED) && !list_empty(&wq->pwqs)))
return -EINVAL;

I haven't yet checked into the details.

* wq_calc_node_cpumask() says that if NUMA affinity is not enabled,
cpumask is always used. Which suggests that if NUMA affinity is enabled
the cpumask may be ignored?

Cc: Christoph Lameter <cl@xxxxxxxxx>
Cc: Kevin Hilman <khilman@xxxxxxxxxx>
Cc: Mike Galbraith <bitbucket@xxxxxxxxx>
Cc: Paul E. McKenney <paulmck@xxxxxxxxxxxxxxxxxx>
Cc: Tejun Heo <tj@xxxxxxxxxx>
Cc: Viresh Kumar <viresh.kumar@xxxxxxxxxx>
Not-Yet-Signed-off-by: Frederic Weisbecker <fweisbec@xxxxxxxxx>
---
kernel/workqueue.c | 114 ++++++++++++++++++++++++++++++++++++++++++++++++++++-
1 file changed, 112 insertions(+), 2 deletions(-)

diff --git a/kernel/workqueue.c b/kernel/workqueue.c
index ad8f727..aabee1f 100644
--- a/kernel/workqueue.c
+++ b/kernel/workqueue.c
@@ -289,7 +289,8 @@ static bool wq_numa_enabled; /* unbound NUMA affinity enabled */
static struct workqueue_attrs *wq_update_unbound_numa_attrs_buf;

static DEFINE_MUTEX(wq_pool_mutex); /* protects pools and workqueues list */
-static DEFINE_MUTEX(wq_unbound_mutex); /* protects list of unbound workqueues */
+/* protects list of unbound workqueues and wq_anon_cpumask */
+static DEFINE_MUTEX(wq_unbound_mutex);
static DEFINE_SPINLOCK(wq_mayday_lock); /* protects wq->maydays list */

static LIST_HEAD(workqueues); /* PL: list of all workqueues */
@@ -3311,13 +3312,122 @@ static struct device_attribute wq_sysfs_unbound_attrs[] = {
__ATTR_NULL,
};

+/*
+ * Last cpumask successfully applied to the anon workqueues.
+ * Protected by wq_unbound_mutex.
+ */
+static cpumask_t wq_anon_cpumask;
+
+/*
+ * sysfs read handler for the anon workqueues cpumask: formats
+ * wq_anon_cpumask into @buf, appends a newline and returns the number of
+ * bytes written.
+ */
+static ssize_t wq_anon_cpumask_show(struct device *dev,
+				    struct device_attribute *attr, char *buf)
+{
+	int len;
+
+	/* Read the mask under the mutex that serializes its updates */
+	mutex_lock(&wq_unbound_mutex);
+	len = cpumask_scnprintf(buf, PAGE_SIZE, &wq_anon_cpumask);
+	mutex_unlock(&wq_unbound_mutex);
+
+	len += scnprintf(buf + len, PAGE_SIZE - len, "\n");
+	return len;
+}
+
+/**
+ * wq_anon_cpumask_set - apply a cpumask to every anon unbound workqueue
+ * @cpumask: the affinity mask to apply
+ *
+ * Walks workqueues_unbound and applies @cpumask to each workqueue that does
+ * not carry WQ_SYSFS. Ordered workqueues are skipped as well:
+ * apply_workqueue_attrs() refuses them with a WARN and -EINVAL because
+ * creating multiple pwqs breaks their ordering guarantee, and that failure
+ * would otherwise abort the whole walk.
+ *
+ * Must be called with wq_unbound_mutex held.
+ *
+ * Returns 0 on success (including when there was no workqueue to update),
+ * -ENOMEM if the attrs could not be prepared, or the error returned by
+ * apply_workqueue_attrs(). The walk stops at the first failure; workqueues
+ * updated before that point keep the new mask.
+ */
+static int wq_anon_cpumask_set(cpumask_var_t cpumask)
+{
+	struct workqueue_attrs *attrs;
+	struct workqueue_struct *wq;
+	int ret = 0;	/* the loop body may never run */
+
+	list_for_each_entry(wq, &workqueues_unbound, unbound_list) {
+		if (wq->flags & (WQ_SYSFS | __WQ_ORDERED))
+			continue;
+
+		attrs = wq_sysfs_prep_attrs(wq);
+		if (!attrs)
+			return -ENOMEM;
+
+		cpumask_copy(attrs->cpumask, cpumask);
+		ret = apply_workqueue_attrs(wq, attrs);
+		free_workqueue_attrs(attrs);
+		if (ret)
+			break;
+	}
+
+	return ret;
+}
+
+/**
+ * wq_anon_cpumask_store - sysfs write handler for the anon workqueues cpumask
+ * @dev: device the attribute belongs to (unused)
+ * @attr: the attribute being written (unused)
+ * @buf: cpumask string supplied by the writer
+ * @count: number of bytes in @buf
+ *
+ * Parses @buf as a cpumask and, provided it contains at least one online
+ * CPU, applies it to the anon workqueues. wq_anon_cpumask is only updated
+ * when wq_anon_cpumask_set() succeeds.
+ *
+ * Returns @count on success, -ENOMEM if the temporary mask cannot be
+ * allocated, the cpumask_parse() error on malformed input, -EINVAL if the
+ * mask contains no online CPU, or the error from wq_anon_cpumask_set().
+ */
+static ssize_t wq_anon_cpumask_store(struct device *dev,
+				     struct device_attribute *attr,
+				     const char *buf, size_t count)
+{
+	cpumask_var_t cpumask;
+	int ret;
+
+	if (!zalloc_cpumask_var(&cpumask, GFP_KERNEL))
+		return -ENOMEM;
+
+	ret = cpumask_parse(buf, cpumask);
+	if (ret)
+		goto out;
+
+	/* Hold off CPU hotplug while the mask is checked and applied */
+	get_online_cpus();
+	if (cpumask_intersects(cpumask, cpu_online_mask)) {
+		mutex_lock(&wq_unbound_mutex);
+		ret = wq_anon_cpumask_set(cpumask);
+		if (!ret)
+			cpumask_copy(&wq_anon_cpumask, cpumask);
+		mutex_unlock(&wq_unbound_mutex);
+	} else {
+		/* Nothing was applied: don't report success to the writer */
+		ret = -EINVAL;
+	}
+	put_online_cpus();
+out:
+	free_cpumask_var(cpumask);
+	return ret ? ret : count;
+}
+
+/*
+ * Release callback installed on the anon workqueues device by
+ * wq_sysfs_init(): frees the struct device that was kzalloc()ed there.
+ */
+static void device_release(struct device *dev)
+{
+ kfree(dev);
+}
+
+/*
+ * "cpumask" attribute (mode 0644) of the anon workqueues device: reads go
+ * through wq_anon_cpumask_show(), writes through wq_anon_cpumask_store().
+ */
+static struct device_attribute wq_sysfs_anon_attr =
+ __ATTR(cpumask, 0644, wq_anon_cpumask_show, wq_anon_cpumask_store);
+
+/* "workqueue" bus, registered by wq_sysfs_init(); the anon wqs device sits on it */
static struct bus_type wq_subsys = {
.name = "workqueue",
};

static int __init wq_sysfs_init(void)
{
- return subsys_virtual_register(&wq_subsys, NULL);
+ struct device *anon_dev;
+ int ret;
+
+ ret = subsys_virtual_register(&wq_subsys, NULL);
+ if (ret < 0)
+ return ret;
+
+ mutex_lock(&wq_unbound_mutex);
+ cpumask_copy(&wq_anon_cpumask, cpu_possible_mask);
+ mutex_unlock(&wq_unbound_mutex);
+
+ anon_dev = kzalloc(sizeof(*anon_dev), GFP_KERNEL);
+ if (!anon_dev)
+ return -ENOMEM;
+
+ anon_dev->bus = &wq_subsys;
+ anon_dev->init_name = "anon_wqs";
+ anon_dev->release = device_release;
+
+ ret = device_register(anon_dev);
+ if (ret) {
+ kfree(anon_dev);
+ return ret;
+ }
+
+ ret = device_create_file(anon_dev, &wq_sysfs_anon_attr);
+ if (ret) {
+ device_unregister(anon_dev);
+ return ret;
+ }
+
+ kobject_uevent(&anon_dev->kobj, KOBJ_ADD);
+
+ return 0;
}
core_initcall(wq_sysfs_init);

--
1.8.3.1

--
To unsubscribe from this list: send the line "unsubscribe linux-kernel" in
the body of a message to majordomo@xxxxxxxxxxxxxxx
More majordomo info at http://vger.kernel.org/majordomo-info.html
Please read the FAQ at http://www.tux.org/lkml/