[RFC PATCH 2/3] fsio-throttle: controller infrastructure

From: Andrea Righi
Date: Fri Jan 18 2019 - 05:32:01 EST


This is the core of the fsio-throttle controller: it defines the
interface to the cgroup subsystem and implements the I/O measurement and
throttling logic.

Signed-off-by: Andrea Righi <righi.andrea@xxxxxxxxx>
---
include/linux/cgroup_subsys.h | 4 +
include/linux/fsio-throttle.h | 43 +++
init/Kconfig | 11 +
kernel/cgroup/Makefile | 1 +
kernel/cgroup/fsio-throttle.c | 501 ++++++++++++++++++++++++++++++++++
5 files changed, 560 insertions(+)
create mode 100644 include/linux/fsio-throttle.h
create mode 100644 kernel/cgroup/fsio-throttle.c

diff --git a/include/linux/cgroup_subsys.h b/include/linux/cgroup_subsys.h
index acb77dcff3b4..33beb70c0eca 100644
--- a/include/linux/cgroup_subsys.h
+++ b/include/linux/cgroup_subsys.h
@@ -61,6 +61,10 @@ SUBSYS(pids)
SUBSYS(rdma)
#endif

+#if IS_ENABLED(CONFIG_CGROUP_FSIO_THROTTLE)
+SUBSYS(fsio)
+#endif
+
/*
* The following subsystems are not supported on the default hierarchy.
*/
diff --git a/include/linux/fsio-throttle.h b/include/linux/fsio-throttle.h
new file mode 100644
index 000000000000..3a46df712475
--- /dev/null
+++ b/include/linux/fsio-throttle.h
@@ -0,0 +1,43 @@
+/* SPDX-License-Identifier: GPL-2.0 */
+
+#ifndef __FSIO_THROTTLE_H__
+#define __FSIO_THROTTLE_H__
+
+#include <linux/fs.h>
+#include <linux/genhd.h>
+
+#ifdef CONFIG_BLOCK
+/*
+ * Build a dev_t for @bdev from the major number of its inode's i_rdev and
+ * the first minor of its gendisk.  Returns 0 when @bdev is NULL.
+ */
+static inline dev_t bdev_to_dev(struct block_device *bdev)
+{
+	if (!bdev)
+		return 0;
+
+	return MKDEV(MAJOR(bdev->bd_inode->i_rdev), bdev->bd_disk->first_minor);
+}
+
+/*
+ * Return the block device recorded in the superblock of the inode hosting
+ * @mapping, or NULL when the mapping has no host inode or the superblock has
+ * no block device.
+ */
+static inline struct block_device *as_to_bdev(struct address_space *mapping)
+{
+	struct inode *host = mapping->host;
+
+	if (!host)
+		return NULL;
+
+	return host->i_sb->s_bdev;
+}
+#else /* CONFIG_BLOCK */
+/*
+ * !CONFIG_BLOCK stub: there are no block devices, so every lookup yields 0.
+ * Must be "static inline" like its sibling stubs: a plain "static" function
+ * defined in a header triggers an unused-function warning in every file that
+ * includes the header without calling it.
+ */
+static inline dev_t bdev_to_dev(struct block_device *bdev)
+{
+	return 0;
+}
+
+/* !CONFIG_BLOCK stub: no mapping is backed by a block device. */
+static inline struct block_device *as_to_bdev(struct address_space *mapping)
+{
+ return NULL;
+}
+#endif /* CONFIG_BLOCK */
+
+#ifdef CONFIG_CGROUP_FSIO_THROTTLE
+/*
+ * Account @bytes of I/O on device @dev to the fsio cgroup of the current task
+ * and, if the cgroup is over its limit, put the task to sleep in task state
+ * @state.  Passing @state == 0 only does the accounting.  Returns the sleep
+ * that was evaluated (0 when the task was not throttled).
+ */
+int fsio_throttle(dev_t dev, ssize_t bytes, int state);
+#else /* CONFIG_CGROUP_FSIO_THROTTLE */
+/* Controller compiled out: never throttle. */
+static inline int
+fsio_throttle(dev_t dev, ssize_t bytes, int state)
+{
+ return 0;
+}
+#endif /* CONFIG_CGROUP_FSIO_THROTTLE */
+
+#endif /* __FSIO_THROTTLE_H__ */
diff --git a/init/Kconfig b/init/Kconfig
index d47cb77a220e..95d7342801eb 100644
--- a/init/Kconfig
+++ b/init/Kconfig
@@ -775,6 +775,17 @@ config CGROUP_WRITEBACK
depends on MEMCG && BLK_CGROUP
default y

+config CGROUP_FSIO_THROTTLE
+ bool "Filesystem I/O throttling controller"
+ default n
+ depends on BLOCK
+ help
+ This option enables filesystem I/O throttling infrastructure.
+
+ This allows reads and writes to be throttled properly at the
+ filesystem level, without introducing I/O locking contention or
+ priority inversion problems.
+
menuconfig CGROUP_SCHED
bool "CPU controller"
default n
diff --git a/kernel/cgroup/Makefile b/kernel/cgroup/Makefile
index bfcdae896122..12de828b36cd 100644
--- a/kernel/cgroup/Makefile
+++ b/kernel/cgroup/Makefile
@@ -2,6 +2,7 @@
obj-y := cgroup.o rstat.o namespace.o cgroup-v1.o

obj-$(CONFIG_CGROUP_FREEZER) += freezer.o
+obj-$(CONFIG_CGROUP_FSIO_THROTTLE) += fsio-throttle.o
obj-$(CONFIG_CGROUP_PIDS) += pids.o
obj-$(CONFIG_CGROUP_RDMA) += rdma.o
obj-$(CONFIG_CPUSETS) += cpuset.o
diff --git a/kernel/cgroup/fsio-throttle.c b/kernel/cgroup/fsio-throttle.c
new file mode 100644
index 000000000000..46f3ffd4015b
--- /dev/null
+++ b/kernel/cgroup/fsio-throttle.c
@@ -0,0 +1,501 @@
+// SPDX-License-Identifier: GPL-2.0
+
+/*
+ * fsio-throttle.c - I/O cgroup controller
+ *
+ * Copyright (C) 2019 Andrea Righi <righi.andrea@xxxxxxxxx>
+ */
+
+#include <linux/types.h>
+#include <linux/string.h>
+#include <linux/jiffies.h>
+#include <linux/spinlock.h>
+#include <linux/timer.h>
+#include <linux/moduleparam.h>
+#include <linux/genhd.h>
+#include <linux/slab.h>
+#include <linux/vmalloc.h>
+#include <linux/sched/signal.h>
+#include <linux/cgroup.h>
+#include <linux/fsio-throttle.h>
+
+/*
+ * Size helpers: each step multiplies by 1024, so KB(x), MB(x) and GB(x) are
+ * x * 2^10, x * 2^20 and x * 2^30 respectively.  The arithmetic is done in
+ * the type of the argument, so pass a 64-bit value when the result may not
+ * fit in an int.
+ */
+#define KB(x) ((x) * 1024)
+#define MB(x) (KB(KB(x)))
+#define GB(x) (MB(KB(x)))
+
+/* Non-zero: kernel threads are throttled like any other task. */
+static int throttle_kernel_threads __read_mostly;
+module_param(throttle_kernel_threads, int, 0644);
+MODULE_PARM_DESC(throttle_kernel_threads,
+		 "enable/disable I/O throttling for kernel threads");
+
+/*
+ * Sleeps shorter than this are not enforced, and wakeup times are rounded up
+ * to a multiple of it.
+ */
+static int throttle_timeslice_ms __read_mostly = 250;
+module_param(throttle_timeslice_ms, int, 0644);
+MODULE_PARM_DESC(throttle_timeslice_ms,
+		 "throttling time slice (default 250ms)");
+
+/* Upper bound applied to a single enforced sleep. */
+static int throttle_timeframe_ms __read_mostly = 2000;
+module_param(throttle_timeframe_ms, int, 0644);
+MODULE_PARM_DESC(throttle_timeframe_ms,
+		 "maximum sleep time enforced (default 2000ms)");
+
+/* Per-cgroup state of the fsio controller. */
+struct iothrottle {
+ /* embedded css; css_to_iothrottle() maps it back with container_of() */
+ struct cgroup_subsys_state css;
+ /* RCU-protected list of per-device iothrottle_node limits */
+ struct list_head list;
+ /* protect the list of iothrottle_node elements (list) */
+ struct mutex lock;
+ /* throttled tasks sleep here until the timer or css offline wakes them */
+ wait_queue_head_t wait;
+ /* single timer shared by all sleepers; runs iothrottle_timer_wakeup() */
+ struct timer_list timer;
+ /* set when the css goes offline: sleepers must stop re-arming the timer */
+ bool timer_cancel;
+ /* protect the wait queue elements */
+ spinlock_t wait_lock;
+};
+
+/*
+ * Token-bucket state for one limit.  iothrottle_parse_args() stores @limit
+ * and @bucket_size in bytes (MiB values written by the user times 2^20).
+ */
+struct iothrottle_limit {
+ /* current bucket level; written with wrapped negative values on overdraft */
+ unsigned long long usage;
+ /* cap applied to the bucket level when it is refilled */
+ unsigned long long bucket_size;
+ /* refill rate; multiplied by elapsed milliseconds and scaled by MSEC_PER_SEC */
+ unsigned long long limit;
+ /* get_jiffies_64() value of the last refill */
+ unsigned long long timestamp;
+ /* protect all of the above */
+ spinlock_t lock;
+};
+
+/* One throttling rule: the bandwidth limit of a cgroup on a given device. */
+struct iothrottle_node {
+ /* link in iothrottle->list */
+ struct list_head node;
+ /* used by call_rcu() to free the node after it is unlinked */
+ struct rcu_head rcu;
+ /* bandwidth token bucket */
+ struct iothrottle_limit bw;
+ /* device the limit applies to; lookup key in iothrottle_node_search() */
+ dev_t dev;
+};
+
+/* Tell whether the fsio cgroup subsystem is currently not enabled. */
+static inline bool iothrottle_disabled(void)
+{
+	bool enabled = cgroup_subsys_enabled(fsio_cgrp_subsys);
+
+	return !enabled;
+}
+
+/* Map a css to the iothrottle embedding it; a NULL css maps to NULL. */
+static struct iothrottle *css_to_iothrottle(struct cgroup_subsys_state *css)
+{
+	if (!css)
+		return NULL;
+
+	return container_of(css, struct iothrottle, css);
+}
+
+/*
+ * Return the iothrottle of the fsio cgroup @p belongs to, or NULL when @p is
+ * NULL.  No reference is taken on the css.
+ *
+ * File-local helper: struct iothrottle is private to this file and no header
+ * declares this function, so it is static.
+ */
+static struct iothrottle *task_to_iothrottle(struct task_struct *p)
+{
+	if (unlikely(!p))
+		return NULL;
+	return css_to_iothrottle(task_css(p, fsio_cgrp_id));
+}
+
+/* Jiffies elapsed since @res->timestamp was last updated. */
+static inline unsigned long long
+iothrottle_limit_delta_t(struct iothrottle_limit *res)
+{
+	long long now = (long long)get_jiffies_64();
+	long long last = (long long)res->timestamp;
+
+	return now - last;
+}
+
+/*
+ * Set up a fresh token bucket: empty usage, the given refill rate and
+ * capacity, and "now" as the last refill time.
+ */
+static void iothrottle_limit_init(struct iothrottle_limit *res,
+				  unsigned long long limit,
+				  unsigned long long bucket_size)
+{
+	spin_lock_init(&res->lock);
+
+	res->usage = 0;
+	res->limit = limit;
+	res->bucket_size = bucket_size;
+	res->timestamp = get_jiffies_64();
+}
+
+/*
+ * Charge @size bytes to the token bucket @res, refill it for the time elapsed
+ * since the last call, and return how long (in jiffies) the caller should
+ * sleep to pay back the overdraft.  Returns 0 when the bucket is not
+ * overdrawn.
+ *
+ * Tokens are tracked scaled by MSEC_PER_SEC so that the refill can be
+ * computed from a millisecond delta without losing precision.
+ */
+static unsigned long long
+iothrottle_limit_sleep(struct iothrottle_limit *res, unsigned long long size)
+{
+	unsigned long long delta, wait_ms;
+	long long tok;
+	unsigned long flags;
+
+	spin_lock_irqsave(&res->lock, flags);
+	res->usage -= size;
+	delta = jiffies_to_msecs(iothrottle_limit_delta_t(res));
+	res->timestamp = get_jiffies_64();
+	tok = (long long)res->usage * MSEC_PER_SEC;
+	if (delta) {
+		long long max = (long long)res->bucket_size * MSEC_PER_SEC;
+
+		tok += delta * res->limit;
+		tok = min_t(long long, tok, max);
+		res->usage = (unsigned long long)div_s64(tok, MSEC_PER_SEC);
+	}
+	spin_unlock_irqrestore(&res->lock, flags);
+
+	if (tok >= 0)
+		return 0;
+
+	/*
+	 * res->limit is 64 bit wide: div_u64() takes a 32-bit divisor and
+	 * would silently truncate it (down to 0 for multiples of 4GiB/s), so
+	 * use the full 64-by-64 division.  The limit is set once at node
+	 * creation and nodes are only inserted with a non-zero limit.
+	 */
+	wait_ms = div64_u64(-tok, res->limit);
+
+	/*
+	 * msecs_to_jiffies() takes an unsigned int: clamp instead of letting
+	 * a huge delay wrap around to a short one.
+	 */
+	return msecs_to_jiffies(min_t(unsigned long long, wait_ms, INT_MAX));
+}
+
+/* Forget any accumulated credit or overdraft: set the bucket level to 0. */
+static void iothrottle_limit_reset(struct iothrottle_limit *res)
+{
+	unsigned long irqflags;
+
+	spin_lock_irqsave(&res->lock, irqflags);
+	res->usage = 0;
+	spin_unlock_irqrestore(&res->lock, irqflags);
+}
+
+/*
+ * Allocation size of a node.  iothrottle_node_alloc() and
+ * iothrottle_node_free() both compare it against PAGE_SIZE to choose between
+ * the slab and vmalloc allocators.
+ */
+static inline int iothrottle_node_size(void)
+{
+ return sizeof(struct iothrottle_node);
+}
+
+/*
+ * Allocate a zero-filled iothrottle_node.  Nodes smaller than a page come
+ * from the slab allocator using @flags; larger ones come from vmalloc.  The
+ * same size test is used by iothrottle_node_free() to pick the matching free
+ * routine.  Returns NULL on allocation failure.
+ */
+static struct iothrottle_node *iothrottle_node_alloc(gfp_t flags)
+{
+	int size = iothrottle_node_size();
+
+	/* Use the zeroing allocators instead of allocating and memset()ing. */
+	if (size < PAGE_SIZE)
+		return kzalloc(size, flags);
+
+	return vzalloc(size);
+}
+
+/*
+ * Release a node obtained from iothrottle_node_alloc(), using the free
+ * routine that matches the allocator chosen there.
+ */
+static void iothrottle_node_free(struct iothrottle_node *n)
+{
+	if (iothrottle_node_size() >= PAGE_SIZE) {
+		vfree(n);
+		return;
+	}
+
+	kfree(n);
+}
+
+/*
+ * Look up the rule for device @dev in @iot.  The list is walked with the RCU
+ * list iterator.  Returns the matching node, or NULL if @dev has no rule.
+ */
+static struct iothrottle_node *
+iothrottle_node_search(const struct iothrottle *iot, dev_t dev)
+{
+	struct iothrottle_node *pos;
+
+	list_for_each_entry_rcu(pos, &iot->list, node) {
+		if (pos->dev != dev)
+			continue;
+		return pos;
+	}
+
+	return NULL;
+}
+
+/* call_rcu() callback: free the node that embeds @rp. */
+static void iothrottle_node_reclaim(struct rcu_head *rp)
+{
+	iothrottle_node_free(container_of(rp, struct iothrottle_node, rcu));
+}
+
+/*
+ * Parse a rule of the form "MAJOR:MINOR LIMIT_MB BUCKET_MB" from @buf.
+ *
+ * On success stores the device number in @dev and the limit and bucket size,
+ * converted from MiB to bytes, in @io_limit and @bucket_size.
+ *
+ * Returns 0 on success, -EINVAL if the string does not contain the four
+ * fields, -ERANGE if a size is too large to be handled without overflow,
+ * -ENODEV if the device does not exist or is a partition rather than a whole
+ * disk.
+ */
+static int iothrottle_parse_args(char *buf, size_t nbytes,
+				 dev_t *dev,
+				 unsigned long long *io_limit,
+				 unsigned long long *bucket_size)
+{
+	/*
+	 * Largest MiB value whose byte count, scaled by MSEC_PER_SEC as done
+	 * by the token accounting, still fits in a signed 64-bit integer.
+	 */
+	const unsigned long long max_mb =
+		LLONG_MAX / (MB(1LL) * MSEC_PER_SEC);
+	struct gendisk *disk;
+	unsigned int major, minor;
+	unsigned long long limit, size;
+	int part, ret = 0;
+
+	if (sscanf(buf, "%u:%u %llu %llu", &major, &minor, &limit, &size) != 4)
+		return -EINVAL;
+	/* The values come from user space: refuse anything that would wrap. */
+	if (limit > max_mb || size > max_mb)
+		return -ERANGE;
+	disk = get_gendisk(MKDEV(major, minor), &part);
+	if (!disk)
+		return -ENODEV;
+	/* Rules can only be attached to whole disks, not to partitions. */
+	if (part) {
+		ret = -ENODEV;
+		goto out;
+	}
+	*dev = MKDEV(major, minor);
+	*io_limit = MB(limit);
+	*bucket_size = MB(size);
+out:
+	put_disk_and_module(disk);
+
+	return ret;
+}
+
+/*
+ * Write handler of the "max_mbs" file: insert, update or delete the rule for
+ * one device.
+ *
+ * A non-zero limit inserts a new rule or replaces the existing one for the
+ * device; a zero limit deletes the existing rule, if any.  A replaced or
+ * deleted node is freed after an RCU grace period.
+ *
+ * Returns @nbytes on success or a negative errno.
+ */
+static ssize_t iothrottle_write(struct kernfs_open_file *of,
+				char *buffer, size_t nbytes, loff_t off)
+{
+	struct iothrottle *iot;
+	struct iothrottle_node *n, *newn = NULL;
+	unsigned long long io_limit, bucket_size;
+	dev_t dev;
+	int ret;
+
+	/*
+	 * Fail with an error instead of returning 0: a write handler that
+	 * reports "0 bytes written" makes user space retry forever.
+	 */
+	iot = css_to_iothrottle(of_css(of));
+	if (WARN_ON_ONCE(!iot))
+		return -ENODEV;
+
+	/*
+	 * The buffer is only read by the parser, so it is parsed in place
+	 * rather than through a temporary copy.
+	 */
+	ret = iothrottle_parse_args(buffer, nbytes, &dev, &io_limit,
+				    &bucket_size);
+	if (ret)
+		return ret;
+
+	/* A new node is only needed when a limit is being set. */
+	if (io_limit) {
+		newn = iothrottle_node_alloc(GFP_KERNEL);
+		if (!newn)
+			return -ENOMEM;
+		newn->dev = dev;
+		iothrottle_limit_init(&newn->bw, io_limit, bucket_size);
+	}
+
+	mutex_lock(&iot->lock);
+	n = iothrottle_node_search(iot, dev);
+	if (n && newn) {
+		/* Update existing node */
+		list_replace_rcu(&n->node, &newn->node);
+	} else if (n) {
+		/* Delete existing node */
+		list_del_rcu(&n->node);
+	} else if (newn) {
+		/* Insert new node */
+		list_add_rcu(&newn->node, &iot->list);
+	}
+	mutex_unlock(&iot->lock);
+
+	/* Readers may still hold the old node: free it after a grace period. */
+	if (n)
+		call_rcu(&n->rcu, iothrottle_node_reclaim);
+
+	return nbytes;
+}
+
+/*
+ * Emit one line describing a rule:
+ *   MAJOR:MINOR limit usage bucket_size elapsed
+ * where the last field is the time since the last refill converted with
+ * jiffies_to_clock_t().  The fields are read without taking res->lock.
+ */
+static void iothrottle_show_limit(struct seq_file *m,
+				  dev_t dev, struct iothrottle_limit *res)
+{
+	unsigned long long elapsed;
+
+	seq_put_decimal_ull(m, "", MAJOR(dev));
+	seq_put_decimal_ull(m, ":", MINOR(dev));
+	seq_put_decimal_ull(m, " ", res->limit);
+	seq_put_decimal_ull(m, " ", res->usage);
+	seq_put_decimal_ull(m, " ", res->bucket_size);
+
+	elapsed = jiffies_to_clock_t(iothrottle_limit_delta_t(res));
+	seq_put_decimal_ull(m, " ", elapsed);
+
+	seq_putc(m, '\n');
+}
+
+/*
+ * seq_show handler of the "max_mbs" file: print one line per rule of the
+ * cgroup, walking the list under the RCU read lock.  Always returns 0.
+ */
+static int iothrottle_read(struct seq_file *m, void *v)
+{
+	struct iothrottle_node *pos;
+	struct iothrottle *iot;
+
+	iot = css_to_iothrottle(seq_css(m));
+
+	rcu_read_lock();
+	list_for_each_entry_rcu(pos, &iot->list, node) {
+		iothrottle_show_limit(m, pos->dev, &pos->bw);
+	}
+	rcu_read_unlock();
+
+	return 0;
+}
+
+/*
+ * Control files of the controller.  "max_mbs" is not created in the root
+ * cgroup: reading lists the rules, writing sets or clears one.
+ */
+static struct cftype iothrottle_files[] = {
+	{
+		.name = "max_mbs",
+		.seq_show = iothrottle_read,
+		.write = iothrottle_write,
+		.flags = CFTYPE_NOT_ON_ROOT,
+	},
+	/*
+	 * The cgroup core walks the array until it finds an entry with an
+	 * empty name: without this terminator it runs past the end.
+	 */
+	{ }	/* terminate */
+};
+
+/*
+ * Wake every task sleeping on @iot.  When @timer_cancel is true the cgroup
+ * is also flagged so that sleepers stop re-arming the timer.  Flag update
+ * and wakeup both happen under wait_lock.
+ */
+static void iothrottle_wakeup(struct iothrottle *iot, bool timer_cancel)
+{
+	spin_lock_bh(&iot->wait_lock);
+
+	if (timer_cancel) {
+		/* Sticky: the flag is never cleared once set. */
+		iot->timer_cancel = true;
+	}
+	wake_up_all(&iot->wait);
+
+	spin_unlock_bh(&iot->wait_lock);
+}
+
+/* Timer callback: wake the sleepers of the cgroup that owns timer @t. */
+static void iothrottle_timer_wakeup(struct timer_list *t)
+{
+	struct iothrottle *owner = from_timer(owner, t, timer);
+
+	iothrottle_wakeup(owner, false);
+}
+
+/*
+ * css_alloc callback: allocate and initialise the per-cgroup state (empty
+ * rule list, locks, wait queue and wakeup timer).  Returns the embedded css,
+ * or ERR_PTR(-ENOMEM) if the allocation fails.
+ */
+static struct cgroup_subsys_state *
+iothrottle_css_alloc(struct cgroup_subsys_state *parent)
+{
+	struct iothrottle *group = kzalloc(sizeof(*group), GFP_KERNEL);
+
+	if (!group)
+		return ERR_PTR(-ENOMEM);
+
+	/* rule list and its lock */
+	INIT_LIST_HEAD(&group->list);
+	mutex_init(&group->lock);
+
+	/* sleeping/wakeup machinery */
+	spin_lock_init(&group->wait_lock);
+	init_waitqueue_head(&group->wait);
+	group->timer_cancel = false;
+	timer_setup(&group->timer, iothrottle_timer_wakeup, 0);
+
+	return &group->css;
+}
+
+/*
+ * css_offline callback: release every task currently throttled in this
+ * cgroup and prevent further sleeps.
+ *
+ * iothrottle_wakeup() sets timer_cancel and wakes the sleepers in a single
+ * wait_lock critical section, so the flag does not need to be set separately
+ * beforehand.
+ */
+static void iothrottle_css_offline(struct cgroup_subsys_state *css)
+{
+	iothrottle_wakeup(css_to_iothrottle(css), true);
+}
+
+/*
+ * css_free callback: stop the wakeup timer, release every rule still on the
+ * list and free the per-cgroup state.
+ */
+static void iothrottle_css_free(struct cgroup_subsys_state *css)
+{
+	struct iothrottle *iot = css_to_iothrottle(css);
+	struct iothrottle_node *pos, *next;
+
+	del_timer_sync(&iot->timer);
+
+	/*
+	 * No locking is needed to walk the list: nothing else references it
+	 * at this point.
+	 */
+	list_for_each_entry_safe(pos, next, &iot->list, node) {
+		iothrottle_node_free(pos);
+	}
+
+	kfree(iot);
+}
+
+/* True if the current task has PF_KTHREAD or PF_KSWAPD set. */
+static inline bool is_kernel_thread(void)
+{
+	const unsigned int kthread_flags = PF_KTHREAD | PF_KSWAPD;
+
+	return (current->flags & kthread_flags) != 0;
+}
+
+/* Decide whether the current task must be exempted from throttling. */
+static inline bool is_urgent_task(void)
+{
+	/* A task on its way out is never throttled. */
+	if (current->flags & PF_EXITING)
+		return true;
+
+	/* With throttle_kernel_threads set, kernel threads get no exemption. */
+	if (throttle_kernel_threads)
+		return false;
+
+	return is_kernel_thread();
+}
+
+/*
+ * Return the iothrottle of the fsio cgroup of @p with a reference held on its
+ * css, or NULL when @p is in the root cgroup or has no iothrottle.  The
+ * caller drops the reference with css_put() when done (see fsio_throttle()).
+ */
+static struct iothrottle *try_get_iothrottle_from_task(struct task_struct *p)
+{
+ struct iothrottle *iot = NULL;
+
+ rcu_read_lock();
+ if (!task_css_is_root(p, fsio_cgrp_id)) {
+ /*
+ * css_tryget_online() fails when the css is no longer online; in
+ * that case look the task's css up again and retry.
+ */
+ do {
+ iot = task_to_iothrottle(p);
+ if (unlikely(!iot))
+ break;
+ } while (!css_tryget_online(&iot->css));
+ }
+ rcu_read_unlock();
+
+ return iot;
+}
+
+/*
+ * Account @bytes against the rule of @iot for device @dev and compute how
+ * long the caller has to sleep.
+ *
+ * Returns the sleep in jiffies, or 0 when there is no rule for @dev, when
+ * @state is 0 (accounting only) or when the computed sleep is shorter than
+ * one throttling time slice.  When a sleep is returned the bucket level is
+ * reset, since the caller is about to pay for the overdraft by sleeping.
+ *
+ * The return type matches the unsigned long long used for the delay by
+ * iothrottle_limit_sleep() and by the caller, so the value is not truncated
+ * through an int on the way.
+ */
+static unsigned long long iothrottle_evaluate_sleep(struct iothrottle *iot,
+						     dev_t dev, ssize_t bytes,
+						     int state)
+{
+	struct iothrottle_node *n;
+	unsigned long long sleep = 0;
+
+	rcu_read_lock();
+	n = iothrottle_node_search(iot, dev);
+	if (n) {
+		sleep = iothrottle_limit_sleep(&n->bw, bytes);
+		/*
+		 * state == 0 is used to do only I/O accounting without
+		 * enforcing sleeps.
+		 */
+		if (!state || sleep < msecs_to_jiffies(throttle_timeslice_ms))
+			sleep = 0;
+		if (sleep)
+			iothrottle_limit_reset(&n->bw);
+	}
+	rcu_read_unlock();
+
+	return sleep;
+}
+
+/*
+ * Put the current task to sleep on @iot for @sleep jiffies, in task state
+ * @state.
+ *
+ * The sleep is capped at throttle_timeframe_ms and its end is rounded up to
+ * a multiple of throttle_timeslice_ms.  The wait ends early if the cgroup
+ * goes offline (timer_cancel) or a fatal signal is pending.
+ */
+static noinline void iothrottle_force_sleep(struct iothrottle *iot,
+					    unsigned long long sleep,
+					    int state)
+{
+	/* Sample the tunables once: they can be changed at run time. */
+	unsigned long max_sleep = msecs_to_jiffies(throttle_timeframe_ms);
+	unsigned long slice = msecs_to_jiffies(throttle_timeslice_ms);
+	unsigned long expire;
+	bool cancelled;
+
+	/*
+	 * Allow small IO bursts, by waking up the throttled task after a
+	 * maximum sleep of throttle_timeframe_ms milliseconds.
+	 */
+	if (sleep > max_sleep)
+		sleep = max_sleep;
+
+	expire = READ_ONCE(jiffies) + sleep;
+
+	/*
+	 * Round up the time to sleep to a multiple of the sleep timeslice.
+	 *
+	 * In this way we can strongly reduce timer softirqs and
+	 * context switches in the system even when there are a lot of
+	 * different cgroups.
+	 *
+	 * A time slice of 0 (throttle_timeslice_ms set to 0) means no
+	 * rounding; roundup() would divide by zero.
+	 */
+	if (slice)
+		expire = roundup(expire, slice);
+
+	/* Force sleep */
+	do {
+		DEFINE_WAIT(wait);
+
+		/*
+		 * Do not enforce interruptible sleep if there are pending
+		 * signals, otherwise we'll end up into a busy loop.
+		 */
+		if (signal_pending(current))
+			state = TASK_KILLABLE;
+
+		/*
+		 * Queue ourselves before arming the timer and before checking
+		 * timer_cancel: a wakeup issued after this point makes
+		 * schedule() return immediately, so a timer that fires early
+		 * or a concurrent css offline cannot be missed.
+		 */
+		prepare_to_wait(&iot->wait, &wait, state);
+
+		spin_lock_bh(&iot->wait_lock);
+		cancelled = iot->timer_cancel;
+		if (likely(!cancelled))
+			mod_timer(&iot->timer, expire);
+		spin_unlock_bh(&iot->wait_lock);
+
+		/* Send to sleep */
+		if (likely(!cancelled))
+			schedule();
+		finish_wait(&iot->wait, &wait);
+	} while (!cancelled && !fatal_signal_pending(current) &&
+		 time_is_after_jiffies(expire));
+}
+
+/*
+ * Entry point of the controller: account @bytes of I/O on device @dev to the
+ * fsio cgroup of the current task and, if needed, put the task to sleep in
+ * task state @state (0 means accounting only).
+ *
+ * Nothing is done when the controller is disabled, the task is exempt from
+ * throttling, @dev is 0, or the task has no throttling cgroup.
+ *
+ * Returns the sleep that was evaluated, 0 if the task was not throttled.
+ */
+int fsio_throttle(dev_t dev, ssize_t bytes, int state)
+{
+	struct iothrottle *iot;
+	unsigned long long delay;
+
+	if (iothrottle_disabled())
+		return 0;
+	if (is_urgent_task())
+		return 0;
+	if (!dev)
+		return 0;
+
+	iot = try_get_iothrottle_from_task(current);
+	if (!iot)
+		return 0;
+
+	delay = iothrottle_evaluate_sleep(iot, dev, bytes, state);
+	if (unlikely(delay))
+		iothrottle_force_sleep(iot, delay, state);
+
+	/* Drop the reference taken by try_get_iothrottle_from_task(). */
+	css_put(&iot->css);
+
+	return delay;
+}
+
+/*
+ * Registration of the fsio controller with the cgroup core.  Only
+ * .dfl_cftypes is set, so the control files are defined for the default
+ * hierarchy only.
+ */
+struct cgroup_subsys fsio_cgrp_subsys = {
+ .css_alloc = iothrottle_css_alloc,
+ .css_free = iothrottle_css_free,
+ .css_offline = iothrottle_css_offline,
+ .dfl_cftypes = iothrottle_files,
+};
--
2.17.1