[PATCH 6/7] io-throttle instrumentation

From: Andrea Righi
Date: Sun May 03 2009 - 07:38:59 EST


Apply the io-throttle control and page tracking to the appropriate kernel
functions.

Signed-off-by: Andrea Righi <righi.andrea@xxxxxxxxx>
---
block/blk-core.c | 8 ++++++++
fs/aio.c | 12 ++++++++++++
fs/block_dev.c | 3 +++
fs/buffer.c | 2 ++
fs/direct-io.c | 3 +++
include/linux/fs.h | 4 ++++
include/linux/sched.h | 8 ++++++++
kernel/fork.c | 8 ++++++++
mm/bounce.c | 2 ++
mm/filemap.c | 2 ++
mm/page-writeback.c | 13 +++++++++++++
mm/readahead.c | 3 +++
12 files changed, 68 insertions(+), 0 deletions(-)

diff --git a/block/blk-core.c b/block/blk-core.c
index 2998fe3..a9689df 100644
--- a/block/blk-core.c
+++ b/block/blk-core.c
@@ -26,6 +26,7 @@
#include <linux/swap.h>
#include <linux/writeback.h>
#include <linux/task_io_accounting_ops.h>
+#include <linux/blk-io-throttle.h>
#include <linux/blktrace_api.h>
#include <linux/fault-inject.h>
#include <trace/block.h>
@@ -1549,11 +1550,16 @@ void submit_bio(int rw, struct bio *bio)
* go through the normal accounting stuff before submission.
*/
if (bio_has_data(bio)) {
+ unsigned long sleep = 0;
+
if (rw & WRITE) {
count_vm_events(PGPGOUT, count);
+ sleep = cgroup_io_throttle(bio,
+ bio->bi_bdev, bio->bi_size);
} else {
task_io_account_read(bio->bi_size);
count_vm_events(PGPGIN, count);
+ cgroup_io_throttle(NULL, bio->bi_bdev, bio->bi_size);
}

if (unlikely(block_dump)) {
@@ -1564,6 +1570,8 @@ void submit_bio(int rw, struct bio *bio)
(unsigned long long)bio->bi_sector,
bdevname(bio->bi_bdev, b));
}
+ if (sleep && !iothrottle_make_request(bio, jiffies + sleep))
+ return;
}

generic_make_request(bio);
diff --git a/fs/aio.c b/fs/aio.c
index 76da125..ab6c457 100644
--- a/fs/aio.c
+++ b/fs/aio.c
@@ -22,6 +22,7 @@
#include <linux/sched.h>
#include <linux/fs.h>
#include <linux/file.h>
+#include <linux/blk-io-throttle.h>
#include <linux/mm.h>
#include <linux/mman.h>
#include <linux/slab.h>
@@ -1587,6 +1588,7 @@ static int io_submit_one(struct kioctx *ctx, struct iocb __user *user_iocb,
{
struct kiocb *req;
struct file *file;
+ struct block_device *bdev;
ssize_t ret;

/* enforce forwards compatibility on users */
@@ -1609,6 +1611,14 @@ static int io_submit_one(struct kioctx *ctx, struct iocb __user *user_iocb,
if (unlikely(!file))
return -EBADF;

+ /* check if we're exceeding the IO throttling limits */
+ bdev = as_to_bdev(file->f_mapping);
+ ret = cgroup_io_throttle(NULL, bdev, 0);
+ if (unlikely(ret)) {
+ fput(file);
+ return -EAGAIN;
+ }
+
req = aio_get_req(ctx); /* returns with 2 references to req */
if (unlikely(!req)) {
fput(file);
@@ -1652,12 +1662,14 @@ static int io_submit_one(struct kioctx *ctx, struct iocb __user *user_iocb,
goto out_put_req;

spin_lock_irq(&ctx->ctx_lock);
+ set_in_aio();
aio_run_iocb(req);
if (!list_empty(&ctx->run_list)) {
/* drain the run list */
while (__aio_run_iocbs(ctx))
;
}
+ unset_in_aio();
spin_unlock_irq(&ctx->ctx_lock);
aio_put_req(req); /* drop extra ref to req */
return 0;
diff --git a/fs/block_dev.c b/fs/block_dev.c
index f45dbc1..21d1adf 100644
--- a/fs/block_dev.c
+++ b/fs/block_dev.c
@@ -431,6 +431,9 @@ static void init_once(void *foo)
#ifdef CONFIG_SYSFS
INIT_LIST_HEAD(&bdev->bd_holder_list);
#endif
+#ifdef CONFIG_CGROUP_IO_THROTTLE
+ bdev->last_access = jiffies;
+#endif
inode_init_once(&ei->vfs_inode);
/* Initialize mutex for freeze. */
mutex_init(&bdev->bd_fsfreeze_mutex);
diff --git a/fs/buffer.c b/fs/buffer.c
index aed2977..ecdcff5 100644
--- a/fs/buffer.c
+++ b/fs/buffer.c
@@ -36,6 +36,7 @@
#include <linux/buffer_head.h>
#include <linux/task_io_accounting_ops.h>
#include <linux/bio.h>
+#include <linux/blk-io-throttle.h>
#include <linux/notifier.h>
#include <linux/cpu.h>
#include <linux/bitops.h>
@@ -668,6 +669,7 @@ static void __set_page_dirty(struct page *page,
if (page->mapping) { /* Race with truncate? */
WARN_ON_ONCE(warn && !PageUptodate(page));
account_page_dirtied(page, mapping);
+ iothrottle_set_pagedirty_owner(page, current->mm);
radix_tree_tag_set(&mapping->page_tree,
page_index(page), PAGECACHE_TAG_DIRTY);
}
diff --git a/fs/direct-io.c b/fs/direct-io.c
index 05763bb..1b304b6 100644
--- a/fs/direct-io.c
+++ b/fs/direct-io.c
@@ -28,6 +28,7 @@
#include <linux/highmem.h>
#include <linux/pagemap.h>
#include <linux/task_io_accounting_ops.h>
+#include <linux/blk-io-throttle.h>
#include <linux/bio.h>
#include <linux/wait.h>
#include <linux/err.h>
@@ -340,7 +341,9 @@ static void dio_bio_submit(struct dio *dio)
if (dio->is_async && dio->rw == READ)
bio_set_pages_dirty(bio);

+ set_in_dio();
submit_bio(dio->rw, bio);
+ unset_in_dio();

dio->bio = NULL;
dio->boundary = 0;
diff --git a/include/linux/fs.h b/include/linux/fs.h
index 5bed436..701fc72 100644
--- a/include/linux/fs.h
+++ b/include/linux/fs.h
@@ -656,6 +656,10 @@ struct block_device {
struct gendisk * bd_disk;
struct list_head bd_list;
struct backing_dev_info *bd_inode_backing_dev_info;
+#ifdef CONFIG_CGROUP_IO_THROTTLE
+ unsigned int last_access;
+ unsigned int last_io_ticks;
+#endif
/*
* Private data. You must have bd_claim'ed the block_device
* to use this. NOTE: bd_claim allows an owner to claim
diff --git a/include/linux/sched.h b/include/linux/sched.h
index b4c38bc..3294430 100644
--- a/include/linux/sched.h
+++ b/include/linux/sched.h
@@ -1356,6 +1356,14 @@ struct task_struct {
unsigned long ptrace_message;
siginfo_t *last_siginfo; /* For ptrace use. */
struct task_io_accounting ioac;
+#ifdef CONFIG_CGROUP_IO_THROTTLE
+ atomic_t in_aio;
+ atomic_t in_dio;
+ unsigned long long io_throttle_bw_cnt;
+ unsigned long long io_throttle_bw_sleep;
+ unsigned long long io_throttle_iops_cnt;
+ unsigned long long io_throttle_iops_sleep;
+#endif
#if defined(CONFIG_TASK_XACCT)
u64 acct_rss_mem1; /* accumulated rss usage */
u64 acct_vm_mem1; /* accumulated virtual memory usage */
diff --git a/kernel/fork.c b/kernel/fork.c
index b9e2edd..7b4d991 100644
--- a/kernel/fork.c
+++ b/kernel/fork.c
@@ -1043,6 +1043,14 @@ static struct task_struct *copy_process(unsigned long clone_flags,
task_io_accounting_init(&p->ioac);
acct_clear_integrals(p);

+#ifdef CONFIG_CGROUP_IO_THROTTLE
+ atomic_set(&p->in_aio, 0);
+ atomic_set(&p->in_dio, 0);
+ p->io_throttle_bw_cnt = 0;
+ p->io_throttle_bw_sleep = 0;
+ p->io_throttle_iops_cnt = 0;
+ p->io_throttle_iops_sleep = 0;
+#endif
posix_cpu_timers_init(p);

p->lock_depth = -1; /* -1 = no lock */
diff --git a/mm/bounce.c b/mm/bounce.c
index e590272..80bf52c 100644
--- a/mm/bounce.c
+++ b/mm/bounce.c
@@ -10,6 +10,7 @@
#include <linux/pagemap.h>
#include <linux/mempool.h>
#include <linux/blkdev.h>
+#include <linux/blk-io-throttle.h>
#include <linux/init.h>
#include <linux/hash.h>
#include <linux/highmem.h>
@@ -212,6 +213,7 @@ static void __blk_queue_bounce(struct request_queue *q, struct bio **bio_orig,
to->bv_len = from->bv_len;
to->bv_offset = from->bv_offset;
inc_zone_page_state(to->bv_page, NR_BOUNCE);
+ iothrottle_copy_page_owner(to->bv_page, page);

if (rw == WRITE) {
char *vto, *vfrom;
diff --git a/mm/filemap.c b/mm/filemap.c
index 379ff0b..5498d1d 100644
--- a/mm/filemap.c
+++ b/mm/filemap.c
@@ -28,6 +28,7 @@
#include <linux/backing-dev.h>
#include <linux/pagevec.h>
#include <linux/blkdev.h>
+#include <linux/blk-io-throttle.h>
#include <linux/security.h>
#include <linux/syscalls.h>
#include <linux/cpuset.h>
@@ -464,6 +465,7 @@ int add_to_page_cache_locked(struct page *page, struct address_space *mapping,
gfp_mask & GFP_RECLAIM_MASK);
if (error)
goto out;
+ iothrottle_set_page_owner(page, current->mm);

error = radix_tree_preload(gfp_mask & ~__GFP_HIGHMEM);
if (error == 0) {
diff --git a/mm/page-writeback.c b/mm/page-writeback.c
index 30351f0..90cd65a 100644
--- a/mm/page-writeback.c
+++ b/mm/page-writeback.c
@@ -24,6 +24,7 @@
#include <linux/backing-dev.h>
#include <linux/task_io_accounting_ops.h>
#include <linux/blkdev.h>
+#include <linux/blk-io-throttle.h>
#include <linux/mpage.h>
#include <linux/rmap.h>
#include <linux/percpu.h>
@@ -626,12 +627,23 @@ void balance_dirty_pages_ratelimited_nr(struct address_space *mapping,
static DEFINE_PER_CPU(unsigned long, ratelimits) = 0;
unsigned long ratelimit;
unsigned long *p;
+ struct block_device *bdev = as_to_bdev(mapping);

ratelimit = ratelimit_pages;
if (mapping->backing_dev_info->dirty_exceeded)
ratelimit = 8;

/*
+ * Just check if we've exceeded cgroup IO limits, but do not account
+ * anything here because we're not actually doing IO at this stage.
+ *
+ * We just want to stop dirtying additional pages in the system,
+ * because we're not dispatching the IO requests generated by this
+ * cgroup.
+ */
+ cgroup_io_throttle(NULL, bdev, 0);
+
+ /*
* Check the rate limiting. Also, we do not want to throttle real-time
* tasks in balance_dirty_pages(). Period.
*/
@@ -1243,6 +1255,7 @@ int __set_page_dirty_nobuffers(struct page *page)
BUG_ON(mapping2 != mapping);
WARN_ON_ONCE(!PagePrivate(page) && !PageUptodate(page));
account_page_dirtied(page, mapping);
+ iothrottle_set_pagedirty_owner(page, current->mm);
radix_tree_tag_set(&mapping->page_tree,
page_index(page), PAGECACHE_TAG_DIRTY);
}
diff --git a/mm/readahead.c b/mm/readahead.c
index 133b6d5..25cae4c 100644
--- a/mm/readahead.c
+++ b/mm/readahead.c
@@ -14,6 +14,7 @@
#include <linux/blkdev.h>
#include <linux/backing-dev.h>
#include <linux/task_io_accounting_ops.h>
+#include <linux/blk-io-throttle.h>
#include <linux/pagevec.h>
#include <linux/pagemap.h>

@@ -81,6 +82,7 @@ int read_cache_pages(struct address_space *mapping, struct list_head *pages,
int (*filler)(void *, struct page *), void *data)
{
struct page *page;
+ struct block_device *bdev = as_to_bdev(mapping);
int ret = 0;

while (!list_empty(pages)) {
@@ -99,6 +101,7 @@ int read_cache_pages(struct address_space *mapping, struct list_head *pages,
break;
}
task_io_account_read(PAGE_CACHE_SIZE);
+ cgroup_io_throttle(NULL, bdev, PAGE_CACHE_SIZE);
}
return ret;
}
--
1.6.0.4

--
To unsubscribe from this list: send the line "unsubscribe linux-kernel" in
the body of a message to majordomo@xxxxxxxxxxxxxxx
More majordomo info at http://vger.kernel.org/majordomo-info.html
Please read the FAQ at http://www.tux.org/lkml/