[PATCH 07/10 v2] readahead: add /debug/readahead/stats

From: Wu Fengguang
Date: Fri Dec 23 2011 - 07:59:30 EST


The accounting code is compiled in by default (CONFIG_READAHEAD_STATS=y),
but remains inactive until explicitly enabled at runtime.

It can be enabled/disabled at runtime through the debugfs interface:

echo 1 > /debug/readahead/stats_enable
echo 0 > /debug/readahead/stats_enable

Example output:
(taken from a freshly booted NFS-root console box with rsize=524288)

$ cat /debug/readahead/stats
pattern     readahead    eof_hit  cache_hit         io    sync_io    mmap_io    meta_io       size async_size    io_size
initial           702        511          0        692        692          0          0          2          0          2
subsequent          7          0          1          7          1          1          0         23         22         23
context           160        161          0          2          0          1          0          0          0         16
around            184        184        177        184        184        184          0         58          0         53
backwards           2          0          2          2          2          0          0          4          0          3
fadvise          2593         47          8       2588       2588          0          0          1          0          1
oversize            0          0          0          0          0          0          0          0          0          0
random             45         20          0         44         44          0          0          1          0          1
all              3697        923        188       3519       3511        186          0          4          0          4

The two most important columns are:
- io         number of readahead I/Os issued
- io_size    average size of each readahead I/O, in pages
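
(size and async_size are averaged over the request count, while io_size
is the actual number of pages read divided by the I/O count, as computed
in readahead_stats_show() below.) As a worked example, the "around" row
above reports 184 mmap read-around I/Os averaging 53 pages each, i.e.
roughly 53 * 4KB = 212KB per fault-triggered readahead, assuming 4KB
pages.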

CC: Ingo Molnar <mingo@xxxxxxx>
CC: Jens Axboe <axboe@xxxxxxxxx>
CC: Peter Zijlstra <a.p.zijlstra@xxxxxxxxx>
Acked-by: Rik van Riel <riel@xxxxxxxxxx>
Signed-off-by: Wu Fengguang <fengguang.wu@xxxxxxxxx>
---
mm/Kconfig | 15 +++
mm/readahead.c | 203 +++++++++++++++++++++++++++++++++++++++++++++++
2 files changed, 218 insertions(+)

This version switches to the percpu_counter facilities.
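
The accounting below leans entirely on the percpu_counter API; the
pattern boils down to something like this (a minimal sketch for
illustration only, not part of the patch; the helper names are made up):

	#include <linux/percpu_counter.h>

	static struct percpu_counter counter;

	static int __init counter_setup(void)
	{
		/* one-time setup; the second argument is the initial value */
		return percpu_counter_init(&counter, 0);
	}

	static void counter_hit(void)
	{
		/*
		 * Hot path: bump the local CPU's count. The shared count
		 * is only touched once the local delta exceeds the batch,
		 * so a huge batch keeps cacheline bouncing off this path.
		 */
		__percpu_counter_add(&counter, 1, INT_MAX / 2);
	}

	static s64 counter_read(void)
	{
		/* slow path: fold all per-CPU deltas into one exact value */
		return percpu_counter_sum(&counter);
	}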

--- linux-next.orig/mm/readahead.c 2011-12-23 20:29:14.000000000 +0800
+++ linux-next/mm/readahead.c 2011-12-23 20:50:04.000000000 +0800
@@ -33,6 +33,203 @@ EXPORT_SYMBOL_GPL(file_ra_state_init);

#define list_to_page(head) (list_entry((head)->prev, struct page, lru))

+#ifdef CONFIG_READAHEAD_STATS
+#include <linux/ftrace_event.h>
+#include <linux/seq_file.h>
+#include <linux/debugfs.h>
+
+static u32 readahead_stats_enable __read_mostly;
+
+static const struct trace_print_flags ra_pattern_names[] = {
+ READAHEAD_PATTERNS
+};
+
+enum ra_account {
+ /* number of readaheads */
+ RA_ACCOUNT_COUNT, /* readahead request */
+ RA_ACCOUNT_EOF, /* readahead request covers EOF */
+ RA_ACCOUNT_CACHE_HIT, /* readahead request covers some cached pages */
+ RA_ACCOUNT_IOCOUNT, /* readahead IO */
+ RA_ACCOUNT_SYNC, /* readahead IO that is synchronous */
+ RA_ACCOUNT_MMAP, /* readahead IO by mmap page faults */
+ RA_ACCOUNT_METADATA, /* readahead IO on metadata */
+ /* number of readahead pages */
+ RA_ACCOUNT_SIZE, /* readahead size */
+ RA_ACCOUNT_ASYNC_SIZE, /* readahead async size */
+ RA_ACCOUNT_ACTUAL, /* readahead actual IO size */
+ /* end mark */
+ RA_ACCOUNT_MAX,
+};
+
+#define RA_STAT_BATCH (INT_MAX / 2) /* huge batch: keep the hot path per-CPU */
+static struct percpu_counter ra_stat[RA_PATTERN_ALL][RA_ACCOUNT_MAX];
+
+static inline void add_ra_stat(int i, int j, s64 amount)
+{
+ __percpu_counter_add(&ra_stat[i][j], amount, RA_STAT_BATCH);
+}
+
+static inline void inc_ra_stat(int i, int j)
+{
+ add_ra_stat(i, j, 1);
+}
+
+static void readahead_stats(struct address_space *mapping,
+ pgoff_t offset,
+ unsigned long req_size,
+ bool for_mmap,
+ bool for_metadata,
+ enum readahead_pattern pattern,
+ pgoff_t start,
+ unsigned long size,
+ unsigned long async_size,
+ int actual)
+{
+ pgoff_t eof = ((i_size_read(mapping->host)-1) >> PAGE_CACHE_SHIFT) + 1;
+
+ inc_ra_stat(pattern, RA_ACCOUNT_COUNT);
+ add_ra_stat(pattern, RA_ACCOUNT_SIZE, size);
+ add_ra_stat(pattern, RA_ACCOUNT_ASYNC_SIZE, async_size);
+ add_ra_stat(pattern, RA_ACCOUNT_ACTUAL, actual);
+
+ if (start + size >= eof)
+ inc_ra_stat(pattern, RA_ACCOUNT_EOF);
+ if (actual < size)
+ inc_ra_stat(pattern, RA_ACCOUNT_CACHE_HIT);
+
+ if (actual) {
+ inc_ra_stat(pattern, RA_ACCOUNT_IOCOUNT);
+
+ if (start <= offset && offset < start + size)
+ inc_ra_stat(pattern, RA_ACCOUNT_SYNC);
+
+ if (for_mmap)
+ inc_ra_stat(pattern, RA_ACCOUNT_MMAP);
+ if (for_metadata)
+ inc_ra_stat(pattern, RA_ACCOUNT_METADATA);
+ }
+}
+
+static void readahead_stats_reset(void)
+{
+ int i, j;
+
+ for (i = 0; i < RA_PATTERN_ALL; i++)
+ for (j = 0; j < RA_ACCOUNT_MAX; j++)
+ percpu_counter_set(&ra_stat[i][j], 0);
+}
+
+static void
+readahead_stats_sum(long long ra_stats[RA_PATTERN_MAX][RA_ACCOUNT_MAX])
+{
+ int i, j;
+
+ for (i = 0; i < RA_PATTERN_ALL; i++)
+ for (j = 0; j < RA_ACCOUNT_MAX; j++) {
+ s64 n = percpu_counter_sum(&ra_stat[i][j]);
+ ra_stats[i][j] += n;
+ ra_stats[RA_PATTERN_ALL][j] += n;
+ }
+}
+
+static int readahead_stats_show(struct seq_file *s, void *_)
+{
+ long long ra_stats[RA_PATTERN_MAX][RA_ACCOUNT_MAX];
+ int i;
+
+ seq_printf(s,
+ "%-10s %10s %10s %10s %10s %10s %10s %10s %10s %10s %10s\n",
+ "pattern", "readahead", "eof_hit", "cache_hit",
+ "io", "sync_io", "mmap_io", "meta_io",
+ "size", "async_size", "io_size");
+
+ memset(ra_stats, 0, sizeof(ra_stats));
+ readahead_stats_sum(ra_stats);
+
+ for (i = 0; i < RA_PATTERN_MAX; i++) {
+ unsigned long count = ra_stats[i][RA_ACCOUNT_COUNT];
+ unsigned long iocount = ra_stats[i][RA_ACCOUNT_IOCOUNT];
+ /*
+ * avoid division-by-zero
+ */
+ if (count == 0)
+ count = 1;
+ if (iocount == 0)
+ iocount = 1;
+
+ seq_printf(s, "%-10s %10lld %10lld %10lld %10lld %10lld "
+ "%10lld %10lld %10lld %10lld %10lld\n",
+ ra_pattern_names[i].name,
+ ra_stats[i][RA_ACCOUNT_COUNT],
+ ra_stats[i][RA_ACCOUNT_EOF],
+ ra_stats[i][RA_ACCOUNT_CACHE_HIT],
+ ra_stats[i][RA_ACCOUNT_IOCOUNT],
+ ra_stats[i][RA_ACCOUNT_SYNC],
+ ra_stats[i][RA_ACCOUNT_MMAP],
+ ra_stats[i][RA_ACCOUNT_METADATA],
+ ra_stats[i][RA_ACCOUNT_SIZE] / count,
+ ra_stats[i][RA_ACCOUNT_ASYNC_SIZE] / count,
+ ra_stats[i][RA_ACCOUNT_ACTUAL] / iocount);
+ }
+
+ return 0;
+}
+
+static int readahead_stats_open(struct inode *inode, struct file *file)
+{
+ return single_open(file, readahead_stats_show, NULL);
+}
+
+static ssize_t readahead_stats_write(struct file *file, const char __user *buf,
+ size_t size, loff_t *offset)
+{
+ readahead_stats_reset();
+ return size;
+}
+
+static const struct file_operations readahead_stats_fops = {
+ .owner = THIS_MODULE,
+ .open = readahead_stats_open,
+ .write = readahead_stats_write,
+ .read = seq_read,
+ .llseek = seq_lseek,
+ .release = single_release,
+};
+
+static int __init readahead_create_debugfs(void)
+{
+ struct dentry *root;
+ struct dentry *entry;
+ int i, j;
+
+ root = debugfs_create_dir("readahead", NULL);
+ if (!root)
+ goto out;
+
+ entry = debugfs_create_file("stats", 0644, root,
+ NULL, &readahead_stats_fops);
+ if (!entry)
+ goto out;
+
+ entry = debugfs_create_bool("stats_enable", 0644, root,
+ &readahead_stats_enable);
+ if (!entry)
+ goto out;
+
+ for (i = 0; i < RA_PATTERN_ALL; i++)
+ for (j = 0; j < RA_ACCOUNT_MAX; j++)
+ percpu_counter_init(&ra_stat[i][j], 0);
+
+ return 0;
+out:
+ debugfs_remove_recursive(root); /* tolerates a NULL root */
+ printk(KERN_ERR "readahead: failed to create debugfs entries\n");
+ return -ENOMEM;
+}
+
+late_initcall(readahead_create_debugfs);
+#endif
+
static inline void readahead_event(struct address_space *mapping,
pgoff_t offset,
unsigned long req_size,
@@ -44,6 +240,12 @@ static inline void readahead_event(struc
unsigned long async_size,
int actual)
{
+#ifdef CONFIG_READAHEAD_STATS
+ if (readahead_stats_enable)
+ readahead_stats(mapping, offset, req_size,
+ for_mmap, for_metadata,
+ pattern, start, size, async_size, actual);
+#endif
trace_readahead(mapping, offset, req_size,
pattern, start, size, async_size, actual);
}
--- linux-next.orig/mm/Kconfig 2011-12-23 20:28:06.000000000 +0800
+++ linux-next/mm/Kconfig 2011-12-23 20:29:31.000000000 +0800
@@ -396,3 +396,18 @@ config FRONTSWAP
and swap data is stored as normal on the matching swap device.

If unsure, say Y to enable frontswap.
+
+config READAHEAD_STATS
+ bool "Collect page cache readahead stats"
+ depends on DEBUG_FS
+ default y
+ help
+ This provides accounting facilities for page cache readahead events.
+
+ To do readahead accounting for a workload:
+
+ echo 1 > /sys/kernel/debug/readahead/stats_enable
+ echo 0 > /sys/kernel/debug/readahead/stats # reset counters
+ # run the workload
+ cat /sys/kernel/debug/readahead/stats # check counters
+ echo 0 > /sys/kernel/debug/readahead/stats_enable