Re: [PATCH 2/3] bcache: Convert to lib/time_stats

From: Coly Li
Date: Wed Aug 31 2022 - 12:00:30 EST




> 2022年8月30日 00:53,Kent Overstreet <kent.overstreet@xxxxxxxxx> 写道:
>
> This patch converts bcache to the new generic time_stats code
> lib/time_stats.c. The new code is from bcachefs, and has some changes
> from the version in bcache:
>
> - we now use ktime_get_ns(), not local_clock(). When the code was
> originally written multi processor systems that lacked synchronized
> TSCs were still common, and so local_clock() was much cheaper than
> sched_clock() (though not necessarily fully accurate, due to TSC
> drift). ktime_get_ns() should be cheap enough on all common hardware
> now, and more standard/correct.
>
> - time_stats are now exported in a single file in sysfs, which means we
> can improve the statistics we keep track of without changing all
> users. This also means we don't have to manually specify which units
> (ms, us, ns) a given time_stats should be printed in; that's handled
> dynamically.
>
> - There's a lazily-allocated percpu buffer, which now needs to be freed
> with time_stats_exit().
>
> Signed-off-by: Kent Overstreet <kent.overstreet@xxxxxxxxx>
> Cc: Coly Li <colyli@xxxxxxx>

Hi Kent,

Overall I am OK with the change to bcache code. You may add
Acked-by: Coly Li <colyli@xxxxxxx>
in future version of this patch.

In bcache-tools, they don’t read the changed sysfs files (including bcache-status), IMHO changing the output format won’t be problem for upstream.

My only question is, how to understand the time_stats_to_text() output format,
count: 3
rate: 0/sec
frequency: 4 sec
avg duration: 4 sec
max duration: 4 sec
quantiles (ns): 0 4288669120 4288669120 5360836048 5360836048 5360836048 5360836048 5360836048 5360836048 5360836048 5360836048 5360836048 5360836048 5360836048 5360836048

Fro the above output, what are “rate”, “frequence” and “quantiles” for?

Thanks.

Coly Li

> ---
> drivers/md/bcache/Kconfig | 1 +
> drivers/md/bcache/bcache.h | 1 +
> drivers/md/bcache/bset.c | 8 +++---
> drivers/md/bcache/bset.h | 1 +
> drivers/md/bcache/btree.c | 12 ++++----
> drivers/md/bcache/super.c | 3 ++
> drivers/md/bcache/sysfs.c | 43 ++++++++++++++++++++--------
> drivers/md/bcache/util.c | 30 --------------------
> drivers/md/bcache/util.h | 57 --------------------------------------
> 9 files changed, 47 insertions(+), 109 deletions(-)
>
> diff --git a/drivers/md/bcache/Kconfig b/drivers/md/bcache/Kconfig
> index 529c9d04e9..8d165052e5 100644
> --- a/drivers/md/bcache/Kconfig
> +++ b/drivers/md/bcache/Kconfig
> @@ -4,6 +4,7 @@ config BCACHE
> tristate "Block device as cache"
> select BLOCK_HOLDER_DEPRECATED if SYSFS
> select CRC64
> + select TIME_STATS
> help
> Allows a block device to be used as cache for other devices; uses
> a btree for indexing and the layout is optimized for SSDs.
> diff --git a/drivers/md/bcache/bcache.h b/drivers/md/bcache/bcache.h
> index 2acda9cea0..5100010a38 100644
> --- a/drivers/md/bcache/bcache.h
> +++ b/drivers/md/bcache/bcache.h
> @@ -185,6 +185,7 @@
> #include <linux/rbtree.h>
> #include <linux/rwsem.h>
> #include <linux/refcount.h>
> +#include <linux/time_stats.h>
> #include <linux/types.h>
> #include <linux/workqueue.h>
> #include <linux/kthread.h>
> diff --git a/drivers/md/bcache/bset.c b/drivers/md/bcache/bset.c
> index 94d38e8a59..727e9b7aea 100644
> --- a/drivers/md/bcache/bset.c
> +++ b/drivers/md/bcache/bset.c
> @@ -1251,7 +1251,7 @@ static void __btree_sort(struct btree_keys *b, struct btree_iter *iter,
> order = state->page_order;
> }
>
> - start_time = local_clock();
> + start_time = ktime_get_ns();
>
> btree_mergesort(b, out, iter, fixup, false);
> b->nsets = start;
> @@ -1286,7 +1286,7 @@ static void __btree_sort(struct btree_keys *b, struct btree_iter *iter,
> bch_bset_build_written_tree(b);
>
> if (!start)
> - bch_time_stats_update(&state->time, start_time);
> + time_stats_update(&state->time, start_time);
> }
>
> void bch_btree_sort_partial(struct btree_keys *b, unsigned int start,
> @@ -1322,14 +1322,14 @@ void bch_btree_sort_and_fix_extents(struct btree_keys *b,
> void bch_btree_sort_into(struct btree_keys *b, struct btree_keys *new,
> struct bset_sort_state *state)
> {
> - uint64_t start_time = local_clock();
> + uint64_t start_time = ktime_get_ns();
> struct btree_iter iter;
>
> bch_btree_iter_init(b, &iter, NULL);
>
> btree_mergesort(b, new->set->data, &iter, false, true);
>
> - bch_time_stats_update(&state->time, start_time);
> + time_stats_update(&state->time, start_time);
>
> new->set->size = 0; // XXX: why?
> }
> diff --git a/drivers/md/bcache/bset.h b/drivers/md/bcache/bset.h
> index d795c84246..13e524ad77 100644
> --- a/drivers/md/bcache/bset.h
> +++ b/drivers/md/bcache/bset.h
> @@ -3,6 +3,7 @@
> #define _BCACHE_BSET_H
>
> #include <linux/kernel.h>
> +#include <linux/time_stats.h>
> #include <linux/types.h>
>
> #include "bcache_ondisk.h"
> diff --git a/drivers/md/bcache/btree.c b/drivers/md/bcache/btree.c
> index 147c493a98..abf543bc75 100644
> --- a/drivers/md/bcache/btree.c
> +++ b/drivers/md/bcache/btree.c
> @@ -242,7 +242,7 @@ static void btree_node_read_endio(struct bio *bio)
>
> static void bch_btree_node_read(struct btree *b)
> {
> - uint64_t start_time = local_clock();
> + uint64_t start_time = ktime_get_ns();
> struct closure cl;
> struct bio *bio;
>
> @@ -270,7 +270,7 @@ static void bch_btree_node_read(struct btree *b)
> goto err;
>
> bch_btree_node_read_done(b);
> - bch_time_stats_update(&b->c->btree_read_time, start_time);
> + time_stats_update(&b->c->btree_read_time, start_time);
>
> return;
> err:
> @@ -1789,7 +1789,7 @@ static void bch_btree_gc(struct cache_set *c)
> struct gc_stat stats;
> struct closure writes;
> struct btree_op op;
> - uint64_t start_time = local_clock();
> + uint64_t start_time = ktime_get_ns();
>
> trace_bcache_gc_start(c);
>
> @@ -1815,7 +1815,7 @@ static void bch_btree_gc(struct cache_set *c)
> bch_btree_gc_finish(c);
> wake_up_allocators(c);
>
> - bch_time_stats_update(&c->btree_gc_time, start_time);
> + time_stats_update(&c->btree_gc_time, start_time);
>
> stats.key_bytes *= sizeof(uint64_t);
> stats.data <<= 9;
> @@ -2191,7 +2191,7 @@ static int btree_split(struct btree *b, struct btree_op *op,
> {
> bool split;
> struct btree *n1, *n2 = NULL, *n3 = NULL;
> - uint64_t start_time = local_clock();
> + uint64_t start_time = ktime_get_ns();
> struct closure cl;
> struct keylist parent_keys;
>
> @@ -2297,7 +2297,7 @@ static int btree_split(struct btree *b, struct btree_op *op,
> btree_node_free(b);
> rw_unlock(true, n1);
>
> - bch_time_stats_update(&b->c->btree_split_time, start_time);
> + time_stats_update(&b->c->btree_split_time, start_time);
>
> return 0;
> err_free2:
> diff --git a/drivers/md/bcache/super.c b/drivers/md/bcache/super.c
> index ba3909bb6b..26c8fa93b5 100644
> --- a/drivers/md/bcache/super.c
> +++ b/drivers/md/bcache/super.c
> @@ -1691,6 +1691,9 @@ static void cache_set_free(struct closure *cl)
> kobject_put(&ca->kobj);
> }
>
> + time_stats_exit(&c->btree_gc_time);
> + time_stats_exit(&c->btree_split_time);
> + time_stats_exit(&c->sort.time);
>
> if (c->moving_gc_wq)
> destroy_workqueue(c->moving_gc_wq);
> diff --git a/drivers/md/bcache/sysfs.c b/drivers/md/bcache/sysfs.c
> index c6f6770592..01eec5877c 100644
> --- a/drivers/md/bcache/sysfs.c
> +++ b/drivers/md/bcache/sysfs.c
> @@ -16,6 +16,7 @@
> #include <linux/blkdev.h>
> #include <linux/sort.h>
> #include <linux/sched/clock.h>
> +#include <linux/seq_buf.h>
>
> extern bool bcache_is_reboot;
>
> @@ -79,10 +80,10 @@ read_attribute(active_journal_entries);
> read_attribute(backing_dev_name);
> read_attribute(backing_dev_uuid);
>
> -sysfs_time_stats_attribute(btree_gc, sec, ms);
> -sysfs_time_stats_attribute(btree_split, sec, us);
> -sysfs_time_stats_attribute(btree_sort, ms, us);
> -sysfs_time_stats_attribute(btree_read, ms, us);
> +read_attribute(btree_gc_time);
> +read_attribute(btree_split_time);
> +read_attribute(btree_sort_time);
> +read_attribute(btree_read_time);
>
> read_attribute(btree_nodes);
> read_attribute(btree_used_percent);
> @@ -731,6 +732,9 @@ static unsigned int bch_average_key_size(struct cache_set *c)
> SHOW(__bch_cache_set)
> {
> struct cache_set *c = container_of(kobj, struct cache_set, kobj);
> + struct seq_buf s;
> +
> + seq_buf_init(&s, buf, PAGE_SIZE);
>
> sysfs_print(synchronous, CACHE_SYNC(&c->cache->sb));
> sysfs_print(journal_delay_ms, c->journal_delay_ms);
> @@ -743,10 +747,25 @@ SHOW(__bch_cache_set)
> sysfs_print(btree_cache_max_chain, bch_cache_max_chain(c));
> sysfs_print(cache_available_percent, 100 - c->gc_stats.in_use);
>
> - sysfs_print_time_stats(&c->btree_gc_time, btree_gc, sec, ms);
> - sysfs_print_time_stats(&c->btree_split_time, btree_split, sec, us);
> - sysfs_print_time_stats(&c->sort.time, btree_sort, ms, us);
> - sysfs_print_time_stats(&c->btree_read_time, btree_read, ms, us);
> + if (attr == &sysfs_btree_gc_time) {
> + time_stats_to_text(&s, &c->btree_gc_time);
> + return s.len;
> + }
> +
> + if (attr == &sysfs_btree_split_time) {
> + time_stats_to_text(&s, &c->btree_split_time);
> + return s.len;
> + }
> +
> + if (attr == &sysfs_btree_sort_time) {
> + time_stats_to_text(&s, &c->sort.time);
> + return s.len;
> + }
> +
> + if (attr == &sysfs_btree_read_time) {
> + time_stats_to_text(&s, &c->btree_read_time);
> + return s.len;
> + }
>
> sysfs_print(btree_used_percent, bch_btree_used(c));
> sysfs_print(btree_nodes, c->gc_stats.nodes);
> @@ -988,10 +1007,10 @@ KTYPE(bch_cache_set);
> static struct attribute *bch_cache_set_internal_attrs[] = {
> &sysfs_active_journal_entries,
>
> - sysfs_time_stats_attribute_list(btree_gc, sec, ms)
> - sysfs_time_stats_attribute_list(btree_split, sec, us)
> - sysfs_time_stats_attribute_list(btree_sort, ms, us)
> - sysfs_time_stats_attribute_list(btree_read, ms, us)
> + &sysfs_btree_gc_time,
> + &sysfs_btree_split_time,
> + &sysfs_btree_sort_time,
> + &sysfs_btree_read_time,
>
> &sysfs_btree_nodes,
> &sysfs_btree_used_percent,
> diff --git a/drivers/md/bcache/util.c b/drivers/md/bcache/util.c
> index ae380bc399..95282bf0f9 100644
> --- a/drivers/md/bcache/util.c
> +++ b/drivers/md/bcache/util.c
> @@ -160,36 +160,6 @@ int bch_parse_uuid(const char *s, char *uuid)
> return i;
> }
>
> -void bch_time_stats_update(struct time_stats *stats, uint64_t start_time)
> -{
> - uint64_t now, duration, last;
> -
> - spin_lock(&stats->lock);
> -
> - now = local_clock();
> - duration = time_after64(now, start_time)
> - ? now - start_time : 0;
> - last = time_after64(now, stats->last)
> - ? now - stats->last : 0;
> -
> - stats->max_duration = max(stats->max_duration, duration);
> -
> - if (stats->last) {
> - ewma_add(stats->average_duration, duration, 8, 8);
> -
> - if (stats->average_frequency)
> - ewma_add(stats->average_frequency, last, 8, 8);
> - else
> - stats->average_frequency = last << 8;
> - } else {
> - stats->average_duration = duration << 8;
> - }
> -
> - stats->last = now ?: 1;
> -
> - spin_unlock(&stats->lock);
> -}
> -
> /**
> * bch_next_delay() - update ratelimiting statistics and calculate next delay
> * @d: the struct bch_ratelimit to update
> diff --git a/drivers/md/bcache/util.h b/drivers/md/bcache/util.h
> index 6f3cb7c921..1e1bdbae95 100644
> --- a/drivers/md/bcache/util.h
> +++ b/drivers/md/bcache/util.h
> @@ -345,68 +345,11 @@ ssize_t bch_hprint(char *buf, int64_t v);
> bool bch_is_zero(const char *p, size_t n);
> int bch_parse_uuid(const char *s, char *uuid);
>
> -struct time_stats {
> - spinlock_t lock;
> - /*
> - * all fields are in nanoseconds, averages are ewmas stored left shifted
> - * by 8
> - */
> - uint64_t max_duration;
> - uint64_t average_duration;
> - uint64_t average_frequency;
> - uint64_t last;
> -};
> -
> -void bch_time_stats_update(struct time_stats *stats, uint64_t time);
> -
> static inline unsigned int local_clock_us(void)
> {
> return local_clock() >> 10;
> }
>
> -#define NSEC_PER_ns 1L
> -#define NSEC_PER_us NSEC_PER_USEC
> -#define NSEC_PER_ms NSEC_PER_MSEC
> -#define NSEC_PER_sec NSEC_PER_SEC
> -
> -#define __print_time_stat(stats, name, stat, units) \
> - sysfs_print(name ## _ ## stat ## _ ## units, \
> - div_u64((stats)->stat >> 8, NSEC_PER_ ## units))
> -
> -#define sysfs_print_time_stats(stats, name, \
> - frequency_units, \
> - duration_units) \
> -do { \
> - __print_time_stat(stats, name, \
> - average_frequency, frequency_units); \
> - __print_time_stat(stats, name, \
> - average_duration, duration_units); \
> - sysfs_print(name ## _ ##max_duration ## _ ## duration_units, \
> - div_u64((stats)->max_duration, \
> - NSEC_PER_ ## duration_units)); \
> - \
> - sysfs_print(name ## _last_ ## frequency_units, (stats)->last \
> - ? div_s64(local_clock() - (stats)->last, \
> - NSEC_PER_ ## frequency_units) \
> - : -1LL); \
> -} while (0)
> -
> -#define sysfs_time_stats_attribute(name, \
> - frequency_units, \
> - duration_units) \
> -read_attribute(name ## _average_frequency_ ## frequency_units); \
> -read_attribute(name ## _average_duration_ ## duration_units); \
> -read_attribute(name ## _max_duration_ ## duration_units); \
> -read_attribute(name ## _last_ ## frequency_units)
> -
> -#define sysfs_time_stats_attribute_list(name, \
> - frequency_units, \
> - duration_units) \
> -&sysfs_ ## name ## _average_frequency_ ## frequency_units, \
> -&sysfs_ ## name ## _average_duration_ ## duration_units, \
> -&sysfs_ ## name ## _max_duration_ ## duration_units, \
> -&sysfs_ ## name ## _last_ ## frequency_units,
> -
> #define ewma_add(ewma, val, weight, factor) \
> ({ \
> (ewma) *= (weight) - 1; \
> --
> 2.36.1
>