[RFC 2/2] Track io_ticks at microsecond granularity.

From: Josh Snyder
Date: Tue Jun 09 2020 - 00:08:11 EST


Previously, we performed truncation of I/O issue/completion times during
calculation of io_ticks, counting only I/Os which cross a jiffy
boundary. The effect is a sampling of I/Os: at every boundary between
jiffies we ask "is there an outstanding I/O" and increment a counter if
the answer is yes. This produces results that are accurate (they don't
systematically over- or under-count), but not precise (there is high
variance associated with only taking 100 samples per second).

This change modifies the sampling rate from 100Hz to 976562.5Hz (1
sample per 1024 nanoseconds). I chose this sampling rate by simulating a
workload in which I/Os are issued randomly (by a Poisson process), and
processed in constant time: an M/D/∞ system (Kendall's notation). My
goal was to produce a sampled utilization fraction which was correct to
one part-per-thousand given one second of samples.

The tradeoff of the higher sampling rate is increased synchronization
overhead caused by more frequent compare-and-swap operations. The
technique of commit 5b18b5a73760 ("block: delete part_round_stats and
switch to less precise counting") is to allow multiple I/Os to complete
while performing only one synchronized operation. As we are increasing
the sample rate by a factor of roughly 10000, we will be able to exercise
the synchronization-free code path less frequently.

Included below is the Python script I used to perform the simulation. It
estimates the correct (calculated without sampling) value of %util, and
then reports the root-mean-squared error of the as-sampled estimates.
The parameters `io_rate`, `sample_rates`, and `avgqu_sz` are meant to be
tweaked to fit characteristics of a given workload. I have chosen to
simulate against a difficult workload: 1000 I/Os per second with an
average queue size of 0.01, implying that each I/O takes 10
microseconds. This I/O latency is on par with some of the fastest
production block devices available today, and an order of magnitude
faster than a typical datacenter-grade SSD. With this change, an
estimate of disk %util will not fluctuate as displayed by iostat with
four decimal places, at a refresh rate of 1 Hz.

#!/usr/bin/env python3
from math import log
from math import sqrt
from random import random

GIGA = 1_000_000_000
SECOND = GIGA  # one second of simulated time, expressed in nanoseconds


def times(interval, avgqu_sz, sample_rates):
    """Simulate one second of I/O and measure the error of sampled busy time.

    I/Os are issued with exponentially distributed gaps (mean ``interval``)
    and each one keeps the device busy for a fixed service time, truncated
    at the next issue so that busy periods never overlap.

    Args:
        interval: mean time between consecutive I/O issues, in nanoseconds.
        avgqu_sz: per-I/O service time in nanoseconds.  The name is
            historical: the caller in this script passes ``svctm`` here, not
            a queue size.
        sample_rates: sampling rates to evaluate, in samples per nanosecond.

    Returns:
        A tuple ``(correct, errors)``.  ``correct`` is the exact busy time in
        nanoseconds.  ``errors[i]`` is ``correct`` minus the busy time
        estimated by counting the sample boundaries of ``sample_rates[i]``
        crossed while an I/O was outstanding.
    """
    # Use the argument rather than a module-level global, so the function
    # honours what its caller passes in.
    service_time = avgqu_sz

    now = 0
    correct = 0
    est_counters = [0] * len(sample_rates)

    while now < SECOND:
        # random() lies in [0.0, 1.0); 1.0 - random() lies in (0.0, 1.0],
        # which keeps log() away from a zero argument.
        gap = -log(1.0 - random()) * interval
        busy = min(gap, service_time)
        finish_time = now + busy

        correct += busy
        for i, rate in enumerate(sample_rates):
            # Number of sample boundaries that fall inside [now, finish_time].
            est_counters[i] += int(finish_time * rate) - int(now * rate)

        now += gap

    return correct, [
        correct - (counter / rate)
        for counter, rate in zip(est_counters, sample_rates)
    ]

# How many I/Os per second?
io_rate = 1000
# How frequently are we sampling?  Simulated time is counted in nanoseconds,
# so each rate is expressed in samples per nanosecond (GHz).
sample_rates = [
    100 / GIGA,   # 100 Hz
    1000 / GIGA,  # 1000 Hz
    1 / 65536,    # 15259 Hz
    1 / 16384,    # 61035 Hz
    1 / 1024,     # 976563 Hz
    1 / 64,       # 15625000 Hz
]
# Fraction of each inter-arrival interval that an I/O spends in service.
avgqu_sz = 0.01

interval = SECOND / io_rate   # mean time between I/O issues, in nanoseconds
svctm = interval * avgqu_sz   # service time of a single I/O, in nanoseconds
total = 0                     # running sum of exact utilization; not printed
total_errors = [0] * len(sample_rates)  # sum of squared errors, per rate
count = 0                     # number of simulated seconds so far

# Runs until interrupted, printing one refined estimate per simulated second.
while True:
    correct, errors = times(interval, svctm, sample_rates)
    for i, error in enumerate(errors):
        total_errors[i] += error * error
    total += correct / SECOND
    count += 1

    # prints [{RMS error} for rate in sample_rates], as a percentage of one
    # second
    to_print = [
        "{:05.2f}".format(100 * sqrt(error / count) / SECOND)
        for error in total_errors
    ]
    print(' '.join(to_print))

Signed-off-by: Josh Snyder <joshs@xxxxxxxxxxx>
Fixes: 5b18b5a73760 ("block: delete part_round_stats and switch to less precise counting")
---
block/blk-core.c | 16 +++++++++++-----
block/genhd.c | 4 ++--
include/linux/genhd.h | 2 +-
include/linux/part_stat.h | 2 +-
4 files changed, 15 insertions(+), 9 deletions(-)

diff --git a/block/blk-core.c b/block/blk-core.c
index a0bbd9e099b9..2749c52d649c 100644
--- a/block/blk-core.c
+++ b/block/blk-core.c
@@ -62,6 +62,8 @@ EXPORT_TRACEPOINT_SYMBOL_GPL(block_unplug);

DEFINE_IDA(blk_queue_ida);

+#define IO_TICKS_COARSENESS 10
+
/*
* For queue allocation
*/
@@ -1396,10 +1398,14 @@ unsigned int blk_rq_err_bytes(const struct request *rq)
}
EXPORT_SYMBOL_GPL(blk_rq_err_bytes);

-static void update_io_ticks(struct hd_struct *part, unsigned long now, unsigned long start)
+static void update_io_ticks(struct hd_struct *part, u64 now, u64 start)
{
- unsigned long stamp;
- unsigned long elapsed;
+ u64 stamp;
+ u64 elapsed;
+
+ start &= ~((1<<IO_TICKS_COARSENESS) - 1);
+ now &= ~((1<<IO_TICKS_COARSENESS) - 1);
+
again:
stamp = READ_ONCE(part->stamp);
if (unlikely(stamp != now)) {
@@ -1447,7 +1453,7 @@ void blk_account_io_done(struct request *req, u64 now)
part_stat_lock();
part = req->part;

- update_io_ticks(part, jiffies, nsecs_to_jiffies(req->start_time_ns));
+ update_io_ticks(part, now, req->start_time_ns);
part_stat_inc(part, ios[sgrp]);
part_stat_add(part, nsecs[sgrp], now - req->start_time_ns);
part_stat_unlock();
@@ -1493,7 +1499,7 @@ void disk_end_io_acct(struct gendisk *disk, unsigned int op,
unsigned long duration = now - start_time;

part_stat_lock();
- update_io_ticks(part, now, start_time);
+ update_io_ticks(part, jiffies_to_nsecs(now), jiffies_to_nsecs(start_time));
part_stat_add(part, nsecs[sgrp], jiffies_to_nsecs(duration));
part_stat_local_dec(part, in_flight[op_is_write(op)]);
part_stat_unlock();
diff --git a/block/genhd.c b/block/genhd.c
index 1a7659327664..045cc9cd7a2c 100644
--- a/block/genhd.c
+++ b/block/genhd.c
@@ -1296,7 +1296,7 @@ ssize_t part_stat_show(struct device *dev,
(unsigned long long)stat.sectors[STAT_WRITE],
(unsigned int)div_u64(stat.nsecs[STAT_WRITE], NSEC_PER_MSEC),
inflight,
- jiffies_to_msecs(stat.io_ticks),
+ (unsigned int)div_u64(stat.io_ticks, NSEC_PER_MSEC),
(unsigned int)div_u64(stat.nsecs[STAT_READ] +
stat.nsecs[STAT_WRITE] +
stat.nsecs[STAT_DISCARD] +
@@ -1601,7 +1601,7 @@ static int diskstats_show(struct seq_file *seqf, void *v)
(unsigned int)div_u64(stat.nsecs[STAT_WRITE],
NSEC_PER_MSEC),
inflight,
- jiffies_to_msecs(stat.io_ticks),
+ (unsigned int)div_u64(stat.io_ticks, NSEC_PER_MSEC),
(unsigned int)div_u64(stat.nsecs[STAT_READ] +
stat.nsecs[STAT_WRITE] +
stat.nsecs[STAT_DISCARD] +
diff --git a/include/linux/genhd.h b/include/linux/genhd.h
index 392aad5e29a2..ce13f47a4674 100644
--- a/include/linux/genhd.h
+++ b/include/linux/genhd.h
@@ -62,7 +62,7 @@ struct hd_struct {
#if BITS_PER_LONG==32 && defined(CONFIG_SMP)
seqcount_t nr_sects_seq;
#endif
- unsigned long stamp;
+ u64 stamp;
struct disk_stats __percpu *dkstats;
struct percpu_ref ref;

diff --git a/include/linux/part_stat.h b/include/linux/part_stat.h
index 24125778ef3e..208904b2447d 100644
--- a/include/linux/part_stat.h
+++ b/include/linux/part_stat.h
@@ -9,7 +9,7 @@ struct disk_stats {
unsigned long sectors[NR_STAT_GROUPS];
unsigned long ios[NR_STAT_GROUPS];
unsigned long merges[NR_STAT_GROUPS];
- unsigned long io_ticks;
+ u64 io_ticks;
local_t in_flight[2];
};

--
2.25.1