[PATCH 3/5] writeback: dirty rate control

From: Wu Fengguang
Date: Sat Aug 06 2011 - 08:21:25 EST


It's all about bdi->dirty_ratelimit, which aims to be (write_bw / N)
when there are N dd tasks.

On write() syscall, use bdi->dirty_ratelimit
============================================

balance_dirty_pages(pages_dirtied)
{
pos_bw = bdi->dirty_ratelimit * bdi_position_ratio();
pause = pages_dirtied / pos_bw;
sleep(pause);
}

On every 200ms, update bdi->dirty_ratelimit
===========================================

bdi_update_dirty_ratelimit()
{
bw = bdi->dirty_ratelimit;
ref_bw = bw * bdi_position_ratio() * write_bw / dirty_bw;
if (dirty pages unbalanced)
bdi->dirty_ratelimit = (bw * 3 + ref_bw) / 4;
}

Estimation of balanced bdi->dirty_ratelimit
===========================================

When N dd tasks are started, throttle each dd at

task_ratelimit = pos_bw (any non-zero initial value is OK)

After 200ms, we get

dirty_bw = # of pages dirtied by app / 200ms
write_bw = # of pages written to disk / 200ms

For aggressive dirtiers, the equality holds

dirty_bw == N * task_ratelimit
== N * pos_bw (1)

The balanced throttle bandwidth can be estimated by

ref_bw = pos_bw * write_bw / dirty_bw (2)

From (1) and (2), we get the equality

ref_bw == write_bw / N (3)

If the N dd's are all throttled at ref_bw, the dirty/writeback rates
will match. So ref_bw is the balanced dirty rate.

In practice, the ref_bw calculated by (2) may fluctuate and have
estimation errors. So the bdi->dirty_ratelimit update policy is to
follow it only when both pos_bw and ref_bw point in the same direction
(indicating not only that the dirty position has deviated from the
global/bdi setpoints, but also that it is still moving further away).

Signed-off-by: Wu Fengguang <fengguang.wu@xxxxxxxxx>
---
include/linux/backing-dev.h | 7 +++
mm/backing-dev.c | 1
mm/page-writeback.c | 69 +++++++++++++++++++++++++++++++++-
3 files changed, 75 insertions(+), 2 deletions(-)

--- linux-next.orig/include/linux/backing-dev.h 2011-08-05 18:05:36.000000000 +0800
+++ linux-next/include/linux/backing-dev.h 2011-08-05 18:05:36.000000000 +0800
@@ -75,10 +75,17 @@ struct backing_dev_info {
struct percpu_counter bdi_stat[NR_BDI_STAT_ITEMS];

unsigned long bw_time_stamp; /* last time write bw is updated */
+ unsigned long dirtied_stamp;
unsigned long written_stamp; /* pages written at bw_time_stamp */
unsigned long write_bandwidth; /* the estimated write bandwidth */
unsigned long avg_write_bandwidth; /* further smoothed write bw */

+ /*
+ * The base throttle bandwidth, re-calculated on every 200ms.
+ * All the bdi tasks' dirty rate will be curbed under it.
+ */
+ unsigned long dirty_ratelimit;
+
struct prop_local_percpu completions;
int dirty_exceeded;

--- linux-next.orig/mm/backing-dev.c 2011-08-05 18:05:36.000000000 +0800
+++ linux-next/mm/backing-dev.c 2011-08-05 18:05:36.000000000 +0800
@@ -674,6 +674,7 @@ int bdi_init(struct backing_dev_info *bd
bdi->bw_time_stamp = jiffies;
bdi->written_stamp = 0;

+ bdi->dirty_ratelimit = INIT_BW;
bdi->write_bandwidth = INIT_BW;
bdi->avg_write_bandwidth = INIT_BW;

--- linux-next.orig/mm/page-writeback.c 2011-08-05 18:05:36.000000000 +0800
+++ linux-next/mm/page-writeback.c 2011-08-06 09:08:35.000000000 +0800
@@ -736,6 +736,66 @@ static void global_update_bandwidth(unsi
spin_unlock(&dirty_lock);
}

+/*
+ * Update bdi->dirty_ratelimit, the base throttle bandwidth of this bdi.
+ *
+ * Normal bdi tasks will be curbed at or below it in the long term.
+ * It should settle around (write_bw / N) when there are N dd tasks.
+ */
+static void bdi_update_dirty_ratelimit(struct backing_dev_info *bdi,
+ unsigned long thresh,
+ unsigned long dirty,
+ unsigned long bdi_thresh,
+ unsigned long bdi_dirty,
+ unsigned long dirtied,
+ unsigned long elapsed)
+{
+ unsigned long bw = bdi->dirty_ratelimit;
+ unsigned long dirty_bw;
+ unsigned long pos_bw;
+ unsigned long ref_bw;
+ unsigned long long pos_ratio;
+
+ /*
+ * The dirty rate will match the writeback rate in the long term, except
+ * when dirty pages are truncated by userspace or re-dirtied by the FS.
+ */
+ dirty_bw = (dirtied - bdi->dirtied_stamp) * HZ / elapsed;
+
+ pos_ratio = bdi_position_ratio(bdi, thresh, dirty,
+ bdi_thresh, bdi_dirty);
+ /*
+ * pos_bw reflects each dd's dirty rate enforced over the past 200ms.
+ */
+ pos_bw = bw * pos_ratio >> BANDWIDTH_CALC_SHIFT;
+ pos_bw++; /* keeps bdi->dirty_ratelimit from getting stuck at 0 */
+
+ /*
+ * ref_bw = pos_bw * write_bw / dirty_bw
+ * (the divisor used is dirty_bw | 1, so it can never be zero)
+ * It's a linear estimate of the "balanced" throttle bandwidth.
+ */
+ pos_ratio *= bdi->avg_write_bandwidth;
+ do_div(pos_ratio, dirty_bw | 1);
+ ref_bw = bw * pos_ratio >> BANDWIDTH_CALC_SHIFT;
+
+ /*
+ * dirty_ratelimit follows ref_bw/pos_bw conservatively, and only if
+ * both lie on the same side of dirty_ratelimit. This not only makes
+ * it more stable, but is also essential to keep it from being driven
+ * away by possible systematic errors in ref_bw.
+ */
+ if (pos_bw < bw) {
+ if (ref_bw < bw)
+ bw = max(ref_bw, pos_bw);
+ } else {
+ if (ref_bw > bw)
+ bw = min(ref_bw, pos_bw);
+ }
+
+ bdi->dirty_ratelimit = bw;
+}
+
void __bdi_update_bandwidth(struct backing_dev_info *bdi,
unsigned long thresh,
unsigned long dirty,
@@ -745,6 +805,7 @@ void __bdi_update_bandwidth(struct backi
{
unsigned long now = jiffies;
unsigned long elapsed = now - bdi->bw_time_stamp;
+ unsigned long dirtied;
unsigned long written;

/*
@@ -753,6 +814,7 @@ void __bdi_update_bandwidth(struct backi
if (elapsed < BANDWIDTH_INTERVAL)
return;

+ dirtied = percpu_counter_read(&bdi->bdi_stat[BDI_DIRTIED]);
written = percpu_counter_read(&bdi->bdi_stat[BDI_WRITTEN]);

/*
@@ -762,12 +824,15 @@ void __bdi_update_bandwidth(struct backi
if (elapsed > HZ && time_before(bdi->bw_time_stamp, start_time))
goto snapshot;

- if (thresh)
+ if (thresh) {
global_update_bandwidth(thresh, dirty, now);
-
+ bdi_update_dirty_ratelimit(bdi, thresh, dirty, bdi_thresh,
+ bdi_dirty, dirtied, elapsed);
+ }
bdi_update_write_bandwidth(bdi, elapsed, written);

snapshot:
+ bdi->dirtied_stamp = dirtied;
bdi->written_stamp = written;
bdi->bw_time_stamp = now;
}


--
To unsubscribe from this list: send the line "unsubscribe linux-kernel" in
the body of a message to majordomo@xxxxxxxxxxxxxxx
More majordomo info at http://vger.kernel.org/majordomo-info.html
Please read the FAQ at http://www.tux.org/lkml/