[PATCH 006 of 6] md: Add support for reshape of a raid6

From: NeilBrown
Date: Tue Feb 20 2007 - 01:37:54 EST



i.e. one or more drives can be added and the array will re-stripe
while on-line.
Most of the interesting work was already done for raid5.
This just extends it to raid6.

mdadm newer than 2.6 is needed for complete safety, however any
version of mdadm which support raid5 reshape will do a good enough job
in almost all cases (an 'echo repair > /sys/block/mdX/md/sync_action'
is recommended after a reshape that was aborted and had to be
restarted with an such a version of mdadm).

Signed-off-by: Neil Brown <neilb@xxxxxxx>

### Diffstat output
./drivers/md/raid5.c | 157 ++++++++++++++++++++++++++++++++++++++++-----------
1 file changed, 124 insertions(+), 33 deletions(-)

diff .prev/drivers/md/raid5.c ./drivers/md/raid5.c
--- .prev/drivers/md/raid5.c 2007-02-20 17:14:35.000000000 +1100
+++ ./drivers/md/raid5.c 2007-02-20 17:14:48.000000000 +1100
@@ -1050,7 +1050,7 @@ static void compute_parity5(struct strip
static void compute_parity6(struct stripe_head *sh, int method)
{
raid6_conf_t *conf = sh->raid_conf;
- int i, pd_idx = sh->pd_idx, qd_idx, d0_idx, disks = conf->raid_disks, count;
+ int i, pd_idx = sh->pd_idx, qd_idx, d0_idx, disks = sh->disks, count;
struct bio *chosen;
/**** FIX THIS: This could be very bad if disks is close to 256 ****/
void *ptrs[disks];
@@ -1131,8 +1131,7 @@ static void compute_parity6(struct strip
/* Compute one missing block */
static void compute_block_1(struct stripe_head *sh, int dd_idx, int nozero)
{
- raid6_conf_t *conf = sh->raid_conf;
- int i, count, disks = conf->raid_disks;
+ int i, count, disks = sh->disks;
void *ptr[MAX_XOR_BLOCKS], *p;
int pd_idx = sh->pd_idx;
int qd_idx = raid6_next_disk(pd_idx, disks);
@@ -1170,8 +1169,7 @@ static void compute_block_1(struct strip
/* Compute two missing blocks */
static void compute_block_2(struct stripe_head *sh, int dd_idx1, int dd_idx2)
{
- raid6_conf_t *conf = sh->raid_conf;
- int i, count, disks = conf->raid_disks;
+ int i, count, disks = sh->disks;
int pd_idx = sh->pd_idx;
int qd_idx = raid6_next_disk(pd_idx, disks);
int d0_idx = raid6_next_disk(qd_idx, disks);
@@ -1887,11 +1885,11 @@ static void handle_stripe5(struct stripe
static void handle_stripe6(struct stripe_head *sh, struct page *tmp_page)
{
raid6_conf_t *conf = sh->raid_conf;
- int disks = conf->raid_disks;
+ int disks = sh->disks;
struct bio *return_bi= NULL;
struct bio *bi;
int i;
- int syncing;
+ int syncing, expanding, expanded;
int locked=0, uptodate=0, to_read=0, to_write=0, failed=0, written=0;
int non_overwrite = 0;
int failed_num[2] = {0, 0};
@@ -1909,6 +1907,8 @@ static void handle_stripe6(struct stripe
clear_bit(STRIPE_DELAYED, &sh->state);

syncing = test_bit(STRIPE_SYNCING, &sh->state);
+ expanding = test_bit(STRIPE_EXPAND_SOURCE, &sh->state);
+ expanded = test_bit(STRIPE_EXPAND_READY, &sh->state);
/* Now to look around and see what can be done */

rcu_read_lock();
@@ -2114,13 +2114,15 @@ static void handle_stripe6(struct stripe
* parity, or to satisfy requests
* or to load a block that is being partially written.
*/
- if (to_read || non_overwrite || (to_write && failed) || (syncing && (uptodate < disks))) {
+ if (to_read || non_overwrite || (to_write && failed) ||
+ (syncing && (uptodate < disks)) || expanding) {
for (i=disks; i--;) {
dev = &sh->dev[i];
if (!test_bit(R5_LOCKED, &dev->flags) && !test_bit(R5_UPTODATE, &dev->flags) &&
(dev->toread ||
(dev->towrite && !test_bit(R5_OVERWRITE, &dev->flags)) ||
syncing ||
+ expanding ||
(failed >= 1 && (sh->dev[failed_num[0]].toread || to_write)) ||
(failed >= 2 && (sh->dev[failed_num[1]].toread || to_write))
)
@@ -2355,6 +2357,79 @@ static void handle_stripe6(struct stripe
}
}
}
+
+ if (expanded && test_bit(STRIPE_EXPANDING, &sh->state)) {
+ /* Need to write out all blocks after computing P&Q */
+ sh->disks = conf->raid_disks;
+ sh->pd_idx = stripe_to_pdidx(sh->sector, conf,
+ conf->raid_disks);
+ compute_parity6(sh, RECONSTRUCT_WRITE);
+ for (i = conf->raid_disks ; i-- ; ) {
+ set_bit(R5_LOCKED, &sh->dev[i].flags);
+ locked++;
+ set_bit(R5_Wantwrite, &sh->dev[i].flags);
+ }
+ clear_bit(STRIPE_EXPANDING, &sh->state);
+ } else if (expanded) {
+ clear_bit(STRIPE_EXPAND_READY, &sh->state);
+ atomic_dec(&conf->reshape_stripes);
+ wake_up(&conf->wait_for_overlap);
+ md_done_sync(conf->mddev, STRIPE_SECTORS, 1);
+ }
+
+ if (expanding && locked == 0) {
+ /* We have read all the blocks in this stripe and now we need to
+ * copy some of them into a target stripe for expand.
+ */
+ clear_bit(STRIPE_EXPAND_SOURCE, &sh->state);
+ for (i = 0; i < sh->disks ; i++)
+ if (i != pd_idx && i != qd_idx) {
+ int dd_idx2, pd_idx2, j;
+ struct stripe_head *sh2;
+
+ sector_t bn = compute_blocknr(sh, i);
+ sector_t s = raid5_compute_sector(
+ bn, conf->raid_disks,
+ conf->raid_disks - conf->max_degraded,
+ &dd_idx2, &pd_idx2, conf);
+ sh2 = get_active_stripe(conf, s,
+ conf->raid_disks,
+ pd_idx2, 1);
+ if (sh2 == NULL)
+ /* so for only the early blocks of
+ * this stripe have been requests.
+ * When later blocks get requests, we
+ * will try again
+ */
+ continue;
+ if (!test_bit(STRIPE_EXPANDING, &sh2->state) ||
+ test_bit(R5_Expanded,
+ &sh2->dev[dd_idx2].flags)) {
+ /* must have already done this block */
+ release_stripe(sh2);
+ continue;
+ }
+ memcpy(page_address(sh2->dev[dd_idx2].page),
+ page_address(sh->dev[i].page),
+ STRIPE_SIZE);
+ set_bit(R5_Expanded, &sh2->dev[dd_idx2].flags);
+ set_bit(R5_UPTODATE, &sh2->dev[dd_idx2].flags);
+ for (j = 0 ; j < conf->raid_disks ; j++)
+ if (j != sh2->pd_idx &&
+ j != raid6_next_disk(sh2->pd_idx,
+ sh2->disks) &&
+ !test_bit(R5_Expanded,
+ &sh2->dev[j].flags))
+ break;
+ if (j == conf->raid_disks) {
+ set_bit(STRIPE_EXPAND_READY,
+ &sh2->state);
+ set_bit(STRIPE_HANDLE, &sh2->state);
+ }
+ release_stripe(sh2);
+ }
+ }
+
spin_unlock(&sh->lock);

while ((bi=return_bi)) {
@@ -2395,7 +2470,7 @@ static void handle_stripe6(struct stripe
rcu_read_unlock();

if (rdev) {
- if (syncing)
+ if (syncing || expanding || expanded)
md_sync_acct(rdev->bdev, STRIPE_SECTORS);

bi->bi_bdev = rdev->bdev;
@@ -2915,8 +2990,9 @@ static sector_t reshape_request(mddev_t
struct stripe_head *sh;
int pd_idx;
sector_t first_sector, last_sector;
- int raid_disks;
- int data_disks;
+ int raid_disks = conf->previous_raid_disks;
+ int data_disks = raid_disks - conf->max_degraded;
+ int new_data_disks = conf->raid_disks - conf->max_degraded;
int i;
int dd_idx;
sector_t writepos, safepos, gap;
@@ -2925,7 +3001,7 @@ static sector_t reshape_request(mddev_t
conf->expand_progress != 0) {
/* restarting in the middle, skip the initial sectors */
sector_nr = conf->expand_progress;
- sector_div(sector_nr, conf->raid_disks-1);
+ sector_div(sector_nr, new_data_disks);
*skipped = 1;
return sector_nr;
}
@@ -2939,14 +3015,14 @@ static sector_t reshape_request(mddev_t
* to after where expand_lo old_maps to
*/
writepos = conf->expand_progress +
- conf->chunk_size/512*(conf->raid_disks-1);
- sector_div(writepos, conf->raid_disks-1);
+ conf->chunk_size/512*(new_data_disks);
+ sector_div(writepos, new_data_disks);
safepos = conf->expand_lo;
- sector_div(safepos, conf->previous_raid_disks-1);
+ sector_div(safepos, data_disks);
gap = conf->expand_progress - conf->expand_lo;

if (writepos >= safepos ||
- gap > (conf->raid_disks-1)*3000*2 /*3Meg*/) {
+ gap > (new_data_disks)*3000*2 /*3Meg*/) {
/* Cannot proceed until we've updated the superblock... */
wait_event(conf->wait_for_overlap,
atomic_read(&conf->reshape_stripes)==0);
@@ -2976,6 +3052,9 @@ static sector_t reshape_request(mddev_t
sector_t s;
if (j == sh->pd_idx)
continue;
+ if (conf->level == 6 &&
+ j == raid6_next_disk(sh->pd_idx, sh->disks))
+ continue;
s = compute_blocknr(sh, j);
if (s < (mddev->array_size<<1)) {
skipped = 1;
@@ -2999,21 +3078,20 @@ static sector_t reshape_request(mddev_t
* The source stripes are determined by mapping the first and last
* block on the destination stripes.
*/
- raid_disks = conf->previous_raid_disks;
- data_disks = raid_disks - 1;
first_sector =
- raid5_compute_sector(sector_nr*(conf->raid_disks-1),
+ raid5_compute_sector(sector_nr*(new_data_disks),
raid_disks, data_disks,
&dd_idx, &pd_idx, conf);
last_sector =
raid5_compute_sector((sector_nr+conf->chunk_size/512)
- *(conf->raid_disks-1) -1,
+ *(new_data_disks) -1,
raid_disks, data_disks,
&dd_idx, &pd_idx, conf);
if (last_sector >= (mddev->size<<1))
last_sector = (mddev->size<<1)-1;
while (first_sector <= last_sector) {
- pd_idx = stripe_to_pdidx(first_sector, conf, conf->previous_raid_disks);
+ pd_idx = stripe_to_pdidx(first_sector, conf,
+ conf->previous_raid_disks);
sh = get_active_stripe(conf, first_sector,
conf->previous_raid_disks, pd_idx, 0);
set_bit(STRIPE_EXPAND_SOURCE, &sh->state);
@@ -3348,35 +3426,44 @@ static int run(mddev_t *mddev)
*/
sector_t here_new, here_old;
int old_disks;
+ int max_degraded = (mddev->level == 5 ? 1 : 2);

if (mddev->new_level != mddev->level ||
mddev->new_layout != mddev->layout ||
mddev->new_chunk != mddev->chunk_size) {
- printk(KERN_ERR "raid5: %s: unsupported reshape required - aborting.\n",
+ printk(KERN_ERR "raid5: %s: unsupported reshape "
+ "required - aborting.\n",
mdname(mddev));
return -EINVAL;
}
if (mddev->delta_disks <= 0) {
- printk(KERN_ERR "raid5: %s: unsupported reshape (reduce disks) required - aborting.\n",
+ printk(KERN_ERR "raid5: %s: unsupported reshape "
+ "(reduce disks) required - aborting.\n",
mdname(mddev));
return -EINVAL;
}
old_disks = mddev->raid_disks - mddev->delta_disks;
/* reshape_position must be on a new-stripe boundary, and one
- * further up in new geometry must map after here in old geometry.
+ * further up in new geometry must map after here in old
+ * geometry.
*/
here_new = mddev->reshape_position;
- if (sector_div(here_new, (mddev->chunk_size>>9)*(mddev->raid_disks-1))) {
- printk(KERN_ERR "raid5: reshape_position not on a stripe boundary\n");
+ if (sector_div(here_new, (mddev->chunk_size>>9)*
+ (mddev->raid_disks - max_degraded))) {
+ printk(KERN_ERR "raid5: reshape_position not "
+ "on a stripe boundary\n");
return -EINVAL;
}
/* here_new is the stripe we will write to */
here_old = mddev->reshape_position;
- sector_div(here_old, (mddev->chunk_size>>9)*(old_disks-1));
- /* here_old is the first stripe that we might need to read from */
+ sector_div(here_old, (mddev->chunk_size>>9)*
+ (old_disks-max_degraded));
+ /* here_old is the first stripe that we might need to read
+ * from */
if (here_new >= here_old) {
/* Reading from the same stripe as writing to - bad */
- printk(KERN_ERR "raid5: reshape_position too early for auto-recovery - aborting.\n");
+ printk(KERN_ERR "raid5: reshape_position too early for "
+ "auto-recovery - aborting.\n");
return -EINVAL;
}
printk(KERN_INFO "raid5: reshape will continue\n");
@@ -3829,8 +3916,7 @@ static int raid5_start_reshape(mddev_t *
int added_devices = 0;
unsigned long flags;

- if (mddev->degraded ||
- test_bit(MD_RECOVERY_RUNNING, &mddev->recovery))
+ if (test_bit(MD_RECOVERY_RUNNING, &mddev->recovery))
return -EBUSY;

ITERATE_RDEV(mddev, rdev, rtmp)
@@ -3838,7 +3924,7 @@ static int raid5_start_reshape(mddev_t *
!test_bit(Faulty, &rdev->flags))
spares++;

- if (spares < mddev->delta_disks-1)
+ if (spares - mddev->degraded < mddev->delta_disks - conf->max_degraded)
/* Not enough devices even to make a degraded array
* of that size
*/
@@ -3901,7 +3987,8 @@ static void end_reshape(raid5_conf_t *co
struct block_device *bdev;

if (!test_bit(MD_RECOVERY_INTR, &conf->mddev->recovery)) {
- conf->mddev->array_size = conf->mddev->size * (conf->raid_disks-1);
+ conf->mddev->array_size = conf->mddev->size *
+ (conf->raid_disks - conf->max_degraded);
set_capacity(conf->mddev->gendisk, conf->mddev->array_size << 1);
conf->mddev->changed = 1;

@@ -3974,6 +4061,10 @@ static struct mdk_personality raid6_pers
.spare_active = raid5_spare_active,
.sync_request = sync_request,
.resize = raid5_resize,
+#ifdef CONFIG_MD_RAID5_RESHAPE
+ .check_reshape = raid5_check_reshape,
+ .start_reshape = raid5_start_reshape,
+#endif
.quiesce = raid5_quiesce,
};
static struct mdk_personality raid5_personality =
-
To unsubscribe from this list: send the line "unsubscribe linux-kernel" in
the body of a message to majordomo@xxxxxxxxxxxxxxx
More majordomo info at http://vger.kernel.org/majordomo-info.html
Please read the FAQ at http://www.tux.org/lkml/