Re: [PATCH -next 3/6] md/raid1: remove rcu protection to access rdev from conf

From: Song Liu
Date: Wed Oct 18 2023 - 13:57:17 EST


On Sun, Oct 15, 2023 at 6:28 PM Yu Kuai <yukuai1@xxxxxxxxxxxxxxx> wrote:
>
> From: Yu Kuai <yukuai3@xxxxxxxxxx>
>
> It's safe to accees rdev from conf:
> - If any spinlock is held, because synchronize_rcu() from
> md_kick_rdev_from_array() will prevent 'rdev' to be freed until
> spinlock is released;
> - If 'reconfig_lock' is held, because rdev can't be added or removed from
> array;

Maybe add lockdep asserts for the above cases?

Thanks,
Song

> - If there is normal IO inflight, because mddev_suspend() will prevent
> rdev to be added or removed from array;
> - If there is sync IO inflight, because 'MD_RECOVERY_RUNNING' is
> checked in remove_and_add_spares().
>
> And these will cover all the scenarios in raid1.
>
> Signed-off-by: Yu Kuai <yukuai3@xxxxxxxxxx>
> ---
> drivers/md/raid1.c | 57 +++++++++++++++++-----------------------------
> 1 file changed, 21 insertions(+), 36 deletions(-)
>
> diff --git a/drivers/md/raid1.c b/drivers/md/raid1.c
> index 4348d670439d..5c647036663d 100644
> --- a/drivers/md/raid1.c
> +++ b/drivers/md/raid1.c
> @@ -609,7 +609,6 @@ static int read_balance(struct r1conf *conf, struct r1bio *r1_bio, int *max_sect
> int choose_first;
> int choose_next_idle;
>
> - rcu_read_lock();
> /*
> * Check if we can balance. We can balance on the whole
> * device if no resync is going on, or below the resync window.
> @@ -642,7 +641,7 @@ static int read_balance(struct r1conf *conf, struct r1bio *r1_bio, int *max_sect
> unsigned int pending;
> bool nonrot;
>
> - rdev = rcu_dereference(conf->mirrors[disk].rdev);
> + rdev = conf->mirrors[disk].rdev;
> if (r1_bio->bios[disk] == IO_BLOCKED
> || rdev == NULL
> || test_bit(Faulty, &rdev->flags))
> @@ -773,7 +772,7 @@ static int read_balance(struct r1conf *conf, struct r1bio *r1_bio, int *max_sect
> }
>
> if (best_disk >= 0) {
> - rdev = rcu_dereference(conf->mirrors[best_disk].rdev);
> + rdev = conf->mirrors[best_disk].rdev;
> if (!rdev)
> goto retry;
> atomic_inc(&rdev->nr_pending);
> @@ -784,7 +783,6 @@ static int read_balance(struct r1conf *conf, struct r1bio *r1_bio, int *max_sect
>
> conf->mirrors[best_disk].next_seq_sect = this_sector + sectors;
> }
> - rcu_read_unlock();
> *max_sectors = sectors;
>
> return best_disk;
> @@ -1235,14 +1233,12 @@ static void raid1_read_request(struct mddev *mddev, struct bio *bio,
>
> if (r1bio_existed) {
> /* Need to get the block device name carefully */
> - struct md_rdev *rdev;
> - rcu_read_lock();
> - rdev = rcu_dereference(conf->mirrors[r1_bio->read_disk].rdev);
> + struct md_rdev *rdev = conf->mirrors[r1_bio->read_disk].rdev;
> +
> if (rdev)
> snprintf(b, sizeof(b), "%pg", rdev->bdev);
> else
> strcpy(b, "???");
> - rcu_read_unlock();
> }
>
> /*
> @@ -1396,10 +1392,9 @@ static void raid1_write_request(struct mddev *mddev, struct bio *bio,
>
> disks = conf->raid_disks * 2;
> blocked_rdev = NULL;
> - rcu_read_lock();
> max_sectors = r1_bio->sectors;
> for (i = 0; i < disks; i++) {
> - struct md_rdev *rdev = rcu_dereference(conf->mirrors[i].rdev);
> + struct md_rdev *rdev = conf->mirrors[i].rdev;
>
> /*
> * The write-behind io is only attempted on drives marked as
> @@ -1465,7 +1460,6 @@ static void raid1_write_request(struct mddev *mddev, struct bio *bio,
> }
> r1_bio->bios[i] = bio;
> }
> - rcu_read_unlock();
>
> if (unlikely(blocked_rdev)) {
> /* Wait for this device to become unblocked */
> @@ -1617,15 +1611,16 @@ static void raid1_status(struct seq_file *seq, struct mddev *mddev)
> struct r1conf *conf = mddev->private;
> int i;
>
> + lockdep_assert_held(&mddev->lock);
> +
> seq_printf(seq, " [%d/%d] [", conf->raid_disks,
> conf->raid_disks - mddev->degraded);
> - rcu_read_lock();
> for (i = 0; i < conf->raid_disks; i++) {
> - struct md_rdev *rdev = rcu_dereference(conf->mirrors[i].rdev);
> + struct md_rdev *rdev = READ_ONCE(conf->mirrors[i].rdev);
> +
> seq_printf(seq, "%s",
> rdev && test_bit(In_sync, &rdev->flags) ? "U" : "_");
> }
> - rcu_read_unlock();
> seq_printf(seq, "]");
> }
>
> @@ -1785,7 +1780,7 @@ static int raid1_add_disk(struct mddev *mddev, struct md_rdev *rdev)
> */
> if (rdev->saved_raid_disk < 0)
> conf->fullsync = 1;
> - rcu_assign_pointer(p->rdev, rdev);
> + WRITE_ONCE(p->rdev, rdev);
> break;
> }
> if (test_bit(WantReplacement, &p->rdev->flags) &&
> @@ -1801,7 +1796,7 @@ static int raid1_add_disk(struct mddev *mddev, struct md_rdev *rdev)
> rdev->raid_disk = repl_slot;
> err = 0;
> conf->fullsync = 1;
> - rcu_assign_pointer(p[conf->raid_disks].rdev, rdev);
> + WRITE_ONCE(p[conf->raid_disks].rdev, rdev);
> }
>
> return err;
> @@ -1835,7 +1830,7 @@ static int raid1_remove_disk(struct mddev *mddev, struct md_rdev *rdev)
> err = -EBUSY;
> goto abort;
> }
> - p->rdev = NULL;
> + WRITE_ONCE(p->rdev, NULL);
> if (conf->mirrors[conf->raid_disks + number].rdev) {
> /* We just removed a device that is being replaced.
> * Move down the replacement. We drain all IO before
> @@ -1856,7 +1851,7 @@ static int raid1_remove_disk(struct mddev *mddev, struct md_rdev *rdev)
> goto abort;
> }
> clear_bit(Replacement, &repl->flags);
> - p->rdev = repl;
> + WRITE_ONCE(p->rdev, repl);
> conf->mirrors[conf->raid_disks + number].rdev = NULL;
> unfreeze_array(conf);
> }
> @@ -2253,8 +2248,7 @@ static void fix_read_error(struct r1conf *conf, int read_disk,
> sector_t first_bad;
> int bad_sectors;
>
> - rcu_read_lock();
> - rdev = rcu_dereference(conf->mirrors[d].rdev);
> + rdev = conf->mirrors[d].rdev;
> if (rdev &&
> (test_bit(In_sync, &rdev->flags) ||
> (!test_bit(Faulty, &rdev->flags) &&
> @@ -2262,15 +2256,14 @@ static void fix_read_error(struct r1conf *conf, int read_disk,
> is_badblock(rdev, sect, s,
> &first_bad, &bad_sectors) == 0) {
> atomic_inc(&rdev->nr_pending);
> - rcu_read_unlock();
> if (sync_page_io(rdev, sect, s<<9,
> conf->tmppage, REQ_OP_READ, false))
> success = 1;
> rdev_dec_pending(rdev, mddev);
> if (success)
> break;
> - } else
> - rcu_read_unlock();
> + }
> +
> d++;
> if (d == conf->raid_disks * 2)
> d = 0;
> @@ -2289,29 +2282,24 @@ static void fix_read_error(struct r1conf *conf, int read_disk,
> if (d==0)
> d = conf->raid_disks * 2;
> d--;
> - rcu_read_lock();
> - rdev = rcu_dereference(conf->mirrors[d].rdev);
> + rdev = conf->mirrors[d].rdev;
> if (rdev &&
> !test_bit(Faulty, &rdev->flags)) {
> atomic_inc(&rdev->nr_pending);
> - rcu_read_unlock();
> r1_sync_page_io(rdev, sect, s,
> conf->tmppage, WRITE);
> rdev_dec_pending(rdev, mddev);
> - } else
> - rcu_read_unlock();
> + }
> }
> d = start;
> while (d != read_disk) {
> if (d==0)
> d = conf->raid_disks * 2;
> d--;
> - rcu_read_lock();
> - rdev = rcu_dereference(conf->mirrors[d].rdev);
> + rdev = conf->mirrors[d].rdev;
> if (rdev &&
> !test_bit(Faulty, &rdev->flags)) {
> atomic_inc(&rdev->nr_pending);
> - rcu_read_unlock();
> if (r1_sync_page_io(rdev, sect, s,
> conf->tmppage, READ)) {
> atomic_add(s, &rdev->corrected_errors);
> @@ -2322,8 +2310,7 @@ static void fix_read_error(struct r1conf *conf, int read_disk,
> rdev->bdev);
> }
> rdev_dec_pending(rdev, mddev);
> - } else
> - rcu_read_unlock();
> + }
> }
> sectors -= s;
> sect += s;
> @@ -2704,7 +2691,6 @@ static sector_t raid1_sync_request(struct mddev *mddev, sector_t sector_nr,
>
> r1_bio = raid1_alloc_init_r1buf(conf);
>
> - rcu_read_lock();
> /*
> * If we get a correctably read error during resync or recovery,
> * we might want to read from a different device. So we
> @@ -2725,7 +2711,7 @@ static sector_t raid1_sync_request(struct mddev *mddev, sector_t sector_nr,
> struct md_rdev *rdev;
> bio = r1_bio->bios[i];
>
> - rdev = rcu_dereference(conf->mirrors[i].rdev);
> + rdev = conf->mirrors[i].rdev;
> if (rdev == NULL ||
> test_bit(Faulty, &rdev->flags)) {
> if (i < conf->raid_disks)
> @@ -2783,7 +2769,6 @@ static sector_t raid1_sync_request(struct mddev *mddev, sector_t sector_nr,
> bio->bi_opf |= MD_FAILFAST;
> }
> }
> - rcu_read_unlock();
> if (disk < 0)
> disk = wonly;
> r1_bio->read_disk = disk;
> --
> 2.39.2
>