[PATCH v9 2/4] Applying the blk_interposer in the block device layer

From: Sergei Shtepa
Date: Wed Apr 21 2021 - 12:46:28 EST


In order to prevent the same bio request from being intercepted multiple
times, the BIO_INTERPOSED flag was added.

The blk_partition_remap() function was moved from submit_bio_checks()
to submit_bio_noacct(). This allows the interposer to receive the bio
request unchanged.

The __submit_bio() and __submit_bio_noacct_mq() functions have been
removed and their respective functionalities were merged into
submit_bio_noacct() and __submit_bio_noacct() accordingly. This allows
to process bio requests from request-based and bio-based block devices
in one common loop.

Functions bio_interposer_lock() and bio_interposer_unlock() in
submit_bio_noacct() allow to stop the receipt of new bio requests for
processing, but not lock the processing of bio requests that have been
already added to the current->bio_list. To keep the penalty for a new
lock to a minimum, percpu_rw_sem is used.

Signed-off-by: Sergei Shtepa <sergei.shtepa@xxxxxxxxx>
---
block/bio.c | 2 +
block/blk-core.c | 194 ++++++++++++++++++++++++++---------------------
2 files changed, 108 insertions(+), 88 deletions(-)

diff --git a/block/bio.c b/block/bio.c
index 50e579088aca..6fc9e8f395a6 100644
--- a/block/bio.c
+++ b/block/bio.c
@@ -640,6 +640,8 @@ void __bio_clone_fast(struct bio *bio, struct bio *bio_src)
bio_set_flag(bio, BIO_THROTTLED);
if (bio_flagged(bio_src, BIO_REMAPPED))
bio_set_flag(bio, BIO_REMAPPED);
+ if (bio_flagged(bio_src, BIO_INTERPOSED))
+ bio_set_flag(bio, BIO_INTERPOSED);
bio->bi_opf = bio_src->bi_opf;
bio->bi_ioprio = bio_src->bi_ioprio;
bio->bi_write_hint = bio_src->bi_write_hint;
diff --git a/block/blk-core.c b/block/blk-core.c
index fc60ff208497..a987daa76a79 100644
--- a/block/blk-core.c
+++ b/block/blk-core.c
@@ -735,26 +735,27 @@ static inline int bio_check_eod(struct bio *bio)
handle_bad_sector(bio, maxsector);
return -EIO;
}
+
+ if (unlikely(should_fail_request(bio->bi_bdev, bio->bi_iter.bi_size)))
+ return -EIO;
+
return 0;
}

/*
* Remap block n of partition p to block n+start(p) of the disk.
*/
-static int blk_partition_remap(struct bio *bio)
+static inline void blk_partition_remap(struct bio *bio)
{
- struct block_device *p = bio->bi_bdev;
+ struct block_device *bdev = bio->bi_bdev;

- if (unlikely(should_fail_request(p, bio->bi_iter.bi_size)))
- return -EIO;
- if (bio_sectors(bio)) {
- bio->bi_iter.bi_sector += p->bd_start_sect;
- trace_block_bio_remap(bio, p->bd_dev,
+ if (bdev->bd_partno && bio_sectors(bio)) {
+ bio->bi_iter.bi_sector += bdev->bd_start_sect;
+ trace_block_bio_remap(bio, bdev->bd_dev,
bio->bi_iter.bi_sector -
- p->bd_start_sect);
+ bdev->bd_start_sect);
}
bio_set_flag(bio, BIO_REMAPPED);
- return 0;
}

/*
@@ -819,8 +820,6 @@ static noinline_for_stack bool submit_bio_checks(struct bio *bio)
if (!bio_flagged(bio, BIO_REMAPPED)) {
if (unlikely(bio_check_eod(bio)))
goto end_io;
- if (bdev->bd_partno && unlikely(blk_partition_remap(bio)))
- goto end_io;
}

/*
@@ -910,20 +909,6 @@ static noinline_for_stack bool submit_bio_checks(struct bio *bio)
return false;
}

-static blk_qc_t __submit_bio(struct bio *bio)
-{
- struct gendisk *disk = bio->bi_bdev->bd_disk;
- blk_qc_t ret = BLK_QC_T_NONE;
-
- if (blk_crypto_bio_prep(&bio)) {
- if (!disk->fops->submit_bio)
- return blk_mq_submit_bio(bio);
- ret = disk->fops->submit_bio(bio);
- }
- blk_queue_exit(disk->queue);
- return ret;
-}
-
/*
* The loop in this function may be a bit non-obvious, and so deserves some
* explanation:
@@ -931,7 +916,7 @@ static blk_qc_t __submit_bio(struct bio *bio)
* - Before entering the loop, bio->bi_next is NULL (as all callers ensure
* that), so we have a list with a single bio.
* - We pretend that we have just taken it off a longer list, so we assign
- * bio_list to a pointer to the bio_list_on_stack, thus initialising the
+ * bio_list to a pointer to the current->bio_list, thus initialising the
* bio_list of new bios to be added. ->submit_bio() may indeed add some more
* bios through a recursive call to submit_bio_noacct. If it did, we find a
* non-NULL value in bio_list and re-enter the loop from the top.
@@ -939,83 +924,75 @@ static blk_qc_t __submit_bio(struct bio *bio)
* pretending) and so remove it from bio_list, and call into ->submit_bio()
* again.
*
- * bio_list_on_stack[0] contains bios submitted by the current ->submit_bio.
- * bio_list_on_stack[1] contains bios that were submitted before the current
+ * current->bio_list[0] contains bios submitted by the current ->submit_bio.
+ * current->bio_list[1] contains bios that were submitted before the current
* ->submit_bio_bio, but that haven't been processed yet.
*/
static blk_qc_t __submit_bio_noacct(struct bio *bio)
{
- struct bio_list bio_list_on_stack[2];
- blk_qc_t ret = BLK_QC_T_NONE;
-
- BUG_ON(bio->bi_next);
-
- bio_list_init(&bio_list_on_stack[0]);
- current->bio_list = bio_list_on_stack;
-
- do {
- struct request_queue *q = bio->bi_bdev->bd_disk->queue;
- struct bio_list lower, same;
+ struct gendisk *disk = bio->bi_bdev->bd_disk;
+ struct bio_list lower, same;
+ blk_qc_t ret;

- if (unlikely(bio_queue_enter(bio) != 0))
- continue;
+ if (!blk_crypto_bio_prep(&bio)) {
+ blk_queue_exit(disk->queue);
+ return BLK_QC_T_NONE;
+ }

- /*
- * Create a fresh bio_list for all subordinate requests.
- */
- bio_list_on_stack[1] = bio_list_on_stack[0];
- bio_list_init(&bio_list_on_stack[0]);
+ if (queue_is_mq(disk->queue))
+ return blk_mq_submit_bio(bio);

- ret = __submit_bio(bio);
+ /*
+ * Create a fresh bio_list for all subordinate requests.
+ */
+ current->bio_list[1] = current->bio_list[0];
+ bio_list_init(&current->bio_list[0]);

- /*
- * Sort new bios into those for a lower level and those for the
- * same level.
- */
- bio_list_init(&lower);
- bio_list_init(&same);
- while ((bio = bio_list_pop(&bio_list_on_stack[0])) != NULL)
- if (q == bio->bi_bdev->bd_disk->queue)
- bio_list_add(&same, bio);
- else
- bio_list_add(&lower, bio);
+ WARN_ON_ONCE(!disk->fops->submit_bio);
+ ret = disk->fops->submit_bio(bio);
+ blk_queue_exit(disk->queue);
+ /*
+ * Sort new bios into those for a lower level and those
+ * for the same level.
+ */
+ bio_list_init(&lower);
+ bio_list_init(&same);
+ while ((bio = bio_list_pop(&current->bio_list[0])) != NULL)
+ if (disk->queue == bio->bi_bdev->bd_disk->queue)
+ bio_list_add(&same, bio);
+ else
+ bio_list_add(&lower, bio);

- /*
- * Now assemble so we handle the lowest level first.
- */
- bio_list_merge(&bio_list_on_stack[0], &lower);
- bio_list_merge(&bio_list_on_stack[0], &same);
- bio_list_merge(&bio_list_on_stack[0], &bio_list_on_stack[1]);
- } while ((bio = bio_list_pop(&bio_list_on_stack[0])));
+ /*
+ * Now assemble so we handle the lowest level first.
+ */
+ bio_list_merge(&current->bio_list[0], &lower);
+ bio_list_merge(&current->bio_list[0], &same);
+ bio_list_merge(&current->bio_list[0], &current->bio_list[1]);

- current->bio_list = NULL;
return ret;
}

-static blk_qc_t __submit_bio_noacct_mq(struct bio *bio)
+static inline struct block_device *bio_interposer_lock(struct bio *bio)
{
- struct bio_list bio_list[2] = { };
- blk_qc_t ret = BLK_QC_T_NONE;
-
- current->bio_list = bio_list;
-
- do {
- struct gendisk *disk = bio->bi_bdev->bd_disk;
-
- if (unlikely(bio_queue_enter(bio) != 0))
- continue;
+ bool locked;
+ struct block_device *bdev = bio->bi_bdev;

- if (!blk_crypto_bio_prep(&bio)) {
- blk_queue_exit(disk->queue);
- ret = BLK_QC_T_NONE;
- continue;
+ if (bio->bi_opf & REQ_NOWAIT) {
+ locked = percpu_down_read_trylock(&bdev->bd_interposer_lock);
+ if (unlikely(!locked)) {
+ bio_wouldblock_error(bio);
+ return NULL;
}
+ } else
+ percpu_down_read(&bdev->bd_interposer_lock);

- ret = blk_mq_submit_bio(bio);
- } while ((bio = bio_list_pop(&bio_list[0])));
+ return bdev;
+}

- current->bio_list = NULL;
- return ret;
+static inline void bio_interposer_unlock(struct block_device *locked_bdev)
+{
+ percpu_up_read(&locked_bdev->bd_interposer_lock);
}

/**
@@ -1029,6 +1006,10 @@ static blk_qc_t __submit_bio_noacct_mq(struct bio *bio)
*/
blk_qc_t submit_bio_noacct(struct bio *bio)
{
+ struct block_device *locked_bdev;
+ struct bio_list bio_list_on_stack[2] = { };
+ blk_qc_t ret = BLK_QC_T_NONE;
+
if (!submit_bio_checks(bio))
return BLK_QC_T_NONE;

@@ -1043,9 +1024,46 @@ blk_qc_t submit_bio_noacct(struct bio *bio)
return BLK_QC_T_NONE;
}

- if (!bio->bi_bdev->bd_disk->fops->submit_bio)
- return __submit_bio_noacct_mq(bio);
- return __submit_bio_noacct(bio);
+ BUG_ON(bio->bi_next);
+
+ locked_bdev = bio_interposer_lock(bio);
+ if (!locked_bdev)
+ return BLK_QC_T_NONE;
+
+ current->bio_list = bio_list_on_stack;
+
+ do {
+ if (unlikely(bio_queue_enter(bio) != 0)) {
+ ret = BLK_QC_T_NONE;
+ continue;
+ }
+
+ if (!bio_flagged(bio, BIO_INTERPOSED) &&
+ bio->bi_bdev->bd_interposer) {
+ struct gendisk *disk = bio->bi_bdev->bd_disk;
+
+ bio_set_dev(bio, bio->bi_bdev->bd_interposer);
+ bio_set_flag(bio, BIO_INTERPOSED);
+
+ bio_list_add(&bio_list_on_stack[0], bio);
+
+ blk_queue_exit(disk->queue);
+ ret = BLK_QC_T_NONE;
+ continue;
+ }
+
+ if (!bio_flagged(bio, BIO_REMAPPED))
+ blk_partition_remap(bio);
+
+ ret = __submit_bio_noacct(bio);
+
+ } while ((bio = bio_list_pop(&bio_list_on_stack[0])));
+
+ current->bio_list = NULL;
+
+ bio_interposer_unlock(locked_bdev);
+
+ return ret;
}
EXPORT_SYMBOL(submit_bio_noacct);

--
2.20.1