Re: [dm-devel] [PATCH v4 1/4] block: Introduce provisioning primitives

From: Darrick J. Wong
Date: Wed Apr 19 2023 - 11:36:21 EST


On Tue, Apr 18, 2023 at 03:12:04PM -0700, Sarthak Kukreti wrote:
> Introduce block request REQ_OP_PROVISION. The intent of this request
> is to request underlying storage to preallocate disk space for the given
> block range. Block devices that support this capability will export
> a provision limit within their request queues.
>
> This patch also adds the capability to call fallocate() in mode 0
> on block devices, which will send REQ_OP_PROVISION to the block
> device for the specified range,
>
> Signed-off-by: Sarthak Kukreti <sarthakkukreti@xxxxxxxxxxxx>
> ---
> block/blk-core.c | 5 ++++
> block/blk-lib.c | 53 +++++++++++++++++++++++++++++++++++++++
> block/blk-merge.c | 18 +++++++++++++
> block/blk-settings.c | 19 ++++++++++++++
> block/blk-sysfs.c | 8 ++++++
> block/bounce.c | 1 +
> block/fops.c | 25 +++++++++++++-----
> include/linux/bio.h | 6 +++--
> include/linux/blk_types.h | 5 +++-
> include/linux/blkdev.h | 16 ++++++++++++
> 10 files changed, 147 insertions(+), 9 deletions(-)
>

<cut to the fallocate part; the block/ changes look fine to /me/ at
first glance, but what do I know... ;)>

> diff --git a/block/fops.c b/block/fops.c
> index d2e6be4e3d1c..e1775269654a 100644
> --- a/block/fops.c
> +++ b/block/fops.c
> @@ -611,9 +611,13 @@ static ssize_t blkdev_read_iter(struct kiocb *iocb, struct iov_iter *to)
> return ret;
> }
>
> +#define BLKDEV_FALLOC_FL_TRUNCATE \

At first I thought from this name that you were defining a new truncate
mode for fallocate, then I realized that this is a mask for deciding if we
/want/ to truncate the pagecache.

#define BLKDEV_FALLOC_TRUNCATE_MASK ?

> + (FALLOC_FL_PUNCH_HOLE | FALLOC_FL_ZERO_RANGE | \

Ok, so discarding and writing zeroes truncates the page cache, makes
sense since we're "writing" directly to the block device.

> + FALLOC_FL_NO_HIDE_STALE)

Here things get tricky -- some of the FALLOC_FL mode bits are really an
opcode and cannot be specified together, whereas others select optional
behavior for certain opcodes.

IIRC, the mutually exclusive opcodes are:

PUNCH_HOLE
ZERO_RANGE
COLLAPSE_RANGE
INSERT_RANGE
(none of the above, for allocation)

and the "variants on a theme" are:

KEEP_SIZE
NO_HIDE_STALE
UNSHARE_RANGE

not all of which are supported by all the opcodes.

Does it make sense to truncate the page cache if userspace passes in
mode == NO_HIDE_STALE? There's currently no defined meaning for this
combination, but I think this means we'll truncate the pagecache before
deciding if we're actually going to issue any commands.

I think that's just a bug in the existing code -- it should be
validating that @mode is any of the supported combinations *before*
truncating the pagecache.

Otherwise you could have a mkfs program that starts writing new fs
metadata, decides to provision the storage (say for a logging region),
doesn't realize it's running on an old kernel, and then -- oops -- the
provision attempt fails, but have we now shredded the pagecache and lost
all the writes?

--D

> +
> #define BLKDEV_FALLOC_FL_SUPPORTED \
> - (FALLOC_FL_KEEP_SIZE | FALLOC_FL_PUNCH_HOLE | \
> - FALLOC_FL_ZERO_RANGE | FALLOC_FL_NO_HIDE_STALE)
> + (BLKDEV_FALLOC_FL_TRUNCATE | FALLOC_FL_KEEP_SIZE | \
> + FALLOC_FL_UNSHARE_RANGE)
>
> static long blkdev_fallocate(struct file *file, int mode, loff_t start,
> loff_t len)
> @@ -625,7 +629,7 @@ static long blkdev_fallocate(struct file *file, int mode, loff_t start,
> int error;
>
> /* Fail if we don't recognize the flags. */
> - if (mode & ~BLKDEV_FALLOC_FL_SUPPORTED)
> + if (mode != 0 && mode & ~BLKDEV_FALLOC_FL_SUPPORTED)
> return -EOPNOTSUPP;
>
> /* Don't go off the end of the device. */
> @@ -649,11 +653,20 @@ static long blkdev_fallocate(struct file *file, int mode, loff_t start,
> filemap_invalidate_lock(inode->i_mapping);
>
> /* Invalidate the page cache, including dirty pages. */
> - error = truncate_bdev_range(bdev, file->f_mode, start, end);
> - if (error)
> - goto fail;
> + if (mode & BLKDEV_FALLOC_FL_TRUNCATE) {
> + error = truncate_bdev_range(bdev, file->f_mode, start, end);
> + if (error)
> + goto fail;
> + }
>
> switch (mode) {
> + case 0:
> + case FALLOC_FL_UNSHARE_RANGE:
> + case FALLOC_FL_KEEP_SIZE:
> + case FALLOC_FL_UNSHARE_RANGE | FALLOC_FL_KEEP_SIZE:
> + error = blkdev_issue_provision(bdev, start >> SECTOR_SHIFT,
> + len >> SECTOR_SHIFT, GFP_KERNEL);
> + break;
> case FALLOC_FL_ZERO_RANGE:
> case FALLOC_FL_ZERO_RANGE | FALLOC_FL_KEEP_SIZE:
> error = blkdev_issue_zeroout(bdev, start >> SECTOR_SHIFT,
> diff --git a/include/linux/bio.h b/include/linux/bio.h
> index d766be7152e1..9820b3b039f2 100644
> --- a/include/linux/bio.h
> +++ b/include/linux/bio.h
> @@ -57,7 +57,8 @@ static inline bool bio_has_data(struct bio *bio)
> bio->bi_iter.bi_size &&
> bio_op(bio) != REQ_OP_DISCARD &&
> bio_op(bio) != REQ_OP_SECURE_ERASE &&
> - bio_op(bio) != REQ_OP_WRITE_ZEROES)
> + bio_op(bio) != REQ_OP_WRITE_ZEROES &&
> + bio_op(bio) != REQ_OP_PROVISION)
> return true;
>
> return false;
> @@ -67,7 +68,8 @@ static inline bool bio_no_advance_iter(const struct bio *bio)
> {
> return bio_op(bio) == REQ_OP_DISCARD ||
> bio_op(bio) == REQ_OP_SECURE_ERASE ||
> - bio_op(bio) == REQ_OP_WRITE_ZEROES;
> + bio_op(bio) == REQ_OP_WRITE_ZEROES ||
> + bio_op(bio) == REQ_OP_PROVISION;
> }
>
> static inline void *bio_data(struct bio *bio)
> diff --git a/include/linux/blk_types.h b/include/linux/blk_types.h
> index 99be590f952f..27bdf88f541c 100644
> --- a/include/linux/blk_types.h
> +++ b/include/linux/blk_types.h
> @@ -385,7 +385,10 @@ enum req_op {
> REQ_OP_DRV_IN = (__force blk_opf_t)34,
> REQ_OP_DRV_OUT = (__force blk_opf_t)35,
>
> - REQ_OP_LAST = (__force blk_opf_t)36,
> + /* request device to provision block */
> + REQ_OP_PROVISION = (__force blk_opf_t)37,
> +
> + REQ_OP_LAST = (__force blk_opf_t)38,
> };
>
> enum req_flag_bits {
> diff --git a/include/linux/blkdev.h b/include/linux/blkdev.h
> index 941304f17492..239e2f418b6e 100644
> --- a/include/linux/blkdev.h
> +++ b/include/linux/blkdev.h
> @@ -303,6 +303,7 @@ struct queue_limits {
> unsigned int discard_granularity;
> unsigned int discard_alignment;
> unsigned int zone_write_granularity;
> + unsigned int max_provision_sectors;
>
> unsigned short max_segments;
> unsigned short max_integrity_segments;
> @@ -921,6 +922,8 @@ extern void blk_queue_max_discard_sectors(struct request_queue *q,
> unsigned int max_discard_sectors);
> extern void blk_queue_max_write_zeroes_sectors(struct request_queue *q,
> unsigned int max_write_same_sectors);
> +extern void blk_queue_max_provision_sectors(struct request_queue *q,
> + unsigned int max_provision_sectors);
> extern void blk_queue_logical_block_size(struct request_queue *, unsigned int);
> extern void blk_queue_max_zone_append_sectors(struct request_queue *q,
> unsigned int max_zone_append_sectors);
> @@ -1060,6 +1063,9 @@ int __blkdev_issue_discard(struct block_device *bdev, sector_t sector,
> int blkdev_issue_secure_erase(struct block_device *bdev, sector_t sector,
> sector_t nr_sects, gfp_t gfp);
>
> +extern int blkdev_issue_provision(struct block_device *bdev, sector_t sector,
> + sector_t nr_sects, gfp_t gfp_mask);
> +
> #define BLKDEV_ZERO_NOUNMAP (1 << 0) /* do not free blocks */
> #define BLKDEV_ZERO_NOFALLBACK (1 << 1) /* don't write explicit zeroes */
>
> @@ -1139,6 +1145,11 @@ static inline unsigned short queue_max_discard_segments(const struct request_que
> return q->limits.max_discard_segments;
> }
>
> +static inline unsigned short queue_max_provision_sectors(const struct request_queue *q)
> +{
> + return q->limits.max_provision_sectors;
> +}
> +
> static inline unsigned int queue_max_segment_size(const struct request_queue *q)
> {
> return q->limits.max_segment_size;
> @@ -1281,6 +1292,11 @@ static inline bool bdev_nowait(struct block_device *bdev)
> return test_bit(QUEUE_FLAG_NOWAIT, &bdev_get_queue(bdev)->queue_flags);
> }
>
> +static inline unsigned int bdev_max_provision_sectors(struct block_device *bdev)
> +{
> + return bdev_get_queue(bdev)->limits.max_provision_sectors;
> +}
> +
> static inline enum blk_zoned_model bdev_zoned_model(struct block_device *bdev)
> {
> return blk_queue_zoned_model(bdev_get_queue(bdev));
> --
> 2.40.0.634.g4ca3ef3211-goog
>
> --
> dm-devel mailing list
> dm-devel@xxxxxxxxxx
> https://listman.redhat.com/mailman/listinfo/dm-devel
>