[RFC 3/8] iomap: Add atomic write support for direct-io

From: Ritesh Harjani (IBM)
Date: Sat Mar 02 2024 - 02:43:44 EST


This adds direct-io atomic writes support in iomap. This adds -
1. IOMAP_ATOMIC flag for iomap iter.
2. Sets REQ_ATOMIC to bio opflags.
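3. Adds necessary checks in iomap_dio code to ensure a single bio is
submitted for an atomic write request (only ubuf type iters are
supported). Otherwise an error is returned: -EINVAL if the iter is not
ubuf or the bio does not cover the whole request, -EIO if the mapping
fails the atomic write checks.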
4. Adds a common helper routine iomap_dio_check_atomic(). It helps in
verifying mapped length and start/end physical offset against the hw
device constraints for supporting atomic writes.

This patch is based on a patch from John Garry <john.g.garry@xxxxxxxxxx>
which adds such support of DIO atomic writes to iomap.

Co-developed-by: Ojaswin Mujoo <ojaswin@xxxxxxxxxxxxx>
Signed-off-by: Ojaswin Mujoo <ojaswin@xxxxxxxxxxxxx>
Signed-off-by: Ritesh Harjani (IBM) <ritesh.list@xxxxxxxxx>
---
fs/iomap/direct-io.c | 75 +++++++++++++++++++++++++++++++++++++++++--
fs/iomap/trace.h | 3 +-
include/linux/iomap.h | 1 +
3 files changed, 75 insertions(+), 4 deletions(-)

diff --git a/fs/iomap/direct-io.c b/fs/iomap/direct-io.c
index bcd3f8cf5ea4..b4548acb74e7 100644
--- a/fs/iomap/direct-io.c
+++ b/fs/iomap/direct-io.c
@@ -256,7 +256,7 @@ static void iomap_dio_zero(const struct iomap_iter *iter, struct iomap_dio *dio,
* clearing the WRITE_THROUGH flag in the dio request.
*/
static inline blk_opf_t iomap_dio_bio_opflags(struct iomap_dio *dio,
- const struct iomap *iomap, bool use_fua)
+ const struct iomap *iomap, bool use_fua, bool atomic_write)
{
blk_opf_t opflags = REQ_SYNC | REQ_IDLE;

@@ -269,6 +269,9 @@ static inline blk_opf_t iomap_dio_bio_opflags(struct iomap_dio *dio,
else
dio->flags &= ~IOMAP_DIO_WRITE_THROUGH;

+ if (atomic_write)
+ opflags |= REQ_ATOMIC;
+
return opflags;
}

@@ -279,11 +282,12 @@ static loff_t iomap_dio_bio_iter(const struct iomap_iter *iter,
struct inode *inode = iter->inode;
unsigned int fs_block_size = i_blocksize(inode), pad;
loff_t length = iomap_length(iter);
+ const size_t orig_len = iter->len;
loff_t pos = iter->pos;
blk_opf_t bio_opf;
struct bio *bio;
bool need_zeroout = false;
- bool use_fua = false;
+ bool use_fua = false, atomic_write = iter->flags & IOMAP_ATOMIC;
int nr_pages, ret = 0;
size_t copied = 0;
size_t orig_count;
@@ -356,6 +360,11 @@ static loff_t iomap_dio_bio_iter(const struct iomap_iter *iter,
if (need_zeroout) {
/* zero out from the start of the block to the write offset */
pad = pos & (fs_block_size - 1);
+ if (unlikely(pad && atomic_write)) {
+ WARN_ON_ONCE("pos not atomic write aligned\n");
+ ret = -EINVAL;
+ goto out;
+ }
if (pad)
iomap_dio_zero(iter, dio, pos - pad, pad);
}
@@ -365,7 +374,7 @@ static loff_t iomap_dio_bio_iter(const struct iomap_iter *iter,
* can set up the page vector appropriately for a ZONE_APPEND
* operation.
*/
- bio_opf = iomap_dio_bio_opflags(dio, iomap, use_fua);
+ bio_opf = iomap_dio_bio_opflags(dio, iomap, use_fua, atomic_write);

nr_pages = bio_iov_vecs_to_alloc(dio->submit.iter, BIO_MAX_VECS);
do {
@@ -397,6 +406,14 @@ static loff_t iomap_dio_bio_iter(const struct iomap_iter *iter,
}

n = bio->bi_iter.bi_size;
+
+ /* This bio should have covered the complete length */
+ if (unlikely(atomic_write && n != orig_len)) {
+ WARN_ON_ONCE(1);
+ ret = -EINVAL;
+ bio_put(bio);
+ goto out;
+ }
if (dio->flags & IOMAP_DIO_WRITE) {
task_io_account_write(n);
} else {
@@ -429,6 +446,8 @@ static loff_t iomap_dio_bio_iter(const struct iomap_iter *iter,
((dio->flags & IOMAP_DIO_WRITE) && pos >= i_size_read(inode))) {
/* zero out from the end of the write to the end of the block */
pad = pos & (fs_block_size - 1);
+ /* This should never happen */
+ WARN_ON_ONCE(unlikely(pad && atomic_write));
if (pad)
iomap_dio_zero(iter, dio, pos, fs_block_size - pad);
}
@@ -516,6 +535,44 @@ static loff_t iomap_dio_iter(const struct iomap_iter *iter,
}
}

+/*
+ * iomap_dio_check_atomic: DIO atomic write checks before bio submission.
+ * @iter: iomap iterator holding the current mapping
+ *
+ * Called after filesystem block mapping and before bio formation/submission.
+ * Returns true only if the mapping covers the whole request, its length is
+ * within the device's atomic write unit limits, the physical start is aligned
+ * to the minimum unit and the range does not cross an atomic write boundary.
+ */
+static bool iomap_dio_check_atomic(const struct iomap_iter *iter)
+{
+	struct request_queue *q = iter->iomap.bdev->bd_queue;
+	unsigned long long map_len = iomap_length(iter);
+	unsigned long long start = iomap_sector(&iter->iomap, iter->pos)
+			<< SECTOR_SHIFT;
+	unsigned long long end = start + map_len - 1;
+	unsigned int awu_min = queue_atomic_write_unit_min_bytes(q);
+	unsigned int awu_max = queue_atomic_write_unit_max_bytes(q);
+	/* 64-bit so the mask is not truncated against start/end on 32-bit */
+	unsigned long long boundary = queue_atomic_write_boundary_bytes(q);
+	unsigned long long mask = ~(boundary - 1);
+
+	/* The mapping must cover the whole user request (iter->len) */
+	if (map_len < iter->len)
+		return false;
+	/* A zero min unit or a length outside [awu_min, awu_max] can't work */
+	if (!awu_min || map_len < awu_min || map_len > awu_max)
+		return false;
+	/* start should be aligned to block device min atomic unit alignment */
+	if (!IS_ALIGNED(start, awu_min))
+		return false;
+	/* If the bits above the boundary differ, a boundary is crossed */
+	if (boundary && (start & mask) != (end & mask))
+		return false;
+
+	return true;
+}
+
+
/*
* iomap_dio_rw() always completes O_[D]SYNC writes regardless of whether the IO
* is being issued as AIO or not. This allows us to optimise pure data writes
@@ -554,12 +611,16 @@ __iomap_dio_rw(struct kiocb *iocb, struct iov_iter *iter,
struct blk_plug plug;
struct iomap_dio *dio;
loff_t ret = 0;
+ bool atomic_write = iocb->ki_flags & IOCB_ATOMIC;

trace_iomap_dio_rw_begin(iocb, iter, dio_flags, done_before);

if (!iomi.len)
return NULL;

+ if (atomic_write && !iter_is_ubuf(iter))
+ return ERR_PTR(-EINVAL);
+
dio = kmalloc(sizeof(*dio), GFP_KERNEL);
if (!dio)
return ERR_PTR(-ENOMEM);
@@ -605,6 +666,9 @@ __iomap_dio_rw(struct kiocb *iocb, struct iov_iter *iter,
if (iocb->ki_flags & IOCB_DIO_CALLER_COMP)
dio->flags |= IOMAP_DIO_CALLER_COMP;

+ if (atomic_write)
+ iomi.flags |= IOMAP_ATOMIC;
+
if (dio_flags & IOMAP_DIO_OVERWRITE_ONLY) {
ret = -EAGAIN;
if (iomi.pos >= dio->i_size ||
@@ -656,6 +720,11 @@ __iomap_dio_rw(struct kiocb *iocb, struct iov_iter *iter,

blk_start_plug(&plug);
while ((ret = iomap_iter(&iomi, ops)) > 0) {
+ if (atomic_write && !iomap_dio_check_atomic(&iomi)) {
+ ret = -EIO;
+ break;
+ }
+
iomi.processed = iomap_dio_iter(&iomi, dio);

/*
diff --git a/fs/iomap/trace.h b/fs/iomap/trace.h
index c16fd55f5595..c95576420bca 100644
--- a/fs/iomap/trace.h
+++ b/fs/iomap/trace.h
@@ -98,7 +98,8 @@ DEFINE_RANGE_EVENT(iomap_dio_rw_queued);
{ IOMAP_REPORT, "REPORT" }, \
{ IOMAP_FAULT, "FAULT" }, \
{ IOMAP_DIRECT, "DIRECT" }, \
- { IOMAP_NOWAIT, "NOWAIT" }
+ { IOMAP_NOWAIT, "NOWAIT" }, \
+ { IOMAP_ATOMIC, "ATOMIC" }

#define IOMAP_F_FLAGS_STRINGS \
{ IOMAP_F_NEW, "NEW" }, \
diff --git a/include/linux/iomap.h b/include/linux/iomap.h
index 96dd0acbba44..9eac704a0d6f 100644
--- a/include/linux/iomap.h
+++ b/include/linux/iomap.h
@@ -178,6 +178,7 @@ struct iomap_folio_ops {
#else
#define IOMAP_DAX 0
#endif /* CONFIG_FS_DAX */
+#define IOMAP_ATOMIC (1 << 9)

struct iomap_ops {
/*
--
2.43.0