[PATCH v2 03/14] fs: xfs: Introduce FORCEALIGN inode flag

From: John Garry
Date: Mon Mar 04 2024 - 08:06:40 EST


From: "Darrick J. Wong" <djwong@xxxxxxxxxx>

Add a new inode flag to require that all file data extent mappings must
be aligned (both the file offset range and the allocated space itself)
to the extent size hint. Having a separate COW extent size hint is no
longer allowed.

The goal here is to enable sysadmins and users to mandate that all space
mappings in a file must have a startoff/blockcount that are aligned to
(say) a 2MB alignment and that the startblock/blockcount will follow the
same alignment.

jpg: Enforce extsize is a power-of-2 for forcealign
Signed-off-by: "Darrick J. Wong" <djwong@xxxxxxxxxx>
Co-developed-by: John Garry <john.g.garry@xxxxxxxxxx>
Signed-off-by: John Garry <john.g.garry@xxxxxxxxxx>
---
fs/xfs/libxfs/xfs_format.h | 6 +++++-
fs/xfs/libxfs/xfs_inode_buf.c | 40 +++++++++++++++++++++++++++++++++++
fs/xfs/libxfs/xfs_inode_buf.h | 3 +++
fs/xfs/libxfs/xfs_sb.c | 2 ++
fs/xfs/xfs_inode.c | 12 +++++++++++
fs/xfs/xfs_inode.h | 5 +++++
fs/xfs/xfs_ioctl.c | 34 ++++++++++++++++++++++++++++-
fs/xfs/xfs_mount.h | 2 ++
fs/xfs/xfs_super.c | 4 ++++
include/uapi/linux/fs.h | 2 ++
10 files changed, 108 insertions(+), 2 deletions(-)

diff --git a/fs/xfs/libxfs/xfs_format.h b/fs/xfs/libxfs/xfs_format.h
index 382ab1e71c0b..db2113cf6e47 100644
--- a/fs/xfs/libxfs/xfs_format.h
+++ b/fs/xfs/libxfs/xfs_format.h
@@ -353,6 +353,7 @@ xfs_sb_has_compat_feature(
#define XFS_SB_FEAT_RO_COMPAT_RMAPBT (1 << 1) /* reverse map btree */
#define XFS_SB_FEAT_RO_COMPAT_REFLINK (1 << 2) /* reflinked files */
#define XFS_SB_FEAT_RO_COMPAT_INOBTCNT (1 << 3) /* inobt block counts */
+#define XFS_SB_FEAT_RO_COMPAT_FORCEALIGN (1 << 30) /* aligned file data extents */
#define XFS_SB_FEAT_RO_COMPAT_ALL \
(XFS_SB_FEAT_RO_COMPAT_FINOBT | \
XFS_SB_FEAT_RO_COMPAT_RMAPBT | \
@@ -1085,16 +1086,19 @@ static inline void xfs_dinode_put_rdev(struct xfs_dinode *dip, xfs_dev_t rdev)
#define XFS_DIFLAG2_COWEXTSIZE_BIT 2 /* copy on write extent size hint */
#define XFS_DIFLAG2_BIGTIME_BIT 3 /* big timestamps */
#define XFS_DIFLAG2_NREXT64_BIT 4 /* large extent counters */
+/* data extent mappings for regular files must be aligned to extent size hint */
+#define XFS_DIFLAG2_FORCEALIGN_BIT 5

#define XFS_DIFLAG2_DAX (1 << XFS_DIFLAG2_DAX_BIT)
#define XFS_DIFLAG2_REFLINK (1 << XFS_DIFLAG2_REFLINK_BIT)
#define XFS_DIFLAG2_COWEXTSIZE (1 << XFS_DIFLAG2_COWEXTSIZE_BIT)
#define XFS_DIFLAG2_BIGTIME (1 << XFS_DIFLAG2_BIGTIME_BIT)
#define XFS_DIFLAG2_NREXT64 (1 << XFS_DIFLAG2_NREXT64_BIT)
+#define XFS_DIFLAG2_FORCEALIGN (1 << XFS_DIFLAG2_FORCEALIGN_BIT)

#define XFS_DIFLAG2_ANY \
(XFS_DIFLAG2_DAX | XFS_DIFLAG2_REFLINK | XFS_DIFLAG2_COWEXTSIZE | \
- XFS_DIFLAG2_BIGTIME | XFS_DIFLAG2_NREXT64)
+ XFS_DIFLAG2_BIGTIME | XFS_DIFLAG2_NREXT64 | XFS_DIFLAG2_FORCEALIGN)

static inline bool xfs_dinode_has_bigtime(const struct xfs_dinode *dip)
{
diff --git a/fs/xfs/libxfs/xfs_inode_buf.c b/fs/xfs/libxfs/xfs_inode_buf.c
index 137a65bda95d..61cc12cd54db 100644
--- a/fs/xfs/libxfs/xfs_inode_buf.c
+++ b/fs/xfs/libxfs/xfs_inode_buf.c
@@ -610,6 +610,14 @@ xfs_dinode_verify(
!xfs_has_bigtime(mp))
return __this_address;

+ if (flags2 & XFS_DIFLAG2_FORCEALIGN) {
+ fa = xfs_inode_validate_forcealign(mp, mode, flags,
+ be32_to_cpu(dip->di_extsize),
+ be32_to_cpu(dip->di_cowextsize));
+ if (fa)
+ return fa;
+ }
+
return NULL;
}

@@ -777,3 +785,35 @@ xfs_inode_validate_cowextsize(

return NULL;
}
+
+/* Validate the forcealign inode flag */
+xfs_failaddr_t
+xfs_inode_validate_forcealign(
+ struct xfs_mount *mp,
+ uint16_t mode,
+ uint16_t flags,
+ uint32_t extsize,
+ uint32_t cowextsize)
+{
+ /* superblock rocompat feature flag */
+ if (!xfs_has_forcealign(mp))
+ return __this_address;
+
+ /* Only regular files and directories */
+ if (!S_ISDIR(mode) && !S_ISREG(mode))
+ return __this_address;
+
+ /* Doesn't apply to realtime files */
+ if (flags & XFS_DIFLAG_REALTIME)
+ return __this_address;
+
+ /* Requires a non-zero power-of-2 extent size hint */
+ if (extsize == 0 || !is_power_of_2(extsize))
+ return __this_address;
+
+ /* Requires no cow extent size hint */
+ if (cowextsize != 0)
+ return __this_address;
+
+ return NULL;
+}
diff --git a/fs/xfs/libxfs/xfs_inode_buf.h b/fs/xfs/libxfs/xfs_inode_buf.h
index 585ed5a110af..50db17d22b68 100644
--- a/fs/xfs/libxfs/xfs_inode_buf.h
+++ b/fs/xfs/libxfs/xfs_inode_buf.h
@@ -33,6 +33,9 @@ xfs_failaddr_t xfs_inode_validate_extsize(struct xfs_mount *mp,
xfs_failaddr_t xfs_inode_validate_cowextsize(struct xfs_mount *mp,
uint32_t cowextsize, uint16_t mode, uint16_t flags,
uint64_t flags2);
+xfs_failaddr_t xfs_inode_validate_forcealign(struct xfs_mount *mp,
+ uint16_t mode, uint16_t flags, uint32_t extsize,
+ uint32_t cowextsize);

static inline uint64_t xfs_inode_encode_bigtime(struct timespec64 tv)
{
diff --git a/fs/xfs/libxfs/xfs_sb.c b/fs/xfs/libxfs/xfs_sb.c
index 5bb6e2bd6dee..f2c16a028fae 100644
--- a/fs/xfs/libxfs/xfs_sb.c
+++ b/fs/xfs/libxfs/xfs_sb.c
@@ -163,6 +163,8 @@ xfs_sb_version_to_features(
features |= XFS_FEAT_REFLINK;
if (sbp->sb_features_ro_compat & XFS_SB_FEAT_RO_COMPAT_INOBTCNT)
features |= XFS_FEAT_INOBTCNT;
+ if (sbp->sb_features_ro_compat & XFS_SB_FEAT_RO_COMPAT_FORCEALIGN)
+ features |= XFS_FEAT_FORCEALIGN;
if (sbp->sb_features_incompat & XFS_SB_FEAT_INCOMPAT_FTYPE)
features |= XFS_FEAT_FTYPE;
if (sbp->sb_features_incompat & XFS_SB_FEAT_INCOMPAT_SPINODES)
diff --git a/fs/xfs/xfs_inode.c b/fs/xfs/xfs_inode.c
index 1fd94958aa97..2c439df8c47f 100644
--- a/fs/xfs/xfs_inode.c
+++ b/fs/xfs/xfs_inode.c
@@ -629,6 +629,8 @@ xfs_ip2xflags(
flags |= FS_XFLAG_DAX;
if (ip->i_diflags2 & XFS_DIFLAG2_COWEXTSIZE)
flags |= FS_XFLAG_COWEXTSIZE;
+ if (ip->i_diflags2 & XFS_DIFLAG2_FORCEALIGN)
+ flags |= FS_XFLAG_FORCEALIGN;
}

if (xfs_inode_has_attr_fork(ip))
@@ -758,6 +760,8 @@ xfs_inode_inherit_flags2(
}
if (pip->i_diflags2 & XFS_DIFLAG2_DAX)
ip->i_diflags2 |= XFS_DIFLAG2_DAX;
+ if (pip->i_diflags2 & XFS_DIFLAG2_FORCEALIGN)
+ ip->i_diflags2 |= XFS_DIFLAG2_FORCEALIGN;

/* Don't let invalid cowextsize hints propagate. */
failaddr = xfs_inode_validate_cowextsize(ip->i_mount, ip->i_cowextsize,
@@ -766,6 +770,14 @@ xfs_inode_inherit_flags2(
ip->i_diflags2 &= ~XFS_DIFLAG2_COWEXTSIZE;
ip->i_cowextsize = 0;
}
+
+ if (ip->i_diflags2 & XFS_DIFLAG2_FORCEALIGN) {
+ failaddr = xfs_inode_validate_forcealign(ip->i_mount,
+ VFS_I(ip)->i_mode, ip->i_diflags, ip->i_extsize,
+ ip->i_cowextsize);
+ if (failaddr)
+ ip->i_diflags2 &= ~XFS_DIFLAG2_FORCEALIGN;
+ }
}

/*
diff --git a/fs/xfs/xfs_inode.h b/fs/xfs/xfs_inode.h
index 97f63bacd4c2..82e2838f6d64 100644
--- a/fs/xfs/xfs_inode.h
+++ b/fs/xfs/xfs_inode.h
@@ -305,6 +305,11 @@ static inline bool xfs_inode_has_large_extent_counts(struct xfs_inode *ip)
return ip->i_diflags2 & XFS_DIFLAG2_NREXT64;
}

+static inline bool xfs_inode_forcealign(struct xfs_inode *ip)
+{
+ return ip->i_diflags2 & XFS_DIFLAG2_FORCEALIGN;
+}
+
/*
* Return the buftarg used for data allocations on a given inode.
*/
diff --git a/fs/xfs/xfs_ioctl.c b/fs/xfs/xfs_ioctl.c
index f02b6e558af5..867d8d51a3d0 100644
--- a/fs/xfs/xfs_ioctl.c
+++ b/fs/xfs/xfs_ioctl.c
@@ -1110,6 +1110,8 @@ xfs_flags2diflags2(
di_flags2 |= XFS_DIFLAG2_DAX;
if (xflags & FS_XFLAG_COWEXTSIZE)
di_flags2 |= XFS_DIFLAG2_COWEXTSIZE;
+ if (xflags & FS_XFLAG_FORCEALIGN)
+ di_flags2 |= XFS_DIFLAG2_FORCEALIGN;

return di_flags2;
}
@@ -1146,6 +1148,22 @@ xfs_ioctl_setattr_xflags(
if (i_flags2 && !xfs_has_v3inodes(mp))
return -EINVAL;

+ /*
+ * Force-align requires a nonzero extent size hint and a zero cow
+ * extent size hint. It doesn't apply to realtime files.
+ */
+ if (fa->fsx_xflags & FS_XFLAG_FORCEALIGN) {
+ if (!xfs_has_forcealign(mp))
+ return -EINVAL;
+ if (fa->fsx_xflags & FS_XFLAG_COWEXTSIZE)
+ return -EINVAL;
+ if (!(fa->fsx_xflags & (FS_XFLAG_EXTSIZE |
+ FS_XFLAG_EXTSZINHERIT)))
+ return -EINVAL;
+ if (fa->fsx_xflags & FS_XFLAG_REALTIME)
+ return -EINVAL;
+ }
+
ip->i_diflags = xfs_flags2diflags(ip, fa->fsx_xflags);
ip->i_diflags2 = i_flags2;

@@ -1232,6 +1250,7 @@ xfs_ioctl_setattr_check_extsize(
struct xfs_mount *mp = ip->i_mount;
xfs_failaddr_t failaddr;
uint16_t new_diflags;
+ uint16_t new_diflags2;

if (!fa->fsx_valid)
return 0;
@@ -1244,6 +1263,7 @@ xfs_ioctl_setattr_check_extsize(
return -EINVAL;

new_diflags = xfs_flags2diflags(ip, fa->fsx_xflags);
+ new_diflags2 = xfs_flags2diflags2(ip, fa->fsx_xflags);

/*
* Inode verifiers do not check that the extent size hint is an integer
@@ -1263,7 +1283,19 @@ xfs_ioctl_setattr_check_extsize(
failaddr = xfs_inode_validate_extsize(ip->i_mount,
XFS_B_TO_FSB(mp, fa->fsx_extsize),
VFS_I(ip)->i_mode, new_diflags);
- return failaddr != NULL ? -EINVAL : 0;
+ if (failaddr)
+ return -EINVAL;
+
+ if (new_diflags2 & XFS_DIFLAG2_FORCEALIGN) {
+ failaddr = xfs_inode_validate_forcealign(ip->i_mount,
+ VFS_I(ip)->i_mode, new_diflags,
+ XFS_B_TO_FSB(mp, fa->fsx_extsize),
+ XFS_B_TO_FSB(mp, fa->fsx_cowextsize));
+ if (failaddr)
+ return -EINVAL;
+ }
+
+ return 0;
}

static int
diff --git a/fs/xfs/xfs_mount.h b/fs/xfs/xfs_mount.h
index 503fe3c7edbf..e1ef31675db3 100644
--- a/fs/xfs/xfs_mount.h
+++ b/fs/xfs/xfs_mount.h
@@ -289,6 +289,7 @@ typedef struct xfs_mount {
#define XFS_FEAT_BIGTIME (1ULL << 24) /* large timestamps */
#define XFS_FEAT_NEEDSREPAIR (1ULL << 25) /* needs xfs_repair */
#define XFS_FEAT_NREXT64 (1ULL << 26) /* large extent counters */
+#define XFS_FEAT_FORCEALIGN (1ULL << 27) /* aligned file data extents */

/* Mount features */
#define XFS_FEAT_NOATTR2 (1ULL << 48) /* disable attr2 creation */
@@ -352,6 +353,7 @@ __XFS_HAS_FEAT(inobtcounts, INOBTCNT)
__XFS_HAS_FEAT(bigtime, BIGTIME)
__XFS_HAS_FEAT(needsrepair, NEEDSREPAIR)
__XFS_HAS_FEAT(large_extent_counts, NREXT64)
+__XFS_HAS_FEAT(forcealign, FORCEALIGN)

/*
* Mount features
diff --git a/fs/xfs/xfs_super.c b/fs/xfs/xfs_super.c
index 5a2512d20bd0..74dcafddf6a9 100644
--- a/fs/xfs/xfs_super.c
+++ b/fs/xfs/xfs_super.c
@@ -1708,6 +1708,10 @@ xfs_fs_fill_super(
mp->m_features &= ~XFS_FEAT_DISCARD;
}

+ if (xfs_has_forcealign(mp))
+ xfs_warn(mp,
+"EXPERIMENTAL forced data extent alignment feature in use. Use at your own risk!");
+
if (xfs_has_reflink(mp)) {
if (mp->m_sb.sb_rblocks) {
xfs_alert(mp,
diff --git a/include/uapi/linux/fs.h b/include/uapi/linux/fs.h
index a0975ae81e64..8828822331bf 100644
--- a/include/uapi/linux/fs.h
+++ b/include/uapi/linux/fs.h
@@ -140,6 +140,8 @@ struct fsxattr {
#define FS_XFLAG_FILESTREAM 0x00004000 /* use filestream allocator */
#define FS_XFLAG_DAX 0x00008000 /* use DAX for IO */
#define FS_XFLAG_COWEXTSIZE 0x00010000 /* CoW extent size allocator hint */
+/* data extent mappings for regular files must be aligned to extent size hint */
+#define FS_XFLAG_FORCEALIGN 0x00020000
#define FS_XFLAG_HASATTR 0x80000000 /* no DIFLAG for this */

/* the read-only stuff doesn't really belong here, but any other place is
--
2.31.1