[PATCH RFC 12/16] xfs: Add support for fallocate2

From: John Garry
Date: Wed May 03 2023 - 14:42:35 EST


From: Allison Henderson <allison.henderson@xxxxxxxxxx>

Add support for fallocate2 ioctl, which is xfs' own version of fallocate.

Struct xfs_fallocate2 is passed in the ioctl, and xfs_fallocate2.alignment
allows the user to specify required extent alignment. This is key for
atomic write support, as we expect extents to be aligned on
atomic_write_unit_max boundaries.

The alignment flag is not sticky, so further extent mutation will not
obey this original alignment request. In addition, extent lengths should
always be a multiple of atomic_write_unit_max, which they are not yet. So
this really just works for scenarios when we were lucky enough to get a
single extent.

The following is sample usage and c code:

mkfs.xfs -f /dev/sda
mount /dev/sda mnt
xfs_fallocate2 mnt/test_file1.img 0 20971520 262144
filefrag -v mnt/test_file1.img

xfs_fallocate2.c

struct xfs_fallocate2 {
int64_t offset; /* bytes */
int64_t length; /* bytes */
uint64_t flags;
uint32_t alignment; /* bytes */
uint32_t padding[9];
};

int main(int argc, char **argv) {
char *file;
int fd, ret;
struct xfs_fallocate2 fa = {};

if (argc != 5) {
printf("expected 5 arguments\n");
exit(0);
}

argv++;
file = *argv;
argv++;

fa.offset = atoi(*argv);
argv++;

fa.length = atoi(*argv);
argv++;

fa.alignment = atoi(*argv);
argv++;

if (fa.alignment)
fa.flags = XFS_FALLOC2_ALIGNED;

fd = open(file, O_RDWR | O_CREAT, 0600);
if (fd < 0)
exit(0);

ret = ioctl(fd, XFS_IOC_FALLOCATE2, &fa);
close(fd);

return ret;
}

Signed-off-by: Allison Henderson <allison.henderson@xxxxxxxxxx>
Signed-off-by: Catherine Hoang <catherine.hoang@xxxxxxxxxx>
Signed-off-by: John Garry <john.g.garry@xxxxxxxxxx>
---
fs/xfs/Makefile | 1 +
fs/xfs/libxfs/xfs_attr_remote.c | 2 +-
fs/xfs/libxfs/xfs_bmap.c | 9 ++-
fs/xfs/libxfs/xfs_bmap.h | 4 +-
fs/xfs/libxfs/xfs_da_btree.c | 4 +-
fs/xfs/libxfs/xfs_fs.h | 1 +
fs/xfs/xfs_bmap_util.c | 7 ++-
fs/xfs/xfs_bmap_util.h | 2 +-
fs/xfs/xfs_dquot.c | 2 +-
fs/xfs/xfs_file.c | 19 +++++--
fs/xfs/xfs_fs_staging.c | 99 +++++++++++++++++++++++++++++++++
fs/xfs/xfs_fs_staging.h | 21 +++++++
fs/xfs/xfs_ioctl.c | 4 ++
fs/xfs/xfs_iomap.c | 4 +-
fs/xfs/xfs_reflink.c | 4 +-
fs/xfs/xfs_rtalloc.c | 2 +-
fs/xfs/xfs_symlink.c | 2 +-
security/security.c | 1 +
18 files changed, 168 insertions(+), 20 deletions(-)
create mode 100644 fs/xfs/xfs_fs_staging.c
create mode 100644 fs/xfs/xfs_fs_staging.h

diff --git a/fs/xfs/Makefile b/fs/xfs/Makefile
index 92d88dc3c9f7..9b413544d358 100644
--- a/fs/xfs/Makefile
+++ b/fs/xfs/Makefile
@@ -93,6 +93,7 @@ xfs-y += xfs_aops.o \
xfs_sysfs.o \
xfs_trans.o \
xfs_xattr.o \
+ xfs_fs_staging.o \
kmem.o

# low-level transaction/log code
diff --git a/fs/xfs/libxfs/xfs_attr_remote.c b/fs/xfs/libxfs/xfs_attr_remote.c
index d440393b40eb..c5f190fef1b5 100644
--- a/fs/xfs/libxfs/xfs_attr_remote.c
+++ b/fs/xfs/libxfs/xfs_attr_remote.c
@@ -615,7 +615,7 @@ xfs_attr_rmtval_set_blk(
error = xfs_bmapi_write(args->trans, dp,
(xfs_fileoff_t)attr->xattri_lblkno,
attr->xattri_blkcnt, XFS_BMAPI_ATTRFORK, args->total,
- map, &nmap);
+ map, &nmap, 0);
if (error)
return error;

diff --git a/fs/xfs/libxfs/xfs_bmap.c b/fs/xfs/libxfs/xfs_bmap.c
index 34de6e6898c4..52a6e2b61228 100644
--- a/fs/xfs/libxfs/xfs_bmap.c
+++ b/fs/xfs/libxfs/xfs_bmap.c
@@ -3275,7 +3275,9 @@ xfs_bmap_compute_alignments(
struct xfs_alloc_arg *args)
{
struct xfs_mount *mp = args->mp;
- xfs_extlen_t align = 0; /* minimum allocation alignment */
+
+ /* minimum allocation alignment */
+ xfs_extlen_t align = args->alignment;
int stripe_align = 0;

/* stripe alignment for allocation is determined by mount parameters */
@@ -3652,6 +3654,7 @@ xfs_bmap_btalloc(
.datatype = ap->datatype,
.alignment = 1,
.minalignslop = 0,
+ .alignment = ap->align,
};
xfs_fileoff_t orig_offset;
xfs_extlen_t orig_length;
@@ -4279,12 +4282,14 @@ xfs_bmapi_write(
uint32_t flags, /* XFS_BMAPI_... */
xfs_extlen_t total, /* total blocks needed */
struct xfs_bmbt_irec *mval, /* output: map values */
- int *nmap) /* i/o: mval size/count */
+ int *nmap,
+ xfs_extlen_t align) /* i/o: mval size/count */
{
struct xfs_bmalloca bma = {
.tp = tp,
.ip = ip,
.total = total,
+ .align = align,
};
struct xfs_mount *mp = ip->i_mount;
int whichfork = xfs_bmapi_whichfork(flags);
diff --git a/fs/xfs/libxfs/xfs_bmap.h b/fs/xfs/libxfs/xfs_bmap.h
index dd08361ca5a6..0573dfc5fa6b 100644
--- a/fs/xfs/libxfs/xfs_bmap.h
+++ b/fs/xfs/libxfs/xfs_bmap.h
@@ -26,6 +26,7 @@ struct xfs_bmalloca {
xfs_fileoff_t offset; /* offset in file filling in */
xfs_extlen_t length; /* i/o length asked/allocated */
xfs_fsblock_t blkno; /* starting block of new extent */
+ xfs_extlen_t align;

struct xfs_btree_cur *cur; /* btree cursor */
struct xfs_iext_cursor icur; /* incore extent cursor */
@@ -189,7 +190,8 @@ int xfs_bmapi_read(struct xfs_inode *ip, xfs_fileoff_t bno,
int *nmap, uint32_t flags);
int xfs_bmapi_write(struct xfs_trans *tp, struct xfs_inode *ip,
xfs_fileoff_t bno, xfs_filblks_t len, uint32_t flags,
- xfs_extlen_t total, struct xfs_bmbt_irec *mval, int *nmap);
+ xfs_extlen_t total, struct xfs_bmbt_irec *mval, int *nmap,
+ xfs_extlen_t align);
int __xfs_bunmapi(struct xfs_trans *tp, struct xfs_inode *ip,
xfs_fileoff_t bno, xfs_filblks_t *rlen, uint32_t flags,
xfs_extnum_t nexts);
diff --git a/fs/xfs/libxfs/xfs_da_btree.c b/fs/xfs/libxfs/xfs_da_btree.c
index e576560b46e9..e6581254092f 100644
--- a/fs/xfs/libxfs/xfs_da_btree.c
+++ b/fs/xfs/libxfs/xfs_da_btree.c
@@ -2174,7 +2174,7 @@ xfs_da_grow_inode_int(
nmap = 1;
error = xfs_bmapi_write(tp, dp, *bno, count,
xfs_bmapi_aflag(w)|XFS_BMAPI_METADATA|XFS_BMAPI_CONTIG,
- args->total, &map, &nmap);
+ args->total, &map, &nmap, 0);
if (error)
return error;

@@ -2196,7 +2196,7 @@ xfs_da_grow_inode_int(
nmap = min(XFS_BMAP_MAX_NMAP, c);
error = xfs_bmapi_write(tp, dp, b, c,
xfs_bmapi_aflag(w)|XFS_BMAPI_METADATA,
- args->total, &mapp[mapi], &nmap);
+ args->total, &mapp[mapi], &nmap, 0);
if (error)
goto out_free_map;
if (nmap < 1)
diff --git a/fs/xfs/libxfs/xfs_fs.h b/fs/xfs/libxfs/xfs_fs.h
index 1cfd5bc6520a..829316ca01ea 100644
--- a/fs/xfs/libxfs/xfs_fs.h
+++ b/fs/xfs/libxfs/xfs_fs.h
@@ -831,6 +831,7 @@ struct xfs_scrub_metadata {
#define XFS_IOC_FSGEOMETRY _IOR ('X', 126, struct xfs_fsop_geom)
#define XFS_IOC_BULKSTAT _IOR ('X', 127, struct xfs_bulkstat_req)
#define XFS_IOC_INUMBERS _IOR ('X', 128, struct xfs_inumbers_req)
+#define XFS_IOC_FALLOCATE2 _IOR ('X', 129, struct xfs_fallocate2)
/* XFS_IOC_GETFSUUID ---------- deprecated 140 */


diff --git a/fs/xfs/xfs_bmap_util.c b/fs/xfs/xfs_bmap_util.c
index a09dd2606479..a0c55af6f051 100644
--- a/fs/xfs/xfs_bmap_util.c
+++ b/fs/xfs/xfs_bmap_util.c
@@ -776,10 +776,12 @@ int
xfs_alloc_file_space(
struct xfs_inode *ip,
xfs_off_t offset,
- xfs_off_t len)
+ xfs_off_t len,
+ xfs_off_t align)
{
xfs_mount_t *mp = ip->i_mount;
xfs_off_t count;
+ xfs_filblks_t align_fsb;
xfs_filblks_t allocated_fsb;
xfs_filblks_t allocatesize_fsb;
xfs_extlen_t extsz, temp;
@@ -811,6 +813,7 @@ xfs_alloc_file_space(
nimaps = 1;
startoffset_fsb = XFS_B_TO_FSBT(mp, offset);
endoffset_fsb = XFS_B_TO_FSB(mp, offset + count);
+ align_fsb = XFS_B_TO_FSB(mp, align);
allocatesize_fsb = endoffset_fsb - startoffset_fsb;

/*
@@ -872,7 +875,7 @@ xfs_alloc_file_space(

error = xfs_bmapi_write(tp, ip, startoffset_fsb,
allocatesize_fsb, XFS_BMAPI_PREALLOC, 0, imapp,
- &nimaps);
+ &nimaps, align_fsb);
if (error)
goto error;

diff --git a/fs/xfs/xfs_bmap_util.h b/fs/xfs/xfs_bmap_util.h
index 6888078f5c31..476f610ad617 100644
--- a/fs/xfs/xfs_bmap_util.h
+++ b/fs/xfs/xfs_bmap_util.h
@@ -54,7 +54,7 @@ int xfs_bmap_last_extent(struct xfs_trans *tp, struct xfs_inode *ip,

/* preallocation and hole punch interface */
int xfs_alloc_file_space(struct xfs_inode *ip, xfs_off_t offset,
- xfs_off_t len);
+ xfs_off_t len, xfs_off_t align);
int xfs_free_file_space(struct xfs_inode *ip, xfs_off_t offset,
xfs_off_t len);
int xfs_collapse_file_space(struct xfs_inode *, xfs_off_t offset,
diff --git a/fs/xfs/xfs_dquot.c b/fs/xfs/xfs_dquot.c
index 8fb90da89787..475e1a56d1b0 100644
--- a/fs/xfs/xfs_dquot.c
+++ b/fs/xfs/xfs_dquot.c
@@ -328,7 +328,7 @@ xfs_dquot_disk_alloc(
/* Create the block mapping. */
error = xfs_bmapi_write(tp, quotip, dqp->q_fileoffset,
XFS_DQUOT_CLUSTER_SIZE_FSB, XFS_BMAPI_METADATA, 0, &map,
- &nmaps);
+ &nmaps, 0);
if (error)
goto err_cancel;

diff --git a/fs/xfs/xfs_file.c b/fs/xfs/xfs_file.c
index 705250f9f90a..9b1db42a8d33 100644
--- a/fs/xfs/xfs_file.c
+++ b/fs/xfs/xfs_file.c
@@ -883,12 +883,13 @@ static inline bool xfs_file_sync_writes(struct file *filp)
FALLOC_FL_COLLAPSE_RANGE | FALLOC_FL_ZERO_RANGE | \
FALLOC_FL_INSERT_RANGE | FALLOC_FL_UNSHARE_RANGE)

-STATIC long
-xfs_file_fallocate(
+long
+_xfs_file_fallocate(
struct file *file,
int mode,
loff_t offset,
- loff_t len)
+ loff_t len,
+ loff_t alignment)
{
struct inode *inode = file_inode(file);
struct xfs_inode *ip = XFS_I(inode);
@@ -1035,7 +1036,7 @@ xfs_file_fallocate(
}

if (!xfs_is_always_cow_inode(ip)) {
- error = xfs_alloc_file_space(ip, offset, len);
+ error = xfs_alloc_file_space(ip, offset, len, alignment);
if (error)
goto out_unlock;
}
@@ -1073,6 +1074,16 @@ xfs_file_fallocate(
return error;
}

+STATIC long
+xfs_file_fallocate(
+ struct file *file,
+ int mode,
+ loff_t offset,
+ loff_t len)
+{
+ return _xfs_file_fallocate(file, mode, offset, len, 0);
+}
+
STATIC int
xfs_file_fadvise(
struct file *file,
diff --git a/fs/xfs/xfs_fs_staging.c b/fs/xfs/xfs_fs_staging.c
new file mode 100644
index 000000000000..1d635c0a9f49
--- /dev/null
+++ b/fs/xfs/xfs_fs_staging.c
@@ -0,0 +1,99 @@
+/* SPDX-License-Identifier: GPL-2.0-or-later */
+/*
+ * Copyright (C) 2023 Oracle. All Rights Reserved.
+ */
+
+#include "xfs.h"
+#include "xfs_fs_staging.h"
+#include "xfs_shared.h"
+#include "xfs_format.h"
+#include "xfs_log_format.h"
+#include "xfs_trans_resv.h"
+#include "xfs_mount.h"
+#include "xfs_inode.h"
+
+#include "linux/security.h"
+#include "linux/fsnotify.h"
+
+extern long _xfs_file_fallocate(
+ struct file *file,
+ int mode,
+ loff_t offset,
+ loff_t len,
+ loff_t alignment);
+
+int xfs_fallocate2( struct file *filp,
+ void __user *arg)
+{
+ struct inode *inode = file_inode(filp);
+ //struct xfs_inode *ip = XFS_I(inode);
+ struct xfs_fallocate2 fallocate2;
+ int ret;
+
+ if (copy_from_user(&fallocate2, arg, sizeof(fallocate2)))
+ return -EFAULT;
+
+ if (fallocate2.flags & XFS_FALLOC2_ALIGNED) {
+ if (!fallocate2.alignment || !is_power_of_2(fallocate2.alignment))
+ return -EINVAL;
+
+ if (fallocate2.offset % fallocate2.alignment)
+ return -EINVAL;
+
+ if (fallocate2.length % fallocate2.alignment)
+ return -EINVAL;
+ } else if (fallocate2.alignment) {
+ return -EINVAL;
+ }
+
+ /* These are all just copied from vfs_fallocate() */
+ if (fallocate2.offset < 0 || fallocate2.length <= 0)
+ return -EINVAL;
+
+ if (!(filp->f_mode & FMODE_WRITE))
+ return -EBADF;
+
+ if (IS_IMMUTABLE(inode))
+ return -EPERM;
+
+ /*
+ * We cannot allow any fallocate operation on an active swapfile
+ */
+ if (IS_SWAPFILE(inode))
+ return -ETXTBSY;
+
+ /*
+ * Revalidate the write permissions, in case security policy has
+ * changed since the files were opened.
+ */
+ ret = security_file_permission(filp, MAY_WRITE);
+ if (ret)
+ return ret;
+
+ if (S_ISFIFO(inode->i_mode))
+ return -ESPIPE;
+
+ if (S_ISDIR(inode->i_mode))
+ return -EISDIR;
+
+ if (!S_ISREG(inode->i_mode) && !S_ISBLK(inode->i_mode))
+ return -ENODEV;
+
+ /* Check for wrap through zero too */
+ if (((fallocate2.offset + fallocate2.length) > inode->i_sb->s_maxbytes) ||
+ ((fallocate2.offset + fallocate2.length) < 0))
+ return -EFBIG;
+
+ if (!filp->f_op->fallocate)
+ return -EOPNOTSUPP;
+
+ file_start_write(filp);
+ ret = _xfs_file_fallocate(filp, 0, fallocate2.offset, fallocate2.length, fallocate2.alignment);
+
+ if (ret == 0)
+ fsnotify_modify(filp);
+
+ file_end_write(filp);
+
+ return ret;
+}
diff --git a/fs/xfs/xfs_fs_staging.h b/fs/xfs/xfs_fs_staging.h
new file mode 100644
index 000000000000..a82e61063dba
--- /dev/null
+++ b/fs/xfs/xfs_fs_staging.h
@@ -0,0 +1,21 @@
+/* SPDX-License-Identifier: GPL-2.0-or-later */
+/*
+ * Copyright (C) 2023 Oracle. All Rights Reserved.
+ */
+#ifndef __XFS_FS_STAGING_H__
+#define __XFS_FS_STAGING_H__
+
+struct xfs_fallocate2 {
+ s64 offset; /* bytes */
+ s64 length; /* bytes */
+ u64 flags;
+ u32 alignment; /* bytes */
+ u32 padding[8];
+};
+
+#define XFS_FALLOC2_ALIGNED (1U << 0)
+
+int xfs_fallocate2( struct file *filp,
+ void __user *arg);
+
+#endif /* __XFS_FS_STAGING_H__ */
diff --git a/fs/xfs/xfs_ioctl.c b/fs/xfs/xfs_ioctl.c
index 55bb01173cde..6e60fce44068 100644
--- a/fs/xfs/xfs_ioctl.c
+++ b/fs/xfs/xfs_ioctl.c
@@ -4,6 +4,7 @@
* All Rights Reserved.
*/
#include "xfs.h"
+#include "xfs_fs_staging.h"
#include "xfs_fs.h"
#include "xfs_shared.h"
#include "xfs_format.h"
@@ -2149,6 +2150,9 @@ xfs_file_ioctl(
return error;
}

+ case XFS_IOC_FALLOCATE2:
+ return xfs_fallocate2(filp, arg);
+
default:
return -ENOTTY;
}
diff --git a/fs/xfs/xfs_iomap.c b/fs/xfs/xfs_iomap.c
index 285885c308bd..a4389a0c4bf2 100644
--- a/fs/xfs/xfs_iomap.c
+++ b/fs/xfs/xfs_iomap.c
@@ -306,7 +306,7 @@ xfs_iomap_write_direct(
*/
nimaps = 1;
error = xfs_bmapi_write(tp, ip, offset_fsb, count_fsb, bmapi_flags, 0,
- imap, &nimaps);
+ imap, &nimaps, 0);
if (error)
goto out_trans_cancel;

@@ -614,7 +614,7 @@ xfs_iomap_write_unwritten(
nimaps = 1;
error = xfs_bmapi_write(tp, ip, offset_fsb, count_fsb,
XFS_BMAPI_CONVERT, resblks, &imap,
- &nimaps);
+ &nimaps, 0);
if (error)
goto error_on_bmapi_transaction;

diff --git a/fs/xfs/xfs_reflink.c b/fs/xfs/xfs_reflink.c
index f5dc46ce9803..a2e5ba6cf7f3 100644
--- a/fs/xfs/xfs_reflink.c
+++ b/fs/xfs/xfs_reflink.c
@@ -420,7 +420,7 @@ xfs_reflink_fill_cow_hole(
nimaps = 1;
error = xfs_bmapi_write(tp, ip, imap->br_startoff, imap->br_blockcount,
XFS_BMAPI_COWFORK | XFS_BMAPI_PREALLOC, 0, cmap,
- &nimaps);
+ &nimaps, 0);
if (error)
goto out_trans_cancel;

@@ -490,7 +490,7 @@ xfs_reflink_fill_delalloc(
error = xfs_bmapi_write(tp, ip, cmap->br_startoff,
cmap->br_blockcount,
XFS_BMAPI_COWFORK | XFS_BMAPI_PREALLOC, 0,
- cmap, &nimaps);
+ cmap, &nimaps, 0);
if (error)
goto out_trans_cancel;

diff --git a/fs/xfs/xfs_rtalloc.c b/fs/xfs/xfs_rtalloc.c
index 16534e9873f6..a57a8a4d8294 100644
--- a/fs/xfs/xfs_rtalloc.c
+++ b/fs/xfs/xfs_rtalloc.c
@@ -817,7 +817,7 @@ xfs_growfs_rt_alloc(
*/
nmap = 1;
error = xfs_bmapi_write(tp, ip, oblocks, nblocks - oblocks,
- XFS_BMAPI_METADATA, 0, &map, &nmap);
+ XFS_BMAPI_METADATA, 0, &map, &nmap, 0);
if (!error && nmap < 1)
error = -ENOSPC;
if (error)
diff --git a/fs/xfs/xfs_symlink.c b/fs/xfs/xfs_symlink.c
index 85e433df6a3f..2a4524bf34a5 100644
--- a/fs/xfs/xfs_symlink.c
+++ b/fs/xfs/xfs_symlink.c
@@ -269,7 +269,7 @@ xfs_symlink(
nmaps = XFS_SYMLINK_MAPS;

error = xfs_bmapi_write(tp, ip, first_fsb, fs_blocks,
- XFS_BMAPI_METADATA, resblks, mval, &nmaps);
+ XFS_BMAPI_METADATA, resblks, mval, &nmaps, 0);
if (error)
goto out_trans_cancel;

diff --git a/security/security.c b/security/security.c
index cf6cc576736f..d53b1b6c2d59 100644
--- a/security/security.c
+++ b/security/security.c
@@ -1593,6 +1593,7 @@ int security_file_permission(struct file *file, int mask)

return fsnotify_perm(file, mask);
}
+EXPORT_SYMBOL(security_file_permission);

int security_file_alloc(struct file *file)
{
--
2.31.1