[PATCH] inotify: support returning file_handles

From: Max Kellermann
Date: Tue Sep 19 2023 - 16:23:33 EST


This patch adds the watch mask bit "IN_FID". It implements semantics
similar to fanotify's "FAN_REPORT_FID": if the bit is set in a "struct
inotify_event", the event (and the name) are followed by a "struct
inotify_extra_fid" which contains the filesystem id (not yet
implemented) and the file_handle.

It is debatable whether this is a useful feature; not useful for me,
but Jan Kara cited this feature as an advantage of fanotify over
inotify:
https://lore.kernel.org/linux-fsdevel/20230919100112.nlb2t4nm46wmugc2@quack3/

In a follow-up, Amir Goldstein did not want to accept that this
feature could be added easily to inotify, telling me that I "do not
understand the complexity involved":
https://lore.kernel.org/linux-fsdevel/CAOQ4uxgG6ync6dSBJiGW98docJGnajALiV+9tuwGiRt8NE8F+w@xxxxxxxxxxxxxx/

.. which Jan Kara agreed with: "I'll really find out it isn't so easy"
https://lore.kernel.org/linux-fsdevel/20230919132818.4s3n5bsqmokof6n2@quack3/

So here it is, an easy implementation in less than 90 lines of code
(which is slightly incomplete; grep TODO). This is a per-watch flag
and there is no backwards compatibility breakage because the extra
struct is only added if explicitly requested by user space.

This is just a proof-of-concept, to demonstrate that adding such a
feature to inotify is not only possible, but also easy. Even though
this part of the kernel is new to me, apparently I do "understand the
complexity involved", after all.

I don't expect this to be merged. As much as I'd like to see inotify
improved because fanotify is too complex and cumbersome for my taste,
I'm not deluded enough to believe this PoC will convince anybody. But
hacking it was fun and helped me learn about the inotify code which I
may continue to improve in our private kernel fork.

Signed-off-by: Max Kellermann <max.kellermann@xxxxxxxxx>
---
fs/notify/inotify/inotify.h | 4 ++-
fs/notify/inotify/inotify_fsnotify.c | 31 ++++++++++++++++++++
fs/notify/inotify/inotify_user.c | 43 +++++++++++++++++++++++++++-
include/linux/inotify.h | 1 +
include/uapi/linux/inotify.h | 11 +++++++
5 files changed, 88 insertions(+), 2 deletions(-)

diff --git a/fs/notify/inotify/inotify.h b/fs/notify/inotify/inotify.h
index 7d5df7a21539..2b20aa67fa8b 100644
--- a/fs/notify/inotify/inotify.h
+++ b/fs/notify/inotify/inotify.h
@@ -9,6 +9,8 @@ struct inotify_event_info {
int wd;
u32 sync_cookie;
int name_len;
+ __kernel_fsid_t fsid;
+ u8 file_handle_size;
char name[];
};

@@ -27,7 +29,7 @@ static inline struct inotify_event_info *INOTIFY_E(struct fsnotify_event *fse)
* userspace. There is at least one bit (FS_EVENT_ON_CHILD) which is
* used only internally to the kernel.
*/
-#define INOTIFY_USER_MASK (IN_ALL_EVENTS)
+#define INOTIFY_USER_MASK (IN_ALL_EVENTS|IN_FID)

static inline __u32 inotify_mark_user_mask(struct fsnotify_mark *fsn_mark)
{
diff --git a/fs/notify/inotify/inotify_fsnotify.c b/fs/notify/inotify/inotify_fsnotify.c
index 993375f0db67..1fdfaac91d7b 100644
--- a/fs/notify/inotify/inotify_fsnotify.c
+++ b/fs/notify/inotify/inotify_fsnotify.c
@@ -17,6 +17,7 @@
#include <linux/fs.h> /* struct inode */
#include <linux/fsnotify_backend.h>
#include <linux/inotify.h>
+#include <linux/exportfs.h>
#include <linux/path.h> /* struct path */
#include <linux/slab.h> /* kmem_* */
#include <linux/types.h>
@@ -43,6 +44,7 @@ static bool event_compare(struct fsnotify_event *old_fsn,
(old->name_len == new->name_len) &&
(!old->name_len || !strcmp(old->name, new->name)))
return true;
+ // TODO compare fsid, file_handle
return false;
}

@@ -56,6 +58,16 @@ static int inotify_merge(struct fsnotify_group *group,
return event_compare(last_event, event);
}

+static int get_file_handle_dwords(struct inode *inode)
+{
+ if (inode == NULL)
+ return 0;
+
+ int dwords = 0;
+ exportfs_encode_fid(inode, NULL, &dwords);
+ return dwords;
+}
+
int inotify_handle_inode_event(struct fsnotify_mark *inode_mark, u32 mask,
struct inode *inode, struct inode *dir,
const struct qstr *name, u32 cookie)
@@ -66,7 +78,9 @@ int inotify_handle_inode_event(struct fsnotify_mark *inode_mark, u32 mask,
struct fsnotify_group *group = inode_mark->group;
int ret;
int len = 0, wd;
+ int file_handle_dwords;
int alloc_len = sizeof(struct inotify_event_info);
+ size_t file_handle_offset = 0;
struct mem_cgroup *old_memcg;

if (name) {
@@ -74,6 +88,14 @@ int inotify_handle_inode_event(struct fsnotify_mark *inode_mark, u32 mask,
alloc_len += len + 1;
}

+ if (inode_mark->mask & IN_FID) {
+ file_handle_dwords = get_file_handle_dwords(inode);
+ if (file_handle_dwords > 0) {
+ file_handle_offset = roundup(alloc_len, 4);
+ alloc_len = file_handle_offset + sizeof(struct file_handle) + (file_handle_dwords << 2);
+ }
+ }
+
pr_debug("%s: group=%p mark=%p mask=%x\n", __func__, group, inode_mark,
mask);

@@ -120,9 +142,18 @@ int inotify_handle_inode_event(struct fsnotify_mark *inode_mark, u32 mask,
event->wd = wd;
event->sync_cookie = cookie;
event->name_len = len;
+ memset(&event->fsid, 0, sizeof(event->fsid)); // TODO
if (len)
strcpy(event->name, name->name);

+ if (file_handle_offset > 0) {
+ struct file_handle *fh = (struct file_handle *)((unsigned char *)event + file_handle_offset);
+ fh->handle_type = exportfs_encode_fid(inode, (struct fid *)fh->f_handle, &file_handle_dwords);
+ fh->handle_bytes = file_handle_dwords << 2;
+ event->mask |= IN_FID;
+ event->file_handle_size = sizeof(*fh) + (file_handle_dwords << 2);
+ }
+
ret = fsnotify_add_event(group, fsn_event, inotify_merge);
if (ret) {
/* Our event wasn't used in the end. Free it. */
diff --git a/fs/notify/inotify/inotify_user.c b/fs/notify/inotify/inotify_user.c
index 1c4bfdab008d..a6bf235314b8 100644
--- a/fs/notify/inotify/inotify_user.c
+++ b/fs/notify/inotify/inotify_user.c
@@ -133,6 +133,7 @@ static inline unsigned int inotify_arg_to_flags(u32 arg)
static inline u32 inotify_mask_to_arg(__u32 mask)
{
return mask & (IN_ALL_EVENTS | IN_ISDIR | IN_UNMOUNT | IN_IGNORED |
+ IN_FID |
IN_Q_OVERFLOW);
}

@@ -173,6 +174,7 @@ static struct fsnotify_event *get_one_event(struct fsnotify_group *group,
{
size_t event_size = sizeof(struct inotify_event);
struct fsnotify_event *event;
+ const struct inotify_event_info *ie;

event = fsnotify_peek_first_event(group);
if (!event)
@@ -181,6 +183,12 @@ static struct fsnotify_event *get_one_event(struct fsnotify_group *group,
pr_debug("%s: group=%p event=%p\n", __func__, group, event);

event_size += round_event_name_len(event);
+
+ ie = INOTIFY_E(event);
+ if (ie->mask & IN_FID)
+ event_size += roundup(sizeof(struct inotify_extra_fid) + ie->file_handle_size,
+ sizeof(struct inotify_event));
+
if (event_size > count)
return ERR_PTR(-EINVAL);

@@ -241,9 +249,42 @@ static ssize_t copy_event_to_user(struct fsnotify_group *group,
/* fill userspace with 0's */
if (clear_user(buf, pad_name_len - name_len))
return -EFAULT;
+
+ buf += pad_name_len - name_len;
event_size += pad_name_len;
}

+ if (event->mask & IN_FID) {
+ const size_t handle_size = event->file_handle_size;
+ const size_t extra_size = sizeof(struct inotify_extra_fid) + handle_size;
+ const size_t padded_extra_size = roundup(extra_size, sizeof(struct inotify_event));
+ const size_t padding = padded_extra_size - extra_size;
+
+ const struct inotify_extra_fid extra = {
+ .fsid = event->fsid,
+ .handle_size = handle_size,
+ .padding = padding,
+ };
+
+ if (copy_to_user(buf, &extra, sizeof(extra)))
+ return -EFAULT;
+
+ buf += sizeof(extra);
+
+ const unsigned char *handle = (const unsigned char *)(event + 1) + roundup(name_len + (name_len > 0), 4);
+ if (copy_to_user(buf, handle, handle_size))
+ return -EFAULT;
+
+ buf += handle_size;
+
+ if (clear_user(buf, padding))
+ return -EFAULT;
+
+ buf += padding;
+
+ event_size += padded_extra_size;
+ }
+
return event_size;
}

@@ -860,7 +901,7 @@ static int __init inotify_user_setup(void)
BUILD_BUG_ON(IN_IGNORED != FS_IN_IGNORED);
BUILD_BUG_ON(IN_ISDIR != FS_ISDIR);

- BUILD_BUG_ON(HWEIGHT32(ALL_INOTIFY_BITS) != 22);
+ BUILD_BUG_ON(HWEIGHT32(ALL_INOTIFY_BITS) != 23);

inotify_inode_mark_cachep = KMEM_CACHE(inotify_inode_mark,
SLAB_PANIC|SLAB_ACCOUNT);
diff --git a/include/linux/inotify.h b/include/linux/inotify.h
index 8d20caa1b268..0b23353a2f32 100644
--- a/include/linux/inotify.h
+++ b/include/linux/inotify.h
@@ -13,6 +13,7 @@
IN_CLOSE_NOWRITE | IN_OPEN | IN_MOVED_FROM | \
IN_MOVED_TO | IN_CREATE | IN_DELETE | \
IN_DELETE_SELF | IN_MOVE_SELF | IN_UNMOUNT | \
+ IN_FID | \
IN_Q_OVERFLOW | IN_IGNORED | IN_ONLYDIR | \
IN_DONT_FOLLOW | IN_EXCL_UNLINK | IN_MASK_ADD | \
IN_MASK_CREATE | IN_ISDIR | IN_ONESHOT)
diff --git a/include/uapi/linux/inotify.h b/include/uapi/linux/inotify.h
index b3e165853d5b..90520262e47b 100644
--- a/include/uapi/linux/inotify.h
+++ b/include/uapi/linux/inotify.h
@@ -26,6 +26,16 @@ struct inotify_event {
char name[]; /* stub for possible name */
};

+struct inotify_extra_fid {
+ __kernel_fsid_t fsid;
+
+ __u8 handle_size;
+ __u8 padding;
+ __u8 reserved1, reserved2;
+
+ unsigned char handle[];
+};
+
/* the following are legal, implemented events that user-space can watch for */
#define IN_ACCESS 0x00000001 /* File was accessed */
#define IN_MODIFY 0x00000002 /* File was modified */
@@ -50,6 +60,7 @@ struct inotify_event {
#define IN_MOVE (IN_MOVED_FROM | IN_MOVED_TO) /* moves */

/* special flags */
+#define IN_FID 0x00800000 /* return fsid and file_handle (struct inotify_extra_fid) */
#define IN_ONLYDIR 0x01000000 /* only watch the path if it is a directory */
#define IN_DONT_FOLLOW 0x02000000 /* don't follow a sym link */
#define IN_EXCL_UNLINK 0x04000000 /* exclude events on unlinked objects */
--
2.39.2