Re: [PATCH v2 01/10] vfs: move cap_convert_nscap() call into vfs_setxattr()

From: Miklos Szeredi
Date: Mon Jan 11 2021 - 08:50:36 EST


On Fri, Jan 01, 2021 at 11:35:16AM -0600, Eric W. Biederman wrote:
> Miklos Szeredi <mszeredi@xxxxxxxxxx> writes:
>
> > cap_convert_nscap() does permission checking as well as conversion of the
> > xattr value conditionally based on fs's user-ns.
> >
> > This is needed by overlayfs and probably other layered fs (ecryptfs) and is
> > what vfs_foo() is supposed to do anyway.
>
> Well crap.
>
> I just noticed this and it turns out this change is wrong.
>
> The problem is that it reads the rootid from the v3 fscap, using
> current_user_ns() and then writes it using the sb->s_user_ns.
>
> So any time the stacked filesystems sb->s_user_ns do not match or
> current_user_ns does not match sb->s_user_ns this could be a problem.
>
> In a stacked filesystem a second pass through vfs_setxattr will result
> in the rootid being translated a second time (with potentially the wrong
> namespaces). I think because of the security checks a we won't write
> something we shouldn't be able to write to the filesystem. Still we
> will be writing the wrong v3 fscap which can go quite badly.
>
> This doesn't look terribly difficult to fix.
>
> Probably convert this into a fs independent form using uids in
> init_user_ns at input and have cap_convert_nscap convert the v3 fscap
> into the filesystem dependent form. With some way for stackable
> filesystems to just skip converting it from the filesystem independent
> format.
>
> Uids in xattrs that are expected to go directly to disk, but aren't
> always suitable for going directly to disk are tricky.

I've been looking at this for a couple of days and can't say I clearly
understand everything yet.

For one: a v2 fscap is supposed to be equivalent to a v3 fscap with a rootid of
zero, right?

If so, why does cap_inode_getsecurity() treat them differently (v2 fscap
succeeding unconditionally while v3 one being either converted to v2, rejected
or left as v3 depending on current_user_ns())?

Anyway, here's a patch that I think fixes getxattr() layering for
security.capability. Does basically what you suggested. Slight change of
semantics vs. v1 caps, not sure if that is still needed, getxattr()/setxattr()
hasn't worked for these since the introduction of v3 in 4.14. Untested.

I still need to wrap my head around the permission requirements for the
setxattr() case...

Thanks,
Miklos

---
fs/overlayfs/super.c | 15 +++
include/linux/capability.h | 2
include/linux/fs.h | 1
security/commoncap.c | 210 ++++++++++++++++++++++++---------------------
4 files changed, 132 insertions(+), 96 deletions(-)

--- a/fs/overlayfs/super.c
+++ b/fs/overlayfs/super.c
@@ -395,6 +395,20 @@ static int ovl_remount(struct super_bloc
return ret;
}

+static int ovl_cap_get(struct dentry *dentry,
+ struct vfs_ns_cap_data *nscap)
+{
+ int res;
+ const struct cred *old_cred;
+ struct dentry *realdentry = ovl_dentry_real(dentry);
+
+ old_cred = ovl_override_creds(dentry->d_sb);
+ res = vfs_cap_get(realdentry, nscap);
+ revert_creds(old_cred);
+
+ return res;
+}
+
static const struct super_operations ovl_super_operations = {
.alloc_inode = ovl_alloc_inode,
.free_inode = ovl_free_inode,
@@ -405,6 +419,7 @@ static const struct super_operations ovl
.statfs = ovl_statfs,
.show_options = ovl_show_options,
.remount_fs = ovl_remount,
+ .cap_get = ovl_cap_get,
};

enum {
--- a/include/linux/capability.h
+++ b/include/linux/capability.h
@@ -272,4 +272,6 @@ extern int get_vfs_caps_from_disk(const

extern int cap_convert_nscap(struct dentry *dentry, const void **ivalue, size_t size);

+int vfs_cap_get(struct dentry *dentry, struct vfs_ns_cap_data *nscap);
+
#endif /* !_LINUX_CAPABILITY_H */
--- a/include/linux/fs.h
+++ b/include/linux/fs.h
@@ -1963,6 +1963,7 @@ struct super_operations {
struct shrink_control *);
long (*free_cached_objects)(struct super_block *,
struct shrink_control *);
+ int (*cap_get)(struct dentry *dentry, struct vfs_ns_cap_data *nscap);
};

/*
--- a/security/commoncap.c
+++ b/security/commoncap.c
@@ -341,6 +341,13 @@ static __u32 sansflags(__u32 m)
return m & ~VFS_CAP_FLAGS_EFFECTIVE;
}

+static bool is_v1header(size_t size, const struct vfs_cap_data *cap)
+{
+ if (size != XATTR_CAPS_SZ_1)
+ return false;
+ return sansflags(le32_to_cpu(cap->magic_etc)) == VFS_CAP_REVISION_1;
+}
+
static bool is_v2header(size_t size, const struct vfs_cap_data *cap)
{
if (size != XATTR_CAPS_SZ_2)
@@ -355,6 +362,72 @@ static bool is_v3header(size_t size, con
return sansflags(le32_to_cpu(cap->magic_etc)) == VFS_CAP_REVISION_3;
}

+static bool validheader(size_t size, const struct vfs_cap_data *cap)
+{
+ return is_v1header(size, cap) || is_v2header(size, cap) || is_v3header(size, cap);
+}
+
+static void cap_to_v3(const struct vfs_cap_data *cap,
+ struct vfs_ns_cap_data *nscap)
+{
+ u32 magic, nsmagic;
+
+ nsmagic = VFS_CAP_REVISION_3;
+ magic = le32_to_cpu(cap->magic_etc);
+ if (magic & VFS_CAP_FLAGS_EFFECTIVE)
+ nsmagic |= VFS_CAP_FLAGS_EFFECTIVE;
+ nscap->magic_etc = cpu_to_le32(nsmagic);
+ nscap->rootid = cpu_to_le32(0);
+}
+
+static int default_cap_get(struct dentry *dentry, struct vfs_ns_cap_data *nscap)
+{
+ int err;
+ ssize_t size;
+ kuid_t kroot;
+ uid_t root, mappedroot;
+ char *tmpbuf = NULL;
+ struct vfs_cap_data *cap;
+ struct user_namespace *fs_ns = dentry->d_sb->s_user_ns;
+
+ size = vfs_getxattr_alloc(dentry, XATTR_NAME_CAPS, &tmpbuf,
+ sizeof(struct vfs_ns_cap_data), GFP_NOFS);
+ if (size < 0)
+ return size;
+
+ cap = (struct vfs_cap_data *) tmpbuf;
+ err = -EINVAL;
+ if (!validheader(size, cap))
+ goto out;
+
+ memset(nscap, 0, sizeof(*nscap));
+ memcpy(nscap, tmpbuf, size);
+ if (!is_v3header(size, cap))
+ cap_to_v3(cap, nscap);
+
+ /* Convert rootid from fs user namespace to init user namespace */
+ root = le32_to_cpu(nscap->rootid);
+ kroot = make_kuid(fs_ns, root);
+ mappedroot = from_kuid(&init_user_ns, kroot);
+ nscap->rootid = cpu_to_le32(mappedroot);
+
+ err = 0;
+out:
+ kfree(tmpbuf);
+ return err;
+}
+
+int vfs_cap_get(struct dentry *dentry, struct vfs_ns_cap_data *nscap)
+{
+ struct super_block *sb = dentry->d_sb;
+
+ if (sb->s_op->cap_get)
+ return sb->s_op->cap_get(dentry, nscap);
+ else
+ return default_cap_get(dentry, nscap);
+}
+EXPORT_SYMBOL(vfs_cap_get);
+
/*
* getsecurity: We are called for security.* before any attempt to read the
* xattr from the inode itself.
@@ -369,14 +442,15 @@ static bool is_v3header(size_t size, con
int cap_inode_getsecurity(struct inode *inode, const char *name, void **buffer,
bool alloc)
{
- int size, ret;
+ int ret;
+ ssize_t size;
kuid_t kroot;
+ __le32 nsmagic, magic;
uid_t root, mappedroot;
- char *tmpbuf = NULL;
+ void *tmpbuf = NULL;
struct vfs_cap_data *cap;
- struct vfs_ns_cap_data *nscap;
+ struct vfs_ns_cap_data nscap;
struct dentry *dentry;
- struct user_namespace *fs_ns;

if (strcmp(name, "capability") != 0)
return -EOPNOTSUPP;
@@ -385,67 +459,50 @@ int cap_inode_getsecurity(struct inode *
if (!dentry)
return -EINVAL;

- size = sizeof(struct vfs_ns_cap_data);
- ret = (int) vfs_getxattr_alloc(dentry, XATTR_NAME_CAPS,
- &tmpbuf, size, GFP_NOFS);
+ ret = vfs_cap_get(dentry, &nscap);
dput(dentry);

if (ret < 0)
return ret;

- fs_ns = inode->i_sb->s_user_ns;
- cap = (struct vfs_cap_data *) tmpbuf;
- if (is_v2header((size_t) ret, cap)) {
- /* If this is sizeof(vfs_cap_data) then we're ok with the
- * on-disk value, so return that. */
- if (alloc)
- *buffer = tmpbuf;
- else
- kfree(tmpbuf);
- return ret;
- } else if (!is_v3header((size_t) ret, cap)) {
- kfree(tmpbuf);
- return -EINVAL;
- }
+ tmpbuf = kmalloc(sizeof(struct vfs_ns_cap_data), GFP_NOFS);
+ if (!tmpbuf)
+ return -ENOMEM;

- nscap = (struct vfs_ns_cap_data *) tmpbuf;
- root = le32_to_cpu(nscap->rootid);
- kroot = make_kuid(fs_ns, root);
+ root = le32_to_cpu(nscap.rootid);
+ kroot = make_kuid(&init_user_ns, root);

/* If the root kuid maps to a valid uid in current ns, then return
* this as a nscap. */
mappedroot = from_kuid(current_user_ns(), kroot);
if (mappedroot != (uid_t)-1 && mappedroot != (uid_t)0) {
+ size = sizeof(struct vfs_cap_data);
if (alloc) {
*buffer = tmpbuf;
- nscap->rootid = cpu_to_le32(mappedroot);
- } else
- kfree(tmpbuf);
- return size;
+ tmpbuf = NULL;
+ nscap.rootid = cpu_to_le32(mappedroot);
+ memcpy(*buffer, &nscap, size);
+ }
+ goto out;
}

- if (!rootid_owns_currentns(kroot)) {
- kfree(tmpbuf);
- return -EOPNOTSUPP;
- }
+ size = -EOPNOTSUPP;
+ if (!rootid_owns_currentns(kroot))
+ goto out;

/* This comes from a parent namespace. Return as a v2 capability */
size = sizeof(struct vfs_cap_data);
if (alloc) {
- *buffer = kmalloc(size, GFP_ATOMIC);
- if (*buffer) {
- struct vfs_cap_data *cap = *buffer;
- __le32 nsmagic, magic;
- magic = VFS_CAP_REVISION_2;
- nsmagic = le32_to_cpu(nscap->magic_etc);
- if (nsmagic & VFS_CAP_FLAGS_EFFECTIVE)
- magic |= VFS_CAP_FLAGS_EFFECTIVE;
- memcpy(&cap->data, &nscap->data, sizeof(__le32) * 2 * VFS_CAP_U32);
- cap->magic_etc = cpu_to_le32(magic);
- } else {
- size = -ENOMEM;
- }
+ cap = *buffer = tmpbuf;
+ tmpbuf = NULL;
+ magic = VFS_CAP_REVISION_2;
+ nsmagic = le32_to_cpu(nscap.magic_etc);
+ if (nsmagic & VFS_CAP_FLAGS_EFFECTIVE)
+ magic |= VFS_CAP_FLAGS_EFFECTIVE;
+ memcpy(&cap->data, &nscap.data, sizeof(__le32) * 2 * VFS_CAP_U32);
+ cap->magic_etc = cpu_to_le32(magic);
}
+out:
kfree(tmpbuf);
return size;
}
@@ -462,11 +519,6 @@ static kuid_t rootid_from_xattr(const vo
return make_kuid(task_ns, rootid);
}

-static bool validheader(size_t size, const struct vfs_cap_data *cap)
-{
- return is_v2header(size, cap) || is_v3header(size, cap);
-}
-
/*
* User requested a write of security.capability. If needed, update the
* xattr to change from v2 to v3, or to fixup the v3 rootid.
@@ -570,74 +622,40 @@ static inline int bprm_caps_from_vfs_cap
int get_vfs_caps_from_disk(const struct dentry *dentry, struct cpu_vfs_cap_data *cpu_caps)
{
struct inode *inode = d_backing_inode(dentry);
- __u32 magic_etc;
- unsigned tocopy, i;
- int size;
- struct vfs_ns_cap_data data, *nscaps = &data;
- struct vfs_cap_data *caps = (struct vfs_cap_data *) &data;
- kuid_t rootkuid;
- struct user_namespace *fs_ns;
+ unsigned int i;
+ int ret;
+ struct vfs_ns_cap_data nscaps;

memset(cpu_caps, 0, sizeof(struct cpu_vfs_cap_data));

if (!inode)
return -ENODATA;

- fs_ns = inode->i_sb->s_user_ns;
- size = __vfs_getxattr((struct dentry *)dentry, inode,
- XATTR_NAME_CAPS, &data, XATTR_CAPS_SZ);
- if (size == -ENODATA || size == -EOPNOTSUPP)
+ ret = vfs_cap_get((struct dentry *) dentry, &nscaps);
+ if (ret == -ENODATA || ret == -EOPNOTSUPP)
/* no data, that's ok */
return -ENODATA;

- if (size < 0)
- return size;
-
- if (size < sizeof(magic_etc))
- return -EINVAL;
-
- cpu_caps->magic_etc = magic_etc = le32_to_cpu(caps->magic_etc);
+ if (ret < 0)
+ return ret;

- rootkuid = make_kuid(fs_ns, 0);
- switch (magic_etc & VFS_CAP_REVISION_MASK) {
- case VFS_CAP_REVISION_1:
- if (size != XATTR_CAPS_SZ_1)
- return -EINVAL;
- tocopy = VFS_CAP_U32_1;
- break;
- case VFS_CAP_REVISION_2:
- if (size != XATTR_CAPS_SZ_2)
- return -EINVAL;
- tocopy = VFS_CAP_U32_2;
- break;
- case VFS_CAP_REVISION_3:
- if (size != XATTR_CAPS_SZ_3)
- return -EINVAL;
- tocopy = VFS_CAP_U32_3;
- rootkuid = make_kuid(fs_ns, le32_to_cpu(nscaps->rootid));
- break;
+ cpu_caps->magic_etc = le32_to_cpu(nscaps.magic_etc);
+ cpu_caps->rootid = make_kuid(&init_user_ns, le32_to_cpu(nscaps.rootid));

- default:
- return -EINVAL;
- }
/* Limit the caps to the mounter of the filesystem
* or the more limited uid specified in the xattr.
*/
- if (!rootid_owns_currentns(rootkuid))
+ if (!rootid_owns_currentns(cpu_caps->rootid))
return -ENODATA;

CAP_FOR_EACH_U32(i) {
- if (i >= tocopy)
- break;
- cpu_caps->permitted.cap[i] = le32_to_cpu(caps->data[i].permitted);
- cpu_caps->inheritable.cap[i] = le32_to_cpu(caps->data[i].inheritable);
+ cpu_caps->permitted.cap[i] = le32_to_cpu(nscaps.data[i].permitted);
+ cpu_caps->inheritable.cap[i] = le32_to_cpu(nscaps.data[i].inheritable);
}

cpu_caps->permitted.cap[CAP_LAST_U32] &= CAP_LAST_U32_VALID_MASK;
cpu_caps->inheritable.cap[CAP_LAST_U32] &= CAP_LAST_U32_VALID_MASK;

- cpu_caps->rootid = rootkuid;
-
return 0;
}