From 8654df4e2ac9704905198d63845554c2ddf6a93f Mon Sep 17 00:00:00 2001 From: "Eric W. Biederman" Date: Thu, 9 Jun 2016 16:06:06 -0500 Subject: mnt: Refactor fs_fully_visible into mount_too_revealing Replace the call of fs_fully_visible in do_new_mount from before the new superblock is allocated with a call of mount_too_revealing after the superblock is allocated. This winds up being a much better location for maintainability of the code. The first change this enables is the replacement of FS_USERNS_VISIBLE with SB_I_USERNS_VISIBLE. Moving the flag from struct filesystem_type to sb_iflags on the superblock. Unfortunately mount_too_revealing fundamentally needs to touch mnt_flags adding several MNT_LOCKED_XXX flags at the appropriate times. If the mnt_flags did not need to be touched the code could be easily moved into the filesystem specific mount code. Acked-by: Seth Forshee Signed-off-by: "Eric W. Biederman" --- fs/namespace.c | 38 +++++++++++++++++++++++++------------- fs/proc/inode.c | 1 + fs/proc/root.c | 2 +- fs/sysfs/mount.c | 4 ++-- 4 files changed, 29 insertions(+), 16 deletions(-) (limited to 'fs') diff --git a/fs/namespace.c b/fs/namespace.c index 783004af5707..1a69aa786975 100644 --- a/fs/namespace.c +++ b/fs/namespace.c @@ -2375,7 +2375,7 @@ unlock: return err; } -static bool fs_fully_visible(struct file_system_type *fs_type, int *new_mnt_flags); +static bool mount_too_revealing(struct vfsmount *mnt, int *new_mnt_flags); /* * create a new mount for userspace and request it to be added into the @@ -2408,12 +2408,6 @@ static int do_new_mount(struct path *path, const char *fstype, int flags, flags |= MS_NODEV; mnt_flags |= MNT_NODEV | MNT_LOCK_NODEV; } - if (type->fs_flags & FS_USERNS_VISIBLE) { - if (!fs_fully_visible(type, &mnt_flags)) { - put_filesystem(type); - return -EPERM; - } - } } mnt = vfs_kern_mount(type, flags, name, data); @@ -2425,6 +2419,11 @@ static int do_new_mount(struct path *path, const char *fstype, int flags, if (IS_ERR(mnt)) return PTR_ERR(mnt); + if (mount_too_revealing(mnt, &mnt_flags)) { + mntput(mnt); + return -EPERM; + } + err = do_add_mount(real_mount(mnt), path, mnt_flags); if (err) mntput(mnt); @@ -3216,22 +3215,19 @@ bool current_chrooted(void) return chrooted; } -static bool fs_fully_visible(struct file_system_type *type, int *new_mnt_flags) +static bool mnt_already_visible(struct mnt_namespace *ns, struct vfsmount *new, + int *new_mnt_flags) { - struct mnt_namespace *ns = current->nsproxy->mnt_ns; int new_flags = *new_mnt_flags; struct mount *mnt; bool visible = false; - if (unlikely(!ns)) - return false; - down_read(&namespace_sem); list_for_each_entry(mnt, &ns->list, mnt_list) { struct mount *child; int mnt_flags; - if (mnt->mnt.mnt_sb->s_type != type) + if (mnt->mnt.mnt_sb->s_type != new->mnt_sb->s_type) continue; /* This mount is not fully visible if it's root directory @@ -3298,6 +3294,22 @@ found: return visible; } +static bool mount_too_revealing(struct vfsmount *mnt, int *new_mnt_flags) +{ + struct mnt_namespace *ns = current->nsproxy->mnt_ns; + unsigned long s_iflags; + + if (ns->user_ns == &init_user_ns) + return false; + + /* Can this filesystem be too revealing? */ + s_iflags = mnt->mnt_sb->s_iflags; + if (!(s_iflags & SB_I_USERNS_VISIBLE)) + return false; + + return !mnt_already_visible(ns, mnt, new_mnt_flags); +} + static struct ns_common *mntns_get(struct task_struct *task) { struct ns_common *ns = NULL; diff --git a/fs/proc/inode.c b/fs/proc/inode.c index 42305ddcbaa0..78fa452d65ed 100644 --- a/fs/proc/inode.c +++ b/fs/proc/inode.c @@ -462,6 +462,7 @@ int proc_fill_super(struct super_block *s) struct inode *root_inode; int ret; + s->s_iflags |= SB_I_USERNS_VISIBLE; s->s_flags |= MS_NODIRATIME | MS_NOSUID | MS_NOEXEC; s->s_blocksize = 1024; s->s_blocksize_bits = 10; diff --git a/fs/proc/root.c b/fs/proc/root.c index 55bc7d6c8aac..a1b2860fec62 100644 --- a/fs/proc/root.c +++ b/fs/proc/root.c @@ -158,7 +158,7 @@ static struct file_system_type proc_fs_type = { .name = "proc", .mount = proc_mount, .kill_sb = proc_kill_sb, - .fs_flags = FS_USERNS_VISIBLE | FS_USERNS_MOUNT, + .fs_flags = FS_USERNS_MOUNT, }; void __init proc_root_init(void) diff --git a/fs/sysfs/mount.c b/fs/sysfs/mount.c index f3db82071cfb..f31e36994dfb 100644 --- a/fs/sysfs/mount.c +++ b/fs/sysfs/mount.c @@ -42,7 +42,7 @@ static struct dentry *sysfs_mount(struct file_system_type *fs_type, kobj_ns_drop(KOBJ_NS_TYPE_NET, ns); else if (new_sb) /* Userspace would break if executables appear on sysfs */ - root->d_sb->s_iflags |= SB_I_NOEXEC; + root->d_sb->s_iflags |= SB_I_USERNS_VISIBLE | SB_I_NOEXEC; return root; } @@ -59,7 +59,7 @@ static struct file_system_type sysfs_fs_type = { .name = "sysfs", .mount = sysfs_mount, .kill_sb = sysfs_kill_sb, - .fs_flags = FS_USERNS_VISIBLE | FS_USERNS_MOUNT, + .fs_flags = FS_USERNS_MOUNT, }; int __init sysfs_init(void) -- cgit v1.2.1 From d91ee87d8d85a0808c01787e8b4a6b48f2ba487b Mon Sep 17 00:00:00 2001 From: "Eric W. Biederman" Date: Mon, 23 May 2016 14:51:59 -0500 Subject: vfs: Pass data, ns, and ns->userns to mount_ns Today what is normally called data (the mount options) is not passed to fill_super through mount_ns. Pass the mount options and the namespace separately to mount_ns so that filesystems such as proc that have mount options, can use mount_ns. Pass the user namespace to mount_ns so that the standard permission check that verifies the mounter has permissions over the namespace can be performed in mount_ns instead of in each filesystems .mount method. Thus removing the duplication between mqueuefs and proc in terms of permission checks. The extra permission check does not currently affect the rpc_pipefs filesystem and the nfsd filesystem as those filesystems do not currently allow unprivileged mounts. Without unpvileged mounts it is guaranteed that the caller has already passed capable(CAP_SYS_ADMIN) which guarantees extra permission check will pass. Update rpc_pipefs and the nfsd filesystem to ensure that the network namespace reference is always taken in fill_super and always put in kill_sb so that the logic is simpler and so that errors originating inside of fill_super do not cause a network namespace leak. Acked-by: Seth Forshee Signed-off-by: "Eric W. Biederman" --- fs/nfsd/nfsctl.c | 13 ++++--------- fs/super.c | 13 ++++++++++--- 2 files changed, 14 insertions(+), 12 deletions(-) (limited to 'fs') diff --git a/fs/nfsd/nfsctl.c b/fs/nfsd/nfsctl.c index 9690cb4dd588..5a6ae2522266 100644 --- a/fs/nfsd/nfsctl.c +++ b/fs/nfsd/nfsctl.c @@ -1154,20 +1154,15 @@ static int nfsd_fill_super(struct super_block * sb, void * data, int silent) #endif /* last one */ {""} }; - struct net *net = data; - int ret; - - ret = simple_fill_super(sb, 0x6e667364, nfsd_files); - if (ret) - return ret; - sb->s_fs_info = get_net(net); - return 0; + get_net(sb->s_fs_info); + return simple_fill_super(sb, 0x6e667364, nfsd_files); } static struct dentry *nfsd_mount(struct file_system_type *fs_type, int flags, const char *dev_name, void *data) { - return mount_ns(fs_type, flags, current->nsproxy->net_ns, nfsd_fill_super); + struct net *net = current->nsproxy->net_ns; + return mount_ns(fs_type, flags, data, net, net->user_ns, nfsd_fill_super); } static void nfsd_umount(struct super_block *sb) diff --git a/fs/super.c b/fs/super.c index d78b9847e6cb..fd65667832e5 100644 --- a/fs/super.c +++ b/fs/super.c @@ -918,12 +918,19 @@ static int ns_set_super(struct super_block *sb, void *data) return set_anon_super(sb, NULL); } -struct dentry *mount_ns(struct file_system_type *fs_type, int flags, - void *data, int (*fill_super)(struct super_block *, void *, int)) +struct dentry *mount_ns(struct file_system_type *fs_type, + int flags, void *data, void *ns, struct user_namespace *user_ns, + int (*fill_super)(struct super_block *, void *, int)) { struct super_block *sb; - sb = sget(fs_type, ns_test_super, ns_set_super, flags, data); + /* Don't allow mounting unless the caller has CAP_SYS_ADMIN + * over the namespace. + */ + if (!(flags & MS_KERNMOUNT) && !ns_capable(user_ns, CAP_SYS_ADMIN)) + return ERR_PTR(-EPERM); + + sb = sget(fs_type, ns_test_super, ns_set_super, flags, ns); if (IS_ERR(sb)) return ERR_CAST(sb); -- cgit v1.2.1 From e94591d0d90c13166cb6eb54ce5f96ed13d81b55 Mon Sep 17 00:00:00 2001 From: "Eric W. Biederman" Date: Thu, 9 Jun 2016 15:32:10 -0500 Subject: proc: Convert proc_mount to use mount_ns. Move the call of get_pid_ns, the call of proc_parse_options, and the setting of s_iflags into proc_fill_super so that mount_ns can be used. Convert proc_mount to call mount_ns and remove the now unnecessary code. Acked-by: Seth Forshee Reviewed-by: Djalal Harouni Signed-off-by: "Eric W. Biederman" --- fs/proc/inode.c | 9 +++++++-- fs/proc/internal.h | 3 ++- fs/proc/root.c | 52 ++++------------------------------------------------ 3 files changed, 13 insertions(+), 51 deletions(-) (limited to 'fs') diff --git a/fs/proc/inode.c b/fs/proc/inode.c index 78fa452d65ed..f4817efb25a6 100644 --- a/fs/proc/inode.c +++ b/fs/proc/inode.c @@ -457,12 +457,17 @@ struct inode *proc_get_inode(struct super_block *sb, struct proc_dir_entry *de) return inode; } -int proc_fill_super(struct super_block *s) +int proc_fill_super(struct super_block *s, void *data, int silent) { + struct pid_namespace *ns = get_pid_ns(s->s_fs_info); struct inode *root_inode; int ret; - s->s_iflags |= SB_I_USERNS_VISIBLE; + if (!proc_parse_options(data, ns)) + return -EINVAL; + + /* User space would break if executables appear on proc */ + s->s_iflags |= SB_I_USERNS_VISIBLE | SB_I_NOEXEC; s->s_flags |= MS_NODIRATIME | MS_NOSUID | MS_NOEXEC; s->s_blocksize = 1024; s->s_blocksize_bits = 10; diff --git a/fs/proc/internal.h b/fs/proc/internal.h index aa2781095bd1..7931c558c192 100644 --- a/fs/proc/internal.h +++ b/fs/proc/internal.h @@ -212,7 +212,7 @@ extern const struct inode_operations proc_pid_link_inode_operations; extern void proc_init_inodecache(void); extern struct inode *proc_get_inode(struct super_block *, struct proc_dir_entry *); -extern int proc_fill_super(struct super_block *); +extern int proc_fill_super(struct super_block *, void *data, int flags); extern void proc_entry_rundown(struct proc_dir_entry *); /* @@ -268,6 +268,7 @@ static inline void proc_tty_init(void) {} * root.c */ extern struct proc_dir_entry proc_root; +extern int proc_parse_options(char *options, struct pid_namespace *pid); extern void proc_self_init(void); extern int proc_remount(struct super_block *, int *, char *); diff --git a/fs/proc/root.c b/fs/proc/root.c index a1b2860fec62..8d3e484055a6 100644 --- a/fs/proc/root.c +++ b/fs/proc/root.c @@ -23,21 +23,6 @@ #include "internal.h" -static int proc_test_super(struct super_block *sb, void *data) -{ - return sb->s_fs_info == data; -} - -static int proc_set_super(struct super_block *sb, void *data) -{ - int err = set_anon_super(sb, NULL); - if (!err) { - struct pid_namespace *ns = (struct pid_namespace *)data; - sb->s_fs_info = get_pid_ns(ns); - } - return err; -} - enum { Opt_gid, Opt_hidepid, Opt_err, }; @@ -48,7 +33,7 @@ static const match_table_t tokens = { {Opt_err, NULL}, }; -static int proc_parse_options(char *options, struct pid_namespace *pid) +int proc_parse_options(char *options, struct pid_namespace *pid) { char *p; substring_t args[MAX_OPT_ARGS]; @@ -100,45 +85,16 @@ int proc_remount(struct super_block *sb, int *flags, char *data) static struct dentry *proc_mount(struct file_system_type *fs_type, int flags, const char *dev_name, void *data) { - int err; - struct super_block *sb; struct pid_namespace *ns; - char *options; if (flags & MS_KERNMOUNT) { - ns = (struct pid_namespace *)data; - options = NULL; + ns = data; + data = NULL; } else { ns = task_active_pid_ns(current); - options = data; - - /* Does the mounter have privilege over the pid namespace? */ - if (!ns_capable(ns->user_ns, CAP_SYS_ADMIN)) - return ERR_PTR(-EPERM); - } - - sb = sget(fs_type, proc_test_super, proc_set_super, flags, ns); - if (IS_ERR(sb)) - return ERR_CAST(sb); - - if (!proc_parse_options(options, ns)) { - deactivate_locked_super(sb); - return ERR_PTR(-EINVAL); - } - - if (!sb->s_root) { - err = proc_fill_super(sb); - if (err) { - deactivate_locked_super(sb); - return ERR_PTR(err); - } - - sb->s_flags |= MS_ACTIVE; - /* User space would break if executables appear on proc */ - sb->s_iflags |= SB_I_NOEXEC; } - return dget(sb->s_root); + return mount_ns(fs_type, flags, data, ns, ns->user_ns, proc_fill_super); } static void proc_kill_sb(struct super_block *sb) -- cgit v1.2.1 From 6e4eab577a0cae15b3da9b888cff16fe57981b3e Mon Sep 17 00:00:00 2001 From: "Eric W. Biederman" Date: Tue, 24 May 2016 09:29:01 -0500 Subject: fs: Add user namespace member to struct super_block Start marking filesystems with a user namespace owner, s_user_ns. In this change this is only used for permission checks of who may mount a filesystem. Ultimately s_user_ns will be used for translating ids and checking capabilities for filesystems mounted from user namespaces. The default policy for setting s_user_ns is implemented in sget(), which arranges for s_user_ns to be set to current_user_ns() and to ensure that the mounter of the filesystem has CAP_SYS_ADMIN in that user_ns. The guts of sget are split out into another function sget_userns(). The function sget_userns calls alloc_super with the specified user namespace or it verifies the existing superblock that was found has the expected user namespace, and fails with EBUSY when it is not. This failing prevents users with the wrong privileges mounting a filesystem. The reason for the split of sget_userns from sget is that in some cases such as mount_ns and kernfs_mount_ns a different policy for permission checking of mounts and setting s_user_ns is necessary, and the existence of sget_userns() allows those policies to be implemented. The helper mount_ns is expected to be used for filesystems such as proc and mqueuefs which present per namespace information. The function mount_ns is modified to call sget_userns instead of sget to ensure the user namespace owner of the namespace whose information is presented by the filesystem is used on the superblock. For sysfs and cgroup the appropriate permission checks are already in place, and kernfs_mount_ns is modified to call sget_userns so that the init_user_ns is the only user namespace used. For the cgroup filesystem cgroup namespace mounts are bind mounts of a subset of the full cgroup filesystem and as such s_user_ns must be the same for all of them as there is only a single superblock. Mounts of sysfs that vary based on the network namespace could in principle change s_user_ns but it keeps the analysis and implementation of kernfs simpler if that is not supported, and at present there appear to be no benefits from supporting a different s_user_ns on any sysfs mount. Getting the details of setting s_user_ns correct has been a long process. Thanks to Pavel Tikhorirorv who spotted a leak in sget_userns. Thanks to Seth Forshee who has kept the work alive. Thanks-to: Seth Forshee Thanks-to: Pavel Tikhomirov Acked-by: Seth Forshee Signed-off-by: Eric W. Biederman --- fs/kernfs/mount.c | 3 ++- fs/super.c | 52 ++++++++++++++++++++++++++++++++++++++++++++++------ 2 files changed, 48 insertions(+), 7 deletions(-) (limited to 'fs') diff --git a/fs/kernfs/mount.c b/fs/kernfs/mount.c index 63534f5f9073..d90d574c15a2 100644 --- a/fs/kernfs/mount.c +++ b/fs/kernfs/mount.c @@ -241,7 +241,8 @@ struct dentry *kernfs_mount_ns(struct file_system_type *fs_type, int flags, info->root = root; info->ns = ns; - sb = sget(fs_type, kernfs_test_super, kernfs_set_super, flags, info); + sb = sget_userns(fs_type, kernfs_test_super, kernfs_set_super, flags, + &init_user_ns, info); if (IS_ERR(sb) || sb->s_fs_info != info) kfree(info); if (IS_ERR(sb)) diff --git a/fs/super.c b/fs/super.c index fd65667832e5..874c7e3ebb8f 100644 --- a/fs/super.c +++ b/fs/super.c @@ -33,6 +33,7 @@ #include #include #include +#include #include "internal.h" @@ -165,6 +166,7 @@ static void destroy_super(struct super_block *s) list_lru_destroy(&s->s_inode_lru); security_sb_free(s); WARN_ON(!list_empty(&s->s_mounts)); + put_user_ns(s->s_user_ns); kfree(s->s_subtype); kfree(s->s_options); call_rcu(&s->rcu, destroy_super_rcu); @@ -174,11 +176,13 @@ static void destroy_super(struct super_block *s) * alloc_super - create new superblock * @type: filesystem type superblock should belong to * @flags: the mount flags + * @user_ns: User namespace for the super_block * * Allocates and initializes a new &struct super_block. alloc_super() * returns a pointer new superblock or %NULL if allocation had failed. */ -static struct super_block *alloc_super(struct file_system_type *type, int flags) +static struct super_block *alloc_super(struct file_system_type *type, int flags, + struct user_namespace *user_ns) { struct super_block *s = kzalloc(sizeof(struct super_block), GFP_USER); static const struct super_operations default_op; @@ -188,6 +192,7 @@ static struct super_block *alloc_super(struct file_system_type *type, int flags) return NULL; INIT_LIST_HEAD(&s->s_mounts); + s->s_user_ns = get_user_ns(user_ns); if (security_sb_alloc(s)) goto fail; @@ -443,17 +448,18 @@ void generic_shutdown_super(struct super_block *sb) EXPORT_SYMBOL(generic_shutdown_super); /** - * sget - find or create a superblock + * sget_userns - find or create a superblock * @type: filesystem type superblock should belong to * @test: comparison callback * @set: setup callback * @flags: mount flags + * @user_ns: User namespace for the super_block * @data: argument to each of them */ -struct super_block *sget(struct file_system_type *type, +struct super_block *sget_userns(struct file_system_type *type, int (*test)(struct super_block *,void *), int (*set)(struct super_block *,void *), - int flags, + int flags, struct user_namespace *user_ns, void *data) { struct super_block *s = NULL; @@ -466,6 +472,14 @@ retry: hlist_for_each_entry(old, &type->fs_supers, s_instances) { if (!test(old, data)) continue; + if (user_ns != old->s_user_ns) { + spin_unlock(&sb_lock); + if (s) { + up_write(&s->s_umount); + destroy_super(s); + } + return ERR_PTR(-EBUSY); + } if (!grab_super(old)) goto retry; if (s) { @@ -478,7 +492,7 @@ retry: } if (!s) { spin_unlock(&sb_lock); - s = alloc_super(type, flags); + s = alloc_super(type, flags, user_ns); if (!s) return ERR_PTR(-ENOMEM); goto retry; @@ -501,6 +515,31 @@ retry: return s; } +EXPORT_SYMBOL(sget_userns); + +/** + * sget - find or create a superblock + * @type: filesystem type superblock should belong to + * @test: comparison callback + * @set: setup callback + * @flags: mount flags + * @data: argument to each of them + */ +struct super_block *sget(struct file_system_type *type, + int (*test)(struct super_block *,void *), + int (*set)(struct super_block *,void *), + int flags, + void *data) +{ + struct user_namespace *user_ns = current_user_ns(); + + /* Ensure the requestor has permissions over the target filesystem */ + if (!(flags & MS_KERNMOUNT) && !ns_capable(user_ns, CAP_SYS_ADMIN)) + return ERR_PTR(-EPERM); + + return sget_userns(type, test, set, flags, user_ns, data); +} + EXPORT_SYMBOL(sget); void drop_super(struct super_block *sb) @@ -930,7 +969,8 @@ struct dentry *mount_ns(struct file_system_type *fs_type, if (!(flags & MS_KERNMOUNT) && !ns_capable(user_ns, CAP_SYS_ADMIN)) return ERR_PTR(-EPERM); - sb = sget(fs_type, ns_test_super, ns_set_super, flags, ns); + sb = sget_userns(fs_type, ns_test_super, ns_set_super, flags, + user_ns, ns); if (IS_ERR(sb)) return ERR_CAST(sb); -- cgit v1.2.1 From a001e74cef34d95ede6535ef521011c612657a3a Mon Sep 17 00:00:00 2001 From: "Eric W. Biederman" Date: Mon, 6 Jun 2016 15:48:04 -0500 Subject: mnt: Move the FS_USERNS_MOUNT check into sget_userns Allowing a filesystem to be mounted by other than root in the initial user namespace is a filesystem property not a mount namespace property and as such should be checked in filesystem specific code. Move the FS_USERNS_MOUNT test into super.c:sget_userns(). Acked-by: Seth Forshee Signed-off-by: "Eric W. Biederman" --- fs/namespace.c | 4 ---- fs/super.c | 4 ++++ 2 files changed, 4 insertions(+), 4 deletions(-) (limited to 'fs') diff --git a/fs/namespace.c b/fs/namespace.c index 1a69aa786975..2e13f6cfe5df 100644 --- a/fs/namespace.c +++ b/fs/namespace.c @@ -2397,10 +2397,6 @@ static int do_new_mount(struct path *path, const char *fstype, int flags, return -ENODEV; if (user_ns != &init_user_ns) { - if (!(type->fs_flags & FS_USERNS_MOUNT)) { - put_filesystem(type); - return -EPERM; - } /* Only in special cases allow devices from mounts * created outside the initial user namespace. */ diff --git a/fs/super.c b/fs/super.c index 874c7e3ebb8f..78790ada7191 100644 --- a/fs/super.c +++ b/fs/super.c @@ -466,6 +466,10 @@ struct super_block *sget_userns(struct file_system_type *type, struct super_block *old; int err; + if (!(flags & MS_KERNMOUNT) && + !(type->fs_flags & FS_USERNS_MOUNT) && + !capable(CAP_SYS_ADMIN)) + return ERR_PTR(-EPERM); retry: spin_lock(&sb_lock); if (test) { -- cgit v1.2.1 From 29a517c232d21a717aecea29838aeb07131f6196 Mon Sep 17 00:00:00 2001 From: "Eric W. Biederman" Date: Fri, 10 Jun 2016 13:03:05 -0500 Subject: kernfs: The cgroup filesystem also benefits from SB_I_NOEXEC The cgroup filesystem is in the same boat as sysfs. No one ever permits executables of any kind on the cgroup filesystem, and there is no reasonable future case to support executables in the future. Therefore move the setting of SB_I_NOEXEC which makes the code proof against future mistakes of accidentally creating executables from sysfs to kernfs itself. Making the code simpler and covering the sysfs, cgroup, and cgroup2 filesystems. Acked-by: Seth Forshee Signed-off-by: "Eric W. Biederman" --- fs/kernfs/mount.c | 2 ++ fs/sysfs/mount.c | 3 +-- 2 files changed, 3 insertions(+), 2 deletions(-) (limited to 'fs') diff --git a/fs/kernfs/mount.c b/fs/kernfs/mount.c index d90d574c15a2..1443df670260 100644 --- a/fs/kernfs/mount.c +++ b/fs/kernfs/mount.c @@ -152,6 +152,8 @@ static int kernfs_fill_super(struct super_block *sb, unsigned long magic) struct dentry *root; info->sb = sb; + /* Userspace would break if executables appear on sysfs */ + sb->s_iflags |= SB_I_NOEXEC; sb->s_blocksize = PAGE_SIZE; sb->s_blocksize_bits = PAGE_SHIFT; sb->s_magic = magic; diff --git a/fs/sysfs/mount.c b/fs/sysfs/mount.c index f31e36994dfb..20b8f82e115b 100644 --- a/fs/sysfs/mount.c +++ b/fs/sysfs/mount.c @@ -41,8 +41,7 @@ static struct dentry *sysfs_mount(struct file_system_type *fs_type, if (IS_ERR(root) || !new_sb) kobj_ns_drop(KOBJ_NS_TYPE_NET, ns); else if (new_sb) - /* Userspace would break if executables appear on sysfs */ - root->d_sb->s_iflags |= SB_I_USERNS_VISIBLE | SB_I_NOEXEC; + root->d_sb->s_iflags |= SB_I_USERNS_VISIBLE; return root; } -- cgit v1.2.1 From a2982cc922c3068783eb9a1f77a5626a1ec36a1f Mon Sep 17 00:00:00 2001 From: "Eric W. Biederman" Date: Thu, 9 Jun 2016 15:34:02 -0500 Subject: vfs: Generalize filesystem nodev handling. Introduce a function may_open_dev that tests MNT_NODEV and a new superblock flab SB_I_NODEV. Use this new function in all of the places where MNT_NODEV was previously tested. Add the new SB_I_NODEV s_iflag to proc, sysfs, and mqueuefs as those filesystems should never support device nodes, and a simple superblock flags makes that very hard to get wrong. With SB_I_NODEV set if any device nodes somehow manage to show up on on a filesystem those device nodes will be unopenable. Acked-by: Seth Forshee Signed-off-by: "Eric W. Biederman" --- fs/block_dev.c | 2 +- fs/kernfs/mount.c | 4 ++-- fs/namei.c | 8 +++++++- fs/proc/inode.c | 4 ++-- 4 files changed, 12 insertions(+), 6 deletions(-) (limited to 'fs') diff --git a/fs/block_dev.c b/fs/block_dev.c index 71ccab1d22c6..30b8d568203a 100644 --- a/fs/block_dev.c +++ b/fs/block_dev.c @@ -1857,7 +1857,7 @@ struct block_device *lookup_bdev(const char *pathname) if (!S_ISBLK(inode->i_mode)) goto fail; error = -EACCES; - if (path.mnt->mnt_flags & MNT_NODEV) + if (!may_open_dev(&path)) goto fail; error = -ENOMEM; bdev = bd_acquire(inode); diff --git a/fs/kernfs/mount.c b/fs/kernfs/mount.c index 1443df670260..b3d73ad52b22 100644 --- a/fs/kernfs/mount.c +++ b/fs/kernfs/mount.c @@ -152,8 +152,8 @@ static int kernfs_fill_super(struct super_block *sb, unsigned long magic) struct dentry *root; info->sb = sb; - /* Userspace would break if executables appear on sysfs */ - sb->s_iflags |= SB_I_NOEXEC; + /* Userspace would break if executables or devices appear on sysfs */ + sb->s_iflags |= SB_I_NOEXEC | SB_I_NODEV; sb->s_blocksize = PAGE_SIZE; sb->s_blocksize_bits = PAGE_SHIFT; sb->s_magic = magic; diff --git a/fs/namei.c b/fs/namei.c index 6a82fb7e2127..757a32725d92 100644 --- a/fs/namei.c +++ b/fs/namei.c @@ -2881,6 +2881,12 @@ int vfs_create(struct inode *dir, struct dentry *dentry, umode_t mode, } EXPORT_SYMBOL(vfs_create); +bool may_open_dev(const struct path *path) +{ + return !(path->mnt->mnt_flags & MNT_NODEV) && + !(path->mnt->mnt_sb->s_iflags & SB_I_NODEV); +} + static int may_open(struct path *path, int acc_mode, int flag) { struct dentry *dentry = path->dentry; @@ -2899,7 +2905,7 @@ static int may_open(struct path *path, int acc_mode, int flag) break; case S_IFBLK: case S_IFCHR: - if (path->mnt->mnt_flags & MNT_NODEV) + if (!may_open_dev(path)) return -EACCES; /*FALLTHRU*/ case S_IFIFO: diff --git a/fs/proc/inode.c b/fs/proc/inode.c index f4817efb25a6..a5b2c33745b7 100644 --- a/fs/proc/inode.c +++ b/fs/proc/inode.c @@ -466,8 +466,8 @@ int proc_fill_super(struct super_block *s, void *data, int silent) if (!proc_parse_options(data, ns)) return -EINVAL; - /* User space would break if executables appear on proc */ - s->s_iflags |= SB_I_USERNS_VISIBLE | SB_I_NOEXEC; + /* User space would break if executables or devices appear on proc */ + s->s_iflags |= SB_I_USERNS_VISIBLE | SB_I_NOEXEC | SB_I_NODEV; s->s_flags |= MS_NODIRATIME | MS_NOSUID | MS_NOEXEC; s->s_blocksize = 1024; s->s_blocksize_bits = 10; -- cgit v1.2.1 From a1935c1738af53249a02290ff7c10e8a6e650a16 Mon Sep 17 00:00:00 2001 From: "Eric W. Biederman" Date: Wed, 15 Jun 2016 06:59:49 -0500 Subject: mnt: Simplify mount_too_revealing Verify all filesystems that we check in mount_too_revealing set SB_I_NOEXEC and SB_I_NODEV in sb->s_iflags. That is true for today and it should remain true in the future. Remove the now unnecessary checks from mnt_already_visibile that ensure MNT_LOCK_NOSUID, MNT_LOCK_NOEXEC, and MNT_LOCK_NODEV are preserved. Making the code shorter and easier to read. Relying on SB_I_NOEXEC and SB_I_NODEV instead of the user visible MNT_NOSUID, MNT_NOEXEC, and MNT_NODEV ensures the many current systems where proc and sysfs are mounted with "nosuid, nodev, noexec" and several slightly buggy container applications don't bother to set those flags continue to work. Acked-by: Seth Forshee Signed-off-by: "Eric W. Biederman" --- fs/namespace.c | 25 ++++++++----------------- 1 file changed, 8 insertions(+), 17 deletions(-) (limited to 'fs') diff --git a/fs/namespace.c b/fs/namespace.c index 2e13f6cfe5df..b1da7f8182c4 100644 --- a/fs/namespace.c +++ b/fs/namespace.c @@ -3232,12 +3232,8 @@ static bool mnt_already_visible(struct mnt_namespace *ns, struct vfsmount *new, if (mnt->mnt.mnt_root != mnt->mnt.mnt_sb->s_root) continue; - /* Read the mount flags and filter out flags that - * may safely be ignored. - */ + /* A local view of the mount flags */ mnt_flags = mnt->mnt.mnt_flags; - if (mnt->mnt.mnt_sb->s_iflags & SB_I_NOEXEC) - mnt_flags &= ~(MNT_LOCK_NOSUID | MNT_LOCK_NOEXEC); /* Don't miss readonly hidden in the superblock flags */ if (mnt->mnt.mnt_sb->s_flags & MS_RDONLY) @@ -3249,15 +3245,6 @@ static bool mnt_already_visible(struct mnt_namespace *ns, struct vfsmount *new, if ((mnt_flags & MNT_LOCK_READONLY) && !(new_flags & MNT_READONLY)) continue; - if ((mnt_flags & MNT_LOCK_NODEV) && - !(new_flags & MNT_NODEV)) - continue; - if ((mnt_flags & MNT_LOCK_NOSUID) && - !(new_flags & MNT_NOSUID)) - continue; - if ((mnt_flags & MNT_LOCK_NOEXEC) && - !(new_flags & MNT_NOEXEC)) - continue; if ((mnt_flags & MNT_LOCK_ATIME) && ((mnt_flags & MNT_ATIME_MASK) != (new_flags & MNT_ATIME_MASK))) continue; @@ -3277,9 +3264,6 @@ static bool mnt_already_visible(struct mnt_namespace *ns, struct vfsmount *new, } /* Preserve the locked attributes */ *new_mnt_flags |= mnt_flags & (MNT_LOCK_READONLY | \ - MNT_LOCK_NODEV | \ - MNT_LOCK_NOSUID | \ - MNT_LOCK_NOEXEC | \ MNT_LOCK_ATIME); visible = true; goto found; @@ -3292,6 +3276,7 @@ found: static bool mount_too_revealing(struct vfsmount *mnt, int *new_mnt_flags) { + const unsigned long required_iflags = SB_I_NOEXEC | SB_I_NODEV; struct mnt_namespace *ns = current->nsproxy->mnt_ns; unsigned long s_iflags; @@ -3303,6 +3288,12 @@ static bool mount_too_revealing(struct vfsmount *mnt, int *new_mnt_flags) if (!(s_iflags & SB_I_USERNS_VISIBLE)) return false; + if ((s_iflags & required_iflags) != required_iflags) { + WARN_ONCE(1, "Expected s_iflags to contain 0x%lx\n", + required_iflags); + return true; + } + return !mnt_already_visible(ns, mnt, new_mnt_flags); } -- cgit v1.2.1 From 67690f937c38bbab1d94cb45f6a32e61612834ae Mon Sep 17 00:00:00 2001 From: "Eric W. Biederman" Date: Wed, 18 May 2016 13:50:06 -0500 Subject: userns: Remove implicit MNT_NODEV fragility. Replace the implict setting of MNT_NODEV on mounts that happen with just user namespace permissions with an implicit setting of SB_I_NODEV in s_iflags. The visibility of the implicit MNT_NODEV has caused problems in the past. With this change the fragile case where an implicit MNT_NODEV needs to be preserved in do_remount is removed. Using SB_I_NODEV is much less fragile as s_iflags are set during the original mount and never changed. In do_new_mount with the implicit setting of MNT_NODEV gone, the only code that can affect mnt_flags is fs_fully_visible so simplify the if statement and reduce the indentation of the code to make that clear. Acked-by: Seth Forshee Signed-off-by: "Eric W. Biederman" --- fs/namespace.c | 19 +------------------ fs/super.c | 3 +++ 2 files changed, 4 insertions(+), 18 deletions(-) (limited to 'fs') diff --git a/fs/namespace.c b/fs/namespace.c index b1da7f8182c4..9786a38d1681 100644 --- a/fs/namespace.c +++ b/fs/namespace.c @@ -2185,13 +2185,7 @@ static int do_remount(struct path *path, int flags, int mnt_flags, } if ((mnt->mnt.mnt_flags & MNT_LOCK_NODEV) && !(mnt_flags & MNT_NODEV)) { - /* Was the nodev implicitly added in mount? */ - if ((mnt->mnt_ns->user_ns != &init_user_ns) && - !(sb->s_type->fs_flags & FS_USERNS_DEV_MOUNT)) { - mnt_flags |= MNT_NODEV; - } else { - return -EPERM; - } + return -EPERM; } if ((mnt->mnt.mnt_flags & MNT_LOCK_NOSUID) && !(mnt_flags & MNT_NOSUID)) { @@ -2385,7 +2379,6 @@ static int do_new_mount(struct path *path, const char *fstype, int flags, int mnt_flags, const char *name, void *data) { struct file_system_type *type; - struct user_namespace *user_ns = current->nsproxy->mnt_ns->user_ns; struct vfsmount *mnt; int err; @@ -2396,16 +2389,6 @@ static int do_new_mount(struct path *path, const char *fstype, int flags, if (!type) return -ENODEV; - if (user_ns != &init_user_ns) { - /* Only in special cases allow devices from mounts - * created outside the initial user namespace. - */ - if (!(type->fs_flags & FS_USERNS_DEV_MOUNT)) { - flags |= MS_NODEV; - mnt_flags |= MNT_NODEV | MNT_LOCK_NODEV; - } - } - mnt = vfs_kern_mount(type, flags, name, data); if (!IS_ERR(mnt) && (type->fs_flags & FS_HAS_SUBTYPE) && !mnt->mnt_sb->s_subtype) diff --git a/fs/super.c b/fs/super.c index 78790ada7191..25cdceed2ad3 100644 --- a/fs/super.c +++ b/fs/super.c @@ -206,6 +206,9 @@ static struct super_block *alloc_super(struct file_system_type *type, int flags, init_waitqueue_head(&s->s_writers.wait_unfrozen); s->s_bdi = &noop_backing_dev_info; s->s_flags = flags; + if ((s->s_user_ns != &init_user_ns) && + !(type->fs_flags & FS_USERNS_DEV_MOUNT)) + s->s_iflags |= SB_I_NODEV; INIT_HLIST_NODE(&s->s_instances); INIT_HLIST_BL_HEAD(&s->s_anon); mutex_init(&s->s_sync_lock); -- cgit v1.2.1 From cc50a07a247e17db76b1f0b0ca06652556e04fa3 Mon Sep 17 00:00:00 2001 From: "Eric W. Biederman" Date: Thu, 9 Jun 2016 15:44:48 -0500 Subject: userns: Remove the now unnecessary FS_USERNS_DEV_MOUNT flag Now that SB_I_NODEV controls the nodev behavior devpts can just clear this flag during mount. Simplifying the code and making it easier to audit how the code works. While still preserving the invariant that s_iflags is only modified during mount. Acked-by: Seth Forshee Signed-off-by: "Eric W. Biederman" --- fs/devpts/inode.c | 3 ++- fs/super.c | 3 +-- 2 files changed, 3 insertions(+), 3 deletions(-) (limited to 'fs') diff --git a/fs/devpts/inode.c b/fs/devpts/inode.c index 37c134a132c7..d116453b0276 100644 --- a/fs/devpts/inode.c +++ b/fs/devpts/inode.c @@ -396,6 +396,7 @@ devpts_fill_super(struct super_block *s, void *data, int silent) { struct inode *inode; + s->s_iflags &= ~SB_I_NODEV; s->s_blocksize = 1024; s->s_blocksize_bits = 10; s->s_magic = DEVPTS_SUPER_MAGIC; @@ -480,7 +481,7 @@ static struct file_system_type devpts_fs_type = { .name = "devpts", .mount = devpts_mount, .kill_sb = devpts_kill_sb, - .fs_flags = FS_USERNS_MOUNT | FS_USERNS_DEV_MOUNT, + .fs_flags = FS_USERNS_MOUNT, }; /* diff --git a/fs/super.c b/fs/super.c index 25cdceed2ad3..37813bf479cf 100644 --- a/fs/super.c +++ b/fs/super.c @@ -206,8 +206,7 @@ static struct super_block *alloc_super(struct file_system_type *type, int flags, init_waitqueue_head(&s->s_writers.wait_unfrozen); s->s_bdi = &noop_backing_dev_info; s->s_flags = flags; - if ((s->s_user_ns != &init_user_ns) && - !(type->fs_flags & FS_USERNS_DEV_MOUNT)) + if (s->s_user_ns != &init_user_ns) s->s_iflags |= SB_I_NODEV; INIT_HLIST_NODE(&s->s_instances); INIT_HLIST_BL_HEAD(&s->s_anon); -- cgit v1.2.1 From 380cf5ba6b0a0b307f4afb62b186ca801defb203 Mon Sep 17 00:00:00 2001 From: Andy Lutomirski Date: Thu, 23 Jun 2016 16:41:05 -0500 Subject: fs: Treat foreign mounts as nosuid If a process gets access to a mount from a different user namespace, that process should not be able to take advantage of setuid files or selinux entrypoints from that filesystem. Prevent this by treating mounts from other mount namespaces and those not owned by current_user_ns() or an ancestor as nosuid. This will make it safer to allow more complex filesystems to be mounted in non-root user namespaces. This does not remove the need for MNT_LOCK_NOSUID. The setuid, setgid, and file capability bits can no longer be abused if code in a user namespace were to clear nosuid on an untrusted filesystem, but this patch, by itself, is insufficient to protect the system from abuse of files that, when execed, would increase MAC privilege. As a more concrete explanation, any task that can manipulate a vfsmount associated with a given user namespace already has capabilities in that namespace and all of its descendents. If they can cause a malicious setuid, setgid, or file-caps executable to appear in that mount, then that executable will only allow them to elevate privileges in exactly the set of namespaces in which they are already privileges. On the other hand, if they can cause a malicious executable to appear with a dangerous MAC label, running it could change the caller's security context in a way that should not have been possible, even inside the namespace in which the task is confined. As a hardening measure, this would have made CVE-2014-5207 much more difficult to exploit. Signed-off-by: Andy Lutomirski Signed-off-by: Seth Forshee Acked-by: James Morris Acked-by: Serge Hallyn Signed-off-by: Eric W. Biederman --- fs/exec.c | 2 +- fs/namespace.c | 13 +++++++++++++ 2 files changed, 14 insertions(+), 1 deletion(-) (limited to 'fs') diff --git a/fs/exec.c b/fs/exec.c index 887c1c955df8..ca239fc86d8d 100644 --- a/fs/exec.c +++ b/fs/exec.c @@ -1411,7 +1411,7 @@ static void bprm_fill_uid(struct linux_binprm *bprm) bprm->cred->euid = current_euid(); bprm->cred->egid = current_egid(); - if (bprm->file->f_path.mnt->mnt_flags & MNT_NOSUID) + if (!mnt_may_suid(bprm->file->f_path.mnt)) return; if (task_no_new_privs(current)) diff --git a/fs/namespace.c b/fs/namespace.c index 9786a38d1681..aabe8e397fc3 100644 --- a/fs/namespace.c +++ b/fs/namespace.c @@ -3280,6 +3280,19 @@ static bool mount_too_revealing(struct vfsmount *mnt, int *new_mnt_flags) return !mnt_already_visible(ns, mnt, new_mnt_flags); } +bool mnt_may_suid(struct vfsmount *mnt) +{ + /* + * Foreign mounts (accessed via fchdir or through /proc + * symlinks) are always treated as if they are nosuid. This + * prevents namespaces from trusting potentially unsafe + * suid/sgid bits, file caps, or security labels that originate + * in other namespaces. + */ + return !(mnt->mnt_flags & MNT_NOSUID) && check_mnt(real_mount(mnt)) && + current_in_userns(mnt->mnt_sb->s_user_ns); +} + static struct ns_common *mntns_get(struct task_struct *task) { struct ns_common *ns = NULL; -- cgit v1.2.1 From a475acf01f79e89a1a5845733e10108d80f77188 Mon Sep 17 00:00:00 2001 From: Seth Forshee Date: Tue, 26 Apr 2016 14:36:25 -0500 Subject: fs: Refuse uid/gid changes which don't map into s_user_ns Add checks to notify_change to verify that uid and gid changes will map into the superblock's user namespace. If they do not fail with -EOVERFLOW. This is mandatory so that fileystems don't have to even think of dealing with ia_uid and ia_gid that --EWB Moved the test from inode_change_ok to notify_change Signed-off-by: Seth Forshee Acked-by: Serge Hallyn Signed-off-by: Eric W. Biederman --- fs/attr.c | 11 +++++++++++ 1 file changed, 11 insertions(+) (limited to 'fs') diff --git a/fs/attr.c b/fs/attr.c index 25b24d0f6c88..dd723578ddce 100644 --- a/fs/attr.c +++ b/fs/attr.c @@ -255,6 +255,17 @@ int notify_change(struct dentry * dentry, struct iattr * attr, struct inode **de if (!(attr->ia_valid & ~(ATTR_KILL_SUID | ATTR_KILL_SGID))) return 0; + /* + * Verify that uid/gid changes are valid in the target + * namespace of the superblock. + */ + if (ia_valid & ATTR_UID && + !kuid_has_mapping(inode->i_sb->s_user_ns, attr->ia_uid)) + return -EOVERFLOW; + if (ia_valid & ATTR_GID && + !kgid_has_mapping(inode->i_sb->s_user_ns, attr->ia_gid)) + return -EOVERFLOW; + error = security_inode_setattr(dentry, attr); if (error) return error; -- cgit v1.2.1 From 0d4d717f25834134bb6f43284f84c8ccee5bbf2a Mon Sep 17 00:00:00 2001 From: "Eric W. Biederman" Date: Mon, 27 Jun 2016 16:04:06 -0500 Subject: vfs: Verify acls are valid within superblock's s_user_ns. Update posix_acl_valid to verify that an acl is within a user namespace. Update the callers of posix_acl_valid to pass in an appropriate user namespace. For posix_acl_xattr_set and v9fs_xattr_set_acl pass in inode->i_sb->s_user_ns to posix_acl_valid. For md_unpack_acl pass in &init_user_ns as no inode or superblock is in sight. Acked-by: Seth Forshee Signed-off-by: "Eric W. Biederman" --- fs/9p/acl.c | 2 +- fs/posix_acl.c | 8 ++++---- 2 files changed, 5 insertions(+), 5 deletions(-) (limited to 'fs') diff --git a/fs/9p/acl.c b/fs/9p/acl.c index 0576eaeb60b9..5b6a1743ea17 100644 --- a/fs/9p/acl.c +++ b/fs/9p/acl.c @@ -266,7 +266,7 @@ static int v9fs_xattr_set_acl(const struct xattr_handler *handler, if (IS_ERR(acl)) return PTR_ERR(acl); else if (acl) { - retval = posix_acl_valid(acl); + retval = posix_acl_valid(inode->i_sb->s_user_ns, acl); if (retval) goto err_out; } diff --git a/fs/posix_acl.c b/fs/posix_acl.c index 8a4a266beff3..647c28180675 100644 --- a/fs/posix_acl.c +++ b/fs/posix_acl.c @@ -205,7 +205,7 @@ posix_acl_clone(const struct posix_acl *acl, gfp_t flags) * Check if an acl is valid. Returns 0 if it is, or -E... otherwise. */ int -posix_acl_valid(const struct posix_acl *acl) +posix_acl_valid(struct user_namespace *user_ns, const struct posix_acl *acl) { const struct posix_acl_entry *pa, *pe; int state = ACL_USER_OBJ; @@ -225,7 +225,7 @@ posix_acl_valid(const struct posix_acl *acl) case ACL_USER: if (state != ACL_USER) return -EINVAL; - if (!uid_valid(pa->e_uid)) + if (!kuid_has_mapping(user_ns, pa->e_uid)) return -EINVAL; needs_mask = 1; break; @@ -240,7 +240,7 @@ posix_acl_valid(const struct posix_acl *acl) case ACL_GROUP: if (state != ACL_GROUP) return -EINVAL; - if (!gid_valid(pa->e_gid)) + if (!kgid_has_mapping(user_ns, pa->e_gid)) return -EINVAL; needs_mask = 1; break; @@ -845,7 +845,7 @@ posix_acl_xattr_set(const struct xattr_handler *handler, return PTR_ERR(acl); if (acl) { - ret = posix_acl_valid(acl); + ret = posix_acl_valid(inode->i_sb->s_user_ns, acl); if (ret) goto out; } -- cgit v1.2.1 From 2d7f9e2ad35e4e7a3086231f19bfab33c6a8a64a Mon Sep 17 00:00:00 2001 From: Seth Forshee Date: Tue, 26 Apr 2016 14:36:23 -0500 Subject: fs: Check for invalid i_uid in may_follow_link() Filesystem uids which don't map into a user namespace may result in inode->i_uid being INVALID_UID. A symlink and its parent could have different owners in the filesystem can both get mapped to INVALID_UID, which may result in following a symlink when this would not have otherwise been permitted when protected symlinks are enabled. Signed-off-by: Seth Forshee Acked-by: Serge Hallyn Signed-off-by: Eric W. Biederman --- fs/namei.c | 4 +++- 1 file changed, 3 insertions(+), 1 deletion(-) (limited to 'fs') diff --git a/fs/namei.c b/fs/namei.c index 757a32725d92..8701bd9a5270 100644 --- a/fs/namei.c +++ b/fs/namei.c @@ -901,6 +901,7 @@ static inline int may_follow_link(struct nameidata *nd) { const struct inode *inode; const struct inode *parent; + kuid_t puid; if (!sysctl_protected_symlinks) return 0; @@ -916,7 +917,8 @@ static inline int may_follow_link(struct nameidata *nd) return 0; /* Allowed if parent directory and link owner match. */ - if (uid_eq(parent->i_uid, inode->i_uid)) + puid = parent->i_uid; + if (uid_valid(puid) && uid_eq(puid, inode->i_uid)) return 0; if (nd->flags & LOOKUP_RCU) -- cgit v1.2.1 From 0bd23d09b874e53bd1a2fe2296030aa2720d7b08 Mon Sep 17 00:00:00 2001 From: "Eric W. Biederman" Date: Wed, 29 Jun 2016 14:54:46 -0500 Subject: vfs: Don't modify inodes with a uid or gid unknown to the vfs When a filesystem outside of init_user_ns is mounted it could have uids and gids stored in it that do not map to init_user_ns. The plan is to allow those filesystems to set i_uid to INVALID_UID and i_gid to INVALID_GID for unmapped uids and gids and then to handle that strange case in the vfs to ensure there is consistent robust handling of the weirdness. Upon a careful review of the vfs and filesystems about the only case where there is any possibility of confusion or trouble is when the inode is written back to disk. In that case filesystems typically read the inode->i_uid and inode->i_gid and write them to disk even when just an inode timestamp is being updated. Which leads to a rule that is very simple to implement and understand inodes whose i_uid or i_gid is not valid may not be written. In dealing with access times this means treat those inodes as if the inode flag S_NOATIME was set. Reads of the inodes appear safe and useful, but any write or modification is disallowed. The only inode write that is allowed is a chown that sets the uid and gid on the inode to valid values. After such a chown the inode is normal and may be treated as such. Denying all writes to inodes with uids or gids unknown to the vfs also prevents several oddball cases where corruption would have occurred because the vfs does not have complete information. One problem case that is prevented is attempting to use the gid of a directory for new inodes where the directories sgid bit is set but the directories gid is not mapped. Another problem case avoided is attempting to update the evm hash after setxattr, removexattr, and setattr. As the evm hash includeds the inode->i_uid or inode->i_gid not knowning the uid or gid prevents a correct evm hash from being computed. evm hash verification also fails when i_uid or i_gid is unknown but that is essentially harmless as it does not cause filesystem corruption. Acked-by: Seth Forshee Signed-off-by: "Eric W. Biederman" --- fs/attr.c | 8 ++++++++ fs/inode.c | 7 +++++++ fs/namei.c | 26 +++++++++++++++++++++----- fs/xattr.c | 7 +++++++ 4 files changed, 43 insertions(+), 5 deletions(-) (limited to 'fs') diff --git a/fs/attr.c b/fs/attr.c index dd723578ddce..42bb42bb3c72 100644 --- a/fs/attr.c +++ b/fs/attr.c @@ -266,6 +266,14 @@ int notify_change(struct dentry * dentry, struct iattr * attr, struct inode **de !kgid_has_mapping(inode->i_sb->s_user_ns, attr->ia_gid)) return -EOVERFLOW; + /* Don't allow modifications of files with invalid uids or + * gids unless those uids & gids are being made valid. + */ + if (!(ia_valid & ATTR_UID) && !uid_valid(inode->i_uid)) + return -EOVERFLOW; + if (!(ia_valid & ATTR_GID) && !gid_valid(inode->i_gid)) + return -EOVERFLOW; + error = security_inode_setattr(dentry, attr); if (error) return error; diff --git a/fs/inode.c b/fs/inode.c index 4ccbc21b30ce..c0ebb97fb085 100644 --- a/fs/inode.c +++ b/fs/inode.c @@ -1617,6 +1617,13 @@ bool atime_needs_update(const struct path *path, struct inode *inode) if (inode->i_flags & S_NOATIME) return false; + + /* Atime updates will likely cause i_uid and i_gid to be written + * back improprely if their true value is unknown to the vfs. + */ + if (HAS_UNMAPPED_ID(inode)) + return false; + if (IS_NOATIME(inode)) return false; if ((inode->i_sb->s_flags & MS_NODIRATIME) && S_ISDIR(inode->i_mode)) diff --git a/fs/namei.c b/fs/namei.c index 8701bd9a5270..840201c4c290 100644 --- a/fs/namei.c +++ b/fs/namei.c @@ -410,6 +410,14 @@ int __inode_permission(struct inode *inode, int mask) */ if (IS_IMMUTABLE(inode)) return -EACCES; + + /* + * Updating mtime will likely cause i_uid and i_gid to be + * written back improperly if their true value is unknown + * to the vfs. + */ + if (HAS_UNMAPPED_ID(inode)) + return -EACCES; } retval = do_inode_permission(inode, mask); @@ -2759,10 +2767,11 @@ EXPORT_SYMBOL(__check_sticky); * c. have CAP_FOWNER capability * 6. If the victim is append-only or immutable we can't do antyhing with * links pointing to it. - * 7. If we were asked to remove a directory and victim isn't one - ENOTDIR. - * 8. If we were asked to remove a non-directory and victim isn't one - EISDIR. - * 9. We can't remove a root or mountpoint. - * 10. We don't allow removal of NFS sillyrenamed files; it's handled by + * 7. If the victim has an unknown uid or gid we can't change the inode. + * 8. If we were asked to remove a directory and victim isn't one - ENOTDIR. + * 9. If we were asked to remove a non-directory and victim isn't one - EISDIR. + * 10. We can't remove a root or mountpoint. + * 11. We don't allow removal of NFS sillyrenamed files; it's handled by * nfs_async_unlink(). */ static int may_delete(struct inode *dir, struct dentry *victim, bool isdir) @@ -2784,7 +2793,7 @@ static int may_delete(struct inode *dir, struct dentry *victim, bool isdir) return -EPERM; if (check_sticky(dir, inode) || IS_APPEND(inode) || - IS_IMMUTABLE(inode) || IS_SWAPFILE(inode)) + IS_IMMUTABLE(inode) || IS_SWAPFILE(inode) || HAS_UNMAPPED_ID(inode)) return -EPERM; if (isdir) { if (!d_is_dir(victim)) @@ -4190,6 +4199,13 @@ int vfs_link(struct dentry *old_dentry, struct inode *dir, struct dentry *new_de */ if (IS_APPEND(inode) || IS_IMMUTABLE(inode)) return -EPERM; + /* + * Updating the link count will likely cause i_uid and i_gid to + * be writen back improperly if their true value is unknown to + * the vfs. + */ + if (HAS_UNMAPPED_ID(inode)) + return -EPERM; if (!dir->i_op->link) return -EPERM; if (S_ISDIR(inode->i_mode)) diff --git a/fs/xattr.c b/fs/xattr.c index 4beafc43daa5..c243905835ab 100644 --- a/fs/xattr.c +++ b/fs/xattr.c @@ -38,6 +38,13 @@ xattr_permission(struct inode *inode, const char *name, int mask) if (mask & MAY_WRITE) { if (IS_IMMUTABLE(inode) || IS_APPEND(inode)) return -EPERM; + /* + * Updating an xattr will likely cause i_uid and i_gid + * to be writen back improperly if their true value is + * unknown to the vfs. + */ + if (HAS_UNMAPPED_ID(inode)) + return -EPERM; } /* -- cgit v1.2.1 From 036d523641c66bef713042894a17f4335f199e49 Mon Sep 17 00:00:00 2001 From: "Eric W. Biederman" Date: Fri, 1 Jul 2016 12:52:06 -0500 Subject: vfs: Don't create inodes with a uid or gid unknown to the vfs It is expected that filesystems can not represent uids and gids from outside of their user namespace. Keep things simple by not even trying to create filesystem nodes with non-sense uids and gids. Acked-by: Seth Forshee Signed-off-by: "Eric W. Biederman" --- fs/namei.c | 10 ++++++++-- 1 file changed, 8 insertions(+), 2 deletions(-) (limited to 'fs') diff --git a/fs/namei.c b/fs/namei.c index 840201c4c290..629823f19a6a 100644 --- a/fs/namei.c +++ b/fs/namei.c @@ -2814,16 +2814,22 @@ static int may_delete(struct inode *dir, struct dentry *victim, bool isdir) * 1. We can't do it if child already exists (open has special treatment for * this case, but since we are inlined it's OK) * 2. We can't do it if dir is read-only (done in permission()) - * 3. We should have write and exec permissions on dir - * 4. We can't do it if dir is immutable (done in permission()) + * 3. We can't do it if the fs can't represent the fsuid or fsgid. + * 4. We should have write and exec permissions on dir + * 5. We can't do it if dir is immutable (done in permission()) */ static inline int may_create(struct inode *dir, struct dentry *child) { + struct user_namespace *s_user_ns; audit_inode_child(dir, child, AUDIT_TYPE_CHILD_CREATE); if (child->d_inode) return -EEXIST; if (IS_DEADDIR(dir)) return -ENOENT; + s_user_ns = dir->i_sb->s_user_ns; + if (!kuid_has_mapping(s_user_ns, current_fsuid()) || + !kgid_has_mapping(s_user_ns, current_fsgid())) + return -EOVERFLOW; return inode_permission(dir, MAY_WRITE | MAY_EXEC); } -- cgit v1.2.1 From d49d37624a1931c2f3b5d0cbe95bd5181cbdc279 Mon Sep 17 00:00:00 2001 From: "Eric W. Biederman" Date: Thu, 30 Jun 2016 16:31:01 -0500 Subject: quota: Ensure qids map to the filesystem Introduce the helper qid_has_mapping and use it to ensure that the quota system only considers qids that map to the filesystems s_user_ns. In practice for quota supporting filesystems today this is the exact same check as qid_valid. As only 0xffffffff aka (qid_t)-1 does not map into init_user_ns. Replace the qid_valid calls with qid_has_mapping as values come in from userspace. This is harmless today and it prepares the quota system to work on filesystems with quotas but mounted by unprivileged users. Call qid_has_mapping from dqget. This ensures the passed in qid has a prepresentation on the underlying filesystem. Previously this was unnecessary as filesystesm never had qids that could not map. With the introduction of filesystems outside of s_user_ns this will not remain true. All of this ensures the quota code never has to deal with qids that don't map to the underlying filesystem. Cc: Jan Kara Acked-by: Seth Forshee Signed-off-by: "Eric W. Biederman" --- fs/quota/dquot.c | 3 +++ fs/quota/quota.c | 12 ++++++------ 2 files changed, 9 insertions(+), 6 deletions(-) (limited to 'fs') diff --git a/fs/quota/dquot.c b/fs/quota/dquot.c index ff21980d0119..74706b6aa747 100644 --- a/fs/quota/dquot.c +++ b/fs/quota/dquot.c @@ -841,6 +841,9 @@ struct dquot *dqget(struct super_block *sb, struct kqid qid) unsigned int hashent = hashfn(sb, qid); struct dquot *dquot, *empty = NULL; + if (!qid_has_mapping(sb->s_user_ns, qid)) + return ERR_PTR(-EINVAL); + if (!sb_has_quota_active(sb, qid.type)) return ERR_PTR(-ESRCH); we_slept: diff --git a/fs/quota/quota.c b/fs/quota/quota.c index 0f10ee9892ce..73f6f4cf0a21 100644 --- a/fs/quota/quota.c +++ b/fs/quota/quota.c @@ -211,7 +211,7 @@ static int quota_getquota(struct super_block *sb, int type, qid_t id, if (!sb->s_qcop->get_dqblk) return -ENOSYS; qid = make_kqid(current_user_ns(), type, id); - if (!qid_valid(qid)) + if (!qid_has_mapping(sb->s_user_ns, qid)) return -EINVAL; ret = sb->s_qcop->get_dqblk(sb, qid, &fdq); if (ret) @@ -237,7 +237,7 @@ static int quota_getnextquota(struct super_block *sb, int type, qid_t id, if (!sb->s_qcop->get_nextdqblk) return -ENOSYS; qid = make_kqid(current_user_ns(), type, id); - if (!qid_valid(qid)) + if (!qid_has_mapping(sb->s_user_ns, qid)) return -EINVAL; ret = sb->s_qcop->get_nextdqblk(sb, &qid, &fdq); if (ret) @@ -288,7 +288,7 @@ static int quota_setquota(struct super_block *sb, int type, qid_t id, if (!sb->s_qcop->set_dqblk) return -ENOSYS; qid = make_kqid(current_user_ns(), type, id); - if (!qid_valid(qid)) + if (!qid_has_mapping(sb->s_user_ns, qid)) return -EINVAL; copy_from_if_dqblk(&fdq, &idq); return sb->s_qcop->set_dqblk(sb, qid, &fdq); @@ -581,7 +581,7 @@ static int quota_setxquota(struct super_block *sb, int type, qid_t id, if (!sb->s_qcop->set_dqblk) return -ENOSYS; qid = make_kqid(current_user_ns(), type, id); - if (!qid_valid(qid)) + if (!qid_has_mapping(sb->s_user_ns, qid)) return -EINVAL; /* Are we actually setting timer / warning limits for all users? */ if (from_kqid(&init_user_ns, qid) == 0 && @@ -642,7 +642,7 @@ static int quota_getxquota(struct super_block *sb, int type, qid_t id, if (!sb->s_qcop->get_dqblk) return -ENOSYS; qid = make_kqid(current_user_ns(), type, id); - if (!qid_valid(qid)) + if (!qid_has_mapping(sb->s_user_ns, qid)) return -EINVAL; ret = sb->s_qcop->get_dqblk(sb, qid, &qdq); if (ret) @@ -669,7 +669,7 @@ static int quota_getnextxquota(struct super_block *sb, int type, qid_t id, if (!sb->s_qcop->get_nextdqblk) return -ENOSYS; qid = make_kqid(current_user_ns(), type, id); - if (!qid_valid(qid)) + if (!qid_has_mapping(sb->s_user_ns, qid)) return -EINVAL; ret = sb->s_qcop->get_nextdqblk(sb, &qid, &qdq); if (ret) -- cgit v1.2.1 From cfd4c70a18c4e806aaac2f483153dae01e0ace1c Mon Sep 17 00:00:00 2001 From: "Eric W. Biederman" Date: Tue, 5 Jul 2016 11:10:57 -0500 Subject: quota: Handle quota data stored in s_user_ns in quota_setxquota In Q_XSETQLIMIT use sb->s_user_ns to detect when we are dealing with the filesystems notion of id 0. Cc: Jan Kara Acked-by: Seth Forshee Inspired-by: Seth Forshee Signed-off-by: "Eric W. Biederman" --- fs/quota/quota.c | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) (limited to 'fs') diff --git a/fs/quota/quota.c b/fs/quota/quota.c index 73f6f4cf0a21..35df08ee9c97 100644 --- a/fs/quota/quota.c +++ b/fs/quota/quota.c @@ -584,7 +584,7 @@ static int quota_setxquota(struct super_block *sb, int type, qid_t id, if (!qid_has_mapping(sb->s_user_ns, qid)) return -EINVAL; /* Are we actually setting timer / warning limits for all users? */ - if (from_kqid(&init_user_ns, qid) == 0 && + if (from_kqid(sb->s_user_ns, qid) == 0 && fdq.d_fieldmask & (FS_DQ_WARNS_MASK | FS_DQ_TIMER_MASK)) { struct qc_info qinfo; int ret; -- cgit v1.2.1 From 5c0048280babd579fa9e5f0e787122b06aee3f3b Mon Sep 17 00:00:00 2001 From: "Eric W. Biederman" Date: Tue, 5 Jul 2016 10:41:57 -0500 Subject: dquot: For now explicitly don't support filesystems outside of init_user_ns Mostly supporting filesystems outside of init_user_ns is s/&init_usre_ns/dquot->dq_sb->s_user_ns/. An actual need for supporting quotas on filesystems outside of s_user_ns is quite a ways away and to be done responsibily needs an audit on what can happen with hostile quota files. Until that audit is complete don't attempt to support quota files on filesystems outside of s_user_ns. Cc: Jan Kara Acked-by: Seth Forshee Signed-off-by: "Eric W. Biederman" --- fs/quota/dquot.c | 5 +++++ 1 file changed, 5 insertions(+) (limited to 'fs') diff --git a/fs/quota/dquot.c b/fs/quota/dquot.c index 74706b6aa747..87197d13cc76 100644 --- a/fs/quota/dquot.c +++ b/fs/quota/dquot.c @@ -2271,6 +2271,11 @@ static int vfs_load_quota_inode(struct inode *inode, int type, int format_id, error = -EINVAL; goto out_fmt; } + /* Filesystems outside of init_user_ns not yet supported */ + if (sb->s_user_ns != &init_user_ns) { + error = -EINVAL; + goto out_fmt; + } /* Usage always has to be set... */ if (!(flags & DQUOT_USAGE_ENABLED)) { error = -EINVAL; -- cgit v1.2.1 From aeaa4a79ff6a5ed912b7362f206cf8576fca538b Mon Sep 17 00:00:00 2001 From: "Eric W. Biederman" Date: Sat, 23 Jul 2016 11:20:44 -0500 Subject: fs: Call d_automount with the filesystems creds Seth Forshee reported a mount regression in nfs autmounts with "fs: Add user namespace member to struct super_block". It turns out that the assumption that current->cred is something reasonable during mount while necessary to improve support of unprivileged mounts is wrong in the automount path. To fix the existing filesystems override current->cred with the init_cred before calling d_automount and restore current->cred after d_automount completes. To support unprivileged mounts would require a more nuanced cred selection, so fail on unprivileged mounts for the time being. As none of the filesystems that currently set FS_USERNS_MOUNT implement d_automount this check is only good for preventing future problems. Fixes: 6e4eab577a0c ("fs: Add user namespace member to struct super_block") Tested-by: Seth Forshee Signed-off-by: "Eric W. Biederman" --- fs/namei.c | 7 +++++++ 1 file changed, 7 insertions(+) (limited to 'fs') diff --git a/fs/namei.c b/fs/namei.c index 629823f19a6a..ef573df3297f 100644 --- a/fs/namei.c +++ b/fs/namei.c @@ -36,6 +36,7 @@ #include #include #include +#include #include #include "internal.h" @@ -1099,6 +1100,7 @@ static int follow_automount(struct path *path, struct nameidata *nd, bool *need_mntput) { struct vfsmount *mnt; + const struct cred *old_cred; int err; if (!path->dentry->d_op || !path->dentry->d_op->d_automount) @@ -1120,11 +1122,16 @@ static int follow_automount(struct path *path, struct nameidata *nd, path->dentry->d_inode) return -EISDIR; + if (path->dentry->d_sb->s_user_ns != &init_user_ns) + return -EACCES; + nd->total_link_count++; if (nd->total_link_count >= 40) return -ELOOP; + old_cred = override_creds(&init_cred); mnt = path->dentry->d_op->d_automount(path); + revert_creds(old_cred); if (IS_ERR(mnt)) { /* * The filesystem is allowed to return -EISDIR here to indicate -- cgit v1.2.1