From 8654df4e2ac9704905198d63845554c2ddf6a93f Mon Sep 17 00:00:00 2001
From: "Eric W. Biederman" <ebiederm@xmission.com>
Date: Thu, 9 Jun 2016 16:06:06 -0500
Subject: mnt: Refactor fs_fully_visible into mount_too_revealing

Replace the call of fs_fully_visible in do_new_mount from before the
new superblock is allocated with a call of mount_too_revealing after
the superblock is allocated.   This winds up being a much better location
for maintainability of the code.

The first change this enables is the replacement of FS_USERNS_VISIBLE
with SB_I_USERNS_VISIBLE.  Moving the flag from struct filesystem_type
to sb_iflags on the superblock.

Unfortunately mount_too_revealing fundamentally needs to touch
mnt_flags adding several MNT_LOCKED_XXX flags at the appropriate
times.  If the mnt_flags did not need to be touched the code
could be easily moved into the filesystem specific mount code.

Acked-by: Seth Forshee <seth.forshee@canonical.com>
Signed-off-by: "Eric W. Biederman" <ebiederm@xmission.com>
---
 fs/namespace.c   | 38 +++++++++++++++++++++++++-------------
 fs/proc/inode.c  |  1 +
 fs/proc/root.c   |  2 +-
 fs/sysfs/mount.c |  4 ++--
 4 files changed, 29 insertions(+), 16 deletions(-)

(limited to 'fs')

diff --git a/fs/namespace.c b/fs/namespace.c
index 783004af5707..1a69aa786975 100644
--- a/fs/namespace.c
+++ b/fs/namespace.c
@@ -2375,7 +2375,7 @@ unlock:
 	return err;
 }
 
-static bool fs_fully_visible(struct file_system_type *fs_type, int *new_mnt_flags);
+static bool mount_too_revealing(struct vfsmount *mnt, int *new_mnt_flags);
 
 /*
  * create a new mount for userspace and request it to be added into the
@@ -2408,12 +2408,6 @@ static int do_new_mount(struct path *path, const char *fstype, int flags,
 			flags |= MS_NODEV;
 			mnt_flags |= MNT_NODEV | MNT_LOCK_NODEV;
 		}
-		if (type->fs_flags & FS_USERNS_VISIBLE) {
-			if (!fs_fully_visible(type, &mnt_flags)) {
-				put_filesystem(type);
-				return -EPERM;
-			}
-		}
 	}
 
 	mnt = vfs_kern_mount(type, flags, name, data);
@@ -2425,6 +2419,11 @@ static int do_new_mount(struct path *path, const char *fstype, int flags,
 	if (IS_ERR(mnt))
 		return PTR_ERR(mnt);
 
+	if (mount_too_revealing(mnt, &mnt_flags)) {
+		mntput(mnt);
+		return -EPERM;
+	}
+
 	err = do_add_mount(real_mount(mnt), path, mnt_flags);
 	if (err)
 		mntput(mnt);
@@ -3216,22 +3215,19 @@ bool current_chrooted(void)
 	return chrooted;
 }
 
-static bool fs_fully_visible(struct file_system_type *type, int *new_mnt_flags)
+static bool mnt_already_visible(struct mnt_namespace *ns, struct vfsmount *new,
+				int *new_mnt_flags)
 {
-	struct mnt_namespace *ns = current->nsproxy->mnt_ns;
 	int new_flags = *new_mnt_flags;
 	struct mount *mnt;
 	bool visible = false;
 
-	if (unlikely(!ns))
-		return false;
-
 	down_read(&namespace_sem);
 	list_for_each_entry(mnt, &ns->list, mnt_list) {
 		struct mount *child;
 		int mnt_flags;
 
-		if (mnt->mnt.mnt_sb->s_type != type)
+		if (mnt->mnt.mnt_sb->s_type != new->mnt_sb->s_type)
 			continue;
 
 		/* This mount is not fully visible if it's root directory
@@ -3298,6 +3294,22 @@ found:
 	return visible;
 }
 
+static bool mount_too_revealing(struct vfsmount *mnt, int *new_mnt_flags)
+{
+	struct mnt_namespace *ns = current->nsproxy->mnt_ns;
+	unsigned long s_iflags;
+
+	if (ns->user_ns == &init_user_ns)
+		return false;
+
+	/* Can this filesystem be too revealing? */
+	s_iflags = mnt->mnt_sb->s_iflags;
+	if (!(s_iflags & SB_I_USERNS_VISIBLE))
+		return false;
+
+	return !mnt_already_visible(ns, mnt, new_mnt_flags);
+}
+
 static struct ns_common *mntns_get(struct task_struct *task)
 {
 	struct ns_common *ns = NULL;
diff --git a/fs/proc/inode.c b/fs/proc/inode.c
index 42305ddcbaa0..78fa452d65ed 100644
--- a/fs/proc/inode.c
+++ b/fs/proc/inode.c
@@ -462,6 +462,7 @@ int proc_fill_super(struct super_block *s)
 	struct inode *root_inode;
 	int ret;
 
+	s->s_iflags |= SB_I_USERNS_VISIBLE;
 	s->s_flags |= MS_NODIRATIME | MS_NOSUID | MS_NOEXEC;
 	s->s_blocksize = 1024;
 	s->s_blocksize_bits = 10;
diff --git a/fs/proc/root.c b/fs/proc/root.c
index 55bc7d6c8aac..a1b2860fec62 100644
--- a/fs/proc/root.c
+++ b/fs/proc/root.c
@@ -158,7 +158,7 @@ static struct file_system_type proc_fs_type = {
 	.name		= "proc",
 	.mount		= proc_mount,
 	.kill_sb	= proc_kill_sb,
-	.fs_flags	= FS_USERNS_VISIBLE | FS_USERNS_MOUNT,
+	.fs_flags	= FS_USERNS_MOUNT,
 };
 
 void __init proc_root_init(void)
diff --git a/fs/sysfs/mount.c b/fs/sysfs/mount.c
index f3db82071cfb..f31e36994dfb 100644
--- a/fs/sysfs/mount.c
+++ b/fs/sysfs/mount.c
@@ -42,7 +42,7 @@ static struct dentry *sysfs_mount(struct file_system_type *fs_type,
 		kobj_ns_drop(KOBJ_NS_TYPE_NET, ns);
 	else if (new_sb)
 		/* Userspace would break if executables appear on sysfs */
-		root->d_sb->s_iflags |= SB_I_NOEXEC;
+		root->d_sb->s_iflags |= SB_I_USERNS_VISIBLE | SB_I_NOEXEC;
 
 	return root;
 }
@@ -59,7 +59,7 @@ static struct file_system_type sysfs_fs_type = {
 	.name		= "sysfs",
 	.mount		= sysfs_mount,
 	.kill_sb	= sysfs_kill_sb,
-	.fs_flags	= FS_USERNS_VISIBLE | FS_USERNS_MOUNT,
+	.fs_flags	= FS_USERNS_MOUNT,
 };
 
 int __init sysfs_init(void)
-- 
cgit v1.2.1


From d91ee87d8d85a0808c01787e8b4a6b48f2ba487b Mon Sep 17 00:00:00 2001
From: "Eric W. Biederman" <ebiederm@xmission.com>
Date: Mon, 23 May 2016 14:51:59 -0500
Subject: vfs: Pass data, ns, and ns->userns to mount_ns

Today what is normally called data (the mount options) is not passed
to fill_super through mount_ns.

Pass the mount options and the namespace separately to mount_ns so
that filesystems such as proc that have mount options, can use
mount_ns.

Pass the user namespace to mount_ns so that the standard permission
check that verifies the mounter has permissions over the namespace can
be performed in mount_ns instead of in each filesystems .mount method.
Thus removing the duplication between mqueuefs and proc in terms of
permission checks.  The extra permission check does not currently
affect the rpc_pipefs filesystem and the nfsd filesystem as those
filesystems do not currently allow unprivileged mounts.  Without
unpvileged mounts it is guaranteed that the caller has already passed
capable(CAP_SYS_ADMIN) which guarantees extra permission check will
pass.

Update rpc_pipefs and the nfsd filesystem to ensure that the network
namespace reference is always taken in fill_super and always put in kill_sb
so that the logic is simpler and so that errors originating inside of
fill_super do not cause a network namespace leak.

Acked-by: Seth Forshee <seth.forshee@canonical.com>
Signed-off-by: "Eric W. Biederman" <ebiederm@xmission.com>
---
 fs/nfsd/nfsctl.c | 13 ++++---------
 fs/super.c       | 13 ++++++++++---
 2 files changed, 14 insertions(+), 12 deletions(-)

(limited to 'fs')

diff --git a/fs/nfsd/nfsctl.c b/fs/nfsd/nfsctl.c
index 9690cb4dd588..5a6ae2522266 100644
--- a/fs/nfsd/nfsctl.c
+++ b/fs/nfsd/nfsctl.c
@@ -1154,20 +1154,15 @@ static int nfsd_fill_super(struct super_block * sb, void * data, int silent)
 #endif
 		/* last one */ {""}
 	};
-	struct net *net = data;
-	int ret;
-
-	ret = simple_fill_super(sb, 0x6e667364, nfsd_files);
-	if (ret)
-		return ret;
-	sb->s_fs_info = get_net(net);
-	return 0;
+	get_net(sb->s_fs_info);
+	return simple_fill_super(sb, 0x6e667364, nfsd_files);
 }
 
 static struct dentry *nfsd_mount(struct file_system_type *fs_type,
 	int flags, const char *dev_name, void *data)
 {
-	return mount_ns(fs_type, flags, current->nsproxy->net_ns, nfsd_fill_super);
+	struct net *net = current->nsproxy->net_ns;
+	return mount_ns(fs_type, flags, data, net, net->user_ns, nfsd_fill_super);
 }
 
 static void nfsd_umount(struct super_block *sb)
diff --git a/fs/super.c b/fs/super.c
index d78b9847e6cb..fd65667832e5 100644
--- a/fs/super.c
+++ b/fs/super.c
@@ -918,12 +918,19 @@ static int ns_set_super(struct super_block *sb, void *data)
 	return set_anon_super(sb, NULL);
 }
 
-struct dentry *mount_ns(struct file_system_type *fs_type, int flags,
-	void *data, int (*fill_super)(struct super_block *, void *, int))
+struct dentry *mount_ns(struct file_system_type *fs_type,
+	int flags, void *data, void *ns, struct user_namespace *user_ns,
+	int (*fill_super)(struct super_block *, void *, int))
 {
 	struct super_block *sb;
 
-	sb = sget(fs_type, ns_test_super, ns_set_super, flags, data);
+	/* Don't allow mounting unless the caller has CAP_SYS_ADMIN
+	 * over the namespace.
+	 */
+	if (!(flags & MS_KERNMOUNT) && !ns_capable(user_ns, CAP_SYS_ADMIN))
+		return ERR_PTR(-EPERM);
+
+	sb = sget(fs_type, ns_test_super, ns_set_super, flags, ns);
 	if (IS_ERR(sb))
 		return ERR_CAST(sb);
 
-- 
cgit v1.2.1


From e94591d0d90c13166cb6eb54ce5f96ed13d81b55 Mon Sep 17 00:00:00 2001
From: "Eric W. Biederman" <ebiederm@xmission.com>
Date: Thu, 9 Jun 2016 15:32:10 -0500
Subject: proc: Convert proc_mount to use mount_ns.

Move the call of get_pid_ns, the call of proc_parse_options, and
the setting of s_iflags into proc_fill_super so that mount_ns
can be used.

Convert proc_mount to call mount_ns and remove the now unnecessary
code.

Acked-by: Seth Forshee <seth.forshee@canonical.com>
Reviewed-by: Djalal Harouni <tixxdz@gmail.com>
Signed-off-by: "Eric W. Biederman" <ebiederm@xmission.com>
---
 fs/proc/inode.c    |  9 +++++++--
 fs/proc/internal.h |  3 ++-
 fs/proc/root.c     | 52 ++++------------------------------------------------
 3 files changed, 13 insertions(+), 51 deletions(-)

(limited to 'fs')

diff --git a/fs/proc/inode.c b/fs/proc/inode.c
index 78fa452d65ed..f4817efb25a6 100644
--- a/fs/proc/inode.c
+++ b/fs/proc/inode.c
@@ -457,12 +457,17 @@ struct inode *proc_get_inode(struct super_block *sb, struct proc_dir_entry *de)
 	return inode;
 }
 
-int proc_fill_super(struct super_block *s)
+int proc_fill_super(struct super_block *s, void *data, int silent)
 {
+	struct pid_namespace *ns = get_pid_ns(s->s_fs_info);
 	struct inode *root_inode;
 	int ret;
 
-	s->s_iflags |= SB_I_USERNS_VISIBLE;
+	if (!proc_parse_options(data, ns))
+		return -EINVAL;
+
+	/* User space would break if executables appear on proc */
+	s->s_iflags |= SB_I_USERNS_VISIBLE | SB_I_NOEXEC;
 	s->s_flags |= MS_NODIRATIME | MS_NOSUID | MS_NOEXEC;
 	s->s_blocksize = 1024;
 	s->s_blocksize_bits = 10;
diff --git a/fs/proc/internal.h b/fs/proc/internal.h
index aa2781095bd1..7931c558c192 100644
--- a/fs/proc/internal.h
+++ b/fs/proc/internal.h
@@ -212,7 +212,7 @@ extern const struct inode_operations proc_pid_link_inode_operations;
 
 extern void proc_init_inodecache(void);
 extern struct inode *proc_get_inode(struct super_block *, struct proc_dir_entry *);
-extern int proc_fill_super(struct super_block *);
+extern int proc_fill_super(struct super_block *, void *data, int flags);
 extern void proc_entry_rundown(struct proc_dir_entry *);
 
 /*
@@ -268,6 +268,7 @@ static inline void proc_tty_init(void) {}
  * root.c
  */
 extern struct proc_dir_entry proc_root;
+extern int proc_parse_options(char *options, struct pid_namespace *pid);
 
 extern void proc_self_init(void);
 extern int proc_remount(struct super_block *, int *, char *);
diff --git a/fs/proc/root.c b/fs/proc/root.c
index a1b2860fec62..8d3e484055a6 100644
--- a/fs/proc/root.c
+++ b/fs/proc/root.c
@@ -23,21 +23,6 @@
 
 #include "internal.h"
 
-static int proc_test_super(struct super_block *sb, void *data)
-{
-	return sb->s_fs_info == data;
-}
-
-static int proc_set_super(struct super_block *sb, void *data)
-{
-	int err = set_anon_super(sb, NULL);
-	if (!err) {
-		struct pid_namespace *ns = (struct pid_namespace *)data;
-		sb->s_fs_info = get_pid_ns(ns);
-	}
-	return err;
-}
-
 enum {
 	Opt_gid, Opt_hidepid, Opt_err,
 };
@@ -48,7 +33,7 @@ static const match_table_t tokens = {
 	{Opt_err, NULL},
 };
 
-static int proc_parse_options(char *options, struct pid_namespace *pid)
+int proc_parse_options(char *options, struct pid_namespace *pid)
 {
 	char *p;
 	substring_t args[MAX_OPT_ARGS];
@@ -100,45 +85,16 @@ int proc_remount(struct super_block *sb, int *flags, char *data)
 static struct dentry *proc_mount(struct file_system_type *fs_type,
 	int flags, const char *dev_name, void *data)
 {
-	int err;
-	struct super_block *sb;
 	struct pid_namespace *ns;
-	char *options;
 
 	if (flags & MS_KERNMOUNT) {
-		ns = (struct pid_namespace *)data;
-		options = NULL;
+		ns = data;
+		data = NULL;
 	} else {
 		ns = task_active_pid_ns(current);
-		options = data;
-
-		/* Does the mounter have privilege over the pid namespace? */
-		if (!ns_capable(ns->user_ns, CAP_SYS_ADMIN))
-			return ERR_PTR(-EPERM);
-	}
-
-	sb = sget(fs_type, proc_test_super, proc_set_super, flags, ns);
-	if (IS_ERR(sb))
-		return ERR_CAST(sb);
-
-	if (!proc_parse_options(options, ns)) {
-		deactivate_locked_super(sb);
-		return ERR_PTR(-EINVAL);
-	}
-
-	if (!sb->s_root) {
-		err = proc_fill_super(sb);
-		if (err) {
-			deactivate_locked_super(sb);
-			return ERR_PTR(err);
-		}
-
-		sb->s_flags |= MS_ACTIVE;
-		/* User space would break if executables appear on proc */
-		sb->s_iflags |= SB_I_NOEXEC;
 	}
 
-	return dget(sb->s_root);
+	return mount_ns(fs_type, flags, data, ns, ns->user_ns, proc_fill_super);
 }
 
 static void proc_kill_sb(struct super_block *sb)
-- 
cgit v1.2.1


From 6e4eab577a0cae15b3da9b888cff16fe57981b3e Mon Sep 17 00:00:00 2001
From: "Eric W. Biederman" <ebiederm@xmission.com>
Date: Tue, 24 May 2016 09:29:01 -0500
Subject: fs: Add user namespace member to struct super_block

Start marking filesystems with a user namespace owner, s_user_ns.  In
this change this is only used for permission checks of who may mount a
filesystem.  Ultimately s_user_ns will be used for translating ids and
checking capabilities for filesystems mounted from user namespaces.

The default policy for setting s_user_ns is implemented in sget(),
which arranges for s_user_ns to be set to current_user_ns() and to
ensure that the mounter of the filesystem has CAP_SYS_ADMIN in that
user_ns.

The guts of sget are split out into another function sget_userns().
The function sget_userns calls alloc_super with the specified user
namespace or it verifies the existing superblock that was found
has the expected user namespace, and fails with EBUSY when it is not.
This failing prevents users with the wrong privileges mounting a
filesystem.

The reason for the split of sget_userns from sget is that in some
cases such as mount_ns and kernfs_mount_ns a different policy for
permission checking of mounts and setting s_user_ns is necessary, and
the existence of sget_userns() allows those policies to be
implemented.

The helper mount_ns is expected to be used for filesystems such as
proc and mqueuefs which present per namespace information.  The
function mount_ns is modified to call sget_userns instead of sget to
ensure the user namespace owner of the namespace whose information is
presented by the filesystem is used on the superblock.

For sysfs and cgroup the appropriate permission checks are already in
place, and kernfs_mount_ns is modified to call sget_userns so that
the init_user_ns is the only user namespace used.

For the cgroup filesystem cgroup namespace mounts are bind mounts of a
subset of the full cgroup filesystem and as such s_user_ns must be the
same for all of them as there is only a single superblock.

Mounts of sysfs that vary based on the network namespace could in principle
change s_user_ns but it keeps the analysis and implementation of kernfs
simpler if that is not supported, and at present there appear to be no
benefits from supporting a different s_user_ns on any sysfs mount.

Getting the details of setting s_user_ns correct has been
a long process.  Thanks to Pavel Tikhorirorv who spotted a leak
in sget_userns.  Thanks to Seth Forshee who has kept the work alive.

Thanks-to: Seth Forshee <seth.forshee@canonical.com>
Thanks-to: Pavel Tikhomirov <ptikhomirov@virtuozzo.com>
Acked-by: Seth Forshee <seth.forshee@canonical.com>
Signed-off-by: Eric W. Biederman <ebiederm@xmission.com>
---
 fs/kernfs/mount.c |  3 ++-
 fs/super.c        | 52 ++++++++++++++++++++++++++++++++++++++++++++++------
 2 files changed, 48 insertions(+), 7 deletions(-)

(limited to 'fs')

diff --git a/fs/kernfs/mount.c b/fs/kernfs/mount.c
index 63534f5f9073..d90d574c15a2 100644
--- a/fs/kernfs/mount.c
+++ b/fs/kernfs/mount.c
@@ -241,7 +241,8 @@ struct dentry *kernfs_mount_ns(struct file_system_type *fs_type, int flags,
 	info->root = root;
 	info->ns = ns;
 
-	sb = sget(fs_type, kernfs_test_super, kernfs_set_super, flags, info);
+	sb = sget_userns(fs_type, kernfs_test_super, kernfs_set_super, flags,
+			 &init_user_ns, info);
 	if (IS_ERR(sb) || sb->s_fs_info != info)
 		kfree(info);
 	if (IS_ERR(sb))
diff --git a/fs/super.c b/fs/super.c
index fd65667832e5..874c7e3ebb8f 100644
--- a/fs/super.c
+++ b/fs/super.c
@@ -33,6 +33,7 @@
 #include <linux/cleancache.h>
 #include <linux/fsnotify.h>
 #include <linux/lockdep.h>
+#include <linux/user_namespace.h>
 #include "internal.h"
 
 
@@ -165,6 +166,7 @@ static void destroy_super(struct super_block *s)
 	list_lru_destroy(&s->s_inode_lru);
 	security_sb_free(s);
 	WARN_ON(!list_empty(&s->s_mounts));
+	put_user_ns(s->s_user_ns);
 	kfree(s->s_subtype);
 	kfree(s->s_options);
 	call_rcu(&s->rcu, destroy_super_rcu);
@@ -174,11 +176,13 @@ static void destroy_super(struct super_block *s)
  *	alloc_super	-	create new superblock
  *	@type:	filesystem type superblock should belong to
  *	@flags: the mount flags
+ *	@user_ns: User namespace for the super_block
  *
  *	Allocates and initializes a new &struct super_block.  alloc_super()
  *	returns a pointer new superblock or %NULL if allocation had failed.
  */
-static struct super_block *alloc_super(struct file_system_type *type, int flags)
+static struct super_block *alloc_super(struct file_system_type *type, int flags,
+				       struct user_namespace *user_ns)
 {
 	struct super_block *s = kzalloc(sizeof(struct super_block),  GFP_USER);
 	static const struct super_operations default_op;
@@ -188,6 +192,7 @@ static struct super_block *alloc_super(struct file_system_type *type, int flags)
 		return NULL;
 
 	INIT_LIST_HEAD(&s->s_mounts);
+	s->s_user_ns = get_user_ns(user_ns);
 
 	if (security_sb_alloc(s))
 		goto fail;
@@ -443,17 +448,18 @@ void generic_shutdown_super(struct super_block *sb)
 EXPORT_SYMBOL(generic_shutdown_super);
 
 /**
- *	sget	-	find or create a superblock
+ *	sget_userns -	find or create a superblock
  *	@type:	filesystem type superblock should belong to
  *	@test:	comparison callback
  *	@set:	setup callback
  *	@flags:	mount flags
+ *	@user_ns: User namespace for the super_block
  *	@data:	argument to each of them
  */
-struct super_block *sget(struct file_system_type *type,
+struct super_block *sget_userns(struct file_system_type *type,
 			int (*test)(struct super_block *,void *),
 			int (*set)(struct super_block *,void *),
-			int flags,
+			int flags, struct user_namespace *user_ns,
 			void *data)
 {
 	struct super_block *s = NULL;
@@ -466,6 +472,14 @@ retry:
 		hlist_for_each_entry(old, &type->fs_supers, s_instances) {
 			if (!test(old, data))
 				continue;
+			if (user_ns != old->s_user_ns) {
+				spin_unlock(&sb_lock);
+				if (s) {
+					up_write(&s->s_umount);
+					destroy_super(s);
+				}
+				return ERR_PTR(-EBUSY);
+			}
 			if (!grab_super(old))
 				goto retry;
 			if (s) {
@@ -478,7 +492,7 @@ retry:
 	}
 	if (!s) {
 		spin_unlock(&sb_lock);
-		s = alloc_super(type, flags);
+		s = alloc_super(type, flags, user_ns);
 		if (!s)
 			return ERR_PTR(-ENOMEM);
 		goto retry;
@@ -501,6 +515,31 @@ retry:
 	return s;
 }
 
+EXPORT_SYMBOL(sget_userns);
+
+/**
+ *	sget	-	find or create a superblock
+ *	@type:	  filesystem type superblock should belong to
+ *	@test:	  comparison callback
+ *	@set:	  setup callback
+ *	@flags:	  mount flags
+ *	@data:	  argument to each of them
+ */
+struct super_block *sget(struct file_system_type *type,
+			int (*test)(struct super_block *,void *),
+			int (*set)(struct super_block *,void *),
+			int flags,
+			void *data)
+{
+	struct user_namespace *user_ns = current_user_ns();
+
+	/* Ensure the requestor has permissions over the target filesystem */
+	if (!(flags & MS_KERNMOUNT) && !ns_capable(user_ns, CAP_SYS_ADMIN))
+		return ERR_PTR(-EPERM);
+
+	return sget_userns(type, test, set, flags, user_ns, data);
+}
+
 EXPORT_SYMBOL(sget);
 
 void drop_super(struct super_block *sb)
@@ -930,7 +969,8 @@ struct dentry *mount_ns(struct file_system_type *fs_type,
 	if (!(flags & MS_KERNMOUNT) && !ns_capable(user_ns, CAP_SYS_ADMIN))
 		return ERR_PTR(-EPERM);
 
-	sb = sget(fs_type, ns_test_super, ns_set_super, flags, ns);
+	sb = sget_userns(fs_type, ns_test_super, ns_set_super, flags,
+			 user_ns, ns);
 	if (IS_ERR(sb))
 		return ERR_CAST(sb);
 
-- 
cgit v1.2.1


From a001e74cef34d95ede6535ef521011c612657a3a Mon Sep 17 00:00:00 2001
From: "Eric W. Biederman" <ebiederm@xmission.com>
Date: Mon, 6 Jun 2016 15:48:04 -0500
Subject: mnt: Move the FS_USERNS_MOUNT check into sget_userns

Allowing a filesystem to be mounted by other than root in the initial
user namespace is a filesystem property not a mount namespace property
and as such should be checked in filesystem specific code.  Move the
FS_USERNS_MOUNT test into super.c:sget_userns().

Acked-by: Seth Forshee <seth.forshee@canonical.com>
Signed-off-by: "Eric W. Biederman" <ebiederm@xmission.com>
---
 fs/namespace.c | 4 ----
 fs/super.c     | 4 ++++
 2 files changed, 4 insertions(+), 4 deletions(-)

(limited to 'fs')

diff --git a/fs/namespace.c b/fs/namespace.c
index 1a69aa786975..2e13f6cfe5df 100644
--- a/fs/namespace.c
+++ b/fs/namespace.c
@@ -2397,10 +2397,6 @@ static int do_new_mount(struct path *path, const char *fstype, int flags,
 		return -ENODEV;
 
 	if (user_ns != &init_user_ns) {
-		if (!(type->fs_flags & FS_USERNS_MOUNT)) {
-			put_filesystem(type);
-			return -EPERM;
-		}
 		/* Only in special cases allow devices from mounts
 		 * created outside the initial user namespace.
 		 */
diff --git a/fs/super.c b/fs/super.c
index 874c7e3ebb8f..78790ada7191 100644
--- a/fs/super.c
+++ b/fs/super.c
@@ -466,6 +466,10 @@ struct super_block *sget_userns(struct file_system_type *type,
 	struct super_block *old;
 	int err;
 
+	if (!(flags & MS_KERNMOUNT) &&
+	    !(type->fs_flags & FS_USERNS_MOUNT) &&
+	    !capable(CAP_SYS_ADMIN))
+		return ERR_PTR(-EPERM);
 retry:
 	spin_lock(&sb_lock);
 	if (test) {
-- 
cgit v1.2.1


From 29a517c232d21a717aecea29838aeb07131f6196 Mon Sep 17 00:00:00 2001
From: "Eric W. Biederman" <ebiederm@xmission.com>
Date: Fri, 10 Jun 2016 13:03:05 -0500
Subject: kernfs: The cgroup filesystem also benefits from SB_I_NOEXEC

The cgroup filesystem is in the same boat as sysfs.  No one ever
permits executables of any kind on the cgroup filesystem, and there is
no reasonable future case to support executables in the future.

Therefore move the setting of SB_I_NOEXEC which makes the code proof
against future mistakes of accidentally creating executables from
sysfs to kernfs itself.  Making the code simpler and covering the
sysfs, cgroup, and cgroup2 filesystems.

Acked-by: Seth Forshee <seth.forshee@canonical.com>
Signed-off-by: "Eric W. Biederman" <ebiederm@xmission.com>
---
 fs/kernfs/mount.c | 2 ++
 fs/sysfs/mount.c  | 3 +--
 2 files changed, 3 insertions(+), 2 deletions(-)

(limited to 'fs')

diff --git a/fs/kernfs/mount.c b/fs/kernfs/mount.c
index d90d574c15a2..1443df670260 100644
--- a/fs/kernfs/mount.c
+++ b/fs/kernfs/mount.c
@@ -152,6 +152,8 @@ static int kernfs_fill_super(struct super_block *sb, unsigned long magic)
 	struct dentry *root;
 
 	info->sb = sb;
+	/* Userspace would break if executables appear on sysfs */
+	sb->s_iflags |= SB_I_NOEXEC;
 	sb->s_blocksize = PAGE_SIZE;
 	sb->s_blocksize_bits = PAGE_SHIFT;
 	sb->s_magic = magic;
diff --git a/fs/sysfs/mount.c b/fs/sysfs/mount.c
index f31e36994dfb..20b8f82e115b 100644
--- a/fs/sysfs/mount.c
+++ b/fs/sysfs/mount.c
@@ -41,8 +41,7 @@ static struct dentry *sysfs_mount(struct file_system_type *fs_type,
 	if (IS_ERR(root) || !new_sb)
 		kobj_ns_drop(KOBJ_NS_TYPE_NET, ns);
 	else if (new_sb)
-		/* Userspace would break if executables appear on sysfs */
-		root->d_sb->s_iflags |= SB_I_USERNS_VISIBLE | SB_I_NOEXEC;
+		root->d_sb->s_iflags |= SB_I_USERNS_VISIBLE;
 
 	return root;
 }
-- 
cgit v1.2.1


From a2982cc922c3068783eb9a1f77a5626a1ec36a1f Mon Sep 17 00:00:00 2001
From: "Eric W. Biederman" <ebiederm@xmission.com>
Date: Thu, 9 Jun 2016 15:34:02 -0500
Subject: vfs: Generalize filesystem nodev handling.

Introduce a function may_open_dev that tests MNT_NODEV and a new
superblock flab SB_I_NODEV.  Use this new function in all of the
places where MNT_NODEV was previously tested.

Add the new SB_I_NODEV s_iflag to proc, sysfs, and mqueuefs as those
filesystems should never support device nodes, and a simple superblock
flags makes that very hard to get wrong.  With SB_I_NODEV set if any
device nodes somehow manage to show up on on a filesystem those
device nodes will be unopenable.

Acked-by: Seth Forshee <seth.forshee@canonical.com>
Signed-off-by: "Eric W. Biederman" <ebiederm@xmission.com>
---
 fs/block_dev.c    | 2 +-
 fs/kernfs/mount.c | 4 ++--
 fs/namei.c        | 8 +++++++-
 fs/proc/inode.c   | 4 ++--
 4 files changed, 12 insertions(+), 6 deletions(-)

(limited to 'fs')

diff --git a/fs/block_dev.c b/fs/block_dev.c
index 71ccab1d22c6..30b8d568203a 100644
--- a/fs/block_dev.c
+++ b/fs/block_dev.c
@@ -1857,7 +1857,7 @@ struct block_device *lookup_bdev(const char *pathname)
 	if (!S_ISBLK(inode->i_mode))
 		goto fail;
 	error = -EACCES;
-	if (path.mnt->mnt_flags & MNT_NODEV)
+	if (!may_open_dev(&path))
 		goto fail;
 	error = -ENOMEM;
 	bdev = bd_acquire(inode);
diff --git a/fs/kernfs/mount.c b/fs/kernfs/mount.c
index 1443df670260..b3d73ad52b22 100644
--- a/fs/kernfs/mount.c
+++ b/fs/kernfs/mount.c
@@ -152,8 +152,8 @@ static int kernfs_fill_super(struct super_block *sb, unsigned long magic)
 	struct dentry *root;
 
 	info->sb = sb;
-	/* Userspace would break if executables appear on sysfs */
-	sb->s_iflags |= SB_I_NOEXEC;
+	/* Userspace would break if executables or devices appear on sysfs */
+	sb->s_iflags |= SB_I_NOEXEC | SB_I_NODEV;
 	sb->s_blocksize = PAGE_SIZE;
 	sb->s_blocksize_bits = PAGE_SHIFT;
 	sb->s_magic = magic;
diff --git a/fs/namei.c b/fs/namei.c
index 6a82fb7e2127..757a32725d92 100644
--- a/fs/namei.c
+++ b/fs/namei.c
@@ -2881,6 +2881,12 @@ int vfs_create(struct inode *dir, struct dentry *dentry, umode_t mode,
 }
 EXPORT_SYMBOL(vfs_create);
 
+bool may_open_dev(const struct path *path)
+{
+	return !(path->mnt->mnt_flags & MNT_NODEV) &&
+		!(path->mnt->mnt_sb->s_iflags & SB_I_NODEV);
+}
+
 static int may_open(struct path *path, int acc_mode, int flag)
 {
 	struct dentry *dentry = path->dentry;
@@ -2899,7 +2905,7 @@ static int may_open(struct path *path, int acc_mode, int flag)
 		break;
 	case S_IFBLK:
 	case S_IFCHR:
-		if (path->mnt->mnt_flags & MNT_NODEV)
+		if (!may_open_dev(path))
 			return -EACCES;
 		/*FALLTHRU*/
 	case S_IFIFO:
diff --git a/fs/proc/inode.c b/fs/proc/inode.c
index f4817efb25a6..a5b2c33745b7 100644
--- a/fs/proc/inode.c
+++ b/fs/proc/inode.c
@@ -466,8 +466,8 @@ int proc_fill_super(struct super_block *s, void *data, int silent)
 	if (!proc_parse_options(data, ns))
 		return -EINVAL;
 
-	/* User space would break if executables appear on proc */
-	s->s_iflags |= SB_I_USERNS_VISIBLE | SB_I_NOEXEC;
+	/* User space would break if executables or devices appear on proc */
+	s->s_iflags |= SB_I_USERNS_VISIBLE | SB_I_NOEXEC | SB_I_NODEV;
 	s->s_flags |= MS_NODIRATIME | MS_NOSUID | MS_NOEXEC;
 	s->s_blocksize = 1024;
 	s->s_blocksize_bits = 10;
-- 
cgit v1.2.1


From a1935c1738af53249a02290ff7c10e8a6e650a16 Mon Sep 17 00:00:00 2001
From: "Eric W. Biederman" <ebiederm@xmission.com>
Date: Wed, 15 Jun 2016 06:59:49 -0500
Subject: mnt: Simplify mount_too_revealing

Verify all filesystems that we check in mount_too_revealing set
SB_I_NOEXEC and SB_I_NODEV in sb->s_iflags.  That is true for today
and it should remain true in the future.

Remove the now unnecessary checks from mnt_already_visibile that
ensure MNT_LOCK_NOSUID, MNT_LOCK_NOEXEC, and MNT_LOCK_NODEV are
preserved.  Making the code shorter and easier to read.

Relying on SB_I_NOEXEC and SB_I_NODEV instead of the user visible
MNT_NOSUID, MNT_NOEXEC, and MNT_NODEV ensures the many current
systems where proc and sysfs are mounted with "nosuid, nodev, noexec"
and several slightly buggy container applications don't bother to
set those flags continue to work.

Acked-by: Seth Forshee <seth.forshee@canonical.com>
Signed-off-by: "Eric W. Biederman" <ebiederm@xmission.com>
---
 fs/namespace.c | 25 ++++++++-----------------
 1 file changed, 8 insertions(+), 17 deletions(-)

(limited to 'fs')

diff --git a/fs/namespace.c b/fs/namespace.c
index 2e13f6cfe5df..b1da7f8182c4 100644
--- a/fs/namespace.c
+++ b/fs/namespace.c
@@ -3232,12 +3232,8 @@ static bool mnt_already_visible(struct mnt_namespace *ns, struct vfsmount *new,
 		if (mnt->mnt.mnt_root != mnt->mnt.mnt_sb->s_root)
 			continue;
 
-		/* Read the mount flags and filter out flags that
-		 * may safely be ignored.
-		 */
+		/* A local view of the mount flags */
 		mnt_flags = mnt->mnt.mnt_flags;
-		if (mnt->mnt.mnt_sb->s_iflags & SB_I_NOEXEC)
-			mnt_flags &= ~(MNT_LOCK_NOSUID | MNT_LOCK_NOEXEC);
 
 		/* Don't miss readonly hidden in the superblock flags */
 		if (mnt->mnt.mnt_sb->s_flags & MS_RDONLY)
@@ -3249,15 +3245,6 @@ static bool mnt_already_visible(struct mnt_namespace *ns, struct vfsmount *new,
 		if ((mnt_flags & MNT_LOCK_READONLY) &&
 		    !(new_flags & MNT_READONLY))
 			continue;
-		if ((mnt_flags & MNT_LOCK_NODEV) &&
-		    !(new_flags & MNT_NODEV))
-			continue;
-		if ((mnt_flags & MNT_LOCK_NOSUID) &&
-		    !(new_flags & MNT_NOSUID))
-			continue;
-		if ((mnt_flags & MNT_LOCK_NOEXEC) &&
-		    !(new_flags & MNT_NOEXEC))
-			continue;
 		if ((mnt_flags & MNT_LOCK_ATIME) &&
 		    ((mnt_flags & MNT_ATIME_MASK) != (new_flags & MNT_ATIME_MASK)))
 			continue;
@@ -3277,9 +3264,6 @@ static bool mnt_already_visible(struct mnt_namespace *ns, struct vfsmount *new,
 		}
 		/* Preserve the locked attributes */
 		*new_mnt_flags |= mnt_flags & (MNT_LOCK_READONLY | \
-					       MNT_LOCK_NODEV    | \
-					       MNT_LOCK_NOSUID   | \
-					       MNT_LOCK_NOEXEC   | \
 					       MNT_LOCK_ATIME);
 		visible = true;
 		goto found;
@@ -3292,6 +3276,7 @@ found:
 
 static bool mount_too_revealing(struct vfsmount *mnt, int *new_mnt_flags)
 {
+	const unsigned long required_iflags = SB_I_NOEXEC | SB_I_NODEV;
 	struct mnt_namespace *ns = current->nsproxy->mnt_ns;
 	unsigned long s_iflags;
 
@@ -3303,6 +3288,12 @@ static bool mount_too_revealing(struct vfsmount *mnt, int *new_mnt_flags)
 	if (!(s_iflags & SB_I_USERNS_VISIBLE))
 		return false;
 
+	if ((s_iflags & required_iflags) != required_iflags) {
+		WARN_ONCE(1, "Expected s_iflags to contain 0x%lx\n",
+			  required_iflags);
+		return true;
+	}
+
 	return !mnt_already_visible(ns, mnt, new_mnt_flags);
 }
 
-- 
cgit v1.2.1


From 67690f937c38bbab1d94cb45f6a32e61612834ae Mon Sep 17 00:00:00 2001
From: "Eric W. Biederman" <ebiederm@xmission.com>
Date: Wed, 18 May 2016 13:50:06 -0500
Subject: userns: Remove implicit MNT_NODEV fragility.

Replace the implict setting of MNT_NODEV on mounts that happen with
just user namespace permissions with an implicit setting of SB_I_NODEV
in s_iflags.  The visibility of the implicit MNT_NODEV has caused
problems in the past.

With this change the fragile case where an implicit MNT_NODEV needs to
be preserved in do_remount is removed.  Using SB_I_NODEV is much less
fragile as s_iflags are set during the original mount and never
changed.

In do_new_mount with the implicit setting of MNT_NODEV gone, the only
code that can affect mnt_flags is fs_fully_visible so simplify the if
statement and reduce the indentation of the code to make that clear.

Acked-by: Seth Forshee <seth.forshee@canonical.com>
Signed-off-by: "Eric W. Biederman" <ebiederm@xmission.com>
---
 fs/namespace.c | 19 +------------------
 fs/super.c     |  3 +++
 2 files changed, 4 insertions(+), 18 deletions(-)

(limited to 'fs')

diff --git a/fs/namespace.c b/fs/namespace.c
index b1da7f8182c4..9786a38d1681 100644
--- a/fs/namespace.c
+++ b/fs/namespace.c
@@ -2185,13 +2185,7 @@ static int do_remount(struct path *path, int flags, int mnt_flags,
 	}
 	if ((mnt->mnt.mnt_flags & MNT_LOCK_NODEV) &&
 	    !(mnt_flags & MNT_NODEV)) {
-		/* Was the nodev implicitly added in mount? */
-		if ((mnt->mnt_ns->user_ns != &init_user_ns) &&
-		    !(sb->s_type->fs_flags & FS_USERNS_DEV_MOUNT)) {
-			mnt_flags |= MNT_NODEV;
-		} else {
-			return -EPERM;
-		}
+		return -EPERM;
 	}
 	if ((mnt->mnt.mnt_flags & MNT_LOCK_NOSUID) &&
 	    !(mnt_flags & MNT_NOSUID)) {
@@ -2385,7 +2379,6 @@ static int do_new_mount(struct path *path, const char *fstype, int flags,
 			int mnt_flags, const char *name, void *data)
 {
 	struct file_system_type *type;
-	struct user_namespace *user_ns = current->nsproxy->mnt_ns->user_ns;
 	struct vfsmount *mnt;
 	int err;
 
@@ -2396,16 +2389,6 @@ static int do_new_mount(struct path *path, const char *fstype, int flags,
 	if (!type)
 		return -ENODEV;
 
-	if (user_ns != &init_user_ns) {
-		/* Only in special cases allow devices from mounts
-		 * created outside the initial user namespace.
-		 */
-		if (!(type->fs_flags & FS_USERNS_DEV_MOUNT)) {
-			flags |= MS_NODEV;
-			mnt_flags |= MNT_NODEV | MNT_LOCK_NODEV;
-		}
-	}
-
 	mnt = vfs_kern_mount(type, flags, name, data);
 	if (!IS_ERR(mnt) && (type->fs_flags & FS_HAS_SUBTYPE) &&
 	    !mnt->mnt_sb->s_subtype)
diff --git a/fs/super.c b/fs/super.c
index 78790ada7191..25cdceed2ad3 100644
--- a/fs/super.c
+++ b/fs/super.c
@@ -206,6 +206,9 @@ static struct super_block *alloc_super(struct file_system_type *type, int flags,
 	init_waitqueue_head(&s->s_writers.wait_unfrozen);
 	s->s_bdi = &noop_backing_dev_info;
 	s->s_flags = flags;
+	if ((s->s_user_ns != &init_user_ns) &&
+	    !(type->fs_flags & FS_USERNS_DEV_MOUNT))
+		s->s_iflags |= SB_I_NODEV;
 	INIT_HLIST_NODE(&s->s_instances);
 	INIT_HLIST_BL_HEAD(&s->s_anon);
 	mutex_init(&s->s_sync_lock);
-- 
cgit v1.2.1


From cc50a07a247e17db76b1f0b0ca06652556e04fa3 Mon Sep 17 00:00:00 2001
From: "Eric W. Biederman" <ebiederm@xmission.com>
Date: Thu, 9 Jun 2016 15:44:48 -0500
Subject: userns: Remove the now unnecessary FS_USERNS_DEV_MOUNT flag

Now that SB_I_NODEV controls the nodev behavior devpts can just clear
this flag during mount.  Simplifying the code and making it easier
to audit how the code works.  While still preserving the invariant
that s_iflags is only modified during mount.

Acked-by: Seth Forshee <seth.forshee@canonical.com>
Signed-off-by: "Eric W. Biederman" <ebiederm@xmission.com>
---
 fs/devpts/inode.c | 3 ++-
 fs/super.c        | 3 +--
 2 files changed, 3 insertions(+), 3 deletions(-)

(limited to 'fs')

diff --git a/fs/devpts/inode.c b/fs/devpts/inode.c
index 37c134a132c7..d116453b0276 100644
--- a/fs/devpts/inode.c
+++ b/fs/devpts/inode.c
@@ -396,6 +396,7 @@ devpts_fill_super(struct super_block *s, void *data, int silent)
 {
 	struct inode *inode;
 
+	s->s_iflags &= ~SB_I_NODEV;
 	s->s_blocksize = 1024;
 	s->s_blocksize_bits = 10;
 	s->s_magic = DEVPTS_SUPER_MAGIC;
@@ -480,7 +481,7 @@ static struct file_system_type devpts_fs_type = {
 	.name		= "devpts",
 	.mount		= devpts_mount,
 	.kill_sb	= devpts_kill_sb,
-	.fs_flags	= FS_USERNS_MOUNT | FS_USERNS_DEV_MOUNT,
+	.fs_flags	= FS_USERNS_MOUNT,
 };
 
 /*
diff --git a/fs/super.c b/fs/super.c
index 25cdceed2ad3..37813bf479cf 100644
--- a/fs/super.c
+++ b/fs/super.c
@@ -206,8 +206,7 @@ static struct super_block *alloc_super(struct file_system_type *type, int flags,
 	init_waitqueue_head(&s->s_writers.wait_unfrozen);
 	s->s_bdi = &noop_backing_dev_info;
 	s->s_flags = flags;
-	if ((s->s_user_ns != &init_user_ns) &&
-	    !(type->fs_flags & FS_USERNS_DEV_MOUNT))
+	if (s->s_user_ns != &init_user_ns)
 		s->s_iflags |= SB_I_NODEV;
 	INIT_HLIST_NODE(&s->s_instances);
 	INIT_HLIST_BL_HEAD(&s->s_anon);
-- 
cgit v1.2.1


From 380cf5ba6b0a0b307f4afb62b186ca801defb203 Mon Sep 17 00:00:00 2001
From: Andy Lutomirski <luto@amacapital.net>
Date: Thu, 23 Jun 2016 16:41:05 -0500
Subject: fs: Treat foreign mounts as nosuid

If a process gets access to a mount from a different user
namespace, that process should not be able to take advantage of
setuid files or selinux entrypoints from that filesystem.  Prevent
this by treating mounts from other mount namespaces and those not
owned by current_user_ns() or an ancestor as nosuid.

This will make it safer to allow more complex filesystems to be
mounted in non-root user namespaces.

This does not remove the need for MNT_LOCK_NOSUID.  The setuid,
setgid, and file capability bits can no longer be abused if code in
a user namespace were to clear nosuid on an untrusted filesystem,
but this patch, by itself, is insufficient to protect the system
from abuse of files that, when execed, would increase MAC privilege.

As a more concrete explanation, any task that can manipulate a
vfsmount associated with a given user namespace already has
capabilities in that namespace and all of its descendents.  If they
can cause a malicious setuid, setgid, or file-caps executable to
appear in that mount, then that executable will only allow them to
elevate privileges in exactly the set of namespaces in which they
are already privileges.

On the other hand, if they can cause a malicious executable to
appear with a dangerous MAC label, running it could change the
caller's security context in a way that should not have been
possible, even inside the namespace in which the task is confined.

As a hardening measure, this would have made CVE-2014-5207 much
more difficult to exploit.

Signed-off-by: Andy Lutomirski <luto@amacapital.net>
Signed-off-by: Seth Forshee <seth.forshee@canonical.com>
Acked-by: James Morris <james.l.morris@oracle.com>
Acked-by: Serge Hallyn <serge.hallyn@canonical.com>
Signed-off-by: Eric W. Biederman <ebiederm@xmission.com>
---
 fs/exec.c      |  2 +-
 fs/namespace.c | 13 +++++++++++++
 2 files changed, 14 insertions(+), 1 deletion(-)

(limited to 'fs')

diff --git a/fs/exec.c b/fs/exec.c
index 887c1c955df8..ca239fc86d8d 100644
--- a/fs/exec.c
+++ b/fs/exec.c
@@ -1411,7 +1411,7 @@ static void bprm_fill_uid(struct linux_binprm *bprm)
 	bprm->cred->euid = current_euid();
 	bprm->cred->egid = current_egid();
 
-	if (bprm->file->f_path.mnt->mnt_flags & MNT_NOSUID)
+	if (!mnt_may_suid(bprm->file->f_path.mnt))
 		return;
 
 	if (task_no_new_privs(current))
diff --git a/fs/namespace.c b/fs/namespace.c
index 9786a38d1681..aabe8e397fc3 100644
--- a/fs/namespace.c
+++ b/fs/namespace.c
@@ -3280,6 +3280,19 @@ static bool mount_too_revealing(struct vfsmount *mnt, int *new_mnt_flags)
 	return !mnt_already_visible(ns, mnt, new_mnt_flags);
 }
 
+bool mnt_may_suid(struct vfsmount *mnt)
+{
+	/*
+	 * Foreign mounts (accessed via fchdir or through /proc
+	 * symlinks) are always treated as if they are nosuid.  This
+	 * prevents namespaces from trusting potentially unsafe
+	 * suid/sgid bits, file caps, or security labels that originate
+	 * in other namespaces.
+	 */
+	return !(mnt->mnt_flags & MNT_NOSUID) && check_mnt(real_mount(mnt)) &&
+	       current_in_userns(mnt->mnt_sb->s_user_ns);
+}
+
 static struct ns_common *mntns_get(struct task_struct *task)
 {
 	struct ns_common *ns = NULL;
-- 
cgit v1.2.1


From a475acf01f79e89a1a5845733e10108d80f77188 Mon Sep 17 00:00:00 2001
From: Seth Forshee <seth.forshee@canonical.com>
Date: Tue, 26 Apr 2016 14:36:25 -0500
Subject: fs: Refuse uid/gid changes which don't map into s_user_ns

Add checks to notify_change to verify that uid and gid changes
will map into the superblock's user namespace. If they do not
fail with -EOVERFLOW.

This is mandatory so that fileystems don't have to even think
of dealing with ia_uid and ia_gid that

--EWB Moved the test from inode_change_ok to notify_change

Signed-off-by: Seth Forshee <seth.forshee@canonical.com>
Acked-by: Serge Hallyn <serge.hallyn@canonical.com>
Signed-off-by: Eric W. Biederman <ebiederm@xmission.com>
---
 fs/attr.c | 11 +++++++++++
 1 file changed, 11 insertions(+)

(limited to 'fs')

diff --git a/fs/attr.c b/fs/attr.c
index 25b24d0f6c88..dd723578ddce 100644
--- a/fs/attr.c
+++ b/fs/attr.c
@@ -255,6 +255,17 @@ int notify_change(struct dentry * dentry, struct iattr * attr, struct inode **de
 	if (!(attr->ia_valid & ~(ATTR_KILL_SUID | ATTR_KILL_SGID)))
 		return 0;
 
+	/*
+	 * Verify that uid/gid changes are valid in the target
+	 * namespace of the superblock.
+	 */
+	if (ia_valid & ATTR_UID &&
+	    !kuid_has_mapping(inode->i_sb->s_user_ns, attr->ia_uid))
+		return -EOVERFLOW;
+	if (ia_valid & ATTR_GID &&
+	    !kgid_has_mapping(inode->i_sb->s_user_ns, attr->ia_gid))
+		return -EOVERFLOW;
+
 	error = security_inode_setattr(dentry, attr);
 	if (error)
 		return error;
-- 
cgit v1.2.1


From 0d4d717f25834134bb6f43284f84c8ccee5bbf2a Mon Sep 17 00:00:00 2001
From: "Eric W. Biederman" <ebiederm@xmission.com>
Date: Mon, 27 Jun 2016 16:04:06 -0500
Subject: vfs: Verify acls are valid within superblock's s_user_ns.

Update posix_acl_valid to verify that an acl is within a user namespace.

Update the callers of posix_acl_valid to pass in an appropriate
user namespace.  For posix_acl_xattr_set and v9fs_xattr_set_acl pass in
inode->i_sb->s_user_ns to posix_acl_valid.  For md_unpack_acl pass in
&init_user_ns as no inode or superblock is in sight.

Acked-by: Seth Forshee <seth.forshee@canonical.com>
Signed-off-by: "Eric W. Biederman" <ebiederm@xmission.com>
---
 fs/9p/acl.c    | 2 +-
 fs/posix_acl.c | 8 ++++----
 2 files changed, 5 insertions(+), 5 deletions(-)

(limited to 'fs')

diff --git a/fs/9p/acl.c b/fs/9p/acl.c
index 0576eaeb60b9..5b6a1743ea17 100644
--- a/fs/9p/acl.c
+++ b/fs/9p/acl.c
@@ -266,7 +266,7 @@ static int v9fs_xattr_set_acl(const struct xattr_handler *handler,
 		if (IS_ERR(acl))
 			return PTR_ERR(acl);
 		else if (acl) {
-			retval = posix_acl_valid(acl);
+			retval = posix_acl_valid(inode->i_sb->s_user_ns, acl);
 			if (retval)
 				goto err_out;
 		}
diff --git a/fs/posix_acl.c b/fs/posix_acl.c
index 8a4a266beff3..647c28180675 100644
--- a/fs/posix_acl.c
+++ b/fs/posix_acl.c
@@ -205,7 +205,7 @@ posix_acl_clone(const struct posix_acl *acl, gfp_t flags)
  * Check if an acl is valid. Returns 0 if it is, or -E... otherwise.
  */
 int
-posix_acl_valid(const struct posix_acl *acl)
+posix_acl_valid(struct user_namespace *user_ns, const struct posix_acl *acl)
 {
 	const struct posix_acl_entry *pa, *pe;
 	int state = ACL_USER_OBJ;
@@ -225,7 +225,7 @@ posix_acl_valid(const struct posix_acl *acl)
 			case ACL_USER:
 				if (state != ACL_USER)
 					return -EINVAL;
-				if (!uid_valid(pa->e_uid))
+				if (!kuid_has_mapping(user_ns, pa->e_uid))
 					return -EINVAL;
 				needs_mask = 1;
 				break;
@@ -240,7 +240,7 @@ posix_acl_valid(const struct posix_acl *acl)
 			case ACL_GROUP:
 				if (state != ACL_GROUP)
 					return -EINVAL;
-				if (!gid_valid(pa->e_gid))
+				if (!kgid_has_mapping(user_ns, pa->e_gid))
 					return -EINVAL;
 				needs_mask = 1;
 				break;
@@ -845,7 +845,7 @@ posix_acl_xattr_set(const struct xattr_handler *handler,
 			return PTR_ERR(acl);
 
 		if (acl) {
-			ret = posix_acl_valid(acl);
+			ret = posix_acl_valid(inode->i_sb->s_user_ns, acl);
 			if (ret)
 				goto out;
 		}
-- 
cgit v1.2.1


From 2d7f9e2ad35e4e7a3086231f19bfab33c6a8a64a Mon Sep 17 00:00:00 2001
From: Seth Forshee <seth.forshee@canonical.com>
Date: Tue, 26 Apr 2016 14:36:23 -0500
Subject: fs: Check for invalid i_uid in may_follow_link()

Filesystem uids which don't map into a user namespace may result
in inode->i_uid being INVALID_UID. A symlink and its parent
could have different owners in the filesystem can both get
mapped to INVALID_UID, which may result in following a symlink
when this would not have otherwise been permitted when protected
symlinks are enabled.

Signed-off-by: Seth Forshee <seth.forshee@canonical.com>
Acked-by: Serge Hallyn <serge.hallyn@canonical.com>
Signed-off-by: Eric W. Biederman <ebiederm@xmission.com>
---
 fs/namei.c | 4 +++-
 1 file changed, 3 insertions(+), 1 deletion(-)

(limited to 'fs')

diff --git a/fs/namei.c b/fs/namei.c
index 757a32725d92..8701bd9a5270 100644
--- a/fs/namei.c
+++ b/fs/namei.c
@@ -901,6 +901,7 @@ static inline int may_follow_link(struct nameidata *nd)
 {
 	const struct inode *inode;
 	const struct inode *parent;
+	kuid_t puid;
 
 	if (!sysctl_protected_symlinks)
 		return 0;
@@ -916,7 +917,8 @@ static inline int may_follow_link(struct nameidata *nd)
 		return 0;
 
 	/* Allowed if parent directory and link owner match. */
-	if (uid_eq(parent->i_uid, inode->i_uid))
+	puid = parent->i_uid;
+	if (uid_valid(puid) && uid_eq(puid, inode->i_uid))
 		return 0;
 
 	if (nd->flags & LOOKUP_RCU)
-- 
cgit v1.2.1


From 0bd23d09b874e53bd1a2fe2296030aa2720d7b08 Mon Sep 17 00:00:00 2001
From: "Eric W. Biederman" <ebiederm@xmission.com>
Date: Wed, 29 Jun 2016 14:54:46 -0500
Subject: vfs: Don't modify inodes with a uid or gid unknown to the vfs

When a filesystem outside of init_user_ns is mounted it could have
uids and gids stored in it that do not map to init_user_ns.

The plan is to allow those filesystems to set i_uid to INVALID_UID and
i_gid to INVALID_GID for unmapped uids and gids and then to handle
that strange case in the vfs to ensure there is consistent robust
handling of the weirdness.

Upon a careful review of the vfs and filesystems about the only case
where there is any possibility of confusion or trouble is when the
inode is written back to disk.  In that case filesystems typically
read the inode->i_uid and inode->i_gid and write them to disk even
when just an inode timestamp is being updated.

Which leads to a rule that is very simple to implement and understand
inodes whose i_uid or i_gid is not valid may not be written.

In dealing with access times this means treat those inodes as if the
inode flag S_NOATIME was set.  Reads of the inodes appear safe and
useful, but any write or modification is disallowed.  The only inode
write that is allowed is a chown that sets the uid and gid on the
inode to valid values.  After such a chown the inode is normal and may
be treated as such.

Denying all writes to inodes with uids or gids unknown to the vfs also
prevents several oddball cases where corruption would have occurred
because the vfs does not have complete information.

One problem case that is prevented is attempting to use the gid of a
directory for new inodes where the directories sgid bit is set but the
directories gid is not mapped.

Another problem case avoided is attempting to update the evm hash
after setxattr, removexattr, and setattr.  As the evm hash includeds
the inode->i_uid or inode->i_gid not knowning the uid or gid prevents
a correct evm hash from being computed.  evm hash verification also
fails when i_uid or i_gid is unknown but that is essentially harmless
as it does not cause filesystem corruption.

Acked-by: Seth Forshee <seth.forshee@canonical.com>
Signed-off-by: "Eric W. Biederman" <ebiederm@xmission.com>
---
 fs/attr.c  |  8 ++++++++
 fs/inode.c |  7 +++++++
 fs/namei.c | 26 +++++++++++++++++++++-----
 fs/xattr.c |  7 +++++++
 4 files changed, 43 insertions(+), 5 deletions(-)

(limited to 'fs')

diff --git a/fs/attr.c b/fs/attr.c
index dd723578ddce..42bb42bb3c72 100644
--- a/fs/attr.c
+++ b/fs/attr.c
@@ -266,6 +266,14 @@ int notify_change(struct dentry * dentry, struct iattr * attr, struct inode **de
 	    !kgid_has_mapping(inode->i_sb->s_user_ns, attr->ia_gid))
 		return -EOVERFLOW;
 
+	/* Don't allow modifications of files with invalid uids or
+	 * gids unless those uids & gids are being made valid.
+	 */
+	if (!(ia_valid & ATTR_UID) && !uid_valid(inode->i_uid))
+		return -EOVERFLOW;
+	if (!(ia_valid & ATTR_GID) && !gid_valid(inode->i_gid))
+		return -EOVERFLOW;
+
 	error = security_inode_setattr(dentry, attr);
 	if (error)
 		return error;
diff --git a/fs/inode.c b/fs/inode.c
index 4ccbc21b30ce..c0ebb97fb085 100644
--- a/fs/inode.c
+++ b/fs/inode.c
@@ -1617,6 +1617,13 @@ bool atime_needs_update(const struct path *path, struct inode *inode)
 
 	if (inode->i_flags & S_NOATIME)
 		return false;
+
+	/* Atime updates will likely cause i_uid and i_gid to be written
+	 * back improprely if their true value is unknown to the vfs.
+	 */
+	if (HAS_UNMAPPED_ID(inode))
+		return false;
+
 	if (IS_NOATIME(inode))
 		return false;
 	if ((inode->i_sb->s_flags & MS_NODIRATIME) && S_ISDIR(inode->i_mode))
diff --git a/fs/namei.c b/fs/namei.c
index 8701bd9a5270..840201c4c290 100644
--- a/fs/namei.c
+++ b/fs/namei.c
@@ -410,6 +410,14 @@ int __inode_permission(struct inode *inode, int mask)
 		 */
 		if (IS_IMMUTABLE(inode))
 			return -EACCES;
+
+		/*
+		 * Updating mtime will likely cause i_uid and i_gid to be
+		 * written back improperly if their true value is unknown
+		 * to the vfs.
+		 */
+		if (HAS_UNMAPPED_ID(inode))
+			return -EACCES;
 	}
 
 	retval = do_inode_permission(inode, mask);
@@ -2759,10 +2767,11 @@ EXPORT_SYMBOL(__check_sticky);
  *	c. have CAP_FOWNER capability
  *  6. If the victim is append-only or immutable we can't do antyhing with
  *     links pointing to it.
- *  7. If we were asked to remove a directory and victim isn't one - ENOTDIR.
- *  8. If we were asked to remove a non-directory and victim isn't one - EISDIR.
- *  9. We can't remove a root or mountpoint.
- * 10. We don't allow removal of NFS sillyrenamed files; it's handled by
+ *  7. If the victim has an unknown uid or gid we can't change the inode.
+ *  8. If we were asked to remove a directory and victim isn't one - ENOTDIR.
+ *  9. If we were asked to remove a non-directory and victim isn't one - EISDIR.
+ * 10. We can't remove a root or mountpoint.
+ * 11. We don't allow removal of NFS sillyrenamed files; it's handled by
  *     nfs_async_unlink().
  */
 static int may_delete(struct inode *dir, struct dentry *victim, bool isdir)
@@ -2784,7 +2793,7 @@ static int may_delete(struct inode *dir, struct dentry *victim, bool isdir)
 		return -EPERM;
 
 	if (check_sticky(dir, inode) || IS_APPEND(inode) ||
-	    IS_IMMUTABLE(inode) || IS_SWAPFILE(inode))
+	    IS_IMMUTABLE(inode) || IS_SWAPFILE(inode) || HAS_UNMAPPED_ID(inode))
 		return -EPERM;
 	if (isdir) {
 		if (!d_is_dir(victim))
@@ -4190,6 +4199,13 @@ int vfs_link(struct dentry *old_dentry, struct inode *dir, struct dentry *new_de
 	 */
 	if (IS_APPEND(inode) || IS_IMMUTABLE(inode))
 		return -EPERM;
+	/*
+	 * Updating the link count will likely cause i_uid and i_gid to
+	 * be writen back improperly if their true value is unknown to
+	 * the vfs.
+	 */
+	if (HAS_UNMAPPED_ID(inode))
+		return -EPERM;
 	if (!dir->i_op->link)
 		return -EPERM;
 	if (S_ISDIR(inode->i_mode))
diff --git a/fs/xattr.c b/fs/xattr.c
index 4beafc43daa5..c243905835ab 100644
--- a/fs/xattr.c
+++ b/fs/xattr.c
@@ -38,6 +38,13 @@ xattr_permission(struct inode *inode, const char *name, int mask)
 	if (mask & MAY_WRITE) {
 		if (IS_IMMUTABLE(inode) || IS_APPEND(inode))
 			return -EPERM;
+		/*
+		 * Updating an xattr will likely cause i_uid and i_gid
+		 * to be writen back improperly if their true value is
+		 * unknown to the vfs.
+		 */
+		if (HAS_UNMAPPED_ID(inode))
+			return -EPERM;
 	}
 
 	/*
-- 
cgit v1.2.1


From 036d523641c66bef713042894a17f4335f199e49 Mon Sep 17 00:00:00 2001
From: "Eric W. Biederman" <ebiederm@xmission.com>
Date: Fri, 1 Jul 2016 12:52:06 -0500
Subject: vfs: Don't create inodes with a uid or gid unknown to the vfs

It is expected that filesystems can not represent uids and gids from
outside of their user namespace.  Keep things simple by not even
trying to create filesystem nodes with non-sense uids and gids.

Acked-by: Seth Forshee <seth.forshee@canonical.com>
Signed-off-by: "Eric W. Biederman" <ebiederm@xmission.com>
---
 fs/namei.c | 10 ++++++++--
 1 file changed, 8 insertions(+), 2 deletions(-)

(limited to 'fs')

diff --git a/fs/namei.c b/fs/namei.c
index 840201c4c290..629823f19a6a 100644
--- a/fs/namei.c
+++ b/fs/namei.c
@@ -2814,16 +2814,22 @@ static int may_delete(struct inode *dir, struct dentry *victim, bool isdir)
  *  1. We can't do it if child already exists (open has special treatment for
  *     this case, but since we are inlined it's OK)
  *  2. We can't do it if dir is read-only (done in permission())
- *  3. We should have write and exec permissions on dir
- *  4. We can't do it if dir is immutable (done in permission())
+ *  3. We can't do it if the fs can't represent the fsuid or fsgid.
+ *  4. We should have write and exec permissions on dir
+ *  5. We can't do it if dir is immutable (done in permission())
  */
 static inline int may_create(struct inode *dir, struct dentry *child)
 {
+	struct user_namespace *s_user_ns;
 	audit_inode_child(dir, child, AUDIT_TYPE_CHILD_CREATE);
 	if (child->d_inode)
 		return -EEXIST;
 	if (IS_DEADDIR(dir))
 		return -ENOENT;
+	s_user_ns = dir->i_sb->s_user_ns;
+	if (!kuid_has_mapping(s_user_ns, current_fsuid()) ||
+	    !kgid_has_mapping(s_user_ns, current_fsgid()))
+		return -EOVERFLOW;
 	return inode_permission(dir, MAY_WRITE | MAY_EXEC);
 }
 
-- 
cgit v1.2.1


From d49d37624a1931c2f3b5d0cbe95bd5181cbdc279 Mon Sep 17 00:00:00 2001
From: "Eric W. Biederman" <ebiederm@xmission.com>
Date: Thu, 30 Jun 2016 16:31:01 -0500
Subject: quota: Ensure qids map to the filesystem

Introduce the helper qid_has_mapping and use it to ensure that the
quota system only considers qids that map to the filesystems
s_user_ns.

In practice for quota supporting filesystems today this is the exact
same check as qid_valid.  As only 0xffffffff aka (qid_t)-1 does not
map into init_user_ns.

Replace the qid_valid calls with qid_has_mapping as values come in
from userspace.  This is harmless today and it prepares the quota
system to work on filesystems with quotas but mounted by unprivileged
users.

Call qid_has_mapping from dqget.  This ensures the passed in qid has a
prepresentation on the underlying filesystem.  Previously this was
unnecessary as filesystesm never had qids that could not map.  With
the introduction of filesystems outside of s_user_ns this will not
remain true.

All of this ensures the quota code never has to deal with qids that
don't map to the underlying filesystem.

Cc: Jan Kara <jack@suse.cz>
Acked-by: Seth Forshee <seth.forshee@canonical.com>
Signed-off-by: "Eric W. Biederman" <ebiederm@xmission.com>
---
 fs/quota/dquot.c |  3 +++
 fs/quota/quota.c | 12 ++++++------
 2 files changed, 9 insertions(+), 6 deletions(-)

(limited to 'fs')

diff --git a/fs/quota/dquot.c b/fs/quota/dquot.c
index ff21980d0119..74706b6aa747 100644
--- a/fs/quota/dquot.c
+++ b/fs/quota/dquot.c
@@ -841,6 +841,9 @@ struct dquot *dqget(struct super_block *sb, struct kqid qid)
 	unsigned int hashent = hashfn(sb, qid);
 	struct dquot *dquot, *empty = NULL;
 
+	if (!qid_has_mapping(sb->s_user_ns, qid))
+		return ERR_PTR(-EINVAL);
+
         if (!sb_has_quota_active(sb, qid.type))
 		return ERR_PTR(-ESRCH);
 we_slept:
diff --git a/fs/quota/quota.c b/fs/quota/quota.c
index 0f10ee9892ce..73f6f4cf0a21 100644
--- a/fs/quota/quota.c
+++ b/fs/quota/quota.c
@@ -211,7 +211,7 @@ static int quota_getquota(struct super_block *sb, int type, qid_t id,
 	if (!sb->s_qcop->get_dqblk)
 		return -ENOSYS;
 	qid = make_kqid(current_user_ns(), type, id);
-	if (!qid_valid(qid))
+	if (!qid_has_mapping(sb->s_user_ns, qid))
 		return -EINVAL;
 	ret = sb->s_qcop->get_dqblk(sb, qid, &fdq);
 	if (ret)
@@ -237,7 +237,7 @@ static int quota_getnextquota(struct super_block *sb, int type, qid_t id,
 	if (!sb->s_qcop->get_nextdqblk)
 		return -ENOSYS;
 	qid = make_kqid(current_user_ns(), type, id);
-	if (!qid_valid(qid))
+	if (!qid_has_mapping(sb->s_user_ns, qid))
 		return -EINVAL;
 	ret = sb->s_qcop->get_nextdqblk(sb, &qid, &fdq);
 	if (ret)
@@ -288,7 +288,7 @@ static int quota_setquota(struct super_block *sb, int type, qid_t id,
 	if (!sb->s_qcop->set_dqblk)
 		return -ENOSYS;
 	qid = make_kqid(current_user_ns(), type, id);
-	if (!qid_valid(qid))
+	if (!qid_has_mapping(sb->s_user_ns, qid))
 		return -EINVAL;
 	copy_from_if_dqblk(&fdq, &idq);
 	return sb->s_qcop->set_dqblk(sb, qid, &fdq);
@@ -581,7 +581,7 @@ static int quota_setxquota(struct super_block *sb, int type, qid_t id,
 	if (!sb->s_qcop->set_dqblk)
 		return -ENOSYS;
 	qid = make_kqid(current_user_ns(), type, id);
-	if (!qid_valid(qid))
+	if (!qid_has_mapping(sb->s_user_ns, qid))
 		return -EINVAL;
 	/* Are we actually setting timer / warning limits for all users? */
 	if (from_kqid(&init_user_ns, qid) == 0 &&
@@ -642,7 +642,7 @@ static int quota_getxquota(struct super_block *sb, int type, qid_t id,
 	if (!sb->s_qcop->get_dqblk)
 		return -ENOSYS;
 	qid = make_kqid(current_user_ns(), type, id);
-	if (!qid_valid(qid))
+	if (!qid_has_mapping(sb->s_user_ns, qid))
 		return -EINVAL;
 	ret = sb->s_qcop->get_dqblk(sb, qid, &qdq);
 	if (ret)
@@ -669,7 +669,7 @@ static int quota_getnextxquota(struct super_block *sb, int type, qid_t id,
 	if (!sb->s_qcop->get_nextdqblk)
 		return -ENOSYS;
 	qid = make_kqid(current_user_ns(), type, id);
-	if (!qid_valid(qid))
+	if (!qid_has_mapping(sb->s_user_ns, qid))
 		return -EINVAL;
 	ret = sb->s_qcop->get_nextdqblk(sb, &qid, &qdq);
 	if (ret)
-- 
cgit v1.2.1


From cfd4c70a18c4e806aaac2f483153dae01e0ace1c Mon Sep 17 00:00:00 2001
From: "Eric W. Biederman" <ebiederm@xmission.com>
Date: Tue, 5 Jul 2016 11:10:57 -0500
Subject: quota: Handle quota data stored in s_user_ns in quota_setxquota

In Q_XSETQLIMIT use sb->s_user_ns to detect when we are dealing with
the filesystems notion of id 0.

Cc: Jan Kara <jack@suse.cz>
Acked-by: Seth Forshee <seth.forshee@canonical.com>
Inspired-by: Seth Forshee <seth.forshee@canonical.com>
Signed-off-by: "Eric W. Biederman" <ebiederm@xmission.com>
---
 fs/quota/quota.c | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

(limited to 'fs')

diff --git a/fs/quota/quota.c b/fs/quota/quota.c
index 73f6f4cf0a21..35df08ee9c97 100644
--- a/fs/quota/quota.c
+++ b/fs/quota/quota.c
@@ -584,7 +584,7 @@ static int quota_setxquota(struct super_block *sb, int type, qid_t id,
 	if (!qid_has_mapping(sb->s_user_ns, qid))
 		return -EINVAL;
 	/* Are we actually setting timer / warning limits for all users? */
-	if (from_kqid(&init_user_ns, qid) == 0 &&
+	if (from_kqid(sb->s_user_ns, qid) == 0 &&
 	    fdq.d_fieldmask & (FS_DQ_WARNS_MASK | FS_DQ_TIMER_MASK)) {
 		struct qc_info qinfo;
 		int ret;
-- 
cgit v1.2.1


From 5c0048280babd579fa9e5f0e787122b06aee3f3b Mon Sep 17 00:00:00 2001
From: "Eric W. Biederman" <ebiederm@xmission.com>
Date: Tue, 5 Jul 2016 10:41:57 -0500
Subject: dquot: For now explicitly don't support filesystems outside of
 init_user_ns

Mostly supporting filesystems outside of init_user_ns is
s/&init_usre_ns/dquot->dq_sb->s_user_ns/.  An actual need for
supporting quotas on filesystems outside of s_user_ns is quite a ways
away and to be done responsibily needs an audit on what can happen
with hostile quota files.  Until that audit is complete don't attempt
to support quota files on filesystems outside of s_user_ns.

Cc: Jan Kara <jack@suse.cz>
Acked-by: Seth Forshee <seth.forshee@canonical.com>
Signed-off-by: "Eric W. Biederman" <ebiederm@xmission.com>
---
 fs/quota/dquot.c | 5 +++++
 1 file changed, 5 insertions(+)

(limited to 'fs')

diff --git a/fs/quota/dquot.c b/fs/quota/dquot.c
index 74706b6aa747..87197d13cc76 100644
--- a/fs/quota/dquot.c
+++ b/fs/quota/dquot.c
@@ -2271,6 +2271,11 @@ static int vfs_load_quota_inode(struct inode *inode, int type, int format_id,
 		error = -EINVAL;
 		goto out_fmt;
 	}
+	/* Filesystems outside of init_user_ns not yet supported */
+	if (sb->s_user_ns != &init_user_ns) {
+		error = -EINVAL;
+		goto out_fmt;
+	}
 	/* Usage always has to be set... */
 	if (!(flags & DQUOT_USAGE_ENABLED)) {
 		error = -EINVAL;
-- 
cgit v1.2.1


From aeaa4a79ff6a5ed912b7362f206cf8576fca538b Mon Sep 17 00:00:00 2001
From: "Eric W. Biederman" <ebiederm@xmission.com>
Date: Sat, 23 Jul 2016 11:20:44 -0500
Subject: fs: Call d_automount with the filesystems creds

Seth Forshee reported a mount regression in nfs autmounts
with "fs: Add user namespace member to struct super_block".

It turns out that the assumption that current->cred is something
reasonable during mount while necessary to improve support of
unprivileged mounts is wrong in the automount path.

To fix the existing filesystems override current->cred with the
init_cred before calling d_automount and restore current->cred after
d_automount completes.

To support unprivileged mounts would require a more nuanced cred
selection, so fail on unprivileged mounts for the time being.  As none
of the filesystems that currently set FS_USERNS_MOUNT implement
d_automount this check is only good for preventing future problems.

Fixes: 6e4eab577a0c ("fs: Add user namespace member to struct super_block")
Tested-by: Seth Forshee <seth.forshee@canonical.com>
Signed-off-by: "Eric W. Biederman" <ebiederm@xmission.com>
---
 fs/namei.c | 7 +++++++
 1 file changed, 7 insertions(+)

(limited to 'fs')

diff --git a/fs/namei.c b/fs/namei.c
index 629823f19a6a..ef573df3297f 100644
--- a/fs/namei.c
+++ b/fs/namei.c
@@ -36,6 +36,7 @@
 #include <linux/posix_acl.h>
 #include <linux/hash.h>
 #include <linux/bitops.h>
+#include <linux/init_task.h>
 #include <asm/uaccess.h>
 
 #include "internal.h"
@@ -1099,6 +1100,7 @@ static int follow_automount(struct path *path, struct nameidata *nd,
 			    bool *need_mntput)
 {
 	struct vfsmount *mnt;
+	const struct cred *old_cred;
 	int err;
 
 	if (!path->dentry->d_op || !path->dentry->d_op->d_automount)
@@ -1120,11 +1122,16 @@ static int follow_automount(struct path *path, struct nameidata *nd,
 	    path->dentry->d_inode)
 		return -EISDIR;
 
+	if (path->dentry->d_sb->s_user_ns != &init_user_ns)
+		return -EACCES;
+
 	nd->total_link_count++;
 	if (nd->total_link_count >= 40)
 		return -ELOOP;
 
+	old_cred = override_creds(&init_cred);
 	mnt = path->dentry->d_op->d_automount(path);
+	revert_creds(old_cred);
 	if (IS_ERR(mnt)) {
 		/*
 		 * The filesystem is allowed to return -EISDIR here to indicate
-- 
cgit v1.2.1