From c7da82b894e9eef60a04a15f065a8502341bf13b Mon Sep 17 00:00:00 2001 From: Dan Williams Date: Wed, 29 Nov 2017 16:10:18 -0800 Subject: mm: replace pmd_write with pmd_access_permitted in fault + gup paths MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit The 'access_permitted' helper is used in the gup-fast path and goes beyond the simple _PAGE_RW check to also: - validate that the mapping is writable from a protection keys standpoint - validate that the pte has _PAGE_USER set since all fault paths where pmd_write is must be referencing user-memory. Link: http://lkml.kernel.org/r/151043111049.2842.15241454964150083466.stgit@dwillia2-desk3.amr.corp.intel.com Signed-off-by: Dan Williams Cc: Dave Hansen Cc: Kirill A. Shutemov Cc: "Jérôme Glisse" Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- fs/dax.c | 3 ++- 1 file changed, 2 insertions(+), 1 deletion(-) (limited to 'fs') diff --git a/fs/dax.c b/fs/dax.c index 95981591977a..78b72c48374e 100644 --- a/fs/dax.c +++ b/fs/dax.c @@ -627,7 +627,8 @@ static void dax_mapping_entry_mkclean(struct address_space *mapping, if (pfn != pmd_pfn(*pmdp)) goto unlock_pmd; - if (!pmd_dirty(*pmdp) && !pmd_write(*pmdp)) + if (!pmd_dirty(*pmdp) + && !pmd_access_permitted(*pmdp, WRITE)) goto unlock_pmd; flush_cache_page(vma, address, pfn); -- cgit v1.2.1 From 04e35f4495dd560db30c25efca4eecae8ec8c375 Mon Sep 17 00:00:00 2001 From: Kees Cook Date: Wed, 29 Nov 2017 16:10:51 -0800 Subject: exec: avoid RLIMIT_STACK races with prlimit() While the defense-in-depth RLIMIT_STACK limit on setuid processes was protected against races from other threads calling setrlimit(), I missed protecting it against races from external processes calling prlimit(). This adds locking around the change and makes sure that rlim_max is set too. Link: http://lkml.kernel.org/r/20171127193457.GA11348@beast Fixes: 64701dee4178e ("exec: Use sane stack rlimit under secureexec") Signed-off-by: Kees Cook Reported-by: Ben Hutchings Reported-by: Brad Spengler Acked-by: Serge Hallyn Cc: James Morris Cc: Andy Lutomirski Cc: Oleg Nesterov Cc: Jiri Slaby Cc: Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- fs/exec.c | 7 ++++++- 1 file changed, 6 insertions(+), 1 deletion(-) (limited to 'fs') diff --git a/fs/exec.c b/fs/exec.c index 1d6243d9f2b6..6be2aa0ab26f 100644 --- a/fs/exec.c +++ b/fs/exec.c @@ -1340,10 +1340,15 @@ void setup_new_exec(struct linux_binprm * bprm) * avoid bad behavior from the prior rlimits. This has to * happen before arch_pick_mmap_layout(), which examines * RLIMIT_STACK, but after the point of no return to avoid - * needing to clean up the change on failure. + * races from other threads changing the limits. This also + * must be protected from races with prlimit() calls. */ + task_lock(current->group_leader); if (current->signal->rlim[RLIMIT_STACK].rlim_cur > _STK_LIM) current->signal->rlim[RLIMIT_STACK].rlim_cur = _STK_LIM; + if (current->signal->rlim[RLIMIT_STACK].rlim_max > _STK_LIM) + current->signal->rlim[RLIMIT_STACK].rlim_max = _STK_LIM; + task_unlock(current->group_leader); } arch_pick_mmap_layout(current->mm); -- cgit v1.2.1 From d5dabd633922ac5ee5bcc67748f7defb8b211469 Mon Sep 17 00:00:00 2001 From: Jiang Biao Date: Wed, 29 Nov 2017 16:11:01 -0800 Subject: fs/mbcache.c: make count_objects() more robust When running ltp stress test for 7*24 hours, vmscan occasionally emits the following warning continuously: mb_cache_scan+0x0/0x3f0 negative objects to delete nr=-9232265467809300450 ... Tracing shows the freeable(mb_cache_count returns) is -1, which causes the continuous accumulation and overflow of total_scan. This patch makes sure that mb_cache_count() cannot return a negative value, which makes the mbcache shrinker more robust. Link: http://lkml.kernel.org/r/1511753419-52328-1-git-send-email-jiang.biao2@zte.com.cn Signed-off-by: Jiang Biao Cc: Al Viro Cc: Minchan Kim Cc: Michal Hocko Cc: Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- fs/mbcache.c | 3 +++ 1 file changed, 3 insertions(+) (limited to 'fs') diff --git a/fs/mbcache.c b/fs/mbcache.c index d818fd236787..b8b8b9ced9f8 100644 --- a/fs/mbcache.c +++ b/fs/mbcache.c @@ -269,6 +269,9 @@ static unsigned long mb_cache_count(struct shrinker *shrink, struct mb_cache *cache = container_of(shrink, struct mb_cache, c_shrink); + /* Unlikely, but not impossible */ + if (unlikely(cache->c_entry_count < 0)) + return 0; return cache->c_entry_count; } -- cgit v1.2.1 From b6e8e12c0aeb5fbf1bf46c84d58cc93aedede385 Mon Sep 17 00:00:00 2001 From: OGAWA Hirofumi Date: Wed, 29 Nov 2017 16:11:19 -0800 Subject: fs/fat/inode.c: fix sb_rdonly() change Commit bc98a42c1f7d ("VFS: Convert sb->s_flags & MS_RDONLY to sb_rdonly(sb)") converted fat_remount():new_rdonly from a bool to an int. However fat_remount() depends upon the compiler's conversion of a non-zero integer into boolean `true'. Fix it by switching `new_rdonly' back into a bool. Link: http://lkml.kernel.org/r/87mv3d5x51.fsf@mail.parknet.co.jp Fixes: bc98a42c1f7d0f8 ("VFS: Convert sb->s_flags & MS_RDONLY to sb_rdonly(sb)") Signed-off-by: OGAWA Hirofumi Cc: Joe Perches Cc: David Howells Cc: Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- fs/fat/inode.c | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) (limited to 'fs') diff --git a/fs/fat/inode.c b/fs/fat/inode.c index 016c46b5e44c..20a0a89eaca5 100644 --- a/fs/fat/inode.c +++ b/fs/fat/inode.c @@ -779,7 +779,7 @@ static void __exit fat_destroy_inodecache(void) static int fat_remount(struct super_block *sb, int *flags, char *data) { - int new_rdonly; + bool new_rdonly; struct msdos_sb_info *sbi = MSDOS_SB(sb); *flags |= SB_NODIRATIME | (sbi->options.isvfat ? 0 : SB_NOATIME); -- cgit v1.2.1 From 43694d4bf843ddd34519e8e9de983deefeada699 Mon Sep 17 00:00:00 2001 From: Ian Kent Date: Wed, 29 Nov 2017 16:11:23 -0800 Subject: autofs: revert "autofs: take more care to not update last_used on path walk" While commit 092a53452bb7 ("autofs: take more care to not update last_used on path walk") helped (partially) resolve a problem where automounts were not expiring due to aggressive accesses from user space it has a side effect for very large environments. This change helps with the expire problem by making the expire more aggressive but, for very large environments, that means more mount requests from clients. When there are a lot of clients that can mean fairly significant server load increases. It turns out I put the last_used in this position to solve this very problem and failed to update my own thinking of the autofs expire policy. So the patch being reverted introduces a regression which should be fixed. Link: http://lkml.kernel.org/r/151174729420.6162.1832622523537052460.stgit@pluto.themaw.net Fixes: 092a53452b ("autofs: take more care to not update last_used on path walk") Signed-off-by: Ian Kent Reviewed-by: NeilBrown Cc: Al Viro Cc: [4.11+] Cc: Colin Walters Cc: David Howells Cc: Ondrej Holy Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- fs/autofs4/root.c | 17 ++++++----------- 1 file changed, 6 insertions(+), 11 deletions(-) (limited to 'fs') diff --git a/fs/autofs4/root.c b/fs/autofs4/root.c index d79ced925861..82e8f6edfb48 100644 --- a/fs/autofs4/root.c +++ b/fs/autofs4/root.c @@ -281,8 +281,8 @@ static int autofs4_mount_wait(const struct path *path, bool rcu_walk) pr_debug("waiting for mount name=%pd\n", path->dentry); status = autofs4_wait(sbi, path, NFY_MOUNT); pr_debug("mount wait done status=%d\n", status); - ino->last_used = jiffies; } + ino->last_used = jiffies; return status; } @@ -321,21 +321,16 @@ static struct dentry *autofs4_mountpoint_changed(struct path *path) */ if (autofs_type_indirect(sbi->type) && d_unhashed(dentry)) { struct dentry *parent = dentry->d_parent; + struct autofs_info *ino; struct dentry *new; new = d_lookup(parent, &dentry->d_name); if (!new) return NULL; - if (new == dentry) - dput(new); - else { - struct autofs_info *ino; - - ino = autofs4_dentry_ino(new); - ino->last_used = jiffies; - dput(path->dentry); - path->dentry = new; - } + ino = autofs4_dentry_ino(new); + ino->last_used = jiffies; + dput(path->dentry); + path->dentry = new; } return path->dentry; } -- cgit v1.2.1 From 5d38f049cee1e1c4a7ac55aa79d37d01ddcc3860 Mon Sep 17 00:00:00 2001 From: Ian Kent Date: Wed, 29 Nov 2017 16:11:26 -0800 Subject: autofs: revert "autofs: fix AT_NO_AUTOMOUNT not being honored" Commit 42f461482178 ("autofs: fix AT_NO_AUTOMOUNT not being honored") allowed the fstatat(2) system call to properly honor the AT_NO_AUTOMOUNT flag but introduced a semantic change. In order to honor AT_NO_AUTOMOUNT a semantic change was made to the negative dentry case for stat family system calls in follow_automount(). This changed the unconditional triggering of an automount in this case to no longer be done and an error returned instead. This has caused more problems than I expected so reverting the change is needed. In a discussion with Neil Brown it was concluded that the automount(8) daemon can implement this change without kernel modifications. So that will be done instead and the autofs module documentation updated with a description of the problem and what needs to be done by module users for this specific case. Link: http://lkml.kernel.org/r/151174730120.6162.3848002191530283984.stgit@pluto.themaw.net Fixes: 42f4614821 ("autofs: fix AT_NO_AUTOMOUNT not being honored") Signed-off-by: Ian Kent Cc: Neil Brown Cc: Al Viro Cc: David Howells Cc: Colin Walters Cc: Ondrej Holy Cc: [4.11+] Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- fs/namei.c | 15 +++------------ 1 file changed, 3 insertions(+), 12 deletions(-) (limited to 'fs') diff --git a/fs/namei.c b/fs/namei.c index f0c7a7b9b6ca..9cc91fb7f156 100644 --- a/fs/namei.c +++ b/fs/namei.c @@ -1129,18 +1129,9 @@ static int follow_automount(struct path *path, struct nameidata *nd, * of the daemon to instantiate them before they can be used. */ if (!(nd->flags & (LOOKUP_PARENT | LOOKUP_DIRECTORY | - LOOKUP_OPEN | LOOKUP_CREATE | - LOOKUP_AUTOMOUNT))) { - /* Positive dentry that isn't meant to trigger an - * automount, EISDIR will allow it to be used, - * otherwise there's no mount here "now" so return - * ENOENT. - */ - if (path->dentry->d_inode) - return -EISDIR; - else - return -ENOENT; - } + LOOKUP_OPEN | LOOKUP_CREATE | LOOKUP_AUTOMOUNT)) && + path->dentry->d_inode) + return -EISDIR; if (path->dentry->d_sb->s_user_ns != &init_user_ns) return -EACCES; -- cgit v1.2.1 From 72639e6df4128dfde8fee7638a35be7db8114000 Mon Sep 17 00:00:00 2001 From: Nadav Amit Date: Wed, 29 Nov 2017 16:11:33 -0800 Subject: fs/hugetlbfs/inode.c: change put_page/unlock_page order in hugetlbfs_fallocate() hugetlfs_fallocate() currently performs put_page() before unlock_page(). This scenario opens a small time window, from the time the page is added to the page cache, until it is unlocked, in which the page might be removed from the page-cache by another core. If the page is removed during this time windows, it might cause a memory corruption, as the wrong page will be unlocked. It is arguable whether this scenario can happen in a real system, and there are several mitigating factors. The issue was found by code inspection (actually grep), and not by actually triggering the flow. Yet, since putting the page before unlocking is incorrect it should be fixed, if only to prevent future breakage or someone copy-pasting this code. Mike said: "I am of the opinion that this does not need to be sent to stable. Although the ordering is current code is incorrect, there is no way for this to be a problem with current locking. In addition, I verified that the perhaps bigger issue with sys_fadvise64(POSIX_FADV_DONTNEED) for hugetlbfs and other filesystems is addressed in 3a77d214807c ("mm: fadvise: avoid fadvise for fs without backing device")" Link: http://lkml.kernel.org/r/20170826191124.51642-1-namit@vmware.com Fixes: 70c3547e36f5c ("hugetlbfs: add hugetlbfs_fallocate()") Signed-off-by: Nadav Amit Reviewed-by: Mike Kravetz Acked-by: Michal Hocko Cc: Eric Biggers Cc: Mike Kravetz Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- fs/hugetlbfs/inode.c | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) (limited to 'fs') diff --git a/fs/hugetlbfs/inode.c b/fs/hugetlbfs/inode.c index 1e76730aac0d..8a85f3f53446 100644 --- a/fs/hugetlbfs/inode.c +++ b/fs/hugetlbfs/inode.c @@ -639,11 +639,11 @@ static long hugetlbfs_fallocate(struct file *file, int mode, loff_t offset, mutex_unlock(&hugetlb_fault_mutex_table[hash]); /* - * page_put due to reference from alloc_huge_page() * unlock_page because locked by add_to_page_cache() + * page_put due to reference from alloc_huge_page() */ - put_page(page); unlock_page(page); + put_page(page); } if (!(mode & FALLOC_FL_KEEP_SIZE) && offset + len > inode->i_size) -- cgit v1.2.1