| field | value | date |
|---|---|---|
| author | Linus Torvalds <torvalds@linux-foundation.org> | 2018-03-22 18:48:43 -0700 |
| committer | Linus Torvalds <torvalds@linux-foundation.org> | 2018-03-22 18:48:43 -0700 |
| commit | f36b7534b83357cf52e747905de6d65b4f7c2512 | |
| tree | ca52ebdc4aaa738bd464b22a06ed034e41c46acb | |
| parent | 8401c72c593d2be8607d2a0a4551ee5c867d6f2f | |
| parent | 9d3c3354bb85bab4d865fe95039443f09a4c8394 | |

Merge branch 'akpm' (patches from Andrew)
Merge misc fixes from Andrew Morton:
 "13 fixes"
* emailed patches from Andrew Morton <akpm@linux-foundation.org>:
  mm, thp: do not cause memcg oom for thp
  mm/vmscan: wake up flushers for legacy cgroups too
  Revert "mm: page_alloc: skip over regions of invalid pfns where possible"
  mm/shmem: do not wait for lock_page() in shmem_unused_huge_shrink()
  mm/thp: do not wait for lock_page() in deferred_split_scan()
  mm/khugepaged.c: convert VM_BUG_ON() to collapse fail
  x86/mm: implement free pmd/pte page interfaces
  mm/vmalloc: add interfaces to free unmapped page table
  h8300: remove extraneous __BIG_ENDIAN definition
  hugetlbfs: check for pgoff value overflow
  lockdep: fix fs_reclaim warning
  MAINTAINERS: update Mark Fasheh's e-mail
  mm/mempolicy.c: avoid use uninitialized preferred_node
| mode | file | lines changed |
|---|---|---|
| -rw-r--r-- | MAINTAINERS | 2 |
| -rw-r--r-- | arch/arm64/mm/mmu.c | 10 |
| -rw-r--r-- | arch/h8300/include/asm/byteorder.h | 1 |
| -rw-r--r-- | arch/x86/mm/pgtable.c | 48 |
| -rw-r--r-- | fs/hugetlbfs/inode.c | 17 |
| -rw-r--r-- | include/asm-generic/pgtable.h | 10 |
| -rw-r--r-- | include/linux/memblock.h | 1 |
| -rw-r--r-- | lib/ioremap.c | 6 |
| -rw-r--r-- | mm/huge_memory.c | 9 |
| -rw-r--r-- | mm/hugetlb.c | 7 |
| -rw-r--r-- | mm/khugepaged.c | 15 |
| -rw-r--r-- | mm/memblock.c | 28 |
| -rw-r--r-- | mm/mempolicy.c | 3 |
| -rw-r--r-- | mm/page_alloc.c | 13 |
| -rw-r--r-- | mm/shmem.c | 31 |
| -rw-r--r-- | mm/vmscan.c | 31 |

16 files changed, 153 insertions, 79 deletions
diff --git a/MAINTAINERS b/MAINTAINERS
index 4e62756936fa..73c0cdabf755 100644
--- a/MAINTAINERS
+++ b/MAINTAINERS
@@ -10334,7 +10334,7 @@ F:	drivers/oprofile/
 F:	include/linux/oprofile.h
 
 ORACLE CLUSTER FILESYSTEM 2 (OCFS2)
-M:	Mark Fasheh <mfasheh@versity.com>
+M:	Mark Fasheh <mark@fasheh.com>
 M:	Joel Becker <jlbec@evilplan.org>
 L:	ocfs2-devel@oss.oracle.com (moderated for non-subscribers)
 W:	http://ocfs2.wiki.kernel.org
diff --git a/arch/arm64/mm/mmu.c b/arch/arm64/mm/mmu.c
index 8c704f1e53c2..2dbb2c9f1ec1 100644
--- a/arch/arm64/mm/mmu.c
+++ b/arch/arm64/mm/mmu.c
@@ -972,3 +972,13 @@ int pmd_clear_huge(pmd_t *pmdp)
 	pmd_clear(pmdp);
 	return 1;
 }
+
+int pud_free_pmd_page(pud_t *pud)
+{
+	return pud_none(*pud);
+}
+
+int pmd_free_pte_page(pmd_t *pmd)
+{
+	return pmd_none(*pmd);
+}
diff --git a/arch/h8300/include/asm/byteorder.h b/arch/h8300/include/asm/byteorder.h
index ecff2d1ca5a3..6eaa7ad5fc2c 100644
--- a/arch/h8300/include/asm/byteorder.h
+++ b/arch/h8300/include/asm/byteorder.h
@@ -2,7 +2,6 @@
 #ifndef __H8300_BYTEORDER_H__
 #define __H8300_BYTEORDER_H__
 
-#define __BIG_ENDIAN __ORDER_BIG_ENDIAN__
 #include <linux/byteorder/big_endian.h>
 
 #endif
diff --git a/arch/x86/mm/pgtable.c b/arch/x86/mm/pgtable.c
index 004abf9ebf12..34cda7e0551b 100644
--- a/arch/x86/mm/pgtable.c
+++ b/arch/x86/mm/pgtable.c
@@ -702,4 +702,52 @@ int pmd_clear_huge(pmd_t *pmd)
 
 	return 0;
 }
+
+/**
+ * pud_free_pmd_page - Clear pud entry and free pmd page.
+ * @pud: Pointer to a PUD.
+ *
+ * Context: The pud range has been unmaped and TLB purged.
+ * Return: 1 if clearing the entry succeeded. 0 otherwise.
+ */
+int pud_free_pmd_page(pud_t *pud)
+{
+	pmd_t *pmd;
+	int i;
+
+	if (pud_none(*pud))
+		return 1;
+
+	pmd = (pmd_t *)pud_page_vaddr(*pud);
+
+	for (i = 0; i < PTRS_PER_PMD; i++)
+		if (!pmd_free_pte_page(&pmd[i]))
+			return 0;
+
+	pud_clear(pud);
+	free_page((unsigned long)pmd);
+
+	return 1;
+}
+
+/**
+ * pmd_free_pte_page - Clear pmd entry and free pte page.
+ * @pmd: Pointer to a PMD.
+ *
+ * Context: The pmd range has been unmaped and TLB purged.
+ * Return: 1 if clearing the entry succeeded. 0 otherwise.
+ */
+int pmd_free_pte_page(pmd_t *pmd)
+{
+	pte_t *pte;
+
+	if (pmd_none(*pmd))
+		return 1;
+
+	pte = (pte_t *)pmd_page_vaddr(*pmd);
+	pmd_clear(pmd);
+	free_page((unsigned long)pte);
+
+	return 1;
+}
 #endif	/* CONFIG_HAVE_ARCH_HUGE_VMAP */
diff --git a/fs/hugetlbfs/inode.c b/fs/hugetlbfs/inode.c
index 8fe1b0aa2896..b9a254dcc0e7 100644
--- a/fs/hugetlbfs/inode.c
+++ b/fs/hugetlbfs/inode.c
@@ -108,6 +108,16 @@ static void huge_pagevec_release(struct pagevec *pvec)
 	pagevec_reinit(pvec);
 }
 
+/*
+ * Mask used when checking the page offset value passed in via system
+ * calls.  This value will be converted to a loff_t which is signed.
+ * Therefore, we want to check the upper PAGE_SHIFT + 1 bits of the
+ * value.  The extra bit (- 1 in the shift value) is to take the sign
+ * bit into account.
+ */
+#define PGOFF_LOFFT_MAX \
+	(((1UL << (PAGE_SHIFT + 1)) - 1) <<  (BITS_PER_LONG - (PAGE_SHIFT + 1)))
+
 static int hugetlbfs_file_mmap(struct file *file, struct vm_area_struct *vma)
 {
 	struct inode *inode = file_inode(file);
@@ -127,12 +137,13 @@ static int hugetlbfs_file_mmap(struct file *file, struct vm_area_struct *vma)
 	vma->vm_ops = &hugetlb_vm_ops;
 
 	/*
-	 * Offset passed to mmap (before page shift) could have been
-	 * negative when represented as a (l)off_t.
+	 * page based offset in vm_pgoff could be sufficiently large to
+	 * overflow a (l)off_t when converted to byte offset.
 	 */
-	if (((loff_t)vma->vm_pgoff << PAGE_SHIFT) < 0)
+	if (vma->vm_pgoff & PGOFF_LOFFT_MAX)
 		return -EINVAL;
 
+	/* must be huge page aligned */
 	if (vma->vm_pgoff & (~huge_page_mask(h) >> PAGE_SHIFT))
 		return -EINVAL;
 
diff --git a/include/asm-generic/pgtable.h b/include/asm-generic/pgtable.h
index 2cfa3075d148..bfbb44a5ad38 100644
--- a/include/asm-generic/pgtable.h
+++ b/include/asm-generic/pgtable.h
@@ -983,6 +983,8 @@ int pud_set_huge(pud_t *pud, phys_addr_t addr, pgprot_t prot);
 int pmd_set_huge(pmd_t *pmd, phys_addr_t addr, pgprot_t prot);
 int pud_clear_huge(pud_t *pud);
 int pmd_clear_huge(pmd_t *pmd);
+int pud_free_pmd_page(pud_t *pud);
+int pmd_free_pte_page(pmd_t *pmd);
 #else	/* !CONFIG_HAVE_ARCH_HUGE_VMAP */
 static inline int p4d_set_huge(p4d_t *p4d, phys_addr_t addr, pgprot_t prot)
 {
@@ -1008,6 +1010,14 @@ static inline int pmd_clear_huge(pmd_t *pmd)
 {
 	return 0;
 }
+static inline int pud_free_pmd_page(pud_t *pud)
+{
+	return 0;
+}
+static inline int pmd_free_pte_page(pmd_t *pmd)
+{
+	return 0;
+}
 #endif	/* CONFIG_HAVE_ARCH_HUGE_VMAP */
 
 #ifndef __HAVE_ARCH_FLUSH_PMD_TLB_RANGE
diff --git a/include/linux/memblock.h b/include/linux/memblock.h
index 8be5077efb5f..f92ea7783652 100644
--- a/include/linux/memblock.h
+++ b/include/linux/memblock.h
@@ -187,7 +187,6 @@ int memblock_search_pfn_nid(unsigned long pfn, unsigned long *start_pfn,
 			    unsigned long  *end_pfn);
 void __next_mem_pfn_range(int *idx, int nid, unsigned long *out_start_pfn,
 			  unsigned long *out_end_pfn, int *out_nid);
-unsigned long memblock_next_valid_pfn(unsigned long pfn, unsigned long max_pfn);
 
 /**
  * for_each_mem_pfn_range - early memory pfn range iterator
diff --git a/lib/ioremap.c b/lib/ioremap.c
index b808a390e4c3..54e5bbaa3200 100644
--- a/lib/ioremap.c
+++ b/lib/ioremap.c
@@ -91,7 +91,8 @@ static inline int ioremap_pmd_range(pud_t *pud, unsigned long addr,
 
 		if (ioremap_pmd_enabled() &&
 		    ((next - addr) == PMD_SIZE) &&
-		    IS_ALIGNED(phys_addr + addr, PMD_SIZE)) {
+		    IS_ALIGNED(phys_addr + addr, PMD_SIZE) &&
+		    pmd_free_pte_page(pmd)) {
 			if (pmd_set_huge(pmd, phys_addr + addr, prot))
 				continue;
 		}
@@ -117,7 +118,8 @@ static inline int ioremap_pud_range(p4d_t *p4d, unsigned long addr,
 
 		if (ioremap_pud_enabled() &&
 		    ((next - addr) == PUD_SIZE) &&
-		    IS_ALIGNED(phys_addr + addr, PUD_SIZE)) {
+		    IS_ALIGNED(phys_addr + addr, PUD_SIZE) &&
+		    pud_free_pmd_page(pud)) {
 			if (pud_set_huge(pud, phys_addr + addr, prot))
 				continue;
 		}
diff --git a/mm/huge_memory.c b/mm/huge_memory.c
index 87ab9b8f56b5..5a68730eebd6 100644
--- a/mm/huge_memory.c
+++ b/mm/huge_memory.c
@@ -555,7 +555,8 @@ static int __do_huge_pmd_anonymous_page(struct vm_fault *vmf, struct page *page,
 
 	VM_BUG_ON_PAGE(!PageCompound(page), page);
 
-	if (mem_cgroup_try_charge(page, vma->vm_mm, gfp, &memcg, true)) {
+	if (mem_cgroup_try_charge(page, vma->vm_mm, gfp | __GFP_NORETRY, &memcg,
+				  true)) {
 		put_page(page);
 		count_vm_event(THP_FAULT_FALLBACK);
 		return VM_FAULT_FALLBACK;
@@ -1316,7 +1317,7 @@ alloc:
 	}
 
 	if (unlikely(mem_cgroup_try_charge(new_page, vma->vm_mm,
-					huge_gfp, &memcg, true))) {
+				huge_gfp | __GFP_NORETRY, &memcg, true))) {
 		put_page(new_page);
 		split_huge_pmd(vma, vmf->pmd, vmf->address);
 		if (page)
@@ -2783,11 +2784,13 @@ static unsigned long deferred_split_scan(struct shrinker *shrink,
 
 	list_for_each_safe(pos, next, &list) {
 		page = list_entry((void *)pos, struct page, mapping);
-		lock_page(page);
+		if (!trylock_page(page))
+			goto next;
 		/* split_huge_page() removes page from list on success */
 		if (!split_huge_page(page))
 			split++;
 		unlock_page(page);
+next:
 		put_page(page);
 	}
 
diff --git a/mm/hugetlb.c b/mm/hugetlb.c
index a963f2034dfc..976bbc5646fe 100644
--- a/mm/hugetlb.c
+++ b/mm/hugetlb.c
@@ -18,6 +18,7 @@
 #include <linux/bootmem.h>
 #include <linux/sysfs.h>
 #include <linux/slab.h>
+#include <linux/mmdebug.h>
 #include <linux/sched/signal.h>
 #include <linux/rmap.h>
 #include <linux/string_helpers.h>
@@ -4374,6 +4375,12 @@ int hugetlb_reserve_pages(struct inode *inode,
 	struct resv_map *resv_map;
 	long gbl_reserve;
 
+	/* This should never happen */
+	if (from > to) {
+		VM_WARN(1, "%s called with a negative range\n", __func__);
+		return -EINVAL;
+	}
+
 	/*
 	 * Only apply hugepage reservation if asked. At fault time, an
 	 * attempt will be made for VM_NORESERVE to allocate a page
diff --git a/mm/khugepaged.c b/mm/khugepaged.c
index b7e2268dfc9a..e42568284e06 100644
--- a/mm/khugepaged.c
+++ b/mm/khugepaged.c
@@ -530,7 +530,12 @@ static int __collapse_huge_page_isolate(struct vm_area_struct *vma,
 			goto out;
 		}
 
-		VM_BUG_ON_PAGE(PageCompound(page), page);
+		/* TODO: teach khugepaged to collapse THP mapped with pte */
+		if (PageCompound(page)) {
+			result = SCAN_PAGE_COMPOUND;
+			goto out;
+		}
+
 		VM_BUG_ON_PAGE(!PageAnon(page), page);
 
 		/*
@@ -960,7 +965,9 @@ static void collapse_huge_page(struct mm_struct *mm,
 		goto out_nolock;
 	}
 
-	if (unlikely(mem_cgroup_try_charge(new_page, mm, gfp, &memcg, true))) {
+	/* Do not oom kill for khugepaged charges */
+	if (unlikely(mem_cgroup_try_charge(new_page, mm, gfp | __GFP_NORETRY,
+					   &memcg, true))) {
 		result = SCAN_CGROUP_CHARGE_FAIL;
 		goto out_nolock;
 	}
@@ -1319,7 +1326,9 @@ static void collapse_shmem(struct mm_struct *mm,
 		goto out;
 	}
 
-	if (unlikely(mem_cgroup_try_charge(new_page, mm, gfp, &memcg, true))) {
+	/* Do not oom kill for khugepaged charges */
+	if (unlikely(mem_cgroup_try_charge(new_page, mm, gfp | __GFP_NORETRY,
+					   &memcg, true))) {
 		result = SCAN_CGROUP_CHARGE_FAIL;
 		goto out;
 	}
diff --git a/mm/memblock.c b/mm/memblock.c
index b6ba6b7adadc..48376bd33274 100644
--- a/mm/memblock.c
+++ b/mm/memblock.c
@@ -1101,34 +1101,6 @@ void __init_memblock __next_mem_pfn_range(int *idx, int nid,
 		*out_nid = r->nid;
 }
 
-unsigned long __init_memblock memblock_next_valid_pfn(unsigned long pfn,
-						      unsigned long max_pfn)
-{
-	struct memblock_type *type = &memblock.memory;
-	unsigned int right = type->cnt;
-	unsigned int mid, left = 0;
-	phys_addr_t addr = PFN_PHYS(++pfn);
-
-	do {
-		mid = (right + left) / 2;
-
-		if (addr < type->regions[mid].base)
-			right = mid;
-		else if (addr >= (type->regions[mid].base +
-				  type->regions[mid].size))
-			left = mid + 1;
-		else {
-			/* addr is within the region, so pfn is valid */
-			return pfn;
-		}
-	} while (left < right);
-
-	if (right == type->cnt)
-		return -1UL;
-	else
-		return PHYS_PFN(type->regions[right].base);
-}
-
 /**
  * memblock_set_node - set node ID on memblock regions
  * @base: base of area to set node ID for
diff --git a/mm/mempolicy.c b/mm/mempolicy.c
index d879f1d8a44a..32cba0332787 100644
--- a/mm/mempolicy.c
+++ b/mm/mempolicy.c
@@ -2124,6 +2124,9 @@ bool __mpol_equal(struct mempolicy *a, struct mempolicy *b)
 	case MPOL_INTERLEAVE:
 		return !!nodes_equal(a->v.nodes, b->v.nodes);
 	case MPOL_PREFERRED:
+		/* a's ->flags is the same as b's */
+		if (a->flags & MPOL_F_LOCAL)
+			return true;
 		return a->v.preferred_node == b->v.preferred_node;
 	default:
 		BUG();
diff --git a/mm/page_alloc.c b/mm/page_alloc.c
index 635d7dd29d7f..1741dd23e7c1 100644
--- a/mm/page_alloc.c
+++ b/mm/page_alloc.c
@@ -3596,7 +3596,7 @@ static bool __need_fs_reclaim(gfp_t gfp_mask)
 		return false;
 
 	/* this guy won't enter reclaim */
-	if ((current->flags & PF_MEMALLOC) && !(gfp_mask & __GFP_NOMEMALLOC))
+	if (current->flags & PF_MEMALLOC)
 		return false;
 
 	/* We're only interested __GFP_FS allocations for now */
@@ -5356,17 +5356,8 @@ void __meminit memmap_init_zone(unsigned long size, int nid, unsigned long zone,
 		if (context != MEMMAP_EARLY)
 			goto not_early;
 
-		if (!early_pfn_valid(pfn)) {
-#ifdef CONFIG_HAVE_MEMBLOCK_NODE_MAP
-			/*
-			 * Skip to the pfn preceding the next valid one (or
-			 * end_pfn), such that we hit a valid pfn (or end_pfn)
-			 * on our next iteration of the loop.
-			 */
-			pfn = memblock_next_valid_pfn(pfn, end_pfn) - 1;
-#endif
+		if (!early_pfn_valid(pfn))
 			continue;
-		}
 		if (!early_pfn_in_nid(pfn, nid))
 			continue;
 		if (!update_defer_init(pgdat, pfn, end_pfn, &nr_initialised))
diff --git a/mm/shmem.c b/mm/shmem.c
index 1907688b75ee..b85919243399 100644
--- a/mm/shmem.c
+++ b/mm/shmem.c
@@ -493,36 +493,45 @@ next:
 		info = list_entry(pos, struct shmem_inode_info, shrinklist);
 		inode = &info->vfs_inode;
 
-		if (nr_to_split && split >= nr_to_split) {
-			iput(inode);
-			continue;
-		}
+		if (nr_to_split && split >= nr_to_split)
+			goto leave;
 
-		page = find_lock_page(inode->i_mapping,
+		page = find_get_page(inode->i_mapping,
 				(inode->i_size & HPAGE_PMD_MASK) >> PAGE_SHIFT);
 		if (!page)
 			goto drop;
 
+		/* No huge page at the end of the file: nothing to split */
 		if (!PageTransHuge(page)) {
-			unlock_page(page);
 			put_page(page);
 			goto drop;
 		}
 
+		/*
+		 * Leave the inode on the list if we failed to lock
+		 * the page at this time.
+		 *
+		 * Waiting for the lock may lead to deadlock in the
+		 * reclaim path.
+		 */
+		if (!trylock_page(page)) {
+			put_page(page);
+			goto leave;
+		}
+
 		ret = split_huge_page(page);
 		unlock_page(page);
 		put_page(page);
 
-		if (ret) {
-			/* split failed: leave it on the list */
-			iput(inode);
-			continue;
-		}
+		/* If split failed leave the inode on the list */
+		if (ret)
+			goto leave;
 
 		split++;
 drop:
 		list_del_init(&info->shrinklist);
 		removed++;
+leave:
 		iput(inode);
 	}
 
diff --git a/mm/vmscan.c b/mm/vmscan.c
index bee53495a829..cd5dc3faaa57 100644
--- a/mm/vmscan.c
+++ b/mm/vmscan.c
@@ -1780,6 +1780,20 @@ shrink_inactive_list(unsigned long nr_to_scan, struct lruvec *lruvec,
 		set_bit(PGDAT_WRITEBACK, &pgdat->flags);
 
 	/*
+	 * If dirty pages are scanned that are not queued for IO, it
+	 * implies that flushers are not doing their job. This can
+	 * happen when memory pressure pushes dirty pages to the end of
+	 * the LRU before the dirty limits are breached and the dirty
+	 * data has expired. It can also happen when the proportion of
+	 * dirty pages grows not through writes but through memory
+	 * pressure reclaiming all the clean cache. And in some cases,
+	 * the flushers simply cannot keep up with the allocation
+	 * rate. Nudge the flusher threads in case they are asleep.
+	 */
+	if (stat.nr_unqueued_dirty == nr_taken)
+		wakeup_flusher_threads(WB_REASON_VMSCAN);
+
+	/*
 	 * Legacy memcg will stall in page writeback so avoid forcibly
 	 * stalling here.
 	 */
@@ -1791,22 +1805,9 @@ shrink_inactive_list(unsigned long nr_to_scan, struct lruvec *lruvec,
 		if (stat.nr_dirty && stat.nr_dirty == stat.nr_congested)
 			set_bit(PGDAT_CONGESTED, &pgdat->flags);
 
-		/*
-		 * If dirty pages are scanned that are not queued for IO, it
-		 * implies that flushers are not doing their job. This can
-		 * happen when memory pressure pushes dirty pages to the end of
-		 * the LRU before the dirty limits are breached and the dirty
-		 * data has expired. It can also happen when the proportion of
-		 * dirty pages grows not through writes but through memory
-		 * pressure reclaiming all the clean cache. And in some cases,
-		 * the flushers simply cannot keep up with the allocation
-		 * rate. Nudge the flusher threads in case they are asleep, but
-		 * also allow kswapd to start writing pages during reclaim.
-		 */
-		if (stat.nr_unqueued_dirty == nr_taken) {
-			wakeup_flusher_threads(WB_REASON_VMSCAN);
+		/* Allow kswapd to start writing pages during reclaim. */
+		if (stat.nr_unqueued_dirty == nr_taken)
 			set_bit(PGDAT_DIRTY, &pgdat->flags);
-		}
 
 		/*
 		 * If kswapd scans pages marked marked for immediate
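
A note on the hugetlbfs change above: the old `((loff_t)vma->vm_pgoff << PAGE_SHIFT) < 0` test only catches page offsets whose shifted value lands on the sign bit; an offset large enough to shift the set bits out of the 64-bit value entirely ends up, in practice, as a small non-negative number and slips through. The sketch below is a minimal userspace illustration of the new PGOFF_LOFFT_MAX mask, assuming a 64-bit build with 4 KiB pages; the PAGE_SHIFT/BITS_PER_LONG constants and the sample offsets are illustrative assumptions, not taken from the patch beyond the mask expression itself.

```c
#include <stdio.h>

/* Assumed values for a typical 64-bit build with 4 KiB pages. */
#define PAGE_SHIFT	12
#define BITS_PER_LONG	64

/*
 * Same construction as the PGOFF_LOFFT_MAX mask added to
 * hugetlbfs_file_mmap() above: the top PAGE_SHIFT + 1 bits of a page
 * offset, i.e. every bit that would reach or pass the sign bit once
 * the offset is converted to a byte offset.
 */
#define PGOFF_LOFFT_MAX \
	(((1UL << (PAGE_SHIFT + 1)) - 1) << (BITS_PER_LONG - (PAGE_SHIFT + 1)))

static void check(unsigned long pgoff)
{
	/* Roughly what the old check computed: the sign of the shifted offset. */
	long long byte_off = (long long)(pgoff << PAGE_SHIFT);

	printf("pgoff=%#018lx  old check: %s  new check: %s\n",
	       pgoff,
	       byte_off < 0 ? "rejects" : "accepts",
	       (pgoff & PGOFF_LOFFT_MAX) ? "rejects" : "accepts");
}

int main(void)
{
	check(0x1000UL);                                /* ordinary offset: both accept */
	check(1UL << (BITS_PER_LONG - PAGE_SHIFT - 1)); /* lands on the sign bit: both reject */
	check(1UL << (BITS_PER_LONG - PAGE_SHIFT));     /* shifts out entirely: only the mask rejects */
	return 0;
}
```

The last case is the one the old sign test missed, which is exactly the overflow the mask-based check closes off.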

