Diffstat (limited to 'mm')
-rw-r--r-- | mm/Kconfig | 2
-rw-r--r-- | mm/cma.c | 3
-rw-r--r-- | mm/filemap.c | 7
-rw-r--r-- | mm/huge_memory.c | 14
-rw-r--r-- | mm/hugetlb.c | 66
-rw-r--r-- | mm/kasan/kasan.c | 19
-rw-r--r-- | mm/kasan/kasan.h | 4
-rw-r--r-- | mm/kasan/report.c | 3
-rw-r--r-- | mm/khugepaged.c | 2
-rw-r--r-- | mm/kmemleak.c | 8
-rw-r--r-- | mm/list_lru.c | 2
-rw-r--r-- | mm/memcontrol.c | 9
-rw-r--r-- | mm/memory-failure.c | 12
-rw-r--r-- | mm/memory.c | 8
-rw-r--r-- | mm/memory_hotplug.c | 29
-rw-r--r-- | mm/mlock.c | 7
-rw-r--r-- | mm/mremap.c | 34
-rw-r--r-- | mm/page_alloc.c | 139
-rw-r--r-- | mm/shmem.c | 17
-rw-r--r-- | mm/slab.c | 45
-rw-r--r-- | mm/slab.h | 1
-rw-r--r-- | mm/slab_common.c | 4
-rw-r--r-- | mm/swapfile.c | 2
-rw-r--r-- | mm/truncate.c | 8
-rw-r--r-- | mm/vmscan.c | 4
-rw-r--r-- | mm/workingset.c | 2
26 files changed, 242 insertions, 209 deletions
diff --git a/mm/Kconfig b/mm/Kconfig index be0ee11fa0d9..86e3e0e74d20 100644 --- a/mm/Kconfig +++ b/mm/Kconfig @@ -187,7 +187,7 @@ config MEMORY_HOTPLUG bool "Allow for memory hot-add" depends on SPARSEMEM || X86_64_ACPI_NUMA depends on ARCH_ENABLE_MEMORY_HOTPLUG - depends on !KASAN + depends on COMPILE_TEST || !KASAN config MEMORY_HOTPLUG_SPARSE def_bool y @@ -385,6 +385,9 @@ struct page *cma_alloc(struct cma *cma, size_t count, unsigned int align) bitmap_maxno = cma_bitmap_maxno(cma); bitmap_count = cma_bitmap_pages_to_bits(cma, count); + if (bitmap_count > bitmap_maxno) + return NULL; + for (;;) { mutex_lock(&cma->lock); bitmap_no = bitmap_find_next_zero_area_off(cma->bitmap, diff --git a/mm/filemap.c b/mm/filemap.c index 849f459ad078..50b52fe51937 100644 --- a/mm/filemap.c +++ b/mm/filemap.c @@ -790,9 +790,7 @@ EXPORT_SYMBOL(__page_cache_alloc); */ wait_queue_head_t *page_waitqueue(struct page *page) { - const struct zone *zone = page_zone(page); - - return &zone->wait_table[hash_ptr(page, zone->wait_table_bits)]; + return bit_waitqueue(page, 0); } EXPORT_SYMBOL(page_waitqueue); @@ -1734,6 +1732,9 @@ find_page: if (inode->i_blkbits == PAGE_SHIFT || !mapping->a_ops->is_partially_uptodate) goto page_not_up_to_date; + /* pipes can't handle partially uptodate pages */ + if (unlikely(iter->type & ITER_PIPE)) + goto page_not_up_to_date; if (!trylock_page(page)) goto page_not_up_to_date; /* Did it get truncated before we got the lock? */ diff --git a/mm/huge_memory.c b/mm/huge_memory.c index cdcd25cb30fe..f8e35cc66d32 100644 --- a/mm/huge_memory.c +++ b/mm/huge_memory.c @@ -737,8 +737,9 @@ int vmf_insert_pfn_pmd(struct vm_area_struct *vma, unsigned long addr, if (addr < vma->vm_start || addr >= vma->vm_end) return VM_FAULT_SIGBUS; - if (track_pfn_insert(vma, &pgprot, pfn)) - return VM_FAULT_SIGBUS; + + track_pfn_insert(vma, &pgprot, pfn); + insert_pfn_pmd(vma, addr, pmd, pfn, pgprot, write); return VM_FAULT_NOPAGE; } @@ -1426,11 +1427,12 @@ int zap_huge_pmd(struct mmu_gather *tlb, struct vm_area_struct *vma, bool move_huge_pmd(struct vm_area_struct *vma, unsigned long old_addr, unsigned long new_addr, unsigned long old_end, - pmd_t *old_pmd, pmd_t *new_pmd) + pmd_t *old_pmd, pmd_t *new_pmd, bool *need_flush) { spinlock_t *old_ptl, *new_ptl; pmd_t pmd; struct mm_struct *mm = vma->vm_mm; + bool force_flush = false; if ((old_addr & ~HPAGE_PMD_MASK) || (new_addr & ~HPAGE_PMD_MASK) || @@ -1456,6 +1458,8 @@ bool move_huge_pmd(struct vm_area_struct *vma, unsigned long old_addr, if (new_ptl != old_ptl) spin_lock_nested(new_ptl, SINGLE_DEPTH_NESTING); pmd = pmdp_huge_get_and_clear(mm, old_addr, old_pmd); + if (pmd_present(pmd) && pmd_dirty(pmd)) + force_flush = true; VM_BUG_ON(!pmd_none(*new_pmd)); if (pmd_move_must_withdraw(new_ptl, old_ptl) && @@ -1467,6 +1471,10 @@ bool move_huge_pmd(struct vm_area_struct *vma, unsigned long old_addr, set_pmd_at(mm, new_addr, new_pmd, pmd_mksoft_dirty(pmd)); if (new_ptl != old_ptl) spin_unlock(new_ptl); + if (force_flush) + flush_tlb_range(vma, old_addr, old_addr + PMD_SIZE); + else + *need_flush = true; spin_unlock(old_ptl); return true; } diff --git a/mm/hugetlb.c b/mm/hugetlb.c index ec49d9ef1eef..418bf01a50ed 100644 --- a/mm/hugetlb.c +++ b/mm/hugetlb.c @@ -1826,11 +1826,17 @@ static void return_unused_surplus_pages(struct hstate *h, * is not the case is if a reserve map was changed between calls. It * is the responsibility of the caller to notice the difference and * take appropriate action. 
+ * + * vma_add_reservation is used in error paths where a reservation must + * be restored when a newly allocated huge page must be freed. It is + * to be called after calling vma_needs_reservation to determine if a + * reservation exists. */ enum vma_resv_mode { VMA_NEEDS_RESV, VMA_COMMIT_RESV, VMA_END_RESV, + VMA_ADD_RESV, }; static long __vma_reservation_common(struct hstate *h, struct vm_area_struct *vma, unsigned long addr, @@ -1856,6 +1862,14 @@ static long __vma_reservation_common(struct hstate *h, region_abort(resv, idx, idx + 1); ret = 0; break; + case VMA_ADD_RESV: + if (vma->vm_flags & VM_MAYSHARE) + ret = region_add(resv, idx, idx + 1); + else { + region_abort(resv, idx, idx + 1); + ret = region_del(resv, idx, idx + 1); + } + break; default: BUG(); } @@ -1903,6 +1917,56 @@ static void vma_end_reservation(struct hstate *h, (void)__vma_reservation_common(h, vma, addr, VMA_END_RESV); } +static long vma_add_reservation(struct hstate *h, + struct vm_area_struct *vma, unsigned long addr) +{ + return __vma_reservation_common(h, vma, addr, VMA_ADD_RESV); +} + +/* + * This routine is called to restore a reservation on error paths. In the + * specific error paths, a huge page was allocated (via alloc_huge_page) + * and is about to be freed. If a reservation for the page existed, + * alloc_huge_page would have consumed the reservation and set PagePrivate + * in the newly allocated page. When the page is freed via free_huge_page, + * the global reservation count will be incremented if PagePrivate is set. + * However, free_huge_page can not adjust the reserve map. Adjust the + * reserve map here to be consistent with global reserve count adjustments + * to be made by free_huge_page. + */ +static void restore_reserve_on_error(struct hstate *h, + struct vm_area_struct *vma, unsigned long address, + struct page *page) +{ + if (unlikely(PagePrivate(page))) { + long rc = vma_needs_reservation(h, vma, address); + + if (unlikely(rc < 0)) { + /* + * Rare out of memory condition in reserve map + * manipulation. Clear PagePrivate so that + * global reserve count will not be incremented + * by free_huge_page. This will make it appear + * as though the reservation for this page was + * consumed. This may prevent the task from + * faulting in the page at a later time. This + * is better than inconsistent global huge page + * accounting of reserve counts. + */ + ClearPagePrivate(page); + } else if (rc) { + rc = vma_add_reservation(h, vma, address); + if (unlikely(rc < 0)) + /* + * See above comment about rare out of + * memory condition. + */ + ClearPagePrivate(page); + } else + vma_end_reservation(h, vma, address); + } +} + struct page *alloc_huge_page(struct vm_area_struct *vma, unsigned long addr, int avoid_reserve) { @@ -3498,6 +3562,7 @@ retry_avoidcopy: spin_unlock(ptl); mmu_notifier_invalidate_range_end(mm, mmun_start, mmun_end); out_release_all: + restore_reserve_on_error(h, vma, address, new_page); put_page(new_page); out_release_old: put_page(old_page); @@ -3680,6 +3745,7 @@ backout: spin_unlock(ptl); backout_unlocked: unlock_page(page); + restore_reserve_on_error(h, vma, address, page); put_page(page); goto out; } diff --git a/mm/kasan/kasan.c b/mm/kasan/kasan.c index 70c009741aab..0e9505f66ec1 100644 --- a/mm/kasan/kasan.c +++ b/mm/kasan/kasan.c @@ -764,6 +764,25 @@ EXPORT_SYMBOL(__asan_storeN_noabort); void __asan_handle_no_return(void) {} EXPORT_SYMBOL(__asan_handle_no_return); +/* Emitted by compiler to poison large objects when they go out of scope. 
*/ +void __asan_poison_stack_memory(const void *addr, size_t size) +{ + /* + * Addr is KASAN_SHADOW_SCALE_SIZE-aligned and the object is surrounded + * by redzones, so we simply round up size to simplify logic. + */ + kasan_poison_shadow(addr, round_up(size, KASAN_SHADOW_SCALE_SIZE), + KASAN_USE_AFTER_SCOPE); +} +EXPORT_SYMBOL(__asan_poison_stack_memory); + +/* Emitted by compiler to unpoison large objects when they go into scope. */ +void __asan_unpoison_stack_memory(const void *addr, size_t size) +{ + kasan_unpoison_shadow(addr, size); +} +EXPORT_SYMBOL(__asan_unpoison_stack_memory); + #ifdef CONFIG_MEMORY_HOTPLUG static int kasan_mem_notifier(struct notifier_block *nb, unsigned long action, void *data) diff --git a/mm/kasan/kasan.h b/mm/kasan/kasan.h index e5c2181fee6f..1c260e6b3b3c 100644 --- a/mm/kasan/kasan.h +++ b/mm/kasan/kasan.h @@ -21,6 +21,7 @@ #define KASAN_STACK_MID 0xF2 #define KASAN_STACK_RIGHT 0xF3 #define KASAN_STACK_PARTIAL 0xF4 +#define KASAN_USE_AFTER_SCOPE 0xF8 /* Don't break randconfig/all*config builds */ #ifndef KASAN_ABI_VERSION @@ -53,6 +54,9 @@ struct kasan_global { #if KASAN_ABI_VERSION >= 4 struct kasan_source_location *location; #endif +#if KASAN_ABI_VERSION >= 5 + char *odr_indicator; +#endif }; /** diff --git a/mm/kasan/report.c b/mm/kasan/report.c index 24c1211fe9d5..073325aedc68 100644 --- a/mm/kasan/report.c +++ b/mm/kasan/report.c @@ -90,6 +90,9 @@ static void print_error_description(struct kasan_access_info *info) case KASAN_KMALLOC_FREE: bug_type = "use-after-free"; break; + case KASAN_USE_AFTER_SCOPE: + bug_type = "use-after-scope"; + break; } pr_err("BUG: KASAN: %s in %pS at addr %p\n", diff --git a/mm/khugepaged.c b/mm/khugepaged.c index 728d7790dc2d..87e1a7ca3846 100644 --- a/mm/khugepaged.c +++ b/mm/khugepaged.c @@ -103,6 +103,7 @@ static struct khugepaged_scan khugepaged_scan = { .mm_head = LIST_HEAD_INIT(khugepaged_scan.mm_head), }; +#ifdef CONFIG_SYSFS static ssize_t scan_sleep_millisecs_show(struct kobject *kobj, struct kobj_attribute *attr, char *buf) @@ -295,6 +296,7 @@ struct attribute_group khugepaged_attr_group = { .attrs = khugepaged_attr, .name = "khugepaged", }; +#endif /* CONFIG_SYSFS */ #define VM_NO_KHUGEPAGED (VM_SPECIAL | VM_HUGETLB) diff --git a/mm/kmemleak.c b/mm/kmemleak.c index a5e453cf05c4..d1380ed93fdf 100644 --- a/mm/kmemleak.c +++ b/mm/kmemleak.c @@ -1414,6 +1414,7 @@ static void kmemleak_scan(void) /* data/bss scanning */ scan_large_block(_sdata, _edata); scan_large_block(__bss_start, __bss_stop); + scan_large_block(__start_data_ro_after_init, __end_data_ro_after_init); #ifdef CONFIG_SMP /* per-cpu sections scanning */ @@ -1453,8 +1454,11 @@ static void kmemleak_scan(void) read_lock(&tasklist_lock); do_each_thread(g, p) { - scan_block(task_stack_page(p), task_stack_page(p) + - THREAD_SIZE, NULL); + void *stack = try_get_task_stack(p); + if (stack) { + scan_block(stack, stack + THREAD_SIZE, NULL); + put_task_stack(p); + } } while_each_thread(g, p); read_unlock(&tasklist_lock); } diff --git a/mm/list_lru.c b/mm/list_lru.c index 1d05cb9d363d..234676e31edd 100644 --- a/mm/list_lru.c +++ b/mm/list_lru.c @@ -554,6 +554,8 @@ int __list_lru_init(struct list_lru *lru, bool memcg_aware, err = memcg_init_list_lru(lru, memcg_aware); if (err) { kfree(lru->node); + /* Do this so a list_lru_destroy() doesn't crash: */ + lru->node = NULL; goto out; } diff --git a/mm/memcontrol.c b/mm/memcontrol.c index ae052b5e3315..0f870ba43942 100644 --- a/mm/memcontrol.c +++ b/mm/memcontrol.c @@ -1917,6 +1917,15 @@ retry: current->flags & PF_EXITING)) 
goto force; + /* + * Prevent unbounded recursion when reclaim operations need to + * allocate memory. This might exceed the limits temporarily, + * but we prefer facilitating memory reclaim and getting back + * under the limit over triggering OOM kills in these cases. + */ + if (unlikely(current->flags & PF_MEMALLOC)) + goto force; + if (unlikely(task_in_memcg_oom(current))) goto nomem; diff --git a/mm/memory-failure.c b/mm/memory-failure.c index de88f33519c0..19e796d36a62 100644 --- a/mm/memory-failure.c +++ b/mm/memory-failure.c @@ -1112,10 +1112,10 @@ int memory_failure(unsigned long pfn, int trapno, int flags) } if (!PageHuge(p) && PageTransHuge(hpage)) { - lock_page(hpage); - if (!PageAnon(hpage) || unlikely(split_huge_page(hpage))) { - unlock_page(hpage); - if (!PageAnon(hpage)) + lock_page(p); + if (!PageAnon(p) || unlikely(split_huge_page(p))) { + unlock_page(p); + if (!PageAnon(p)) pr_err("Memory failure: %#lx: non anonymous thp\n", pfn); else @@ -1126,9 +1126,7 @@ int memory_failure(unsigned long pfn, int trapno, int flags) put_hwpoison_page(p); return -EBUSY; } - unlock_page(hpage); - get_hwpoison_page(p); - put_hwpoison_page(hpage); + unlock_page(p); VM_BUG_ON_PAGE(!page_count(p), p); hpage = compound_head(p); } diff --git a/mm/memory.c b/mm/memory.c index e18c57bdc75c..33f45edf8272 100644 --- a/mm/memory.c +++ b/mm/memory.c @@ -1637,8 +1637,8 @@ int vm_insert_pfn_prot(struct vm_area_struct *vma, unsigned long addr, if (addr < vma->vm_start || addr >= vma->vm_end) return -EFAULT; - if (track_pfn_insert(vma, &pgprot, __pfn_to_pfn_t(pfn, PFN_DEV))) - return -EINVAL; + + track_pfn_insert(vma, &pgprot, __pfn_to_pfn_t(pfn, PFN_DEV)); ret = insert_pfn(vma, addr, __pfn_to_pfn_t(pfn, PFN_DEV), pgprot); @@ -1655,8 +1655,8 @@ int vm_insert_mixed(struct vm_area_struct *vma, unsigned long addr, if (addr < vma->vm_start || addr >= vma->vm_end) return -EFAULT; - if (track_pfn_insert(vma, &pgprot, pfn)) - return -EINVAL; + + track_pfn_insert(vma, &pgprot, pfn); /* * If we don't have pte special, then we have to use the pfn_valid() diff --git a/mm/memory_hotplug.c b/mm/memory_hotplug.c index 962927309b6e..cad4b9125695 100644 --- a/mm/memory_hotplug.c +++ b/mm/memory_hotplug.c @@ -268,7 +268,6 @@ void __init register_page_bootmem_info_node(struct pglist_data *pgdat) unsigned long i, pfn, end_pfn, nr_pages; int node = pgdat->node_id; struct page *page; - struct zone *zone; nr_pages = PAGE_ALIGN(sizeof(struct pglist_data)) >> PAGE_SHIFT; page = virt_to_page(pgdat); @@ -276,19 +275,6 @@ void __init register_page_bootmem_info_node(struct pglist_data *pgdat) for (i = 0; i < nr_pages; i++, page++) get_page_bootmem(node, page, NODE_INFO); - zone = &pgdat->node_zones[0]; - for (; zone < pgdat->node_zones + MAX_NR_ZONES - 1; zone++) { - if (zone_is_initialized(zone)) { - nr_pages = zone->wait_table_hash_nr_entries - * sizeof(wait_queue_head_t); - nr_pages = PAGE_ALIGN(nr_pages) >> PAGE_SHIFT; - page = virt_to_page(zone->wait_table); - - for (i = 0; i < nr_pages; i++, page++) - get_page_bootmem(node, page, NODE_INFO); - } - } - pfn = pgdat->node_start_pfn; end_pfn = pgdat_end_pfn(pgdat); @@ -2131,7 +2117,6 @@ void try_offline_node(int nid) unsigned long start_pfn = pgdat->node_start_pfn; unsigned long end_pfn = start_pfn + pgdat->node_spanned_pages; unsigned long pfn; - int i; for (pfn = start_pfn; pfn < end_pfn; pfn += PAGES_PER_SECTION) { unsigned long section_nr = pfn_to_section_nr(pfn); @@ -2158,20 +2143,6 @@ void try_offline_node(int nid) */ node_set_offline(nid); unregister_one_node(nid); - - /* 
free waittable in each zone */ - for (i = 0; i < MAX_NR_ZONES; i++) { - struct zone *zone = pgdat->node_zones + i; - - /* - * wait_table may be allocated from boot memory, - * here only free if it's allocated by vmalloc. - */ - if (is_vmalloc_addr(zone->wait_table)) { - vfree(zone->wait_table); - zone->wait_table = NULL; - } - } } EXPORT_SYMBOL(try_offline_node); diff --git a/mm/mlock.c b/mm/mlock.c index 145a4258ddbc..cdbed8aaa426 100644 --- a/mm/mlock.c +++ b/mm/mlock.c @@ -190,10 +190,13 @@ unsigned int munlock_vma_page(struct page *page) */ spin_lock_irq(zone_lru_lock(zone)); - nr_pages = hpage_nr_pages(page); - if (!TestClearPageMlocked(page)) + if (!TestClearPageMlocked(page)) { + /* Potentially, PTE-mapped THP: do not skip the rest PTEs */ + nr_pages = 1; goto unlock_out; + } + nr_pages = hpage_nr_pages(page); __mod_zone_page_state(zone, NR_MLOCK, -nr_pages); if (__munlock_isolate_lru_page(page, true)) { diff --git a/mm/mremap.c b/mm/mremap.c index da22ad2a5678..30d7d2482eea 100644 --- a/mm/mremap.c +++ b/mm/mremap.c @@ -104,11 +104,13 @@ static pte_t move_soft_dirty_pte(pte_t pte) static void move_ptes(struct vm_area_struct *vma, pmd_t *old_pmd, unsigned long old_addr, unsigned long old_end, struct vm_area_struct *new_vma, pmd_t *new_pmd, - unsigned long new_addr, bool need_rmap_locks) + unsigned long new_addr, bool need_rmap_locks, bool *need_flush) { struct mm_struct *mm = vma->vm_mm; pte_t *old_pte, *new_pte, pte; spinlock_t *old_ptl, *new_ptl; + bool force_flush = false; + unsigned long len = old_end - old_addr; /* * When need_rmap_locks is true, we take the i_mmap_rwsem and anon_vma @@ -146,7 +148,19 @@ static void move_ptes(struct vm_area_struct *vma, pmd_t *old_pmd, new_pte++, new_addr += PAGE_SIZE) { if (pte_none(*old_pte)) continue; + pte = ptep_get_and_clear(mm, old_addr, old_pte); + /* + * If we are remapping a dirty PTE, make sure + * to flush TLB before we drop the PTL for the + * old PTE or we may race with page_mkclean(). + * + * This check has to be done after we removed the + * old PTE from page tables or another thread may + * dirty it after the check and before the removal. 
+ */ + if (pte_present(pte) && pte_dirty(pte)) + force_flush = true; pte = move_pte(pte, new_vma->vm_page_prot, old_addr, new_addr); pte = move_soft_dirty_pte(pte); set_pte_at(mm, new_addr, new_pte, pte); @@ -156,6 +170,10 @@ static void move_ptes(struct vm_area_struct *vma, pmd_t *old_pmd, if (new_ptl != old_ptl) spin_unlock(new_ptl); pte_unmap(new_pte - 1); + if (force_flush) + flush_tlb_range(vma, old_end - len, old_end); + else + *need_flush = true; pte_unmap_unlock(old_pte - 1, old_ptl); if (need_rmap_locks) drop_rmap_locks(vma); @@ -201,13 +219,12 @@ unsigned long move_page_tables(struct vm_area_struct *vma, if (need_rmap_locks) take_rmap_locks(vma); moved = move_huge_pmd(vma, old_addr, new_addr, - old_end, old_pmd, new_pmd); + old_end, old_pmd, new_pmd, + &need_flush); if (need_rmap_locks) drop_rmap_locks(vma); - if (moved) { - need_flush = true; + if (moved) continue; - } } split_huge_pmd(vma, old_pmd, old_addr); if (pmd_trans_unstable(old_pmd)) @@ -220,11 +237,10 @@ unsigned long move_page_tables(struct vm_area_struct *vma, extent = next - new_addr; if (extent > LATENCY_LIMIT) extent = LATENCY_LIMIT; - move_ptes(vma, old_pmd, old_addr, old_addr + extent, - new_vma, new_pmd, new_addr, need_rmap_locks); - need_flush = true; + move_ptes(vma, old_pmd, old_addr, old_addr + extent, new_vma, + new_pmd, new_addr, need_rmap_locks, &need_flush); } - if (likely(need_flush)) + if (need_flush) flush_tlb_range(vma, old_end-len, old_addr); mmu_notifier_invalidate_range_end(vma->vm_mm, mmun_start, mmun_end); diff --git a/mm/page_alloc.c b/mm/page_alloc.c index 2b3bf6767d54..61b0988bba8c 100644 --- a/mm/page_alloc.c +++ b/mm/page_alloc.c @@ -92,7 +92,7 @@ int _node_numa_mem_[MAX_NUMNODES]; #endif #ifdef CONFIG_GCC_PLUGIN_LATENT_ENTROPY -volatile u64 latent_entropy __latent_entropy; +volatile unsigned long latent_entropy __latent_entropy; EXPORT_SYMBOL(latent_entropy); #endif @@ -3658,7 +3658,7 @@ retry: /* Make sure we know about allocations which stall for too long */ if (time_after(jiffies, alloc_start + stall_timeout)) { warn_alloc(gfp_mask, - "page alloction stalls for %ums, order:%u\n", + "page allocation stalls for %ums, order:%u", jiffies_to_msecs(jiffies-alloc_start), order); stall_timeout += 10 * HZ; } @@ -4224,7 +4224,7 @@ static void show_migration_types(unsigned char type) } *p = '\0'; - printk("(%s) ", tmp); + printk(KERN_CONT "(%s) ", tmp); } /* @@ -4335,7 +4335,8 @@ void show_free_areas(unsigned int filter) free_pcp += per_cpu_ptr(zone->pageset, cpu)->pcp.count; show_node(zone); - printk("%s" + printk(KERN_CONT + "%s" " free:%lukB" " min:%lukB" " low:%lukB" @@ -4382,8 +4383,8 @@ void show_free_areas(unsigned int filter) K(zone_page_state(zone, NR_FREE_CMA_PAGES))); printk("lowmem_reserve[]:"); for (i = 0; i < MAX_NR_ZONES; i++) - printk(" %ld", zone->lowmem_reserve[i]); - printk("\n"); + printk(KERN_CONT " %ld", zone->lowmem_reserve[i]); + printk(KERN_CONT "\n"); } for_each_populated_zone(zone) { @@ -4394,7 +4395,7 @@ void show_free_areas(unsigned int filter) if (skip_free_areas_node(filter, zone_to_nid(zone))) continue; show_node(zone); - printk("%s: ", zone->name); + printk(KERN_CONT "%s: ", zone->name); spin_lock_irqsave(&zone->lock, flags); for (order = 0; order < MAX_ORDER; order++) { @@ -4412,11 +4413,12 @@ void show_free_areas(unsigned int filter) } spin_unlock_irqrestore(&zone->lock, flags); for (order = 0; order < MAX_ORDER; order++) { - printk("%lu*%lukB ", nr[order], K(1UL) << order); + printk(KERN_CONT "%lu*%lukB ", + nr[order], K(1UL) << order); if (nr[order]) 
show_migration_types(types[order]); } - printk("= %lukB\n", K(total)); + printk(KERN_CONT "= %lukB\n", K(total)); } hugetlb_show_meminfo(); @@ -4977,72 +4979,6 @@ void __ref build_all_zonelists(pg_data_t *pgdat, struct zone *zone) } /* - * Helper functions to size the waitqueue hash table. - * Essentially these want to choose hash table sizes sufficiently - * large so that collisions trying to wait on pages are rare. - * But in fact, the number of active page waitqueues on typical - * systems is ridiculously low, less than 200. So this is even - * conservative, even though it seems large. - * - * The constant PAGES_PER_WAITQUEUE specifies the ratio of pages to - * waitqueues, i.e. the size of the waitq table given the number of pages. - */ -#define PAGES_PER_WAITQUEUE 256 - -#ifndef CONFIG_MEMORY_HOTPLUG -static inline unsigned long wait_table_hash_nr_entries(unsigned long pages) -{ - unsigned long size = 1; - - pages /= PAGES_PER_WAITQUEUE; - - while (size < pages) - size <<= 1; - - /* - * Once we have dozens or even hundreds of threads sleeping - * on IO we've got bigger problems than wait queue collision. - * Limit the size of the wait table to a reasonable size. - */ - size = min(size, 4096UL); - - return max(size, 4UL); -} -#else -/* - * A zone's size might be changed by hot-add, so it is not possible to determine - * a suitable size for its wait_table. So we use the maximum size now. - * - * The max wait table size = 4096 x sizeof(wait_queue_head_t). ie: - * - * i386 (preemption config) : 4096 x 16 = 64Kbyte. - * ia64, x86-64 (no preemption): 4096 x 20 = 80Kbyte. - * ia64, x86-64 (preemption) : 4096 x 24 = 96Kbyte. - * - * The maximum entries are prepared when a zone's memory is (512K + 256) pages - * or more by the traditional way. (See above). It equals: - * - * i386, x86-64, powerpc(4K page size) : = ( 2G + 1M)byte. - * ia64(16K page size) : = ( 8G + 4M)byte. - * powerpc (64K page size) : = (32G +16M)byte. - */ -static inline unsigned long wait_table_hash_nr_entries(unsigned long pages) -{ - return 4096UL; -} -#endif - -/* - * This is an integer logarithm so that shifts can be used later - * to extract the more random high bits from the multiplicative - * hash function before the remainder is taken. - */ -static inline unsigned long wait_table_bits(unsigned long size) -{ - return ffz(~size); -} - -/* * Initially all pages are reserved - free ones are freed * up by free_all_bootmem() once the early boot process is * done. Non-atomic initialization, single-pass. @@ -5304,49 +5240,6 @@ void __init setup_per_cpu_pageset(void) alloc_percpu(struct per_cpu_nodestat); } -static noinline __ref -int zone_wait_table_init(struct zone *zone, unsigned long zone_size_pages) -{ - int i; - size_t alloc_size; - - /* - * The per-page waitqueue mechanism uses hashed waitqueues - * per zone. - */ - zone->wait_table_hash_nr_entries = - wait_table_hash_nr_entries(zone_size_pages); - zone->wait_table_bits = - wait_table_bits(zone->wait_table_hash_nr_entries); - alloc_size = zone->wait_table_hash_nr_entries - * sizeof(wait_queue_head_t); - - if (!slab_is_available()) { - zone->wait_table = (wait_queue_head_t *) - memblock_virt_alloc_node_nopanic( - alloc_size, zone->zone_pgdat->node_id); - } else { - /* - * This case means that a zone whose size was 0 gets new memory - * via memory hot-add. - * But it may be the case that a new node was hot-added. In - * this case vmalloc() will not be able to use this new node's - * memory - this wait_table must be initialized to use this new - * node itself as well. 
- * To use this new node's memory, further consideration will be - * necessary. - */ - zone->wait_table = vmalloc(alloc_size); - } - if (!zone->wait_table) - return -ENOMEM; - - for (i = 0; i < zone->wait_table_hash_nr_entries; ++i) - init_waitqueue_head(zone->wait_table + i); - - return 0; -} - static __meminit void zone_pcp_init(struct zone *zone) { /* @@ -5367,10 +5260,7 @@ int __meminit init_currently_empty_zone(struct zone *zone, unsigned long size) { struct pglist_data *pgdat = zone->zone_pgdat; - int ret; - ret = zone_wait_table_init(zone, size); - if (ret) - return ret; + pgdat->nr_zones = zone_idx(zone) + 1; zone->zone_start_pfn = zone_start_pfn; @@ -5382,6 +5272,7 @@ int __meminit init_currently_empty_zone(struct zone *zone, zone_start_pfn, (zone_start_pfn + size)); zone_init_free_lists(zone); + zone->initialized = 1; return 0; } @@ -6508,8 +6399,8 @@ unsigned long free_reserved_area(void *start, void *end, int poison, char *s) } if (pages && s) - pr_info("Freeing %s memory: %ldK (%p - %p)\n", - s, pages << (PAGE_SHIFT - 10), start, end); + pr_info("Freeing %s memory: %ldK\n", + s, pages << (PAGE_SHIFT - 10)); return pages; } diff --git a/mm/shmem.c b/mm/shmem.c index ad7813d73ea7..9d32e1cb9f38 100644 --- a/mm/shmem.c +++ b/mm/shmem.c @@ -1483,6 +1483,8 @@ static int shmem_replace_page(struct page **pagep, gfp_t gfp, copy_highpage(newpage, oldpage); flush_dcache_page(newpage); + __SetPageLocked(newpage); + __SetPageSwapBacked(newpage); SetPageUptodate(newpage); set_page_private(newpage, swap_index); SetPageSwapCache(newpage); @@ -1846,6 +1848,18 @@ unlock: return error; } +/* + * This is like autoremove_wake_function, but it removes the wait queue + * entry unconditionally - even if something else had already woken the + * target. + */ +static int synchronous_wake_function(wait_queue_t *wait, unsigned mode, int sync, void *key) +{ + int ret = default_wake_function(wait, mode, sync, key); + list_del_init(&wait->task_list); + return ret; +} + static int shmem_fault(struct vm_area_struct *vma, struct vm_fault *vmf) { struct inode *inode = file_inode(vma->vm_file); @@ -1881,7 +1895,7 @@ static int shmem_fault(struct vm_area_struct *vma, struct vm_fault *vmf) vmf->pgoff >= shmem_falloc->start && vmf->pgoff < shmem_falloc->next) { wait_queue_head_t *shmem_falloc_waitq; - DEFINE_WAIT(shmem_fault_wait); + DEFINE_WAIT_FUNC(shmem_fault_wait, synchronous_wake_function); ret = VM_FAULT_NOPAGE; if ((vmf->flags & FAULT_FLAG_ALLOW_RETRY) && @@ -2663,6 +2677,7 @@ static long shmem_fallocate(struct file *file, int mode, loff_t offset, spin_lock(&inode->i_lock); inode->i_private = NULL; wake_up_all(&shmem_falloc_waitq); + WARN_ON_ONCE(!list_empty(&shmem_falloc_waitq.task_list)); spin_unlock(&inode->i_lock); error = 0; goto out; diff --git a/mm/slab.c b/mm/slab.c index 090fb26b3a39..0b0550ca85b4 100644 --- a/mm/slab.c +++ b/mm/slab.c @@ -233,6 +233,7 @@ static void kmem_cache_node_init(struct kmem_cache_node *parent) spin_lock_init(&parent->list_lock); parent->free_objects = 0; parent->free_touched = 0; + parent->num_slabs = 0; } #define MAKE_LIST(cachep, listp, slab, nodeid) \ @@ -966,7 +967,7 @@ static int setup_kmem_cache_node(struct kmem_cache *cachep, * guaranteed to be valid until irq is re-enabled, because it will be * freed after synchronize_sched(). 
*/ - if (force_change) + if (old_shared && force_change) synchronize_sched(); fail: @@ -1382,24 +1383,27 @@ slab_out_of_memory(struct kmem_cache *cachep, gfp_t gfpflags, int nodeid) for_each_kmem_cache_node(cachep, node, n) { unsigned long active_objs = 0, num_objs = 0, free_objects = 0; unsigned long active_slabs = 0, num_slabs = 0; + unsigned long num_slabs_partial = 0, num_slabs_free = 0; + unsigned long num_slabs_full; spin_lock_irqsave(&n->list_lock, flags); - list_for_each_entry(page, &n->slabs_full, lru) { - active_objs += cachep->num; - active_slabs++; - } + num_slabs = n->num_slabs; list_for_each_entry(page, &n->slabs_partial, lru) { active_objs += page->active; - active_slabs++; + num_slabs_partial++; } list_for_each_entry(page, &n->slabs_free, lru) - num_slabs++; + num_slabs_free++; free_objects += n->free_objects; spin_unlock_irqrestore(&n->list_lock, flags); - num_slabs += active_slabs; num_objs = num_slabs * cachep->num; + active_slabs = num_slabs - num_slabs_free; + num_slabs_full = num_slabs - + (num_slabs_partial + num_slabs_free); + active_objs += (num_slabs_full * cachep->num); + pr_warn(" node %d: slabs: %ld/%ld, objs: %ld/%ld, free: %ld\n", node, active_slabs, num_slabs, active_objs, num_objs, free_objects); @@ -2314,6 +2318,7 @@ static int drain_freelist(struct kmem_cache *cache, page = list_entry(p, struct page, lru); list_del(&page->lru); + n->num_slabs--; /* * Safe to drop the lock. The slab is no longer linked * to the cache. @@ -2752,6 +2757,8 @@ static void cache_grow_end(struct kmem_cache *cachep, struct page *page) list_add_tail(&page->lru, &(n->slabs_free)); else fixup_slab_list(cachep, n, page, &list); + + n->num_slabs++; STATS_INC_GROWN(cachep); n->free_objects += cachep->num - page->active; spin_unlock(&n->list_lock); @@ -3443,6 +3450,7 @@ static void free_block(struct kmem_cache *cachep, void **objpp, page = list_last_entry(&n->slabs_free, struct page, lru); list_move(&page->lru, list); + n->num_slabs--; } } @@ -4099,6 +4107,8 @@ void get_slabinfo(struct kmem_cache *cachep, struct slabinfo *sinfo) unsigned long num_objs; unsigned long active_slabs = 0; unsigned long num_slabs, free_objects = 0, shared_avail = 0; + unsigned long num_slabs_partial = 0, num_slabs_free = 0; + unsigned long num_slabs_full = 0; const char *name; char *error = NULL; int node; @@ -4111,33 +4121,34 @@ void get_slabinfo(struct kmem_cache *cachep, struct slabinfo *sinfo) check_irq_on(); spin_lock_irq(&n->list_lock); - list_for_each_entry(page, &n->slabs_full, lru) { - if (page->active != cachep->num && !error) - error = "slabs_full accounting error"; - active_objs += cachep->num; - active_slabs++; - } + num_slabs += n->num_slabs; + list_for_each_entry(page, &n->slabs_partial, lru) { if (page->active == cachep->num && !error) error = "slabs_partial accounting error"; if (!page->active && !error) error = "slabs_partial accounting error"; active_objs += page->active; - active_slabs++; + num_slabs_partial++; } + list_for_each_entry(page, &n->slabs_free, lru) { if (page->active && !error) error = "slabs_free accounting error"; - num_slabs++; + num_slabs_free++; } + free_objects += n->free_objects; if (n->shared) shared_avail += n->shared->avail; spin_unlock_irq(&n->list_lock); } - num_slabs += active_slabs; num_objs = num_slabs * cachep->num; + active_slabs = num_slabs - num_slabs_free; + num_slabs_full = num_slabs - (num_slabs_partial + num_slabs_free); + active_objs += (num_slabs_full * cachep->num); + if (num_objs - active_objs != free_objects && !error) error = "free_objects 
accounting error"; diff --git a/mm/slab.h b/mm/slab.h index 9653f2e2591a..bc05fdc3edce 100644 --- a/mm/slab.h +++ b/mm/slab.h @@ -432,6 +432,7 @@ struct kmem_cache_node { struct list_head slabs_partial; /* partial list first, better asm code */ struct list_head slabs_full; struct list_head slabs_free; + unsigned long num_slabs; unsigned long free_objects; unsigned int free_limit; unsigned int colour_next; /* Per-node cache coloring */ diff --git a/mm/slab_common.c b/mm/slab_common.c index 71f0b28a1bec..329b03843863 100644 --- a/mm/slab_common.c +++ b/mm/slab_common.c @@ -533,8 +533,8 @@ void memcg_create_kmem_cache(struct mem_cgroup *memcg, s = create_cache(cache_name, root_cache->object_size, root_cache->size, root_cache->align, - root_cache->flags, root_cache->ctor, - memcg, root_cache); + root_cache->flags & CACHE_CREATE_MASK, + root_cache->ctor, memcg, root_cache); /* * If we could not create a memcg cache, do not complain, because * that's not critical at all as we can always proceed with the root diff --git a/mm/swapfile.c b/mm/swapfile.c index 2210de290b54..f30438970cd1 100644 --- a/mm/swapfile.c +++ b/mm/swapfile.c @@ -2224,6 +2224,8 @@ static unsigned long read_swap_header(struct swap_info_struct *p, swab32s(&swap_header->info.version); swab32s(&swap_header->info.last_page); swab32s(&swap_header->info.nr_badpages); + if (swap_header->info.nr_badpages > MAX_SWAP_BADPAGES) + return 0; for (i = 0; i < swap_header->info.nr_badpages; i++) swab32s(&swap_header->info.badpages[i]); } diff --git a/mm/truncate.c b/mm/truncate.c index a01cce450a26..8d8c62d89e6d 100644 --- a/mm/truncate.c +++ b/mm/truncate.c @@ -283,7 +283,7 @@ void truncate_inode_pages_range(struct address_space *mapping, if (!trylock_page(page)) continue; - WARN_ON(page_to_pgoff(page) != index); + WARN_ON(page_to_index(page) != index); if (PageWriteback(page)) { unlock_page(page); continue; @@ -371,7 +371,7 @@ void truncate_inode_pages_range(struct address_space *mapping, } lock_page(page); - WARN_ON(page_to_pgoff(page) != index); + WARN_ON(page_to_index(page) != index); wait_on_page_writeback(page); truncate_inode_page(mapping, page); unlock_page(page); @@ -492,7 +492,7 @@ unsigned long invalidate_mapping_pages(struct address_space *mapping, if (!trylock_page(page)) continue; - WARN_ON(page_to_pgoff(page) != index); + WARN_ON(page_to_index(page) != index); /* Middle of THP: skip */ if (PageTransTail(page)) { @@ -612,7 +612,7 @@ int invalidate_inode_pages2_range(struct address_space *mapping, } lock_page(page); - WARN_ON(page_to_pgoff(page) != index); + WARN_ON(page_to_index(page) != index); if (page->mapping != mapping) { unlock_page(page); continue; diff --git a/mm/vmscan.c b/mm/vmscan.c index 744f926af442..d75cdf360730 100644 --- a/mm/vmscan.c +++ b/mm/vmscan.c @@ -2354,6 +2354,8 @@ static void shrink_node_memcg(struct pglist_data *pgdat, struct mem_cgroup *memc } } + cond_resched(); + if (nr_reclaimed < nr_to_reclaim || scan_adjusted) continue; @@ -3043,7 +3045,9 @@ unsigned long try_to_free_mem_cgroup_pages(struct mem_cgroup *memcg, sc.gfp_mask, sc.reclaim_idx); + current->flags |= PF_MEMALLOC; nr_reclaimed = do_try_to_free_pages(zonelist, &sc); + current->flags &= ~PF_MEMALLOC; trace_mm_vmscan_memcg_reclaim_end(nr_reclaimed); diff --git a/mm/workingset.c b/mm/workingset.c index 617475f529f4..fb1f9183d89a 100644 --- a/mm/workingset.c +++ b/mm/workingset.c @@ -348,7 +348,7 @@ static unsigned long count_shadow_nodes(struct shrinker *shrinker, shadow_nodes = list_lru_shrink_count(&workingset_shadow_nodes, sc); 
local_irq_enable(); - if (memcg_kmem_enabled()) { + if (sc->memcg) { pages = mem_cgroup_node_nr_lru_pages(sc->memcg, sc->nid, LRU_ALL_FILE); } else { |