diff options
Diffstat (limited to 'mm')
-rw-r--r-- | mm/Kconfig | 9 |
-rw-r--r-- | mm/backing-dev.c | 2 |
-rw-r--r-- | mm/huge_memory.c | 2 |
-rw-r--r-- | mm/hugetlb.c | 7 |
-rw-r--r-- | mm/memblock.c | 2 |
-rw-r--r-- | mm/mempolicy.c | 14 |
-rw-r--r-- | mm/mmap.c | 62 |
-rw-r--r-- | mm/page_alloc.c | 2 |
-rw-r--r-- | mm/vmscan.c | 2 |
-rw-r--r-- | mm/vmstat.c | 70 |
10 files changed, 96 insertions, 76 deletions
diff --git a/mm/Kconfig b/mm/Kconfig index 97a4e06b15c0..03cbfa072f42 100644 --- a/mm/Kconfig +++ b/mm/Kconfig @@ -624,7 +624,7 @@ config ARCH_SUPPORTS_DEFERRED_STRUCT_PAGE_INIT bool config DEFERRED_STRUCT_PAGE_INIT - bool "Defer initialisation of struct pages to kswapd" + bool "Defer initialisation of struct pages to kthreads" default n depends on ARCH_SUPPORTS_DEFERRED_STRUCT_PAGE_INIT depends on MEMORY_HOTPLUG @@ -633,9 +633,10 @@ config DEFERRED_STRUCT_PAGE_INIT single thread. On very large machines this can take a considerable amount of time. If this option is set, large machines will bring up a subset of memmap at boot and then initialise the rest in parallel - when kswapd starts. This has a potential performance impact on - processes running early in the lifetime of the systemm until kswapd - finishes the initialisation. + by starting one-off "pgdatinitX" kernel thread for each node X. This + has a potential performance impact on processes running early in the + lifetime of the system until these kthreads finish the + initialisation. config IDLE_PAGE_TRACKING bool "Enable idle page tracking" diff --git a/mm/backing-dev.c b/mm/backing-dev.c index cc5d29d2da9b..926c76d56388 100644 --- a/mm/backing-dev.c +++ b/mm/backing-dev.c @@ -989,7 +989,7 @@ long wait_iff_congested(struct zone *zone, int sync, long timeout) * here rather than calling cond_resched(). 
*/ if (current->flags & PF_WQ_WORKER) - schedule_timeout(1); + schedule_timeout_uninterruptible(1); else cond_resched(); diff --git a/mm/huge_memory.c b/mm/huge_memory.c index 36c070167b71..08fc0ba2207e 100644 --- a/mm/huge_memory.c +++ b/mm/huge_memory.c @@ -3482,7 +3482,7 @@ static unsigned long deferred_split_scan(struct shrinker *shrink, spin_lock_irqsave(&pgdata->split_queue_lock, flags); /* Take pin on all head pages to avoid freeing them under us */ - list_for_each_safe(pos, next, &list) { + list_for_each_safe(pos, next, &pgdata->split_queue) { page = list_entry((void *)pos, struct page, mapping); page = compound_head(page); if (get_page_unless_zero(page)) { diff --git a/mm/hugetlb.c b/mm/hugetlb.c index 12908dcf5831..06ae13e869d0 100644 --- a/mm/hugetlb.c +++ b/mm/hugetlb.c @@ -1001,7 +1001,7 @@ static int hstate_next_node_to_free(struct hstate *h, nodemask_t *nodes_allowed) ((node = hstate_next_node_to_free(hs, mask)) || 1); \ nr_nodes--) -#if defined(CONFIG_CMA) && defined(CONFIG_X86_64) +#if defined(CONFIG_X86_64) && ((defined(CONFIG_MEMORY_ISOLATION) && defined(CONFIG_COMPACTION)) || defined(CONFIG_CMA)) static void destroy_compound_gigantic_page(struct page *page, unsigned int order) { @@ -1214,8 +1214,8 @@ void free_huge_page(struct page *page) set_page_private(page, 0); page->mapping = NULL; - BUG_ON(page_count(page)); - BUG_ON(page_mapcount(page)); + VM_BUG_ON_PAGE(page_count(page), page); + VM_BUG_ON_PAGE(page_mapcount(page), page); restore_reserve = PagePrivate(page); ClearPagePrivate(page); @@ -1286,6 +1286,7 @@ static void prep_compound_gigantic_page(struct page *page, unsigned int order) set_page_count(p, 0); set_compound_head(p, page); } + atomic_set(compound_mapcount_ptr(page), -1); } /* diff --git a/mm/memblock.c b/mm/memblock.c index d2ed81e59a94..dd7989929f13 100644 --- a/mm/memblock.c +++ b/mm/memblock.c @@ -1448,7 +1448,7 @@ void __init __memblock_free_late(phys_addr_t base, phys_addr_t size) * Remaining API functions */ -phys_addr_t 
__init memblock_phys_mem_size(void) +phys_addr_t __init_memblock memblock_phys_mem_size(void) { return memblock.memory.total_size; } diff --git a/mm/mempolicy.c b/mm/mempolicy.c index 27d135408a22..4c4187c0e1de 100644 --- a/mm/mempolicy.c +++ b/mm/mempolicy.c @@ -548,8 +548,7 @@ retry: goto retry; } - if (flags & (MPOL_MF_MOVE | MPOL_MF_MOVE_ALL)) - migrate_page_add(page, qp->pagelist, flags); + migrate_page_add(page, qp->pagelist, flags); } pte_unmap_unlock(pte - 1, ptl); cond_resched(); @@ -625,7 +624,7 @@ static int queue_pages_test_walk(unsigned long start, unsigned long end, unsigned long endvma = vma->vm_end; unsigned long flags = qp->flags; - if (vma->vm_flags & VM_PFNMAP) + if (!vma_migratable(vma)) return 1; if (endvma > end) @@ -644,16 +643,13 @@ static int queue_pages_test_walk(unsigned long start, unsigned long end, if (flags & MPOL_MF_LAZY) { /* Similar to task_numa_work, skip inaccessible VMAs */ - if (vma_migratable(vma) && - vma->vm_flags & (VM_READ | VM_EXEC | VM_WRITE)) + if (vma->vm_flags & (VM_READ | VM_EXEC | VM_WRITE)) change_prot_numa(vma, start, endvma); return 1; } - if ((flags & MPOL_MF_STRICT) || - ((flags & (MPOL_MF_MOVE | MPOL_MF_MOVE_ALL)) && - vma_migratable(vma))) - /* queue pages from current vma */ + /* queue pages from current vma */ + if (flags & (MPOL_MF_MOVE | MPOL_MF_MOVE_ALL)) return 0; return 1; } diff --git a/mm/mmap.c b/mm/mmap.c index cfc0cdca421e..2f2415a7a688 100644 --- a/mm/mmap.c +++ b/mm/mmap.c @@ -390,8 +390,9 @@ static long vma_compute_subtree_gap(struct vm_area_struct *vma) } #ifdef CONFIG_DEBUG_VM_RB -static int browse_rb(struct rb_root *root) +static int browse_rb(struct mm_struct *mm) { + struct rb_root *root = &mm->mm_rb; int i = 0, j, bug = 0; struct rb_node *nd, *pn = NULL; unsigned long prev = 0, pend = 0; @@ -414,12 +415,14 @@ static int browse_rb(struct rb_root *root) vma->vm_start, vma->vm_end); bug = 1; } + spin_lock(&mm->page_table_lock); if (vma->rb_subtree_gap != vma_compute_subtree_gap(vma)) { 
pr_emerg("free gap %lx, correct %lx\n", vma->rb_subtree_gap, vma_compute_subtree_gap(vma)); bug = 1; } + spin_unlock(&mm->page_table_lock); i++; pn = nd; prev = vma->vm_start; @@ -456,12 +459,16 @@ static void validate_mm(struct mm_struct *mm) struct vm_area_struct *vma = mm->mmap; while (vma) { + struct anon_vma *anon_vma = vma->anon_vma; struct anon_vma_chain *avc; - vma_lock_anon_vma(vma); - list_for_each_entry(avc, &vma->anon_vma_chain, same_vma) - anon_vma_interval_tree_verify(avc); - vma_unlock_anon_vma(vma); + if (anon_vma) { + anon_vma_lock_read(anon_vma); + list_for_each_entry(avc, &vma->anon_vma_chain, same_vma) + anon_vma_interval_tree_verify(avc); + anon_vma_unlock_read(anon_vma); + } + highest_address = vma->vm_end; vma = vma->vm_next; i++; @@ -475,7 +482,7 @@ static void validate_mm(struct mm_struct *mm) mm->highest_vm_end, highest_address); bug = 1; } - i = browse_rb(&mm->mm_rb); + i = browse_rb(mm); if (i != mm->map_count) { if (i != -1) pr_emerg("map_count %d rb %d\n", mm->map_count, i); @@ -2142,32 +2149,27 @@ static int acct_stack_growth(struct vm_area_struct *vma, unsigned long size, uns int expand_upwards(struct vm_area_struct *vma, unsigned long address) { struct mm_struct *mm = vma->vm_mm; - int error; + int error = 0; if (!(vma->vm_flags & VM_GROWSUP)) return -EFAULT; - /* - * We must make sure the anon_vma is allocated - * so that the anon_vma locking is not a noop. - */ + /* Guard against wrapping around to address 0. */ + if (address < PAGE_ALIGN(address+4)) + address = PAGE_ALIGN(address+4); + else + return -ENOMEM; + + /* We must make sure the anon_vma is allocated. */ if (unlikely(anon_vma_prepare(vma))) return -ENOMEM; - vma_lock_anon_vma(vma); /* * vma->vm_start/vm_end cannot change under us because the caller * is required to hold the mmap_sem in read mode. We need the * anon_vma lock to serialize against concurrent expand_stacks. - * Also guard against wrapping around to address 0. 
*/ - if (address < PAGE_ALIGN(address+4)) - address = PAGE_ALIGN(address+4); - else { - vma_unlock_anon_vma(vma); - return -ENOMEM; - } - error = 0; + anon_vma_lock_write(vma->anon_vma); /* Somebody else might have raced and expanded it already */ if (address > vma->vm_end) { @@ -2185,7 +2187,7 @@ int expand_upwards(struct vm_area_struct *vma, unsigned long address) * updates, but we only hold a shared mmap_sem * lock here, so we need to protect against * concurrent vma expansions. - * vma_lock_anon_vma() doesn't help here, as + * anon_vma_lock_write() doesn't help here, as * we don't guarantee that all growable vmas * in a mm share the same root anon vma. * So, we reuse mm->page_table_lock to guard @@ -2208,7 +2210,7 @@ int expand_upwards(struct vm_area_struct *vma, unsigned long address) } } } - vma_unlock_anon_vma(vma); + anon_vma_unlock_write(vma->anon_vma); khugepaged_enter_vma_merge(vma, vma->vm_flags); validate_mm(mm); return error; @@ -2224,25 +2226,21 @@ int expand_downwards(struct vm_area_struct *vma, struct mm_struct *mm = vma->vm_mm; int error; - /* - * We must make sure the anon_vma is allocated - * so that the anon_vma locking is not a noop. - */ - if (unlikely(anon_vma_prepare(vma))) - return -ENOMEM; - address &= PAGE_MASK; error = security_mmap_addr(address); if (error) return error; - vma_lock_anon_vma(vma); + /* We must make sure the anon_vma is allocated. */ + if (unlikely(anon_vma_prepare(vma))) + return -ENOMEM; /* * vma->vm_start/vm_end cannot change under us because the caller * is required to hold the mmap_sem in read mode. We need the * anon_vma lock to serialize against concurrent expand_stacks. */ + anon_vma_lock_write(vma->anon_vma); /* Somebody else might have raced and expanded it already */ if (address < vma->vm_start) { @@ -2260,7 +2258,7 @@ int expand_downwards(struct vm_area_struct *vma, * updates, but we only hold a shared mmap_sem * lock here, so we need to protect against * concurrent vma expansions. 
- * vma_lock_anon_vma() doesn't help here, as + * anon_vma_lock_write() doesn't help here, as * we don't guarantee that all growable vmas * in a mm share the same root anon vma. * So, we reuse mm->page_table_lock to guard @@ -2281,7 +2279,7 @@ int expand_downwards(struct vm_area_struct *vma, } } } - vma_unlock_anon_vma(vma); + anon_vma_unlock_write(vma->anon_vma); khugepaged_enter_vma_merge(vma, vma->vm_flags); validate_mm(mm); return error; diff --git a/mm/page_alloc.c b/mm/page_alloc.c index ea2c4d3e0c03..838ca8bb64f7 100644 --- a/mm/page_alloc.c +++ b/mm/page_alloc.c @@ -6620,7 +6620,7 @@ bool is_pageblock_removable_nolock(struct page *page) return !has_unmovable_pages(zone, page, 0, true); } -#ifdef CONFIG_CMA +#if (defined(CONFIG_MEMORY_ISOLATION) && defined(CONFIG_COMPACTION)) || defined(CONFIG_CMA) static unsigned long pfn_max_align_down(unsigned long pfn) { diff --git a/mm/vmscan.c b/mm/vmscan.c index eb3dd37ccd7c..71b1c29948db 100644 --- a/mm/vmscan.c +++ b/mm/vmscan.c @@ -1443,7 +1443,7 @@ int isolate_lru_page(struct page *page) int ret = -EBUSY; VM_BUG_ON_PAGE(!page_count(page), page); - VM_BUG_ON_PAGE(PageTail(page), page); + WARN_RATELIMIT(PageTail(page), "trying to isolate tail page"); if (PageLRU(page)) { struct zone *zone = page_zone(page); diff --git a/mm/vmstat.c b/mm/vmstat.c index 40b2c74ddf16..084c6725b373 100644 --- a/mm/vmstat.c +++ b/mm/vmstat.c @@ -1396,10 +1396,15 @@ static void vmstat_update(struct work_struct *w) * Counters were updated so we expect more updates * to occur in the future. Keep on running the * update worker thread. + * If we were marked on cpu_stat_off clear the flag + * so that vmstat_shepherd doesn't schedule us again. 
*/ - queue_delayed_work_on(smp_processor_id(), vmstat_wq, - this_cpu_ptr(&vmstat_work), - round_jiffies_relative(sysctl_stat_interval)); + if (!cpumask_test_and_clear_cpu(smp_processor_id(), + cpu_stat_off)) { + queue_delayed_work_on(smp_processor_id(), vmstat_wq, + this_cpu_ptr(&vmstat_work), + round_jiffies_relative(sysctl_stat_interval)); + } } else { /* * We did not update any counters so the app may be in @@ -1417,18 +1422,6 @@ static void vmstat_update(struct work_struct *w) * until the diffs stay at zero. The function is used by NOHZ and can only be * invoked when tick processing is not active. */ -void quiet_vmstat(void) -{ - if (system_state != SYSTEM_RUNNING) - return; - - do { - if (!cpumask_test_and_set_cpu(smp_processor_id(), cpu_stat_off)) - cancel_delayed_work(this_cpu_ptr(&vmstat_work)); - - } while (refresh_cpu_vm_stats(false)); -} - /* * Check if the diffs for a certain cpu indicate that * an update is needed. @@ -1452,6 +1445,30 @@ static bool need_update(int cpu) return false; } +void quiet_vmstat(void) +{ + if (system_state != SYSTEM_RUNNING) + return; + + /* + * If we are already in hands of the shepherd then there + * is nothing for us to do here. + */ + if (cpumask_test_and_set_cpu(smp_processor_id(), cpu_stat_off)) + return; + + if (!need_update(smp_processor_id())) + return; + + /* + * Just refresh counters and do not care about the pending delayed + * vmstat_update. It doesn't fire that often to matter and canceling + * it would be too expensive from this path. + * vmstat_shepherd will take care about that for us. 
+ */ + refresh_cpu_vm_stats(false); +} + /* * Shepherd worker thread that checks the @@ -1469,18 +1486,25 @@ static void vmstat_shepherd(struct work_struct *w) get_online_cpus(); /* Check processors whose vmstat worker threads have been disabled */ - for_each_cpu(cpu, cpu_stat_off) - if (need_update(cpu) && - cpumask_test_and_clear_cpu(cpu, cpu_stat_off)) - - queue_delayed_work_on(cpu, vmstat_wq, - &per_cpu(vmstat_work, cpu), 0); + for_each_cpu(cpu, cpu_stat_off) { + struct delayed_work *dw = &per_cpu(vmstat_work, cpu); + if (need_update(cpu)) { + if (cpumask_test_and_clear_cpu(cpu, cpu_stat_off)) + queue_delayed_work_on(cpu, vmstat_wq, dw, 0); + } else { + /* + * Cancel the work if quiet_vmstat has put this + * cpu on cpu_stat_off because the work item might + * be still scheduled + */ + cancel_delayed_work(dw); + } + } put_online_cpus(); schedule_delayed_work(&shepherd, round_jiffies_relative(sysctl_stat_interval)); - } static void __init start_shepherd_timer(void) @@ -1488,7 +1512,7 @@ static void __init start_shepherd_timer(void) int cpu; for_each_possible_cpu(cpu) - INIT_DELAYED_WORK(per_cpu_ptr(&vmstat_work, cpu), + INIT_DEFERRABLE_WORK(per_cpu_ptr(&vmstat_work, cpu), vmstat_update); if (!alloc_cpumask_var(&cpu_stat_off, GFP_KERNEL)) |