diff options
author | Ingo Molnar <mingo@kernel.org> | 2012-04-14 13:18:27 +0200 |
---|---|---|
committer | Ingo Molnar <mingo@kernel.org> | 2012-04-14 13:19:04 +0200 |
commit | 6ac1ef482d7ae0c690f1640bf6eb818ff9a2d91e (patch) | |
tree | 021cc9f6b477146fcebe6f3be4752abfa2ba18a9 /mm | |
parent | 682968e0c425c60f0dde37977e5beb2b12ddc4cc (diff) | |
parent | a385ec4f11bdcf81af094c03e2444ee9b7fad2e5 (diff) | |
download | blackbird-op-linux-6ac1ef482d7ae0c690f1640bf6eb818ff9a2d91e.tar.gz blackbird-op-linux-6ac1ef482d7ae0c690f1640bf6eb818ff9a2d91e.zip |
Merge branch 'perf/core' into perf/uprobes
Merge in latest upstream (and the latest perf development tree),
to prepare for tooling changes, and also to pick up v3.4 MM
changes that the uprobes code needs to take care of.
Signed-off-by: Ingo Molnar <mingo@kernel.org>
Diffstat (limited to 'mm')
40 files changed, 1727 insertions, 1197 deletions
diff --git a/mm/bootmem.c b/mm/bootmem.c index 668e94df8cf2..0131170c9d54 100644 --- a/mm/bootmem.c +++ b/mm/bootmem.c @@ -766,14 +766,13 @@ void * __init alloc_bootmem_section(unsigned long size, unsigned long section_nr) { bootmem_data_t *bdata; - unsigned long pfn, goal, limit; + unsigned long pfn, goal; pfn = section_nr_to_pfn(section_nr); goal = pfn << PAGE_SHIFT; - limit = section_nr_to_pfn(section_nr + 1) << PAGE_SHIFT; bdata = &bootmem_node_data[early_pfn_to_nid(pfn)]; - return alloc_bootmem_core(bdata, size, SMP_CACHE_BYTES, goal, limit); + return alloc_bootmem_core(bdata, size, SMP_CACHE_BYTES, goal, 0); } #endif diff --git a/mm/bounce.c b/mm/bounce.c index 4e9ae722af83..d1be02ca1889 100644 --- a/mm/bounce.c +++ b/mm/bounce.c @@ -50,9 +50,9 @@ static void bounce_copy_vec(struct bio_vec *to, unsigned char *vfrom) unsigned char *vto; local_irq_save(flags); - vto = kmap_atomic(to->bv_page, KM_BOUNCE_READ); + vto = kmap_atomic(to->bv_page); memcpy(vto + to->bv_offset, vfrom, to->bv_len); - kunmap_atomic(vto, KM_BOUNCE_READ); + kunmap_atomic(vto); local_irq_restore(flags); } diff --git a/mm/cleancache.c b/mm/cleancache.c index bcaae4c2a770..5646c740f613 100644 --- a/mm/cleancache.c +++ b/mm/cleancache.c @@ -15,29 +15,34 @@ #include <linux/fs.h> #include <linux/exportfs.h> #include <linux/mm.h> +#include <linux/debugfs.h> #include <linux/cleancache.h> /* * This global enablement flag may be read thousands of times per second - * by cleancache_get/put/flush even on systems where cleancache_ops + * by cleancache_get/put/invalidate even on systems where cleancache_ops * is not claimed (e.g. cleancache is config'ed on but remains * disabled), so is preferred to the slower alternative: a function * call that checks a non-global. */ -int cleancache_enabled; +int cleancache_enabled __read_mostly; EXPORT_SYMBOL(cleancache_enabled); /* * cleancache_ops is set by cleancache_ops_register to contain the pointers * to the cleancache "backend" implementation functions. 
*/ -static struct cleancache_ops cleancache_ops; +static struct cleancache_ops cleancache_ops __read_mostly; -/* useful stats available in /sys/kernel/mm/cleancache */ -static unsigned long cleancache_succ_gets; -static unsigned long cleancache_failed_gets; -static unsigned long cleancache_puts; -static unsigned long cleancache_flushes; +/* + * Counters available via /sys/kernel/debug/frontswap (if debugfs is + * properly configured. These are for information only so are not protected + * against increment races. + */ +static u64 cleancache_succ_gets; +static u64 cleancache_failed_gets; +static u64 cleancache_puts; +static u64 cleancache_invalidates; /* * register operations for cleancache, returning previous thus allowing @@ -148,10 +153,11 @@ void __cleancache_put_page(struct page *page) EXPORT_SYMBOL(__cleancache_put_page); /* - * Flush any data from cleancache associated with the poolid and the + * Invalidate any data from cleancache associated with the poolid and the * page's inode and page index so that a subsequent "get" will fail. */ -void __cleancache_flush_page(struct address_space *mapping, struct page *page) +void __cleancache_invalidate_page(struct address_space *mapping, + struct page *page) { /* careful... 
page->mapping is NULL sometimes when this is called */ int pool_id = mapping->host->i_sb->cleancache_poolid; @@ -160,85 +166,57 @@ void __cleancache_flush_page(struct address_space *mapping, struct page *page) if (pool_id >= 0) { VM_BUG_ON(!PageLocked(page)); if (cleancache_get_key(mapping->host, &key) >= 0) { - (*cleancache_ops.flush_page)(pool_id, key, page->index); - cleancache_flushes++; + (*cleancache_ops.invalidate_page)(pool_id, + key, page->index); + cleancache_invalidates++; } } } -EXPORT_SYMBOL(__cleancache_flush_page); +EXPORT_SYMBOL(__cleancache_invalidate_page); /* - * Flush all data from cleancache associated with the poolid and the + * Invalidate all data from cleancache associated with the poolid and the * mappings's inode so that all subsequent gets to this poolid/inode * will fail. */ -void __cleancache_flush_inode(struct address_space *mapping) +void __cleancache_invalidate_inode(struct address_space *mapping) { int pool_id = mapping->host->i_sb->cleancache_poolid; struct cleancache_filekey key = { .u.key = { 0 } }; if (pool_id >= 0 && cleancache_get_key(mapping->host, &key) >= 0) - (*cleancache_ops.flush_inode)(pool_id, key); + (*cleancache_ops.invalidate_inode)(pool_id, key); } -EXPORT_SYMBOL(__cleancache_flush_inode); +EXPORT_SYMBOL(__cleancache_invalidate_inode); /* * Called by any cleancache-enabled filesystem at time of unmount; * note that pool_id is surrendered and may be reutrned by a subsequent * cleancache_init_fs or cleancache_init_shared_fs */ -void __cleancache_flush_fs(struct super_block *sb) +void __cleancache_invalidate_fs(struct super_block *sb) { if (sb->cleancache_poolid >= 0) { int old_poolid = sb->cleancache_poolid; sb->cleancache_poolid = -1; - (*cleancache_ops.flush_fs)(old_poolid); + (*cleancache_ops.invalidate_fs)(old_poolid); } } -EXPORT_SYMBOL(__cleancache_flush_fs); - -#ifdef CONFIG_SYSFS - -/* see Documentation/ABI/xxx/sysfs-kernel-mm-cleancache */ - -#define CLEANCACHE_SYSFS_RO(_name) \ - static ssize_t 
cleancache_##_name##_show(struct kobject *kobj, \ - struct kobj_attribute *attr, char *buf) \ - { \ - return sprintf(buf, "%lu\n", cleancache_##_name); \ - } \ - static struct kobj_attribute cleancache_##_name##_attr = { \ - .attr = { .name = __stringify(_name), .mode = 0444 }, \ - .show = cleancache_##_name##_show, \ - } - -CLEANCACHE_SYSFS_RO(succ_gets); -CLEANCACHE_SYSFS_RO(failed_gets); -CLEANCACHE_SYSFS_RO(puts); -CLEANCACHE_SYSFS_RO(flushes); - -static struct attribute *cleancache_attrs[] = { - &cleancache_succ_gets_attr.attr, - &cleancache_failed_gets_attr.attr, - &cleancache_puts_attr.attr, - &cleancache_flushes_attr.attr, - NULL, -}; - -static struct attribute_group cleancache_attr_group = { - .attrs = cleancache_attrs, - .name = "cleancache", -}; - -#endif /* CONFIG_SYSFS */ +EXPORT_SYMBOL(__cleancache_invalidate_fs); static int __init init_cleancache(void) { -#ifdef CONFIG_SYSFS - int err; - - err = sysfs_create_group(mm_kobj, &cleancache_attr_group); -#endif /* CONFIG_SYSFS */ +#ifdef CONFIG_DEBUG_FS + struct dentry *root = debugfs_create_dir("cleancache", NULL); + if (root == NULL) + return -ENXIO; + debugfs_create_u64("succ_gets", S_IRUGO, root, &cleancache_succ_gets); + debugfs_create_u64("failed_gets", S_IRUGO, + root, &cleancache_failed_gets); + debugfs_create_u64("puts", S_IRUGO, root, &cleancache_puts); + debugfs_create_u64("invalidates", S_IRUGO, + root, &cleancache_invalidates); +#endif return 0; } module_init(init_cleancache) diff --git a/mm/compaction.c b/mm/compaction.c index d9ebebe1a2aa..74a8c825ff28 100644 --- a/mm/compaction.c +++ b/mm/compaction.c @@ -35,7 +35,7 @@ struct compact_control { unsigned long migrate_pfn; /* isolate_migratepages search base */ bool sync; /* Synchronous migration */ - unsigned int order; /* order a direct compactor needs */ + int order; /* order a direct compactor needs */ int migratetype; /* MOVABLE, RECLAIMABLE etc */ struct zone *zone; }; @@ -675,49 +675,71 @@ unsigned long try_to_compact_pages(struct 
zonelist *zonelist, /* Compact all zones within a node */ -static int compact_node(int nid) +static int __compact_pgdat(pg_data_t *pgdat, struct compact_control *cc) { int zoneid; - pg_data_t *pgdat; struct zone *zone; - if (nid < 0 || nid >= nr_node_ids || !node_online(nid)) - return -EINVAL; - pgdat = NODE_DATA(nid); - - /* Flush pending updates to the LRU lists */ - lru_add_drain_all(); - for (zoneid = 0; zoneid < MAX_NR_ZONES; zoneid++) { - struct compact_control cc = { - .nr_freepages = 0, - .nr_migratepages = 0, - .order = -1, - .sync = true, - }; zone = &pgdat->node_zones[zoneid]; if (!populated_zone(zone)) continue; - cc.zone = zone; - INIT_LIST_HEAD(&cc.freepages); - INIT_LIST_HEAD(&cc.migratepages); - - compact_zone(zone, &cc); + cc->nr_freepages = 0; + cc->nr_migratepages = 0; + cc->zone = zone; + INIT_LIST_HEAD(&cc->freepages); + INIT_LIST_HEAD(&cc->migratepages); + + if (cc->order == -1 || !compaction_deferred(zone, cc->order)) + compact_zone(zone, cc); + + if (cc->order > 0) { + int ok = zone_watermark_ok(zone, cc->order, + low_wmark_pages(zone), 0, 0); + if (ok && cc->order > zone->compact_order_failed) + zone->compact_order_failed = cc->order + 1; + /* Currently async compaction is never deferred. 
*/ + else if (!ok && cc->sync) + defer_compaction(zone, cc->order); + } - VM_BUG_ON(!list_empty(&cc.freepages)); - VM_BUG_ON(!list_empty(&cc.migratepages)); + VM_BUG_ON(!list_empty(&cc->freepages)); + VM_BUG_ON(!list_empty(&cc->migratepages)); } return 0; } +int compact_pgdat(pg_data_t *pgdat, int order) +{ + struct compact_control cc = { + .order = order, + .sync = false, + }; + + return __compact_pgdat(pgdat, &cc); +} + +static int compact_node(int nid) +{ + struct compact_control cc = { + .order = -1, + .sync = true, + }; + + return __compact_pgdat(NODE_DATA(nid), &cc); +} + /* Compact all nodes in the system */ static int compact_nodes(void) { int nid; + /* Flush pending updates to the LRU lists */ + lru_add_drain_all(); + for_each_online_node(nid) compact_node(nid); @@ -750,7 +772,14 @@ ssize_t sysfs_compact_node(struct device *dev, struct device_attribute *attr, const char *buf, size_t count) { - compact_node(dev->id); + int nid = dev->id; + + if (nid >= 0 && nid < nr_node_ids && node_online(nid)) { + /* Flush pending updates to the LRU lists */ + lru_add_drain_all(); + + compact_node(nid); + } return count; } diff --git a/mm/filemap.c b/mm/filemap.c index b66275757c28..79c4b2b0b14e 100644 --- a/mm/filemap.c +++ b/mm/filemap.c @@ -101,9 +101,8 @@ * ->inode->i_lock (zap_pte_range->set_page_dirty) * ->private_lock (zap_pte_range->__set_page_dirty_buffers) * - * (code doesn't rely on that order, so you could switch it around) - * ->tasklist_lock (memory_failure, collect_procs_ao) - * ->i_mmap_mutex + * ->i_mmap_mutex + * ->tasklist_lock (memory_failure, collect_procs_ao) */ /* @@ -123,7 +122,7 @@ void __delete_from_page_cache(struct page *page) if (PageUptodate(page) && PageMappedToDisk(page)) cleancache_put_page(page); else - cleancache_flush_page(mapping, page); + cleancache_invalidate_page(mapping, page); radix_tree_delete(&mapping->page_tree, page->index); page->mapping = NULL; @@ -500,10 +499,13 @@ struct page *__page_cache_alloc(gfp_t gfp) struct page 
*page; if (cpuset_do_page_mem_spread()) { - get_mems_allowed(); - n = cpuset_mem_spread_node(); - page = alloc_pages_exact_node(n, gfp, 0); - put_mems_allowed(); + unsigned int cpuset_mems_cookie; + do { + cpuset_mems_cookie = get_mems_allowed(); + n = cpuset_mem_spread_node(); + page = alloc_pages_exact_node(n, gfp, 0); + } while (!put_mems_allowed(cpuset_mems_cookie) && !page); + return page; } return alloc_pages(gfp, 0); @@ -811,20 +813,19 @@ EXPORT_SYMBOL(find_or_create_page); unsigned find_get_pages(struct address_space *mapping, pgoff_t start, unsigned int nr_pages, struct page **pages) { - unsigned int i; - unsigned int ret; - unsigned int nr_found, nr_skip; + struct radix_tree_iter iter; + void **slot; + unsigned ret = 0; + + if (unlikely(!nr_pages)) + return 0; rcu_read_lock(); restart: - nr_found = radix_tree_gang_lookup_slot(&mapping->page_tree, - (void ***)pages, NULL, start, nr_pages); - ret = 0; - nr_skip = 0; - for (i = 0; i < nr_found; i++) { + radix_tree_for_each_slot(slot, &mapping->page_tree, &iter, start) { struct page *page; repeat: - page = radix_tree_deref_slot((void **)pages[i]); + page = radix_tree_deref_slot(slot); if (unlikely(!page)) continue; @@ -835,7 +836,7 @@ repeat: * when entry at index 0 moves out of or back * to root: none yet gotten, safe to restart. */ - WARN_ON(start | i); + WARN_ON(iter.index); goto restart; } /* @@ -843,7 +844,6 @@ repeat: * here as an exceptional entry: so skip over it - * we only reach this from invalidate_mapping_pages(). */ - nr_skip++; continue; } @@ -851,21 +851,16 @@ repeat: goto repeat; /* Has the page moved? */ - if (unlikely(page != *((void **)pages[i]))) { + if (unlikely(page != *slot)) { page_cache_release(page); goto repeat; } pages[ret] = page; - ret++; + if (++ret == nr_pages) + break; } - /* - * If all entries were removed before we could secure them, - * try again, because callers stop trying once 0 is returned. 
- */ - if (unlikely(!ret && nr_found > nr_skip)) - goto restart; rcu_read_unlock(); return ret; } @@ -885,21 +880,22 @@ repeat: unsigned find_get_pages_contig(struct address_space *mapping, pgoff_t index, unsigned int nr_pages, struct page **pages) { - unsigned int i; - unsigned int ret; - unsigned int nr_found; + struct radix_tree_iter iter; + void **slot; + unsigned int ret = 0; + + if (unlikely(!nr_pages)) + return 0; rcu_read_lock(); restart: - nr_found = radix_tree_gang_lookup_slot(&mapping->page_tree, - (void ***)pages, NULL, index, nr_pages); - ret = 0; - for (i = 0; i < nr_found; i++) { + radix_tree_for_each_contig(slot, &mapping->page_tree, &iter, index) { struct page *page; repeat: - page = radix_tree_deref_slot((void **)pages[i]); + page = radix_tree_deref_slot(slot); + /* The hole, there no reason to continue */ if (unlikely(!page)) - continue; + break; if (radix_tree_exception(page)) { if (radix_tree_deref_retry(page)) { @@ -922,7 +918,7 @@ repeat: goto repeat; /* Has the page moved? */ - if (unlikely(page != *((void **)pages[i]))) { + if (unlikely(page != *slot)) { page_cache_release(page); goto repeat; } @@ -932,14 +928,14 @@ repeat: * otherwise we can get both false positives and false * negatives, which is just confusing to the caller. 
*/ - if (page->mapping == NULL || page->index != index) { + if (page->mapping == NULL || page->index != iter.index) { page_cache_release(page); break; } pages[ret] = page; - ret++; - index++; + if (++ret == nr_pages) + break; } rcu_read_unlock(); return ret; @@ -960,19 +956,20 @@ EXPORT_SYMBOL(find_get_pages_contig); unsigned find_get_pages_tag(struct address_space *mapping, pgoff_t *index, int tag, unsigned int nr_pages, struct page **pages) { - unsigned int i; - unsigned int ret; - unsigned int nr_found; + struct radix_tree_iter iter; + void **slot; + unsigned ret = 0; + + if (unlikely(!nr_pages)) + return 0; rcu_read_lock(); restart: - nr_found = radix_tree_gang_lookup_tag_slot(&mapping->page_tree, - (void ***)pages, *index, nr_pages, tag); - ret = 0; - for (i = 0; i < nr_found; i++) { + radix_tree_for_each_tagged(slot, &mapping->page_tree, + &iter, *index, tag) { struct page *page; repeat: - page = radix_tree_deref_slot((void **)pages[i]); + page = radix_tree_deref_slot(slot); if (unlikely(!page)) continue; @@ -996,21 +993,16 @@ repeat: goto repeat; /* Has the page moved? */ - if (unlikely(page != *((void **)pages[i]))) { + if (unlikely(page != *slot)) { page_cache_release(page); goto repeat; } pages[ret] = page; - ret++; + if (++ret == nr_pages) + break; } - /* - * If all entries were removed before we could secure them, - * try again, because callers stop trying once 0 is returned. - */ - if (unlikely(!ret && nr_found)) - goto restart; rcu_read_unlock(); if (ret) @@ -1318,10 +1310,10 @@ int file_read_actor(read_descriptor_t *desc, struct page *page, * taking the kmap. 
*/ if (!fault_in_pages_writeable(desc->arg.buf, size)) { - kaddr = kmap_atomic(page, KM_USER0); + kaddr = kmap_atomic(page); left = __copy_to_user_inatomic(desc->arg.buf, kaddr + offset, size); - kunmap_atomic(kaddr, KM_USER0); + kunmap_atomic(kaddr); if (left == 0) goto success; } @@ -2045,7 +2037,7 @@ size_t iov_iter_copy_from_user_atomic(struct page *page, size_t copied; BUG_ON(!in_atomic()); - kaddr = kmap_atomic(page, KM_USER0); + kaddr = kmap_atomic(page); if (likely(i->nr_segs == 1)) { int left; char __user *buf = i->iov->iov_base + i->iov_offset; @@ -2055,7 +2047,7 @@ size_t iov_iter_copy_from_user_atomic(struct page *page, copied = __iovec_copy_from_user_inatomic(kaddr + offset, i->iov, i->iov_offset, bytes); } - kunmap_atomic(kaddr, KM_USER0); + kunmap_atomic(kaddr); return copied; } @@ -2341,7 +2333,9 @@ struct page *grab_cache_page_write_begin(struct address_space *mapping, struct page *page; gfp_t gfp_notmask = 0; - gfp_mask = mapping_gfp_mask(mapping) | __GFP_WRITE; + gfp_mask = mapping_gfp_mask(mapping); + if (mapping_cap_account_dirty(mapping)) + gfp_mask |= __GFP_WRITE; if (flags & AOP_FLAG_NOFS) gfp_notmask = __GFP_FS; repeat: diff --git a/mm/huge_memory.c b/mm/huge_memory.c index 91d3efb25d15..f0e5306eeb55 100644 --- a/mm/huge_memory.c +++ b/mm/huge_memory.c @@ -671,6 +671,7 @@ static int __do_huge_pmd_anonymous_page(struct mm_struct *mm, set_pmd_at(mm, haddr, pmd, entry); prepare_pmd_huge_pte(pgtable, mm); add_mm_counter(mm, MM_ANONPAGES, HPAGE_PMD_NR); + mm->nr_ptes++; spin_unlock(&mm->page_table_lock); } @@ -789,6 +790,7 @@ int copy_huge_pmd(struct mm_struct *dst_mm, struct mm_struct *src_mm, pmd = pmd_mkold(pmd_wrprotect(pmd)); set_pmd_at(dst_mm, addr, dst_pmd, pmd); prepare_pmd_huge_pte(pgtable, dst_mm); + dst_mm->nr_ptes++; ret = 0; out_unlock: @@ -887,7 +889,6 @@ static int do_huge_pmd_wp_page_fallback(struct mm_struct *mm, } kfree(pages); - mm->nr_ptes++; smp_wmb(); /* make pte visible before pmd */ pmd_populate(mm, pmd, pgtable); 
page_remove_rmap(page); @@ -1030,31 +1031,23 @@ int zap_huge_pmd(struct mmu_gather *tlb, struct vm_area_struct *vma, { int ret = 0; - spin_lock(&tlb->mm->page_table_lock); - if (likely(pmd_trans_huge(*pmd))) { - if (unlikely(pmd_trans_splitting(*pmd))) { - spin_unlock(&tlb->mm->page_table_lock); - wait_split_huge_page(vma->anon_vma, - pmd); - } else { - struct page *page; - pgtable_t pgtable; - pgtable = get_pmd_huge_pte(tlb->mm); - page = pmd_page(*pmd); - pmd_clear(pmd); - tlb_remove_pmd_tlb_entry(tlb, pmd, addr); - page_remove_rmap(page); - VM_BUG_ON(page_mapcount(page) < 0); - add_mm_counter(tlb->mm, MM_ANONPAGES, -HPAGE_PMD_NR); - VM_BUG_ON(!PageHead(page)); - spin_unlock(&tlb->mm->page_table_lock); - tlb_remove_page(tlb, page); - pte_free(tlb->mm, pgtable); - ret = 1; - } - } else + if (__pmd_trans_huge_lock(pmd, vma) == 1) { + struct page *page; + pgtable_t pgtable; + pgtable = get_pmd_huge_pte(tlb->mm); + page = pmd_page(*pmd); + pmd_clear(pmd); + tlb_remove_pmd_tlb_entry(tlb, pmd, addr); + page_remove_rmap(page); + VM_BUG_ON(page_mapcount(page) < 0); + add_mm_counter(tlb->mm, MM_ANONPAGES, -HPAGE_PMD_NR); + VM_BUG_ON(!PageHead(page)); + tlb->mm->nr_ptes--; spin_unlock(&tlb->mm->page_table_lock); - + tlb_remove_page(tlb, page); + pte_free(tlb->mm, pgtable); + ret = 1; + } return ret; } @@ -1064,21 +1057,15 @@ int mincore_huge_pmd(struct vm_area_struct *vma, pmd_t *pmd, { int ret = 0; - spin_lock(&vma->vm_mm->page_table_lock); - if (likely(pmd_trans_huge(*pmd))) { - ret = !pmd_trans_splitting(*pmd); - spin_unlock(&vma->vm_mm->page_table_lock); - if (unlikely(!ret)) - wait_split_huge_page(vma->anon_vma, pmd); - else { - /* - * All logical pages in the range are present - * if backed by a huge page. - */ - memset(vec, 1, (end - addr) >> PAGE_SHIFT); - } - } else + if (__pmd_trans_huge_lock(pmd, vma) == 1) { + /* + * All logical pages in the range are present + * if backed by a huge page. 
+ */ spin_unlock(&vma->vm_mm->page_table_lock); + memset(vec, 1, (end - addr) >> PAGE_SHIFT); + ret = 1; + } return ret; } @@ -1108,20 +1095,11 @@ int move_huge_pmd(struct vm_area_struct *vma, struct vm_area_struct *new_vma, goto out; } - spin_lock(&mm->page_table_lock); - if (likely(pmd_trans_huge(*old_pmd))) { - if (pmd_trans_splitting(*old_pmd)) { - spin_unlock(&mm->page_table_lock); - wait_split_huge_page(vma->anon_vma, old_pmd); - ret = -1; - } else { - pmd = pmdp_get_and_clear(mm, old_addr, old_pmd); - VM_BUG_ON(!pmd_none(*new_pmd)); - set_pmd_at(mm, new_addr, new_pmd, pmd); - spin_unlock(&mm->page_table_lock); - ret = 1; - } - } else { + ret = __pmd_trans_huge_lock(old_pmd, vma); + if (ret == 1) { + pmd = pmdp_get_and_clear(mm, old_addr, old_pmd); + VM_BUG_ON(!pmd_none(*new_pmd)); + set_pmd_at(mm, new_addr, new_pmd, pmd); spin_unlock(&mm->page_table_lock); } out: @@ -1134,24 +1112,41 @@ int change_huge_pmd(struct vm_area_struct *vma, pmd_t *pmd, struct mm_struct *mm = vma->vm_mm; int ret = 0; - spin_lock(&mm->page_table_lock); + if (__pmd_trans_huge_lock(pmd, vma) == 1) { + pmd_t entry; + entry = pmdp_get_and_clear(mm, addr, pmd); + entry = pmd_modify(entry, newprot); + set_pmd_at(mm, addr, pmd, entry); + spin_unlock(&vma->vm_mm->page_table_lock); + ret = 1; + } + + return ret; +} + +/* + * Returns 1 if a given pmd maps a stable (not under splitting) thp. + * Returns -1 if it maps a thp under splitting. Returns 0 otherwise. + * + * Note that if it returns 1, this routine returns without unlocking page + * table locks. So callers must unlock them. 
+ */ +int __pmd_trans_huge_lock(pmd_t *pmd, struct vm_area_struct *vma) +{ + spin_lock(&vma->vm_mm->page_table_lock); if (likely(pmd_trans_huge(*pmd))) { if (unlikely(pmd_trans_splitting(*pmd))) { - spin_unlock(&mm->page_table_lock); + spin_unlock(&vma->vm_mm->page_table_lock); wait_split_huge_page(vma->anon_vma, pmd); + return -1; } else { - pmd_t entry; - - entry = pmdp_get_and_clear(mm, addr, pmd); - entry = pmd_modify(entry, newprot); - set_pmd_at(mm, addr, pmd, entry); - spin_unlock(&vma->vm_mm->page_table_lock); - ret = 1; + /* Thp mapped by 'pmd' is stable, so we can + * handle it as it is. */ + return 1; } - } else - spin_unlock(&vma->vm_mm->page_table_lock); - - return ret; + } + spin_unlock(&vma->vm_mm->page_table_lock); + return 0; } pmd_t *page_check_address_pmd(struct page *page, @@ -1375,7 +1370,6 @@ static int __split_huge_page_map(struct page *page, pte_unmap(pte); } - mm->nr_ptes++; smp_wmb(); /* make pte visible before pmd */ /* * Up to this point the pmd is present and huge and @@ -1988,7 +1982,6 @@ static void collapse_huge_page(struct mm_struct *mm, set_pmd_at(mm, address, pmd, _pmd); update_mmu_cache(vma, address, _pmd); prepare_pmd_huge_pte(pgtable, mm); - mm->nr_ptes--; spin_unlock(&mm->page_table_lock); #ifndef CONFIG_NUMA diff --git a/mm/hugetlb.c b/mm/hugetlb.c index 5f34bd8dda34..b8ce6f450956 100644 --- a/mm/hugetlb.c +++ b/mm/hugetlb.c @@ -53,6 +53,84 @@ static unsigned long __initdata default_hstate_size; */ static DEFINE_SPINLOCK(hugetlb_lock); +static inline void unlock_or_release_subpool(struct hugepage_subpool *spool) +{ + bool free = (spool->count == 0) && (spool->used_hpages == 0); + + spin_unlock(&spool->lock); + + /* If no pages are used, and no other handles to the subpool + * remain, free the subpool the subpool remain */ + if (free) + kfree(spool); +} + +struct hugepage_subpool *hugepage_new_subpool(long nr_blocks) +{ + struct hugepage_subpool *spool; + + spool = kmalloc(sizeof(*spool), GFP_KERNEL); + if (!spool) + return 
NULL; + + spin_lock_init(&spool->lock); + spool->count = 1; + spool->max_hpages = nr_blocks; + spool->used_hpages = 0; + + return spool; +} + +void hugepage_put_subpool(struct hugepage_subpool *spool) +{ + spin_lock(&spool->lock); + BUG_ON(!spool->count); + spool->count--; + unlock_or_release_subpool(spool); +} + +static int hugepage_subpool_get_pages(struct hugepage_subpool *spool, + long delta) +{ + int ret = 0; + + if (!spool) + return 0; + + spin_lock(&spool->lock); + if ((spool->used_hpages + delta) <= spool->max_hpages) { + spool->used_hpages += delta; + } else { + ret = -ENOMEM; + } + spin_unlock(&spool->lock); + + return ret; +} + +static void hugepage_subpool_put_pages(struct hugepage_subpool *spool, + long delta) +{ + if (!spool) + return; + + spin_lock(&spool->lock); + spool->used_hpages -= delta; + /* If hugetlbfs_put_super couldn't free spool due to + * an outstanding quota reference, free it now. */ + unlock_or_release_subpool(spool); +} + +static inline struct hugepage_subpool *subpool_inode(struct inode *inode) +{ + return HUGETLBFS_SB(inode->i_sb)->spool; +} + +static inline struct hugepage_subpool *subpool_vma(struct vm_area_struct *vma) +{ + return subpool_inode(vma->vm_file->f_dentry->d_inode); +} + /* * Region tracking -- allows tracking of reservations and instantiated pages * across the pages in a mapping. 
@@ -454,14 +532,16 @@ static struct page *dequeue_huge_page_vma(struct hstate *h, struct vm_area_struct *vma, unsigned long address, int avoid_reserve) { - struct page *page = NULL; + struct page *page; struct mempolicy *mpol; nodemask_t *nodemask; struct zonelist *zonelist; struct zone *zone; struct zoneref *z; + unsigned int cpuset_mems_cookie; - get_mems_allowed(); +retry_cpuset: + cpuset_mems_cookie = get_mems_allowed(); zonelist = huge_zonelist(vma, address, htlb_alloc_mask, &mpol, &nodemask); /* @@ -488,10 +568,15 @@ static struct page *dequeue_huge_page_vma(struct hstate *h, } } } -err: + mpol_cond_put(mpol); - put_mems_allowed(); + if (unlikely(!put_mems_allowed(cpuset_mems_cookie) && !page)) + goto retry_cpuset; return page; + +err: + mpol_cond_put(mpol); + return NULL; } static void update_and_free_page(struct hstate *h, struct page *page) @@ -533,9 +618,9 @@ static void free_huge_page(struct page *page) */ struct hstate *h = page_hstate(page); int nid = page_to_nid(page); - struct address_space *mapping; + struct hugepage_subpool *spool = + (struct hugepage_subpool *)page_private(page); - mapping = (struct address_space *) page_private(page); set_page_private(page, 0); page->mapping = NULL; BUG_ON(page_count(page)); @@ -551,8 +636,7 @@ static void free_huge_page(struct page *page) enqueue_huge_page(h, page); } spin_unlock(&hugetlb_lock); - if (mapping) - hugetlb_put_quota(mapping, 1); + hugepage_subpool_put_pages(spool, 1); } static void prep_new_huge_page(struct hstate *h, struct page *page, int nid) @@ -852,6 +936,7 @@ static int gather_surplus_pages(struct hstate *h, int delta) struct page *page, *tmp; int ret, i; int needed, allocated; + bool alloc_ok = true; needed = (h->resv_huge_pages + delta) - h->free_huge_pages; if (needed <= 0) { @@ -867,17 +952,13 @@ retry: spin_unlock(&hugetlb_lock); for (i = 0; i < needed; i++) { page = alloc_buddy_huge_page(h, NUMA_NO_NODE); - if (!page) - /* - * We were not able to allocate enough pages to - * satisfy the 
entire reservation so we free what - * we've allocated so far. - */ - goto free; - + if (!page) { + alloc_ok = false; + break; + } list_add(&page->lru, &surplus_list); } - allocated += needed; + allocated += i; /* * After retaking hugetlb_lock, we need to recalculate 'needed' @@ -886,9 +967,16 @@ retry: spin_lock(&hugetlb_lock); needed = (h->resv_huge_pages + delta) - (h->free_huge_pages + allocated); - if (needed > 0) - goto retry; - + if (needed > 0) { + if (alloc_ok) + goto retry; + /* + * We were not able to allocate enough pages to + * satisfy the entire reservation so we free what + * we've allocated so far. + */ + goto free; + } /* * The surplus_list now contains _at_least_ the number of extra pages * needed to accommodate the reservation. Add the appropriate number @@ -914,10 +1002,10 @@ retry: VM_BUG_ON(page_count(page)); enqueue_huge_page(h, page); } +free: spin_unlock(&hugetlb_lock); /* Free unnecessary surplus pages to the buddy allocator */ -free: if (!list_empty(&surplus_list)) { list_for_each_entry_safe(page, tmp, &surplus_list, lru) { list_del(&page->lru); @@ -966,11 +1054,12 @@ static void return_unused_surplus_pages(struct hstate *h, /* * Determine if the huge page at addr within the vma has an associated * reservation. Where it does not we will need to logically increase - * reservation and actually increase quota before an allocation can occur. - * Where any new reservation would be required the reservation change is - * prepared, but not committed. Once the page has been quota'd allocated - * an instantiated the change should be committed via vma_commit_reservation. - * No action is required on failure. + * reservation and actually increase subpool usage before an allocation + * can occur. Where any new reservation would be required the + * reservation change is prepared, but not committed. Once the page + * has been allocated from the subpool and instantiated the change should + * be committed via vma_commit_reservation. 
No action is required on + * failure. */ static long vma_needs_reservation(struct hstate *h, struct vm_area_struct *vma, unsigned long addr) @@ -1019,24 +1108,24 @@ static void vma_commit_reservation(struct hstate *h, static struct page *alloc_huge_page(struct vm_area_struct *vma, unsigned long addr, int avoid_reserve) { + struct hugepage_subpool *spool = subpool_vma(vma); struct hstate *h = hstate_vma(vma); struct page *page; - struct address_space *mapping = vma->vm_file->f_mapping; - struct inode *inode = mapping->host; long chg; /* - * Processes that did not create the mapping will have no reserves and - * will not have accounted against quota. Check that the quota can be - * made before satisfying the allocation - * MAP_NORESERVE mappings may also need pages and quota allocated - * if no reserve mapping overlaps. + * Processes that did not create the mapping will have no + * reserves and will not have accounted against subpool + * limit. Check that the subpool limit can be made before + * satisfying the allocation MAP_NORESERVE mappings may also + * need pages and subpool limit allocated allocated if no reserve + * mapping overlaps. 
*/ chg = vma_needs_reservation(h, vma, addr); if (chg < 0) return ERR_PTR(-VM_FAULT_OOM); if (chg) - if (hugetlb_get_quota(inode->i_mapping, chg)) + if (hugepage_subpool_get_pages(spool, chg)) return ERR_PTR(-VM_FAULT_SIGBUS); spin_lock(&hugetlb_lock); @@ -1046,12 +1135,12 @@ static struct page *alloc_huge_page(struct vm_area_struct *vma, if (!page) { page = alloc_buddy_huge_page(h, NUMA_NO_NODE); if (!page) { - hugetlb_put_quota(inode->i_mapping, chg); + hugepage_subpool_put_pages(spool, chg); return ERR_PTR(-VM_FAULT_SIGBUS); } } - set_page_private(page, (unsigned long) mapping); + set_page_private(page, (unsigned long)spool); vma_commit_reservation(h, vma, addr); @@ -2072,6 +2161,7 @@ static void hugetlb_vm_op_close(struct vm_area_struct *vma) { struct hstate *h = hstate_vma(vma); struct resv_map *reservations = vma_resv_map(vma); + struct hugepage_subpool *spool = subpool_vma(vma); unsigned long reserve; unsigned long start; unsigned long end; @@ -2087,7 +2177,7 @@ static void hugetlb_vm_op_close(struct vm_area_struct *vma) if (reserve) { hugetlb_acct_memory(h, -reserve); - hugetlb_put_quota(vma->vm_file->f_mapping, reserve); + hugepage_subpool_put_pages(spool, reserve); } } } @@ -2241,16 +2331,23 @@ void __unmap_hugepage_range(struct vm_area_struct *vma, unsigned long start, if (huge_pmd_unshare(mm, &address, ptep)) continue; + pte = huge_ptep_get(ptep); + if (huge_pte_none(pte)) + continue; + + /* + * HWPoisoned hugepage is already unmapped and dropped reference + */ + if (unlikely(is_hugetlb_entry_hwpoisoned(pte))) + continue; + + page = pte_page(pte); /* * If a reference page is supplied, it is because a specific * page is being unmapped, not a range. Ensure the page we * are about to unmap is the actual page of interest. 
*/ if (ref_page) { - pte = huge_ptep_get(ptep); - if (huge_pte_none(pte)) - continue; - page = pte_page(pte); if (page != ref_page) continue; @@ -2263,22 +2360,16 @@ void __unmap_hugepage_range(struct vm_area_struct *vma, unsigned long start, } pte = huge_ptep_get_and_clear(mm, address, ptep); - if (huge_pte_none(pte)) - continue; - - /* - * HWPoisoned hugepage is already unmapped and dropped reference - */ - if (unlikely(is_hugetlb_entry_hwpoisoned(pte))) - continue; - - page = pte_page(pte); if (pte_dirty(pte)) set_page_dirty(page); list_add(&page->lru, &page_list); + + /* Bail out after unmapping reference page if supplied */ + if (ref_page) + break; } - spin_unlock(&mm->page_table_lock); flush_tlb_range(vma, start, end); + spin_unlock(&mm->page_table_lock); mmu_notifier_invalidate_range_end(mm, start, end); list_for_each_entry_safe(page, tmp, &page_list, lru) { page_remove_rmap(page); @@ -2316,7 +2407,7 @@ static int unmap_ref_private(struct mm_struct *mm, struct vm_area_struct *vma, */ address = address & huge_page_mask(h); pgoff = vma_hugecache_offset(h, vma, address); - mapping = (struct address_space *)page_private(page); + mapping = vma->vm_file->f_dentry->d_inode->i_mapping; /* * Take the mapping lock for the duration of the table walk. As @@ -2869,11 +2960,12 @@ int hugetlb_reserve_pages(struct inode *inode, { long ret, chg; struct hstate *h = hstate_inode(inode); + struct hugepage_subpool *spool = subpool_inode(inode); /* * Only apply hugepage reservation if asked. 
At fault time, an * attempt will be made for VM_NORESERVE to allocate a page - * and filesystem quota without using reserves + * without using reserves */ if (vm_flags & VM_NORESERVE) return 0; @@ -2900,17 +2992,17 @@ int hugetlb_reserve_pages(struct inode *inode, if (chg < 0) return chg; - /* There must be enough filesystem quota for the mapping */ - if (hugetlb_get_quota(inode->i_mapping, chg)) + /* There must be enough pages in the subpool for the mapping */ + if (hugepage_subpool_get_pages(spool, chg)) return -ENOSPC; /* * Check enough hugepages are available for the reservation. - * Hand back the quota if there are not + * Hand the pages back to the subpool if there are not */ ret = hugetlb_acct_memory(h, chg); if (ret < 0) { - hugetlb_put_quota(inode->i_mapping, chg); + hugepage_subpool_put_pages(spool, chg); return ret; } @@ -2934,12 +3026,13 @@ void hugetlb_unreserve_pages(struct inode *inode, long offset, long freed) { struct hstate *h = hstate_inode(inode); long chg = region_truncate(&inode->i_mapping->private_list, offset); + struct hugepage_subpool *spool = subpool_inode(inode); spin_lock(&inode->i_lock); inode->i_blocks -= (blocks_per_huge_page(h) * freed); spin_unlock(&inode->i_lock); - hugetlb_put_quota(inode->i_mapping, (chg - freed)); + hugepage_subpool_put_pages(spool, (chg - freed)); hugetlb_acct_memory(h, -(chg - freed)); } diff --git a/mm/hwpoison-inject.c b/mm/hwpoison-inject.c index c7fc7fd00e32..cc448bb983ba 100644 --- a/mm/hwpoison-inject.c +++ b/mm/hwpoison-inject.c @@ -45,7 +45,7 @@ static int hwpoison_inject(void *data, u64 val) * do a racy check with elevated page count, to make sure PG_hwpoison * will only be set for the targeted owner (or on a free page). * We temporarily take page lock for try_get_mem_cgroup_from_page(). - * __memory_failure() will redo the check reliably inside page lock. + * memory_failure() will redo the check reliably inside page lock. 
*/ lock_page(hpage); err = hwpoison_filter(hpage); @@ -55,7 +55,7 @@ static int hwpoison_inject(void *data, u64 val) inject: printk(KERN_INFO "Injecting memory failure at pfn %lx\n", pfn); - return __memory_failure(pfn, 18, MF_COUNT_INCREASED); + return memory_failure(pfn, 18, MF_COUNT_INCREASED); } static int hwpoison_unpoison(void *data, u64 val) @@ -28,7 +28,6 @@ #include <linux/kthread.h> #include <linux/wait.h> #include <linux/slab.h> -#include <linux/memcontrol.h> #include <linux/rbtree.h> #include <linux/memory.h> #include <linux/mmu_notifier.h> @@ -375,6 +374,20 @@ static int break_ksm(struct vm_area_struct *vma, unsigned long addr) return (ret & VM_FAULT_OOM) ? -ENOMEM : 0; } +static struct vm_area_struct *find_mergeable_vma(struct mm_struct *mm, + unsigned long addr) +{ + struct vm_area_struct *vma; + if (ksm_test_exit(mm)) + return NULL; + vma = find_vma(mm, addr); + if (!vma || vma->vm_start > addr) + return NULL; + if (!(vma->vm_flags & VM_MERGEABLE) || !vma->anon_vma) + return NULL; + return vma; +} + static void break_cow(struct rmap_item *rmap_item) { struct mm_struct *mm = rmap_item->mm; @@ -388,15 +401,9 @@ static void break_cow(struct rmap_item *rmap_item) put_anon_vma(rmap_item->anon_vma); down_read(&mm->mmap_sem); - if (ksm_test_exit(mm)) - goto out; - vma = find_vma(mm, addr); - if (!vma || vma->vm_start > addr) - goto out; - if (!(vma->vm_flags & VM_MERGEABLE) || !vma->anon_vma) - goto out; - break_ksm(vma, addr); -out: + vma = find_mergeable_vma(mm, addr); + if (vma) + break_ksm(vma, addr); up_read(&mm->mmap_sem); } @@ -422,12 +429,8 @@ static struct page *get_mergeable_page(struct rmap_item *rmap_item) struct page *page; down_read(&mm->mmap_sem); - if (ksm_test_exit(mm)) - goto out; - vma = find_vma(mm, addr); - if (!vma || vma->vm_start > addr) - goto out; - if (!(vma->vm_flags & VM_MERGEABLE) || !vma->anon_vma) + vma = find_mergeable_vma(mm, addr); + if (!vma) goto out; page = follow_page(vma, addr, FOLL_GET); @@ -673,9 +676,9 @@ error: 
static u32 calc_checksum(struct page *page) { u32 checksum; - void *addr = kmap_atomic(page, KM_USER0); + void *addr = kmap_atomic(page); checksum = jhash2(addr, PAGE_SIZE / 4, 17); - kunmap_atomic(addr, KM_USER0); + kunmap_atomic(addr); return checksum; } @@ -684,11 +687,11 @@ static int memcmp_pages(struct page *page1, struct page *page2) char *addr1, *addr2; int ret; - addr1 = kmap_atomic(page1, KM_USER0); - addr2 = kmap_atomic(page2, KM_USER1); + addr1 = kmap_atomic(page1); + addr2 = kmap_atomic(page2); ret = memcmp(addr1, addr2, PAGE_SIZE); - kunmap_atomic(addr2, KM_USER1); - kunmap_atomic(addr1, KM_USER0); + kunmap_atomic(addr2); + kunmap_atomic(addr1); return ret; } @@ -1572,16 +1575,6 @@ struct page *ksm_does_need_to_copy(struct page *page, new_page = alloc_page_vma(GFP_HIGHUSER_MOVABLE, vma, address); if (new_page) { - /* - * The memcg-specific accounting when moving - * pages around the LRU lists relies on the - * page's owner (memcg) to be valid. Usually, - * pages are assigned to a new owner before - * being put on the LRU list, but since this - * is not the case here, the stale owner from - * a previous allocation cycle must be reset. 
- */ - mem_cgroup_reset_owner(new_page); copy_user_highpage(new_page, page, address, vma); SetPageDirty(new_page); diff --git a/mm/madvise.c b/mm/madvise.c index 74bf193eff04..1ccbba5b6674 100644 --- a/mm/madvise.c +++ b/mm/madvise.c @@ -65,6 +65,12 @@ static long madvise_behavior(struct vm_area_struct * vma, } new_flags &= ~VM_DONTCOPY; break; + case MADV_DONTDUMP: + new_flags |= VM_NODUMP; + break; + case MADV_DODUMP: + new_flags &= ~VM_NODUMP; + break; case MADV_MERGEABLE: case MADV_UNMERGEABLE: error = ksm_madvise(vma, start, end, behavior, &new_flags); @@ -251,7 +257,7 @@ static int madvise_hwpoison(int bhv, unsigned long start, unsigned long end) printk(KERN_INFO "Injecting memory failure for page %lx at %lx\n", page_to_pfn(p), start); /* Ignore return value for now */ - __memory_failure(page_to_pfn(p), 0, MF_COUNT_INCREASED); + memory_failure(page_to_pfn(p), 0, MF_COUNT_INCREASED); } return ret; } @@ -293,6 +299,8 @@ madvise_behavior_valid(int behavior) case MADV_HUGEPAGE: case MADV_NOHUGEPAGE: #endif + case MADV_DONTDUMP: + case MADV_DODUMP: return 1; default: diff --git a/mm/memcontrol.c b/mm/memcontrol.c index 228d6461c12a..7d698df4a067 100644 --- a/mm/memcontrol.c +++ b/mm/memcontrol.c @@ -89,7 +89,6 @@ enum mem_cgroup_stat_index { MEM_CGROUP_STAT_FILE_MAPPED, /* # of pages charged as file rss */ MEM_CGROUP_STAT_SWAPOUT, /* # of pages, swapped out */ MEM_CGROUP_STAT_DATA, /* end of data requires synchronization */ - MEM_CGROUP_ON_MOVE, /* someone is moving account between groups */ MEM_CGROUP_STAT_NSTATS, }; @@ -135,7 +134,7 @@ struct mem_cgroup_reclaim_iter { */ struct mem_cgroup_per_zone { struct lruvec lruvec; - unsigned long count[NR_LRU_LISTS]; + unsigned long lru_size[NR_LRU_LISTS]; struct mem_cgroup_reclaim_iter reclaim_iter[DEF_PRIORITY + 1]; @@ -144,11 +143,9 @@ struct mem_cgroup_per_zone { unsigned long long usage_in_excess;/* Set to the value by which */ /* the soft limit is exceeded*/ bool on_tree; - struct mem_cgroup *mem; /* Back pointer, 
we cannot */ + struct mem_cgroup *memcg; /* Back pointer, we cannot */ /* use container_of */ }; -/* Macro for accessing counter */ -#define MEM_CGROUP_ZSTAT(mz, idx) ((mz)->count[(idx)]) struct mem_cgroup_per_node { struct mem_cgroup_per_zone zoneinfo[MAX_NR_ZONES]; @@ -230,10 +227,30 @@ struct mem_cgroup { * the counter to account for memory usage */ struct res_counter res; - /* - * the counter to account for mem+swap usage. - */ - struct res_counter memsw; + + union { + /* + * the counter to account for mem+swap usage. + */ + struct res_counter memsw; + + /* + * rcu_freeing is used only when freeing struct mem_cgroup, + * so put it into a union to avoid wasting more memory. + * It must be disjoint from the css field. It could be + * in a union with the res field, but res plays a much + * larger part in mem_cgroup life than memsw, and might + * be of interest, even at time of free, when debugging. + * So share rcu_head with the less interesting memsw. + */ + struct rcu_head rcu_freeing; + /* + * But when using vfree(), that cannot be done at + * interrupt time, so we must then queue the work. + */ + struct work_struct work_freeing; + }; + /* * Per cgroup active and inactive list, similar to the * per zone LRU lists. @@ -280,6 +297,12 @@ struct mem_cgroup { */ unsigned long move_charge_at_immigrate; /* + * set > 0 if pages under this cgroup are moving to other cgroup. + */ + atomic_t moving_account; + /* taken only while moving_account > 0 */ + spinlock_t move_lock; + /* * percpu counter. */ struct mem_cgroup_stat_cpu *stat; @@ -592,9 +615,9 @@ retry: * we will to add it back at the end of reclaim to its correct * position in the tree. 
*/ - __mem_cgroup_remove_exceeded(mz->mem, mz, mctz); - if (!res_counter_soft_limit_excess(&mz->mem->res) || - !css_tryget(&mz->mem->css)) + __mem_cgroup_remove_exceeded(mz->memcg, mz, mctz); + if (!res_counter_soft_limit_excess(&mz->memcg->res) || + !css_tryget(&mz->memcg->css)) goto retry; done: return mz; @@ -672,15 +695,19 @@ static unsigned long mem_cgroup_read_events(struct mem_cgroup *memcg, } static void mem_cgroup_charge_statistics(struct mem_cgroup *memcg, - bool file, int nr_pages) + bool anon, int nr_pages) { preempt_disable(); - if (file) - __this_cpu_add(memcg->stat->count[MEM_CGROUP_STAT_CACHE], + /* + * Here, RSS means 'mapped anon' and anon's SwapCache. Shmem/tmpfs is + * counted as CACHE even if it's on ANON LRU. + */ + if (anon) + __this_cpu_add(memcg->stat->count[MEM_CGROUP_STAT_RSS], nr_pages); else - __this_cpu_add(memcg->stat->count[MEM_CGROUP_STAT_RSS], + __this_cpu_add(memcg->stat->count[MEM_CGROUP_STAT_CACHE], nr_pages); /* pagein of a big page is an event. So, ignore page size */ @@ -701,14 +728,14 @@ mem_cgroup_zone_nr_lru_pages(struct mem_cgroup *memcg, int nid, int zid, unsigned int lru_mask) { struct mem_cgroup_per_zone *mz; - enum lru_list l; + enum lru_list lru; unsigned long ret = 0; mz = mem_cgroup_zoneinfo(memcg, nid, zid); - for_each_lru(l) { - if (BIT(l) & lru_mask) - ret += MEM_CGROUP_ZSTAT(mz, l); + for_each_lru(lru) { + if (BIT(lru) & lru_mask) + ret += mz->lru_size[lru]; } return ret; } @@ -1042,9 +1069,22 @@ struct lruvec *mem_cgroup_lru_add_list(struct zone *zone, struct page *page, pc = lookup_page_cgroup(page); memcg = pc->mem_cgroup; + + /* + * Surreptitiously switch any uncharged page to root: + * an uncharged page off lru does nothing to secure + * its former mem_cgroup from sudden removal. + * + * Our caller holds lru_lock, and PageCgroupUsed is updated + * under page_cgroup lock: between them, they make all uses + * of pc->mem_cgroup safe. 
+ */ + if (!PageCgroupUsed(pc) && memcg != root_mem_cgroup) + pc->mem_cgroup = memcg = root_mem_cgroup; + mz = page_cgroup_zoneinfo(memcg, page); /* compound_order() is stabilized through lru_lock */ - MEM_CGROUP_ZSTAT(mz, lru) += 1 << compound_order(page); + mz->lru_size[lru] += 1 << compound_order(page); return &mz->lruvec; } @@ -1072,8 +1112,8 @@ void mem_cgroup_lru_del_list(struct page *page, enum lru_list lru) VM_BUG_ON(!memcg); mz = page_cgroup_zoneinfo(memcg, page); /* huge page split is done under lru_lock. so, we have no races. */ - VM_BUG_ON(MEM_CGROUP_ZSTAT(mz, lru) < (1 << compound_order(page))); - MEM_CGROUP_ZSTAT(mz, lru) -= 1 << compound_order(page); + VM_BUG_ON(mz->lru_size[lru] < (1 << compound_order(page))); + mz->lru_size[lru] -= 1 << compound_order(page); } void mem_cgroup_lru_del(struct page *page) @@ -1252,40 +1292,48 @@ int mem_cgroup_swappiness(struct mem_cgroup *memcg) return memcg->swappiness; } -static void mem_cgroup_start_move(struct mem_cgroup *memcg) -{ - int cpu; +/* + * memcg->moving_account is used for checking possibility that some thread is + * calling move_account(). When a thread on CPU-A starts moving pages under + * a memcg, other threads should check memcg->moving_account under + * rcu_read_lock(), like this: + * + * CPU-A CPU-B + * rcu_read_lock() + * memcg->moving_account+1 if (memcg->mocing_account) + * take heavy locks. + * synchronize_rcu() update something. + * rcu_read_unlock() + * start move here. 
+ */ - get_online_cpus(); - spin_lock(&memcg->pcp_counter_lock); - for_each_online_cpu(cpu) - per_cpu(memcg->stat->count[MEM_CGROUP_ON_MOVE], cpu) += 1; - memcg->nocpu_base.count[MEM_CGROUP_ON_MOVE] += 1; - spin_unlock(&memcg->pcp_counter_lock); - put_online_cpus(); +/* for quick checking without looking up memcg */ +atomic_t memcg_moving __read_mostly; +static void mem_cgroup_start_move(struct mem_cgroup *memcg) +{ + atomic_inc(&memcg_moving); + atomic_inc(&memcg->moving_account); synchronize_rcu(); } static void mem_cgroup_end_move(struct mem_cgroup *memcg) { - int cpu; - - if (!memcg) - return; - get_online_cpus(); - spin_lock(&memcg->pcp_counter_lock); - for_each_online_cpu(cpu) - per_cpu(memcg->stat->count[MEM_CGROUP_ON_MOVE], cpu) -= 1; - memcg->nocpu_base.count[MEM_CGROUP_ON_MOVE] -= 1; - spin_unlock(&memcg->pcp_counter_lock); - put_online_cpus(); + /* + * Now, mem_cgroup_clear_mc() may call this function with NULL. + * We check NULL in callee rather than caller. + */ + if (memcg) { + atomic_dec(&memcg_moving); + atomic_dec(&memcg->moving_account); + } } + /* * 2 routines for checking "mem" is under move_account() or not. * - * mem_cgroup_stealed() - checking a cgroup is mc.from or not. This is used - * for avoiding race in accounting. If true, + * mem_cgroup_stolen() - checking whether a cgroup is mc.from or not. This + * is used for avoiding races in accounting. If true, * pc->mem_cgroup may be overwritten. * * mem_cgroup_under_move() - checking a cgroup is mc.from or mc.to or @@ -1293,10 +1341,10 @@ static void mem_cgroup_end_move(struct mem_cgroup *memcg) * waiting at hith-memory prressure caused by "move". 
*/ -static bool mem_cgroup_stealed(struct mem_cgroup *memcg) +static bool mem_cgroup_stolen(struct mem_cgroup *memcg) { VM_BUG_ON(!rcu_read_lock_held()); - return this_cpu_read(memcg->stat->count[MEM_CGROUP_ON_MOVE]) > 0; + return atomic_read(&memcg->moving_account) > 0; } static bool mem_cgroup_under_move(struct mem_cgroup *memcg) @@ -1337,6 +1385,24 @@ static bool mem_cgroup_wait_acct_move(struct mem_cgroup *memcg) return false; } +/* + * Take this lock when + * - a code tries to modify page's memcg while it's USED. + * - a code tries to modify page state accounting in a memcg. + * see mem_cgroup_stolen(), too. + */ +static void move_lock_mem_cgroup(struct mem_cgroup *memcg, + unsigned long *flags) +{ + spin_lock_irqsave(&memcg->move_lock, *flags); +} + +static void move_unlock_mem_cgroup(struct mem_cgroup *memcg, + unsigned long *flags) +{ + spin_unlock_irqrestore(&memcg->move_lock, *flags); +} + /** * mem_cgroup_print_oom_info: Called from OOM with tasklist_lock held in read mode. * @memcg: The memory cgroup that went over limit @@ -1360,7 +1426,6 @@ void mem_cgroup_print_oom_info(struct mem_cgroup *memcg, struct task_struct *p) if (!memcg || !p) return; - rcu_read_lock(); mem_cgrp = memcg->css.cgroup; @@ -1739,22 +1804,22 @@ static DEFINE_SPINLOCK(memcg_oom_lock); static DECLARE_WAIT_QUEUE_HEAD(memcg_oom_waitq); struct oom_wait_info { - struct mem_cgroup *mem; + struct mem_cgroup *memcg; wait_queue_t wait; }; static int memcg_oom_wake_function(wait_queue_t *wait, unsigned mode, int sync, void *arg) { - struct mem_cgroup *wake_memcg = (struct mem_cgroup *)arg, - *oom_wait_memcg; + struct mem_cgroup *wake_memcg = (struct mem_cgroup *)arg; + struct mem_cgroup *oom_wait_memcg; struct oom_wait_info *oom_wait_info; oom_wait_info = container_of(wait, struct oom_wait_info, wait); - oom_wait_memcg = oom_wait_info->mem; + oom_wait_memcg = oom_wait_info->memcg; /* - * Both of oom_wait_info->mem and wake_mem are stable under us. 
+ * Both of oom_wait_info->memcg and wake_memcg are stable under us. * Then we can use css_is_ancestor without taking care of RCU. */ if (!mem_cgroup_same_or_subtree(oom_wait_memcg, wake_memcg) @@ -1778,12 +1843,12 @@ static void memcg_oom_recover(struct mem_cgroup *memcg) /* * try to call OOM killer. returns false if we should exit memory-reclaim loop. */ -bool mem_cgroup_handle_oom(struct mem_cgroup *memcg, gfp_t mask) +bool mem_cgroup_handle_oom(struct mem_cgroup *memcg, gfp_t mask, int order) { struct oom_wait_info owait; bool locked, need_to_kill; - owait.mem = memcg; + owait.memcg = memcg; owait.wait.flags = 0; owait.wait.func = memcg_oom_wake_function; owait.wait.private = current; @@ -1808,7 +1873,7 @@ bool mem_cgroup_handle_oom(struct mem_cgroup *memcg, gfp_t mask) if (need_to_kill) { finish_wait(&memcg_oom_waitq, &owait.wait); - mem_cgroup_out_of_memory(memcg, mask); + mem_cgroup_out_of_memory(memcg, mask, order); } else { schedule(); finish_wait(&memcg_oom_waitq, &owait.wait); @@ -1848,41 +1913,66 @@ bool mem_cgroup_handle_oom(struct mem_cgroup *memcg, gfp_t mask) * by flags. * * Considering "move", this is an only case we see a race. To make the race - * small, we check MEM_CGROUP_ON_MOVE percpu value and detect there are - * possibility of race condition. If there is, we take a lock. + * small, we check mm->moving_account and detect there are possibility of race + * If there is, we take a lock. */ +void __mem_cgroup_begin_update_page_stat(struct page *page, + bool *locked, unsigned long *flags) +{ + struct mem_cgroup *memcg; + struct page_cgroup *pc; + + pc = lookup_page_cgroup(page); +again: + memcg = pc->mem_cgroup; + if (unlikely(!memcg || !PageCgroupUsed(pc))) + return; + /* + * If this memory cgroup is not under account moving, we don't + * need to take move_lock_page_cgroup(). Because we already hold + * rcu_read_lock(), any calls to move_account will be delayed until + * rcu_read_unlock() if mem_cgroup_stolen() == true. 
+ */ + if (!mem_cgroup_stolen(memcg)) + return; + + move_lock_mem_cgroup(memcg, flags); + if (memcg != pc->mem_cgroup || !PageCgroupUsed(pc)) { + move_unlock_mem_cgroup(memcg, flags); + goto again; + } + *locked = true; +} + +void __mem_cgroup_end_update_page_stat(struct page *page, unsigned long *flags) +{ + struct page_cgroup *pc = lookup_page_cgroup(page); + + /* + * It's guaranteed that pc->mem_cgroup never changes while + * lock is held because a routine modifies pc->mem_cgroup + * should take move_lock_page_cgroup(). + */ + move_unlock_mem_cgroup(pc->mem_cgroup, flags); +} + void mem_cgroup_update_page_stat(struct page *page, enum mem_cgroup_page_stat_item idx, int val) { struct mem_cgroup *memcg; struct page_cgroup *pc = lookup_page_cgroup(page); - bool need_unlock = false; unsigned long uninitialized_var(flags); if (mem_cgroup_disabled()) return; - rcu_read_lock(); memcg = pc->mem_cgroup; if (unlikely(!memcg || !PageCgroupUsed(pc))) - goto out; - /* pc->mem_cgroup is unstable ? */ - if (unlikely(mem_cgroup_stealed(memcg)) || PageTransHuge(page)) { - /* take a lock against to access pc->mem_cgroup */ - move_lock_page_cgroup(pc, &flags); - need_unlock = true; - memcg = pc->mem_cgroup; - if (!memcg || !PageCgroupUsed(pc)) - goto out; - } + return; switch (idx) { case MEMCG_NR_FILE_MAPPED: - if (val > 0) - SetPageCgroupFileMapped(pc); - else if (!page_mapped(page)) - ClearPageCgroupFileMapped(pc); idx = MEM_CGROUP_STAT_FILE_MAPPED; break; default: @@ -1890,14 +1980,7 @@ void mem_cgroup_update_page_stat(struct page *page, } this_cpu_add(memcg->stat->count[idx], val); - -out: - if (unlikely(need_unlock)) - move_unlock_page_cgroup(pc, &flags); - rcu_read_unlock(); - return; } -EXPORT_SYMBOL(mem_cgroup_update_page_stat); /* * size of first charge trial. "32" comes from vmscan.c's magic value. 
@@ -2068,17 +2151,6 @@ static void mem_cgroup_drain_pcp_counter(struct mem_cgroup *memcg, int cpu) per_cpu(memcg->stat->events[i], cpu) = 0; memcg->nocpu_base.events[i] += x; } - /* need to clear ON_MOVE value, works as a kind of lock. */ - per_cpu(memcg->stat->count[MEM_CGROUP_ON_MOVE], cpu) = 0; - spin_unlock(&memcg->pcp_counter_lock); -} - -static void synchronize_mem_cgroup_on_move(struct mem_cgroup *memcg, int cpu) -{ - int idx = MEM_CGROUP_ON_MOVE; - - spin_lock(&memcg->pcp_counter_lock); - per_cpu(memcg->stat->count[idx], cpu) = memcg->nocpu_base.count[idx]; spin_unlock(&memcg->pcp_counter_lock); } @@ -2090,11 +2162,8 @@ static int __cpuinit memcg_cpu_hotplug_callback(struct notifier_block *nb, struct memcg_stock_pcp *stock; struct mem_cgroup *iter; - if ((action == CPU_ONLINE)) { - for_each_mem_cgroup(iter) - synchronize_mem_cgroup_on_move(iter, cpu); + if (action == CPU_ONLINE) return NOTIFY_OK; - } if ((action != CPU_DEAD) || action != CPU_DEAD_FROZEN) return NOTIFY_OK; @@ -2179,7 +2248,7 @@ static int mem_cgroup_do_charge(struct mem_cgroup *memcg, gfp_t gfp_mask, if (!oom_check) return CHARGE_NOMEM; /* check OOM */ - if (!mem_cgroup_handle_oom(mem_over_limit, gfp_mask)) + if (!mem_cgroup_handle_oom(mem_over_limit, gfp_mask, get_order(csize))) return CHARGE_OOM_DIE; return CHARGE_RETRY; @@ -2408,8 +2477,13 @@ static void __mem_cgroup_commit_charge(struct mem_cgroup *memcg, struct page *page, unsigned int nr_pages, struct page_cgroup *pc, - enum charge_type ctype) + enum charge_type ctype, + bool lrucare) { + struct zone *uninitialized_var(zone); + bool was_on_lru = false; + bool anon; + lock_page_cgroup(pc); if (unlikely(PageCgroupUsed(pc))) { unlock_page_cgroup(pc); @@ -2420,6 +2494,21 @@ static void __mem_cgroup_commit_charge(struct mem_cgroup *memcg, * we don't need page_cgroup_lock about tail pages, becase they are not * accessed by any other context at this point. 
*/ + + /* + * In some cases, SwapCache and FUSE(splice_buf->radixtree), the page + * may already be on some other mem_cgroup's LRU. Take care of it. + */ + if (lrucare) { + zone = page_zone(page); + spin_lock_irq(&zone->lru_lock); + if (PageLRU(page)) { + ClearPageLRU(page); + del_page_from_lru_list(zone, page, page_lru(page)); + was_on_lru = true; + } + } + pc->mem_cgroup = memcg; /* * We access a page_cgroup asynchronously without lock_page_cgroup(). @@ -2429,23 +2518,25 @@ static void __mem_cgroup_commit_charge(struct mem_cgroup *memcg, * See mem_cgroup_add_lru_list(), etc. */ smp_wmb(); - switch (ctype) { - case MEM_CGROUP_CHARGE_TYPE_CACHE: - case MEM_CGROUP_CHARGE_TYPE_SHMEM: - SetPageCgroupCache(pc); - SetPageCgroupUsed(pc); - break; - case MEM_CGROUP_CHARGE_TYPE_MAPPED: - ClearPageCgroupCache(pc); - SetPageCgroupUsed(pc); - break; - default: - break; + SetPageCgroupUsed(pc); + + if (lrucare) { + if (was_on_lru) { + VM_BUG_ON(PageLRU(page)); + SetPageLRU(page); + add_page_to_lru_list(zone, page, page_lru(page)); + } + spin_unlock_irq(&zone->lru_lock); } - mem_cgroup_charge_statistics(memcg, PageCgroupCache(pc), nr_pages); + if (ctype == MEM_CGROUP_CHARGE_TYPE_MAPPED) + anon = true; + else + anon = false; + + mem_cgroup_charge_statistics(memcg, anon, nr_pages); unlock_page_cgroup(pc); - WARN_ON_ONCE(PageLRU(page)); + /* * "charge_statistics" updated event counter. Then, check it. * Insert ancestor (and ancestor's ancestors), to softlimit RB-tree. @@ -2456,8 +2547,7 @@ static void __mem_cgroup_commit_charge(struct mem_cgroup *memcg, #ifdef CONFIG_TRANSPARENT_HUGEPAGE -#define PCGF_NOCOPY_AT_SPLIT ((1 << PCG_LOCK) | (1 << PCG_MOVE_LOCK) |\ - (1 << PCG_MIGRATION)) +#define PCGF_NOCOPY_AT_SPLIT ((1 << PCG_LOCK) | (1 << PCG_MIGRATION)) /* * Because tail pages are not marked as "used", set it. We're under * zone->lru_lock, 'splitting on pmd' and compound_lock. 
@@ -2508,6 +2598,7 @@ static int mem_cgroup_move_account(struct page *page, { unsigned long flags; int ret; + bool anon = PageAnon(page); VM_BUG_ON(from == to); VM_BUG_ON(PageLRU(page)); @@ -2527,23 +2618,23 @@ static int mem_cgroup_move_account(struct page *page, if (!PageCgroupUsed(pc) || pc->mem_cgroup != from) goto unlock; - move_lock_page_cgroup(pc, &flags); + move_lock_mem_cgroup(from, &flags); - if (PageCgroupFileMapped(pc)) { + if (!anon && page_mapped(page)) { /* Update mapped_file data for mem_cgroup */ preempt_disable(); __this_cpu_dec(from->stat->count[MEM_CGROUP_STAT_FILE_MAPPED]); __this_cpu_inc(to->stat->count[MEM_CGROUP_STAT_FILE_MAPPED]); preempt_enable(); } - mem_cgroup_charge_statistics(from, PageCgroupCache(pc), -nr_pages); + mem_cgroup_charge_statistics(from, anon, -nr_pages); if (uncharge) /* This is not "cancel", but cancel_charge does all we need. */ __mem_cgroup_cancel_charge(from, nr_pages); /* caller should have done css_get */ pc->mem_cgroup = to; - mem_cgroup_charge_statistics(to, PageCgroupCache(pc), nr_pages); + mem_cgroup_charge_statistics(to, anon, nr_pages); /* * We charges against "to" which may not have any tasks. Then, "to" * can be under rmdir(). But in current implementation, caller of @@ -2551,7 +2642,7 @@ static int mem_cgroup_move_account(struct page *page, * guaranteed that "to" is never removed. So, we don't check rmdir * status here. 
*/ - move_unlock_page_cgroup(pc, &flags); + move_unlock_mem_cgroup(from, &flags); ret = 0; unlock: unlock_page_cgroup(pc); @@ -2643,7 +2734,7 @@ static int mem_cgroup_charge_common(struct page *page, struct mm_struct *mm, ret = __mem_cgroup_try_charge(mm, gfp_mask, nr_pages, &memcg, oom); if (ret == -ENOMEM) return ret; - __mem_cgroup_commit_charge(memcg, page, nr_pages, pc, ctype); + __mem_cgroup_commit_charge(memcg, page, nr_pages, pc, ctype, false); return 0; } @@ -2663,35 +2754,6 @@ static void __mem_cgroup_commit_charge_swapin(struct page *page, struct mem_cgroup *ptr, enum charge_type ctype); -static void -__mem_cgroup_commit_charge_lrucare(struct page *page, struct mem_cgroup *memcg, - enum charge_type ctype) -{ - struct page_cgroup *pc = lookup_page_cgroup(page); - struct zone *zone = page_zone(page); - unsigned long flags; - bool removed = false; - - /* - * In some case, SwapCache, FUSE(splice_buf->radixtree), the page - * is already on LRU. It means the page may on some other page_cgroup's - * LRU. Take care of it. - */ - spin_lock_irqsave(&zone->lru_lock, flags); - if (PageLRU(page)) { - del_page_from_lru_list(zone, page, page_lru(page)); - ClearPageLRU(page); - removed = true; - } - __mem_cgroup_commit_charge(memcg, page, 1, pc, ctype); - if (removed) { - add_page_to_lru_list(zone, page, page_lru(page)); - SetPageLRU(page); - } - spin_unlock_irqrestore(&zone->lru_lock, flags); - return; -} - int mem_cgroup_cache_charge(struct page *page, struct mm_struct *mm, gfp_t gfp_mask) { @@ -2769,13 +2831,16 @@ static void __mem_cgroup_commit_charge_swapin(struct page *page, struct mem_cgroup *memcg, enum charge_type ctype) { + struct page_cgroup *pc; + if (mem_cgroup_disabled()) return; if (!memcg) return; cgroup_exclude_rmdir(&memcg->css); - __mem_cgroup_commit_charge_lrucare(page, memcg, ctype); + pc = lookup_page_cgroup(page); + __mem_cgroup_commit_charge(memcg, page, 1, pc, ctype, true); /* * Now swap is on-memory. 
This means this page may be * counted both as mem and swap....double count. @@ -2879,7 +2944,6 @@ direct_uncharge: res_counter_uncharge(&memcg->memsw, nr_pages * PAGE_SIZE); if (unlikely(batch->memcg != memcg)) memcg_oom_recover(memcg); - return; } /* @@ -2891,6 +2955,7 @@ __mem_cgroup_uncharge_common(struct page *page, enum charge_type ctype) struct mem_cgroup *memcg = NULL; unsigned int nr_pages = 1; struct page_cgroup *pc; + bool anon; if (mem_cgroup_disabled()) return NULL; @@ -2916,8 +2981,17 @@ __mem_cgroup_uncharge_common(struct page *page, enum charge_type ctype) if (!PageCgroupUsed(pc)) goto unlock_out; + anon = PageAnon(page); + switch (ctype) { case MEM_CGROUP_CHARGE_TYPE_MAPPED: + /* + * Generally PageAnon tells if it's the anon statistics to be + * updated; but sometimes e.g. mem_cgroup_uncharge_page() is + * used before page reached the stage of being marked PageAnon. + */ + anon = true; + /* fallthrough */ case MEM_CGROUP_CHARGE_TYPE_DROP: /* See mem_cgroup_prepare_migration() */ if (page_mapped(page) || PageCgroupMigration(pc)) @@ -2934,7 +3008,7 @@ __mem_cgroup_uncharge_common(struct page *page, enum charge_type ctype) break; } - mem_cgroup_charge_statistics(memcg, PageCgroupCache(pc), -nr_pages); + mem_cgroup_charge_statistics(memcg, anon, -nr_pages); ClearPageCgroupUsed(pc); /* @@ -3027,23 +3101,6 @@ void mem_cgroup_uncharge_end(void) batch->memcg = NULL; } -/* - * A function for resetting pc->mem_cgroup for newly allocated pages. - * This function should be called if the newpage will be added to LRU - * before start accounting. - */ -void mem_cgroup_reset_owner(struct page *newpage) -{ - struct page_cgroup *pc; - - if (mem_cgroup_disabled()) - return; - - pc = lookup_page_cgroup(newpage); - VM_BUG_ON(PageCgroupUsed(pc)); - pc->mem_cgroup = root_mem_cgroup; -} - #ifdef CONFIG_SWAP /* * called after __delete_from_swap_cache() and drop "page" account. 
@@ -3248,7 +3305,7 @@ int mem_cgroup_prepare_migration(struct page *page, ctype = MEM_CGROUP_CHARGE_TYPE_CACHE; else ctype = MEM_CGROUP_CHARGE_TYPE_SHMEM; - __mem_cgroup_commit_charge(memcg, newpage, 1, pc, ctype); + __mem_cgroup_commit_charge(memcg, newpage, 1, pc, ctype, false); return ret; } @@ -3258,6 +3315,7 @@ void mem_cgroup_end_migration(struct mem_cgroup *memcg, { struct page *used, *unused; struct page_cgroup *pc; + bool anon; if (!memcg) return; @@ -3279,8 +3337,10 @@ void mem_cgroup_end_migration(struct mem_cgroup *memcg, lock_page_cgroup(pc); ClearPageCgroupMigration(pc); unlock_page_cgroup(pc); - - __mem_cgroup_uncharge_common(unused, MEM_CGROUP_CHARGE_TYPE_FORCE); + anon = PageAnon(used); + __mem_cgroup_uncharge_common(unused, + anon ? MEM_CGROUP_CHARGE_TYPE_MAPPED + : MEM_CGROUP_CHARGE_TYPE_CACHE); /* * If a page is a file cache, radix-tree replacement is very atomic @@ -3290,7 +3350,7 @@ void mem_cgroup_end_migration(struct mem_cgroup *memcg, * and USED bit check in mem_cgroup_uncharge_page() will do enough * check. (see prepare_charge() also) */ - if (PageAnon(used)) + if (anon) mem_cgroup_uncharge_page(used); /* * At migration, we may charge account against cgroup which has no @@ -3320,7 +3380,7 @@ void mem_cgroup_replace_page_cache(struct page *oldpage, /* fix accounting on old pages */ lock_page_cgroup(pc); memcg = pc->mem_cgroup; - mem_cgroup_charge_statistics(memcg, PageCgroupCache(pc), -1); + mem_cgroup_charge_statistics(memcg, false, -1); ClearPageCgroupUsed(pc); unlock_page_cgroup(pc); @@ -3332,7 +3392,7 @@ void mem_cgroup_replace_page_cache(struct page *oldpage, * the newpage may be on LRU(or pagevec for LRU) already. We lock * LRU while we overwrite pc->mem_cgroup. 
*/ - __mem_cgroup_commit_charge_lrucare(newpage, memcg, type); + __mem_cgroup_commit_charge(memcg, newpage, 1, pc, type, true); } #ifdef CONFIG_DEBUG_VM @@ -3531,7 +3591,7 @@ unsigned long mem_cgroup_soft_limit_reclaim(struct zone *zone, int order, break; nr_scanned = 0; - reclaimed = mem_cgroup_soft_reclaim(mz->mem, zone, + reclaimed = mem_cgroup_soft_reclaim(mz->memcg, zone, gfp_mask, &nr_scanned); nr_reclaimed += reclaimed; *total_scanned += nr_scanned; @@ -3558,13 +3618,13 @@ unsigned long mem_cgroup_soft_limit_reclaim(struct zone *zone, int order, next_mz = __mem_cgroup_largest_soft_limit_node(mctz); if (next_mz == mz) - css_put(&next_mz->mem->css); + css_put(&next_mz->memcg->css); else /* next_mz == NULL or other memcg */ break; } while (1); } - __mem_cgroup_remove_exceeded(mz->mem, mz, mctz); - excess = res_counter_soft_limit_excess(&mz->mem->res); + __mem_cgroup_remove_exceeded(mz->memcg, mz, mctz); + excess = res_counter_soft_limit_excess(&mz->memcg->res); /* * One school of thought says that we should not add * back the node to the tree if reclaim returns 0. @@ -3574,9 +3634,9 @@ unsigned long mem_cgroup_soft_limit_reclaim(struct zone *zone, int order, * term TODO. 
*/ /* If excess == 0, no tree ops */ - __mem_cgroup_insert_exceeded(mz->mem, mz, mctz, excess); + __mem_cgroup_insert_exceeded(mz->memcg, mz, mctz, excess); spin_unlock(&mctz->lock); - css_put(&mz->mem->css); + css_put(&mz->memcg->css); loop++; /* * Could not reclaim anything and there are no more @@ -3589,7 +3649,7 @@ unsigned long mem_cgroup_soft_limit_reclaim(struct zone *zone, int order, break; } while (!nr_reclaimed); if (next_mz) - css_put(&next_mz->mem->css); + css_put(&next_mz->memcg->css); return nr_reclaimed; } @@ -3611,7 +3671,7 @@ static int mem_cgroup_force_empty_list(struct mem_cgroup *memcg, mz = mem_cgroup_zoneinfo(memcg, node, zid); list = &mz->lruvec.lists[lru]; - loop = MEM_CGROUP_ZSTAT(mz, lru); + loop = mz->lru_size[lru]; /* give some margin against EBUSY etc...*/ loop += 256; busy = NULL; @@ -3685,10 +3745,10 @@ move_account: mem_cgroup_start_move(memcg); for_each_node_state(node, N_HIGH_MEMORY) { for (zid = 0; !ret && zid < MAX_NR_ZONES; zid++) { - enum lru_list l; - for_each_lru(l) { + enum lru_list lru; + for_each_lru(lru) { ret = mem_cgroup_force_empty_list(memcg, - node, zid, l); + node, zid, lru); if (ret) break; } @@ -3842,7 +3902,6 @@ static u64 mem_cgroup_read(struct cgroup *cont, struct cftype *cft) break; default: BUG(); - break; } return val; } @@ -3921,7 +3980,6 @@ static void memcg_get_hierarchical_limit(struct mem_cgroup *memcg, out: *mem_limit = min_limit; *memsw_limit = min_memsw_limit; - return; } static int mem_cgroup_reset(struct cgroup *cont, unsigned int event) @@ -4080,38 +4138,38 @@ static int mem_control_numa_stat_show(struct seq_file *m, void *arg) unsigned long total_nr, file_nr, anon_nr, unevictable_nr; unsigned long node_nr; struct cgroup *cont = m->private; - struct mem_cgroup *mem_cont = mem_cgroup_from_cont(cont); + struct mem_cgroup *memcg = mem_cgroup_from_cont(cont); - total_nr = mem_cgroup_nr_lru_pages(mem_cont, LRU_ALL); + total_nr = mem_cgroup_nr_lru_pages(memcg, LRU_ALL); seq_printf(m, "total=%lu", 
total_nr); for_each_node_state(nid, N_HIGH_MEMORY) { - node_nr = mem_cgroup_node_nr_lru_pages(mem_cont, nid, LRU_ALL); + node_nr = mem_cgroup_node_nr_lru_pages(memcg, nid, LRU_ALL); seq_printf(m, " N%d=%lu", nid, node_nr); } seq_putc(m, '\n'); - file_nr = mem_cgroup_nr_lru_pages(mem_cont, LRU_ALL_FILE); + file_nr = mem_cgroup_nr_lru_pages(memcg, LRU_ALL_FILE); seq_printf(m, "file=%lu", file_nr); for_each_node_state(nid, N_HIGH_MEMORY) { - node_nr = mem_cgroup_node_nr_lru_pages(mem_cont, nid, + node_nr = mem_cgroup_node_nr_lru_pages(memcg, nid, LRU_ALL_FILE); seq_printf(m, " N%d=%lu", nid, node_nr); } seq_putc(m, '\n'); - anon_nr = mem_cgroup_nr_lru_pages(mem_cont, LRU_ALL_ANON); + anon_nr = mem_cgroup_nr_lru_pages(memcg, LRU_ALL_ANON); seq_printf(m, "anon=%lu", anon_nr); for_each_node_state(nid, N_HIGH_MEMORY) { - node_nr = mem_cgroup_node_nr_lru_pages(mem_cont, nid, + node_nr = mem_cgroup_node_nr_lru_pages(memcg, nid, LRU_ALL_ANON); seq_printf(m, " N%d=%lu", nid, node_nr); } seq_putc(m, '\n'); - unevictable_nr = mem_cgroup_nr_lru_pages(mem_cont, BIT(LRU_UNEVICTABLE)); + unevictable_nr = mem_cgroup_nr_lru_pages(memcg, BIT(LRU_UNEVICTABLE)); seq_printf(m, "unevictable=%lu", unevictable_nr); for_each_node_state(nid, N_HIGH_MEMORY) { - node_nr = mem_cgroup_node_nr_lru_pages(mem_cont, nid, + node_nr = mem_cgroup_node_nr_lru_pages(memcg, nid, BIT(LRU_UNEVICTABLE)); seq_printf(m, " N%d=%lu", nid, node_nr); } @@ -4123,12 +4181,12 @@ static int mem_control_numa_stat_show(struct seq_file *m, void *arg) static int mem_control_stat_show(struct cgroup *cont, struct cftype *cft, struct cgroup_map_cb *cb) { - struct mem_cgroup *mem_cont = mem_cgroup_from_cont(cont); + struct mem_cgroup *memcg = mem_cgroup_from_cont(cont); struct mcs_total_stat mystat; int i; memset(&mystat, 0, sizeof(mystat)); - mem_cgroup_get_local_stat(mem_cont, &mystat); + mem_cgroup_get_local_stat(memcg, &mystat); for (i = 0; i < NR_MCS_STAT; i++) { @@ -4140,14 +4198,14 @@ static int 
mem_control_stat_show(struct cgroup *cont, struct cftype *cft, /* Hierarchical information */ { unsigned long long limit, memsw_limit; - memcg_get_hierarchical_limit(mem_cont, &limit, &memsw_limit); + memcg_get_hierarchical_limit(memcg, &limit, &memsw_limit); cb->fill(cb, "hierarchical_memory_limit", limit); if (do_swap_account) cb->fill(cb, "hierarchical_memsw_limit", memsw_limit); } memset(&mystat, 0, sizeof(mystat)); - mem_cgroup_get_total_stat(mem_cont, &mystat); + mem_cgroup_get_total_stat(memcg, &mystat); for (i = 0; i < NR_MCS_STAT; i++) { if (i == MCS_SWAP && !do_swap_account) continue; @@ -4163,7 +4221,7 @@ static int mem_control_stat_show(struct cgroup *cont, struct cftype *cft, for_each_online_node(nid) for (zid = 0; zid < MAX_NR_ZONES; zid++) { - mz = mem_cgroup_zoneinfo(mem_cont, nid, zid); + mz = mem_cgroup_zoneinfo(memcg, nid, zid); recent_rotated[0] += mz->reclaim_stat.recent_rotated[0]; @@ -4408,12 +4466,6 @@ static void mem_cgroup_usage_unregister_event(struct cgroup *cgrp, else BUG(); - /* - * Something went wrong if we trying to unregister a threshold - * if we don't have thresholds - */ - BUG_ON(!thresholds); - if (!thresholds->primary) goto unlock; @@ -4584,10 +4636,9 @@ static int register_kmem_files(struct cgroup *cont, struct cgroup_subsys *ss) return mem_cgroup_sockets_init(cont, ss); }; -static void kmem_cgroup_destroy(struct cgroup_subsys *ss, - struct cgroup *cont) +static void kmem_cgroup_destroy(struct cgroup *cont) { - mem_cgroup_sockets_destroy(cont, ss); + mem_cgroup_sockets_destroy(cont); } #else static int register_kmem_files(struct cgroup *cont, struct cgroup_subsys *ss) @@ -4595,8 +4646,7 @@ static int register_kmem_files(struct cgroup *cont, struct cgroup_subsys *ss) return 0; } -static void kmem_cgroup_destroy(struct cgroup_subsys *ss, - struct cgroup *cont) +static void kmem_cgroup_destroy(struct cgroup *cont) { } #endif @@ -4720,7 +4770,7 @@ static int alloc_mem_cgroup_per_zone_info(struct mem_cgroup *memcg, int node) { 
struct mem_cgroup_per_node *pn; struct mem_cgroup_per_zone *mz; - enum lru_list l; + enum lru_list lru; int zone, tmp = node; /* * This routine is called against possible nodes. @@ -4738,11 +4788,11 @@ static int alloc_mem_cgroup_per_zone_info(struct mem_cgroup *memcg, int node) for (zone = 0; zone < MAX_NR_ZONES; zone++) { mz = &pn->zoneinfo[zone]; - for_each_lru(l) - INIT_LIST_HEAD(&mz->lruvec.lists[l]); + for_each_lru(lru) + INIT_LIST_HEAD(&mz->lruvec.lists[lru]); mz->usage_in_excess = 0; mz->on_tree = false; - mz->mem = memcg; + mz->memcg = memcg; } memcg->info.nodeinfo[node] = pn; return 0; @@ -4755,33 +4805,54 @@ static void free_mem_cgroup_per_zone_info(struct mem_cgroup *memcg, int node) static struct mem_cgroup *mem_cgroup_alloc(void) { - struct mem_cgroup *mem; + struct mem_cgroup *memcg; int size = sizeof(struct mem_cgroup); /* Can be very big if MAX_NUMNODES is very big */ if (size < PAGE_SIZE) - mem = kzalloc(size, GFP_KERNEL); + memcg = kzalloc(size, GFP_KERNEL); else - mem = vzalloc(size); + memcg = vzalloc(size); - if (!mem) + if (!memcg) return NULL; - mem->stat = alloc_percpu(struct mem_cgroup_stat_cpu); - if (!mem->stat) + memcg->stat = alloc_percpu(struct mem_cgroup_stat_cpu); + if (!memcg->stat) goto out_free; - spin_lock_init(&mem->pcp_counter_lock); - return mem; + spin_lock_init(&memcg->pcp_counter_lock); + return memcg; out_free: if (size < PAGE_SIZE) - kfree(mem); + kfree(memcg); else - vfree(mem); + vfree(memcg); return NULL; } /* + * Helpers for freeing a vzalloc()ed mem_cgroup by RCU, + * but in process context. The work_freeing structure is overlaid + * on the rcu_freeing structure, which itself is overlaid on memsw. 
+ */ +static void vfree_work(struct work_struct *work) +{ + struct mem_cgroup *memcg; + + memcg = container_of(work, struct mem_cgroup, work_freeing); + vfree(memcg); +} +static void vfree_rcu(struct rcu_head *rcu_head) +{ + struct mem_cgroup *memcg; + + memcg = container_of(rcu_head, struct mem_cgroup, rcu_freeing); + INIT_WORK(&memcg->work_freeing, vfree_work); + schedule_work(&memcg->work_freeing); +} + +/* * At destroying mem_cgroup, references from swap_cgroup can remain. * (scanning all at force_empty is too costly...) * @@ -4804,9 +4875,9 @@ static void __mem_cgroup_free(struct mem_cgroup *memcg) free_percpu(memcg->stat); if (sizeof(struct mem_cgroup) < PAGE_SIZE) - kfree(memcg); + kfree_rcu(memcg, rcu_freeing); else - vfree(memcg); + call_rcu(&memcg->rcu_freeing, vfree_rcu); } static void mem_cgroup_get(struct mem_cgroup *memcg) @@ -4888,7 +4959,7 @@ err_cleanup: } static struct cgroup_subsys_state * __ref -mem_cgroup_create(struct cgroup_subsys *ss, struct cgroup *cont) +mem_cgroup_create(struct cgroup *cont) { struct mem_cgroup *memcg, *parent; long error = -ENOMEM; @@ -4944,26 +5015,25 @@ mem_cgroup_create(struct cgroup_subsys *ss, struct cgroup *cont) atomic_set(&memcg->refcnt, 1); memcg->move_charge_at_immigrate = 0; mutex_init(&memcg->thresholds_lock); + spin_lock_init(&memcg->move_lock); return &memcg->css; free_out: __mem_cgroup_free(memcg); return ERR_PTR(error); } -static int mem_cgroup_pre_destroy(struct cgroup_subsys *ss, - struct cgroup *cont) +static int mem_cgroup_pre_destroy(struct cgroup *cont) { struct mem_cgroup *memcg = mem_cgroup_from_cont(cont); return mem_cgroup_force_empty(memcg, false); } -static void mem_cgroup_destroy(struct cgroup_subsys *ss, - struct cgroup *cont) +static void mem_cgroup_destroy(struct cgroup *cont) { struct mem_cgroup *memcg = mem_cgroup_from_cont(cont); - kmem_cgroup_destroy(ss, cont); + kmem_cgroup_destroy(cont); mem_cgroup_put(memcg); } @@ -5040,7 +5110,7 @@ one_by_one: } /** - * is_target_pte_for_mc - check 
a pte whether it is valid for move charge + * get_mctgt_type - get target type of moving charge * @vma: the vma the pte to be checked belongs * @addr: the address corresponding to the pte to be checked * @ptent: the pte to be checked @@ -5063,7 +5133,7 @@ union mc_target { }; enum mc_target_type { - MC_TARGET_NONE, /* not used */ + MC_TARGET_NONE = 0, MC_TARGET_PAGE, MC_TARGET_SWAP, }; @@ -5144,12 +5214,12 @@ static struct page *mc_handle_file_pte(struct vm_area_struct *vma, return page; } -static int is_target_pte_for_mc(struct vm_area_struct *vma, +static enum mc_target_type get_mctgt_type(struct vm_area_struct *vma, unsigned long addr, pte_t ptent, union mc_target *target) { struct page *page = NULL; struct page_cgroup *pc; - int ret = 0; + enum mc_target_type ret = MC_TARGET_NONE; swp_entry_t ent = { .val = 0 }; if (pte_present(ptent)) @@ -5160,7 +5230,7 @@ static int is_target_pte_for_mc(struct vm_area_struct *vma, page = mc_handle_file_pte(vma, addr, ptent, &ent); if (!page && !ent.val) - return 0; + return ret; if (page) { pc = lookup_page_cgroup(page); /* @@ -5186,6 +5256,41 @@ static int is_target_pte_for_mc(struct vm_area_struct *vma, return ret; } +#ifdef CONFIG_TRANSPARENT_HUGEPAGE +/* + * We don't consider swapping or file mapped pages because THP does not + * support them for now. + * Caller should make sure that pmd_trans_huge(pmd) is true. 
+ */ +static enum mc_target_type get_mctgt_type_thp(struct vm_area_struct *vma, + unsigned long addr, pmd_t pmd, union mc_target *target) +{ + struct page *page = NULL; + struct page_cgroup *pc; + enum mc_target_type ret = MC_TARGET_NONE; + + page = pmd_page(pmd); + VM_BUG_ON(!page || !PageHead(page)); + if (!move_anon()) + return ret; + pc = lookup_page_cgroup(page); + if (PageCgroupUsed(pc) && pc->mem_cgroup == mc.from) { + ret = MC_TARGET_PAGE; + if (target) { + get_page(page); + target->page = page; + } + } + return ret; +} +#else +static inline enum mc_target_type get_mctgt_type_thp(struct vm_area_struct *vma, + unsigned long addr, pmd_t pmd, union mc_target *target) +{ + return MC_TARGET_NONE; +} +#endif + static int mem_cgroup_count_precharge_pte_range(pmd_t *pmd, unsigned long addr, unsigned long end, struct mm_walk *walk) @@ -5194,11 +5299,18 @@ static int mem_cgroup_count_precharge_pte_range(pmd_t *pmd, pte_t *pte; spinlock_t *ptl; - split_huge_page_pmd(walk->mm, pmd); + if (pmd_trans_huge_lock(pmd, vma) == 1) { + if (get_mctgt_type_thp(vma, addr, *pmd, NULL) == MC_TARGET_PAGE) + mc.precharge += HPAGE_PMD_NR; + spin_unlock(&vma->vm_mm->page_table_lock); + return 0; + } + if (pmd_trans_unstable(pmd)) + return 0; pte = pte_offset_map_lock(vma->vm_mm, pmd, addr, &ptl); for (; addr != end; pte++, addr += PAGE_SIZE) - if (is_target_pte_for_mc(vma, addr, *pte, NULL)) + if (get_mctgt_type(vma, addr, *pte, NULL)) mc.precharge++; /* increment precharge temporarily */ pte_unmap_unlock(pte - 1, ptl); cond_resched(); @@ -5300,9 +5412,8 @@ static void mem_cgroup_clear_mc(void) mem_cgroup_end_move(from); } -static int mem_cgroup_can_attach(struct cgroup_subsys *ss, - struct cgroup *cgroup, - struct cgroup_taskset *tset) +static int mem_cgroup_can_attach(struct cgroup *cgroup, + struct cgroup_taskset *tset) { struct task_struct *p = cgroup_taskset_first(tset); int ret = 0; @@ -5340,9 +5451,8 @@ static int mem_cgroup_can_attach(struct cgroup_subsys *ss, return ret; } 
-static void mem_cgroup_cancel_attach(struct cgroup_subsys *ss, - struct cgroup *cgroup, - struct cgroup_taskset *tset) +static void mem_cgroup_cancel_attach(struct cgroup *cgroup, + struct cgroup_taskset *tset) { mem_cgroup_clear_mc(); } @@ -5355,23 +5465,57 @@ static int mem_cgroup_move_charge_pte_range(pmd_t *pmd, struct vm_area_struct *vma = walk->private; pte_t *pte; spinlock_t *ptl; + enum mc_target_type target_type; + union mc_target target; + struct page *page; + struct page_cgroup *pc; - split_huge_page_pmd(walk->mm, pmd); + /* + * We don't take compound_lock() here but no race with splitting thp + * happens because: + * - if pmd_trans_huge_lock() returns 1, the relevant thp is not + * under splitting, which means there's no concurrent thp split, + * - if another thread runs into split_huge_page() just after we + * entered this if-block, the thread must wait for page table lock + * to be unlocked in __split_huge_page_splitting(), where the main + * part of thp split is not executed yet. 
+ */ + if (pmd_trans_huge_lock(pmd, vma) == 1) { + if (!mc.precharge) { + spin_unlock(&vma->vm_mm->page_table_lock); + return 0; + } + target_type = get_mctgt_type_thp(vma, addr, *pmd, &target); + if (target_type == MC_TARGET_PAGE) { + page = target.page; + if (!isolate_lru_page(page)) { + pc = lookup_page_cgroup(page); + if (!mem_cgroup_move_account(page, HPAGE_PMD_NR, + pc, mc.from, mc.to, + false)) { + mc.precharge -= HPAGE_PMD_NR; + mc.moved_charge += HPAGE_PMD_NR; + } + putback_lru_page(page); + } + put_page(page); + } + spin_unlock(&vma->vm_mm->page_table_lock); + return 0; + } + + if (pmd_trans_unstable(pmd)) + return 0; retry: pte = pte_offset_map_lock(vma->vm_mm, pmd, addr, &ptl); for (; addr != end; addr += PAGE_SIZE) { pte_t ptent = *(pte++); - union mc_target target; - int type; - struct page *page; - struct page_cgroup *pc; swp_entry_t ent; if (!mc.precharge) break; - type = is_target_pte_for_mc(vma, addr, ptent, &target); - switch (type) { + switch (get_mctgt_type(vma, addr, ptent, &target)) { case MC_TARGET_PAGE: page = target.page; if (isolate_lru_page(page)) @@ -5384,7 +5528,7 @@ retry: mc.moved_charge++; } putback_lru_page(page); -put: /* is_target_pte_for_mc() gets the page */ +put: /* get_mctgt_type() gets the page */ put_page(page); break; case MC_TARGET_SWAP: @@ -5457,9 +5601,8 @@ retry: up_read(&mm->mmap_sem); } -static void mem_cgroup_move_task(struct cgroup_subsys *ss, - struct cgroup *cont, - struct cgroup_taskset *tset) +static void mem_cgroup_move_task(struct cgroup *cont, + struct cgroup_taskset *tset) { struct task_struct *p = cgroup_taskset_first(tset); struct mm_struct *mm = get_task_mm(p); @@ -5474,20 +5617,17 @@ static void mem_cgroup_move_task(struct cgroup_subsys *ss, mem_cgroup_clear_mc(); } #else /* !CONFIG_MMU */ -static int mem_cgroup_can_attach(struct cgroup_subsys *ss, - struct cgroup *cgroup, - struct cgroup_taskset *tset) +static int mem_cgroup_can_attach(struct cgroup *cgroup, + struct cgroup_taskset *tset) { return 0; } 
-static void mem_cgroup_cancel_attach(struct cgroup_subsys *ss, - struct cgroup *cgroup, - struct cgroup_taskset *tset) +static void mem_cgroup_cancel_attach(struct cgroup *cgroup, + struct cgroup_taskset *tset) { } -static void mem_cgroup_move_task(struct cgroup_subsys *ss, - struct cgroup *cont, - struct cgroup_taskset *tset) +static void mem_cgroup_move_task(struct cgroup *cont, + struct cgroup_taskset *tset) { } #endif diff --git a/mm/memory-failure.c b/mm/memory-failure.c index 56080ea36140..97cc2733551a 100644 --- a/mm/memory-failure.c +++ b/mm/memory-failure.c @@ -187,33 +187,40 @@ int hwpoison_filter(struct page *p) EXPORT_SYMBOL_GPL(hwpoison_filter); /* - * Send all the processes who have the page mapped an ``action optional'' - * signal. + * Send all the processes who have the page mapped a signal. + * ``action optional'' if they are not immediately affected by the error + * ``action required'' if error happened in current execution context */ -static int kill_proc_ao(struct task_struct *t, unsigned long addr, int trapno, - unsigned long pfn, struct page *page) +static int kill_proc(struct task_struct *t, unsigned long addr, int trapno, + unsigned long pfn, struct page *page, int flags) { struct siginfo si; int ret; printk(KERN_ERR - "MCE %#lx: Killing %s:%d early due to hardware memory corruption\n", + "MCE %#lx: Killing %s:%d due to hardware memory corruption\n", pfn, t->comm, t->pid); si.si_signo = SIGBUS; si.si_errno = 0; - si.si_code = BUS_MCEERR_AO; si.si_addr = (void *)addr; #ifdef __ARCH_SI_TRAPNO si.si_trapno = trapno; #endif si.si_addr_lsb = compound_trans_order(compound_head(page)) + PAGE_SHIFT; - /* - * Don't use force here, it's convenient if the signal - * can be temporarily blocked. - * This could cause a loop when the user sets SIGBUS - * to SIG_IGN, but hopefully no one will do that? - */ - ret = send_sig_info(SIGBUS, &si, t); /* synchronous? 
*/ + + if ((flags & MF_ACTION_REQUIRED) && t == current) { + si.si_code = BUS_MCEERR_AR; + ret = force_sig_info(SIGBUS, &si, t); + } else { + /* + * Don't use force here, it's convenient if the signal + * can be temporarily blocked. + * This could cause a loop when the user sets SIGBUS + * to SIG_IGN, but hopefully no one will do that? + */ + si.si_code = BUS_MCEERR_AO; + ret = send_sig_info(SIGBUS, &si, t); /* synchronous? */ + } if (ret < 0) printk(KERN_INFO "MCE: Error sending signal to %s:%d: %d\n", t->comm, t->pid, ret); @@ -338,8 +345,9 @@ static void add_to_kill(struct task_struct *tsk, struct page *p, * Also when FAIL is set do a force kill because something went * wrong earlier. */ -static void kill_procs_ao(struct list_head *to_kill, int doit, int trapno, - int fail, struct page *page, unsigned long pfn) +static void kill_procs(struct list_head *to_kill, int doit, int trapno, + int fail, struct page *page, unsigned long pfn, + int flags) { struct to_kill *tk, *next; @@ -363,8 +371,8 @@ static void kill_procs_ao(struct list_head *to_kill, int doit, int trapno, * check for that, but we need to tell the * process anyways. */ - else if (kill_proc_ao(tk->tsk, tk->addr, trapno, - pfn, page) < 0) + else if (kill_proc(tk->tsk, tk->addr, trapno, + pfn, page, flags) < 0) printk(KERN_ERR "MCE %#lx: Cannot send advisory machine check signal to %s:%d\n", pfn, tk->tsk->comm, tk->tsk->pid); @@ -844,7 +852,7 @@ static int page_action(struct page_state *ps, struct page *p, * the pages and send SIGBUS to the processes if the data was dirty. */ static int hwpoison_user_mappings(struct page *p, unsigned long pfn, - int trapno) + int trapno, int flags) { enum ttu_flags ttu = TTU_UNMAP | TTU_IGNORE_MLOCK | TTU_IGNORE_ACCESS; struct address_space *mapping; @@ -962,8 +970,8 @@ static int hwpoison_user_mappings(struct page *p, unsigned long pfn, * use a more force-full uncatchable kill to prevent * any accesses to the poisoned memory. 
*/ - kill_procs_ao(&tokill, !!PageDirty(ppage), trapno, - ret != SWAP_SUCCESS, p, pfn); + kill_procs(&tokill, !!PageDirty(ppage), trapno, + ret != SWAP_SUCCESS, p, pfn, flags); return ret; } @@ -984,7 +992,25 @@ static void clear_page_hwpoison_huge_page(struct page *hpage) ClearPageHWPoison(hpage + i); } -int __memory_failure(unsigned long pfn, int trapno, int flags) +/** + * memory_failure - Handle memory failure of a page. + * @pfn: Page Number of the corrupted page + * @trapno: Trap number reported in the signal to user space. + * @flags: fine tune action taken + * + * This function is called by the low level machine check code + * of an architecture when it detects hardware memory corruption + * of a page. It tries its best to recover, which includes + * dropping pages, killing processes etc. + * + * The function is primarily of use for corruptions that + * happen outside the current execution context (e.g. when + * detected by a background scrubber) + * + * Must run in process context (e.g. a work queue) with interrupts + * enabled and no spinlocks hold. + */ +int memory_failure(unsigned long pfn, int trapno, int flags) { struct page_state *ps; struct page *p; @@ -1063,7 +1089,7 @@ int __memory_failure(unsigned long pfn, int trapno, int flags) * The check (unnecessarily) ignores LRU pages being isolated and * walked by the page reclaim code, however that's not a big loss. */ - if (!PageHuge(p) && !PageTransCompound(p)) { + if (!PageHuge(p) && !PageTransTail(p)) { if (!PageLRU(p)) shake_page(p, 0); if (!PageLRU(p)) { @@ -1130,7 +1156,7 @@ int __memory_failure(unsigned long pfn, int trapno, int flags) * Now take care of user space mappings. * Abort on fail: __delete_from_page_cache() assumes unmapped page. 
*/ - if (hwpoison_user_mappings(p, pfn, trapno) != SWAP_SUCCESS) { + if (hwpoison_user_mappings(p, pfn, trapno, flags) != SWAP_SUCCESS) { printk(KERN_ERR "MCE %#lx: cannot unmap page, give up\n", pfn); res = -EBUSY; goto out; @@ -1156,29 +1182,7 @@ out: unlock_page(hpage); return res; } -EXPORT_SYMBOL_GPL(__memory_failure); - -/** - * memory_failure - Handle memory failure of a page. - * @pfn: Page Number of the corrupted page - * @trapno: Trap number reported in the signal to user space. - * - * This function is called by the low level machine check code - * of an architecture when it detects hardware memory corruption - * of a page. It tries its best to recover, which includes - * dropping pages, killing processes etc. - * - * The function is primarily of use for corruptions that - * happen outside the current execution context (e.g. when - * detected by a background scrubber) - * - * Must run in process context (e.g. a work queue) with interrupts - * enabled and no spinlocks hold. - */ -void memory_failure(unsigned long pfn, int trapno) -{ - __memory_failure(pfn, trapno, 0); -} +EXPORT_SYMBOL_GPL(memory_failure); #define MEMORY_FAILURE_FIFO_ORDER 4 #define MEMORY_FAILURE_FIFO_SIZE (1 << MEMORY_FAILURE_FIFO_ORDER) @@ -1251,7 +1255,7 @@ static void memory_failure_work_func(struct work_struct *work) spin_unlock_irqrestore(&mf_cpu->lock, proc_flags); if (!gotten) break; - __memory_failure(entry.pfn, entry.trapno, entry.flags); + memory_failure(entry.pfn, entry.trapno, entry.flags); } } diff --git a/mm/memory.c b/mm/memory.c index fa2f04e0337c..6105f475fa86 100644 --- a/mm/memory.c +++ b/mm/memory.c @@ -125,17 +125,17 @@ core_initcall(init_zero_pfn); #if defined(SPLIT_RSS_COUNTING) -static void __sync_task_rss_stat(struct task_struct *task, struct mm_struct *mm) +void sync_mm_rss(struct mm_struct *mm) { int i; for (i = 0; i < NR_MM_COUNTERS; i++) { - if (task->rss_stat.count[i]) { - add_mm_counter(mm, i, task->rss_stat.count[i]); - task->rss_stat.count[i] = 0; + if 
(current->rss_stat.count[i]) { + add_mm_counter(mm, i, current->rss_stat.count[i]); + current->rss_stat.count[i] = 0; } } - task->rss_stat.events = 0; + current->rss_stat.events = 0; } static void add_mm_counter_fast(struct mm_struct *mm, int member, int val) @@ -157,30 +157,7 @@ static void check_sync_rss_stat(struct task_struct *task) if (unlikely(task != current)) return; if (unlikely(task->rss_stat.events++ > TASK_RSS_EVENTS_THRESH)) - __sync_task_rss_stat(task, task->mm); -} - -unsigned long get_mm_counter(struct mm_struct *mm, int member) -{ - long val = 0; - - /* - * Don't use task->mm here...for avoiding to use task_get_mm().. - * The caller must guarantee task->mm is not invalid. - */ - val = atomic_long_read(&mm->rss_stat.count[member]); - /* - * counter is updated in asynchronous manner and may go to minus. - * But it's never be expected number for users. - */ - if (val < 0) - return 0; - return (unsigned long)val; -} - -void sync_mm_rss(struct task_struct *task, struct mm_struct *mm) -{ - __sync_task_rss_stat(task, mm); + sync_mm_rss(task->mm); } #else /* SPLIT_RSS_COUNTING */ @@ -661,7 +638,7 @@ static inline void add_mm_rss_vec(struct mm_struct *mm, int *rss) int i; if (current->mm == mm) - sync_mm_rss(current, mm); + sync_mm_rss(mm); for (i = 0; i < NR_MM_COUNTERS; i++) if (rss[i]) add_mm_counter(mm, i, rss[i]); @@ -1247,16 +1224,24 @@ static inline unsigned long zap_pmd_range(struct mmu_gather *tlb, do { next = pmd_addr_end(addr, end); if (pmd_trans_huge(*pmd)) { - if (next-addr != HPAGE_PMD_SIZE) { + if (next - addr != HPAGE_PMD_SIZE) { VM_BUG_ON(!rwsem_is_locked(&tlb->mm->mmap_sem)); split_huge_page_pmd(vma->vm_mm, pmd); } else if (zap_huge_pmd(tlb, vma, pmd, addr)) - continue; + goto next; /* fall through */ } - if (pmd_none_or_clear_bad(pmd)) - continue; + /* + * Here there can be other concurrent MADV_DONTNEED or + * trans huge page faults running, and if the pmd is + * none or trans huge it can change under us. 
This is + * because MADV_DONTNEED holds the mmap_sem in read + * mode. + */ + if (pmd_none_or_trans_huge_or_clear_bad(pmd)) + goto next; next = zap_pte_range(tlb, vma, pmd, addr, next, details); +next: cond_resched(); } while (pmd++, addr = next, addr != end); @@ -1282,10 +1267,10 @@ static inline unsigned long zap_pud_range(struct mmu_gather *tlb, return addr; } -static unsigned long unmap_page_range(struct mmu_gather *tlb, - struct vm_area_struct *vma, - unsigned long addr, unsigned long end, - struct zap_details *details) +static void unmap_page_range(struct mmu_gather *tlb, + struct vm_area_struct *vma, + unsigned long addr, unsigned long end, + struct zap_details *details) { pgd_t *pgd; unsigned long next; @@ -1305,8 +1290,47 @@ static unsigned long unmap_page_range(struct mmu_gather *tlb, } while (pgd++, addr = next, addr != end); tlb_end_vma(tlb, vma); mem_cgroup_uncharge_end(); +} - return addr; + +static void unmap_single_vma(struct mmu_gather *tlb, + struct vm_area_struct *vma, unsigned long start_addr, + unsigned long end_addr, unsigned long *nr_accounted, + struct zap_details *details) +{ + unsigned long start = max(vma->vm_start, start_addr); + unsigned long end; + + if (start >= vma->vm_end) + return; + end = min(vma->vm_end, end_addr); + if (end <= vma->vm_start) + return; + + if (vma->vm_flags & VM_ACCOUNT) + *nr_accounted += (end - start) >> PAGE_SHIFT; + + if (unlikely(is_pfn_mapping(vma))) + untrack_pfn_vma(vma, 0, 0); + + if (start != end) { + if (unlikely(is_vm_hugetlb_page(vma))) { + /* + * It is undesirable to test vma->vm_file as it + * should be non-null for valid hugetlb area. + * However, vm_file will be NULL in the error + * cleanup path of do_mmap_pgoff. When + * hugetlbfs ->mmap method fails, + * do_mmap_pgoff() nullifies vma->vm_file + * before calling this function to clean up. + * Since no pte has actually been setup, it is + * safe to do nothing in this case. 
+ */ + if (vma->vm_file) + unmap_hugepage_range(vma, start, end, NULL); + } else + unmap_page_range(tlb, vma, start, end, details); + } } /** @@ -1318,8 +1342,6 @@ static unsigned long unmap_page_range(struct mmu_gather *tlb, * @nr_accounted: Place number of unmapped pages in vm-accountable vma's here * @details: details of nonlinear truncation or shared cache invalidation * - * Returns the end address of the unmapping (restart addr if interrupted). - * * Unmap all pages in the vma list. * * Only addresses between `start' and `end' will be unmapped. @@ -1331,55 +1353,18 @@ static unsigned long unmap_page_range(struct mmu_gather *tlb, * ensure that any thus-far unmapped pages are flushed before unmap_vmas() * drops the lock and schedules. */ -unsigned long unmap_vmas(struct mmu_gather *tlb, +void unmap_vmas(struct mmu_gather *tlb, struct vm_area_struct *vma, unsigned long start_addr, unsigned long end_addr, unsigned long *nr_accounted, struct zap_details *details) { - unsigned long start = start_addr; struct mm_struct *mm = vma->vm_mm; mmu_notifier_invalidate_range_start(mm, start_addr, end_addr); - for ( ; vma && vma->vm_start < end_addr; vma = vma->vm_next) { - unsigned long end; - - start = max(vma->vm_start, start_addr); - if (start >= vma->vm_end) - continue; - end = min(vma->vm_end, end_addr); - if (end <= vma->vm_start) - continue; - - if (vma->vm_flags & VM_ACCOUNT) - *nr_accounted += (end - start) >> PAGE_SHIFT; - - if (unlikely(is_pfn_mapping(vma))) - untrack_pfn_vma(vma, 0, 0); - - while (start != end) { - if (unlikely(is_vm_hugetlb_page(vma))) { - /* - * It is undesirable to test vma->vm_file as it - * should be non-null for valid hugetlb area. - * However, vm_file will be NULL in the error - * cleanup path of do_mmap_pgoff. When - * hugetlbfs ->mmap method fails, - * do_mmap_pgoff() nullifies vma->vm_file - * before calling this function to clean up. - * Since no pte has actually been setup, it is - * safe to do nothing in this case. 
- */ - if (vma->vm_file) - unmap_hugepage_range(vma, start, end, NULL); - - start = end; - } else - start = unmap_page_range(tlb, vma, start, end, details); - } - } - + for ( ; vma && vma->vm_start < end_addr; vma = vma->vm_next) + unmap_single_vma(tlb, vma, start_addr, end_addr, nr_accounted, + details); mmu_notifier_invalidate_range_end(mm, start_addr, end_addr); - return start; /* which is now the end (or restart) address */ } /** @@ -1388,8 +1373,10 @@ unsigned long unmap_vmas(struct mmu_gather *tlb, * @address: starting address of pages to zap * @size: number of bytes to zap * @details: details of nonlinear truncation or shared cache invalidation + * + * Caller must protect the VMA list */ -unsigned long zap_page_range(struct vm_area_struct *vma, unsigned long address, +void zap_page_range(struct vm_area_struct *vma, unsigned long address, unsigned long size, struct zap_details *details) { struct mm_struct *mm = vma->vm_mm; @@ -1400,9 +1387,34 @@ unsigned long zap_page_range(struct vm_area_struct *vma, unsigned long address, lru_add_drain(); tlb_gather_mmu(&tlb, mm, 0); update_hiwater_rss(mm); - end = unmap_vmas(&tlb, vma, address, end, &nr_accounted, details); + unmap_vmas(&tlb, vma, address, end, &nr_accounted, details); + tlb_finish_mmu(&tlb, address, end); +} + +/** + * zap_page_range_single - remove user pages in a given range + * @vma: vm_area_struct holding the applicable pages + * @address: starting address of pages to zap + * @size: number of bytes to zap + * @details: details of nonlinear truncation or shared cache invalidation + * + * The range must fit into one VMA. 
+ */ +static void zap_page_range_single(struct vm_area_struct *vma, unsigned long address, + unsigned long size, struct zap_details *details) +{ + struct mm_struct *mm = vma->vm_mm; + struct mmu_gather tlb; + unsigned long end = address + size; + unsigned long nr_accounted = 0; + + lru_add_drain(); + tlb_gather_mmu(&tlb, mm, 0); + update_hiwater_rss(mm); + mmu_notifier_invalidate_range_start(mm, address, end); + unmap_single_vma(&tlb, vma, address, end, &nr_accounted, details); + mmu_notifier_invalidate_range_end(mm, address, end); tlb_finish_mmu(&tlb, address, end); - return end; } /** @@ -1423,7 +1435,7 @@ int zap_vma_ptes(struct vm_area_struct *vma, unsigned long address, if (address < vma->vm_start || address + size > vma->vm_end || !(vma->vm_flags & VM_PFNMAP)) return -1; - zap_page_range(vma, address, size, NULL); + zap_page_range_single(vma, address, size, NULL); return 0; } EXPORT_SYMBOL_GPL(zap_vma_ptes); @@ -2447,7 +2459,7 @@ static inline void cow_user_page(struct page *dst, struct page *src, unsigned lo * fails, we just zero-fill it. Live with it. 
*/ if (unlikely(!src)) { - void *kaddr = kmap_atomic(dst, KM_USER0); + void *kaddr = kmap_atomic(dst); void __user *uaddr = (void __user *)(va & PAGE_MASK); /* @@ -2458,7 +2470,7 @@ static inline void cow_user_page(struct page *dst, struct page *src, unsigned lo */ if (__copy_from_user_inatomic(kaddr, uaddr, PAGE_SIZE)) clear_page(kaddr); - kunmap_atomic(kaddr, KM_USER0); + kunmap_atomic(kaddr); flush_dcache_page(dst); } else copy_user_highpage(dst, src, va, vma); @@ -2770,7 +2782,7 @@ static void unmap_mapping_range_vma(struct vm_area_struct *vma, unsigned long start_addr, unsigned long end_addr, struct zap_details *details) { - zap_page_range(vma, start_addr, end_addr - start_addr, details); + zap_page_range_single(vma, start_addr, end_addr - start_addr, details); } static inline void unmap_mapping_range_tree(struct prio_tree_root *root, @@ -3611,13 +3623,7 @@ static int __init gate_vma_init(void) gate_vma.vm_end = FIXADDR_USER_END; gate_vma.vm_flags = VM_READ | VM_MAYREAD | VM_EXEC | VM_MAYEXEC; gate_vma.vm_page_prot = __P101; - /* - * Make sure the vDSO gets into every core dump. - * Dumping its contents makes post-mortem fully interpretable later - * without matching up the same kernel and hardware config to see - * what PC values meant. 
- */ - gate_vma.vm_flags |= VM_ALWAYSDUMP; + return 0; } __initcall(gate_vma_init); diff --git a/mm/mempolicy.c b/mm/mempolicy.c index 06b145fb64ab..cfb6c8678754 100644 --- a/mm/mempolicy.c +++ b/mm/mempolicy.c @@ -512,7 +512,7 @@ static inline int check_pmd_range(struct vm_area_struct *vma, pud_t *pud, do { next = pmd_addr_end(addr, end); split_huge_page_pmd(vma->vm_mm, pmd); - if (pmd_none_or_clear_bad(pmd)) + if (pmd_none_or_trans_huge_or_clear_bad(pmd)) continue; if (check_pte_range(vma, pmd, addr, next, nodes, flags, private)) @@ -640,10 +640,11 @@ static int mbind_range(struct mm_struct *mm, unsigned long start, unsigned long vmstart; unsigned long vmend; - vma = find_vma_prev(mm, start, &prev); + vma = find_vma(mm, start); if (!vma || vma->vm_start > start) return -EFAULT; + prev = vma->vm_prev; if (start > vma->vm_start) prev = vma; @@ -1322,12 +1323,9 @@ SYSCALL_DEFINE4(migrate_pages, pid_t, pid, unsigned long, maxnode, err = -ESRCH; goto out; } - mm = get_task_mm(task); - rcu_read_unlock(); + get_task_struct(task); err = -EINVAL; - if (!mm) - goto out; /* * Check if this process has the right to modify the specified @@ -1335,14 +1333,13 @@ SYSCALL_DEFINE4(migrate_pages, pid_t, pid, unsigned long, maxnode, * capabilities, superuser privileges or the same * userid as the target process. */ - rcu_read_lock(); tcred = __task_cred(task); if (cred->euid != tcred->suid && cred->euid != tcred->uid && cred->uid != tcred->suid && cred->uid != tcred->uid && !capable(CAP_SYS_NICE)) { rcu_read_unlock(); err = -EPERM; - goto out; + goto out_put; } rcu_read_unlock(); @@ -1350,26 +1347,36 @@ SYSCALL_DEFINE4(migrate_pages, pid_t, pid, unsigned long, maxnode, /* Is the user allowed to access the target nodes? 
*/ if (!nodes_subset(*new, task_nodes) && !capable(CAP_SYS_NICE)) { err = -EPERM; - goto out; + goto out_put; } if (!nodes_subset(*new, node_states[N_HIGH_MEMORY])) { err = -EINVAL; - goto out; + goto out_put; } err = security_task_movememory(task); if (err) - goto out; + goto out_put; - err = do_migrate_pages(mm, old, new, - capable(CAP_SYS_NICE) ? MPOL_MF_MOVE_ALL : MPOL_MF_MOVE); -out: + mm = get_task_mm(task); + put_task_struct(task); if (mm) - mmput(mm); + err = do_migrate_pages(mm, old, new, + capable(CAP_SYS_NICE) ? MPOL_MF_MOVE_ALL : MPOL_MF_MOVE); + else + err = -EINVAL; + + mmput(mm); +out: NODEMASK_SCRATCH_FREE(scratch); return err; + +out_put: + put_task_struct(task); + goto out; + } @@ -1843,18 +1850,24 @@ struct page * alloc_pages_vma(gfp_t gfp, int order, struct vm_area_struct *vma, unsigned long addr, int node) { - struct mempolicy *pol = get_vma_policy(current, vma, addr); + struct mempolicy *pol; struct zonelist *zl; struct page *page; + unsigned int cpuset_mems_cookie; + +retry_cpuset: + pol = get_vma_policy(current, vma, addr); + cpuset_mems_cookie = get_mems_allowed(); - get_mems_allowed(); if (unlikely(pol->mode == MPOL_INTERLEAVE)) { unsigned nid; nid = interleave_nid(pol, vma, addr, PAGE_SHIFT + order); mpol_cond_put(pol); page = alloc_page_interleave(gfp, order, nid); - put_mems_allowed(); + if (unlikely(!put_mems_allowed(cpuset_mems_cookie) && !page)) + goto retry_cpuset; + return page; } zl = policy_zonelist(gfp, pol, node); @@ -1865,7 +1878,8 @@ alloc_pages_vma(gfp_t gfp, int order, struct vm_area_struct *vma, struct page *page = __alloc_pages_nodemask(gfp, order, zl, policy_nodemask(gfp, pol)); __mpol_put(pol); - put_mems_allowed(); + if (unlikely(!put_mems_allowed(cpuset_mems_cookie) && !page)) + goto retry_cpuset; return page; } /* @@ -1873,7 +1887,8 @@ alloc_pages_vma(gfp_t gfp, int order, struct vm_area_struct *vma, */ page = __alloc_pages_nodemask(gfp, order, zl, policy_nodemask(gfp, pol)); - put_mems_allowed(); + if 
(unlikely(!put_mems_allowed(cpuset_mems_cookie) && !page)) + goto retry_cpuset; return page; } @@ -1900,11 +1915,14 @@ struct page *alloc_pages_current(gfp_t gfp, unsigned order) { struct mempolicy *pol = current->mempolicy; struct page *page; + unsigned int cpuset_mems_cookie; if (!pol || in_interrupt() || (gfp & __GFP_THISNODE)) pol = &default_policy; - get_mems_allowed(); +retry_cpuset: + cpuset_mems_cookie = get_mems_allowed(); + /* * No reference counting needed for current->mempolicy * nor system default_policy @@ -1915,7 +1933,10 @@ struct page *alloc_pages_current(gfp_t gfp, unsigned order) page = __alloc_pages_nodemask(gfp, order, policy_zonelist(gfp, pol, numa_node_id()), policy_nodemask(gfp, pol)); - put_mems_allowed(); + + if (unlikely(!put_mems_allowed(cpuset_mems_cookie) && !page)) + goto retry_cpuset; + return page; } EXPORT_SYMBOL(alloc_pages_current); diff --git a/mm/migrate.c b/mm/migrate.c index df141f60289e..51c08a0c6f68 100644 --- a/mm/migrate.c +++ b/mm/migrate.c @@ -839,8 +839,6 @@ static int unmap_and_move(new_page_t get_new_page, unsigned long private, if (!newpage) return -ENOMEM; - mem_cgroup_reset_owner(newpage); - if (page_count(page) == 1) { /* page was freed from under us. So we are done. */ goto out; @@ -1176,20 +1174,17 @@ set_status: * Migrate an array of page address onto an array of nodes and fill * the corresponding array of status. 
*/ -static int do_pages_move(struct mm_struct *mm, struct task_struct *task, +static int do_pages_move(struct mm_struct *mm, nodemask_t task_nodes, unsigned long nr_pages, const void __user * __user *pages, const int __user *nodes, int __user *status, int flags) { struct page_to_node *pm; - nodemask_t task_nodes; unsigned long chunk_nr_pages; unsigned long chunk_start; int err; - task_nodes = cpuset_mems_allowed(task); - err = -ENOMEM; pm = (struct page_to_node *)__get_free_page(GFP_KERNEL); if (!pm) @@ -1351,6 +1346,7 @@ SYSCALL_DEFINE6(move_pages, pid_t, pid, unsigned long, nr_pages, struct task_struct *task; struct mm_struct *mm; int err; + nodemask_t task_nodes; /* Check flags */ if (flags & ~(MPOL_MF_MOVE|MPOL_MF_MOVE_ALL)) @@ -1366,11 +1362,7 @@ SYSCALL_DEFINE6(move_pages, pid_t, pid, unsigned long, nr_pages, rcu_read_unlock(); return -ESRCH; } - mm = get_task_mm(task); - rcu_read_unlock(); - - if (!mm) - return -EINVAL; + get_task_struct(task); /* * Check if this process has the right to modify the specified @@ -1378,7 +1370,6 @@ SYSCALL_DEFINE6(move_pages, pid_t, pid, unsigned long, nr_pages, * capabilities, superuser privileges or the same * userid as the target process. 
*/ - rcu_read_lock(); tcred = __task_cred(task); if (cred->euid != tcred->suid && cred->euid != tcred->uid && cred->uid != tcred->suid && cred->uid != tcred->uid && @@ -1393,16 +1384,25 @@ SYSCALL_DEFINE6(move_pages, pid_t, pid, unsigned long, nr_pages, if (err) goto out; - if (nodes) { - err = do_pages_move(mm, task, nr_pages, pages, nodes, status, - flags); - } else { - err = do_pages_stat(mm, nr_pages, pages, status); - } + task_nodes = cpuset_mems_allowed(task); + mm = get_task_mm(task); + put_task_struct(task); + + if (mm) { + if (nodes) + err = do_pages_move(mm, task_nodes, nr_pages, pages, + nodes, status, flags); + else + err = do_pages_stat(mm, nr_pages, pages, status); + } else + err = -EINVAL; -out: mmput(mm); return err; + +out: + put_task_struct(task); + return err; } /* diff --git a/mm/mincore.c b/mm/mincore.c index 636a86876ff2..936b4cee8cb1 100644 --- a/mm/mincore.c +++ b/mm/mincore.c @@ -164,7 +164,7 @@ static void mincore_pmd_range(struct vm_area_struct *vma, pud_t *pud, } /* fall through */ } - if (pmd_none_or_clear_bad(pmd)) + if (pmd_none_or_trans_huge_or_clear_bad(pmd)) mincore_unmapped_range(vma, addr, next, vec); else mincore_pte_range(vma, pmd, addr, next, vec); diff --git a/mm/mlock.c b/mm/mlock.c index 4f4f53bdc65d..ef726e8aa8e9 100644 --- a/mm/mlock.c +++ b/mm/mlock.c @@ -385,10 +385,11 @@ static int do_mlock(unsigned long start, size_t len, int on) return -EINVAL; if (end == start) return 0; - vma = find_vma_prev(current->mm, start, &prev); + vma = find_vma(current->mm, start); if (!vma || vma->vm_start > start) return -ENOMEM; + prev = vma->vm_prev; if (start > vma->vm_start) prev = vma; diff --git a/mm/mmap.c b/mm/mmap.c index 7c112fbca405..b17a39f31a5e 100644 --- a/mm/mmap.c +++ b/mm/mmap.c @@ -453,9 +453,8 @@ static void vma_link(struct mm_struct *mm, struct vm_area_struct *vma, } /* - * Helper for vma_adjust in the split_vma insert case: - * insert vm structure into list and rbtree and anon_vma, - * but it has already been inserted 
into prio_tree earlier. + * Helper for vma_adjust() in the split_vma insert case: insert a vma into the + * mm's list and rbtree. It has already been inserted into the prio_tree. */ static void __insert_vm_struct(struct mm_struct *mm, struct vm_area_struct *vma) { @@ -954,6 +953,19 @@ void vm_stat_account(struct mm_struct *mm, unsigned long flags, #endif /* CONFIG_PROC_FS */ /* + * If a hint addr is less than mmap_min_addr change hint to be as + * low as possible but still greater than mmap_min_addr + */ +static inline unsigned long round_hint_to_min(unsigned long hint) +{ + hint &= PAGE_MASK; + if (((void *)hint != NULL) && + (hint < mmap_min_addr)) + return PAGE_ALIGN(mmap_min_addr); + return hint; +} + +/* * The caller must hold down_write(&current->mm->mmap_sem). */ @@ -1117,9 +1129,9 @@ SYSCALL_DEFINE6(mmap_pgoff, unsigned long, addr, unsigned long, len, * A dummy user value is used because we are not locking * memory so no accounting is necessary */ - len = ALIGN(len, huge_page_size(&default_hstate)); - file = hugetlb_file_setup(HUGETLB_ANON_FILE, len, VM_NORESERVE, - &user, HUGETLB_ANONHUGE_INODE); + file = hugetlb_file_setup(HUGETLB_ANON_FILE, addr, len, + VM_NORESERVE, &user, + HUGETLB_ANONHUGE_INODE); if (IS_ERR(file)) return PTR_ERR(file); } @@ -1253,7 +1265,7 @@ munmap_back: */ if (accountable_mapping(file, vm_flags)) { charged = len >> PAGE_SHIFT; - if (security_vm_enough_memory(charged)) + if (security_vm_enough_memory_mm(mm, charged)) return -ENOMEM; vm_flags |= VM_ACCOUNT; } @@ -1284,8 +1296,9 @@ munmap_back: vma->vm_pgoff = pgoff; INIT_LIST_HEAD(&vma->anon_vma_chain); + error = -EINVAL; /* when rejecting VM_GROWSDOWN|VM_GROWSUP */ + if (file) { - error = -EINVAL; if (vm_flags & (VM_GROWSDOWN|VM_GROWSUP)) goto free_vma; if (vm_flags & VM_DENYWRITE) { @@ -1311,6 +1324,8 @@ munmap_back: pgoff = vma->vm_pgoff; vm_flags = vma->vm_flags; } else if (vm_flags & VM_SHARED) { + if (unlikely(vm_flags & (VM_GROWSDOWN|VM_GROWSUP))) + goto free_vma; error = 
shmem_zero_setup(vma); if (error) goto free_vma; @@ -1446,10 +1461,8 @@ void arch_unmap_area(struct mm_struct *mm, unsigned long addr) /* * Is this a new hole at the lowest possible address? */ - if (addr >= TASK_UNMAPPED_BASE && addr < mm->free_area_cache) { + if (addr >= TASK_UNMAPPED_BASE && addr < mm->free_area_cache) mm->free_area_cache = addr; - mm->cached_hole_size = ~0UL; - } } /* @@ -1464,7 +1477,7 @@ arch_get_unmapped_area_topdown(struct file *filp, const unsigned long addr0, { struct vm_area_struct *vma; struct mm_struct *mm = current->mm; - unsigned long addr = addr0; + unsigned long addr = addr0, start_addr; /* requested length too big for entire address space */ if (len > TASK_SIZE) @@ -1488,22 +1501,14 @@ arch_get_unmapped_area_topdown(struct file *filp, const unsigned long addr0, mm->free_area_cache = mm->mmap_base; } +try_again: /* either no address requested or can't fit in requested address hole */ - addr = mm->free_area_cache; - - /* make sure it can fit in the remaining address space */ - if (addr > len) { - vma = find_vma(mm, addr-len); - if (!vma || addr <= vma->vm_start) - /* remember the address as a hint for next time */ - return (mm->free_area_cache = addr-len); - } + start_addr = addr = mm->free_area_cache; - if (mm->mmap_base < len) - goto bottomup; - - addr = mm->mmap_base-len; + if (addr < len) + goto fail; + addr -= len; do { /* * Lookup failure means no vma is above this address, @@ -1523,7 +1528,21 @@ arch_get_unmapped_area_topdown(struct file *filp, const unsigned long addr0, addr = vma->vm_start-len; } while (len < vma->vm_start); -bottomup: +fail: + /* + * if hint left us with no space for the requested + * mapping then try again: + * + * Note: this is different with the case of bottomup + * which does the fully line-search, but we use find_vma + * here that causes some holes skipped. 
+ */ + if (start_addr != mm->mmap_base) { + mm->free_area_cache = mm->mmap_base; + mm->cached_hole_size = 0; + goto try_again; + } + /* * A failed mmap() very likely causes application failure, * so fall back to the bottom-up function here. This scenario @@ -1628,7 +1647,6 @@ EXPORT_SYMBOL(find_vma); /* * Same as find_vma, but also return a pointer to the previous VMA in *pprev. - * Note: pprev is set to NULL when return value is NULL. */ struct vm_area_struct * find_vma_prev(struct mm_struct *mm, unsigned long addr, @@ -1637,7 +1655,16 @@ find_vma_prev(struct mm_struct *mm, unsigned long addr, struct vm_area_struct *vma; vma = find_vma(mm, addr); - *pprev = vma ? vma->vm_prev : NULL; + if (vma) { + *pprev = vma->vm_prev; + } else { + struct rb_node *rb_node = mm->mm_rb.rb_node; + *pprev = NULL; + while (rb_node) { + *pprev = rb_entry(rb_node, struct vm_area_struct, vm_rb); + rb_node = rb_node->rb_right; + } + } return vma; } @@ -2192,7 +2219,7 @@ unsigned long do_brk(unsigned long addr, unsigned long len) if (mm->map_count > sysctl_max_map_count) return -ENOMEM; - if (security_vm_enough_memory(len >> PAGE_SHIFT)) + if (security_vm_enough_memory_mm(mm, len >> PAGE_SHIFT)) return -ENOMEM; /* Can we just expand an old private anonymous mapping? */ @@ -2236,7 +2263,6 @@ void exit_mmap(struct mm_struct *mm) struct mmu_gather tlb; struct vm_area_struct *vma; unsigned long nr_accounted = 0; - unsigned long end; /* mm's last user has gone, and its about to be pulled down */ mmu_notifier_release(mm); @@ -2261,11 +2287,11 @@ void exit_mmap(struct mm_struct *mm) tlb_gather_mmu(&tlb, mm, 1); /* update_hiwater_rss(mm) here? 
but nobody should be looking */ /* Use -1 here to ensure all VMAs in the mm are unmapped */ - end = unmap_vmas(&tlb, vma, 0, -1, &nr_accounted, NULL); + unmap_vmas(&tlb, vma, 0, -1, &nr_accounted, NULL); vm_unacct_memory(nr_accounted); free_pgtables(&tlb, vma, FIRST_USER_ADDRESS, 0); - tlb_finish_mmu(&tlb, 0, end); + tlb_finish_mmu(&tlb, 0, -1); /* * Walk the list again, actually closing and freeing it, diff --git a/mm/mmu_context.c b/mm/mmu_context.c index cf332bc0080a..3dcfaf4ed355 100644 --- a/mm/mmu_context.c +++ b/mm/mmu_context.c @@ -53,7 +53,7 @@ void unuse_mm(struct mm_struct *mm) struct task_struct *tsk = current; task_lock(tsk); - sync_mm_rss(tsk, mm); + sync_mm_rss(mm); tsk->mm = NULL; /* active_mm is still 'mm' */ enter_lazy_tlb(mm, tsk); diff --git a/mm/mprotect.c b/mm/mprotect.c index 5a688a2756be..a40992610ab6 100644 --- a/mm/mprotect.c +++ b/mm/mprotect.c @@ -60,7 +60,7 @@ static void change_pte_range(struct mm_struct *mm, pmd_t *pmd, ptent = pte_mkwrite(ptent); ptep_modify_prot_commit(mm, addr, pte, ptent); - } else if (PAGE_MIGRATION && !pte_file(oldpte)) { + } else if (IS_ENABLED(CONFIG_MIGRATION) && !pte_file(oldpte)) { swp_entry_t entry = pte_to_swp_entry(oldpte); if (is_write_migration_entry(entry)) { @@ -168,7 +168,7 @@ mprotect_fixup(struct vm_area_struct *vma, struct vm_area_struct **pprev, if (!(oldflags & (VM_ACCOUNT|VM_WRITE|VM_HUGETLB| VM_SHARED|VM_NORESERVE))) { charged = nrpages; - if (security_vm_enough_memory(charged)) + if (security_vm_enough_memory_mm(mm, charged)) return -ENOMEM; newflags |= VM_ACCOUNT; } @@ -262,10 +262,11 @@ SYSCALL_DEFINE3(mprotect, unsigned long, start, size_t, len, down_write(&current->mm->mmap_sem); - vma = find_vma_prev(current->mm, start, &prev); + vma = find_vma(current->mm, start); error = -ENOMEM; if (!vma) goto out; + prev = vma->vm_prev; if (unlikely(grows & PROT_GROWSDOWN)) { if (vma->vm_start >= end) goto out; diff --git a/mm/mremap.c b/mm/mremap.c index 87bb8393e7d2..db8d983b5a7d 100644 --- 
a/mm/mremap.c +++ b/mm/mremap.c @@ -329,7 +329,7 @@ static struct vm_area_struct *vma_to_resize(unsigned long addr, if (vma->vm_flags & VM_ACCOUNT) { unsigned long charged = (new_len - old_len) >> PAGE_SHIFT; - if (security_vm_enough_memory(charged)) + if (security_vm_enough_memory_mm(mm, charged)) goto Efault; *p = charged; } diff --git a/mm/oom_kill.c b/mm/oom_kill.c index 2958fd8e7c9a..46bf2ed5594c 100644 --- a/mm/oom_kill.c +++ b/mm/oom_kill.c @@ -34,6 +34,7 @@ #include <linux/ptrace.h> #include <linux/freezer.h> #include <linux/ftrace.h> +#include <linux/ratelimit.h> #define CREATE_TRACE_POINTS #include <trace/events/oom.h> @@ -309,7 +310,7 @@ static enum oom_constraint constrained_alloc(struct zonelist *zonelist, */ static struct task_struct *select_bad_process(unsigned int *ppoints, unsigned long totalpages, struct mem_cgroup *memcg, - const nodemask_t *nodemask) + const nodemask_t *nodemask, bool force_kill) { struct task_struct *g, *p; struct task_struct *chosen = NULL; @@ -335,7 +336,8 @@ static struct task_struct *select_bad_process(unsigned int *ppoints, if (test_tsk_thread_flag(p, TIF_MEMDIE)) { if (unlikely(frozen(p))) __thaw_task(p); - return ERR_PTR(-1UL); + if (!force_kill) + return ERR_PTR(-1UL); } if (!p->mm) continue; @@ -353,7 +355,7 @@ static struct task_struct *select_bad_process(unsigned int *ppoints, if (p == current) { chosen = p; *ppoints = 1000; - } else { + } else if (!force_kill) { /* * If this task is not being ptraced on exit, * then wait for it to finish before killing @@ -434,66 +436,18 @@ static void dump_header(struct task_struct *p, gfp_t gfp_mask, int order, } #define K(x) ((x) << (PAGE_SHIFT-10)) -static int oom_kill_task(struct task_struct *p) -{ - struct task_struct *q; - struct mm_struct *mm; - - p = find_lock_task_mm(p); - if (!p) - return 1; - - /* mm cannot be safely dereferenced after task_unlock(p) */ - mm = p->mm; - - pr_err("Killed process %d (%s) total-vm:%lukB, anon-rss:%lukB, file-rss:%lukB\n", - task_pid_nr(p), 
p->comm, K(p->mm->total_vm), - K(get_mm_counter(p->mm, MM_ANONPAGES)), - K(get_mm_counter(p->mm, MM_FILEPAGES))); - task_unlock(p); - - /* - * Kill all user processes sharing p->mm in other thread groups, if any. - * They don't get access to memory reserves or a higher scheduler - * priority, though, to avoid depletion of all memory or task - * starvation. This prevents mm->mmap_sem livelock when an oom killed - * task cannot exit because it requires the semaphore and its contended - * by another thread trying to allocate memory itself. That thread will - * now get access to memory reserves since it has a pending fatal - * signal. - */ - for_each_process(q) - if (q->mm == mm && !same_thread_group(q, p) && - !(q->flags & PF_KTHREAD)) { - if (q->signal->oom_score_adj == OOM_SCORE_ADJ_MIN) - continue; - - task_lock(q); /* Protect ->comm from prctl() */ - pr_err("Kill process %d (%s) sharing same memory\n", - task_pid_nr(q), q->comm); - task_unlock(q); - force_sig(SIGKILL, q); - } - - set_tsk_thread_flag(p, TIF_MEMDIE); - force_sig(SIGKILL, p); - - return 0; -} -#undef K - -static int oom_kill_process(struct task_struct *p, gfp_t gfp_mask, int order, - unsigned int points, unsigned long totalpages, - struct mem_cgroup *memcg, nodemask_t *nodemask, - const char *message) +static void oom_kill_process(struct task_struct *p, gfp_t gfp_mask, int order, + unsigned int points, unsigned long totalpages, + struct mem_cgroup *memcg, nodemask_t *nodemask, + const char *message) { struct task_struct *victim = p; struct task_struct *child; struct task_struct *t = p; + struct mm_struct *mm; unsigned int victim_points = 0; - - if (printk_ratelimit()) - dump_header(p, gfp_mask, order, memcg, nodemask); + static DEFINE_RATELIMIT_STATE(oom_rs, DEFAULT_RATELIMIT_INTERVAL, + DEFAULT_RATELIMIT_BURST); /* * If the task is already exiting, don't alarm the sysadmin or kill @@ -501,9 +455,12 @@ static int oom_kill_process(struct task_struct *p, gfp_t gfp_mask, int order, */ if (p->flags & 
PF_EXITING) { set_tsk_thread_flag(p, TIF_MEMDIE); - return 0; + return; } + if (__ratelimit(&oom_rs)) + dump_header(p, gfp_mask, order, memcg, nodemask); + task_lock(p); pr_err("%s: Kill process %d (%s) score %d or sacrifice child\n", message, task_pid_nr(p), p->comm, points); @@ -533,8 +490,44 @@ static int oom_kill_process(struct task_struct *p, gfp_t gfp_mask, int order, } } while_each_thread(p, t); - return oom_kill_task(victim); + victim = find_lock_task_mm(victim); + if (!victim) + return; + + /* mm cannot safely be dereferenced after task_unlock(victim) */ + mm = victim->mm; + pr_err("Killed process %d (%s) total-vm:%lukB, anon-rss:%lukB, file-rss:%lukB\n", + task_pid_nr(victim), victim->comm, K(victim->mm->total_vm), + K(get_mm_counter(victim->mm, MM_ANONPAGES)), + K(get_mm_counter(victim->mm, MM_FILEPAGES))); + task_unlock(victim); + + /* + * Kill all user processes sharing victim->mm in other thread groups, if + * any. They don't get access to memory reserves, though, to avoid + * depletion of all memory. This prevents mm->mmap_sem livelock when an + * oom killed thread cannot exit because it requires the semaphore and + * its contended by another thread trying to allocate memory itself. + * That thread will now get access to memory reserves since it has a + * pending fatal signal. + */ + for_each_process(p) + if (p->mm == mm && !same_thread_group(p, victim) && + !(p->flags & PF_KTHREAD)) { + if (p->signal->oom_score_adj == OOM_SCORE_ADJ_MIN) + continue; + + task_lock(p); /* Protect ->comm from prctl() */ + pr_err("Kill process %d (%s) sharing same memory\n", + task_pid_nr(p), p->comm); + task_unlock(p); + do_send_sig_info(SIGKILL, SEND_SIG_FORCED, p, true); + } + + set_tsk_thread_flag(victim, TIF_MEMDIE); + do_send_sig_info(SIGKILL, SEND_SIG_FORCED, victim, true); } +#undef K /* * Determines whether the kernel must panic because of the panic_on_oom sysctl. 
@@ -561,7 +554,8 @@ static void check_panic_on_oom(enum oom_constraint constraint, gfp_t gfp_mask, } #ifdef CONFIG_CGROUP_MEM_RES_CTLR -void mem_cgroup_out_of_memory(struct mem_cgroup *memcg, gfp_t gfp_mask) +void mem_cgroup_out_of_memory(struct mem_cgroup *memcg, gfp_t gfp_mask, + int order) { unsigned long limit; unsigned int points = 0; @@ -577,18 +571,13 @@ void mem_cgroup_out_of_memory(struct mem_cgroup *memcg, gfp_t gfp_mask) return; } - check_panic_on_oom(CONSTRAINT_MEMCG, gfp_mask, 0, NULL); + check_panic_on_oom(CONSTRAINT_MEMCG, gfp_mask, order, NULL); limit = mem_cgroup_get_limit(memcg) >> PAGE_SHIFT; read_lock(&tasklist_lock); -retry: - p = select_bad_process(&points, limit, memcg, NULL); - if (!p || PTR_ERR(p) == -1UL) - goto out; - - if (oom_kill_process(p, gfp_mask, 0, points, limit, memcg, NULL, - "Memory cgroup out of memory")) - goto retry; -out: + p = select_bad_process(&points, limit, memcg, NULL, false); + if (p && PTR_ERR(p) != -1UL) + oom_kill_process(p, gfp_mask, order, points, limit, memcg, NULL, + "Memory cgroup out of memory"); read_unlock(&tasklist_lock); } #endif @@ -700,6 +689,7 @@ static void clear_system_oom(void) * @gfp_mask: memory allocation flags * @order: amount of memory being requested as a power of 2 * @nodemask: nodemask passed to page allocator + * @force_kill: true if a task must be killed, even if others are exiting * * If we run out of memory, we have the choice between either * killing a random task (bad), letting the system crash (worse) @@ -707,7 +697,7 @@ static void clear_system_oom(void) * don't have to be perfect here, we just have to be good. 
*/ void out_of_memory(struct zonelist *zonelist, gfp_t gfp_mask, - int order, nodemask_t *nodemask) + int order, nodemask_t *nodemask, bool force_kill) { const nodemask_t *mpol_mask; struct task_struct *p; @@ -745,33 +735,25 @@ void out_of_memory(struct zonelist *zonelist, gfp_t gfp_mask, if (sysctl_oom_kill_allocating_task && !oom_unkillable_task(current, NULL, nodemask) && current->mm) { - /* - * oom_kill_process() needs tasklist_lock held. If it returns - * non-zero, current could not be killed so we must fallback to - * the tasklist scan. - */ - if (!oom_kill_process(current, gfp_mask, order, 0, totalpages, - NULL, nodemask, - "Out of memory (oom_kill_allocating_task)")) - goto out; - } - -retry: - p = select_bad_process(&points, totalpages, NULL, mpol_mask); - if (PTR_ERR(p) == -1UL) + oom_kill_process(current, gfp_mask, order, 0, totalpages, NULL, + nodemask, + "Out of memory (oom_kill_allocating_task)"); goto out; + } + p = select_bad_process(&points, totalpages, NULL, mpol_mask, + force_kill); /* Found nothing?!?! Either we hang forever, or we panic. 
*/ if (!p) { dump_header(NULL, gfp_mask, order, NULL, mpol_mask); read_unlock(&tasklist_lock); panic("Out of memory and no killable processes...\n"); } - - if (oom_kill_process(p, gfp_mask, order, points, totalpages, NULL, - nodemask, "Out of memory")) - goto retry; - killed = 1; + if (PTR_ERR(p) != -1UL) { + oom_kill_process(p, gfp_mask, order, points, totalpages, NULL, + nodemask, "Out of memory"); + killed = 1; + } out: read_unlock(&tasklist_lock); @@ -792,7 +774,7 @@ out: void pagefault_out_of_memory(void) { if (try_set_system_oom()) { - out_of_memory(NULL, 0, 0, NULL); + out_of_memory(NULL, 0, 0, NULL, false); clear_system_oom(); } if (!test_thread_flag(TIF_MEMDIE)) diff --git a/mm/page-writeback.c b/mm/page-writeback.c index 363ba7082ef5..26adea8ca2e7 100644 --- a/mm/page-writeback.c +++ b/mm/page-writeback.c @@ -95,6 +95,8 @@ unsigned long vm_dirty_bytes; */ unsigned int dirty_writeback_interval = 5 * 100; /* centiseconds */ +EXPORT_SYMBOL_GPL(dirty_writeback_interval); + /* * The longest time for which data is allowed to remain dirty */ @@ -1472,6 +1474,7 @@ void throttle_vm_writeout(gfp_t gfp_mask) for ( ; ; ) { global_dirty_limits(&background_thresh, &dirty_thresh); + dirty_thresh = hard_dirty_limit(dirty_thresh); /* * Boost the allowable dirty threshold a bit for page diff --git a/mm/page_alloc.c b/mm/page_alloc.c index a13ded1938f0..a712fb9e04ce 100644 --- a/mm/page_alloc.c +++ b/mm/page_alloc.c @@ -1161,11 +1161,47 @@ void drain_local_pages(void *arg) } /* - * Spill all the per-cpu pages from all CPUs back into the buddy allocator + * Spill all the per-cpu pages from all CPUs back into the buddy allocator. + * + * Note that this code is protected against sending an IPI to an offline + * CPU but does not guarantee sending an IPI to newly hotplugged CPUs: + * on_each_cpu_mask() blocks hotplug and won't talk to offlined CPUs but + * nothing keeps CPUs from showing up after we populated the cpumask and + * before the call to on_each_cpu_mask(). 
*/ void drain_all_pages(void) { - on_each_cpu(drain_local_pages, NULL, 1); + int cpu; + struct per_cpu_pageset *pcp; + struct zone *zone; + + /* + * Allocate in the BSS so we wont require allocation in + * direct reclaim path for CONFIG_CPUMASK_OFFSTACK=y + */ + static cpumask_t cpus_with_pcps; + + /* + * We don't care about racing with CPU hotplug event + * as offline notification will cause the notified + * cpu to drain that CPU pcps and on_each_cpu_mask + * disables preemption as part of its processing + */ + for_each_online_cpu(cpu) { + bool has_pcps = false; + for_each_populated_zone(zone) { + pcp = per_cpu_ptr(zone->pageset, cpu); + if (pcp->pcp.count) { + has_pcps = true; + break; + } + } + if (has_pcps) + cpumask_set_cpu(cpu, &cpus_with_pcps); + else + cpumask_clear_cpu(cpu, &cpus_with_pcps); + } + on_each_cpu_mask(&cpus_with_pcps, drain_local_pages, NULL, 1); } #ifdef CONFIG_HIBERNATION @@ -1968,7 +2004,7 @@ __alloc_pages_may_oom(gfp_t gfp_mask, unsigned int order, goto out; } /* Exhausted what can be done so it's blamo time */ - out_of_memory(zonelist, gfp_mask, order, nodemask); + out_of_memory(zonelist, gfp_mask, order, nodemask, false); out: clear_zonelist_oom(zonelist, gfp_mask); @@ -1990,7 +2026,7 @@ __alloc_pages_direct_compact(gfp_t gfp_mask, unsigned int order, if (!order) return NULL; - if (compaction_deferred(preferred_zone)) { + if (compaction_deferred(preferred_zone, order)) { *deferred_compaction = true; return NULL; } @@ -2012,6 +2048,8 @@ __alloc_pages_direct_compact(gfp_t gfp_mask, unsigned int order, if (page) { preferred_zone->compact_considered = 0; preferred_zone->compact_defer_shift = 0; + if (order >= preferred_zone->compact_order_failed) + preferred_zone->compact_order_failed = order + 1; count_vm_event(COMPACTSUCCESS); return page; } @@ -2028,7 +2066,7 @@ __alloc_pages_direct_compact(gfp_t gfp_mask, unsigned int order, * defer if the failure was a sync compaction failure. 
*/ if (sync_migration) - defer_compaction(preferred_zone); + defer_compaction(preferred_zone, order); cond_resched(); } @@ -2306,6 +2344,10 @@ rebalance: if ((gfp_mask & __GFP_FS) && !(gfp_mask & __GFP_NORETRY)) { if (oom_killer_disabled) goto nopage; + /* Coredumps can quickly deplete all memory reserves */ + if ((current->flags & PF_DUMPCORE) && + !(gfp_mask & __GFP_NOFAIL)) + goto nopage; page = __alloc_pages_may_oom(gfp_mask, order, zonelist, high_zoneidx, nodemask, preferred_zone, @@ -2378,8 +2420,9 @@ __alloc_pages_nodemask(gfp_t gfp_mask, unsigned int order, { enum zone_type high_zoneidx = gfp_zone(gfp_mask); struct zone *preferred_zone; - struct page *page; + struct page *page = NULL; int migratetype = allocflags_to_migratetype(gfp_mask); + unsigned int cpuset_mems_cookie; gfp_mask &= gfp_allowed_mask; @@ -2398,15 +2441,15 @@ __alloc_pages_nodemask(gfp_t gfp_mask, unsigned int order, if (unlikely(!zonelist->_zonerefs->zone)) return NULL; - get_mems_allowed(); +retry_cpuset: + cpuset_mems_cookie = get_mems_allowed(); + /* The preferred zone is used for statistics later */ first_zones_zonelist(zonelist, high_zoneidx, nodemask ? : &cpuset_current_mems_allowed, &preferred_zone); - if (!preferred_zone) { - put_mems_allowed(); - return NULL; - } + if (!preferred_zone) + goto out; /* First allocation attempt */ page = get_page_from_freelist(gfp_mask|__GFP_HARDWALL, nodemask, order, @@ -2416,9 +2459,19 @@ __alloc_pages_nodemask(gfp_t gfp_mask, unsigned int order, page = __alloc_pages_slowpath(gfp_mask, order, zonelist, high_zoneidx, nodemask, preferred_zone, migratetype); - put_mems_allowed(); trace_mm_page_alloc(page, order, gfp_mask, migratetype); + +out: + /* + * When updating a task's mems_allowed, it is possible to race with + * parallel threads in such a way that an allocation can fail while + * the mask is being updated. If a page allocation is about to fail, + * check if the cpuset changed during allocation and if so, retry. 
+ */ + if (unlikely(!put_mems_allowed(cpuset_mems_cookie) && !page)) + goto retry_cpuset; + return page; } EXPORT_SYMBOL(__alloc_pages_nodemask); @@ -2632,13 +2685,15 @@ void si_meminfo_node(struct sysinfo *val, int nid) bool skip_free_areas_node(unsigned int flags, int nid) { bool ret = false; + unsigned int cpuset_mems_cookie; if (!(flags & SHOW_MEM_FILTER_NODES)) goto out; - get_mems_allowed(); - ret = !node_isset(nid, cpuset_current_mems_allowed); - put_mems_allowed(); + do { + cpuset_mems_cookie = get_mems_allowed(); + ret = !node_isset(nid, cpuset_current_mems_allowed); + } while (!put_mems_allowed(cpuset_mems_cookie)); out: return ret; } @@ -3925,18 +3980,6 @@ void __init free_bootmem_with_active_regions(int nid, unsigned long max_low_pfn) } } -int __init add_from_early_node_map(struct range *range, int az, - int nr_range, int nid) -{ - unsigned long start_pfn, end_pfn; - int i; - - /* need to go over early_node_map to find out good range for node */ - for_each_mem_pfn_range(i, nid, &start_pfn, &end_pfn, NULL) - nr_range = add_range(range, az, nr_range, start_pfn, end_pfn); - return nr_range; -} - /** * sparse_memory_present_with_active_regions - Call memory_present for each active range * @nid: The node to call memory_present for. If MAX_NUMNODES, all nodes will be used. @@ -4521,7 +4564,7 @@ static unsigned long __init early_calculate_totalpages(void) * memory. 
When they don't, some nodes will have more kernelcore than * others */ -static void __init find_zone_movable_pfns_for_nodes(unsigned long *movable_pfn) +static void __init find_zone_movable_pfns_for_nodes(void) { int i, nid; unsigned long usable_startpfn; @@ -4713,7 +4756,7 @@ void __init free_area_init_nodes(unsigned long *max_zone_pfn) /* Find the PFNs that ZONE_MOVABLE begins at in each node */ memset(zone_movable_pfn, 0, sizeof(zone_movable_pfn)); - find_zone_movable_pfns_for_nodes(zone_movable_pfn); + find_zone_movable_pfns_for_nodes(); /* Print out the zone ranges */ printk("Zone PFN ranges:\n"); @@ -4823,6 +4866,7 @@ static int page_alloc_cpu_notify(struct notifier_block *self, int cpu = (unsigned long)hcpu; if (action == CPU_DEAD || action == CPU_DEAD_FROZEN) { + lru_add_drain_cpu(cpu); drain_pages(cpu); /* diff --git a/mm/page_cgroup.c b/mm/page_cgroup.c index de1616aa9b1e..1ccbd714059c 100644 --- a/mm/page_cgroup.c +++ b/mm/page_cgroup.c @@ -379,13 +379,15 @@ static struct swap_cgroup *lookup_swap_cgroup(swp_entry_t ent, pgoff_t offset = swp_offset(ent); struct swap_cgroup_ctrl *ctrl; struct page *mappage; + struct swap_cgroup *sc; ctrl = &swap_cgroup_ctrl[swp_type(ent)]; if (ctrlp) *ctrlp = ctrl; mappage = ctrl->map[offset / SC_PER_PAGE]; - return page_address(mappage) + offset % SC_PER_PAGE; + sc = page_address(mappage); + return sc + offset % SC_PER_PAGE; } /** diff --git a/mm/pagewalk.c b/mm/pagewalk.c index 2f5cf10ff660..aa9701e12714 100644 --- a/mm/pagewalk.c +++ b/mm/pagewalk.c @@ -59,7 +59,7 @@ again: continue; split_huge_page_pmd(walk->mm, pmd); - if (pmd_none_or_clear_bad(pmd)) + if (pmd_none_or_trans_huge_or_clear_bad(pmd)) goto again; err = walk_pte_range(pmd, addr, next, walk); if (err) diff --git a/mm/percpu-vm.c b/mm/percpu-vm.c index 12a48a88c0d8..405d331804c3 100644 --- a/mm/percpu-vm.c +++ b/mm/percpu-vm.c @@ -184,8 +184,7 @@ static void pcpu_unmap_pages(struct pcpu_chunk *chunk, page_end - page_start); } - for (i = page_start; i < 
page_end; i++) - __clear_bit(i, populated); + bitmap_clear(populated, page_start, page_end - page_start); } /** diff --git a/mm/pgtable-generic.c b/mm/pgtable-generic.c index eb663fb533e0..5a74fea182f1 100644 --- a/mm/pgtable-generic.c +++ b/mm/pgtable-generic.c @@ -70,10 +70,11 @@ int pmdp_clear_flush_young(struct vm_area_struct *vma, unsigned long address, pmd_t *pmdp) { int young; -#ifndef CONFIG_TRANSPARENT_HUGEPAGE +#ifdef CONFIG_TRANSPARENT_HUGEPAGE + VM_BUG_ON(address & ~HPAGE_PMD_MASK); +#else BUG(); #endif /* CONFIG_TRANSPARENT_HUGEPAGE */ - VM_BUG_ON(address & ~HPAGE_PMD_MASK); young = pmdp_test_and_clear_young(vma, address, pmdp); if (young) flush_tlb_range(vma, address, address + HPAGE_PMD_SIZE); diff --git a/mm/rmap.c b/mm/rmap.c index c8454e06b6c8..5b5ad584ffb7 100644 --- a/mm/rmap.c +++ b/mm/rmap.c @@ -120,6 +120,21 @@ static void anon_vma_chain_free(struct anon_vma_chain *anon_vma_chain) kmem_cache_free(anon_vma_chain_cachep, anon_vma_chain); } +static void anon_vma_chain_link(struct vm_area_struct *vma, + struct anon_vma_chain *avc, + struct anon_vma *anon_vma) +{ + avc->vma = vma; + avc->anon_vma = anon_vma; + list_add(&avc->same_vma, &vma->anon_vma_chain); + + /* + * It's critical to add new vmas to the tail of the anon_vma, + * see comment in huge_memory.c:__split_huge_page(). 
+ */ + list_add_tail(&avc->same_anon_vma, &anon_vma->head); +} + /** * anon_vma_prepare - attach an anon_vma to a memory region * @vma: the memory region in question @@ -175,10 +190,7 @@ int anon_vma_prepare(struct vm_area_struct *vma) spin_lock(&mm->page_table_lock); if (likely(!vma->anon_vma)) { vma->anon_vma = anon_vma; - avc->anon_vma = anon_vma; - avc->vma = vma; - list_add(&avc->same_vma, &vma->anon_vma_chain); - list_add_tail(&avc->same_anon_vma, &anon_vma->head); + anon_vma_chain_link(vma, avc, anon_vma); allocated = NULL; avc = NULL; } @@ -224,21 +236,6 @@ static inline void unlock_anon_vma_root(struct anon_vma *root) mutex_unlock(&root->mutex); } -static void anon_vma_chain_link(struct vm_area_struct *vma, - struct anon_vma_chain *avc, - struct anon_vma *anon_vma) -{ - avc->vma = vma; - avc->anon_vma = anon_vma; - list_add(&avc->same_vma, &vma->anon_vma_chain); - - /* - * It's critical to add new vmas to the tail of the anon_vma, - * see comment in huge_memory.c:__split_huge_page(). - */ - list_add_tail(&avc->same_anon_vma, &anon_vma->head); -} - /* * Attach the anon_vmas from src to dst. * Returns 0 on success, -ENOMEM on failure. @@ -1151,10 +1148,15 @@ void page_add_new_anon_rmap(struct page *page, */ void page_add_file_rmap(struct page *page) { + bool locked; + unsigned long flags; + + mem_cgroup_begin_update_page_stat(page, &locked, &flags); if (atomic_inc_and_test(&page->_mapcount)) { __inc_zone_page_state(page, NR_FILE_MAPPED); mem_cgroup_inc_page_stat(page, MEMCG_NR_FILE_MAPPED); } + mem_cgroup_end_update_page_stat(page, &locked, &flags); } /** @@ -1165,9 +1167,21 @@ void page_add_file_rmap(struct page *page) */ void page_remove_rmap(struct page *page) { + bool anon = PageAnon(page); + bool locked; + unsigned long flags; + + /* + * The anon case has no mem_cgroup page_stat to update; but may + * uncharge_page() below, where the lock ordering can deadlock if + * we hold the lock against page_stat move: so avoid it on anon. 
+ */ + if (!anon) + mem_cgroup_begin_update_page_stat(page, &locked, &flags); + /* page still mapped by someone else? */ if (!atomic_add_negative(-1, &page->_mapcount)) - return; + goto out; /* * Now that the last pte has gone, s390 must transfer dirty @@ -1176,7 +1190,7 @@ void page_remove_rmap(struct page *page) * not if it's in swapcache - there might be another pte slot * containing the swap entry, but page not yet written to swap. */ - if ((!PageAnon(page) || PageSwapCache(page)) && + if ((!anon || PageSwapCache(page)) && page_test_and_clear_dirty(page_to_pfn(page), 1)) set_page_dirty(page); /* @@ -1184,8 +1198,8 @@ void page_remove_rmap(struct page *page) * and not charged by memcg for now. */ if (unlikely(PageHuge(page))) - return; - if (PageAnon(page)) { + goto out; + if (anon) { mem_cgroup_uncharge_page(page); if (!PageTransHuge(page)) __dec_zone_page_state(page, NR_ANON_PAGES); @@ -1205,6 +1219,9 @@ void page_remove_rmap(struct page *page) * Leaving it set also helps swapoff to reinstate ptes * faster for those pages still in swapcache. */ +out: + if (!anon) + mem_cgroup_end_update_page_stat(page, &locked, &flags); } /* @@ -1282,7 +1299,7 @@ int try_to_unmap_one(struct page *page, struct vm_area_struct *vma, } dec_mm_counter(mm, MM_ANONPAGES); inc_mm_counter(mm, MM_SWAPENTS); - } else if (PAGE_MIGRATION) { + } else if (IS_ENABLED(CONFIG_MIGRATION)) { /* * Store the pfn of the page in a special migration * pte. 
do_swap_page() will wait until the migration @@ -1293,7 +1310,8 @@ int try_to_unmap_one(struct page *page, struct vm_area_struct *vma, } set_pte_at(mm, address, pte, swp_entry_to_pte(entry)); BUG_ON(pte_file(*pte)); - } else if (PAGE_MIGRATION && (TTU_ACTION(flags) == TTU_MIGRATION)) { + } else if (IS_ENABLED(CONFIG_MIGRATION) && + (TTU_ACTION(flags) == TTU_MIGRATION)) { /* Establish migration entry for a file page */ swp_entry_t entry; entry = make_migration_entry(page, pte_write(pteval)); @@ -1499,7 +1517,7 @@ static int try_to_unmap_anon(struct page *page, enum ttu_flags flags) * locking requirements of exec(), migration skips * temporary VMAs until after exec() completes. */ - if (PAGE_MIGRATION && (flags & TTU_MIGRATION) && + if (IS_ENABLED(CONFIG_MIGRATION) && (flags & TTU_MIGRATION) && is_vma_temporary_stack(vma)) continue; diff --git a/mm/shmem.c b/mm/shmem.c index 269d049294ab..f99ff3e50bd6 100644 --- a/mm/shmem.c +++ b/mm/shmem.c @@ -127,7 +127,7 @@ static inline struct shmem_sb_info *SHMEM_SB(struct super_block *sb) static inline int shmem_acct_size(unsigned long flags, loff_t size) { return (flags & VM_NORESERVE) ? - 0 : security_vm_enough_memory_kern(VM_ACCT(size)); + 0 : security_vm_enough_memory_mm(current->mm, VM_ACCT(size)); } static inline void shmem_unacct_size(unsigned long flags, loff_t size) @@ -145,7 +145,7 @@ static inline void shmem_unacct_size(unsigned long flags, loff_t size) static inline int shmem_acct_block(unsigned long flags) { return (flags & VM_NORESERVE) ? 
- security_vm_enough_memory_kern(VM_ACCT(PAGE_CACHE_SIZE)) : 0; + security_vm_enough_memory_mm(current->mm, VM_ACCT(PAGE_CACHE_SIZE)) : 0; } static inline void shmem_unacct_blocks(unsigned long flags, long pages) @@ -1178,6 +1178,12 @@ static struct inode *shmem_get_inode(struct super_block *sb, const struct inode static const struct inode_operations shmem_symlink_inode_operations; static const struct inode_operations shmem_short_symlink_operations; +#ifdef CONFIG_TMPFS_XATTR +static int shmem_initxattrs(struct inode *, const struct xattr *, void *); +#else +#define shmem_initxattrs NULL +#endif + static int shmem_write_begin(struct file *file, struct address_space *mapping, loff_t pos, unsigned len, unsigned flags, @@ -1490,7 +1496,7 @@ shmem_mknod(struct inode *dir, struct dentry *dentry, umode_t mode, dev_t dev) if (inode) { error = security_inode_init_security(inode, dir, &dentry->d_name, - NULL, NULL); + shmem_initxattrs, NULL); if (error) { if (error != -EOPNOTSUPP) { iput(inode); @@ -1630,7 +1636,7 @@ static int shmem_symlink(struct inode *dir, struct dentry *dentry, const char *s return -ENOSPC; error = security_inode_init_security(inode, dir, &dentry->d_name, - NULL, NULL); + shmem_initxattrs, NULL); if (error) { if (error != -EOPNOTSUPP) { iput(inode); @@ -1656,9 +1662,9 @@ static int shmem_symlink(struct inode *dir, struct dentry *dentry, const char *s } inode->i_mapping->a_ops = &shmem_aops; inode->i_op = &shmem_symlink_inode_operations; - kaddr = kmap_atomic(page, KM_USER0); + kaddr = kmap_atomic(page); memcpy(kaddr, symname, len); - kunmap_atomic(kaddr, KM_USER0); + kunmap_atomic(kaddr); set_page_dirty(page); unlock_page(page); page_cache_release(page); @@ -1704,6 +1710,66 @@ static void shmem_put_link(struct dentry *dentry, struct nameidata *nd, void *co * filesystem level, though. */ +/* + * Allocate new xattr and copy in the value; but leave the name to callers. 
+ */ +static struct shmem_xattr *shmem_xattr_alloc(const void *value, size_t size) +{ + struct shmem_xattr *new_xattr; + size_t len; + + /* wrap around? */ + len = sizeof(*new_xattr) + size; + if (len <= sizeof(*new_xattr)) + return NULL; + + new_xattr = kmalloc(len, GFP_KERNEL); + if (!new_xattr) + return NULL; + + new_xattr->size = size; + memcpy(new_xattr->value, value, size); + return new_xattr; +} + +/* + * Callback for security_inode_init_security() for acquiring xattrs. + */ +static int shmem_initxattrs(struct inode *inode, + const struct xattr *xattr_array, + void *fs_info) +{ + struct shmem_inode_info *info = SHMEM_I(inode); + const struct xattr *xattr; + struct shmem_xattr *new_xattr; + size_t len; + + for (xattr = xattr_array; xattr->name != NULL; xattr++) { + new_xattr = shmem_xattr_alloc(xattr->value, xattr->value_len); + if (!new_xattr) + return -ENOMEM; + + len = strlen(xattr->name) + 1; + new_xattr->name = kmalloc(XATTR_SECURITY_PREFIX_LEN + len, + GFP_KERNEL); + if (!new_xattr->name) { + kfree(new_xattr); + return -ENOMEM; + } + + memcpy(new_xattr->name, XATTR_SECURITY_PREFIX, + XATTR_SECURITY_PREFIX_LEN); + memcpy(new_xattr->name + XATTR_SECURITY_PREFIX_LEN, + xattr->name, len); + + spin_lock(&info->lock); + list_add(&new_xattr->list, &info->xattr_list); + spin_unlock(&info->lock); + } + + return 0; +} + static int shmem_xattr_get(struct dentry *dentry, const char *name, void *buffer, size_t size) { @@ -1731,24 +1797,17 @@ static int shmem_xattr_get(struct dentry *dentry, const char *name, return ret; } -static int shmem_xattr_set(struct dentry *dentry, const char *name, +static int shmem_xattr_set(struct inode *inode, const char *name, const void *value, size_t size, int flags) { - struct inode *inode = dentry->d_inode; struct shmem_inode_info *info = SHMEM_I(inode); struct shmem_xattr *xattr; struct shmem_xattr *new_xattr = NULL; - size_t len; int err = 0; /* value == NULL means remove */ if (value) { - /* wrap around? 
*/ - len = sizeof(*new_xattr) + size; - if (len <= sizeof(*new_xattr)) - return -ENOMEM; - - new_xattr = kmalloc(len, GFP_KERNEL); + new_xattr = shmem_xattr_alloc(value, size); if (!new_xattr) return -ENOMEM; @@ -1757,9 +1816,6 @@ static int shmem_xattr_set(struct dentry *dentry, const char *name, kfree(new_xattr); return -ENOMEM; } - - new_xattr->size = size; - memcpy(new_xattr->value, value, size); } spin_lock(&info->lock); @@ -1858,7 +1914,7 @@ static int shmem_setxattr(struct dentry *dentry, const char *name, if (size == 0) value = ""; /* empty EA, do not remove */ - return shmem_xattr_set(dentry, name, value, size, flags); + return shmem_xattr_set(dentry->d_inode, name, value, size, flags); } @@ -1878,7 +1934,7 @@ static int shmem_removexattr(struct dentry *dentry, const char *name) if (err) return err; - return shmem_xattr_set(dentry, name, NULL, 0, XATTR_REPLACE); + return shmem_xattr_set(dentry->d_inode, name, NULL, 0, XATTR_REPLACE); } static bool xattr_is_trusted(const char *name) @@ -2175,7 +2231,6 @@ static void shmem_put_super(struct super_block *sb) int shmem_fill_super(struct super_block *sb, void *data, int silent) { struct inode *inode; - struct dentry *root; struct shmem_sb_info *sbinfo; int err = -ENOMEM; @@ -2232,14 +2287,11 @@ int shmem_fill_super(struct super_block *sb, void *data, int silent) goto failed; inode->i_uid = sbinfo->uid; inode->i_gid = sbinfo->gid; - root = d_alloc_root(inode); - if (!root) - goto failed_iput; - sb->s_root = root; + sb->s_root = d_make_root(inode); + if (!sb->s_root) + goto failed; return 0; -failed_iput: - iput(inode); failed: shmem_put_super(sb); return err; diff --git a/mm/slab.c b/mm/slab.c index f0bd7857ab3b..e901a36e2520 100644 --- a/mm/slab.c +++ b/mm/slab.c @@ -1731,6 +1731,52 @@ static int __init cpucache_init(void) } __initcall(cpucache_init); +static noinline void +slab_out_of_memory(struct kmem_cache *cachep, gfp_t gfpflags, int nodeid) +{ + struct kmem_list3 *l3; + struct slab *slabp; + unsigned long 
flags; + int node; + + printk(KERN_WARNING + "SLAB: Unable to allocate memory on node %d (gfp=0x%x)\n", + nodeid, gfpflags); + printk(KERN_WARNING " cache: %s, object size: %d, order: %d\n", + cachep->name, cachep->buffer_size, cachep->gfporder); + + for_each_online_node(node) { + unsigned long active_objs = 0, num_objs = 0, free_objects = 0; + unsigned long active_slabs = 0, num_slabs = 0; + + l3 = cachep->nodelists[node]; + if (!l3) + continue; + + spin_lock_irqsave(&l3->list_lock, flags); + list_for_each_entry(slabp, &l3->slabs_full, list) { + active_objs += cachep->num; + active_slabs++; + } + list_for_each_entry(slabp, &l3->slabs_partial, list) { + active_objs += slabp->inuse; + active_slabs++; + } + list_for_each_entry(slabp, &l3->slabs_free, list) + num_slabs++; + + free_objects += l3->free_objects; + spin_unlock_irqrestore(&l3->list_lock, flags); + + num_slabs += active_slabs; + num_objs = num_slabs * cachep->num; + printk(KERN_WARNING + " node %d: slabs: %ld/%ld, objs: %ld/%ld, free: %ld\n", + node, active_slabs, num_slabs, active_objs, num_objs, + free_objects); + } +} + /* * Interface to system's page allocator. No need to hold the cache-lock. 
* @@ -1757,8 +1803,11 @@ static void *kmem_getpages(struct kmem_cache *cachep, gfp_t flags, int nodeid) flags |= __GFP_RECLAIMABLE; page = alloc_pages_exact_node(nodeid, flags | __GFP_NOTRACK, cachep->gfporder); - if (!page) + if (!page) { + if (!(flags & __GFP_NOWARN) && printk_ratelimit()) + slab_out_of_memory(cachep, flags, nodeid); return NULL; + } nr_pages = (1 << cachep->gfporder); if (cachep->flags & SLAB_RECLAIM_ACCOUNT) @@ -3284,12 +3333,10 @@ static void *alternate_node_alloc(struct kmem_cache *cachep, gfp_t flags) if (in_interrupt() || (flags & __GFP_THISNODE)) return NULL; nid_alloc = nid_here = numa_mem_id(); - get_mems_allowed(); if (cpuset_do_slab_mem_spread() && (cachep->flags & SLAB_MEM_SPREAD)) nid_alloc = cpuset_slab_spread_node(); else if (current->mempolicy) nid_alloc = slab_node(current->mempolicy); - put_mems_allowed(); if (nid_alloc != nid_here) return ____cache_alloc_node(cachep, flags, nid_alloc); return NULL; @@ -3312,14 +3359,17 @@ static void *fallback_alloc(struct kmem_cache *cache, gfp_t flags) enum zone_type high_zoneidx = gfp_zone(flags); void *obj = NULL; int nid; + unsigned int cpuset_mems_cookie; if (flags & __GFP_THISNODE) return NULL; - get_mems_allowed(); - zonelist = node_zonelist(slab_node(current->mempolicy), flags); local_flags = flags & (GFP_CONSTRAINT_MASK|GFP_RECLAIM_MASK); +retry_cpuset: + cpuset_mems_cookie = get_mems_allowed(); + zonelist = node_zonelist(slab_node(current->mempolicy), flags); + retry: /* * Look through allowed nodes for objects available @@ -3372,7 +3422,9 @@ retry: } } } - put_mems_allowed(); + + if (unlikely(!put_mems_allowed(cpuset_mems_cookie) && !obj)) + goto retry_cpuset; return obj; } @@ -3693,13 +3745,12 @@ static inline void __cache_free(struct kmem_cache *cachep, void *objp, if (likely(ac->avail < ac->limit)) { STATS_INC_FREEHIT(cachep); - ac->entry[ac->avail++] = objp; - return; } else { STATS_INC_FREEMISS(cachep); cache_flusharray(cachep, ac); - ac->entry[ac->avail++] = objp; } + + 
ac->entry[ac->avail++] = objp; } /** diff --git a/mm/slub.c b/mm/slub.c index 4907563ef7ff..ffe13fdf8144 100644 --- a/mm/slub.c +++ b/mm/slub.c @@ -29,6 +29,7 @@ #include <linux/math64.h> #include <linux/fault-inject.h> #include <linux/stacktrace.h> +#include <linux/prefetch.h> #include <trace/events/kmem.h> @@ -269,6 +270,11 @@ static inline void *get_freepointer(struct kmem_cache *s, void *object) return *(void **)(object + s->offset); } +static void prefetch_freepointer(const struct kmem_cache *s, void *object) +{ + prefetch(object + s->offset); +} + static inline void *get_freepointer_safe(struct kmem_cache *s, void *object) { void *p; @@ -1560,6 +1566,7 @@ static void *get_partial_node(struct kmem_cache *s, } else { page->freelist = t; available = put_cpu_partial(s, page, 0); + stat(s, CPU_PARTIAL_NODE); } if (kmem_cache_debug(s) || available > s->cpu_partial / 2) break; @@ -1581,6 +1588,7 @@ static struct page *get_any_partial(struct kmem_cache *s, gfp_t flags, struct zone *zone; enum zone_type high_zoneidx = gfp_zone(flags); void *object; + unsigned int cpuset_mems_cookie; /* * The defrag ratio allows a configuration of the tradeoffs between @@ -1604,23 +1612,32 @@ static struct page *get_any_partial(struct kmem_cache *s, gfp_t flags, get_cycles() % 1024 > s->remote_node_defrag_ratio) return NULL; - get_mems_allowed(); - zonelist = node_zonelist(slab_node(current->mempolicy), flags); - for_each_zone_zonelist(zone, z, zonelist, high_zoneidx) { - struct kmem_cache_node *n; - - n = get_node(s, zone_to_nid(zone)); - - if (n && cpuset_zone_allowed_hardwall(zone, flags) && - n->nr_partial > s->min_partial) { - object = get_partial_node(s, n, c); - if (object) { - put_mems_allowed(); - return object; + do { + cpuset_mems_cookie = get_mems_allowed(); + zonelist = node_zonelist(slab_node(current->mempolicy), flags); + for_each_zone_zonelist(zone, z, zonelist, high_zoneidx) { + struct kmem_cache_node *n; + + n = get_node(s, zone_to_nid(zone)); + + if (n && 
cpuset_zone_allowed_hardwall(zone, flags) && + n->nr_partial > s->min_partial) { + object = get_partial_node(s, n, c); + if (object) { + /* + * Return the object even if + * put_mems_allowed indicated that + * the cpuset mems_allowed was + * updated in parallel. It's a + * harmless race between the alloc + * and the cpuset update. + */ + put_mems_allowed(cpuset_mems_cookie); + return object; + } } } - } - put_mems_allowed(); + } while (!put_mems_allowed(cpuset_mems_cookie)); #endif return NULL; } @@ -1973,6 +1990,7 @@ int put_cpu_partial(struct kmem_cache *s, struct page *page, int drain) local_irq_restore(flags); pobjects = 0; pages = 0; + stat(s, CPU_PARTIAL_DRAIN); } } @@ -1984,7 +2002,6 @@ int put_cpu_partial(struct kmem_cache *s, struct page *page, int drain) page->next = oldpage; } while (this_cpu_cmpxchg(s->cpu_slab->partial, oldpage, page) != oldpage); - stat(s, CPU_PARTIAL_FREE); return pobjects; } @@ -2018,9 +2035,17 @@ static void flush_cpu_slab(void *d) __flush_cpu_slab(s, smp_processor_id()); } +static bool has_cpu_slab(int cpu, void *info) +{ + struct kmem_cache *s = info; + struct kmem_cache_cpu *c = per_cpu_ptr(s->cpu_slab, cpu); + + return !!(c->page); +} + static void flush_all(struct kmem_cache *s) { - on_each_cpu(flush_cpu_slab, s, 1); + on_each_cpu_cond(has_cpu_slab, flush_cpu_slab, s, 1, GFP_ATOMIC); } /* @@ -2309,6 +2334,8 @@ redo: object = __slab_alloc(s, gfpflags, node, addr, c); else { + void *next_object = get_freepointer_safe(s, object); + /* * The cmpxchg will only match if there was no additional * operation and if we are on the right processor. 
@@ -2324,11 +2351,12 @@ redo: if (unlikely(!this_cpu_cmpxchg_double( s->cpu_slab->freelist, s->cpu_slab->tid, object, tid, - get_freepointer_safe(s, object), next_tid(tid)))) { + next_object, next_tid(tid)))) { note_cmpxchg_failure("slab_alloc", s, tid); goto redo; } + prefetch_freepointer(s, next_object); stat(s, ALLOC_FASTPATH); } @@ -2465,9 +2493,10 @@ static void __slab_free(struct kmem_cache *s, struct page *page, * If we just froze the page then put it onto the * per cpu partial list. */ - if (new.frozen && !was_frozen) + if (new.frozen && !was_frozen) { put_cpu_partial(s, page, 1); - + stat(s, CPU_PARTIAL_FREE); + } /* * The list lock was not taken therefore no list * activity can be necessary. @@ -3929,13 +3958,14 @@ struct kmem_cache *kmem_cache_create(const char *name, size_t size, if (kmem_cache_open(s, n, size, align, flags, ctor)) { list_add(&s->list, &slab_caches); + up_write(&slub_lock); if (sysfs_slab_add(s)) { + down_write(&slub_lock); list_del(&s->list); kfree(n); kfree(s); goto err; } - up_write(&slub_lock); return s; } kfree(n); @@ -5059,6 +5089,8 @@ STAT_ATTR(CMPXCHG_DOUBLE_CPU_FAIL, cmpxchg_double_cpu_fail); STAT_ATTR(CMPXCHG_DOUBLE_FAIL, cmpxchg_double_fail); STAT_ATTR(CPU_PARTIAL_ALLOC, cpu_partial_alloc); STAT_ATTR(CPU_PARTIAL_FREE, cpu_partial_free); +STAT_ATTR(CPU_PARTIAL_NODE, cpu_partial_node); +STAT_ATTR(CPU_PARTIAL_DRAIN, cpu_partial_drain); #endif static struct attribute *slab_attrs[] = { @@ -5124,6 +5156,8 @@ static struct attribute *slab_attrs[] = { &cmpxchg_double_cpu_fail_attr.attr, &cpu_partial_alloc_attr.attr, &cpu_partial_free_attr.attr, + &cpu_partial_node_attr.attr, + &cpu_partial_drain_attr.attr, #endif #ifdef CONFIG_FAILSLAB &failslab_attr.attr, diff --git a/mm/sparse.c b/mm/sparse.c index 61d7cde23111..a8bc7d364deb 100644 --- a/mm/sparse.c +++ b/mm/sparse.c @@ -353,29 +353,21 @@ static void __init sparse_early_usemaps_alloc_node(unsigned long**usemap_map, usemap = 
sparse_early_usemaps_alloc_pgdat_section(NODE_DATA(nodeid), usemap_count); - if (usemap) { - for (pnum = pnum_begin; pnum < pnum_end; pnum++) { - if (!present_section_nr(pnum)) - continue; - usemap_map[pnum] = usemap; - usemap += size; + if (!usemap) { + usemap = alloc_bootmem_node(NODE_DATA(nodeid), size * usemap_count); + if (!usemap) { + printk(KERN_WARNING "%s: allocation failed\n", __func__); + return; } - return; } - usemap = alloc_bootmem_node(NODE_DATA(nodeid), size * usemap_count); - if (usemap) { - for (pnum = pnum_begin; pnum < pnum_end; pnum++) { - if (!present_section_nr(pnum)) - continue; - usemap_map[pnum] = usemap; - usemap += size; - check_usemap_section_nr(nodeid, usemap_map[pnum]); - } - return; + for (pnum = pnum_begin; pnum < pnum_end; pnum++) { + if (!present_section_nr(pnum)) + continue; + usemap_map[pnum] = usemap; + usemap += size; + check_usemap_section_nr(nodeid, usemap_map[pnum]); } - - printk(KERN_WARNING "%s: allocation failed\n", __func__); } #ifndef CONFIG_SPARSEMEM_VMEMMAP diff --git a/mm/swap.c b/mm/swap.c index fff1ff7fb9ad..5c13f1338972 100644 --- a/mm/swap.c +++ b/mm/swap.c @@ -496,7 +496,7 @@ static void lru_deactivate_fn(struct page *page, void *arg) * Either "cpu" is the current CPU, and preemption has already been * disabled; or "cpu" is being hot-unplugged, and is already dead. 
*/ -static void drain_cpu_pagevecs(int cpu) +void lru_add_drain_cpu(int cpu) { struct pagevec *pvecs = per_cpu(lru_add_pvecs, cpu); struct pagevec *pvec; @@ -553,7 +553,7 @@ void deactivate_page(struct page *page) void lru_add_drain(void) { - drain_cpu_pagevecs(get_cpu()); + lru_add_drain_cpu(get_cpu()); put_cpu(); } @@ -652,7 +652,7 @@ EXPORT_SYMBOL(__pagevec_release); void lru_add_page_tail(struct zone* zone, struct page *page, struct page *page_tail) { - int active; + int uninitialized_var(active); enum lru_list lru; const int file = 0; @@ -672,7 +672,6 @@ void lru_add_page_tail(struct zone* zone, active = 0; lru = LRU_INACTIVE_ANON; } - update_page_reclaim_stat(zone, page_tail, file, active); } else { SetPageUnevictable(page_tail); lru = LRU_UNEVICTABLE; @@ -693,6 +692,9 @@ void lru_add_page_tail(struct zone* zone, list_head = page_tail->lru.prev; list_move_tail(&page_tail->lru, list_head); } + + if (!PageUnevictable(page)) + update_page_reclaim_stat(zone, page_tail, file, active); } #endif /* CONFIG_TRANSPARENT_HUGEPAGE */ @@ -710,8 +712,8 @@ static void __pagevec_lru_add_fn(struct page *page, void *arg) SetPageLRU(page); if (active) SetPageActive(page); - update_page_reclaim_stat(zone, page, file, active); add_page_to_lru_list(zone, page, lru); + update_page_reclaim_stat(zone, page, file, active); } /* diff --git a/mm/swap_state.c b/mm/swap_state.c index 470038a91873..9d3dd3763cf7 100644 --- a/mm/swap_state.c +++ b/mm/swap_state.c @@ -300,16 +300,6 @@ struct page *read_swap_cache_async(swp_entry_t entry, gfp_t gfp_mask, new_page = alloc_page_vma(gfp_mask, vma, addr); if (!new_page) break; /* Out of memory */ - /* - * The memcg-specific accounting when moving - * pages around the LRU lists relies on the - * page's owner (memcg) to be valid. Usually, - * pages are assigned to a new owner before - * being put on the LRU list, but since this - * is not the case here, the stale owner from - * a previous allocation cycle must be reset. 
- */ - mem_cgroup_reset_owner(new_page); } /* @@ -382,25 +372,23 @@ struct page *read_swap_cache_async(swp_entry_t entry, gfp_t gfp_mask, struct page *swapin_readahead(swp_entry_t entry, gfp_t gfp_mask, struct vm_area_struct *vma, unsigned long addr) { - int nr_pages; struct page *page; - unsigned long offset; - unsigned long end_offset; + unsigned long offset = swp_offset(entry); + unsigned long start_offset, end_offset; + unsigned long mask = (1UL << page_cluster) - 1; - /* - * Get starting offset for readaround, and number of pages to read. - * Adjust starting address by readbehind (for NUMA interleave case)? - * No, it's very unlikely that swap layout would follow vma layout, - * more likely that neighbouring swap pages came from the same node: - * so use the same "addr" to choose the same node for each swap read. - */ - nr_pages = valid_swaphandles(entry, &offset); - for (end_offset = offset + nr_pages; offset < end_offset; offset++) { + /* Read a page_cluster sized and aligned cluster around offset. */ + start_offset = offset & ~mask; + end_offset = offset | mask; + if (!start_offset) /* First page is swap header. 
*/ + start_offset++; + + for (offset = start_offset; offset <= end_offset ; offset++) { /* Ok, do the async read-ahead now */ page = read_swap_cache_async(swp_entry(swp_type(entry), offset), gfp_mask, vma, addr); if (!page) - break; + continue; page_cache_release(page); } lru_add_drain(); /* Push any new pages onto the LRU now */ diff --git a/mm/swapfile.c b/mm/swapfile.c index d999f090dfda..fafc26d1b1dc 100644 --- a/mm/swapfile.c +++ b/mm/swapfile.c @@ -932,9 +932,7 @@ static inline int unuse_pmd_range(struct vm_area_struct *vma, pud_t *pud, pmd = pmd_offset(pud, addr); do { next = pmd_addr_end(addr, end); - if (unlikely(pmd_trans_huge(*pmd))) - continue; - if (pmd_none_or_clear_bad(pmd)) + if (pmd_none_or_trans_huge_or_clear_bad(pmd)) continue; ret = unuse_pte_range(vma, pmd, addr, next, entry, page); if (ret) @@ -1563,6 +1561,8 @@ SYSCALL_DEFINE1(swapoff, const char __user *, specialfile) if (!capable(CAP_SYS_ADMIN)) return -EPERM; + BUG_ON(!current->mm); + pathname = getname(specialfile); err = PTR_ERR(pathname); if (IS_ERR(pathname)) @@ -1590,7 +1590,7 @@ SYSCALL_DEFINE1(swapoff, const char __user *, specialfile) spin_unlock(&swap_lock); goto out_dput; } - if (!security_vm_enough_memory(p->pages)) + if (!security_vm_enough_memory_mm(current->mm, p->pages)) vm_unacct_memory(p->pages); else { err = -ENOMEM; @@ -2022,6 +2022,9 @@ SYSCALL_DEFINE2(swapon, const char __user *, specialfile, int, swap_flags) struct page *page = NULL; struct inode *inode = NULL; + if (swap_flags & ~SWAP_FLAGS_VALID) + return -EINVAL; + if (!capable(CAP_SYS_ADMIN)) return -EPERM; @@ -2105,7 +2108,7 @@ SYSCALL_DEFINE2(swapon, const char __user *, specialfile, int, swap_flags) p->flags |= SWP_SOLIDSTATE; p->cluster_next = 1 + (random32() % p->highest_bit); } - if (discard_swap(p) == 0 && (swap_flags & SWAP_FLAG_DISCARD)) + if ((swap_flags & SWAP_FLAG_DISCARD) && discard_swap(p) == 0) p->flags |= SWP_DISCARDABLE; } @@ -2290,58 +2293,6 @@ int swapcache_prepare(swp_entry_t entry) } /* - * 
swap_lock prevents swap_map being freed. Don't grab an extra - * reference on the swaphandle, it doesn't matter if it becomes unused. - */ -int valid_swaphandles(swp_entry_t entry, unsigned long *offset) -{ - struct swap_info_struct *si; - int our_page_cluster = page_cluster; - pgoff_t target, toff; - pgoff_t base, end; - int nr_pages = 0; - - if (!our_page_cluster) /* no readahead */ - return 0; - - si = swap_info[swp_type(entry)]; - target = swp_offset(entry); - base = (target >> our_page_cluster) << our_page_cluster; - end = base + (1 << our_page_cluster); - if (!base) /* first page is swap header */ - base++; - - spin_lock(&swap_lock); - if (end > si->max) /* don't go beyond end of map */ - end = si->max; - - /* Count contiguous allocated slots above our target */ - for (toff = target; ++toff < end; nr_pages++) { - /* Don't read in free or bad pages */ - if (!si->swap_map[toff]) - break; - if (swap_count(si->swap_map[toff]) == SWAP_MAP_BAD) - break; - } - /* Count contiguous allocated slots below our target */ - for (toff = target; --toff >= base; nr_pages++) { - /* Don't read in free or bad pages */ - if (!si->swap_map[toff]) - break; - if (swap_count(si->swap_map[toff]) == SWAP_MAP_BAD) - break; - } - spin_unlock(&swap_lock); - - /* - * Indicate starting offset, and return number of pages to get: - * if only 1, say 0, since there's then no readahead to be done. - */ - *offset = ++toff; - return nr_pages? 
++nr_pages: 0; -} - -/* * add_swap_count_continuation - called when a swap count is duplicated * beyond SWAP_MAP_MAX, it allocates a new page and links that to the entry's * page of the original vmalloc'ed swap_map, to hold the continuation count @@ -2427,9 +2378,9 @@ int add_swap_count_continuation(swp_entry_t entry, gfp_t gfp_mask) if (!(count & COUNT_CONTINUED)) goto out; - map = kmap_atomic(list_page, KM_USER0) + offset; + map = kmap_atomic(list_page) + offset; count = *map; - kunmap_atomic(map, KM_USER0); + kunmap_atomic(map); /* * If this continuation count now has some space in it, @@ -2472,7 +2423,7 @@ static bool swap_count_continued(struct swap_info_struct *si, offset &= ~PAGE_MASK; page = list_entry(head->lru.next, struct page, lru); - map = kmap_atomic(page, KM_USER0) + offset; + map = kmap_atomic(page) + offset; if (count == SWAP_MAP_MAX) /* initial increment from swap_map */ goto init_map; /* jump over SWAP_CONT_MAX checks */ @@ -2482,26 +2433,26 @@ static bool swap_count_continued(struct swap_info_struct *si, * Think of how you add 1 to 999 */ while (*map == (SWAP_CONT_MAX | COUNT_CONTINUED)) { - kunmap_atomic(map, KM_USER0); + kunmap_atomic(map); page = list_entry(page->lru.next, struct page, lru); BUG_ON(page == head); - map = kmap_atomic(page, KM_USER0) + offset; + map = kmap_atomic(page) + offset; } if (*map == SWAP_CONT_MAX) { - kunmap_atomic(map, KM_USER0); + kunmap_atomic(map); page = list_entry(page->lru.next, struct page, lru); if (page == head) return false; /* add count continuation */ - map = kmap_atomic(page, KM_USER0) + offset; + map = kmap_atomic(page) + offset; init_map: *map = 0; /* we didn't zero the page */ } *map += 1; - kunmap_atomic(map, KM_USER0); + kunmap_atomic(map); page = list_entry(page->lru.prev, struct page, lru); while (page != head) { - map = kmap_atomic(page, KM_USER0) + offset; + map = kmap_atomic(page) + offset; *map = COUNT_CONTINUED; - kunmap_atomic(map, KM_USER0); + kunmap_atomic(map); page = 
list_entry(page->lru.prev, struct page, lru); } return true; /* incremented */ @@ -2512,22 +2463,22 @@ init_map: *map = 0; /* we didn't zero the page */ */ BUG_ON(count != COUNT_CONTINUED); while (*map == COUNT_CONTINUED) { - kunmap_atomic(map, KM_USER0); + kunmap_atomic(map); page = list_entry(page->lru.next, struct page, lru); BUG_ON(page == head); - map = kmap_atomic(page, KM_USER0) + offset; + map = kmap_atomic(page) + offset; } BUG_ON(*map == 0); *map -= 1; if (*map == 0) count = 0; - kunmap_atomic(map, KM_USER0); + kunmap_atomic(map); page = list_entry(page->lru.prev, struct page, lru); while (page != head) { - map = kmap_atomic(page, KM_USER0) + offset; + map = kmap_atomic(page) + offset; *map = SWAP_CONT_MAX | count; count = COUNT_CONTINUED; - kunmap_atomic(map, KM_USER0); + kunmap_atomic(map); page = list_entry(page->lru.prev, struct page, lru); } return count == COUNT_CONTINUED; diff --git a/mm/truncate.c b/mm/truncate.c index 632b15e29f74..61a183b89df6 100644 --- a/mm/truncate.c +++ b/mm/truncate.c @@ -52,7 +52,7 @@ void do_invalidatepage(struct page *page, unsigned long offset) static inline void truncate_partial_page(struct page *page, unsigned partial) { zero_user_segment(page, partial, PAGE_CACHE_SIZE); - cleancache_flush_page(page->mapping, page); + cleancache_invalidate_page(page->mapping, page); if (page_has_private(page)) do_invalidatepage(page, partial); } @@ -184,7 +184,7 @@ int invalidate_inode_page(struct page *page) } /** - * truncate_inode_pages - truncate range of pages specified by start & end byte offsets + * truncate_inode_pages_range - truncate range of pages specified by start & end byte offsets * @mapping: mapping to truncate * @lstart: offset from which to truncate * @lend: offset to which to truncate @@ -213,7 +213,7 @@ void truncate_inode_pages_range(struct address_space *mapping, pgoff_t end; int i; - cleancache_flush_inode(mapping); + cleancache_invalidate_inode(mapping); if (mapping->nrpages == 0) return; @@ -292,7 +292,7 @@ 
void truncate_inode_pages_range(struct address_space *mapping, mem_cgroup_uncharge_end(); index++; } - cleancache_flush_inode(mapping); + cleancache_invalidate_inode(mapping); } EXPORT_SYMBOL(truncate_inode_pages_range); @@ -444,7 +444,7 @@ int invalidate_inode_pages2_range(struct address_space *mapping, int ret2 = 0; int did_range_unmap = 0; - cleancache_flush_inode(mapping); + cleancache_invalidate_inode(mapping); pagevec_init(&pvec, 0); index = start; while (index <= end && pagevec_lookup(&pvec, mapping, index, @@ -500,7 +500,7 @@ int invalidate_inode_pages2_range(struct address_space *mapping, cond_resched(); index++; } - cleancache_flush_inode(mapping); + cleancache_invalidate_inode(mapping); return ret; } EXPORT_SYMBOL_GPL(invalidate_inode_pages2_range); @@ -626,3 +626,43 @@ int vmtruncate_range(struct inode *inode, loff_t lstart, loff_t lend) return 0; } + +/** + * truncate_pagecache_range - unmap and remove pagecache that is hole-punched + * @inode: inode + * @lstart: offset of beginning of hole + * @lend: offset of last byte of hole + * + * This function should typically be called before the filesystem + * releases resources associated with the freed range (eg. deallocates + * blocks). This way, pagecache will always stay logically coherent + * with on-disk format, and the filesystem would not have to deal with + * situations such as writepage being called for a page that has already + * had its underlying blocks deallocated. + */ +void truncate_pagecache_range(struct inode *inode, loff_t lstart, loff_t lend) +{ + struct address_space *mapping = inode->i_mapping; + loff_t unmap_start = round_up(lstart, PAGE_SIZE); + loff_t unmap_end = round_down(1 + lend, PAGE_SIZE) - 1; + /* + * This rounding is currently just for example: unmap_mapping_range + * expands its hole outwards, whereas we want it to contract the hole + * inwards. 
However, existing callers of truncate_pagecache_range are + * doing their own page rounding first; and truncate_inode_pages_range + * currently BUGs if lend is not pagealigned-1 (it handles partial + * page at start of hole, but not partial page at end of hole). Note + * unmap_mapping_range allows holelen 0 for all, and we allow lend -1. + */ + + /* + * Unlike in truncate_pagecache, unmap_mapping_range is called only + * once (before truncating pagecache), and without "even_cows" flag: + * hole-punching should not remove private COWed pages from the hole. + */ + if ((u64)unmap_end > (u64)unmap_start) + unmap_mapping_range(mapping, unmap_start, + 1 + unmap_end - unmap_start, 0); + truncate_inode_pages_range(mapping, lstart, lend); +} +EXPORT_SYMBOL(truncate_pagecache_range); diff --git a/mm/util.c b/mm/util.c index 136ac4f322b8..ae962b31de88 100644 --- a/mm/util.c +++ b/mm/util.c @@ -239,6 +239,47 @@ void __vma_link_list(struct mm_struct *mm, struct vm_area_struct *vma, next->vm_prev = vma; } +/* Check if the vma is being used as a stack by this task */ +static int vm_is_stack_for_task(struct task_struct *t, + struct vm_area_struct *vma) +{ + return (vma->vm_start <= KSTK_ESP(t) && vma->vm_end >= KSTK_ESP(t)); +} + +/* + * Check if the vma is being used as a stack. + * If in_group is non-zero, check in the entire thread group or else + * just check in the current task. Returns the pid of the task that + * the vma is stack for. 
+ */ +pid_t vm_is_stack(struct task_struct *task, + struct vm_area_struct *vma, int in_group) +{ + pid_t ret = 0; + + if (vm_is_stack_for_task(task, vma)) + return task->pid; + + if (in_group) { + struct task_struct *t; + rcu_read_lock(); + if (!pid_alive(task)) + goto done; + + t = task; + do { + if (vm_is_stack_for_task(t, vma)) { + ret = t->pid; + goto done; + } + } while_each_thread(task, t); +done: + rcu_read_unlock(); + } + + return ret; +} + #if defined(CONFIG_MMU) && !defined(HAVE_ARCH_PICK_MMAP_LAYOUT) void arch_pick_mmap_layout(struct mm_struct *mm) { diff --git a/mm/vmalloc.c b/mm/vmalloc.c index 86ce9a526c17..94dff883b449 100644 --- a/mm/vmalloc.c +++ b/mm/vmalloc.c @@ -1906,9 +1906,9 @@ static int aligned_vread(char *buf, char *addr, unsigned long count) * we can expect USER0 is not used (see vread/vwrite's * function description) */ - void *map = kmap_atomic(p, KM_USER0); + void *map = kmap_atomic(p); memcpy(buf, map + offset, length); - kunmap_atomic(map, KM_USER0); + kunmap_atomic(map); } else memset(buf, 0, length); @@ -1945,9 +1945,9 @@ static int aligned_vwrite(char *buf, char *addr, unsigned long count) * we can expect USER0 is not used (see vread/vwrite's * function description) */ - void *map = kmap_atomic(p, KM_USER0); + void *map = kmap_atomic(p); memcpy(map + offset, buf, length); - kunmap_atomic(map, KM_USER0); + kunmap_atomic(map); } addr += length; buf += length; diff --git a/mm/vmscan.c b/mm/vmscan.c index c52b23552659..33c332bbab73 100644 --- a/mm/vmscan.c +++ b/mm/vmscan.c @@ -1138,7 +1138,7 @@ int __isolate_lru_page(struct page *page, isolate_mode_t mode, int file) * @mz: The mem_cgroup_zone to pull pages from. * @dst: The temp list to put pages on to. * @nr_scanned: The number of pages that were scanned. 
- * @order: The caller's attempted allocation order + * @sc: The scan_control struct for this reclaim session * @mode: One of the LRU isolation modes * @active: True [1] if isolating active pages * @file: True [1] if isolating file [!anon] pages @@ -1147,8 +1147,8 @@ int __isolate_lru_page(struct page *page, isolate_mode_t mode, int file) */ static unsigned long isolate_lru_pages(unsigned long nr_to_scan, struct mem_cgroup_zone *mz, struct list_head *dst, - unsigned long *nr_scanned, int order, isolate_mode_t mode, - int active, int file) + unsigned long *nr_scanned, struct scan_control *sc, + isolate_mode_t mode, int active, int file) { struct lruvec *lruvec; struct list_head *src; @@ -1194,7 +1194,7 @@ static unsigned long isolate_lru_pages(unsigned long nr_to_scan, BUG(); } - if (!order) + if (!sc->order || !(sc->reclaim_mode & RECLAIM_MODE_LUMPYRECLAIM)) continue; /* @@ -1208,8 +1208,8 @@ static unsigned long isolate_lru_pages(unsigned long nr_to_scan, */ zone_id = page_zone_id(page); page_pfn = page_to_pfn(page); - pfn = page_pfn & ~((1 << order) - 1); - end_pfn = pfn + (1 << order); + pfn = page_pfn & ~((1 << sc->order) - 1); + end_pfn = pfn + (1 << sc->order); for (; pfn < end_pfn; pfn++) { struct page *cursor_page; @@ -1275,7 +1275,7 @@ static unsigned long isolate_lru_pages(unsigned long nr_to_scan, *nr_scanned = scan; - trace_mm_vmscan_lru_isolate(order, + trace_mm_vmscan_lru_isolate(sc->order, nr_to_scan, scan, nr_taken, nr_lumpy_taken, nr_lumpy_dirty, nr_lumpy_failed, @@ -1413,7 +1413,6 @@ update_isolated_counts(struct mem_cgroup_zone *mz, unsigned long *nr_anon, unsigned long *nr_file) { - struct zone_reclaim_stat *reclaim_stat = get_reclaim_stat(mz); struct zone *zone = mz->zone; unsigned int count[NR_LRU_LISTS] = { 0, }; unsigned long nr_active = 0; @@ -1434,6 +1433,7 @@ update_isolated_counts(struct mem_cgroup_zone *mz, count[lru] += numpages; } + preempt_disable(); __count_vm_events(PGDEACTIVATE, nr_active); __mod_zone_page_state(zone, 
NR_ACTIVE_FILE, @@ -1448,8 +1448,9 @@ update_isolated_counts(struct mem_cgroup_zone *mz, *nr_anon = count[LRU_ACTIVE_ANON] + count[LRU_INACTIVE_ANON]; *nr_file = count[LRU_ACTIVE_FILE] + count[LRU_INACTIVE_FILE]; - reclaim_stat->recent_scanned[0] += *nr_anon; - reclaim_stat->recent_scanned[1] += *nr_file; + __mod_zone_page_state(zone, NR_ISOLATED_ANON, *nr_anon); + __mod_zone_page_state(zone, NR_ISOLATED_FILE, *nr_file); + preempt_enable(); } /* @@ -1509,8 +1510,9 @@ shrink_inactive_list(unsigned long nr_to_scan, struct mem_cgroup_zone *mz, unsigned long nr_file; unsigned long nr_dirty = 0; unsigned long nr_writeback = 0; - isolate_mode_t reclaim_mode = ISOLATE_INACTIVE; + isolate_mode_t isolate_mode = ISOLATE_INACTIVE; struct zone *zone = mz->zone; + struct zone_reclaim_stat *reclaim_stat = get_reclaim_stat(mz); while (unlikely(too_many_isolated(zone, file, sc))) { congestion_wait(BLK_RW_ASYNC, HZ/10); @@ -1522,20 +1524,19 @@ shrink_inactive_list(unsigned long nr_to_scan, struct mem_cgroup_zone *mz, set_reclaim_mode(priority, sc, false); if (sc->reclaim_mode & RECLAIM_MODE_LUMPYRECLAIM) - reclaim_mode |= ISOLATE_ACTIVE; + isolate_mode |= ISOLATE_ACTIVE; lru_add_drain(); if (!sc->may_unmap) - reclaim_mode |= ISOLATE_UNMAPPED; + isolate_mode |= ISOLATE_UNMAPPED; if (!sc->may_writepage) - reclaim_mode |= ISOLATE_CLEAN; + isolate_mode |= ISOLATE_CLEAN; spin_lock_irq(&zone->lru_lock); - nr_taken = isolate_lru_pages(nr_to_scan, mz, &page_list, - &nr_scanned, sc->order, - reclaim_mode, 0, file); + nr_taken = isolate_lru_pages(nr_to_scan, mz, &page_list, &nr_scanned, + sc, isolate_mode, 0, file); if (global_reclaim(sc)) { zone->pages_scanned += nr_scanned; if (current_is_kswapd()) @@ -1545,19 +1546,13 @@ shrink_inactive_list(unsigned long nr_to_scan, struct mem_cgroup_zone *mz, __count_zone_vm_events(PGSCAN_DIRECT, zone, nr_scanned); } + spin_unlock_irq(&zone->lru_lock); - if (nr_taken == 0) { - spin_unlock_irq(&zone->lru_lock); + if (nr_taken == 0) return 0; - } 
update_isolated_counts(mz, &page_list, &nr_anon, &nr_file); - __mod_zone_page_state(zone, NR_ISOLATED_ANON, nr_anon); - __mod_zone_page_state(zone, NR_ISOLATED_FILE, nr_file); - - spin_unlock_irq(&zone->lru_lock); - nr_reclaimed = shrink_page_list(&page_list, mz, sc, priority, &nr_dirty, &nr_writeback); @@ -1570,6 +1565,9 @@ shrink_inactive_list(unsigned long nr_to_scan, struct mem_cgroup_zone *mz, spin_lock_irq(&zone->lru_lock); + reclaim_stat->recent_scanned[0] += nr_anon; + reclaim_stat->recent_scanned[1] += nr_file; + if (current_is_kswapd()) __count_vm_events(KSWAPD_STEAL, nr_reclaimed); __count_zone_vm_events(PGSTEAL, zone, nr_reclaimed); @@ -1643,18 +1641,6 @@ static void move_active_pages_to_lru(struct zone *zone, unsigned long pgmoved = 0; struct page *page; - if (buffer_heads_over_limit) { - spin_unlock_irq(&zone->lru_lock); - list_for_each_entry(page, list, lru) { - if (page_has_private(page) && trylock_page(page)) { - if (page_has_private(page)) - try_to_release_page(page, 0); - unlock_page(page); - } - } - spin_lock_irq(&zone->lru_lock); - } - while (!list_empty(list)) { struct lruvec *lruvec; @@ -1699,21 +1685,22 @@ static void shrink_active_list(unsigned long nr_to_scan, struct page *page; struct zone_reclaim_stat *reclaim_stat = get_reclaim_stat(mz); unsigned long nr_rotated = 0; - isolate_mode_t reclaim_mode = ISOLATE_ACTIVE; + isolate_mode_t isolate_mode = ISOLATE_ACTIVE; struct zone *zone = mz->zone; lru_add_drain(); + reset_reclaim_mode(sc); + if (!sc->may_unmap) - reclaim_mode |= ISOLATE_UNMAPPED; + isolate_mode |= ISOLATE_UNMAPPED; if (!sc->may_writepage) - reclaim_mode |= ISOLATE_CLEAN; + isolate_mode |= ISOLATE_CLEAN; spin_lock_irq(&zone->lru_lock); - nr_taken = isolate_lru_pages(nr_to_scan, mz, &l_hold, - &nr_scanned, sc->order, - reclaim_mode, 1, file); + nr_taken = isolate_lru_pages(nr_to_scan, mz, &l_hold, &nr_scanned, sc, + isolate_mode, 1, file); if (global_reclaim(sc)) zone->pages_scanned += nr_scanned; @@ -1737,6 +1724,14 @@ static 
void shrink_active_list(unsigned long nr_to_scan, continue; } + if (unlikely(buffer_heads_over_limit)) { + if (page_has_private(page) && trylock_page(page)) { + if (page_has_private(page)) + try_to_release_page(page, 0); + unlock_page(page); + } + } + if (page_referenced(page, 0, mz->mem_cgroup, &vm_flags)) { nr_rotated += hpage_nr_pages(page); /* @@ -2112,7 +2107,12 @@ restart: * with multiple processes reclaiming pages, the total * freeing target can get unreasonably large. */ - if (nr_reclaimed >= nr_to_reclaim && priority < DEF_PRIORITY) + if (nr_reclaimed >= nr_to_reclaim) + nr_to_reclaim = 0; + else + nr_to_reclaim -= nr_reclaimed; + + if (!nr_to_reclaim && priority < DEF_PRIORITY) break; } blk_finish_plug(&plug); @@ -2195,7 +2195,7 @@ static inline bool compaction_ready(struct zone *zone, struct scan_control *sc) * If compaction is deferred, reclaim up to a point where * compaction will have a chance of success when re-enabled */ - if (compaction_deferred(zone)) + if (compaction_deferred(zone, sc->order)) return watermark_ok; /* If compaction is not ready to start, keep reclaiming */ @@ -2235,6 +2235,14 @@ static bool shrink_zones(int priority, struct zonelist *zonelist, unsigned long nr_soft_scanned; bool aborted_reclaim = false; + /* + * If the number of buffer_heads in the machine exceeds the maximum + * allowed level, force direct reclaim to scan the highmem zone as + * highmem pages could be pinning lowmem pages storing buffer_heads + */ + if (buffer_heads_over_limit) + sc->gfp_mask |= __GFP_HIGHMEM; + for_each_zone_zonelist_nodemask(zone, z, zonelist, gfp_zone(sc->gfp_mask), sc->nodemask) { if (!populated_zone(zone)) @@ -2255,8 +2263,8 @@ static bool shrink_zones(int priority, struct zonelist *zonelist, * Even though compaction is invoked for any * non-zero order, only frequent costly order * reclamation is disruptive enough to become a - * noticable problem, like transparent huge page - * allocations. 
+ * noticeable problem, like transparent huge + * page allocations. */ if (compaction_ready(zone, sc)) { aborted_reclaim = true; @@ -2337,7 +2345,6 @@ static unsigned long do_try_to_free_pages(struct zonelist *zonelist, unsigned long writeback_threshold; bool aborted_reclaim; - get_mems_allowed(); delayacct_freepages_start(); if (global_reclaim(sc)) @@ -2401,7 +2408,6 @@ static unsigned long do_try_to_free_pages(struct zonelist *zonelist, out: delayacct_freepages_end(); - put_mems_allowed(); if (sc->nr_reclaimed) return sc->nr_reclaimed; @@ -2724,6 +2730,17 @@ loop_again: */ age_active_anon(zone, &sc, priority); + /* + * If the number of buffer_heads in the machine + * exceeds the maximum allowed level and this node + * has a highmem zone, force kswapd to reclaim from + * it to relieve lowmem pressure. + */ + if (buffer_heads_over_limit && is_highmem_idx(i)) { + end_zone = i; + break; + } + if (!zone_watermark_ok_safe(zone, order, high_wmark_pages(zone), 0, 0)) { end_zone = i; @@ -2753,7 +2770,7 @@ loop_again: */ for (i = 0; i <= end_zone; i++) { struct zone *zone = pgdat->node_zones + i; - int nr_slab; + int nr_slab, testorder; unsigned long balance_gap; if (!populated_zone(zone)) @@ -2786,7 +2803,21 @@ loop_again: (zone->present_pages + KSWAPD_ZONE_BALANCE_GAP_RATIO-1) / KSWAPD_ZONE_BALANCE_GAP_RATIO); - if (!zone_watermark_ok_safe(zone, order, + /* + * Kswapd reclaims only single pages with compaction + * enabled. Trying too hard to reclaim until contiguous + * free pages have become available can hurt performance + * by evicting too much useful data from memory. + * Do not reclaim more than needed for compaction. 
+ */ + testorder = order; + if (COMPACTION_BUILD && order && + compaction_suitable(zone, order) != + COMPACT_SKIPPED) + testorder = 0; + + if ((buffer_heads_over_limit && is_highmem_idx(i)) || + !zone_watermark_ok_safe(zone, testorder, high_wmark_pages(zone) + balance_gap, end_zone, 0)) { shrink_zone(priority, zone, &sc); @@ -2815,7 +2846,7 @@ loop_again: continue; } - if (!zone_watermark_ok_safe(zone, order, + if (!zone_watermark_ok_safe(zone, testorder, high_wmark_pages(zone), end_zone, 0)) { all_zones_ok = 0; /* @@ -2903,6 +2934,8 @@ out: * and it is potentially going to sleep here. */ if (order) { + int zones_need_compaction = 1; + for (i = 0; i <= end_zone; i++) { struct zone *zone = pgdat->node_zones + i; @@ -2912,6 +2945,11 @@ out: if (zone->all_unreclaimable && priority != DEF_PRIORITY) continue; + /* Would compaction fail due to lack of free memory? */ + if (COMPACTION_BUILD && + compaction_suitable(zone, order) == COMPACT_SKIPPED) + goto loop_again; + /* Confirm the zone is balanced for order-0 */ if (!zone_watermark_ok(zone, 0, high_wmark_pages(zone), 0, 0)) { @@ -2919,11 +2957,17 @@ out: goto loop_again; } + /* Check if the memory needs to be defragmented. */ + if (zone_watermark_ok(zone, order, + low_wmark_pages(zone), *classzone_idx, 0)) + zones_need_compaction = 0; + /* If balanced, clear the congested flag */ zone_clear_flag(zone, ZONE_CONGESTED); - if (i <= *classzone_idx) - balanced += zone->present_pages; } + + if (zones_need_compaction) + compact_pgdat(pgdat, order); } /* |