diff options
Diffstat (limited to 'mm')
-rw-r--r-- | mm/backing-dev.c | 52 | ||||
-rw-r--r-- | mm/compaction.c | 156 | ||||
-rw-r--r-- | mm/filemap.c | 7 | ||||
-rw-r--r-- | mm/huge_memory.c | 1 | ||||
-rw-r--r-- | mm/internal.h | 1 | ||||
-rw-r--r-- | mm/memblock.c | 2 | ||||
-rw-r--r-- | mm/memory_hotplug.c | 16 | ||||
-rw-r--r-- | mm/mempolicy.c | 2 | ||||
-rw-r--r-- | mm/mmap.c | 7 | ||||
-rw-r--r-- | mm/page-writeback.c | 1 | ||||
-rw-r--r-- | mm/page_alloc.c | 40 | ||||
-rw-r--r-- | mm/slab.c | 265 | ||||
-rw-r--r-- | mm/slab.h | 19 | ||||
-rw-r--r-- | mm/slab_common.c | 80 | ||||
-rw-r--r-- | mm/slob.c | 60 | ||||
-rw-r--r-- | mm/slub.c | 164 | ||||
-rw-r--r-- | mm/vmscan.c | 1 |
17 files changed, 447 insertions, 427 deletions
diff --git a/mm/backing-dev.c b/mm/backing-dev.c index 6b4718e2ee34..b41823cc05e6 100644 --- a/mm/backing-dev.c +++ b/mm/backing-dev.c @@ -39,12 +39,6 @@ DEFINE_SPINLOCK(bdi_lock); LIST_HEAD(bdi_list); LIST_HEAD(bdi_pending_list); -static struct task_struct *sync_supers_tsk; -static struct timer_list sync_supers_timer; - -static int bdi_sync_supers(void *); -static void sync_supers_timer_fn(unsigned long); - void bdi_lock_two(struct bdi_writeback *wb1, struct bdi_writeback *wb2) { if (wb1 < wb2) { @@ -250,12 +244,6 @@ static int __init default_bdi_init(void) { int err; - sync_supers_tsk = kthread_run(bdi_sync_supers, NULL, "sync_supers"); - BUG_ON(IS_ERR(sync_supers_tsk)); - - setup_timer(&sync_supers_timer, sync_supers_timer_fn, 0); - bdi_arm_supers_timer(); - err = bdi_init(&default_backing_dev_info); if (!err) bdi_register(&default_backing_dev_info, NULL, "default"); @@ -270,46 +258,6 @@ int bdi_has_dirty_io(struct backing_dev_info *bdi) return wb_has_dirty_io(&bdi->wb); } -/* - * kupdated() used to do this. We cannot do it from the bdi_forker_thread() - * or we risk deadlocking on ->s_umount. The longer term solution would be - * to implement sync_supers_bdi() or similar and simply do it from the - * bdi writeback thread individually. - */ -static int bdi_sync_supers(void *unused) -{ - set_user_nice(current, 0); - - while (!kthread_should_stop()) { - set_current_state(TASK_INTERRUPTIBLE); - schedule(); - - /* - * Do this periodically, like kupdated() did before. - */ - sync_supers(); - } - - return 0; -} - -void bdi_arm_supers_timer(void) -{ - unsigned long next; - - if (!dirty_writeback_interval) - return; - - next = msecs_to_jiffies(dirty_writeback_interval * 10) + jiffies; - mod_timer(&sync_supers_timer, round_jiffies_up(next)); -} - -static void sync_supers_timer_fn(unsigned long unused) -{ - wake_up_process(sync_supers_tsk); - bdi_arm_supers_timer(); -} - static void wakeup_timer_fn(unsigned long data) { struct backing_dev_info *bdi = (struct backing_dev_info *)data; diff --git a/mm/compaction.c b/mm/compaction.c index e78cb9688421..7fcd3a52e68d 100644 --- a/mm/compaction.c +++ b/mm/compaction.c @@ -51,6 +51,47 @@ static inline bool migrate_async_suitable(int migratetype) } /* + * Compaction requires the taking of some coarse locks that are potentially + * very heavily contended. Check if the process needs to be scheduled or + * if the lock is contended. For async compaction, back out in the event + * if contention is severe. For sync compaction, schedule. + * + * Returns true if the lock is held. + * Returns false if the lock is released and compaction should abort + */ +static bool compact_checklock_irqsave(spinlock_t *lock, unsigned long *flags, + bool locked, struct compact_control *cc) +{ + if (need_resched() || spin_is_contended(lock)) { + if (locked) { + spin_unlock_irqrestore(lock, *flags); + locked = false; + } + + /* async aborts if taking too long or contended */ + if (!cc->sync) { + if (cc->contended) + *cc->contended = true; + return false; + } + + cond_resched(); + if (fatal_signal_pending(current)) + return false; + } + + if (!locked) + spin_lock_irqsave(lock, *flags); + return true; +} + +static inline bool compact_trylock_irqsave(spinlock_t *lock, + unsigned long *flags, struct compact_control *cc) +{ + return compact_checklock_irqsave(lock, flags, false, cc); +} + +/* * Isolate free pages onto a private freelist. Caller must hold zone->lock. * If @strict is true, will abort returning 0 on any invalid PFNs or non-free * pages inside of the pageblock (even though it may still end up isolating @@ -173,7 +214,7 @@ isolate_freepages_range(unsigned long start_pfn, unsigned long end_pfn) } /* Update the number of anon and file isolated pages in the zone */ -static void acct_isolated(struct zone *zone, struct compact_control *cc) +static void acct_isolated(struct zone *zone, bool locked, struct compact_control *cc) { struct page *page; unsigned int count[2] = { 0, }; @@ -181,8 +222,14 @@ static void acct_isolated(struct zone *zone, struct compact_control *cc) list_for_each_entry(page, &cc->migratepages, lru) count[!!page_is_file_cache(page)]++; - __mod_zone_page_state(zone, NR_ISOLATED_ANON, count[0]); - __mod_zone_page_state(zone, NR_ISOLATED_FILE, count[1]); + /* If locked we can use the interrupt unsafe versions */ + if (locked) { + __mod_zone_page_state(zone, NR_ISOLATED_ANON, count[0]); + __mod_zone_page_state(zone, NR_ISOLATED_FILE, count[1]); + } else { + mod_zone_page_state(zone, NR_ISOLATED_ANON, count[0]); + mod_zone_page_state(zone, NR_ISOLATED_FILE, count[1]); + } } /* Similar to reclaim, but different enough that they don't share logic */ @@ -228,6 +275,8 @@ isolate_migratepages_range(struct zone *zone, struct compact_control *cc, struct list_head *migratelist = &cc->migratepages; isolate_mode_t mode = 0; struct lruvec *lruvec; + unsigned long flags; + bool locked; /* * Ensure that there are not too many pages isolated from the LRU @@ -247,25 +296,22 @@ isolate_migratepages_range(struct zone *zone, struct compact_control *cc, /* Time to isolate some pages for migration */ cond_resched(); - spin_lock_irq(&zone->lru_lock); + spin_lock_irqsave(&zone->lru_lock, flags); + locked = true; for (; low_pfn < end_pfn; low_pfn++) { struct page *page; - bool locked = true; /* give a chance to irqs before checking need_resched() */ if (!((low_pfn+1) % SWAP_CLUSTER_MAX)) { - spin_unlock_irq(&zone->lru_lock); + spin_unlock_irqrestore(&zone->lru_lock, flags); locked = false; } - if (need_resched() || spin_is_contended(&zone->lru_lock)) { - if (locked) - spin_unlock_irq(&zone->lru_lock); - cond_resched(); - spin_lock_irq(&zone->lru_lock); - if (fatal_signal_pending(current)) - break; - } else if (!locked) - spin_lock_irq(&zone->lru_lock); + + /* Check if it is ok to still hold the lock */ + locked = compact_checklock_irqsave(&zone->lru_lock, &flags, + locked, cc); + if (!locked) + break; /* * migrate_pfn does not necessarily start aligned to a @@ -349,9 +395,10 @@ isolate_migratepages_range(struct zone *zone, struct compact_control *cc, } } - acct_isolated(zone, cc); + acct_isolated(zone, locked, cc); - spin_unlock_irq(&zone->lru_lock); + if (locked) + spin_unlock_irqrestore(&zone->lru_lock, flags); trace_mm_compaction_isolate_migratepages(nr_scanned, nr_isolated); @@ -384,6 +431,20 @@ static bool suitable_migration_target(struct page *page) } /* + * Returns the start pfn of the last page block in a zone. This is the starting + * point for full compaction of a zone. Compaction searches for free pages from + * the end of each zone, while isolate_freepages_block scans forward inside each + * page block. + */ +static unsigned long start_free_pfn(struct zone *zone) +{ + unsigned long free_pfn; + free_pfn = zone->zone_start_pfn + zone->spanned_pages; + free_pfn &= ~(pageblock_nr_pages-1); + return free_pfn; +} + +/* * Based on information in the current compact_control, find blocks * suitable for isolating free pages from and then isolate them. */ @@ -422,17 +483,6 @@ static void isolate_freepages(struct zone *zone, pfn -= pageblock_nr_pages) { unsigned long isolated; - /* - * Skip ahead if another thread is compacting in the area - * simultaneously. If we wrapped around, we can only skip - * ahead if zone->compact_cached_free_pfn also wrapped to - * above our starting point. - */ - if (cc->order > 0 && (!cc->wrapped || - zone->compact_cached_free_pfn > - cc->start_free_pfn)) - pfn = min(pfn, zone->compact_cached_free_pfn); - if (!pfn_valid(pfn)) continue; @@ -458,7 +508,16 @@ static void isolate_freepages(struct zone *zone, * are disabled */ isolated = 0; - spin_lock_irqsave(&zone->lock, flags); + + /* + * The zone lock must be held to isolate freepages. This + * unfortunately this is a very coarse lock and can be + * heavily contended if there are parallel allocations + * or parallel compactions. For async compaction do not + * spin on the lock + */ + if (!compact_trylock_irqsave(&zone->lock, &flags, cc)) + break; if (suitable_migration_target(page)) { end_pfn = min(pfn + pageblock_nr_pages, zone_end_pfn); isolated = isolate_freepages_block(pfn, end_pfn, @@ -474,7 +533,15 @@ static void isolate_freepages(struct zone *zone, */ if (isolated) { high_pfn = max(high_pfn, pfn); - if (cc->order > 0) + + /* + * If the free scanner has wrapped, update + * compact_cached_free_pfn to point to the highest + * pageblock with free pages. This reduces excessive + * scanning of full pageblocks near the end of the + * zone + */ + if (cc->order > 0 && cc->wrapped) zone->compact_cached_free_pfn = high_pfn; } } @@ -484,6 +551,11 @@ static void isolate_freepages(struct zone *zone, cc->free_pfn = high_pfn; cc->nr_freepages = nr_freepages; + + /* If compact_cached_free_pfn is reset then set it now */ + if (cc->order > 0 && !cc->wrapped && + zone->compact_cached_free_pfn == start_free_pfn(zone)) + zone->compact_cached_free_pfn = high_pfn; } /* @@ -570,20 +642,6 @@ static isolate_migrate_t isolate_migratepages(struct zone *zone, return ISOLATE_SUCCESS; } -/* - * Returns the start pfn of the last page block in a zone. This is the starting - * point for full compaction of a zone. Compaction searches for free pages from - * the end of each zone, while isolate_freepages_block scans forward inside each - * page block. - */ -static unsigned long start_free_pfn(struct zone *zone) -{ - unsigned long free_pfn; - free_pfn = zone->zone_start_pfn + zone->spanned_pages; - free_pfn &= ~(pageblock_nr_pages-1); - return free_pfn; -} - static int compact_finished(struct zone *zone, struct compact_control *cc) { @@ -771,7 +829,7 @@ out: static unsigned long compact_zone_order(struct zone *zone, int order, gfp_t gfp_mask, - bool sync) + bool sync, bool *contended) { struct compact_control cc = { .nr_freepages = 0, @@ -780,6 +838,7 @@ static unsigned long compact_zone_order(struct zone *zone, .migratetype = allocflags_to_migratetype(gfp_mask), .zone = zone, .sync = sync, + .contended = contended, }; INIT_LIST_HEAD(&cc.freepages); INIT_LIST_HEAD(&cc.migratepages); @@ -801,7 +860,7 @@ int sysctl_extfrag_threshold = 500; */ unsigned long try_to_compact_pages(struct zonelist *zonelist, int order, gfp_t gfp_mask, nodemask_t *nodemask, - bool sync) + bool sync, bool *contended) { enum zone_type high_zoneidx = gfp_zone(gfp_mask); int may_enter_fs = gfp_mask & __GFP_FS; @@ -825,7 +884,8 @@ unsigned long try_to_compact_pages(struct zonelist *zonelist, nodemask) { int status; - status = compact_zone_order(zone, order, gfp_mask, sync); + status = compact_zone_order(zone, order, gfp_mask, sync, + contended); rc = max(status, rc); /* If a normal allocation would succeed, stop compacting */ @@ -861,7 +921,7 @@ static int __compact_pgdat(pg_data_t *pgdat, struct compact_control *cc) if (cc->order > 0) { int ok = zone_watermark_ok(zone, cc->order, low_wmark_pages(zone), 0, 0); - if (ok && cc->order > zone->compact_order_failed) + if (ok && cc->order >= zone->compact_order_failed) zone->compact_order_failed = cc->order + 1; /* Currently async compaction is never deferred. */ else if (!ok && cc->sync) diff --git a/mm/filemap.c b/mm/filemap.c index fa5ca304148e..384344575c37 100644 --- a/mm/filemap.c +++ b/mm/filemap.c @@ -1412,12 +1412,8 @@ generic_file_aio_read(struct kiocb *iocb, const struct iovec *iov, retval = filemap_write_and_wait_range(mapping, pos, pos + iov_length(iov, nr_segs) - 1); if (!retval) { - struct blk_plug plug; - - blk_start_plug(&plug); retval = mapping->a_ops->direct_IO(READ, iocb, iov, pos, nr_segs); - blk_finish_plug(&plug); } if (retval > 0) { *ppos = pos + retval; @@ -2527,14 +2523,12 @@ ssize_t generic_file_aio_write(struct kiocb *iocb, const struct iovec *iov, { struct file *file = iocb->ki_filp; struct inode *inode = file->f_mapping->host; - struct blk_plug plug; ssize_t ret; BUG_ON(iocb->ki_pos != pos); sb_start_write(inode->i_sb); mutex_lock(&inode->i_mutex); - blk_start_plug(&plug); ret = __generic_file_aio_write(iocb, iov, nr_segs, &iocb->ki_pos); mutex_unlock(&inode->i_mutex); @@ -2545,7 +2539,6 @@ ssize_t generic_file_aio_write(struct kiocb *iocb, const struct iovec *iov, if (err < 0 && ret > 0) ret = err; } - blk_finish_plug(&plug); sb_end_write(inode->i_sb); return ret; } diff --git a/mm/huge_memory.c b/mm/huge_memory.c index 57c4b9309015..141dbb695097 100644 --- a/mm/huge_memory.c +++ b/mm/huge_memory.c @@ -1811,7 +1811,6 @@ static void __collapse_huge_page_copy(pte_t *pte, struct page *page, src_page = pte_page(pteval); copy_user_highpage(page, src_page, address, vma); VM_BUG_ON(page_mapcount(src_page) != 1); - VM_BUG_ON(page_count(src_page) != 2); release_pte_page(src_page); /* * ptl mostly unnecessary, but preempt has to diff --git a/mm/internal.h b/mm/internal.h index 3314f79d775a..b8c91b342e24 100644 --- a/mm/internal.h +++ b/mm/internal.h @@ -130,6 +130,7 @@ struct compact_control { int order; /* order a direct compactor needs */ int migratetype; /* MOVABLE, RECLAIMABLE etc */ struct zone *zone; + bool *contended; /* True if a lock was contended */ }; unsigned long diff --git a/mm/memblock.c b/mm/memblock.c index 4d9393c7edc9..82aa349d2f7a 100644 --- a/mm/memblock.c +++ b/mm/memblock.c @@ -246,7 +246,7 @@ static int __init_memblock memblock_double_array(struct memblock_type *type, min(new_area_start, memblock.current_limit), new_alloc_size, PAGE_SIZE); - new_array = addr ? __va(addr) : 0; + new_array = addr ? __va(addr) : NULL; } if (!addr) { pr_err("memblock: Failed to double %s array from %ld to %ld entries !\n", diff --git a/mm/memory_hotplug.c b/mm/memory_hotplug.c index 3ad25f9d1fc1..6a5b90d0cfd7 100644 --- a/mm/memory_hotplug.c +++ b/mm/memory_hotplug.c @@ -126,9 +126,6 @@ static void register_page_bootmem_info_section(unsigned long start_pfn) struct mem_section *ms; struct page *page, *memmap; - if (!pfn_valid(start_pfn)) - return; - section_nr = pfn_to_section_nr(start_pfn); ms = __nr_to_section(section_nr); @@ -187,9 +184,16 @@ void register_page_bootmem_info_node(struct pglist_data *pgdat) end_pfn = pfn + pgdat->node_spanned_pages; /* register_section info */ - for (; pfn < end_pfn; pfn += PAGES_PER_SECTION) - register_page_bootmem_info_section(pfn); - + for (; pfn < end_pfn; pfn += PAGES_PER_SECTION) { + /* + * Some platforms can assign the same pfn to multiple nodes - on + * node0 as well as nodeN. To avoid registering a pfn against + * multiple nodes we check that this pfn does not already + * reside in some other node. + */ + if (pfn_valid(pfn) && (pfn_to_nid(pfn) == node)) + register_page_bootmem_info_section(pfn); + } } #endif /* !CONFIG_SPARSEMEM_VMEMMAP */ diff --git a/mm/mempolicy.c b/mm/mempolicy.c index bd92431d4c49..4ada3be6e252 100644 --- a/mm/mempolicy.c +++ b/mm/mempolicy.c @@ -2562,7 +2562,7 @@ int mpol_to_str(char *buffer, int maxlen, struct mempolicy *pol, int no_context) break; default: - BUG(); + return -EINVAL; } l = strlen(policy_modes[mode]); diff --git a/mm/mmap.c b/mm/mmap.c index e3e86914f11a..ae18a48e7e4e 100644 --- a/mm/mmap.c +++ b/mm/mmap.c @@ -1356,9 +1356,8 @@ out: } else if ((flags & MAP_POPULATE) && !(flags & MAP_NONBLOCK)) make_pages_present(addr, addr + len); - if (file && uprobe_mmap(vma)) - /* matching probes but cannot insert */ - goto unmap_and_free_vma; + if (file) + uprobe_mmap(vma); return addr; @@ -2309,7 +2308,7 @@ void exit_mmap(struct mm_struct *mm) } vm_unacct_memory(nr_accounted); - BUG_ON(mm->nr_ptes > (FIRST_USER_ADDRESS+PMD_SIZE-1)>>PMD_SHIFT); + WARN_ON(mm->nr_ptes > (FIRST_USER_ADDRESS+PMD_SIZE-1)>>PMD_SHIFT); } /* Insert vm structure into process list sorted by address diff --git a/mm/page-writeback.c b/mm/page-writeback.c index e5363f34e025..5ad5ce23c1e0 100644 --- a/mm/page-writeback.c +++ b/mm/page-writeback.c @@ -1532,7 +1532,6 @@ int dirty_writeback_centisecs_handler(ctl_table *table, int write, void __user *buffer, size_t *length, loff_t *ppos) { proc_dointvec(table, write, buffer, length, ppos); - bdi_arm_supers_timer(); return 0; } diff --git a/mm/page_alloc.c b/mm/page_alloc.c index 009ac285fea7..c13ea7538891 100644 --- a/mm/page_alloc.c +++ b/mm/page_alloc.c @@ -584,7 +584,7 @@ static inline void __free_one_page(struct page *page, combined_idx = buddy_idx & page_idx; higher_page = page + (combined_idx - page_idx); buddy_idx = __find_buddy_index(combined_idx, order + 1); - higher_buddy = page + (buddy_idx - combined_idx); + higher_buddy = higher_page + (buddy_idx - combined_idx); if (page_is_buddy(higher_page, higher_buddy, order + 1)) { list_add_tail(&page->lru, &zone->free_area[order].free_list[migratetype]); @@ -1928,6 +1928,17 @@ this_zone_full: zlc_active = 0; goto zonelist_scan; } + + if (page) + /* + * page->pfmemalloc is set when ALLOC_NO_WATERMARKS was + * necessary to allocate the page. The expectation is + * that the caller is taking steps that will free more + * memory. The caller should avoid the page being used + * for !PFMEMALLOC purposes. + */ + page->pfmemalloc = !!(alloc_flags & ALLOC_NO_WATERMARKS); + return page; } @@ -2091,7 +2102,7 @@ __alloc_pages_direct_compact(gfp_t gfp_mask, unsigned int order, struct zonelist *zonelist, enum zone_type high_zoneidx, nodemask_t *nodemask, int alloc_flags, struct zone *preferred_zone, int migratetype, bool sync_migration, - bool *deferred_compaction, + bool *contended_compaction, bool *deferred_compaction, unsigned long *did_some_progress) { struct page *page; @@ -2106,7 +2117,8 @@ __alloc_pages_direct_compact(gfp_t gfp_mask, unsigned int order, current->flags |= PF_MEMALLOC; *did_some_progress = try_to_compact_pages(zonelist, order, gfp_mask, - nodemask, sync_migration); + nodemask, sync_migration, + contended_compaction); current->flags &= ~PF_MEMALLOC; if (*did_some_progress != COMPACT_SKIPPED) { @@ -2152,7 +2164,7 @@ __alloc_pages_direct_compact(gfp_t gfp_mask, unsigned int order, struct zonelist *zonelist, enum zone_type high_zoneidx, nodemask_t *nodemask, int alloc_flags, struct zone *preferred_zone, int migratetype, bool sync_migration, - bool *deferred_compaction, + bool *contended_compaction, bool *deferred_compaction, unsigned long *did_some_progress) { return NULL; @@ -2325,6 +2337,7 @@ __alloc_pages_slowpath(gfp_t gfp_mask, unsigned int order, unsigned long did_some_progress; bool sync_migration = false; bool deferred_compaction = false; + bool contended_compaction = false; /* * In the slowpath, we sanity check order to avoid ever trying to @@ -2389,14 +2402,6 @@ rebalance: zonelist, high_zoneidx, nodemask, preferred_zone, migratetype); if (page) { - /* - * page->pfmemalloc is set when ALLOC_NO_WATERMARKS was - * necessary to allocate the page. The expectation is - * that the caller is taking steps that will free more - * memory. The caller should avoid the page being used - * for !PFMEMALLOC purposes. - */ - page->pfmemalloc = true; goto got_pg; } } @@ -2422,6 +2427,7 @@ rebalance: nodemask, alloc_flags, preferred_zone, migratetype, sync_migration, + &contended_compaction, &deferred_compaction, &did_some_progress); if (page) @@ -2431,10 +2437,11 @@ rebalance: /* * If compaction is deferred for high-order allocations, it is because * sync compaction recently failed. In this is the case and the caller - * has requested the system not be heavily disrupted, fail the - * allocation now instead of entering direct reclaim + * requested a movable allocation that does not heavily disrupt the + * system then fail the allocation instead of entering direct reclaim. */ - if (deferred_compaction && (gfp_mask & __GFP_NO_KSWAPD)) + if ((deferred_compaction || contended_compaction) && + (gfp_mask & __GFP_NO_KSWAPD)) goto nopage; /* Try direct reclaim and then allocating */ @@ -2505,6 +2512,7 @@ rebalance: nodemask, alloc_flags, preferred_zone, migratetype, sync_migration, + &contended_compaction, &deferred_compaction, &did_some_progress); if (page) @@ -2569,8 +2577,6 @@ retry_cpuset: page = __alloc_pages_slowpath(gfp_mask, order, zonelist, high_zoneidx, nodemask, preferred_zone, migratetype); - else - page->pfmemalloc = false; trace_mm_page_alloc(page, order, gfp_mask, migratetype); diff --git a/mm/slab.c b/mm/slab.c index ca3849fe0584..87c55b0e3e0e 100644 --- a/mm/slab.c +++ b/mm/slab.c @@ -570,9 +570,9 @@ static struct arraycache_init initarray_generic = { {0, BOOT_CPUCACHE_ENTRIES, 1, 0} }; /* internal cache of cache description objs */ -static struct kmem_list3 *cache_cache_nodelists[MAX_NUMNODES]; -static struct kmem_cache cache_cache = { - .nodelists = cache_cache_nodelists, +static struct kmem_list3 *kmem_cache_nodelists[MAX_NUMNODES]; +static struct kmem_cache kmem_cache_boot = { + .nodelists = kmem_cache_nodelists, .batchcount = 1, .limit = BOOT_CPUCACHE_ENTRIES, .shared = 1, @@ -795,6 +795,7 @@ static void cache_estimate(unsigned long gfporder, size_t buffer_size, *left_over = slab_size - nr_objs*buffer_size - mgmt_size; } +#if DEBUG #define slab_error(cachep, msg) __slab_error(__func__, cachep, msg) static void __slab_error(const char *function, struct kmem_cache *cachep, @@ -805,6 +806,7 @@ static void __slab_error(const char *function, struct kmem_cache *cachep, dump_stack(); add_taint(TAINT_BAD_PAGE); } +#endif /* * By default on NUMA we use alien caches to stage the freeing of @@ -969,7 +971,7 @@ static void *__ac_get_obj(struct kmem_cache *cachep, struct array_cache *ac, } /* The caller cannot use PFMEMALLOC objects, find another one */ - for (i = 1; i < ac->avail; i++) { + for (i = 0; i < ac->avail; i++) { /* If a !PFMEMALLOC object is found, swap them */ if (!is_obj_pfmemalloc(ac->entry[i])) { objp = ac->entry[i]; @@ -986,7 +988,7 @@ static void *__ac_get_obj(struct kmem_cache *cachep, struct array_cache *ac, l3 = cachep->nodelists[numa_mem_id()]; if (!list_empty(&l3->slabs_free) && force_refill) { struct slab *slabp = virt_to_slab(objp); - ClearPageSlabPfmemalloc(virt_to_page(slabp->s_mem)); + ClearPageSlabPfmemalloc(virt_to_head_page(slabp->s_mem)); clear_obj_pfmemalloc(&objp); recheck_pfmemalloc_active(cachep, ac); return objp; @@ -1018,7 +1020,7 @@ static void *__ac_put_obj(struct kmem_cache *cachep, struct array_cache *ac, { if (unlikely(pfmemalloc_active)) { /* Some pfmemalloc slabs exist, check if this is one */ - struct page *page = virt_to_page(objp); + struct page *page = virt_to_head_page(objp); if (PageSlabPfmemalloc(page)) set_obj_pfmemalloc(&objp); } @@ -1587,15 +1589,17 @@ void __init kmem_cache_init(void) int order; int node; + kmem_cache = &kmem_cache_boot; + if (num_possible_nodes() == 1) use_alien_caches = 0; for (i = 0; i < NUM_INIT_LISTS; i++) { kmem_list3_init(&initkmem_list3[i]); if (i < MAX_NUMNODES) - cache_cache.nodelists[i] = NULL; + kmem_cache->nodelists[i] = NULL; } - set_up_list3s(&cache_cache, CACHE_CACHE); + set_up_list3s(kmem_cache, CACHE_CACHE); /* * Fragmentation resistance on low memory - only use bigger @@ -1607,9 +1611,9 @@ void __init kmem_cache_init(void) /* Bootstrap is tricky, because several objects are allocated * from caches that do not exist yet: - * 1) initialize the cache_cache cache: it contains the struct - * kmem_cache structures of all caches, except cache_cache itself: - * cache_cache is statically allocated. + * 1) initialize the kmem_cache cache: it contains the struct + * kmem_cache structures of all caches, except kmem_cache itself: + * kmem_cache is statically allocated. * Initially an __init data area is used for the head array and the * kmem_list3 structures, it's replaced with a kmalloc allocated * array at the end of the bootstrap. @@ -1618,43 +1622,43 @@ void __init kmem_cache_init(void) * An __init data area is used for the head array. * 3) Create the remaining kmalloc caches, with minimally sized * head arrays. - * 4) Replace the __init data head arrays for cache_cache and the first + * 4) Replace the __init data head arrays for kmem_cache and the first * kmalloc cache with kmalloc allocated arrays. - * 5) Replace the __init data for kmem_list3 for cache_cache and + * 5) Replace the __init data for kmem_list3 for kmem_cache and * the other cache's with kmalloc allocated memory. * 6) Resize the head arrays of the kmalloc caches to their final sizes. */ node = numa_mem_id(); - /* 1) create the cache_cache */ + /* 1) create the kmem_cache */ INIT_LIST_HEAD(&slab_caches); - list_add(&cache_cache.list, &slab_caches); - cache_cache.colour_off = cache_line_size(); - cache_cache.array[smp_processor_id()] = &initarray_cache.cache; - cache_cache.nodelists[node] = &initkmem_list3[CACHE_CACHE + node]; + list_add(&kmem_cache->list, &slab_caches); + kmem_cache->colour_off = cache_line_size(); + kmem_cache->array[smp_processor_id()] = &initarray_cache.cache; + kmem_cache->nodelists[node] = &initkmem_list3[CACHE_CACHE + node]; /* * struct kmem_cache size depends on nr_node_ids & nr_cpu_ids */ - cache_cache.size = offsetof(struct kmem_cache, array[nr_cpu_ids]) + + kmem_cache->size = offsetof(struct kmem_cache, array[nr_cpu_ids]) + nr_node_ids * sizeof(struct kmem_list3 *); - cache_cache.object_size = cache_cache.size; - cache_cache.size = ALIGN(cache_cache.size, + kmem_cache->object_size = kmem_cache->size; + kmem_cache->size = ALIGN(kmem_cache->object_size, cache_line_size()); - cache_cache.reciprocal_buffer_size = - reciprocal_value(cache_cache.size); + kmem_cache->reciprocal_buffer_size = + reciprocal_value(kmem_cache->size); for (order = 0; order < MAX_ORDER; order++) { - cache_estimate(order, cache_cache.size, - cache_line_size(), 0, &left_over, &cache_cache.num); - if (cache_cache.num) + cache_estimate(order, kmem_cache->size, + cache_line_size(), 0, &left_over, &kmem_cache->num); + if (kmem_cache->num) break; } - BUG_ON(!cache_cache.num); - cache_cache.gfporder = order; - cache_cache.colour = left_over / cache_cache.colour_off; - cache_cache.slab_size = ALIGN(cache_cache.num * sizeof(kmem_bufctl_t) + + BUG_ON(!kmem_cache->num); + kmem_cache->gfporder = order; + kmem_cache->colour = left_over / kmem_cache->colour_off; + kmem_cache->slab_size = ALIGN(kmem_cache->num * sizeof(kmem_bufctl_t) + sizeof(struct slab), cache_line_size()); /* 2+3) create the kmalloc caches */ @@ -1667,19 +1671,22 @@ void __init kmem_cache_init(void) * bug. */ - sizes[INDEX_AC].cs_cachep = __kmem_cache_create(names[INDEX_AC].name, - sizes[INDEX_AC].cs_size, - ARCH_KMALLOC_MINALIGN, - ARCH_KMALLOC_FLAGS|SLAB_PANIC, - NULL); + sizes[INDEX_AC].cs_cachep = kmem_cache_zalloc(kmem_cache, GFP_NOWAIT); + sizes[INDEX_AC].cs_cachep->name = names[INDEX_AC].name; + sizes[INDEX_AC].cs_cachep->size = sizes[INDEX_AC].cs_size; + sizes[INDEX_AC].cs_cachep->object_size = sizes[INDEX_AC].cs_size; + sizes[INDEX_AC].cs_cachep->align = ARCH_KMALLOC_MINALIGN; + __kmem_cache_create(sizes[INDEX_AC].cs_cachep, ARCH_KMALLOC_FLAGS|SLAB_PANIC); + list_add(&sizes[INDEX_AC].cs_cachep->list, &slab_caches); if (INDEX_AC != INDEX_L3) { - sizes[INDEX_L3].cs_cachep = - __kmem_cache_create(names[INDEX_L3].name, - sizes[INDEX_L3].cs_size, - ARCH_KMALLOC_MINALIGN, - ARCH_KMALLOC_FLAGS|SLAB_PANIC, - NULL); + sizes[INDEX_L3].cs_cachep = kmem_cache_zalloc(kmem_cache, GFP_NOWAIT); + sizes[INDEX_L3].cs_cachep->name = names[INDEX_L3].name; + sizes[INDEX_L3].cs_cachep->size = sizes[INDEX_L3].cs_size; + sizes[INDEX_L3].cs_cachep->object_size = sizes[INDEX_L3].cs_size; + sizes[INDEX_L3].cs_cachep->align = ARCH_KMALLOC_MINALIGN; + __kmem_cache_create(sizes[INDEX_L3].cs_cachep, ARCH_KMALLOC_FLAGS|SLAB_PANIC); + list_add(&sizes[INDEX_L3].cs_cachep->list, &slab_caches); } slab_early_init = 0; @@ -1693,20 +1700,23 @@ void __init kmem_cache_init(void) * allow tighter packing of the smaller caches. */ if (!sizes->cs_cachep) { - sizes->cs_cachep = __kmem_cache_create(names->name, - sizes->cs_size, - ARCH_KMALLOC_MINALIGN, - ARCH_KMALLOC_FLAGS|SLAB_PANIC, - NULL); + sizes->cs_cachep = kmem_cache_zalloc(kmem_cache, GFP_NOWAIT); + sizes->cs_cachep->name = names->name; + sizes->cs_cachep->size = sizes->cs_size; + sizes->cs_cachep->object_size = sizes->cs_size; + sizes->cs_cachep->align = ARCH_KMALLOC_MINALIGN; + __kmem_cache_create(sizes->cs_cachep, ARCH_KMALLOC_FLAGS|SLAB_PANIC); + list_add(&sizes->cs_cachep->list, &slab_caches); } #ifdef CONFIG_ZONE_DMA - sizes->cs_dmacachep = __kmem_cache_create( - names->name_dma, - sizes->cs_size, - ARCH_KMALLOC_MINALIGN, - ARCH_KMALLOC_FLAGS|SLAB_CACHE_DMA| - SLAB_PANIC, - NULL); + sizes->cs_dmacachep = kmem_cache_zalloc(kmem_cache, GFP_NOWAIT); + sizes->cs_dmacachep->name = names->name_dma; + sizes->cs_dmacachep->size = sizes->cs_size; + sizes->cs_dmacachep->object_size = sizes->cs_size; + sizes->cs_dmacachep->align = ARCH_KMALLOC_MINALIGN; + __kmem_cache_create(sizes->cs_dmacachep, + ARCH_KMALLOC_FLAGS|SLAB_CACHE_DMA| SLAB_PANIC); + list_add(&sizes->cs_dmacachep->list, &slab_caches); #endif sizes++; names++; @@ -1717,15 +1727,15 @@ void __init kmem_cache_init(void) ptr = kmalloc(sizeof(struct arraycache_init), GFP_NOWAIT); - BUG_ON(cpu_cache_get(&cache_cache) != &initarray_cache.cache); - memcpy(ptr, cpu_cache_get(&cache_cache), + BUG_ON(cpu_cache_get(kmem_cache) != &initarray_cache.cache); + memcpy(ptr, cpu_cache_get(kmem_cache), sizeof(struct arraycache_init)); /* * Do not assume that spinlocks can be initialized via memcpy: */ spin_lock_init(&ptr->lock); - cache_cache.array[smp_processor_id()] = ptr; + kmem_cache->array[smp_processor_id()] = ptr; ptr = kmalloc(sizeof(struct arraycache_init), GFP_NOWAIT); @@ -1746,7 +1756,7 @@ void __init kmem_cache_init(void) int nid; for_each_online_node(nid) { - init_list(&cache_cache, &initkmem_list3[CACHE_CACHE + nid], nid); + init_list(kmem_cache, &initkmem_list3[CACHE_CACHE + nid], nid); init_list(malloc_sizes[INDEX_AC].cs_cachep, &initkmem_list3[SIZE_AC + nid], nid); @@ -2195,27 +2205,6 @@ static void slab_destroy(struct kmem_cache *cachep, struct slab *slabp) } } -static void __kmem_cache_destroy(struct kmem_cache *cachep) -{ - int i; - struct kmem_list3 *l3; - - for_each_online_cpu(i) - kfree(cachep->array[i]); - - /* NUMA: free the list3 structures */ - for_each_online_node(i) { - l3 = cachep->nodelists[i]; - if (l3) { - kfree(l3->shared); - free_alien_cache(l3->alien); - kfree(l3); - } - } - kmem_cache_free(&cache_cache, cachep); -} - - /** * calculate_slab_order - calculate size (page order) of slabs * @cachep: pointer to the cache that is being created @@ -2352,9 +2341,6 @@ static int __init_refok setup_cpu_cache(struct kmem_cache *cachep, gfp_t gfp) * Cannot be called within a int, but can be interrupted. * The @ctor is run when new pages are allocated by the cache. * - * @name must be valid until the cache is destroyed. This implies that - * the module calling this has to destroy the cache before getting unloaded. - * * The flags are * * %SLAB_POISON - Poison the slab with a known test pattern (a5a5a5a5) @@ -2367,13 +2353,13 @@ static int __init_refok setup_cpu_cache(struct kmem_cache *cachep, gfp_t gfp) * cacheline. This can be beneficial if you're counting cycles as closely * as davem. */ -struct kmem_cache * -__kmem_cache_create (const char *name, size_t size, size_t align, - unsigned long flags, void (*ctor)(void *)) +int +__kmem_cache_create (struct kmem_cache *cachep, unsigned long flags) { size_t left_over, slab_size, ralign; - struct kmem_cache *cachep = NULL; gfp_t gfp; + int err; + size_t size = cachep->size; #if DEBUG #if FORCED_DEBUG @@ -2445,8 +2431,8 @@ __kmem_cache_create (const char *name, size_t size, size_t align, ralign = ARCH_SLAB_MINALIGN; } /* 3) caller mandated alignment */ - if (ralign < align) { - ralign = align; + if (ralign < cachep->align) { + ralign = cachep->align; } /* disable debug if necessary */ if (ralign > __alignof__(unsigned long long)) @@ -2454,21 +2440,14 @@ __kmem_cache_create (const char *name, size_t size, size_t align, /* * 4) Store it. */ - align = ralign; + cachep->align = ralign; if (slab_is_available()) gfp = GFP_KERNEL; else gfp = GFP_NOWAIT; - /* Get cache's description obj. */ - cachep = kmem_cache_zalloc(&cache_cache, gfp); - if (!cachep) - return NULL; - cachep->nodelists = (struct kmem_list3 **)&cachep->array[nr_cpu_ids]; - cachep->object_size = size; - cachep->align = align; #if DEBUG /* @@ -2492,8 +2471,9 @@ __kmem_cache_create (const char *name, size_t size, size_t align, } #if FORCED_DEBUG && defined(CONFIG_DEBUG_PAGEALLOC) if (size >= malloc_sizes[INDEX_L3 + 1].cs_size - && cachep->object_size > cache_line_size() && ALIGN(size, align) < PAGE_SIZE) { - cachep->obj_offset += PAGE_SIZE - ALIGN(size, align); + && cachep->object_size > cache_line_size() + && ALIGN(size, cachep->align) < PAGE_SIZE) { + cachep->obj_offset += PAGE_SIZE - ALIGN(size, cachep->align); size = PAGE_SIZE; } #endif @@ -2513,18 +2493,15 @@ __kmem_cache_create (const char *name, size_t size, size_t align, */ flags |= CFLGS_OFF_SLAB; - size = ALIGN(size, align); + size = ALIGN(size, cachep->align); - left_over = calculate_slab_order(cachep, size, align, flags); + left_over = calculate_slab_order(cachep, size, cachep->align, flags); + + if (!cachep->num) + return -E2BIG; - if (!cachep->num) { - printk(KERN_ERR - "kmem_cache_create: couldn't create cache %s.\n", name); - kmem_cache_free(&cache_cache, cachep); - return NULL; - } slab_size = ALIGN(cachep->num * sizeof(kmem_bufctl_t) - + sizeof(struct slab), align); + + sizeof(struct slab), cachep->align); /* * If the slab has been placed off-slab, and we have enough space then @@ -2552,8 +2529,8 @@ __kmem_cache_create (const char *name, size_t size, size_t align, cachep->colour_off = cache_line_size(); /* Offset must be a multiple of the alignment. */ - if (cachep->colour_off < align) - cachep->colour_off = align; + if (cachep->colour_off < cachep->align) + cachep->colour_off = cachep->align; cachep->colour = left_over / cachep->colour_off; cachep->slab_size = slab_size; cachep->flags = flags; @@ -2574,12 +2551,11 @@ __kmem_cache_create (const char *name, size_t size, size_t align, */ BUG_ON(ZERO_OR_NULL_PTR(cachep->slabp_cache)); } - cachep->ctor = ctor; - cachep->name = name; - if (setup_cpu_cache(cachep, gfp)) { - __kmem_cache_destroy(cachep); - return NULL; + err = setup_cpu_cache(cachep, gfp); + if (err) { + __kmem_cache_shutdown(cachep); + return err; } if (flags & SLAB_DEBUG_OBJECTS) { @@ -2592,9 +2568,7 @@ __kmem_cache_create (const char *name, size_t size, size_t align, slab_set_debugobj_lock_classes(cachep); } - /* cache setup completed, link it into the list */ - list_add(&cachep->list, &slab_caches); - return cachep; + return 0; } #if DEBUG @@ -2753,49 +2727,29 @@ int kmem_cache_shrink(struct kmem_cache *cachep) } EXPORT_SYMBOL(kmem_cache_shrink); -/** - * kmem_cache_destroy - delete a cache - * @cachep: the cache to destroy - * - * Remove a &struct kmem_cache object from the slab cache. - * - * It is expected this function will be called by a module when it is - * unloaded. This will remove the cache completely, and avoid a duplicate - * cache being allocated each time a module is loaded and unloaded, if the - * module doesn't have persistent in-kernel storage across loads and unloads. - * - * The cache must be empty before calling this function. - * - * The caller must guarantee that no one will allocate memory from the cache - * during the kmem_cache_destroy(). - */ -void kmem_cache_destroy(struct kmem_cache *cachep) +int __kmem_cache_shutdown(struct kmem_cache *cachep) { - BUG_ON(!cachep || in_interrupt()); + int i; + struct kmem_list3 *l3; + int rc = __cache_shrink(cachep); - /* Find the cache in the chain of caches. */ - get_online_cpus(); - mutex_lock(&slab_mutex); - /* - * the chain is never empty, cache_cache is never destroyed - */ - list_del(&cachep->list); - if (__cache_shrink(cachep)) { - slab_error(cachep, "Can't free all objects"); - list_add(&cachep->list, &slab_caches); - mutex_unlock(&slab_mutex); - put_online_cpus(); - return; - } + if (rc) + return rc; - if (unlikely(cachep->flags & SLAB_DESTROY_BY_RCU)) - rcu_barrier(); + for_each_online_cpu(i) + kfree(cachep->array[i]); - __kmem_cache_destroy(cachep); - mutex_unlock(&slab_mutex); - put_online_cpus(); + /* NUMA: free the list3 structures */ + for_each_online_node(i) { + l3 = cachep->nodelists[i]; + if (l3) { + kfree(l3->shared); + free_alien_cache(l3->alien); + kfree(l3); + } + } + return 0; } -EXPORT_SYMBOL(kmem_cache_destroy); /* * Get the memory for a slab management obj. @@ -3246,6 +3200,7 @@ force_grow: /* cache_grow can reenable interrupts, then ac could change. */ ac = cpu_cache_get(cachep); + node = numa_mem_id(); /* no objects in sight? abort */ if (!x && (ac->avail == 0 || force_refill)) @@ -3328,7 +3283,7 @@ static void *cache_alloc_debugcheck_after(struct kmem_cache *cachep, static bool slab_should_failslab(struct kmem_cache *cachep, gfp_t flags) { - if (cachep == &cache_cache) + if (cachep == kmem_cache) return false; return should_failslab(cachep->object_size, flags, cachep->flags); diff --git a/mm/slab.h b/mm/slab.h index db7848caaa25..7deeb449a301 100644 --- a/mm/slab.h +++ b/mm/slab.h @@ -25,9 +25,26 @@ extern enum slab_state slab_state; /* The slab cache mutex protects the management structures during changes */ extern struct mutex slab_mutex; + +/* The list of all slab caches on the system */ extern struct list_head slab_caches; -struct kmem_cache *__kmem_cache_create(const char *name, size_t size, +/* The slab cache that manages slab cache information */ +extern struct kmem_cache *kmem_cache; + +/* Functions provided by the slab allocators */ +extern int __kmem_cache_create(struct kmem_cache *, unsigned long flags); + +#ifdef CONFIG_SLUB +struct kmem_cache *__kmem_cache_alias(const char *name, size_t size, size_t align, unsigned long flags, void (*ctor)(void *)); +#else +static inline struct kmem_cache *__kmem_cache_alias(const char *name, size_t size, + size_t align, unsigned long flags, void (*ctor)(void *)) +{ return NULL; } +#endif + + +int __kmem_cache_shutdown(struct kmem_cache *); #endif diff --git a/mm/slab_common.c b/mm/slab_common.c index 8cf8b4962d6c..9c217255ac49 100644 --- a/mm/slab_common.c +++ b/mm/slab_common.c @@ -22,6 +22,7 @@ enum slab_state slab_state; LIST_HEAD(slab_caches); DEFINE_MUTEX(slab_mutex); +struct kmem_cache *kmem_cache; #ifdef CONFIG_DEBUG_VM static int kmem_cache_sanity_check(const char *name, size_t size) @@ -98,21 +99,92 @@ struct kmem_cache *kmem_cache_create(const char *name, size_t size, size_t align unsigned long flags, void (*ctor)(void *)) { struct kmem_cache *s = NULL; + int err = 0; get_online_cpus(); mutex_lock(&slab_mutex); - if (kmem_cache_sanity_check(name, size) == 0) - s = __kmem_cache_create(name, size, align, flags, ctor); + + if (!kmem_cache_sanity_check(name, size) == 0) + goto out_locked; + + + s = __kmem_cache_alias(name, size, align, flags, ctor); + if (s) + goto out_locked; + + s = kmem_cache_zalloc(kmem_cache, GFP_KERNEL); + if (s) { + s->object_size = s->size = size; + s->align = align; + s->ctor = ctor; + s->name = kstrdup(name, GFP_KERNEL); + if (!s->name) { + kmem_cache_free(kmem_cache, s); + err = -ENOMEM; + goto out_locked; + } + + err = __kmem_cache_create(s, flags); + if (!err) { + + s->refcount = 1; + list_add(&s->list, &slab_caches); + + } else { + kfree(s->name); + kmem_cache_free(kmem_cache, s); + } + } else + err = -ENOMEM; + +out_locked: mutex_unlock(&slab_mutex); put_online_cpus(); - if (!s && (flags & SLAB_PANIC)) - panic("kmem_cache_create: Failed to create slab '%s'\n", name); + if (err) { + + if (flags & SLAB_PANIC) + panic("kmem_cache_create: Failed to create slab '%s'. Error %d\n", + name, err); + else { + printk(KERN_WARNING "kmem_cache_create(%s) failed with error %d", + name, err); + dump_stack(); + } + + return NULL; + } return s; } EXPORT_SYMBOL(kmem_cache_create); +void kmem_cache_destroy(struct kmem_cache *s) +{ + get_online_cpus(); + mutex_lock(&slab_mutex); + s->refcount--; + if (!s->refcount) { + list_del(&s->list); + + if (!__kmem_cache_shutdown(s)) { + if (s->flags & SLAB_DESTROY_BY_RCU) + rcu_barrier(); + + kfree(s->name); + kmem_cache_free(kmem_cache, s); + } else { + list_add(&s->list, &slab_caches); + printk(KERN_ERR "kmem_cache_destroy %s: Slab cache still has objects\n", + s->name); + dump_stack(); + } + } + mutex_unlock(&slab_mutex); + put_online_cpus(); +} +EXPORT_SYMBOL(kmem_cache_destroy); + int slab_is_available(void) { return slab_state >= UP; diff --git a/mm/slob.c b/mm/slob.c index 8c00d22afc9e..a08e4681fd0d 100644 --- a/mm/slob.c +++ b/mm/slob.c @@ -529,44 +529,24 @@ size_t ksize(const void *block) } EXPORT_SYMBOL(ksize); -struct kmem_cache *__kmem_cache_create(const char *name, size_t size, - size_t align, unsigned long flags, void (*ctor)(void *)) +int __kmem_cache_create(struct kmem_cache *c, unsigned long flags) { - struct kmem_cache *c; + size_t align = c->size; - c = slob_alloc(sizeof(struct kmem_cache), - GFP_KERNEL, ARCH_KMALLOC_MINALIGN, NUMA_NO_NODE); - - if (c) { - c->name = name; - c->size = size; - if (flags & SLAB_DESTROY_BY_RCU) { - /* leave room for rcu footer at the end of object */ - c->size += sizeof(struct slob_rcu); - } - c->flags = flags; - c->ctor = ctor; - /* ignore alignment unless it's forced */ - c->align = (flags & SLAB_HWCACHE_ALIGN) ? SLOB_ALIGN : 0; - if (c->align < ARCH_SLAB_MINALIGN) - c->align = ARCH_SLAB_MINALIGN; - if (c->align < align) - c->align = align; - - kmemleak_alloc(c, sizeof(struct kmem_cache), 1, GFP_KERNEL); - c->refcount = 1; + if (flags & SLAB_DESTROY_BY_RCU) { + /* leave room for rcu footer at the end of object */ + c->size += sizeof(struct slob_rcu); } - return c; -} + c->flags = flags; + /* ignore alignment unless it's forced */ + c->align = (flags & SLAB_HWCACHE_ALIGN) ? SLOB_ALIGN : 0; + if (c->align < ARCH_SLAB_MINALIGN) + c->align = ARCH_SLAB_MINALIGN; + if (c->align < align) + c->align = align; -void kmem_cache_destroy(struct kmem_cache *c) -{ - kmemleak_free(c); - if (c->flags & SLAB_DESTROY_BY_RCU) - rcu_barrier(); - slob_free(c, sizeof(struct kmem_cache)); + return 0; } -EXPORT_SYMBOL(kmem_cache_destroy); void *kmem_cache_alloc_node(struct kmem_cache *c, gfp_t flags, int node) { @@ -634,14 +614,28 @@ unsigned int kmem_cache_size(struct kmem_cache *c) } EXPORT_SYMBOL(kmem_cache_size); +int __kmem_cache_shutdown(struct kmem_cache *c) +{ + /* No way to check for remaining objects */ + return 0; +} + int kmem_cache_shrink(struct kmem_cache *d) { return 0; } EXPORT_SYMBOL(kmem_cache_shrink); +struct kmem_cache kmem_cache_boot = { + .name = "kmem_cache", + .size = sizeof(struct kmem_cache), + .flags = SLAB_PANIC, + .align = ARCH_KMALLOC_MINALIGN, +}; + void __init kmem_cache_init(void) { + kmem_cache = &kmem_cache_boot; slab_state = UP; } diff --git a/mm/slub.c b/mm/slub.c index f074f756405a..a0d698467f70 100644 --- a/mm/slub.c +++ b/mm/slub.c @@ -210,11 +210,7 @@ static void sysfs_slab_remove(struct kmem_cache *); static inline int sysfs_slab_add(struct kmem_cache *s) { return 0; } static inline int sysfs_slab_alias(struct kmem_cache *s, const char *p) { return 0; } -static inline void sysfs_slab_remove(struct kmem_cache *s) -{ - kfree(s->name); - kfree(s); -} +static inline void sysfs_slab_remove(struct kmem_cache *s) { } #endif @@ -626,7 +622,7 @@ static void object_err(struct kmem_cache *s, struct page *page, print_trailer(s, page, object); } -static void slab_err(struct kmem_cache *s, struct page *page, char *fmt, ...) +static void slab_err(struct kmem_cache *s, struct page *page, const char *fmt, ...) { va_list args; char buf[100]; @@ -1531,12 +1527,13 @@ static inline void *acquire_slab(struct kmem_cache *s, } static int put_cpu_partial(struct kmem_cache *s, struct page *page, int drain); +static inline bool pfmemalloc_match(struct page *page, gfp_t gfpflags); /* * Try to allocate a partial slab from a specific node. */ -static void *get_partial_node(struct kmem_cache *s, - struct kmem_cache_node *n, struct kmem_cache_cpu *c) +static void *get_partial_node(struct kmem_cache *s, struct kmem_cache_node *n, + struct kmem_cache_cpu *c, gfp_t flags) { struct page *page, *page2; void *object = NULL; @@ -1552,9 +1549,13 @@ static void *get_partial_node(struct kmem_cache *s, spin_lock(&n->list_lock); list_for_each_entry_safe(page, page2, &n->partial, lru) { - void *t = acquire_slab(s, n, page, object == NULL); + void *t; int available; + if (!pfmemalloc_match(page, flags)) + continue; + + t = acquire_slab(s, n, page, object == NULL); if (!t) break; @@ -1621,7 +1622,7 @@ static void *get_any_partial(struct kmem_cache *s, gfp_t flags, if (n && cpuset_zone_allowed_hardwall(zone, flags) && n->nr_partial > s->min_partial) { - object = get_partial_node(s, n, c); + object = get_partial_node(s, n, c, flags); if (object) { /* * Return the object even if @@ -1650,7 +1651,7 @@ static void *get_partial(struct kmem_cache *s, gfp_t flags, int node, void *object; int searchnode = (node == NUMA_NO_NODE) ? numa_node_id() : node; - object = get_partial_node(s, get_node(s, searchnode), c); + object = get_partial_node(s, get_node(s, searchnode), c, flags); if (object || node != NUMA_NO_NODE) return object; @@ -1716,7 +1717,7 @@ static inline void note_cmpxchg_failure(const char *n, stat(s, CMPXCHG_DOUBLE_CPU_FAIL); } -void init_kmem_cache_cpus(struct kmem_cache *s) +static void init_kmem_cache_cpus(struct kmem_cache *s) { int cpu; @@ -1941,7 +1942,7 @@ static void unfreeze_partials(struct kmem_cache *s) * If we did not find a slot then simply move all the partials to the * per node partial list. */ -int put_cpu_partial(struct kmem_cache *s, struct page *page, int drain) +static int put_cpu_partial(struct kmem_cache *s, struct page *page, int drain) { struct page *oldpage; int pages; @@ -2622,6 +2623,13 @@ void kmem_cache_free(struct kmem_cache *s, void *x) page = virt_to_head_page(x); + if (kmem_cache_debug(s) && page->slab != s) { + pr_err("kmem_cache_free: Wrong slab cache. %s but object" + " is from %s\n", page->slab->name, s->name); + WARN_ON_ONCE(1); + return; + } + slab_free(s, page, x, _RET_IP_); trace_kmem_cache_free(_RET_IP_, x); @@ -3036,17 +3044,9 @@ static int calculate_sizes(struct kmem_cache *s, int forced_order) } -static int kmem_cache_open(struct kmem_cache *s, - const char *name, size_t size, - size_t align, unsigned long flags, - void (*ctor)(void *)) +static int kmem_cache_open(struct kmem_cache *s, unsigned long flags) { - memset(s, 0, kmem_size); - s->name = name; - s->ctor = ctor; - s->object_size = size; - s->align = align; - s->flags = kmem_cache_flags(size, flags, name, ctor); + s->flags = kmem_cache_flags(s->size, flags, s->name, s->ctor); s->reserved = 0; if (need_reserve_slab_rcu && (s->flags & SLAB_DESTROY_BY_RCU)) @@ -3108,7 +3108,6 @@ static int kmem_cache_open(struct kmem_cache *s, else s->cpu_partial = 30; - s->refcount = 1; #ifdef CONFIG_NUMA s->remote_node_defrag_ratio = 1000; #endif @@ -3116,16 +3115,16 @@ static int kmem_cache_open(struct kmem_cache *s, goto error; if (alloc_kmem_cache_cpus(s)) - return 1; + return 0; free_kmem_cache_nodes(s); error: if (flags & SLAB_PANIC) panic("Cannot create slab %s size=%lu realsize=%u " "order=%u offset=%u flags=%lx\n", - s->name, (unsigned long)size, s->size, oo_order(s->oo), + s->name, (unsigned long)s->size, s->size, oo_order(s->oo), s->offset, flags); - return 0; + return -EINVAL; } /* @@ -3147,7 +3146,7 @@ static void list_slab_objects(struct kmem_cache *s, struct page *page, sizeof(long), GFP_ATOMIC); if (!map) return; - slab_err(s, page, "%s", text); + slab_err(s, page, text, s->name); slab_lock(page); get_map(s, page, map); @@ -3179,7 +3178,7 @@ static void free_partial(struct kmem_cache *s, struct kmem_cache_node *n) discard_slab(s, page); } else { list_slab_objects(s, page, - "Objects remaining on kmem_cache_close()"); + "Objects remaining in %s on kmem_cache_close()"); } } } @@ -3192,7 +3191,6 @@ static inline int kmem_cache_close(struct kmem_cache *s) int node; flush_all(s); - free_percpu(s->cpu_slab); /* Attempt to free all objects */ for_each_node_state(node, N_NORMAL_MEMORY) { struct kmem_cache_node *n = get_node(s, node); @@ -3201,33 +3199,20 @@ static inline int kmem_cache_close(struct kmem_cache *s) if (n->nr_partial || slabs_node(s, node)) return 1; } + free_percpu(s->cpu_slab); free_kmem_cache_nodes(s); return 0; } -/* - * Close a cache and release the kmem_cache structure - * (must be used for caches created using kmem_cache_create) - */ -void kmem_cache_destroy(struct kmem_cache *s) +int __kmem_cache_shutdown(struct kmem_cache *s) { - mutex_lock(&slab_mutex); - s->refcount--; - if (!s->refcount) { - list_del(&s->list); - mutex_unlock(&slab_mutex); - if (kmem_cache_close(s)) { - printk(KERN_ERR "SLUB %s: %s called for cache that " - "still has objects.\n", s->name, __func__); - dump_stack(); - } - if (s->flags & SLAB_DESTROY_BY_RCU) - rcu_barrier(); + int rc = kmem_cache_close(s); + + if (!rc) sysfs_slab_remove(s); - } else - mutex_unlock(&slab_mutex); + + return rc; } -EXPORT_SYMBOL(kmem_cache_destroy); /******************************************************************** * Kmalloc subsystem @@ -3236,8 +3221,6 @@ EXPORT_SYMBOL(kmem_cache_destroy); struct kmem_cache *kmalloc_caches[SLUB_PAGE_SHIFT]; EXPORT_SYMBOL(kmalloc_caches); -static struct kmem_cache *kmem_cache; - #ifdef CONFIG_ZONE_DMA static struct kmem_cache *kmalloc_dma_caches[SLUB_PAGE_SHIFT]; #endif @@ -3283,14 +3266,17 @@ static struct kmem_cache *__init create_kmalloc_cache(const char *name, { struct kmem_cache *s; - s = kmem_cache_alloc(kmem_cache, GFP_NOWAIT); + s = kmem_cache_zalloc(kmem_cache, GFP_NOWAIT); + + s->name = name; + s->size = s->object_size = size; + s->align = ARCH_KMALLOC_MINALIGN; /* * This function is called with IRQs disabled during early-boot on * single CPU so there's no need to take slab_mutex here. */ - if (!kmem_cache_open(s, name, size, ARCH_KMALLOC_MINALIGN, - flags, NULL)) + if (kmem_cache_open(s, flags)) goto panic; list_add(&s->list, &slab_caches); @@ -3729,12 +3715,12 @@ void __init kmem_cache_init(void) slub_max_order = 0; kmem_size = offsetof(struct kmem_cache, node) + - nr_node_ids * sizeof(struct kmem_cache_node *); + nr_node_ids * sizeof(struct kmem_cache_node *); /* Allocate two kmem_caches from the page allocator */ kmalloc_size = ALIGN(kmem_size, cache_line_size()); order = get_order(2 * kmalloc_size); - kmem_cache = (void *)__get_free_pages(GFP_NOWAIT, order); + kmem_cache = (void *)__get_free_pages(GFP_NOWAIT | __GFP_ZERO, order); /* * Must first have the slab cache available for the allocations of the @@ -3743,9 +3729,10 @@ void __init kmem_cache_init(void) */ kmem_cache_node = (void *)kmem_cache + kmalloc_size; - kmem_cache_open(kmem_cache_node, "kmem_cache_node", - sizeof(struct kmem_cache_node), - 0, SLAB_HWCACHE_ALIGN | SLAB_PANIC, NULL); + kmem_cache_node->name = "kmem_cache_node"; + kmem_cache_node->size = kmem_cache_node->object_size = + sizeof(struct kmem_cache_node); + kmem_cache_open(kmem_cache_node, SLAB_HWCACHE_ALIGN | SLAB_PANIC); hotplug_memory_notifier(slab_memory_callback, SLAB_CALLBACK_PRI); @@ -3753,8 +3740,10 @@ void __init kmem_cache_init(void) slab_state = PARTIAL; temp_kmem_cache = kmem_cache; - kmem_cache_open(kmem_cache, "kmem_cache", kmem_size, - 0, SLAB_HWCACHE_ALIGN | SLAB_PANIC, NULL); + kmem_cache->name = "kmem_cache"; + kmem_cache->size = kmem_cache->object_size = kmem_size; + kmem_cache_open(kmem_cache, SLAB_HWCACHE_ALIGN | SLAB_PANIC); + kmem_cache = kmem_cache_alloc(kmem_cache, GFP_NOWAIT); memcpy(kmem_cache, temp_kmem_cache, kmem_size); @@ -3943,11 +3932,10 @@ static struct kmem_cache *find_mergeable(size_t size, return NULL; } -struct kmem_cache *__kmem_cache_create(const char *name, size_t size, +struct kmem_cache *__kmem_cache_alias(const char *name, size_t size, size_t align, unsigned long flags, void (*ctor)(void *)) { struct kmem_cache *s; - char *n; s = find_mergeable(size, align, flags, name, ctor); if (s) { @@ -3961,36 +3949,29 @@ struct kmem_cache *__kmem_cache_create(const char *name, size_t size, if (sysfs_slab_alias(s, name)) { s->refcount--; - return NULL; + s = NULL; } - return s; } - n = kstrdup(name, GFP_KERNEL); - if (!n) - return NULL; + return s; +} - s = kmalloc(kmem_size, GFP_KERNEL); - if (s) { - if (kmem_cache_open(s, n, - size, align, flags, ctor)) { - int r; +int __kmem_cache_create(struct kmem_cache *s, unsigned long flags) +{ + int err; - list_add(&s->list, &slab_caches); - mutex_unlock(&slab_mutex); - r = sysfs_slab_add(s); - mutex_lock(&slab_mutex); + err = kmem_cache_open(s, flags); + if (err) + return err; - if (!r) - return s; + mutex_unlock(&slab_mutex); + err = sysfs_slab_add(s); + mutex_lock(&slab_mutex); - list_del(&s->list); - kmem_cache_close(s); - } - kfree(s); - } - kfree(n); - return NULL; + if (err) + kmem_cache_close(s); + + return err; } #ifdef CONFIG_SMP @@ -5220,14 +5201,6 @@ static ssize_t slab_attr_store(struct kobject *kobj, return err; } -static void kmem_cache_release(struct kobject *kobj) -{ - struct kmem_cache *s = to_slab(kobj); - - kfree(s->name); - kfree(s); -} - static const struct sysfs_ops slab_sysfs_ops = { .show = slab_attr_show, .store = slab_attr_store, @@ -5235,7 +5208,6 @@ static const struct sysfs_ops slab_sysfs_ops = { static struct kobj_type slab_ktype = { .sysfs_ops = &slab_sysfs_ops, - .release = kmem_cache_release }; static int uevent_filter(struct kset *kset, struct kobject *kobj) diff --git a/mm/vmscan.c b/mm/vmscan.c index 8d01243d9560..99b434b674c0 100644 --- a/mm/vmscan.c +++ b/mm/vmscan.c @@ -3102,6 +3102,7 @@ int kswapd_run(int nid) /* failure at boot is fatal */ BUG_ON(system_state == SYSTEM_BOOTING); printk("Failed to start kswapd on node %d\n",nid); + pgdat->kswapd = NULL; ret = -1; } return ret; |