From e92dd4fd1aa1cd081dac03973b33c972637d5b7a Mon Sep 17 00:00:00 2001
From: Joe Perches
Date: Fri, 26 Mar 2010 19:27:58 -0700
Subject: slab: Fix continuation lines

Signed-off-by: Joe Perches
Signed-off-by: Pekka Enberg
---
 mm/slab.c | 9 +++++----
 1 file changed, 5 insertions(+), 4 deletions(-)

(limited to 'mm')

diff --git a/mm/slab.c b/mm/slab.c
index a9f325b28bed..ceb4e3aa22f7 100644
--- a/mm/slab.c
+++ b/mm/slab.c
@@ -4227,10 +4227,11 @@ static int s_show(struct seq_file *m, void *p)
 		unsigned long node_frees = cachep->node_frees;
 		unsigned long overflows = cachep->node_overflow;
 
-		seq_printf(m, " : globalstat %7lu %6lu %5lu %4lu \
-		%4lu %4lu %4lu %4lu %4lu", allocs, high, grown,
-		reaped, errors, max_freeable, node_allocs,
-		node_frees, overflows);
+		seq_printf(m, " : globalstat %7lu %6lu %5lu %4lu "
+			   "%4lu %4lu %4lu %4lu %4lu",
+			   allocs, high, grown,
+			   reaped, errors, max_freeable, node_allocs,
+			   node_frees, overflows);
 	}
 	/* cpu stats */
 	{
--
cgit v1.2.1


From 8f9f8d9e8080a2ff46caa7decef47810d093d252 Mon Sep 17 00:00:00 2001
From: David Rientjes
Date: Sat, 27 Mar 2010 19:40:47 -0700
Subject: slab: add memory hotplug support

Slab lacks any memory hotplug support for nodes that are hotplugged
without cpus being hotplugged. This is possible at least on x86
CONFIG_MEMORY_HOTPLUG_SPARSE kernels where SRAT entries are marked
ACPI_SRAT_MEM_HOT_PLUGGABLE and the regions of RAM represent a separate
node. It can also be done manually by writing the start address to
/sys/devices/system/memory/probe for kernels that have
CONFIG_ARCH_MEMORY_PROBE set, which is how this patch was tested, and
then onlining the new memory region.

When a node is hot-added, a nodelist for that node is allocated and
initialized for each slab cache. If this isn't completed due to a lack
of memory, the hotadd is aborted: we have a reasonable expectation that
kmalloc_node(nid) will work for all caches if nid is online and memory
is available.

Since nodelists must be allocated and initialized prior to the new
node's memory actually being online, the struct kmem_list3 is allocated
off-node due to kmalloc_node()'s fallback.

When an entire node would be offlined, its nodelists are subsequently
drained. If slab objects still exist and cannot be freed, the offline
is aborted. It is possible that objects will be allocated between this
drain and page isolation, however, so the offline may still fail.

Acked-by: Christoph Lameter
Signed-off-by: David Rientjes
Signed-off-by: Pekka Enberg
---
 mm/slab.c | 157 +++++++++++++++++++++++++++++++++++++++++++++++++-------------
 1 file changed, 125 insertions(+), 32 deletions(-)

(limited to 'mm')

diff --git a/mm/slab.c b/mm/slab.c
index a9f325b28bed..3230cd2c6b3b 100644
--- a/mm/slab.c
+++ b/mm/slab.c
@@ -115,6 +115,7 @@
 #include
 #include
 #include
+#include
 
 #include
 #include
@@ -1102,6 +1103,52 @@ static inline int cache_free_alien(struct kmem_cache *cachep, void *objp)
 }
 #endif
 
+/*
+ * Allocates and initializes nodelists for a node on each slab cache, used for
+ * either memory or cpu hotplug. If memory is being hot-added, the kmem_list3
+ * will be allocated off-node since memory is not yet online for the new node.
+ * When hotplugging memory or a cpu, existing nodelists are not replaced if
+ * already in use.
+ *
+ * Must hold cache_chain_mutex.
+ */
+static int init_cache_nodelists_node(int node)
+{
+	struct kmem_cache *cachep;
+	struct kmem_list3 *l3;
+	const int memsize = sizeof(struct kmem_list3);
+
+	list_for_each_entry(cachep, &cache_chain, next) {
+		/*
+		 * Set up the size64 kmemlist for cpu before we can
+		 * begin anything. Make sure some other cpu on this
+		 * node has not already allocated this
+		 */
+		if (!cachep->nodelists[node]) {
+			l3 = kmalloc_node(memsize, GFP_KERNEL, node);
+			if (!l3)
+				return -ENOMEM;
+			kmem_list3_init(l3);
+			l3->next_reap = jiffies + REAPTIMEOUT_LIST3 +
+			    ((unsigned long)cachep) % REAPTIMEOUT_LIST3;
+
+			/*
+			 * The l3s don't come and go as CPUs come and
+			 * go. cache_chain_mutex is sufficient
+			 * protection here.
+			 */
+			cachep->nodelists[node] = l3;
+		}
+
+		spin_lock_irq(&cachep->nodelists[node]->list_lock);
+		cachep->nodelists[node]->free_limit =
+			(1 + nr_cpus_node(node)) *
+			cachep->batchcount + cachep->num;
+		spin_unlock_irq(&cachep->nodelists[node]->list_lock);
+	}
+	return 0;
+}
+
 static void __cpuinit cpuup_canceled(long cpu)
 {
 	struct kmem_cache *cachep;
@@ -1172,7 +1219,7 @@ static int __cpuinit cpuup_prepare(long cpu)
 	struct kmem_cache *cachep;
 	struct kmem_list3 *l3 = NULL;
 	int node = cpu_to_node(cpu);
-	const int memsize = sizeof(struct kmem_list3);
+	int err;
 
 	/*
 	 * We need to do this right in the beginning since
@@ -1180,35 +1227,9 @@ static int __cpuinit cpuup_prepare(long cpu)
 	 * kmalloc_node allows us to add the slab to the right
 	 * kmem_list3 and not this cpu's kmem_list3
 	 */
-
-	list_for_each_entry(cachep, &cache_chain, next) {
-		/*
-		 * Set up the size64 kmemlist for cpu before we can
-		 * begin anything. Make sure some other cpu on this
-		 * node has not already allocated this
-		 */
-		if (!cachep->nodelists[node]) {
-			l3 = kmalloc_node(memsize, GFP_KERNEL, node);
-			if (!l3)
-				goto bad;
-			kmem_list3_init(l3);
-			l3->next_reap = jiffies + REAPTIMEOUT_LIST3 +
-			    ((unsigned long)cachep) % REAPTIMEOUT_LIST3;
-
-			/*
-			 * The l3s don't come and go as CPUs come and
-			 * go. cache_chain_mutex is sufficient
-			 * protection here.
-			 */
-			cachep->nodelists[node] = l3;
-		}
-
-		spin_lock_irq(&cachep->nodelists[node]->list_lock);
-		cachep->nodelists[node]->free_limit =
-			(1 + nr_cpus_node(node)) *
-			cachep->batchcount + cachep->num;
-		spin_unlock_irq(&cachep->nodelists[node]->list_lock);
-	}
+	err = init_cache_nodelists_node(node);
+	if (err < 0)
+		goto bad;
 
 	/*
 	 * Now we can go ahead with allocating the shared arrays and
@@ -1331,11 +1352,75 @@ static struct notifier_block __cpuinitdata cpucache_notifier = {
 	&cpuup_callback, NULL, 0
 };
 
+#if defined(CONFIG_NUMA) && defined(CONFIG_MEMORY_HOTPLUG)
+/*
+ * Drains freelist for a node on each slab cache, used for memory hot-remove.
+ * Returns -EBUSY if all objects cannot be drained so that the node is not
+ * removed.
+ *
+ * Must hold cache_chain_mutex.
+ */
+static int __meminit drain_cache_nodelists_node(int node)
+{
+	struct kmem_cache *cachep;
+	int ret = 0;
+
+	list_for_each_entry(cachep, &cache_chain, next) {
+		struct kmem_list3 *l3;
+
+		l3 = cachep->nodelists[node];
+		if (!l3)
+			continue;
+
+		drain_freelist(cachep, l3, l3->free_objects);
+
+		if (!list_empty(&l3->slabs_full) ||
+		    !list_empty(&l3->slabs_partial)) {
+			ret = -EBUSY;
+			break;
+		}
+	}
+	return ret;
+}
+
+static int __meminit slab_memory_callback(struct notifier_block *self,
+					unsigned long action, void *arg)
+{
+	struct memory_notify *mnb = arg;
+	int ret = 0;
+	int nid;
+
+	nid = mnb->status_change_nid;
+	if (nid < 0)
+		goto out;
+
+	switch (action) {
+	case MEM_GOING_ONLINE:
+		mutex_lock(&cache_chain_mutex);
+		ret = init_cache_nodelists_node(nid);
+		mutex_unlock(&cache_chain_mutex);
+		break;
+	case MEM_GOING_OFFLINE:
+		mutex_lock(&cache_chain_mutex);
+		ret = drain_cache_nodelists_node(nid);
+		mutex_unlock(&cache_chain_mutex);
+		break;
+	case MEM_ONLINE:
+	case MEM_OFFLINE:
+	case MEM_CANCEL_ONLINE:
+	case MEM_CANCEL_OFFLINE:
+		break;
+	}
+out:
+	return ret ? notifier_from_errno(ret) : NOTIFY_OK;
+}
+#endif /* CONFIG_NUMA && CONFIG_MEMORY_HOTPLUG */
+
 /*
  * swap the static kmem_list3 with kmalloced memory
  */
-static void init_list(struct kmem_cache *cachep, struct kmem_list3 *list,
-			int nodeid)
+static void __init init_list(struct kmem_cache *cachep, struct kmem_list3 *list,
+				int nodeid)
 {
 	struct kmem_list3 *ptr;
 
@@ -1580,6 +1665,14 @@ void __init kmem_cache_init_late(void)
 	 */
 	register_cpu_notifier(&cpucache_notifier);
 
+#ifdef CONFIG_NUMA
+	/*
+	 * Register a memory hotplug callback that initializes and frees
+	 * nodelists.
+	 */
+	hotplug_memory_notifier(slab_memory_callback, SLAB_CALLBACK_PRI);
+#endif
+
 	/*
 	 * The reap timers are started later, with a module init call: That part
 	 * of the kernel is not yet operational.
--
cgit v1.2.1


From 5c5e3b33b7cb959a401f823707bee006caadd76e Mon Sep 17 00:00:00 2001
From: Shiyong Li
Date: Mon, 12 Apr 2010 13:48:21 +0800
Subject: slab: Fix missing DEBUG_SLAB last user

Even with SLAB_RED_ZONE and SLAB_STORE_USER enabled, the kernel would
NOT store redzone and last-user data around the allocated memory space
if "arch cache line > sizeof(unsigned long long)". As a result, the
last-user information is unexpectedly MISSING when dumping the slab
corruption log.

This fix makes sure that the redzone and last-user tags get stored
unless the required alignment breaks the redzone alignment.

Signed-off-by: Shiyong Li
Signed-off-by: Pekka Enberg
---
 mm/slab.c | 8 ++++----
 1 file changed, 4 insertions(+), 4 deletions(-)

(limited to 'mm')

diff --git a/mm/slab.c b/mm/slab.c
index bac0f4fcc216..525c66466469 100644
--- a/mm/slab.c
+++ b/mm/slab.c
@@ -2220,8 +2220,8 @@ kmem_cache_create (const char *name, size_t size, size_t align,
 	if (ralign < align) {
 		ralign = align;
 	}
-	/* disable debug if necessary */
-	if (ralign > __alignof__(unsigned long long))
+	/* disable debug if not aligning with REDZONE_ALIGN */
+	if (ralign & (__alignof__(unsigned long long) - 1))
 		flags &= ~(SLAB_RED_ZONE | SLAB_STORE_USER);
 	/*
 	 * 4) Store it.
@@ -2247,8 +2247,8 @@ kmem_cache_create (const char *name, size_t size, size_t align,
 	 */
 	if (flags & SLAB_RED_ZONE) {
 		/* add space for red zone words */
-		cachep->obj_offset += sizeof(unsigned long long);
-		size += 2 * sizeof(unsigned long long);
+		cachep->obj_offset += align;
+		size += align + sizeof(unsigned long long);
 	}
 	if (flags & SLAB_STORE_USER) {
 		/* user store requires one word storage behind the end of
--
cgit v1.2.1


From 1f0ce8b3dd667dca720a47869f8110c298f0e5b8 Mon Sep 17 00:00:00 2001
From: David Woodhouse
Date: Wed, 19 May 2010 12:01:42 +0100
Subject: mm: Move ARCH_SLAB_MINALIGN and ARCH_KMALLOC_MINALIGN to

Acked-by: Herbert Xu
Signed-off-by: David Woodhouse
Signed-off-by: Pekka Enberg
---
 mm/slab.c | 24 ------------------------
 1 file changed, 24 deletions(-)

(limited to 'mm')

diff --git a/mm/slab.c b/mm/slab.c
index bac0f4fcc216..7401ddc24306 100644
--- a/mm/slab.c
+++ b/mm/slab.c
@@ -144,30 +144,6 @@
 #define BYTES_PER_WORD sizeof(void *)
 #define REDZONE_ALIGN max(BYTES_PER_WORD, __alignof__(unsigned long long))
 
-#ifndef ARCH_KMALLOC_MINALIGN
-/*
- * Enforce a minimum alignment for the kmalloc caches.
- * Usually, the kmalloc caches are cache_line_size() aligned, except when
- * DEBUG and FORCED_DEBUG are enabled, then they are BYTES_PER_WORD aligned.
- * Some archs want to perform DMA into kmalloc caches and need a guaranteed
- * alignment larger than the alignment of a 64-bit integer.
- * ARCH_KMALLOC_MINALIGN allows that.
- * Note that increasing this value may disable some debug features.
- */
-#define ARCH_KMALLOC_MINALIGN __alignof__(unsigned long long)
-#endif
-
-#ifndef ARCH_SLAB_MINALIGN
-/*
- * Enforce a minimum alignment for all caches.
- * Intended for archs that get misalignment faults even for BYTES_PER_WORD
- * aligned buffers. Includes ARCH_KMALLOC_MINALIGN.
- * If possible: Do not enable this flag for CONFIG_DEBUG_SLAB, it disables
- * some debug features.
- */
-#define ARCH_SLAB_MINALIGN 0
-#endif
-
 #ifndef ARCH_KMALLOC_FLAGS
 #define ARCH_KMALLOC_FLAGS SLAB_HWCACHE_ALIGN
 #endif
--
cgit v1.2.1


From bac49ce42a33f53beb7cf04e9a0600879d6265ca Mon Sep 17 00:00:00 2001
From: David Woodhouse
Date: Wed, 19 May 2010 12:01:43 +0100
Subject: mm: Move ARCH_SLAB_MINALIGN and ARCH_KMALLOC_MINALIGN to

Acked-by: Herbert Xu
Signed-off-by: David Woodhouse
Signed-off-by: Pekka Enberg
---
 mm/slob.c | 8 --------
 1 file changed, 8 deletions(-)

(limited to 'mm')

diff --git a/mm/slob.c b/mm/slob.c
index 837ebd64cc34..23631e2bb57a 100644
--- a/mm/slob.c
+++ b/mm/slob.c
@@ -467,14 +467,6 @@ out:
  * End of slob allocator proper. Begin kmem_cache_alloc and kmalloc frontend.
  */
 
-#ifndef ARCH_KMALLOC_MINALIGN
-#define ARCH_KMALLOC_MINALIGN __alignof__(unsigned long)
-#endif
-
-#ifndef ARCH_SLAB_MINALIGN
-#define ARCH_SLAB_MINALIGN __alignof__(unsigned long)
-#endif
-
 void *__kmalloc_node(size_t size, gfp_t gfp, int node)
 {
 	unsigned int *m;
--
cgit v1.2.1


From 4581ced379736fd76432c754f999d26deb83fbb7 Mon Sep 17 00:00:00 2001
From: David Woodhouse
Date: Wed, 19 May 2010 12:02:14 +0100
Subject: mm: Move ARCH_SLAB_MINALIGN and ARCH_KMALLOC_MINALIGN to

Acked-by: Herbert Xu
Signed-off-by: David Woodhouse
Signed-off-by: Pekka Enberg
---
 mm/slub.c | 8 --------
 1 file changed, 8 deletions(-)

(limited to 'mm')

diff --git a/mm/slub.c b/mm/slub.c
index d2a54fe71ea2..c874c3efac29 100644
--- a/mm/slub.c
+++ b/mm/slub.c
@@ -157,14 +157,6 @@
 #define SLUB_MERGE_SAME (SLAB_DEBUG_FREE | SLAB_RECLAIM_ACCOUNT | \
 		SLAB_CACHE_DMA | SLAB_NOTRACK)
 
-#ifndef ARCH_KMALLOC_MINALIGN
-#define ARCH_KMALLOC_MINALIGN __alignof__(unsigned long long)
-#endif
-
-#ifndef ARCH_SLAB_MINALIGN
-#define ARCH_SLAB_MINALIGN __alignof__(unsigned long long)
-#endif
-
 #define OO_SHIFT	16
 #define OO_MASK		((1 << OO_SHIFT) - 1)
 #define MAX_OBJS_PER_PAGE	65535 /* since page.objects is u16 */
--
cgit v1.2.1


From bbd7d57bfe852d9788bae5fb171c7edb4021d8ac Mon Sep 17 00:00:00 2001
From: Eric Dumazet
Date: Wed, 24 Mar 2010 22:25:47 +0100
Subject: slub: Potential stack overflow

I discovered that we can overflow the stack if CONFIG_SLUB_DEBUG=y and
we use slabs with many objects, since list_slab_objects() and
process_slab() use DECLARE_BITMAP(map, page->objects).

With 65535 bits, we use 8192 bytes of stack ...

Switch these allocations to dynamic allocations.

Signed-off-by: Eric Dumazet
Signed-off-by: Christoph Lameter
Signed-off-by: Pekka Enberg
---
 mm/slub.c | 25 ++++++++++++++++---------
 1 file changed, 16 insertions(+), 9 deletions(-)

(limited to 'mm')

diff --git a/mm/slub.c b/mm/slub.c
index d2a54fe71ea2..78f1a202ca33 100644
--- a/mm/slub.c
+++ b/mm/slub.c
@@ -2429,9 +2429,11 @@ static void list_slab_objects(struct kmem_cache *s, struct page *page,
 #ifdef CONFIG_SLUB_DEBUG
 	void *addr = page_address(page);
 	void *p;
-	DECLARE_BITMAP(map, page->objects);
+	long *map = kzalloc(BITS_TO_LONGS(page->objects) * sizeof(long),
+			    GFP_ATOMIC);
 
-	bitmap_zero(map, page->objects);
+	if (!map)
+		return;
 	slab_err(s, page, "%s", text);
 	slab_lock(page);
 	for_each_free_object(p, s, page->freelist)
@@ -2446,6 +2448,7 @@ static void list_slab_objects(struct kmem_cache *s, struct page *page,
 		}
 	}
 	slab_unlock(page);
+	kfree(map);
 #endif
 }
 
@@ -3651,10 +3654,10 @@ static int add_location(struct loc_track *t, struct kmem_cache *s,
 }
 
 static void process_slab(struct loc_track *t, struct kmem_cache *s,
-		struct page *page, enum track_item alloc)
+		struct page *page, enum track_item alloc,
+		long *map)
 {
 	void *addr = page_address(page);
-	DECLARE_BITMAP(map, page->objects);
 	void *p;
 
 	bitmap_zero(map, page->objects);
@@ -3673,11 +3676,14 @@ static int list_locations(struct kmem_cache *s, char *buf,
 	unsigned long i;
 	struct loc_track t = { 0, 0, NULL };
 	int node;
+	unsigned long *map = kmalloc(BITS_TO_LONGS(oo_objects(s->max)) *
+			     sizeof(unsigned long), GFP_KERNEL);
 
-	if (!alloc_loc_track(&t, PAGE_SIZE / sizeof(struct location),
-			GFP_TEMPORARY))
+	if (!map || !alloc_loc_track(&t, PAGE_SIZE / sizeof(struct location),
+				     GFP_TEMPORARY)) {
+		kfree(map);
 		return sprintf(buf, "Out of memory\n");
-
+	}
 	/* Push back cpu slabs */
 	flush_all(s);
 
@@ -3691,9 +3697,9 @@ static int list_locations(struct kmem_cache *s, char *buf,
 		spin_lock_irqsave(&n->list_lock, flags);
 		list_for_each_entry(page, &n->partial, lru)
-			process_slab(&t, s, page, alloc);
+			process_slab(&t, s, page, alloc, map);
 		list_for_each_entry(page, &n->full, lru)
-			process_slab(&t, s, page, alloc);
+			process_slab(&t, s, page, alloc, map);
 		spin_unlock_irqrestore(&n->list_lock, flags);
 	}
 
@@ -3744,6 +3750,7 @@ static int list_locations(struct kmem_cache *s, char *buf,
 	}
 
 	free_loc_track(&t);
+	kfree(map);
 	if (!t.count)
 		len += sprintf(buf, "No data\n");
 	return len;
--
cgit v1.2.1


From d3e14aa336b37df76ae875fa051dfdb0e765ddf9 Mon Sep 17 00:00:00 2001
From: Xiaotian Feng
Date: Thu, 8 Apr 2010 17:26:44 +0800
Subject: slub: __kmalloc_node_track_caller should trace kmalloc_large_node case

commit 94b528d (kmemtrace: SLUB hooks for caller-tracking functions)
missed tracing kmalloc_large_node in __kmalloc_node_track_caller. We
should trace it the same way as __kmalloc_node.

Acked-by: David Rientjes
Cc: Matt Mackall
Cc: Ingo Molnar
Cc: Vegard Nossum
Signed-off-by: Xiaotian Feng
Signed-off-by: Pekka Enberg
---
 mm/slub.c | 11 +++++++++--
 1 file changed, 9 insertions(+), 2 deletions(-)

(limited to 'mm')

diff --git a/mm/slub.c b/mm/slub.c
index 78f1a202ca33..52ae5a538180 100644
--- a/mm/slub.c
+++ b/mm/slub.c
@@ -3341,8 +3341,15 @@ void *__kmalloc_node_track_caller(size_t size, gfp_t gfpflags,
 	struct kmem_cache *s;
 	void *ret;
 
-	if (unlikely(size > SLUB_MAX_SIZE))
-		return kmalloc_large_node(size, gfpflags, node);
+	if (unlikely(size > SLUB_MAX_SIZE)) {
+		ret = kmalloc_large_node(size, gfpflags, node);
+
+		trace_kmalloc_node(caller, ret,
+				   size, PAGE_SIZE << get_order(size),
+				   gfpflags, node);
+
+		return ret;
+	}
 
 	s = get_slab(size, gfpflags);
--
cgit v1.2.1


From 6b65aaf3027c4e02b42aaefd900aa79136a30681 Mon Sep 17 00:00:00 2001
From: Minchan Kim
Date: Wed, 14 Apr 2010 23:58:36 +0900
Subject: slub: Use alloc_pages_exact_node() for page allocation

The alloc_slab_page() in SLUB uses alloc_pages() if the node is '-1'.
This means that the node validity check in alloc_pages_node() is
unnecessary, and we can use alloc_pages_exact_node() to avoid the
comparison and branch, as commit 6484eb3e2a81807722 ("page allocator:
do not check NUMA node ID when the caller knows the node is valid") did
for the page allocator.

Cc: Christoph Lameter
Reviewed-by: KAMEZAWA Hiroyuki
Reviewed-by: Mel Gorman
Signed-off-by: Minchan Kim
Signed-off-by: Pekka Enberg
---
 mm/slub.c | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

(limited to 'mm')

diff --git a/mm/slub.c b/mm/slub.c
index 52ae5a538180..2cdd235cb801 100644
--- a/mm/slub.c
+++ b/mm/slub.c
@@ -1084,7 +1084,7 @@ static inline struct page *alloc_slab_page(gfp_t flags, int node,
 	if (node == -1)
 		return alloc_pages(flags, order);
 	else
-		return alloc_pages_node(node, flags, order);
+		return alloc_pages_exact_node(node, flags, order);
 }
 
 static struct page *allocate_slab(struct kmem_cache *s, gfp_t flags, int node)
--
cgit v1.2.1
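
The memory hotplug callback added by "slab: add memory hotplug support" above
follows the generic memory-notifier pattern, which any subsystem with
per-node state can reuse. Below is a minimal sketch of that pattern, assuming
only the 2.6.34-era API that the patch itself uses (hotplug_memory_notifier(),
struct memory_notify, and the MEM_GOING_ONLINE/MEM_GOING_OFFLINE actions); the
my_subsys_* names are illustrative placeholders, not existing kernel symbols.

#include <linux/init.h>
#include <linux/memory.h>
#include <linux/notifier.h>

/* Placeholder per-node setup/teardown hooks; a real subsystem supplies these. */
static int my_subsys_prepare_node(int nid)
{
	return 0;	/* allocate per-node state here; -ENOMEM aborts the online */
}

static int my_subsys_release_node(int nid)
{
	return 0;	/* drain per-node state here; -EBUSY aborts the offline */
}

static int my_memory_callback(struct notifier_block *self,
			      unsigned long action, void *arg)
{
	struct memory_notify *mn = arg;
	int nid = mn->status_change_nid;
	int ret = 0;

	/* status_change_nid < 0 means no node comes or goes with this event. */
	if (nid < 0)
		return NOTIFY_OK;

	switch (action) {
	case MEM_GOING_ONLINE:
		ret = my_subsys_prepare_node(nid);
		break;
	case MEM_GOING_OFFLINE:
		ret = my_subsys_release_node(nid);
		break;
	default:
		break;
	}
	/* A non-zero ret vetoes the hotplug transition, as in the slab patch. */
	return ret ? notifier_from_errno(ret) : NOTIFY_OK;
}

static int __init my_subsys_init(void)
{
	/* Slab passes SLAB_CALLBACK_PRI above; 0 is an ordinary priority. */
	hotplug_memory_notifier(my_memory_callback, 0);
	return 0;
}
device_initcall(my_subsys_init);

As the slab patch shows, the expensive work happens in the MEM_GOING_* phases,
where a notifier may still veto the operation, while MEM_ONLINE, MEM_OFFLINE
and the MEM_CANCEL_* events are accepted without action.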
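
The 8192-byte figure quoted in "slub: Potential stack overflow" follows from
how DECLARE_BITMAP(map, n) sizes its array: BITS_TO_LONGS(n) longs, which for
n = 65535 (MAX_OBJS_PER_PAGE in the excerpt above) comes to 8 KiB on both
32-bit and 64-bit builds. A small userspace sanity check, assuming only the
usual BITS_TO_LONGS() rounding:

#include <stdio.h>

#define BITS_PER_LONG	 (8 * sizeof(long))
/* Same rounding the kernel macro performs: longs needed to hold n bits. */
#define BITS_TO_LONGS(n) (((n) + BITS_PER_LONG - 1) / BITS_PER_LONG)

int main(void)
{
	unsigned long objects = 65535;	/* MAX_OBJS_PER_PAGE: page.objects is u16 */

	/* DECLARE_BITMAP(map, objects) would place this many bytes on the stack. */
	printf("%lu bytes\n",
	       (unsigned long)(BITS_TO_LONGS(objects) * sizeof(long)));
	return 0;	/* prints 8192 for both 4-byte and 8-byte long */
}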