From 5dfb417509921eb90ee123a4d1525e8916b4ace4 Mon Sep 17 00:00:00 2001 From: Vladimir Davydov Date: Wed, 4 Jun 2014 16:06:38 -0700 Subject: sl[au]b: charge slabs to kmemcg explicitly We have only a few places where we actually want to charge kmem so instead of intruding into the general page allocation path with __GFP_KMEMCG it's better to explictly charge kmem there. All kmem charges will be easier to follow that way. This is a step towards removing __GFP_KMEMCG. It removes __GFP_KMEMCG from memcg caches' allocflags. Instead it makes slab allocation path call memcg_charge_kmem directly getting memcg to charge from the cache's memcg params. This also eliminates any possibility of misaccounting an allocation going from one memcg's cache to another memcg, because now we always charge slabs against the memcg the cache belongs to. That's why this patch removes the big comment to memcg_kmem_get_cache. Signed-off-by: Vladimir Davydov Acked-by: Greg Thelen Cc: Johannes Weiner Acked-by: Michal Hocko Cc: Glauber Costa Cc: Christoph Lameter Cc: Pekka Enberg Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- include/linux/memcontrol.h | 15 ++++----------- 1 file changed, 4 insertions(+), 11 deletions(-) (limited to 'include/linux/memcontrol.h') diff --git a/include/linux/memcontrol.h b/include/linux/memcontrol.h index b569b8be5c5a..96e5d2573eb0 100644 --- a/include/linux/memcontrol.h +++ b/include/linux/memcontrol.h @@ -506,6 +506,9 @@ void memcg_update_array_size(int num_groups); struct kmem_cache * __memcg_kmem_get_cache(struct kmem_cache *cachep, gfp_t gfp); +int memcg_charge_kmem(struct mem_cgroup *memcg, gfp_t gfp, u64 size); +void memcg_uncharge_kmem(struct mem_cgroup *memcg, u64 size); + void mem_cgroup_destroy_cache(struct kmem_cache *cachep); int __kmem_cache_destroy_memcg_children(struct kmem_cache *s); @@ -583,17 +586,7 @@ memcg_kmem_commit_charge(struct page *page, struct mem_cgroup *memcg, int order) * @cachep: the original global kmem cache * @gfp: allocation flags. * - * This function assumes that the task allocating, which determines the memcg - * in the page allocator, belongs to the same cgroup throughout the whole - * process. Misacounting can happen if the task calls memcg_kmem_get_cache() - * while belonging to a cgroup, and later on changes. This is considered - * acceptable, and should only happen upon task migration. - * - * Before the cache is created by the memcg core, there is also a possible - * imbalance: the task belongs to a memcg, but the cache being allocated from - * is the global cache, since the child cache is not yet guaranteed to be - * ready. This case is also fine, since in this case the GFP_KMEMCG will not be - * passed and the page allocator will not attempt any cgroup accounting. + * All memory allocated from a per-memcg cache is charged to the owner memcg. */ static __always_inline struct kmem_cache * memcg_kmem_get_cache(struct kmem_cache *cachep, gfp_t gfp) -- cgit v1.2.3 From 52383431b37cdbec63944e953ffc2698a7ad9722 Mon Sep 17 00:00:00 2001 From: Vladimir Davydov Date: Wed, 4 Jun 2014 16:06:39 -0700 Subject: mm: get rid of __GFP_KMEMCG Currently to allocate a page that should be charged to kmemcg (e.g. threadinfo), we pass __GFP_KMEMCG flag to the page allocator. The page allocated is then to be freed by free_memcg_kmem_pages. Apart from looking asymmetrical, this also requires intrusion to the general allocation path. So let's introduce separate functions that will alloc/free pages charged to kmemcg. The new functions are called alloc_kmem_pages and free_kmem_pages. They should be used when the caller actually would like to use kmalloc, but has to fall back to the page allocator for the allocation is large. They only differ from alloc_pages and free_pages in that besides allocating or freeing pages they also charge them to the kmem resource counter of the current memory cgroup. [sfr@canb.auug.org.au: export kmalloc_order() to modules] Signed-off-by: Vladimir Davydov Acked-by: Greg Thelen Cc: Johannes Weiner Acked-by: Michal Hocko Cc: Glauber Costa Cc: Christoph Lameter Cc: Pekka Enberg Signed-off-by: Stephen Rothwell Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- include/linux/gfp.h | 10 +++++--- include/linux/memcontrol.h | 2 +- include/linux/slab.h | 11 +------- include/linux/thread_info.h | 2 -- include/trace/events/gfpflags.h | 1 - kernel/fork.c | 6 ++--- mm/memcontrol.c | 11 ++++---- mm/page_alloc.c | 56 +++++++++++++++++++++++++---------------- mm/slab_common.c | 13 ++++++++++ mm/slub.c | 6 ++--- 10 files changed, 68 insertions(+), 50 deletions(-) (limited to 'include/linux/memcontrol.h') diff --git a/include/linux/gfp.h b/include/linux/gfp.h index 39b81dc7d01a..d382db71e300 100644 --- a/include/linux/gfp.h +++ b/include/linux/gfp.h @@ -31,7 +31,6 @@ struct vm_area_struct; #define ___GFP_HARDWALL 0x20000u #define ___GFP_THISNODE 0x40000u #define ___GFP_RECLAIMABLE 0x80000u -#define ___GFP_KMEMCG 0x100000u #define ___GFP_NOTRACK 0x200000u #define ___GFP_NO_KSWAPD 0x400000u #define ___GFP_OTHER_NODE 0x800000u @@ -91,7 +90,6 @@ struct vm_area_struct; #define __GFP_NO_KSWAPD ((__force gfp_t)___GFP_NO_KSWAPD) #define __GFP_OTHER_NODE ((__force gfp_t)___GFP_OTHER_NODE) /* On behalf of other node */ -#define __GFP_KMEMCG ((__force gfp_t)___GFP_KMEMCG) /* Allocation comes from a memcg-accounted resource */ #define __GFP_WRITE ((__force gfp_t)___GFP_WRITE) /* Allocator intends to dirty page */ /* @@ -353,6 +351,10 @@ extern struct page *alloc_pages_vma(gfp_t gfp_mask, int order, #define alloc_page_vma_node(gfp_mask, vma, addr, node) \ alloc_pages_vma(gfp_mask, 0, vma, addr, node) +extern struct page *alloc_kmem_pages(gfp_t gfp_mask, unsigned int order); +extern struct page *alloc_kmem_pages_node(int nid, gfp_t gfp_mask, + unsigned int order); + extern unsigned long __get_free_pages(gfp_t gfp_mask, unsigned int order); extern unsigned long get_zeroed_page(gfp_t gfp_mask); @@ -372,8 +374,8 @@ extern void free_pages(unsigned long addr, unsigned int order); extern void free_hot_cold_page(struct page *page, int cold); extern void free_hot_cold_page_list(struct list_head *list, int cold); -extern void __free_memcg_kmem_pages(struct page *page, unsigned int order); -extern void free_memcg_kmem_pages(unsigned long addr, unsigned int order); +extern void __free_kmem_pages(struct page *page, unsigned int order); +extern void free_kmem_pages(unsigned long addr, unsigned int order); #define __free_page(page) __free_pages((page), 0) #define free_page(addr) free_pages((addr), 0) diff --git a/include/linux/memcontrol.h b/include/linux/memcontrol.h index 96e5d2573eb0..5155d09e749d 100644 --- a/include/linux/memcontrol.h +++ b/include/linux/memcontrol.h @@ -537,7 +537,7 @@ memcg_kmem_newpage_charge(gfp_t gfp, struct mem_cgroup **memcg, int order) * res_counter_charge_nofail, but we hope those allocations are rare, * and won't be worth the trouble. */ - if (!(gfp & __GFP_KMEMCG) || (gfp & __GFP_NOFAIL)) + if (gfp & __GFP_NOFAIL) return true; if (in_interrupt() || (!current->mm) || (current->flags & PF_KTHREAD)) return true; diff --git a/include/linux/slab.h b/include/linux/slab.h index 307bfbe62387..a6aab2c0dfc5 100644 --- a/include/linux/slab.h +++ b/include/linux/slab.h @@ -369,16 +369,7 @@ kmem_cache_alloc_node_trace(struct kmem_cache *s, #include #endif -static __always_inline void * -kmalloc_order(size_t size, gfp_t flags, unsigned int order) -{ - void *ret; - - flags |= (__GFP_COMP | __GFP_KMEMCG); - ret = (void *) __get_free_pages(flags, order); - kmemleak_alloc(ret, size, 1, flags); - return ret; -} +extern void *kmalloc_order(size_t size, gfp_t flags, unsigned int order); #ifdef CONFIG_TRACING extern void *kmalloc_order_trace(size_t size, gfp_t flags, unsigned int order); diff --git a/include/linux/thread_info.h b/include/linux/thread_info.h index cb0cec94fda3..ff307b548ed3 100644 --- a/include/linux/thread_info.h +++ b/include/linux/thread_info.h @@ -61,8 +61,6 @@ extern long do_no_restart_syscall(struct restart_block *parm); # define THREADINFO_GFP (GFP_KERNEL | __GFP_NOTRACK) #endif -#define THREADINFO_GFP_ACCOUNTED (THREADINFO_GFP | __GFP_KMEMCG) - /* * flag set/clear/test wrappers * - pass TIF_xxxx constants to these functions diff --git a/include/trace/events/gfpflags.h b/include/trace/events/gfpflags.h index 1eddbf1557f2..d6fd8e5b14b7 100644 --- a/include/trace/events/gfpflags.h +++ b/include/trace/events/gfpflags.h @@ -34,7 +34,6 @@ {(unsigned long)__GFP_HARDWALL, "GFP_HARDWALL"}, \ {(unsigned long)__GFP_THISNODE, "GFP_THISNODE"}, \ {(unsigned long)__GFP_RECLAIMABLE, "GFP_RECLAIMABLE"}, \ - {(unsigned long)__GFP_KMEMCG, "GFP_KMEMCG"}, \ {(unsigned long)__GFP_MOVABLE, "GFP_MOVABLE"}, \ {(unsigned long)__GFP_NOTRACK, "GFP_NOTRACK"}, \ {(unsigned long)__GFP_NO_KSWAPD, "GFP_NO_KSWAPD"}, \ diff --git a/kernel/fork.c b/kernel/fork.c index 54a8d26f612f..59e3dcc5b8f2 100644 --- a/kernel/fork.c +++ b/kernel/fork.c @@ -150,15 +150,15 @@ void __weak arch_release_thread_info(struct thread_info *ti) static struct thread_info *alloc_thread_info_node(struct task_struct *tsk, int node) { - struct page *page = alloc_pages_node(node, THREADINFO_GFP_ACCOUNTED, - THREAD_SIZE_ORDER); + struct page *page = alloc_kmem_pages_node(node, THREADINFO_GFP, + THREAD_SIZE_ORDER); return page ? page_address(page) : NULL; } static inline void free_thread_info(struct thread_info *ti) { - free_memcg_kmem_pages((unsigned long)ti, THREAD_SIZE_ORDER); + free_kmem_pages((unsigned long)ti, THREAD_SIZE_ORDER); } # else static struct kmem_cache *thread_info_cache; diff --git a/mm/memcontrol.c b/mm/memcontrol.c index 56a768b3d5a8..7bab1de50f48 100644 --- a/mm/memcontrol.c +++ b/mm/memcontrol.c @@ -3540,11 +3540,12 @@ __memcg_kmem_newpage_charge(gfp_t gfp, struct mem_cgroup **_memcg, int order) /* * Disabling accounting is only relevant for some specific memcg * internal allocations. Therefore we would initially not have such - * check here, since direct calls to the page allocator that are marked - * with GFP_KMEMCG only happen outside memcg core. We are mostly - * concerned with cache allocations, and by having this test at - * memcg_kmem_get_cache, we are already able to relay the allocation to - * the root cache and bypass the memcg cache altogether. + * check here, since direct calls to the page allocator that are + * accounted to kmemcg (alloc_kmem_pages and friends) only happen + * outside memcg core. We are mostly concerned with cache allocations, + * and by having this test at memcg_kmem_get_cache, we are already able + * to relay the allocation to the root cache and bypass the memcg cache + * altogether. * * There is one exception, though: the SLUB allocator does not create * large order caches, but rather service large kmallocs directly from diff --git a/mm/page_alloc.c b/mm/page_alloc.c index 5dba2933c9c0..7cfdcd808f52 100644 --- a/mm/page_alloc.c +++ b/mm/page_alloc.c @@ -2697,7 +2697,6 @@ __alloc_pages_nodemask(gfp_t gfp_mask, unsigned int order, int migratetype = allocflags_to_migratetype(gfp_mask); unsigned int cpuset_mems_cookie; int alloc_flags = ALLOC_WMARK_LOW|ALLOC_CPUSET|ALLOC_FAIR; - struct mem_cgroup *memcg = NULL; gfp_mask &= gfp_allowed_mask; @@ -2716,13 +2715,6 @@ __alloc_pages_nodemask(gfp_t gfp_mask, unsigned int order, if (unlikely(!zonelist->_zonerefs->zone)) return NULL; - /* - * Will only have any effect when __GFP_KMEMCG is set. This is - * verified in the (always inline) callee - */ - if (!memcg_kmem_newpage_charge(gfp_mask, &memcg, order)) - return NULL; - retry_cpuset: cpuset_mems_cookie = read_mems_allowed_begin(); @@ -2782,8 +2774,6 @@ out: if (unlikely(!page && read_mems_allowed_retry(cpuset_mems_cookie))) goto retry_cpuset; - memcg_kmem_commit_charge(page, memcg, order); - return page; } EXPORT_SYMBOL(__alloc_pages_nodemask); @@ -2837,27 +2827,51 @@ void free_pages(unsigned long addr, unsigned int order) EXPORT_SYMBOL(free_pages); /* - * __free_memcg_kmem_pages and free_memcg_kmem_pages will free - * pages allocated with __GFP_KMEMCG. + * alloc_kmem_pages charges newly allocated pages to the kmem resource counter + * of the current memory cgroup. * - * Those pages are accounted to a particular memcg, embedded in the - * corresponding page_cgroup. To avoid adding a hit in the allocator to search - * for that information only to find out that it is NULL for users who have no - * interest in that whatsoever, we provide these functions. - * - * The caller knows better which flags it relies on. + * It should be used when the caller would like to use kmalloc, but since the + * allocation is large, it has to fall back to the page allocator. + */ +struct page *alloc_kmem_pages(gfp_t gfp_mask, unsigned int order) +{ + struct page *page; + struct mem_cgroup *memcg = NULL; + + if (!memcg_kmem_newpage_charge(gfp_mask, &memcg, order)) + return NULL; + page = alloc_pages(gfp_mask, order); + memcg_kmem_commit_charge(page, memcg, order); + return page; +} + +struct page *alloc_kmem_pages_node(int nid, gfp_t gfp_mask, unsigned int order) +{ + struct page *page; + struct mem_cgroup *memcg = NULL; + + if (!memcg_kmem_newpage_charge(gfp_mask, &memcg, order)) + return NULL; + page = alloc_pages_node(nid, gfp_mask, order); + memcg_kmem_commit_charge(page, memcg, order); + return page; +} + +/* + * __free_kmem_pages and free_kmem_pages will free pages allocated with + * alloc_kmem_pages. */ -void __free_memcg_kmem_pages(struct page *page, unsigned int order) +void __free_kmem_pages(struct page *page, unsigned int order) { memcg_kmem_uncharge_pages(page, order); __free_pages(page, order); } -void free_memcg_kmem_pages(unsigned long addr, unsigned int order) +void free_kmem_pages(unsigned long addr, unsigned int order) { if (addr != 0) { VM_BUG_ON(!virt_addr_valid((void *)addr)); - __free_memcg_kmem_pages(virt_to_page((void *)addr), order); + __free_kmem_pages(virt_to_page((void *)addr), order); } } diff --git a/mm/slab_common.c b/mm/slab_common.c index 06f0c6125632..1950c8f4d1a6 100644 --- a/mm/slab_common.c +++ b/mm/slab_common.c @@ -582,6 +582,19 @@ void __init create_kmalloc_caches(unsigned long flags) } #endif /* !CONFIG_SLOB */ +void *kmalloc_order(size_t size, gfp_t flags, unsigned int order) +{ + void *ret; + struct page *page; + + flags |= __GFP_COMP; + page = alloc_kmem_pages(flags, order); + ret = page ? page_address(page) : NULL; + kmemleak_alloc(ret, size, 1, flags); + return ret; +} +EXPORT_SYMBOL(kmalloc_order); + #ifdef CONFIG_TRACING void *kmalloc_order_trace(size_t size, gfp_t flags, unsigned int order) { diff --git a/mm/slub.c b/mm/slub.c index fc9831851be6..ddb60795f373 100644 --- a/mm/slub.c +++ b/mm/slub.c @@ -3311,8 +3311,8 @@ static void *kmalloc_large_node(size_t size, gfp_t flags, int node) struct page *page; void *ptr = NULL; - flags |= __GFP_COMP | __GFP_NOTRACK | __GFP_KMEMCG; - page = alloc_pages_node(node, flags, get_order(size)); + flags |= __GFP_COMP | __GFP_NOTRACK; + page = alloc_kmem_pages_node(node, flags, get_order(size)); if (page) ptr = page_address(page); @@ -3381,7 +3381,7 @@ void kfree(const void *x) if (unlikely(!PageSlab(page))) { BUG_ON(!PageCompound(page)); kfree_hook(x); - __free_memcg_kmem_pages(page, compound_order(page)); + __free_kmem_pages(page, compound_order(page)); return; } slab_free(page->slab_cache, page, object, _RET_IP_); -- cgit v1.2.3 From 1e32e77f95d60b121b6072e3e3a650a7f93068f9 Mon Sep 17 00:00:00 2001 From: Vladimir Davydov Date: Wed, 4 Jun 2014 16:07:37 -0700 Subject: memcg, slab: do not schedule cache destruction when last page goes away This patchset is a part of preparations for kmemcg re-parenting. It targets at simplifying kmemcg work-flows and synchronization. First, it removes async per memcg cache destruction (see patches 1, 2). Now caches are only destroyed on memcg offline. That means the caches that are not empty on memcg offline will be leaked. However, they are already leaked, because memcg_cache_params::nr_pages normally never drops to 0 so the destruction work is never scheduled except kmem_cache_shrink is called explicitly. In the future I'm planning reaping such dead caches on vmpressure or periodically. Second, it substitutes per memcg slab_caches_mutex's with the global memcg_slab_mutex, which should be taken during the whole per memcg cache creation/destruction path before the slab_mutex (see patch 3). This greatly simplifies synchronization among various per memcg cache creation/destruction paths. I'm still not quite sure about the end picture, in particular I don't know whether we should reap dead memcgs' kmem caches periodically or try to merge them with their parents (see https://lkml.org/lkml/2014/4/20/38 for more details), but whichever way we choose, this set looks like a reasonable change to me, because it greatly simplifies kmemcg work-flows and eases further development. This patch (of 3): After a memcg is offlined, we mark its kmem caches that cannot be deleted right now due to pending objects as dead by setting the memcg_cache_params::dead flag, so that memcg_release_pages will schedule cache destruction (memcg_cache_params::destroy) as soon as the last slab of the cache is freed (memcg_cache_params::nr_pages drops to zero). I guess the idea was to destroy the caches as soon as possible, i.e. immediately after freeing the last object. However, it just doesn't work that way, because kmem caches always preserve some pages for the sake of performance, so that nr_pages never gets to zero unless the cache is shrunk explicitly using kmem_cache_shrink. Of course, we could account the total number of objects on the cache or check if all the slabs allocated for the cache are empty on kmem_cache_free and schedule destruction if so, but that would be too costly. Thus we have a piece of code that works only when we explicitly call kmem_cache_shrink, but complicates the whole picture a lot. Moreover, it's racy in fact. For instance, kmem_cache_shrink may free the last slab and thus schedule cache destruction before it finishes checking that the cache is empty, which can lead to use-after-free. So I propose to remove this async cache destruction from memcg_release_pages, and check if the cache is empty explicitly after calling kmem_cache_shrink instead. This will simplify things a lot w/o introducing any functional changes. And regarding dead memcg caches (i.e. those that are left hanging around after memcg offline for they have objects), I suppose we should reap them either periodically or on vmpressure as Glauber suggested initially. I'm going to implement this later. Signed-off-by: Vladimir Davydov Acked-by: Johannes Weiner Cc: Michal Hocko Cc: Glauber Costa Cc: Pekka Enberg Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- include/linux/memcontrol.h | 1 - include/linux/slab.h | 2 -- mm/memcontrol.c | 63 ++-------------------------------------------- mm/slab.h | 7 ++---- 4 files changed, 4 insertions(+), 69 deletions(-) (limited to 'include/linux/memcontrol.h') diff --git a/include/linux/memcontrol.h b/include/linux/memcontrol.h index 5155d09e749d..087a45314181 100644 --- a/include/linux/memcontrol.h +++ b/include/linux/memcontrol.h @@ -509,7 +509,6 @@ __memcg_kmem_get_cache(struct kmem_cache *cachep, gfp_t gfp); int memcg_charge_kmem(struct mem_cgroup *memcg, gfp_t gfp, u64 size); void memcg_uncharge_kmem(struct mem_cgroup *memcg, u64 size); -void mem_cgroup_destroy_cache(struct kmem_cache *cachep); int __kmem_cache_destroy_memcg_children(struct kmem_cache *s); /** diff --git a/include/linux/slab.h b/include/linux/slab.h index a6aab2c0dfc5..905541dd3778 100644 --- a/include/linux/slab.h +++ b/include/linux/slab.h @@ -524,7 +524,6 @@ static __always_inline void *kmalloc_node(size_t size, gfp_t flags, int node) * @memcg: pointer to the memcg this cache belongs to * @list: list_head for the list of all caches in this memcg * @root_cache: pointer to the global, root cache, this cache was derived from - * @dead: set to true after the memcg dies; the cache may still be around. * @nr_pages: number of pages that belongs to this cache. * @destroy: worker to be called whenever we are ready, or believe we may be * ready, to destroy this cache. @@ -540,7 +539,6 @@ struct memcg_cache_params { struct mem_cgroup *memcg; struct list_head list; struct kmem_cache *root_cache; - bool dead; atomic_t nr_pages; struct work_struct destroy; }; diff --git a/mm/memcontrol.c b/mm/memcontrol.c index 9f4ff49c6add..6b1c45ced733 100644 --- a/mm/memcontrol.c +++ b/mm/memcontrol.c @@ -3277,60 +3277,11 @@ static void kmem_cache_destroy_work_func(struct work_struct *w) cachep = memcg_params_to_cache(p); - /* - * If we get down to 0 after shrink, we could delete right away. - * However, memcg_release_pages() already puts us back in the workqueue - * in that case. If we proceed deleting, we'll get a dangling - * reference, and removing the object from the workqueue in that case - * is unnecessary complication. We are not a fast path. - * - * Note that this case is fundamentally different from racing with - * shrink_slab(): if memcg_cgroup_destroy_cache() is called in - * kmem_cache_shrink, not only we would be reinserting a dead cache - * into the queue, but doing so from inside the worker racing to - * destroy it. - * - * So if we aren't down to zero, we'll just schedule a worker and try - * again - */ - if (atomic_read(&cachep->memcg_params->nr_pages) != 0) - kmem_cache_shrink(cachep); - else + kmem_cache_shrink(cachep); + if (atomic_read(&cachep->memcg_params->nr_pages) == 0) kmem_cache_destroy(cachep); } -void mem_cgroup_destroy_cache(struct kmem_cache *cachep) -{ - if (!cachep->memcg_params->dead) - return; - - /* - * There are many ways in which we can get here. - * - * We can get to a memory-pressure situation while the delayed work is - * still pending to run. The vmscan shrinkers can then release all - * cache memory and get us to destruction. If this is the case, we'll - * be executed twice, which is a bug (the second time will execute over - * bogus data). In this case, cancelling the work should be fine. - * - * But we can also get here from the worker itself, if - * kmem_cache_shrink is enough to shake all the remaining objects and - * get the page count to 0. In this case, we'll deadlock if we try to - * cancel the work (the worker runs with an internal lock held, which - * is the same lock we would hold for cancel_work_sync().) - * - * Since we can't possibly know who got us here, just refrain from - * running if there is already work pending - */ - if (work_pending(&cachep->memcg_params->destroy)) - return; - /* - * We have to defer the actual destroying to a workqueue, because - * we might currently be in a context that cannot sleep. - */ - schedule_work(&cachep->memcg_params->destroy); -} - int __kmem_cache_destroy_memcg_children(struct kmem_cache *s) { struct kmem_cache *c; @@ -3356,16 +3307,7 @@ int __kmem_cache_destroy_memcg_children(struct kmem_cache *s) * We will now manually delete the caches, so to avoid races * we need to cancel all pending destruction workers and * proceed with destruction ourselves. - * - * kmem_cache_destroy() will call kmem_cache_shrink internally, - * and that could spawn the workers again: it is likely that - * the cache still have active pages until this very moment. - * This would lead us back to mem_cgroup_destroy_cache. - * - * But that will not execute at all if the "dead" flag is not - * set, so flip it down to guarantee we are in control. */ - c->memcg_params->dead = false; cancel_work_sync(&c->memcg_params->destroy); kmem_cache_destroy(c); @@ -3387,7 +3329,6 @@ static void mem_cgroup_destroy_all_caches(struct mem_cgroup *memcg) mutex_lock(&memcg->slab_caches_mutex); list_for_each_entry(params, &memcg->memcg_slab_caches, list) { cachep = memcg_params_to_cache(params); - cachep->memcg_params->dead = true; schedule_work(&cachep->memcg_params->destroy); } mutex_unlock(&memcg->slab_caches_mutex); diff --git a/mm/slab.h b/mm/slab.h index d85d59803d5f..b59447ac4533 100644 --- a/mm/slab.h +++ b/mm/slab.h @@ -129,11 +129,8 @@ static inline void memcg_bind_pages(struct kmem_cache *s, int order) static inline void memcg_release_pages(struct kmem_cache *s, int order) { - if (is_root_cache(s)) - return; - - if (atomic_sub_and_test((1 << order), &s->memcg_params->nr_pages)) - mem_cgroup_destroy_cache(s); + if (!is_root_cache(s)) + atomic_sub(1 << order, &s->memcg_params->nr_pages); } static inline bool slab_equal_or_root(struct kmem_cache *s, -- cgit v1.2.3 From c67a8a685a6e9abbaf0235e084168f15a721ae39 Mon Sep 17 00:00:00 2001 From: Vladimir Davydov Date: Wed, 4 Jun 2014 16:07:39 -0700 Subject: memcg, slab: merge memcg_{bind,release}_pages to memcg_{un}charge_slab Currently we have two pairs of kmemcg-related functions that are called on slab alloc/free. The first is memcg_{bind,release}_pages that count the total number of pages allocated on a kmem cache. The second is memcg_{un}charge_slab that {un}charge slab pages to kmemcg resource counter. Let's just merge them to keep the code clean. Signed-off-by: Vladimir Davydov Acked-by: Johannes Weiner Cc: Michal Hocko Cc: Glauber Costa Cc: Pekka Enberg Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- include/linux/memcontrol.h | 4 ++-- mm/memcontrol.c | 22 ++++++++++++++++++++-- mm/slab.c | 2 -- mm/slab.h | 25 ++----------------------- mm/slub.c | 2 -- 5 files changed, 24 insertions(+), 31 deletions(-) (limited to 'include/linux/memcontrol.h') diff --git a/include/linux/memcontrol.h b/include/linux/memcontrol.h index 087a45314181..d38d190f4cec 100644 --- a/include/linux/memcontrol.h +++ b/include/linux/memcontrol.h @@ -506,8 +506,8 @@ void memcg_update_array_size(int num_groups); struct kmem_cache * __memcg_kmem_get_cache(struct kmem_cache *cachep, gfp_t gfp); -int memcg_charge_kmem(struct mem_cgroup *memcg, gfp_t gfp, u64 size); -void memcg_uncharge_kmem(struct mem_cgroup *memcg, u64 size); +int __memcg_charge_slab(struct kmem_cache *cachep, gfp_t gfp, int order); +void __memcg_uncharge_slab(struct kmem_cache *cachep, int order); int __kmem_cache_destroy_memcg_children(struct kmem_cache *s); diff --git a/mm/memcontrol.c b/mm/memcontrol.c index 6b1c45ced733..86a2078805e5 100644 --- a/mm/memcontrol.c +++ b/mm/memcontrol.c @@ -2954,7 +2954,7 @@ static int mem_cgroup_slabinfo_read(struct seq_file *m, void *v) } #endif -int memcg_charge_kmem(struct mem_cgroup *memcg, gfp_t gfp, u64 size) +static int memcg_charge_kmem(struct mem_cgroup *memcg, gfp_t gfp, u64 size) { struct res_counter *fail_res; int ret = 0; @@ -2992,7 +2992,7 @@ int memcg_charge_kmem(struct mem_cgroup *memcg, gfp_t gfp, u64 size) return ret; } -void memcg_uncharge_kmem(struct mem_cgroup *memcg, u64 size) +static void memcg_uncharge_kmem(struct mem_cgroup *memcg, u64 size) { res_counter_uncharge(&memcg->res, size); if (do_swap_account) @@ -3390,6 +3390,24 @@ static void memcg_create_cache_enqueue(struct mem_cgroup *memcg, __memcg_create_cache_enqueue(memcg, cachep); memcg_resume_kmem_account(); } + +int __memcg_charge_slab(struct kmem_cache *cachep, gfp_t gfp, int order) +{ + int res; + + res = memcg_charge_kmem(cachep->memcg_params->memcg, gfp, + PAGE_SIZE << order); + if (!res) + atomic_add(1 << order, &cachep->memcg_params->nr_pages); + return res; +} + +void __memcg_uncharge_slab(struct kmem_cache *cachep, int order) +{ + memcg_uncharge_kmem(cachep->memcg_params->memcg, PAGE_SIZE << order); + atomic_sub(1 << order, &cachep->memcg_params->nr_pages); +} + /* * Return the kmem_cache we're supposed to use for a slab allocation. * We try to use the current memcg's version of the cache. diff --git a/mm/slab.c b/mm/slab.c index 7067ea7f3927..9ca3b87edabc 100644 --- a/mm/slab.c +++ b/mm/slab.c @@ -1712,7 +1712,6 @@ static struct page *kmem_getpages(struct kmem_cache *cachep, gfp_t flags, __SetPageSlab(page); if (page->pfmemalloc) SetPageSlabPfmemalloc(page); - memcg_bind_pages(cachep, cachep->gfporder); if (kmemcheck_enabled && !(cachep->flags & SLAB_NOTRACK)) { kmemcheck_alloc_shadow(page, cachep->gfporder, flags, nodeid); @@ -1748,7 +1747,6 @@ static void kmem_freepages(struct kmem_cache *cachep, struct page *page) page_mapcount_reset(page); page->mapping = NULL; - memcg_release_pages(cachep, cachep->gfporder); if (current->reclaim_state) current->reclaim_state->reclaimed_slab += nr_freed; __free_pages(page, cachep->gfporder); diff --git a/mm/slab.h b/mm/slab.h index b59447ac4533..961a3fb1f5a2 100644 --- a/mm/slab.h +++ b/mm/slab.h @@ -121,18 +121,6 @@ static inline bool is_root_cache(struct kmem_cache *s) return !s->memcg_params || s->memcg_params->is_root_cache; } -static inline void memcg_bind_pages(struct kmem_cache *s, int order) -{ - if (!is_root_cache(s)) - atomic_add(1 << order, &s->memcg_params->nr_pages); -} - -static inline void memcg_release_pages(struct kmem_cache *s, int order) -{ - if (!is_root_cache(s)) - atomic_sub(1 << order, &s->memcg_params->nr_pages); -} - static inline bool slab_equal_or_root(struct kmem_cache *s, struct kmem_cache *p) { @@ -198,8 +186,7 @@ static __always_inline int memcg_charge_slab(struct kmem_cache *s, return 0; if (is_root_cache(s)) return 0; - return memcg_charge_kmem(s->memcg_params->memcg, gfp, - PAGE_SIZE << order); + return __memcg_charge_slab(s, gfp, order); } static __always_inline void memcg_uncharge_slab(struct kmem_cache *s, int order) @@ -208,7 +195,7 @@ static __always_inline void memcg_uncharge_slab(struct kmem_cache *s, int order) return; if (is_root_cache(s)) return; - memcg_uncharge_kmem(s->memcg_params->memcg, PAGE_SIZE << order); + __memcg_uncharge_slab(s, order); } #else static inline bool is_root_cache(struct kmem_cache *s) @@ -216,14 +203,6 @@ static inline bool is_root_cache(struct kmem_cache *s) return true; } -static inline void memcg_bind_pages(struct kmem_cache *s, int order) -{ -} - -static inline void memcg_release_pages(struct kmem_cache *s, int order) -{ -} - static inline bool slab_equal_or_root(struct kmem_cache *s, struct kmem_cache *p) { diff --git a/mm/slub.c b/mm/slub.c index 5d1b653183ab..9e288d7c5e6a 100644 --- a/mm/slub.c +++ b/mm/slub.c @@ -1422,7 +1422,6 @@ static struct page *new_slab(struct kmem_cache *s, gfp_t flags, int node) order = compound_order(page); inc_slabs_node(s, page_to_nid(page), page->objects); - memcg_bind_pages(s, order); page->slab_cache = s; __SetPageSlab(page); if (page->pfmemalloc) @@ -1473,7 +1472,6 @@ static void __free_slab(struct kmem_cache *s, struct page *page) __ClearPageSlabPfmemalloc(page); __ClearPageSlab(page); - memcg_release_pages(s, order); page_mapcount_reset(page); if (current->reclaim_state) current->reclaim_state->reclaimed_slab += pages; -- cgit v1.2.3 From bd67314586a3d5725e60f2f6587b4cb0f659bb67 Mon Sep 17 00:00:00 2001 From: Vladimir Davydov Date: Wed, 4 Jun 2014 16:07:40 -0700 Subject: memcg, slab: simplify synchronization scheme At present, we have the following mutexes protecting data related to per memcg kmem caches: - slab_mutex. This one is held during the whole kmem cache creation and destruction paths. We also take it when updating per root cache memcg_caches arrays (see memcg_update_all_caches). As a result, taking it guarantees there will be no changes to any kmem cache (including per memcg). Why do we need something else then? The point is it is private to slab implementation and has some internal dependencies with other mutexes (get_online_cpus). So we just don't want to rely upon it and prefer to introduce additional mutexes instead. - activate_kmem_mutex. Initially it was added to synchronize initializing kmem limit (memcg_activate_kmem). However, since we can grow per root cache memcg_caches arrays only on kmem limit initialization (see memcg_update_all_caches), we also employ it to protect against memcg_caches arrays relocation (e.g. see __kmem_cache_destroy_memcg_children). - We have a convention not to take slab_mutex in memcontrol.c, but we want to walk over per memcg memcg_slab_caches lists there (e.g. for destroying all memcg caches on offline). So we have per memcg slab_caches_mutex's protecting those lists. The mutexes are taken in the following order: activate_kmem_mutex -> slab_mutex -> memcg::slab_caches_mutex Such a syncrhonization scheme has a number of flaws, for instance: - We can't call kmem_cache_{destroy,shrink} while walking over a memcg::memcg_slab_caches list due to locking order. As a result, in mem_cgroup_destroy_all_caches we schedule the memcg_cache_params::destroy work shrinking and destroying the cache. - We don't have a mutex to synchronize per memcg caches destruction between memcg offline (mem_cgroup_destroy_all_caches) and root cache destruction (__kmem_cache_destroy_memcg_children). Currently we just don't bother about it. This patch simplifies it by substituting per memcg slab_caches_mutex's with the global memcg_slab_mutex. It will be held whenever a new per memcg cache is created or destroyed, so it protects per root cache memcg_caches arrays and per memcg memcg_slab_caches lists. The locking order is following: activate_kmem_mutex -> memcg_slab_mutex -> slab_mutex This allows us to call kmem_cache_{create,shrink,destroy} under the memcg_slab_mutex. As a result, we don't need memcg_cache_params::destroy work any more - we can simply destroy caches while iterating over a per memcg slab caches list. Also using the global mutex simplifies synchronization between concurrent per memcg caches creation/destruction, e.g. mem_cgroup_destroy_all_caches vs __kmem_cache_destroy_memcg_children. The downside of this is that we substitute per-memcg slab_caches_mutex's with a hummer-like global mutex, but since we already take either the slab_mutex or the cgroup_mutex along with a memcg::slab_caches_mutex, it shouldn't hurt concurrency a lot. Signed-off-by: Vladimir Davydov Acked-by: Johannes Weiner Cc: Michal Hocko Cc: Glauber Costa Cc: Pekka Enberg Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- include/linux/memcontrol.h | 10 --- include/linux/slab.h | 6 +- mm/memcontrol.c | 150 ++++++++++++++++++--------------------------- mm/slab_common.c | 23 +++---- 4 files changed, 69 insertions(+), 120 deletions(-) (limited to 'include/linux/memcontrol.h') diff --git a/include/linux/memcontrol.h b/include/linux/memcontrol.h index d38d190f4cec..1fa23244fe37 100644 --- a/include/linux/memcontrol.h +++ b/include/linux/memcontrol.h @@ -497,8 +497,6 @@ char *memcg_create_cache_name(struct mem_cgroup *memcg, int memcg_alloc_cache_params(struct mem_cgroup *memcg, struct kmem_cache *s, struct kmem_cache *root_cache); void memcg_free_cache_params(struct kmem_cache *s); -void memcg_register_cache(struct kmem_cache *s); -void memcg_unregister_cache(struct kmem_cache *s); int memcg_update_cache_size(struct kmem_cache *s, int num_groups); void memcg_update_array_size(int num_groups); @@ -640,14 +638,6 @@ static inline void memcg_free_cache_params(struct kmem_cache *s) { } -static inline void memcg_register_cache(struct kmem_cache *s) -{ -} - -static inline void memcg_unregister_cache(struct kmem_cache *s) -{ -} - static inline struct kmem_cache * memcg_kmem_get_cache(struct kmem_cache *cachep, gfp_t gfp) { diff --git a/include/linux/slab.h b/include/linux/slab.h index 905541dd3778..ecbec9ccb80d 100644 --- a/include/linux/slab.h +++ b/include/linux/slab.h @@ -116,7 +116,8 @@ struct kmem_cache *kmem_cache_create(const char *, size_t, size_t, unsigned long, void (*)(void *)); #ifdef CONFIG_MEMCG_KMEM -void kmem_cache_create_memcg(struct mem_cgroup *, struct kmem_cache *); +struct kmem_cache *kmem_cache_create_memcg(struct mem_cgroup *, + struct kmem_cache *); #endif void kmem_cache_destroy(struct kmem_cache *); int kmem_cache_shrink(struct kmem_cache *); @@ -525,8 +526,6 @@ static __always_inline void *kmalloc_node(size_t size, gfp_t flags, int node) * @list: list_head for the list of all caches in this memcg * @root_cache: pointer to the global, root cache, this cache was derived from * @nr_pages: number of pages that belongs to this cache. - * @destroy: worker to be called whenever we are ready, or believe we may be - * ready, to destroy this cache. */ struct memcg_cache_params { bool is_root_cache; @@ -540,7 +539,6 @@ struct memcg_cache_params { struct list_head list; struct kmem_cache *root_cache; atomic_t nr_pages; - struct work_struct destroy; }; }; }; diff --git a/mm/memcontrol.c b/mm/memcontrol.c index 86a2078805e5..6b448881422b 100644 --- a/mm/memcontrol.c +++ b/mm/memcontrol.c @@ -357,10 +357,9 @@ struct mem_cgroup { struct cg_proto tcp_mem; #endif #if defined(CONFIG_MEMCG_KMEM) - /* analogous to slab_common's slab_caches list. per-memcg */ + /* analogous to slab_common's slab_caches list, but per-memcg; + * protected by memcg_slab_mutex */ struct list_head memcg_slab_caches; - /* Not a spinlock, we can take a lot of time walking the list */ - struct mutex slab_caches_mutex; /* Index in the kmem_cache->memcg_params->memcg_caches array */ int kmemcg_id; #endif @@ -2913,6 +2912,12 @@ static void __mem_cgroup_commit_charge(struct mem_cgroup *memcg, static DEFINE_MUTEX(set_limit_mutex); #ifdef CONFIG_MEMCG_KMEM +/* + * The memcg_slab_mutex is held whenever a per memcg kmem cache is created or + * destroyed. It protects memcg_caches arrays and memcg_slab_caches lists. + */ +static DEFINE_MUTEX(memcg_slab_mutex); + static DEFINE_MUTEX(activate_kmem_mutex); static inline bool memcg_can_account_kmem(struct mem_cgroup *memcg) @@ -2945,10 +2950,10 @@ static int mem_cgroup_slabinfo_read(struct seq_file *m, void *v) print_slabinfo_header(m); - mutex_lock(&memcg->slab_caches_mutex); + mutex_lock(&memcg_slab_mutex); list_for_each_entry(params, &memcg->memcg_slab_caches, list) cache_show(memcg_params_to_cache(params), m); - mutex_unlock(&memcg->slab_caches_mutex); + mutex_unlock(&memcg_slab_mutex); return 0; } @@ -3050,8 +3055,6 @@ void memcg_update_array_size(int num) memcg_limited_groups_array_size = memcg_caches_array_size(num); } -static void kmem_cache_destroy_work_func(struct work_struct *w); - int memcg_update_cache_size(struct kmem_cache *s, int num_groups) { struct memcg_cache_params *cur_params = s->memcg_params; @@ -3148,8 +3151,6 @@ int memcg_alloc_cache_params(struct mem_cgroup *memcg, struct kmem_cache *s, if (memcg) { s->memcg_params->memcg = memcg; s->memcg_params->root_cache = root_cache; - INIT_WORK(&s->memcg_params->destroy, - kmem_cache_destroy_work_func); css_get(&memcg->css); } else s->memcg_params->is_root_cache = true; @@ -3166,24 +3167,34 @@ void memcg_free_cache_params(struct kmem_cache *s) kfree(s->memcg_params); } -void memcg_register_cache(struct kmem_cache *s) +static void memcg_kmem_create_cache(struct mem_cgroup *memcg, + struct kmem_cache *root_cache) { - struct kmem_cache *root; - struct mem_cgroup *memcg; + struct kmem_cache *cachep; int id; - if (is_root_cache(s)) + lockdep_assert_held(&memcg_slab_mutex); + + id = memcg_cache_id(memcg); + + /* + * Since per-memcg caches are created asynchronously on first + * allocation (see memcg_kmem_get_cache()), several threads can try to + * create the same cache, but only one of them may succeed. + */ + if (cache_from_memcg_idx(root_cache, id)) return; + cachep = kmem_cache_create_memcg(memcg, root_cache); /* - * Holding the slab_mutex assures nobody will touch the memcg_caches - * array while we are modifying it. + * If we could not create a memcg cache, do not complain, because + * that's not critical at all as we can always proceed with the root + * cache. */ - lockdep_assert_held(&slab_mutex); + if (!cachep) + return; - root = s->memcg_params->root_cache; - memcg = s->memcg_params->memcg; - id = memcg_cache_id(memcg); + list_add(&cachep->memcg_params->list, &memcg->memcg_slab_caches); /* * Since readers won't lock (see cache_from_memcg_idx()), we need a @@ -3192,49 +3203,30 @@ void memcg_register_cache(struct kmem_cache *s) */ smp_wmb(); - /* - * Initialize the pointer to this cache in its parent's memcg_params - * before adding it to the memcg_slab_caches list, otherwise we can - * fail to convert memcg_params_to_cache() while traversing the list. - */ - VM_BUG_ON(root->memcg_params->memcg_caches[id]); - root->memcg_params->memcg_caches[id] = s; - - mutex_lock(&memcg->slab_caches_mutex); - list_add(&s->memcg_params->list, &memcg->memcg_slab_caches); - mutex_unlock(&memcg->slab_caches_mutex); + BUG_ON(root_cache->memcg_params->memcg_caches[id]); + root_cache->memcg_params->memcg_caches[id] = cachep; } -void memcg_unregister_cache(struct kmem_cache *s) +static void memcg_kmem_destroy_cache(struct kmem_cache *cachep) { - struct kmem_cache *root; + struct kmem_cache *root_cache; struct mem_cgroup *memcg; int id; - if (is_root_cache(s)) - return; + lockdep_assert_held(&memcg_slab_mutex); - /* - * Holding the slab_mutex assures nobody will touch the memcg_caches - * array while we are modifying it. - */ - lockdep_assert_held(&slab_mutex); + BUG_ON(is_root_cache(cachep)); - root = s->memcg_params->root_cache; - memcg = s->memcg_params->memcg; + root_cache = cachep->memcg_params->root_cache; + memcg = cachep->memcg_params->memcg; id = memcg_cache_id(memcg); - mutex_lock(&memcg->slab_caches_mutex); - list_del(&s->memcg_params->list); - mutex_unlock(&memcg->slab_caches_mutex); + BUG_ON(root_cache->memcg_params->memcg_caches[id] != cachep); + root_cache->memcg_params->memcg_caches[id] = NULL; - /* - * Clear the pointer to this cache in its parent's memcg_params only - * after removing it from the memcg_slab_caches list, otherwise we can - * fail to convert memcg_params_to_cache() while traversing the list. - */ - VM_BUG_ON(root->memcg_params->memcg_caches[id] != s); - root->memcg_params->memcg_caches[id] = NULL; + list_del(&cachep->memcg_params->list); + + kmem_cache_destroy(cachep); } /* @@ -3268,70 +3260,42 @@ static inline void memcg_resume_kmem_account(void) current->memcg_kmem_skip_account--; } -static void kmem_cache_destroy_work_func(struct work_struct *w) -{ - struct kmem_cache *cachep; - struct memcg_cache_params *p; - - p = container_of(w, struct memcg_cache_params, destroy); - - cachep = memcg_params_to_cache(p); - - kmem_cache_shrink(cachep); - if (atomic_read(&cachep->memcg_params->nr_pages) == 0) - kmem_cache_destroy(cachep); -} - int __kmem_cache_destroy_memcg_children(struct kmem_cache *s) { struct kmem_cache *c; int i, failed = 0; - /* - * If the cache is being destroyed, we trust that there is no one else - * requesting objects from it. Even if there are, the sanity checks in - * kmem_cache_destroy should caught this ill-case. - * - * Still, we don't want anyone else freeing memcg_caches under our - * noses, which can happen if a new memcg comes to life. As usual, - * we'll take the activate_kmem_mutex to protect ourselves against - * this. - */ - mutex_lock(&activate_kmem_mutex); + mutex_lock(&memcg_slab_mutex); for_each_memcg_cache_index(i) { c = cache_from_memcg_idx(s, i); if (!c) continue; - /* - * We will now manually delete the caches, so to avoid races - * we need to cancel all pending destruction workers and - * proceed with destruction ourselves. - */ - cancel_work_sync(&c->memcg_params->destroy); - kmem_cache_destroy(c); + memcg_kmem_destroy_cache(c); if (cache_from_memcg_idx(s, i)) failed++; } - mutex_unlock(&activate_kmem_mutex); + mutex_unlock(&memcg_slab_mutex); return failed; } static void mem_cgroup_destroy_all_caches(struct mem_cgroup *memcg) { struct kmem_cache *cachep; - struct memcg_cache_params *params; + struct memcg_cache_params *params, *tmp; if (!memcg_kmem_is_active(memcg)) return; - mutex_lock(&memcg->slab_caches_mutex); - list_for_each_entry(params, &memcg->memcg_slab_caches, list) { + mutex_lock(&memcg_slab_mutex); + list_for_each_entry_safe(params, tmp, &memcg->memcg_slab_caches, list) { cachep = memcg_params_to_cache(params); - schedule_work(&cachep->memcg_params->destroy); + kmem_cache_shrink(cachep); + if (atomic_read(&cachep->memcg_params->nr_pages) == 0) + memcg_kmem_destroy_cache(cachep); } - mutex_unlock(&memcg->slab_caches_mutex); + mutex_unlock(&memcg_slab_mutex); } struct create_work { @@ -3346,7 +3310,10 @@ static void memcg_create_cache_work_func(struct work_struct *w) struct mem_cgroup *memcg = cw->memcg; struct kmem_cache *cachep = cw->cachep; - kmem_cache_create_memcg(memcg, cachep); + mutex_lock(&memcg_slab_mutex); + memcg_kmem_create_cache(memcg, cachep); + mutex_unlock(&memcg_slab_mutex); + css_put(&memcg->css); kfree(cw); } @@ -5022,13 +4989,14 @@ static int __memcg_activate_kmem(struct mem_cgroup *memcg, * Make sure we have enough space for this cgroup in each root cache's * memcg_params. */ + mutex_lock(&memcg_slab_mutex); err = memcg_update_all_caches(memcg_id + 1); + mutex_unlock(&memcg_slab_mutex); if (err) goto out_rmid; memcg->kmemcg_id = memcg_id; INIT_LIST_HEAD(&memcg->memcg_slab_caches); - mutex_init(&memcg->slab_caches_mutex); /* * We couldn't have accounted to this cgroup, because it hasn't got the diff --git a/mm/slab_common.c b/mm/slab_common.c index 2dd920dc3776..7e348cff814d 100644 --- a/mm/slab_common.c +++ b/mm/slab_common.c @@ -160,7 +160,6 @@ do_kmem_cache_create(char *name, size_t object_size, size_t size, size_t align, s->refcount = 1; list_add(&s->list, &slab_caches); - memcg_register_cache(s); out: if (err) return ERR_PTR(err); @@ -270,9 +269,10 @@ EXPORT_SYMBOL(kmem_cache_create); * requests going from @memcg to @root_cache. The new cache inherits properties * from its parent. */ -void kmem_cache_create_memcg(struct mem_cgroup *memcg, struct kmem_cache *root_cache) +struct kmem_cache *kmem_cache_create_memcg(struct mem_cgroup *memcg, + struct kmem_cache *root_cache) { - struct kmem_cache *s; + struct kmem_cache *s = NULL; char *cache_name; get_online_cpus(); @@ -280,14 +280,6 @@ void kmem_cache_create_memcg(struct mem_cgroup *memcg, struct kmem_cache *root_c mutex_lock(&slab_mutex); - /* - * Since per-memcg caches are created asynchronously on first - * allocation (see memcg_kmem_get_cache()), several threads can try to - * create the same cache, but only one of them may succeed. - */ - if (cache_from_memcg_idx(root_cache, memcg_cache_id(memcg))) - goto out_unlock; - cache_name = memcg_create_cache_name(memcg, root_cache); if (!cache_name) goto out_unlock; @@ -296,14 +288,18 @@ void kmem_cache_create_memcg(struct mem_cgroup *memcg, struct kmem_cache *root_c root_cache->size, root_cache->align, root_cache->flags, root_cache->ctor, memcg, root_cache); - if (IS_ERR(s)) + if (IS_ERR(s)) { kfree(cache_name); + s = NULL; + } out_unlock: mutex_unlock(&slab_mutex); put_online_mems(); put_online_cpus(); + + return s; } static int kmem_cache_destroy_memcg_children(struct kmem_cache *s) @@ -348,11 +344,8 @@ void kmem_cache_destroy(struct kmem_cache *s) goto out_unlock; list_del(&s->list); - memcg_unregister_cache(s); - if (__kmem_cache_shutdown(s) != 0) { list_add(&s->list, &slab_caches); - memcg_register_cache(s); printk(KERN_ERR "kmem_cache_destroy %s: " "Slab cache still has objects\n", s->name); dump_stack(); -- cgit v1.2.3 From 073ee1c6cd11cd190f4d0da84d9b4ba79d7b9e70 Mon Sep 17 00:00:00 2001 From: Vladimir Davydov Date: Wed, 4 Jun 2014 16:08:23 -0700 Subject: memcg: get rid of memcg_create_cache_name Instead of calling back to memcontrol.c from kmem_cache_create_memcg in order to just create the name of a per memcg cache, let's allocate it in place. We only need to pass the memcg name to kmem_cache_create_memcg for that - everything else can be done in slab_common.c. Signed-off-by: Vladimir Davydov Acked-by: Michal Hocko Cc: Johannes Weiner Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- include/linux/memcontrol.h | 2 -- include/linux/slab.h | 3 ++- mm/memcontrol.c | 33 +++++++++------------------------ mm/slab_common.c | 7 +++++-- 4 files changed, 16 insertions(+), 29 deletions(-) (limited to 'include/linux/memcontrol.h') diff --git a/include/linux/memcontrol.h b/include/linux/memcontrol.h index 1fa23244fe37..dfc2929a3877 100644 --- a/include/linux/memcontrol.h +++ b/include/linux/memcontrol.h @@ -492,8 +492,6 @@ void __memcg_kmem_uncharge_pages(struct page *page, int order); int memcg_cache_id(struct mem_cgroup *memcg); -char *memcg_create_cache_name(struct mem_cgroup *memcg, - struct kmem_cache *root_cache); int memcg_alloc_cache_params(struct mem_cgroup *memcg, struct kmem_cache *s, struct kmem_cache *root_cache); void memcg_free_cache_params(struct kmem_cache *s); diff --git a/include/linux/slab.h b/include/linux/slab.h index ecbec9ccb80d..86e5b26fbdab 100644 --- a/include/linux/slab.h +++ b/include/linux/slab.h @@ -117,7 +117,8 @@ struct kmem_cache *kmem_cache_create(const char *, size_t, size_t, void (*)(void *)); #ifdef CONFIG_MEMCG_KMEM struct kmem_cache *kmem_cache_create_memcg(struct mem_cgroup *, - struct kmem_cache *); + struct kmem_cache *, + const char *); #endif void kmem_cache_destroy(struct kmem_cache *); int kmem_cache_shrink(struct kmem_cache *); diff --git a/mm/memcontrol.c b/mm/memcontrol.c index 971d7b643f6e..7df7f599e3df 100644 --- a/mm/memcontrol.c +++ b/mm/memcontrol.c @@ -3095,29 +3095,6 @@ int memcg_update_cache_size(struct kmem_cache *s, int num_groups) return 0; } -char *memcg_create_cache_name(struct mem_cgroup *memcg, - struct kmem_cache *root_cache) -{ - static char *buf; - - /* - * We need a mutex here to protect the shared buffer. Since this is - * expected to be called only on cache creation, we can employ the - * slab_mutex for that purpose. - */ - lockdep_assert_held(&slab_mutex); - - if (!buf) { - buf = kmalloc(NAME_MAX + 1, GFP_KERNEL); - if (!buf) - return NULL; - } - - cgroup_name(memcg->css.cgroup, buf, NAME_MAX + 1); - return kasprintf(GFP_KERNEL, "%s(%d:%s)", root_cache->name, - memcg_cache_id(memcg), buf); -} - int memcg_alloc_cache_params(struct mem_cgroup *memcg, struct kmem_cache *s, struct kmem_cache *root_cache) { @@ -3158,6 +3135,7 @@ void memcg_free_cache_params(struct kmem_cache *s) static void memcg_kmem_create_cache(struct mem_cgroup *memcg, struct kmem_cache *root_cache) { + static char *memcg_name_buf; /* protected by memcg_slab_mutex */ struct kmem_cache *cachep; int id; @@ -3173,7 +3151,14 @@ static void memcg_kmem_create_cache(struct mem_cgroup *memcg, if (cache_from_memcg_idx(root_cache, id)) return; - cachep = kmem_cache_create_memcg(memcg, root_cache); + if (!memcg_name_buf) { + memcg_name_buf = kmalloc(NAME_MAX + 1, GFP_KERNEL); + if (!memcg_name_buf) + return; + } + + cgroup_name(memcg->css.cgroup, memcg_name_buf, NAME_MAX + 1); + cachep = kmem_cache_create_memcg(memcg, root_cache, memcg_name_buf); /* * If we could not create a memcg cache, do not complain, because * that's not critical at all as we can always proceed with the root diff --git a/mm/slab_common.c b/mm/slab_common.c index 7e348cff814d..32175617cb75 100644 --- a/mm/slab_common.c +++ b/mm/slab_common.c @@ -264,13 +264,15 @@ EXPORT_SYMBOL(kmem_cache_create); * kmem_cache_create_memcg - Create a cache for a memory cgroup. * @memcg: The memory cgroup the new cache is for. * @root_cache: The parent of the new cache. + * @memcg_name: The name of the memory cgroup (used for naming the new cache). * * This function attempts to create a kmem cache that will serve allocation * requests going from @memcg to @root_cache. The new cache inherits properties * from its parent. */ struct kmem_cache *kmem_cache_create_memcg(struct mem_cgroup *memcg, - struct kmem_cache *root_cache) + struct kmem_cache *root_cache, + const char *memcg_name) { struct kmem_cache *s = NULL; char *cache_name; @@ -280,7 +282,8 @@ struct kmem_cache *kmem_cache_create_memcg(struct mem_cgroup *memcg, mutex_lock(&slab_mutex); - cache_name = memcg_create_cache_name(memcg, root_cache); + cache_name = kasprintf(GFP_KERNEL, "%s(%d:%s)", root_cache->name, + memcg_cache_id(memcg), memcg_name); if (!cache_name) goto out_unlock; -- cgit v1.2.3 From 776ed0f0377914d1e65fed903c052e9eef3f4cc3 Mon Sep 17 00:00:00 2001 From: Vladimir Davydov Date: Wed, 4 Jun 2014 16:10:02 -0700 Subject: memcg: cleanup kmem cache creation/destruction functions naming Current names are rather inconsistent. Let's try to improve them. Brief change log: ** old name ** ** new name ** kmem_cache_create_memcg memcg_create_kmem_cache memcg_kmem_create_cache memcg_regsiter_cache memcg_kmem_destroy_cache memcg_unregister_cache kmem_cache_destroy_memcg_children memcg_cleanup_cache_params mem_cgroup_destroy_all_caches memcg_unregister_all_caches create_work memcg_register_cache_work memcg_create_cache_work_func memcg_register_cache_func memcg_create_cache_enqueue memcg_schedule_register_cache Signed-off-by: Vladimir Davydov Acked-by: Michal Hocko Cc: Johannes Weiner Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- include/linux/memcontrol.h | 2 +- include/linux/slab.h | 2 +- mm/memcontrol.c | 60 ++++++++++++++++++++++------------------------ mm/slab_common.c | 12 +++++----- 4 files changed, 36 insertions(+), 40 deletions(-) (limited to 'include/linux/memcontrol.h') diff --git a/include/linux/memcontrol.h b/include/linux/memcontrol.h index dfc2929a3877..eb65d29516ca 100644 --- a/include/linux/memcontrol.h +++ b/include/linux/memcontrol.h @@ -505,7 +505,7 @@ __memcg_kmem_get_cache(struct kmem_cache *cachep, gfp_t gfp); int __memcg_charge_slab(struct kmem_cache *cachep, gfp_t gfp, int order); void __memcg_uncharge_slab(struct kmem_cache *cachep, int order); -int __kmem_cache_destroy_memcg_children(struct kmem_cache *s); +int __memcg_cleanup_cache_params(struct kmem_cache *s); /** * memcg_kmem_newpage_charge: verify if a new kmem allocation is allowed. diff --git a/include/linux/slab.h b/include/linux/slab.h index 86e5b26fbdab..1d9abb7d22a0 100644 --- a/include/linux/slab.h +++ b/include/linux/slab.h @@ -116,7 +116,7 @@ struct kmem_cache *kmem_cache_create(const char *, size_t, size_t, unsigned long, void (*)(void *)); #ifdef CONFIG_MEMCG_KMEM -struct kmem_cache *kmem_cache_create_memcg(struct mem_cgroup *, +struct kmem_cache *memcg_create_kmem_cache(struct mem_cgroup *, struct kmem_cache *, const char *); #endif diff --git a/mm/memcontrol.c b/mm/memcontrol.c index 5e2bfcc96da9..d176edb1d5e8 100644 --- a/mm/memcontrol.c +++ b/mm/memcontrol.c @@ -3132,8 +3132,8 @@ void memcg_free_cache_params(struct kmem_cache *s) kfree(s->memcg_params); } -static void memcg_kmem_create_cache(struct mem_cgroup *memcg, - struct kmem_cache *root_cache) +static void memcg_register_cache(struct mem_cgroup *memcg, + struct kmem_cache *root_cache) { static char memcg_name_buf[NAME_MAX + 1]; /* protected by memcg_slab_mutex */ @@ -3153,7 +3153,7 @@ static void memcg_kmem_create_cache(struct mem_cgroup *memcg, return; cgroup_name(memcg->css.cgroup, memcg_name_buf, NAME_MAX + 1); - cachep = kmem_cache_create_memcg(memcg, root_cache, memcg_name_buf); + cachep = memcg_create_kmem_cache(memcg, root_cache, memcg_name_buf); /* * If we could not create a memcg cache, do not complain, because * that's not critical at all as we can always proceed with the root @@ -3175,7 +3175,7 @@ static void memcg_kmem_create_cache(struct mem_cgroup *memcg, root_cache->memcg_params->memcg_caches[id] = cachep; } -static void memcg_kmem_destroy_cache(struct kmem_cache *cachep) +static void memcg_unregister_cache(struct kmem_cache *cachep) { struct kmem_cache *root_cache; struct mem_cgroup *memcg; @@ -3228,7 +3228,7 @@ static inline void memcg_resume_kmem_account(void) current->memcg_kmem_skip_account--; } -int __kmem_cache_destroy_memcg_children(struct kmem_cache *s) +int __memcg_cleanup_cache_params(struct kmem_cache *s) { struct kmem_cache *c; int i, failed = 0; @@ -3239,7 +3239,7 @@ int __kmem_cache_destroy_memcg_children(struct kmem_cache *s) if (!c) continue; - memcg_kmem_destroy_cache(c); + memcg_unregister_cache(c); if (cache_from_memcg_idx(s, i)) failed++; @@ -3248,7 +3248,7 @@ int __kmem_cache_destroy_memcg_children(struct kmem_cache *s) return failed; } -static void mem_cgroup_destroy_all_caches(struct mem_cgroup *memcg) +static void memcg_unregister_all_caches(struct mem_cgroup *memcg) { struct kmem_cache *cachep; struct memcg_cache_params *params, *tmp; @@ -3261,25 +3261,26 @@ static void mem_cgroup_destroy_all_caches(struct mem_cgroup *memcg) cachep = memcg_params_to_cache(params); kmem_cache_shrink(cachep); if (atomic_read(&cachep->memcg_params->nr_pages) == 0) - memcg_kmem_destroy_cache(cachep); + memcg_unregister_cache(cachep); } mutex_unlock(&memcg_slab_mutex); } -struct create_work { +struct memcg_register_cache_work { struct mem_cgroup *memcg; struct kmem_cache *cachep; struct work_struct work; }; -static void memcg_create_cache_work_func(struct work_struct *w) +static void memcg_register_cache_func(struct work_struct *w) { - struct create_work *cw = container_of(w, struct create_work, work); + struct memcg_register_cache_work *cw = + container_of(w, struct memcg_register_cache_work, work); struct mem_cgroup *memcg = cw->memcg; struct kmem_cache *cachep = cw->cachep; mutex_lock(&memcg_slab_mutex); - memcg_kmem_create_cache(memcg, cachep); + memcg_register_cache(memcg, cachep); mutex_unlock(&memcg_slab_mutex); css_put(&memcg->css); @@ -3289,12 +3290,12 @@ static void memcg_create_cache_work_func(struct work_struct *w) /* * Enqueue the creation of a per-memcg kmem_cache. */ -static void __memcg_create_cache_enqueue(struct mem_cgroup *memcg, - struct kmem_cache *cachep) +static void __memcg_schedule_register_cache(struct mem_cgroup *memcg, + struct kmem_cache *cachep) { - struct create_work *cw; + struct memcg_register_cache_work *cw; - cw = kmalloc(sizeof(struct create_work), GFP_NOWAIT); + cw = kmalloc(sizeof(*cw), GFP_NOWAIT); if (cw == NULL) { css_put(&memcg->css); return; @@ -3303,17 +3304,17 @@ static void __memcg_create_cache_enqueue(struct mem_cgroup *memcg, cw->memcg = memcg; cw->cachep = cachep; - INIT_WORK(&cw->work, memcg_create_cache_work_func); + INIT_WORK(&cw->work, memcg_register_cache_func); schedule_work(&cw->work); } -static void memcg_create_cache_enqueue(struct mem_cgroup *memcg, - struct kmem_cache *cachep) +static void memcg_schedule_register_cache(struct mem_cgroup *memcg, + struct kmem_cache *cachep) { /* * We need to stop accounting when we kmalloc, because if the * corresponding kmalloc cache is not yet created, the first allocation - * in __memcg_create_cache_enqueue will recurse. + * in __memcg_schedule_register_cache will recurse. * * However, it is better to enclose the whole function. Depending on * the debugging options enabled, INIT_WORK(), for instance, can @@ -3322,7 +3323,7 @@ static void memcg_create_cache_enqueue(struct mem_cgroup *memcg, * the safest choice is to do it like this, wrapping the whole function. */ memcg_stop_kmem_account(); - __memcg_create_cache_enqueue(memcg, cachep); + __memcg_schedule_register_cache(memcg, cachep); memcg_resume_kmem_account(); } @@ -3393,16 +3394,11 @@ struct kmem_cache *__memcg_kmem_get_cache(struct kmem_cache *cachep, * * However, there are some clashes that can arrive from locking. * For instance, because we acquire the slab_mutex while doing - * kmem_cache_dup, this means no further allocation could happen - * with the slab_mutex held. - * - * Also, because cache creation issue get_online_cpus(), this - * creates a lock chain: memcg_slab_mutex -> cpu_hotplug_mutex, - * that ends up reversed during cpu hotplug. (cpuset allocates - * a bunch of GFP_KERNEL memory during cpuup). Due to all that, - * better to defer everything. + * memcg_create_kmem_cache, this means no further allocation + * could happen with the slab_mutex held. So it's better to + * defer everything. */ - memcg_create_cache_enqueue(memcg, cachep); + memcg_schedule_register_cache(memcg, cachep); return cachep; out: rcu_read_unlock(); @@ -3526,7 +3522,7 @@ void __memcg_kmem_uncharge_pages(struct page *page, int order) memcg_uncharge_kmem(memcg, PAGE_SIZE << order); } #else -static inline void mem_cgroup_destroy_all_caches(struct mem_cgroup *memcg) +static inline void memcg_unregister_all_caches(struct mem_cgroup *memcg) { } #endif /* CONFIG_MEMCG_KMEM */ @@ -6372,7 +6368,7 @@ static void mem_cgroup_css_offline(struct cgroup_subsys_state *css) css_for_each_descendant_post(iter, css) mem_cgroup_reparent_charges(mem_cgroup_from_css(iter)); - mem_cgroup_destroy_all_caches(memcg); + memcg_unregister_all_caches(memcg); vmpressure_cleanup(&memcg->vmpressure); } diff --git a/mm/slab_common.c b/mm/slab_common.c index 32175617cb75..48fafb61f35e 100644 --- a/mm/slab_common.c +++ b/mm/slab_common.c @@ -261,7 +261,7 @@ EXPORT_SYMBOL(kmem_cache_create); #ifdef CONFIG_MEMCG_KMEM /* - * kmem_cache_create_memcg - Create a cache for a memory cgroup. + * memcg_create_kmem_cache - Create a cache for a memory cgroup. * @memcg: The memory cgroup the new cache is for. * @root_cache: The parent of the new cache. * @memcg_name: The name of the memory cgroup (used for naming the new cache). @@ -270,7 +270,7 @@ EXPORT_SYMBOL(kmem_cache_create); * requests going from @memcg to @root_cache. The new cache inherits properties * from its parent. */ -struct kmem_cache *kmem_cache_create_memcg(struct mem_cgroup *memcg, +struct kmem_cache *memcg_create_kmem_cache(struct mem_cgroup *memcg, struct kmem_cache *root_cache, const char *memcg_name) { @@ -305,7 +305,7 @@ out_unlock: return s; } -static int kmem_cache_destroy_memcg_children(struct kmem_cache *s) +static int memcg_cleanup_cache_params(struct kmem_cache *s) { int rc; @@ -314,13 +314,13 @@ static int kmem_cache_destroy_memcg_children(struct kmem_cache *s) return 0; mutex_unlock(&slab_mutex); - rc = __kmem_cache_destroy_memcg_children(s); + rc = __memcg_cleanup_cache_params(s); mutex_lock(&slab_mutex); return rc; } #else -static int kmem_cache_destroy_memcg_children(struct kmem_cache *s) +static int memcg_cleanup_cache_params(struct kmem_cache *s) { return 0; } @@ -343,7 +343,7 @@ void kmem_cache_destroy(struct kmem_cache *s) if (s->refcount) goto out_unlock; - if (kmem_cache_destroy_memcg_children(s) != 0) + if (memcg_cleanup_cache_params(s) != 0) goto out_unlock; list_del(&s->list); -- cgit v1.2.3