diff options
Diffstat (limited to 'mm/memcontrol.c')
-rw-r--r-- | mm/memcontrol.c | 525 |
1 files changed, 312 insertions, 213 deletions
diff --git a/mm/memcontrol.c b/mm/memcontrol.c index 9acfb165eb52..0eda67376df4 100644 --- a/mm/memcontrol.c +++ b/mm/memcontrol.c @@ -76,9 +76,12 @@ struct cgroup_subsys memory_cgrp_subsys __read_mostly; EXPORT_SYMBOL(memory_cgrp_subsys); +struct mem_cgroup *root_mem_cgroup __read_mostly; + #define MEM_CGROUP_RECLAIM_RETRIES 5 -static struct mem_cgroup *root_mem_cgroup __read_mostly; -struct cgroup_subsys_state *mem_cgroup_root_css __read_mostly; + +/* Socket memory accounting disabled? */ +static bool cgroup_memory_nosocket; /* Whether the swap controller is active */ #ifdef CONFIG_MEMCG_SWAP @@ -87,6 +90,12 @@ int do_swap_account __read_mostly; #define do_swap_account 0 #endif +/* Whether legacy memory+swap accounting is active */ +static bool do_memsw_account(void) +{ + return !cgroup_subsys_on_dfl(memory_cgrp_subsys) && do_swap_account; +} + static const char * const mem_cgroup_stat_names[] = { "cache", "rss", @@ -288,64 +297,6 @@ static inline struct mem_cgroup *mem_cgroup_from_id(unsigned short id) return mem_cgroup_from_css(css); } -/* Writing them here to avoid exposing memcg's inner layout */ -#if defined(CONFIG_INET) && defined(CONFIG_MEMCG_KMEM) - -void sock_update_memcg(struct sock *sk) -{ - if (mem_cgroup_sockets_enabled) { - struct mem_cgroup *memcg; - struct cg_proto *cg_proto; - - BUG_ON(!sk->sk_prot->proto_cgroup); - - /* Socket cloning can throw us here with sk_cgrp already - * filled. It won't however, necessarily happen from - * process context. So the test for root memcg given - * the current task's memcg won't help us in this case. - * - * Respecting the original socket's memcg is a better - * decision in this case. - */ - if (sk->sk_cgrp) { - BUG_ON(mem_cgroup_is_root(sk->sk_cgrp->memcg)); - css_get(&sk->sk_cgrp->memcg->css); - return; - } - - rcu_read_lock(); - memcg = mem_cgroup_from_task(current); - cg_proto = sk->sk_prot->proto_cgroup(memcg); - if (cg_proto && test_bit(MEMCG_SOCK_ACTIVE, &cg_proto->flags) && - css_tryget_online(&memcg->css)) { - sk->sk_cgrp = cg_proto; - } - rcu_read_unlock(); - } -} -EXPORT_SYMBOL(sock_update_memcg); - -void sock_release_memcg(struct sock *sk) -{ - if (mem_cgroup_sockets_enabled && sk->sk_cgrp) { - struct mem_cgroup *memcg; - WARN_ON(!sk->sk_cgrp->memcg); - memcg = sk->sk_cgrp->memcg; - css_put(&sk->sk_cgrp->memcg->css); - } -} - -struct cg_proto *tcp_proto_cgroup(struct mem_cgroup *memcg) -{ - if (!memcg || mem_cgroup_is_root(memcg)) - return NULL; - - return &memcg->tcp_mem; -} -EXPORT_SYMBOL(tcp_proto_cgroup); - -#endif - #ifdef CONFIG_MEMCG_KMEM /* * This will be the memcg's index in each cache's ->memcg_params.memcg_caches. @@ -395,7 +346,7 @@ void memcg_put_cache_ids(void) * conditional to this static branch, we'll have to allow modules that does * kmem_cache_alloc and the such to see this symbol as well */ -struct static_key memcg_kmem_enabled_key; +DEFINE_STATIC_KEY_FALSE(memcg_kmem_enabled_key); EXPORT_SYMBOL(memcg_kmem_enabled_key); #endif /* CONFIG_MEMCG_KMEM */ @@ -431,14 +382,11 @@ struct cgroup_subsys_state *mem_cgroup_css_from_page(struct page *page) { struct mem_cgroup *memcg; - rcu_read_lock(); - memcg = page->mem_cgroup; if (!memcg || !cgroup_subsys_on_dfl(memory_cgrp_subsys)) memcg = root_mem_cgroup; - rcu_read_unlock(); return &memcg->css; } @@ -696,7 +644,7 @@ static unsigned long mem_cgroup_read_events(struct mem_cgroup *memcg, static void mem_cgroup_charge_statistics(struct mem_cgroup *memcg, struct page *page, - int nr_pages) + bool compound, int nr_pages) { /* * Here, RSS means 'mapped anon' and anon's SwapCache. Shmem/tmpfs is @@ -709,9 +657,11 @@ static void mem_cgroup_charge_statistics(struct mem_cgroup *memcg, __this_cpu_add(memcg->stat->count[MEM_CGROUP_STAT_CACHE], nr_pages); - if (PageTransHuge(page)) + if (compound) { + VM_BUG_ON_PAGE(!PageTransHuge(page), page); __this_cpu_add(memcg->stat->count[MEM_CGROUP_STAT_RSS_HUGE], nr_pages); + } /* pagein of a big page is an event. So, ignore page size */ if (nr_pages > 0) @@ -903,14 +853,20 @@ struct mem_cgroup *mem_cgroup_iter(struct mem_cgroup *root, if (prev && reclaim->generation != iter->generation) goto out_unlock; - do { + while (1) { pos = READ_ONCE(iter->position); + if (!pos || css_tryget(&pos->css)) + break; /* - * A racing update may change the position and - * put the last reference, hence css_tryget(), - * or retry to see the updated position. + * css reference reached zero, so iter->position will + * be cleared by ->css_released. However, we should not + * rely on this happening soon, because ->css_released + * is called from a work queue, and by busy-waiting we + * might block it. So we clear iter->position right + * away. */ - } while (pos && !css_tryget(&pos->css)); + (void)cmpxchg(&iter->position, pos, NULL); + } } if (pos) @@ -956,17 +912,13 @@ struct mem_cgroup *mem_cgroup_iter(struct mem_cgroup *root, } if (reclaim) { - if (cmpxchg(&iter->position, pos, memcg) == pos) { - if (memcg) - css_get(&memcg->css); - if (pos) - css_put(&pos->css); - } - /* - * pairs with css_tryget when dereferencing iter->position - * above. + * The position could have already been updated by a competing + * thread, so check that the value hasn't changed since we read + * it to avoid reclaiming from the same cgroup twice. */ + (void)cmpxchg(&iter->position, pos, memcg); + if (pos) css_put(&pos->css); @@ -999,6 +951,28 @@ void mem_cgroup_iter_break(struct mem_cgroup *root, css_put(&prev->css); } +static void invalidate_reclaim_iterators(struct mem_cgroup *dead_memcg) +{ + struct mem_cgroup *memcg = dead_memcg; + struct mem_cgroup_reclaim_iter *iter; + struct mem_cgroup_per_zone *mz; + int nid, zid; + int i; + + while ((memcg = parent_mem_cgroup(memcg))) { + for_each_node(nid) { + for (zid = 0; zid < MAX_NR_ZONES; zid++) { + mz = &memcg->nodeinfo[nid]->zoneinfo[zid]; + for (i = 0; i <= DEF_PRIORITY; i++) { + iter = &mz->iter[i]; + cmpxchg(&iter->position, + dead_memcg, NULL); + } + } + } + } +} + /* * Iteration constructs for visiting all cgroups (under a tree). If * loops are exited prematurely (break), mem_cgroup_iter_break() must @@ -1138,9 +1112,6 @@ bool task_in_mem_cgroup(struct task_struct *task, struct mem_cgroup *memcg) return ret; } -#define mem_cgroup_from_counter(counter, member) \ - container_of(counter, struct mem_cgroup, member) - /** * mem_cgroup_margin - calculate chargeable space of a memory cgroup * @memcg: the memory cgroup @@ -1159,7 +1130,7 @@ static unsigned long mem_cgroup_margin(struct mem_cgroup *memcg) if (count < limit) margin = limit - count; - if (do_swap_account) { + if (do_memsw_account()) { count = page_counter_read(&memcg->memsw); limit = READ_ONCE(memcg->memsw.limit); if (count <= limit) @@ -1262,7 +1233,7 @@ void mem_cgroup_print_oom_info(struct mem_cgroup *memcg, struct task_struct *p) pr_cont(":"); for (i = 0; i < MEM_CGROUP_STAT_NSTATS; i++) { - if (i == MEM_CGROUP_STAT_SWAP && !do_swap_account) + if (i == MEM_CGROUP_STAT_SWAP && !do_memsw_account()) continue; pr_cont(" %s:%luKB", mem_cgroup_stat_names[i], K(mem_cgroup_read_stat(iter, i))); @@ -1885,7 +1856,7 @@ static void drain_stock(struct memcg_stock_pcp *stock) if (stock->nr_pages) { page_counter_uncharge(&old->memory, stock->nr_pages); - if (do_swap_account) + if (do_memsw_account()) page_counter_uncharge(&old->memsw, stock->nr_pages); css_put_many(&old->css, stock->nr_pages); stock->nr_pages = 0; @@ -1973,6 +1944,26 @@ static int memcg_cpu_hotplug_callback(struct notifier_block *nb, return NOTIFY_OK; } +static void reclaim_high(struct mem_cgroup *memcg, + unsigned int nr_pages, + gfp_t gfp_mask) +{ + do { + if (page_counter_read(&memcg->memory) <= memcg->high) + continue; + mem_cgroup_events(memcg, MEMCG_HIGH, 1); + try_to_free_mem_cgroup_pages(memcg, nr_pages, gfp_mask, true); + } while ((memcg = parent_mem_cgroup(memcg))); +} + +static void high_work_func(struct work_struct *work) +{ + struct mem_cgroup *memcg; + + memcg = container_of(work, struct mem_cgroup, high_work); + reclaim_high(memcg, CHARGE_BATCH, GFP_KERNEL); +} + /* * Scheduled by try_charge() to be executed from the userland return path * and reclaims memory over the high limit. @@ -1980,20 +1971,13 @@ static int memcg_cpu_hotplug_callback(struct notifier_block *nb, void mem_cgroup_handle_over_high(void) { unsigned int nr_pages = current->memcg_nr_pages_over_high; - struct mem_cgroup *memcg, *pos; + struct mem_cgroup *memcg; if (likely(!nr_pages)) return; - pos = memcg = get_mem_cgroup_from_mm(current->mm); - - do { - if (page_counter_read(&pos->memory) <= pos->high) - continue; - mem_cgroup_events(pos, MEMCG_HIGH, 1); - try_to_free_mem_cgroup_pages(pos, nr_pages, GFP_KERNEL, true); - } while ((pos = parent_mem_cgroup(pos))); - + memcg = get_mem_cgroup_from_mm(current->mm); + reclaim_high(memcg, nr_pages, GFP_KERNEL); css_put(&memcg->css); current->memcg_nr_pages_over_high = 0; } @@ -2015,11 +1999,11 @@ retry: if (consume_stock(memcg, nr_pages)) return 0; - if (!do_swap_account || + if (!do_memsw_account() || page_counter_try_charge(&memcg->memsw, batch, &counter)) { if (page_counter_try_charge(&memcg->memory, batch, &counter)) goto done_restock; - if (do_swap_account) + if (do_memsw_account()) page_counter_uncharge(&memcg->memsw, batch); mem_over_limit = mem_cgroup_from_counter(counter, memory); } else { @@ -2106,7 +2090,7 @@ force: * temporarily by force charging it. */ page_counter_charge(&memcg->memory, nr_pages); - if (do_swap_account) + if (do_memsw_account()) page_counter_charge(&memcg->memsw, nr_pages); css_get_many(&memcg->css, nr_pages); @@ -2128,7 +2112,12 @@ done_restock: */ do { if (page_counter_read(&memcg->memory) > memcg->high) { - current->memcg_nr_pages_over_high += nr_pages; + /* Don't bother a random interrupted task */ + if (in_interrupt()) { + schedule_work(&memcg->high_work); + break; + } + current->memcg_nr_pages_over_high += batch; set_notify_resume(current); break; } @@ -2143,7 +2132,7 @@ static void cancel_charge(struct mem_cgroup *memcg, unsigned int nr_pages) return; page_counter_uncharge(&memcg->memory, nr_pages); - if (do_swap_account) + if (do_memsw_account()) page_counter_uncharge(&memcg->memsw, nr_pages); css_put_many(&memcg->css, nr_pages); @@ -2332,7 +2321,7 @@ static void memcg_schedule_kmem_cache_create(struct mem_cgroup *memcg, * Can't be called in interrupt context or from kernel threads. * This function needs to be called with rcu_read_lock() held. */ -struct kmem_cache *__memcg_kmem_get_cache(struct kmem_cache *cachep) +struct kmem_cache *__memcg_kmem_get_cache(struct kmem_cache *cachep, gfp_t gfp) { struct mem_cgroup *memcg; struct kmem_cache *memcg_cachep; @@ -2340,6 +2329,12 @@ struct kmem_cache *__memcg_kmem_get_cache(struct kmem_cache *cachep) VM_BUG_ON(!is_root_cache(cachep)); + if (cachep->flags & SLAB_ACCOUNT) + gfp |= __GFP_ACCOUNT; + + if (!(gfp & __GFP_ACCOUNT)) + return cachep; + if (current->memcg_kmem_skip_account) return cachep; @@ -2423,7 +2418,7 @@ void __memcg_kmem_uncharge(struct page *page, int order) page_counter_uncharge(&memcg->kmem, nr_pages); page_counter_uncharge(&memcg->memory, nr_pages); - if (do_swap_account) + if (do_memsw_account()) page_counter_uncharge(&memcg->memsw, nr_pages); page->mem_cgroup = NULL; @@ -2435,9 +2430,7 @@ void __memcg_kmem_uncharge(struct page *page, int order) /* * Because tail pages are not marked as "used", set it. We're under - * zone->lru_lock, 'splitting on pmd' and compound_lock. - * charge/uncharge will be never happen and move_account() is done under - * compound_lock(), so we don't have to take care of races. + * zone->lru_lock and migration entries setup in all page mappings. */ void mem_cgroup_split_huge_fixup(struct page *head) { @@ -2911,7 +2904,7 @@ static int memcg_activate_kmem(struct mem_cgroup *memcg, err = page_counter_limit(&memcg->kmem, nr_pages); VM_BUG_ON(err); - static_key_slow_inc(&memcg_kmem_enabled_key); + static_branch_inc(&memcg_kmem_enabled_key); /* * A memory cgroup is considered kmem-active as soon as it gets * kmemcg_id. Setting the id after enabling static branching will @@ -3138,7 +3131,7 @@ static int memcg_stat_show(struct seq_file *m, void *v) BUILD_BUG_ON(ARRAY_SIZE(mem_cgroup_lru_names) != NR_LRU_LISTS); for (i = 0; i < MEM_CGROUP_STAT_NSTATS; i++) { - if (i == MEM_CGROUP_STAT_SWAP && !do_swap_account) + if (i == MEM_CGROUP_STAT_SWAP && !do_memsw_account()) continue; seq_printf(m, "%s %lu\n", mem_cgroup_stat_names[i], mem_cgroup_read_stat(memcg, i) * PAGE_SIZE); @@ -3160,14 +3153,14 @@ static int memcg_stat_show(struct seq_file *m, void *v) } seq_printf(m, "hierarchical_memory_limit %llu\n", (u64)memory * PAGE_SIZE); - if (do_swap_account) + if (do_memsw_account()) seq_printf(m, "hierarchical_memsw_limit %llu\n", (u64)memsw * PAGE_SIZE); for (i = 0; i < MEM_CGROUP_STAT_NSTATS; i++) { unsigned long long val = 0; - if (i == MEM_CGROUP_STAT_SWAP && !do_swap_account) + if (i == MEM_CGROUP_STAT_SWAP && !do_memsw_account()) continue; for_each_mem_cgroup_tree(mi, memcg) val += mem_cgroup_read_stat(mi, i) * PAGE_SIZE; @@ -3298,7 +3291,7 @@ static void mem_cgroup_threshold(struct mem_cgroup *memcg) { while (memcg) { __mem_cgroup_threshold(memcg, false); - if (do_swap_account) + if (do_memsw_account()) __mem_cgroup_threshold(memcg, true); memcg = parent_mem_cgroup(memcg); @@ -3498,16 +3491,17 @@ static void __mem_cgroup_usage_unregister_event(struct mem_cgroup *memcg, swap_buffers: /* Swap primary and spare array */ thresholds->spare = thresholds->primary; - /* If all events are unregistered, free the spare array */ - if (!new) { - kfree(thresholds->spare); - thresholds->spare = NULL; - } rcu_assign_pointer(thresholds->primary, new); /* To be sure that nobody uses thresholds */ synchronize_rcu(); + + /* If all events are unregistered, free the spare array */ + if (!new) { + kfree(thresholds->spare); + thresholds->spare = NULL; + } unlock: mutex_unlock(&memcg->thresholds_lock); } @@ -3597,7 +3591,7 @@ static int memcg_init_kmem(struct mem_cgroup *memcg, struct cgroup_subsys *ss) if (ret) return ret; - return mem_cgroup_sockets_init(memcg, ss); + return tcp_init_cgroup(memcg, ss); } static void memcg_deactivate_kmem(struct mem_cgroup *memcg) @@ -3650,10 +3644,10 @@ static void memcg_destroy_kmem(struct mem_cgroup *memcg) { if (memcg->kmem_acct_activated) { memcg_destroy_kmem_caches(memcg); - static_key_slow_dec(&memcg_kmem_enabled_key); + static_branch_dec(&memcg_kmem_enabled_key); WARN_ON(page_counter_read(&memcg->kmem)); } - mem_cgroup_sockets_destroy(memcg); + tcp_destroy_cgroup(memcg); } #else static int memcg_init_kmem(struct mem_cgroup *memcg, struct cgroup_subsys *ss) @@ -4172,6 +4166,8 @@ static void __mem_cgroup_free(struct mem_cgroup *memcg) { int node; + cancel_work_sync(&memcg->high_work); + mem_cgroup_remove_from_trees(memcg); for_each_node(node) @@ -4182,17 +4178,6 @@ static void __mem_cgroup_free(struct mem_cgroup *memcg) kfree(memcg); } -/* - * Returns the parent mem_cgroup in memcgroup hierarchy with hierarchy enabled. - */ -struct mem_cgroup *parent_mem_cgroup(struct mem_cgroup *memcg) -{ - if (!memcg->memory.parent) - return NULL; - return mem_cgroup_from_counter(memcg->memory.parent, memory); -} -EXPORT_SYMBOL(parent_mem_cgroup); - static struct cgroup_subsys_state * __ref mem_cgroup_css_alloc(struct cgroup_subsys_state *parent_css) { @@ -4211,7 +4196,6 @@ mem_cgroup_css_alloc(struct cgroup_subsys_state *parent_css) /* root ? */ if (parent_css == NULL) { root_mem_cgroup = memcg; - mem_cgroup_root_css = &memcg->css; page_counter_init(&memcg->memory, NULL); memcg->high = PAGE_COUNTER_MAX; memcg->soft_limit = PAGE_COUNTER_MAX; @@ -4219,6 +4203,7 @@ mem_cgroup_css_alloc(struct cgroup_subsys_state *parent_css) page_counter_init(&memcg->kmem, NULL); } + INIT_WORK(&memcg->high_work, high_work_func); memcg->last_scanned_node = MAX_NUMNODES; INIT_LIST_HEAD(&memcg->oom_notify); memcg->move_charge_at_immigrate = 0; @@ -4233,6 +4218,9 @@ mem_cgroup_css_alloc(struct cgroup_subsys_state *parent_css) #ifdef CONFIG_CGROUP_WRITEBACK INIT_LIST_HEAD(&memcg->cgwb_list); #endif +#ifdef CONFIG_INET + memcg->socket_pressure = jiffies; +#endif return &memcg->css; free_out: @@ -4290,6 +4278,11 @@ mem_cgroup_css_online(struct cgroup_subsys_state *css) if (ret) return ret; +#ifdef CONFIG_INET + if (cgroup_subsys_on_dfl(memory_cgrp_subsys) && !cgroup_memory_nosocket) + static_branch_inc(&memcg_sockets_enabled_key); +#endif + /* * Make sure the memcg is initialized: mem_cgroup_iter() * orders reading memcg->initialized against its callers @@ -4324,11 +4317,22 @@ static void mem_cgroup_css_offline(struct cgroup_subsys_state *css) wb_memcg_offline(memcg); } +static void mem_cgroup_css_released(struct cgroup_subsys_state *css) +{ + struct mem_cgroup *memcg = mem_cgroup_from_css(css); + + invalidate_reclaim_iterators(memcg); +} + static void mem_cgroup_css_free(struct cgroup_subsys_state *css) { struct mem_cgroup *memcg = mem_cgroup_from_css(css); memcg_destroy_kmem(memcg); +#ifdef CONFIG_INET + if (cgroup_subsys_on_dfl(memory_cgrp_subsys) && !cgroup_memory_nosocket) + static_branch_dec(&memcg_sockets_enabled_key); +#endif __mem_cgroup_free(memcg); } @@ -4445,7 +4449,7 @@ static struct page *mc_handle_swap_pte(struct vm_area_struct *vma, * we call find_get_page() with swapper_space directly. */ page = find_get_page(swap_address_space(ent), ent.val); - if (do_swap_account) + if (do_memsw_account()) entry->val = ent.val; return page; @@ -4480,7 +4484,7 @@ static struct page *mc_handle_file_pte(struct vm_area_struct *vma, page = find_get_entry(mapping, pgoff); if (radix_tree_exceptional_entry(page)) { swp_entry_t swp = radix_to_swp_entry(page); - if (do_swap_account) + if (do_memsw_account()) *entry = swp; page = find_get_page(swap_address_space(swp), swp.val); } @@ -4499,38 +4503,30 @@ static struct page *mc_handle_file_pte(struct vm_area_struct *vma, * @from: mem_cgroup which the page is moved from. * @to: mem_cgroup which the page is moved to. @from != @to. * - * The caller must confirm following. - * - page is not on LRU (isolate_page() is useful.) - * - compound_lock is held when nr_pages > 1 + * The caller must make sure the page is not on LRU (isolate_page() is useful.) * * This function doesn't do "charge" to new cgroup and doesn't do "uncharge" * from old cgroup. */ static int mem_cgroup_move_account(struct page *page, - unsigned int nr_pages, + bool compound, struct mem_cgroup *from, struct mem_cgroup *to) { unsigned long flags; + unsigned int nr_pages = compound ? hpage_nr_pages(page) : 1; int ret; bool anon; VM_BUG_ON(from == to); VM_BUG_ON_PAGE(PageLRU(page), page); - /* - * The page is isolated from LRU. So, collapse function - * will not handle this page. But page splitting can happen. - * Do this check under compound_page_lock(). The caller should - * hold it. - */ - ret = -EBUSY; - if (nr_pages > 1 && !PageTransHuge(page)) - goto out; + VM_BUG_ON(compound && !PageTransHuge(page)); /* * Prevent mem_cgroup_replace_page() from looking at * page->mem_cgroup of its source page while we change it. */ + ret = -EBUSY; if (!trylock_page(page)) goto out; @@ -4585,9 +4581,9 @@ static int mem_cgroup_move_account(struct page *page, ret = 0; local_irq_disable(); - mem_cgroup_charge_statistics(to, page, nr_pages); + mem_cgroup_charge_statistics(to, page, compound, nr_pages); memcg_check_events(to, page); - mem_cgroup_charge_statistics(from, page, -nr_pages); + mem_cgroup_charge_statistics(from, page, compound, -nr_pages); memcg_check_events(from, page); local_irq_enable(); out_unlock: @@ -4677,7 +4673,7 @@ static int mem_cgroup_count_precharge_pte_range(pmd_t *pmd, pte_t *pte; spinlock_t *ptl; - if (pmd_trans_huge_lock(pmd, vma, &ptl) == 1) { + if (pmd_trans_huge_lock(pmd, vma, &ptl)) { if (get_mctgt_type_thp(vma, addr, *pmd, NULL) == MC_TARGET_PAGE) mc.precharge += HPAGE_PMD_NR; spin_unlock(ptl); @@ -4779,23 +4775,18 @@ static void mem_cgroup_clear_mc(void) spin_unlock(&mc.lock); } -static int mem_cgroup_can_attach(struct cgroup_subsys_state *css, - struct cgroup_taskset *tset) +static int mem_cgroup_can_attach(struct cgroup_taskset *tset) { - struct mem_cgroup *memcg = mem_cgroup_from_css(css); + struct cgroup_subsys_state *css; + struct mem_cgroup *memcg = NULL; /* unneeded init to make gcc happy */ struct mem_cgroup *from; struct task_struct *leader, *p; struct mm_struct *mm; unsigned long move_flags; int ret = 0; - /* - * We are now commited to this value whatever it is. Changes in this - * tunable will only affect upcoming migrations, not the current one. - * So we need to save it, and keep it going. - */ - move_flags = READ_ONCE(memcg->move_charge_at_immigrate); - if (!move_flags) + /* charge immigration isn't supported on the default hierarchy */ + if (cgroup_subsys_on_dfl(memory_cgrp_subsys)) return 0; /* @@ -4805,13 +4796,23 @@ static int mem_cgroup_can_attach(struct cgroup_subsys_state *css, * multiple. */ p = NULL; - cgroup_taskset_for_each_leader(leader, tset) { + cgroup_taskset_for_each_leader(leader, css, tset) { WARN_ON_ONCE(p); p = leader; + memcg = mem_cgroup_from_css(css); } if (!p) return 0; + /* + * We are now commited to this value whatever it is. Changes in this + * tunable will only affect upcoming migrations, not the current one. + * So we need to save it, and keep it going. + */ + move_flags = READ_ONCE(memcg->move_charge_at_immigrate); + if (!move_flags) + return 0; + from = mem_cgroup_from_task(p); VM_BUG_ON(from == memcg); @@ -4842,8 +4843,7 @@ static int mem_cgroup_can_attach(struct cgroup_subsys_state *css, return ret; } -static void mem_cgroup_cancel_attach(struct cgroup_subsys_state *css, - struct cgroup_taskset *tset) +static void mem_cgroup_cancel_attach(struct cgroup_taskset *tset) { if (mc.to) mem_cgroup_clear_mc(); @@ -4861,17 +4861,7 @@ static int mem_cgroup_move_charge_pte_range(pmd_t *pmd, union mc_target target; struct page *page; - /* - * We don't take compound_lock() here but no race with splitting thp - * happens because: - * - if pmd_trans_huge_lock() returns 1, the relevant thp is not - * under splitting, which means there's no concurrent thp split, - * - if another thread runs into split_huge_page() just after we - * entered this if-block, the thread must wait for page table lock - * to be unlocked in __split_huge_page_splitting(), where the main - * part of thp split is not executed yet. - */ - if (pmd_trans_huge_lock(pmd, vma, &ptl) == 1) { + if (pmd_trans_huge_lock(pmd, vma, &ptl)) { if (mc.precharge < HPAGE_PMD_NR) { spin_unlock(ptl); return 0; @@ -4880,7 +4870,7 @@ static int mem_cgroup_move_charge_pte_range(pmd_t *pmd, if (target_type == MC_TARGET_PAGE) { page = target.page; if (!isolate_lru_page(page)) { - if (!mem_cgroup_move_account(page, HPAGE_PMD_NR, + if (!mem_cgroup_move_account(page, true, mc.from, mc.to)) { mc.precharge -= HPAGE_PMD_NR; mc.moved_charge += HPAGE_PMD_NR; @@ -4907,9 +4897,18 @@ retry: switch (get_mctgt_type(vma, addr, ptent, &target)) { case MC_TARGET_PAGE: page = target.page; + /* + * We can have a part of the split pmd here. Moving it + * can be done but it would be too convoluted so simply + * ignore such a partial THP and keep it in original + * memcg. There should be somebody mapping the head. + */ + if (PageTransCompound(page)) + goto put; if (isolate_lru_page(page)) goto put; - if (!mem_cgroup_move_account(page, 1, mc.from, mc.to)) { + if (!mem_cgroup_move_account(page, false, + mc.from, mc.to)) { mc.precharge--; /* we uncharge from mc.from later. */ mc.moved_charge++; @@ -4985,10 +4984,10 @@ retry: atomic_dec(&mc.from->moving_account); } -static void mem_cgroup_move_task(struct cgroup_subsys_state *css, - struct cgroup_taskset *tset) +static void mem_cgroup_move_task(struct cgroup_taskset *tset) { - struct task_struct *p = cgroup_taskset_first(tset); + struct cgroup_subsys_state *css; + struct task_struct *p = cgroup_taskset_first(tset, &css); struct mm_struct *mm = get_task_mm(p); if (mm) { @@ -5000,17 +4999,14 @@ static void mem_cgroup_move_task(struct cgroup_subsys_state *css, mem_cgroup_clear_mc(); } #else /* !CONFIG_MMU */ -static int mem_cgroup_can_attach(struct cgroup_subsys_state *css, - struct cgroup_taskset *tset) +static int mem_cgroup_can_attach(struct cgroup_taskset *tset) { return 0; } -static void mem_cgroup_cancel_attach(struct cgroup_subsys_state *css, - struct cgroup_taskset *tset) +static void mem_cgroup_cancel_attach(struct cgroup_taskset *tset) { } -static void mem_cgroup_move_task(struct cgroup_subsys_state *css, - struct cgroup_taskset *tset) +static void mem_cgroup_move_task(struct cgroup_taskset *tset) { } #endif @@ -5184,6 +5180,7 @@ struct cgroup_subsys memory_cgrp_subsys = { .css_alloc = mem_cgroup_css_alloc, .css_online = mem_cgroup_css_online, .css_offline = mem_cgroup_css_offline, + .css_released = mem_cgroup_css_released, .css_free = mem_cgroup_css_free, .css_reset = mem_cgroup_css_reset, .can_attach = mem_cgroup_can_attach, @@ -5250,10 +5247,11 @@ bool mem_cgroup_low(struct mem_cgroup *root, struct mem_cgroup *memcg) * with mem_cgroup_cancel_charge() in case page instantiation fails. */ int mem_cgroup_try_charge(struct page *page, struct mm_struct *mm, - gfp_t gfp_mask, struct mem_cgroup **memcgp) + gfp_t gfp_mask, struct mem_cgroup **memcgp, + bool compound) { struct mem_cgroup *memcg = NULL; - unsigned int nr_pages = 1; + unsigned int nr_pages = compound ? hpage_nr_pages(page) : 1; int ret = 0; if (mem_cgroup_disabled()) @@ -5271,7 +5269,7 @@ int mem_cgroup_try_charge(struct page *page, struct mm_struct *mm, if (page->mem_cgroup) goto out; - if (do_swap_account) { + if (do_memsw_account()) { swp_entry_t ent = { .val = page_private(page), }; unsigned short id = lookup_swap_cgroup_id(ent); @@ -5283,11 +5281,6 @@ int mem_cgroup_try_charge(struct page *page, struct mm_struct *mm, } } - if (PageTransHuge(page)) { - nr_pages <<= compound_order(page); - VM_BUG_ON_PAGE(!PageTransHuge(page), page); - } - if (!memcg) memcg = get_mem_cgroup_from_mm(mm); @@ -5316,9 +5309,9 @@ out: * Use mem_cgroup_cancel_charge() to cancel the transaction instead. */ void mem_cgroup_commit_charge(struct page *page, struct mem_cgroup *memcg, - bool lrucare) + bool lrucare, bool compound) { - unsigned int nr_pages = 1; + unsigned int nr_pages = compound ? hpage_nr_pages(page) : 1; VM_BUG_ON_PAGE(!page->mapping, page); VM_BUG_ON_PAGE(PageLRU(page) && !lrucare, page); @@ -5335,17 +5328,12 @@ void mem_cgroup_commit_charge(struct page *page, struct mem_cgroup *memcg, commit_charge(page, memcg, lrucare); - if (PageTransHuge(page)) { - nr_pages <<= compound_order(page); - VM_BUG_ON_PAGE(!PageTransHuge(page), page); - } - local_irq_disable(); - mem_cgroup_charge_statistics(memcg, page, nr_pages); + mem_cgroup_charge_statistics(memcg, page, compound, nr_pages); memcg_check_events(memcg, page); local_irq_enable(); - if (do_swap_account && PageSwapCache(page)) { + if (do_memsw_account() && PageSwapCache(page)) { swp_entry_t entry = { .val = page_private(page) }; /* * The swap entry might not get freed for a long time, @@ -5363,9 +5351,10 @@ void mem_cgroup_commit_charge(struct page *page, struct mem_cgroup *memcg, * * Cancel a charge transaction started by mem_cgroup_try_charge(). */ -void mem_cgroup_cancel_charge(struct page *page, struct mem_cgroup *memcg) +void mem_cgroup_cancel_charge(struct page *page, struct mem_cgroup *memcg, + bool compound) { - unsigned int nr_pages = 1; + unsigned int nr_pages = compound ? hpage_nr_pages(page) : 1; if (mem_cgroup_disabled()) return; @@ -5377,11 +5366,6 @@ void mem_cgroup_cancel_charge(struct page *page, struct mem_cgroup *memcg) if (!memcg) return; - if (PageTransHuge(page)) { - nr_pages <<= compound_order(page); - VM_BUG_ON_PAGE(!PageTransHuge(page), page); - } - cancel_charge(memcg, nr_pages); } @@ -5394,7 +5378,7 @@ static void uncharge_batch(struct mem_cgroup *memcg, unsigned long pgpgout, if (!mem_cgroup_is_root(memcg)) { page_counter_uncharge(&memcg->memory, nr_pages); - if (do_swap_account) + if (do_memsw_account()) page_counter_uncharge(&memcg->memsw, nr_pages); memcg_oom_recover(memcg); } @@ -5511,11 +5495,11 @@ void mem_cgroup_uncharge_list(struct list_head *page_list) * mem_cgroup_replace_page - migrate a charge to another page * @oldpage: currently charged page * @newpage: page to transfer the charge to - * @lrucare: either or both pages might be on the LRU already * * Migrate the charge from @oldpage to @newpage. * * Both pages must be locked, @newpage->mapping must be set up. + * Either or both pages might be on the LRU already. */ void mem_cgroup_replace_page(struct page *oldpage, struct page *newpage) { @@ -5547,6 +5531,121 @@ void mem_cgroup_replace_page(struct page *oldpage, struct page *newpage) commit_charge(newpage, memcg, true); } +#ifdef CONFIG_INET + +DEFINE_STATIC_KEY_FALSE(memcg_sockets_enabled_key); +EXPORT_SYMBOL(memcg_sockets_enabled_key); + +void sock_update_memcg(struct sock *sk) +{ + struct mem_cgroup *memcg; + + /* Socket cloning can throw us here with sk_cgrp already + * filled. It won't however, necessarily happen from + * process context. So the test for root memcg given + * the current task's memcg won't help us in this case. + * + * Respecting the original socket's memcg is a better + * decision in this case. + */ + if (sk->sk_memcg) { + BUG_ON(mem_cgroup_is_root(sk->sk_memcg)); + css_get(&sk->sk_memcg->css); + return; + } + + rcu_read_lock(); + memcg = mem_cgroup_from_task(current); + if (memcg == root_mem_cgroup) + goto out; +#ifdef CONFIG_MEMCG_KMEM + if (!cgroup_subsys_on_dfl(memory_cgrp_subsys) && !memcg->tcp_mem.active) + goto out; +#endif + if (css_tryget_online(&memcg->css)) + sk->sk_memcg = memcg; +out: + rcu_read_unlock(); +} +EXPORT_SYMBOL(sock_update_memcg); + +void sock_release_memcg(struct sock *sk) +{ + WARN_ON(!sk->sk_memcg); + css_put(&sk->sk_memcg->css); +} + +/** + * mem_cgroup_charge_skmem - charge socket memory + * @memcg: memcg to charge + * @nr_pages: number of pages to charge + * + * Charges @nr_pages to @memcg. Returns %true if the charge fit within + * @memcg's configured limit, %false if the charge had to be forced. + */ +bool mem_cgroup_charge_skmem(struct mem_cgroup *memcg, unsigned int nr_pages) +{ + gfp_t gfp_mask = GFP_KERNEL; + +#ifdef CONFIG_MEMCG_KMEM + if (!cgroup_subsys_on_dfl(memory_cgrp_subsys)) { + struct page_counter *counter; + + if (page_counter_try_charge(&memcg->tcp_mem.memory_allocated, + nr_pages, &counter)) { + memcg->tcp_mem.memory_pressure = 0; + return true; + } + page_counter_charge(&memcg->tcp_mem.memory_allocated, nr_pages); + memcg->tcp_mem.memory_pressure = 1; + return false; + } +#endif + /* Don't block in the packet receive path */ + if (in_softirq()) + gfp_mask = GFP_NOWAIT; + + if (try_charge(memcg, gfp_mask, nr_pages) == 0) + return true; + + try_charge(memcg, gfp_mask|__GFP_NOFAIL, nr_pages); + return false; +} + +/** + * mem_cgroup_uncharge_skmem - uncharge socket memory + * @memcg - memcg to uncharge + * @nr_pages - number of pages to uncharge + */ +void mem_cgroup_uncharge_skmem(struct mem_cgroup *memcg, unsigned int nr_pages) +{ +#ifdef CONFIG_MEMCG_KMEM + if (!cgroup_subsys_on_dfl(memory_cgrp_subsys)) { + page_counter_uncharge(&memcg->tcp_mem.memory_allocated, + nr_pages); + return; + } +#endif + page_counter_uncharge(&memcg->memory, nr_pages); + css_put_many(&memcg->css, nr_pages); +} + +#endif /* CONFIG_INET */ + +static int __init cgroup_memory(char *s) +{ + char *token; + + while ((token = strsep(&s, ",")) != NULL) { + if (!*token) + continue; + if (!strcmp(token, "nosocket")) + cgroup_memory_nosocket = true; + } + return 0; +} +__setup("cgroup.memory=", cgroup_memory); + /* * subsys_initcall() for memory controller. * @@ -5602,7 +5701,7 @@ void mem_cgroup_swapout(struct page *page, swp_entry_t entry) VM_BUG_ON_PAGE(PageLRU(page), page); VM_BUG_ON_PAGE(page_count(page), page); - if (!do_swap_account) + if (!do_memsw_account()) return; memcg = page->mem_cgroup; @@ -5627,7 +5726,7 @@ void mem_cgroup_swapout(struct page *page, swp_entry_t entry) * only synchronisation we have for udpating the per-CPU variables. */ VM_BUG_ON(!irqs_disabled()); - mem_cgroup_charge_statistics(memcg, page, -1); + mem_cgroup_charge_statistics(memcg, page, false, -1); memcg_check_events(memcg, page); } @@ -5642,7 +5741,7 @@ void mem_cgroup_uncharge_swap(swp_entry_t entry) struct mem_cgroup *memcg; unsigned short id; - if (!do_swap_account) + if (!do_memsw_account()) return; id = swap_cgroup_record(entry, 0); |