diff options
Diffstat (limited to 'mm/memcontrol.c')
-rw-r--r-- | mm/memcontrol.c | 828 |
1 files changed, 321 insertions, 507 deletions
diff --git a/mm/memcontrol.c b/mm/memcontrol.c index 1cbe1e54ff5f..9106f1b12f56 100644 --- a/mm/memcontrol.c +++ b/mm/memcontrol.c @@ -754,9 +754,11 @@ static void __mem_cgroup_remove_exceeded(struct mem_cgroup_per_zone *mz, static void mem_cgroup_remove_exceeded(struct mem_cgroup_per_zone *mz, struct mem_cgroup_tree_per_zone *mctz) { - spin_lock(&mctz->lock); + unsigned long flags; + + spin_lock_irqsave(&mctz->lock, flags); __mem_cgroup_remove_exceeded(mz, mctz); - spin_unlock(&mctz->lock); + spin_unlock_irqrestore(&mctz->lock, flags); } @@ -779,7 +781,9 @@ static void mem_cgroup_update_tree(struct mem_cgroup *memcg, struct page *page) * mem is over its softlimit. */ if (excess || mz->on_tree) { - spin_lock(&mctz->lock); + unsigned long flags; + + spin_lock_irqsave(&mctz->lock, flags); /* if on-tree, remove it */ if (mz->on_tree) __mem_cgroup_remove_exceeded(mz, mctz); @@ -788,7 +792,7 @@ static void mem_cgroup_update_tree(struct mem_cgroup *memcg, struct page *page) * If excess is 0, no tree ops. */ __mem_cgroup_insert_exceeded(mz, mctz, excess); - spin_unlock(&mctz->lock); + spin_unlock_irqrestore(&mctz->lock, flags); } } } @@ -839,9 +843,9 @@ mem_cgroup_largest_soft_limit_node(struct mem_cgroup_tree_per_zone *mctz) { struct mem_cgroup_per_zone *mz; - spin_lock(&mctz->lock); + spin_lock_irq(&mctz->lock); mz = __mem_cgroup_largest_soft_limit_node(mctz); - spin_unlock(&mctz->lock); + spin_unlock_irq(&mctz->lock); return mz; } @@ -882,13 +886,6 @@ static long mem_cgroup_read_stat(struct mem_cgroup *memcg, return val; } -static void mem_cgroup_swap_statistics(struct mem_cgroup *memcg, - bool charge) -{ - int val = (charge) ? 1 : -1; - this_cpu_add(memcg->stat->count[MEM_CGROUP_STAT_SWAP], val); -} - static unsigned long mem_cgroup_read_events(struct mem_cgroup *memcg, enum mem_cgroup_events_index idx) { @@ -909,13 +906,13 @@ static unsigned long mem_cgroup_read_events(struct mem_cgroup *memcg, static void mem_cgroup_charge_statistics(struct mem_cgroup *memcg, struct page *page, - bool anon, int nr_pages) + int nr_pages) { /* * Here, RSS means 'mapped anon' and anon's SwapCache. Shmem/tmpfs is * counted as CACHE even if it's on ANON LRU. */ - if (anon) + if (PageAnon(page)) __this_cpu_add(memcg->stat->count[MEM_CGROUP_STAT_RSS], nr_pages); else @@ -1013,7 +1010,6 @@ static bool mem_cgroup_event_ratelimit(struct mem_cgroup *memcg, */ static void memcg_check_events(struct mem_cgroup *memcg, struct page *page) { - preempt_disable(); /* threshold event is triggered in finer grain than soft limit */ if (unlikely(mem_cgroup_event_ratelimit(memcg, MEM_CGROUP_TARGET_THRESH))) { @@ -1026,8 +1022,6 @@ static void memcg_check_events(struct mem_cgroup *memcg, struct page *page) do_numainfo = mem_cgroup_event_ratelimit(memcg, MEM_CGROUP_TARGET_NUMAINFO); #endif - preempt_enable(); - mem_cgroup_threshold(memcg); if (unlikely(do_softlimit)) mem_cgroup_update_tree(memcg, page); @@ -1035,8 +1029,7 @@ static void memcg_check_events(struct mem_cgroup *memcg, struct page *page) if (unlikely(do_numainfo)) atomic_inc(&memcg->numainfo_events); #endif - } else - preempt_enable(); + } } struct mem_cgroup *mem_cgroup_from_task(struct task_struct *p) @@ -1347,20 +1340,6 @@ out: return lruvec; } -/* - * Following LRU functions are allowed to be used without PCG_LOCK. - * Operations are called by routine of global LRU independently from memcg. - * What we have to take care of here is validness of pc->mem_cgroup. - * - * Changes to pc->mem_cgroup happens when - * 1. charge - * 2. moving account - * In typical case, "charge" is done before add-to-lru. Exception is SwapCache. - * It is added to LRU before charge. - * If PCG_USED bit is not set, page_cgroup is not added to this private LRU. - * When moving account, the page is not on LRU. It's isolated. - */ - /** * mem_cgroup_page_lruvec - return lruvec for adding an lru page * @page: the page @@ -2261,22 +2240,14 @@ cleanup: * * Notes: Race condition * - * We usually use lock_page_cgroup() for accessing page_cgroup member but - * it tends to be costly. But considering some conditions, we doesn't need - * to do so _always_. - * - * Considering "charge", lock_page_cgroup() is not required because all - * file-stat operations happen after a page is attached to radix-tree. There - * are no race with "charge". + * Charging occurs during page instantiation, while the page is + * unmapped and locked in page migration, or while the page table is + * locked in THP migration. No race is possible. * - * Considering "uncharge", we know that memcg doesn't clear pc->mem_cgroup - * at "uncharge" intentionally. So, we always see valid pc->mem_cgroup even - * if there are race with "uncharge". Statistics itself is properly handled - * by flags. + * Uncharge happens to pages with zero references, no race possible. * - * Considering "move", this is an only case we see a race. To make the race - * small, we check memcg->moving_account and detect there are possibility - * of race or not. If there is, we take a lock. + * Charge moving between groups is protected by checking mm->moving + * account and taking the move_lock in the slowpath. */ void __mem_cgroup_begin_update_page_stat(struct page *page, @@ -2689,6 +2660,16 @@ static struct mem_cgroup *mem_cgroup_lookup(unsigned short id) return mem_cgroup_from_id(id); } +/* + * try_get_mem_cgroup_from_page - look up page's memcg association + * @page: the page + * + * Look up, get a css reference, and return the memcg that owns @page. + * + * The page must be locked to prevent racing with swap-in and page + * cache charges. If coming from an unlocked page table, the caller + * must ensure the page is on the LRU or this can race with charging. + */ struct mem_cgroup *try_get_mem_cgroup_from_page(struct page *page) { struct mem_cgroup *memcg = NULL; @@ -2699,7 +2680,6 @@ struct mem_cgroup *try_get_mem_cgroup_from_page(struct page *page) VM_BUG_ON_PAGE(!PageLocked(page), page); pc = lookup_page_cgroup(page); - lock_page_cgroup(pc); if (PageCgroupUsed(pc)) { memcg = pc->mem_cgroup; if (memcg && !css_tryget_online(&memcg->css)) @@ -2713,19 +2693,46 @@ struct mem_cgroup *try_get_mem_cgroup_from_page(struct page *page) memcg = NULL; rcu_read_unlock(); } - unlock_page_cgroup(pc); return memcg; } +static void lock_page_lru(struct page *page, int *isolated) +{ + struct zone *zone = page_zone(page); + + spin_lock_irq(&zone->lru_lock); + if (PageLRU(page)) { + struct lruvec *lruvec; + + lruvec = mem_cgroup_page_lruvec(page, zone); + ClearPageLRU(page); + del_page_from_lru_list(page, lruvec, page_lru(page)); + *isolated = 1; + } else + *isolated = 0; +} + +static void unlock_page_lru(struct page *page, int isolated) +{ + struct zone *zone = page_zone(page); + + if (isolated) { + struct lruvec *lruvec; + + lruvec = mem_cgroup_page_lruvec(page, zone); + VM_BUG_ON_PAGE(PageLRU(page), page); + SetPageLRU(page); + add_page_to_lru_list(page, lruvec, page_lru(page)); + } + spin_unlock_irq(&zone->lru_lock); +} + static void commit_charge(struct page *page, struct mem_cgroup *memcg, - unsigned int nr_pages, bool anon, bool lrucare) + unsigned int nr_pages, bool lrucare) { struct page_cgroup *pc = lookup_page_cgroup(page); - struct zone *uninitialized_var(zone); - struct lruvec *lruvec; - bool was_on_lru = false; + int isolated; - lock_page_cgroup(pc); VM_BUG_ON_PAGE(PageCgroupUsed(pc), page); /* * we don't need page_cgroup_lock about tail pages, becase they are not @@ -2736,39 +2743,38 @@ static void commit_charge(struct page *page, struct mem_cgroup *memcg, * In some cases, SwapCache and FUSE(splice_buf->radixtree), the page * may already be on some other mem_cgroup's LRU. Take care of it. */ - if (lrucare) { - zone = page_zone(page); - spin_lock_irq(&zone->lru_lock); - if (PageLRU(page)) { - lruvec = mem_cgroup_zone_lruvec(zone, pc->mem_cgroup); - ClearPageLRU(page); - del_page_from_lru_list(page, lruvec, page_lru(page)); - was_on_lru = true; - } - } + if (lrucare) + lock_page_lru(page, &isolated); + /* + * Nobody should be changing or seriously looking at + * pc->mem_cgroup and pc->flags at this point: + * + * - the page is uncharged + * + * - the page is off-LRU + * + * - an anonymous fault has exclusive page access, except for + * a locked page table + * + * - a page cache insertion, a swapin fault, or a migration + * have the page locked + */ pc->mem_cgroup = memcg; - SetPageCgroupUsed(pc); - - if (lrucare) { - if (was_on_lru) { - lruvec = mem_cgroup_zone_lruvec(zone, pc->mem_cgroup); - VM_BUG_ON_PAGE(PageLRU(page), page); - SetPageLRU(page); - add_page_to_lru_list(page, lruvec, page_lru(page)); - } - spin_unlock_irq(&zone->lru_lock); - } + pc->flags = PCG_USED | PCG_MEM | (do_swap_account ? PCG_MEMSW : 0); - mem_cgroup_charge_statistics(memcg, page, anon, nr_pages); - unlock_page_cgroup(pc); + if (lrucare) + unlock_page_lru(page, isolated); + local_irq_disable(); + mem_cgroup_charge_statistics(memcg, page, nr_pages); /* * "charge_statistics" updated event counter. Then, check it. * Insert ancestor (and ancestor's ancestors), to softlimit RB-tree. * if they exceeds softlimit. */ memcg_check_events(memcg, page); + local_irq_enable(); } static DEFINE_MUTEX(set_limit_mutex); @@ -3395,7 +3401,6 @@ static inline void memcg_unregister_all_caches(struct mem_cgroup *memcg) #ifdef CONFIG_TRANSPARENT_HUGEPAGE -#define PCGF_NOCOPY_AT_SPLIT (1 << PCG_LOCK | 1 << PCG_MIGRATION) /* * Because tail pages are not marked as "used", set it. We're under * zone->lru_lock, 'splitting on pmd' and compound_lock. @@ -3416,7 +3421,7 @@ void mem_cgroup_split_huge_fixup(struct page *head) for (i = 1; i < HPAGE_PMD_NR; i++) { pc = head_pc + i; pc->mem_cgroup = memcg; - pc->flags = head_pc->flags & ~PCGF_NOCOPY_AT_SPLIT; + pc->flags = head_pc->flags; } __this_cpu_sub(memcg->stat->count[MEM_CGROUP_STAT_RSS_HUGE], HPAGE_PMD_NR); @@ -3446,7 +3451,6 @@ static int mem_cgroup_move_account(struct page *page, { unsigned long flags; int ret; - bool anon = PageAnon(page); VM_BUG_ON(from == to); VM_BUG_ON_PAGE(PageLRU(page), page); @@ -3460,15 +3464,21 @@ static int mem_cgroup_move_account(struct page *page, if (nr_pages > 1 && !PageTransHuge(page)) goto out; - lock_page_cgroup(pc); + /* + * Prevent mem_cgroup_migrate() from looking at pc->mem_cgroup + * of its source page while we change it: page migration takes + * both pages off the LRU, but page cache replacement doesn't. + */ + if (!trylock_page(page)) + goto out; ret = -EINVAL; if (!PageCgroupUsed(pc) || pc->mem_cgroup != from) - goto unlock; + goto out_unlock; move_lock_mem_cgroup(from, &flags); - if (!anon && page_mapped(page)) { + if (!PageAnon(page) && page_mapped(page)) { __this_cpu_sub(from->stat->count[MEM_CGROUP_STAT_FILE_MAPPED], nr_pages); __this_cpu_add(to->stat->count[MEM_CGROUP_STAT_FILE_MAPPED], @@ -3482,20 +3492,25 @@ static int mem_cgroup_move_account(struct page *page, nr_pages); } - mem_cgroup_charge_statistics(from, page, anon, -nr_pages); + /* + * It is safe to change pc->mem_cgroup here because the page + * is referenced, charged, and isolated - we can't race with + * uncharging, charging, migration, or LRU putback. + */ /* caller should have done css_get */ pc->mem_cgroup = to; - mem_cgroup_charge_statistics(to, page, anon, nr_pages); move_unlock_mem_cgroup(from, &flags); ret = 0; -unlock: - unlock_page_cgroup(pc); - /* - * check events - */ + + local_irq_disable(); + mem_cgroup_charge_statistics(to, page, nr_pages); memcg_check_events(to, page); + mem_cgroup_charge_statistics(from, page, -nr_pages); memcg_check_events(from, page); + local_irq_enable(); +out_unlock: + unlock_page(page); out: return ret; } @@ -3566,193 +3581,6 @@ out: return ret; } -static void mem_cgroup_do_uncharge(struct mem_cgroup *memcg, - unsigned int nr_pages, - const enum charge_type ctype) -{ - struct memcg_batch_info *batch = NULL; - bool uncharge_memsw = true; - - /* If swapout, usage of swap doesn't decrease */ - if (!do_swap_account || ctype == MEM_CGROUP_CHARGE_TYPE_SWAPOUT) - uncharge_memsw = false; - - batch = ¤t->memcg_batch; - /* - * In usual, we do css_get() when we remember memcg pointer. - * But in this case, we keep res->usage until end of a series of - * uncharges. Then, it's ok to ignore memcg's refcnt. - */ - if (!batch->memcg) - batch->memcg = memcg; - /* - * do_batch > 0 when unmapping pages or inode invalidate/truncate. - * In those cases, all pages freed continuously can be expected to be in - * the same cgroup and we have chance to coalesce uncharges. - * But we do uncharge one by one if this is killed by OOM(TIF_MEMDIE) - * because we want to do uncharge as soon as possible. - */ - - if (!batch->do_batch || test_thread_flag(TIF_MEMDIE)) - goto direct_uncharge; - - if (nr_pages > 1) - goto direct_uncharge; - - /* - * In typical case, batch->memcg == mem. This means we can - * merge a series of uncharges to an uncharge of res_counter. - * If not, we uncharge res_counter ony by one. - */ - if (batch->memcg != memcg) - goto direct_uncharge; - /* remember freed charge and uncharge it later */ - batch->nr_pages++; - if (uncharge_memsw) - batch->memsw_nr_pages++; - return; -direct_uncharge: - res_counter_uncharge(&memcg->res, nr_pages * PAGE_SIZE); - if (uncharge_memsw) - res_counter_uncharge(&memcg->memsw, nr_pages * PAGE_SIZE); - if (unlikely(batch->memcg != memcg)) - memcg_oom_recover(memcg); -} - -/* - * uncharge if !page_mapped(page) - */ -static struct mem_cgroup * -__mem_cgroup_uncharge_common(struct page *page, enum charge_type ctype, - bool end_migration) -{ - struct mem_cgroup *memcg = NULL; - unsigned int nr_pages = 1; - struct page_cgroup *pc; - bool anon; - - if (mem_cgroup_disabled()) - return NULL; - - if (PageTransHuge(page)) { - nr_pages <<= compound_order(page); - VM_BUG_ON_PAGE(!PageTransHuge(page), page); - } - /* - * Check if our page_cgroup is valid - */ - pc = lookup_page_cgroup(page); - if (unlikely(!PageCgroupUsed(pc))) - return NULL; - - lock_page_cgroup(pc); - - memcg = pc->mem_cgroup; - - if (!PageCgroupUsed(pc)) - goto unlock_out; - - anon = PageAnon(page); - - switch (ctype) { - case MEM_CGROUP_CHARGE_TYPE_ANON: - /* - * Generally PageAnon tells if it's the anon statistics to be - * updated; but sometimes e.g. mem_cgroup_uncharge_page() is - * used before page reached the stage of being marked PageAnon. - */ - anon = true; - /* fallthrough */ - case MEM_CGROUP_CHARGE_TYPE_DROP: - /* See mem_cgroup_prepare_migration() */ - if (page_mapped(page)) - goto unlock_out; - /* - * Pages under migration may not be uncharged. But - * end_migration() /must/ be the one uncharging the - * unused post-migration page and so it has to call - * here with the migration bit still set. See the - * res_counter handling below. - */ - if (!end_migration && PageCgroupMigration(pc)) - goto unlock_out; - break; - case MEM_CGROUP_CHARGE_TYPE_SWAPOUT: - if (!PageAnon(page)) { /* Shared memory */ - if (page->mapping && !page_is_file_cache(page)) - goto unlock_out; - } else if (page_mapped(page)) /* Anon */ - goto unlock_out; - break; - default: - break; - } - - mem_cgroup_charge_statistics(memcg, page, anon, -nr_pages); - - ClearPageCgroupUsed(pc); - /* - * pc->mem_cgroup is not cleared here. It will be accessed when it's - * freed from LRU. This is safe because uncharged page is expected not - * to be reused (freed soon). Exception is SwapCache, it's handled by - * special functions. - */ - - unlock_page_cgroup(pc); - /* - * even after unlock, we have memcg->res.usage here and this memcg - * will never be freed, so it's safe to call css_get(). - */ - memcg_check_events(memcg, page); - if (do_swap_account && ctype == MEM_CGROUP_CHARGE_TYPE_SWAPOUT) { - mem_cgroup_swap_statistics(memcg, true); - css_get(&memcg->css); - } - /* - * Migration does not charge the res_counter for the - * replacement page, so leave it alone when phasing out the - * page that is unused after the migration. - */ - if (!end_migration) - mem_cgroup_do_uncharge(memcg, nr_pages, ctype); - - return memcg; - -unlock_out: - unlock_page_cgroup(pc); - return NULL; -} - -void mem_cgroup_uncharge_page(struct page *page) -{ - /* early check. */ - if (page_mapped(page)) - return; - VM_BUG_ON_PAGE(page->mapping && !PageAnon(page), page); - /* - * If the page is in swap cache, uncharge should be deferred - * to the swap path, which also properly accounts swap usage - * and handles memcg lifetime. - * - * Note that this check is not stable and reclaim may add the - * page to swap cache at any time after this. However, if the - * page is not in swap cache by the time page->mapcount hits - * 0, there won't be any page table references to the swap - * slot, and reclaim will free it and not actually write the - * page to disk. - */ - if (PageSwapCache(page)) - return; - __mem_cgroup_uncharge_common(page, MEM_CGROUP_CHARGE_TYPE_ANON, false); -} - -void mem_cgroup_uncharge_cache_page(struct page *page) -{ - VM_BUG_ON_PAGE(page_mapped(page), page); - VM_BUG_ON_PAGE(page->mapping, page); - __mem_cgroup_uncharge_common(page, MEM_CGROUP_CHARGE_TYPE_CACHE, false); -} - /* * Batch_start/batch_end is called in unmap_page_range/invlidate/trucate. * In that cases, pages are freed continuously and we can expect pages @@ -3763,6 +3591,9 @@ void mem_cgroup_uncharge_cache_page(struct page *page) void mem_cgroup_uncharge_start(void) { + unsigned long flags; + + local_irq_save(flags); current->memcg_batch.do_batch++; /* We can do nest. */ if (current->memcg_batch.do_batch == 1) { @@ -3770,21 +3601,18 @@ void mem_cgroup_uncharge_start(void) current->memcg_batch.nr_pages = 0; current->memcg_batch.memsw_nr_pages = 0; } + local_irq_restore(flags); } void mem_cgroup_uncharge_end(void) { struct memcg_batch_info *batch = ¤t->memcg_batch; + unsigned long flags; - if (!batch->do_batch) - return; - - batch->do_batch--; - if (batch->do_batch) /* If stacked, do nothing. */ - return; - - if (!batch->memcg) - return; + local_irq_save(flags); + VM_BUG_ON(!batch->do_batch); + if (--batch->do_batch) /* If stacked, do nothing */ + goto out; /* * This "batch->memcg" is valid without any css_get/put etc... * bacause we hide charges behind us. @@ -3796,61 +3624,16 @@ void mem_cgroup_uncharge_end(void) res_counter_uncharge(&batch->memcg->memsw, batch->memsw_nr_pages * PAGE_SIZE); memcg_oom_recover(batch->memcg); - /* forget this pointer (for sanity check) */ - batch->memcg = NULL; -} - -#ifdef CONFIG_SWAP -/* - * called after __delete_from_swap_cache() and drop "page" account. - * memcg information is recorded to swap_cgroup of "ent" - */ -void -mem_cgroup_uncharge_swapcache(struct page *page, swp_entry_t ent, bool swapout) -{ - struct mem_cgroup *memcg; - int ctype = MEM_CGROUP_CHARGE_TYPE_SWAPOUT; - - if (!swapout) /* this was a swap cache but the swap is unused ! */ - ctype = MEM_CGROUP_CHARGE_TYPE_DROP; - - memcg = __mem_cgroup_uncharge_common(page, ctype, false); - - /* - * record memcg information, if swapout && memcg != NULL, - * css_get() was called in uncharge(). - */ - if (do_swap_account && swapout && memcg) - swap_cgroup_record(ent, mem_cgroup_id(memcg)); +out: + local_irq_restore(flags); } -#endif #ifdef CONFIG_MEMCG_SWAP -/* - * called from swap_entry_free(). remove record in swap_cgroup and - * uncharge "memsw" account. - */ -void mem_cgroup_uncharge_swap(swp_entry_t ent) +static void mem_cgroup_swap_statistics(struct mem_cgroup *memcg, + bool charge) { - struct mem_cgroup *memcg; - unsigned short id; - - if (!do_swap_account) - return; - - id = swap_cgroup_record(ent, 0); - rcu_read_lock(); - memcg = mem_cgroup_lookup(id); - if (memcg) { - /* - * We uncharge this because swap is freed. This memcg can - * be obsolete one. We avoid calling css_tryget_online(). - */ - res_counter_uncharge(&memcg->memsw, PAGE_SIZE); - mem_cgroup_swap_statistics(memcg, false); - css_put(&memcg->css); - } - rcu_read_unlock(); + int val = (charge) ? 1 : -1; + this_cpu_add(memcg->stat->count[MEM_CGROUP_STAT_SWAP], val); } /** @@ -3902,169 +3685,6 @@ static inline int mem_cgroup_move_swap_account(swp_entry_t entry, } #endif -/* - * Before starting migration, account PAGE_SIZE to mem_cgroup that the old - * page belongs to. - */ -void mem_cgroup_prepare_migration(struct page *page, struct page *newpage, - struct mem_cgroup **memcgp) -{ - struct mem_cgroup *memcg = NULL; - unsigned int nr_pages = 1; - struct page_cgroup *pc; - - *memcgp = NULL; - - if (mem_cgroup_disabled()) - return; - - if (PageTransHuge(page)) - nr_pages <<= compound_order(page); - - pc = lookup_page_cgroup(page); - lock_page_cgroup(pc); - if (PageCgroupUsed(pc)) { - memcg = pc->mem_cgroup; - css_get(&memcg->css); - /* - * At migrating an anonymous page, its mapcount goes down - * to 0 and uncharge() will be called. But, even if it's fully - * unmapped, migration may fail and this page has to be - * charged again. We set MIGRATION flag here and delay uncharge - * until end_migration() is called - * - * Corner Case Thinking - * A) - * When the old page was mapped as Anon and it's unmap-and-freed - * while migration was ongoing. - * If unmap finds the old page, uncharge() of it will be delayed - * until end_migration(). If unmap finds a new page, it's - * uncharged when it make mapcount to be 1->0. If unmap code - * finds swap_migration_entry, the new page will not be mapped - * and end_migration() will find it(mapcount==0). - * - * B) - * When the old page was mapped but migraion fails, the kernel - * remaps it. A charge for it is kept by MIGRATION flag even - * if mapcount goes down to 0. We can do remap successfully - * without charging it again. - * - * C) - * The "old" page is under lock_page() until the end of - * migration, so, the old page itself will not be swapped-out. - * If the new page is swapped out before end_migraton, our - * hook to usual swap-out path will catch the event. - */ - if (PageAnon(page)) - SetPageCgroupMigration(pc); - } - unlock_page_cgroup(pc); - /* - * If the page is not charged at this point, - * we return here. - */ - if (!memcg) - return; - - *memcgp = memcg; - /* - * We charge new page before it's used/mapped. So, even if unlock_page() - * is called before end_migration, we can catch all events on this new - * page. In the case new page is migrated but not remapped, new page's - * mapcount will be finally 0 and we call uncharge in end_migration(). - */ - /* - * The page is committed to the memcg, but it's not actually - * charged to the res_counter since we plan on replacing the - * old one and only one page is going to be left afterwards. - */ - commit_charge(newpage, memcg, nr_pages, PageAnon(page), false); -} - -/* remove redundant charge if migration failed*/ -void mem_cgroup_end_migration(struct mem_cgroup *memcg, - struct page *oldpage, struct page *newpage, bool migration_ok) -{ - struct page *used, *unused; - struct page_cgroup *pc; - bool anon; - - if (!memcg) - return; - - if (!migration_ok) { - used = oldpage; - unused = newpage; - } else { - used = newpage; - unused = oldpage; - } - anon = PageAnon(used); - __mem_cgroup_uncharge_common(unused, - anon ? MEM_CGROUP_CHARGE_TYPE_ANON - : MEM_CGROUP_CHARGE_TYPE_CACHE, - true); - css_put(&memcg->css); - /* - * We disallowed uncharge of pages under migration because mapcount - * of the page goes down to zero, temporarly. - * Clear the flag and check the page should be charged. - */ - pc = lookup_page_cgroup(oldpage); - lock_page_cgroup(pc); - ClearPageCgroupMigration(pc); - unlock_page_cgroup(pc); - - /* - * If a page is a file cache, radix-tree replacement is very atomic - * and we can skip this check. When it was an Anon page, its mapcount - * goes down to 0. But because we added MIGRATION flage, it's not - * uncharged yet. There are several case but page->mapcount check - * and USED bit check in mem_cgroup_uncharge_page() will do enough - * check. (see prepare_charge() also) - */ - if (anon) - mem_cgroup_uncharge_page(used); -} - -/* - * At replace page cache, newpage is not under any memcg but it's on - * LRU. So, this function doesn't touch res_counter but handles LRU - * in correct way. Both pages are locked so we cannot race with uncharge. - */ -void mem_cgroup_replace_page_cache(struct page *oldpage, - struct page *newpage) -{ - struct mem_cgroup *memcg = NULL; - struct page_cgroup *pc; - - if (mem_cgroup_disabled()) - return; - - pc = lookup_page_cgroup(oldpage); - /* fix accounting on old pages */ - lock_page_cgroup(pc); - if (PageCgroupUsed(pc)) { - memcg = pc->mem_cgroup; - mem_cgroup_charge_statistics(memcg, oldpage, false, -1); - ClearPageCgroupUsed(pc); - } - unlock_page_cgroup(pc); - - /* - * When called from shmem_replace_page(), in some cases the - * oldpage has already been charged, and in some cases not. - */ - if (!memcg) - return; - /* - * Even if newpage->mapping was NULL before starting replacement, - * the newpage may be on LRU(or pagevec for LRU) already. We lock - * LRU while we overwrite pc->mem_cgroup. - */ - commit_charge(newpage, memcg, 1, false, true); -} - #ifdef CONFIG_DEBUG_VM static struct page_cgroup *lookup_page_cgroup_used(struct page *page) { @@ -4263,7 +3883,7 @@ unsigned long mem_cgroup_soft_limit_reclaim(struct zone *zone, int order, gfp_mask, &nr_scanned); nr_reclaimed += reclaimed; *total_scanned += nr_scanned; - spin_lock(&mctz->lock); + spin_lock_irq(&mctz->lock); /* * If we failed to reclaim anything from this memory cgroup @@ -4303,7 +3923,7 @@ unsigned long mem_cgroup_soft_limit_reclaim(struct zone *zone, int order, */ /* If excess == 0, no tree ops */ __mem_cgroup_insert_exceeded(mz, mctz, excess); - spin_unlock(&mctz->lock); + spin_unlock_irq(&mctz->lock); css_put(&mz->memcg->css); loop++; /* @@ -6265,9 +5885,9 @@ static enum mc_target_type get_mctgt_type(struct vm_area_struct *vma, if (page) { pc = lookup_page_cgroup(page); /* - * Do only loose check w/o page_cgroup lock. - * mem_cgroup_move_account() checks the pc is valid or not under - * the lock. + * Do only loose check w/o serialization. + * mem_cgroup_move_account() checks the pc is valid or + * not under LRU exclusion. */ if (PageCgroupUsed(pc) && pc->mem_cgroup == mc.from) { ret = MC_TARGET_PAGE; @@ -6729,6 +6349,67 @@ static void __init enable_swap_cgroup(void) } #endif +#ifdef CONFIG_MEMCG_SWAP +/** + * mem_cgroup_swapout - transfer a memsw charge to swap + * @page: page whose memsw charge to transfer + * @entry: swap entry to move the charge to + * + * Transfer the memsw charge of @page to @entry. + */ +void mem_cgroup_swapout(struct page *page, swp_entry_t entry) +{ + struct page_cgroup *pc; + unsigned short oldid; + + VM_BUG_ON_PAGE(PageLRU(page), page); + VM_BUG_ON_PAGE(page_count(page), page); + + if (!do_swap_account) + return; + + pc = lookup_page_cgroup(page); + + /* Readahead page, never charged */ + if (!PageCgroupUsed(pc)) + return; + + VM_BUG_ON_PAGE(!(pc->flags & PCG_MEMSW), page); + + oldid = swap_cgroup_record(entry, mem_cgroup_id(pc->mem_cgroup)); + VM_BUG_ON_PAGE(oldid, page); + + pc->flags &= ~PCG_MEMSW; + css_get(&pc->mem_cgroup->css); + mem_cgroup_swap_statistics(pc->mem_cgroup, true); +} + +/** + * mem_cgroup_uncharge_swap - uncharge a swap entry + * @entry: swap entry to uncharge + * + * Drop the memsw charge associated with @entry. + */ +void mem_cgroup_uncharge_swap(swp_entry_t entry) +{ + struct mem_cgroup *memcg; + unsigned short id; + + if (!do_swap_account) + return; + + id = swap_cgroup_record(entry, 0); + rcu_read_lock(); + memcg = mem_cgroup_lookup(id); + if (memcg) { + res_counter_uncharge(&memcg->memsw, PAGE_SIZE); + mem_cgroup_swap_statistics(memcg, false); + css_put(&memcg->css); + } + rcu_read_unlock(); +} +#endif + /** * mem_cgroup_try_charge - try charging a page * @page: page to charge @@ -6831,7 +6512,7 @@ void mem_cgroup_commit_charge(struct page *page, struct mem_cgroup *memcg, VM_BUG_ON_PAGE(!PageTransHuge(page), page); } - commit_charge(page, memcg, nr_pages, PageAnon(page), lrucare); + commit_charge(page, memcg, nr_pages, lrucare); if (do_swap_account && PageSwapCache(page)) { swp_entry_t entry = { .val = page_private(page) }; @@ -6873,6 +6554,139 @@ void mem_cgroup_cancel_charge(struct page *page, struct mem_cgroup *memcg) cancel_charge(memcg, nr_pages); } +/** + * mem_cgroup_uncharge - uncharge a page + * @page: page to uncharge + * + * Uncharge a page previously charged with mem_cgroup_try_charge() and + * mem_cgroup_commit_charge(). + */ +void mem_cgroup_uncharge(struct page *page) +{ + struct memcg_batch_info *batch; + unsigned int nr_pages = 1; + struct mem_cgroup *memcg; + struct page_cgroup *pc; + unsigned long pc_flags; + unsigned long flags; + + VM_BUG_ON_PAGE(PageLRU(page), page); + VM_BUG_ON_PAGE(page_count(page), page); + + if (mem_cgroup_disabled()) + return; + + pc = lookup_page_cgroup(page); + + /* Every final put_page() ends up here */ + if (!PageCgroupUsed(pc)) + return; + + if (PageTransHuge(page)) { + nr_pages <<= compound_order(page); + VM_BUG_ON_PAGE(!PageTransHuge(page), page); + } + /* + * Nobody should be changing or seriously looking at + * pc->mem_cgroup and pc->flags at this point, we have fully + * exclusive access to the page. + */ + memcg = pc->mem_cgroup; + pc_flags = pc->flags; + pc->flags = 0; + + local_irq_save(flags); + + if (nr_pages > 1) + goto direct; + if (unlikely(test_thread_flag(TIF_MEMDIE))) + goto direct; + batch = ¤t->memcg_batch; + if (!batch->do_batch) + goto direct; + if (batch->memcg && batch->memcg != memcg) + goto direct; + if (!batch->memcg) + batch->memcg = memcg; + if (pc_flags & PCG_MEM) + batch->nr_pages++; + if (pc_flags & PCG_MEMSW) + batch->memsw_nr_pages++; + goto out; +direct: + if (pc_flags & PCG_MEM) + res_counter_uncharge(&memcg->res, nr_pages * PAGE_SIZE); + if (pc_flags & PCG_MEMSW) + res_counter_uncharge(&memcg->memsw, nr_pages * PAGE_SIZE); + memcg_oom_recover(memcg); +out: + mem_cgroup_charge_statistics(memcg, page, -nr_pages); + memcg_check_events(memcg, page); + + local_irq_restore(flags); +} + +/** + * mem_cgroup_migrate - migrate a charge to another page + * @oldpage: currently charged page + * @newpage: page to transfer the charge to + * @lrucare: both pages might be on the LRU already + * + * Migrate the charge from @oldpage to @newpage. + * + * Both pages must be locked, @newpage->mapping must be set up. + */ +void mem_cgroup_migrate(struct page *oldpage, struct page *newpage, + bool lrucare) +{ + unsigned int nr_pages = 1; + struct page_cgroup *pc; + int isolated; + + VM_BUG_ON_PAGE(!PageLocked(oldpage), oldpage); + VM_BUG_ON_PAGE(!PageLocked(newpage), newpage); + VM_BUG_ON_PAGE(!lrucare && PageLRU(oldpage), oldpage); + VM_BUG_ON_PAGE(!lrucare && PageLRU(newpage), newpage); + VM_BUG_ON_PAGE(PageAnon(oldpage) != PageAnon(newpage), newpage); + + if (mem_cgroup_disabled()) + return; + + /* Page cache replacement: new page already charged? */ + pc = lookup_page_cgroup(newpage); + if (PageCgroupUsed(pc)) + return; + + /* Re-entrant migration: old page already uncharged? */ + pc = lookup_page_cgroup(oldpage); + if (!PageCgroupUsed(pc)) + return; + + VM_BUG_ON_PAGE(!(pc->flags & PCG_MEM), oldpage); + VM_BUG_ON_PAGE(do_swap_account && !(pc->flags & PCG_MEMSW), oldpage); + + if (PageTransHuge(oldpage)) { + nr_pages <<= compound_order(oldpage); + VM_BUG_ON_PAGE(!PageTransHuge(oldpage), oldpage); + VM_BUG_ON_PAGE(!PageTransHuge(newpage), newpage); + } + + if (lrucare) + lock_page_lru(oldpage, &isolated); + + pc->flags = 0; + + if (lrucare) + unlock_page_lru(oldpage, isolated); + + local_irq_disable(); + mem_cgroup_charge_statistics(pc->mem_cgroup, oldpage, -nr_pages); + memcg_check_events(pc->mem_cgroup, oldpage); + local_irq_enable(); + + commit_charge(newpage, pc->mem_cgroup, nr_pages, lrucare); +} + /* * subsys_initcall() for memory controller. * |