diff options
Diffstat (limited to 'mm/vmscan.c')
-rw-r--r-- | mm/vmscan.c | 490 |
1 files changed, 277 insertions, 213 deletions
diff --git a/mm/vmscan.c b/mm/vmscan.c index 32c661d66a45..2836b5373b2e 100644 --- a/mm/vmscan.c +++ b/mm/vmscan.c @@ -11,6 +11,8 @@ * Multiqueue VM started 5.8.00, Rik van Riel. */ +#define pr_fmt(fmt) KBUILD_MODNAME ": " fmt + #include <linux/mm.h> #include <linux/module.h> #include <linux/gfp.h> @@ -43,6 +45,7 @@ #include <linux/sysctl.h> #include <linux/oom.h> #include <linux/prefetch.h> +#include <linux/printk.h> #include <asm/tlbflush.h> #include <asm/div64.h> @@ -56,32 +59,20 @@ #include <trace/events/vmscan.h> struct scan_control { - /* Incremented by the number of inactive pages that were scanned */ - unsigned long nr_scanned; - - /* Number of pages freed so far during a call to shrink_zones() */ - unsigned long nr_reclaimed; - /* How many pages shrink_list() should reclaim */ unsigned long nr_to_reclaim; - unsigned long hibernation_mode; - /* This context's GFP mask */ gfp_t gfp_mask; - int may_writepage; - - /* Can mapped pages be reclaimed? */ - int may_unmap; - - /* Can pages be swapped as part of reclaim? */ - int may_swap; - + /* Allocation order */ int order; - /* Scan (total_size >> priority) pages at once */ - int priority; + /* + * Nodemask of nodes allowed by the caller. If NULL, all nodes + * are scanned. + */ + nodemask_t *nodemask; /* * The memory cgroup that hit its limit and as a result is the @@ -89,11 +80,27 @@ struct scan_control { */ struct mem_cgroup *target_mem_cgroup; - /* - * Nodemask of nodes allowed by the caller. If NULL, all nodes - * are scanned. - */ - nodemask_t *nodemask; + /* Scan (total_size >> priority) pages at once */ + int priority; + + unsigned int may_writepage:1; + + /* Can mapped pages be reclaimed? */ + unsigned int may_unmap:1; + + /* Can pages be swapped as part of reclaim? */ + unsigned int may_swap:1; + + unsigned int hibernation_mode:1; + + /* One of the zones is ready for compaction */ + unsigned int compaction_ready:1; + + /* Incremented by the number of inactive pages that were scanned */ + unsigned long nr_scanned; + + /* Number of pages freed so far during a call to shrink_zones() */ + unsigned long nr_reclaimed; }; #define lru_to_page(_head) (list_entry((_head)->prev, struct page, lru)) @@ -130,7 +137,11 @@ struct scan_control { * From 0 .. 100. Higher means more swappy. */ int vm_swappiness = 60; -unsigned long vm_total_pages; /* The total number of pages which the VM controls */ +/* + * The total number of pages which are beyond the high watermark within all + * zones. + */ +unsigned long vm_total_pages; static LIST_HEAD(shrinker_list); static DECLARE_RWSEM(shrinker_rwsem); @@ -163,7 +174,8 @@ static unsigned long zone_reclaimable_pages(struct zone *zone) bool zone_reclaimable(struct zone *zone) { - return zone->pages_scanned < zone_reclaimable_pages(zone) * 6; + return zone_page_state(zone, NR_PAGES_SCANNED) < + zone_reclaimable_pages(zone) * 6; } static unsigned long get_lru_size(struct lruvec *lruvec, enum lru_list lru) @@ -324,7 +336,7 @@ shrink_slab_node(struct shrink_control *shrinkctl, struct shrinker *shrinker, else new_nr = atomic_long_read(&shrinker->nr_deferred[nid]); - trace_mm_shrink_slab_end(shrinker, freed, nr, new_nr); + trace_mm_shrink_slab_end(shrinker, nid, freed, nr, new_nr, total_scan); return freed; } @@ -458,7 +470,7 @@ static pageout_t pageout(struct page *page, struct address_space *mapping, * stalls if we need to run get_block(). We could test * PagePrivate for that. * - * If this process is currently in __generic_file_aio_write() against + * If this process is currently in __generic_file_write_iter() against * this page's queue, we can perform writeback even if that * will block. * @@ -477,7 +489,7 @@ static pageout_t pageout(struct page *page, struct address_space *mapping, if (page_has_private(page)) { if (try_to_free_buffers(page)) { ClearPageDirty(page); - printk("%s: orphaned page\n", __func__); + pr_info("%s: orphaned page\n", __func__); return PAGE_CLEAN; } } @@ -565,9 +577,10 @@ static int __remove_mapping(struct address_space *mapping, struct page *page, if (PageSwapCache(page)) { swp_entry_t swap = { .val = page_private(page) }; + mem_cgroup_swapout(page, swap); __delete_from_swap_cache(page); spin_unlock_irq(&mapping->tree_lock); - swapcache_free(swap, page); + swapcache_free(swap); } else { void (*freepage)(struct page *); void *shadow = NULL; @@ -588,7 +601,6 @@ static int __remove_mapping(struct address_space *mapping, struct page *page, shadow = workingset_eviction(mapping, page); __delete_from_page_cache(page, shadow); spin_unlock_irq(&mapping->tree_lock); - mem_cgroup_uncharge_cache_page(page); if (freepage != NULL) freepage(page); @@ -810,7 +822,6 @@ static unsigned long shrink_page_list(struct list_head *page_list, cond_resched(); - mem_cgroup_uncharge_start(); while (!list_empty(page_list)) { struct address_space *mapping; struct page *page; @@ -1121,11 +1132,12 @@ keep: VM_BUG_ON_PAGE(PageLRU(page) || PageUnevictable(page), page); } - free_hot_cold_page_list(&free_pages, 1); + mem_cgroup_uncharge_list(&free_pages); + free_hot_cold_page_list(&free_pages, true); list_splice(&ret_pages, page_list); count_vm_events(PGACTIVATE, pgactivate); - mem_cgroup_uncharge_end(); + *ret_nr_dirty += nr_dirty; *ret_nr_congested += nr_congested; *ret_nr_unqueued_dirty += nr_unqueued_dirty; @@ -1425,6 +1437,7 @@ putback_inactive_pages(struct lruvec *lruvec, struct list_head *page_list) if (unlikely(PageCompound(page))) { spin_unlock_irq(&zone->lru_lock); + mem_cgroup_uncharge(page); (*get_compound_page_dtor(page))(page); spin_lock_irq(&zone->lru_lock); } else @@ -1439,6 +1452,19 @@ putback_inactive_pages(struct lruvec *lruvec, struct list_head *page_list) } /* + * If a kernel thread (such as nfsd for loop-back mounts) services + * a backing device by writing to the page cache it sets PF_LESS_THROTTLE. + * In that case we should only throttle if the backing device it is + * writing to is congested. In other cases it is safe to throttle. + */ +static int current_may_throttle(void) +{ + return !(current->flags & PF_LESS_THROTTLE) || + current->backing_dev_info == NULL || + bdi_write_congested(current->backing_dev_info); +} + +/* * shrink_inactive_list() is a helper for shrink_zone(). It returns the number * of reclaimed pages */ @@ -1484,7 +1510,7 @@ shrink_inactive_list(unsigned long nr_to_scan, struct lruvec *lruvec, __mod_zone_page_state(zone, NR_ISOLATED_ANON + file, nr_taken); if (global_reclaim(sc)) { - zone->pages_scanned += nr_scanned; + __mod_zone_page_state(zone, NR_PAGES_SCANNED, nr_scanned); if (current_is_kswapd()) __count_zone_vm_events(PGSCAN_KSWAPD, zone, nr_scanned); else @@ -1519,7 +1545,8 @@ shrink_inactive_list(unsigned long nr_to_scan, struct lruvec *lruvec, spin_unlock_irq(&zone->lru_lock); - free_hot_cold_page_list(&page_list, 1); + mem_cgroup_uncharge_list(&page_list); + free_hot_cold_page_list(&page_list, true); /* * If reclaim is isolating dirty pages under writeback, it implies @@ -1554,19 +1581,18 @@ shrink_inactive_list(unsigned long nr_to_scan, struct lruvec *lruvec, * If dirty pages are scanned that are not queued for IO, it * implies that flushers are not keeping up. In this case, flag * the zone ZONE_TAIL_LRU_DIRTY and kswapd will start writing - * pages from reclaim context. It will forcibly stall in the - * next check. + * pages from reclaim context. */ if (nr_unqueued_dirty == nr_taken) zone_set_flag(zone, ZONE_TAIL_LRU_DIRTY); /* - * In addition, if kswapd scans pages marked marked for - * immediate reclaim and under writeback (nr_immediate), it - * implies that pages are cycling through the LRU faster than + * If kswapd scans pages marked marked for immediate + * reclaim and under writeback (nr_immediate), it implies + * that pages are cycling through the LRU faster than * they are written so also forcibly stall. */ - if (nr_unqueued_dirty == nr_taken || nr_immediate) + if (nr_immediate && current_may_throttle()) congestion_wait(BLK_RW_ASYNC, HZ/10); } @@ -1575,7 +1601,8 @@ shrink_inactive_list(unsigned long nr_to_scan, struct lruvec *lruvec, * is congested. Allow kswapd to continue until it starts encountering * unqueued dirty pages or cycling through the LRU too quickly. */ - if (!sc->hibernation_mode && !current_is_kswapd()) + if (!sc->hibernation_mode && !current_is_kswapd() && + current_may_throttle()) wait_iff_congested(zone, BLK_RW_ASYNC, HZ/10); trace_mm_vmscan_lru_shrink_inactive(zone->zone_pgdat->node_id, @@ -1633,6 +1660,7 @@ static void move_active_pages_to_lru(struct lruvec *lruvec, if (unlikely(PageCompound(page))) { spin_unlock_irq(&zone->lru_lock); + mem_cgroup_uncharge(page); (*get_compound_page_dtor(page))(page); spin_lock_irq(&zone->lru_lock); } else @@ -1674,7 +1702,7 @@ static void shrink_active_list(unsigned long nr_to_scan, nr_taken = isolate_lru_pages(nr_to_scan, lruvec, &l_hold, &nr_scanned, sc, isolate_mode, lru); if (global_reclaim(sc)) - zone->pages_scanned += nr_scanned; + __mod_zone_page_state(zone, NR_PAGES_SCANNED, nr_scanned); reclaim_stat->recent_scanned[file] += nr_taken; @@ -1731,7 +1759,7 @@ static void shrink_active_list(unsigned long nr_to_scan, * Count referenced pages from currently used mappings as rotated, * even though only some of them are actually re-activated. This * helps balance scan pressure between file and anonymous pages in - * get_scan_ratio. + * get_scan_count. */ reclaim_stat->recent_rotated[file] += nr_rotated; @@ -1740,7 +1768,8 @@ static void shrink_active_list(unsigned long nr_to_scan, __mod_zone_page_state(zone, NR_ISOLATED_ANON + file, -nr_taken); spin_unlock_irq(&zone->lru_lock); - free_hot_cold_page_list(&l_hold, 1); + mem_cgroup_uncharge_list(&l_hold); + free_hot_cold_page_list(&l_hold, true); } #ifdef CONFIG_SWAP @@ -1830,13 +1859,6 @@ static unsigned long shrink_list(enum lru_list lru, unsigned long nr_to_scan, return shrink_inactive_list(nr_to_scan, lruvec, sc, lru); } -static int vmscan_swappiness(struct scan_control *sc) -{ - if (global_reclaim(sc)) - return vm_swappiness; - return mem_cgroup_swappiness(sc->target_mem_cgroup); -} - enum scan_balance { SCAN_EQUAL, SCAN_FRACT, @@ -1853,8 +1875,8 @@ enum scan_balance { * nr[0] = anon inactive pages to scan; nr[1] = anon active pages to scan * nr[2] = file inactive pages to scan; nr[3] = file active pages to scan */ -static void get_scan_count(struct lruvec *lruvec, struct scan_control *sc, - unsigned long *nr) +static void get_scan_count(struct lruvec *lruvec, int swappiness, + struct scan_control *sc, unsigned long *nr) { struct zone_reclaim_stat *reclaim_stat = &lruvec->reclaim_stat; u64 fraction[2]; @@ -1866,6 +1888,8 @@ static void get_scan_count(struct lruvec *lruvec, struct scan_control *sc, bool force_scan = false; unsigned long ap, fp; enum lru_list lru; + bool some_scanned; + int pass; /* * If the zone or memcg is small, nr[l] can be 0. This @@ -1895,7 +1919,7 @@ static void get_scan_count(struct lruvec *lruvec, struct scan_control *sc, * using the memory controller's swap limit feature would be * too expensive. */ - if (!global_reclaim(sc) && !vmscan_swappiness(sc)) { + if (!global_reclaim(sc) && !swappiness) { scan_balance = SCAN_FILE; goto out; } @@ -1905,16 +1929,11 @@ static void get_scan_count(struct lruvec *lruvec, struct scan_control *sc, * system is close to OOM, scan both anon and file equally * (unless the swappiness setting disagrees with swapping). */ - if (!sc->priority && vmscan_swappiness(sc)) { + if (!sc->priority && swappiness) { scan_balance = SCAN_EQUAL; goto out; } - anon = get_lru_size(lruvec, LRU_ACTIVE_ANON) + - get_lru_size(lruvec, LRU_INACTIVE_ANON); - file = get_lru_size(lruvec, LRU_ACTIVE_FILE) + - get_lru_size(lruvec, LRU_INACTIVE_FILE); - /* * Prevent the reclaimer from falling into the cache trap: as * cache pages start out inactive, every cache fault will tip @@ -1925,9 +1944,14 @@ static void get_scan_count(struct lruvec *lruvec, struct scan_control *sc, * anon pages. Try to detect this based on file LRU size. */ if (global_reclaim(sc)) { - unsigned long free = zone_page_state(zone, NR_FREE_PAGES); + unsigned long zonefile; + unsigned long zonefree; + + zonefree = zone_page_state(zone, NR_FREE_PAGES); + zonefile = zone_page_state(zone, NR_ACTIVE_FILE) + + zone_page_state(zone, NR_INACTIVE_FILE); - if (unlikely(file + free <= high_wmark_pages(zone))) { + if (unlikely(zonefile + zonefree <= high_wmark_pages(zone))) { scan_balance = SCAN_ANON; goto out; } @@ -1948,7 +1972,7 @@ static void get_scan_count(struct lruvec *lruvec, struct scan_control *sc, * With swappiness at 100, anonymous and file have the same priority. * This scanning priority is essentially the inverse of IO cost. */ - anon_prio = vmscan_swappiness(sc); + anon_prio = swappiness; file_prio = 200 - anon_prio; /* @@ -1962,6 +1986,12 @@ static void get_scan_count(struct lruvec *lruvec, struct scan_control *sc, * * anon in [0], file in [1] */ + + anon = get_lru_size(lruvec, LRU_ACTIVE_ANON) + + get_lru_size(lruvec, LRU_INACTIVE_ANON); + file = get_lru_size(lruvec, LRU_ACTIVE_FILE) + + get_lru_size(lruvec, LRU_INACTIVE_FILE); + spin_lock_irq(&zone->lru_lock); if (unlikely(reclaim_stat->recent_scanned[0] > anon / 4)) { reclaim_stat->recent_scanned[0] /= 2; @@ -1989,46 +2019,57 @@ static void get_scan_count(struct lruvec *lruvec, struct scan_control *sc, fraction[1] = fp; denominator = ap + fp + 1; out: - for_each_evictable_lru(lru) { - int file = is_file_lru(lru); - unsigned long size; - unsigned long scan; + some_scanned = false; + /* Only use force_scan on second pass. */ + for (pass = 0; !some_scanned && pass < 2; pass++) { + for_each_evictable_lru(lru) { + int file = is_file_lru(lru); + unsigned long size; + unsigned long scan; - size = get_lru_size(lruvec, lru); - scan = size >> sc->priority; + size = get_lru_size(lruvec, lru); + scan = size >> sc->priority; - if (!scan && force_scan) - scan = min(size, SWAP_CLUSTER_MAX); + if (!scan && pass && force_scan) + scan = min(size, SWAP_CLUSTER_MAX); - switch (scan_balance) { - case SCAN_EQUAL: - /* Scan lists relative to size */ - break; - case SCAN_FRACT: + switch (scan_balance) { + case SCAN_EQUAL: + /* Scan lists relative to size */ + break; + case SCAN_FRACT: + /* + * Scan types proportional to swappiness and + * their relative recent reclaim efficiency. + */ + scan = div64_u64(scan * fraction[file], + denominator); + break; + case SCAN_FILE: + case SCAN_ANON: + /* Scan one type exclusively */ + if ((scan_balance == SCAN_FILE) != file) + scan = 0; + break; + default: + /* Look ma, no brain */ + BUG(); + } + nr[lru] = scan; /* - * Scan types proportional to swappiness and - * their relative recent reclaim efficiency. + * Skip the second pass and don't force_scan, + * if we found something to scan. */ - scan = div64_u64(scan * fraction[file], denominator); - break; - case SCAN_FILE: - case SCAN_ANON: - /* Scan one type exclusively */ - if ((scan_balance == SCAN_FILE) != file) - scan = 0; - break; - default: - /* Look ma, no brain */ - BUG(); + some_scanned |= !!scan; } - nr[lru] = scan; } } /* * This is a basic per-zone page freer. Used by both kswapd and direct reclaim. */ -static void shrink_lruvec(struct lruvec *lruvec, struct scan_control *sc) +static void shrink_lruvec(struct lruvec *lruvec, int swappiness, + struct scan_control *sc) { unsigned long nr[NR_LRU_LISTS]; unsigned long targets[NR_LRU_LISTS]; @@ -2037,13 +2078,27 @@ static void shrink_lruvec(struct lruvec *lruvec, struct scan_control *sc) unsigned long nr_reclaimed = 0; unsigned long nr_to_reclaim = sc->nr_to_reclaim; struct blk_plug plug; - bool scan_adjusted = false; + bool scan_adjusted; - get_scan_count(lruvec, sc, nr); + get_scan_count(lruvec, swappiness, sc, nr); /* Record the original scan target for proportional adjustments later */ memcpy(targets, nr, sizeof(nr)); + /* + * Global reclaiming within direct reclaim at DEF_PRIORITY is a normal + * event that can occur when there is little memory pressure e.g. + * multiple streaming readers/writers. Hence, we do not abort scanning + * when the requested number of pages are reclaimed when scanning at + * DEF_PRIORITY on the assumption that the fact we are direct + * reclaiming implies that kswapd is not keeping up and it is best to + * do a batch of work at once. For memcg reclaim one check is made to + * abort proportional reclaim if either the file or anon lru has already + * dropped to zero at the first pass. + */ + scan_adjusted = (global_reclaim(sc) && !current_is_kswapd() && + sc->priority == DEF_PRIORITY); + blk_start_plug(&plug); while (nr[LRU_INACTIVE_ANON] || nr[LRU_ACTIVE_FILE] || nr[LRU_INACTIVE_FILE]) { @@ -2064,17 +2119,8 @@ static void shrink_lruvec(struct lruvec *lruvec, struct scan_control *sc) continue; /* - * For global direct reclaim, reclaim only the number of pages - * requested. Less care is taken to scan proportionally as it - * is more important to minimise direct reclaim stall latency - * than it is to properly age the LRU lists. - */ - if (global_reclaim(sc) && !current_is_kswapd()) - break; - - /* * For kswapd and memcg, reclaim at least the number of pages - * requested. Ensure that the anon and file LRUs shrink + * requested. Ensure that the anon and file LRUs are scanned * proportionally what was requested by get_scan_count(). We * stop reclaiming one LRU and reduce the amount scanning * proportional to the original scan target. @@ -2082,6 +2128,15 @@ static void shrink_lruvec(struct lruvec *lruvec, struct scan_control *sc) nr_file = nr[LRU_INACTIVE_FILE] + nr[LRU_ACTIVE_FILE]; nr_anon = nr[LRU_INACTIVE_ANON] + nr[LRU_ACTIVE_ANON]; + /* + * It's just vindictive to attack the larger once the smaller + * has gone to zero. And given the way we stop scanning the + * smaller below, this makes sure that we only make one nudge + * towards proportionality once we've got nr_to_reclaim. + */ + if (!nr_file || !nr_anon) + break; + if (nr_file > nr_anon) { unsigned long scan_target = targets[LRU_INACTIVE_ANON] + targets[LRU_ACTIVE_ANON] + 1; @@ -2203,9 +2258,10 @@ static inline bool should_continue_reclaim(struct zone *zone, } } -static void shrink_zone(struct zone *zone, struct scan_control *sc) +static bool shrink_zone(struct zone *zone, struct scan_control *sc) { unsigned long nr_reclaimed, nr_scanned; + bool reclaimable = false; do { struct mem_cgroup *root = sc->target_mem_cgroup; @@ -2221,10 +2277,12 @@ static void shrink_zone(struct zone *zone, struct scan_control *sc) memcg = mem_cgroup_iter(root, NULL, &reclaim); do { struct lruvec *lruvec; + int swappiness; lruvec = mem_cgroup_zone_lruvec(zone, memcg); + swappiness = mem_cgroup_swappiness(memcg); - shrink_lruvec(lruvec, sc); + shrink_lruvec(lruvec, swappiness, sc); /* * Direct reclaim and kswapd have to scan all memory @@ -2248,41 +2306,41 @@ static void shrink_zone(struct zone *zone, struct scan_control *sc) sc->nr_scanned - nr_scanned, sc->nr_reclaimed - nr_reclaimed); + if (sc->nr_reclaimed - nr_reclaimed) + reclaimable = true; + } while (should_continue_reclaim(zone, sc->nr_reclaimed - nr_reclaimed, sc->nr_scanned - nr_scanned, sc)); + + return reclaimable; } /* Returns true if compaction should go ahead for a high-order request */ -static inline bool compaction_ready(struct zone *zone, struct scan_control *sc) +static inline bool compaction_ready(struct zone *zone, int order) { unsigned long balance_gap, watermark; bool watermark_ok; - /* Do not consider compaction for orders reclaim is meant to satisfy */ - if (sc->order <= PAGE_ALLOC_COSTLY_ORDER) - return false; - /* * Compaction takes time to run and there are potentially other * callers using the pages just freed. Continue reclaiming until * there is a buffer of free pages available to give compaction * a reasonable chance of completing and allocating the page */ - balance_gap = min(low_wmark_pages(zone), - (zone->managed_pages + KSWAPD_ZONE_BALANCE_GAP_RATIO-1) / - KSWAPD_ZONE_BALANCE_GAP_RATIO); - watermark = high_wmark_pages(zone) + balance_gap + (2UL << sc->order); + balance_gap = min(low_wmark_pages(zone), DIV_ROUND_UP( + zone->managed_pages, KSWAPD_ZONE_BALANCE_GAP_RATIO)); + watermark = high_wmark_pages(zone) + balance_gap + (2UL << order); watermark_ok = zone_watermark_ok_safe(zone, 0, watermark, 0, 0); /* * If compaction is deferred, reclaim up to a point where * compaction will have a chance of success when re-enabled */ - if (compaction_deferred(zone, sc->order)) + if (compaction_deferred(zone, order)) return watermark_ok; /* If compaction is not ready to start, keep reclaiming */ - if (!compaction_suitable(zone, sc->order)) + if (!compaction_suitable(zone, order)) return false; return watermark_ok; @@ -2304,10 +2362,7 @@ static inline bool compaction_ready(struct zone *zone, struct scan_control *sc) * If a zone is deemed to be full of pinned pages then just give it a light * scan then give up on it. * - * This function returns true if a zone is being reclaimed for a costly - * high-order allocation and compaction is ready to begin. This indicates to - * the caller that it should consider retrying the allocation instead of - * further reclaim. + * Returns true if a zone was reclaimable. */ static bool shrink_zones(struct zonelist *zonelist, struct scan_control *sc) { @@ -2316,13 +2371,13 @@ static bool shrink_zones(struct zonelist *zonelist, struct scan_control *sc) unsigned long nr_soft_reclaimed; unsigned long nr_soft_scanned; unsigned long lru_pages = 0; - bool aborted_reclaim = false; struct reclaim_state *reclaim_state = current->reclaim_state; gfp_t orig_mask; struct shrink_control shrink = { .gfp_mask = sc->gfp_mask, }; enum zone_type requested_highidx = gfp_zone(sc->gfp_mask); + bool reclaimable = false; /* * If the number of buffer_heads in the machine exceeds the maximum @@ -2353,22 +2408,24 @@ static bool shrink_zones(struct zonelist *zonelist, struct scan_control *sc) if (sc->priority != DEF_PRIORITY && !zone_reclaimable(zone)) continue; /* Let kswapd poll it */ - if (IS_ENABLED(CONFIG_COMPACTION)) { - /* - * If we already have plenty of memory free for - * compaction in this zone, don't free any more. - * Even though compaction is invoked for any - * non-zero order, only frequent costly order - * reclamation is disruptive enough to become a - * noticeable problem, like transparent huge - * page allocations. - */ - if ((zonelist_zone_idx(z) <= requested_highidx) - && compaction_ready(zone, sc)) { - aborted_reclaim = true; - continue; - } + + /* + * If we already have plenty of memory free for + * compaction in this zone, don't free any more. + * Even though compaction is invoked for any + * non-zero order, only frequent costly order + * reclamation is disruptive enough to become a + * noticeable problem, like transparent huge + * page allocations. + */ + if (IS_ENABLED(CONFIG_COMPACTION) && + sc->order > PAGE_ALLOC_COSTLY_ORDER && + zonelist_zone_idx(z) <= requested_highidx && + compaction_ready(zone, sc->order)) { + sc->compaction_ready = true; + continue; } + /* * This steals pages from memory cgroups over softlimit * and returns the number of reclaimed pages and @@ -2381,10 +2438,17 @@ static bool shrink_zones(struct zonelist *zonelist, struct scan_control *sc) &nr_soft_scanned); sc->nr_reclaimed += nr_soft_reclaimed; sc->nr_scanned += nr_soft_scanned; + if (nr_soft_reclaimed) + reclaimable = true; /* need some check for avoid more shrink_zone() */ } - shrink_zone(zone, sc); + if (shrink_zone(zone, sc)) + reclaimable = true; + + if (global_reclaim(sc) && + !reclaimable && zone_reclaimable(zone)) + reclaimable = true; } /* @@ -2407,27 +2471,7 @@ static bool shrink_zones(struct zonelist *zonelist, struct scan_control *sc) */ sc->gfp_mask = orig_mask; - return aborted_reclaim; -} - -/* All zones in zonelist are unreclaimable? */ -static bool all_unreclaimable(struct zonelist *zonelist, - struct scan_control *sc) -{ - struct zoneref *z; - struct zone *zone; - - for_each_zone_zonelist_nodemask(zone, z, zonelist, - gfp_zone(sc->gfp_mask), sc->nodemask) { - if (!populated_zone(zone)) - continue; - if (!cpuset_zone_allowed_hardwall(zone, GFP_KERNEL)) - continue; - if (zone_reclaimable(zone)) - return false; - } - - return true; + return reclaimable; } /* @@ -2451,7 +2495,7 @@ static unsigned long do_try_to_free_pages(struct zonelist *zonelist, { unsigned long total_scanned = 0; unsigned long writeback_threshold; - bool aborted_reclaim; + bool zones_reclaimable; delayacct_freepages_start(); @@ -2462,11 +2506,14 @@ static unsigned long do_try_to_free_pages(struct zonelist *zonelist, vmpressure_prio(sc->gfp_mask, sc->target_mem_cgroup, sc->priority); sc->nr_scanned = 0; - aborted_reclaim = shrink_zones(zonelist, sc); + zones_reclaimable = shrink_zones(zonelist, sc); total_scanned += sc->nr_scanned; if (sc->nr_reclaimed >= sc->nr_to_reclaim) - goto out; + break; + + if (sc->compaction_ready) + break; /* * If we're getting trouble reclaiming, start doing @@ -2488,28 +2535,19 @@ static unsigned long do_try_to_free_pages(struct zonelist *zonelist, WB_REASON_TRY_TO_FREE_PAGES); sc->may_writepage = 1; } - } while (--sc->priority >= 0 && !aborted_reclaim); + } while (--sc->priority >= 0); -out: delayacct_freepages_end(); if (sc->nr_reclaimed) return sc->nr_reclaimed; - /* - * As hibernation is going on, kswapd is freezed so that it can't mark - * the zone into all_unreclaimable. Thus bypassing all_unreclaimable - * check. - */ - if (oom_killer_disabled) - return 0; - /* Aborted reclaim to try compaction? don't OOM, then */ - if (aborted_reclaim) + if (sc->compaction_ready) return 1; - /* top priority shrink_zones still had more to do? don't OOM, then */ - if (global_reclaim(sc) && !all_unreclaimable(zonelist, sc)) + /* Any of the zones still reclaimable? Don't OOM. */ + if (zones_reclaimable) return 1; return 0; @@ -2525,10 +2563,17 @@ static bool pfmemalloc_watermark_ok(pg_data_t *pgdat) for (i = 0; i <= ZONE_NORMAL; i++) { zone = &pgdat->node_zones[i]; + if (!populated_zone(zone)) + continue; + pfmemalloc_reserve += min_wmark_pages(zone); free_pages += zone_page_state(zone, NR_FREE_PAGES); } + /* If there are no reserves (unexpected config) then do not throttle */ + if (!pfmemalloc_reserve) + return true; + wmark_ok = free_pages > pfmemalloc_reserve / 2; /* kswapd must be awake if processes are being throttled */ @@ -2553,9 +2598,9 @@ static bool pfmemalloc_watermark_ok(pg_data_t *pgdat) static bool throttle_direct_reclaim(gfp_t gfp_mask, struct zonelist *zonelist, nodemask_t *nodemask) { + struct zoneref *z; struct zone *zone; - int high_zoneidx = gfp_zone(gfp_mask); - pg_data_t *pgdat; + pg_data_t *pgdat = NULL; /* * Kernel threads should not be throttled as they may be indirectly @@ -2574,10 +2619,34 @@ static bool throttle_direct_reclaim(gfp_t gfp_mask, struct zonelist *zonelist, if (fatal_signal_pending(current)) goto out; - /* Check if the pfmemalloc reserves are ok */ - first_zones_zonelist(zonelist, high_zoneidx, NULL, &zone); - pgdat = zone->zone_pgdat; - if (pfmemalloc_watermark_ok(pgdat)) + /* + * Check if the pfmemalloc reserves are ok by finding the first node + * with a usable ZONE_NORMAL or lower zone. The expectation is that + * GFP_KERNEL will be required for allocating network buffers when + * swapping over the network so ZONE_HIGHMEM is unusable. + * + * Throttling is based on the first usable node and throttled processes + * wait on a queue until kswapd makes progress and wakes them. There + * is an affinity then between processes waking up and where reclaim + * progress has been made assuming the process wakes on the same node. + * More importantly, processes running on remote nodes will not compete + * for remote pfmemalloc reserves and processes on different nodes + * should make reasonable progress. + */ + for_each_zone_zonelist_nodemask(zone, z, zonelist, + gfp_mask, nodemask) { + if (zone_idx(zone) > ZONE_NORMAL) + continue; + + /* Throttle based on the first usable node */ + pgdat = zone->zone_pgdat; + if (pfmemalloc_watermark_ok(pgdat)) + goto out; + break; + } + + /* If no zone was usable by the allocation flags then do not throttle */ + if (!pgdat) goto out; /* Account for the throttling */ @@ -2615,15 +2684,14 @@ unsigned long try_to_free_pages(struct zonelist *zonelist, int order, { unsigned long nr_reclaimed; struct scan_control sc = { + .nr_to_reclaim = SWAP_CLUSTER_MAX, .gfp_mask = (gfp_mask = memalloc_noio_flags(gfp_mask)), + .order = order, + .nodemask = nodemask, + .priority = DEF_PRIORITY, .may_writepage = !laptop_mode, - .nr_to_reclaim = SWAP_CLUSTER_MAX, .may_unmap = 1, .may_swap = 1, - .order = order, - .priority = DEF_PRIORITY, - .target_mem_cgroup = NULL, - .nodemask = nodemask, }; /* @@ -2653,16 +2721,14 @@ unsigned long mem_cgroup_shrink_node_zone(struct mem_cgroup *memcg, unsigned long *nr_scanned) { struct scan_control sc = { - .nr_scanned = 0, .nr_to_reclaim = SWAP_CLUSTER_MAX, + .target_mem_cgroup = memcg, .may_writepage = !laptop_mode, .may_unmap = 1, .may_swap = !noswap, - .order = 0, - .priority = 0, - .target_mem_cgroup = memcg, }; struct lruvec *lruvec = mem_cgroup_zone_lruvec(zone, memcg); + int swappiness = mem_cgroup_swappiness(memcg); sc.gfp_mask = (gfp_mask & GFP_RECLAIM_MASK) | (GFP_HIGHUSER_MOVABLE & ~GFP_RECLAIM_MASK); @@ -2678,7 +2744,7 @@ unsigned long mem_cgroup_shrink_node_zone(struct mem_cgroup *memcg, * will pick up pages from other mem cgroup's as well. We hack * the priority and make it zero. */ - shrink_lruvec(lruvec, &sc); + shrink_lruvec(lruvec, swappiness, &sc); trace_mm_vmscan_memcg_softlimit_reclaim_end(sc.nr_reclaimed); @@ -2694,16 +2760,14 @@ unsigned long try_to_free_mem_cgroup_pages(struct mem_cgroup *memcg, unsigned long nr_reclaimed; int nid; struct scan_control sc = { - .may_writepage = !laptop_mode, - .may_unmap = 1, - .may_swap = !noswap, .nr_to_reclaim = SWAP_CLUSTER_MAX, - .order = 0, - .priority = DEF_PRIORITY, - .target_mem_cgroup = memcg, - .nodemask = NULL, /* we don't care the placement */ .gfp_mask = (gfp_mask & GFP_RECLAIM_MASK) | (GFP_HIGHUSER_MOVABLE & ~GFP_RECLAIM_MASK), + .target_mem_cgroup = memcg, + .priority = DEF_PRIORITY, + .may_writepage = !laptop_mode, + .may_unmap = 1, + .may_swap = !noswap, }; /* @@ -2891,9 +2955,8 @@ static bool kswapd_shrink_zone(struct zone *zone, * high wmark plus a "gap" where the gap is either the low * watermark or 1% of the zone, whichever is smaller. */ - balance_gap = min(low_wmark_pages(zone), - (zone->managed_pages + KSWAPD_ZONE_BALANCE_GAP_RATIO-1) / - KSWAPD_ZONE_BALANCE_GAP_RATIO); + balance_gap = min(low_wmark_pages(zone), DIV_ROUND_UP( + zone->managed_pages, KSWAPD_ZONE_BALANCE_GAP_RATIO)); /* * If there is no low memory pressure or the zone is balanced then no @@ -2962,12 +3025,11 @@ static unsigned long balance_pgdat(pg_data_t *pgdat, int order, unsigned long nr_soft_scanned; struct scan_control sc = { .gfp_mask = GFP_KERNEL, + .order = order, .priority = DEF_PRIORITY, + .may_writepage = !laptop_mode, .may_unmap = 1, .may_swap = 1, - .may_writepage = !laptop_mode, - .order = order, - .target_mem_cgroup = NULL, }; count_vm_event(PAGEOUTRUN); @@ -3302,7 +3364,10 @@ static int kswapd(void *p) } } + tsk->flags &= ~(PF_MEMALLOC | PF_SWAPWRITE | PF_KSWAPD); current->reclaim_state = NULL; + lockdep_clear_current_reclaim_state(); + return 0; } @@ -3345,14 +3410,13 @@ unsigned long shrink_all_memory(unsigned long nr_to_reclaim) { struct reclaim_state reclaim_state; struct scan_control sc = { + .nr_to_reclaim = nr_to_reclaim, .gfp_mask = GFP_HIGHUSER_MOVABLE, - .may_swap = 1, - .may_unmap = 1, + .priority = DEF_PRIORITY, .may_writepage = 1, - .nr_to_reclaim = nr_to_reclaim, + .may_unmap = 1, + .may_swap = 1, .hibernation_mode = 1, - .order = 0, - .priority = DEF_PRIORITY, }; struct zonelist *zonelist = node_zonelist(numa_node_id(), sc.gfp_mask); struct task_struct *p = current; @@ -3422,7 +3486,7 @@ int kswapd_run(int nid) /* * Called by memory hotplug when all memory in a node is offlined. Caller must - * hold lock_memory_hotplug(). + * hold mem_hotplug_begin/end(). */ void kswapd_stop(int nid) { @@ -3532,13 +3596,13 @@ static int __zone_reclaim(struct zone *zone, gfp_t gfp_mask, unsigned int order) struct task_struct *p = current; struct reclaim_state reclaim_state; struct scan_control sc = { - .may_writepage = !!(zone_reclaim_mode & RECLAIM_WRITE), - .may_unmap = !!(zone_reclaim_mode & RECLAIM_SWAP), - .may_swap = 1, .nr_to_reclaim = max(nr_pages, SWAP_CLUSTER_MAX), .gfp_mask = (gfp_mask = memalloc_noio_flags(gfp_mask)), .order = order, .priority = ZONE_RECLAIM_PRIORITY, + .may_writepage = !!(zone_reclaim_mode & RECLAIM_WRITE), + .may_unmap = !!(zone_reclaim_mode & RECLAIM_SWAP), + .may_swap = 1, }; struct shrink_control shrink = { .gfp_mask = sc.gfp_mask, |