summaryrefslogtreecommitdiffstats
path: root/mm/vmscan.c
diff options
context:
space:
mode:
Diffstat (limited to 'mm/vmscan.c')
-rw-r--r--mm/vmscan.c490
1 files changed, 277 insertions, 213 deletions
diff --git a/mm/vmscan.c b/mm/vmscan.c
index 32c661d66a45..2836b5373b2e 100644
--- a/mm/vmscan.c
+++ b/mm/vmscan.c
@@ -11,6 +11,8 @@
* Multiqueue VM started 5.8.00, Rik van Riel.
*/
+#define pr_fmt(fmt) KBUILD_MODNAME ": " fmt
+
#include <linux/mm.h>
#include <linux/module.h>
#include <linux/gfp.h>
@@ -43,6 +45,7 @@
#include <linux/sysctl.h>
#include <linux/oom.h>
#include <linux/prefetch.h>
+#include <linux/printk.h>
#include <asm/tlbflush.h>
#include <asm/div64.h>
@@ -56,32 +59,20 @@
#include <trace/events/vmscan.h>
struct scan_control {
- /* Incremented by the number of inactive pages that were scanned */
- unsigned long nr_scanned;
-
- /* Number of pages freed so far during a call to shrink_zones() */
- unsigned long nr_reclaimed;
-
/* How many pages shrink_list() should reclaim */
unsigned long nr_to_reclaim;
- unsigned long hibernation_mode;
-
/* This context's GFP mask */
gfp_t gfp_mask;
- int may_writepage;
-
- /* Can mapped pages be reclaimed? */
- int may_unmap;
-
- /* Can pages be swapped as part of reclaim? */
- int may_swap;
-
+ /* Allocation order */
int order;
- /* Scan (total_size >> priority) pages at once */
- int priority;
+ /*
+ * Nodemask of nodes allowed by the caller. If NULL, all nodes
+ * are scanned.
+ */
+ nodemask_t *nodemask;
/*
* The memory cgroup that hit its limit and as a result is the
@@ -89,11 +80,27 @@ struct scan_control {
*/
struct mem_cgroup *target_mem_cgroup;
- /*
- * Nodemask of nodes allowed by the caller. If NULL, all nodes
- * are scanned.
- */
- nodemask_t *nodemask;
+ /* Scan (total_size >> priority) pages at once */
+ int priority;
+
+ unsigned int may_writepage:1;
+
+ /* Can mapped pages be reclaimed? */
+ unsigned int may_unmap:1;
+
+ /* Can pages be swapped as part of reclaim? */
+ unsigned int may_swap:1;
+
+ unsigned int hibernation_mode:1;
+
+ /* One of the zones is ready for compaction */
+ unsigned int compaction_ready:1;
+
+ /* Incremented by the number of inactive pages that were scanned */
+ unsigned long nr_scanned;
+
+ /* Number of pages freed so far during a call to shrink_zones() */
+ unsigned long nr_reclaimed;
};
#define lru_to_page(_head) (list_entry((_head)->prev, struct page, lru))
@@ -130,7 +137,11 @@ struct scan_control {
* From 0 .. 100. Higher means more swappy.
*/
int vm_swappiness = 60;
-unsigned long vm_total_pages; /* The total number of pages which the VM controls */
+/*
+ * The total number of pages which are beyond the high watermark within all
+ * zones.
+ */
+unsigned long vm_total_pages;
static LIST_HEAD(shrinker_list);
static DECLARE_RWSEM(shrinker_rwsem);
@@ -163,7 +174,8 @@ static unsigned long zone_reclaimable_pages(struct zone *zone)
bool zone_reclaimable(struct zone *zone)
{
- return zone->pages_scanned < zone_reclaimable_pages(zone) * 6;
+ return zone_page_state(zone, NR_PAGES_SCANNED) <
+ zone_reclaimable_pages(zone) * 6;
}
static unsigned long get_lru_size(struct lruvec *lruvec, enum lru_list lru)
@@ -324,7 +336,7 @@ shrink_slab_node(struct shrink_control *shrinkctl, struct shrinker *shrinker,
else
new_nr = atomic_long_read(&shrinker->nr_deferred[nid]);
- trace_mm_shrink_slab_end(shrinker, freed, nr, new_nr);
+ trace_mm_shrink_slab_end(shrinker, nid, freed, nr, new_nr, total_scan);
return freed;
}
@@ -458,7 +470,7 @@ static pageout_t pageout(struct page *page, struct address_space *mapping,
* stalls if we need to run get_block(). We could test
* PagePrivate for that.
*
- * If this process is currently in __generic_file_aio_write() against
+ * If this process is currently in __generic_file_write_iter() against
* this page's queue, we can perform writeback even if that
* will block.
*
@@ -477,7 +489,7 @@ static pageout_t pageout(struct page *page, struct address_space *mapping,
if (page_has_private(page)) {
if (try_to_free_buffers(page)) {
ClearPageDirty(page);
- printk("%s: orphaned page\n", __func__);
+ pr_info("%s: orphaned page\n", __func__);
return PAGE_CLEAN;
}
}
@@ -565,9 +577,10 @@ static int __remove_mapping(struct address_space *mapping, struct page *page,
if (PageSwapCache(page)) {
swp_entry_t swap = { .val = page_private(page) };
+ mem_cgroup_swapout(page, swap);
__delete_from_swap_cache(page);
spin_unlock_irq(&mapping->tree_lock);
- swapcache_free(swap, page);
+ swapcache_free(swap);
} else {
void (*freepage)(struct page *);
void *shadow = NULL;
@@ -588,7 +601,6 @@ static int __remove_mapping(struct address_space *mapping, struct page *page,
shadow = workingset_eviction(mapping, page);
__delete_from_page_cache(page, shadow);
spin_unlock_irq(&mapping->tree_lock);
- mem_cgroup_uncharge_cache_page(page);
if (freepage != NULL)
freepage(page);
@@ -810,7 +822,6 @@ static unsigned long shrink_page_list(struct list_head *page_list,
cond_resched();
- mem_cgroup_uncharge_start();
while (!list_empty(page_list)) {
struct address_space *mapping;
struct page *page;
@@ -1121,11 +1132,12 @@ keep:
VM_BUG_ON_PAGE(PageLRU(page) || PageUnevictable(page), page);
}
- free_hot_cold_page_list(&free_pages, 1);
+ mem_cgroup_uncharge_list(&free_pages);
+ free_hot_cold_page_list(&free_pages, true);
list_splice(&ret_pages, page_list);
count_vm_events(PGACTIVATE, pgactivate);
- mem_cgroup_uncharge_end();
+
*ret_nr_dirty += nr_dirty;
*ret_nr_congested += nr_congested;
*ret_nr_unqueued_dirty += nr_unqueued_dirty;
@@ -1425,6 +1437,7 @@ putback_inactive_pages(struct lruvec *lruvec, struct list_head *page_list)
if (unlikely(PageCompound(page))) {
spin_unlock_irq(&zone->lru_lock);
+ mem_cgroup_uncharge(page);
(*get_compound_page_dtor(page))(page);
spin_lock_irq(&zone->lru_lock);
} else
@@ -1439,6 +1452,19 @@ putback_inactive_pages(struct lruvec *lruvec, struct list_head *page_list)
}
/*
+ * If a kernel thread (such as nfsd for loop-back mounts) services
+ * a backing device by writing to the page cache it sets PF_LESS_THROTTLE.
+ * In that case we should only throttle if the backing device it is
+ * writing to is congested. In other cases it is safe to throttle.
+ */
+static int current_may_throttle(void)
+{
+ return !(current->flags & PF_LESS_THROTTLE) ||
+ current->backing_dev_info == NULL ||
+ bdi_write_congested(current->backing_dev_info);
+}
+
+/*
* shrink_inactive_list() is a helper for shrink_zone(). It returns the number
* of reclaimed pages
*/
@@ -1484,7 +1510,7 @@ shrink_inactive_list(unsigned long nr_to_scan, struct lruvec *lruvec,
__mod_zone_page_state(zone, NR_ISOLATED_ANON + file, nr_taken);
if (global_reclaim(sc)) {
- zone->pages_scanned += nr_scanned;
+ __mod_zone_page_state(zone, NR_PAGES_SCANNED, nr_scanned);
if (current_is_kswapd())
__count_zone_vm_events(PGSCAN_KSWAPD, zone, nr_scanned);
else
@@ -1519,7 +1545,8 @@ shrink_inactive_list(unsigned long nr_to_scan, struct lruvec *lruvec,
spin_unlock_irq(&zone->lru_lock);
- free_hot_cold_page_list(&page_list, 1);
+ mem_cgroup_uncharge_list(&page_list);
+ free_hot_cold_page_list(&page_list, true);
/*
* If reclaim is isolating dirty pages under writeback, it implies
@@ -1554,19 +1581,18 @@ shrink_inactive_list(unsigned long nr_to_scan, struct lruvec *lruvec,
* If dirty pages are scanned that are not queued for IO, it
* implies that flushers are not keeping up. In this case, flag
* the zone ZONE_TAIL_LRU_DIRTY and kswapd will start writing
- * pages from reclaim context. It will forcibly stall in the
- * next check.
+ * pages from reclaim context.
*/
if (nr_unqueued_dirty == nr_taken)
zone_set_flag(zone, ZONE_TAIL_LRU_DIRTY);
/*
- * In addition, if kswapd scans pages marked marked for
- * immediate reclaim and under writeback (nr_immediate), it
- * implies that pages are cycling through the LRU faster than
+ * If kswapd scans pages marked marked for immediate
+ * reclaim and under writeback (nr_immediate), it implies
+ * that pages are cycling through the LRU faster than
* they are written so also forcibly stall.
*/
- if (nr_unqueued_dirty == nr_taken || nr_immediate)
+ if (nr_immediate && current_may_throttle())
congestion_wait(BLK_RW_ASYNC, HZ/10);
}
@@ -1575,7 +1601,8 @@ shrink_inactive_list(unsigned long nr_to_scan, struct lruvec *lruvec,
* is congested. Allow kswapd to continue until it starts encountering
* unqueued dirty pages or cycling through the LRU too quickly.
*/
- if (!sc->hibernation_mode && !current_is_kswapd())
+ if (!sc->hibernation_mode && !current_is_kswapd() &&
+ current_may_throttle())
wait_iff_congested(zone, BLK_RW_ASYNC, HZ/10);
trace_mm_vmscan_lru_shrink_inactive(zone->zone_pgdat->node_id,
@@ -1633,6 +1660,7 @@ static void move_active_pages_to_lru(struct lruvec *lruvec,
if (unlikely(PageCompound(page))) {
spin_unlock_irq(&zone->lru_lock);
+ mem_cgroup_uncharge(page);
(*get_compound_page_dtor(page))(page);
spin_lock_irq(&zone->lru_lock);
} else
@@ -1674,7 +1702,7 @@ static void shrink_active_list(unsigned long nr_to_scan,
nr_taken = isolate_lru_pages(nr_to_scan, lruvec, &l_hold,
&nr_scanned, sc, isolate_mode, lru);
if (global_reclaim(sc))
- zone->pages_scanned += nr_scanned;
+ __mod_zone_page_state(zone, NR_PAGES_SCANNED, nr_scanned);
reclaim_stat->recent_scanned[file] += nr_taken;
@@ -1731,7 +1759,7 @@ static void shrink_active_list(unsigned long nr_to_scan,
* Count referenced pages from currently used mappings as rotated,
* even though only some of them are actually re-activated. This
* helps balance scan pressure between file and anonymous pages in
- * get_scan_ratio.
+ * get_scan_count.
*/
reclaim_stat->recent_rotated[file] += nr_rotated;
@@ -1740,7 +1768,8 @@ static void shrink_active_list(unsigned long nr_to_scan,
__mod_zone_page_state(zone, NR_ISOLATED_ANON + file, -nr_taken);
spin_unlock_irq(&zone->lru_lock);
- free_hot_cold_page_list(&l_hold, 1);
+ mem_cgroup_uncharge_list(&l_hold);
+ free_hot_cold_page_list(&l_hold, true);
}
#ifdef CONFIG_SWAP
@@ -1830,13 +1859,6 @@ static unsigned long shrink_list(enum lru_list lru, unsigned long nr_to_scan,
return shrink_inactive_list(nr_to_scan, lruvec, sc, lru);
}
-static int vmscan_swappiness(struct scan_control *sc)
-{
- if (global_reclaim(sc))
- return vm_swappiness;
- return mem_cgroup_swappiness(sc->target_mem_cgroup);
-}
-
enum scan_balance {
SCAN_EQUAL,
SCAN_FRACT,
@@ -1853,8 +1875,8 @@ enum scan_balance {
* nr[0] = anon inactive pages to scan; nr[1] = anon active pages to scan
* nr[2] = file inactive pages to scan; nr[3] = file active pages to scan
*/
-static void get_scan_count(struct lruvec *lruvec, struct scan_control *sc,
- unsigned long *nr)
+static void get_scan_count(struct lruvec *lruvec, int swappiness,
+ struct scan_control *sc, unsigned long *nr)
{
struct zone_reclaim_stat *reclaim_stat = &lruvec->reclaim_stat;
u64 fraction[2];
@@ -1866,6 +1888,8 @@ static void get_scan_count(struct lruvec *lruvec, struct scan_control *sc,
bool force_scan = false;
unsigned long ap, fp;
enum lru_list lru;
+ bool some_scanned;
+ int pass;
/*
* If the zone or memcg is small, nr[l] can be 0. This
@@ -1895,7 +1919,7 @@ static void get_scan_count(struct lruvec *lruvec, struct scan_control *sc,
* using the memory controller's swap limit feature would be
* too expensive.
*/
- if (!global_reclaim(sc) && !vmscan_swappiness(sc)) {
+ if (!global_reclaim(sc) && !swappiness) {
scan_balance = SCAN_FILE;
goto out;
}
@@ -1905,16 +1929,11 @@ static void get_scan_count(struct lruvec *lruvec, struct scan_control *sc,
* system is close to OOM, scan both anon and file equally
* (unless the swappiness setting disagrees with swapping).
*/
- if (!sc->priority && vmscan_swappiness(sc)) {
+ if (!sc->priority && swappiness) {
scan_balance = SCAN_EQUAL;
goto out;
}
- anon = get_lru_size(lruvec, LRU_ACTIVE_ANON) +
- get_lru_size(lruvec, LRU_INACTIVE_ANON);
- file = get_lru_size(lruvec, LRU_ACTIVE_FILE) +
- get_lru_size(lruvec, LRU_INACTIVE_FILE);
-
/*
* Prevent the reclaimer from falling into the cache trap: as
* cache pages start out inactive, every cache fault will tip
@@ -1925,9 +1944,14 @@ static void get_scan_count(struct lruvec *lruvec, struct scan_control *sc,
* anon pages. Try to detect this based on file LRU size.
*/
if (global_reclaim(sc)) {
- unsigned long free = zone_page_state(zone, NR_FREE_PAGES);
+ unsigned long zonefile;
+ unsigned long zonefree;
+
+ zonefree = zone_page_state(zone, NR_FREE_PAGES);
+ zonefile = zone_page_state(zone, NR_ACTIVE_FILE) +
+ zone_page_state(zone, NR_INACTIVE_FILE);
- if (unlikely(file + free <= high_wmark_pages(zone))) {
+ if (unlikely(zonefile + zonefree <= high_wmark_pages(zone))) {
scan_balance = SCAN_ANON;
goto out;
}
@@ -1948,7 +1972,7 @@ static void get_scan_count(struct lruvec *lruvec, struct scan_control *sc,
* With swappiness at 100, anonymous and file have the same priority.
* This scanning priority is essentially the inverse of IO cost.
*/
- anon_prio = vmscan_swappiness(sc);
+ anon_prio = swappiness;
file_prio = 200 - anon_prio;
/*
@@ -1962,6 +1986,12 @@ static void get_scan_count(struct lruvec *lruvec, struct scan_control *sc,
*
* anon in [0], file in [1]
*/
+
+ anon = get_lru_size(lruvec, LRU_ACTIVE_ANON) +
+ get_lru_size(lruvec, LRU_INACTIVE_ANON);
+ file = get_lru_size(lruvec, LRU_ACTIVE_FILE) +
+ get_lru_size(lruvec, LRU_INACTIVE_FILE);
+
spin_lock_irq(&zone->lru_lock);
if (unlikely(reclaim_stat->recent_scanned[0] > anon / 4)) {
reclaim_stat->recent_scanned[0] /= 2;
@@ -1989,46 +2019,57 @@ static void get_scan_count(struct lruvec *lruvec, struct scan_control *sc,
fraction[1] = fp;
denominator = ap + fp + 1;
out:
- for_each_evictable_lru(lru) {
- int file = is_file_lru(lru);
- unsigned long size;
- unsigned long scan;
+ some_scanned = false;
+ /* Only use force_scan on second pass. */
+ for (pass = 0; !some_scanned && pass < 2; pass++) {
+ for_each_evictable_lru(lru) {
+ int file = is_file_lru(lru);
+ unsigned long size;
+ unsigned long scan;
- size = get_lru_size(lruvec, lru);
- scan = size >> sc->priority;
+ size = get_lru_size(lruvec, lru);
+ scan = size >> sc->priority;
- if (!scan && force_scan)
- scan = min(size, SWAP_CLUSTER_MAX);
+ if (!scan && pass && force_scan)
+ scan = min(size, SWAP_CLUSTER_MAX);
- switch (scan_balance) {
- case SCAN_EQUAL:
- /* Scan lists relative to size */
- break;
- case SCAN_FRACT:
+ switch (scan_balance) {
+ case SCAN_EQUAL:
+ /* Scan lists relative to size */
+ break;
+ case SCAN_FRACT:
+ /*
+ * Scan types proportional to swappiness and
+ * their relative recent reclaim efficiency.
+ */
+ scan = div64_u64(scan * fraction[file],
+ denominator);
+ break;
+ case SCAN_FILE:
+ case SCAN_ANON:
+ /* Scan one type exclusively */
+ if ((scan_balance == SCAN_FILE) != file)
+ scan = 0;
+ break;
+ default:
+ /* Look ma, no brain */
+ BUG();
+ }
+ nr[lru] = scan;
/*
- * Scan types proportional to swappiness and
- * their relative recent reclaim efficiency.
+ * Skip the second pass and don't force_scan,
+ * if we found something to scan.
*/
- scan = div64_u64(scan * fraction[file], denominator);
- break;
- case SCAN_FILE:
- case SCAN_ANON:
- /* Scan one type exclusively */
- if ((scan_balance == SCAN_FILE) != file)
- scan = 0;
- break;
- default:
- /* Look ma, no brain */
- BUG();
+ some_scanned |= !!scan;
}
- nr[lru] = scan;
}
}
/*
* This is a basic per-zone page freer. Used by both kswapd and direct reclaim.
*/
-static void shrink_lruvec(struct lruvec *lruvec, struct scan_control *sc)
+static void shrink_lruvec(struct lruvec *lruvec, int swappiness,
+ struct scan_control *sc)
{
unsigned long nr[NR_LRU_LISTS];
unsigned long targets[NR_LRU_LISTS];
@@ -2037,13 +2078,27 @@ static void shrink_lruvec(struct lruvec *lruvec, struct scan_control *sc)
unsigned long nr_reclaimed = 0;
unsigned long nr_to_reclaim = sc->nr_to_reclaim;
struct blk_plug plug;
- bool scan_adjusted = false;
+ bool scan_adjusted;
- get_scan_count(lruvec, sc, nr);
+ get_scan_count(lruvec, swappiness, sc, nr);
/* Record the original scan target for proportional adjustments later */
memcpy(targets, nr, sizeof(nr));
+ /*
+ * Global reclaiming within direct reclaim at DEF_PRIORITY is a normal
+ * event that can occur when there is little memory pressure e.g.
+ * multiple streaming readers/writers. Hence, we do not abort scanning
+ * when the requested number of pages are reclaimed when scanning at
+ * DEF_PRIORITY on the assumption that the fact we are direct
+ * reclaiming implies that kswapd is not keeping up and it is best to
+ * do a batch of work at once. For memcg reclaim one check is made to
+ * abort proportional reclaim if either the file or anon lru has already
+ * dropped to zero at the first pass.
+ */
+ scan_adjusted = (global_reclaim(sc) && !current_is_kswapd() &&
+ sc->priority == DEF_PRIORITY);
+
blk_start_plug(&plug);
while (nr[LRU_INACTIVE_ANON] || nr[LRU_ACTIVE_FILE] ||
nr[LRU_INACTIVE_FILE]) {
@@ -2064,17 +2119,8 @@ static void shrink_lruvec(struct lruvec *lruvec, struct scan_control *sc)
continue;
/*
- * For global direct reclaim, reclaim only the number of pages
- * requested. Less care is taken to scan proportionally as it
- * is more important to minimise direct reclaim stall latency
- * than it is to properly age the LRU lists.
- */
- if (global_reclaim(sc) && !current_is_kswapd())
- break;
-
- /*
* For kswapd and memcg, reclaim at least the number of pages
- * requested. Ensure that the anon and file LRUs shrink
+ * requested. Ensure that the anon and file LRUs are scanned
* proportionally what was requested by get_scan_count(). We
* stop reclaiming one LRU and reduce the amount scanning
* proportional to the original scan target.
@@ -2082,6 +2128,15 @@ static void shrink_lruvec(struct lruvec *lruvec, struct scan_control *sc)
nr_file = nr[LRU_INACTIVE_FILE] + nr[LRU_ACTIVE_FILE];
nr_anon = nr[LRU_INACTIVE_ANON] + nr[LRU_ACTIVE_ANON];
+ /*
+ * It's just vindictive to attack the larger once the smaller
+ * has gone to zero. And given the way we stop scanning the
+ * smaller below, this makes sure that we only make one nudge
+ * towards proportionality once we've got nr_to_reclaim.
+ */
+ if (!nr_file || !nr_anon)
+ break;
+
if (nr_file > nr_anon) {
unsigned long scan_target = targets[LRU_INACTIVE_ANON] +
targets[LRU_ACTIVE_ANON] + 1;
@@ -2203,9 +2258,10 @@ static inline bool should_continue_reclaim(struct zone *zone,
}
}
-static void shrink_zone(struct zone *zone, struct scan_control *sc)
+static bool shrink_zone(struct zone *zone, struct scan_control *sc)
{
unsigned long nr_reclaimed, nr_scanned;
+ bool reclaimable = false;
do {
struct mem_cgroup *root = sc->target_mem_cgroup;
@@ -2221,10 +2277,12 @@ static void shrink_zone(struct zone *zone, struct scan_control *sc)
memcg = mem_cgroup_iter(root, NULL, &reclaim);
do {
struct lruvec *lruvec;
+ int swappiness;
lruvec = mem_cgroup_zone_lruvec(zone, memcg);
+ swappiness = mem_cgroup_swappiness(memcg);
- shrink_lruvec(lruvec, sc);
+ shrink_lruvec(lruvec, swappiness, sc);
/*
* Direct reclaim and kswapd have to scan all memory
@@ -2248,41 +2306,41 @@ static void shrink_zone(struct zone *zone, struct scan_control *sc)
sc->nr_scanned - nr_scanned,
sc->nr_reclaimed - nr_reclaimed);
+ if (sc->nr_reclaimed - nr_reclaimed)
+ reclaimable = true;
+
} while (should_continue_reclaim(zone, sc->nr_reclaimed - nr_reclaimed,
sc->nr_scanned - nr_scanned, sc));
+
+ return reclaimable;
}
/* Returns true if compaction should go ahead for a high-order request */
-static inline bool compaction_ready(struct zone *zone, struct scan_control *sc)
+static inline bool compaction_ready(struct zone *zone, int order)
{
unsigned long balance_gap, watermark;
bool watermark_ok;
- /* Do not consider compaction for orders reclaim is meant to satisfy */
- if (sc->order <= PAGE_ALLOC_COSTLY_ORDER)
- return false;
-
/*
* Compaction takes time to run and there are potentially other
* callers using the pages just freed. Continue reclaiming until
* there is a buffer of free pages available to give compaction
* a reasonable chance of completing and allocating the page
*/
- balance_gap = min(low_wmark_pages(zone),
- (zone->managed_pages + KSWAPD_ZONE_BALANCE_GAP_RATIO-1) /
- KSWAPD_ZONE_BALANCE_GAP_RATIO);
- watermark = high_wmark_pages(zone) + balance_gap + (2UL << sc->order);
+ balance_gap = min(low_wmark_pages(zone), DIV_ROUND_UP(
+ zone->managed_pages, KSWAPD_ZONE_BALANCE_GAP_RATIO));
+ watermark = high_wmark_pages(zone) + balance_gap + (2UL << order);
watermark_ok = zone_watermark_ok_safe(zone, 0, watermark, 0, 0);
/*
* If compaction is deferred, reclaim up to a point where
* compaction will have a chance of success when re-enabled
*/
- if (compaction_deferred(zone, sc->order))
+ if (compaction_deferred(zone, order))
return watermark_ok;
/* If compaction is not ready to start, keep reclaiming */
- if (!compaction_suitable(zone, sc->order))
+ if (!compaction_suitable(zone, order))
return false;
return watermark_ok;
@@ -2304,10 +2362,7 @@ static inline bool compaction_ready(struct zone *zone, struct scan_control *sc)
* If a zone is deemed to be full of pinned pages then just give it a light
* scan then give up on it.
*
- * This function returns true if a zone is being reclaimed for a costly
- * high-order allocation and compaction is ready to begin. This indicates to
- * the caller that it should consider retrying the allocation instead of
- * further reclaim.
+ * Returns true if a zone was reclaimable.
*/
static bool shrink_zones(struct zonelist *zonelist, struct scan_control *sc)
{
@@ -2316,13 +2371,13 @@ static bool shrink_zones(struct zonelist *zonelist, struct scan_control *sc)
unsigned long nr_soft_reclaimed;
unsigned long nr_soft_scanned;
unsigned long lru_pages = 0;
- bool aborted_reclaim = false;
struct reclaim_state *reclaim_state = current->reclaim_state;
gfp_t orig_mask;
struct shrink_control shrink = {
.gfp_mask = sc->gfp_mask,
};
enum zone_type requested_highidx = gfp_zone(sc->gfp_mask);
+ bool reclaimable = false;
/*
* If the number of buffer_heads in the machine exceeds the maximum
@@ -2353,22 +2408,24 @@ static bool shrink_zones(struct zonelist *zonelist, struct scan_control *sc)
if (sc->priority != DEF_PRIORITY &&
!zone_reclaimable(zone))
continue; /* Let kswapd poll it */
- if (IS_ENABLED(CONFIG_COMPACTION)) {
- /*
- * If we already have plenty of memory free for
- * compaction in this zone, don't free any more.
- * Even though compaction is invoked for any
- * non-zero order, only frequent costly order
- * reclamation is disruptive enough to become a
- * noticeable problem, like transparent huge
- * page allocations.
- */
- if ((zonelist_zone_idx(z) <= requested_highidx)
- && compaction_ready(zone, sc)) {
- aborted_reclaim = true;
- continue;
- }
+
+ /*
+ * If we already have plenty of memory free for
+ * compaction in this zone, don't free any more.
+ * Even though compaction is invoked for any
+ * non-zero order, only frequent costly order
+ * reclamation is disruptive enough to become a
+ * noticeable problem, like transparent huge
+ * page allocations.
+ */
+ if (IS_ENABLED(CONFIG_COMPACTION) &&
+ sc->order > PAGE_ALLOC_COSTLY_ORDER &&
+ zonelist_zone_idx(z) <= requested_highidx &&
+ compaction_ready(zone, sc->order)) {
+ sc->compaction_ready = true;
+ continue;
}
+
/*
* This steals pages from memory cgroups over softlimit
* and returns the number of reclaimed pages and
@@ -2381,10 +2438,17 @@ static bool shrink_zones(struct zonelist *zonelist, struct scan_control *sc)
&nr_soft_scanned);
sc->nr_reclaimed += nr_soft_reclaimed;
sc->nr_scanned += nr_soft_scanned;
+ if (nr_soft_reclaimed)
+ reclaimable = true;
/* need some check for avoid more shrink_zone() */
}
- shrink_zone(zone, sc);
+ if (shrink_zone(zone, sc))
+ reclaimable = true;
+
+ if (global_reclaim(sc) &&
+ !reclaimable && zone_reclaimable(zone))
+ reclaimable = true;
}
/*
@@ -2407,27 +2471,7 @@ static bool shrink_zones(struct zonelist *zonelist, struct scan_control *sc)
*/
sc->gfp_mask = orig_mask;
- return aborted_reclaim;
-}
-
-/* All zones in zonelist are unreclaimable? */
-static bool all_unreclaimable(struct zonelist *zonelist,
- struct scan_control *sc)
-{
- struct zoneref *z;
- struct zone *zone;
-
- for_each_zone_zonelist_nodemask(zone, z, zonelist,
- gfp_zone(sc->gfp_mask), sc->nodemask) {
- if (!populated_zone(zone))
- continue;
- if (!cpuset_zone_allowed_hardwall(zone, GFP_KERNEL))
- continue;
- if (zone_reclaimable(zone))
- return false;
- }
-
- return true;
+ return reclaimable;
}
/*
@@ -2451,7 +2495,7 @@ static unsigned long do_try_to_free_pages(struct zonelist *zonelist,
{
unsigned long total_scanned = 0;
unsigned long writeback_threshold;
- bool aborted_reclaim;
+ bool zones_reclaimable;
delayacct_freepages_start();
@@ -2462,11 +2506,14 @@ static unsigned long do_try_to_free_pages(struct zonelist *zonelist,
vmpressure_prio(sc->gfp_mask, sc->target_mem_cgroup,
sc->priority);
sc->nr_scanned = 0;
- aborted_reclaim = shrink_zones(zonelist, sc);
+ zones_reclaimable = shrink_zones(zonelist, sc);
total_scanned += sc->nr_scanned;
if (sc->nr_reclaimed >= sc->nr_to_reclaim)
- goto out;
+ break;
+
+ if (sc->compaction_ready)
+ break;
/*
* If we're getting trouble reclaiming, start doing
@@ -2488,28 +2535,19 @@ static unsigned long do_try_to_free_pages(struct zonelist *zonelist,
WB_REASON_TRY_TO_FREE_PAGES);
sc->may_writepage = 1;
}
- } while (--sc->priority >= 0 && !aborted_reclaim);
+ } while (--sc->priority >= 0);
-out:
delayacct_freepages_end();
if (sc->nr_reclaimed)
return sc->nr_reclaimed;
- /*
- * As hibernation is going on, kswapd is freezed so that it can't mark
- * the zone into all_unreclaimable. Thus bypassing all_unreclaimable
- * check.
- */
- if (oom_killer_disabled)
- return 0;
-
/* Aborted reclaim to try compaction? don't OOM, then */
- if (aborted_reclaim)
+ if (sc->compaction_ready)
return 1;
- /* top priority shrink_zones still had more to do? don't OOM, then */
- if (global_reclaim(sc) && !all_unreclaimable(zonelist, sc))
+ /* Any of the zones still reclaimable? Don't OOM. */
+ if (zones_reclaimable)
return 1;
return 0;
@@ -2525,10 +2563,17 @@ static bool pfmemalloc_watermark_ok(pg_data_t *pgdat)
for (i = 0; i <= ZONE_NORMAL; i++) {
zone = &pgdat->node_zones[i];
+ if (!populated_zone(zone))
+ continue;
+
pfmemalloc_reserve += min_wmark_pages(zone);
free_pages += zone_page_state(zone, NR_FREE_PAGES);
}
+ /* If there are no reserves (unexpected config) then do not throttle */
+ if (!pfmemalloc_reserve)
+ return true;
+
wmark_ok = free_pages > pfmemalloc_reserve / 2;
/* kswapd must be awake if processes are being throttled */
@@ -2553,9 +2598,9 @@ static bool pfmemalloc_watermark_ok(pg_data_t *pgdat)
static bool throttle_direct_reclaim(gfp_t gfp_mask, struct zonelist *zonelist,
nodemask_t *nodemask)
{
+ struct zoneref *z;
struct zone *zone;
- int high_zoneidx = gfp_zone(gfp_mask);
- pg_data_t *pgdat;
+ pg_data_t *pgdat = NULL;
/*
* Kernel threads should not be throttled as they may be indirectly
@@ -2574,10 +2619,34 @@ static bool throttle_direct_reclaim(gfp_t gfp_mask, struct zonelist *zonelist,
if (fatal_signal_pending(current))
goto out;
- /* Check if the pfmemalloc reserves are ok */
- first_zones_zonelist(zonelist, high_zoneidx, NULL, &zone);
- pgdat = zone->zone_pgdat;
- if (pfmemalloc_watermark_ok(pgdat))
+ /*
+ * Check if the pfmemalloc reserves are ok by finding the first node
+ * with a usable ZONE_NORMAL or lower zone. The expectation is that
+ * GFP_KERNEL will be required for allocating network buffers when
+ * swapping over the network so ZONE_HIGHMEM is unusable.
+ *
+ * Throttling is based on the first usable node and throttled processes
+ * wait on a queue until kswapd makes progress and wakes them. There
+ * is an affinity then between processes waking up and where reclaim
+ * progress has been made assuming the process wakes on the same node.
+ * More importantly, processes running on remote nodes will not compete
+ * for remote pfmemalloc reserves and processes on different nodes
+ * should make reasonable progress.
+ */
+ for_each_zone_zonelist_nodemask(zone, z, zonelist,
+ gfp_mask, nodemask) {
+ if (zone_idx(zone) > ZONE_NORMAL)
+ continue;
+
+ /* Throttle based on the first usable node */
+ pgdat = zone->zone_pgdat;
+ if (pfmemalloc_watermark_ok(pgdat))
+ goto out;
+ break;
+ }
+
+ /* If no zone was usable by the allocation flags then do not throttle */
+ if (!pgdat)
goto out;
/* Account for the throttling */
@@ -2615,15 +2684,14 @@ unsigned long try_to_free_pages(struct zonelist *zonelist, int order,
{
unsigned long nr_reclaimed;
struct scan_control sc = {
+ .nr_to_reclaim = SWAP_CLUSTER_MAX,
.gfp_mask = (gfp_mask = memalloc_noio_flags(gfp_mask)),
+ .order = order,
+ .nodemask = nodemask,
+ .priority = DEF_PRIORITY,
.may_writepage = !laptop_mode,
- .nr_to_reclaim = SWAP_CLUSTER_MAX,
.may_unmap = 1,
.may_swap = 1,
- .order = order,
- .priority = DEF_PRIORITY,
- .target_mem_cgroup = NULL,
- .nodemask = nodemask,
};
/*
@@ -2653,16 +2721,14 @@ unsigned long mem_cgroup_shrink_node_zone(struct mem_cgroup *memcg,
unsigned long *nr_scanned)
{
struct scan_control sc = {
- .nr_scanned = 0,
.nr_to_reclaim = SWAP_CLUSTER_MAX,
+ .target_mem_cgroup = memcg,
.may_writepage = !laptop_mode,
.may_unmap = 1,
.may_swap = !noswap,
- .order = 0,
- .priority = 0,
- .target_mem_cgroup = memcg,
};
struct lruvec *lruvec = mem_cgroup_zone_lruvec(zone, memcg);
+ int swappiness = mem_cgroup_swappiness(memcg);
sc.gfp_mask = (gfp_mask & GFP_RECLAIM_MASK) |
(GFP_HIGHUSER_MOVABLE & ~GFP_RECLAIM_MASK);
@@ -2678,7 +2744,7 @@ unsigned long mem_cgroup_shrink_node_zone(struct mem_cgroup *memcg,
* will pick up pages from other mem cgroup's as well. We hack
* the priority and make it zero.
*/
- shrink_lruvec(lruvec, &sc);
+ shrink_lruvec(lruvec, swappiness, &sc);
trace_mm_vmscan_memcg_softlimit_reclaim_end(sc.nr_reclaimed);
@@ -2694,16 +2760,14 @@ unsigned long try_to_free_mem_cgroup_pages(struct mem_cgroup *memcg,
unsigned long nr_reclaimed;
int nid;
struct scan_control sc = {
- .may_writepage = !laptop_mode,
- .may_unmap = 1,
- .may_swap = !noswap,
.nr_to_reclaim = SWAP_CLUSTER_MAX,
- .order = 0,
- .priority = DEF_PRIORITY,
- .target_mem_cgroup = memcg,
- .nodemask = NULL, /* we don't care the placement */
.gfp_mask = (gfp_mask & GFP_RECLAIM_MASK) |
(GFP_HIGHUSER_MOVABLE & ~GFP_RECLAIM_MASK),
+ .target_mem_cgroup = memcg,
+ .priority = DEF_PRIORITY,
+ .may_writepage = !laptop_mode,
+ .may_unmap = 1,
+ .may_swap = !noswap,
};
/*
@@ -2891,9 +2955,8 @@ static bool kswapd_shrink_zone(struct zone *zone,
* high wmark plus a "gap" where the gap is either the low
* watermark or 1% of the zone, whichever is smaller.
*/
- balance_gap = min(low_wmark_pages(zone),
- (zone->managed_pages + KSWAPD_ZONE_BALANCE_GAP_RATIO-1) /
- KSWAPD_ZONE_BALANCE_GAP_RATIO);
+ balance_gap = min(low_wmark_pages(zone), DIV_ROUND_UP(
+ zone->managed_pages, KSWAPD_ZONE_BALANCE_GAP_RATIO));
/*
* If there is no low memory pressure or the zone is balanced then no
@@ -2962,12 +3025,11 @@ static unsigned long balance_pgdat(pg_data_t *pgdat, int order,
unsigned long nr_soft_scanned;
struct scan_control sc = {
.gfp_mask = GFP_KERNEL,
+ .order = order,
.priority = DEF_PRIORITY,
+ .may_writepage = !laptop_mode,
.may_unmap = 1,
.may_swap = 1,
- .may_writepage = !laptop_mode,
- .order = order,
- .target_mem_cgroup = NULL,
};
count_vm_event(PAGEOUTRUN);
@@ -3302,7 +3364,10 @@ static int kswapd(void *p)
}
}
+ tsk->flags &= ~(PF_MEMALLOC | PF_SWAPWRITE | PF_KSWAPD);
current->reclaim_state = NULL;
+ lockdep_clear_current_reclaim_state();
+
return 0;
}
@@ -3345,14 +3410,13 @@ unsigned long shrink_all_memory(unsigned long nr_to_reclaim)
{
struct reclaim_state reclaim_state;
struct scan_control sc = {
+ .nr_to_reclaim = nr_to_reclaim,
.gfp_mask = GFP_HIGHUSER_MOVABLE,
- .may_swap = 1,
- .may_unmap = 1,
+ .priority = DEF_PRIORITY,
.may_writepage = 1,
- .nr_to_reclaim = nr_to_reclaim,
+ .may_unmap = 1,
+ .may_swap = 1,
.hibernation_mode = 1,
- .order = 0,
- .priority = DEF_PRIORITY,
};
struct zonelist *zonelist = node_zonelist(numa_node_id(), sc.gfp_mask);
struct task_struct *p = current;
@@ -3422,7 +3486,7 @@ int kswapd_run(int nid)
/*
* Called by memory hotplug when all memory in a node is offlined. Caller must
- * hold lock_memory_hotplug().
+ * hold mem_hotplug_begin/end().
*/
void kswapd_stop(int nid)
{
@@ -3532,13 +3596,13 @@ static int __zone_reclaim(struct zone *zone, gfp_t gfp_mask, unsigned int order)
struct task_struct *p = current;
struct reclaim_state reclaim_state;
struct scan_control sc = {
- .may_writepage = !!(zone_reclaim_mode & RECLAIM_WRITE),
- .may_unmap = !!(zone_reclaim_mode & RECLAIM_SWAP),
- .may_swap = 1,
.nr_to_reclaim = max(nr_pages, SWAP_CLUSTER_MAX),
.gfp_mask = (gfp_mask = memalloc_noio_flags(gfp_mask)),
.order = order,
.priority = ZONE_RECLAIM_PRIORITY,
+ .may_writepage = !!(zone_reclaim_mode & RECLAIM_WRITE),
+ .may_unmap = !!(zone_reclaim_mode & RECLAIM_SWAP),
+ .may_swap = 1,
};
struct shrink_control shrink = {
.gfp_mask = sc.gfp_mask,
OpenPOWER on IntegriCloud