From f2f43e566a02a3bdde0a65e6a2e88d707c212a29 Mon Sep 17 00:00:00 2001 From: Nick Desaulniers Date: Thu, 6 Jul 2017 15:36:50 -0700 Subject: mm/vmscan.c: fix unsequenced modification and access warning Clang and its -Wunsequenced emits a warning mm/vmscan.c:2961:25: error: unsequenced modification and access to 'gfp_mask' [-Wunsequenced] .gfp_mask = (gfp_mask = current_gfp_context(gfp_mask)), ^ While it is not clear to me whether the initialization code violates the specification (6.7.8 par 19 (ISO/IEC 9899) looks like it disagrees) the code is quite confusing and worth cleaning up anyway. Fix this by reusing sc.gfp_mask rather than the updated input gfp_mask parameter. Link: http://lkml.kernel.org/r/20170510154030.10720-1-nick.desaulniers@gmail.com Signed-off-by: Nick Desaulniers Acked-by: Michal Hocko Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- mm/vmscan.c | 13 ++++++------- 1 file changed, 6 insertions(+), 7 deletions(-) (limited to 'mm/vmscan.c') diff --git a/mm/vmscan.c b/mm/vmscan.c index c3c1c6ac62da..a10e05870835 100644 --- a/mm/vmscan.c +++ b/mm/vmscan.c @@ -2967,7 +2967,7 @@ unsigned long try_to_free_pages(struct zonelist *zonelist, int order, unsigned long nr_reclaimed; struct scan_control sc = { .nr_to_reclaim = SWAP_CLUSTER_MAX, - .gfp_mask = (gfp_mask = current_gfp_context(gfp_mask)), + .gfp_mask = current_gfp_context(gfp_mask), .reclaim_idx = gfp_zone(gfp_mask), .order = order, .nodemask = nodemask, @@ -2982,12 +2982,12 @@ unsigned long try_to_free_pages(struct zonelist *zonelist, int order, * 1 is returned so that the page allocator does not OOM kill at this * point. */ - if (throttle_direct_reclaim(gfp_mask, zonelist, nodemask)) + if (throttle_direct_reclaim(sc.gfp_mask, zonelist, nodemask)) return 1; trace_mm_vmscan_direct_reclaim_begin(order, sc.may_writepage, - gfp_mask, + sc.gfp_mask, sc.reclaim_idx); nr_reclaimed = do_try_to_free_pages(zonelist, &sc); @@ -3774,17 +3774,16 @@ static int __node_reclaim(struct pglist_data *pgdat, gfp_t gfp_mask, unsigned in const unsigned long nr_pages = 1 << order; struct task_struct *p = current; struct reclaim_state reclaim_state; - int classzone_idx = gfp_zone(gfp_mask); unsigned int noreclaim_flag; struct scan_control sc = { .nr_to_reclaim = max(nr_pages, SWAP_CLUSTER_MAX), - .gfp_mask = (gfp_mask = current_gfp_context(gfp_mask)), + .gfp_mask = current_gfp_context(gfp_mask), .order = order, .priority = NODE_RECLAIM_PRIORITY, .may_writepage = !!(node_reclaim_mode & RECLAIM_WRITE), .may_unmap = !!(node_reclaim_mode & RECLAIM_UNMAP), .may_swap = 1, - .reclaim_idx = classzone_idx, + .reclaim_idx = gfp_zone(gfp_mask), }; cond_resched(); @@ -3795,7 +3794,7 @@ static int __node_reclaim(struct pglist_data *pgdat, gfp_t gfp_mask, unsigned in */ noreclaim_flag = memalloc_noreclaim_save(); p->flags |= PF_SWAPWRITE; - lockdep_set_current_reclaim_state(gfp_mask); + lockdep_set_current_reclaim_state(sc.gfp_mask); reclaim_state.reclaimed_slab = 0; p->reclaim_state = &reclaim_state; -- cgit v1.2.3 From 75f6d6d29a40b5541f0f107201cf7dec134ad210 Mon Sep 17 00:00:00 2001 From: Minchan Kim Date: Thu, 6 Jul 2017 15:37:21 -0700 Subject: mm, THP, swap: unify swap slot free functions to put_swap_page Now, get_swap_page takes struct page and allocates swap space according to page size(ie, normal or THP) so it would be more cleaner to introduce put_swap_page which is a counter function of get_swap_page. Then, it calls right swap slot free function depending on page's size. [ying.huang@intel.com: minor cleanup and fix] Link: http://lkml.kernel.org/r/20170515112522.32457-3-ying.huang@intel.com Signed-off-by: Minchan Kim Signed-off-by: "Huang, Ying" Acked-by: Johannes Weiner Cc: Andrea Arcangeli Cc: Ebru Akagunduz Cc: Hugh Dickins Cc: Kirill A. Shutemov Cc: Michal Hocko Cc: Rik van Riel Cc: Shaohua Li Cc: Tejun Heo Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- include/linux/swap.h | 12 ++---------- mm/shmem.c | 2 +- mm/swap_state.c | 13 +++---------- mm/swapfile.c | 16 ++++++++++++++-- mm/vmscan.c | 2 +- 5 files changed, 21 insertions(+), 24 deletions(-) (limited to 'mm/vmscan.c') diff --git a/include/linux/swap.h b/include/linux/swap.h index d18876384de0..ead6fd7966b4 100644 --- a/include/linux/swap.h +++ b/include/linux/swap.h @@ -387,6 +387,7 @@ static inline long get_nr_swap_pages(void) extern void si_swapinfo(struct sysinfo *); extern swp_entry_t get_swap_page(struct page *page); +extern void put_swap_page(struct page *page, swp_entry_t entry); extern swp_entry_t get_swap_page_of_type(int); extern int get_swap_pages(int n, bool cluster, swp_entry_t swp_entries[]); extern int add_swap_count_continuation(swp_entry_t, gfp_t); @@ -394,7 +395,6 @@ extern void swap_shmem_alloc(swp_entry_t); extern int swap_duplicate(swp_entry_t); extern int swapcache_prepare(swp_entry_t); extern void swap_free(swp_entry_t); -extern void swapcache_free(swp_entry_t); extern void swapcache_free_entries(swp_entry_t *entries, int n); extern int free_swap_and_cache(swp_entry_t); extern int swap_type_of(dev_t, sector_t, struct block_device **); @@ -453,7 +453,7 @@ static inline void swap_free(swp_entry_t swp) { } -static inline void swapcache_free(swp_entry_t swp) +static inline void put_swap_page(struct page *page, swp_entry_t swp) { } @@ -578,13 +578,5 @@ static inline bool mem_cgroup_swap_full(struct page *page) } #endif -#ifdef CONFIG_THP_SWAP -extern void swapcache_free_cluster(swp_entry_t entry); -#else -static inline void swapcache_free_cluster(swp_entry_t entry) -{ -} -#endif - #endif /* __KERNEL__*/ #endif /* _LINUX_SWAP_H */ diff --git a/mm/shmem.c b/mm/shmem.c index bbb987c58dad..a06f23731d3f 100644 --- a/mm/shmem.c +++ b/mm/shmem.c @@ -1327,7 +1327,7 @@ static int shmem_writepage(struct page *page, struct writeback_control *wbc) mutex_unlock(&shmem_swaplist_mutex); free_swap: - swapcache_free(swap); + put_swap_page(page, swap); redirty: set_page_dirty(page); if (wbc->for_reclaim) diff --git a/mm/swap_state.c b/mm/swap_state.c index 16ff89d058f4..0ad214d7a7ad 100644 --- a/mm/swap_state.c +++ b/mm/swap_state.c @@ -231,10 +231,7 @@ retry: return 1; fail_free: - if (PageTransHuge(page)) - swapcache_free_cluster(entry); - else - swapcache_free(entry); + put_swap_page(page, entry); fail: if (PageTransHuge(page) && !split_huge_page_to_list(page, list)) goto retry; @@ -259,11 +256,7 @@ void delete_from_swap_cache(struct page *page) __delete_from_swap_cache(page); spin_unlock_irq(&address_space->tree_lock); - if (PageTransHuge(page)) - swapcache_free_cluster(entry); - else - swapcache_free(entry); - + put_swap_page(page, entry); page_ref_sub(page, hpage_nr_pages(page)); } @@ -415,7 +408,7 @@ struct page *__read_swap_cache_async(swp_entry_t entry, gfp_t gfp_mask, * add_to_swap_cache() doesn't return -EEXIST, so we can safely * clear SWAP_HAS_CACHE flag. */ - swapcache_free(entry); + put_swap_page(new_page, entry); } while (err != -ENOMEM); if (new_page) diff --git a/mm/swapfile.c b/mm/swapfile.c index 984f0dd94948..8a6cdf9e55f9 100644 --- a/mm/swapfile.c +++ b/mm/swapfile.c @@ -1148,7 +1148,7 @@ void swap_free(swp_entry_t entry) /* * Called after dropping swapcache to decrease refcnt to swap entries. */ -void swapcache_free(swp_entry_t entry) +static void swapcache_free(swp_entry_t entry) { struct swap_info_struct *p; @@ -1160,7 +1160,7 @@ void swapcache_free(swp_entry_t entry) } #ifdef CONFIG_THP_SWAP -void swapcache_free_cluster(swp_entry_t entry) +static void swapcache_free_cluster(swp_entry_t entry) { unsigned long offset = swp_offset(entry); unsigned long idx = offset / SWAPFILE_CLUSTER; @@ -1184,8 +1184,20 @@ void swapcache_free_cluster(swp_entry_t entry) swap_free_cluster(si, idx); spin_unlock(&si->lock); } +#else +static inline void swapcache_free_cluster(swp_entry_t entry) +{ +} #endif /* CONFIG_THP_SWAP */ +void put_swap_page(struct page *page, swp_entry_t entry) +{ + if (!PageTransHuge(page)) + swapcache_free(entry); + else + swapcache_free_cluster(entry); +} + void swapcache_free_entries(swp_entry_t *entries, int n) { struct swap_info_struct *p, *prev; diff --git a/mm/vmscan.c b/mm/vmscan.c index a10e05870835..cb7c154a4a9d 100644 --- a/mm/vmscan.c +++ b/mm/vmscan.c @@ -708,7 +708,7 @@ static int __remove_mapping(struct address_space *mapping, struct page *page, mem_cgroup_swapout(page, swap); __delete_from_swap_cache(page); spin_unlock_irqrestore(&mapping->tree_lock, flags); - swapcache_free(swap); + put_swap_page(page, swap); } else { void (*freepage)(struct page *); void *shadow = NULL; -- cgit v1.2.3 From 0f0746589e4be071a8f890b2035c97c30c7a4e16 Mon Sep 17 00:00:00 2001 From: Minchan Kim Date: Thu, 6 Jul 2017 15:37:24 -0700 Subject: mm, THP, swap: move anonymous THP split logic to vmscan The add_to_swap aims to allocate swap_space(ie, swap slot and swapcache) so if it fails due to lack of space in case of THP or something(hdd swap but tries THP swapout) *caller* rather than add_to_swap itself should split the THP page and retry it with base page which is more natural. Link: http://lkml.kernel.org/r/20170515112522.32457-4-ying.huang@intel.com Signed-off-by: Minchan Kim Signed-off-by: "Huang, Ying" Acked-by: Johannes Weiner Cc: Andrea Arcangeli Cc: Ebru Akagunduz Cc: Hugh Dickins Cc: Kirill A. Shutemov Cc: Michal Hocko Cc: Rik van Riel Cc: Shaohua Li Cc: Tejun Heo Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- include/linux/swap.h | 4 ++-- mm/swap_state.c | 23 ++++++----------------- mm/vmscan.c | 17 ++++++++++++++++- 3 files changed, 24 insertions(+), 20 deletions(-) (limited to 'mm/vmscan.c') diff --git a/include/linux/swap.h b/include/linux/swap.h index ead6fd7966b4..5ab1c98c7d27 100644 --- a/include/linux/swap.h +++ b/include/linux/swap.h @@ -353,7 +353,7 @@ extern struct address_space *swapper_spaces[]; >> SWAP_ADDRESS_SPACE_SHIFT]) extern unsigned long total_swapcache_pages(void); extern void show_swap_cache_info(void); -extern int add_to_swap(struct page *, struct list_head *list); +extern int add_to_swap(struct page *page); extern int add_to_swap_cache(struct page *, swp_entry_t, gfp_t); extern int __add_to_swap_cache(struct page *page, swp_entry_t entry); extern void __delete_from_swap_cache(struct page *); @@ -473,7 +473,7 @@ static inline struct page *lookup_swap_cache(swp_entry_t swp) return NULL; } -static inline int add_to_swap(struct page *page, struct list_head *list) +static inline int add_to_swap(struct page *page) { return 0; } diff --git a/mm/swap_state.c b/mm/swap_state.c index 0ad214d7a7ad..9c71b6b2562f 100644 --- a/mm/swap_state.c +++ b/mm/swap_state.c @@ -184,7 +184,7 @@ void __delete_from_swap_cache(struct page *page) * Allocate swap space for the page and add the page to the * swap cache. Caller needs to hold the page lock. */ -int add_to_swap(struct page *page, struct list_head *list) +int add_to_swap(struct page *page) { swp_entry_t entry; int err; @@ -192,12 +192,12 @@ int add_to_swap(struct page *page, struct list_head *list) VM_BUG_ON_PAGE(!PageLocked(page), page); VM_BUG_ON_PAGE(!PageUptodate(page), page); -retry: entry = get_swap_page(page); if (!entry.val) - goto fail; + return 0; + if (mem_cgroup_try_charge_swap(page, entry)) - goto fail_free; + goto fail; /* * Radix-tree node allocations from PF_MEMALLOC contexts could @@ -218,23 +218,12 @@ retry: * add_to_swap_cache() doesn't return -EEXIST, so we can safely * clear SWAP_HAS_CACHE flag. */ - goto fail_free; - - if (PageTransHuge(page)) { - err = split_huge_page_to_list(page, list); - if (err) { - delete_from_swap_cache(page); - return 0; - } - } + goto fail; return 1; -fail_free: - put_swap_page(page, entry); fail: - if (PageTransHuge(page) && !split_huge_page_to_list(page, list)) - goto retry; + put_swap_page(page, entry); return 0; } diff --git a/mm/vmscan.c b/mm/vmscan.c index cb7c154a4a9d..729e37f02de6 100644 --- a/mm/vmscan.c +++ b/mm/vmscan.c @@ -1125,8 +1125,23 @@ static unsigned long shrink_page_list(struct list_head *page_list, !PageSwapCache(page)) { if (!(sc->gfp_mask & __GFP_IO)) goto keep_locked; - if (!add_to_swap(page, page_list)) + if (!add_to_swap(page)) { + if (!PageTransHuge(page)) + goto activate_locked; + /* Split THP and swap individual base pages */ + if (split_huge_page_to_list(page, page_list)) + goto activate_locked; + if (!add_to_swap(page)) + goto activate_locked; + } + + /* XXX: We don't support THP writes */ + if (PageTransHuge(page) && + split_huge_page_to_list(page, page_list)) { + delete_from_swap_cache(page); goto activate_locked; + } + may_enter_fs = 1; /* Adding to swap updated mapping */ -- cgit v1.2.3 From b8f593cd0896b8b14c2b494a9776531b5cd54d98 Mon Sep 17 00:00:00 2001 From: Huang Ying Date: Thu, 6 Jul 2017 15:37:28 -0700 Subject: mm, THP, swap: check whether THP can be split firstly To swap out THP (Transparent Huage Page), before splitting the THP, the swap cluster will be allocated and the THP will be added into the swap cache. But it is possible that the THP cannot be split, so that we must delete the THP from the swap cache and free the swap cluster. To avoid that, in this patch, whether the THP can be split is checked firstly. The check can only be done racy, but it is good enough for most cases. With the patch, the swap out throughput improves 3.6% (from about 4.16GB/s to about 4.31GB/s) in the vm-scalability swap-w-seq test case with 8 processes. The test is done on a Xeon E5 v3 system. The swap device used is a RAM simulated PMEM (persistent memory) device. To test the sequential swapping out, the test case creates 8 processes, which sequentially allocate and write to the anonymous pages until the RAM and part of the swap device is used up. Link: http://lkml.kernel.org/r/20170515112522.32457-5-ying.huang@intel.com Signed-off-by: "Huang, Ying" Acked-by: Kirill A. Shutemov [for can_split_huge_page()] Cc: Johannes Weiner Cc: Andrea Arcangeli Cc: Ebru Akagunduz Cc: Hugh Dickins Cc: Michal Hocko Cc: Minchan Kim Cc: Rik van Riel Cc: Shaohua Li Cc: Tejun Heo Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- include/linux/huge_mm.h | 7 +++++++ mm/huge_memory.c | 20 ++++++++++++++++---- mm/vmscan.c | 4 ++++ 3 files changed, 27 insertions(+), 4 deletions(-) (limited to 'mm/vmscan.c') diff --git a/include/linux/huge_mm.h b/include/linux/huge_mm.h index a3762d49ba39..d3b3e8fcc717 100644 --- a/include/linux/huge_mm.h +++ b/include/linux/huge_mm.h @@ -113,6 +113,7 @@ extern unsigned long thp_get_unmapped_area(struct file *filp, extern void prep_transhuge_page(struct page *page); extern void free_transhuge_page(struct page *page); +bool can_split_huge_page(struct page *page, int *pextra_pins); int split_huge_page_to_list(struct page *page, struct list_head *list); static inline int split_huge_page(struct page *page) { @@ -231,6 +232,12 @@ static inline void prep_transhuge_page(struct page *page) {} #define thp_get_unmapped_area NULL +static inline bool +can_split_huge_page(struct page *page, int *pextra_pins) +{ + BUILD_BUG(); + return false; +} static inline int split_huge_page_to_list(struct page *page, struct list_head *list) { diff --git a/mm/huge_memory.c b/mm/huge_memory.c index 1a168e4bac4b..86975dec0ba1 100644 --- a/mm/huge_memory.c +++ b/mm/huge_memory.c @@ -2390,6 +2390,21 @@ int page_trans_huge_mapcount(struct page *page, int *total_mapcount) return ret; } +/* Racy check whether the huge page can be split */ +bool can_split_huge_page(struct page *page, int *pextra_pins) +{ + int extra_pins; + + /* Additional pins from radix tree */ + if (PageAnon(page)) + extra_pins = PageSwapCache(page) ? HPAGE_PMD_NR : 0; + else + extra_pins = HPAGE_PMD_NR; + if (pextra_pins) + *pextra_pins = extra_pins; + return total_mapcount(page) == page_count(page) - extra_pins - 1; +} + /* * This function splits huge page into normal pages. @page can point to any * subpage of huge page to split. Split doesn't change the position of @page. @@ -2437,7 +2452,6 @@ int split_huge_page_to_list(struct page *page, struct list_head *list) ret = -EBUSY; goto out; } - extra_pins = PageSwapCache(page) ? HPAGE_PMD_NR : 0; mapping = NULL; anon_vma_lock_write(anon_vma); } else { @@ -2449,8 +2463,6 @@ int split_huge_page_to_list(struct page *page, struct list_head *list) goto out; } - /* Addidional pins from radix tree */ - extra_pins = HPAGE_PMD_NR; anon_vma = NULL; i_mmap_lock_read(mapping); } @@ -2459,7 +2471,7 @@ int split_huge_page_to_list(struct page *page, struct list_head *list) * Racy check if we can split the page, before freeze_page() will * split PMDs */ - if (total_mapcount(head) != page_count(head) - extra_pins - 1) { + if (!can_split_huge_page(head, &extra_pins)) { ret = -EBUSY; goto out_unlock; } diff --git a/mm/vmscan.c b/mm/vmscan.c index 729e37f02de6..abaf0215a352 100644 --- a/mm/vmscan.c +++ b/mm/vmscan.c @@ -1125,6 +1125,10 @@ static unsigned long shrink_page_list(struct list_head *page_list, !PageSwapCache(page)) { if (!(sc->gfp_mask & __GFP_IO)) goto keep_locked; + /* cannot split THP, skip it */ + if (PageTransHuge(page) && + !can_split_huge_page(page, NULL)) + goto activate_locked; if (!add_to_swap(page)) { if (!PageTransHuge(page)) goto activate_locked; -- cgit v1.2.3 From 747552b1e71b400fa32a221e072be2c0b7661f14 Mon Sep 17 00:00:00 2001 From: Huang Ying Date: Thu, 6 Jul 2017 15:37:31 -0700 Subject: mm, THP, swap: enable THP swap optimization only if has compound map If there is no compound map for a THP (Transparent Huge Page), it is possible that the map count of some sub-pages of the THP is 0. So it is better to split the THP before swapping out. In this way, the sub-pages not mapped will be freed, and we can avoid the unnecessary swap out operations for these sub-pages. Link: http://lkml.kernel.org/r/20170515112522.32457-6-ying.huang@intel.com Signed-off-by: "Huang, Ying" Cc: Johannes Weiner Cc: Andrea Arcangeli Cc: Ebru Akagunduz Cc: Hugh Dickins Cc: Kirill A. Shutemov Cc: Michal Hocko Cc: Minchan Kim Cc: Rik van Riel Cc: Shaohua Li Cc: Tejun Heo Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- mm/vmscan.c | 17 +++++++++++++---- 1 file changed, 13 insertions(+), 4 deletions(-) (limited to 'mm/vmscan.c') diff --git a/mm/vmscan.c b/mm/vmscan.c index abaf0215a352..aebb157258f2 100644 --- a/mm/vmscan.c +++ b/mm/vmscan.c @@ -1125,10 +1125,19 @@ static unsigned long shrink_page_list(struct list_head *page_list, !PageSwapCache(page)) { if (!(sc->gfp_mask & __GFP_IO)) goto keep_locked; - /* cannot split THP, skip it */ - if (PageTransHuge(page) && - !can_split_huge_page(page, NULL)) - goto activate_locked; + if (PageTransHuge(page)) { + /* cannot split THP, skip it */ + if (!can_split_huge_page(page, NULL)) + goto activate_locked; + /* + * Split pages without a PMD map right + * away. Chances are some or all of the + * tail pages can be freed without IO. + */ + if (!compound_mapcount(page) && + split_huge_page_to_list(page, page_list)) + goto activate_locked; + } if (!add_to_swap(page)) { if (!PageTransHuge(page)) goto activate_locked; -- cgit v1.2.3 From 2262185c5b287f2758afda79c149b7cf6bee165c Mon Sep 17 00:00:00 2001 From: Roman Gushchin Date: Thu, 6 Jul 2017 15:40:25 -0700 Subject: mm: per-cgroup memory reclaim stats Track the following reclaim counters for every memory cgroup: PGREFILL, PGSCAN, PGSTEAL, PGACTIVATE, PGDEACTIVATE, PGLAZYFREE and PGLAZYFREED. These values are exposed using the memory.stats interface of cgroup v2. The meaning of each value is the same as for global counters, available using /proc/vmstat. Also, for consistency, rename mem_cgroup_count_vm_event() to count_memcg_event_mm(). Link: http://lkml.kernel.org/r/1494530183-30808-1-git-send-email-guro@fb.com Signed-off-by: Roman Gushchin Suggested-by: Johannes Weiner Acked-by: Michal Hocko Acked-by: Vladimir Davydov Acked-by: Johannes Weiner Cc: Tejun Heo Cc: Li Zefan Cc: Balbir Singh Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- Documentation/cgroup-v2.txt | 28 ++++++++++++++++++++++++++ fs/dax.c | 2 +- fs/ncpfs/mmap.c | 2 +- include/linux/memcontrol.h | 48 ++++++++++++++++++++++++++++++++++++++++++--- mm/filemap.c | 2 +- mm/memcontrol.c | 10 ++++++++++ mm/memory.c | 4 ++-- mm/shmem.c | 3 +-- mm/swap.c | 1 + mm/vmscan.c | 30 +++++++++++++++++++++------- 10 files changed, 113 insertions(+), 17 deletions(-) (limited to 'mm/vmscan.c') diff --git a/Documentation/cgroup-v2.txt b/Documentation/cgroup-v2.txt index 558c3a739baf..5ac2fbde97e6 100644 --- a/Documentation/cgroup-v2.txt +++ b/Documentation/cgroup-v2.txt @@ -956,6 +956,34 @@ PAGE_SIZE multiple when read back. Number of times a shadow node has been reclaimed + pgrefill + + Amount of scanned pages (in an active LRU list) + + pgscan + + Amount of scanned pages (in an inactive LRU list) + + pgsteal + + Amount of reclaimed pages + + pgactivate + + Amount of pages moved to the active LRU list + + pgdeactivate + + Amount of pages moved to the inactive LRU lis + + pglazyfree + + Amount of pages postponed to be freed under memory pressure + + pglazyfreed + + Amount of reclaimed lazyfree pages + memory.swap.current A read-only single value file which exists on non-root diff --git a/fs/dax.c b/fs/dax.c index 33d05aa02aad..cd8fced592d0 100644 --- a/fs/dax.c +++ b/fs/dax.c @@ -1213,7 +1213,7 @@ static int dax_iomap_pte_fault(struct vm_fault *vmf, case IOMAP_MAPPED: if (iomap.flags & IOMAP_F_NEW) { count_vm_event(PGMAJFAULT); - mem_cgroup_count_vm_event(vmf->vma->vm_mm, PGMAJFAULT); + count_memcg_event_mm(vmf->vma->vm_mm, PGMAJFAULT); major = VM_FAULT_MAJOR; } error = dax_insert_mapping(mapping, iomap.bdev, iomap.dax_dev, diff --git a/fs/ncpfs/mmap.c b/fs/ncpfs/mmap.c index 0c3905e0542e..6719c0be674d 100644 --- a/fs/ncpfs/mmap.c +++ b/fs/ncpfs/mmap.c @@ -89,7 +89,7 @@ static int ncp_file_mmap_fault(struct vm_fault *vmf) * -- nyc */ count_vm_event(PGMAJFAULT); - mem_cgroup_count_vm_event(vmf->vma->vm_mm, PGMAJFAULT); + count_memcg_event_mm(vmf->vma->vm_mm, PGMAJFAULT); return VM_FAULT_MAJOR; } diff --git a/include/linux/memcontrol.h b/include/linux/memcontrol.h index 899949bbb2f9..b2a5b1cd4e55 100644 --- a/include/linux/memcontrol.h +++ b/include/linux/memcontrol.h @@ -357,6 +357,17 @@ static inline unsigned short mem_cgroup_id(struct mem_cgroup *memcg) } struct mem_cgroup *mem_cgroup_from_id(unsigned short id); +static inline struct mem_cgroup *lruvec_memcg(struct lruvec *lruvec) +{ + struct mem_cgroup_per_node *mz; + + if (mem_cgroup_disabled()) + return NULL; + + mz = container_of(lruvec, struct mem_cgroup_per_node, lruvec); + return mz->memcg; +} + /** * parent_mem_cgroup - find the accounting parent of a memcg * @memcg: memcg whose parent to find @@ -546,8 +557,23 @@ unsigned long mem_cgroup_soft_limit_reclaim(pg_data_t *pgdat, int order, gfp_t gfp_mask, unsigned long *total_scanned); -static inline void mem_cgroup_count_vm_event(struct mm_struct *mm, - enum vm_event_item idx) +static inline void count_memcg_events(struct mem_cgroup *memcg, + enum vm_event_item idx, + unsigned long count) +{ + if (!mem_cgroup_disabled()) + this_cpu_add(memcg->stat->events[idx], count); +} + +static inline void count_memcg_page_event(struct page *page, + enum memcg_stat_item idx) +{ + if (page->mem_cgroup) + count_memcg_events(page->mem_cgroup, idx, 1); +} + +static inline void count_memcg_event_mm(struct mm_struct *mm, + enum vm_event_item idx) { struct mem_cgroup *memcg; @@ -675,6 +701,11 @@ static inline struct mem_cgroup *mem_cgroup_from_id(unsigned short id) return NULL; } +static inline struct mem_cgroup *lruvec_memcg(struct lruvec *lruvec) +{ + return NULL; +} + static inline bool mem_cgroup_online(struct mem_cgroup *memcg) { return true; @@ -789,8 +820,19 @@ static inline void mem_cgroup_split_huge_fixup(struct page *head) { } +static inline void count_memcg_events(struct mem_cgroup *memcg, + enum vm_event_item idx, + unsigned long count) +{ +} + +static inline void count_memcg_page_event(struct page *page, + enum memcg_stat_item idx) +{ +} + static inline -void mem_cgroup_count_vm_event(struct mm_struct *mm, enum vm_event_item idx) +void count_memcg_event_mm(struct mm_struct *mm, enum vm_event_item idx) { } #endif /* CONFIG_MEMCG */ diff --git a/mm/filemap.c b/mm/filemap.c index aea58e983a73..2e906ef52143 100644 --- a/mm/filemap.c +++ b/mm/filemap.c @@ -2265,7 +2265,7 @@ int filemap_fault(struct vm_fault *vmf) /* No page in the page cache at all */ do_sync_mmap_readahead(vmf->vma, ra, file, offset); count_vm_event(PGMAJFAULT); - mem_cgroup_count_vm_event(vmf->vma->vm_mm, PGMAJFAULT); + count_memcg_event_mm(vmf->vma->vm_mm, PGMAJFAULT); ret = VM_FAULT_MAJOR; retry_find: page = find_get_page(mapping, offset); diff --git a/mm/memcontrol.c b/mm/memcontrol.c index fc51a33ddcd1..3e2f8cf85b4c 100644 --- a/mm/memcontrol.c +++ b/mm/memcontrol.c @@ -5230,6 +5230,16 @@ static int memory_stat_show(struct seq_file *m, void *v) seq_printf(m, "pgfault %lu\n", events[PGFAULT]); seq_printf(m, "pgmajfault %lu\n", events[PGMAJFAULT]); + seq_printf(m, "pgrefill %lu\n", events[PGREFILL]); + seq_printf(m, "pgscan %lu\n", events[PGSCAN_KSWAPD] + + events[PGSCAN_DIRECT]); + seq_printf(m, "pgsteal %lu\n", events[PGSTEAL_KSWAPD] + + events[PGSTEAL_DIRECT]); + seq_printf(m, "pgactivate %lu\n", events[PGACTIVATE]); + seq_printf(m, "pgdeactivate %lu\n", events[PGDEACTIVATE]); + seq_printf(m, "pglazyfree %lu\n", events[PGLAZYFREE]); + seq_printf(m, "pglazyfreed %lu\n", events[PGLAZYFREED]); + seq_printf(m, "workingset_refault %lu\n", stat[WORKINGSET_REFAULT]); seq_printf(m, "workingset_activate %lu\n", diff --git a/mm/memory.c b/mm/memory.c index bf3aab1684e9..e31dd97e6114 100644 --- a/mm/memory.c +++ b/mm/memory.c @@ -2719,7 +2719,7 @@ int do_swap_page(struct vm_fault *vmf) /* Had to read the page from swap area: Major fault */ ret = VM_FAULT_MAJOR; count_vm_event(PGMAJFAULT); - mem_cgroup_count_vm_event(vma->vm_mm, PGMAJFAULT); + count_memcg_event_mm(vma->vm_mm, PGMAJFAULT); } else if (PageHWPoison(page)) { /* * hwpoisoned dirty swapcache pages are kept for killing @@ -3837,7 +3837,7 @@ int handle_mm_fault(struct vm_area_struct *vma, unsigned long address, __set_current_state(TASK_RUNNING); count_vm_event(PGFAULT); - mem_cgroup_count_vm_event(vma->vm_mm, PGFAULT); + count_memcg_event_mm(vma->vm_mm, PGFAULT); /* do counter updates before entering really critical section. */ check_sync_rss_stat(current); diff --git a/mm/shmem.c b/mm/shmem.c index a06f23731d3f..9418f5a9bc46 100644 --- a/mm/shmem.c +++ b/mm/shmem.c @@ -1646,8 +1646,7 @@ repeat: if (fault_type) { *fault_type |= VM_FAULT_MAJOR; count_vm_event(PGMAJFAULT); - mem_cgroup_count_vm_event(charge_mm, - PGMAJFAULT); + count_memcg_event_mm(charge_mm, PGMAJFAULT); } /* Here we actually start the io */ page = shmem_swapin(swap, gfp, info, index); diff --git a/mm/swap.c b/mm/swap.c index 98d08b4579fa..4f44dbd7f780 100644 --- a/mm/swap.c +++ b/mm/swap.c @@ -591,6 +591,7 @@ static void lru_lazyfree_fn(struct page *page, struct lruvec *lruvec, add_page_to_lru_list(page, lruvec, LRU_INACTIVE_FILE); __count_vm_events(PGLAZYFREE, hpage_nr_pages(page)); + count_memcg_page_event(page, PGLAZYFREE); update_page_reclaim_stat(lruvec, 1, 0); } } diff --git a/mm/vmscan.c b/mm/vmscan.c index aebb157258f2..7d3c6c59897c 100644 --- a/mm/vmscan.c +++ b/mm/vmscan.c @@ -1294,6 +1294,7 @@ static unsigned long shrink_page_list(struct list_head *page_list, } count_vm_event(PGLAZYFREED); + count_memcg_page_event(page, PGLAZYFREED); } else if (!mapping || !__remove_mapping(mapping, page, true)) goto keep_locked; /* @@ -1323,6 +1324,7 @@ activate_locked: if (!PageMlocked(page)) { SetPageActive(page); pgactivate++; + count_memcg_page_event(page, PGACTIVATE); } keep_locked: unlock_page(page); @@ -1762,11 +1764,16 @@ shrink_inactive_list(unsigned long nr_to_scan, struct lruvec *lruvec, __mod_node_page_state(pgdat, NR_ISOLATED_ANON + file, nr_taken); reclaim_stat->recent_scanned[file] += nr_taken; - if (global_reclaim(sc)) { - if (current_is_kswapd()) + if (current_is_kswapd()) { + if (global_reclaim(sc)) __count_vm_events(PGSCAN_KSWAPD, nr_scanned); - else + count_memcg_events(lruvec_memcg(lruvec), PGSCAN_KSWAPD, + nr_scanned); + } else { + if (global_reclaim(sc)) __count_vm_events(PGSCAN_DIRECT, nr_scanned); + count_memcg_events(lruvec_memcg(lruvec), PGSCAN_DIRECT, + nr_scanned); } spin_unlock_irq(&pgdat->lru_lock); @@ -1778,11 +1785,16 @@ shrink_inactive_list(unsigned long nr_to_scan, struct lruvec *lruvec, spin_lock_irq(&pgdat->lru_lock); - if (global_reclaim(sc)) { - if (current_is_kswapd()) + if (current_is_kswapd()) { + if (global_reclaim(sc)) __count_vm_events(PGSTEAL_KSWAPD, nr_reclaimed); - else + count_memcg_events(lruvec_memcg(lruvec), PGSTEAL_KSWAPD, + nr_reclaimed); + } else { + if (global_reclaim(sc)) __count_vm_events(PGSTEAL_DIRECT, nr_reclaimed); + count_memcg_events(lruvec_memcg(lruvec), PGSTEAL_DIRECT, + nr_reclaimed); } putback_inactive_pages(lruvec, &page_list); @@ -1927,8 +1939,11 @@ static unsigned move_active_pages_to_lru(struct lruvec *lruvec, } } - if (!is_active_lru(lru)) + if (!is_active_lru(lru)) { __count_vm_events(PGDEACTIVATE, nr_moved); + count_memcg_events(lruvec_memcg(lruvec), PGDEACTIVATE, + nr_moved); + } return nr_moved; } @@ -1966,6 +1981,7 @@ static void shrink_active_list(unsigned long nr_to_scan, reclaim_stat->recent_scanned[file] += nr_taken; __count_vm_events(PGREFILL, nr_scanned); + count_memcg_events(lruvec_memcg(lruvec), PGREFILL, nr_scanned); spin_unlock_irq(&pgdat->lru_lock); -- cgit v1.2.3 From 385386cff4c6f047907655e05791d88198c4c523 Mon Sep 17 00:00:00 2001 From: Johannes Weiner Date: Thu, 6 Jul 2017 15:40:43 -0700 Subject: mm: vmstat: move slab statistics from zone to node counters Patch series "mm: per-lruvec slab stats" Josef is working on a new approach to balancing slab caches and the page cache. For this to work, he needs slab cache statistics on the lruvec level. These patches implement that by adding infrastructure that allows updating and reading generic VM stat items per lruvec, then switches some existing VM accounting sites, including the slab accounting ones, to this new cgroup-aware API. I'll follow up with more patches on this, because there is actually substantial simplification that can be done to the memory controller when we replace private memcg accounting with making the existing VM accounting sites cgroup-aware. But this is enough for Josef to base his slab reclaim work on, so here goes. This patch (of 5): To re-implement slab cache vs. page cache balancing, we'll need the slab counters at the lruvec level, which, ever since lru reclaim was moved from the zone to the node, is the intersection of the node, not the zone, and the memcg. We could retain the per-zone counters for when the page allocator dumps its memory information on failures, and have counters on both levels - which on all but NUMA node 0 is usually redundant. But let's keep it simple for now and just move them. If anybody complains we can restore the per-zone counters. [hannes@cmpxchg.org: fix oops] Link: http://lkml.kernel.org/r/20170605183511.GA8915@cmpxchg.org Link: http://lkml.kernel.org/r/20170530181724.27197-3-hannes@cmpxchg.org Signed-off-by: Johannes Weiner Cc: Josef Bacik Cc: Michal Hocko Cc: Vladimir Davydov Cc: Rik van Riel Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- drivers/base/node.c | 10 +++++----- include/linux/mmzone.h | 4 ++-- mm/page_alloc.c | 7 +++---- mm/slab.c | 8 ++++---- mm/slub.c | 4 ++-- mm/vmscan.c | 2 +- mm/vmstat.c | 4 ++-- 7 files changed, 19 insertions(+), 20 deletions(-) (limited to 'mm/vmscan.c') diff --git a/drivers/base/node.c b/drivers/base/node.c index 1da0005341a1..6b1ee371ee97 100644 --- a/drivers/base/node.c +++ b/drivers/base/node.c @@ -129,11 +129,11 @@ static ssize_t node_read_meminfo(struct device *dev, nid, K(node_page_state(pgdat, NR_UNSTABLE_NFS)), nid, K(sum_zone_node_page_state(nid, NR_BOUNCE)), nid, K(node_page_state(pgdat, NR_WRITEBACK_TEMP)), - nid, K(sum_zone_node_page_state(nid, NR_SLAB_RECLAIMABLE) + - sum_zone_node_page_state(nid, NR_SLAB_UNRECLAIMABLE)), - nid, K(sum_zone_node_page_state(nid, NR_SLAB_RECLAIMABLE)), + nid, K(node_page_state(pgdat, NR_SLAB_RECLAIMABLE) + + node_page_state(pgdat, NR_SLAB_UNRECLAIMABLE)), + nid, K(node_page_state(pgdat, NR_SLAB_RECLAIMABLE)), #ifdef CONFIG_TRANSPARENT_HUGEPAGE - nid, K(sum_zone_node_page_state(nid, NR_SLAB_UNRECLAIMABLE)), + nid, K(node_page_state(pgdat, NR_SLAB_UNRECLAIMABLE)), nid, K(node_page_state(pgdat, NR_ANON_THPS) * HPAGE_PMD_NR), nid, K(node_page_state(pgdat, NR_SHMEM_THPS) * @@ -141,7 +141,7 @@ static ssize_t node_read_meminfo(struct device *dev, nid, K(node_page_state(pgdat, NR_SHMEM_PMDMAPPED) * HPAGE_PMD_NR)); #else - nid, K(sum_zone_node_page_state(nid, NR_SLAB_UNRECLAIMABLE))); + nid, K(node_page_state(pgdat, NR_SLAB_UNRECLAIMABLE))); #endif n += hugetlb_report_node_meminfo(nid, buf + n); return n; diff --git a/include/linux/mmzone.h b/include/linux/mmzone.h index abc1641011f2..7e8f100cb56d 100644 --- a/include/linux/mmzone.h +++ b/include/linux/mmzone.h @@ -125,8 +125,6 @@ enum zone_stat_item { NR_ZONE_UNEVICTABLE, NR_ZONE_WRITE_PENDING, /* Count of dirty, writeback and unstable pages */ NR_MLOCK, /* mlock()ed pages found and moved off LRU */ - NR_SLAB_RECLAIMABLE, - NR_SLAB_UNRECLAIMABLE, NR_PAGETABLE, /* used for pagetables */ NR_KERNEL_STACK_KB, /* measured in KiB */ /* Second 128 byte cacheline */ @@ -152,6 +150,8 @@ enum node_stat_item { NR_INACTIVE_FILE, /* " " " " " */ NR_ACTIVE_FILE, /* " " " " " */ NR_UNEVICTABLE, /* " " " " " */ + NR_SLAB_RECLAIMABLE, + NR_SLAB_UNRECLAIMABLE, NR_ISOLATED_ANON, /* Temporary isolated pages from anon lru */ NR_ISOLATED_FILE, /* Temporary isolated pages from file lru */ WORKINGSET_REFAULT, diff --git a/mm/page_alloc.c b/mm/page_alloc.c index 8aa860017d66..a35add8d7c0b 100644 --- a/mm/page_alloc.c +++ b/mm/page_alloc.c @@ -4643,8 +4643,6 @@ void show_free_areas(unsigned int filter, nodemask_t *nodemask) " present:%lukB" " managed:%lukB" " mlocked:%lukB" - " slab_reclaimable:%lukB" - " slab_unreclaimable:%lukB" " kernel_stack:%lukB" " pagetables:%lukB" " bounce:%lukB" @@ -4666,8 +4664,6 @@ void show_free_areas(unsigned int filter, nodemask_t *nodemask) K(zone->present_pages), K(zone->managed_pages), K(zone_page_state(zone, NR_MLOCK)), - K(zone_page_state(zone, NR_SLAB_RECLAIMABLE)), - K(zone_page_state(zone, NR_SLAB_UNRECLAIMABLE)), zone_page_state(zone, NR_KERNEL_STACK_KB), K(zone_page_state(zone, NR_PAGETABLE)), K(zone_page_state(zone, NR_BOUNCE)), @@ -5153,6 +5149,7 @@ static void build_zonelists(pg_data_t *pgdat) */ static void setup_pageset(struct per_cpu_pageset *p, unsigned long batch); static DEFINE_PER_CPU(struct per_cpu_pageset, boot_pageset); +static DEFINE_PER_CPU(struct per_cpu_nodestat, boot_nodestats); static void setup_zone_pageset(struct zone *zone); /* @@ -6053,6 +6050,8 @@ static void __paginginit free_area_init_core(struct pglist_data *pgdat) spin_lock_init(&pgdat->lru_lock); lruvec_init(node_lruvec(pgdat)); + pgdat->per_cpu_nodestats = &boot_nodestats; + for (j = 0; j < MAX_NR_ZONES; j++) { struct zone *zone = pgdat->node_zones + j; unsigned long size, realsize, freesize, memmap_pages; diff --git a/mm/slab.c b/mm/slab.c index 503317188926..a38634ed478e 100644 --- a/mm/slab.c +++ b/mm/slab.c @@ -1425,10 +1425,10 @@ static struct page *kmem_getpages(struct kmem_cache *cachep, gfp_t flags, nr_pages = (1 << cachep->gfporder); if (cachep->flags & SLAB_RECLAIM_ACCOUNT) - add_zone_page_state(page_zone(page), + add_node_page_state(page_pgdat(page), NR_SLAB_RECLAIMABLE, nr_pages); else - add_zone_page_state(page_zone(page), + add_node_page_state(page_pgdat(page), NR_SLAB_UNRECLAIMABLE, nr_pages); __SetPageSlab(page); @@ -1459,10 +1459,10 @@ static void kmem_freepages(struct kmem_cache *cachep, struct page *page) kmemcheck_free_shadow(page, order); if (cachep->flags & SLAB_RECLAIM_ACCOUNT) - sub_zone_page_state(page_zone(page), + sub_node_page_state(page_pgdat(page), NR_SLAB_RECLAIMABLE, nr_freed); else - sub_zone_page_state(page_zone(page), + sub_node_page_state(page_pgdat(page), NR_SLAB_UNRECLAIMABLE, nr_freed); BUG_ON(!PageSlab(page)); diff --git a/mm/slub.c b/mm/slub.c index 388f66d1da5e..aa5aa6bfb35e 100644 --- a/mm/slub.c +++ b/mm/slub.c @@ -1615,7 +1615,7 @@ out: if (!page) return NULL; - mod_zone_page_state(page_zone(page), + mod_node_page_state(page_pgdat(page), (s->flags & SLAB_RECLAIM_ACCOUNT) ? NR_SLAB_RECLAIMABLE : NR_SLAB_UNRECLAIMABLE, 1 << oo_order(oo)); @@ -1655,7 +1655,7 @@ static void __free_slab(struct kmem_cache *s, struct page *page) kmemcheck_free_shadow(page, compound_order(page)); - mod_zone_page_state(page_zone(page), + mod_node_page_state(page_pgdat(page), (s->flags & SLAB_RECLAIM_ACCOUNT) ? NR_SLAB_RECLAIMABLE : NR_SLAB_UNRECLAIMABLE, -pages); diff --git a/mm/vmscan.c b/mm/vmscan.c index 7d3c6c59897c..9e95fafc026b 100644 --- a/mm/vmscan.c +++ b/mm/vmscan.c @@ -3874,7 +3874,7 @@ int node_reclaim(struct pglist_data *pgdat, gfp_t gfp_mask, unsigned int order) * unmapped file backed pages. */ if (node_pagecache_reclaimable(pgdat) <= pgdat->min_unmapped_pages && - sum_zone_node_page_state(pgdat->node_id, NR_SLAB_RECLAIMABLE) <= pgdat->min_slab_pages) + node_page_state(pgdat, NR_SLAB_RECLAIMABLE) <= pgdat->min_slab_pages) return NODE_RECLAIM_FULL; /* diff --git a/mm/vmstat.c b/mm/vmstat.c index 46281825c710..744ceaeb42a0 100644 --- a/mm/vmstat.c +++ b/mm/vmstat.c @@ -928,8 +928,6 @@ const char * const vmstat_text[] = { "nr_zone_unevictable", "nr_zone_write_pending", "nr_mlock", - "nr_slab_reclaimable", - "nr_slab_unreclaimable", "nr_page_table_pages", "nr_kernel_stack", "nr_bounce", @@ -952,6 +950,8 @@ const char * const vmstat_text[] = { "nr_inactive_file", "nr_active_file", "nr_unevictable", + "nr_slab_reclaimable", + "nr_slab_unreclaimable", "nr_isolated_anon", "nr_isolated_file", "workingset_refault", -- cgit v1.2.3