From 9625456cc76391b7f3f2809579126542a8ed4d39 Mon Sep 17 00:00:00 2001 From: Shaohua Li Date: Tue, 3 Oct 2017 16:15:32 -0700 Subject: mm: fix data corruption caused by lazyfree page MADV_FREE clears pte dirty bit and then marks the page lazyfree (clear SwapBacked). There is no lock to prevent the page is added to swap cache between these two steps by page reclaim. If page reclaim finds such page, it will simply add the page to swap cache without pageout the page to swap because the page is marked as clean. Next time, page fault will read data from the swap slot which doesn't have the original data, so we have a data corruption. To fix issue, we mark the page dirty and pageout the page. However, we shouldn't dirty all pages which is clean and in swap cache. swapin page is swap cache and clean too. So we only dirty page which is added into swap cache in page reclaim, which shouldn't be swapin page. As Minchan suggested, simply dirty the page in add_to_swap can do the job. Fixes: 802a3a92ad7a ("mm: reclaim MADV_FREE pages") Link: http://lkml.kernel.org/r/08c84256b007bf3f63c91d94383bd9eb6fee2daa.1506446061.git.shli@fb.com Signed-off-by: Shaohua Li Reported-by: Artem Savkov Acked-by: Michal Hocko Acked-by: Minchan Kim Cc: Johannes Weiner Cc: Hillf Danton Cc: Hugh Dickins Cc: Rik van Riel Cc: Mel Gorman Cc: [4.12+] Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- mm/swap_state.c | 11 +++++++++++ 1 file changed, 11 insertions(+) (limited to 'mm/swap_state.c') diff --git a/mm/swap_state.c b/mm/swap_state.c index 71ce2d1ccbf7..ed91091d1e68 100644 --- a/mm/swap_state.c +++ b/mm/swap_state.c @@ -242,6 +242,17 @@ int add_to_swap(struct page *page) * clear SWAP_HAS_CACHE flag. */ goto fail; + /* + * Normally the page will be dirtied in unmap because its pte should be + * dirty. A special case is MADV_FREE page. The page'e pte could have + * dirty bit cleared but the page's SwapBacked bit is still set because + * clearing the dirty bit and SwapBacked bit has no lock protected. For + * such page, unmap will not set dirty bit for it, so page reclaim will + * not write the page out. This can cause data corruption when the page + * is swap in later. Always setting the dirty bit for the page solves + * the problem. + */ + set_page_dirty(page); return 1; -- cgit v1.2.3 From 61b639723be5a9fc4812d5d85cb769589afa5a38 Mon Sep 17 00:00:00 2001 From: Huang Ying Date: Fri, 13 Oct 2017 15:58:29 -0700 Subject: mm, swap: use page-cluster as max window of VMA based swap readahead When the VMA based swap readahead was introduced, a new knob /sys/kernel/mm/swap/vma_ra_max_order was added as the max window of VMA swap readahead. This is to make it possible to use different max window for VMA based readahead and original physical readahead. But Minchan Kim pointed out that this will cause a regression because setting page-cluster sysctl to zero cannot disable swap readahead with the change. To fix the regression, the page-cluster sysctl is used as the max window of both the VMA based swap readahead and original physical swap readahead. If more fine grained control is needed in the future, more knobs can be added as the subordinate knobs of the page-cluster sysctl. The vma_ra_max_order knob is deleted. Because the knob was introduced in v4.14-rc1, and this patch is targeting being merged before v4.14 releasing, there should be no existing users of this newly added ABI. Link: http://lkml.kernel.org/r/20171011070847.16003-1-ying.huang@intel.com Fixes: ec560175c0b6fce ("mm, swap: VMA based swap readahead") Signed-off-by: "Huang, Ying" Reported-by: Minchan Kim Acked-by: Minchan Kim Acked-by: Michal Hocko Cc: Johannes Weiner Cc: Rik van Riel Cc: Shaohua Li Cc: Hugh Dickins Cc: Fengguang Wu Cc: Tim Chen Cc: Dave Hansen Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- Documentation/ABI/testing/sysfs-kernel-mm-swap | 10 ------- mm/swap_state.c | 41 +++++--------------------- 2 files changed, 7 insertions(+), 44 deletions(-) (limited to 'mm/swap_state.c') diff --git a/Documentation/ABI/testing/sysfs-kernel-mm-swap b/Documentation/ABI/testing/sysfs-kernel-mm-swap index 587db52084c7..94672016c268 100644 --- a/Documentation/ABI/testing/sysfs-kernel-mm-swap +++ b/Documentation/ABI/testing/sysfs-kernel-mm-swap @@ -14,13 +14,3 @@ Description: Enable/disable VMA based swap readahead. still used for tmpfs etc. other users. If set to false, the global swap readahead algorithm will be used for all swappable pages. - -What: /sys/kernel/mm/swap/vma_ra_max_order -Date: August 2017 -Contact: Linux memory management mailing list -Description: The max readahead size in order for VMA based swap readahead - - VMA based swap readahead algorithm will readahead at - most 1 << max_order pages for each readahead. The - real readahead size for each readahead will be scaled - according to the estimation algorithm. diff --git a/mm/swap_state.c b/mm/swap_state.c index ed91091d1e68..05b6803f0cce 100644 --- a/mm/swap_state.c +++ b/mm/swap_state.c @@ -39,10 +39,6 @@ struct address_space *swapper_spaces[MAX_SWAPFILES]; static unsigned int nr_swapper_spaces[MAX_SWAPFILES]; bool swap_vma_readahead = true; -#define SWAP_RA_MAX_ORDER_DEFAULT 3 - -static int swap_ra_max_order = SWAP_RA_MAX_ORDER_DEFAULT; - #define SWAP_RA_WIN_SHIFT (PAGE_SHIFT / 2) #define SWAP_RA_HITS_MASK ((1UL << SWAP_RA_WIN_SHIFT) - 1) #define SWAP_RA_HITS_MAX SWAP_RA_HITS_MASK @@ -664,6 +660,13 @@ struct page *swap_readahead_detect(struct vm_fault *vmf, pte_t *tpte; #endif + max_win = 1 << min_t(unsigned int, READ_ONCE(page_cluster), + SWAP_RA_ORDER_CEILING); + if (max_win == 1) { + swap_ra->win = 1; + return NULL; + } + faddr = vmf->address; entry = pte_to_swp_entry(vmf->orig_pte); if ((unlikely(non_swap_entry(entry)))) @@ -672,12 +675,6 @@ struct page *swap_readahead_detect(struct vm_fault *vmf, if (page) return page; - max_win = 1 << READ_ONCE(swap_ra_max_order); - if (max_win == 1) { - swap_ra->win = 1; - return NULL; - } - fpfn = PFN_DOWN(faddr); swap_ra_info = GET_SWAP_RA_VAL(vma); pfn = PFN_DOWN(SWAP_RA_ADDR(swap_ra_info)); @@ -786,32 +783,8 @@ static struct kobj_attribute vma_ra_enabled_attr = __ATTR(vma_ra_enabled, 0644, vma_ra_enabled_show, vma_ra_enabled_store); -static ssize_t vma_ra_max_order_show(struct kobject *kobj, - struct kobj_attribute *attr, char *buf) -{ - return sprintf(buf, "%d\n", swap_ra_max_order); -} -static ssize_t vma_ra_max_order_store(struct kobject *kobj, - struct kobj_attribute *attr, - const char *buf, size_t count) -{ - int err, v; - - err = kstrtoint(buf, 10, &v); - if (err || v > SWAP_RA_ORDER_CEILING || v <= 0) - return -EINVAL; - - swap_ra_max_order = v; - - return count; -} -static struct kobj_attribute vma_ra_max_order_attr = - __ATTR(vma_ra_max_order, 0644, vma_ra_max_order_show, - vma_ra_max_order_store); - static struct attribute *swap_attrs[] = { &vma_ra_enabled_attr.attr, - &vma_ra_max_order_attr.attr, NULL, }; -- cgit v1.2.3