summaryrefslogtreecommitdiffstats
path: root/mm
diff options
context:
space:
mode:
authorIngo Molnar <mingo@kernel.org>2017-08-11 13:51:59 +0200
committerIngo Molnar <mingo@kernel.org>2017-08-11 13:51:59 +0200
commit040cca3ab2f6f8b8d26e0e4965abea2b9aa14818 (patch)
tree25709ba52ee06fccf4bfbfbf2897bb8cf86da828 /mm
parentef0758dd0fd70b98b889af26e27f003656952db8 (diff)
parentb2dbdf2ca1d2803e9cdc46a94554c4a39ffb235a (diff)
downloadtalos-obmc-linux-040cca3ab2f6f8b8d26e0e4965abea2b9aa14818.tar.gz
talos-obmc-linux-040cca3ab2f6f8b8d26e0e4965abea2b9aa14818.zip
Merge branch 'linus' into locking/core, to resolve conflicts
Conflicts: include/linux/mm_types.h mm/huge_memory.c I removed the smp_mb__before_spinlock() like the following commit does: 8b1b436dd1cc ("mm, locking: Rework {set,clear,mm}_tlb_flush_pending()") and fixed up the affected commits. Signed-off-by: Ingo Molnar <mingo@kernel.org>
Diffstat (limited to 'mm')
-rw-r--r--mm/balloon_compaction.c2
-rw-r--r--mm/debug.c6
-rw-r--r--mm/huge_memory.c7
-rw-r--r--mm/hugetlb.c2
-rw-r--r--mm/ksm.c3
-rw-r--r--mm/memory.c42
-rw-r--r--mm/mprotect.c4
-rw-r--r--mm/page_alloc.c11
-rw-r--r--mm/rmap.c52
-rw-r--r--mm/shmem.c12
-rw-r--r--mm/util.c2
11 files changed, 96 insertions, 47 deletions
diff --git a/mm/balloon_compaction.c b/mm/balloon_compaction.c
index 9075aa54e955..b06d9fe23a28 100644
--- a/mm/balloon_compaction.c
+++ b/mm/balloon_compaction.c
@@ -24,7 +24,7 @@ struct page *balloon_page_enqueue(struct balloon_dev_info *b_dev_info)
{
unsigned long flags;
struct page *page = alloc_page(balloon_mapping_gfp_mask() |
- __GFP_NOMEMALLOC | __GFP_NORETRY | __GFP_ZERO);
+ __GFP_NOMEMALLOC | __GFP_NORETRY);
if (!page)
return NULL;
diff --git a/mm/debug.c b/mm/debug.c
index db1cd26d8752..5715448ab0b5 100644
--- a/mm/debug.c
+++ b/mm/debug.c
@@ -124,9 +124,7 @@ void dump_mm(const struct mm_struct *mm)
#ifdef CONFIG_NUMA_BALANCING
"numa_next_scan %lu numa_scan_offset %lu numa_scan_seq %d\n"
#endif
-#if defined(CONFIG_NUMA_BALANCING) || defined(CONFIG_COMPACTION)
"tlb_flush_pending %d\n"
-#endif
"def_flags: %#lx(%pGv)\n",
mm, mm->mmap, mm->vmacache_seqnum, mm->task_size,
@@ -158,9 +156,7 @@ void dump_mm(const struct mm_struct *mm)
#ifdef CONFIG_NUMA_BALANCING
mm->numa_next_scan, mm->numa_scan_offset, mm->numa_scan_seq,
#endif
-#if defined(CONFIG_NUMA_BALANCING) || defined(CONFIG_COMPACTION)
- mm->tlb_flush_pending,
-#endif
+ atomic_read(&mm->tlb_flush_pending),
mm->def_flags, &mm->def_flags
);
}
diff --git a/mm/huge_memory.c b/mm/huge_memory.c
index c76a720b936b..ce883459e246 100644
--- a/mm/huge_memory.c
+++ b/mm/huge_memory.c
@@ -1497,6 +1497,13 @@ int do_huge_pmd_numa_page(struct vm_fault *vmf, pmd_t pmd)
}
/*
+ * The page_table_lock above provides a memory barrier
+ * with change_protection_range.
+ */
+ if (mm_tlb_flush_pending(vma->vm_mm))
+ flush_tlb_range(vma, haddr, haddr + HPAGE_PMD_SIZE);
+
+ /*
* Since we took the NUMA fault, we must have observed the !accessible
* bit. Make sure all other CPUs agree with that, to avoid them
* modifying the page we're about to migrate.
diff --git a/mm/hugetlb.c b/mm/hugetlb.c
index a1a0ac0ad6f6..31e207cb399b 100644
--- a/mm/hugetlb.c
+++ b/mm/hugetlb.c
@@ -4062,9 +4062,9 @@ out:
return ret;
out_release_unlock:
spin_unlock(ptl);
-out_release_nounlock:
if (vm_shared)
unlock_page(page);
+out_release_nounlock:
put_page(page);
goto out;
}
diff --git a/mm/ksm.c b/mm/ksm.c
index 4dc92f138786..db20f8436bc3 100644
--- a/mm/ksm.c
+++ b/mm/ksm.c
@@ -1038,7 +1038,8 @@ static int write_protect_page(struct vm_area_struct *vma, struct page *page,
goto out_unlock;
if (pte_write(*pvmw.pte) || pte_dirty(*pvmw.pte) ||
- (pte_protnone(*pvmw.pte) && pte_savedwrite(*pvmw.pte))) {
+ (pte_protnone(*pvmw.pte) && pte_savedwrite(*pvmw.pte)) ||
+ mm_tlb_flush_pending(mm)) {
pte_t entry;
swapped = PageSwapCache(page);
diff --git a/mm/memory.c b/mm/memory.c
index f65beaad319b..e158f7ac6730 100644
--- a/mm/memory.c
+++ b/mm/memory.c
@@ -215,12 +215,8 @@ static bool tlb_next_batch(struct mmu_gather *tlb)
return true;
}
-/* tlb_gather_mmu
- * Called to initialize an (on-stack) mmu_gather structure for page-table
- * tear-down from @mm. The @fullmm argument is used when @mm is without
- * users and we're going to destroy the full address space (exit/execve).
- */
-void tlb_gather_mmu(struct mmu_gather *tlb, struct mm_struct *mm, unsigned long start, unsigned long end)
+void arch_tlb_gather_mmu(struct mmu_gather *tlb, struct mm_struct *mm,
+ unsigned long start, unsigned long end)
{
tlb->mm = mm;
@@ -275,10 +271,14 @@ void tlb_flush_mmu(struct mmu_gather *tlb)
* Called at the end of the shootdown operation to free up any resources
* that were required.
*/
-void tlb_finish_mmu(struct mmu_gather *tlb, unsigned long start, unsigned long end)
+void arch_tlb_finish_mmu(struct mmu_gather *tlb,
+ unsigned long start, unsigned long end, bool force)
{
struct mmu_gather_batch *batch, *next;
+ if (force)
+ __tlb_adjust_range(tlb, start, end - start);
+
tlb_flush_mmu(tlb);
/* keep the page table cache within bounds */
@@ -398,6 +398,34 @@ void tlb_remove_table(struct mmu_gather *tlb, void *table)
#endif /* CONFIG_HAVE_RCU_TABLE_FREE */
+/* tlb_gather_mmu
+ * Called to initialize an (on-stack) mmu_gather structure for page-table
+ * tear-down from @mm. The @fullmm argument is used when @mm is without
+ * users and we're going to destroy the full address space (exit/execve).
+ */
+void tlb_gather_mmu(struct mmu_gather *tlb, struct mm_struct *mm,
+ unsigned long start, unsigned long end)
+{
+ arch_tlb_gather_mmu(tlb, mm, start, end);
+ inc_tlb_flush_pending(tlb->mm);
+}
+
+void tlb_finish_mmu(struct mmu_gather *tlb,
+ unsigned long start, unsigned long end)
+{
+ /*
+ * If there are parallel threads are doing PTE changes on same range
+ * under non-exclusive lock(e.g., mmap_sem read-side) but defer TLB
+ * flush by batching, a thread has stable TLB entry can fail to flush
+ * the TLB by observing pte_none|!pte_dirty, for example so flush TLB
+ * forcefully if we detect parallel PTE batching threads.
+ */
+ bool force = mm_tlb_flush_nested(tlb->mm);
+
+ arch_tlb_finish_mmu(tlb, start, end, force);
+ dec_tlb_flush_pending(tlb->mm);
+}
+
/*
* Note: this doesn't free the actual pages themselves. That
* has been handled earlier when unmapping all the memory regions.
diff --git a/mm/mprotect.c b/mm/mprotect.c
index 4180ad8cc9c5..bd0f409922cb 100644
--- a/mm/mprotect.c
+++ b/mm/mprotect.c
@@ -244,7 +244,7 @@ static unsigned long change_protection_range(struct vm_area_struct *vma,
BUG_ON(addr >= end);
pgd = pgd_offset(mm, addr);
flush_cache_range(vma, addr, end);
- set_tlb_flush_pending(mm);
+ inc_tlb_flush_pending(mm);
do {
next = pgd_addr_end(addr, end);
if (pgd_none_or_clear_bad(pgd))
@@ -256,7 +256,7 @@ static unsigned long change_protection_range(struct vm_area_struct *vma,
/* Only flush the TLB if we actually modified any entries: */
if (pages)
flush_tlb_range(vma, start, end);
- clear_tlb_flush_pending(mm);
+ dec_tlb_flush_pending(mm);
return pages;
}
diff --git a/mm/page_alloc.c b/mm/page_alloc.c
index c20d89601802..6acf612fdd99 100644
--- a/mm/page_alloc.c
+++ b/mm/page_alloc.c
@@ -4501,8 +4501,9 @@ long si_mem_available(void)
* Part of the reclaimable slab consists of items that are in use,
* and cannot be freed. Cap this estimate at the low watermark.
*/
- available += global_page_state(NR_SLAB_RECLAIMABLE) -
- min(global_page_state(NR_SLAB_RECLAIMABLE) / 2, wmark_low);
+ available += global_node_page_state(NR_SLAB_RECLAIMABLE) -
+ min(global_node_page_state(NR_SLAB_RECLAIMABLE) / 2,
+ wmark_low);
if (available < 0)
available = 0;
@@ -4645,8 +4646,8 @@ void show_free_areas(unsigned int filter, nodemask_t *nodemask)
global_node_page_state(NR_FILE_DIRTY),
global_node_page_state(NR_WRITEBACK),
global_node_page_state(NR_UNSTABLE_NFS),
- global_page_state(NR_SLAB_RECLAIMABLE),
- global_page_state(NR_SLAB_UNRECLAIMABLE),
+ global_node_page_state(NR_SLAB_RECLAIMABLE),
+ global_node_page_state(NR_SLAB_UNRECLAIMABLE),
global_node_page_state(NR_FILE_MAPPED),
global_node_page_state(NR_SHMEM),
global_page_state(NR_PAGETABLE),
@@ -7711,7 +7712,7 @@ int alloc_contig_range(unsigned long start, unsigned long end,
/* Make sure the range is really isolated. */
if (test_pages_isolated(outer_start, end, false)) {
- pr_info("%s: [%lx, %lx) PFNs busy\n",
+ pr_info_ratelimited("%s: [%lx, %lx) PFNs busy\n",
__func__, outer_start, end);
ret = -EBUSY;
goto done;
diff --git a/mm/rmap.c b/mm/rmap.c
index c8993c63eb25..c1286d47aa1f 100644
--- a/mm/rmap.c
+++ b/mm/rmap.c
@@ -888,10 +888,10 @@ static bool page_mkclean_one(struct page *page, struct vm_area_struct *vma,
.flags = PVMW_SYNC,
};
int *cleaned = arg;
+ bool invalidation_needed = false;
while (page_vma_mapped_walk(&pvmw)) {
int ret = 0;
- address = pvmw.address;
if (pvmw.pte) {
pte_t entry;
pte_t *pte = pvmw.pte;
@@ -899,11 +899,11 @@ static bool page_mkclean_one(struct page *page, struct vm_area_struct *vma,
if (!pte_dirty(*pte) && !pte_write(*pte))
continue;
- flush_cache_page(vma, address, pte_pfn(*pte));
- entry = ptep_clear_flush(vma, address, pte);
+ flush_cache_page(vma, pvmw.address, pte_pfn(*pte));
+ entry = ptep_clear_flush(vma, pvmw.address, pte);
entry = pte_wrprotect(entry);
entry = pte_mkclean(entry);
- set_pte_at(vma->vm_mm, address, pte, entry);
+ set_pte_at(vma->vm_mm, pvmw.address, pte, entry);
ret = 1;
} else {
#ifdef CONFIG_TRANSPARENT_HUGE_PAGECACHE
@@ -913,11 +913,11 @@ static bool page_mkclean_one(struct page *page, struct vm_area_struct *vma,
if (!pmd_dirty(*pmd) && !pmd_write(*pmd))
continue;
- flush_cache_page(vma, address, page_to_pfn(page));
- entry = pmdp_huge_clear_flush(vma, address, pmd);
+ flush_cache_page(vma, pvmw.address, page_to_pfn(page));
+ entry = pmdp_huge_clear_flush(vma, pvmw.address, pmd);
entry = pmd_wrprotect(entry);
entry = pmd_mkclean(entry);
- set_pmd_at(vma->vm_mm, address, pmd, entry);
+ set_pmd_at(vma->vm_mm, pvmw.address, pmd, entry);
ret = 1;
#else
/* unexpected pmd-mapped page? */
@@ -926,11 +926,16 @@ static bool page_mkclean_one(struct page *page, struct vm_area_struct *vma,
}
if (ret) {
- mmu_notifier_invalidate_page(vma->vm_mm, address);
(*cleaned)++;
+ invalidation_needed = true;
}
}
+ if (invalidation_needed) {
+ mmu_notifier_invalidate_range(vma->vm_mm, address,
+ address + (1UL << compound_order(page)));
+ }
+
return true;
}
@@ -1323,7 +1328,7 @@ static bool try_to_unmap_one(struct page *page, struct vm_area_struct *vma,
};
pte_t pteval;
struct page *subpage;
- bool ret = true;
+ bool ret = true, invalidation_needed = false;
enum ttu_flags flags = (enum ttu_flags)arg;
/* munlock has nothing to gain from examining un-locked vmas */
@@ -1363,11 +1368,9 @@ static bool try_to_unmap_one(struct page *page, struct vm_area_struct *vma,
VM_BUG_ON_PAGE(!pvmw.pte, page);
subpage = page - page_to_pfn(page) + pte_pfn(*pvmw.pte);
- address = pvmw.address;
-
if (!(flags & TTU_IGNORE_ACCESS)) {
- if (ptep_clear_flush_young_notify(vma, address,
+ if (ptep_clear_flush_young_notify(vma, pvmw.address,
pvmw.pte)) {
ret = false;
page_vma_mapped_walk_done(&pvmw);
@@ -1376,7 +1379,7 @@ static bool try_to_unmap_one(struct page *page, struct vm_area_struct *vma,
}
/* Nuke the page table entry. */
- flush_cache_page(vma, address, pte_pfn(*pvmw.pte));
+ flush_cache_page(vma, pvmw.address, pte_pfn(*pvmw.pte));
if (should_defer_flush(mm, flags)) {
/*
* We clear the PTE but do not flush so potentially
@@ -1386,11 +1389,12 @@ static bool try_to_unmap_one(struct page *page, struct vm_area_struct *vma,
* transition on a cached TLB entry is written through
* and traps if the PTE is unmapped.
*/
- pteval = ptep_get_and_clear(mm, address, pvmw.pte);
+ pteval = ptep_get_and_clear(mm, pvmw.address,
+ pvmw.pte);
set_tlb_ubc_flush_pending(mm, pte_dirty(pteval));
} else {
- pteval = ptep_clear_flush(vma, address, pvmw.pte);
+ pteval = ptep_clear_flush(vma, pvmw.address, pvmw.pte);
}
/* Move the dirty bit to the page. Now the pte is gone. */
@@ -1405,12 +1409,12 @@ static bool try_to_unmap_one(struct page *page, struct vm_area_struct *vma,
if (PageHuge(page)) {
int nr = 1 << compound_order(page);
hugetlb_count_sub(nr, mm);
- set_huge_swap_pte_at(mm, address,
+ set_huge_swap_pte_at(mm, pvmw.address,
pvmw.pte, pteval,
vma_mmu_pagesize(vma));
} else {
dec_mm_counter(mm, mm_counter(page));
- set_pte_at(mm, address, pvmw.pte, pteval);
+ set_pte_at(mm, pvmw.address, pvmw.pte, pteval);
}
} else if (pte_unused(pteval)) {
@@ -1434,7 +1438,7 @@ static bool try_to_unmap_one(struct page *page, struct vm_area_struct *vma,
swp_pte = swp_entry_to_pte(entry);
if (pte_soft_dirty(pteval))
swp_pte = pte_swp_mksoft_dirty(swp_pte);
- set_pte_at(mm, address, pvmw.pte, swp_pte);
+ set_pte_at(mm, pvmw.address, pvmw.pte, swp_pte);
} else if (PageAnon(page)) {
swp_entry_t entry = { .val = page_private(subpage) };
pte_t swp_pte;
@@ -1460,7 +1464,7 @@ static bool try_to_unmap_one(struct page *page, struct vm_area_struct *vma,
* If the page was redirtied, it cannot be
* discarded. Remap the page to page table.
*/
- set_pte_at(mm, address, pvmw.pte, pteval);
+ set_pte_at(mm, pvmw.address, pvmw.pte, pteval);
SetPageSwapBacked(page);
ret = false;
page_vma_mapped_walk_done(&pvmw);
@@ -1468,7 +1472,7 @@ static bool try_to_unmap_one(struct page *page, struct vm_area_struct *vma,
}
if (swap_duplicate(entry) < 0) {
- set_pte_at(mm, address, pvmw.pte, pteval);
+ set_pte_at(mm, pvmw.address, pvmw.pte, pteval);
ret = false;
page_vma_mapped_walk_done(&pvmw);
break;
@@ -1484,14 +1488,18 @@ static bool try_to_unmap_one(struct page *page, struct vm_area_struct *vma,
swp_pte = swp_entry_to_pte(entry);
if (pte_soft_dirty(pteval))
swp_pte = pte_swp_mksoft_dirty(swp_pte);
- set_pte_at(mm, address, pvmw.pte, swp_pte);
+ set_pte_at(mm, pvmw.address, pvmw.pte, swp_pte);
} else
dec_mm_counter(mm, mm_counter_file(page));
discard:
page_remove_rmap(subpage, PageHuge(page));
put_page(page);
- mmu_notifier_invalidate_page(mm, address);
+ invalidation_needed = true;
}
+
+ if (invalidation_needed)
+ mmu_notifier_invalidate_range(mm, address,
+ address + (1UL << compound_order(page)));
return ret;
}
diff --git a/mm/shmem.c b/mm/shmem.c
index b0aa6075d164..6540e5982444 100644
--- a/mm/shmem.c
+++ b/mm/shmem.c
@@ -1022,7 +1022,11 @@ static int shmem_setattr(struct dentry *dentry, struct iattr *attr)
*/
if (IS_ENABLED(CONFIG_TRANSPARENT_HUGE_PAGECACHE)) {
spin_lock(&sbinfo->shrinklist_lock);
- if (list_empty(&info->shrinklist)) {
+ /*
+ * _careful to defend against unlocked access to
+ * ->shrink_list in shmem_unused_huge_shrink()
+ */
+ if (list_empty_careful(&info->shrinklist)) {
list_add_tail(&info->shrinklist,
&sbinfo->shrinklist);
sbinfo->shrinklist_len++;
@@ -1817,7 +1821,11 @@ alloc_nohuge: page = shmem_alloc_and_acct_page(gfp, info, sbinfo,
* to shrink under memory pressure.
*/
spin_lock(&sbinfo->shrinklist_lock);
- if (list_empty(&info->shrinklist)) {
+ /*
+ * _careful to defend against unlocked access to
+ * ->shrink_list in shmem_unused_huge_shrink()
+ */
+ if (list_empty_careful(&info->shrinklist)) {
list_add_tail(&info->shrinklist,
&sbinfo->shrinklist);
sbinfo->shrinklist_len++;
diff --git a/mm/util.c b/mm/util.c
index 7b07ec852e01..9ecddf568fe3 100644
--- a/mm/util.c
+++ b/mm/util.c
@@ -633,7 +633,7 @@ int __vm_enough_memory(struct mm_struct *mm, long pages, int cap_sys_admin)
* which are reclaimable, under pressure. The dentry
* cache and most inode caches should fall into this
*/
- free += global_page_state(NR_SLAB_RECLAIMABLE);
+ free += global_node_page_state(NR_SLAB_RECLAIMABLE);
/*
* Leave reserved pages. The pages are not for anonymous pages.
OpenPOWER on IntegriCloud