Diffstat (limited to 'mm')
-rw-r--r--  mm/ksm.c              14
-rw-r--r--  mm/maccess.c          70
-rw-r--r--  mm/memcontrol.c        2
-rw-r--r--  mm/memory.c          104
-rw-r--r--  mm/memory_hotplug.c   16
-rw-r--r--  mm/nommu.c            15
-rw-r--r--  mm/vmalloc.c          20
7 files changed, 208 insertions, 33 deletions
diff --git a/mm/ksm.c b/mm/ksm.c
index dbee2eb4dd05..7905934cd3ad 100644
--- a/mm/ksm.c
+++ b/mm/ksm.c
@@ -885,13 +885,13 @@ static int remove_stable_node(struct stable_node *stable_node)
return 0;
}
- if (WARN_ON_ONCE(page_mapped(page))) {
- /*
- * This should not happen: but if it does, just refuse to let
- * merge_across_nodes be switched - there is no need to panic.
- */
- err = -EBUSY;
- } else {
+ /*
+ * Page could be still mapped if this races with __mmput() running in
+ * between ksm_exit() and exit_mmap(). Just refuse to let
+ * merge_across_nodes/max_page_sharing be switched.
+ */
+ err = -EBUSY;
+ if (!page_mapped(page)) {
/*
* The stable node did not yet appear stale to get_ksm_page(),
* since that allows for an unmapped ksm page to be recognized
diff --git a/mm/maccess.c b/mm/maccess.c
index d065736f6b87..3ca8d97e5010 100644
--- a/mm/maccess.c
+++ b/mm/maccess.c
@@ -18,6 +18,18 @@ probe_read_common(void *dst, const void __user *src, size_t size)
return ret ? -EFAULT : 0;
}
+static __always_inline long
+probe_write_common(void __user *dst, const void *src, size_t size)
+{
+ long ret;
+
+ pagefault_disable();
+ ret = __copy_to_user_inatomic(dst, src, size);
+ pagefault_enable();
+
+ return ret ? -EFAULT : 0;
+}
+
/**
* probe_kernel_read(): safely attempt to read from a kernel-space location
* @dst: pointer to the buffer that shall take the data
@@ -31,11 +43,20 @@ probe_read_common(void *dst, const void __user *src, size_t size)
* do_page_fault() doesn't attempt to take mmap_sem. This makes
* probe_kernel_read() suitable for use within regions where the caller
* already holds mmap_sem, or other locks which nest inside mmap_sem.
+ *
+ * probe_kernel_read_strict() is the same as probe_kernel_read() except for
+ * the case where architectures have non-overlapping user and kernel address
+ * ranges: probe_kernel_read_strict() will additionally return -EFAULT for
+ * probing memory on a user address range where probe_user_read() is supposed
+ * to be used instead.
*/
long __weak probe_kernel_read(void *dst, const void *src, size_t size)
__attribute__((alias("__probe_kernel_read")));
+long __weak probe_kernel_read_strict(void *dst, const void *src, size_t size)
+ __attribute__((alias("__probe_kernel_read")));
+
long __probe_kernel_read(void *dst, const void *src, size_t size)
{
long ret;
@@ -85,6 +106,7 @@ EXPORT_SYMBOL_GPL(probe_user_read);
* Safely write to address @dst from the buffer at @src. If a kernel fault
* happens, handle that and return -EFAULT.
*/
+
long __weak probe_kernel_write(void *dst, const void *src, size_t size)
__attribute__((alias("__probe_kernel_write")));
@@ -94,15 +116,39 @@ long __probe_kernel_write(void *dst, const void *src, size_t size)
mm_segment_t old_fs = get_fs();
set_fs(KERNEL_DS);
- pagefault_disable();
- ret = __copy_to_user_inatomic((__force void __user *)dst, src, size);
- pagefault_enable();
+ ret = probe_write_common((__force void __user *)dst, src, size);
set_fs(old_fs);
- return ret ? -EFAULT : 0;
+ return ret;
}
EXPORT_SYMBOL_GPL(probe_kernel_write);
+/**
+ * probe_user_write(): safely attempt to write to a user-space location
+ * @dst: address to write to
+ * @src: pointer to the data that shall be written
+ * @size: size of the data chunk
+ *
+ * Safely write to address @dst from the buffer at @src. If a kernel fault
+ * happens, handle that and return -EFAULT.
+ */
+
+long __weak probe_user_write(void __user *dst, const void *src, size_t size)
+ __attribute__((alias("__probe_user_write")));
+
+long __probe_user_write(void __user *dst, const void *src, size_t size)
+{
+ long ret = -EFAULT;
+ mm_segment_t old_fs = get_fs();
+
+ set_fs(USER_DS);
+ if (access_ok(dst, size))
+ ret = probe_write_common(dst, src, size);
+ set_fs(old_fs);
+
+ return ret;
+}
+EXPORT_SYMBOL_GPL(probe_user_write);
/**
* strncpy_from_unsafe: - Copy a NUL terminated string from unsafe address.
@@ -120,8 +166,22 @@ EXPORT_SYMBOL_GPL(probe_kernel_write);
*
* If @count is smaller than the length of the string, copies @count-1 bytes,
* sets the last byte of @dst buffer to NUL and returns @count.
+ *
+ * strncpy_from_unsafe_strict() is the same as strncpy_from_unsafe() except
+ * for the case where architectures have non-overlapping user and kernel address
+ * ranges: strncpy_from_unsafe_strict() will additionally return -EFAULT for
+ * probing memory on a user address range where strncpy_from_unsafe_user() is
+ * supposed to be used instead.
*/
-long strncpy_from_unsafe(char *dst, const void *unsafe_addr, long count)
+
+long __weak strncpy_from_unsafe(char *dst, const void *unsafe_addr, long count)
+ __attribute__((alias("__strncpy_from_unsafe")));
+
+long __weak strncpy_from_unsafe_strict(char *dst, const void *unsafe_addr,
+ long count)
+ __attribute__((alias("__strncpy_from_unsafe")));
+
+long __strncpy_from_unsafe(char *dst, const void *unsafe_addr, long count)
{
mm_segment_t old_fs = get_fs();
const void *src = unsafe_addr;
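
For context, here is a minimal usage sketch of the probe helpers added above (illustrative only; the helper names trace_copy_from() and trace_poke_user() and the tracing use case are hypothetical, not taken from this patch). It shows the intended split: user pointers go through the probe_user_*() helpers, kernel pointers through the strict kernel probe, which rejects user addresses on architectures with non-overlapping address ranges.

#include <linux/uaccess.h>	/* probe_user_read/write, probe_kernel_read_strict */

/*
 * Hypothetical tracer helper: copy @size bytes from @unsafe_ptr into @dst
 * without sleeping or faulting.  Returns 0 on success or -EFAULT.
 */
static long trace_copy_from(void *dst, const void *unsafe_ptr, size_t size,
			    bool user_ptr)
{
	if (user_ptr)
		return probe_user_read(dst, (const void __user *)unsafe_ptr,
				       size);
	return probe_kernel_read_strict(dst, unsafe_ptr, size);
}

/*
 * Hypothetical write-side counterpart: probe_user_write() performs the
 * access_ok()/set_fs(USER_DS) handling internally with page faults
 * disabled and returns 0 or -EFAULT.
 */
static long trace_poke_user(void __user *dst, const void *src, size_t size)
{
	return probe_user_write(dst, src, size);
}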
diff --git a/mm/memcontrol.c b/mm/memcontrol.c
index 46ad252e6d6a..01f3f8b665e9 100644
--- a/mm/memcontrol.c
+++ b/mm/memcontrol.c
@@ -1800,7 +1800,7 @@ static void mem_cgroup_oom_unlock(struct mem_cgroup *memcg)
struct mem_cgroup *iter;
spin_lock(&memcg_oom_lock);
- mutex_release(&memcg_oom_lock_dep_map, 1, _RET_IP_);
+ mutex_release(&memcg_oom_lock_dep_map, _RET_IP_);
for_each_mem_cgroup_tree(iter, memcg)
iter->oom_lock = false;
spin_unlock(&memcg_oom_lock);
diff --git a/mm/memory.c b/mm/memory.c
index b1ca51a079f2..b6a5d6a08438 100644
--- a/mm/memory.c
+++ b/mm/memory.c
@@ -118,6 +118,18 @@ int randomize_va_space __read_mostly =
2;
#endif
+#ifndef arch_faults_on_old_pte
+static inline bool arch_faults_on_old_pte(void)
+{
+ /*
+ * Those arches which don't have hw access flag feature need to
+ * implement their own helper. By default, "true" means pagefault
+ * will be hit on old pte.
+ */
+ return true;
+}
+#endif
+
static int __init disable_randmaps(char *s)
{
randomize_va_space = 0;
@@ -2145,32 +2157,82 @@ static inline int pte_unmap_same(struct mm_struct *mm, pmd_t *pmd,
return same;
}
-static inline void cow_user_page(struct page *dst, struct page *src, unsigned long va, struct vm_area_struct *vma)
+static inline bool cow_user_page(struct page *dst, struct page *src,
+ struct vm_fault *vmf)
{
+ bool ret;
+ void *kaddr;
+ void __user *uaddr;
+ bool force_mkyoung;
+ struct vm_area_struct *vma = vmf->vma;
+ struct mm_struct *mm = vma->vm_mm;
+ unsigned long addr = vmf->address;
+
debug_dma_assert_idle(src);
+ if (likely(src)) {
+ copy_user_highpage(dst, src, addr, vma);
+ return true;
+ }
+
/*
* If the source page was a PFN mapping, we don't have
* a "struct page" for it. We do a best-effort copy by
* just copying from the original user address. If that
* fails, we just zero-fill it. Live with it.
*/
- if (unlikely(!src)) {
- void *kaddr = kmap_atomic(dst);
- void __user *uaddr = (void __user *)(va & PAGE_MASK);
+ kaddr = kmap_atomic(dst);
+ uaddr = (void __user *)(addr & PAGE_MASK);
+
+ /*
+ * On architectures with software "accessed" bits, we would
+ * take a double page fault, so mark it accessed here.
+ */
+ force_mkyoung = arch_faults_on_old_pte() && !pte_young(vmf->orig_pte);
+ if (force_mkyoung) {
+ pte_t entry;
+
+ vmf->pte = pte_offset_map_lock(mm, vmf->pmd, addr, &vmf->ptl);
+ if (!likely(pte_same(*vmf->pte, vmf->orig_pte))) {
+ /*
+ * Other thread has already handled the fault
+ * and we don't need to do anything. If it's
+ * not the case, the fault will be triggered
+ * again on the same address.
+ */
+ ret = false;
+ goto pte_unlock;
+ }
+ entry = pte_mkyoung(vmf->orig_pte);
+ if (ptep_set_access_flags(vma, addr, vmf->pte, entry, 0))
+ update_mmu_cache(vma, addr, vmf->pte);
+ }
+
+ /*
+ * This really shouldn't fail, because the page is there
+ * in the page tables. But it might just be unreadable,
+ * in which case we just give up and fill the result with
+ * zeroes.
+ */
+ if (__copy_from_user_inatomic(kaddr, uaddr, PAGE_SIZE)) {
/*
- * This really shouldn't fail, because the page is there
- * in the page tables. But it might just be unreadable,
- * in which case we just give up and fill the result with
- * zeroes.
+ * Give a warn in case there can be some obscure
+ * use-case
*/
- if (__copy_from_user_inatomic(kaddr, uaddr, PAGE_SIZE))
- clear_page(kaddr);
- kunmap_atomic(kaddr);
- flush_dcache_page(dst);
- } else
- copy_user_highpage(dst, src, va, vma);
+ WARN_ON_ONCE(1);
+ clear_page(kaddr);
+ }
+
+ ret = true;
+
+pte_unlock:
+ if (force_mkyoung)
+ pte_unmap_unlock(vmf->pte, vmf->ptl);
+ kunmap_atomic(kaddr);
+ flush_dcache_page(dst);
+
+ return ret;
}
static gfp_t __get_fault_gfp_mask(struct vm_area_struct *vma)
@@ -2327,7 +2389,19 @@ static vm_fault_t wp_page_copy(struct vm_fault *vmf)
vmf->address);
if (!new_page)
goto oom;
- cow_user_page(new_page, old_page, vmf->address, vma);
+
+ if (!cow_user_page(new_page, old_page, vmf)) {
+ /*
+ * COW failed, if the fault was solved by other,
+ * it's fine. If not, userspace would re-fault on
+ * the same address and we will handle the fault
+ * from the second attempt.
+ */
+ put_page(new_page);
+ if (old_page)
+ put_page(old_page);
+ return 0;
+ }
}
if (mem_cgroup_try_charge_delay(new_page, mm, GFP_KERNEL, &memcg, false))
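
As a point of reference for the arch_faults_on_old_pte() hook introduced above, an architecture whose MMU sets the accessed bit in hardware would be expected to override the generic default roughly as follows (a sketch of a hypothetical architecture header, not part of this patch; the generic fallback in mm/memory.c keeps returning true):

/* In the architecture's <asm/pgtable.h> (hypothetical example): */
#define arch_faults_on_old_pte arch_faults_on_old_pte
static inline bool arch_faults_on_old_pte(void)
{
	/*
	 * The CPU updates the PTE accessed bit in hardware, so reading
	 * through an old PTE never faults and cow_user_page() does not
	 * need to pre-age the entry before __copy_from_user_inatomic().
	 */
	return false;
}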
diff --git a/mm/memory_hotplug.c b/mm/memory_hotplug.c
index 3b62a9ff8ea0..f307bd82d750 100644
--- a/mm/memory_hotplug.c
+++ b/mm/memory_hotplug.c
@@ -331,7 +331,7 @@ static unsigned long find_smallest_section_pfn(int nid, struct zone *zone,
unsigned long end_pfn)
{
for (; start_pfn < end_pfn; start_pfn += PAGES_PER_SUBSECTION) {
- if (unlikely(!pfn_valid(start_pfn)))
+ if (unlikely(!pfn_to_online_page(start_pfn)))
continue;
if (unlikely(pfn_to_nid(start_pfn) != nid))
@@ -356,7 +356,7 @@ static unsigned long find_biggest_section_pfn(int nid, struct zone *zone,
/* pfn is the end pfn of a memory section. */
pfn = end_pfn - 1;
for (; pfn >= start_pfn; pfn -= PAGES_PER_SUBSECTION) {
- if (unlikely(!pfn_valid(pfn)))
+ if (unlikely(!pfn_to_online_page(pfn)))
continue;
if (unlikely(pfn_to_nid(pfn) != nid))
@@ -415,7 +415,7 @@ static void shrink_zone_span(struct zone *zone, unsigned long start_pfn,
*/
pfn = zone_start_pfn;
for (; pfn < zone_end_pfn; pfn += PAGES_PER_SUBSECTION) {
- if (unlikely(!pfn_valid(pfn)))
+ if (unlikely(!pfn_to_online_page(pfn)))
continue;
if (page_zone(pfn_to_page(pfn)) != zone)
@@ -471,6 +471,16 @@ static void __remove_zone(struct zone *zone, unsigned long start_pfn,
struct pglist_data *pgdat = zone->zone_pgdat;
unsigned long flags;
+#ifdef CONFIG_ZONE_DEVICE
+ /*
+ * Zone shrinking code cannot properly deal with ZONE_DEVICE. So
+ * we will not try to shrink the zones - which is okay as
+ * set_zone_contiguous() cannot deal with ZONE_DEVICE either way.
+ */
+ if (zone_idx(zone) == ZONE_DEVICE)
+ return;
+#endif
+
pgdat_resize_lock(zone->zone_pgdat, &flags);
shrink_zone_span(zone, start_pfn, start_pfn + nr_pages);
update_pgdat_span(pgdat);
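
The pfn_valid() to pfn_to_online_page() switch above matters because a PFN can have a valid memmap yet sit in an offline or ZONE_DEVICE section whose struct pages must not be inspected. A walker following the same pattern might look like this (illustrative sketch; count_zone_pages() is a hypothetical helper, not from the patch, and it counts in whole subsections):

#include <linux/mm.h>			/* page_zone() */
#include <linux/mmzone.h>		/* PAGES_PER_SUBSECTION */
#include <linux/memory_hotplug.h>	/* pfn_to_online_page() */

/* Count pages of @zone in [start_pfn, end_pfn), subsection by subsection. */
static unsigned long count_zone_pages(struct zone *zone,
				      unsigned long start_pfn,
				      unsigned long end_pfn)
{
	unsigned long pfn, nr = 0;

	for (pfn = start_pfn; pfn < end_pfn; pfn += PAGES_PER_SUBSECTION) {
		struct page *page = pfn_to_online_page(pfn);

		if (!page)
			continue;	/* offline or ZONE_DEVICE section */
		if (page_zone(page) != zone)
			continue;	/* belongs to another zone */
		nr += PAGES_PER_SUBSECTION;
	}
	return nr;
}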
diff --git a/mm/nommu.c b/mm/nommu.c
index 99b7ec318824..7de592058ab4 100644
--- a/mm/nommu.c
+++ b/mm/nommu.c
@@ -155,11 +155,11 @@ void *__vmalloc_node_flags(unsigned long size, int node, gfp_t flags)
return __vmalloc(size, flags, PAGE_KERNEL);
}
-void *vmalloc_user(unsigned long size)
+static void *__vmalloc_user_flags(unsigned long size, gfp_t flags)
{
void *ret;
- ret = __vmalloc(size, GFP_KERNEL | __GFP_ZERO, PAGE_KERNEL);
+ ret = __vmalloc(size, flags, PAGE_KERNEL);
if (ret) {
struct vm_area_struct *vma;
@@ -172,8 +172,19 @@ void *vmalloc_user(unsigned long size)
return ret;
}
+
+void *vmalloc_user(unsigned long size)
+{
+ return __vmalloc_user_flags(size, GFP_KERNEL | __GFP_ZERO);
+}
EXPORT_SYMBOL(vmalloc_user);
+void *vmalloc_user_node_flags(unsigned long size, int node, gfp_t flags)
+{
+ return __vmalloc_user_flags(size, flags | __GFP_ZERO);
+}
+EXPORT_SYMBOL(vmalloc_user_node_flags);
+
struct page *vmalloc_to_page(const void *addr)
{
return virt_to_page(addr);
diff --git a/mm/vmalloc.c b/mm/vmalloc.c
index a3c70e275f4e..4a7d7459c4f9 100644
--- a/mm/vmalloc.c
+++ b/mm/vmalloc.c
@@ -2672,6 +2672,26 @@ void *vzalloc_node(unsigned long size, int node)
EXPORT_SYMBOL(vzalloc_node);
/**
+ * vmalloc_user_node_flags - allocate memory for userspace on a specific node
+ * @size: allocation size
+ * @node: numa node
+ * @flags: flags for the page level allocator
+ *
+ * The resulting memory area is zeroed so it can be mapped to userspace
+ * without leaking data.
+ *
+ * Return: pointer to the allocated memory or %NULL on error
+ */
+void *vmalloc_user_node_flags(unsigned long size, int node, gfp_t flags)
+{
+ return __vmalloc_node_range(size, SHMLBA, VMALLOC_START, VMALLOC_END,
+ flags | __GFP_ZERO, PAGE_KERNEL,
+ VM_USERMAP, node,
+ __builtin_return_address(0));
+}
+EXPORT_SYMBOL(vmalloc_user_node_flags);
+
+/**
* vmalloc_exec - allocate virtually contiguous, executable memory
* @size: allocation size
*
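
Finally, a short usage sketch for the newly exported vmalloc_user_node_flags() (hypothetical driver code, not part of this patch): because the allocation is zeroed and carries VM_USERMAP, it can later be handed to remap_vmalloc_range() from an mmap handler without leaking kernel data.

#include <linux/vmalloc.h>
#include <linux/mm.h>

/* Hypothetical: allocate a zeroed, user-mappable buffer near @node. */
static void *alloc_user_buf(unsigned long size, int node)
{
	/* Free with vfree() when the buffer is no longer mapped anywhere. */
	return vmalloc_user_node_flags(size, node,
				       GFP_KERNEL | __GFP_NOWARN);
}

/* Hypothetical ->mmap() path exposing @buf to user space. */
static int buf_mmap(struct vm_area_struct *vma, void *buf)
{
	return remap_vmalloc_range(vma, buf, vma->vm_pgoff);
}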