Diffstat (limited to 'mm')
-rw-r--r--  mm/Kconfig          |   5
-rw-r--r--  mm/cma.c            |  15
-rw-r--r--  mm/gup.c            |   2
-rw-r--r--  mm/huge_memory.c    |  11
-rw-r--r--  mm/maccess.c        | 122
-rw-r--r--  mm/memcontrol.c     |  22
-rw-r--r--  mm/memory.c         |  13
-rw-r--r--  mm/memory_hotplug.c | 335
-rw-r--r--  mm/migrate.c        |   7
-rw-r--r--  mm/nommu.c          |   4
-rw-r--r--  mm/page_alloc.c     |  20
-rw-r--r--  mm/shmem.c          |  11
-rw-r--r--  mm/slab_common.c    |   3
-rw-r--r--  mm/sparse-vmemmap.c |  21
-rw-r--r--  mm/sparse.c         | 355
-rw-r--r--  mm/swap.c           |   2
-rw-r--r--  mm/util.c           |  75
-rw-r--r--  mm/vmscan.c         |  44
-rw-r--r--  mm/z3fold.c         |  43
-rw-r--r--  mm/zsmalloc.c       |  12
20 files changed, 698 insertions, 424 deletions
diff --git a/mm/Kconfig b/mm/Kconfig
index 495d7368ced8..56cec636a1fc 100644
--- a/mm/Kconfig
+++ b/mm/Kconfig
@@ -649,8 +649,7 @@ config IDLE_PAGE_TRACKING
See Documentation/admin-guide/mm/idle_page_tracking.rst for
more details.
-# arch_add_memory() comprehends device memory
-config ARCH_HAS_ZONE_DEVICE
+config ARCH_HAS_PTE_DEVMAP
bool
config ZONE_DEVICE
@@ -658,7 +657,7 @@ config ZONE_DEVICE
depends on MEMORY_HOTPLUG
depends on MEMORY_HOTREMOVE
depends on SPARSEMEM_VMEMMAP
- depends on ARCH_HAS_ZONE_DEVICE
+ depends on ARCH_HAS_PTE_DEVMAP
select XARRAY_MULTI
help
diff --git a/mm/cma.c b/mm/cma.c
index 3340ef34c154..7fe0b8356775 100644
--- a/mm/cma.c
+++ b/mm/cma.c
@@ -278,6 +278,12 @@ int __init cma_declare_contiguous(phys_addr_t base,
*/
alignment = max(alignment, (phys_addr_t)PAGE_SIZE <<
max_t(unsigned long, MAX_ORDER - 1, pageblock_order));
+ if (fixed && base & (alignment - 1)) {
+ ret = -EINVAL;
+ pr_err("Region at %pa must be aligned to %pa bytes\n",
+ &base, &alignment);
+ goto err;
+ }
base = ALIGN(base, alignment);
size = ALIGN(size, alignment);
limit &= ~(alignment - 1);
@@ -308,6 +314,13 @@ int __init cma_declare_contiguous(phys_addr_t base,
if (limit == 0 || limit > memblock_end)
limit = memblock_end;
+ if (base + size > limit) {
+ ret = -EINVAL;
+ pr_err("Size (%pa) of region at %pa exceeds limit (%pa)\n",
+ &size, &base, &limit);
+ goto err;
+ }
+
/* Reserve memory */
if (fixed) {
if (memblock_is_region_reserved(base, size) ||
@@ -494,7 +507,7 @@ struct page *cma_alloc(struct cma *cma, size_t count, unsigned int align,
* @pages: Allocated pages.
* @count: Number of allocated pages.
*
- * This function releases memory allocated by alloc_cma().
+ * This function releases memory allocated by cma_alloc().
* It returns false when provided pages do not belong to contiguous area and
* true otherwise.
*/
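The two new checks above reject a misaligned fixed base and a region that overruns the memblock limit before any reservation is attempted. A minimal standalone sketch of the alignment test; the 4 MiB alignment assumes 4 KiB pages with MAX_ORDER = 11 and pageblock_order = 9, as on a typical x86_64 build, and other configurations differ:

#include <stdbool.h>
#include <stdint.h>
#include <stdio.h>

/* mirrors the new test: if (fixed && base & (alignment - 1)) -> -EINVAL */
static bool cma_base_ok(uint64_t base, uint64_t alignment, bool fixed)
{
	return !(fixed && (base & (alignment - 1)));
}

int main(void)
{
	uint64_t alignment = 4ULL << 20;	/* 4 MiB */

	printf("%d\n", cma_base_ok(0x10000000, alignment, true));	/* 1: on a 4 MiB boundary */
	printf("%d\n", cma_base_ok(0x10001000, alignment, true));	/* 0: 4 KiB off */
	return 0;
}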
diff --git a/mm/gup.c b/mm/gup.c
index 8bbaa5523116..98f13ab37bac 100644
--- a/mm/gup.c
+++ b/mm/gup.c
@@ -1895,7 +1895,7 @@ static int gup_pte_range(pmd_t pmd, unsigned long addr, unsigned long end,
}
#endif /* CONFIG_ARCH_HAS_PTE_SPECIAL */
-#if defined(__HAVE_ARCH_PTE_DEVMAP) && defined(CONFIG_TRANSPARENT_HUGEPAGE)
+#if defined(CONFIG_ARCH_HAS_PTE_DEVMAP) && defined(CONFIG_TRANSPARENT_HUGEPAGE)
static int __gup_device_huge(unsigned long pfn, unsigned long addr,
unsigned long end, struct page **pages, int *nr)
{
diff --git a/mm/huge_memory.c b/mm/huge_memory.c
index 885642c82aaa..1334ede667a8 100644
--- a/mm/huge_memory.c
+++ b/mm/huge_memory.c
@@ -63,10 +63,15 @@ struct page *huge_zero_page __read_mostly;
bool transparent_hugepage_enabled(struct vm_area_struct *vma)
{
+ /* The addr is used to check if the vma size fits */
+ unsigned long addr = (vma->vm_end & HPAGE_PMD_MASK) - HPAGE_PMD_SIZE;
+
+ if (!transhuge_vma_suitable(vma, addr))
+ return false;
if (vma_is_anonymous(vma))
return __transparent_hugepage_enabled(vma);
- if (vma_is_shmem(vma) && shmem_huge_enabled(vma))
- return __transparent_hugepage_enabled(vma);
+ if (vma_is_shmem(vma))
+ return shmem_huge_enabled(vma);
return false;
}
@@ -689,7 +694,7 @@ vm_fault_t do_huge_pmd_anonymous_page(struct vm_fault *vmf)
struct page *page;
unsigned long haddr = vmf->address & HPAGE_PMD_MASK;
- if (haddr < vma->vm_start || haddr + HPAGE_PMD_SIZE > vma->vm_end)
+ if (!transhuge_vma_suitable(vma, haddr))
return VM_FAULT_FALLBACK;
if (unlikely(anon_vma_prepare(vma)))
return VM_FAULT_OOM;
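transparent_hugepage_enabled() and do_huge_pmd_anonymous_page() now defer to transhuge_vma_suitable(), whose logic matches the helper removed from mm/memory.c further down. A standalone sketch of that suitability test, assuming x86_64 values (2 MiB PMD size, 4 KiB pages); the struct here is a stand-in, not the kernel's vm_area_struct:

#include <stdbool.h>
#include <stdio.h>

#define PAGE_SHIFT		12
#define HPAGE_PMD_SIZE		(2UL << 20)		/* 2 MiB */
#define HPAGE_PMD_NR		512
#define HPAGE_CACHE_INDEX_MASK	(HPAGE_PMD_NR - 1)

struct vma { unsigned long vm_start, vm_end, vm_pgoff; };

static bool transhuge_vma_suitable(const struct vma *vma, unsigned long haddr)
{
	/* file offset and virtual address must be congruent modulo 2 MiB */
	if (((vma->vm_start >> PAGE_SHIFT) & HPAGE_CACHE_INDEX_MASK) !=
	    (vma->vm_pgoff & HPAGE_CACHE_INDEX_MASK))
		return false;
	/* the whole PMD-sized range must fit inside the VMA */
	return haddr >= vma->vm_start && haddr + HPAGE_PMD_SIZE <= vma->vm_end;
}

int main(void)
{
	struct vma v = { 0x400000, 0x800000, 0 };	/* 4 MiB VMA, pgoff 0 */

	printf("%d\n", transhuge_vma_suitable(&v, 0x400000));	/* 1 */
	printf("%d\n", transhuge_vma_suitable(&v, 0x700000));	/* 0: overruns vm_end */
	return 0;
}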
diff --git a/mm/maccess.c b/mm/maccess.c
index 482d4d670f19..d065736f6b87 100644
--- a/mm/maccess.c
+++ b/mm/maccess.c
@@ -6,8 +6,20 @@
#include <linux/mm.h>
#include <linux/uaccess.h>
+static __always_inline long
+probe_read_common(void *dst, const void __user *src, size_t size)
+{
+ long ret;
+
+ pagefault_disable();
+ ret = __copy_from_user_inatomic(dst, src, size);
+ pagefault_enable();
+
+ return ret ? -EFAULT : 0;
+}
+
/**
- * probe_kernel_read(): safely attempt to read from a location
+ * probe_kernel_read(): safely attempt to read from a kernel-space location
* @dst: pointer to the buffer that shall take the data
* @src: address to read from
* @size: size of the data chunk
@@ -30,17 +42,41 @@ long __probe_kernel_read(void *dst, const void *src, size_t size)
mm_segment_t old_fs = get_fs();
set_fs(KERNEL_DS);
- pagefault_disable();
- ret = __copy_from_user_inatomic(dst,
- (__force const void __user *)src, size);
- pagefault_enable();
+ ret = probe_read_common(dst, (__force const void __user *)src, size);
set_fs(old_fs);
- return ret ? -EFAULT : 0;
+ return ret;
}
EXPORT_SYMBOL_GPL(probe_kernel_read);
/**
+ * probe_user_read(): safely attempt to read from a user-space location
+ * @dst: pointer to the buffer that shall take the data
+ * @src: address to read from. This must be a user address.
+ * @size: size of the data chunk
+ *
+ * Safely read from user address @src to the buffer at @dst. If a kernel fault
+ * happens, handle that and return -EFAULT.
+ */
+
+long __weak probe_user_read(void *dst, const void __user *src, size_t size)
+ __attribute__((alias("__probe_user_read")));
+
+long __probe_user_read(void *dst, const void __user *src, size_t size)
+{
+ long ret = -EFAULT;
+ mm_segment_t old_fs = get_fs();
+
+ set_fs(USER_DS);
+ if (access_ok(src, size))
+ ret = probe_read_common(dst, src, size);
+ set_fs(old_fs);
+
+ return ret;
+}
+EXPORT_SYMBOL_GPL(probe_user_read);
+
+/**
* probe_kernel_write(): safely attempt to write to a location
* @dst: address to write to
* @src: pointer to the data that shall be written
@@ -67,6 +103,7 @@ long __probe_kernel_write(void *dst, const void *src, size_t size)
}
EXPORT_SYMBOL_GPL(probe_kernel_write);
+
/**
* strncpy_from_unsafe: - Copy a NUL terminated string from unsafe address.
* @dst: Destination address, in kernel space. This buffer must be at
@@ -106,3 +143,76 @@ long strncpy_from_unsafe(char *dst, const void *unsafe_addr, long count)
return ret ? -EFAULT : src - unsafe_addr;
}
+
+/**
+ * strncpy_from_unsafe_user: - Copy a NUL terminated string from unsafe user
+ * address.
+ * @dst: Destination address, in kernel space. This buffer must be at
+ * least @count bytes long.
+ * @unsafe_addr: Unsafe user address.
+ * @count: Maximum number of bytes to copy, including the trailing NUL.
+ *
+ * Copies a NUL-terminated string from unsafe user address to kernel buffer.
+ *
+ * On success, returns the length of the string INCLUDING the trailing NUL.
+ *
+ * If access fails, returns -EFAULT (some data may have been copied
+ * and the trailing NUL added).
+ *
+ * If @count is smaller than the length of the string, copies @count-1 bytes,
+ * sets the last byte of @dst buffer to NUL and returns @count.
+ */
+long strncpy_from_unsafe_user(char *dst, const void __user *unsafe_addr,
+ long count)
+{
+ mm_segment_t old_fs = get_fs();
+ long ret;
+
+ if (unlikely(count <= 0))
+ return 0;
+
+ set_fs(USER_DS);
+ pagefault_disable();
+ ret = strncpy_from_user(dst, unsafe_addr, count);
+ pagefault_enable();
+ set_fs(old_fs);
+
+ if (ret >= count) {
+ ret = count;
+ dst[ret - 1] = '\0';
+ } else if (ret > 0) {
+ ret++;
+ }
+
+ return ret;
+}
+
+/**
+ * strnlen_unsafe_user: - Get the size of a user string INCLUDING final NUL.
+ * @unsafe_addr: The string to measure.
+ * @count: Maximum count (including NUL)
+ *
+ * Get the size of a NUL-terminated string in user space without pagefault.
+ *
+ * Returns the size of the string INCLUDING the terminating NUL.
+ *
+ * If the string is too long, returns a number larger than @count. User
+ * has to check the return value against "> count".
+ * On exception (or invalid count), returns 0.
+ *
+ * Unlike strnlen_user, this can be used from IRQ handler etc. because
+ * it disables pagefaults.
+ */
+long strnlen_unsafe_user(const void __user *unsafe_addr, long count)
+{
+ mm_segment_t old_fs = get_fs();
+ int ret;
+
+ set_fs(USER_DS);
+ pagefault_disable();
+ ret = strnlen_user(unsafe_addr, count);
+ pagefault_enable();
+ set_fs(old_fs);
+
+ return ret;
+}
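The new probe_user_read(), strncpy_from_unsafe_user() and strnlen_unsafe_user() helpers mirror their kernel-space counterparts but force USER_DS and disable page faults, so they are safe in contexts that must not sleep or fault. A hedged usage sketch (hypothetical tracer code, not part of this patch; it assumes the declarations are exposed via <linux/uaccess.h>):

#include <linux/kernel.h>
#include <linux/uaccess.h>

static void trace_user_string(const char __user *uptr)
{
	char buf[64];
	long len;

	/* copies at most sizeof(buf) bytes, never faults, <0 on failure */
	len = strncpy_from_unsafe_user(buf, uptr, sizeof(buf));
	if (len < 0)
		return;		/* user page not present or bad pointer */

	pr_debug("user string (%ld bytes incl. NUL): %s\n", len, buf);
}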
diff --git a/mm/memcontrol.c b/mm/memcontrol.c
index 249671873aa9..cdbb7a84cb6e 100644
--- a/mm/memcontrol.c
+++ b/mm/memcontrol.c
@@ -695,12 +695,15 @@ void __mod_memcg_state(struct mem_cgroup *memcg, int idx, int val)
if (mem_cgroup_disabled())
return;
- __this_cpu_add(memcg->vmstats_local->stat[idx], val);
-
x = val + __this_cpu_read(memcg->vmstats_percpu->stat[idx]);
if (unlikely(abs(x) > MEMCG_CHARGE_BATCH)) {
struct mem_cgroup *mi;
+ /*
+ * Batch local counters to keep them in sync with
+ * the hierarchical ones.
+ */
+ __this_cpu_add(memcg->vmstats_local->stat[idx], x);
for (mi = memcg; mi; mi = parent_mem_cgroup(mi))
atomic_long_add(x, &mi->vmstats[idx]);
x = 0;
@@ -749,13 +752,15 @@ void __mod_lruvec_state(struct lruvec *lruvec, enum node_stat_item idx,
/* Update memcg */
__mod_memcg_state(memcg, idx, val);
- /* Update lruvec */
- __this_cpu_add(pn->lruvec_stat_local->count[idx], val);
-
x = val + __this_cpu_read(pn->lruvec_stat_cpu->count[idx]);
if (unlikely(abs(x) > MEMCG_CHARGE_BATCH)) {
struct mem_cgroup_per_node *pi;
+ /*
+ * Batch local counters to keep them in sync with
+ * the hierarchical ones.
+ */
+ __this_cpu_add(pn->lruvec_stat_local->count[idx], x);
for (pi = pn; pi; pi = parent_nodeinfo(pi, pgdat->node_id))
atomic_long_add(x, &pi->lruvec_stat[idx]);
x = 0;
@@ -777,12 +782,15 @@ void __count_memcg_events(struct mem_cgroup *memcg, enum vm_event_item idx,
if (mem_cgroup_disabled())
return;
- __this_cpu_add(memcg->vmstats_local->events[idx], count);
-
x = count + __this_cpu_read(memcg->vmstats_percpu->events[idx]);
if (unlikely(x > MEMCG_CHARGE_BATCH)) {
struct mem_cgroup *mi;
+ /*
+ * Batch local counters to keep them in sync with
+ * the hierarchical ones.
+ */
+ __this_cpu_add(memcg->vmstats_local->events[idx], x);
for (mi = memcg; mi; mi = parent_mem_cgroup(mi))
atomic_long_add(x, &mi->vmevents[idx]);
x = 0;
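The memcontrol.c hunks fold the local counter update into the same branch that flushes the per-cpu batch up the hierarchy, so the local and hierarchical counters advance by the same batched amount. An illustrative userspace sketch of that batching pattern (a simplification of __mod_memcg_state(), not a copy of it):

#include <stdatomic.h>
#include <stdio.h>
#include <stdlib.h>

#define BATCH 32

static _Atomic long hierarchical_total;
static _Atomic long local_total;
static _Thread_local long percpu_delta;

static void mod_state(long val)
{
	long x = percpu_delta + val;

	if (labs(x) > BATCH) {
		atomic_fetch_add(&local_total, x);		/* the "local" counter */
		atomic_fetch_add(&hierarchical_total, x);	/* the hierarchy */
		x = 0;
	}
	percpu_delta = x;					/* keep the remainder per cpu */
}

int main(void)
{
	for (int i = 0; i < 100; i++)
		mod_state(1);
	printf("local %ld hierarchical %ld pending %ld\n",
	       atomic_load(&local_total), atomic_load(&hierarchical_total),
	       percpu_delta);
	return 0;
}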
diff --git a/mm/memory.c b/mm/memory.c
index 89325f9c6173..e2bb51b6242e 100644
--- a/mm/memory.c
+++ b/mm/memory.c
@@ -3162,19 +3162,6 @@ map_pte:
}
#ifdef CONFIG_TRANSPARENT_HUGE_PAGECACHE
-
-#define HPAGE_CACHE_INDEX_MASK (HPAGE_PMD_NR - 1)
-static inline bool transhuge_vma_suitable(struct vm_area_struct *vma,
- unsigned long haddr)
-{
- if (((vma->vm_start >> PAGE_SHIFT) & HPAGE_CACHE_INDEX_MASK) !=
- (vma->vm_pgoff & HPAGE_CACHE_INDEX_MASK))
- return false;
- if (haddr < vma->vm_start || haddr + HPAGE_PMD_SIZE > vma->vm_end)
- return false;
- return true;
-}
-
static void deposit_prealloc_pte(struct vm_fault *vmf)
{
struct vm_area_struct *vma = vmf->vma;
diff --git a/mm/memory_hotplug.c b/mm/memory_hotplug.c
index 6166ba5a15f3..2a9bbddb0e55 100644
--- a/mm/memory_hotplug.c
+++ b/mm/memory_hotplug.c
@@ -166,9 +166,10 @@ void put_page_bootmem(struct page *page)
#ifndef CONFIG_SPARSEMEM_VMEMMAP
static void register_page_bootmem_info_section(unsigned long start_pfn)
{
- unsigned long *usemap, mapsize, section_nr, i;
+ unsigned long mapsize, section_nr, i;
struct mem_section *ms;
struct page *page, *memmap;
+ struct mem_section_usage *usage;
section_nr = pfn_to_section_nr(start_pfn);
ms = __nr_to_section(section_nr);
@@ -188,10 +189,10 @@ static void register_page_bootmem_info_section(unsigned long start_pfn)
for (i = 0; i < mapsize; i++, page++)
get_page_bootmem(section_nr, page, SECTION_INFO);
- usemap = ms->pageblock_flags;
- page = virt_to_page(usemap);
+ usage = ms->usage;
+ page = virt_to_page(usage);
- mapsize = PAGE_ALIGN(usemap_size()) >> PAGE_SHIFT;
+ mapsize = PAGE_ALIGN(mem_section_usage_size()) >> PAGE_SHIFT;
for (i = 0; i < mapsize; i++, page++)
get_page_bootmem(section_nr, page, MIX_SECTION_INFO);
@@ -200,9 +201,10 @@ static void register_page_bootmem_info_section(unsigned long start_pfn)
#else /* CONFIG_SPARSEMEM_VMEMMAP */
static void register_page_bootmem_info_section(unsigned long start_pfn)
{
- unsigned long *usemap, mapsize, section_nr, i;
+ unsigned long mapsize, section_nr, i;
struct mem_section *ms;
struct page *page, *memmap;
+ struct mem_section_usage *usage;
section_nr = pfn_to_section_nr(start_pfn);
ms = __nr_to_section(section_nr);
@@ -211,10 +213,10 @@ static void register_page_bootmem_info_section(unsigned long start_pfn)
register_page_bootmem_memmap(section_nr, memmap, PAGES_PER_SECTION);
- usemap = ms->pageblock_flags;
- page = virt_to_page(usemap);
+ usage = ms->usage;
+ page = virt_to_page(usage);
- mapsize = PAGE_ALIGN(usemap_size()) >> PAGE_SHIFT;
+ mapsize = PAGE_ALIGN(mem_section_usage_size()) >> PAGE_SHIFT;
for (i = 0; i < mapsize; i++, page++)
get_page_bootmem(section_nr, page, MIX_SECTION_INFO);
@@ -250,22 +252,31 @@ void __init register_page_bootmem_info_node(struct pglist_data *pgdat)
}
#endif /* CONFIG_HAVE_BOOTMEM_INFO_NODE */
-static int __meminit __add_section(int nid, unsigned long phys_start_pfn,
- struct vmem_altmap *altmap, bool want_memblock)
+static int check_pfn_span(unsigned long pfn, unsigned long nr_pages,
+ const char *reason)
{
- int ret;
-
- if (pfn_valid(phys_start_pfn))
- return -EEXIST;
-
- ret = sparse_add_one_section(nid, phys_start_pfn, altmap);
- if (ret < 0)
- return ret;
-
- if (!want_memblock)
- return 0;
-
- return hotplug_memory_register(nid, __pfn_to_section(phys_start_pfn));
+ /*
+ * Disallow all operations smaller than a sub-section and only
+ * allow operations smaller than a section for
+ * SPARSEMEM_VMEMMAP. Note that check_hotplug_memory_range()
+ * enforces a larger memory_block_size_bytes() granularity for
+ * memory that will be marked online, so this check should only
+ * fire for direct arch_{add,remove}_memory() users outside of
+ * add_memory_resource().
+ */
+ unsigned long min_align;
+
+ if (IS_ENABLED(CONFIG_SPARSEMEM_VMEMMAP))
+ min_align = PAGES_PER_SUBSECTION;
+ else
+ min_align = PAGES_PER_SECTION;
+ if (!IS_ALIGNED(pfn, min_align)
+ || !IS_ALIGNED(nr_pages, min_align)) {
WARN(1, "Misaligned __%s_pages start: %#lx end: %#lx\n",
+ reason, pfn, pfn + nr_pages - 1);
+ return -EINVAL;
+ }
+ return 0;
}
/*
@@ -274,62 +285,54 @@ static int __meminit __add_section(int nid, unsigned long phys_start_pfn,
* call this function after deciding the zone to which to
* add the new pages.
*/
-int __ref __add_pages(int nid, unsigned long phys_start_pfn,
- unsigned long nr_pages, struct mhp_restrictions *restrictions)
+int __ref __add_pages(int nid, unsigned long pfn, unsigned long nr_pages,
+ struct mhp_restrictions *restrictions)
{
- unsigned long i;
- int err = 0;
- int start_sec, end_sec;
+ int err;
+ unsigned long nr, start_sec, end_sec;
struct vmem_altmap *altmap = restrictions->altmap;
- /* during initialize mem_map, align hot-added range to section */
- start_sec = pfn_to_section_nr(phys_start_pfn);
- end_sec = pfn_to_section_nr(phys_start_pfn + nr_pages - 1);
-
if (altmap) {
/*
* Validate altmap is within bounds of the total request
*/
- if (altmap->base_pfn != phys_start_pfn
+ if (altmap->base_pfn != pfn
|| vmem_altmap_offset(altmap) > nr_pages) {
pr_warn_once("memory add fail, invalid altmap\n");
- err = -EINVAL;
- goto out;
+ return -EINVAL;
}
altmap->alloc = 0;
}
- for (i = start_sec; i <= end_sec; i++) {
- err = __add_section(nid, section_nr_to_pfn(i), altmap,
- restrictions->flags & MHP_MEMBLOCK_API);
+ err = check_pfn_span(pfn, nr_pages, "add");
+ if (err)
+ return err;
- /*
- * EEXIST is finally dealt with by ioresource collision
- * check. see add_memory() => register_memory_resource()
- * Warning will be printed if there is collision.
- */
- if (err && (err != -EEXIST))
+ start_sec = pfn_to_section_nr(pfn);
+ end_sec = pfn_to_section_nr(pfn + nr_pages - 1);
+ for (nr = start_sec; nr <= end_sec; nr++) {
+ unsigned long pfns;
+
+ pfns = min(nr_pages, PAGES_PER_SECTION
+ - (pfn & ~PAGE_SECTION_MASK));
+ err = sparse_add_section(nid, pfn, pfns, altmap);
+ if (err)
break;
- err = 0;
+ pfn += pfns;
+ nr_pages -= pfns;
cond_resched();
}
vmemmap_populate_print_last();
-out:
return err;
}
-#ifdef CONFIG_MEMORY_HOTREMOVE
/* find the smallest valid pfn in the range [start_pfn, end_pfn) */
static unsigned long find_smallest_section_pfn(int nid, struct zone *zone,
unsigned long start_pfn,
unsigned long end_pfn)
{
- struct mem_section *ms;
-
- for (; start_pfn < end_pfn; start_pfn += PAGES_PER_SECTION) {
- ms = __pfn_to_section(start_pfn);
-
- if (unlikely(!valid_section(ms)))
+ for (; start_pfn < end_pfn; start_pfn += PAGES_PER_SUBSECTION) {
+ if (unlikely(!pfn_valid(start_pfn)))
continue;
if (unlikely(pfn_to_nid(start_pfn) != nid))
@@ -349,15 +352,12 @@ static unsigned long find_biggest_section_pfn(int nid, struct zone *zone,
unsigned long start_pfn,
unsigned long end_pfn)
{
- struct mem_section *ms;
unsigned long pfn;
/* pfn is the end pfn of a memory section. */
pfn = end_pfn - 1;
- for (; pfn >= start_pfn; pfn -= PAGES_PER_SECTION) {
- ms = __pfn_to_section(pfn);
-
- if (unlikely(!valid_section(ms)))
+ for (; pfn >= start_pfn; pfn -= PAGES_PER_SUBSECTION) {
+ if (unlikely(!pfn_valid(pfn)))
continue;
if (unlikely(pfn_to_nid(pfn) != nid))
@@ -379,7 +379,6 @@ static void shrink_zone_span(struct zone *zone, unsigned long start_pfn,
unsigned long z = zone_end_pfn(zone); /* zone_end_pfn namespace clash */
unsigned long zone_end_pfn = z;
unsigned long pfn;
- struct mem_section *ms;
int nid = zone_to_nid(zone);
zone_span_writelock(zone);
@@ -416,17 +415,15 @@ static void shrink_zone_span(struct zone *zone, unsigned long start_pfn,
* it checks whether the zone has only holes or not.
*/
pfn = zone_start_pfn;
- for (; pfn < zone_end_pfn; pfn += PAGES_PER_SECTION) {
- ms = __pfn_to_section(pfn);
-
- if (unlikely(!valid_section(ms)))
+ for (; pfn < zone_end_pfn; pfn += PAGES_PER_SUBSECTION) {
+ if (unlikely(!pfn_valid(pfn)))
continue;
if (page_zone(pfn_to_page(pfn)) != zone)
continue;
- /* If the section is current section, it continues the loop */
- if (start_pfn == pfn)
+ /* Skip range to be removed */
+ if (pfn >= start_pfn && pfn < end_pfn)
continue;
/* If we find valid section, we have nothing to do */
@@ -447,7 +444,6 @@ static void shrink_pgdat_span(struct pglist_data *pgdat,
unsigned long p = pgdat_end_pfn(pgdat); /* pgdat_end_pfn namespace clash */
unsigned long pgdat_end_pfn = p;
unsigned long pfn;
- struct mem_section *ms;
int nid = pgdat->node_id;
if (pgdat_start_pfn == start_pfn) {
@@ -484,17 +480,15 @@ static void shrink_pgdat_span(struct pglist_data *pgdat,
* has only holes or not.
*/
pfn = pgdat_start_pfn;
- for (; pfn < pgdat_end_pfn; pfn += PAGES_PER_SECTION) {
- ms = __pfn_to_section(pfn);
-
- if (unlikely(!valid_section(ms)))
+ for (; pfn < pgdat_end_pfn; pfn += PAGES_PER_SUBSECTION) {
+ if (unlikely(!pfn_valid(pfn)))
continue;
if (pfn_to_nid(pfn) != nid)
continue;
- /* If the section is current section, it continues the loop */
- if (start_pfn == pfn)
+ /* Skip range to be removed */
+ if (pfn >= start_pfn && pfn < end_pfn)
continue;
/* If we find valid section, we have nothing to do */
@@ -506,10 +500,10 @@ static void shrink_pgdat_span(struct pglist_data *pgdat,
pgdat->node_spanned_pages = 0;
}
-static void __remove_zone(struct zone *zone, unsigned long start_pfn)
+static void __remove_zone(struct zone *zone, unsigned long start_pfn,
+ unsigned long nr_pages)
{
struct pglist_data *pgdat = zone->zone_pgdat;
- int nr_pages = PAGES_PER_SECTION;
unsigned long flags;
pgdat_resize_lock(zone->zone_pgdat, &flags);
@@ -518,29 +512,23 @@ static void __remove_zone(struct zone *zone, unsigned long start_pfn)
pgdat_resize_unlock(zone->zone_pgdat, &flags);
}
-static void __remove_section(struct zone *zone, struct mem_section *ms,
- unsigned long map_offset,
- struct vmem_altmap *altmap)
+static void __remove_section(struct zone *zone, unsigned long pfn,
+ unsigned long nr_pages, unsigned long map_offset,
+ struct vmem_altmap *altmap)
{
- unsigned long start_pfn;
- int scn_nr;
+ struct mem_section *ms = __nr_to_section(pfn_to_section_nr(pfn));
if (WARN_ON_ONCE(!valid_section(ms)))
return;
- unregister_memory_section(ms);
-
- scn_nr = __section_nr(ms);
- start_pfn = section_nr_to_pfn((unsigned long)scn_nr);
- __remove_zone(zone, start_pfn);
-
- sparse_remove_one_section(zone, ms, map_offset, altmap);
+ __remove_zone(zone, pfn, nr_pages);
+ sparse_remove_section(ms, pfn, nr_pages, map_offset, altmap);
}
/**
* __remove_pages() - remove sections of pages from a zone
* @zone: zone from which pages need to be removed
- * @phys_start_pfn: starting pageframe (must be aligned to start of a section)
+ * @pfn: starting pageframe (must be aligned to start of a section)
* @nr_pages: number of pages to remove (must be multiple of section size)
* @altmap: alternative device page map or %NULL if default memmap is used
*
@@ -549,38 +537,35 @@ static void __remove_section(struct zone *zone, struct mem_section *ms,
* sure that pages are marked reserved and zones are adjusted properly by
* calling offline_pages().
*/
-void __remove_pages(struct zone *zone, unsigned long phys_start_pfn,
+void __remove_pages(struct zone *zone, unsigned long pfn,
unsigned long nr_pages, struct vmem_altmap *altmap)
{
- unsigned long i;
unsigned long map_offset = 0;
- int sections_to_remove;
+ unsigned long nr, start_sec, end_sec;
- /* In the ZONE_DEVICE case device driver owns the memory region */
- if (is_dev_zone(zone))
- map_offset = vmem_altmap_offset(altmap);
+ map_offset = vmem_altmap_offset(altmap);
clear_zone_contiguous(zone);
- /*
- * We can only remove entire sections
- */
- BUG_ON(phys_start_pfn & ~PAGE_SECTION_MASK);
- BUG_ON(nr_pages % PAGES_PER_SECTION);
+ if (check_pfn_span(pfn, nr_pages, "remove"))
+ return;
- sections_to_remove = nr_pages / PAGES_PER_SECTION;
- for (i = 0; i < sections_to_remove; i++) {
- unsigned long pfn = phys_start_pfn + i*PAGES_PER_SECTION;
+ start_sec = pfn_to_section_nr(pfn);
+ end_sec = pfn_to_section_nr(pfn + nr_pages - 1);
+ for (nr = start_sec; nr <= end_sec; nr++) {
+ unsigned long pfns;
cond_resched();
- __remove_section(zone, __pfn_to_section(pfn), map_offset,
- altmap);
+ pfns = min(nr_pages, PAGES_PER_SECTION
+ - (pfn & ~PAGE_SECTION_MASK));
+ __remove_section(zone, pfn, pfns, map_offset, altmap);
+ pfn += pfns;
+ nr_pages -= pfns;
map_offset = 0;
}
set_zone_contiguous(zone);
}
-#endif /* CONFIG_MEMORY_HOTREMOVE */
int set_online_page_callback(online_page_callback_t callback)
{
@@ -1049,16 +1034,11 @@ int try_online_node(int nid)
static int check_hotplug_memory_range(u64 start, u64 size)
{
- unsigned long block_sz = memory_block_size_bytes();
- u64 block_nr_pages = block_sz >> PAGE_SHIFT;
- u64 nr_pages = size >> PAGE_SHIFT;
- u64 start_pfn = PFN_DOWN(start);
-
/* memory range must be block size aligned */
- if (!nr_pages || !IS_ALIGNED(start_pfn, block_nr_pages) ||
- !IS_ALIGNED(nr_pages, block_nr_pages)) {
+ if (!size || !IS_ALIGNED(start, memory_block_size_bytes()) ||
+ !IS_ALIGNED(size, memory_block_size_bytes())) {
pr_err("Block size [%#lx] unaligned hotplug range: start %#llx, size %#llx",
- block_sz, start, size);
+ memory_block_size_bytes(), start, size);
return -EINVAL;
}
@@ -1078,9 +1058,7 @@ static int online_memory_block(struct memory_block *mem, void *arg)
*/
int __ref add_memory_resource(int nid, struct resource *res)
{
- struct mhp_restrictions restrictions = {
- .flags = MHP_MEMBLOCK_API,
- };
+ struct mhp_restrictions restrictions = {};
u64 start, size;
bool new_node = false;
int ret;
@@ -1112,6 +1090,13 @@ int __ref add_memory_resource(int nid, struct resource *res)
if (ret < 0)
goto error;
+ /* create memory block devices after memory was added */
+ ret = create_memory_block_devices(start, size);
+ if (ret) {
+ arch_remove_memory(nid, start, size, NULL);
+ goto error;
+ }
+
if (new_node) {
/* If sysfs file of new node can't be created, cpu on the node
* can't be hot-added. There is no rollback way now.
@@ -1135,8 +1120,7 @@ int __ref add_memory_resource(int nid, struct resource *res)
/* online pages if requested */
if (memhp_auto_online)
- walk_memory_range(PFN_DOWN(start), PFN_UP(start + size - 1),
- NULL, online_memory_block);
+ walk_memory_blocks(start, size, NULL, online_memory_block);
return ret;
error:
@@ -1671,58 +1655,7 @@ int offline_pages(unsigned long start_pfn, unsigned long nr_pages)
{
return __offline_pages(start_pfn, start_pfn + nr_pages);
}
-#endif /* CONFIG_MEMORY_HOTREMOVE */
-
-/**
- * walk_memory_range - walks through all mem sections in [start_pfn, end_pfn)
- * @start_pfn: start pfn of the memory range
- * @end_pfn: end pfn of the memory range
- * @arg: argument passed to func
- * @func: callback for each memory section walked
- *
- * This function walks through all present mem sections in range
- * [start_pfn, end_pfn) and call func on each mem section.
- *
- * Returns the return value of func.
- */
-int walk_memory_range(unsigned long start_pfn, unsigned long end_pfn,
- void *arg, int (*func)(struct memory_block *, void *))
-{
- struct memory_block *mem = NULL;
- struct mem_section *section;
- unsigned long pfn, section_nr;
- int ret;
-
- for (pfn = start_pfn; pfn < end_pfn; pfn += PAGES_PER_SECTION) {
- section_nr = pfn_to_section_nr(pfn);
- if (!present_section_nr(section_nr))
- continue;
-
- section = __nr_to_section(section_nr);
- /* same memblock? */
- if (mem)
- if ((section_nr >= mem->start_section_nr) &&
- (section_nr <= mem->end_section_nr))
- continue;
- mem = find_memory_block_hinted(section, mem);
- if (!mem)
- continue;
-
- ret = func(mem, arg);
- if (ret) {
- kobject_put(&mem->dev.kobj);
- return ret;
- }
- }
-
- if (mem)
- kobject_put(&mem->dev.kobj);
-
- return 0;
-}
-
-#ifdef CONFIG_MEMORY_HOTREMOVE
static int check_memblock_offlined_cb(struct memory_block *mem, void *arg)
{
int ret = !is_memblock_offlined(mem);
@@ -1734,9 +1667,10 @@ static int check_memblock_offlined_cb(struct memory_block *mem, void *arg)
endpa = PFN_PHYS(section_nr_to_pfn(mem->end_section_nr + 1))-1;
pr_warn("removing memory fails, because memory [%pa-%pa] is onlined\n",
&beginpa, &endpa);
- }
- return ret;
+ return -EBUSY;
+ }
+ return 0;
}
static int check_cpu_on_node(pg_data_t *pgdat)
@@ -1819,19 +1753,9 @@ static void __release_memory_resource(resource_size_t start,
}
}
-/**
- * remove_memory
- * @nid: the node ID
- * @start: physical address of the region to remove
- * @size: size of the region to remove
- *
- * NOTE: The caller must call lock_device_hotplug() to serialize hotplug
- * and online/offline operations before this call, as required by
- * try_offline_node().
- */
-void __ref __remove_memory(int nid, u64 start, u64 size)
+static int __ref try_remove_memory(int nid, u64 start, u64 size)
{
- int ret;
+ int rc = 0;
BUG_ON(check_hotplug_memory_range(start, size));
@@ -1839,32 +1763,65 @@ void __ref __remove_memory(int nid, u64 start, u64 size)
/*
* All memory blocks must be offlined before removing memory. Check
- * whether all memory blocks in question are offline and trigger a BUG()
+ * whether all memory blocks in question are offline and return error
* if this is not the case.
*/
- ret = walk_memory_range(PFN_DOWN(start), PFN_UP(start + size - 1), NULL,
- check_memblock_offlined_cb);
- if (ret)
- BUG();
+ rc = walk_memory_blocks(start, size, NULL, check_memblock_offlined_cb);
+ if (rc)
+ goto done;
/* remove memmap entry */
firmware_map_remove(start, start + size, "System RAM");
memblock_free(start, size);
memblock_remove(start, size);
+ /* remove memory block devices before removing memory */
+ remove_memory_block_devices(start, size);
+
arch_remove_memory(nid, start, size, NULL);
__release_memory_resource(start, size);
try_offline_node(nid);
+done:
mem_hotplug_done();
+ return rc;
+}
+
+/**
+ * remove_memory
+ * @nid: the node ID
+ * @start: physical address of the region to remove
+ * @size: size of the region to remove
+ *
+ * NOTE: The caller must call lock_device_hotplug() to serialize hotplug
+ * and online/offline operations before this call, as required by
+ * try_offline_node().
+ */
+void __remove_memory(int nid, u64 start, u64 size)
+{
+
+ /*
+ * trigger BUG() if some memory is not offlined prior to calling this
+ * function
+ */
+ if (try_remove_memory(nid, start, size))
+ BUG();
}
-void remove_memory(int nid, u64 start, u64 size)
+/*
+ * Remove memory if every memory block is offline, otherwise return -EBUSY if
+ * some memory is not offline
+ */
+int remove_memory(int nid, u64 start, u64 size)
{
+ int rc;
+
lock_device_hotplug();
- __remove_memory(nid, start, size);
+ rc = try_remove_memory(nid, start, size);
unlock_device_hotplug();
+
+ return rc;
}
EXPORT_SYMBOL_GPL(remove_memory);
#endif /* CONFIG_MEMORY_HOTREMOVE */
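Several of the memory_hotplug.c changes replace the old one-section-at-a-time interfaces with loops that walk an arbitrary (subsection-aligned) pfn range and clamp each step to the containing section. A standalone sketch of that chunking arithmetic, using x86_64 section geometry (128 MiB sections, 4 KiB pages); other architectures use different constants:

#include <stdio.h>

#define PAGES_PER_SECTION	32768UL
#define PAGE_SECTION_MASK	(~(PAGES_PER_SECTION - 1))

#define min(a, b)		((a) < (b) ? (a) : (b))

int main(void)
{
	/* 96 MiB worth of pages starting 64 MiB into section 1 */
	unsigned long pfn = 32768 + 16384, nr_pages = 24576;

	while (nr_pages) {
		/* never cross a section boundary in one step */
		unsigned long pfns = min(nr_pages,
				PAGES_PER_SECTION - (pfn & ~PAGE_SECTION_MASK));

		printf("section %lu: pfn %lu, %lu pages\n",
		       pfn / PAGES_PER_SECTION, pfn, pfns);
		pfn += pfns;
		nr_pages -= pfns;
	}
	return 0;
}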
diff --git a/mm/migrate.c b/mm/migrate.c
index 3445747e229d..8992741f10aa 100644
--- a/mm/migrate.c
+++ b/mm/migrate.c
@@ -394,8 +394,7 @@ static int expected_page_refs(struct address_space *mapping, struct page *page)
* 3 for pages with a mapping and PagePrivate/PagePrivate2 set.
*/
int migrate_page_move_mapping(struct address_space *mapping,
- struct page *newpage, struct page *page, enum migrate_mode mode,
- int extra_count)
+ struct page *newpage, struct page *page, int extra_count)
{
XA_STATE(xas, &mapping->i_pages, page_index(page));
struct zone *oldzone, *newzone;
@@ -681,7 +680,7 @@ int migrate_page(struct address_space *mapping,
BUG_ON(PageWriteback(page)); /* Writeback must be complete */
- rc = migrate_page_move_mapping(mapping, newpage, page, mode, 0);
+ rc = migrate_page_move_mapping(mapping, newpage, page, 0);
if (rc != MIGRATEPAGE_SUCCESS)
return rc;
@@ -780,7 +779,7 @@ recheck_buffers:
}
}
- rc = migrate_page_move_mapping(mapping, newpage, page, mode, 0);
+ rc = migrate_page_move_mapping(mapping, newpage, page, 0);
if (rc != MIGRATEPAGE_SUCCESS)
goto unlock_buffers;
diff --git a/mm/nommu.c b/mm/nommu.c
index eb3e2e558da1..fed1b6e9c89b 100644
--- a/mm/nommu.c
+++ b/mm/nommu.c
@@ -1261,7 +1261,9 @@ unsigned long do_mmap(struct file *file,
add_nommu_region(region);
/* clear anonymous mappings that don't ask for uninitialized data */
- if (!vma->vm_file && !(flags & MAP_UNINITIALIZED))
+ if (!vma->vm_file &&
+ (!IS_ENABLED(CONFIG_MMAP_ALLOW_UNINITIALIZED) ||
+ !(flags & MAP_UNINITIALIZED)))
memset((void *)region->vm_start, 0,
region->vm_end - region->vm_start);
diff --git a/mm/page_alloc.c b/mm/page_alloc.c
index 8fd7f45a04eb..272c6de1bf4e 100644
--- a/mm/page_alloc.c
+++ b/mm/page_alloc.c
@@ -450,7 +450,7 @@ static inline unsigned long *get_pageblock_bitmap(struct page *page,
unsigned long pfn)
{
#ifdef CONFIG_SPARSEMEM
- return __pfn_to_section(pfn)->pageblock_flags;
+ return section_to_usemap(__pfn_to_section(pfn));
#else
return page_zone(page)->pageblock_flags;
#endif /* CONFIG_SPARSEMEM */
@@ -4102,7 +4102,6 @@ static int
__perform_reclaim(gfp_t gfp_mask, unsigned int order,
const struct alloc_context *ac)
{
- struct reclaim_state reclaim_state;
int progress;
unsigned int noreclaim_flag;
unsigned long pflags;
@@ -4114,13 +4113,10 @@ __perform_reclaim(gfp_t gfp_mask, unsigned int order,
psi_memstall_enter(&pflags);
fs_reclaim_acquire(gfp_mask);
noreclaim_flag = memalloc_noreclaim_save();
- reclaim_state.reclaimed_slab = 0;
- current->reclaim_state = &reclaim_state;
progress = try_to_free_pages(ac->zonelist, order, gfp_mask,
ac->nodemask);
- current->reclaim_state = NULL;
memalloc_noreclaim_restore(noreclaim_flag);
fs_reclaim_release(gfp_mask);
psi_memstall_leave(&pflags);
@@ -5930,7 +5926,7 @@ void __ref memmap_init_zone_device(struct zone *zone,
unsigned long start = jiffies;
int nid = pgdat->node_id;
- if (WARN_ON_ONCE(!pgmap || !is_dev_zone(zone)))
+ if (WARN_ON_ONCE(!pgmap || zone_idx(zone) != ZONE_DEVICE))
return;
/*
@@ -5978,7 +5974,7 @@ void __ref memmap_init_zone_device(struct zone *zone,
* pfn out of zone.
*
* Please note that MEMMAP_HOTPLUG path doesn't clear memmap
- * because this is done early in sparse_add_one_section
+ * because this is done early in section_activate()
*/
if (!(pfn & (pageblock_nr_pages - 1))) {
set_pageblock_migratetype(page, MIGRATE_MOVABLE);
@@ -7355,12 +7351,18 @@ void __init free_area_init_nodes(unsigned long *max_zone_pfn)
(u64)zone_movable_pfn[i] << PAGE_SHIFT);
}
- /* Print out the early node map */
+ /*
+ * Print out the early node map, and initialize the
+ * subsection-map relative to active online memory ranges to
+ * enable future "sub-section" extensions of the memory map.
+ */
pr_info("Early memory node ranges\n");
- for_each_mem_pfn_range(i, MAX_NUMNODES, &start_pfn, &end_pfn, &nid)
+ for_each_mem_pfn_range(i, MAX_NUMNODES, &start_pfn, &end_pfn, &nid) {
pr_info(" node %3d: [mem %#018Lx-%#018Lx]\n", nid,
(u64)start_pfn << PAGE_SHIFT,
((u64)end_pfn << PAGE_SHIFT) - 1);
+ subsection_map_init(start_pfn, end_pfn - start_pfn);
+ }
/* Initialise every node */
mminit_verify_pageflags_layout();
diff --git a/mm/shmem.c b/mm/shmem.c
index f4dce9c8670d..626d8c74b973 100644
--- a/mm/shmem.c
+++ b/mm/shmem.c
@@ -400,7 +400,7 @@ static bool shmem_confirm_swap(struct address_space *mapping,
static int shmem_huge __read_mostly;
-#if defined(CONFIG_SYSFS) || defined(CONFIG_TMPFS)
+#if defined(CONFIG_SYSFS)
static int shmem_parse_huge(const char *str)
{
if (!strcmp(str, "never"))
@@ -417,7 +417,9 @@ static int shmem_parse_huge(const char *str)
return SHMEM_HUGE_FORCE;
return -EINVAL;
}
+#endif
+#if defined(CONFIG_SYSFS) || defined(CONFIG_TMPFS)
static const char *shmem_format_huge(int huge)
{
switch (huge) {
@@ -3775,10 +3777,6 @@ int __init shmem_init(void)
{
int error;
- /* If rootfs called this, don't re-init */
- if (shmem_inode_cachep)
- return 0;
-
shmem_init_inodecache();
error = register_filesystem(&shmem_fs_type);
@@ -3872,6 +3870,9 @@ bool shmem_huge_enabled(struct vm_area_struct *vma)
loff_t i_size;
pgoff_t off;
+ if ((vma->vm_flags & VM_NOHUGEPAGE) ||
+ test_bit(MMF_DISABLE_THP, &vma->vm_mm->flags))
+ return false;
if (shmem_huge == SHMEM_HUGE_FORCE)
return true;
if (shmem_huge == SHMEM_HUGE_DENY)
diff --git a/mm/slab_common.c b/mm/slab_common.c
index 6c49dbb3769e..807490fe217a 100644
--- a/mm/slab_common.c
+++ b/mm/slab_common.c
@@ -1028,7 +1028,8 @@ struct kmem_cache *__init create_kmalloc_cache(const char *name,
}
struct kmem_cache *
-kmalloc_caches[NR_KMALLOC_TYPES][KMALLOC_SHIFT_HIGH + 1] __ro_after_init;
+kmalloc_caches[NR_KMALLOC_TYPES][KMALLOC_SHIFT_HIGH + 1] __ro_after_init =
+{ /* initialization for https://bugs.llvm.org/show_bug.cgi?id=42570 */ };
EXPORT_SYMBOL(kmalloc_caches);
/*
diff --git a/mm/sparse-vmemmap.c b/mm/sparse-vmemmap.c
index 7fec05796796..200aef686722 100644
--- a/mm/sparse-vmemmap.c
+++ b/mm/sparse-vmemmap.c
@@ -245,19 +245,26 @@ int __meminit vmemmap_populate_basepages(unsigned long start,
return 0;
}
-struct page * __meminit sparse_mem_map_populate(unsigned long pnum, int nid,
- struct vmem_altmap *altmap)
+struct page * __meminit __populate_section_memmap(unsigned long pfn,
+ unsigned long nr_pages, int nid, struct vmem_altmap *altmap)
{
unsigned long start;
unsigned long end;
- struct page *map;
- map = pfn_to_page(pnum * PAGES_PER_SECTION);
- start = (unsigned long)map;
- end = (unsigned long)(map + PAGES_PER_SECTION);
+ /*
+ * The minimum granularity of memmap extensions is
+ * PAGES_PER_SUBSECTION as allocations are tracked in the
+ * 'subsection_map' bitmap of the section.
+ */
+ end = ALIGN(pfn + nr_pages, PAGES_PER_SUBSECTION);
+ pfn &= PAGE_SUBSECTION_MASK;
+ nr_pages = end - pfn;
+
+ start = (unsigned long) pfn_to_page(pfn);
+ end = start + nr_pages * sizeof(struct page);
if (vmemmap_populate(start, end, nid, altmap))
return NULL;
- return map;
+ return pfn_to_page(pfn);
}
diff --git a/mm/sparse.c b/mm/sparse.c
index fd13166949b5..72f010d9bff5 100644
--- a/mm/sparse.c
+++ b/mm/sparse.c
@@ -83,8 +83,15 @@ static int __meminit sparse_index_init(unsigned long section_nr, int nid)
unsigned long root = SECTION_NR_TO_ROOT(section_nr);
struct mem_section *section;
+ /*
+ * An existing section is possible in the sub-section hotplug
+ * case. First hot-add instantiates, follow-on hot-add reuses
+ * the existing section.
+ *
+ * The mem_hotplug_lock resolves the apparent race below.
+ */
if (mem_section[root])
- return -EEXIST;
+ return 0;
section = sparse_index_alloc(nid);
if (!section)
@@ -102,7 +109,7 @@ static inline int sparse_index_init(unsigned long section_nr, int nid)
#endif
#ifdef CONFIG_SPARSEMEM_EXTREME
-int __section_nr(struct mem_section* ms)
+unsigned long __section_nr(struct mem_section *ms)
{
unsigned long root_nr;
struct mem_section *root = NULL;
@@ -121,9 +128,9 @@ int __section_nr(struct mem_section* ms)
return (root_nr * SECTIONS_PER_ROOT) + (ms - root);
}
#else
-int __section_nr(struct mem_section* ms)
+unsigned long __section_nr(struct mem_section *ms)
{
- return (int)(ms - mem_section[0]);
+ return (unsigned long)(ms - mem_section[0]);
}
#endif
@@ -178,10 +185,10 @@ void __meminit mminit_validate_memmodel_limits(unsigned long *start_pfn,
* Keeping track of this gives us an easy way to break out of
* those loops early.
*/
-int __highest_present_section_nr;
+unsigned long __highest_present_section_nr;
static void section_mark_present(struct mem_section *ms)
{
- int section_nr = __section_nr(ms);
+ unsigned long section_nr = __section_nr(ms);
if (section_nr > __highest_present_section_nr)
__highest_present_section_nr = section_nr;
@@ -189,7 +196,7 @@ static void section_mark_present(struct mem_section *ms)
ms->section_mem_map |= SECTION_MARKED_PRESENT;
}
-static inline int next_present_section_nr(int section_nr)
+static inline unsigned long next_present_section_nr(unsigned long section_nr)
{
do {
section_nr++;
@@ -210,6 +217,41 @@ static inline unsigned long first_present_section_nr(void)
return next_present_section_nr(-1);
}
+void subsection_mask_set(unsigned long *map, unsigned long pfn,
+ unsigned long nr_pages)
+{
+ int idx = subsection_map_index(pfn);
+ int end = subsection_map_index(pfn + nr_pages - 1);
+
+ bitmap_set(map, idx, end - idx + 1);
+}
+
+void __init subsection_map_init(unsigned long pfn, unsigned long nr_pages)
+{
+ int end_sec = pfn_to_section_nr(pfn + nr_pages - 1);
+ unsigned long nr, start_sec = pfn_to_section_nr(pfn);
+
+ if (!nr_pages)
+ return;
+
+ for (nr = start_sec; nr <= end_sec; nr++) {
+ struct mem_section *ms;
+ unsigned long pfns;
+
+ pfns = min(nr_pages, PAGES_PER_SECTION
+ - (pfn & ~PAGE_SECTION_MASK));
+ ms = __nr_to_section(nr);
+ subsection_mask_set(ms->usage->subsection_map, pfn, pfns);
+
+ pr_debug("%s: sec: %lu pfns: %lu set(%d, %d)\n", __func__, nr,
+ pfns, subsection_map_index(pfn),
+ subsection_map_index(pfn + pfns - 1));
+
+ pfn += pfns;
+ nr_pages -= pfns;
+ }
+}
+
/* Record a memory area against a node. */
void __init memory_present(int nid, unsigned long start, unsigned long end)
{
@@ -288,33 +330,31 @@ struct page *sparse_decode_mem_map(unsigned long coded_mem_map, unsigned long pn
static void __meminit sparse_init_one_section(struct mem_section *ms,
unsigned long pnum, struct page *mem_map,
- unsigned long *pageblock_bitmap)
+ struct mem_section_usage *usage, unsigned long flags)
{
ms->section_mem_map &= ~SECTION_MAP_MASK;
- ms->section_mem_map |= sparse_encode_mem_map(mem_map, pnum) |
- SECTION_HAS_MEM_MAP;
- ms->pageblock_flags = pageblock_bitmap;
+ ms->section_mem_map |= sparse_encode_mem_map(mem_map, pnum)
+ | SECTION_HAS_MEM_MAP | flags;
+ ms->usage = usage;
}
-unsigned long usemap_size(void)
+static unsigned long usemap_size(void)
{
return BITS_TO_LONGS(SECTION_BLOCKFLAGS_BITS) * sizeof(unsigned long);
}
-#ifdef CONFIG_MEMORY_HOTPLUG
-static unsigned long *__kmalloc_section_usemap(void)
+size_t mem_section_usage_size(void)
{
- return kmalloc(usemap_size(), GFP_KERNEL);
+ return sizeof(struct mem_section_usage) + usemap_size();
}
-#endif /* CONFIG_MEMORY_HOTPLUG */
#ifdef CONFIG_MEMORY_HOTREMOVE
-static unsigned long * __init
+static struct mem_section_usage * __init
sparse_early_usemaps_alloc_pgdat_section(struct pglist_data *pgdat,
unsigned long size)
{
+ struct mem_section_usage *usage;
unsigned long goal, limit;
- unsigned long *p;
int nid;
/*
* A page may contain usemaps for other sections preventing the
@@ -330,15 +370,16 @@ sparse_early_usemaps_alloc_pgdat_section(struct pglist_data *pgdat,
limit = goal + (1UL << PA_SECTION_SHIFT);
nid = early_pfn_to_nid(goal >> PAGE_SHIFT);
again:
- p = memblock_alloc_try_nid(size, SMP_CACHE_BYTES, goal, limit, nid);
- if (!p && limit) {
+ usage = memblock_alloc_try_nid(size, SMP_CACHE_BYTES, goal, limit, nid);
+ if (!usage && limit) {
limit = 0;
goto again;
}
- return p;
+ return usage;
}
-static void __init check_usemap_section_nr(int nid, unsigned long *usemap)
+static void __init check_usemap_section_nr(int nid,
+ struct mem_section_usage *usage)
{
unsigned long usemap_snr, pgdat_snr;
static unsigned long old_usemap_snr;
@@ -352,7 +393,7 @@ static void __init check_usemap_section_nr(int nid, unsigned long *usemap)
old_pgdat_snr = NR_MEM_SECTIONS;
}
- usemap_snr = pfn_to_section_nr(__pa(usemap) >> PAGE_SHIFT);
+ usemap_snr = pfn_to_section_nr(__pa(usage) >> PAGE_SHIFT);
pgdat_snr = pfn_to_section_nr(__pa(pgdat) >> PAGE_SHIFT);
if (usemap_snr == pgdat_snr)
return;
@@ -380,14 +421,15 @@ static void __init check_usemap_section_nr(int nid, unsigned long *usemap)
usemap_snr, pgdat_snr, nid);
}
#else
-static unsigned long * __init
+static struct mem_section_usage * __init
sparse_early_usemaps_alloc_pgdat_section(struct pglist_data *pgdat,
unsigned long size)
{
return memblock_alloc_node(size, SMP_CACHE_BYTES, pgdat->node_id);
}
-static void __init check_usemap_section_nr(int nid, unsigned long *usemap)
+static void __init check_usemap_section_nr(int nid,
+ struct mem_section_usage *usage)
{
}
#endif /* CONFIG_MEMORY_HOTREMOVE */
@@ -404,8 +446,8 @@ static unsigned long __init section_map_size(void)
return PAGE_ALIGN(sizeof(struct page) * PAGES_PER_SECTION);
}
-struct page __init *sparse_mem_map_populate(unsigned long pnum, int nid,
- struct vmem_altmap *altmap)
+struct page __init *__populate_section_memmap(unsigned long pfn,
+ unsigned long nr_pages, int nid, struct vmem_altmap *altmap)
{
unsigned long size = section_map_size();
struct page *map = sparse_buffer_alloc(size);
@@ -474,32 +516,35 @@ static void __init sparse_init_nid(int nid, unsigned long pnum_begin,
unsigned long pnum_end,
unsigned long map_count)
{
- unsigned long pnum, usemap_longs, *usemap;
+ struct mem_section_usage *usage;
+ unsigned long pnum;
struct page *map;
- usemap_longs = BITS_TO_LONGS(SECTION_BLOCKFLAGS_BITS);
- usemap = sparse_early_usemaps_alloc_pgdat_section(NODE_DATA(nid),
- usemap_size() *
- map_count);
- if (!usemap) {
+ usage = sparse_early_usemaps_alloc_pgdat_section(NODE_DATA(nid),
+ mem_section_usage_size() * map_count);
+ if (!usage) {
pr_err("%s: node[%d] usemap allocation failed", __func__, nid);
goto failed;
}
sparse_buffer_init(map_count * section_map_size(), nid);
for_each_present_section_nr(pnum_begin, pnum) {
+ unsigned long pfn = section_nr_to_pfn(pnum);
+
if (pnum >= pnum_end)
break;
- map = sparse_mem_map_populate(pnum, nid, NULL);
+ map = __populate_section_memmap(pfn, PAGES_PER_SECTION,
+ nid, NULL);
if (!map) {
pr_err("%s: node[%d] memory map backing failed. Some memory will not be available.",
__func__, nid);
pnum_begin = pnum;
goto failed;
}
- check_usemap_section_nr(nid, usemap);
- sparse_init_one_section(__nr_to_section(pnum), pnum, map, usemap);
- usemap += usemap_longs;
+ check_usemap_section_nr(nid, usage);
+ sparse_init_one_section(__nr_to_section(pnum), pnum, map, usage,
+ SECTION_IS_EARLY);
+ usage = (void *) usage + mem_section_usage_size();
}
sparse_buffer_fini();
return;
@@ -590,21 +635,20 @@ void offline_mem_sections(unsigned long start_pfn, unsigned long end_pfn)
#endif
#ifdef CONFIG_SPARSEMEM_VMEMMAP
-static inline struct page *kmalloc_section_memmap(unsigned long pnum, int nid,
- struct vmem_altmap *altmap)
+static struct page *populate_section_memmap(unsigned long pfn,
+ unsigned long nr_pages, int nid, struct vmem_altmap *altmap)
{
- /* This will make the necessary allocations eventually. */
- return sparse_mem_map_populate(pnum, nid, altmap);
+ return __populate_section_memmap(pfn, nr_pages, nid, altmap);
}
-static void __kfree_section_memmap(struct page *memmap,
+
+static void depopulate_section_memmap(unsigned long pfn, unsigned long nr_pages,
struct vmem_altmap *altmap)
{
- unsigned long start = (unsigned long)memmap;
- unsigned long end = (unsigned long)(memmap + PAGES_PER_SECTION);
+ unsigned long start = (unsigned long) pfn_to_page(pfn);
+ unsigned long end = start + nr_pages * sizeof(struct page);
vmemmap_free(start, end, altmap);
}
-#ifdef CONFIG_MEMORY_HOTREMOVE
static void free_map_bootmem(struct page *memmap)
{
unsigned long start = (unsigned long)memmap;
@@ -612,9 +656,9 @@ static void free_map_bootmem(struct page *memmap)
vmemmap_free(start, end, NULL);
}
-#endif /* CONFIG_MEMORY_HOTREMOVE */
#else
-static struct page *__kmalloc_section_memmap(void)
+struct page *populate_section_memmap(unsigned long pfn,
+ unsigned long nr_pages, int nid, struct vmem_altmap *altmap)
{
struct page *page, *ret;
unsigned long memmap_size = sizeof(struct page) * PAGES_PER_SECTION;
@@ -635,15 +679,11 @@ got_map_ptr:
return ret;
}
-static inline struct page *kmalloc_section_memmap(unsigned long pnum, int nid,
+static void depopulate_section_memmap(unsigned long pfn, unsigned long nr_pages,
struct vmem_altmap *altmap)
{
- return __kmalloc_section_memmap();
-}
+ struct page *memmap = pfn_to_page(pfn);
-static void __kfree_section_memmap(struct page *memmap,
- struct vmem_altmap *altmap)
-{
if (is_vmalloc_addr(memmap))
vfree(memmap);
else
@@ -651,7 +691,6 @@ static void __kfree_section_memmap(struct page *memmap,
get_order(sizeof(struct page) * PAGES_PER_SECTION));
}
-#ifdef CONFIG_MEMORY_HOTREMOVE
static void free_map_bootmem(struct page *memmap)
{
unsigned long maps_section_nr, removing_section_nr, i;
@@ -681,13 +720,122 @@ static void free_map_bootmem(struct page *memmap)
put_page_bootmem(page);
}
}
-#endif /* CONFIG_MEMORY_HOTREMOVE */
#endif /* CONFIG_SPARSEMEM_VMEMMAP */
+static void section_deactivate(unsigned long pfn, unsigned long nr_pages,
+ struct vmem_altmap *altmap)
+{
+ DECLARE_BITMAP(map, SUBSECTIONS_PER_SECTION) = { 0 };
+ DECLARE_BITMAP(tmp, SUBSECTIONS_PER_SECTION) = { 0 };
+ struct mem_section *ms = __pfn_to_section(pfn);
+ bool section_is_early = early_section(ms);
+ struct page *memmap = NULL;
+ unsigned long *subsection_map = ms->usage
+ ? &ms->usage->subsection_map[0] : NULL;
+
+ subsection_mask_set(map, pfn, nr_pages);
+ if (subsection_map)
+ bitmap_and(tmp, map, subsection_map, SUBSECTIONS_PER_SECTION);
+
+ if (WARN(!subsection_map || !bitmap_equal(tmp, map, SUBSECTIONS_PER_SECTION),
+ "section already deactivated (%#lx + %ld)\n",
+ pfn, nr_pages))
+ return;
+
+ /*
+ * There are 3 cases to handle across two configurations
+ * (SPARSEMEM_VMEMMAP={y,n}):
+ *
+ * 1/ deactivation of a partial hot-added section (only possible
+ * in the SPARSEMEM_VMEMMAP=y case).
+ * a/ section was present at memory init
+ * b/ section was hot-added post memory init
+ * 2/ deactivation of a complete hot-added section
+ * 3/ deactivation of a complete section from memory init
+ *
+ * For 1/, when the subsection_map is not empty we will not be
+ * freeing the usage map, but still need to free the vmemmap
+ * range.
+ *
+ * For 2/ and 3/ the SPARSEMEM_VMEMMAP={y,n} cases are unified
+ */
+ bitmap_xor(subsection_map, map, subsection_map, SUBSECTIONS_PER_SECTION);
+ if (bitmap_empty(subsection_map, SUBSECTIONS_PER_SECTION)) {
+ unsigned long section_nr = pfn_to_section_nr(pfn);
+
+ if (!section_is_early) {
+ kfree(ms->usage);
+ ms->usage = NULL;
+ }
+ memmap = sparse_decode_mem_map(ms->section_mem_map, section_nr);
+ ms->section_mem_map = sparse_encode_mem_map(NULL, section_nr);
+ }
+
+ if (section_is_early && memmap)
+ free_map_bootmem(memmap);
+ else
+ depopulate_section_memmap(pfn, nr_pages, altmap);
+}
+
+static struct page * __meminit section_activate(int nid, unsigned long pfn,
+ unsigned long nr_pages, struct vmem_altmap *altmap)
+{
+ DECLARE_BITMAP(map, SUBSECTIONS_PER_SECTION) = { 0 };
+ struct mem_section *ms = __pfn_to_section(pfn);
+ struct mem_section_usage *usage = NULL;
+ unsigned long *subsection_map;
+ struct page *memmap;
+ int rc = 0;
+
+ subsection_mask_set(map, pfn, nr_pages);
+
+ if (!ms->usage) {
+ usage = kzalloc(mem_section_usage_size(), GFP_KERNEL);
+ if (!usage)
+ return ERR_PTR(-ENOMEM);
+ ms->usage = usage;
+ }
+ subsection_map = &ms->usage->subsection_map[0];
+
+ if (bitmap_empty(map, SUBSECTIONS_PER_SECTION))
+ rc = -EINVAL;
+ else if (bitmap_intersects(map, subsection_map, SUBSECTIONS_PER_SECTION))
+ rc = -EEXIST;
+ else
+ bitmap_or(subsection_map, map, subsection_map,
+ SUBSECTIONS_PER_SECTION);
+
+ if (rc) {
+ if (usage)
+ ms->usage = NULL;
+ kfree(usage);
+ return ERR_PTR(rc);
+ }
+
+ /*
+ * The early init code does not consider partially populated
+ * initial sections, it simply assumes that memory will never be
+ * referenced. If we hot-add memory into such a section then we
+ * do not need to populate the memmap and can simply reuse what
+ * is already there.
+ */
+ if (nr_pages < PAGES_PER_SECTION && early_section(ms))
+ return pfn_to_page(pfn);
+
+ memmap = populate_section_memmap(pfn, nr_pages, nid, altmap);
+ if (!memmap) {
+ section_deactivate(pfn, nr_pages, altmap);
+ return ERR_PTR(-ENOMEM);
+ }
+
+ return memmap;
+}
+
/**
- * sparse_add_one_section - add a memory section
+ * sparse_add_section - add a memory section, or populate an existing one
* @nid: The node to add section on
* @start_pfn: start pfn of the memory range
+ * @nr_pages: number of pfns to add in the section
* @altmap: device page map
*
* This is only intended for hotplug.
@@ -697,56 +845,40 @@ static void free_map_bootmem(struct page *memmap)
* * -EEXIST - Section has been present.
* * -ENOMEM - Out of memory.
*/
-int __meminit sparse_add_one_section(int nid, unsigned long start_pfn,
- struct vmem_altmap *altmap)
+int __meminit sparse_add_section(int nid, unsigned long start_pfn,
+ unsigned long nr_pages, struct vmem_altmap *altmap)
{
unsigned long section_nr = pfn_to_section_nr(start_pfn);
struct mem_section *ms;
struct page *memmap;
- unsigned long *usemap;
int ret;
- /*
- * no locking for this, because it does its own
- * plus, it does a kmalloc
- */
ret = sparse_index_init(section_nr, nid);
- if (ret < 0 && ret != -EEXIST)
+ if (ret < 0)
return ret;
- ret = 0;
- memmap = kmalloc_section_memmap(section_nr, nid, altmap);
- if (!memmap)
- return -ENOMEM;
- usemap = __kmalloc_section_usemap();
- if (!usemap) {
- __kfree_section_memmap(memmap, altmap);
- return -ENOMEM;
- }
- ms = __pfn_to_section(start_pfn);
- if (ms->section_mem_map & SECTION_MARKED_PRESENT) {
- ret = -EEXIST;
- goto out;
- }
+ memmap = section_activate(nid, start_pfn, nr_pages, altmap);
+ if (IS_ERR(memmap))
+ return PTR_ERR(memmap);
/*
* Poison uninitialized struct pages in order to catch invalid flags
* combinations.
*/
- page_init_poison(memmap, sizeof(struct page) * PAGES_PER_SECTION);
+ page_init_poison(pfn_to_page(start_pfn), sizeof(struct page) * nr_pages);
+ ms = __pfn_to_section(start_pfn);
+ set_section_nid(section_nr, nid);
section_mark_present(ms);
- sparse_init_one_section(ms, section_nr, memmap, usemap);
-out:
- if (ret < 0) {
- kfree(usemap);
- __kfree_section_memmap(memmap, altmap);
- }
- return ret;
+ /* Align memmap to section boundary in the subsection case */
+ if (section_nr_to_pfn(section_nr) != start_pfn)
+ memmap = pfn_to_kaddr(section_nr_to_pfn(section_nr));
+ sparse_init_one_section(ms, section_nr, memmap, ms->usage, 0);
+
+ return 0;
}
-#ifdef CONFIG_MEMORY_HOTREMOVE
#ifdef CONFIG_MEMORY_FAILURE
static void clear_hwpoisoned_pages(struct page *memmap, int nr_pages)
{
@@ -777,51 +909,12 @@ static inline void clear_hwpoisoned_pages(struct page *memmap, int nr_pages)
}
#endif
-static void free_section_usemap(struct page *memmap, unsigned long *usemap,
+void sparse_remove_section(struct mem_section *ms, unsigned long pfn,
+ unsigned long nr_pages, unsigned long map_offset,
struct vmem_altmap *altmap)
{
- struct page *usemap_page;
-
- if (!usemap)
- return;
-
- usemap_page = virt_to_page(usemap);
- /*
- * Check to see if allocation came from hot-plug-add
- */
- if (PageSlab(usemap_page) || PageCompound(usemap_page)) {
- kfree(usemap);
- if (memmap)
- __kfree_section_memmap(memmap, altmap);
- return;
- }
-
- /*
- * The usemap came from bootmem. This is packed with other usemaps
- * on the section which has pgdat at boot time. Just keep it as is now.
- */
-
- if (memmap)
- free_map_bootmem(memmap);
-}
-
-void sparse_remove_one_section(struct zone *zone, struct mem_section *ms,
- unsigned long map_offset, struct vmem_altmap *altmap)
-{
- struct page *memmap = NULL;
- unsigned long *usemap = NULL;
-
- if (ms->section_mem_map) {
- usemap = ms->pageblock_flags;
- memmap = sparse_decode_mem_map(ms->section_mem_map,
- __section_nr(ms));
- ms->section_mem_map = 0;
- ms->pageblock_flags = NULL;
- }
-
- clear_hwpoisoned_pages(memmap + map_offset,
- PAGES_PER_SECTION - map_offset);
- free_section_usemap(memmap, usemap, altmap);
+ clear_hwpoisoned_pages(pfn_to_page(pfn) + map_offset,
+ nr_pages - map_offset);
+ section_deactivate(pfn, nr_pages, altmap);
}
-#endif /* CONFIG_MEMORY_HOTREMOVE */
#endif /* CONFIG_MEMORY_HOTPLUG */
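sparse.c now records which 2 MiB subsections of a section are populated in a per-section bitmap, which is what section_activate()/section_deactivate() consult above. A standalone sketch of the subsection_mask_set() index arithmetic, again assuming x86_64 geometry; the subsection_map_index() shown here is a plausible reimplementation for illustration, not copied from the kernel headers:

#include <stdio.h>

#define PAGES_PER_SECTION	32768UL			/* 128 MiB / 4 KiB */
#define PAGES_PER_SUBSECTION	512UL			/* 2 MiB / 4 KiB   */
#define SUBSECTIONS_PER_SECTION	(PAGES_PER_SECTION / PAGES_PER_SUBSECTION)

static unsigned long subsection_map_index(unsigned long pfn)
{
	return (pfn % PAGES_PER_SECTION) / PAGES_PER_SUBSECTION;
}

int main(void)
{
	unsigned long map = 0;				/* 64 subsection bits */
	unsigned long pfn = 1024, nr_pages = 2048;	/* 8 MiB starting 4 MiB into the section */
	unsigned long idx = subsection_map_index(pfn);
	unsigned long end = subsection_map_index(pfn + nr_pages - 1);

	for (unsigned long i = idx; i <= end; i++)	/* bitmap_set(map, idx, end - idx + 1) */
		map |= 1UL << i;

	printf("bits %lu..%lu -> map %#lx of %lu\n",
	       idx, end, map, SUBSECTIONS_PER_SECTION);
	return 0;
}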
diff --git a/mm/swap.c b/mm/swap.c
index 607c48229a1d..ae300397dfda 100644
--- a/mm/swap.c
+++ b/mm/swap.c
@@ -8,7 +8,7 @@
/*
* This file contains the default values for the operation of the
* Linux VM subsystem. Fine-tuning documentation can be found in
- * Documentation/sysctl/vm.txt.
+ * Documentation/admin-guide/sysctl/vm.rst.
* Started 18.12.91
* Swap aging added 23.2.95, Stephen Tweedie.
* Buffermem limits added 12.3.98, Rik van Riel.
diff --git a/mm/util.c b/mm/util.c
index 68575a315dc5..e6351a80f248 100644
--- a/mm/util.c
+++ b/mm/util.c
@@ -7,6 +7,7 @@
#include <linux/err.h>
#include <linux/sched.h>
#include <linux/sched/mm.h>
+#include <linux/sched/signal.h>
#include <linux/sched/task_stack.h>
#include <linux/security.h>
#include <linux/swap.h>
@@ -300,6 +301,80 @@ void arch_pick_mmap_layout(struct mm_struct *mm, struct rlimit *rlim_stack)
}
#endif
+/**
+ * __account_locked_vm - account locked pages to an mm's locked_vm
+ * @mm: mm to account against
+ * @pages: number of pages to account
+ * @inc: %true if @pages should be considered positive, %false if not
+ * @task: task used to check RLIMIT_MEMLOCK
+ * @bypass_rlim: %true if checking RLIMIT_MEMLOCK should be skipped
+ *
+ * Assumes @task and @mm are valid (i.e. at least one reference on each), and
+ * that mmap_sem is held as writer.
+ *
+ * Return:
+ * * 0 on success
+ * * -ENOMEM if RLIMIT_MEMLOCK would be exceeded.
+ */
+int __account_locked_vm(struct mm_struct *mm, unsigned long pages, bool inc,
+ struct task_struct *task, bool bypass_rlim)
+{
+ unsigned long locked_vm, limit;
+ int ret = 0;
+
+ lockdep_assert_held_write(&mm->mmap_sem);
+
+ locked_vm = mm->locked_vm;
+ if (inc) {
+ if (!bypass_rlim) {
+ limit = task_rlimit(task, RLIMIT_MEMLOCK) >> PAGE_SHIFT;
+ if (locked_vm + pages > limit)
+ ret = -ENOMEM;
+ }
+ if (!ret)
+ mm->locked_vm = locked_vm + pages;
+ } else {
+ WARN_ON_ONCE(pages > locked_vm);
+ mm->locked_vm = locked_vm - pages;
+ }
+
+ pr_debug("%s: [%d] caller %ps %c%lu %lu/%lu%s\n", __func__, task->pid,
+ (void *)_RET_IP_, (inc) ? '+' : '-', pages << PAGE_SHIFT,
+ locked_vm << PAGE_SHIFT, task_rlimit(task, RLIMIT_MEMLOCK),
+ ret ? " - exceeded" : "");
+
+ return ret;
+}
+EXPORT_SYMBOL_GPL(__account_locked_vm);
+
+/**
+ * account_locked_vm - account locked pages to an mm's locked_vm
+ * @mm: mm to account against, may be NULL
+ * @pages: number of pages to account
+ * @inc: %true if @pages should be considered positive, %false if not
+ *
+ * Assumes a non-NULL @mm is valid (i.e. at least one reference on it).
+ *
+ * Return:
+ * * 0 on success, or if mm is NULL
+ * * -ENOMEM if RLIMIT_MEMLOCK would be exceeded.
+ */
+int account_locked_vm(struct mm_struct *mm, unsigned long pages, bool inc)
+{
+ int ret;
+
+ if (pages == 0 || !mm)
+ return 0;
+
+ down_write(&mm->mmap_sem);
+ ret = __account_locked_vm(mm, pages, inc, current,
+ capable(CAP_IPC_LOCK));
+ up_write(&mm->mmap_sem);
+
+ return ret;
+}
+EXPORT_SYMBOL_GPL(account_locked_vm);
+
unsigned long vm_mmap_pgoff(struct file *file, unsigned long addr,
unsigned long len, unsigned long prot,
unsigned long flag, unsigned long pgoff)
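account_locked_vm()/__account_locked_vm() centralize the RLIMIT_MEMLOCK bookkeeping that several drivers previously open-coded against mm->locked_vm. A hedged usage sketch (hypothetical driver code, not from this patch; it assumes the declarations are reachable through <linux/mm.h>):

#include <linux/mm.h>

static int pin_buffer(struct mm_struct *mm, unsigned long npages)
{
	int rc;

	rc = account_locked_vm(mm, npages, true);	/* may return -ENOMEM */
	if (rc)
		return rc;

	/* ... actually pin the pages here ... */
	return 0;
}

static void unpin_buffer(struct mm_struct *mm, unsigned long npages)
{
	/* decrement: never fails, warns if it would underflow locked_vm */
	account_locked_vm(mm, npages, false);
}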
diff --git a/mm/vmscan.c b/mm/vmscan.c
index f8e3dcd527b8..44df66a98f2a 100644
--- a/mm/vmscan.c
+++ b/mm/vmscan.c
@@ -131,6 +131,9 @@ struct scan_control {
unsigned int file_taken;
unsigned int taken;
} nr;
+
+ /* for recording the reclaimed slab by now */
+ struct reclaim_state reclaim_state;
};
#ifdef ARCH_HAS_PREFETCH
@@ -238,6 +241,18 @@ static void unregister_memcg_shrinker(struct shrinker *shrinker)
}
#endif /* CONFIG_MEMCG_KMEM */
+static void set_task_reclaim_state(struct task_struct *task,
+ struct reclaim_state *rs)
+{
+ /* Check for an overwrite */
+ WARN_ON_ONCE(rs && task->reclaim_state);
+
+ /* Check for the nulling of an already-nulled member */
+ WARN_ON_ONCE(!rs && !task->reclaim_state);
+
+ task->reclaim_state = rs;
+}
+
#ifdef CONFIG_MEMCG
static bool global_reclaim(struct scan_control *sc)
{
@@ -3191,11 +3206,13 @@ unsigned long try_to_free_pages(struct zonelist *zonelist, int order,
if (throttle_direct_reclaim(sc.gfp_mask, zonelist, nodemask))
return 1;
+ set_task_reclaim_state(current, &sc.reclaim_state);
trace_mm_vmscan_direct_reclaim_begin(order, sc.gfp_mask);
nr_reclaimed = do_try_to_free_pages(zonelist, &sc);
trace_mm_vmscan_direct_reclaim_end(nr_reclaimed);
+ set_task_reclaim_state(current, NULL);
return nr_reclaimed;
}
@@ -3218,6 +3235,7 @@ unsigned long mem_cgroup_shrink_node(struct mem_cgroup *memcg,
};
unsigned long lru_pages;
+ set_task_reclaim_state(current, &sc.reclaim_state);
sc.gfp_mask = (gfp_mask & GFP_RECLAIM_MASK) |
(GFP_HIGHUSER_MOVABLE & ~GFP_RECLAIM_MASK);
@@ -3235,7 +3253,9 @@ unsigned long mem_cgroup_shrink_node(struct mem_cgroup *memcg,
trace_mm_vmscan_memcg_softlimit_reclaim_end(sc.nr_reclaimed);
+ set_task_reclaim_state(current, NULL);
*nr_scanned = sc.nr_scanned;
+
return sc.nr_reclaimed;
}
@@ -3262,6 +3282,7 @@ unsigned long try_to_free_mem_cgroup_pages(struct mem_cgroup *memcg,
.may_shrinkslab = 1,
};
+ set_task_reclaim_state(current, &sc.reclaim_state);
/*
* Unlike direct reclaim via alloc_pages(), memcg's reclaim doesn't
* take care of from where we get pages. So the node where we start the
@@ -3282,6 +3303,7 @@ unsigned long try_to_free_mem_cgroup_pages(struct mem_cgroup *memcg,
psi_memstall_leave(&pflags);
trace_mm_vmscan_memcg_reclaim_end(nr_reclaimed);
+ set_task_reclaim_state(current, NULL);
return nr_reclaimed;
}
@@ -3483,6 +3505,7 @@ static int balance_pgdat(pg_data_t *pgdat, int order, int classzone_idx)
.may_unmap = 1,
};
+ set_task_reclaim_state(current, &sc.reclaim_state);
psi_memstall_enter(&pflags);
__fs_reclaim_acquire();
@@ -3664,6 +3687,8 @@ out:
snapshot_refaults(NULL, pgdat);
__fs_reclaim_release();
psi_memstall_leave(&pflags);
+ set_task_reclaim_state(current, NULL);
+
/*
* Return the order kswapd stopped reclaiming at as
* prepare_kswapd_sleep() takes it into account. If another caller
@@ -3787,15 +3812,10 @@ static int kswapd(void *p)
unsigned int classzone_idx = MAX_NR_ZONES - 1;
pg_data_t *pgdat = (pg_data_t*)p;
struct task_struct *tsk = current;
-
- struct reclaim_state reclaim_state = {
- .reclaimed_slab = 0,
- };
const struct cpumask *cpumask = cpumask_of_node(pgdat->node_id);
if (!cpumask_empty(cpumask))
set_cpus_allowed_ptr(tsk, cpumask);
- current->reclaim_state = &reclaim_state;
/*
* Tell the memory management that we're a "memory allocator",
@@ -3857,7 +3877,6 @@ kswapd_try_sleep:
}
tsk->flags &= ~(PF_MEMALLOC | PF_SWAPWRITE | PF_KSWAPD);
- current->reclaim_state = NULL;
return 0;
}
@@ -3922,7 +3941,6 @@ void wakeup_kswapd(struct zone *zone, gfp_t gfp_flags, int order,
*/
unsigned long shrink_all_memory(unsigned long nr_to_reclaim)
{
- struct reclaim_state reclaim_state;
struct scan_control sc = {
.nr_to_reclaim = nr_to_reclaim,
.gfp_mask = GFP_HIGHUSER_MOVABLE,
@@ -3934,18 +3952,16 @@ unsigned long shrink_all_memory(unsigned long nr_to_reclaim)
.hibernation_mode = 1,
};
struct zonelist *zonelist = node_zonelist(numa_node_id(), sc.gfp_mask);
- struct task_struct *p = current;
unsigned long nr_reclaimed;
unsigned int noreclaim_flag;
fs_reclaim_acquire(sc.gfp_mask);
noreclaim_flag = memalloc_noreclaim_save();
- reclaim_state.reclaimed_slab = 0;
- p->reclaim_state = &reclaim_state;
+ set_task_reclaim_state(current, &sc.reclaim_state);
nr_reclaimed = do_try_to_free_pages(zonelist, &sc);
- p->reclaim_state = NULL;
+ set_task_reclaim_state(current, NULL);
memalloc_noreclaim_restore(noreclaim_flag);
fs_reclaim_release(sc.gfp_mask);
@@ -4110,7 +4126,6 @@ static int __node_reclaim(struct pglist_data *pgdat, gfp_t gfp_mask, unsigned in
/* Minimum pages needed in order to stay on node */
const unsigned long nr_pages = 1 << order;
struct task_struct *p = current;
- struct reclaim_state reclaim_state;
unsigned int noreclaim_flag;
struct scan_control sc = {
.nr_to_reclaim = max(nr_pages, SWAP_CLUSTER_MAX),
@@ -4135,8 +4150,7 @@ static int __node_reclaim(struct pglist_data *pgdat, gfp_t gfp_mask, unsigned in
*/
noreclaim_flag = memalloc_noreclaim_save();
p->flags |= PF_SWAPWRITE;
- reclaim_state.reclaimed_slab = 0;
- p->reclaim_state = &reclaim_state;
+ set_task_reclaim_state(p, &sc.reclaim_state);
if (node_pagecache_reclaimable(pgdat) > pgdat->min_unmapped_pages) {
/*
@@ -4148,7 +4162,7 @@ static int __node_reclaim(struct pglist_data *pgdat, gfp_t gfp_mask, unsigned in
} while (sc.nr_reclaimed < nr_pages && --sc.priority >= 0);
}
- p->reclaim_state = NULL;
+ set_task_reclaim_state(p, NULL);
current->flags &= ~PF_SWAPWRITE;
memalloc_noreclaim_restore(noreclaim_flag);
fs_reclaim_release(sc.gfp_mask);
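The mm/vmscan.c changes above embed reclaim_state in struct scan_control and route every assignment of current->reclaim_state through set_task_reclaim_state(), whose WARN_ON_ONCE checks catch unbalanced set/clear pairs. A condensed, illustrative sketch of the resulting pattern inside vmscan.c (example_reclaim_pass is a made-up wrapper; the helper and do_try_to_free_pages() are the ones used in this patch):

static unsigned long example_reclaim_pass(struct zonelist *zonelist,
					  struct scan_control *sc)
{
	unsigned long nr_reclaimed;

	/* Attach the per-invocation reclaim_state for the whole pass... */
	set_task_reclaim_state(current, &sc->reclaim_state);
	nr_reclaimed = do_try_to_free_pages(zonelist, sc);
	/* ...and always detach it again before returning. */
	set_task_reclaim_state(current, NULL);

	return nr_reclaimed;
}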
diff --git a/mm/z3fold.c b/mm/z3fold.c
index dfcd69d08c1e..1a029a7432ee 100644
--- a/mm/z3fold.c
+++ b/mm/z3fold.c
@@ -26,7 +26,6 @@
#include <linux/atomic.h>
#include <linux/sched.h>
#include <linux/cpumask.h>
-#include <linux/dcache.h>
#include <linux/list.h>
#include <linux/mm.h>
#include <linux/module.h>
@@ -36,12 +35,14 @@
#include <linux/compaction.h>
#include <linux/percpu.h>
#include <linux/mount.h>
+#include <linux/pseudo_fs.h>
#include <linux/fs.h>
#include <linux/preempt.h>
#include <linux/workqueue.h>
#include <linux/slab.h>
#include <linux/spinlock.h>
#include <linux/zpool.h>
+#include <linux/magic.h>
/*
* NCHUNKS_ORDER determines the internal allocation granularity, effectively
@@ -101,6 +102,7 @@ struct z3fold_buddy_slots {
* @refcount: reference count for the z3fold page
* @work: work_struct for page layout optimization
* @slots: pointer to the structure holding buddy slots
+ * @pool: pointer to the containing pool
* @cpu: CPU which this page "belongs" to
* @first_chunks: the size of the first buddy in chunks, 0 if free
* @middle_chunks: the size of the middle buddy in chunks, 0 if free
@@ -114,6 +116,7 @@ struct z3fold_header {
struct kref refcount;
struct work_struct work;
struct z3fold_buddy_slots *slots;
+ struct z3fold_pool *pool;
short cpu;
unsigned short first_chunks;
unsigned short middle_chunks;
@@ -193,8 +196,10 @@ static void compact_page_work(struct work_struct *w);
static inline struct z3fold_buddy_slots *alloc_slots(struct z3fold_pool *pool,
gfp_t gfp)
{
- struct z3fold_buddy_slots *slots = kmem_cache_alloc(pool->c_handle,
- gfp);
+ struct z3fold_buddy_slots *slots;
+
+ slots = kmem_cache_alloc(pool->c_handle,
+ (gfp & ~(__GFP_HIGHMEM | __GFP_MOVABLE)));
if (slots) {
memset(slots->slot, 0, sizeof(slots->slot));
@@ -241,19 +246,14 @@ static inline void free_handle(unsigned long handle)
}
}
-static struct dentry *z3fold_do_mount(struct file_system_type *fs_type,
- int flags, const char *dev_name, void *data)
+static int z3fold_init_fs_context(struct fs_context *fc)
{
- static const struct dentry_operations ops = {
- .d_dname = simple_dname,
- };
-
- return mount_pseudo(fs_type, "z3fold:", NULL, &ops, 0x33);
+ return init_pseudo(fc, Z3FOLD_MAGIC) ? 0 : -ENOMEM;
}
static struct file_system_type z3fold_fs = {
.name = "z3fold",
- .mount = z3fold_do_mount,
+ .init_fs_context = z3fold_init_fs_context,
.kill_sb = kill_anon_super,
};
@@ -320,6 +320,7 @@ static struct z3fold_header *init_z3fold_page(struct page *page,
zhdr->start_middle = 0;
zhdr->cpu = -1;
zhdr->slots = slots;
+ zhdr->pool = pool;
INIT_LIST_HEAD(&zhdr->buddy);
INIT_WORK(&zhdr->work, compact_page_work);
return zhdr;
@@ -426,7 +427,7 @@ static enum buddy handle_to_buddy(unsigned long handle)
static inline struct z3fold_pool *zhdr_to_pool(struct z3fold_header *zhdr)
{
- return slots_to_pool(zhdr->slots);
+ return zhdr->pool;
}
static void __release_z3fold_page(struct z3fold_header *zhdr, bool locked)
@@ -850,7 +851,7 @@ static int z3fold_alloc(struct z3fold_pool *pool, size_t size, gfp_t gfp,
enum buddy bud;
bool can_sleep = gfpflags_allow_blocking(gfp);
- if (!size || (gfp & __GFP_HIGHMEM))
+ if (!size)
return -EINVAL;
if (size > PAGE_SIZE)
@@ -1345,24 +1346,29 @@ static int z3fold_page_migrate(struct address_space *mapping, struct page *newpa
zhdr = page_address(page);
pool = zhdr_to_pool(zhdr);
- if (!trylock_page(page))
- return -EAGAIN;
-
if (!z3fold_page_trylock(zhdr)) {
- unlock_page(page);
return -EAGAIN;
}
if (zhdr->mapped_count != 0) {
z3fold_page_unlock(zhdr);
- unlock_page(page);
return -EBUSY;
}
+ if (work_pending(&zhdr->work)) {
+ z3fold_page_unlock(zhdr);
+ return -EAGAIN;
+ }
new_zhdr = page_address(newpage);
memcpy(new_zhdr, zhdr, PAGE_SIZE);
newpage->private = page->private;
page->private = 0;
z3fold_page_unlock(zhdr);
spin_lock_init(&new_zhdr->page_lock);
+ INIT_WORK(&new_zhdr->work, compact_page_work);
+ /*
+ * z3fold_page_isolate() ensures that new_zhdr->buddy is empty,
+ * so we only have to reinitialize it.
+ */
+ INIT_LIST_HEAD(&new_zhdr->buddy);
new_mapping = page_mapping(page);
__ClearPageMovable(page);
ClearPagePrivate(page);
@@ -1386,7 +1392,6 @@ static int z3fold_page_migrate(struct address_space *mapping, struct page *newpa
queue_work_on(new_zhdr->cpu, pool->compact_wq, &new_zhdr->work);
page_mapcount_reset(page);
- unlock_page(page);
put_page(page);
return 0;
}
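One detail worth noting from the z3fold diff: alloc_slots() now strips __GFP_HIGHMEM and __GFP_MOVABLE before calling into the slab allocator, since those flags are only meaningful for page allocations. A minimal sketch of that sanitizing step (the helper name is illustrative, not from the patch):

/*
 * Illustrative helper: slab objects cannot live in highmem or movable
 * pages, so mask those flags off before kmem_cache_alloc().
 */
#include <linux/slab.h>

static void *slots_alloc_sanitized(struct kmem_cache *cache, gfp_t gfp)
{
	return kmem_cache_alloc(cache, gfp & ~(__GFP_HIGHMEM | __GFP_MOVABLE));
}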
diff --git a/mm/zsmalloc.c b/mm/zsmalloc.c
index db09eb3669c5..57fbb7ced69f 100644
--- a/mm/zsmalloc.c
+++ b/mm/zsmalloc.c
@@ -52,6 +52,7 @@
#include <linux/zsmalloc.h>
#include <linux/zpool.h>
#include <linux/mount.h>
+#include <linux/pseudo_fs.h>
#include <linux/migrate.h>
#include <linux/pagemap.h>
#include <linux/fs.h>
@@ -1798,19 +1799,14 @@ static void lock_zspage(struct zspage *zspage)
} while ((page = get_next_page(page)) != NULL);
}
-static struct dentry *zs_mount(struct file_system_type *fs_type,
- int flags, const char *dev_name, void *data)
+static int zs_init_fs_context(struct fs_context *fc)
{
- static const struct dentry_operations ops = {
- .d_dname = simple_dname,
- };
-
- return mount_pseudo(fs_type, "zsmalloc:", NULL, &ops, ZSMALLOC_MAGIC);
+ return init_pseudo(fc, ZSMALLOC_MAGIC) ? 0 : -ENOMEM;
}
static struct file_system_type zsmalloc_fs = {
.name = "zsmalloc",
- .mount = zs_mount,
+ .init_fs_context = zs_init_fs_context,
.kill_sb = kill_anon_super,
};
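Both z3fold and zsmalloc above convert their internal pseudo filesystems from the legacy .mount/mount_pseudo() pair to .init_fs_context/init_pseudo(). A self-contained sketch of the same pattern, with a made-up filesystem name and magic value purely for illustration:

/*
 * Minimal sketch (EXAMPLEFS_MAGIC and "examplefs" are invented for this
 * example) of new-style pseudo filesystem registration with init_pseudo().
 */
#include <linux/fs.h>
#include <linux/fs_context.h>
#include <linux/pseudo_fs.h>

#define EXAMPLEFS_MAGIC 0x2e2e2e2e	/* illustrative value only */

static int examplefs_init_fs_context(struct fs_context *fc)
{
	return init_pseudo(fc, EXAMPLEFS_MAGIC) ? 0 : -ENOMEM;
}

static struct file_system_type examplefs_fs = {
	.name		= "examplefs",
	.init_fs_context = examplefs_init_fs_context,
	.kill_sb	= kill_anon_super,
};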