summaryrefslogtreecommitdiffstats
path: root/mm
diff options
context:
space:
mode:
Diffstat (limited to 'mm')
-rw-r--r--mm/filemap.c148
-rw-r--r--mm/gup.c10
-rw-r--r--mm/huge_memory.c38
-rw-r--r--mm/hugetlb.c23
-rw-r--r--mm/memblock.c2
-rw-r--r--mm/memcontrol.c2
-rw-r--r--mm/memory_hotplug.c1
-rw-r--r--mm/mempolicy.c35
-rw-r--r--mm/page_alloc.c28
-rw-r--r--mm/page_io.c4
-rw-r--r--mm/page_poison.c6
-rw-r--r--mm/percpu.c2
-rw-r--r--mm/shmem.c6
-rw-r--r--mm/swapfile.c6
-rw-r--r--mm/vmstat.c7
-rw-r--r--mm/z3fold.c101
16 files changed, 280 insertions, 139 deletions
diff --git a/mm/filemap.c b/mm/filemap.c
index 218d0b2ec82d..81adec8ee02c 100644
--- a/mm/filemap.c
+++ b/mm/filemap.c
@@ -2049,7 +2049,7 @@ find_page:
!mapping->a_ops->is_partially_uptodate)
goto page_not_up_to_date;
/* pipes can't handle partially uptodate pages */
- if (unlikely(iter->type & ITER_PIPE))
+ if (unlikely(iov_iter_is_pipe(iter)))
goto page_not_up_to_date;
if (!trylock_page(page))
goto page_not_up_to_date;
@@ -2825,6 +2825,42 @@ struct page *read_cache_page_gfp(struct address_space *mapping,
EXPORT_SYMBOL(read_cache_page_gfp);
/*
+ * Don't operate on ranges the page cache doesn't support, and don't exceed the
+ * LFS limits. If pos is under the limit it becomes a short access. If it
+ * exceeds the limit we return -EFBIG.
+ */
+static int generic_access_check_limits(struct file *file, loff_t pos,
+ loff_t *count)
+{
+ struct inode *inode = file->f_mapping->host;
+ loff_t max_size = inode->i_sb->s_maxbytes;
+
+ if (!(file->f_flags & O_LARGEFILE))
+ max_size = MAX_NON_LFS;
+
+ if (unlikely(pos >= max_size))
+ return -EFBIG;
+ *count = min(*count, max_size - pos);
+ return 0;
+}
+
+static int generic_write_check_limits(struct file *file, loff_t pos,
+ loff_t *count)
+{
+ loff_t limit = rlimit(RLIMIT_FSIZE);
+
+ if (limit != RLIM_INFINITY) {
+ if (pos >= limit) {
+ send_sig(SIGXFSZ, current, 0);
+ return -EFBIG;
+ }
+ *count = min(*count, limit - pos);
+ }
+
+ return generic_access_check_limits(file, pos, count);
+}
+
+/*
* Performs necessary checks before doing a write
*
* Can adjust writing position or amount of bytes to write.
@@ -2835,8 +2871,8 @@ inline ssize_t generic_write_checks(struct kiocb *iocb, struct iov_iter *from)
{
struct file *file = iocb->ki_filp;
struct inode *inode = file->f_mapping->host;
- unsigned long limit = rlimit(RLIMIT_FSIZE);
- loff_t pos;
+ loff_t count;
+ int ret;
if (!iov_iter_count(from))
return 0;
@@ -2845,43 +2881,99 @@ inline ssize_t generic_write_checks(struct kiocb *iocb, struct iov_iter *from)
if (iocb->ki_flags & IOCB_APPEND)
iocb->ki_pos = i_size_read(inode);
- pos = iocb->ki_pos;
-
if ((iocb->ki_flags & IOCB_NOWAIT) && !(iocb->ki_flags & IOCB_DIRECT))
return -EINVAL;
- if (limit != RLIM_INFINITY) {
- if (iocb->ki_pos >= limit) {
- send_sig(SIGXFSZ, current, 0);
- return -EFBIG;
- }
- iov_iter_truncate(from, limit - (unsigned long)pos);
- }
+ count = iov_iter_count(from);
+ ret = generic_write_check_limits(file, iocb->ki_pos, &count);
+ if (ret)
+ return ret;
+
+ iov_iter_truncate(from, count);
+ return iov_iter_count(from);
+}
+EXPORT_SYMBOL(generic_write_checks);
+
+/*
+ * Performs necessary checks before doing a clone.
+ *
+ * Can adjust amount of bytes to clone.
+ * Returns appropriate error code that caller should return or
+ * zero in case the clone should be allowed.
+ */
+int generic_remap_checks(struct file *file_in, loff_t pos_in,
+ struct file *file_out, loff_t pos_out,
+ loff_t *req_count, unsigned int remap_flags)
+{
+ struct inode *inode_in = file_in->f_mapping->host;
+ struct inode *inode_out = file_out->f_mapping->host;
+ uint64_t count = *req_count;
+ uint64_t bcount;
+ loff_t size_in, size_out;
+ loff_t bs = inode_out->i_sb->s_blocksize;
+ int ret;
+
+ /* The start of both ranges must be aligned to an fs block. */
+ if (!IS_ALIGNED(pos_in, bs) || !IS_ALIGNED(pos_out, bs))
+ return -EINVAL;
+
+ /* Ensure offsets don't wrap. */
+ if (pos_in + count < pos_in || pos_out + count < pos_out)
+ return -EINVAL;
+
+ size_in = i_size_read(inode_in);
+ size_out = i_size_read(inode_out);
+
+ /* Dedupe requires both ranges to be within EOF. */
+ if ((remap_flags & REMAP_FILE_DEDUP) &&
+ (pos_in >= size_in || pos_in + count > size_in ||
+ pos_out >= size_out || pos_out + count > size_out))
+ return -EINVAL;
+
+ /* Ensure the infile range is within the infile. */
+ if (pos_in >= size_in)
+ return -EINVAL;
+ count = min(count, size_in - (uint64_t)pos_in);
+
+ ret = generic_access_check_limits(file_in, pos_in, &count);
+ if (ret)
+ return ret;
+
+ ret = generic_write_check_limits(file_out, pos_out, &count);
+ if (ret)
+ return ret;
/*
- * LFS rule
+ * If the user wanted us to link to the infile's EOF, round up to the
+ * next block boundary for this check.
+ *
+ * Otherwise, make sure the count is also block-aligned, having
+ * already confirmed the starting offsets' block alignment.
*/
- if (unlikely(pos + iov_iter_count(from) > MAX_NON_LFS &&
- !(file->f_flags & O_LARGEFILE))) {
- if (pos >= MAX_NON_LFS)
- return -EFBIG;
- iov_iter_truncate(from, MAX_NON_LFS - (unsigned long)pos);
+ if (pos_in + count == size_in) {
+ bcount = ALIGN(size_in, bs) - pos_in;
+ } else {
+ if (!IS_ALIGNED(count, bs))
+ count = ALIGN_DOWN(count, bs);
+ bcount = count;
}
+ /* Don't allow overlapped cloning within the same file. */
+ if (inode_in == inode_out &&
+ pos_out + bcount > pos_in &&
+ pos_out < pos_in + bcount)
+ return -EINVAL;
+
/*
- * Are we about to exceed the fs block limit ?
- *
- * If we have written data it becomes a short write. If we have
- * exceeded without writing data we send a signal and return EFBIG.
- * Linus frestrict idea will clean these up nicely..
+ * We shortened the request but the caller can't deal with that, so
+ * bounce the request back to userspace.
*/
- if (unlikely(pos >= inode->i_sb->s_maxbytes))
- return -EFBIG;
+ if (*req_count != count && !(remap_flags & REMAP_FILE_CAN_SHORTEN))
+ return -EINVAL;
- iov_iter_truncate(from, inode->i_sb->s_maxbytes - pos);
- return iov_iter_count(from);
+ *req_count = count;
+ return 0;
}
-EXPORT_SYMBOL(generic_write_checks);
int pagecache_write_begin(struct file *file, struct address_space *mapping,
loff_t pos, unsigned len, unsigned flags,
diff --git a/mm/gup.c b/mm/gup.c
index f76e77a2d34b..aa43620a3270 100644
--- a/mm/gup.c
+++ b/mm/gup.c
@@ -385,11 +385,17 @@ static struct page *follow_p4d_mask(struct vm_area_struct *vma,
* @vma: vm_area_struct mapping @address
* @address: virtual address to look up
* @flags: flags modifying lookup behaviour
- * @page_mask: on output, *page_mask is set according to the size of the page
+ * @ctx: contains dev_pagemap for %ZONE_DEVICE memory pinning and a
+ * pointer to output page_mask
*
* @flags can have FOLL_ flags set, defined in <linux/mm.h>
*
- * Returns the mapped (struct page *), %NULL if no mapping exists, or
+ * When getting pages from ZONE_DEVICE memory, the @ctx->pgmap caches
+ * the device's dev_pagemap metadata to avoid repeating expensive lookups.
+ *
+ * On output, the @ctx->page_mask is set according to the size of the page.
+ *
+ * Return: the mapped (struct page *), %NULL if no mapping exists, or
* an error pointer if there is a mapping to something not represented
* by a page descriptor (see also vm_normal_page()).
*/
diff --git a/mm/huge_memory.c b/mm/huge_memory.c
index 4e4ef8fa479d..55478ab3c83b 100644
--- a/mm/huge_memory.c
+++ b/mm/huge_memory.c
@@ -629,21 +629,40 @@ release:
* available
* never: never stall for any thp allocation
*/
-static inline gfp_t alloc_hugepage_direct_gfpmask(struct vm_area_struct *vma)
+static inline gfp_t alloc_hugepage_direct_gfpmask(struct vm_area_struct *vma, unsigned long addr)
{
const bool vma_madvised = !!(vma->vm_flags & VM_HUGEPAGE);
+ gfp_t this_node = 0;
+
+#ifdef CONFIG_NUMA
+ struct mempolicy *pol;
+ /*
+ * __GFP_THISNODE is used only when __GFP_DIRECT_RECLAIM is not
+ * specified, to express a general desire to stay on the current
+ * node for optimistic allocation attempts. If the defrag mode
+ * and/or madvise hint requires the direct reclaim then we prefer
+ * to fallback to other node rather than node reclaim because that
+ * can lead to excessive reclaim even though there is free memory
+ * on other nodes. We expect that NUMA preferences are specified
+ * by memory policies.
+ */
+ pol = get_vma_policy(vma, addr);
+ if (pol->mode != MPOL_BIND)
+ this_node = __GFP_THISNODE;
+ mpol_cond_put(pol);
+#endif
if (test_bit(TRANSPARENT_HUGEPAGE_DEFRAG_DIRECT_FLAG, &transparent_hugepage_flags))
return GFP_TRANSHUGE | (vma_madvised ? 0 : __GFP_NORETRY);
if (test_bit(TRANSPARENT_HUGEPAGE_DEFRAG_KSWAPD_FLAG, &transparent_hugepage_flags))
- return GFP_TRANSHUGE_LIGHT | __GFP_KSWAPD_RECLAIM;
+ return GFP_TRANSHUGE_LIGHT | __GFP_KSWAPD_RECLAIM | this_node;
if (test_bit(TRANSPARENT_HUGEPAGE_DEFRAG_KSWAPD_OR_MADV_FLAG, &transparent_hugepage_flags))
return GFP_TRANSHUGE_LIGHT | (vma_madvised ? __GFP_DIRECT_RECLAIM :
- __GFP_KSWAPD_RECLAIM);
+ __GFP_KSWAPD_RECLAIM | this_node);
if (test_bit(TRANSPARENT_HUGEPAGE_DEFRAG_REQ_MADV_FLAG, &transparent_hugepage_flags))
return GFP_TRANSHUGE_LIGHT | (vma_madvised ? __GFP_DIRECT_RECLAIM :
- 0);
- return GFP_TRANSHUGE_LIGHT;
+ this_node);
+ return GFP_TRANSHUGE_LIGHT | this_node;
}
/* Caller must hold page table lock. */
@@ -715,8 +734,8 @@ vm_fault_t do_huge_pmd_anonymous_page(struct vm_fault *vmf)
pte_free(vma->vm_mm, pgtable);
return ret;
}
- gfp = alloc_hugepage_direct_gfpmask(vma);
- page = alloc_hugepage_vma(gfp, vma, haddr, HPAGE_PMD_ORDER);
+ gfp = alloc_hugepage_direct_gfpmask(vma, haddr);
+ page = alloc_pages_vma(gfp, HPAGE_PMD_ORDER, vma, haddr, numa_node_id());
if (unlikely(!page)) {
count_vm_event(THP_FAULT_FALLBACK);
return VM_FAULT_FALLBACK;
@@ -1286,8 +1305,9 @@ vm_fault_t do_huge_pmd_wp_page(struct vm_fault *vmf, pmd_t orig_pmd)
alloc:
if (transparent_hugepage_enabled(vma) &&
!transparent_hugepage_debug_cow()) {
- huge_gfp = alloc_hugepage_direct_gfpmask(vma);
- new_page = alloc_hugepage_vma(huge_gfp, vma, haddr, HPAGE_PMD_ORDER);
+ huge_gfp = alloc_hugepage_direct_gfpmask(vma, haddr);
+ new_page = alloc_pages_vma(huge_gfp, HPAGE_PMD_ORDER, vma,
+ haddr, numa_node_id());
} else
new_page = NULL;
diff --git a/mm/hugetlb.c b/mm/hugetlb.c
index c007fb5fb8d5..7f2a28ab46d5 100644
--- a/mm/hugetlb.c
+++ b/mm/hugetlb.c
@@ -3233,7 +3233,7 @@ static int is_hugetlb_entry_hwpoisoned(pte_t pte)
int copy_hugetlb_page_range(struct mm_struct *dst, struct mm_struct *src,
struct vm_area_struct *vma)
{
- pte_t *src_pte, *dst_pte, entry;
+ pte_t *src_pte, *dst_pte, entry, dst_entry;
struct page *ptepage;
unsigned long addr;
int cow;
@@ -3261,15 +3261,30 @@ int copy_hugetlb_page_range(struct mm_struct *dst, struct mm_struct *src,
break;
}
- /* If the pagetables are shared don't copy or take references */
- if (dst_pte == src_pte)
+ /*
+ * If the pagetables are shared don't copy or take references.
+ * dst_pte == src_pte is the common case of src/dest sharing.
+ *
+ * However, src could have 'unshared' and dst shares with
+ * another vma. If dst_pte !none, this implies sharing.
+ * Check here before taking page table lock, and once again
+ * after taking the lock below.
+ */
+ dst_entry = huge_ptep_get(dst_pte);
+ if ((dst_pte == src_pte) || !huge_pte_none(dst_entry))
continue;
dst_ptl = huge_pte_lock(h, dst, dst_pte);
src_ptl = huge_pte_lockptr(h, src, src_pte);
spin_lock_nested(src_ptl, SINGLE_DEPTH_NESTING);
entry = huge_ptep_get(src_pte);
- if (huge_pte_none(entry)) { /* skip none entry */
+ dst_entry = huge_ptep_get(dst_pte);
+ if (huge_pte_none(entry) || !huge_pte_none(dst_entry)) {
+ /*
+ * Skip if src entry none. Also, skip in the
+ * unlikely case dst entry !none as this implies
+ * sharing with another vma.
+ */
;
} else if (unlikely(is_hugetlb_entry_migration(entry) ||
is_hugetlb_entry_hwpoisoned(entry))) {
diff --git a/mm/memblock.c b/mm/memblock.c
index 7df468c8ebc8..9a2d5ae81ae1 100644
--- a/mm/memblock.c
+++ b/mm/memblock.c
@@ -1179,7 +1179,7 @@ void __init_memblock __next_mem_range_rev(u64 *idx, int nid,
#ifdef CONFIG_HAVE_MEMBLOCK_NODE_MAP
/*
- * Common iterator interface used to define for_each_mem_range().
+ * Common iterator interface used to define for_each_mem_pfn_range().
*/
void __init_memblock __next_mem_pfn_range(int *idx, int nid,
unsigned long *out_start_pfn,
diff --git a/mm/memcontrol.c b/mm/memcontrol.c
index 54920cbc46bf..6e1469b80cb7 100644
--- a/mm/memcontrol.c
+++ b/mm/memcontrol.c
@@ -2593,7 +2593,7 @@ int memcg_kmem_charge(struct page *page, gfp_t gfp, int order)
struct mem_cgroup *memcg;
int ret = 0;
- if (memcg_kmem_bypass())
+ if (mem_cgroup_disabled() || memcg_kmem_bypass())
return 0;
memcg = get_mem_cgroup_from_current();
diff --git a/mm/memory_hotplug.c b/mm/memory_hotplug.c
index 61972da38d93..2b2b3ccbbfb5 100644
--- a/mm/memory_hotplug.c
+++ b/mm/memory_hotplug.c
@@ -586,6 +586,7 @@ int __remove_pages(struct zone *zone, unsigned long phys_start_pfn,
for (i = 0; i < sections_to_remove; i++) {
unsigned long pfn = phys_start_pfn + i*PAGES_PER_SECTION;
+ cond_resched();
ret = __remove_section(zone, __pfn_to_section(pfn), map_offset,
altmap);
map_offset = 0;
diff --git a/mm/mempolicy.c b/mm/mempolicy.c
index cfd26d7e61a1..5837a067124d 100644
--- a/mm/mempolicy.c
+++ b/mm/mempolicy.c
@@ -1116,8 +1116,8 @@ static struct page *new_page(struct page *page, unsigned long start)
} else if (PageTransHuge(page)) {
struct page *thp;
- thp = alloc_hugepage_vma(GFP_TRANSHUGE, vma, address,
- HPAGE_PMD_ORDER);
+ thp = alloc_pages_vma(GFP_TRANSHUGE, HPAGE_PMD_ORDER, vma,
+ address, numa_node_id());
if (!thp)
return NULL;
prep_transhuge_page(thp);
@@ -1662,7 +1662,7 @@ struct mempolicy *__get_vma_policy(struct vm_area_struct *vma,
* freeing by another task. It is the caller's responsibility to free the
* extra reference for shared policies.
*/
-static struct mempolicy *get_vma_policy(struct vm_area_struct *vma,
+struct mempolicy *get_vma_policy(struct vm_area_struct *vma,
unsigned long addr)
{
struct mempolicy *pol = __get_vma_policy(vma, addr);
@@ -2011,7 +2011,6 @@ static struct page *alloc_page_interleave(gfp_t gfp, unsigned order,
* @vma: Pointer to VMA or NULL if not available.
* @addr: Virtual Address of the allocation. Must be inside the VMA.
* @node: Which node to prefer for allocation (modulo policy).
- * @hugepage: for hugepages try only the preferred node if possible
*
* This function allocates a page from the kernel page pool and applies
* a NUMA policy associated with the VMA or the current process.
@@ -2022,7 +2021,7 @@ static struct page *alloc_page_interleave(gfp_t gfp, unsigned order,
*/
struct page *
alloc_pages_vma(gfp_t gfp, int order, struct vm_area_struct *vma,
- unsigned long addr, int node, bool hugepage)
+ unsigned long addr, int node)
{
struct mempolicy *pol;
struct page *page;
@@ -2040,32 +2039,6 @@ alloc_pages_vma(gfp_t gfp, int order, struct vm_area_struct *vma,
goto out;
}
- if (unlikely(IS_ENABLED(CONFIG_TRANSPARENT_HUGEPAGE) && hugepage)) {
- int hpage_node = node;
-
- /*
- * For hugepage allocation and non-interleave policy which
- * allows the current node (or other explicitly preferred
- * node) we only try to allocate from the current/preferred
- * node and don't fall back to other nodes, as the cost of
- * remote accesses would likely offset THP benefits.
- *
- * If the policy is interleave, or does not allow the current
- * node in its nodemask, we allocate the standard way.
- */
- if (pol->mode == MPOL_PREFERRED &&
- !(pol->flags & MPOL_F_LOCAL))
- hpage_node = pol->v.preferred_node;
-
- nmask = policy_nodemask(gfp, pol);
- if (!nmask || node_isset(hpage_node, *nmask)) {
- mpol_cond_put(pol);
- page = __alloc_pages_node(hpage_node,
- gfp | __GFP_THISNODE, order);
- goto out;
- }
- }
-
nmask = policy_nodemask(gfp, pol);
preferred_nid = policy_node(gfp, pol, node);
page = __alloc_pages_nodemask(gfp, order, preferred_nid, nmask);
diff --git a/mm/page_alloc.c b/mm/page_alloc.c
index a919ba5cb3c8..6847177dc4a1 100644
--- a/mm/page_alloc.c
+++ b/mm/page_alloc.c
@@ -4061,17 +4061,6 @@ __alloc_pages_slowpath(gfp_t gfp_mask, unsigned int order,
int reserve_flags;
/*
- * In the slowpath, we sanity check order to avoid ever trying to
- * reclaim >= MAX_ORDER areas which will never succeed. Callers may
- * be using allocators in order of preference for an area that is
- * too large.
- */
- if (order >= MAX_ORDER) {
- WARN_ON_ONCE(!(gfp_mask & __GFP_NOWARN));
- return NULL;
- }
-
- /*
* We also sanity check to catch abuse of atomic reserves being used by
* callers that are not in atomic context.
*/
@@ -4364,6 +4353,15 @@ __alloc_pages_nodemask(gfp_t gfp_mask, unsigned int order, int preferred_nid,
gfp_t alloc_mask; /* The gfp_t that was actually used for allocation */
struct alloc_context ac = { };
+ /*
+ * There are several places where we assume that the order value is sane
+ * so bail out early if the request is out of bound.
+ */
+ if (unlikely(order >= MAX_ORDER)) {
+ WARN_ON_ONCE(!(gfp_mask & __GFP_NOWARN));
+ return NULL;
+ }
+
gfp_mask &= gfp_allowed_mask;
alloc_mask = gfp_mask;
if (!prepare_alloc_pages(gfp_mask, order, preferred_nid, nodemask, &ac, &alloc_mask, &alloc_flags))
@@ -7789,6 +7787,14 @@ bool has_unmovable_pages(struct zone *zone, struct page *page, int count,
goto unmovable;
/*
+ * If the zone is movable and we have ruled out all reserved
+ * pages then it should be reasonably safe to assume the rest
+ * is movable.
+ */
+ if (zone_idx(zone) == ZONE_MOVABLE)
+ continue;
+
+ /*
* Hugepages are not in LRU lists, but they're movable.
* We need not scan over tail pages bacause we don't
* handle each tail page individually in migration.
diff --git a/mm/page_io.c b/mm/page_io.c
index a451ffa9491c..d4d1c89bcddd 100644
--- a/mm/page_io.c
+++ b/mm/page_io.c
@@ -294,7 +294,7 @@ int __swap_writepage(struct page *page, struct writeback_control *wbc,
};
struct iov_iter from;
- iov_iter_bvec(&from, ITER_BVEC | WRITE, &bv, 1, PAGE_SIZE);
+ iov_iter_bvec(&from, WRITE, &bv, 1, PAGE_SIZE);
init_sync_kiocb(&kiocb, swap_file);
kiocb.ki_pos = page_file_offset(page);
@@ -339,7 +339,7 @@ int __swap_writepage(struct page *page, struct writeback_control *wbc,
goto out;
}
bio->bi_opf = REQ_OP_WRITE | REQ_SWAP | wbc_to_write_flags(wbc);
- bio_associate_blkg_from_page(bio, page);
+ bio_associate_blkcg_from_page(bio, page);
count_swpout_vm_event(page);
set_page_writeback(page);
unlock_page(page);
diff --git a/mm/page_poison.c b/mm/page_poison.c
index f7e2a676365a..f0c15e9017c0 100644
--- a/mm/page_poison.c
+++ b/mm/page_poison.c
@@ -17,6 +17,11 @@ static int __init early_page_poison_param(char *buf)
}
early_param("page_poison", early_page_poison_param);
+/**
+ * page_poisoning_enabled - check if page poisoning is enabled
+ *
+ * Return true if page poisoning is enabled, or false if not.
+ */
bool page_poisoning_enabled(void)
{
/*
@@ -29,6 +34,7 @@ bool page_poisoning_enabled(void)
(!IS_ENABLED(CONFIG_ARCH_SUPPORTS_DEBUG_PAGEALLOC) &&
debug_pagealloc_enabled()));
}
+EXPORT_SYMBOL_GPL(page_poisoning_enabled);
static void poison_page(struct page *page)
{
diff --git a/mm/percpu.c b/mm/percpu.c
index a6b74c6fe0be..db86282fd024 100644
--- a/mm/percpu.c
+++ b/mm/percpu.c
@@ -2591,7 +2591,7 @@ int __init pcpu_page_first_chunk(size_t reserved_size,
BUG_ON(ai->nr_groups != 1);
upa = ai->alloc_size/ai->unit_size;
nr_g0_units = roundup(num_possible_cpus(), upa);
- if (unlikely(WARN_ON(ai->groups[0].nr_units != nr_g0_units))) {
+ if (WARN_ON(ai->groups[0].nr_units != nr_g0_units)) {
pcpu_free_alloc_info(ai);
return -EINVAL;
}
diff --git a/mm/shmem.c b/mm/shmem.c
index 56bf122e0bb4..d44991ea5ed4 100644
--- a/mm/shmem.c
+++ b/mm/shmem.c
@@ -1435,7 +1435,7 @@ static struct page *shmem_alloc_hugepage(gfp_t gfp,
shmem_pseudo_vma_init(&pvma, info, hindex);
page = alloc_pages_vma(gfp | __GFP_COMP | __GFP_NORETRY | __GFP_NOWARN,
- HPAGE_PMD_ORDER, &pvma, 0, numa_node_id(), true);
+ HPAGE_PMD_ORDER, &pvma, 0, numa_node_id());
shmem_pseudo_vma_destroy(&pvma);
if (page)
prep_transhuge_page(page);
@@ -2563,9 +2563,7 @@ static loff_t shmem_file_llseek(struct file *file, loff_t offset, int whence)
inode_lock(inode);
/* We're holding i_mutex so we can access i_size directly */
- if (offset < 0)
- offset = -EINVAL;
- else if (offset >= inode->i_size)
+ if (offset < 0 || offset >= inode->i_size)
offset = -ENXIO;
else {
start = offset >> PAGE_SHIFT;
diff --git a/mm/swapfile.c b/mm/swapfile.c
index 644f746e167a..8688ae65ef58 100644
--- a/mm/swapfile.c
+++ b/mm/swapfile.c
@@ -2813,7 +2813,7 @@ static struct swap_info_struct *alloc_swap_info(void)
unsigned int type;
int i;
- p = kzalloc(sizeof(*p), GFP_KERNEL);
+ p = kvzalloc(sizeof(*p), GFP_KERNEL);
if (!p)
return ERR_PTR(-ENOMEM);
@@ -2824,7 +2824,7 @@ static struct swap_info_struct *alloc_swap_info(void)
}
if (type >= MAX_SWAPFILES) {
spin_unlock(&swap_lock);
- kfree(p);
+ kvfree(p);
return ERR_PTR(-EPERM);
}
if (type >= nr_swapfiles) {
@@ -2838,7 +2838,7 @@ static struct swap_info_struct *alloc_swap_info(void)
smp_wmb();
nr_swapfiles++;
} else {
- kfree(p);
+ kvfree(p);
p = swap_info[type];
/*
* Do not memset this entry: a racing procfs swap_next()
diff --git a/mm/vmstat.c b/mm/vmstat.c
index 6038ce593ce3..9c624595e904 100644
--- a/mm/vmstat.c
+++ b/mm/vmstat.c
@@ -1827,12 +1827,13 @@ static bool need_update(int cpu)
/*
* The fast way of checking if there are any vmstat diffs.
- * This works because the diffs are byte sized items.
*/
- if (memchr_inv(p->vm_stat_diff, 0, NR_VM_ZONE_STAT_ITEMS))
+ if (memchr_inv(p->vm_stat_diff, 0, NR_VM_ZONE_STAT_ITEMS *
+ sizeof(p->vm_stat_diff[0])))
return true;
#ifdef CONFIG_NUMA
- if (memchr_inv(p->vm_numa_stat_diff, 0, NR_VM_NUMA_STAT_ITEMS))
+ if (memchr_inv(p->vm_numa_stat_diff, 0, NR_VM_NUMA_STAT_ITEMS *
+ sizeof(p->vm_numa_stat_diff[0])))
return true;
#endif
}
diff --git a/mm/z3fold.c b/mm/z3fold.c
index 4b366d181f35..aee9b0b8d907 100644
--- a/mm/z3fold.c
+++ b/mm/z3fold.c
@@ -99,6 +99,7 @@ struct z3fold_header {
#define NCHUNKS ((PAGE_SIZE - ZHDR_SIZE_ALIGNED) >> CHUNK_SHIFT)
#define BUDDY_MASK (0x3)
+#define BUDDY_SHIFT 2
/**
* struct z3fold_pool - stores metadata for each z3fold pool
@@ -145,7 +146,7 @@ enum z3fold_page_flags {
MIDDLE_CHUNK_MAPPED,
NEEDS_COMPACTING,
PAGE_STALE,
- UNDER_RECLAIM
+ PAGE_CLAIMED, /* by either reclaim or free */
};
/*****************
@@ -174,7 +175,7 @@ static struct z3fold_header *init_z3fold_page(struct page *page,
clear_bit(MIDDLE_CHUNK_MAPPED, &page->private);
clear_bit(NEEDS_COMPACTING, &page->private);
clear_bit(PAGE_STALE, &page->private);
- clear_bit(UNDER_RECLAIM, &page->private);
+ clear_bit(PAGE_CLAIMED, &page->private);
spin_lock_init(&zhdr->page_lock);
kref_init(&zhdr->refcount);
@@ -223,8 +224,11 @@ static unsigned long encode_handle(struct z3fold_header *zhdr, enum buddy bud)
unsigned long handle;
handle = (unsigned long)zhdr;
- if (bud != HEADLESS)
- handle += (bud + zhdr->first_num) & BUDDY_MASK;
+ if (bud != HEADLESS) {
+ handle |= (bud + zhdr->first_num) & BUDDY_MASK;
+ if (bud == LAST)
+ handle |= (zhdr->last_chunks << BUDDY_SHIFT);
+ }
return handle;
}
@@ -234,6 +238,12 @@ static struct z3fold_header *handle_to_z3fold_header(unsigned long handle)
return (struct z3fold_header *)(handle & PAGE_MASK);
}
+/* only for LAST bud, returns zero otherwise */
+static unsigned short handle_to_chunks(unsigned long handle)
+{
+ return (handle & ~PAGE_MASK) >> BUDDY_SHIFT;
+}
+
/*
* (handle & BUDDY_MASK) < zhdr->first_num is possible in encode_handle
* but that doesn't matter. because the masking will result in the
@@ -720,37 +730,39 @@ static void z3fold_free(struct z3fold_pool *pool, unsigned long handle)
page = virt_to_page(zhdr);
if (test_bit(PAGE_HEADLESS, &page->private)) {
- /* HEADLESS page stored */
- bud = HEADLESS;
- } else {
- z3fold_page_lock(zhdr);
- bud = handle_to_buddy(handle);
-
- switch (bud) {
- case FIRST:
- zhdr->first_chunks = 0;
- break;
- case MIDDLE:
- zhdr->middle_chunks = 0;
- zhdr->start_middle = 0;
- break;
- case LAST:
- zhdr->last_chunks = 0;
- break;
- default:
- pr_err("%s: unknown bud %d\n", __func__, bud);
- WARN_ON(1);
- z3fold_page_unlock(zhdr);
- return;
+ /* if a headless page is under reclaim, just leave.
+ * NB: we use test_and_set_bit for a reason: if the bit
+ * has not been set before, we release this page
+ * immediately so we don't care about its value any more.
+ */
+ if (!test_and_set_bit(PAGE_CLAIMED, &page->private)) {
+ spin_lock(&pool->lock);
+ list_del(&page->lru);
+ spin_unlock(&pool->lock);
+ free_z3fold_page(page);
+ atomic64_dec(&pool->pages_nr);
}
+ return;
}
- if (bud == HEADLESS) {
- spin_lock(&pool->lock);
- list_del(&page->lru);
- spin_unlock(&pool->lock);
- free_z3fold_page(page);
- atomic64_dec(&pool->pages_nr);
+ /* Non-headless case */
+ z3fold_page_lock(zhdr);
+ bud = handle_to_buddy(handle);
+
+ switch (bud) {
+ case FIRST:
+ zhdr->first_chunks = 0;
+ break;
+ case MIDDLE:
+ zhdr->middle_chunks = 0;
+ break;
+ case LAST:
+ zhdr->last_chunks = 0;
+ break;
+ default:
+ pr_err("%s: unknown bud %d\n", __func__, bud);
+ WARN_ON(1);
+ z3fold_page_unlock(zhdr);
return;
}
@@ -758,7 +770,7 @@ static void z3fold_free(struct z3fold_pool *pool, unsigned long handle)
atomic64_dec(&pool->pages_nr);
return;
}
- if (test_bit(UNDER_RECLAIM, &page->private)) {
+ if (test_bit(PAGE_CLAIMED, &page->private)) {
z3fold_page_unlock(zhdr);
return;
}
@@ -836,20 +848,30 @@ static int z3fold_reclaim_page(struct z3fold_pool *pool, unsigned int retries)
}
list_for_each_prev(pos, &pool->lru) {
page = list_entry(pos, struct page, lru);
+
+ /* this bit could have been set by free, in which case
+ * we pass over to the next page in the pool.
+ */
+ if (test_and_set_bit(PAGE_CLAIMED, &page->private))
+ continue;
+
+ zhdr = page_address(page);
if (test_bit(PAGE_HEADLESS, &page->private))
- /* candidate found */
break;
- zhdr = page_address(page);
- if (!z3fold_page_trylock(zhdr))
+ if (!z3fold_page_trylock(zhdr)) {
+ zhdr = NULL;
continue; /* can't evict at this point */
+ }
kref_get(&zhdr->refcount);
list_del_init(&zhdr->buddy);
zhdr->cpu = -1;
- set_bit(UNDER_RECLAIM, &page->private);
break;
}
+ if (!zhdr)
+ break;
+
list_del_init(&page->lru);
spin_unlock(&pool->lock);
@@ -898,6 +920,7 @@ next:
if (test_bit(PAGE_HEADLESS, &page->private)) {
if (ret == 0) {
free_z3fold_page(page);
+ atomic64_dec(&pool->pages_nr);
return 0;
}
spin_lock(&pool->lock);
@@ -905,7 +928,7 @@ next:
spin_unlock(&pool->lock);
} else {
z3fold_page_lock(zhdr);
- clear_bit(UNDER_RECLAIM, &page->private);
+ clear_bit(PAGE_CLAIMED, &page->private);
if (kref_put(&zhdr->refcount,
release_z3fold_page_locked)) {
atomic64_dec(&pool->pages_nr);
@@ -964,7 +987,7 @@ static void *z3fold_map(struct z3fold_pool *pool, unsigned long handle)
set_bit(MIDDLE_CHUNK_MAPPED, &page->private);
break;
case LAST:
- addr += PAGE_SIZE - (zhdr->last_chunks << CHUNK_SHIFT);
+ addr += PAGE_SIZE - (handle_to_chunks(handle) << CHUNK_SHIFT);
break;
default:
pr_err("unknown buddy id %d\n", buddy);
OpenPOWER on IntegriCloud