Diffstat (limited to 'mm')
-rw-r--r--  mm/Kconfig          1
-rw-r--r--  mm/backing-dev.c  186
-rw-r--r--  mm/filemap.c       33
-rw-r--r--  mm/gup.c          148
-rw-r--r--  mm/memory.c         2
-rw-r--r--  mm/page_alloc.c     3
-rw-r--r--  mm/percpu.c        40
-rw-r--r--  mm/swap.c          10
-rw-r--r--  mm/usercopy.c      19
-rw-r--r--  mm/vmalloc.c        2
10 files changed, 281 insertions, 163 deletions
diff --git a/mm/Kconfig b/mm/Kconfig
index 9b8fccb969dc..beb7a455915d 100644
--- a/mm/Kconfig
+++ b/mm/Kconfig
@@ -312,7 +312,6 @@ config NEED_BOUNCE_POOL
config NR_QUICK
int
depends on QUICKLIST
- default "2" if AVR32
default "1"
config VIRT_TO_BUS
diff --git a/mm/backing-dev.c b/mm/backing-dev.c
index c6f2a37028c2..f028a9a472fd 100644
--- a/mm/backing-dev.c
+++ b/mm/backing-dev.c
@@ -12,8 +12,6 @@
#include <linux/device.h>
#include <trace/events/writeback.h>
-static atomic_long_t bdi_seq = ATOMIC_LONG_INIT(0);
-
struct backing_dev_info noop_backing_dev_info = {
.name = "noop",
.capabilities = BDI_CAP_NO_ACCT_AND_WRITEBACK,
@@ -242,6 +240,8 @@ static __init int bdi_class_init(void)
}
postcore_initcall(bdi_class_init);
+static int bdi_init(struct backing_dev_info *bdi);
+
static int __init default_bdi_init(void)
{
int err;
@@ -294,6 +294,8 @@ static int wb_init(struct bdi_writeback *wb, struct backing_dev_info *bdi,
memset(wb, 0, sizeof(*wb));
+ if (wb != &bdi->wb)
+ bdi_get(bdi);
wb->bdi = bdi;
wb->last_old_flush = jiffies;
INIT_LIST_HEAD(&wb->b_dirty);
@@ -314,8 +316,10 @@ static int wb_init(struct bdi_writeback *wb, struct backing_dev_info *bdi,
wb->dirty_sleep = jiffies;
wb->congested = wb_congested_get_create(bdi, blkcg_id, gfp);
- if (!wb->congested)
- return -ENOMEM;
+ if (!wb->congested) {
+ err = -ENOMEM;
+ goto out_put_bdi;
+ }
err = fprop_local_init_percpu(&wb->completions, gfp);
if (err)
@@ -335,9 +339,14 @@ out_destroy_stat:
fprop_local_destroy_percpu(&wb->completions);
out_put_cong:
wb_congested_put(wb->congested);
+out_put_bdi:
+ if (wb != &bdi->wb)
+ bdi_put(bdi);
return err;
}
+static void cgwb_remove_from_bdi_list(struct bdi_writeback *wb);
+
/*
* Remove bdi from the global list and shutdown any threads we have running
*/
@@ -347,10 +356,18 @@ static void wb_shutdown(struct bdi_writeback *wb)
spin_lock_bh(&wb->work_lock);
if (!test_and_clear_bit(WB_registered, &wb->state)) {
spin_unlock_bh(&wb->work_lock);
+ /*
+ * Wait for wb shutdown to finish if someone else is just
+ * running wb_shutdown(). Otherwise we could proceed to wb /
+ * bdi destruction before wb_shutdown() is finished.
+ */
+ wait_on_bit(&wb->state, WB_shutting_down, TASK_UNINTERRUPTIBLE);
return;
}
+ set_bit(WB_shutting_down, &wb->state);
spin_unlock_bh(&wb->work_lock);
+ cgwb_remove_from_bdi_list(wb);
/*
* Drain work list and shutdown the delayed_work. !WB_registered
* tells wb_workfn() that @wb is dying and its work_list needs to
@@ -359,6 +376,12 @@ static void wb_shutdown(struct bdi_writeback *wb)
mod_delayed_work(bdi_wq, &wb->dwork, 0);
flush_delayed_work(&wb->dwork);
WARN_ON(!list_empty(&wb->work_list));
+ /*
+ * Make sure bit gets cleared after shutdown is finished. Matches with
+ * the barrier provided by test_and_clear_bit() above.
+ */
+ smp_wmb();
+ clear_bit(WB_shutting_down, &wb->state);
}
static void wb_exit(struct bdi_writeback *wb)
@@ -372,6 +395,8 @@ static void wb_exit(struct bdi_writeback *wb)
fprop_local_destroy_percpu(&wb->completions);
wb_congested_put(wb->congested);
+ if (wb != &wb->bdi->wb)
+ bdi_put(wb->bdi);
}
#ifdef CONFIG_CGROUP_WRITEBACK
@@ -381,11 +406,9 @@ static void wb_exit(struct bdi_writeback *wb)
/*
* cgwb_lock protects bdi->cgwb_tree, bdi->cgwb_congested_tree,
* blkcg->cgwb_list, and memcg->cgwb_list. bdi->cgwb_tree is also RCU
- * protected. cgwb_release_wait is used to wait for the completion of cgwb
- * releases from bdi destruction path.
+ * protected.
*/
static DEFINE_SPINLOCK(cgwb_lock);
-static DECLARE_WAIT_QUEUE_HEAD(cgwb_release_wait);
/**
* wb_congested_get_create - get or create a wb_congested
@@ -438,7 +461,7 @@ retry:
return NULL;
atomic_set(&new_congested->refcnt, 0);
- new_congested->bdi = bdi;
+ new_congested->__bdi = bdi;
new_congested->blkcg_id = blkcg_id;
goto retry;
@@ -466,10 +489,10 @@ void wb_congested_put(struct bdi_writeback_congested *congested)
}
/* bdi might already have been destroyed leaving @congested unlinked */
- if (congested->bdi) {
+ if (congested->__bdi) {
rb_erase(&congested->rb_node,
- &congested->bdi->cgwb_congested_tree);
- congested->bdi = NULL;
+ &congested->__bdi->cgwb_congested_tree);
+ congested->__bdi = NULL;
}
spin_unlock_irqrestore(&cgwb_lock, flags);
@@ -480,11 +503,6 @@ static void cgwb_release_workfn(struct work_struct *work)
{
struct bdi_writeback *wb = container_of(work, struct bdi_writeback,
release_work);
- struct backing_dev_info *bdi = wb->bdi;
-
- spin_lock_irq(&cgwb_lock);
- list_del_rcu(&wb->bdi_node);
- spin_unlock_irq(&cgwb_lock);
wb_shutdown(wb);
@@ -495,9 +513,6 @@ static void cgwb_release_workfn(struct work_struct *work)
percpu_ref_exit(&wb->refcnt);
wb_exit(wb);
kfree_rcu(wb, rcu);
-
- if (atomic_dec_and_test(&bdi->usage_cnt))
- wake_up_all(&cgwb_release_wait);
}
static void cgwb_release(struct percpu_ref *refcnt)
@@ -517,6 +532,13 @@ static void cgwb_kill(struct bdi_writeback *wb)
percpu_ref_kill(&wb->refcnt);
}
+static void cgwb_remove_from_bdi_list(struct bdi_writeback *wb)
+{
+ spin_lock_irq(&cgwb_lock);
+ list_del_rcu(&wb->bdi_node);
+ spin_unlock_irq(&cgwb_lock);
+}
+
static int cgwb_create(struct backing_dev_info *bdi,
struct cgroup_subsys_state *memcg_css, gfp_t gfp)
{
@@ -580,7 +602,6 @@ static int cgwb_create(struct backing_dev_info *bdi,
/* we might have raced another instance of this function */
ret = radix_tree_insert(&bdi->cgwb_tree, memcg_css->id, wb);
if (!ret) {
- atomic_inc(&bdi->usage_cnt);
list_add_tail_rcu(&wb->bdi_node, &bdi->wb_list);
list_add(&wb->memcg_node, memcg_cgwb_list);
list_add(&wb->blkcg_node, blkcg_cgwb_list);
@@ -670,7 +691,6 @@ static int cgwb_bdi_init(struct backing_dev_info *bdi)
INIT_RADIX_TREE(&bdi->cgwb_tree, GFP_ATOMIC);
bdi->cgwb_congested_tree = RB_ROOT;
- atomic_set(&bdi->usage_cnt, 1);
ret = wb_init(&bdi->wb, bdi, 1, GFP_KERNEL);
if (!ret) {
@@ -680,29 +700,26 @@ static int cgwb_bdi_init(struct backing_dev_info *bdi)
return ret;
}
-static void cgwb_bdi_destroy(struct backing_dev_info *bdi)
+static void cgwb_bdi_unregister(struct backing_dev_info *bdi)
{
struct radix_tree_iter iter;
void **slot;
+ struct bdi_writeback *wb;
WARN_ON(test_bit(WB_registered, &bdi->wb.state));
spin_lock_irq(&cgwb_lock);
radix_tree_for_each_slot(slot, &bdi->cgwb_tree, &iter, 0)
cgwb_kill(*slot);
- spin_unlock_irq(&cgwb_lock);
- /*
- * All cgwb's must be shutdown and released before returning. Drain
- * the usage counter to wait for all cgwb's ever created on @bdi.
- */
- atomic_dec(&bdi->usage_cnt);
- wait_event(cgwb_release_wait, !atomic_read(&bdi->usage_cnt));
- /*
- * Grab back our reference so that we hold it when @bdi gets
- * re-registered.
- */
- atomic_inc(&bdi->usage_cnt);
+ while (!list_empty(&bdi->wb_list)) {
+ wb = list_first_entry(&bdi->wb_list, struct bdi_writeback,
+ bdi_node);
+ spin_unlock_irq(&cgwb_lock);
+ wb_shutdown(wb);
+ spin_lock_irq(&cgwb_lock);
+ }
+ spin_unlock_irq(&cgwb_lock);
}
/**
@@ -752,11 +769,18 @@ static void cgwb_bdi_exit(struct backing_dev_info *bdi)
rb_entry(rbn, struct bdi_writeback_congested, rb_node);
rb_erase(rbn, &bdi->cgwb_congested_tree);
- congested->bdi = NULL; /* mark @congested unlinked */
+ congested->__bdi = NULL; /* mark @congested unlinked */
}
spin_unlock_irq(&cgwb_lock);
}
+static void cgwb_bdi_register(struct backing_dev_info *bdi)
+{
+ spin_lock_irq(&cgwb_lock);
+ list_add_tail_rcu(&bdi->wb.bdi_node, &bdi->wb_list);
+ spin_unlock_irq(&cgwb_lock);
+}
+
#else /* CONFIG_CGROUP_WRITEBACK */
static int cgwb_bdi_init(struct backing_dev_info *bdi)
@@ -777,16 +801,26 @@ static int cgwb_bdi_init(struct backing_dev_info *bdi)
return 0;
}
-static void cgwb_bdi_destroy(struct backing_dev_info *bdi) { }
+static void cgwb_bdi_unregister(struct backing_dev_info *bdi) { }
static void cgwb_bdi_exit(struct backing_dev_info *bdi)
{
wb_congested_put(bdi->wb_congested);
}
+static void cgwb_bdi_register(struct backing_dev_info *bdi)
+{
+ list_add_tail_rcu(&bdi->wb.bdi_node, &bdi->wb_list);
+}
+
+static void cgwb_remove_from_bdi_list(struct bdi_writeback *wb)
+{
+ list_del_rcu(&wb->bdi_node);
+}
+
#endif /* CONFIG_CGROUP_WRITEBACK */
-int bdi_init(struct backing_dev_info *bdi)
+static int bdi_init(struct backing_dev_info *bdi)
{
int ret;
@@ -802,11 +836,8 @@ int bdi_init(struct backing_dev_info *bdi)
ret = cgwb_bdi_init(bdi);
- list_add_tail_rcu(&bdi->wb.bdi_node, &bdi->wb_list);
-
return ret;
}
-EXPORT_SYMBOL(bdi_init);
struct backing_dev_info *bdi_alloc_node(gfp_t gfp_mask, int node_id)
{
@@ -823,22 +854,20 @@ struct backing_dev_info *bdi_alloc_node(gfp_t gfp_mask, int node_id)
}
return bdi;
}
+EXPORT_SYMBOL(bdi_alloc_node);
-int bdi_register(struct backing_dev_info *bdi, struct device *parent,
- const char *fmt, ...)
+int bdi_register_va(struct backing_dev_info *bdi, const char *fmt, va_list args)
{
- va_list args;
struct device *dev;
if (bdi->dev) /* The driver needs to use separate queues per device */
return 0;
- va_start(args, fmt);
- dev = device_create_vargs(bdi_class, parent, MKDEV(0, 0), bdi, fmt, args);
- va_end(args);
+ dev = device_create_vargs(bdi_class, NULL, MKDEV(0, 0), bdi, fmt, args);
if (IS_ERR(dev))
return PTR_ERR(dev);
+ cgwb_bdi_register(bdi);
bdi->dev = dev;
bdi_debug_register(bdi, dev_name(dev));
@@ -851,20 +880,25 @@ int bdi_register(struct backing_dev_info *bdi, struct device *parent,
trace_writeback_bdi_register(bdi);
return 0;
}
-EXPORT_SYMBOL(bdi_register);
+EXPORT_SYMBOL(bdi_register_va);
-int bdi_register_dev(struct backing_dev_info *bdi, dev_t dev)
+int bdi_register(struct backing_dev_info *bdi, const char *fmt, ...)
{
- return bdi_register(bdi, NULL, "%u:%u", MAJOR(dev), MINOR(dev));
+ va_list args;
+ int ret;
+
+ va_start(args, fmt);
+ ret = bdi_register_va(bdi, fmt, args);
+ va_end(args);
+ return ret;
}
-EXPORT_SYMBOL(bdi_register_dev);
+EXPORT_SYMBOL(bdi_register);
int bdi_register_owner(struct backing_dev_info *bdi, struct device *owner)
{
int rc;
- rc = bdi_register(bdi, NULL, "%u:%u", MAJOR(owner->devt),
- MINOR(owner->devt));
+ rc = bdi_register(bdi, "%u:%u", MAJOR(owner->devt), MINOR(owner->devt));
if (rc)
return rc;
/* Leaking owner reference... */
@@ -892,7 +926,7 @@ void bdi_unregister(struct backing_dev_info *bdi)
/* make sure nobody finds us on the bdi_list anymore */
bdi_remove_from_list(bdi);
wb_shutdown(&bdi->wb);
- cgwb_bdi_destroy(bdi);
+ cgwb_bdi_unregister(bdi);
if (bdi->dev) {
bdi_debug_unregister(bdi);
@@ -906,19 +940,16 @@ void bdi_unregister(struct backing_dev_info *bdi)
}
}
-static void bdi_exit(struct backing_dev_info *bdi)
-{
- WARN_ON_ONCE(bdi->dev);
- wb_exit(&bdi->wb);
- cgwb_bdi_exit(bdi);
-}
-
static void release_bdi(struct kref *ref)
{
struct backing_dev_info *bdi =
container_of(ref, struct backing_dev_info, refcnt);
- bdi_exit(bdi);
+ if (test_bit(WB_registered, &bdi->wb.state))
+ bdi_unregister(bdi);
+ WARN_ON_ONCE(bdi->dev);
+ wb_exit(&bdi->wb);
+ cgwb_bdi_exit(bdi);
kfree(bdi);
}
@@ -926,38 +957,7 @@ void bdi_put(struct backing_dev_info *bdi)
{
kref_put(&bdi->refcnt, release_bdi);
}
-
-void bdi_destroy(struct backing_dev_info *bdi)
-{
- bdi_unregister(bdi);
- bdi_exit(bdi);
-}
-EXPORT_SYMBOL(bdi_destroy);
-
-/*
- * For use from filesystems to quickly init and register a bdi associated
- * with dirty writeback
- */
-int bdi_setup_and_register(struct backing_dev_info *bdi, char *name)
-{
- int err;
-
- bdi->name = name;
- bdi->capabilities = 0;
- err = bdi_init(bdi);
- if (err)
- return err;
-
- err = bdi_register(bdi, NULL, "%.28s-%ld", name,
- atomic_long_inc_return(&bdi_seq));
- if (err) {
- bdi_destroy(bdi);
- return err;
- }
-
- return 0;
-}
-EXPORT_SYMBOL(bdi_setup_and_register);
+EXPORT_SYMBOL(bdi_put);
static wait_queue_head_t congestion_wqh[2] = {
__WAIT_QUEUE_HEAD_INITIALIZER(congestion_wqh[0]),
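
Note on the backing-dev changes above: bdi_register() loses its parent-device argument, bdi_setup_and_register()/bdi_destroy() go away, and lifetime is handled purely by the bdi reference count. A minimal sketch of how a driver might use the reworked interface, assuming hypothetical example_* names; the NUMA_NO_NODE choice and the error handling are illustrative, not part of the patch:

/*
 * Illustrative sketch, not part of the patch: allocate a refcounted bdi,
 * register it with a printf-style name, and drop the reference on teardown.
 */
#include <linux/backing-dev.h>

static struct backing_dev_info *example_bdi;	/* hypothetical driver state */

static int example_setup_bdi(int major, int minor)
{
	int err;

	example_bdi = bdi_alloc_node(GFP_KERNEL, NUMA_NO_NODE);
	if (!example_bdi)
		return -ENOMEM;

	/* bdi_register() no longer takes a parent device argument */
	err = bdi_register(example_bdi, "%u:%u", major, minor);
	if (err) {
		bdi_put(example_bdi);	/* drops the reference taken at allocation */
		return err;
	}
	return 0;
}

static void example_teardown_bdi(void)
{
	/*
	 * The final bdi_put() ends up in release_bdi(), which unregisters
	 * the bdi if it is still registered and then frees it; no explicit
	 * bdi_destroy() call is needed any more.
	 */
	bdi_put(example_bdi);
}
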
diff --git a/mm/filemap.c b/mm/filemap.c
index 1694623a6289..dc59c5f35b37 100644
--- a/mm/filemap.c
+++ b/mm/filemap.c
@@ -519,7 +519,7 @@ EXPORT_SYMBOL(filemap_write_and_wait);
*
* Write out and wait upon file offsets lstart->lend, inclusive.
*
- * Note that `lend' is inclusive (describes the last byte to be written) so
+ * Note that @lend is inclusive (describes the last byte to be written) so
* that this function can be used to write to the very end-of-file (end = -1).
*/
int filemap_write_and_wait_range(struct address_space *mapping,
@@ -1277,12 +1277,14 @@ EXPORT_SYMBOL(find_lock_entry);
*
* PCG flags modify how the page is returned.
*
- * FGP_ACCESSED: the page will be marked accessed
- * FGP_LOCK: Page is return locked
- * FGP_CREAT: If page is not present then a new page is allocated using
- * @gfp_mask and added to the page cache and the VM's LRU
- * list. The page is returned locked and with an increased
- * refcount. Otherwise, %NULL is returned.
+ * @fgp_flags can be:
+ *
+ * - FGP_ACCESSED: the page will be marked accessed
+ * - FGP_LOCK: Page is return locked
+ * - FGP_CREAT: If page is not present then a new page is allocated using
+ * @gfp_mask and added to the page cache and the VM's LRU
+ * list. The page is returned locked and with an increased
+ * refcount. Otherwise, NULL is returned.
*
* If FGP_LOCK or FGP_CREAT are specified then the function may sleep even
* if the GFP flags specified for FGP_CREAT are atomic.
@@ -2033,7 +2035,6 @@ generic_file_read_iter(struct kiocb *iocb, struct iov_iter *iter)
if (iocb->ki_flags & IOCB_DIRECT) {
struct address_space *mapping = file->f_mapping;
struct inode *inode = mapping->host;
- struct iov_iter data = *iter;
loff_t size;
size = i_size_read(inode);
@@ -2044,11 +2045,12 @@ generic_file_read_iter(struct kiocb *iocb, struct iov_iter *iter)
file_accessed(file);
- retval = mapping->a_ops->direct_IO(iocb, &data);
+ retval = mapping->a_ops->direct_IO(iocb, iter);
if (retval >= 0) {
iocb->ki_pos += retval;
- iov_iter_advance(iter, retval);
+ count -= retval;
}
+		iov_iter_revert(iter, count - iov_iter_count(iter));
/*
* Btrfs can have a short DIO read if we encounter
@@ -2059,7 +2061,7 @@ generic_file_read_iter(struct kiocb *iocb, struct iov_iter *iter)
* the rest of the read. Buffered reads will not work for
* DAX files, so don't bother trying.
*/
- if (retval < 0 || !iov_iter_count(iter) || iocb->ki_pos >= size ||
+ if (retval < 0 || !count || iocb->ki_pos >= size ||
IS_DAX(inode))
goto out;
}
@@ -2704,7 +2706,6 @@ generic_file_direct_write(struct kiocb *iocb, struct iov_iter *from)
ssize_t written;
size_t write_len;
pgoff_t end;
- struct iov_iter data;
write_len = iov_iter_count(from);
end = (pos + write_len - 1) >> PAGE_SHIFT;
@@ -2733,8 +2734,7 @@ generic_file_direct_write(struct kiocb *iocb, struct iov_iter *from)
}
}
- data = *from;
- written = mapping->a_ops->direct_IO(iocb, &data);
+ written = mapping->a_ops->direct_IO(iocb, from);
/*
* Finally, try again to invalidate clean pages which might have been
@@ -2751,13 +2751,14 @@ generic_file_direct_write(struct kiocb *iocb, struct iov_iter *from)
if (written > 0) {
pos += written;
- iov_iter_advance(from, written);
+ write_len -= written;
if (pos > i_size_read(inode) && !S_ISBLK(inode->i_mode)) {
i_size_write(inode, pos);
mark_inode_dirty(inode);
}
iocb->ki_pos = pos;
}
+ iov_iter_revert(from, write_len - iov_iter_count(from));
out:
return written;
}
@@ -3001,7 +3002,7 @@ EXPORT_SYMBOL(generic_file_write_iter);
* @gfp_mask: memory allocation flags (and I/O mode)
*
* The address_space is to try to release any data against the page
- * (presumably at page->private). If the release was successful, return `1'.
+ * (presumably at page->private). If the release was successful, return '1'.
* Otherwise return zero.
*
* This may also be called if PG_fscache is set on a page, indicating that the
diff --git a/mm/gup.c b/mm/gup.c
index 04aa405350dc..527ec2c6cca3 100644
--- a/mm/gup.c
+++ b/mm/gup.c
@@ -1189,34 +1189,57 @@ struct page *get_dump_page(unsigned long addr)
*/
#ifdef CONFIG_HAVE_GENERIC_RCU_GUP
+#ifndef gup_get_pte
+/*
+ * We assume that the PTE can be read atomically. If this is not the case for
+ * your architecture, please provide the helper.
+ */
+static inline pte_t gup_get_pte(pte_t *ptep)
+{
+ return READ_ONCE(*ptep);
+}
+#endif
+
+static void undo_dev_pagemap(int *nr, int nr_start, struct page **pages)
+{
+ while ((*nr) - nr_start) {
+ struct page *page = pages[--(*nr)];
+
+ ClearPageReferenced(page);
+ put_page(page);
+ }
+}
+
#ifdef __HAVE_ARCH_PTE_SPECIAL
static int gup_pte_range(pmd_t pmd, unsigned long addr, unsigned long end,
int write, struct page **pages, int *nr)
{
+ struct dev_pagemap *pgmap = NULL;
+ int nr_start = *nr, ret = 0;
pte_t *ptep, *ptem;
- int ret = 0;
ptem = ptep = pte_offset_map(&pmd, addr);
do {
- /*
- * In the line below we are assuming that the pte can be read
- * atomically. If this is not the case for your architecture,
- * please wrap this in a helper function!
- *
- * for an example see gup_get_pte in arch/x86/mm/gup.c
- */
- pte_t pte = READ_ONCE(*ptep);
+ pte_t pte = gup_get_pte(ptep);
struct page *head, *page;
/*
* Similar to the PMD case below, NUMA hinting must take slow
* path using the pte_protnone check.
*/
- if (!pte_present(pte) || pte_special(pte) ||
- pte_protnone(pte) || (write && !pte_write(pte)))
+ if (pte_protnone(pte))
goto pte_unmap;
- if (!arch_pte_access_permitted(pte, write))
+ if (!pte_access_permitted(pte, write))
+ goto pte_unmap;
+
+ if (pte_devmap(pte)) {
+ pgmap = get_dev_pagemap(pte_pfn(pte), pgmap);
+ if (unlikely(!pgmap)) {
+ undo_dev_pagemap(nr, nr_start, pages);
+ goto pte_unmap;
+ }
+ } else if (pte_special(pte))
goto pte_unmap;
VM_BUG_ON(!pfn_valid(pte_pfn(pte)));
@@ -1232,6 +1255,9 @@ static int gup_pte_range(pmd_t pmd, unsigned long addr, unsigned long end,
}
VM_BUG_ON_PAGE(compound_head(page) != head, page);
+
+ put_dev_pagemap(pgmap);
+ SetPageReferenced(page);
pages[*nr] = page;
(*nr)++;
@@ -1261,15 +1287,76 @@ static int gup_pte_range(pmd_t pmd, unsigned long addr, unsigned long end,
}
#endif /* __HAVE_ARCH_PTE_SPECIAL */
+#ifdef __HAVE_ARCH_PTE_DEVMAP
+static int __gup_device_huge(unsigned long pfn, unsigned long addr,
+ unsigned long end, struct page **pages, int *nr)
+{
+ int nr_start = *nr;
+ struct dev_pagemap *pgmap = NULL;
+
+ do {
+ struct page *page = pfn_to_page(pfn);
+
+ pgmap = get_dev_pagemap(pfn, pgmap);
+ if (unlikely(!pgmap)) {
+ undo_dev_pagemap(nr, nr_start, pages);
+ return 0;
+ }
+ SetPageReferenced(page);
+ pages[*nr] = page;
+ get_page(page);
+ put_dev_pagemap(pgmap);
+ (*nr)++;
+ pfn++;
+ } while (addr += PAGE_SIZE, addr != end);
+ return 1;
+}
+
+static int __gup_device_huge_pmd(pmd_t pmd, unsigned long addr,
+ unsigned long end, struct page **pages, int *nr)
+{
+ unsigned long fault_pfn;
+
+ fault_pfn = pmd_pfn(pmd) + ((addr & ~PMD_MASK) >> PAGE_SHIFT);
+ return __gup_device_huge(fault_pfn, addr, end, pages, nr);
+}
+
+static int __gup_device_huge_pud(pud_t pud, unsigned long addr,
+ unsigned long end, struct page **pages, int *nr)
+{
+ unsigned long fault_pfn;
+
+ fault_pfn = pud_pfn(pud) + ((addr & ~PUD_MASK) >> PAGE_SHIFT);
+ return __gup_device_huge(fault_pfn, addr, end, pages, nr);
+}
+#else
+static int __gup_device_huge_pmd(pmd_t pmd, unsigned long addr,
+ unsigned long end, struct page **pages, int *nr)
+{
+ BUILD_BUG();
+ return 0;
+}
+
+static int __gup_device_huge_pud(pud_t pud, unsigned long addr,
+ unsigned long end, struct page **pages, int *nr)
+{
+ BUILD_BUG();
+ return 0;
+}
+#endif
+
static int gup_huge_pmd(pmd_t orig, pmd_t *pmdp, unsigned long addr,
unsigned long end, int write, struct page **pages, int *nr)
{
struct page *head, *page;
int refs;
- if (write && !pmd_write(orig))
+ if (!pmd_access_permitted(orig, write))
return 0;
+ if (pmd_devmap(orig))
+ return __gup_device_huge_pmd(orig, addr, end, pages, nr);
+
refs = 0;
head = pmd_page(orig);
page = head + ((addr & ~PMD_MASK) >> PAGE_SHIFT);
@@ -1293,6 +1380,7 @@ static int gup_huge_pmd(pmd_t orig, pmd_t *pmdp, unsigned long addr,
return 0;
}
+ SetPageReferenced(head);
return 1;
}
@@ -1302,9 +1390,12 @@ static int gup_huge_pud(pud_t orig, pud_t *pudp, unsigned long addr,
struct page *head, *page;
int refs;
- if (write && !pud_write(orig))
+ if (!pud_access_permitted(orig, write))
return 0;
+ if (pud_devmap(orig))
+ return __gup_device_huge_pud(orig, addr, end, pages, nr);
+
refs = 0;
head = pud_page(orig);
page = head + ((addr & ~PUD_MASK) >> PAGE_SHIFT);
@@ -1328,6 +1419,7 @@ static int gup_huge_pud(pud_t orig, pud_t *pudp, unsigned long addr,
return 0;
}
+ SetPageReferenced(head);
return 1;
}
@@ -1338,9 +1430,10 @@ static int gup_huge_pgd(pgd_t orig, pgd_t *pgdp, unsigned long addr,
int refs;
struct page *head, *page;
- if (write && !pgd_write(orig))
+ if (!pgd_access_permitted(orig, write))
return 0;
+ BUILD_BUG_ON(pgd_devmap(orig));
refs = 0;
head = pgd_page(orig);
page = head + ((addr & ~PGDIR_MASK) >> PAGE_SHIFT);
@@ -1364,6 +1457,7 @@ static int gup_huge_pgd(pgd_t orig, pgd_t *pgdp, unsigned long addr,
return 0;
}
+ SetPageReferenced(head);
return 1;
}
@@ -1520,6 +1614,21 @@ int __get_user_pages_fast(unsigned long start, int nr_pages, int write,
return nr;
}
+#ifndef gup_fast_permitted
+/*
+ * Check if it's allowed to use __get_user_pages_fast() for the range, or
+ * we need to fall back to the slow version:
+ */
+bool gup_fast_permitted(unsigned long start, int nr_pages, int write)
+{
+ unsigned long len, end;
+
+ len = (unsigned long) nr_pages << PAGE_SHIFT;
+ end = start + len;
+ return end >= start;
+}
+#endif
+
/**
* get_user_pages_fast() - pin user pages in memory
* @start: starting user address
@@ -1539,11 +1648,14 @@ int __get_user_pages_fast(unsigned long start, int nr_pages, int write,
int get_user_pages_fast(unsigned long start, int nr_pages, int write,
struct page **pages)
{
- int nr, ret;
+ int nr = 0, ret = 0;
start &= PAGE_MASK;
- nr = __get_user_pages_fast(start, nr_pages, write, pages);
- ret = nr;
+
+ if (gup_fast_permitted(start, nr_pages, write)) {
+ nr = __get_user_pages_fast(start, nr_pages, write, pages);
+ ret = nr;
+ }
if (nr < nr_pages) {
/* Try to get the remaining pages with get_user_pages */
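
The generic fast-GUP code above grows two arch override points: gup_get_pte() for architectures whose PTEs cannot be read atomically, and gup_fast_permitted() for bounding the range the lockless walk may touch. A rough sketch of what an architecture might provide in its asm/pgtable.h, modeled on the x86 PAE helper the removed comment pointed to; the pte_low/pte_high field names and the TASK_SIZE_MAX bound are assumptions for illustration:

/*
 * Sketch of an arch override; the generic fallbacks above are only used
 * when the architecture does not define these macros.
 */
static inline pte_t gup_get_pte(pte_t *ptep)
{
	pte_t pte;

	/* Read the two halves and retry if the low word changed in between */
	do {
		pte.pte_low = ptep->pte_low;
		smp_rmb();
		pte.pte_high = ptep->pte_high;
		smp_rmb();
	} while (unlikely(pte.pte_low != ptep->pte_low));

	return pte;
}
#define gup_get_pte gup_get_pte

static inline bool gup_fast_permitted(unsigned long start, int nr_pages,
				      int write)
{
	unsigned long len = (unsigned long)nr_pages << PAGE_SHIFT;
	unsigned long end = start + len;

	/* Reject wrap-around and anything outside the user address space */
	return end >= start && end <= TASK_SIZE_MAX;
}
#define gup_fast_permitted gup_fast_permitted
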
diff --git a/mm/memory.c b/mm/memory.c
index 235ba51b2fbf..6ff5d729ded0 100644
--- a/mm/memory.c
+++ b/mm/memory.c
@@ -4298,7 +4298,7 @@ void __might_fault(const char *file, int line)
* get paged out, therefore we'll never actually fault, and the
* below annotations will generate false positives.
*/
- if (segment_eq(get_fs(), KERNEL_DS))
+ if (uaccess_kernel())
return;
if (pagefault_disabled())
return;
diff --git a/mm/page_alloc.c b/mm/page_alloc.c
index 07efbc3a8656..bd01501efab9 100644
--- a/mm/page_alloc.c
+++ b/mm/page_alloc.c
@@ -4247,7 +4247,8 @@ EXPORT_SYMBOL(free_pages_exact);
* nr_free_zone_pages() counts the number of counts pages which are beyond the
* high watermark within all zones at or below a given zone index. For each
* zone, the number of pages is calculated as:
- * managed_pages - high_pages
+ *
+ * nr_free_zone_pages = managed_pages - high_pages
*/
static unsigned long nr_free_zone_pages(int offset)
{
diff --git a/mm/percpu.c b/mm/percpu.c
index 60a6488e9e6d..e0aa8ae7bde7 100644
--- a/mm/percpu.c
+++ b/mm/percpu.c
@@ -1284,18 +1284,7 @@ void free_percpu(void __percpu *ptr)
}
EXPORT_SYMBOL_GPL(free_percpu);
-/**
- * is_kernel_percpu_address - test whether address is from static percpu area
- * @addr: address to test
- *
- * Test whether @addr belongs to in-kernel static percpu area. Module
- * static percpu areas are not considered. For those, use
- * is_module_percpu_address().
- *
- * RETURNS:
- * %true if @addr is from in-kernel static percpu area, %false otherwise.
- */
-bool is_kernel_percpu_address(unsigned long addr)
+bool __is_kernel_percpu_address(unsigned long addr, unsigned long *can_addr)
{
#ifdef CONFIG_SMP
const size_t static_size = __per_cpu_end - __per_cpu_start;
@@ -1304,16 +1293,39 @@ bool is_kernel_percpu_address(unsigned long addr)
for_each_possible_cpu(cpu) {
void *start = per_cpu_ptr(base, cpu);
+ void *va = (void *)addr;
- if ((void *)addr >= start && (void *)addr < start + static_size)
+ if (va >= start && va < start + static_size) {
+ if (can_addr) {
+ *can_addr = (unsigned long) (va - start);
+ *can_addr += (unsigned long)
+ per_cpu_ptr(base, get_boot_cpu_id());
+ }
return true;
- }
+ }
+ }
#endif
/* on UP, can't distinguish from other static vars, always false */
return false;
}
/**
+ * is_kernel_percpu_address - test whether address is from static percpu area
+ * @addr: address to test
+ *
+ * Test whether @addr belongs to in-kernel static percpu area. Module
+ * static percpu areas are not considered. For those, use
+ * is_module_percpu_address().
+ *
+ * RETURNS:
+ * %true if @addr is from in-kernel static percpu area, %false otherwise.
+ */
+bool is_kernel_percpu_address(unsigned long addr)
+{
+ return __is_kernel_percpu_address(addr, NULL);
+}
+
+/**
* per_cpu_ptr_to_phys - convert translated percpu address to physical address
* @addr: the address to be converted to physical address
*
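
Beyond the kernel-doc split above, the point of the new __is_kernel_percpu_address() is the @can_addr output: it reports the boot-CPU alias of a static per-CPU address, so a caller such as lockdep can treat every CPU's copy of the same variable as one object. A minimal usage sketch; the helper name is hypothetical:

static unsigned long canonicalize_percpu_addr(unsigned long addr)
{
	unsigned long can_addr;

	/* Static in-kernel per-CPU area: return the boot-CPU based alias */
	if (__is_kernel_percpu_address(addr, &can_addr))
		return can_addr;

	/* Not a static per-CPU address (or a UP build): use it as-is */
	return addr;
}
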
diff --git a/mm/swap.c b/mm/swap.c
index 5dabf444d724..d8d9ee9e311a 100644
--- a/mm/swap.c
+++ b/mm/swap.c
@@ -97,6 +97,16 @@ static void __put_compound_page(struct page *page)
void __put_page(struct page *page)
{
+ if (is_zone_device_page(page)) {
+ put_dev_pagemap(page->pgmap);
+
+ /*
+ * The page belongs to the device that created pgmap. Do
+ * not return it to page allocator.
+ */
+ return;
+ }
+
if (unlikely(PageCompound(page)))
__put_compound_page(page);
else
diff --git a/mm/usercopy.c b/mm/usercopy.c
index d155e12563b1..a9852b24715d 100644
--- a/mm/usercopy.c
+++ b/mm/usercopy.c
@@ -19,15 +19,9 @@
#include <linux/sched.h>
#include <linux/sched/task.h>
#include <linux/sched/task_stack.h>
+#include <linux/thread_info.h>
#include <asm/sections.h>
-enum {
- BAD_STACK = -1,
- NOT_STACK = 0,
- GOOD_FRAME,
- GOOD_STACK,
-};
-
/*
* Checks if a given pointer and length is contained by the current
* stack frame (if possible).
@@ -206,17 +200,6 @@ static inline const char *check_heap_object(const void *ptr, unsigned long n,
{
struct page *page;
- /*
- * Some architectures (arm64) return true for virt_addr_valid() on
- * vmalloced addresses. Work around this by checking for vmalloc
- * first.
- *
- * We also need to check for module addresses explicitly since we
- * may copy static data from modules to userspace
- */
- if (is_vmalloc_or_module_addr(ptr))
- return NULL;
-
if (!virt_addr_valid(ptr))
return NULL;
diff --git a/mm/vmalloc.c b/mm/vmalloc.c
index 0b057628a7ba..b52aeed3f58e 100644
--- a/mm/vmalloc.c
+++ b/mm/vmalloc.c
@@ -1579,7 +1579,7 @@ void vfree_atomic(const void *addr)
* have CONFIG_ARCH_HAVE_NMI_SAFE_CMPXCHG, but making the calling
* conventions for vfree() arch-depenedent would be a really bad idea)
*
- * NOTE: assumes that the object at *addr has a size >= sizeof(llist_node)
+ * NOTE: assumes that the object at @addr has a size >= sizeof(llist_node)
*/
void vfree(const void *addr)
{