diff options
Diffstat (limited to 'mm/migrate.c')
-rw-r--r-- | mm/migrate.c | 412 |
1 files changed, 209 insertions, 203 deletions
diff --git a/mm/migrate.c b/mm/migrate.c index a42858d8e00b..b1092876e537 100644 --- a/mm/migrate.c +++ b/mm/migrate.c @@ -38,6 +38,7 @@ #include <linux/hugetlb.h> #include <linux/hugetlb_cgroup.h> #include <linux/gfp.h> +#include <linux/pagewalk.h> #include <linux/pfn_t.h> #include <linux/memremap.h> #include <linux/userfaultfd_k.h> @@ -47,6 +48,7 @@ #include <linux/page_owner.h> #include <linux/sched/mm.h> #include <linux/ptrace.h> +#include <linux/oom.h> #include <asm/tlbflush.h> @@ -459,7 +461,7 @@ int migrate_page_move_mapping(struct address_space *mapping, for (i = 1; i < HPAGE_PMD_NR; i++) { xas_next(&xas); - xas_store(&xas, newpage + i); + xas_store(&xas, newpage); } } @@ -985,7 +987,7 @@ static int move_to_new_page(struct page *newpage, struct page *page, } /* - * Anonymous and movable page->mapping will be cleard by + * Anonymous and movable page->mapping will be cleared by * free_pages_prepare so don't reset it here for keeping * the type to work PageAnon, for example. */ @@ -1167,15 +1169,11 @@ static ICE_noinline int unmap_and_move(new_page_t get_new_page, enum migrate_reason reason) { int rc = MIGRATEPAGE_SUCCESS; - struct page *newpage; + struct page *newpage = NULL; if (!thp_migration_supported() && PageTransHuge(page)) return -ENOMEM; - newpage = get_new_page(page, private); - if (!newpage) - return -ENOMEM; - if (page_count(page) == 1) { /* page was freed from under us. So we are done. */ ClearPageActive(page); @@ -1186,13 +1184,13 @@ static ICE_noinline int unmap_and_move(new_page_t get_new_page, __ClearPageIsolated(page); unlock_page(page); } - if (put_new_page) - put_new_page(newpage, private); - else - put_page(newpage); goto out; } + newpage = get_new_page(page, private); + if (!newpage) + return -ENOMEM; + rc = __unmap_and_move(page, newpage, force, mode); if (rc == MIGRATEPAGE_SUCCESS) set_page_owner_migrate_reason(newpage, reason); @@ -1202,8 +1200,7 @@ out: /* * A page that has been migrated has all references * removed and will be freed. A page that has not been - * migrated will have kepts its references and be - * restored. + * migrated will have kept its references and be restored. */ list_del(&page->lru); @@ -1515,9 +1512,11 @@ static int do_move_pages_to_node(struct mm_struct *mm, /* * Resolves the given address to a struct page, isolates it from the LRU and * puts it to the given pagelist. - * Returns -errno if the page cannot be found/isolated or 0 when it has been - * queued or the page doesn't need to be migrated because it is already on - * the target node + * Returns: + * errno - if the page cannot be found/isolated + * 0 - when it doesn't have to be migrated because it is already on the + * target node + * 1 - when it has been queued */ static int add_page_for_migration(struct mm_struct *mm, unsigned long addr, int node, struct list_head *pagelist, bool migrate_all) @@ -1556,7 +1555,7 @@ static int add_page_for_migration(struct mm_struct *mm, unsigned long addr, if (PageHuge(page)) { if (PageHead(page)) { isolate_huge_page(page, pagelist); - err = 0; + err = 1; } } else { struct page *head; @@ -1566,7 +1565,7 @@ static int add_page_for_migration(struct mm_struct *mm, unsigned long addr, if (err) goto out_putpage; - err = 0; + err = 1; list_add_tail(&head->lru, pagelist); mod_node_page_state(page_pgdat(head), NR_ISOLATED_ANON + page_is_file_cache(head), @@ -1611,7 +1610,7 @@ static int do_pages_move(struct mm_struct *mm, nodemask_t task_nodes, goto out_flush; if (get_user(node, nodes + i)) goto out_flush; - addr = (unsigned long)p; + addr = (unsigned long)untagged_addr(p); err = -ENODEV; if (node < 0 || node >= MAX_NUMNODES) @@ -1628,8 +1627,19 @@ static int do_pages_move(struct mm_struct *mm, nodemask_t task_nodes, start = i; } else if (node != current_node) { err = do_move_pages_to_node(mm, &pagelist, current_node); - if (err) + if (err) { + /* + * Positive err means the number of failed + * pages to migrate. Since we are going to + * abort and return the number of non-migrated + * pages, so need to incude the rest of the + * nr_pages that have not been attempted as + * well. + */ + if (err > 0) + err += nr_pages - i - 1; goto out; + } err = store_status(status, start, current_node, i - start); if (err) goto out; @@ -1643,16 +1653,28 @@ static int do_pages_move(struct mm_struct *mm, nodemask_t task_nodes, */ err = add_page_for_migration(mm, addr, current_node, &pagelist, flags & MPOL_MF_MOVE_ALL); - if (!err) + + if (!err) { + /* The page is already on the target node */ + err = store_status(status, i, current_node, 1); + if (err) + goto out_flush; continue; + } else if (err > 0) { + /* The page is successfully queued for migration */ + continue; + } err = store_status(status, i, err, 1); if (err) goto out_flush; err = do_move_pages_to_node(mm, &pagelist, current_node); - if (err) + if (err) { + if (err > 0) + err += nr_pages - i - 1; goto out; + } if (i > start) { err = store_status(status, start, current_node, i - start); if (err) @@ -1666,9 +1688,16 @@ out_flush: /* Make sure we do not overwrite the existing error */ err1 = do_move_pages_to_node(mm, &pagelist, current_node); + /* + * Don't have to report non-attempted pages here since: + * - If the above loop is done gracefully all pages have been + * attempted. + * - If the above loop is aborted it means a fatal error + * happened, should return ret. + */ if (!err1) err1 = store_status(status, start, current_node, i - start); - if (!err) + if (err >= 0) err = err1; out: return err; @@ -1862,7 +1891,7 @@ static bool migrate_balanced_pgdat(struct pglist_data *pgdat, if (!zone_watermark_ok(zone, 0, high_wmark_pages(zone) + nr_migrate_pages, - 0, 0)) + ZONE_MOVABLE, 0)) continue; return true; } @@ -1891,7 +1920,7 @@ static int numamigrate_isolate_page(pg_data_t *pgdat, struct page *page) VM_BUG_ON_PAGE(compound_order(page) && !PageTransHuge(page), page); /* Avoid migrating to a node that is nearly full */ - if (!migrate_balanced_pgdat(pgdat, 1UL << compound_order(page))) + if (!migrate_balanced_pgdat(pgdat, compound_nr(page))) return 0; if (isolate_lru_page(page)) @@ -2119,25 +2148,16 @@ out_unlock: #endif /* CONFIG_NUMA */ -#if defined(CONFIG_MIGRATE_VMA_HELPER) -struct migrate_vma { - struct vm_area_struct *vma; - unsigned long *dst; - unsigned long *src; - unsigned long cpages; - unsigned long npages; - unsigned long start; - unsigned long end; -}; - +#ifdef CONFIG_DEVICE_PRIVATE static int migrate_vma_collect_hole(unsigned long start, unsigned long end, + __always_unused int depth, struct mm_walk *walk) { struct migrate_vma *migrate = walk->private; unsigned long addr; - for (addr = start & PAGE_MASK; addr < end; addr += PAGE_SIZE) { + for (addr = start; addr < end; addr += PAGE_SIZE) { migrate->src[migrate->npages] = MIGRATE_PFN_MIGRATE; migrate->dst[migrate->npages] = 0; migrate->npages++; @@ -2154,7 +2174,7 @@ static int migrate_vma_collect_skip(unsigned long start, struct migrate_vma *migrate = walk->private; unsigned long addr; - for (addr = start & PAGE_MASK; addr < end; addr += PAGE_SIZE) { + for (addr = start; addr < end; addr += PAGE_SIZE) { migrate->dst[migrate->npages] = 0; migrate->src[migrate->npages++] = 0; } @@ -2176,7 +2196,7 @@ static int migrate_vma_collect_pmd(pmd_t *pmdp, again: if (pmd_none(*pmdp)) - return migrate_vma_collect_hole(start, end, walk); + return migrate_vma_collect_hole(start, end, -1, walk); if (pmd_trans_huge(*pmdp)) { struct page *page; @@ -2209,7 +2229,7 @@ again: return migrate_vma_collect_skip(start, end, walk); if (pmd_none(*pmdp)) - return migrate_vma_collect_hole(start, end, + return migrate_vma_collect_hole(start, end, -1, walk); } } @@ -2227,17 +2247,15 @@ again: pte_t pte; pte = *ptep; - pfn = pte_pfn(pte); if (pte_none(pte)) { mpfn = MIGRATE_PFN_MIGRATE; migrate->cpages++; - pfn = 0; goto next; } if (!pte_present(pte)) { - mpfn = pfn = 0; + mpfn = 0; /* * Only care about unaddressable device page special @@ -2249,15 +2267,15 @@ again: goto next; page = device_private_entry_to_page(entry); - mpfn = migrate_pfn(page_to_pfn(page))| - MIGRATE_PFN_DEVICE | MIGRATE_PFN_MIGRATE; + mpfn = migrate_pfn(page_to_pfn(page)) | + MIGRATE_PFN_MIGRATE; if (is_write_device_private_entry(entry)) mpfn |= MIGRATE_PFN_WRITE; } else { + pfn = pte_pfn(pte); if (is_zero_pfn(pfn)) { mpfn = MIGRATE_PFN_MIGRATE; migrate->cpages++; - pfn = 0; goto next; } page = vm_normal_page(migrate->vma, addr, pte); @@ -2267,10 +2285,9 @@ again: /* FIXME support THP */ if (!page || !page->mapping || PageTransCompound(page)) { - mpfn = pfn = 0; + mpfn = 0; goto next; } - pfn = page_to_pfn(page); /* * By getting a reference on the page we pin it and that blocks @@ -2329,6 +2346,11 @@ next: return 0; } +static const struct mm_walk_ops migrate_vma_walk_ops = { + .pmd_entry = migrate_vma_collect_pmd, + .pte_hole = migrate_vma_collect_hole, +}; + /* * migrate_vma_collect() - collect pages over a range of virtual addresses * @migrate: migrate struct containing all migration information @@ -2340,21 +2362,15 @@ next: static void migrate_vma_collect(struct migrate_vma *migrate) { struct mmu_notifier_range range; - struct mm_walk mm_walk = { - .pmd_entry = migrate_vma_collect_pmd, - .pte_hole = migrate_vma_collect_hole, - .vma = migrate->vma, - .mm = migrate->vma->vm_mm, - .private = migrate, - }; - mmu_notifier_range_init(&range, MMU_NOTIFY_CLEAR, 0, NULL, mm_walk.mm, - migrate->start, - migrate->end); + mmu_notifier_range_init(&range, MMU_NOTIFY_CLEAR, 0, NULL, + migrate->vma->vm_mm, migrate->start, migrate->end); mmu_notifier_invalidate_range_start(&range); - walk_page_range(migrate->start, migrate->end, &mm_walk); - mmu_notifier_invalidate_range_end(&range); + walk_page_range(migrate->vma->vm_mm, migrate->start, migrate->end, + &migrate_vma_walk_ops, migrate); + + mmu_notifier_invalidate_range_end(&range); migrate->end = migrate->start + (migrate->npages << PAGE_SHIFT); } @@ -2577,6 +2593,118 @@ restore: } } +/** + * migrate_vma_setup() - prepare to migrate a range of memory + * @args: contains the vma, start, and and pfns arrays for the migration + * + * Returns: negative errno on failures, 0 when 0 or more pages were migrated + * without an error. + * + * Prepare to migrate a range of memory virtual address range by collecting all + * the pages backing each virtual address in the range, saving them inside the + * src array. Then lock those pages and unmap them. Once the pages are locked + * and unmapped, check whether each page is pinned or not. Pages that aren't + * pinned have the MIGRATE_PFN_MIGRATE flag set (by this function) in the + * corresponding src array entry. Then restores any pages that are pinned, by + * remapping and unlocking those pages. + * + * The caller should then allocate destination memory and copy source memory to + * it for all those entries (ie with MIGRATE_PFN_VALID and MIGRATE_PFN_MIGRATE + * flag set). Once these are allocated and copied, the caller must update each + * corresponding entry in the dst array with the pfn value of the destination + * page and with the MIGRATE_PFN_VALID and MIGRATE_PFN_LOCKED flags set + * (destination pages must have their struct pages locked, via lock_page()). + * + * Note that the caller does not have to migrate all the pages that are marked + * with MIGRATE_PFN_MIGRATE flag in src array unless this is a migration from + * device memory to system memory. If the caller cannot migrate a device page + * back to system memory, then it must return VM_FAULT_SIGBUS, which has severe + * consequences for the userspace process, so it must be avoided if at all + * possible. + * + * For empty entries inside CPU page table (pte_none() or pmd_none() is true) we + * do set MIGRATE_PFN_MIGRATE flag inside the corresponding source array thus + * allowing the caller to allocate device memory for those unback virtual + * address. For this the caller simply has to allocate device memory and + * properly set the destination entry like for regular migration. Note that + * this can still fails and thus inside the device driver must check if the + * migration was successful for those entries after calling migrate_vma_pages() + * just like for regular migration. + * + * After that, the callers must call migrate_vma_pages() to go over each entry + * in the src array that has the MIGRATE_PFN_VALID and MIGRATE_PFN_MIGRATE flag + * set. If the corresponding entry in dst array has MIGRATE_PFN_VALID flag set, + * then migrate_vma_pages() to migrate struct page information from the source + * struct page to the destination struct page. If it fails to migrate the + * struct page information, then it clears the MIGRATE_PFN_MIGRATE flag in the + * src array. + * + * At this point all successfully migrated pages have an entry in the src + * array with MIGRATE_PFN_VALID and MIGRATE_PFN_MIGRATE flag set and the dst + * array entry with MIGRATE_PFN_VALID flag set. + * + * Once migrate_vma_pages() returns the caller may inspect which pages were + * successfully migrated, and which were not. Successfully migrated pages will + * have the MIGRATE_PFN_MIGRATE flag set for their src array entry. + * + * It is safe to update device page table after migrate_vma_pages() because + * both destination and source page are still locked, and the mmap_sem is held + * in read mode (hence no one can unmap the range being migrated). + * + * Once the caller is done cleaning up things and updating its page table (if it + * chose to do so, this is not an obligation) it finally calls + * migrate_vma_finalize() to update the CPU page table to point to new pages + * for successfully migrated pages or otherwise restore the CPU page table to + * point to the original source pages. + */ +int migrate_vma_setup(struct migrate_vma *args) +{ + long nr_pages = (args->end - args->start) >> PAGE_SHIFT; + + args->start &= PAGE_MASK; + args->end &= PAGE_MASK; + if (!args->vma || is_vm_hugetlb_page(args->vma) || + (args->vma->vm_flags & VM_SPECIAL) || vma_is_dax(args->vma)) + return -EINVAL; + if (nr_pages <= 0) + return -EINVAL; + if (args->start < args->vma->vm_start || + args->start >= args->vma->vm_end) + return -EINVAL; + if (args->end <= args->vma->vm_start || args->end > args->vma->vm_end) + return -EINVAL; + if (!args->src || !args->dst) + return -EINVAL; + + memset(args->src, 0, sizeof(*args->src) * nr_pages); + args->cpages = 0; + args->npages = 0; + + migrate_vma_collect(args); + + if (args->cpages) + migrate_vma_prepare(args); + if (args->cpages) + migrate_vma_unmap(args); + + /* + * At this point pages are locked and unmapped, and thus they have + * stable content and can safely be copied to destination memory that + * is allocated by the drivers. + */ + return 0; + +} +EXPORT_SYMBOL(migrate_vma_setup); + +/* + * This code closely matches the code in: + * __handle_mm_fault() + * handle_pte_fault() + * do_anonymous_page() + * to map in an anonymous zero page but the struct page will be a ZONE_DEVICE + * private page. + */ static void migrate_vma_insert_page(struct migrate_vma *migrate, unsigned long addr, struct page *page, @@ -2657,30 +2785,24 @@ static void migrate_vma_insert_page(struct migrate_vma *migrate, ptep = pte_offset_map_lock(mm, pmdp, addr, &ptl); + if (check_stable_address_space(mm)) + goto unlock_abort; + if (pte_present(*ptep)) { unsigned long pfn = pte_pfn(*ptep); - if (!is_zero_pfn(pfn)) { - pte_unmap_unlock(ptep, ptl); - mem_cgroup_cancel_charge(page, memcg, false); - goto abort; - } + if (!is_zero_pfn(pfn)) + goto unlock_abort; flush = true; - } else if (!pte_none(*ptep)) { - pte_unmap_unlock(ptep, ptl); - mem_cgroup_cancel_charge(page, memcg, false); - goto abort; - } + } else if (!pte_none(*ptep)) + goto unlock_abort; /* - * Check for usefaultfd but do not deliver the fault. Instead, + * Check for userfaultfd but do not deliver the fault. Instead, * just back off. */ - if (userfaultfd_missing(vma)) { - pte_unmap_unlock(ptep, ptl); - mem_cgroup_cancel_charge(page, memcg, false); - goto abort; - } + if (userfaultfd_missing(vma)) + goto unlock_abort; inc_mm_counter(mm, MM_ANONPAGES); page_add_new_anon_rmap(page, vma, addr, false); @@ -2704,11 +2826,14 @@ static void migrate_vma_insert_page(struct migrate_vma *migrate, *src = MIGRATE_PFN_MIGRATE; return; +unlock_abort: + pte_unmap_unlock(ptep, ptl); + mem_cgroup_cancel_charge(page, memcg, false); abort: *src &= ~MIGRATE_PFN_MIGRATE; } -/* +/** * migrate_vma_pages() - migrate meta-data from src page to dst page * @migrate: migrate struct containing all migration information * @@ -2716,7 +2841,7 @@ abort: * struct page. This effectively finishes the migration from source page to the * destination page. */ -static void migrate_vma_pages(struct migrate_vma *migrate) +void migrate_vma_pages(struct migrate_vma *migrate) { const unsigned long npages = migrate->npages; const unsigned long start = migrate->start; @@ -2736,9 +2861,8 @@ static void migrate_vma_pages(struct migrate_vma *migrate) } if (!page) { - if (!(migrate->src[i] & MIGRATE_PFN_MIGRATE)) { + if (!(migrate->src[i] & MIGRATE_PFN_MIGRATE)) continue; - } if (!notified) { notified = true; @@ -2790,8 +2914,9 @@ static void migrate_vma_pages(struct migrate_vma *migrate) if (notified) mmu_notifier_invalidate_range_only_end(&range); } +EXPORT_SYMBOL(migrate_vma_pages); -/* +/** * migrate_vma_finalize() - restore CPU page table entry * @migrate: migrate struct containing all migration information * @@ -2802,7 +2927,7 @@ static void migrate_vma_pages(struct migrate_vma *migrate) * This also unlocks the pages and puts them back on the lru, or drops the extra * refcount, for device pages. */ -static void migrate_vma_finalize(struct migrate_vma *migrate) +void migrate_vma_finalize(struct migrate_vma *migrate) { const unsigned long npages = migrate->npages; unsigned long i; @@ -2845,124 +2970,5 @@ static void migrate_vma_finalize(struct migrate_vma *migrate) } } } - -/* - * migrate_vma() - migrate a range of memory inside vma - * - * @ops: migration callback for allocating destination memory and copying - * @vma: virtual memory area containing the range to be migrated - * @start: start address of the range to migrate (inclusive) - * @end: end address of the range to migrate (exclusive) - * @src: array of hmm_pfn_t containing source pfns - * @dst: array of hmm_pfn_t containing destination pfns - * @private: pointer passed back to each of the callback - * Returns: 0 on success, error code otherwise - * - * This function tries to migrate a range of memory virtual address range, using - * callbacks to allocate and copy memory from source to destination. First it - * collects all the pages backing each virtual address in the range, saving this - * inside the src array. Then it locks those pages and unmaps them. Once the pages - * are locked and unmapped, it checks whether each page is pinned or not. Pages - * that aren't pinned have the MIGRATE_PFN_MIGRATE flag set (by this function) - * in the corresponding src array entry. It then restores any pages that are - * pinned, by remapping and unlocking those pages. - * - * At this point it calls the alloc_and_copy() callback. For documentation on - * what is expected from that callback, see struct migrate_vma_ops comments in - * include/linux/migrate.h - * - * After the alloc_and_copy() callback, this function goes over each entry in - * the src array that has the MIGRATE_PFN_VALID and MIGRATE_PFN_MIGRATE flag - * set. If the corresponding entry in dst array has MIGRATE_PFN_VALID flag set, - * then the function tries to migrate struct page information from the source - * struct page to the destination struct page. If it fails to migrate the struct - * page information, then it clears the MIGRATE_PFN_MIGRATE flag in the src - * array. - * - * At this point all successfully migrated pages have an entry in the src - * array with MIGRATE_PFN_VALID and MIGRATE_PFN_MIGRATE flag set and the dst - * array entry with MIGRATE_PFN_VALID flag set. - * - * It then calls the finalize_and_map() callback. See comments for "struct - * migrate_vma_ops", in include/linux/migrate.h for details about - * finalize_and_map() behavior. - * - * After the finalize_and_map() callback, for successfully migrated pages, this - * function updates the CPU page table to point to new pages, otherwise it - * restores the CPU page table to point to the original source pages. - * - * Function returns 0 after the above steps, even if no pages were migrated - * (The function only returns an error if any of the arguments are invalid.) - * - * Both src and dst array must be big enough for (end - start) >> PAGE_SHIFT - * unsigned long entries. - */ -int migrate_vma(const struct migrate_vma_ops *ops, - struct vm_area_struct *vma, - unsigned long start, - unsigned long end, - unsigned long *src, - unsigned long *dst, - void *private) -{ - struct migrate_vma migrate; - - /* Sanity check the arguments */ - start &= PAGE_MASK; - end &= PAGE_MASK; - if (!vma || is_vm_hugetlb_page(vma) || (vma->vm_flags & VM_SPECIAL) || - vma_is_dax(vma)) - return -EINVAL; - if (start < vma->vm_start || start >= vma->vm_end) - return -EINVAL; - if (end <= vma->vm_start || end > vma->vm_end) - return -EINVAL; - if (!ops || !src || !dst || start >= end) - return -EINVAL; - - memset(src, 0, sizeof(*src) * ((end - start) >> PAGE_SHIFT)); - migrate.src = src; - migrate.dst = dst; - migrate.start = start; - migrate.npages = 0; - migrate.cpages = 0; - migrate.end = end; - migrate.vma = vma; - - /* Collect, and try to unmap source pages */ - migrate_vma_collect(&migrate); - if (!migrate.cpages) - return 0; - - /* Lock and isolate page */ - migrate_vma_prepare(&migrate); - if (!migrate.cpages) - return 0; - - /* Unmap pages */ - migrate_vma_unmap(&migrate); - if (!migrate.cpages) - return 0; - - /* - * At this point pages are locked and unmapped, and thus they have - * stable content and can safely be copied to destination memory that - * is allocated by the callback. - * - * Note that migration can fail in migrate_vma_struct_page() for each - * individual page. - */ - ops->alloc_and_copy(vma, src, dst, start, end, private); - - /* This does the real migration of struct page */ - migrate_vma_pages(&migrate); - - ops->finalize_and_map(vma, src, dst, start, end, private); - - /* Unlock and remap pages */ - migrate_vma_finalize(&migrate); - - return 0; -} -EXPORT_SYMBOL(migrate_vma); -#endif /* defined(MIGRATE_VMA_HELPER) */ +EXPORT_SYMBOL(migrate_vma_finalize); +#endif /* CONFIG_DEVICE_PRIVATE */ |