diff options
Diffstat (limited to 'mm')
-rw-r--r-- | mm/cma.c | 10 | ||||
-rw-r--r-- | mm/filemap.c | 220 | ||||
-rw-r--r-- | mm/hmm.c | 2 | ||||
-rw-r--r-- | mm/kasan/init.c | 10 | ||||
-rw-r--r-- | mm/memblock.c | 371 | ||||
-rw-r--r-- | mm/memory_hotplug.c | 38 | ||||
-rw-r--r-- | mm/page_alloc.c | 10 | ||||
-rw-r--r-- | mm/page_ext.c | 2 | ||||
-rw-r--r-- | mm/percpu.c | 84 | ||||
-rw-r--r-- | mm/sparse.c | 27 |
10 files changed, 413 insertions, 361 deletions
@@ -327,16 +327,14 @@ int __init cma_declare_contiguous(phys_addr_t base, * memory in case of failure. */ if (base < highmem_start && limit > highmem_start) { - addr = memblock_alloc_range(size, alignment, - highmem_start, limit, - MEMBLOCK_NONE); + addr = memblock_phys_alloc_range(size, alignment, + highmem_start, limit); limit = highmem_start; } if (!addr) { - addr = memblock_alloc_range(size, alignment, base, - limit, - MEMBLOCK_NONE); + addr = memblock_phys_alloc_range(size, alignment, base, + limit); if (!addr) { ret = -ENOMEM; goto err; diff --git a/mm/filemap.c b/mm/filemap.c index a3b4021c448f..d78f577baef2 100644 --- a/mm/filemap.c +++ b/mm/filemap.c @@ -1587,6 +1587,9 @@ EXPORT_SYMBOL(find_lock_entry); * @gfp_mask and added to the page cache and the VM's LRU * list. The page is returned locked and with an increased * refcount. + * - FGP_FOR_MMAP: Similar to FGP_CREAT, only we want to allow the caller to do + * its own locking dance if the page is already in cache, or unlock the page + * before returning if we had to add the page to pagecache. * * If FGP_LOCK or FGP_CREAT are specified then the function may sleep even * if the GFP flags specified for FGP_CREAT are atomic. @@ -1641,7 +1644,7 @@ no_page: if (!page) return NULL; - if (WARN_ON_ONCE(!(fgp_flags & FGP_LOCK))) + if (WARN_ON_ONCE(!(fgp_flags & (FGP_LOCK | FGP_FOR_MMAP)))) fgp_flags |= FGP_LOCK; /* Init accessed so avoid atomic mark_page_accessed later */ @@ -1655,6 +1658,13 @@ no_page: if (err == -EEXIST) goto repeat; } + + /* + * add_to_page_cache_lru locks the page, and for mmap we expect + * an unlocked page. + */ + if (page && (fgp_flags & FGP_FOR_MMAP)) + unlock_page(page); } return page; @@ -2379,64 +2389,98 @@ out: EXPORT_SYMBOL(generic_file_read_iter); #ifdef CONFIG_MMU -/** - * page_cache_read - adds requested page to the page cache if not already there - * @file: file to read - * @offset: page index - * @gfp_mask: memory allocation flags - * - * This adds the requested page to the page cache if it isn't already there, - * and schedules an I/O to read in its contents from disk. - * - * Return: %0 on success, negative error code otherwise. - */ -static int page_cache_read(struct file *file, pgoff_t offset, gfp_t gfp_mask) +#define MMAP_LOTSAMISS (100) +static struct file *maybe_unlock_mmap_for_io(struct vm_fault *vmf, + struct file *fpin) { - struct address_space *mapping = file->f_mapping; - struct page *page; - int ret; + int flags = vmf->flags; - do { - page = __page_cache_alloc(gfp_mask); - if (!page) - return -ENOMEM; + if (fpin) + return fpin; - ret = add_to_page_cache_lru(page, mapping, offset, gfp_mask); - if (ret == 0) - ret = mapping->a_ops->readpage(file, page); - else if (ret == -EEXIST) - ret = 0; /* losing race to add is OK */ + /* + * FAULT_FLAG_RETRY_NOWAIT means we don't want to wait on page locks or + * anything, so we only pin the file and drop the mmap_sem if only + * FAULT_FLAG_ALLOW_RETRY is set. + */ + if ((flags & (FAULT_FLAG_ALLOW_RETRY | FAULT_FLAG_RETRY_NOWAIT)) == + FAULT_FLAG_ALLOW_RETRY) { + fpin = get_file(vmf->vma->vm_file); + up_read(&vmf->vma->vm_mm->mmap_sem); + } + return fpin; +} - put_page(page); +/* + * lock_page_maybe_drop_mmap - lock the page, possibly dropping the mmap_sem + * @vmf - the vm_fault for this fault. + * @page - the page to lock. + * @fpin - the pointer to the file we may pin (or is already pinned). + * + * This works similar to lock_page_or_retry in that it can drop the mmap_sem. + * It differs in that it actually returns the page locked if it returns 1 and 0 + * if it couldn't lock the page. If we did have to drop the mmap_sem then fpin + * will point to the pinned file and needs to be fput()'ed at a later point. + */ +static int lock_page_maybe_drop_mmap(struct vm_fault *vmf, struct page *page, + struct file **fpin) +{ + if (trylock_page(page)) + return 1; - } while (ret == AOP_TRUNCATED_PAGE); + /* + * NOTE! This will make us return with VM_FAULT_RETRY, but with + * the mmap_sem still held. That's how FAULT_FLAG_RETRY_NOWAIT + * is supposed to work. We have way too many special cases.. + */ + if (vmf->flags & FAULT_FLAG_RETRY_NOWAIT) + return 0; - return ret; + *fpin = maybe_unlock_mmap_for_io(vmf, *fpin); + if (vmf->flags & FAULT_FLAG_KILLABLE) { + if (__lock_page_killable(page)) { + /* + * We didn't have the right flags to drop the mmap_sem, + * but all fault_handlers only check for fatal signals + * if we return VM_FAULT_RETRY, so we need to drop the + * mmap_sem here and return 0 if we don't have a fpin. + */ + if (*fpin == NULL) + up_read(&vmf->vma->vm_mm->mmap_sem); + return 0; + } + } else + __lock_page(page); + return 1; } -#define MMAP_LOTSAMISS (100) /* - * Synchronous readahead happens when we don't even find - * a page in the page cache at all. + * Synchronous readahead happens when we don't even find a page in the page + * cache at all. We don't want to perform IO under the mmap sem, so if we have + * to drop the mmap sem we return the file that was pinned in order for us to do + * that. If we didn't pin a file then we return NULL. The file that is + * returned needs to be fput()'ed when we're done with it. */ -static void do_sync_mmap_readahead(struct vm_area_struct *vma, - struct file_ra_state *ra, - struct file *file, - pgoff_t offset) +static struct file *do_sync_mmap_readahead(struct vm_fault *vmf) { + struct file *file = vmf->vma->vm_file; + struct file_ra_state *ra = &file->f_ra; struct address_space *mapping = file->f_mapping; + struct file *fpin = NULL; + pgoff_t offset = vmf->pgoff; /* If we don't want any read-ahead, don't bother */ - if (vma->vm_flags & VM_RAND_READ) - return; + if (vmf->vma->vm_flags & VM_RAND_READ) + return fpin; if (!ra->ra_pages) - return; + return fpin; - if (vma->vm_flags & VM_SEQ_READ) { + if (vmf->vma->vm_flags & VM_SEQ_READ) { + fpin = maybe_unlock_mmap_for_io(vmf, fpin); page_cache_sync_readahead(mapping, ra, file, offset, ra->ra_pages); - return; + return fpin; } /* Avoid banging the cache line if not needed */ @@ -2448,37 +2492,44 @@ static void do_sync_mmap_readahead(struct vm_area_struct *vma, * stop bothering with read-ahead. It will only hurt. */ if (ra->mmap_miss > MMAP_LOTSAMISS) - return; + return fpin; /* * mmap read-around */ + fpin = maybe_unlock_mmap_for_io(vmf, fpin); ra->start = max_t(long, 0, offset - ra->ra_pages / 2); ra->size = ra->ra_pages; ra->async_size = ra->ra_pages / 4; ra_submit(ra, mapping, file); + return fpin; } /* * Asynchronous readahead happens when we find the page and PG_readahead, - * so we want to possibly extend the readahead further.. + * so we want to possibly extend the readahead further. We return the file that + * was pinned if we have to drop the mmap_sem in order to do IO. */ -static void do_async_mmap_readahead(struct vm_area_struct *vma, - struct file_ra_state *ra, - struct file *file, - struct page *page, - pgoff_t offset) +static struct file *do_async_mmap_readahead(struct vm_fault *vmf, + struct page *page) { + struct file *file = vmf->vma->vm_file; + struct file_ra_state *ra = &file->f_ra; struct address_space *mapping = file->f_mapping; + struct file *fpin = NULL; + pgoff_t offset = vmf->pgoff; /* If we don't want any read-ahead, don't bother */ - if (vma->vm_flags & VM_RAND_READ) - return; + if (vmf->vma->vm_flags & VM_RAND_READ) + return fpin; if (ra->mmap_miss > 0) ra->mmap_miss--; - if (PageReadahead(page)) + if (PageReadahead(page)) { + fpin = maybe_unlock_mmap_for_io(vmf, fpin); page_cache_async_readahead(mapping, ra, file, page, offset, ra->ra_pages); + } + return fpin; } /** @@ -2510,6 +2561,7 @@ vm_fault_t filemap_fault(struct vm_fault *vmf) { int error; struct file *file = vmf->vma->vm_file; + struct file *fpin = NULL; struct address_space *mapping = file->f_mapping; struct file_ra_state *ra = &file->f_ra; struct inode *inode = mapping->host; @@ -2531,23 +2583,26 @@ vm_fault_t filemap_fault(struct vm_fault *vmf) * We found the page, so try async readahead before * waiting for the lock. */ - do_async_mmap_readahead(vmf->vma, ra, file, page, offset); + fpin = do_async_mmap_readahead(vmf, page); } else if (!page) { /* No page in the page cache at all */ - do_sync_mmap_readahead(vmf->vma, ra, file, offset); count_vm_event(PGMAJFAULT); count_memcg_event_mm(vmf->vma->vm_mm, PGMAJFAULT); ret = VM_FAULT_MAJOR; + fpin = do_sync_mmap_readahead(vmf); retry_find: - page = find_get_page(mapping, offset); - if (!page) - goto no_cached_page; + page = pagecache_get_page(mapping, offset, + FGP_CREAT|FGP_FOR_MMAP, + vmf->gfp_mask); + if (!page) { + if (fpin) + goto out_retry; + return vmf_error(-ENOMEM); + } } - if (!lock_page_or_retry(page, vmf->vma->vm_mm, vmf->flags)) { - put_page(page); - return ret | VM_FAULT_RETRY; - } + if (!lock_page_maybe_drop_mmap(vmf, page, &fpin)) + goto out_retry; /* Did it get truncated? */ if (unlikely(page->mapping != mapping)) { @@ -2565,6 +2620,16 @@ retry_find: goto page_not_uptodate; /* + * We've made it this far and we had to drop our mmap_sem, now is the + * time to return to the upper layer and have it re-find the vma and + * redo the fault. + */ + if (fpin) { + unlock_page(page); + goto out_retry; + } + + /* * Found the page and have a reference on it. * We must recheck i_size under page lock. */ @@ -2578,28 +2643,6 @@ retry_find: vmf->page = page; return ret | VM_FAULT_LOCKED; -no_cached_page: - /* - * We're only likely to ever get here if MADV_RANDOM is in - * effect. - */ - error = page_cache_read(file, offset, vmf->gfp_mask); - - /* - * The page we want has now been added to the page cache. - * In the unlikely event that someone removed it in the - * meantime, we'll just come back here and read it again. - */ - if (error >= 0) - goto retry_find; - - /* - * An error return from page_cache_read can result if the - * system is low on memory, or a problem occurs while trying - * to schedule I/O. - */ - return vmf_error(error); - page_not_uptodate: /* * Umm, take care of errors if the page isn't up-to-date. @@ -2608,12 +2651,15 @@ page_not_uptodate: * and we need to check for errors. */ ClearPageError(page); + fpin = maybe_unlock_mmap_for_io(vmf, fpin); error = mapping->a_ops->readpage(file, page); if (!error) { wait_on_page_locked(page); if (!PageUptodate(page)) error = -EIO; } + if (fpin) + goto out_retry; put_page(page); if (!error || error == AOP_TRUNCATED_PAGE) @@ -2622,6 +2668,18 @@ page_not_uptodate: /* Things didn't work out. Return zero to tell the mm layer so. */ shrink_readahead_size_eio(file, ra); return VM_FAULT_SIGBUS; + +out_retry: + /* + * We dropped the mmap_sem, we need to return to the fault handler to + * re-find the vma and come back and find our hopefully still populated + * page. + */ + if (page) + put_page(page); + if (fpin) + fput(fpin); + return ret | VM_FAULT_RETRY; } EXPORT_SYMBOL(filemap_fault); @@ -990,7 +990,7 @@ static void hmm_devmem_ref_kill(struct percpu_ref *ref) percpu_ref_kill(ref); } -static int hmm_devmem_fault(struct vm_area_struct *vma, +static vm_fault_t hmm_devmem_fault(struct vm_area_struct *vma, unsigned long addr, const struct page *page, unsigned int flags, diff --git a/mm/kasan/init.c b/mm/kasan/init.c index fcaa1ca03175..ce45c491ebcd 100644 --- a/mm/kasan/init.c +++ b/mm/kasan/init.c @@ -83,8 +83,14 @@ static inline bool kasan_early_shadow_page_entry(pte_t pte) static __init void *early_alloc(size_t size, int node) { - return memblock_alloc_try_nid(size, size, __pa(MAX_DMA_ADDRESS), - MEMBLOCK_ALLOC_ACCESSIBLE, node); + void *ptr = memblock_alloc_try_nid(size, size, __pa(MAX_DMA_ADDRESS), + MEMBLOCK_ALLOC_ACCESSIBLE, node); + + if (!ptr) + panic("%s: Failed to allocate %zu bytes align=%zx nid=%d from=%llx\n", + __func__, size, size, node, (u64)__pa(MAX_DMA_ADDRESS)); + + return ptr; } static void __ref zero_pte_populate(pmd_t *pmd, unsigned long addr, diff --git a/mm/memblock.c b/mm/memblock.c index 470601115892..e7665cf914b1 100644 --- a/mm/memblock.c +++ b/mm/memblock.c @@ -76,8 +76,19 @@ * :c:func:`memblock_set_node`. The :c:func:`memblock_add_node` * performs such an assignment directly. * - * Once memblock is setup the memory can be allocated using either - * memblock or bootmem APIs. + * Once memblock is setup the memory can be allocated using one of the + * API variants: + * + * * :c:func:`memblock_phys_alloc*` - these functions return the + * **physical** address of the allocated memory + * * :c:func:`memblock_alloc*` - these functions return the **virtual** + * address of the allocated memory. + * + * Note, that both API variants use implict assumptions about allowed + * memory ranges and the fallback methods. Consult the documentation + * of :c:func:`memblock_alloc_internal` and + * :c:func:`memblock_alloc_range_nid` functions for more elaboarte + * description. * * As the system boot progresses, the architecture specific * :c:func:`mem_init` function frees all the memory to the buddy page @@ -132,7 +143,7 @@ static int memblock_can_resize __initdata_memblock; static int memblock_memory_in_slab __initdata_memblock = 0; static int memblock_reserved_in_slab __initdata_memblock = 0; -enum memblock_flags __init_memblock choose_memblock_flags(void) +static enum memblock_flags __init_memblock choose_memblock_flags(void) { return system_has_some_mirror ? MEMBLOCK_MIRROR : MEMBLOCK_NONE; } @@ -261,7 +272,7 @@ __memblock_find_range_top_down(phys_addr_t start, phys_addr_t end, * Return: * Found address on success, 0 on failure. */ -phys_addr_t __init_memblock memblock_find_in_range_node(phys_addr_t size, +static phys_addr_t __init_memblock memblock_find_in_range_node(phys_addr_t size, phys_addr_t align, phys_addr_t start, phys_addr_t end, int nid, enum memblock_flags flags) @@ -435,17 +446,7 @@ static int __init_memblock memblock_double_array(struct memblock_type *type, else in_slab = &memblock_reserved_in_slab; - /* Try to find some space for it. - * - * WARNING: We assume that either slab_is_available() and we use it or - * we use MEMBLOCK for allocations. That means that this is unsafe to - * use when bootmem is currently active (unless bootmem itself is - * implemented on top of MEMBLOCK which isn't the case yet) - * - * This should however not be an issue for now, as we currently only - * call into MEMBLOCK while it's still active, or much later when slab - * is active for memory hotplug operations - */ + /* Try to find some space for it */ if (use_slab) { new_array = kmalloc(new_size, GFP_KERNEL); addr = new_array ? __pa(new_array) : 0; @@ -858,11 +859,14 @@ static int __init_memblock memblock_setclr_flag(phys_addr_t base, if (ret) return ret; - for (i = start_rgn; i < end_rgn; i++) + for (i = start_rgn; i < end_rgn; i++) { + struct memblock_region *r = &type->regions[i]; + if (set) - memblock_set_region_flags(&type->regions[i], flag); + r->flags |= flag; else - memblock_clear_region_flags(&type->regions[i], flag); + r->flags &= ~flag; + } memblock_merge_regions(type); return 0; @@ -962,8 +966,31 @@ void __init_memblock __next_reserved_mem_region(u64 *idx, *idx = ULLONG_MAX; } +static bool should_skip_region(struct memblock_region *m, int nid, int flags) +{ + int m_nid = memblock_get_region_node(m); + + /* only memory regions are associated with nodes, check it */ + if (nid != NUMA_NO_NODE && nid != m_nid) + return true; + + /* skip hotpluggable memory regions if needed */ + if (movable_node_is_enabled() && memblock_is_hotpluggable(m)) + return true; + + /* if we want mirror memory skip non-mirror memory regions */ + if ((flags & MEMBLOCK_MIRROR) && !memblock_is_mirror(m)) + return true; + + /* skip nomap memory unless we were asked for it explicitly */ + if (!(flags & MEMBLOCK_NOMAP) && memblock_is_nomap(m)) + return true; + + return false; +} + /** - * __next__mem_range - next function for for_each_free_mem_range() etc. + * __next_mem_range - next function for for_each_free_mem_range() etc. * @idx: pointer to u64 loop variable * @nid: node selector, %NUMA_NO_NODE for all nodes * @flags: pick from blocks based on memory attributes @@ -1009,20 +1036,7 @@ void __init_memblock __next_mem_range(u64 *idx, int nid, phys_addr_t m_end = m->base + m->size; int m_nid = memblock_get_region_node(m); - /* only memory regions are associated with nodes, check it */ - if (nid != NUMA_NO_NODE && nid != m_nid) - continue; - - /* skip hotpluggable memory regions if needed */ - if (movable_node_is_enabled() && memblock_is_hotpluggable(m)) - continue; - - /* if we want mirror memory skip non-mirror memory regions */ - if ((flags & MEMBLOCK_MIRROR) && !memblock_is_mirror(m)) - continue; - - /* skip nomap memory unless we were asked for it explicitly */ - if (!(flags & MEMBLOCK_NOMAP) && memblock_is_nomap(m)) + if (should_skip_region(m, nid, flags)) continue; if (!type_b) { @@ -1126,20 +1140,7 @@ void __init_memblock __next_mem_range_rev(u64 *idx, int nid, phys_addr_t m_end = m->base + m->size; int m_nid = memblock_get_region_node(m); - /* only memory regions are associated with nodes, check it */ - if (nid != NUMA_NO_NODE && nid != m_nid) - continue; - - /* skip hotpluggable memory regions if needed */ - if (movable_node_is_enabled() && memblock_is_hotpluggable(m)) - continue; - - /* if we want mirror memory skip non-mirror memory regions */ - if ((flags & MEMBLOCK_MIRROR) && !memblock_is_mirror(m)) - continue; - - /* skip nomap memory unless we were asked for it explicitly */ - if (!(flags & MEMBLOCK_NOMAP) && memblock_is_nomap(m)) + if (should_skip_region(m, nid, flags)) continue; if (!type_b) { @@ -1255,94 +1256,123 @@ int __init_memblock memblock_set_node(phys_addr_t base, phys_addr_t size, } #endif /* CONFIG_HAVE_MEMBLOCK_NODE_MAP */ +/** + * memblock_alloc_range_nid - allocate boot memory block + * @size: size of memory block to be allocated in bytes + * @align: alignment of the region and block's size + * @start: the lower bound of the memory region to allocate (phys address) + * @end: the upper bound of the memory region to allocate (phys address) + * @nid: nid of the free area to find, %NUMA_NO_NODE for any node + * + * The allocation is performed from memory region limited by + * memblock.current_limit if @max_addr == %MEMBLOCK_ALLOC_ACCESSIBLE. + * + * If the specified node can not hold the requested memory the + * allocation falls back to any node in the system + * + * For systems with memory mirroring, the allocation is attempted first + * from the regions with mirroring enabled and then retried from any + * memory region. + * + * In addition, function sets the min_count to 0 using kmemleak_alloc_phys for + * allocated boot memory block, so that it is never reported as leaks. + * + * Return: + * Physical address of allocated memory block on success, %0 on failure. + */ static phys_addr_t __init memblock_alloc_range_nid(phys_addr_t size, phys_addr_t align, phys_addr_t start, - phys_addr_t end, int nid, - enum memblock_flags flags) + phys_addr_t end, int nid) { + enum memblock_flags flags = choose_memblock_flags(); phys_addr_t found; + if (WARN_ONCE(nid == MAX_NUMNODES, "Usage of MAX_NUMNODES is deprecated. Use NUMA_NO_NODE instead\n")) + nid = NUMA_NO_NODE; + if (!align) { /* Can't use WARNs this early in boot on powerpc */ dump_stack(); align = SMP_CACHE_BYTES; } + if (end > memblock.current_limit) + end = memblock.current_limit; + +again: found = memblock_find_in_range_node(size, align, start, end, nid, flags); - if (found && !memblock_reserve(found, size)) { - /* - * The min_count is set to 0 so that memblock allocations are - * never reported as leaks. - */ - kmemleak_alloc_phys(found, size, 0, 0); - return found; - } - return 0; -} - -phys_addr_t __init memblock_alloc_range(phys_addr_t size, phys_addr_t align, - phys_addr_t start, phys_addr_t end, - enum memblock_flags flags) -{ - return memblock_alloc_range_nid(size, align, start, end, NUMA_NO_NODE, - flags); -} - -phys_addr_t __init memblock_alloc_base_nid(phys_addr_t size, - phys_addr_t align, phys_addr_t max_addr, - int nid, enum memblock_flags flags) -{ - return memblock_alloc_range_nid(size, align, 0, max_addr, nid, flags); -} - -phys_addr_t __init memblock_phys_alloc_nid(phys_addr_t size, phys_addr_t align, int nid) -{ - enum memblock_flags flags = choose_memblock_flags(); - phys_addr_t ret; + if (found && !memblock_reserve(found, size)) + goto done; -again: - ret = memblock_alloc_base_nid(size, align, MEMBLOCK_ALLOC_ACCESSIBLE, - nid, flags); + if (nid != NUMA_NO_NODE) { + found = memblock_find_in_range_node(size, align, start, + end, NUMA_NO_NODE, + flags); + if (found && !memblock_reserve(found, size)) + goto done; + } - if (!ret && (flags & MEMBLOCK_MIRROR)) { + if (flags & MEMBLOCK_MIRROR) { flags &= ~MEMBLOCK_MIRROR; + pr_warn("Could not allocate %pap bytes of mirrored memory\n", + &size); goto again; } - return ret; -} - -phys_addr_t __init __memblock_alloc_base(phys_addr_t size, phys_addr_t align, phys_addr_t max_addr) -{ - return memblock_alloc_base_nid(size, align, max_addr, NUMA_NO_NODE, - MEMBLOCK_NONE); -} -phys_addr_t __init memblock_alloc_base(phys_addr_t size, phys_addr_t align, phys_addr_t max_addr) -{ - phys_addr_t alloc; - - alloc = __memblock_alloc_base(size, align, max_addr); + return 0; - if (alloc == 0) - panic("ERROR: Failed to allocate %pa bytes below %pa.\n", - &size, &max_addr); +done: + /* Skip kmemleak for kasan_init() due to high volume. */ + if (end != MEMBLOCK_ALLOC_KASAN) + /* + * The min_count is set to 0 so that memblock allocated + * blocks are never reported as leaks. This is because many + * of these blocks are only referred via the physical + * address which is not looked up by kmemleak. + */ + kmemleak_alloc_phys(found, size, 0, 0); - return alloc; + return found; } -phys_addr_t __init memblock_phys_alloc(phys_addr_t size, phys_addr_t align) +/** + * memblock_phys_alloc_range - allocate a memory block inside specified range + * @size: size of memory block to be allocated in bytes + * @align: alignment of the region and block's size + * @start: the lower bound of the memory region to allocate (physical address) + * @end: the upper bound of the memory region to allocate (physical address) + * + * Allocate @size bytes in the between @start and @end. + * + * Return: physical address of the allocated memory block on success, + * %0 on failure. + */ +phys_addr_t __init memblock_phys_alloc_range(phys_addr_t size, + phys_addr_t align, + phys_addr_t start, + phys_addr_t end) { - return memblock_alloc_base(size, align, MEMBLOCK_ALLOC_ACCESSIBLE); + return memblock_alloc_range_nid(size, align, start, end, NUMA_NO_NODE); } +/** + * memblock_phys_alloc_try_nid - allocate a memory block from specified MUMA node + * @size: size of memory block to be allocated in bytes + * @align: alignment of the region and block's size + * @nid: nid of the free area to find, %NUMA_NO_NODE for any node + * + * Allocates memory block from the specified NUMA node. If the node + * has no available memory, attempts to allocated from any node in the + * system. + * + * Return: physical address of the allocated memory block on success, + * %0 on failure. + */ phys_addr_t __init memblock_phys_alloc_try_nid(phys_addr_t size, phys_addr_t align, int nid) { - phys_addr_t res = memblock_phys_alloc_nid(size, align, nid); - - if (res) - return res; - return memblock_alloc_base(size, align, MEMBLOCK_ALLOC_ACCESSIBLE); + return memblock_alloc_range_nid(size, align, 0, + MEMBLOCK_ALLOC_ACCESSIBLE, nid); } /** @@ -1353,19 +1383,13 @@ phys_addr_t __init memblock_phys_alloc_try_nid(phys_addr_t size, phys_addr_t ali * @max_addr: the upper bound of the memory region to allocate (phys address) * @nid: nid of the free area to find, %NUMA_NO_NODE for any node * - * The @min_addr limit is dropped if it can not be satisfied and the allocation - * will fall back to memory below @min_addr. Also, allocation may fall back - * to any node in the system if the specified node can not - * hold the requested memory. + * Allocates memory block using memblock_alloc_range_nid() and + * converts the returned physical address to virtual. * - * The allocation is performed from memory region limited by - * memblock.current_limit if @max_addr == %MEMBLOCK_ALLOC_ACCESSIBLE. - * - * The phys address of allocated boot memory block is converted to virtual and - * allocated memory is reset to 0. - * - * In addition, function sets the min_count to 0 using kmemleak_alloc for - * allocated boot memory block, so that it is never reported as leaks. + * The @min_addr limit is dropped if it can not be satisfied and the allocation + * will fall back to memory below @min_addr. Other constraints, such + * as node and mirrored memory will be handled again in + * memblock_alloc_range_nid(). * * Return: * Virtual address of allocated memory block on success, NULL on failure. @@ -1376,11 +1400,6 @@ static void * __init memblock_alloc_internal( int nid) { phys_addr_t alloc; - void *ptr; - enum memblock_flags flags = choose_memblock_flags(); - - if (WARN_ONCE(nid == MAX_NUMNODES, "Usage of MAX_NUMNODES is deprecated. Use NUMA_NO_NODE instead\n")) - nid = NUMA_NO_NODE; /* * Detect any accidental use of these APIs after slab is ready, as at @@ -1390,54 +1409,16 @@ static void * __init memblock_alloc_internal( if (WARN_ON_ONCE(slab_is_available())) return kzalloc_node(size, GFP_NOWAIT, nid); - if (!align) { - dump_stack(); - align = SMP_CACHE_BYTES; - } + alloc = memblock_alloc_range_nid(size, align, min_addr, max_addr, nid); - if (max_addr > memblock.current_limit) - max_addr = memblock.current_limit; -again: - alloc = memblock_find_in_range_node(size, align, min_addr, max_addr, - nid, flags); - if (alloc && !memblock_reserve(alloc, size)) - goto done; + /* retry allocation without lower limit */ + if (!alloc && min_addr) + alloc = memblock_alloc_range_nid(size, align, 0, max_addr, nid); - if (nid != NUMA_NO_NODE) { - alloc = memblock_find_in_range_node(size, align, min_addr, - max_addr, NUMA_NO_NODE, - flags); - if (alloc && !memblock_reserve(alloc, size)) - goto done; - } - - if (min_addr) { - min_addr = 0; - goto again; - } + if (!alloc) + return NULL; - if (flags & MEMBLOCK_MIRROR) { - flags &= ~MEMBLOCK_MIRROR; - pr_warn("Could not allocate %pap bytes of mirrored memory\n", - &size); - goto again; - } - - return NULL; -done: - ptr = phys_to_virt(alloc); - - /* Skip kmemleak for kasan_init() due to high volume. */ - if (max_addr != MEMBLOCK_ALLOC_KASAN) - /* - * The min_count is set to 0 so that bootmem allocated - * blocks are never reported as leaks. This is because many - * of these blocks are only referred via the physical - * address which is not looked up by kmemleak. - */ - kmemleak_alloc(ptr, size, 0, 0); - - return ptr; + return phys_to_virt(alloc); } /** @@ -1479,7 +1460,7 @@ void * __init memblock_alloc_try_nid_raw( } /** - * memblock_alloc_try_nid_nopanic - allocate boot memory block + * memblock_alloc_try_nid - allocate boot memory block * @size: size of memory block to be allocated in bytes * @align: alignment of the region and block's size * @min_addr: the lower bound of the memory region from where the allocation @@ -1495,42 +1476,6 @@ void * __init memblock_alloc_try_nid_raw( * Return: * Virtual address of allocated memory block on success, NULL on failure. */ -void * __init memblock_alloc_try_nid_nopanic( - phys_addr_t size, phys_addr_t align, - phys_addr_t min_addr, phys_addr_t max_addr, - int nid) -{ - void *ptr; - - memblock_dbg("%s: %llu bytes align=0x%llx nid=%d from=%pa max_addr=%pa %pF\n", - __func__, (u64)size, (u64)align, nid, &min_addr, - &max_addr, (void *)_RET_IP_); - - ptr = memblock_alloc_internal(size, align, - min_addr, max_addr, nid); - if (ptr) - memset(ptr, 0, size); - return ptr; -} - -/** - * memblock_alloc_try_nid - allocate boot memory block with panicking - * @size: size of memory block to be allocated in bytes - * @align: alignment of the region and block's size - * @min_addr: the lower bound of the memory region from where the allocation - * is preferred (phys address) - * @max_addr: the upper bound of the memory region from where the allocation - * is preferred (phys address), or %MEMBLOCK_ALLOC_ACCESSIBLE to - * allocate only from memory limited by memblock.current_limit value - * @nid: nid of the free area to find, %NUMA_NO_NODE for any node - * - * Public panicking version of memblock_alloc_try_nid_nopanic() - * which provides debug information (including caller info), if enabled, - * and panics if the request can not be satisfied. - * - * Return: - * Virtual address of allocated memory block on success, NULL on failure. - */ void * __init memblock_alloc_try_nid( phys_addr_t size, phys_addr_t align, phys_addr_t min_addr, phys_addr_t max_addr, @@ -1543,24 +1488,20 @@ void * __init memblock_alloc_try_nid( &max_addr, (void *)_RET_IP_); ptr = memblock_alloc_internal(size, align, min_addr, max_addr, nid); - if (ptr) { + if (ptr) memset(ptr, 0, size); - return ptr; - } - panic("%s: Failed to allocate %llu bytes align=0x%llx nid=%d from=%pa max_addr=%pa\n", - __func__, (u64)size, (u64)align, nid, &min_addr, &max_addr); - return NULL; + return ptr; } /** - * __memblock_free_late - free bootmem block pages directly to buddy allocator + * __memblock_free_late - free pages directly to buddy allocator * @base: phys starting address of the boot memory block * @size: size of the boot memory block in bytes * - * This is only useful when the bootmem allocator has already been torn + * This is only useful when the memblock allocator has already been torn * down, but we are still initializing the system. Pages are released directly - * to the buddy allocator, no bootmem metadata is updated because it is gone. + * to the buddy allocator. */ void __init __memblock_free_late(phys_addr_t base, phys_addr_t size) { diff --git a/mm/memory_hotplug.c b/mm/memory_hotplug.c index 6b05576fb4ec..f767582af4f8 100644 --- a/mm/memory_hotplug.c +++ b/mm/memory_hotplug.c @@ -96,27 +96,29 @@ void mem_hotplug_done(void) cpus_read_unlock(); } +u64 max_mem_size = U64_MAX; + /* add this memory to iomem resource */ static struct resource *register_memory_resource(u64 start, u64 size) { - struct resource *res, *conflict; - res = kzalloc(sizeof(struct resource), GFP_KERNEL); - if (!res) - return ERR_PTR(-ENOMEM); - - res->name = "System RAM"; - res->start = start; - res->end = start + size - 1; - res->flags = IORESOURCE_SYSTEM_RAM | IORESOURCE_BUSY; - conflict = request_resource_conflict(&iomem_resource, res); - if (conflict) { - if (conflict->desc == IORES_DESC_DEVICE_PRIVATE_MEMORY) { - pr_debug("Device unaddressable memory block " - "memory hotplug at %#010llx !\n", - (unsigned long long)start); - } - pr_debug("System RAM resource %pR cannot be added\n", res); - kfree(res); + struct resource *res; + unsigned long flags = IORESOURCE_SYSTEM_RAM | IORESOURCE_BUSY; + char *resource_name = "System RAM"; + + if (start + size > max_mem_size) + return ERR_PTR(-E2BIG); + + /* + * Request ownership of the new memory range. This might be + * a child of an existing resource that was present but + * not marked as busy. + */ + res = __request_region(&iomem_resource, start, size, + resource_name, flags); + + if (!res) { + pr_debug("Unable to reserve System RAM region: %016llx->%016llx\n", + start, start + size); return ERR_PTR(-EEXIST); } return res; diff --git a/mm/page_alloc.c b/mm/page_alloc.c index 3eb01dedfb50..03fcf73d47da 100644 --- a/mm/page_alloc.c +++ b/mm/page_alloc.c @@ -6445,8 +6445,8 @@ static void __ref setup_usemap(struct pglist_data *pgdat, zone->pageblock_flags = NULL; if (usemapsize) { zone->pageblock_flags = - memblock_alloc_node_nopanic(usemapsize, - pgdat->node_id); + memblock_alloc_node(usemapsize, SMP_CACHE_BYTES, + pgdat->node_id); if (!zone->pageblock_flags) panic("Failed to allocate %ld bytes for zone %s pageblock flags on node %d\n", usemapsize, zone->name, pgdat->node_id); @@ -6679,7 +6679,8 @@ static void __ref alloc_node_mem_map(struct pglist_data *pgdat) end = pgdat_end_pfn(pgdat); end = ALIGN(end, MAX_ORDER_NR_PAGES); size = (end - start) * sizeof(struct page); - map = memblock_alloc_node_nopanic(size, pgdat->node_id); + map = memblock_alloc_node(size, SMP_CACHE_BYTES, + pgdat->node_id); if (!map) panic("Failed to allocate %ld bytes for node %d memory map\n", size, pgdat->node_id); @@ -7959,8 +7960,7 @@ void *__init alloc_large_system_hash(const char *tablename, size = bucketsize << log2qty; if (flags & HASH_EARLY) { if (flags & HASH_ZERO) - table = memblock_alloc_nopanic(size, - SMP_CACHE_BYTES); + table = memblock_alloc(size, SMP_CACHE_BYTES); else table = memblock_alloc_raw(size, SMP_CACHE_BYTES); diff --git a/mm/page_ext.c b/mm/page_ext.c index ab4244920e0f..d8f1aca4ad43 100644 --- a/mm/page_ext.c +++ b/mm/page_ext.c @@ -161,7 +161,7 @@ static int __init alloc_node_page_ext(int nid) table_size = get_entry_size() * nr_pages; - base = memblock_alloc_try_nid_nopanic( + base = memblock_alloc_try_nid( table_size, PAGE_SIZE, __pa(MAX_DMA_ADDRESS), MEMBLOCK_ALLOC_ACCESSIBLE, nid); if (!base) diff --git a/mm/percpu.c b/mm/percpu.c index c5c750781628..2e6fc8d552c9 100644 --- a/mm/percpu.c +++ b/mm/percpu.c @@ -1086,6 +1086,7 @@ static struct pcpu_chunk * __init pcpu_alloc_first_chunk(unsigned long tmp_addr, struct pcpu_chunk *chunk; unsigned long aligned_addr, lcm_align; int start_offset, offset_bits, region_size, region_bits; + size_t alloc_size; /* region calculations */ aligned_addr = tmp_addr & PAGE_MASK; @@ -1101,9 +1102,12 @@ static struct pcpu_chunk * __init pcpu_alloc_first_chunk(unsigned long tmp_addr, region_size = ALIGN(start_offset + map_size, lcm_align); /* allocate chunk */ - chunk = memblock_alloc(sizeof(struct pcpu_chunk) + - BITS_TO_LONGS(region_size >> PAGE_SHIFT), - SMP_CACHE_BYTES); + alloc_size = sizeof(struct pcpu_chunk) + + BITS_TO_LONGS(region_size >> PAGE_SHIFT); + chunk = memblock_alloc(alloc_size, SMP_CACHE_BYTES); + if (!chunk) + panic("%s: Failed to allocate %zu bytes\n", __func__, + alloc_size); INIT_LIST_HEAD(&chunk->list); @@ -1114,12 +1118,25 @@ static struct pcpu_chunk * __init pcpu_alloc_first_chunk(unsigned long tmp_addr, chunk->nr_pages = region_size >> PAGE_SHIFT; region_bits = pcpu_chunk_map_bits(chunk); - chunk->alloc_map = memblock_alloc(BITS_TO_LONGS(region_bits) * sizeof(chunk->alloc_map[0]), - SMP_CACHE_BYTES); - chunk->bound_map = memblock_alloc(BITS_TO_LONGS(region_bits + 1) * sizeof(chunk->bound_map[0]), - SMP_CACHE_BYTES); - chunk->md_blocks = memblock_alloc(pcpu_chunk_nr_blocks(chunk) * sizeof(chunk->md_blocks[0]), - SMP_CACHE_BYTES); + alloc_size = BITS_TO_LONGS(region_bits) * sizeof(chunk->alloc_map[0]); + chunk->alloc_map = memblock_alloc(alloc_size, SMP_CACHE_BYTES); + if (!chunk->alloc_map) + panic("%s: Failed to allocate %zu bytes\n", __func__, + alloc_size); + + alloc_size = + BITS_TO_LONGS(region_bits + 1) * sizeof(chunk->bound_map[0]); + chunk->bound_map = memblock_alloc(alloc_size, SMP_CACHE_BYTES); + if (!chunk->bound_map) + panic("%s: Failed to allocate %zu bytes\n", __func__, + alloc_size); + + alloc_size = pcpu_chunk_nr_blocks(chunk) * sizeof(chunk->md_blocks[0]); + chunk->md_blocks = memblock_alloc(alloc_size, SMP_CACHE_BYTES); + if (!chunk->md_blocks) + panic("%s: Failed to allocate %zu bytes\n", __func__, + alloc_size); + pcpu_init_md_blocks(chunk); /* manage populated page bitmap */ @@ -1888,7 +1905,7 @@ struct pcpu_alloc_info * __init pcpu_alloc_alloc_info(int nr_groups, __alignof__(ai->groups[0].cpu_map[0])); ai_size = base_size + nr_units * sizeof(ai->groups[0].cpu_map[0]); - ptr = memblock_alloc_nopanic(PFN_ALIGN(ai_size), PAGE_SIZE); + ptr = memblock_alloc(PFN_ALIGN(ai_size), PAGE_SIZE); if (!ptr) return NULL; ai = ptr; @@ -2044,6 +2061,7 @@ int __init pcpu_setup_first_chunk(const struct pcpu_alloc_info *ai, int group, unit, i; int map_size; unsigned long tmp_addr; + size_t alloc_size; #define PCPU_SETUP_BUG_ON(cond) do { \ if (unlikely(cond)) { \ @@ -2075,14 +2093,29 @@ int __init pcpu_setup_first_chunk(const struct pcpu_alloc_info *ai, PCPU_SETUP_BUG_ON(pcpu_verify_alloc_info(ai) < 0); /* process group information and build config tables accordingly */ - group_offsets = memblock_alloc(ai->nr_groups * sizeof(group_offsets[0]), - SMP_CACHE_BYTES); - group_sizes = memblock_alloc(ai->nr_groups * sizeof(group_sizes[0]), - SMP_CACHE_BYTES); - unit_map = memblock_alloc(nr_cpu_ids * sizeof(unit_map[0]), - SMP_CACHE_BYTES); - unit_off = memblock_alloc(nr_cpu_ids * sizeof(unit_off[0]), - SMP_CACHE_BYTES); + alloc_size = ai->nr_groups * sizeof(group_offsets[0]); + group_offsets = memblock_alloc(alloc_size, SMP_CACHE_BYTES); + if (!group_offsets) + panic("%s: Failed to allocate %zu bytes\n", __func__, + alloc_size); + + alloc_size = ai->nr_groups * sizeof(group_sizes[0]); + group_sizes = memblock_alloc(alloc_size, SMP_CACHE_BYTES); + if (!group_sizes) + panic("%s: Failed to allocate %zu bytes\n", __func__, + alloc_size); + + alloc_size = nr_cpu_ids * sizeof(unit_map[0]); + unit_map = memblock_alloc(alloc_size, SMP_CACHE_BYTES); + if (!unit_map) + panic("%s: Failed to allocate %zu bytes\n", __func__, + alloc_size); + + alloc_size = nr_cpu_ids * sizeof(unit_off[0]); + unit_off = memblock_alloc(alloc_size, SMP_CACHE_BYTES); + if (!unit_off) + panic("%s: Failed to allocate %zu bytes\n", __func__, + alloc_size); for (cpu = 0; cpu < nr_cpu_ids; cpu++) unit_map[cpu] = UINT_MAX; @@ -2148,6 +2181,9 @@ int __init pcpu_setup_first_chunk(const struct pcpu_alloc_info *ai, pcpu_nr_slots = __pcpu_size_to_slot(pcpu_unit_size) + 2; pcpu_slot = memblock_alloc(pcpu_nr_slots * sizeof(pcpu_slot[0]), SMP_CACHE_BYTES); + if (!pcpu_slot) + panic("%s: Failed to allocate %zu bytes\n", __func__, + pcpu_nr_slots * sizeof(pcpu_slot[0])); for (i = 0; i < pcpu_nr_slots; i++) INIT_LIST_HEAD(&pcpu_slot[i]); @@ -2460,7 +2496,7 @@ int __init pcpu_embed_first_chunk(size_t reserved_size, size_t dyn_size, size_sum = ai->static_size + ai->reserved_size + ai->dyn_size; areas_size = PFN_ALIGN(ai->nr_groups * sizeof(void *)); - areas = memblock_alloc_nopanic(areas_size, SMP_CACHE_BYTES); + areas = memblock_alloc(areas_size, SMP_CACHE_BYTES); if (!areas) { rc = -ENOMEM; goto out_free; @@ -2602,6 +2638,9 @@ int __init pcpu_page_first_chunk(size_t reserved_size, pages_size = PFN_ALIGN(unit_pages * num_possible_cpus() * sizeof(pages[0])); pages = memblock_alloc(pages_size, SMP_CACHE_BYTES); + if (!pages) + panic("%s: Failed to allocate %zu bytes\n", __func__, + pages_size); /* allocate pages */ j = 0; @@ -2690,8 +2729,7 @@ EXPORT_SYMBOL(__per_cpu_offset); static void * __init pcpu_dfl_fc_alloc(unsigned int cpu, size_t size, size_t align) { - return memblock_alloc_from_nopanic( - size, align, __pa(MAX_DMA_ADDRESS)); + return memblock_alloc_from(size, align, __pa(MAX_DMA_ADDRESS)); } static void __init pcpu_dfl_fc_free(void *ptr, size_t size) @@ -2739,9 +2777,7 @@ void __init setup_per_cpu_areas(void) void *fc; ai = pcpu_alloc_alloc_info(1, 1); - fc = memblock_alloc_from_nopanic(unit_size, - PAGE_SIZE, - __pa(MAX_DMA_ADDRESS)); + fc = memblock_alloc_from(unit_size, PAGE_SIZE, __pa(MAX_DMA_ADDRESS)); if (!ai || !fc) panic("Failed to allocate memory for percpu areas."); /* kmemleak tracks the percpu allocations separately */ diff --git a/mm/sparse.c b/mm/sparse.c index 77a0554fa5bd..69904aa6165b 100644 --- a/mm/sparse.c +++ b/mm/sparse.c @@ -65,11 +65,15 @@ static noinline struct mem_section __ref *sparse_index_alloc(int nid) unsigned long array_size = SECTIONS_PER_ROOT * sizeof(struct mem_section); - if (slab_is_available()) + if (slab_is_available()) { section = kzalloc_node(array_size, GFP_KERNEL, nid); - else + } else { section = memblock_alloc_node(array_size, SMP_CACHE_BYTES, nid); + if (!section) + panic("%s: Failed to allocate %lu bytes nid=%d\n", + __func__, array_size, nid); + } return section; } @@ -218,6 +222,9 @@ void __init memory_present(int nid, unsigned long start, unsigned long end) size = sizeof(struct mem_section*) * NR_SECTION_ROOTS; align = 1 << (INTERNODE_CACHE_SHIFT); mem_section = memblock_alloc(size, align); + if (!mem_section) + panic("%s: Failed to allocate %lu bytes align=0x%lx\n", + __func__, size, align); } #endif @@ -323,9 +330,7 @@ sparse_early_usemaps_alloc_pgdat_section(struct pglist_data *pgdat, limit = goal + (1UL << PA_SECTION_SHIFT); nid = early_pfn_to_nid(goal >> PAGE_SHIFT); again: - p = memblock_alloc_try_nid_nopanic(size, - SMP_CACHE_BYTES, goal, limit, - nid); + p = memblock_alloc_try_nid(size, SMP_CACHE_BYTES, goal, limit, nid); if (!p && limit) { limit = 0; goto again; @@ -379,7 +384,7 @@ static unsigned long * __init sparse_early_usemaps_alloc_pgdat_section(struct pglist_data *pgdat, unsigned long size) { - return memblock_alloc_node_nopanic(size, pgdat->node_id); + return memblock_alloc_node(size, SMP_CACHE_BYTES, pgdat->node_id); } static void __init check_usemap_section_nr(int nid, unsigned long *usemap) @@ -404,13 +409,18 @@ struct page __init *sparse_mem_map_populate(unsigned long pnum, int nid, { unsigned long size = section_map_size(); struct page *map = sparse_buffer_alloc(size); + phys_addr_t addr = __pa(MAX_DMA_ADDRESS); if (map) return map; map = memblock_alloc_try_nid(size, - PAGE_SIZE, __pa(MAX_DMA_ADDRESS), + PAGE_SIZE, addr, MEMBLOCK_ALLOC_ACCESSIBLE, nid); + if (!map) + panic("%s: Failed to allocate %lu bytes align=0x%lx nid=%d from=%pa\n", + __func__, size, PAGE_SIZE, nid, &addr); + return map; } #endif /* !CONFIG_SPARSEMEM_VMEMMAP */ @@ -420,10 +430,11 @@ static void *sparsemap_buf_end __meminitdata; static void __init sparse_buffer_init(unsigned long size, int nid) { + phys_addr_t addr = __pa(MAX_DMA_ADDRESS); WARN_ON(sparsemap_buf); /* forgot to call sparse_buffer_fini()? */ sparsemap_buf = memblock_alloc_try_nid_raw(size, PAGE_SIZE, - __pa(MAX_DMA_ADDRESS), + addr, MEMBLOCK_ALLOC_ACCESSIBLE, nid); sparsemap_buf_end = sparsemap_buf + size; } |