diff options
Diffstat (limited to 'mm')
-rw-r--r-- | mm/Kconfig | 4 | ||||
-rw-r--r-- | mm/Makefile | 4 | ||||
-rw-r--r-- | mm/filemap.c | 2 | ||||
-rw-r--r-- | mm/filemap_xip.c | 2 | ||||
-rw-r--r-- | mm/hugetlb.c | 2 | ||||
-rw-r--r-- | mm/mmap.c | 2 | ||||
-rw-r--r-- | mm/nommu.c | 44 | ||||
-rw-r--r-- | mm/page-writeback.c | 30 | ||||
-rw-r--r-- | mm/percpu.c | 83 | ||||
-rw-r--r-- | mm/shmem.c | 9 | ||||
-rw-r--r-- | mm/vmscan.c | 8 |
11 files changed, 119 insertions, 71 deletions
diff --git a/mm/Kconfig b/mm/Kconfig index 247760729593..edd300aca173 100644 --- a/mm/Kconfig +++ b/mm/Kconfig @@ -244,10 +244,12 @@ config DEFAULT_MMAP_MIN_ADDR This value can be changed after boot using the /proc/sys/vm/mmap_min_addr tunable. +config ARCH_SUPPORTS_MEMORY_FAILURE + bool config MEMORY_FAILURE depends on MMU - depends on X86_MCE + depends on ARCH_SUPPORTS_MEMORY_FAILURE bool "Enable recovery from hardware memory errors" help Enables code to recover from some memory failures on systems diff --git a/mm/Makefile b/mm/Makefile index 515fd793c17f..ebf849042ed3 100644 --- a/mm/Makefile +++ b/mm/Makefile @@ -5,14 +5,14 @@ mmu-y := nommu.o mmu-$(CONFIG_MMU) := fremap.o highmem.o madvise.o memory.o mincore.o \ mlock.o mmap.o mprotect.o mremap.o msync.o rmap.o \ - vmalloc.o + vmalloc.o pagewalk.o obj-y := bootmem.o filemap.o mempool.o oom_kill.o fadvise.o \ maccess.o page_alloc.o page-writeback.o \ readahead.o swap.o truncate.o vmscan.o shmem.o \ prio_tree.o util.o mmzone.o vmstat.o backing-dev.o \ page_isolation.o mm_init.o mmu_context.o \ - pagewalk.o $(mmu-y) + $(mmu-y) obj-y += init-mm.o obj-$(CONFIG_BOUNCE) += bounce.o diff --git a/mm/filemap.c b/mm/filemap.c index 6c84e598b4a9..ef169f37156d 100644 --- a/mm/filemap.c +++ b/mm/filemap.c @@ -1611,7 +1611,7 @@ page_not_uptodate: } EXPORT_SYMBOL(filemap_fault); -struct vm_operations_struct generic_file_vm_ops = { +const struct vm_operations_struct generic_file_vm_ops = { .fault = filemap_fault, }; diff --git a/mm/filemap_xip.c b/mm/filemap_xip.c index 427dfe3ce78c..1888b2d71bb8 100644 --- a/mm/filemap_xip.c +++ b/mm/filemap_xip.c @@ -296,7 +296,7 @@ out: } } -static struct vm_operations_struct xip_file_vm_ops = { +static const struct vm_operations_struct xip_file_vm_ops = { .fault = xip_file_fault, }; diff --git a/mm/hugetlb.c b/mm/hugetlb.c index 6f048fcc749c..5d7601b02874 100644 --- a/mm/hugetlb.c +++ b/mm/hugetlb.c @@ -1721,7 +1721,7 @@ static int hugetlb_vm_op_fault(struct vm_area_struct *vma, struct vm_fault *vmf) return 0; } -struct vm_operations_struct hugetlb_vm_ops = { +const struct vm_operations_struct hugetlb_vm_ops = { .fault = hugetlb_vm_op_fault, .open = hugetlb_vm_op_open, .close = hugetlb_vm_op_close, diff --git a/mm/mmap.c b/mm/mmap.c index 21d4029a07b3..73f5e4b64010 100644 --- a/mm/mmap.c +++ b/mm/mmap.c @@ -2282,7 +2282,7 @@ static void special_mapping_close(struct vm_area_struct *vma) { } -static struct vm_operations_struct special_mapping_vmops = { +static const struct vm_operations_struct special_mapping_vmops = { .close = special_mapping_close, .fault = special_mapping_fault, }; diff --git a/mm/nommu.c b/mm/nommu.c index 56a446f05971..5189b5aed8c0 100644 --- a/mm/nommu.c +++ b/mm/nommu.c @@ -79,7 +79,7 @@ static struct kmem_cache *vm_region_jar; struct rb_root nommu_region_tree = RB_ROOT; DECLARE_RWSEM(nommu_region_sem); -struct vm_operations_struct generic_file_vm_ops = { +const struct vm_operations_struct generic_file_vm_ops = { }; /* @@ -826,7 +826,7 @@ static int validate_mmap_request(struct file *file, int ret; /* do the simple checks first */ - if (flags & MAP_FIXED || addr) { + if (flags & MAP_FIXED) { printk(KERN_DEBUG "%d: Can't do fixed-address/overlay mmap of RAM\n", current->pid); @@ -1034,7 +1034,7 @@ static int do_mmap_shared_file(struct vm_area_struct *vma) ret = vma->vm_file->f_op->mmap(vma->vm_file, vma); if (ret == 0) { vma->vm_region->vm_top = vma->vm_region->vm_end; - return ret; + return 0; } if (ret != -ENOSYS) return ret; @@ -1051,7 +1051,8 @@ static int do_mmap_shared_file(struct vm_area_struct *vma) */ static int do_mmap_private(struct vm_area_struct *vma, struct vm_region *region, - unsigned long len) + unsigned long len, + unsigned long capabilities) { struct page *pages; unsigned long total, point, n, rlen; @@ -1062,13 +1063,13 @@ static int do_mmap_private(struct vm_area_struct *vma, * shared mappings on devices or memory * - VM_MAYSHARE will be set if it may attempt to share */ - if (vma->vm_file) { + if (capabilities & BDI_CAP_MAP_DIRECT) { ret = vma->vm_file->f_op->mmap(vma->vm_file, vma); if (ret == 0) { /* shouldn't return success if we're not sharing */ BUG_ON(!(vma->vm_flags & VM_MAYSHARE)); vma->vm_region->vm_top = vma->vm_region->vm_end; - return ret; + return 0; } if (ret != -ENOSYS) return ret; @@ -1181,9 +1182,6 @@ unsigned long do_mmap_pgoff(struct file *file, kenter(",%lx,%lx,%lx,%lx,%lx", addr, len, prot, flags, pgoff); - if (!(flags & MAP_FIXED)) - addr = round_hint_to_min(addr); - /* decide whether we should attempt the mapping, and if so what sort of * mapping */ ret = validate_mmap_request(file, addr, len, prot, flags, pgoff, @@ -1193,6 +1191,9 @@ unsigned long do_mmap_pgoff(struct file *file, return ret; } + /* we ignore the address hint */ + addr = 0; + /* we've determined that we can make the mapping, now translate what we * now know into VMA flags */ vm_flags = determine_vm_flags(file, prot, flags, capabilities); @@ -1306,7 +1307,7 @@ unsigned long do_mmap_pgoff(struct file *file, * - this is the hook for quasi-memory character devices to * tell us the location of a shared mapping */ - if (file && file->f_op->get_unmapped_area) { + if (capabilities & BDI_CAP_MAP_DIRECT) { addr = file->f_op->get_unmapped_area(file, addr, len, pgoff, flags); if (IS_ERR((void *) addr)) { @@ -1330,15 +1331,17 @@ unsigned long do_mmap_pgoff(struct file *file, } vma->vm_region = region; - add_nommu_region(region); - /* set up the mapping */ + /* set up the mapping + * - the region is filled in if BDI_CAP_MAP_DIRECT is still set + */ if (file && vma->vm_flags & VM_SHARED) ret = do_mmap_shared_file(vma); else - ret = do_mmap_private(vma, region, len); + ret = do_mmap_private(vma, region, len, capabilities); if (ret < 0) - goto error_put_region; + goto error_just_free; + add_nommu_region(region); /* okay... we have a mapping; now we have to register it */ result = vma->vm_start; @@ -1356,19 +1359,6 @@ share: kleave(" = %lx", result); return result; -error_put_region: - __put_nommu_region(region); - if (vma) { - if (vma->vm_file) { - fput(vma->vm_file); - if (vma->vm_flags & VM_EXECUTABLE) - removed_exe_file_vma(vma->vm_mm); - } - kmem_cache_free(vm_area_cachep, vma); - } - kleave(" = %d [pr]", ret); - return ret; - error_just_free: up_write(&nommu_region_sem); error: diff --git a/mm/page-writeback.c b/mm/page-writeback.c index d99664e8607e..a3b14090b1fb 100644 --- a/mm/page-writeback.c +++ b/mm/page-writeback.c @@ -44,18 +44,21 @@ static long ratelimit_pages = 32; /* * When balance_dirty_pages decides that the caller needs to perform some * non-background writeback, this is how many pages it will attempt to write. - * It should be somewhat larger than RATELIMIT_PAGES to ensure that reasonably + * It should be somewhat larger than dirtied pages to ensure that reasonably * large amounts of I/O are submitted. */ -static inline long sync_writeback_pages(void) +static inline long sync_writeback_pages(unsigned long dirtied) { - return ratelimit_pages + ratelimit_pages / 2; + if (dirtied < ratelimit_pages) + dirtied = ratelimit_pages; + + return dirtied + dirtied / 2; } /* The following parameters are exported via /proc/sys/vm */ /* - * Start background writeback (via pdflush) at this percentage + * Start background writeback (via writeback threads) at this percentage */ int dirty_background_ratio = 10; @@ -474,10 +477,11 @@ get_dirty_limits(unsigned long *pbackground, unsigned long *pdirty, * balance_dirty_pages() must be called by processes which are generating dirty * data. It looks at the number of dirty pages in the machine and will force * the caller to perform writeback if the system is over `vm_dirty_ratio'. - * If we're over `background_thresh' then pdflush is woken to perform some - * writeout. + * If we're over `background_thresh' then the writeback threads are woken to + * perform some writeout. */ -static void balance_dirty_pages(struct address_space *mapping) +static void balance_dirty_pages(struct address_space *mapping, + unsigned long write_chunk) { long nr_reclaimable, bdi_nr_reclaimable; long nr_writeback, bdi_nr_writeback; @@ -485,7 +489,6 @@ static void balance_dirty_pages(struct address_space *mapping) unsigned long dirty_thresh; unsigned long bdi_thresh; unsigned long pages_written = 0; - unsigned long write_chunk = sync_writeback_pages(); unsigned long pause = 1; struct backing_dev_info *bdi = mapping->backing_dev_info; @@ -579,7 +582,7 @@ static void balance_dirty_pages(struct address_space *mapping) bdi->dirty_exceeded = 0; if (writeback_in_progress(bdi)) - return; /* pdflush is already working this queue */ + return; /* * In laptop mode, we wait until hitting the higher threshold before @@ -590,10 +593,10 @@ static void balance_dirty_pages(struct address_space *mapping) * background_thresh, to keep the amount of dirty memory low. */ if ((laptop_mode && pages_written) || - (!laptop_mode && ((nr_writeback = global_page_state(NR_FILE_DIRTY) - + global_page_state(NR_UNSTABLE_NFS)) + (!laptop_mode && ((global_page_state(NR_FILE_DIRTY) + + global_page_state(NR_UNSTABLE_NFS)) > background_thresh))) - bdi_start_writeback(bdi, nr_writeback); + bdi_start_writeback(bdi, NULL, 0); } void set_page_dirty_balance(struct page *page, int page_mkwrite) @@ -640,9 +643,10 @@ void balance_dirty_pages_ratelimited_nr(struct address_space *mapping, p = &__get_cpu_var(bdp_ratelimits); *p += nr_pages_dirtied; if (unlikely(*p >= ratelimit)) { + ratelimit = sync_writeback_pages(*p); *p = 0; preempt_enable(); - balance_dirty_pages(mapping); + balance_dirty_pages(mapping, ratelimit); return; } preempt_enable(); diff --git a/mm/percpu.c b/mm/percpu.c index 43d8cacfdaa5..4a048abad043 100644 --- a/mm/percpu.c +++ b/mm/percpu.c @@ -1043,7 +1043,9 @@ static struct pcpu_chunk *alloc_pcpu_chunk(void) */ static void *pcpu_alloc(size_t size, size_t align, bool reserved) { + static int warn_limit = 10; struct pcpu_chunk *chunk; + const char *err; int slot, off; if (unlikely(!size || size > PCPU_MIN_UNIT_SIZE || align > PAGE_SIZE)) { @@ -1059,11 +1061,14 @@ static void *pcpu_alloc(size_t size, size_t align, bool reserved) if (reserved && pcpu_reserved_chunk) { chunk = pcpu_reserved_chunk; if (size > chunk->contig_hint || - pcpu_extend_area_map(chunk) < 0) + pcpu_extend_area_map(chunk) < 0) { + err = "failed to extend area map of reserved chunk"; goto fail_unlock; + } off = pcpu_alloc_area(chunk, size, align); if (off >= 0) goto area_found; + err = "alloc from reserved chunk failed"; goto fail_unlock; } @@ -1080,6 +1085,7 @@ restart: case 1: goto restart; /* pcpu_lock dropped, restart */ default: + err = "failed to extend area map"; goto fail_unlock; } @@ -1093,8 +1099,10 @@ restart: spin_unlock_irq(&pcpu_lock); chunk = alloc_pcpu_chunk(); - if (!chunk) + if (!chunk) { + err = "failed to allocate new chunk"; goto fail_unlock_mutex; + } spin_lock_irq(&pcpu_lock); pcpu_chunk_relocate(chunk, -1); @@ -1107,6 +1115,7 @@ area_found: if (pcpu_populate_chunk(chunk, off, size)) { spin_lock_irq(&pcpu_lock); pcpu_free_area(chunk, off); + err = "failed to populate"; goto fail_unlock; } @@ -1119,6 +1128,13 @@ fail_unlock: spin_unlock_irq(&pcpu_lock); fail_unlock_mutex: mutex_unlock(&pcpu_alloc_mutex); + if (warn_limit) { + pr_warning("PERCPU: allocation failed, size=%zu align=%zu, " + "%s\n", size, align, err); + dump_stack(); + if (!--warn_limit) + pr_info("PERCPU: limit reached, disable warning\n"); + } return NULL; } @@ -1347,6 +1363,10 @@ struct pcpu_alloc_info * __init pcpu_build_alloc_info( struct pcpu_alloc_info *ai; unsigned int *cpu_map; + /* this function may be called multiple times */ + memset(group_map, 0, sizeof(group_map)); + memset(group_cnt, 0, sizeof(group_map)); + /* * Determine min_unit_size, alloc_size and max_upa such that * alloc_size is multiple of atom_size and is the smallest @@ -1574,6 +1594,7 @@ static void pcpu_dump_alloc_info(const char *lvl, int __init pcpu_setup_first_chunk(const struct pcpu_alloc_info *ai, void *base_addr) { + static char cpus_buf[4096] __initdata; static int smap[2], dmap[2]; size_t dyn_size = ai->dyn_size; size_t size_sum = ai->static_size + ai->reserved_size + dyn_size; @@ -1585,17 +1606,26 @@ int __init pcpu_setup_first_chunk(const struct pcpu_alloc_info *ai, int *unit_map; int group, unit, i; + cpumask_scnprintf(cpus_buf, sizeof(cpus_buf), cpu_possible_mask); + +#define PCPU_SETUP_BUG_ON(cond) do { \ + if (unlikely(cond)) { \ + pr_emerg("PERCPU: failed to initialize, %s", #cond); \ + pr_emerg("PERCPU: cpu_possible_mask=%s\n", cpus_buf); \ + pcpu_dump_alloc_info(KERN_EMERG, ai); \ + BUG(); \ + } \ +} while (0) + /* sanity checks */ BUILD_BUG_ON(ARRAY_SIZE(smap) >= PCPU_DFL_MAP_ALLOC || ARRAY_SIZE(dmap) >= PCPU_DFL_MAP_ALLOC); - BUG_ON(ai->nr_groups <= 0); - BUG_ON(!ai->static_size); - BUG_ON(!base_addr); - BUG_ON(ai->unit_size < size_sum); - BUG_ON(ai->unit_size & ~PAGE_MASK); - BUG_ON(ai->unit_size < PCPU_MIN_UNIT_SIZE); - - pcpu_dump_alloc_info(KERN_DEBUG, ai); + PCPU_SETUP_BUG_ON(ai->nr_groups <= 0); + PCPU_SETUP_BUG_ON(!ai->static_size); + PCPU_SETUP_BUG_ON(!base_addr); + PCPU_SETUP_BUG_ON(ai->unit_size < size_sum); + PCPU_SETUP_BUG_ON(ai->unit_size & ~PAGE_MASK); + PCPU_SETUP_BUG_ON(ai->unit_size < PCPU_MIN_UNIT_SIZE); /* process group information and build config tables accordingly */ group_offsets = alloc_bootmem(ai->nr_groups * sizeof(group_offsets[0])); @@ -1604,7 +1634,7 @@ int __init pcpu_setup_first_chunk(const struct pcpu_alloc_info *ai, unit_off = alloc_bootmem(nr_cpu_ids * sizeof(unit_off[0])); for (cpu = 0; cpu < nr_cpu_ids; cpu++) - unit_map[cpu] = NR_CPUS; + unit_map[cpu] = UINT_MAX; pcpu_first_unit_cpu = NR_CPUS; for (group = 0, unit = 0; group < ai->nr_groups; group++, unit += i) { @@ -1618,8 +1648,9 @@ int __init pcpu_setup_first_chunk(const struct pcpu_alloc_info *ai, if (cpu == NR_CPUS) continue; - BUG_ON(cpu > nr_cpu_ids || !cpu_possible(cpu)); - BUG_ON(unit_map[cpu] != NR_CPUS); + PCPU_SETUP_BUG_ON(cpu > nr_cpu_ids); + PCPU_SETUP_BUG_ON(!cpu_possible(cpu)); + PCPU_SETUP_BUG_ON(unit_map[cpu] != UINT_MAX); unit_map[cpu] = unit + i; unit_off[cpu] = gi->base_offset + i * ai->unit_size; @@ -1632,7 +1663,11 @@ int __init pcpu_setup_first_chunk(const struct pcpu_alloc_info *ai, pcpu_nr_units = unit; for_each_possible_cpu(cpu) - BUG_ON(unit_map[cpu] == NR_CPUS); + PCPU_SETUP_BUG_ON(unit_map[cpu] == UINT_MAX); + + /* we're done parsing the input, undefine BUG macro and dump config */ +#undef PCPU_SETUP_BUG_ON + pcpu_dump_alloc_info(KERN_INFO, ai); pcpu_nr_groups = ai->nr_groups; pcpu_group_offsets = group_offsets; @@ -1782,7 +1817,7 @@ int __init pcpu_embed_first_chunk(size_t reserved_size, ssize_t dyn_size, void *base = (void *)ULONG_MAX; void **areas = NULL; struct pcpu_alloc_info *ai; - size_t size_sum, areas_size; + size_t size_sum, areas_size, max_distance; int group, i, rc; ai = pcpu_build_alloc_info(reserved_size, dyn_size, atom_size, @@ -1832,8 +1867,24 @@ int __init pcpu_embed_first_chunk(size_t reserved_size, ssize_t dyn_size, } /* base address is now known, determine group base offsets */ - for (group = 0; group < ai->nr_groups; group++) + max_distance = 0; + for (group = 0; group < ai->nr_groups; group++) { ai->groups[group].base_offset = areas[group] - base; + max_distance = max(max_distance, ai->groups[group].base_offset); + } + max_distance += ai->unit_size; + + /* warn if maximum distance is further than 75% of vmalloc space */ + if (max_distance > (VMALLOC_END - VMALLOC_START) * 3 / 4) { + pr_warning("PERCPU: max_distance=0x%lx too large for vmalloc " + "space 0x%lx\n", + max_distance, VMALLOC_END - VMALLOC_START); +#ifdef CONFIG_NEED_PER_CPU_PAGE_FIRST_CHUNK + /* and fail if we have fallback */ + rc = -EINVAL; + goto out_free; +#endif + } pr_info("PERCPU: Embedded %zu pages/cpu @%p s%zu r%zu d%zu u%zu\n", PFN_DOWN(size_sum), base, ai->static_size, ai->reserved_size, diff --git a/mm/shmem.c b/mm/shmem.c index 98631c26c200..356dd99566ec 100644 --- a/mm/shmem.c +++ b/mm/shmem.c @@ -218,7 +218,7 @@ static const struct file_operations shmem_file_operations; static const struct inode_operations shmem_inode_operations; static const struct inode_operations shmem_dir_inode_operations; static const struct inode_operations shmem_special_inode_operations; -static struct vm_operations_struct shmem_vm_ops; +static const struct vm_operations_struct shmem_vm_ops; static struct backing_dev_info shmem_backing_dev_info __read_mostly = { .ra_pages = 0, /* No readahead */ @@ -1046,8 +1046,9 @@ static int shmem_writepage(struct page *page, struct writeback_control *wbc) * sync from ever calling shmem_writepage; but a stacking filesystem * may use the ->writepage of its underlying filesystem, in which case * tmpfs should write out to swap only in response to memory pressure, - * and not for pdflush or sync. However, in those cases, we do still - * want to check if there's a redundant swappage to be discarded. + * and not for the writeback threads or sync. However, in those cases, + * we do still want to check if there's a redundant swappage to be + * discarded. */ if (wbc->for_reclaim) swap = get_swap_page(); @@ -2497,7 +2498,7 @@ static const struct super_operations shmem_ops = { .put_super = shmem_put_super, }; -static struct vm_operations_struct shmem_vm_ops = { +static const struct vm_operations_struct shmem_vm_ops = { .fault = shmem_fault, #ifdef CONFIG_NUMA .set_policy = shmem_set_policy, diff --git a/mm/vmscan.c b/mm/vmscan.c index 1219ceb8a9b2..64e438898832 100644 --- a/mm/vmscan.c +++ b/mm/vmscan.c @@ -1709,10 +1709,10 @@ static void shrink_zones(int priority, struct zonelist *zonelist, * * If the caller is !__GFP_FS then the probability of a failure is reasonably * high - the zone may be full of dirty or under-writeback pages, which this - * caller can't do much about. We kick pdflush and take explicit naps in the - * hope that some of these pages can be written. But if the allocating task - * holds filesystem locks which prevent writeout this might not work, and the - * allocation attempt will fail. + * caller can't do much about. We kick the writeback threads and take explicit + * naps in the hope that some of these pages can be written. But if the + * allocating task holds filesystem locks which prevent writeout this might not + * work, and the allocation attempt will fail. * * returns: 0, if no pages reclaimed * else, the number of pages reclaimed |