diff options
Diffstat (limited to 'mm')
-rw-r--r-- | mm/fadvise.c | 18 | ||||
-rw-r--r-- | mm/filemap.c | 9 | ||||
-rw-r--r-- | mm/fremap.c | 4 | ||||
-rw-r--r-- | mm/madvise.c | 2 | ||||
-rw-r--r-- | mm/memcontrol.c | 155 | ||||
-rw-r--r-- | mm/memory.c | 24 | ||||
-rw-r--r-- | mm/mempolicy.c | 24 | ||||
-rw-r--r-- | mm/migrate.c | 8 | ||||
-rw-r--r-- | mm/mincore.c | 4 | ||||
-rw-r--r-- | mm/mlock.c | 8 | ||||
-rw-r--r-- | mm/mmap.c | 93 | ||||
-rw-r--r-- | mm/mprotect.c | 4 | ||||
-rw-r--r-- | mm/mremap.c | 6 | ||||
-rw-r--r-- | mm/msync.c | 2 | ||||
-rw-r--r-- | mm/nommu.c | 1052 | ||||
-rw-r--r-- | mm/shmem.c | 2 | ||||
-rw-r--r-- | mm/swapfile.c | 9 | ||||
-rw-r--r-- | mm/vmalloc.c | 20 |
18 files changed, 971 insertions, 473 deletions
diff --git a/mm/fadvise.c b/mm/fadvise.c index a1da969bd980..54a0f8040afa 100644 --- a/mm/fadvise.c +++ b/mm/fadvise.c @@ -24,7 +24,7 @@ * POSIX_FADV_WILLNEED could set PG_Referenced, and POSIX_FADV_NOREUSE could * deactivate the pages and clear PG_Referenced. */ -asmlinkage long sys_fadvise64_64(int fd, loff_t offset, loff_t len, int advice) +SYSCALL_DEFINE(fadvise64_64)(int fd, loff_t offset, loff_t len, int advice) { struct file *file = fget(fd); struct address_space *mapping; @@ -126,12 +126,26 @@ out: fput(file); return ret; } +#ifdef CONFIG_HAVE_SYSCALL_WRAPPERS +asmlinkage long SyS_fadvise64_64(long fd, loff_t offset, loff_t len, long advice) +{ + return SYSC_fadvise64_64((int) fd, offset, len, (int) advice); +} +SYSCALL_ALIAS(sys_fadvise64_64, SyS_fadvise64_64); +#endif #ifdef __ARCH_WANT_SYS_FADVISE64 -asmlinkage long sys_fadvise64(int fd, loff_t offset, size_t len, int advice) +SYSCALL_DEFINE(fadvise64)(int fd, loff_t offset, size_t len, int advice) { return sys_fadvise64_64(fd, offset, len, advice); } +#ifdef CONFIG_HAVE_SYSCALL_WRAPPERS +asmlinkage long SyS_fadvise64(long fd, loff_t offset, long len, long advice) +{ + return SYSC_fadvise64((int) fd, offset, (size_t)len, (int)advice); +} +SYSCALL_ALIAS(sys_fadvise64, SyS_fadvise64); +#endif #endif diff --git a/mm/filemap.c b/mm/filemap.c index ceba0bd03662..23acefe51808 100644 --- a/mm/filemap.c +++ b/mm/filemap.c @@ -1374,7 +1374,7 @@ do_readahead(struct address_space *mapping, struct file *filp, return 0; } -asmlinkage ssize_t sys_readahead(int fd, loff_t offset, size_t count) +SYSCALL_DEFINE(readahead)(int fd, loff_t offset, size_t count) { ssize_t ret; struct file *file; @@ -1393,6 +1393,13 @@ asmlinkage ssize_t sys_readahead(int fd, loff_t offset, size_t count) } return ret; } +#ifdef CONFIG_HAVE_SYSCALL_WRAPPERS +asmlinkage long SyS_readahead(long fd, loff_t offset, long count) +{ + return SYSC_readahead((int) fd, offset, (size_t) count); +} +SYSCALL_ALIAS(sys_readahead, SyS_readahead); +#endif #ifdef CONFIG_MMU /** diff --git a/mm/fremap.c b/mm/fremap.c index 62d5bbda921a..736ba7f3306a 100644 --- a/mm/fremap.c +++ b/mm/fremap.c @@ -120,8 +120,8 @@ static int populate_range(struct mm_struct *mm, struct vm_area_struct *vma, * and the vma's default protection is used. Arbitrary protections * might be implemented in the future. */ -asmlinkage long sys_remap_file_pages(unsigned long start, unsigned long size, - unsigned long prot, unsigned long pgoff, unsigned long flags) +SYSCALL_DEFINE5(remap_file_pages, unsigned long, start, unsigned long, size, + unsigned long, prot, unsigned long, pgoff, unsigned long, flags) { struct mm_struct *mm = current->mm; struct address_space *mapping; diff --git a/mm/madvise.c b/mm/madvise.c index f9349c18a1b5..b9ce574827c8 100644 --- a/mm/madvise.c +++ b/mm/madvise.c @@ -281,7 +281,7 @@ madvise_vma(struct vm_area_struct *vma, struct vm_area_struct **prev, * -EBADF - map exists, but area maps something that isn't a file. * -EAGAIN - a kernel resource was temporarily unavailable. */ -asmlinkage long sys_madvise(unsigned long start, size_t len_in, int behavior) +SYSCALL_DEFINE3(madvise, unsigned long, start, size_t, len_in, int, behavior) { unsigned long end, tmp; struct vm_area_struct * vma, *prev; diff --git a/mm/memcontrol.c b/mm/memcontrol.c index e2996b80601f..8e4be9cb2a6a 100644 --- a/mm/memcontrol.c +++ b/mm/memcontrol.c @@ -202,6 +202,7 @@ pcg_default_flags[NR_CHARGE_TYPE] = { static void mem_cgroup_get(struct mem_cgroup *mem); static void mem_cgroup_put(struct mem_cgroup *mem); +static struct mem_cgroup *parent_mem_cgroup(struct mem_cgroup *mem); static void mem_cgroup_charge_statistics(struct mem_cgroup *mem, struct page_cgroup *pc, @@ -358,6 +359,10 @@ void mem_cgroup_rotate_lru_list(struct page *page, enum lru_list lru) return; pc = lookup_page_cgroup(page); + /* + * Used bit is set without atomic ops but after smp_wmb(). + * For making pc->mem_cgroup visible, insert smp_rmb() here. + */ smp_rmb(); /* unused page is not rotated. */ if (!PageCgroupUsed(pc)) @@ -374,7 +379,10 @@ void mem_cgroup_add_lru_list(struct page *page, enum lru_list lru) if (mem_cgroup_disabled()) return; pc = lookup_page_cgroup(page); - /* barrier to sync with "charge" */ + /* + * Used bit is set without atomic ops but after smp_wmb(). + * For making pc->mem_cgroup visible, insert smp_rmb() here. + */ smp_rmb(); if (!PageCgroupUsed(pc)) return; @@ -559,6 +567,14 @@ mem_cgroup_get_reclaim_stat_from_page(struct page *page) return NULL; pc = lookup_page_cgroup(page); + /* + * Used bit is set without atomic ops but after smp_wmb(). + * For making pc->mem_cgroup visible, insert smp_rmb() here. + */ + smp_rmb(); + if (!PageCgroupUsed(pc)) + return NULL; + mz = page_cgroup_zoneinfo(pc); if (!mz) return NULL; @@ -618,7 +634,7 @@ unsigned long mem_cgroup_isolate_pages(unsigned long nr_to_scan, * called with hierarchy_mutex held */ static struct mem_cgroup * -mem_cgroup_get_next_node(struct mem_cgroup *curr, struct mem_cgroup *root_mem) +__mem_cgroup_get_next_node(struct mem_cgroup *curr, struct mem_cgroup *root_mem) { struct cgroup *cgroup, *curr_cgroup, *root_cgroup; @@ -629,19 +645,16 @@ mem_cgroup_get_next_node(struct mem_cgroup *curr, struct mem_cgroup *root_mem) /* * Walk down to children */ - mem_cgroup_put(curr); cgroup = list_entry(curr_cgroup->children.next, struct cgroup, sibling); curr = mem_cgroup_from_cont(cgroup); - mem_cgroup_get(curr); goto done; } visit_parent: if (curr_cgroup == root_cgroup) { - mem_cgroup_put(curr); - curr = root_mem; - mem_cgroup_get(curr); + /* caller handles NULL case */ + curr = NULL; goto done; } @@ -649,11 +662,9 @@ visit_parent: * Goto next sibling */ if (curr_cgroup->sibling.next != &curr_cgroup->parent->children) { - mem_cgroup_put(curr); cgroup = list_entry(curr_cgroup->sibling.next, struct cgroup, sibling); curr = mem_cgroup_from_cont(cgroup); - mem_cgroup_get(curr); goto done; } @@ -664,7 +675,6 @@ visit_parent: goto visit_parent; done: - root_mem->last_scanned_child = curr; return curr; } @@ -674,40 +684,46 @@ done: * that to reclaim free pages from. */ static struct mem_cgroup * -mem_cgroup_get_first_node(struct mem_cgroup *root_mem) +mem_cgroup_get_next_node(struct mem_cgroup *root_mem) { struct cgroup *cgroup; - struct mem_cgroup *ret; + struct mem_cgroup *orig, *next; bool obsolete; - obsolete = mem_cgroup_is_obsolete(root_mem->last_scanned_child); - /* * Scan all children under the mem_cgroup mem */ mutex_lock(&mem_cgroup_subsys.hierarchy_mutex); + + orig = root_mem->last_scanned_child; + obsolete = mem_cgroup_is_obsolete(orig); + if (list_empty(&root_mem->css.cgroup->children)) { - ret = root_mem; + /* + * root_mem might have children before and last_scanned_child + * may point to one of them. We put it later. + */ + if (orig) + VM_BUG_ON(!obsolete); + next = NULL; goto done; } - if (!root_mem->last_scanned_child || obsolete) { - - if (obsolete && root_mem->last_scanned_child) - mem_cgroup_put(root_mem->last_scanned_child); - + if (!orig || obsolete) { cgroup = list_first_entry(&root_mem->css.cgroup->children, struct cgroup, sibling); - ret = mem_cgroup_from_cont(cgroup); - mem_cgroup_get(ret); + next = mem_cgroup_from_cont(cgroup); } else - ret = mem_cgroup_get_next_node(root_mem->last_scanned_child, - root_mem); + next = __mem_cgroup_get_next_node(orig, root_mem); done: - root_mem->last_scanned_child = ret; + if (next) + mem_cgroup_get(next); + root_mem->last_scanned_child = next; + if (orig) + mem_cgroup_put(orig); mutex_unlock(&mem_cgroup_subsys.hierarchy_mutex); - return ret; + return (next) ? next : root_mem; } static bool mem_cgroup_check_under_limit(struct mem_cgroup *mem) @@ -758,28 +774,25 @@ static int mem_cgroup_hierarchical_reclaim(struct mem_cgroup *root_mem, * but there might be left over accounting, even after children * have left. */ - ret = try_to_free_mem_cgroup_pages(root_mem, gfp_mask, noswap, + ret += try_to_free_mem_cgroup_pages(root_mem, gfp_mask, noswap, get_swappiness(root_mem)); if (mem_cgroup_check_under_limit(root_mem)) - return 0; + return 1; /* indicate reclaim has succeeded */ if (!root_mem->use_hierarchy) return ret; - next_mem = mem_cgroup_get_first_node(root_mem); + next_mem = mem_cgroup_get_next_node(root_mem); while (next_mem != root_mem) { if (mem_cgroup_is_obsolete(next_mem)) { - mem_cgroup_put(next_mem); - next_mem = mem_cgroup_get_first_node(root_mem); + next_mem = mem_cgroup_get_next_node(root_mem); continue; } - ret = try_to_free_mem_cgroup_pages(next_mem, gfp_mask, noswap, + ret += try_to_free_mem_cgroup_pages(next_mem, gfp_mask, noswap, get_swappiness(next_mem)); if (mem_cgroup_check_under_limit(root_mem)) - return 0; - mutex_lock(&mem_cgroup_subsys.hierarchy_mutex); - next_mem = mem_cgroup_get_next_node(next_mem, root_mem); - mutex_unlock(&mem_cgroup_subsys.hierarchy_mutex); + return 1; /* indicate reclaim has succeeded */ + next_mem = mem_cgroup_get_next_node(root_mem); } return ret; } @@ -863,6 +876,8 @@ static int __mem_cgroup_try_charge(struct mm_struct *mm, ret = mem_cgroup_hierarchical_reclaim(mem_over_limit, gfp_mask, noswap); + if (ret) + continue; /* * try_to_free_mem_cgroup_pages() might not give us a full @@ -979,14 +994,15 @@ static int mem_cgroup_move_account(struct page_cgroup *pc, if (pc->mem_cgroup != from) goto out; - css_put(&from->css); res_counter_uncharge(&from->res, PAGE_SIZE); mem_cgroup_charge_statistics(from, pc, false); if (do_swap_account) res_counter_uncharge(&from->memsw, PAGE_SIZE); + css_put(&from->css); + + css_get(&to->css); pc->mem_cgroup = to; mem_cgroup_charge_statistics(to, pc, true); - css_get(&to->css); ret = 0; out: unlock_page_cgroup(pc); @@ -1019,8 +1035,10 @@ static int mem_cgroup_move_parent(struct page_cgroup *pc, if (ret || !parent) return ret; - if (!get_page_unless_zero(page)) - return -EBUSY; + if (!get_page_unless_zero(page)) { + ret = -EBUSY; + goto uncharge; + } ret = isolate_lru_page(page); @@ -1029,19 +1047,23 @@ static int mem_cgroup_move_parent(struct page_cgroup *pc, ret = mem_cgroup_move_account(pc, child, parent); - /* drop extra refcnt by try_charge() (move_account increment one) */ - css_put(&parent->css); putback_lru_page(page); if (!ret) { put_page(page); + /* drop extra refcnt by try_charge() */ + css_put(&parent->css); return 0; } - /* uncharge if move fails */ + cancel: + put_page(page); +uncharge: + /* drop extra refcnt by try_charge() */ + css_put(&parent->css); + /* uncharge if move fails */ res_counter_uncharge(&parent->res, PAGE_SIZE); if (do_swap_account) res_counter_uncharge(&parent->memsw, PAGE_SIZE); - put_page(page); return ret; } @@ -1663,7 +1685,7 @@ move_account: /* This is for making all *used* pages to be on LRU. */ lru_add_drain_all(); ret = 0; - for_each_node_state(node, N_POSSIBLE) { + for_each_node_state(node, N_HIGH_MEMORY) { for (zid = 0; !ret && zid < MAX_NR_ZONES; zid++) { enum lru_list l; for_each_lru(l) { @@ -1971,6 +1993,7 @@ static int mem_cgroup_swappiness_write(struct cgroup *cgrp, struct cftype *cft, { struct mem_cgroup *memcg = mem_cgroup_from_cont(cgrp); struct mem_cgroup *parent; + if (val > 100) return -EINVAL; @@ -1978,15 +2001,22 @@ static int mem_cgroup_swappiness_write(struct cgroup *cgrp, struct cftype *cft, return -EINVAL; parent = mem_cgroup_from_cont(cgrp->parent); + + cgroup_lock(); + /* If under hierarchy, only empty-root can set this value */ if ((parent->use_hierarchy) || - (memcg->use_hierarchy && !list_empty(&cgrp->children))) + (memcg->use_hierarchy && !list_empty(&cgrp->children))) { + cgroup_unlock(); return -EINVAL; + } spin_lock(&memcg->reclaim_param_lock); memcg->swappiness = val; spin_unlock(&memcg->reclaim_param_lock); + cgroup_unlock(); + return 0; } @@ -2164,10 +2194,23 @@ static void mem_cgroup_get(struct mem_cgroup *mem) static void mem_cgroup_put(struct mem_cgroup *mem) { - if (atomic_dec_and_test(&mem->refcnt)) + if (atomic_dec_and_test(&mem->refcnt)) { + struct mem_cgroup *parent = parent_mem_cgroup(mem); __mem_cgroup_free(mem); + if (parent) + mem_cgroup_put(parent); + } } +/* + * Returns the parent mem_cgroup in memcgroup hierarchy with hierarchy enabled. + */ +static struct mem_cgroup *parent_mem_cgroup(struct mem_cgroup *mem) +{ + if (!mem->res.parent) + return NULL; + return mem_cgroup_from_res_counter(mem->res.parent, res); +} #ifdef CONFIG_CGROUP_MEM_RES_CTLR_SWAP static void __init enable_swap_cgroup(void) @@ -2181,7 +2224,7 @@ static void __init enable_swap_cgroup(void) } #endif -static struct cgroup_subsys_state * +static struct cgroup_subsys_state * __ref mem_cgroup_create(struct cgroup_subsys *ss, struct cgroup *cont) { struct mem_cgroup *mem, *parent; @@ -2206,6 +2249,13 @@ mem_cgroup_create(struct cgroup_subsys *ss, struct cgroup *cont) if (parent && parent->use_hierarchy) { res_counter_init(&mem->res, &parent->res); res_counter_init(&mem->memsw, &parent->memsw); + /* + * We increment refcnt of the parent to ensure that we can + * safely access it on res_counter_charge/uncharge. + * This refcnt will be decremented when freeing this + * mem_cgroup(see mem_cgroup_put). + */ + mem_cgroup_get(parent); } else { res_counter_init(&mem->res, NULL); res_counter_init(&mem->memsw, NULL); @@ -2232,7 +2282,14 @@ static void mem_cgroup_pre_destroy(struct cgroup_subsys *ss, static void mem_cgroup_destroy(struct cgroup_subsys *ss, struct cgroup *cont) { - mem_cgroup_put(mem_cgroup_from_cont(cont)); + struct mem_cgroup *mem = mem_cgroup_from_cont(cont); + struct mem_cgroup *last_scanned_child = mem->last_scanned_child; + + if (last_scanned_child) { + VM_BUG_ON(!mem_cgroup_is_obsolete(last_scanned_child)); + mem_cgroup_put(last_scanned_child); + } + mem_cgroup_put(mem); } static int mem_cgroup_populate(struct cgroup_subsys *ss, diff --git a/mm/memory.c b/mm/memory.c index e009ce870859..22bfa7a47a0b 100644 --- a/mm/memory.c +++ b/mm/memory.c @@ -1511,6 +1511,7 @@ int vm_insert_pfn(struct vm_area_struct *vma, unsigned long addr, unsigned long pfn) { int ret; + pgprot_t pgprot = vma->vm_page_prot; /* * Technically, architectures with pte_special can avoid all these * restrictions (same for remap_pfn_range). However we would like @@ -1525,10 +1526,10 @@ int vm_insert_pfn(struct vm_area_struct *vma, unsigned long addr, if (addr < vma->vm_start || addr >= vma->vm_end) return -EFAULT; - if (track_pfn_vma_new(vma, vma->vm_page_prot, pfn, PAGE_SIZE)) + if (track_pfn_vma_new(vma, &pgprot, pfn, PAGE_SIZE)) return -EINVAL; - ret = insert_pfn(vma, addr, pfn, vma->vm_page_prot); + ret = insert_pfn(vma, addr, pfn, pgprot); if (ret) untrack_pfn_vma(vma, pfn, PAGE_SIZE); @@ -1671,9 +1672,15 @@ int remap_pfn_range(struct vm_area_struct *vma, unsigned long addr, vma->vm_flags |= VM_IO | VM_RESERVED | VM_PFNMAP; - err = track_pfn_vma_new(vma, prot, pfn, PAGE_ALIGN(size)); - if (err) + err = track_pfn_vma_new(vma, &prot, pfn, PAGE_ALIGN(size)); + if (err) { + /* + * To indicate that track_pfn related cleanup is not + * needed from higher level routine calling unmap_vmas + */ + vma->vm_flags &= ~(VM_IO | VM_RESERVED | VM_PFNMAP); return -EINVAL; + } BUG_ON(addr >= end); pfn -= addr >> PAGE_SHIFT; @@ -3165,6 +3172,15 @@ void print_vma_addr(char *prefix, unsigned long ip) #ifdef CONFIG_PROVE_LOCKING void might_fault(void) { + /* + * Some code (nfs/sunrpc) uses socket ops on kernel memory while + * holding the mmap_sem, this is safe because kernel memory doesn't + * get paged out, therefore we'll never actually fault, and the + * below annotations will generate false positives. + */ + if (segment_eq(get_fs(), KERNEL_DS)) + return; + might_sleep(); /* * it would be nicer only to annotate paths which are not under diff --git a/mm/mempolicy.c b/mm/mempolicy.c index e412ffa8e52e..3eb4a6fdc043 100644 --- a/mm/mempolicy.c +++ b/mm/mempolicy.c @@ -1068,10 +1068,9 @@ static int copy_nodes_to_user(unsigned long __user *mask, unsigned long maxnode, return copy_to_user(mask, nodes_addr(*nodes), copy) ? -EFAULT : 0; } -asmlinkage long sys_mbind(unsigned long start, unsigned long len, - unsigned long mode, - unsigned long __user *nmask, unsigned long maxnode, - unsigned flags) +SYSCALL_DEFINE6(mbind, unsigned long, start, unsigned long, len, + unsigned long, mode, unsigned long __user *, nmask, + unsigned long, maxnode, unsigned, flags) { nodemask_t nodes; int err; @@ -1091,8 +1090,8 @@ asmlinkage long sys_mbind(unsigned long start, unsigned long len, } /* Set the process memory policy */ -asmlinkage long sys_set_mempolicy(int mode, unsigned long __user *nmask, - unsigned long maxnode) +SYSCALL_DEFINE3(set_mempolicy, int, mode, unsigned long __user *, nmask, + unsigned long, maxnode) { int err; nodemask_t nodes; @@ -1110,9 +1109,9 @@ asmlinkage long sys_set_mempolicy(int mode, unsigned long __user *nmask, return do_set_mempolicy(mode, flags, &nodes); } -asmlinkage long sys_migrate_pages(pid_t pid, unsigned long maxnode, - const unsigned long __user *old_nodes, - const unsigned long __user *new_nodes) +SYSCALL_DEFINE4(migrate_pages, pid_t, pid, unsigned long, maxnode, + const unsigned long __user *, old_nodes, + const unsigned long __user *, new_nodes) { const struct cred *cred = current_cred(), *tcred; struct mm_struct *mm; @@ -1185,10 +1184,9 @@ out: /* Retrieve NUMA policy */ -asmlinkage long sys_get_mempolicy(int __user *policy, - unsigned long __user *nmask, - unsigned long maxnode, - unsigned long addr, unsigned long flags) +SYSCALL_DEFINE5(get_mempolicy, int __user *, policy, + unsigned long __user *, nmask, unsigned long, maxnode, + unsigned long, addr, unsigned long, flags) { int err; int uninitialized_var(pval); diff --git a/mm/migrate.c b/mm/migrate.c index a30ea5fcf9f1..2bb4e1d63520 100644 --- a/mm/migrate.c +++ b/mm/migrate.c @@ -1055,10 +1055,10 @@ out: * Move a list of pages in the address space of the currently executing * process. */ -asmlinkage long sys_move_pages(pid_t pid, unsigned long nr_pages, - const void __user * __user *pages, - const int __user *nodes, - int __user *status, int flags) +SYSCALL_DEFINE6(move_pages, pid_t, pid, unsigned long, nr_pages, + const void __user * __user *, pages, + const int __user *, nodes, + int __user *, status, int, flags) { const struct cred *cred = current_cred(), *tcred; struct task_struct *task; diff --git a/mm/mincore.c b/mm/mincore.c index 5178800bc129..8cb508f84ea4 100644 --- a/mm/mincore.c +++ b/mm/mincore.c @@ -177,8 +177,8 @@ none_mapped: * mapped * -EAGAIN - A kernel resource was temporarily unavailable. */ -asmlinkage long sys_mincore(unsigned long start, size_t len, - unsigned char __user * vec) +SYSCALL_DEFINE3(mincore, unsigned long, start, size_t, len, + unsigned char __user *, vec) { long retval; unsigned long pages; diff --git a/mm/mlock.c b/mm/mlock.c index e125156c664e..2904a347e476 100644 --- a/mm/mlock.c +++ b/mm/mlock.c @@ -530,7 +530,7 @@ static int do_mlock(unsigned long start, size_t len, int on) return error; } -asmlinkage long sys_mlock(unsigned long start, size_t len) +SYSCALL_DEFINE2(mlock, unsigned long, start, size_t, len) { unsigned long locked; unsigned long lock_limit; @@ -558,7 +558,7 @@ asmlinkage long sys_mlock(unsigned long start, size_t len) return error; } -asmlinkage long sys_munlock(unsigned long start, size_t len) +SYSCALL_DEFINE2(munlock, unsigned long, start, size_t, len) { int ret; @@ -595,7 +595,7 @@ out: return 0; } -asmlinkage long sys_mlockall(int flags) +SYSCALL_DEFINE1(mlockall, int, flags) { unsigned long lock_limit; int ret = -EINVAL; @@ -623,7 +623,7 @@ out: return ret; } -asmlinkage long sys_munlockall(void) +SYSCALL_DEFINE0(munlockall) { int ret; diff --git a/mm/mmap.c b/mm/mmap.c index a910c045cfd4..214b6a258eeb 100644 --- a/mm/mmap.c +++ b/mm/mmap.c @@ -245,7 +245,7 @@ static struct vm_area_struct *remove_vma(struct vm_area_struct *vma) return next; } -asmlinkage unsigned long sys_brk(unsigned long brk) +SYSCALL_DEFINE1(brk, unsigned long, brk) { unsigned long rlim, retval; unsigned long newbrk, oldbrk; @@ -658,6 +658,9 @@ again: remove_next = 1 + (end > next->vm_end); validate_mm(mm); } +/* Flags that can be inherited from an existing mapping when merging */ +#define VM_MERGEABLE_FLAGS (VM_CAN_NONLINEAR) + /* * If the vma has a ->close operation then the driver probably needs to release * per-vma resources, so we don't attempt to merge those. @@ -665,7 +668,7 @@ again: remove_next = 1 + (end > next->vm_end); static inline int is_mergeable_vma(struct vm_area_struct *vma, struct file *file, unsigned long vm_flags) { - if (vma->vm_flags != vm_flags) + if ((vma->vm_flags ^ vm_flags) & ~VM_MERGEABLE_FLAGS) return 0; if (vma->vm_file != file) return 0; @@ -1087,6 +1090,15 @@ int vma_wants_writenotify(struct vm_area_struct *vma) mapping_cap_account_dirty(vma->vm_file->f_mapping); } +/* + * We account for memory if it's a private writeable mapping, + * and VM_NORESERVE wasn't set. + */ +static inline int accountable_mapping(unsigned int vm_flags) +{ + return (vm_flags & (VM_NORESERVE | VM_SHARED | VM_WRITE)) == VM_WRITE; +} + unsigned long mmap_region(struct file *file, unsigned long addr, unsigned long len, unsigned long flags, unsigned int vm_flags, unsigned long pgoff, @@ -1114,36 +1126,32 @@ munmap_back: if (!may_expand_vm(mm, len >> PAGE_SHIFT)) return -ENOMEM; - if (flags & MAP_NORESERVE) + /* + * Set 'VM_NORESERVE' if we should not account for the + * memory use of this mapping. We only honor MAP_NORESERVE + * if we're allowed to overcommit memory. + */ + if ((flags & MAP_NORESERVE) && sysctl_overcommit_memory != OVERCOMMIT_NEVER) + vm_flags |= VM_NORESERVE; + if (!accountable) vm_flags |= VM_NORESERVE; - if (accountable && (!(flags & MAP_NORESERVE) || - sysctl_overcommit_memory == OVERCOMMIT_NEVER)) { - if (vm_flags & VM_SHARED) { - /* Check memory availability in shmem_file_setup? */ - vm_flags |= VM_ACCOUNT; - } else if (vm_flags & VM_WRITE) { - /* - * Private writable mapping: check memory availability - */ - charged = len >> PAGE_SHIFT; - if (security_vm_enough_memory(charged)) - return -ENOMEM; - vm_flags |= VM_ACCOUNT; - } + /* + * Private writable mapping: check memory availability + */ + if (accountable_mapping(vm_flags)) { + charged = len >> PAGE_SHIFT; + if (security_vm_enough_memory(charged)) + return -ENOMEM; + vm_flags |= VM_ACCOUNT; } /* - * Can we just expand an old private anonymous mapping? - * The VM_SHARED test is necessary because shmem_zero_setup - * will create the file object for a shared anonymous map below. + * Can we just expand an old mapping? */ - if (!file && !(vm_flags & VM_SHARED)) { - vma = vma_merge(mm, prev, addr, addr + len, vm_flags, - NULL, NULL, pgoff, NULL); - if (vma) - goto out; - } + vma = vma_merge(mm, prev, addr, addr + len, vm_flags, NULL, file, pgoff, NULL); + if (vma) + goto out; /* * Determine the object being mapped and call the appropriate @@ -1186,14 +1194,6 @@ munmap_back: goto free_vma; } - /* We set VM_ACCOUNT in a shared mapping's vm_flags, to inform - * shmem_zero_setup (perhaps called through /dev/zero's ->mmap) - * that memory reservation must be checked; but that reservation - * belongs to shared memory object, not to vma: so now clear it. - */ - if ((vm_flags & (VM_SHARED|VM_ACCOUNT)) == (VM_SHARED|VM_ACCOUNT)) - vma->vm_flags &= ~VM_ACCOUNT; - /* Can addr have changed?? * * Answer: Yes, several device drivers can do it in their @@ -1206,17 +1206,8 @@ munmap_back: if (vma_wants_writenotify(vma)) vma->vm_page_prot = vm_get_page_prot(vm_flags & ~VM_SHARED); - if (file && vma_merge(mm, prev, addr, vma->vm_end, - vma->vm_flags, NULL, file, pgoff, vma_policy(vma))) { - mpol_put(vma_policy(vma)); - kmem_cache_free(vm_area_cachep, vma); - fput(file); - if (vm_flags & VM_EXECUTABLE) - removed_exe_file_vma(mm); - } else { - vma_link(mm, vma, prev, rb_link, rb_parent); - file = vma->vm_file; - } + vma_link(mm, vma, prev, rb_link, rb_parent); + file = vma->vm_file; /* Once vma denies write, undo our temporary denial count */ if (correct_wcount) @@ -1948,7 +1939,7 @@ int do_munmap(struct mm_struct *mm, unsigned long start, size_t len) EXPORT_SYMBOL(do_munmap); -asmlinkage long sys_munmap(unsigned long addr, size_t len) +SYSCALL_DEFINE2(munmap, unsigned long, addr, size_t, len) { int ret; struct mm_struct *mm = current->mm; @@ -2472,3 +2463,13 @@ void mm_drop_all_locks(struct mm_struct *mm) mutex_unlock(&mm_all_locks_mutex); } + +/* + * initialise the VMA slab + */ +void __init mmap_init(void) +{ + vm_area_cachep = kmem_cache_create("vm_area_struct", + sizeof(struct vm_area_struct), 0, + SLAB_PANIC, NULL); +} diff --git a/mm/mprotect.c b/mm/mprotect.c index d0f6e7ce09f1..abe2694e13f4 100644 --- a/mm/mprotect.c +++ b/mm/mprotect.c @@ -217,8 +217,8 @@ fail: return error; } -asmlinkage long -sys_mprotect(unsigned long start, size_t len, unsigned long prot) +SYSCALL_DEFINE3(mprotect, unsigned long, start, size_t, len, + unsigned long, prot) { unsigned long vm_flags, nstart, end, tmp, reqprot; struct vm_area_struct *vma, *prev; diff --git a/mm/mremap.c b/mm/mremap.c index 646de959aa58..a39b7b91be46 100644 --- a/mm/mremap.c +++ b/mm/mremap.c @@ -420,9 +420,9 @@ out_nc: return ret; } -asmlinkage unsigned long sys_mremap(unsigned long addr, - unsigned long old_len, unsigned long new_len, - unsigned long flags, unsigned long new_addr) +SYSCALL_DEFINE5(mremap, unsigned long, addr, unsigned long, old_len, + unsigned long, new_len, unsigned long, flags, + unsigned long, new_addr) { unsigned long ret; diff --git a/mm/msync.c b/mm/msync.c index 07dae08cf31c..4083209b7f02 100644 --- a/mm/msync.c +++ b/mm/msync.c @@ -28,7 +28,7 @@ * So by _not_ starting I/O in MS_ASYNC we provide complete flexibility to * applications. */ -asmlinkage long sys_msync(unsigned long start, size_t len, int flags) +SYSCALL_DEFINE3(msync, unsigned long, start, size_t, len, int, flags) { unsigned long end; struct mm_struct *mm = current->mm; diff --git a/mm/nommu.c b/mm/nommu.c index 1c28ea3a4e9c..2fcf47d449b4 100644 --- a/mm/nommu.c +++ b/mm/nommu.c @@ -6,11 +6,11 @@ * * See Documentation/nommu-mmap.txt * - * Copyright (c) 2004-2005 David Howells <dhowells@redhat.com> + * Copyright (c) 2004-2008 David Howells <dhowells@redhat.com> * Copyright (c) 2000-2003 David McCullough <davidm@snapgear.com> * Copyright (c) 2000-2001 D Jeff Dionne <jeff@uClinux.org> * Copyright (c) 2002 Greg Ungerer <gerg@snapgear.com> - * Copyright (c) 2007 Paul Mundt <lethal@linux-sh.org> + * Copyright (c) 2007-2009 Paul Mundt <lethal@linux-sh.org> */ #include <linux/module.h> @@ -33,6 +33,28 @@ #include <asm/uaccess.h> #include <asm/tlb.h> #include <asm/tlbflush.h> +#include "internal.h" + +static inline __attribute__((format(printf, 1, 2))) +void no_printk(const char *fmt, ...) +{ +} + +#if 0 +#define kenter(FMT, ...) \ + printk(KERN_DEBUG "==> %s("FMT")\n", __func__, ##__VA_ARGS__) +#define kleave(FMT, ...) \ + printk(KERN_DEBUG "<== %s()"FMT"\n", __func__, ##__VA_ARGS__) +#define kdebug(FMT, ...) \ + printk(KERN_DEBUG "xxx" FMT"yyy\n", ##__VA_ARGS__) +#else +#define kenter(FMT, ...) \ + no_printk(KERN_DEBUG "==> %s("FMT")\n", __func__, ##__VA_ARGS__) +#define kleave(FMT, ...) \ + no_printk(KERN_DEBUG "<== %s()"FMT"\n", __func__, ##__VA_ARGS__) +#define kdebug(FMT, ...) \ + no_printk(KERN_DEBUG FMT"\n", ##__VA_ARGS__) +#endif #include "internal.h" @@ -40,19 +62,22 @@ void *high_memory; struct page *mem_map; unsigned long max_mapnr; unsigned long num_physpages; -unsigned long askedalloc, realalloc; atomic_long_t vm_committed_space = ATOMIC_LONG_INIT(0); int sysctl_overcommit_memory = OVERCOMMIT_GUESS; /* heuristic overcommit */ int sysctl_overcommit_ratio = 50; /* default is 50% */ int sysctl_max_map_count = DEFAULT_MAX_MAP_COUNT; +int sysctl_nr_trim_pages = 1; /* page trimming behaviour */ int heap_stack_gap = 0; +atomic_t mmap_pages_allocated; + EXPORT_SYMBOL(mem_map); EXPORT_SYMBOL(num_physpages); -/* list of shareable VMAs */ -struct rb_root nommu_vma_tree = RB_ROOT; -DECLARE_RWSEM(nommu_vma_sem); +/* list of mapped, potentially shareable regions */ +static struct kmem_cache *vm_region_jar; +struct rb_root nommu_region_tree = RB_ROOT; +DECLARE_RWSEM(nommu_region_sem); struct vm_operations_struct generic_file_vm_ops = { }; @@ -124,6 +149,20 @@ unsigned int kobjsize(const void *objp) return ksize(objp); /* + * If it's not a compound page, see if we have a matching VMA + * region. This test is intentionally done in reverse order, + * so if there's no VMA, we still fall through and hand back + * PAGE_SIZE for 0-order pages. + */ + if (!PageCompound(page)) { + struct vm_area_struct *vma; + + vma = find_vma(current->mm, (unsigned long)objp); + if (vma) + return vma->vm_end - vma->vm_start; + } + + /* * The ksize() function is only guaranteed to work for pointers * returned by kmalloc(). So handle arbitrary pointers here. */ @@ -355,6 +394,24 @@ void vunmap(const void *addr) } EXPORT_SYMBOL(vunmap); +void *vm_map_ram(struct page **pages, unsigned int count, int node, pgprot_t prot) +{ + BUG(); + return NULL; +} +EXPORT_SYMBOL(vm_map_ram); + +void vm_unmap_ram(const void *mem, unsigned int count) +{ + BUG(); +} +EXPORT_SYMBOL(vm_unmap_ram); + +void vm_unmap_aliases(void) +{ +} +EXPORT_SYMBOL_GPL(vm_unmap_aliases); + /* * Implement a stub for vmalloc_sync_all() if the architecture chose not to * have one. @@ -377,7 +434,7 @@ EXPORT_SYMBOL(vm_insert_page); * to a regular file. in this case, the unmapping will need * to invoke file system routines that need the global lock. */ -asmlinkage unsigned long sys_brk(unsigned long brk) +SYSCALL_DEFINE1(brk, unsigned long, brk) { struct mm_struct *mm = current->mm; @@ -401,129 +458,178 @@ asmlinkage unsigned long sys_brk(unsigned long brk) return mm->brk = brk; } -#ifdef DEBUG -static void show_process_blocks(void) +/* + * initialise the VMA and region record slabs + */ +void __init mmap_init(void) { - struct vm_list_struct *vml; - - printk("Process blocks %d:", current->pid); - - for (vml = ¤t->mm->context.vmlist; vml; vml = vml->next) { - printk(" %p: %p", vml, vml->vma); - if (vml->vma) - printk(" (%d @%lx #%d)", - kobjsize((void *) vml->vma->vm_start), - vml->vma->vm_start, - atomic_read(&vml->vma->vm_usage)); - printk(vml->next ? " ->" : ".\n"); - } + vm_region_jar = kmem_cache_create("vm_region_jar", + sizeof(struct vm_region), 0, + SLAB_PANIC, NULL); + vm_area_cachep = kmem_cache_create("vm_area_struct", + sizeof(struct vm_area_struct), 0, + SLAB_PANIC, NULL); } -#endif /* DEBUG */ /* - * add a VMA into a process's mm_struct in the appropriate place in the list - * - should be called with mm->mmap_sem held writelocked + * validate the region tree + * - the caller must hold the region lock */ -static void add_vma_to_mm(struct mm_struct *mm, struct vm_list_struct *vml) +#ifdef CONFIG_DEBUG_NOMMU_REGIONS +static noinline void validate_nommu_regions(void) { - struct vm_list_struct **ppv; - - for (ppv = ¤t->mm->context.vmlist; *ppv; ppv = &(*ppv)->next) - if ((*ppv)->vma->vm_start > vml->vma->vm_start) - break; - - vml->next = *ppv; - *ppv = vml; + struct vm_region *region, *last; + struct rb_node *p, *lastp; + + lastp = rb_first(&nommu_region_tree); + if (!lastp) + return; + + last = rb_entry(lastp, struct vm_region, vm_rb); + if (unlikely(last->vm_end <= last->vm_start)) + BUG(); + if (unlikely(last->vm_top < last->vm_end)) + BUG(); + + while ((p = rb_next(lastp))) { + region = rb_entry(p, struct vm_region, vm_rb); + last = rb_entry(lastp, struct vm_region, vm_rb); + + if (unlikely(region->vm_end <= region->vm_start)) + BUG(); + if (unlikely(region->vm_top < region->vm_end)) + BUG(); + if (unlikely(region->vm_start < last->vm_top)) + BUG(); + + lastp = p; + } } +#else +#define validate_nommu_regions() do {} while(0) +#endif /* - * look up the first VMA in which addr resides, NULL if none - * - should be called with mm->mmap_sem at least held readlocked + * add a region into the global tree */ -struct vm_area_struct *find_vma(struct mm_struct *mm, unsigned long addr) +static void add_nommu_region(struct vm_region *region) { - struct vm_list_struct *loop, *vml; + struct vm_region *pregion; + struct rb_node **p, *parent; - /* search the vm_start ordered list */ - vml = NULL; - for (loop = mm->context.vmlist; loop; loop = loop->next) { - if (loop->vma->vm_start > addr) - break; - vml = loop; + validate_nommu_regions(); + + BUG_ON(region->vm_start & ~PAGE_MASK); + + parent = NULL; + p = &nommu_region_tree.rb_node; + while (*p) { + parent = *p; + pregion = rb_entry(parent, struct vm_region, vm_rb); + if (region->vm_start < pregion->vm_start) + p = &(*p)->rb_left; + else if (region->vm_start > pregion->vm_start) + p = &(*p)->rb_right; + else if (pregion == region) + return; + else + BUG(); } - if (vml && vml->vma->vm_end > addr) - return vml->vma; + rb_link_node(®ion->vm_rb, parent, p); + rb_insert_color(®ion->vm_rb, &nommu_region_tree); - return NULL; + validate_nommu_regions(); } -EXPORT_SYMBOL(find_vma); /* - * find a VMA - * - we don't extend stack VMAs under NOMMU conditions + * delete a region from the global tree */ -struct vm_area_struct *find_extend_vma(struct mm_struct *mm, unsigned long addr) +static void delete_nommu_region(struct vm_region *region) { - return find_vma(mm, addr); -} + BUG_ON(!nommu_region_tree.rb_node); -int expand_stack(struct vm_area_struct *vma, unsigned long address) -{ - return -ENOMEM; + validate_nommu_regions(); + rb_erase(®ion->vm_rb, &nommu_region_tree); + validate_nommu_regions(); } /* - * look up the first VMA exactly that exactly matches addr - * - should be called with mm->mmap_sem at least held readlocked + * free a contiguous series of pages */ -static inline struct vm_area_struct *find_vma_exact(struct mm_struct *mm, - unsigned long addr) +static void free_page_series(unsigned long from, unsigned long to) { - struct vm_list_struct *vml; - - /* search the vm_start ordered list */ - for (vml = mm->context.vmlist; vml; vml = vml->next) { - if (vml->vma->vm_start == addr) - return vml->vma; - if (vml->vma->vm_start > addr) - break; + for (; from < to; from += PAGE_SIZE) { + struct page *page = virt_to_page(from); + + kdebug("- free %lx", from); + atomic_dec(&mmap_pages_allocated); + if (page_count(page) != 1) + kdebug("free page %p [%d]", page, page_count(page)); + put_page(page); } - - return NULL; } /* - * find a VMA in the global tree + * release a reference to a region + * - the caller must hold the region semaphore, which this releases + * - the region may not have been added to the tree yet, in which case vm_top + * will equal vm_start */ -static inline struct vm_area_struct *find_nommu_vma(unsigned long start) +static void __put_nommu_region(struct vm_region *region) + __releases(nommu_region_sem) { - struct vm_area_struct *vma; - struct rb_node *n = nommu_vma_tree.rb_node; + kenter("%p{%d}", region, atomic_read(®ion->vm_usage)); - while (n) { - vma = rb_entry(n, struct vm_area_struct, vm_rb); + BUG_ON(!nommu_region_tree.rb_node); - if (start < vma->vm_start) - n = n->rb_left; - else if (start > vma->vm_start) - n = n->rb_right; - else - return vma; + if (atomic_dec_and_test(®ion->vm_usage)) { + if (region->vm_top > region->vm_start) + delete_nommu_region(region); + up_write(&nommu_region_sem); + + if (region->vm_file) + fput(region->vm_file); + + /* IO memory and memory shared directly out of the pagecache + * from ramfs/tmpfs mustn't be released here */ + if (region->vm_flags & VM_MAPPED_COPY) { + kdebug("free series"); + free_page_series(region->vm_start, region->vm_top); + } + kmem_cache_free(vm_region_jar, region); + } else { + up_write(&nommu_region_sem); } +} - return NULL; +/* + * release a reference to a region + */ +static void put_nommu_region(struct vm_region *region) +{ + down_write(&nommu_region_sem); + __put_nommu_region(region); } /* - * add a VMA in the global tree + * add a VMA into a process's mm_struct in the appropriate place in the list + * and tree and add to the address space's page tree also if not an anonymous + * page + * - should be called with mm->mmap_sem held writelocked */ -static void add_nommu_vma(struct vm_area_struct *vma) +static void add_vma_to_mm(struct mm_struct *mm, struct vm_area_struct *vma) { - struct vm_area_struct *pvma; + struct vm_area_struct *pvma, **pp; struct address_space *mapping; - struct rb_node **p = &nommu_vma_tree.rb_node; - struct rb_node *parent = NULL; + struct rb_node **p, *parent; + + kenter(",%p", vma); + + BUG_ON(!vma->vm_region); + + mm->map_count++; + vma->vm_mm = mm; /* add the VMA to the mapping */ if (vma->vm_file) { @@ -534,42 +640,62 @@ static void add_nommu_vma(struct vm_area_struct *vma) flush_dcache_mmap_unlock(mapping); } - /* add the VMA to the master list */ + /* add the VMA to the tree */ + parent = NULL; + p = &mm->mm_rb.rb_node; while (*p) { parent = *p; pvma = rb_entry(parent, struct vm_area_struct, vm_rb); - if (vma->vm_start < pvma->vm_start) { + /* sort by: start addr, end addr, VMA struct addr in that order + * (the latter is necessary as we may get identical VMAs) */ + if (vma->vm_start < pvma->vm_start) p = &(*p)->rb_left; - } - else if (vma->vm_start > pvma->vm_start) { + else if (vma->vm_start > pvma->vm_start) p = &(*p)->rb_right; - } - else { - /* mappings are at the same address - this can only - * happen for shared-mem chardevs and shared file - * mappings backed by ramfs/tmpfs */ - BUG_ON(!(pvma->vm_flags & VM_SHARED)); - - if (vma < pvma) - p = &(*p)->rb_left; - else if (vma > pvma) - p = &(*p)->rb_right; - else - BUG(); - } + else if (vma->vm_end < pvma->vm_end) + p = &(*p)->rb_left; + else if (vma->vm_end > pvma->vm_end) + p = &(*p)->rb_right; + else if (vma < pvma) + p = &(*p)->rb_left; + else if (vma > pvma) + p = &(*p)->rb_right; + else + BUG(); } rb_link_node(&vma->vm_rb, parent, p); - rb_insert_color(&vma->vm_rb, &nommu_vma_tree); + rb_insert_color(&vma->vm_rb, &mm->mm_rb); + + /* add VMA to the VMA list also */ + for (pp = &mm->mmap; (pvma = *pp); pp = &(*pp)->vm_next) { + if (pvma->vm_start > vma->vm_start) + break; + if (pvma->vm_start < vma->vm_start) + continue; + if (pvma->vm_end < vma->vm_end) + break; + } + + vma->vm_next = *pp; + *pp = vma; } /* - * delete a VMA from the global list + * delete a VMA from its owning mm_struct and address space */ -static void delete_nommu_vma(struct vm_area_struct *vma) +static void delete_vma_from_mm(struct vm_area_struct *vma) { + struct vm_area_struct **pp; struct address_space *mapping; + struct mm_struct *mm = vma->vm_mm; + + kenter("%p", vma); + + mm->map_count--; + if (mm->mmap_cache == vma) + mm->mmap_cache = NULL; /* remove the VMA from the mapping */ if (vma->vm_file) { @@ -580,8 +706,115 @@ static void delete_nommu_vma(struct vm_area_struct *vma) flush_dcache_mmap_unlock(mapping); } - /* remove from the master list */ - rb_erase(&vma->vm_rb, &nommu_vma_tree); + /* remove from the MM's tree and list */ + rb_erase(&vma->vm_rb, &mm->mm_rb); + for (pp = &mm->mmap; *pp; pp = &(*pp)->vm_next) { + if (*pp == vma) { + *pp = vma->vm_next; + break; + } + } + + vma->vm_mm = NULL; +} + +/* + * destroy a VMA record + */ +static void delete_vma(struct mm_struct *mm, struct vm_area_struct *vma) +{ + kenter("%p", vma); + if (vma->vm_ops && vma->vm_ops->close) + vma->vm_ops->close(vma); + if (vma->vm_file) { + fput(vma->vm_file); + if (vma->vm_flags & VM_EXECUTABLE) + removed_exe_file_vma(mm); + } + put_nommu_region(vma->vm_region); + kmem_cache_free(vm_area_cachep, vma); +} + +/* + * look up the first VMA in which addr resides, NULL if none + * - should be called with mm->mmap_sem at least held readlocked + */ +struct vm_area_struct *find_vma(struct mm_struct *mm, unsigned long addr) +{ + struct vm_area_struct *vma; + struct rb_node *n = mm->mm_rb.rb_node; + + /* check the cache first */ + vma = mm->mmap_cache; + if (vma && vma->vm_start <= addr && vma->vm_end > addr) + return vma; + + /* trawl the tree (there may be multiple mappings in which addr + * resides) */ + for (n = rb_first(&mm->mm_rb); n; n = rb_next(n)) { + vma = rb_entry(n, struct vm_area_struct, vm_rb); + if (vma->vm_start > addr) + return NULL; + if (vma->vm_end > addr) { + mm->mmap_cache = vma; + return vma; + } + } + + return NULL; +} +EXPORT_SYMBOL(find_vma); + +/* + * find a VMA + * - we don't extend stack VMAs under NOMMU conditions + */ +struct vm_area_struct *find_extend_vma(struct mm_struct *mm, unsigned long addr) +{ + return find_vma(mm, addr); +} + +/* + * expand a stack to a given address + * - not supported under NOMMU conditions + */ +int expand_stack(struct vm_area_struct *vma, unsigned long address) +{ + return -ENOMEM; +} + +/* + * look up the first VMA exactly that exactly matches addr + * - should be called with mm->mmap_sem at least held readlocked + */ +static struct vm_area_struct *find_vma_exact(struct mm_struct *mm, + unsigned long addr, + unsigned long len) +{ + struct vm_area_struct *vma; + struct rb_node *n = mm->mm_rb.rb_node; + unsigned long end = addr + len; + + /* check the cache first */ + vma = mm->mmap_cache; + if (vma && vma->vm_start == addr && vma->vm_end == end) + return vma; + + /* trawl the tree (there may be multiple mappings in which addr + * resides) */ + for (n = rb_first(&mm->mm_rb); n; n = rb_next(n)) { + vma = rb_entry(n, struct vm_area_struct, vm_rb); + if (vma->vm_start < addr) + continue; + if (vma->vm_start > addr) + return NULL; + if (vma->vm_end == end) { + mm->mmap_cache = vma; + return vma; + } + } + + return NULL; } /* @@ -596,7 +829,7 @@ static int validate_mmap_request(struct file *file, unsigned long pgoff, unsigned long *_capabilities) { - unsigned long capabilities; + unsigned long capabilities, rlen; unsigned long reqprot = prot; int ret; @@ -616,12 +849,12 @@ static int validate_mmap_request(struct file *file, return -EINVAL; /* Careful about overflows.. */ - len = PAGE_ALIGN(len); - if (!len || len > TASK_SIZE) + rlen = PAGE_ALIGN(len); + if (!rlen || rlen > TASK_SIZE) return -ENOMEM; /* offset overflow? */ - if ((pgoff + (len >> PAGE_SHIFT)) < pgoff) + if ((pgoff + (rlen >> PAGE_SHIFT)) < pgoff) return -EOVERFLOW; if (file) { @@ -795,13 +1028,18 @@ static unsigned long determine_vm_flags(struct file *file, } /* - * set up a shared mapping on a file + * set up a shared mapping on a file (the driver or filesystem provides and + * pins the storage) */ -static int do_mmap_shared_file(struct vm_area_struct *vma, unsigned long len) +static int do_mmap_shared_file(struct vm_area_struct *vma) { int ret; ret = vma->vm_file->f_op->mmap(vma->vm_file, vma); + if (ret == 0) { + vma->vm_region->vm_top = vma->vm_region->vm_end; + return ret; + } if (ret != -ENOSYS) return ret; @@ -815,10 +1053,14 @@ static int do_mmap_shared_file(struct vm_area_struct *vma, unsigned long len) /* * set up a private mapping or an anonymous shared mapping */ -static int do_mmap_private(struct vm_area_struct *vma, unsigned long len) +static int do_mmap_private(struct vm_area_struct *vma, + struct vm_region *region, + unsigned long len) { + struct page *pages; + unsigned long total, point, n, rlen; void *base; - int ret; + int ret, order; /* invoke the file's mapping function so that it can keep track of * shared mappings on devices or memory @@ -826,34 +1068,63 @@ static int do_mmap_private(struct vm_area_struct *vma, unsigned long len) */ if (vma->vm_file) { ret = vma->vm_file->f_op->mmap(vma->vm_file, vma); - if (ret != -ENOSYS) { + if (ret == 0) { /* shouldn't return success if we're not sharing */ - BUG_ON(ret == 0 && !(vma->vm_flags & VM_MAYSHARE)); - return ret; /* success or a real error */ + BUG_ON(!(vma->vm_flags & VM_MAYSHARE)); + vma->vm_region->vm_top = vma->vm_region->vm_end; + return ret; } + if (ret != -ENOSYS) + return ret; /* getting an ENOSYS error indicates that direct mmap isn't * possible (as opposed to tried but failed) so we'll try to * make a private copy of the data and map that instead */ } + rlen = PAGE_ALIGN(len); + /* allocate some memory to hold the mapping * - note that this may not return a page-aligned address if the object * we're allocating is smaller than a page */ - base = kmalloc(len, GFP_KERNEL|__GFP_COMP); - if (!base) + order = get_order(rlen); + kdebug("alloc order %d for %lx", order, len); + + pages = alloc_pages(GFP_KERNEL, order); + if (!pages) goto enomem; - vma->vm_start = (unsigned long) base; - vma->vm_end = vma->vm_start + len; - vma->vm_flags |= VM_MAPPED_COPY; + total = 1 << order; + atomic_add(total, &mmap_pages_allocated); + + point = rlen >> PAGE_SHIFT; + + /* we allocated a power-of-2 sized page set, so we may want to trim off + * the excess */ + if (sysctl_nr_trim_pages && total - point >= sysctl_nr_trim_pages) { + while (total > point) { + order = ilog2(total - point); + n = 1 << order; + kdebug("shave %lu/%lu @%lu", n, total - point, total); + atomic_sub(n, &mmap_pages_allocated); + total -= n; + set_page_refcounted(pages + total); + __free_pages(pages + total, order); + } + } -#ifdef WARN_ON_SLACK - if (len + WARN_ON_SLACK <= kobjsize(result)) - printk("Allocation of %lu bytes from process %d has %lu bytes of slack\n", - len, current->pid, kobjsize(result) - len); -#endif + for (point = 1; point < total; point++) + set_page_refcounted(&pages[point]); + + base = page_address(pages); + region->vm_flags = vma->vm_flags |= VM_MAPPED_COPY; + region->vm_start = (unsigned long) base; + region->vm_end = region->vm_start + rlen; + region->vm_top = region->vm_start + (total << PAGE_SHIFT); + + vma->vm_start = region->vm_start; + vma->vm_end = region->vm_start + len; if (vma->vm_file) { /* read the contents of a file into the copy */ @@ -865,31 +1136,33 @@ static int do_mmap_private(struct vm_area_struct *vma, unsigned long len) old_fs = get_fs(); set_fs(KERNEL_DS); - ret = vma->vm_file->f_op->read(vma->vm_file, base, len, &fpos); + ret = vma->vm_file->f_op->read(vma->vm_file, base, rlen, &fpos); set_fs(old_fs); if (ret < 0) goto error_free; /* clear the last little bit */ - if (ret < len) - memset(base + ret, 0, len - ret); + if (ret < rlen) + memset(base + ret, 0, rlen - ret); } else { /* if it's an anonymous mapping, then just clear it */ - memset(base, 0, len); + memset(base, 0, rlen); } return 0; error_free: - kfree(base); - vma->vm_start = 0; + free_page_series(region->vm_start, region->vm_end); + region->vm_start = vma->vm_start = 0; + region->vm_end = vma->vm_end = 0; + region->vm_top = 0; return ret; enomem: - printk("Allocation of length %lu from process %d failed\n", - len, current->pid); + printk("Allocation of length %lu from process %d (%s) failed\n", + len, current->pid, current->comm); show_free_areas(); return -ENOMEM; } @@ -904,13 +1177,14 @@ unsigned long do_mmap_pgoff(struct file *file, unsigned long flags, unsigned long pgoff) { - struct vm_list_struct *vml = NULL; - struct vm_area_struct *vma = NULL; + struct vm_area_struct *vma; + struct vm_region *region; struct rb_node *rb; - unsigned long capabilities, vm_flags; - void *result; + unsigned long capabilities, vm_flags, result; int ret; + kenter(",%lx,%lx,%lx,%lx,%lx", addr, len, prot, flags, pgoff); + if (!(flags & MAP_FIXED)) addr = round_hint_to_min(addr); @@ -918,73 +1192,120 @@ unsigned long do_mmap_pgoff(struct file *file, * mapping */ ret = validate_mmap_request(file, addr, len, prot, flags, pgoff, &capabilities); - if (ret < 0) + if (ret < 0) { + kleave(" = %d [val]", ret); return ret; + } /* we've determined that we can make the mapping, now translate what we * now know into VMA flags */ vm_flags = determine_vm_flags(file, prot, flags, capabilities); - /* we're going to need to record the mapping if it works */ - vml = kzalloc(sizeof(struct vm_list_struct), GFP_KERNEL); - if (!vml) - goto error_getting_vml; + /* we're going to need to record the mapping */ + region = kmem_cache_zalloc(vm_region_jar, GFP_KERNEL); + if (!region) + goto error_getting_region; + + vma = kmem_cache_zalloc(vm_area_cachep, GFP_KERNEL); + if (!vma) + goto error_getting_vma; + + atomic_set(®ion->vm_usage, 1); + region->vm_flags = vm_flags; + region->vm_pgoff = pgoff; + + INIT_LIST_HEAD(&vma->anon_vma_node); + vma->vm_flags = vm_flags; + vma->vm_pgoff = pgoff; + + if (file) { + region->vm_file = file; + get_file(file); + vma->vm_file = file; + get_file(file); + if (vm_flags & VM_EXECUTABLE) { + added_exe_file_vma(current->mm); + vma->vm_mm = current->mm; + } + } - down_write(&nommu_vma_sem); + down_write(&nommu_region_sem); - /* if we want to share, we need to check for VMAs created by other + /* if we want to share, we need to check for regions created by other * mmap() calls that overlap with our proposed mapping - * - we can only share with an exact match on most regular files + * - we can only share with a superset match on most regular files * - shared mappings on character devices and memory backed files are * permitted to overlap inexactly as far as we are concerned for in * these cases, sharing is handled in the driver or filesystem rather * than here */ if (vm_flags & VM_MAYSHARE) { - unsigned long pglen = (len + PAGE_SIZE - 1) >> PAGE_SHIFT; - unsigned long vmpglen; + struct vm_region *pregion; + unsigned long pglen, rpglen, pgend, rpgend, start; - /* suppress VMA sharing for shared regions */ - if (vm_flags & VM_SHARED && - capabilities & BDI_CAP_MAP_DIRECT) - goto dont_share_VMAs; + pglen = (len + PAGE_SIZE - 1) >> PAGE_SHIFT; + pgend = pgoff + pglen; - for (rb = rb_first(&nommu_vma_tree); rb; rb = rb_next(rb)) { - vma = rb_entry(rb, struct vm_area_struct, vm_rb); + for (rb = rb_first(&nommu_region_tree); rb; rb = rb_next(rb)) { + pregion = rb_entry(rb, struct vm_region, vm_rb); - if (!(vma->vm_flags & VM_MAYSHARE)) + if (!(pregion->vm_flags & VM_MAYSHARE)) continue; /* search for overlapping mappings on the same file */ - if (vma->vm_file->f_path.dentry->d_inode != file->f_path.dentry->d_inode) + if (pregion->vm_file->f_path.dentry->d_inode != + file->f_path.dentry->d_inode) continue; - if (vma->vm_pgoff >= pgoff + pglen) + if (pregion->vm_pgoff >= pgend) continue; - vmpglen = vma->vm_end - vma->vm_start + PAGE_SIZE - 1; - vmpglen >>= PAGE_SHIFT; - if (pgoff >= vma->vm_pgoff + vmpglen) + rpglen = pregion->vm_end - pregion->vm_start; + rpglen = (rpglen + PAGE_SIZE - 1) >> PAGE_SHIFT; + rpgend = pregion->vm_pgoff + rpglen; + if (pgoff >= rpgend) continue; - /* handle inexactly overlapping matches between mappings */ - if (vma->vm_pgoff != pgoff || vmpglen != pglen) { + /* handle inexactly overlapping matches between + * mappings */ + if ((pregion->vm_pgoff != pgoff || rpglen != pglen) && + !(pgoff >= pregion->vm_pgoff && pgend <= rpgend)) { + /* new mapping is not a subset of the region */ if (!(capabilities & BDI_CAP_MAP_DIRECT)) goto sharing_violation; continue; } - /* we've found a VMA we can share */ - atomic_inc(&vma->vm_usage); - - vml->vma = vma; - result = (void *) vma->vm_start; - goto shared; + /* we've found a region we can share */ + atomic_inc(&pregion->vm_usage); + vma->vm_region = pregion; + start = pregion->vm_start; + start += (pgoff - pregion->vm_pgoff) << PAGE_SHIFT; + vma->vm_start = start; + vma->vm_end = start + len; + + if (pregion->vm_flags & VM_MAPPED_COPY) { + kdebug("share copy"); + vma->vm_flags |= VM_MAPPED_COPY; + } else { + kdebug("share mmap"); + ret = do_mmap_shared_file(vma); + if (ret < 0) { + vma->vm_region = NULL; + vma->vm_start = 0; + vma->vm_end = 0; + atomic_dec(&pregion->vm_usage); + pregion = NULL; + goto error_just_free; + } + } + fput(region->vm_file); + kmem_cache_free(vm_region_jar, region); + region = pregion; + result = start; + goto share; } - dont_share_VMAs: - vma = NULL; - /* obtain the address at which to make a shared mapping * - this is the hook for quasi-memory character devices to * tell us the location of a shared mapping @@ -995,113 +1316,93 @@ unsigned long do_mmap_pgoff(struct file *file, if (IS_ERR((void *) addr)) { ret = addr; if (ret != (unsigned long) -ENOSYS) - goto error; + goto error_just_free; /* the driver refused to tell us where to site * the mapping so we'll have to attempt to copy * it */ ret = (unsigned long) -ENODEV; if (!(capabilities & BDI_CAP_MAP_COPY)) - goto error; + goto error_just_free; capabilities &= ~BDI_CAP_MAP_DIRECT; + } else { + vma->vm_start = region->vm_start = addr; + vma->vm_end = region->vm_end = addr + len; } } } - /* we're going to need a VMA struct as well */ - vma = kzalloc(sizeof(struct vm_area_struct), GFP_KERNEL); - if (!vma) - goto error_getting_vma; - - INIT_LIST_HEAD(&vma->anon_vma_node); - atomic_set(&vma->vm_usage, 1); - if (file) { - get_file(file); - if (vm_flags & VM_EXECUTABLE) { - added_exe_file_vma(current->mm); - vma->vm_mm = current->mm; - } - } - vma->vm_file = file; - vma->vm_flags = vm_flags; - vma->vm_start = addr; - vma->vm_end = addr + len; - vma->vm_pgoff = pgoff; - - vml->vma = vma; + vma->vm_region = region; /* set up the mapping */ if (file && vma->vm_flags & VM_SHARED) - ret = do_mmap_shared_file(vma, len); + ret = do_mmap_shared_file(vma); else - ret = do_mmap_private(vma, len); + ret = do_mmap_private(vma, region, len); if (ret < 0) - goto error; - - /* okay... we have a mapping; now we have to register it */ - result = (void *) vma->vm_start; + goto error_put_region; - if (vma->vm_flags & VM_MAPPED_COPY) { - realalloc += kobjsize(result); - askedalloc += len; - } + add_nommu_region(region); - realalloc += kobjsize(vma); - askedalloc += sizeof(*vma); + /* okay... we have a mapping; now we have to register it */ + result = vma->vm_start; current->mm->total_vm += len >> PAGE_SHIFT; - add_nommu_vma(vma); - - shared: - realalloc += kobjsize(vml); - askedalloc += sizeof(*vml); +share: + add_vma_to_mm(current->mm, vma); - add_vma_to_mm(current->mm, vml); - - up_write(&nommu_vma_sem); + up_write(&nommu_region_sem); if (prot & PROT_EXEC) - flush_icache_range((unsigned long) result, - (unsigned long) result + len); + flush_icache_range(result, result + len); -#ifdef DEBUG - printk("do_mmap:\n"); - show_process_blocks(); -#endif + kleave(" = %lx", result); + return result; - return (unsigned long) result; - - error: - up_write(&nommu_vma_sem); - kfree(vml); +error_put_region: + __put_nommu_region(region); if (vma) { if (vma->vm_file) { fput(vma->vm_file); if (vma->vm_flags & VM_EXECUTABLE) removed_exe_file_vma(vma->vm_mm); } - kfree(vma); + kmem_cache_free(vm_area_cachep, vma); } + kleave(" = %d [pr]", ret); return ret; - sharing_violation: - up_write(&nommu_vma_sem); - printk("Attempt to share mismatched mappings\n"); - kfree(vml); - return -EINVAL; +error_just_free: + up_write(&nommu_region_sem); +error: + fput(region->vm_file); + kmem_cache_free(vm_region_jar, region); + fput(vma->vm_file); + if (vma->vm_flags & VM_EXECUTABLE) + removed_exe_file_vma(vma->vm_mm); + kmem_cache_free(vm_area_cachep, vma); + kleave(" = %d", ret); + return ret; - error_getting_vma: - up_write(&nommu_vma_sem); - kfree(vml); - printk("Allocation of vma for %lu byte allocation from process %d failed\n", +sharing_violation: + up_write(&nommu_region_sem); + printk(KERN_WARNING "Attempt to share mismatched mappings\n"); + ret = -EINVAL; + goto error; + +error_getting_vma: + kmem_cache_free(vm_region_jar, region); + printk(KERN_WARNING "Allocation of vma for %lu byte allocation" + " from process %d failed\n", len, current->pid); show_free_areas(); return -ENOMEM; - error_getting_vml: - printk("Allocation of vml for %lu byte allocation from process %d failed\n", +error_getting_region: + printk(KERN_WARNING "Allocation of vm region for %lu byte allocation" + " from process %d failed\n", len, current->pid); show_free_areas(); return -ENOMEM; @@ -1109,90 +1410,188 @@ unsigned long do_mmap_pgoff(struct file *file, EXPORT_SYMBOL(do_mmap_pgoff); /* - * handle mapping disposal for uClinux + * split a vma into two pieces at address 'addr', a new vma is allocated either + * for the first part or the tail. */ -static void put_vma(struct mm_struct *mm, struct vm_area_struct *vma) +int split_vma(struct mm_struct *mm, struct vm_area_struct *vma, + unsigned long addr, int new_below) { - if (vma) { - down_write(&nommu_vma_sem); + struct vm_area_struct *new; + struct vm_region *region; + unsigned long npages; - if (atomic_dec_and_test(&vma->vm_usage)) { - delete_nommu_vma(vma); + kenter(""); - if (vma->vm_ops && vma->vm_ops->close) - vma->vm_ops->close(vma); + /* we're only permitted to split anonymous regions that have a single + * owner */ + if (vma->vm_file || + atomic_read(&vma->vm_region->vm_usage) != 1) + return -ENOMEM; - /* IO memory and memory shared directly out of the pagecache from - * ramfs/tmpfs mustn't be released here */ - if (vma->vm_flags & VM_MAPPED_COPY) { - realalloc -= kobjsize((void *) vma->vm_start); - askedalloc -= vma->vm_end - vma->vm_start; - kfree((void *) vma->vm_start); - } + if (mm->map_count >= sysctl_max_map_count) + return -ENOMEM; - realalloc -= kobjsize(vma); - askedalloc -= sizeof(*vma); + region = kmem_cache_alloc(vm_region_jar, GFP_KERNEL); + if (!region) + return -ENOMEM; - if (vma->vm_file) { - fput(vma->vm_file); - if (vma->vm_flags & VM_EXECUTABLE) - removed_exe_file_vma(mm); - } - kfree(vma); - } + new = kmem_cache_alloc(vm_area_cachep, GFP_KERNEL); + if (!new) { + kmem_cache_free(vm_region_jar, region); + return -ENOMEM; + } + + /* most fields are the same, copy all, and then fixup */ + *new = *vma; + *region = *vma->vm_region; + new->vm_region = region; + + npages = (addr - vma->vm_start) >> PAGE_SHIFT; - up_write(&nommu_vma_sem); + if (new_below) { + region->vm_top = region->vm_end = new->vm_end = addr; + } else { + region->vm_start = new->vm_start = addr; + region->vm_pgoff = new->vm_pgoff += npages; } + + if (new->vm_ops && new->vm_ops->open) + new->vm_ops->open(new); + + delete_vma_from_mm(vma); + down_write(&nommu_region_sem); + delete_nommu_region(vma->vm_region); + if (new_below) { + vma->vm_region->vm_start = vma->vm_start = addr; + vma->vm_region->vm_pgoff = vma->vm_pgoff += npages; + } else { + vma->vm_region->vm_end = vma->vm_end = addr; + vma->vm_region->vm_top = addr; + } + add_nommu_region(vma->vm_region); + add_nommu_region(new->vm_region); + up_write(&nommu_region_sem); + add_vma_to_mm(mm, vma); + add_vma_to_mm(mm, new); + return 0; } /* - * release a mapping - * - under NOMMU conditions the parameters must match exactly to the mapping to - * be removed + * shrink a VMA by removing the specified chunk from either the beginning or + * the end */ -int do_munmap(struct mm_struct *mm, unsigned long addr, size_t len) +static int shrink_vma(struct mm_struct *mm, + struct vm_area_struct *vma, + unsigned long from, unsigned long to) { - struct vm_list_struct *vml, **parent; - unsigned long end = addr + len; + struct vm_region *region; -#ifdef DEBUG - printk("do_munmap:\n"); -#endif + kenter(""); - for (parent = &mm->context.vmlist; *parent; parent = &(*parent)->next) { - if ((*parent)->vma->vm_start > addr) - break; - if ((*parent)->vma->vm_start == addr && - ((len == 0) || ((*parent)->vma->vm_end == end))) - goto found; + /* adjust the VMA's pointers, which may reposition it in the MM's tree + * and list */ + delete_vma_from_mm(vma); + if (from > vma->vm_start) + vma->vm_end = from; + else + vma->vm_start = to; + add_vma_to_mm(mm, vma); + + /* cut the backing region down to size */ + region = vma->vm_region; + BUG_ON(atomic_read(®ion->vm_usage) != 1); + + down_write(&nommu_region_sem); + delete_nommu_region(region); + if (from > region->vm_start) { + to = region->vm_top; + region->vm_top = region->vm_end = from; + } else { + region->vm_start = to; } + add_nommu_region(region); + up_write(&nommu_region_sem); - printk("munmap of non-mmaped memory by process %d (%s): %p\n", - current->pid, current->comm, (void *) addr); - return -EINVAL; + free_page_series(from, to); + return 0; +} - found: - vml = *parent; +/* + * release a mapping + * - under NOMMU conditions the chunk to be unmapped must be backed by a single + * VMA, though it need not cover the whole VMA + */ +int do_munmap(struct mm_struct *mm, unsigned long start, size_t len) +{ + struct vm_area_struct *vma; + struct rb_node *rb; + unsigned long end = start + len; + int ret; - put_vma(mm, vml->vma); + kenter(",%lx,%zx", start, len); - *parent = vml->next; - realalloc -= kobjsize(vml); - askedalloc -= sizeof(*vml); - kfree(vml); + if (len == 0) + return -EINVAL; - update_hiwater_vm(mm); - mm->total_vm -= len >> PAGE_SHIFT; + /* find the first potentially overlapping VMA */ + vma = find_vma(mm, start); + if (!vma) { + printk(KERN_WARNING + "munmap of memory not mmapped by process %d (%s):" + " 0x%lx-0x%lx\n", + current->pid, current->comm, start, start + len - 1); + return -EINVAL; + } -#ifdef DEBUG - show_process_blocks(); -#endif + /* we're allowed to split an anonymous VMA but not a file-backed one */ + if (vma->vm_file) { + do { + if (start > vma->vm_start) { + kleave(" = -EINVAL [miss]"); + return -EINVAL; + } + if (end == vma->vm_end) + goto erase_whole_vma; + rb = rb_next(&vma->vm_rb); + vma = rb_entry(rb, struct vm_area_struct, vm_rb); + } while (rb); + kleave(" = -EINVAL [split file]"); + return -EINVAL; + } else { + /* the chunk must be a subset of the VMA found */ + if (start == vma->vm_start && end == vma->vm_end) + goto erase_whole_vma; + if (start < vma->vm_start || end > vma->vm_end) { + kleave(" = -EINVAL [superset]"); + return -EINVAL; + } + if (start & ~PAGE_MASK) { + kleave(" = -EINVAL [unaligned start]"); + return -EINVAL; + } + if (end != vma->vm_end && end & ~PAGE_MASK) { + kleave(" = -EINVAL [unaligned split]"); + return -EINVAL; + } + if (start != vma->vm_start && end != vma->vm_end) { + ret = split_vma(mm, vma, start, 1); + if (ret < 0) { + kleave(" = %d [split]", ret); + return ret; + } + } + return shrink_vma(mm, vma, start, end); + } +erase_whole_vma: + delete_vma_from_mm(vma); + delete_vma(mm, vma); + kleave(" = 0"); return 0; } EXPORT_SYMBOL(do_munmap); -asmlinkage long sys_munmap(unsigned long addr, size_t len) +SYSCALL_DEFINE2(munmap, unsigned long, addr, size_t, len) { int ret; struct mm_struct *mm = current->mm; @@ -1204,32 +1603,26 @@ asmlinkage long sys_munmap(unsigned long addr, size_t len) } /* - * Release all mappings + * release all the mappings made in a process's VM space */ -void exit_mmap(struct mm_struct * mm) +void exit_mmap(struct mm_struct *mm) { - struct vm_list_struct *tmp; - - if (mm) { -#ifdef DEBUG - printk("Exit_mmap:\n"); -#endif + struct vm_area_struct *vma; - mm->total_vm = 0; + if (!mm) + return; - while ((tmp = mm->context.vmlist)) { - mm->context.vmlist = tmp->next; - put_vma(mm, tmp->vma); + kenter(""); - realalloc -= kobjsize(tmp); - askedalloc -= sizeof(*tmp); - kfree(tmp); - } + mm->total_vm = 0; -#ifdef DEBUG - show_process_blocks(); -#endif + while ((vma = mm->mmap)) { + mm->mmap = vma->vm_next; + delete_vma_from_mm(vma); + delete_vma(mm, vma); } + + kleave(""); } unsigned long do_brk(unsigned long addr, unsigned long len) @@ -1242,8 +1635,8 @@ unsigned long do_brk(unsigned long addr, unsigned long len) * time (controlled by the MREMAP_MAYMOVE flag and available VM space) * * under NOMMU conditions, we only permit changing a mapping's size, and only - * as long as it stays within the hole allocated by the kmalloc() call in - * do_mmap_pgoff() and the block is not shareable + * as long as it stays within the region allocated by do_mmap_private() and the + * block is not shareable * * MREMAP_FIXED is not supported under NOMMU conditions */ @@ -1254,13 +1647,16 @@ unsigned long do_mremap(unsigned long addr, struct vm_area_struct *vma; /* insanity checks first */ - if (new_len == 0) + if (old_len == 0 || new_len == 0) return (unsigned long) -EINVAL; + if (addr & ~PAGE_MASK) + return -EINVAL; + if (flags & MREMAP_FIXED && new_addr != addr) return (unsigned long) -EINVAL; - vma = find_vma_exact(current->mm, addr); + vma = find_vma_exact(current->mm, addr, old_len); if (!vma) return (unsigned long) -EINVAL; @@ -1270,22 +1666,18 @@ unsigned long do_mremap(unsigned long addr, if (vma->vm_flags & VM_MAYSHARE) return (unsigned long) -EPERM; - if (new_len > kobjsize((void *) addr)) + if (new_len > vma->vm_region->vm_end - vma->vm_region->vm_start) return (unsigned long) -ENOMEM; /* all checks complete - do it */ vma->vm_end = vma->vm_start + new_len; - - askedalloc -= old_len; - askedalloc += new_len; - return vma->vm_start; } EXPORT_SYMBOL(do_mremap); -asmlinkage unsigned long sys_mremap(unsigned long addr, - unsigned long old_len, unsigned long new_len, - unsigned long flags, unsigned long new_addr) +SYSCALL_DEFINE5(mremap, unsigned long, addr, unsigned long, old_len, + unsigned long, new_len, unsigned long, flags, + unsigned long, new_addr) { unsigned long ret; diff --git a/mm/shmem.c b/mm/shmem.c index 5d0de96c9789..19d566ccdeea 100644 --- a/mm/shmem.c +++ b/mm/shmem.c @@ -2628,7 +2628,7 @@ struct file *shmem_file_setup(char *name, loff_t size, unsigned long flags) goto close_file; #ifdef CONFIG_SHMEM - SHMEM_I(inode)->flags = flags & VM_ACCOUNT; + SHMEM_I(inode)->flags = (flags & VM_NORESERVE) ? 0 : VM_ACCOUNT; #endif d_instantiate(dentry, inode); inode->i_size = size; diff --git a/mm/swapfile.c b/mm/swapfile.c index da422c47e2ee..7e6304dfafab 100644 --- a/mm/swapfile.c +++ b/mm/swapfile.c @@ -698,8 +698,10 @@ static int unuse_pte(struct vm_area_struct *vma, pmd_t *pmd, pte_t *pte; int ret = 1; - if (mem_cgroup_try_charge_swapin(vma->vm_mm, page, GFP_KERNEL, &ptr)) + if (mem_cgroup_try_charge_swapin(vma->vm_mm, page, GFP_KERNEL, &ptr)) { ret = -ENOMEM; + goto out_nolock; + } pte = pte_offset_map_lock(vma->vm_mm, pmd, addr, &ptl); if (unlikely(!pte_same(*pte, swp_entry_to_pte(entry)))) { @@ -723,6 +725,7 @@ static int unuse_pte(struct vm_area_struct *vma, pmd_t *pmd, activate_page(page); out: pte_unmap_unlock(pte, ptl); +out_nolock: return ret; } @@ -1377,7 +1380,7 @@ out: return ret; } -asmlinkage long sys_swapoff(const char __user * specialfile) +SYSCALL_DEFINE1(swapoff, const char __user *, specialfile) { struct swap_info_struct * p = NULL; unsigned short *swap_map; @@ -1633,7 +1636,7 @@ late_initcall(max_swapfiles_check); * * The swapon system call */ -asmlinkage long sys_swapon(const char __user * specialfile, int swap_flags) +SYSCALL_DEFINE2(swapon, const char __user *, specialfile, int, swap_flags) { struct swap_info_struct * p; char *name = NULL; diff --git a/mm/vmalloc.c b/mm/vmalloc.c index c5db9a7264d9..75f49d312e8c 100644 --- a/mm/vmalloc.c +++ b/mm/vmalloc.c @@ -14,7 +14,6 @@ #include <linux/highmem.h> #include <linux/slab.h> #include <linux/spinlock.h> -#include <linux/mutex.h> #include <linux/interrupt.h> #include <linux/proc_fs.h> #include <linux/seq_file.h> @@ -24,6 +23,7 @@ #include <linux/rbtree.h> #include <linux/radix-tree.h> #include <linux/rcupdate.h> +#include <linux/bootmem.h> #include <asm/atomic.h> #include <asm/uaccess.h> @@ -495,7 +495,7 @@ static atomic_t vmap_lazy_nr = ATOMIC_INIT(0); static void __purge_vmap_area_lazy(unsigned long *start, unsigned long *end, int sync, int force_flush) { - static DEFINE_MUTEX(purge_lock); + static DEFINE_SPINLOCK(purge_lock); LIST_HEAD(valist); struct vmap_area *va; int nr = 0; @@ -506,10 +506,10 @@ static void __purge_vmap_area_lazy(unsigned long *start, unsigned long *end, * the case that isn't actually used at the moment anyway. */ if (!sync && !force_flush) { - if (!mutex_trylock(&purge_lock)) + if (!spin_trylock(&purge_lock)) return; } else - mutex_lock(&purge_lock); + spin_lock(&purge_lock); rcu_read_lock(); list_for_each_entry_rcu(va, &vmap_area_list, list) { @@ -541,7 +541,7 @@ static void __purge_vmap_area_lazy(unsigned long *start, unsigned long *end, __free_vmap_area(va); spin_unlock(&vmap_area_lock); } - mutex_unlock(&purge_lock); + spin_unlock(&purge_lock); } /* @@ -984,6 +984,8 @@ EXPORT_SYMBOL(vm_map_ram); void __init vmalloc_init(void) { + struct vmap_area *va; + struct vm_struct *tmp; int i; for_each_possible_cpu(i) { @@ -996,6 +998,14 @@ void __init vmalloc_init(void) vbq->nr_dirty = 0; } + /* Import existing vmlist entries. */ + for (tmp = vmlist; tmp; tmp = tmp->next) { + va = alloc_bootmem(sizeof(struct vmap_area)); + va->flags = tmp->flags | VM_VM_AREA; + va->va_start = (unsigned long)tmp->addr; + va->va_end = va->va_start + tmp->size; + __insert_vmap_area(va); + } vmap_initialized = true; } |