From 90c5029e471636f21221bf66b9a46ada2ab79a22 Mon Sep 17 00:00:00 2001
From: Andi Kleen
Date: Wed, 27 Jul 2005 11:43:50 -0700
Subject: [PATCH] Undo mempolicy shared policy rbtree microoptimization

All mempolicy changes must be made inside the spinlock, and re-adding the
rb_erase prevents a crash while doing:

> echo "1" > /tmp/numatest
> numactl --length=0x4000 --shm /tmp/numatest --localalloc
> numactl --length=0x2000 --offset=0 --shm /tmp/numatest --membind=0
> numactl --length=0x2000 --offset=0x2000 --shm /tmp/numatest --membind=1
> ipcs
> ipcrm -M "the_key_value_of_this_shm_area"

Based on a patch by John Blackwood

Signed-off-by: Andi Kleen
Signed-off-by: Andrew Morton
Signed-off-by: Linus Torvalds
---
 mm/mempolicy.c | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

(limited to 'mm')

diff --git a/mm/mempolicy.c b/mm/mempolicy.c
index cb41c31e7c87..1694845526be 100644
--- a/mm/mempolicy.c
+++ b/mm/mempolicy.c
@@ -1138,11 +1138,11 @@ void mpol_free_shared_policy(struct shared_policy *p)
     while (next) {
         n = rb_entry(next, struct sp_node, nd);
         next = rb_next(&n->nd);
+        rb_erase(&n->nd, &p->root);
         mpol_free(n->policy);
         kmem_cache_free(sn_cache, n);
     }
     spin_unlock(&p->lock);
-    p->root = RB_ROOT;
 }

 /* assumes fs == KERNEL_DS */
-- cgit v1.2.1
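The fix restores the invariant that the shared-policy tree is only ever modified while p->lock is held. Condensed from the patched function above (kernel rbtree API), the safe teardown pattern advances the iterator before unlinking, so traversal stays valid:

    spin_lock(&p->lock);
    next = rb_first(&p->root);
    while (next) {
        n = rb_entry(next, struct sp_node, nd); /* container of this tree node */
        next = rb_next(&n->nd);                 /* step forward before unlinking */
        rb_erase(&n->nd, &p->root);             /* modify the tree under the lock */
        mpol_free(n->policy);
        kmem_cache_free(sn_cache, n);
    }
    spin_unlock(&p->lock);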
From 1aaf18ff9de1f37bf674236fc0779c3aaa65b998 Mon Sep 17 00:00:00 2001
From: Andrew Morton
Date: Wed, 27 Jul 2005 11:43:54 -0700
Subject: [PATCH] check_user_page_readable() deadlock fix

Fix bug identified by Richard Purdie. oprofile calls
check_user_page_readable() from interrupt context, so we deadlock over
various VFS locks.

But check_user_page_readable() doesn't imply either a read or a write of the
page's contents. Change __follow_page() so that check_user_page_readable()
can tell __follow_page() that we're not accessing the page's contents, and
use that info to avoid the troublesome lock-takings.

Also, make follow_page() inline for the single callsite in memory.c to save
a bit of stack space.

Signed-off-by: Andrew Morton
Signed-off-by: Linus Torvalds
---
 mm/memory.c | 25 +++++++++++++++----------
 1 file changed, 15 insertions(+), 10 deletions(-)

(limited to 'mm')

diff --git a/mm/memory.c b/mm/memory.c
index beabdefa6254..6fe77acbc1cd 100644
--- a/mm/memory.c
+++ b/mm/memory.c
@@ -776,8 +776,8 @@ unsigned long zap_page_range(struct vm_area_struct *vma, unsigned long address,
  * Do a quick page-table lookup for a single page.
  * mm->page_table_lock must be held.
  */
-static struct page *
-__follow_page(struct mm_struct *mm, unsigned long address, int read, int write)
+static struct page *__follow_page(struct mm_struct *mm, unsigned long address,
+            int read, int write, int accessed)
 {
     pgd_t *pgd;
     pud_t *pud;
@@ -818,9 +818,11 @@ __follow_page(struct mm_struct *mm, unsigned long address, int read, int write)
     pfn = pte_pfn(pte);
     if (pfn_valid(pfn)) {
         page = pfn_to_page(pfn);
-        if (write && !pte_dirty(pte) && !PageDirty(page))
-            set_page_dirty(page);
-        mark_page_accessed(page);
+        if (accessed) {
+            if (write && !pte_dirty(pte) && !PageDirty(page))
+                set_page_dirty(page);
+            mark_page_accessed(page);
+        }
         return page;
     }
 }
@@ -829,16 +831,19 @@ out:
     return NULL;
 }

-struct page *
+inline struct page *
 follow_page(struct mm_struct *mm, unsigned long address, int write)
 {
-    return __follow_page(mm, address, /*read*/0, write);
+    return __follow_page(mm, address, 0, write, 1);
 }

-int
-check_user_page_readable(struct mm_struct *mm, unsigned long address)
+/*
+ * check_user_page_readable() can be called from interrupt context by oprofile,
+ * so we need to avoid taking any non-irq-safe locks
+ */
+int check_user_page_readable(struct mm_struct *mm, unsigned long address)
 {
-    return __follow_page(mm, address, /*read*/1, /*write*/0) != NULL;
+    return __follow_page(mm, address, 1, 0, 0) != NULL;
 }
 EXPORT_SYMBOL(check_user_page_readable);
-- cgit v1.2.1
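The new accessed flag is what breaks the deadlock: it lets an interrupt-context caller skip the set_page_dirty()/mark_page_accessed() side effects, which are where the non-irq-safe locks get taken. The two call sites after this patch, with the flag values spelled out (local variable names illustrative):

    /* process context: we really touch the page, keep the side effects */
    page = __follow_page(mm, address, 0 /* read */, write, 1 /* accessed */);

    /* oprofile, possibly in interrupt context: only probe readability,
     * so suppress the lock-taking side effects */
    readable = __follow_page(mm, address, 1 /* read */, 0 /* write */,
                             0 /* accessed */) != NULL;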
From 165cd40235732644b1856a5ed5e158c9b93f6010 Mon Sep 17 00:00:00 2001
From: suzuki
Date: Wed, 27 Jul 2005 11:43:59 -0700
Subject: [PATCH] madvise() does not always return -EBADF on non-file mapped area

The madvise() system call returns -EBADF for areas which do not map to
files only for the *behaviour* request MADV_WILLNEED. According to the man
pages, madvise returns:

EBADF - the map exists, but the area maps something that isn't a file.

Fixes bug 2995.

Signed-off-by: Suzuki K P
Signed-off-by: Andrew Morton
Signed-off-by: Linus Torvalds
---
 mm/madvise.c | 13 ++++++++-----
 1 file changed, 8 insertions(+), 5 deletions(-)

(limited to 'mm')

diff --git a/mm/madvise.c b/mm/madvise.c
index 73180a22877e..c8c01a12fea4 100644
--- a/mm/madvise.c
+++ b/mm/madvise.c
@@ -83,9 +83,6 @@ static long madvise_willneed(struct vm_area_struct * vma,
 {
     struct file *file = vma->vm_file;

-    if (!file)
-        return -EBADF;
-
     if (file->f_mapping->a_ops->get_xip_page) {
         /* no bad return value, but ignore advice */
         return 0;
@@ -140,11 +137,16 @@ static long madvise_dontneed(struct vm_area_struct * vma,
     return 0;
 }

-static long madvise_vma(struct vm_area_struct *vma, struct vm_area_struct **prev,
-            unsigned long start, unsigned long end, int behavior)
+static long
+madvise_vma(struct vm_area_struct *vma, struct vm_area_struct **prev,
+        unsigned long start, unsigned long end, int behavior)
 {
+    struct file *filp = vma->vm_file;
     long error = -EBADF;

+    if (!filp)
+        goto out;
+
     switch (behavior) {
     case MADV_NORMAL:
     case MADV_SEQUENTIAL:
@@ -165,6 +167,7 @@ static long madvise_vma(struct vm_area_struct *vma, struct vm_area_struct **prev
         break;
     }

+out:
     return error;
 }
-- cgit v1.2.1
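In outline, the fix hoists the backing-file check out of the one behaviour handler into the common dispatcher, so every behaviour (not just MADV_WILLNEED) reports -EBADF for a non-file-backed area, as the man page specifies. A condensed sketch of the patched dispatch:

    static long
    madvise_vma(struct vm_area_struct *vma, struct vm_area_struct **prev,
                unsigned long start, unsigned long end, int behavior)
    {
        long error = -EBADF;

        /* no backing file: -EBADF for every behaviour, per the man page */
        if (!vma->vm_file)
            goto out;

        switch (behavior) {
        /* ... dispatch to the madvise_*() helpers as before ... */
        }
    out:
        return error;
    }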
From 12b1c5f382194d3f656e78fb5c9c8f2bfbe8ed8a Mon Sep 17 00:00:00 2001
From: Andy Whitcroft
Date: Wed, 27 Jul 2005 11:44:02 -0700
Subject: [PATCH] Remove bogus warning in page_alloc.c

Originally __free_pages_bulk used the relative page number within a zone to
define its buddies. This meant that to maintain the "maximally aligned"
requirements (that an allocation of size N will be aligned at least to N
physically) zones had to also be aligned to 1<<(MAX_ORDER-1) pages.

Signed-off-by: Andy Whitcroft
Cc: Christoph Lameter
Signed-off-by: Andrew Morton
Signed-off-by: Linus Torvalds
---
 mm/page_alloc.c | 4 ----
 1 file changed, 4 deletions(-)

(limited to 'mm')

diff --git a/mm/page_alloc.c b/mm/page_alloc.c
index 1d6ba6a4b594..42bccfb8464d 100644
--- a/mm/page_alloc.c
+++ b/mm/page_alloc.c
@@ -1861,7 +1861,6 @@ static void __init free_area_init_core(struct pglist_data *pgdat,
         unsigned long *zones_size, unsigned long *zholes_size)
 {
     unsigned long i, j;
-    const unsigned long zone_required_alignment = 1UL << (MAX_ORDER-1);
     int cpu, nid = pgdat->node_id;
     unsigned long zone_start_pfn = pgdat->node_start_pfn;
@@ -1934,9 +1933,6 @@ static void __init free_area_init_core(struct pglist_data *pgdat,
         zone->zone_mem_map = pfn_to_page(zone_start_pfn);
         zone->zone_start_pfn = zone_start_pfn;

-        if ((zone_start_pfn) & (zone_required_alignment-1))
-            printk(KERN_CRIT "BUG: wrong zone alignment, it will crash\n");
-
         memmap_init(size, nid, j, zone_start_pfn);

         zonetable_add(zone, nid, j, zone_start_pfn, size);
-- cgit v1.2.1

From e310fd43256b3cf4d37f6447b8f7413ca744657a Mon Sep 17 00:00:00 2001
From: "Martin J. Bligh"
Date: Fri, 29 Jul 2005 22:59:18 -0700
Subject: [PATCH] Fix NUMA node sizing in nr_free_zone_pages

We are iterating over all nodes in nr_free_zone_pages(). Because the
fallback zonelists contain all nodes in the system, and we walk all the
zonelists, we're counting memory multiple times (once for each node). This
caused us to make a size estimate of 32GB for an 8GB AMD64 box, which makes
all the dirty ratio calculations etc. incorrect.

There's still a further bug to fix from e820 holes causing overestimation
as well, but this fix is separate, and good as is, and fixes one class of
problems.

Problem found by Badari, and tested by Ram Pai - thanks!

Signed-off-by: Martin J. Bligh
Signed-off-by: Matt Dobson
Signed-off-by: Andrew Morton
Signed-off-by: Linus Torvalds
---
 mm/page_alloc.c | 21 ++++++++++-----------
 1 file changed, 10 insertions(+), 11 deletions(-)

(limited to 'mm')

diff --git a/mm/page_alloc.c b/mm/page_alloc.c
index 42bccfb8464d..8d088371196a 100644
--- a/mm/page_alloc.c
+++ b/mm/page_alloc.c
@@ -1061,20 +1061,19 @@ unsigned int nr_free_pages_pgdat(pg_data_t *pgdat)
 static unsigned int nr_free_zone_pages(int offset)
 {
-    pg_data_t *pgdat;
+    /* Just pick one node, since fallback list is circular */
+    pg_data_t *pgdat = NODE_DATA(numa_node_id());
     unsigned int sum = 0;

-    for_each_pgdat(pgdat) {
-        struct zonelist *zonelist = pgdat->node_zonelists + offset;
-        struct zone **zonep = zonelist->zones;
-        struct zone *zone;
+    struct zonelist *zonelist = pgdat->node_zonelists + offset;
+    struct zone **zonep = zonelist->zones;
+    struct zone *zone;

-        for (zone = *zonep++; zone; zone = *zonep++) {
-            unsigned long size = zone->present_pages;
-            unsigned long high = zone->pages_high;
-            if (size > high)
-                sum += size - high;
-        }
+    for (zone = *zonep++; zone; zone = *zonep++) {
+        unsigned long size = zone->present_pages;
+        unsigned long high = zone->pages_high;
+        if (size > high)
+            sum += size - high;
     }

     return sum;
-- cgit v1.2.1
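A toy model of the overcounting arithmetic (standalone C; node and zone sizes illustrative): with N nodes, every node's fallback zonelist reaches every zone in the system, so summing over all nodes' zonelists counts each zone N times, which is how an 8GB box can be estimated at 32GB:

    #include <stdio.h>

    #define NODES      4        /* illustrative node count */
    #define ZONE_PAGES 1000UL   /* pretend free pages per zone */

    int main(void)
    {
        unsigned long sum_old = 0, sum_new = 0;
        int node, z;

        /* old loop: every node's fallback zonelist visits all zones */
        for (node = 0; node < NODES; node++)
            for (z = 0; z < NODES; z++)
                sum_old += ZONE_PAGES;

        /* fixed loop: one node's zonelist already spans every zone once */
        for (z = 0; z < NODES; z++)
            sum_new += ZONE_PAGES;

        printf("old: %lu  new: %lu\n", sum_old, sum_new); /* old = NODES x new */
        return 0;
    }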
From 4ceb5db9757aaeadcf8fbbf97d76bd42aa4df0d6 Mon Sep 17 00:00:00 2001
From: Linus Torvalds
Date: Mon, 1 Aug 2005 11:14:49 -0700
Subject: Fix get_user_pages() race for write access

There's no real guarantee that handle_mm_fault() will always be able to
break a COW situation - if an update from another thread ends up modifying
the page table in some way, handle_mm_fault() may end up requiring us to
re-try the operation.

That's normally fine, but get_user_pages() ended up re-trying it as a read,
and thus a write access could in theory end up losing the dirty bit or be
done on a page that had not been properly COW'ed.

This makes get_user_pages() always retry write accesses as write accesses
by making "follow_page()" require that a writable follow has the dirty bit
set. That simplifies the code and solves the race: if the COW break fails
for some reason, we'll just loop around and try again.

Signed-off-by: Linus Torvalds
---
 mm/memory.c | 21 ++++-----------------
 1 file changed, 4 insertions(+), 17 deletions(-)

(limited to 'mm')

diff --git a/mm/memory.c b/mm/memory.c
index 6fe77acbc1cd..4e1c673784db 100644
--- a/mm/memory.c
+++ b/mm/memory.c
@@ -811,18 +811,15 @@ static struct page *__follow_page(struct mm_struct *mm, unsigned long address,
     pte = *ptep;
     pte_unmap(ptep);
     if (pte_present(pte)) {
-        if (write && !pte_write(pte))
+        if (write && !pte_dirty(pte))
             goto out;
         if (read && !pte_read(pte))
             goto out;
         pfn = pte_pfn(pte);
         if (pfn_valid(pfn)) {
             page = pfn_to_page(pfn);
-            if (accessed) {
-                if (write && !pte_dirty(pte) && !PageDirty(page))
-                    set_page_dirty(page);
+            if (accessed)
                 mark_page_accessed(page);
-            }
             return page;
         }
     }
@@ -941,10 +938,9 @@ int get_user_pages(struct task_struct *tsk, struct mm_struct *mm,
     spin_lock(&mm->page_table_lock);
     do {
         struct page *page;
-        int lookup_write = write;

         cond_resched_lock(&mm->page_table_lock);
-        while (!(page = follow_page(mm, start, lookup_write))) {
+        while (!(page = follow_page(mm, start, write))) {
             /*
              * Shortcut for anonymous pages. We don't want
              * to force the creation of pages tables for
@@ -952,8 +948,7 @@ int get_user_pages(struct task_struct *tsk, struct mm_struct *mm,
              * nobody touched so far. This is important
              * for doing a core dump for these mappings.
              */
-            if (!lookup_write &&
-                untouched_anonymous_page(mm,vma,start)) {
+            if (!write && untouched_anonymous_page(mm,vma,start)) {
                 page = ZERO_PAGE(start);
                 break;
             }
@@ -972,14 +967,6 @@ int get_user_pages(struct task_struct *tsk, struct mm_struct *mm,
             default:
                 BUG();
             }
-            /*
-             * Now that we have performed a write fault
-             * and surely no longer have a shared page we
-             * shouldn't write, we shouldn't ignore an
-             * unwritable page in the page table if
-             * we are forcing write access.
-             */
-            lookup_write = write && !force;
             spin_lock(&mm->page_table_lock);
         }
         if (pages) {
-- cgit v1.2.1

From 690dbe1ced143876d8fa56b72310738dbe079d0a Mon Sep 17 00:00:00 2001
From: Hugh Dickins
Date: Mon, 1 Aug 2005 21:11:42 -0700
Subject: [PATCH] x86_64: access of some bad address

x86_64 has a large sparse gate area between VSYSCALL_START and
VSYSCALL_END, not all of it presently backed by pmds. Alexander Nyberg has
found that in some circumstances gdb may try to ptrace here, and hit
get_user_pages BUG_ON. It seems odd that gdb should be accessing here, but
it certainly shouldn't crash in this way: relax BUG_ON to -EFAULT. Fixes
kernel bugzilla #4801.

Signed-off-by: Hugh Dickins
Cc: Andi Kleen
Signed-off-by: Andrew Morton
Signed-off-by: Linus Torvalds
---
 mm/memory.c | 8 ++++++--
 1 file changed, 6 insertions(+), 2 deletions(-)

(limited to 'mm')

diff --git a/mm/memory.c b/mm/memory.c
index 4e1c673784db..2405289dfdf8 100644
--- a/mm/memory.c
+++ b/mm/memory.c
@@ -910,9 +910,13 @@ int get_user_pages(struct task_struct *tsk, struct mm_struct *mm,
             pud = pud_offset(pgd, pg);
             BUG_ON(pud_none(*pud));
             pmd = pmd_offset(pud, pg);
-            BUG_ON(pmd_none(*pmd));
+            if (pmd_none(*pmd))
+                return i ? : -EFAULT;
             pte = pte_offset_map(pmd, pg);
-            BUG_ON(pte_none(*pte));
+            if (pte_none(*pte)) {
+                pte_unmap(pte);
+                return i ? : -EFAULT;
+            }
             if (pages) {
                 pages[i] = pte_page(*pte);
                 get_page(pages[i]);
-- cgit v1.2.1

From ba17101b41977f124948e0a7797fdcbb59e19f3e Mon Sep 17 00:00:00 2001
From: Eric Dumazet
Date: Mon, 1 Aug 2005 21:11:43 -0700
Subject: [PATCH] sys_set_mempolicy() doesn't check if mode < 0

A kernel BUG() is triggered by a call to set_mempolicy() with a negative
first argument. This is because the mode is declared as an int, and the
validity check doesn't reject < 0 values. Alternatively, mode could be
declared as unsigned int or unsigned long.

Signed-off-by: Eric Dumazet
Cc: Andi Kleen
Signed-off-by: Andrew Morton
Signed-off-by: Linus Torvalds
---
 mm/mempolicy.c | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

(limited to 'mm')

diff --git a/mm/mempolicy.c b/mm/mempolicy.c
index 1694845526be..b4eababc8198 100644
--- a/mm/mempolicy.c
+++ b/mm/mempolicy.c
@@ -443,7 +443,7 @@ asmlinkage long sys_set_mempolicy(int mode, unsigned long __user *nmask,
     struct mempolicy *new;
     DECLARE_BITMAP(nodes, MAX_NUMNODES);

-    if (mode > MPOL_MAX)
+    if (mode < 0 || mode > MPOL_MAX)
         return -EINVAL;
     err = get_nodes(nodes, nmask, maxnode, mode);
     if (err)
-- cgit v1.2.1
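The set_mempolicy() failure mode in miniature (standalone C; the table contents are illustrative, not the kernel's): an upper-bound-only check lets a negative mode through, which is then used as an index:

    #include <stdio.h>

    #define MPOL_MAX 3

    static const char *policy_name[MPOL_MAX + 1] = {
        "default", "preferred", "bind", "interleave"
    };

    /* Broken version: 'if (mode > MPOL_MAX)' alone lets mode == -1 through,
     * and policy_name[-1] is an out-of-bounds read -- in the kernel the
     * analogous path hit a BUG(). */
    static const char *lookup(int mode)
    {
        if (mode < 0 || mode > MPOL_MAX) /* the fix: reject negatives too */
            return NULL;
        return policy_name[mode];
    }

    int main(void)
    {
        const char *name = lookup(-1);
        printf("%s\n", name ? name : "rejected"); /* prints "rejected" */
        return 0;
    }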
From f33ea7f404e592e4563b12101b7a4d17da6558d7 Mon Sep 17 00:00:00 2001
From: Nick Piggin
Date: Wed, 3 Aug 2005 20:24:01 +1000
Subject: [PATCH] fix get_user_pages bug

Checking pte_dirty instead of pte_write in __follow_page is problematic for
s390, and for copy_one_pte which leaves dirty when clearing write. So
revert __follow_page to check pte_write as before, and make do_wp_page pass
back a special extra VM_FAULT_WRITE bit to say it has done its full job:
once get_user_pages receives this value, it no longer requires pte_write in
__follow_page.

But most callers of handle_mm_fault, in the various architectures, have
switch statements which do not expect this new case. To avoid changing them
all in a hurry, make an inline wrapper function (using the old name) that
masks off the new bit, and use the extended interface with double
underscores.

Yes, we do have a call to do_wp_page from do_swap_page, but no need to
change that: in the rare case it's needed, another do_wp_page will follow.

Signed-off-by: Hugh Dickins
[ Cleanups by Nick Piggin ]
Signed-off-by: Linus Torvalds
---
 mm/memory.c | 31 +++++++++++++++++++++++--------
 1 file changed, 23 insertions(+), 8 deletions(-)

(limited to 'mm')

diff --git a/mm/memory.c b/mm/memory.c
index 2405289dfdf8..81d7117aa58b 100644
--- a/mm/memory.c
+++ b/mm/memory.c
@@ -811,15 +811,18 @@ static struct page *__follow_page(struct mm_struct *mm, unsigned long address,
     pte = *ptep;
     pte_unmap(ptep);
     if (pte_present(pte)) {
-        if (write && !pte_dirty(pte))
+        if (write && !pte_write(pte))
             goto out;
         if (read && !pte_read(pte))
             goto out;
         pfn = pte_pfn(pte);
         if (pfn_valid(pfn)) {
             page = pfn_to_page(pfn);
-            if (accessed)
+            if (accessed) {
+                if (write && !pte_dirty(pte) && !PageDirty(page))
+                    set_page_dirty(page);
                 mark_page_accessed(page);
+            }
             return page;
         }
     }
@@ -941,10 +944,11 @@ int get_user_pages(struct task_struct *tsk, struct mm_struct *mm,
     }
     spin_lock(&mm->page_table_lock);
     do {
+        int write_access = write;
         struct page *page;

         cond_resched_lock(&mm->page_table_lock);
-        while (!(page = follow_page(mm, start, write))) {
+        while (!(page = follow_page(mm, start, write_access))) {
             /*
              * Shortcut for anonymous pages. We don't want
              * to force the creation of pages tables for
@@ -957,7 +961,16 @@ int get_user_pages(struct task_struct *tsk, struct mm_struct *mm,
                 break;
             }
             spin_unlock(&mm->page_table_lock);
-            switch (handle_mm_fault(mm,vma,start,write)) {
+            switch (__handle_mm_fault(mm, vma, start,
+                        write_access)) {
+            case VM_FAULT_WRITE:
+                /*
+                 * do_wp_page has broken COW when
+                 * necessary, even if maybe_mkwrite
+                 * decided not to set pte_write
+                 */
+                write_access = 0;
+                /* FALLTHRU */
             case VM_FAULT_MINOR:
                 tsk->min_flt++;
                 break;
@@ -1220,6 +1233,7 @@ static int do_wp_page(struct mm_struct *mm, struct vm_area_struct * vma,
     struct page *old_page, *new_page;
     unsigned long pfn = pte_pfn(pte);
     pte_t entry;
+    int ret;

     if (unlikely(!pfn_valid(pfn))) {
         /*
@@ -1247,7 +1261,7 @@ static int do_wp_page(struct mm_struct *mm, struct vm_area_struct * vma,
             lazy_mmu_prot_update(entry);
             pte_unmap(page_table);
             spin_unlock(&mm->page_table_lock);
-            return VM_FAULT_MINOR;
+            return VM_FAULT_MINOR|VM_FAULT_WRITE;
         }
     }
     pte_unmap(page_table);
@@ -1274,6 +1288,7 @@ static int do_wp_page(struct mm_struct *mm, struct vm_area_struct * vma,
     /*
      * Re-check the pte - we dropped the lock
      */
+    ret = VM_FAULT_MINOR;
     spin_lock(&mm->page_table_lock);
     page_table = pte_offset_map(pmd, address);
     if (likely(pte_same(*page_table, pte))) {
@@ -1290,12 +1305,13 @@ static int do_wp_page(struct mm_struct *mm, struct vm_area_struct * vma,
         /* Free the old page.. */
         new_page = old_page;
+        ret |= VM_FAULT_WRITE;
     }
     pte_unmap(page_table);
     page_cache_release(new_page);
     page_cache_release(old_page);
     spin_unlock(&mm->page_table_lock);
-    return VM_FAULT_MINOR;
+    return ret;

 no_new_page:
     page_cache_release(old_page);
@@ -1987,7 +2003,6 @@ static inline int handle_pte_fault(struct mm_struct *mm,
     if (write_access) {
         if (!pte_write(entry))
             return do_wp_page(mm, vma, address, pte, pmd, entry);
-        entry = pte_mkdirty(entry);
     }
     entry = pte_mkyoung(entry);
@@ -2002,7 +2017,7 @@ static inline int handle_pte_fault(struct mm_struct *mm,
 /*
  * By the time we get here, we already hold the mm semaphore
  */
-int handle_mm_fault(struct mm_struct *mm, struct vm_area_struct * vma,
+int __handle_mm_fault(struct mm_struct *mm, struct vm_area_struct * vma,
         unsigned long address, int write_access)
 {
     pgd_t *pgd;
-- cgit v1.2.1
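The compatibility wrapper described above is not visible in this mm/-only diff (it lives in the header), but its shape follows directly from the description - a sketch, assuming the inline-function style of the time:

    /* Old name, old contract: arch fault handlers keep seeing only the
     * classic VM_FAULT_* codes. Callers that understand the new bit use
     * __handle_mm_fault() directly, as get_user_pages() now does. */
    static inline int handle_mm_fault(struct mm_struct *mm,
            struct vm_area_struct *vma, unsigned long address,
            int write_access)
    {
        return __handle_mm_fault(mm, vma, address, write_access) &
                    (~VM_FAULT_WRITE);
    }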
From a68d2ebc1581a3aec57bd032651e013fa609f530 Mon Sep 17 00:00:00 2001
From: Linus Torvalds
Date: Wed, 3 Aug 2005 10:07:09 -0700
Subject: Fix up recent get_user_pages() handling

The VM_FAULT_WRITE thing is an extra bit, not a valid return value, and has
to be treated as such by get_user_pages().

Signed-off-by: Linus Torvalds
---
 mm/memory.c | 22 +++++++++++++---------
 1 file changed, 13 insertions(+), 9 deletions(-)

(limited to 'mm')

diff --git a/mm/memory.c b/mm/memory.c
index 81d7117aa58b..e046b7e4b530 100644
--- a/mm/memory.c
+++ b/mm/memory.c
@@ -949,6 +949,8 @@ int get_user_pages(struct task_struct *tsk, struct mm_struct *mm,
         cond_resched_lock(&mm->page_table_lock);
         while (!(page = follow_page(mm, start, write_access))) {
+            int ret;
+
             /*
              * Shortcut for anonymous pages. We don't want
              * to force the creation of pages tables for
@@ -961,16 +963,18 @@ int get_user_pages(struct task_struct *tsk, struct mm_struct *mm,
                 break;
             }
             spin_unlock(&mm->page_table_lock);
-            switch (__handle_mm_fault(mm, vma, start,
-                        write_access)) {
-            case VM_FAULT_WRITE:
-                /*
-                 * do_wp_page has broken COW when
-                 * necessary, even if maybe_mkwrite
-                 * decided not to set pte_write
-                 */
+            ret = __handle_mm_fault(mm, vma, start, write_access);
+
+            /*
+             * The VM_FAULT_WRITE bit tells us that do_wp_page has
+             * broken COW when necessary, even if maybe_mkwrite
+             * decided not to set pte_write. We can thus safely do
+             * subsequent page lookups as if they were reads.
+             */
+            if (ret & VM_FAULT_WRITE)
                 write_access = 0;
-                /* FALLTHRU */
+
+            switch (ret & ~VM_FAULT_WRITE) {
             case VM_FAULT_MINOR:
                 tsk->min_flt++;
                 break;
-- cgit v1.2.1

From 1c5ad84516ae7ea4ec868436a910a6bd8d20215a Mon Sep 17 00:00:00 2001
From: Hugh Dickins
Date: Thu, 4 Aug 2005 13:07:09 -0700
Subject: [PATCH] fix VmSize and VmData after mremap

mremap's move_vma is applying __vm_stat_account to the old vma which may
have already been freed: move it to just before the do_munmap.

mremapping to and fro with CONFIG_DEBUG_SLAB=y showed /proc/<pid>/status
VmSize and VmData wrapping just as in kernel bugzilla #4842, and fixed by
this patch - worth including in 2.6.13, though it is not yet confirmed that
it fixes that specific report from Frank van Maarseveen.

Signed-off-by: Hugh Dickins
Signed-off-by: Andrew Morton
Signed-off-by: Linus Torvalds
---
 mm/mremap.c | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

(limited to 'mm')

diff --git a/mm/mremap.c b/mm/mremap.c
index ec7238a78f36..fc45dc9a617b 100644
--- a/mm/mremap.c
+++ b/mm/mremap.c
@@ -229,6 +229,7 @@ static unsigned long move_vma(struct vm_area_struct *vma,
      * since do_munmap() will decrement it by old_len == new_len
      */
     mm->total_vm += new_len >> PAGE_SHIFT;
+    __vm_stat_account(mm, vma->vm_flags, vma->vm_file, new_len>>PAGE_SHIFT);

     if (do_munmap(mm, old_addr, old_len) < 0) {
         /* OOM: unable to split vma, just get accounts right */
@@ -243,7 +244,6 @@ static unsigned long move_vma(struct vm_area_struct *vma,
         vma->vm_next->vm_flags |= VM_ACCOUNT;
     }

-    __vm_stat_account(mm, vma->vm_flags, vma->vm_file, new_len>>PAGE_SHIFT);
     if (vm_flags & VM_LOCKED) {
         mm->locked_vm += new_len >> PAGE_SHIFT;
         if (new_len > old_len)
-- cgit v1.2.1
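The bug pattern here is use-after-free through ordering: reading the vma's fields for accounting is only safe before do_munmap(), which can free the vma. Reduced to its essentials (condensed from move_vma above):

    /* read everything we need from 'vma' while it is still guaranteed live */
    mm->total_vm += new_len >> PAGE_SHIFT;
    __vm_stat_account(mm, vma->vm_flags, vma->vm_file, new_len >> PAGE_SHIFT);

    if (do_munmap(mm, old_addr, old_len) < 0) {
        /* OOM fallback ... */
    }
    /* 'vma' may have been freed by do_munmap(); do not touch it again */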
From 2f60f8d3573ff90fe5d75a6d11fd2add1248e7d6 Mon Sep 17 00:00:00 2001
From: Simon Derr
Date: Thu, 4 Aug 2005 19:52:03 -0700
Subject: [PATCH] __vm_enough_memory() signedness fix

We have found what seems to be a small bug in __vm_enough_memory() when
sysctl_overcommit_memory is set to OVERCOMMIT_NEVER. When this bug occurs
the system fails to boot, with /sbin/init whining about fork() returning
ENOMEM.

We hunted down the problem to this: the deferred update mechanism used in
vm_acct_memory(), on an SMP system, allows the vm_committed_space counter
to have a negative value. This should not be a problem since this counter
is known to be inaccurate.

But in __vm_enough_memory() this counter is compared to the `allowed'
variable, which is an unsigned long. This comparison is broken since it
will consider the negative values of vm_committed_space to be huge positive
values, resulting in a memory allocation failure.

Signed-off-by: Simon Derr
Signed-off-by: Andrew Morton
Signed-off-by: Linus Torvalds
---
 mm/mmap.c  | 6 +++++-
 mm/nommu.c | 6 +++++-
 2 files changed, 10 insertions(+), 2 deletions(-)

(limited to 'mm')

diff --git a/mm/mmap.c b/mm/mmap.c
index da3fa90a0aae..404319477e71 100644
--- a/mm/mmap.c
+++ b/mm/mmap.c
@@ -143,7 +143,11 @@ int __vm_enough_memory(long pages, int cap_sys_admin)
        leave 3% of the size of this process for other processes */
     allowed -= current->mm->total_vm / 32;

-    if (atomic_read(&vm_committed_space) < allowed)
+    /*
+     * cast `allowed' as a signed long because vm_committed_space
+     * sometimes has a negative value
+     */
+    if (atomic_read(&vm_committed_space) < (long)allowed)
         return 0;

     vm_unacct_memory(pages);
diff --git a/mm/nommu.c b/mm/nommu.c
index ce74452c02d9..fd4e8df0f02d 100644
--- a/mm/nommu.c
+++ b/mm/nommu.c
@@ -1167,7 +1167,11 @@ int __vm_enough_memory(long pages, int cap_sys_admin)
        leave 3% of the size of this process for other processes */
     allowed -= current->mm->total_vm / 32;

-    if (atomic_read(&vm_committed_space) < allowed)
+    /*
+     * cast `allowed' as a signed long because vm_committed_space
+     * sometimes has a negative value
+     */
+    if (atomic_read(&vm_committed_space) < (long)allowed)
         return 0;

     vm_unacct_memory(pages);
-- cgit v1.2.1
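The pitfall generalizes well beyond this function. A standalone demonstration of why the cast matters (values illustrative):

    #include <stdio.h>

    int main(void)
    {
        long committed = -100;     /* deferred per-cpu accounting went negative */
        unsigned long allowed = 1000;

        /* broken: the usual arithmetic conversions turn 'committed' into a
         * huge unsigned value, so the test is false and allocation fails */
        if (committed < allowed)
            printf("broken compare: allocation allowed\n");
        else
            printf("broken compare: allocation denied (the bug)\n");

        /* fixed: compare in the signed domain, as the patch does */
        if (committed < (long)allowed)
            printf("fixed compare: allocation allowed\n");
        return 0;
    }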
From c7546f8f03f5a4fa612605b6be930234d6026860 Mon Sep 17 00:00:00 2001
From: David Gibson
Date: Fri, 5 Aug 2005 11:59:35 -0700
Subject: [PATCH] Fix hugepage crash on failing mmap()

This patch fixes a crash in the hugepage code. unmap_hugepage_area() was
assuming that (due to prefault) PTEs must exist for all the area in
question. However, this may not be the case if mmap() encounters an error
before the prefault and calls unmap_region() to clean up any partial
mapping.

Depending on the hugepage configuration, this crash can be triggered by an
unprivileged user.

Signed-off-by: David Gibson
Cc: William Lee Irwin III
Signed-off-by: Andrew Morton
Signed-off-by: Linus Torvalds
---
 mm/hugetlb.c | 11 ++++++++++-
 1 file changed, 10 insertions(+), 1 deletion(-)

(limited to 'mm')

diff --git a/mm/hugetlb.c b/mm/hugetlb.c
index fbd1111ea119..6bf720bc662c 100644
--- a/mm/hugetlb.c
+++ b/mm/hugetlb.c
@@ -301,6 +301,7 @@ void unmap_hugepage_range(struct vm_area_struct *vma, unsigned long start,
 {
     struct mm_struct *mm = vma->vm_mm;
     unsigned long address;
+    pte_t *ptep;
     pte_t pte;
     struct page *page;

@@ -309,9 +310,17 @@ void unmap_hugepage_range(struct vm_area_struct *vma, unsigned long start,
     BUG_ON(end & ~HPAGE_MASK);

     for (address = start; address < end; address += HPAGE_SIZE) {
-        pte = huge_ptep_get_and_clear(mm, address, huge_pte_offset(mm, address));
+        ptep = huge_pte_offset(mm, address);
+        if (!ptep)
+            /* This can happen on truncate, or if an
+             * mmap() is aborted due to an error before
+             * the prefault */
+            continue;
+
+        pte = huge_ptep_get_and_clear(mm, address, ptep);
         if (pte_none(pte))
             continue;
+
         page = pte_page(pte);
         put_page(page);
     }
-- cgit v1.2.1
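The resulting loop treats a missing page table the same as an empty PTE - nothing to unmap - which is the general rule for teardown paths that cannot assume a fully populated range. Condensed from the patched unmap_hugepage_range above:

    for (address = start; address < end; address += HPAGE_SIZE) {
        ptep = huge_pte_offset(mm, address);
        if (!ptep)
            continue;   /* no page table here: mmap() aborted before the
                         * prefault, or truncate already cleaned up */

        pte = huge_ptep_get_and_clear(mm, address, ptep);
        if (pte_none(pte))
            continue;   /* table present but entry empty */

        put_page(pte_page(pte));
    }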