From 90c5029e471636f21221bf66b9a46ada2ab79a22 Mon Sep 17 00:00:00 2001
From: Andi Kleen
Date: Wed, 27 Jul 2005 11:43:50 -0700
Subject: [PATCH] Undo mempolicy shared policy rbtree microoptimization

All mempolicy changes must be made inside the spinlock, and re-adding the
rb_erase prevents a crash while doing:

> echo "1" > /tmp/numatest
> numactl --length=0x4000 --shm /tmp/numatest --localalloc
> numactl --length=0x2000 --offset=0 --shm /tmp/numatest --membind=0
> numactl --length=0x2000 --offset=0x2000 --shm /tmp/numatest --membind=1
> ipcs
> ipcrm -M "the_key_value_of_this_shm_area"

Based on a patch by John Blackwood

Signed-off-by: Andi Kleen
Signed-off-by: Andrew Morton
Signed-off-by: Linus Torvalds
---
 mm/mempolicy.c | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

(limited to 'mm')

diff --git a/mm/mempolicy.c b/mm/mempolicy.c
index cb41c31e7c87..1694845526be 100644
--- a/mm/mempolicy.c
+++ b/mm/mempolicy.c
@@ -1138,11 +1138,11 @@ void mpol_free_shared_policy(struct shared_policy *p)
     while (next) {
         n = rb_entry(next, struct sp_node, nd);
         next = rb_next(&n->nd);
+        rb_erase(&n->nd, &p->root);
         mpol_free(n->policy);
         kmem_cache_free(sn_cache, n);
     }
     spin_unlock(&p->lock);
-    p->root = RB_ROOT;
 }

 /* assumes fs == KERNEL_DS */
-- cgit v1.2.1
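The fix restores the invariant that the shared-policy tree is only ever modified while p->lock is held. Condensed from the patched function above (kernel rbtree API), the safe teardown pattern advances the iterator before unlinking, so traversal stays valid:

    spin_lock(&p->lock);
    next = rb_first(&p->root);
    while (next) {
        n = rb_entry(next, struct sp_node, nd); /* container of this tree node */
        next = rb_next(&n->nd);                 /* step forward before unlinking */
        rb_erase(&n->nd, &p->root);             /* modify the tree under the lock */
        mpol_free(n->policy);
        kmem_cache_free(sn_cache, n);
    }
    spin_unlock(&p->lock);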
From 1aaf18ff9de1f37bf674236fc0779c3aaa65b998 Mon Sep 17 00:00:00 2001
From: Andrew Morton
Date: Wed, 27 Jul 2005 11:43:54 -0700
Subject: [PATCH] check_user_page_readable() deadlock fix

Fix bug identified by Richard Purdie. oprofile calls
check_user_page_readable() from interrupt context, so we deadlock over
various VFS locks.

But check_user_page_readable() doesn't imply either a read or a write of the
page's contents. Change __follow_page() so that check_user_page_readable()
can tell __follow_page() that we're not accessing the page's contents, and
use that info to avoid the troublesome lock-takings.

Also, make follow_page() inline for the single callsite in memory.c to save
a bit of stack space.

Signed-off-by: Andrew Morton
Signed-off-by: Linus Torvalds
---
 mm/memory.c | 25 +++++++++++++++----------
 1 file changed, 15 insertions(+), 10 deletions(-)

(limited to 'mm')

diff --git a/mm/memory.c b/mm/memory.c
index beabdefa6254..6fe77acbc1cd 100644
--- a/mm/memory.c
+++ b/mm/memory.c
@@ -776,8 +776,8 @@ unsigned long zap_page_range(struct vm_area_struct *vma, unsigned long address,
  * Do a quick page-table lookup for a single page.
  * mm->page_table_lock must be held.
  */
-static struct page *
-__follow_page(struct mm_struct *mm, unsigned long address, int read, int write)
+static struct page *__follow_page(struct mm_struct *mm, unsigned long address,
+            int read, int write, int accessed)
 {
     pgd_t *pgd;
     pud_t *pud;
@@ -818,9 +818,11 @@ __follow_page(struct mm_struct *mm, unsigned long address, int read, int write)
     pfn = pte_pfn(pte);
     if (pfn_valid(pfn)) {
         page = pfn_to_page(pfn);
-        if (write && !pte_dirty(pte) && !PageDirty(page))
-            set_page_dirty(page);
-        mark_page_accessed(page);
+        if (accessed) {
+            if (write && !pte_dirty(pte) && !PageDirty(page))
+                set_page_dirty(page);
+            mark_page_accessed(page);
+        }
         return page;
     }
 }
@@ -829,16 +831,19 @@ out:
     return NULL;
 }

-struct page *
+inline struct page *
 follow_page(struct mm_struct *mm, unsigned long address, int write)
 {
-    return __follow_page(mm, address, /*read*/0, write);
+    return __follow_page(mm, address, 0, write, 1);
 }

-int
-check_user_page_readable(struct mm_struct *mm, unsigned long address)
+/*
+ * check_user_page_readable() can be called from interrupt context by oprofile,
+ * so we need to avoid taking any non-irq-safe locks
+ */
+int check_user_page_readable(struct mm_struct *mm, unsigned long address)
 {
-    return __follow_page(mm, address, /*read*/1, /*write*/0) != NULL;
+    return __follow_page(mm, address, 1, 0, 0) != NULL;
 }
 EXPORT_SYMBOL(check_user_page_readable);
-- cgit v1.2.1
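The new accessed flag is what breaks the deadlock: it lets an interrupt-context caller skip the set_page_dirty()/mark_page_accessed() side effects, which are where the non-irq-safe locks get taken. The two call sites after this patch, with the flag values spelled out (local variable names illustrative):

    /* process context: we really touch the page, keep the side effects */
    page = __follow_page(mm, address, 0 /* read */, write, 1 /* accessed */);

    /* oprofile, possibly in interrupt context: only probe readability,
     * so suppress the lock-taking side effects */
    readable = __follow_page(mm, address, 1 /* read */, 0 /* write */,
                             0 /* accessed */) != NULL;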
From 165cd40235732644b1856a5ed5e158c9b93f6010 Mon Sep 17 00:00:00 2001
From: suzuki
Date: Wed, 27 Jul 2005 11:43:59 -0700
Subject: [PATCH] madvise() does not always return -EBADF on non-file mapped area

The madvise() system call returns -EBADF for areas which do not map to
files only for the *behaviour* request MADV_WILLNEED. According to the man
pages, madvise returns:

EBADF - the map exists, but the area maps something that isn't a file.

Fixes bug 2995.

Signed-off-by: Suzuki K P
Signed-off-by: Andrew Morton
Signed-off-by: Linus Torvalds
---
 mm/madvise.c | 13 ++++++++-----
 1 file changed, 8 insertions(+), 5 deletions(-)

(limited to 'mm')

diff --git a/mm/madvise.c b/mm/madvise.c
index 73180a22877e..c8c01a12fea4 100644
--- a/mm/madvise.c
+++ b/mm/madvise.c
@@ -83,9 +83,6 @@ static long madvise_willneed(struct vm_area_struct * vma,
 {
     struct file *file = vma->vm_file;

-    if (!file)
-        return -EBADF;
-
     if (file->f_mapping->a_ops->get_xip_page) {
         /* no bad return value, but ignore advice */
         return 0;
@@ -140,11 +137,16 @@ static long madvise_dontneed(struct vm_area_struct * vma,
     return 0;
 }

-static long madvise_vma(struct vm_area_struct *vma, struct vm_area_struct **prev,
-            unsigned long start, unsigned long end, int behavior)
+static long
+madvise_vma(struct vm_area_struct *vma, struct vm_area_struct **prev,
+        unsigned long start, unsigned long end, int behavior)
 {
+    struct file *filp = vma->vm_file;
     long error = -EBADF;

+    if (!filp)
+        goto out;
+
     switch (behavior) {
     case MADV_NORMAL:
     case MADV_SEQUENTIAL:
@@ -165,6 +167,7 @@ static long madvise_vma(struct vm_area_struct *vma, struct vm_area_struct **prev
         break;
     }

+out:
     return error;
 }
-- cgit v1.2.1
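In outline, the fix hoists the backing-file check out of the one behaviour handler into the common dispatcher, so every behaviour (not just MADV_WILLNEED) reports -EBADF for a non-file-backed area, as the man page specifies. A condensed sketch of the patched dispatch:

    static long
    madvise_vma(struct vm_area_struct *vma, struct vm_area_struct **prev,
                unsigned long start, unsigned long end, int behavior)
    {
        long error = -EBADF;

        /* no backing file: -EBADF for every behaviour, per the man page */
        if (!vma->vm_file)
            goto out;

        switch (behavior) {
        /* ... dispatch to the madvise_*() helpers as before ... */
        }
    out:
        return error;
    }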
From 12b1c5f382194d3f656e78fb5c9c8f2bfbe8ed8a Mon Sep 17 00:00:00 2001
From: Andy Whitcroft
Date: Wed, 27 Jul 2005 11:44:02 -0700
Subject: [PATCH] Remove bogus warning in page_alloc.c

Originally __free_pages_bulk used the relative page number within a zone to
define its buddies. This meant that to maintain the "maximally aligned"
requirements (that an allocation of size N will be aligned at least to N
physically) zones had to also be aligned to 1<<(MAX_ORDER-1) pages.

Signed-off-by: Andy Whitcroft
Cc: Christoph Lameter
Signed-off-by: Andrew Morton
Signed-off-by: Linus Torvalds
---
 mm/page_alloc.c | 4 ----
 1 file changed, 4 deletions(-)

(limited to 'mm')

diff --git a/mm/page_alloc.c b/mm/page_alloc.c
index 1d6ba6a4b594..42bccfb8464d 100644
--- a/mm/page_alloc.c
+++ b/mm/page_alloc.c
@@ -1861,7 +1861,6 @@ static void __init free_area_init_core(struct pglist_data *pgdat,
         unsigned long *zones_size, unsigned long *zholes_size)
 {
     unsigned long i, j;
-    const unsigned long zone_required_alignment = 1UL << (MAX_ORDER-1);
     int cpu, nid = pgdat->node_id;
     unsigned long zone_start_pfn = pgdat->node_start_pfn;
@@ -1934,9 +1933,6 @@ static void __init free_area_init_core(struct pglist_data *pgdat,
         zone->zone_mem_map = pfn_to_page(zone_start_pfn);
         zone->zone_start_pfn = zone_start_pfn;

-        if ((zone_start_pfn) & (zone_required_alignment-1))
-            printk(KERN_CRIT "BUG: wrong zone alignment, it will crash\n");
-
         memmap_init(size, nid, j, zone_start_pfn);

         zonetable_add(zone, nid, j, zone_start_pfn, size);
-- cgit v1.2.1

From e310fd43256b3cf4d37f6447b8f7413ca744657a Mon Sep 17 00:00:00 2001
From: "Martin J. Bligh"
Date: Fri, 29 Jul 2005 22:59:18 -0700
Subject: [PATCH] Fix NUMA node sizing in nr_free_zone_pages

We are iterating over all nodes in nr_free_zone_pages(). Because the
fallback zonelists contain all nodes in the system, and we walk all the
zonelists, we're counting memory multiple times (once for each node). This
caused us to make a size estimate of 32GB for an 8GB AMD64 box, which makes
all the dirty ratio calculations etc. incorrect.

There's still a further bug to fix from e820 holes causing overestimation
as well, but this fix is separate, and good as is, and fixes one class of
problems.

Problem found by Badari, and tested by Ram Pai - thanks!

Signed-off-by: Martin J. Bligh
Signed-off-by: Matt Dobson
Signed-off-by: Andrew Morton
Signed-off-by: Linus Torvalds
---
 mm/page_alloc.c | 21 ++++++++++-----------
 1 file changed, 10 insertions(+), 11 deletions(-)

(limited to 'mm')

diff --git a/mm/page_alloc.c b/mm/page_alloc.c
index 42bccfb8464d..8d088371196a 100644
--- a/mm/page_alloc.c
+++ b/mm/page_alloc.c
@@ -1061,20 +1061,19 @@ unsigned int nr_free_pages_pgdat(pg_data_t *pgdat)
 static unsigned int nr_free_zone_pages(int offset)
 {
-    pg_data_t *pgdat;
+    /* Just pick one node, since fallback list is circular */
+    pg_data_t *pgdat = NODE_DATA(numa_node_id());
     unsigned int sum = 0;

-    for_each_pgdat(pgdat) {
-        struct zonelist *zonelist = pgdat->node_zonelists + offset;
-        struct zone **zonep = zonelist->zones;
-        struct zone *zone;
+    struct zonelist *zonelist = pgdat->node_zonelists + offset;
+    struct zone **zonep = zonelist->zones;
+    struct zone *zone;

-        for (zone = *zonep++; zone; zone = *zonep++) {
-            unsigned long size = zone->present_pages;
-            unsigned long high = zone->pages_high;
-            if (size > high)
-                sum += size - high;
-        }
+    for (zone = *zonep++; zone; zone = *zonep++) {
+        unsigned long size = zone->present_pages;
+        unsigned long high = zone->pages_high;
+        if (size > high)
+            sum += size - high;
     }

     return sum;
-- cgit v1.2.1
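A toy model of the overcounting arithmetic (standalone C; node and zone sizes illustrative): with N nodes, every node's fallback zonelist reaches every zone in the system, so summing over all nodes' zonelists counts each zone N times, which is how an 8GB box can be estimated at 32GB:

    #include <stdio.h>

    #define NODES      4        /* illustrative node count */
    #define ZONE_PAGES 1000UL   /* pretend free pages per zone */

    int main(void)
    {
        unsigned long sum_old = 0, sum_new = 0;
        int node, z;

        /* old loop: every node's fallback zonelist visits all zones */
        for (node = 0; node < NODES; node++)
            for (z = 0; z < NODES; z++)
                sum_old += ZONE_PAGES;

        /* fixed loop: one node's zonelist already spans every zone once */
        for (z = 0; z < NODES; z++)
            sum_new += ZONE_PAGES;

        printf("old: %lu  new: %lu\n", sum_old, sum_new); /* old = NODES x new */
        return 0;
    }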
From 4ceb5db9757aaeadcf8fbbf97d76bd42aa4df0d6 Mon Sep 17 00:00:00 2001
From: Linus Torvalds
Date: Mon, 1 Aug 2005 11:14:49 -0700
Subject: Fix get_user_pages() race for write access

There's no real guarantee that handle_mm_fault() will always be able to
break a COW situation - if an update from another thread ends up modifying
the page table in some way, handle_mm_fault() may end up requiring us to
re-try the operation.

That's normally fine, but get_user_pages() ended up re-trying it as a read,
and thus a write access could in theory end up losing the dirty bit or be
done on a page that had not been properly COW'ed.

This makes get_user_pages() always retry write accesses as write accesses
by making "follow_page()" require that a writable follow has the dirty bit
set. That simplifies the code and solves the race: if the COW break fails
for some reason, we'll just loop around and try again.

Signed-off-by: Linus Torvalds
---
 mm/memory.c | 21 ++++-----------------
 1 file changed, 4 insertions(+), 17 deletions(-)

(limited to 'mm')

diff --git a/mm/memory.c b/mm/memory.c
index 6fe77acbc1cd..4e1c673784db 100644
--- a/mm/memory.c
+++ b/mm/memory.c
@@ -811,18 +811,15 @@ static struct page *__follow_page(struct mm_struct *mm, unsigned long address,
     pte = *ptep;
     pte_unmap(ptep);
     if (pte_present(pte)) {
-        if (write && !pte_write(pte))
+        if (write && !pte_dirty(pte))
             goto out;
         if (read && !pte_read(pte))
             goto out;
         pfn = pte_pfn(pte);
         if (pfn_valid(pfn)) {
             page = pfn_to_page(pfn);
-            if (accessed) {
-                if (write && !pte_dirty(pte) && !PageDirty(page))
-                    set_page_dirty(page);
+            if (accessed)
                 mark_page_accessed(page);
-            }
             return page;
         }
     }
@@ -941,10 +938,9 @@ int get_user_pages(struct task_struct *tsk, struct mm_struct *mm,
     spin_lock(&mm->page_table_lock);
     do {
         struct page *page;
-        int lookup_write = write;

         cond_resched_lock(&mm->page_table_lock);
-        while (!(page = follow_page(mm, start, lookup_write))) {
+        while (!(page = follow_page(mm, start, write))) {
             /*
              * Shortcut for anonymous pages. We don't want
              * to force the creation of pages tables for
@@ -952,8 +948,7 @@ int get_user_pages(struct task_struct *tsk, struct mm_struct *mm,
              * nobody touched so far. This is important
              * for doing a core dump for these mappings.
              */
-            if (!lookup_write &&
-                untouched_anonymous_page(mm,vma,start)) {
+            if (!write && untouched_anonymous_page(mm,vma,start)) {
                 page = ZERO_PAGE(start);
                 break;
             }
@@ -972,14 +967,6 @@ int get_user_pages(struct task_struct *tsk, struct mm_struct *mm,
             default:
                 BUG();
             }
-            /*
-             * Now that we have performed a write fault
-             * and surely no longer have a shared page we
-             * shouldn't write, we shouldn't ignore an
-             * unwritable page in the page table if
-             * we are forcing write access.
-             */
-            lookup_write = write && !force;
             spin_lock(&mm->page_table_lock);
         }
         if (pages) {
-- cgit v1.2.1

From 690dbe1ced143876d8fa56b72310738dbe079d0a Mon Sep 17 00:00:00 2001
From: Hugh Dickins
Date: Mon, 1 Aug 2005 21:11:42 -0700
Subject: [PATCH] x86_64: access of some bad address

x86_64 has a large sparse gate area between VSYSCALL_START and
VSYSCALL_END, not all of it presently backed by pmds. Alexander Nyberg has
found that in some circumstances gdb may try to ptrace here, and hit
get_user_pages BUG_ON. It seems odd that gdb should be accessing here, but
it certainly shouldn't crash in this way: relax BUG_ON to -EFAULT. Fixes
kernel bugzilla #4801.

Signed-off-by: Hugh Dickins
Cc: Andi Kleen
Signed-off-by: Andrew Morton
Signed-off-by: Linus Torvalds
---
 mm/memory.c | 8 ++++++--
 1 file changed, 6 insertions(+), 2 deletions(-)

(limited to 'mm')

diff --git a/mm/memory.c b/mm/memory.c
index 4e1c673784db..2405289dfdf8 100644
--- a/mm/memory.c
+++ b/mm/memory.c
@@ -910,9 +910,13 @@ int get_user_pages(struct task_struct *tsk, struct mm_struct *mm,
             pud = pud_offset(pgd, pg);
             BUG_ON(pud_none(*pud));
             pmd = pmd_offset(pud, pg);
-            BUG_ON(pmd_none(*pmd));
+            if (pmd_none(*pmd))
+                return i ? : -EFAULT;
             pte = pte_offset_map(pmd, pg);
-            BUG_ON(pte_none(*pte));
+            if (pte_none(*pte)) {
+                pte_unmap(pte);
+                return i ? : -EFAULT;
+            }
             if (pages) {
                 pages[i] = pte_page(*pte);
                 get_page(pages[i]);
-- cgit v1.2.1

From ba17101b41977f124948e0a7797fdcbb59e19f3e Mon Sep 17 00:00:00 2001
From: Eric Dumazet
Date: Mon, 1 Aug 2005 21:11:43 -0700
Subject: [PATCH] sys_set_mempolicy() doesn't check if mode < 0

A kernel BUG() is triggered by a call to set_mempolicy() with a negative
first argument. This is because the mode is declared as an int, and the
validity check doesn't reject < 0 values. Alternatively, mode could be
declared as unsigned int or unsigned long.

Signed-off-by: Eric Dumazet
Cc: Andi Kleen
Signed-off-by: Andrew Morton
Signed-off-by: Linus Torvalds
---
 mm/mempolicy.c | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

(limited to 'mm')

diff --git a/mm/mempolicy.c b/mm/mempolicy.c
index 1694845526be..b4eababc8198 100644
--- a/mm/mempolicy.c
+++ b/mm/mempolicy.c
@@ -443,7 +443,7 @@ asmlinkage long sys_set_mempolicy(int mode, unsigned long __user *nmask,
     struct mempolicy *new;
     DECLARE_BITMAP(nodes, MAX_NUMNODES);

-    if (mode > MPOL_MAX)
+    if (mode < 0 || mode > MPOL_MAX)
         return -EINVAL;
     err = get_nodes(nodes, nmask, maxnode, mode);
     if (err)
-- cgit v1.2.1
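The set_mempolicy() failure mode in miniature (standalone C; the table contents are illustrative, not the kernel's): an upper-bound-only check lets a negative mode through, which is then used as an index:

    #include <stdio.h>

    #define MPOL_MAX 3

    static const char *policy_name[MPOL_MAX + 1] = {
        "default", "preferred", "bind", "interleave"
    };

    /* Broken version: 'if (mode > MPOL_MAX)' alone lets mode == -1 through,
     * and policy_name[-1] is an out-of-bounds read -- in the kernel the
     * analogous path hit a BUG(). */
    static const char *lookup(int mode)
    {
        if (mode < 0 || mode > MPOL_MAX) /* the fix: reject negatives too */
            return NULL;
        return policy_name[mode];
    }

    int main(void)
    {
        const char *name = lookup(-1);
        printf("%s\n", name ? name : "rejected"); /* prints "rejected" */
        return 0;
    }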
From f33ea7f404e592e4563b12101b7a4d17da6558d7 Mon Sep 17 00:00:00 2001
From: Nick Piggin
Date: Wed, 3 Aug 2005 20:24:01 +1000
Subject: [PATCH] fix get_user_pages bug

Checking pte_dirty instead of pte_write in __follow_page is problematic for
s390, and for copy_one_pte which leaves dirty when clearing write. So
revert __follow_page to check pte_write as before, and make do_wp_page pass
back a special extra VM_FAULT_WRITE bit to say it has done its full job:
once get_user_pages receives this value, it no longer requires pte_write in
__follow_page.

But most callers of handle_mm_fault, in the various architectures, have
switch statements which do not expect this new case. To avoid changing them
all in a hurry, make an inline wrapper function (using the old name) that
masks off the new bit, and use the extended interface with double
underscores.

Yes, we do have a call to do_wp_page from do_swap_page, but no need to
change that: in the rare case it's needed, another do_wp_page will follow.

Signed-off-by: Hugh Dickins
[ Cleanups by Nick Piggin ]
Signed-off-by: Linus Torvalds
---
 mm/memory.c | 31 +++++++++++++++++++++++--------
 1 file changed, 23 insertions(+), 8 deletions(-)

(limited to 'mm')

diff --git a/mm/memory.c b/mm/memory.c
index 2405289dfdf8..81d7117aa58b 100644
--- a/mm/memory.c
+++ b/mm/memory.c
@@ -811,15 +811,18 @@ static struct page *__follow_page(struct mm_struct *mm, unsigned long address,
     pte = *ptep;
     pte_unmap(ptep);
     if (pte_present(pte)) {
-        if (write && !pte_dirty(pte))
+        if (write && !pte_write(pte))
             goto out;
         if (read && !pte_read(pte))
             goto out;
         pfn = pte_pfn(pte);
         if (pfn_valid(pfn)) {
             page = pfn_to_page(pfn);
-            if (accessed)
+            if (accessed) {
+                if (write && !pte_dirty(pte) && !PageDirty(page))
+                    set_page_dirty(page);
                 mark_page_accessed(page);
+            }
             return page;
         }
     }
@@ -941,10 +944,11 @@ int get_user_pages(struct task_struct *tsk, struct mm_struct *mm,
     }
     spin_lock(&mm->page_table_lock);
     do {
+        int write_access = write;
         struct page *page;

         cond_resched_lock(&mm->page_table_lock);
-        while (!(page = follow_page(mm, start, write))) {
+        while (!(page = follow_page(mm, start, write_access))) {
             /*
              * Shortcut for anonymous pages. We don't want
              * to force the creation of pages tables for
@@ -957,7 +961,16 @@ int get_user_pages(struct task_struct *tsk, struct mm_struct *mm,
                 break;
             }
             spin_unlock(&mm->page_table_lock);
-            switch (handle_mm_fault(mm,vma,start,write)) {
+            switch (__handle_mm_fault(mm, vma, start,
+                        write_access)) {
+            case VM_FAULT_WRITE:
+                /*
+                 * do_wp_page has broken COW when
+                 * necessary, even if maybe_mkwrite
+                 * decided not to set pte_write
+                 */
+                write_access = 0;
+                /* FALLTHRU */
             case VM_FAULT_MINOR:
                 tsk->min_flt++;
                 break;
@@ -1220,6 +1233,7 @@ static int do_wp_page(struct mm_struct *mm, struct vm_area_struct * vma,
     struct page *old_page, *new_page;
     unsigned long pfn = pte_pfn(pte);
     pte_t entry;
+    int ret;

     if (unlikely(!pfn_valid(pfn))) {
         /*
@@ -1247,7 +1261,7 @@ static int do_wp_page(struct mm_struct *mm, struct vm_area_struct * vma,
             lazy_mmu_prot_update(entry);
             pte_unmap(page_table);
             spin_unlock(&mm->page_table_lock);
-            return VM_FAULT_MINOR;
+            return VM_FAULT_MINOR|VM_FAULT_WRITE;
         }
     }
     pte_unmap(page_table);
@@ -1274,6 +1288,7 @@ static int do_wp_page(struct mm_struct *mm, struct vm_area_struct * vma,
     /*
      * Re-check the pte - we dropped the lock
      */
+    ret = VM_FAULT_MINOR;
     spin_lock(&mm->page_table_lock);
     page_table = pte_offset_map(pmd, address);
     if (likely(pte_same(*page_table, pte))) {
@@ -1290,12 +1305,13 @@ static int do_wp_page(struct mm_struct *mm, struct vm_area_struct * vma,
         /* Free the old page.. */
         new_page = old_page;
+        ret |= VM_FAULT_WRITE;
     }
     pte_unmap(page_table);
     page_cache_release(new_page);
     page_cache_release(old_page);
     spin_unlock(&mm->page_table_lock);
-    return VM_FAULT_MINOR;
+    return ret;

 no_new_page:
     page_cache_release(old_page);
@@ -1987,7 +2003,6 @@ static inline int handle_pte_fault(struct mm_struct *mm,
     if (write_access) {
         if (!pte_write(entry))
             return do_wp_page(mm, vma, address, pte, pmd, entry);
-        entry = pte_mkdirty(entry);
     }
     entry = pte_mkyoung(entry);
@@ -2002,7 +2017,7 @@ static inline int handle_pte_fault(struct mm_struct *mm,
 /*
  * By the time we get here, we already hold the mm semaphore
  */
-int handle_mm_fault(struct mm_struct *mm, struct vm_area_struct * vma,
+int __handle_mm_fault(struct mm_struct *mm, struct vm_area_struct * vma,
         unsigned long address, int write_access)
 {
     pgd_t *pgd;
-- cgit v1.2.1
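The compatibility wrapper described above is not visible in this mm/-only diff (it lives in the header), but its shape follows directly from the description - a sketch, assuming the inline-function style of the time:

    /* Old name, old contract: arch fault handlers keep seeing only the
     * classic VM_FAULT_* codes. Callers that understand the new bit use
     * __handle_mm_fault() directly, as get_user_pages() now does. */
    static inline int handle_mm_fault(struct mm_struct *mm,
            struct vm_area_struct *vma, unsigned long address,
            int write_access)
    {
        return __handle_mm_fault(mm, vma, address, write_access) &
                    (~VM_FAULT_WRITE);
    }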
From a68d2ebc1581a3aec57bd032651e013fa609f530 Mon Sep 17 00:00:00 2001
From: Linus Torvalds
Date: Wed, 3 Aug 2005 10:07:09 -0700
Subject: Fix up recent get_user_pages() handling

The VM_FAULT_WRITE thing is an extra bit, not a valid return value, and has
to be treated as such by get_user_pages().

Signed-off-by: Linus Torvalds
---
 mm/memory.c | 22 +++++++++++++---------
 1 file changed, 13 insertions(+), 9 deletions(-)

(limited to 'mm')

diff --git a/mm/memory.c b/mm/memory.c
index 81d7117aa58b..e046b7e4b530 100644
--- a/mm/memory.c
+++ b/mm/memory.c
@@ -949,6 +949,8 @@ int get_user_pages(struct task_struct *tsk, struct mm_struct *mm,
         cond_resched_lock(&mm->page_table_lock);
         while (!(page = follow_page(mm, start, write_access))) {
+            int ret;
+
             /*
              * Shortcut for anonymous pages. We don't want
              * to force the creation of pages tables for
@@ -961,16 +963,18 @@ int get_user_pages(struct task_struct *tsk, struct mm_struct *mm,
                 break;
             }
             spin_unlock(&mm->page_table_lock);
-            switch (__handle_mm_fault(mm, vma, start,
-                        write_access)) {
-            case VM_FAULT_WRITE:
-                /*
-                 * do_wp_page has broken COW when
-                 * necessary, even if maybe_mkwrite
-                 * decided not to set pte_write
-                 */
+            ret = __handle_mm_fault(mm, vma, start, write_access);
+
+            /*
+             * The VM_FAULT_WRITE bit tells us that do_wp_page has
+             * broken COW when necessary, even if maybe_mkwrite
+             * decided not to set pte_write. We can thus safely do
+             * subsequent page lookups as if they were reads.
+             */
+            if (ret & VM_FAULT_WRITE)
                 write_access = 0;
-                /* FALLTHRU */
+
+            switch (ret & ~VM_FAULT_WRITE) {
             case VM_FAULT_MINOR:
                 tsk->min_flt++;
                 break;
-- cgit v1.2.1

From 1c5ad84516ae7ea4ec868436a910a6bd8d20215a Mon Sep 17 00:00:00 2001
From: Hugh Dickins
Date: Thu, 4 Aug 2005 13:07:09 -0700
Subject: [PATCH] fix VmSize and VmData after mremap

mremap's move_vma is applying __vm_stat_account to the old vma which may
have already been freed: move it to just before the do_munmap.

mremapping to and fro with CONFIG_DEBUG_SLAB=y showed /proc/<pid>/status
VmSize and VmData wrapping just as in kernel bugzilla #4842, and fixed by
this patch - worth including in 2.6.13, though it is not yet confirmed that
it fixes that specific report from Frank van Maarseveen.

Signed-off-by: Hugh Dickins
Signed-off-by: Andrew Morton
Signed-off-by: Linus Torvalds
---
 mm/mremap.c | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

(limited to 'mm')

diff --git a/mm/mremap.c b/mm/mremap.c
index ec7238a78f36..fc45dc9a617b 100644
--- a/mm/mremap.c
+++ b/mm/mremap.c
@@ -229,6 +229,7 @@ static unsigned long move_vma(struct vm_area_struct *vma,
      * since do_munmap() will decrement it by old_len == new_len
      */
     mm->total_vm += new_len >> PAGE_SHIFT;
+    __vm_stat_account(mm, vma->vm_flags, vma->vm_file, new_len>>PAGE_SHIFT);

     if (do_munmap(mm, old_addr, old_len) < 0) {
         /* OOM: unable to split vma, just get accounts right */
@@ -243,7 +244,6 @@ static unsigned long move_vma(struct vm_area_struct *vma,
         vma->vm_next->vm_flags |= VM_ACCOUNT;
     }

-    __vm_stat_account(mm, vma->vm_flags, vma->vm_file, new_len>>PAGE_SHIFT);
     if (vm_flags & VM_LOCKED) {
         mm->locked_vm += new_len >> PAGE_SHIFT;
         if (new_len > old_len)
-- cgit v1.2.1
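The bug pattern here is use-after-free through ordering: reading the vma's fields for accounting is only safe before do_munmap(), which can free the vma. Reduced to its essentials (condensed from move_vma above):

    /* read everything we need from 'vma' while it is still guaranteed live */
    mm->total_vm += new_len >> PAGE_SHIFT;
    __vm_stat_account(mm, vma->vm_flags, vma->vm_file, new_len >> PAGE_SHIFT);

    if (do_munmap(mm, old_addr, old_len) < 0) {
        /* OOM fallback ... */
    }
    /* 'vma' may have been freed by do_munmap(); do not touch it again */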
From 2f60f8d3573ff90fe5d75a6d11fd2add1248e7d6 Mon Sep 17 00:00:00 2001
From: Simon Derr
Date: Thu, 4 Aug 2005 19:52:03 -0700
Subject: [PATCH] __vm_enough_memory() signedness fix

We have found what seems to be a small bug in __vm_enough_memory() when
sysctl_overcommit_memory is set to OVERCOMMIT_NEVER. When this bug occurs
the system fails to boot, with /sbin/init whining about fork() returning
ENOMEM.

We hunted down the problem to this: the deferred update mechanism used in
vm_acct_memory(), on an SMP system, allows the vm_committed_space counter
to have a negative value. This should not be a problem since this counter
is known to be inaccurate.

But in __vm_enough_memory() this counter is compared to the `allowed'
variable, which is an unsigned long. This comparison is broken since it
will consider the negative values of vm_committed_space to be huge positive
values, resulting in a memory allocation failure.

Signed-off-by: Simon Derr
Signed-off-by: Andrew Morton
Signed-off-by: Linus Torvalds
---
 mm/mmap.c  | 6 +++++-
 mm/nommu.c | 6 +++++-
 2 files changed, 10 insertions(+), 2 deletions(-)

(limited to 'mm')

diff --git a/mm/mmap.c b/mm/mmap.c
index da3fa90a0aae..404319477e71 100644
--- a/mm/mmap.c
+++ b/mm/mmap.c
@@ -143,7 +143,11 @@ int __vm_enough_memory(long pages, int cap_sys_admin)
        leave 3% of the size of this process for other processes */
     allowed -= current->mm->total_vm / 32;

-    if (atomic_read(&vm_committed_space) < allowed)
+    /*
+     * cast `allowed' as a signed long because vm_committed_space
+     * sometimes has a negative value
+     */
+    if (atomic_read(&vm_committed_space) < (long)allowed)
         return 0;

     vm_unacct_memory(pages);
diff --git a/mm/nommu.c b/mm/nommu.c
index ce74452c02d9..fd4e8df0f02d 100644
--- a/mm/nommu.c
+++ b/mm/nommu.c
@@ -1167,7 +1167,11 @@ int __vm_enough_memory(long pages, int cap_sys_admin)
        leave 3% of the size of this process for other processes */
     allowed -= current->mm->total_vm / 32;

-    if (atomic_read(&vm_committed_space) < allowed)
+    /*
+     * cast `allowed' as a signed long because vm_committed_space
+     * sometimes has a negative value
+     */
+    if (atomic_read(&vm_committed_space) < (long)allowed)
         return 0;

     vm_unacct_memory(pages);
-- cgit v1.2.1
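The pitfall generalizes well beyond this function. A standalone demonstration of why the cast matters (values illustrative):

    #include <stdio.h>

    int main(void)
    {
        long committed = -100;     /* deferred per-cpu accounting went negative */
        unsigned long allowed = 1000;

        /* broken: the usual arithmetic conversions turn 'committed' into a
         * huge unsigned value, so the test is false and allocation fails */
        if (committed < allowed)
            printf("broken compare: allocation allowed\n");
        else
            printf("broken compare: allocation denied (the bug)\n");

        /* fixed: compare in the signed domain, as the patch does */
        if (committed < (long)allowed)
            printf("fixed compare: allocation allowed\n");
        return 0;
    }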
From c7546f8f03f5a4fa612605b6be930234d6026860 Mon Sep 17 00:00:00 2001
From: David Gibson
Date: Fri, 5 Aug 2005 11:59:35 -0700
Subject: [PATCH] Fix hugepage crash on failing mmap()

This patch fixes a crash in the hugepage code. unmap_hugepage_area() was
assuming that (due to prefault) PTEs must exist for all the area in
question. However, this may not be the case if mmap() encounters an error
before the prefault and calls unmap_region() to clean up any partial
mapping.

Depending on the hugepage configuration, this crash can be triggered by an
unprivileged user.

Signed-off-by: David Gibson
Cc: William Lee Irwin III
Signed-off-by: Andrew Morton
Signed-off-by: Linus Torvalds
---
 mm/hugetlb.c | 11 ++++++++++-
 1 file changed, 10 insertions(+), 1 deletion(-)

(limited to 'mm')

diff --git a/mm/hugetlb.c b/mm/hugetlb.c
index fbd1111ea119..6bf720bc662c 100644
--- a/mm/hugetlb.c
+++ b/mm/hugetlb.c
@@ -301,6 +301,7 @@ void unmap_hugepage_range(struct vm_area_struct *vma, unsigned long start,
 {
     struct mm_struct *mm = vma->vm_mm;
     unsigned long address;
+    pte_t *ptep;
     pte_t pte;
     struct page *page;

@@ -309,9 +310,17 @@ void unmap_hugepage_range(struct vm_area_struct *vma, unsigned long start,
     BUG_ON(end & ~HPAGE_MASK);

     for (address = start; address < end; address += HPAGE_SIZE) {
-        pte = huge_ptep_get_and_clear(mm, address, huge_pte_offset(mm, address));
+        ptep = huge_pte_offset(mm, address);
+        if (!ptep)
+            /* This can happen on truncate, or if an
+             * mmap() is aborted due to an error before
+             * the prefault */
+            continue;
+
+        pte = huge_ptep_get_and_clear(mm, address, ptep);
         if (pte_none(pte))
             continue;
+
         page = pte_page(pte);
         put_page(page);
     }
-- cgit v1.2.1
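The resulting loop treats a missing page table the same as an empty PTE - nothing to unmap - which is the general rule for teardown paths that cannot assume a fully populated range. Condensed from the patched unmap_hugepage_range above:

    for (address = start; address < end; address += HPAGE_SIZE) {
        ptep = huge_pte_offset(mm, address);
        if (!ptep)
            continue;   /* no page table here: mmap() aborted before the
                         * prefault, or truncate already cleaned up */

        pte = huge_ptep_get_and_clear(mm, address, ptep);
        if (pte_none(pte))
            continue;   /* table present but entry empty */

        put_page(pte_page(pte));
    }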