From e9959f0f37160e1f5351af828cc981712b5066c1 Mon Sep 17 00:00:00 2001
From: KAMEZAWA Hiroyuki <kamezawa.hiroyu@jp.fujitsu.com>
Date: Wed, 24 Nov 2010 12:57:09 -0800
Subject: mm/page_alloc.c: fix build_all_zonelist() where percpu_alloc() is
 wrongly called under stop_machine_run()

During memory hotplug, build_allzonelists() may be called under
stop_machine_run().  In this function, setup_zone_pageset() is called.
But it's bug because it will do page allocation under stop_machine_run().

Here is a report from Alok Kataria.

  BUG: sleeping function called from invalid context at kernel/mutex.c:94
  in_atomic(): 0, irqs_disabled(): 1, pid: 4, name: migration/0
  Pid: 4, comm: migration/0 Not tainted 2.6.35.6-45.fc14.x86_64 #1
  Call Trace:
   [<ffffffff8103d12b>] __might_sleep+0xeb/0xf0
   [<ffffffff81468245>] mutex_lock+0x24/0x50
   [<ffffffff8110eaa6>] pcpu_alloc+0x6d/0x7ee
   [<ffffffff81048888>] ? load_balance+0xbe/0x60e
   [<ffffffff8103a1b3>] ? rt_se_boosted+0x21/0x2f
   [<ffffffff8103e1cf>] ? dequeue_rt_stack+0x18b/0x1ed
   [<ffffffff8110f237>] __alloc_percpu+0x10/0x12
   [<ffffffff81465e22>] setup_zone_pageset+0x38/0xbe
   [<ffffffff810d6d81>] ? build_zonelists_node.clone.58+0x79/0x8c
   [<ffffffff81452539>] __build_all_zonelists+0x419/0x46c
   [<ffffffff8108ef01>] ? cpu_stopper_thread+0xb2/0x198
   [<ffffffff8108f075>] stop_machine_cpu_stop+0x8e/0xc5
   [<ffffffff8108efe7>] ? stop_machine_cpu_stop+0x0/0xc5
   [<ffffffff8108ef57>] cpu_stopper_thread+0x108/0x198
   [<ffffffff81467a37>] ? schedule+0x5b2/0x5cc
   [<ffffffff8108ee4f>] ? cpu_stopper_thread+0x0/0x198
   [<ffffffff81065f29>] kthread+0x7f/0x87
   [<ffffffff8100aae4>] kernel_thread_helper+0x4/0x10
   [<ffffffff81065eaa>] ? kthread+0x0/0x87
   [<ffffffff8100aae0>] ? kernel_thread_helper+0x0/0x10
  Built 5 zonelists in Node order, mobility grouping on.  Total pages: 289456
  Policy zone: Normal

This patch tries to fix the issue by moving setup_zone_pageset() out from
stop_machine_run(). It's obviously not necessary to be called under
stop_machine_run().

[akpm@linux-foundation.org: remove unneeded local]
Reported-by: Alok Kataria <akataria@vmware.com>
Signed-off-by: KAMEZAWA Hiroyuki <kamezawa.hiroyu@jp.fujitsu.com>
Cc: Tejun Heo <tj@kernel.org>
Cc: Petr Vandrovec <petr@vmware.com>
Cc: Pekka Enberg <penberg@cs.helsinki.fi>
Reviewed-by: Christoph Lameter <cl@linux-foundation.org>
Signed-off-by: Andrew Morton <akpm@linux-foundation.org>
Signed-off-by: Linus Torvalds <torvalds@linux-foundation.org>
---
 mm/page_alloc.c | 14 +++++---------
 1 file changed, 5 insertions(+), 9 deletions(-)

(limited to 'mm/page_alloc.c')
diff --git a/mm/page_alloc.c b/mm/page_alloc.c
index 07a654486f75..e4092704c1a9 100644
--- a/mm/page_alloc.c
+++ b/mm/page_alloc.c
@@ -3008,14 +3008,6 @@ static __init_refok int __build_all_zonelists(void *data)
 		build_zonelist_cache(pgdat);
 	}
 
-#ifdef CONFIG_MEMORY_HOTPLUG
-	/* Setup real pagesets for the new zone */
-	if (data) {
-		struct zone *zone = data;
-		setup_zone_pageset(zone);
-	}
-#endif
-
 	/*
 	 * Initialize the boot_pagesets that are going to be used
 	 * for bootstrapping processors. The real pagesets for
@@ -3064,7 +3056,11 @@ void build_all_zonelists(void *data)
 	} else {
 		/* we have to stop all cpus to guarantee there is no user
 		   of zonelist */
-		stop_machine(__build_all_zonelists, data, NULL);
+#ifdef CONFIG_MEMORY_HOTPLUG
+		if (data)
+			setup_zone_pageset((struct zone *)data);
+#endif
+		stop_machine(__build_all_zonelists, NULL, NULL);
 		/* cpuset refresh routine should be here */
 	}
 	vm_total_pages = nr_free_pagecache_pages();
-- 
cgit v1.2.1


From fa9f90be745d3b600a9d97a063be404c5e5d9071 Mon Sep 17 00:00:00 2001
From: Jesper Juhl <jj@chaosbits.net>
Date: Sun, 28 Nov 2010 21:39:34 +0100
Subject: =?UTF-8?q?Kill=20off=20a=20bunch=20of=20warning:=20=E2=80=98inlin?=
 =?UTF-8?q?e=E2=80=99=20is=20not=20at=20beginning=20of=20declaration?=
MIME-Version: 1.0
Content-Type: text/plain; charset=UTF-8
Content-Transfer-Encoding: 8bit

These warnings are spewed during a build of a 'allnoconfig' kernel
(especially the ones from u64_stats_sync.h show up a lot) when building
with -Wextra (which I often do)..
They are
  a) annoying
  b) easy to get rid of.
This patch kills them off.

include/linux/u64_stats_sync.h:70:1: warning: ‘inline’ is not at beginning of declaration
include/linux/u64_stats_sync.h:77:1: warning: ‘inline’ is not at beginning of declaration
include/linux/u64_stats_sync.h:84:1: warning: ‘inline’ is not at beginning of declaration
include/linux/u64_stats_sync.h:96:1: warning: ‘inline’ is not at beginning of declaration
include/linux/u64_stats_sync.h:115:1: warning: ‘inline’ is not at beginning of declaration
include/linux/u64_stats_sync.h:127:1: warning: ‘inline’ is not at beginning of declaration
kernel/time.c:241:1: warning: ‘inline’ is not at beginning of declaration
kernel/time.c:257:1: warning: ‘inline’ is not at beginning of declaration
kernel/perf_event.c:4513:1: warning: ‘inline’ is not at beginning of declaration
mm/page_alloc.c:4012:1: warning: ‘inline’ is not at beginning of declaration

Signed-off-by: Jesper Juhl <jj@chaosbits.net>
Signed-off-by: Jiri Kosina <jkosina@suse.cz>
---
 mm/page_alloc.c | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

(limited to 'mm/page_alloc.c')

diff --git a/mm/page_alloc.c b/mm/page_alloc.c
index 07a654486f75..f823ee192e94 100644
--- a/mm/page_alloc.c
+++ b/mm/page_alloc.c
@@ -4013,7 +4013,7 @@ static void __init setup_usemap(struct pglist_data *pgdat,
 		zone->pageblock_flags = alloc_bootmem_node(pgdat, usemapsize);
 }
 #else
-static void inline setup_usemap(struct pglist_data *pgdat,
+static inline void setup_usemap(struct pglist_data *pgdat,
 				struct zone *zone, unsigned long zonesize) {}
 #endif /* CONFIG_SPARSEMEM */
 
-- 
cgit v1.2.1


From c9e664f1fdf34aa8cede047b206deaa8f1945af0 Mon Sep 17 00:00:00 2001
From: "Rafael J. Wysocki" <rjw@sisk.pl>
Date: Fri, 3 Dec 2010 22:57:45 +0100
Subject: PM / Hibernate: Fix memory corruption related to swap

There is a problem that swap pages allocated before the creation of
a hibernation image can be released and used for storing the contents
of different memory pages while the image is being saved.  Since the
kernel stored in the image doesn't know of that, it causes memory
corruption to occur after resume from hibernation, especially on
systems with relatively small RAM that need to swap often.

This issue can be addressed by keeping the GFP_IOFS bits clear
in gfp_allowed_mask during the entire hibernation, including the
saving of the image, until the system is finally turned off or
the hibernation is aborted.  Unfortunately, for this purpose
it's necessary to rework the way in which the hibernate and
suspend code manipulates gfp_allowed_mask.

This change is based on an earlier patch from Hugh Dickins.

Signed-off-by: Rafael J. Wysocki <rjw@sisk.pl>
Reported-by: Ondrej Zary <linux@rainbow-software.org>
Acked-by: Hugh Dickins <hughd@google.com>
Reviewed-by: KAMEZAWA Hiroyuki <kamezawa.hiroyu@jp.fujitsu.com>
Cc: stable@kernel.org
---
 mm/page_alloc.c | 19 ++++++++++++-------
 1 file changed, 12 insertions(+), 7 deletions(-)

(limited to 'mm/page_alloc.c')

diff --git a/mm/page_alloc.c b/mm/page_alloc.c
index e4092704c1a9..ff7e15872398 100644
--- a/mm/page_alloc.c
+++ b/mm/page_alloc.c
@@ -104,19 +104,24 @@ gfp_t gfp_allowed_mask __read_mostly = GFP_BOOT_MASK;
  * only be modified with pm_mutex held, unless the suspend/hibernate code is
  * guaranteed not to run in parallel with that modification).
  */
-void set_gfp_allowed_mask(gfp_t mask)
+
+static gfp_t saved_gfp_mask;
+
+void pm_restore_gfp_mask(void)
 {
 	WARN_ON(!mutex_is_locked(&pm_mutex));
-	gfp_allowed_mask = mask;
+	if (saved_gfp_mask) {
+		gfp_allowed_mask = saved_gfp_mask;
+		saved_gfp_mask = 0;
+	}
 }
 
-gfp_t clear_gfp_allowed_mask(gfp_t mask)
+void pm_restrict_gfp_mask(void)
 {
-	gfp_t ret = gfp_allowed_mask;
-
 	WARN_ON(!mutex_is_locked(&pm_mutex));
-	gfp_allowed_mask &= ~mask;
-	return ret;
+	WARN_ON(saved_gfp_mask);
+	saved_gfp_mask = gfp_allowed_mask;
+	gfp_allowed_mask &= ~GFP_IOFS;
 }
 #endif /* CONFIG_PM_SLEEP */
 
-- 
cgit v1.2.1


From 88f5acf88ae6a9778f6d25d0d5d7ec2d57764a97 Mon Sep 17 00:00:00 2001
From: Mel Gorman <mel@csn.ul.ie>
Date: Thu, 13 Jan 2011 15:45:41 -0800
Subject: mm: page allocator: adjust the per-cpu counter threshold when memory
 is low

Commit aa45484 ("calculate a better estimate of NR_FREE_PAGES when memory
is low") noted that watermarks were based on the vmstat NR_FREE_PAGES.  To
avoid synchronization overhead, these counters are maintained on a per-cpu
basis and drained both periodically and when a threshold is above a
threshold.  On large CPU systems, the difference between the estimate and
real value of NR_FREE_PAGES can be very high.  The system can get into a
case where pages are allocated far below the min watermark potentially
causing livelock issues.  The commit solved the problem by taking a better
reading of NR_FREE_PAGES when memory was low.

Unfortately, as reported by Shaohua Li this accurate reading can consume a
large amount of CPU time on systems with many sockets due to cache line
bouncing.  This patch takes a different approach.  For large machines
where counter drift might be unsafe and while kswapd is awake, the per-cpu
thresholds for the target pgdat are reduced to limit the level of drift to
what should be a safe level.  This incurs a performance penalty in heavy
memory pressure by a factor that depends on the workload and the machine
but the machine should function correctly without accidentally exhausting
all memory on a node.  There is an additional cost when kswapd wakes and
sleeps but the event is not expected to be frequent - in Shaohua's test
case, there was one recorded sleep and wake event at least.

To ensure that kswapd wakes up, a safe version of zone_watermark_ok() is
introduced that takes a more accurate reading of NR_FREE_PAGES when called
from wakeup_kswapd, when deciding whether it is really safe to go back to
sleep in sleeping_prematurely() and when deciding if a zone is really
balanced or not in balance_pgdat().  We are still using an expensive
function but limiting how often it is called.

When the test case is reproduced, the time spent in the watermark
functions is reduced.  The following report is on the percentage of time
spent cumulatively spent in the functions zone_nr_free_pages(),
zone_watermark_ok(), __zone_watermark_ok(), zone_watermark_ok_safe(),
zone_page_state_snapshot(), zone_page_state().

vanilla                      11.6615%
disable-threshold            0.2584%

David said:

: We had to pull aa454840 "mm: page allocator: calculate a better estimate
: of NR_FREE_PAGES when memory is low and kswapd is awake" from 2.6.36
: internally because tests showed that it would cause the machine to stall
: as the result of heavy kswapd activity.  I merged it back with this fix as
: it is pending in the -mm tree and it solves the issue we were seeing, so I
: definitely think this should be pushed to -stable (and I would seriously
: consider it for 2.6.37 inclusion even at this late date).

Signed-off-by: Mel Gorman <mel@csn.ul.ie>
Reported-by: Shaohua Li <shaohua.li@intel.com>
Reviewed-by: Christoph Lameter <cl@linux.com>
Tested-by: Nicolas Bareil <nico@chdir.org>
Cc: David Rientjes <rientjes@google.com>
Cc: Kyle McMartin <kyle@mcmartin.ca>
Cc: <stable@kernel.org>		[2.6.37.1, 2.6.36.x]
Signed-off-by: Andrew Morton <akpm@linux-foundation.org>
Signed-off-by: Linus Torvalds <torvalds@linux-foundation.org>
---
 mm/page_alloc.c | 35 +++++++++++++++++++++++++++--------
 1 file changed, 27 insertions(+), 8 deletions(-)

(limited to 'mm/page_alloc.c')

diff --git a/mm/page_alloc.c b/mm/page_alloc.c
index 826ba6922e84..22a1bb7723e4 100644
--- a/mm/page_alloc.c
+++ b/mm/page_alloc.c
@@ -1460,24 +1460,24 @@ static inline int should_fail_alloc_page(gfp_t gfp_mask, unsigned int order)
 #endif /* CONFIG_FAIL_PAGE_ALLOC */
 
 /*
- * Return 1 if free pages are above 'mark'. This takes into account the order
+ * Return true if free pages are above 'mark'. This takes into account the order
  * of the allocation.
  */
-int zone_watermark_ok(struct zone *z, int order, unsigned long mark,
-		      int classzone_idx, int alloc_flags)
+static bool __zone_watermark_ok(struct zone *z, int order, unsigned long mark,
+		      int classzone_idx, int alloc_flags, long free_pages)
 {
 	/* free_pages my go negative - that's OK */
 	long min = mark;
-	long free_pages = zone_nr_free_pages(z) - (1 << order) + 1;
 	int o;
 
+	free_pages -= (1 << order) + 1;
 	if (alloc_flags & ALLOC_HIGH)
 		min -= min / 2;
 	if (alloc_flags & ALLOC_HARDER)
 		min -= min / 4;
 
 	if (free_pages <= min + z->lowmem_reserve[classzone_idx])
-		return 0;
+		return false;
 	for (o = 0; o < order; o++) {
 		/* At the next order, this order's pages become unavailable */
 		free_pages -= z->free_area[o].nr_free << o;
@@ -1486,9 +1486,28 @@ int zone_watermark_ok(struct zone *z, int order, unsigned long mark,
 		min >>= 1;
 
 		if (free_pages <= min)
-			return 0;
+			return false;
 	}
-	return 1;
+	return true;
+}
+
+bool zone_watermark_ok(struct zone *z, int order, unsigned long mark,
+		      int classzone_idx, int alloc_flags)
+{
+	return __zone_watermark_ok(z, order, mark, classzone_idx, alloc_flags,
+					zone_page_state(z, NR_FREE_PAGES));
+}
+
+bool zone_watermark_ok_safe(struct zone *z, int order, unsigned long mark,
+		      int classzone_idx, int alloc_flags)
+{
+	long free_pages = zone_page_state(z, NR_FREE_PAGES);
+
+	if (z->percpu_drift_mark && free_pages < z->percpu_drift_mark)
+		free_pages = zone_page_state_snapshot(z, NR_FREE_PAGES);
+
+	return __zone_watermark_ok(z, order, mark, classzone_idx, alloc_flags,
+								free_pages);
 }
 
 #ifdef CONFIG_NUMA
@@ -2442,7 +2461,7 @@ void show_free_areas(void)
 			" all_unreclaimable? %s"
 			"\n",
 			zone->name,
-			K(zone_nr_free_pages(zone)),
+			K(zone_page_state(zone, NR_FREE_PAGES)),
 			K(min_wmark_pages(zone)),
 			K(low_wmark_pages(zone)),
 			K(high_wmark_pages(zone)),
-- 
cgit v1.2.1


From 3e7d344970673c5334cf7b5bb27c8c0942b06126 Mon Sep 17 00:00:00 2001
From: Mel Gorman <mel@csn.ul.ie>
Date: Thu, 13 Jan 2011 15:45:56 -0800
Subject: mm: vmscan: reclaim order-0 and use compaction instead of lumpy
 reclaim

Lumpy reclaim is disruptive.  It reclaims a large number of pages and
ignores the age of the pages it reclaims.  This can incur significant
stalls and potentially increase the number of major faults.

Compaction has reached the point where it is considered reasonably stable
(meaning it has passed a lot of testing) and is a potential candidate for
displacing lumpy reclaim.  This patch introduces an alternative to lumpy
reclaim whe compaction is available called reclaim/compaction.  The basic
operation is very simple - instead of selecting a contiguous range of
pages to reclaim, a number of order-0 pages are reclaimed and then
compaction is later by either kswapd (compact_zone_order()) or direct
compaction (__alloc_pages_direct_compact()).

[akpm@linux-foundation.org: fix build]
[akpm@linux-foundation.org: use conventional task_struct naming]
Signed-off-by: Mel Gorman <mel@csn.ul.ie>
Cc: Andrea Arcangeli <aarcange@redhat.com>
Cc: KOSAKI Motohiro <kosaki.motohiro@jp.fujitsu.com>
Cc: Rik van Riel <riel@redhat.com>
Acked-by: Johannes Weiner <hannes@cmpxchg.org>
Cc: Andy Whitcroft <apw@shadowen.org>
Signed-off-by: Andrew Morton <akpm@linux-foundation.org>
Signed-off-by: Linus Torvalds <torvalds@linux-foundation.org>
---
 mm/page_alloc.c | 16 ++++++++++++++++
 1 file changed, 16 insertions(+)

(limited to 'mm/page_alloc.c')

diff --git a/mm/page_alloc.c b/mm/page_alloc.c
index 22a1bb7723e4..03a66a31bfcd 100644
--- a/mm/page_alloc.c
+++ b/mm/page_alloc.c
@@ -1815,12 +1815,15 @@ __alloc_pages_direct_compact(gfp_t gfp_mask, unsigned int order,
 	int migratetype, unsigned long *did_some_progress)
 {
 	struct page *page;
+	struct task_struct *tsk = current;
 
 	if (!order || compaction_deferred(preferred_zone))
 		return NULL;
 
+	tsk->flags |= PF_MEMALLOC;
 	*did_some_progress = try_to_compact_pages(zonelist, order, gfp_mask,
 								nodemask);
+	tsk->flags &= ~PF_MEMALLOC;
 	if (*did_some_progress != COMPACT_SKIPPED) {
 
 		/* Page migration frees to the PCP lists but we want merging */
@@ -2121,6 +2124,19 @@ rebalance:
 		/* Wait for some write requests to complete then retry */
 		wait_iff_congested(preferred_zone, BLK_RW_ASYNC, HZ/50);
 		goto rebalance;
+	} else {
+		/*
+		 * High-order allocations do not necessarily loop after
+		 * direct reclaim and reclaim/compaction depends on compaction
+		 * being called after reclaim so call directly if necessary
+		 */
+		page = __alloc_pages_direct_compact(gfp_mask, order,
+					zonelist, high_zoneidx,
+					nodemask,
+					alloc_flags, preferred_zone,
+					migratetype, &did_some_progress);
+		if (page)
+			goto got_pg;
 	}
 
 nopage:
-- 
cgit v1.2.1


From 77f1fe6b08b13a87391549c8a820ddc817b6f50e Mon Sep 17 00:00:00 2001
From: Mel Gorman <mel@csn.ul.ie>
Date: Thu, 13 Jan 2011 15:45:57 -0800
Subject: mm: migration: allow migration to operate asynchronously and avoid
 synchronous compaction in the faster path

Migration synchronously waits for writeback if the initial passes fails.
Callers of memory compaction do not necessarily want this behaviour if the
caller is latency sensitive or expects that synchronous migration is not
going to have a significantly better success rate.

This patch adds a sync parameter to migrate_pages() allowing the caller to
indicate if wait_on_page_writeback() is allowed within migration or not.
For reclaim/compaction, try_to_compact_pages() is first called
asynchronously, direct reclaim runs and then try_to_compact_pages() is
called synchronously as there is a greater expectation that it'll succeed.

[akpm@linux-foundation.org: build/merge fix]
Signed-off-by: Mel Gorman <mel@csn.ul.ie>
Cc: Andrea Arcangeli <aarcange@redhat.com>
Cc: KOSAKI Motohiro <kosaki.motohiro@jp.fujitsu.com>
Cc: Rik van Riel <riel@redhat.com>
Acked-by: Johannes Weiner <hannes@cmpxchg.org>
Cc: Andy Whitcroft <apw@shadowen.org>
Signed-off-by: Andrew Morton <akpm@linux-foundation.org>
Signed-off-by: Linus Torvalds <torvalds@linux-foundation.org>
---
 mm/page_alloc.c | 21 +++++++++++++++------
 1 file changed, 15 insertions(+), 6 deletions(-)

(limited to 'mm/page_alloc.c')

diff --git a/mm/page_alloc.c b/mm/page_alloc.c
index 03a66a31bfcd..0fd486467b4b 100644
--- a/mm/page_alloc.c
+++ b/mm/page_alloc.c
@@ -1812,7 +1812,8 @@ static struct page *
 __alloc_pages_direct_compact(gfp_t gfp_mask, unsigned int order,
 	struct zonelist *zonelist, enum zone_type high_zoneidx,
 	nodemask_t *nodemask, int alloc_flags, struct zone *preferred_zone,
-	int migratetype, unsigned long *did_some_progress)
+	int migratetype, unsigned long *did_some_progress,
+	bool sync_migration)
 {
 	struct page *page;
 	struct task_struct *tsk = current;
@@ -1822,7 +1823,7 @@ __alloc_pages_direct_compact(gfp_t gfp_mask, unsigned int order,
 
 	tsk->flags |= PF_MEMALLOC;
 	*did_some_progress = try_to_compact_pages(zonelist, order, gfp_mask,
-								nodemask);
+						nodemask, sync_migration);
 	tsk->flags &= ~PF_MEMALLOC;
 	if (*did_some_progress != COMPACT_SKIPPED) {
 
@@ -1859,7 +1860,8 @@ static inline struct page *
 __alloc_pages_direct_compact(gfp_t gfp_mask, unsigned int order,
 	struct zonelist *zonelist, enum zone_type high_zoneidx,
 	nodemask_t *nodemask, int alloc_flags, struct zone *preferred_zone,
-	int migratetype, unsigned long *did_some_progress)
+	int migratetype, unsigned long *did_some_progress,
+	bool sync_migration)
 {
 	return NULL;
 }
@@ -2001,6 +2003,7 @@ __alloc_pages_slowpath(gfp_t gfp_mask, unsigned int order,
 	unsigned long pages_reclaimed = 0;
 	unsigned long did_some_progress;
 	struct task_struct *p = current;
+	bool sync_migration = false;
 
 	/*
 	 * In the slowpath, we sanity check order to avoid ever trying to
@@ -2063,14 +2066,19 @@ rebalance:
 	if (test_thread_flag(TIF_MEMDIE) && !(gfp_mask & __GFP_NOFAIL))
 		goto nopage;
 
-	/* Try direct compaction */
+	/*
+	 * Try direct compaction. The first pass is asynchronous. Subsequent
+	 * attempts after direct reclaim are synchronous
+	 */
 	page = __alloc_pages_direct_compact(gfp_mask, order,
 					zonelist, high_zoneidx,
 					nodemask,
 					alloc_flags, preferred_zone,
-					migratetype, &did_some_progress);
+					migratetype, &did_some_progress,
+					sync_migration);
 	if (page)
 		goto got_pg;
+	sync_migration = true;
 
 	/* Try direct reclaim and then allocating */
 	page = __alloc_pages_direct_reclaim(gfp_mask, order,
@@ -2134,7 +2142,8 @@ rebalance:
 					zonelist, high_zoneidx,
 					nodemask,
 					alloc_flags, preferred_zone,
-					migratetype, &did_some_progress);
+					migratetype, &did_some_progress,
+					sync_migration);
 		if (page)
 			goto got_pg;
 	}
-- 
cgit v1.2.1


From 9950474883e027e6e728cbcff25f7f2bf0c96530 Mon Sep 17 00:00:00 2001
From: Mel Gorman <mel@csn.ul.ie>
Date: Thu, 13 Jan 2011 15:46:20 -0800
Subject: mm: kswapd: stop high-order balancing when any suitable zone is
 balanced

Simon Kirby reported the following problem

   We're seeing cases on a number of servers where cache never fully
   grows to use all available memory.  Sometimes we see servers with 4 GB
   of memory that never seem to have less than 1.5 GB free, even with a
   constantly-active VM.  In some cases, these servers also swap out while
   this happens, even though they are constantly reading the working set
   into memory.  We have been seeing this happening for a long time; I
   don't think it's anything recent, and it still happens on 2.6.36.

After some debugging work by Simon, Dave Hansen and others, the prevaling
theory became that kswapd is reclaiming order-3 pages requested by SLUB
too aggressive about it.

There are two apparent problems here.  On the target machine, there is a
small Normal zone in comparison to DMA32.  As kswapd tries to balance all
zones, it would continually try reclaiming for Normal even though DMA32
was balanced enough for callers.  The second problem is that
sleeping_prematurely() does not use the same logic as balance_pgdat() when
deciding whether to sleep or not.  This keeps kswapd artifically awake.

A number of tests were run and the figures from previous postings will
look very different for a few reasons.  One, the old figures were forcing
my network card to use GFP_ATOMIC in attempt to replicate Simon's problem.
 Second, I previous specified slub_min_order=3 again in an attempt to
reproduce Simon's problem.  In this posting, I'm depending on Simon to say
whether his problem is fixed or not and these figures are to show the
impact to the ordinary cases.  Finally, the "vmscan" figures are taken
from /proc/vmstat instead of the tracepoints.  There is less information
but recording is less disruptive.

The first test of relevance was postmark with a process running in the
background reading a large amount of anonymous memory in blocks.  The
objective was to vaguely simulate what was happening on Simon's machine
and it's memory intensive enough to have kswapd awake.

POSTMARK
                                            traceonly          kanyzone
Transactions per second:              156.00 ( 0.00%)   153.00 (-1.96%)
Data megabytes read per second:        21.51 ( 0.00%)    21.52 ( 0.05%)
Data megabytes written per second:     29.28 ( 0.00%)    29.11 (-0.58%)
Files created alone per second:       250.00 ( 0.00%)   416.00 (39.90%)
Files create/transact per second:      79.00 ( 0.00%)    76.00 (-3.95%)
Files deleted alone per second:       520.00 ( 0.00%)   420.00 (-23.81%)
Files delete/transact per second:      79.00 ( 0.00%)    76.00 (-3.95%)

MMTests Statistics: duration
User/Sys Time Running Test (seconds)         16.58      17.4
Total Elapsed Time (seconds)                218.48    222.47

VMstat Reclaim Statistics: vmscan
Direct reclaims                                  0          4
Direct reclaim pages scanned                     0        203
Direct reclaim pages reclaimed                   0        184
Kswapd pages scanned                        326631     322018
Kswapd pages reclaimed                      312632     309784
Kswapd low wmark quickly                         1          4
Kswapd high wmark quickly                      122        475
Kswapd skip congestion_wait                      1          0
Pages activated                             700040     705317
Pages deactivated                           212113     203922
Pages written                                 9875       6363

Total pages scanned                         326631    322221
Total pages reclaimed                       312632    309968
%age total pages scanned/reclaimed          95.71%    96.20%
%age total pages scanned/written             3.02%     1.97%

proc vmstat: Faults
Major Faults                                   300       254
Minor Faults                                645183    660284
Page ins                                    493588    486704
Page outs                                  4960088   4986704
Swap ins                                      1230       661
Swap outs                                     9869      6355

Performance is mildly affected because kswapd is no longer doing as much
work and the background memory consumer process is getting in the way.
Note that kswapd scanned and reclaimed fewer pages as it's less aggressive
and overall fewer pages were scanned and reclaimed.  Swap in/out is
particularly reduced again reflecting kswapd throwing out fewer pages.

The slight performance impact is unfortunate here but it looks like a
direct result of kswapd being less aggressive.  As the bug report is about
too many pages being freed by kswapd, it may have to be accepted for now.

The second test is a streaming IO benchmark that was previously used by
Johannes to show regressions in page reclaim.

MICRO
					 traceonly  kanyzone
User/Sys Time Running Test (seconds)         29.29     28.87
Total Elapsed Time (seconds)                492.18    488.79

VMstat Reclaim Statistics: vmscan
Direct reclaims                               2128       1460
Direct reclaim pages scanned               2284822    1496067
Direct reclaim pages reclaimed              148919     110937
Kswapd pages scanned                      15450014   16202876
Kswapd pages reclaimed                     8503697    8537897
Kswapd low wmark quickly                      3100       3397
Kswapd high wmark quickly                     1860       7243
Kswapd skip congestion_wait                    708        801
Pages activated                               9635       9573
Pages deactivated                             1432       1271
Pages written                                  223       1130

Total pages scanned                       17734836  17698943
Total pages reclaimed                      8652616   8648834
%age total pages scanned/reclaimed          48.79%    48.87%
%age total pages scanned/written             0.00%     0.01%

proc vmstat: Faults
Major Faults                                   165       221
Minor Faults                               9655785   9656506
Page ins                                      3880      7228
Page outs                                 37692940  37480076
Swap ins                                         0        69
Swap outs                                       19        15

Again fewer pages are scanned and reclaimed as expected and this time the
test completed faster.  Note that kswapd is hitting its watermarks faster
(low and high wmark quickly) which I expect is due to kswapd reclaiming
fewer pages.

I also ran fs-mark, iozone and sysbench but there is nothing interesting
to report in the figures.  Performance is not significantly changed and
the reclaim statistics look reasonable.

Tgis patch:

When the allocator enters its slow path, kswapd is woken up to balance the
node.  It continues working until all zones within the node are balanced.
For order-0 allocations, this makes perfect sense but for higher orders it
can have unintended side-effects.  If the zone sizes are imbalanced,
kswapd may reclaim heavily within a smaller zone discarding an excessive
number of pages.  The user-visible behaviour is that kswapd is awake and
reclaiming even though plenty of pages are free from a suitable zone.

This patch alters the "balance" logic for high-order reclaim allowing
kswapd to stop if any suitable zone becomes balanced to reduce the number
of pages it reclaims from other zones.  kswapd still tries to ensure that
order-0 watermarks for all zones are met before sleeping.

Signed-off-by: Mel Gorman <mel@csn.ul.ie>
Reviewed-by: Minchan Kim <minchan.kim@gmail.com>
Reviewed-by: KAMEZAWA Hiroyuki <kamezawa.hiroyu@jp.fujitsu.com>
Reviewed-by: Eric B Munson <emunson@mgebm.net>
Cc: Simon Kirby <sim@hostway.ca>
Cc: KOSAKI Motohiro <kosaki.motohiro@jp.fujitsu.com>
Cc: Shaohua Li <shaohua.li@intel.com>
Cc: Dave Hansen <dave@linux.vnet.ibm.com>
Cc: Johannes Weiner <hannes@cmpxchg.org>
Cc: Rik van Riel <riel@redhat.com>
Signed-off-by: Andrew Morton <akpm@linux-foundation.org>
Signed-off-by: Linus Torvalds <torvalds@linux-foundation.org>
---
 mm/page_alloc.c | 8 +++++---
 1 file changed, 5 insertions(+), 3 deletions(-)

(limited to 'mm/page_alloc.c')

diff --git a/mm/page_alloc.c b/mm/page_alloc.c
index 0fd486467b4b..c31460918506 100644
--- a/mm/page_alloc.c
+++ b/mm/page_alloc.c
@@ -1944,13 +1944,14 @@ __alloc_pages_high_priority(gfp_t gfp_mask, unsigned int order,
 
 static inline
 void wake_all_kswapd(unsigned int order, struct zonelist *zonelist,
-						enum zone_type high_zoneidx)
+						enum zone_type high_zoneidx,
+						enum zone_type classzone_idx)
 {
 	struct zoneref *z;
 	struct zone *zone;
 
 	for_each_zone_zonelist(zone, z, zonelist, high_zoneidx)
-		wakeup_kswapd(zone, order);
+		wakeup_kswapd(zone, order, classzone_idx);
 }
 
 static inline int
@@ -2028,7 +2029,8 @@ __alloc_pages_slowpath(gfp_t gfp_mask, unsigned int order,
 		goto nopage;
 
 restart:
-	wake_all_kswapd(order, zonelist, high_zoneidx);
+	wake_all_kswapd(order, zonelist, high_zoneidx,
+						zone_idx(preferred_zone));
 
 	/*
 	 * OK, we're below the kswapd watermark and have kicked background
-- 
cgit v1.2.1


From ecb256f815232b35ae8382cff36ca8ce0bbd077e Mon Sep 17 00:00:00 2001
From: "Volodymyr G. Lukiianyk" <volodymyrgl@gmail.com>
Date: Thu, 13 Jan 2011 15:46:26 -0800
Subject: mm: set correct numa_zonelist_order string when configured on the
 kernel command line

When numa_zonelist_order parameter is set to "node" or "zone" on the
command line it's still showing as "default" in sysctl.  That's because
early_param parsing function changes only user_zonelist_order variable.
Fix this by copying user-provided string to numa_zonelist_order if it was
successfully parsed.

Signed-off-by: Volodymyr G Lukiianyk <volodymyrgl@gmail.com>
Acked-by: KAMEZAWA Hiroyuki <kamezawa.hiroyu@jp.fujitsu.com>
Signed-off-by: Andrew Morton <akpm@linux-foundation.org>
Signed-off-by: Linus Torvalds <torvalds@linux-foundation.org>
---
 mm/page_alloc.c | 13 ++++++++++---
 1 file changed, 10 insertions(+), 3 deletions(-)

(limited to 'mm/page_alloc.c')

diff --git a/mm/page_alloc.c b/mm/page_alloc.c
index c31460918506..c18d1f7cff84 100644
--- a/mm/page_alloc.c
+++ b/mm/page_alloc.c
@@ -2631,9 +2631,16 @@ static int __parse_numa_zonelist_order(char *s)
 
 static __init int setup_numa_zonelist_order(char *s)
 {
-	if (s)
-		return __parse_numa_zonelist_order(s);
-	return 0;
+	int ret;
+
+	if (!s)
+		return 0;
+
+	ret = __parse_numa_zonelist_order(s);
+	if (ret == 0)
+		strlcpy(numa_zonelist_order, s, NUMA_ZONELIST_ORDER_LEN);
+
+	return ret;
 }
 early_param("numa_zonelist_order", setup_numa_zonelist_order);
 
-- 
cgit v1.2.1


From 4e9f64c42d0ba5eb0c78569435ada4c224332ce4 Mon Sep 17 00:00:00 2001
From: Andrea Arcangeli <aarcange@redhat.com>
Date: Thu, 13 Jan 2011 15:46:29 -0800
Subject: thp: fix bad_page to show the real reason the page is bad

page_count shows the count of the head page, but the actual check is done
on the tail page, so show what is really being checked.

Signed-off-by: Andrea Arcangeli <aarcange@redhat.com>
Acked-by: Rik van Riel <riel@redhat.com>
Acked-by: Mel Gorman <mel@csn.ul.ie>
Reviewed-by: KOSAKI Motohiro <kosaki.motohiro@jp.fujitsu.com>
Signed-off-by: Andrew Morton <akpm@linux-foundation.org>
Signed-off-by: Linus Torvalds <torvalds@linux-foundation.org>
---
 mm/page_alloc.c | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

(limited to 'mm/page_alloc.c')

diff --git a/mm/page_alloc.c b/mm/page_alloc.c
index c18d1f7cff84..2a67c3bd403a 100644
--- a/mm/page_alloc.c
+++ b/mm/page_alloc.c
@@ -5618,7 +5618,7 @@ void dump_page(struct page *page)
 {
 	printk(KERN_ALERT
 	       "page:%p count:%d mapcount:%d mapping:%p index:%#lx\n",
-		page, page_count(page), page_mapcount(page),
+		page, atomic_read(&page->_count), page_mapcount(page),
 		page->mapping, page->index);
 	dump_page_flags(page->flags);
 }
-- 
cgit v1.2.1


From 8dd60a3a65c1b057bf0031d28436d3447a3c545b Mon Sep 17 00:00:00 2001
From: Andrea Arcangeli <aarcange@redhat.com>
Date: Thu, 13 Jan 2011 15:46:34 -0800
Subject: thp: clear compound mapping

Clear compound mapping for anonymous compound pages like it already
happens for regular anonymous pages.  But crash if mapping is set for any
tail page, also the PageAnon check is meaningless for tail pages.  This
check only makes sense for the head page, for tail page it can only hide
bugs and we definitely don't want to hide bugs.

Signed-off-by: Andrea Arcangeli <aarcange@redhat.com>
Acked-by: Rik van Riel <riel@redhat.com>
Acked-by: Mel Gorman <mel@csn.ul.ie>
Signed-off-by: Andrew Morton <akpm@linux-foundation.org>
Signed-off-by: Linus Torvalds <torvalds@linux-foundation.org>
---
 mm/page_alloc.c | 11 ++++-------
 1 file changed, 4 insertions(+), 7 deletions(-)

(limited to 'mm/page_alloc.c')

diff --git a/mm/page_alloc.c b/mm/page_alloc.c
index 2a67c3bd403a..8be81422d4bd 100644
--- a/mm/page_alloc.c
+++ b/mm/page_alloc.c
@@ -651,13 +651,10 @@ static bool free_pages_prepare(struct page *page, unsigned int order)
 	trace_mm_page_free_direct(page, order);
 	kmemcheck_free_shadow(page, order);
 
-	for (i = 0; i < (1 << order); i++) {
-		struct page *pg = page + i;
-
-		if (PageAnon(pg))
-			pg->mapping = NULL;
-		bad += free_pages_check(pg);
-	}
+	if (PageAnon(page))
+		page->mapping = NULL;
+	for (i = 0; i < (1 << order); i++)
+		bad += free_pages_check(page + i);
 	if (bad)
 		return false;
 
-- 
cgit v1.2.1


From 59ff421631295cd54dbf75dcc53d27e84af6d9c0 Mon Sep 17 00:00:00 2001
From: Andrea Arcangeli <aarcange@redhat.com>
Date: Thu, 13 Jan 2011 15:46:38 -0800
Subject: thp: comment reminder in destroy_compound_page

Warn destroy_compound_page that __split_huge_page_refcount is heavily
dependent on its internal behavior.

Signed-off-by: Andrea Arcangeli <aarcange@redhat.com>
Acked-by: Rik van Riel <riel@redhat.com>
Acked-by: Mel Gorman <mel@csn.ul.ie>
Signed-off-by: Andrew Morton <akpm@linux-foundation.org>
Signed-off-by: Linus Torvalds <torvalds@linux-foundation.org>
---
 mm/page_alloc.c | 1 +
 1 file changed, 1 insertion(+)

(limited to 'mm/page_alloc.c')

diff --git a/mm/page_alloc.c b/mm/page_alloc.c
index 8be81422d4bd..6e62d5f9d40b 100644
--- a/mm/page_alloc.c
+++ b/mm/page_alloc.c
@@ -357,6 +357,7 @@ void prep_compound_page(struct page *page, unsigned long order)
 	}
 }
 
+/* update __split_huge_page_refcount if you change this function */
 static int destroy_compound_page(struct page *page, unsigned long order)
 {
 	int i;
-- 
cgit v1.2.1


From 32dba98e085f8b2b4345887df9abf5e0e93bfc12 Mon Sep 17 00:00:00 2001
From: Andrea Arcangeli <aarcange@redhat.com>
Date: Thu, 13 Jan 2011 15:46:49 -0800
Subject: thp: _GFP_NO_KSWAPD

Transparent hugepage allocations must be allowed not to invoke kswapd or
any other kind of indirect reclaim (especially when the defrag sysfs is
control disabled).  It's unacceptable to swap out anonymous pages
(potentially anonymous transparent hugepages) in order to create new
transparent hugepages.  This is true for the MADV_HUGEPAGE areas too
(swapping out a kvm virtual machine and so having it suffer an unbearable
slowdown, so another one with guest physical memory marked MADV_HUGEPAGE
can run 30% faster if it is running memory intensive workloads, makes no
sense).  If a transparent hugepage allocation fails the slowdown is minor
and there is total fallback, so kswapd should never be asked to swapout
memory to allow the high order allocation to succeed.

Signed-off-by: Andrea Arcangeli <aarcange@redhat.com>
Acked-by: Rik van Riel <riel@redhat.com>
Acked-by: Mel Gorman <mel@csn.ul.ie>
Signed-off-by: Andrew Morton <akpm@linux-foundation.org>
Signed-off-by: Linus Torvalds <torvalds@linux-foundation.org>
---
 mm/page_alloc.c | 3 ++-
 1 file changed, 2 insertions(+), 1 deletion(-)

(limited to 'mm/page_alloc.c')

diff --git a/mm/page_alloc.c b/mm/page_alloc.c
index 6e62d5f9d40b..bbd0423f2820 100644
--- a/mm/page_alloc.c
+++ b/mm/page_alloc.c
@@ -2027,7 +2027,8 @@ __alloc_pages_slowpath(gfp_t gfp_mask, unsigned int order,
 		goto nopage;
 
 restart:
-	wake_all_kswapd(order, zonelist, high_zoneidx,
+	if (!(gfp_mask & __GFP_NO_KSWAPD))
+		wake_all_kswapd(order, zonelist, high_zoneidx,
 						zone_idx(preferred_zone));
 
 	/*
-- 
cgit v1.2.1


From 5c3240d92e29ae7bfb9cb58a9b37e80ab40894ff Mon Sep 17 00:00:00 2001
From: Andrea Arcangeli <aarcange@redhat.com>
Date: Thu, 13 Jan 2011 15:46:49 -0800
Subject: thp: don't alloc harder for gfp nomemalloc even if nowait

Not worth throwing away the precious reserved free memory pool for
allocations that can fail gracefully (either through mempool or because
they're transhuge allocations later falling back to 4k allocations).

Signed-off-by: Andrea Arcangeli <aarcange@redhat.com>
Acked-by: Rik van Riel <riel@redhat.com>
Reviewed-by: KOSAKI Motohiro <kosaki.motohiro@jp.fujitsu.com>
Acked-by: Mel Gorman <mel@csn.ul.ie>
Signed-off-by: Andrew Morton <akpm@linux-foundation.org>
Signed-off-by: Linus Torvalds <torvalds@linux-foundation.org>
---
 mm/page_alloc.c | 7 ++++++-
 1 file changed, 6 insertions(+), 1 deletion(-)

(limited to 'mm/page_alloc.c')

diff --git a/mm/page_alloc.c b/mm/page_alloc.c
index bbd0423f2820..e7664b9f706c 100644
--- a/mm/page_alloc.c
+++ b/mm/page_alloc.c
@@ -1971,7 +1971,12 @@ gfp_to_alloc_flags(gfp_t gfp_mask)
 	alloc_flags |= (__force int) (gfp_mask & __GFP_HIGH);
 
 	if (!wait) {
-		alloc_flags |= ALLOC_HARDER;
+		/*
+		 * Not worth trying to allocate harder for
+		 * __GFP_NOMEMALLOC even if it can't schedule.
+		 */
+		if  (!(gfp_mask & __GFP_NOMEMALLOC))
+			alloc_flags |= ALLOC_HARDER;
 		/*
 		 * Ignore cpuset if GFP_ATOMIC (!wait) rather than fail alloc.
 		 * See also cpuset_zone_allowed() comment in kernel/cpuset.c.
-- 
cgit v1.2.1


From 5f24ce5fd34c3ca1b3d10d30da754732da64d5c0 Mon Sep 17 00:00:00 2001
From: Andrea Arcangeli <aarcange@redhat.com>
Date: Thu, 13 Jan 2011 15:47:00 -0800
Subject: thp: remove PG_buddy

PG_buddy can be converted to _mapcount == -2.  So the PG_compound_lock can
be added to page->flags without overflowing (because of the sparse section
bits increasing) with CONFIG_X86_PAE=y and CONFIG_X86_PAT=y.  This also
has to move the memory hotplug code from _mapcount to lru.next to avoid
any risk of clashes.  We can't use lru.next for PG_buddy removal, but
memory hotplug can use lru.next even more easily than the mapcount
instead.

Signed-off-by: Andrea Arcangeli <aarcange@redhat.com>
Signed-off-by: Andrew Morton <akpm@linux-foundation.org>
Signed-off-by: Linus Torvalds <torvalds@linux-foundation.org>
---
 mm/page_alloc.c | 7 +++----
 1 file changed, 3 insertions(+), 4 deletions(-)

(limited to 'mm/page_alloc.c')

diff --git a/mm/page_alloc.c b/mm/page_alloc.c
index e7664b9f706c..9dfe49bceff4 100644
--- a/mm/page_alloc.c
+++ b/mm/page_alloc.c
@@ -449,8 +449,8 @@ __find_combined_index(unsigned long page_idx, unsigned int order)
  * (c) a page and its buddy have the same order &&
  * (d) a page and its buddy are in the same zone.
  *
- * For recording whether a page is in the buddy system, we use PG_buddy.
- * Setting, clearing, and testing PG_buddy is serialized by zone->lock.
+ * For recording whether a page is in the buddy system, we set ->_mapcount -2.
+ * Setting, clearing, and testing _mapcount -2 is serialized by zone->lock.
  *
  * For recording page's order, we use page_private(page).
  */
@@ -483,7 +483,7 @@ static inline int page_is_buddy(struct page *page, struct page *buddy,
  * as necessary, plus some accounting needed to play nicely with other
  * parts of the VM system.
  * At each level, we keep a list of pages, which are heads of continuous
- * free pages of length of (1 << order) and marked with PG_buddy. Page's
+ * free pages of length of (1 << order) and marked with _mapcount -2. Page's
  * order is recorded in page_private(page) field.
  * So when we are allocating or freeing one, we can derive the state of the
  * other.  That is, if we allocate a small block, and both were   
@@ -5574,7 +5574,6 @@ static struct trace_print_flags pageflag_names[] = {
 	{1UL << PG_swapcache,		"swapcache"	},
 	{1UL << PG_mappedtodisk,	"mappedtodisk"	},
 	{1UL << PG_reclaim,		"reclaim"	},
-	{1UL << PG_buddy,		"buddy"		},
 	{1UL << PG_swapbacked,		"swapbacked"	},
 	{1UL << PG_unevictable,		"unevictable"	},
 #ifdef CONFIG_MMU
-- 
cgit v1.2.1


From 43506fad21ca3d8dc59e768ff458f7c5e5c01086 Mon Sep 17 00:00:00 2001
From: KyongHo Cho <pullip.cho@samsung.com>
Date: Thu, 13 Jan 2011 15:47:24 -0800
Subject: mm/page_alloc.c: simplify calculation of combined index of adjacent
 buddy lists

The previous approach of calucation of combined index was

	page_idx & ~(1 << order))

but we have same result with

	page_idx & buddy_idx

This reduces instructions slightly as well as enhances readability.

[akpm@linux-foundation.org: coding-style fixes]
[akpm@linux-foundation.org: fix used-unintialised warning]
Signed-off-by: KyongHo Cho <pullip.cho@samsung.com>
Signed-off-by: Andrew Morton <akpm@linux-foundation.org>
Signed-off-by: Linus Torvalds <torvalds@linux-foundation.org>
---
 mm/page_alloc.c | 25 ++++++++++---------------
 1 file changed, 10 insertions(+), 15 deletions(-)

(limited to 'mm/page_alloc.c')

diff --git a/mm/page_alloc.c b/mm/page_alloc.c
index 9dfe49bceff4..bda1db301d44 100644
--- a/mm/page_alloc.c
+++ b/mm/page_alloc.c
@@ -427,18 +427,10 @@ static inline void rmv_page_order(struct page *page)
  *
  * Assumption: *_mem_map is contiguous at least up to MAX_ORDER
  */
-static inline struct page *
-__page_find_buddy(struct page *page, unsigned long page_idx, unsigned int order)
-{
-	unsigned long buddy_idx = page_idx ^ (1 << order);
-
-	return page + (buddy_idx - page_idx);
-}
-
 static inline unsigned long
-__find_combined_index(unsigned long page_idx, unsigned int order)
+__find_buddy_index(unsigned long page_idx, unsigned int order)
 {
-	return (page_idx & ~(1 << order));
+	return page_idx ^ (1 << order);
 }
 
 /*
@@ -500,6 +492,7 @@ static inline void __free_one_page(struct page *page,
 {
 	unsigned long page_idx;
 	unsigned long combined_idx;
+	unsigned long uninitialized_var(buddy_idx);
 	struct page *buddy;
 
 	if (unlikely(PageCompound(page)))
@@ -514,7 +507,8 @@ static inline void __free_one_page(struct page *page,
 	VM_BUG_ON(bad_range(zone, page));
 
 	while (order < MAX_ORDER-1) {
-		buddy = __page_find_buddy(page, page_idx, order);
+		buddy_idx = __find_buddy_index(page_idx, order);
+		buddy = page + (buddy_idx - page_idx);
 		if (!page_is_buddy(page, buddy, order))
 			break;
 
@@ -522,7 +516,7 @@ static inline void __free_one_page(struct page *page,
 		list_del(&buddy->lru);
 		zone->free_area[order].nr_free--;
 		rmv_page_order(buddy);
-		combined_idx = __find_combined_index(page_idx, order);
+		combined_idx = buddy_idx & page_idx;
 		page = page + (combined_idx - page_idx);
 		page_idx = combined_idx;
 		order++;
@@ -539,9 +533,10 @@ static inline void __free_one_page(struct page *page,
 	 */
 	if ((order < MAX_ORDER-2) && pfn_valid_within(page_to_pfn(buddy))) {
 		struct page *higher_page, *higher_buddy;
-		combined_idx = __find_combined_index(page_idx, order);
-		higher_page = page + combined_idx - page_idx;
-		higher_buddy = __page_find_buddy(higher_page, combined_idx, order + 1);
+		combined_idx = buddy_idx & page_idx;
+		higher_page = page + (combined_idx - page_idx);
+		buddy_idx = __find_buddy_index(combined_idx, order + 1);
+		higher_buddy = page + (buddy_idx - combined_idx);
 		if (page_is_buddy(higher_page, higher_buddy, order + 1)) {
 			list_add_tail(&page->lru,
 				&zone->free_area[order].free_list[migratetype]);
-- 
cgit v1.2.1


From c06b1fca18c3ad868bfcaca230146e3038583422 Mon Sep 17 00:00:00 2001
From: Andrew Morton <akpm@linux-foundation.org>
Date: Thu, 13 Jan 2011 15:47:32 -0800
Subject: mm/page_alloc.c: don't cache `current' in a local

It's old-fashioned and unneeded.

akpm:/usr/src/25> size mm/page_alloc.o
   text    data     bss     dec     hex filename
  39884 1241317   18808 1300009  13d629 mm/page_alloc.o (before)
  39838 1241317   18808 1299963  13d5fb mm/page_alloc.o (after)

Acked-by: David Rientjes <rientjes@google.com>
Acked-by: Mel Gorman <mel@csn.ul.ie>
Cc: Hugh Dickins <hughd@google.com>
Signed-off-by: Andrew Morton <akpm@linux-foundation.org>
Signed-off-by: Linus Torvalds <torvalds@linux-foundation.org>
---
 mm/page_alloc.c | 24 ++++++++++--------------
 1 file changed, 10 insertions(+), 14 deletions(-)

(limited to 'mm/page_alloc.c')

diff --git a/mm/page_alloc.c b/mm/page_alloc.c
index bda1db301d44..90c1439549fd 100644
--- a/mm/page_alloc.c
+++ b/mm/page_alloc.c
@@ -1809,15 +1809,14 @@ __alloc_pages_direct_compact(gfp_t gfp_mask, unsigned int order,
 	bool sync_migration)
 {
 	struct page *page;
-	struct task_struct *tsk = current;
 
 	if (!order || compaction_deferred(preferred_zone))
 		return NULL;
 
-	tsk->flags |= PF_MEMALLOC;
+	current->flags |= PF_MEMALLOC;
 	*did_some_progress = try_to_compact_pages(zonelist, order, gfp_mask,
 						nodemask, sync_migration);
-	tsk->flags &= ~PF_MEMALLOC;
+	current->flags &= ~PF_MEMALLOC;
 	if (*did_some_progress != COMPACT_SKIPPED) {
 
 		/* Page migration frees to the PCP lists but we want merging */
@@ -1869,23 +1868,22 @@ __alloc_pages_direct_reclaim(gfp_t gfp_mask, unsigned int order,
 {
 	struct page *page = NULL;
 	struct reclaim_state reclaim_state;
-	struct task_struct *p = current;
 	bool drained = false;
 
 	cond_resched();
 
 	/* We now go into synchronous reclaim */
 	cpuset_memory_pressure_bump();
-	p->flags |= PF_MEMALLOC;
+	current->flags |= PF_MEMALLOC;
 	lockdep_set_current_reclaim_state(gfp_mask);
 	reclaim_state.reclaimed_slab = 0;
-	p->reclaim_state = &reclaim_state;
+	current->reclaim_state = &reclaim_state;
 
 	*did_some_progress = try_to_free_pages(zonelist, order, gfp_mask, nodemask);
 
-	p->reclaim_state = NULL;
+	current->reclaim_state = NULL;
 	lockdep_clear_current_reclaim_state();
-	p->flags &= ~PF_MEMALLOC;
+	current->flags &= ~PF_MEMALLOC;
 
 	cond_resched();
 
@@ -1950,7 +1948,6 @@ void wake_all_kswapd(unsigned int order, struct zonelist *zonelist,
 static inline int
 gfp_to_alloc_flags(gfp_t gfp_mask)
 {
-	struct task_struct *p = current;
 	int alloc_flags = ALLOC_WMARK_MIN | ALLOC_CPUSET;
 	const gfp_t wait = gfp_mask & __GFP_WAIT;
 
@@ -1977,12 +1974,12 @@ gfp_to_alloc_flags(gfp_t gfp_mask)
 		 * See also cpuset_zone_allowed() comment in kernel/cpuset.c.
 		 */
 		alloc_flags &= ~ALLOC_CPUSET;
-	} else if (unlikely(rt_task(p)) && !in_interrupt())
+	} else if (unlikely(rt_task(current)) && !in_interrupt())
 		alloc_flags |= ALLOC_HARDER;
 
 	if (likely(!(gfp_mask & __GFP_NOMEMALLOC))) {
 		if (!in_interrupt() &&
-		    ((p->flags & PF_MEMALLOC) ||
+		    ((current->flags & PF_MEMALLOC) ||
 		     unlikely(test_thread_flag(TIF_MEMDIE))))
 			alloc_flags |= ALLOC_NO_WATERMARKS;
 	}
@@ -2001,7 +1998,6 @@ __alloc_pages_slowpath(gfp_t gfp_mask, unsigned int order,
 	int alloc_flags;
 	unsigned long pages_reclaimed = 0;
 	unsigned long did_some_progress;
-	struct task_struct *p = current;
 	bool sync_migration = false;
 
 	/*
@@ -2060,7 +2056,7 @@ rebalance:
 		goto nopage;
 
 	/* Avoid recursion of direct reclaim */
-	if (p->flags & PF_MEMALLOC)
+	if (current->flags & PF_MEMALLOC)
 		goto nopage;
 
 	/* Avoid allocations with no watermarks from looping endlessly */
@@ -2153,7 +2149,7 @@ nopage:
 	if (!(gfp_mask & __GFP_NOWARN) && printk_ratelimit()) {
 		printk(KERN_WARNING "%s: page allocation failure."
 			" order:%d, mode:0x%x\n",
-			p->comm, order, gfp_mask);
+			current->comm, order, gfp_mask);
 		dump_stack();
 		show_mem();
 	}
-- 
cgit v1.2.1


From f33261d75b88f55a08e6a9648cef73509979bfba Mon Sep 17 00:00:00 2001
From: David Rientjes <rientjes@google.com>
Date: Tue, 25 Jan 2011 15:07:20 -0800
Subject: mm: fix deferred congestion timeout if preferred zone is not allowed

Before 0e093d99763e ("writeback: do not sleep on the congestion queue if
there are no congested BDIs or if significant congestion is not being
encountered in the current zone"), preferred_zone was only used for NUMA
statistics, to determine the zoneidx from which to allocate from given
the type requested, and whether to utilize memory compaction.

wait_iff_congested(), though, uses preferred_zone to determine if the
congestion wait should be deferred because its dirty pages are backed by
a congested bdi.  This incorrectly defers the timeout and busy loops in
the page allocator with various cond_resched() calls if preferred_zone
is not allowed in the current context, usually consuming 100% of a cpu.

This patch ensures preferred_zone is an allowed zone in the fastpath
depending on whether current is constrained by its cpuset or nodes in
its mempolicy (when the nodemask passed is non-NULL).  This is correct
since the fastpath allocation always passes ALLOC_CPUSET when trying to
allocate memory.  In the slowpath, this patch resets preferred_zone to
the first zone of the allowed type when the allocation is not
constrained by current's cpuset, i.e.  it does not pass ALLOC_CPUSET.

This patch also ensures preferred_zone is from the set of allowed nodes
when called from within direct reclaim since allocations are always
constrained by cpusets in this context (it is blockable).

Both of these uses of cpuset_current_mems_allowed are protected by
get_mems_allowed().

Signed-off-by: David Rientjes <rientjes@google.com>
Cc: Mel Gorman <mel@csn.ul.ie>
Cc: Johannes Weiner <hannes@cmpxchg.org>
Cc: Minchan Kim <minchan.kim@gmail.com>
Cc: Wu Fengguang <fengguang.wu@intel.com>
Cc: KAMEZAWA Hiroyuki <kamezawa.hiroyu@jp.fujitsu.com>
Cc: KOSAKI Motohiro <kosaki.motohiro@jp.fujitsu.com>
Acked-by: Rik van Riel <riel@redhat.com>
Signed-off-by: Andrew Morton <akpm@linux-foundation.org>
Signed-off-by: Linus Torvalds <torvalds@linux-foundation.org>
---
 mm/page_alloc.c | 12 +++++++++++-
 1 file changed, 11 insertions(+), 1 deletion(-)

(limited to 'mm/page_alloc.c')

diff --git a/mm/page_alloc.c b/mm/page_alloc.c
index 90c1439549fd..f4967910c967 100644
--- a/mm/page_alloc.c
+++ b/mm/page_alloc.c
@@ -2034,6 +2034,14 @@ restart:
 	 */
 	alloc_flags = gfp_to_alloc_flags(gfp_mask);
 
+	/*
+	 * Find the true preferred zone if the allocation is unconstrained by
+	 * cpusets.
+	 */
+	if (!(alloc_flags & ALLOC_CPUSET) && !nodemask)
+		first_zones_zonelist(zonelist, high_zoneidx, NULL,
+					&preferred_zone);
+
 	/* This is the last chance, in general, before the goto nopage. */
 	page = get_page_from_freelist(gfp_mask, nodemask, order, zonelist,
 			high_zoneidx, alloc_flags & ~ALLOC_NO_WATERMARKS,
@@ -2192,7 +2200,9 @@ __alloc_pages_nodemask(gfp_t gfp_mask, unsigned int order,
 
 	get_mems_allowed();
 	/* The preferred zone is used for statistics later */
-	first_zones_zonelist(zonelist, high_zoneidx, nodemask, &preferred_zone);
+	first_zones_zonelist(zonelist, high_zoneidx,
+				nodemask ? : &cpuset_current_mems_allowed,
+				&preferred_zone);
 	if (!preferred_zone) {
 		put_mems_allowed();
 		return NULL;
-- 
cgit v1.2.1


From 2ff754fa8f416e82327f2d8f1354a033b66286df Mon Sep 17 00:00:00 2001
From: David Rientjes <rientjes@google.com>
Date: Tue, 25 Jan 2011 15:07:23 -0800
Subject: mm: clear pages_scanned only if draining a pcp adds pages to the
 buddy allocator

Commit 0e093d99763e ("writeback: do not sleep on the congestion queue if
there are no congested BDIs or if significant congestion is not being
encountered in the current zone") uncovered a livelock in the page
allocator that resulted in tasks infinitely looping trying to find
memory and kswapd running at 100% cpu.

The issue occurs because drain_all_pages() is called immediately
following direct reclaim when no memory is freed and try_to_free_pages()
returns non-zero because all zones in the zonelist do not have their
all_unreclaimable flag set.

When draining the per-cpu pagesets back to the buddy allocator for each
zone, the zone->pages_scanned counter is cleared to avoid erroneously
setting zone->all_unreclaimable later.  The problem is that no pages may
actually be drained and, thus, the unreclaimable logic never fails
direct reclaim so the oom killer may be invoked.

This apparently only manifested after wait_iff_congested() was
introduced and the zone was full of anonymous memory that would not
congest the backing store.  The page allocator would infinitely loop if
there were no other tasks waiting to be scheduled and clear
zone->pages_scanned because of drain_all_pages() as the result of this
change before kswapd could scan enough pages to trigger the reclaim
logic.  Additionally, with every loop of the page allocator and in the
reclaim path, kswapd would be kicked and would end up running at 100%
cpu.  In this scenario, current and kswapd are all running continuously
with kswapd incrementing zone->pages_scanned and current clearing it.

The problem is even more pronounced when current swaps some of its
memory to swap cache and the reclaimable logic then considers all active
anonymous memory in the all_unreclaimable logic, which requires a much
higher zone->pages_scanned value for try_to_free_pages() to return zero
that is never attainable in this scenario.

Before wait_iff_congested(), the page allocator would incur an
unconditional timeout and allow kswapd to elevate zone->pages_scanned to
a level that the oom killer would be called the next time it loops.

The fix is to only attempt to drain pcp pages if there is actually a
quantity to be drained.  The unconditional clearing of
zone->pages_scanned in free_pcppages_bulk() need not be changed since
other callers already ensure that draining will occur.  This patch
ensures that free_pcppages_bulk() will actually free memory before
calling into it from drain_all_pages() so zone->pages_scanned is only
cleared if appropriate.

Signed-off-by: David Rientjes <rientjes@google.com>
Cc: Mel Gorman <mel@csn.ul.ie>
Reviewed-by: Johannes Weiner <hannes@cmpxchg.org>
Cc: Minchan Kim <minchan.kim@gmail.com>
Cc: Wu Fengguang <fengguang.wu@intel.com>
Cc: KAMEZAWA Hiroyuki <kamezawa.hiroyu@jp.fujitsu.com>
Cc: KOSAKI Motohiro <kosaki.motohiro@jp.fujitsu.com>
Reviewed-by: Rik van Riel <riel@redhat.com>
Signed-off-by: Andrew Morton <akpm@linux-foundation.org>
Signed-off-by: Linus Torvalds <torvalds@linux-foundation.org>
---
 mm/page_alloc.c | 6 ++++--
 1 file changed, 4 insertions(+), 2 deletions(-)

(limited to 'mm/page_alloc.c')

diff --git a/mm/page_alloc.c b/mm/page_alloc.c
index f4967910c967..a873e61e312e 100644
--- a/mm/page_alloc.c
+++ b/mm/page_alloc.c
@@ -1088,8 +1088,10 @@ static void drain_pages(unsigned int cpu)
 		pset = per_cpu_ptr(zone->pageset, cpu);
 
 		pcp = &pset->pcp;
-		free_pcppages_bulk(zone, pcp->count, pcp);
-		pcp->count = 0;
+		if (pcp->count) {
+			free_pcppages_bulk(zone, pcp->count, pcp);
+			pcp->count = 0;
+		}
 		local_irq_restore(flags);
 	}
 }
-- 
cgit v1.2.1